diff --git a/.clang-format b/.clang-format
index aff93435f58c522f5ed1090aef2005f76e91cf31..8b5830627348c6bff12260b7d9adbd357f074718 100644
--- a/.clang-format
+++ b/.clang-format
@@ -19,7 +19,7 @@ BasedOnStyle:  Google
 IndentWidth:     2
 TabWidth:        2
 ContinuationIndentWidth: 4
-AccessModifierOffset: -2  # The private/protected/public has no indent in class
+AccessModifierOffset: -1  # The private/protected/public has no indent in class
 Standard:  Cpp11 
 AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
diff --git a/.copyright.hook b/.copyright.hook
deleted file mode 100644
index 09afff2072df3384a429d01d06188218ae6e85d1..0000000000000000000000000000000000000000
--- a/.copyright.hook
+++ /dev/null
@@ -1,121 +0,0 @@
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import argparse
-import io, re
-import sys, os
-import subprocess
-import platform
-
-COPYRIGHT = '''
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-'''
-
-LANG_COMMENT_MARK = None
-
-NEW_LINE_MARK = None
-
-COPYRIGHT_HEADER = None
-
-if platform.system() == "Windows":
-    NEW_LINE_MARK = "\r\n"
-else:
-    NEW_LINE_MARK = '\n'
-    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
-    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
-    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
-    date, err = process.communicate()
-    date = date.decode("utf-8").rstrip("\n")
-    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
-
-
-def generate_copyright(template, lang='C'):
-    if lang == 'Python':
-        LANG_COMMENT_MARK = '#'
-    else:
-        LANG_COMMENT_MARK = "//"
-
-    lines = template.split(NEW_LINE_MARK)
-    BLANK = " "
-    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
-    for lino, line in enumerate(lines):
-        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
-        if len(line)  == 0:
-            BLANK = ""
-        else:
-            BLANK = " "
-        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
-
-    return ans + "\n"
-
-
-def lang_type(filename):
-    if filename.endswith(".py"):
-        return "Python"
-    elif filename.endswith(".h"):
-        return "C"
-    elif filename.endswith(".c"):
-        return "C"
-    elif filename.endswith(".hpp"):
-        return "C"
-    elif filename.endswith(".cc"):
-        return "C"
-    elif filename.endswith(".cpp"):
-        return "C"
-    elif filename.endswith(".cu"):
-        return "C"
-    elif filename.endswith(".cuh"):
-        return "C"
-    elif filename.endswith(".go"):
-        return "C"
-    elif filename.endswith(".proto"):
-        return "C"
-    else:
-        print("Unsupported filetype %s", filename)
-        exit(0)
-
-
-PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
-
-
-def main(argv=None):
-    parser = argparse.ArgumentParser(
-        description='Checker for copyright declaration.')
-    parser.add_argument('filenames', nargs='*', help='Filenames to check')
-    args = parser.parse_args(argv)
-
-    retv = 0
-    for filename in args.filenames:
-        fd = io.open(filename, encoding="utf-8")
-        first_line = fd.readline()
-        second_line = fd.readline()
-        if "COPYRIGHT (C)" in first_line.upper(): continue
-        if first_line.startswith("#!") or PYTHON_ENCODE.match(
-                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
-            continue
-        original_contents = io.open(filename, encoding="utf-8").read()
-        new_contents = generate_copyright(
-            COPYRIGHT, lang_type(filename)) + original_contents
-        print('Auto Insert Copyright Header {}'.format(filename))
-        retv = 1
-        with io.open(filename, 'w') as output_file:
-            output_file.write(new_contents)
-
-    return retv
-
-
-if __name__ == '__main__':
-    exit(main())
diff --git a/.gitignore b/.gitignore
index 2badc3bdaa52f2608183fa34393719be66630654..9e3a0b499f9f42856429f3a42bef313ea3df3699 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,12 +25,3 @@ third_party/
 
 # clion workspace.
 cmake-build-*
-
-# generated while compiling
-paddle/pybind/pybind.h
-CMakeFiles
-cmake_install.cmake
-paddle/.timestamp
-python/paddlepaddle.egg-info/
-paddle/fluid/pybind/pybind.h
-python/paddle/version.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 89c620bb2f7ef634fa80b64eec7037e8cb9a190c..e718b32cb6c48d11e73600509a17db107f438708 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,4 @@
+repos:
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
     sha: v1.0.1
     hooks:
@@ -22,9 +23,25 @@
     -   id: clang-format-with-version-check
         name: clang-format
         description: Format files with ClangFormat.
-        entry: bash ./.clang_format.hook -i
+        entry: bash ./tools/codestyle/clang_format.hook -i
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
+-   repo: local
+    hooks:
+    -   id: cpplint-cpp-source
+        name: cpplint
+        description: Check C++ code style using cpplint.py.
+        entry: bash ./tools/codestyle/cpplint_pre_commit.hook
+        language: system
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
+-   repo: local
+    hooks:
+    -   id: pylint-doc-string
+        name: pylint
+        description: Check python docstring style using docstring_checker.
+        entry: bash ./tools/codestyle/pylint_pre_commit.hook
+        language: system
+        files: \.(py)$
 -   repo: https://github.com/PaddlePaddle/pre-commit-golang
     sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
@@ -35,7 +52,7 @@
     hooks:
     -   id: copyright_checker
         name: copyright_checker
-        entry: python ./.copyright.hook
+        entry: python ./tools/codestyle/copyright.hook
         language: system
         files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
         exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
diff --git a/.travis.yml b/.travis.yml
index bf6a41d13c4eabc2d8543ab821ce0ff747a061df..8c772030925dcad3909f142b08e4d8057a3f89b7 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,45 +12,24 @@ services:
 os:
   - linux
 env:
-  - JOB=build_doc
+  - JOB=doc
   - JOB=check_style
   - JOB=build_android
 addons:
-  apt:
-    packages:
-      - gcc-4.8
-      - g++-4.8
-      - git
-      - build-essential
-      - python
-      - python-pip
-      - python2.7-dev
-      - python-wheel
-      - libboost-dev
-      - curl
-      - swig
-      - graphviz
-      - clang-format-3.8
-      - automake
-      - libtool
-      - ccache
-  ssh_known_hosts: 52.76.173.135
+  ssh_known_hosts: 13.229.163.131
 before_install:
-  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
-  # protobuf version.
-  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
-  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
+  # For pylint dockstring checker
+  - sudo pip install pylint pytest astroid isort
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
   - |
     # 43min timeout
-    if [[ "$JOB" == "build_android" ]]; then timeout 2580 docker run -it --rm -v "$TRAVIS_BUILD_DIR:/paddle" paddlepaddle/paddle:latest-dev-android;
-    else timeout 2580 paddle/scripts/travis/${JOB}.sh; fi;
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
+    paddle/scripts/paddle_docker_build.sh ${JOB}
+    if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi;
   - |
-    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
+    if [[ "$JOB" != "doc" ]]; then exit 0; fi;
+    # For document only
     if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
     if [[ "$TRAVIS_BRANCH" != "develop"  && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
     export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
diff --git a/AUTHORS.md b/AUTHORS.md
index 9c6821d9f8681c5907c2fc9938fdb62ba64b9a92..8c4a113fc276783c945867ceae9612339b7f0bbc 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -2,12 +2,15 @@
 |---|---|
 | abhinavarora | Abhinav Arora |
 | backyes | Yan-Fei Wang |
+| baiyfbupt | Yi-Fan Bai |
 | beckett1124 | Bin Qi |
-| JiayiFeng | Jia-Yi Feng |
+| ChengduoZH | Cheng-Duo Zhao|
 | chengxiaohua1105 | Xiao-Hua Cheng |
 | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | cxysteven | Xing-Yi Cheng |
 | dzhwinter | Zhi-Hong Dong |
+| dragonwarrior | Long Wang |
+| dyning | Yuning Du |
 | emailweixu | Wei Xu |
 | gangliao | Gang Liao |
 | gongweibao | Wei-Bao Gong |
@@ -16,6 +19,10 @@
 | hedaoyuan | Dao-Yuan He |
 | helinwang | He-Lin Wang |
 | jacquesqiao | Long-Fei Qiao |
+| jczaja | Jacek Czaja |
+| JiayiFeng | Jia-Yi Feng |
+| kbinias | Krzysztof Binias |
+| kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
 | lipeng-unisound | Peng Li |
@@ -24,15 +31,20 @@
 | llxxxll | Yong-Feng Liu |
 | luotao01 | Tao Luo |
 | lzhao4ever | Liang Zhao |
+| mozga-intel | Mateusz Ozga |
 | NHZlX | Zhao-Long Xing |
+| Noplz | Yuan Gao |
 | pakchoi | Chuan-Jiang Song |
+| panyx0718 | Xin Pan |
 | pengli09 | Peng Li |
 | pkuyym | Ya-Ming Yang |
+| pzelazko-intel | Pawel Zelazko |
 | QiJune | Jun Qi |
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | Superjom | Chun-Wei Yan |
 | tianbingsz | Tian-Bing Xu |
+| tpatejko | Tomasz Patejko |
 | typhoonzero | Yi Wu |
 | wanghaoshuang | Hao-Shuang Wang |
 | wangyang59 | Yang Wang |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c86889c05c8cf0d521dce9adbf3e918ba91729a1..23bb27e77b9eab0c322a71a8ff570d12d1050377 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,6 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
         "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 
-find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)
     find_package(CUDA QUIET)
 endif(NOT CMAKE_CROSSCOMPILING)
@@ -36,12 +35,12 @@ include(simd)
 
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
-option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
@@ -52,15 +51,21 @@ option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
-# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. 
-option(WITH_FLUID       "Compile PaddlePaddle fluid only(TODO)"         ON)
+option(WITH_FLUID_ONLY  "Compile PaddlePaddle fluid only"               OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
-option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
+option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
+option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen"            OFF)
 option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 option(WITH_FAST_BUNDLE_TEST    "Bundle tests that can be run in a single process together to reduce launch overhead"   OFF)
+option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
+option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
+option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
+option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
+option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
+option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -101,6 +106,9 @@ endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
+set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
+  "A path setting fluid shared and static libraries")
+
 if (WITH_C_API AND WITH_PYTHON)
   message(WARNING "It is suggest not embedded a python interpreter in Paddle "
     "when using C-API. It will give an unpredictable behavior when using a "
@@ -108,7 +116,7 @@ if (WITH_C_API AND WITH_PYTHON)
 endif()
 
 if (WITH_C_API)
-  set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
+  set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
 endif()
 
 if(MOBILE_INFERENCE)
@@ -118,13 +126,18 @@ else()
 endif()
 
 set(WITH_MKLML ${WITH_MKL})
-if (WITH_MKL AND AVX2_FOUND)
-    set(WITH_MKLDNN ON)
-else()
-    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
-    set(WITH_MKLDNN OFF)
+if (NOT DEFINED WITH_MKLDNN)
+    if (WITH_MKL AND AVX2_FOUND)
+        set(WITH_MKLDNN ON)
+    else()
+        message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+        set(WITH_MKLDNN OFF)
+    endif()
 endif()
 
+if (REPLACE_ENFORCE_GLOG)
+  add_definitions("-DREPLACE_ENFORCE_GLOG")
+endif()
 ########################################################################################
 
 include(external/mklml)     # download mklml package
@@ -143,16 +156,37 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/cares)
-include(external/grpc)
+
+if(WITH_DISTRIBUTE)
+    if(WITH_GRPC)
+        include(external/grpc)
+        message(STATUS "Use grpc framework.")
+    else()
+        message(STATUS "Use brpc framework.")
+        include(external/leveldb)
+        include(external/brpc)
+    endif()
+endif()
+
+if(WITH_BRPC_RDMA)
+    message(STATUS "Use brpc with rdma.")
+    if(WITH_GRPC)
+        message(FATAL_ERROR "Can't use grpc with brpc rdma.")
+    endif()
+    if(NOT WITH_DISTRIBUTE)
+        message(FATAL_ERROR "Can't use brpc rdma in no distribute env.")
+    endif()
+endif()
+
 include(external/snappy)    # download snappy
 include(external/snappystream)
+include(external/threadpool)
 
 include(cudnn)              # set cudnn libraries, must before configure
 include(cupti)
 include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
-include(cpplint)            # set paddle c++ style
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(rdma)               # set rdma libraries
@@ -163,7 +197,7 @@ include(inference_lib)      # add paddle fluid inference libraries
 
 
 include_directories("${PADDLE_SOURCE_DIR}")
-include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
+include_directories("${PADDLE_SOURCE_DIR}/paddle/legacy/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 
@@ -178,7 +212,16 @@ set(EXTERNAL_LIBS
 
 if(WITH_GPU)
     include(cuda)
-endif(WITH_GPU)
+    include(tensorrt)
+    include(external/anakin)
+else()
+  set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE)
+endif()
+
+if(WITH_AMD_GPU)
+    find_package(HIP)
+    include(hip)
+endif(WITH_AMD_GPU)
 
 if(WITH_MKLML)
     list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
@@ -195,10 +238,10 @@ endif(USE_NNPACK)
 
 add_subdirectory(proto)
 
-if(NOT MOBILE_INFERENCE)
+if(NOT MOBILE_INFERENCE AND NOT WITH_FLUID_ONLY)
     # "add_subdirectory(go)" should be placed after the following loine,
     # because it depends on paddle/optimizer.
-    add_subdirectory(paddle/optimizer)
+    add_subdirectory(paddle/legacy/optimizer)
 endif()
 
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
@@ -219,5 +262,11 @@ if(WITH_PYTHON)
 endif()
 
 if(WITH_DOC)
+    find_package(Sphinx REQUIRED)
+    find_python_module(recommonmark REQUIRED)
     add_subdirectory(doc)
 endif()
+
+if (WITH_CONTRIB)
+    add_subdirectory(paddle/contrib)
+endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3c36cffcb4eeaaf7f8cff5167777628dd2697e7d..b878f37a5b8e807e5aa346e0074a741f2f8b6cc5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -58,6 +58,8 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-
     create mode 100644 233
    ```
 
+	NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`.
+
 1. Build and test
 
    Users can build PaddlePaddle natively on Linux and Mac OS X.  But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
@@ -157,4 +159,4 @@ This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the
 - verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
 - verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
 - verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
-- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/legacy/math)
diff --git a/Dockerfile b/Dockerfile
index 60e76c7f2ede6beaca11659020d5991a75d5b741..fc5069a6c080ed23317695e6822c4c46b5b5c7f9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,8 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+# When you modify it, please be aware of cudnn-runtime version
+# and libcudnn.so.x in paddle/scripts/docker/build.sh
+FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 
 ARG UBUNTU_MIRROR
@@ -21,16 +23,16 @@ ENV HOME /root
 COPY ./paddle/scripts/docker/root/ /root/
 
 RUN apt-get update && \
-    apt-get install -y \
-    git python-pip python-dev openssh-server bison \
+    apt-get install -y --allow-downgrades \
+    git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev  \
     python-matplotlib gcc-4.8 g++-4.8 \
-    automake locales clang-format swig doxygen cmake  \
+    automake locales clang-format swig cmake  \
     liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools libtool && \
+    net-tools libtool ccache && \
     apt-get clean -y
 
 # Install Go and glide
@@ -45,6 +47,17 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # install glide
 RUN curl -s -q https://glide.sh/get | sh
 
+# Install TensorRT
+# following TensorRT.tar.gz is not the default official one, we do two miny changes:
+# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now,
+#    and its size is only one-third of the official one.
+# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
+#    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
+RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+    tar -xz -C /usr/local && \
+    cp -rf /usr/local/TensorRT/include /usr && \
+    cp -rf /usr/local/TensorRT/lib /usr
+
 # git credential to skip password typing
 RUN git config --global credential.helper store
 
@@ -53,15 +66,22 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 
 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
 # version util jupyter fixes this issue.
-RUN pip install --upgrade pip && \
+
+# specify sphinx version as 1.5.6 and remove -U option for [pip install -U
+# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
+# version(1.7.1 for now), which causes building documentation failed.
+RUN easy_install -U pip && \
     pip install -U wheel && \
-    pip install -U docopt PyYAML sphinx && \
-    pip install -U sphinx-rtd-theme==0.1.9 recommonmark
+    pip install -U docopt PyYAML sphinx==1.5.6 && \
+    pip install sphinx-rtd-theme==0.1.9 recommonmark
 
 RUN pip install pre-commit 'ipython==5.3.0' && \
     pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
     pip install opencv-python
 
+#For docstring checker
+RUN pip install pylint pytest astroid isort
+
 COPY ./python/requirements.txt /root/
 RUN pip install -r /root/requirements.txt
 
@@ -84,6 +104,3 @@ RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-
-# development image default do build work
-CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/Dockerfile.android b/Dockerfile.android
index cc022d596b4b74dd1e4f4d0901dd81c91a7decd1..48db2efea21a648657e3f490c95429b9a29ede52 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -27,7 +27,7 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
     pip install -U 'protobuf==3.1.0' && \
     pip install -U wheel sphinx && \
     pip install pre-commit
@@ -40,5 +40,3 @@ RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
     unzip -q android-ndk-r14b-linux-x86_64.zip && \
     mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
     rm -rf /opt/android-ndk-tmp
-
-CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
diff --git a/README.md b/README.md
index d06375a444dd65675bdd75baccf8445c1638a87c..eb99ed21d02650ef16cc7da91836909c02895be9 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,6 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
-[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -19,6 +18,8 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
+### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
+
 ## Features
 
 - **Flexibility**
@@ -62,9 +63,9 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation
 
 It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html).
 
 ## Documentation
 
@@ -75,19 +76,19 @@ We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/g
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/index_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/cluster/multi_cluster/k8s_en.html)
 
    You can also run distributed training jobs on Kubernetes clusters.
 
-- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/api/en/overview.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/dev/contribute_to_paddle_en.html)
 
    We appreciate your contributions!
 
diff --git a/benchmark/.gitignore b/benchmark/.gitignore
index 7b66e8a5b5020fd847982db401665d24ba3a069c..fb4114356d4f37efc8ad672316fd4f99443d9fcd 100644
--- a/benchmark/.gitignore
+++ b/benchmark/.gitignore
@@ -7,3 +7,6 @@ paddle/rnn/imdb.pkl
 caffe/image/logs
 tensorflow/image/logs
 tensorflow/rnn/logs
+fluid/models/*.pyc
+fluid/logs
+fluid/nohup.out
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
deleted file mode 100644
index b619613ea7a5b6e940ec735314e8e47338b2c600..0000000000000000000000000000000000000000
--- a/benchmark/cluster/README.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# Cluster Training Benchmark
-
-## Setup
-
-- Platform
-  - Kubernetes: v1.6.2
-  - Linux Kernel: v3.10.0
-
-- Resource
-  - CPU: 10 Cores per Pod
-  - Memory: 5GB per Pod
-
-- Docker Image
-
-  We use different base Docker Image to run the benchmark on Kubernetes:
-  - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
-  - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
-  - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
-
-- Model
-  vgg16 is used in this benchmark.
-
-## Cases
-
-- Variable
-  - Batch Size of training data.
-  - PServer count of the training job.
-  - The number of trainers.
-
-- Invariant
-  - The resource of trainer/pserver Pod.
-
-### Measure the Performance for Different Batch Size
-
-- PServer Count: 40
-- Trainer Count: 100
-- Metrics: mini-batch / sec
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure the Performance for Different PServer Count
-
-- Trainer Count: 100
-- Batch Size: 64
-- Metrics: mini-batch / sec
-
-| PServer Count | 10 | 20 | 40 | 60 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
-| TensorFlow | - | - | - | - |
-
-### Measure Parallel Efficiency By Increasing Trainer Count
-
-- PServer Count: 20
-- Batch Size: 64
-- Metrics:
-
-$S = \div(T1, TN)$
-
-which S is the ratio of T1 over TN, training time of 1 and N trainers.
-The parallel efficiency is:
-
-$E = \div(S, N)$
-
-| Trainer Counter | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
-| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - | - |
-| TensorFlow | - | - | - | - | - | - | - | - | - | - | - | - | - |
-
-## Reproduce the benchmark
-
-TODO
diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
deleted file mode 100644
index 13ad8e1b6237e6f41a076c4fb54311728832ae33..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/Dockerfile
+++ /dev/null
@@ -1,35 +0,0 @@
-FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
-
-# you can get mirror list here:
-# https://launchpad.net/ubuntu/+archivemirrors
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-
-RUN apt-get update && apt-get install -y python python-dev python-pip iputils-ping libgtk2.0-dev
-RUN pip install -U kubernetes opencv-python
-
-RUN pip install paddlepaddle
-# if network is slowly, you may need to add proxy here.
-# ENV https_proxy=
-RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
-RUN pip uninstall -y paddlepaddle
-# unset proxy if it is setted.
-# ENV https_proxy=""
-
-# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
-#       so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
-
-# tf k8s
-RUN pip install tensorflow==1.4.0
-ADD tf_k8s /usr/bin
-RUN chmod +x /usr/bin/tf_k8s
-ADD vgg16_tf.py /workspace/
-
-# below lines may change a lot for debugging
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN chmod +x /usr/bin/paddle_k8s
-ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
deleted file mode 100644
index cd681a1a282d9a26eac1c267bfa26967f8c3c9fd..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/README.md
+++ /dev/null
@@ -1,77 +0,0 @@
-# Performance for Distributed vgg16
-
-## Test Result
-
-### Hardware Infomation
-
-- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
-- cpu MHz		: 2101.000
-- cache size	: 20480 KB
-
-### Blas settings
-
-Setting environment variable: `MKL_NUM_THREADS=1`.
-
-### Single Node Single Thread
-
-- Metrics: samples / sec
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
-| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
-| TensorFlow | 9.09 | 9.10 | 9.24 | 8.66 |
-
-### Different Batch Size
-
-- PServer Count: 10
-- Trainer Count: 20
-- Metrics: samples / sec
-
-| Batch Size | 32 | 64 | 128 | 256 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
-| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
-| TensorFlow | - | - | - | - |
-
-
-### Accelerate Rate
-
-- Pserver Count: 20
-- Batch Size: 128
-- Metrics: samples / sec
-
-| Trainer Count | 20 | 40 | 80 | 100 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
-| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
-| TensorFlow | - | - | - | - |
-
-### Different Pserver Count
-
-- Trainer Count: 60
-- Batch Size: 128
-- Metrics: samples/ sec
-
-| PServer Count | 3 | 6 |10 | 20 |
-| -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
-| TensorFlow | - | - | - | - |
-
-*The performance gap between Fuild and v2 comes from the network interference.*
-
-
-## Steps to Run the Performance Test
-
-1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
-1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable Verbos Logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` and `GLOG_logtostderr=1` to see what happend in detail.
diff --git a/benchmark/cluster/vgg16/fluid_pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
deleted file mode 100644
index ee8b0763b62fc011f40f6197e929a68b48a93e47..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ /dev/null
@@ -1,72 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
-  name: vgg16job-pserver
-spec:
-  replicas: 10
-  template:
-    metadata:
-      labels:
-        paddle-job-pserver: vgg16job
-    spec:
-      hostNetwork: true
-      imagePullSecrets:
-      - name: job-registry-secret
-      containers:
-      - name: pserver
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16job
-        - name: MKL_NUM_THREADS
-          value: "1"
-        - name: TRAINING_ROLE
-          value: "PSERVER"
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        command: ["paddle_k8s", "start_fluid"]
-        resources:
-          requests:
-            memory: 10Gi
-            cpu: 4
-          limits:
-            memory: 10Gi
-            cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
deleted file mode 100644
index 3d56caac009464d1073423bb63abff1f8b0cf28f..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ /dev/null
@@ -1,69 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: vgg16job-trainer
-spec:
-  parallelism: 20
-  completions: 20
-  template:
-    metadata:
-      labels:
-        paddle-job: vgg16job
-    spec:
-      imagePullSecrets:
-      - name: job-registry-secret
-      hostNetwork: true
-      containers:
-      - name: trainer
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        command: ["paddle_k8s", "start_fluid"]
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16job
-        - name: TRAINING_ROLE
-          value: "TRAINER"
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        resources:
-          requests:
-            memory: 40Gi
-            cpu: 2
-          limits:
-            memory: 40Gi
-            cpu: 2
-      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/tf_k8s b/benchmark/cluster/vgg16/tf_k8s
deleted file mode 100644
index 4fc263d5f681aeabfa71f1758714d269d987b272..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_k8s
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/bin/bash
-check_trainer_ret() {
-  ret=$1
-  stdbuf -oL echo "job returned $ret...setting pod return message..."
-  stdbuf -oL echo "==============================="
-
-  if [ $ret -eq 136 ] ; then
-    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
-  elif [ $ret -eq 139 ] ; then
-    echo "Segmentation Fault" > /dev/termination-log
-  elif [ $ret -eq 1 ] ; then
-    echo "General Error" > /dev/termination-log
-  elif [ $ret -eq 134 ] ; then
-    echo "Program Abort" > /dev/termination-log
-  fi
-  stdbuf -oL echo "termination log wroted..."
-  exit $ret
-}
-
-g_pservers=""
-g_trainers=""
-
-wait_running_pods(){
-  pserver_label="tf-job-pserver=${JOB_NAME}"
-  trainer_label="tf-job-trainer=${JOB_NAME}"
-
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${pserver_label} ${PSERVERS_NUM}
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running ${trainer_label} ${TRAINERS_NUM}
-
-  g_pservers=$(python /root/k8s_tools.py fetch_endpoints ${pserver_label} ${PORT})
-  g_trainers=$(python /root/k8s_tools.py fetch_endpoints ${trainer_label} ${PORT})
-}
-
-start_tf_pserver(){
-  wait_running_pods
-
-  label="tf-job-pserver=${JOB_NAME}"
-  pserver_id=$(python /root/k8s_tools.py fetch_id ${label})
-
-  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
-  --job_name=${TF_JOB_NAME} --task_index=${pserver_id}"
-
-  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-}
-
-start_tf_trainer(){
-  wait_running_pods
-
-  label="tf-job-trainer=${JOB_NAME}"
-  trainer_id=$(python /root/k8s_tools.py fetch_id ${label})
-
-  cmd="${ENTRY} --ps_hosts=${g_pservers} --worker_hosts=${g_trainers} \
-  --job_name=${TF_JOB_NAME} --task_index=${trainer_id} --batch_size=${BATCH_SIZE}"
-
-  stdbuf -oL sh -c "cd ${TRAINER_PACKAGE} && ${cmd}"
-  check_trainer_ret $?
-}
-
-start_tf(){
-    if [[ "${TF_JOB_NAME}" == "worker" ]]; then
-        start_tf_trainer
-    else
-        start_tf_pserver
-    fi
-}
-
-usage() {
-    echo "usage: tf_k8s [<args>]:"
-    echo "  start_tf         Start tensorflow jobs"
-}
-
-case "$1" in
-    start_tf)
-        start_tf
-        ;;
-    --help)
-        usage
-        ;;
-    *)
-        usage
-        ;;
-esac
diff --git a/benchmark/cluster/vgg16/tf_pserver.yaml b/benchmark/cluster/vgg16/tf_pserver.yaml
deleted file mode 100644
index 5e37c700819119c8af05c40fe4b8d13911efc3e1..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_pserver.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
-  name: vgg16job-tf-pserver
-spec:
-  replicas: 10
-  template:
-    metadata:
-      labels:
-        tf-job-pserver: vgg16job-tf
-    spec:
-      hostNetwork: true
-      imagePullSecrets:
-      - name: job-registry-secret
-      containers:
-      - name: pserver
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
-        imagePullPolicy: Always
-        command: ["tf_k8s", "start_tf"]
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PORT
-          value: "32036"
-        - name: ENTRY
-          value: "python vgg16_tf.py"
-        - name: JOB_NAME
-          value: vgg16job-tf
-        - name: PSERVERS_NUM
-          value: "10"
-        - name: TF_JOB_NAME 
-          value: "ps"
-        - name: TRAINERS_NUM
-          value: "20"
-        - name: BATCH_SIZE
-          value: "128"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: NUM_PASSES
-          value: "1"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        resources:
-          requests:
-            memory: 10Gi
-            cpu: 4
-          limits:
-            memory: 10Gi
-            cpu: 4
diff --git a/benchmark/cluster/vgg16/tf_trainer.yaml b/benchmark/cluster/vgg16/tf_trainer.yaml
deleted file mode 100644
index 08795df3addfa7b618db24a65e57be190e268f06..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/tf_trainer.yaml
+++ /dev/null
@@ -1,58 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: vgg16job-tf-trainer
-spec:
-  parallelism: 20
-  completions: 20
-  template:
-    metadata:
-      labels:
-        tf-job-trainer: vgg16job-tf
-    spec:
-      imagePullSecrets:
-      - name: job-registry-secret
-      hostNetwork: true
-      containers:
-      - name: trainer
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
-        imagePullPolicy: Always
-        command: ["tf_k8s", "start_tf"]
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PORT
-          value: "32036"
-        - name: JOB_NAME
-          value: vgg16job-tf
-        - name: TF_JOB_NAME 
-          value: "worker"
-        - name: ENTRY
-          value: "python vgg16_tf.py"
-        - name: PSERVERS_NUM
-          value: "10"
-        - name: BATCH_SIZE
-          value: "128"
-        - name: TRAINERS_NUM
-          value: "20"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: NUM_PASSES
-          value: "1"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        - name: POD_IP
-          valueFrom:
-            fieldRef:
-              fieldPath: "status.podIP"
-        resources:
-          requests:
-            memory: 40Gi
-            cpu: 2
-          limits:
-            memory: 40Gi
-            cpu: 2
-      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
deleted file mode 100644
index dd1271e0cf399184134c06b3200ee1202c65cef0..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: extensions/v1beta1
-kind: ReplicaSet
-metadata:
-  name: vgg16v2job-pserver
-spec:
-  replicas: 10
-  template:
-    metadata:
-      labels:
-        paddle-job-pserver: vgg16v2job
-    spec:
-      hostNetwork: true
-      imagePullSecrets:
-      - name: job-registry-secret
-      containers:
-      - name: pserver
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        ports:
-        - name: jobport-30236
-          containerPort: 30236
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16v2job
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "python train.py"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        command: ["paddle_k8s", "start_pserver"]
-        resources:
-          requests:
-            memory: 10Gi
-            cpu: 4
-          limits:
-            memory: 10Gi
-            cpu: 4
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
deleted file mode 100644
index 12c8964066cbcfe8d2a44de2f51a3d12ea422fe2..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ /dev/null
@@ -1,65 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: vgg16v2job-trainer
-spec:
-  parallelism: 20
-  completions: 20
-  template:
-    metadata:
-      labels:
-        paddle-job: vgg16v2job
-    spec:
-      imagePullSecrets:
-        - name: job-registry-secret
-      hostNetwork: true
-      containers:
-      - name: trainer
-        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
-        imagePullPolicy: Always
-        command: ["paddle_k8s", "start_trainer", "v2"]
-        env:
-        - name: PADDLE_JOB_NAME
-          value: vgg16v2job
-        - name: BATCH_SIZE
-          value: "256"
-        - name: TRAINERS
-          value: "20"
-        - name: PSERVERS
-          value: "10"
-        - name: TOPOLOGY
-          value: ""
-        - name: ENTRY
-          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
-        - name: TRAINER_PACKAGE
-          value: "/workspace"
-        - name: PADDLE_INIT_PORT
-          value: "30236"
-        - name: PADDLE_INIT_NICS
-          value: "xgbe0"
-        - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM
-          value: "1"
-        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
-          value: "1"
-        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
-          value: "20"
-        - name: PADDLE_INIT_NUM_PASSES
-          value: "2"
-        - name: PADDLE_INIT_USE_GPU
-          value: "0"
-        - name: LD_LIBRARY_PATH
-          value: "/usr/local/lib:/usr/local/nvidia/lib64"
-        - name: NAMESPACE
-          valueFrom:
-            fieldRef:
-              fieldPath: "metadata.namespace"
-        resources:
-          requests:
-            memory: 40Gi
-            cpu: 2
-          limits:
-            memory: 40Gi
-            cpu: 2
-      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
deleted file mode 100644
index 786f224608f7d41c438411de0e09fedbcf2264b8..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ /dev/null
@@ -1,295 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in Fluid"""
-from __future__ import print_function
-
-import sys
-import time
-import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.profiler as profiler
-import argparse
-import functools
-import os
-
-
-def str2bool(v):
-    if v.lower() in ('yes', 'true', 't', 'y', '1'):
-        return True
-    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
-    '--learning_rate',
-    type=float,
-    default=1e-3,
-    help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='CPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument('--device_id', type=int, default=0, help="The device id.")
-parser.add_argument(
-    '--data_format',
-    type=str,
-    default='NCHW',
-    choices=['NCHW', 'NHWC'],
-    help='The data order, now only support NCHW.')
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'flowers'],
-    help='Optional dataset for benchmark.')
-parser.add_argument(
-    '--local',
-    type=str2bool,
-    default=True,
-    help='Whether to run as local mode.')
-
-parser.add_argument(
-    "--ps_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--trainer_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-
-# Flags for defining the tf.train.Server
-parser.add_argument(
-    "--task_index", type=int, default=0, help="Index of task within the job")
-args = parser.parse_args()
-
-
-def vgg16_bn_drop(input):
-    def conv_block(input, num_filter, groups, dropouts):
-        return fluid.nets.img_conv_group(
-            input=input,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act='relu',
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max')
-
-    conv1 = conv_block(input, 64, 2, [0.3, 0])
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
-
-    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
-    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
-    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
-    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
-    return fc2
-
-
-def main():
-    if args.data_set == "cifar10":
-        classdim = 10
-        if args.data_format == 'NCHW':
-            data_shape = [3, 32, 32]
-        else:
-            data_shape = [32, 32, 3]
-    else:
-        classdim = 102
-        if args.data_format == 'NCHW':
-            data_shape = [3, 224, 224]
-        else:
-            data_shape = [224, 224, 3]
-
-    # Input data
-    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    # Train program
-    net = vgg16_bn_drop(images)
-    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
-
-    # Evaluator
-    batch_size = fluid.layers.create_tensor(dtype='int64')
-    batch_acc = fluid.layers.accuracy(
-        input=predict, label=label, total=batch_size)
-
-    # inference program
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(batch_acc)
-
-    # Optimization
-    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
-    optimize_ops, params_grads = optimizer.minimize(avg_cost)
-
-    # Initialize executor
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
-        args.device_id)
-    exe = fluid.Executor(place)
-
-    # test
-    def test(exe):
-        test_pass_acc = fluid.average.WeightedAverage()
-        for batch_id, data in enumerate(test_reader()):
-            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                    data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = y_data.reshape([-1, 1])
-
-            outs = exe.run(inference_program,
-                           feed={"pixel": img_data,
-                                 "label": y_data},
-                           fetch_list=[batch_acc, batch_size])
-            test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
-
-        return test_pass_acc.eval()
-
-    def train_loop(exe, trainer_prog):
-        iters = 0
-        ts = time.time()
-        train_pass_acc = fluid.average.WeightedAverage()
-        for pass_id in range(args.num_passes):
-            # train
-            start_time = time.time()
-            num_samples = 0
-            train_pass_acc.reset()
-            with profiler.profiler("CPU", 'total') as prof:
-                for batch_id, data in enumerate(train_reader()):
-                    ts = time.time()
-                    img_data = np.array(
-                        map(lambda x: x[0].reshape(data_shape), data)).astype(
-                            "float32")
-                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-                    y_data = y_data.reshape([-1, 1])
-
-                    loss, acc, b_size = exe.run(
-                        trainer_prog,
-                        feed={"pixel": img_data,
-                              "label": y_data},
-                        fetch_list=[avg_cost, batch_acc, batch_size])
-                    iters += 1
-                    num_samples += len(data)
-                    train_pass_acc.add(value=acc, weight=b_size)
-                    print(
-                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
-                        % (pass_id, iters, loss, acc,
-                           len(data) / (time.time() - ts))
-                    )  # The accuracy is the accumulation of batches, but not the current batch.
-
-            pass_elapsed = time.time() - start_time
-            pass_train_acc = train_pass_acc.eval()
-            pass_test_acc = test(exe)
-            print(
-                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
-                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
-                   pass_test_acc))
-
-    if args.local:
-        # Parameter initialization
-        exe.run(fluid.default_startup_program())
-
-        # data reader
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
-                else paddle.dataset.flowers.train(),
-                buf_size=5120),
-            batch_size=args.batch_size)
-        test_reader = paddle.batch(
-            paddle.dataset.cifar.test10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-            batch_size=args.batch_size)
-        train_loop(exe, fluid.default_main_program())
-    else:
-        trainers = int(os.getenv("TRAINERS"))  # total trainer count
-        print("trainers total: ", trainers)
-
-        training_role = os.getenv(
-            "TRAINING_ROLE",
-            "TRAINER")  # get the training role: trainer/pserver
-
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            optimize_ops,
-            params_grads,
-            trainer_id=args.task_index,
-            pservers=args.ps_hosts,
-            trainers=trainers)
-
-        if training_role == "PSERVER":
-            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
-                "PADDLE_INIT_PORT")
-            if not current_endpoint:
-                print("need env SERVER_ENDPOINT")
-                exit(1)
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            print("starting server side startup")
-            exe.run(pserver_startup)
-            print("starting parameter server...")
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            # Parameter initialization
-            exe.run(fluid.default_startup_program())
-
-            # data reader
-            train_reader = paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
-                    else paddle.dataset.flowers.train(),
-                    buf_size=5120),
-                batch_size=args.batch_size)
-            test_reader = paddle.batch(
-                paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
-                paddle.dataset.flowers.test(),
-                batch_size=args.batch_size)
-
-            trainer_prog = t.get_trainer_program()
-            feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
-            # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
-            exe.run(fluid.default_startup_program())
-            train_loop(exe, trainer_prog)
-        else:
-            print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
-
-
-def print_arguments():
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == "__main__":
-    print_arguments()
-    main()
diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py
deleted file mode 100644
index 996df0e314b867ea8de618dfd3977f490fbe8372..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_tf.py
+++ /dev/null
@@ -1,362 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""VGG16 benchmark in TensorFlow
-You can get distribution example template structure here:
-https://medium.com/clusterone/how-to-write-distributed-tensorflow-code-with-an-example-on-tensorport-70bf3306adcb
-https://www.tensorflow.org/deploy/distributed
-"""
-
-import tensorflow as tf
-import paddle.v2 as paddle
-import numpy as np
-import argparse
-import time
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    '--batch_size', type=int, default=128, help="Batch size for training.")
-parser.add_argument(
-    '--learning_rate',
-    type=float,
-    default=1e-3,
-    help="Learning rate for training.")
-parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
-parser.add_argument(
-    '--device',
-    type=str,
-    default='CPU',
-    choices=['CPU', 'GPU'],
-    help="The device type.")
-parser.add_argument(
-    '--data_format',
-    type=str,
-    default='NHWC',
-    choices=['NCHW', 'NHWC'],
-    help='The data order, NCHW=[batch, channels, height, width].'
-    'Only support NHWC right now.')
-parser.add_argument(
-    '--data_set',
-    type=str,
-    default='cifar10',
-    choices=['cifar10', 'flowers'],
-    help='Optional dataset for benchmark.')
-
-parser.add_argument(
-    "--ps_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--worker_hosts",
-    type=str,
-    default="",
-    help="Comma-separated list of hostname:port pairs")
-parser.add_argument(
-    "--job_name", type=str, default="", help="One of 'worker', 'ps'")
-# Flags for defining the tf.train.Server
-parser.add_argument(
-    "--task_index", type=int, default=0, help="Index of task within the job")
-
-args = parser.parse_args()
-
-
-class VGG16Model(object):
-    def __init__(self):
-        self.parameters = []
-
-    def batch_norm_relu(self, inputs, is_training):
-        """Performs a batch normalization followed by a ReLU."""
-        # We set fused=True for a significant speed boost. See
-        # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
-        inputs = tf.layers.batch_normalization(
-            inputs=inputs,
-            axis=1 if args.data_format == 'NCHW' else -1,
-            momentum=0.9,
-            epsilon=1e-05,
-            center=True,
-            scale=True,
-            training=is_training,
-            fused=True)
-        inputs = tf.nn.relu(inputs)
-        return inputs
-
-    def conv_bn_layer(self,
-                      name,
-                      images,
-                      kernel_shape,
-                      is_training,
-                      drop_rate=0.0):
-        with tf.name_scope(name) as scope:
-            kernel = tf.Variable(
-                tf.truncated_normal(
-                    kernel_shape, dtype=tf.float32, stddev=1e-1),
-                name='weights')
-            conv = tf.nn.conv2d(
-                images,
-                kernel, [1, 1, 1, 1],
-                data_format=args.data_format,
-                padding='SAME')
-            biases = tf.Variable(
-                tf.constant(
-                    0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
-                trainable=True,
-                name='biases')
-            out = tf.nn.bias_add(conv, biases)
-            out = self.batch_norm_relu(out, is_training)
-            out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
-            return out
-
-    def fc_layer(self, name, inputs, shape):
-        with tf.name_scope(name) as scope:
-            fc_w = tf.Variable(
-                tf.truncated_normal(
-                    shape, dtype=tf.float32, stddev=1e-1),
-                name='weights')
-            fc_b = tf.Variable(
-                tf.constant(
-                    0.0, shape=[shape[-1]], dtype=tf.float32),
-                trainable=True,
-                name='biases')
-            out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
-            return out
-
-    def network(self, images, class_dim, is_training):
-        """ VGG16 model structure.
-
-            TODO(kuke): enable this network to support the 'NCHW' data format
-        """
-
-        # conv1
-        conv1_1 = self.conv_bn_layer(
-            'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
-        conv1_2 = self.conv_bn_layer(
-            'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
-        # pool1
-        pool1 = tf.nn.max_pool(
-            conv1_2,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool1')
-        # conv2
-        conv2_1 = self.conv_bn_layer(
-            'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
-        conv2_2 = self.conv_bn_layer(
-            'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
-        # pool2
-        pool2 = tf.nn.max_pool(
-            conv2_2,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool2')
-        # conv3
-        conv3_1 = self.conv_bn_layer(
-            'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
-        conv3_2 = self.conv_bn_layer(
-            'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
-        conv3_3 = self.conv_bn_layer(
-            'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
-        # pool3
-        pool3 = tf.nn.max_pool(
-            conv3_3,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool3')
-        # conv4
-        conv4_1 = self.conv_bn_layer(
-            'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
-        conv4_2 = self.conv_bn_layer(
-            'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
-        conv4_3 = self.conv_bn_layer(
-            'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
-        # pool4
-        pool4 = tf.nn.max_pool(
-            conv4_3,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool4')
-        # conv5
-        conv5_1 = self.conv_bn_layer(
-            'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
-        conv5_2 = self.conv_bn_layer(
-            'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
-        conv5_3 = self.conv_bn_layer(
-            'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
-        # pool5
-        pool5 = tf.nn.max_pool(
-            conv5_3,
-            ksize=[1, 2, 2, 1],
-            strides=[1, 2, 2, 1],
-            padding='SAME',
-            name='pool4')
-        # flatten
-        shape = int(np.prod(pool5.get_shape()[1:]))
-        pool5_flat = tf.reshape(pool5, [-1, shape])
-        # fc1
-        drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
-        fc1 = self.fc_layer('fc1', drop, [shape, 512])
-        # fc2
-        bn = self.batch_norm_relu(fc1, is_training)
-        drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
-        fc2 = self.fc_layer('fc2', drop, [512, 512])
-
-        fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
-
-        return fc3
-
-
-def run_benchmark(cluster_spec, server):
-    """Run benchmark on cifar10 or flowers."""
-
-    if args.data_set == "cifar10":
-        class_dim = 10
-        raw_shape = (3, 32, 32)
-        dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
-            None, 3, 32, 32)
-    else:
-        class_dim = 102
-        raw_shape = (3, 224, 224)
-        dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
-            None, 3, 224, 224)
-
-    device = tf.train.replica_device_setter(
-        worker_device="/job:worker/task:{}".format(args.task_index),
-        cluster=cluster_spec)
-
-    with tf.device(device):
-        images = tf.placeholder(tf.float32, shape=dat_shape)
-        labels = tf.placeholder(tf.int64, shape=(None, ))
-        is_training = tf.placeholder('bool')
-        onehot_labels = tf.one_hot(labels, depth=class_dim)
-
-        vgg16 = VGG16Model()
-        logits = vgg16.network(images, class_dim, is_training)
-        loss = tf.losses.softmax_cross_entropy(
-            onehot_labels=onehot_labels, logits=logits)
-        avg_loss = tf.reduce_mean(loss)
-
-        correct = tf.equal(tf.argmax(logits, 1), labels)
-        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
-
-        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
-        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
-        global_step = tf.Variable(0, name='global_step', trainable=False)
-        with tf.control_dependencies(update_ops):
-            train_op = optimizer.minimize(avg_loss, global_step=global_step)
-
-        summary_op = tf.summary.merge_all()
-        init_op = tf.global_variables_initializer()
-
-    # data reader
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.train10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-    test_reader = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.cifar.test10()
-            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
-            buf_size=5120),
-        batch_size=args.batch_size)
-
-    # test
-    def test():
-        test_accs = []
-        for batch_id, data in enumerate(test_reader()):
-            test_images = np.array(
-         map(lambda x: np.transpose(x[0].reshape(raw_shape),
-         axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
-            test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
-            test_accs.append(
-                accuracy.eval(feed_dict={
-                    images: test_images,
-                    labels: test_labels,
-                    is_training: False
-                }))
-        return np.mean(test_accs)
-
-    config = tf.ConfigProto(
-        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
-    config.gpu_options.allow_growth = True
-
-    hooks = [tf.train.StopAtStepHook(last_step=1000000)]
-
-    with tf.train.MonitoredTrainingSession(
-            master=server.target, is_chief=(args.task_index == 0),
-            hooks=hooks) as sess:
-        iters, num_samples, start_time = 0, 0, 0.0
-        for pass_id in range(args.num_passes):
-            # train
-            num_samples = 0
-            start_time = time.time()
-            for batch_id, data in enumerate(train_reader()):
-                train_images = np.array(
-                    map(lambda x: np.transpose(x[0].reshape(raw_shape),
-                    axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
-                train_labels = np.array(map(lambda x: x[1], data)).astype(
-                    'int64')
-                iter_begin_time = time.time()
-                _, loss, acc = sess.run([train_op, avg_loss, accuracy],
-                                        feed_dict={
-                                            images: train_images,
-                                            labels: train_labels,
-                                            is_training: True
-                                        })
-                iters += 1
-                print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed=%.2f imgs/sec"
-                    % (pass_id, iters, loss, acc,
-                       len(data) / (time.time() - iter_begin_time)))
-                num_samples += len(data)
-            train_elapsed = time.time() - start_time
-            # test
-            pass_test_acc = test()
-            print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
-                  (pass_id, num_samples / train_elapsed, pass_test_acc))
-
-
-def print_arguments():
-    print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
-        print('%s: %s' % (arg, value))
-    print('------------------------------------------------')
-
-
-if __name__ == '__main__':
-    print_arguments()
-
-    ps_hosts = args.ps_hosts.split(",")
-    worker_hosts = args.worker_hosts.split(",")
-
-    # Create a cluster from the parameter server and worker hosts.
-    cluster_spec = tf.train.ClusterSpec({
-        "ps": ps_hosts,
-        "worker": worker_hosts
-    })
-
-    # Create and start a server for the local task.
-    server = tf.train.Server(
-        cluster_spec, job_name=args.job_name, task_index=args.task_index)
-
-    if args.job_name == "ps":
-        print("start pserver")
-        server.join()
-    elif args.job_name == "worker":
-        print("start worker")
-        run_benchmark(cluster_spec, server)
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
deleted file mode 100644
index 1a66af32d7131997c63bd3c3042875f33a467084..0000000000000000000000000000000000000000
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import gzip
-
-import paddle.v2.dataset.cifar as cifar
-import paddle.v2 as paddle
-import time
-import os
-
-DATA_DIM = 3 * 32 * 32
-CLASS_DIM = 10
-BATCH_SIZE = os.getenv("BATCH_SIZE")
-if BATCH_SIZE:
-    BATCH_SIZE = int(BATCH_SIZE)
-else:
-    BATCH_SIZE = 128
-print "batch_size", BATCH_SIZE
-NODE_COUNT = int(os.getenv("TRAINERS"))
-ts = 0
-
-
-def vgg(input, nums, class_dim):
-    def conv_block(input, num_filter, groups, num_channels=None):
-        return paddle.networks.img_conv_group(
-            input=input,
-            num_channels=num_channels,
-            pool_size=2,
-            pool_stride=2,
-            conv_num_filter=[num_filter] * groups,
-            conv_filter_size=3,
-            conv_act=paddle.activation.Relu(),
-            pool_type=paddle.pooling.Max())
-
-    assert len(nums) == 5
-    # the channel of input feature is 3
-    conv1 = conv_block(input, 64, nums[0], 3)
-    conv2 = conv_block(conv1, 128, nums[1])
-    conv3 = conv_block(conv2, 256, nums[2])
-    conv4 = conv_block(conv3, 512, nums[3])
-    conv5 = conv_block(conv4, 512, nums[4])
-
-    fc_dim = 512
-    fc1 = paddle.layer.fc(input=conv5,
-                          size=fc_dim,
-                          act=paddle.activation.Relu(),
-                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
-    fc2 = paddle.layer.fc(input=fc1,
-                          size=fc_dim,
-                          act=paddle.activation.Relu(),
-                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
-    out = paddle.layer.fc(input=fc2,
-                          size=class_dim,
-                          act=paddle.activation.Softmax())
-    return out
-
-
-def vgg13(input, class_dim):
-    nums = [2, 2, 2, 2, 2]
-    return vgg(input, nums, class_dim)
-
-
-def vgg16(input, class_dim):
-    nums = [2, 2, 3, 3, 3]
-    return vgg(input, nums, class_dim)
-
-
-def vgg19(input, class_dim):
-    nums = [2, 2, 4, 4, 4]
-    return vgg(input, nums, class_dim)
-
-
-def main():
-    global ts
-    paddle.init(use_gpu=False)
-    image = paddle.layer.data(
-        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
-    lbl = paddle.layer.data(
-        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
-
-    extra_layers = None
-    # NOTE: for v2 distributed training need averaging updates.
-    learning_rate = 1e-3 / NODE_COUNT
-    out = vgg16(image, class_dim=CLASS_DIM)
-    cost = paddle.layer.classification_cost(input=out, label=lbl)
-
-    # Create parameters
-    parameters = paddle.parameters.create(cost)
-
-    # Create optimizer
-    optimizer = paddle.optimizer.Momentum(
-        momentum=0.9,
-        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
-                                                         BATCH_SIZE),
-        learning_rate=learning_rate / BATCH_SIZE,
-        learning_rate_decay_a=0.1,
-        learning_rate_decay_b=128000 * 35,
-        learning_rate_schedule="discexp", )
-
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            cifar.train10(),
-            # To use other data, replace the above line with:
-            # reader.train_reader('train.list'),
-            buf_size=1000),
-        batch_size=BATCH_SIZE)
-    test_reader = paddle.batch(
-        cifar.test10(),
-        # To use other data, replace the above line with:
-        # reader.test_reader('val.list'),
-        batch_size=BATCH_SIZE)
-
-    # Create trainer
-    trainer = paddle.trainer.SGD(cost=cost,
-                                 parameters=parameters,
-                                 update_equation=optimizer,
-                                 extra_layers=extra_layers,
-                                 is_local=False)
-
-    # End batch and end pass event handler
-    def event_handler(event):
-        global ts, ts_pass
-        if isinstance(event, paddle.event.BeginPass):
-            ts_pass = time.time()
-        if isinstance(event, paddle.event.BeginIteration):
-            ts = time.time()
-        if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 1 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics,
-                    time.time() - ts)
-        if isinstance(event, paddle.event.EndPass):
-            print "Pass %d end, spent: %f" % (event.pass_id,
-                                              time.time() - ts_pass)
-            result = trainer.test(reader=test_reader)
-            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
-
-    trainer.train(
-        reader=train_reader, num_passes=200, event_handler=event_handler)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/benchmark/fluid/Dockerfile b/benchmark/fluid/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..707fadb1fae97cefe8a41715cd57d71754abda41
--- /dev/null
+++ b/benchmark/fluid/Dockerfile
@@ -0,0 +1,31 @@
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+
+# Use UBUNTU_MIRROR can speed up apt-get speed.
+# ARG UBUNTU_MIRROR
+# RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+RUN apt-get update && apt-get install -y python python-pip iputils-ping libgtk2.0-dev wget vim net-tools iftop python-opencv
+RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/lib/libcudnn.so && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/libnccl.so
+
+# IMPORTANT:
+# Add "ENV http_proxy=http://ip:port" if your download is slow, and don't forget to unset it at runtime.
+# exmaple: unset http_proxy && unset https_proxy && python fluid_benchmark.py ...
+
+RUN pip install -U pip
+RUN pip install -U kubernetes paddlepaddle
+
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()\npaddle.dataset.flowers.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.mnist.train()\npaddle.dataset.mnist.test()\npaddle.dataset.imdb.fetch()" | python'
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.imikolov.fetch()" | python'
+RUN pip uninstall -y paddlepaddle && mkdir /workspace
+
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN chmod +x /usr/bin/paddle_k8s
+
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl 
+
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD fluid_benchmark.py recordio_converter.py args.py recordio_converter.py run.sh run_fluid_benchmark.sh /workspace/
+ADD models/ /workspace/models/
diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..28cade4634bb62723bf5120169e202657f548234
--- /dev/null
+++ b/benchmark/fluid/README.md
@@ -0,0 +1,99 @@
+# Fluid Benchmark
+
+This directory contains several models configurations and tools that used to run
+Fluid benchmarks for local and distributed training.
+
+
+## Run the Benchmark
+
+To start, run the following command to get the full help message:
+
+```bash
+python fluid_benchmark.py --help
+```
+
+Currently supported `--model` argument include:
+
+* mnist
+* resnet
+    * you can chose to use different dataset using `--data_set cifar10` or
+      `--data_set flowers`.
+* vgg
+* stacked_dynamic_lstm
+* machine_translation
+
+* Run the following command to start a benchmark job locally:
+    ```bash
+      python fluid_benchmark.py --model mnist --device GPU
+    ```
+    You can choose to use GPU/CPU training. With GPU training, you can specify
+    `--gpus <gpu_num>` to run multi GPU training.
+    You can set async mode parameter server. With async mode, you can specify
+    `--async_mode` to train model asynchronous.
+* Run distributed training with parameter servers:
+    * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example.
+    * start parameter servers:
+        ```bash
+        PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
+        sleep 15
+        ```
+    * start trainers:
+        ```bash
+        PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=1 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist  --device GPU --update_method pserver
+        ```
+* Run distributed training using NCCL2
+    ```bash
+    PADDLE_PSERVER_PORT=7164 PADDLE_TRAINER_IPS=192.168.0.2,192.168.0.3  PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model mnist --device GPU --update_method nccl2
+    ```
+
+## Prepare the RecordIO file to Achieve Better Performance
+
+Run the following command will generate RecordIO files like "mnist.recordio" under the path
+and batch_size you choose, you can use batch_size=1 so that later reader can change the batch_size
+at any time using `fluid.batch`.
+
+```bash
+python -c 'from recordio_converter import *; prepare_mnist("data", 1)'
+```
+
+## Run Distributed Benchmark on Kubernetes Cluster
+
+You may need to build a Docker image before submitting a cluster job onto Kubernetes, or you will
+have to start all those processes mannually on each node, which is not recommended.
+
+To build the Docker image, you need to choose a paddle "whl" package to run with, you may either
+download it from
+http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_en.html or
+build it by your own. Once you've got the "whl" package, put it under the current directory and run:
+
+```bash
+docker build -t [your docker image name]:[your docker image tag] .
+```
+
+Then push the image to a Docker registry that your Kubernetes cluster can reach.
+
+We provide a script `kube_gen_job.py` to generate Kubernetes yaml files to submit
+distributed benchmark jobs to your cluster. To generate a job yaml, just run:
+
+```bash
+python kube_gen_job.py --jobname myjob --pscpu 4 --cpu 8 --gpu 8 --psmemory 20 --memory 40 --pservers 4 --trainers 4 --entry "python fluid_benchmark.py --model mnist --gpus 8 --device GPU --update_method pserver " --disttype pserver
+```
+
+Then the yaml files are generated under directory `myjob`, you can run:
+
+```bash
+kubectl create -f myjob/
+```
+
+The job shall start.
+
+
+## Notes for Run Fluid Distributed with NCCL2 and RDMA
+
+Before running NCCL2 distributed jobs, please check that whether your node has multiple network
+interfaces, try to add the environment variable `export NCCL_SOCKET_IFNAME=eth0` to use your actual
+network device.
+
+To run high-performance distributed training, you must prepare your hardware environment to be
+able to run RDMA enabled network communication, please check out [this](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/cluster/nccl2_rdma_training.md)
+note for details.
diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
new file mode 100644
index 0000000000000000000000000000000000000000..a79f25ccc6ace1594f3f331633130eaace5e175b
--- /dev/null
+++ b/benchmark/fluid/args.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+__all__ = ['parse_args', ]
+
+BENCHMARK_MODELS = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Fluid model benchmarks.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=BENCHMARK_MODELS,
+        default='resnet',
+        help='The model to run benchmark with.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    #  args related to learning rate
+    parser.add_argument(
+        '--learning_rate', type=float, default=0.001, help='The learning rate.')
+    # TODO(wuyi): add "--use_fake_data" option back.
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations', type=int, default=80, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=100, help='The number of passes.')
+    parser.add_argument(
+        '--data_format',
+        type=str,
+        default='NCHW',
+        choices=['NCHW', 'NHWC'],
+        help='The data data_format, now only support NCHW.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--gpus',
+        type=int,
+        default=1,
+        help='If gpus > 1, will use ParallelExecutor to run, else use Executor.')
+    # this option is available only for vgg and resnet.
+    parser.add_argument(
+        '--cpus',
+        type=int,
+        default=1,
+        help='If cpus > 1, will use ParallelDo to run, else use Executor.')
+    parser.add_argument(
+        '--data_set',
+        type=str,
+        default='flowers',
+        choices=['cifar10', 'flowers'],
+        help='Optional dataset for benchmark.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    parser.add_argument(
+        '--no_test',
+        action='store_true',
+        help='If set, do not test the testset during training.')
+    parser.add_argument(
+        '--memory_optimize',
+        action='store_true',
+        help='If set, optimize runtime memory before start.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='If set ommit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
+    parser.add_argument(
+        '--update_method',
+        type=str,
+        default='local',
+        choices=['local', 'pserver', 'nccl2'],
+        help='Choose parameter update method, can be local, pserver, nccl2.')
+    parser.add_argument(
+        '--no_split_var',
+        action='store_true',
+        default=False,
+        help='Whether split variables into blocks when update_method is pserver')
+    parser.add_argument(
+        '--async_mode',
+        action='store_true',
+        default=False,
+        help='Whether start pserver in async mode to support ASGD')
+    parser.add_argument(
+        '--use_reader_op',
+        action='store_true',
+        help='Whether to use reader op, and must specify the data path if set this to true.'
+    )
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default="",
+        help='Directory that contains all the training recordio files.')
+    parser.add_argument(
+        '--use_inference_transpiler',
+        action='store_true',
+        help='If set, use inference transpiler to optimize the program.')
+    parser.add_argument(
+        '--no_random',
+        action='store_true',
+        help='If set, keep the random seed and do not shuffle the data.')
+    args = parser.parse_args()
+    return args
diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..94ea7bd6aca7c9595037a2dacc5e36d4c77827e7
--- /dev/null
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -0,0 +1,371 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import cProfile
+import time
+import os
+
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler
+
+from args import *
+
+
+def append_nccl2_prepare(trainer_id):
+    if trainer_id >= 0:
+        # append gen_nccl_id at the end of startup program
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        port = os.getenv("PADDLE_PSERVER_PORT")
+        worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+        worker_endpoints = []
+        for ip in worker_ips.split(","):
+            worker_endpoints.append(':'.join([ip, port]))
+        num_trainers = len(worker_endpoints)
+        current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
+        worker_endpoints.remove(current_endpoint)
+
+        nccl_id_var = fluid.default_startup_program().global_block().create_var(
+            name="NCCLID",
+            persistable=True,
+            type=fluid.core.VarDesc.VarType.RAW)
+        fluid.default_startup_program().global_block().append_op(
+            type="gen_nccl_id",
+            inputs={},
+            outputs={"NCCLID": nccl_id_var},
+            attrs={
+                "endpoint": current_endpoint,
+                "endpoint_list": worker_endpoints,
+                "trainer_id": trainer_id
+            })
+        return nccl_id_var, num_trainers, trainer_id
+    else:
+        raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
+                        "nccl-based dist train.")
+
+
+def dist_transpile(trainer_id, args):
+    if trainer_id < 0:
+        return None, None
+
+    # the port of all pservers, needed by both trainer and pserver
+    port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+    # comma separated ips of all pservers, needed by trainer and
+    # pserver
+    pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+    eplist = []
+    for ip in pserver_ips.split(","):
+        eplist.append(':'.join([ip, port]))
+    pserver_endpoints = ",".join(eplist)
+    # total number of workers/trainers in the job, needed by
+    # trainer and pserver
+    trainers = int(os.getenv("PADDLE_TRAINERS"))
+    # the IP of the local machine, needed by pserver only
+    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+    # the role, should be either PSERVER or TRAINER
+    training_role = os.getenv("PADDLE_TRAINING_ROLE")
+
+    t = distribute_transpiler.DistributeTranspiler()
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=not args.async_mode,
+        slice_var_up=not args.no_split_var)
+    if training_role == "PSERVER":
+        pserver_program = t.get_pserver_program(current_endpoint)
+        pserver_startup_program = t.get_startup_program(current_endpoint,
+                                                        pserver_program)
+        return pserver_program, pserver_startup_program
+    elif training_role == "TRAINER":
+        train_program = t.get_trainer_program()
+        return train_program, fluid.default_startup_program()
+    else:
+        raise ValueError(
+            'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+        )
+
+
+def test(exe, inference_program, test_reader, feeder, batch_acc):
+    accuracy_evaluator = fluid.metrics.Accuracy()
+    for batch_id, data in enumerate(test_reader()):
+        acc = exe.run(inference_program,
+                      feed=feeder.feed(data),
+                      fetch_list=[batch_acc])
+        accuracy_evaluator.update(value=np.array(acc), weight=len(data))
+
+    return accuracy_evaluator.eval()
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc,
+          args, train_prog, startup_prog):
+    if os.getenv("PADDLE_TRAINING_ROLE") == "PSERVER":
+        place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        exe.run(train_prog)
+        return
+
+    if args.use_fake_data:
+        raise Exception(
+            "fake data is not supported in single GPU test for now.")
+
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(startup_prog)
+
+    # Use inference_transpiler to speedup
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
+
+    iters, num_samples, start_time = 0, 0, time.time()
+    for pass_id in range(args.pass_num):
+        train_losses = []
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+
+            if args.use_reader_op:
+                try:
+                    loss = exe.run(train_prog, fetch_list=[avg_loss])
+                except fluid.core.EnforceNotMet as ex:
+                    break
+            else:
+                loss = exe.run(train_prog,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_loss])
+            iters += 1
+            batch_id += 1
+            # FIXME(wuyi): For use_reader_op, if the current
+            # pass is not the last, the last batch of this pass
+            # is also equal to args.batch_size.
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
+            train_losses.append(loss)
+            print("Pass: %d, Iter: %d, Loss: %f\n" %
+                  (pass_id, iters, np.mean(train_losses)))
+        print_train_time(start_time, time.time(), num_samples)
+        print("Pass: %d, Loss: %f" % (pass_id, np.mean(train_losses))),
+        # evaluation
+        if not args.no_test and batch_acc and not args.use_reader_op:
+            if args.use_inference_transpiler:
+                t = fluid.InferenceTranspiler()
+                t.transpile(infer_prog, place)
+
+            pass_test_acc = test(exe, infer_prog, test_reader, feeder,
+                                 batch_acc)
+            print(", Test Accuracy: %f" % pass_test_acc)
+        print("\n")
+        # TODO(wuyi): add warmup passes to get better perf data.
+        exit(0)
+
+
+# TODO(wuyi): replace train, train_parallel, test functions with new trainer
+# API once it is ready.
+def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
+                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
+                   num_trainers, trainer_id):
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    if not args.use_reader_op:
+        feed_var_list = [
+            var for var in train_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+        feeder = fluid.DataFeeder(feed_var_list, place)
+
+    # generate fake:
+    if args.use_fake_data:
+        for var in feed_var_list:
+            v = startup_prog.global_block().clone_variable(var)
+            var.persistable = True
+            v.persistable = True
+
+            real_shape = list(var.shape)
+            real_shape[0] = args.batch_size / args.gpus
+            startup_prog.global_block().append_op(
+                outputs={"Out": v},
+                type="fill_constant",
+                attrs={"shape": real_shape,
+                       "value": 1.0,
+                       "dtype": var.dtype})
+
+    if nccl_id_var and trainer_id == 0:
+        #FIXME(wuyi): wait other trainer to start listening
+        time.sleep(30)
+
+    startup_exe = fluid.Executor(place)
+    startup_exe.run(startup_prog)
+    strategy = fluid.ExecutionStrategy()
+    strategy.num_threads = 1
+    strategy.allow_op_delay = False
+    exe = fluid.ParallelExecutor(
+        True,
+        avg_loss.name,
+        exec_strategy=strategy,
+        num_trainers=num_trainers,
+        trainer_id=trainer_id)
+
+    for pass_id in range(args.pass_num):
+        num_samples = 0
+        iters = 0
+        start_time = time.time()
+        if not args.use_reader_op:
+            reader_generator = train_reader()
+        batch_id = 0
+        data = None
+        while True:
+            if not args.use_reader_op:
+                data = next(reader_generator, None)
+                if data == None:
+                    break
+            if iters == args.iterations:
+                break
+            if args.profile and pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif args.profile and pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
+
+            if iters == args.skip_batch_num:
+                start_time = time.time()
+                num_samples = 0
+            if args.use_fake_data or args.use_reader_op:
+                try:
+                    loss, = exe.run([avg_loss.name])
+                except fluid.core.EnforceNotMet as ex:
+                    break
+            else:
+                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
+            if args.use_reader_op:
+                num_samples += args.batch_size * args.gpus
+            else:
+                num_samples += len(data)
+            iters += 1
+            if batch_id % 1 == 0:
+                print("Pass %d, batch %d, loss %s" %
+                      (pass_id, batch_id, np.array(loss)))
+            batch_id += 1
+
+        print_train_time(start_time, time.time(), num_samples)
+        if not args.no_test and batch_acc and not args.use_reader_op:
+            # we have not implement record io for test
+            # skip test when use args.use_reader_op
+            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
+                            batch_acc)
+            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    print('----------- Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def print_train_time(start_time, end_time, num_samples):
+    train_elapsed = end_time - start_time
+    examples_per_sec = num_samples / train_elapsed
+    print('\nTotal examples: %d, total time: %.5f, %.5f examples/sed\n' %
+          (num_samples, train_elapsed, examples_per_sec))
+
+
+def print_paddle_envs():
+    print('----------- Configuration envs -----------')
+    for k in os.environ:
+        if "PADDLE_" in k:
+            print "ENV %s:%s" % (k, os.environ[k])
+    print('------------------------------------------------')
+
+
+def main():
+    args = parse_args()
+    print_arguments(args)
+    print_paddle_envs()
+    if args.no_random:
+        fluid.default_startup_program().random_seed = 1
+
+    # the unique trainer id, starting from 0, needed by trainer
+    # only
+    nccl_id_var, num_trainers, trainer_id = (
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "0")))
+
+    if args.use_cprof:
+        pr = cProfile.Profile()
+        pr.enable()
+    model_def = __import__("models.%s" % args.model, fromlist=["models"])
+    train_args = list(model_def.get_model(args))
+    train_args.append(args)
+    # Run optimizer.minimize(avg_loss)
+    train_args[2].minimize(train_args[0])
+    if args.memory_optimize:
+        fluid.memory_optimize(fluid.default_main_program())
+
+    if args.update_method == "pserver":
+        train_prog, startup_prog = dist_transpile(trainer_id, args)
+        if not train_prog:
+            raise Exception(
+                "Must configure correct environments to run dist train.")
+        train_args.extend([train_prog, startup_prog])
+        if args.gpus > 1 and os.getenv("PADDLE_TRAINING_ROLE") == "TRAINER":
+            train_args.extend([nccl_id_var, num_trainers, trainer_id])
+            train_parallel(*train_args)
+        train(*train_args)
+        exit(0)
+
+    # for other update methods, use default programs
+    train_args.append(fluid.default_main_program())
+    train_args.append(fluid.default_startup_program())
+
+    if args.update_method == "nccl2":
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
+    if args.gpus == 1:
+        # NOTE: parallel executor use profiler interanlly
+        if args.use_nvprof and args.device == 'GPU':
+            with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+                train(*train_args)
+        else:
+            train(*train_args)
+    else:
+        if args.device == "CPU":
+            raise Exception("Only support GPU perf with parallel exe")
+        train_args.extend([nccl_id_var, num_trainers, trainer_id])
+        train_parallel(*train_args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmark/fluid/kube_gen_job.py b/benchmark/fluid/kube_gen_job.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfe8b5cdd58456902fa8ec355e9837dface3f7be
--- /dev/null
+++ b/benchmark/fluid/kube_gen_job.py
@@ -0,0 +1,197 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+import copy
+import argparse
+import random
+import os
+import copy
+from kube_templates import pserver, trainer, envs
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Generate dist job yamls.')
+
+    parser.add_argument(
+        '--jobname', default="paddlejob", help='unique job name')
+    parser.add_argument(
+        '--cpu', default=1, type=int, help='CPU cores per trainer node')
+    parser.add_argument(
+        '--pscpu', default=1, type=int, help='CPU cores per pserver node')
+    parser.add_argument(
+        '--gpu', default=0, type=int, help='num of GPUs per node')
+    parser.add_argument(
+        '--image',
+        default="bootstrapper:5000/fluid_benchmark:gpu",
+        help='num of GPUs per node')
+    parser.add_argument(
+        '--pservers', default=1, type=int, help='num of pservers')
+    parser.add_argument(
+        '--trainers', default=1, type=int, help='num of trainers')
+    parser.add_argument('--memory', default=1, type=int, help='trainer memory')
+    parser.add_argument(
+        '--psmemory', default=1, type=int, help='pserver memory')
+    parser.add_argument(
+        '--port', default=30236, type=int, help='num of trainers')
+    parser.add_argument(
+        '--entry', default="python train.py", help='command to run')
+    parser.add_argument(
+        '--fluid', default=1, type=int, help='whether is fluid job')
+    parser.add_argument(
+        '--rdma', action='store_true', help='whether mount rdma libs')
+    parser.add_argument(
+        '--disttype',
+        default="pserver",
+        type=str,
+        choices=['pserver', 'nccl2', 'local'],
+        help='pserver or nccl2 or local')
+
+    args = parser.parse_args()
+    return args
+
+
+def gen_job():
+    ps = pserver
+    tn = trainer
+    args = parse_args()
+
+    ps_container = ps["spec"]["template"]["spec"]["containers"][0]
+    tn_container = tn["spec"]["template"]["spec"]["containers"][0]
+
+    if args.fluid == 1:
+        ps_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+        tn_container["command"] = \
+            ["paddle_k8s", "start_fluid"]
+    ps["metadata"]["name"] = args.jobname + "-pserver"
+    ps["spec"]["template"]["metadata"]["labels"][
+        "paddle-job-pserver"] = args.jobname
+    tn["metadata"]["name"] = args.jobname + "-trainer"
+    tn["spec"]["template"]["metadata"]["labels"]["paddle-job"] = args.jobname
+
+    ps_container["image"] = args.image
+    tn_container["image"] = args.image
+
+    ps_container["resources"]["requests"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["requests"]["memory"] = str(args.psmemory) + "Gi"
+    ps_container["resources"]["limits"]["cpu"] = str(args.pscpu)
+    ps_container["resources"]["limits"]["memory"] = str(args.psmemory) + "Gi"
+
+    tn_container["resources"]["requests"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["requests"]["memory"] = str(args.memory) + "Gi"
+    tn_container["resources"]["limits"]["cpu"] = str(args.cpu)
+    tn_container["resources"]["limits"]["memory"] = str(args.memory) + "Gi"
+    if args.gpu > 0:
+        tn_container["resources"]["requests"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+        tn_container["resources"]["limits"][
+            "alpha.kubernetes.io/nvidia-gpu"] = str(args.gpu)
+
+    ps["spec"]["replicas"] = int(args.pservers)
+    tn["spec"]["parallelism"] = int(args.trainers)
+    tn["spec"]["completions"] = int(args.trainers)
+    ps_container["ports"][0]["name"] = "jobport-" + str(args.port)
+    ps_container["ports"][0]["containerPort"] = args.port
+    spreadport = random.randint(40000, 60000)
+    tn_container["ports"][0]["name"] = "spr-" + str(spreadport)
+    tn_container["ports"][0]["containerPort"] = spreadport
+
+    envs.append({"name": "PADDLE_JOB_NAME", "value": args.jobname})
+    envs.append({"name": "PADDLE_TRAINERS", "value": str(args.trainers)})
+    envs.append({"name": "PADDLE_PSERVERS", "value": str(args.pservers)})
+    envs.append({"name": "ENTRY", "value": args.entry})
+    envs.append({"name": "PADDLE_PSERVER_PORT", "value": str(args.port)})
+    # NOTE: these directories below are cluster specific, please modify
+    # this settings before you run on your own cluster.
+    envs.append({
+        "name": "LD_LIBRARY_PATH",
+        "value":
+        "/usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind"
+    })
+
+    volumes = [{
+        "name": "nvidia-driver",
+        "hostPath": {
+            "path": "/usr/local/nvidia/lib64"
+        }
+    }]
+    volumeMounts = [{
+        "mountPath": "/usr/local/nvidia/lib64",
+        "name": "nvidia-driver"
+    }]
+
+    if args.rdma:
+        volumes.extend([{
+            "name": "ibetc",
+            "hostPath": {
+                "path": "/etc/libibverbs.d"
+            }
+        }, {
+            "name": "iblibs",
+            "hostPath": {
+                "path": "/usr/local/rdma"
+            }
+        }, {
+            "name": "valgrind",
+            "hostPath": {
+                "path": "/usr/lib64/mlnx_ofed/valgrind"
+            }
+        }])
+        volumeMounts.extend([{
+            "mountPath": "/etc/libibverbs.d",
+            "name": "ibetc"
+        }, {
+            "mountPath": "/usr/local/rdma",
+            "name": "iblibs"
+        }, {
+            "mountPath": "/usr/lib64/mlnx_ofed/valgrind",
+            "name": "valgrind"
+        }])
+        # append shm for NCCL2
+        volumes.append({"name": "dshm", "emptyDir": {"medium": "Memory"}})
+        volumeMounts.append({"mountPath": "/dev/shm", "name": "dshm"})
+
+    tn["spec"]["template"]["spec"]["volumes"] = volumes
+    tn_container["volumeMounts"] = volumeMounts
+
+    ps_container["env"] = copy.deepcopy(envs)
+    ps_container["env"].append({
+        "name": "PADDLE_TRAINING_ROLE",
+        "value": "PSERVER"
+    })
+    tn_container["env"] = envs
+    if args.disttype == "pserver":
+        tn_container["env"].append({
+            "name": "PADDLE_TRAINING_ROLE",
+            "value": "TRAINER"
+        })
+    elif args.disttype == "nccl2" or args.disttype == "local":
+        # NCCL2 have no training role, set to plain WORKER
+        tn_container["env"].append({
+            "name": "PADDLE_TRAINING_ROLE",
+            "value": "WORKER"
+        })
+
+    os.mkdir(args.jobname)
+    if args.disttype == "pserver":
+        with open("%s/pserver.yaml" % args.jobname, "w") as fn:
+            yaml.dump(ps, fn)
+
+    with open("%s/trainer.yaml" % args.jobname, "w") as fn:
+        yaml.dump(tn, fn)
+
+
+if __name__ == "__main__":
+    gen_job()
diff --git a/benchmark/fluid/kube_templates/__init__.py b/benchmark/fluid/kube_templates/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d09d940a5ee638e4b55405d05924e2d76006cfc
--- /dev/null
+++ b/benchmark/fluid/kube_templates/__init__.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pserver import pserver
+from trainer import trainer
+
+__all__ = ["pserver", "trainer", "envs"]
+
+envs = [
+    # envs that don't need to change
+    {
+        "name": "GLOG_v",
+        "value": "0"
+    },
+    {
+        "name": "GLOG_logtostderr",
+        "value": "1"
+    },
+    {
+        "name": "TOPOLOGY",
+        "value": ""
+    },
+    {
+        "name": "TRAINER_PACKAGE",
+        "value": "/workspace"
+    },
+    {
+        "name": "PADDLE_INIT_NICS",
+        "value": "eth2"
+    },
+    {
+        "name": "NAMESPACE",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "metadata.namespace"
+            }
+        }
+    },
+    {
+        "name": "POD_IP",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "status.podIP"
+            }
+        }
+    },
+    {
+        "name": "PADDLE_CURRENT_IP",
+        "valueFrom": {
+            "fieldRef": {
+                "fieldPath": "status.podIP"
+            }
+        }
+    }
+]
diff --git a/benchmark/fluid/kube_templates/pserver.py b/benchmark/fluid/kube_templates/pserver.py
new file mode 100644
index 0000000000000000000000000000000000000000..b54982c806ad4229fbd4bd7edf82a4e7eb4c5ad1
--- /dev/null
+++ b/benchmark/fluid/kube_templates/pserver.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+pserver = {
+    "apiVersion": "extensions/v1beta1",
+    "kind": "ReplicaSet",
+    "metadata": {
+        "name": "jobname-pserver"
+    },
+    "spec": {
+        "replicas": 1,
+        "template": {
+            "metadata": {
+                "labels": {
+                    "paddle-job-pserver": "jobname"
+                }
+            },
+            "spec": {
+                "hostNetwork": True,
+                "imagePullSecrets": [{
+                    "name": "job-registry-secret"
+                }],
+                "containers": [{
+                    "name": "pserver",
+                    "image": "",
+                    "imagePullPolicy": "Always",
+                    "ports": [{
+                        "name": "jobport-1",
+                        "containerPort": 1
+                    }],
+                    "env": [],
+                    "command": ["paddle_k8s", "start_pserver"],
+                    "resources": {
+                        "requests": {
+                            "memory": "10Gi",
+                            "cpu": "4"
+                        },
+                        "limits": {
+                            "memory": "10Gi",
+                            "cpu": "4"
+                        }
+                    }
+                }]
+            }
+        }
+    }
+}
diff --git a/benchmark/fluid/kube_templates/trainer.py b/benchmark/fluid/kube_templates/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b915d31e371d9d787ff64d705e32baf301e16abe
--- /dev/null
+++ b/benchmark/fluid/kube_templates/trainer.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+trainer = {
+    "apiVersion": "batch/v1",
+    "kind": "Job",
+    "metadata": {
+        "name": "jobname-pserver"
+    },
+    "spec": {
+        "parallelism": 4,
+        "completions": 4,
+        "template": {
+            "metadata": {
+                "labels": {
+                    "paddle-job": "jobname"
+                }
+            },
+            "spec": {
+                "hostNetwork": True,
+                "imagePullSecrets": [{
+                    "name": "job-registry-secret"
+                }],
+                "restartPolicy": "Never",
+                "containers": [{
+                    "name": "trainer",
+                    "image": "",
+                    "imagePullPolicy": "Always",
+                    # to let container set rlimit
+                    "securityContext": {
+                        "privileged": True
+                        # TODO(wuyi): use below specific cap instead of privileged,
+                        # using privileged will cause all GPU device are visible
+                        # in the container.
+                        # "capabilities": {
+                        #     "add": ["SYS_RESOURCE"]
+                        # }
+                    },
+                    "ports": [{
+                        "name": "jobport-1",
+                        "containerPort": 1
+                    }],
+                    "env": [],
+                    "command": ["paddle_k8s", "start_trainer", "v2"],
+                    "resources": {
+                        "requests": {
+                            "memory": "10Gi",
+                            "cpu": "4",
+                        },
+                        "limits": {
+                            "memory": "10Gi",
+                            "cpu": "4",
+                        }
+                    }
+                }]
+            }
+        }
+    }
+}
diff --git a/benchmark/fluid/models/__init__.py b/benchmark/fluid/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c3fcac8dd4a1ba0496ef013bd4eb468a0075125
--- /dev/null
+++ b/benchmark/fluid/models/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm"
+]
diff --git a/benchmark/fluid/models/machine_translation.py b/benchmark/fluid/models/machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..17f6b03826ae818a3671ea7f9355a8e8c04b50be
--- /dev/null
+++ b/benchmark/fluid/models/machine_translation.py
@@ -0,0 +1,219 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""seq2seq model for fluid."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import distutils.util
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+from paddle.fluid.executor import Executor
+
+
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+    def linear(inputs):
+        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+    cell_t = fluid.layers.sums(input=[
+        fluid.layers.elementwise_mul(
+            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+                x=input_gate, y=cell_tilde)
+    ])
+
+    hidden_t = fluid.layers.elementwise_mul(
+        x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+    return hidden_t, cell_t
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+                   target_dict_dim, is_generating, beam_size, max_length):
+    """Construct a seq2seq network."""
+
+    def bi_lstm_encoder(input_seq, gate_size):
+        # Linear transformation part for input gate, output gate, forget gate
+        # and cell activation vectors need be done outside of dynamic_lstm.
+        # So the output size is 4 times of gate_size.
+        input_forward_proj = fluid.layers.fc(input=input_seq,
+                                             size=gate_size * 4,
+                                             act=None,
+                                             bias_attr=False)
+        forward, _ = fluid.layers.dynamic_lstm(
+            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
+        input_reversed_proj = fluid.layers.fc(input=input_seq,
+                                              size=gate_size * 4,
+                                              act=None,
+                                              bias_attr=False)
+        reversed, _ = fluid.layers.dynamic_lstm(
+            input=input_reversed_proj,
+            size=gate_size * 4,
+            is_reverse=True,
+            use_peepholes=False)
+        return forward, reversed
+
+    src_word_idx = fluid.layers.data(
+        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    src_embedding = fluid.layers.embedding(
+        input=src_word_idx,
+        size=[source_dict_dim, embedding_dim],
+        dtype='float32')
+
+    src_forward, src_reversed = bi_lstm_encoder(
+        input_seq=src_embedding, gate_size=encoder_size)
+
+    encoded_vector = fluid.layers.concat(
+        input=[src_forward, src_reversed], axis=1)
+
+    encoded_proj = fluid.layers.fc(input=encoded_vector,
+                                   size=decoder_size,
+                                   bias_attr=False)
+
+    backward_first = fluid.layers.sequence_pool(
+        input=src_reversed, pool_type='first')
+
+    decoder_boot = fluid.layers.fc(input=backward_first,
+                                   size=decoder_size,
+                                   bias_attr=False,
+                                   act='tanh')
+
+    def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
+                                    decoder_boot, decoder_size):
+        def simple_attention(encoder_vec, encoder_proj, decoder_state):
+            decoder_state_proj = fluid.layers.fc(input=decoder_state,
+                                                 size=decoder_size,
+                                                 bias_attr=False)
+            decoder_state_expand = fluid.layers.sequence_expand(
+                x=decoder_state_proj, y=encoder_proj)
+            concated = fluid.layers.concat(
+                input=[encoder_proj, decoder_state_expand], axis=1)
+            attention_weights = fluid.layers.fc(input=concated,
+                                                size=1,
+                                                act='tanh',
+                                                bias_attr=False)
+            attention_weights = fluid.layers.sequence_softmax(
+                input=attention_weights)
+            weigths_reshape = fluid.layers.reshape(
+                x=attention_weights, shape=[-1])
+            scaled = fluid.layers.elementwise_mul(
+                x=encoder_vec, y=weigths_reshape, axis=0)
+            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
+            return context
+
+        rnn = fluid.layers.DynamicRNN()
+
+        cell_init = fluid.layers.fill_constant_batch_size_like(
+            input=decoder_boot,
+            value=0.0,
+            shape=[-1, decoder_size],
+            dtype='float32')
+        cell_init.stop_gradient = False
+
+        with rnn.block():
+            current_word = rnn.step_input(target_embedding)
+            encoder_vec = rnn.static_input(encoder_vec)
+            encoder_proj = rnn.static_input(encoder_proj)
+            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+            cell_mem = rnn.memory(init=cell_init)
+            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
+            decoder_inputs = fluid.layers.concat(
+                input=[context, current_word], axis=1)
+            h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+            rnn.update_memory(hidden_mem, h)
+            rnn.update_memory(cell_mem, c)
+            out = fluid.layers.fc(input=h,
+                                  size=target_dict_dim,
+                                  bias_attr=True,
+                                  act='softmax')
+            rnn.output(out)
+        return rnn()
+
+    if not is_generating:
+        trg_word_idx = fluid.layers.data(
+            name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+        trg_embedding = fluid.layers.embedding(
+            input=trg_word_idx,
+            size=[target_dict_dim, embedding_dim],
+            dtype='float32')
+
+        prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
+                                                 encoded_proj, decoder_boot,
+                                                 decoder_size)
+        label = fluid.layers.data(
+            name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        feeding_list = ["source_sequence", "target_sequence", "label_sequence"]
+
+        return avg_cost, feeding_list
+
+
+def lodtensor_to_ndarray(lod_tensor):
+    dims = lod_tensor.get_dims()
+    ndarray = np.zeros(shape=dims).astype('float32')
+    for i in xrange(np.product(dims)):
+        ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+    return ndarray
+
+
+def get_model(args):
+    if args.use_reader_op:
+        raise Exception("machine_translation do not support reader op for now.")
+    embedding_dim = 512
+    encoder_size = 512
+    decoder_size = 512
+    dict_size = 30000
+    beam_size = 3
+    max_length = 250
+    avg_cost, feeding_list = seq_to_seq_net(
+        embedding_dim,
+        encoder_size,
+        decoder_size,
+        dict_size,
+        dict_size,
+        False,
+        beam_size=beam_size,
+        max_length=max_length)
+
+    # clone from default main program
+    inference_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+
+    train_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=args.batch_size * args.gpus)
+
+    test_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    return avg_cost, inference_program, optimizer, train_batch_generator, \
+           test_batch_generator, None
diff --git a/benchmark/fluid/models/mnist.py b/benchmark/fluid/models/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e740dc6896b7eeeb82170aa13d32987c4df5c48
--- /dev/null
+++ b/benchmark/fluid/models/mnist.py
@@ -0,0 +1,125 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import cProfile
+import os
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+
+SEED = 1
+DTYPE = "float32"
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+
+
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed settting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def get_model(args):
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1, 1, 28, 28], (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = cnn_model(pd.read_input(images))
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        # Train program
+        predict = cnn_model(images)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size * args.gpus)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+    return avg_cost, inference_program, opt, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d44a9c07d31cfae9d54ad5949b85c77e60eae258
--- /dev/null
+++ b/benchmark/fluid/models/resnet.py
@@ -0,0 +1,208 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import numpy as np
+import time
+import os
+
+import cProfile, pstats, StringIO
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.profiler as profiler
+from recordio_converter import imagenet_train, imagenet_test
+
+
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+    conv1 = fluid.layers.conv2d(
+        input=input,
+        filter_size=filter_size,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv1, act=act)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]  # if args.data_format == 'NCHW' else input.shape[-1]
+    if ch_in != ch_out:
+        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+    else:
+        return input
+
+
+def basicblock(input, ch_out, stride):
+    short = shortcut(input, ch_out, stride)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride):
+    short = shortcut(input, ch_out * 4, stride)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+    res_out = block_func(input, ch_out, stride)
+    for i in range(1, count):
+        res_out = block_func(res_out, ch_out, 1)
+    return res_out
+
+
+def resnet_imagenet(input, class_dim, depth=50, data_format='NCHW'):
+
+    cfg = {
+        18: ([2, 2, 2, 1], basicblock),
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    out = fluid.layers.fc(input=pool2, size=class_dim, act='softmax')
+    return out
+
+
+def resnet_cifar10(input, class_dim, depth=32, data_format='NCHW'):
+    assert (depth - 2) % 6 == 0
+
+    n = (depth - 2) // 6
+
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    out = fluid.layers.fc(input=pool, size=class_dim, act='softmax')
+    return out
+
+
+def get_model(args):
+    model = resnet_cifar10
+    if args.data_set == "cifar10":
+        class_dim = 10
+        if args.data_format == 'NCHW':
+            dshape = [3, 32, 32]
+        else:
+            dshape = [32, 32, 3]
+        model = resnet_cifar10
+        train_reader = paddle.dataset.cifar.train10()
+        test_reader = paddle.dataset.cifar.test10()
+    elif args.data_set == "flowers":
+        class_dim = 102
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        train_reader = paddle.dataset.flowers.train()
+        test_reader = paddle.dataset.flowers.test()
+    elif args.data_set == "imagenet":
+        class_dim = 1000
+        if args.data_format == 'NCHW':
+            dshape = [3, 224, 224]
+        else:
+            dshape = [224, 224, 3]
+        model = resnet_imagenet
+        if not args.data_path:
+            raise Exception(
+                "Must specify --data_path when training with imagenet")
+        train_reader = imagenet_train(args.data_path)
+        test_reader = imagenet_test(args.data_path)
+
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + dshape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        input, label = fluid.layers.read_file(data_file)
+    else:
+        input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.device == 'CPU' and args.cpus > 1:
+        places = fluid.layers.get_places(args.cpus)
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            predict = model(pd.read_input(input), class_dim)
+            label = pd.read_input(label)
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+            pd.write_output(avg_cost)
+            pd.write_output(batch_acc)
+
+        avg_cost, batch_acc = pd()
+        avg_cost = fluid.layers.mean(avg_cost)
+        batch_acc = fluid.layers.mean(batch_acc)
+    else:
+        predict = model(input, class_dim)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        batch_acc = fluid.layers.accuracy(input=predict, label=label)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc])
+
+    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+
+    batched_train_reader = paddle.batch(
+        train_reader if args.no_random else paddle.reader.shuffle(
+            train_reader, buf_size=5120),
+        batch_size=args.batch_size * args.gpus,
+        drop_last=True)
+    batched_test_reader = paddle.batch(
+        test_reader, batch_size=args.batch_size, drop_last=True)
+
+    return avg_cost, inference_program, optimizer, batched_train_reader,\
+                   batched_test_reader, batch_acc
diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3231542a17ace99a17c9f9b9bdb3c2527637d9ef
--- /dev/null
+++ b/benchmark/fluid/models/stacked_dynamic_lstm.py
@@ -0,0 +1,127 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import cPickle
+import os
+import random
+import time
+
+import numpy
+import paddle
+import paddle.dataset.imdb as imdb
+import paddle.fluid as fluid
+import paddle.batch as batch
+import paddle.fluid.profiler as profiler
+
+word_dict = imdb.word_dict()
+
+
+def crop_sentence(reader, crop_size):
+    unk_value = word_dict['<unk>']
+
+    def __impl__():
+        for item in reader():
+            if len([x for x in item[0] if x != unk_value]) < crop_size:
+                yield item
+
+    return __impl__
+
+
+def get_model(args):
+    if args.use_reader_op:
+        raise Exception(
+            "stacked_dynamic_lstm do not support reader op for now.")
+    lstm_size = 512
+    emb_dim = 512
+    crop_size = 1500
+
+    data = fluid.layers.data(
+        name="words", shape=[1], lod_level=1, dtype='int64')
+    sentence = fluid.layers.embedding(
+        input=data, size=[len(word_dict), emb_dim])
+
+    sentence = fluid.layers.fc(input=sentence, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+        def gate_common(
+                ipt,
+                hidden,
+                size, ):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            gate = fluid.layers.sums(input=[gate0, gate1])
+            return gate
+
+        forget_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        input_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        output_gate = fluid.layers.sigmoid(
+            x=gate_common(word, prev_hidden, lstm_size))
+        cell_gate = fluid.layers.tanh(
+            x=gate_common(word, prev_hidden, lstm_size))
+
+        cell = fluid.layers.sums(input=[
+            fluid.layers.elementwise_mul(
+                x=forget_gate, y=prev_cell), fluid.layers.elementwise_mul(
+                    x=input_gate, y=cell_gate)
+        ])
+
+        hidden = fluid.layers.elementwise_mul(
+            x=output_gate, y=fluid.layers.tanh(x=cell))
+
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
+
+    last = fluid.layers.sequence_pool(rnn(), 'last')
+    logit = fluid.layers.fc(input=last, size=2, act='softmax')
+    loss = fluid.layers.cross_entropy(
+        input=logit,
+        label=fluid.layers.data(
+            name='label', shape=[1], dtype='int64'))
+    loss = fluid.layers.mean(x=loss)
+
+    # add acc
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \
+                shape=[1], dtype='int64'), total=batch_size_tensor)
+
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    adam = fluid.optimizer.Adam()
+
+    train_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.train(word_dict), crop_size), buf_size=25000),
+        batch_size=args.batch_size * args.gpus)
+    test_reader = batch(
+        paddle.reader.shuffle(
+            crop_sentence(imdb.test(word_dict), crop_size), buf_size=25000),
+        batch_size=args.batch_size)
+
+    return loss, inference_program, adam, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..932601302d2f5d56b53e3462af886429034d8989
--- /dev/null
+++ b/benchmark/fluid/models/vgg.py
@@ -0,0 +1,121 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import argparse
+import functools
+import os
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+def get_model(args):
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+
+    if args.use_reader_op:
+        filelist = [
+            os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
+        ]
+        data_file = fluid.layers.open_files(
+            filenames=filelist,
+            shapes=[[-1] + data_shape, (-1, 1)],
+            lod_levels=[0, 0],
+            dtypes=["float32", "int64"],
+            thread_num=args.gpus,
+            pass_num=args.pass_num)
+        data_file = fluid.layers.double_buffer(
+            fluid.layers.batch(
+                data_file, batch_size=args.batch_size))
+        images, label = fluid.layers.read_file(data_file)
+    else:
+        images = fluid.layers.data(
+            name='data', shape=data_shape, dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        inference_program = fluid.io.get_inference_program(
+            target_vars=[batch_acc, batch_size_tensor])
+
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size * args.gpus)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=args.batch_size)
+
+    return avg_cost, inference_program, optimizer, train_reader, test_reader, batch_acc
diff --git a/benchmark/fluid/recordio_converter.py b/benchmark/fluid/recordio_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2dc39109bf1beaf147b046560c92fbd2416d8e6
--- /dev/null
+++ b/benchmark/fluid/recordio_converter.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.dataset import mnist, cifar, flowers, image
+
+
+def convert_2_recordio(py_reader, outfilepath, batch_size, shape_data,
+                       shape_label):
+    num_batches = 0
+    with fluid.program_guard(fluid.Program(), fluid.Program()):
+        reader = paddle.batch(py_reader(), batch_size=batch_size)
+        feeder = fluid.DataFeeder(
+            feed_list=[  # order is image and label
+                fluid.layers.data(
+                    name='image', shape=shape_data),
+                fluid.layers.data(
+                    name='label', shape=shape_label, dtype='int64'),
+            ],
+            place=fluid.CPUPlace())
+        num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+            outfilepath, reader, feeder)
+    return num_batches
+
+
+def prepare_mnist(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "mnist.recordio")
+    convert_2_recordio(mnist.train, outfilepath, batch_size, [784], [1])
+
+
+def prepare_cifar10(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "cifar.recordio")
+    convert_2_recordio(cifar.train10, outfilepath, batch_size, [3, 32, 32], [1])
+
+
+def prepare_flowers(outpath, batch_size):
+    outfilepath = os.path.join(outpath, "flowers.recordio")
+    convert_2_recordio(flowers.train, outfilepath, batch_size, [3, 224, 224],
+                       [1])
+
+
+def default_mapper(sample):
+    img, label = sample
+    img = image.simple_transform(
+        img, 256, 224, True, mean=[103.94, 116.78, 123.68])
+    return img.flatten().astype('float32'), label
+
+
+def imagenet_train(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "train.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+    # shuffle all, this is slow
+    random.shuffle(imgfilelist)
+
+    def train_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            data = image.load_image(
+                os.path.join(data_dir, "train", imgfile.lower()))
+            label = [img2label[imgfile], ]
+            yield [data, label]
+
+    return paddle.reader.map_readers(default_mapper, train_reader)
+
+
+def imagenet_test(data_dir):
+    contents = os.listdir(data_dir)
+    if set(contents) != set(
+        ["train", "train.txt", "val", "val_set", "val.txt", "unzip.sh"]):
+        raise Exception("Imagenet data contents error!")
+    img2label = dict()
+    imgfilelist = []
+    with open(os.path.join(data_dir, "val.txt")) as fn:
+        while 1:
+            l = fn.readline()
+            if not l:
+                break
+            img, lbl = l[:-1].split(" ")
+            img2label[img] = int(lbl)
+            imgfilelist.append(img)
+
+    def test_reader():
+        for idx, imgfile in enumerate(imgfilelist):
+            base_path = os.path.join(data_dir, "val", imgfile.split(".")[0])
+            image_path = ".".join([base_path, "jpeg"])
+            data = image.load_image(image_path)
+            label = [img2label[imgfile], ]
+            yield [data, label]
+
+    return paddle.reader.map_readers(default_mapper, test_reader)
+
+
+# FIXME(wuyi): delete this when https://github.com/PaddlePaddle/Paddle/pull/11066 is merged
+def convert_reader_to_recordio_files(
+        filename,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+            with fluid.recordio_writer.create_recordio_writer(
+                    filename, compressor, max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+                lines = []
+                f_idx += 1
+            print("written file: ", filename)
+    return counter
+
+
+def prepare_imagenet(inpath, outpath, batch_size):
+    r = paddle.batch(imagenet_train(inpath), batch_size=batch_size)
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            fluid.layers.data(
+                name="image", shape=[3, 224, 224]), fluid.layers.data(
+                    name="label", shape=[1], dtype='int64')
+        ],
+        place=fluid.CPUPlace())
+    outpath = os.path.join(outpath, "imagenet.recordio")
+    convert_reader_to_recordio_files(outpath, 10000, r, feeder)
diff --git a/benchmark/fluid/run.sh b/benchmark/fluid/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5d9b2db87135e53470b106dcd11a6bcfdc5dbda9
--- /dev/null
+++ b/benchmark/fluid/run.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+# This script benchmarking the PaddlePaddle Fluid on
+# single thread single GPU.
+
+mkdir -p logs
+#export FLAGS_fraction_of_gpu_memory_to_use=0.0
+export CUDNN_PATH=/paddle/cudnn_v5
+
+# disable openmp and mkl parallel
+#https://github.com/PaddlePaddle/Paddle/issues/7199
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+if [ $ht -eq 1 ]; then # HT is OFF
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,0,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+        export OMP_DYNAMIC="FALSE"
+    fi
+else # HT is ON
+    if [ -z "$KMP_AFFINITY" ]; then
+        export KMP_AFFINITY="granularity=fine,compact,1,0"
+    fi
+fi
+# disable multi-gpu if have more than one
+export CUDA_VISIBLE_DEVICES=0
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$CUDNN_PATH:$LD_LIBRARY_PATH
+
+# only query the gpu used
+nohup stdbuf -oL nvidia-smi \
+      --id=${CUDA_VISIBLE_DEVICES} \
+      --query-gpu=timestamp \
+      --query-compute-apps=pid,process_name,used_memory \
+      --format=csv \
+      --filename=mem.log  \
+      -l 1 &
+
+# mnist
+# mnist gpu mnist 128
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=mnist \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=500 \
+               2>&1 | tee -a logs/mnist_gpu_128.log
+
+# vgg16
+# gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a logs/vgg16_gpu_128.log
+
+# flowers gpu  128
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=vgg16 \
+               --device=GPU \
+               --batch_size=32 \
+               --data_set=flowers \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a logs/vgg16_gpu_flowers_32.log
+
+# resnet50
+# resnet50 gpu cifar10 128
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet \
+               --device=GPU \
+               --batch_size=128 \
+               --data_set=cifar10 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a logs/resnet50_gpu_128.log
+
+# resnet50 gpu flowers 64
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=resnet \
+               --device=GPU \
+               --batch_size=64 \
+               --data_set=flowers \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a logs/resnet50_gpu_flowers_64.log
+
+# lstm
+# lstm gpu imdb 32 # tensorflow only support batch=32
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=stacked_dynamic_lstm \
+               --device=GPU \
+               --batch_size=32 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a logs/lstm_gpu_32.log
+
+# seq2seq
+# seq2seq gpu wmb 128
+FLAGS_benchmark=true stdbuf -oL python fluid_benchmark.py \
+               --model=machine_translation \
+               --device=GPU \
+               --batch_size=128 \
+               --skip_batch_num=5 \
+               --iterations=30 \
+               2>&1 | tee -a logs/lstm_gpu_128.log
diff --git a/benchmark/fluid/run_fluid_benchmark.sh b/benchmark/fluid/run_fluid_benchmark.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4309a3126c1d72fe1eb2d5ec423075aea4d3ec88
--- /dev/null
+++ b/benchmark/fluid/run_fluid_benchmark.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+PADDLE_TRAINING_ROLE=PSERVER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device CPU --update_method pserver --iterations=10000 &
+
+sleep 15
+
+CUDA_VISIBLE_DEVICES=0,1 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=0 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
+
+CUDA_VISIBLE_DEVICES=2,3 PADDLE_TRAINING_ROLE=TRAINER PADDLE_PSERVER_PORT=7164 PADDLE_PSERVER_IPS=127.0.0.1 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=127.0.0.1 PADDLE_TRAINER_ID=1 python fluid_benchmark.py --model resnet --device GPU --update_method pserver --iterations=10000 --gpus 2 &
diff --git a/paddle/scripts/check_env.sh b/benchmark/paddle/image/check_env.sh
similarity index 100%
rename from paddle/scripts/check_env.sh
rename to benchmark/paddle/image/check_env.sh
diff --git a/benchmark/tensorflow/machine_translation.py b/benchmark/tensorflow/machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f77dce98353af53803246be8dc61063836b7867
--- /dev/null
+++ b/benchmark/tensorflow/machine_translation.py
@@ -0,0 +1,626 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.python.framework import dtypes
+from tensorflow.python.layers.core import Dense
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops.rnn_cell_impl import RNNCell, BasicLSTMCell
+from tensorflow.python.ops.rnn_cell_impl import LSTMStateTuple
+from tensorflow.contrib.rnn.python.ops import core_rnn_cell
+from tensorflow.python.ops import array_ops
+from tensorflow.python.util import nest
+import tensorflow.contrib.seq2seq as seq2seq
+from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder
+import numpy as np
+import os
+import argparse
+import time
+
+import paddle.v2 as paddle
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--embedding_dim",
+    type=int,
+    default=512,
+    help="The dimension of embedding table. (default: %(default)d)")
+parser.add_argument(
+    "--encoder_size",
+    type=int,
+    default=512,
+    help="The size of encoder bi-rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--decoder_size",
+    type=int,
+    default=512,
+    help="The size of decoder rnn unit. (default: %(default)d)")
+parser.add_argument(
+    "--batch_size",
+    type=int,
+    default=128,
+    help="The sequence number of a mini-batch data. (default: %(default)d)")
+parser.add_argument(
+    "--dict_size",
+    type=int,
+    default=30000,
+    help="The dictionary capacity. Dictionaries of source sequence and "
+    "target dictionary have same capacity. (default: %(default)d)")
+parser.add_argument(
+    "--max_time_steps",
+    type=int,
+    default=81,
+    help="Max number of time steps for sequence. (default: %(default)d)")
+parser.add_argument(
+    "--pass_num",
+    type=int,
+    default=10,
+    help="The pass number to train. (default: %(default)d)")
+parser.add_argument(
+    "--learning_rate",
+    type=float,
+    default=0.0002,
+    help="Learning rate used to train the model. (default: %(default)f)")
+parser.add_argument(
+    "--infer_only", action='store_true', help="If set, run forward only.")
+parser.add_argument(
+    "--beam_size",
+    type=int,
+    default=3,
+    help="The width for beam searching. (default: %(default)d)")
+parser.add_argument(
+    "--max_generation_length",
+    type=int,
+    default=250,
+    help="The maximum length of sequence when doing generation. "
+    "(default: %(default)d)")
+parser.add_argument(
+    "--save_freq",
+    type=int,
+    default=500,
+    help="Save model checkpoint every this interation. (default: %(default)d)")
+parser.add_argument(
+    "--model_dir",
+    type=str,
+    default='./checkpoint',
+    help="Path to save model checkpoints. (default: %(default)d)")
+
+_Linear = core_rnn_cell._Linear  # pylint: disable=invalid-name
+
+START_TOKEN_IDX = 0
+END_TOKEN_IDX = 1
+
+
+class LSTMCellWithSimpleAttention(RNNCell):
+    """Add attention mechanism to BasicLSTMCell.
+    This class is a wrapper based on tensorflow's `BasicLSTMCell`.
+    """
+
+    def __init__(self,
+                 num_units,
+                 encoder_vector,
+                 encoder_proj,
+                 source_sequence_length,
+                 forget_bias=1.0,
+                 state_is_tuple=True,
+                 activation=None,
+                 reuse=None):
+        super(LSTMCellWithSimpleAttention, self).__init__(_reuse=reuse)
+        if not state_is_tuple:
+            logging.warn("%s: Using a concatenated state is slower and will "
+                         "soon be deprecated. Use state_is_tuple=True.", self)
+        self._num_units = num_units
+        # set padding part to 0
+        self._encoder_vector = self._reset_padding(encoder_vector,
+                                                   source_sequence_length)
+        self._encoder_proj = self._reset_padding(encoder_proj,
+                                                 source_sequence_length)
+        self._forget_bias = forget_bias
+        self._state_is_tuple = state_is_tuple
+        self._activation = activation or math_ops.tanh
+        self._linear = None
+
+    @property
+    def state_size(self):
+        return (LSTMStateTuple(self._num_units, self._num_units) \
+                if self._state_is_tuple else 2 * self._num_units)
+
+    @property
+    def output_size(self):
+        return self._num_units
+
+    def zero_state(self, batch_size, dtype):
+        state_size = self.state_size
+        if hasattr(self, "_last_zero_state"):
+            (last_state_size, last_batch_size, last_dtype,
+             last_output) = getattr(self, "_last_zero_state")
+            if (last_batch_size == batch_size and last_dtype == dtype and
+                    last_state_size == state_size):
+                return last_output
+        with ops.name_scope(
+                type(self).__name__ + "ZeroState", values=[batch_size]):
+            output = _zero_state_tensors(state_size, batch_size, dtype)
+        self._last_zero_state = (state_size, batch_size, dtype, output)
+        return output
+
+    def call(self, inputs, state):
+        sigmoid = math_ops.sigmoid
+        # Parameters of gates are concatenated into one multiply for efficiency.
+        if self._state_is_tuple:
+            c, h = state
+        else:
+            c, h = array_ops.split(value=state, num_or_size_splits=2, axis=1)
+
+        # get context from encoder outputs
+        context = self._simple_attention(self._encoder_vector,
+                                         self._encoder_proj, h)
+
+        if self._linear is None:
+            self._linear = _Linear([inputs, context, h], 4 * self._num_units,
+                                   True)
+        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+        i, j, f, o = array_ops.split(
+            value=self._linear([inputs, context, h]),
+            num_or_size_splits=4,
+            axis=1)
+
+        new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
+                 self._activation(j))
+        new_h = self._activation(new_c) * sigmoid(o)
+
+        if self._state_is_tuple:
+            new_state = LSTMStateTuple(new_c, new_h)
+        else:
+            new_state = array_ops.concat([new_c, new_h], 1)
+        return new_h, new_state
+
+    def _simple_attention(self, encoder_vec, encoder_proj, decoder_state):
+        """Implement the attention function.
+        The implementation has the same logic to the fluid decoder.
+        """
+        decoder_state_proj = tf.contrib.layers.fully_connected(
+            inputs=decoder_state,
+            num_outputs=self._num_units,
+            activation_fn=None,
+            biases_initializer=None)
+        decoder_state_expand = tf.tile(
+            tf.expand_dims(
+                input=decoder_state_proj, axis=1),
+            [1, tf.shape(encoder_proj)[1], 1])
+        concated = tf.concat([decoder_state_expand, encoder_proj], axis=2)
+        # need reduce the first dimension
+        attention_weights = tf.contrib.layers.fully_connected(
+            inputs=tf.reshape(
+                concated, shape=[-1, self._num_units * 2]),
+            num_outputs=1,
+            activation_fn=tf.nn.tanh,
+            biases_initializer=None)
+        attention_weights_reshaped = tf.reshape(
+            attention_weights, shape=[tf.shape(encoder_vec)[0], -1, 1])
+        # normalize the attention weights using softmax
+        attention_weights_normed = tf.nn.softmax(
+            attention_weights_reshaped, dim=1)
+        scaled = tf.multiply(attention_weights_normed, encoder_vec)
+        context = tf.reduce_sum(scaled, axis=1)
+        return context
+
+    def _reset_padding(self,
+                       memory,
+                       memory_sequence_length,
+                       check_inner_dims_defined=True):
+        """Reset the padding part for encoder inputs.
+        This funtion comes from tensorflow's `_prepare_memory` function.
+        """
+        memory = nest.map_structure(
+                lambda m: ops.convert_to_tensor(m, name="memory"), memory)
+        if memory_sequence_length is not None:
+            memory_sequence_length = ops.convert_to_tensor(
+                memory_sequence_length, name="memory_sequence_length")
+        if check_inner_dims_defined:
+
+            def _check_dims(m):
+                if not m.get_shape()[2:].is_fully_defined():
+                    raise ValueError(
+                        "Expected memory %s to have fully defined inner dims, "
+                        "but saw shape: %s" % (m.name, m.get_shape()))
+
+            nest.map_structure(_check_dims, memory)
+        if memory_sequence_length is None:
+            seq_len_mask = None
+        else:
+            seq_len_mask = array_ops.sequence_mask(
+                memory_sequence_length,
+                maxlen=array_ops.shape(nest.flatten(memory)[0])[1],
+                dtype=nest.flatten(memory)[0].dtype)
+            seq_len_batch_size = (memory_sequence_length.shape[0].value or
+                                  array_ops.shape(memory_sequence_length)[0])
+
+        def _maybe_mask(m, seq_len_mask):
+            rank = m.get_shape().ndims
+            rank = rank if rank is not None else array_ops.rank(m)
+            extra_ones = array_ops.ones(rank - 2, dtype=dtypes.int32)
+            m_batch_size = m.shape[0].value or array_ops.shape(m)[0]
+            if memory_sequence_length is not None:
+                message = ("memory_sequence_length and memory tensor "
+                           "batch sizes do not match.")
+                with ops.control_dependencies([
+                        check_ops.assert_equal(
+                            seq_len_batch_size, m_batch_size, message=message)
+                ]):
+                    seq_len_mask = array_ops.reshape(
+                        seq_len_mask,
+                        array_ops.concat(
+                            (array_ops.shape(seq_len_mask), extra_ones), 0))
+                return m * seq_len_mask
+            else:
+                return m
+
+        return nest.map_structure(lambda m: _maybe_mask(m, seq_len_mask),
+                                  memory)
+
+
+def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
+                   target_dict_dim, is_generating, beam_size,
+                   max_generation_length):
+    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
+    src_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
+
+    src_embedding_weights = tf.get_variable("source_word_embeddings",
+                                            [source_dict_dim, embedding_dim])
+    src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)
+
+    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
+    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
+    # no peephole
+    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
+        cell_fw=src_forward_cell,
+        cell_bw=src_reversed_cell,
+        inputs=src_embedding,
+        sequence_length=src_sequence_length,
+        dtype=tf.float32)
+
+    # concat the forward outputs and backward outputs
+    encoded_vec = tf.concat(encoder_outputs, axis=2)
+
+    # project the encoder outputs to size of decoder lstm
+    encoded_proj = tf.contrib.layers.fully_connected(
+        inputs=tf.reshape(
+            encoded_vec, shape=[-1, embedding_dim * 2]),
+        num_outputs=decoder_size,
+        activation_fn=None,
+        biases_initializer=None)
+    encoded_proj_reshape = tf.reshape(
+        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])
+
+    # get init state for decoder lstm's H
+    backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
+    decoder_boot = tf.contrib.layers.fully_connected(
+        inputs=tf.reshape(
+            backword_first, shape=[-1, embedding_dim]),
+        num_outputs=decoder_size,
+        activation_fn=tf.nn.tanh,
+        biases_initializer=None)
+
+    # prepare the initial state for decoder lstm
+    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
+    initial_state = LSTMStateTuple(cell_init, decoder_boot)
+
+    # create decoder lstm cell
+    decoder_cell = LSTMCellWithSimpleAttention(
+        decoder_size,
+        encoded_vec
+        if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
+        encoded_proj_reshape if not is_generating else
+        seq2seq.tile_batch(encoded_proj_reshape, beam_size),
+        src_sequence_length if not is_generating else
+        seq2seq.tile_batch(src_sequence_length, beam_size),
+        forget_bias=0.0)
+
+    output_layer = Dense(target_dict_dim, name='output_projection')
+
+    if not is_generating:
+        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
+        trg_sequence_length = tf.placeholder(tf.int32, shape=[None, ])
+        trg_embedding_weights = tf.get_variable(
+            "target_word_embeddings", [target_dict_dim, embedding_dim])
+        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
+                                               trg_word_idx)
+
+        training_helper = seq2seq.TrainingHelper(
+            inputs=trg_embedding,
+            sequence_length=trg_sequence_length,
+            time_major=False,
+            name='training_helper')
+
+        training_decoder = seq2seq.BasicDecoder(
+            cell=decoder_cell,
+            helper=training_helper,
+            initial_state=initial_state,
+            output_layer=output_layer)
+
+        # get the max length of target sequence
+        max_decoder_length = tf.reduce_max(trg_sequence_length)
+
+        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
+            decoder=training_decoder,
+            output_time_major=False,
+            impute_finished=True,
+            maximum_iterations=max_decoder_length)
+
+        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
+        decoder_pred_train = tf.argmax(
+            decoder_logits_train, axis=-1, name='decoder_pred_train')
+        masks = tf.sequence_mask(
+            lengths=trg_sequence_length,
+            maxlen=max_decoder_length,
+            dtype=tf.float32,
+            name='masks')
+
+        # place holder of label sequence
+        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])
+
+        # compute the loss
+        loss = seq2seq.sequence_loss(
+            logits=decoder_logits_train,
+            targets=lbl_word_idx,
+            weights=masks,
+            average_across_timesteps=True,
+            average_across_batch=True)
+
+        # return feeding list and loss operator
+        return {
+            'src_word_idx': src_word_idx,
+            'src_sequence_length': src_sequence_length,
+            'trg_word_idx': trg_word_idx,
+            'trg_sequence_length': trg_sequence_length,
+            'lbl_word_idx': lbl_word_idx
+        }, loss
+    else:
+        start_tokens = tf.ones([tf.shape(src_word_idx)[0], ],
+                               tf.int32) * START_TOKEN_IDX
+        # share the same embedding weights with target word
+        trg_embedding_weights = tf.get_variable(
+            "target_word_embeddings", [target_dict_dim, embedding_dim])
+
+        inference_decoder = beam_search_decoder.BeamSearchDecoder(
+            cell=decoder_cell,
+            embedding=lambda tokens: tf.nn.embedding_lookup(trg_embedding_weights, tokens),
+            start_tokens=start_tokens,
+            end_token=END_TOKEN_IDX,
+            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
+                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
+                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
+            beam_width=beam_size,
+            output_layer=output_layer)
+
+        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
+            decoder=inference_decoder,
+            output_time_major=False,
+            #impute_finished=True,# error occurs
+            maximum_iterations=max_generation_length)
+
+        predicted_ids = decoder_outputs_decode.predicted_ids
+
+        return {
+            'src_word_idx': src_word_idx,
+            'src_sequence_length': src_sequence_length
+        }, predicted_ids
+
+
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in vars(args).iteritems():
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def padding_data(data, padding_size, value):
+    data = data + [value] * padding_size
+    return data[:padding_size]
+
+
+def save(sess, path, var_list=None, global_step=None):
+    saver = tf.train.Saver(var_list)
+    save_path = saver.save(sess, save_path=path, global_step=global_step)
+    print('Model save at %s' % save_path)
+
+
+def restore(sess, path, var_list=None):
+    # var_list = None returns the list of all saveable variables
+    saver = tf.train.Saver(var_list)
+    saver.restore(sess, save_path=path)
+    print('model restored from %s' % path)
+
+
+def adapt_batch_data(data):
+    src_seq = map(lambda x: x[0], data)
+    trg_seq = map(lambda x: x[1], data)
+    lbl_seq = map(lambda x: x[2], data)
+
+    src_sequence_length = np.array(
+        [len(seq) for seq in src_seq]).astype('int32')
+    src_seq_maxlen = np.max(src_sequence_length)
+
+    trg_sequence_length = np.array(
+        [len(seq) for seq in trg_seq]).astype('int32')
+    trg_seq_maxlen = np.max(trg_sequence_length)
+
+    src_seq = np.array(
+        [padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
+         for seq in src_seq]).astype('int32')
+
+    trg_seq = np.array(
+        [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
+         for seq in trg_seq]).astype('int32')
+
+    lbl_seq = np.array(
+        [padding_data(seq, trg_seq_maxlen, END_TOKEN_IDX)
+         for seq in lbl_seq]).astype('int32')
+
+    return {
+        'src_word_idx': src_seq,
+        'src_sequence_length': src_sequence_length,
+        'trg_word_idx': trg_seq,
+        'trg_sequence_length': trg_sequence_length,
+        'lbl_word_idx': lbl_seq
+    }
+
+
+def train():
+    feeding_dict, loss = seq_to_seq_net(
+        embedding_dim=args.embedding_dim,
+        encoder_size=args.encoder_size,
+        decoder_size=args.decoder_size,
+        source_dict_dim=args.dict_size,
+        target_dict_dim=args.dict_size,
+        is_generating=False,
+        beam_size=args.beam_size,
+        max_generation_length=args.max_generation_length)
+
+    global_step = tf.Variable(0, trainable=False, name='global_step')
+    trainable_params = tf.trainable_variables()
+    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+
+    gradients = tf.gradients(loss, trainable_params)
+    # may clip the parameters
+    clip_gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
+
+    updates = optimizer.apply_gradients(
+        zip(gradients, trainable_params), global_step=global_step)
+
+    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
+
+    train_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    test_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    def do_validataion():
+        total_loss = 0.0
+        count = 0
+        for batch_id, data in enumerate(test_batch_generator()):
+            adapted_batch_data = adapt_batch_data(data)
+            outputs = sess.run([loss],
+                               feed_dict={
+                                   item[1]: adapted_batch_data[item[0]]
+                                   for item in feeding_dict.items()
+                               })
+            total_loss += outputs[0]
+            count += 1
+        return total_loss / count
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_l)
+        sess.run(init_g)
+        for pass_id in xrange(args.pass_num):
+            pass_start_time = time.time()
+            words_seen = 0
+            for batch_id, data in enumerate(train_batch_generator()):
+                adapted_batch_data = adapt_batch_data(data)
+                words_seen += np.sum(adapted_batch_data['src_sequence_length'])
+                words_seen += np.sum(adapted_batch_data['trg_sequence_length'])
+                outputs = sess.run([updates, loss],
+                                   feed_dict={
+                                       item[1]: adapted_batch_data[item[0]]
+                                       for item in feeding_dict.items()
+                                   })
+                print("pass_id=%d, batch_id=%d, train_loss: %f" %
+                      (pass_id, batch_id, outputs[1]))
+            pass_end_time = time.time()
+            test_loss = do_validataion()
+            time_consumed = pass_end_time - pass_start_time
+            words_per_sec = words_seen / time_consumed
+            print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
+                  (pass_id, test_loss, words_per_sec, time_consumed))
+
+
+def infer():
+    feeding_dict, predicted_ids = seq_to_seq_net(
+        embedding_dim=args.embedding_dim,
+        encoder_size=args.encoder_size,
+        decoder_size=args.decoder_size,
+        source_dict_dim=args.dict_size,
+        target_dict_dim=args.dict_size,
+        is_generating=True,
+        beam_size=args.beam_size,
+        max_generation_length=args.max_generation_length)
+
+    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(args.dict_size)
+    test_batch_generator = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
+        batch_size=args.batch_size)
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    with tf.Session(config=config) as sess:
+        restore(sess, './checkpoint/tf_seq2seq-1500')
+        for batch_id, data in enumerate(test_batch_generator()):
+            src_seq = map(lambda x: x[0], data)
+
+            source_language_seq = [
+                src_dict[item] for seq in src_seq for item in seq
+            ]
+
+            src_sequence_length = np.array(
+                [len(seq) for seq in src_seq]).astype('int32')
+            src_seq_maxlen = np.max(src_sequence_length)
+            src_seq = np.array([
+                padding_data(seq, src_seq_maxlen, END_TOKEN_IDX)
+                for seq in src_seq
+            ]).astype('int32')
+
+            outputs = sess.run([predicted_ids],
+                               feed_dict={
+                                   feeding_dict['src_word_idx']: src_seq,
+                                   feeding_dict['src_sequence_length']:
+                                   src_sequence_length
+                               })
+
+            print("\nDecoder result comparison: ")
+            source_language_seq = ' '.join(source_language_seq).lstrip(
+                '<s>').rstrip('<e>').strip()
+            inference_seq = ''
+            print(" --> source: " + source_language_seq)
+            for item in outputs[0][0]:
+                if item[0] == END_TOKEN_IDX: break
+                inference_seq += ' ' + trg_dict.get(item[0], '<unk>')
+            print(" --> inference: " + inference_seq)
+
+
+if __name__ == '__main__':
+    args = parser.parse_args()
+    print_arguments(args)
+    if args.infer_only:
+        infer()
+    else:
+        train()
diff --git a/benchmark/tensorflow/mnist.py b/benchmark/tensorflow/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..7140eed6eaff49b5c65f9ccb2e38f113a4cdbdbf
--- /dev/null
+++ b/benchmark/tensorflow/mnist.py
@@ -0,0 +1,180 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import numpy as np
+
+import tensorflow as tf
+import paddle.v2 as paddle
+
+DTYPE = tf.float32
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("mnist model benchmark.")
+    parser.add_argument(
+        '--batch_size', type=int, default=128, help='The minibatch size.')
+    parser.add_argument(
+        '--iterations', type=int, default=35, help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=5, help='The number of passes.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    args = parser.parse_args()
+    return args
+
+
+def run_benchmark(args):
+    def weight_variable(dtype, shape):
+        initial = tf.truncated_normal(shape, stddev=0.1, dtype=dtype)
+        return tf.Variable(initial)
+
+    def bias_variable(dtype, shape):
+        initial = tf.constant(0.1, shape=shape, dtype=dtype)
+        return tf.Variable(initial)
+
+    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
+    with tf.device(device):
+        images = tf.placeholder(DTYPE, shape=(None, 28, 28, 1))
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+
+        # conv1, relu, pool1
+        conv1_weights = weight_variable(DTYPE, [5, 5, 1, 20])
+        conv1_bias = bias_variable(DTYPE, [20])
+        conv1 = tf.nn.conv2d(
+            images, conv1_weights, strides=[1, 1, 1, 1], padding="VALID")
+        relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_bias))
+        pool1 = tf.nn.max_pool(
+            relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+        # conv2, relu, pool2
+        conv2_weights = weight_variable(DTYPE, [5, 5, 20, 50])
+        conv2_bias = bias_variable(DTYPE, [50])
+        conv2 = tf.nn.conv2d(
+            pool1, conv2_weights, strides=[1, 1, 1, 1], padding="VALID")
+        relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_bias))
+        pool2 = tf.nn.max_pool(
+            relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
+
+        # FC 
+        pool_shape = pool2.get_shape().as_list()
+        hidden_dim = reduce(lambda a, b: a * b, pool_shape[1:], 1)
+        reshape = tf.reshape(pool2, shape=(tf.shape(pool2)[0], hidden_dim))
+        fc_weights = weight_variable(DTYPE, [hidden_dim, 10])
+        fc_bias = bias_variable(DTYPE, [10])
+        logits = tf.matmul(reshape, fc_weights) + fc_bias
+
+        # Get prediction
+        prediction = tf.nn.softmax(logits)
+
+        # Loss 
+        one_hot_labels = tf.one_hot(labels, depth=10)
+        cost = -tf.reduce_sum(tf.log(prediction) * one_hot_labels, [1])
+        avg_cost = tf.reduce_mean(cost)
+
+        # Get accuracy
+        correct = tf.equal(tf.argmax(prediction, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        # metrics, g_accuracy
+        with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+            g_accuracy = tf.metrics.accuracy(
+                labels, tf.argmax(
+                    prediction, axis=1))
+            vars = tf.contrib.framework.get_variables(
+                scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+            g_accuracy_reset_op = tf.variables_initializer(vars)
+
+        # Optimizer 
+        opt = tf.train.AdamOptimizer(
+            learning_rate=0.001, beta1=0.9, beta2=0.999)
+        train_op = opt.minimize(avg_cost)
+        # train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=args.batch_size)
+
+    def eval_test():
+        sess.run(g_accuracy_reset_op)
+        for batch_id, data in enumerate(test_reader()):
+            images_data = np.array(
+                map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+            labels_data = np.array(map(lambda x: x[1], data)).astype("int64")
+
+            loss, acc, g_acc = sess.run(
+                [avg_cost, accuracy, g_accuracy],
+                feed_dict={images: images_data,
+                           labels: labels_data})
+        return g_acc[1]
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_g)
+        sess.run(init_l)
+        for pass_id in range(args.pass_num):
+            sess.run(g_accuracy_reset_op)
+
+            pass_start = time.time()
+            for batch_id, data in enumerate(train_reader()):
+                images_data = np.array(
+                    map(lambda x: np.transpose(x[0].reshape([1, 28, 28]), axes=[1,2,0]), data)).astype("float32")
+                labels_data = np.array(map(lambda x: x[1], data)).astype(
+                    "int64")
+
+                start = time.time()
+                _, loss, acc, g_acc = sess.run(
+                    [train_op, avg_cost, accuracy, g_accuracy],
+                    feed_dict={images: images_data,
+                               labels: labels_data})
+                end = time.time()
+
+                print("pass=%d, batch=%d, loss=%f, error=%f, elapse=%f" %
+                      (pass_id, batch_id, loss, 1 - acc, (end - start) / 1000))
+
+            pass_end = time.time()
+            test_avg_acc = eval_test()
+
+            print(
+                "pass=%d, training_avg_accuracy=%f, test_avg_acc=%f, elapse=%f"
+                % (pass_id, g_acc[1], test_avg_acc,
+                   (pass_end - pass_start) / 1000))
+
+
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    run_benchmark(args)
diff --git a/benchmark/tensorflow/resnet.py b/benchmark/tensorflow/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..c432fa8d59571e128b9ff9e3ffa1949b792ef3a4
--- /dev/null
+++ b/benchmark/tensorflow/resnet.py
@@ -0,0 +1,504 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+based on https://github.com/tensorflow/models/blob/master/official/resnet/resnet_model.py
+
+Get help: python resnet.py --help
+See performance on flowers: python resnet.py
+Train on cifar10: python resnet.py --data=cifar10 --with_test
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import time
+import numpy as np
+
+import paddle.v2 as paddle
+import tensorflow as tf
+
+DTYPE = tf.float32
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Convolution model benchmark.')
+    parser.add_argument(
+        '--model',
+        type=str,
+        choices=['resnet'],
+        default='resnet',
+        help='The model architecture.')
+    parser.add_argument(
+        '--batch_size', type=int, default=32, help='The minibatch size.')
+    parser.add_argument(
+        '--use_fake_data',
+        action='store_true',
+        help='use real data or fake data')
+    parser.add_argument(
+        '--skip_batch_num',
+        type=int,
+        default=5,
+        help='The first num of minibatch num to skip, for better performance test'
+    )
+    parser.add_argument(
+        '--iterations',
+        type=int,
+        default=105,
+        help='The number of minibatches.')
+    parser.add_argument(
+        '--pass_num', type=int, default=300, help='The number of passes.')
+    parser.add_argument(
+        '--order',
+        type=str,
+        default='NHWC',
+        choices=['NCHW', 'NHWC'],
+        help='The data order, now only support NCHW.')
+    parser.add_argument(
+        '--device',
+        type=str,
+        default='GPU',
+        choices=['CPU', 'GPU'],
+        help='The device type.')
+    parser.add_argument(
+        '--data',
+        type=str,
+        default='flowers102',
+        choices=['flowers102', 'cifar10'],
+        help='The kinds of data.')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    parser.add_argument(
+        '--use_cprof', action='store_true', help='If set, use cProfile.')
+    parser.add_argument(
+        '--with_test',
+        action='store_true',
+        help='If set, test the testset during training.')
+    parser.add_argument(
+        '--use_nvprof',
+        action='store_true',
+        help='If set, use nvprof for CUDA.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    vars(args)['use_nvprof'] = (vars(args)['use_nvprof'] and
+                                vars(args)['device'] == 'GPU')
+    vars(args)['iterations'] = vars(args)['pass_num'] * 1000 if vars(args)[
+        'with_test'] else vars(args)['iterations']
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def fixed_padding(inputs, kernel_size, data_format):
+    """Pads the input along the spatial dimensions independently of input size.
+  Args:
+    inputs: A tensor of size [batch, channels, height_in, width_in] or
+      [batch, height_in, width_in, channels] depending on data_format.
+    kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
+                 Should be a positive integer.
+    data_format: The input format ('channels_last' or 'channels_first').
+  Returns:
+    A tensor with the same format as the input with the data either intact
+    (if kernel_size == 1) or padded (if kernel_size > 1).
+  """
+    pad_total = kernel_size - 1
+    pad_beg = pad_total // 2
+    pad_end = pad_total - pad_beg
+
+    if data_format == 'channels_first':
+        padded_inputs = tf.pad(inputs, [[0, 0], [0, 0], [pad_beg, pad_end],
+                                        [pad_beg, pad_end]])
+    else:
+        padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg, pad_end],
+                                        [pad_beg, pad_end], [0, 0]])
+    return padded_inputs
+
+
+def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
+    """Strided 2-D convolution with explicit padding."""
+    # The padding is consistent and is based only on `kernel_size`, not on the
+    # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
+    # This is consistent with PaddlePaddle.
+    # In addition, the calculation for output size in TensorFlow can refer: 
+    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/framework/common_shape_fns.cc
+    if strides > 1:
+        inputs = fixed_padding(inputs, kernel_size, data_format)
+
+    return tf.layers.conv2d(
+        inputs=inputs,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=('SAME' if strides == 1 else 'VALID'),
+        use_bias=False,
+        kernel_initializer=tf.variance_scaling_initializer(),
+        data_format=data_format)
+
+
+def conv_bn(inputs,
+            filters,
+            kernel_size,
+            strides,
+            is_training,
+            data_format,
+            act=True):
+    # def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
+    # set fused=True for a significant performance boost. See
+    # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
+    inputs = conv2d_fixed_padding(
+        inputs=inputs,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        data_format=data_format)
+    inputs = tf.layers.batch_normalization(
+        inputs=inputs,
+        axis=1 if data_format == 'channels_first' else 3,
+        momentum=0.9,
+        epsilon=1e-05,
+        center=True,
+        scale=True,
+        training=is_training,
+        fused=True)
+    if act:
+        inputs = tf.nn.relu(inputs)
+    return inputs
+
+
+def basicblock(inputs, filters, is_training, projection_shortcut, strides,
+               data_format):
+    shortcut = inputs
+    if projection_shortcut is not None:
+        shortcut = projection_shortcut(inputs)
+    inputs = conv_bn(inputs, filters, 3, strides, is_training, data_format)
+    inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
+    inputs = inputs + shortcut
+    inputs = tf.nn.relu(inputs)
+    return inputs
+
+
+def bottleneck(inputs, filters, is_training, projection_shortcut, strides,
+               data_format):
+    shortcut = inputs
+    if projection_shortcut is not None:
+        shortcut = projection_shortcut(inputs)
+    inputs = conv_bn(inputs, filters, 1, strides, is_training, data_format)
+    inputs = conv_bn(inputs, filters, 3, 1, is_training, data_format, act=False)
+    inputs = conv_bn(
+        inputs, filters * 4, 1, 1, is_training, data_format, act=False)
+    inputs = inputs + shortcut
+    inputs = tf.nn.relu(inputs)
+    return inputs
+
+
+def block_layer(inputs, filters, block_fn, blocks, strides, is_training, name,
+                data_format):
+    # Bottleneck blocks end with 4x the number of filters as they start with
+    filters_out = 4 * filters if block_fn is bottleneck else filters
+
+    def projection_shortcut(inputs):
+        return conv2d_fixed_padding(
+            inputs=inputs,
+            filters=filters_out,
+            kernel_size=1,
+            strides=strides,
+            data_format=data_format)
+
+    # Only the first block per block_layer uses projection_shortcut and strides
+    inputs = block_fn(inputs, filters, is_training, projection_shortcut,
+                      strides, data_format)
+
+    for _ in range(1, blocks):
+        inputs = block_fn(inputs, filters, is_training, None, 1, data_format)
+
+    return tf.identity(inputs, name)
+
+
+def resnet_imagenet(depth, class_dim, data_format):
+    """Returns the ResNet model for a given size and number of output classes."""
+
+    def resnet_generator(block_fn,
+                         layers,
+                         num_classes,
+                         data_format='channels_last'):
+        if data_format is None:
+            data_format = ('channels_first'
+                           if tf.test.is_built_with_cuda() else 'channels_last')
+
+        def model(inputs, is_training):
+            """Constructs the ResNet model given the inputs."""
+            if data_format == 'channels_first':
+                # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
+                # This provides a large performance boost on GPU. See
+                # https://www.tensorflow.org/performance/performance_guide#data_formats
+                inputs = tf.transpose(inputs, [0, 3, 1, 2])
+
+            inputs = conv_bn(inputs, 64, 7, 2, is_training, data_format)
+            inputs = tf.identity(inputs, 'initial_conv')
+            inputs = tf.layers.max_pooling2d(
+                inputs=inputs,
+                pool_size=3,
+                strides=2,
+                padding='SAME',
+                data_format=data_format)
+            inputs = tf.identity(inputs, 'initial_max_pool')
+            inputs = block_layer(inputs, 64, block_fn, layers[0], 1,
+                                 is_training, 'block_layer1', data_format)
+            inputs = block_layer(inputs, 128, block_fn, layers[1], 2,
+                                 is_training, 'block_layer2', data_format)
+            inputs = block_layer(inputs, 256, block_fn, layers[2], 2,
+                                 is_training, 'block_layer3', data_format)
+            inputs = block_layer(inputs, 512, block_fn, layers[3], 2,
+                                 is_training, 'block_layer4', data_format)
+            inputs = tf.layers.average_pooling2d(
+                inputs=inputs,
+                pool_size=7,
+                strides=1,
+                padding='VALID',
+                data_format=data_format)
+            inputs = tf.identity(inputs, 'final_avg_pool')
+            inputs = tf.reshape(inputs,
+                                [-1, 512 if block_fn is basicblock else 2048])
+            inputs = tf.layers.dense(inputs=inputs, units=num_classes)
+            inputs = tf.identity(inputs, 'final_dense')
+            return inputs
+
+        return model
+
+    model_params = {
+        18: {
+            'block': basicblock,
+            'layers': [2, 2, 2, 2]
+        },
+        34: {
+            'block': basicblock,
+            'layers': [3, 4, 6, 3]
+        },
+        50: {
+            'block': bottleneck,
+            'layers': [3, 4, 6, 3]
+        },
+        101: {
+            'block': bottleneck,
+            'layers': [3, 4, 23, 3]
+        },
+        152: {
+            'block': bottleneck,
+            'layers': [3, 8, 36, 3]
+        },
+        200: {
+            'block': bottleneck,
+            'layers': [3, 24, 36, 3]
+        }
+    }
+    if depth not in model_params:
+        raise ValueError('Not a valid depth:', depth)
+    params = model_params[depth]
+    return resnet_generator(params['block'], params['layers'], class_dim,
+                            data_format)
+
+
+def resnet_cifar10(depth, num_classes, data_format):
+    if depth % 6 != 2:
+        raise ValueError('depth must be 6n + 2:', depth)
+
+    num_blocks = (depth - 2) // 6
+
+    if data_format is None:
+        data_format = ('channels_first'
+                       if tf.test.is_built_with_cuda() else 'channels_last')
+
+    def model(inputs, is_training):
+        inputs = conv_bn(inputs, 16, 3, 1, is_training, data_format)
+        inputs = tf.identity(inputs, 'initial_conv')
+        inputs = block_layer(inputs, 16, basicblock, num_blocks, 1, is_training,
+                             'block_layer1', data_format)
+        inputs = block_layer(inputs, 32, basicblock, num_blocks, 2, is_training,
+                             'block_layer2', data_format)
+        inputs = block_layer(inputs, 64, basicblock, num_blocks, 2, is_training,
+                             'block_layer3', data_format)
+        inputs = tf.layers.average_pooling2d(
+            inputs=inputs,
+            pool_size=8,
+            strides=1,
+            padding='VALID',
+            data_format=data_format)
+        inputs = tf.identity(inputs, 'final_avg_pool')
+        inputs = tf.reshape(inputs, [-1, 64])
+        inputs = tf.layers.dense(inputs=inputs, units=num_classes)
+        inputs = tf.identity(inputs, 'final_dense')
+        return inputs
+
+    return model
+
+
+def run_benchmark(args, data_format='channels_last', device='/cpu:0'):
+    """Our model_fn for ResNet to be used with our Estimator."""
+
+    class_dim = 1000
+    dshape = (None, 224, 224, 3)
+
+    pdshape = (3, 224, 224)
+    if args.data == 'flowers102':
+        class_dim = 102
+        dshape = (None, 224, 224, 3)
+        pdshape = (3, 224, 224)
+    elif args.data == 'cifar10':
+        class_dim = 10
+        dshape = (None, 32, 32, 3)
+        pdshape = (3, 32, 32)
+
+    with tf.device(device):
+        images = tf.placeholder(DTYPE, shape=dshape)
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+        is_training = tf.placeholder('bool')
+        onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+        network = resnet_cifar10(
+            32, class_dim,
+            data_format) if args.data == 'cifar10' else resnet_imagenet(
+                50, class_dim, data_format)
+
+        logits = network(inputs=images, is_training=is_training)
+
+        cross_entropy = tf.losses.softmax_cross_entropy(
+            logits=logits, onehot_labels=onehot_labels)
+        avg_cost = tf.reduce_mean(cross_entropy)
+
+        correct = tf.equal(tf.argmax(logits, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        lr = 0.1 if args.data == 'cifar10' else 0.01
+        optimizer = tf.train.MomentumOptimizer(learning_rate=lr, momentum=0.9)
+
+        # Batch norm requires update_ops to be added as a train_op dependency.
+        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        with tf.control_dependencies(update_ops):
+            train_op = optimizer.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data == 'cifar10' else paddle.dataset.flowers.test(),
+        batch_size=100)
+
+    def test():
+        test_accs = []
+        for batch_id, data in enumerate(test_reader()):
+            test_images = np.array(
+                map(lambda x: np.transpose(x[0].reshape(pdshape),
+                axes=[1, 2, 0]), data)).astype("float32")
+            test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
+            test_accs.append(
+                accuracy.eval(feed_dict={
+                    images: test_images,
+                    labels: test_labels,
+                    is_training: False
+                }))
+        print("Pass = %d, Train performance = %f imgs/s, Test accuracy = %f\n" %
+              (pass_id, num_samples / train_elapsed, np.mean(test_accs)))
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_g)
+        sess.run(init_l)
+
+        if args.use_fake_data:
+            data = train_reader().next()
+            images_data = np.array(
+                    map(lambda x: np.transpose(x[0].reshape(pdshape),
+                    axes=[1, 2, 0]), data)).astype("float32")
+            labels_data = np.array(map(lambda x: x[1], data)).astype('int64')
+        iters, num_samples, start_time = 0, 0, 0.0
+        for pass_id in range(args.pass_num):
+            if iters == args.iterations:
+                break
+            train_accs = []
+            train_losses = []
+            for batch_id, data in enumerate(train_reader()):
+                if iters == args.skip_batch_num:
+                    start_time = time.time()
+                    num_samples = 0
+                if iters == args.iterations:
+                    break
+                if not args.use_fake_data:
+                    images_data = np.array(
+                        map(lambda x: np.transpose(x[0].reshape(pdshape),
+                        axes=[1, 2, 0]), data)).astype("float32")
+                    labels_data = np.array(map(lambda x: x[1], data)).astype(
+                        'int64')
+                _, loss, acc = sess.run([train_op, avg_cost, accuracy],
+                                        feed_dict={
+                                            images: images_data,
+                                            labels: labels_data,
+                                            is_training: True
+                                        })
+                iters += 1
+                train_accs.append(acc)
+                train_losses.append(loss)
+                num_samples += len(data)
+                print("Pass=%d, Iter=%d, Loss=%f, Accuray=%f\n" %
+                      (pass_id, iters, loss, acc))
+
+            train_elapsed = time.time() - start_time
+            print("Pass=%d, Loss=%f, Accuray=%f\n" %
+                  (pass_id, np.mean(train_losses), np.mean(train_accs)))
+
+            # evaluation
+            if args.with_test:
+                test()
+
+        if not args.with_test:
+            duration = time.time() - start_time
+            examples_per_sec = num_samples / duration
+            sec_per_batch = duration / (iters - args.skip_batch_num)
+
+            print('Total examples: %d, total time: %.5f' %
+                  (num_samples, duration))
+            print('%.5f examples/sec, %.5f sec/batch' %
+                  (examples_per_sec, sec_per_batch))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+    if tf.test.is_built_with_cuda():
+        device = '/device:GPU:0'
+        if args.order == 'NHWC':
+            data_format = 'channels_last'
+        else:
+            data_format = 'channels_first'
+    else:
+        device = '/cpu:0'
+        if args.order == 'NHWC':
+            data_format = 'channels_last'
+        else:
+            raise ValueError('Only support NHWC order in CPU mode')
+
+    run_benchmark(args, data_format, device)
diff --git a/benchmark/tensorflow/stacked_dynamic_lstm.py b/benchmark/tensorflow/stacked_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..5285033005044d907d0b7e91eb66ee7281c4f27a
--- /dev/null
+++ b/benchmark/tensorflow/stacked_dynamic_lstm.py
@@ -0,0 +1,220 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import tensorflow as tf
+
+import paddle.v2 as paddle
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("LSTM model benchmark.")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help='The sequence number of a batch data. (default: %(default)d)')
+    parser.add_argument(
+        '--stacked_num',
+        type=int,
+        default=5,
+        help='Number of lstm layers to stack. (default: %(default)d)')
+    parser.add_argument(
+        '--embedding_dim',
+        type=int,
+        default=512,
+        help='Dimension of embedding table. (default: %(default)d)')
+    parser.add_argument(
+        '--hidden_dim',
+        type=int,
+        default=512,
+        help='Hidden size of lstm unit. (default: %(default)d)')
+    parser.add_argument(
+        '--pass_num',
+        type=int,
+        default=10,
+        help='Epoch number to train. (default: %(default)d)')
+    parser.add_argument(
+        '--learning_rate',
+        type=float,
+        default=0.0002,
+        help='Learning rate used to train. (default: %(default)f)')
+    parser.add_argument(
+        '--infer_only', action='store_true', help='If set, run forward only.')
+    args = parser.parse_args()
+    return args
+
+
+def print_arguments(args):
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def dynamic_lstm_model(dict_size,
+                       embedding_dim,
+                       hidden_dim,
+                       stacked_num,
+                       class_num=2,
+                       is_train=True):
+    word_idx = tf.placeholder(tf.int64, shape=[None, None])
+    sequence_length = tf.placeholder(tf.int64, shape=[None, ])
+
+    embedding_weights = tf.get_variable('word_embeddings',
+                                        [dict_size, embedding_dim])
+    embedding = tf.nn.embedding_lookup(embedding_weights, word_idx)
+
+    lstm_cell = tf.nn.rnn_cell.LSTMCell(
+        num_units=hidden_dim, use_peepholes=False)
+    stacked_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * stacked_num)
+
+    # final_state [LSTMTuple(c, h), LSTMTuple(c, h) ...] total stacked_num LSTMTuples
+    _, final_state = tf.nn.dynamic_rnn(
+        cell=stacked_cell,
+        inputs=embedding,
+        dtype=tf.float32,
+        sequence_length=sequence_length)
+
+    w = tf.Variable(
+        tf.truncated_normal([hidden_dim, class_num]), dtype=tf.float32)
+    bias = tf.Variable(
+        tf.constant(
+            value=0.0, shape=[class_num], dtype=tf.float32))
+    prediction = tf.matmul(final_state[-1][1], w) + bias
+
+    if not is_train:
+        return (word_idx, sequence_length), tf.nn.softmax(prediction)
+
+    label = tf.placeholder(tf.int64, shape=[None, ])
+    loss = tf.nn.softmax_cross_entropy_with_logits(
+        labels=tf.one_hot(label, 2), logits=prediction)
+    avg_loss = tf.reduce_mean(loss)
+
+    correct_count = tf.equal(tf.argmax(prediction, 1), label)
+    acc = tf.reduce_mean(tf.cast(correct_count, tf.float32))
+
+    with tf.variable_scope("reset_metrics_accuracy_scope") as scope:
+        g_acc = tf.metrics.accuracy(label, tf.argmax(prediction, axis=1))
+        vars = tf.contrib.framework.get_variables(
+            scope, collection=tf.GraphKeys.LOCAL_VARIABLES)
+        reset_op = tf.variables_initializer(vars)
+
+    return (word_idx, sequence_length, label), avg_loss, acc, g_acc, reset_op
+
+
+def padding_data(data, padding_size, value):
+    data = data + [value] * padding_size
+    return data[:padding_size]
+
+
+def train(args):
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_size = len(word_dict)
+
+    feeding_list, avg_loss, acc, g_acc, reset_op = dynamic_lstm_model(
+        dict_size, args.embedding_dim, args.hidden_dim, args.stacked_num)
+
+    adam_optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+    train_op = adam_optimizer.minimize(avg_loss)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=args.batch_size)
+
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.test(word_dict), buf_size=25000),
+        batch_size=args.batch_size)
+
+    def do_validation(sess):
+        sess.run(reset_op)
+        for batch_id, data in enumerate(test_reader()):
+            word_idx = map(lambda x: x[0], data)
+            sequence_length = np.array(
+                [len(seq) for seq in word_idx]).astype('int64')
+            maxlen = np.max(sequence_length)
+            word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+            word_idx = np.array(word_idx).astype('int64')
+            label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+            _, loss, fetch_acc, fetch_g_acc = sess.run(
+                [train_op, avg_loss, acc, g_acc],
+                feed_dict={
+                    feeding_list[0]: word_idx,
+                    feeding_list[1]: sequence_length,
+                    feeding_list[2]: label
+                })
+
+        return fetch_g_acc[1]
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_l)
+        sess.run(init_g)
+
+        for pass_id in xrange(args.pass_num):
+            # clear accuracy local variable 
+            sess.run(reset_op)
+            pass_start_time = time.time()
+            words_seen = 0
+
+            for batch_id, data in enumerate(train_reader()):
+                word_idx = map(lambda x: x[0], data)
+                sequence_length = np.array(
+                    [len(seq) for seq in word_idx]).astype('int64')
+                words_seen += np.sum(sequence_length)
+                maxlen = np.max(sequence_length)
+                word_idx = [padding_data(seq, maxlen, 0) for seq in word_idx]
+                word_idx = np.array(word_idx).astype('int64')
+                label = np.array(map(lambda x: x[1], data)).astype('int64')
+
+                _, loss, fetch_acc, fetch_g_acc = sess.run(
+                    [train_op, avg_loss, acc, g_acc],
+                    feed_dict={
+                        feeding_list[0]: word_idx,
+                        feeding_list[1]: sequence_length,
+                        feeding_list[2]: label
+                    })
+
+                print("pass_id=%d, batch_id=%d, loss: %f, acc: %f, avg_acc: %f"
+                      % (pass_id, batch_id, loss, fetch_acc, fetch_g_acc[1]))
+
+            pass_end_time = time.time()
+            time_consumed = pass_end_time - pass_start_time
+            words_per_sec = words_seen / time_consumed
+            test_acc = do_validation(sess)
+            print("pass_id=%d, test_acc: %f, words/s: %f, sec/pass: %f" %
+                  (pass_id, test_acc, words_per_sec, time_consumed))
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    print_arguments(args)
+
+    if args.infer_only:
+        pass
+    else:
+        train(args)
diff --git a/benchmark/tensorflow/vgg.py b/benchmark/tensorflow/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba5ec71a46b3ac8b2e1244424c39fd5192e5458
--- /dev/null
+++ b/benchmark/tensorflow/vgg.py
@@ -0,0 +1,324 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VGG16 benchmark in TensorFlow"""
+import tensorflow as tf
+import paddle.v2 as paddle
+import numpy as np
+import argparse
+import time
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--skip_batch_num',
+    type=int,
+    default=5,
+    help='The first num of minibatch num to skip, for better performance test')
+parser.add_argument(
+    '--iterations', type=int, default=80, help='The number of minibatches.')
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='GPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NHWC',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, NCHW=[batch, channels, height, width].'
+    'Only support NHWC right now.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+args = parser.parse_args()
+
+
+class VGG16Model(object):
+    def __init__(self):
+        self.parameters = []
+
+    def batch_norm_relu(self, inputs, is_training):
+        """Performs a batch normalization followed by a ReLU."""
+        # We set fused=True for a significant speed boost. See
+        # https://www.tensorflow.org/speed/speed_guide#common_fused_ops
+        inputs = tf.layers.batch_normalization(
+            inputs=inputs,
+            axis=1 if args.data_format == 'NCHW' else -1,
+            momentum=0.9,
+            epsilon=1e-05,
+            center=True,
+            scale=True,
+            training=is_training,
+            fused=True)
+        inputs = tf.nn.relu(inputs)
+        return inputs
+
+    def conv_bn_layer(self,
+                      name,
+                      images,
+                      kernel_shape,
+                      is_training,
+                      drop_rate=0.0):
+        with tf.name_scope(name) as scope:
+            kernel = tf.Variable(
+                tf.truncated_normal(
+                    kernel_shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            conv = tf.nn.conv2d(
+                images,
+                kernel, [1, 1, 1, 1],
+                data_format=args.data_format,
+                padding='SAME')
+            biases = tf.Variable(
+                tf.constant(
+                    0.0, shape=[kernel_shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(conv, biases)
+            out = self.batch_norm_relu(out, is_training)
+            out = tf.layers.dropout(out, rate=drop_rate, training=is_training)
+            return out
+
+    def fc_layer(self, name, inputs, shape):
+        with tf.name_scope(name) as scope:
+            fc_w = tf.Variable(
+                tf.truncated_normal(
+                    shape, dtype=tf.float32, stddev=1e-1),
+                name='weights')
+            fc_b = tf.Variable(
+                tf.constant(
+                    0.0, shape=[shape[-1]], dtype=tf.float32),
+                trainable=True,
+                name='biases')
+            out = tf.nn.bias_add(tf.matmul(inputs, fc_w), fc_b)
+            return out
+
+    def network(self, images, class_dim, is_training):
+        """ VGG16 model structure.
+
+            TODO(kuke): enable this network to support the 'NCHW' data format
+        """
+
+        # conv1
+        conv1_1 = self.conv_bn_layer(
+            'conv1_1', images, [3, 3, 3, 64], is_training, drop_rate=0.3)
+        conv1_2 = self.conv_bn_layer(
+            'conv1_2', conv1_1, [3, 3, 64, 64], is_training, drop_rate=0.0)
+        # pool1
+        pool1 = tf.nn.max_pool(
+            conv1_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool1')
+        # conv2
+        conv2_1 = self.conv_bn_layer(
+            'conv2_1', pool1, [3, 3, 64, 128], is_training, drop_rate=0.4)
+        conv2_2 = self.conv_bn_layer(
+            'conv2_2', conv2_1, [3, 3, 128, 128], is_training, drop_rate=0.0)
+        # pool2
+        pool2 = tf.nn.max_pool(
+            conv2_2,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool2')
+        # conv3
+        conv3_1 = self.conv_bn_layer(
+            'conv3_1', pool2, [3, 3, 128, 256], is_training, drop_rate=0.4)
+        conv3_2 = self.conv_bn_layer(
+            'conv3_2', conv3_1, [3, 3, 256, 256], is_training, drop_rate=0.4)
+        conv3_3 = self.conv_bn_layer(
+            'conv3_3', conv3_2, [3, 3, 256, 256], is_training, drop_rate=0.0)
+        # pool3
+        pool3 = tf.nn.max_pool(
+            conv3_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool3')
+        # conv4
+        conv4_1 = self.conv_bn_layer(
+            'conv4_1', pool3, [3, 3, 256, 512], is_training, drop_rate=0.4)
+        conv4_2 = self.conv_bn_layer(
+            'conv4_2', conv4_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv4_3 = self.conv_bn_layer(
+            'conv4_3', conv4_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool4
+        pool4 = tf.nn.max_pool(
+            conv4_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool4')
+        # conv5
+        conv5_1 = self.conv_bn_layer(
+            'conv5_1', pool4, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_2 = self.conv_bn_layer(
+            'conv5_2', conv5_1, [3, 3, 512, 512], is_training, drop_rate=0.4)
+        conv5_3 = self.conv_bn_layer(
+            'conv5_3', conv5_2, [3, 3, 512, 512], is_training, drop_rate=0.0)
+        # pool5
+        pool5 = tf.nn.max_pool(
+            conv5_3,
+            ksize=[1, 2, 2, 1],
+            strides=[1, 2, 2, 1],
+            padding='SAME',
+            name='pool4')
+        # flatten
+        shape = int(np.prod(pool5.get_shape()[1:]))
+        pool5_flat = tf.reshape(pool5, [-1, shape])
+        # fc1
+        drop = tf.layers.dropout(pool5_flat, rate=0.5, training=is_training)
+        fc1 = self.fc_layer('fc1', drop, [shape, 512])
+        # fc2
+        bn = self.batch_norm_relu(fc1, is_training)
+        drop = tf.layers.dropout(bn, rate=0.5, training=is_training)
+        fc2 = self.fc_layer('fc2', drop, [512, 512])
+
+        fc3 = self.fc_layer('fc3', fc2, [512, class_dim])
+
+        return fc3
+
+
+def run_benchmark():
+    """Run benchmark on cifar10 or flowers."""
+
+    if args.data_set == "cifar10":
+        class_dim = 10
+        raw_shape = (3, 32, 32)
+        dat_shape = (None, 32, 32, 3) if args.data_format == 'NHWC' else (
+            None, 3, 32, 32)
+    else:
+        class_dim = 102
+        raw_shape = (3, 224, 224)
+        dat_shape = (None, 224, 224, 3) if args.data_format == 'NHWC' else (
+            None, 3, 224, 224)
+
+    device = '/cpu:0' if args.device == 'CPU' else '/device:GPU:0'
+
+    with tf.device(device):
+        images = tf.placeholder(tf.float32, shape=dat_shape)
+        labels = tf.placeholder(tf.int64, shape=(None, ))
+        is_training = tf.placeholder('bool')
+        onehot_labels = tf.one_hot(labels, depth=class_dim)
+
+        vgg16 = VGG16Model()
+        logits = vgg16.network(images, class_dim, is_training)
+        loss = tf.losses.softmax_cross_entropy(
+            onehot_labels=onehot_labels, logits=logits)
+        avg_loss = tf.reduce_mean(loss)
+
+        correct = tf.equal(tf.argmax(logits, 1), labels)
+        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
+
+        optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
+        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+        with tf.control_dependencies(update_ops):
+            train_op = optimizer.minimize(avg_loss)
+
+    # data reader
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+    test_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.test10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+            buf_size=5120),
+        batch_size=args.batch_size)
+
+    # test
+    def test():
+        test_accs = []
+        for batch_id, data in enumerate(test_reader()):
+            test_images = np.array(
+         map(lambda x: np.transpose(x[0].reshape(raw_shape),
+         axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
+            test_labels = np.array(map(lambda x: x[1], data)).astype('int64')
+            test_accs.append(
+                accuracy.eval(feed_dict={
+                    images: test_images,
+                    labels: test_labels,
+                    is_training: False
+                }))
+        return np.mean(test_accs)
+
+    config = tf.ConfigProto(
+        intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+    config.gpu_options.allow_growth = True
+
+    with tf.Session(config=config) as sess:
+        init_g = tf.global_variables_initializer()
+        init_l = tf.local_variables_initializer()
+        sess.run(init_g)
+        sess.run(init_l)
+        iters, num_samples, start_time = 0, 0, time.time()
+        for pass_id in range(args.num_passes):
+            # train
+            num_samples = 0
+            start_time = time.time()
+            for batch_id, data in enumerate(train_reader()):
+                if iters == args.skip_batch_num:
+                    start_time = time.time()
+                    num_samples = 0
+                if iters == args.iterations:
+                    break
+                train_images = np.array(
+                    map(lambda x: np.transpose(x[0].reshape(raw_shape),
+                    axes=[1, 2, 0]) if args.data_format == 'NHWC' else x[0], data)).astype("float32")
+                train_labels = np.array(map(lambda x: x[1], data)).astype(
+                    'int64')
+                _, loss, acc = sess.run([train_op, avg_loss, accuracy],
+                                        feed_dict={
+                                            images: train_images,
+                                            labels: train_labels,
+                                            is_training: True
+                                        })
+                iters += 1
+                num_samples += len(data)
+                print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
+                      (pass_id, iters, loss, acc))
+            train_elapsed = time.time() - start_time
+            # test
+            pass_test_acc = test()
+            print("Pass = %d, Train speed = %f imgs/s, Test accuracy = %f\n" %
+                  (pass_id, num_samples / train_elapsed, pass_test_acc))
+
+
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+if __name__ == '__main__':
+    print_arguments()
+    run_benchmark()
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 6320b17520a687f88993b6f464d9115838b0f96b..6ed51c648478efb9784d0c43b169c285e740e0f3 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -62,35 +62,41 @@ endif()
 
 
 ## Then find the reference-cblas.  www.netlib.org/blas/
-
-
 set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
   "Folder contains reference-cblas")
-set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/include
-  /usr/include
-  /usr/include/cblas
-)
-
-set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/lib
-  /usr/lib
-  /usr/lib/blas/reference/
-  /usr/lib/reference/
-)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
+    ${REFERENCE_CBLAS_ROOT}/include
+    /usr/include
+    /usr/include/cblas
+  )
+
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
+    ${REFERENCE_CBLAS_ROOT}/lib
+    /usr/lib
+    /usr/lib/blas/reference/
+    /usr/lib/reference/
+  )
+else()
+  # Disable the finding of reference cblas under host's system path
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
+endif()
 
-find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
+if(WITH_SYSTEM_BLAS)
+  find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
         ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
-find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
+  find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
         ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
 
-if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER REFERENCE)
-  set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
-  set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
-  add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
-  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+    set(CBLAS_FOUND ON)
+    set(CBLAS_PROVIDER REFERENCE)
+    set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
+    set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+    add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
+    message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  endif()
 endif()
 
 if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 0f76f55270592c5625a9624b33f4c0f82efdc627..e4af34d10ed92c501dd805addb62747c91c00978 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -41,6 +41,10 @@ if(USE_EIGEN_FOR_BLAS)
     add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
 endif(USE_EIGEN_FOR_BLAS)
 
+if(EIGEN_USE_THREADS)
+    add_definitions(-DEIGEN_USE_THREADS)
+endif(EIGEN_USE_THREADS)
+
 if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
@@ -57,11 +61,7 @@ if(NOT WITH_GOLANG)
     add_definitions(-DPADDLE_WITHOUT_GOLANG)
 endif(NOT WITH_GOLANG)
 
-if(NOT WITH_GPU)
-    add_definitions(-DHPPL_STUB_FUNC)
-
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
+if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
 
     FIND_PACKAGE(CUDA REQUIRED)
@@ -84,7 +84,27 @@ else()
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
+
+    if(TENSORRT_FOUND)
+        if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+            message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+        endif()
+        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+            message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+        endif()
+        if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+            message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+        endif()
+        include_directories(${TENSORRT_INCLUDE_DIR})
+    endif()
+elseif(WITH_AMD_GPU)
+    add_definitions(-DPADDLE_WITH_HIP)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
+else()
+    add_definitions(-DHPPL_STUB_FUNC)
+    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+endif()
 
 if (WITH_MKLML AND MKLML_IOMP_LIB)
     message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
@@ -98,6 +118,10 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
 
+if(WITH_DISTRIBUTE)
+  add_definitions(-DPADDLE_WITH_DISTRIBUTE)
+endif()
+
 if(WITH_GOLANG)
   # we need to symlink Paddle directory into GOPATH. If we
   # don't do it and we have code that depends on Paddle, go
@@ -146,3 +170,11 @@ if(WITH_GOLANG)
   endif()
 
 endif(WITH_GOLANG)
+
+if(WITH_GRPC)
+    add_definitions(-DPADDLE_WITH_GRPC)
+endif(WITH_GRPC)
+
+if(WITH_BRPC_RDMA)
+    add_definitions(-DPADDLE_WITH_BRPC_RDMA)
+endif(WITH_BRPC_RDMA)
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
deleted file mode 100644
index 4823dc3e91390002aefac70f7931b4197db05789..0000000000000000000000000000000000000000
--- a/cmake/cpplint.cmake
+++ /dev/null
@@ -1,62 +0,0 @@
-# util to check C++ file style
-# * it basically use google cpplint.py.
-# * It provide "add_style_check_target" for cmake.
-#   Usage see add_style_check_target's document
-#
-# TODO(yuyang18): Add python style check.
-
-set(STYLE_FILTER)
-
-# diable unwanted filters
-
-# paddle do not indent public/potected/private in class
-set(STYLE_FILTER "${STYLE_FILTER}-whitespace/indent,")
-# paddle use mutable reference. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-runtime/references,")
-# paddle use relative path for include.
-set(STYLE_FILTER "${STYLE_FILTER}-build/include,")
-# paddle use <thread>, <mutex>, etc.
-set(STYLE_FILTER "${STYLE_FILTER}-build/c++11,")
-# paddle use c style casting. BUT IT IS NOT RECOMMANDED
-set(STYLE_FILTER "${STYLE_FILTER}-readability/casting")
-
-
-# IGNORE SOME FILES
-set(IGNORE_PATTERN
-    .*ImportanceSampler.*
-    .*cblas\\.h.*
-    .*\\.pb\\.txt
-    .*MultiDataProvider.*
-    .*pb.*
-    .*pybind.h)
-
-# add_style_check_target
-#
-# attach check code style step for target.
-#
-# first argument: target name to attach
-# rest arguments: source list to check code style.
-#
-# NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
-macro(add_style_check_target TARGET_NAME)
-    if(WITH_STYLE_CHECK)
-        set(SOURCES_LIST ${ARGN})
-        list(REMOVE_DUPLICATES SOURCES_LIST)
-        foreach(filename ${SOURCES_LIST})
-            foreach(pattern ${IGNORE_PATTERN})
-                if(filename MATCHES ${pattern})
-                    list(REMOVE_ITEM SOURCES_LIST ${filename})
-                endif()
-            endforeach()
-        endforeach()
-
-        if(SOURCES_LIST)
-            add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-                COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
-                        "--filter=${STYLE_FILTER}"
-                        ${SOURCES_LIST}
-                COMMENT "cpplint: Checking source code style"
-                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})        
-        endif()
-    endif()
-endmacro()
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 7edc8637727e300539a46bc3941ace87c87903b8..b520c03a836a9e3f263ba050f151877ffe0d071d 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -172,6 +172,8 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
 list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# in cuda9, suppress cuda warning on eigen 
+list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
 list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
 
diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..d205e3958234cabfbfeba8c3d725fe618ce48ace
--- /dev/null
+++ b/cmake/external/anakin.cmake
@@ -0,0 +1,44 @@
+if (NOT WITH_ANAKIN)
+  return()
+endif()
+
+set(ANAKIN_INSTALL_DIR "${THIRD_PARTY_PATH}/install/anakin" CACHE PATH
+  "Anakin install path." FORCE)
+set(ANAKIN_INCLUDE "${ANAKIN_INSTALL_DIR}" CACHE STRING "root of Anakin header files")
+set(ANAKIN_LIBRARY "${ANAKIN_INSTALL_DIR}" CACHE STRING "path of Anakin library")
+
+set(ANAKIN_COMPILE_EXTRA_FLAGS -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp)
+
+set(ANAKIN_LIBRARY_URL "https://github.com/pangge/Anakin/releases/download/3.0/anakin_release_simple.tar.gz")
+
+# A helper function used in Anakin, currently, to use it, one need to recursively include
+# nearly all the header files.
+function(fetch_include_recursively root_dir)
+    if (IS_DIRECTORY ${root_dir})
+        include_directories(${root_dir})
+    endif()
+
+    file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
+    foreach(sub ${ALL_SUB})
+        if (IS_DIRECTORY ${root_dir}/${sub})
+            fetch_include_recursively(${root_dir}/${sub})
+        endif()
+    endforeach()
+endfunction()
+
+if (NOT EXISTS "${ANAKIN_INSTALL_DIR}")
+    # download library
+    message(STATUS "Download Anakin library from ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "rm -rf ${ANAKIN_INSTALL_DIR}/*")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; wget -q ${ANAKIN_LIBRARY_URL}")
+    execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_INSTALL_DIR}")
+    execute_process(COMMAND bash -c "cd ${ANAKIN_INSTALL_DIR}; tar xzf anakin_release_simple.tar.gz")
+endif()
+
+if (WITH_ANAKIN)
+    message(STATUS "Anakin for inference is enabled")
+    message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
+    fetch_include_recursively(${ANAKIN_INCLUDE})
+    link_directories(${ANAKIN_LIBRARY})
+endif()
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index d9cd264b49d546c35a2c57a82ead83ea654b60ae..73713d93d5a52738651dda498fac5ea66e3589d2 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -23,8 +23,12 @@ set(BOOST_PROJECT       "extern_boost")
 # checked that the devtools package of CentOS 6 installs boost 1.41.0.
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
-set(BOOST_TAR           "boost_1_41_0")
-set(BOOST_URL           "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz")
+if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
+    message(STATUS "use pre defined download url")
+    set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
+    set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+endif()
+MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..30b227b6452abf44171a1a4e04569e66b16e67a4
--- /dev/null
+++ b/cmake/external/brpc.cmake
@@ -0,0 +1,69 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+find_library(SSL_LIBRARY NAMES ssl)
+ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY})
+
+find_library(CRYPTO_LIBRARY NAMES crypto)
+ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
+
+
+SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
+SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
+SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE)
+SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE)
+
+INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
+
+# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")
+
+# If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
+ExternalProject_Add(
+    extern_brpc
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/gongweibao/brpc"
+    GIT_TAG         "7dc04defad1fd4173aae170c3fcbde131b65155a"
+    PREFIX          ${BRPC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR}
+                    -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    -DCMAKE_PREFIX_PATH=${prefix_path}
+                    -DBRPC_WITH_GLOG=ON
+                    -DIOBUF_WITH_HUGE_BLOCK=ON
+                    -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
+                    ${EXTERNAL_OPTIONAL_ARGS}
+    LIST_SEPARATOR |
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+)
+ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy)
+ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
+ADD_DEPENDENCIES(brpc extern_brpc)
+
+
+LIST(APPEND external_project_dependencies brpc)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 6a701e076c95372f903a09d35d4208ee73bd584c..e029300eee9b99582f085f6b650e03f7dacc091a 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -4,18 +4,36 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
 
-ExternalProject_Add(
-    extern_eigen3
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
-    GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
-    PREFIX          ${EIGEN_SOURCE_DIR}
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   ""
-    TEST_COMMAND      ""
-)
+if(WITH_AMD_GPU)
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/sabreshao/hipeigen.git"
+        GIT_TAG         0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+        PREFIX          ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+    )
+else()
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/eigenteam/eigen-git-mirror"
+        # eigen on cuda9.1 missing header of math_funtions.hpp
+        # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
+        GIT_TAG         917060c364181f33a735dc023818d5a54f60e54c
+        PREFIX          ${EIGEN_SOURCE_DIR}
+        DOWNLOAD_NAME   "eigen"
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+    )
+endif()
 
 if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
     set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 0853b981813c5d60a12603471df7e0b216b0822f..85f40585da29bab9a107f5546e64870975f4c2d3 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -23,17 +23,29 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
 SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+
+include(ProcessorCount)
+ProcessorCount(NUM_OF_PROCESSOR)
+
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin)
 ENDIF()
 
+# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them
 ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
-    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.8.x"
+    # NOTE(wuyi):
+    # this package is generated by following steps:
+    # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git
+    # 2. git submodule update --init
+    # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
+    #    checkout and clean other dirs under third_party
+    # 4. remove .git, and package the directory.
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
@@ -46,7 +58,6 @@ ExternalProject_Add(
     INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
 )
 
-# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
 ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
              "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..fb5091731da02b497a14f119e944905eee4979d5
--- /dev/null
+++ b/cmake/external/leveldb.cmake
@@ -0,0 +1,44 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb)
+SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb)
+SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE)
+SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE)
+INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR})
+
+ExternalProject_Add(
+    extern_leveldb
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX ${LEVELDB_SOURCES_DIR}
+    URL "https://github.com/google/leveldb/archive/v1.18.tar.gz"
+    URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
+    INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ 
+        && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES}
+        && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/
+    BUILD_IN_SOURCE 1
+)
+
+ADD_DEPENDENCIES(extern_leveldb snappy)
+
+ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
+ADD_DEPENDENCIES(leveldb extern_leveldb)
+
+LIST(APPEND external_project_dependencies leveldb)
+
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index a25cff5fc567f22d4573625487f31bd4192bb172..20dda35c5ccd98f5672d867c26ab97a215483543 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -36,7 +36,8 @@ MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
 SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
 
-INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
+INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include mkldnn.h
 
 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
@@ -44,15 +45,16 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
 ELSE()
     MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()
-
-SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
-SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result")
+SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
 ExternalProject_Add(
     ${MKLDNN_PROJECT}
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS             ${MKLDNN_DEPENDS}
     GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.11"
+    GIT_TAG             "a29d8487a63afca3d5b8c5bbdbb473cf8ccc6e51"
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
@@ -60,6 +62,7 @@ ExternalProject_Add(
     CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
     CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
     CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
+    CMAKE_ARGS          -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
                         -DMKLROOT:PATH=${MKLML_ROOT}
 )
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 739a910c7c670b7b9f89e543582a32a80546fb11..82c424fb79d5596c31891bc395699bf9ff4e7e7e 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,14 +27,18 @@ ENDIF()
 INCLUDE(ExternalProject)
 
 SET(MKLML_PROJECT       "extern_mklml")
-SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
+  MESSAGE(STATUS "use pre defined download url")
+  SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE)
+  SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+ENDIF()
+MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")
 SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
 SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
-SET(MKLML_ROOT          ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_ROOT          ${MKLML_INSTALL_DIR})
 SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
 SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
 SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
@@ -46,7 +50,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
 FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
   "PROJECT(MKLML)\n"
   "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${MKLML_VER}\n"
+  "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
   "        DESTINATION ${MKLML_DST_DIR})\n")
 
 ExternalProject_Add(
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
deleted file mode 100644
index af5c689c3524741a88518eeb3f85996872257677..0000000000000000000000000000000000000000
--- a/cmake/external/nccl.cmake
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if(NOT WITH_GPU)
-  return()
-endif()
-
-include(ExternalProject)
-
-set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
-
-include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
-
-if(WITH_DSO)
-  # If we use DSO, we do not build nccl, just download the dependencies
-  set(NCCL_BUILD_COMMAND "")
-  set(NCCL_INSTALL_COMMAND "")
-  set(NCCL_INSTALL_DIR "")
-else()
-  # otherwise, we build nccl and link it.
-  set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
-  # Note: cuda 8.0 is needed to make nccl
-  # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
-  set(NCCL_BUILD_COMMAND "make -j 8")
-  set(NCCL_INSTALL_COMMAND  "make install PREFIX=${NCCL_INSTALL_DIR}")
-endif()
-
-ExternalProject_Add(
-    extern_nccl
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
-    GIT_TAG         "v1.3.4-1"
-    PREFIX          "${NCCL_SOURCE_DIR}"
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
-    INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
-    INSTALL_DIR       "${NCCL_INSTALL_DIR}"
-    TEST_COMMAND      ""
-)
-
-if(WITH_DSO)
-  if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
-    add_library(nccl STATIC ${dummyfile})
-  else()
-    add_library(nccl INTERFACE)
-  endif()
-else()
-  add_library(nccl STATIC IMPORTED GLOBAL)
-  set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
-               ${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
-endif()
-
-add_dependencies(nccl extern_nccl)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 8af2765f58717408e3a1ef6b500bb01511bfd8d3..ce6a88b51dc98ac46dd3935f12658d60d364ba8c 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
         "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
+    ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+
     SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
     SET(OPENBLAS_COMMIT "v0.2.20")
 
@@ -112,7 +114,12 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
 ADD_LIBRARY(cblas STATIC ${dummyfile})
-TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
+
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+  TARGET_LINK_LIBRARIES(cblas dynload_mklml)
+ELSE()
+  TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
+ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
 
 IF(NOT ${CBLAS_FOUND})
     ADD_DEPENDENCIES(cblas extern_openblas)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 0fde4373a4be58e71ff1a305bd4991cc554d7a34..2665996432b1f6681927320a85d6835094abe4cd 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -212,6 +212,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
         ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
             ${OPTIONAL_ARGS}
             -Dprotobuf_BUILD_TESTS=OFF
+            -DCMAKE_SKIP_RPATH=ON
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
             -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index 71f54c425d4c38e271a8f1b78887d95a27252443..af09ed4d5d6e21cc50aba5198a7e9ea56f49451a 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -11,19 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
-IF(MOBILE_INFERENCE)
+if(MOBILE_INFERENCE OR RPI)
     return()
-ENDIF()
+endif()
 
 include (ExternalProject)
 
 # NOTE: snappy is needed when linking with recordio
 
-SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
-SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
-SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+
+set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 
 ExternalProject_Add(
     extern_snappy
@@ -46,13 +47,10 @@ ExternalProject_Add(
                      -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-    BUILD_COMMAND   make -j8
-    INSTALL_COMMAND make install
 )
 
 add_library(snappy STATIC IMPORTED GLOBAL)
-set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
-             "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
 
 include_directories(${SNAPPY_INCLUDE_DIR})
 add_dependencies(snappy extern_snappy)
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 5377a0b046a796cd6f0bb1fb466e1cd0b4b678bf..6df636d7fa8757ade73892bda03a80ba9767472b 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR RPI)
     return()
 ENDIF()
 
@@ -21,9 +20,11 @@ include (ExternalProject)
 
 # NOTE: snappy is needed when linking with recordio
 
-SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
-SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
-SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
+
+set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
 
 ExternalProject_Add(
         extern_snappystream
@@ -45,14 +46,13 @@ ExternalProject_Add(
                         -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
                         -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        BUILD_COMMAND   make -j8
-        INSTALL_COMMAND make install
         DEPENDS snappy
 )
 
 add_library(snappystream STATIC IMPORTED GLOBAL)
-set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
-        "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
+
+include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
+include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
 
-include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
 add_dependencies(snappystream extern_snappystream)
diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0159815fed81bdff6de3e561af569e9edc75f947
--- /dev/null
+++ b/cmake/external/threadpool.cmake
@@ -0,0 +1,30 @@
+INCLUDE(ExternalProject)
+
+SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
+SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
+INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
+
+ExternalProject_Add(
+    extern_threadpool
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/progschj/ThreadPool.git"
+    GIT_TAG         9a42ec1329f259a5f4881a291db1dcb8f2ad9040
+    PREFIX          ${THREADPOOL_SOURCE_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
+    add_library(simple_threadpool STATIC ${dummyfile})
+else()
+    add_library(simple_threadpool INTERFACE)
+endif()
+
+add_dependencies(simple_threadpool extern_threadpool)
+
+LIST(APPEND external_project_dependencies simple_threadpool)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 9a9a20f897e09b823dfb19ff841c3f2aeb3f9fe6..07e1137e16afc1e4e9ab9640e1ccaea8008a0cd2 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -38,8 +38,7 @@ ENDIF()
 ExternalProject_Add(
     extern_warpctc
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
-    GIT_TAG         b63a0644654a3e0ed624c85a1767bc8193aead09
+    GIT_REPOSITORY  "https://github.com/dzhwinter/warp-ctc.git"
     PREFIX          ${WARPCTC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -62,7 +61,8 @@ ExternalProject_Add(
 )
 
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers.
 
 ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 20b8506e678af4db6ccb65bef99d28e085a67bf2..c3d73235453c8c9fd2859c3ab142888e8bda2dbe 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -25,7 +25,8 @@ ELSE(WIN32)
   SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)
 
-INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers.
+INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h.
 
 ExternalProject_Add(
     extern_zlib
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 471e3929069d0d28105404b4f0f6baa303faf0e0..fd7fc16bff5651f022b484623243048fbd225b5a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
     set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)
 
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find all fluid modules is used for paddle fluid static library
+# for building inference libs
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+  endif()
+endfunction(find_fluid_modules)
+
 function(merge_static_libs TARGET_NAME)
   set(libs ${ARGN})
   list(REMOVE_DUPLICATES libs)
@@ -195,14 +209,16 @@ function(cc_library TARGET_NAME)
         list(REMOVE_ITEM cc_library_DEPS warpctc)
         add_dependencies(${TARGET_NAME} warpctc)
       endif()
-      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
-        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
-        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
-        target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
-      else()
-        target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
+      # Only deps libmklml.so, not link
+      if("${cc_library_DEPS};" MATCHES "mklml;")
+        list(REMOVE_ITEM cc_library_DEPS mklml)
+        if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml")
+          list(APPEND cc_library_DEPS dynload_mklml)
+        endif()
+        add_dependencies(${TARGET_NAME} mklml)
+        target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
       endif()
+      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
     endif()
     
@@ -213,8 +229,6 @@ function(cc_library TARGET_NAME)
         list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
       endif()
     endforeach()
-    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
-
   else(cc_library_SRCS)
     if(cc_library_DEPS)
       merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -238,20 +252,20 @@ endfunction(cc_binary)
 
 function(cc_test TARGET_NAME)
   if(WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
-    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
-      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
-    endif()
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (${cc_test_SERIAL})
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+    endif()
   endif()
 endfunction(cc_test)
 
@@ -279,7 +293,6 @@ function(nv_library TARGET_NAME)
           list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
         endif()
       endforeach()
-      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
     else(nv_library_SRCS)
       if (nv_library_DEPS)
         merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
@@ -306,17 +319,96 @@ endfunction(nv_binary)
 
 function(nv_test TARGET_NAME)
   if (WITH_GPU AND WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags glog)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
+    if (nv_test_SERIAL)
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+    endif()
   endif()
 endfunction(nv_test)
 
+function(hip_library TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_library_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    if(hip_library_SRCS)
+      if (hip_library_SHARED OR hip_library_shared) # build *.so
+        add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+      else()
+        add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
+	find_fluid_modules(${TARGET_NAME})
+      endif()
+      if (hip_library_DEPS)
+	add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
+	target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      endif()
+      # cpplint code style
+      foreach(source_file ${hip_library_SRCS})
+	string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+	if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+	  list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+	endif()
+      endforeach()
+    else(hip_library_SRCS)
+      if (hip_library_DEPS)
+	merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+      else()
+	message(FATAL "Please specify source file or library in nv_library.")
+      endif()
+    endif(hip_library_SRCS)
+  endif()
+endfunction(hip_library)
+
+function(hip_binary TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
+    if(hip_binary_DEPS)
+      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
+      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+    endif()
+  endif()
+endfunction(hip_binary)
+
+function(hip_test TARGET_NAME)
+  if (WITH_AMD_GPU AND WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_test_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
+    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    add_test(${TARGET_NAME} ${TARGET_NAME})
+  endif()
+endfunction(hip_test)
+
 function(go_library TARGET_NAME)
   set(options STATIC static SHARED shared)
   set(oneValueArgs "")
@@ -485,9 +577,9 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
 endfunction()
 
@@ -511,6 +603,9 @@ function(grpc_library TARGET_NAME)
   get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
   get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
 
+  #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but
+  # somehow it didn't. line 602 to 604 is to patching this. Leaving this here 
+  # for now to enable dist CI.
   protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
   set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
   set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
@@ -521,6 +616,9 @@ function(grpc_library TARGET_NAME)
           COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
           ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
           --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          "${ABS_PROTO}"
           DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
 
   # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
@@ -537,3 +635,21 @@ function(grpc_library TARGET_NAME)
     COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
 endfunction()
+
+
+function(brpc_library TARGET_NAME)
+  set(oneValueArgs PROTO)
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  message(STATUS "generating brpc ${brpc_library_PROTO}")
+
+  get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE)
+  get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE)
+  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+
+  protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}")
+  cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}")
+  cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}")
+endfunction()
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..bfe491bd6b7602959d3dd60bd06c67993593cc9b
--- /dev/null
+++ b/cmake/hip.cmake
@@ -0,0 +1,43 @@
+if(NOT WITH_AMD_GPU)
+    return()
+endif()
+
+include_directories("/opt/rocm/include")
+include_directories("/opt/rocm/hipblas/include")
+include_directories("/opt/rocm/hiprand/include")
+include_directories("/opt/rocm/rocrand/include")
+include_directories("/opt/rocm/rccl/include")
+include_directories("/opt/rocm/thrust")
+
+list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
+
+set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" )
+
+if(WITH_DSO)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
+endif(WITH_DSO)
+
+if(WITH_DOUBLE)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
+endif(WITH_DOUBLE)
+
+if(WITH_TESTING)
+  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
+endif(WITH_TESTING)
+
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND HIP_HCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+
+if("x${HCC_HOME}" STREQUAL "x")
+  set(HCC_HOME "/opt/rocm/hcc")
+endif()
+
+set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
+set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
+
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 6b2237b858380f384be0aa3c6ae24a4c83ad646d..c6979713231f631f8757e4139d6f685d4554b54e 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -1,14 +1,16 @@
-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find all fluid modules is used for paddle fluid static library
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 # make package for paddle fluid shared and static library
 function(copy TARGET)
@@ -24,7 +26,7 @@ function(copy TARGET)
         message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
     endif()
     math(EXPR len "${copy_lib_SRCS_len} - 1")
-    
+
     add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
     foreach(index RANGE ${len})
         list(GET copy_lib_SRCS ${index} src)
@@ -37,43 +39,91 @@ function(copy TARGET)
 endfunction()
 
 # third party
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
 copy(eigen3_lib
   SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
   DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
+  DEPS eigen3
 )
 
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
 copy(gflags_lib
   SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
   DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS gflags
 )
 
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
 copy(glog_lib
   SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
   DSTS ${dst_dir} ${dst_dir}/lib
+  DEPS glog
+)
+
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
+copy(boost_lib
+  SRCS ${BOOST_INCLUDE_DIR}/boost
+  DSTS ${dst_dir}
+  DEPS boost
 )
 
 if(NOT PROTOBUF_FOUND)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
     copy(protobuf_lib
       SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
       DSTS ${dst_dir} ${dst_dir}/lib
+      DEPS extern_protobuf
     )
 endif()
 
 if(NOT CBLAS_FOUND)
-    set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
     copy(openblas_lib
       SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
       DSTS ${dst_dir} ${dst_dir}
+      DEPS extern_openblas
     )
+elseif (WITH_MKLML)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
+    copy(mklml_lib
+      SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
+      DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
+      DEPS mklml
+    )
+endif()
+
+if(WITH_MKLDNN)
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
+  copy(mkldnn_lib
+    SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS mkldnn
+  )
+endif()
+
+if(NOT MOBILE_INFERENCE AND NOT RPI)
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
+  copy(snappy_lib
+    SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS snappy)
+
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
+  copy(snappystream_lib
+    SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS snappystream)
+
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
+  copy(zlib_lib
+    SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib
+    DEPS zlib)
 endif()
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
+set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 copy(framework_lib DEPS framework_py_proto 
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
@@ -86,8 +136,29 @@ copy(memory_lib
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
 )
 
+set(inference_deps paddle_fluid_shared paddle_fluid)
+
+if(WITH_CONTRIB)
+    message(STATUS "installing contrib")
+    set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference")
+    if (WITH_ANAKIN AND WITH_GPU)
+        copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
+            SRCS
+            ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api
+            ${PADDLE_BINARY_DIR}/third_party/install/anakin/*.tar.gz # anakin release
+            DSTS ${contrib_dst_dir}/anakin ${contrib_dst_dir}/anakin)
+        list(APPEND inference_deps contrib_anakin_inference_lib)
+   endif()
+
+  copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
+        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
+        DSTS ${contrib_dst_dir} ${contrib_dst_dir})
+  list(APPEND inference_deps contrib_inference_lib)
+endif()
+
 set(module "inference")
-copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
+copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
@@ -104,4 +175,31 @@ copy(string_lib
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
 
+set(module "pybind")
+copy(pybind_lib
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h
+  DSTS ${dst_dir}/${module}
+)
+
+# CMakeCache Info
+copy(cmake_cache
+  SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
+  DSTS ${FLUID_INSTALL_DIR})
+
 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) 
+
+# paddle fluid version
+execute_process(
+  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
+set(version_file ${FLUID_INSTALL_DIR}/version.txt)
+file(WRITE ${version_file}
+  "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
+  "WITH_MKL: ${WITH_MKL}\n"
+  "WITH_GPU: ${WITH_GPU}\n")
+if(WITH_GPU)
+  file(APPEND ${version_file}
+    "CUDA version: ${CUDA_VERSION}\n"
+    "CUDNN version: v${CUDNN_MAJOR_VERSION}\n")
+endif()
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ac19b1651893f18b14c62a0986df75bed25d7e80
--- /dev/null
+++ b/cmake/tensorrt.cmake
@@ -0,0 +1,35 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT")
+find_path(TENSORRT_INCLUDE_DIR NvInfer.h
+    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include
+    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include
+    NO_DEFAULT_PATH
+)
+
+find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
+    PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib
+    $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib
+    NO_DEFAULT_PATH
+    DOC "Path to TensorRT library.")
+
+if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+    set(TENSORRT_FOUND ON)
+else()
+    set(TENSORRT_FOUND OFF)
+endif()
+
+if(TENSORRT_FOUND)
+    file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS)
+    string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION
+        "${TENSORRT_VERSION_FILE_CONTENTS}")
+    string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1"
+        TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}")
+
+    message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
+        "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
+    include_directories(${TENSORRT_INCLUDE_DIR})
+    list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
+endif()
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index da67701ec1af57df742dce105990cffa40f45d7c..0f9521616952a2857222feab8c38fb480761ee2d 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -1 +1,11 @@
+add_custom_target(paddle_apis ALL
+                  DEPENDS paddle_v2_apis paddle_fluid_apis)
+
+add_custom_target(paddle_docs ALL
+                  DEPENDS paddle_v2_docs paddle_v2_docs_cn
+                  paddle_fluid_docs paddle_fluid_docs_cn
+                  paddle_mobile_docs paddle_mobile_docs_cn)
+
 add_subdirectory(v2)
+add_subdirectory(fluid)
+add_subdirectory(mobile)
diff --git a/doc/about/about_us.rst b/doc/about/about_us.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f67d8b8130030db8d7e7d10b30271a913bd6272a
--- /dev/null
+++ b/doc/about/about_us.rst
@@ -0,0 +1,53 @@
+=========
+关于我们
+=========
+
+什么是PaddlePaddle
+--------------------
+
+- PaddlePaddle是百度自主研发并开源的深度学习框架，它能够让开发者和企业安全、快速地实现自己的AI想法
+
+- 项目团队汇聚了全球顶级的深度学习科学家，致力于为开发者和企业提供最好的深度学习研发体验
+
+- 框架具有易学、易用、安全、高效四大特性，是最适合中国开发者和企业的深度学习工具
+
+PaddlePaddle的技术特色
+-------------------------
+
+- 新一代深度学习框架： PaddlePaddle是基于“深度学习编程语言”的新一代深度学习框架，在保证性能的同时，极大的提升了框架对模型的表达能力，能够描述任意潜在可能出现的模型
+
+- 对大规模计算更加友好：经过百度内多种大规模计算业务的打磨，PaddlePaddle在分布式计算上表现优异，基于EDL技术能够节约大量计算资源，同时也能支持大规模稀疏模型的训练
+
+- 提供可视化的深度学习：通过Visual DL可以帮助开发者方便的观测训练整体趋势、数据样本质量和中间结果、参数分布和变化趋势、以及模型的结构，帮助开发者更便捷的完成编程过程
+
+提供基于PaddlePaddle的教育体系
+--------------------------------
+
+- 深度学习课程：百度与中国市场顶级的教育、培训机构共同开发了深度学习精品课程以及学习教材，帮助开发者从零掌握深度学习
+
+- 深度学习实训：对于目的是科研和学习的用户，PaddlePaddle提供了无需安装、线上运行的开发环境，并提供算法、算力、数据支持
+
+- 线下培训：提供丰富、高质量的线下教育活动，如青年教师培训、线下实战营、沙龙等多种形式的培训和交流
+
+
+提供基于PaddlePaddle的AI服务
+------------------------------
+
+- EadyDL：可以帮助零算法基础的企业快速完成一个深度学习任务，只需少量的数据即可得到优质的模型
+
+- AI市场：提供标准化的AI 能力、产品的交易机制，帮助企业快速找到所需，有效开展AI业务
+
+- 深度学习竞赛： PaddlePaddle汇聚顶尖深度学习开发者，企业可以发布自己的商业问题，通过竞赛方式快速找到最优的解决方案
+
+你对PaddlePaddle有任何的问题都可以通过以下方式联系到我们
+-----------------------------------------------------------
+
+- 学习/使用问题：可以在 `PaddlePaddle开源社区 <https://github.com/PaddlePaddle/Paddle/issues>`_，以及 `PaddlePaddle中文社区 <http://ai.baidu.com/forum/topic/list/168>`_ 向我们反馈
+
+- 对PaddlePaddle框架发展的建议：可发送邮件至Paddle-better@baidu.com
+
+我们期待与你一起打造世界顶级深度学习框架，共同推动AI技术的进步
+
+
+
+PaddlePaddle团队
diff --git a/doc/design/api.md b/doc/design/api.md
deleted file mode 100644
index e6a4638d9100d9b07c3ee6b92b530a17eae1c162..0000000000000000000000000000000000000000
--- a/doc/design/api.md
+++ /dev/null
@@ -1,262 +0,0 @@
-# PaddlePaddle Design Doc
-
-## Ingredients
-
-As our design principle is starting from the essence: how could we
-allow users to express and solve their problems as neural networks.
-Some essential concepts that our API have to provide include:
-
-1. A *topology* is an expression of *layers*.
-
-1. A layer could be any kind of computation, including *cost*.
-
-1. Some layers have parameters, some don't. Most costs don't have
-   parameters.
-
-1. In some topologies, layers share parameters.  For
-   example,
-   [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
-
-1. At programming time, users specify topologies and possible sharing
-   of parameters.  PaddlePaddle can figure out and create parameters
-   required (and possibly shared) by one or more topologies.
-
-
-## Starting from Examples
-
-As a summarization
-of
-[our disucssion](https://github.com/PaddlePaddle/Paddle/issues/1315),
-let us present two examples here:
-
-
-### Example 1. Sharing Parameters between Layers
-
-We use
-the
-[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
-in this example.  For your convenience, I copy-a-paste the model's
-topology as follows:
-
-```
-A -> f -\
-Q -> f --> cost
-B -> f -/
-```
-
-The following program trains the topology including the cost, and then
-use the sub-network in the trained topology in inference:
-
-```python
-def f(in):
-    e = paddle.layer.embedding(in, parameter_name="embedding")
-    o = paddle.layer.softmax(e, parameter_name="semantic")
-    return o
-
-# Create 3 topologies (subnets), they share parameters because all
-# correspoinding layers have the same parameter names.
-fA = f(paddle.layer.data(input_name="A"))
-fB = f(paddle.layer.data(input_name="B"))
-fQ = f(paddle.layer.data(input_name="Q"))
-
-topology = paddle.layer.less_than(
-               paddle.layer.cross_entropy(fA, fQ),
-               paddle.layer.corss_entropy(fB, fQ))
-
-# Derive parameters required in topology and create them in model.
-parameters = paddle.parameters.create(topology)
-
-# Estimate parameters used in topology from data.
-paddle.train(topology, parameters, reader=read_ranking_model_data)
-
-# Inference using fA (or fB or fC, as they share their parameters).
-[testA, testB, testQ] = read_ranking_model_data()
-print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA)
-```
-
-
-### Example 2. Sharing Parameters between "Models"
-
-We use [GAN](https://github.com/PaddlePaddle/book/tree/develop/gan) in
-this example.  In the following example program, `d0` and `d1`
-correspond to the two networks in the following figure:
-
-<img src="https://github.com/wangyang59/book/raw/00036f4b0da5225041a6824587c1a01cf20159b1/gan/image/gan_ig.png" width=400 />
-
-```python
-def G(in):
-    # over-simplified example as G has only one layers:
-    return paddle.layer.fc(in, parameter_name="G")
-
-def D(in);
-    # again, over-simplified:
-    return paddle.layer.fc(in, parameter_name="D")
-
-# Construct the first topology, which contains both D and G.
-# By learning this topology, we update parameters of G.
-d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))
-
-# Construct a second topology d1, which contains only D. By
-# training this topology, we update parameters of D.  Note
-# that d1 share parameters with d0.
-d1 = paddle.layer.should_be_true(D(paddle.layer.data()))
-
-# Create parameters from a list of multiple topologies (models) for
-# the chance to share parameters between these topologies.
-parameters = paddle.parameters.create([d0, d1])
-
-# Iterative training of GAN.
-for ...:
-    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
-    train(d1, parameters, reader=read_from_realistic_images)
-
-# Use d1 for inference:
-print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
-```
-
-
-### Summarization
-
-
-Above two programs reveal some important design concerns:
-
-1. Users describe a topology as an expression of layers.  Every layer
-   has a *parameter name*.  If the users don't specify it explicitly, it's automatically generated as a unique name.  By
-   specifying the parameter name, users can specify the sharing of
-   parameters between layers and even between topologies.
-
-1. `paddle.parameters.create` figures out parameters required by one
-   or more topologies from parameter names of layers.  It creates these
-   parameters and returns a `ParameterSet` object, which is in essence
-   a map from *parameter names* to *parameters*.
-
-1. At training and inference time, `paddle.train` and `paddle.infer`
-   requires both a topology and the parameter set that holds the parameters of that topology.  There are some reasons:
-
-   1. This prevents users from forgetting to call
-      `paddle.parameters.create`.
-   1. `paddle.train` needs to know which parameter set to update.
-   1. Users could load another (pre-trained) parameter set and use it
-      with a topology in `train.infer`.
-
-1. By specifying the `immutable_parameters` parameter of
-   `paddle.train`, we can forbid the update of these parameters.
-
-
-## Reader
-
-Not all programming frameworks allow users to define I/O functions.
-An example is Google MapReduce, which can only read from text,
-SSTable, and RecordIO files.  Hadoop MapReduce allows users to define
-readers and writers by deriving from base classes `Reader` and
-`Writer`.  The former is less flexible but also less error-prone.  We
-decide to provide the flexibility to users to define their readers.
-
-
-There are some open questions here:
-
-1. **Should a reader return a Python dictionary?**
-
-1. **How to map multiple outputs from a reader to multiple data layers?**
-
-1. **How to easily compose some existing readers to read more data and
-   feed a topology with more data layers?**
-
-
-## Training
-
-The recommended way to training a model is to call `paddle.train`,
-which simply calls `paddle.trainer.Default`, a global variable of
-type `paddle.trainer.SGD`.  Equivalently, we can do
-
-```python
-opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
-opt.train(topology, parameters, reader=read, ...)
-```
-
-### Updater
-
-Please be aware that a trainer can accept an updater as its data
-member, where an updater is a class derived from
-`paddle.trainer.Updater`.  This is to make it easier to customize
-trainers, as discussed
-[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
-
-### Event Handler
-
-`paddle.train` and `paddle.trainer.XXX.train` take an optional
-parameter `event_handler`, which should be either `None` or a function
-that handle some events:
-
-1. BeginTraining
-1. EndTraining
-1. BeginIteration
-1. EndIteration
-1. BeginPass
-1. EndPass
-
-where EndPass is sent if and only if the reader yields
-`end_pass=True`.
-
-An example as follows:
-
-```python
-def event_handler(event):
-    if ininstance(event, paddle.event.EndIteration):
-        print paddle.test(...)
-
-paddle.train(topology, parameters, reader, event_handler)
-```
-
-If we are writing a PaddlePaddle program in and for iPython/Jypyter,
-we can use metaplotlib in the event handler to plot a curve of
-cost/error versus iterations, as shown
-[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
-
-### Distributed Training
-
-If users want to do distributed training on a cluster, s/he should
-call `paddle.dist_train` and provides access tokens to the cluster as
-a parameter.
-
-For example, if the user has a TLS certificate that allows him to
-access a Kubernetes cluster, s/he should be able to call
-
-```python
-paddle.dist_train(model,
-                  trainer=paddle.trainer.SGD(...,
-                                             paddle.updater.Adam(...)),
-                  reader=read,
-                  k8s_user="yi",
-                  k8s_token="kube_cluster_tls.pem",
-                  k8s_job="hello",
-                  num_parameter_servers=15)
-```
-
-The pseudo code of `paddle.dist_train` is as follows:
-
-```python
-def dist_train(topology, parameters, trainer, reader, ...):
-    if os.getenv("KUBERNETES_SERVICE_HOST") == None:
-        image_name = k8s_user + '/' + k8s_job
-        docker_build(image_name)
-        docker_push()
-        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
-    else:
-        rank = kube_list_containers_in_job_and_return_current_containers_rank()
-        if rank == 0:
-            master()
-        elif rank < 15:
-            parameter_server()
-        else:
-            trainer.train(model, reader=read)
-```
-
-Please be aware that if a process is running on the Kubernetes
-cluster, it will have some environment variables pre-defined.
-
-If `dist_train` doesn't see these environment variables, it knows
-that it's running on users' personal computer, and it should work as a
-*launcher*.  Otherwise, it knows that it's running on the cluster and
-need to figure out its role as either the master, or a trainer, or a
-parameter server.
diff --git a/doc/design/block.md b/doc/design/block.md
deleted file mode 100644
index 907a2def557fd472ac4d679c73447bd9107d1190..0000000000000000000000000000000000000000
--- a/doc/design/block.md
+++ /dev/null
@@ -1,336 +0,0 @@
-# Design Doc: Block and Scope
-
-## The Representation of Computation
-
-Both deep learning systems and programming languages help users describe computation procedures.  These systems use various representations of computation:
-
-- Caffe, Torch, and Paddle: sequences of layers.
-- TensorFlow, Caffe2, Mxnet: graph of operators.
-- PaddlePaddle: nested blocks, like C++ and Java programs.
-
-## Block in Programming Languages and Deep Learning
-
-In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators.
-
-Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
-
-| programming languages | PaddlePaddle          |
-|-----------------------|-----------------------|
-| for, while loop       | RNN, WhileOp          |
-| if, if-else, switch   | IfElseOp, SwitchOp    |
-| sequential execution  | a sequence of layers  |
-
-A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
-
-## Stack Frames and the Scope Hierarchy
-
-The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
-
-| programming languages | PaddlePaddle                    |
-|-----------------------|---------------------------------|
-| stack                 | scope hierarchy                 |
-| stack frame           | scope                           |
-| push at entering block| push at entering block          |
-| pop at leaving block  | destroy when minibatch completes|
-
-1. In traditional programs:
-
-   - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables.
-   - After the execution leaves the right curly brace, the runtime pops the frame.
-   - The maximum number of frames in the stack is the maximum depth of nested blocks.
-
-1. In PaddlePaddle
-
-   - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
-   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
-   - The height of the highest tree is the maximum depth of nested blocks.
-   - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
-
-## Use Blocks in C++ and PaddlePaddle Programs
-
-Let us consolidate the discussion by presenting some examples.
-
-### Blocks with `if-else` and `IfElseOp`
-
-The following C++ programs shows how blocks are used with the `if-else` structure:
-
-```c++
-namespace pd = paddle;
-
-int x = 10;
-int y = 1;
-int z = 10;
-bool cond = false;
-int o1, o2;
-if (cond) {
-  int z = x + y;
-  o1 = z;
-  o2 = pd::layer::softmax(z);
-} else {
-  int d = pd::layer::fc(z);
-  o1 = d;
-  o2 = d+1;
-}
-
-```
-
-An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
-
-```python
-import paddle as pd
-
-x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var(1) # shape=[1], value=1
-z = minibatch([10, 20, 30]) # shape=[None, 1]
-cond = larger_than(x, 15) # [false, true, true]
-
-ie = pd.ifelse()
-with ie.true_block():
-    d = pd.layer.add_scalar(x, y)
-    ie.output(d, pd.layer.softmax(d))
-with ie.false_block():
-    d = pd.layer.fc(z)
-    ie.output(d, d+1)
-o1, o2 = ie(cond)
-```
-
-In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` .
-
-The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
-
-
-### Blocks with `for` and `RNNOp`
-
-The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md) :
-
-```python
-x = sequence([10, 20, 30]) # shape=[None, 1]
-m = var(0) # shape=[1]
-W = var(0.314, param=true) # shape=[1]
-U = var(0.375, param=true) # shape=[1]
-
-rnn = pd.rnn()
-with rnn.step():
-  h = rnn.memory(init = m)
-  h_prev = rnn.previous_memory(h)
-  a = layer.fc(W, x)
-  b = layer.fc(U, h_prev)  
-  s = pd.add(a, b)
-  act = pd.sigmoid(s)
-  rnn.update_memory(h, act)
-  rnn.output(a, b)
-o1, o2 = rnn()
-```
-has its equivalent C++ program as follows
-
-```c++
-int* x = {10, 20, 30};
-int* m = {0};
-int* W = {0.314};
-int* U = {0.375};
-
-int mem[sizeof(x) / sizeof(x[0]) + 1];
-int o1[sizeof(x) / sizeof(x[0]) + 1];
-int o2[sizeof(x) / sizeof(x[0]) + 1];
-for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
-  int x = x[i-1];
-  if (i == 1) mem[0] = m;
-  int a = W * x;
-  int b = Y * mem[i-1];
-  int s = fc_out + hidden_out;
-  int act = sigmoid(sum);
-  mem[i] = act;
-  o1[i] = act;
-  o2[i] = hidden_out;
-}
-```
-
-## Compilation and Execution
-
-Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
-
-The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
-
-## The "Binary Executable File Format"
-
-The definition of the protobuf message is as follows:
-
-```protobuf
-message BlockDesc {
-  repeated VarDesc vars = 1;
-  repeated OpDesc ops = 2;
-}
-```
-
-The step net in above RNN example would look like
-
-```
-BlockDesc {
-  vars = {
-    VarDesc {...} // x
-    VarDesc {...} // h
-    VarDesc {...} // fc_out
-    VarDesc {...} // hidden_out
-    VarDesc {...} // sum
-    VarDesc {...} // act
-  }
-  ops = {
-    OpDesc {...} // matmul
-    OpDesc {...} // add_two
-    OpDesc {...} // sigmoid
-  }
-};
-```
-
-Also, the RNN operator in above example is serialized into a protobuf message of type `OpDesc` and would look like:
-
-```
-OpDesc {
-  inputs = {0} // the index of x in vars of BlockDesc above
-  outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
-  attrs {
-    "states" : {1} // the index of h
-    "step_net" : <above step net>
-  }
-};
-```
-
-This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block.
-
-
-## The Compilation of Blocks
-
-During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
-
-VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope.
-Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example:
-
-```python
-a = pd.Variable(shape=[20, 20])
-b = pd.fc(a, params=["fc.w", "fc.b"])
-
-rnn = pd.create_rnn()
-with rnn.stepnet():
-    x = a.as_step_input()
-    # reuse fc's parameter
-    fc_without_b = pd.get_variable("fc.w")
-    rnn.output(fc_without_b)
-
-out = rnn()
-```
-The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
-
-In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
-
-To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
-
-`SymbolTable` can do the following:
-
-- store the definitions (some names and attributes) of variables and operators,
-- verify if a variable was declared,
-- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
-
-
-```c++
-// Information in SymbolTable is enough to trace the dependency graph. So maybe
-// the Eval() interface takes a SymbolTable is enough.
-class SymbolTable {
- public:
-  SymbolTable(SymbolTable* parent) : parent_(parent) {}
-
-  OpDesc* NewOp(const string& name="");
-
-  // TODO determine whether name is generated by python or C++.
-  // Currently assume that a unique name will be generated by C++ if the
-  // argument name is left default.
-  VarDesc* Var(const string& name="");
-
-  // find a VarDesc by name, if recursive is true, find parent's SymbolTable
-  // recursively.
-  // this interface is introduced to support InferShape, find protobuf messages
-  // of variables and operators, pass pointers into InferShape.
-  //
-  // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
-  // be proposed and embedded into pybind to enable python operation on C++ pointers.
-  VarDesc* FindVar(const string& name, bool recursive=true);
-
-  OpDesc* FindOp(const string& name);
-
-  BlockDesc Compile() const;
-
- private:
-  SymbolTable* parent_;
-
-  map<string, OpDesc> ops_;
-  map<string, VarDesc> vars_;
-};
-```
-
-After all the description of variables and operators is added into SymbolTable,
-the block has enough information to run.
-
-The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
-
-
-```c++
-namespace {
-
-class Block : OperatorBase {
-public:
-  Block(const BlockDesc& desc) desc_(desc) {}
-
-  void InferShape(const framework::Scope& scope) const override {
-    if (!symbols_ready_) {
-      CreateVariables(scope);
-      CreateOperators();
-    }
-    // should run InferShape first.
-    for (auto& op : runtime_table_.ops()) {
-      op->InferShape(scope);
-    }
-  }
-
-  void Run(const framework::Scope& scope,
-           const platform::Place& place) const override {
-    PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
-    for (auto& op : runtime_table_.ops()) {
-      op->Run(scope, place);
-    }
-  }
-
-  void CreateVariables(const framework::Scope& scope);
-  void CreateOperators();
-
-  // some other necessary interfaces of NetOp are listed below
-  // ...
-
-private:
-  BlockDesc desc_;
-  bool symbols_ready_{false};
-};
-```
-
-## The Execution of Blocks
-
-Block inherits from OperatorBase, which has a Run method.
-Block's Run method will run its operators sequentially.
-
-There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets.
-
-The definition of Eval is as follows:
-
-```c++
-// clean a block description by targets using the corresponding dependency graph.
-// return a new BlockDesc with minimal number of operators.
-// NOTE: The return type is not a Block but the block's description so that this can be distributed
-// to a cluster.
-BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
-
-void Block::Eval(const vector<string>& targets,
-                 const framework::Scope& scope,
-                 const platform::DeviceContext& dev_ctx) {
-  BlockDesc min_desc = Prune(desc_, targets);
-  Block min_block(min_desc);
-  min_block.Run(scope, dev_ctx);
-}
-```
diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md
deleted file mode 100644
index bf0e4dddc1b640ecbce489f65820aaf8a4b3b1e7..0000000000000000000000000000000000000000
--- a/doc/design/build_system/README.md
+++ /dev/null
@@ -1,152 +0,0 @@
-A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of new the parameter server using Go and C++.
-
-Here are some initial thoughts. Your comments are welcome!
-
-### Required CMake Function
-
-I think we need only the following few CMake functions to make a project description mean and clean:
-
-| C++ | CUDA C++ | Go |
-|---|---|---|
-| cc_library | nv_library | go_library |
-| cc_binary | nv_binary | go_binary |
-| cc_test | nv_test | go_test |
-
-- The `_library` functions generate  .a files from source code.
-- The `_binary` functions generate executable binary files.
-- The `_test` functions generate executable unit test files. They work like `_binary` but links `-lgtest` and `-lgtest_main`.
-
-The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler.
-
-Both `nv_` and `cc_` functions enables C++11 (-std=c++11).
-
-Also,
-
-- to describe external dependencies, we need `external_library`.
-- to build shared libraries, we need `shared_library`.
-
-### An Example Project
-
-Suppose that we have aforementioned functions defined in our `/cmake` directory.  The following example `CMakeLists.txt` describes a project including the following source files:
-
-- tensor.h
-- tensor.cc
-- tensor_test.cc
-- ops.h
-- ops.cu
-- ops_test.cu
-- api.go
-- api_test.go
-
-Suppose that ops.cu depends on CUDNN.
-
-```cmake
-# cc_binary parses tensor.cc and figures out that target also depend
-# on tensor.h.
-cc_binary(tensor
-  SRCS
-  tensor.cc)
-
-# The dependency to target tensor implies that if any of
-# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
-cc_test(tensor_test
-  SRCS
-  tensor_test.cc
-  DEPS
-  tensor)
-
-# I don't have a clear idea what parameters external_library need to
-# have.  @gangliao as a CMake expert would have better ideas.
-external_library(cudnn
-  ....)
-
-# Suppose that ops.cu depends on external target CUDNN.  Also, ops.cu
-# include global functions that take Tensor as their parameters, so
-# ops depend on tensor.  This implies that if any of tensor.{h.cc},
-# ops.{h,cu} is changed, ops need to be re-built.
-nv_library(ops
-  SRCS
-  ops.cu
-  DEPS
-  tensor
-  cudnn)  # cudnn is defined later.
-
-nv_test(ops_test
-  SRCS
-  ops_test.cu
-  DEPS
-  ops)
-
-# Because api.go defines a GO wrapper to ops and tensor, it depends on
-# both.  This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
-# api.go is changed, api need to be re-built.
-go_library(api
-  SRCS
-  api.go
-  DEPS
-  tensor # Because ops depend on tensor, this line is optional.
-  ops)
-
-go_test(api_test
-  SRCS
-  api_test.go
-  DEPS
-  api)
-
-
-# This builds libapi.so.  shared_library might use CMake target
-# api_shared so to distinguish it from above target api.
-shared_library(api
-  DEPS
-  api)
-
-```
-
-### Implementation
-
-As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph.  It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
-
-### Using Package Manager For Go
-
-Building Go binaries and libraries need to satisfy their dependencies, generally
-we can do `go get ./...` to download and compile all external dependencies. The
-problems are:
-
-1. `go get` will always get the latest code from the default branch of the
-    remote repo, so changes of dependents might break the build. This is very
-    different with what we already have in `cmake/external` which download a
-    specific version or commit id of the dependency.
-1. Some locations can not access external dependencies through the internet, as mentioned
-   in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management
-   tools can package the dependencies as a "vendor" package, which can be mirrored
-   at many cloud file hosting, so users what to compile paddle by themselves can
-   download this "vendor" package from a mirror site.
-
-#### Choose A Suitable Tool
-
-As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
-list dozens of Go package managers. We choose the tool using following principles:
-
-- Most "active" projects with more stars, more pull requests or commits
-- Widely used project
-
-After comparing all these projects, we shall choose between the most popular
-tools: Godep and Glide.
-
-Here's a brief comparison between Godep and Glide
-: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
-also many complaints about using `Godep`. There's also a new "official" pakcage
-management tool has been started at: https://github.com/golang/dep to resolve
-such problems, but it's currently at Alpha stage. So the best choice now is
-glide obviously.
-
-#### Manage Go Packages
-
-- Dependencies: `go/glide.yaml` will store the dependencies and their versions which
-  is directly imported by paddle. `go/glide.lock` will store all dependencies recursively
-  with their commit id. Builds will "lock" to these packages if we don't `glide up`
-  them
-- Vendor package: `go/vendor` directory will generated when running `cmake` command. `cmake`
-  will download the code corresponding to `go/glide.lock`. If we put a vendor folder
-  under `go/`, cmake will just check the commit id to the packages under the folder,
-  if commit id matches, there will be no download at all.
diff --git a/doc/design/cluster_train/large_model_dist_train.md b/doc/design/cluster_train/large_model_dist_train.md
deleted file mode 100644
index 0c4b5bc24c854b7062d509249bea9c50d42bd5f1..0000000000000000000000000000000000000000
--- a/doc/design/cluster_train/large_model_dist_train.md
+++ /dev/null
@@ -1,101 +0,0 @@
-# Alalysis of large model distributed training in Paddle
-
-***NOTE: This is only some note for how we implemeted this scheme in V1, not a new design.***
-
-## What is it
-
-We often encounter cases that the embedding layer parameters(sparse) are so large that we can not store it in the trainer's memory when training. So we need to put them to several servers, and fetch them row by row instead of fetch all of the parameters.
-
-## How to use
-
-Specify command-line argument like  `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1  --use_old_updater=1` when starting the paddle trainer. And also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting pserver processes.
-
-Accrodingly, configure your embedding layers like:
-
-```python
-SPARSE_REMOTE=True
-
-w1 = data_layer(name="w1", size=dict_size)
-emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
-w2 = data_layer(name="w2", size=dict_size)
-emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
-...
-```
-
-## Implementation details
-
-```c++
-enum MatType {
-  MAT_NORMAL,
-  MAT_NORMAL_SHARED,
-  MAT_VALUE_SHARED,
-  MAT_SPARSE_ROW_IDS,
-  MAT_SPARSE_ROW_AUTO_GROW,
-  MAT_CACHE_ROW,
-  MAT_SPARSE_ROW,
-  MAT_SPARSE_ROW_PREFETCH,
-  MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
-};
-```
-
-`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only row of matrix when training.
-
-In `trainer_internal.cpp:L93 trainOneBatch`:
-
-```c++
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    REGISTER_TIMER("prefetch");
-    gradientMachine_->prefetch(inArgs);
-    parameterUpdater_->getParametersRemote();
-  }
-```
-
-When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver.
-
-In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
-
-```c++
-if (fullSize) {
-    ...
-} else {
-getParams = [&] {
-    parameterClient_->getParameterSparse(
-        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
-};
-applyL1 = [](Parameter& para, real decayRate) {
-    para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
-};
-}
-```
-
-Calling `parameterClient_->getParameterSparse` will do remote call to pserver's `getParameterSparse`:
-
-```c++
-void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
-                                          std::vector<Buffer>& inputBuffers,
-                                          SendParameterResponse* response,
-                                          std::vector<Buffer>* outputBuffers) {
-  (void)inputBuffers;
-  auto& buffer = *readWriteBuffer_;
-  size_t numReals = 0;
-  for (const auto& block : request.blocks()) {
-    numReals += getParameterConfig(block).dims(1);
-  }
-  buffer.resize(numReals);
-
-  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
-
-  ReadLockGuard guard(parameterMutex_);
-  size_t offset = 0;
-  for (const auto& block : request.blocks()) {
-    size_t width = getParameterConfig(block).dims(1);
-    Buffer buf = {buffer.data() + offset, width};
-    int type = request.send_back_parameter_type();
-    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
-    offset += width;
-  }
-}
-```
-
-`getParameterConfig(block).dims(1)` returns the width of the current "parameter block"(a shard of parameter object),
-then `getParameterSparse` remote call returns only one row of data to the client.
diff --git a/doc/design/concurrent_programming.md b/doc/design/concurrent_programming.md
deleted file mode 100644
index f022e67fd3a048cd7e53c91d9a1fd0506487b665..0000000000000000000000000000000000000000
--- a/doc/design/concurrent_programming.md
+++ /dev/null
@@ -1,163 +0,0 @@
-# Design Doc: Concurrent Programming with Fluid
-
-With PaddlePaddle Fluid, users describe a program other than a model.  The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model.   
-
-Many know that when we program TensorFlow, we can specify the device on which each operator runs.  This allows us to create a concurrent/parallel AI application.   An interesting questions is **how does a `ProgramDesc` represents a concurrent program?**  
-
-The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program.  So users just program a concurrent program that they do with any concurrent programming language, e.g., [Go](https://golang.org).
-
-## An Analogy
-
-The following table compares concepts in Fluid and Go
-
-| Go | Fluid |
-|----|-------|
-|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid) |
-| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
-| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
-| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
-
-## An Example Concurrent Program
-
-To review all above concepts in an example, let us take a simple program and writes its distributed version.
-
-Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
-
-```go
-import "fluid"
-
-func paddlepaddle() {
-  X = fluid.read(...)
-  W = fluid.Tensor(...)
-  Y = fluid.mult(X, W)
-}
-```
-
-Please be aware that the Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in above program and creates the following `ProgramDesc` message.
-
-```protobuf
-message ProgramDesc {
-  block[0] = Block {
-    vars = [X, W, Y],
-    ops = [
-      read(output = X)
-      assign(input = ..., output = W)
-      mult(input = {X, W}, output = Y)
-    ],
-  }
-}
-```
-
-Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message.
-
-The default `main` function is defined as follows:
-
-```go
-func main() {
-  paddlepaddle()
-  fluid.run()
-}
-```
-
-## The Concurrent Version
-
-By parallelizing the above program, we could support very big tensor X by splitting into small pieces {x_1, x_2, ...} and sent each piece to worker process/node for parallel multiplication.
-
-In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
-
-### The Master Program
-
-The master program could look like the following:
-
-```protobuf
-message ProgramDesc {
-  block[0] = Block {
-    vars = [X, L, Y],
-    ops = [
-      read(output = X)
-      kube_get_workers_addrs(output = L)
-      Y = tensor_array(len(L))
-      parallel_for(input = X, output = Y, 
-                   attrs = {L, block_id(1)}) # referring to block 1
-    ]
-  }
-  
-  block[1] = Block {
-    parent = 0,
-    vars = [x, y, index],
-    ops = [
-      slice(input = [X, index], output = x) # index is initialized by parallel_for
-      send(input = x, attrs = L[index])
-      recv(outputs = y, attrs = L[index])
-      assign(input = y, output = Y[index])
-    ]
-  }
-}
-```
-
-The equivalent Fluid program (calling the Go binding) is:
-
-```go
-func main() {  //// block 0
-  X = fluid.read(...)
-  L = fluid.k8s.get_worker_addrs()
-  Y = fluid.tensor_array(len(L))
-  fluid.parallel_for(X, L, 
-                     func(index int) {  //// block 1
-                       x = X[index]
-                       fluid.send(L[index], x)
-                       y = fluid.recv(L[index])
-                       Y[index] = y
-                     })
-}
-```
-
-An explanation of the above program:
-
-- `fluid.k8s` is a package that provides access to Kubernetes API.  
-- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).  
-- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, 
-
-  1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
-  2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
-     1. creates an Executor instance, and
-     2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
-1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
-
-### The Worker Program
-
-The worker program looks like
-
-```go
-func main() {
-  W = Tensor(...)
-  x = fluid.listen_and_do(
-        fluid.k8s.self_addr(),
-        func(input Tensor) {
-          output = fluid.mult(input, W)
-        })
-}
-```
-
-where
-
-- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
-  1. listens on the current pod's IP address, as returned by `fliud.k8s.self_addr()`,
-  2. once a connection is established,
-     1. creates a scope of two parameters, "input" and "output",
-     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
-     3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
-
-## Summarization
-
-From the above example, we see that:
-
-1. Fluid enables the imperative programming paradigm by:
-   1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
-   2. call the `fluid.run` function that runs the program implicitly.
-1. The program is described as a `ProgramDesc` protobuf message.
-2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
-3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
-4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and execute intrinsics/operators' `Run` method sequentially as they appear in the `Block.ops` array.
-5. Intrinsics/operators' `Run` method might create threads.  For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
-6. Threads are not necessarily OS thread; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool.  Multiple green threads might run on the same OS thread.  An example green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/design/cpp_data_feeding.md b/doc/design/cpp_data_feeding.md
deleted file mode 100644
index 2cbb0083e6b557d703ce180cb0a85050a777aa2f..0000000000000000000000000000000000000000
--- a/doc/design/cpp_data_feeding.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# C++ Data Feeding
-
-While using Paddle V2 API for Training, data feeding completely depends on the Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required. 
-
-In this document we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching.
-
-## Reader
-
-In order to handle the above mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable` and they are used to read or process file data.
-
-
-### `ReaderBase`
-
-`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers.
-
-```cpp
-class ReaderBase {
- public:
-  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
-    PADDLE_ENFORCE(!shapes_.empty());
-  }
-  // Read the next batch of data. (A 'batch' can be only one instance)
-  // If the next batch doesn't exist, '*out' will be an empty std::vector.
-  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
-  
-  // Reinitialize the reader and read the file from the beginning.
-  virtual void ReInit() = 0;
-  
-  // Get a certain read in data's shape.
-  DDim shape(size_t idx) const;
-  // Get shapes of all read in data.
-  std::vector<DDim> shapes() const { return shapes_; }
-  // Set shapes of read in data.
-  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
-
-  virtual ~ReaderBase() {}
-
- protected:
-  std::vector<DDim> shapes_;
-};
-```
-
-### `FileReader` and `DecoratedReader`
-
-These two classes are derived from the `ReaderBase` and will further be derived by more specific readers. Thus, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format, and yield only one instance of data at a time. For example, RecordIO reader, jpg reader, .... A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some processing on them(shuffling, or batching), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
-
-All the readers share exactly the same interface as defined in `ReaderBase`. So they can be decorated for more than one time: We can **shuffle** a reader's outputs and then **batch** the shuffle outputs. The interface consistency also allows related ops use readers without knowing what they are exactly.
-
-
-### `ReaderHolder`
-
-Different readers belong to different class types. This leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
-
-```cpp
-var->Get<ReaderBase>("batch_reader");
-```
-
-We would have to write:
-
-```cpp
-var->Get<BatchReader>("batch_reader");
-```
-
-This requires that in order to get a reader from a variable, every time, we must know the reader's type exactly. This is nearly impossible.
-
-To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
-
-## Related Operators
-
-To create and invoke readers, some new ops are introduced:
-
-### `CreateReaderOp`
-
-Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
-
-### `ReadOp`
-
-A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTenosr>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
diff --git a/doc/design/csp.md b/doc/design/csp.md
deleted file mode 100644
index 10d936860fab7e09241e968a63526c7d86d3e568..0000000000000000000000000000000000000000
--- a/doc/design/csp.md
+++ /dev/null
@@ -1,224 +0,0 @@
-# Design Doc: CSP in PaddlePaddle Fluid
-
-## Motivation
-
-Concurrent programming is important for deep learning.  Few example applications are:
-
-1.  The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
-2.  The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
-
-Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
-
-## Concurrent Programming Models
-
-There were many concurrent programming models, implemented in various forms:
-
-| concurrent programming model | implementation |
-|-----|-----|
-| mutex | types and functions in standard libraries |
-| semaphore | types and functions in standard libraries |
-| communicating sequential processes (CSP) | Go programming language |
-| actor model | Erlang programming language |
-| message passing | MPI |
-| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
-
-Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
-
-### CSP v.s. Actor Model
-
-A well-known implementation of Actor Model is the Erlang programming language.  In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs.  We can find the three ingredients, process with ID, send, and recv, in MPI too.  Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code.  Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
-
-## CSP in Fluid
-
-Fluid has two fundamental control-flows: *if-else* and *while*.  If we are to implement CSP, we need the following:
-
-1. a new data type: *channel* and operators *send* and *recv*,
-1. *goroutine* or thread, and
-1. a new control-flow: select.
-
-We also need Python wrappers for the above components.
-
-The type *channel* is conceptually the blocking queue.  In Go, its implemented is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
-
-The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.
-
-It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
-
-### Type Channel
-
-Fluid supports many data types:
-
-1. Tensor,
-1. Row-sparse Tensor
-1. LoD Tensor,
-1. Tensor array, etc
-
-Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value.  To add a new type channel, we need to add a new type enum.
-
-To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file.  [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example how we expose C++ class LoDTensor.
-
-## Syntax Design
-
-### Create Channel
-
-In Go, we create a channel by specifying the element type and buffer size:
-
-```go
-ch  := make(chan int)       // a channel without buffer
-ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
-```
-
-In Fluid, we should be able to do the same:
-
-```python
-ch  = fluid.make_channel(dtype=INT)
-ch1 = fluid.make_channel(dtype=INT, 100)
-```
-
-In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
-
-```python
-ch = fluid.make_channel(dtype=Tensor, etype=float16)
-```
-
-or Tensors of Tensors of float16 etc.
-
-The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
-
-### Send and Recv
-
-Go's CSP implementation depends on data type *channel*. There are two types of channels:
-
-1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
-1. blocked channel, or unbuffered channel, is a blocking queue with no buffer.  Both sending and receiving block with unbuffered channels.
-
-There are four types of actions with a channel:
-
-1. Create a channel
-
-   ```go
-   ch := make(chan int) // this is an unbuffered channel
-   ch := make(chan int, 100) // this is a buffered channel of 100 ints.
-   ```
-
-1. Send
-
-   ```go
-   ch <- 111
-   ```
-
-1. Recv
-
-   ```go
-   y, ok <- ch
-   ```
-
-1. Close
-
-   ```go
-   close(ch)
-   ```
-   
-   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
-   
-There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
-
-1. A send to a nil channel blocks forever
-
-1. A receive from a nil channel blocks forever
-
-1. A send to a closed channel panics
-
-1. A receive from a closed channel returns the residual values and then zeros.
-
-In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
-
-The following program illustrates the Python syntax for accessing Fluid buffers.
-
-```python
-import fluid
-
-buffer_size = 10
-ch = fluid.make_channel(dtype=INT, buffer_size)
-
-# Now write three elements to the channel
-with fluid.while(steps=buffer_size):
-  fluid.send(ch, step)
-
-fluid.close_channel(ch)
-
-with fluid.while(steps=buffer_size):
-  fluid.print(fluid.recv(ch))
-```
-
-The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
-
-```python
-import fluid
-
-ch = fluid.make_channel(dtype=INT)
-
-with fluid.go():
-  fluid.send(ch)
-
-y = fluid.recv(ch)
-
-fluid.close_channel(ch)
-```
-
-### Select
-
-In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
-
-```go
-
-ch1  := make(chan int)       
-ch2  := make(chan int, 100)
-
-x := 0
-
-for {
-    select {
-    case ch1 <- x:
-      x := x + 1
-    case y <- ch2:
-      fmt.Println("Received on channel")
-    default:
-      fmt.Println("Default")
-    }
-  }
-
-```
-
-In Fluid, we should be able to do the same:
-
-```python
-ch1  = fluid.make_chan(dtype=INT)
-ch2 = fluid.make_chan(dtype=INT, 100)
-
-sel = fluid.select()
-
-with sel.case(ch1, 'w', X):
-    fluid.layers.increment(X)
-
-with sel.case(ch2, 'r', Y):
-    fluid.print("Received on Channel")
-
-with sel.default():
-    fluid.print("Default")
-
-```
-
-In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
-
-- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
-
-- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
-
-- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
-
-## Example Programs
-
-### 1. RPC between Trainers and Parameter Servers
-
-### 2. Concurrent Minibatch Loading
diff --git a/doc/design/evaluator.md b/doc/design/evaluator.md
deleted file mode 100644
index 11cc129d56905a9ee666da92fbe6f8559c6d325a..0000000000000000000000000000000000000000
--- a/doc/design/evaluator.md
+++ /dev/null
@@ -1,58 +0,0 @@
-## Evaluator Design
-
-### Problem Statement
-
-During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
-
-### Evaluator Design
-Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
-
-1. Initialize the metric state and add it into the block.
-
-2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once.
-
-
-3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
-
-### Implementation
-This design is shown in the Python API. 
-Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. 
-
-    
-```python
-class Evaluator(object):
-    """
-    Evaluator Base class.
-    """
-    def __init__(self, name, **kwargs):
-       """
-       Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts.
-       Auc need four variables, `true_positives`,
-         `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program
-
-       The initialization of Evaluator should be responsible for:
-       create metric states and append to the main_program
-       """ 
-       pass
-
-    def _update_ops(self, input, label, **kwargs)
-       """
-       Add mini-batch evaluator caculate operators to the main_program.
-       Add increment operator to accumulate the metric states.
-       """
-    
-
-    def reset(self, executor, reset_program=None):
-      """
-      Reset metric states at the begin of each pass/user specified batch number.
-      Execute the reset_program to reset the states.
-      """
-      
-
-    def eval(self, executor, eval_program=None):
-      """
-      Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
-      Execute the eval_program and return the result.
-      """
-      return eval_result
-```
diff --git a/doc/design/executor.md b/doc/design/executor.md
deleted file mode 100644
index 2d4b371cc56db82ce5747da6db07f05aa7f7e6c1..0000000000000000000000000000000000000000
--- a/doc/design/executor.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# Executor Design Doc
-
-## Motivation
-In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
-[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
-
-The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code.
-
-## Overview
-
-An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
-
-## Executor
-
-The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
-It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not however, have the stack frame pop process.
-
-### The interface
-```c++
-  Executor(places);
-```
-A executor does not own any computing resources, a user can only construct an executor using the specified places.
-
-### Running an Executor
-
-```
-  void Run(ProgramDesc, Scope, block_id, create_local_scope);
-```
-An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.
diff --git a/doc/design/file_manager/README.md b/doc/design/file_manager/README.md
deleted file mode 100644
index 3df10d801e568834729f902aace483d033340e2d..0000000000000000000000000000000000000000
--- a/doc/design/file_manager/README.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# FileManager设计文档
-## 目标
-在本文档中，我们设计说明了名为FileManager系统，方便用户上传自己的训练数据以进行分布式训练
-
-主要功能包括：
-
-- 提供常用的命令行管理命令管理文件和目录
-- 支持大文件的断点上传、下载  
-
-## 名词解释
-- PFS：是`Paddlepaddle cloud File System`的缩写，是对用户文件存储空间的抽象，与之相对的是local filesystem。目前我们用CephFS来搭建。
-- [CephFS](http://docs.ceph.com/docs/master/cephfs/)：一个POSIX兼容的文件系统。
-- Chunk：逻辑划上文件分块的单位。
-
-## 模块
-### 架构图
-<image src=./src/filemanager.png width=900>
-
-### PFSClient
-- 功能： 详细设计[link](./pfs/pfsclient.md)
-	- 提供用户管理文件的命令
-	- 需要可以跨平台执行
-
-- 双向验证   
-	PFSClient需要和Ingress之间做双向验证<sup>[tls](#tls)</sup>，所以用户需要首先在`cloud.paddlepaddle.org`上注册一下，申请用户空间，并且把系统生成的CA(certificate authority)、Key、CRT(CA signed certificate)下载到本地，然后才能使用PFSClient。
-		
-### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
-- 功能：  
-	提供七层协议的反向代理、基于粘性会话的负载均衡功能。
-	
-- 透传用户身份的办法  
-	Ingress需要把PFSClient的身份信息传给PFSServer，配置的方法参考[link](http://www.integralist.co.uk/posts/clientcertauth.html#3)
-
-### PFSServer
-PFSServer提供RESTful API接口，接收处理PFSClient端的文件管理请求，并且把结果返回PFSClient端。
-
-RESTful API
-
-- /api/v1/files
-	- `GET /api/v1/files`: Get metadata of files or directories.
-	- `POST /api/v1/files`: Create files or directories.
-	- `PATCH /api/v1/files`: Update files or directories.
-	- `DELETE /api/v1/files`: Delete files or directories.
-
-- /api/v1/file/chunks
-	- `GET /api/v1/storage/file/chunks`: Get chunks's metadata of a file.
-
-- /api/v1/storage/files
-	- `GET /api/v1/storage/files`: Download files or directories.
-	- `POST /api/v1/storage/files`: Upload files or directories.
-
-- /api/v1/storage/file/chunks
-	- `GET /api/v1/storage/file/chunks`: Download chunks's data.
-	- `POST /api/v1/storage/file/chunks`: Upload chunks's data.
-
-## 文件传输优化
-
-### 分块文件传输
-用户文件可能是比较大的，上传到Cloud或者下载到本地的时间可能比较长，而且在传输的过程中也可能出现网络不稳定的情况。为了应对以上的问题，我们提出了Chunk的概念，一个Chunk由所在的文件偏移、数据、数据长度及校验值组成。文件的上传和下载都是通过对Chunk的操作来实现的。由于Chunk比较小（默认256K），完成一个传输动作完成的时间也比较短，不容易出错。PFSClient需要在传输完毕最后一个Chunk的时候检查destination文件的MD5值是否和source文件一致。
-
-一个典型的Chunk如下所示：
-
-```
-type Chunk struct {
-	fileOffset int64
-	checksum uint32
-	len     uint32
-	data    []byte
-}
-```  
-
-### 生成sparse文件
-当destination文件不存在或者大小和source文件不一致时，可以用[Fallocate](https://Go.org/pkg/syscall/#Fallocate)生成sparse文件，然后就可以并发写入多个Chunk。
-
-### 覆盖不一致的部分
-文件传输的的关键在于需要PFSClient端对比source和destination的文件Chunks的checksum是否保持一致，不一致的由PFSClient下载或者传输Chunk完成。这样已经传输成功的部分就不用重新传输了。
-
-## 用户使用流程
-参考[link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
-
-## 框架生成
-用[swagger](https://github.com/swagger-api/swagger-codegen)生成PFSClient和PFSServer的框架部分，以便我们可以把更多的精力放到逻辑本身上。
-
-## 参考文档
-- <a name=tls></a>[TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
-- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
-- [linux man document](https://linux.die.net/man/)
diff --git a/doc/design/file_manager/pfs/pfsclient.md b/doc/design/file_manager/pfs/pfsclient.md
deleted file mode 100644
index 56bc70c54bbc92b78d66e04fb495b1300cf8ebe0..0000000000000000000000000000000000000000
--- a/doc/design/file_manager/pfs/pfsclient.md
+++ /dev/null
@@ -1,129 +0,0 @@
-# PFSClient
-
-## Description
-The `pfs` command is a Command Line Interface to manage your files on PaddlePaddle Cloud
-
-## Synopsis
-```
-paddle [options] pfs <subcommand> [parameters]
-```
-
-## Options
-```
---profile (string)
-	Use a specific profile from your credential file.
-
---help (string)
-	Display more information about command
-
---version
-	Output version information and exit
-
---debug
-	Show detailed debugging log	
-	
---only-show-errors (boolean) 
-	Only errors and warnings are displayed. All other output is suppressed.
-```
-
-## Path Arguments
-When using a command, we need to specify path arguments. There are two path argument type: `localpath` and `pfspath`.  
-
-A `pfspath` begin with `/pfs`, eg: `/pfs/$DATACENTER/home/$USER/folder`.
-
-[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to config datacenters.
-
-## order of Path Arguments
-Commonly, if there are two path arguments, the first is the source, and the second is the destination.
-
-## Subcommonds
-- rm - remove files or directories
-
-```
-Synopsis:
-	rm [-r] [-v] <PFSPath> ...
-
-Options:
-	-r 
-		Remove directories and their contents recursively 
-	-v      
-		Cause rm to be verbose, showing files after they are removed.
-	
-Examples:
-	paddle pfs rm /pfs/$DATACENTER/home/$USER/file
-	paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
-```
-- mv - move (rename) files
-
-```
-Synopsis:
-	mv [-f | -n] [-v] <LocalPath> <PFSPath>
-	mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
-	mv [-f | -n] [-v] <PFSPath> <LocalPath> 
-	mv [-f | -n] [-v] <PFSPath> ... <LocalPath> 
-	mv [-f | -n] [-v] <PFSPath> <PFSPath> 
-	mv [-f | -n] [-v] <PFSPath> ... <PFSPath> 
-	
-Options:
-	-f      
-		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
-	-n      
-		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
-	-v      
-		Cause mv to be verbose, showing files after they are moved.
-		
-Examples:
-	paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
-```
-- cp - copy files or directories
-
-```
-Synopsis:
-	cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> <PFSPath>
-	cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> ... <PFSPath>
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <LocalPath> 
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <LocalPath>
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <PFSPath> 
-	cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <PFSPath>
-
-Options:
-	-r
-   		Copy directories recursively
-   	-f      
-		Do not prompt for confirmation before overwriting the destination path.  (The -f option overrides previous -n options.)
-	-n      
-		Do not overwrite an existing file.  (The -n option overrides previous -f options.)
-	-v      
-		Cause cp to be verbose, showing files after they are copied.
-	--preserve--links
-	   Reserve links when copy links
-	   
-Examples:
-	paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
-	paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
-```
-- ls- list files
-
-```
-Synopsis:
-	ls [-r] <PFSPath> ...
-	
-Options:
-	-R
-   		List directory(ies) recursively
-
-Examples:
-	paddle pfs ls  /pfs/$DATACENTER/home/$USER/file
-	paddle pfs ls  /pfs/$DATACENTER/home/$USER/folder
-```
-
-- mkdir - mkdir directory(ies)
-Create intermediate directory(ies) as required.
-
-```
-Synopsis:
-	mkdir <PFSPath> ...
-
-Examples:
-	paddle pfs mkdir  /pfs/$DATACENTER/home/$USER/folder
-```
diff --git a/doc/design/file_manager/src/filemanager.graffle b/doc/design/file_manager/src/filemanager.graffle
deleted file mode 100644
index 7861a33072bc1908f69d12b37c20491dd8663103..0000000000000000000000000000000000000000
Binary files a/doc/design/file_manager/src/filemanager.graffle and /dev/null differ
diff --git a/doc/design/file_manager/src/filemanager.png b/doc/design/file_manager/src/filemanager.png
deleted file mode 100644
index 8139a19f5722f56d3c211f3ab0d3982f751134b9..0000000000000000000000000000000000000000
Binary files a/doc/design/file_manager/src/filemanager.png and /dev/null differ
diff --git a/doc/design/float16.md b/doc/design/float16.md
deleted file mode 100644
index 1ea95ed6b5d6792171569b6ff76d09be92fcb13e..0000000000000000000000000000000000000000
--- a/doc/design/float16.md
+++ /dev/null
@@ -1,105 +0,0 @@
-# Design Doc: float16
-
-## Why float16
-Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. 
-
-When high precision computation is not required, using float16 data type could potentially 
-
-- reduce storage space, memory bandwidth, and power usages; 
-- increase the chance of data fitting into a smaller cache of lower latency; 
-- provide arithmetic speed up if supported by hardware. 
-
-## Survey of current float16 support
-A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
-
-The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernel. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. 
-
-### Compiler
-- nvcc supports `__half` data type after CUDA 7.5.
-- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4.
-- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9.
-
-### Hardware
-- `__half` is supported on GPU with compute capability >= 5.3.
-- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above.
-- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018).
-
-### Libraries
-- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
-- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
-
-### CUDA version issue
-There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. 
-CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
-```
-typedef struct __align__(2) {
-   unsigned short x;
-} __half;
-
-typedef __half half;
-```
-This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
-```
-__global__ void Add() {
-  half a, b, c;
-  c = __hadd(a, b); // correct
-  c = a + b; // compiler error: no operator "+" matches these operands
-}
-```
-CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
-
-Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
-```
-typedef struct __CUDA_ALIGN__(2) {
-    unsigned short x;
-} __half_raw;
-
-
-struct __CUDA_ALIGN__(2) __half {
-protected:
-    unsigned short __x;
-public:
-    // constructors and conversion operators from/to 
-    // __half_raw and other built-in data types
-}
-
-typedef __half half;
-
-__device__ __forceinline__ 
-__half operator+(const __half &lh, const __half &rh) { 
-    return __hadd(lh, rh); 
-}
-
-// Other overloaded operators
-``` 
-This new design makes `c = a + b` work correctly for CUDA half data type. 
-
-## Implementation
-The float16 class holds a 16-bit `uint16_t` data internally.
-```
-struct float16 {
-  uint16_t x;
-};
-``` 
-
-float16 supports the following features:
-  - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. 
-  - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen.
-  - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. 
-  - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. 
-  
-To support the above features, two fundamental conversion functions are provided:
-```
-float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
-float half_to_float(float16 h);
-```
-which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion.
-
-## To do
-After float16 class is available, some of the future items are below:
-
-- Update pybind/tensor_py.h to bind c++ float16 with numpy float16. 
-
-- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
-
-- Create a type-casting operator that can convert the data type in tensor between float16 and other types.
diff --git a/doc/design/fluid.md b/doc/design/fluid.md
deleted file mode 100644
index f78fa8c1914124f33b9730f918c8887ced4f8d9d..0000000000000000000000000000000000000000
--- a/doc/design/fluid.md
+++ /dev/null
@@ -1,114 +0,0 @@
-# Design Doc: PaddlePaddle Fluid
-
-## Why Fluid
-
-When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe.  However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
-
-Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model.  In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no  concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning.
-
-## The Evolution of Deep Learning Systems
-
-Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
-
-| Existed since | model as sequence of layers | model as graph of operators | No model |
-|--|--|--|--|
-| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
-| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
-| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
-
-From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
-
-## Deep Learning Programming Paradigms
-
-With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
-
-```python
-x = layer.data("image")
-l = layer.data("label")
-f = layer.fc(x, W)
-s = layer.softmax(f)
-c = layer.mse(l, s)
-
-for i in xrange(1000): # train for 1000 iterations
-    m = read_minibatch()
-    forward({input=x, data=m}, minimize=c)
-    backward(...)
-
-print W # print the trained model parameters.
-```
-
-The above program includes two parts:
-
-1. The first part describes the model, and
-2. The second part describes the training process (or inference process) for the model.
-
-This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
-
-This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general,  prefer PyTorch over the older systems.  Using PyTorch, we would write the above program as following:
-
-```python
-W = tensor(...)
-
-for i in xrange(1000): # train for 1000 iterations
-    m = read_minibatch()
-    x = m["image"]
-    l = m["label"]
-    f = layer.fc(x, W)
-    s = layer.softmax(f)
-    c = layer.mse(l, s)
-    backward()
-
-print W # print the trained model parameters.
-```
-
-We can see that the main difference is the moving the model configuration part (the first step) into the training loop.  This change would allow the mistakes in model configuration to be reported where they actually appear in the programming block.  This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop.
-
-## Describe Arbitrary Models for the Future
-
-Describing the process instead of the model also brings Fluid, the flexibility to define different non-standard models that haven't been invented yet.
-
-As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator.  A PyTorch example would look like the following:
-
-```python
-for i in xrange(1000):
-    m = read_minibatch()
-    x = m["sentence"]
-    for t in xrange x.len():
-        h[t] = the_step(x[t])
-```        
-
-With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
-
-```python
-train_loop = layers.While(cond)
-with train_loop.block():
-  m = read_minibatch()
-  x = m["sentence"]
-  rnn = layers.While(...)
-  with rnn.block():
-    h[t] = the_step(input[t])
-```    
-
-An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58).
-
-From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
-
-We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
-
-## Turing Completeness
-
-In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine.  For a programming language, if it provides if-then-else and loop, it is Turing complete.  From the above examples, Fluid seems to be Turing complete; however, it is noteworthy to notice that there  is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference being that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. This hasn't been researched in depth if this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete.  Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case but this needs to be looked into in-depth.
-
-## The Execution of a Fluid Program
-
-There are two ways to execute a Fluid program.  When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
-
-There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
-
-Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md).
-
-## Backward Compatibility of Fluid
-
-Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference.  For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph).  Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators.  The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
-
-For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
diff --git a/doc/design/fluid_compiler.md b/doc/design/fluid_compiler.md
deleted file mode 100644
index 2a6beafc52e815fa067b273bb5887ddcf6ab15ae..0000000000000000000000000000000000000000
--- a/doc/design/fluid_compiler.md
+++ /dev/null
@@ -1,110 +0,0 @@
-# PaddlePaddle Fluid: Towards a Compiled Programming Language
-
-As described in [fluid.md](fluid.md), when a Fluid application program
-runs, it generates a `ProgramDesc` protobuf message as an intermediate
-representation of itself.  The C++ class `Executor` can run this
-protobuf message as an interpreter.  This article describes the Fluid
-compiler.
-
-![](fluid-compiler.png)
-
-## ProgramDesc
-
-Before we go deeper into the idea of compiled language, let us take a
-look at a simple example Fluid application.
-
-```python
-import "fluid"
-
-func paddlepaddle() {
-  X = fluid.read(...)
-  W = fluid.Tensor(...)
-  Y = fluid.mult(X, W)
-}
-```
-
-This program consists of a [block](block.md) of three operators --
-`read`, `assign`, and `mult`.  Its `ProgramDesc` message looks like
-the following
-
-```protobuf
-message ProgramDesc {
-  block[0] = Block {
-    vars = [X, W, Y],
-    ops = [
-      read(output = X)
-      assign(input = ..., output = W)
-      mult(input = {X, W}, output = Y)
-    ],
-  }
-}
-```
- 
-## Transpilers
-
-We can write a transpiler program that takes a `ProgramDesc`, e.g.,
-the above one, and outputs another `ProgramDesc`.  Let us take some
-examples:
-
-1. *Memory optimization transpiler*: We can write a transpiler that
-   inserts some `FreeMemoryOp`s in the above example `ProgramDesc` so
-   to free memory early, before the end of an iteration, so to keep a
-   small memory footprint.
-
-1. *Distributed training transpiler*: We can write a transpiler that
-   converts a`ProgramDesc` into its distributed version of two
-   `ProgramDesc`s -- one for running by the trainer processes and the
-   other for the parameter server.
-
-In the rest of this article, we talk about a special kind of
-transpiler, *Native code generator*, which takes a `ProgramDesc` and
-generates a `.cu` (or `.cc`) file, which could be built by C++
-compilers (gcc, nvcc, icc) into binaries.
-
-## Native Code Generator
-
-For the above example, the native code generator transpiler, say, the
-CUDA code generator, should generate a `main` function:
-
-```c++
-void main() {
-  auto X = fluid_cuda_read(...);
-  auto W = fluid_cuda_create_tensor(...);
-  auto Y = fluid_cuda_mult(X, W);
-}
-```
-
-and the definitions of functions `fluid_cuda_read`,
-`fluid_cuda_create_tensor`, and `fluid_cuda_mult`.  Please be aware
-that each function could just define a C++ instance of an operator and
-run it.  For example
-
-```c++
-paddle::Tensor fluid_cuda_read(...) {
-  paddle::Tensor t;
-  paddle::operator::Read r(&t, ...);
-  r.Run();
-  return t;
-}
-```
-
-For computational operators that have multiple *kernels*, each for a
-specific hardware platform, for example, the `mult` operator, the
-generated code should call its CUDA kernel:
-
-```c++
-paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a, 
-                               const paddle::Tensor& b) {
-  paddle::Tensor t;
-  paddle::operator::Mult m(a, b, ...);
-  Mult.Run(cuda_context);
-}
-```
-
-where `cuda_context` could be a global variable of type
-`paddle::CUDADeviceContext`.
-
-## Multi-Block Code Generation
-
-Most Fluid application programs may have more than one blocks.  To
-execute them, we need to trace [scopes](scope.md).
diff --git a/doc/design/functions_operators_layers.md b/doc/design/functions_operators_layers.md
deleted file mode 100644
index 984b59f4c6971dfb6f46dfe342f2751f392c0e88..0000000000000000000000000000000000000000
--- a/doc/design/functions_operators_layers.md
+++ /dev/null
@@ -1,100 +0,0 @@
-# Design Doc: Functions, Operators, and Layers
-
-In a DL system, we can compose one or more fine grained operators into a coarse grained one.  For example, the FC layer can be composed of a multiplication operator and an add operator.
-
-Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers.  But we need a well-defined separation.
-
-In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
-
-```c++
-template <typename T> T add(T x, T y) { return x + y; }
-template <typename T> T mul(T x, T y) { return x * y; }
-```
-
-Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name.  A C macro can do this. For example, the following macro invocation
-
-```c++
-#define MAKE_FUNCTION_OPERATOR(mul);
-```
-
-generates
-
-```c++
-template <typename T> class mulOp : public OperatorBase {...};
-REGISTER_OP(mulOp<float32>, "mul");
-```
-
-so that in Python we can create operator mul by:
-
-```python
-X1 = Var()
-X2 = Var()
-Y = Var()
-paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
-```
-
-Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
-
-```c++
-template <typename T>
-class FCOp : public OperatorBase {
- public:
-  void Run(...) {
-    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b");
-  }
-};
-REGISTER_OP(FCOp, "fc");
-```
-
-We need to support such composition in Python as well.  To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`.  This higher level operator API should be compatible with the layer API.
-
-Let's explain using an example.  Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`:
-
-```python
-def operator.mul(X1, X2):
-    O = Var()
-    paddle.cpp.create_operator("mul", input={X1, Y1}, output=O)
-    return O
-
-def operator.add(X1, X2):
-    O = Var()
-    paddle.cpp.create_operator("add", input={X1, X2}, output=O)
-    return O
-```
-
-Above code snippets are automatically generated.  Given them, users can define
-
-```python
-def layer.fc(X):
-    W = Var()
-    b = Var()
-    return operator.add(operator.mul(X, W), b)
-```
-
-If we don't have `operator.mul` and `operator.add`, the definiton of `layer.fc` would be complicated:
-
-```python
-def layer.fc(X):
-    W = Var()
-    b = Var()
-    O1 = Var()
-    paddle.cpp.create_operator("mul", input=[X, W], output=O1)
-    O2 = Var()
-    paddle.cpp.create_operator("add", input=[O1, b], output=O2)
-    return O2
-```
-
-We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:
-
-
-| C++ functions/functors | mul          | add          |             |          |
-|------------------------|--------------|--------------|-------------|----------|
-| C++ operator class     | mulOp        | addOp        | FCOp        |          |
-| Python binding         | operator.mul | operator.add | operator.fc |          |
-| Python function        |              |              |             | layer.fc |
-
-
-This is how we differentiate layer and operators in PaddlePaddle:
-
-- those defined in C++ and have a lightweighted Python wrapper in module `operators` are operators; whereas
-- those who don't have C++ implementations but a Python implementation that compose C++ operators are known as layers.
diff --git a/doc/design/gan_api.md b/doc/design/gan_api.md
deleted file mode 100644
index fb41df8615f73d9fd4c32995eab265833eac1a55..0000000000000000000000000000000000000000
--- a/doc/design/gan_api.md
+++ /dev/null
@@ -1,253 +0,0 @@
-# Design for GAN
-
-GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas. 
-
-It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
-
-In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
-
-<p align="center">
-<img src="./test.dot.png" width = "35%" align="center"/><br/>
-Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
-</p>
-
-The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
-
-<p align="center">
-<img src="./dcgan.png" width = "90%" align="center"/><br/>
-Figure 2. Photo borrowed from the original DC-GAN paper.
-</p>
-
-## The Conditional-GAN might be a class. 
-This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure:
-
-- DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API:
-
-- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well.
-
-- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
-Returns a generated image.
-
-- discriminator(image):
-Given an image, decide if it is from a real source or a fake one. 
-Returns a 0/1 binary label.
-
-- build_model(self):
-build the whole GAN model, define training loss for both generator and discrimator.
-
-## Discussion on Engine Functions required to build GAN
-- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can'be be trained correctly)
-- Different optimizers responsible for optimizing different loss.
-
-To be more detailed, we introduce our design of DCGAN as following:
-
-### Class member Function: Initializer
-- Set up hyper-parameters, including condtional dimension, noise dimension, batch size and so forth.
-- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
-```python
-class DCGAN(object):
-  def __init__(self, y_dim=None):
-  
-    # hyper parameters  
-    self.y_dim = y_dim # conditional gan or not
-    self.batch_size = 100
-    self.z_dim = z_dim # input noise dimension
-
-    # define parameters of discriminators
-    self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
-    self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
-    self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
-    self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
-    self.D_W2 = pd.Varialble(np.random.rand(128, 1))
-    self.D_b2 = pd.Variable(np.zeros(128))
-    self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
-
-    # define parameters of generators
-    self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
-    self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
-    self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
-    self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
-    self.G_W2 = pd.Varialble(np.random.rand(128, 1))
-    self.G_b2 = pd.Variable(np.zeros(128))
-    self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
-```
-
-### Class member Function: Generator
-- Given a noisy input z, returns a fake image.
-- Concatenation, batch-norm, FC operations required;
-- Deconv layer required, which is missing now...
-```python
-class DCGAN(object):
-  def generator(self, z, y = None):
-    # input z: the random noise
-    # input y: input data label (optional)
-    # output G_im: generated fake images
-    
-    if not self.y_dim:
-      z = pd.layer.concat(1, [z, y])
-      
-    G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
-    G_h0_bn = pd.layer.batch_norm(G_h0)
-    G_h0_relu = pd.layer.relu(G_h0_bn)
-    
-    G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
-    G_h1_bn = pd.layer.batch_norm(G_h1)
-    G_h1_relu = pd.layer.relu(G_h1_bn)
-    
-    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2))
-    G_im = pd.layer.tanh(G_im)
-    return G_im
-```
-
-### Class member function: Discriminator
-- Given a noisy input z, returns a fake image.
-- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required;
-```python
-class DCGAN(object):
-  def discriminator(self, image):
-    # input image: either generated images or real ones
-    # output D_h2: binary logit of the label
-
-    D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
-    D_h0_bn = pd.layer.batchnorm(h0)
-    D_h0_relu = pd.layer.lrelu(h0_bn)
-    
-    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
-    D_h1_bn = pd.layer.batchnorm(D_h1)
-    D_h1_relu = pd.layer.lrelu(D_h1_bn)
-    
-    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
-    return D_h2
-```
-
-### Class member function: Build the model
-- Define data readers as placeholders to hold the data;
-- Build generator and discriminators;
-- Define two training losses for discriminator and generator, respectively. 
-If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
-```python
-class DCGAN(object):
-  def build_model(self):
-    if self.y_dim:
-        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
-    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
-    self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
-    self.z = pd.data(tf.float32, [None, self.z_size])
-    
-    # step 1: generate images by generator, classify real/fake images with discriminator
-    if self.y_dim: # if conditional GAN, includes label
-        self.G = self.generator(self.z, self.y)
-        self.D_t = self.discriminator(self.images)
-        # generated fake images
-        self.sampled = self.sampler(self.z, self.y)
-        self.D_f = self.discriminator(self.G)
-    else: # original version of GAN
-        self.G = self.generator(self.z)
-        self.D_t = self.discriminator(self.images)
-        # generate fake images
-        self.sampled = self.sampler(self.z)
-        self.D_f = self.discriminator(self.images)
-    
-    # step 2: define the two losses
-    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
-    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
-    self.d_loss = self.d_loss_real + self.d_loss_fake
-    
-    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
-```
-
-If we do not have dependency engine but blocks, the module building our GAN model will be like this:
-```python
-class DCGAN(object):
-  def build_model(self, default_block):
-    # input data in the default block
-    if self.y_dim:
-        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
-    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
-    # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
-    self.z = pd.data(tf.float32, [None, self.z_size])
-
-    # step 1: generate images by generator, classify real/fake images with discriminator
-    with pd.default_block().g_block():
-      if self.y_dim: # if conditional GAN, includes label
-        self.G = self.generator(self.z, self.y)
-        self.D_g = self.discriminator(self.G, self.y)
-      else: # original version of GAN
-        self.G = self.generator(self.z)
-        self.D_g = self.discriminator(self.G, self.y)
-      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
-    
-    with pd.default_block().d_block():
-      if self.y_dim: # if conditional GAN, includes label
-        self.D_t = self.discriminator(self.images, self.y)
-        self.D_f = self.discriminator(self.G, self.y)
-      else: # original version of GAN
-        self.D_t = self.discriminator(self.images)
-        self.D_f = self.discriminator(self.G)
-
-      # step 2: define the two losses
-      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
-      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
-      self.d_loss = self.d_loss_real + self.d_loss_fake
-```
-Some small confusion and problems with this design:
-- D\_g and D\_f are actually the same thing, but has to be written twice; i.e., if we want to run two sub-graphs conceptually, the same codes have to be written twice if they are shared by the graph.
-- Requires ability to create a block anytime, rather than in if-else or rnn only;
-
-## Main function for the demo:
-Generally, the user of GAN just need to the following things:
-- Define an object as DCGAN class;
-- Build the DCGAN model;
-- Specify two optimizers for two different losses with respect to different parameters.
-```python
-# pd for short, should be more concise.
-from paddle.v2 as pd
-import numpy as np
-import logging
-
-if __name__ == "__main__":
-    # dcgan class in the default graph/block
-    # if we use dependency engine as tensorflow
-    # the codes, will be slightly different like:
-    # dcgan = DCGAN()
-    # dcgan.build_model()
-    with pd.block() as def_block:
-      dcgan = DCGAN()
-      dcgan.build_model(def_block)
-
-    # load mnist data
-    data_X, data_y = self.load_mnist()
-    
-    # Two subgraphs required!!!
-    with pd.block().d_block():
-      d_optim = pd.train.Adam(lr = .001, beta= .1)
-      d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
-    with pd.block.g_block():
-      g_optim = pd.train.Adam(lr = .001, beta= .1)
-      g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G)
-
-    # executor
-    sess = pd.executor()
-    
-    # training
-    for epoch in xrange(10000):
-      for batch_id in range(N / batch_size):
-        idx = ...
-        # sample a batch
-        batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
-        # sample z
-        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
-
-        if batch_id % 2 == 0:
-          sess.run(d_step, 
-                   feed_dict = {dcgan.images: batch_im,
-                                dcgan.y: batch_label,
-                                dcgan.z: batch_z})
-        else:
-          sess.run(g_step,
-                   feed_dict = {dcgan.z: batch_z})
-```
-
-# More thinking about dependency engine v.s. block design:
-- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
-- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage?
diff --git a/doc/design/images/duplicate_op.graffle b/doc/design/images/duplicate_op.graffle
deleted file mode 100644
index 5979f792e252f028a615729215529c2be42d9165..0000000000000000000000000000000000000000
Binary files a/doc/design/images/duplicate_op.graffle and /dev/null differ
diff --git a/doc/design/images/duplicate_op.png b/doc/design/images/duplicate_op.png
deleted file mode 100644
index f299c5d37f260a1bb0daec886f0a4ee1c1f31c92..0000000000000000000000000000000000000000
Binary files a/doc/design/images/duplicate_op.png and /dev/null differ
diff --git a/doc/design/images/duplicate_op2.graffle b/doc/design/images/duplicate_op2.graffle
deleted file mode 100644
index 5cec3bc64dbd44dc99e348485969f29bd128ceb1..0000000000000000000000000000000000000000
Binary files a/doc/design/images/duplicate_op2.graffle and /dev/null differ
diff --git a/doc/design/images/duplicate_op2.png b/doc/design/images/duplicate_op2.png
deleted file mode 100644
index 21cdd5cabf1b5203e1435a75b57770d2f702fa92..0000000000000000000000000000000000000000
Binary files a/doc/design/images/duplicate_op2.png and /dev/null differ
diff --git a/doc/design/images/replica.png b/doc/design/images/replica.png
deleted file mode 100644
index ef59e56b01d792a059279e6bb9a29f3db6a59a41..0000000000000000000000000000000000000000
Binary files a/doc/design/images/replica.png and /dev/null differ
diff --git a/doc/design/images/two_phase_commit.png b/doc/design/images/two_phase_commit.png
deleted file mode 100644
index ef6f7317bd440cc7d9fe08fcbbf2b7a542f99049..0000000000000000000000000000000000000000
Binary files a/doc/design/images/two_phase_commit.png and /dev/null differ
diff --git a/doc/design/kernel_hint_design.md b/doc/design/kernel_hint_design.md
deleted file mode 100644
index a54b7da045e1a362626ef066f9ebb56af2c3181a..0000000000000000000000000000000000000000
--- a/doc/design/kernel_hint_design.md
+++ /dev/null
@@ -1,57 +0,0 @@
-## Problem
-In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
-
-In the current design, we use KernelType to describe one kernel.
-
-```cpp
-struct KernelType {
-  Place place_;
-  DataType data_type_;
-  LayoutType layout_;
-};
-```
- `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it.
-
-The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
-
-So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel.
-
-The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
-
-## Solution
-
-### Potential choice
-1. Do nothing, let the user add the information they want to operator‘s attribute and get them inside `GetExpectedKernelType`, this can work properly. But there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose CUDNN kernel.
-
-2. Pre-define all the needed option and use a single attr key such as `kernel_hint` for the user, this is not so flexible if the user wants to define some more kind of hint.
-
-### Final choice
-To provide enough flexibility while avoiding confusion definition, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn` for a user to choose.
-
-In C++
-
-```cpp
-const std::string kForceCPU = "force_cpu";
-const std::string kUseCUDNN = "use_cudnn";
-const std::string kUseMKLDNN = "use_mkldnn";
-
-KernelType GetExpectedKernelType() {
-  if (Attr<bool>(kForceCPU)) {
-    return KernelType(CPUPlace, ...)
-  } else {
-    ...
-  }
-}
-```
-
-In Python code
-
-```python
-FORCE_CPU = core.kForceCPU()
-
-def xx_layer(..., force_cpu=false):
-  layer_helper = LayerHelper(...)
-  layer_helper.append_op(
-    type="xx",
-    attr={FORCE_CPU: force_cpu})
-```
diff --git a/doc/design/kernel_selection.md b/doc/design/kernel_selection.md
deleted file mode 100644
index 9719e031c70979cd95400701efd30879662e19bc..0000000000000000000000000000000000000000
--- a/doc/design/kernel_selection.md
+++ /dev/null
@@ -1,99 +0,0 @@
-## Background
-Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold.
-
-The `OpKernelType ` is as follows:
-
-```cpp
-struct OpKernelType {
-  Place place_;
-  DataType data_type_;
-  DataLayout data_layout_;
-  LibraryType library_type_;
-};
-```
-
-- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace.
-
-- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will be a major `data_type`. For example, the `cross_entropy` takes `int64` as it label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`.
-
-- The `data_layout_ ` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel.
-
-- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
-
-## Problem
-
-We register a kernel for every operator and every kernel type ideally. However, it is impracticable for the following situations.
-
-1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel.
-2. Some operators will take too many memory. It is better to force them into CPU. However, the rest of operators in this neural network will be performed on GPU, i.e., model parallel problem.
-3. Some layout and place are particular. One example is that MKLDNN uses `nChw8` and there is no other library uses `nChw8c`.
-
-Take one situation to give a detailed explanation, if we have two Operators: OP1 and OP2, OP1 has one output `op1_to_op2`, and `op1_to_op2` is the input of OP2.
-
-If OP1 and OP2 run on the same place(for example CPUPlace), then `op1_2_op2` can be used directly by OP2.
-
-```
-OP1(CPUPlace)
-     |
- op1_2_op2
-     |
-OP2(CPUPlace)
-```
-
-If OP1 and OP2 run one different place, then OP2 cannot `use op1_2_op2` directly.
-
-Problems under these situations are similar. We can formalize this problem as follow.
-
-We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, which the $kt_{?} \notin KT$. How to cast the input of this operator from $kt_{?}$ to any of kernel type in $KT$.
-
-## Solution: data transform
-
-It is clear that transforming inputs of an operator to adapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
-
-We can infer kernel type for each input of an operator. We let this kernel type as `actual kernel type for var`, which means this kernel type is the kernel type that can process this input variable.
-
-We can get a kernel type by 1) The configuration of operator description. (Users may want to force use `MKL` for `conv` operator). 2) The place of the current executor. (Executor is running on GPU). This kernel type is what we expect the operator will be performed on. We let this kernel type as `expect kernel type`.
-
-We transform the input data from `actual` to `expect` if the actual kernel type is not as same as expect kernel type.
-
-The algorithm is described as following
-
-```cpp
-void OperatorWithKernel::Run(
-        const Scope& scope,
-        const platform::Place& place) const {
-  ExecutionContext ctx(...);
-  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
-
-  Scope& new_scope = scope.NewScope();
-
-  for (auto& var_name : this->Inputs()) {
-    auto* tensor_in = GetTensor(var_name);
-    auto kernel_type_for_var = this->GetKernelTypeForVar(...);
-    if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
-      auto* trans_var = new_scope.Var(var_name);
-      auto* out = DataTransform(expected_kernel_key,
-                                kernel_type_for_var,
-                                *tensor_in);
-      CopyVariableWithTensor(...);
-    }
-  }
-
-  auto kernel = kernels.find(expected_kernel_key);
-  kernel->Compute(ExecutionContext(...));
-}
-```
-
-then the actual process for the multi-device above will be:
-
-```
-OP1(CPUPlace)
-     |
-op1_2_op2(on CPU)
-     |
-[transform](from CPU to GPU)
-     |
-op1_2_op2(on GPU)
-     |
-OP2(CUDAPlace)
-```
diff --git a/doc/design/mkl/mkldnn.md b/doc/design/mkl/mkldnn.md
deleted file mode 100644
index e2fe1e6b26ffa73fda81863abfadf697c0acbfcf..0000000000000000000000000000000000000000
--- a/doc/design/mkl/mkldnn.md
+++ /dev/null
@@ -1,210 +0,0 @@
-# Intel® MKL-DNN on PaddlePaddle: Design Doc
-
-我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn)
-(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle，
-充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
-
-<div align="center">
-<img src="image/overview.png"><br/>
-Figure 1. PaddlePaddle on IA
-</div>
-
-近期目标
-
-- 完成常用Layer的MKL-DNN实现。
-- 完成常见深度神经网络VGG，GoogLeNet 和 ResNet的MKL-DNN实现。
-
-目前的优化，主要针对PaddlePaddle在重构之前的代码框架以及V1的API。
-具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。
-
-## Contents
-
-- [Overview](#overview)
-- [Actions](#actions)
- 	- [CMake](#cmake)
- 	- [Matrix](#matrix)
-	- [Layers](#layers)
-	- [Activations](#activations)
-	- [Parameters](#parameters)
-	- [Gradients](#gradients)
-	- [Unit Tests](#unit-tests)
-	- [Python API](#python-api)
-	- [Benchmarking](#benchmarking)
-	- [Others](#others)
-- [Design Concerns](#design-concerns)
-
-## Overview
-
-我们会把MKL-DNN会作为第三方库集成进PaddlePaddle，与其他第三方库一样，会在编译PaddlePaddle的时候下载并编译MKL-DNN。
-
-同时，为了进一步提升PaddlePaddle在基本数学运算的计算速度，我们也将MKLML即(MKL small library\[[1](#references)\])
-作为另一个第三方库集成进PaddlePaddle，它只会包括生成好的动态库和头文件。
-
-MKL，MKLML以及MKL-DNN三者关系如下表：
-
-| Name        |  Open Source     | License     | Descriptions  |
-| :---------- | :--------------- | :---------- | :------------ |
-|   MKL       |     No           | Proprietary | Accelerate math processing routines | 
-|   MKLML     |     No           | Proprietary | Small package of MKL, especially for Machine Learning |
-|   MKL-DNN   |     Yes          | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks  |
-
-MKLML可以与MKL-DNN共同使用，以此达到最好的性能。
-
-<div align="center">
-<img src="image/engine.png"><br/>
-Figure 2. PaddlePaddle with MKL Engines
-</div>
-
-## Actions
-
-添加的相关文件和目录结构如下：
-
-```txt
-PaddlePaddle/Paddle
-├── ...
-├── cmake/
-│   ├── external/
-│   │   ├── ...
-│   │   ├── mkldnn.cmake
-│   │   └── mklml.cmake
-└── paddle/
-    ├── ...
-    ├── math/
-    │   ├── ...
-    │   └── MKLDNNMatrix.*
-    └── gserver/
-        ├── ...
-        ├── layers/
-        │   ├── ...
-        │   └── MKLDNN*Layer.*
-        ├── activations/
-        │   ├── ...
-        │   └── MKLDNNActivations.*
-        └── tests/
-            ├── ...
-            ├── MKLDNNTester.*
-            └── test_MKLDNN.cpp
-```
-
-### CMake
-在`CMakeLists.txt`中提供一个与MKL有关的总开关：`WITH_MKL`，它负责决定编译时是否使用MKLML和MKL-DNN
-
-- `WITH_MKLML` 控制是否使用MKLML库。 
-当打开`WITH_MKL`时，会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。
-编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
-MKLML的库目前都是动态库，主要包括`libiomp5.so`和`libmklml_intel.so`。
-- `WITH_MKLDNN` 控制是否使用MKL-DNN。
-当开启`WITH_MKL`时，会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。
-编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。
-MKL-DNN的库目前只有动态库`libmkldnn.so`。
-
-### Matrix
-目前在PaddlePaddle中数据都是以`NCHW`的格式存储，但是在MKL-DNN中的排列方式不止这一种。
-所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
-
-<div align="center">
-<img src="image/matrix.png"><br/>
-Figure 3. MKLDNNMatrix
-</div>
-
-### Layers
-所有MKL-DNN的Layers都会继承于`MKLDNNLayer`，该类继承于PaddlePaddle的基类`Layer`。
-在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑，
-子类只需要使用定义好的接口，实现具体的函数功能即可。
-
-<div align="center">
-<img src="image/layers.png"><br/>
-Figure 4. MKLDNNLayer
-</div>
-
-每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix：
-
-- 内部存储（internel memory）：`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表输入数据，输入梯度，输出数据和输出梯度。
-- 外部存储（external memory）：都是以ext开头，比如`extInVal_`和`extInGrad_`，它们主要是用于，
-当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时，转换内存的工作。
-需要注意的是，PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`，
-所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存，
-如果不需要外部存储用于转换，那么对应的内部存储也会与它们共享内存。
-- 转换函数（resetXXX）： 包括`resetInValue`，`resetInGrad`，`resetOutValue`和`resetOutGrad`，
-表示对输入数据，输入梯度，输出数据和输出梯度的转换。
-这些函数会根据输入参数重新设置内部和外部存储，当然这两者也可以相等，即表示不需要转换。
-
-注意：每个`MKLDNNlayer`的子类只需要使用内部存储就可以了，所有外部的转换工作都会在reset系列函数中都准备好。
-
-### Activations
-在重构前的PaddlePaddle中，激活函数是独立于`Layer`的概念，并且输入输出都是共用一块内存，
-所以添加了对应的`MKLDNNActivation`来实现，方式类似于`MKLDNNLayer`。
-
-### Parameters
-对于有参数的层，我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。
-如果存在数据排列格式不一样的情况时，我们会在网络训练之前把格式转换为MKL-DNN希望的格式，
-在训练结束的时候再保存为PaddlePaddle的格式，但是整个训练过程中不需要任何转换。
-这样既使得最终保存的参数格式与PaddlePaddle一致，又可以避免不必要的转换。
-
-### Gradients
-由于MKL-DNN的操作都是直接覆盖的形式，也就是说输出的结果不会在原来的数据上累加，
-这样带来的好处就是不需要一直清空memory，节省了不必要的操作。
-但是注意的是，当网络出现分支且在`backward`的时候，需要累加不同Layer传过来的梯度。
-所以在`MKLDNNlayer`中实现了一个merge的方法，此时每个小分支的`Input Gradient`
-会先临时保存在`MKLDNNMatrix`中，由分支处的Layer负责求和，并把结果放到当前层的`output_.grad`中。
-所以整体上，在实现每个子类的时候就不需要关心分支的事情了。
-
-<div align="center">
-<img src="image/gradients.png"><br/>
-Figure 5. Merge Gradients
-</div>
-
-### Unit Tests
-我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
-测试分为每个Layer（或Activation）的单元测试和简单网络的整体测试。
-每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
-
-### Python API
-目前只考虑**v1 API**。
-
-计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择，方便用户选择使用MKL-DNN的layers。
-
-具体实现方式比如：
-
-```python
-use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
-if use_mkldnn
-    self.layer_type = mkldnn_*
-```
-
-所有MKL-DNN的`layer_type`会以*mkldnn_*开头，这些会在`MKLDNN*Layer`注册layer的时候保证，以示区分。 
-
-同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
-
-### Benchmarking
-会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image)，用于测试和对比在使用MKL-DNN前后的CNN网络性能。
-测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md)
-
-### Others
-1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为4096，具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。
-2. 深入PaddlePaddle，寻找有没有其他可以优化的可能，进一步优化。比如可能会用OpenMP改进SGD的更新性能。
-
-## Design Concerns
-
-为了更好的符合PaddlePaddle的代码风格\[[3](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。
-
-我们总结出一些特别需要注意的点：
-
-1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，
-我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
-2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
-3. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。
-包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
-4. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，
-同时数据格式就是`NCHW`，这样下一个cpu device就能拿到正确的数据。
-在有普通的CPU layer时， `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。
-
-## References
-1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。
-主要包括了深度学习相关的数学原语与操作，一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
-2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。
-目前在PaddlePaddle中，仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。
-3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。
-但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
-4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
-所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
diff --git a/doc/design/model_format.md b/doc/design/model_format.md
deleted file mode 100644
index e29129fddf775939c9f7a8b49d850d523e6e5a45..0000000000000000000000000000000000000000
--- a/doc/design/model_format.md
+++ /dev/null
@@ -1,36 +0,0 @@
-# Design Doc: Model Format
-
-## Motivation
-
-A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
-
-As a result, In PaddlePaddle, the **topology** is represented as a  [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters. 
-
-## Implementation
-
-The topology is saved as a plain text in a detailed self-contain protobuf file. 
-
-The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
-
-As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is, 
-
-The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
-
-|field name  | type | description |
-| --- | --- | --- |
-| version | uint32_t | Version of saved file. Always 0 now. |
-| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
-| tensor desc | void* | TensorDesc protobuf binary message |
-| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
-| lod_level | uint64_t | Level of LoD |
-| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
-| data of lod[0] | uint64_t*  | [Optional] lod[0].data() |
-| ... | ... | ... |
-
-
-
-## Summary
-
-- We introduce a model format.
-- The model represented by its forward-pass computation procedure is saved in a **ProgramDesc** protobuf message.
-- A bunch of specified format binary tensors describe the **parameters**.
diff --git a/doc/design/multi_language_interface/00.why_plain_c.md b/doc/design/multi_language_interface/00.why_plain_c.md
deleted file mode 100644
index a1443093342c5a3ed698fb6b52a751dfc7cb5319..0000000000000000000000000000000000000000
--- a/doc/design/multi_language_interface/00.why_plain_c.md
+++ /dev/null
@@ -1,118 +0,0 @@
-# Paddle多语言接口实现
-## 背景
-
-Paddle需要一个多语言接口，这个接口需要做到:
-
-* 有标准的，良好的文档
-    * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档，golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。
-* 不同语言的接口适应不同语言的特性
-    * 例如Java与Python的错误处理是直接扔出来Exception，而对于golang错误处理应该使用返回值。
-
-## 基本要求
-
-Paddle的多语言接口实现包括一下几个方面:
-
-* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器，也不使用其他动态库。
-* 这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号。
-* 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)。
-* 不使用SWIG这种代码生成器，而是手写多语言绑定。
-
-
-## 原因
-
-### 使用动态库来分发Paddle
-
-* Paddle的链接方式比较复杂
-    * 如果用户要把Paddle的静态库（libpaddle.a）链接到自己的程序里，得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数，来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。
-* 编译型语言，例如C/C++使用静态库和动态库难度差不多。但是解释性语言，例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni)，只能调用Paddle的动态库，否则得把Paddle静态库链接到解释器里。
-    * 解释性语言实际运行的二进制是解释器本身，如果调用静态库只能将静态库与解释器链接。例如对于Java来说，便是将静态库加入JVM中。这对于通常的Java的开发者来说，是不常见的做法。
-
-### 动态库中不嵌入任何其他语言的解释器
-
-* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取
-* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析，数据读取均交由其他语言完成
-
-现阶段Paddle有一个问题是，Paddle内嵌的Python解释器和外部使用的Python如果版本不同，会直接报错退出。
-
-### Paddle动态库中，不引用其他动态库
-
-* 即这个动态库是不依赖于其他任何文件的，可以在任何机器上执行的。
-
-###  这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号
-
-* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范，不同版本的编译器之间，对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库)，需要有稳定的导出符号。
-* C语言是有导出符号的标准的，并且在常见的平台上，都是ABI调用标准的。
-* 大多数语言都支持使用C语言API
-* 使用C99而不使用C89，是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。
-* 使用C99而不使用C11的原因是，[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性，且C99相对于C11使用更加广泛。
-
-### 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)
-
-* Paddle内部的类为C++书写，直接导出到C的接口比较困难。
-* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。
-
-在C的头文件 `paddle_matrix.h` 中:
-
-```C
-typedef void* paddle_matrix;
-typedef int paddle_error;
-
-extern "C"
-paddle_error paddle_matrix_get_shape(paddle_matrix matrix,
-                                     uint64_t* width,
-                                     uint64_t* height);
-```
-而在CPP里面实现这个C的接口，文件 `paddle_matrix.cpp`
-
-```cpp
-#include "paddle/math/matrix.h"
-extern "C"
-paddle_error paddle_matrix_shape(paddle_matrix matrix,
-                                 uint64_t *width,
-                                 uint64_t *height) {
-  auto m = (paddle::capi::CMatrix*)(matrix);
-  *width = m->width();
-  *height = m->height();
-}
-```
-
-其中`paddle/capi/CMatrix.hpp`文件内容为:
-
-```cpp
-namespace paddle {
-namespace math {  
-
-class CMatrix {
-  std::shared_ptr<paddle::Matrix> mat;
-};
-
-}  // namespace math
-}  // namespace paddle
-```
-
-### 不使用SWIG这种代码生成器，而是手写多语言绑定
-
-* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码，SWIG直接读取C/C++的头文件，生成各种语言的绑定代码。
-    * 对于多语言接口，SWIG需要写一个interface文件。这个文件具有独特的语法，学习成本高。且增加一个第三方语言，就需要对这个第三方语言增加一些定义。有的时候，interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。
-    * SWIG暴露的接口保留了C++的接口样式，很难保证多语言代码风格的一致性。(函数命名，错误处理)
-        * 因为SWIG在第三方语言中暴露的函数名，类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里，将大量的`SomeCppClass`重命名成`some_python_class`，或者`SomeGoTypes`。
-        * 对于不同语言，错误处理的方式也不尽相同。例如对于Java或者Python，最常见的错误处理方式是Exception，而对于Golang，错误处理方式是返回值。而SWIG只能简单的暴露C++接口，无法做到对于各种语言错误处理方式的适配。
-    * 对于大多数语言，直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。
-    * SWIG支持的语言或者解释器有局限。例如对于Python，使用SWIG只支持CPython解释器，而不支持PyPy解释器。
-
-
-## 原因列表
-
-| 结论 | 对比 | 原因 |
-|---| --- | --- |
-| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库，Paddle静态库链接复杂 |
-| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器，会导致不同版本Python在一个进程里的bug |
-| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 |
-| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI，C99是目前C最广泛的使用标准，且C99支持bool类型和定长整数(uint64_t等)类型 |
-| 使用void*作为类句柄 | 不显示的写每个类具体包含什么| 实现简单，并且让接口脱离实现细节 |
-| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置，社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 |
-
-
-## 实现
-
-参考[Inference implementation](01.inference_implementation.md)
diff --git a/doc/design/operator_kernel_type.md b/doc/design/operator_kernel_type.md
deleted file mode 100644
index f86e6b7a564ed23f2bddbec25da1c110014f941d..0000000000000000000000000000000000000000
--- a/doc/design/operator_kernel_type.md
+++ /dev/null
@@ -1,91 +0,0 @@
-# Design Doc: The Keys of Operator Kernel Type
-## Problem
-An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
-
-```cpp
-struct OpKernelType {
-  platform::Place place_;
-  proto::DataType data_type_;
-};
-```
-For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
-
-It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.
-
-We often implement a kernel of an operator with some computing library on certain device(place). Please note that computing library and device do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support different devices.
-
-For example, Eigen library supports Nvidia GPU/AMD GPU/CPU and MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
-
-Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layout of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
-
-## Solution
-
-There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
-
-```cpp
-struct OpKernelType {
-  platform::Place place_;
-  platform::Library library_;
-  proto::DataType data_type_;
-  framework::Layout layout_;
-};
-```
-
-The details are as follows:
-
-### Place
-
-`Place` is defined as:
-
-```cpp
-typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
-```
-
-`Place` represents the device memory where data is located.
-
-
-### Library
-
-One operator kernel is usually implemented based on one library. `Library` is defined as a enum variable:
-
-```cpp
-enum Library { Plain, MKLDNN, CUDNN };
-```
-
-We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
-A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CDUADeviceContext` contains an Eigen library handle and a cuBLAS handle.
-
-If we want to support new library, a new enumerator need to be added to `Library` and a corresponding new `LibraryDeviceContext` need to be created.
-
-
-### DataType
-
-
-`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
-
-### Layout
-
-Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
-
-Different layout leads to different implementation of the operator kernel. There are mainly 4 principles we have to follow to support layout in our Fluid framework.
-
-- We take layout as a data member of Tensor. Layout is actually a enum variable. If Fluid is built with MKLDNN, then the memory format in MKLDNN will also be added into this enum variable.
-
-- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout for generating data. Of course, we can have some default layout, like NCHW.
-
-- The inference of Layout is at run-time, not at compile-time.
-
-- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to  register kernels for MKLDNN operators.
-
-`Layout` is also defined as a enum variable:
-
-```cpp
-enum Layout {
-  kNCHW,
-  kNHWC,
-#ifdef PADDLE_WITH_MKLDNN
-  knChw8c
-  ...
-#endif
-};
-```
diff --git a/doc/design/ops/rnn.md b/doc/design/ops/rnn.md
deleted file mode 100644
index 2f4854793fa1f0b02e4dc17b51a48a972be61c06..0000000000000000000000000000000000000000
--- a/doc/design/ops/rnn.md
+++ /dev/null
@@ -1,153 +0,0 @@
-# RNNOp design
-
-This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
-
-## RNN Algorithm Implementation
-
-<p align="center">
-<img src="./images/rnn.jpg"/>
-</p>
-
-The above diagram shows an RNN unrolled into a full network.
-
-There are several important concepts here:
-
-- *step-net*: the sub-graph that runs at each step.
-- *memory*, $h_t$, the state of the current step.
-- *ex-memory*, $h_{t-1}$, the state of the previous step.
-- *initial memory value*, the memory of the first (initial) step.
-
-### Step-scope
-
-There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
-
-<p align="center">
-<img src="./images/rnn.png"/><br/>
-Figure 2 illustrates the RNN's data flow
-</p>
-
-Please be aware that every step runs the same step-net.  Each step does the following:
-
-1. Creates the step-scope.
-2. Initializes the local variables including step-outputs, in the step-scope.
-3. Runs the step-net, which uses the above mentioned variables.
-
-The RNN operator will compose its output from step outputs in each of the step scopes.
-
-### Memory and Ex-memory
-
-Let's give more details about memory and ex-memory using a simple example:
-
-$$
-h_t = U h_{t-1} + W x_t
-$$,
-
-where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.
-
-In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
-or copy the memory value of the previous step to the current ex-memory variable.
-
-### Usage in Python
-
-For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
-
-We can define an RNN's step-net using a Block:
-
-```python
-import paddle as pd
-
-X = some_op() # x is some operator's output and is a LoDTensor
-a = some_op()
-
-# declare parameters
-W = pd.Variable(shape=[20, 30])
-U = pd.Variable(shape=[20, 30])
-
-rnn = pd.create_rnn_op(output_num=1)
-with rnn.stepnet():
-    x = rnn.add_input(X)
-    # declare a memory (rnn's step)
-    h = rnn.add_memory(init=a)
-    # h.pre_state(), the previous memory of rnn
-    new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
-    # update current memory
-    h.update(new_state)
-    # indicate that h variables in all step scopes should be merged
-    rnn.add_outputs(h)
-
-out = rnn()
-```
-
-Python API functions in above example:
-
-- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
-- `rnn.add_memory`: creates a variable used as the memory.
-- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
-
-### Nested RNN and LoDTensor
-
-An RNN whose step-net includes other RNN operators is known as an *nested RNN*.
-
-For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
-
-The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
-
-<p align="center">
-<img src="./images/2_level_rnn.png"/>
-</p>
-
-```python
-import paddle as pd
-
-W = pd.Variable(shape=[20, 30])
-U = pd.Variable(shape=[20, 30])
-
-W0 = pd.Variable(shape=[20, 30])
-U0 = pd.Variable(shape=[20, 30])
-
-# a is output of some op
-a = some_op()
-
-# chapter_data is a set of 128-dim word vectors
-# the first level of LoD is sentence
-# the second level of LoD is a chapter
-chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
-
-def lower_level_rnn(paragraph):
-    '''
-    x: the input
-    '''
-    rnn = pd.create_rnn_op(output_num=1)
-    with rnn.stepnet():
-        sentence = rnn.add_input(paragraph, level=0)
-        h = rnn.add_memory(shape=[20, 30])
-        h.update(
-            pd.matmul(W, sentence) + pd.matmul(U, h.pre_state()))
-        # get the last state as sentence's info
-        rnn.add_outputs(h)
-    return rnn
-
-top_level_rnn = pd.create_rnn_op(output_num=1)
-with top_level_rnn.stepnet():
-    paragraph_data = rnn.add_input(chapter_data, level=1)
-    low_rnn = lower_level_rnn(paragraph_data)
-    paragraph_out = low_rnn()
-
-    h = rnn.add_memory(init=a)
-    h.update(
-        pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
-    top_level_rnn.add_outputs(h)
-
-# output the last step
-chapter_out = top_level_rnn(output_all_steps=False)
-```
-
-In the above example, the construction of the `top_level_rnn` calls  `lower_level_rnn`.  The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
-
-By default, the `RNNOp` will concatenate the outputs from all the time steps.
-If the `output_all_steps` is set to False, it will only output the final time step.
-
-
-<p align="center">
-<img src="images/rnn_2level_data.png"/>
-</p>
diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
deleted file mode 100644
index c4a9bbeeefca0e05c335dd60233691e8bac33015..0000000000000000000000000000000000000000
--- a/doc/design/ops/sequence_decoder.md
+++ /dev/null
@@ -1,229 +0,0 @@
-# Design: Sequence Decoder Generating LoDTensors
-In tasks such as machine translation and visual captioning,
-a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
-
-This documentation describes how to implement the sequence decoder as an operator.
-
-## Beam Search based Decoder
-The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
-
-In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users.
-
-There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
-
-During the refactoring of PaddlePaddle, some new concepts are proposed such as:  [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
-
-For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`;
-the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
-
-## Changing LoD's absolute offset to relative offsets
-The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.
-
-The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
-let's call this format the **absolute-offset LoD** for clarity.
-
-The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
-```python
-[[0, 3, 9]
- [0, 2, 3, 3, 3, 9]]
-```
-The first level tells that there are two sequences:
-- the first's offset is `[0, 3)`
-- the second's offset is `[3, 9)`
-
-while on the second level, there are several empty sequences that both begin and end at `3`.
-It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
-
-There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix.
-
-So let's introduce another format of LoD,
-it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
-
-For example, to represent the same sequences of the above data
-
-```python
-[[0, 3, 6]
- [0, 2, 3, 3, 3, 9]]
-```
-
-the first level represents that there are two sequences,
-their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
-
-The second level is the same with the relative offset example because the lower level is a tensor.
-It is easy to find out the second sequence in the first-level LoD has two empty sequences.
-
-The following examples are based on relative-offset LoD.
-
-## Usage in a simple machine translation model
-Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.
-
-The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences.
-
-**Encoder**
-```python
-import paddle as pd
-
-dict_size = 8000
-source_dict_size = dict_size
-target_dict_size = dict_size
-word_vector_dim = 128
-encoder_dim = 128
-decoder_dim = 128
-beam_size = 5
-max_length = 120
-
-# encoder
-src_word_id = pd.data(
-    name='source_language_word',
-    type=pd.data.integer_value_sequence(source_dict_dim))
-src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim)
-
-src_word_vec = pd.lookup(src_embedding, src_word_id)
-
-encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
-
-encoder_ctx = pd.last_seq(encoder_out_seq)
-# encoder_ctx_proj is the learned semantic vector
-encoder_ctx_proj = pd.fc(
-    encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
-```
-
-**Decoder**
-
-```python
-def generate():
-    decoder = pd.while_loop()
-    with decoder.step():
-        decoder_mem = decoder.memory(init=encoder_ctx)  # mark the memory
-        generated_ids = decoder.memory() # TODO init to batch_size <s>s
-        generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
-
-        target_word = pd.lookup(trg_embedding, gendrated_ids)
-        # expand encoder_ctx's batch to fit target_word's lod
-        # for example
-        # decoder_mem.lod is
-        # [[0 1 3],
-        #  [0 1 3 6]]
-        # its tensor content is [a1 a2 a3 a4 a5]
-        # which means there are 2 sentences to translate
-        #   - the first sentence has 1 translation prefixes, the offsets are [0, 1)
-        #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
-        # the target_word.lod is
-        # [[0, 1, 6]
-        #  [0, 2, 4, 7, 9 12]]
-        # which means 2 sentences to translate, each has 1 and 5 prefixes
-        # the first prefix has 2 candidates
-        # the following has 2, 3, 2, 3 candidates
-        # the encoder_ctx_expanded's content will be
-        # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
-        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
-        decoder_input = pd.fc(
-            act=pd.activation.Linear(),
-            input=[target_word, encoder_ctx_expanded],
-            size=3 * decoder_dim)
-        gru_out, cur_mem = pd.gru_step(
-            decoder_input, mem=decoder_mem, size=decoder_dim)
-        scores = pd.fc(
-            gru_out,
-            size=trg_dic_size,
-            bias=None,
-            act=pd.activation.Softmax())
-        # K is an config
-        topk_scores, topk_ids = pd.top_k(scores, K)
-        topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
-
-        selected_ids, selected_generation_scores = decoder.beam_search(
-            topk_ids, topk_generated_scores)
-
-        # update the states
-        decoder_mem.update(cur_mem)  # tells how to update state
-        generated_ids.update(selected_ids)
-        generated_scores.update(selected_generation_scores)
-
-        decoder.output(selected_ids)
-        decoder.output(selected_generation_scores)
-
-translation_ids, translation_scores = decoder()
-```
-The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
-returns the result of the beam search algorithm.
-
-In this way, users can customize anything on the input or output of beam search, for example:
-
-1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
-2. Remove some specific candidate in `selected_ids`.
-3. Get the final `translation_ids`, remove the translation sequence in it.
-
-The implementation of sequence decoder can reuse the C++ class:  [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
-so the python syntax is quite similar to that of an  [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
-
-Both of them are two-level `LoDTensors`:
-
-- The first level represents `batch_size` of (source) sentences.
-- The second level represents the candidate ID sets for translation prefix.
-
-For example, 3 source sentences to translate, and has 2, 3, 1 candidates.
-
-Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
-
-For example, the previous state:
-
-* LoD is `[0, 1, 3][0, 2, 5, 6]`
-* content of tensor is `a1 a2 b1 b2 b3 c1`
-
-the current state is stored in `encoder_ctx_expanded`:
-
-* LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
-* the content is
-  - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
-  - a2 a2
-  - b1 b1 b1
-  - b2
-  - b3 b3
-  - None (c1 has 0 candidates, so c1 is dropped)
-
-The benefit from the relative offset LoD is that the empty candidate set can be represented naturally.
-
-The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is:
-
-```python
-decoder.output(selected_ids)
-decoder.output(selected_generation_scores)
-```
-
-The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences.
-
-Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
-
-Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
-
-## LoD and shape changes during decoding
-<p align="center">
-  <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
-</p>
-
-According to the image above, the only phase that changes the LoD is beam search.
-
-## Beam search design
-The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
-
-1. `topk_ids`, the top K candidate ids for each prefix.
-2. `topk_scores`, the corresponding scores for `topk_ids`
-3. `generated_scores`, the score of the prefixes.
-
-All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
-
-It will return three variables:
-
-1. `selected_ids`, the final candidate beam search function selected for the next step.
-2. `selected_scores`, the scores for the candidates.
-3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
-
-## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
-The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
-so it is natural to store them in arrays.
-
-Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
-
-The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors.
-It needs some extensions to support the packing or unpacking an array of `LoDTensors`.
diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
deleted file mode 100644
index 691081c268b848811bf5ee6d6a41edfe0f47eec0..0000000000000000000000000000000000000000
--- a/doc/design/optimizer.md
+++ /dev/null
@@ -1,91 +0,0 @@
-## Optimizer Design
-
-### The Problem
-
-A PaddlePaddle program, or a block, is a sequence of operators operating variables.  A training program needs to do three kinds of works:
-
-1. the forward pass, which computes intermediate results and the cost(s),
-1. the backward pass, which derives gradients from intermediate results and costs, and
-1. the optimization pass, which update model parameters to optimize the cost(s).
-
-These works rely on three kinds of operators:
-
-1. forward operators,
-1. gradient operators, and
-1. optimization operators.
-
-It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
-
-In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass.
-
-
-### High-level Python API to describe the training process
-
-1. User write code to describe the network:
-
-	```python
-	images = layer.data("images")
-	labels = layer.data("labels")
-	w1 = pd.var("w1")
-	b1 = pd.var("b1")
-	hidden = layer.fc(images, w=w1, b=b1)
-	cost = layer.mse(hidden, labels)
-	```
-
-	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
-
-
-2. Users create a certain kind of Optimizer with some argument.
-
-	```python
-	optimizer = AdagradOptimizer(learing_rate=0.001)
-	```
-
-3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
-
-	```python
-	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
-	```
-	The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is list of optimization operators that will be run by session.
-
-4. Users use Session/Executor to run this opt_op_list as target to do training.
-
-	```python
-	sess.run(target= opt_op_list, ...)
-	```
-
-#### Optimizer Python interface:
-
-```python
-class Optimizer(object):
-    """Optimizer Base class.
-
-    """
-
-    def __init__(self):
-        pass
-
-    def create_optimization_pass(self, parameters_and_grads):
-        """Add optimization operators to update gradients to variables.
-
-        Args:
-          parameters_and_grads: a list of (variable, gradient) pair to update.
-
-        Returns:
-          optmization_op_list: a list of optimization operator that will update parameter using gradient.
-        """
-        return None
-
-    def minimize(self, loss, parameter_list):
-        """Add operations to minimize `loss` by updating `parameter_list`.
-
-        This method combines interface `append_backward()` and
-        `create_optimization_pass()` into one.
-        """
-        params_grads = self.create_backward_pass(loss, parameter_list)
-        update_ops = self.create_optimization_pass(params_grads)
-        return update_ops
-
-```
-
-Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
diff --git a/doc/design/parameter_average.md b/doc/design/parameter_average.md
deleted file mode 100644
index 2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f..0000000000000000000000000000000000000000
--- a/doc/design/parameter_average.md
+++ /dev/null
@@ -1,72 +0,0 @@
-# Averaging Parameter in PaddlePaddle
-
-## Why Averaging
-In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can.
-
-Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
-
-Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="./images/theta_star.gif"/><br/> . The averaging is done as follows:
-
-<img src="./images/asgd.gif" align="center"/><br/>
-
-We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
-
-### How to perform Parameter Averaging in PaddlePaddle
-
-Parameter Averaging in PaddlePaddle works in the following way during training :
-1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer
-2. The optimizer itself is responsible for updating the parameters.
-3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
-    1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches.
-    2. However, saving all the N instances of the parameters in memory is not feasible.
-    3. Therefore, an approximation algorithm is used.
-
-Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
-
-During the testing/ saving the model phase, we perform the following steps:
-1. Perform the delayed operations.
-2. Save current values of the parameters to a temporary variable.
-3. Replace the values of the parameters with the averaged values.
-4. Perform testing and/or save the parameters.
-5. Restore the values of the parameters once done.
-
-### How to implement Averaging of Parameter in PaddlePaddle
-
-We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
-
-	**Advantages**:
-    - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op.
-    - Makes it easy for the users to customize and extend the framework.
-
-	**Disadvantages**:
-    - Implementation requires re-writing the averaging methodology in Python.  
-
-### Low-Level implementation
-
-In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
-- the optimizer
-- the window_size to keep the updates
-
-The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
-
-The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
-
-### Python API implementation for ParameterAverageOptimizer
-
-Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
-- Any optimizer (RMSProp , AdaGrad etc.)
-- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision.
-
-Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
-We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc)
-
-#### Creation of the ParameterAverageOptimizer operator
-There are two ways for creating the ParameterAverageOptimizer op:
-1. We create the op immediately while building the computation graph.
-2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
-
-The proposal is to add the op immediately while building the computation graph.
-
-#### High-level API
-
-In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
diff --git a/doc/design/profiler.md b/doc/design/profiler.md
deleted file mode 100644
index b20b5efdc1f1f10ce7cec835adcc6fb374ed4e20..0000000000000000000000000000000000000000
--- a/doc/design/profiler.md
+++ /dev/null
@@ -1,97 +0,0 @@
-## Introduction
-
-There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). For most popular deep learning frameworks, they use several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning frameworks, PaddlePaddle also uses C++, CUDA and Python as the basic programming languages to adapt to run on CPU and GPU devices.  The [`nvprof` tools](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse the CUDA program.  We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) to profile CPU and Python program by [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit. The developers usually want to collect the time of each operator and locate bottlenecks.  The `nvprof` usually collect the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls and events or metrics for CUDA kernels. And the `yep` and `Google's perftools` can't collect the timeline for CUDA program. All these tools can't collect time in the operator level. So we design this profiling tool.
-
-## Architecture
-
-The work flow for most task is as follows. Each operator will run many times in the all iterations. So the profiler must collect the total time of each operator during the iteration. For more, sometimes, the developers may want to collect more detailed time span inside the operator or record time span for elsewhere, this requires that the profiler must support to record the nested time span. And in order to speedup training, all the deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs. So the profiler must be able to collect the timeline for each thread. In addition, the profiler also occupies certain resources. It must can be easily to be enabled or disabled by the developers. At last, the profiler should present a human-readable report.  
-
-```python
-for i in xrange(M):  # M is  the iteration number
-  for op in operator_lists: # The `operator_lists` contains all the operators in the network.
-    op.run();
-```
-
-In summary, the proflier should have following features:
-
-- records time span in loop.
-- supports nested time span.
-- supports multiple threads/multiple GPUs.
-- supports to be enabled and disabled by users.
-
-But how to record the time for the mixed C++ and CUDA program?  There many C++ APIs to get the current calendar time in host program. But for GPU, the CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams) and the CUDA kernels is asynchronous with the host program if there is no the synchronous aftern the CUDA kernels. CUDA provides [event](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA event, we also design and apply the events to record the timeline. Then summarize and present statistics based on these events.  
-
-The overall flow is shown as the following figure.
-
-<img src="./images/profiler.png" align="center"/><br/>
-
-### Event
-
-In above work flow, a pair of events are needed before and after the piece of code to collect time. So the event has a flag to mark whether it is a starting event or an ending event. Except this two kinds of event, sometime, a only marker with a text message is needed, for example, a marker to specify the profiling start or end. There are three kinds of event:
-
-```c++
-enum EventKind {
-  kMark,
-  kPushRange,
-  kPopRange};
-```
-- kMark: only a marker without time range.
-- kPushRange: mark the starting event for time range. 
-- kPopRange: mark the ending event for time range.
-
-For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used.  For many pieces of code, an event lists are used to record each piece. 
-
-```c++
-class Event {
- public:
-  // The DeviceContext is used to get current  CUDA stream.
-  Event(EventKind kind, std::string name, uint32_t thread_id,
-        const platform::DeviceContext* dev_ctx = nullptr);
-  double CpuElapsedUs(const Event& e) const;
-  double CudaElapsedUs(const Event& e) const;
-
- private:
-  EventKind kind_;
-  std::string name_;
-  uint32_t thread_id_;
-  int64_t cpu_ns_;
-#ifdef PADDLE_WITH_CUDA
-  cudaEvent_t event_ = nullptr;
-  int device_ = -1;
-#endif
-};
-
-struct EventList {
-  std::forward_list<std::vector<Event>> event_blocks;
-};
-```
-
-As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler. 
-
-```c++
-enum ProfilerState {
-  kDisabled, 
-  kCPU,
-  kCUDA
-};
-ProfilerState g_state;
-```
-- kDisabled: the disabled state.
-- kCPU: CPU profiling state.
-- kCUDA: GPU profiling state.
-
-A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`.
-
-```c++
-struct RecordEvent {
-  explicit RecordEvent(const std::string name,
-                       platform::DeviceContext* dev_ctx = nullptr) {
-    if (kState == ProfilerState::kDisabled) return;
-    // push the starting event to the event lists.
-  }
-  ~RecordEvent() {
-    if (kState == ProfilerState::kDisabled) return;
-    // push the ending event to the event lists.
-  }
-};
-```
diff --git a/doc/design/program.md b/doc/design/program.md
deleted file mode 100644
index bd2456787c4e336d357a65255a8274a7c9e465cc..0000000000000000000000000000000000000000
--- a/doc/design/program.md
+++ /dev/null
@@ -1,139 +0,0 @@
-# Design Doc: PaddlePaddle Programs
-
-## Compile and Execution
-
-A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
-
-A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
-
-```python
-x = layer.data("images")
-l = layer.data("label")
-y = layer.fc(x)
-cost = layer.mse(y, l)
-optimize(cost)
-train(cost, reader=mnist.train())
-```
-
-The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message.  The last line runs it.
-
-## Programs and Blocks
-
-The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
-
-- program: some nested blocks
-- [block](./block.md):
-  - some local variable definitions, and
-  - a sequence of operators
-
-The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
-
-```c++
-int main() { // block 0
-  int i = 0;
-  if (i < 10) { // block 1
-    for (int j = 0; j < 10; j++) { // block 2
-    }
-  }
-  return 0;
-}
-```
-
-The following PaddlePaddle program has three blocks:
-
-```python
-import paddle as pd  // block 0
-
-x = minibatch([10, 20, 30]) # shape=[None, 1]
-y = var(1) # shape=[1], value=1
-z = minibatch([10, 20, 30]) # shape=[None, 1]
-cond = larger_than(x, 15) # [false, true, true]
-
-ie = pd.ifelse()
-with ie.true_block():  // block 1
-    d = pd.layer.add_scalar(x, y)
-    ie.output(d, pd.layer.softmax(d))
-with ie.false_block():  // block 2
-    d = pd.layer.fc(z)
-    ie.output(d, d+1)
-o1, o2 = ie(cond)
-```
-
-## `BlockDesc` and `ProgramDesc`
-
-All protobuf messages are defined in `framework.proto`.
-
-`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
-
-```protobuf
-message BlockDesc {
-  required int32 parent = 1;
-  repeated VarDesc vars = 2;
-  repeated OpDesc ops = 3;
-}
-```
-
-The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
-
-All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
-
-```protobuf
-message ProgramDesc {
-  repeated BlockDesc blocks = 1;
-}
-```
-
-
-### Global Block
-
-The global block is the first one in the above array.
-
-## Operators that Use Blocks
-
-In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
-
-The definition of `OpDesc` shows that an operator could have some attributes:
-
-```protobuf
-message OpDesc {
-  AttrDesc attrs = 1;
-  ...
-}
-```
-
-and an attribute could be of type block, which is, in fact, a block ID as described above:
-
-```
-message AttrDesc {
-  required string name = 1;
-
-  enum AttrType {
-    INT = 1,
-    STRING = 2,
-    ...
-    BLOCK = ...
-  }
-  required AttrType type = 2;
-
-  optional int32 block = 10; // when type == BLOCK
-  ...
-}
-```
-
-## InferShape
-
-With this design, the InferShape function should take the following parameters:
-
-```c++
-void InferShape(int current_block,
-                int current_operator,
-                ProgramDesc* program // might change VarDesc values.
-                ) {
-  ...
-}
-```
-
-where
-
-- `current_block` indices into `ProgramDesc::blocks`,
-- `current_operator` indices into `BlockDesc::ops`.
diff --git a/doc/design/python_api.md b/doc/design/python_api.md
deleted file mode 100644
index 73f6d7b90c7dca0d48109cf3d28d5f7cd56b5c0b..0000000000000000000000000000000000000000
--- a/doc/design/python_api.md
+++ /dev/null
@@ -1,304 +0,0 @@
-# Design Doc: Python API
-
-Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
-
-| Python classes | Protobuf messages |
-| --- | --- |
-| Program | ProgramDesc |
-| Block | BlockDesc |
-| Operator | OpDesc |
-| Variable | VarDesc |
-
-Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
-
-## Core Concepts
-
-### Program
-
-A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
-
-Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
-
-```python
-class Program(objects):
-    def __init__(self):
-        self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
-        self.blocks = vector<Block>()
-        self.blocks.append(Block(self, -1)) # the global block
-        self.current_block = 0          # initialized to the global block
-
-    def global_block():
-        return self.blocks[0]
-
-    def current_block():
-        return self.get_block(self.current_block)
-
-    def rollback():
-        self.current_block = self.current_block().parent_idx
-
-    def create_block():
-        new_block_idx = len(self.block)
-        self.blocks.append(Block(self, self.current_block))
-        self.current_block = new_block_idx
-        return current_block()
-```
-
-`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
-
-`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
-
-### Block
-
-A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
-
-1. a map from variable names to an instance of the Python `Variable` class, and
-1. a list of `Operator` instances.
-
-```python
-class Block(objects):
-    def __init__(self, program, parent_idx):
-        self.desc = core.NewBlock(program.desc)
-        self.program = program
-        self.vars = map<string, Variable>()
-        self.ops = vector<Operator>()
-        self.parent_idx = parent_idx
-
-    def create_var(self, ...):
-        return Variable(self, ...)
-
-    def _create_global_var(self, ...):
-        program.global_block().create_var(...)
-
-    def create_parameter(self, name, ...):
-        # Parameter is a subclass of variable. See Parameter section for details.
-        self.vars[name] = Parameter(self._create_global_var(...), ...)
-        return self.vars[name]
-
-    def append_operator(self, ...):
-        self.ops.append(Operator(self, ...))
-
-    def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
-       self.ops.prepend(Operator(self, ...))
-```
-
-`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
-
-`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
-
-### Operator
-
-The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
-
-```python
-class Operator(object):
-    def __init__(self,
-                 block,  # Block
-                 type,   # string
-                 inputs, # dict<string, Variable>
-                 outputs,# dict<stirng, Variable>
-                 attrs   # dict<string, Any>
-                 ):
-        self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
-        core.infer_shape(self.desc, inputs, outputs)
-
-    def type(self):
-        return self.desc.type()
-```
-
-`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
-
-### Variable
-
-Operators take Variables as its inputs and outputs.
-
-```python
-class Variable(object):
-    def __init__(self,
-                 block=None,      # Block
-                 name=None,       # string
-                 shape,           # tuple
-                 dtype="float32", # string
-                 lod_level=None   # int
-                 ):
-        if name is None:
-            name = unique_name_generator()
-        self.name = name
-        self.block = block
-        self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
-        self.writer = None
-```
-
-Please be aware of `self.writer`, that tracks operator who creates the variable.  It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
-
-### Parameter
-
-A parameter is a global variable with an initializer (or load) operator.
-
-```python
-class Parameter(Variable):
-    def __init__(self,
-                 block=None,      # Block
-                 name=None,       # string
-                 shape,           # tuple
-                 dtype="float32", # string
-                 lod_level=None   # int
-                 trainable,       # bool
-                 initialize_op_attrs,
-                 optimize_op_attrs):
-        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
-        self.trainable = trainable
-        self.optimize_op_attrs = optimize_op_attrs
-        block.prepend(Operator(block,  # Block
-                               initialize_op_attrs['type'],   # string
-                               None,   # no inputs
-                               self,   # output is the parameter
-                               initialize_op_attrs)
-```
-
-When users create a parameter, they can call
-
-```python
-program.create_parameter(
-  ...,
-  init_attr={
-    type: "uniform_random",
-    min: -1.0,
-    max: 1.0,
-  })
-)
-```
-
-In above example, `init_attr.type` names an initialize operator.  It can also name the load operator
-
-```python
-init_attr={
- type: "load",
- filename: "something.numpy",
-}
-```
-
-`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
-
-## Layer Function
-
-A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
-
-Layer functions take `Variable` and configuration parameters as its input and return the output variable(s).
-
-For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable.
-
-
-### Necessity for reusing code between layer functions
-
-There are a lot of code that can be reused. Such as
-
-* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. and default initialize strategy for bias is to fill zero.
-* Append the activation operator.
-* Create a temporary variable.
-* Create parameter.
-* Generate a unique name.
-* Add a bias.
-* ...
-
-A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions.
-
-
-
-### Comparision between global functions and helper class
-
-The `FullyConnected` layer will be as follow when we provide global functions:
-
-```python
-def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
-  if name is None:
-    name = unique_name("fc")
-  input = multiple_input(input)
-  param_attr = default_param_attr(param_attr)
-  param_attr = multiple_param_attr(param_attr, len(input))
-
-  # mul
-  mul_results = []
-  for ipt, attr in zip(input, param_attr):
-    shape = ipt.shape[1:] + [size]
-    w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr)
-    tmp = create_tmp_var(name)
-    g_program.current_block().append_op("mul", {ipt, w}, {tmp})
-  mul_results.append(tmp)
-
-  # add sum
-  ...
-  # add bias
-  ...
-  # add activation
-  ...
-  return out
-```
-
-We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions:
-
-1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use.
-2. Global functions will force layer developers to pass its parameter time by time.
-
-So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow.
-
-```python
-def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
-  helper = LayerHelper(locals())  # pass all parameter to LayerHelper
-
-  mul_results = []
-  for ipt, param in helper.iter_multiple_input_and_param():
-    w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype)
-    tmp = helper.create_tmp_variable()
-    helper.append_op('mul', {ipt, w}, {tmp})
-    mul_results.append(tmp)
-
-  pre_bias = helper.add_sum(mul_results)
-  pre_activation = helper.add_bias(pre_bias)
-  return helper.add_activation(pre_activation)
-```
-
-We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor.
-
-
-### Implementation of layer helper
-
-We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` are:
-
-```python
-class LayerHelper(object):
-  def __init__(self, **kwargs):  # kwargs is short for `keyword arguments`
-    self.kwargs = kwargs
-
-  def add_activation(self, input_var):
-    act = self.kwargs.get("act", None)  # default value is None
-    if act is None:  # do nothing if no act
-      return input_var
-
-    tmp = self.create_tmp_var(self)
-    self.append_op(type=act, input=input_var, output=tmp)
-    return tmp
-```
-
-### Return value of layer functions
-
-The layer will return a Variable, which is also the output of an operator.  However, outputs of a layer function have more attributes than an operator. There are parameter variables, and their gradient variables need to return. To return them is useful. For example,
-
-1. Users can debug the network by printing parameter gradients.
-2. Users can append attributes to a parameter, such as, `param.stop_gradient=True` will make a parameter stop generate the gradient. We can fix the parameter value during training by using this attribute.
-
-However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field for layer function since the Python is dynamic typing.
-
-The sample usage is
-
-```python
-data = fluid.layers.data(...)
-hidden = fluid.layers.fc(data, ...)
-...
-
-executor.run(fetch_list=[hidden.param, hidden.param.grad], ...)
-```
-
-
-## Optimizer
-
-[Optimizer Design Doc](./optimizer.md)
diff --git a/doc/design/refactorization.md b/doc/design/refactorization.md
deleted file mode 100644
index f93d6155e1764386b01d2f0df3f141ab75cd55d4..0000000000000000000000000000000000000000
--- a/doc/design/refactorization.md
+++ /dev/null
@@ -1,249 +0,0 @@
-# Design Doc: Refactorization Overview
-
-The goals of refactoring include:
-
-1. Making it easy for external contributors to write new elementary computation operations.
-1. Making the codebase clean and readable.
-1. Designing a new computation representation -- a computation graph of operators and variables.
-1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
-
-## Computation Graphs
-
-1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
-
-  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
-
-1. Users write Python programs to describe the graphs and run them (locally or remotely).
-
-1. A graph is composed of *variables* and *operators*.
-
-1. The description of graphs must be serializable/deserializable, so that:
-
-   1. It can be sent to the cloud for distributed execution, and
-   1. It can be sent to clients for mobile or enterprise deployment.
-
-1. The Python program does two things
-
-   1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
-      1. the C++ library `libpaddle.so` for local execution,
-      1. the master process of a distributed training job for training, or
-      1. the server process of a Kubernetes serving job for distributed serving.
-   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
-
-## Description and Realization of Computation Graph
-
-At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
-
-At runtime, the C++ program realizes the graph and runs it.
-
-| | Representation (protobuf messages) | Realization (C++ class objects) |
-|---|---|---|
-|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
-|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
-|Block|BlockDesc|Block|
-
-The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
-
-## Compilation and Execution
-
-1. Run a Python program to describe the graph.  In particular, the Python application program does the following:
-
-   1. Create `VarDesc` to represent local/intermediate variables,
-   1. Create operators and set attributes,
-   1. Validate attribute values,
-   1. Infer the type and the shape of variables,
-   1. Plan memory-reuse for variables,
-   1. Generate the backward graph
-   1. Add optimization operators to the computation graph.
-   1. Optionally, split the graph for distributed training.
-
-1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
-
-   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
-      1. realize local variables defined in the BlockDesc message in the new scope,
-      1. a scope is similar to the stack frame in programming languages,
-
-   1. Create an instance of class `Block`, in which,
-      1. realize operators in the BlockDesc message,
-
-   1. Run the Block by calling
-      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
-      1. `Block::Eval(vector<Operator>* targets)` for optimization.
-
-
-## Intermediate Representation (IR)
-
-```text
-Compile Time -> IR -> Runtime
-```
-
-### Benefits of IR
-
-- Optimization
-  ```text
-  Compile Time -> IR -> Optimized IR -> Runtime
-  ```
-- Automatically send partitioned IR to different nodes.
-  - Automatic Data Parallelism
-    ```text
-    Compile Time
-    |-> Single GPU IR
-        |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
-            |-> Node-0 (runs trainer-IR-0)
-            |-> Node-1 (runs trainer-IR-1)
-            |-> Node-2 (runs pserver-IR)
-    ```
-  - Automatic Model Parallelism (planned for future)
-
----
-
-# Operator/OpWithKernel/OpKernel
-
-![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
-
----
-
-# Operator
-![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
-
-* `Operator` is the fundamental building block of the user interface.
-    * Operator stores input/output variable names and attributes.
-    * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
-    * Use `Run` to compute the `output` variables from the `input` variables.
-
----
-
-# OpWithKernel/Kernel
-
-![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
-
-* `OpWithKernel` inherits `Operator`.
-* `OpWithKernel` contains a Kernel map.
-    * `OpWithKernel::Run` get device's kernel, and invoke `OpKernel::Compute`.
-    * `OpKernelKey` is the map key. Only device place now, but may be data type later.
-
----
-
-# Why separate Kernel and Operator
-
-* Separate GPU and CPU code.
-    * Make Paddle capable of running without GPU.
-* Make one operator (which is a user interface) and create many implementations.
-    * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel.
----
-
-# Libraries for Kernel development
-
-* `Eigen::Tensor` contains basic math and element-wise functions.
-    * Note that `Eigen::Tensor` has broadcast implementation.
-    * Limit the number of `tensor.device(dev) = ` in your code.
-* `thrust::transform` and `std::transform`.
-    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
-    * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
-* Hand-writing `GPUKernel` and `CPU` code
-    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
----
-# Operator Registration
-
-## Why is registration necessary?
-We need a method to build mappings between Op type names and Op classes.
-
-## How is registration implemented?
-Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
-
----
-# The Registry Map
-
-### `OpInfoMap`
-
-`op_type(string)` -> `OpInfo`
-
-`OpInfo`:
-
-- **`creator`**: The Op constructor.
-- **`grad_op_type`**: The type of the gradient Op.
-- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
-- **`checker`**: Used to check attributes.
-
----
-# Related Concepts
-
-### Op_Maker
-It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
-
-### Register Macros
-```cpp
-REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
-REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
-```
-
----
-# Registration Process
-1. Write an Op class and its gradient Op class, if required.
-2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
-3. Invoke the macro `REGISTER_OP`. This macro will
-	1. Call maker class to complete `proto` and `checker`
-	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
-
----
-# Backward Module (1/2)
-### Create Backward Operator
-- Mapping from forward Op to backward Op
-![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
-
----
-# Backward Module (2/2)
-### Build Backward Network
-- **Input**: a graph of forward operators
-- **Output**: a graph of backward operators
-- **Corner cases in construction**
-	- Shared Variables => insert an `Add` operator to combine gradients
-	- No Gradient => insert a `fill_zero_grad` operator
-	- Recursive NetOp => call `Backward` recursively
-	- RNN Op => recursively call `Backward` on stepnet
-	- RNN Op => recursively call `Backward` on stepnet
-
-
----
-# Scope, Variable, Tensor
-
-* `Tensor` is an n-dimension array with type.
-	* Only dims and data pointers are stored in `Tensor`.
-	* All operations on `Tensor` are written in `Operator` or global functions.
-	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
-* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
-	* `step_scopes` in RNN is a variable and not a tensor.
-* `Scope` is where variables are stored.
-	* map<string `var name`, Variable>
-	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
-
----
-# Block (in design)
-## the difference between original RNNOp and Block
-- As an operator is more intuitive than `RNNOp`,
-- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
-- Fits the compile-time/ runtime separation design paradigm.
-  - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
-  - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
-
----
-# Milestone
-- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
-- Model migration
-  - Framework development gives **priority support** to model migration, for example,
-    - the MNIST demo needs a Python interface,
-    - the RNN models require the framework to support `LoDTensor`.
-  - Determine some timelines,
-  - Frequently used Ops need to be migrated first,
-  - Different models can be migrated in parallel.
-- Improve the framework at the same time
-- Accept imperfection, concentrate on solving the specific problem at the right price.
-
----
-# Control the migration quality
-- Compare the performance of migrated models with old ones.
-- Follow the google C++ style guide.
-- Build the automatic workflow of generating Python/C++ documentations.
-  - The documentation of layers and ops should be written inside the code.
-  - Take the documentation quality into account when submitting pull requests.
-  - Preview the documentations, read and improve them from a user's perspective.
diff --git a/doc/design/regularization.md b/doc/design/regularization.md
deleted file mode 100644
index 21280ac898feb4dd5e5a5d9e88d121e856850f0b..0000000000000000000000000000000000000000
--- a/doc/design/regularization.md
+++ /dev/null
@@ -1,72 +0,0 @@
-# Regularization in PaddlePaddle
-
-## Introduction to Regularization
-A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restricts the parameter spaces that an optimization algorithm can explore.
-
-### Parameter Norm Penalties
-Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
-
-<img src="./images/loss_equation.png" align="center"/><br/>
-
-The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
-
-The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
-
-##### L2 Regularization:
-<img src="./images/l2_regularization.png" align="center"/><br/>
-
-##### L1 Regularization
-<img src="./images/l1_regularization.png" align="center"/><br/>
-
-A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
-
-## Regularization Survey
-
-A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). 
-
-## Proposal for Regularization in PaddlePaddle
-
-### Low-Level implementation
-
-In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations:
-- L2_regularization_op
-- L1_regularization_op
-
-These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. 
-
-The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. 
-
-### Computation Graph
-
-Below is an example of a really simple feed forward neural network.
-
-<img src="./images/feed_forward.png" align="center"/><br/>
-
-The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
-
-<img src="./images/feed_forward_regularized.png" align="center"/><br/>
-   
-### Python API implementation for Regularization
-
-Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. 
-
-#### Creation of Regularization ops
-There are two possibilities for creating the regularization ops:
-1. We create these ops immediately while building the computation graph. 
-2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. 
-
-The proposal is to add these ops in a lazy manner just before the backward pass. 
-
-#### Storage of Regularization attributes
-
-Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. 
-
-#### High-level API
-
-In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
-
-
-
-
-
-    
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
deleted file mode 100644
index b9787261092f1f27377886152cb1596d9ff54188..0000000000000000000000000000000000000000
--- a/doc/design/releasing_process.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# PaddlePaddle发行规范
-
-PaddlePaddle使用git-flow branching model做分支管理，使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
-
-PaddlePaddle每次发新的版本，遵循以下流程:
-
-1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
-1. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
-1. 对这个版本的提交，做如下几个操作:
-  * 使用Regression Test List作为检查列表，测试本次release的正确性。
-	  * 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，到第二步
-	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
-	* 编译这个版本的python wheel包，并发布到pypi。
-		* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513)，在使用twine上传之前，需要重命名wheel包中platform相关的后缀，比如将`linux_x86_64`修改成`manylinux1_x86_64`。
-		* pypi上的package名称为paddlepaddle和paddlepaddle_gpu，如果要上传GPU版本的包，需要修改build/python/setup.py中，name: "paddlepaddle_gpu"并重新打包wheel包：`python setup.py bdist_wheel`。
-		* 上传方法：
-			```
-			cd build/python
-			pip install twine
-			twine upload dist/[package to upload]
-			```
-		* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
-1. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-1. 协同完成Release Note的书写
-
-
-需要注意的是:
-
-* `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试PaddlePaddle的行为。
-* 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
-
-## 发布wheel包到pypi
-
-使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
-完成自动化二进制编译，参考下图，选择需要发布的版本（通常包含一个CPU版本和一个GPU版本），点击"run"右侧的"..."按钮，可以
-弹出下面的选择框，在第二个tab (Changes)里选择需要发布的分支，这里选择0.11.0，然后点击"Run Build"按钮。等待编译完成后
-可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件，分别对应CAPI，`cp27m`和`cp27mu`的版本。然后按照上述的方法
-使用`twine`工具上传即可。
-
-<img src="ci_build_whl.png">
-
-* 注：CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
-  发型版，如果需要手动编译，也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
-* pypi不支持覆盖上传，所以一个版本号的wheel包发布之后，不可以更改。下一个wheel包需要更新版本号才可以上传。
-
-## 发布Docker镜像
-
-上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub，所以，发布Docker镜像只需要对自动push的镜像打上
-版本号对应的tag即可：
-
-1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成后是否最新。
-1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`，latest tag可以是latest或latest-gpu等。
-1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
-1. 执行 `docker push paddlepaddle/paddle:[version]`
-
-## PaddlePaddle 分支规范
-
-PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
-
-* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
-	* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
-	* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试，但并没有经过回归测试。
-	* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
-
-* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，但所有fork的版本库的所有分支都相当于特性分支。
-	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
-	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
-	* 当功能分支开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
-		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。 
-
-* BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
-
-## PaddlePaddle回归测试列表
-
-本列表说明PaddlePaddle发版之前需要测试的功能点。
-
-### PaddlePaddle Book中所有章节
-
-PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
-
-| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语意角色标注 | 机器翻译 | 个性化推荐 |
-| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| API.V2 + Docker + GPU  |  |  |  |  |  |  |  |  |
-| API.V2 + Docker + CPU  |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
-| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
-| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
diff --git a/doc/design/scope.md b/doc/design/scope.md
deleted file mode 100644
index 4da76eebb74abcd26ec2b8671399e6bc4fb58574..0000000000000000000000000000000000000000
--- a/doc/design/scope.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# Design of Scope in Paddle
-
-## Overview
-
-Scope is an important concept in programming languages, which defines a program region that a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. And in another scope, this name may refer to other entity or nothing at all. It clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes:
-
-- Scope is an association of a name to variable.
-- Variables in a parent scope can be retrieved from local scope.
-
-A detailed explanation of these two attributes goes as following.
-
-
-## Scope is an association of a name to variable.
-
-Scope is an association of a name to variable. All variables belong to `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variable in the scope.
-
-
-1. Scope only contains a map of a name to variable.
-
-   All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc.
-
-1. Variable can only be created by Scope and a variable can only be got from Scope. User cannot create or get a variable outside a scope. This is a constraints of our framework, and will keep our framework simple and clear.
-
-1. Scope only contains methods that are used to Create and Get Variables. Scope do not contain Operators and have no information to run them.
-    `Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables.
-    - `Create` is used to create a Variable by its name and add the mapping relation.
-    - `Get` is used to find a Variable by name.
-
-1. Every variable only belongs to one certain Scope.
-
-   Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`.
-
-1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. 
-
-   Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed.
-
-```cpp
-class Scope {
- public:
-  Variable* Var(const std::string& name);
-  const Variable* FindVar(const std::string& name) const;
-
- private:
-    std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-};
-```
-
-
-## Parent scope and local scope
-
-Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope.
-
-1.  We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
-2.  Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when user get a variable from a scope, it will try to search this variable in current scope. If there is no such variable in the local scope, `scope` will keep searching from its parent, until the variable is found or there is no parent.
-
-```cpp
-class Scope {
- public:
-  Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
-
-  Variable* FindVar(const std::string& name) const {
-    auto it = vars_.find(name);
-    if (it != vars_.end()) {
-      return it->second.get();
-    } else if (parent_ != nullptr) {
-      return parent_->FindVar(name);
-    } else {
-      return nullptr;
-    }
-  }
-
- private:
-  std::shared_ptr<Scope> parent_ {nullptr};
-};
-```
-
-In `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When user `Get` a variable by its `name`, the `name` will be searched inside the current scope. If the variable cannot be found locally and parent scope is not a `nullptr`, the variable will be searched inside that parent scope. `parent_` pointer's default value is `nullptr`. It means that the scope is a global scope when `parent_` is nullptr.
-
-A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily.
-
-# Interface Design
-
-```cpp
-class Variable {
- private:
-  Variable() = default;
-  friend class Scope;
-};
-
-class Scope {
- private:
-  Scope(const std::shared_ptr<Scope>& parent = nullptr);
-
- public:
-  static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
-
-  // return nullptr if not found.
-  Variable* FindVar(const std::string& name) const;
-
-  // return if already contains same name variable.
-  Variable* Var(const std::string& name);
-
- private:
-  std::shared_ptr<Scope> parent_;
-  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-};
-```
-## Only scope can create a variable
-
-To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`.
-
-## When scope destroyed, all variables inside this scope should be destroyed together
-
-The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together.
-
-## Sharing a parent scope
-
-Local scope contains a `parent_` pointer. It is a linked-list for scopes. Using a `shared_ptr` because when a local scope is using, its parents cannot be destroyed.
-
-Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. We cannot construct a scope variable, because it cannot be passed to other scope as `parent` pointer.
-
-## Orthogonal interface
-
-`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily.
diff --git a/doc/design/speech/deep_speech_2.md b/doc/design/speech/deep_speech_2.md
deleted file mode 100644
index cfdc4d6df04344c70d3334626bd38eca997c31ff..0000000000000000000000000000000000000000
--- a/doc/design/speech/deep_speech_2.md
+++ /dev/null
@@ -1,168 +0,0 @@
-# DeepSpeech2 on PaddlePaddle: Design Doc 
-
-We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
-
-- Release a basic distributed implementation of DS2 on PaddlePaddle.
-- Contribute a chapter of Deep Speech to PaddlePaddle Book.
-
-Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
-
-## Table of Contents
-
-- [Tasks](#tasks)
-- [Task Dependency](#task-dependency)
-- [Design Details](#design-details)
-    - [Overview](#overview)
-    - [Row Convolution](#row-convolution)
-    - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
-- [Future Work](#future-work)
-- [References](#references)
-
-## Tasks
-
-We roughly break down the project into 14 tasks:
-
-1. Develop an **audio data provider**:
-	- Json filelist generator.
-	- Audio file format transformer.
-	- Spectrogram feature extraction, power normalization etc.
-	- Batch data reader with SortaGrad.
-	- Data augmentation (optional).
-	- Prepare (one or more) public English data sets & baseline.
-2. Create a **simplified DS2 model configuration**:
-   - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
-	- With only bidirectional-GRU (otherwise need *Task 4*).
-	- With only greedy decoder (otherwise need *Task 5, 6*).
-3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
-   - Update `DenseScanner` in `dataprovider_converter.py`, etc.
-4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
-   - Lookahead convolution windows.
-   - Within-row convolution, without kernels shared across rows.
-5. Build KenLM **language model** (5-gram) for beam search decoder:
-   - Use KenLM toolkit.
-   - Prepare the corpus & train the model.
-   - Create infererence interfaces (for Task 6).
-6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
-   - Beam search with CTC.
-   - Beam search with external custom scorer (e.g. LM).
-   - Try to design a more general beam search interface.
-7. Develop a **Word Error Rate evaluator**:
-   - update `ctc_error_evaluator`(CER) to support WER.
-8. Prepare internal dataset for Mandarin (optional):
-    - Dataset, baseline, evaluation details.
-    - Particular data preprocessing for Mandarin.
-    - Might need cooperating with the Speech Department.
-9. Create **standard DS2 model configuration**:
-   - With variable-length audio sequences (need *Task 3*).
-	- With unidirectional-GRU + row-convolution (need *Task 4*).
-	- With CTC-LM beam search decoder (need *Task 5, 6*).
-10. Make it run perfectly on **clusters**.
-11. Experiments and **benchmarking** (for accuracy, not efficiency):
-    - With public English dataset.
-    - With internal (Baidu) Mandarin dataset (optional).
-12. Time **profiling** and optimization.
-13. Prepare **docs**.
-14. Prepare PaddlePaddle **Book** chapter with a simplified version.
-
-## Task Dependency
-
-Tasks parallelizable within phases:
-
-Roadmap     | Description                               | Parallelizable Tasks 
------------ | :------------------------------------     | :--------------------
-Phase I	    | Simplified model & components             | *Task 1* ~ *Task 8*
-Phase II    | Standard model & benchmarking & profiling | *Task 9* ~ *Task 12*
-Phase III   | Documentations                            | *Task13* ~ *Task14*
-
-Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
-
-## Design Details
-
-### Overview
-
-Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, accoustic model, pronuncation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
-
-Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge.
-
-The classical DS2 network contains 15 layers (from bottom to top):
-
-- **Two** data layers (audio spectrogram, transcription text)
-- **Three** 2D convolution layers
-- **Seven** uni-directional simple-RNN layers
-- **One** lookahead row convolution layers
-- **One** fully-connected layers
-- **One** CTC-loss layer
-
-<div align="center">
-<img src="image/ds2_network.png" width=350><br/>
-Figure 1. Archetecture of Deep Speech 2 Network.
-</div>
-
-We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments.
-
-Key ingredients about the layers:
-
-- **Data Layers**: 
-   - Frame sequences data of audio **spectrogram** (with FFT).
-   - Token sequences data of **transcription** text (labels). 
-   - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required.
-- **2D Convolution Layers**: 
-   - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
-   - With striding for only the first convlution layer.
-   - No pooling for all convolution layers.
-- **Uni-directional RNNs** 
-	- Uni-directional + row convolution: for low-latency inference.
-	- Bi-direcitional + without row convolution: if we don't care about the inference latency.
-- **Row convolution**:
-	- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
-	- Not nessesary if with bi-direcitional RNNs. 
-	- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
-- **Batch Normalization Layers**:
-   - Added to all above layers (except for data and loss layer).
-   - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
- 
-
-Required Components                     | PaddlePaddle Support                      | Need to Develop
-:-------------------------------------  | :--------------------------------------   | :-----------------------
-Data Layer I (Spectrogram)	            | Not supported yet.                        |  TBD (Task 3)
-Data Layer II (Transcription)           | `paddle.data_type.integer_value_sequence` | -
-2D Convolution Layer                    | `paddle.layer.image_conv_layer`           | -
-DataType Converter (vec2seq)            | `paddle.layer.block_expand`               | -
-Bi-/Uni-directional RNNs                | `paddle.layer.recurrent_group`            | -
-Row Convolution Layer                   | Not supported yet.                        | TBD (Task 4)
-CTC-loss Layer                          | `paddle.layer.warp_ctc`                   | -
-Batch Normalization Layer               | `paddle.layer.batch_norm`                 | -
-CTC-Beam search                         | Not supported yet.                        | TBD (Task 6)
-
-### Row Convolution
-
-TODO by Assignees
-
-### Beam Search with CTC and LM
-
-<div align="center">
-<img src="image/beam_search.png" width=600><br/>
-Figure 2. Algorithm for CTC Beam Search Decoder.
-</div>
-
-- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts: 
-   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths; 
-   - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
-- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
-- Such external scorer consists of language model, word count or any other custom scorers.
-- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
-- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality. 
- 
-
-## Future Work
-
-- Efficiency Improvement
-- Accuracy Improvement
-- Low-latency Inference Library
-- Large-scale benchmarking
-
-## References
-
-1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
-2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
-3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
deleted file mode 100644
index 8983df900460127fc130043c52373dab505363ba..0000000000000000000000000000000000000000
--- a/doc/design/support_new_device.md
+++ /dev/null
@@ -1,240 +0,0 @@
-# Design Doc: Supporting new Device/Library
-
-## Background
-
-Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
-
-On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
-
-On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
-
-So, how to support a new Device/Library in Fluid becomes a challenge.
-
-
-## Basic: Integrate A New Device/Library
-
-For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
-
-There are mainly three parts that we have to consider while integrating a new device/library:
-
-- Place and DeviceContext: indicate the device id and manage hardware resources
-
-- Memory and Tensor: malloc/free data on certain device
-
-- Math Functor and OpKernel: implement computing unit on certain devices/libraries
-
-### Place and DeviceContext
-
-Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
-
-#### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
-
-```
-        |   CPUPlace
-Place --|   CUDAPlace
-        |   FPGAPlace
-```
-
-And `Place` is defined as follows:
-
-```
-typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
-```
-
-#### DeviceContext
-
-Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
-
-
-```
-                /->  CPUDeviceContext   
-DeviceContext ---->  CUDADeviceContext  
-                \->  FPGADeviceContext
-```
-
-An example of Nvidia GPU is as follows:
-
-- DeviceContext
-
-
-```
-class DeviceContext {
-  virtual Place GetPlace() const = 0;
-};  
-```
-
-
-- CUDADeviceContext
-
-
-```
-class CUDADeviceContext : public DeviceContext {
-  Place GetPlace() const override { return place_; }
-private:
-  CUDAPlace place_;
-  cudaStream_t stream_; 
-  cublasHandle_t cublas_handle_;
-  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
-};
-```
-
-### Memory and Tensor
-
-
-#### memory module
-
-Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
-
-```
-template <typename Place>
-void* Alloc(Place place, size_t size);
-
-template <typename Place>
-void Free(Place place, void* ptr);
-
-template <typename Place>
-size_t Used(Place place);
-```
-
-To implement these interfaces, we have to implement MemoryAllocator for different Devices.
-
-
-#### Tensor
-
-[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
-
-```cpp
-class Tensor {
- public:
-  /*! Return a pointer to mutable memory block. */
-  template <typename T>
-  inline T* data();
-
-  /**
-   * @brief   Return a pointer to mutable memory block.
-   * @note    If not exist, then allocation.
-   */
-  template <typename T>
-  inline T* mutable_data(platform::Place place);
-
-  /**
-   * @brief     Return a pointer to mutable memory block.
-   *
-   * @param[in] dims    The dimensions of the memory block.
-   * @param[in] place   The place of the memory block.
-   *
-   * @note      If not exist, then allocation.
-   */
-  template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
-
-  /*! Resize the dimensions of the memory block. */
-  inline Tensor& Resize(const DDim& dims);
-
-  /*! Return the dimensions of the memory block. */
-  inline const DDim& dims() const;
-
- private:
-  /*! holds the memory block if allocated. */
-  std::shared_ptr<Placeholder> holder_;
-
-  /*! points to dimensions of memory block. */
-  DDim dim_;
-};
-```
-
-`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configurate its shape, and then call `mutuable_data` to allocate the actual memory.
-
-```cpp
-paddle::framework::Tensor t;
-paddle::platform::CPUPlace place;
-// set size first
-t.Resize({2, 3});
-// allocate memory on CPU later
-t.mutable_data(place);
-```
-
-
-
-### Math Functor and OpKernel
-
-Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
-
-Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
-
-The interface is defined in the header file.
-
-```
-template <typename DeviceContext, typename T>
-class MaxOutFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  framework::Tensor* output, int groups);
-};
-```
-
-CPU implementation is in .cc file
-
-```
-template <typename T>
-class MaxOutFunctor<platform::CPUDeviceContext, T> {
-  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* output,
-                  int groups) {
-                  ...
-                  }
-};
-```
-
-CUDA implementation is in .cu file
-
-```
-template <typename T>
-class MaxOutFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* output,
-                  int groups) {
-                  ...
-                  }
-};                  
-```
-
-
-We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
-
-The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
-
-Fluid provides different register interfaces in op_registry.h
-
-
-Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
-
-In .cc file:
-
-```
-REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
-REGISTER_OP_CPU_KERNEL(
-    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
-```
-
-In .cu file:
-
-```
-REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
-REGISTER_OP_CUDA_KERNEL(
-    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
-```
-
-
-## Advanced topics: How to switch between different Device/Library
-
-Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
-
-
-For more details, please refer to following docs:
-
-- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md)
-- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md)
diff --git a/doc/design/switch.md b/doc/design/switch.md
deleted file mode 100644
index 827d0601c621e4a230de28e2baad8e196e69625e..0000000000000000000000000000000000000000
--- a/doc/design/switch.md
+++ /dev/null
@@ -1,31 +0,0 @@
-### Design Doc: Switch
-
-### Background
-
-Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
-
-The following example shows the usage of `fluid.switch`.
-
-```python
-a = fluid.Var(10)
-b = fluid.Var(0)
-
-with switch() as switch:
-    with switch.case(fluid.less_equal(a, 10)):
-        fluid.print("Case 1")
-    with switch.case(fluid.larger(a, 0)):
-        fluid.print("Case 2")
-    with switch.default():
-        fluid.print("Case 3")
-```
-
-### The Semantics
-
-1. A `switch` control-flow checks cases one-by-one.
-1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
-1. It runs the first matched case, or the default case if there is one.
-1. Once it matches a case, it runs the corresponding branch and only that branch.  It's like there is a C's `break` keyword at the end of each case.
-
-The above program should print and print only "Case 1".
-
-The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches.
diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md
deleted file mode 100644
index 6a45af1995463402ba9c65ddb51c6c8bb107f99e..0000000000000000000000000000000000000000
--- a/doc/design/var_desc.md
+++ /dev/null
@@ -1,81 +0,0 @@
-## Background
-PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
-
-PaddlePaddle uses proto message to describe compile time program because :
-
-1. The computation program description must be serializable and saved in a file.
-1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker.
-
-The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`)  and  `Operations`. The concept to represent them is in the table below.
-
-| |compile time|runtime|
-|---|---|---|
-|Data|VarDesc(proto)|Variable(cpp)|
-|Operation|OpDesc(proto)|Operator(cpp)|
-
-
-## Definition of VarType
-
-A VarDesc should have a name, type and whether or not it is persistable. The are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
-
-```proto
-message VarDesc {
-  required string name = 1;
-  required VarType type = 2;
-  optional bool persistable = 3 [ default = false ];
-}
-```
-
-## Definition of TensorDesc
-
-```proto
-message TensorDesc {
-  // Should only be PODType. Is enforced in C++
-  required Type data_type = 1;
-  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-}
-```
-
-The `Type` here comes from the enum defined inside of `VarType` :
-
-```proto
-enum Type {
-  // Pod Types
-  BOOL = 0;
-  INT16 = 1;
-  INT32 = 2;
-  INT64 = 3;
-  FP16 = 4;
-  FP32 = 5;
-  FP64 = 6;
-
-  // Other types that may need additional descriptions
-  LOD_TENSOR = 7;
-  SELECTED_ROWS = 8;
-  FEED_MINIBATCH = 9;
-  FETCH_LIST = 10;
-  STEP_SCOPES = 11;
-  LOD_RANK_TABLE = 12;
-  LOD_TENSOR_ARRAY = 13;
-  PLACE_LIST = 14;
-  READER = 15;
-  CHANNEL = 16;
-}
-```
-
-A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md).
-
-## Definition of LodTensorDesc
-
-```proto
-message LoDTensorDesc {
-  required TensorDesc tensor = 1;
-  optional int32 lod_level = 2 [ default = 0 ];
-}
-```
-
-A LoDTensorDesc contains a tensor and a lod_level.
-
-## Definition of Variable in Python
-
-For Variable in Python, please reference [`Python API`](./python_api.md).
diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..be92af3902769a65c77953c9f3cb1f3aa3738d79
--- /dev/null
+++ b/doc/fluid/CMakeLists.txt
@@ -0,0 +1,54 @@
+if(NOT DEFINED SPHINX_THEME)
+    set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)
+endif()
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_fluid_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+    "${BINARY_BUILD_DIR_CN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_fluid_docs_cn
+                  html
+                  ${BINARY_BUILD_DIR_CN}
+                  ${SPHINX_CACHE_DIR_CN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..435d6e10fb02e9b2a8147f37da33e8848cc9b98a
--- /dev/null
+++ b/doc/fluid/api/CMakeLists.txt
@@ -0,0 +1,25 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_fluid_apis
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_fluid_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/fluid/api/average.rst b/doc/fluid/api/average.rst
new file mode 100644
index 0000000000000000000000000000000000000000..496f5b29875443f0c44f50fcb3ca837f4e7bcd12
--- /dev/null
+++ b/doc/fluid/api/average.rst
@@ -0,0 +1,16 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=============
+fluid.average
+=============
+
+.. _api_fluid_average_WeightedAverage:
+
+WeightedAverage
+---------------
+
+..  autoclass:: paddle.fluid.average.WeightedAverage
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/backward.rst b/doc/fluid/api/backward.rst
new file mode 100644
index 0000000000000000000000000000000000000000..115e0d24b39928cfc349f72e0a21d6374cd8cd75
--- /dev/null
+++ b/doc/fluid/api/backward.rst
@@ -0,0 +1,23 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.backward
+==============
+
+.. _api_fluid_backward_append_backward:
+
+append_backward
+---------------
+
+..  autofunction:: paddle.fluid.backward.append_backward
+    :noindex:
+
+.. _api_fluid_backward_calc_gradient:
+
+calc_gradient
+-------------
+
+..  autofunction:: paddle.fluid.backward.calc_gradient
+    :noindex:
+
diff --git a/doc/fluid/api/clip.rst b/doc/fluid/api/clip.rst
new file mode 100644
index 0000000000000000000000000000000000000000..aeefbb95a46e5d5ed46375e388a720fad2711779
--- /dev/null
+++ b/doc/fluid/api/clip.rst
@@ -0,0 +1,43 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+fluid.clip
+==========
+
+.. _api_fluid_clip_ErrorClipByValue:
+
+ErrorClipByValue
+----------------
+
+..  autoclass:: paddle.fluid.clip.ErrorClipByValue
+    :members:
+    :noindex:
+
+.. _api_fluid_clip_GradientClipByValue:
+
+GradientClipByValue
+-------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByValue
+    :members:
+    :noindex:
+
+.. _api_fluid_clip_GradientClipByNorm:
+
+GradientClipByNorm
+------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByNorm
+    :members:
+    :noindex:
+
+.. _api_fluid_clip_GradientClipByGlobalNorm:
+
+GradientClipByGlobalNorm
+------------------------
+
+..  autoclass:: paddle.fluid.clip.GradientClipByGlobalNorm
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/data/data_reader.rst b/doc/fluid/api/data/data_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0
--- /dev/null
+++ b/doc/fluid/api/data/data_reader.rst
@@ -0,0 +1,72 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+..  autofunction:: paddle.v2.data_type.dense_array
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_non_value_slot
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_value_slot
+    :noindex:
+
+..  autoclass:: paddle.v2.data_type.InputType
+    :members:
+    :noindex:
+
+DataFeeder
+==========
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+
+Reader
+======
+
+..  automodule:: paddle.reader
+    :members:
+    :noindex:
+
+..  automodule:: paddle.reader.creator
+    :members:
+    :noindex:
+
+minibatch
+=========
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
diff --git a/doc/fluid/api/data/dataset.rst b/doc/fluid/api/data/dataset.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e7c8be4452bf55e0967d750c2e624e8e316e9330
--- /dev/null
+++ b/doc/fluid/api/data/dataset.rst
@@ -0,0 +1,82 @@
+Dataset
+=======
+
+..  automodule:: paddle.dataset
+    :members:
+    :noindex:
+
+mnist
++++++
+
+..  automodule:: paddle.dataset.mnist
+    :members:
+    :noindex:
+
+cifar
++++++
+
+..  automodule:: paddle.dataset.cifar
+    :members:
+    :noindex:
+
+conll05
++++++++
+
+..  automodule:: paddle.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+
+imdb
+++++
+
+..  automodule:: paddle.dataset.imdb
+    :members:
+    :noindex:
+
+imikolov
+++++++++
+
+..  automodule:: paddle.dataset.imikolov
+    :members:
+    :noindex:
+
+movielens
++++++++++
+
+..  automodule:: paddle.dataset.movielens
+    :members:
+    :noindex:
+
+..  autoclass:: paddle.dataset.movielens.MovieInfo
+    :noindex:
+
+..  autoclass:: paddle.dataset.movielens.UserInfo
+    :noindex:
+
+sentiment
++++++++++
+
+..  automodule:: paddle.dataset.sentiment
+    :members:
+    :noindex:
+
+uci_housing
++++++++++++
+
+..  automodule:: paddle.dataset.uci_housing
+    :members:
+    :noindex:
+
+wmt14
++++++
+
+..  automodule:: paddle.dataset.wmt14
+    :members:
+    :noindex:
+
+wmt16
++++++
+
+..  automodule:: paddle.dataset.wmt16
+    :members:
+    :noindex:
diff --git a/doc/fluid/api/data/image.rst b/doc/fluid/api/data/image.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49
--- /dev/null
+++ b/doc/fluid/api/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+..  automodule:: paddle.v2.image
+    :members:
diff --git a/doc/fluid/api/data_feeder.rst b/doc/fluid/api/data_feeder.rst
new file mode 100644
index 0000000000000000000000000000000000000000..11d2890f5b3446e37c3ef31e5a17ebebe169dbc8
--- /dev/null
+++ b/doc/fluid/api/data_feeder.rst
@@ -0,0 +1,16 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=================
+fluid.data_feeder
+=================
+
+.. _api_fluid_data_feeder_DataFeeder:
+
+DataFeeder
+----------
+
+..  autoclass:: paddle.fluid.data_feeder.DataFeeder
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/executor.rst b/doc/fluid/api/executor.rst
new file mode 100644
index 0000000000000000000000000000000000000000..db2842e7f23e74130a966bb347004bee1ccb08fd
--- /dev/null
+++ b/doc/fluid/api/executor.rst
@@ -0,0 +1,48 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.executor
+==============
+
+.. _api_fluid_executor_Executor:
+
+Executor
+--------
+
+..  autoclass:: paddle.fluid.executor.Executor
+    :members:
+    :noindex:
+
+.. _api_fluid_executor_global_scope:
+
+global_scope
+------------
+
+..  autofunction:: paddle.fluid.executor.global_scope
+    :noindex:
+
+.. _api_fluid_executor_scope_guard:
+
+scope_guard
+-----------
+
+..  autofunction:: paddle.fluid.executor.scope_guard
+    :noindex:
+
+.. _api_fluid_executor__switch_scope:
+
+_switch_scope
+-------------
+
+..  autofunction:: paddle.fluid.executor._switch_scope
+    :noindex:
+
+.. _api_fluid_executor_fetch_var:
+
+fetch_var
+---------
+
+..  autofunction:: paddle.fluid.executor.fetch_var
+    :noindex:
+
diff --git a/doc/fluid/api/fluid.rst b/doc/fluid/api/fluid.rst
new file mode 100644
index 0000000000000000000000000000000000000000..51cdfe0c2ed045a5b3247c4fdec9868d756eae86
--- /dev/null
+++ b/doc/fluid/api/fluid.rst
@@ -0,0 +1,378 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=====
+fluid
+=====
+
+.. _api_fluid_Block:
+
+Block
+-----
+
+..  autoclass:: paddle.fluid.Block
+    :members:
+    :noindex:
+
+.. _api_fluid_Variable:
+
+Variable
+--------
+
+..  autoclass:: paddle.fluid.Variable
+    :members:
+    :noindex:
+
+.. _api_fluid_Program:
+
+Program
+-------
+
+..  autoclass:: paddle.fluid.Program
+    :members:
+    :noindex:
+
+.. _api_fluid_Operator:
+
+Operator
+--------
+
+..  autoclass:: paddle.fluid.Operator
+    :members:
+    :noindex:
+
+.. _api_fluid_default_startup_program:
+
+default_startup_program
+-----------------------
+
+..  autofunction:: paddle.fluid.default_startup_program
+    :noindex:
+
+.. _api_fluid_default_main_program:
+
+default_main_program
+--------------------
+
+..  autofunction:: paddle.fluid.default_main_program
+    :noindex:
+
+.. _api_fluid_program_guard:
+
+program_guard
+-------------
+
+..  autofunction:: paddle.fluid.program_guard
+    :noindex:
+
+.. _api_fluid_get_var:
+
+get_var
+-------
+
+..  autofunction:: paddle.fluid.get_var
+    :noindex:
+
+.. _api_fluid_Executor:
+
+Executor
+--------
+
+..  autoclass:: paddle.fluid.Executor
+    :members:
+    :noindex:
+
+.. _api_fluid_global_scope:
+
+global_scope
+------------
+
+..  autofunction:: paddle.fluid.global_scope
+    :noindex:
+
+.. _api_fluid_scope_guard:
+
+scope_guard
+-----------
+
+..  autofunction:: paddle.fluid.scope_guard
+    :noindex:
+
+.. _api_fluid__switch_scope:
+
+_switch_scope
+-------------
+
+..  autofunction:: paddle.fluid._switch_scope
+    :noindex:
+
+.. _api_fluid_fetch_var:
+
+fetch_var
+---------
+
+..  autofunction:: paddle.fluid.fetch_var
+    :noindex:
+
+.. _api_fluid_Go:
+
+Go
+--
+
+..  autoclass:: paddle.fluid.Go
+    :members:
+    :noindex:
+
+.. _api_fluid_make_channel:
+
+make_channel
+------------
+
+..  autofunction:: paddle.fluid.make_channel
+    :noindex:
+
+.. _api_fluid_channel_send:
+
+channel_send
+------------
+
+..  autofunction:: paddle.fluid.channel_send
+    :noindex:
+
+.. _api_fluid_channel_recv:
+
+channel_recv
+------------
+
+..  autofunction:: paddle.fluid.channel_recv
+    :noindex:
+
+.. _api_fluid_channel_close:
+
+channel_close
+-------------
+
+..  autofunction:: paddle.fluid.channel_close
+    :noindex:
+
+.. _api_fluid_Select:
+
+Select
+------
+
+..  autoclass:: paddle.fluid.Select
+    :members:
+    :noindex:
+
+.. _api_fluid_Trainer:
+
+Trainer
+-------
+
+..  autoclass:: paddle.fluid.Trainer
+    :members:
+    :noindex:
+
+.. _api_fluid_BeginEpochEvent:
+
+BeginEpochEvent
+---------------
+
+..  autoclass:: paddle.fluid.BeginEpochEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_EndEpochEvent:
+
+EndEpochEvent
+-------------
+
+..  autoclass:: paddle.fluid.EndEpochEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_BeginStepEvent:
+
+BeginStepEvent
+--------------
+
+..  autoclass:: paddle.fluid.BeginStepEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_EndStepEvent:
+
+EndStepEvent
+------------
+
+..  autoclass:: paddle.fluid.EndStepEvent
+    :members:
+    :noindex:
+
+.. _api_fluid_CheckpointConfig:
+
+CheckpointConfig
+----------------
+
+..  autoclass:: paddle.fluid.CheckpointConfig
+    :members:
+    :noindex:
+
+.. _api_fluid_Inferencer:
+
+Inferencer
+----------
+
+..  autoclass:: paddle.fluid.Inferencer
+    :members:
+    :noindex:
+
+.. _api_fluid_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+..  autoclass:: paddle.fluid.DistributeTranspiler
+    :members:
+    :noindex:
+
+.. _api_fluid_memory_optimize:
+
+memory_optimize
+---------------
+
+..  autofunction:: paddle.fluid.memory_optimize
+    :noindex:
+
+.. _api_fluid_release_memory:
+
+release_memory
+--------------
+
+..  autofunction:: paddle.fluid.release_memory
+    :noindex:
+
+.. _api_fluid_ParallelExecutor:
+
+ParallelExecutor
+----------------
+
+..  autoclass:: paddle.fluid.ParallelExecutor
+    :members:
+    :noindex:
+
+.. _api_fluid_ExecutionStrategy:
+
+ExecutionStrategy
+-----------------
+
+..  autoclass:: paddle.fluid.ExecutionStrategy
+    :members:
+    :noindex:
+
+.. _api_fluid_BuildStrategy:
+
+BuildStrategy
+-------------
+
+..  autoclass:: paddle.fluid.BuildStrategy
+    :members:
+    :noindex:
+
+.. _api_fluid_create_lod_tensor:
+
+create_lod_tensor
+-----------------
+
+..  autofunction:: paddle.fluid.create_lod_tensor
+    :noindex:
+
+.. _api_fluid_create_random_int_lodtensor:
+
+create_random_int_lodtensor
+---------------------------
+
+..  autofunction:: paddle.fluid.create_random_int_lodtensor
+    :noindex:
+
+.. _api_fluid_LoDTensor:
+
+LoDTensor
+---------
+
+..  autoclass:: paddle.fluid.LoDTensor
+    :members:
+    :noindex:
+
+.. _api_fluid_CPUPlace:
+
+CPUPlace
+--------
+
+..  autoclass:: paddle.fluid.CPUPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_CUDAPlace:
+
+CUDAPlace
+---------
+
+..  autoclass:: paddle.fluid.CUDAPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_CUDAPinnedPlace:
+
+CUDAPinnedPlace
+---------------
+
+..  autoclass:: paddle.fluid.CUDAPinnedPlace
+    :members:
+    :noindex:
+
+.. _api_fluid_Tensor:
+
+Tensor
+------
+
+..  autoclass:: paddle.fluid.Tensor
+    :members:
+    :noindex:
+
+.. _api_fluid_ParamAttr:
+
+ParamAttr
+---------
+
+..  autoclass:: paddle.fluid.ParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_WeightNormParamAttr:
+
+WeightNormParamAttr
+-------------------
+
+..  autoclass:: paddle.fluid.WeightNormParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_DataFeeder:
+
+DataFeeder
+----------
+
+..  autoclass:: paddle.fluid.DataFeeder
+    :members:
+    :noindex:
+
+.. _api_fluid_Scope:
+
+Scope
+-----
+
+..  autoclass:: paddle.fluid.Scope
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/gen_doc.py b/doc/fluid/api/gen_doc.py
new file mode 100644
index 0000000000000000000000000000000000000000..02efce2bf8392c62a7600c272bedcadc6563f927
--- /dev/null
+++ b/doc/fluid/api/gen_doc.py
@@ -0,0 +1,125 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import sys
+import types
+
+import paddle.fluid as fluid
+
+
+def parse_arg():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--submodules', nargs="*")
+    parser.add_argument(
+        'module', type=str, help='Generate the documentation of which module')
+    return parser.parse_args()
+
+
+class DocGenerator(object):
+    def __init__(self, module_name=None, stream=sys.stdout):
+        if module_name == "":
+            module_name = None
+        self.stream = stream
+        if module_name is None:
+            self.module_name = "fluid"
+        else:
+            self.module_name = "fluid." + module_name
+        if module_name is None:
+            self.module = fluid
+        else:
+            if not hasattr(fluid, module_name):
+                raise ValueError("Cannot find fluid.{0}".format(module_name))
+            else:
+                self.module = getattr(fluid, module_name)
+        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+''')
+
+        self._print_header_(self.module_name, dot='=', is_title=True)
+
+    def print_submodule(self, submodule_name):
+        submodule = getattr(self.module, submodule_name)
+        if submodule is None:
+            raise ValueError("Cannot find submodule {0}".format(submodule_name))
+        self.print_section(submodule_name)
+
+        for item in submodule.__all__:
+            self.print_item(item)
+
+    def print_current_module(self):
+        for item in self.module.__all__:
+            self.print_item(item)
+
+    def print_section(self, name):
+        self._print_header_(name, dot='=', is_title=False)
+
+    def print_item(self, name):
+        item = getattr(self.module, name, None)
+        if item is None:
+            return
+        if isinstance(item, types.TypeType):
+            self.print_class(name)
+        elif isinstance(item, types.FunctionType):
+            self.print_method(name)
+        else:
+            pass
+
+    def print_class(self, name):
+        self._print_ref_(name)
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autoclass:: paddle.{0}.{1}
+    :members:
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def print_method(self, name):
+        self._print_ref_(name)
+        self._print_header_(name, dot='-', is_title=False)
+        self.stream.write('''..  autofunction:: paddle.{0}.{1}
+    :noindex:
+
+'''.format(self.module_name, name))
+
+    def _print_header_(self, name, dot, is_title):
+        dot_line = dot * len(name)
+        if is_title:
+            self.stream.write(dot_line)
+            self.stream.write('\n')
+        self.stream.write(name)
+        self.stream.write('\n')
+        self.stream.write(dot_line)
+        self.stream.write('\n')
+        self.stream.write('\n')
+
+    def _print_ref_(self, name):
+        self.stream.write(".. _api_{0}_{1}:\n\n".format("_".join(
+            self.module_name.split(".")), name))
+
+
+def main():
+    args = parse_arg()
+    gen = DocGenerator(args.module)
+    if args.submodules is None:
+        gen.print_current_module()
+    else:
+        for submodule_name in args.submodules:
+            gen.print_submodule(submodule_name)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b14ee29873c50fd011f6c48b754767ac8918252a
--- /dev/null
+++ b/doc/fluid/api/gen_doc.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+python gen_doc.py layers --submodules control_flow device io nn ops tensor learning_rate_scheduler detection metric_op tensor > layers.rst
+
+for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer transpiler recordio_writer backward average profiler
+do
+  python gen_doc.py ${module} > ${module}.rst
+done
+
+python gen_doc.py "" > fluid.rst
diff --git a/doc/fluid/api/index_en.rst b/doc/fluid/api/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..359406819a993e7eaf2155c839373df44d97b103
--- /dev/null
+++ b/doc/fluid/api/index_en.rst
@@ -0,0 +1,26 @@
+=============
+API Reference
+=============
+
+..  toctree::
+    :maxdepth: 1
+
+    fluid.rst
+    layers.rst
+    data_feeder.rst
+    executor.rst
+    initializer.rst
+    metrics.rst
+    nets.rst
+    clip.rst
+    optimizer.rst
+    param_attr.rst
+    profiler.rst
+    regularizer.rst
+    io.rst
+    data.rst
+    transpiler.rst
+    recordio_writer.rst
+    backward.rst
+    average.rst
+    profiler.rst
diff --git a/doc/fluid/api/initializer.rst b/doc/fluid/api/initializer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dc0b52b14fd242dfaded1cb9a8e0ab9eb66b0607
--- /dev/null
+++ b/doc/fluid/api/initializer.rst
@@ -0,0 +1,131 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=================
+fluid.initializer
+=================
+
+.. _api_fluid_initializer_Constant:
+
+Constant
+--------
+
+..  autoclass:: paddle.fluid.initializer.Constant
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_Uniform:
+
+Uniform
+-------
+
+..  autoclass:: paddle.fluid.initializer.Uniform
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_Normal:
+
+Normal
+------
+
+..  autoclass:: paddle.fluid.initializer.Normal
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_Xavier:
+
+Xavier
+------
+
+..  autoclass:: paddle.fluid.initializer.Xavier
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_Bilinear:
+
+Bilinear
+--------
+
+..  autoclass:: paddle.fluid.initializer.Bilinear
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_MSRA:
+
+MSRA
+----
+
+..  autoclass:: paddle.fluid.initializer.MSRA
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_force_init_on_cpu:
+
+force_init_on_cpu
+-----------------
+
+..  autofunction:: paddle.fluid.initializer.force_init_on_cpu
+    :noindex:
+
+.. _api_fluid_initializer_init_on_cpu:
+
+init_on_cpu
+-----------
+
+..  autofunction:: paddle.fluid.initializer.init_on_cpu
+    :noindex:
+
+.. _api_fluid_initializer_ConstantInitializer:
+
+ConstantInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.ConstantInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_UniformInitializer:
+
+UniformInitializer
+------------------
+
+..  autoclass:: paddle.fluid.initializer.UniformInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_NormalInitializer:
+
+NormalInitializer
+-----------------
+
+..  autoclass:: paddle.fluid.initializer.NormalInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_XavierInitializer:
+
+XavierInitializer
+-----------------
+
+..  autoclass:: paddle.fluid.initializer.XavierInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_BilinearInitializer:
+
+BilinearInitializer
+-------------------
+
+..  autoclass:: paddle.fluid.initializer.BilinearInitializer
+    :members:
+    :noindex:
+
+.. _api_fluid_initializer_MSRAInitializer:
+
+MSRAInitializer
+---------------
+
+..  autoclass:: paddle.fluid.initializer.MSRAInitializer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7cee0bc4d9aa2c51517d23a381f14a8f63cc3681
--- /dev/null
+++ b/doc/fluid/api/io.rst
@@ -0,0 +1,127 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+========
+fluid.io
+========
+
+.. _api_fluid_io_save_vars:
+
+save_vars
+---------
+
+..  autofunction:: paddle.fluid.io.save_vars
+    :noindex:
+
+.. _api_fluid_io_save_params:
+
+save_params
+-----------
+
+..  autofunction:: paddle.fluid.io.save_params
+    :noindex:
+
+.. _api_fluid_io_save_persistables:
+
+save_persistables
+-----------------
+
+..  autofunction:: paddle.fluid.io.save_persistables
+    :noindex:
+
+.. _api_fluid_io_load_vars:
+
+load_vars
+---------
+
+..  autofunction:: paddle.fluid.io.load_vars
+    :noindex:
+
+.. _api_fluid_io_load_params:
+
+load_params
+-----------
+
+..  autofunction:: paddle.fluid.io.load_params
+    :noindex:
+
+.. _api_fluid_io_load_persistables:
+
+load_persistables
+-----------------
+
+..  autofunction:: paddle.fluid.io.load_persistables
+    :noindex:
+
+.. _api_fluid_io_save_inference_model:
+
+save_inference_model
+--------------------
+
+..  autofunction:: paddle.fluid.io.save_inference_model
+    :noindex:
+
+.. _api_fluid_io_load_inference_model:
+
+load_inference_model
+--------------------
+
+..  autofunction:: paddle.fluid.io.load_inference_model
+    :noindex:
+
+.. _api_fluid_io_get_inference_program:
+
+get_inference_program
+---------------------
+
+..  autofunction:: paddle.fluid.io.get_inference_program
+    :noindex:
+
+.. _api_fluid_io_save_checkpoint:
+
+save_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.save_checkpoint
+    :noindex:
+
+.. _api_fluid_io_load_checkpoint:
+
+load_checkpoint
+---------------
+
+..  autofunction:: paddle.fluid.io.load_checkpoint
+    :noindex:
+
+.. _api_fluid_io_clean_checkpoint:
+
+clean_checkpoint
+----------------
+
+..  autofunction:: paddle.fluid.io.clean_checkpoint
+    :noindex:
+
+.. _api_fluid_io_load_persist_vars_without_grad:
+
+load_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.load_persist_vars_without_grad
+    :noindex:
+
+.. _api_fluid_io_save_persist_vars_without_grad:
+
+save_persist_vars_without_grad
+------------------------------
+
+..  autofunction:: paddle.fluid.io.save_persist_vars_without_grad
+    :noindex:
+
+.. _api_fluid_io_get_latest_checkpoint_serial:
+
+get_latest_checkpoint_serial
+----------------------------
+
+..  autofunction:: paddle.fluid.io.get_latest_checkpoint_serial
+    :noindex:
+
diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d443c49657b92583e527035f49e74462cf41487d
--- /dev/null
+++ b/doc/fluid/api/layers.rst
@@ -0,0 +1,1770 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+============
+fluid.layers
+============
+
+control_flow
+============
+
+.. _api_fluid_layers_split_lod_tensor:
+
+split_lod_tensor
+----------------
+
+..  autofunction:: paddle.fluid.layers.split_lod_tensor
+    :noindex:
+
+.. _api_fluid_layers_merge_lod_tensor:
+
+merge_lod_tensor
+----------------
+
+..  autofunction:: paddle.fluid.layers.merge_lod_tensor
+    :noindex:
+
+.. _api_fluid_layers_BlockGuard:
+
+BlockGuard
+----------
+
+..  autoclass:: paddle.fluid.layers.BlockGuard
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_BlockGuardWithCompletion:
+
+BlockGuardWithCompletion
+------------------------
+
+..  autoclass:: paddle.fluid.layers.BlockGuardWithCompletion
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_WhileGuard:
+
+WhileGuard
+----------
+
+..  autoclass:: paddle.fluid.layers.WhileGuard
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_While:
+
+While
+-----
+
+..  autoclass:: paddle.fluid.layers.While
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_Switch:
+
+Switch
+------
+
+..  autoclass:: paddle.fluid.layers.Switch
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_lod_rank_table:
+
+lod_rank_table
+--------------
+
+..  autofunction:: paddle.fluid.layers.lod_rank_table
+    :noindex:
+
+.. _api_fluid_layers_max_sequence_len:
+
+max_sequence_len
+----------------
+
+..  autofunction:: paddle.fluid.layers.max_sequence_len
+    :noindex:
+
+.. _api_fluid_layers_lod_tensor_to_array:
+
+lod_tensor_to_array
+-------------------
+
+..  autofunction:: paddle.fluid.layers.lod_tensor_to_array
+    :noindex:
+
+.. _api_fluid_layers_array_to_lod_tensor:
+
+array_to_lod_tensor
+-------------------
+
+..  autofunction:: paddle.fluid.layers.array_to_lod_tensor
+    :noindex:
+
+.. _api_fluid_layers_increment:
+
+increment
+---------
+
+..  autofunction:: paddle.fluid.layers.increment
+    :noindex:
+
+.. _api_fluid_layers_array_write:
+
+array_write
+-----------
+
+..  autofunction:: paddle.fluid.layers.array_write
+    :noindex:
+
+.. _api_fluid_layers_create_array:
+
+create_array
+------------
+
+..  autofunction:: paddle.fluid.layers.create_array
+    :noindex:
+
+.. _api_fluid_layers_less_than:
+
+less_than
+---------
+
+..  autofunction:: paddle.fluid.layers.less_than
+    :noindex:
+
+.. _api_fluid_layers_equal:
+
+equal
+-----
+
+..  autofunction:: paddle.fluid.layers.equal
+    :noindex:
+
+.. _api_fluid_layers_array_read:
+
+array_read
+----------
+
+..  autofunction:: paddle.fluid.layers.array_read
+    :noindex:
+
+.. _api_fluid_layers_shrink_memory:
+
+shrink_memory
+-------------
+
+..  autofunction:: paddle.fluid.layers.shrink_memory
+    :noindex:
+
+.. _api_fluid_layers_array_length:
+
+array_length
+------------
+
+..  autofunction:: paddle.fluid.layers.array_length
+    :noindex:
+
+.. _api_fluid_layers_IfElse:
+
+IfElse
+------
+
+..  autoclass:: paddle.fluid.layers.IfElse
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_DynamicRNN:
+
+DynamicRNN
+----------
+
+..  autoclass:: paddle.fluid.layers.DynamicRNN
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_ConditionalBlock:
+
+ConditionalBlock
+----------------
+
+..  autoclass:: paddle.fluid.layers.ConditionalBlock
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_StaticRNN:
+
+StaticRNN
+---------
+
+..  autoclass:: paddle.fluid.layers.StaticRNN
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_reorder_lod_tensor_by_rank:
+
+reorder_lod_tensor_by_rank
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
+    :noindex:
+
+.. _api_fluid_layers_ParallelDo:
+
+ParallelDo
+----------
+
+..  autoclass:: paddle.fluid.layers.ParallelDo
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_Print:
+
+Print
+-----
+
+..  autofunction:: paddle.fluid.layers.Print
+    :noindex:
+
+.. _api_fluid_layers_is_empty:
+
+is_empty
+--------
+
+..  autofunction:: paddle.fluid.layers.is_empty
+    :noindex:
+
+device
+======
+
+.. _api_fluid_layers_get_places:
+
+get_places
+----------
+
+..  autofunction:: paddle.fluid.layers.get_places
+    :noindex:
+
+io
+==
+
+.. _api_fluid_layers_data:
+
+data
+----
+
+..  autofunction:: paddle.fluid.layers.data
+    :noindex:
+
+.. _api_fluid_layers_BlockGuardServ:
+
+BlockGuardServ
+--------------
+
+..  autoclass:: paddle.fluid.layers.BlockGuardServ
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_ListenAndServ:
+
+ListenAndServ
+-------------
+
+..  autoclass:: paddle.fluid.layers.ListenAndServ
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_Send:
+
+Send
+----
+
+..  autofunction:: paddle.fluid.layers.Send
+    :noindex:
+
+.. _api_fluid_layers_Recv:
+
+Recv
+----
+
+..  autofunction:: paddle.fluid.layers.Recv
+    :noindex:
+
+.. _api_fluid_layers_open_recordio_file:
+
+open_recordio_file
+------------------
+
+..  autofunction:: paddle.fluid.layers.open_recordio_file
+    :noindex:
+
+.. _api_fluid_layers_open_files:
+
+open_files
+----------
+
+..  autofunction:: paddle.fluid.layers.open_files
+    :noindex:
+
+.. _api_fluid_layers_read_file:
+
+read_file
+---------
+
+..  autofunction:: paddle.fluid.layers.read_file
+    :noindex:
+
+.. _api_fluid_layers_shuffle:
+
+shuffle
+-------
+
+..  autofunction:: paddle.fluid.layers.shuffle
+    :noindex:
+
+.. _api_fluid_layers_batch:
+
+batch
+-----
+
+..  autofunction:: paddle.fluid.layers.batch
+    :noindex:
+
+.. _api_fluid_layers_double_buffer:
+
+double_buffer
+-------------
+
+..  autofunction:: paddle.fluid.layers.double_buffer
+    :noindex:
+
+.. _api_fluid_layers_random_data_generator:
+
+random_data_generator
+---------------------
+
+..  autofunction:: paddle.fluid.layers.random_data_generator
+    :noindex:
+
+.. _api_fluid_layers_Preprocessor:
+
+Preprocessor
+------------
+
+..  autoclass:: paddle.fluid.layers.Preprocessor
+    :members:
+    :noindex:
+
+.. _api_fluid_layers_load:
+
+load
+----
+
+..  autofunction:: paddle.fluid.layers.load
+    :noindex:
+
+nn
+==
+
+.. _api_fluid_layers_fc:
+
+fc
+--
+
+..  autofunction:: paddle.fluid.layers.fc
+    :noindex:
+
+.. _api_fluid_layers_embedding:
+
+embedding
+---------
+
+..  autofunction:: paddle.fluid.layers.embedding
+    :noindex:
+
+.. _api_fluid_layers_dynamic_lstm:
+
+dynamic_lstm
+------------
+
+..  autofunction:: paddle.fluid.layers.dynamic_lstm
+    :noindex:
+
+.. _api_fluid_layers_dynamic_lstmp:
+
+dynamic_lstmp
+-------------
+
+..  autofunction:: paddle.fluid.layers.dynamic_lstmp
+    :noindex:
+
+.. _api_fluid_layers_dynamic_gru:
+
+dynamic_gru
+-----------
+
+..  autofunction:: paddle.fluid.layers.dynamic_gru
+    :noindex:
+
+.. _api_fluid_layers_gru_unit:
+
+gru_unit
+--------
+
+..  autofunction:: paddle.fluid.layers.gru_unit
+    :noindex:
+
+.. _api_fluid_layers_linear_chain_crf:
+
+linear_chain_crf
+----------------
+
+..  autofunction:: paddle.fluid.layers.linear_chain_crf
+    :noindex:
+
+.. _api_fluid_layers_crf_decoding:
+
+crf_decoding
+------------
+
+..  autofunction:: paddle.fluid.layers.crf_decoding
+    :noindex:
+
+.. _api_fluid_layers_cos_sim:
+
+cos_sim
+-------
+
+..  autofunction:: paddle.fluid.layers.cos_sim
+    :noindex:
+
+.. _api_fluid_layers_cross_entropy:
+
+cross_entropy
+-------------
+
+..  autofunction:: paddle.fluid.layers.cross_entropy
+    :noindex:
+
+.. _api_fluid_layers_square_error_cost:
+
+square_error_cost
+-----------------
+
+..  autofunction:: paddle.fluid.layers.square_error_cost
+    :noindex:
+
+.. _api_fluid_layers_chunk_eval:
+
+chunk_eval
+----------
+
+..  autofunction:: paddle.fluid.layers.chunk_eval
+    :noindex:
+
+.. _api_fluid_layers_sequence_conv:
+
+sequence_conv
+-------------
+
+..  autofunction:: paddle.fluid.layers.sequence_conv
+    :noindex:
+
+.. _api_fluid_layers_conv2d:
+
+conv2d
+------
+
+..  autofunction:: paddle.fluid.layers.conv2d
+    :noindex:
+
+.. _api_fluid_layers_conv3d:
+
+conv3d
+------
+
+..  autofunction:: paddle.fluid.layers.conv3d
+    :noindex:
+
+.. _api_fluid_layers_sequence_pool:
+
+sequence_pool
+-------------
+
+..  autofunction:: paddle.fluid.layers.sequence_pool
+    :noindex:
+
+.. _api_fluid_layers_sequence_softmax:
+
+sequence_softmax
+----------------
+
+..  autofunction:: paddle.fluid.layers.sequence_softmax
+    :noindex:
+
+.. _api_fluid_layers_softmax:
+
+softmax
+-------
+
+..  autofunction:: paddle.fluid.layers.softmax
+    :noindex:
+
+.. _api_fluid_layers_pool2d:
+
+pool2d
+------
+
+..  autofunction:: paddle.fluid.layers.pool2d
+    :noindex:
+
+.. _api_fluid_layers_pool3d:
+
+pool3d
+------
+
+..  autofunction:: paddle.fluid.layers.pool3d
+    :noindex:
+
+.. _api_fluid_layers_batch_norm:
+
+batch_norm
+----------
+
+..  autofunction:: paddle.fluid.layers.batch_norm
+    :noindex:
+
+.. _api_fluid_layers_beam_search_decode:
+
+beam_search_decode
+------------------
+
+..  autofunction:: paddle.fluid.layers.beam_search_decode
+    :noindex:
+
+.. _api_fluid_layers_conv2d_transpose:
+
+conv2d_transpose
+----------------
+
+..  autofunction:: paddle.fluid.layers.conv2d_transpose
+    :noindex:
+
+.. _api_fluid_layers_conv3d_transpose:
+
+conv3d_transpose
+----------------
+
+..  autofunction:: paddle.fluid.layers.conv3d_transpose
+    :noindex:
+
+.. _api_fluid_layers_sequence_expand:
+
+sequence_expand
+---------------
+
+..  autofunction:: paddle.fluid.layers.sequence_expand
+    :noindex:
+
+.. _api_fluid_layers_lstm_unit:
+
+lstm_unit
+---------
+
+..  autofunction:: paddle.fluid.layers.lstm_unit
+    :noindex:
+
+.. _api_fluid_layers_reduce_sum:
+
+reduce_sum
+----------
+
+..  autofunction:: paddle.fluid.layers.reduce_sum
+    :noindex:
+
+.. _api_fluid_layers_reduce_mean:
+
+reduce_mean
+-----------
+
+..  autofunction:: paddle.fluid.layers.reduce_mean
+    :noindex:
+
+.. _api_fluid_layers_reduce_max:
+
+reduce_max
+----------
+
+..  autofunction:: paddle.fluid.layers.reduce_max
+    :noindex:
+
+.. _api_fluid_layers_reduce_min:
+
+reduce_min
+----------
+
+..  autofunction:: paddle.fluid.layers.reduce_min
+    :noindex:
+
+.. _api_fluid_layers_reduce_prod:
+
+reduce_prod
+-----------
+
+..  autofunction:: paddle.fluid.layers.reduce_prod
+    :noindex:
+
+.. _api_fluid_layers_sequence_first_step:
+
+sequence_first_step
+-------------------
+
+..  autofunction:: paddle.fluid.layers.sequence_first_step
+    :noindex:
+
+.. _api_fluid_layers_sequence_last_step:
+
+sequence_last_step
+------------------
+
+..  autofunction:: paddle.fluid.layers.sequence_last_step
+    :noindex:
+
+.. _api_fluid_layers_dropout:
+
+dropout
+-------
+
+..  autofunction:: paddle.fluid.layers.dropout
+    :noindex:
+
+.. _api_fluid_layers_split:
+
+split
+-----
+
+..  autofunction:: paddle.fluid.layers.split
+    :noindex:
+
+.. _api_fluid_layers_ctc_greedy_decoder:
+
+ctc_greedy_decoder
+------------------
+
+..  autofunction:: paddle.fluid.layers.ctc_greedy_decoder
+    :noindex:
+
+.. _api_fluid_layers_edit_distance:
+
+edit_distance
+-------------
+
+..  autofunction:: paddle.fluid.layers.edit_distance
+    :noindex:
+
+.. _api_fluid_layers_l2_normalize:
+
+l2_normalize
+------------
+
+..  autofunction:: paddle.fluid.layers.l2_normalize
+    :noindex:
+
+.. _api_fluid_layers_matmul:
+
+matmul
+------
+
+..  autofunction:: paddle.fluid.layers.matmul
+    :noindex:
+
+.. _api_fluid_layers_topk:
+
+topk
+----
+
+..  autofunction:: paddle.fluid.layers.topk
+    :noindex:
+
+.. _api_fluid_layers_warpctc:
+
+warpctc
+-------
+
+..  autofunction:: paddle.fluid.layers.warpctc
+    :noindex:
+
+.. _api_fluid_layers_sequence_reshape:
+
+sequence_reshape
+----------------
+
+..  autofunction:: paddle.fluid.layers.sequence_reshape
+    :noindex:
+
+.. _api_fluid_layers_transpose:
+
+transpose
+---------
+
+..  autofunction:: paddle.fluid.layers.transpose
+    :noindex:
+
+.. _api_fluid_layers_im2sequence:
+
+im2sequence
+-----------
+
+..  autofunction:: paddle.fluid.layers.im2sequence
+    :noindex:
+
+.. _api_fluid_layers_nce:
+
+nce
+---
+
+..  autofunction:: paddle.fluid.layers.nce
+    :noindex:
+
+.. _api_fluid_layers_beam_search:
+
+beam_search
+-----------
+
+..  autofunction:: paddle.fluid.layers.beam_search
+    :noindex:
+
+.. _api_fluid_layers_row_conv:
+
+row_conv
+--------
+
+..  autofunction:: paddle.fluid.layers.row_conv
+    :noindex:
+
+.. _api_fluid_layers_multiplex:
+
+multiplex
+---------
+
+..  autofunction:: paddle.fluid.layers.multiplex
+    :noindex:
+
+.. _api_fluid_layers_layer_norm:
+
+layer_norm
+----------
+
+..  autofunction:: paddle.fluid.layers.layer_norm
+    :noindex:
+
+.. _api_fluid_layers_softmax_with_cross_entropy:
+
+softmax_with_cross_entropy
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.softmax_with_cross_entropy
+    :noindex:
+
+.. _api_fluid_layers_smooth_l1:
+
+smooth_l1
+---------
+
+..  autofunction:: paddle.fluid.layers.smooth_l1
+    :noindex:
+
+.. _api_fluid_layers_one_hot:
+
+one_hot
+-------
+
+..  autofunction:: paddle.fluid.layers.one_hot
+    :noindex:
+
+.. _api_fluid_layers_autoincreased_step_counter:
+
+autoincreased_step_counter
+--------------------------
+
+..  autofunction:: paddle.fluid.layers.autoincreased_step_counter
+    :noindex:
+
+.. _api_fluid_layers_reshape:
+
+reshape
+-------
+
+..  autofunction:: paddle.fluid.layers.reshape
+    :noindex:
+
+.. _api_fluid_layers_lod_reset:
+
+lod_reset
+---------
+
+..  autofunction:: paddle.fluid.layers.lod_reset
+    :noindex:
+
+.. _api_fluid_layers_lrn:
+
+lrn
+---
+
+..  autofunction:: paddle.fluid.layers.lrn
+    :noindex:
+
+.. _api_fluid_layers_pad:
+
+pad
+---
+
+..  autofunction:: paddle.fluid.layers.pad
+    :noindex:
+
+.. _api_fluid_layers_label_smooth:
+
+label_smooth
+------------
+
+..  autofunction:: paddle.fluid.layers.label_smooth
+    :noindex:
+
+.. _api_fluid_layers_roi_pool:
+
+roi_pool
+--------
+
+..  autofunction:: paddle.fluid.layers.roi_pool
+    :noindex:
+
+.. _api_fluid_layers_dice_loss:
+
+dice_loss
+---------
+
+..  autofunction:: paddle.fluid.layers.dice_loss
+    :noindex:
+
+.. _api_fluid_layers_image_resize:
+
+image_resize
+------------
+
+..  autofunction:: paddle.fluid.layers.image_resize
+    :noindex:
+
+.. _api_fluid_layers_image_resize_short:
+
+image_resize_short
+------------------
+
+..  autofunction:: paddle.fluid.layers.image_resize_short
+    :noindex:
+
+.. _api_fluid_layers_resize_bilinear:
+
+resize_bilinear
+---------------
+
+..  autofunction:: paddle.fluid.layers.resize_bilinear
+    :noindex:
+
+.. _api_fluid_layers_gather:
+
+gather
+------
+
+..  autofunction:: paddle.fluid.layers.gather
+    :noindex:
+
+.. _api_fluid_layers_random_crop:
+
+random_crop
+-----------
+
+..  autofunction:: paddle.fluid.layers.random_crop
+    :noindex:
+
+.. _api_fluid_layers_mean_iou:
+
+mean_iou
+--------
+
+..  autofunction:: paddle.fluid.layers.mean_iou
+    :noindex:
+
+.. _api_fluid_layers_relu:
+
+relu
+----
+
+..  autofunction:: paddle.fluid.layers.relu
+    :noindex:
+
+.. _api_fluid_layers_log:
+
+log
+---
+
+..  autofunction:: paddle.fluid.layers.log
+    :noindex:
+
+.. _api_fluid_layers_crop:
+
+crop
+----
+
+..  autofunction:: paddle.fluid.layers.crop
+    :noindex:
+
+ops
+===
+
+.. _api_fluid_layers_mean:
+
+mean
+----
+
+..  autofunction:: paddle.fluid.layers.mean
+    :noindex:
+
+.. _api_fluid_layers_mul:
+
+mul
+---
+
+..  autofunction:: paddle.fluid.layers.mul
+    :noindex:
+
+.. _api_fluid_layers_scale:
+
+scale
+-----
+
+..  autofunction:: paddle.fluid.layers.scale
+    :noindex:
+
+.. _api_fluid_layers_sigmoid_cross_entropy_with_logits:
+
+sigmoid_cross_entropy_with_logits
+---------------------------------
+
+..  autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
+    :noindex:
+
+.. _api_fluid_layers_elementwise_add:
+
+elementwise_add
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_add
+    :noindex:
+
+.. _api_fluid_layers_elementwise_div:
+
+elementwise_div
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_div
+    :noindex:
+
+.. _api_fluid_layers_elementwise_sub:
+
+elementwise_sub
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_sub
+    :noindex:
+
+.. _api_fluid_layers_elementwise_mul:
+
+elementwise_mul
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_mul
+    :noindex:
+
+.. _api_fluid_layers_elementwise_max:
+
+elementwise_max
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_max
+    :noindex:
+
+.. _api_fluid_layers_elementwise_min:
+
+elementwise_min
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_min
+    :noindex:
+
+.. _api_fluid_layers_elementwise_pow:
+
+elementwise_pow
+---------------
+
+..  autofunction:: paddle.fluid.layers.elementwise_pow
+    :noindex:
+
+.. _api_fluid_layers_clip:
+
+clip
+----
+
+..  autofunction:: paddle.fluid.layers.clip
+    :noindex:
+
+.. _api_fluid_layers_clip_by_norm:
+
+clip_by_norm
+------------
+
+..  autofunction:: paddle.fluid.layers.clip_by_norm
+    :noindex:
+
+.. _api_fluid_layers_logical_and:
+
+logical_and
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_and
+    :noindex:
+
+.. _api_fluid_layers_logical_or:
+
+logical_or
+----------
+
+..  autofunction:: paddle.fluid.layers.logical_or
+    :noindex:
+
+.. _api_fluid_layers_logical_xor:
+
+logical_xor
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_xor
+    :noindex:
+
+.. _api_fluid_layers_logical_not:
+
+logical_not
+-----------
+
+..  autofunction:: paddle.fluid.layers.logical_not
+    :noindex:
+
+.. _api_fluid_layers_uniform_random_batch_size_like:
+
+uniform_random_batch_size_like
+------------------------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random_batch_size_like
+    :noindex:
+
+.. _api_fluid_layers_gaussian_random:
+
+gaussian_random
+---------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random
+    :noindex:
+
+.. _api_fluid_layers_gaussian_random_batch_size_like:
+
+gaussian_random_batch_size_like
+-------------------------------
+
+..  autofunction:: paddle.fluid.layers.gaussian_random_batch_size_like
+    :noindex:
+
+.. _api_fluid_layers_scatter:
+
+scatter
+-------
+
+..  autofunction:: paddle.fluid.layers.scatter
+    :noindex:
+
+.. _api_fluid_layers_sum:
+
+sum
+---
+
+..  autofunction:: paddle.fluid.layers.sum
+    :noindex:
+
+.. _api_fluid_layers_slice:
+
+slice
+-----
+
+..  autofunction:: paddle.fluid.layers.slice
+    :noindex:
+
+.. _api_fluid_layers_polygon_box_transform:
+
+polygon_box_transform
+---------------------
+
+..  autofunction:: paddle.fluid.layers.polygon_box_transform
+    :noindex:
+
+.. _api_fluid_layers_shape:
+
+shape
+-----
+
+..  autofunction:: paddle.fluid.layers.shape
+    :noindex:
+
+.. _api_fluid_layers_iou_similarity:
+
+iou_similarity
+--------------
+
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+
+.. _api_fluid_layers_maxout:
+
+maxout
+------
+
+..  autofunction:: paddle.fluid.layers.maxout
+    :noindex:
+
+.. _api_fluid_layers_sigmoid:
+
+sigmoid
+-------
+
+..  autofunction:: paddle.fluid.layers.sigmoid
+    :noindex:
+
+.. _api_fluid_layers_logsigmoid:
+
+logsigmoid
+----------
+
+..  autofunction:: paddle.fluid.layers.logsigmoid
+    :noindex:
+
+.. _api_fluid_layers_exp:
+
+exp
+---
+
+..  autofunction:: paddle.fluid.layers.exp
+    :noindex:
+
+.. _api_fluid_layers_tanh:
+
+tanh
+----
+
+..  autofunction:: paddle.fluid.layers.tanh
+    :noindex:
+
+.. _api_fluid_layers_tanh_shrink:
+
+tanh_shrink
+-----------
+
+..  autofunction:: paddle.fluid.layers.tanh_shrink
+    :noindex:
+
+.. _api_fluid_layers_softshrink:
+
+softshrink
+----------
+
+..  autofunction:: paddle.fluid.layers.softshrink
+    :noindex:
+
+.. _api_fluid_layers_sqrt:
+
+sqrt
+----
+
+..  autofunction:: paddle.fluid.layers.sqrt
+    :noindex:
+
+.. _api_fluid_layers_abs:
+
+abs
+---
+
+..  autofunction:: paddle.fluid.layers.abs
+    :noindex:
+
+.. _api_fluid_layers_ceil:
+
+ceil
+----
+
+..  autofunction:: paddle.fluid.layers.ceil
+    :noindex:
+
+.. _api_fluid_layers_floor:
+
+floor
+-----
+
+..  autofunction:: paddle.fluid.layers.floor
+    :noindex:
+
+.. _api_fluid_layers_cos:
+
+cos
+---
+
+..  autofunction:: paddle.fluid.layers.cos
+    :noindex:
+
+.. _api_fluid_layers_sin:
+
+sin
+---
+
+..  autofunction:: paddle.fluid.layers.sin
+    :noindex:
+
+.. _api_fluid_layers_round:
+
+round
+-----
+
+..  autofunction:: paddle.fluid.layers.round
+    :noindex:
+
+.. _api_fluid_layers_reciprocal:
+
+reciprocal
+----------
+
+..  autofunction:: paddle.fluid.layers.reciprocal
+    :noindex:
+
+.. _api_fluid_layers_square:
+
+square
+------
+
+..  autofunction:: paddle.fluid.layers.square
+    :noindex:
+
+.. _api_fluid_layers_softplus:
+
+softplus
+--------
+
+..  autofunction:: paddle.fluid.layers.softplus
+    :noindex:
+
+.. _api_fluid_layers_softsign:
+
+softsign
+--------
+
+..  autofunction:: paddle.fluid.layers.softsign
+    :noindex:
+
+.. _api_fluid_layers_brelu:
+
+brelu
+-----
+
+..  autofunction:: paddle.fluid.layers.brelu
+    :noindex:
+
+.. _api_fluid_layers_leaky_relu:
+
+leaky_relu
+----------
+
+..  autofunction:: paddle.fluid.layers.leaky_relu
+    :noindex:
+
+.. _api_fluid_layers_soft_relu:
+
+soft_relu
+---------
+
+..  autofunction:: paddle.fluid.layers.soft_relu
+    :noindex:
+
+.. _api_fluid_layers_elu:
+
+elu
+---
+
+..  autofunction:: paddle.fluid.layers.elu
+    :noindex:
+
+.. _api_fluid_layers_relu6:
+
+relu6
+-----
+
+..  autofunction:: paddle.fluid.layers.relu6
+    :noindex:
+
+.. _api_fluid_layers_pow:
+
+pow
+---
+
+..  autofunction:: paddle.fluid.layers.pow
+    :noindex:
+
+.. _api_fluid_layers_stanh:
+
+stanh
+-----
+
+..  autofunction:: paddle.fluid.layers.stanh
+    :noindex:
+
+.. _api_fluid_layers_hard_sigmoid:
+
+hard_sigmoid
+------------
+
+..  autofunction:: paddle.fluid.layers.hard_sigmoid
+    :noindex:
+
+.. _api_fluid_layers_swish:
+
+swish
+-----
+
+..  autofunction:: paddle.fluid.layers.swish
+    :noindex:
+
+.. _api_fluid_layers_uniform_random:
+
+uniform_random
+--------------
+
+..  autofunction:: paddle.fluid.layers.uniform_random
+    :noindex:
+
+.. _api_fluid_layers_hard_shrink:
+
+hard_shrink
+-----------
+
+..  autofunction:: paddle.fluid.layers.hard_shrink
+    :noindex:
+
+.. _api_fluid_layers_cumsum:
+
+cumsum
+------
+
+..  autofunction:: paddle.fluid.layers.cumsum
+    :noindex:
+
+.. _api_fluid_layers_thresholded_relu:
+
+thresholded_relu
+----------------
+
+..  autofunction:: paddle.fluid.layers.thresholded_relu
+    :noindex:
+
+tensor
+======
+
+.. _api_fluid_layers_create_tensor:
+
+create_tensor
+-------------
+
+..  autofunction:: paddle.fluid.layers.create_tensor
+    :noindex:
+
+.. _api_fluid_layers_create_parameter:
+
+create_parameter
+----------------
+
+..  autofunction:: paddle.fluid.layers.create_parameter
+    :noindex:
+
+.. _api_fluid_layers_create_global_var:
+
+create_global_var
+-----------------
+
+..  autofunction:: paddle.fluid.layers.create_global_var
+    :noindex:
+
+.. _api_fluid_layers_cast:
+
+cast
+----
+
+..  autofunction:: paddle.fluid.layers.cast
+    :noindex:
+
+.. _api_fluid_layers_concat:
+
+concat
+------
+
+..  autofunction:: paddle.fluid.layers.concat
+    :noindex:
+
+.. _api_fluid_layers_sums:
+
+sums
+----
+
+..  autofunction:: paddle.fluid.layers.sums
+    :noindex:
+
+.. _api_fluid_layers_assign:
+
+assign
+------
+
+..  autofunction:: paddle.fluid.layers.assign
+    :noindex:
+
+.. _api_fluid_layers_fill_constant_batch_size_like:
+
+fill_constant_batch_size_like
+-----------------------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+
+.. _api_fluid_layers_fill_constant:
+
+fill_constant
+-------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant
+    :noindex:
+
+.. _api_fluid_layers_argmin:
+
+argmin
+------
+
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+
+.. _api_fluid_layers_argmax:
+
+argmax
+------
+
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+
+.. _api_fluid_layers_argsort:
+
+argsort
+-------
+
+..  autofunction:: paddle.fluid.layers.argsort
+    :noindex:
+
+.. _api_fluid_layers_ones:
+
+ones
+----
+
+..  autofunction:: paddle.fluid.layers.ones
+    :noindex:
+
+.. _api_fluid_layers_zeros:
+
+zeros
+-----
+
+..  autofunction:: paddle.fluid.layers.zeros
+    :noindex:
+
+.. _api_fluid_layers_reverse:
+
+reverse
+-------
+
+..  autofunction:: paddle.fluid.layers.reverse
+    :noindex:
+
+learning_rate_scheduler
+=======================
+
+.. _api_fluid_layers_exponential_decay:
+
+exponential_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.exponential_decay
+    :noindex:
+
+.. _api_fluid_layers_natural_exp_decay:
+
+natural_exp_decay
+-----------------
+
+..  autofunction:: paddle.fluid.layers.natural_exp_decay
+    :noindex:
+
+.. _api_fluid_layers_inverse_time_decay:
+
+inverse_time_decay
+------------------
+
+..  autofunction:: paddle.fluid.layers.inverse_time_decay
+    :noindex:
+
+.. _api_fluid_layers_polynomial_decay:
+
+polynomial_decay
+----------------
+
+..  autofunction:: paddle.fluid.layers.polynomial_decay
+    :noindex:
+
+.. _api_fluid_layers_piecewise_decay:
+
+piecewise_decay
+---------------
+
+..  autofunction:: paddle.fluid.layers.piecewise_decay
+    :noindex:
+
+.. _api_fluid_layers_noam_decay:
+
+noam_decay
+----------
+
+..  autofunction:: paddle.fluid.layers.noam_decay
+    :noindex:
+
+.. _api_fluid_layers_append_LARS:
+
+append_LARS
+-----------
+
+..  autofunction:: paddle.fluid.layers.append_LARS
+    :noindex:
+
+detection
+=========
+
+.. _api_fluid_layers_prior_box:
+
+prior_box
+---------
+
+..  autofunction:: paddle.fluid.layers.prior_box
+    :noindex:
+
+.. _api_fluid_layers_multi_box_head:
+
+multi_box_head
+--------------
+
+..  autofunction:: paddle.fluid.layers.multi_box_head
+    :noindex:
+
+.. _api_fluid_layers_bipartite_match:
+
+bipartite_match
+---------------
+
+..  autofunction:: paddle.fluid.layers.bipartite_match
+    :noindex:
+
+.. _api_fluid_layers_target_assign:
+
+target_assign
+-------------
+
+..  autofunction:: paddle.fluid.layers.target_assign
+    :noindex:
+
+.. _api_fluid_layers_detection_output:
+
+detection_output
+----------------
+
+..  autofunction:: paddle.fluid.layers.detection_output
+    :noindex:
+
+.. _api_fluid_layers_ssd_loss:
+
+ssd_loss
+--------
+
+..  autofunction:: paddle.fluid.layers.ssd_loss
+    :noindex:
+
+.. _api_fluid_layers_detection_map:
+
+detection_map
+-------------
+
+..  autofunction:: paddle.fluid.layers.detection_map
+    :noindex:
+
+.. _api_fluid_layers_iou_similarity:
+
+iou_similarity
+--------------
+
+..  autofunction:: paddle.fluid.layers.iou_similarity
+    :noindex:
+
+.. _api_fluid_layers_box_coder:
+
+box_coder
+---------
+
+..  autofunction:: paddle.fluid.layers.box_coder
+    :noindex:
+
+metric_op
+=========
+
+.. _api_fluid_layers_accuracy:
+
+accuracy
+--------
+
+..  autofunction:: paddle.fluid.layers.accuracy
+    :noindex:
+
+.. _api_fluid_layers_auc:
+
+auc
+---
+
+..  autofunction:: paddle.fluid.layers.auc
+    :noindex:
+
+tensor
+======
+
+.. _api_fluid_layers_create_tensor:
+
+create_tensor
+-------------
+
+..  autofunction:: paddle.fluid.layers.create_tensor
+    :noindex:
+
+.. _api_fluid_layers_create_parameter:
+
+create_parameter
+----------------
+
+..  autofunction:: paddle.fluid.layers.create_parameter
+    :noindex:
+
+.. _api_fluid_layers_create_global_var:
+
+create_global_var
+-----------------
+
+..  autofunction:: paddle.fluid.layers.create_global_var
+    :noindex:
+
+.. _api_fluid_layers_cast:
+
+cast
+----
+
+..  autofunction:: paddle.fluid.layers.cast
+    :noindex:
+
+.. _api_fluid_layers_concat:
+
+concat
+------
+
+..  autofunction:: paddle.fluid.layers.concat
+    :noindex:
+
+.. _api_fluid_layers_sums:
+
+sums
+----
+
+..  autofunction:: paddle.fluid.layers.sums
+    :noindex:
+
+.. _api_fluid_layers_assign:
+
+assign
+------
+
+..  autofunction:: paddle.fluid.layers.assign
+    :noindex:
+
+.. _api_fluid_layers_fill_constant_batch_size_like:
+
+fill_constant_batch_size_like
+-----------------------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+
+.. _api_fluid_layers_fill_constant:
+
+fill_constant
+-------------
+
+..  autofunction:: paddle.fluid.layers.fill_constant
+    :noindex:
+
+.. _api_fluid_layers_argmin:
+
+argmin
+------
+
+..  autofunction:: paddle.fluid.layers.argmin
+    :noindex:
+
+.. _api_fluid_layers_argmax:
+
+argmax
+------
+
+..  autofunction:: paddle.fluid.layers.argmax
+    :noindex:
+
+.. _api_fluid_layers_ones:
+
+ones
+----
+
+..  autofunction:: paddle.fluid.layers.ones
+    :noindex:
+
+.. _api_fluid_layers_zeros:
+
+zeros
+-----
+
+..  autofunction:: paddle.fluid.layers.zeros
+    :noindex:
+
+.. _api_fluid_layers_reverse:
+
+reverse
+-------
+
+..  autofunction:: paddle.fluid.layers.reverse
+    :noindex:
+
diff --git a/doc/fluid/api/metrics.rst b/doc/fluid/api/metrics.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0f54b2e2eb7ead353215c5dbd529293794e37123
--- /dev/null
+++ b/doc/fluid/api/metrics.rst
@@ -0,0 +1,88 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=============
+fluid.metrics
+=============
+
+.. _api_fluid_metrics_MetricBase:
+
+MetricBase
+----------
+
+..  autoclass:: paddle.fluid.metrics.MetricBase
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_CompositeMetric:
+
+CompositeMetric
+---------------
+
+..  autoclass:: paddle.fluid.metrics.CompositeMetric
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Precision:
+
+Precision
+---------
+
+..  autoclass:: paddle.fluid.metrics.Precision
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Recall:
+
+Recall
+------
+
+..  autoclass:: paddle.fluid.metrics.Recall
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Accuracy:
+
+Accuracy
+--------
+
+..  autoclass:: paddle.fluid.metrics.Accuracy
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_ChunkEvaluator:
+
+ChunkEvaluator
+--------------
+
+..  autoclass:: paddle.fluid.metrics.ChunkEvaluator
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_EditDistance:
+
+EditDistance
+------------
+
+..  autoclass:: paddle.fluid.metrics.EditDistance
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_DetectionMAP:
+
+DetectionMAP
+------------
+
+..  autoclass:: paddle.fluid.metrics.DetectionMAP
+    :members:
+    :noindex:
+
+.. _api_fluid_metrics_Auc:
+
+Auc
+---
+
+..  autoclass:: paddle.fluid.metrics.Auc
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/nets.rst b/doc/fluid/api/nets.rst
new file mode 100644
index 0000000000000000000000000000000000000000..059733af18517257b6821d95fd628a9e13e6e98e
--- /dev/null
+++ b/doc/fluid/api/nets.rst
@@ -0,0 +1,39 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+fluid.nets
+==========
+
+.. _api_fluid_nets_simple_img_conv_pool:
+
+simple_img_conv_pool
+--------------------
+
+..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
+    :noindex:
+
+.. _api_fluid_nets_sequence_conv_pool:
+
+sequence_conv_pool
+------------------
+
+..  autofunction:: paddle.fluid.nets.sequence_conv_pool
+    :noindex:
+
+.. _api_fluid_nets_glu:
+
+glu
+---
+
+..  autofunction:: paddle.fluid.nets.glu
+    :noindex:
+
+.. _api_fluid_nets_scaled_dot_product_attention:
+
+scaled_dot_product_attention
+----------------------------
+
+..  autofunction:: paddle.fluid.nets.scaled_dot_product_attention
+    :noindex:
+
diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8d792120f2f16a8c92606b343eb4c3d4368bed14
--- /dev/null
+++ b/doc/fluid/api/optimizer.rst
@@ -0,0 +1,178 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+===============
+fluid.optimizer
+===============
+
+.. _api_fluid_optimizer_SGD:
+
+SGD
+---
+
+..  autoclass:: paddle.fluid.optimizer.SGD
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Momentum:
+
+Momentum
+--------
+
+..  autoclass:: paddle.fluid.optimizer.Momentum
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adagrad:
+
+Adagrad
+-------
+
+..  autoclass:: paddle.fluid.optimizer.Adagrad
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adam:
+
+Adam
+----
+
+..  autoclass:: paddle.fluid.optimizer.Adam
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adamax:
+
+Adamax
+------
+
+..  autoclass:: paddle.fluid.optimizer.Adamax
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_DecayedAdagrad:
+
+DecayedAdagrad
+--------------
+
+..  autoclass:: paddle.fluid.optimizer.DecayedAdagrad
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Ftrl:
+
+Ftrl
+----
+
+..  autoclass:: paddle.fluid.optimizer.Ftrl
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_SGDOptimizer:
+
+SGDOptimizer
+------------
+
+..  autoclass:: paddle.fluid.optimizer.SGDOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_MomentumOptimizer:
+
+MomentumOptimizer
+-----------------
+
+..  autoclass:: paddle.fluid.optimizer.MomentumOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_AdagradOptimizer:
+
+AdagradOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.AdagradOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_AdamOptimizer:
+
+AdamOptimizer
+-------------
+
+..  autoclass:: paddle.fluid.optimizer.AdamOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_AdamaxOptimizer:
+
+AdamaxOptimizer
+---------------
+
+..  autoclass:: paddle.fluid.optimizer.AdamaxOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_DecayedAdagradOptimizer:
+
+DecayedAdagradOptimizer
+-----------------------
+
+..  autoclass:: paddle.fluid.optimizer.DecayedAdagradOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_RMSPropOptimizer:
+
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_FtrlOptimizer:
+
+FtrlOptimizer
+-------------
+
+..  autoclass:: paddle.fluid.optimizer.FtrlOptimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Adadelta:
+
+Adadelta
+--------
+
+..  autoclass:: paddle.fluid.optimizer.Adadelta
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_ModelAverage:
+
+ModelAverage
+------------
+
+..  autoclass:: paddle.fluid.optimizer.ModelAverage
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_Optimizer:
+
+Optimizer
+---------
+
+..  autoclass:: paddle.fluid.optimizer.Optimizer
+    :members:
+    :noindex:
+
+.. _api_fluid_optimizer_RMSPropOptimizer:
+
+RMSPropOptimizer
+----------------
+
+..  autoclass:: paddle.fluid.optimizer.RMSPropOptimizer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/param_attr.rst b/doc/fluid/api/param_attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..33035bbc7ca5c8d000adeaf1cb79806a3ea64604
--- /dev/null
+++ b/doc/fluid/api/param_attr.rst
@@ -0,0 +1,25 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+================
+fluid.param_attr
+================
+
+.. _api_fluid_param_attr_ParamAttr:
+
+ParamAttr
+---------
+
+..  autoclass:: paddle.fluid.param_attr.ParamAttr
+    :members:
+    :noindex:
+
+.. _api_fluid_param_attr_WeightNormParamAttr:
+
+WeightNormParamAttr
+-------------------
+
+..  autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c750a2d588df56728ac7f73051ab7a9e44dee232
--- /dev/null
+++ b/doc/fluid/api/profiler.rst
@@ -0,0 +1,47 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+==============
+fluid.profiler
+==============
+
+.. _api_fluid_profiler_cuda_profiler:
+
+cuda_profiler
+-------------
+
+..  autofunction:: paddle.fluid.profiler.cuda_profiler
+    :noindex:
+
+.. _api_fluid_profiler_reset_profiler:
+
+reset_profiler
+--------------
+
+..  autofunction:: paddle.fluid.profiler.reset_profiler
+    :noindex:
+
+.. _api_fluid_profiler_profiler:
+
+profiler
+--------
+
+..  autofunction:: paddle.fluid.profiler.profiler
+    :noindex:
+
+.. _api_fluid_profiler_start_profiler:
+
+start_profiler
+--------------
+
+..  autofunction:: paddle.fluid.profiler.start_profiler
+    :noindex:
+
+.. _api_fluid_profiler_stop_profiler:
+
+stop_profiler
+-------------
+
+..  autofunction:: paddle.fluid.profiler.stop_profiler
+    :noindex:
+
diff --git a/doc/fluid/api/recordio_writer.rst b/doc/fluid/api/recordio_writer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f0c12fd115478a29fbd178b533b7490b2f663717
--- /dev/null
+++ b/doc/fluid/api/recordio_writer.rst
@@ -0,0 +1,23 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=====================
+fluid.recordio_writer
+=====================
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_file:
+
+convert_reader_to_recordio_file
+-------------------------------
+
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_file
+    :noindex:
+
+.. _api_fluid_recordio_writer_convert_reader_to_recordio_files:
+
+convert_reader_to_recordio_files
+--------------------------------
+
+..  autofunction:: paddle.fluid.recordio_writer.convert_reader_to_recordio_files
+    :noindex:
+
diff --git a/doc/fluid/api/regularizer.rst b/doc/fluid/api/regularizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..987eaea903520d91c284c8da7a8cb066a1648069
--- /dev/null
+++ b/doc/fluid/api/regularizer.rst
@@ -0,0 +1,51 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+=================
+fluid.regularizer
+=================
+
+.. _api_fluid_regularizer_append_regularization_ops:
+
+append_regularization_ops
+-------------------------
+
+..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
+    :noindex:
+
+.. _api_fluid_regularizer_L1Decay:
+
+L1Decay
+-------
+
+..  autoclass:: paddle.fluid.regularizer.L1Decay
+    :members:
+    :noindex:
+
+.. _api_fluid_regularizer_L2Decay:
+
+L2Decay
+-------
+
+..  autoclass:: paddle.fluid.regularizer.L2Decay
+    :members:
+    :noindex:
+
+.. _api_fluid_regularizer_L1DecayRegularizer:
+
+L1DecayRegularizer
+------------------
+
+..  autoclass:: paddle.fluid.regularizer.L1DecayRegularizer
+    :members:
+    :noindex:
+
+.. _api_fluid_regularizer_L2DecayRegularizer:
+
+L2DecayRegularizer
+------------------
+
+..  autoclass:: paddle.fluid.regularizer.L2DecayRegularizer
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/api/transpiler.rst b/doc/fluid/api/transpiler.rst
new file mode 100644
index 0000000000000000000000000000000000000000..943d39331d26c05764c90cb24f6774997c976bfe
--- /dev/null
+++ b/doc/fluid/api/transpiler.rst
@@ -0,0 +1,50 @@
+..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+    !DO NOT EDIT THIS FILE MANUALLY!
+
+================
+fluid.transpiler
+================
+
+.. _api_fluid_transpiler_DistributeTranspiler:
+
+DistributeTranspiler
+--------------------
+
+..  autoclass:: paddle.fluid.transpiler.DistributeTranspiler
+    :members:
+    :noindex:
+
+.. _api_fluid_transpiler_memory_optimize:
+
+memory_optimize
+---------------
+
+..  autofunction:: paddle.fluid.transpiler.memory_optimize
+    :noindex:
+
+.. _api_fluid_transpiler_release_memory:
+
+release_memory
+--------------
+
+..  autofunction:: paddle.fluid.transpiler.release_memory
+    :noindex:
+
+.. _api_fluid_transpiler_HashName:
+
+HashName
+--------
+
+..  autoclass:: paddle.fluid.transpiler.HashName
+    :members:
+    :noindex:
+
+.. _api_fluid_transpiler_RoundRobin:
+
+RoundRobin
+----------
+
+..  autoclass:: paddle.fluid.transpiler.RoundRobin
+    :members:
+    :noindex:
+
diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..ae4e8c7c48e584ec16a7be5466f83dd154ffb5fb
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/build_from_source_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..1ac828c973826bb8374c4aa8e17fda3ea1bb939f
--- /dev/null
+++ b/doc/fluid/build_and_install/build_from_source_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/build_from_source_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..965b2e20559291989422938c418fadbac16941b9
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/docker_install_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..79d7341a7bbb9e477c773134f24983fd7607769a
--- /dev/null
+++ b/doc/fluid/build_and_install/docker_install_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/docker_install_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..f697fcd8fac9131862ae7f8f51c5ebe93737ad2d
--- /dev/null
+++ b/doc/fluid/build_and_install/index_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/index_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..502f66a41319d4f41ae1774628ca36da9dca76ce
--- /dev/null
+++ b/doc/fluid/build_and_install/index_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/index_en.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/paddleci.png b/doc/fluid/build_and_install/paddleci.png
new file mode 120000
index 0000000000000000000000000000000000000000..c3eb1457acc77cab9360e654240d1e8f548035b4
--- /dev/null
+++ b/doc/fluid/build_and_install/paddleci.png
@@ -0,0 +1 @@
+../../v2/build_and_install/paddleci.png
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/pip_install_cn.rst b/doc/fluid/build_and_install/pip_install_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..07deca84b82ff553e0c19324695089dcfb6be90e
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_cn.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/pip_install_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/build_and_install/pip_install_en.rst b/doc/fluid/build_and_install/pip_install_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..7f39c998195b719b05443e96f1c4a6a8d44b98c9
--- /dev/null
+++ b/doc/fluid/build_and_install/pip_install_en.rst
@@ -0,0 +1 @@
+../../v2/build_and_install/pip_install_en.rst
\ No newline at end of file
diff --git a/doc/design/images/asgd.gif b/doc/fluid/design/algorithm/images/asgd.gif
similarity index 100%
rename from doc/design/images/asgd.gif
rename to doc/fluid/design/algorithm/images/asgd.gif
diff --git a/doc/design/images/theta_star.gif b/doc/fluid/design/algorithm/images/theta_star.gif
similarity index 100%
rename from doc/design/images/theta_star.gif
rename to doc/fluid/design/algorithm/images/theta_star.gif
diff --git a/doc/fluid/design/algorithm/index_cn.rst b/doc/fluid/design/algorithm/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0883a9dc9c457f393ac1bdc930cb47ebcb0a25d9
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_cn.rst
@@ -0,0 +1,7 @@
+梯度更新算法
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  parameter_average.md
diff --git a/doc/fluid/design/algorithm/index_en.rst b/doc/fluid/design/algorithm/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59fe68dcf79ce2ef90b9adc829a0db45a4f0b3dc
--- /dev/null
+++ b/doc/fluid/design/algorithm/index_en.rst
@@ -0,0 +1,7 @@
+Gradient Update Algorithm
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  parameter_average.md
diff --git a/doc/fluid/design/algorithm/parameter_average.md b/doc/fluid/design/algorithm/parameter_average.md
new file mode 100644
index 0000000000000000000000000000000000000000..28ad6495d97515442eb8af2050158829814acd33
--- /dev/null
+++ b/doc/fluid/design/algorithm/parameter_average.md
@@ -0,0 +1,74 @@
+# Averaging Parameter in PaddlePaddle
+
+## Why Averaging
+In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.
+
+Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
+
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/theta_star.gif"/><br/> . The averaging is done as follows:
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/asgd.gif"><br />
+</p>
+
+We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
+
+### How to perform Parameter Averaging in PaddlePaddle
+
+Parameter Averaging in PaddlePaddle works in the following way during training :
+1. It will take in an instance of an optimizer as an input, e.g. RMSPropOptimizer
+2. The optimizer itself is responsible for updating the parameters.
+3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
+    1. In theory, the values of this copy are the average of the values of the parameters in the most recent N batches.
+    2. However, saving all N instances of the parameters in memory is not feasible.
+    3. Therefore, an approximation algorithm is used.
+
+Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
+
+During the testing/saving the model phase, we perform the following steps:
+1. Perform the delayed operations.
+2. Save current values of the parameters to a temporary variable.
+3. Replace the values of the parameters with the averaged values.
+4. Perform testing and/or save the parameters.
+5. Restore the values of the parameters once done.
+
+### How to implement Averaging of Parameter in PaddlePaddle
+
+We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
+
+	**Advantages**:
+    - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op.
+    - Makes it easy for the users to customize and extend the framework.
+
+	**Disadvantages**:
+    - Implementation requires re-writing the averaging methodology in Python.  
+
+### Low-Level implementation
+
+In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
+- the optimizer
+- the window_size to keep the updates
+
+The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU.
+
+The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
+
+### Python API implementation for ParameterAverageOptimizer
+
+Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
+- Any optimizer (RMSProp , AdaGrad etc.)
+- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. Move the averaged value to a buffer when window is full to avoid loss of precision.
+
+Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
+We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/rmsprop_op.cc)
+
+#### Creation of the ParameterAverageOptimizer operator
+There are two ways for creating the ParameterAverageOptimizer op:
+1. We create the op immediately while building the computation graph.
+2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
+
+The proposal is to add the op immediately while building the computation graph.
+
+#### High-level API
+
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8ded0ad22f4013a521bf3bee260565dc5cf855ae
--- /dev/null
+++ b/doc/fluid/design/concepts/README.md
@@ -0,0 +1,174 @@
+A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of new the parameter server using Go and C++.
+
+Here are some initial thoughts. Your comments are welcome!
+
+# Required CMake Function
+
+I think we need only the following few CMake functions to make a project description mean and clean:
+
+<table>
+<thead>
+<tr>
+<th>C++</th>
+<th>CUDA C++</th>
+<th>Go</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cc_library </td>
+<td>nv_library </td>
+<td>go_library </td>
+</tr>
+<tr>
+<td>cc_binary </td>
+<td>nv_binary </td>
+<td>go_binary </td>
+</tr>
+<tr>
+<td> cc_test </td>
+<td> nv_test </td>
+<td> go_test </td>
+</tr>
+</tbody>
+</table>
+
+
+- The `_library` functions generate  .a files from source code.
+- The `_binary` functions generate executable binary files.
+- The `_test` functions generate executable unit test files. They work like `_binary` but links `-lgtest` and `-lgtest_main`.
+
+The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler.
+
+Both `nv_` and `cc_` functions enables C++11 (-std=c++11).
+
+Also,
+
+- to describe external dependencies, we need `external_library`.
+- to build shared libraries, we need `shared_library`.
+
+## An Example Project
+
+Suppose that we have aforementioned functions defined in our `/cmake` directory.  The following example `CMakeLists.txt` describes a project including the following source files:
+
+- tensor.h
+- tensor.cc
+- tensor_test.cc
+- ops.h
+- ops.cu
+- ops_test.cu
+- api.go
+- api_test.go
+
+Suppose that ops.cu depends on CUDNN.
+
+```cmake
+# cc_binary parses tensor.cc and figures out that target also depend
+# on tensor.h.
+cc_binary(tensor
+  SRCS
+  tensor.cc)
+
+# The dependency to target tensor implies that if any of
+# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
+cc_test(tensor_test
+  SRCS
+  tensor_test.cc
+  DEPS
+  tensor)
+
+# I don't have a clear idea what parameters external_library need to
+# have.  @gangliao as a CMake expert would have better ideas.
+external_library(cudnn
+  ....)
+
+# Suppose that ops.cu depends on external target CUDNN.  Also, ops.cu
+# include global functions that take Tensor as their parameters, so
+# ops depend on tensor.  This implies that if any of tensor.{h.cc},
+# ops.{h,cu} is changed, ops need to be re-built.
+nv_library(ops
+  SRCS
+  ops.cu
+  DEPS
+  tensor
+  cudnn)  # cudnn is defined later.
+
+nv_test(ops_test
+  SRCS
+  ops_test.cu
+  DEPS
+  ops)
+
+# Because api.go defines a GO wrapper to ops and tensor, it depends on
+# both.  This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
+# api.go is changed, api need to be re-built.
+go_library(api
+  SRCS
+  api.go
+  DEPS
+  tensor # Because ops depend on tensor, this line is optional.
+  ops)
+
+go_test(api_test
+  SRCS
+  api_test.go
+  DEPS
+  api)
+
+
+# This builds libapi.so.  shared_library might use CMake target
+# api_shared so to distinguish it from above target api.
+shared_library(api
+  DEPS
+  api)
+
+```
+
+## Implementation
+
+As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph.  It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
+
+## Using Package Manager For Go
+
+Building Go binaries and libraries need to satisfy their dependencies, generally
+we can do `go get ./...` to download and compile all external dependencies. The
+problems are:
+
+1. `go get` will always get the latest code from the default branch of the
+    remote repo, so changes of dependents might break the build. This is very
+    different with what we already have in `cmake/external` which download a
+    specific version or commit id of the dependency.
+1. Some locations can not access external dependencies through the internet, as mentioned
+   in https://github.com/PaddlePaddle/Paddle/issues/2605. Using package management
+   tools can package the dependencies as a "vendor" package, which can be mirrored
+   at many cloud file hosting, so users what to compile paddle by themselves can
+   download this "vendor" package from a mirror site.
+
+### Choose A Suitable Tool
+
+As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools)
+list dozens of Go package managers. We choose the tool using following principles:
+
+- Most "active" projects with more stars, more pull requests or commits
+- Widely used project
+
+After comparing all these projects, we shall choose between the most popular
+tools: Godep and Glide.
+
+Here's a brief comparison between Godep and Glide
+: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
+also many complaints about using `Godep`. There's also a new "official" pakcage
+management tool has been started at: https://github.com/golang/dep to resolve
+such problems, but it's currently at Alpha stage. So the best choice now is
+glide obviously.
+
+### Manage Go Packages
+
+- Dependencies: `go/glide.yaml` will store the dependencies and their versions which
+  is directly imported by paddle. `go/glide.lock` will store all dependencies recursively
+  with their commit id. Builds will "lock" to these packages if we don't `glide up`
+  them
+- Vendor package: `go/vendor` directory will generated when running `cmake` command. `cmake`
+  will download the code corresponding to `go/glide.lock`. If we put a vendor folder
+  under `go/`, cmake will just check the commit id to the packages under the folder,
+  if commit id matches, there will be no download at all.
diff --git a/doc/fluid/design/concepts/block.md b/doc/fluid/design/concepts/block.md
new file mode 100644
index 0000000000000000000000000000000000000000..3757cd055c818be1e63ee8c0f000f4dd299b59f4
--- /dev/null
+++ b/doc/fluid/design/concepts/block.md
@@ -0,0 +1,375 @@
+# Design Doc: Block and Scope
+
+## The Representation of Computation
+
+Both deep learning systems and programming languages help users describe computation procedures.  These systems use various representations of computation:
+
+- Caffe, Torch, and Paddle: sequences of layers.
+- TensorFlow, Caffe2, Mxnet: graph of operators.
+- PaddlePaddle: nested blocks, like C++ and Java programs.
+
+## Block in Programming Languages and Deep Learning
+
+In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators.
+
+Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
+
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>for, while loop </td>
+<td>RNN, WhileOp </td>
+</tr>
+<tr>
+<td>if, if-else, switch </td>
+<td>IfElseOp, SwitchOp </td>
+</tr>
+<tr>
+<td>sequential execution </td>
+<td>a sequence of layers </td>
+</tr>
+</tbody>
+</table>
+
+
+A key difference is that a C++ program describes a one pass computation, whereas a deep learning program describes both the forward and backward passes.
+
+## Stack Frames and the Scope Hierarchy
+
+The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
+
+<table>
+<thead>
+<tr>
+<th>programming languages</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>stack </td>
+<td>scope hierarchy </td>
+</tr>
+<tr>
+<td>stack frame  </td>
+<td>scope </td>
+</tr>
+<tr>
+<td>push at entering block </td>
+<td>push at entering block </td>
+</tr>
+<tr>
+<td>pop at leaving block </td>
+<td>destroy when minibatch completes </td>
+</tr>
+</tbody>
+</table>
+
+
+1. In traditional programs:
+
+   - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables.
+   - After the execution leaves the right curly brace, the runtime pops the frame.
+   - The maximum number of frames in the stack is the maximum depth of nested blocks.
+
+1. In PaddlePaddle
+
+   - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
+   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
+   - The height of the highest tree is the maximum depth of nested blocks.
+   - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
+
+## Use Blocks in C++ and PaddlePaddle Programs
+
+Let us consolidate the discussion by presenting some examples.
+
+### Blocks with `if-else` and `IfElseOp`
+
+The following C++ programs shows how blocks are used with the `if-else` structure:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int z = x + y;
+  o1 = z;
+  o2 = pd::layer::softmax(z);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
+
+```
+
+An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](../execution/if_else_op.md) is as follows:
+
+```python
+import paddle as pd
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, the right branch computes `fc(x)` and `x+1` .
+
+The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
+
+
+### Blocks with `for` and `RNNOp`
+
+The following RNN model in PaddlePaddle from the [RNN design doc](../dynamic_rnn/rnn.md) :
+
+```python
+x = sequence([10, 20, 30]) # shape=[None, 1]
+m = var(0) # shape=[1]
+W = var(0.314, param=true) # shape=[1]
+U = var(0.375, param=true) # shape=[1]
+
+rnn = pd.rnn()
+with rnn.step():
+  h = rnn.memory(init = m)
+  h_prev = rnn.previous_memory(h)
+  a = layer.fc(W, x)
+  b = layer.fc(U, h_prev)  
+  s = pd.add(a, b)
+  act = pd.sigmoid(s)
+  rnn.update_memory(h, act)
+  rnn.output(a, b)
+o1, o2 = rnn()
+```
+has its equivalent C++ program as follows
+
+```c++
+int* x = {10, 20, 30};
+int* m = {0};
+int* W = {0.314};
+int* U = {0.375};
+
+int mem[sizeof(x) / sizeof(x[0]) + 1];
+int o1[sizeof(x) / sizeof(x[0]) + 1];
+int o2[sizeof(x) / sizeof(x[0]) + 1];
+for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
+  int x = x[i-1];
+  if (i == 1) mem[0] = m;
+  int a = W * x;
+  int b = Y * mem[i-1];
+  int s = fc_out + hidden_out;
+  int act = sigmoid(sum);
+  mem[i] = act;
+  o1[i] = act;
+  o2[i] = hidden_out;
+}
+```
+
+## Compilation and Execution
+
+Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
+
+The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
+
+## The "Binary Executable File Format"
+
+The definition of the protobuf message is as follows:
+
+```protobuf
+message BlockDesc {
+  repeated VarDesc vars = 1;
+  repeated OpDesc ops = 2;
+}
+```
+
+The step net in above RNN example would look like
+
+```
+BlockDesc {
+  vars = {
+    VarDesc {...} // x
+    VarDesc {...} // h
+    VarDesc {...} // fc_out
+    VarDesc {...} // hidden_out
+    VarDesc {...} // sum
+    VarDesc {...} // act
+  }
+  ops = {
+    OpDesc {...} // matmul
+    OpDesc {...} // add_two
+    OpDesc {...} // sigmoid
+  }
+};
+```
+
+Also, the RNN operator in above example is serialized into a protobuf message of type `OpDesc` and would look like:
+
+```
+OpDesc {
+  inputs = {0} // the index of x in vars of BlockDesc above
+  outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
+  attrs {
+    "states" : {1} // the index of h
+    "step_net" : <above step net>
+  }
+};
+```
+
+This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block.
+
+
+## The Compilation of Blocks
+
+During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
+
+VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope.
+Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example:
+
+```python
+a = pd.Variable(shape=[20, 20])
+b = pd.fc(a, params=["fc.w", "fc.b"])
+
+rnn = pd.create_rnn()
+with rnn.stepnet():
+    x = a.as_step_input()
+    # reuse fc's parameter
+    fc_without_b = pd.get_variable("fc.w")
+    rnn.output(fc_without_b)
+
+out = rnn()
+```
+The method `pd.get_variable` can help retrieve a Variable by the name. The Variable may be stored in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
+
+In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
+
+To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
+
+`SymbolTable` can do the following:
+
+- store the definitions (some names and attributes) of variables and operators,
+- verify if a variable was declared,
+- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
+
+
+```c++
+// Information in SymbolTable is enough to trace the dependency graph. So maybe
+// the Eval() interface takes a SymbolTable is enough.
+class SymbolTable {
+ public:
+  SymbolTable(SymbolTable* parent) : parent_(parent) {}
+
+  OpDesc* NewOp(const string& name="");
+
+  // TODO determine whether name is generated by python or C++.
+  // Currently assume that a unique name will be generated by C++ if the
+  // argument name is left default.
+  VarDesc* Var(const string& name="");
+
+  // find a VarDesc by name, if recursive is true, find parent's SymbolTable
+  // recursively.
+  // this interface is introduced to support InferShape, find protobuf messages
+  // of variables and operators, pass pointers into InferShape.
+  //
+  // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
+  // be proposed and embedded into pybind to enable python operation on C++ pointers.
+  VarDesc* FindVar(const string& name, bool recursive=true);
+
+  OpDesc* FindOp(const string& name);
+
+  BlockDesc Compile() const;
+
+ private:
+  SymbolTable* parent_;
+
+  map<string, OpDesc> ops_;
+  map<string, VarDesc> vars_;
+};
+```
+
+After all the description of variables and operators is added into SymbolTable,
+the block has enough information to run.
+
+The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
+
+
+```c++
+namespace {
+
+class Block : OperatorBase {
+public:
+  Block(const BlockDesc& desc) desc_(desc) {}
+
+  void InferShape(const framework::Scope& scope) const override {
+    if (!symbols_ready_) {
+      CreateVariables(scope);
+      CreateOperators();
+    }
+    // should run InferShape first.
+    for (auto& op : runtime_table_.ops()) {
+      op->InferShape(scope);
+    }
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
+    for (auto& op : runtime_table_.ops()) {
+      op->Run(scope, place);
+    }
+  }
+
+  void CreateVariables(const framework::Scope& scope);
+  void CreateOperators();
+
+  // some other necessary interfaces of NetOp are listed below
+  // ...
+
+private:
+  BlockDesc desc_;
+  bool symbols_ready_{false};
+};
+```
+
+## The Execution of Blocks
+
+Block inherits from OperatorBase, which has a Run method.
+Block's Run method will run its operators sequentially.
+
+There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets.
+
+The definition of Eval is as follows:
+
+```c++
+// clean a block description by targets using the corresponding dependency graph.
+// return a new BlockDesc with minimal number of operators.
+// NOTE: The return type is not a Block but the block's description so that this can be distributed
+// to a cluster.
+BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
+
+void Block::Eval(const vector<string>& targets,
+                 const framework::Scope& scope,
+                 const platform::DeviceContext& dev_ctx) {
+  BlockDesc min_desc = Prune(desc_, targets);
+  Block min_block(min_desc);
+  min_block.Run(scope, dev_ctx);
+}
+```
diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md
new file mode 100644
index 0000000000000000000000000000000000000000..aabc1ba75a67c5767d409bd6e7e6240dec86b16c
--- /dev/null
+++ b/doc/fluid/design/concepts/cpp_data_feeding.md
@@ -0,0 +1,204 @@
+# C++ Data Feeding
+
+While using Paddle V2 API for training, data feeding completely depends on the Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
+
+In this document, we show the fundamental design of a C++ data feeding process, which includes data reading, shuffling and batching.
+
+## Overview
+
+![](images/readers.png)
+
+## Reader
+
+In order to handle the above-mentioned problem, a new concept called 'Reader' is introduced. `Reader` is a series of inherited classes which can be held by our `Variable` and they are used to read or process file data.
+
+
+### ReaderBase
+
+`ReaderBase` is the abstract base class for all readers. It defines the interface for all readers.
+
+```cpp
+class ReaderBase {
+ public:
+  // Reads the next batch of data. (A 'batch' can be only one instance)
+  // If the next batch doesn't exist, it throws an exception
+  virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
+  
+  // Checks whether the next instance exists.
+  virtual bool HasNext() = 0;
+  
+  // Reinitializes the reader and read the file from the beginning.
+  virtual void ReInit() = 0;
+
+  virtual ~ReaderBase();
+};
+```
+
+### FileReader
+
+`FileReader` is derived from the `ReaderBase`. It is still an abstract class and will further be derived by Readers of respective specific format.
+
+```cpp
+class FileReader : public ReaderBase {
+ public:
+  explicit FileReader(const std::vector<DDim>& dims);
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
+
+ protected:
+  virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
+
+ private:
+  std::vector<DDim> dims_;
+};
+```
+
+A file reader binds with a single file and reads one data instance at a time. Each type of file reader shall implement its own `ReadNextImpl()`, `HasNext()` and `ReInit()`.
+
+The `ReadNextImpl()` is invoked by `ReadNext()`. Besides invoking `ReadNextImpl()`, `ReadNext()` is also responsible for checking the output, making sure that each shape of `LoDTensor` in `*out` is consistent with the one in `dims_`.  
+
+### DecoratedReader
+
+A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some processing on them(shuffling,  batching or something else), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
+
+```cpp
+class DecoratedReader : public ReaderBase {
+ public:
+  explicit DecoratedReader(ReaderBase* reader) : ReaderBase(), reader_(reader) {
+    PADDLE_ENFORCE_NOT_NULL(reader_);
+  }
+
+  void ReInit() override { reader_->ReInit(); }
+
+  bool HasNext() const override { return reader_->HasNext(); }
+
+ protected:
+  ReaderBase* reader_;
+};
+```
+
+Both the `FileReader` and `DecoratedReader` share exactly the same interface as defined in `ReaderBase`. So they can be decorated for multiple times: We can **shuffle** a reader's outputs and then **batch** the shuffled outputs. The interface consistency also allows related ops use readers without knowing their underlying type.
+
+### MultipleReader
+
+All `FileReader` binds with a single file and are single-threaded. However, sometimes we need to read data from more than one file. In this case, it's not enough to only have `FileReader` and `DecoratedReader`.
+
+So `MultipleReader` is introduced. It is also derived from `ReaderBase`. A `MultipleReader` holds several prefetching `FileReaders` and these readers run concurrently. Another pivotal part of a `MultipleReader` is a buffer channel. The channel collects data yield by all prefetching readers and makes subsequent OPs or decorated readers be able to fetch data without concerning about multiple readers scheduling.
+
+![](images/multiple_reader.png)
+
+This graph shows how a `MultipleReader` works with three prefetching file readers and two GPUs. There is a queue of files which are going to be read. Each time when a prefetching file reader is free(complete reading from one file), it fetches a new file from the queue. Each prefetching file reader runs in a separated prefetch thread and dumps their outputs to the same channel.
+
+To the subsequent two decorated readers, the `MultipleReader` is **a single reader**. They don't need to concern about how prefetch readers are scheduled. They only need to invoke `MultipleReader::ReadNext()` to get the next data from the buffer channel. 
+
+### ReaderHolder
+
+Different readers belong to different class types. This leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
+
+```cpp
+var->Get<ReaderBase>("batch_reader");
+```
+
+We would have to write:
+
+```cpp
+var->Get<BatchReader>("batch_reader");
+```
+
+This requires that in order to get a reader from a variable, every time, we must know the reader's type exactly. This is nearly impossible.
+
+To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which hides reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
+
+## Related Operators
+
+To create and invoke readers, some new ops are introduced:
+
+### Operators That Create Readers
+
+Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
+
+However, direct usage of file readers' creation ops is not recommended because a file reader can only read one file via a single thread. Using `OpenFilesOp` is a better choice.
+
+### OpenFilesOp
+
+The `OpenFilesOp` is the creation op of `MultipleReader`. It takes no input but requires a list of file names as one of its attributes. The newly created `MultipleReader` then creates its own prefetching readers according to given file names.
+
+To make sure that created prefetching readers match file formats, we need a name prefix rule to append file format tags to file names, as well as a file reader registry mechanism to map file format tags to their corresponding file readers' constructors.
+
+### HasNextOp
+
+`HasNextOp` is used to check whether the next data batch exists via the reader's `HasNext()` interface.
+
+### ResetOp
+
+`ResetOp` is used to reset a reader via its `ReInit()` interface.
+
+### ReadOp
+
+A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTenosr>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
+
+## Program with Readers
+
+A `Program` holds readers as its persistable variables. These variables are created by `CreateReaderOp` or `OpenFilesOp`. These ops shall run only once. So they shall be settled in the `startup_program`. `HasNextOp`, `ResetOp` and `ReadOp` are required by training loop, so they shall be in the `main_program`.
+
+The ops of a `startup_program` with readers would be like this:
+
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+double_buffer_reader = create_double_buffer_op(batch_reader)
+... (other initializers)
+```
+
+The forwarding ops of the corresponding `main_program` would be like this:
+
+```
+not_completed = true
+pass_count = 0
+while_op(not_completed) {
+    has_next = has_next_op(double_buffer_reader)
+    if_else_op(has_next) {
+        batch_data = read_op(double_buffer_reader)
+        ... (subsequent training ops)
+    } else {
+        reset_op(double_buffer_reader)
+        increase_op(pass_count)
+        not_completed = less_than_op(pass_count, reqiured_pass_num)
+    }
+}
+```
+
+A few important considerations for these programs are as follows:
+
+1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
+
+2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
+
+3. All readers exist in both `startup_program` and `main_program`. And they are persistable.
+
+### Simplify Configuration by MultiPassReader
+
+The Program configuration mentioned above is complicated. Users need to be very familiar to concepts of Program and Block to prevent making mistakes in their code. To make the usage of C++ readers more friendly to new users, we introduce `MultiPassReader`.
+
+`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches the EOF, the multi-pass reader checks whether it has completed the training of given number of pass. If not, the underlying reader will be re-initialized and starts a new pass automatically. Before completing the whole training, the return of MultiPassReader's `HasNext()` will always be `true`.
+
+With `MultiPassReader`, the startup program would be like this:
+
+```
+multiple_reader = open_files_op(...)
+batch_reader = create_batch_reader_op(multiple_reader)
+multi_pass_reader = create_multi_pass_reader_op(batch_reader)
+double_buffer_reader = create_double_buffer_op(multi_pass_reader)
+... (other initializers)
+```
+
+The forwarding part of the corresponding `main_program` would be like this:
+
+```
+not_completed = true
+while_op(not_completed) {
+    batch_data = read_op(double_buffer_reader)
+    ... (subsequent training ops)
+    not_completed = has_next_op(double_buffer_reader)
+}
+```
diff --git a/doc/fluid/design/concepts/executor.md b/doc/fluid/design/concepts/executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..3fcddf4dd90f826ee1a16713f4371fb010f8eac5
--- /dev/null
+++ b/doc/fluid/design/concepts/executor.md
@@ -0,0 +1,29 @@
+# Executor Design Doc
+
+## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+The executor runs the `ProgramDesc` like an interpreter. `ProgramDesc` contains the intrinsics (operators in this case) and variables which will be used, executor explicitly executes the stored precompiled code.
+
+## Overview
+
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
+
+## Executor
+
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`th block of a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence one-by-one.
+It is very similar to how a push stack frame works when entering a block, following which it cleans up all the temporary variables when a mini-batch is finished. It does not however, have the stack frame pop process.
+
+### The interface
+```c++
+  Executor(places);
+```
+A executor does not own any computing resources, a user can only construct an executor using the specified places.
+
+### Running an Executor
+
+```
+  void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute `ProgramDesc`. `ProgramDesc` is the target that will be executed, the `Scope` specifies the variable container, the `block_id` indicates the entrance block and `create_local_scope` is a boolean that states whether it will destroy the temporary variables after the execution is finished.
diff --git a/doc/fluid/design/concepts/functions_operators_layers.md b/doc/fluid/design/concepts/functions_operators_layers.md
new file mode 100644
index 0000000000000000000000000000000000000000..1f86b99e5197c3e0b85fd76fe704520ef21b06d3
--- /dev/null
+++ b/doc/fluid/design/concepts/functions_operators_layers.md
@@ -0,0 +1,128 @@
+# Design Doc: Functions, Operators, and Layers
+
+In a DL system, we can compose one or more fine grained operators into a coarse grained one.  For example, the FC layer can be composed of a multiplication operator and an add operator.
+
+Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers.  But we need a well-defined separation.
+
+In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
+
+```c++
+template <typename T> T add(T x, T y) { return x + y; }
+template <typename T> T mul(T x, T y) { return x * y; }
+```
+
+Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name.  A C macro can do this. For example, the following macro invocation
+
+```c++
+#define MAKE_FUNCTION_OPERATOR(mul);
+```
+
+generates
+
+```c++
+template <typename T> class mulOp : public OperatorBase {...};
+REGISTER_OP(mulOp<float32>, "mul");
+```
+
+so that in Python we can create operator mul by:
+
+```python
+X1 = Var()
+X2 = Var()
+Y = Var()
+paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
+```
+
+Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
+
+```c++
+template <typename T>
+class FCOp : public OperatorBase {
+ public:
+  void Run(...) {
+    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
+  }
+};
+REGISTER_OP(FCOp, "fc");
+```
+
+We need to support such composition in Python as well.  To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`.  This higher level operator API should be compatible with the layer API.
+
+Let's explain using an example.  Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`:
+
+```python
+def operator.mul(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("mul", input={X1, Y1}, output=O)
+    return O
+
+def operator.add(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("add", input={X1, X2}, output=O)
+    return O
+```
+
+Above code snippets are automatically generated.  Given them, users can define
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    return operator.add(operator.mul(X, W), b)
+```
+
+If we don't have `operator.mul` and `operator.add`, the definiton of `layer.fc` would be complicated:
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    O1 = Var()
+    paddle.cpp.create_operator("mul", input=[X, W], output=O1)
+    O2 = Var()
+    paddle.cpp.create_operator("add", input=[O1, b], output=O2)
+    return O2
+```
+
+We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`.  So we have the following concepts in above illustrative example:
+
+<table>
+<thead>
+<tr>
+<th>C++ functions/functors</th>
+<th>mul</th>
+<th>add</th>
+<th></th>
+<th></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>C++ operator class </td>
+<td>mulOp</td>
+<td>addOp </td>
+<td>FCOp </td>
+<td></td>
+</tr>
+<tr>
+<td>Python binding  </td>
+<td>operator.mul</td>
+<td> operator.add </td>
+<td>operator.fc </td>
+<td></td>
+</tr>
+<tr>
+<td>Python function   </td>
+<td></td>
+<td></td>
+<td> </td>
+<td>layer.fc</td>
+</tr>
+</tbody>
+</table>
+
+
+This is how we differentiate layer and operators in PaddlePaddle:
+
+- those defined in C++ and have a lightweighted Python wrapper in module `operators` are operators; whereas
+- those who don't have C++ implementations but a Python implementation that compose C++ operators are known as layers.
diff --git a/doc/fluid/design/concepts/images/multiple_reader.png b/doc/fluid/design/concepts/images/multiple_reader.png
new file mode 100644
index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046
Binary files /dev/null and b/doc/fluid/design/concepts/images/multiple_reader.png differ
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.dot b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
new file mode 100644
index 0000000000000000000000000000000000000000..40753cb140540c08d9d4c449b8d377e315280436
--- /dev/null
+++ b/doc/fluid/design/concepts/images/parallel_executor_overview.dot
@@ -0,0 +1,83 @@
+digraph G {
+  subgraph cluster_init {
+    label="Initialization"
+    startup_program [label="startup", shape=box]
+    node_w_g0 [label="W\nGPU0"]
+    startup_program -> node_w_g0 [label="Initialize"]
+    node_w_g1 [label="W\nGPU1"]
+    node_w_g0 -> node_w_g1 [label="broadcast"]
+  }
+
+  subgraph cluster_train {
+    label="forward_backward"
+
+    subgraph cluster_gpu0 {
+      label="GPU0"
+      fc_0 [label="fc\nGPU0", shape=box]
+      hidden_0 [label="hidden\nGPU0"]
+      node_w_g0 -> fc_0
+      fc_0 -> hidden_0
+      loss0 [label="loss\nGPU0"]
+      hidden_0 -> loss0 [label="many ops omitted"]
+      scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
+      loss_g0 [label="loss_grad\nGPU0"]
+      scale_loss_0->loss_g0
+      
+      fc_g_0 [label="w_grad\nGPU0", shape=box]
+      loss0 -> fc_g_0
+      loss_g0 -> fc_g_0
+      hidden_0 -> fc_g_0
+    }
+
+    subgraph cluster_gpu1 {
+      label="GPU1"
+      fc_1 [label="fc\nGPU1", shape=box]
+      hidden_1 [label="hidden\nGPU1"]
+      node_w_g1 -> fc_1
+      fc_1 -> hidden_1
+      loss1 [label="loss\nGPU1"]
+      hidden_1 -> loss1 [label="many ops omitted"]
+      scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
+      loss_g1 [label="loss_grad\nGPU1"]
+      scale_loss_1->loss_g1
+      
+      fc_g_1 [label="w_grad\nGPU1", shape=box]
+      loss1 -> fc_g_1
+      loss_g1 -> fc_g_1
+      hidden_1 -> fc_g_1
+    }
+  }
+
+  all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
+  fc_g_0 -> all_reduce_w
+  fc_g_1 -> all_reduce_w
+
+  fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
+  fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
+  all_reduce_w -> fc_g_0_merged
+  all_reduce_w -> fc_g_1_merged
+
+  subgraph cluster_optimization {
+    label="Optimization"
+    subgraph cluster_opt_gpu0 {
+      label="GPU0"
+      sgd_0 [label="SGD Op\nGPU0", shape=box]
+
+      fc_g_0_merged -> sgd_0
+      node_w_g0 -> sgd_0
+      optimized_w_0 [label="Optimized W\nGPU0"]
+      sgd_0 -> optimized_w_0
+    }
+    subgraph cluster_opt_gpu1 {
+      label="GPU1"
+      sgd_1 [label="SGD Op\nGPU1", shape=box]
+
+      fc_g_1_merged -> sgd_1
+      node_w_g1 -> sgd_1
+      optimized_w_1 [label="Optimized W\nGPU0"]
+      sgd_1 -> optimized_w_1
+    }
+  }
+
+
+}
diff --git a/doc/fluid/design/concepts/images/parallel_executor_overview.png b/doc/fluid/design/concepts/images/parallel_executor_overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..d890c0ffee3b38dc7cb74a2b56c2ab4831532211
Binary files /dev/null and b/doc/fluid/design/concepts/images/parallel_executor_overview.png differ
diff --git a/doc/fluid/design/concepts/images/readers.png b/doc/fluid/design/concepts/images/readers.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be
Binary files /dev/null and b/doc/fluid/design/concepts/images/readers.png differ
diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dcdc894937ff328e6002623275ca3c65e87b2bb0
--- /dev/null
+++ b/doc/fluid/design/concepts/index_cn.rst
@@ -0,0 +1,19 @@
+核心概念
+-------------
+
+.. toctree::
+  :maxdepth: 1
+
+  README.md
+  cpp_data_feeding.md
+  functions_operators_layers.md
+  program.md
+  variable.md
+  var_desc.md
+  tensor.md
+  tensor_array.md
+  lod_tensor.md
+  block.md
+  scope.md
+  executor.md
+  parallel_executor.md
diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b85a3055746facaa642e8fc899976b58435f1ef2
--- /dev/null
+++ b/doc/fluid/design/concepts/index_en.rst
@@ -0,0 +1,19 @@
+Core Concepts
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  README.md
+  cpp_data_feeding.md
+  functions_operators_layers.md
+  program.md
+  variable.md
+  var_desc.md
+  tensor.md
+  tensor_array.md
+  lod_tensor.md
+  block.md
+  scope.md
+  executor.md
+  parallel_executor.md
diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md
new file mode 100644
index 0000000000000000000000000000000000000000..748488f6d5f2f1272e87b89047570632418da8dc
--- /dev/null
+++ b/doc/fluid/design/concepts/lod_tensor.md
@@ -0,0 +1,211 @@
+# Design Doc: LoD (Level-of-Detail) Tensor
+
+Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>recursive RNN </td>
+<td>Support </td>
+<td>Support </td>
+</tr>
+<tr>
+<td>padding zeros </td>
+<td> Must </td>
+<td>No need </td>
+</tr>
+<tr>
+<td> blob data type </td>
+<td> Tensor</td>
+<td> LoDTensor </td>
+</tr>
+</tbody>
+</table>
+
+
+PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
+
+
+## The Challenge: Variable-length Sequences
+
+Most deep learning systems represent a mini-batch as a Tensor.  For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector.  Suppose that all sentences have the same length L, we can represent this mini-batch by a NxLxD tensor.
+
+Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
+
+The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
+
+
+## A Solution: The LoD Index
+
+To understand our solution, it is best to look at some examples.
+
+### A Mini-Batch of Sentences
+
+Let's imagine a mini-batch of 3 variable lengths sentences composed of 3, 1, and 2 words, respectively.  We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
+
+```
+3   1 2
+||| | ||
+```
+
+where each `|` represents a D-dimensional word vector.  The numbers, 3, 1, and 2, form a 1-level LoD.
+
+### Recursive Sequences
+
+Let check another example of a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, and each sentence consists of a variable number of words:
+
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
+
+### A Mini-Batch of Videos
+
+LoD tensors generalize to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
+
+```
+3     1  2
+口口口 口 口口
+```
+
+The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
+
+### A Mini-Batch of Images
+
+In traditional cases like a mini-batch with N fixed-sized images,  the LoD Tensor representation is as
+
+```
+1 1 1 1     1
+口口口口 ... 口
+```
+
+In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
+
+```
+口口口口 ... 口
+```
+
+### Model Parameters
+
+A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
+
+
+## The LoD Tensor
+
+Let us revisit above example of the 2-level LoD Tensor
+
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
+
+It is indeed a tree, where leaves are elementary sequences identified by **branches**.
+
+For example, the third sentence in above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4.
+
+### The LoD Index
+
+We can save the LoD index in the above example
+
+```
+3           1  2
+3   2  4    1  2  3
+```
+
+in a not-full 2D matrix:
+
+```c++
+typedef std::vector<std::vector<int> > LoD;
+```
+
+where
+
+- `LoD.size()` is the number of levels, or the maximum length of branches,
+- `LoD[i][j]` is the length of the j-th segment at the i-th level.
+
+## The Offset Representation
+
+To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
+
+In the above example, we accumulate the length of elementary sequences:
+
+```
+3 2 4 1 2 3
+```
+
+into offsets
+
+```
+0  3  5   9   10  12   15
+   =  =   =   =   =    =
+   3  2+3 4+5 1+9 2+10 3+12
+```
+
+so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.
+
+Similarly, the lengths in the top level LoD
+
+```
+3 1 2
+```
+
+are transformed into offsets of elements/words as follows:
+
+```
+0 3 4   6
+  = =   =
+  3 3+1 4+2
+```
+
+## Slicing of LoD Tensors
+
+
+When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
+
+For example, the <2>-slice of above example is
+
+```
+10      15
+10  12  15
+  || |||
+```
+
+and the <2,0>-slice of above slice is
+
+```
+10  12
+  ||
+```
+
+## Length Representation vs Offset Representation
+
+The offset representation is an implementation-oriented decision and it makes understanding the idea behind LoDTensor difficult.
+Hence, we encapsulate this implementation detail in C++ and expose the original length representation in our Python API. 
+Specifically, we call this length representation `recursive_sequence_lengths` and users can use the following code to set or get the `recursive_sequence_lengths` of a LoDTensor in Python:
+```Python
+# length representation of lod called recursive_sequence_lengths
+recursive_seq_lens = [[3, 1, 2], [2, 2, 1, 3, 1, 2]]
+# Create a LoDTensor that has the above recursive_sequence_lengths info.
+# This recursive_sequence_lengths will be converted to an offset representation of LoD in the C++ implementation under the hood.
+tensor = fluid.LoDTensor(lod)
+
+# Set/Change the recursive_sequence_lengths info of LoDTensor
+tensor.set_recursive_sequence_lengths([[3, 1, 2]])
+# Get the recursive_sequence_lengths info of a LoDTensor (the offset-based LoD representation stored in C++ will be converted 
+# back to length-based recursive_sequence_lengths), new_recursive_seq_lens = [[3, 1, 2]]
+new_recursive_seq_lens = tensor.recursive_sequence_lengths()
+```
diff --git a/doc/fluid/design/concepts/parallel_executor.md b/doc/fluid/design/concepts/parallel_executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..4f88e27bed722e9f2f535e368926fe49b4e72e56
--- /dev/null
+++ b/doc/fluid/design/concepts/parallel_executor.md
@@ -0,0 +1,104 @@
+# ParallelExecutor
+
+## Background
+
+Neural network models are defined as a `ProgramDesc` in Fluid. The `ProgramDesc` can be executed by an interpreter(i.e. the `executor` concept in Fluid). The instructions or operators in a `Program` will be executed, and the results will be fetched in Python side.
+
+The executor is a very naive interpreter. It runs operators one by one. We can use `Parallel.Do` to support data parallelism, however, lacking device information in `ProgramDesc`; it is not possible to optimize the performance of `Parallel.Do`.
+
+We want a `ProgramDesc` can be run on different nodes. It is better not to contain device information in `ProgramDesc`. However, we can write a high-performance interpreter, which can hold an alternative intermediate representation of `ProgramDesc`, to take full usage of Multi-GPUs. 
+
+ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs.
+
+
+## Overview of MultiGPUs logic
+
+The ParallelExecutor takes the startup program and main program as inputs. The parameters will be initialised on `GPU0` by startup program and will broadcast to multi-GPUs. The main program will be duplicated into multi-GPUs. The gradient will be merged during each iteration, and each device will optimize parameters independently. Since the gradients on each device will be merged before parameter optimization, the parameters will be the same on each device and it does not need to be broadcast the parameters.
+
+![alt](images/parallel_executor_overview.png)
+
+There are several optimizations for this logic.
+
+1. We use an alternate representation in ParallelExecutor. It because the device information is critical for performance optimization.
+2. The execution is out-of-order, i.e., an operator will be executed whenever the inputs of the operator are ready. 
+   * GPU is a high-performance device; only one CPU thread cannot fulfil one GPU. So there is a thread pool to execute operators.
+   * Out-of-order also helps transpilers to generate `ProgramDesc`. It is no need to concern about the best order of performance when implementing a transpiler.
+3. The streams of computation, merge gradients and fetch data are different.
+
+The performance of `ResNeXt152` on `TitanX` which `batch_size=12` is shown below.
+
+| Number of GPUs | 1 | 2 | 3 | 4|
+| --- | --- | --- | --- | --- |
+| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
+| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |
+
+
+## Static single assignment Graph
+
+[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form)(`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we uses an `SSA` graph as an intermedia representation of `ProgramDesc`.
+
+The `Program` is a directed acyclic graph, since a variable can be assigned multiple times. We enforce a variable will be assigned once, by adding version number to varaibles. We parsing the `Program` into a `SSA` graph. Also, ProgramExecutor duplicate `Program` into multi-devices. We also add a device number to varaibles and insert `NCCLAllReduce` into Graph.
+
+The data structure of `SSA` graph is:
+
+```c++
+struct VarHandleBase {
+  OpHandleBase* generated_op_;
+  vector<OpHandleBase*> pending_ops_;
+  
+  string name;
+  Place place;
+  size_t version;
+};
+
+struct OpHandleBase {
+  vector<OpHandleBase*> inputs_;
+  vector<OpHnadleBase*> outputs_;
+};
+
+struct SSAGraph {
+  // vars on each devices. 
+  //   * the vars in each map in vector is on different device.
+  //   * the map is mapping a variable name to variable handles
+  //   with different versions
+  vector<std::unordered_map<string, vector<VarHandleBase>>> vars_;
+  
+  // All ops
+  vector<OpHandleBase> ops_;
+};
+```
+The variable handles are the wrapper of `Variables`. The operator handles are the wrapper of `OperatorBase`. Some `OpHandle` is not an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `AllReduceOpHandle` will use new device contexts.
+
+When the `ProgramDesc` converted into an `SSA` Graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem is also need to be taken care. The dummy variables, which represent the dependency between operators, will be manually inserted into SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem.
+
+## Execute SSA Graph
+
+The SSA graph can be out-of-order executed by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is
+
+1. Maintaining a map of an operator and its needed input number.
+2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators.
+3. If there is an operator which needed input number is decreased to zero, just run this operator.
+4. After run this operator, just mark the variables are generated and repeat step 2 until all variables are generated.
+
+Running an operator can be asynchronized. There is a thread pool to execute an `SSA` graph.
+
+## Synchronize GPU Kernels
+
+The GPU is a non-blocking device. The different streams need be synchronized when switching streams. In current implementation, the synchronization based on the following algorithm:
+
+1. `OpHandle` will record `DeviceContext` that it is used.
+2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable.
+
+The `wait` are implemented by two strategies:
+
+1. Invoke `DeviceContext->Wait()`, It will wait all operators on this device contexts complete.
+2. Uses `cudaStreamWaitEvent` to sending a event to the stream. It is a non-blocking call. The wait operators will be executed in GPU.
+
+Generally, the `cudaStreamWaitEvent` will have a better perforamnce. However, `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed in runtime.
+
+## What's next?
+
+* Merging gradient of dense parameters has been done. However, the merging of sparse parameters has not been done.
+* The CPU version of Parallel Executor has not been implemented. The out-of-order logic will make CPU compuatation faster, too.
+* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPUs training without much loss of precision.
+* Combine multi-Nodes implementation. By the benifit of out-of-order, sending and recving operator can be an blocking operator, and the transpiler does not need to concern about the best position of operator.
diff --git a/doc/fluid/design/concepts/program.md b/doc/fluid/design/concepts/program.md
new file mode 100644
index 0000000000000000000000000000000000000000..cfcd21ecdb9d2844bf93ed98a56db09651077c40
--- /dev/null
+++ b/doc/fluid/design/concepts/program.md
@@ -0,0 +1,139 @@
+# Design Doc: PaddlePaddle Programs
+
+## Compile and Execution
+
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
+
+A simple example PaddlePaddle program can be found in [graph.md](../others/graph.md):
+
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+
+The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message.  The last line runs it.
+
+## Programs and Blocks
+
+The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+
+The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
+
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
+  }
+  return 0;
+}
+```
+
+The following PaddlePaddle program has three blocks:
+
+```python
+import paddle as pd  // block 0
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():  // block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  // block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+## `BlockDesc` and `ProgramDesc`
+
+All protobuf messages are defined in `framework.proto`.
+
+`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+
+```protobuf
+message BlockDesc {
+  required int32 parent = 1;
+  repeated VarDesc vars = 2;
+  repeated OpDesc ops = 3;
+}
+```
+
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
+
+
+### Global Block
+
+The global block is the first one in the above array.
+
+## Operators that Use Blocks
+
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+
+The definition of `OpDesc` shows that an operator could have some attributes:
+
+```protobuf
+message OpDesc {
+  AttrDesc attrs = 1;
+  ...
+}
+```
+
+and an attribute could be of type block, which is, in fact, a block ID as described above:
+
+```
+message AttrDesc {
+  required string name = 1;
+
+  enum AttrType {
+    INT = 1,
+    STRING = 2,
+    ...
+    BLOCK = ...
+  }
+  required AttrType type = 2;
+
+  optional int32 block = 10; // when type == BLOCK
+  ...
+}
+```
+
+## InferShape
+
+With this design, the InferShape function should take the following parameters:
+
+```c++
+void InferShape(int current_block,
+                int current_operator,
+                ProgramDesc* program // might change VarDesc values.
+                ) {
+  ...
+}
+```
+
+where
+
+- `current_block` indices into `ProgramDesc::blocks`,
+- `current_operator` indices into `BlockDesc::ops`.
diff --git a/doc/fluid/design/concepts/python_data_feeding.md b/doc/fluid/design/concepts/python_data_feeding.md
new file mode 100644
index 0000000000000000000000000000000000000000..dffee8e02bacbc99bdfa8c54f1a146de340ad778
--- /dev/null
+++ b/doc/fluid/design/concepts/python_data_feeding.md
@@ -0,0 +1,130 @@
+# Python Data Feeding
+
+In the former implementation of Paddle Fluid, there are two ways to feed data:
+
+- Use `reader_op` in backend C++ side. This method only supports data feeding from recordio files and random data generators, but supports many kinds of `decorated_readers`. For examples, `double_buffer_reader` uses two threads to achieve better performance: one for time-consuming I/O operations, and the other for `Executor::Run()`. See [C++ Data Feeding](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/cpp_data_feeding.md) for details.
+
+- Feed data directly using `DataFeeder.feed()` in Python codes. It is more flexible than the first way. Many kinds of preprocessing steps can be performed before feeding using Python or any other languages, instead of adding many uncommon `operators` in C++ side. But this method is less efficient: the program cannot read the next mini-batch data before `Executor::Run()` ends. Moreover, `decorated_readers` such as `double_buffer_reader` cannot be used for better performance.
+
+In this document, we design a Python Data Feeding process combining the efficiency of the first way and the flexibility of the second way. A data queue `LoDTensorBlockingQueue` is designed to be shared by the Python and C++ side, while `LoDTensorArray` is pushed into the queue in Python side and `reader_op` in C++ side reads out the data from the queue.
+
+
+## Design of LoDTensorBlockingQueue
+`LoDTensorBlockingQueue` is a blocking queue with a fixed `capacity` and accepts `std::vector<framework::LoDTensor>` with shapes indicated by `dims`. Since `LoDTensorBlockingQueue` must be constructed using `capacity` and `dims`, it cannot be a `Variable` type. Therefore, a `LoDTensorBlockingQueueHolder` is designed to defer construction of `LoDTensorBlockingQueue`.
+
+```C++
+class LoDTensorBlockingQueueHolder;
+
+class LoDTensorBlockingQueue {
+  friend class LoDTensorBlockingQueueHolder;
+ private:
+  // `LoDTensorBlockingQueue` can only be constructed by 
+  // `LoDTensorBlockingQueueHolder::InitOnce()`
+  LoDTensorBlockingQueue(size_t capacity, const std::vector<framework::DDim>& dims);
+ 
+ public:
+  size_t Size() const { return queue_.Size(); } // Get the current size of the queue
+
+  size_t Cap() const { return queue_.Cap(); }// Get the capacity of the queue
+
+  void Close() { return queue_.Close(); }
+
+  bool IsClosed() const { return queue_.IsClosed(); }
+
+  // Block if Size() == Cap()
+  // Return false only when queue_.IsClosed() == true
+  bool Push(const std::vector<framework::LoDTensor> &lod_tensor_vec);
+  
+  // Block if Size() == 0.
+  // *Success == false when queue_.IsClosed() == true
+  std::vector<framework::LoDTensor> Pop(bool *success = nullptr);
+ 
+ private:
+  // Use reader::BlockingQueue as the inner data structure
+  BlockingQueue<std::vector<framework::LoDTensor>> queue_;
+  std::vector<framework::DDim> dims_;
+};
+
+class LoDTensorBlockingQueueHolder {
+ public:  
+  // Call the constructor of `LoDTensorBlockingQueue` to create queue_
+  // `InitOnce` can only called once, otherwise an exception would raise
+  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
+    PADDLE_ENFORCE(queue_ == nullptr);
+    queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
+  }
+
+  const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const { return queue_; }
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+```
+
+There are some major things that must be concerned:
+- `LoDTensorBlockingQueueHolder` should be a `Variable` in global scope, so that `reader_op` can find it when reading data.
+- A `Variable` of `LoDTensorBlockingQueueHolder` but not `VarDesc` must be created in Python code before `Executor::Run()` so that `Executor::Run()` can get the feeding data when it is called.
+- `Create_reader_op` should accept the name of the `LoDTensorBlockingQueueHolder` variable as an input.
+
+
+## Release of the GIL in pybind
+`Pybind11::gil_scoped_release` is used to release GIL (Global Interpreter Lock) when `LoDTensorBlockingQueue::Push()` or `Executor::Run()` method are invoked in Python side, making `LoDTensorBlockingQueue::Push()` and `Executor::Run()` run in parallel.
+
+
+## Design of PyReader
+`PyReader` is a reader which holds a `LoDTensorBlockingQueue` object.
+```C++
+class PyReader : public ReaderBase {
+ public:
+  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
+  
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    bool success;
+    *out = queue_->Pop(&success);
+    if (!success) out->clear();
+  }
+  
+  void ReInit() override { return; }
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+```
+
+
+## Design of CreatePyReaderOp
+`CreatePyReaderOp` is used to create the `PyReader` object. It requires an input `blocking_queue` which indicates the name of the `LoDTensorBlockingQueueHolder` variable.
+```C++
+class CreatePyReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) return;
+    
+    const std::string& queue_name = Input("blocking_queue");
+    auto* queue_holder_var = scope.FindVar(queue_name);
+    PADDLE_ENFORCE(queue_holder_var != nullptr);
+		auto* queue_holder = queue_holder_var
+                    ->template GetMutable<framework::LoDTensorBlockingQueueHolder>();
+    out->Reset(new PyReader(queue_holder->GetQueue()));
+  }
+};
+```
+
+## Design of Python codes
+The design of Python codes are as follows. First, we construct a variable of `LoDTensorBlockingQueueHolder` and init it with given parameters, returning the `LoDTensorBlockingQueue` object after initialization. After that, a layer of `CreatePyReaderOp` is constructed and accepts the name of the `LoDTensorBlockingQueueHolder` variable. The `LoDTensorBlockingQueue` object and result of the layer are both returned.
+```Python
+def py_reader(capacity, shapes):
+  queue_name = unique_name.generate("lod_tensor_blocking_queue")
+  var = global_scope().var(feeder_name) # create LoDTensorBlockingQueueHolder Variable
+  feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) # init the queue
+  out = create_var()
+  create_py_reader_op_with_queue_name(
+      inputs={'blocking_queue': queue_name},
+      outputs={'Out':[out]})  
+  return out, feed_queue
+```
diff --git a/doc/fluid/design/concepts/scope.md b/doc/fluid/design/concepts/scope.md
new file mode 100644
index 0000000000000000000000000000000000000000..dcf76649357aaef80d6bc1a933ece8c4c1063547
--- /dev/null
+++ b/doc/fluid/design/concepts/scope.md
@@ -0,0 +1,124 @@
+# Design of Scope in Paddle
+
+## Overview
+
+Scope is an important concept in programming languages, which defines a program region that a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable. And in another scope, this name may refer to other entity or nothing at all. It clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes:
+
+- Scope is an association of a name to variable.
+- Variables in a parent scope can be retrieved from local scope.
+
+A detailed explanation of these two attributes goes as following.
+
+
+## Scope is an association of a name to variable.
+
+Scope is an association of a name to variable. All variables belong to `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variable in the scope.
+
+
+1. Scope only contains a map of a name to variable.
+
+   All parameters, data, states in a Net should be variables and stored inside a scope. Each op should get inputs and outputs to do computation from a scope, such as data buffer, state (momentum) etc.
+
+1. Variable can only be created by Scope and a variable can only be got from Scope. User cannot create or get a variable outside a scope. This is a constraints of our framework, and will keep our framework simple and clear.
+
+1. Scope only contains methods that are used to Create and Get Variables. Scope do not contain Operators and have no information to run them.
+    `Net` is designed to drive the computation and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables.
+    - `Create` is used to create a Variable by its name and add the mapping relation.
+    - `Get` is used to find a Variable by name.
+
+1. Every variable only belongs to one certain Scope.
+
+   Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`.
+
+1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else.
+
+   Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed.
+
+```cpp
+class Scope {
+ public:
+  Variable* Var(const std::string& name);
+  const Variable* FindVar(const std::string& name) const;
+
+ private:
+    std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+
+
+## Parent scope and local scope
+
+Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope.
+
+1.  We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
+2.  Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when user get a variable from a scope, it will try to search this variable in current scope. If there is no such variable in the local scope, `scope` will keep searching from its parent, until the variable is found or there is no parent.
+
+```cpp
+class Scope {
+ public:
+  Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
+
+  Variable* FindVar(const std::string& name) const {
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+      return it->second.get();
+    } else if (parent_ != nullptr) {
+      return parent_->FindVar(name);
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  std::shared_ptr<Scope> parent_ {nullptr};
+};
+```
+
+In `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When user `Get` a variable by its `name`, the `name` will be searched inside the current scope. If the variable cannot be found locally and parent scope is not a `nullptr`, the variable will be searched inside that parent scope. `parent_` pointer's default value is `nullptr`. It means that the scope is a global scope when `parent_` is nullptr.
+
+A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily.
+
+## Interface Design
+
+```cpp
+class Variable {
+ private:
+  Variable() = default;
+  friend class Scope;
+};
+
+class Scope {
+ private:
+  Scope(const std::shared_ptr<Scope>& parent = nullptr);
+
+ public:
+  static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
+
+  // return nullptr if not found.
+  Variable* FindVar(const std::string& name) const;
+
+  // return if already contains same name variable.
+  Variable* Var(const std::string& name);
+
+ private:
+  std::shared_ptr<Scope> parent_;
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+## Only scope can create a variable
+
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`.
+
+## When scope destroyed, all variables inside this scope should be destroyed together
+
+The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together.
+
+## Sharing a parent scope
+
+Local scope contains a `parent_` pointer. It is a linked-list for scopes. Using a `shared_ptr` because when a local scope is using, its parents cannot be destroyed.
+
+Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. We cannot construct a scope variable, because it cannot be passed to other scope as `parent` pointer.
+
+## Orthogonal interface
+
+`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily.
diff --git a/paddle/fluid/framework/tensor.md b/doc/fluid/design/concepts/tensor.md
similarity index 100%
rename from paddle/fluid/framework/tensor.md
rename to doc/fluid/design/concepts/tensor.md
diff --git a/doc/design/tensor_array.md b/doc/fluid/design/concepts/tensor_array.md
similarity index 100%
rename from doc/design/tensor_array.md
rename to doc/fluid/design/concepts/tensor_array.md
diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md
new file mode 100644
index 0000000000000000000000000000000000000000..8db67f6703d142da71cf06bd4f7e2cb13556f9b0
--- /dev/null
+++ b/doc/fluid/design/concepts/var_desc.md
@@ -0,0 +1,100 @@
+# Design Doc: Var_desc
+
+## Background
+PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
+
+PaddlePaddle uses proto message to describe compile time program because :
+
+1. The computation program description must be serializable and saved in a file.
+1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker.
+
+The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data(i.e. `Variable`)  and  `Operations`. The concept to represent them is in the table below.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>compile time</th>
+<th>runtime</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data </td>
+<td>VarDesc(proto) </td>
+<td>Variable(cpp) </td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>OpDesc(proto) </td>
+<td>Operator(cpp) </td>
+</tr>
+</tbody>
+</table>
+
+
+## Definition of VarType
+
+A VarDesc should have a name, type and whether or not it is persistable. There are different kinds of variable types supported in PaddlePaddle, apart from the POD_Types like: `LOD_TENSOR`, `SELECTED_ROWS`, `FEED_MINIBATCH`, `FETCH_LIST`, `STEP_SCOPES`, `LOD_RANK_TABLE`, `LOD_TENSOR_ARRAY`, `PLACE_LIST`, `READER` and `CHANNEL`. These are declared inside `VarType`. A `VarDesc` then looks as the following:
+
+```proto
+message VarDesc {
+  required string name = 1;
+  required VarType type = 2;
+  optional bool persistable = 3 [ default = false ];
+}
+```
+
+## Definition of TensorDesc
+
+```proto
+message TensorDesc {
+  // Should only be PODType. Is enforced in C++
+  required Type data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+```
+
+The `Type` here comes from the enum defined inside of `VarType` :
+
+```proto
+enum Type {
+  // Pod Types
+  BOOL = 0;
+  INT16 = 1;
+  INT32 = 2;
+  INT64 = 3;
+  FP16 = 4;
+  FP32 = 5;
+  FP64 = 6;
+
+  // Other types that may need additional descriptions
+  LOD_TENSOR = 7;
+  SELECTED_ROWS = 8;
+  FEED_MINIBATCH = 9;
+  FETCH_LIST = 10;
+  STEP_SCOPES = 11;
+  LOD_RANK_TABLE = 12;
+  LOD_TENSOR_ARRAY = 13;
+  PLACE_LIST = 14;
+  READER = 15;
+  CHANNEL = 16;
+}
+```
+
+A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please reference [`SelectedRows`](./selected_rows.md).
+
+## Definition of LodTensorDesc
+
+```proto
+message LoDTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2 [ default = 0 ];
+}
+```
+
+A LoDTensorDesc contains a tensor and a lod_level.
+
+## Definition of Variable in Python
+
+For Variable in Python, please reference [`Python API`](./python_api.md).
diff --git a/paddle/fluid/framework/variable.md b/doc/fluid/design/concepts/variable.md
similarity index 100%
rename from paddle/fluid/framework/variable.md
rename to doc/fluid/design/concepts/variable.md
diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md
new file mode 100644
index 0000000000000000000000000000000000000000..df67438bcc741ac521b00ee962fc13c93db21182
--- /dev/null
+++ b/doc/fluid/design/concurrent/channel.md
@@ -0,0 +1,139 @@
+# Channel Design
+
+## Introduction
+
+A Channel is a data structure that allows for synchronous interprocess
+communication via message passing.  It is a fundemental component of CSP
+(communicating sequential processes), and allows for users to pass data
+between threads without having to worry about synchronization.
+
+## How to use it
+
+Paddle offers python APIs to open and close channels, along with sending
+and receiving data to/from a channel.
+
+### Create a channel
+
+Creates a new channel that takes in variables of a specific dtype.
+
+- **fluid.make_channel(dtype, capacity=0)**
+  - **dtype**: The data type of variables being sent/received through channel
+  - **capacity**: The capacity of the channel.  A capacity of 0 represents
+    an unbuffered channel.  Capacity > 0 represents a buffered channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, 10)
+```
+
+### Close a channel
+
+Closes a channel.  Any pending senders and receivers will be awoken during
+this time.  Receivers can still receive from a closed channel, but senders
+are not allowed to send any additional data to the channel (Paddle will
+raise an exception if users try to send to a closed channel.)
+
+- **fluid.channel_close(channel)**
+
+```
+fluid.channel_close(ch)
+```
+
+### Send data to a channel
+
+Sends a variable to a channel.  Currently, variables of dtype `LoDTensor`,
+`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and
+`ChannelHolder` are supported.
+
+By default, the data of the Variable is moved from the sender to the receiver,
+however the user can optionally copy the data before performing the send.
+
+- **channel_send(channel, variable, is_copy=False)**
+  - **channel**: The channel to send the variable to
+  - **variable**: The variable to send to the channel
+  - **is_copy**: If set to True, channel_send will perform a variable assign
+  to copy the source variable to a new variable to be sent.
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100)
+fluid.channel_send(ch, var, True)
+```
+
+### Receive data from a channel
+
+Receives a variable from a channel.  The data of the variable is moved to the
+receiving variable.
+
+- **channel_recv(channel, return_variable)**
+  - **channel**: The channel to receive the variable from
+  - **return_variable**: The destination variable used to store the data of the
+  variable received from the channel
+
+```
+ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1)
+fluid.channel_recv(ch, var)
+```
+
+## How it Works
+
+Channels provides a simple interface for different threads to share data.
+To support the synchronization requirements, channels utilizes a series of
+internal queues, locks, and conditional variables.
+
+### QueueMessage
+
+QueueMessage encapsulates the state of the channel send/receive operation to be
+put in the **sendq/recvq**.  It contains a condition variable used to lock the
+thread (when there are no available sends/receives).  In addition, it contains
+a callback function to notify a thread when the QueueMessage is being
+processed by the channel.
+
+### Queues
+
+- **buff_**: This queue holds the data buffer in a buffered channel.  The
+capacity is set to the capacity of the channel.  This data buffer is not
+used in an unbuffered channel.
+
+- **sendq**: This queue holds the QueueMessage of any pending senders of a
+channel.  When a thread performs a channel_send operation on the channel, the
+channel_send operation will put a new QueueMessage on the sendq and block the
+current thread under two conditions:
+  1. The channel is buffered and is full
+  2. The channel is unbuffered and does not have a receiver
+
+- **recvq**:  This queue holds the QueueMessage of any pending receivers of a
+channel.  When a thread performs a channel_recv operation on the channel, the
+channel_recv operation will put a new QueueMessage on the recvq and block the
+current thread under two conditions:
+  1. The channel is buffered and there is no data on the buff_
+  2. The channel is unbuffered and does not have a sender
+
+### State diagram
+
+#### Channel Send
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_send.png"/><br/>
+</p>
+
+#### Channel Receive
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/channel_recv.png"/><br/>
+</p>
+
+## Limitations and Considerations
+
+### Variable Copy
+
+In golang, variables in channels are copied from the sender to the receiver.
+In Paddle, the data from our variables are **moved** from sender to receiver.
+As a result, these variables should not be used after they are sent.  We
+provide a flag in channel_send method to allow users to copy the variable to
+be sent before it is sent.  
+
+Please note that this is acheived by adding an **assign** operator and creating
+a temporary variable that is sent in place of the original variable.  Please
+note that **assign** operator has limited support for only certain variables
+datatypes.
diff --git a/doc/fluid/design/concurrent/concurrent_programming.md b/doc/fluid/design/concurrent/concurrent_programming.md
new file mode 100644
index 0000000000000000000000000000000000000000..0428e74f9e00a87f6b0972057f48479b8ae56ad6
--- /dev/null
+++ b/doc/fluid/design/concurrent/concurrent_programming.md
@@ -0,0 +1,193 @@
+# Design Doc: Concurrent Programming with Fluid
+
+With PaddlePaddle Fluid, users describe a program other than a model.  The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model.   
+
+Many know that when we program TensorFlow, we can specify the device on which each operator runs.  This allows us to create a concurrent/parallel AI application.   An interesting questions is **how does a `ProgramDesc` represents a concurrent program?**  
+
+The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program.  So users just program a concurrent program that they do with any concurrent programming language, e.g., [Go](https://golang.org).
+
+## An Analogy
+
+The following table compares concepts in Fluid and Go
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Go</th>
+<th>Fluid</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>user-defined functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid">layers</a></td>
+<td></td>
+</tr>
+<tr>
+<td>control-flow and built-in functions </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators">intrinsics/operators</a></td>
+<td></td>
+</tr>
+<tr>
+<td>goroutines, channels </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework/thread_pool.h">class ThreadPool</a></td>
+<td></td>
+</tr>
+<tr>
+<td>runtime </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h">class Executor</a></td>
+<td></td>
+</tr>
+</tbody>
+</table>
+
+
+## An Example Concurrent Program
+
+To review all above concepts in an example, let us take a simple program and writes its distributed version.
+
+Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
+
+```go
+import "fluid"
+
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+
+Please be aware that the Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in above program and creates the following `ProgramDesc` message.
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message.
+
+The default `main` function is defined as follows:
+
+```go
+func main() {
+  paddlepaddle()
+  fluid.run()
+}
+```
+
+## The Concurrent Version
+
+By parallelizing the above program, we could support very big tensor X by splitting into small pieces {x_1, x_2, ...} and sent each piece to worker process/node for parallel multiplication.
+
+In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
+
+### The Master Program
+
+The master program could look like the following:
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, L, Y],
+    ops = [
+      read(output = X)
+      kube_get_workers_addrs(output = L)
+      Y = tensor_array(len(L))
+      parallel_for(input = X, output = Y,
+                   attrs = {L, block_id(1)}) # referring to block 1
+    ]
+  }
+
+  block[1] = Block {
+    parent = 0,
+    vars = [x, y, index],
+    ops = [
+      slice(input = [X, index], output = x) # index is initialized by parallel_for
+      send(input = x, attrs = L[index])
+      recv(outputs = y, attrs = L[index])
+      assign(input = y, output = Y[index])
+    ]
+  }
+}
+```
+
+The equivalent Fluid program (calling the Go binding) is:
+
+```go
+func main() {  //// block 0
+  X = fluid.read(...)
+  L = fluid.k8s.get_worker_addrs()
+  Y = fluid.tensor_array(len(L))
+  fluid.parallel_for(X, L,
+                     func(index int) {  //// block 1
+                       x = X[index]
+                       fluid.send(L[index], x)
+                       y = fluid.recv(L[index])
+                       Y[index] = y
+                     })
+}
+```
+
+An explanation of the above program:
+
+- `fluid.k8s` is a package that provides access to Kubernetes API.  
+- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).  
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+
+  1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
+  2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
+     1. creates an Executor instance, and
+     2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
+1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
+
+### The Worker Program
+
+The worker program looks like
+
+```go
+func main() {
+  W = Tensor(...)
+  x = fluid.listen_and_do(
+        fluid.k8s.self_addr(),
+        func(input Tensor) {
+          output = fluid.mult(input, W)
+        })
+}
+```
+
+where
+
+- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
+  1. listens on the current pod's IP address, as returned by `fliud.k8s.self_addr()`,
+  2. once a connection is established,
+     1. creates a scope of two parameters, "input" and "output",
+     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h) and saves it into "input",
+     3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
+
+## Summarization
+
+From the above example, we see that:
+
+1. Fluid enables the imperative programming paradigm by:
+   1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
+   2. call the `fluid.run` function that runs the program implicitly.
+1. The program is described as a `ProgramDesc` protobuf message.
+2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
+3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
+4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and execute intrinsics/operators' `Run` method sequentially as they appear in the `Block.ops` array.
+5. Intrinsics/operators' `Run` method might create threads.  For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
+6. Threads are not necessarily OS thread; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool.  Multiple green threads might run on the same OS thread.  An example green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/fluid/design/concurrent/csp.md b/doc/fluid/design/concurrent/csp.md
new file mode 100644
index 0000000000000000000000000000000000000000..66d19f44baf861c7847e81ca83f61024ec877faf
--- /dev/null
+++ b/doc/fluid/design/concurrent/csp.md
@@ -0,0 +1,251 @@
+# Design Doc: CSP in PaddlePaddle Fluid
+
+## Motivation
+
+Concurrent programming is important for deep learning.  Few example applications are:
+
+1.  The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
+2.  The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
+
+Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
+
+## Concurrent Programming Models
+
+There were many concurrent programming models, implemented in various forms:
+
+<table>
+<thead>
+<tr>
+<th>concurrent programming model</th>
+<th>implementation</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>mutex </td>
+<td>types and functions in standard libraries </td>
+</tr>
+<tr>
+<td>semaphore </td>
+<td> types and functions in standard libraries </td>
+</tr>
+<tr>
+<td> communicating sequential processes (CSP)  </td>
+<td> Go programming language </td>
+</tr>
+<tr>
+<td> actor model  </td>
+<td> Erlang programming language </td>
+</tr>
+<tr>
+<td> message passing  </td>
+<td> MPI </td>
+</tr>
+<tr>
+<td> bulk synchronous parallel (BSP)   </td>
+<td> Pregel distributed programming framework </td>
+</tr>
+</tbody>
+</table>
+
+
+Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
+
+### CSP v.s. Actor Model
+
+A well-known implementation of Actor Model is the Erlang programming language.  In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs.  We can find the three ingredients, process with ID, send, and recv, in MPI too.  Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code.  Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
+
+## CSP in Fluid
+
+Fluid has two fundamental control-flows: *if-else* and *while*.  If we are to implement CSP, we need the following:
+
+1. a new data type: *channel* and operators *send* and *recv*,
+1. *goroutine* or thread, and
+1. a new control-flow: select.
+
+We also need Python wrappers for the above components.
+
+The type *channel* is conceptually the blocking queue.  In Go, its implemented is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
+
+The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.
+
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
+
+### Type Channel
+
+Fluid supports many data types:
+
+1. Tensor,
+1. Row-sparse Tensor
+1. LoD Tensor,
+1. Tensor array, etc
+
+Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value.  To add a new type channel, we need to add a new type enum.
+
+To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file.  [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example how we expose C++ class LoDTensor.
+
+## Syntax Design
+
+### Create Channel
+
+In Go, we create a channel by specifying the element type and buffer size:
+
+```go
+ch  := make(chan int)       // a channel without buffer
+ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch  = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, 100)
+```
+
+In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
+
+```python
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
+```
+
+or Tensors of Tensors of float16 etc.
+
+The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
+
+### Send and Recv
+
+Go's CSP implementation depends on data type *channel*. There are two types of channels:
+
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. The sending to buffered channel blocks if the buffer is full, and the receive operation blocks if the buffer is empty.
+1. blocked channel, or unbuffered channel, is a blocking queue with no buffer.  Both sending and receiving block with unbuffered channels.
+
+There are four types of actions with a channel:
+
+1. Create a channel
+
+   ```go
+   ch := make(chan int) // this is an unbuffered channel
+   ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+   ```
+
+1. Send
+
+   ```go
+   ch <- 111
+   ```
+
+1. Recv
+
+   ```go
+   y, ok <- ch
+   ```
+
+1. Close
+
+   ```go
+   close(ch)
+   ```
+
+   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
+
+1. A send to a nil channel blocks forever
+
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
+
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+
+The following program illustrates the Python syntax for accessing Fluid buffers.
+
+```python
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, buffer_size)
+
+# Now write three elements to the channel
+with fluid.while(steps=buffer_size):
+  fluid.send(ch, step)
+
+fluid.close_channel(ch)
+
+with fluid.while(steps=buffer_size):
+  fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
+
+ch = fluid.make_channel(dtype=INT)
+
+with fluid.go():
+  fluid.send(ch)
+
+y = fluid.recv(ch)
+
+fluid.close_channel(ch)
+```
+
+### Select
+
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1  := make(chan int)       
+ch2  := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+      x := x + 1
+    case y <- ch2:
+      fmt.Println("Received on channel")
+    default:
+      fmt.Println("Default")
+    }
+  }
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1  = fluid.make_chan(dtype=INT)
+ch2 = fluid.make_chan(dtype=INT, 100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+    fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+    fluid.print("Received on Channel")
+
+with sel.default():
+    fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
+## Example Programs
+
+### 1. RPC between Trainers and Parameter Servers
+
+### 2. Concurrent Minibatch Loading
diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..c18b788e80f432ebb2f14b15229e7823c112001e
--- /dev/null
+++ b/doc/fluid/design/concurrent/go_op.md
@@ -0,0 +1,231 @@
+# go_op Design
+
+## Introduction
+
+The **go_op** allows user's of PaddlePaddle to run program blocks on a detached
+thread.  It works in conjuction with CSP operators (channel_send, 
+channel_receive, channel_open, channel_close, and select) to allow users to
+concurrently process data and communicate easily between different threads.
+
+## How to use it
+
+```
+channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+with fluid.Go():
+    # Send a tensor of value 99 to "channel" on a detached thread
+    tensor = fill_constant(shape=[1], dtype='int', value=99)
+    tensor.stop_gradient = True
+    fluid.channel_send(channel, tensor)
+    
+# Receive sent tensor from "channel" on the main thread
+result = fill_constant(shape=[1], dtype='int', value=-1)    
+fluid.channel_recv(ch, result)  
+```
+
+The go operator can be accessed by using the fluid.Go() control flow.  This
+will create a new sub block, where the user can add additional operators
+to be ran on the thread.
+
+**Note:** Since back propegation is currently not support in the go_op, users
+should ensure that operators in the go block does not require gradient 
+calculations.
+
+## How it Works
+
+Similar to other control blocks, go_op will create a sub block and add it
+as a child to the current block.  Operators and variables defined in this
+block will be added to the go sub_block.
+
+In addition, the go operator will create a new child scope whose parent is
+the global scope.  Please refer to [block captures](#block-captures) for more
+information.
+
+When Paddle executor runs go_op, go_op will take the sub_block and pass it to
+the executor.run method (along with a newly created local scope) on a detached
+thread.
+
+An example of the generated program description is shown below.  Take note of
+the **go_op** in particular.  It is added as an operator in the current 
+block (in this example, block0).  The **go_op** contains a `sub_block`
+attribute, which points to the id of the block that will be executed in a 
+detached thread.
+
+```
+blocks {
+  idx: 0
+  parent_idx: -1
+  vars {
+    name: "return_value"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: INT64
+        }
+      }
+    }
+  }
+  vars {
+    name: "status_recv"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: BOOL
+        }
+      }
+    }
+  }
+  ...
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "channel"
+    }
+    type: "channel_create"
+    attrs {
+      name: "data_type"
+      type: INT
+      i: 7
+    }
+    attrs {
+      name: "capacity"
+      type: INT
+      i: 0
+    }
+  }
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "channel"
+    }
+    type: "go"
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 1
+    }
+  }
+  ops {
+    inputs {
+      parameter: "Channel"
+      arguments: "channel"
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "return_value"
+    }
+    outputs {
+      parameter: "Status"
+      arguments: "status_recv"
+    }
+    type: "channel_recv"
+  }
+  ...
+}
+
+blocks {
+  idx: 1
+  parent_idx: 0
+  vars {
+    name: "status"
+    type {
+      type: LOD_TENSOR
+      lod_tensor {
+        tensor {
+          data_type: BOOL
+        }
+      }
+    }
+  }
+  ...
+  
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_1.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: 99.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 3
+    }
+  }
+  ops {
+    inputs {
+      parameter: "Channel"
+      arguments: "channel"
+    }
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_1.tmp_0"
+    }
+    outputs {
+      parameter: "Status"
+      arguments: "status"
+    }
+    type: "channel_send"
+    attrs {
+      name: "copy"
+      type: BOOLEAN
+      b: false
+    }
+  }
+```
+
+## Current Limitations
+
+#### <a name="block-captures"></a>Scopes and block captures:
+
+Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
+block.  When a block is executed, a new local scope is created from the parent
+scope (ie: scope derived from the parent block) and associated with the new 
+child block.  After the block finishes executing, then the local scope and
+all associated variables in the scope is deleted.
+
+This works well in a single threaded scenario, however with introduction of
+go_op, a child block may continue to execute even after the parent block has
+exited.  If the go_op tries to access variables located in the parent block's
+scope, it may receive a segmentation fault because the parent scope may have
+been deleted.
+
+We need to implement block closures in order to prevent access to parent
+scope variables from causing a segmentation fault.  As a temporary workaround,
+please ensure that all variables accessed in the go block is not destructed
+before it is being accessed.  Currently, the go_op will explicitly enforce 
+this requirement and raise an exception if a variable could not be found in 
+the scope.
+
+Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
+for more details.
+
+#### Green Threads
+
+Golang utilizes `green threads`, which is a mechnism for the runtime library to 
+manage multiple threads (instead of natively by the OS).  Green threads usually
+allows for faster thread creation and switching, as there is less overhead
+when spawning these threads.  For the first version of CSP, we only support
+OS threads.
+
+
+#### Backward Propegation:
+
+go_op currently does not support backwards propagation.  Please use go_op with
+non training operators.
diff --git a/doc/fluid/design/concurrent/images/channel_recv.png b/doc/fluid/design/concurrent/images/channel_recv.png
new file mode 100644
index 0000000000000000000000000000000000000000..c06cd15ae7b8a8c94d5742f6675e389081fcf789
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_recv.png differ
diff --git a/doc/fluid/design/concurrent/images/channel_send.png b/doc/fluid/design/concurrent/images/channel_send.png
new file mode 100644
index 0000000000000000000000000000000000000000..006ebb4a5a4bcd32c97847e9fb7729a740255f7c
Binary files /dev/null and b/doc/fluid/design/concurrent/images/channel_send.png differ
diff --git a/doc/fluid/design/concurrent/images/select_op_workflow.png b/doc/fluid/design/concurrent/images/select_op_workflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..719ed76f9d542d6c4f20c30f27656bb53325aa85
Binary files /dev/null and b/doc/fluid/design/concurrent/images/select_op_workflow.png differ
diff --git a/doc/fluid/design/concurrent/index_cn.rst b/doc/fluid/design/concurrent/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e47135e9fc42760898083710e0a6767252a0225b
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_cn.rst
@@ -0,0 +1,8 @@
+并发编程
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  concurrent_programming.md
+  parallel_do.md
diff --git a/doc/fluid/design/concurrent/index_en.rst b/doc/fluid/design/concurrent/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0727e75798b2a869588f80d3cce7a886554e4ffb
--- /dev/null
+++ b/doc/fluid/design/concurrent/index_en.rst
@@ -0,0 +1,8 @@
+Concurrent Programming
+-------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  concurrent_programming.md
+  parallel_do.md
diff --git a/doc/design/parallel_do.md b/doc/fluid/design/concurrent/parallel_do.md
similarity index 100%
rename from doc/design/parallel_do.md
rename to doc/fluid/design/concurrent/parallel_do.md
diff --git a/doc/fluid/design/concurrent/select_op.md b/doc/fluid/design/concurrent/select_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..4fcae57cc7932cdaebe549486e7f7cebf0bd038a
--- /dev/null
+++ b/doc/fluid/design/concurrent/select_op.md
@@ -0,0 +1,265 @@
+# select_op Design
+
+## Introduction
+
+In golang, the [**select**](https://golang.org/ref/spec#Select_statements)
+statement lets a goroutine wait on multiple communication operations at the
+same time. The **select** blocks until one of its cases can run, then
+executes the case.  If multiple cases are ready to run, then one case is
+choosen at random to be executed.
+
+With the introduction of CSP for Paddle, we mimic this behavior by
+creating a ***select_op***.
+
+## How to use it
+
+The **select_op** is available as a c++ operator.  However most users
+will prefer to use the much simplier Python API.
+
+- **fluid.Select()**: Creates a select operator and adds it to the current
+block within the main program.  Also creates a sub block and adds it to the
+main program.  This sub block is used to hold all variables and operators
+used by the case statements.
+
+Within the select block, users can add cases by
+calling **select.case** or **select.default** method.
+
+- **fluid.Select.case(channel_action, channel, result_variable)**: Represents
+a fluid channel send/recv case.  This method creates a SelectCase block
+guard and adds it to the Select block.  The arguments into this method tells
+the select which channel operation to listen to.
+
+- **fluid.Select.default()**: Represents the fluid default case.  This default
+case is executed if none of the channel send/recv cases are available to
+execute.
+
+**Example:**
+```
+ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+y = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
+
+while_cond = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
+while_op = While(cond=while_cond)    
+
+with while_op.block():
+    with fluid.Select() as select:
+        with select.case(fluid.channel_send, channel, x):
+            # Send x, then perform Fibonacci calculation on x and y
+            x_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+            assign(input=x, output=x_tmp)
+            assign(input=y, output=x)
+            assign(elementwise_add(x=x_tmp, y=y), output=y)
+        with select.case(fluid.channel_recv, quit_channel, result2):
+            # Exit out of While loop
+            while_false = fill_constant(shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
+            helper = layer_helper.LayerHelper('assign')
+            helper.append_op(
+                type='assign',
+                inputs={'X': [while_false]},
+                outputs={'Out': [while_cond]})
+```
+
+## How it Works
+
+### Program Description
+
+```
+blocks {
+  idx: 0
+  ...
+  // Create "case_to_execute" variable
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_110.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: -1.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 2
+    }
+  }
+  // Create "select" operator.
+  // inputs:
+  //   X: All input variables used by operators within the select block
+  //   case_to_execute: Variable filled in by select_op when it determines
+  //     which case to execute.
+  //  
+  // outputs:
+  //   Out: All output variables referenced by operators within select block.
+  //
+  // attrs:
+  //   sub_block: The block id containing the select "cases"
+  //   cases:  Serialized list of all cases in the select op.
+  //     Each case is serialized as: '<index>,<type>,<channel>,<value>'
+  //     where type is 0 for default, 1 for send, and 2 for receive.
+  //     No channel and values are needed for default cases.
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_103.tmp_0"
+      arguments: "fill_constant_104.tmp_0"
+    }
+    inputs {
+      parameter: "case_to_execute"
+      arguments: "fill_constant_110.tmp_0"
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_110.tmp_0"
+    }    
+    type: "select"
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 1
+    }
+    attrs {
+      name: "cases"
+      type: STRINGS
+      strings: "0,1,channel_101,fill_constant_109.tmp_0"
+      strings: "1,2,channel_102,fill_constant_108.tmp_0"
+    }
+  }
+  ...
+}
+```
+
+The python select API will add the **select_op** to the current block.  In addition, it will
+iterate through all it's case statements and add any input variables required by case statements
+into **X**.  It will also create a temp variable called **case_to_execute**.  This variable is
+filled in by the select_op after it has completed processing the case statements.
+
+If there are no available cases to execute (ie: all cases are blocked on channel operations, and
+there is no default statement), then the select_op will block the current thread.  The thread will
+unblock once there is a channel operation affecting one of the case statements, at which point, the
+**select_op** will set the **case_to_execute** variable to the index of the case to execute.
+
+Finally the select_op will call executor.run on the **sub_block**.
+
+```
+blocks {
+  idx: 1
+  parent_idx: 0
+  ...
+  // Fill a tensor with the case index (ie: 0,1,2,3,ect.)
+  ops {
+    outputs {
+      parameter: "Out"
+      arguments: "fill_constant_111.tmp_0"
+    }
+    type: "fill_constant"
+    attrs {
+      name: "force_cpu"
+      type: BOOLEAN
+      b: false
+    }
+    attrs {
+      name: "value"
+      type: FLOAT
+      f: 0.0
+    }
+    attrs {
+      name: "shape"
+      type: INTS
+      ints: 1
+    }
+    attrs {
+      name: "dtype"
+      type: INT
+      i: 2
+    }
+  }
+  // Create an "equal" operator to compare the case index with the "case_to_execute"
+  // tensor (which was filled in by the select op).
+  ops {
+    inputs {
+      parameter: "X"
+      arguments: "fill_constant_111.tmp_0"  // case 0
+    }
+    inputs {
+      parameter: "Y"
+      arguments: "fill_constant_110.tmp_0"  // case_to_execute
+    }
+    outputs {
+      parameter: "Out"
+      arguments: "equal_0.tmp_0"
+    }
+    type: "equal"
+    attrs {
+      name: "axis"
+      type: INT
+      i: -1
+    }
+  }
+  // Use the output of the "equal" operator as a condition for the "conditional_block".
+  // If the condition evaluates to true, then execute the "sub_block" (which represents
+  // the select case's body)
+  ops {
+    inputs {
+      parameter: "Params"
+    }
+    inputs {
+      parameter: "X"
+      arguments: "equal_0.tmp_0"
+    }
+    outputs {
+      parameter: "Out"
+    }
+    outputs {
+      parameter: "Scope"
+      arguments: "_generated_var_0"
+    }
+    type: "conditional_block"
+    attrs {
+      name: "is_scalar_condition"
+      type: BOOLEAN
+      b: true
+    }
+    attrs {
+      name: "sub_block"
+      type: BLOCK
+      block_idx: 4
+    }
+  }
+  ...
+  // Repeat the above operators for each case statements inside the select body
+}
+
+```
+
+Cases are represented by a **conditional_block operator**, whose's condition is set as the output of
+equal(**case_to_execute**, **case_index**).  Since each case index is unique in this sub-block,
+only one case will be executed.
+
+### select_op flow
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/select_op_workflow.png"/><br/>
+</p>
+
+The select algorithm is inspired by golang's select routine.  Please refer to
+http://www.tapirgames.com/blog/golang-concurrent-select-implementation for more information.
+
+## Backward Pass
+
+TODO
diff --git a/doc/fluid/design/data_type/float16.md b/doc/fluid/design/data_type/float16.md
new file mode 100644
index 0000000000000000000000000000000000000000..844d2aafcf257b85057e1ac200ed3d5cf0be2ff0
--- /dev/null
+++ b/doc/fluid/design/data_type/float16.md
@@ -0,0 +1,183 @@
+# Design Doc: float16
+
+## Why float16
+Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of traditional 32-bit single precision format (float) and has lower precision and smaller range. 
+
+When high precision computation is not required (which is usually the case at least in the deep learning inference stage), using float16 data type could potentially 
+
+- reduce storage space, memory bandwidth, and power usages; 
+- increase the chance of data fitting into a smaller cache of lower latency; 
+- provide arithmetic speed up if supported by hardware. 
+
+## Survey of current float16 support
+A brief survey of float16 support on different compilers, hardwares, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
+
+The goal of float16 is to serve as a key for the executor to find and run the correct version of compute method specialized for float16 in operator kernels. It should be compatible with various natively supported float16 implementations including `__half` for cuda, `float16_t` for ARM, and `Eigen::half` for Eigen to make writing customized float16 kernels easier. 
+
+### Compiler
+- nvcc supports `__half` data type after CUDA 7.5.
+- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4.
+- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9.
+
+### Hardware
+- `__half` is supported on GPU with compute capability >= 5.3.
+- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above.
+- `__fp16` is supported as arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which is announced in May 2017. There seems to be no application processors currently available on market that adopts this architecture. It is reported that Qualcomm Snapdragon 845 uses Cortex-A75 design and will be available in mobile devices in early 2018).
+
+### Libraries
+- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
+- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+
+### CUDA version issue
+There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. 
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+   unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+public:
+    // constructors and conversion operators from/to 
+    // __half_raw and other built-in data types
+}
+
+typedef __half half;
+
+__device__ __forceinline__ 
+__half operator+(const __half &lh, const __half &rh) { 
+    return __hadd(lh, rh); 
+}
+
+// Other overloaded operators
+``` 
+This new design makes `c = a + b` work correctly for CUDA half data type. 
+
+## Implementation
+The float16 class holds a 16-bit `uint16_t` data internally.
+```
+struct float16 {
+  uint16_t x;
+};
+``` 
+
+float16 supports the following features:
+  - constructors / assignment operators that take input from primitive data types including bool, integers of various length, float, and double. 
+  - constructors / assignment operators that take input from `__half` on cuda, `float16_t` on ARM, and `Eigen::half` on Eigen.
+  - conversion operators to primitive data types and half precision data types on cuda, ARM and Eigen. 
+  - overloaded arithmetic operators for cuda, arm, and non-arm cpu, respectively. These operators will take advantage of the cuda and ARM intrinsics on the corresponding hardware. 
+  
+To support the above features, two fundamental conversion functions are provided:
+```
+float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
+float half_to_float(float16 h);
+```
+which provides one-to-one conversion between float32 and float16. These twos functions will do different conversion routines based on the current hardware. CUDA/ARM instrinsics will be used when the corresonding hardware is available. If the hardware or compiler level does not support float32 to float16 conversion, software emulation will be performed to do the conversion.
+
+## float16 inference
+In Fluid, a neural network is represented as a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), whose Python wrapper is a [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program). The basic structure of a program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program desc by executing the sequence of operators in the entrance block of the program one by one.  
+
+### Operator level requirement
+Each operator has many kernels for different data types, devices, and library types. The operator will select the appropriate kernel to run based on, among other things, the data type of the input variables. By default, every Fluid operator has a float data type kernel that takes float variables as input and generates float output. 
+
+This means that if we provide float input to the first operator in a program, then each opeartor will use float kernel to compute float output and send it as input to the next operator to trigger the float kernel. Overall, the program will run in float mode and give us a final output of float data type.
+
+The same principle applies if we want a program to run in float16 mode. We provide input variable of float16 data type to the first operator, and then one by one, each operator in the program will run the float16 kernel (provided that each operator in this program has float16 kernels registered) until we finally obtain a float16 output variable.
+
+So the preliminary requirement for float16 inference is to add float16 kernel to operators that are needed in a specific kind of program. For example, float16 inference on an image classification neural network like Vgg or Resnet, typically requires the following operators to have float16 kernels: convolution, pooling, multiplication, addition, batch norm, dropout, relu, and softmax. Please refer to [new_op_en](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) for details of how to add new kernels to an operator.
+
+### Variable level requirement
+Operators including convolution and multiplication (used in fully-connected layers) takes as input not only the variables generated by the preceding operators but also [parameter](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#parameter) variables, which contains the trained weights to apply to the input data. These weights are obtained in the Fluid training process and are by default of float data type.
+
+When these operators are running in float16 mode, the float16 kernel requires those parameter variables to contain weights of Fluid float16 data type. Thus, we need a convenient way to convert the original float weights to float16 weights. 
+
+In Fluid, we use tensor to hold actual data for a variable on the c++ end. [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h) is used to bind c++ tensors of certain data type with numpy array of the correponding numpy data type on the Python end. Each common c++ built-in data type has a corresponding numpy data type of the same name. However, since there is no built-in float16 type in c++, we cannot directly bind numpy float16 data type with the Fluid float16 class. Since both Fluid float16 and numpy float16 use uint16 as the internal data storage type, we use c++ built-in type `uint16_t` and the corresponding numpy uint16 data type to bridge the gap via [Pybind](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/pybind/tensor_py.h). 
+
+The following code demonstrates how to do the tensor conversion.
+```Python
+# var is the variable of float weights
+# tensor is a numpy array of data copied from the tensor data in var 
+# fp16_var is the variable that will contain float16 weights converted from var  
+tensor = numpy.array(var.get_tensor())
+fp16_tensor = fp16_var.get_tensor()
+
+# After the original tensor data is converted to numpy float16 data type, 
+# view(numpy.uint16) is used so that the internal memory of the numpy array 
+# will be reinterpreted to be of uint16 data type, which is binded to 
+# Fluid float16 class via pybind with the help of uint16_t built-in c++ type
+fp16_tensor.set(tensor.astype(numpy.float16).view(numpy.uint16), GPUPlace)  
+```
+
+### Consistent API requirement
+The basic inference in float16 mode requires users to feed input and obtain output both of float16 data type. However, in this way, the inference APIs are not consistent between float16 mode and float mode, and users may find it confusing and diffcult to use float16 inference since they need to do extra steps to provide float16 input data and convert float16 output data back to float. To have consistent API for different inference modes, we need to transpile the program desc in some way so that we can run float16 inference by feeding and fetching variables of float data type.
+
+This problem can be solved by introducing a type-casting operator which takes an input variable of certain data type, cast it to another specified data type, and put the casted data into the output variable. Insert cast operator where needed can make a program internally run in float16 mode.   
+
+### float16 transpiler
+Put all the above requirements in mind, we designed a float16 inference transpiler that can tranpile a float32 mode inference program desc to a float16 mode one.
+
+Given a float inference program and the corresponding variables of float32 weights in the [scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md),
+this transpiler mainly does the following modifications:
+
+1. Insert cast operators at the beginning of the program so that the input float data will be converted to float16 data type before feeding to subsequent operators to invoke the float16 kernel. 
+
+2. Insert cast operators at the end of the program so that the output float16 data will be converted back to float data type before users obtain the result.
+
+3. For each parameter variable of float weights, create in the scope a corresponding variable of float16 weights which are converted from the corresponding float weights and add this new float16 variable to the program.
+
+4. Update the operator information in the program so that each relevant operator use the newly created float16 variable instead of its float counterpart.
+
+Below is an example of usage:
+```Python
+# Get the float inference program
+[float_inference_program, feed_target_names,
+ fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+# Prepare the float input data
+tensor_img = numpy.random.rand(1, 3, 32, 32).astype(numpy.float32)
+
+# Running inference_program in float mode
+float_results = exe.run(float_inference_program,
+                        feed={feed_target_names[0]: tensor_img},
+                        fetch_list=fetch_targets)
+
+# Use float16 transpiler to speedup
+float16_inference_program = float_inference_program.clone()
+t = fluid.InferenceTranspiler()
+t.float16_transpile(float16_inference_program, GPUPlace)
+
+# Running 
+float16_results = exe.run(float16_inference_program,
+                          feed={feed_target_names[0]: tensor_img},
+                          fetch_list=fetch_targets)
+```
+
+As we can see from the example above, users can simply use the `float16_transpile` method provided by the infernece transpiler class on an existing float inference program to run inference in float16 mode.
+
+### Speedup on GPU
+Currently, Fluid inference in float16 mode is only supported on Nvidia GPU device. There is no motivation to support float16 inference on non-ARM CPUs because float16 is not natively supported there and float16 calculation will only be slower than its float counterpart. 
+
+Nvidia started to support its native float16 data type (which has the same internal memory representation as Fluid float16 class) on CUDA 7.5. Moreover, float16 speedups on common computational intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cublas 7.5 and cuDNN 5.0.
+
+Recently, the introduction of [tensor core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in volta architecture GPUs and the support of tensor core calculation in CUDA 9.0 and cuDNN 7.0 make float16 truly superior to float in certain deep learning applications. Please refer to this [benchmark report](https://github.com/kexinzhao/Paddle_benchmark/blob/master/float16_benchmark.md) for more details.
diff --git a/doc/fluid/design/data_type/index_cn.rst b/doc/fluid/design/data_type/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b60167b6b1599df69dfc5073ebf32bdbb0a316ec
--- /dev/null
+++ b/doc/fluid/design/data_type/index_cn.rst
@@ -0,0 +1,7 @@
+数据类型
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  float16.md
diff --git a/doc/fluid/design/data_type/index_en.rst b/doc/fluid/design/data_type/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a88d17943f49134a2d00363845e919537ff4545
--- /dev/null
+++ b/doc/fluid/design/data_type/index_en.rst
@@ -0,0 +1,7 @@
+Data Type
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  float16.md
diff --git a/doc/fluid/design/dist_train/README.md b/doc/fluid/design/dist_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..2dd652d8bdcb8f3b6e759347bd55b217be909386
--- /dev/null
+++ b/doc/fluid/design/dist_train/README.md
@@ -0,0 +1,57 @@
+## Distributed training overview doc
+
+Currently Paddle Fluid use parameter server architecture to support distributed training.
+
+For synchronous and asynchronous training, the differences are mostly in the logic of parameter server. Now we have already support synchronous training.
+
+### Synchronous training
+
+The training process of synchronous training is:
+
+![synchronous distributed training](./src/sync_distributed_training.png)
+
+1. Pserver
+	1. set `barrier_condition_` to 0 and waits for trainers to send gradient.
+1. Trainer
+	1. Trainer read minibatch of data, run forward-backward with local parameter copy and get the gradients for parameters.
+	1. Trainer use split op to split all the gradient into blocks. The split method is determined at compile time.
+	1. Trainer use send_op to send all the split gradients to corresponding parameter server.
+	1. After trainer send all the gradients, it will send a `BATCH_BARRIER_MESSAGE` to all pservers.
+	1. Trainer call GetVariable to pserver and wait for `barrier_condition_` on pserver to be 1.
+1. Pserver
+   1. Pserver will count the number of `BATCH_BARRIER_MESSAGE`.
+	1. When the count of `BATCH_BARRIER_MESSAGE` is equal to the number of Trainer. Pserver thinks it received all gradient from all trainers.
+	1. Pserver will run the optimization block to optimize the parameters.
+	1. After optimization, pserver set `barrier_condition_` to 1.
+	1. Pserver wait for `FETCH_BARRIER_MESSAGE`.
+1. Trainer.
+	1. The trainer uses GetVariable to get all the parameters from pserver.
+	1. Trainer sends a `FETCH_BARRIER_MESSAGE` to each pserver.
+1. Pserver.
+	1. when the number of `FETCH_BARRIER_MESSAGE` reach the number of all trainers. Pserver think all the parameters have been got. it will go back to 1. to set `barrier_condition_` to 0.
+
+### Asynchronous training
+In the above process. There are two barriers for all trainers to synchronize with each other. In asynchronous training, these two barriers are not needed. The trainer can just send gradients to pserver and then get parameters back.
+
+The training process of asynchronous training can be:
+
+![asynchronous distributed training](./src/async_distributed_training.png)
+
+1. Pserver:
+	1. Each parameter has a queue to receive its gradient from trainers.
+	1. Each parameter has a thread to read data from the queue and run optimize block, using the gradient to optimize the parameter.
+	1. Using an independent thread to handle RPC call `GetVariable` for trainers to get parameters back.(Maybe here we should use a thread pool to speed up fetching the parameters.)
+
+1. Trainer:
+	1. Trainer read a batch of data. Run forward and backward with local parameter copy and get the gradients for parameters.
+	1. Trainer split all gradients to blocks and then send these gradient blocks to pservers(pserver will put them into the queue).
+	2. Trainer gets all parameters back from pserver.
+
+### Note:
+There are also some conditions that need to consider. For exmaple:
+
+1. If trainer needs to wait for the pserver to apply it's gradient and then get back the parameters back.
+1. If we need a lock between parameter update and parameter fetch.
+1. If one parameter must be on one server, or it can also be split and send to multiple parameter servers.
+
+The above architecture of asynchronous training can support different mode, we can have a detailed test in the future for these problems.
diff --git a/doc/fluid/design/dist_train/async_update.md b/doc/fluid/design/dist_train/async_update.md
new file mode 100644
index 0000000000000000000000000000000000000000..248d2ec18dafdecac9184527638754b6ba4d85b8
--- /dev/null
+++ b/doc/fluid/design/dist_train/async_update.md
@@ -0,0 +1,61 @@
+# Design Doc: Asynchronous Update With Distributed Training
+
+## Background
+
+For the typical synchronous distributed training, some significant steps are as follows:
+
+1. A trainer process will compute the gradients and **send** them to the parameter server (PS) nodes.
+1. After the PS node received gradients came from all the Trainers, It will aggregate the
+gradient variables for the same parameter into one gradient variable and then apply the aggregated
+gradient to the respective parameter, finally using an optimize algorithms(SGD, Monument...)
+to update the parameters.
+1. The Trainer would wait for the PS finished the optimize stage, and GET the parameters from PS,
+so all the Trainers would get the same parameters.
+
+In Synchronous Distributed Training, there is a **barrier** on each PS to wait until all trainers processes
+have completed running current mini-batch. After that, all trainers can continue to run the next
+mini-batch. So, we can find that the overall performance of Synchronous Distributed Training depends 
+on the slowest node.
+
+In Asynchronous Distributed Training, we don't need to wait for a global mini-bach, the optimizer on
+the PS will run immediately when the gradient is uploaded to the PS from one trainer. This mode would
+train such models that achieve scaling, better throughput. In this design doc, we will introduce how to 
+implement the Asynchronous Distributed Training base on PaddlePaddle Fluid.
+
+## Design
+
+<img src="./src/async_update.png" width="600"/>
+
+As the figure above, we describe a global view of the asynchronous update process and use
+the parameter `w1` as an example to introduce the steps:
+1. For each gradient variables, they may distribute on different GPU card and aggregate
+them while they are all calculated.
+1. Split the gradient variable into multiple blocks according to the number of PS
+instances and then send them.
+1. PS would run an `Optimize Block` using a specified optimize algorithm to update
+the specified parameter.
+1. The trainer will fetch the latest parameter from PS before running forward Op which depends
+on the specified parameter.
+1. Broadcast the received variable into multiple GPU cards and continue to run the next
+mini-batch.
+
+### Trainer
+
+- For the multiple devices distributed training, we need to aggregate the gradient
+variables which placed on different devices firstly and then schedule a `SendVars` Operator to
+send the gradient variables to the multiple PS instances.
+- Schedule `FetchVars` operator to fetch the latest parameter from PS before running
+the forward ops.
+- There could be a large number of gradient variables to be sent, so we need to use another
+thread pool(IO Threadpool) whose a number of the schedulable threads is larger than the
+computing thread pool to avoid competitive the thread resources with computing.
+
+### Parameter Server
+
+<img src="./src/async_pserver.png" width="750"/>
+
+- There should be multiple trainer instances want to optimize the same parameter at
+the same time, to avoid the racing, we need one `BlockingQueue` for each gradient
+variable to process them one by one.
+- We need a `Map` structure to map a gradient variable name to the `OptimizeBlock` which
+can optimize the respective parameter.
diff --git a/doc/fluid/design/dist_train/distributed_architecture.md b/doc/fluid/design/dist_train/distributed_architecture.md
index a405cb6aaf80b9d2e8a1a9c774ca85cc7e62bbab..371bbeebf7559eccc77ba0eea4f6f87a1bc5b54a 100644
--- a/doc/fluid/design/dist_train/distributed_architecture.md
+++ b/doc/fluid/design/dist_train/distributed_architecture.md
@@ -40,11 +40,11 @@ computation is only specified in Python code which sits outside of PaddlePaddle,
 
 Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
 
-<img src="src/compiler.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compiler.png"/>
 
 PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
 
-<img src="src/paddle-compile.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/paddle-compile.png"/>
 
 The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
 
@@ -60,7 +60,7 @@ For a detailed explanation, refer to this document -
 
 The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
 
-<img src="src/distributed_architecture.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/distributed_architecture.png"/>
 
 The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
 
@@ -152,7 +152,7 @@ for data in train_reader():
 `JobDesc` object describe the distributed job resource specification to run on
 Cluster environment.
 
-<img src="src/remote_executor.png" width="500" align="center" />
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/remote_executor.png" width="500" align="center" />
 
 `RemoteExecutor.run` sends the `ProgramDesc` and
 [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/unreleased-tpr/doc/autoscale/README.md#training-job-resource)
@@ -171,13 +171,13 @@ In the future, a more general placement algorithm should be implemented, which m
 
 The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
 
-<img src="src/local_architecture.png"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local_architecture.png"/>
 
 
 ### Training Data
 
 In PaddlePaddle v0.10.0, training data is typically read
-with [data reader](../reader/README.md) from Python. This approach is
+with [data reader](./README.md) from Python. This approach is
 no longer efficient when training distributedly since the Python
 process no longer runs on the same node with the trainer processes,
 the Python reader will need to read from the distributed filesystem
diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..988729138926f035750b59eb245dde82502a3ad2
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md
@@ -0,0 +1,128 @@
+# Design Doc: Distributed Lookup Table Operator
+
+A lookup table operator in PaddlePaddle where the table could be out
+of the memory of a computer.
+
+## Background
+
+A lookup table operator is well-used in deep learning for learning the
+representation, or the
+[*embedding*](http://www.cs.toronto.edu/~fritz/absps/ieee-lre.pdf), of
+symbols.
+
+### The Forward Algorithm
+
+The forward algorithm of the lookup table is a multiplication of the
+input vector x and the lookup table matrix W:
+
+$$y = x * W$$
+
+When x is a sparse vector of symbols, the above multiplication
+simplifies into looking up rows in W that correspond to symbols in x,
+denoted by W(x).  Please be aware that W could be huge and out of the
+memory, so we'd need a distributed storage service, which supports the
+lookup of rows.
+
+The following figure illustrates the multiplication of x with two
+non-zero elements, or say, two symbols, and a lookup table W:
+
+![lookup table](./src/lookup_table.png)
+
+### The Backward Algorithm
+
+The backward algorithm computes W'(x) using W(x).  W'(x) has the same
+scale of size as W(x) and is much smaller than W.
+
+To optimize W given W', we can do simple SGD update:
+
+$$W = f(W') = \lambda * W'$$
+
+or some more sophisticated algorithms that rely on both W' and W:
+
+$$W = f(W, W')$$
+
+The following figure illustrates the backward pass of the lookup
+operator: ![lookup table training](./src/lookup_table_training.png)
+
+## Distributed Storage Service
+
+The forward algorithm requires a distributed storage service for W.
+The backward algorithm prefers that the storage system can apply the
+optimization algorithm on W.  The following two sections describe two
+solutions -- the former doesn't require that the storage service can
+do optimization, the latter does.
+
+### Storage Service Doesn't Optimize
+
+In this design, we use highly-optimized distributed storage, e.g.,
+memcached, as the storage service, and we run the optimization
+algorithm on parameter servers of PaddlePaddle.  The following figure
+illustrates the training process.
+
+<!--
+Note: please update the following URL when update this digraph.
+<img src='https://g.gravizo.com/svg?
+digraph G {
+  rankdir="LR";
+  subgraph cluster1 {
+  P1 [label="pserver 1"];
+  P2 [label="pserver 2"];
+  T1 [label="trainer 1"];
+  T2 [label="trainer 2"];
+  T3 [label="trainer 3"];
+  }
+  KV [label="memcached"];
+  T1 -> P1;
+  T1 -> P2;
+  T2 -> P1;
+  T2 -> P2;
+  T3 -> P1;
+  T3 -> P2;
+  P1 -> KV [color=gray, weight=0.1];
+  KV -> P1 [color=gray, weight=0.1];
+  P2 -> KV [color=gray, weight=0.1];
+  KV -> P2 [color=gray, weight=0.1];
+  KV -> T1 [color=gray, weight=0.1];
+  KV -> T2 [color=gray, weight=0.1];
+  KV -> T3 [color=gray, weight=0.1];
+}
+)
+'/>
+-->
+
+<img src='https://g.gravizo.com/svg?%20digraph%20G%20{%20rankdir=%22LR%22;%20subgraph%20cluster1%20{%20P1%20[label=%22pserver%201%22];%20P2%20[label=%22pserver%202%22];%20T1%20[label=%22trainer%201%22];%20T2%20[label=%22trainer%202%22];%20T3%20[label=%22trainer%203%22];%20}%20KV%20[label=%22memcached%22];%20T1%20-%3E%20P1;%20T1%20-%3E%20P2;%20T2%20-%3E%20P1;%20T2%20-%3E%20P2;%20T3%20-%3E%20P1;%20T3%20-%3E%20P2;%20P1%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P1%20[color=gray,%20weight=0.1];%20P2%20-%3E%20KV%20[color=gray,%20weight=0.1];%20KV%20-%3E%20P2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T1%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T2%20[color=gray,%20weight=0.1];%20KV%20-%3E%20T3%20[color=gray,%20weight=0.1];%20}'/>
+
+Each trainer runs the forward and backward passes using their local
+data:
+
+1. In the forward pass, when a trainer runs the forward algorithm of a
+   lookup operator, it retrieves W(x) from the storage service.
+1. The trainer computes W'(x) in the backward pass using W(x).
+
+During the global update process:
+
+1. Each trainer uploads its W'(x) to parameter servers.
+1. The parameter server runs the optimization algorithm, e.g., the
+   Adam optimization algorithm, which requires that
+   1. The parameter server retrieves W(x) from memcached, and
+   1. The parameter server pushes $\Delta W(x)=f(W(x), lambda \sum_j
+      W'(x))$ to memcached, where $f$ denotes the optimization
+      algorithm.
+
+### Storage Service Does Optimize
+
+This design is very similar to the above one, except that the
+optimization algorithm $f$ runs on the storage service.
+
+- Pro: parameter servers do not retrieve W(x) from the storage
+  service, thus saves half network communication.
+- Con: the storage service needs to be able to run the optimization
+  algorithm.
+
+## Conclusion
+
+Let us do the "storage service does not optimize" solution first, as a
+baseline at least, because it is easier to use a well-optimized
+distributed storage service like memcached.  We can do the "storage
+service does optimize" solution later or at the same time, which, if
+implemented carefully, should have better performance than the former.
diff --git a/doc/fluid/design/dist_train/distributed_traing_review.md b/doc/fluid/design/dist_train/distributed_traing_review.md
new file mode 100644
index 0000000000000000000000000000000000000000..c09b7c99159ace9b3df989f803ede20bc3585d92
--- /dev/null
+++ b/doc/fluid/design/dist_train/distributed_traing_review.md
@@ -0,0 +1,44 @@
+# Parallelism, Asynchronous,  Synchronous, Codistillation
+
+
+For valuable models, it’s worth using more hardware resources to reduce the training time and improve the final model quality. This doc discuss various solutions, their empirical results and some latest researches.
+
+# Model Parallelism
+In some situations, larger and more complex models can improve the model quality. Sometimes, such models cannot fit in one device. Sometimes, parts of the model can be executed in parallel to improve speed. Model Parallelism address the issues by partitioning a single model and place the shards on several devices for execution.
+
+A common way of model parallelism is partition the logic of “gradient application” to parameter servers, while leaving the forward and backward computation at training servers.
+
+More flexible model parallelism is challenging. For example, multi-level-single-direction LSTM can be partitioned by layers, while such solution is not helpful for bi-directional LSTM. Different models can have quite different ways of partitioning and the benefits also depend on the underlying hardware. Framework needs to provide flexible APIs for user to define the customized partition scheme. For example, in TensorFlow, user can use tf.device() to specify the device placement. In MxNet, mx.AttrScope(ctx_group='dev1') does similar things. Recent research proposes to automatically find the optimal partition scheme with Reinforcement Learning, which is essentially solution space search algorithm that could cost a lot of extra hardware sources.
+
+# Data Parallelism
+Data Parallelism runs the same model on multiple devices, each taking in a partition of the input batch. It’s more commonly used for a few reasons. It generally applies to common SGD mini-batch training. Compared with model parallelism, which requires users to carefully partition their model and tune for good performance, data parallelism usually involves no more than calling an extra API and speed up is more predictable.
+
+# Asynchronous Training
+In asynchronous training, it usually involves a set of trainers and a set of parameter servers. The parameter servers collectively hold a single copy of shared parameters. While the trainers each holds a unique copy of model and trains the model independently. Each trainer pulls parameters from parameter servers and sends gradients to the parameter servers independently. Similarly the parameter servers applies the gradients to parameters as soon as the gradients are received and sends parameters whenever they are requested.
+
+In theory, asynchronous training is not safe and unstable. Each trainer is very likely using stale copy of parameters and parameters are also likely to apply stale gradients. However, in practice, especially for large-scale nonconvex optimization, it is effective [1]. Compared with synchronous solution, which will be discussed later, asynchronous distributed training is easier to implement and scales to a few dozen workers without losing much performance due to network communication or other overhead. Besides, asynchronous training can make progress even in case of random trainer failure in the cluster.
+
+Many production models, such as [3], are trained with distributed asynchronous solutions due to its scalability and effectiveness in practice. However, asynchronous training has its limitations. Usually, it’s not as stable as synchronous training. A warm-up phase is sometimes needed. Learning rate is usually smaller compared with synchronous training and decay is also often needed. Normally, asynchronous training doesn’t scale beyond 100 trainers. In other words, when putting more trainers beyond that, the model cannot converge faster.
+
+# Synchronous Training
+Unlike asynchronous training, synchronous training requires step barriers. Parameter servers needs to wait for gradients from all trainers before they are applied to parameters and trainers will always pull the latest parameters.
+
+An obvious advantage of synchronous training is that the behavior is more clearly defined. Usually, it's more stable than asynchronous training. Learning rate can be set larger and for some vision tasks, the final accuracy can be slightly higher. (In my practical experience, for some models, it can actually be worse).
+
+Synchronous training usually faces scalability and performance issues, if not carefully implemented or deployed. In [2], native synchronous training can be 20%~40% slower than asynchronous training. A common trick to avoid slowness, discussed in [1] and [2], is to have backups. N+M replicas are scheduled while only the first N is needed for the training step the proceed.
+
+Similar to asynchronous training, the benefit of synchronous training diminishes quickly. Depending on the models, increasing the number of trainers (effectively batch size) beyond a point won’t delivers faster converge time or better final model quality.
+
+# Codistillation
+Codistillation is a technique that tries to scale the training further. A few training instance (each training instance can be distributed) are performed during the same period. Each training instance has extra losses that comes from the prediction of other training instances. (likey teacher and student) The training process converges faster and usually converge to a better model quality. [4]
+
+
+# Reference
+
+[1] Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks.
+
+[2] Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous SGD.
+
+[3] Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, et al. Google’s neural machine translation system: Bridging the gap between human and machine translation.
+
+[4] LARGE SCALE DISTRIBUTED NEURAL NETWORK TRAINING THROUGH ONLINE DISTILLATION
diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed6f3dda271d2de58d92aa7ec804fa9e68dfc48a
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_cn.rst
@@ -0,0 +1,9 @@
+分布式训练
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f84688f168021113bd933802709bcd787b474bca
--- /dev/null
+++ b/doc/fluid/design/dist_train/index_en.rst
@@ -0,0 +1,9 @@
+Distributed Training
+---------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  distributed_architecture.md
+  distributed_lookup_table_design.md
+  parameter_server.md
diff --git a/doc/fluid/design/dist_train/mpi_enabled_design.md b/doc/fluid/design/dist_train/mpi_enabled_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..4ad3afc7b7522c60460c6f1f387f9415d3738778
--- /dev/null
+++ b/doc/fluid/design/dist_train/mpi_enabled_design.md
@@ -0,0 +1,46 @@
+# MPI-enabled PaddlePaddle Design doc
+
+# Background
+When we do distribute multi GPU training, the communication overhead between servers become the major bottleneck, because of the following reasons:
+1. Must copy at least once from GPU to CPU memory so that the data can be ready to transfer. And for the pserver side, copy data from CPU to GPU introduce more overhead.
+2. GPU->CPU data transfer is 10 times slower than data transfer between GPUs or between PCIe devices.
+3. TCP connections can not make full use of RDMA 100Gb devices.
+
+We will use OpenMPI API to PaddlePaddle, which can bring two benefits to PaddlePaddle:
+1. Enable RDMA with PaddlePaddle, which bring high-performance low latency networks.
+2. Enable GPUDriect with PaddlePaddle, which bring the highest throughput and lowest latency GPU read and write.
+
+# Change list
+* Compile args: Need add compile args to enable MPI support.
+* Execute args:  Need add execute args to assign when and how to use MPI operations.
+* New ops:  Need new op  ```mpi_send_op``` and ```mpi_listenandserve_op``` to support MPI send and receive.
+* Transpiler optimized: Which can add   ```mpi_send_op``` and ```mpi_listenandserve_op```  to the running graph.
+* MPI utils package: Need MPI utils package as the low-level API supported.
+
+## Compile args
+Because MPI or CUDA need hardware supported, so we will add compile args to enable MPI support and control compiling.Add ```WITH_MPI```  compile args to control MPI to use or not. If the  ```WITH_MPI``` is ```ON```, compile system will find openMPI codes in configuration. We should prepare openMPI environment before compiling.
+
+## Execute args
+Launch the script using the ```mpirun``` launcher, For example: ```mpirun -np 3 -hosts node1,node2,node3 python train.py```. By doing this, We can number the actors (trainer/pserver/master) with o .. (n-1). The node's number is the Rank of the calling process in a group of comm (integer),  The MPI processes identify each other using a Rank ID. We have to create a mapping between PaddlePaddle's nodes and their Rank ID so that we can communicate with the correct destinations when using MPI operations.
+
+## New ops
+We won't replace all the gRPC requests to MPI requests,  the standard gRPC library is used for all administrative operations and the MPI API will be used to transfer tensor or selectRows to Pservers. The base of this idea, we create two new operators to handle requests and receives,  the two operators are ```mpi_send_op``` and ```mpi_listenandserve_op```. They are a little similar to [send_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/send_op.cc) and [listen_and_serv_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/listen_and_serv_op.cc), also, We will build a new module to package MPI send and receive process.
+
+### mpi_send_op
+Very similar with ```send_op```, we will replace gRPC code which used to send gradient with ```mpi_module```, at the same time, we will wrap it with ```framework::Async```.
+
+### mpi_listenandserve_op
+Very similar with ```listen_and_serv_op```, we will replace gRPC code which used to receive gradient with ```mpi_module```, at the same time, we will wrap it with ```framework::Async```.
+
+## Transpiler optimized
+**We can get env ```OMPI_COMM_WORLD_SIZE``` and ```OMPI_COMM_WORLD_RANK``` to distinguish use MPI or not, If we use openMPI, the variable in env must exist.**
+ if  confirm to use MPI, we will modify  ```send_op``` to ```mpi_send_op``` in distribute_transpiler, and modify ```listenandserve_op``` to ```mpi_listenandserve_op``` also.
+
+## MPI utils package
+In this package, We will write openMPI low-level API to use MPI.
+The API included in this package are:
+* MPI send and receive module, We will build a new module to package MPI send and receive process. MPI send and receive are different to gRPC, the MPI [recvice](https://www.open-mpi.org/doc/v1.8/man3/MPI_Irecv.3.php) must know receive buffer size and receive buffer element. For this reason, We have to make communications twice, the first one is to send metadata about gradient through gRPC, the second one is the real communication through MPI which send gradient data to mpi_listenandserve_op.
+The detailed flow is below:
+![](https://github.com/seiriosPlus/Paddle/blob/mpi_enabled/doc/fluid/design/dist_train/src/mpi_module.png)
+* MPI global configurations, which store the Rank ID and the mapping in global variables, for example:
+gRPC client : MPI nodes :``` 127.0.0.1:32004 : 3 ```
diff --git a/doc/fluid/design/dist_train/multi_cpu.md b/doc/fluid/design/dist_train/multi_cpu.md
index a8d8ee0422acc84835170a44eb83f9b5f0c6bb40..38222d083084ebfca3099ce96b47868c42d55101 100644
--- a/doc/fluid/design/dist_train/multi_cpu.md
+++ b/doc/fluid/design/dist_train/multi_cpu.md
@@ -8,11 +8,11 @@ Op graph to a multi-CPU Op graph, and run `ParallelDo` Op to run the graph.
 
 ## Transpiler
 
-<img src="src/multi-threads/single-thread@3x.png" width="300">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/single-thread@3x.png" width="300">
 
 After converted:
 
-<img src="src/multi-threads/multi-threads@3x.png" width="1000">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/multi-threads@3x.png" width="1000">
 
 ## Implement
 
diff --git a/doc/fluid/design/dist_train/parameter_server.md b/doc/fluid/design/dist_train/parameter_server.md
index 6ce48dfbfce8b094684b412ebfda7e505ddc30ae..563b70bc0e852bec953eb40dda3c46b3d45d7e68 100644
--- a/doc/fluid/design/dist_train/parameter_server.md
+++ b/doc/fluid/design/dist_train/parameter_server.md
@@ -41,11 +41,11 @@ We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
 Below is an example of converting the user defined graph to the
 subgraphs for the trainer and the parameter server:
 
-<img src="src/local-graph.png" width="300"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/local-graph.png" width="300"/>
 
 After converting:
 
-<img src="src/dist-graph.png" width="700"/>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dist-graph.png" width="700"/>
 
 1. The parameter variable W and its optimizer program are placed on the parameter server.
 1. Operators are added to the program.
@@ -65,12 +65,11 @@ For embedding layers, the gradient may have many rows containing only 0 when tra
 if the gradient uses a dense tensor to do parameter optimization,
 it could spend unnecessary memory, slow down the calculations and waste
 the bandwidth while doing distributed training.
-In Fluid, we introduce [SelectedRows](../selected_rows.md) to represent a list of rows containing
+In Fluid, we introduce [SelectedRows](../modules/selected_rows.md) to represent a list of rows containing
 non-zero gradient data. So when we do parameter optimization both locally and remotely,
 we only need to send those non-zero rows to the optimizer operators:
 
-<img src="src/sparse_update.png" width="700" />
-
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sparse_update.png" width="700" />
 ### Benefits
 
 - Model parallelism becomes easier to implement: it is an extension to
diff --git a/doc/fluid/design/dist_train/src/async_distributed_training.png b/doc/fluid/design/dist_train/src/async_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b53ab59c0cd7b44b2956f16f1adc47fe85909d3
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_distributed_training.png differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.graffle b/doc/fluid/design/dist_train/src/async_pserver.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d2301611774fcb3866473e3e6470568d1e1312cf
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_pserver.png b/doc/fluid/design/dist_train/src/async_pserver.png
new file mode 100644
index 0000000000000000000000000000000000000000..7d900b0c0eb291c67537b9cf93227c671bafdc73
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_pserver.png differ
diff --git a/doc/fluid/design/dist_train/src/async_update.graffle b/doc/fluid/design/dist_train/src/async_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..3a631888688a0d564a873fcb16d943958c91223e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.graffle differ
diff --git a/doc/fluid/design/dist_train/src/async_update.png b/doc/fluid/design/dist_train/src/async_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..3e8db973f45d6d9ac8dcce1dc7878067e79e6dcc
Binary files /dev/null and b/doc/fluid/design/dist_train/src/async_update.png differ
diff --git a/doc/fluid/design/dist_train/src/distributed_training.graffle b/doc/fluid/design/dist_train/src/distributed_training.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..1168801bc1fadfce310a74cb3110695bd1629f6b
Binary files /dev/null and b/doc/fluid/design/dist_train/src/distributed_training.graffle differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table.png b/doc/fluid/design/dist_train/src/lookup_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table.png differ
diff --git a/doc/fluid/design/dist_train/src/lookup_table_training.png b/doc/fluid/design/dist_train/src/lookup_table_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e
Binary files /dev/null and b/doc/fluid/design/dist_train/src/lookup_table_training.png differ
diff --git a/doc/fluid/design/dist_train/src/mpi_module.png b/doc/fluid/design/dist_train/src/mpi_module.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6b6a3e5d6f68baeeb67d7f71154bd8d85f32b6f
Binary files /dev/null and b/doc/fluid/design/dist_train/src/mpi_module.png differ
diff --git a/doc/fluid/design/dist_train/src/sync_distributed_training.png b/doc/fluid/design/dist_train/src/sync_distributed_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..e4f9a221fea4b7238e8a1d84e609c0371f6ef7a2
Binary files /dev/null and b/doc/fluid/design/dist_train/src/sync_distributed_training.png differ
diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/fluid/design/dynamic_rnn/2_level_rnn.dot
similarity index 100%
rename from doc/design/ops/images/2_level_rnn.dot
rename to doc/fluid/design/dynamic_rnn/2_level_rnn.dot
diff --git a/doc/design/ops/images/2_level_rnn.png b/doc/fluid/design/dynamic_rnn/2_level_rnn.png
similarity index 100%
rename from doc/design/ops/images/2_level_rnn.png
rename to doc/fluid/design/dynamic_rnn/2_level_rnn.png
diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1d224d22cf7103616f44115db01f0ae55f1cb88a
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_cn.rst
@@ -0,0 +1,8 @@
+动态RNN
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  rnn.md
+  rnn_design.md
diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..568f496e4ffe21a5e730488aef905f7e2d98839e
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/index_en.rst
@@ -0,0 +1,8 @@
+Dynamic RNN
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  rnn.md
+  rnn_design.md
diff --git a/doc/design/ops/images/rnn.dot b/doc/fluid/design/dynamic_rnn/rnn.dot
similarity index 100%
rename from doc/design/ops/images/rnn.dot
rename to doc/fluid/design/dynamic_rnn/rnn.dot
diff --git a/doc/design/ops/images/rnn.jpg b/doc/fluid/design/dynamic_rnn/rnn.jpg
similarity index 100%
rename from doc/design/ops/images/rnn.jpg
rename to doc/fluid/design/dynamic_rnn/rnn.jpg
diff --git a/doc/fluid/design/dynamic_rnn/rnn.md b/doc/fluid/design/dynamic_rnn/rnn.md
new file mode 100644
index 0000000000000000000000000000000000000000..b39ae0675c45e56852293d97f45e91861cf31667
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn.md
@@ -0,0 +1,153 @@
+# RNNOp design
+
+This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
+
+## RNN Algorithm Implementation
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.jpg"/>
+</p>
+
+The above diagram shows an RNN unrolled into a full network.
+
+There are several important concepts here:
+
+- *step-net*: the sub-graph that runs at each step.
+- *memory*, $h_t$, the state of the current step.
+- *ex-memory*, $h_{t-1}$, the state of the previous step.
+- *initial memory value*, the memory of the first (initial) step.
+
+### Step-scope
+
+There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.png"/><br/>
+Figure 2 illustrates the RNN's data flow
+</p>
+
+Please be aware that every step runs the same step-net.  Each step does the following:
+
+1. Creates the step-scope.
+2. Initializes the local variables including step-outputs, in the step-scope.
+3. Runs the step-net, which uses the above mentioned variables.
+
+The RNN operator will compose its output from step outputs in each of the step scopes.
+
+### Memory and Ex-memory
+
+Let's give more details about memory and ex-memory using a simple example:
+
+$$
+h_t = U h_{t-1} + W x_t
+$$,
+
+where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively.
+
+In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
+or copy the memory value of the previous step to the current ex-memory variable.
+
+### Usage in Python
+
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md).
+
+We can define an RNN's step-net using a Block:
+
+```python
+import paddle as pd
+
+X = some_op() # x is some operator's output and is a LoDTensor
+a = some_op()
+
+# declare parameters
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+rnn = pd.create_rnn_op(output_num=1)
+with rnn.stepnet():
+    x = rnn.add_input(X)
+    # declare a memory (rnn's step)
+    h = rnn.add_memory(init=a)
+    # h.pre_state(), the previous memory of rnn
+    new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state()))
+    # update current memory
+    h.update(new_state)
+    # indicate that h variables in all step scopes should be merged
+    rnn.add_outputs(h)
+
+out = rnn()
+```
+
+Python API functions in above example:
+
+- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
+- `rnn.add_memory`: creates a variable used as the memory.
+- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
+
+### Nested RNN and LoDTensor
+
+An RNN whose step-net includes other RNN operators is known as an *nested RNN*.
+
+For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
+
+The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text.
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn.png"/>
+</p>
+
+```python
+import paddle as pd
+
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+W0 = pd.Variable(shape=[20, 30])
+U0 = pd.Variable(shape=[20, 30])
+
+# a is output of some op
+a = some_op()
+
+# chapter_data is a set of 128-dim word vectors
+# the first level of LoD is sentence
+# the second level of LoD is a chapter
+chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
+
+def lower_level_rnn(paragraph):
+    '''
+    x: the input
+    '''
+    rnn = pd.create_rnn_op(output_num=1)
+    with rnn.stepnet():
+        sentence = rnn.add_input(paragraph, level=0)
+        h = rnn.add_memory(shape=[20, 30])
+        h.update(
+            pd.matmul(W, sentence) + pd.matmul(U, h.pre_state()))
+        # get the last state as sentence's info
+        rnn.add_outputs(h)
+    return rnn
+
+top_level_rnn = pd.create_rnn_op(output_num=1)
+with top_level_rnn.stepnet():
+    paragraph_data = rnn.add_input(chapter_data, level=1)
+    low_rnn = lower_level_rnn(paragraph_data)
+    paragraph_out = low_rnn()
+
+    h = rnn.add_memory(init=a)
+    h.update(
+        pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
+    top_level_rnn.add_outputs(h)
+
+# output the last step
+chapter_out = top_level_rnn(output_all_steps=False)
+```
+
+In the above example, the construction of the `top_level_rnn` calls  `lower_level_rnn`.  The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
+
+By default, the `RNNOp` will concatenate the outputs from all the time steps.
+If the `output_all_steps` is set to False, it will only output the final time step.
+
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/rnn_2level_data.png"/>
+</p>
diff --git a/doc/design/ops/images/rnn.png b/doc/fluid/design/dynamic_rnn/rnn.png
similarity index 100%
rename from doc/design/ops/images/rnn.png
rename to doc/fluid/design/dynamic_rnn/rnn.png
diff --git a/doc/design/ops/images/rnn_2level_data.dot b/doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
similarity index 100%
rename from doc/design/ops/images/rnn_2level_data.dot
rename to doc/fluid/design/dynamic_rnn/rnn_2level_data.dot
diff --git a/doc/design/ops/images/rnn_2level_data.png b/doc/fluid/design/dynamic_rnn/rnn_2level_data.png
similarity index 100%
rename from doc/design/ops/images/rnn_2level_data.png
rename to doc/fluid/design/dynamic_rnn/rnn_2level_data.png
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..cecfcd3307ae4c4fa603220a360e9e124069fa58
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn_design.md
@@ -0,0 +1,242 @@
+# RNN 变长输入设计
+对变长序列的学习，现有主流框架比如 tensorflow, pytorch, caffe2, mxnet 等均使用了padding的方式，
+即将一个mini-batch内不同长度的序列补0到固定长度参与计算。
+
+现有Paddle包括 `RecurrentLayerGroup` 在内的RNN均实现了无padding的变长序列支持，本文也将基于该模块的思路，设计重构后的变长序列支持。
+
+## 背景介绍
+由于tensor必须有明确的shape，因此基于tensor 的主流框架在存储变长序列时，
+必须用zero-padding的方式将变长序列补全为固定shape的tensor。
+
+由于padding是一种框架实现变长序列的妥协， 从用户角度，在使用RNN类模型时自然会比较介意padding的存在，
+因此会有pytorch中对非padding方式变长序列支持长篇的讨论[3]。
+
+由于padding对内存和计算会有额外的消耗，tensorflow和mxnet均使用了bucketing来进行优化[1][2]，
+但不管是padding还是bucket，对于用户都是额外的使用负担。
+
+因此，**paddle原生支持变长序列的方式，能直接满足用户对变长序列的最直接的需求，在当前主流平台中可以算是一大优势**。
+
+但对变长序列的支持，需要对目前框架做一些修改，下面讨论如何在最小修改下支持变长序列。
+
+## 多层序列数据格式 `LODTensor`
+目前 Paddle 会将一个mini-batch内的数据存储在一维的内存上，
+额外使用 `Argument.sequenceStartPositions` 来存储每个句子的信息。
+
+Paddle里使用 `Argument.subSequenceStartPositions` 来存储2层的序列信息，更高维度的序列则无法直接支持；
+
+为了支持 `N-level` 序列的存储，本文将序列信息定义成如下数据结构:
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+或者更明确的定义
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+
+这里的每一个 `level_t` 存储一个粒度(level)的偏移信息，和paddle目前做法一致。
+
+为了更透明地传递序列信息，我们引入了一种新的tensor 称为 `LODTensor`[4]，
+其关于tensor相关的接口都直接继承自 `Tensor`，但另外添加了序列相关接口。
+如此，在操作一个 `LODTensor` 时，普通 `Op` 直接当成 `Tensor` 使用，
+而操作序列的 `Op` 会额外操作 `LODTensor` 的变长序列操作的相关接口。
+
+`LODTensor` 具体定义如下：
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return seq_start_positions_.size(); }
+  size_t Elements(int level = 0) const {
+    return seq_start_positions_[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slice seq_start_positions_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info sould not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared <
+                     std::vector<std::vector<int>>(other.lod_start_pos_.begin(),
+                                                   other.lod_start_pos_.end());
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+
+其中， `lod_start_pos_` 使用了 `shared_ptr` 来减少存储和复制的代价，
+可以认为 `LODTensor` 是 `Tensor` 的扩展，几乎完全兼容原始 `Tensor` 的使用。
+
+## 框架支持
+### 框架现有的 `Tensor` 调用替换为 `LODTensor`
+为了实现 `LODTensor` 的传递，框架里很多 `Tensor` 都需要变成 `LODTensor`，
+简单实现，直接 **把之前所有的`Tensor` 全部替换成 `LODTensor`，这里可以直接修改 `pybind.cc` 里面创建`Tensor`的接口**。
+
+此外，用户有可能需要感知序列的存在（比如序列的可视化需要解析模型中输出的序列），因此一些序列操作的API也需要暴露到 python 层。
+
+### `lod_start_pos` 随着Op调用链传递
+框架需要支持下列特性，以实现`lod_start_pos`的传递：
+
+1. 以 `shared_ptr` 的方式实现传递
+    - 不修改 `lod_start_pos` 内容的作为 consumer
+    - 修改 `lod_start_pos` 的作为 producer
+    - 约定 consumer 只需要复制传递过来的 `shared_ptr`
+      - producer 需要创建自己的独立的内存，以存储自己独立的修改，并暴露 `shared_ptr` 给后续 consumer
+    - 由于传递过程是以复制`shared_ptr`的方式实现，因此框架只需要传递一次 `lod_start_pos`
+
+2. 对于不感知 `lod_start_pos` 的Op足够透明
+3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据
+
+具体的设计分为以下3小节
+
+#### `load_start_pos` 的传递
+
+- 对于不需要修改 `lod_start_pos` 的情况，调用 LODTensor的 `ShareConstLODFrom` 接口实现复制
+- 需要修改的，调用`ShareMutableLODFrom` 接口自己分配内存以存储修改
+
+#### 框架透明
+传递这一步需要加入到网络跑之前的初始化操作中，并且只需要初始化一次，基于当前框架设计的初步方案如下
+
+- 在 Op 的 `attrs` 中添加一项 `do_mutate_lod_info` 的属性，默认为 `false`
+  - 有需要修改 `lod_start_pos` 的Op需要在定义 `OpProto` 时设置为 `true`
+- `OperatorBase` 的 `InferShape` 中会读取 `do_mutate_lod_info` ，并且调用 `LODTensor` 相关的方法实现 `lod_start_pos` 的复制。
+- `OperatorBase` 中添加一个 member `is_lod_inited{false}` 来保证传递只进行一次
+
+一些逻辑如下
+
+```c++
+class OperatorBase {
+public:
+  // ...
+  void InferShape() {
+    if (!is_load_inited) {
+      bool do_mutate_lod_info = GetAttr<bool>("do_mutate_load_info");
+      // find a input having LOD to copy
+      auto lod_input = ValidLODInput();
+      for (auto &output : outputs) {
+        if (do_mutate_load_info) {
+          output.ShareMutableLODFrom(lod_input);
+        } else {
+          output.ShareConstLODFrom(load_input);
+        }
+      }
+      is_pod_inited = true;
+    }
+
+    // call op's InferShape
+    // ...
+  }
+
+private:
+  // ...
+  bool is_lod_inited{false};
+};
+```
+
+如此，`lod_start_pos` 的信息的传递对非OLD的Op的实现是完全透明的。
+
+#### `lod_start_pos` 的更新
+上一小节介绍到，对于需要修改 `load_start_pos` 的Op，`OperatorBase` 会分配一块自己的内存以存储修改，
+Op在 `Run` 的实现中，操作更新自己的 `load_start_pos` ，
+而所有依赖其 outputs 的 op 会通过共享的指针自动获取到其更新。
+
+## 根据长度排序
+按照长度排序后，从前往后的时间步的batch size会自然地递减，可以直接塞入 Net 做batch计算
+
+比如原始的输入：
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+经过 `SegmentInputs` 之后，每个会有4个时间步，每个时间步的输入如下（纵向排列）
+
+```
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+
+为了追踪排序前后序列的变化，这里用
+```c++
+struct SortedSeqItem {
+   void *start{nullptr};
+   void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+来追踪序列排序后的位置，并添加一个新的接口
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+
+由于输入序列的顺序变化，以下现有的接口需要针对性地修改：
+
+- InitMemories, memory需要根据 `sorted_seqs` 重新排列
+- SetmentInputs
+- ConcatOutputs
+
+此外，由于 `sorted_seqs` 需要被 `RecurrentGradientOp` 复用，因此会变成 `RecurrentOp` 一个新的output输出，
+之后作为 `RecurrentGradientOp` 的一个输入传入。
+
+## InitMemories
+由于序列顺序的变化，`boot_memories` 的batch上的element的顺序也需要对应重新排列。
+
+## SegmentInputs
+`SegmentInputs` 会依赖 `sorted_seqs` 的信息，将原始的序列按照排序后的序列顺序，从横向切割，转为每个step中的inputs。
+
+即下面的转变：
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+## ConcatOutputs
+`ConcatOutputs` 需要
+
+- 将每个时间步的输出重新还原为原始输入的序列顺序（以防止Infer阶段顺序打乱）
+- 将每个序列concat 为规则的mini-batch表示
+
+## 参考文献
+[Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+
+[mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+
+[variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+
+[Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/fluid/design/dynamic_rnn/rnn_design_en.md b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..9493908f4f73b3e7d91f5f6364a2a3660257d508
--- /dev/null
+++ b/doc/fluid/design/dynamic_rnn/rnn_design_en.md
@@ -0,0 +1,175 @@
+# Varient Length supported RNN Design
+For the learning of variable length sequences, the existing mainstream frameworks such as tensorflow, pytorch, caffe2, mxnet and so on all use padding.
+
+Different-length sequences in a mini-batch will be padded with zeros and transformed to same length.
+
+The existing RNN implementations of the PaddlePaddle is `RecurrentLayerGroup`, 
+which supports the variable length sequences without padding. 
+This doc will design fluid's RNN based on this idea.
+
+## Multi-layer sequence data format `LODTensor`
+At present, Paddle stores data in one mini-batch in one-dimensional array.
+
+`Argument.sequenceStartPositions` is used to store information for each sentence.
+
+In Paddle, `Argument.subSequenceStartPositions` is used to store 2 levels of sequence information, while higher dimensional sequences can not be supported.
+
+In order to support the storage of `N-level` sequences, we define sequence information as the following data structure.
+
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+Or more clearly defined here
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+Each `level_t` here stores a level of offset information consistent with paddle's current practice.
+
+In order to transmit sequence information more transparently, we have introduced a new tensor called `LODTensor`[1].
+Its tensor-related interfaces all inherit directly from `Tensor`, but it also adds serial-related interfaces.
+Thus, when working with a `LODTensor`, ordinary `Op` is used directly as `Tensor`.
+The `Op` of the operation sequence will additionally operate the relevant interface of the `LODTensor` variable-length sequence operation.
+
+The definition of `LODTensor` is as follows:
+
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return seq_start_positions_.size(); }
+  size_t Elements(int level = 0) const {
+    return seq_start_positions_[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slice seq_start_positions_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info sould not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared <
+                     std::vector<std::vector<int>>(other.lod_start_pos_.begin(),
+                                                   other.lod_start_pos_.end());
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+Among them, `lod_start_pos_` uses `shared_ptr` to reduce the cost of storage and replication.
+`LODTensor` can be thought as an extension of `Tensor`, which is almost completely compatible with the original `Tensor`.
+
+## How to support the framework
+### Replace `Tensor` with `LoDTensor`
+To implement the passing of `LODTensor`, most `Tensor` in the framework need to be replaced with `LODTensor`.
+Simple implementation, directly **replace all previous `Tensor` with `LODTensor`** , where you can directly modify the `Tensor` interface created in `pybind.cc`.
+
+In addition, the user may need to perceive the existence of a sequence (such as the sequence of the visualization needs to parse the output sequence in the model), so some of the serial operation APIs also need to be exposed to the python layer.
+
+### Transmit `lod_start_pos` along with the Op call chain
+`lod_start_pos` is passed along with the Op call chain
+The framework needs to support the following features to implement the transmit of `lod_start_pos`:
+
+1. Implement the transfer as `shared_ptr`
+    - Do not modify the contents of `lod_start_pos` as a consumer
+    - Modify producer of `lod_start_pos` as producer
+    - Conventions consumer only needs to copy `shared_ptr` passed over
+    - producer needs to create its own independent memory to store its own independent modifications and expose `shared_ptr` to subsequent consumer
+    - Since the transfer process is implemented by copying `shared_ptr`, the framework only needs to pass `lod_start_pos` once.
+
+2. Op is transparent enough not to sense `lod_start_pos`
+3. Producer Op that needs to modify `lod_start_pos` can update its `lod_start_pos` data when `Run`
+
+## sorted by length
+After sorting by length, the batch size from the forward time step will naturally decrement, and you can directly plug it into Net to do the batch calculation.
+
+For example, the original input:
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+After `SegmentInputs`, there will be 4 time steps, the input of each time step is as follows (vertical arrangement)
+
+```
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+
+In order to track the changes before and after sorting, use here
+
+```c++
+struct SortedSeqItem {
+   void *start{nullptr};
+   void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+To track the position of the sequence after sorting, and add a new interface
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+Due to the sequence of input sequences, the following existing interfaces need to be modified:
+
+- InitMemories, memory needs to be rearranged according to `sorted_seqs`
+- SetmentInputs
+- ConcatOutputs
+
+In addition, because `sorted_seqs` needs to be multiplexed with `RecurrentGradientOp`, it will become a new output of `RecurrentOp`.
+It is passed in as an input to `RecurrentGradientOp`.
+
+## InitMemories
+Due to the sequence change, the order of the elements on the `boot_memories` batch also needs to be rearranged accordingly.
+
+## SegmentInputs
+
+`SegmentInputs` relies on the information of `sorted_seqs` to cut the original sequence from the horizontal to the input of each step in the sorted sequence order.
+
+the transition is as follows:
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+## ConcatOutputs
+`ConcatOutputs` needs
+
+- Restore the output of each time step back to the original input sequence order (to prevent the order of Infer phase from being upset)
+- Concat each sequence as a regular mini-batch representation
+
+## references
+1. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/doc/design/if_else_op.md b/doc/fluid/design/execution/if_else_op.md
similarity index 100%
rename from doc/design/if_else_op.md
rename to doc/fluid/design/execution/if_else_op.md
diff --git a/doc/fluid/design/execution/index_cn.rst b/doc/fluid/design/execution/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ed31b017429d168b2466d8f6b423f48bd5d78d1f
--- /dev/null
+++ b/doc/fluid/design/execution/index_cn.rst
@@ -0,0 +1,8 @@
+执行流程
+-------------
+
+.. toctree::
+  :maxdepth: 1
+
+  switch.md
+  if_else_op.md
diff --git a/doc/fluid/design/execution/index_en.rst b/doc/fluid/design/execution/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fcf846da348ff0bed707c42718e08314998fbac0
--- /dev/null
+++ b/doc/fluid/design/execution/index_en.rst
@@ -0,0 +1,8 @@
+Execution Process
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  switch.md
+  if_else_op.md
diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c337bd7159b25e594c2f91f9a143b3f4bc3c8e8
--- /dev/null
+++ b/doc/fluid/design/execution/switch.md
@@ -0,0 +1,31 @@
+# Design Doc: Switch
+
+## Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+with switch() as switch:
+    with switch.case(fluid.less_equal(a, 10)):
+        fluid.print("Case 1")
+    with switch.case(fluid.larger(a, 0)):
+        fluid.print("Case 2")
+    with switch.default():
+        fluid.print("Case 3")
+```
+
+## The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values.
+1. It runs the first matched case, or the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch.  It's like there is a C's `break` keyword at the end of each case.
+
+The above program should print and print only "Case 1".
+
+The implementation of the backward pass of the `switch` control-flow is easier than the backward of the `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branches.
diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..31b62a5eb3cd9b5b68d51abcd001fd5b8c39a914
--- /dev/null
+++ b/doc/fluid/design/index_cn.rst
@@ -0,0 +1,19 @@
+设计思想
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  motivation/index_cn.rst
+  execution/index_cn.rst
+  concepts/index_cn.rst
+  data_type/index_cn.rst
+  memory/index_cn.rst
+  multi_devices/index_cn.rst
+  dynamic_rnn/index_cn.rst
+  concurrent/index_cn.rst
+  algorithm/index_cn.rst
+  network/index_cn.rst
+  modules/index_cn.rst
+  interface/index_cn.rst
+  dist_train/index_cn.rst
diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2bfee02ad4626633b08ddff747e2886faf9ba99f
--- /dev/null
+++ b/doc/fluid/design/index_en.rst
@@ -0,0 +1,19 @@
+Design
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  motivation/index_en.rst
+  execution/index_en.rst
+  concepts/index_en.rst
+  data_type/index_en.rst
+  memory/index_en.rst
+  multi_devices/index_en.rst
+  dynamic_rnn/index_en.rst
+  concurrent/index_en.rst
+  algorithm/index_en.rst
+  network/index_en.rst
+  modules/index_en.rst
+  interface/index_en.rst
+  dist_train/index_en.rst
diff --git a/doc/fluid/design/interface/index_cn.rst b/doc/fluid/design/interface/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..69a8d9bad4fe88935b9fa87757abf0105ca8eb75
--- /dev/null
+++ b/doc/fluid/design/interface/index_cn.rst
@@ -0,0 +1,4 @@
+多语言接口
+------------
+
+TBD
diff --git a/doc/fluid/design/interface/index_en.rst b/doc/fluid/design/interface/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..22abc71f984aa5da7151d5ebf0c3bdbcc69a3624
--- /dev/null
+++ b/doc/fluid/design/interface/index_en.rst
@@ -0,0 +1,4 @@
+Multi-Language Interface
+-----------------------
+
+TBD
diff --git a/paddle/fluid/memory/README.md b/doc/fluid/design/memory/README.md
similarity index 100%
rename from paddle/fluid/memory/README.md
rename to doc/fluid/design/memory/README.md
diff --git a/doc/design/images/control_flow_graph.png b/doc/fluid/design/memory/images/control_flow_graph.png
similarity index 100%
rename from doc/design/images/control_flow_graph.png
rename to doc/fluid/design/memory/images/control_flow_graph.png
diff --git a/doc/design/images/dataflow_equations.png b/doc/fluid/design/memory/images/dataflow_equations.png
similarity index 100%
rename from doc/design/images/dataflow_equations.png
rename to doc/fluid/design/memory/images/dataflow_equations.png
diff --git a/doc/design/images/deep_learning.png b/doc/fluid/design/memory/images/deep_learning.png
similarity index 100%
rename from doc/design/images/deep_learning.png
rename to doc/fluid/design/memory/images/deep_learning.png
diff --git a/doc/fluid/design/memory/index_cn.rst b/doc/fluid/design/memory/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c507c638bd1a6eb428175ed2756a6ecfc6cca198
--- /dev/null
+++ b/doc/fluid/design/memory/index_cn.rst
@@ -0,0 +1,7 @@
+内存管理
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  memory_optimization.md
diff --git a/doc/fluid/design/memory/index_en.rst b/doc/fluid/design/memory/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f7526437a73a09b300f05e138084755f5528b242
--- /dev/null
+++ b/doc/fluid/design/memory/index_en.rst
@@ -0,0 +1,7 @@
+Memory Management
+-------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  memory_optimization.md
diff --git a/doc/design/memory_optimization.md b/doc/fluid/design/memory/memory_optimization.md
similarity index 100%
rename from doc/design/memory_optimization.md
rename to doc/fluid/design/memory/memory_optimization.md
diff --git a/doc/design/backward.md b/doc/fluid/design/modules/backward.md
similarity index 100%
rename from doc/design/backward.md
rename to doc/fluid/design/modules/backward.md
diff --git a/doc/fluid/design/modules/batch_norm_op.md b/doc/fluid/design/modules/batch_norm_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..e451ffcc73b5de2b911e1c6de54b42a5d1d54c37
--- /dev/null
+++ b/doc/fluid/design/modules/batch_norm_op.md
@@ -0,0 +1,134 @@
+# Batch Normalization
+
+## What is batch normalization
+
+Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training.
+
+The principle of batch normalization can be summarized into a simple function:
+
+```
+y = (x - E[x]) / STD[x]) * scale + bias
+```
+
+`x` is a batch of output data of a certain layer. `E[x]` and `STD[x]` is the mean and standard deviation of `x`, respectively。 `scale` and `bias` are two trainable parameters. The training of batch normalization layer equals to the learning of best values of `scale` and `bias`.
+
+In our design, we use a single operator(`batch_norm_op`) to implement the whole batch normalization in C++, and wrap it as a layer in Python.
+
+## Differences with normal operators
+
+`batch_norm_op` is a single operator. However, there are a few differences between `BatchNormOp` and normal operators, which we shall take into consideration in our design.
+
+1. `batch_norm_op` shall behave differently in training and inferencing. For example, during inferencing, there is no batch data and it's impossible to compute `E[x]` and `STD[x]`, so we have to use an `estimated_mean` and an `estimated_variance` instead of them. These require our framework to be able to inform operators current running type (training/inferencing), then operators can switch their behaviors.
+
+2. `batch_norm_op` shall have the ability to maintain `estimated_mean` and `estimated_variance` across mini-batch. In each mini-batch, `estimated_mean` is iterated by the following equations:
+
+```
+if batch_id == 0
+  estimated_mean = E[x]
+else
+  estimated_mean = estimated_mean * momentum + (1.0 - momentum_) * E[x]
+```
+
+The iterating of `estimated_variance` is similar. `momentum` is an attribute, which controls estimated_mean updating speed.
+
+## Implementation
+
+Batch normalization is designed as a single operator is C++, and then wrapped as a layer in Python.
+
+### C++
+
+As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes and compute kernels.
+
+#### Inputs
+
+- `x`: The inputs data, which is generated by the previous layer.
+- `estimated_mean`: The estimated mean of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `E[x]`.
+- `estimated_var`: The estimated standard deviation of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `STD[x]`.
+- `scale`: trainable parameter 'scale'
+- `bias`: trainable parameter 'bias'
+
+#### Outputs
+
+- `y`: The output data.
+- `batch_mean`: The mean value of batch data.
+- `batch_var`: The standard deviation value of batch data.
+- `saved_mean`: Updated `estimated_mean` with current batch data. It's supposed to share the memory with input `estimated_mean`.
+- `saved_var`: Updated `estimated_var` with current batch data. It's supposed to share the memory with input `estimated_var`.
+
+#### Attributes
+
+- `is_infer`: *bool*. If true, run `batch_norm_op` in inferencing mode.
+- `use_global_est`: *bool*. If true, use `saved_mean` and `saved_var` instead of `E[x]` and `STD[x]` in trainning.
+- `epsilon`: *float*. The epsilon value to avoid division by zero.
+- `momentum`: *float*. Factor used in `estimated_mean` and `estimated_var` updating. The usage is shown above.
+
+#### Kernels
+
+The following graph showes the training computational process of `batch_norm_op`:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_op_kernel.png" width="800"/>
+
+cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
+
+### Python
+
+`batch_norm_op` is warpped as a layer in Python:
+
+```python
+def batch_norm_layer(net,
+                     input,
+                     output,
+                     scale,
+                     bias,
+                     use_global_est = False,
+                     epsilon = 1e-6,
+                     momentum = 0.99):
+	mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
+	var_cache = scop.new_var(name = 'estimated_var', trainable = False)
+	batch_mean = scope.new_var(name = 'batch_mean')
+	batch_var = scope.new_var(name = 'batch_var')
+	batch_norm_op = Operator('batch_norm_op',
+	                         x = input,
+	                         estimated_mean = mean_cache,
+	                         estimated_mean = var_cache,
+	                         scale = scale,
+	                         bias = bias,
+	                         y = output,
+	                         batch_mean = batch_mean,
+	                         batch_var = batch_var,
+	                         saved_mean = mean_cache,
+	                         saved_var = var_cache,
+	                         is_infer = False,
+	                         use_global_est = use_global_est,
+	                         epsilon = epsilon,
+	                         momentum = momentum)
+	net.append_op(batch_norm_op)
+	return output
+```
+
+Because Python API has not been finally decided, the code above can be regarded as pseudo code. There are a few key points we shall note:
+
+1. `estimated_mean` and `estimated_var` are assigned the same variables with `saved_mean` and `saved_var` respectively. So they share same the memories. The output mean and variance values(`saved_mean` and `saved_var`) of a certain batch will be the inputs(`estimated_mean` and `estimated_var`) of the next batch.
+
+2. `is_infer` decided whether `batch_norm_op` will run in training mode or inferencing mode. However, a network may contains both training and inferencing parts. And user may switch `batch_norm_op`'s running mode in Python `for` loop like this:
+
+```python
+for pass_id in range(PASS_NUM):
+    # ...
+    net.train()  # run training model
+    if pass_id % 100 == 0:
+        net.infer(test_image)    # run inferencing model
+    # ...
+```
+
+`is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:
+
+<div align=center>
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/batch_norm_fork.png" width="500"/>
+</div>
+
+Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate.
+
+When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore right branch automatically. When the net runs in inferencing mode, the process is reversed.
+
+How to set a target is related to Python API design, so I will leave it here waiting for more discussions.
diff --git a/doc/fluid/design/modules/evaluator.md b/doc/fluid/design/modules/evaluator.md
new file mode 100644
index 0000000000000000000000000000000000000000..de9605b0e67a035ab1ef1e4cafbe838f83bc5807
--- /dev/null
+++ b/doc/fluid/design/modules/evaluator.md
@@ -0,0 +1,58 @@
+# Evaluator Design
+
+## Problem Statement
+
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
+
+## Evaluator Design
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
+
+1. Initialize the metric state and add it into the block.
+
+2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once.
+
+
+3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
+
+## Implementation
+This design is shown in the Python API.
+Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass.
+
+
+```python
+class Evaluator(object):
+    """
+    Evaluator Base class.
+    """
+    def __init__(self, name, **kwargs):
+       """
+       Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts.
+       Auc need four variables, `true_positives`,
+         `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program
+
+       The initialization of Evaluator should be responsible for:
+       create metric states and append to the main_program
+       """
+       pass
+
+    def _update_ops(self, input, label, **kwargs)
+       """
+       Add mini-batch evaluator caculate operators to the main_program.
+       Add increment operator to accumulate the metric states.
+       """
+
+
+    def reset(self, executor, reset_program=None):
+      """
+      Reset metric states at the begin of each pass/user specified batch number.
+      Execute the reset_program to reset the states.
+      """
+
+
+    def eval(self, executor, eval_program=None):
+      """
+      Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
+      Execute the eval_program and return the result.
+      """
+      return eval_result
+```
diff --git a/paddle/fluid/operators/images/batch_norm_fork.dot b/doc/fluid/design/modules/images/batch_norm_fork.dot
similarity index 100%
rename from paddle/fluid/operators/images/batch_norm_fork.dot
rename to doc/fluid/design/modules/images/batch_norm_fork.dot
diff --git a/paddle/fluid/operators/images/batch_norm_fork.png b/doc/fluid/design/modules/images/batch_norm_fork.png
similarity index 100%
rename from paddle/fluid/operators/images/batch_norm_fork.png
rename to doc/fluid/design/modules/images/batch_norm_fork.png
diff --git a/paddle/fluid/operators/images/batch_norm_op_kernel.png b/doc/fluid/design/modules/images/batch_norm_op_kernel.png
similarity index 100%
rename from paddle/fluid/operators/images/batch_norm_op_kernel.png
rename to doc/fluid/design/modules/images/batch_norm_op_kernel.png
diff --git a/doc/design/images/feed_forward.png b/doc/fluid/design/modules/images/feed_forward.png
similarity index 100%
rename from doc/design/images/feed_forward.png
rename to doc/fluid/design/modules/images/feed_forward.png
diff --git a/doc/design/images/feed_forward_regularized.png b/doc/fluid/design/modules/images/feed_forward_regularized.png
similarity index 100%
rename from doc/design/images/feed_forward_regularized.png
rename to doc/fluid/design/modules/images/feed_forward_regularized.png
diff --git a/doc/design/images/l1_regularization.png b/doc/fluid/design/modules/images/l1_regularization.png
similarity index 100%
rename from doc/design/images/l1_regularization.png
rename to doc/fluid/design/modules/images/l1_regularization.png
diff --git a/doc/design/images/l2_regularization.png b/doc/fluid/design/modules/images/l2_regularization.png
similarity index 100%
rename from doc/design/images/l2_regularization.png
rename to doc/fluid/design/modules/images/l2_regularization.png
diff --git a/doc/design/images/loss_equation.png b/doc/fluid/design/modules/images/loss_equation.png
similarity index 100%
rename from doc/design/images/loss_equation.png
rename to doc/fluid/design/modules/images/loss_equation.png
diff --git a/doc/fluid/design/modules/index_cn.rst b/doc/fluid/design/modules/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b25783f0f5120991c29ba31b7b512bd4c183eecf
--- /dev/null
+++ b/doc/fluid/design/modules/index_cn.rst
@@ -0,0 +1,14 @@
+代码结构和重要模块
+-----------------
+
+.. toctree::
+  :maxdepth: 1
+
+  backward.md
+  python_api.md
+  regularization.md
+  infer_var_type.md
+  optimizer.md
+  prune.md
+  register_grad_op.md
+  net_op_design.md
diff --git a/doc/fluid/design/modules/index_en.rst b/doc/fluid/design/modules/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2108156e080996916f2650448f0a56f998757204
--- /dev/null
+++ b/doc/fluid/design/modules/index_en.rst
@@ -0,0 +1,14 @@
+Code Structure and Important Modules
+-------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  backward.md
+  python_api.md
+  regularization.md
+  infer_var_type.md
+  optimizer.md
+  prune.md
+  register_grad_op.md
+  net_op_design.md
diff --git a/doc/design/infer_var_type.md b/doc/fluid/design/modules/infer_var_type.md
similarity index 100%
rename from doc/design/infer_var_type.md
rename to doc/fluid/design/modules/infer_var_type.md
diff --git a/doc/fluid/design/modules/net_op_design.md b/doc/fluid/design/modules/net_op_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..e64ac2fb1c6898bfeb883250347da3d9a4757b97
--- /dev/null
+++ b/doc/fluid/design/modules/net_op_design.md
@@ -0,0 +1,250 @@
+# Network Design
+
+`Network` is the container and controller of a set of operators,
+user can build a real network from a `NetDesc` which is a protobuf message
+and use `Network.Run()` to run all the operators in the network.
+
+A network object knows all Operators belonging to this network. Variables,
+which are inputs and outputs of these operators,
+are created and managed by a hierarchy of Scope objects.
+
+## API
+
+### Net
+To make the `Network` extendable, a base class is defined like this
+
+```c++
+// operator's index stored in a network.
+typedef int OpIndex;
+
+// The minimum a network should be implemented.
+class Net {
+ public:
+  // run all the operators and return success(true) or not, with all the
+  // variables are located in `scope`. `context` describes the detail execution
+  // environment for ops. `begin` and `end` specify the scope of `ops_` to run,
+  // If no positive indexes are provided, all operators in `ops_` will run.
+  virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1,
+                   OpIndex end = -1) const = 0;
+
+  // Add an Operator according to `def`.
+  virtual OpIndex AddOp(const proto::OpDef &def) = 0;
+
+  // Add optimizer operators acctording to `attrs`.
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
+
+  // Add backward operators.
+  virtual Error AddBackwardOps() = 0;
+
+  // Infer the shapes of variables required by operators in the network. The
+  // `scope` will be mutated according to the inferred shapes.
+
+  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
+};
+```
+
+All network implementations should build networks from a protobuf message which
+describes the structure of a real network; `Run` method should be implemented by
+all implementations to offer a universal method to forward or backward compute a network.
+
+`Net::Create` is a method of factory pattern and can be implemented like
+
+```c++
+std::unique<Net> Net::Create(const NetDesc& def) {
+  switch (def.model_type()) {
+    case NN:
+      return new Network(def);
+    case Recursive:
+      return new RecursiveNet(def);
+    case Recurrent:
+      return new RecurrentNet(def);
+  }
+  return nullptr;
+}
+```
+
+Network is designed as the container of operators. to make it more extendable,
+we decouple it from the related variable resources.
+
+`Run(Scope* scope)` takes the scope as a argument so that it can run in different scopes.
+
+Finally, `Net` can be used as followed
+
+```c++
+Scope default_scope;
+OpContext default_context;
+auto net = Net::CreateNet(def);
+
+if (net) {
+  net.Run(&default_scope, &default_context);
+}
+```
+
+### `PlainNet` as a simple implementation of `BaseNet`
+
+A very basic implementation is as follows. All it does is simply to run every operators in sequence.
+
+```c++
+class PlainNet : public Net {
+ public:
+  // Create a network describe by `def`.  NetDesc is the definition of a network.
+  PlainNet(const NetDesc &def);
+
+  // Infer all the operators' input and output varialbes' shapes, will be called before every mini-batch
+  training.
+  virtual Error InferShape(Scope *scope) override;
+
+  // Run all the operators with the `scope`, if no scope is provided, default
+  // scope will be used instead. If no OpContext is provicded, default context will be used.
+  virtual Error Run(Scope *scope = nullptr, OpContext *context=nullptr, OpIndex begin = -1,
+                   OpIndex end = -1) const override;
+
+  virtual OpIndex AddOp(const proto::OpDef &def) override;
+
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
+
+  virtual Error AddBackwardOps() override;
+
+ protected:
+  // Create operators accordding to `def`, will be called by the constructor.
+  Error BuildNet(const NetDesc &def);
+
+  // Add a operator which is identified as `type` and has attributes described
+  // in `attrs`, the `inputs` are the keys of readonly input variables,
+  // `outputs` are keys of mutable output variables. An `OpIndex` will be
+  // returned to indicate the offset of the new operator in `ops_`.
+  OpIndex AddOp(const std::string &type, const std::vector<string> &inputs,
+                const std::vector<string> &outputs,
+                const OprAttr &attrs = OprAttr());
+
+ private:
+  // the operators owned by `Network`.
+  std::vector<Operator> ops_;
+};
+```
+
+`PlainNet` will create operators so that a private member `ops_` is defined,
+the operators are created by `CreateNet`, and each operator is created by `AddOp`.
+
+
+## PlainNet Usage
+`PlainNet` can be used to define and run a network as follows
+
+```c++
+// create an empty scope located on CPU device.
+Scope scope(CPUPlace());
+
+// create and init variables described in `net_desc`.
+scope.CreateVariables(net_desc);
+scope.InitVariables(net_desc);
+
+// create a network according to `net_desc`
+auto net = Net::CreateNet(net_desc);
+// Add more operators if needed.
+net->AddOp(add...);
+net->AddOp(fc...);
+
+net->AddBackwardOps();
+net->AddOptimizerOps();
+
+// run the network providing the `scope`.
+net.Run(&scope);
+```
+
+## `NetBuilder` as a C++ syntax wrapper
+This is a detailed description of the user-related C++ network API, and may not needed in the prototype development stage.
+
+The `NetBuilder` will give users a much simpler syntax as follows to create a network, and demonstrates how to use the `BaseNet`'s raw interfaces.
+
+```c++
+Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid");
+Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid");
+Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label);
+Variable* avg_loss = builder.AddOp("mean", loss);
+
+builder.BackwardFrom(avg_loss)
+builder.AddOptimization(1e-4, "adam");
+builder.Run();
+```
+
+`NetBuilder` will call `Net` 's virtual functions to change the real network structure, here is a sample definition
+
+```c++
+class NetBuilder final {
+ public:
+  NetBuilder(Net* net) : net_(net) {}
+
+  Variable* AddOp(const string& type, const vector<Variable>& inputs,
+                  size_t size, Activation act) {
+    // much code here.
+    // ...
+    net_->AddOp(def);
+    need_rebuild_net_ = true;
+    net_->InferShape();
+    // ...
+  }
+
+  Error BackwardFrom(const Variable& cost);
+
+  Error Run(Scope* scope, OpContext* context, bool need_backward = true) {
+    // backward.
+    if (need_backward) {
+      if (need_rebuild_net_) {
+        AddBackwardOps();
+        AddOptimizerOps();
+      }
+      net_->Run(scope, context);
+      return;
+    }
+    // just forward.
+    net_->Run(scope, context, 0, last_forward_op_);
+  }
+
+ protected:
+  Error AddBackwardOps();
+  Error AddOptimizerOps();
+
+ private:
+  Net* net_;
+  OpIndex last_forward_op_{-1};
+  bool need_rebuild_net_{true};
+}
+```
+
+### Compatibility with RNN
+
+Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design,
+for example we can implement a simple recurrent neural network as follows
+
+```c++
+// copy some `vars` form `source` to `target`
+void Copy(const Scope &source, Scope &target,
+          const std::vector<std::string> &vars);
+
+Scope default_scope;
+// some initial mutations on `default_scope` here.
+
+auto rnn_step_net = PlainNet(rnn_step_net_def);
+
+// Create rnn's states, the last scope is used to store rnn outputs.
+Scope *rnn_states = new Scope[num_states + 1];
+
+for (int i = 0; i < num_states + 1; i++) {
+  // Initialize all rnn state scopes, copy parameters and so on.
+  rnn_states[i].CreateVars(rnn_step_net_def);
+  Copy(default_scope, rnn_states[i], rnn_related_vars);
+  // Prepare rnn's inlinks, just copy inlink variables to each state.
+  Copy(default_scope, rnn_states[i], inlink_vars);
+}
+
+// Run the rnn.
+for (int i = 0; i < num_states; i++) {
+  rnn_step_net.Run(rnn_states[i]);
+  // Copy current state's state variables to next state, the related variables
+  // are named like "previous_state_xxx".
+  Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars)
+}
+
+// Copy rnn's final outputs to `default_scope`.
+Copy(rnn_states[num_states], default_scope, outlink_vars);
+```
diff --git a/doc/fluid/design/modules/optimizer.md b/doc/fluid/design/modules/optimizer.md
new file mode 100644
index 0000000000000000000000000000000000000000..1c25fde9cafb322f789662077d3fc6cc1d64ce38
--- /dev/null
+++ b/doc/fluid/design/modules/optimizer.md
@@ -0,0 +1,91 @@
+# Optimizer Design
+
+## The Problem
+
+A PaddlePaddle program, or a block, is a sequence of operators operating variables.  A training program needs to do three kinds of works:
+
+1. the forward pass, which computes intermediate results and the cost(s),
+1. the backward pass, which derives gradients from intermediate results and costs, and
+1. the optimization pass, which update model parameters to optimize the cost(s).
+
+These works rely on three kinds of operators:
+
+1. forward operators,
+1. gradient operators, and
+1. optimization operators.
+
+It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
+
+In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass.
+
+
+## High-level Python API to describe the training process
+
+1. User write code to describe the network:
+
+	```python
+	images = layer.data("images")
+	labels = layer.data("labels")
+	w1 = pd.var("w1")
+	b1 = pd.var("b1")
+	hidden = layer.fc(images, w=w1, b=b1)
+	cost = layer.mse(hidden, labels)
+	```
+
+	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+
+
+2. Users create a certain kind of Optimizer with some argument.
+
+	```python
+	optimizer = AdagradOptimizer(learing_rate=0.001)
+	```
+
+3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
+
+	```python
+	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
+	```
+	The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is list of optimization operators that will be run by session.
+
+4. Users use Session/Executor to run this opt_op_list as target to do training.
+
+	```python
+	sess.run(target= opt_op_list, ...)
+	```
+
+### Optimizer Python interface:
+
+```python
+class Optimizer(object):
+    """Optimizer Base class.
+
+    """
+
+    def __init__(self):
+        pass
+
+    def create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to update gradients to variables.
+
+        Args:
+          parameters_and_grads: a list of (variable, gradient) pair to update.
+
+        Returns:
+          optmization_op_list: a list of optimization operator that will update parameter using gradient.
+        """
+        return None
+
+    def minimize(self, loss, parameter_list):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+
+        This method combines interface `append_backward()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = self.create_backward_pass(loss, parameter_list)
+        update_ops = self.create_optimization_pass(params_grads)
+        return update_ops
+
+```
+
+Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
diff --git a/doc/design/prune.md b/doc/fluid/design/modules/prune.md
similarity index 100%
rename from doc/design/prune.md
rename to doc/fluid/design/modules/prune.md
diff --git a/doc/fluid/design/modules/python_api.md b/doc/fluid/design/modules/python_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..265732a348ea77d21005e335390d99abcdfbd045
--- /dev/null
+++ b/doc/fluid/design/modules/python_api.md
@@ -0,0 +1,325 @@
+# Design Doc: Python API
+
+Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+
+<table>
+<thead>
+<tr>
+<th>Python classes</th>
+<th>Protobuf messages</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Program </td>
+<td>ProgramDesc </td>
+</tr>
+<tr>
+<td>Block  </td>
+<td>BlockDesc </td>
+</tr>
+<tr>
+<td>Operator </td>
+<td>OpDesc </td>
+</tr>
+<tr>
+<td>Variable </td>
+<td>VarDesc </td>
+</tr>
+</tbody>
+</table>
+
+
+Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages.
+
+## Core Concepts
+
+### Program
+
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+
+Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
+
+```python
+class Program(objects):
+    def __init__(self):
+        self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block = 0          # initialized to the global block
+
+    def global_block():
+        return self.blocks[0]
+
+    def current_block():
+        return self.get_block(self.current_block)
+
+    def rollback():
+        self.current_block = self.current_block().parent_idx
+
+    def create_block():
+        new_block_idx = len(self.block)
+        self.blocks.append(Block(self, self.current_block))
+        self.current_block = new_block_idx
+        return current_block()
+```
+
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+
+### Block
+
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/block.md) includes
+
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+
+```python
+class Block(objects):
+    def __init__(self, program, parent_idx):
+        self.desc = core.NewBlock(program.desc)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+
+    def create_var(self, ...):
+        return Variable(self, ...)
+
+    def _create_global_var(self, ...):
+        program.global_block().create_var(...)
+
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+
+    def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+
+`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
+
+`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+
+### Operator
+
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
+
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<stirng, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
+        core.infer_shape(self.desc, inputs, outputs)
+
+    def type(self):
+        return self.desc.type()
+```
+
+`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
+
+### Variable
+
+Operators take Variables as its inputs and outputs.
+
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
+        self.writer = None
+```
+
+Please be aware of `self.writer`, that tracks operator who creates the variable.  It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+
+### Parameter
+
+A parameter is a global variable with an initializer (or load) operator.
+
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 trainable,       # bool
+                 initialize_op_attrs,
+                 optimize_op_attrs):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs)
+```
+
+When users create a parameter, they can call
+
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    type: "uniform_random",
+    min: -1.0,
+    max: 1.0,
+  })
+)
+```
+
+In above example, `init_attr.type` names an initialize operator.  It can also name the load operator
+
+```python
+init_attr={
+ type: "load",
+ filename: "something.numpy",
+}
+```
+
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+
+## Layer Function
+
+A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
+
+Layer functions take `Variable` and configuration parameters as its input and return the output variable(s).
+
+For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable.
+
+
+### Necessity for reusing code between layer functions
+
+There are a lot of code that can be reused. Such as
+
+* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. and default initialize strategy for bias is to fill zero.
+* Append the activation operator.
+* Create a temporary variable.
+* Create parameter.
+* Generate a unique name.
+* Add a bias.
+* ...
+
+A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions.
+
+
+
+### Comparision between global functions and helper class
+
+The `FullyConnected` layer will be as follow when we provide global functions:
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  if name is None:
+    name = unique_name("fc")
+  input = multiple_input(input)
+  param_attr = default_param_attr(param_attr)
+  param_attr = multiple_param_attr(param_attr, len(input))
+
+  # mul
+  mul_results = []
+  for ipt, attr in zip(input, param_attr):
+    shape = ipt.shape[1:] + [size]
+    w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr)
+    tmp = create_tmp_var(name)
+    g_program.current_block().append_op("mul", {ipt, w}, {tmp})
+  mul_results.append(tmp)
+
+  # add sum
+  ...
+  # add bias
+  ...
+  # add activation
+  ...
+  return out
+```
+
+We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions:
+
+1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use.
+2. Global functions will force layer developers to pass its parameter time by time.
+
+So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow.
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  helper = LayerHelper(locals())  # pass all parameter to LayerHelper
+
+  mul_results = []
+  for ipt, param in helper.iter_multiple_input_and_param():
+    w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype)
+    tmp = helper.create_tmp_variable()
+    helper.append_op('mul', {ipt, w}, {tmp})
+    mul_results.append(tmp)
+
+  pre_bias = helper.add_sum(mul_results)
+  pre_activation = helper.add_bias(pre_bias)
+  return helper.add_activation(pre_activation)
+```
+
+We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor.
+
+
+### Implementation of layer helper
+
+We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` are:
+
+```python
+class LayerHelper(object):
+  def __init__(self, **kwargs):  # kwargs is short for `keyword arguments`
+    self.kwargs = kwargs
+
+  def add_activation(self, input_var):
+    act = self.kwargs.get("act", None)  # default value is None
+    if act is None:  # do nothing if no act
+      return input_var
+
+    tmp = self.create_tmp_var(self)
+    self.append_op(type=act, input=input_var, output=tmp)
+    return tmp
+```
+
+### Return value of layer functions
+
+The layer will return a Variable, which is also the output of an operator.  However, outputs of a layer function have more attributes than an operator. There are parameter variables, and their gradient variables need to return. To return them is useful. For example,
+
+1. Users can debug the network by printing parameter gradients.
+2. Users can append attributes to a parameter, such as, `param.stop_gradient=True` will make a parameter stop generate the gradient. We can fix the parameter value during training by using this attribute.
+
+However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field for layer function since the Python is dynamic typing.
+
+The sample usage is
+
+```python
+data = fluid.layers.data(...)
+hidden = fluid.layers.fc(data, ...)
+...
+
+executor.run(fetch_list=[hidden.param, hidden.param.grad], ...)
+```
+
+
+## Optimizer
+
+[Optimizer Design Doc](./optimizer.md)
diff --git a/doc/design/register_grad_op.md b/doc/fluid/design/modules/register_grad_op.md
similarity index 100%
rename from doc/design/register_grad_op.md
rename to doc/fluid/design/modules/register_grad_op.md
diff --git a/doc/fluid/design/modules/regularization.md b/doc/fluid/design/modules/regularization.md
new file mode 100644
index 0000000000000000000000000000000000000000..519a9143033386678351ff78a465e5ba6e220c52
--- /dev/null
+++ b/doc/fluid/design/modules/regularization.md
@@ -0,0 +1,66 @@
+# Regularization in PaddlePaddle
+
+## Introduction to Regularization
+A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is the problem of **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restricts the parameter spaces that an optimization algorithm can explore.
+
+### Parameter Norm Penalties
+Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/loss_equation.png" align="center"/><br/>
+
+The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
+
+The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
+
+##### L2 Regularization:
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l2_regularization.png" align="center"/><br/>
+
+##### L1 Regularization
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/l1_regularization.png" align="center"/><br/>
+
+A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
+
+## Regularization Survey
+
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
+
+## Proposal for Regularization in PaddlePaddle
+
+### Low-Level implementation
+
+In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations:
+- L2_regularization_op
+- L1_regularization_op
+
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) in Python API.
+
+### Computation Graph
+
+Below is an example of a really simple feed forward neural network.
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward.png" align="center"/><br/>
+
+The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/feed_forward_regularized.png" align="center"/><br/>
+   
+### Python API implementation for Regularization
+
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
+
+#### Creation of Regularization ops
+There are two possibilities for creating the regularization ops:
+1. We create these ops immediately while building the computation graph.
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added.
+
+The proposal is to add these ops in a lazy manner just before the backward pass.
+
+#### Storage of Regularization attributes
+
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters.
+
+#### High-level API
+
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
diff --git a/doc/design/selected_rows.md b/doc/fluid/design/modules/selected_rows.md
similarity index 100%
rename from doc/design/selected_rows.md
rename to doc/fluid/design/modules/selected_rows.md
diff --git a/doc/fluid/design/motivation/api.md b/doc/fluid/design/motivation/api.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc222564e3ec28e306ca0572b6a23104f6e9cbc5
--- /dev/null
+++ b/doc/fluid/design/motivation/api.md
@@ -0,0 +1,261 @@
+# PaddlePaddle Design Doc
+
+## Ingredients
+
+As our design principle is starting from the essence: how could we
+allow users to express and solve their problems as neural networks.
+Some essential concepts that our API have to provide include:
+
+1. A *topology* is an expression of *layers*.
+
+1. A layer could be any kind of computation, including *cost*.
+
+1. Some layers have parameters, some don't. Most costs don't have
+   parameters.
+
+1. In some topologies, layers share parameters.  For
+   example,
+   [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
+
+1. At programming time, users specify topologies and possible sharing
+   of parameters.  PaddlePaddle can figure out and create parameters
+   required (and possibly shared) by one or more topologies.
+
+
+## Starting from Examples
+
+As a summarization
+of
+[our disucssion](https://github.com/PaddlePaddle/Paddle/issues/1315),
+let us present two examples here:
+
+
+### Example 1. Sharing Parameters between Layers
+
+We use
+the
+[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
+in this example.  For your convenience, I copy-a-paste the model's
+topology as follows:
+
+```
+A -> f -\
+Q -> f --> cost
+B -> f -/
+```
+
+The following program trains the topology including the cost, and then
+use the sub-network in the trained topology in inference:
+
+```python
+def f(in):
+    e = paddle.layer.embedding(in, parameter_name="embedding")
+    o = paddle.layer.softmax(e, parameter_name="semantic")
+    return o
+
+# Create 3 topologies (subnets), they share parameters because all
+# correspoinding layers have the same parameter names.
+fA = f(paddle.layer.data(input_name="A"))
+fB = f(paddle.layer.data(input_name="B"))
+fQ = f(paddle.layer.data(input_name="Q"))
+
+topology = paddle.layer.less_than(
+               paddle.layer.cross_entropy(fA, fQ),
+               paddle.layer.corss_entropy(fB, fQ))
+
+# Derive parameters required in topology and create them in model.
+parameters = paddle.parameters.create(topology)
+
+# Estimate parameters used in topology from data.
+paddle.train(topology, parameters, reader=read_ranking_model_data)
+
+# Inference using fA (or fB or fC, as they share their parameters).
+[testA, testB, testQ] = read_ranking_model_data()
+print "The sematic-vector of testA: ", paddle.infer(fA, parameters, testA)
+```
+
+
+### Example 2. Sharing Parameters between "Models"
+
+We use GAN in this example.  In the following example program, `d0` and `d1`
+correspond to the two networks in the following figure:
+
+<img src="https://github.com/wangyang59/book/raw/00036f4b0da5225041a6824587c1a01cf20159b1/gan/image/gan_ig.png" width=400 />
+
+```python
+def G(in):
+    # over-simplified example as G has only one layers:
+    return paddle.layer.fc(in, parameter_name="G")
+
+def D(in);
+    # again, over-simplified:
+    return paddle.layer.fc(in, parameter_name="D")
+
+# Construct the first topology, which contains both D and G.
+# By learning this topology, we update parameters of G.
+d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))
+
+# Construct a second topology d1, which contains only D. By
+# training this topology, we update parameters of D.  Note
+# that d1 share parameters with d0.
+d1 = paddle.layer.should_be_true(D(paddle.layer.data()))
+
+# Create parameters from a list of multiple topologies (models) for
+# the chance to share parameters between these topologies.
+parameters = paddle.parameters.create([d0, d1])
+
+# Iterative training of GAN.
+for ...:
+    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
+    train(d1, parameters, reader=read_from_realistic_images)
+
+# Use d1 for inference:
+print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
+```
+
+
+### Summarization
+
+
+Above two programs reveal some important design concerns:
+
+1. Users describe a topology as an expression of layers.  Every layer
+   has a *parameter name*.  If the users don't specify it explicitly, it's automatically generated as a unique name.  By
+   specifying the parameter name, users can specify the sharing of
+   parameters between layers and even between topologies.
+
+1. `paddle.parameters.create` figures out parameters required by one
+   or more topologies from parameter names of layers.  It creates these
+   parameters and returns a `ParameterSet` object, which is in essence
+   a map from *parameter names* to *parameters*.
+
+1. At training and inference time, `paddle.train` and `paddle.infer`
+   requires both a topology and the parameter set that holds the parameters of that topology.  There are some reasons:
+
+   1. This prevents users from forgetting to call
+      `paddle.parameters.create`.
+   1. `paddle.train` needs to know which parameter set to update.
+   1. Users could load another (pre-trained) parameter set and use it
+      with a topology in `train.infer`.
+
+1. By specifying the `immutable_parameters` parameter of
+   `paddle.train`, we can forbid the update of these parameters.
+
+
+## Reader
+
+Not all programming frameworks allow users to define I/O functions.
+An example is Google MapReduce, which can only read from text,
+SSTable, and RecordIO files.  Hadoop MapReduce allows users to define
+readers and writers by deriving from base classes `Reader` and
+`Writer`.  The former is less flexible but also less error-prone.  We
+decide to provide the flexibility to users to define their readers.
+
+
+There are some open questions here:
+
+1. **Should a reader return a Python dictionary?**
+
+1. **How to map multiple outputs from a reader to multiple data layers?**
+
+1. **How to easily compose some existing readers to read more data and
+   feed a topology with more data layers?**
+
+
+## Training
+
+The recommended way to training a model is to call `paddle.train`,
+which simply calls `paddle.trainer.Default`, a global variable of
+type `paddle.trainer.SGD`.  Equivalently, we can do
+
+```python
+opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
+opt.train(topology, parameters, reader=read, ...)
+```
+
+### Updater
+
+Please be aware that a trainer can accept an updater as its data
+member, where an updater is a class derived from
+`paddle.trainer.Updater`.  This is to make it easier to customize
+trainers, as discussed
+[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
+
+### Event Handler
+
+`paddle.train` and `paddle.trainer.XXX.train` take an optional
+parameter `event_handler`, which should be either `None` or a function
+that handle some events:
+
+1. BeginTraining
+1. EndTraining
+1. BeginIteration
+1. EndIteration
+1. BeginPass
+1. EndPass
+
+where EndPass is sent if and only if the reader yields
+`end_pass=True`.
+
+An example as follows:
+
+```python
+def event_handler(event):
+    if ininstance(event, paddle.event.EndIteration):
+        print paddle.test(...)
+
+paddle.train(topology, parameters, reader, event_handler)
+```
+
+If we are writing a PaddlePaddle program in and for iPython/Jypyter,
+we can use metaplotlib in the event handler to plot a curve of
+cost/error versus iterations, as shown
+[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
+
+### Distributed Training
+
+If users want to do distributed training on a cluster, s/he should
+call `paddle.dist_train` and provides access tokens to the cluster as
+a parameter.
+
+For example, if the user has a TLS certificate that allows him to
+access a Kubernetes cluster, s/he should be able to call
+
+```python
+paddle.dist_train(model,
+                  trainer=paddle.trainer.SGD(...,
+                                             paddle.updater.Adam(...)),
+                  reader=read,
+                  k8s_user="yi",
+                  k8s_token="kube_cluster_tls.pem",
+                  k8s_job="hello",
+                  num_parameter_servers=15)
+```
+
+The pseudo code of `paddle.dist_train` is as follows:
+
+```python
+def dist_train(topology, parameters, trainer, reader, ...):
+    if os.getenv("KUBERNETES_SERVICE_HOST") == None:
+        image_name = k8s_user + '/' + k8s_job
+        docker_build(image_name)
+        docker_push()
+        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
+    else:
+        rank = kube_list_containers_in_job_and_return_current_containers_rank()
+        if rank == 0:
+            master()
+        elif rank < 15:
+            parameter_server()
+        else:
+            trainer.train(model, reader=read)
+```
+
+Please be aware that if a process is running on the Kubernetes
+cluster, it will have some environment variables pre-defined.
+
+If `dist_train` doesn't see these environment variables, it knows
+that it's running on users' personal computer, and it should work as a
+*launcher*.  Otherwise, it knows that it's running on the cluster and
+need to figure out its role as either the master, or a trainer, or a
+parameter server.
diff --git a/doc/design/fluid-compiler.graffle b/doc/fluid/design/motivation/fluid-compiler.graffle
similarity index 100%
rename from doc/design/fluid-compiler.graffle
rename to doc/fluid/design/motivation/fluid-compiler.graffle
diff --git a/doc/design/fluid-compiler.png b/doc/fluid/design/motivation/fluid-compiler.png
similarity index 100%
rename from doc/design/fluid-compiler.png
rename to doc/fluid/design/motivation/fluid-compiler.png
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b7696cc1bbf57ace72c4d31ffc2bfe6c1071939
--- /dev/null
+++ b/doc/fluid/design/motivation/fluid.md
@@ -0,0 +1,140 @@
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe.  However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution, which describes the "process" of training or inference using the concept of a model.  In fact in PyTorch, TensorFlow Eager Execution and Fluid, there is no  concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in the above mentioned idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the directions of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+<table>
+<thead>
+<tr>
+<th>Existed since</th>
+<th>model as sequence of layers</th>
+<th>model as graph of operators</th>
+<th>No model</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>2013 </td>
+<td>Caffe, Theano, Torch, PaddlePaddle </td>
+<td> </td>
+<td> </td>
+</tr>
+<tr>
+<td>2015 </td>
+<td> </td>
+<td>TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
+<td> </td>
+</tr>
+<tr>
+<td>2016 </td>
+<td> </td>
+<td>   </td>
+<td> PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid</td>
+</tr>
+</tbody>
+</table>
+
+
+From the above table, we see that the deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms* or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    forward({input=x, data=m}, minimize=c)
+    backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer made a mistake in configuring the model, the error messages wouldn't show up until the second part is executed and `forward` and `backward` propagations are performed. This makes it difficult for the programmer to debug and locate a mistake that is located blocks away from the actual error prompt.
+
+This problem of being hard to debug and re-iterate fast on a program is the primary reason that programmers, in general,  prefer PyTorch over the older systems.  Using PyTorch, we would write the above program as following:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    x = m["image"]
+    l = m["label"]
+    f = layer.fc(x, W)
+    s = layer.softmax(f)
+    c = layer.mse(l, s)
+    backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is the moving the model configuration part (the first step) into the training loop.  This change would allow the mistakes in model configuration to be reported where they actually appear in the programming block.  This change also represents the model better, or its forward pass, by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also brings Fluid, the flexibility to define different non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator.  A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+    m = read_minibatch()
+    x = m["sentence"]
+    for t in xrange x.len():
+        h[t] = the_step(x[t])
+```        
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+  m = read_minibatch()
+  x = m["sentence"]
+  rnn = layers.While(...)
+  with rnn.block():
+    h[t] = the_step(input[t])
+```    
+
+An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/bde090a97564b9c61a6aaa38b72ccc4889d102d9/python/paddle/fluid/tests/unittests/test_while_op.py#L50-L58).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine.  For a programming language, if it provides if-then-else and loop, it is Turing complete.  From the above examples, Fluid seems to be Turing complete; however, it is noteworthy to notice that there  is a slight difference between the `if-then-else` of Fluid and that of a programming language. The difference being that the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. This hasn't been researched in depth if this is equivalent to the `if-then-else` in programming languages that makes them Turing-complete.  Based on a conversation with [Yuang Yu](https://research.google.com/pubs/104812.html), it seems to be the case but this needs to be looked into in-depth.
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program.  When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+
+Fluid is moving towards the direction of a compiler, which is explain in [fluid_compiler.md](fluid_compiler.md).
+
+## Backward Compatibility of Fluid
+
+Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference.  For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph).  Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators.  The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
diff --git a/doc/fluid/design/motivation/fluid_compiler.md b/doc/fluid/design/motivation/fluid_compiler.md
new file mode 100644
index 0000000000000000000000000000000000000000..6dd3840a0734e8593890dcf8044746197350c6f5
--- /dev/null
+++ b/doc/fluid/design/motivation/fluid_compiler.md
@@ -0,0 +1,110 @@
+# PaddlePaddle Fluid: Towards a Compiled Programming Language
+
+As described in [fluid.md](fluid.md), when a Fluid application program
+runs, it generates a `ProgramDesc` protobuf message as an intermediate
+representation of itself.  The C++ class `Executor` can run this
+protobuf message as an interpreter.  This article describes the Fluid
+compiler.
+
+![](fluid-compiler.png)
+
+## ProgramDesc
+
+Before we go deeper into the idea of compiled language, let us take a
+look at a simple example Fluid application.
+
+```python
+import "fluid"
+
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+
+This program consists of a [block](../concepts/block.md) of three operators --
+`read`, `assign`, and `mult`.  Its `ProgramDesc` message looks like
+the following
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+
+## Transpilers
+
+We can write a transpiler program that takes a `ProgramDesc`, e.g.,
+the above one, and outputs another `ProgramDesc`.  Let us take some
+examples:
+
+1. *Memory optimization transpiler*: We can write a transpiler that
+   inserts some `FreeMemoryOp`s in the above example `ProgramDesc` so
+   to free memory early, before the end of an iteration, so to keep a
+   small memory footprint.
+
+1. *Distributed training transpiler*: We can write a transpiler that
+   converts a`ProgramDesc` into its distributed version of two
+   `ProgramDesc`s -- one for running by the trainer processes and the
+   other for the parameter server.
+
+In the rest of this article, we talk about a special kind of
+transpiler, *Native code generator*, which takes a `ProgramDesc` and
+generates a `.cu` (or `.cc`) file, which could be built by C++
+compilers (gcc, nvcc, icc) into binaries.
+
+## Native Code Generator
+
+For the above example, the native code generator transpiler, say, the
+CUDA code generator, should generate a `main` function:
+
+```c++
+void main() {
+  auto X = fluid_cuda_read(...);
+  auto W = fluid_cuda_create_tensor(...);
+  auto Y = fluid_cuda_mult(X, W);
+}
+```
+
+and the definitions of functions `fluid_cuda_read`,
+`fluid_cuda_create_tensor`, and `fluid_cuda_mult`.  Please be aware
+that each function could just define a C++ instance of an operator and
+run it.  For example
+
+```c++
+paddle::Tensor fluid_cuda_read(...) {
+  paddle::Tensor t;
+  paddle::operator::Read r(&t, ...);
+  r.Run();
+  return t;
+}
+```
+
+For computational operators that have multiple *kernels*, each for a
+specific hardware platform, for example, the `mult` operator, the
+generated code should call its CUDA kernel:
+
+```c++
+paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a,
+                               const paddle::Tensor& b) {
+  paddle::Tensor t;
+  paddle::operator::Mult m(a, b, ...);
+  Mult.Run(cuda_context);
+}
+```
+
+where `cuda_context` could be a global variable of type
+`paddle::CUDADeviceContext`.
+
+## Multi-Block Code Generation
+
+Most Fluid application programs may have more than one blocks.  To
+execute them, we need to trace [scopes](../concepts/scope.md).
diff --git a/doc/fluid/design/motivation/index_cn.rst b/doc/fluid/design/motivation/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7706e73eca644ed6db772fd77da947395313237f
--- /dev/null
+++ b/doc/fluid/design/motivation/index_cn.rst
@@ -0,0 +1,10 @@
+设计动机和目标
+-------------
+
+.. toctree::
+  :maxdepth: 1
+
+  api.md
+  refactorization.md
+  fluid.md
+  fluid_compiler.md
diff --git a/doc/fluid/design/motivation/index_en.rst b/doc/fluid/design/motivation/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10b64b257c604ced6b957d6d6018e8a363f00fac
--- /dev/null
+++ b/doc/fluid/design/motivation/index_en.rst
@@ -0,0 +1,10 @@
+Design Motivations and Goals
+--------------------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  api.md
+  refactorization.md
+  fluid.md
+  fluid_compiler.md
diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad9d0f6d3f3ad9884f108826e8410871fffd51bf
--- /dev/null
+++ b/doc/fluid/design/motivation/refactorization.md
@@ -0,0 +1,275 @@
+# Design Doc: Refactorization Overview
+
+The goals of refactoring include:
+
+1. Making it easy for external contributors to write new elementary computation operations.
+1. Making the codebase clean and readable.
+1. Designing a new computation representation -- a computation graph of operators and variables.
+1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
+
+## Computation Graphs
+
+1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
+
+  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/others/graph.md) for a concrete example.
+
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
+
+1. A graph is composed of *variables* and *operators*.
+
+1. The description of graphs must be serializable/deserializable, so that:
+
+   1. It can be sent to the cloud for distributed execution, and
+   1. It can be sent to clients for mobile or enterprise deployment.
+
+1. The Python program does two things
+
+   1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
+      1. the C++ library `libpaddle.so` for local execution,
+      1. the master process of a distributed training job for training, or
+      1. the server process of a Kubernetes serving job for distributed serving.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L70), according to the protobuf message.
+
+## Description and Realization of Computation Graph
+
+At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
+
+At runtime, the C++ program realizes the graph and runs it.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Representation (protobuf messages)</th>
+<th>Realization (C++ class objects) </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107">VarDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24">Variable</a></td>
+</tr>
+<tr>
+<td>Operation </td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35">OpDesc</a></td>
+<td>
+<a href="https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64">Operator</a></td>
+</tr>
+<tr>
+<td>Block </td>
+<td>BlockDesc </td>
+<td>Block </td>
+
+</tbody>
+</table>
+
+
+The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables similar to a C++/Java program block, or a pair of parentheses(`{` and `}`).
+
+## Compilation and Execution
+
+1. Run a Python program to describe the graph.  In particular, the Python application program does the following:
+
+   1. Create `VarDesc` to represent local/intermediate variables,
+   1. Create operators and set attributes,
+   1. Validate attribute values,
+   1. Infer the type and the shape of variables,
+   1. Plan memory-reuse for variables,
+   1. Generate the backward graph
+   1. Add optimization operators to the computation graph.
+   1. Optionally, split the graph for distributed training.
+
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
+
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/scope.md) for each run of a block,
+      1. realize local variables defined in the BlockDesc message in the new scope,
+      1. a scope is similar to the stack frame in programming languages,
+
+   1. Create an instance of class `Block`, in which,
+      1. realize operators in the BlockDesc message,
+
+   1. Run the Block by calling
+      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
+      1. `Block::Eval(vector<Operator>* targets)` for optimization.
+
+
+## Intermediate Representation (IR)
+
+```text
+Compile Time -> IR -> Runtime
+```
+
+### Benefits of IR
+
+- Optimization
+  ```text
+  Compile Time -> IR -> Optimized IR -> Runtime
+  ```
+- Automatically send partitioned IR to different nodes.
+  - Automatic Data Parallelism
+    ```text
+    Compile Time
+    |-> Single GPU IR
+        |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
+            |-> Node-0 (runs trainer-IR-0)
+            |-> Node-1 (runs trainer-IR-1)
+            |-> Node-2 (runs pserver-IR)
+    ```
+  - Automatic Model Parallelism (planned for future)
+
+---
+
+## Operator/OpWithKernel/OpKernel
+
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_op_with_kern_class_diagram.dot)
+
+---
+
+## Operator
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op.dot)
+
+* `Operator` is the fundamental building block of the user interface.
+    * Operator stores input/output variable names and attributes.
+    * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
+    * Use `Run` to compute the `output` variables from the `input` variables.
+
+---
+
+## OpWithKernel/Kernel
+
+![class_diagram](https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/op_with_kernel.dot)
+
+* `OpWithKernel` inherits `Operator`.
+* `OpWithKernel` contains a Kernel map.
+    * `OpWithKernel::Run` get device's kernel, and invoke `OpKernel::Compute`.
+    * `OpKernelKey` is the map key. Only device place now, but may be data type later.
+
+---
+
+## Why separate Kernel and Operator
+
+* Separate GPU and CPU code.
+    * Make Paddle capable of running without GPU.
+* Make one operator (which is a user interface) and create many implementations.
+    * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel.
+---
+
+## Libraries for Kernel development
+
+* `Eigen::Tensor` contains basic math and element-wise functions.
+    * Note that `Eigen::Tensor` has broadcast implementation.
+    * Limit the number of `tensor.device(dev) = ` in your code.
+* `thrust::transform` and `std::transform`.
+    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
+    * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
+* Hand-writing `GPUKernel` and `CPU` code
+    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
+---
+## Operator Registration
+
+### Why is registration necessary?
+We need a method to build mappings between Op type names and Op classes.
+
+### How is registration implemented?
+Maintaining a map, whose key is the type name and the value is the corresponding Op constructor.
+
+---
+## The Registry Map
+
+### `OpInfoMap`
+
+`op_type(string)` -> `OpInfo`
+
+`OpInfo`:
+
+- **`creator`**: The Op constructor.
+- **`grad_op_type`**: The type of the gradient Op.
+- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
+- **`checker`**: Used to check attributes.
+
+---
+## Related Concepts
+
+### Op_Maker
+It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L37))
+
+### Register Macros
+```cpp
+REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
+REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
+```
+
+---
+## Registration Process
+1. Write an Op class and its gradient Op class, if required.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
+3. Invoke the macro `REGISTER_OP`. This macro will
+	1. Call maker class to complete `proto` and `checker`
+	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
+
+---
+## Backward Module (1/2)
+### Create Backward Operator
+- Mapping from forward Op to backward Op
+![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
+
+---
+## Backward Module (2/2)
+### Build Backward Network
+- **Input**: a graph of forward operators
+- **Output**: a graph of backward operators
+- **Corner cases in construction**
+	- Shared Variables => insert an `Add` operator to combine gradients
+	- No Gradient => insert a `fill_zero_grad` operator
+	- Recursive NetOp => call `Backward` recursively
+	- RNN Op => recursively call `Backward` on stepnet
+	- RNN Op => recursively call `Backward` on stepnet
+
+
+---
+## Scope, Variable, Tensor
+
+* `Tensor` is an n-dimension array with type.
+	* Only dims and data pointers are stored in `Tensor`.
+	* All operations on `Tensor` are written in `Operator` or global functions.
+	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)
+* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
+	* `step_scopes` in RNN is a variable and not a tensor.
+* `Scope` is where variables are stored.
+	* map<string `var name`, Variable>
+	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
+
+---
+## Block (in design)
+### the difference between original RNNOp and Block
+- As an operator is more intuitive than `RNNOp`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Fits the compile-time/ runtime separation design paradigm.
+  - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
+  - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
+
+---
+## Milestone
+- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
+- Model migration
+  - Framework development gives **priority support** to model migration, for example,
+    - the MNIST demo needs a Python interface,
+    - the RNN models require the framework to support `LoDTensor`.
+  - Determine some timelines,
+  - Frequently used Ops need to be migrated first,
+  - Different models can be migrated in parallel.
+- Improve the framework at the same time
+- Accept imperfection, concentrate on solving the specific problem at the right price.
+
+---
+## Control the migration quality
+- Compare the performance of migrated models with old ones.
+- Follow the google C++ style guide.
+- Build the automatic workflow of generating Python/C++ documentations.
+  - The documentation of layers and ops should be written inside the code.
+  - Take the documentation quality into account when submitting pull requests.
+  - Preview the documentations, read and improve them from a user's perspective.
diff --git a/doc/fluid/design/multi_devices/index_cn.rst b/doc/fluid/design/multi_devices/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1f8439e8623e1c1ae9a12c24d08079f0ec3d761f
--- /dev/null
+++ b/doc/fluid/design/multi_devices/index_cn.rst
@@ -0,0 +1,9 @@
+多设备支持
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  operator_kernel_type.md
+  kernel_selection.md
+  kernel_hint_design.md
diff --git a/doc/fluid/design/multi_devices/index_en.rst b/doc/fluid/design/multi_devices/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..819e9c5d77b2abf8da0e2ce6f494ea5174c1d0a2
--- /dev/null
+++ b/doc/fluid/design/multi_devices/index_en.rst
@@ -0,0 +1,9 @@
+Multi-Device Support
+----------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  operator_kernel_type.md
+  kernel_selection.md
+  kernel_hint_design.md
diff --git a/doc/fluid/design/multi_devices/kernel_hint_design.md b/doc/fluid/design/multi_devices/kernel_hint_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..6edc14ca73b1abf824981b59511a9aca4e0f3b47
--- /dev/null
+++ b/doc/fluid/design/multi_devices/kernel_hint_design.md
@@ -0,0 +1,59 @@
+# Kernel Hint Design
+
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+ `place_` `data_type_` and `layout_` can be got from the input tensors of the operator, `GetActualKernelType(inputs)` use inputs to infer the proper kernel key that fit the incoming data, but users can not directly configure it.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md) also provides a virtual method `GetExpectedKernelType` that user can overload and use to choose the KernelType they want to use.
+
+So we should send the information user defined in proto to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is, how should we define and send the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing, let the user add the information they want to operator‘s attribute and get them inside `GetExpectedKernelType`, this can work properly. But there is a little problem that users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, `cpu_kernel` to choose CPU kernel, and `use_cudnn`, `force_cudnn`, `cudnn_kernel` to choose CUDNN kernel.
+
+2. Pre-define all the needed option and use a single attr key such as `kernel_hint` for the user, this is not so flexible if the user wants to define some more kind of hint.
+
+### Final choice
+To provide enough flexibility while avoiding confusion definition, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, `use_mkldnn` for a user to choose.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr<bool>(kForceCPU)) {
+    return KernelType(CPUPlace, ...)
+  } else {
+    ...
+  }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=false):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md
new file mode 100644
index 0000000000000000000000000000000000000000..4d2aab87b8cf30d03075e96cc4c67070efaf963a
--- /dev/null
+++ b/doc/fluid/design/multi_devices/kernel_selection.md
@@ -0,0 +1,101 @@
+# Kernel Selection
+
+## Background
+Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold.
+
+The `OpKernelType ` is as follows:
+
+```cpp
+struct OpKernelType {
+  Place place_;
+  DataType data_type_;
+  DataLayout data_layout_;
+  LibraryType library_type_;
+};
+```
+
+- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace.
+
+- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types. However, it will be a major `data_type`. For example, the `cross_entropy` takes `int64` as it label, and `double`/`float` as its input logit and output cost. The major `data_type` of `cross_entropy` is `float` or `double`.
+
+- The `data_layout_ ` is useful for some computational library. One example is that MKLDNN uses many kinds of layout, such as `nChw8c`. Each kind of layout will invoke the different kernel.
+
+- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
+
+## Problem
+
+We register a kernel for every operator and every kernel type ideally. However, it is impracticable for the following situations.
+
+1. Some operators, like CRF, are complicated and inefficient to be implemented on GPU. The CRF operator will only have a CPU kernel.
+2. Some operators will take too many memory. It is better to force them into CPU. However, the rest of operators in this neural network will be performed on GPU, i.e., model parallel problem.
+3. Some layout and place are particular. One example is that MKLDNN uses `nChw8` and there is no other library uses `nChw8c`.
+
+Take one situation to give a detailed explanation, if we have two Operators: OP1 and OP2, OP1 has one output `op1_to_op2`, and `op1_to_op2` is the input of OP2.
+
+If OP1 and OP2 run on the same place(for example CPUPlace), then `op1_2_op2` can be used directly by OP2.
+
+```
+OP1(CPUPlace)
+     |
+ op1_2_op2
+     |
+OP2(CPUPlace)
+```
+
+If OP1 and OP2 run one different place, then OP2 cannot `use op1_2_op2` directly.
+
+Problems under these situations are similar. We can formalize this problem as follow.
+
+We register kernels with types $KT = \{kt_1, kt_2, kt_3, ...\}$ for one operator. The inputs of this operator should be run on kernel type $kt_{?}$, which the $kt_{?} \notin KT$. How to cast the input of this operator from $kt_{?}$ to any of kernel type in $KT$.
+
+## Solution: data transform
+
+It is clear that transforming inputs of an operator to adapt another kernel type is not related to the particular operator. So we should register these transformation methods as global methods.
+
+We can infer kernel type for each input of an operator. We let this kernel type as `actual kernel type for var`, which means this kernel type is the kernel type that can process this input variable.
+
+We can get a kernel type by 1) The configuration of operator description. (Users may want to force use `MKL` for `conv` operator). 2) The place of the current executor. (Executor is running on GPU). This kernel type is what we expect the operator will be performed on. We let this kernel type as `expect kernel type`.
+
+We transform the input data from `actual` to `expect` if the actual kernel type is not as same as expect kernel type.
+
+The algorithm is described as following
+
+```cpp
+void OperatorWithKernel::Run(
+        const Scope& scope,
+        const platform::Place& place) const {
+  ExecutionContext ctx(...);
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+
+  Scope& new_scope = scope.NewScope();
+
+  for (auto& var_name : this->Inputs()) {
+    auto* tensor_in = GetTensor(var_name);
+    auto kernel_type_for_var = this->GetKernelTypeForVar(...);
+    if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
+      auto* trans_var = new_scope.Var(var_name);
+      auto* out = TransformData(expected_kernel_key,
+                                kernel_type_for_var,
+                                *tensor_in);
+      SetTensorToVariable(...);
+    }
+  }
+
+  auto kernel = kernels.find(expected_kernel_key);
+  kernel->Compute(ExecutionContext(...));
+}
+```
+
+then the actual process for the multi-device above will be:
+
+```
+OP1(CPUPlace)
+     |
+op1_2_op2(on CPU)
+     |
+[transform](from CPU to GPU)
+     |
+op1_2_op2(on GPU)
+     |
+OP2(CUDAPlace)
+```
diff --git a/doc/fluid/design/multi_devices/operator_kernel_type.md b/doc/fluid/design/multi_devices/operator_kernel_type.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e391bd62b4f4e123a9a6f35b7adf5726f205635
--- /dev/null
+++ b/doc/fluid/design/multi_devices/operator_kernel_type.md
@@ -0,0 +1,91 @@
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  proto::DataType data_type_;
+};
+```
+For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L348-L374) in github.
+
+It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.
+
+We often implement a kernel of an operator with some computing library on certain device(place). Please note that computing library and device do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support different devices.
+
+For example, Eigen library supports Nvidia GPU/AMD GPU/CPU and MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
+
+Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layout of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
+
+## Solution
+
+There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  platform::Library library_;
+  proto::DataType data_type_;
+  framework::Layout layout_;
+};
+```
+
+The details are as follows:
+
+### Place
+
+`Place` is defined as:
+
+```cpp
+typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
+```
+
+`Place` represents the device memory where data is located.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as a enum variable:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CDUADeviceContext` contains an Eigen library handle and a cuBLAS handle.
+
+If we want to support new library, a new enumerator need to be added to `Library` and a corresponding new `LibraryDeviceContext` need to be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
+
+Different layout leads to different implementation of the operator kernel. There are mainly 4 principles we have to follow to support layout in our Fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually a enum variable. If Fluid is built with MKLDNN, then the memory format in MKLDNN will also be added into this enum variable.
+
+- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout for generating data. Of course, we can have some default layout, like NCHW.
+
+- The inference of Layout is at run-time, not at compile-time.
+
+- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://intel.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to  register kernels for MKLDNN operators.
+
+`Layout` is also defined as a enum variable:
+
+```cpp
+enum Layout {
+  kNCHW,
+  kNHWC,
+#ifdef PADDLE_WITH_MKLDNN
+  knChw8c
+  ...
+#endif
+};
+```
diff --git a/doc/fluid/design/network/deep_speech_2.md b/doc/fluid/design/network/deep_speech_2.md
new file mode 100644
index 0000000000000000000000000000000000000000..f32a5b7e8a4d820319a666dab4c3129360e2c924
--- /dev/null
+++ b/doc/fluid/design/network/deep_speech_2.md
@@ -0,0 +1,235 @@
+# DeepSpeech2 on PaddlePaddle: Design Doc
+
+We are planning to build Deep Speech 2 (DS2) \[[1](#references)\], a powerful Automatic Speech Recognition (ASR) engine,  on PaddlePaddle. For the first-stage plan, we have the following short-term goals:
+
+- Release a basic distributed implementation of DS2 on PaddlePaddle.
+- Contribute a chapter of Deep Speech to PaddlePaddle Book.
+
+Intensive system optimization and low-latency inference library (details in \[[1](#references)\]) are not yet covered in this first-stage plan.
+
+## Table of Contents
+
+- [Tasks](#tasks)
+- [Task Dependency](#task-dependency)
+- [Design Details](#design-details)
+    - [Overview](#overview)
+    - [Row Convolution](#row-convolution)
+    - [Beam Search With CTC and LM](#beam-search-with-ctc-and-lm)
+- [Future Work](#future-work)
+- [References](#references)
+
+## Tasks
+
+We roughly break down the project into 14 tasks:
+
+1. Develop an **audio data provider**:
+	- Json filelist generator.
+	- Audio file format transformer.
+	- Spectrogram feature extraction, power normalization etc.
+	- Batch data reader with SortaGrad.
+	- Data augmentation (optional).
+	- Prepare (one or more) public English data sets & baseline.
+2. Create a **simplified DS2 model configuration**:
+   - With only fixed-length (by padding) audio sequences (otherwise need *Task 3*).
+	- With only bidirectional-GRU (otherwise need *Task 4*).
+	- With only greedy decoder (otherwise need *Task 5, 6*).
+3. Develop to support **variable-shaped** dense-vector (image) batches of input data.
+   - Update `DenseScanner` in `dataprovider_converter.py`, etc.
+4. Develop a new **lookahead-row-convolution layer** (See \[[1](#references)\] for details):
+   - Lookahead convolution windows.
+   - Within-row convolution, without kernels shared across rows.
+5. Build KenLM **language model** (5-gram) for beam search decoder:
+   - Use KenLM toolkit.
+   - Prepare the corpus & train the model.
+   - Create infererence interfaces (for Task 6).
+6. Develop a **beam search decoder** with CTC + LM + WORDCOUNT:
+   - Beam search with CTC.
+   - Beam search with external custom scorer (e.g. LM).
+   - Try to design a more general beam search interface.
+7. Develop a **Word Error Rate evaluator**:
+   - update `ctc_error_evaluator`(CER) to support WER.
+8. Prepare internal dataset for Mandarin (optional):
+    - Dataset, baseline, evaluation details.
+    - Particular data preprocessing for Mandarin.
+    - Might need cooperating with the Speech Department.
+9. Create **standard DS2 model configuration**:
+   - With variable-length audio sequences (need *Task 3*).
+	- With unidirectional-GRU + row-convolution (need *Task 4*).
+	- With CTC-LM beam search decoder (need *Task 5, 6*).
+10. Make it run perfectly on **clusters**.
+11. Experiments and **benchmarking** (for accuracy, not efficiency):
+    - With public English dataset.
+    - With internal (Baidu) Mandarin dataset (optional).
+12. Time **profiling** and optimization.
+13. Prepare **docs**.
+14. Prepare PaddlePaddle **Book** chapter with a simplified version.
+
+## Task Dependency
+
+Tasks parallelizable within phases:
+
+<table>
+<thead>
+<tr>
+<th>Roadmap</th>
+<th>Description</th>
+<th> Parallelizable Tasks</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Phase I </td>
+<td>Simplified model & components </td>
+<td>Task 1 ~ Task 8</td>
+</tr>
+<tr>
+<td>Phase II </td>
+<td> Standard model & benchmarking & profiling</td>
+<td>Task 9 ~ Task 12 </td>
+</tr>
+<tr>
+<td>Phase III </td>
+<td> Documentations</td>
+<td> Task13 ~ Task14 </td>
+</tr>
+</tbody>
+</table>
+
+
+Issue for each task will be created later. Contributions, discussions and comments are all highly appreciated and welcomed!
+
+## Design Details
+
+### Overview
+
+Traditional **ASR** (Automatic Speech Recognition) pipelines require great human efforts devoted to elaborately tuning multiple hand-engineered components (e.g. audio feature design, accoustic model, pronuncation model and language model etc.). **Deep Speech 2** (**DS2**) \[[1](#references)\], however, trains such ASR models in an end-to-end manner, replacing most intermediate modules with only a single deep network architecture. With scaling up both the data and model sizes, DS2 achieves a very significant performance boost.
+
+Please read Deep Speech 2 \[[1](#references),[2](#references)\] paper for more background knowledge.
+
+The classical DS2 network contains 15 layers (from bottom to top):
+
+- **Two** data layers (audio spectrogram, transcription text)
+- **Three** 2D convolution layers
+- **Seven** uni-directional simple-RNN layers
+- **One** lookahead row convolution layers
+- **One** fully-connected layers
+- **One** CTC-loss layer
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ds2_network.png" width=350><br/>
+Figure 1. Archetecture of Deep Speech 2 Network.
+</div>
+
+We don't have to persist on this 2-3-7-1-1-1 depth \[[2](#references)\]. Similar networks with different depths might also work well. As in \[[1](#references)\], authors use a different depth (e.g. 2-2-3-1-1-1) for final experiments.
+
+Key ingredients about the layers:
+
+- **Data Layers**:
+   - Frame sequences data of audio **spectrogram** (with FFT).
+   - Token sequences data of **transcription** text (labels).
+   - These two type of sequences do not have the same lengthes, thus a CTC-loss layer is required.
+- **2D Convolution Layers**:
+   - Not only temporal convolution, but also **frequency convolution**. Like a 2D image convolution, but with a variable dimension (i.e. temporal dimension).
+   - With striding for only the first convlution layer.
+   - No pooling for all convolution layers.
+- **Uni-directional RNNs**
+	- Uni-directional + row convolution: for low-latency inference.
+	- Bi-direcitional + without row convolution: if we don't care about the inference latency.
+- **Row convolution**:
+	- For looking only a few steps ahead into the feature, instead of looking into a whole sequence in bi-directional RNNs.
+	- Not nessesary if with bi-direcitional RNNs.
+	- "**Row**" means convolutions are done within each frequency dimension (row), and no convolution kernels shared across.
+- **Batch Normalization Layers**:
+   - Added to all above layers (except for data and loss layer).
+   - Sequence-wise normalization for RNNs: BatchNorm only performed on input-state projection and not state-state projection, for efficiency consideration.
+
+<table>
+<thead>
+<tr>
+<th>Required Components</th>
+<th> PaddlePaddle Support</th>
+<th> Need to Develop</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>Data Layer I (Spectrogram) </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 3)</td>
+</tr>
+<tr>
+<td>Data Layer II (Transcription)  </td>
+<td> paddle.data_type.integer_value_sequence</td>
+<td> - </td>
+</tr>
+<tr>
+<td>2D Convolution Layer </td>
+<td> paddle.layer.image_conv_layer</td>
+<td> - </td>
+</tr>
+<tr>
+<td>DataType Converter (vec2seq)</td>
+<td> paddle.layer.block_expand</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Bi-/Uni-directional RNNs </td>
+<td>paddle.layer.recurrent_group</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Row Convolution Layer </td>
+<td>Not supported yet.</td>
+<td>TBD (Task 4)</td>
+</tr>
+<tr>
+<td>CTC-loss Layer </td>
+<td>paddle.layer.warp_ctc</td>
+<td> - </td>
+</tr>
+<tr>
+<td>Batch Normalization Layer </td>
+<td>paddle.layer.batch_norm</td>
+<td> - </td>
+</tr>
+<tr>
+<td>CTC-Beam search </td>
+<td>Not supported yet.</td>
+<td> TBD (Task 6) </td>
+</tr>
+</tbody>
+</table>
+
+
+### Row Convolution
+
+TODO by Assignees
+
+### Beam Search with CTC and LM
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/beam_search.png" width=600><br/>
+Figure 2. Algorithm for CTC Beam Search Decoder.
+</div>
+
+- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
+   - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
+   - 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
+- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
+- Such external scorer consists of language model, word count or any other custom scorers.
+- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
+- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
+
+
+## Future Work
+
+- Efficiency Improvement
+- Accuracy Improvement
+- Low-latency Inference Library
+- Large-scale benchmarking
+
+## References
+
+1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
+2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). 	arXiv:1512.02595.
+3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg
similarity index 100%
rename from doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
rename to doc/fluid/design/network/images/LOD-and-shape-changes-during-decoding.jpg
diff --git a/doc/design/speech/image/beam_search.png b/doc/fluid/design/network/images/beam_search.png
similarity index 100%
rename from doc/design/speech/image/beam_search.png
rename to doc/fluid/design/network/images/beam_search.png
diff --git a/doc/design/speech/image/ds2_network.png b/doc/fluid/design/network/images/ds2_network.png
similarity index 100%
rename from doc/design/speech/image/ds2_network.png
rename to doc/fluid/design/network/images/ds2_network.png
diff --git a/doc/fluid/design/network/index_cn.rst b/doc/fluid/design/network/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3557d55fe4dbae1f712e0760ca15111ec6f6792d
--- /dev/null
+++ b/doc/fluid/design/network/index_cn.rst
@@ -0,0 +1,7 @@
+复杂网络设计
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  sequence_decoder.md
diff --git a/doc/fluid/design/network/index_en.rst b/doc/fluid/design/network/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..73a7137236bdf0548d35721609351d6deca3013b
--- /dev/null
+++ b/doc/fluid/design/network/index_en.rst
@@ -0,0 +1,7 @@
+Complex Network Design
+------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  sequence_decoder.md
diff --git a/doc/fluid/design/network/sequence_decoder.md b/doc/fluid/design/network/sequence_decoder.md
new file mode 100644
index 0000000000000000000000000000000000000000..b95773c50ca0dcbd1b93529332e035d4de90faa8
--- /dev/null
+++ b/doc/fluid/design/network/sequence_decoder.md
@@ -0,0 +1,229 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and visual captioning,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
+
+This documentation describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
+
+In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users.
+
+There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users.
+
+During the refactoring of PaddlePaddle, some new concepts are proposed such as:  [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** .
+
+For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`;
+the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
+
+## Changing LoD's absolute offset to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level.
+
+The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
+let's call this format the **absolute-offset LoD** for clarity.
+
+The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
+
+There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix.
+
+So let's introduce another format of LoD,
+it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
+
+For example, to represent the same sequences of the above data
+
+```python
+[[0, 3, 6]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences,
+their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
+
+The second level is the same with the relative offset example because the lower level is a tensor.
+It is easy to find out the second sequence in the first-level LoD has two empty sequences.
+
+The following examples are based on relative-offset LoD.
+
+## Usage in a simple machine translation model
+Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+    name='source_language_word',
+    type=pd.data.integer_value_sequence(source_dict_dim))
+src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+    encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+    decoder = pd.while_loop()
+    with decoder.step():
+        decoder_mem = decoder.memory(init=encoder_ctx)  # mark the memory
+        generated_ids = decoder.memory() # TODO init to batch_size <s>s
+        generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
+
+        target_word = pd.lookup(trg_embedding, gendrated_ids)
+        # expand encoder_ctx's batch to fit target_word's lod
+        # for example
+        # decoder_mem.lod is
+        # [[0 1 3],
+        #  [0 1 3 6]]
+        # its tensor content is [a1 a2 a3 a4 a5]
+        # which means there are 2 sentences to translate
+        #   - the first sentence has 1 translation prefixes, the offsets are [0, 1)
+        #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+        # the target_word.lod is
+        # [[0, 1, 6]
+        #  [0, 2, 4, 7, 9 12]]
+        # which means 2 sentences to translate, each has 1 and 5 prefixes
+        # the first prefix has 2 candidates
+        # the following has 2, 3, 2, 3 candidates
+        # the encoder_ctx_expanded's content will be
+        # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+        decoder_input = pd.fc(
+            act=pd.activation.Linear(),
+            input=[target_word, encoder_ctx_expanded],
+            size=3 * decoder_dim)
+        gru_out, cur_mem = pd.gru_step(
+            decoder_input, mem=decoder_mem, size=decoder_dim)
+        scores = pd.fc(
+            gru_out,
+            size=trg_dic_size,
+            bias=None,
+            act=pd.activation.Softmax())
+        # K is an config
+        topk_scores, topk_ids = pd.top_k(scores, K)
+        topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+        selected_ids, selected_generation_scores = decoder.beam_search(
+            topk_ids, topk_generated_scores)
+
+        # update the states
+        decoder_mem.update(cur_mem)  # tells how to update state
+        generated_ids.update(selected_ids)
+        generated_scores.update(selected_generation_scores)
+
+        decoder.output(selected_ids)
+        decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
+returns the result of the beam search algorithm.
+
+In this way, users can customize anything on the input or output of beam search, for example:
+
+1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
+2. Remove some specific candidate in `selected_ids`.
+3. Get the final `translation_ids`, remove the translation sequence in it.
+
+The implementation of sequence decoder can reuse the C++ class:  [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the python syntax is quite similar to that of an  [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`:
+
+- The first level represents `batch_size` of (source) sentences.
+- The second level represents the candidate ID sets for translation prefix.
+
+For example, 3 source sentences to translate, and has 2, 3, 1 candidates.
+
+Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state:
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state is stored in `encoder_ctx_expanded`:
+
+* LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
+* the content is
+  - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
+  - a2 a2
+  - b1 b1 b1
+  - b2
+  - b3 b3
+  - None (c1 has 0 candidates, so c1 is dropped)
+
+The benefit from the relative offset LoD is that the empty candidate set can be represented naturally.
+
+The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is:
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences.
+
+Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
+
+Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
+
+## LoD and shape changes during decoding
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg"/>
+</p>
+
+According to the image above, the only phase that changes the LoD is beam search.
+
+## Beam search design
+The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
+
+1. `topk_ids`, the top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`
+3. `generated_scores`, the score of the prefixes.
+
+All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
+
+It will return three variables:
+
+1. `selected_ids`, the final candidate beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
+
+The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors.
+It needs some extensions to support the packing or unpacking an array of `LoDTensors`.
diff --git a/doc/fluid/design/onnx/images/project_structure.png b/doc/fluid/design/onnx/images/project_structure.png
new file mode 100644
index 0000000000000000000000000000000000000000..ab1c2ff23cfff586516876684348bb15bd2084fc
Binary files /dev/null and b/doc/fluid/design/onnx/images/project_structure.png differ
diff --git a/doc/fluid/design/onnx/onnx_convertor.md b/doc/fluid/design/onnx/onnx_convertor.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc1665d7c33eb54cb63e5306a439c1ca67016d1e
--- /dev/null
+++ b/doc/fluid/design/onnx/onnx_convertor.md
@@ -0,0 +1,131 @@
+# Background
+
+[ONNX (Open Neural Network Exchange)](https://github.com/onnx/onnx) bridges different deep learning frameworks by providing an open source graph format for models. The models trained in other frameworks can be converted into the ONNX format to execute inference by utilizing the built-in operators in ONNX - this is called a **frontend**. With the inverse conversion (called a **backend**), different frameworks can share any models supported by ONNX in principle. Now most mainstream frameworks have joined the ONNX community, e.g. Caffe2, PyTorch, and MXNet etc. And there is a momentum driving more and more vendors to begin supporting ONNX or even choose ONNX as the only machine learning runtime in their devices.
+
+Therefore, it is necessary to enable the conversion between PaddlePaddle and ONNX. This design doc is aimed at implementing a convertor, mainly for converting between **Fluid** models and ONNX (it is very likely that we may support older v2 models in the future). A complete convertor should be bidirectional - with a frontend AND a backend, but considering the importance, the we will start with the frontend i.e. Fluid models to ONNX models.
+
+
+# How it works
+
+ONNX has a [working list of operators](https://github.com/onnx/onnx/blob/master/docs/Operators.md) which is versioned.
+
+When prioritizing implementation of a frontend over a backend, choice of coverage of Fluid -> ONNX operators comes down to choices of models to be supported (see section `Supported models`). Eventually, this will allow us to reach a really-wide coverage of all operators.
+
+Here are a few major considerations when it comes to converting models:
+
+- **Op-level conversion**: How to map the inputs, attributes, and outputs of each Paddle operator to those of the ONNX operator. In several cases, these require transformations. For each direction (frontend vs. backend), a different conversion mapping is needed.
+- **Parameters (weights) initialization**: Setting initial parameters on different nodes.
+- **Tensor data type mapping** (Note: Some ONNX data types are not supported in Fluid)
+- **Network representation adaption**: Fluid `ProgramDesc` include nested blocks. Since ONNX is free of nesting, the `ProgramDesc` ops need to be traversed to only include ops from the global scope in the root block. The variables used as inputs and outputs should also be in this scope.
+- **Model validation**: There are two kinds of validations that are necessary:
+   1. We need to ensure that the inference outputs of the ops in run inside a model are the same as those when running the ONNX converted ops through an alternative ONNX backend.
+   2. Checking to see if the generated nodes on the graph are validated by the internal ONNX checkers.
+- **Versioning**: ONNX versions its op listing over versions. In fact, it has versioning on 3 different levels: ops, graphs, and ONNX models. This requires that we are conscious about versioning the convertor and updating tests and op convertor logic for each release. It also implies that we release pre-trained ONNX models upon each version release.
+
+One thing that makes this conversion more feasible in Fluid's case is the use of a static IR - the `ProgramDesc` - as opposed to a dynamic graph, as created in the cases of frameworks like PyTorch.
+
+
+# Project structure
+
+<p align="center">
+<img src="./images/project_structure.png"/>
+</p>
+
+The project contains four important parts:
+
+* **fluid**: The directory that contains wrappers for fluid related APIs. Fluid has provided some low-level APIs to parse or generate the inference model. However, directly using these low-level APIs makes the code tediously long. This module wraps low-level APIs to provide simplified interfaces.
+
+* **onnx**: This is a Python package provided by ONNX containing helpers for creating nodes, graphs, and eventually binary protobuf models with initializer parameters.
+
+* **onnx_fluid**: Contains two-way mapping (Fluid -> ONNX ops and ONNX -> Fluid ops). Called from `convert.py`, the program uses this mapping along with modifier functions to construct ONNX nodes with the help of ONNX's `make_node` helper. It also contains mapping between datatypes and tensor deprecation / amplification logic.
+
+* **convert.py**: The interface exposed to users. This will traverse the global program blocks/variables and construct the write-able model.
+
+
+# Usage
+The converter should be designed to very easy-to-use. Bidirectional conversion between a Fluid inference model and an ONNX binary model will be supported. Model validation will also provided to verify the correctness of converted model.
+
+* Convert Fluid inference model to ONNX binary model
+
+    ```
+    python convert.py --fluid_model <fluid inference model> --onnx_model <ONNX model> validate True
+    ```
+
+* Validate the converted model
+
+    ```
+    python validate.py --fluid_model <fluid inference model> --onnx_model <ONNX model>
+    ```
+
+The conversion and model validation will be completed consecutively, finally output a readable model structure description. And for the converse conversion, users only need to exchange the input and output.
+
+
+# Challenges and mitigation
+
+## Cycles
+
+Cycles are unsupported in ONNX. In Paddle, the `while` op is the most prominent example of a cycle.
+
+*Resolution*: We won't support models with `while`s which can't be substituted until ONNX adds support for such ops.
+
+## Sequences
+
+Sequence processing operators like `sequence_expand`, `sequence_reshape`, `sequence_concat`, and `sequence_pool` are not supported by ONNX as well, because they do not support non-padded datatypes like LoDTensors.
+
+*Resolution*: Since the runtimes using our ONNX exported graphs won't be using LoDTensors in the first place, such sequence operators should be mapped to ONNX ops that will do the necessary transposing ops with the knowledge of the padding and shape of the Tensors.
+
+## Ops that can't easily be mapped
+
+There are ops that just aren't possible to map today:
+
+**Control flow operators**
+
+Paddle supports control flow ops like `If/Else` and `Switch` (if we ignore the CSP operations like `select` for now). ONNX has `If` support in the experimental phase.
+
+*Resolution*: Map Paddle's `If/Else` to ONNX's `If`, but ignore other control flow operators until ONNX brings support for them.
+
+
+**Non-existent in Fluid**
+
+There are several ONNX operators that are not available in Fluid today, e.g. `InstanceNormalization`, `RandomUniform`, `Unsqueeze`, etc.
+
+*Resolution*: For the initial phase, we can choose to not support ops that our models don't care for and are subsequently not available in Fluid. However, for ops that we think might be necessary for Fluid users also, we must implement them on our side and support the ONNX conversion to them. This list is TBD.
+
+
+**Concurrency**
+
+ONNX does not have any considerations for concurrency right now.
+
+*Resolution*: There are two ways to approach this:
+
+a. We choose to not support concurrent models.
+b. We only support `go_op`s (basically threads) shallowly. This could mean that we enqueue `go_op` ops prior to gradient calculations OR even prior to the entire graph, and that's it - since `go_op`s do not have support for backprop anyways. One of the core target use cases of `go_op`: batch reading - can be handled through this approach.
+
+
+**Overloaded in Fluid**
+
+There are ops in ONNX whose job can't be accomplished by a single corresponding Paddle operator (e.g. ), but a collection of operators.
+
+*Resolution*: Chain multiple Paddle operators.
+
+
+## Lack of LoDTensors
+
+As stated above, ONNX only supports simple Tensor values.
+
+*Resolution*: Deprecate to plain old numpy-able tensors.
+
+
+## Reconstruction from deprecated ONNX ops
+
+For higher-level Fluid ops, such as a few offered by the `nn` layer that do not have direct corresponding mappings but can be converted to ONNX by chaining a series of ops without cycles, it would be useful to map them back to the higher-level Fluid ops once converted back from the deprecated ONNX graphs.
+
+*Resolution*: Graphs that have the deprecation from Paddle -> ONNX. When converting back from ONNX, if we encounter the identical graphs by doing a forward search, we can replace the subgraphs with the matching ONNX op.
+
+
+# Supported models
+
+As mentioned above, potential risks may come from the conversion of sequence-related models, including the LodTensor, ```if/else``` and ```while``` operator. So a good choice is to focus on some important feedforward models first, then implement some simple recurrent models.
+
+- Feedforward models: common models selected in PaddleBook, e.g. VGG, ResNet and some other models proposed by application teams.
+- Recurrent models: language model, stacked LSTMs etc.
diff --git a/doc/design/auto_gradient_check.md b/doc/fluid/design/others/auto_gradient_check.md
similarity index 100%
rename from doc/design/auto_gradient_check.md
rename to doc/fluid/design/others/auto_gradient_check.md
diff --git a/doc/design/dcgan.png b/doc/fluid/design/others/dcgan.png
similarity index 100%
rename from doc/design/dcgan.png
rename to doc/fluid/design/others/dcgan.png
diff --git a/doc/fluid/design/others/gan_api.md b/doc/fluid/design/others/gan_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..7167470088766985fa5ad31657410309330fd725
--- /dev/null
+++ b/doc/fluid/design/others/gan_api.md
@@ -0,0 +1,253 @@
+# Design for GAN
+
+GAN (General Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and widely used in many areas.
+
+It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
+
+In our GAN design, we wrap it as a user-friendly easily customized python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/test.dot.png" width = "35%" align="center"/><br/>
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
+</p>
+
+The operators, layers and functions required/optional to build a GAN demo is summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/dcgan.png" width = "90%" align="center"/><br/>
+Figure 2. Photo borrowed from the original DC-GAN paper.
+</p>
+
+## The Conditional-GAN might be a class.
+This design we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains following data structure:
+
+- DCGAN(object): which contains everything required to build a GAN model. It provides following member functions methods as API:
+
+- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well.
+
+- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
+Returns a generated image.
+
+- discriminator(image):
+Given an image, decide if it is from a real source or a fake one.
+Returns a 0/1 binary label.
+
+- build_model(self):
+build the whole GAN model, define training loss for both generator and discrimator.
+
+## Discussion on Engine Functions required to build GAN
+- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can'be be trained correctly)
+- Different optimizers responsible for optimizing different loss.
+
+To be more detailed, we introduce our design of DCGAN as following:
+
+### Class member Function: Initializer
+- Set up hyper-parameters, including condtional dimension, noise dimension, batch size and so forth.
+- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
+```python
+class DCGAN(object):
+  def __init__(self, y_dim=None):
+
+    # hyper parameters  
+    self.y_dim = y_dim # conditional gan or not
+    self.batch_size = 100
+    self.z_dim = z_dim # input noise dimension
+
+    # define parameters of discriminators
+    self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.D_b2 = pd.Variable(np.zeros(128))
+    self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
+
+    # define parameters of generators
+    self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.G_b2 = pd.Variable(np.zeros(128))
+    self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
+```
+
+### Class member Function: Generator
+- Given a noisy input z, returns a fake image.
+- Concatenation, batch-norm, FC operations required;
+- Deconv layer required, which is missing now...
+```python
+class DCGAN(object):
+  def generator(self, z, y = None):
+    # input z: the random noise
+    # input y: input data label (optional)
+    # output G_im: generated fake images
+
+    if not self.y_dim:
+      z = pd.layer.concat(1, [z, y])
+
+    G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
+    G_h0_bn = pd.layer.batch_norm(G_h0)
+    G_h0_relu = pd.layer.relu(G_h0_bn)
+
+    G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
+    G_h1_bn = pd.layer.batch_norm(G_h1)
+    G_h1_relu = pd.layer.relu(G_h1_bn)
+
+    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2))
+    G_im = pd.layer.tanh(G_im)
+    return G_im
+```
+
+### Class member function: Discriminator
+- Given a noisy input z, returns a fake image.
+- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required;
+```python
+class DCGAN(object):
+  def discriminator(self, image):
+    # input image: either generated images or real ones
+    # output D_h2: binary logit of the label
+
+    D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
+    D_h0_bn = pd.layer.batchnorm(h0)
+    D_h0_relu = pd.layer.lrelu(h0_bn)
+
+    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
+    D_h1_bn = pd.layer.batchnorm(D_h1)
+    D_h1_relu = pd.layer.lrelu(D_h1_bn)
+
+    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
+    return D_h2
+```
+
+### Class member function: Build the model
+- Define data readers as placeholders to hold the data;
+- Build generator and discriminators;
+- Define two training losses for discriminator and generator, respectively.
+If we have execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self):
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_t = self.discriminator(self.images)
+        # generated fake images
+        self.sampled = self.sampler(self.z, self.y)
+        self.D_f = self.discriminator(self.G)
+    else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_t = self.discriminator(self.images)
+        # generate fake images
+        self.sampled = self.sampler(self.z)
+        self.D_f = self.discriminator(self.images)
+
+    # step 2: define the two losses
+    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+    self.d_loss = self.d_loss_real + self.d_loss_fake
+
+    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
+```
+
+If we do not have dependency engine but blocks, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self, default_block):
+    # input data in the default block
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    with pd.default_block().g_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_g = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_g = self.discriminator(self.G, self.y)
+      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
+
+    with pd.default_block().d_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.D_t = self.discriminator(self.images, self.y)
+        self.D_f = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.D_t = self.discriminator(self.images)
+        self.D_f = self.discriminator(self.G)
+
+      # step 2: define the two losses
+      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+      self.d_loss = self.d_loss_real + self.d_loss_fake
+```
+Some small confusion and problems with this design:
+- D\_g and D\_f are actually the same thing, but has to be written twice; i.e., if we want to run two sub-graphs conceptually, the same codes have to be written twice if they are shared by the graph.
+- Requires ability to create a block anytime, rather than in if-else or rnn only;
+
+## Main function for the demo:
+Generally, the user of GAN just need to the following things:
+- Define an object as DCGAN class;
+- Build the DCGAN model;
+- Specify two optimizers for two different losses with respect to different parameters.
+```python
+# pd for short, should be more concise.
+from paddle.v2 as pd
+import numpy as np
+import logging
+
+if __name__ == "__main__":
+    # dcgan class in the default graph/block
+    # if we use dependency engine as tensorflow
+    # the codes, will be slightly different like:
+    # dcgan = DCGAN()
+    # dcgan.build_model()
+    with pd.block() as def_block:
+      dcgan = DCGAN()
+      dcgan.build_model(def_block)
+
+    # load mnist data
+    data_X, data_y = self.load_mnist()
+
+    # Two subgraphs required!!!
+    with pd.block().d_block():
+      d_optim = pd.train.Adam(lr = .001, beta= .1)
+      d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
+    with pd.block.g_block():
+      g_optim = pd.train.Adam(lr = .001, beta= .1)
+      g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G)
+
+    # executor
+    sess = pd.executor()
+
+    # training
+    for epoch in xrange(10000):
+      for batch_id in range(N / batch_size):
+        idx = ...
+        # sample a batch
+        batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
+        # sample z
+        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+
+        if batch_id % 2 == 0:
+          sess.run(d_step,
+                   feed_dict = {dcgan.images: batch_im,
+                                dcgan.y: batch_label,
+                                dcgan.z: batch_z})
+        else:
+          sess.run(g_step,
+                   feed_dict = {dcgan.z: batch_z})
+```
+
+# More thinking about dependency engine v.s. block design:
+- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
+- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage?
diff --git a/doc/design/graph.md b/doc/fluid/design/others/graph.md
similarity index 100%
rename from doc/design/graph.md
rename to doc/fluid/design/others/graph.md
diff --git a/doc/design/graph_survey.md b/doc/fluid/design/others/graph_survey.md
similarity index 100%
rename from doc/design/graph_survey.md
rename to doc/fluid/design/others/graph_survey.md
diff --git a/doc/design/images/graph_construction_example.bash b/doc/fluid/design/others/images/graph_construction_example.bash
similarity index 100%
rename from doc/design/images/graph_construction_example.bash
rename to doc/fluid/design/others/images/graph_construction_example.bash
diff --git a/doc/design/images/graph_construction_example.dot b/doc/fluid/design/others/images/graph_construction_example.dot
similarity index 100%
rename from doc/design/images/graph_construction_example.dot
rename to doc/fluid/design/others/images/graph_construction_example.dot
diff --git a/doc/design/images/graph_construction_example_all.png b/doc/fluid/design/others/images/graph_construction_example_all.png
similarity index 100%
rename from doc/design/images/graph_construction_example_all.png
rename to doc/fluid/design/others/images/graph_construction_example_all.png
diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/fluid/design/others/images/graph_construction_example_forward_backward.png
similarity index 100%
rename from doc/design/images/graph_construction_example_forward_backward.png
rename to doc/fluid/design/others/images/graph_construction_example_forward_backward.png
diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/fluid/design/others/images/graph_construction_example_forward_only.png
similarity index 100%
rename from doc/design/images/graph_construction_example_forward_only.png
rename to doc/fluid/design/others/images/graph_construction_example_forward_only.png
diff --git a/doc/design/parameters_in_cpp.md b/doc/fluid/design/others/parameters_in_cpp.md
similarity index 100%
rename from doc/design/parameters_in_cpp.md
rename to doc/fluid/design/others/parameters_in_cpp.md
diff --git a/doc/design/simple_op_design.md b/doc/fluid/design/others/simple_op_design.md
similarity index 100%
rename from doc/design/simple_op_design.md
rename to doc/fluid/design/others/simple_op_design.md
diff --git a/doc/design/test.dot b/doc/fluid/design/others/test.dot
similarity index 100%
rename from doc/design/test.dot
rename to doc/fluid/design/others/test.dot
diff --git a/doc/design/test.dot.png b/doc/fluid/design/others/test.dot.png
similarity index 100%
rename from doc/design/test.dot.png
rename to doc/fluid/design/others/test.dot.png
diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md
index 5596b2653ae6ed9917f77dad08f926bcb1fb3419..7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549 100644
--- a/doc/fluid/dev/api_doc_std_cn.md
+++ b/doc/fluid/dev/api_doc_std_cn.md
@@ -1,8 +1,9 @@
 # API注释撰写标准
 
-- [API注释模块](#API注释模块)
-- [格式及示例](#格式及示例)
-- [完整示例](#完整示例)
+- [API注释撰写标准](#api)
+    - [API注释模块](#api)
+    - [格式及示例](#)
+    - [完整示例](#)
 
 
 ## API注释模块
@@ -45,11 +46,11 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
 - Python API Definition
 
   - 格式：
-    
+
       [Python API Definition]
-    
+
   - 示例
-  
+
       ```
       fc(input,
          size,
@@ -63,19 +64,19 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
       ```
 
 - Function Description
-  
+
   - 格式
 
       本模块应包含以下内容（排列顺序为文档撰写顺序）：
 
       [Function Description]
-  
+
       [Formula]
-    
+
       [Symbols' Descriptions if necessary]
-    
+
       [References if necessary]
- 
+
   - 示例
 
       [Function Description]
@@ -119,18 +120,18 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
       [References if necessary]
 
       因fc没有必要列出的参考文献，故该内容省略。其他情况下需明确给出对应的参考文献和对应连接，以 layer_norm 为例：
-      
+
       ```
       Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
       ```
-  
+
 
 - Args Description
-  
+
   - 格式
-  
+
       \[Arg's Name\][(Data Type, Default Value)][Description]
-  
+
   - 示例
 
       fc的部分参数注释如下：
@@ -145,35 +146,35 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
       ```
 
 - Returns
-  
+
   - 格式
-  
+
       [Name][Shape]
-  
+
   - 示例
-  
+
       ```
       Returns:
           A tensor variable storing the transformation result.
       ```
-  
+
       当返回值为包含多个参数的tuple时，应按顺序逐个介绍各参数，以dynamic_lstm为例：
-  
+
       ```
       Returns:
           A tuple containing:
             The hidden state of LSTM whose shape is (T X D).
             The cell state of LSTM whose shape is (T X D).
       ```
-  
+
 - Raises
 
   - 格式
-  
+
       [Exception Type][Condition]
 
   - 示例
-  
+
       ```
       Raises:
           ValueError: If the rank of the input is less than 2.
@@ -182,7 +183,7 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
 - Note
 
   - 格式
-  
+
      [Note]
 
   - 示例
@@ -198,15 +199,15 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
           2. When num_heads == 1, scaled_dot_product_attention has no learnable
              parameters.
       ```
-  
+
 - Examples
 
   - 格式
 
       \[Python Code Snipper]
-  
+
   - 示例
-  
+
       ```
       Examples:
           .. code-block:: python
@@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写，该格式详情请参考[链接
 
 ## 完整示例
 
-fc 的完整注释见[示例](src/fc.py)。
+fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f175b219750d1c765a6a111c2ec3aa732fa46175
--- /dev/null
+++ b/doc/fluid/dev/api_doc_std_en.md
@@ -0,0 +1,227 @@
+# API Doc Standard
+
+- [API Doc Standard](#api-doc-standard)
+    - [API Doc Structure](#api-doc-structure)
+    - [Format and Examples](#format-and-examples)
+    - [Complete Example](#complete-example)
+
+
+## API Doc Structure
+
+API Doc should contain the following parts(please write them in order):
+
+- Python API Definition
+
+  The definition of API
+
+- Function Description
+
+  Description of API's function. 
+  The description includes: meaning, purpose and operation on input of API, reference and corresponding link(if any), formula(if necessary) and explanations of key variables in the formula.
+
+- Args Description
+
+  Description of API parameters.
+  Introduce parameters one by one according to the order in API definition.
+  The introduction includes: data type, default value(if any), meaning, etc.
+
+- Returns
+
+  Introduction of API returned value.
+  Introduce meaning of returned value, provide correspoding format if necessary.
+  If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order.
+
+- Raises（if any）
+
+   Abnormality, error that may occur, and possible reasons. If there are more than one possible abnormity or error, they should be listed in order. 
+
+- Note（if any）
+
+  Matters needing attention. If there are more than one matters, they should be listed in order. 
+
+- Examples
+
+  Examples of how to use API.
+
+
+## Format and Examples
+
+API documentation must obey reStructuredText format, please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html).
+Format and examples of each part of API documantation are as follows: (take fc for example)
+
+- Python API Definition
+
+  - Format
+
+      [Python API Definition]
+
+  - Example
+
+      ```
+      fc(input,
+         size,
+         num_flatten_dims=1,
+         param_attr=None,
+         bias_attr=None,
+         act=None,
+         name=None,
+         main_program=None,
+         startup_program=None)
+      ```
+
+- Function Description
+
+  - Format
+
+      This part contains (please write them in order):
+
+      [Function Description]
+
+      [Formula]
+
+      [Symbols' Descriptions if necessary]
+
+      [References if necessary]
+
+  - Example
+
+      [Function Description]
+
+       ```
+       **Fully Connected Layer**
+
+       The fully connected layer can take multiple tensors as its inputs. It
+       creates a variable called weights for each input tensor, which represents
+       a fully connected weight matrix from each input unit to each output unit.
+       The fully connected layer multiplies each input tensor with its coresponding
+       weight to produce an output Tensor. If multiple input tensors are given,
+       the results of multiple multiplications will be sumed up. If bias_attr is
+       not None, a bias variable will be created and added to the output. Finally,
+       if activation is not None, it will be applied to the output as well.
+       ```
+
+      [Formula]
+
+      ```
+      This process can be formulated as follows:
+
+      .. math::
+
+           Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+      ```
+
+      [Symbols' Descriptions if necessary]
+
+      ```
+      In the above equation:
+
+      * :math:`N`: Number of the input.
+      * :math:`X_i`: The input tensor.
+      * :math:`W`: The weights created by this layer.
+      * :math:`b`: The bias parameter created by this layer (if needed).
+      * :math:`Act`: The activation function.
+      * :math:`Out`: The output tensor.
+      ```
+
+      [References if necessary]
+
+      Since there is no need for reference of fc, we omit them here. Under other circumstances, please provide explicit reference and link, take layer_norm for example: 
+
+      ```
+      Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
+      ```
+
+
+- Args Description
+
+  - Format
+
+      \[Arg's Name\][(Data Type, Default Value)][Description]
+
+  - Example
+
+      part of fc parameters are as follows:
+
+      ```
+      Args:
+          input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+              the input tensor(s) is at least 2.
+          param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+              parameters/weights of this layer.
+          name (str, default None): The name of this layer.
+      ```
+
+- Returns
+
+  - Format
+
+      [Name][Shape]
+
+  - Example
+
+      ```
+      Returns:
+          A tensor variable storing the transformation result.
+      ```
+
+      when returned value contain more than one tuple, please introduce every parameter in order, take dynamic_lstm for example:
+
+      ```
+      Returns:
+          A tuple containing:
+            The hidden state of LSTM whose shape is (T X D).
+            The cell state of LSTM whose shape is (T X D).
+      ```
+
+- Raises
+
+  - Format
+
+      [Exception Type][Condition]
+
+  - Example
+
+      ```
+      Raises:
+          ValueError: If the rank of the input is less than 2.
+      ```
+
+- Note
+
+  - Format
+
+     [Note]
+
+  - Example
+
+      there is no Note in fc, so we omit this part. If there is any note, please write clearly. If there are more than one notes, please list them in order. Take scaled\_dot\_product\_attention for example:
+
+      ```
+      Note:
+          1. When num_heads > 1, three linear projections are learned respectively
+             to map input queries, keys and values into queries', keys' and values'.
+             queries', keys' and values' have the same shapes with queries, keys
+             and values.
+          2. When num_heads == 1, scaled_dot_product_attention has no learnable
+             parameters.
+      ```
+
+- Examples
+
+  - Format
+
+      \[Python Code Snipper]
+
+  - Example
+
+      ```
+      Examples:
+          .. code-block:: python
+
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+      ```
+
+## Complete Example
+
+Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。
diff --git a/doc/design/ci_build_whl.png b/doc/fluid/dev/ci_build_whl.png
similarity index 100%
rename from doc/design/ci_build_whl.png
rename to doc/fluid/dev/ci_build_whl.png
diff --git a/doc/fluid/dev/contribute_to_paddle_cn.md b/doc/fluid/dev/contribute_to_paddle_cn.md
new file mode 120000
index 0000000000000000000000000000000000000000..955216ca62e71b4d3666e1662aa86c9495d2e7d6
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_cn.md
@@ -0,0 +1 @@
+../../v2/dev/contribute_to_paddle_cn.md
\ No newline at end of file
diff --git a/doc/fluid/dev/contribute_to_paddle_en.md b/doc/fluid/dev/contribute_to_paddle_en.md
new file mode 120000
index 0000000000000000000000000000000000000000..f9fc68c37e17a8a365b0d7fae86c16b0d094631f
--- /dev/null
+++ b/doc/fluid/dev/contribute_to_paddle_en.md
@@ -0,0 +1 @@
+../../v2/dev/contribute_to_paddle_en.md
\ No newline at end of file
diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..37e608160db0ad5a92297987937bbbfa8f842ea8
--- /dev/null
+++ b/doc/fluid/dev/index_cn.rst
@@ -0,0 +1,16 @@
+开发标准
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_cn.md
+  write_docs_cn.md
+  api_doc_std_cn.md
+  new_op_cn.md
+  new_op_kernel.md
+  use_eigen_cn.md
+  name_convention.md
+  support_new_device.md
+  releasing_process_cn.md
+  op_markdown_format.md
diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d7f83035010f13c30514673ecbee301f194dc175
--- /dev/null
+++ b/doc/fluid/dev/index_en.rst
@@ -0,0 +1,16 @@
+Development
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  contribute_to_paddle_en.md
+  write_docs_en.md
+  api_doc_std_en.md
+  new_op_en.md
+  new_op_kernel.md
+  use_eigen_en.md
+  name_convention.md
+  support_new_device.md
+  releasing_process_en.md
+  op_markdown_format.md
diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md
new file mode 100644
index 0000000000000000000000000000000000000000..6b4244d0f506c8cd6c08739141eabad27c581ca7
--- /dev/null
+++ b/doc/fluid/dev/name_convention.md
@@ -0,0 +1,65 @@
+# Operator's Parameter Name Convention
+
+To make the operator document itself more clear, we recommend operator names obey the listing conventions.
+
+## OpProtoMaker names
+
+When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L61) , and will be used in client language to create operator.
+
+- Input/Output.
+  - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words.
+  - If an operator's Input/Output are tensors in math, not match to any meaningful words, input name should starts from `X`. e.g. `X`, `Y`, and output name should starts from `Out`. e.g. `Out`. This rule intends making operators which have few inputs/outputs unified.
+
+- Attribute.
+  - Attribute name follows the **snake_case**. e.g. `x`, `y`, `axis`, `rowwise_matrix`. Also, attribute name prefers to meaningful English words.
+
+- Comments.
+  - Input/Output/Attr comment follow the format of **(type,default value) usage**, corresponding to which type it can be and how it will be used in the operator. e.g.  Attribute in Accumulator`"gamma" `,`(float, default 1.0) Accumulation multiplier`.
+  - Operator comment format of` R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment. e.g. `Out = X + Y`.
+
+- Order.
+  - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
+
+## Best Practice
+
+Here we give some examples to show how these rules will be used.
+
+- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`.
+
+- The operator has two input, one output. e.g. `rowwise_add`, inputs : `X`, `Y`, outputs : `Out`.
+
+- The operator contains attribute. e.g. `cosine`, inputs : `X`, `axis`, outputs : `Out`.
+
+  We give a full example of Accumulator Operator.
+
+```c++
+class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  AccumulateOpMaker(OpProto *proto,
+                    OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
+    If the output size is not the same as input size,
+    the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
+    AddOutput("Out", "(Tensor) Accumulated output tensor");
+    AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
+    AddComment(R"DOC(
+Accumulate Operator.
+
+This operator accumulates the input tensor to the output tensor. If the
+output tensor already has the right size, we add to it; otherwise, we first
+initialize the output tensor to all zeros, and then do accumulation. Any
+further calls to the operator, given that no one else fiddles with the output
+in the interim, will do simple accumulations.
+
+Accumulation is done as follows:
+
+Out = 1*X + gamma*Out
+
+where X is the input tensor, Out is the output tensor and gamma is the multiplier
+argument.
+
+)DOC");
+  }
+};
+```
diff --git a/doc/fluid/dev/new_op_cn.md b/doc/fluid/dev/new_op_cn.md
index 92996585674b46f45549b972b9f295503b1c7f8c..587d819f79fcf82549826359fbf04ad3af404446 100644
--- a/doc/fluid/dev/new_op_cn.md
+++ b/doc/fluid/dev/new_op_cn.md
@@ -26,19 +26,38 @@
 
 依据是否包含kernel，可以将Op分为两种：包含Kernel的Op和不包含kernel的Op，前者Op的定义继承自`OperatorWithKernel`，后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何写，简单总结Op需要包含的内容如下：
 
-
- 内容            | 定义位置
---------------  | :----------------------
-OpProtoMake定义  | `.cc`文件，Backward Op不需要定义OpProtoMake
-Op定义           | `.cc`文件
-Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
-注册Op           | Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中
-
-
-实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
-
-
-下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+<table>
+<thead>
+<tr>
+<th>内容</th>
+<th>定义位置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>OpProtoMake定义 </td>
+<td>`.cc`文件，Backward Op不需要定义OpProtoMake </td>
+</tr>
+<tr>
+<td>Op定义 </td>
+<td> `.cc`文件</td>
+</tr>
+<tr>
+<td>Kernel实现 </td>
+<td> CPU、CUDA共享Kernel实现在`.h`文件中，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。</td>
+</tr>
+<tr>
+<td>注册Op </td>
+<td> Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中</td>
+</tr>
+</tbody>
+</table>
+
+
+实现新的op都添加至目录[paddle/fluid/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
+
+
+下面以矩阵乘操作，即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
 
 
 ## 实现C++类
@@ -66,17 +85,17 @@ The equation is: Out = X * Y
 };
 ```
 
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)继承自`framework::OpProtoAndCheckerMaker`，构造函数含有2个参数：
 
    - `framework::OpProto` ： 前者存储Op的输入输出和参数属性，将用于Python API接口的生成。
    - `framework::OpAttrChecker` ：后者用于检查参数属性的合法性。
 
 构造函数里通过`AddInput`添加输入参数，通过`AddOutput`添加输出参数，通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
 
-上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md)。
+上面的代码在`MulOp`中添加两个输入`X`和`Y`，添加了一个输出`Out`，并解释了各自含义，命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md)。
 
 
-再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例：
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55)为例：
 
 ```cpp
 template <typename AttrType>
@@ -84,21 +103,21 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of scale operator.").NotInGradient();
-    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
-    AddComment(R"DOC(Scale operator
-The equation is: Out = scale*X
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddComment(R"DOC(
+Scale operator
+$$Out = scale*X$$
 )DOC");
-    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+    AddAttr<AttrType>("scale",
+                      "(float, default 1.0)"
+                      "The scaling factor of the scale operator.")
+        .SetDefault(1.0);
   }
 };
 ```
 
-这个例子有两处不同：
-
-- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中，如果Op的某个输入不参与反向梯度的计算，请显示地调用`.NotInGradient()`进行设置。
-
-- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
+这个例子有`AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数，作为参数属性，并且设置默认值为1.0。
 
 
 ### 定义Operator类
@@ -128,7 +147,7 @@ class MulOp : public framework::OperatorWithKernel {
 };
 ```
 
-[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员：
 
 ```cpp
 using framework::OperatorWithKernel::OperatorWithKernel;
@@ -154,7 +173,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
 `MulKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
 
-- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
+- `typename DeviceContext`: 表示设备类型，不同设备(CPU、CUDA)共享同一个Kernel时，需加该模板参数，不共享则不加，一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
 
 - `typename T` : 表示数据类型，如`float`, `double`等。
 
@@ -182,10 +201,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
 需要注意：**不同设备(CPU、CUDA)共享一个Op定义，是否则共享同一个`OpKernel`，取决于`Compute`调用的函数是否支持不同设备。**
 
-`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。
-
-为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。
+`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考：[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43)。
 
+为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用，我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
 
 到此，前向Op实现完成。接下来，需要在`.cc`文件中注册该op和kernel。
 反向Op类的定义，反向OpKernel的定义与前向Op类似，这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。
@@ -196,7 +214,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
     ```cpp
     namespace ops = paddle::operators;
-    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
     REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
     REGISTER_OP_CPU_KERNEL(mul_grad,
                   ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
@@ -204,8 +224,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
    在上面的代码中：
 
-    - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
-    - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
+    - `REGISTER_OPERATOR` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
     - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 
 
@@ -236,7 +255,7 @@ make mul_op
 
 ## 实现单元测试
 
-单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。
+单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py)。
 
 ### 前向Operator单测
 
@@ -296,7 +315,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp
 
 ### 编译和执行
 
-`python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
+`python/paddle/fluid/tests/unittests/` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。
 
 请注意，**不同于Op的编译测试，运行单元测试测时需要编译整个工程**，并且编译时需要打开`WITH_TESTING`, 即`cmake paddle_dir -DWITH_TESTING=ON`。编译成功后，执行下面的命令来运行单元测试：
 
@@ -312,7 +331,6 @@ ctest -R test_mul_op
 
 ## 注意事项
 
-- 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。不允许一个文件中包含多个Op，这将会导致编译出错。
-- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`等，这将会导致单元测试出错。
+- 注册Op时的类型名，需要和该Op的名字一样。即不允许在`A_op.cc`里面，注册`REGISTER_OPERATOR(B, ...)`等，这将会导致单元测试出错。
 - 如果Op没有实现CUDA Kernel，请不要创建空的`*_op.cu`，这将会导致单元测试出错。
 - 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
diff --git a/doc/fluid/dev/new_op_en.md b/doc/fluid/dev/new_op_en.md
index da8b1bdd1082e439456daf25e9b3a1e8eb534375..f8de271ed4e5e0fb4018478bffd4b525d4319738 100644
--- a/doc/fluid/dev/new_op_en.md
+++ b/doc/fluid/dev/new_op_en.md
@@ -26,18 +26,38 @@ Here are the base types needed. For details, please refer to the design docs.
 Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
 
 
- Information           | Where is it defined
---------------  | :----------------------
-OpProtoMake definition  | `.cc`files, Backward Op does not need an OpProtoMake interface.
-Op definition           | `.cc` files
-Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.
-Registering the Op           | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
-
-
-New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
-
-
-Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
+<table>
+<thead>
+<tr>
+<th>Information</th>
+<th> Where is it defined</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>OpProtoMake definition </td>
+<td> `.cc`files, Backward Op does not need an OpProtoMake interface. </td>
+</tr>
+<tr>
+<td>Op definition  </td>
+<td> `.cc` files</td>
+</tr>
+<tr>
+<td>Kernel implementation  </td>
+<td> The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files.</td>
+</tr>
+<tr>
+<td>Registering the Op  </td>
+<td> Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.</td>
+</tr>
+</tbody>
+</table>
+
+
+New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions.**
+
+
+Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc), as an example to introduce the writing of an Operator with Kernel.
 
 
 ## Implementing C++ Types
@@ -65,17 +85,17 @@ The equation is: Out = X * Y
 };
 ```
 
-[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor：
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L76-L127)is inherited from`framework::OpProtoAndCheckerMaker`, consisting of 2 variables in the constructor：
 
    - `framework::OpProto` stores Operator input and variable attribute, used for generating Python API interfaces.
    - `framework::OpAttrChecker` is used to validate variable attributes.
 
 The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`.
 
-The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md).
+The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance to Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/name_convention.md).
 
 
-An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows:
+An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/scale_op.cc#L38-L55) is implemented as follows:
 
 ```cpp
 template <typename AttrType>
@@ -93,11 +113,7 @@ The equation is: Out = scale*X
 };
 ```
 
-There are two changes in this example:
-
-- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`.
-
-- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);`  adds `scale`constant as an attribute, and sets the default value to 1.0.
+Note `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0.
 
 
 ### Defining Operator
@@ -127,7 +143,7 @@ class MulOp : public framework::OperatorWithKernel {
 };
 ```
 
-[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L24) is inherited from `OperatorWithKernel`. Its `public` member
 
 ```cpp
 using framework::OperatorWithKernel::OperatorWithKernel;
@@ -153,7 +169,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 
 `MulKernel` inherits `framework::OpKernel`, which includes the following templates:
 
-- `typename  DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+- `typename  DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.h#L43).
 
 - `typename T` denotes data type, such as `float` or `double`.
 
@@ -182,9 +198,9 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w
 
 Note that **different devices (CPU, CUDA)share one Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions can support both devices.**
 
-`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing  `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/cross_entropy_op.cc).
 
-To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
+To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_en.md).
 
 
 This concludes the forward implementation of an operator. Next its operation and kernel need to be registered in a `.cc` file.
@@ -197,7 +213,9 @@ The definition of its corresponding backward operator, if applicable, is similar
 
     ```cpp
     namespace ops = paddle::operators;
-    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>)
+    REGISTER_OPERATOR(mul_grad, ops::MulGradOp)
 
     REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
     REGISTER_OP_CPU_KERNEL(mul_grad,
@@ -206,9 +224,8 @@ The definition of its corresponding backward operator, if applicable, is similar
 
    In that code block,
 
-    - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
+    - `REGISTER_OPERATOR` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
     - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-
     - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 
 
@@ -248,7 +265,7 @@ Unit tests for an operator include
 
 3. a scaling test for the backward operator.
 
-Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
+Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_mul_op.py).
 
 ### Testing Forward Operators
 
@@ -279,7 +296,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass
 
       def test_check_output(self):
           self.check_output()
-          
+
       def test_check_grad_normal(self):
           self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
@@ -312,7 +329,7 @@ Some key points in checking gradient above include:
 ### Compiling and Running
 
 
-Any new unit testing file of the format `test_*.py`  added to the director `python/paddle/v2/framework/tests` is automatically added to the project to compile.
+Any new unit testing file of the format `test_*.py`  added to the director `python/paddle/fluid/tests/unittests/` is automatically added to the project to compile.
 
 Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with flag `WITH_TESTING` on i.e. `cmake paddle_dir -DWITH_TESTING=ON`.
 
@@ -330,7 +347,6 @@ ctest -R test_mul_op
 
 ## Remarks
 
-- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file.
-- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
+- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OPERATOR(B, ...)` in `A_op.cc` will cause unit testing failures.
 - If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
 - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
diff --git a/doc/fluid/dev/new_op_kernel.md b/doc/fluid/dev/new_op_kernel.md
new file mode 100644
index 0000000000000000000000000000000000000000..87e617d44041bde9c9051151878ffb4304689b3c
--- /dev/null
+++ b/doc/fluid/dev/new_op_kernel.md
@@ -0,0 +1,121 @@
+# Add Kernels for a New Device
+
+## Background
+
+PaddlePaddle Fluid have hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
+
+[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md).
+
+## Write Kernels for A New Device
+
+### Add A New Device
+
+  For some historical reaons, we misuse the word *library* for *device*.  For example, we call the deivce type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/library_type.h#L24).  We will correct this ASAP.
+
+To register a new device, we need to add an enum value to `LibraryType`:
+
+```
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};
+```
+
+
+### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53)
+
+If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L53). For example `CUDAPlace`:
+
+```cpp
+struct CUDAPlace {
+  CUDAPlace() : CUDAPlace(0) {}
+  explicit CUDAPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPlace &o) const {
+    return device == o.device;
+  }
+  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
+
+  int device;
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+
+### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37))
+After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h#L37) for it.
+
+```cpp
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
+
+  virtual void Wait() const {}
+};
+```
+
+### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L351) for your Device.
+
+A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/new_op_en.md)
+
+```cpp
+class OpKernelBase {
+ public:
+  /**
+   * ExecutionContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * ExecutionContext. User should construct it before run the Operator.
+   */
+
+  virtual void Compute(const ExecutionContext& context) const = 0;
+
+  virtual ~OpKernelBase() = default;
+};
+
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
+};
+```
+
+
+### Register the OpKernel to framework
+
+After writing the components described above, we should register the kernel to the framework.
+
+We use `REGISTER_OP_KERNEL` to do the registration.
+
+```cpp
+REGISTER_OP_KERNEL(
+	op_type,
+	library_type,
+	place_type,
+	kernel0, kernel1, ...)
+```
+
+kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`.
+
+take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/conv_cudnn_op.cu.cc#L318)) as an example:
+
+	```cpp
+	REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
+    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+	REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+	       paddle::operators::CUDNNConvOpKernel<float>,
+	       paddle::operators::CUDNNConvOpKernel<double>);
+	```
+
+In the code above:
+
+ - `conv2d` is the type/name of the operator
+ - `CUDNN/CPU` is `library`
+ - `paddle::platform::CUDAPlace/CPUPlace` is `place`
+ - template parameter `float/double` on `CUDNNConvOpKernel<T>` is `data_type`.
diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel_en.md
deleted file mode 100644
index 123df0a7ee4943c0b789ef9cfa6e0804d0fdd564..0000000000000000000000000000000000000000
--- a/doc/fluid/dev/new_op_kernel_en.md
+++ /dev/null
@@ -1,121 +0,0 @@
-## Add Kernels for a New Device
-
-### Background
-
-PaddlePaddle Fluid have hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
-
-[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
-
-### Write Kernels for A New Device 
-
-#### Add A New Device
-
-  For some historical reaons, we misuse the word *library* for *device*.  For example, we call the deivce type by *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24).  We will correct this ASAP.
-
-To register a new device, we need to add an enum value to `LibraryType`:
-
-```
-enum class LibraryType {
-  kPlain = 0,
-  kMKLDNN = 1,
-  kCUDNN = 2,
-};
-```
-
-
-#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
-
-If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
-
-```cpp
-struct CUDAPlace {
-  CUDAPlace() : CUDAPlace(0) {}
-  explicit CUDAPlace(int d) : device(d) {}
-
-  inline int GetDeviceId() const { return device; }
-  // needed for variant equality comparison
-  inline bool operator==(const CUDAPlace &o) const {
-    return device == o.device;
-  }
-  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
-
-  int device;
-};
-
-typedef boost::variant<CUDAPlace, CPUPlace> Place;
-```
-
-#### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37))
-After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
-
-```cpp
-class DeviceContext {
- public:
-  virtual ~DeviceContext() {}
-  virtual Place GetPlace() const = 0;
-
-  virtual void Wait() const {}
-};
-```
-
-#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
-
-A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md)
-
-```cpp
-class OpKernelBase {
- public:
-  /**
-   * ExecutionContext is the only parameter of Kernel Run function.
-   * Run will get input/output variables, state such as momentum and
-   * device resource such as CUDA stream, cublas handle, etc. from
-   * ExecutionContext. User should construct it before run the Operator.
-   */
-
-  virtual void Compute(const ExecutionContext& context) const = 0;
-
-  virtual ~OpKernelBase() = default;
-};
-
-template <typename T>
-class OpKernel : public OpKernelBase {
- public:
-  using ELEMENT_TYPE = T;
-};
-```
-
-
-#### Register the OpKernel to framework
-
-After writing the components described above, we should register the kernel to the framework.
-
-We use `REGISTER_OP_KERNEL` to do the registration.
-
-```cpp
-REGISTER_OP_KERNEL(
-	op_type,
-	library_type,
-	place_type,
-	kernel0, kernel1, ...)
-```
-
-kernel0, kernel1 are kernels that have the same `op_type`, `library_type`, `place_type` but different `data_types`.
-
-take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318)) as an example:
-
-	```cpp
-	REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
-    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
-    		paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
-    
-	REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
-	       paddle::operators::CUDNNConvOpKernel<float>,
-	       paddle::operators::CUDNNConvOpKernel<double>);
-	```
-
-In the code above:
-
- - `conv2d` is the type/name of the operator
- - `CUDNN/CPU` is `library`
- - `paddle::platform::CUDAPlace/CPUPlace` is `place`
- - template parameter `float/double` on `CUDNNConvOpKernel<T>` is `data_type`.
diff --git a/doc/fluid/dev/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md
new file mode 100644
index 0000000000000000000000000000000000000000..4e539d7992e5f67ee7b07193b59b6b425b73c9e5
--- /dev/null
+++ b/doc/fluid/dev/op_markdown_format.md
@@ -0,0 +1,64 @@
+# Standard Markdown Format for Operators
+The following should be the standard format for documentation for all the operators that will get rendered in the `html`:
+
+```
+Operator Name (In PaddlePaddle)
+
+Operator Name (Standard)
+
+Operator description.
+
+LaTeX equation of how the operator performs an update.
+
+The signature of the operator.
+```
+
+Each section mentioned above has been covered in further detail in the rest of the document.
+
+## PaddlePaddle Operator Name
+This should be in all small letters, in case of multiple words, we separate them with an underscore. For example:
+`array to lod tensor` should be written as `array_to_lod_tensor`.
+
+This naming convention should be standard across all PaddlePaddle operators.
+
+## Standard Operator Name
+This is the standard name of the operator as used in the community. The general standard is usually:
+- Standard abbreviations like `SGD` are written in all capital letters.
+- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
+- Keep numbers inside a word as is, with no boundary delimiters.
+- Follow the name of the operator with the keyword: `Activation Operator.`
+
+## Operator description
+This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
+
+## LaTeX equation
+This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`).
+
+## The signature
+This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
+`Section :
+VariableName : (VariableType) VariableDescription
+...
+...
+`
+
+
+The following example for an `sgd` operator covers the above mentioned sections as they would ideally look like in the `html`:
+
+```
+sgd
+
+SGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+param_out = param_learning_rate * grad
+
+Inputs:
+Param : (Tensor) Input parameter
+LearningRate : (Tensor) Learning rate of SGD
+Grad : (Tensor) Input gradient
+
+Outputs:
+ParamOut : (Tensor) Output parameter
+```
diff --git a/doc/fluid/dev/releasing_process_cn.md b/doc/fluid/dev/releasing_process_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..4c6728fba7150b0f1e180e57590f18a5b677c70d
--- /dev/null
+++ b/doc/fluid/dev/releasing_process_cn.md
@@ -0,0 +1,199 @@
+# PaddlePaddle发行规范
+
+PaddlePaddle使用git-flow branching model做分支管理，使用[Semantic Versioning](http://semver.org/)标准表示PaddlePaddle版本号。
+
+PaddlePaddle每次发新的版本，遵循以下流程:
+
+1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
+1. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
+1. 对这个版本的提交，做如下几个操作:
+  * 使用Regression Test List作为检查列表，测试本次release的正确性。
+	  * 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，到第二步
+	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
+	* 将这个版本的python wheel包发布到pypi。
+	* 更新Docker镜像（参考后面的操作细节）。
+1. 第三步完成后，将`release/版本号`分支合入master分支，将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。
+1. 协同完成Release Note的书写。
+
+需要注意的是:
+
+* `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试PaddlePaddle的行为。
+* 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+
+## 发布wheel包到pypi
+
+1. 使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+完成自动化二进制编译，参考下图，选择需要发布的版本（通常包含一个CPU版本和一个GPU版本），点击"run"右侧的"..."按钮，可以
+弹出下面的选择框，在第二个tab (Changes)里选择需要发布的分支，这里选择0.11.0，然后点击"Run Build"按钮。
+	<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. 等待编译完成后可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件，分别对应CAPI，`cp27m`和`cp27mu`的版本。
+1. 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513)，在使用twine上传之前，需要重命名wheel包中platform相关的后缀，比如将`linux_x86_64`修改成`manylinux1_x86_64`。
+1. 上传：
+```
+cd build/python
+pip install twine
+twine upload dist/[package to upload]
+```
+
+* 注：CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
+  发型版，如果需要手动编译，也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
+* pypi不支持覆盖上传，所以一个版本号的wheel包发布之后，不可以更改。下一个wheel包需要更新版本号才可以上传。
+
+## 发布Docker镜像
+
+上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub，所以，发布Docker镜像只需要对自动push的镜像打上
+版本号对应的tag即可：
+
+```
+docker pull [镜像]:latest
+docker tag [镜像]:latest [镜像]:[version]
+docker push [镜像]:[version]
+```
+
+需要更新的镜像tag包括：
+
+* `[version]`: CPU版本
+* `[version]-openblas`: openblas版本
+* `[version]-gpu`: GPU版本（CUDA 8.0 cudnn 5）
+* `[version]-gpu-[cudaver]-[cudnnver]`: 不同cuda, cudnn版本的镜像
+
+之后可进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看是否发布成功。
+
+## PaddlePaddle 分支规范
+
+PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
+
+* PaddlePaddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
+	* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
+	* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试，但并没有经过回归测试。
+	* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
+
+* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，但所有fork的版本库的所有分支都相当于特性分支。
+	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
+	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
+	* 当功能分支开发完毕后，向PaddlePaddle的主版本库提交`Pull Reuqest`，进而进行代码评审。
+		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。
+
+* BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
+
+## PaddlePaddle回归测试列表
+
+本列表说明PaddlePaddle发版之前需要测试的功能点。
+
+### PaddlePaddle Book中所有章节
+
+PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练（V2和Fluid）模型正确性。
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>新手入门章节 </th>
+<th> 识别数字</th>
+<th> 图像分类</th>
+<th>词向量</th>
+<th> 情感分析</th>
+<th>语意角色标注</th>
+<th> 机器翻译</th>
+<th>个性化推荐</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td>API.V2 + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>API.V2 + Ubuntu + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + CPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+</tbody>
+</table>
diff --git a/doc/fluid/dev/releasing_process_en.md b/doc/fluid/dev/releasing_process_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..f989b964d6d1a329bbe31adc7ec10db017acaefa
--- /dev/null
+++ b/doc/fluid/dev/releasing_process_en.md
@@ -0,0 +1,210 @@
+# PaddlePaddle Releasing Process
+
+PaddlePaddle manages its branches using "git-flow branching model", and [Semantic Versioning](http://semver.org/) as it's version number semantics.
+
+Each time we release a new PaddlePaddle version, we should follow the below steps:
+
+1. Fork a new branch from `develop` named `release/[version]`, e.g. `release/0.10.0`.
+1. Push a new tag on the release branch, the tag name should be like `[version]rc.patch`. The
+   first tag should be `0.10.0rc1`, and the second should be `0.10.0.rc2` and so on.
+1. After that, we should do:
+  * Run all regression test on the Regression Test List (see PaddlePaddle TeamCity CI), to confirm
+      that this release has no major bugs.
+        * If regression test fails, we must fix those bugs and create a new `release/[version]`
+          branch from previous release branch.
+    * Modify `python/setup.py.in`, change the version number and change `ISTAGED` to `True`.
+    * Publish PaddlePaddle release wheel packages to pypi (see below instructions for detail).
+    * Update the Docker images (see below instructions for detail).
+1. After above step, merge `release/[version]` branch to master and push a tag on the master commit,
+   then merge `master` to `develop`.
+1. Update the Release Note.          
+
+***NOTE:***
+
+* Do ***NOT*** merge commits from develop branch to release branches to keep the release branch contain
+  features only for current release, so that we can test on that version.
+* If we want to fix bugs on release branches, we must merge the fix to master, develop and release branch.
+
+## Publish Wheel Packages to pypi
+
+1. Use our [CI tool](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+   to build all wheel packages needed to publish. As shown in the following picture, choose a build
+     version, click "..." button on the right side of "Run" button, and switch to the second tab in the
+pop-up box, choose the current release branch and click "Run Build" button. You may repeat this
+     step to start different versions of builds.
+    <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/ci_build_whl.png">
+1. After the build succeeds, download the outputs under "Artifacts" including capi, `cp27m` and `cp27mu`.
+1. Since pypi.python.org follows [PEP 513](https://www.python.org/dev/peps/pep-0513), before we
+     upload the package using `twine`, we need to rename the package from `linux_x86_64` to
+     `manylinux1_x86_64`.
+1. Start the upload:
+     ```
+     cd build/python
+     pip install twine
+     twine upload dist/[package to upload]
+     ```
+
+* NOTE: We use a special Docker image to build our releases to support more Linux distributions, you can
+  download it from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/, or build it using
+    scripts under `tools/manylinux1`.
+* pypi does not allow overwrite the already uploaded version of wheel package, even if you delete the
+  old version. you must change the version number before upload a new one.
+
+## Publish Docker Images
+
+Our CI tool will push latest images to DockerHub, so we only need to push a version tag like:
+
+```
+docker pull [image]:latest
+docker tag [image]:latest [image]:[version]
+docker push [image]:[version]
+```
+
+Tags that need to be updated are:
+* `[version]`: CPU only version image
+* `[version]-openblas`: openblas version image
+* `[version]-gpu`: GPU version（using CUDA 8.0 cudnn 5）
+* `[version]-gpu-[cudaver]-[cudnnver]`: tag for different cuda, cudnn versions
+
+You can then checkout the latest pushed tags at https://hub.docker.com/r/paddlepaddle/paddle/tags/.
+
+## Branching Model
+
+We use [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) as our branching model,
+with some modifications:
+
+* `master` branch is the stable branch. Each version on the master branch is tested and guaranteed.
+* `develop` branch is for development. Each commit on develop branch has passed CI unit test, but no
+  regression tests are run.
+* `release/[version]` branch is used to publish each release. Latest release version branches have
+  bugfix only for that version, but no feature updates.
+* Developer forks are not required to follow
+  [git-flow](http://nvie.com/posts/a-successful-git-branching-model/)
+  branching model, all forks is like a feature branch.
+    * Advise: developer fork's develop branch is used to sync up with main repo's develop branch.
+    * Advise: developer use it's fork's develop branch to for new branch to start developing.
+  * Use that branch on developer's fork to create pull requests and start reviews.
+      * developer can push new commits to that branch when the pull request is open.
+* Bug fixes are also started from developers forked repo. And, bug fixes branch can merge to
+  `master`, `develop` and `releases`.
+
+## PaddlePaddle Regression Test List
+
+### All Chapters of PaddlePaddle Book
+
+We need to guarantee that all the chapters of PaddlePaddle Book can run correctly. Including
+V1 (`paddle_trainer` training) and V2 training and Fluid training.
+
+<table>
+<thead>
+<tr>
+<th></th>
+<th>Linear Regression</th>
+<th>Recognize Digits</th>
+<th>Image Classification</th>
+<th>Word2Vec</th>
+<th>Personalized Recommendation</th>
+<th>Sentiment Analysis</th>
+<th>Semantic Role Labeling</th>
+<th>Machine Translation</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td>API.V2 + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + GPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>`paddle_trainer` + Docker + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> API.V2 + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td>API.V2 + Ubuntu + CPU </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + GPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> `paddle_trainer` + Ubuntu + CPU</td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+<td>  </td>
+<td> </td>
+</tr>
+</tbody>
+</table>
diff --git a/doc/fluid/dev/support_new_device.md b/doc/fluid/dev/support_new_device.md
new file mode 100644
index 0000000000000000000000000000000000000000..051a463cfcf97df2e2d5b6a880923ca70fefbd6e
--- /dev/null
+++ b/doc/fluid/dev/support_new_device.md
@@ -0,0 +1,240 @@
+# Design Doc: Supporting new Device/Library
+
+## Background
+
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
+
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+
+So, how to support a new Device/Library in Fluid becomes a challenge.
+
+
+## Basic: Integrate A New Device/Library
+
+For a general overview of fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/read_source.md).
+
+There are mainly three parts that we have to consider while integrating a new device/library:
+
+- Place and DeviceContext: indicate the device id and manage hardware resources
+
+- Memory and Tensor: malloc/free data on certain device
+
+- Math Functor and OpKernel: implement computing unit on certain devices/libraries
+
+### Place and DeviceContext
+
+Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
+
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
+
+```
+        |   CPUPlace
+Place --|   CUDAPlace
+        |   FPGAPlace
+```
+
+And `Place` is defined as follows:
+
+```
+typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
+```
+
+#### DeviceContext
+
+Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/fluid/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+
+
+```
+                /->  CPUDeviceContext   
+DeviceContext ---->  CUDADeviceContext  
+                \->  FPGADeviceContext
+```
+
+An example of Nvidia GPU is as follows:
+
+- DeviceContext
+
+
+```
+class DeviceContext {
+  virtual Place GetPlace() const = 0;
+};  
+```
+
+
+- CUDADeviceContext
+
+
+```
+class CUDADeviceContext : public DeviceContext {
+  Place GetPlace() const override { return place_; }
+private:
+  CUDAPlace place_;
+  cudaStream_t stream_;
+  cublasHandle_t cublas_handle_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
+};
+```
+
+### Memory and Tensor
+
+
+#### memory module
+
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/memory/memory.h#L36):
+
+```
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+template <typename Place>
+void Free(Place place, void* ptr);
+
+template <typename Place>
+size_t Used(Place place);
+```
+
+To implement these interfaces, we have to implement MemoryAllocator for different Devices.
+
+
+#### Tensor
+
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h#L36) holds data with some shape in a specific Place.
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configurate its shape, and then call `mutuable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+
+
+
+### Math Functor and OpKernel
+
+Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators. This common part will be put in operators/math directory as basic Functors.
+
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/math/maxouting.h#L27) as an example:
+
+The interface is defined in the header file.
+
+```
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* output, int groups);
+};
+```
+
+CPU implementation is in .cc file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+  public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};
+```
+
+CUDA implementation is in .cu file
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};                  
+```
+
+
+We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
+
+The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
+
+Fluid provides different register interfaces in op_registry.h
+
+
+Let's take [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/crop_op.cc#L134) operator as an example:
+
+In .cc file:
+
+```
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
+```
+
+In .cu file:
+
+```
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
+```
+
+
+## Advanced topics: How to switch between different Device/Library
+
+Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
+
+
+For more details, please refer to following docs:
+
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/multi_devices/operator_kernel_type.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)
diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md
index f36843b4408c21bdca1fa83853e5b0a40116791c..75922e7d85a13e53ce94619a48d8da8b960e6c9a 100644
--- a/doc/fluid/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
@@ -1,16 +1,16 @@
-## 在Paddle中如何使用Eigen
+# 在Paddle中如何使用Eigen
 
 神经网络本质上是一个计算图，计算需要的数据存放在`Tensor`中，而计算过程是由`Operartor`来描述的。在执行时，`Operator`调用对应`OpKernel`中的`Compute`接口，实现对`Tensor`的操作。
 
 
-### Eigen Tensor模块
+## Eigen Tensor模块
 
 Eigen Tensor模块对element-wise计算提供了强大的支持，并且书写一份代码，可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块，因此可能测试不够完备，文档较少。
 
 关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md)
 
 
-### paddle::framework::Tensor
+## paddle::framework::Tensor
 
 Paddle Tensor定义在framework目录下，其主要接口如下：
 
@@ -20,14 +20,14 @@ class Tensor {
   /*! Return a pointer to mutable memory block. */
   template <typename T>
   inline T* data();
-  
+
   /**
    * @brief   Return a pointer to mutable memory block.
    * @note    If not exist, then allocation.
    */
   template <typename T>
   inline T* mutable_data(platform::Place place);
-  
+
   /**
    * @brief     Return a pointer to mutable memory block.
    *
@@ -38,17 +38,17 @@ class Tensor {
    */
   template <typename T>
   inline T* mutable_data(DDim dims, platform::Place place);
-  
+
   /*! Resize the dimensions of the memory block. */
   inline Tensor& Resize(const DDim& dims);
-  
+
   /*! Return the dimensions of the memory block. */
   inline const DDim& dims() const;
 
  private:  
   /*! holds the memory block if allocated. */
   std::shared_ptr<Placeholder> holder_;
-  
+
   /*! points to dimensions of memory block. */
   DDim dim_;
 };
@@ -129,7 +129,7 @@ From是EigenTensor模板提供的一个接口，可以实现从paddle::framework
 
 
 
-### 实现计算
+## 实现计算
 
 当需要完成计算时，我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是，这里的EigenTensor之间的运算只是改变了原有Tensor中的数据，而不会改变原有Tensor的shape信息。
 
diff --git a/doc/fluid/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md
index 3a466f73d1f9b94a29b171015279c782ca50bd02..3313d097cb21e40c23aa13187b6a50562f12403a 100644
--- a/doc/fluid/dev/use_eigen_en.md
+++ b/doc/fluid/dev/use_eigen_en.md
@@ -1,9 +1,9 @@
-## How to use Eigen in Paddle
+# How to use Eigen in Paddle
 
 Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
 
 
-### Eigen Tensor Module
+## Eigen Tensor Module
 
 The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
 
@@ -12,7 +12,7 @@ Note that Eigen Tensor is still being actively developed, so its tests are not c
 For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
 
 
-### paddle::framework::Tensor
+## paddle::framework::Tensor
 
 Paddle Tensor's is defined in the framework directory with the following interface:
 
@@ -105,7 +105,7 @@ void Compute(const framework::ExecutionContext& context) const override {
 ```
 
 
-### paddle::framework::Tensor到EigenTensor的转换
+## paddle::framework::Tensor到EigenTensor的转换
 
 As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
 
@@ -129,7 +129,7 @@ For more transformations, see the [unit tests](https://github.com/PaddlePaddle/P
 
 
 
-### Implementing Computation
+## Implementing Computation
 
 While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally inthe Tensor and does not change all the shape information associated with the Tensor.
 
diff --git a/doc/fluid/dev/write_docs_cn.rst b/doc/fluid/dev/write_docs_cn.rst
new file mode 120000
index 0000000000000000000000000000000000000000..2c281eaaf43bbfad84c3be9ed1d1bd0dbc77fa9b
--- /dev/null
+++ b/doc/fluid/dev/write_docs_cn.rst
@@ -0,0 +1 @@
+../../v2/dev/write_docs_cn.rst
\ No newline at end of file
diff --git a/doc/fluid/dev/write_docs_en.rst b/doc/fluid/dev/write_docs_en.rst
new file mode 120000
index 0000000000000000000000000000000000000000..cb2b9b0ff1f1d9e0e5201d160f6b7d9d451374e2
--- /dev/null
+++ b/doc/fluid/dev/write_docs_en.rst
@@ -0,0 +1 @@
+../../v2/dev/write_docs_en.rst
\ No newline at end of file
diff --git a/doc/fluid/faq/index_cn.rst b/doc/fluid/faq/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..395c1109891b5a00eab6f0b44d855658def7fdd6
--- /dev/null
+++ b/doc/fluid/faq/index_cn.rst
@@ -0,0 +1,2 @@
+FAQ
+------------
diff --git a/doc/fluid/faq/index_en.rst b/doc/fluid/faq/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..395c1109891b5a00eab6f0b44d855658def7fdd6
--- /dev/null
+++ b/doc/fluid/faq/index_en.rst
@@ -0,0 +1,2 @@
+FAQ
+------------
diff --git a/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
new file mode 100644
index 0000000000000000000000000000000000000000..79df6c59578e2acf495a3453ab61f069c3f09a49
--- /dev/null
+++ b/doc/fluid/getstarted/Developer's_Guide_to_Paddle_Fluid.md
@@ -0,0 +1,1819 @@
+
+# Paddle Fluid 开发者指南
+
+---
+
+### ==1==. 为什么需要 PaddlePaddle Fluid？
+
+---
+
+### 两个基础问题
+
+<font size=6>
+
+1. 如何描述机器学习模型和优化过程？
+    - 完备自洽，表达能力足以支持潜在出现的各种计算需求
+1. 如何充分利用资源高效计算？
+    - 支持异步设备、多卡、分布式计算
+    - 降低计算/计算优化的开发成本
+    - ……
+
+</font>
+
+---
+
+### 如何描述模型和优化过程？
+
+<font size=6>
+
+<table>
+<thead>
+<tr>
+<th> </th>
+<th>一组连续执行的layers</th>
+<th>variable和operator构成的计算图 </th>
+<th>不再有模型的概念 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> 2013</td>
+<td> Caffe，Theano, Torch, PaddlePaddle </td>
+<td> </td>
+<td> </td>
+</tr>
+
+<tr>
+<td> 2015 </td>
+<td> </td>
+<td> TensorFlow, MxNet, Caffe2, ONNX, n-graph </td>
+<td> </td>
+</tr>
+<tr>
+<td>2016 </td>
+<td> </td>
+<td> </td>
+<td> PyTorch, TensorFlow Eager Execution, <font color=#483D8B>**==PaddlePaddle Fluid==** </td>
+</tr>
+
+</tbody>
+</table>
+
+---
+
+
+### <p align="center">目标 </p>
+
+<font size=6>
+
+- 提高对各类机器学习任务的描述能力：能够描述潜在出现的任意机器学习模型。
+- 代码结构逻辑清晰，各模块充分解耦：内外部贡献者能够专注于自己所需的功能模块，基于框架进行再次开发。
+- 从设计上，留下技术优化的空间和潜力。
+- 代码解耦后降低多设备支持、计算优化等的开发成本。
+- 在统一的设计理念下，实现自动可伸缩，自动容错的分布式计算。
+
+</font>
+
+---
+
+## ==2.== Design Overview
+
+---
+
+# Fluid: 系统形态
+
+- <span style="background-color:#ACD6FF;">[编译器式的执行流程，区分编译时和运行时](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)</span>
+<br>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid-compiler.png" width=100%>
+</p>
+
+---
+
+#### 让我们在Fluid程序实例中，区分编译时和运行时
+
+---
+### Fluid 编译时
+
+<font size=5>
+
+- ==**定义前向计算**==
+
+  ```python
+  x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+  y_predict = fluid.layers.fc(input=x, size=1, act=None)
+  y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+  cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+  avg_cost = fluid.layers.mean(x=cost)
+  ```
+
+- ==**添加反向、正则、优化**==
+  ```python
+  learning_rate = 0.01
+  sgd_optimizer = fluid.optimizer.SGD(learning_rate)
+  sgd_optimizer.minimize(avg_cost)
+  ```
+</font>
+
+---
+
+### `Program` vs. 计算图
+
+<font size=5>
+
+- 在科学计算领域，计算图是一种描述计算的经典方式。下图展示了从前向计算图（蓝色）开始，通过添加反向（红色）和优化算法相关（绿色）操作，构建出整个计算图的过程：
+-
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/graph_construction_example_all.png" width=60%>
+</p>
+
+
+- Fluid ==使用`Program`而不是计算图==来描述模型和优化过程。`Program`由`Block`、`Operator`和`Variable`构成，相关概念会在后文详细展开。
+- 编译时 Fluid 接受前向计算（这里可以先简单的理解为是一段有序的计算流）`Program`，为这段前向计算按照：前向 -> 反向 -> 梯度 clip -> 正则 -> 优化 的顺序，添加相关 `Operator`和`Variable`到`Program`到完整的计算。
+
+</font>
+
+---
+
+### Fluid 运行时
+
+<font size=5>
+
+- ==**读入数据**==
+
+  ```python
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
+      batch_size=20)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  ```
+- ==**定义执行程序的设备**==
+  ```python
+  place = fluid.CPUPlace()
+  feeder = fluid.DataFeeder(place=place,feed_list=[x, y])
+  ```
+
+- ==创建执行器（Executor），执行初始化 `Program`和训练`Program`==
+
+  ```python
+  exe = fluid.Executor(place)
+  exe.run(fluid.default_startup_program())
+  PASS_NUM = 100
+  for pass_id in range(PASS_NUM):
+      for data in train_reader():
+          avg_loss_value, = exe.run(fluid.default_main_program(),
+                                    feed=feeder.feed(data),
+                                    fetch_list=[avg_cost])
+          print(avg_loss_value)
+  ```
+</font>
+
+---
+
+### 总结：框架做什么？用户做什么？
+<br>
+
+<font size=5>
+<table>
+<thead>
+<tr>
+<th>构建训练</th>
+<th>执行训练</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<span style="background-color:#B3D9D9">用户</span>：描述前向运算<br><span style="background-color:#DAB1D5;">框架</span>：添加反向运算<br><span style="background-color:#DAB1D5;">框架</span>：添加优化运算<br><span style="background-color:#DAB1D5;">框架</span>：添加内存优化<br><span style="background-color:#DAB1D5;">框架</span>：添加并行/多设备/分布式相关的计算单元
+</td>
+
+<td>
+<span style="background-color:#DAB1D5;">框架</span>：创建Operator（计算）+ Variable（数据）<br><span style="background-color:#DAB1D5;">框架</span>：创建`Block`<br><span style="background-color:#DAB1D5;">框架</span>：内存管理/设备管理<br><span style="background-color:#DAB1D5;">框架</span>：执行计算
+</td>
+</tr>
+</tbody>
+</table>
+</font>
+
+---
+
+### <p align="center">总结：编译时</p>
+<font size=5>
+
+<span style="background-color:#A3D1D1;">**用户编写一段Python程序，描述模型的前向计算**</span>
+1. 创建变量描述 `VarDesc`
+1. 创建operators的描述 `OpDesc`
+1. 创建operators的属性
+1. 推断变量的类型和形状，进行静态检查：`inferShape`
+1. 规划变量的内存复用
+1. 创建反向计算
+1. 添加优化相关的Operators
+1. （可选）添加多卡/多机相关的Operator，生成在多卡/多机上运行的程序
+
+</font>
+
+---
+
+### <p align="center">总结：运行时</p>
+<font size=5>
+
+<span style="background-color:#C7C7E2;">**执行规划好的计算**</span>
+1. 创建`Executor`
+1. 为将要执行的一段计算，在层级式的`Scope`空间中创建`Scope`
+1. 创建`Block`，依次执行`Block`
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/compile_run_time.png" width=50%><br>
+<font size=3> Figure. 编译时运行时概览</font>
+</p>
+
+</font>
+
+---
+<!-- *template: invert -->
+## ==3==. 用户如何描述计算？
+---
+
+### Fluid：==像写程序一样==定义计算
+<font size=5>
+
+- 顺序执行
+    ```python
+    x = fluid.layers.data(name='x',shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    ```
+
+- 条件分支: [swith](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/switch.md)、[ifelse](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md)
+
+   ```python
+   a = fluid.Var(10)
+   b = fluid.Var(0)
+
+   switch = fluid.switch()
+   with switch.block():
+      with switch.case(fluid.less_equal(a, 10)):
+          fluid.print("Case 1")
+      with switch.case(fluid.larger(a, 0)):
+          fluid.print("Case 2")
+      with switch.default():
+          fluid.print("Case 3")
+   ```
+
+>[A Lisp cond form may be compared to a continued if-then-else as found in many algebraic programming languages](https://www.cs.cmu.edu/Groups/AI/html/cltl/clm/node84.html).
+
+</font>
+
+---
+
+### Fluid: ==像写程序一样==定义计算
+
+<font size=5>
+
+- 循环：[while](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+  ```python
+  d0 = layers.data("d0", shape=[10], dtype='float32')
+  data_array = layers.array_write(x=d0, i=i)
+  array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
+
+  cond = layers.less_than(x=i, y=array_len)
+  while_op = layers.While(cond=cond)
+  with while_op.block():
+      d = layers.array_read(array=data_array, i=i)
+      i = layers.increment(x=i, in_place=True)
+      layers.array_write(result, i=i, array=d)
+      layers.less_than(x=i, y=array_len, cond=cond)
+  ```
+
+- 完整实例请点查看 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_while_op.py#L36-L44)
+- beam search  [->]( https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_machine_translation.py#L105)
+
+</font>
+
+---
+
+#### <p align="center">总结</p>
+
+<font size=5>
+
+1. 用户层提供的描述语法具有完备性、自洽性，有能力支持对复杂计算过程描述
+1. 使用方式和核心概念可以类比编程语言，认知能够直接迁移
+1. 能够支持：定义问题，逐步求解
+
+</font>
+
+---
+
+## ==3.== 核心概念
+
+---
+### 编译时概念 ：==变量和计算的描述==
+
+<font size=5>
+
+- `VarDesc` + `TensorDesc` + `OpDesc` -> `BlockDesc` -> `ProgramDesc`
+    - https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto
+
+- <span style="background-color:#DAB1D5;">什么是 Fluid Program</span>
+
+  - 在Fluid中，一个神经网络任务（训练/预测）被描述为一段`Program`
+  - `Program`包含对`Variable`（数据）和 `Operator`（对数据的操作）的描述
+  - `Variable` 和 `Operator` 被组织为多个可以嵌套的`Block`，构成一段完整的`Fluid Program`
+
+
+>编译阶段最终，经过 Transpiler 的执行规划，变换处理，生成使用`protobuf`序列化后的`ProgramDesc`。可以发送给多卡或者网络中的其它计算节点执行
+
+</font>
+
+---
+
+### 编译时概念 ：==**[Transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)**==
+<font size=5>
+
+1. 接受一段`ProgramDesc`作为输入，生成一段新的`ProgramDesc`
+
+    - *Memory optimization transpiler*：向原始`ProgramDesc` 中插入 `FreeMemoryOps`，在一次迭代优化结束前提前释放内存，使得能够维持较小的 memory footprint
+
+    - *Distributed training transpiler*：将原始的`ProgramDesc`中转化为对应的分布式版本，生成两段新的`ProgramDesc`:
+        1. trainer进程执行的`ProgramDesc`
+        1. parameter server执行的`ProgramDesc`
+
+1. ==**WIP**==: 接受一段`ProgramDesc`，生成可直接被`gcc`, `nvcc`, `icc`等编译的代码，编译后得到可执行文件
+
+</font>
+
+---
+### Transplier
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/transpiler.png" width=70%>
+</p>
+
+---
+
+### 打印 `ProgramDesc`
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/print_fluid_program.png" width=70%>
+</p>
+
+<font size=5>
+
+- `default_startup_program`：创建可学习参数，对参数进行初始化
+- `default_main_program`：由用户定义的模型，包括了前向、反向、优化及所有必要的计算
+
+- 打印可读的 `Program`
+  ```python
+  from paddle.v2.fluid import debuger
+  print debuger.pprint_program_codes(framework.default_main_program().desc)
+  ```
+</font>
+
+---
+### 输出效果
+
+<font size=5>
+
+<table>
+<thead>
+<th>variable in block 0</th>
+<th>variable in block 0</th>
+</thead>
+<tbody>
+<tr>
+<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc1.png" width=70%></td>
+<td><img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/program_desc2.png" width=70%></td>
+</tr>
+</tbody>
+</table>
+</font>
+
+---
+
+### 运行时概念
+
+<font size=5>
+
+- 数据相关
+  - `Tensor` / `LoDTensor` / `Variable`
+  - `Scope`
+
+- 计算相关
+  - `Block`
+  - `Kernel`、`OpWithKernel`、`OpWithoutKernel`
+
+<table>
+<thead>
+<th></th>
+<th>protobuf messages</th>
+<th>C++ class objects</th>
+</thead>
+<tbody>
+<tr>
+<td>Data</td>
+<td>[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L107)
+</td>
+<td>[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h#L24)
+</td>
+</tr>
+
+<tr>
+<td>Operation</td>
+<td>[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto#L35)
+</td>
+<td>[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h#L64)
+</td>
+</tr>
+<tr>
+<td>Block</td>
+<td>BlockDesc
+</td>
+<td>Block
+</td>
+</tr>
+
+
+</tbody>
+</table>
+
+- 执行相关 ：`Executor`
+
+</font>
+
+---
+#### Tensor 和 LoD(Level-of-Detail) Tensor
+<font size=5>
+
+- Tensor 是$n$-dimensional arry的推广，LoDTensor是在Tensor基础上附加了序列信息
+- Fluid中输入、输出，网络中的可学习参数全部统一使用LoDTensor（n-dimension array）表示
+- 一个mini-batch输入数据是一个LoDTensor
+  - 在Fluid中，RNN 处理变长序列无需padding，得益于 `LoDTensor`表示
+  - 可以简单将 LoD 理解为：`std::vector<std::vector<int>>`
+  - 对非序列数据，LoD 信息为空
+
+<table>
+<thead>
+<th></th>
+<th>TensorFlow</th>
+<th>PaddlePaddle</th>
+</thead>
+<tbody>
+<tr>
+<td>RNN</td>
+<td>Support
+</td>
+<td>Support
+</td>
+</tr>
+
+<tr>
+<td>recursive RNN</td>
+<td>Support
+</td>
+<td>Support
+</td>
+</tr>
+<tr>
+<td>padding zeros</td>
+<td>Must
+</td>
+<td>No need
+</td>
+<tr>
+<td>blob data type</td>
+<td>Tensor
+</td>
+<td>LODTensor
+</td>
+
+</tr>
+</tbody>
+</table>
+
+</font>
+
+---
+#### LoD 信息实例
+
+<font size=4>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/LoDTensor.png" width=43%>
+</p>
+
+- 图(a)的LoD 信息
+  ```cpp
+  [0, 5, 8, 10, 14]
+  ```
+- 图(b)的 LoD 信息
+  ```cpp
+  [[0, 5, 8, 10, 14] /*level=1*/, [0, 2, 3, 5, 7, 8, 10, 13, 14] /*level=2*/]
+  ```
+</font>
+
+---
+#### Tensor, Variable, Scope 之间的关系
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/scope_variable_tensor.png" width=40%>
+</p>
+<font size=5>
+
+1. `Block` 是一个实现层的概念，不在应用层暴露给用户。目前用户无法自行创建并利用`Block`，用户能够感知的只有`Program`这个概念。
+1. 逻辑上，可以将 `Block` 类比为编程语言中的大括号：定义了一段作用域，其中运行一段代码
+1. `Executor`会为每一个`Block`创建一个`Scope`，`Block`是可嵌套的，因此`Scope`也是可嵌套的
+
+</font>
+
+---
+### Executor
+
+<font size=5>
+
+<table>
+<thead>
+<th>接口</th>
+<th>说明</th>
+</thead>
+<tbody>
+<tr>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/executor.png" width=60%>
+</p></td>
+<td><span style="background-color:#B3D9D9;">输入</span><br>1. `ProgramDesc`<br>2. `Scope`<br> 3.`block_id`<br><br><span style="background-color:#B3D9D9;">解释执行步骤</span><br>1. 创建所有 Variables<br> 2. 逐一创建 Operator 并运行
+</td>
+</tr>
+</tbody>
+</table>
+
+---
+### Operator/OpWithKernel/Kernel
+<font size=5>
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator1.png" width=50%>
+</p>
+
+- operator 无状态，Operator的核心是==Run==方法
+- 一个operator可以注册多个kernel
+- operator 可以无 kernel：while_op 、ifelse op
+
+</font>
+
+---
+#### Fluid Operator vs. PaddlePaddle layers
+<font size=5>
+
+<table>
+<thead>
+<th>Layer</th>
+<th>Operator</th>
+</thead>
+<tbody>
+<tr>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/layer.png" width=70%>
+</p></td>
+<td><p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/operator2.png" width=73%>
+</p></td>
+</tr>
+
+<tr>
+<td>1. 内部维护状态<br>2. 包含forward和backward方法</td>
+<td>1. 内部无状态<br>2. 只有Run方法</td>
+</tr>
+</tbody>
+</table>
+
+</font>
+
+---
+
+### ==4.== 内存管理
+
+---
+### 目标
+
+- 为异构设备提供统一的内存分配、回收接口
+- 最小化管理内存所需的时间，最小化管理开销
+- 减少内存碎片
+- 将内存管理与计算（Operators/Kernels）完全剥离
+- 统一内存管理是内存优化的基础
+
+---
+
+<font size=5>
+
+### Memory 接口
+
+- 内存管理模块向上层应用逻辑提供三个基础接口：
+  ```cpp
+  template <typename Place>
+  void* Alloc(Place place, size_t size);
+
+  template <typename Place>
+  void Free(Place place, void* ptr);
+
+  template <typename Place>
+  size_t Used(Place place);
+
+  struct Usage : public boost::static_visitor<size_t> {
+    size_t operator()(const platform::CPUPlace& cpu) const;
+    size_t operator()(const platform::CUDAPlace& gpu) const;
+  };
+  ```
+- 模板参数 `Place` 指示内存分配发生的设备
+- 实现时，需特化支持的 `Place`， 提供以上三个接口的实现
+
+</font>
+
+---
+### 代码结构
+
+<font size=5>
+
+内存管理模块可以理解为由以下两部分构成：
+
+1. SystemAllocator：实际从物理设备上分配、释放的内存的接口
+1. BuddyAllocator：内存管理算法
+
+</font>
+
+---
+### System Allocator
+
+<font size=5>
+
+- SystemAllocator 是实现物理内存分配、回收的基类
+    - 不同设备上的内存分配和回收终将转化为标准接口调用
+    - 为不同设备实现MemoryAllocator，继承自SystemAllocator
+
+  ```cpp
+  class SystemAllocator {
+   public:
+    virtual ~SystemAllocator() {}
+    virtual void* Alloc(size_t& index, size_t size) = 0;
+    virtual void Free(void* p, size_t size, size_t index) = 0;
+    virtual bool UseGpu() const = 0;
+  };
+  ```
+</font>
+
+---
+
+### CPU/GPU Allocator
+
+<font size=5>
+
+```cpp
+class CPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+};
+
+#ifdef PADDLE_WITH_CUDA
+class GPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+ private:
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
+};
+#endif
+```
+- CPUAllocator和GPUAllocator分别继承自SystemAllocator，分别调用相应的标准库函数实现物理内存的分配和释放。
+- 一旦大块、连续的物理内存分配之后，将通过内存管理算法实现内存的按块分配、回收、重用等。
+
+</font>
+
+---
+### CPU Allocator
+
+<font size=5>
+
+- CPU 内存的分配提供两种选项：
+    1. non-pinned memory：可分页内存
+    2. pinned memory：页锁定内存
+        - 分配过大的页锁定内存有可能因为系统可使用的分页内存减少，影响系统性能，默认CPU下分配的是可分页内存
+
+- 通过gflags进行设置一次性分配内存的大小以及是否使用页锁定内存。
+
+   ```cpp
+   DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+   DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+                 "Default use 100% of CPU memory for PaddlePaddle,"
+                 "reserve the rest for page tables, etc");
+   ```
+
+</font>
+
+---
+### GPU Allocator
+
+<font size=5>
+
+- 通过 cudaMalloc 分配GPU显存
+- GPUAllocator::Alloc 首先会计算指定GPU device上的可用显存
+    - 如果可用显存小于请求分配大小，调用cudaMalloc进行分配
+    - 如果可用显存不足，目前会报错退出。
+- 通过gflags控制GPU下一次性分配显存的大小：
+
+  ```cpp
+  DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+                "Default use 92% of GPU memory for PaddlePaddle,"
+                "reserve the rest for page tables, etc");
+  ```
+
+</font>
+
+---
+#### 内存管理算法:  [Buddy Memory Allocation](https://en.wikipedia.org/wiki/Buddy_memory_allocation)
+
+<font size=5>
+
+- Memory Arena：一次性分配大块连续内存，之后会基于这块内存进行内存管理：动态分配、释放、重用内存块。
+- 伙伴内存分配：
+    - 将内存划分为 2 的幂次方个分区，使用 best-fit 方法来分配内存请求。
+    - 当释放内存时，检查 buddy 块，查看相邻的内存块是否也已被释放。如果是，将内存块合并，以最小化内存碎片。
+    - 分配的内存在物理内存的自然边界对齐，提高内存访问效率。
+    - 算法的时间效率高，单使用 best-fit 方法的缘故，会产生一定的内存浪费
+
+</font>
+
+---
+
+### Buddy Allocator
+
+<font size=5>
+
+- BuddyAllocator 是一个单例，每个设备（如： GPU/CPU(0)/GPU(1)） 拥有一个BuddyAllocator
+- BuddyAllocator 内部拥有一个私有成员变量 SystemAllocator
+- 当请求的内存超过BuddyAllocator管理的空余内存时，将会调用SystemAllocator去指定的设备上分配物理内存
+
+</font>
+
+---
+### 实例：CPU 下内存管理接口的实现
+
+<font size=5>
+
+- 对上层应用，统一通过BuddyAllocator来实现内存的分配、释放以及用量查询
+    ```cpp
+    template <>
+    void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+      VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+      void* p = GetCPUBuddyAllocator()->Alloc(size);
+      VLOG(10) << "  pointer=" << p;
+      return p;
+    }
+
+    template <>
+    void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+      VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+      GetCPUBuddyAllocator()->Free(p);
+    }
+
+    template <>
+    size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
+      return GetCPUBuddyAllocator()->Used();
+    }
+    ```
+</font>
+
+---
+### ==5.== 多设备支持
+
+---
+### 多设备支持（一）
+
+<font size=5>
+
+- step 1：添加Place类型，<span style="background-color:#DAB1D5;">由用户实现添加到框架</span>
+   - 可以将Place类型理解为一个整数加上一个枚举型，包括：设备号 + 设备类型
+
+    <p align="center">
+    <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/place.png" width=40%>
+    </p>
+- DeviceContext
+    - 不同的Place会对应一个相应的DeviceContext，用于组织管理与设备相关的信息
+      - 例如，GpuDeviceContext中会管理Cuda stream
+    - 目前实现中一些特殊的库也会对应有自己的DeviceContext：例如：
+      ```cpp
+      class MKLDNNDeviceContext : public CPUDeviceContext {……}
+      ```
+    - 每种设备对应的DeviceContext需要管理的内容不尽相同，视具体需求来实现
+
+</font>
+
+---
+
+### 多设备支持（二）
+
+<font size=5>
+
+- step 2: 增加KernelType，为相应的KernelType注册Kernel对象，<span style="background-color:#DAB1D5;">由用户实现注册给框架</span> 可以按照：
+    1. Place 执行设备
+    1. DataType 执行数据类型 FP32/FP64/INT32/INT64
+    1. Memory layout： 运行时 Tensor 在内存中的排布格式 NCHW、 NHWC
+    1. 使用的库
+
+    来区分Kernel，为同一个operator注册多个 Kernel。
+
+    ```cpp
+    struct OpKernelType {
+      proto::DataType data_type_;
+      DataLayout data_layout_;
+      platform::Place place_;
+      LibraryType library_type_;
+    }
+    ```
+
+</font>
+
+---
+
+### 多设备支持（三）
+
+<font size=5>
+
+step 3: 运行时的 KernelType 推断和Kernel切换，<span style="background-color:#DAB1D5;">按需要修改Kernel推断和Kernel切换规则</span>
+- Expected Kernel：期待调用的Kernel：由（1）`Place`和计算精度决定；或（2）用户在配置中显示指定使用的计算库，如`cudnn`、`mkldnn`等。
+- Actual Kernel：运行时从`Operator`的输入（`Variable`）可以推断出实际需要的`KernelType`
+- 当Expected Kernel和Actual Kernel不一致的时候，框架会插入`data_transformer`或者`data_layerout_transform`等，保证Expected Kernel可以执行，包括：
+   - CPUPlace -> GPUPlace ：跨设备内存复制
+   - NCHW -> nChw8c ：Layout转换
+   - FP32 -> FP16 ：精度转换 _**尚未支持**_
+   - ……
+- 以上过程实现在OperatorWithKernel类的Run方法中 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.cc#L497)
+
+</font>
+
+---
+## ==6.== while_op
+
+---
+### while_op
+
+<font size=5>
+
+- 循环执行一段`Program`，直到条件operator判断循环条件不满足时终止循环
+- while_op 的特殊之处：
+  1. while_op 没有 kernel
+  1. while_op 拥有自己的`Block`，会形成一段嵌套的`Block`
+  1. ==while_op 内部创建了一个 Executor，来循环执行`Block`==
+
+- while_op 输入输出 ： LoDTensorArray
+    ```cpp
+    namespace paddle {
+    namespace framework {
+    using LoDTensorArray = std::vector<LoDTensor>;
+    }
+    }
+    ```
+    - 每一次循环，从原始输入中“切出”一个片段
+    - LoDTensorArray 在Python端暴露，是Fluid支持的基础数据结构之一，用户可以直接创建并使用
+
+</font>
+
+---
+### while_op [Run](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/while_op.cc#L42) 方法概览
+
+<font size=5>
+
+```cpp
+
+void Run(const framework::Scope &scope,
+         const platform::Place &dev_place) const override {
+  PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+  auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
+  PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+  framework::Executor executor(dev_place);
+  auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+  auto *program = block->Program();
+  auto step_scopes =
+      scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
+
+  while (cond.data<bool>()[0]) {
+    auto &current_scope = scope.NewScope();
+    step_scopes->push_back(&current_scope);
+    executor.Run(*program, &current_scope, block->ID(),
+                   false /*create_local_scope*/);
+  }
+}
+
+```
+
+</font>
+
+---
+### while_op 的重要应用：Dynamic RNN
+
+---
+
+### 什么是 `dynamicRNN` ?
+
+<font size=5>
+<br>
+
+1. 用户可以自定义在一个时间步之内的计算, 框架接受序列输入数据，在其上循环调用用户定义的单步计算
+1. 可学习参数在多个时间步之间共享
+1. `dynamicRNN` 由 `while_op` 实现
+1. 如果`dynamicRNN`中定义了`memory`，将会构成一个循环神经网络，否则其行为就等于在输入序列上循环调用预定义的单步计算
+
+</font>
+
+---
+
+#### `dynamic RNN` 用户接口
+<font size=5>
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/user_interface.png" width=75%>
+</p>
+
+- `dynamicRNN` 中的重要元素
+  1. **step input**: `dynamicRNN` 每个时间步的输入
+  1. **step function**: 用户定义的单步计算
+  1. **memory**: 用于形成循环连接
+  1. **external/static memory**：单步计算的每一步都可以全部读取到的外部输入
+
+</font>
+
+---
+
+#### dynamicRNN 中的 Memory
+
+<font size=5>
+
+`dynamicRNN`中`memory`的行为非常类似于 C++ 中的引用变量
+  - `memory` “指向” 一个operator的输出变量，记作： A
+  - `memory` 可以被 LoDTensor 初始化（当LoD信息为空时，为非序列，否则为序列）,默认`memory`被初始化为零
+  - `memory` 在 operator A 前向计算之后，进行前向计算
+  - 当 `memory` 的前向计算会 "指向" A 的输出 LoDTensor
+  - `memory` 的输出可以是另一个 operator 的输入，于是形成了“循环”连接
+
+</font>
+
+---
+
+### DynamicRNN 实现细节
+
+<font size=5>
+
+- `while_op` <span style="background-color:#DAB1D5;">无法独立构成dynamicRNN</span>，必须和一组相关的 operator 及数据结构配合
+    - 依赖的 operators (这里仅列出最重要的，并非全部):
+        - `lod_rank_table` operator
+        - `lod_tensor_to_array` operator
+        - `array_to_lod_tensor` operator
+        - `shrink_memory` operator
+    - 依赖的数据结构
+        - `TensorArray`
+        - `LoDRankTable`
+
+- 在Fluid中，RNN接受变长序列输入，无需填充，以上数据结构和相关的operator配合工作，实现了对变长输入以batch计算
+
+</font>
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+<font size=5>
+
+- 问题：
+  - RNN 可以看作是一个展开的前向网络，前向网络的深度是最长序列的长度
+  - 如果不对变长序列进行填充，将它们填充到一样长度，每个mini-batch输入将会不等长，每个样本展开长度不一致，导致前向和反向计算实现困难
+
+</font>
+
+----
+##### 实例 ：RNN encoder-decoder with attention
+
+<font size=5>
+
+- 以机器翻译的RNN encoder-decoder 模型（涉及了`dynamicRNN`的所有设计要素）为例，下图是 RNN encoder-decoder 的原始输入：
+  <p align="center">
+  <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/raw_input.png" width=100%><br><font size=3> Figure. RNN encoder-decoder 原始batch 输入数据</font>
+  </p>
+
+- source word sequences 是encoder RNN的输出，是一个LoDTensor
+- target word sequences 是look_uptable的输入，是一个LoDTensor
+- 上图中一个矩形方块是CPU/GPU内存中一片连续的内存空间，表示一个dense vector
+
+</font>
+
+---
+
+### `dynamicRNN` 如何实现 batch 计算 ?
+
+<font size=5>
+
+1. 对一个mini batch中不等长样本进行排序，最长样本变成batch中的第一个，最短样本是batch中最后一个
+      - `LoDTensor` -> `LoDRankTable` :heavy_plus_sign: `lod_rank_table operaator`
+          - 可以将`LoDRankTable`理解为对LoDTensor中的多个序列按照长度排序LoDRankTable 存储了排序之后的index
+
+2. 构建每个时间步的batch输入：随着时间步增加，每个时间步的batch输入可能会逐渐缩小
+    - `TensorArray` :heavy_plus_sign: `lod_tensor_to_array` -> `LoDTensor` (without LoD)
+3. 每个时间步输出写入一个输出 `LoDTensorArray`
+3. `dynamicRNN`循环结束后, 按照`LoDRankTable`中记录的信息对输出`LoDTensorArray`重排序，还原会原始输入顺序
+    - `TensorArray` :heavy_plus_sign: `array_to_lod_tensor` -> `LoDTensor`
+
+</font>
+
+---
+
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/sorted_input.png" width=100%>
+</p>
+
+---
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=100%>
+</p>
+
+<font size=5>
+
+- 执行到第5~7个batch时，batch size将会缩小
+
+</font>
+
+---
+### 运行实例
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/1.png" width=80%>
+</p>
+
+<font size=5>
+
+- 第5 ~ 7个batch时RNN的`memory`会发生什么？
+    - `memory` 指向某个operator的输出Tensor，在该operator前向计算之后，“取回”其计算结果
+    - 5 ~ 7时，遇到了序列的结束，==下一个时间步计算不再需要在已经结束的序列上展开==
+    - 在`dynamicRNN`中`shrink_memory` operator 用来缩小`memory`的batch输入
+
+</font>
+
+---
+### 运行实例：batch 1 ~ 2
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/2.png" width=70%><br><font size=4>Figure. 第1、2个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+### 运行实例：batch 3 ~ 4
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/3.png" width=70%><br><font size=4>Figure. 第3、4个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+
+### 运行实例：batch 5 ~ 7
+
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/4.png" width=70%><br><font size=4>Figure. 第5、6、7个batch输入dynamicRNN的batch输入</font>
+</p>
+
+---
+### ==7.== Fluid 代码结构
+
+---
+### Fluid 代码结构
+
+<table>
+<thead>
+<tr>
+<th>代码结构</th>
+<th>模块结构</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_1.png" width=60%>
+</p>
+</td>
+<td>
+<p align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_module_2.png" width=60%>
+</p>
+</td>
+</tr>
+
+</tbody>
+</table>
+
+---
+
+### ==8.== 文档总结
+
+---
+<font size=5>
+
+- 设计概览
+  - 重构概览 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/refactorization.md)
+  - fluid [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md)
+  - fluid_compiler [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/motivation/fluid_compiler.md)
+- 核心概念
+  - variable 描述 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/var_desc.md)
+  - Tensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.md)
+  - LoDTensor [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+  - TensorArray [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md)
+  - Program [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md)
+  - Block [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md)
+  - Scope [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)
+
+---
+
+- 重要功能模块
+  - backward [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/backward.md)
+  - 内存优化 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/memory_optimization.md)
+  - evaluator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/executor.md)
+  - python API [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md)
+  - regularization [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/regularization.md)
+
+- 开发指南
+  - 支持新设硬件设备库 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md)
+  - 添加新的Operator [->](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_cn.md)
+  - 添加新的Kernel [->](
+https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_kernel_en.md)
+
+</font>
+
+---
+
+### ==9.== 开发指南
+
+---
+
+#### 建议开发环境：使用 Docker 编译和测试
+
+<font size=5>
+
+Docker编译PaddlePaddle源码: [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)
+
+PaddlePaddle 在 Dockerhub 地址：[->](
+    https://hub.docker.com/r/paddlepaddle/paddle/tags/)
+
+1. 获取PaddlePaddle的Docker镜像
+    ```bash
+    docker pull paddlepaddle/paddle:latest-dev
+    ```
+
+1. 启动 docker container
+
+    ```bash
+    docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+    ```
+
+1. 进入docker container后，从源码编译，请参考文档 [->]( http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/build_from_source_cn.html)
+
+</font>
+
+---
+
+### 一些说明
+
+<font size=5>
+
+1. PaddlePaddle的Docker镜像为了减小体积，默认没有安装vim，可以在容器中执行`apt-get install -y vim`来安装vim。
+1. 开发推荐使用tag为`latest-dev`的镜像，其中打包了所有编译依赖。`latest`及`lastest-gpu`是production镜像，主要用于运行PaddlePaddle程序。
+2. 在Docker中运行GPU程序，推荐使用nvidia-docker，[否则需要将CUDA库和设备挂载到Docker容器内](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/build_and_install/docker_install_cn.html)。
+   <font size=4>
+
+   ```bash
+   nvidia-docker run -it -v $PWD/Paddle:/paddle paddlepaddle/paddle:latest-dev /bin/bash
+   ```
+   </font>
+
+
+</font>
+
+---
+
+### [如何贡献](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+
+<font size=5>
+
+- ==提交PullRequest前请务必阅读==： [->](http://www.paddlepaddle.org/docs/develop/documentation/fluid/zh/dev/contribute_to_paddle_cn.html)
+- 代码要求
+    1. 代码注释遵守 Doxygen 的样式
+    1. 确保编译器选项 WITH_STYLE_CHECK 已打开，并且编译能通过代码样式检查
+    1. 所有代码必须具有单元测试，且能够通过所有单元测试
+- 使用 `pre-commit` 钩子提交Pull Request
+    1. 帮助格式化源代码（C++，Python）
+    1. 在提交前自动检查一些基本事宜：如每个文件只有一个 EOL，Git 中不要添加大文件等
+    1. 安装pre-commit，并在PaddlePaddle根目录运行：
+    ```bash
+      ➜  pip install pre-commit
+      ➜  pre-commit install
+    ```
+</font>
+
+---
+
+### 如何贡献
+
+<font size=5>
+
+1. 开始开发之前请先建立issue。
+    - 让其它同学知道某项工作已经有人在进行，以避免多人开发同一功能的情况。
+1. 提交PR必须关联相关的issue。做法请参考：[->](https://help.github.com/articles/closing-issues-using-keywords/)
+    - 目的：为了在提交的版本中留有记录描述这个PR是为了开发什么样的功能，为了解决什么样的问题。
+    - 当PR被merge后，关联的issue会被自动关闭。
+1. PR review 中，reviewer的每条comment都必须回复。
+    - 如修改完可直接回复：Done。
+    - 目的：review comment 中可能会有（1）询问类型的问题；（2）可以在下一个PR修改的问题；（3）comment意见不合理等。需要明确回复，以便reviewer和其他人有历史可查，便于区分是否已经进行修改，或者准备下一个PR修改，或者意见不合理可以不用进行修改。
+
+</font>
+
+---
+
+### ==10.== 添加新的 Operator
+
+---
+
+### 概念简介
+
+<font size=5>
+
+添加一个新的operator，会涉及实现以下C++类的派生类：
+
+1. `framework::OperatorBase`: Operator(简写，Op)基类。
+1. `framework::OpKernel`: Op计算函数的基类，称作Kernel。
+1. `framework::OperatorWithKernel`：继承自OperatorBase，Op有计算函数，称作有Kernel。
+1. `class OpProtoAndCheckerMaker`：描述该Op的输入、输出、属性、注释,主要用于Python API接口生成
+
+依据是否包含kernel，可以将Op分为两种：
+1. 包含Kernel的Op：继承自OperatorWithKernel，==绝大多数operator都属于这一类==
+1. 不包含kernel的Op，继承自OperatorBase，只有少量Op属于这一类，例如while_op，ifelse_op
+
+<span style="background-color:#DAB1D5;">这里主要介绍带Kernel的Op如何编写。</span>
+
+</font>
+
+---
+
+#### 添加新的Operator需要修改/添加哪些文件？
+
+<font size=5>
+
+<table>
+<thead>
+<tr>
+<th>内容</th>
+<th>定义位置</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>
+OpProtoMake定义
+</td>
+<td>
+`.cc`文件，<span style="background-color:#DAB1D5;">Backward Op不需要OpProtoMaker</span>
+</td>
+</tr>
+<tr>
+<td>
+Op定义
+</td>
+<td>
+`.cc`文件
+</td>
+</tr>
+<tr>
+<td>
+Kernel实现
+</td>
+<td>
+<span style="background-color:#DAB1D5;">CPU、CUDA共享Kernel实现在`.h`文件中</span>，否则，CPU 实现在`.cc`文件中，CUDA 实现在`.cu`文件中。
+</td>
+</tr>
+
+<tr>
+<td>
+注册Op
+</td>
+<td>
+Op注册实现在`.cc`文件；Kernel注册CPU实现在`.cc`文件中，CUDA实现在`.cu`文件中
+</td>
+</tr>
+
+</tbody>
+</table>
+
+- 添加 Operator 之前请阅读：[Operator 命名规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/name_convention.md)及[Operator Markdown注释规范](https://github.com/PaddlePaddle/Paddle/blob/63cca04cfd488a4dab6d6273fd04a8017ef45932/doc/fluid/dev/op_markdown_format.md)。
+- 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators)下，文件命名以`*_op.h`（如有） 、 `*_op.cc` 、`*_op.cu`（如有）结尾。
+- 根据文件名自动构建op和Python端绑定，<span style="background-color:#DAB1D5;">请务必遵守以上命名，否则需要进一步修改PyBind相关文件及CMakeLists.txt</span>。
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step1</span>: 定义ProtoMaker类
+
+<font size=5>
+
+下面均以[clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h)为例进行介绍
+
+- clip_op计算公式：$Out = \min(\max(X, min), max)$
+- 首先定义`ProtoMaker`来描述该Op的输入、输出，并添加注释（<font size=4>*下面代码段的中注释进行了简化，实现时需按照规范添加注释*</font>）：
+
+    ```cpp
+    template <typename AttrType>
+    class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
+     public:
+      ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+          : OpProtoAndCheckerMaker(proto, op_checker) {
+        AddInput("X","(Tensor)The input of clip op.");
+        AddOutput("Out", "(Tensor),The output of clip op.");
+        AddAttr<AttrType>(
+            "min", "(float),Minimum value.");
+        AddAttr<AttrType>(
+            "max", "(float),Maximum value.");
+        AddComment(R"DOC(
+        ……
+    )DOC");
+      }
+    };
+    ```
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step2</span>: 定义Operator类
+
+<font size=5>
+
+下面的代码段实现了`clip_op`的定义：
+
+```cpp
+class ClipOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
+    PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+```
+</font>
+
+---
+
+### Operator 类中需要完成的工作
+
+<font size=5>
+
+1. clip_op 继承自`OperatorWithKernel`，
+
+    ```cpp
+    using framework::OperatorWithKernel::OperatorWithKernel;
+    ```
+    表示使用基类`OperatorWithKernel`的构造函数。
+
+1. 重写`InferShape`接口。
+    - `InferShape` 为const函数，不能修改Op的成员变
+    - `InferShape` 的参数为 `const framework::InferShapeContext &ctx`，从中可获取到输入输出以及属性
+    - `InferShape` 会被调用两次，一次是编译时（创建op），一次是运行时（调用op的`Run`方法时），需要完成以下功能：
+        1. 做检查， 尽早报错：检查输入数据维度、类型等是否合法
+        2. 设置输出Tensor的形状
+
+<span style="background-color:#DAB1D5;">通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中。</span>
+
+</font>
+
+---
+
+### 补充说明
+
+<font size=5>
+
+1. `InferShape`目前支持两种实现方式，<span style="background-color:#DAB1D5;">二者最后都会生成一个functor注册给OpInfo结构体。</span>
+    1. 继承framework::InferShapeBase，实现为一个functor（参考 [mul_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/mul_op.cc#L22)）
+    2. override InferShape函数（参考 [clip_op](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.cc#L24)）
+
+1. 什么是`functor` ?
+
+   - 类或结构体仅重载了`()`，一般是可被多个kernel复用的计算函数。
+
+        <font size=4>
+
+        ```cpp
+        template <typename T>
+        class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
+         public:
+          void operator()(const platform::CPUDeviceContext& ctx,
+                          framework::Tensor* out,
+                          const framework::Tensor* prob,
+                          const framework::Tensor* labels, const bool softLabel) {
+               ……
+          }
+        };
+        ```
+        </font>
+
+    - 在 clip_op 内也会看到将一段计算函数抽象为functor的使用法： [->](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/operators/clip_op.h#L27)。
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step3</span>: 定义OpKernel类
+
+<font size=5>
+
+- `ClipKernel`继承自`framework::OpKernel`，带有下面两个模板参数:
+    1. `typename DeviceContext`: 表示设备类型，不同设备共享同一个Kernel时，需添加该模板参数。不共享时，需要提供针对不同设备的特化实现。
+    1. `typename T` : 表示支持的数据类型，如`float`, `double`等
+
+- 在`ClipKernel`类中重写`Compute`方法
+    1. `Compute`接受输入参数：`const framework::ExecutionContext& context`
+        - `ExecutionContext` 是从 `Scope`中将运行时Op的输入、输出`Variable`组织在一起，使得Op在调用`Compute`方法时，能够简单地通过名字拿到需要的输入输出`Variable`
+        - 与`InferShapeContext`相比，`ExecutionContext` 中增加了设备类型
+    1. 在`Compute`函数里实现`OpKernel`的具体计算逻辑
+
+</font>
+
+---
+#### ClipKernel 代码概览
+
+<font size=5>
+
+```cpp
+template <typename DeviceContext, typename T>
+class ClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    int64_t numel = x->numel();
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_data,
+          x_data + numel, out_data, ClipFunctor<T>(min, max));
+  }
+};
+```
+
+- 为了使`OpKernel`的计算过程书写更加简单，并且CPU、CUDA的代码可以复用， Fluid 使用 Eigen 作为基础的矩阵运算库
+- Fluid对Eigen unsupported Tensor提供了一些基本的封装，可以在`Compute`接口中直接调用
+    - 关于在PaddlePaddle中如何使用Eigen库，请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/use_eigen_cn.md)。
+
+</font>
+
+---
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step4</span>: 实现反向Op
+
+<font size=5>
+
+- ==**反向Op没有`ProtoMaker`**==，除此之外定义与实现方式前向Op完全一致，不再赘述
+- 这里仅对反向Op的输入输出进行说明：
+    1. 反向Op的输入
+        - 前向Op的输出
+        - 反向传播过程中传递给当前Op的梯度
+            - 需要注意，<span style="background-color:#e1c4c4;">Fluid中，不区分Cost Op和中间层Op，所有Op都必须正确处理接收到的梯度</span>
+    2. 反向Op的输出
+        - 对可学习参数的求导结果
+        - 对所有输入的求导结果
+
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step5</span>: 注册Op及Kernel
+
+<font size=5>
+
+至此Op和Op kernel都已经实现完毕，接下来，需要在`.cc`和`cu`文件中注册op和kernel
+
+1. 在`.cc`文件中注册前向、反向Op类，注册CPU Kernel。
+
+    <font size=4>
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
+                ops::ClipOpGrad);
+    REGISTER_OP_CPU_KERNEL(
+        clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
+    REGISTER_OP_CPU_KERNEL(
+        clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ```
+
+   - 在上面的代码片段中：
+
+     1. `REGISTER_OP` ： 注册`ops::ClipOp`类，类型名为`clip`，该类的`ProtoMaker`为`ops::ClipOpMaker`，注册`ops::ClipOpGrad`，类型名为`clip_grad`
+     1. `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op，例如：优化算法相关的Op
+     1. `REGISTER_OP_CPU_KERNEL` ：注册`ops::ClipKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::ClipGradKernel`类
+
+    </font>
+1. 按照同样方法，在`.cu`文件中注册GPU Kernel
+   -  <span style="background-color:#e1c4c4;">如果CUDA Kernel的实现基于Eigen，需在 `.cu`的开始加上宏定义 `#define EIGEN_USE_GPU` </span>
+
+</font>
+
+---
+
+##### 编译和Python端绑定
+
+<font size=5>
+
+- 运行下面命令可以仅编译新添加的Op：
+
+  ```
+  make mul_op
+  ```
+  - <span style="background-color:#e1c4c4;">需注意，运行单元测试需要编译整个工程</span>
+
+- 如果遵循前文的文件命名规则，构建过程中，会自动为新增的op添加Python端绑定，并链接到生成的lib库中
+
+</font>
+
+---
+
+###### 实现带Kernel的Operator <span style="background-color:#c4e1e1;">step6</span>: 添加前向单测及梯度检测
+
+<font size=5>
+
+- 新增Op的单元测试统一添加至：[python/paddle/v2/fluid/tests/unittests](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/unittests)目录
+- 前向Operator单测
+
+    1. Op单元测试继承自`OpTest`，各项具体的单元测试在`TestClipOp`里完成，所有单测case都以`TestXX`命名
+    1. 单元测试Operator，需要：
+        1. 在`setUp`函数定义输入、输出，以及相关的属性参数
+        1. 生成随机的输入数据
+        1. 在Python脚本中实现与前向operator相同的计算逻辑，得到输出值，与operator前向计算的输出进行对比
+        1. 反向梯度检测流程测试框架已经实现，直接调用相应接口`check_grad`即可
+
+- `clip_op` 单测代码请参考 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_clip_op.py)，这里不再展开
+
+</font>
+
+---
+#### 编译执行单测
+
+<font size=5>
+
+- `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译
+
+    - <span style="background-color:#e1c4c4;">运行单元测试测时需要编译整个工程，并且编译时需要打开`WITH_TESTING`</span>, 即`cmake paddle_dir -DWITH_TESTING=ON`
+- 编译成功后，执行下面的命令来运行单元测试：
+
+  ```bash
+  make test ARGS="-R test_mul_op -V"
+  ```
+
+  或者:
+
+  ```
+  ctest -R test_mul_op
+  ```
+</font>
+
+---
+
+### 添加Op的一些注意事项
+
+<font size=5>
+
+- 为每个Op创建单独的`*_op.h`（如有）、`*_op.cc`和`*_op.cu`（如有）。<span style="background-color:#e1c4c4;">不允许一个文件中包含多个Op</span>，将会导致编译出错。
+- 注册Op时的类型名，需要和该Op的名字一样。<span style="background-color:#e1c4c4;">不允许在`A_op.cc`里面，注册`REGISTER_OP(B, ...)`</span>，会导致单元测试出错。
+- 如果Op<span style="background-color:#e1c4c4;">没有实现CUDA Kernel，不要创建空的`*_op.cu`</span>，会导致单元测试出错。
+- 如果多个Op依赖一些共用的函数，可以创建非`*_op.*`格式的文件来存放，如`gather.h`文件。
+
+</font>
+
+---
+
+### ==10.== 使用相关问题
+
+---
+
+### 定义前向计算
+
+<font size=5>
+
+- 当在python端执行时：
+    ```python
+    import paddle.v2.fluid as fluid
+    ```
+    [`framework.py`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#L1040)定义了两个全局`Program`:
+    ```python
+    # program is a global instance.
+    _main_program_ = Program()
+    _startup_program_ = Program()
+    ```
+
+- 前向定义的过程就是不断往`mian_program`中添加Op和Variable
+- 如果需要执行一个新的`mian_program`时，可以调用调用：
+    ```python
+    def switch_main_program(program):
+        """
+        Switch the main program to a new program.
+        This funtion returns the previous main program.
+        """
+        ……
+    ```
+</font>
+
+---
+
+### 自定义参数的初始化
+
+<font size=5>
+
+- 调用`fluid.ParamAttr(……)`接口，自定义参数的初始化
+
+  ```python
+  w_param_attrs = ParamAttr(name=None,
+      initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+      learning_rate=1.0,
+      regularizer=L1Decay(1.0),
+      trainable=True,
+      clip=GradientClipByValue(-1.0, 1.0),
+  )
+  y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+  ```
+
+- 补充问题：如何创建 `Variable`
+  ```python
+  cur_program = Program()
+  cur_block = cur_program.current_block()
+  new_var = cur_block.create_var(name="X", shape=[-1, 16, 16], dtype="float32")
+  ```
+
+</font>
+
+---
+
+### 添加反向Op
+
+<font size=5>
+
+- 调用`fluid.backward.append_backward(X)`（`X`是一个Variable），来为一段前向`ProgramDesc`添加反Op
+
+    ```python
+    data = fluid.layers.data(name="data", shape=(2,3,4))
+    out = fluid.layers.fc(input=data,size=128,act=None)
+    loss = fluid.layers.reduce_sum(out)
+    fluid.backward.append_backward(loss=loss)
+    ```
+
+- 添加优化相关的Op
+    ```python
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(loss)
+    ```
+
+- 可以随时调用`print(fluid.default_main_program())`来输出当前的`main_program`
+
+- 当构建完成整个`Program`后，调用下面的接口执行内存优化：
+  ```python
+  fluid.memory_optimize(fluid.default_main_program())
+  ```
+  - _<span style="background-color:#e1c4c4;">注：内存优化目前仍在持续开发中，有可能不够稳定。</span>_
+
+</font>
+
+---
+
+### 总结：编译时执行流程
+
+<font size=5>
+
+- 用户定义前向计算
+- 添加反向Op到`default_main_program`
+- 添加 gradient clipping Op 到
+- 添加 regularization Op 到`default_main_program`
+- 为指定的优化算法，添加相关的状态 variable of optimizer 到`default_startup_program`
+    - 状态相关 variable是指如学习率, 历史 momentum, 二阶momentum等
+- 添加初始化 variable 的Op 到 `default_startup_program`
+- 为整个网络最后一个op，添加设置其接受到的梯度的Op到`default_main_program`
+- 进行内存优化规划
+
+</font>
+
+---
+
+### Feed 数据 (一)：通过 feed 字典
+
+<font size=5>
+
+- 执行executor的run方法时，指定feed字典，feed op 会将指定的数据放到`x`和`y`两个Variable中
+  ```python
+  y_data = np.random.randint(0, 8, [1]).astype("int32")
+  y_tensor = core.Tensor()
+  y_tensor.set(y_data, place)
+
+  x_data = np.random.uniform(0.1, 1, [11, 8]).astype("float32")
+  x_tensor = core.Tensor()
+  x_tensor.set(x_data, place)
+  ……
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetchlist=[avg_cost])
+  ```
+
+- 这种方法较为底层，一般用于单测中
+
+</font>
+
+---
+
+### Feed 数据 (二)：使用 DataFeeder接口
+
+<font size=5>
+
+- 编写一个data_reader函数，data_reader是一个Python generator
+
+  ```python
+  def demo_reader():
+      def random_generator():
+          yield np.random.uniform(0.1, 1, [4]), np.random.randint(0, 1, [1])
+      return random_generator
+  ```
+- 在训练任务中使用 DataFeeder 接口
+  ```python
+  cost = exe.run(
+      fluid.default_main_program(),
+      feed={'x': x_tensor,
+            'y': y_tensor},
+      fetchlist=[avg_cost])
+
+  train_reader = paddle.batch(
+      paddle.reader.shuffle(demo_reader(), buf_size=500), batch_size=4)
+  feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+  for data in train_reader():
+      cost = exe.run(
+          fluid.default_main_program(),
+          feed=feeder.feed(data),
+          fetch_list=[cost])
+  ```
+
+</font>
+
+---
+
+### 常见问题
+
+<font size=5>
+
+- 如何使用 evaluator ? [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py#L168)
+
+    ```python
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+    for pass_id in range(PASS_NUM):
+        accuracy.reset()
+        for data in train_reader():
+            loss, acc = exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+             pass_acc = accuracy.eval(exe)
+             # acc 当前一个batch 的 accuracy
+             # pass_acc 当前batch 的 accuracy
+         pass_total_acc = accuracy.eval(exe)  # 整个pass的accuracy
+    ```
+
+- 如何在训练中测试？[->](https://github.com/dzhwinter/benchmark/blob/master/fluid/vgg16.py#L144)
+- 如何保存训练好的模型？[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L143)
+- 如何加载训练好的模型进行预测？[->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py#L154)
+- 如何在同一个训练任务中定义多个Program，并交替运行？ [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/demo/fc_gan.py)
+- 如何profile？Fluid 实现了profile 工具，可以直接调用。请参考示例 [->](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/unittests/test_profiler.py)
+
+
+</font>
+
+---
diff --git a/doc/fluid/getstarted/concepts/index_cn.rst b/doc/fluid/getstarted/concepts/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2e7f70fc4cb871a80ffaffec6c06797973cd2f85
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/index_cn.rst
@@ -0,0 +1,4 @@
+基本使用概念
+============
+
+TBD
diff --git a/doc/fluid/getstarted/concepts/index_en.rst b/doc/fluid/getstarted/concepts/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..78cca1e2a3443c2949ca0655190b0f05502f519a
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/index_en.rst
@@ -0,0 +1,4 @@
+Concepts
+============
+
+TBD
diff --git a/doc/design/reader/README.md b/doc/fluid/getstarted/concepts/reader/README.md
similarity index 100%
rename from doc/design/reader/README.md
rename to doc/fluid/getstarted/concepts/reader/README.md
diff --git a/doc/fluid/getstarted/concepts/save_model/model_format.md b/doc/fluid/getstarted/concepts/save_model/model_format.md
new file mode 100644
index 0000000000000000000000000000000000000000..1f12ba0497369eacc6a2db7984781b5672f45ea1
--- /dev/null
+++ b/doc/fluid/getstarted/concepts/save_model/model_format.md
@@ -0,0 +1,76 @@
+# Design Doc: Model Format
+
+## Motivation
+
+A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
+
+As a result, In PaddlePaddle, the **topology** is represented as a  [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large size parameters and efficient serialization/deserialization of parameters.
+
+## Implementation
+
+The topology is saved as a plain text in a detailed self-contain protobuf file.
+
+The parameters are saved as a binary file. As we all know, the protobuf message has a limit of [64M size](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for the task.
+
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto of [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims`, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores values in a continuous memory buffer. For speed we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is,
+
+The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
+
+<table>
+<thead>
+<tr>
+<th>field name</th>
+<th>type </th>
+<th>description </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> version</td>
+<td> uint32_t </td>
+<td> Version of saved file. Always 0 now.</td>
+</tr>
+
+<tr>
+<td> tensor desc length  </td>
+<td> uint32_t </td>
+<td> TensorDesc(Protobuf message) length in bytes. </td>
+</tr>
+<tr>
+<td>tensor desc </td>
+<td> void*</td>
+<td> TensorDesc protobuf binary message </td>
+</tr>
+<tr>
+<td> tensor data </td>
+<td> void* </td>
+<td> Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` </td>
+</tr>
+<tr>
+<td> lod_level</td>
+<td> uint64_t </td>
+<td> Level of LoD </td>
+</tr>
+<tr>
+<td> length of lod[0] </td>
+<td> uint64_t </td>
+<td> [Optional] length of lod[0] in bytes. </td>
+</tr>
+<tr>
+<td> data of lod[0] </td>
+<td> uint64_t*   </td>
+<td> [Optional] lod[0].data() </td>
+</tr>
+<tr>
+<td>... </td>
+<td> ... </td>
+<td> ... </td>
+</tr>
+</tbody>
+</table>
+
+## Summary
+
+- We introduce a model format.
+- The model represented by its forward-pass computation procedure is saved in a **ProgramDesc** protobuf message.
+- A bunch of specified format binary tensors describe the **parameters**.
diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3daea71d0933a2774227ff2b5e744392ca6b1765
--- /dev/null
+++ b/doc/fluid/getstarted/index_cn.rst
@@ -0,0 +1,20 @@
+新手入门
+============
+
+
+如果需要快速了解PaddlePaddle的使用，可以参考以下指南。
+
+..  toctree::
+  :maxdepth: 1
+
+  quickstart_cn.rst
+
+
+在使用PaddlePaddle构建应用时，需要了解一些基本概念。
+这里以一个线性回归为例子，详细介绍了PaddlePaddle的使用流程，包括数据格式，模型配置与训练等。
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/use_concepts_cn.rst
+  developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fb20bb4f245281c3acf67c417979dc63c144fef3
--- /dev/null
+++ b/doc/fluid/getstarted/index_en.rst
@@ -0,0 +1,19 @@
+GET STARTED
+============
+
+If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
+
+..  toctree::
+  :maxdepth: 1
+
+  quickstart_en.rst
+
+While using PaddlePaddle to build applications, please understand some basic concepts.
+
+Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc.
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/index_en.rst
+  developer's_guide_to_paddle_fluid.md
diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a964d4f8561f30aa10936d2399698c51583442c
--- /dev/null
+++ b/doc/fluid/getstarted/quickstart_cn.rst
@@ -0,0 +1,45 @@
+快速开始
+========
+
+快速安装
+--------
+
+PaddlePaddle支持使用pip快速安装，目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12，并安装有Python2.7。
+执行下面的命令完成快速安装，版本为cpu_avx_openblas：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考： :ref:`install_steps` 。
+
+快速使用
+--------
+
+创建一个 housing.py 并粘贴此Python代码：
+
+  .. code-block:: python
+
+     import paddle.dataset.uci_housing as uci_housing
+     import paddle.fluid as fluid
+
+     with fluid.scope_guard(fluid.core.Scope()):
+         # initialize executor with cpu
+         exe = fluid.Executor(place=fluid.CPUPlace())
+         # load inference model
+         [inference_program, feed_target_names,fetch_targets] =  \
+             fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
+         # run inference
+         result = exe.run(inference_program,
+                          feed={feed_target_names[0]: uci_housing.predict_reader()},
+                          fetch_list=fetch_targets)
+         # print predicted price is $12,273.97
+         print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
+
+执行 :code:`python housing.py` 瞧！ 它应该打印出预测住房数据的清单。
diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..680122f25893a5a48fac103266bda4788f891f6d
--- /dev/null
+++ b/doc/fluid/getstarted/quickstart_en.rst
@@ -0,0 +1,49 @@
+Quick Start
+============
+
+Quick Install
+-------------
+
+You can use pip to install PaddlePaddle with a single command, supports
+CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install, the version is cpu_avx_openblas:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build: :ref:`install_steps` .
+
+Quick Use
+---------
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.dataset.uci_housing as uci_housing
+     import paddle.fluid as fluid
+
+     with fluid.scope_guard(fluid.core.Scope()):
+         # initialize executor with cpu
+         exe = fluid.Executor(place=fluid.CPUPlace())
+         # load inference model
+         [inference_program, feed_target_names,fetch_targets] =  \
+             fluid.io.load_inference_model(uci_housing.fluid_model(), exe)
+         # run inference
+         result = exe.run(inference_program,
+                          feed={feed_target_names[0]: uci_housing.predict_reader()},
+                          fetch_list=fetch_targets)
+         # print predicted price is $12,273.97
+         print 'Predicted price: ${:,.2f}'.format(result[0][0][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..55326940ce7c7dbaa5bf19f1950f470527ddf4f0
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md
@@ -0,0 +1,181 @@
+# Fluid 分布式版本使用指南
+本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行，以及将单机训练脚本改造成支持集群训练的版本
+
+## 准备工作
+* 可用的集群
+
+    包含一个或多个计算节点的集群，每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址，集群内的所有计算节点可以通过网络相互通信。
+* 安装PaddlePaddle Fluid with Distribution版本
+
+    所有的计算节点上均需要按照分布式版本的PaddlePaddle, 在用于GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。
+
+    **注意：**当前对外提供的PaddlePaddle版本并不支持分布式，需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。
+    cmake编译命令中需要将WITH_DISTRIBUTE设置为ON，下面是一个cmake编译指令示例：
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
+## 更新训练脚本
+这里，我们以[Deep Learing 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例，描述如何将单机训练脚本改造成支持集群训练的版本。
+### 单机训练脚本示例
+```python
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+我们创建了一个简单的全连接神经网络程序，并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。
+### 介绍Parameter Server
+在非分布式版本的训练脚本中，只存在Trainer一种角色，它不仅处理常规的计算任务，也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中，由于存在多个Trainer节点进行同样的数据计算任务，因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中，我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md)
+
+**因此，在分布式的Fluid环境中，我们有两个角色需要创建，分别是Parameter Server和Trainer。**
+
+### 分布式训练
+Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数，将他们分隔为两部分，通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。
+```python
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+```
+将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下：
+```python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+... #create executor
+
+# in pserver, run this
+#current_endpoint here means current pserver IP:PORT you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+```
+### 分布式训练脚本运行说明
+分布式任务的运行需要将表格中说明的多个参数进行赋值:
+
+<table>
+<thead>
+<tr>
+<th>参数名</th>
+<th> 值类型</th>
+<th>说明</th>
+<th> 示例</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>trainer_id </td>
+<td> int</td>
+<td> 当前训练节点的ID，训练节点ID编号为0 - n-1， n为trainers的值 </td>
+<td> 0/1/2/3  </td>
+</tr>
+<tr>
+<td>pservers </td>
+<td> str</td>
+<td> parameter server 列表 </td>
+<td> 127.0.0.1:6710,127.0.0.1:6711 </td>
+</tr>
+<tr>
+<td>trainers </td>
+<td>int </td>
+<td> 训练节点的总个数，>0的数字 </td>
+<td> 4 </td>
+</tr>
+<tr>
+<td> server_endpoint</td>
+<td> str </td>
+<td> 当前所起的服务节点的IP:PORT </td>
+<td> 127.0.0.1:8789 </td>
+</tr>
+<tr>
+<td> training_role</td>
+<td>str </td>
+<td> 节点角色， TRAINER/PSERVER </td>
+<td> PSERVER </td>
+</tr>
+</tbody>
+</table>
+
+
+**注意：** ```training_role```是用来区分当前所起服务的角色的，用于训练程序中，用户可根据需要自行定义，其他参数为fluid.DistributeTranspiler的transpile函数所需要，需要在调用函数前进行定义，样例如下：
+
+```python
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops,
+    params_grads,
+    trainer_id,
+    pservers=pserver,
+    trainers=trainers)
+if training_role == "PSERVER":
+    pserver_prog = t.get_pserver_program(server_endpoint)
+    pserver_startup = t.get_startup_program(server_endpoint, pserver_prog)
+```
+
+### Demo
+完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。
+
+第一步，进入demo代码所在目录：
+```bash
+cd /paddle/python/paddle/fluid/tests/book
+```
+
+第二步，启动Parameter Server：
+```bash
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.2 PADDLE_TRAINERS=2 PADDLE_CURRENT_IP=192.168.1.2 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=PSERVER python test_fit_a_line.py
+```
+执行命令后请等待出现提示： ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。
+
+第三步，启动Trainer：
+```bash
+PADDLE_PSERVER_PORT=6174 PADDLE_PSERVER_IPS=192.168.1.3 PADDLE_TRAINERS=2 PADDLE_CURRENT_IPP=192.168.1.3 PADDLE_TRAINER_ID=1 PADDLE_TRAINING_ROLE=TRAINER python test_fit_a_line.py
+```
+由于我们定义的Trainer的数量是2个，因此需要在另外一个计算节点上再启动一个Trainer。
+
+现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。
diff --git a/doc/fluid/howto/cluster/fluid_recordio.md b/doc/fluid/howto/cluster/fluid_recordio.md
new file mode 100644
index 0000000000000000000000000000000000000000..92859e8f622d0c155128821c54252113c5016989
--- /dev/null
+++ b/doc/fluid/howto/cluster/fluid_recordio.md
@@ -0,0 +1,127 @@
+# How to use RecordIO in Fluid
+
+If you want to use RecordIO as your training data format, you need to convert to your training data
+to RecordIO files and reading them in the process of training, PaddlePaddle Fluid provides some
+interface to deal with the RecordIO files.
+
+## Generate RecordIO File
+
+Before start training with RecordIO files, you need to convert your training data
+to RecordIO format by `fluid.recordio_writer.convert_reader_to_recordio_file`, the sample codes
+as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_file('./mnist.recordio', reader, feeder)
+```
+
+The above code snippet would generate a RecordIO `./mnist.recordio` on your host.
+
+**NOTE**: we recommend users to set `batch_size=1` when generating the recordio files so that users can
+adjust it flexibly while reading it.
+
+## Use the RecordIO file in a Local Training Job
+
+PaddlePaddle Fluid provides an interface `fluid.layers.io.open_recordio_file` to load your RecordIO file
+and then you can use them as a Layer in your network configuration, the sample codes as follows:
+
+```python
+    data_file = fluid.layers.io.open_recordio_file(
+        filename="./mnist.recordio",
+        shapes=[(-1, 784),(-1, 1)],
+        lod_levels=[0, 0],
+        dtypes=["float32", "int32"])
+    data_file = fluid.layers.io.batch(data_file, batch_size=4)
+
+    img, label = fluid.layers.io.read_file(data_file)
+    hidden = fluid.layers.fc(input=img, size=100, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+
+    fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
+
+    place = fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+    avg_loss_np = []
+
+    # train a pass
+    batch_id = 0
+    while True:
+        tmp, = exe.run(fetch_list=[avg_loss])
+
+        avg_loss_np.append(tmp)
+        print(batch_id)
+        batch_id += 1
+```
+
+## Use the RecordIO files in Distributed Training
+
+1. generate multiple RecordIO files
+
+For a distributed training job, you may have multiple trainer nodes,
+and one or more RecordIO files for one trainer node, you can use the interface
+`fluid.recordio_writer.convert_reader_to_recordio_files` to convert your training data
+into multiple RecordIO files, the sample codes as follows:
+
+```python
+    reader = paddle.batch(mnist.train(), batch_size=1)
+    feeder = fluid.DataFeeder(
+        feed_list=[  # order is image and label
+            fluid.layers.data(
+            name='image', shape=[784]),
+            fluid.layers.data(
+            name='label', shape=[1], dtype='int64'),
+        ],
+        place=fluid.CPUPlace())
+    fluid.recordio_writer.convert_reader_to_recordio_files(
+          filename_suffix='./mnist.recordio', batch_per_file=100, reader, feeder)
+```
+
+The above codes would generate multiple RecordIO files on your host like:
+
+```bash
+.
+ \_mnist-00000.recordio
+ |-mnist-00001.recordio
+ |-mnist-00002.recordio
+ |-mnist-00003.recordio
+ |-mnist-00004.recordio
+```
+
+2. open multiple RecordIO files by `fluid.layers.io.open_files`
+
+For a distributed training job, the distributed operator system will schedule trainer process on multiple nodes,
+each trainer process reads parts of the whole training data, we usually take the following approach to make the training
+data allocated by each trainer process as uniform as possiable:
+
+```python
+def gen_train_list(file_pattern, trainers, trainer_id):
+   file_list = glob.glob(file_pattern)
+   ret_list = []
+   for idx, f in enumerate(file_list):
+       if (idx + trainers) % trainers == trainer_id:
+           ret_list.append(f)
+   return ret_list
+
+trainers = int(os.getenv("PADDLE_TRAINERS"))
+trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+data_file = fluid.layers.io.open_files(
+    filenames=gen_train_list("./mnist-[0-9]*.recordio", 2, 0),
+    thread_num=1,
+    shapes=[(-1, 784),(-1, 1)],
+    lod_levels=[0, 0],
+    dtypes=["float32", "int32"])
+img, label = fluid.layers.io.read_file(data_files)
+...
+```
diff --git a/doc/fluid/howto/cluster/nccl2_rdma_training.md b/doc/fluid/howto/cluster/nccl2_rdma_training.md
new file mode 100644
index 0000000000000000000000000000000000000000..cecd5c3a7a7339e3be6772543a534728ec132105
--- /dev/null
+++ b/doc/fluid/howto/cluster/nccl2_rdma_training.md
@@ -0,0 +1,110 @@
+# Distributed Training with NCCL2 and RDMA
+
+When doing distributed multi-GPU training, network bandwith often becomes the
+bottle neck. We introduce a way to use NCCL2 to do such training job to
+achieve best performace.
+
+## Prepare Hardwares with RDMA and Multiple GPUs
+
+I'm using two Linux servers each of them is installed with 8 GPUs and
+one 100Gb RDMA card.
+Base environment is:
+
+* OS: CentOS 7.4
+* RDMA device: "Mellanox Technologies MT27700 Family [ConnectX-4]"
+* Kernel version: `4.4.88-1.el7.elrepo.x86_64`
+* Docker version: `1.12.6`
+* Docker storage driver: `overlay2`
+* IP addresses: 192.168.16.30,192.168.16.34
+
+In general, the steps including:
+
+1. Install GPU drivers
+1. Install RDMA drivers
+1. Install "InfiniBand Support"
+1. Use docker to run tests and make sure GPUs and RDMA can work inside
+   the container.
+
+I'll ommit section "Install GPU drivers" because we can find it easily
+somewhere else.
+
+### Install RDMA drivers
+
+For my case, I've got two machines with device
+"Mellanox Technologies MT27700 Family [ConnectX-4]" installed. The OS was
+"CentOS 7.4" and I updated the kernel to version 4.4 so that docker can
+work with latest overlay2 filesystem.
+
+***NOTE: before you start, make sure you have a way to get a console
+of the server other than ssh because we may need to re-configure the
+network device.***
+
+1. Go to http://www.mellanox.com/page/products_dyn?product_family=26,
+   download `MLNX_OFED` software in the bottom of the page, and upload it
+   onto the server.
+1. Run `./mlnxofedinstall --add-kernel-support` in the software package.
+1. Run `/etc/init.d/openibd restart` to make everything work, note that
+   this operation may cause the network goes down if you are using this
+   RDMA device as default network device and use ssh to login the server.
+1. Re-configure the network interface, for example:
+   `ifconfig eth2 192.168.16.30/20 up`, then add routes if needed:
+   `ip route add default via 192.168.16.1 dev eth2`.
+1. Do the same thing on the other node.
+1. Use `ping` to test if the two nodes have typical ICMP connection.
+1. Use either `udaddy` or `ib_write_bw` to test the network connection is
+   ready and have the desired bandwith.
+
+### Prepare Docker Image to Run RDMA Programs
+
+1. Build a docker image using cuda base image like: `nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04` and install paddlepaddle whl
+   package in it.
+1. Start a docker container and mount GPU driver libs into it (you can
+   skip this step if you are using nvidia-docker).
+1. Mount RDMA dirvers and libs into the docker image (see below section),
+   also `udaddy` and `ib_write_bw` if needed.
+1. Mount GPU devices and RDMA devices into the container using `--device`
+   or just use privileged mode `--privileged`.
+1. Start the container using host network mode: `--net=host`
+
+### RDMA Library Files Needed
+
+Usually, `MLNX_OFED` install latest supported libs under
+`/usr/lib64/mlnx_ofed/valgrind`. Other libs also needed to run RDMA programs
+is listed below. These libs must be mounted into the docker container.
+
+* Libs under `/usr/lib64/mlnx_ofed/valgrind`
+  * libibcm.so
+  * libibverbs.so
+  * libmlx4.so
+  * libmlx5.so
+  * libmlx5-rdmav2.so
+  * librdmacm.so
+* Other libs:
+  * libnl-3.so.200
+  * libnl-route-3.so.200
+  * libnuma.so.1
+
+## Start to Run the Training Job
+
+Setting NCCL environment variables to turn NCCL switches on and off:
+
+
+| Env Name | Description |
+| --- | --- |
+| NCCL_SOCKET_IFNAME | The RDMA device, e.g. eth2 |
+| NCCL_P2P_DISABLE | Set to 1 to disable P2P transfer between GPUs |
+| NCCL_IB_DISABLE | Set to 1 to disable using RDMA |
+| NCCL_IB_CUDA_SUPPORT | Set to 1 to enable GPU Direct if supported |
+| NCCL_DEBUG | Set debug level: VERSION, WARN, INFO |
+
+My two servers are: `192.168.16.30,192.168.16.34`, On node 1, Run :
+
+```bash
+PADDLE_TRAINER_ID=0 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.30 stdbuf -oL python vgg16.py
+```
+
+On node 2, Run:
+
+```bash
+PADDLE_TRAINER_ID=1 PADDLE_PORT=48372 PADDLE_WORKERS=192.168.16.30,192.168.16.34 POD_IP=192.168.16.34 stdbuf -oL python vgg16.py
+```
diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b57af64f44da82926c4862578f3072960ca5aa92
--- /dev/null
+++ b/doc/fluid/howto/index_cn.rst
@@ -0,0 +1,8 @@
+进阶使用
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  inference/index_cn.rst
+  optimization/index_cn.rst
diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fd21e167ce3a46da167db1e9d7013804f730e047
--- /dev/null
+++ b/doc/fluid/howto/index_en.rst
@@ -0,0 +1,7 @@
+HOW TO
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  optimization/index_en.rst
diff --git a/doc/fluid/howto/inference/build_and_install_lib_cn.rst b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..84005b54e07cf810649370d2c1f6b6c522434bf6
--- /dev/null
+++ b/doc/fluid/howto/inference/build_and_install_lib_cn.rst
@@ -0,0 +1,97 @@
+安装与编译C++预测库
+===========================
+
+直接下载安装
+-------------
+
+======================   ========================================
+版本说明                            C++预测库   
+======================   ========================================
+cpu_avx_mkl              `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/fluid.tgz>`_ 
+cpu_avx_openblas         `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/fluid.tgz>`_
+cpu_noavx_openblas       `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/fluid.tgz>`_
+cuda7.5_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn5_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda8.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/fluid.tgz>`_
+cuda9.0_cudnn7_avx_mkl   `fluid.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/fluid.tgz>`_
+======================   ========================================
+
+从源码编译
+----------
+用户也可以从 PaddlePaddle 核心代码编译C++预测库，只需在编译时配制下面这些编译选项：
+
+=================   =========
+选项                 值   
+=================   =========
+CMAKE_BUILD_TYPE    Release
+FLUID_INSTALL_DIR   安装路径    
+WITH_FLUID_ONLY     ON（推荐）
+WITH_SWIG_PY        OFF（推荐
+WITH_PYTHON         OFF（推荐）
+WITH_GPU            ON/OFF
+WITH_MKL            ON/OFF
+=================   =========
+
+建议按照推荐值设置，以避免链接不必要的库。其它可选编译选项按需进行设定。
+
+下面的代码片段从github拉取最新代码，配制编译选项（需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径）：
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+     PADDLE_ROOT=/path/of/capi
+     git clone https://github.com/PaddlePaddle/Paddle.git
+     cd Paddle
+     mkdir build
+     cd build
+     cmake -DFLUID_INSTALL_DIR=$PADDLE_ROOT \
+           -DCMAKE_BUILD_TYPE=Release \
+           -DWITH_FLUID_ONLY=ON \
+           -DWITH_SWIG_PY=OFF \
+           -DWITH_PYTHON=OFF \
+           -DWITH_MKL=OFF \
+           -DWITH_GPU=OFF  \
+           ..
+      make
+      make inference_lib_dist
+
+成功编译后，使用C++预测库所需的依赖（包括：（1）编译出的PaddlePaddle预测库和头文件；（2）第三方链接库和头文件；（3）版本信息与编译选项信息）
+均会存放于PADDLE_ROOT目录中。目录结构如下：
+
+  .. code-block:: text
+
+     PaddleRoot/
+     ├── CMakeCache.txt
+     ├── paddle
+     │   └── fluid
+     │       ├── framework
+     │       ├── inference
+     │       ├── memory
+     │       ├── platform
+     │       ├── pybind
+     │       └── string
+     ├── third_party
+     │   ├── boost
+     │   │   └── boost
+     │   ├── eigen3
+     │   │   ├── Eigen
+     │   │   └── unsupported
+     │   └── install
+     │       ├── gflags
+     │       ├── glog
+     │       ├── mklml
+     │       ├── protobuf
+     │       ├── snappy
+     │       ├── snappystream
+     │       └── zlib
+     └── version.txt
+     
+version.txt 中记录了该预测库的版本信息，包括Git Commit ID、使用OpenBlas或MKL数学库、CUDA/CUDNN版本号，如：
+
+  .. code-block:: text
+
+     GIT COMMIT ID: c95cd4742f02bb009e651a00b07b21c979637dc8
+     WITH_MKL: ON
+     WITH_GPU: ON
+     CUDA version: 8.0
+     CUDNN version: v5
diff --git a/doc/fluid/howto/inference/index_cn.rst b/doc/fluid/howto/inference/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a903423548decd0992bf19772fb2cb143f6a12b5
--- /dev/null
+++ b/doc/fluid/howto/inference/index_cn.rst
@@ -0,0 +1,8 @@
+预测库
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  build_and_install_lib_cn.rst
+  inference_support_in_fluid_cn.md
diff --git a/doc/fluid/howto/inference/inference_support_in_fluid_cn.md b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..309b17fccd5c461c9c22beb64eb4c6792b7e4a7a
--- /dev/null
+++ b/doc/fluid/howto/inference/inference_support_in_fluid_cn.md
@@ -0,0 +1,304 @@
+# 使用指南
+
+## 目录：
+
+- Python Inference API
+- Inference C++ API
+- Inference实例
+- Inference计算优化
+
+## Python Inference API **[改进中]**
+- 保存Inference模型 ([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L295))
+
+  ```python
+  def save_inference_model(dirname,
+                           feeded_var_names,
+                           target_vars,
+                           executor,
+                           main_program=None,
+                           model_filename=None,
+                           params_filename=None):
+  ```
+  Inference模型和参数将会保存到`dirname`目录下：
+  - 序列化的模型
+    - `model_filename`为`None`，保存到`dirname/__model__`
+    - `model_filename`非`None`，保存到`dirname/model_filename`
+  - 参数
+    - `params_filename`为`None`，单独保存到各个独立的文件，各文件以参数变量的名字命名
+    - `params_filename`非`None`，保存到`dirname/params_filename`
+
+- 两种存储格式
+  - 参数保存到各个独立的文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`None`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ batch_norm_1.w_0 batch_norm_1.w_2 conv2d_2.w_0 conv2d_3.w_0 fc_1.w_0 batch_norm_1.b_0 batch_norm_1.w_1 conv2d_2.b_0 conv2d_3.b_0 fc_1.b_0
+    ```
+  - 参数保存到同一个文件
+    - 如，设置`model_filename`为`None`、`params_filename`为`__params__`
+
+    ```bash
+    $ cd recognize_digits_conv.inference.model
+    $ ls
+    $ __model__ __params__
+    ```
+- 加载Inference模型([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/io.py#L380))
+  ```python
+  def load_inference_model(dirname,
+                           executor,
+                           model_filename=None,
+                           params_filename=None):
+    ...
+    return [program, feed_target_names, fetch_targets]
+  ```
+
+## 链接Fluid Inference库
+- 示例项目([链接](https://github.com/luotao1/fluid_inference_example.git))
+
+  - GCC配置
+    ```bash
+    $ g++ -o a.out -std=c++11 main.cc \
+          -I${PADDLE_ROOT}/ \
+          -I${PADDLE_ROOT}/third_party/install/gflags/include \
+          -I${PADDLE_ROOT}/third_party/install/glog/include \
+          -I${PADDLE_ROOT}/third_party/install/protobuf/include \
+          -I${PADDLE_ROOT}/third_party/eigen3 \
+          -L${PADDLE_ROOT}/paddle/fluid/inference -lpaddle_fluid \
+          -lrt -ldl -lpthread
+    ```
+
+  - CMake配置
+    ```cmake
+    include_directories(${PADDLE_ROOT}/)
+    include_directories(${PADDLE_ROOT}/third_party/install/gflags/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/glog/include)
+    include_directories(${PADDLE_ROOT}/third_party/install/protobuf/include)
+    include_directories(${PADDLE_ROOT}/third_party/eigen3)
+    target_link_libraries(${TARGET_NAME}
+                          ${PADDLE_ROOT}/paddle/fluid/inference/libpaddle_fluid.so
+                          -lrt -ldl -lpthread)
+    ```
+
+  - 设置环境变量：
+  `export LD_LIBRARY_PATH=${PADDLE_ROOT}/paddle/fluid/inference:$LD_LIBRARY_PATH`
+
+
+
+## C++ Inference API
+
+- 推断流程([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_helper.h#L91))
+
+  - 1、 初始化设备
+    ```cpp
+    #include "paddle/fluid/framework/init.h"
+    paddle::framework::InitDevices(false);
+    ```
+
+  - 2、 定义place，executor，scope
+    ```cpp
+    auto place = paddle::platform::CPUPlace();
+    auto executor = paddle::framework::Executor(place);
+    auto* scope = new paddle::framework::Scope();
+    ```
+
+  - 3、 加载模型
+    ```cpp
+    #include "paddle/fluid/inference/io.h"
+    auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+    // or
+    auto inference_program = paddle::inference::Load(executor,
+                                                     *scope,
+                                                     dirname + "/" + model_filename,
+                                                     dirname + "/" + params_filename);
+    ```
+
+  - 4、 获取`feed_target_names`和`fetch_target_names`
+    ```cpp
+    const std::vector<std::string>& feed_target_names = inference_program->GetFeedTargetNames();
+    const std::vector<std::string>& fetch_target_names = inference_program->GetFetchTargetNames();
+    ```
+
+  - 5、 准备`feed`数据
+    ```cpp
+    #include "paddle/fluid/framework/lod_tensor.h"
+    std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+    ...
+    std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+    for (size_t i = 0; i < feed_target_names.size(); ++i) {
+      // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+      feed_targets[feed_target_names[i]] = cpu_feeds[i];
+    }
+    ```
+
+  - 6、 定义`Tensor`来`fetch`结果
+    ```cpp
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs;
+    std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+    for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+      fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+    }
+    ```
+
+  - 7、 执行`inference_program`
+    ```cpp
+    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    ```
+
+  - 8、 使用`fetch`数据
+    ```cpp
+    for (size_t i = 0; i < cpu_fetchs.size(); ++i) {
+      std::cout << "lod_i: " << cpu_fetchs[i]->lod();
+      std::cout << "dims_i: " << cpu_fetchs[i]->dims();
+      std::cout << "result:";
+      float* output_ptr = cpu_fetchs[i]->data<float>();
+      for (int j = 0; j < cpu_fetchs[i]->numel(); ++j) {
+        std::cout << " " << output_ptr[j];
+      }
+      std::cout << std::endl;
+    }
+    ```
+    针对不同的数据，4. - 8.可执行多次。
+
+  - 9、 释放内存
+    ```cpp
+    delete scope;
+    ```
+
+
+- 接口说明
+
+  ```cpp
+  void Run(const ProgramDesc& program, Scope* scope,
+           std::map<std::string, const LoDTensor*>& feed_targets,
+           std::map<std::string, LoDTensor*>& fetch_targets,
+           bool create_vars = true,
+           const std::string& feed_holder_name = "feed",
+           const std::string& fetch_holder_name = "fetch");
+  ```
+  - 使用Python API `save_inference_model`保存的`program`里面包含了`feed_op`和`fetch_op`，用户提供的`feed_targets`、`fetch_targets`必须和`inference_program`中的`feed_op`、`fetch_op`保持一致。
+  - 用户提供的`feed_holder_name`和`fetch_holder_name`也必须和`inference_program`中`feed_op`、`fetch_op`保持一致，可使用`SetFeedHolderName`和`SetFetchHolderName`接口重新设置`inferece_program`
+  - 默认情况下，除了`persistable`属性设置为`True`的`Variable`之外，每次执行`executor.Run`会创建一个局部`Scope`，并且在这个局部`Scope`中创建和销毁所有的`Variable`，以最小化空闲时的内存占用。
+  - `persistable`属性为`True`的`Variable`有：
+    - Operators的参数`w`、`b`等
+    - `feed_op`的输入变量
+    - `fetch_op`的输出变量
+
+
+- **不在每次执行时创建和销毁变量
+ ([PR](https://github.com/PaddlePaddle/Paddle/pull/9301))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    executor.CreateVariables(*inference_program, scope, 0);
+    // Call as many times as you like
+    executor.Run(
+        *inference_program, scope, feed_targets, fetch_targets, false);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁变量的时间（约占每次`Run`总时间的1% ~ 12%）
+    - 执行结束后可获取所有Operators的计算结果
+  - **缺点**
+    - 空闲时也会占用大量的内存
+    - 在同一个`Scope`中，相同的变量名是公用同一块内存的，容易引起意想不到的错误
+
+
+- **不在每次执行时创建Op([PR](https://github.com/PaddlePaddle/Paddle/pull/9630))**
+  - 执行`inference_program`
+    ```cpp
+    // Call once
+    auto ctx = executor.Prepare(*inference_program, 0);
+    // Call as many times as you like if you have no need to change the inference_program
+    executor.RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets);
+    ```
+  - **优点**
+    - 节省了频繁创建、销毁Op的时间
+  - **缺点**
+    - 一旦修改了`inference_program`，则需要重新创建`ctx`
+
+
+- **多线程共享Parameters([链接](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/test_multi_thread_helper.h))**
+  - 主线程
+    - 1、 初始化设备
+    - 2、 定义`place`，`executor`，`scope`
+    - 3、 加载模型，得到`inference_program`
+  - 从线程
+    - **复制`inference_program`得到`copy_program`，修改`copy_program`的`feed_holder_name`和`fetch_holder_name`**
+      ```cpp
+      auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
+                 new paddle::framework::ProgramDesc(*inference_program));
+      std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
+      std::string fetch_holder_name = "fetch_" + paddle::string::to_string(thread_id);
+      copy_program->SetFeedHolderName(feed_holder_name);
+      copy_program->SetFetchHolderName(fetch_holder_name);
+      ```
+    - 4、 获取`copy_program`的`feed_target_names`和`fetch_target_names`
+    - 5、 准备feed数据，定义Tensor来fetch结果
+    - 6、 执行`copy_program`
+      ```cpp
+      executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, feed_holder_name, fetch_holder_name);
+      ```
+    - 7、 使用fetch数据
+  - 主线程
+    - 8、 释放资源
+
+
+- 基本概念
+  - 数据相关：
+    - [Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/tensor.md)，一个N维数组，数据可以是任意类型（int，float，double等）
+    - [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/lod_tensor.md)，带LoD(Level-of-Detail)即序列信息的Tensor
+    - [Scope](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md)，记录了变量Variable
+  - 执行相关：
+    - [Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md)，无状态执行器，只跟设备相关
+    - Place
+      - CPUPlace，CPU设备
+      - CUDAPlace，CUDA GPU设备
+  - 神经网络表示：
+    - [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md).
+
+    详细介绍请参考[**Paddle Fluid开发者指南**](https://github.com/lcy-seso/learning_notes/blob/master/Fluid/developer's_guid_for_Fluid/Developer's_Guide_to_Paddle_Fluid.md)
+
+
+
+## Inference实例
+
+  1. fit a line: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc)
+  1. image classification: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_image_classification.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_image_classification.cc)
+  1. label semantic roles: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_label_semantic_roles.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc)
+  1. recognize digits: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recognize_digits.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc)
+  1. recommender system: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_recommender_system.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc)
+  1. understand sentiment: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_understand_sentiment.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc)
+  1. word2vec: [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_word2vec.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/book/test_inference_word2vec.cc)
+
+
+## Inference计算优化
+- 使用Python推理优化工具([inference_transpiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/inference_transpiler.py))
+  ```python
+  class InferenceTranspiler:
+    def transpile(self, program, place, scope=None):
+        ...
+        if scope is None:
+            scope = global_scope()
+        ...
+  ```
+  - 使用`InferenceTranspiler`将会直接修改`program`。
+  - 使用`InferenceTranspiler`会修改参数的值，请确保`program`的参数在`scope`内。
+- 支持的优化
+  - 融合batch_norm op的计算
+- 使用示例([链接](https://github.com/Xreki/Xreki.github.io/blob/master/fluid/inference/inference_transpiler.py))
+  ```python
+  import paddle.fluid as fluid
+  # NOTE: Applying the inference transpiler will change the inference_program.
+  t = fluid.InferenceTranspiler()
+  t.transpile(inference_program, place, inference_scope)
+  ```
+
+
+
+
+## 内存使用优化
+- 使用Python内存优化工具([memory_optimization_transipiler](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/memory_optimization_transpiler.py))
+  ```python
+  fluid.memory_optimize(inference_program)
+  ```
diff --git a/doc/fluid/howto/optimization/benchmark/index_cn.rst b/doc/fluid/howto/optimization/benchmark/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9404800eb86ca6d27886258b67393028c76954dc
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/index_cn.rst
@@ -0,0 +1,8 @@
+基准
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  vgg16/README.md
+  README.md
diff --git a/doc/fluid/howto/optimization/benchmark/index_en.rst b/doc/fluid/howto/optimization/benchmark/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1e200b660cc7f6aeaf8b3d94fd7a14999a52bccd
--- /dev/null
+++ b/doc/fluid/howto/optimization/benchmark/index_en.rst
@@ -0,0 +1,8 @@
+Benchmark
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  vgg16/README.md
+  README.md
diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md
index d59be670c2b33b64d9b6f96b53f50e5bf9f0613b..198a05a79e19227e90eaafe116217a164cd51a7d 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_cn.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md
@@ -1,3 +1,5 @@
+# CPU性能调优
+
 此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优（performance tuning）。
 
 Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
@@ -42,14 +44,40 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 
 每一列的含义是:
 
-| 列名 | 含义 |
-| --- | --- |
-| ncalls | 函数的调用次数 |
-| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
-| percall | tottime的每次调用平均时间 |
-| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
-| percall | cumtime的每次调用平均时间 |
-| filename:lineno(function) | 文件名, 行号，函数名 |
+<table>
+<thead>
+<tr>
+<th>列名</th>
+<th>含义 </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> ncalls</td>
+<td> 函数的调用次数</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td> 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall </td>
+<td> tottime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> cumtime</td>
+<td> 函数总时间。包含这个函数调用其他函数的时间</td>
+</tr>
+<tr>
+<td> percall</td>
+<td> cumtime的每次调用平均时间</td>
+</tr>
+<tr>
+<td> filename:lineno(function) </td>
+<td> 文件名, 行号，函数名 </td>
+</tr>
+</tbody>
+</table>
 
 
 ### 寻找性能瓶颈
diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md
index 01e5fddf61547f9fc86ef18a6f2e2ac508d22dbb..216694965b3c878a8a5f3ccd2a0cba8d21d9ce05 100644
--- a/doc/fluid/howto/optimization/cpu_profiling_en.md
+++ b/doc/fluid/howto/optimization/cpu_profiling_en.md
@@ -1,3 +1,5 @@
+# Tune CPU performance
+
 This tutorial introduces techniques we use to profile and tune the
 CPU performance of PaddlePaddle.  We will use Python packages
 `cProfile` and `yep`, and Google's `perftools`.
@@ -57,14 +59,40 @@ port, we will see the output like the following:
 where each line corresponds to Python function, and the meaning of
 each column is as follows:
 
-| column | meaning |
-| --- | --- |
-| ncalls | the number of calls into a function |
-| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
-| percall | tottime divided by ncalls |
-| cumtime | the total execution time of the function, including the execution time of other functions being called |
-| percall | cumtime divided by ncalls |
-| filename:lineno(function) | where the function is defined |
+<table>
+<thead>
+<tr>
+<th>column</th>
+<th>meaning </th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td> ncalls</td>
+<td> the number of calls into a function</td>
+</tr>
+<tr>
+<td>tottime</td>
+<td> the total execution time of the function, not including the execution time of other functions called by the function</td>
+</tr>
+<tr>
+<td> percall </td>
+<td> tottime divided by ncalls</td>
+</tr>
+<tr>
+<td> cumtime</td>
+<td> the total execution time of the function, including the execution time of other functions being called</td>
+</tr>
+<tr>
+<td> percall</td>
+<td> cumtime divided by ncalls</td>
+</tr>
+<tr>
+<td> filename:lineno(function) </td>
+<td> where the function is define </td>
+</tr>
+</tbody>
+</table>
 
 ### Identify Performance Bottlenecks
 
@@ -81,7 +109,7 @@ focus on. We can sort above profiling file by tottime:
 
 We can see that the most time-consuming function is the `built-in
 method run`, which is a C++ function in `libpaddle.so`.  We will
-explain how to profile C++ code in the next section.  At this 
+explain how to profile C++ code in the next section.  At this
 moment, let's look into the third function `sync_with_cpp`, which is a
 Python function.  We can click it to understand more about it:
 
diff --git a/doc/fluid/howto/optimization/host_memory_profiling_cn.md b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..7fb0883dd937465d15479b29df95078edb50e069
--- /dev/null
+++ b/doc/fluid/howto/optimization/host_memory_profiling_cn.md
@@ -0,0 +1,89 @@
+# 堆内存分析和优化
+
+计算机程序都可能有内存泄漏的风险。**内存泄漏**一般是由于程序在堆(heap)上分配了内存而没有释放，随着程序的运行占用的内存越来越大，一方面会影响程序的稳定性，可能让运行速度越来越慢，或者造成oom，甚至会影响运行程序的机器的稳定性，造成宕机。
+
+
+目前有很多内存泄漏分析工具，比较经典的有[valgrind](http://valgrind.org/docs/manual/quick-start.html#quick-start.intro), [gperftools](https://gperftools.github.io/gperftools/)。
+
+因为Fluid是用Python驱动C++ core来运行，valgrind直接分析非常困难，需要自己编译debug版本的、带valgrind支持的专用Python版本，而且输出的信息中大部分是Python自己的符号和调用信息，分析起来很困难，另外使用valgrind会让程序运行速度变得非常慢，所以不建议使用。
+
+本教程主要介绍[gperftools](https://gperftools.github.io/gperftools/)的使用。
+
+gperftool主要支持以下四个功能：
+
+- thread-caching malloc
+- heap-checking using tcmalloc
+- heap-profiling using tcmalloc
+- CPU profiler
+
+Paddle也提供了基于gperftool的[CPU性能分析教程](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/cpu_profiling_cn.md)。
+
+对于堆内存的分析，主要用到thread-caching malloc和heap-profiling using tcmalloc。
+
+## 环境
+
+本教程基于paddle提供的Docker开发环境paddlepaddle/paddle:latest-dev，基于Ubuntu 16.04.4 LTS环境。
+
+## 使用流程
+
+- 安装google-perftools
+
+```
+apt-get install libunwind-dev 
+apt-get install google-perftools
+```
+
+- 安装pprof
+
+```
+go get -u github.com/google/pprof
+```
+
+- 设置运行环境
+
+```
+export PPROF_PATH=/root/gopath/bin/pprof
+export PPROF_BINARY_PATH=/root/gopath/bin/pprof
+export LD_PRELOAD=/usr/lib/libtcmalloc.so.4
+```
+
+- 使用heap profile来运行python程序。本质上是周期性的对堆的分配情况做一次快照。
+
+```
+# HEAPPROFILE 设置生成的堆分析文件的目录和文件前缀
+# HEAP_PROFILE_ALLOCATION_INTERVAL 设置每分配多少存储dump一次dump，默认1GB
+env HEAPPROFILE="./perf_log/test.log" HEAP_PROFILE_ALLOCATION_INTERVAL=209715200 python trainer.py
+```
+
+随着程序的运行，会在perf_log这个文件夹下生成很多文件，如下：
+
+```
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0001.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0002.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0003.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0004.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0005.heap
+-rw-r--r-- 1 root root 1.0M Jun  1 15:00 test.log.0006.heap
+```
+
+- 使用pprof对heap文件进行分析。分析有两种模式：
+	- 完整模式。会对当前heap做一个分析，显示目前分配内存一些调用路径。
+
+	```
+	pprof --pdf python test.log.0012.heap
+	```
+	上述命令会生成一个profile00x.pdf的文件，可以直接打开，例如：[memory_cpu_allocator](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_cpu_allocator.pdf)。从下图可以看出，在CPU版本fluid的运行过程中，分配存储最多的模块式CPUAllocator. 而别的模块相对而言分配内存较少，所以被忽略了，这对于分配内存泄漏是很不方便的，因为泄漏是一个缓慢的过程，在这种图中是无法看到的。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964027-a54033e4-68dc-11e8-836a-144910c4bb8c.png)
+	
+	- Diff模式。可以对两个时刻的heap做diff，把一些内存分配没有发生变化的模块去掉，而把增量部分显示出来。
+	```
+	pprof --pdf --base test.log.0010.heap python test.log.1045.heap
+	```
+	生成的结果为：[`memory_leak_protobuf`](https://github.com/jacquesqiao/Paddle/blob/bd2ea0e1f84bb6522a66d44a072598153634cade/doc/fluid/howto/optimization/memory_leak_protobuf.pdf)
+	
+	从图中可以看出：ProgramDesc这个结构，在两个版本之间增长了200MB+，所以这里有很大的内存泄漏的可能性，最终结果也确实证明是这里造成了泄漏。
+	
+	![result](https://user-images.githubusercontent.com/3048612/40964057-b434d5e4-68dc-11e8-894b-8ab62bcf26c2.png)
+	![result](https://user-images.githubusercontent.com/3048612/40964063-b7dbee44-68dc-11e8-9719-da279f86477f.png)
+	
diff --git a/doc/fluid/howto/optimization/index_cn.rst b/doc/fluid/howto/optimization/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..27cc96702356703b339db845dc81913bdcc9f23b
--- /dev/null
+++ b/doc/fluid/howto/optimization/index_cn.rst
@@ -0,0 +1,9 @@
+性能优化
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  timeline.md
+  cpu_profiling_cn.md
+  benchmark/index_cn.rst
diff --git a/doc/fluid/howto/optimization/index_en.rst b/doc/fluid/howto/optimization/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4ce624fe8f108a6afc7cd08a1542332755d22e04
--- /dev/null
+++ b/doc/fluid/howto/optimization/index_en.rst
@@ -0,0 +1,9 @@
+Performance Optimization
+---------------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  timeline.md
+  cpu_profiling_en.md
+  benchmark/index_en.rst
diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline.md
deleted file mode 100644
index 9d9565a3e698a83ca465c5da83ff892360c33b8f..0000000000000000000000000000000000000000
--- a/doc/fluid/howto/optimization/timeline.md
+++ /dev/null
@@ -1,27 +0,0 @@
-## how to use timeline tool to do profile
-
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
-
-	```python
-	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-	    for pass_id in range(pass_num):
-	        for batch_id, data in enumerate(train_reader()):
-	            exe.run(fluid.default_main_program(),
-	                    feed=feeder.feed(data),
-	                    fetch_list=[],
-	                    use_program_cache=True)
-	            ...
-	```
-
-1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another
-file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
-[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
-
-1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
-
-	![chrome tracing](./tracing.jpeg)
-
-1. The resulting timeline should be like:
-
-
-	![chrome timeline](./timeline.jpeg)
diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..5d061e1c00d2ca0194153730a39486b8357fa5b0
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline_cn.md
@@ -0,0 +1,26 @@
+# 如何使用timeline工具做性能分析
+
+1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后，代码会在`/tmp/profile`目录下生成一个profile的记录文件。
+
+	**提示：**
+	请不要在timeline记录信息时运行太多次迭代，因为timeline中的记录数量和迭代次数是成正比的。
+
+	```python
+	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+	    for pass_id in range(pass_num):
+	        for batch_id, data in enumerate(train_reader()):
+	            exe.run(fluid.default_main_program(),
+	                    feed=feeder.feed(data),
+	                    fetch_list=[])
+	            ...
+	```
+
+1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`，这个程序默认会生成一个`/tmp/timeline`文件，你也可以用命令行参数来修改这个路径，请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。
+
+1. 打开chrome浏览器，访问<chrome://tracing/>，用`load`按钮来加载生成的`timeline`文件。
+
+	![chrome tracing](./tracing.jpeg)
+
+1. 结果如下图所示，可以放到来查看timetime的细节信息。
+
+	![chrome timeline](./timeline.jpeg)
diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..96481ae2a6e4442d40803f8d5361e5f942502df3
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline_en.md
@@ -0,0 +1,27 @@
+# how to use timeline tool to do profile
+
+1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+
+	```python
+	with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+	    for pass_id in range(pass_num):
+	        for batch_id, data in enumerate(train_reader()):
+	            exe.run(fluid.default_main_program(),
+	                    feed=feeder.feed(data),
+	                    fetch_list=[],
+	                    use_program_cache=True)
+	            ...
+	```
+
+1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another
+file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
+[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
+
+1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
+
+	![chrome tracing](./tracing.jpeg)
+
+1. The resulting timeline should be like:
+
+
+	![chrome timeline](./timeline.jpeg)
diff --git a/doc/design/error_clip.md b/doc/fluid/howto/performance/error_clip.md
similarity index 100%
rename from doc/design/error_clip.md
rename to doc/fluid/howto/performance/error_clip.md
diff --git a/doc/design/images/profiler.png b/doc/fluid/howto/performance/images/profiler.png
similarity index 100%
rename from doc/design/images/profiler.png
rename to doc/fluid/howto/performance/images/profiler.png
diff --git a/doc/fluid/howto/performance/profiler.md b/doc/fluid/howto/performance/profiler.md
new file mode 100644
index 0000000000000000000000000000000000000000..ee96e7c74ce317caddb387cbb1d4998937bd5c81
--- /dev/null
+++ b/doc/fluid/howto/performance/profiler.md
@@ -0,0 +1,97 @@
+## Introduction
+
+There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). For most popular deep learning frameworks, they use several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning frameworks, PaddlePaddle also uses C++, CUDA and Python as the basic programming languages to adapt to run on CPU and GPU devices.  The [`nvprof` tools](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse the CUDA program.  We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) to profile CPU and Python program by [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit. The developers usually want to collect the time of each operator and locate bottlenecks.  The `nvprof` usually collect the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls and events or metrics for CUDA kernels. And the `yep` and `Google's perftools` can't collect the timeline for CUDA program. All these tools can't collect time in the operator level. So we design this profiling tool.
+
+## Architecture
+
+The work flow for most task is as follows. Each operator will run many times in the all iterations. So the profiler must collect the total time of each operator during the iteration. For more, sometimes, the developers may want to collect more detailed time span inside the operator or record time span for elsewhere, this requires that the profiler must support to record the nested time span. And in order to speedup training, all the deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs. So the profiler must be able to collect the timeline for each thread. In addition, the profiler also occupies certain resources. It must can be easily to be enabled or disabled by the developers. At last, the profiler should present a human-readable report.  
+
+```python
+for i in xrange(M):  # M is  the iteration number
+  for op in operator_lists: # The `operator_lists` contains all the operators in the network.
+    op.run();
+```
+
+In summary, the proflier should have following features:
+
+- records time span in loop.
+- supports nested time span.
+- supports multiple threads/multiple GPUs.
+- supports to be enabled and disabled by users.
+
+But how to record the time for the mixed C++ and CUDA program?  There many C++ APIs to get the current calendar time in host program. But for GPU, the CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams) and the CUDA kernels is asynchronous with the host program if there is no the synchronous aftern the CUDA kernels. CUDA provides [event](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA event, we also design and apply the events to record the timeline. Then summarize and present statistics based on these events.  
+
+The overall flow is shown as the following figure.
+
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/profiler.png" align="center"/><br/>
+
+### Event
+
+In above work flow, a pair of events are needed before and after the piece of code to collect time. So the event has a flag to mark whether it is a starting event or an ending event. Except this two kinds of event, sometime, a only marker with a text message is needed, for example, a marker to specify the profiling start or end. There are three kinds of event:
+
+```c++
+enum EventKind {
+  kMark,
+  kPushRange,
+  kPopRange};
+```
+- kMark: only a marker without time range.
+- kPushRange: mark the starting event for time range.
+- kPopRange: mark the ending event for time range.
+
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used.  For many pieces of code, an event lists are used to record each piece.
+
+```c++
+class Event {
+ public:
+  // The DeviceContext is used to get current  CUDA stream.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const platform::DeviceContext* dev_ctx = nullptr);
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+```
+
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
+
+```c++
+enum ProfilerState {
+  kDisabled,
+  kCPU,
+  kCUDA
+};
+ProfilerState g_state;
+```
+- kDisabled: the disabled state.
+- kCPU: CPU profiling state.
+- kCUDA: GPU profiling state.
+
+A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`.
+
+```c++
+struct RecordEvent {
+  explicit RecordEvent(const std::string name,
+                       platform::DeviceContext* dev_ctx = nullptr) {
+    if (kState == ProfilerState::kDisabled) return;
+    // push the starting event to the event lists.
+  }
+  ~RecordEvent() {
+    if (kState == ProfilerState::kDisabled) return;
+    // push the ending event to the event lists.
+  }
+};
+```
diff --git a/doc/design/images/multigpu_allreduce.graffle b/doc/fluid/howto/third_party/images/multigpu_allreduce.graffle
similarity index 100%
rename from doc/design/images/multigpu_allreduce.graffle
rename to doc/fluid/howto/third_party/images/multigpu_allreduce.graffle
diff --git a/doc/design/images/multigpu_allreduce.png b/doc/fluid/howto/third_party/images/multigpu_allreduce.png
similarity index 100%
rename from doc/design/images/multigpu_allreduce.png
rename to doc/fluid/howto/third_party/images/multigpu_allreduce.png
diff --git a/doc/design/images/multigpu_before_convert.graffle b/doc/fluid/howto/third_party/images/multigpu_before_convert.graffle
similarity index 100%
rename from doc/design/images/multigpu_before_convert.graffle
rename to doc/fluid/howto/third_party/images/multigpu_before_convert.graffle
diff --git a/doc/design/images/multigpu_before_convert.png b/doc/fluid/howto/third_party/images/multigpu_before_convert.png
similarity index 100%
rename from doc/design/images/multigpu_before_convert.png
rename to doc/fluid/howto/third_party/images/multigpu_before_convert.png
diff --git a/doc/design/mkl/mkldnn_fluid.md b/doc/fluid/howto/third_party/mkldnn_fluid.md
similarity index 100%
rename from doc/design/mkl/mkldnn_fluid.md
rename to doc/fluid/howto/third_party/mkldnn_fluid.md
diff --git a/doc/design/paddle_nccl.md b/doc/fluid/howto/third_party/paddle_nccl.md
similarity index 100%
rename from doc/design/paddle_nccl.md
rename to doc/fluid/howto/third_party/paddle_nccl.md
diff --git a/doc/fluid/images/1.png b/doc/fluid/images/1.png
new file mode 100644
index 0000000000000000000000000000000000000000..67daf566f91aab570e60971c4ea8e2be876e214d
Binary files /dev/null and b/doc/fluid/images/1.png differ
diff --git a/doc/fluid/images/2.png b/doc/fluid/images/2.png
new file mode 100644
index 0000000000000000000000000000000000000000..43367777f41449a666e7a3b571f09ac5d5dfb1ae
Binary files /dev/null and b/doc/fluid/images/2.png differ
diff --git a/doc/fluid/images/2_level_rnn.dot b/doc/fluid/images/2_level_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..5d77865061ca7bbbfcf254dd938f09aef5553505
--- /dev/null
+++ b/doc/fluid/images/2_level_rnn.dot
@@ -0,0 +1,56 @@
+digraph G {
+
+  rnn [label="1st level RNN" shape=box]
+
+  subgraph cluster0 {
+    label = "time step 0"
+
+    sent0 [label="sentence"]
+    sent1 [label="sentence"]
+
+    rnn1 [label="2nd level RNN" shape=box]
+
+    sent0 -> rnn1
+    sent1 -> rnn1
+  }
+
+  subgraph cluster1 {
+    label = "time step 1"
+
+    sent2 [label="sentence"]
+    sent3 [label="sentence"]
+
+    rnn2 [label="2nd level RNN" shape=box]
+
+    sent2 -> rnn2
+    sent3 -> rnn2
+  }
+
+  subgraph cluster2 {
+    label = "time step 2"
+
+    sent4 [label="sentence"]
+    sent5 [label="sentence"]
+
+    rnn3 [label="2nd level RNN" shape=box]
+
+    sent4 -> rnn3
+    sent5 -> rnn3
+  }
+
+
+  para0 [label="paragraph info 0"]
+  para1 [label="paragraph info 1"]
+  para2 [label="paragraph info 2"]
+
+  rnn1 -> para0
+  rnn2 -> para1
+  rnn3 -> para2
+
+  para0 -> rnn
+  para1 -> rnn
+  para2 -> rnn
+
+  chapter [label="chapter info"]
+  rnn -> chapter
+}
diff --git a/doc/fluid/images/2_level_rnn.png b/doc/fluid/images/2_level_rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..0537a75beb175c0c284717421f7aa908da2a5038
Binary files /dev/null and b/doc/fluid/images/2_level_rnn.png differ
diff --git a/doc/fluid/images/3.png b/doc/fluid/images/3.png
new file mode 100644
index 0000000000000000000000000000000000000000..481021ef306e2596818aab7fe17a570754f63635
Binary files /dev/null and b/doc/fluid/images/3.png differ
diff --git a/doc/fluid/images/4.png b/doc/fluid/images/4.png
new file mode 100644
index 0000000000000000000000000000000000000000..4279f41e06de459f18b9a622539511d555e9a0af
Binary files /dev/null and b/doc/fluid/images/4.png differ
diff --git a/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/fluid/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/fluid/images/LoDTensor.png b/doc/fluid/images/LoDTensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..75369f5378309e0f304b83f6bb69bdb195eac079
Binary files /dev/null and b/doc/fluid/images/LoDTensor.png differ
diff --git a/doc/fluid/images/asgd.gif b/doc/fluid/images/asgd.gif
new file mode 100644
index 0000000000000000000000000000000000000000..4a0da7bf6df9326a2aab1638b77c5455c18b8c4e
Binary files /dev/null and b/doc/fluid/images/asgd.gif differ
diff --git a/doc/fluid/images/batch_norm_fork.dot b/doc/fluid/images/batch_norm_fork.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df
--- /dev/null
+++ b/doc/fluid/images/batch_norm_fork.dot
@@ -0,0 +1,25 @@
+digraph ImageBatchNormForkGragh {
+  subgraph cluster_before {
+    Prev [label="...", shape=plaintext];
+    Rnn [label="rnn_op", shape=box];
+    BatchNorm [label="batch_norm_op", shape=box];
+    Fc [label="fc_op", shape=box];
+    After [label="...", shape=plaintext];
+    Prev -> Rnn -> BatchNorm -> Fc -> After;
+    label="original";
+  }
+
+  subgraph cluster_after {
+    Prev2 [label="...", shape=plaintext];
+    Rnn2 [label="rnn_op", shape=box];
+    BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+    BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+    Fc2_1 [label="fc_op", shape=box];
+    Fc2_2 [label="fc_op", shape=box];
+    After2_1 [label="...", shape=plaintext];
+    After2_2 [label="...", shape=plaintext];
+    Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+    Rnn2 -> BatchNorm2_2 ->Fc2_2 ->After2_2
+    label="forked";
+  }
+}
diff --git a/doc/fluid/images/batch_norm_fork.png b/doc/fluid/images/batch_norm_fork.png
new file mode 100644
index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955
Binary files /dev/null and b/doc/fluid/images/batch_norm_fork.png differ
diff --git a/doc/fluid/images/batch_norm_op_kernel.png b/doc/fluid/images/batch_norm_op_kernel.png
new file mode 100644
index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2
Binary files /dev/null and b/doc/fluid/images/batch_norm_op_kernel.png differ
diff --git a/doc/fluid/images/beam_search.png b/doc/fluid/images/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/fluid/images/beam_search.png differ
diff --git a/doc/fluid/images/ci_build_whl.png b/doc/fluid/images/ci_build_whl.png
new file mode 100644
index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40
Binary files /dev/null and b/doc/fluid/images/ci_build_whl.png differ
diff --git a/doc/fluid/images/compile_run_time.png b/doc/fluid/images/compile_run_time.png
new file mode 100644
index 0000000000000000000000000000000000000000..0bc9b2fd0e81b4851e6d96171ccb9a05d0f42a48
Binary files /dev/null and b/doc/fluid/images/compile_run_time.png differ
diff --git a/doc/fluid/images/compiler.graffle b/doc/fluid/images/compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..8cc678fea3c820103e7ce81f7a5d625d6c1d92de
Binary files /dev/null and b/doc/fluid/images/compiler.graffle differ
diff --git a/doc/fluid/images/compiler.png b/doc/fluid/images/compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..65d34f841afce9756def07dd8ecb9ca44e658bfe
Binary files /dev/null and b/doc/fluid/images/compiler.png differ
diff --git a/doc/fluid/images/control_flow_graph.png b/doc/fluid/images/control_flow_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b
Binary files /dev/null and b/doc/fluid/images/control_flow_graph.png differ
diff --git a/doc/fluid/images/dataflow_equations.png b/doc/fluid/images/dataflow_equations.png
new file mode 100644
index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658
Binary files /dev/null and b/doc/fluid/images/dataflow_equations.png differ
diff --git a/doc/fluid/images/dcgan.png b/doc/fluid/images/dcgan.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28
Binary files /dev/null and b/doc/fluid/images/dcgan.png differ
diff --git a/doc/fluid/images/deep_learning.png b/doc/fluid/images/deep_learning.png
new file mode 100644
index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff
Binary files /dev/null and b/doc/fluid/images/deep_learning.png differ
diff --git a/doc/fluid/images/dist-graph.graffle b/doc/fluid/images/dist-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..941399c6ced8d5f65b6c595522b770c88259df4b
Binary files /dev/null and b/doc/fluid/images/dist-graph.graffle differ
diff --git a/doc/fluid/images/dist-graph.png b/doc/fluid/images/dist-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3546b09f1c2ee3e4f60f519d5e47f823f08051a7
Binary files /dev/null and b/doc/fluid/images/dist-graph.png differ
diff --git a/doc/fluid/images/distributed_architecture.graffle b/doc/fluid/images/distributed_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.graffle differ
diff --git a/doc/fluid/images/distributed_architecture.png b/doc/fluid/images/distributed_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356
Binary files /dev/null and b/doc/fluid/images/distributed_architecture.png differ
diff --git a/doc/fluid/images/ds2_network.png b/doc/fluid/images/ds2_network.png
new file mode 100644
index 0000000000000000000000000000000000000000..1a5b2184d47928cc2849d5a7c8ea2d8cf5337e11
Binary files /dev/null and b/doc/fluid/images/ds2_network.png differ
diff --git a/doc/fluid/images/executor.png b/doc/fluid/images/executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..b29c0d779e3d46b779b5baeabe3176adaeb00a6d
Binary files /dev/null and b/doc/fluid/images/executor.png differ
diff --git a/doc/fluid/images/feed_forward.png b/doc/fluid/images/feed_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b
Binary files /dev/null and b/doc/fluid/images/feed_forward.png differ
diff --git a/doc/fluid/images/feed_forward_regularized.png b/doc/fluid/images/feed_forward_regularized.png
new file mode 100644
index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447
Binary files /dev/null and b/doc/fluid/images/feed_forward_regularized.png differ
diff --git a/doc/fluid/images/fluid-compiler.graffle b/doc/fluid/images/fluid-compiler.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c933df2cb855462c52b2d25f7f9a99b95652961d
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.graffle differ
diff --git a/doc/fluid/images/fluid-compiler.png b/doc/fluid/images/fluid-compiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b0ffed2039c91a3a00bbb719da08c91c3acf7bb
Binary files /dev/null and b/doc/fluid/images/fluid-compiler.png differ
diff --git a/doc/fluid/images/fluid_examples.png b/doc/fluid/images/fluid_examples.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa99472c0f914cde128fd7b3bd8dc29ac24f94b6
Binary files /dev/null and b/doc/fluid/images/fluid_examples.png differ
diff --git a/doc/fluid/images/fluid_module_1.png b/doc/fluid/images/fluid_module_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..554782ba54e43efc3d6babbb94e3cac3530ac649
Binary files /dev/null and b/doc/fluid/images/fluid_module_1.png differ
diff --git a/doc/fluid/images/fluid_module_2.png b/doc/fluid/images/fluid_module_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..4219efccbb1e87839adf6b5720fe46808b7d2fcf
Binary files /dev/null and b/doc/fluid/images/fluid_module_2.png differ
diff --git a/doc/fluid/images/graph_construction_example.bash b/doc/fluid/images/graph_construction_example.bash
new file mode 100755
index 0000000000000000000000000000000000000000..35e6997abd17588e17a82d448918fc1b3bd7220e
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.bash
@@ -0,0 +1,11 @@
+cat ./graph_construction_example.dot | \
+    sed 's/color=red/color=red, style=invis/g' | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_only.png
+
+cat ./graph_construction_example.dot | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_backward.png
+
+cat ./graph_construction_example.dot | \
+    dot -Tpng > graph_construction_example_all.png
diff --git a/doc/fluid/images/graph_construction_example.dot b/doc/fluid/images/graph_construction_example.dot
new file mode 100644
index 0000000000000000000000000000000000000000..e115f9844bae6ad24f638c8ed4749cea8aff06a9
--- /dev/null
+++ b/doc/fluid/images/graph_construction_example.dot
@@ -0,0 +1,68 @@
+digraph ImageClassificationGraph {
+        ///////// The forward part /////////
+        FeedX [label="Feed", color=blue, shape=box];
+        FeedY [label="Feed", color=blue, shape=box];
+        InitW [label="Init", color=blue, shape=diamond];
+        Initb [label="Init", color=blue, shape=diamond];
+        FC [label="FC", color=blue, shape=box];
+        MSE [label="MSE", color=blue, shape=box];
+
+        x [label="x", color=blue, shape=oval];
+        l [label="l", color=blue, shape=oval];
+        y [label="y", color=blue, shape=oval];
+        W [label="W", color=blue, shape=doublecircle];
+        b [label="b", color=blue, shape=doublecircle];
+        cost [label="cost", color=blue, shape=oval];
+
+        FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
+        FeedY -> l [color=blue];
+        InitW -> W [color=blue];
+        Initb -> b [color=blue];
+        W -> FC [color=blue];
+        b -> FC [color=blue];
+        l -> MSE [color=blue];
+
+        ////////// The backward part /////////
+        MSE_Grad [label="MSE_grad", color=red, shape=box];
+        FC_Grad [label="FC_grad", color=red, shape=box];
+
+        d_cost [label="d cost", color=red, shape=oval];
+        d_y [label="d y", color=red, shape=oval];
+        d_b [label="d b", color=red, shape=oval];
+        d_W [label="d W", color=red, shape=oval];
+
+        cost -> MSE_Grad [color=red];
+        d_cost -> MSE_Grad [color=red];
+        l -> MSE_Grad [color=red];
+        y -> MSE_Grad -> d_y [color=red];
+
+        x -> FC_Grad [color=red];
+        y -> FC_Grad [color=red];
+        d_y -> FC_Grad [color=red];
+        W -> FC_Grad -> d_W [color=red];
+        b -> FC_Grad -> d_b [color=red];
+
+        ////////// The optimizaiton part //////////
+
+        OPT_W [label="SGD", color=green, shape=box];
+        OPT_b [label="SGD", color=green, shape=box];
+
+        W -> OPT_W [color=green];
+        b -> OPT_b [color=green];
+        d_W -> OPT_W -> W [color=green];
+        d_b -> OPT_b -> b [color=green];
+
+        ////////// Groupings //////////
+
+        subgraph clusterMSE {
+                style=invis;
+                MSE;
+                MSE_Grad;
+        }
+
+        subgraph clusterFC {
+                style=invis;
+                FC;
+                FC_Grad;
+        }
+}
diff --git a/doc/fluid/images/graph_construction_example_all.png b/doc/fluid/images/graph_construction_example_all.png
new file mode 100644
index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_all.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_backward.png b/doc/fluid/images/graph_construction_example_forward_backward.png
new file mode 100644
index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/fluid/images/graph_construction_example_forward_only.png b/doc/fluid/images/graph_construction_example_forward_only.png
new file mode 100644
index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4
Binary files /dev/null and b/doc/fluid/images/graph_construction_example_forward_only.png differ
diff --git a/doc/fluid/images/l1_regularization.png b/doc/fluid/images/l1_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972
Binary files /dev/null and b/doc/fluid/images/l1_regularization.png differ
diff --git a/doc/fluid/images/l2_regularization.png b/doc/fluid/images/l2_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298
Binary files /dev/null and b/doc/fluid/images/l2_regularization.png differ
diff --git a/doc/fluid/images/layer.png b/doc/fluid/images/layer.png
new file mode 100644
index 0000000000000000000000000000000000000000..e46db4c9c6f5b65ff274b498b716b11de343a8b0
Binary files /dev/null and b/doc/fluid/images/layer.png differ
diff --git a/doc/fluid/images/local-graph.graffle b/doc/fluid/images/local-graph.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..19e509bd9af3c1e9a3f5e0f16ddd281457a339c5
Binary files /dev/null and b/doc/fluid/images/local-graph.graffle differ
diff --git a/doc/fluid/images/local-graph.png b/doc/fluid/images/local-graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..ada51200f793a9bb18911e7d63cfdb3244b967d7
Binary files /dev/null and b/doc/fluid/images/local-graph.png differ
diff --git a/doc/fluid/images/local_architecture.graffle b/doc/fluid/images/local_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877
Binary files /dev/null and b/doc/fluid/images/local_architecture.graffle differ
diff --git a/doc/fluid/images/local_architecture.png b/doc/fluid/images/local_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122
Binary files /dev/null and b/doc/fluid/images/local_architecture.png differ
diff --git a/doc/fluid/images/lookup_table.png b/doc/fluid/images/lookup_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..72dfe3547f731d0d090338afb206b0549dff472e
Binary files /dev/null and b/doc/fluid/images/lookup_table.png differ
diff --git a/doc/fluid/images/lookup_table_training.png b/doc/fluid/images/lookup_table_training.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc7cc4aeb3b885850fe2f70f19fb84d5873bed1e
Binary files /dev/null and b/doc/fluid/images/lookup_table_training.png differ
diff --git a/doc/fluid/images/loss_equation.png b/doc/fluid/images/loss_equation.png
new file mode 100644
index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e
Binary files /dev/null and b/doc/fluid/images/loss_equation.png differ
diff --git a/doc/fluid/images/multi-threads.graffle b/doc/fluid/images/multi-threads.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..e71173715fff92a0a933d0c7d83599ba948552c6
Binary files /dev/null and b/doc/fluid/images/multi-threads.graffle differ
diff --git a/doc/fluid/images/multi-threads@3x.png b/doc/fluid/images/multi-threads@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..e40a869987dbbf5019d4cb03c1dab55b74d6c9f9
Binary files /dev/null and b/doc/fluid/images/multi-threads@3x.png differ
diff --git a/doc/fluid/images/multigpu_allreduce.graffle b/doc/fluid/images/multigpu_allreduce.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cb5bc420ceafe8ba4c87694d44ee4e5e4ad06779
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.graffle differ
diff --git a/doc/fluid/images/multigpu_allreduce.png b/doc/fluid/images/multigpu_allreduce.png
new file mode 100644
index 0000000000000000000000000000000000000000..87a1b3e8f6dd4a713ec9df9f0037d1da04e9178a
Binary files /dev/null and b/doc/fluid/images/multigpu_allreduce.png differ
diff --git a/doc/fluid/images/multigpu_before_convert.graffle b/doc/fluid/images/multigpu_before_convert.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..6c35ab1b21fb76ceae82d3693ed0d085b5bc0855
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.graffle differ
diff --git a/doc/fluid/images/multigpu_before_convert.png b/doc/fluid/images/multigpu_before_convert.png
new file mode 100644
index 0000000000000000000000000000000000000000..9c8f7711165d80a2fa3911280fdee91855a401b1
Binary files /dev/null and b/doc/fluid/images/multigpu_before_convert.png differ
diff --git a/doc/fluid/images/multiple_reader.png b/doc/fluid/images/multiple_reader.png
new file mode 100644
index 0000000000000000000000000000000000000000..b22126b31db4982c13fc3a0827805e6aaf955046
Binary files /dev/null and b/doc/fluid/images/multiple_reader.png differ
diff --git a/doc/fluid/images/op.dot b/doc/fluid/images/op.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c8ad839cb88788e9b5906402257cc7bbc3ddcb54
--- /dev/null
+++ b/doc/fluid/images/op.dot
@@ -0,0 +1,4 @@
+digraph sample { 
+  graph [rankdir=TD]; node [shape=record];
+  op [label="{Operator| InferShape()=0\lRun()=0\l | map&#60;string, string[]&#62; inputs_\lmap&#60;string, string[]&#62; outputs_ \l AttributeMap attrs_\l}"]; 
+}
\ No newline at end of file
diff --git a/doc/fluid/images/op_op_with_kern_class_diagram.dot b/doc/fluid/images/op_op_with_kern_class_diagram.dot
new file mode 100644
index 0000000000000000000000000000000000000000..8f24e9ea83acf879c7008f2d97113c0a4cc111c3
--- /dev/null
+++ b/doc/fluid/images/op_op_with_kern_class_diagram.dot
@@ -0,0 +1,38 @@
+digraph sample { 
+  graph [rankdir=TD]; node [shape=record];
+  op [label="{Operator| InferShape()=0\lRun()=0\l | map&#60;string, string[]&#62; inputs_\lmap&#60;string, string[]&#62; outputs_ \l AttributeMap attrs_\l}"]; 
+  op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map&#60;OpKernelKey,OpKernel&#62;kernels_ }"]
+  op_kernel [label="{OpKernel | Compute()=0}"]
+  op_kernel_key [label="{OpKernelKey| Place place\n...}"]
+
+  op -> op_with_kern [dir=back, arrowtail=onormal]
+  op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
+
+  {
+    rank=same;
+    op_with_kern
+    op_kernel
+  }
+
+  op_kernel -> op_kernel_key [style=invis]
+
+  {
+    rank=same;
+    op_kernel
+    op_kernel_key
+  }
+
+  op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
+
+  mul_op [label="MulOp"]
+  op_with_kern -> mul_op [dir=back, arrowtail=onormal]
+  mul_kernel [label="template &#60;typename Place&#62;\lclass MulOpKernel\l"]
+  op_kernel -> mul_kernel [dir=back, arrowtail=onormal]
+  mul_op -> mul_kernel [arrowhead=vee, label="register many"]
+  
+  {
+    rank=same;
+    mul_op;
+    mul_kernel;
+  }
+}
\ No newline at end of file
diff --git a/doc/fluid/images/op_with_kernel.dot b/doc/fluid/images/op_with_kernel.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4f5af4f7b5f5a69693a058c99eb658900136077a
--- /dev/null
+++ b/doc/fluid/images/op_with_kernel.dot
@@ -0,0 +1,26 @@
+digraph sample { 
+  graph [rankdir=TD]; node [shape=record];
+  op [label="{Operator}"]; 
+  op_with_kern [label="{OpWithKernel | InferShape()=0\lRun()\l | map&#60;OpKernelKey,OpKernel&#62;kernels_ }"]
+  op_kernel [label="{OpKernel | Compute()=0}"]
+  op_kernel_key [label="{OpKernelKey| Place place\n...}"]
+
+  op -> op_with_kern [dir=back, arrowtail=onormal]
+  op_with_kern -> op_kernel [arrowhead=vee, label="contains many"]
+
+  {
+    rank=same;
+    op_with_kern
+    op_kernel
+  }
+
+  op_kernel -> op_kernel_key [style=invis]
+
+  {
+    rank=same;
+    op_kernel
+    op_kernel_key
+  }
+
+  op_with_kern -> op_kernel_key [arrowhead=vee, label ="\nas map key"]
+}
\ No newline at end of file
diff --git a/doc/fluid/images/operator1.png b/doc/fluid/images/operator1.png
new file mode 100644
index 0000000000000000000000000000000000000000..3975b06f615b7a88dfc11e71b6451fdf4ce42d60
Binary files /dev/null and b/doc/fluid/images/operator1.png differ
diff --git a/doc/fluid/images/operator2.png b/doc/fluid/images/operator2.png
new file mode 100644
index 0000000000000000000000000000000000000000..b7bb1fae2050d3a70797517bc20dbbdef3dfcb7c
Binary files /dev/null and b/doc/fluid/images/operator2.png differ
diff --git a/doc/fluid/images/paddle-compile.graffle b/doc/fluid/images/paddle-compile.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..a6348cc3dbcaca923c6e794681b2edb85cb9f8f6
Binary files /dev/null and b/doc/fluid/images/paddle-compile.graffle differ
diff --git a/doc/fluid/images/paddle-compile.png b/doc/fluid/images/paddle-compile.png
new file mode 100644
index 0000000000000000000000000000000000000000..e0f13d551ac41afaec627a57dea79356464bf0bf
Binary files /dev/null and b/doc/fluid/images/paddle-compile.png differ
diff --git a/doc/fluid/images/place.png b/doc/fluid/images/place.png
new file mode 100644
index 0000000000000000000000000000000000000000..14e77511d639af155e5a3725cde05323e0cc94f2
Binary files /dev/null and b/doc/fluid/images/place.png differ
diff --git a/doc/fluid/images/pprof_1.png b/doc/fluid/images/pprof_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb
Binary files /dev/null and b/doc/fluid/images/pprof_1.png differ
diff --git a/doc/fluid/images/pprof_2.png b/doc/fluid/images/pprof_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b
Binary files /dev/null and b/doc/fluid/images/pprof_2.png differ
diff --git a/doc/fluid/images/print_fluid_program.png b/doc/fluid/images/print_fluid_program.png
new file mode 100644
index 0000000000000000000000000000000000000000..e8e459e1b3d5c8706b3caa05dc371db8d46df4a5
Binary files /dev/null and b/doc/fluid/images/print_fluid_program.png differ
diff --git a/doc/fluid/images/profiler.png b/doc/fluid/images/profiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1
Binary files /dev/null and b/doc/fluid/images/profiler.png differ
diff --git a/doc/fluid/images/program_desc1.png b/doc/fluid/images/program_desc1.png
new file mode 100644
index 0000000000000000000000000000000000000000..0656336914ece957f2e5bb4d70ad337a63e31d88
Binary files /dev/null and b/doc/fluid/images/program_desc1.png differ
diff --git a/doc/fluid/images/program_desc2.png b/doc/fluid/images/program_desc2.png
new file mode 100644
index 0000000000000000000000000000000000000000..db5bfa1231345add8661b4f8ef0fc9d861f40d24
Binary files /dev/null and b/doc/fluid/images/program_desc2.png differ
diff --git a/doc/fluid/images/raw_input.png b/doc/fluid/images/raw_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..0725f92d2b169c2b59ec7c68b402859c2a2dd1d8
Binary files /dev/null and b/doc/fluid/images/raw_input.png differ
diff --git a/doc/fluid/images/readers.png b/doc/fluid/images/readers.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd59168ce16c9e2a0ef45303c28c997cfd7740be
Binary files /dev/null and b/doc/fluid/images/readers.png differ
diff --git a/doc/fluid/images/remote_executor.graffle b/doc/fluid/images/remote_executor.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..41b2067311694b56d211a4f32d1b76884eeffd2d
Binary files /dev/null and b/doc/fluid/images/remote_executor.graffle differ
diff --git a/doc/fluid/images/remote_executor.png b/doc/fluid/images/remote_executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..744e2fb2e0f1bbe058e991ba7b2a09000965ee79
Binary files /dev/null and b/doc/fluid/images/remote_executor.png differ
diff --git a/doc/fluid/images/rnn.dot b/doc/fluid/images/rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..c1141cd9c981bb3cbf50d8bf7a6ed210280d79a5
--- /dev/null
+++ b/doc/fluid/images/rnn.dot
@@ -0,0 +1,87 @@
+digraph G {
+  label = "simple RNN implementation" 
+
+  ranksep=2;
+
+  //graph [nodesep=1, ranksep=1];
+
+  node[nodesep=1]
+
+  subgraph cluster0 {
+    label = "global scope"
+    rankdir = TB
+    W
+    boot_memory
+    input
+    output
+  }
+
+  subgraph cluster1 {
+    label = "step-scope 0"
+    rankdir = TB
+    memory0[label="memory"]
+    prememory0[label="pre-memory"]
+    step_input0[label="step input"]
+    step_output0[label="step output"]
+  }
+
+  subgraph cluster2 {
+    label = "step-scope 1"
+    rankdir = TB
+    memory1[label="memory"]
+    prememory1[label="pre-memory"]
+    step_input1[label="step input"]
+    step_output1[label="step output"]
+  }
+
+  subgraph cluster3 {
+    label = "step-scope 2"
+    rankdir = TB
+    memory2[label="memory"]
+    prememory2[label="pre-memory"]
+    step_input2[label="step input"]
+    step_output2[label="step output"]
+  }
+
+  stepnet [shape=box]
+  stepnet0 [shape=box, style=dashed]
+  stepnet1 [shape=box, style=dashed]
+  stepnet2 [shape=box, style=dashed]
+
+
+  edge[color=blue]
+  boot_memory -> prememory0 [label="init" color="blue"]
+  memory0 -> prememory1  [label="copy/reference" color="blue"]
+  memory1 -> prememory2 [label="copy/reference" color="blue"]
+
+  edge[color=black]
+  W -> stepnet0[constraint=false, style=dashed]
+  W -> stepnet1[constraint=false, style=dashed]
+  W -> stepnet2[constraint=false, style=dashed]
+
+  memory0 -> stepnet0[style=dashed]
+  prememory0 -> stepnet0 -> step_output0[style=dashed]
+
+  memory1 -> stepnet1[style=dashed]
+  prememory1 -> stepnet1 -> step_output1[style=dashed]
+
+  memory2 -> stepnet2[style=dashed]
+  prememory2 -> stepnet2 -> step_output2[style=dashed]
+
+  input -> step_input0
+  input -> step_input1
+  input -> step_input2
+
+  step_input0 -> stepnet0 [style=dashed]
+  step_input1 -> stepnet1[style=dashed]
+  step_input2 -> stepnet2[style=dashed]
+
+  step_output0 -> output
+  step_output1 -> output
+  step_output2 -> output
+
+  stepnet0 -> stepnet[style=dashed]
+  stepnet1 -> stepnet[style=dashed]
+  stepnet2 -> stepnet[style=dashed]
+
+}
diff --git a/doc/fluid/images/rnn.jpg b/doc/fluid/images/rnn.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9867e404cf959df0dce6ded5222b466c788fb840
Binary files /dev/null and b/doc/fluid/images/rnn.jpg differ
diff --git a/doc/fluid/images/rnn.png b/doc/fluid/images/rnn.png
new file mode 100644
index 0000000000000000000000000000000000000000..e139e373fe8396782044cfd936fdde624f8c66fe
Binary files /dev/null and b/doc/fluid/images/rnn.png differ
diff --git a/doc/fluid/images/rnn_2level_data.dot b/doc/fluid/images/rnn_2level_data.dot
new file mode 100644
index 0000000000000000000000000000000000000000..1d85ae2617a915ad0ad8288d848b607cc37ad297
--- /dev/null
+++ b/doc/fluid/images/rnn_2level_data.dot
@@ -0,0 +1,75 @@
+digraph G {
+  chapter [label="chapter"]
+
+  subgraph cluster0 {
+    label = "paragraph 0"
+
+    top_rnn0[label="top rnn step 0" shape=box]
+
+    p0 [label="paragraph 0"]
+    p1 [label="paragraph 1"]
+  }
+
+  subgraph cluster1{
+    label = "paragraph 1"
+
+    top_rnn1[label="top rnn step 1" shape=box]
+
+    p2 [label="paragraph 0"]
+    p3 [label="paragraph 1"]
+  }
+
+  subgraph cluster_p0 {
+    label = "sentence 0"
+
+    low_rnn0 [label="low rnn step 0" shape=box]
+    s00 [label="sentence 0"]
+    s01 [label="sentence 1"]
+
+    low_rnn0 -> s00
+    low_rnn0 -> s01
+  }
+
+  subgraph cluster_p1 {
+    label = "sentence 1"
+    low_rnn1 [label="low rnn step 1" shape=box]
+    s10 [label="sentence 0"]
+    s11 [label="sentence 1"]
+    low_rnn1 -> s10
+    low_rnn1 -> s11
+  }
+
+  subgraph cluster_p2 {
+    label = "sentence 1"
+    low_rnn2 [label="low rnn step 0" shape=box]
+    s20 [label="sentence 0"]
+    s21 [label="sentence 1"]
+    low_rnn2 -> s20
+    low_rnn2 -> s21
+  }
+
+  subgraph cluster_p3 {
+    label = "sentence 1"
+    low_rnn3 [label="low rnn step 1" shape=box]
+    s30 [label="sentence 0"]
+    s31 [label="sentence 1"]
+    low_rnn3 -> s30
+    low_rnn3 -> s31
+  }
+
+
+  chapter -> top_rnn0
+  chapter -> top_rnn1
+
+  top_rnn0 -> p0
+  top_rnn0 -> p1
+  top_rnn1 -> p2
+  top_rnn1 -> p3
+
+
+  p0 -> low_rnn0
+  p1 -> low_rnn1
+  p2 -> low_rnn2
+  p3 -> low_rnn3
+
+}
diff --git a/doc/fluid/images/rnn_2level_data.png b/doc/fluid/images/rnn_2level_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..4be81b2430717a6a506342a09fc26899568574c6
Binary files /dev/null and b/doc/fluid/images/rnn_2level_data.png differ
diff --git a/doc/fluid/images/scope_variable_tensor.png b/doc/fluid/images/scope_variable_tensor.png
new file mode 100644
index 0000000000000000000000000000000000000000..59b0de6fb36f9f6b469227c05760a7612bb30b4d
Binary files /dev/null and b/doc/fluid/images/scope_variable_tensor.png differ
diff --git a/doc/fluid/images/single-thread@3x.png b/doc/fluid/images/single-thread@3x.png
new file mode 100644
index 0000000000000000000000000000000000000000..4083aebfdd45af5fbac25fa2c4176bc08c3cb44a
Binary files /dev/null and b/doc/fluid/images/single-thread@3x.png differ
diff --git a/doc/fluid/images/sorted_input.png b/doc/fluid/images/sorted_input.png
new file mode 100644
index 0000000000000000000000000000000000000000..ff601128368ee179e3fd33e5e295a9ddd3dcbaeb
Binary files /dev/null and b/doc/fluid/images/sorted_input.png differ
diff --git a/doc/fluid/images/sparse_update.graffle b/doc/fluid/images/sparse_update.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..08d689a58f83698d8c1158ee3990ed8abf3a7a9a
Binary files /dev/null and b/doc/fluid/images/sparse_update.graffle differ
diff --git a/doc/fluid/images/sparse_update.png b/doc/fluid/images/sparse_update.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c872e6ac479f7d1b818a4a207956c43155d0ad7
Binary files /dev/null and b/doc/fluid/images/sparse_update.png differ
diff --git a/doc/fluid/images/test.dot b/doc/fluid/images/test.dot
new file mode 100644
index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a
--- /dev/null
+++ b/doc/fluid/images/test.dot
@@ -0,0 +1,35 @@
+
+digraph Test {
+    z -> generator -> G_img;
+    G_img -> discriminator -> D_f -> d_loss_f;
+    label0 -> d_loss_f -> d_loss;
+
+    img -> discriminator -> D_t -> d_loss_t;
+    label1 -> d_loss_t -> d_loss;
+
+    d_loss -> d_loss_t[color=red, style=dashed];
+    d_loss -> d_loss_f[color=red, style=dashed];
+    d_loss_t -> D_t[color=red, style=dashed];
+    d_loss_f -> D_f[color=red, style=dashed];
+    D_t -> discriminator[color=red, style=dashed];
+    D_f -> discriminator[color=red, style=dashed];
+
+    D_f -> g_loss;
+    label2 -> g_loss;
+
+    g_loss -> D_f[color=green, style=dashed];
+    D_f -> discriminator[color=green, style=dashed];
+    discriminator -> G_img[color=green, style=dashed];
+    G_img -> generator[color=green, style=dashed];
+
+    discriminator [color=red, shape=box];
+    generator [color=green, shape=box];
+    z [shape=diamond];
+    img [shape=diamond];
+    label0 [shape=diamond];
+    label1 [shape=diamond];
+    label2 [shape=diamond];
+
+    d_loss [color=red];
+    g_loss [color=green];
+}
diff --git a/doc/fluid/images/test.dot.png b/doc/fluid/images/test.dot.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55
Binary files /dev/null and b/doc/fluid/images/test.dot.png differ
diff --git a/doc/fluid/images/theta_star.gif b/doc/fluid/images/theta_star.gif
new file mode 100644
index 0000000000000000000000000000000000000000..dd24d33e124396be3fc410c9b12f33148f64efe2
Binary files /dev/null and b/doc/fluid/images/theta_star.gif differ
diff --git a/doc/fluid/images/timeline.jpeg b/doc/fluid/images/timeline.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385
Binary files /dev/null and b/doc/fluid/images/timeline.jpeg differ
diff --git a/doc/fluid/images/tracing.jpeg b/doc/fluid/images/tracing.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5
Binary files /dev/null and b/doc/fluid/images/tracing.jpeg differ
diff --git a/doc/fluid/images/transpiler.png b/doc/fluid/images/transpiler.png
new file mode 100644
index 0000000000000000000000000000000000000000..422973c0dc7aa2b544d2fc86a97ace706388cb9e
Binary files /dev/null and b/doc/fluid/images/transpiler.png differ
diff --git a/doc/fluid/images/user_interface.png b/doc/fluid/images/user_interface.png
new file mode 100644
index 0000000000000000000000000000000000000000..ffc94e3d8945ec6291460afd90e8fcc600828390
Binary files /dev/null and b/doc/fluid/images/user_interface.png differ
diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d878d192cae7ee9e8b8fdb4f615839c186fdf334
--- /dev/null
+++ b/doc/fluid/index_cn.rst
@@ -0,0 +1,12 @@
+ PaddlePaddle Fluid
+==========================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_cn.rst
+  build_and_install/index_cn.rst
+  design/index_cn.rst
+  howto/index_cn.rst
+  dev/index_cn.rst
+  faq/index_cn.rst
diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2bc76b58982cf50e637d15cca0c5d78166aa73a9
--- /dev/null
+++ b/doc/fluid/index_en.rst
@@ -0,0 +1,12 @@
+ PaddlePaddle Fluid
+==========================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_en.rst
+  build_and_install/index_en.rst
+  design/index_en.rst
+  howto/index_en.rst
+  dev/index_en.rst
+  faq/index_en.rst
diff --git a/doc/mobile/CMakeLists.txt b/doc/mobile/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7b34ba8d0768427802b11614c6962f3c3f6ef4e3
--- /dev/null
+++ b/doc/mobile/CMakeLists.txt
@@ -0,0 +1,52 @@
+if(NOT DEFINED SPHINX_THEME)
+    set(SPHINX_THEME default)
+endif()
+
+if(NOT DEFINED SPHINX_THEME_DIR)
+    set(SPHINX_THEME_DIR)
+endif()
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
+configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+        "${BINARY_BUILD_DIR_EN}/conf.py"
+        @ONLY)
+
+sphinx_add_target(paddle_mobile_docs
+        html
+        ${BINARY_BUILD_DIR_EN}
+        ${SPHINX_CACHE_DIR_EN}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+        ${SPHINX_HTML_DIR_EN})
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output director
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in"
+        "${BINARY_BUILD_DIR_CN}/conf.py"
+        @ONLY)
+
+sphinx_add_target(paddle_mobile_docs_cn
+        html
+        ${BINARY_BUILD_DIR_CN}
+        ${SPHINX_CACHE_DIR_CN}
+        ${CMAKE_CURRENT_SOURCE_DIR}
+        ${SPHINX_HTML_DIR_CN})
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
index cdd6917239371a660d0df05bb623f0b94f8f11a3..0607748b751e9f2d606236d9e98868335379b05c 100644
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -63,16 +63,16 @@ Android的Docker开发镜像向用户提供两个可配置的参数：
 - 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
 
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
 ```
 
 - 编译`arm64-v8a`，`Android API 21`的PaddlePaddle库
 
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev ./paddle/scripts/paddle_build.sh build_android
 ```
 
-执行上述`docker run`命令时，容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+执行上述`docker run`命令时，容器执行[paddle/scripts/paddle_build.sh build_android](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
 
 ## 基于Linux交叉编译环境的编译方式
 本文档将以Linux x86-64平台为例，介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md
index 6af16fc114a2310e364023ec43cc3c64149af8f7..572063e8012efee2d2e142eb57e459e0e8c6382c 100644
--- a/doc/mobile/cross_compiling_for_android_en.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -36,7 +36,7 @@ $ docker pull docker.paddlepaddlehub.com/paddle:latest-dev-android
 We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
 
 ```bash
-$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
+$ docker run -it --rm -v $PWD:/paddle -w /paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android ./paddle/scripts/paddle_build.sh build_android
 ```
 
 The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
@@ -70,7 +70,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
 
 The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
 
-The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`.  For information about other configuration arguments, please continue reading.
+The build command, [`paddle/scripts/paddle_build.sh build_android`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/paddle_build.sh) generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the argument: `ANDROID_ABI` or `ANDROID_API`.  For information about other configuration arguments, please continue reading.
 
 The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
 
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..56d1515005f6e40b084c6b2184c6a0b3e3a00496
--- /dev/null
+++ b/doc/mobile/index_cn.rst
@@ -0,0 +1,9 @@
+移动端
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_cn.md
+  cross_compiling_for_ios_cn.md
+  cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e0acdff0284e3bc84b2cc4a34a142ee01754f940
--- /dev/null
+++ b/doc/mobile/index_en.rst
@@ -0,0 +1,9 @@
+Mobile
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_en.md
+  cross_compiling_for_ios_en.md
+  cross_compiling_for_raspberry_en.md
diff --git a/doc/survey/dynamic_graph.md b/doc/survey/dynamic_graph.md
new file mode 100644
index 0000000000000000000000000000000000000000..6b80b014b1b1dc50f425e1296f70984c9e9b1cbd
--- /dev/null
+++ b/doc/survey/dynamic_graph.md
@@ -0,0 +1,378 @@
+# Automatic Differentiation with the Tape
+
+## Automatic Differentiation
+
+A key challenge in the field of deep learning is to automatically derive the backward pass from the forward pass described algorithmically by researchers.  Such a derivation, or a transformation of the forward pass program, has been long studied before the recent prosperity of deep learning in the field known as [automatic differentiation](https://arxiv.org/pdf/1502.05767.pdf).
+
+## The Tape
+
+Given the forward pass program (usually in Python in practices), there are two strategies to derive the backward pass:
+
+1. from the forward pass program itself, or
+1. from the execution trace of the forward pass program, which is often known as the *tape*.
+
+This article surveys systems that follow the latter strategy.
+
+## Dynamic Network
+
+When we train a deep learning model, the tape changes every iteration as the input data change, so we have to re-derive the backward pass every iteration.  This is known as *dynamic network*.
+
+Deep learning systems that utilize the idea of dynamic network gained their popularities in recent years.  This article surveys two representative systems: [PyTorch](https://pytorch.org/) and [DyNet](https://dynet.readthedocs.io/en/latest/).
+
+## An Overview
+
+Both frameworks record a ‘tape’ of the computation and interpreting (or run-time compiling) a transformation of the tape played back in reverse. This tape is a different kind of entity than the original program.[[link]](http://www.bcl.hamilton.ie/~barak/papers/toplas-reverse.pdf)
+
+Consider the following code feedforward model.
+
+```python
+x = Variable(randn(20, 1)))
+label = Variable(randint(1))
+W_1, W_2 = Variable(randn(20, 20)), Variable(randn(10, 20))
+h = matmul(W_1, x)
+pred = matmul(W_2, x)
+loss = softmax(pred, label)
+loss.backward()
+```
+
+### 1) Dynet uses List to encode the Tape
+
+During the forward execution, a list of operators, in this case `matmul`, `matmul` and `softmax`, are recorded in the tape, along with the necessary information needed to do the backward such as pointers to the inputs and outputs. Then the tape is played in reverse order at `loss.backward()`.
+
+<details> 
+<summary></summary>
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+    node [
+        fontsize = "16"
+        shape = "ellipse"
+    ];
+    edge [];
+    "node0" [
+        label = "<f0> type: matmul | <f1> input: W_1, x | <f2> output: h"
+        shape = "record"
+    ];
+    "node1" [
+        label = "<f0> type: matmul | <f1> input: W_2, h | <f2> output: pred"
+        shape = "record"
+    ];
+    "node2" [
+        label = "<f0> type: softmax | <f1> input: pred, label | <f2> output: loss"
+        shape = "record"
+    ];
+    "node0":f0 -> "node1":f0 [];
+    "node1":f0 -> "node2":f0 [];
+}
+</details>
+
+![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22ellipse%22%20];%20edge%20[];%20%22node0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_1,%20x%20|%20%3Cf2%3E%20output:%20h%22%20shape%20=%20%22record%22%20];%20%22node1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20%3Cf1%3E%20input:%20W_2,%20h%20|%20%3Cf2%3E%20output:%20pred%22%20shape%20=%20%22record%22%20];%20%22node2%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20%3Cf1%3E%20input:%20pred,%20label%20|%20%3Cf2%3E%20output:%20loss%22%20shape%20=%20%22record%22%20];%20%22node0%22:f0%20-%3E%20%22node1%22:f0%20[%20id%20=%200%20];%20%22node1%22:f0%20-%3E%20%22node2%22:f0%20[%20id%20=%201%20];%20})
+
+### 2) Pytorch uses Node Graph to encode the Tape
+
+The graph is composed of `Variable`s and `Function`s. During the forward execution, a `Variable` records its creator function, e.g. `h.creator = matmul`. And a Function records its inputs' previous/dependent functions `prev_func` through `creator`, e.g. `matmul.prev_func = matmul1`. At `loss.backward()`, a topological sort is performed on all `prev_func`s. Then the grad op is performed by the sorted order.
+
+<details> 
+<summary></summary>
+digraph g {
+    graph [
+        rankdir = "LR"
+    ];
+    
+    subgraph function {
+        node [
+            fontsize = "16"
+            style = filled
+            shape = "record"
+        ];
+        "matmul0" [ label = "<f0> type: matmul | prev_func: None" ];
+        "matmul1" [ label = "<f0> type: matmul | prev_func: matmul" ];
+        "softmax" [ label = "<f0> type: softmax | prev_func: matmul" ];
+    }
+    
+    subgraph variable {
+        node [
+            fontsize = "16"
+            shape = "Mrecord"
+            style = filled
+            fillcolor = white
+        ];
+        "x" [ label = "<f0> x | <f1> creator: None" ];
+        "label" [ label = "<f0> label | <f1> creator: None" ];
+        "W_1" [ label = "<f0> W_1 | <f1> creator: None" ];
+        "W_2" [ label = "<f0> W_2 | <f1> creator: None" ];
+        "h" [ label = "<f0> h | <f1> creator: None" ];
+        "pred" [ label = "<f0> pred | <f1> creator: matmul" ];
+        "loss" [ label = "<f0> loss | <f1> creator: softmax" ];
+    }
+    
+    subgraph data_flow {
+        "x":f0 -> "matmul0":f0;
+        "W_1":f0 -> "matmul0":f0;
+        "matmul0":f0 -> "h":f0;
+    
+        "h":f0 -> "matmul1":f0;
+        "W_2":f0 -> "matmul1":f0;
+        "matmul1":f0 -> "pred":f0;
+    
+        "pred":f0 -> "softmax":f0;
+        "label":f0 -> "softmax":f0;
+        "softmax":f0 -> "loss":f0;
+    }
+
+    subgraph prev_func {
+        edge [color="red", arrowsize="0.6", penwidth="1", constraint=false];
+        "matmul1":f1 -> "matmul0":f0;
+        "softmax":f1 -> "matmul1":f0;
+        label = "prev_func";
+    }
+}
+</details>
+
+![Alt text](https://g.gravizo.com/svg?digraph%20g%20{%20graph%20[%20rankdir%20=%20%22LR%22%20];%20subgraph%20function%20{%20node%20[%20fontsize%20=%20%2216%22%20style%20=%20filled%20shape%20=%20%22record%22%20];%20%22matmul0%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20None%22%20];%20%22matmul1%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20matmul%20|%20prev_func:%20matmul%22%20];%20%22softmax%22%20[%20label%20=%20%22%3Cf0%3E%20type:%20softmax%20|%20prev_func:%20matmul%22%20];%20}%20subgraph%20variable%20{%20node%20[%20fontsize%20=%20%2216%22%20shape%20=%20%22Mrecord%22%20style%20=%20filled%20fillcolor%20=%20white%20];%20%22x%22%20[%20label%20=%20%22%3Cf0%3E%20x%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22label%22%20[%20label%20=%20%22%3Cf0%3E%20label%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_1%22%20[%20label%20=%20%22%3Cf0%3E%20W_1%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22W_2%22%20[%20label%20=%20%22%3Cf0%3E%20W_2%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22h%22%20[%20label%20=%20%22%3Cf0%3E%20h%20|%20%3Cf1%3E%20creator:%20None%22%20];%20%22pred%22%20[%20label%20=%20%22%3Cf0%3E%20pred%20|%20%3Cf1%3E%20creator:%20matmul%22%20];%20%22loss%22%20[%20label%20=%20%22%3Cf0%3E%20loss%20|%20%3Cf1%3E%20creator:%20softmax%22%20];%20}%20subgraph%20data_flow%20{%20%22x%22:f0%20-%3E%20%22matmul0%22:f0;%20%22W_1%22:f0%20-%3E%20%22matmul0%22:f0;%20%22matmul0%22:f0%20-%3E%20%22h%22:f0;%20%22h%22:f0%20-%3E%20%22matmul1%22:f0;%20%22W_2%22:f0%20-%3E%20%22matmul1%22:f0;%20%22matmul1%22:f0%20-%3E%20%22pred%22:f0;%20%22pred%22:f0%20-%3E%20%22softmax%22:f0;%20%22label%22:f0%20-%3E%20%22softmax%22:f0;%20%22softmax%22:f0%20-%3E%20%22loss%22:f0;%20}%20subgraph%20prev_func%20{%20edge%20[color=%22red%22,%20arrowsize=%220.6%22,%20penwidth=%221%22,%20constraint=false];%20%22matmul1%22:f1%20-%3E%20%22matmul0%22:f0;%20%22softmax%22:f1%20-%3E%20%22matmul1%22:f0;%20label%20=%20%22prev_func%22;%20}%20})
+
+Chainer and Autograd uses the similar techniques to record the forward pass. For details please refer to the appendix.
+
+## Design choices
+
+### 1) Dynet's List vs Pytorch's Node Graph
+
+What's good about List:
+1. It avoids a topological sort. One only needs to traverse the list of operators in reverse and calling the corresponding backward operator.
+1. It promises effient data parallelism implementations. One could count the time of usage of a certain variable during the construction list. Then in the play back, one knows the calculation of a variable has completed. This enables communication and computation overlapping.
+
+What's good about Node Graph:
+1. More flexibility. PyTorch users can mix and match independent graphs however they like, in whatever threads they like (without explicit synchronization). An added benefit of structuring graphs this way is that when a portion of the graph becomes dead, it is automatically freed. [[2]](https://openreview.net/pdf?id=BJJsrmfCZ) Consider the following example, Pytorch only does backward on SmallNet while Dynet does both BigNet and SmallNet.
+```python
+result = BigNet(data)
+loss = SmallNet(data)
+loss.backward()
+```
+
+### 2) Dynet's Lazy evaluation vs Pytorch's Immediate evaluation
+
+Dynet builds the list in a symbolic matter. Consider the following example
+```python
+for epoch in range(num_epochs):
+    for in_words, out_label in training_data:
+        dy.renew_cg()
+        W = dy.parameter(W_p)
+        b = dy.parameter(b_p)
+        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
+        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
+        loss_val = loss_sym.value()
+        loss_sym.backward()
+```
+The computation of `lookup`, `concat`, `matmul` and `softmax` didn't happen until the call of `loss_sym.value()`. This defered execution is useful because it allows some graph-like optimization possible, e.g. kernel fusion.
+
+Pytorch chooses immediate evaluation. It avoids ever materializing a "forward graph"/"tape" (no need to explicitly call `dy.renew_cg()` to reset the list), recording only what is necessary to differentiate the computation, i.e. `creator` and `prev_func`.
+
+
+## What can fluid learn from them?
+
+Please refer to `paddle/contrib/dynamic/`.
+
+# Appendix
+
+### Overview
+
+| Framework | Has Tape | Core in C++ | First Release Date |
+|-----------|----------|-------------|--------------------|
+| Autograd  | No       | No          | Mar 5, 2015        |
+| Chainer   | No       | No          | Jun 5, 2015        |
+| Pytorch   | No       | Yes         | Aug 31, 2016       |
+| Dynet     | Yes      | Yes         | Oct 12, 2016       |
+
+### Source Code
+#### Autograd
+[Backward code](https://github.com/HIPS/autograd/blob/442205dfefe407beffb33550846434baa90c4de7/autograd/core.py#L8-L40). In the forward pass, a graph of VJPNode is constructed.
+```python
+# User API
+def make_grad(fun, x):
+    start_node = VJPNode.new_root()
+    end_value, end_node =  trace(start_node, fun, x)
+    return backward_pass(g, end_node), end_value
+
+# trace the forward pass by creating VJPNodes
+def trace(start_node, fun, x):
+    with trace_stack.new_trace() as t:
+        start_box = new_box(x, t, start_node)
+        end_box = fun(start_box)
+        return end_box._value, end_box._node
+
+def backward_pass(g, end_node):
+    outgrads = {end_node : (g, False)}
+    for node in toposort(end_node):
+        outgrad = outgrads.pop(node)
+        ingrads = node.vjp(outgrad[0])
+        for parent, ingrad in zip(node.parents, ingrads):
+            outgrads[parent] = add_outgrads(outgrads.get(parent), ingrad)
+    return outgrad[0]
+
+# Every VJPNode corresponds to a op_grad
+class VJPNode(Node):
+    __slots__ = ['parents', 'vjp']
+    def __init__(self, value, fun, args, kwargs, parent_argnums, parents):
+        self.parents = parents
+        vjpmaker = primitive_vjps[fun]
+        self.vjp = vjpmaker(parent_argnums, value, args, kwargs)
+```
+#### Chainer
+Example Code
+```python
+# (1) Function Set definition, creates FunctionNode
+model = FunctionSet(
+    l1=F.Linear(784, 100),
+    l2=F.Linear(100, 100),
+    l3=F.Linear(100, 10)).to_gpu()
+
+# (2) Optimizer Setup
+opt = optimizers.SGD()
+opt.setup(model)
+
+# (3) Forward computation
+def forward(x, t):
+    h1 = F.relu(model.l1(x))
+    h2 = F.relu(model.l2(h1))
+    y = model.l3(h2)
+    return F.softmax_cross_entropy(y, t)
+
+# (4) Training loop
+for epoch in xrange(n_epoch):
+    for i in xrange(0, N, b_size):
+        x = Variable(to_gpu(...))
+        t = Variable(to_gpu(...))
+        opt.zero_grads()
+        loss = forward(x, t)
+        loss.backward()
+        opt.update()
+```
+In `forward(x, t)`, a graph of [`VariableNode`](https://github.com/chainer/chainer/blob/master/chainer/variable.py#L110) and [`FunctionNode`](https://github.com/chainer/chainer/blob/a69103a4aa59d5b318f39b01dbcb858d465b89cf/chainer/function_node.py#L19) is constructed. Every output's `VariableNode.creator` is pointed to the `FunctionNode`.
+```python
+class FunctionNode(object):
+    ...
+    def apply(self, inputs):
+        outputs = self.forward(inputs)
+        ret = tuple([variable.Variable(y, requires_grad=requires_grad)
+                     for y in outputs])
+        # Topological ordering
+        self.rank = max([x.rank for x in inputs]) if input_vars else 0
+        # Add backward edges
+        for y in ret:
+            y.creator_node = self
+        self.inputs = tuple([x.node for x in input_vars])
+        self.outputs = tuple([y.node for y in ret])
+
+        return ret
+```
+`loss.backward()` will calculate the accumulated gradient of all variables. All the backward of `FunctionNode`s will be called based on the topological order.
+```python
+class VariableNode(object):
+    ...
+    def backward(self, retain_grad, loss_scale):
+        if self.creator_node is None:
+            return
+
+        cand_funcs = []
+        seen_set = set()
+        grads = {}
+
+        # Initialize error by 1, if this is a loss variable
+        if self.data.size == 1 and self._grad_var is None:
+            self.grad = numpy.ones_like(self.data)
+        grads[self._node] = self._grad_var
+
+        def add_cand(cand):
+            if cand not in seen_set:
+                # Negate since heapq is min-heap. This is a global variable
+                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
+                seen_set.add(cand)
+
+        add_cand(self.creator_node)
+
+        while cand_funcs:
+            _, _, func = heapq.heappop(cand_funcs)
+            gxs = func.backward_accumulate(func.inputs, func.outputs, func.outputs.grad)
+
+            for x, gx in enumerate(gxs):
+                if x in grads:
+                    grads[x] += gx
+                else:
+                    grads[x] = gx
+
+                if x.creator_node is not None:
+                    add_cand(x.creator_node)
+```
+
+#### PyTorch
+Example Code
+```python
+x = Variable(torch.ones(5, 5))
+y = Variable(torch.ones(5, 5) * 4)
+z = x ** 2 + x * 2 + x * y + y
+z.backward(torch.ones(5, 5))
+```
+The trace is done by `Variable.creator` and `Function.previous_functions`.
+```python
+class Variable(object):
+    def __init__(self, tensor, creator=None, requires_grad=True):
+        if creator is None:
+            creator = Leaf(self, requires_grad)
+        self.data = tensor
+        self.creator = creator
+        self._grad = None
+
+    def backward(self, gradient=None):
+        if gradient is None:
+            if self.data.numel() != 1:
+                raise RuntimeError('backward should be called only on a scalar (i.e. 1-element tensor) or with gradient w.r.t. the variable')
+            gradient = self.data.new(1).fill_(1)
+        self._execution_engine.run_backward(self, gradient)
+
+class Function(obejct):
+    # ...
+    def _do_forward(self, *input):
+        unpacked_input = tuple(arg.data for arg in input)
+        raw_output = self.forward(*unpacked_input)
+
+        # mark output.creator = self for backward trace
+        output = tuple(Variable(tensor, self) for tensor in raw_output)
+
+        self.previous_functions = [(arg.creator, id(arg)) for arg in input]
+        self.output_ids = {id(var): i for i, var in enumerate(output)}
+        return output
+
+    def _do_backward(self, grad_output):
+        return self.backwaerd(grad_output)
+```
+The [backward](https://github.com/pytorch/pytorch/blob/v0.1.1/torch/autograd/engine.py) is similar to Autograd.
+
+#### DyNet
+Example code
+```python
+model = dy.model()
+W_p = model.add_parameters((20, 100))
+b_p = model.add_parameters(20)
+E = model.add_lookup_parameters((20000, 50))
+for epoch in range(num_epochs):
+    for in_words, out_label in training_data:
+        dy.renew_cg() # init tape
+        W = dy.parameter(W_p)
+        b = dy.parameter(b_p)
+        score_sym = dy.softmax(W*dy.concatenate([E[in_words[0]],E[in_words[1]]])+b)
+        loss_sym = dy.pickneglogsoftmax(score_sym, out_label)
+        loss_val = loss_sym.value()
+        loss_sym.backward()
+```
+[forward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L84-L158), [backward](https://github.com/clab/dynet/blob/740a9626a13a2732544de142e256ad0d0a166658/dynet/exec.cc#L166-L284). The trace is done by creating a tape of expressions in every iteration. Backward is done by traverse the tape in the reverse order.
+```c++
+void SimpleExecutionEngine::backward(VariableIndex from_where, bool full) {
+  ...  
+  for (int i = num_nodes - 1; i >= 0; --i) {
+    // each node corresponds to an op
+    node->backward(xs, node_fx, node_dEdfx, ai, node_dEdxai);
+  }
+  ...
+}
+```
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 260b6c9fd1b364433cae098bacea77aa7fe9e266..890f70615538af23cd05b9ffd685e870a5644cdb 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,11 +13,11 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-import paddle
-import paddle.v2
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index e5757b86b43001bc6090d8edd0aaa5ff4fc476ee..5b09464cb991f96127edec40f7dbbc97a8d82582 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,11 +13,11 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_BINARY_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-import paddle
-import paddle.v2
+@IMPORT_PADDLE_STRING@
+@IMPORT_PADDLEV2_STRING@
 
 
 MarkdownParser = parser.CommonMarkParser
diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt
index 286fe8845cd7a909d4030540e72362864b536063..d230a1b9217eea6740419822f350096e361a4435 100644
--- a/doc/v2/CMakeLists.txt
+++ b/doc/v2/CMakeLists.txt
@@ -15,12 +15,15 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "")
+set(IMPORT_PADDLEV2_STRING "")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
     @ONLY)
 
-sphinx_add_target(paddle_docs
+sphinx_add_target(paddle_v2_docs
                   html
                   ${BINARY_BUILD_DIR_EN}
                   ${SPHINX_CACHE_DIR_EN}
@@ -41,7 +44,7 @@ configure_file(
     "${BINARY_BUILD_DIR_CN}/conf.py"
     @ONLY)
 
-sphinx_add_target(paddle_docs_cn
+sphinx_add_target(paddle_v2_docs_cn
                   html
                   ${BINARY_BUILD_DIR_CN}
                   ${SPHINX_CACHE_DIR_CN}
diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt
index 2ad589e8a260e48d46cba2300d6e2bcd4bdd8019..0c74522cb089b17c8419e9058f76631b0fe0df93 100644
--- a/doc/v2/api/CMakeLists.txt
+++ b/doc/v2/api/CMakeLists.txt
@@ -7,14 +7,19 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 # HTML output director
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
+set(IMPORT_PADDLE_STRING "import paddle")
+set(IMPORT_PADDLEV2_STRING "import paddle.v2")
+
 configure_file(
     "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in"
     "${BINARY_BUILD_DIR_EN}/conf.py"
     @ONLY)
 
-sphinx_add_target(paddle_api_docs
+sphinx_add_target(paddle_v2_apis
                   html
                   ${BINARY_BUILD_DIR_EN}
                   ${SPHINX_CACHE_DIR_EN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
+
+add_dependencies(paddle_v2_apis  gen_proto_py framework_py_proto copy_paddle_pybind paddle_python)
diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst
index 9ac972fb193a2fb525edc507f7ba1303d2c8eabe..458d892e825a7a9bbe7843ad5c508bd5a31f5f0f 100644
--- a/doc/v2/api/config/evaluators.rst
+++ b/doc/v2/api/config/evaluators.rst
@@ -101,7 +101,7 @@ value_printer
     :noindex:
 
 Detection
-=====
+==========
 
 detection_map
 -------------
diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst
index 29388f5005bf779a1bfa63c0d46d35996c0c792d..5a0cfadfce84df41defdf518b7c3a6222d5b30a1 100644
--- a/doc/v2/api/config/layer.rst
+++ b/doc/v2/api/config/layer.rst
@@ -11,7 +11,7 @@ Data layer
 
 data
 ----
-..  autoclass:: paddle.v2.layer.data
+..  autofunction:: paddle.v2.layer.data
     :noindex:
 
 Fully Connected Layers
@@ -21,12 +21,12 @@ Fully Connected Layers
 
 fc
 --
-..  autoclass:: paddle.v2.layer.fc
+..  autofunction:: paddle.v2.layer.fc
     :noindex:
 
 selective_fc
 ------------
-..  autoclass:: paddle.v2.layer.selective_fc
+..  autofunction:: paddle.v2.layer.selective_fc
     :noindex:
 
 Conv Layers
@@ -34,34 +34,34 @@ Conv Layers
 
 conv_operator
 -------------
-..  autoclass:: paddle.v2.layer.conv_operator
+..  autofunction:: paddle.v2.layer.conv_operator
     :noindex:
 
 conv_projection
 ---------------
-..  autoclass:: paddle.v2.layer.conv_projection
+..  autofunction:: paddle.v2.layer.conv_projection
     :noindex:
 
 conv_shift
 ----------
-..  autoclass:: paddle.v2.layer.conv_shift
+..  autofunction:: paddle.v2.layer.conv_shift
     :noindex:
 
 img_conv
 --------
-..  autoclass:: paddle.v2.layer.img_conv
+..  autofunction:: paddle.v2.layer.img_conv
     :noindex:
 
 ..  _api_v2.layer_context_projection:
 
 context_projection
 ------------------
-..  autoclass:: paddle.v2.layer.context_projection
+..  autofunction:: paddle.v2.layer.context_projection
     :noindex:
 
 row_conv
 --------
-..  autoclass:: paddle.v2.layer.row_conv
+..  autofunction:: paddle.v2.layer.row_conv
     :noindex:
 
 Image Pooling Layer
@@ -69,27 +69,27 @@ Image Pooling Layer
 
 img_pool
 --------
-..  autoclass:: paddle.v2.layer.img_pool
+..  autofunction:: paddle.v2.layer.img_pool
     :noindex:
 
 spp
 ---
-..  autoclass:: paddle.v2.layer.spp
+..  autofunction:: paddle.v2.layer.spp
     :noindex:
 
 maxout
 ------
-..  autoclass:: paddle.v2.layer.maxout
+..  autofunction:: paddle.v2.layer.maxout
     :noindex:
 
 roi_pool
 --------
-..  autoclass:: paddle.v2.layer.roi_pool
+..  autofunction:: paddle.v2.layer.roi_pool
     :noindex:
 
 pad
 ----
-..  autoclass:: paddle.v2.layer.pad
+..  autofunction:: paddle.v2.layer.pad
     :noindex:
 
 Norm Layer
@@ -97,27 +97,27 @@ Norm Layer
 
 img_cmrnorm
 -----------
-..  autoclass:: paddle.v2.layer.img_cmrnorm
+..  autofunction:: paddle.v2.layer.img_cmrnorm
     :noindex:
 
 batch_norm
 ----------
-..  autoclass:: paddle.v2.layer.batch_norm
+..  autofunction:: paddle.v2.layer.batch_norm
     :noindex:
 
 sum_to_one_norm
 ---------------
-..  autoclass:: paddle.v2.layer.sum_to_one_norm
+..  autofunction:: paddle.v2.layer.sum_to_one_norm
     :noindex:
 
 cross_channel_norm
 ------------------
-..  autoclass:: paddle.v2.layer.cross_channel_norm
+..  autofunction:: paddle.v2.layer.cross_channel_norm
     :noindex:
 
 row_l2_norm
 -----------
-..  autoclass:: paddle.v2.layer.row_l2_norm
+..  autofunction:: paddle.v2.layer.row_l2_norm
     :noindex:
 
 Recurrent Layers
@@ -125,55 +125,55 @@ Recurrent Layers
 
 recurrent
 ---------
-..  autoclass:: paddle.v2.layer.recurrent
+..  autofunction:: paddle.v2.layer.recurrent
     :noindex:
 
 lstmemory
 ---------
-..  autoclass:: paddle.v2.layer.lstmemory
+..  autofunction:: paddle.v2.layer.lstmemory
     :noindex:
 
 grumemory
 ---------
-..  autoclass:: paddle.v2.layer.grumemory
+..  autofunction:: paddle.v2.layer.grumemory
     :noindex:
 
 gated_unit
 -----------
-..  autoclass:: paddle.v2.layer.gated_unit
+..  autofunction:: paddle.v2.layer.gated_unit
     :noindex:
-    
+
 Recurrent Layer Group
 =====================
 
 memory
 ------
-..  autoclass:: paddle.v2.layer.memory
+..  autofunction:: paddle.v2.layer.memory
     :noindex:
 
 recurrent_group
 ---------------
-..  autoclass:: paddle.v2.layer.recurrent_group
+..  autofunction:: paddle.v2.layer.recurrent_group
     :noindex:
 
 lstm_step
 ---------
-..  autoclass:: paddle.v2.layer.lstm_step
+..  autofunction:: paddle.v2.layer.lstm_step
     :noindex:
 
 gru_step
 --------
-..  autoclass:: paddle.v2.layer.gru_step
+..  autofunction:: paddle.v2.layer.gru_step
     :noindex:
 
 beam_search
 ------------
-..  autoclass:: paddle.v2.layer.beam_search
+..  autofunction:: paddle.v2.layer.beam_search
     :noindex:
 
 get_output
 ----------
-..  autoclass:: paddle.v2.layer.get_output
+..  autofunction:: paddle.v2.layer.get_output
     :noindex:
 
 Mixed Layer
@@ -183,54 +183,54 @@ Mixed Layer
 
 mixed
 -----
-..  autoclass:: paddle.v2.layer.mixed
+..  autofunction:: paddle.v2.layer.mixed
     :noindex:
 
 ..  _api_v2.layer_embedding:
 
 embedding
 ---------
-..  autoclass:: paddle.v2.layer.embedding
+..  autofunction:: paddle.v2.layer.embedding
     :noindex:
 
 scaling_projection
 ------------------
-..  autoclass:: paddle.v2.layer.scaling_projection
+..  autofunction:: paddle.v2.layer.scaling_projection
     :noindex:
 
 dotmul_projection
 -----------------
-..  autoclass:: paddle.v2.layer.dotmul_projection
+..  autofunction:: paddle.v2.layer.dotmul_projection
     :noindex:
 
 dotmul_operator
 ---------------
-..  autoclass:: paddle.v2.layer.dotmul_operator
+..  autofunction:: paddle.v2.layer.dotmul_operator
     :noindex:
 
 full_matrix_projection
 ----------------------
-..  autoclass:: paddle.v2.layer.full_matrix_projection
+..  autofunction:: paddle.v2.layer.full_matrix_projection
     :noindex:
 
 identity_projection
 -------------------
-..  autoclass:: paddle.v2.layer.identity_projection
+..  autofunction:: paddle.v2.layer.identity_projection
     :noindex:
 
 slice_projection
 -------------------
-..  autoclass:: paddle.v2.layer.slice_projection
+..  autofunction:: paddle.v2.layer.slice_projection
     :noindex:
 
 table_projection
 ----------------
-..  autoclass:: paddle.v2.layer.table_projection
+..  autofunction:: paddle.v2.layer.table_projection
     :noindex:
 
 trans_full_matrix_projection
 ----------------------------
-..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
+..  autofunction:: paddle.v2.layer.trans_full_matrix_projection
     :noindex:
 
 Aggregate Layers
@@ -245,51 +245,46 @@ AggregateLevel
 
 pooling
 -------
-..  autoclass:: paddle.v2.layer.pooling
+..  autofunction:: paddle.v2.layer.pooling
     :noindex:
 
 ..  _api_v2.layer_last_seq:
 
 last_seq
 --------
-..  autoclass:: paddle.v2.layer.last_seq
+..  autofunction:: paddle.v2.layer.last_seq
     :noindex:
 
 ..  _api_v2.layer_first_seq:
 
 first_seq
 ---------
-..  autoclass:: paddle.v2.layer.first_seq
+..  autofunction:: paddle.v2.layer.first_seq
     :noindex:
 
 sub_seq
 ---------
-..  autoclass:: paddle.v2.layer.sub_seq
+..  autofunction:: paddle.v2.layer.sub_seq
     :noindex:
 
 concat
 ------
-..  autoclass:: paddle.v2.layer.concat
+..  autofunction:: paddle.v2.layer.concat
     :noindex:
 
 seq_concat
 ----------
-..  autoclass:: paddle.v2.layer.seq_concat
+..  autofunction:: paddle.v2.layer.seq_concat
     :noindex:
 
 seq_slice
 ---------
-..  autoclass:: paddle.v2.layer.seq_slice
-    :noindex:
-
-kmax_sequence_score
--------------------
-..  autoclass:: paddle.v2.layer.kmax_sequence_score
+..  autofunction:: paddle.v2.layer.seq_slice
     :noindex:
 
 sub_nested_seq
 --------------
-..  autoclass:: paddle.v2.layer.sub_nested_seq
+..  autofunction:: paddle.v2.layer.sub_nested_seq
     :noindex:
 
 Reshaping Layers
@@ -297,7 +292,7 @@ Reshaping Layers
 
 block_expand
 ------------
-..  autoclass:: paddle.v2.layer.block_expand
+..  autofunction:: paddle.v2.layer.block_expand
     :noindex:
 
 ..  _api_v2.layer_expand:
@@ -309,22 +304,22 @@ ExpandLevel
 
 expand
 ------
-..  autoclass:: paddle.v2.layer.expand
+..  autofunction:: paddle.v2.layer.expand
     :noindex:
 
 repeat
 ------
-..  autoclass:: paddle.v2.layer.repeat
+..  autofunction:: paddle.v2.layer.repeat
     :noindex:
 
 rotate
 ------
-..  autoclass:: paddle.v2.layer.rotate
+..  autofunction:: paddle.v2.layer.rotate
     :noindex:
 
 seq_reshape
 -----------
-..  autoclass:: paddle.v2.layer.seq_reshape
+..  autofunction:: paddle.v2.layer.seq_reshape
     :noindex:
 
 Math Layers
@@ -332,94 +327,94 @@ Math Layers
 
 addto
 -----
-..  autoclass:: paddle.v2.layer.addto
+..  autofunction:: paddle.v2.layer.addto
     :noindex:
 
 linear_comb
 -----------
-..  autoclass:: paddle.v2.layer.linear_comb
+..  autofunction:: paddle.v2.layer.linear_comb
     :noindex:
 
 interpolation
 -------------
-..  autoclass:: paddle.v2.layer.interpolation
+..  autofunction:: paddle.v2.layer.interpolation
     :noindex:
 
 bilinear_interp
 ---------------
-..  autoclass:: paddle.v2.layer.bilinear_interp
+..  autofunction:: paddle.v2.layer.bilinear_interp
     :noindex:
 
 dropout
 --------
-..  autoclass:: paddle.v2.layer.dropout
+..  autofunction:: paddle.v2.layer.dropout
     :noindex:
-    
+
 dot_prod
 ---------
-.. autoclass:: paddle.v2.layer.dot_prod
+.. autofunction:: paddle.v2.layer.dot_prod
     :noindex:
 
 out_prod
 --------
-.. autoclass:: paddle.v2.layer.out_prod
+.. autofunction:: paddle.v2.layer.out_prod
     :noindex:
 
 power
 -----
-..  autoclass:: paddle.v2.layer.power
+..  autofunction:: paddle.v2.layer.power
     :noindex:
 
 scaling
 -------
-..  autoclass:: paddle.v2.layer.scaling
+..  autofunction:: paddle.v2.layer.scaling
     :noindex:
 
 clip
 ----
-..  autoclass:: paddle.v2.layer.clip
+..  autofunction:: paddle.v2.layer.clip
     :noindex:
 
 resize
 ------
-..  autoclass:: paddle.v2.layer.resize
+..  autofunction:: paddle.v2.layer.resize
     :noindex:
 
 slope_intercept
 ---------------
-..  autoclass:: paddle.v2.layer.slope_intercept
+..  autofunction:: paddle.v2.layer.slope_intercept
     :noindex:
 
 tensor
 ------
-..  autoclass:: paddle.v2.layer.tensor
+..  autofunction:: paddle.v2.layer.tensor
     :noindex:
 
 ..  _api_v2.layer_cos_sim:
 
 cos_sim
 -------
-..  autoclass:: paddle.v2.layer.cos_sim
+..  autofunction:: paddle.v2.layer.cos_sim
     :noindex:
 
 l2_distance
 -----------
-..  autoclass:: paddle.v2.layer.l2_distance
+..  autofunction:: paddle.v2.layer.l2_distance
     :noindex:
 
 trans
 -----
-..  autoclass:: paddle.v2.layer.trans
+..  autofunction:: paddle.v2.layer.trans
     :noindex:
 
 scale_shift
 -----------
-..  autoclass:: paddle.v2.layer.scale_shift
+..  autofunction:: paddle.v2.layer.scale_shift
     :noindex:
 
 factorization_machine
 ---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
+..  autofunction:: paddle.v2.layer.factorization_machine
     :noindex:
 
 Sampling Layers
@@ -427,17 +422,17 @@ Sampling Layers
 
 maxid
 -----
-..  autoclass:: paddle.v2.layer.max_id
+..  autofunction:: paddle.v2.layer.max_id
     :noindex:
 
 sampling_id
 -----------
-..  autoclass:: paddle.v2.layer.sampling_id
+..  autofunction:: paddle.v2.layer.sampling_id
     :noindex:
 
 multiplex
 ---------
-..  autoclass:: paddle.v2.layer.multiplex
+..  autofunction:: paddle.v2.layer.multiplex
     :noindex:
 
 ..  _api_v2.layer_costs:
@@ -447,100 +442,105 @@ Cost Layers
 
 cross_entropy_cost
 ------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_cost
     :noindex:
 
 cross_entropy_with_selfnorm_cost
 --------------------------------
-..  autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
+..  autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
     :noindex:
 
 multi_binary_label_cross_entropy_cost
 -------------------------------------
-..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
+..  autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
     :noindex:
 
+classification_cost
+-------------------
+.. autofunction:: paddle.v2.layer.classification_cost
+   :noindex:
+
 huber_regression_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_regression_cost
+..  autofunction:: paddle.v2.layer.huber_regression_cost
     :noindex:
 
 huber_classification_cost
 -------------------------
-..  autoclass:: paddle.v2.layer.huber_classification_cost
+..  autofunction:: paddle.v2.layer.huber_classification_cost
     :noindex:
 
 lambda_cost
 -----------
-..  autoclass:: paddle.v2.layer.lambda_cost
+..  autofunction:: paddle.v2.layer.lambda_cost
     :noindex:
 
 square_error_cost
 -----------------
-..  autoclass:: paddle.v2.layer.square_error_cost
+..  autofunction:: paddle.v2.layer.square_error_cost
     :noindex:
 
 rank_cost
 ---------
-..  autoclass:: paddle.v2.layer.rank_cost
+..  autofunction:: paddle.v2.layer.rank_cost
     :noindex:
 
 sum_cost
 ---------
-..  autoclass:: paddle.v2.layer.sum_cost
+..  autofunction:: paddle.v2.layer.sum_cost
     :noindex:
 
 crf
 ---
-..  autoclass:: paddle.v2.layer.crf
+..  autofunction:: paddle.v2.layer.crf
     :noindex:
 
 crf_decoding
 ------------
-..  autoclass:: paddle.v2.layer.crf_decoding
+..  autofunction:: paddle.v2.layer.crf_decoding
     :noindex:
 
 ctc
 ---
-..  autoclass:: paddle.v2.layer.ctc
+..  autofunction:: paddle.v2.layer.ctc
     :noindex:
 
 warp_ctc
 --------
-..  autoclass:: paddle.v2.layer.warp_ctc
+..  autofunction:: paddle.v2.layer.warp_ctc
     :noindex:
 
 nce
 ---
-..  autoclass:: paddle.v2.layer.nce
+..  autofunction:: paddle.v2.layer.nce
     :noindex:
 
 hsigmoid
 ---------
-..  autoclass:: paddle.v2.layer.hsigmoid
+..  autofunction:: paddle.v2.layer.hsigmoid
     :noindex:
 
 smooth_l1_cost
 --------------
-..  autoclass:: paddle.v2.layer.smooth_l1_cost
+..  autofunction:: paddle.v2.layer.smooth_l1_cost
     :noindex:
 
 multibox_loss
 --------------
-..  autoclass:: paddle.v2.layer.multibox_loss
+..  autofunction:: paddle.v2.layer.multibox_loss
     :noindex:
 
 detection_output
 ----------------
-..  autoclass:: paddle.v2.layer.detection_output
+..  autofunction:: paddle.v2.layer.detection_output
     :noindex:
-    
+
 Check Layer
 ============
 
 eos
 ---
-..  autoclass:: paddle.v2.layer.eos
+..  autofunction:: paddle.v2.layer.eos
     :noindex:
 
 Activation
@@ -548,5 +548,5 @@ Activation
 
 prelu
 --------
-..  autoclass:: paddle.v2.layer.prelu
+..  autofunction:: paddle.v2.layer.prelu
     :noindex:
diff --git a/doc/v2/api/data/data_reader.rst b/doc/v2/api/data/data_reader.rst
index 2ccfec9c284877a7576e9751526b169a4ac78d8e..1a35d0bbc8f9d751f49c7e1fc26feb1bcb3ae7f0 100644
--- a/doc/v2/api/data/data_reader.rst
+++ b/doc/v2/api/data/data_reader.rst
@@ -6,7 +6,43 @@ Data Reader Interface
 DataTypes
 =========
 
-..  automodule:: paddle.v2.data_type
+..  autofunction:: paddle.v2.data_type.dense_array
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.integer_value_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_binary_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_float_vector_sub_sequence
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_non_value_slot
+    :noindex:
+
+..  autofunction:: paddle.v2.data_type.sparse_value_slot
+    :noindex:
+
+..  autoclass:: paddle.v2.data_type.InputType
     :members:
     :noindex:
 
@@ -20,11 +56,11 @@ DataFeeder
 Reader
 ======
 
-..  automodule:: paddle.v2.reader
+..  automodule:: paddle.reader
     :members:
     :noindex:
 
-..  automodule:: paddle.v2.reader.creator
+..  automodule:: paddle.reader.creator
     :members:
     :noindex:
 
diff --git a/doc/v2/api/data/dataset.rst b/doc/v2/api/data/dataset.rst
index 02e41564b1e48c07da6ac071fc4b60089169e05a..e7c8be4452bf55e0967d750c2e624e8e316e9330 100644
--- a/doc/v2/api/data/dataset.rst
+++ b/doc/v2/api/data/dataset.rst
@@ -1,82 +1,82 @@
 Dataset
 =======
 
-..  automodule:: paddle.v2.dataset
+..  automodule:: paddle.dataset
     :members:
     :noindex:
 
 mnist
 +++++
 
-..  automodule:: paddle.v2.dataset.mnist
+..  automodule:: paddle.dataset.mnist
     :members:
     :noindex:
 
 cifar
 +++++
 
-..  automodule:: paddle.v2.dataset.cifar
+..  automodule:: paddle.dataset.cifar
     :members:
     :noindex:
 
 conll05
 +++++++
 
-..  automodule:: paddle.v2.dataset.conll05
+..  automodule:: paddle.dataset.conll05
     :members: get_dict,get_embedding,test
     :noindex:
 
 imdb
 ++++
 
-..  automodule:: paddle.v2.dataset.imdb
+..  automodule:: paddle.dataset.imdb
     :members:
     :noindex:
 
 imikolov
 ++++++++
 
-..  automodule:: paddle.v2.dataset.imikolov
+..  automodule:: paddle.dataset.imikolov
     :members:
     :noindex:
 
 movielens
 +++++++++
 
-..  automodule:: paddle.v2.dataset.movielens
+..  automodule:: paddle.dataset.movielens
     :members:
     :noindex:
 
-..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
+..  autoclass:: paddle.dataset.movielens.MovieInfo
     :noindex:
-    
-..  autoclass:: paddle.v2.dataset.movielens.UserInfo
+
+..  autoclass:: paddle.dataset.movielens.UserInfo
     :noindex:
 
 sentiment
 +++++++++
 
-..  automodule:: paddle.v2.dataset.sentiment
+..  automodule:: paddle.dataset.sentiment
     :members:
     :noindex:
 
 uci_housing
 +++++++++++
 
-..  automodule:: paddle.v2.dataset.uci_housing
+..  automodule:: paddle.dataset.uci_housing
     :members:
     :noindex:
 
 wmt14
 +++++
 
-..  automodule:: paddle.v2.dataset.wmt14
+..  automodule:: paddle.dataset.wmt14
     :members:
     :noindex:
 
 wmt16
 +++++
 
-..  automodule:: paddle.v2.dataset.wmt16
+..  automodule:: paddle.dataset.wmt16
     :members:
     :noindex:
diff --git a/doc/v2/api/fluid/data_feeder.rst b/doc/v2/api/fluid/data_feeder.rst
deleted file mode 100644
index 3df5c0307ffed9d101da58b385840b115920e906..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/data_feeder.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-===========
-data_feeder
-===========
-
-DataFeeder
-----------
-
-..  autoclass:: paddle.fluid.data_feeder.DataFeeder
-    :members:
-    :noindex:
-
diff --git a/doc/v2/api/fluid/evaluator.rst b/doc/v2/api/fluid/evaluator.rst
deleted file mode 100644
index ae9daeb7918d773d7330f419de96c6972a836710..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/evaluator.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-=========
-evaluator
-=========
-
-Accuracy
---------
-
-..  autoclass:: paddle.fluid.evaluator.Accuracy
-    :members:
-    :noindex:
-
-ChunkEvaluator
---------------
-
-..  autoclass:: paddle.fluid.evaluator.ChunkEvaluator
-    :members:
-    :noindex:
-
diff --git a/doc/v2/api/fluid/executor.rst b/doc/v2/api/fluid/executor.rst
deleted file mode 100644
index a9cdf264e49691afc4b9425b7bfe54f8157ae6c2..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/executor.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-========
-executor
-========
-
-Executor
---------
-
-..  autoclass:: paddle.fluid.executor.Executor
-    :members:
-    :noindex:
-
-global_scope
-------------
-
-..  autofunction:: paddle.fluid.executor.global_scope
-    :noindex:
-
-scope_guard
------------
-
-..  autofunction:: paddle.fluid.executor.scope_guard
-    :noindex:
-
-switch_scope
-------------
-
-..  autofunction:: paddle.fluid.executor.switch_scope
-    :noindex:
-
diff --git a/doc/v2/api/fluid/gen_doc.py b/doc/v2/api/fluid/gen_doc.py
deleted file mode 100644
index 89ab880301b6ac687fd61f556f87f03792c37da3..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/gen_doc.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import argparse
-import sys
-import types
-
-import paddle.fluid as fluid
-
-
-def parse_arg():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--submodules', nargs="*")
-    parser.add_argument(
-        'module', type=str, help='Generate the documentation of which module')
-    return parser.parse_args()
-
-
-class DocGenerator(object):
-    def __init__(self, module_name, stream=sys.stdout):
-        self.stream = stream
-        self.module_name = module_name
-        if not hasattr(fluid, module_name):
-            raise ValueError("Cannot find fluid.{0}".format(module_name))
-        else:
-            self.module = getattr(fluid, module_name)
-        self.stream.write('''..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-''')
-
-        self._print_header_(module_name, dot='=', is_title=True)
-
-    def print_submodule(self, submodule_name):
-        submodule = getattr(self.module, submodule_name)
-        if submodule is None:
-            raise ValueError("Cannot find submodule {0}".format(submodule_name))
-        self.print_section(submodule_name)
-
-        for item in submodule.__all__:
-            self.print_item(item)
-
-    def print_current_module(self):
-        for item in self.module.__all__:
-            self.print_item(item)
-
-    def print_section(self, name):
-        self._print_header_(name, dot='=', is_title=False)
-
-    def print_item(self, name):
-        item = getattr(self.module, name)
-        if isinstance(item, types.TypeType):
-            self.print_class(name)
-        elif isinstance(item, types.FunctionType):
-            self.print_method(name)
-        else:
-            raise RuntimeError("Unsupported item {0}".format(name))
-
-    def print_class(self, name):
-        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autoclass:: paddle.fluid.{0}.{1}
-    :members:
-    :noindex:
-
-'''.format(self.module_name, name))
-
-    def print_method(self, name):
-        self._print_header_(name, dot='-', is_title=False)
-        self.stream.write('''..  autofunction:: paddle.fluid.{0}.{1}
-    :noindex:
-
-'''.format(self.module_name, name))
-
-    def _print_header_(self, name, dot, is_title):
-        dot_line = dot * len(name)
-        if is_title:
-            self.stream.write(dot_line)
-            self.stream.write('\n')
-        self.stream.write(name)
-        self.stream.write('\n')
-        self.stream.write(dot_line)
-        self.stream.write('\n')
-        self.stream.write('\n')
-
-
-def main():
-    args = parse_arg()
-    gen = DocGenerator(args.module)
-    if args.submodules is None:
-        gen.print_current_module()
-    else:
-        for submodule_name in args.submodules:
-            gen.print_submodule(submodule_name)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/doc/v2/api/fluid/gen_doc.sh b/doc/v2/api/fluid/gen_doc.sh
deleted file mode 100755
index ba7b7ba8e51399deb852b0a7c8ddd3128f521e85..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/gen_doc.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
-
-for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
-do
-  python gen_doc.py ${module} > ${module}.rst
-done
diff --git a/doc/v2/api/fluid/index.rst b/doc/v2/api/fluid/index.rst
deleted file mode 100644
index b0710d8b19956eb235890fdb2a2d764084416aa5..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/index.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-======================
-Fluid
-======================
-
-..  toctree::
-    :maxdepth: 1
-
-    layers.rst
-    data_feeder.rst
-    executor.rst
-    initializer.rst
-    evaluator.rst
-    nets.rst
-    optimizer.rst
-    param_attr.rst
-    profiler.rst
-    regularizer.rst
-    io.rst
diff --git a/doc/v2/api/fluid/initializer.rst b/doc/v2/api/fluid/initializer.rst
deleted file mode 100644
index ee69925fda6b3fc850cfb632e8edd359e7fcff9c..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/initializer.rst
+++ /dev/null
@@ -1,35 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-===========
-initializer
-===========
-
-Constant
---------
-
-..  autoclass:: paddle.fluid.initializer.Constant
-    :members:
-    :noindex:
-
-Uniform
--------
-
-..  autoclass:: paddle.fluid.initializer.Uniform
-    :members:
-    :noindex:
-
-Normal
-------
-
-..  autoclass:: paddle.fluid.initializer.Normal
-    :members:
-    :noindex:
-
-Xavier
-------
-
-..  autoclass:: paddle.fluid.initializer.Xavier
-    :members:
-    :noindex:
-
diff --git a/doc/v2/api/fluid/io.rst b/doc/v2/api/fluid/io.rst
deleted file mode 100644
index dd9d88b669957c22cd0a07fa4b7e219e2d6e5d61..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/io.rst
+++ /dev/null
@@ -1,61 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-==
-io
-==
-
-save_vars
----------
-
-..  autofunction:: paddle.fluid.io.save_vars
-    :noindex:
-
-save_params
------------
-
-..  autofunction:: paddle.fluid.io.save_params
-    :noindex:
-
-save_persistables
------------------
-
-..  autofunction:: paddle.fluid.io.save_persistables
-    :noindex:
-
-load_vars
----------
-
-..  autofunction:: paddle.fluid.io.load_vars
-    :noindex:
-
-load_params
------------
-
-..  autofunction:: paddle.fluid.io.load_params
-    :noindex:
-
-load_persistables
------------------
-
-..  autofunction:: paddle.fluid.io.load_persistables
-    :noindex:
-
-save_inference_model
---------------------
-
-..  autofunction:: paddle.fluid.io.save_inference_model
-    :noindex:
-
-load_inference_model
---------------------
-
-..  autofunction:: paddle.fluid.io.load_inference_model
-    :noindex:
-
-get_inference_program
----------------------
-
-..  autofunction:: paddle.fluid.io.get_inference_program
-    :noindex:
-
diff --git a/doc/v2/api/fluid/layers.rst b/doc/v2/api/fluid/layers.rst
deleted file mode 100644
index ae35d8c53476b34cb18331364267dd7c8b94dd64..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/layers.rst
+++ /dev/null
@@ -1,805 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-======
-layers
-======
-
-control_flow
-============
-
-split_lod_tensor
-----------------
-
-..  autofunction:: paddle.fluid.layers.split_lod_tensor
-    :noindex:
-
-merge_lod_tensor
-----------------
-
-..  autofunction:: paddle.fluid.layers.merge_lod_tensor
-    :noindex:
-
-BlockGuard
-----------
-
-..  autoclass:: paddle.fluid.layers.BlockGuard
-    :members:
-    :noindex:
-
-BlockGuardWithCompletion
-------------------------
-
-..  autoclass:: paddle.fluid.layers.BlockGuardWithCompletion
-    :members:
-    :noindex:
-
-StaticRNNMemoryLink
--------------------
-
-..  autoclass:: paddle.fluid.layers.StaticRNNMemoryLink
-    :members:
-    :noindex:
-
-WhileGuard
-----------
-
-..  autoclass:: paddle.fluid.layers.WhileGuard
-    :members:
-    :noindex:
-
-While
------
-
-..  autoclass:: paddle.fluid.layers.While
-    :members:
-    :noindex:
-
-lod_rank_table
---------------
-
-..  autofunction:: paddle.fluid.layers.lod_rank_table
-    :noindex:
-
-max_sequence_len
-----------------
-
-..  autofunction:: paddle.fluid.layers.max_sequence_len
-    :noindex:
-
-topk
-----
-
-..  autofunction:: paddle.fluid.layers.topk
-    :noindex:
-
-lod_tensor_to_array
--------------------
-
-..  autofunction:: paddle.fluid.layers.lod_tensor_to_array
-    :noindex:
-
-array_to_lod_tensor
--------------------
-
-..  autofunction:: paddle.fluid.layers.array_to_lod_tensor
-    :noindex:
-
-increment
----------
-
-..  autofunction:: paddle.fluid.layers.increment
-    :noindex:
-
-array_write
------------
-
-..  autofunction:: paddle.fluid.layers.array_write
-    :noindex:
-
-create_array
-------------
-
-..  autofunction:: paddle.fluid.layers.create_array
-    :noindex:
-
-less_than
----------
-
-..  autofunction:: paddle.fluid.layers.less_than
-    :noindex:
-
-array_read
-----------
-
-..  autofunction:: paddle.fluid.layers.array_read
-    :noindex:
-
-shrink_memory
--------------
-
-..  autofunction:: paddle.fluid.layers.shrink_memory
-    :noindex:
-
-array_length
-------------
-
-..  autofunction:: paddle.fluid.layers.array_length
-    :noindex:
-
-IfElse
-------
-
-..  autoclass:: paddle.fluid.layers.IfElse
-    :members:
-    :noindex:
-
-DynamicRNN
-----------
-
-..  autoclass:: paddle.fluid.layers.DynamicRNN
-    :members:
-    :noindex:
-
-ConditionalBlock
-----------------
-
-..  autoclass:: paddle.fluid.layers.ConditionalBlock
-    :members:
-    :noindex:
-
-StaticRNN
----------
-
-..  autoclass:: paddle.fluid.layers.StaticRNN
-    :members:
-    :noindex:
-
-reorder_lod_tensor_by_rank
---------------------------
-
-..  autofunction:: paddle.fluid.layers.reorder_lod_tensor_by_rank
-    :noindex:
-
-ParallelDo
-----------
-
-..  autoclass:: paddle.fluid.layers.ParallelDo
-    :members:
-    :noindex:
-
-Print
------
-
-..  autofunction:: paddle.fluid.layers.Print
-    :noindex:
-
-device
-======
-
-get_places
-----------
-
-..  autofunction:: paddle.fluid.layers.get_places
-    :noindex:
-
-io
-==
-
-data
-----
-
-..  autofunction:: paddle.fluid.layers.data
-    :noindex:
-
-BlockGuardServ
---------------
-
-..  autoclass:: paddle.fluid.layers.BlockGuardServ
-    :members:
-    :noindex:
-
-ListenAndServ
--------------
-
-..  autoclass:: paddle.fluid.layers.ListenAndServ
-    :members:
-    :noindex:
-
-Send
-----
-
-..  autofunction:: paddle.fluid.layers.Send
-    :noindex:
-
-nn
-==
-
-fc
---
-
-..  autofunction:: paddle.fluid.layers.fc
-    :noindex:
-
-embedding
----------
-
-..  autofunction:: paddle.fluid.layers.embedding
-    :noindex:
-
-dynamic_lstm
-------------
-
-..  autofunction:: paddle.fluid.layers.dynamic_lstm
-    :noindex:
-
-dynamic_lstmp
--------------
-
-..  autofunction:: paddle.fluid.layers.dynamic_lstmp
-    :noindex:
-
-dynamic_gru
------------
-
-..  autofunction:: paddle.fluid.layers.dynamic_gru
-    :noindex:
-
-gru_unit
---------
-
-..  autofunction:: paddle.fluid.layers.gru_unit
-    :noindex:
-
-linear_chain_crf
-----------------
-
-..  autofunction:: paddle.fluid.layers.linear_chain_crf
-    :noindex:
-
-crf_decoding
-------------
-
-..  autofunction:: paddle.fluid.layers.crf_decoding
-    :noindex:
-
-cos_sim
--------
-
-..  autofunction:: paddle.fluid.layers.cos_sim
-    :noindex:
-
-cross_entropy
--------------
-
-..  autofunction:: paddle.fluid.layers.cross_entropy
-    :noindex:
-
-square_error_cost
------------------
-
-..  autofunction:: paddle.fluid.layers.square_error_cost
-    :noindex:
-
-accuracy
---------
-
-..  autofunction:: paddle.fluid.layers.accuracy
-    :noindex:
-
-chunk_eval
-----------
-
-..  autofunction:: paddle.fluid.layers.chunk_eval
-    :noindex:
-
-sequence_conv
--------------
-
-..  autofunction:: paddle.fluid.layers.sequence_conv
-    :noindex:
-
-conv2d
-------
-
-..  autofunction:: paddle.fluid.layers.conv2d
-    :noindex:
-
-sequence_pool
--------------
-
-..  autofunction:: paddle.fluid.layers.sequence_pool
-    :noindex:
-
-pool2d
-------
-
-..  autofunction:: paddle.fluid.layers.pool2d
-    :noindex:
-
-batch_norm
-----------
-
-..  autofunction:: paddle.fluid.layers.batch_norm
-    :noindex:
-
-layer_norm
-----------
-
-..  autofunction:: paddle.fluid.layers.layer_norm
-    :noindex:
-
-beam_search_decode
-------------------
-
-..  autofunction:: paddle.fluid.layers.beam_search_decode
-    :noindex:
-
-conv2d_transpose
-----------------
-
-..  autofunction:: paddle.fluid.layers.conv2d_transpose
-    :noindex:
-
-sequence_expand
----------------
-
-..  autofunction:: paddle.fluid.layers.sequence_expand
-    :noindex:
-
-lstm_unit
----------
-
-..  autofunction:: paddle.fluid.layers.lstm_unit
-    :noindex:
-
-reduce_sum
-----------
-
-..  autofunction:: paddle.fluid.layers.reduce_sum
-    :noindex:
-
-reduce_mean
------------
-
-..  autofunction:: paddle.fluid.layers.reduce_mean
-    :noindex:
-
-reduce_max
-----------
-
-..  autofunction:: paddle.fluid.layers.reduce_max
-    :noindex:
-
-reduce_min
-----------
-
-..  autofunction:: paddle.fluid.layers.reduce_min
-    :noindex:
-
-sequence_first_step
--------------------
-
-..  autofunction:: paddle.fluid.layers.sequence_first_step
-    :noindex:
-
-sequence_last_step
-------------------
-
-..  autofunction:: paddle.fluid.layers.sequence_last_step
-    :noindex:
-
-dropout
--------
-
-..  autofunction:: paddle.fluid.layers.dropout
-    :noindex:
-
-split
------
-
-..  autofunction:: paddle.fluid.layers.split
-    :noindex:
-
-ctc_greedy_decoder
-------------------
-
-..  autofunction:: paddle.fluid.layers.ctc_greedy_decoder
-    :noindex:
-
-edit_distance
--------------
-
-..  autofunction:: paddle.fluid.layers.edit_distance
-    :noindex:
-
-l2_normalize
-------------
-
-..  autofunction:: paddle.fluid.layers.l2_normalize
-    :noindex:
-
-matmul
-------
-
-..  autofunction:: paddle.fluid.layers.matmul
-    :noindex:
-
-warpctc
--------
-
-..  autofunction:: paddle.fluid.layers.warpctc
-    :noindex:
-
-sequence_reshape
-----------------
-
-..  autofunction:: paddle.fluid.layers.sequence_reshape
-    :noindex:
-
-transpose
----------
-
-..  autofunction:: paddle.fluid.layers.transpose
-    :noindex:
-
-im2sequence
------------
-
-..  autofunction:: paddle.fluid.layers.im2sequence
-    :noindex:
-
-nce
----
-
-..  autofunction:: paddle.fluid.layers.nce
-    :noindex:
-
-beam_search
------------
-
-..  autofunction:: paddle.fluid.layers.beam_search
-    :noindex:
-
-row_conv
---------
-
-..  autofunction:: paddle.fluid.layers.row_conv
-    :noindex:
-
-multiplex
----------
-
-..  autofunction:: paddle.fluid.layers.multiplex
-    :noindex:
-
-ops
-===
-
-mean
-----
-
-..  autofunction:: paddle.fluid.layers.mean
-    :noindex:
-
-mul
----
-
-..  autofunction:: paddle.fluid.layers.mul
-    :noindex:
-
-reshape
--------
-
-..  autofunction:: paddle.fluid.layers.reshape
-    :noindex:
-
-scale
------
-
-..  autofunction:: paddle.fluid.layers.scale
-    :noindex:
-
-sigmoid_cross_entropy_with_logits
----------------------------------
-
-..  autofunction:: paddle.fluid.layers.sigmoid_cross_entropy_with_logits
-    :noindex:
-
-elementwise_add
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_add
-    :noindex:
-
-elementwise_div
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_div
-    :noindex:
-
-elementwise_sub
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_sub
-    :noindex:
-
-elementwise_mul
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_mul
-    :noindex:
-
-elementwise_max
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_max
-    :noindex:
-
-elementwise_min
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_min
-    :noindex:
-
-elementwise_pow
----------------
-
-..  autofunction:: paddle.fluid.layers.elementwise_pow
-    :noindex:
-
-clip
-----
-
-..  autofunction:: paddle.fluid.layers.clip
-    :noindex:
-
-clip_by_norm
-------------
-
-..  autofunction:: paddle.fluid.layers.clip_by_norm
-    :noindex:
-
-sequence_softmax
-----------------
-
-..  autofunction:: paddle.fluid.layers.sequence_softmax
-    :noindex:
-
-sigmoid
--------
-
-..  autofunction:: paddle.fluid.layers.sigmoid
-    :noindex:
-
-logsigmoid
-----------
-
-..  autofunction:: paddle.fluid.layers.logsigmoid
-    :noindex:
-
-exp
----
-
-..  autofunction:: paddle.fluid.layers.exp
-    :noindex:
-
-relu
-----
-
-..  autofunction:: paddle.fluid.layers.relu
-    :noindex:
-
-tanh
-----
-
-..  autofunction:: paddle.fluid.layers.tanh
-    :noindex:
-
-tanh_shrink
------------
-
-..  autofunction:: paddle.fluid.layers.tanh_shrink
-    :noindex:
-
-softshrink
-----------
-
-..  autofunction:: paddle.fluid.layers.softshrink
-    :noindex:
-
-sqrt
-----
-
-..  autofunction:: paddle.fluid.layers.sqrt
-    :noindex:
-
-abs
----
-
-..  autofunction:: paddle.fluid.layers.abs
-    :noindex:
-
-ceil
-----
-
-..  autofunction:: paddle.fluid.layers.ceil
-    :noindex:
-
-floor
------
-
-..  autofunction:: paddle.fluid.layers.floor
-    :noindex:
-
-round
------
-
-..  autofunction:: paddle.fluid.layers.round
-    :noindex:
-
-reciprocal
-----------
-
-..  autofunction:: paddle.fluid.layers.reciprocal
-    :noindex:
-
-log
----
-
-..  autofunction:: paddle.fluid.layers.log
-    :noindex:
-
-square
-------
-
-..  autofunction:: paddle.fluid.layers.square
-    :noindex:
-
-softplus
---------
-
-..  autofunction:: paddle.fluid.layers.softplus
-    :noindex:
-
-softsign
---------
-
-..  autofunction:: paddle.fluid.layers.softsign
-    :noindex:
-
-brelu
------
-
-..  autofunction:: paddle.fluid.layers.brelu
-    :noindex:
-
-leaky_relu
-----------
-
-..  autofunction:: paddle.fluid.layers.leaky_relu
-    :noindex:
-
-soft_relu
----------
-
-..  autofunction:: paddle.fluid.layers.soft_relu
-    :noindex:
-
-elu
----
-
-..  autofunction:: paddle.fluid.layers.elu
-    :noindex:
-
-relu6
------
-
-..  autofunction:: paddle.fluid.layers.relu6
-    :noindex:
-
-pow
----
-
-..  autofunction:: paddle.fluid.layers.pow
-    :noindex:
-
-stanh
------
-
-..  autofunction:: paddle.fluid.layers.stanh
-    :noindex:
-
-hard_shrink
------------
-
-..  autofunction:: paddle.fluid.layers.hard_shrink
-    :noindex:
-
-thresholded_relu
-----------------
-
-..  autofunction:: paddle.fluid.layers.thresholded_relu
-    :noindex:
-
-hard_sigmoid
-------------
-
-..  autofunction:: paddle.fluid.layers.hard_sigmoid
-    :noindex:
-
-swish
------
-
-..  autofunction:: paddle.fluid.layers.swish
-    :noindex:
-
-tensor
-======
-
-create_tensor
--------------
-
-..  autofunction:: paddle.fluid.layers.create_tensor
-    :noindex:
-
-create_parameter
-----------------
-
-..  autofunction:: paddle.fluid.layers.create_parameter
-    :noindex:
-
-create_global_var
------------------
-
-..  autofunction:: paddle.fluid.layers.create_global_var
-    :noindex:
-
-cast
-----
-
-..  autofunction:: paddle.fluid.layers.cast
-    :noindex:
-
-concat
-------
-
-..  autofunction:: paddle.fluid.layers.concat
-    :noindex:
-
-sums
-----
-
-..  autofunction:: paddle.fluid.layers.sums
-    :noindex:
-
-assign
-------
-
-..  autofunction:: paddle.fluid.layers.assign
-    :noindex:
-
-fill_constant_batch_size_like
------------------------------
-
-..  autofunction:: paddle.fluid.layers.fill_constant_batch_size_like
-    :noindex:
-
-fill_constant
--------------
-
-..  autofunction:: paddle.fluid.layers.fill_constant
-    :noindex:
-
-ones
-----
-
-..  autofunction:: paddle.fluid.layers.ones
-    :noindex:
-
-zeros
------
-
-..  autofunction:: paddle.fluid.layers.zeros
-    :noindex:
-
diff --git a/doc/v2/api/fluid/nets.rst b/doc/v2/api/fluid/nets.rst
deleted file mode 100644
index 7ae3187304f386a08c5cb8a4ba093423a58a7f36..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/nets.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-====
-nets
-====
-
-simple_img_conv_pool
---------------------
-
-..  autofunction:: paddle.fluid.nets.simple_img_conv_pool
-    :noindex:
-
-sequence_conv_pool
-------------------
-
-..  autofunction:: paddle.fluid.nets.sequence_conv_pool
-    :noindex:
-
-glu
----
-
-..  autofunction:: paddle.fluid.nets.glu
-    :noindex:
-
-scaled_dot_product_attention
-----------------------------
-
-..  autofunction:: paddle.fluid.nets.scaled_dot_product_attention
-    :noindex:
-
diff --git a/doc/v2/api/fluid/optimizer.rst b/doc/v2/api/fluid/optimizer.rst
deleted file mode 100644
index 9b165f870459b4f9ef2efe24f5604a3fcb96f7f3..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/optimizer.rst
+++ /dev/null
@@ -1,49 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-=========
-optimizer
-=========
-
-SGD
----
-
-..  autoclass:: paddle.fluid.optimizer.SGD
-    :members:
-    :noindex:
-
-Momentum
---------
-
-..  autoclass:: paddle.fluid.optimizer.Momentum
-    :members:
-    :noindex:
-
-Adagrad
--------
-
-..  autoclass:: paddle.fluid.optimizer.Adagrad
-    :members:
-    :noindex:
-
-Adam
-----
-
-..  autoclass:: paddle.fluid.optimizer.Adam
-    :members:
-    :noindex:
-
-Adamax
-------
-
-..  autoclass:: paddle.fluid.optimizer.Adamax
-    :members:
-    :noindex:
-
-DecayedAdagrad
---------------
-
-..  autoclass:: paddle.fluid.optimizer.DecayedAdagrad
-    :members:
-    :noindex:
-
diff --git a/doc/v2/api/fluid/param_attr.rst b/doc/v2/api/fluid/param_attr.rst
deleted file mode 100644
index 8e4ddb2b0492d0fcfcade199fdd6dfe43faa7075..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/param_attr.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-==========
-param_attr
-==========
-
-ParamAttr
----------
-
-..  autoclass:: paddle.fluid.param_attr.ParamAttr
-    :members:
-    :noindex:
-
-WeightNormParamAttr
--------------------
-
-..  autoclass:: paddle.fluid.param_attr.WeightNormParamAttr
-    :members:
-    :noindex:
-
diff --git a/doc/v2/api/fluid/profiler.rst b/doc/v2/api/fluid/profiler.rst
deleted file mode 100644
index 74d102dcb0db35766c34e3d14939a8aa5861686b..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/profiler.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-========
-profiler
-========
-
-cuda_profiler
--------------
-
-..  autofunction:: paddle.fluid.profiler.cuda_profiler
-    :noindex:
-
-reset_profiler
---------------
-
-..  autofunction:: paddle.fluid.profiler.reset_profiler
-    :noindex:
-
-profiler
---------
-
-..  autofunction:: paddle.fluid.profiler.profiler
-    :noindex:
-
diff --git a/doc/v2/api/fluid/regularizer.rst b/doc/v2/api/fluid/regularizer.rst
deleted file mode 100644
index dc9740c46392567d314121ac401540b0e7382703..0000000000000000000000000000000000000000
--- a/doc/v2/api/fluid/regularizer.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-..  THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
-    !DO NOT EDIT THIS FILE MANUALLY!
-
-===========
-regularizer
-===========
-
-append_regularization_ops
--------------------------
-
-..  autofunction:: paddle.fluid.regularizer.append_regularization_ops
-    :noindex:
-
-L1Decay
--------
-
-..  autoclass:: paddle.fluid.regularizer.L1Decay
-    :members:
-    :noindex:
-
-L2Decay
--------
-
-..  autoclass:: paddle.fluid.regularizer.L2Decay
-    :members:
-    :noindex:
-
diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst
index b11cd449affd1dcd9d3f42492961469331350942..70c5c524aaf0a9ae003bf4340c3f268c225d4419 100644
--- a/doc/v2/api/index_en.rst
+++ b/doc/v2/api/index_en.rst
@@ -8,4 +8,3 @@ API
     model_configs.rst
     data.rst
     run_logic.rst
-    fluid/index.rst
diff --git a/doc/v2/build_and_install/build_from_source_cn.rst b/doc/v2/build_and_install/build_from_source_cn.rst
index 115b92a33888abf1e1be400e1abbb58b632a2976..6421c5308271c2508597d849c79709255caf349a 100644
--- a/doc/v2/build_and_install/build_from_source_cn.rst
+++ b/doc/v2/build_and_install/build_from_source_cn.rst
@@ -19,10 +19,11 @@
 ----------------
 
 PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
-可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。或者
-参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`__ 找到，您也可以
+在 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__ 找到 paddle_manylinux_devel
+镜像的编译以及使用方法。或者参考下述可选步骤，从源码中构建用于编译PaddlePaddle的Docker镜像。
 
-如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 :ref:`编译依赖 <_compile_deps>` 之后才能开始编译的步骤。
 
 编译PaddlePaddle，需要执行：
 
@@ -34,13 +35,11 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
    # 2. 可选步骤：源码中构建用于编译PaddlePaddle的Docker镜像
    docker build -t paddle:dev .
    # 3. 执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
    # 4. 或者也可以使用为上述可选步骤构建的镜像（必须先执行第2步）
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
 
-注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。如果使用自行
-构建的镜像（上述第4步）会执行 :code:`Dockerfile` 描述的默认入口程序 :code:`build.sh` 可以省略步骤3中
-最后的执行脚本的命令。
+注：上述命令把当前目录（源码树根目录）映射为 container 里的 :code:`/paddle` 目录。
 
 编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
 
@@ -71,15 +70,15 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
 
 如果期望执行其中一个单元测试，（比如 :code:`test_sum_op` ）：
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
-   bash /paddle/paddle/scripts/docker/build.sh
-   cd /paddle/build
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
    ctest -R test_sum_op -V
 
 .. _faq_docker:
@@ -107,7 +106,7 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 - 学习 Docker 有多难？
 
-  理解 Docker 并不难，大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
+  理解 Docker 并不难，大概花十分钟看一下 `如何使用Docker <https://zhuanlan.zhihu.com/p/19902938>`_ 。这可以帮您省掉花一小时安装和配置各种开发工具，以及切换机器时需要新安装的辛苦。别忘了 PaddlePaddle 更新可能导致需要新的开发工具。更别提简化问题复现带来的好处了。
 
 - 我可以用 IDE 吗？
 
@@ -115,17 +114,16 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
   很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
 
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
 
   就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
 
 - 可以并行编译吗？
 
-  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+  是的。我们的 Docker image 运行一个 `Paddle编译Bash脚本 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ 。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
 
 - Docker 需要 sudo
 
@@ -133,11 +131,11 @@ PaddlePaddle需要使用Docker环境完成编译，这样可以免去单独安
 
 - 在 Windows/MacOS 上编译很慢
 
-  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存，以保证编译高效。具体做法请参考 `如何为Windows/Mac计算机上的Docker增加内存和虚拟机 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 。
 
 - 磁盘不够
 
-  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
+  本文中的例子里，`docker run` 命令里都用了 `--rm` 参数，这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果，是没有名字的 images，也会占用磁盘。可以参考 `如何删除Docker Container <https://zaiste.net/posts/removing_docker_containers/>`_ 来清理这些内容。
 
 
 .. _compile_deps:
@@ -197,7 +195,7 @@ BLAS
 
 PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
 `OpenBlAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集，
-还会下载MKL-DNN数学库，详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+还会下载MKL-DNN数学库，详细参考 `mkldnn设计文档 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
 
 如果关闭MKL，则会使用OpenBLAS作为BLAS库。
 
@@ -213,7 +211,7 @@ PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行，
 编译选项的设置
 ++++++++++++++
 
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时，首先在系统路径（ :code:`/usr/lib:/usr/local/lib` ）中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如
 
 ..  code-block:: bash
 
diff --git a/doc/v2/build_and_install/build_from_source_en.rst b/doc/v2/build_and_install/build_from_source_en.rst
index 8fef9e7347e8d924026999bfda985381750c6b51..b08b45d43ec7f1deb2889832079a731ee724a44c 100644
--- a/doc/v2/build_and_install/build_from_source_en.rst
+++ b/doc/v2/build_and_install/build_from_source_en.rst
@@ -11,7 +11,7 @@ To build PaddlePaddle, you need
 1. A computer -- Linux, Windows, MacOS.
 2. Docker.
 
-Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image. 
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.
 We run all the tools by running this image.
 
 .. _build_step:
@@ -22,8 +22,12 @@ How To Build
 You need to use Docker to build PaddlePaddle
 to avoid installing dependencies by yourself. We have several pre-built
 Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ ,
+you can also find how to build and use paddle_manylinux_devel Docker image from
+`here <https://github.com/PaddlePaddle/Paddle/tree/develop/tools/manylinux1/>`__
 Or you can build your own image from source as the optional step below:
 
+If you don't wish to use docker，you need to install several compile dependencies manually as :ref:`Compile Dependencies <_compile_deps>` shows to start compilation.
+
 .. code-block:: bash
 
    # 1. clone the source code
@@ -32,14 +36,12 @@ Or you can build your own image from source as the optional step below:
    # 2. Optional: build development docker image from source
    docker build -t paddle:dev .
    # 3. Run the following command to build a CPU-Only binaries
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh build
    # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev ./paddle/scripts/paddle_build.sh build
 
 NOTE: The above command try to mount the current working directory (root directory of source code)
-into :code:`/paddle` directory inside docker container. If you are using your own image
-(Step 4) it will run default entry-point :code:`build.sh` , so you could omit the last
-command in step 3.
+into :code:`/paddle` directory inside docker container.
 
 When the compile finishes, you can get the output whl package under
 build/python/dist, then you can choose to install the whl on local
@@ -72,21 +74,21 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU.
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 ./paddle/scripts/paddle_build.sh test
 
 If you wish to run only one unit test, like :code:`test_sum_op`:
 
 .. code-block:: bash
 
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
-   bash /paddle/paddle/scripts/docker/build.sh
-   cd /paddle/build
+   docker run -it -v $PWD:/paddle -w /paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 /bin/bash
+   ./paddle/scripts/paddle_build.sh build
+   cd build
    ctest -R test_sum_op -V
 
 .. _faq_docker:
 
 Frequently Asked Questions
-----------------
+---------------------------
 
 - What is Docker?
 
@@ -108,7 +110,7 @@ Frequently Asked Questions
 
 - How difficult is it to learn Docker?
 
-    It takes you ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
+    It takes you ten minutes to read `an introductory article <https://docs.docker.com/get-started>`_ and saves you more than one hour to install all required build tools, configure them, especially when new versions of PaddlePaddle require some new tools.  Not even to mention the time saved when other people trying to reproduce the issue you have.
 
 - Can I use my favorite IDE?
 
@@ -116,17 +118,16 @@ Frequently Asked Questions
 
   Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
 
-  ```emacs
-  (global-set-key "\C-cc" 'compile)
-  (setq compile-command
-   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
-  ```
+  .. code-block:: emacs
+
+    (global-set-key "\C-cc" 'compile)
+    (setq compile-command "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
 
   so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
 
 - Does Docker do parallel building?
 
-  Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
+  Our building Docker image runs a  `Bash script <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh>`_ , which calls `make -j$(nproc)` to starts as many processes as the number of your CPU cores.
 
 - Docker requires sudo
 
@@ -134,16 +135,16 @@ Frequently Asked Questions
 
 - Docker on Windows/MacOS builds slowly
 
-  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM some more memory and CPUs so to make the building efficient.  Please refer to `this issue  <https://github.com/PaddlePaddle/Paddle/issues/627>`_ for details.
 
 - Not enough disk space
 
-  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
+  Examples in this article use option `--rm` with the `docker run` command.  This option ensures that stopped containers do not exist on hard disks.  We can use `docker ps -a` to list all containers, including stopped.  Sometimes `docker build` generates some intermediate dangling images, which also take disk space.  To clean them, please refer to `this article <https://zaiste.net/posts/removing_docker_containers/>`_ .
 
 .. _compile_deps:
 
 Appendix: Compile Dependencies
-----------------
+-------------------------------
 
 PaddlePaddle need the following dependencies when compiling, other dependencies
 will be downloaded automatically.
@@ -164,11 +165,11 @@ will be downloaded automatically.
 .. _build_options:
 
 Appendix: Build Options
-----------------
+-------------------------
 
 Build options include whether build binaries for CPU or GPU, which BLAS
 library to use etc. You may pass these settings when running cmake.
-For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ 。
+For detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`__ 。
 
 
 You can add :code:`-D` argument to pass such options, like:
@@ -217,7 +218,7 @@ keep on with latest cuDNN versions. Be sure to run with the same version of cuDN
 you built.
 
 Pass Compile Options
-++++++++++++++
+++++++++++++++++++++++
 
 You can pass compile options to use intended BLAS/CUDA/Cudnn libraries.
 When running cmake command, it will search system paths like
diff --git a/doc/v2/build_and_install/docker_install_cn.rst b/doc/v2/build_and_install/docker_install_cn.rst
index 79d214635a069a739060e0b79424729f6ff90387..106c86bace075764c84bc2a7f7cb09d466fa8794 100644
--- a/doc/v2/build_and_install/docker_install_cn.rst
+++ b/doc/v2/build_and_install/docker_install_cn.rst
@@ -73,6 +73,7 @@
 当然，您也可以进入到Docker容器中，以交互式的方式执行或调试您的代码：
 
   .. code-block:: bash
+
      docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
      cd /work
      python train.py
@@ -97,7 +98,7 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
 
 国内用户可以使用下面的镜像源来加速访问：
 
-  .. code-block: bash
+  .. code-block:: bash
 
     docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 
diff --git a/doc/v2/build_and_install/docker_install_en.rst b/doc/v2/build_and_install/docker_install_en.rst
index e0e0559fb858a093db96a9b4ec1c5a45d6c71a38..25aecb8d0da9feb00006da6259b529b7011d91cb 100644
--- a/doc/v2/build_and_install/docker_install_en.rst
+++ b/doc/v2/build_and_install/docker_install_en.rst
@@ -80,6 +80,7 @@ Also, you can go into the container shell, run or debug your code
 interactively:
 
   .. code-block:: bash
+
      docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
      cd /work
      python train.py
@@ -104,7 +105,7 @@ We provide a packaged book image, simply issue the command:
 
 For users in China, we provide a faster mirror:
 
-  .. code-block: bash
+  .. code-block:: bash
 
     docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 
diff --git a/doc/v2/build_and_install/index_cn.rst b/doc/v2/build_and_install/index_cn.rst
index e079bb661f3a5141a09dfbc6893d1bf945697bc9..1a9305ac4b6578c14a962f223c647a71e3b8a72b 100644
--- a/doc/v2/build_and_install/index_cn.rst
+++ b/doc/v2/build_and_install/index_cn.rst
@@ -6,7 +6,7 @@
 PaddlePaddle针对不同的用户群体提供了多种安装方式。
 
 专注深度学习模型开发
------------------
+--------------------
 
 PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
 
@@ -18,7 +18,7 @@ PaddlePaddle提供了多种python wheel包，可通过pip一键安装：
 这是最便捷的安装方式，请根据机器配置和系统选择对应的安装包。
 
 关注底层框架
-----------
+-------------
 
 PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：
 
@@ -45,7 +45,7 @@ PaddlePaddle提供了基于Docker的安装方式，请参照以下教程：
 
 
 常见问题汇总
------------
+--------------
 
 如果在安装过程中遇到了问题，请先尝试在下面的页面寻找答案：
 
diff --git a/doc/v2/build_and_install/index_en.rst b/doc/v2/build_and_install/index_en.rst
index 7e0ca5bcbdbad0a3c97c0045bb57b51137668161..7990bacbd6966e88e8763e9c5709e410f7e9fed4 100644
--- a/doc/v2/build_and_install/index_en.rst
+++ b/doc/v2/build_and_install/index_en.rst
@@ -1,32 +1,56 @@
-Install and Build
-=================
+install and Compile
+======================
 
 .. _install_steps:
 
-Install Steps
-++++++++
+PaddlePaddle provides various methods of installation for many different users
 
-You can choose either pip or Docker to complete your install:
+Focus on Deep Learning Model Development
+----------------------------------------
+
+PaddlePaddle provides lots of packages of python wheel , that pip can install:
+
+.. toctree::
+	:maxdepth: 1
+
+	pip_install_en.rst
+
+This is the most convenient way of installation. Please choose the right installation package with machine configure and system.
+
+Follow the Bottom Frame
+------------------------
+
+PaddlePaddle also supports installation using Docker. Please refer to the tutorial below:
 
 .. toctree::
-   :maxdepth: 1
+	:maxdepth: 1
 
-   pip_install_en.rst
-   docker_install_en.rst
+	docker_install_en.rst
 
-Build from Source
------------------
+We recommend running PaddlePaddle in Docker. This method has the following advantages：
 
-..  warning::
+- Does not require installation of third-party dependencies. 
+- Easy to share runtime environment. 
 
-    We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary.
+Lastly, users can also compile and install PaddlePaddle from source code. The instructions are below:
 
-..  toctree::
+.. toctree::
     :maxdepth: 1
 
-    build_from_source_en.md
+    build_from_source_en.rst
+
+.. warning::
+
+	One caveat with this approach is that developers will have to download, compile and install all third-party dependencies. Thus this process of installation is more time consuming.
+
 
 FAQ
-++++++++++
+-----------
+
+For any problems during installation, please refer to the page below for answers:
+
+:ref:`常见问题解答 <install_faq>`
+
+If the problem still persists, you are welcome to seek assistance from the PaddlePaddle community：
 
-`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
+`创建issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_
diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst
index b3d882743785e8ee301b71b696230531d2b7ba58..095da19cd41d29bfa72ab23abd24bec45f925a86 100644
--- a/doc/v2/build_and_install/pip_install_cn.rst
+++ b/doc/v2/build_and_install/pip_install_cn.rst
@@ -10,20 +10,38 @@ PaddlePaddle可以使用常用的Python包管理工具
 使用pip安装
 ------------------------------
 
-
-执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件，版本为cpu_avx_openblas。
+执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境，并自动下载安装依赖软件。
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
+当前的默认版本为0.12.0，cpu_avx_openblas，您可以通过指定版本号来安装其它版本，例如:
+
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
 
-如果需要安装支持GPU的版本（cuda7.5_cudnn5_avx_openblas），需要执行：
+如果需要安装支持GPU的版本（cuda8.0_cudnn5_avx_openblas），需要执行：
 
   .. code-block:: bash
 
      pip install paddlepaddle-gpu
 
+当前的默认版本也是0.12.0，PaddlePaddle针对不同需求提供了更多版本的安装包，部分列表如下：
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            使用CUDA 8.0和cuDNN 5编译的0.12.0版本
+paddlepaddle-gpu==0.11.0.post87     使用CUDA 8.0和cuDNN 7编译的0.11.0版本
+paddlepaddle-gpu==0.11.0.post8      使用CUDA 8.0和cuDNN 5编译的0.11.0版本
+paddlepaddle-gpu==0.11.0            使用CUDA 7.5和cuDNN 5编译的0.11.0版本
+=================================   ========================================
+
+您可以在 `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ 中找到paddlepaddle-gpu的各个发行版本。
+
 如果需要获取并安装最新的（开发分支）PaddlePaddle，可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装，
 您可以从下面的表格中找到需要的版本：
 
@@ -37,12 +55,12 @@ PaddlePaddle可以使用常用的Python包管理工具
     :header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
     :widths: 1, 3, 3
 
-    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
@@ -69,7 +87,7 @@ PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.or
 ------------------------------
 
 - paddlepaddle*.whl is not a supported wheel on this platform.
-  
+
   出现这个问题的主要原因是，没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准，需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip：
 
     .. code-block:: bash
diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst
index 1e409d86b9775094998f72f92954f4bbc1013ea1..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0 100644
--- a/doc/v2/build_and_install/pip_install_en.rst
+++ b/doc/v2/build_and_install/pip_install_en.rst
@@ -12,20 +12,38 @@ Install using pip
 ------------------------------
 
 Run the following command to install PaddlePaddle on the current
-machine, it will also download requirements, the version is cpu_avx_openblas.
+machine, it will also download requirements.
 
   .. code-block:: bash
 
      pip install paddlepaddle
 
+the default version is 0.12.0, cpu_avx_openblas, you can specify the versions to satisfy your demands, like:
 
-If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:
+  .. code-block:: bash
+
+      pip install paddlepaddle==0.11.0
+
+If you need to install a GPU-enabled version (cuda8.0_cudnn5_avx_openblas), you need to run:
 
   .. code-block:: bash
 
      pip install paddlepaddle-gpu
 
-If you wish to install the latest develop branch PaddlePaddle, 
+The default version is also 0.12.0, PaddlePaddle provides several versions of packages for different needs, as shown in the table:
+
+=================================   ========================================
+版本号                               版本说明
+=================================   ========================================
+paddlepaddle-gpu==0.12.0            0.12.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0.post87     0.11.0 built with CUDA 8.0 and cuDNN 7
+paddlepaddle-gpu==0.11.0.post8      0.11.0 built with CUDA 8.0 and cuDNN 5
+paddlepaddle-gpu==0.11.0            0.11.0 built with CUDA 7.5 and cuDNN 5
+=================================   ========================================
+
+You can find all versions released of paddlepaddle-gpu in `Release History <https://pypi.org/project/paddlepaddle-gpu/#history>`_ .
+
+If you wish to install the latest develop branch PaddlePaddle,
 you can download the latest whl package from our CI system. Access
 the below links, log in as guest, then click at the "Artifact"
 tab, you'll find the download link of whl packages.
@@ -40,12 +58,12 @@ If the links below shows up the login form, just click "Log in as guest" to star
     :header: "version", "cp27-cp27mu", "cp27-cp27m"
     :widths: 1, 3, 3
 
-    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
-    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
+    "cpu_avx_mkl", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_avx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
+    "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl>`__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl>`__"
 
 .. _pip_dependency:
 
@@ -79,7 +97,7 @@ FAQ
 ------------------------------
 
 - paddlepaddle*.whl is not a supported wheel on this platform.
-  
+
   The main cause of this issue is that your current platform is
   not supported. Please check that you are using Python 2.7 series.
   Besides, pypi only supports manylinux1 standard, you'll need to
diff --git a/doc/design/cluster_train/README.md b/doc/v2/design/cluster_train/README.md
similarity index 100%
rename from doc/design/cluster_train/README.md
rename to doc/v2/design/cluster_train/README.md
diff --git a/doc/design/cluster_train/checkpointing.md b/doc/v2/design/cluster_train/checkpointing.md
similarity index 100%
rename from doc/design/cluster_train/checkpointing.md
rename to doc/v2/design/cluster_train/checkpointing.md
diff --git a/doc/design/cluster_train/data_dispatch.md b/doc/v2/design/cluster_train/data_dispatch.md
similarity index 100%
rename from doc/design/cluster_train/data_dispatch.md
rename to doc/v2/design/cluster_train/data_dispatch.md
diff --git a/doc/v2/design/cluster_train/large_model_dist_train.md b/doc/v2/design/cluster_train/large_model_dist_train.md
new file mode 100644
index 0000000000000000000000000000000000000000..edb0245ea083e791b7f32ac57a330698299fceda
--- /dev/null
+++ b/doc/v2/design/cluster_train/large_model_dist_train.md
@@ -0,0 +1,101 @@
+# Alalysis of large model distributed training in Paddle
+
+***NOTE: This is only some note for how we implemeted this scheme in V1, not a new design.***
+
+## What is it
+
+We often encounter cases that the embedding layer parameters(sparse) are so large that we can not store it in the trainer's memory when training. So we need to put them to several servers, and fetch them row by row instead of fetch all of the parameters.
+
+## How to use
+
+Specify command-line argument like  `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1  --use_old_updater=1` when starting the paddle trainer. And also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting pserver processes.
+
+Accrodingly, configure your embedding layers like:
+
+```python
+SPARSE_REMOTE=True
+
+w1 = data_layer(name="w1", size=dict_size)
+emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+w2 = data_layer(name="w2", size=dict_size)
+emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+...
+```
+
+## Implementation details
+
+```c++
+enum MatType {
+  MAT_NORMAL,
+  MAT_NORMAL_SHARED,
+  MAT_VALUE_SHARED,
+  MAT_SPARSE_ROW_IDS,
+  MAT_SPARSE_ROW_AUTO_GROW,
+  MAT_CACHE_ROW,
+  MAT_SPARSE_ROW,
+  MAT_SPARSE_ROW_PREFETCH,
+  MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
+};
+```
+
+`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only row of matrix when training.
+
+In `trainer_internal.cpp:L93 trainOneBatch`:
+
+```c++
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    REGISTER_TIMER("prefetch");
+    gradientMachine_->prefetch(inArgs);
+    parameterUpdater_->getParametersRemote();
+  }
+```
+
+When doing actual network forward and backward, at the beginning of each batch, the trainer will try to download one row of data from pserver.
+
+In `legacy/trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
+
+```c++
+if (fullSize) {
+    ...
+} else {
+getParams = [&] {
+    parameterClient_->getParameterSparse(
+        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
+};
+applyL1 = [](Parameter& para, real decayRate) {
+    para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
+};
+}
+```
+
+Calling `parameterClient_->getParameterSparse` will do remote call to pserver's `getParameterSparse`:
+
+```c++
+void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
+                                          std::vector<Buffer>& inputBuffers,
+                                          SendParameterResponse* response,
+                                          std::vector<Buffer>* outputBuffers) {
+  (void)inputBuffers;
+  auto& buffer = *readWriteBuffer_;
+  size_t numReals = 0;
+  for (const auto& block : request.blocks()) {
+    numReals += getParameterConfig(block).dims(1);
+  }
+  buffer.resize(numReals);
+
+  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
+
+  ReadLockGuard guard(parameterMutex_);
+  size_t offset = 0;
+  for (const auto& block : request.blocks()) {
+    size_t width = getParameterConfig(block).dims(1);
+    Buffer buf = {buffer.data() + offset, width};
+    int type = request.send_back_parameter_type();
+    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
+    offset += width;
+  }
+}
+```
+
+`getParameterConfig(block).dims(1)` returns the width of the current "parameter block"(a shard of parameter object),
+then `getParameterSparse` remote call returns only one row of data to the client.
diff --git a/doc/design/cluster_train/master_server.md b/doc/v2/design/cluster_train/master_server.md
similarity index 100%
rename from doc/design/cluster_train/master_server.md
rename to doc/v2/design/cluster_train/master_server.md
diff --git a/doc/design/cluster_train/pserver_client.md b/doc/v2/design/cluster_train/pserver_client.md
similarity index 100%
rename from doc/design/cluster_train/pserver_client.md
rename to doc/v2/design/cluster_train/pserver_client.md
diff --git a/doc/design/cluster_train/remote_parameter_updater.md b/doc/v2/design/cluster_train/remote_parameter_updater.md
similarity index 100%
rename from doc/design/cluster_train/remote_parameter_updater.md
rename to doc/v2/design/cluster_train/remote_parameter_updater.md
diff --git a/doc/design/cluster_train/save_model.md b/doc/v2/design/cluster_train/save_model.md
similarity index 100%
rename from doc/design/cluster_train/save_model.md
rename to doc/v2/design/cluster_train/save_model.md
diff --git a/doc/design/cluster_train/src/checkpointing.png b/doc/v2/design/cluster_train/src/checkpointing.png
similarity index 100%
rename from doc/design/cluster_train/src/checkpointing.png
rename to doc/v2/design/cluster_train/src/checkpointing.png
diff --git a/doc/design/cluster_train/src/data_dispatch.png b/doc/v2/design/cluster_train/src/data_dispatch.png
similarity index 100%
rename from doc/design/cluster_train/src/data_dispatch.png
rename to doc/v2/design/cluster_train/src/data_dispatch.png
diff --git a/doc/design/cluster_train/src/dataset.graffle b/doc/v2/design/cluster_train/src/dataset.graffle
similarity index 100%
rename from doc/design/cluster_train/src/dataset.graffle
rename to doc/v2/design/cluster_train/src/dataset.graffle
diff --git a/doc/design/cluster_train/src/dataset.png b/doc/v2/design/cluster_train/src/dataset.png
similarity index 100%
rename from doc/design/cluster_train/src/dataset.png
rename to doc/v2/design/cluster_train/src/dataset.png
diff --git a/doc/design/cluster_train/src/file_storage.graffle b/doc/v2/design/cluster_train/src/file_storage.graffle
similarity index 100%
rename from doc/design/cluster_train/src/file_storage.graffle
rename to doc/v2/design/cluster_train/src/file_storage.graffle
diff --git a/doc/design/cluster_train/src/file_storage.png b/doc/v2/design/cluster_train/src/file_storage.png
similarity index 100%
rename from doc/design/cluster_train/src/file_storage.png
rename to doc/v2/design/cluster_train/src/file_storage.png
diff --git a/doc/design/cluster_train/src/init_lock.graffle b/doc/v2/design/cluster_train/src/init_lock.graffle
similarity index 100%
rename from doc/design/cluster_train/src/init_lock.graffle
rename to doc/v2/design/cluster_train/src/init_lock.graffle
diff --git a/doc/design/cluster_train/src/init_lock.png b/doc/v2/design/cluster_train/src/init_lock.png
similarity index 100%
rename from doc/design/cluster_train/src/init_lock.png
rename to doc/v2/design/cluster_train/src/init_lock.png
diff --git a/doc/design/cluster_train/src/paddle-cloud-in-data-center.png b/doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png
similarity index 100%
rename from doc/design/cluster_train/src/paddle-cloud-in-data-center.png
rename to doc/v2/design/cluster_train/src/paddle-cloud-in-data-center.png
diff --git a/doc/design/cluster_train/src/paddle-etcd.graffle b/doc/v2/design/cluster_train/src/paddle-etcd.graffle
similarity index 100%
rename from doc/design/cluster_train/src/paddle-etcd.graffle
rename to doc/v2/design/cluster_train/src/paddle-etcd.graffle
diff --git a/doc/design/cluster_train/src/paddle-etcd.png b/doc/v2/design/cluster_train/src/paddle-etcd.png
similarity index 100%
rename from doc/design/cluster_train/src/paddle-etcd.png
rename to doc/v2/design/cluster_train/src/paddle-etcd.png
diff --git a/doc/design/cluster_train/src/paddle-model-sharding.graffle b/doc/v2/design/cluster_train/src/paddle-model-sharding.graffle
similarity index 100%
rename from doc/design/cluster_train/src/paddle-model-sharding.graffle
rename to doc/v2/design/cluster_train/src/paddle-model-sharding.graffle
diff --git a/doc/design/cluster_train/src/paddle-model-sharding.png b/doc/v2/design/cluster_train/src/paddle-model-sharding.png
similarity index 100%
rename from doc/design/cluster_train/src/paddle-model-sharding.png
rename to doc/v2/design/cluster_train/src/paddle-model-sharding.png
diff --git a/doc/design/cluster_train/src/paddle-ps-0.png b/doc/v2/design/cluster_train/src/paddle-ps-0.png
similarity index 100%
rename from doc/design/cluster_train/src/paddle-ps-0.png
rename to doc/v2/design/cluster_train/src/paddle-ps-0.png
diff --git a/doc/design/cluster_train/src/paddle-ps-1.png b/doc/v2/design/cluster_train/src/paddle-ps-1.png
similarity index 100%
rename from doc/design/cluster_train/src/paddle-ps-1.png
rename to doc/v2/design/cluster_train/src/paddle-ps-1.png
diff --git a/doc/design/cluster_train/src/paddle-ps.graffle b/doc/v2/design/cluster_train/src/paddle-ps.graffle
similarity index 100%
rename from doc/design/cluster_train/src/paddle-ps.graffle
rename to doc/v2/design/cluster_train/src/paddle-ps.graffle
diff --git a/doc/design/cluster_train/src/paddle-task-queues.graffle b/doc/v2/design/cluster_train/src/paddle-task-queues.graffle
similarity index 100%
rename from doc/design/cluster_train/src/paddle-task-queues.graffle
rename to doc/v2/design/cluster_train/src/paddle-task-queues.graffle
diff --git a/doc/design/cluster_train/src/paddle-task-queues.png b/doc/v2/design/cluster_train/src/paddle-task-queues.png
similarity index 100%
rename from doc/design/cluster_train/src/paddle-task-queues.png
rename to doc/v2/design/cluster_train/src/paddle-task-queues.png
diff --git a/doc/design/cluster_train/src/paddle-task-states.graffle b/doc/v2/design/cluster_train/src/paddle-task-states.graffle
similarity index 100%
rename from doc/design/cluster_train/src/paddle-task-states.graffle
rename to doc/v2/design/cluster_train/src/paddle-task-states.graffle
diff --git a/doc/design/cluster_train/src/paddle-task-states.png b/doc/v2/design/cluster_train/src/paddle-task-states.png
similarity index 100%
rename from doc/design/cluster_train/src/paddle-task-states.png
rename to doc/v2/design/cluster_train/src/paddle-task-states.png
diff --git a/doc/design/cluster_train/src/pserver_init.graffle b/doc/v2/design/cluster_train/src/pserver_init.graffle
similarity index 100%
rename from doc/design/cluster_train/src/pserver_init.graffle
rename to doc/v2/design/cluster_train/src/pserver_init.graffle
diff --git a/doc/design/cluster_train/src/pserver_init.png b/doc/v2/design/cluster_train/src/pserver_init.png
similarity index 100%
rename from doc/design/cluster_train/src/pserver_init.png
rename to doc/v2/design/cluster_train/src/pserver_init.png
diff --git a/doc/design/cluster_train/src/submit-job.graffle b/doc/v2/design/cluster_train/src/submit-job.graffle
similarity index 100%
rename from doc/design/cluster_train/src/submit-job.graffle
rename to doc/v2/design/cluster_train/src/submit-job.graffle
diff --git a/doc/design/cluster_train/src/submit-job.png b/doc/v2/design/cluster_train/src/submit-job.png
similarity index 100%
rename from doc/design/cluster_train/src/submit-job.png
rename to doc/v2/design/cluster_train/src/submit-job.png
diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/v2/design/cluster_train/src/trainer.graffle
similarity index 100%
rename from doc/design/cluster_train/src/trainer.graffle
rename to doc/v2/design/cluster_train/src/trainer.graffle
diff --git a/doc/design/cluster_train/src/trainer.png b/doc/v2/design/cluster_train/src/trainer.png
similarity index 100%
rename from doc/design/cluster_train/src/trainer.png
rename to doc/v2/design/cluster_train/src/trainer.png
diff --git a/doc/design/cluster_train/submit-job.md b/doc/v2/design/cluster_train/submit-job.md
similarity index 100%
rename from doc/design/cluster_train/submit-job.md
rename to doc/v2/design/cluster_train/submit-job.md
diff --git a/doc/v2/design/interface/00.why_plain_c.md b/doc/v2/design/interface/00.why_plain_c.md
new file mode 100644
index 0000000000000000000000000000000000000000..826ff3141bc2512b525cb44ac0f18b376ce57e92
--- /dev/null
+++ b/doc/v2/design/interface/00.why_plain_c.md
@@ -0,0 +1,118 @@
+# Paddle多语言接口实现
+## 背景
+
+Paddle需要一个多语言接口，这个接口需要做到:
+
+* 有标准的，良好的文档
+    * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档，golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。
+* 不同语言的接口适应不同语言的特性
+    * 例如Java与Python的错误处理是直接扔出来Exception，而对于golang错误处理应该使用返回值。
+
+## 基本要求
+
+Paddle的多语言接口实现包括一下几个方面:
+
+* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器，也不使用其他动态库。
+* 这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号。
+* 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)。
+* 不使用SWIG这种代码生成器，而是手写多语言绑定。
+
+
+## 原因
+
+### 使用动态库来分发Paddle
+
+* Paddle的链接方式比较复杂
+    * 如果用户要把Paddle的静态库（libpaddle.a）链接到自己的程序里，得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数，来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。
+* 编译型语言，例如C/C++使用静态库和动态库难度差不多。但是解释性语言，例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni)，只能调用Paddle的动态库，否则得把Paddle静态库链接到解释器里。
+    * 解释性语言实际运行的二进制是解释器本身，如果调用静态库只能将静态库与解释器链接。例如对于Java来说，便是将静态库加入JVM中。这对于通常的Java的开发者来说，是不常见的做法。
+
+### 动态库中不嵌入任何其他语言的解释器
+
+* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取
+* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析，数据读取均交由其他语言完成
+
+现阶段Paddle有一个问题是，Paddle内嵌的Python解释器和外部使用的Python如果版本不同，会直接报错退出。
+
+### Paddle动态库中，不引用其他动态库
+
+* 即这个动态库是不依赖于其他任何文件的，可以在任何机器上执行的。
+
+###  这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号
+
+* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范，不同版本的编译器之间，对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库)，需要有稳定的导出符号。
+* C语言是有导出符号的标准的，并且在常见的平台上，都是ABI调用标准的。
+* 大多数语言都支持使用C语言API
+* 使用C99而不使用C89，是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。
+* 使用C99而不使用C11的原因是，[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性，且C99相对于C11使用更加广泛。
+
+### 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)
+
+* Paddle内部的类为C++书写，直接导出到C的接口比较困难。
+* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。
+
+在C的头文件 `paddle_matrix.h` 中:
+
+```C
+typedef void* paddle_matrix;
+typedef int paddle_error;
+
+extern "C"
+paddle_error paddle_matrix_get_shape(paddle_matrix matrix,
+                                     uint64_t* width,
+                                     uint64_t* height);
+```
+而在CPP里面实现这个C的接口，文件 `paddle_matrix.cpp`
+
+```cpp
+#include "paddle/legacy/math/matrix.h"
+extern "C"
+paddle_error paddle_matrix_shape(paddle_matrix matrix,
+                                 uint64_t *width,
+                                 uint64_t *height) {
+  auto m = (paddle::capi::CMatrix*)(matrix);
+  *width = m->width();
+  *height = m->height();
+}
+```
+
+其中`paddle/capi/CMatrix.hpp`文件内容为:
+
+```cpp
+namespace paddle {
+namespace math {  
+
+class CMatrix {
+  std::shared_ptr<paddle::Matrix> mat;
+};
+
+}  // namespace math
+}  // namespace paddle
+```
+
+### 不使用SWIG这种代码生成器，而是手写多语言绑定
+
+* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码，SWIG直接读取C/C++的头文件，生成各种语言的绑定代码。
+    * 对于多语言接口，SWIG需要写一个interface文件。这个文件具有独特的语法，学习成本高。且增加一个第三方语言，就需要对这个第三方语言增加一些定义。有的时候，interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。
+    * SWIG暴露的接口保留了C++的接口样式，很难保证多语言代码风格的一致性。(函数命名，错误处理)
+        * 因为SWIG在第三方语言中暴露的函数名，类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里，将大量的`SomeCppClass`重命名成`some_python_class`，或者`SomeGoTypes`。
+        * 对于不同语言，错误处理的方式也不尽相同。例如对于Java或者Python，最常见的错误处理方式是Exception，而对于Golang，错误处理方式是返回值。而SWIG只能简单的暴露C++接口，无法做到对于各种语言错误处理方式的适配。
+    * 对于大多数语言，直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。
+    * SWIG支持的语言或者解释器有局限。例如对于Python，使用SWIG只支持CPython解释器，而不支持PyPy解释器。
+
+
+## 原因列表
+
+| 结论 | 对比 | 原因 |
+|---| --- | --- |
+| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库，Paddle静态库链接复杂 |
+| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器，会导致不同版本Python在一个进程里的bug |
+| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 |
+| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI，C99是目前C最广泛的使用标准，且C99支持bool类型和定长整数(uint64_t等)类型 |
+| 使用void*作为类句柄 | 不显示的写每个类具体包含什么| 实现简单，并且让接口脱离实现细节 |
+| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置，社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 |
+
+
+## 实现
+
+参考[Inference implementation](01.inference_implementation.md)
diff --git a/doc/design/multi_language_interface/01.inference_implementation.md b/doc/v2/design/interface/01.inference_implementation.md
similarity index 100%
rename from doc/design/multi_language_interface/01.inference_implementation.md
rename to doc/v2/design/interface/01.inference_implementation.md
diff --git a/doc/v2/design/interface/index_cn.rst b/doc/v2/design/interface/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2509a5c5f4182d8ce3a16a3b7bd92c0d7bf5b056
--- /dev/null
+++ b/doc/v2/design/interface/index_cn.rst
@@ -0,0 +1,7 @@
+多语言接口
+------------
+
+.. toctree::
+  :maxdepth: 1
+
+  00.why_plain_c.md
diff --git a/doc/v2/design/interface/index_en.rst b/doc/v2/design/interface/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..356e58c39c5ef6ee5ee50ab999b85f88628bfb85
--- /dev/null
+++ b/doc/v2/design/interface/index_en.rst
@@ -0,0 +1,7 @@
+Multilingual Interface
+-----------------------
+
+.. toctree::
+  :maxdepth: 1
+
+  00.why_plain_c.md
diff --git a/doc/design/mkl/image/engine.png b/doc/v2/design/mkl/image/engine.png
similarity index 100%
rename from doc/design/mkl/image/engine.png
rename to doc/v2/design/mkl/image/engine.png
diff --git a/doc/design/mkl/image/gradients.png b/doc/v2/design/mkl/image/gradients.png
similarity index 100%
rename from doc/design/mkl/image/gradients.png
rename to doc/v2/design/mkl/image/gradients.png
diff --git a/doc/design/mkl/image/layers.png b/doc/v2/design/mkl/image/layers.png
similarity index 100%
rename from doc/design/mkl/image/layers.png
rename to doc/v2/design/mkl/image/layers.png
diff --git a/doc/design/mkl/image/matrix.png b/doc/v2/design/mkl/image/matrix.png
similarity index 100%
rename from doc/design/mkl/image/matrix.png
rename to doc/v2/design/mkl/image/matrix.png
diff --git a/doc/design/mkl/image/overview.png b/doc/v2/design/mkl/image/overview.png
similarity index 100%
rename from doc/design/mkl/image/overview.png
rename to doc/v2/design/mkl/image/overview.png
diff --git a/doc/design/mkl/mkl_packed.md b/doc/v2/design/mkl/mkl_packed.md
similarity index 100%
rename from doc/design/mkl/mkl_packed.md
rename to doc/v2/design/mkl/mkl_packed.md
diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md
new file mode 100644
index 0000000000000000000000000000000000000000..4876de0045979be20fa45bdc84d2594516f71c03
--- /dev/null
+++ b/doc/v2/design/mkl/mkldnn.md
@@ -0,0 +1,237 @@
+# Intel® MKL-DNN on PaddlePaddle: Design Doc
+
+我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn)
+(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle，
+充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/overview.png"><br/>
+Figure 1. PaddlePaddle on IA
+</div>
+
+近期目标
+
+- 完成常用Layer的MKL-DNN实现。
+- 完成常见深度神经网络VGG，GoogLeNet 和 ResNet的MKL-DNN实现。
+
+目前的优化，主要针对PaddlePaddle在重构之前的代码框架以及V1的API。
+具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。
+
+## Contents
+
+- [Overview](#overview)
+- [Actions](#actions)
+ 	- [CMake](#cmake)
+ 	- [Matrix](#matrix)
+	- [Layers](#layers)
+	- [Activations](#activations)
+	- [Parameters](#parameters)
+	- [Gradients](#gradients)
+	- [Unit Tests](#unit-tests)
+	- [Python API](#python-api)
+	- [Benchmarking](#benchmarking)
+	- [Others](#others)
+- [Design Concerns](#design-concerns)
+
+## Overview
+
+我们会把MKL-DNN会作为第三方库集成进PaddlePaddle，与其他第三方库一样，会在编译PaddlePaddle的时候下载并编译MKL-DNN。
+
+同时，为了进一步提升PaddlePaddle在基本数学运算的计算速度，我们也将MKLML即(MKL small library\[[1](#references)\])
+作为另一个第三方库集成进PaddlePaddle，它只会包括生成好的动态库和头文件。
+
+MKL，MKLML以及MKL-DNN三者关系如下表：
+
+<table>
+<thead>
+<tr>
+<th>Name</th>
+<th>Open Source</th>
+<th>License</th>
+<th>Descriptions</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>MKL</td>
+<td>No</td>
+<td>Proprietary</td>
+<td>Accelerate math processing routines</td>
+</tr>
+<tr>
+<td>MKLML</td>
+<td>No</td>
+<td>Proprietary</td>
+<td>Small package of MKL, especially for Machine Learning</td>
+</tr>
+
+<tr>
+<td>MKL-DNN</td>
+<td>Yes</td>
+<td>Apache 2.0</td>
+<td>Accelerate primitives processing routines especially for Deep Neural Networks</td>
+</tr>
+
+</tbody>
+</table>
+
+MKLML可以与MKL-DNN共同使用，以此达到最好的性能。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/engine.png"><br/>
+Figure 2. PaddlePaddle with MKL Engines
+</div>
+
+## Actions
+
+添加的相关文件和目录结构如下：
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+├── cmake/
+│   ├── external/
+│   │   ├── ...
+│   │   ├── mkldnn.cmake
+│   │   └── mklml.cmake
+└── paddle/
+    ├── ...
+    ├── math/
+    │   ├── ...
+    │   └── MKLDNNMatrix.*
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   └── MKLDNN*Layer.*
+        ├── activations/
+        │   ├── ...
+        │   └── MKLDNNActivations.*
+        └── tests/
+            ├── ...
+            ├── MKLDNNTester.*
+            └── test_MKLDNN.cpp
+```
+
+### CMake
+在`CMakeLists.txt`中提供一个与MKL有关的总开关：`WITH_MKL`，它负责决定编译时是否使用MKLML和MKL-DNN
+
+- `WITH_MKLML` 控制是否使用MKLML库。
+当打开`WITH_MKL`时，会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。
+编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
+MKLML的库目前都是动态库，主要包括`libiomp5.so`和`libmklml_intel.so`。
+- `WITH_MKLDNN` 控制是否使用MKL-DNN。
+当开启`WITH_MKL`时，会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。
+编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。
+MKL-DNN的库目前只有动态库`libmkldnn.so`。
+
+### Matrix
+目前在PaddlePaddle中数据都是以`NCHW`的格式存储，但是在MKL-DNN中的排列方式不止这一种。
+所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/matrix.png"><br/>
+Figure 3. MKLDNNMatrix
+</div>
+
+### Layers
+所有MKL-DNN的Layers都会继承于`MKLDNNLayer`，该类继承于PaddlePaddle的基类`Layer`。
+在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑，
+子类只需要使用定义好的接口，实现具体的函数功能即可。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/layers.png"><br/>
+Figure 4. MKLDNNLayer
+</div>
+
+每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix：
+
+- 内部存储（internel memory）：`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表输入数据，输入梯度，输出数据和输出梯度。
+- 外部存储（external memory）：都是以ext开头，比如`extInVal_`和`extInGrad_`，它们主要是用于，
+当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时，转换内存的工作。
+需要注意的是，PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`，
+所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存，
+如果不需要外部存储用于转换，那么对应的内部存储也会与它们共享内存。
+- 转换函数（resetXXX）： 包括`resetInValue`，`resetInGrad`，`resetOutValue`和`resetOutGrad`，
+表示对输入数据，输入梯度，输出数据和输出梯度的转换。
+这些函数会根据输入参数重新设置内部和外部存储，当然这两者也可以相等，即表示不需要转换。
+
+注意：每个`MKLDNNlayer`的子类只需要使用内部存储就可以了，所有外部的转换工作都会在reset系列函数中都准备好。
+
+### Activations
+在重构前的PaddlePaddle中，激活函数是独立于`Layer`的概念，并且输入输出都是共用一块内存，
+所以添加了对应的`MKLDNNActivation`来实现，方式类似于`MKLDNNLayer`。
+
+### Parameters
+对于有参数的层，我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。
+如果存在数据排列格式不一样的情况时，我们会在网络训练之前把格式转换为MKL-DNN希望的格式，
+在训练结束的时候再保存为PaddlePaddle的格式，但是整个训练过程中不需要任何转换。
+这样既使得最终保存的参数格式与PaddlePaddle一致，又可以避免不必要的转换。
+
+### Gradients
+由于MKL-DNN的操作都是直接覆盖的形式，也就是说输出的结果不会在原来的数据上累加，
+这样带来的好处就是不需要一直清空memory，节省了不必要的操作。
+但是注意的是，当网络出现分支且在`backward`的时候，需要累加不同Layer传过来的梯度。
+所以在`MKLDNNlayer`中实现了一个merge的方法，此时每个小分支的`Input Gradient`
+会先临时保存在`MKLDNNMatrix`中，由分支处的Layer负责求和，并把结果放到当前层的`output_.grad`中。
+所以整体上，在实现每个子类的时候就不需要关心分支的事情了。
+
+<div align="center">
+<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/v2/images/gradients.png"><br/>
+Figure 5. Merge Gradients
+</div>
+
+### Unit Tests
+我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
+测试分为每个Layer（或Activation）的单元测试和简单网络的整体测试。
+每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
+
+### Python API
+目前只考虑**v1 API**。
+
+计划在`python/paddle/trainer/config_parser.py`里面添加`use_mkldnn`这个选择，方便用户选择使用MKL-DNN的layers。
+
+具体实现方式比如：
+
+```python
+use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+if use_mkldnn
+    self.layer_type = mkldnn_*
+```
+
+所有MKL-DNN的`layer_type`会以*mkldnn_*开头，这些会在`MKLDNN*Layer`注册layer的时候保证，以示区分。
+
+同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
+
+### Benchmarking
+会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image)，用于测试和对比在使用MKL-DNN前后的CNN网络性能。
+测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md)
+
+### Others
+1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为4096，具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。
+2. 深入PaddlePaddle，寻找有没有其他可以优化的可能，进一步优化。比如可能会用OpenMP改进SGD的更新性能。
+
+## Design Concerns
+
+为了更好的符合PaddlePaddle的代码风格\[[3](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。
+
+我们总结出一些特别需要注意的点：
+
+1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，
+我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
+2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
+3. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。
+包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
+4. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，
+同时数据格式就是`NCHW`，这样下一个cpu device就能拿到正确的数据。
+在有普通的CPU layer时， `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。
+
+## References
+1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。
+主要包括了深度学习相关的数学原语与操作，一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
+2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。
+目前在PaddlePaddle中，仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。
+3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。
+但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
+4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
+所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
diff --git a/doc/v2/dev/contribute_to_paddle_cn.md b/doc/v2/dev/contribute_to_paddle_cn.md
index d8bf093e09b53b302225739fa67146adc7976e4b..3244eedf918b93f9351258f1218dfb2d507c1a9c 100644
--- a/doc/v2/dev/contribute_to_paddle_cn.md
+++ b/doc/v2/dev/contribute_to_paddle_cn.md
@@ -51,6 +51,8 @@ Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 G
 
 Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式，请确保 `clang-format` 版本在 3.8 以上。
 
+注：通过`pip install pre-commit`和`conda install -c conda-forge pre-commit`安装的`yapf`稍有不同的，Paddle 开发人员使用的是`pip install pre-commit`。
+
 ## 开始开发
 
 在本例中，我删除了 README.md 中的一行，并创建了一个新文件。
@@ -102,7 +104,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 ➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```
 
-关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
+关于构建和测试的更多信息，请参见[使用Docker安装运行](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2/build_and_install/docker_install_cn.rst)。
 
 ## 提交（commit）
 
diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst
index 549f5fa9aace7eb699d229e5f61fe10ae4ed4d66..cbff313fc5b9468b58159cf2b04e8464f9bebc78 100644
--- a/doc/v2/dev/index_en.rst
+++ b/doc/v2/dev/index_en.rst
@@ -1,9 +1,28 @@
 Development
 ------------
 
+
+PaddlePaddle adheres to the following three sections of code and document specifications.
+
+
+PaddlePaddle uses git for version control and Docker is used for building and testing environment. The code includes Cuda, C++, Python, Shell and other programming languages，which comply with Google C++ Style, Pep-8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development.
+
 ..  toctree::
   :maxdepth: 1
 
   contribute_to_paddle_en.md
+
+
+PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem description. The design documents focus on problem descriptions, backgrounds, and are followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend to use the paddlepaddle.org tool to compile and generate and preview documents locally. Please refer to:
+
+..  toctree::
+  :maxdepth: 1
+
   write_docs_en.rst
+
+PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize layer, please refer to the following, and welcome to propose patch.
+
+..  toctree::
+  :maxdepth: 1
+
   new_layer_en.rst
diff --git a/doc/v2/dev/new_layer_cn.rst b/doc/v2/dev/new_layer_cn.rst
index 0ded1c262adad44f4df000ef2933c7b68050f2fc..e5a14346123d342de0b67757cbbce654bd4180dc 100644
--- a/doc/v2/dev/new_layer_cn.rst
+++ b/doc/v2/dev/new_layer_cn.rst
@@ -16,7 +16,7 @@
 
 下图是一个全连接层的示意图。在全连接层中，每个输出节点都连接到所有的输入节点上。
 
-..  image:: FullyConnected.jpg
+..  image:: src/FullyConnected.jpg
     :align: center
     :scale: 60 %
 
@@ -58,7 +58,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
 实现C++类
 ===================
 
-一个网络层的C++类需要实现初始化，前向和后向。全连接层的实现位于:code:`paddle/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。
+一个网络层的C++类需要实现初始化，前向和后向。全连接层的实现位于:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h`及:code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`。这里我们展示一份简化过的代码。
 
 这个类需要继承 :code:`paddle::Layer` 这个基类，并且需要重写基类中的以下几个虚函数：
 
@@ -153,7 +153,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
 
 - 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。
 - 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小，所以这一步是必要的。 :code:`reserveOutput`  会相应地改变输出的尺寸。为了保证效率，如果需要扩大矩阵，我们会重新分配内存；如果需要缩减矩阵，我们会继续使用现有的内存块。
-- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵，每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作，请参考 :code:`paddle/math/Matrix.h`和:code:`paddle/math/BaseMatrix.h` 。
+- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b`。:code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵，每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作，请参考 :code:`paddle/legacy/math/Matrix.h`和:code:`paddle/legacy/math/BaseMatrix.h` 。
 - 最终，使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。
 
 
@@ -262,7 +262,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
     REGISTER_LAYER(fc, FullyConnectedLayer);
     }
 
-若 :code:`cpp` 被放在 :code:`paddle/gserver/layers` 目录下，其会自动被加入编译列表。
+若 :code:`cpp` 被放在 :code:`paddle/legacy/gserver/layers` 目录下，其会自动被加入编译列表。
 
 
 写梯度检查单元测试
@@ -270,7 +270,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
 
 写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ，然后观察到输出的变化为 :math:`\Delta y` ，那么，梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后，再用这个梯度去和 :code:`backward` 函数得到的梯度去对比，以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算，并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。
 
-所有网络层的梯度检查单测都位于 :code:`paddle/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步：
+所有网络层的梯度检查单测都位于 :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步：
 
 + 生成网络层配置。网络层配置包含以下几项：
    - 偏置参数的大小。（例子中是4096）
@@ -322,7 +322,7 @@ PaddlePaddle的base layer类可以自动计算上面的导数。
       }
     }
 
-如果你要为了测试而增加新的文件，例如 :code:`paddle/gserver/tests/testFCGrad.cpp` ，你需要把该文件加入 :code:`paddle/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时，所有的单测都会被执行一次。注意，有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。
+如果你要为了测试而增加新的文件，例如 :code:`paddle/legacy/gserver/tests/testFCGrad.cpp` ，你需要把该文件加入 :code:`paddle/legacy/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时，所有的单测都会被执行一次。注意，有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 `ON` 。
 
 .. code-block:: bash
 
diff --git a/doc/v2/dev/new_layer_en.rst b/doc/v2/dev/new_layer_en.rst
index 110a9fb38f890a766bb4480e91feb22d3b0838a5..ad723738801908a5f48343574c204bdbfc97ee08 100644
--- a/doc/v2/dev/new_layer_en.rst
+++ b/doc/v2/dev/new_layer_en.rst
@@ -16,7 +16,7 @@ First we need to derive equations of the *forward* and *backward* part of the la
 
 The illustration of a fully connected layer is shown in the following figure. In a fully connected layer, all output nodes are connected to all the input nodes.
 
-..  image:: FullyConnected.jpg
+..  image:: src/FullyConnected.jpg
     :align: center
     :scale: 60 %
 
@@ -58,7 +58,7 @@ Finally we can use chain rule to calculate :math:`\frac{\partial z}{\partial x}`
 Implement C++ Class
 ===================
 
-The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below.
+The C++ class of the layer implements the initialization, forward, and backward part of the layer. The fully connected layer is at :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.h` and :code:`paddle/legacy/gserver/layers/FullyConnectedLayer.cpp`. We list simplified version of the code below.
 
 It needs to derive the base class :code:`paddle::Layer`, and it needs to override the following functions:
 
@@ -154,7 +154,7 @@ The implementation of the forward part has the following steps.
 
 - Every layer must call :code:`Layer::forward(passType);` at the beginning of its :code:`forward` function.
 - Then it allocates memory for the output using :code:`reserveOutput(batchSize, size);`. This step is necessary because we support the batches to have different batch sizes. :code:`reserveOutput` will change the size of the output accordingly. For the sake of efficiency, we will allocate new memory if we want to expand the matrix, but we will reuse the existing memory block if we want to shrink the matrix.
-- Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieve the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents an single input in a batch. For a complete lists of supported matrix operations, please refer to :code:`paddle/math/Matrix.h` and :code:`paddle/math/BaseMatrix.h`.
+- Then it computes :math:`\sum_i W_i x + b` using Matrix operations. :code:`getInput(i).value` retrieve the matrix of the i-th input. Each input is a :math:`batchSize \times dim` matrix, where each row represents an single input in a batch. For a complete lists of supported matrix operations, please refer to :code:`paddle/legacy/math/Matrix.h` and :code:`paddle/legacy/math/BaseMatrix.h`.
 - Finally it applies the activation function using :code:`forwardActivation();`. It will automatically applies the corresponding activation function specifies in the network configuration.
 
 
@@ -263,7 +263,7 @@ Finally, you can use :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` to registe
     REGISTER_LAYER(fc, FullyConnectedLayer);
     }
 
-If the :code:`cpp` file is put into :code:`paddle/gserver/layers`, it will be automatically added to the compilation list.
+If the :code:`cpp` file is put into :code:`paddle/legacy/gserver/layers`, it will be automatically added to the compilation list.
 
 
 Write Gradient Check Unit Test
@@ -271,7 +271,7 @@ Write Gradient Check Unit Test
 
 An easy way to verify the correctness of new layer's implementation is to write a gradient check unit test. Gradient check unit test utilizes finite difference method to verify the gradient of a layer. It modifies the input with a small perturbation :math:`\Delta x` and observes the changes of output :math:`\Delta y`, the gradient can be computed as :math:`\frac{\Delta y}{\Delta x }`. This gradient can be compared with the gradient computed by the :code:`backward` function of the layer to ensure the correctness of the gradient computation. Notice that the gradient check only tests the correctness of the gradient computation, it does not necessarily guarantee the correctness of the implementation of the :code:`forward` and :code:`backward` function. You need to write more sophisticated unit tests to make sure your layer is implemented correctly.
 
-All the gradient check unit tests are located in :code:`paddle/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient test of the gradient check unit test of the fully connected layer is listed below. It has the following steps.
+All the gradient check unit tests are located in :code:`paddle/legacy/gserver/tests/test_LayerGrad.cpp`. You are recommended to put your test into a new test file if you are planning to write a new layer. The gradient test of the gradient check unit test of the fully connected layer is listed below. It has the following steps.
 
 + Create layer configuration. A layer configuration can include the following attributes:
    - size of the bias parameter. (4096 in our example)
@@ -323,7 +323,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
       }
     }
 
-If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
+If you are creating a new file for the test, such as :code:`paddle/legacy/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/legacy/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
 
 .. code-block:: bash
 
@@ -339,7 +339,7 @@ If you are creating a new file for the test, such as :code:`paddle/gserver/tests
 Implement Python Wrapper
 ========================
 
-Implementing Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/trainer/config_parser.py`. An example of the Python wrapper for fully connected layer is listed below. It has the following steps:
+Implementing Python wrapper allows us to use the added layer in configuration files. All the Python wrappers are in file :code:`python/paddle/legacy/trainer/config_parser.py`. An example of the Python wrapper for fully connected layer is listed below. It has the following steps:
 
 - Use :code:`@config_layer('fc')` at the decorator for all the Python wrapper class. :code:`fc` is the identifier of the layer.
 - Implements :code:`__init__` constructor function.
diff --git a/doc/v2/dev/FullyConnected.jpg b/doc/v2/dev/src/FullyConnected.jpg
similarity index 100%
rename from doc/v2/dev/FullyConnected.jpg
rename to doc/v2/dev/src/FullyConnected.jpg
diff --git a/doc/v2/dev/src/doc_en.png b/doc/v2/dev/src/doc_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..ed6b9178fba91a3bdf45ae797a9924f84146fbc8
Binary files /dev/null and b/doc/v2/dev/src/doc_en.png differ
diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst
index f79769b810b91c6984016d95f40b89186bfb61b0..4231f2bb5cd800c0cd86835b5d07e491fcde4989 100644
--- a/doc/v2/dev/write_docs_cn.rst
+++ b/doc/v2/dev/write_docs_cn.rst
@@ -2,20 +2,20 @@
 如何贡献文档
 #############
 
-PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
-也可以利用PaddlePaddle 工具来编译文档，这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下
+PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的，PaddlePaddle.org工具可以帮助我们实现这一编译过程，并提供更好的预览效果。
 
 如何构建文档
 ============
 
-PaddlePaddle的文档构建有三种方式。
+PaddlePaddle的文档构建有两种方式，分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具，两种方式都有各自的优点，前者方便预览，后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。
 
+我们建议使用PaddlePaddle.org工具来构建文档。
 
 使用PaddlePaddle.org工具
---------------
-这个是目前推荐的使用方法。除了可以自动编译文档，也可以直接在网页预览文档。
+------------------------
+这个是目前推荐的使用方法。除了可以自动编译文档，还可以直接在网页中预览文档，需要注意的是，采用后续说明的其它方式虽然也可以预览文档，但是文档的样式与官网文档是不一致的，使用PaddlePaddle.org工具进行编译才能产生与官网文档样式一致的预览效果。
 
-文件工具是使用Docker，需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具
+PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后即可用以下命令启动工具
 
 ..  code-block:: bash
 
@@ -32,10 +32,10 @@ PaddlePaddle的文档构建有三种方式。
     docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
 
 注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档
 编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
 
-如果不想使用 Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+如果不想使用Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
 
 ..  code-block:: bash
 
@@ -57,42 +57,67 @@ PaddlePaddle的文档构建有三种方式。
     python manage.py runserver
 
 工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
-之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。
 编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
 
 想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
 
-使用Docker构建
---------------
+不使用PaddlePaddle.org工具
+--------------------------
 
-使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档，即
+使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。该方法与 `从源码编译PaddlePaddle <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_ 相似，通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行，在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档，具体步骤如下：
 
-..  code-block:: bash
+.. code-block:: bash
 
-    cd TO_YOUR_PADDLE_CLONE_PATH
-    cd paddle/scripts/tools/build_docs
-    sh build_docs.sh
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
 
-编译完成之后，会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
-打开浏览器访问对应目录下的index.html即可访问本地文档。
+   # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
 
-直接构建
---------
+   # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
+   bash -x /paddle/paddle/scripts/docker/build.sh
 
-如果提示正确，可以执行以下命令编译生成文档，即
+注：上述命令把当前目录（源码根目录）映射为 container 里的 :code:`/paddle` 目录。
 
-..  code-block:: bash
+编译完成后，会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录，分别进入这些目录下，执行以下命令：
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。
+
+如果不想使用Docker，也可以使用以下命令直接构建PaddlePaddle文档，即
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+   # 如果只需要构建使用文档，则执行以下命令
+   make -j $processors paddle_docs
+
+   # 如果只需要构建API，则执行以下命令
+   make -j $processors paddle_apis
+
+其中$processors代表启动和CPU核一样多的进程来并行编译，可以根据本机的CPU核数设置相应的值。
+
+编译完成后，同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录，选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录，分别进入这些子目录下，执行以下命令：
+
+.. code-block:: bash
 
-    cd TO_YOUR_PADDLE_CLONE_PATH
-    mkdir -p build
-    cd build
-    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
-    make gen_proto_py
-    make paddle_docs paddle_docs_cn
+   python -m SimpleHTTPServer 8088
 
-编译完成之后，会在当前目录生成两个子目录\: doc(英文文档目录)和 doc_cn(中文文档目录)。
-打开浏览器访问对应目录下的index.html即可访问本地文档。
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
 
+..  image:: src/doc_en.png
+    :align: center
+    :scale: 60 %
 
 如何书写文档
 ============
@@ -102,7 +127,7 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程
 如何更新www.paddlepaddle.org
 ============================
 
-更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
+更新的文档以PR的形式提交到github中，提交方式参见 `如何贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/dev/write_docs_cn.html>`_ 。
 目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
 `英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
 
diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst
index f3408a84269aaeef19986c220454555fbbe30e23..6105455e202e4704aa25f0fd9916b9b61a569702 100644
--- a/doc/v2/dev/write_docs_en.rst
+++ b/doc/v2/dev/write_docs_en.rst
@@ -2,21 +2,20 @@
 Contribute Documentation
 ########################
 
-PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
-Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
-When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
+PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results.
 
-How to Build Documentations
-============
+How to build Documentation
+===========================
 
-We recommend using PaddlePaddle.org tool to build documentation
+PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages. The former facilitates previewing, while the latter facilitates debugging by the developer. We could choose to build the documentation with Docker or without it in each of the above ways.
 
+We recommend using PaddlePaddle.org tool to build documentation.
 
-Use PaddlePaddle.org tool
---------------
-This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+Using PaddlePaddle.org tool
+-----------------------------
+This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style.
 
-The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
+The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool
 
 ..  code-block:: bash
 
@@ -32,8 +31,8 @@ The tool uses Docker, please install it on your system. Please check Docker offi
     # Please specify the working directory through -v
     docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
 
-Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run commands
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation.
 The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
 
 
@@ -58,19 +57,79 @@ If you don't wish to use Docker, you can also activate the tool through Django.
     pip install -r requirements.txt
     python manage.py runserver
 
-Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+Specify the PaddlePaddle working directory for the environment variable CONTENT_DIR so that the tool could find where the working directory is.
+
+Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation
 The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
 
-If you want to learn more on the PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ 。
+Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ for more information about the PaddlePaddle.org tool.
+
+
+Manually Building the Documentation
+-------------------------------------
+
+Build PaddlePaddle's documentation with Docker，you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. This method is quite similar to ` Build From Sources <http://paddlepaddle.org/docs/develop/documentation/en/build_and_install/build_from_source_en.html>`_ , by constructing, from source code, a docker image that can be used to build PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # Construct a docker image from source code
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # Use build.sh to build PaddlePaddle documentation
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+Note: The above commands maps the current directory (source root directory) to the :code:`/paddle` directory in the container.
+
+After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8000, you could see the compiled  ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page.
+
+If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
+
+.. code-block:: bash
+
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   mkdir -p build
+   cd build
+   cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+
+   # If you only need to build documents, use the following commands
+   make -j $processors paddle_docs
+
+   # If you only need to build APIs, use the following commands
+   make -j $processors paddle_apis
+
+$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
+
+After compiling, there also should be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html``  will be generated in both two directories. If you chose to build APIs，a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8000, you could see the compiled  ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page. The following figure is an example of the built ``v2`` 's English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
 
-How to write Documentations
-============
+..  image:: src/doc_en.png
+    :align: center
+    :scale: 60 %
 
-PaddlePaddle uses `sphinx`_ to compile documentations，Please check sphinx official website for more detail.
+How to write Documentation
+===========================
 
+PaddlePaddle uses `sphinx`_ to compile documentation，Please check sphinx official website for more detail.
 
 How to update www.paddlepaddle.org
-============================
+===================================
 
 Please create PRs and submit them to github, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
 PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst
index 7c7e896d187e4fe1544d7ec933fa4fa9f24df3cd..0d644777287aea0a572adb6fa40f498f9c147af7 100644
--- a/doc/v2/faq/build_and_install/index_cn.rst
+++ b/doc/v2/faq/build_and_install/index_cn.rst
@@ -139,3 +139,86 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二
     touch ../extern_mklml-stamp/extern_mklml-download
 
     // 4. 接着编译即可
+
+9. 在Mac上无法安装numpy等Python包，权限错误
+------------------
+
+Mac上对自带的Python和包有严格的权限保护，最好不要在自带的Python上安装。建议用virtualenv建立一个新的Python环境来操作。
+
+virtualenv的基本原理是将机器上的Python运行所需的运行环境完整地拷贝一份。我们可以在一台机器上制造多份拷贝，并在这多个拷贝之间自由切换，这样就相当于在一台机器上拥有了多个相互隔离、互不干扰的Python环境。
+
+下面简单介绍下如何用virtualenv为Paddle生成一个专用的Python环境：
+
+安装virtualenv：
+::::::::::::::::
+
+virtualenv本身也是Python的一个包，可以用pip进行安装：
+
+..  code-block:: bash
+
+    sudo -H pip install virtualenv
+
+由于virtualenv需要安装给系统自带的Python，因此需要使用sudo权限。
+
+创建一个新的Python运行环境：
+:::::::::::::::::::
+
+..  code-block:: bash
+
+    virtualenv --no-site-packages paddle
+
+--no-site-packages 参数表示不拷贝已有的任何第三方包，创造一个完全干净的新Python环境。后面的paddle是我们为这个新创建的环境取的名字。
+
+执行完这一步后，当前目录下应该会出现一个名为paddle（或者你取的其他名字）的目录。这个目录里保存了运行一个Python环境所需要的各种文件。
+
+启动运行环境：
+::::::::::::::::
+
+..  code-block:: bash
+
+    source paddle/bin/activate
+
+执行后会发现命令提示符前面增加了(paddle)字样，说明已经成功启动了名为‘paddle’的Python环境。执行which python，可以发现使用的已经是刚刚创建的paddle目录下的Python。
+
+在这个环境中，我们可以自由地进行Paddle的安装、使用和开发工作，无需担心对系统自带Python的影响。
+
+退出运行环境：
+:::::::::::::::
+
+直接执行：
+
+..  code-block:: bash
+
+    deactivate
+
+可以看到命令提示符前面的(paddle)字样消失。
+
+自动启动某一Python环境：
+::::::::::::::::
+
+如果我们经常使用Paddle，我们每次打开终端后都需要执行一下source paddle/bin/activate来启动环境，比较繁琐。为了简便，可以修改终端的配置文件，来让终端每次启动后自动启动特定的Python环境。
+
+执行:
+
+..  code-block:: bash
+
+    vi ~/.bash_profile
+
+打开终端配置文件，并在文件的最后添加一行：
+
+..  code-block:: bash
+
+    source paddle/bin/activate
+
+保存并关闭文件。
+
+这样，每次打开终端时就会自动启动名为‘paddle’的Python环境了。
+
+10. 通过pip安装的PaddlePaddle在  :code:`import paddle.fluid` 报找不到 :code:`libmkldnn.so` 或 :code:`libmklml_intel.so`
+------------------------------------------------------------------------------------------
+出现这种问题的原因是在导入 :code:`paddle.fluid` 时需要加载 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`，
+但是系统没有找到该文件。一般通过pip安装PaddlePaddle时会将 :code:`libmkldnn.so` 和 :code:`libmklml_intel.so`
+拷贝到 :code:`/usr/local/lib` 路径下，所以解决办法是将该路径加到 :code:`LD_LIBRARY_PATH` 环境变量下，
+即： :code:`export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH` 。
+
+**注意**：如果是在虚拟环境中安装PaddlePaddle， :code:`libmkldnn.so` 和 :code:`libmklml_intel.so` 可能不在 :code:`/usr/local/lib` 路径下。
\ No newline at end of file
diff --git a/doc/v2/faq/build_and_install/index_en.rst b/doc/v2/faq/build_and_install/index_en.rst
index 614db457d715665073cec1a495d4d7df6887532f..7488ed8137d57785f36b9f1e1ed1269f864960bc 100644
--- a/doc/v2/faq/build_and_install/index_en.rst
+++ b/doc/v2/faq/build_and_install/index_en.rst
@@ -1,5 +1,143 @@
-############################
-Install, Build and Unit test
-############################
+.. _install_faq:
 
-TBD
+###############################
+Compile, Install, and Unit Test
+###############################
+
+..  contents::
+
+1. Insufficient CUDA driver version
+----------------------------------------------------------------
+
+Many users usually face issues like `Cuda Error: CUDA driver version is insufficient for CUDA runtime version` when running the PaddlePaddle GPU Docker image. The cause is that you may not map the local CUDA driver to a container directory.
+You can solve the issue by running the following commands:
+
+..  code-block:: bash
+
+    $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+For more infomation about Docker's installation and usage, please refer to `PaddlePaddle Docker documentation <http://www.paddlepaddle.org/docs/0.11.0/documentation/zh/getstarted/build_and_install/docker_install_en.html>`_ .
+
+
+2. Version mismatch between PythonLibs and PythonInterpreter
+----------------------------------------------------------------
+
+It is a common bug when CMake looks up Python. If you install multiple versions of Python, Cmake may find the version mismatch between PythonLibs and PythonInterpreter . You are forced to specify a Python version, as follows.
+
+    ..  code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
+
+You should specify ``<exc_path>``, ``<lib_path>``, ``<inc_path>`` to your local paths.
+
+3. PaddlePaddle version is 0.0.0
+------------------------------------------------
+This issue would happen when you run the code  `paddle version` or `cmake ..`
+
+..  code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+
+You should pull all remote branches to your local machine with the command :code:`git fetch upstream` and then run :code:`cmake`
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+The primary cause for this issue is that it can not find the correct PaddlePaddle installation package that matches your current system.The latest PaddlePaddle Python installation package supports Linux x86_64 and MacOS 10.12 os including Python2.7 and Pip 9.0.1.
+
+You can upgrade Pip with the following command\:
+
+..  code-block:: bash
+
+    pip install --upgrade pip
+
+If it does not work for you, you can run the command :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` to get the suffix of Python package which your system may support and then compare it with the suffix of your installation.
+
+If the system supports :code:`linux_x86_64` and  the installation package is :code:`manylinux1_x86_64`, you should upgrade pip to the latest 
+
+if the system supports :code:`manylinux_x86_64` and the local installation package is :code:`linux1_x86_64`, you can rename the whl package to :code:`manylinux1_x86_64` and then try again.
+
+
+5. ImportError: No module named v2
+----------------------------------
+Please uninstall Paddle V1 if you have installed it before.
+
+..  code-block:: bash
+
+    pip uninstall py_paddle paddle
+
+Then install Python for PaddlePaddle , enter the build directory and run the following commands
+
+pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. Illegal instruction
+-----------------------
+This issue may be caused by the wrong usage of PaddlePaddle binary version which uses avx SIMD instructions to increase the performance of cpu. Please choose the correct version.
+
+7.  Python unittest fails
+--------------------------------
+
+If the following python unittest testcases fail:
+
+..  code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+Please check the PaddlePaddle unittest logs which may suggest the following:
+
+..  code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+The solution is:
+
+* Remove old PaddlePaddle to make a clean environment for the unit tests. If PaddlePaddle package is already in Python's site-packages, unit tests would refer Python package in site-packages instead of Python package in the :code:`/python` directory of the source directory.  Setting :code:`PYTHONPATH` to :code:`/python` is also useless because Python's search path would give the priority to the installed Python package.
+
+
+8. Failed to download the MKLML library
+----------------------------------------------
+
+..  code-block:: bash
+
+    make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] error 4
+    make[1]: *** [CMakeFiles/extern_mklml.dir/all] error 2
+    make[1]: *** waiting for the unfinished  jobs....
+
+Cause: The network speed or SSL link causes the MKLML library to download unsuccessfully.
+
+The solution is: manually download and install, the specific steps are as follows.
+
+..  code-block:: bash
+
+    // 1. enter the directory
+    cd build/third_party/mklml/src/extern_mklml
+
+    // 2. check the size of the package, normally 75M, if less than 75M, the download fails
+    du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+    // 3. manually download and unzip and make the download success tag:
+    wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz 
+    tar zxf mklml_lnx_2018.0.1.20171007.tgz
+    touch ../extern_mklml-stamp/extern_mklml-download
+
+    // 4. then compile
+    
diff --git a/doc/v2/faq/cluster/index_en.rst b/doc/v2/faq/cluster/index_en.rst
index 855b7e8e53307b82a72c156be4ef509e27edf822..fa942a09625bef78b28456beeb735272b686e061 100644
--- a/doc/v2/faq/cluster/index_en.rst
+++ b/doc/v2/faq/cluster/index_en.rst
@@ -2,4 +2,15 @@
 Cluster Training and Prediction
 ###############################
 
-TBD
+.. contents::
+
+1. Network connection errors in the log during multi-node cluster training
+------------------------------------------------
+There are maybe some errors in the log belonging to network connection problem during multi-node cluster training, for example, :code:`Connection reset by peer`.
+This kind of error is usually caused by the abnormal exit of a training process in some node, and the other nodes cannot connect with this node any longer. Steps to troubleshoot the problem are as follows:
+
+* Find the first error in the :code:`train.log`, :code:`server.log`, check whether other fault casued the problem, such as FPE, lacking of memory or disk.
+
+* If the first error in server.log says "Address already used", this may be caused by the port conflict of the non-exclusive execution. Connect the sys-admin to check if the current MPI cluster supports jobs submitted with parameter :code:`resource=full`. If the current MPI cluster does not support this parameter, change the server port and try agian.
+
+* If the current MPI cluster does not support exclusive pattern which allows a process to occupy the whole node, ask the administrator to replace or update the this cluster.
diff --git a/doc/v2/faq/index_en.rst b/doc/v2/faq/index_en.rst
index 57df868f760038b25fae30df7ab20a68875ad36a..3fa220792b252617848a1c76bc2be49928e35f64 100644
--- a/doc/v2/faq/index_en.rst
+++ b/doc/v2/faq/index_en.rst
@@ -1,7 +1,8 @@
 FAQ
 ====
 
- 
+This document provides answers to some of the frequently asked questions about PaddlePaddle. If you have a question that is not covered here, please go to `PaddlePaddle Community <https://github.com/PaddlePaddle/Paddle/issues>`_ , to find an answer or submit new `issue <https://github.com/PaddlePaddle/Paddle/issues/new>`_  , we will reply in time.
+
 ..  toctree::
   :maxdepth: 1
 
diff --git a/doc/v2/faq/local/index_en.rst b/doc/v2/faq/local/index_en.rst
index 4cb43031933a8bbe9aebae04bc3e9c74a6d21b95..fa95b1753dbe293811d7a8601497ad521fa3ecda 100644
--- a/doc/v2/faq/local/index_en.rst
+++ b/doc/v2/faq/local/index_en.rst
@@ -1,5 +1,248 @@
 #############################
-Local Training and Prediction
+Parameter Setting
 #############################
 
-TBD
+..  contents::
+
+1. Reduce Memory Consumption
+-------------------
+
+The training procedure of neural networks demands dozens of gigabytes of host memory or serval gigabytes of device memory, which is a rather memory consuming work. The memory consumed by PaddlePaddle framework mainly includes:
+\:
+
+* Cache memory for DataProvider (only on host memory),
+* Memory for neurons' activation information (on both host memory and device memory),
+* Memory for parameters (on both host memory and device memory),
+* Other memory demands.
+
+Other memory demands is mainly used to support the running demand of PaddlePaddle framework itself, such as string allocation，temporary variables, which are not considered currently.
+
+Reduce DataProvider Cache Memory
+++++++++++++++++++++++++++
+
+PyDataProvider works under asynchronous mechanism, it loads together with the data fetch and shuffle procedure in host memory:
+
+..  graphviz::
+
+    digraph {
+        rankdir=LR;
+        Data Files -> Host Memory Pool -> PaddlePaddle Training
+    }
+
+Thus the reduction of the DataProvider cache memory can reduce memory occupancy, meanwhile speed up the data loading procedure before training. However, the size of the memory pool can actually affect the granularity of shuffle，which means a shuffle operation is needed before each data ﬁle reading process to ensure the randomness of data when try to reduce the size of the memory pool.
+
+..  literalinclude:: src/reduce_min_pool_size.py
+
+In this way, the memory consumption can be significantly reduced and hence the training procedure can be accelerated. More details are demonstrated in :ref:`api_pydataprovider2`.
+
+The Neurons Activation Memory
+++++++++++++++
+
+Each neuron activation operating in a neural network training process contains certain amount of temporary data such as the activation data (like the output value of a neuron). These data will be used to update parameters in back propagation period. The scale of memory consumed by these data is mainly related with two parameters, which are batch size and the length of each Sequence. Therefore, the neurons activation memory consuming is actually in proportion to the information contains in each mini-batch training.
+
+Two practical ways:
+
+* Reduce batch size. Set a smaller value in network configuration settings(batch_size=1000) can be helpful. But setting batch size to a smaller value may affect the training result due to it is a super parameter of the neural network itself.
+* Shorten the sequence length or cut oﬀ those excessively long sequences. For example, if the length of sequences in a dataset are mostly varies between 100 and 200, but there is sequence lengthen out to 10,000, then it’s quite potentially leads to OOM (out of memory), especially in RNN models such as LSTM.
+
+The Parameters Memory
+++++++++
+
+The PaddlePaddle framework supports almost all popular optimizers. Different optimizers have different memory requirement. For example, the :code:`adadelta` consumes approximately 5 times memory
+
+space than the weights parameter’s scale, which means the :code:`adadelta` needs at least :code:`500M` memory if the model ﬁle contains all
+
+parameters needs :code:`100M`.
+
+Some optimization algorithms such as :code:`momentum` are worth giving a shot.
+
+2. Tricks To Speed Up Training
+-------------------
+
+The training procedure of PaddlePaddle may be speed up when considering following aspects:\：
+
+* Reduce the time consumption of data loading
+* Speed up training epochs
+* Introduce more computing resources with the utilization of distribute training frameworks
+
+Reduce The Time Consumption of Data Loading
+++++++++++++++++++
+
+
+The \ :code:`pydataprovider`\ holds big potential to speed up the data loading procedure if the cache pool and enable memory cache when use it. The principle of the reduction of :code:`DataProvider` cache pool is basically the same with the method which reduct the memory occupation with the set of a smaller cache pool.
+
+..  literalinclude:: src/reduce_min_pool_size.py
+
+Beside, the interface :code:`@provider` provides a parameter :code:`cache` to control cache. If set it to :code:`CacheType.CACHE_PASS_IN_MEM`, the data after the first :code:`pass` ( a pass means all data have be fed into the network for training) will be cached in memory and no new data will be read from the :code:`python` side in following :code:`pass` , instead from the cached data in memory. This strategy can also drop the time consuming in data loading process.
+
+
+Accelerating Training Epochs
+++++++++++++
+
+Sparse training is supported in PaddlePaddle. The features needs to be trained is any of :code:`sparse_binary_vector`, :code:`sparse_vector` and :code:`integer_value` . Meanwhile, the Layer interacts with the training data need to turn the Parameter to sparse updating mode by setting :code:`sparse_update=True`.
+Take :code:`word2vec` as an example, to train a language distance, one needs to predict the middle word with two words prior to it and next to it. The DataProvider of this task is:
+
+..  literalinclude:: src/word2vec_dataprovider.py
+
+The configuration of this task is:
+
+..  literalinclude:: src/word2vec_config.py
+
+Introduce More Computing Resources
+++++++++++++++++++
+
+More computing resources can be introduced with following manners:
+* Single CPU platform training
+
+  * Use multi-threading by set :code:`trainer_count`。
+
+* Single GPU platform training
+
+  * Set :code:`use_gpu` to train on single GPU.
+  * Set :code:`use_gpu` and :code:`trainer_count` to enable multiple GPU training support.
+
+* Cluster Training
+
+  * Refer to :ref:`cluster_train` 。
+
+3. Assign GPU Devices
+------------------
+
+Assume a computing platform consists of 4 GPUs which serial number from 0 to 3:
+
+* Method1: specify a GPU as computing device by set:
+ `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_
+
+..      code-block:: bash
+
+        env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
+
+* Method2: Assign by —gpu_id:
+
+..      code-block:: bash
+
+        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+
+4. How to Fix Training Termination Caused By :code:`Floating point exception` During Training.
+------------------------------------------------------------------------
+
+Paddle binary catches floating exceptions during runtime, it will be terminated when NaN or Inf occurs. Floating exceptions are mostly caused by float overflow, divide by zero. There are three main reasons may raise such exception:
+
+* Parameters or gradients during training are oversize, which leads to float overflow during calculation.
+* The model failed to converge and diverges to a big value.
+* Parameters may converge to a singular value due to bad training data. If the scale of input data is too big and contains millions of parameter values, float overflow error may arise when operating matrix multiplication.
+
+Two ways to solve this problem:
+
+1. Set :code:`gradient_clipping_threshold` as:
+
+..  code-block:: python
+
+    optimizer = paddle.optimizer.RMSProp(
+        learning_rate=1e-3,
+        gradient_clipping_threshold=10.0,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+Details can refer to example `nmt_without_attention  <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_ 示例。
+
+2. Set :code:`error_clipping_threshold` as:
+
+..  code-block:: python
+
+    decoder_inputs = paddle.layer.fc(
+        act=paddle.activation.Linear(),
+        size=decoder_size * 3,
+        bias_attr=False,
+        input=[context, current_word],
+        layer_attr=paddle.attr.ExtraLayerAttribute(
+            error_clipping_threshold=100.0))
+
+Details can refer to example `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_ 。
+
+The main difference between these two methods are:
+
+1. They both block the gradient, but happen in different occasions，the former one happens when then :code:`optimzier` updates the network parameters while the latter happens when the back propagation computing of activation functions.
+2. The block target are different, the former blocks the trainable parameters’ gradient while the later blocks the gradient to be propagated to prior layers.
+
+Moreover, Such problems may be fixed with smaller learning rates or data normalization.
+
+5.  Fetch Multi Layers’ Prediction Result With Infer Interface
+-----------------------------------------------
+
+* Join the layer to be used as :code:`output_layer` layer to the input parameters of  :code:`paddle.inference.Inference()` interface with:
+
+..  code-block:: python
+
+    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
+
+* Assign certain ﬁelds to output. Take :code:`value` as example, it can be down with following code:
+
+..  code-block:: python
+
+    out = inferer.infer(input=data_batch, field=["value"])
+
+It is important to note that:
+
+* If 2 layers are assigned as output layer, then the output results consists of 2 matrixes.
+* Assume the output of first layer A is a matrix sizes N1 * M1, the output of second layer B is a matrix sizes N2 * M2；
+* By default, paddle.v2 will transverse join A and B, when N1 not equal to N2, it will raise following error:
+
+..      code-block:: python
+
+    ValueError: all the input array dimensions except for the concatenation axis must match exactly
+
+The transverse of diﬀerent matrixes of multi layers mainly happens when:
+
+* Output sequence layer and non sequence layer;
+* Multiple output layers process multiple sequence with different length;
+
+Such issue can be avoided by calling infer interface and set :code:`flatten_result=False`. Thus, the infer interface returns a python list, in which
+
+* The number of elements equals to the number of output layers in the network;
+* Each element in list is a result matrix of a layer, which type is numpy.ndarray;
+* The height of each matrix outputted by each layer equals to the number of samples under non sequential mode or equals to the number of elements in the input sequence under sequential mode. Their width are both equal to the layer size in configuration.
+
+6.  Fetch the Output of A Certain Layer During Training
+-----------------------------------------------
+
+In event_handler, the interface :code:`event.gm.getLayerOutputs("layer_name")` gives the forward output value organized in :code:`numpy.ndarray` corresponding to :code:`layer_name` in the mini-batch.
+The output can be used in custom measurements in following way:
+
+..      code-block:: python
+
+        def score_diff(right_score, left_score):
+            return np.average(np.abs(right_score - left_score))
+
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 25 == 0:
+                    diff = score_diff(
+                        event.gm.getLayerOutputs("right_score")["right_score"][
+                            "value"],
+                        event.gm.getLayerOutputs("left_score")["left_score"][
+                            "value"])
+                    logger.info(("Pass %d Batch %d : Cost %.6f, "
+                                "average absolute diff scores: %.6f") %
+                                (event.pass_id, event.batch_id, event.cost, diff))
+
+Note: this function can not get content of :code:`paddle.layer.recurrent_group` step, but output of  :code:`paddle.layer.recurrent_group` can be fetched.
+
+7.  Fetch Parameters’ Weight and Gradient During Training
+-----------------------------------------------
+
+Under certain situations, knowing the weights of currently training mini-batch can provide more inceptions of many problems. Their value can be acquired by printing values in :code:`event_handler` (note that to gain such parameters when training on GPU, you should set :code:`paddle.event.EndForwardBackward`). Detailed code is as following:
+
+..      code-block:: python
+
+        ...
+        parameters = paddle.parameters.create(cost)
+        ...
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndForwardBackward):
+                if event.batch_id % 25 == 0:
+                    for p in parameters.keys():
+                        logger.info("Param %s, Grad %s",
+                            parameters.get(p), parameters.get_grad(p))
+
+Note that “acquire the output of a certain layer during training” or “acquire the weights and gradients of parameters during training ” both needs to copy training data from C++ environment to numpy, which have certain degree of inﬂuence on training performance. Don’t use these two functions when the training procedure cares about the performance.
diff --git a/doc/v2/faq/model/index_en.rst b/doc/v2/faq/model/index_en.rst
index cb26f59655f97dc28a2047994643ae16b8857964..67a33e08e192e5627ac3b0abd76e979f21ed2079 100644
--- a/doc/v2/faq/model/index_en.rst
+++ b/doc/v2/faq/model/index_en.rst
@@ -2,4 +2,80 @@
 Model Configuration
 ###################
 
-TBD
+..  contents::
+
+1. How to deal with error :code:`Duplicated layer name`
+----------------------------------------------------------
+
+The general reason for this error is that users may have set the same value for the attribute :code:`name` in different layers. Try to find out the :code:`name` attribute with the same value in diffrent layers and set them differently.
+
+2. How to use :code:`paddle.layer.memory`'s attribute :code:`name`
+----------------------------------------------------------------------
+
+* :code:`paddle.layer.memory` is used to get the output of a layer's last timestep and the layer is specified by the attribute :code:`name` . Thus,  :code:`paddle.layer.memory` will associate with the layer that has the same value of attribute :code:`name` , and uses the output of the layer's last timestep as the input of its current timestep.
+
+* All the PaddlePaddle's layers have a unique name, which is set by the attribute :code:`name` . PaddlePaddle will automatically set it for the user when it is not explicitly set. :code:`paddle.layer.memory` is not a real layer, its name is set by the attribute :code:`memory_name`  and PaddlePaddle will also automatically set it when the user does not explicitly set. The :code:`paddle.layer.memory` attribute :code:`name` is used to specify the layer it is associated with, and needs to be explicitly set by the user.
+
+
+3. What is the difference between the two ways of using dropout
+-----------------------------------------------------------------
+
+* There are two ways to use dropout in PaddlePaddle
+
+  * Set the :code:`drop_rate` parameter in the layer's :code:`layer_atter` attribute. Take :code:`paddle.layer.fc` as an example:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+
+  * Use :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input)
+      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+
+* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` as the previous method. This method is very memory intensive.
+
+* PaddlePaddle implements dropout in the activation function rather than in the layer.
+
+* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory`, :code:`paddle.layer.recurrent` implement activation of output in an unusual way, so we cannot use dropout by setting :code:`drop_rate` . To use dropout for these layers, we could use the second method, which is to use :code:`paddle.layer.dropout`.
+
+4. The differences between different recurrent layers
+--------------------------------------------------------
+Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle:
+
+* :code:`paddle.layer.lstmemory`
+* :code:`paddle.networks.simple_lstm`
+* :code:`paddle.networks.lstmemory_group`
+* :code:`paddle.networks.bidirectional_lstm`
+
+According to implementations, recurrent layer can be classified into 2 types:
+
+1. Recurrent layer implemented by recurrent_group:
+
+  * Using this type of recurrent layers, users can access the intermediate value calculated by the recurrent unit within a timestep (eg: hidden states, memory cells, etc.)
+  * :code:`paddle.networks.lstmemory_group` belongs to this type of recurrent layers.
+
+2. Recurrent layer implemented as a complete operation：
+
+  * Users can only access output values when using this type of recurrent layers.
+  * :code:`paddle.networks.lstmemory_group` , :code:`paddle.networks.simple_lstm` and  :code:`paddle.networks.bidirectional_lstm` belong to this type of recurrent layer；
+
+By implementing recurrent layer as a complete operation, CPU and GPU calculations can be optimized. Therefore, the second type of recurrent layer is more efficient than the first one. In practical applications, we propose to use the second type of recurrent layers if there is no need to access the intermediate variable of LSTM.
+
+In addition, PaddlePaddle also contains a kind of LSTM calculation unit: :code:`paddle.networks.lstmemory_unit`:
+
+  * Unlike the recurrent layer described above, :code:`paddle.networks.lstmemory_unit` defines the computational process of an LSTM unit in a timestep. It is not a complete recurrent layer, nor can it receive sequence data as input.
+  * :code:`paddle.networks.lstmemory_unit` can only be used as a step function in recurrent_group.
+
+5. Can Softmax's calculation dimension be specified？
+--------------------------------------------------------------------
+
+We can't specify calculation dimension for PaddlePaddle's softmax. It can only be calculated by rows.
+In image tasks, for NCHW, if you need to calculate softmax in C dimension, you could use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then do the reshape operation and calculate softmax.
+
+6. Does PaddlePaddle support variable-dimensional data inputs
+----------------------------------------------------------------
+
+PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than the dimension of the input data for occupancy.
diff --git a/doc/v2/faq/parameter/index_cn.rst b/doc/v2/faq/parameter/index_cn.rst
index 1fa4b3e1311d2007ccba98fde9ff94300ea42c16..987e8cf088be4ee8daa7c28fdc855506cbfd31c7 100644
--- a/doc/v2/faq/parameter/index_cn.rst
+++ b/doc/v2/faq/parameter/index_cn.rst
@@ -196,6 +196,6 @@ PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数
         obj="process",
         args={"src_dict_path": src_dict_path})
 
-完整源码可参考 `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ 示例。
+完整源码可参考 `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_recurrent.py>`_ 示例。
 
 
diff --git a/doc/v2/getstarted/index_en.rst b/doc/v2/getstarted/index_en.rst
index 33f299be5680e0aa4a3f36638f51135503193d94..94b306895c9ddf6140cf600131930a6675a583eb 100644
--- a/doc/v2/getstarted/index_en.rst
+++ b/doc/v2/getstarted/index_en.rst
@@ -1,8 +1,19 @@
 GET STARTED
 ============
 
+If you want to quickly know how to use PaddlePaddle, please refer to the following guide:
+
 ..  toctree::
   :maxdepth: 1
 
   quickstart_en.rst
+  
+  
+While using PaddlePaddle to build applications, please understand some basic concepts.
+
+Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc.
+  
+..  toctree::
+  :maxdepth: 1
+  
   concepts/use_concepts_en.rst
diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md
index e223fd33a8420abcdfdad53d1cfc5ed160a1b37e..2c87e9afc6911526cd51d6c691f262960accc9e8 100644
--- a/doc/v2/howto/capi/compile_paddle_lib_cn.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md
@@ -18,7 +18,7 @@
 </tr>
 <tr>
 <td>cpu_avx_openblas</td>
-<td>暂无</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
 </tr>
 <tr>
 <td>cpu_noavx_openblas</td>
@@ -35,7 +35,12 @@
 <tr>
 <td>cuda8.0_cudnn7_avx_mkl</td>
 <td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
-</tr></tbody></table>
+</tr>
+<tr>
+<td>cuda9.0_cudnn7_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+</tr>
+</tbody></table>
 
 ### 从源码编译
 
diff --git a/doc/v2/howto/capi/compile_paddle_lib_en.md b/doc/v2/howto/capi/compile_paddle_lib_en.md
index 11d69b9b79c1a41898d3060d3fe25a31330334a3..3fa8a18a9fbea21b494c416e6b938990fbb68337 100644
--- a/doc/v2/howto/capi/compile_paddle_lib_en.md
+++ b/doc/v2/howto/capi/compile_paddle_lib_en.md
@@ -1,3 +1,180 @@
 ## Install and Build
 
-TBD
+### Download & Install 
+
+  Download the latest C-API development package from CI system and install. You can find the required version in the table below:
+<table>
+<thead>
+<tr>
+<th>Version Tips</th>
+<th>C-API</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>cpu_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cpu_avx_openblas</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cpu_noavx_openblas</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda7.5_cudnn5_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda8.0_cudnn5_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda8.0_cudnn7_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+</tr>
+<tr>
+<td>cuda9.0_cudnn7_avx_mkl</td>
+<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda90cudnn7avxMkl/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
+</tr>
+</tbody></table>
+
+### From source
+
+  Users can also compile the C-API library from PaddlePaddle source code by compiling with the following compilation options:
+  
+<table>
+<thead>
+<tr>
+<th>Options</th>
+<th>Value</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>WITH_C_API</td>
+<td>ON</td>
+</tr>
+<tr>
+<td>WITH_PYTHON</td>
+<td>OFF（recommended）</td>
+</tr>
+<tr>
+<td>WITH_SWIG_PY</td>
+<td>OFF（recommended）</td>
+</tr>
+<tr>
+<td>WITH_GOLANG</td>
+<td>OFF（recommended）</td>
+</tr>
+<tr>
+<td>WITH_GPU</td>
+<td>ON/OFF</td>
+</tr>
+<tr>
+<td>WITH_MKL</td>
+<td>ON/OFF</td>
+</tr></tbody></table>
+
+It is best to set up with recommended values to avoid linking with unnecessary libraries. Set other compilation options as you need.
+
+Pull the latest following code snippet from github, and configure compilation options(replace PADDLE_ROOT with the installation path of the PaddlePaddle C-API inference library):
+
+```shell
+PADDLE_ROOT=/path/of/capi
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_GOLANG=OFF \
+      -DWITH_PYTHON=OFF \
+      -DWITH_MKL=OFF \
+      -DWITH_GPU=OFF  \
+      ..
+```
+
+After running the above code to generate Makefile , run: `make && make install`.  After successful compilation, the dependencies required by C-API(includes: (1)PaddlePaddle inference library and header files; (2) Third-party libraries and header files) will be stored in the `PADDLE_ROOT` directory.
+
+If the compilation is successful, see the following directory structure under `PADDLE_ROOT`(includes PaddlePaddle header files and libraries, and third-party libraries and header files(determined by the link methods if necessary)):
+
+```text
+├── include
+│   └── paddle
+│       ├── arguments.h
+│       ├── capi.h
+│       ├── capi_private.h
+│       ├── config.h
+│       ├── error.h
+│       ├── gradient_machine.h
+│       ├── main.h
+│       ├── matrix.h
+│       ├── paddle_capi.map
+│       └── vector.h
+├── lib
+│   ├── libpaddle_capi_engine.a
+│   ├── libpaddle_capi_layers.a
+│   ├── libpaddle_capi_shared.so
+│   └── libpaddle_capi_whole.a
+└── third_party
+    ├── gflags
+    │   ├── include
+    │   │   └── gflags
+    │   │       ├── gflags_completions.h
+    │   │       ├── gflags_declare.h
+    │   │       ...
+    │   └── lib
+    │       └── libgflags.a
+    ├── glog
+    │   ├── include
+    │   │   └── glog
+    │   │       ├── config.h
+    │   │       ...
+    │   └── lib
+    │       └── libglog.a
+    ├── openblas
+    │   ├── include
+    │   │   ├── cblas.h
+    │   │   ...
+    │   └── lib
+    │       ...
+    ├── protobuf
+    │   ├── include
+    │   │   └── google
+    │   │       └── protobuf
+    │   │           ...
+    │   └── lib
+    │       └── libprotobuf-lite.a
+    └── zlib
+        ├── include
+        │   ...
+        └── lib
+            ...
+
+```
+
+### Linking Description:
+
+There are three kinds of linking methods:
+
+1. Linking with dynamic library `libpaddle_capi_shared.so`（This way is much more convenient and easier, **Without special requirements, it is recommended**）, refer to the following：
+    1. Compiling with CPU version and using `OpenBLAS`; only need to link one library named `libpaddle_capi_shared.so` to develop prediction program through C-API.
+    1. Compiling with CPU version and using `MKL` lib, you need to link MKL library directly to develop prediction program through PaddlePaddle C-API, due to `MKL` has its own dynamic library.
+    1. Compiling with GPU version, CUDA library will be loaded dynamically on prediction program run-time, and also set CUDA library to  `LD_LIBRARY_PATH` environment variable.
+
+2. Linking with static library `libpaddle_capi_whole.a`，refer to the following：
+    1. Specify `-Wl,--whole-archive` linking options.
+    1. Explicitly link third-party libraries such as `gflags`、`glog`、`libz`、`protobuf` .etc, you can find them under `PADDLE_ROOT/third_party` directory.
+    1. Use OpenBLAS library if compiling C-API，must explicitly link `libopenblas.a`.
+    1. Use MKL when compiling C-API, must explicitly link MKL dynamic library.
+
+3. Linking with static library `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`，refer to the following：
+    1. This linking methods is mainly used for mobile prediction.
+    1. Split `libpaddle_capi_whole.a` into two static linking library at least to reduce the size of linking libraries.
+    1. Specify `-Wl,--whole-archive -lpaddle_capi_layers`  and  `-Wl,--no-whole-archive -lpaddle_capi_engine` for linking.
+    1. The third-party dependencies need explicitly link same as method 2 above. 
diff --git a/doc/v2/howto/capi/index_en.rst b/doc/v2/howto/capi/index_en.rst
index 2cbbe362fd8e06abe9866d998f60fbb3458a80b5..4ec39c9d5223442cf6872edaf7befeb5053b538e 100644
--- a/doc/v2/howto/capi/index_en.rst
+++ b/doc/v2/howto/capi/index_en.rst
@@ -1,6 +1,23 @@
-C-API Prediction Library
+C-API Inference Library
 ========================
 
+After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result.
+
+Compared with model training, prediction has the following features:
+
+#. Inference does not require backpropagation and parameter updates, as required during training.
+#. Labels are not needed in prediction.
+#. Most of the time, predictions need to be integrated with the user system.
+
+Therefore, the model prediction SDK needs to be designed separately and has the following features:
+
+#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK.
+#. The predictive SDK needs a simple user interface for ease of use.
+#. Since the input data may have a variety of structures, the format of the input data is clearly and compactly packaged.
+#. In order to be compatible with user's system, the SDK's interface must conform to the C-standard interface.
+
+PaddlePaddle provides C-API to solve the above problem. Following are the guidelines to use the C-API:
+
 ..  toctree::
   :maxdepth: 1
 
diff --git a/doc/v2/howto/capi/workflow_of_capi_cn.md b/doc/v2/howto/capi/workflow_of_capi_cn.md
index 1968c1099ac5734cd68b437f2f7aa428d7b5265e..3acdbae28e9b35f8a9104a89c9a5799f8c892334 100644
--- a/doc/v2/howto/capi/workflow_of_capi_cn.md
+++ b/doc/v2/howto/capi/workflow_of_capi_cn.md
@@ -59,7 +59,7 @@
     代码示例如下：
 
     ```python
-    from paddle.utils.merge_model import merge_v2_modelss
+    from paddle.utils.merge_model import merge_v2_model
     from mnist_v2 import network
 
     net = network(is_infer=True)
diff --git a/doc/v2/howto/cluster/index_en.rst b/doc/v2/howto/cluster/index_en.rst
index 2640a09dcc904619bc97c9bd3f3d81a9dc307663..31eda57c4fb3947d92df45ea8dbb9274c9814140 100644
--- a/doc/v2/howto/cluster/index_en.rst
+++ b/doc/v2/howto/cluster/index_en.rst
@@ -1,7 +1,9 @@
 Distributed Training
 ====================
 
-In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
+The effectiveness of the deep learning model is often directly related to the scale of the data: it can generally achieve better results after increasing the size of the dataset on the same model. However, it can not fit in one single computer when the amount of data increases to a certain extent. At this point, using multiple computers for distributed training is a natural solution. In distributed training, the training data is divided into multiple copies (sharding), and multiple machines participating in the training read their own data for training and collaboratively update the parameters of the overall model.
+
+Distributed training generally has framwork as shown below:
 
 .. image:: src/ps_en.png
    :width: 500
@@ -10,13 +12,27 @@ In this section, we'll explain how to run distributed training jobs with PaddleP
 - Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
 - Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
 
-PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
+The training of synchronous random gradient descent for neural network can be achieved by cooperation of trainers and parameter servers.
+
+PaddlePaddle supports both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
 
-When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
+Before starting the cluster training, you need to prepare the cluster configuration, PaddlePaddle installation, and other preparations. To understand how to configure the basic environment for distributed training, check the link below:
 
 ..  toctree::
   :maxdepth: 1
 
   preparations_en.md
+
+Cluster training has a large number of configurable parameters, such as the number of machines used, communication ports, etc. To learn how to configure the distributed training process by setting startup these parameters, check the link below:
+
+..  toctree::
+  :maxdepth: 1
+
   cmd_argument_en.md
+
+PaddlePaddle is compatible with a variety of different clusters. Each cluster has its own advantages, To learn how to run PaddlePaddle in different types of them, check the link below:
+
+..  toctree::
+  :maxdepth: 1
+
   multi_cluster/index_en.rst
diff --git a/doc/v2/howto/cluster/multi_cluster/index_en.rst b/doc/v2/howto/cluster/multi_cluster/index_en.rst
index dac7aaef085c80851c1bbb89250faf2151de4ca6..9bc1eb2e3796d95dd69b165e916e263ea34b87f6 100644
--- a/doc/v2/howto/cluster/multi_cluster/index_en.rst
+++ b/doc/v2/howto/cluster/multi_cluster/index_en.rst
@@ -1,19 +1,35 @@
 Use different clusters
 ======================
 
-PaddlePaddle supports running jobs on several platforms including:
-- `Kubernetes <http://kubernetes.io>`_ open-source system for automating deployment, scaling, and management of containerized applications from Google.
-- `OpenMPI <https://www.open-mpi.org>`_ Mature high performance parallel computing framework.
-- `Fabric <http://www.fabfile.org>`_ A cluster management tool. Write scripts to submit jobs or manage the cluster.
+The user's cluster environment is not the same. To facilitate everyone's deployment, we provide a variety of cluster deployment methods to facilitate the submission of cluster training tasks, which will be introduced as follows:
 
-We'll introduce cluster job management on these platforms. The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
+`Kubernetes <http://kubernetes.io>`_ is a scheduling framework of Google open source container cluster, supporting a complete cluster solution for large-scale cluster production environment. The following guidelines show PaddlePaddle's support for Kubernetes:
 
-These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
+..  toctree::
+  :maxdepth: 1
+
+  k8s_en.md
+  k8s_distributed_en.md
+
+`OpenMPI <https://www.open-mpi.org>`_ is a mature high-performance parallel computing framework, which is widely used in the field of HPC. The following guide describes how to use OpenMPI to build PaddlePaddle's cluster training task:
 
 ..  toctree::
   :maxdepth: 1
 
-  fabric_en.md
   openmpi_en.md
-  k8s_en.md
+
+`Fabric <http://www.fabfile.org>`_ is a convenient tool for program deployment and management. We provide a way to deploy and manage with Fabric. If you want to know more about it, please read the following guidelines:
+
+..  toctree::
+  :maxdepth: 1
+
+  fabric_en.md
+
+We also support the deployment of PaddlePaddle on AWS. Learn more about:
+
+..  toctree::
+  :maxdepth: 1
+
   k8s_aws_en.md
+
+The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
deleted file mode 120000
index c44cd9a731bed7067cdf19aa2f714abdce6c736a..0000000000000000000000000000000000000000
--- a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
+++ /dev/null
@@ -1 +0,0 @@
-k8s_aws_en.md
\ No newline at end of file
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..afc753aa42f19631c49a451a797f28365e65ed1d
--- /dev/null
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_aws_cn.md
@@ -0,0 +1,672 @@
+# Kubernetes on AWS
+
+我们将向你展示怎么样在AWS的Kubernetes集群上运行分布式PaddlePaddle训练，让我们从核心概念开始
+
+## PaddlePaddle分布式训练的核心概念
+
+### 分布式训练任务
+
+一个分布式训练任务可以看做是一个Kubernetes任务
+每一个Kubernetes任务都有相应的配置文件，此配置文件指定了像任务的pod个数之类的环境变量信息
+
+在分布式训练任务中，我们可以如下操作：
+
+1. 在分布式文件系统中，准备分块数据和配置文件（在此次教学中，我们会用到亚马逊分布式存储服务（EFS））
+2. 创建和提交一个kubernetes任务配置到集群中开始训练
+
+### Parameter Server和Trainer
+
+在paddlepaddle集群中有两个角色：参数服务器（pserver）者和trainer， 每一个参数服务器过程都会保存一部分模型的参数。每一个trainer都保存一份完整的模型参数，并可以利用本地数据更新模型。在这个训练过程中，trainer发送模型更新到参数服务器中，参数服务器职责就是聚合这些更新，以便于trainer可以把全局模型同步到本地。
+
+为了能够和pserver通信，trainer需要每一个pserver的IP地址。在Kubernetes中利用服务发现机制（比如：DNS、hostname）要比静态的IP地址要好一些，因为任何一个pod都会被杀掉然后新的pod被重启到另一个不同IP地址的node上。现在我们可以先用静态的IP地址方式，这种方式是可以更改的。
+
+参数服务器和trainer一块被打包成一个docker镜像，这个镜像会运行在被Kubernetes集群调度的pod中。
+
+### 训练者ID
+
+每一个训练过程都需要一个训练ID，以0作为基础值，作为命令行参数传递。训练过程因此用这个ID去读取数据分片。
+
+### 训练
+
+PaddlePaddle容器的入口是一个shell脚本，这个脚本可以读取Kubernetes内预置的环境变量。这里可以定义任务identity，在任务中identity可以用来远程访问包含所有pod的Kubernetes apiserver服务。
+
+每一个pod通过ip来排序。每一个pod的序列作为“pod id”。因为我们会在每一个pod中运行训练和参数服务，可以用“pod id”作为训练ID。入口脚本详细工作流程如下：
+
+1. 查找apiserver得到pod信息，通过ip排序来分配一个trainer_id。
+2. 从EFS持久化卷中复制训练数据到容器中。
+3. 从环境变量中解析paddle pserver和 paddle trainer的启动参数，然后开始启动流程。
+4. 以trainer_id来训练将自动把结果写入到EFS卷中。
+
+
+## AWS的Kubernetes中的PaddlePaddle
+
+### 选择AWS服务区域
+这个教程需要多个AWS服务工作在一个区域中。在AWS创建任何东西之前，请检查链接https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ 选择一个可以提供如下服务的区域：EC2, EFS, VPS, CloudFormation, KMS, VPC, S3。在教程中我们使用“Oregon(us-west-2)”作为例子。
+
+### 创建aws账户和IAM账户
+
+在每一个aws账户下可以创建多个IAM用户。允许为每一个IAM用户赋予权限，作为IAM用户可以创建/操作aws集群
+
+注册aws账户，请遵循用户指南。在AWS账户下创建IAM用户和用户组，请遵循用户指南
+
+请注意此教程需要如下的IAM用户权限：
+
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+- AWSKeyManagementServicePowerUser
+
+
+### 下载kube-aws and kubectl
+
+#### kube-aws
+
+在AWS中[kube-aws](https://github.com/coreos/kube-aws)是一个自动部署集群的CLI工具
+
+##### kube-aws完整性验证
+提示：如果你用的是非官方版本（e.g RC release）的kube-aws，可以跳过这一步骤。引入coreos的应用程序签名公钥:
+
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+
+指纹验证：
+
+```
+gpg2 --fingerprint FC8A365E
+```
+正确的指纹是： `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+
+我们可以从发布页面中下载kube-aws，教程使用0.9.1版本 [release page](https://github.com/coreos/kube-aws/releases).
+
+验证tar包的GPG签名：
+
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+##### 安装kube-aws
+解压:
+
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+
+添加到环境变量:
+
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+
+
+#### kubectl
+
+[kubectl](https://Kubernetes.io/docs/user-guide/kubectl-overview/) 是一个操作Kubernetes集群的命令行接口
+
+利用`curl`工具从Kubernetes发布页面中下载`kubectl`
+
+```
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
+
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
+```
+
+为了能是kubectl运行必须将之添加到环境变量中 (e.g. `/usr/local/bin`):
+
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
+
+### 配置AWS证书
+
+首先检查这里 [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) 安装AWS命令行工具
+
+然后配置aws账户信息:
+
+```
+aws configure
+```
+
+
+添加如下信息:
+
+
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+```
+
+`YOUR_ACCESS_KEY_ID`, and `YOUR_SECRETE_ACCESS_KEY` 是创建aws账户和IAM账户的IAM的key和密码 [Create AWS Account and IAM Account](#create-aws-account-and-iam-account)
+
+描述任何运行在你账户中的实例来验证凭据是否工作:
+
+```
+aws ec2 describe-instances
+```
+
+### 定义集群参数
+
+#### EC2秘钥对
+
+秘钥对将认证ssh访问你的EC2实例。秘钥对的公钥部分将配置到每一个COREOS节点中。
+
+遵循 [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) Keypair用户指南来创建EC2秘钥对
+
+你可以使用创建好的秘钥对名称来配置集群.
+
+在同一工作区中秘钥对为EC2实例唯一码。在教程中使用 us-west-2 ，所以请确认在这个区域（Oregon）中创建秘钥对。
+
+在浏览器中下载一个`key-name.pem`文件用来访问EC2实例，我们待会会用到.
+
+
+#### KMS秘钥
+
+亚马逊的KMS秘钥在TLS秘钥管理服务中用来加密和解密集群。如果你已经有可用的KMS秘钥，你可以跳过创建新秘钥这一步，提供现存秘钥的ARN字符串。
+
+利用aws命令行创建kms秘钥:
+
+```
+aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+
+我们稍后用到`Arn` 的值.
+
+在IAM用户许可中添加多个内联策略.
+
+进入[IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home)。点击`Users`按钮，点击刚才创建的用户，然后点击`Add inline policy`按钮，选择`Custom Policy`
+
+粘贴内联策略:
+
+```
+ (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node){
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
+            ]
+        },
+		{
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+`Version` : 值必须是"2012-10-17".
+`AWS_ACCOUNT_ID`: 你可以从命令行中获取:
+
+```
+aws sts get-caller-identity --output text --query Account
+```
+
+`MY_CLUSTER_NAME`: 选择一个你喜欢的MY_CLUSTER_NAME，稍后会用到。
+请注意，堆栈名称必须是正则表达式：[a-zA-Z][-a-zA-Z0-9*]*， 在名称中不能有"_"或者"-"，否则kube-aws在下面步骤中会抛出异常
+
+#### 外部DNS名称
+
+当集群被创建后，基于DNS名称控制器将会暴露安全的TLS API.
+
+DNS名称含有CNAME指向到集群DNS名称或者记录指向集群的IP地址。
+
+我们稍后会用到DNS名称，如果没有DNS名称的话，你可以选择一个（比如：`paddle`）还可以修改`/etc/hosts`用本机的DNS名称和集群IP关联。还可以在AWS上增加一个名称服务来关联paddle集群IP，稍后步骤中会查找集群IP.
+
+#### S3 bucket
+
+在启动Kubernetes集群前需要创建一个S3 bucket
+
+在AWS上创建s3 bucket会有许多的bugs，所以使用[s3 console](https://console.aws.amazon.com/s3/home?region=us-west-2)。
+
+链接到 `Create Bucket`，确保在us-west-2 (Oregon)上创建一个唯一的BUCKET_NAME。
+
+#### 初始化assets
+
+在本机创建一个目录用来存放产生的assets:
+
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+
+利用KMS Arn、秘钥对名称和前一步产生的DNS名称来初始化集群的CloudFormation栈:
+
+```
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
+--region=us-west-2 \
+--availability-zone=us-west-2a \
+--key-name=KEY_PAIR_NAME \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
+
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
+
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
+
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
+
+这里的`us-west-2a`用于参数`--availability-zone`，但必须在AWS账户的有效可用区中
+
+如果不能切换到其他的有效可用区（e.g., `us-west-2a`, or `us-west-2b`），请检查`us-west-2a`是支持`aws ec2 --region us-west-2 describe-availability-zones`。
+
+现在在asset目录中就有了集群的主配置文件cluster.yaml。
+
+默认情况下kube-aws会创建一个工作节点，修改`cluster.yaml`让`workerCount`从1个节点变成3个节点.
+
+#### 呈现asset目录内容
+
+在这个简单的例子中，你可以使用kuber-aws生成TLS身份和证书
+
+```
+kube-aws render credentials --generate-ca
+```
+
+下一步在asset目录中生成一组集群assets.
+
+```
+kube-aws render stack
+```
+asserts(模板和凭证)用于创建、更新和当前目录被创建的Kubernetes集群相关联
+
+### 启动Kubernetes集群
+
+#### 创建一个在CloudFormation模板上定义好的实例
+
+现在让我们创建集群（在命令行中选择任意的 `PREFIX`）
+
+```
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
+```
+
+`BUCKET_NAME`: t在[S3 bucket](#s3-bucket)上使用的bucket名称
+
+
+#### 配置DNS
+
+你可以执行命令 `kube-aws status`来查看创建后集群的API.
+
+```
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+```
+如果你用DNS名称，在ip上设置任何记录或是安装CNAME点到`Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`)
+
+##### 查询IP地址
+
+用命令`dig`去检查负载均衡器的域名来获取ip地址.
+
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A
+
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
+
+在上面的例子中，`54.241.164.52`, `54.67.102.112`这两个ip都将是工作状态
+
+*如果你有DNS名称*，设置记录到ip上，然后你可以跳过“Access the cluster”这一步
+
+*如果没有自己的DNS名称*
+
+编辑/etc/hosts文件用DNS关联IP
+
+##### 更新本地的DNS关联
+编辑`/etc/hosts`文件用DNS关联IP
+##### 在VPC上添加route53私有名称服务
+ - 打开[Route53 Console](https://console.aws.amazon.com/route53/home)
+ - 根据配置创建域名zone
+   - domain名称为: "paddle"
+   - Type: "Private hosted zone for amazon VPC"
+   - VPC ID: `<Your VPC ID>`
+
+   ![route53 zone setting](src/route53_create_zone.png)
+ - 添加记录
+    - 点击zone中刚创建的“paddle”
+    - 点击按钮“Create record set”
+        - Name : leave blank
+        - type: "A"
+        - Value: `<kube-controller ec2 private ip>`
+
+        ![route53 create recordset](src/route53_create_recordset.png)
+ - 检查名称服务
+    - 连接通过kube-aws via ssh创建的任何实例
+    - 运行命令"host paddle"，看看是否ip为返回的kube-controller的私有IP
+
+#### 进入集群
+
+集群运行后如下命令会看到:
+
+```
+$ kubectl --kubeconfig=kubeconfig get nodes
+NAME                                       STATUS    AGE
+ip-10-0-0-134.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-238.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-50.us-west-2.compute.internal    Ready     6m
+ip-10-0-0-55.us-west-2.compute.internal    Ready     6m
+```
+
+
+### 集群安装弹性文件系统
+
+训练数据存放在AWS上的EFS分布式文件系统中.
+
+1. 在[security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)为EFS创建一个安全组
+  1. 可以看到`paddle-cluster-sg-worker` (在sg-055ee37d镜像中)安全组id
+  <center>![](src/worker_security_group.png)</center>
+
+  2. 增加安全组`paddle-efs` ，以`paddle-cluster-sg-worker`的group id作为用户源和`ALL TCP`入栈规则。增加vpc `paddle-cluster-vpc`, 确保可用区是在[Initialize Assets](#initialize-assets)的时候用到的那一个.
+  <center>![](src/add_security_group.png)</center>
+
+2. 利用`paddle-cluster-vpc`私有网络在[EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) 中创建弹性文件系统, 确定子网为`paddle-cluster-Subnet0`和安全区为`paddle-efs`.
+<center>![](src/create_efs.png)</center>
+
+
+### 开始在AWS上进行paddlepaddle的训练
+
+#### 配置Kubernetes卷指向EFS
+
+首先需要创建一个持久卷[PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) 到EFS上
+
+用 `pv.yaml`形式来保存
+```
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: efsvol
+spec:
+  capacity:
+    storage: 100Gi
+  accessModes:
+    - ReadWriteMany
+  nfs:
+    server: EFS_DNS_NAME
+    path: "/"
+```
+
+`EFS_DNS_NAME`: DNS名称最好能描述我们创建的`paddle-efs`，看起来像`fs-2cbf7385.efs.us-west-2.amazonaws.com`
+
+运行下面的命令来创建持久卷:
+```
+kubectl --kubeconfig=kubeconfig create -f pv.yaml
+```
+下一步创建 [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/)来声明持久卷
+
+用`pvc.yaml`来保存.
+```
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: efsvol
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 50Gi
+```
+
+行下面命令来创建持久卷声明:
+```
+kubectl --kubeconfig=kubeconfig create -f pvc.yaml
+```
+
+#### 准备训练数据
+
+启动Kubernetes job在我们创建的持久层上进行下载、保存并均匀拆分训练数据为3份.
+
+用`paddle-data-job.yaml`保存
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/efs"
+          name: efs
+        env:
+        - name: OUT_DIR
+          value: /efs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: efs
+          persistentVolumeClaim:
+            claimName: efsvol
+      restartPolicy: Never
+```
+
+运行下面的命令来启动任务:
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
+```
+任务运行大概需要7分钟，可以使用下面命令查看任务状态，直到`paddle-data`任务的`SUCCESSFUL`状态为`1`时成功，这里here有怎样创建镜像的源码
+```
+$ kubectl --kubeconfig=kubeconfig get jobs
+NAME          DESIRED   SUCCESSFUL   AGE
+paddle-data   1         1            6m
+```
+数据准备完成后的结果是以镜像`paddlepaddle/paddle-tutorial:k8s_data`存放，可以点击这里[here](src/k8s_data/README.md)查看如何创建docker镜像源码
+
+#### 开始训练
+
+现在可以开始运行paddle的训练任务，用`paddle-cluster-job.yaml`进行保存
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: efs
+        persistentVolumeClaim:
+          claimName: efsvol
+      containers:
+      - name: trainer
+        image: paddlepaddle/paddle-tutorial:k8s_train
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: quick_start
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        - name: TRAINER_COUNT
+          value: "3"
+        volumeMounts:
+        - mountPath: "/home/jobpath"
+          name: efs
+        ports:
+        - name: jobport0
+          hostPort: 7164
+          containerPort: 7164
+        - name: jobport1
+          hostPort: 7165
+          containerPort: 7165
+        - name: jobport2
+          hostPort: 7166
+          containerPort: 7166
+        - name: jobport3
+          hostPort: 7167
+          containerPort: 7167
+      restartPolicy: Never
+```
+
+`parallelism: 3, completions: 3` 意思是这个任务会同时开启3个paddlepaddle的pod，当pod启动后3个任务将被完成。
+
+`env` 参数代表容器的环境变量，在这里指定paddlepaddle的参数.
+
+`ports` 指定TCP端口7164 - 7167和`pserver`进行连接，port从`CONF_PADDLE_PORT`(7164)到`CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1`(7167)。我们使用多个端口密集和稀疏参数的更新来提高延迟
+
+运行下面命令来启动任务.
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-claster-job.yaml
+```
+
+检查pods信息
+
+```
+$ kubectl --kubeconfig=kubeconfig get pods
+NAME                       READY     STATUS    RESTARTS   AGE
+paddle-cluster-job-cm469   1/1       Running   0          9m
+paddle-cluster-job-fnt03   1/1       Running   0          9m
+paddle-cluster-job-jx4xr   1/1       Running   0          9m
+```
+
+检查指定pod的控制台输出
+```
+kubectl --kubeconfig=kubeconfig log -f POD_NAME
+```
+
+`POD_NAME`: 任何一个pod的名称 (e.g., `paddle-cluster-job-cm469`).
+
+运行`kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job`来检查训练任务的状态，将会在大约20分钟完成
+
+`pserver`和`trainer`的细节都隐藏在docker镜像`paddlepaddle/paddle-tutorial:k8s_train`中，这里[here](src/k8s_train/README.md) 有创建docker镜像的源码.
+
+#### 检查训练输出
+
+训练输出（模型快照和日志）将被保存在EFS上。我们可以用ssh登录到EC2的工作节点上，查看mount过的EFS和训练输出.
+
+1. ssh登录EC2工作节点
+```
+chmod 400 key-name.pem
+ssh -i key-name.pem core@INSTANCE_IP
+```
+
+`INSTANCE_IP`: EC2上Kubernetes工作节点的公共IP地址，进入[EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) 中检查任何`paddle-cluster-kube-aws-worker`实例的 `public IP`
+
+2. 挂载EFS
+```
+mkdir efs
+sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
+```
+
+`EFS_DNS_NAME`: DNS名称最好能描述我们创建的`paddle-efs`，看起来像`fs-2cbf7385.efs.us-west-2.amazonaws.com`.
+
+文件夹`efs`上有这结构相似的node信息:
+```
+-- paddle-cluster-job
+    |-- ...
+    |-- output
+    |   |-- node_0
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_1
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_2
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- pass-00000
+    |   |   |-- ___fc_layer_0__.w0
+    |   |   |-- ___fc_layer_0__.wbias
+    |   |   |-- done
+    |   |   |-- path.txt
+    |   |   `-- trainer_config.lr.py
+	|   |-- pass-00001...
+```
+`server.log` 是`pserver`的log日志，`train.log`是`trainer`的log日志，模型快照和描述存放在`pass-0000*`.
+
+### Kubernetes集群卸载或删除
+
+#### 删除EFS
+
+到[EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) 中删除创建的EFS卷
+
+#### 删除安全组
+
+去[Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) 删除安全组`paddle-efs`.
+
+#### 删除S3 bucket
+
+进入 [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#)删除S3 bucket
+
+#### 销毁集群
+
+```
+kube-aws destroy
+```
+
+命令会立刻返回，但需要大约5分钟来销毁集群
+
+可以进入 [CludFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active)检查销毁的过程。
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
index bc3d50b3ffd3b703a3a656caa1f96bdcf683f68b..b2dc4da8451af317df76c5b3df328b6f58429610 100644
--- a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
@@ -1,3 +1,372 @@
-# Kubernetes Distributed
+# Distributed Training on Kubernetes
 
-TBD
+We introduced how to create a PaddlePaddle Job with a single node on Kuberentes in the
+previous document.
+In this article, we will introduce how to create a PaddlePaddle job with multiple nodes
+on Kubernetes cluster.
+
+## Overall Architecture
+
+Before creating a training job, the users need to slice the training data and deploy
+the Python scripts along with it into the distributed file system
+(We can use the different type of Kuberentes Volumes to mount different distributed
+file systems). Before training starts, The program will copy the training data into the
+Container and also save the models at the same path during training. The global architecture
+is as follows:
+
+![PaddlePaddle on Kubernetes Architecture](src/k8s-paddle-arch.png)
+
+The above figure describes a distributed training architecture which contains 3 nodes, each 
+Pod mounts a folder of the distributed file system to save training data and models
+by Kubernetes Volume. Kubernetes created 3 Pods for this training phase and scheduled these on
+3 nodes, each Pod has a PaddlePaddle container. After the containers car created,
+PaddlePaddle starts up the communication between PServer and Trainer and read training
+data for this training job.
+
+As the description above, we can start up a PaddlePaddle distributed training job on a 
+Kubernetes ready cluster with the following steps:
+
+1. [Build PaddlePaddle Docker Image](#Build a Docker Image)
+1. [Split training data and upload to the distributed file system](#Upload Training Data)
+1. [Edit a YAML file and create a Kubernetes Job](#Create a Job)
+1. [Check the output](#Check The Output)
+
+We will introduce these steps as follows:
+
+### Build a Docker Image
+
+Training docker image needs to package the paddle pserver and paddle trainer runtimes, as well as two more processes before we can kick off the training:
+
+- Copying the training data into container.
+- Generating the initialization arguments for `Paddle PServer` and `Paddle Training` processes.
+
+Since the paddlepaddle official docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the processes mentioned above to build our training image. for more detail, please find from the following link:
+- https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train/Dockerfile
+
+
+```bash
+$ cd doc/howto/usage/k8s/src/k8s_train
+$ docker build -t [YOUR_REPO]/paddle:mypaddle .
+```
+
+And then upload the new Docker Image to a Docker hub:
+
+```bash
+docker push  [YOUR_REPO]/paddle:mypaddle
+```
+
+**[NOTE]**, in the above command arguments, `[YOUR_REPO]` represents your Docker repository,
+you need to use your repository instead of it. We will replace it with your respository name to
+represent the Docker Image which built in this step.
+
+### Prepare Training Data
+
+We can download and split the training job by creating a Kubernetes Job, or custom your image
+by editing [k8s_train](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/v2/howto/cluster/multi_cluster/src/k8s_train).
+
+Before creating a Job, we need to bind a [persistenVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) by the different type of
+the different file system, the generated dataset would be saved on this volume.
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      hostNetwork: true
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/mnt"
+          name: nfs
+        env:
+        - name: OUT_DIR
+          value: /home/work/mfs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: nfs
+          persistentVolumeClaim:
+            claimName: mfs
+      restartPolicy: Never
+```
+
+Create the Job with the following command:
+
+```bash
+> kubectl create -f xxx.yaml
+```
+
+If created successfully, you can see some information like this:
+
+```base
+[root@paddle-kubernetes-node0 nfsdir]$ tree -d
+.
+`-- paddle-cluster-job
+    |-- 0
+    |   `-- data
+    |-- 1
+    |   `-- data
+    |-- 2
+    |   `-- data
+    |-- output
+    |-- quick_start
+```
+
+The `paddle-cluster-job` above is the job name for this training job; we need 3
+PaddlePaddle training nodes and save the split training data in `paddle-cluster-job` path,
+the folder `0`, `1` and `2` represents the `training_id` on each node, `quick_start` folder is used to store training data, `output` folder is used to store the models and logs.
+
+
+### Create a Job
+
+Kubernetes allow users to create objects with YAML files, and we can use a command-line tool
+to create it.
+
+The Job YAML file describes that which Docker Image would be used in this training job, how much nodes would be created, what's the startup arguments of `Paddle PServer/Trainer` process and what's the type of Volumes. You can find the details of the YAML filed in
+[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job).
+The following is an example for this training job:
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/work/mfs
+      containers:
+      - name: trainer
+        image: [YOUR_REPO]/paddle:mypaddle
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+      restartPolicy: Never
+```
+
+In the above YAML file:
+- `metadata.name`, The job name.
+- `parallelism`, Whether the Kubernetes Job would create `parallelism` Pods at the same time.
+- `completions`, The Job would become the success status only when the number of successful Pod(the exit code is 0)
+  is equal to `completions`.
+- `volumeMounts`, the name field `jobpath` is a key, the `mountPath` field represents
+  the path in the container, and we can define the `jobpath` in `volumes` filed, use `hostPath`
+  to configure the host path we want to mount.
+- `env`, the environment variables in the Container, we pass some startup arguments by
+  this approach, some details are as following:
+  - JOB_PATH：the mount path in the container
+  - JOB_NAME：the job name
+  - TRAIN_CONFIG_DIR：the job path in the container, we can find the training data path by
+    combine with JOB_NAME.
+  - CONF_PADDLE_NIC: the argument `--nics` of `Paddle PServer` process, the network
+    device name.
+  - CONF_PADDLE_PORT: the argument `--port` of `Paddle PServer` process.
+  - CONF_PADDLE_PORTS_NUM: the argument `--ports_num` of `Paddle PServer`, the port number
+    for dense prameter update. 
+  - CONF_PADDLE_PORTS_NUM_SPARSE：the argument `--ports_num_for_sparse` of `Paddle PServer`,
+    the port number for sparse parameter update.
+  - CONF_PADDLE_GRADIENT_NUM：the number of training node, the argument 
+  `--num_gradient_servers` of `Paddle PServer` and `Paddle Trainer`.
+
+You can find some details information at [here]
+(http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。
+
+We can use the command-line tool of Kubernetes to create a Job when we finish the YAML file:
+
+```bash
+kubectl create -f job.yaml
+```
+
+Upon successful creation, Kubernetes would create 3 Pods as PaddlePaddle training node,
+pull the Docker image and begin to train.
+
+
+### Checkout the Output
+
+At the process of training, we can check the logs and the output models which is stored in
+the `output` folder.
+
+**NOTE**, `node_0`, `node_1` and `node_2` represent the
+`trainer_id` of the PaddlePaddle training job rather than the node id of Kubernetes.
+
+```bash
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+
+We can checkout the status of each training Pod by viewing the logs:
+
+```bash
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0
+    --log_period=50 --dot_period=10 --saving_period=1
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
+
+## Some Additional Details
+
+### Using Environment Variables
+
+Usually we use the environment varialbes to configurate the PaddlePaddle Job which runs in
+Kubernetes, `start_paddle.py` provides a start up script to convert the environment variable
+to the start up arguments of PaddlePaddle process:
+
+```bash
+API = "/api/v1/namespaces/"
+JOBSELECTOR = "labelSelector=job-name="
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+```
+
+### Communication between Pods
+
+At the begin of `start_paddle.py`, it would initializes and parses the arguments.
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+And then query the status of all the other Pods of this Job by the function `getPodList()`, and fetch `triner_id` by the function `getIdMap(podlist)` if all the Pods status is `RUNNING`.
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+
+**NOTE**: `getPodList()` would prefetch all the Pods in the current namespace, if some 
+Pods are alreay running, it may cause some error. We will use [statfulesets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets) instead of
+Kubernetes Pod or Replicaset in the future.
+
+The function `getIdMap(podlist)` fetches IPs addresses of `podlist` and then sort them
+to generate `trainer_id`.
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+After getting the `idMap`, we can generate the arguments of `Paddle PServer` and `Paddle Trainer`
+so that we can start up them by `startPaddle(idMap, train_args_dict)`.
+
+### Create Job
+
+The main goal of `startPaddle` is generating the arguments of `Paddle PServer` and
+`Paddle Trainer` processes. Take `Paddle Trainer` as an example, we parse the
+environment variable and then get `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM` and etc...,
+finally find `trainerId` from `idMap` according to its IP address.
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/doc/v2/howto/cmd_parameter/index_en.rst b/doc/v2/howto/cmd_parameter/index_en.rst
index 0e3c72d27aca063f1b6f1c23e55718dba373c40a..f49683948ef78f363e2439cc25332431830eeb24 100644
--- a/doc/v2/howto/cmd_parameter/index_en.rst
+++ b/doc/v2/howto/cmd_parameter/index_en.rst
@@ -2,10 +2,25 @@
 
 Set Command-line Parameters
 ===========================
+The implementation of deep learning algorithms has a variety of characteristics, such as running environment, running stage, structure of the model and the traning strategy. PaddlePaddle supports the user to set various command-line parameters flexibly, which helps to achieve control of the model training or prediction process.
+
+In this part, we take several actual scenarios as an example, and the use of some command-line parameters is displayed:
 
 ..  toctree::
   :maxdepth: 1
 
   use_case_en.md
+
+Then, we summarize and classify the use of all command-line parameters:
+
+..  toctree::
+  :maxdepth: 1
+
   arguments_en.md
+
+Finally, the detailed descriptions are given, and we try to explain the propeties and significance of these command-line parameters in detail:
+
+..  toctree::
+  :maxdepth: 1
+
   detail_introduction_en.md
diff --git a/doc/v2/howto/index_en.rst b/doc/v2/howto/index_en.rst
index 2079be766f2d8e6d63ca11dccd98f80613309ceb..35ef197f58f1f865e2cdbdebb567d5637284637a 100644
--- a/doc/v2/howto/index_en.rst
+++ b/doc/v2/howto/index_en.rst
@@ -1,11 +1,37 @@
 HOW TO
-=======
+========
+
+PaddlePaddle provides the users the ability to flexibly set various command line parameters to control the model training and inference process. Please refer to the following instructions on using PaddlePaddle:
 
 ..  toctree::
   :maxdepth: 1
 
   cmd_parameter/index_en.rst
+
+PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:
+
+..  toctree::
+  :maxdepth: 1
+
   cluster/index_en.rst
+
+PaddlePaddle provides a C-API for inference. We provide the following guidelines  for using the C-API:
+
+..  toctree::
+  :maxdepth: 1
+
   capi/index_en.rst
+
+PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to：
+
+..  toctree::
+  :maxdepth: 1
+
   rnn/index_en.rst
+
+How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to：
+
+..  toctree::
+  :maxdepth: 1
+
   optimization/gpu_profiling_en.rst
diff --git a/doc/v2/howto/optimization/gpu_profiling_cn.rst b/doc/v2/howto/optimization/gpu_profiling_cn.rst
index 25bcaccb6975bc21fba2e8c5843da15c69948d72..f2396716bddd4810fa77c738d41f5482aa6d6055 100644
--- a/doc/v2/howto/optimization/gpu_profiling_cn.rst
+++ b/doc/v2/howto/optimization/gpu_profiling_cn.rst
@@ -50,12 +50,12 @@ GPU则还需要高并行性，才能发挥其全部能力。这正是它们速
 **nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
 在这个教程中，我们主要会介绍nvprof和nvvp。
 
-:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
+:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate
 above profilers.
 
-:code:`paddle/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+:code:`paddle/legacy/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
 
-.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
+.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
    :language: c++
    :lines: 137-151
    :linenos:
@@ -83,7 +83,7 @@ program crashes when CPU version of PaddlePaddle invokes them.
 
 1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
 
-    .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
         :language: c++
         :lines: 137-151
         :emphasize-lines: 8-12,14
@@ -101,8 +101,8 @@ program crashes when CPU version of PaddlePaddle invokes them.
     .. code-block:: bash
         :emphasize-lines: 1,12-15
 
-        > ./paddle/math/tests/test_GpuProfiler
-        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        > ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
         I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
         I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
         [==========] Running 1 test from 1 test case.
@@ -130,7 +130,7 @@ nvprof 工具
 
 1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
 
-    .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
         :language: c++
         :lines: 137-151
         :emphasize-lines: 6-7
@@ -147,13 +147,13 @@ nvprof 工具
 
     .. code-block:: bash
 
-        nvprof  ./paddle/math/tests/test_GpuProfiler
+        nvprof  ./paddle/legacy/math/tests/test_GpuProfiler
 
 然后，您就能获得如下的分析结果：
 
 .. code-block:: bash
 
-    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
     ==78544== Profiling result:
     Time(%)     Time     Calls       Avg       Min       Max  Name
     27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
diff --git a/doc/v2/howto/optimization/gpu_profiling_en.rst b/doc/v2/howto/optimization/gpu_profiling_en.rst
index 50adb7da24906515cb5977db565e9f8a76599fef..6e439be9bba8935cdd65f1c131cfd3725530ec0e 100644
--- a/doc/v2/howto/optimization/gpu_profiling_en.rst
+++ b/doc/v2/howto/optimization/gpu_profiling_en.rst
@@ -51,10 +51,10 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th
 **nvprof** is Nvidia profiler and **nvvp** is (GUI based) Nvidia visual profiler.
 In this tutorial, we will focus on nvprof and nvvp.
 
-:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
+:code:`test_GpuProfiler` from :code:`paddle/legacy/math/tests` directory will be used to evaluate
 above profilers.
 
-.. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
+.. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
    :language: c++
    :lines: 137-151
    :linenos:
@@ -80,7 +80,7 @@ As a simple example, consider the following:
 
 1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
 
-    .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
         :language: c++
         :lines: 137-151
         :emphasize-lines: 8-12,14
@@ -98,8 +98,8 @@ As a simple example, consider the following:
     .. code-block:: bash
         :emphasize-lines: 1,12-15
 
-        > ./paddle/math/tests/test_GpuProfiler
-        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        > ./paddle/legacy/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/legacy/math/tests/test_GpuProfiler
         I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
         I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
         [==========] Running 1 test from 1 test case.
@@ -127,7 +127,7 @@ To use this command line profiler **nvprof**, you can simply issue the following
 
 1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines).
 
-    .. literalinclude:: ../../../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../../paddle/legacy/math/tests/test_GpuProfiler.cpp
         :language: c++
         :lines: 137-151
         :emphasize-lines: 6-7
@@ -144,13 +144,13 @@ To use this command line profiler **nvprof**, you can simply issue the following
 
     .. code-block:: bash
 
-        nvprof  ./paddle/math/tests/test_GpuProfiler
+        nvprof  ./paddle/legacy/math/tests/test_GpuProfiler
 
 Then, you can get the following profiling result:
 
 .. code-block:: bash
 
-    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling application: ./paddle/legacy/math/tests/test_GpuProfiler
     ==78544== Profiling result:
     Time(%)     Time     Calls       Avg       Min       Max  Name
     27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
index b05b66415fbb829f471b1491b9881f65137bfe17..9d6d417075485dceb1ee71f527b408aa6a6638ea 100644
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst
@@ -4,7 +4,7 @@
 单双层RNN API对比介绍
 #####################
 
-本文以PaddlePaddle的双层RNN单元测试为示例，用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型，来讲解如何使用双层RNN。本文中所有的例子，都只是介绍双层RNN的API接口，并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用，请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
+本文以PaddlePaddle的双层RNN单元测试为示例，用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型，来讲解如何使用双层RNN。本文中所有的例子，都只是介绍双层RNN的API接口，并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用，请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
 
 示例1：双层RNN，子序列间无Memory
 ================================
@@ -13,8 +13,8 @@
 
 在本示例中，单层RNN和双层RNN的网络配置，都是将每一句分好词后的句子，使用LSTM作为encoder，压缩成一个向量。区别是RNN使用两层序列模型，将多句话看成一个整体同时使用encoder压缩。二者语意上完全一致。这组语义相同的示例配置如下：
 
-* 单层RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_layer_group.conf>`_
-* 双层RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_layer_group.conf>`_
+* 单层RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_layer_group.conf>`_
+* 双层RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf>`_
 
 
 读取双层序列数据
@@ -24,18 +24,18 @@
 
 - 本例中的原始数据一共有10个样本。每个样本由两部分组成，一个label（此处都为2）和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。
 
-..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
     :language: text
 
 
 - 双层序列数据一共有4个样本。 每个样本间用空行分开，整体数据和原始数据完全一样。但于双层序列的LSTM来说，第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。
 
-..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
     :language: text
 
-其次，对于两种不同的输入数据类型，不同DataProvider对比如下(`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequenceGen.py>`_)\：
+其次，对于两种不同的输入数据类型，不同DataProvider对比如下(`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequenceGen.py>`_)\：
 
-..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
     :language: python
     :lines: 21-39
     :linenos:
@@ -47,7 +47,7 @@
     - words是原始数据中的每一句话，所对应的词表index数组。它是integer_value_sequence类型的，即整数数组。words即为这个数据中的单层时间序列。
     - label是原始数据中对于每一句话的分类标签，它是integer_value类型的。
 
-..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
     :language: python
     :lines: 42-71
     :linenos:
@@ -64,7 +64,7 @@
 
 首先，我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中，RNN对于每一个时间步通过了一个LSTM网络。
 
-..  literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf
     :language: python
     :lines: 38-63
     :linenos:
@@ -85,7 +85,7 @@
 
 * 至此，\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。
 
-..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
     :language: python
     :lines: 38-64
     :linenos:
@@ -107,7 +107,7 @@
 
 - 单层RNN：过了一个很简单的recurrent_group。每一个时间步，当前的输入y和上一个时间步的输出rnn_state做了一个全链接。
 
-..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf
     :language: python
     :lines: 36-48
 
@@ -116,7 +116,7 @@
   - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem，表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中，outer_mem是一个子句的最后一个向量，即整个双层group是将前一个子句的最后一个向量，作为下一个子句memory的初始状态。
   - 从输入数据上看，单双层序列的句子是一样的，只是双层序列将其又做了子序列划分。因此双层序列的配置中，必须将前一个子句的最后一个元素，作为boot_layer传给下一个子句的memory，才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。
 
-..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf
     :language: python
     :lines: 39-66
 
@@ -134,7 +134,7 @@
 
 **输入不等长** 是指recurrent_group的多个输入序列，在每个时间步的子序列长度可以不相等。但序列输出时，需要指定与某一个输入的序列信息是一致的。使用\ :red:`targetInlink`\ 可以指定哪一个输入和输出序列信息一致，默认指定第一个输入。 
 
-示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.conf>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf>`_\ 。
+示例3的配置分别为\ `单层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_\ 和\ `双层不等长RNN <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ 。
 
 示例3对于单层RNN和双层RNN数据完全相同。
 
@@ -152,14 +152,14 @@
 
 * 单层RNN\:
 
-..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
     :language: python
     :lines: 42-59
     :linenos:
 
 * 双层RNN\ \:
 
-..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
     :language: python
     :lines: 41-80
     :linenos:
diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
index e5aa05c117393e81c557ba67609f787b38587efd..a4485f7b5edf21871444801230ab1ee191b1137b 100644
--- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
+++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_en.rst
@@ -1,4 +1,226 @@
+..  _algo_hrnn_rnn_api_compare:
+
+#####################
 API comparision between RNN and hierarchical RNN
-================================================
+#####################
+
+This article takes PaddlePaddle's hierarchical RNN unit test as an example. We will use several examples to illestrate the usage of single-layer and hierarchical RNNs. Each example has two model configurations, one for single-layer, and the other for hierarchical RNN. Although the implementations are different, both the two model configurations' effects are the same. All of the examples in this article only describe the API interface of the hierarchical RNN, while we do not use this hierarchical RNN to solve practical problems. If you want to understand the use of hierarchical RNN in specific issues, please refer to \ :ref:`algo_hrnn_demo`\ 。The unit test file used in this article's example is \ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
+
+Example 1：Hierarchical RNN without Memory between subsequences
+================================
+
+The classical case in the hierarchical RNN is to perform sequence operations on each time series data in the inner layers seperately. And the sequence operations in the inner layers is independent, that is, it does not need to use Memory. 
+
+In this example, the network configuration of single-layer RNNs and hierarchical RNNs are all to use LSTM as en encoder to compress a word-segmented sentence into a vector. The difference is that, RNN uses a hierarchical RNN model, treating multiple sentences as a whole to use encoder to compress simultaneously. They are completely consistent in their semantic meanings. This pair of semantically identical example configurations is as follows：
+
+* RNN\: `sequence_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_layer_group.conf>`_
+* Hierarchical RNN\: `sequence_nest_layer_group.conf <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf>`_
+
+
+Reading hierarchical sequence data
+----------------
+
+Firstly, the original data in this example is as follows \:
+
+- The original data in this example has 10 samples. Each of the sample includes two components: a lable(all 2 here), and a word-segmented sentence. This data is used by single RNN as well. 
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
+    :language: text
+
+
+- The data for hierarchical RNN has 4 samples. Every sample is seperated by a blank line, while the content of the data is the same as the original data. But as for hierarchical LSTM, the first sample will encode two sentences into two vectors simultaneously. The sentence count dealed simultaneously by this 4 samples are \ :code:`[2, 3, 2, 3]`\ .
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
+    :language: text
+
+Secondly, as for these two types of different input data formats, the contrast of different DataProviders are as follows (`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequenceGen.py>`_)\：
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 21-39
+    :linenos:
+
+- This is the DataProvider code for an ordinary single-layer time series. Its description is as follows: 
+  
+  * DataProvider returns two parts, that are "words" and "label"，as line 19 in the above code. 
+
+    - "words" is a list of word table indices corresponding to each word in the sentence in the original data. Its data type is integer_value_sequence, that is integer list. So, "words" is a singler-layer time series in the data. 
+    - "label" is the categorical label of each sentence, whose data type is integer_value. 
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequenceGen.py
+    :language: python
+    :lines: 42-71
+    :linenos:
+
+- As for the same data, the DataProvider code for hierarchical time series. Its description is as follows: 
+
+  - DataProvider returns two lists of data, that are "sentences" and "labels", corresponding to the sentences and labels in each group in the original data of hierarchical time series. 
+  - "sentences" comes from the hierarchical time series original data. As it contains every sentences in each group internally, and each sentences are represented by a list of word table indices, so its data type is integer_value_sub_sequence, which is hierarchical time series. 
+  - "labels" is the categorical lable of each sentence, so it is a sigle-layer time series. 
+
+
+Model configuration
+------------------------------------------
+
+Firstly, let's look at the configuration of single-layer RNN. The hightlighted part of line 9 to line 15 is the usage of single-layer RNN. Here we use the pre-defined RNN process function in PaddlePaddle. In this function, for each time step, RNN passes through an LSTM network. 
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_layer_group.conf
+    :language: python
+    :lines: 38-63
+    :linenos:
+    :emphasize-lines:  9-15
+
+
+Secondly, let's look at the model configuration of hierarchical RNN which has the same semantic meaning. \:
+
+* Most layers in PaddlePaddle do not care about whether the input is time series or not, e.g. \ :code:`embedding_layer`\ . In these layers, every operation is processed on each time step. 
+
+* In the hightlighted part of line 7 to line 26 of this configuration, we transform the hierarchical time series data into single-layer time series data, then process each single-layer time series. 
+
+  * Use the function \ :code:`recurrent_group`\ to transform. Input sequences need to be passed in when transforming. As we want to transform hierarchical time series into single-layer sequences, we need to lable the input data as \ :code:`SubsequenceInput`\ .
+  
+  * In this example, we disassemble every group of the original data into sentences using \ :code:`recurrent_group`\ . Each of the disassembled sentences passes through an LSTM network. This is equivalent to single-layer RNN configuration. 
+
+* Similar to single-layer RNN configuration, we only use the last vector after the encode of LSTM. So we use the operation of \ :code:`last_seq`\ to \ :code:`recurrent_group`\ . But unlike single-layer RNN, we use the last element of every subsequence, so we need to set \ :code:`agg_level=AggregateLevel.TO_SEQUENCE`\ . 
+
+* Till now, \ :code:`lstm_last`\ has the same result as \ :code:`lstm_last`\ in single-layer RNN configuration. 
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
+    :language: python
+    :lines: 38-64
+    :linenos:
+    :emphasize-lines: 7-26
+
+Example 2：Hierarchical RNN with Memory between subsequences
+================================
+
+This example is intended to implement two fully-equivalent fully-connected RNNs using single-layer RNN and hierarchical RNN. 
+
+* As for single-layer RNN, input is a full time series, e.g. \ :code:`[4, 5, 2, 0, 9, 8, 1, 4]`\ .
+
+* As for hierarchical RNN, input is a hierarchical time series which elements are arbitrarily combination of data in single-layer RNN, e.g. \ :code:`[ [4, 5, 2], [0, 9], [8, 1, 4]]`. 
+
+model configuration
+------------------
+
+We select the different parts between single-layer RNN and hierarchical RNN configurations, to compare and analyze the reason why they have same semantic meanings. 
+
+- single-layer RNN：passes through a simple recurrent_group. For each time step, the current input y and the last time step's output rnn_state pass through a fully-connected layer. 
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn.conf
+    :language: python
+    :lines: 36-48
+
+- hierarchical RNN, the outer layer's memory is an element. 
+
+  - The recurrent_group of inner layer's inner_step is nearly the same as single-layer sequence, except for the case of boot_layer=outer_mem, which means using the outer layer's outer_mem as the initial state for the inner layer's memory. In the outer layer's out_step, outer_mem is the last vector of a subsequence, that is, the whole hierarchical group uses the last vector of the previous subsequence as the initial state for the next subsequence's memory. 
+  - From the aspect of the input data, sentences from single-layer and hierarchical RNN are the same. The only difference is that, hierarchical RNN disassembes the sequence into subsequences. So in the hierarchical RNN configuration, we must use the last element of the previous subsequence as a boot_layer for the memory of the next subsequence, so that it makes no difference with "every time step uses the output of last time step" in the sigle-layer RNN configuration. 
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn.conf
+    :language: python
+    :lines: 39-66
+
+..  warning::
+    Currently PaddlePaddle only supports the case that the lengths of the time series of Memory in each time step are the same. 
+
+Example 3：hierarchical RNN with unequal length inputs
+==========================
+
+.. role:: red
+
+.. raw:: html
+
+    <style> .red {color:red} </style>
+
+**unequal length inputs** means in the multiple input sequences of recurrent_group, the lengths of subsequences can be unequal. But the output of the sequence, needs to be consistent with one of the input sequences. Using \ :red:`targetInlink`\ can help you specify which of the input sequences and the output sequence can be consistent, by default is the first input. 
+
+The configurations of Example 3 are \ `sequence_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py>`_ \ and \ `sequence_nest_rnn_multi_unequalength_inputs <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py>`_\ .
+
+The data for the configurations of Example 3's single-layer RNN and hierarchical RNN are exactly the same. 
+
+* For the single-layer RNN, the data has two samples, which are \ :code:`[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]`\ and \ :code:`[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]`\ . Each of the data for the single-layer RNN has two group of features. 
+
+* On the basis of the single-layer's data, hierarchical RNN's data randomly adds some partitions. For example, the first sample is transformed to \ :code:`[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]`\ . 
+
+* You need to pay attention that, PaddlePaddle only supports multiple input hierarchical RNNs that have same amount of subsequences currently. In this example, the two features both have 3 subsequences. Although the length of each subsequence can be different, the amount of subsequences should be the same. 
+
+
+model configuration
+--------
+
+Similar to Example 2's configuration, Example 3's configuration uses single-layer and hierarchical RNN to implement 2 fully-equivalent fully-connected RNNs. 
+
+* single-layer RNN\:
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 42-59
+    :linenos:
+
+* hierarchical RNN\ \:
+
+..  literalinclude:: ../../../../paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+    :language: python
+    :lines: 41-80
+    :linenos:
+
+In the above code, the usage of single-layer and hierarchical RNNs are similar to Example 2, which difference is that it processes 2 inputs simultaneously. As for the hierarchical RNN, the lengths of the 2 input's subsequences are not equal. But we use the parameter \ :code:`targetInlink` \ to set the outper layer's \ :code:`recurrent_group` \ 's output format, so the shape of outer layer's output is the same as the shape of \ :code:`emb2`\ . 
+
+
+Glossary
+======
+
+..  _glossary_memory:
+
+Memory
+------
+
+Memory is a concept when PaddlePaddle is implementing RNN. RNN, recurrent neural network, usually requires some dependency between time steps, that is, the neural network in current time step depends on one of the neurons in the neural network in previous time steps, as the following figure shows: 
+
+..  graphviz:: src/glossary_rnn.dot
+
+The dotted connections in the figure, is the network connections across time steps. When PaddlePaddle is implementing RNN, this connection accross time steps is implemented using a special neural network unit, called Memory. Memory can cache the output of one of the neurons in previous time step, then can be passed to another neuron in next time step. The implementation of an RNN using Memory is as follows: 
+
+..  graphviz:: src/glossary_rnn_with_memory.dot
+
+With this method, PaddlePaddle can easily determine which outputs should cross time steps, and which should not. 
+
+..  _glossary_timestep:
+
+time step
+------
+
+refers to time series
+
+
+..  _glossary_sequence:
+
+time series
+--------
+
+Time series is a series of featured data. The order among these featured data is meaningful. So it is a list of features, not a set of features. As for each element of this list, or the featured data in each series, is called a time step. It must be noted that, the concepts of time series and time steps, are not necessarrily related to "time". As long as the "order" in a series of featured data is meaningful, it can be the input of time series. 
+
+For example, in text classification task, we regard a sentence as a time series. So, each word in the sentence can become the index of the word in the word table. So this sentence can be represented as a list of these indices, e.g.:code:`[9, 2, 3, 5, 3]` . 
+
+For a more detailed and accurate definition of the time series, please refer to `Wikipedia of Time series <https://en.wikipedia.org/wiki/Time_series>`_  or `Chinese Wikipedia of time series <https://zh.wikipedia.org/wiki/%E6%99%82%E9%96%93%E5%BA%8F%E5%88%97>`_  . 
+
+In additioin, Paddle always calls time series as :code:`Sequence` . They are a same concept in Paddle's documentations and APIs. 
+
+..  _glossary_RNN:
+
+RNN
+---
+
+In PaddlePaddle's documentations, RNN is usually represented as :code:`Recurrent neural network` . For more information, please refer to `Wikipedia Recurrent neural network <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_ or `Chinese Wikipedia <https://zh.wikipedia.org/wiki/%E9%80%92%E5%BD%92%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C>`_ . 
+
+In PaddlePaddle, RNN usually means, for the input data of a time series, the neural network between each time steps has a certain relevance. For example, the input of a certain neuron is the output of a certain neuron in the neural network of the last time step. Or, as for each time step, the network structure of the neural network has a directed ring structure. 
+
+..  _glossary_hierarchical_RNN:
+
+hierarchical RNN
+-------
+
+Hierarchical RNN, as the name suggests, means there is a nested relationship in RNNs. The input data is a time series, but for each of the inner featured data, it is also a time series, namely 2-dimentional array, or, array of array. Hierarchical RNN is a neural network that can process this type of input data. 
+
+For example, the task of text classification of a paragragh, meaning to classify a paragraph of sentences. We can treat a paragraph as an array of sentences, and each sentence is an array of words. This is a type of the input data for the hierarchical RNN. We encode each sentence of this paragraph into a vector using LSTM, then encode each of the encoded vectors into a vector of this paragraph using LSTM. Finally we use this paragraph vector perform classification, which is the neural network structure of this hierarchical RNN. 
 
-TBD
diff --git a/doc/v2/howto/rnn/index_en.rst b/doc/v2/howto/rnn/index_en.rst
index e1b20ef2e7bf4c521b613e54577ff6a3feaa8936..6e8b5c61b23ca2725dc0c9761c8dd4165033973c 100644
--- a/doc/v2/howto/rnn/index_en.rst
+++ b/doc/v2/howto/rnn/index_en.rst
@@ -1,10 +1,32 @@
 RNN Models
 ==========
+Recurrent neural networks(RNN) are an important tool to model sequential data. PaddlePaddle provides flexible interface for building complex recurrent neural network. We will demonstrate how to use PaddlePaddle to build RNN models in the following 4 parts.
+
+In the first part, we will guide you how to configure recurrent neural network in PaddlePaddle from simple to complex. First, we will use a vanilla recurrent neural network as an example to show how to configure recurrent neural network architecture. Then We will use the sequence to sequence model as an example to demonstrate how you can configure complex recurrent neural network models gradually.
 
 ..  toctree::
   :maxdepth: 1
 
   rnn_config_en.rst
+
+Recurrent Group is the key unit to build complex recurrent neural network models. The second part describes related concepts and Basic principles of Recurrent Group, and give a detailed description of Recurrent Group API interface. In addition, it also introduces Sequence-level RNN(hierarchical sequence as input) and the usage of Recurrent Group in it.
+
+..  toctree::
+  :maxdepth: 1
+  
   recurrent_group_en.md
+  
+In the third part, two-level sequence is demonstrated briefly and then layers supporting two-level sequence as input are listed and described respectively.
+
+..  toctree::
+  :maxdepth: 1
+  
   hierarchical_layer_en.rst
+
+In the last part, the unit test of hierarchical RNN is presented as an example to explain how to use hierarchical RNN. We will use two-level sequence RNN and single-layer sequence RNN which have same effects with former as the network configuration seperately in unit test.
+
+..  toctree::
+  :maxdepth: 1
+  
   hrnn_rnn_api_compare_en.rst
+
diff --git a/doc/v2/howto/rnn/recurrent_group_en.md b/doc/v2/howto/rnn/recurrent_group_en.md
index d264b0a9f85faffd49c1982117cb5a3ac6ffc015..de6b60f29eb97029a54609cd2194bb7faf3ffec5 100644
--- a/doc/v2/howto/rnn/recurrent_group_en.md
+++ b/doc/v2/howto/rnn/recurrent_group_en.md
@@ -1,3 +1,96 @@
 # Recurrent Group Tutorial
 
-TBD
+## Overview
+
+Sequential data is common in natural language processing.
+
+A sentence is a sequence of words and many sentences form a paragraph further. Therefore, a paragraph can be viewed as a nested sequence with two level, where each element of the sequence is another sequence. That is to say, sequential data could be recursive. An example of two-level recursive sequential data is that an article is composed of a sequence of sentences, and each sentence a sequence of words.
+
+PaddlePaddle and PaddlePaddle v2 support two-level recursive sequential data. The two-level sequence is a very flexible data, which helps us to better describe more complex language data such as discribing paragraphs and several rounds of dialogues. Based on two-level sequence input, we can design and build a flexible, hierarchical RNN model that encodes input data from the word and sentence level. For the support of arbitrary levels, please refer to PaddlePaddle Fluid.
+
+In PaddlePaddle, `recurrent_group` is an arbitrarily complex RNN unit. The user only needs to define the calculation that the RNN will complete in one time step. PaddlePaddle is responsible for the propagation of information and error in time series.
+
+Furthermore, `recurrent_group` can also be extended to handle two-level sequence. By defining two nested `recurrent_group` operations at the clause level and the word level respectively, a hierarchical and complex RNN is finally achieved.
+
+Currently, in the PaddlePaddle, there are `recurrent_group` and some Layers that can process bidirectional sequences. For details, refer to the document: <a href = "hierarchical_layer_en.html">Layers for supporting double-layer sequences as input.</a>
+
+## Related Concepts
+
+### Basic Principle 
+`recurrent_group` is an arbitrarily complex RNN unit supported by PaddlePaddle. The user only needs to focus on the calculations that the RNN is designed to complete within a single time step. The PaddlePaddle is responsible for completing the propagation of information and gradients over time.
+
+In PaddlePaddle, a simple call to `recurrent_group` is as follows:
+
+``` python 
+recurrent_group(step, input, reverse) 
+```
+- step: A callable function that defines the calculations completed by the RNN unit within a time step
+- input: The input must be a single-layer sequence or a double-layer sequence
+- reverse: Whether to process the input sequence in reverse order
+
+The core of using `recurrent_group` is to design the logic of the step function. The step function can be freely combined with various layers supported by PaddlePaddle to complete arbitrary arithmetic logic. The input of `recurrent_group` (input) becomes the input of the step function. Since the step function only focuses on the calculation within one time step of RNN, here `recurrent_group` completes the splitting of the original input data for us.
+
+### Input
+The input sequence processed by `recurrent_group` is mainly divided into the following three types:
+
+- **Input Data**: When putting a two-level sequence into `recurrent_group`, it will be disassembled into a single-level sequence. When putting a single-level sequence into `recurrent_group`, it will be disassembled into a non-sequence and then passed to the step function. This process is completely transparent to the user. There are two possible types: 1) User input via data_layer; 2) Output from other layers.
+		
+- **Read-only Memory Input**: `StaticInput` defines a read-only Memory. The input specified by `StaticInput` will not be disassembled by `recurrent_group`, and each time step of the `recurrent_group` loop will always be able to reference all inputs. It may be a non-sequence or a single-layer sequence.
+	  
+- **Input of Sequence Generation Task**: `GeneratedInput` is only used to specify input data in a sequence generation task.
+
+### Input Example
+
+Sequence generation tasks mostly follow the encoder-decoer architecture. The encoder and decoder can be arbitrary neural network units capable of processing sequences and RNN is the most popular choice.
+
+Given the encoder output and the current word, the decoder predicts the next most likely word each time. In this structure, the decoder accepts two inputs:
+
+- Target sequence to be generated: a input of the decoder and the basis of the decoder loop. `recurrent_group` will disassemble this input type.
+
+- Encoder output, an non-sequencce or single-sequence: a unbounded memory. Each time step in the decoder loop will reference the entire result and should not be disassembled. This type of input must be specified via `StaticInput`. For more discussion on Unbounded Memory, please refer to the paper [Neural Turning Machine](https://arxiv.org/abs/1410.5401).
+
+In a sequence generation task, the decoder RNN always refers to the word vector of the word predicted at the previous moment as the current time input. `GeneratedInput` will automate this process.
+
+### Output
+The `step` function must return the output of one or more Layers. The output of this Layer will be the final output of the entire `recurrent_group`. In the output process, `recurrent_group` will concatenate the output of each time step, which is also transparent to the user.
+
+### Memory
+Memory can only be defined and used in `recurrent_group`. Memory cannot exist independently and must point to a layer defined by PaddlePaddle. Memory is referenced to get a momentary output from this layer, so memory can be interpreted as a delay operation.
+
+The user can explicitly specify the output of a layer to initialize the memory. When not specified, memory is initialized to 0 by default.
+
+## Sequence-level RNN Introduction
+
+`recurrent_group` helps us to split the input sequence, merge the output, and loop through the sequence of computational logic.
+
+Using this feature, the two nested `recurrent_group` can handle the nested two-level sequences, implementing sequence-level RNN structures at both the word and sentence levels.
+
+- Word-level RNN:  each state corresponds to a word.
+- Sequence-level RNN: a sequence-layer RNN consists of multiple word-layer RNNs. Each word-layer RNN (ie, each state of a sequence-layer RNN) has a subsequence.
+
+For convenience of description, the following takes the NLP task as an example. A paragraph containing a subsequence is defined as a two-level sequence, and a sentence containing a word is defined as a single-layer sequence. Then, the zero-level sequence is a word.
+
+## Usage of Sequence-level RNN
+
+### Usage of Training Process
+Using `recurrent_group` requires the following conventions:
+
+- **Single-input Single-output**: Both input and output are single layer sequences.
+  - If there are multiple inputs, the number of words in different input sequences must be exactly equal.
+  - A single-layer sequence is output, and the number of words in the output sequence is the same as the input sequence.
+  - memory: define memory to point to a layer in the step function, get a moment output from this layer by referencing memory to form a recurrent connection. The is_seq parameter of memory must be false. If memory is not defined, the operations within each time step are independent.
+  - boot_layer: the initial state of memory, set 0 by default. is_seq in memory must be false.
+ 
+- **Double-input Double-output**: Both input and output are two-level sequence.
+  - If there are multiple input sequences, the number of subsequence contained in different inputs must be strictly equal, but the number of words in the subsequence may not be equal.
+  - output a two-level sequence. The number of subsequence and the number of words are the same as the specified input sequence and the first input is default.
+  - memory: defining memory in the step function, pointing to a layer, by referring to the memory to get the output of this layer at a time, forming a recurrent connection. The memory defined in the outer `recurrent_group` step function can record the state of the previous subsequence, either as a single-level sequence (only as read-only memory) or as a word. If memory is not defined, the operations between subsequence are independent.
+  - boot_layer: the initial state of memory. It is either a single-level sequence (only as read-only memory) or a vector. The default is not set, that is, the initial state is 0.
+
+- **Double-input Single-output**: not support for now, and output the error with "In hierachical RNN, all out links should be from sequences now".
+ 
+### Usage of Generation Process
+Using `beam_search` need follow those conventions: 
+
+- Word-level RNN: generate the next word from a word.
+- Sequence-level RNN: the single-layer RNN generated subsequence is concatenated into a new double-layer sequence. Semantically, there is no case where a subsequence generates the next subseq directly.
diff --git a/doc/v2/images/FullyConnected.jpg b/doc/v2/images/FullyConnected.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..b2241f401434e527f95ee4e0e541a3f2ff78fd1e
Binary files /dev/null and b/doc/v2/images/FullyConnected.jpg differ
diff --git a/doc/v2/images/add_security_group.png b/doc/v2/images/add_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..bd34f46c9b0ada7027fd53e553e7d033255d25fc
Binary files /dev/null and b/doc/v2/images/add_security_group.png differ
diff --git a/doc/v2/images/bi_lstm.jpg b/doc/v2/images/bi_lstm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..adec1606d64d6e35ffe7e62abfa9a09309b05c84
Binary files /dev/null and b/doc/v2/images/bi_lstm.jpg differ
diff --git a/doc/v2/images/checkpointing.png b/doc/v2/images/checkpointing.png
new file mode 100644
index 0000000000000000000000000000000000000000..c221e8474f90f37e31416cbb19c9452207a0d14c
Binary files /dev/null and b/doc/v2/images/checkpointing.png differ
diff --git a/doc/v2/images/create_efs.png b/doc/v2/images/create_efs.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5f1526033d1daf401700989af1d25919bcb7675
Binary files /dev/null and b/doc/v2/images/create_efs.png differ
diff --git a/doc/v2/images/csr.png b/doc/v2/images/csr.png
new file mode 100644
index 0000000000000000000000000000000000000000..3dc10b8de4f6d3f517624956b1694b689405a031
Binary files /dev/null and b/doc/v2/images/csr.png differ
diff --git a/doc/v2/images/data_dispatch.png b/doc/v2/images/data_dispatch.png
new file mode 100644
index 0000000000000000000000000000000000000000..5bdcc24d6a6d193cb014f8c38b362451fded5e54
Binary files /dev/null and b/doc/v2/images/data_dispatch.png differ
diff --git a/doc/v2/images/dataset.graffle b/doc/v2/images/dataset.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c10a423ed16a23229a9ee33d11bfc82bb59646c8
Binary files /dev/null and b/doc/v2/images/dataset.graffle differ
diff --git a/doc/v2/images/dataset.png b/doc/v2/images/dataset.png
new file mode 100644
index 0000000000000000000000000000000000000000..2fb7f1cce3b6dd21489392557826e95a9f207c34
Binary files /dev/null and b/doc/v2/images/dataset.png differ
diff --git a/doc/v2/images/doc_en.png b/doc/v2/images/doc_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..ed6b9178fba91a3bdf45ae797a9924f84146fbc8
Binary files /dev/null and b/doc/v2/images/doc_en.png differ
diff --git a/doc/v2/images/efs_mount.png b/doc/v2/images/efs_mount.png
new file mode 100644
index 0000000000000000000000000000000000000000..0f9e3cab98445707e5e9baa18ddabe15cdf04576
Binary files /dev/null and b/doc/v2/images/efs_mount.png differ
diff --git a/doc/v2/images/encoder-decoder-attention-model.png b/doc/v2/images/encoder-decoder-attention-model.png
new file mode 100644
index 0000000000000000000000000000000000000000..79f911d4ba12ac0c0d1a936c9df639c302786914
Binary files /dev/null and b/doc/v2/images/encoder-decoder-attention-model.png differ
diff --git a/doc/v2/images/engine.png b/doc/v2/images/engine.png
new file mode 100644
index 0000000000000000000000000000000000000000..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b
Binary files /dev/null and b/doc/v2/images/engine.png differ
diff --git a/doc/v2/images/file_storage.graffle b/doc/v2/images/file_storage.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..50a17e70fa255495337c529a3bf12a5c0024a5be
Binary files /dev/null and b/doc/v2/images/file_storage.graffle differ
diff --git a/doc/v2/images/file_storage.png b/doc/v2/images/file_storage.png
new file mode 100644
index 0000000000000000000000000000000000000000..fccb4e3e7e738224c7f1584326bd5f351ce799aa
Binary files /dev/null and b/doc/v2/images/file_storage.png differ
diff --git a/doc/v2/images/glossary_rnn.dot b/doc/v2/images/glossary_rnn.dot
new file mode 100644
index 0000000000000000000000000000000000000000..2cd0fb1820c44b0e8e0b869f9d39fcad27efa758
--- /dev/null
+++ b/doc/v2/images/glossary_rnn.dot
@@ -0,0 +1,42 @@
+digraph G{
+	subgraph cluster_timestep0 {
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+
+		fc0_0 -> fc0_1
+		fc0_1 -> fc0_2
+	}
+
+	subgraph cluster_timestep1 {
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		color=blue
+
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+	}
+
+	subgraph cluster_timestep2 {
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+	}
+	
+	
+	fc0_1 -> fc1_1 [style="dotted" constraint=false]
+	fc1_1 -> fc2_1 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/v2/images/glossary_rnn_with_memory.dot b/doc/v2/images/glossary_rnn_with_memory.dot
new file mode 100644
index 0000000000000000000000000000000000000000..0f101ec2d8f15aec76c57f328046b6b55cf0c7eb
--- /dev/null
+++ b/doc/v2/images/glossary_rnn_with_memory.dot
@@ -0,0 +1,48 @@
+digraph G{
+	subgraph cluster_timestep0 {
+		label="recurrent timestep i-1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc0_0 [label="fc 0"]
+		fc0_1 [label="fc 1"]
+		fc0_2 [label="fc 2"]
+		m0 [label="memory"]
+		fc0_0 -> fc0_1
+		fc0_1 -> fc0_2
+		fc0_1 -> m0
+		m0 -> fc0_1
+	}
+
+	subgraph cluster_timestep1 {
+		label="recurrent timestep i"
+		node [style=filled];
+		fc1_0 [label="fc 0"]
+		fc1_1 [label="fc 1"]
+		fc1_2 [label="fc 2"]
+		m1 [label="memory"]
+		color=blue
+		fc1_0 -> fc1_1
+		fc1_1 -> fc1_2
+		fc1_1 -> m1
+		m1 -> fc1_1
+	}
+
+	subgraph cluster_timestep2 {
+		label="recurrent timestep i+1"
+		bgcolor=lightgray
+		node [style=filled,color=white]
+		fc2_0 [label="fc 0"]
+		fc2_1 [label="fc 1"]
+		fc2_2 [label="fc 2"]
+		m2 [label="memory"]
+		fc2_0 -> fc2_1
+		fc2_1 -> fc2_2
+		fc2_1 -> m2
+		m2 -> fc2_1
+	}
+	
+	
+	m0 -> m1 [style="dotted" constraint=false]
+	m1 -> m2 [style="dotted" constraint=false]
+
+}
\ No newline at end of file
diff --git a/doc/v2/images/gradients.png b/doc/v2/images/gradients.png
new file mode 100644
index 0000000000000000000000000000000000000000..f031bcf8e4cec14e63075b8b9d2c7bbd9f1b1a3c
Binary files /dev/null and b/doc/v2/images/gradients.png differ
diff --git a/doc/v2/images/init_lock.graffle b/doc/v2/images/init_lock.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fa9149f21b1311eed48ef72ec55e556559d0fc94
Binary files /dev/null and b/doc/v2/images/init_lock.graffle differ
diff --git a/doc/v2/images/init_lock.png b/doc/v2/images/init_lock.png
new file mode 100644
index 0000000000000000000000000000000000000000..92404ee6d6c0f9a7727952bae3c869ba338ecd7f
Binary files /dev/null and b/doc/v2/images/init_lock.png differ
diff --git a/doc/v2/images/k8s-paddle-arch.png b/doc/v2/images/k8s-paddle-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3800c4fe81302d35e49f7dbacb9221c4dfa5cde
Binary files /dev/null and b/doc/v2/images/k8s-paddle-arch.png differ
diff --git a/doc/v2/images/layers.png b/doc/v2/images/layers.png
new file mode 100644
index 0000000000000000000000000000000000000000..306f79b7a844610915eb8944128f57d2b7a3065a
Binary files /dev/null and b/doc/v2/images/layers.png differ
diff --git a/doc/v2/images/managed_policy.png b/doc/v2/images/managed_policy.png
new file mode 100644
index 0000000000000000000000000000000000000000..c7ecda555b81d7750e9292a9ab72d2f517f76a2a
Binary files /dev/null and b/doc/v2/images/managed_policy.png differ
diff --git a/doc/v2/images/matrix.png b/doc/v2/images/matrix.png
new file mode 100644
index 0000000000000000000000000000000000000000..c33ce9cf0335e47cc8c1253304d0fe179186e6f2
Binary files /dev/null and b/doc/v2/images/matrix.png differ
diff --git a/doc/v2/images/nvvp1.png b/doc/v2/images/nvvp1.png
new file mode 100644
index 0000000000000000000000000000000000000000..1af23ac3c52929b2b0645d2f9fa4d4c6db1f6e77
Binary files /dev/null and b/doc/v2/images/nvvp1.png differ
diff --git a/doc/v2/images/nvvp2.png b/doc/v2/images/nvvp2.png
new file mode 100644
index 0000000000000000000000000000000000000000..177c9db708da6863d1075f3e615f5962dbe18b29
Binary files /dev/null and b/doc/v2/images/nvvp2.png differ
diff --git a/doc/v2/images/nvvp3.png b/doc/v2/images/nvvp3.png
new file mode 100644
index 0000000000000000000000000000000000000000..d8f393667d6569b6f1e61ffccac43fae5888b6db
Binary files /dev/null and b/doc/v2/images/nvvp3.png differ
diff --git a/doc/v2/images/nvvp4.png b/doc/v2/images/nvvp4.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f2f3e183295de6cf8ddaf2b3b8a0862aa35f01
Binary files /dev/null and b/doc/v2/images/nvvp4.png differ
diff --git a/doc/v2/images/overview.png b/doc/v2/images/overview.png
new file mode 100644
index 0000000000000000000000000000000000000000..8fb7bbb9dd654bf363d701d0c8cd4a557043d188
Binary files /dev/null and b/doc/v2/images/overview.png differ
diff --git a/doc/v2/images/paddle-cloud-in-data-center.png b/doc/v2/images/paddle-cloud-in-data-center.png
new file mode 100644
index 0000000000000000000000000000000000000000..da5d1a77562480ad1d886f5f21dbd84001d3d508
Binary files /dev/null and b/doc/v2/images/paddle-cloud-in-data-center.png differ
diff --git a/doc/v2/images/paddle-etcd.graffle b/doc/v2/images/paddle-etcd.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..f973dc9b9dbf72e9bc31e2d32822916cd281f8d9
Binary files /dev/null and b/doc/v2/images/paddle-etcd.graffle differ
diff --git a/doc/v2/images/paddle-etcd.png b/doc/v2/images/paddle-etcd.png
new file mode 100644
index 0000000000000000000000000000000000000000..57981ceb4b94f0f7d6dfa63f3d28c0402bf9cc31
Binary files /dev/null and b/doc/v2/images/paddle-etcd.png differ
diff --git a/doc/v2/images/paddle-model-sharding.graffle b/doc/v2/images/paddle-model-sharding.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fba30f0ca2b47f0d202a432821d95e55aac37ec8
Binary files /dev/null and b/doc/v2/images/paddle-model-sharding.graffle differ
diff --git a/doc/v2/images/paddle-model-sharding.png b/doc/v2/images/paddle-model-sharding.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c3f6724ef46c6527e63a4cd8cb0b50fe0167124
Binary files /dev/null and b/doc/v2/images/paddle-model-sharding.png differ
diff --git a/doc/v2/images/paddle-ps-0.png b/doc/v2/images/paddle-ps-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..47ef32806f182cab003da77f1556823b3f6d1721
Binary files /dev/null and b/doc/v2/images/paddle-ps-0.png differ
diff --git a/doc/v2/images/paddle-ps-1.png b/doc/v2/images/paddle-ps-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f3125db73096c52bac6e7c60e1675552857c0774
Binary files /dev/null and b/doc/v2/images/paddle-ps-1.png differ
diff --git a/doc/v2/images/paddle-ps.graffle b/doc/v2/images/paddle-ps.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..0e536ffdd91cd696008b4c01bad3cb53edebdc16
Binary files /dev/null and b/doc/v2/images/paddle-ps.graffle differ
diff --git a/doc/v2/images/paddle-task-queues.graffle b/doc/v2/images/paddle-task-queues.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..4263ed8bfd2ef0e55058828bf23f2fac3595e5fd
Binary files /dev/null and b/doc/v2/images/paddle-task-queues.graffle differ
diff --git a/doc/v2/images/paddle-task-queues.png b/doc/v2/images/paddle-task-queues.png
new file mode 100644
index 0000000000000000000000000000000000000000..5f980266795776752cebd0c346b85c4a75a47780
Binary files /dev/null and b/doc/v2/images/paddle-task-queues.png differ
diff --git a/doc/v2/images/paddle-task-states.graffle b/doc/v2/images/paddle-task-states.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cf1a0b9246d9386a949d2dbb8c32fe84f72eea83
Binary files /dev/null and b/doc/v2/images/paddle-task-states.graffle differ
diff --git a/doc/v2/images/paddle-task-states.png b/doc/v2/images/paddle-task-states.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ae43cb66c071aee9eb90d875e2373b29af9c3e0
Binary files /dev/null and b/doc/v2/images/paddle-task-states.png differ
diff --git a/doc/v2/images/ps_cn.png b/doc/v2/images/ps_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/v2/images/ps_cn.png differ
diff --git a/doc/v2/images/ps_en.png b/doc/v2/images/ps_en.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/v2/images/ps_en.png differ
diff --git a/doc/v2/images/pserver_and_trainer.png b/doc/v2/images/pserver_and_trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541
Binary files /dev/null and b/doc/v2/images/pserver_and_trainer.png differ
diff --git a/doc/v2/images/pserver_init.graffle b/doc/v2/images/pserver_init.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..5f3f1f52be8aa7f9049a8fcd6b7c93c8560c1676
Binary files /dev/null and b/doc/v2/images/pserver_init.graffle differ
diff --git a/doc/v2/images/pserver_init.png b/doc/v2/images/pserver_init.png
new file mode 100644
index 0000000000000000000000000000000000000000..dfe491ff98dd7db1c336093c80964a260df2cd90
Binary files /dev/null and b/doc/v2/images/pserver_init.png differ
diff --git a/doc/v2/images/route53_create_recordset.png b/doc/v2/images/route53_create_recordset.png
new file mode 100644
index 0000000000000000000000000000000000000000..34e476c7beac30fcdde13fccc4cc8d08b4be3d35
Binary files /dev/null and b/doc/v2/images/route53_create_recordset.png differ
diff --git a/doc/v2/images/route53_create_zone.png b/doc/v2/images/route53_create_zone.png
new file mode 100644
index 0000000000000000000000000000000000000000..25b7ddb831c5cba97f4b2edddd27da3234d621af
Binary files /dev/null and b/doc/v2/images/route53_create_zone.png differ
diff --git a/doc/v2/images/sequence_data.png b/doc/v2/images/sequence_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e47a46b8955dfe977e85898fe3c9f33ed28de7e
Binary files /dev/null and b/doc/v2/images/sequence_data.png differ
diff --git a/doc/v2/images/simple_full_hierarchical_recurrent.dot b/doc/v2/images/simple_full_hierarchical_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..ff278a0323bb2c3ef07bf6f016a3a8df05783581
--- /dev/null
+++ b/doc/v2/images/simple_full_hierarchical_recurrent.dot
@@ -0,0 +1,30 @@
+digraph G {
+  rankdir=LR;
+
+  subgraph cluster_t0 {
+    a [label="4"]
+    b [label="5"]
+    c [label="2"]
+  }
+  
+  subgraph cluster_t1 {
+    d [label="0"]
+    e [label="9"]
+  }
+
+  subgraph cluster_t2 {
+    f [label="8"]
+    g [label="1"]
+    h [label="4"]
+  }
+
+  a -> b;
+  b -> c;
+  c -> d [constraint=false];
+
+  d -> e;
+  e -> f [constraint=false];
+  
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/v2/images/simple_full_recurrent.dot b/doc/v2/images/simple_full_recurrent.dot
new file mode 100644
index 0000000000000000000000000000000000000000..cee281fbac993afbd0cc3416570f95965cdf0a59
--- /dev/null
+++ b/doc/v2/images/simple_full_recurrent.dot
@@ -0,0 +1,19 @@
+digraph G {
+  rankdir=LR;
+  a [label="4"]
+  b [label="5"]
+  c [label="2"]
+  d [label="0"]
+  e [label="9"]
+  f [label="8"]
+  g [label="1"]
+  h [label="4"]
+
+  a -> b;
+  b -> c;
+  c -> d;
+  d -> e;
+  e -> f;
+  f -> g;
+  g -> h;
+}
\ No newline at end of file
diff --git a/doc/v2/images/submit-job.graffle b/doc/v2/images/submit-job.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..677cdfb6d9a32168bf71729eb841fa1ca0dd31d6
Binary files /dev/null and b/doc/v2/images/submit-job.graffle differ
diff --git a/doc/v2/images/submit-job.png b/doc/v2/images/submit-job.png
new file mode 100644
index 0000000000000000000000000000000000000000..3046a460a7ba708079e88a560debaa215a694680
Binary files /dev/null and b/doc/v2/images/submit-job.png differ
diff --git a/doc/v2/images/trainer.graffle b/doc/v2/images/trainer.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..43415ed8cf61a5acfa34f8e56b9577f338dbf254
Binary files /dev/null and b/doc/v2/images/trainer.graffle differ
diff --git a/doc/v2/images/trainer.png b/doc/v2/images/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/v2/images/trainer.png differ
diff --git a/doc/v2/images/trainer_cn.png b/doc/v2/images/trainer_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/v2/images/trainer_cn.png differ
diff --git a/doc/v2/images/worker_security_group.png b/doc/v2/images/worker_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..57eb0265a34ad4223b69600d2a3dd355482e0bf5
Binary files /dev/null and b/doc/v2/images/worker_security_group.png differ
diff --git a/doc/v2/images/workflow_of_CAPI.png b/doc/v2/images/workflow_of_CAPI.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4399ade048b3fe10d2d9c714bc34333ca068edb
Binary files /dev/null and b/doc/v2/images/workflow_of_CAPI.png differ
diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt
index 411dc50332672143d7a1f7bd0556ae86dc37f6f3..4500b1f288372ed0e2d9d383234df97ae976c60b 100644
--- a/go/pserver/client/c/test/CMakeLists.txt
+++ b/go/pserver/client/c/test/CMakeLists.txt
@@ -13,4 +13,3 @@
 # limitations under the License.
 #
 cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
-add_style_check_target(test_cclient test_cclient.c)
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
index f17577997bc94b08f3e296c4d6e35682ca3c0e57..eba0c47e195a80fc298f0fdd78c8d6345e963be8 100644
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -16,7 +16,7 @@ package pserver
 
 // #cgo CFLAGS: -I ../../
 // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
-// #include "paddle/optimizer/optimizer.h"
+// #include "paddle/legacy/optimizer/optimizer.h"
 // #include <stdlib.h>
 // #include <string.h>
 import "C"
diff --git a/paddle/.gitignore b/paddle/.gitignore
index f921eef14156a97e4fd250f014960e306b43f35a..01904aa6ef2057afee95ddd6e30cde064b06c52e 100644
--- a/paddle/.gitignore
+++ b/paddle/.gitignore
@@ -1,3 +1,4 @@
+.timestamp
 *.o
 *.a
 .svn
@@ -10,7 +11,6 @@ GTAGS
 *.pb.cc
 *.pb.h
 *_pb2.py
-paddle_*
 output/
 google/
 Makefile
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index a7b249d43bf3ad9924749d5e66618750f19d8bf7..6653244507742b33d9524a7a0e4a5b2b575d358a 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,27 +1,29 @@
-add_subdirectory(cuda)
-add_subdirectory(function)
-add_subdirectory(utils)
-add_subdirectory(math)
-add_subdirectory(gserver)
-add_subdirectory(parameter)
-add_subdirectory(testing)
+if(NOT WITH_FLUID_ONLY)
+  add_subdirectory(legacy/cuda)
+  add_subdirectory(legacy/function)
+  add_subdirectory(legacy/utils)
+  add_subdirectory(legacy/math)
+  add_subdirectory(legacy/gserver)
+  add_subdirectory(legacy/parameter)
 
-if(MOBILE_INFERENCE)
-  add_subdirectory(capi)
-else()
-  add_subdirectory(pserver)
-  add_subdirectory(trainer)
-  add_subdirectory(scripts)
+  if(MOBILE_INFERENCE)
+    add_subdirectory(legacy/capi)
+  else()
+    add_subdirectory(legacy/pserver)
+    add_subdirectory(legacy/trainer)
+    add_subdirectory(scripts)
 
-  if(WITH_C_API)
-    add_subdirectory(capi)
-  endif()
+    if(WITH_C_API)
+      add_subdirectory(legacy/capi)
+    endif()
 
-  if(NOT ANDROID AND NOT IOS)
-    add_subdirectory(fluid)
+    if(WITH_SWIG_PY)
+      add_subdirectory(legacy/api)
+    endif()
   endif()
+endif()
 
-  if(WITH_SWIG_PY)
-    add_subdirectory(api)
-  endif()
+add_subdirectory(testing)
+if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API)
+  add_subdirectory(fluid)
 endif()
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
deleted file mode 100644
index 62d6a574d55d2748635879a21cbbaa474f070cff..0000000000000000000000000000000000000000
--- a/paddle/api/Arguments.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-#include "paddle/parameter/Argument.h"
-
-size_t Arguments::getSlotNum() const { return m->outputs.size(); }
-
-Arguments* Arguments::createArguments(size_t slotNum) {
-  auto args = new Arguments();
-  args->m->outputs.resize(slotNum);
-  return args;
-}
-
-void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); }
-
-Arguments::Arguments() : m(new ArgumentsPrivate()) {}
-
-Arguments::~Arguments() { delete m; }
-
-Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
-  auto p = (std::vector<paddle::Argument>*)(ptr);
-  auto args = new Arguments();
-  args->m->outputs = *p;
-  return args;
-}
-
-Arguments* Arguments::createByPaddleArgument(const void* ptr) {
-  auto p = (paddle::Argument*)(ptr);
-  auto args = new Arguments();
-  args->m->outputs.push_back(*p);
-  return args;
-}
-
-Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.value);
-}
-
-Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.grad);
-}
-
-IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return IVector::createByPaddleVectorPtr(&a.ids);
-}
-
-Matrix* Arguments::getSlotIn(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return Matrix::createByPaddleMatrixPtr(&a.in);
-}
-
-void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.value = m->cast<paddle::Matrix>(mat->getSharedPtr());
-}
-
-void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.grad = m->cast<paddle::Matrix>(mat->getSharedPtr());
-}
-
-void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.in = m->cast<paddle::Matrix>(mat->getSharedPtr());
-}
-
-void Arguments::setSlotIds(size_t idx, IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
-  a.ids = v;
-}
-
-template <typename T1>
-static inline void doCopyFromSafely(std::shared_ptr<T1>& dest,
-                                    std::shared_ptr<T1>& src) {
-  if (src) {
-    if (dest) {
-      dest->copyFrom(*src);
-    } else {
-      dest = src;
-    }
-  }
-}
-
-IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const
-    throw(RangeError) {
-  auto& a = m->getArg(idx);
-  if (a.sequenceStartPositions) {
-    return IVector::createByPaddleVectorPtr(
-        &a.sequenceStartPositions->getMutableVector(false));
-  } else {
-    return nullptr;
-  }
-}
-
-IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const
-    throw(RangeError) {
-  auto& a = m->getArg(idx);
-  if (a.subSequenceStartPositions) {
-    return IVector::createByPaddleVectorPtr(
-        &a.subSequenceStartPositions->getMutableVector(false));
-  } else {
-    return nullptr;
-  }
-}
-
-void Arguments::setSlotSequenceStartPositions(size_t idx,
-                                              IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
-  a.sequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);
-}
-
-void Arguments::setSlotSubSequenceStartPositions(
-    size_t idx, IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
-  a.subSequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);
-}
-
-IVector* Arguments::getSlotSequenceDim(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return IVector::createByPaddleVectorPtr(&a.cpuSequenceDims);
-}
-
-void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
-}
-
-float Arguments::sum() const { return paddle::Argument::sum(m->outputs); }
-
-int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return a.getBatchSize();
-}
-
-void Arguments::setSlotFrameHeight(size_t idx, size_t h) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.setFrameHeight(h);
-}
-
-void Arguments::setSlotFrameWidth(size_t idx, size_t w) throw(RangeError) {
-  auto& a = m->getArg(idx);
-  a.setFrameWidth(w);
-}
-
-size_t Arguments::getSlotFrameHeight(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return a.getFrameHeight();
-}
-
-size_t Arguments::getSlotFrameWidth(size_t idx) const throw(RangeError) {
-  auto& a = m->getArg(idx);
-  return a.getFrameWidth();
-}
-
-void* Arguments::getInternalArgumentsPtr() const { return &m->outputs; }
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
deleted file mode 100644
index cf84568ecdf1227b0d0ed3606a4a9a6e5186af72..0000000000000000000000000000000000000000
--- a/paddle/api/CMakeLists.txt
+++ /dev/null
@@ -1,119 +0,0 @@
-set(API_SOURCES
-    Arguments.cpp
-    ConfigParser.cpp
-    Evaluator.cpp
-    GradientMachine.cpp
-    Matrix.cpp
-    Parameter.cpp
-    ParameterOptimizer.cpp
-    ParameterUpdater.cpp
-    SequenceGenerator.cpp
-    Trainer.cpp
-    Util.cpp
-    Vector.cpp)
-set(API_HEADER
-    PaddleAPI.h
-    Internal.h)
-
-add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api paddle_proto paddle_trainer_lib)
-
-INCLUDE(${SWIG_USE_FILE})
-INCLUDE_DIRECTORIES(${PADDLE_SOURCE_DIR}/paddle)
-
-FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
-
-SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
-
-SET(SWIG_NEED_FLAGS
-    -ftls-model=global-dynamic
-    -Wno-parentheses-equality
-    -Wno-self-assign
-    -Wno-maybe-uninitialized
-    -Wno-missing-field-initializers)
-  FOREACH(flag ${SWIG_NEED_FLAGS})
-  safe_set_cxxflag(SWIG_CXX_FLAGS ${flag})
-ENDFOREACH()
-
-SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}")
-
-SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
-    paddle_parameter
-    paddle_function
-    paddle_math
-    paddle_utils
-    paddle_gserver
-    paddle_pserver
-    paddle_api
-    paddle_cuda
-    paddle_trainer_lib
-    paddle_network
-    paddle_proto
-    ${external_project_dependencies}
-    ${RDMA_LIBS}
-)
-
-IF(APPLE)
-    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
-ELSE(APPLE)
-    SET(START_GROUP "-Xlinker -start-group")
-    SET(END_GROUP "-Xlinker -end-group")
-    SET(ARCHIVE_START "-Wl,--whole-archive")
-    SET(ARCHIVE_END "-Wl,--no-whole-archive")
-ENDIF(APPLE)
-
-SWIG_ADD_MODULE(swig_paddle python Paddle.i)
-SWIG_LINK_LIBRARIES(swig_paddle
-    ${MACOS_LD_FLAGS}
-    ${START_GROUP}
-    ${ARCHIVE_START}
-    paddle_gserver
-    paddle_function
-    ${METRIC_LIBS}
-    ${ARCHIVE_END}
-    paddle_pserver
-    paddle_trainer_lib
-    paddle_network
-    paddle_parameter
-    paddle_optimizer
-    paddle_math
-    paddle_utils
-    paddle_proto
-    paddle_cuda
-    paddle_api
-    ${CMAKE_DL_LIBS}
-    ${EXTERNAL_LIBS}
-    ${CMAKE_THREAD_LIBS_INIT}
-    ${RDMA_LD_FLAGS}
-    ${START_END}
-)
-
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
-    COMMAND ${CMAKE_COMMAND} -E touch .timestamp
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
-    DEPENDS _swig_paddle
-)
-
-# TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
-
-if(WITH_TESTING)
-    IF(NOT PY_PIP_FOUND)
-        SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
-        ExternalProject_Add(pip
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            GIT_REPOSITORY      https://github.com/pypa/pip.git
-            GIT_TAG             9.0.1
-            PREFIX              ${PIP_SOURCES_DIR}
-            CONFIGURE_COMMAND   ""
-            BUILD_COMMAND       ""
-            INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-            BUILD_IN_SOURCE     1
-            #DEPENDS python setuptools python_api_wheel
-        )
-    ENDIF()
-    add_subdirectory(test)
-endif()
diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp
deleted file mode 100644
index d362a1e7cf3c8cd05b8c85cfaf8dbbee8b827d4b..0000000000000000000000000000000000000000
--- a/paddle/api/ConfigParser.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-#include "paddle/trainer/Trainer.h"
-
-struct ParameterConfigPrivate {
-  paddle::ParameterPtr parameter;
-  paddle::ParameterConfig config;
-
-  inline paddle::ParameterConfig* getConfigPtr() {
-    if (parameter != nullptr) {
-      auto& conf = parameter->getConfig();
-      return const_cast<paddle::ParameterConfig*>(&conf);
-    } else {
-      return &config;
-    }
-  }
-};
-
-TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {}
-
-TrainerConfig::~TrainerConfig() { delete m; }
-
-TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
-    const std::string& confPath) {
-  LOG(INFO) << "load trainer config from " << confPath;
-  auto conf = std::make_shared<paddle::TrainerConfigHelper>(confPath);
-  auto retv = new TrainerConfig();
-  retv->m->conf = conf;
-  return retv;
-}
-
-TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
-  auto retv = new TrainerConfig();
-  paddle::TrainerConfig trainerConfigProto;
-  auto conf = std::make_shared<paddle::TrainerConfigHelper>(trainerConfigProto);
-  CHECK(conf->getMutableConfig().ParseFromString(str));
-  retv->m->conf = conf;
-  return retv;
-}
-
-ModelConfig::ModelConfig() : m(new ModelConfigPrivate()) {}
-
-ModelConfig::~ModelConfig() { delete m; }
-
-ModelConfig* TrainerConfig::getModelConfig() const {
-  auto retv = new ModelConfig();
-  retv->m->conf = m->conf;
-  return retv;
-}
-
-ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
-
-ParameterConfig::~ParameterConfig() { delete m; }
-
-ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
-    void* ptr) {
-  auto& p = *(paddle::ParameterPtr*)(ptr);
-  if (p != nullptr) {
-    auto conf = new ParameterConfig();
-    conf->m->parameter = p;
-    return conf;
-  } else {
-    return nullptr;
-  }
-}
-
-ParameterConfig* ParameterConfig::createParameterConfigFromParameterPtr(
-    void* ptr) {
-  auto& p = *(paddle::Parameter*)(ptr);
-  auto conf = new ParameterConfig();
-  conf->m->config = p.getConfig();
-  return conf;
-}
-
-std::string ParameterConfig::toProtoString() const {
-  return m->getConfigPtr()->SerializeAsString();
-}
-
-void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }
-
-OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}
-
-OptimizationConfig::~OptimizationConfig() { delete m; }
-
-std::string OptimizationConfig::toProtoString() {
-  return m->getConfig().SerializeAsString();
-}
-
-OptimizationConfig* TrainerConfig::getOptimizationConfig() const {
-  auto opt_config = new OptimizationConfig();
-  opt_config->m->trainer_config = m->conf;
-  return opt_config;
-}
-
-OptimizationConfig* OptimizationConfig::createFromProtoString(
-    const std::string& str) {
-  auto conf = new OptimizationConfig();
-  conf->m->config.ParseFromString(str);
-  return conf;
-}
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
deleted file mode 100644
index a3d6f0f080abcf1f45d9bc5fbdb39bb6b6ca1553..0000000000000000000000000000000000000000
--- a/paddle/api/GradientMachine.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-#include "Internal.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-
-std::vector<int> GradientMachine::defaultParamTypes = {
-    PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
-
-GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
-
-GradientMachine::~GradientMachine() { delete m; }
-
-GradientMachine* GradientMachine::createFromPaddleModelPtr(
-    const void* confPtr,
-    GradientMatchineCreateMode mode,
-    const std::vector<int>& types) {
-  auto& conf = *(const paddle::ModelConfig*)(confPtr);
-  std::vector<ParameterType> realTypes;
-  staticCastVector(&realTypes, types);
-  auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes);
-  auto machinePtr = std::shared_ptr<paddle::GradientMachine>(machineRawPtr);
-  if (machinePtr != nullptr) {
-    auto machine = new GradientMachine();
-    machine->m->machine = machinePtr;
-    return machine;
-  } else {
-    return nullptr;
-  }
-}
-
-GradientMachine* GradientMachine::createByConfigProtoStr(
-    const std::string& protoStr,
-    GradientMatchineCreateMode mode,
-    const std::vector<int>& types) {
-  paddle::ModelConfig conf;
-  conf.ParseFromString(protoStr);
-  if (conf.IsInitialized()) {
-    return GradientMachine::createFromPaddleModelPtr(&conf, mode, types);
-  } else {
-    return nullptr;
-  }
-}
-
-GradientMachine* GradientMachine::createByModelConfig(
-    ModelConfig* conf,
-    GradientMatchineCreateMode mode,
-    const std::vector<int>& types) {
-  auto confPtr = &conf->m->conf->getModelConfig();
-  return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
-}
-
-void GradientMachine::start() { m->machine->start(); }
-
-void GradientMachine::finish() { m->machine->finish(); }
-
-void GradientMachine::onPassEnd() { m->machine->onPassEnd(); }
-
-void GradientMachine::prefetch(const Arguments& inArgs) {
-  auto& in =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  m->machine->prefetch(in);
-}
-
-void GradientMachine::forward(const Arguments& inArgs,
-                              Arguments* outArgs,
-                              PassType passType) {
-  auto& in =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  auto& out = m->cast<std::vector<paddle::Argument>>(
-      outArgs->getInternalArgumentsPtr());
-  paddle::PassType pt = (paddle::PassType)(passType);
-  m->machine->forward(in, &out, pt);
-}
-
-UpdateCallback::~UpdateCallback() {}
-
-void UpdateCallback::apply(Parameter* p) {
-  // UNUSED(p);
-}
-
-class UpdateCallbackWrapper {
-public:
-  explicit UpdateCallbackWrapper(const UpdateCallback& callback)
-      : callback(const_cast<UpdateCallback&>(callback)) {}
-
-  void operator()(paddle::Parameter* param) {
-    auto p = Parameter::createFromRawPtr(&param);
-    // @TODO Use Stack variable instead.
-    callback.apply(p);
-    delete p;
-  }
-
-private:
-  UpdateCallback& callback;
-};
-
-void GradientMachine::backward(const UpdateCallback& callback) {
-  m->machine->backward(UpdateCallbackWrapper(callback));
-}
-
-void GradientMachine::forwardBackward(const Arguments& inArgs,
-                                      Arguments* outArgs,
-                                      PassType passType,
-                                      const UpdateCallback& callback) {
-  auto& in =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  auto& out = m->cast<std::vector<paddle::Argument>>(
-      outArgs->getInternalArgumentsPtr());
-  paddle::PassType pt = (paddle::PassType)(passType);
-  m->machine->forwardBackward(in, &out, pt, UpdateCallbackWrapper(callback));
-}
-
-void GradientMachine::loadParameters(const std::string& path) {
-  m->machine->loadParameters(path);
-}
-
-size_t GradientMachine::getParameterSize() const {
-  return m->machine->getParameters().size();
-}
-
-Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
-  auto params = m->machine->getParameters();
-  if (i < params.size()) {
-    return Parameter::createFromSharedPtr(&m->machine->getParameters()[i]);
-  } else {
-    throw RangeError();
-  }
-}
-
-size_t GradientMachine::getNonStaticParameterSize() const {
-  return m->machine->getNonStaticParameters().size();
-}
-
-Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) {
-  auto params = m->machine->getNonStaticParameters();
-  if (i < params.size()) {
-    return Parameter::createFromSharedPtr(
-        &m->machine->getNonStaticParameters()[i]);
-  } else {
-    throw RangeError();
-  }
-}
-
-void GradientMachine::randParameters() { m->machine->randParameters(); }
-
-Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
-    throw(UnsupportError) {
-  auto nn = m->machine;
-  if (nn) {
-    auto arg = nn->getLayerOutput(layerName);
-    return Arguments::createByPaddleArgument(&arg);
-  } else {
-    throw UnsupportError();
-  }
-}
-
-SequenceGenerator* GradientMachine::asSequenceGenerator(
-    const std::vector<std::string>& dict,
-    size_t begin_id,
-    size_t end_id,
-    size_t max_length,
-    size_t beam_size) {
-  SequenceGenerator* r =
-      SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
-  r->setDict(dict);
-  r->setBos(begin_id);
-  r->setEos(end_id);
-  r->setMaxLength(max_length);
-  r->setBeamSize(beam_size);
-  return r;
-}
-
-Evaluator* GradientMachine::makeEvaluator() {
-  auto ev = new Evaluator();
-  ev->m->rawPtr = m->machine->makeEvaluator();
-  return ev;
-}
-
-void GradientMachine::eval(Evaluator* evaluator) {
-  m->machine->eval(evaluator->m->rawPtr);
-}
diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp
deleted file mode 100644
index 8282b4629dc08a7fcd9b52cbc3492ac10d8ed55c..0000000000000000000000000000000000000000
--- a/paddle/api/Matrix.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/math/Matrix.h"
-#include <cstring>
-#include <iostream>
-#include "PaddleAPI.h"
-#include "paddle/math/CpuSparseMatrix.h"
-#include "paddle/math/SparseMatrix.h"
-
-struct MatrixPrivate {
-  std::shared_ptr<paddle::Matrix> mat;
-};
-
-Matrix::Matrix() : m(new MatrixPrivate()) {}
-
-Matrix* Matrix::createByPaddleMatrixPtr(void* sharedPtr) {
-  auto* mat = reinterpret_cast<paddle::MatrixPtr*>(sharedPtr);
-  if ((*mat) != nullptr) {
-    auto m = new Matrix();
-    m->m->mat = *mat;
-    return m;
-  } else {
-    return nullptr;
-  }
-}
-
-Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::create(height, width, useGpu);
-  m->m->mat->zero();
-  return m;
-}
-
-Matrix* Matrix::createDense(const std::vector<float>& data,
-                            size_t height,
-                            size_t width,
-                            bool useGpu) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::create(height, width, useGpu);
-  m->m->mat->copyFrom(data.data(), data.size());
-  return m;
-}
-
-Matrix* Matrix::createDenseFromNumpy(float* data,
-                                     int dim1,
-                                     int dim2,
-                                     bool copy,
-                                     bool useGpu) throw(UnsupportError) {
-  if (useGpu) {
-    /// Gpu mode only supports copy=True
-    if (!copy) {
-      throw UnsupportError("Gpu mode only supports copy=True");
-    }
-    return Matrix::createGpuDenseFromNumpy(data, dim1, dim2);
-  } else {
-    return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy);
-  }
-}
-
-Matrix* Matrix::createCpuDenseFromNumpy(float* data,
-                                        int dim1,
-                                        int dim2,
-                                        bool copy) {
-  auto m = new Matrix();
-  if (copy) {
-    m->m->mat = paddle::Matrix::create(dim1, dim2);
-    m->m->mat->copyFrom(data, dim1 * dim2);
-  } else {
-    m->m->mat = paddle::Matrix::create(data, dim1, dim2, false);
-  }
-  return m;
-}
-
-Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::create(dim1, dim2, false, true);
-  m->m->mat->copyFrom(data, dim1 * dim2);
-  return m;
-}
-
-Matrix* Matrix::createSparse(size_t height,
-                             size_t width,
-                             size_t nnz,
-                             bool isNonVal,
-                             bool isTrans,
-                             bool useGpu) {
-  auto m = new Matrix();
-  m->m->mat = paddle::Matrix::createSparseMatrix(
-      height,
-      width,
-      nnz,
-      isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
-      isTrans,
-      useGpu);
-  return m;
-}
-
-Matrix::~Matrix() { delete m; }
-
-size_t Matrix::getHeight() const { return m->mat->getHeight(); }
-
-size_t Matrix::getWidth() const { return m->mat->getWidth(); }
-
-float Matrix::get(size_t x, size_t y) const throw(RangeError) {
-  if (x > this->getWidth() || y > this->getHeight()) {
-    RangeError e;
-    throw e;
-  }
-  return m->mat->getElement(x, y);
-}
-
-void Matrix::set(size_t x, size_t y, float val) throw(RangeError,
-                                                      UnsupportError) {
-  if (x > this->getWidth() || y > this->getHeight()) {
-    RangeError e;
-    throw e;
-  }
-  auto rawMat = m->mat.get();
-  if (auto cDenseMat = dynamic_cast<paddle::CpuMatrix*>(rawMat)) {
-    *(cDenseMat->getData() + x + y * cDenseMat->getWidth()) = val;
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-bool Matrix::isSparse() const {
-  auto raw_mat = m->mat.get();
-  return dynamic_cast<paddle::CpuSparseMatrix*>(raw_mat) != nullptr ||
-         dynamic_cast<paddle::GpuSparseMatrix*>(raw_mat) != nullptr;
-}
-
-SparseValueType Matrix::getSparseValueType() const throw(UnsupportError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr) {
-    return (SparseValueType)cpuSparseMat->getValueType();
-  } else {
-    auto gpuSparseMat =
-        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
-    if (gpuSparseMat != nullptr) {
-      return (SparseValueType)gpuSparseMat->getValueType();
-    } else {
-      UnsupportError e;
-      throw e;
-    }
-  }
-}
-
-SparseFormatType Matrix::getSparseFormat() const throw(UnsupportError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr) {
-    return (SparseFormatType)cpuSparseMat->getFormat();
-  } else {
-    auto gpuSparseMat =
-        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
-    if (gpuSparseMat != nullptr) {
-      return SPARSE_CSR;
-    } else {
-      UnsupportError e;
-      throw e;
-    }
-  }
-}
-
-IntArray Matrix::getSparseRowCols(size_t i) const
-    throw(UnsupportError, RangeError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr &&
-      cpuSparseMat->getFormat() == paddle::SPARSE_CSR) {
-    if (i < cpuSparseMat->getHeight()) {
-      // cpuSparseMat->print(std::cout);
-      size_t len = cpuSparseMat->getColNum(i);
-      return IntArray(cpuSparseMat->getRowCols(i), len);
-    } else {
-      RangeError e;
-      throw e;
-    }
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-IntWithFloatArray Matrix::getSparseRowColsVal(size_t i) const
-    throw(UnsupportError, RangeError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr &&
-      cpuSparseMat->getValueType() == paddle::FLOAT_VALUE) {
-    if (i < cpuSparseMat->getHeight()) {
-      return IntWithFloatArray(cpuSparseMat->getRowValues(i),
-                               cpuSparseMat->getRowCols(i),
-                               cpuSparseMat->getColNum(i));
-    } else {
-      RangeError e;
-      throw e;
-    }
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-FloatArray Matrix::getData() const {
-  auto rawMat = m->mat.get();
-  if (dynamic_cast<paddle::GpuMemoryHandle*>(rawMat->getMemoryHandle().get())) {
-    // is gpu. then copy data
-    float* data = rawMat->getData();
-    size_t len = rawMat->getElementCnt();
-    float* cpuData = new float[len];
-    hl_memcpy_device2host(cpuData, data, len * sizeof(float));
-    FloatArray ret_val(cpuData, len);
-    ret_val.needFree = true;
-    return ret_val;
-  } else {
-    FloatArray ret_val(rawMat->getData(), rawMat->getElementCnt());
-    return ret_val;
-  }
-}
-
-void Matrix::sparseCopyFrom(
-    const std::vector<int>& rows,
-    const std::vector<int>& cols,
-    const std::vector<float>& vals) throw(UnsupportError) {
-  auto cpuSparseMat =
-      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
-  if (cpuSparseMat != nullptr) {
-    // LOG(INFO) <<"RowSize = "<<rows.size()
-    //  <<" ColSize = "<<cols.size()
-    //  <<" ValSize = "<<vals.size();
-    cpuSparseMat->copyFrom(const_cast<std::vector<int>&>(rows),
-                           const_cast<std::vector<int>&>(cols),
-                           const_cast<std::vector<float>&>(vals));
-  } else {
-    UnsupportError e;
-    throw e;
-  }
-}
-
-void* Matrix::getSharedPtr() const { return &m->mat; }
-
-void Matrix::toNumpyMatInplace(float** view_data,
-                               int* dim1,
-                               int* dim2) throw(UnsupportError) {
-  auto cpuMat = std::dynamic_pointer_cast<paddle::CpuMatrix>(m->mat);
-  if (cpuMat) {
-    *dim1 = cpuMat->getHeight();
-    *dim2 = cpuMat->getWidth();
-    *view_data = cpuMat->getData();
-  } else {
-    throw UnsupportError();
-  }
-}
-void Matrix::copyToNumpyMat(float** view_m_data,
-                            int* dim1,
-                            int* dim2) throw(UnsupportError) {
-  static_assert(sizeof(paddle::real) == sizeof(float),
-                "Currently PaddleAPI only support for single "
-                "precision version of paddle.");
-  if (this->isSparse()) {
-    throw UnsupportError();
-  } else {
-    *dim1 = m->mat->getHeight();
-    *dim2 = m->mat->getWidth();
-    *view_m_data = new float[(*dim1) * (*dim2)];
-    if (auto cpuMat = dynamic_cast<paddle::CpuMatrix*>(m->mat.get())) {
-      auto src = cpuMat->getData();
-      auto dest = *view_m_data;
-      std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
-    } else if (auto gpuMat = dynamic_cast<paddle::GpuMatrix*>(m->mat.get())) {
-      auto src = gpuMat->getData();
-      auto dest = *view_m_data;
-      hl_memcpy_device2host(
-          dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
-    } else {
-      LOG(WARNING) << "Unexpected Situation";
-      throw UnsupportError();
-    }
-  }
-}
-
-void Matrix::copyFromNumpyMat(float* data,
-                              int dim1,
-                              int dim2) throw(UnsupportError, RangeError) {
-  if (isSparse()) {
-    throw UnsupportError();
-  } else {
-    if (this->getHeight() == (size_t)dim1 && this->getWidth() == (size_t)dim2) {
-      if (m->mat->getData() != data) {
-        m->mat->copyFrom(data, dim1 * dim2);
-      }
-    } else {
-      throw RangeError();
-    }
-  }
-}
-
-bool Matrix::isGpu() const {
-  auto rawPtr = m->mat.get();
-  return dynamic_cast<paddle::GpuMatrix*>(rawPtr) != nullptr ||
-         dynamic_cast<paddle::GpuSparseMatrix*>(rawPtr) != nullptr;
-}
diff --git a/paddle/api/Paddle.i b/paddle/api/Paddle.i
deleted file mode 100644
index 3237e73745dca58bed923b20851f0f0039a3487c..0000000000000000000000000000000000000000
--- a/paddle/api/Paddle.i
+++ /dev/null
@@ -1,202 +0,0 @@
-%module(directors="1") swig_paddle
-%include "std_string.i"
-%{
-#define SWIG_FILE_WITH_INIT
-#include "api/PaddleAPI.h"   
-%}
-
-%include "exception.i"
-%typemap(throws) UnsupportError %{
-  SWIG_exception(SWIG_RuntimeError, $1.what());
-  SWIG_fail;
-%}
-
-%include "std_vector.i"
-%include "std_pair.i"
-#ifdef SWIGPYTHON
-%include "numpy.i"
-#endif
-
-%init %{
-#ifdef SWIGPYTHON
-import_array();
-#endif
-%}
-
-
-namespace std {
-%template(vector_int) vector<int>;
-%template(vector_uint) vector<unsigned int>;
-%template(vector_float) vector<float>;
-%template(vector_string) vector<string>;
-%template(vector_vec_star) vector<Vector*>;
-}
-#ifdef SWIGPYTHON 
-%typemap(in) (int argc, char** argv) { 
-    int i = 0; 
-    if (!PyList_Check($input)) { 
-        PyErr_SetString(PyExc_ValueError, "Expecting a list"); 
-        return NULL; 
-    } 
-    $1 = PyList_Size($input); 
-    $2 = (char **) malloc(($1+1)*sizeof(char *)); 
-    for (i = 0; i < $1; i++) { 
-        PyObject *s = PyList_GetItem($input,i); 
-        if (!PyString_Check(s)) { 
-            free($2); 
-            PyErr_SetString(PyExc_ValueError, "List items must be strings"); 
-            return NULL; 
-        } 
-        $2[i] = PyString_AsString(s); 
-    } 
-    $2[i] = 0; 
-} 
-%typemap(freearg) (int argc, char** argv) { 
-    if ($2) free($2); 
-} 
-
-%typemap(out) FloatArray {
-  $result = PyList_New($1.length);
-  for (size_t i=0; i<$1.length; ++i) {
-    PyList_SetItem($result, i, PyFloat_FromDouble($1.buf[i]));
-  }  
-  if($1.needFree) {
-    delete [] $1.buf;  
-  }
-}
-
-%typemap(out) IntArray {
-  $result = PyList_New($1.length);  
-  for (size_t i=0; i<$1.length; ++i) {
-    PyList_SetItem($result, i, PyInt_FromLong($1.buf[i]));  
-  }
-  if ($1.needFree) {
-    delete [] $1.buf;  
-  }
-}
-
-%typemap(out) IntWithFloatArray {
-  $result = PyList_New($1.length);
-  for (size_t i=0; i<$1.length; ++i) {
-    PyList_SetItem($result, i, PyTuple_Pack(2, 
-      PyInt_FromLong($1.idxBuf[i]),
-      PyFloat_FromDouble($1.valBuf[i])
-    ));
-  }
-  if ($1.needFree) {
-    delete [] $1.idxBuf;
-    delete [] $1.valBuf;
-  } 
-}
-
-
-%rename(__getitem__) IVector::get;
-%rename(__setitem__) IVector::set;
-%rename(__len__) IVector::getSize;
-%rename(__getitem__) Vector::get;
-%rename(__setitem__) Vector::set;
-%rename(__len__) Vector::getSize;
-%rename(__len__) Parameter::getSize;
-%rename(__call__) ParameterTraverseCallback::apply;
-%rename(__repr__) Evaluator::toString;
-
-%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { 
-  (float* data, int dim1, int dim2) 
-}
-
-%apply (float** ARGOUTVIEW_ARRAY2, int* DIM1, int* DIM2) { 
-  (float** view_data, int* dim1, int* dim2) 
-}
-
-%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) {
-  (float** view_m_data, int* dim1, int* dim2)  
-}
-
-%apply (int** ARGOUTVIEWM_ARRAY1, int* DIM1) {
-  (int** view_m_data, int* dim1)  
-}
-
-%apply (int* INPLACE_ARRAY1, int DIM1) { 
-  (int* data, int dim) 
-}
-
-%apply (int** ARGOUTVIEW_ARRAY1, int* DIM1) {
-  (int** view_data, int* dim1)  
-}
-
-%apply (float* INPLACE_ARRAY1, int DIM1) {
-  (float* data, int dim)
-}
-
-%apply (float** ARGOUTVIEW_ARRAY1, int* DIM1) {
-  (float** view_data, int* dim1)
-}
-
-%apply (float** ARGOUTVIEWM_ARRAY1, int* DIM1) {
-  (float** view_m_data, int* dim1)
-}
-
-#endif
-// The below functions internally create object by "new", so it should use
-// use SWIG to handle gc. There are hints for SWIG to handle GC.
-%newobject Matrix::createZero;
-%newobject Matrix::createSparse;
-%newobject Matrix::createDense;
-%newobject Matrix::createDenseFromNumpy;
-%newobject Matrix::createCpuDenseFromNumpy;
-%newobject Matrix::createGpuDenseFromNumpy;
-%newobject Vector::createZero;
-%newobject Vector::create;
-%newobject Vector::createVectorFromNumpy;
-%newobject Vector::createCpuVectorFromNumpy;
-%newobject Vector::createGpuVectorFromNumpy;
-%newobject IVector::createZero;
-%newobject IVector::create;
-%newobject IVector::createVectorFromNumpy;
-%newobject IVector::createCpuVectorFromNumpy;
-%newobject IVector::createGpuVectorFromNumpy;
-%newobject Trainer::createByCommandLine;
-%newobject Trainer::getForwardOutput;
-%newobject Trainer::getLayerOutput;
-%newobject Arguments::getSlotValue;
-%newobject Arguments::getSlotIds;
-%newobject Arguments::getSlotIn;
-%newobject Arguments::getSlotSequenceStartPositions;
-%newobject Arguments::getSlotSequenceDim;
-%newobject Arguments::createArguments;
-%newobject GradientMachine::createByConfigProtoStr;
-%newobject GradientMachine::createByModelConfig;
-%newobject GradientMachine::asSequenceGenerator;
-%newobject GradientMachine::getParameter;
-%newobject GradientMachine::getLayerOutput;
-%newobject GradientMachine::makeEvaluator;
-%newobject TrainerConfig::createFromTrainerConfigFile;
-%newobject TrainerConfig::getModelConfig;
-%newobject TrainerConfig::getOptimizationConfig;
-%newobject Parameter::getBuf;
-%newobject Parameter::getConfig;
-%newobject ParameterOptimizer::create;
-%newobject ParameterOptimizer::needSpecialTraversal;
-%newobject ParameterUpdater::createLocalUpdater;
-%newobject ParameterUpdater::createRemoteUpdater;
-%newobject ParameterUpdater::createNewRemoteUpdater;
-
-%feature("director") UpdateCallback;
-%feature("autodoc", 1); // To generate method stub, for code hint in ide
-
-// Ignore many private class, and method cannot be handled by swig.
-%ignore MatrixPrivate;
-%ignore TrainerPrivate;
-%ignore IVector::operator[];
-%ignore ArgumentsPrivate;
-%ignore GradientMachinePrivate;
-%ignore TrainerConfigPrivate;
-%ignore ModelConfigPrivate;
-%ignore ParameterPrivate;
-%ignore SequenceGeneratorPrivate;
-%ignore VectorPrivate;
-%ignore ParameterConfigPrivate;
-%ignore OptimizationConfigPrivate;
-%ignore ParameterTraverseCallbackPrivate;
-%include "utils/GlobalConstants.h"
-%include "api/PaddleAPI.h"
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
deleted file mode 100644
index 67368d1a99d980b248789d24a2ea4f466255687a..0000000000000000000000000000000000000000
--- a/paddle/api/PaddleAPI.h
+++ /dev/null
@@ -1,1053 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdexcept>
-#include <string>
-#include <vector>
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/GlobalConstants.h"
-
-/// Import PaddlePaddle's enumeration into global namespace.
-using namespace paddle::enumeration_wrapper;  // NOLINT
-
-/**
- * @brief Initialize paddle.
- *
- * In python, this method should be invoked as
- * @code
- *  import sys
- *  import paddle
- *  paddle.initPaddle(sys.argv)
- *  or you can change arguments as any list of str.
- * @endcode
- */
-void initPaddle(int argc, char** argv);
-
-/// Return FLAGS_use_gpu
-bool isUsingGpu();
-
-/// Set the Flags_use_gpu to the given parameter
-void setUseGpu(bool useGpu);
-
-/// Return true if this py_paddle is compiled in GPU Version
-bool isGpuVersion();
-
-/// Return FLAGS_trainer_count
-int getTrainerCount();
-
-/// The Error of IO Operation. Such as file not found, etc.
-class IOError {};
-
-/// Out of range error
-class RangeError {};
-
-/// Not support Error, such as access GPU memory directly, etc.
-class UnsupportError : public std::runtime_error {
-public:
-  UnsupportError() : std::runtime_error(" "){};
-  UnsupportError(const std::string& message) : std::runtime_error(message){};
-};
-
-/// This type will map to python's list of float.
-struct FloatArray {
-  const float* buf;
-  const size_t length;
-  bool needFree;  // true if the buf is dynamic alloced.
-  FloatArray(const float* b, const size_t l);
-};
-
-/// This type will map to python's list of int
-struct IntArray {
-  const int* buf;
-  const size_t length;
-  bool needFree;
-  IntArray(const int* b, const size_t l, bool f = false);
-};
-
-/// This type will map to python's list of (int, float)
-struct IntWithFloatArray {
-  const float* valBuf;
-  const int* idxBuf;
-  const size_t length;
-  bool needFree;
-  IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false);
-};
-
-enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 };
-
-enum SparseFormatType { SPARSE_CSR = 0, SPARSE_CSC = 1 };
-
-/**
- * In Python, -1UL is hard to write. So define a const value used by python
- * side.
- */
-const size_t NO_SPARSE_ID = -1UL;
-
-struct MatrixPrivate;
-class Matrix {
-  Matrix();  // User Cannot Create Matrix.
-  DISABLE_COPY(Matrix);
-  static Matrix* createByPaddleMatrixPtr(void* sharedPtr);
-
-public:
-  virtual ~Matrix();
-
-  /**
-   * Create A Matrix with height,width, which is filled by zero.
-   */
-  static Matrix* createZero(size_t height,
-                            size_t width,
-                            bool useGpu = isUsingGpu());
-
-  /**
-   * Create Sparse Matrix.
-   *
-   * After create sparse, sparseCopyFrom can be used to fill matrix.
-   *
-   * @param nnz  Number of non zero values.
-   *
-   * @note the default sparse type is SPARSE_CSR.
-   */
-  static Matrix* createSparse(size_t height,
-                              size_t width,
-                              size_t nnz,
-                              bool isNonVal = true,
-                              bool trans = false,
-                              bool useGpu = isUsingGpu());
-
-  /**
-   * Create Dense Matrix.
-   *
-   * @param data  list of float should be passed in python.
-   * @note        the value will be copy into a new matrix.
-   */
-  static Matrix* createDense(const std::vector<float>& data,
-                             size_t height,
-                             size_t width,
-                             bool useGpu = isUsingGpu());
-
-  static Matrix* createDenseFromNumpy(
-      float* data,
-      int dim1,
-      int dim2,
-      bool copy = true,
-      bool useGpu = isUsingGpu()) throw(UnsupportError);
-
-  /**
-   *  Create Cpu Dense Matrix from numpy matrix, dtype=float32
-   *
-   *  @param data  a numpy matrix.
-   *  @param dim1  dimension of data.
-   *  @param dim2  dimension of data.
-   *  @param copy  true if copy into a new matrix, false will create
-   *               matrix inplace. copy = false should be used with extreme
-   *               care because Matrix will share the memory with the given
-   *               numpy array. If the numpy array object is no longer valid,
-   *               the memory space will not be usable.
-   */
-  static Matrix* createCpuDenseFromNumpy(float* data,
-                                         int dim1,
-                                         int dim2,
-                                         bool copy = true);
-
-  /// Create Gpu Dense Matrix from numpy matrix, dtype=float32
-  static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2);
-
-  /**
-   * Cast to numpy matrix.
-   *
-   * @note    This method take no parameter in python.
-   * @note    This method in python will return a numpy matrix, not void.
-   * @note    Only CpuDenseMatrix is supported.
-   *
-   * Example:
-   * @code
-   * import paddle
-   * m = paddle.Matrix.createZero(10,2)
-   * numpy_mat = m.toNumpyMat()
-   * @endcode
-   */
-  void toNumpyMatInplace(float** view_data,
-                         int* dim1,
-                         int* dim2) throw(UnsupportError);
-
-  /// Copy To numpy mat.
-  void copyToNumpyMat(float** view_m_data,
-                      int* dim1,
-                      int* dim2) throw(UnsupportError);
-
-  /// Copy From Numpy Mat
-  void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError,
-                                                               RangeError);
-
-  /// return true if this matrix is sparse.
-  bool isSparse() const;
-
-  SparseValueType getSparseValueType() const throw(UnsupportError);
-
-  SparseFormatType getSparseFormat() const throw(UnsupportError);
-
-  IntArray getSparseRowCols(size_t i) const throw(UnsupportError, RangeError);
-
-  IntWithFloatArray getSparseRowColsVal(size_t i) const
-      throw(UnsupportError, RangeError);
-
-  size_t getHeight() const;
-
-  size_t getWidth() const;
-
-  float get(size_t x, size_t y) const throw(RangeError);
-
-  void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError);
-
-  /// return type is list of float
-  FloatArray getData() const;
-
-  /**
-   * Copy from rows, cols, values.
-   *
-   * if sparse_nonvalue, the values should be []
-   */
-  void sparseCopyFrom(const std::vector<int>& rows,
-                      const std::vector<int>& cols,
-                      const std::vector<float>& values =
-                          std::vector<float>()) throw(UnsupportError);
-
-  bool isGpu() const;
-
-private:
-  void* getSharedPtr() const;
-
-  MatrixPrivate* m;
-  friend class Trainer;
-  friend class GradientMachine;
-  friend class Arguments;
-};
-
-struct VectorPrivate;
-class Vector {
-  DISABLE_COPY(Vector);
-  Vector();
-  static Vector* createByPaddleVectorPtr(void* ptr);
-
-  void* getSharedPtr();
-
-public:
-  ~Vector();
-
-  /// Create Vector filled with zero.
-  static Vector* createZero(size_t sz, bool useGpu = isUsingGpu());
-
-  /**
-   * Create Vector from list of float.
-   *
-   * It will create a new vector, and copy data into it.
-   */
-  static Vector* create(const std::vector<float>& data,
-                        bool useGpu = isUsingGpu());
-
-  static Vector* createVectorFromNumpy(
-      float* data,
-      int dim,
-      bool copy = true,
-      bool useGpu = isUsingGpu()) throw(UnsupportError);
-  /**
-   * Create Cpu Vector from numpy array, which dtype=float32
-   *
-   * If copy is false, it will create vector inplace.
-   */
-  static Vector* createCpuVectorFromNumpy(float* data,
-                                          int dim,
-                                          bool copy = true);
-
-  /// Create Gpu Vector from numpy array, which dtype=float32
-  static Vector* createGpuVectorFromNumpy(float* data, int dim);
-
-  /**
-   * copy from another vector
-   * throw(RangeError) if size of src vector is different from size of this
-   * vector
-   */
-  void copyFrom(Vector* src) throw(RangeError);
-
-  /// Cast to numpy array inplace.
-  void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError);
-
-  /// Copy to numpy array.
-  void copyToNumpyArray(float** view_m_data, int* dim1);
-
-  /// Copy from numpy array.
-  void copyFromNumpyArray(float* data, int dim);
-
-  /// __getitem__ in python
-  float get(const size_t idx) const throw(RangeError, UnsupportError);
-
-  /// __setitem__ in python
-  void set(const size_t idx, float val) throw(RangeError, UnsupportError);
-
-  /// Return is GPU vector or not.
-  bool isGpu() const;
-
-  /// Return a list of float, the memory is alloced and copied.
-  FloatArray getData() const;
-
-  /// __len__ in python
-  size_t getSize() const;
-
-private:
-  VectorPrivate* m;
-
-private:
-  friend class Parameter;
-  friend class ParameterOptimizer;
-  friend struct ParameterTraverseCallbackPrivate;
-};
-
-struct IVectorPrivate;
-class IVector {
-  IVector();
-  DISABLE_COPY(IVector);
-  static IVector* createByPaddleVectorPtr(void* ptr);
-
-public:
-  /// Create IVector filled with zero
-  static IVector* createZero(size_t sz, bool useGpu = isUsingGpu());
-
-  /**
-   * Create IVector from list of int.
-   * It will create a new vector, and copy data into it.
-   */
-  static IVector* create(const std::vector<int>& data,
-                         bool useGpu = isUsingGpu());
-
-  static IVector* createVectorFromNumpy(
-      int* data,
-      int dim,
-      bool copy = true,
-      bool useGpu = isUsingGpu()) throw(UnsupportError);
-
-  /**
-   * Create Cpu IVector from numpy array, which dtype=int32
-   *
-   * If copy is false, it will create vector inplace
-   */
-  static IVector* createCpuVectorFromNumpy(int* data,
-                                           int dim,
-                                           bool copy = true);
-  /**
-   * Create Gpu IVector from numpy array, which dtype=int32
-   */
-  static IVector* createGpuVectorFromNumpy(int* data, int dim);
-
-  /// Cast to numpy array inplace.
-  void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError);
-
-  /// Copy to numpy array.
-  void copyToNumpyArray(int** view_m_data, int* dim1);
-
-  /// Copy from numpy array.
-  void copyFromNumpyArray(int* data, int dim);
-
-  virtual ~IVector();
-
-  /// Return a list of int, the memory is alloced and copied.
-  IntArray getData() const;
-
-  /// This method will map to python [] method.
-  int& operator[](const size_t idx) throw(RangeError, UnsupportError);
-
-  const int& operator[](const size_t idx) const
-      throw(RangeError, UnsupportError);
-
-  inline int get(const size_t idx) const throw(RangeError, UnsupportError) {
-    return (*this)[idx];
-  }
-
-  inline void set(const size_t idx, int val) throw(RangeError, UnsupportError) {
-    (*this)[idx] = val;
-  }
-
-  /// Return true if it is gpu vector.
-  bool isGpu() const;
-
-  /// This method will map to python __len__();
-  size_t getSize() const;
-
-private:
-  void* getSharedPtr() const;
-
-  friend class Arguments;
-  IVectorPrivate* m;
-};
-
-struct ArgumentsPrivate;
-
-/// The Arguments is actual a std::vector<paddle::Argument> in paddle.
-class Arguments {
-private:
-  Arguments();  // Internal Create.
-  DISABLE_COPY(Arguments);
-
-public:
-  /**
-   * Create a arguments with size.
-   * Note that it can be zero.
-   */
-  static Arguments* createArguments(size_t slotNum);
-
-  void resize(size_t slotNum);
-
-  virtual ~Arguments();
-
-  /**
-   * Return the slot number that aguments contains.
-   *
-   * It is actually the vector's size
-   */
-  size_t getSlotNum() const;
-
-  /**
-   * The get functions of Arguments
-   *
-   * the param idx is the slot id
-   */
-  Matrix* getSlotValue(size_t idx) const throw(RangeError);
-  Matrix* getSlotGrad(size_t idx) const throw(RangeError);
-  IVector* getSlotIds(size_t idx) const throw(RangeError);
-  Matrix* getSlotIn(size_t idx) const throw(RangeError);
-  IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError);
-  IVector* getSlotSubSequenceStartPositions(size_t idx) const throw(RangeError);
-  IVector* getSlotSequenceDim(size_t idx) const throw(RangeError);
-  // End Of get functions of Arguments
-
-  int64_t getBatchSize(size_t idx = 0) const throw(RangeError);
-
-  /**
-   * The set functions of Arguments.
-   *
-   * The param idx is the slot id.
-   * The other param is the input Matrix or vector.
-   */
-  void setSlotValue(size_t idx, Matrix* mat) throw(RangeError);
-  void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError);
-  void setSlotIn(size_t idx, Matrix* mat) throw(RangeError);
-  void setSlotIds(size_t idx, IVector* vec) throw(RangeError);
-  void setSlotSequenceStartPositions(size_t idx,
-                                     IVector* vec) throw(RangeError);
-  void setSlotSubSequenceStartPositions(size_t idx,
-                                        IVector* vec) throw(RangeError);
-  void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
-
-  /**
-   * Set the frame height of the idx-th Argument.
-   *
-   * @param ids The index of which Argument.
-   * @param h The height value.
-   */
-  void setSlotFrameHeight(size_t idx, size_t h) throw(RangeError);
-
-  /**
-   * Set the frame height of the idx-th Argument.
-   *
-   * @param ids The index of which Argument.
-   * @param h The height value.
-   */
-  void setSlotFrameWidth(size_t idx, size_t w) throw(RangeError);
-
-  size_t getSlotFrameHeight(size_t idx = 0) const throw(RangeError);
-  size_t getSlotFrameWidth(size_t idx = 0) const throw(RangeError);
-
-  float sum() const;
-
-private:
-  static Arguments* createByPaddleArgumentVector(void* ptr);
-  static Arguments* createByPaddleArgument(const void* ptr);
-  void* getInternalArgumentsPtr() const;
-
-private:
-  ArgumentsPrivate* m;
-  friend class Trainer;
-  friend class GradientMachine;
-  friend class SequenceGenerator;
-};
-
-enum GradientMatchineCreateMode {
-  CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal,
-  CREATE_MODE_SGD_SPARSE_CPU_TRAINING =
-      paddle::GradientMachine::kSgdSparseCpuTraining,
-  CREATE_MODE_TESTING = paddle::GradientMachine::kTesting
-};
-
-struct ParameterConfigPrivate;
-class ParameterConfig {
-  DISABLE_COPY(ParameterConfig);
-  ParameterConfig();
-
-  /**
-   * Internal methods
-   */
-  static ParameterConfig* createParameterConfigFromParameterSharedPtr(
-      void* ptr);
-  static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr);
-  void* getRawPtr();
-
-public:
-  ~ParameterConfig();
-
-  /**
-   * return proto buf string.
-   */
-  std::string toProtoString() const;
-
-private:
-  ParameterConfigPrivate* m;
-
-private:
-  friend class Parameter;
-  friend class ParameterOptimizer;
-  friend struct ParameterTraverseCallbackPrivate;
-};
-
-struct OptimizationConfigPrivate;
-class OptimizationConfig {
-  DISABLE_COPY(OptimizationConfig);
-  OptimizationConfig();
-
-public:
-  static OptimizationConfig* createFromProtoString(const std::string& str);
-  ~OptimizationConfig();
-
-  /**
-   * return protobuf string.
-   */
-  std::string toProtoString();
-
-private:
-  OptimizationConfigPrivate* m;
-
-  friend class TrainerConfig;
-  friend class ParameterOptimizer;
-  friend class ParameterUpdater;
-  friend class Trainer;
-};
-
-struct ParameterPrivate;
-class Parameter {
-private:
-  Parameter();
-  DISABLE_COPY(Parameter);
-
-public:
-  virtual ~Parameter();
-
-  /**
-   * get parameter name
-   */
-  std::string getName() const;
-
-  /**
-   * get buf in Parameter
-   */
-  Vector* getBuf(ParameterType type);
-
-  /**
-   * get id
-   */
-  size_t getID() const;
-
-  ParameterConfig* getConfig();
-  void setValueUpdated();
-
-  bool save(const std::string& filename) const;
-
-  bool load(const std::string& filename) const;
-
-  size_t getSize() const;
-
-private:
-  static Parameter* createFromRawPtr(void* ptr);
-  static Parameter* createFromSharedPtr(void* ptr);
-
-private:
-  ParameterPrivate* m;
-  friend class UpdateCallbackWrapper;
-  friend class GradientMachine;
-  friend class ParameterUpdater;
-};
-
-struct ModelConfigPrivate;
-/**
- * You can only get model config from TrainerConfig.
- *
- * It is used by GradientMachine.
- */
-class ModelConfig {
-private:
-  ModelConfig();
-  DISABLE_COPY(ModelConfig);
-
-public:
-  virtual ~ModelConfig();
-
-private:
-  ModelConfigPrivate* m;
-  friend class TrainerConfig;
-  friend struct TrainerConfigPrivate;
-  friend class GradientMachine;
-};
-
-struct TrainerConfigPrivate;
-/**
- * To get TrainerConfig from file.
- *
- * It is used by GradientMachine.
- */
-class TrainerConfig {
-private:
-  TrainerConfig();
-  DISABLE_COPY(TrainerConfig);
-
-public:
-  virtual ~TrainerConfig();
-
-  static TrainerConfig* createFromTrainerConfigFile(
-      const std::string& configPath);
-  static TrainerConfig* createFromProtoString(const std::string& str);
-
-  ModelConfig* getModelConfig() const;
-
-  OptimizationConfig* getOptimizationConfig() const;
-
-private:
-  TrainerConfigPrivate* m;
-  friend class Trainer;
-};
-
-/**
- * The callback in backword.
- *
- * You can inherit this class in python.
- *
- * @code
- * class UpdateCallbackInPython(paddle.UpdateCallback):
- *   def __init__(self):
- *     paddle.UpdateCallback.__init__(self)
- *
- *   def apply(self, param):
- *     assert isinstance(param, paddle.Parameter)
- * @endcode
- */
-class UpdateCallback {
-public:
-  virtual ~UpdateCallback();
-  virtual void apply(Parameter* p);
-};
-
-struct ParameterTraverseCallbackPrivate;
-class ParameterTraverseCallback {
-  DISABLE_COPY(ParameterTraverseCallback);
-  ParameterTraverseCallback();
-
-public:
-  ~ParameterTraverseCallback();
-
-  void apply(const std::vector<Vector*>& vecs,
-             const ParameterConfig& config,
-             size_t sparseId);
-
-private:
-  ParameterTraverseCallbackPrivate* m;
-  friend class ParameterOptimizer;
-};
-
-/**
- * The ParameterOptimizer Wrapper Class.
- *
- * Basically same as common/ParameterOptimizer.h
- */
-struct ParameterOptimizerPrivate;
-class ParameterOptimizer {
-  DISABLE_COPY(ParameterOptimizer);
-  ParameterOptimizer();
-
-public:
-  static ParameterOptimizer* create(OptimizationConfig* config);
-
-  ~ParameterOptimizer();
-
-  void init(size_t numRows, const ParameterConfig* config);
-
-  void startPass();
-
-  void finishPass();
-
-  void startBatch(size_t numSamplesProcessed);
-
-  void finishBatch();
-
-  void update(const std::vector<Vector*>& vecs,
-              const ParameterConfig& conf,
-              size_t sparseId = NO_SPARSE_ID);
-
-  std::vector<int> getParameterTypes() const;
-
-  ParameterTraverseCallback* needSpecialTraversal(
-      const ParameterConfig& config) const;
-
-private:
-  ParameterOptimizerPrivate* m;
-};
-
-class SequenceGenerator;
-class Evaluator;
-struct GradientMachinePrivate;
-class GradientMachine {
-private:
-  GradientMachine();
-  DISABLE_COPY(GradientMachine);
-
-public:
-  virtual ~GradientMachine();
-
-  /**
-   * Create By ProtoStr.
-   *
-   * The ProtoStr can be generate by python's protobuf code.
-   */
-  static GradientMachine* createByConfigProtoStr(
-      const std::string& protoStr,
-      GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
-      const std::vector<int>& parameterTypes = defaultParamTypes);
-
-  /**
-   * Create by ModelConfig object.
-   *
-   * To get ModelConfig, you can get TrainerConfig from config file, then get
-   * model config by TrainerConfig
-   */
-  static GradientMachine* createByModelConfig(
-      ModelConfig* conf,
-      GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
-      const std::vector<int>& parameterTypes = defaultParamTypes);
-
-  /**
-   * @brief finish
-   */
-  void finish();
-
-  void start();
-
-  /**
-   * Prefetch row ids of sparse parameter.
-   */
-  void prefetch(const Arguments& inArgs);
-
-  /**
-   * Do some thing when train pass ended.
-   */
-  void onPassEnd();
-
-  /**
-   * The forward stage of GradientMachine.
-   *
-   * @note  the outArgs could be zero length arguemnts.
-   * @note  THIS METHOD IS VERY USEFULL FOR PREDICT FROM TRAINED MODEL.
-   */
-  void forward(const Arguments& inArgs, Arguments* outArgs, PassType passType);
-
-  /**
-   * The backward stage of GradientMachine.
-   *
-   * @note  Currently the ParameterUpdater is not wrapped in SWIG, so backward
-   * cannot actually train a network. But you can write a update callback to
-   * change the parameter or implement a ParameterUpdater in python side.
-   */
-  void backward(const UpdateCallback& callback = UpdateCallback());
-
-  /**
-   * Combine forward/backward
-   */
-  void forwardBackward(const Arguments& inArgs,
-                       Arguments* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback = UpdateCallback());
-
-  void loadParameters(const std::string& path);
-
-  size_t getParameterSize() const;
-  Parameter* getParameter(size_t i) throw(RangeError);
-
-  size_t getNonStaticParameterSize() const;
-  Parameter* getNonStaticParameter(size_t i) throw(RangeError);
-
-  void randParameters();
-
-  Arguments* getLayerOutput(const std::string& layerName) const
-      throw(UnsupportError);
-
-  /**
-   * Create a sequence generator.
-   *
-   * @note  It just like a paddle_gen_sequence.
-   */
-  SequenceGenerator* asSequenceGenerator(
-      const std::vector<std::string>& dict = std::vector<std::string>(),
-      size_t begin_id = 0UL,
-      size_t end_id = 0UL,
-      size_t max_length = 100UL,
-      size_t beam_size = -1UL);
-
-  Evaluator* makeEvaluator();
-
-  void eval(Evaluator* evaluator);
-
-private:
-  GradientMachinePrivate* m;
-
-  static GradientMachine* createFromPaddleModelPtr(
-      const void* confPtr,
-      GradientMatchineCreateMode mode,
-      const std::vector<int>& types);
-
-  // Not to use c++ 11 init-list, so we use static var as function default arg.
-  static std::vector<int> defaultParamTypes;
-  friend class Trainer;
-  friend class ParameterUpdater;
-};
-
-struct ParameterUpdaterPrivate;
-class ParameterUpdater {
-private:
-  ParameterUpdater();
-
-public:
-  static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
-  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
-                                               int passCount,
-                                               bool useSparseUpdater);
-  static ParameterUpdater* createNewRemoteUpdater(
-      OptimizationConfig* config,
-      const std::string pserverSpec,
-      const bool useEtcd) throw(UnsupportError);
-  ~ParameterUpdater();
-
-  /**
-   * @brief initialize Parameter Updater by GradientMachine.
-   * @param gm
-   */
-  void init(const GradientMachine& gm);
-
-  /**
-   * @brief begin of a training/testing of one pass.
-   */
-  void startPass();
-
-  /**
-   * @brief end of a traning/testing of one pass.
-   */
-  void finishPass();
-
-  /**
-   * @brief begin of a training/testing of one batch.
-   * @param data batch's size
-   * @return PassType, mostly will be training.
-   */
-  PassType startBatch(size_t batchSize);
-
-  /**
-   * @brief end of a traning/testing of one batch
-   * @param cost current batch cost.
-   */
-  void finishBatch(float cost);
-
-  /**
-   * @brief update a parameter (by local optimizer or by cluster pserver)
-   * @param param
-   */
-  void update(Parameter* param);
-
-  /**
-   * @breif only get required sparse rows by default.
-   * @param fullSize: get full matrix parameter if *fullSize* set
-   * @param apply: get PARAMETER_APPLY on pserver if *apply* set
-   */
-  void getParametersRemote(bool fullSize = false, bool apply = false);
-
-  /**
-   * @brief restore the average parameter.
-   * @note It is only used in AverageOptimizer. Restore will get the current
-   * PARAMETER_VALUE back.
-   */
-  void restore();
-
-  /**
-   * @brief apply. Store the average parameter.
-   * @note It is only used in AverageOptimizer. Apply will store the current
-   * PARAMETER_VALUE to buffer, calcaualte current Average Parameter, and save
-   * it to PARAMETER_VALUE.
-   */
-  void apply();
-
-  /**
-   * @brief catchUpWith The Regularization will be delayed in many situations(
-   * pserver, local sparse). Catch Up means catch the regularization up, apply
-   * regularization to all params.
-   */
-  void catchUpWith();
-
-private:
-  ParameterUpdaterPrivate* m;
-};
-
-struct EvaluatorPrivate;
-class Evaluator {
-private:
-  Evaluator();
-  DISABLE_COPY(Evaluator);
-
-public:
-  ~Evaluator();
-
-  /**
-   * @brief begin an evaluate stage.
-   */
-  void start();
-
-  /**
-   * @brief end an evaluate stage.
-   */
-  void finish();
-
-  /**
-   * @brief toString will get a evaluate result.
-   *
-   * __repr__ method in python
-   */
-  std::string toString();
-
-  std::vector<std::string> getNames() const;
-
-  double getValue(const std::string name) const;
-
-private:
-  EvaluatorPrivate* m;
-
-  friend class GradientMachine;
-};
-
-struct TrainerPrivate;
-class Trainer {
-private:
-  TrainerPrivate* m;
-  Trainer();
-  Trainer(TrainerConfig* optConfig, GradientMachine* gm);
-  DISABLE_COPY(Trainer);
-
-public:
-  virtual ~Trainer();
-
-  /// Create A Trainer By TrainerConfig. using paddle command line.
-  static Trainer* createByCommandLine() throw(IOError);
-
-  static Trainer* create(TrainerConfig* optConfig,
-                         GradientMachine* gm) throw(IOError);
-
-  /// Start training
-  void startTrain();
-
-  /// Finish training
-  void finishTrain();
-
-  /// Start a pass.
-  void startTrainPass();
-
-  /// Finish a pass
-  void finishTrainPass();
-
-  /**
-   * Train one batch,
-   *
-   * @return true if all batch finished.
-   */
-  bool trainOneBatch(size_t batchSize);
-
-  void trainOneDataBatch(size_t batchSize, const Arguments& args);
-
-  void startTestPeriod();
-  void testOneDataBatch(size_t batchSize, const Arguments& args);
-  void finishTestPeriod();
-
-  void forwardOneBatch(size_t batchSize);
-
-  Arguments* getForwardOutput();
-
-  Arguments* getLayerOutput(const std::string& layerName) const;
-};
-
-/// the N-Best results generated from one input sequence.
-class ISequenceResults {
-public:
-  virtual ~ISequenceResults();
-
-  /// Number of result.
-  virtual size_t getSize() const = 0;
-
-  /**
-   * Get sentence from dictionary.
-   *
-   * @param id  the index of result.
-   * @param split  if true, the return sentence will be splited with ' ' by
-   *               each word. Default is false.
-   */
-  virtual std::string getSentence(size_t id, bool split = false) const
-      throw(RangeError) = 0;
-  virtual std::vector<int> getSequence(size_t id) const throw(RangeError) = 0;
-  virtual float getScore(size_t id) const throw(RangeError) = 0;
-};
-
-struct SequenceGeneratorPrivate;
-class SequenceGenerator {
-  DISABLE_COPY(SequenceGenerator);
-  SequenceGenerator();
-
-public:
-  virtual ~SequenceGenerator();
-
-  /**
-   * Generate Sequence by input.
-   *
-   * @note  The inArgs is just one sequence of data.
-   * @note  The return will get a N-best generate result by inArgs.
-   *        Sort by score.
-   */
-  ISequenceResults* generateSequence(const Arguments& inArgs) const;
-
-  void setDict(const std::vector<std::string>& dict);
-  void setBos(size_t bos);
-  void setEos(size_t eos);
-  void setMaxLength(size_t maxlength);
-  void setBeamSize(size_t beamSize);
-
-private:
-  static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr);
-  friend class GradientMachine;
-
-private:
-  SequenceGeneratorPrivate* m;
-};
diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h
deleted file mode 100644
index e141fcd761d7db2d3836a6343700ac4a7ca80c16..0000000000000000000000000000000000000000
--- a/paddle/api/PaddleAPIPrivate.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <memory>
-#include "PaddleAPI.h"
-#include "paddle/gserver/evaluators/Evaluator.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/parameter/ParameterUpdaterBase.h"
-#include "paddle/trainer/TrainerConfigHelper.h"
-
-struct GradientMachinePrivate {
-  std::shared_ptr<paddle::GradientMachine> machine;
-
-  template <typename T>
-  inline T& cast(void* ptr) {
-    return *(T*)(ptr);
-  }
-};
-
-struct OptimizationConfigPrivate {
-  std::shared_ptr<paddle::TrainerConfigHelper> trainer_config;
-  paddle::OptimizationConfig config;
-
-  const paddle::OptimizationConfig& getConfig() {
-    if (trainer_config != nullptr) {
-      return trainer_config->getOptConfig();
-    } else {
-      return config;
-    }
-  }
-};
-
-struct TrainerConfigPrivate {
-  std::shared_ptr<paddle::TrainerConfigHelper> conf;
-  TrainerConfigPrivate() {}
-};
-
-struct ModelConfigPrivate {
-  std::shared_ptr<paddle::TrainerConfigHelper> conf;
-};
-
-struct ArgumentsPrivate {
-  std::vector<paddle::Argument> outputs;
-
-  inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
-    if (idx < outputs.size()) {
-      return outputs[idx];
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-
-  template <typename T>
-  std::shared_ptr<T>& cast(void* rawPtr) const {
-    return *(std::shared_ptr<T>*)(rawPtr);
-  }
-};
-
-struct ParameterUpdaterPrivate {
-  std::unique_ptr<paddle::ParameterUpdater> updater;
-};
-
-struct ParameterPrivate {
-  std::shared_ptr<paddle::Parameter> sharedPtr;
-  paddle::Parameter* rawPtr;  // rawPtr only used in ParameterUpdater,
-                              // in other situation sharedPtr should
-                              // contains value.
-
-  ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {}
-
-  paddle::Parameter* getPtr() {
-    if (sharedPtr) {
-      return sharedPtr.get();
-    } else {
-      return rawPtr;
-    }
-  }
-};
-
-struct EvaluatorPrivate {
-  paddle::Evaluator* rawPtr;
-
-  EvaluatorPrivate() : rawPtr(nullptr) {}
-  ~EvaluatorPrivate() { delete rawPtr; }
-};
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
deleted file mode 100644
index 589d22e74e742de2595a9efd17412ddc55159230..0000000000000000000000000000000000000000
--- a/paddle/api/Parameter.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/parameter/Parameter.h"
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-Parameter::Parameter() : m(new ParameterPrivate()) {}
-
-Parameter::~Parameter() { delete m; }
-
-Parameter* Parameter::createFromRawPtr(void* ptr) {
-  auto p = new Parameter();
-  p->m->rawPtr = *static_cast<paddle::Parameter**>(ptr);
-  return p;
-}
-
-Parameter* Parameter::createFromSharedPtr(void* ptr) {
-  auto& p = *(paddle::ParameterPtr*)(ptr);
-  if (p == nullptr) {
-    return nullptr;
-  } else {
-    auto retParam = new Parameter();
-    retParam->m->sharedPtr = p;
-    return retParam;
-  }
-}
-
-std::string Parameter::getName() const { return m->getPtr()->getName(); }
-
-Vector* Parameter::getBuf(ParameterType type) {
-  auto buf = m->getPtr()->getBuf(type);
-  return Vector::createByPaddleVectorPtr(&buf);
-}
-
-ParameterConfig* Parameter::getConfig() {
-  if (m->sharedPtr) {
-    return ParameterConfig::createParameterConfigFromParameterSharedPtr(
-        &m->sharedPtr);
-  } else {
-    return ParameterConfig::createParameterConfigFromParameterPtr(m->rawPtr);
-  }
-}
-
-size_t Parameter::getID() const { return m->getPtr()->getID(); }
-
-void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
-
-bool Parameter::save(const std::string& filename) const {
-  return m->getPtr()->save(filename);
-}
-
-bool Parameter::load(const std::string& filename) const {
-  return m->getPtr()->load(filename);
-}
-
-size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
deleted file mode 100644
index d4620be3e6f26cdd4caffffac712e4ef936b222a..0000000000000000000000000000000000000000
--- a/paddle/api/ParameterOptimizer.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/parameter/ParameterOptimizer.h"
-#include <algorithm>
-#include "Internal.h"
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-struct ParameterOptimizerPrivate {
-  std::unique_ptr<paddle::ParameterOptimizer> optimizer;
-};
-
-struct ParameterTraverseCallbackPrivate {
-  paddle::ParameterOptimizer::TraverseCallback callback;
-
-  ParameterTraverseCallbackPrivate() {}
-
-  ParameterTraverseCallbackPrivate(
-      const paddle::ParameterOptimizer::TraverseCallback& callback)
-      : callback(callback) {}
-
-  void apply(const std::vector<Vector*>& vecs,
-             const ParameterConfig& conf,
-             size_t sparseId) {
-    std::vector<paddle::VectorPtr> real_vecs;
-    real_vecs.resize(vecs.size());
-    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
-      if (v) {
-        return *(paddle::VectorPtr*)(v->getSharedPtr());
-      } else {
-        return paddle::VectorPtr();
-      }
-    });
-
-    paddle::ParameterConfig& real_conf =
-        *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
-                                        .getRawPtr());
-    callback(real_vecs.data(), real_conf, sparseId);
-  }
-};
-
-ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
-
-ParameterOptimizer::~ParameterOptimizer() { delete m; }
-
-ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
-  CHECK(config != nullptr);
-  auto retOptimizer = new ParameterOptimizer();
-  retOptimizer->m->optimizer.reset(
-      paddle::ParameterOptimizer::create(config->m->getConfig(), false));
-  return retOptimizer;
-}
-
-void ParameterOptimizer::init(size_t numRows, const ParameterConfig* config) {
-  auto& conf = *(paddle::ParameterConfig*)(const_cast<ParameterConfig*>(config)
-                                               ->getRawPtr());
-  m->optimizer->init(numRows, &conf);
-}
-
-void ParameterOptimizer::startPass() { m->optimizer->startPass(); }
-
-void ParameterOptimizer::finishPass() { m->optimizer->finishPass(); }
-
-void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
-  constexpr size_t high_1 = 1UL << (sizeof(size_t) * 8 - 1);
-  CHECK_EQ(numSamplesProcessed & high_1, 0UL);  // Safely cast.
-  m->optimizer->startBatch((int64_t)numSamplesProcessed);
-}
-
-void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
-
-void ParameterOptimizer::update(const std::vector<Vector*>& vecs,
-                                const ParameterConfig& conf,
-                                size_t sparseId) {
-  ParameterTraverseCallbackPrivate invoker(
-      [&](const paddle::VectorPtr _vecs[],
-          const paddle::ParameterConfig& config,
-          size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
-  invoker.apply(vecs, conf, sparseId);
-}
-
-std::vector<int> ParameterOptimizer::getParameterTypes() const {
-  std::vector<int> returnValue;
-  staticCastVector(&returnValue, m->optimizer->getParameterTypes());
-  return returnValue;
-}
-
-ParameterTraverseCallback::ParameterTraverseCallback()
-    : m(new ParameterTraverseCallbackPrivate()) {}
-
-ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }
-
-void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
-                                      const ParameterConfig& conf,
-                                      size_t sparseId) {
-  m->apply(vecs, conf, sparseId);
-}
-
-ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  auto& param_config =
-      *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(config)
-           .getRawPtr();
-  auto callback = m->optimizer->needSpecialTraversal(param_config);
-  if (callback) {
-    auto retCallback = new ParameterTraverseCallback();
-    retCallback->m->callback = callback;
-    return retCallback;
-  } else {
-    return nullptr;
-  }
-}
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
deleted file mode 100644
index 63c000c959f67dc682190b73bac24640ca8d0682..0000000000000000000000000000000000000000
--- a/paddle/api/ParameterUpdater.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-
-#include "PaddleAPIPrivate.h"
-#ifndef PADDLE_WITHOUT_GOLANG
-#include "paddle/trainer/NewRemoteParameterUpdater.h"
-#endif
-#include "paddle/trainer/RemoteParameterUpdater.h"
-#include "paddle/trainer/ThreadParameterUpdater.h"
-
-ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
-
-ParameterUpdater *ParameterUpdater::createLocalUpdater(
-    OptimizationConfig *config) {
-  auto updater = new ParameterUpdater();
-  updater->m->updater.reset(
-      new paddle::SgdThreadUpdater(config->m->getConfig()));
-  return updater;
-}
-
-ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
-    OptimizationConfig *config,
-    const std::string pserverSpec,
-    const bool useEtcd) throw(UnsupportError) {
-#ifndef PADDLE_WITHOUT_GOLANG
-  auto updater = new ParameterUpdater();
-  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
-      config->m->getConfig(), pserverSpec, useEtcd));
-  return updater;
-#else
-  throw UnsupportError("not compiled with WITH_GOLANG");
-#endif
-}
-
-ParameterUpdater *ParameterUpdater::createRemoteUpdater(
-    OptimizationConfig *config, int passCount, bool useSparseUpdater) {
-  auto updater = new ParameterUpdater();
-  auto remoteUpdater = new paddle::RemoteParameterUpdater(
-      config->m->getConfig(), passCount, nullptr);
-  if (useSparseUpdater) {
-    std::unique_ptr<paddle::ParameterUpdater> remoteUpdaterPtr(remoteUpdater);
-    auto sparseRemoteUpdater =
-        new paddle::SparseRemoteParameterUpdaterComposite(
-            config->m->getConfig(),
-            passCount,
-            false,
-            std::move(remoteUpdaterPtr));
-    updater->m->updater.reset(sparseRemoteUpdater);
-  } else {
-    updater->m->updater.reset(remoteUpdater);
-  }
-  return updater;
-}
-
-ParameterUpdater::~ParameterUpdater() { delete m; }
-
-void ParameterUpdater::init(const GradientMachine &gm) {
-  m->updater->init(gm.m->machine->getNonStaticParameters());
-}
-
-void ParameterUpdater::startPass() { m->updater->startPass(); }
-
-void ParameterUpdater::finishPass() { m->updater->finishPass(); }
-
-PassType ParameterUpdater::startBatch(size_t batchSize) {
-  return m->updater->startBatch((int64_t)batchSize);
-}
-
-void ParameterUpdater::finishBatch(float cost) {
-  m->updater->finishBatch(cost);
-}
-
-void ParameterUpdater::update(Parameter *param) {
-  auto paddleParam = param->m->getPtr();
-  m->updater->update(paddleParam);
-}
-
-void ParameterUpdater::getParametersRemote(bool fullSize, bool apply) {
-  m->updater->getParametersRemote(fullSize, apply);
-}
-
-void ParameterUpdater::restore() { m->updater->restore(); }
-
-void ParameterUpdater::apply() { m->updater->apply(); }
-
-void ParameterUpdater::catchUpWith() { m->updater->catchUpWith(); }
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
deleted file mode 100644
index 1b30aec8f6b6b73764886a7c7274be67851e4815..0000000000000000000000000000000000000000
--- a/paddle/api/SequenceGenerator.cpp
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <iterator>
-#include <sstream>
-#include <vector>
-#include "PaddleAPI.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/parameter/Argument.h"
-#include "paddle/utils/Flags.h"
-
-// used to represent partial sequence
-struct Path {
-  std::vector<int> ids;
-  float logProb;
-  paddle::MachineState machineState;
-
-  Path() { logProb = 0; }
-
-  Path(std::vector<int>& ids, float logProb, paddle::MachineState& machineState)
-      : ids(ids), logProb(logProb), machineState(machineState) {}
-
-  bool operator<(const Path& other) const { return (logProb > other.logProb); }
-};
-
-// Return top k (k == beam_size) optimal paths using beam search. The last
-// element of inArgs is the Argument of feedback. gradMachine has MaxIdLayer
-// as output and outArgs thus stores top k labels and their probabilities per
-// position
-static void findNBest(paddle::GradientMachine* gradMachine,
-                      std::vector<paddle::Argument>& inArgs,
-                      std::vector<Path>& finalPaths,
-                      size_t bos_id,
-                      size_t eos_id,
-                      size_t max_length) {
-  std::vector<Path> paths;
-  Path emptyPath;
-  paths.push_back(emptyPath);
-  finalPaths.clear();
-  gradMachine->resetState();
-  paddle::Argument feedback = inArgs.back();
-  feedback.ids->setElement(0, (int)(bos_id));
-  float minFinalPathLogProb = 0;
-  size_t beam = 0;
-  int id;
-  std::vector<paddle::Argument> outArgs;
-  while (true) {  // iterate over each generated word
-    std::vector<Path> newPaths;
-    paddle::MachineState machineState;
-    for (size_t j = 0; j < paths.size(); j++) {
-      Path& path = paths[j];
-      if (path.machineState.size() > 0) {
-        gradMachine->setState(path.machineState);
-        feedback.ids->setElement(0, path.ids.back());
-      }
-      gradMachine->forward(inArgs, &outArgs, paddle::PASS_TEST);
-      gradMachine->getState(machineState);
-      beam = outArgs[0].ids->getSize();
-      for (size_t k = 0; k < beam; k++) {
-        id = outArgs[0].ids->getElement(k);
-        float prob = outArgs[0].in->getElement(0, k);
-        std::vector<int> nids(path.ids);
-        nids.push_back(id);
-        float newLogProb = path.logProb + log(prob);
-        Path newPath(nids, newLogProb, machineState);
-        if (id == (int)eos_id || nids.size() >= max_length) {
-          finalPaths.push_back(newPath);
-          if (minFinalPathLogProb > newPath.logProb) {
-            minFinalPathLogProb = newPath.logProb;
-          }
-        } else {
-          newPaths.push_back(newPath);
-        }
-      }
-    }
-
-    if (newPaths.size() == 0) {
-      break;
-    }
-    std::nth_element(newPaths.begin(),
-                     newPaths.begin() + std::min(beam, newPaths.size()),
-                     newPaths.end());
-    if (newPaths.size() > beam) {
-      newPaths.resize(beam);
-    }
-    // pathA < pathB means pathA.logProb > pathB.logProb
-    float maxPathLogProb =
-        std::min_element(newPaths.begin(), newPaths.end())->logProb;
-    if (finalPaths.size() >= beam && minFinalPathLogProb >= maxPathLogProb) {
-      break;
-    }
-    paths = newPaths;
-  }  // end while
-
-  std::partial_sort(finalPaths.begin(),
-                    finalPaths.begin() + std::min(beam, finalPaths.size()),
-                    finalPaths.end());
-  if (finalPaths.size() > beam) {
-    finalPaths.resize(beam);
-  }
-}
-
-struct SequenceGeneratorPrivate {
-  std::shared_ptr<paddle::GradientMachine> machine;
-  std::shared_ptr<std::vector<std::string>> dict;
-  size_t beginPos;
-  size_t endPos;
-  size_t maxLength;
-
-  paddle::Argument feedback;
-
-  template <typename T>
-  inline T& cast(void* ptr) {
-    return *(T*)(ptr);
-  }
-
-  inline void findNBest(std::vector<paddle::Argument>& inArgs,
-                        std::vector<Path>& path) {
-    ::findNBest(machine.get(), inArgs, path, beginPos, endPos, maxLength);
-  }
-
-  SequenceGeneratorPrivate()
-      : dict(std::make_shared<std::vector<std::string>>()),
-        beginPos(0UL),
-        endPos(0UL),
-        maxLength(0UL),
-        feedback(__create_feedback__()) {}
-
-private:
-  static paddle::Argument __create_feedback__() {
-    paddle::Argument feedback;
-    feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu);
-
-    feedback.sequenceStartPositions =
-        paddle::ICpuGpuVector::create(/* size= */ 2, /* useGpu= */ false);
-    feedback.sequenceStartPositions->getMutableData(false)[0] = 0;
-    feedback.sequenceStartPositions->getMutableData(false)[1] = 1;
-    return feedback;
-  }
-};
-
-SequenceGenerator::SequenceGenerator() : m(new SequenceGeneratorPrivate()) {}
-
-SequenceGenerator::~SequenceGenerator() { delete m; }
-
-class PathSequenceResults : public ISequenceResults {
-  // ISequenceResults interface
-public:
-  PathSequenceResults(const std::shared_ptr<std::vector<Path>>& path,
-                      const std::shared_ptr<std::vector<std::string>>& dict)
-      : path_(path), dict_(dict) {}
-
-  size_t getSize() const { return path_->size(); }
-  std::string getSentence(size_t id, bool split) const throw(RangeError) {
-    if (id < getSize()) {
-      Path& p = (*path_)[id];
-      std::ostringstream sout;
-      std::transform(p.ids.begin(),
-                     p.ids.end(),
-                     std::ostream_iterator<std::string>(sout, split ? " " : ""),
-                     [&](int id) { return (*dict_)[id]; });
-      return sout.str();
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-  std::vector<int> getSequence(size_t id) const throw(RangeError) {
-    if (id < getSize()) {
-      Path& p = (*path_)[id];
-      return p.ids;
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-  float getScore(size_t id) const throw(RangeError) {
-    if (id < getSize()) {
-      Path& p = (*path_)[id];
-      return p.logProb;
-    } else {
-      RangeError e;
-      throw e;
-    }
-  }
-
-private:
-  std::shared_ptr<std::vector<Path>> path_;
-  std::shared_ptr<std::vector<std::string>> dict_;
-};
-
-ISequenceResults* SequenceGenerator::generateSequence(
-    const Arguments& inArgs) const {
-  auto& in_args =
-      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
-  for (auto& arg : in_args) {
-    arg.sequenceStartPositions = m->feedback.sequenceStartPositions;
-  }
-  in_args.push_back(m->feedback);
-  auto path = std::make_shared<std::vector<Path>>();
-  m->findNBest(in_args, *path);
-  return new PathSequenceResults(path, m->dict);
-}
-
-SequenceGenerator* SequenceGenerator::createByGradientMachineSharedPtr(
-    void* ptr) {
-  SequenceGenerator* r = new SequenceGenerator();
-  r->m->machine = r->m->cast<std::shared_ptr<paddle::GradientMachine>>(ptr);
-  return r;
-}
-
-void SequenceGenerator::setDict(const std::vector<std::string>& dict) {
-  *m->dict = dict;
-}
-
-void SequenceGenerator::setBos(size_t bos) { m->beginPos = bos; }
-
-void SequenceGenerator::setEos(size_t eos) { m->endPos = eos; }
-
-void SequenceGenerator::setMaxLength(size_t maxLength) {
-  m->maxLength = maxLength;
-}
-
-void SequenceGenerator::setBeamSize(size_t beamSize) {
-  if (beamSize != -1UL) {
-    FLAGS_beam_size = beamSize;
-  }
-}
-
-ISequenceResults::~ISequenceResults() {}
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
deleted file mode 100644
index 795460b65051b4ec0d9772d2503f123c4a6ea3d0..0000000000000000000000000000000000000000
--- a/paddle/api/Trainer.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
-
-#include <stdlib.h>
-#include <atomic>
-#include <memory>
-
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/trainer/ParamUtil.h"
-#include "paddle/trainer/Trainer.h"
-#include "paddle/trainer/TrainerInternal.h"
-#include "paddle/utils/Flags.h"
-
-using paddle::real;
-
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_int32(start_pass);
-
-struct TrainerPrivate : public paddle::Trainer {
-  bool _trainOneBatch(size_t batchSize);
-  bool forwardOneBatch(size_t batchSize);
-  void forwardOneDataBatch(const std::vector<paddle::Argument>& inArgs);
-  void setBatchSize(size_t batchSize);
-  std::vector<paddle::Argument>& getForwardOutput();
-
-  void startTestPeriod();
-  void finishTestPeriod();
-  void testOneDataBatch(const paddle::DataBatch& dataBatch);
-  TrainerPrivate() : paddle::Trainer() {}
-};
-
-Trainer::Trainer() : m(new TrainerPrivate()) {
-  auto conf = paddle::TrainerConfigHelper::createFromFlags();
-  if (conf != nullptr) {
-    m->init(conf);
-  }
-}
-
-Trainer::~Trainer() { delete m; }
-
-Trainer* Trainer::createByCommandLine() throw(IOError) {
-  auto retv = new Trainer();
-  if (retv->m->getConfig().IsInitialized()) {
-    return retv;
-  } else {
-    throw IOError();
-  }
-}
-
-Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
-    : m(new TrainerPrivate()) {
-  m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
-}
-
-Trainer* Trainer::create(TrainerConfig* config,
-                         GradientMachine* gm) throw(IOError) {
-  auto retv = new Trainer(config, gm);
-  if (retv->m->getConfig().IsInitialized()) {
-    return retv;
-  } else {
-    retv->m->getConfig().CheckInitialized();
-    throw IOError();
-  }
-}
-
-void Trainer::startTrain() { m->startTrain(); }
-
-void Trainer::finishTrain() { m->finishTrain(); }
-
-void Trainer::startTrainPass() { m->startTrainPass(); }
-
-void Trainer::finishTrainPass() { m->finishTrainPass(); }
-
-void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) {
-  paddle::DataBatch dataBatch;
-  dataBatch.getStreams() = inArgs.m->outputs;
-  dataBatch.setSize(batchSize);
-  m->trainOneDataBatch(dataBatch);
-}
-
-bool Trainer::trainOneBatch(size_t batchSize) {
-  return m->_trainOneBatch(batchSize);
-}
-
-bool TrainerPrivate::_trainOneBatch(size_t batchSize) {
-  paddle::DataBatch dataBatch;
-  CHECK(dataProvider_) << "data_provider is not specified";
-  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-  if (num == 0) {
-    return false;
-  }
-  trainOneDataBatch(dataBatch);
-  return false;
-}
-
-void TrainerPrivate::startTestPeriod() {
-  if (!tester_) {
-    createTester();
-  }
-  tester_->startTestPeriod();
-}
-
-void Trainer::startTestPeriod() { m->startTestPeriod(); }
-
-void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) {
-  tester_->testOneDataBatch(dataBatch, &forwardOutput_);
-}
-
-void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
-  paddle::DataBatch dataBatch;
-  dataBatch.getStreams() = args.m->outputs;
-  dataBatch.setSize(batchSize);
-  m->testOneDataBatch(dataBatch);
-}
-
-void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
-void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
-
-Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
-  auto nn = this->m->getGradientMachine();
-  CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
-  auto arg = nn->getLayerOutput(layerName);
-  return Arguments::createByPaddleArgument(&arg);
-}
-
-void Trainer::forwardOneBatch(size_t batchSize) {
-  m->forwardOneBatch(batchSize);
-}
-
-bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
-  CHECK(dataProvider_) << "data_provider is not specified";
-  paddle::DataBatch dataBatch;
-  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-  if (num == 0) {
-    return false;
-  }
-
-  forwardOneDataBatch(dataBatch.getStreams());
-  return true;
-}
-
-void TrainerPrivate::forwardOneDataBatch(
-    const std::vector<paddle::Argument>& inArgs) {
-  std::vector<paddle::Argument>& outArgs = forwardOutput_;
-
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    trainerInternal_.getGradientMachine()->prefetch(inArgs);
-    trainerInternal_.getParameterUpdater()->getParametersRemote();
-  }
-  trainerInternal_.getGradientMachine()->forward(
-      inArgs, &outArgs, paddle::PASS_TEST);
-}
-
-Arguments* Trainer::getForwardOutput() {
-  return Arguments::createByPaddleArgumentVector(&m->getForwardOutput());
-}
-
-std::vector<paddle::Argument>& TrainerPrivate::getForwardOutput() {
-  return forwardOutput_;
-}
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
deleted file mode 100644
index 618e87e96459674302d8b468c3ac410e8d3af6a8..0000000000000000000000000000000000000000
--- a/paddle/api/Util.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-
-#include "paddle/parameter/Parameter.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Util.h"
-
-#include <algorithm>
-#include <iostream>
-#include <iterator>
-
-void initPaddle(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-}
-
-FloatArray::FloatArray(const float* b, const size_t l)
-    : buf(b), length(l), needFree(false) {}
-
-IntArray::IntArray(const int* b, const size_t l, bool f)
-    : buf(b), length(l), needFree(f) {}
-
-IntWithFloatArray::IntWithFloatArray(const float* v,
-                                     const int* i,
-                                     size_t l,
-                                     bool f)
-    : valBuf(v), idxBuf(i), length(l), needFree(f) {}
-
-bool isUsingGpu() { return FLAGS_use_gpu; }
-
-void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
-
-bool isGpuVersion() {
-#ifndef PADDLE_WITH_CUDA
-  return false;
-#else
-  return true;
-#endif
-}
-
-int getTrainerCount() { return FLAGS_trainer_count; }
-
-static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
-              "The Parameter Type should be same in core/api and core/common");
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
deleted file mode 100644
index e2a7b974ca78ae3e6e0e66c206a40c8811126b53..0000000000000000000000000000000000000000
--- a/paddle/api/Vector.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PaddleAPI.h"
-
-#include "paddle/math/Vector.h"
-
-#include <cstring>
-
-struct IVectorPrivate {
-  paddle::IVectorPtr vec;
-};
-
-IVector::IVector() : m(new IVectorPrivate()) {}
-
-IVector* IVector::createZero(size_t sz, bool useGpu) {
-  auto v = new IVector();
-  v->m->vec = paddle::IVector::create(sz, useGpu);
-  v->m->vec->zeroMem();
-  return v;
-}
-
-IVector* IVector::create(const std::vector<int>& data, bool useGpu) {
-  auto v = new IVector();
-  v->m->vec = paddle::IVector::create(data.size(), useGpu);
-  v->m->vec->copyFrom(data.data(), data.size());
-  return v;
-}
-
-IVector* IVector::createVectorFromNumpy(int* data,
-                                        int dim,
-                                        bool copy,
-                                        bool useGpu) throw(UnsupportError) {
-  if (useGpu) {
-    /// if use gpu only copy=true is supported
-    if (!copy) {
-      throw UnsupportError("Gpu mode only supports copy=True");
-    }
-    return IVector::createGpuVectorFromNumpy(data, dim);
-  } else {
-    return IVector::createCpuVectorFromNumpy(data, dim, copy);
-  }
-}
-
-IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) {
-  auto v = new IVector();
-  if (copy) {
-    v->m->vec = paddle::IVector::create(dim, false);
-    v->m->vec->copyFrom(data, dim);
-  } else {
-    v->m->vec = paddle::IVector::create(data, dim, false);
-  }
-  return v;
-}
-
-IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) {
-  auto v = new IVector();
-  v->m->vec = paddle::IVector::create(dim, true);
-  v->m->vec->copyFrom(data, dim);
-  return v;
-}
-
-bool IVector::isGpu() const {
-  return dynamic_cast<paddle::GpuIVector*>(m->vec.get()) != nullptr;
-}
-
-IntArray IVector::getData() const {
-  if (this->isGpu()) {
-    int* src = m->vec->getData();
-    size_t len = m->vec->getSize();
-    int* dest = new int[len];
-    hl_memcpy_device2host(dest, src, len * sizeof(int));
-    return IntArray(dest, len, true);
-  } else {
-    return IntArray(m->vec->getData(), m->vec->getSize());
-  }
-}
-
-int& IVector::operator[](const size_t idx) throw(RangeError, UnsupportError) {
-  if (this->isGpu()) {
-    UnsupportError e;
-    throw e;
-  } else {
-    if (idx >= m->vec->getSize()) {
-      RangeError e;
-      throw e;
-    }
-  }
-  return m->vec->getData()[idx];
-}
-
-const int& IVector::operator[](const size_t idx) const
-    throw(RangeError, UnsupportError) {
-  return (*const_cast<IVector*>(this))[idx];
-}
-
-IVector* IVector::createByPaddleVectorPtr(void* ptr) {
-  auto* p = (paddle::IVectorPtr*)ptr;
-  if ((*p) != nullptr) {
-    IVector* vec = new IVector();
-    vec->m->vec = *p;
-    return vec;
-  } else {
-    return nullptr;
-  }
-}
-
-IVector::~IVector() { delete m; }
-
-void* IVector::getSharedPtr() const { return &m->vec; }
-
-size_t IVector::getSize() const { return m->vec->getSize(); }
-
-void IVector::toNumpyArrayInplace(int** data, int* dim1) throw(UnsupportError) {
-  auto v = std::dynamic_pointer_cast<paddle::CpuIVector>(m->vec);
-  if (v) {
-    *data = v->getData();
-    *dim1 = v->getSize();
-  } else {
-    throw UnsupportError();
-  }
-}
-
-void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
-  *dim1 = m->vec->getSize();
-  *view_m_data = new int[*dim1];
-  if (auto cpuVec = dynamic_cast<paddle::CpuIVector*>(m->vec.get())) {
-    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
-  } else if (auto gpuVec = dynamic_cast<paddle::GpuIVector*>(m->vec.get())) {
-    hl_memcpy_device2host(
-        *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
-  } else {
-    LOG(INFO) << "Unexpected situation";
-  }
-}
-
-void IVector::copyFromNumpyArray(int* data, int dim) {
-  m->vec->resize(dim);
-  m->vec->copyFrom(data, dim);
-}
-
-struct VectorPrivate {
-  paddle::VectorPtr vec;
-
-  void safeAccessData(const size_t idx,
-                      const std::function<void(float&)>& func) const
-      throw(RangeError, UnsupportError) {
-    auto cpuVec = std::dynamic_pointer_cast<const paddle::CpuVector>(vec);
-    if (cpuVec != nullptr) {
-      if (idx < vec->getSize()) {
-        func(vec->getData()[idx]);
-      } else {
-        throw RangeError();
-      }
-    } else {
-      throw UnsupportError();
-    }
-  }
-};
-
-Vector::Vector() : m(new VectorPrivate()) {}
-
-Vector::~Vector() { delete m; }
-
-Vector* Vector::createZero(size_t sz, bool useGpu) {
-  auto retVec = new Vector();
-  retVec->m->vec = paddle::Vector::create(sz, useGpu);
-  retVec->m->vec->zero();
-  return retVec;
-}
-
-Vector* Vector::create(const std::vector<float>& data, bool useGpu) {
-  auto retVec = new Vector();
-  retVec->m->vec = paddle::Vector::create(data.size(), useGpu);
-  retVec->m->vec->copyFrom(data.data(), data.size());
-  return retVec;
-}
-
-Vector* Vector::createByPaddleVectorPtr(void* ptr) {
-  auto& v = *(paddle::VectorPtr*)(ptr);
-  if (v == nullptr) {
-    return nullptr;
-  } else {
-    auto retVec = new Vector();
-    retVec->m->vec = v;
-    return retVec;
-  }
-}
-
-Vector* Vector::createVectorFromNumpy(float* data,
-                                      int dim,
-                                      bool copy,
-                                      bool useGpu) throw(UnsupportError) {
-  if (useGpu) {
-    /// if use gpu only copy=True is supported
-    if (!copy) {
-      throw UnsupportError("Gpu mode only supports copy=True");
-    }
-    return Vector::createGpuVectorFromNumpy(data, dim);
-  } else {
-    return Vector::createCpuVectorFromNumpy(data, dim, copy);
-  }
-}
-
-Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) {
-  CHECK_GT(dim, 0);
-  auto retVec = new Vector();
-  if (copy) {
-    retVec->m->vec = paddle::Vector::create((size_t)dim, false);
-    retVec->m->vec->copyFrom(data, dim);
-  } else {
-    retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false);
-  }
-  return retVec;
-}
-
-Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) {
-  CHECK_GT(dim, 0);
-  auto retVec = new Vector();
-  retVec->m->vec = paddle::Vector::create((size_t)dim, true);
-  retVec->m->vec->copyFrom(data, (size_t)dim);
-  return retVec;
-}
-
-void Vector::toNumpyArrayInplace(float** view_data,
-                                 int* dim1) throw(UnsupportError) {
-  auto v = std::dynamic_pointer_cast<paddle::CpuVector>(m->vec);
-  if (v != nullptr) {
-    *view_data = v->getData();
-    *dim1 = (int)v->getSize();
-  } else {
-    throw UnsupportError();
-  }
-}
-
-void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
-  *dim1 = m->vec->getSize();
-  *view_m_data = new float[*dim1];
-  if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
-    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
-  } else if (auto gpuVec = dynamic_cast<paddle::GpuVector*>(m->vec.get())) {
-    hl_memcpy_device2host(
-        *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
-  } else {
-    LOG(INFO) << "Unexpected situation";
-  }
-}
-
-void Vector::copyFromNumpyArray(float* data, int dim) {
-  m->vec->resize(dim);
-  m->vec->copyFrom(data, dim);
-}
-
-FloatArray Vector::getData() const {
-  if (this->isGpu()) {
-    float* src = m->vec->getData();
-    size_t len = m->vec->getSize();
-    float* dest = new float[len];
-    hl_memcpy_device2host(dest, src, len * sizeof(float));
-    FloatArray ret_val(dest, len);
-    ret_val.needFree = true;
-    return ret_val;
-  } else {
-    FloatArray ret_val(m->vec->getData(), m->vec->getSize());
-    return ret_val;
-  }
-}
-
-void Vector::copyFrom(Vector* src) throw(RangeError) {
-  if (src->m->vec->getSize() != m->vec->getSize()) {
-    throw RangeError();
-  }
-  m->vec->copyFrom(*src->m->vec);
-}
-
-bool Vector::isGpu() const {
-  return std::dynamic_pointer_cast<paddle::GpuVector>(m->vec) != nullptr;
-}
-
-float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) {
-  float r;
-  m->safeAccessData(idx, [&](float& o) { r = o; });
-  return r;
-}
-
-void Vector::set(const size_t idx, float val) throw(RangeError,
-                                                    UnsupportError) {
-  m->safeAccessData(idx, [&](float& o) { o = val; });
-}
-
-size_t Vector::getSize() const { return m->vec->getSize(); }
-
-void* Vector::getSharedPtr() { return &m->vec; }
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
deleted file mode 100644
index 761aeb5b174105edece8880a9f5012c13a63fd11..0000000000000000000000000000000000000000
--- a/paddle/api/test/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-py_test(testTrain SRCS testTrain.py)
-py_test(testMatrix SRCS testMatrix.py)
-py_test(testVector SRCS testVector.py)
-py_test(testTrainer SRCS testTrainer.py)
-py_test(testArguments SRCS testArguments.py)
-py_test(testGradientMachine SRCS testGradientMachine.py)
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
deleted file mode 100644
index ebb083c5a477d5be91ef14be74dd9de349d07931..0000000000000000000000000000000000000000
--- a/paddle/capi/CMakeLists.txt
+++ /dev/null
@@ -1,121 +0,0 @@
-if (WITH_DOUBLE)
-  set(PADDLE_FLOAT_TYPE double)
-else ()
-  set(PADDLE_FLOAT_TYPE float)
-endif()
-
-execute_process(
-  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-  OUTPUT_VARIABLE PADDLE_GIT_COMMIT
-  RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
-  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(NOT PADDLE_GIT_COMMIT)
-  set(PADDLE_GIT_COMMIT "no commit information")
-endif()
-
-# config.h used for C-API. It will store Paddle building configuration as a
-# header. Make user just include PaddleCAPI.h then can get building
-# configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their
-# libraries.
-configure_file(config.h.in config.h @ONLY)
-
-# PaddleCAPI.h is the only header we exposed. It currently only used for model
-# inference.
-file(GLOB CAPI_HEADERS *.h)
-set(CAPI_PRIVATE_HEADER capi_private.h)
-list(REMOVE_ITEM CAPI_HEADERS ${CAPI_PRIVATE_HEADER})
-file(GLOB CAPI_SOURCES *.cpp)
-
-# building paddle_capi
-add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
-  ${CAPI_SOURCES})
-
-target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
-
-add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
-  ${CAPI_PRIVATE_HEADER})
-
-add_dependencies(paddle_capi paddle_proto)
-
-# TODO: paddle_capi_whole will be removed.
-set(PADDLE_CAPI_LAYERS_LIBS
-    paddle_function
-    paddle_gserver)
-if(MOBILE_INFERENCE)
-  set(PADDLE_CAPI_ENGINE_LIBS
-      paddle_utils
-      paddle_parameter
-      paddle_math
-      paddle_cuda
-      paddle_proto)
-else()
-  set(PADDLE_CAPI_ENGINE_LIBS
-      paddle_utils
-      paddle_parameter
-      paddle_math
-      paddle_cuda
-      paddle_proto
-      paddle_pserver
-      paddle_network)
-endif()
-set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
-cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
-
-# Link the static library for inference
-cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
-cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
-
-# Link the shared library for inference
-if(NOT IOS)
-  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map")
-  add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
-  set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
-  link_paddle_exe(paddle_capi_shared)
-endif()
-
-# install library & headers.
-install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
-install(FILES paddle_capi.map DESTINATION include/paddle)
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
-if(ANDROID)
-  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared
-          ARCHIVE DESTINATION lib/${ANDROID_ABI}
-          LIBRARY DESTINATION lib/${ANDROID_ABI})
-  execute_process(
-    COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE GIT_COMMITS_LIST
-    RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
-    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-  if(${GIT_COMMITS_LIST_RESULT})
-    set(GIT_COMMITS_LIST "No commits.")
-  endif()
-  install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt
-          \"Compiler:\n\"
-          \"\\t${CMAKE_C_COMPILER}\\n\"
-          \"\\t${CMAKE_CXX_COMPILER}\\n\"
-          \"Compiler Flags:\\n\"
-          \"\\t${CMAKE_F_FLAGS}\\n\"
-          \"\\t${CMAKE_CXX_FLAGS}\\n\"
-          \"Android API: ${CMAKE_SYSTEM_VERSION}\\n\"
-          \"Lastest commit:\\n\"
-          \"\\t${GIT_COMMITS_LIST}\\n\"
-      )"
-  )
-else(ANDROID)
-  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib)
-  if(NOT IOS)
-    install(TARGETS paddle_capi_shared DESTINATION lib)
-  endif()
-endif(ANDROID)
-
-# this variable used for unittest
-set(PADDLE_CAPI_INC_PATH
-  ${CMAKE_CURRENT_BINARY_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR})
-
-if (WITH_TESTING)
-  add_subdirectory(tests)
-endif()
diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
deleted file mode 100644
index 0a289dede65406facf1f1cba584f4330f2569214..0000000000000000000000000000000000000000
--- a/paddle/capi/Main.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fenv.h>
-#include <stdlib.h>
-#include <string.h>
-#include <vector>
-#include "capi_private.h"
-#include "main.h"
-#include "paddle/trainer/TrainerConfigHelper.h"
-#include "paddle/utils/Excepts.h"
-#include "paddle/utils/PythonUtil.h"
-
-static void initPaddle(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-}
-
-extern "C" {
-paddle_error paddle_init(int argc, char** argv) {
-  static bool isInit = false;
-  if (isInit) return kPD_NO_ERROR;
-
-  std::vector<char*> realArgv;
-  realArgv.reserve(argc + 1);
-  realArgv.push_back(strdup(""));
-  for (int i = 0; i < argc; ++i) {
-    realArgv.push_back(argv[i]);
-  }
-  initPaddle(argc + 1, realArgv.data());
-  free(realArgv[0]);
-  isInit = true;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_init_thread() {
-  if (FLAGS_use_gpu) {
-    hl_init(FLAGS_gpu_id);
-  }
-  return kPD_NO_ERROR;
-}
-}
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
deleted file mode 100644
index 24b0020636c0141b87dc80f5079f7342ec28157c..0000000000000000000000000000000000000000
--- a/paddle/capi/Matrix.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi_private.h"
-#include "hl_cuda.h"
-#include "matrix.h"
-
-#define cast(v) paddle::capi::cast<paddle::capi::CMatrix>(v)
-extern "C" {
-paddle_matrix paddle_matrix_create(uint64_t height,
-                                   uint64_t width,
-                                   bool useGpu) {
-  auto ptr = new paddle::capi::CMatrix();
-  ptr->mat = paddle::Matrix::create(height, width, false, useGpu);
-  return ptr;
-}
-
-paddle_matrix paddle_matrix_create_none() {
-  return new paddle::capi::CMatrix();
-}
-
-paddle_error paddle_matrix_destroy(paddle_matrix mat) {
-  if (mat == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  delete ptr;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_matrix_set_row(paddle_matrix mat,
-                                   uint64_t rowID,
-                                   paddle_real* rowArray) {
-  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
-  paddle::real* buf = ptr->mat->getRowBuf(rowID);
-  size_t width = ptr->mat->getWidth();
-#ifdef PADDLE_WITH_CUDA
-  hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
-#else
-  std::copy(rowArray, rowArray + width, buf);
-#endif
-  return kPD_NO_ERROR;
-}
-
-PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                            paddle_real* value) {
-  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  paddle::real* buf = ptr->mat->getRowBuf(0);
-  size_t width = ptr->mat->getWidth();
-  size_t height = ptr->mat->getHeight();
-  if (ptr->mat->useGpu()) {
-#ifdef PADDLE_WITH_CUDA
-    hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
-#else
-    return kPD_NOT_SUPPORTED;
-#endif
-  } else {
-    std::copy(value, value + width * height, buf);
-  }
-  return kPD_NO_ERROR;
-}
-
-PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                            paddle_real* result) {
-  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  paddle::real* buf = ptr->mat->getRowBuf(0);
-  size_t width = ptr->mat->getWidth();
-  size_t height = ptr->mat->getHeight();
-  if (ptr->mat->useGpu()) {
-#ifdef PADDLE_WITH_CUDA
-    hl_memcpy(result, buf, width * height * sizeof(paddle::real));
-#else
-    return kPD_NOT_SUPPORTED;
-#endif
-  } else {
-    std::copy(buf, buf + width * height, result);
-  }
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_matrix_get_row(paddle_matrix mat,
-                                   uint64_t rowID,
-                                   paddle_real** rawRowBuffer) {
-  if (mat == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (ptr->mat == nullptr) return kPD_NULLPTR;
-  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
-  *rawRowBuffer = ptr->mat->getRowBuf(rowID);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_matrix_get_shape(paddle_matrix mat,
-                                     uint64_t* height,
-                                     uint64_t* width) {
-  if (mat == nullptr) return kPD_NULLPTR;
-  if (height != nullptr) {
-    *height = cast(mat)->mat->getHeight();
-  }
-  if (width != nullptr) {
-    *width = cast(mat)->mat->getWidth();
-  }
-  return kPD_NO_ERROR;
-}
-}
-
-paddle_matrix paddle_matrix_create_sparse(
-    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  auto ptr = new paddle::capi::CMatrix();
-  ptr->mat = paddle::Matrix::createSparseMatrix(
-      height,
-      width,
-      nnz,
-      isBinary ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
-      paddle::SPARSE_CSR,
-      false,
-      useGpu);
-  return ptr;
-#else
-  return nullptr;
-#endif
-}
-
-paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
-                                            int* rowArray,
-                                            uint64_t rowSize,
-                                            int* colArray,
-                                            uint64_t colSize,
-                                            float* valueArray,
-                                            uint64_t valueSize) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  if (mat == nullptr) return kPD_NULLPTR;
-  auto ptr = cast(mat);
-  if (rowArray == nullptr || colArray == nullptr ||
-      (valueSize != 0 && valueArray == nullptr) || ptr->mat == nullptr) {
-    return kPD_NULLPTR;
-  }
-  if (auto sparseMat = dynamic_cast<paddle::CpuSparseMatrix*>(ptr->mat.get())) {
-    std::vector<int> row(rowSize);
-    row.assign(rowArray, rowArray + rowSize);
-    std::vector<int> col(colSize);
-    col.assign(colArray, colArray + colSize);
-    std::vector<paddle_real> val(valueSize);
-    if (valueSize) {
-      val.assign(valueArray, valueArray + valueSize);
-    }
-    sparseMat->copyFrom(row, col, val);
-    return kPD_NO_ERROR;
-  } else {
-    return kPD_NOT_SUPPORTED;
-  }
-#else
-  return kPD_NOT_SUPPORTED;
-#endif
-}
diff --git a/paddle/capi/capi_private.h b/paddle/capi/capi_private.h
deleted file mode 100644
index 3332f42a4a6e57fed6ddb20cf7d759d67e7240b5..0000000000000000000000000000000000000000
--- a/paddle/capi/capi_private.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "capi.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Argument.h"
-#pragma once
-
-namespace paddle {
-namespace capi {
-
-enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE };
-
-#define STRUCT_HEADER CType type;
-
-struct CHeader {
-  STRUCT_HEADER
-};
-
-struct CIVector {
-  STRUCT_HEADER
-  IVectorPtr vec;
-
-  CIVector() : type(kIVECTOR) {}
-};
-
-struct CMatrix {
-  STRUCT_HEADER
-  MatrixPtr mat;
-
-  CMatrix() : type(kMATRIX) {}
-};
-
-struct CArguments {
-  STRUCT_HEADER
-  std::vector<paddle::Argument> args;
-
-  CArguments() : type(kARGUMENTS) {}
-
-  template <typename T>
-  paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) {
-    if (ID >= args.size()) return kPD_OUT_OF_RANGE;
-    switch (nestedLevel) {
-      case 0:
-        callback(args[ID].sequenceStartPositions);
-        break;
-      case 1:
-        callback(args[ID].subSequenceStartPositions);
-        break;
-      default:
-        return kPD_OUT_OF_RANGE;
-    }
-    return kPD_NO_ERROR;
-  }
-};
-
-struct CGradientMachine {
-  STRUCT_HEADER
-  paddle::GradientMachinePtr machine;
-
-  CGradientMachine() : type(kGRADIENT_MACHINE) {}
-};
-
-template <typename T>
-inline T* cast(void* ptr) {
-  return reinterpret_cast<T*>(ptr);
-}
-}  // namespace capi
-}  // namespace paddle
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
deleted file mode 100644
index ea9aab00e3d05f1e2ef0c91eab93b67e0a3d5f37..0000000000000000000000000000000000000000
--- a/paddle/capi/gradient_machine.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "gradient_machine.h"
-#include "capi_private.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-
-#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v)
-
-enum GradientMatchineCreateMode {
-  CREATE_MODE_NORMAL = 0,
-  CREATE_MODE_TESTING = 4
-};
-
-namespace paddle {
-
-class MyNeuralNetwork : public NeuralNetwork {
-public:
-  MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
-      : NeuralNetwork(name, network) {}
-};
-
-NeuralNetwork* newCustomNerualNetwork(const std::string& name,
-                                      NeuralNetwork* network) {
-  return new MyNeuralNetwork(name, network);
-}
-}  // namespace paddle
-
-extern "C" {
-paddle_error paddle_gradient_machine_create_for_inference(
-    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) {
-  if (modelConfigProtobuf == nullptr) return kPD_NULLPTR;
-  paddle::ModelConfig config;
-  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
-      !config.IsInitialized()) {
-    return kPD_PROTOBUF_ERROR;
-  }
-
-  auto ptr = new paddle::capi::CGradientMachine();
-  ptr->machine.reset(paddle::GradientMachine::create(
-      config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
-  *machine = ptr;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_create_for_inference_with_parameters(
-    paddle_gradient_machine* machine, void* mergedModel, uint64_t size) {
-  if (mergedModel == nullptr) return kPD_NULLPTR;
-  std::istringstream is(std::string(static_cast<char*>(mergedModel), size));
-  int64_t modelConfigSize = 0;
-  is.read((char*)(&modelConfigSize), sizeof(modelConfigSize));
-  std::string modelConfigProtobuf;
-  modelConfigProtobuf.resize(modelConfigSize);
-  is.read(&modelConfigProtobuf[0], modelConfigSize);
-  paddle::TrainerConfig config;
-  paddle::ModelConfig modelConfig;
-  if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) {
-    if (!modelConfig.ParseFromString(modelConfigProtobuf) ||
-        !modelConfig.IsInitialized()) {
-      return kPD_PROTOBUF_ERROR;
-    }
-  } else {
-    modelConfig = config.model_config();
-  }
-  auto ptr = new paddle::capi::CGradientMachine();
-  ptr->machine.reset(paddle::GradientMachine::create(
-      modelConfig, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
-  std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters();
-  for (auto& para : parameters) {
-    para->load(is);
-  }
-
-  *machine = ptr;
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
-  delete cast(machine);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_load_parameter_from_disk(
-    paddle_gradient_machine machine, const char* path) {
-  auto m = cast(machine);
-  if (m == nullptr || path == nullptr || m->machine == nullptr)
-    return kPD_NULLPTR;
-  m->machine->loadParameters(path);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine,
-                                             paddle_arguments inArgs,
-                                             paddle_arguments outArgs,
-                                             bool isTrain) {
-  auto m = cast(machine);
-  auto in = paddle::capi::cast<paddle::capi::CArguments>(inArgs);
-  auto out = paddle::capi::cast<paddle::capi::CArguments>(outArgs);
-  if (m == nullptr || in == nullptr || out == nullptr || m->machine == nullptr)
-    return kPD_NULLPTR;
-  m->machine->forward(
-      in->args, &out->args, isTrain ? paddle::PASS_TRAIN : paddle::PASS_TEST);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_create_shared_param(
-    paddle_gradient_machine origin,
-    void* modelConfigProtobuf,
-    int size,
-    paddle_gradient_machine* slave) {
-  auto o = cast(origin);
-  if (origin == nullptr || slave == nullptr || o->machine == nullptr) {
-    return kPD_NULLPTR;
-  }
-  paddle::ModelConfig config;
-  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
-      !config.IsInitialized()) {
-    return kPD_PROTOBUF_ERROR;
-  }
-
-  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
-      new paddle::capi::CGradientMachine());
-  auto nn = paddle::NeuralNetwork::create(config);
-  nn->init(config,
-           [&o](int paramId, paddle::Parameter* param) {
-             auto p = o->machine->getParameters()[paramId];
-             param->enableSharedType(paddle::PARAMETER_VALUE,
-                                     p->getBuf(paddle::PARAMETER_VALUE));
-           },
-           {paddle::PARAMETER_VALUE},
-           false);
-  ptr->machine.reset(nn);
-  *slave = ptr.release();
-  return kPD_NO_ERROR;
-}
-}
-
-paddle_error paddle_gradient_machine_randomize_param(
-    paddle_gradient_machine machine) {
-  auto m = cast(machine);
-  if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR;
-  m->machine->randParameters();
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_get_layer_output(
-    paddle_gradient_machine machine,
-    const char* layerName,
-    paddle_arguments args) {
-  auto m = cast(machine);
-  auto out = paddle::capi::cast<paddle::capi::CArguments>(args);
-  if (m == nullptr || layerName == nullptr || out == nullptr ||
-      m->machine == nullptr) {
-    return kPD_NULLPTR;
-  }
-
-  auto layerOutput = m->machine->getLayerOutput(layerName);
-  out->args.push_back(layerOutput);
-  return kPD_NO_ERROR;
-}
-
-paddle_error paddle_gradient_machine_release_layer_output(
-    paddle_gradient_machine machine) {
-  auto m = cast(machine);
-  if (m == nullptr || m->machine == nullptr) {
-    return kPD_NULLPTR;
-  }
-  m->machine->releaseOutput();
-  return kPD_NO_ERROR;
-}
diff --git a/paddle/capi/tests/test_Arguments.cpp b/paddle/capi/tests/test_Arguments.cpp
deleted file mode 100644
index bb08adf716bfd6e3c88747616e538e9da89a0e25..0000000000000000000000000000000000000000
--- a/paddle/capi/tests/test_Arguments.cpp
+++ /dev/null
@@ -1,129 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <functional>
-#include "capi.h"
-#include "gtest/gtest.h"
-#include "paddle/utils/ThreadLocal.h"
-
-static std::vector<paddle_real> randomBuffer(size_t bufSize) {
-  auto& eng = paddle::ThreadLocalRandomEngine::get();
-  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
-  std::vector<paddle_real> retv;
-  retv.reserve(bufSize);
-  for (size_t i = 0; i < bufSize; ++i) {
-    retv.push_back(dist(eng));
-  }
-  return retv;
-}
-
-TEST(CAPIArguments, create) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle_arguments args = paddle_arguments_create_none();
-  uint64_t size;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(args, &size));
-  ASSERT_EQ(0UL, size);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-TEST(CAPIArguments, value) {
-  paddle_arguments args = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
-
-  paddle_matrix mat = paddle_matrix_create(128, 64, false);
-  for (size_t i = 0; i < 128; ++i) {
-    std::vector<paddle_real> sampleBuf = randomBuffer(64);
-    paddle_matrix_set_row(mat, i, sampleBuf.data());
-  }
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(args, 0, mat));
-
-  paddle_matrix val = paddle_matrix_create_none();
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(args, 0, val));
-
-  for (size_t i = 0; i < 128; ++i) {
-    paddle_real* row1;
-    paddle_real* row2;
-
-    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, i, &row1));
-    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(val, i, &row2));
-    ASSERT_EQ(row1, row2);
-  }
-
-  paddle_ivector ivec = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-TEST(CAPIArguments, ids) {
-  paddle_arguments args = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
-
-  paddle_ivector ivec;
-  int array[3] = {1, 2, 3};
-  ivec = paddle_ivector_create(array, 3, true, false);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_ids(args, 0, ivec));
-
-  paddle_ivector val = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_ids(args, 0, val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-template <typename T1, typename T2>
-void testSequenceHelper(T1 setter, T2 getter) {
-  paddle_arguments args = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
-
-  paddle_ivector ivec;
-  int array[3] = {1, 2, 3};
-  ivec = paddle_ivector_create(array, 3, true, false);
-  ASSERT_EQ(kPD_NO_ERROR, setter(args, 0, ivec));
-
-  paddle_ivector val = paddle_ivector_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, getter(args, 0, val));
-  uint64_t size;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size));
-
-  int* rawBuf;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get(val, &rawBuf));
-  for (size_t i = 0; i < size; ++i) {
-    ASSERT_EQ(array[i], rawBuf[i]);
-  }
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
-}
-
-TEST(CAPIArguments, Sequence) {
-  auto testSequence = [](uint32_t nestedLevel) {
-    testSequenceHelper(std::bind(paddle_arguments_set_sequence_start_pos,
-                                 std::placeholders::_1,
-                                 std::placeholders::_2,
-                                 nestedLevel,
-                                 std::placeholders::_3),
-                       std::bind(paddle_arguments_get_sequence_start_pos,
-                                 std::placeholders::_1,
-                                 std::placeholders::_2,
-                                 nestedLevel,
-                                 std::placeholders::_3));
-  };
-  for (uint32_t i = 0; i < 2; ++i) {  // test seq and sub-seq.
-    testSequence(i);
-  }
-}
diff --git a/paddle/capi/tests/test_GradientMachine.cpp b/paddle/capi/tests/test_GradientMachine.cpp
deleted file mode 100644
index 73b9e477b2a2749250e878cf2174dcf4cc599be1..0000000000000000000000000000000000000000
--- a/paddle/capi/tests/test_GradientMachine.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/gserver/gradientmachines/GradientMachine.h>
-#include <paddle/trainer/TrainerConfigHelper.h>
-#include <stdlib.h>
-#include <string.h>
-#include <type_traits>
-#include "capi.h"
-#include "paddle/utils/ThreadLocal.h"
-
-static std::vector<paddle_real> randomBuffer(size_t bufSize) {
-  auto& eng = paddle::ThreadLocalRandomEngine::get();
-  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
-  std::vector<paddle_real> retv;
-  retv.reserve(bufSize);
-  for (size_t i = 0; i < bufSize; ++i) {
-    retv.push_back(dist(eng));
-  }
-  return retv;
-}
-
-TEST(GradientMachine, testPredict) {
-  //! TODO(yuyang18): Test GPU Code.
-  paddle::TrainerConfigHelper config("./test_predict_network.py");
-  std::string buffer;
-  ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer));
-  paddle_gradient_machine machine;
-
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_create_for_inference(
-                &machine, &buffer[0], (int)buffer.size()));
-  std::unique_ptr<paddle::GradientMachine> gm(
-      paddle::GradientMachine::create(config.getModelConfig()));
-  ASSERT_NE(nullptr, gm);
-  gm->randParameters();
-  gm->saveParameters("./");
-
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_load_parameter_from_disk(machine, "./"));
-
-  paddle_gradient_machine machineSlave;
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_create_shared_param(
-                machine, &buffer[0], (int)buffer.size(), &machineSlave));
-  std::swap(machineSlave, machine);
-  paddle_arguments outArgs = paddle_arguments_create_none();
-
-  paddle_arguments inArgs = paddle_arguments_create_none();
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1));
-  paddle_matrix mat = paddle_matrix_create(1, 100, false);
-  static_assert(std::is_same<paddle_real, paddle::real>::value, "");
-
-  auto data = randomBuffer(100);
-  paddle_real* rowPtr;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
-  memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real));
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat));
-  ASSERT_EQ(kPD_NO_ERROR,
-            paddle_gradient_machine_forward(machine, inArgs, outArgs, false));
-
-  uint64_t sz;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz));
-  ASSERT_EQ(1UL, sz);
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat));
-  std::vector<paddle::Argument> paddleInArgs;
-  std::vector<paddle::Argument> paddleOutArgs;
-  paddleInArgs.resize(1);
-  paddleInArgs[0].value =
-      paddle::Matrix::create(data.data(), 1, 100, false, false);
-
-  gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST);
-
-  auto matPaddle = paddleOutArgs[0].value;
-
-  uint64_t height, width;
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
-  ASSERT_EQ(matPaddle->getHeight(), height);
-  ASSERT_EQ(matPaddle->getWidth(), width);
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
-  for (size_t i = 0; i < width; ++i) {
-    ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5);
-  }
-
-  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs));
-  std::swap(machineSlave, machine);
-  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave));
-  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine));
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  std::vector<char*> argvs;
-  argvs.push_back(strdup("--use_gpu=false"));
-  paddle_init((int)argvs.size(), argvs.data());
-  for (auto each : argvs) {
-    free(each);
-  }
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/contrib/CMakeLists.txt b/paddle/contrib/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4b19256ef4533a09162edf907f6cd51146517e46
--- /dev/null
+++ b/paddle/contrib/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_subdirectory(inference)
diff --git a/paddle/contrib/float16/.gitignore b/paddle/contrib/float16/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..dd28d354f4160b4be68b46a7bebcdf2097d5811a
--- /dev/null
+++ b/paddle/contrib/float16/.gitignore
@@ -0,0 +1 @@
+*.inference.model
diff --git a/paddle/contrib/float16/README.md b/paddle/contrib/float16/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..58b4a50666bfb622af8acbce29355f2a4a870a82
--- /dev/null
+++ b/paddle/contrib/float16/README.md
@@ -0,0 +1,171 @@
+# Float16 Inference in PaddlePaddle Fluid
+
+Kexin Zhao <zhaokexin01@baidu.com>
+
+## Introduction
+Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data.  The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32).  Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16).
+
+This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16.
+
+
+## What is float16?
+float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference.
+
+Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type.
+
+## Why float16?
+The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold:
+
+1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of the float16 data type. As a result, the whole memory footprint of float16 inference is roughly half of its float counterpart, which is especially useful when deploying inference on mobile devices with limited available memory. Also given the same available memory, the maximum batch size for float16 inference is about twice that for float inference.
+
+2. Because float16 occupies less memory than float, in theory, hardware devices can achieve much higher floating point operators per second (FLOPS) for float16 data than float data. Right now, NVIDIA's latest Volta GPUs, including Tesla V100 and Titan V, can deliver significantly higher FLOPS for float16 using Tensor Cores. Moreover, float16 takes less time to read from or write to memory, and hence float16 can make inference more efficient especially in memory-bound applications where the performance is mostly affected by how fast it is to read and write data.
+
+3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less than its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers.
+
+## Fluid implementation of float16 inference
+### Overview
+Fluid use [Program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block. 
+
+### Basic requirement
+When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs.
+
+If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type. 
+
+The same principle applies if we want a program to run in float16 mode. We provide input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirements for float16 inference are to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax.
+
+### float16 transpiler
+Furthermore, we need a transpiler to write float16 inference code similar to the following:
+
+```python
+# Get the float32 inference program and load the associated float32 weights
+[inference_program, feed_target_names,
+ fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+# Prepare the float input data
+batch_size = 1
+tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype(numpy.float32)
+
+# Running inference_program in float mode
+float_results = exe.run(inference_program,
+                        feed={feed_target_names[0]: tensor_img},
+                        fetch_list=fetch_targets)
+
+# Use float16 transpiler to speedup
+float16_inference_program = float_inference_program.clone()
+t = Float16Transpiler()
+t.transpile(float16_inference_program, GPUPlace)
+
+# Running float16_inference_program in float16 mode using the same input data
+float16_results = exe.run(float16_inference_program,
+                          feed={feed_target_names[0]: tensor_img},
+                          fetch_list=fetch_targets)
+
+# Do some tests to verify the correctness of float16 inference
+...
+np.testing.assert_almost_equal(float_results, float16_results, ...)
+...
+
+# Save the float16 inference program and float16 weights for future deployment
+fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
+                              fetch_targets, exe,
+                              float16_inference_program)
+```
+
+In this scenario, we already have a float32 inference program and some associated float32 weights. We can simply use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights.
+
+We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor.
+
+The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference).
+
+### Experiment results
+Simply running the following commands to reproduce the experiment results presented in this section:
+
+```bash
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+# This line will generate a paddle development docker image with cuda 8 and cudnn 7
+# If you want test on cuda 9 instead, change the line 5 in Paddle/Dockerfile 
+# from `FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04`
+# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations
+nvidia-docker build -t paddle:float16 .
+# After running this, different results will be written to different log files in Paddle/contrib/float16/
+nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh
+```
+
+#### Accuracy
+As is mentioned before, DNN inference has been found to be tolerant against the loss of precision and range incurred by float16, and we want to see how good this tolerance is.
+
+We train a resnet32 model using cifar10 data set, save it when test set accuracy is above 60%, and then test the inference accuracy on the 10000 examples of the cifar10 test set in float16 and float32 mode, respectively.
+
+We repeat the test ten times and get the following results:
+
+|        | float16 | float32  |
+|--------|--------:|--------: |
+| # 1    | 62.75%  | 62.72%   |
+| # 2    | 61.27%  | 61.28%   |
+| # 3    | 62.24%  | 62.23%   |
+| # 4    | 64.16%  | 64.17%   |
+| # 5    | 60.75%  | 60.77%   |
+| # 6    | 63.25%  | 63.24%   |
+| # 7    | 62.15%  | 62.13%   |
+| # 8    | 62.05%  | 62.02%   |
+| # 9    | 65.19%  | 65.20%   |
+| #10    | 62.53%  | 62.48%   |
+| average| 62.63%  | 62.62%   |
+
+We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests. 
+
+#### Performance benchmark
+Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart. 
+
+NVIDIA started to support its native float16 data type (which has the same internal memory representation as Fluid's float16 class) on CUDA 7.5. Moreover, float16 speedups on computationally intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cuBLAS 7.5 and cuDNN 5.0.
+
+Recently, the introduction of [Tensor Core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in Volta architecture GPUs and the support of Tensor Core computation in CUDA 9.0 and cuDNN 7 make float16 genuinely superior to float in some deep learning applications.
+
+We thus benchmark the float16 inference performance on a single NVIDIA Tesla V100 GPU (Volta architecture and with Tensor Cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respective to different mini-batch(mb) sizes.
+
+Average inference time for one mini-batch on Vgg16 model tested on ImageNet dataset:
+
+| total | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  |
+|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
+|float32| 14.01 | 9.70  | 22.99 | 28.26 | 53.87  | 84.42 | 178.95 | 
+|float16|  3.32 | 4.11  |  5.88 |  9.41 | 16.54  | 30.47 |  60.23 |
+|Speedup|  4.22 | 2.36  |  3.91 |  3.00 |  3.26  |  2.77 |   2.97 |
+
+We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes. 
+
+Convolution operation is ususally the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows:
+
+|conv op| mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | 
+|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
+|float32| 11.95 | 6.96  | 18.65 | 21.42 | 41.35  | 60.58 | 130.11 |
+|float16|  1.78 | 2.10  |  2.93 |  4.55 |  7.99  | 14.63 |  28.67 |
+|Speedup|  6.71 | 3.31  |  6.37 |  4.71 |  5.18  |  4.14 |   4.54 |
+
+Fluid convolution operator uses cuDNN 7 to implement the kernel, and we can see that with the help of Tensor Core, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better.
+
+Similarly, we also list the benchmark results of Resnet50 model tested on the ImageNet dataset:
+
+| total | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
+|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
+|float32| 7.03  | 7.41  | 9.16  | 12.55 | 21.13  | 38.27 | 67.93  | 127.02 | 
+|float16| 6.13  | 6.32  | 6.24  |  7.40 | 10.90  | 18.18 | 33.20  |  64.52 |
+|Speedup| 1.15  | 1.17  | 1.47  |  1.70 |  1.94  |  2.11 |  2.05  |   1.97 |
+
+|conv op| mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
+|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
+|float32| 5.43  | 5.46  | 6.50  | 8.36  | 13.80  | 24.45 | 41.21  | 73.44  |
+|float16| 4.19  | 4.30  | 3.96  | 4.21  |  5.63  |  8.77 | 15.24  | 28.40  |
+|Speedup| 1.30  | 1.27  | 1.64  | 1.99  |  2.45  |  2.79 |  2.70  |  2.59  |
+
+We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increases to about 2x for larger batch sizes. A similar trend can be found for the time spent on the convolution operator. Note that right now Tensor Cores will only be utilized in the convolution operation when the input data and filter meet specific dimensional requirements. The speedup by float16 inference for Resnet50 is smaller than the Vgg16 counterpart partially because the convolution operation in Resnet is much simpler than its Vgg counterpart and this makes the tensor core less utilized in Resnet than in Vgg.
+
+We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference.
+
+Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/contrib/float16/float16_benchmark.md) for complete benchmark results.
+
+### Summary
+1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode.
+2. The accuracy of float16 inference is verified to be almost identical to its float32 counterpart at least on CNN models.
+3. float16 inference provides a significant speedup on large and computationally intensive Vgg16 model on ImageNet dataset. For the much smaller and simpler Resnet50 model, the speedup provided by float16 inference is less significant than for Vgg16 model but still favorable, especially for large batch sizes.
+4. We cannot achieve the superior float16 inference performance without the help of the newly introduced Tensor Cores on NVIDIA Volta architecture GPUs.
diff --git a/paddle/contrib/float16/float16_benchmark.md b/paddle/contrib/float16/float16_benchmark.md
new file mode 100644
index 0000000000000000000000000000000000000000..b51d6bde92fa04d2268afa36b9c4bd18bc28fe73
--- /dev/null
+++ b/paddle/contrib/float16/float16_benchmark.md
@@ -0,0 +1,97 @@
+# float16 benchmark
+
+## Description
+We want to compare the inference benchmark of float16 vs float32 on the "image_classification" example on Nvidia Tesla V100 GPU, where we can enable the tensor core computation for float16 mode. We test Vgg16 and Resnet50 on the imagenet data set, and Vgg16 and Resnet32 on the cifar10 data set. For completeness, we also add the inference benchmark of Vgg16 and Resnet50 on imagenet data set tested on Nvidia GeForce GTX 1080 Ti GPU.
+
+For more details about tensor core, please refer to https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/
+
+## Test environment
+- GPU: single Nvidia Tesla V100 or single Nvidia GeForce GTX 1080 Ti 
+- CUDNN: 7.1.1
+- CUDA: 9.0
+- Code: https://github.com/PaddlePaddle/Paddle/pull/10331 (Tensor core is enabled in float16 mode)
+
+## Benchmark on V100
+All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia V100 GPU with respective to different mini-batch(mb) sizes.
+
+### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]):
+
+Total inference time for one batch:
+
+|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  |
+|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
+|float32| 14.01 | 9.70  | 22.99 | 28.26 | 53.87  | 84.42 | 178.95 | 
+|float16|  3.32 | 4.11  |  5.88 |  9.41 | 16.54  | 30.47 |  60.23 |
+|Speedup|  4.22 | 2.36  |  3.91 |  3.00 |  3.26  |  2.77 |   2.97 |
+
+Total time spent on conv op for one batch:
+
+|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | 
+|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|
+|float32| 11.95 | 6.96  | 18.65 | 21.42 | 41.35  | 60.58 | 130.11 |
+|float16|  1.78 | 2.10  |  2.93 |  4.55 |  7.99  | 14.63 |  28.67 |
+|Speedup|  6.71 | 3.31  |  6.37 |  4.71 |  5.18  |  4.14 |   4.54 |
+
+
+### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]):
+
+Total inference time for one batch:
+
+|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
+|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
+|float32| 7.03  | 7.41  | 9.16  | 12.55 | 21.13  | 38.27 | 67.93  | 127.02 | 
+|float16| 6.13  | 6.32  | 6.24  |  7.40 | 10.90  | 18.18 | 33.20  |  64.52 |
+|Speedup| 1.15  | 1.17  | 1.47  |  1.70 |  1.94  |  2.11 |  2.05  |   1.97 |
+
+Total time spent on conv op for one batch:
+
+|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32 | mb=64  | mb=128 |
+|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:|
+|float32| 5.43  | 5.46  | 6.50  | 8.36  | 13.80  | 24.45 | 41.21  | 73.44  |
+|float16| 4.19  | 4.30  | 3.96  | 4.21  |  5.63  |  8.77 | 15.24  | 28.40  |
+|Speedup| 1.30  | 1.27  | 1.64  | 1.99  |  2.45  |  2.79 |  2.70  |  2.59  |
+
+
+### Vgg16 on cifar10 (image.shape = [3, 32, 32]):
+
+Total inference time for one batch:
+
+|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
+|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:| 
+|float32| 3.13 | 3.17 | 3.19 | 3.58 | 3.98  | 6.23  | 8.42  | 13.44  | 24.19  | 44.97  | 
+|float16| 2.72 | 2.77 | 2.76 | 2,88 | 2.96  | 3.24  | 4.01  |  5.78  |  9.65  | 17.37  |
+|Speedup| 1.15 | 1.14 | 1.16 | 1.24 | 1.34  | 1.92  | 2.10  |  2.33  |  2.51  |  2.59  |
+
+
+### Resnet32 on cifar10 (image.shape = [3, 32, 32]):
+
+Total inference time for one batch:
+
+|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 |
+|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:|
+|float32| 3.11 | 3.14 | 2.99 | 3.04 | 3.10  | 3.28  | 4.47  | 6.86   | 11.63  | 21.16  |
+|float16| 3.70 | 3.81 | 3.75 | 3.83 | 3.77  | 3.97  | 3.92  | 4.15   |  6.41  | 11.02  | 
+|Speedup|      |      |      |      |       |       | 1.14  | 1.65   |  1.81  |  1.92  |
+
+
+## Benchmark on 1080 Ti
+All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia GeForce GTX 1080 Ti GPU with respective to different mini-batch(mb) sizes.
+
+### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]):
+Total inference time for one batch:
+
+|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32  |
+|-------|-----: |-----: |-----: |-----: |------: |-------:|
+|float32| 5.60  | 9.38  | 15.86 | 29.79 | 57.60  | 117.73 |
+|float16| 4.99  | 7.79  | 13.47 | 26.02 | 52.30  | 102.34 |
+|Speedup| 1.12  | 1.20  |  1.18 |  1.15 |  1.10  |   1.15 |
+
+
+### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]):
+Total inference time for one batch:
+
+|       | mb=1  | mb=2  | mb=4  | mb=8  | mb=16  | mb=32  | mb=64  |
+|-------|-----: |-----: |-----: |-----: |------: |-------:|-------:|
+|float32| 5.63  | 6.23  | 8.85  | 14.71 | 26.07  | 52.86  | 108.95 |
+|float16| 5.89  | 6.44  | 7.94  | 12.57 | 22.03  | 45.06  |  92.68 |
+|Speedup|       |       | 1.12  |  1.17 |  1.18  |  1.17  |   1.18 |
diff --git a/paddle/contrib/float16/float16_inference_demo.py b/paddle/contrib/float16/float16_inference_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..063227d5d2586d66ad4091133a8edf014da839f8
--- /dev/null
+++ b/paddle/contrib/float16/float16_inference_demo.py
@@ -0,0 +1,362 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from float16_transpiler import Float16Transpiler
+
+import argparse
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import math
+import sys
+import numpy as np
+import os
+
+parser = argparse.ArgumentParser(
+    'Float16 inference accuracy test and benchmark.')
+parser.add_argument(
+    '--train_batch_size', type=int, default=16, help="Batch size for training.")
+parser.add_argument(
+    '--inf_batch_size', type=int, default=32, help="Batch size for inference.")
+parser.add_argument(
+    '--repeat', type=int, default=1, help="How many times to run the test.")
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'imagenet'],
+    help="Optional dataset for benchmark.")
+parser.add_argument(
+    '--model',
+    type=str,
+    default='vgg',
+    choices=['vgg', 'resnet'],
+    help="Optional model for benchmark.")
+parser.add_argument(
+    '--threshold',
+    type=float,
+    default=0.005,
+    help='Save inference model when test accuracy reach this threshold.')
+parser.add_argument('--learning_rate', type=float, default=0.001)
+args = parser.parse_args()
+
+
+def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+    conv1 = fluid.layers.conv2d(
+        input=input,
+        filter_size=filter_size,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv1, act=act)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]
+    if ch_in != ch_out:
+        return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+    else:
+        return input
+
+
+def basicblock(input, ch_out, stride):
+    short = shortcut(input, ch_out, stride)
+    conv1 = conv_bn_layer(input, ch_out, 3, stride, 1)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def bottleneck(input, ch_out, stride):
+    short = shortcut(input, ch_out * 4, stride)
+    conv1 = conv_bn_layer(input, ch_out, 1, stride, 0)
+    conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1)
+    conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None)
+    return fluid.layers.elementwise_add(x=short, y=conv3, act='relu')
+
+
+def layer_warp(block_func, input, ch_out, count, stride):
+    res_out = block_func(input, ch_out, stride)
+    for i in range(1, count):
+        res_out = block_func(res_out, ch_out, 1)
+    return res_out
+
+
+def resnet_imagenet(input, depth=50):
+    cfg = {
+        18: ([2, 2, 2, 1], basicblock),
+        34: ([3, 4, 6, 3], basicblock),
+        50: ([3, 4, 6, 3], bottleneck),
+        101: ([3, 4, 23, 3], bottleneck),
+        152: ([3, 8, 36, 3], bottleneck)
+    }
+    stages, block_func = cfg[depth]
+    conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3)
+    pool1 = fluid.layers.pool2d(
+        input=conv1, pool_type='avg', pool_size=3, pool_stride=2)
+    res1 = layer_warp(block_func, pool1, 64, stages[0], 1)
+    res2 = layer_warp(block_func, res1, 128, stages[1], 2)
+    res3 = layer_warp(block_func, res2, 256, stages[2], 2)
+    res4 = layer_warp(block_func, res3, 512, stages[3], 2)
+    pool2 = fluid.layers.pool2d(
+        input=res4,
+        pool_size=7,
+        pool_type='avg',
+        pool_stride=1,
+        global_pooling=True)
+    return pool2
+
+
+def resnet_cifar10(input, depth=32):
+    assert (depth - 2) % 6 == 0
+
+    n = (depth - 2) // 6
+
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+
+
+def vgg16(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
+    return fc2
+
+
+def train(place, save_dirname):
+    if args.data_set == "cifar10":
+        class_dim = 10
+        data_shape = [3, 32, 32]
+    elif args.data_set == "imagenet":
+        class_dim = 102
+        data_shape = [3, 224, 224]
+    else:
+        raise ValueError("%s dataset is not supported" % data_set)
+
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.model == "vgg":
+        print("train vgg")
+        net = vgg16(images)
+    elif args.model == "resnet":
+        print("train resnet")
+        if args.data_set == "cifar10":
+            net = resnet_cifar10(images)
+        elif args.data_set == "imagenet":
+            net = resnet_imagenet(images)
+        else:
+            raise ValueError("%s dataset is not supported" % args.data_set)
+    else:
+        raise ValueError("%s network is not supported" % args.model)
+
+    predict = fluid.layers.fc(input=net, size=class_dim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    acc = fluid.layers.accuracy(input=predict, label=label)
+
+    #Test program
+    test_program = fluid.default_main_program().clone(for_test=True)
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimizer.minimize(avg_cost)
+
+    BATCH_SIZE = args.train_batch_size
+    PASS_NUM = 100
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.flowers.train()
+            if args.data_set == 'imagenet' else paddle.dataset.cifar.train10(),
+            buf_size=128 * 10),
+        batch_size=args.train_batch_size)
+
+    test_reader = paddle.batch(
+        paddle.dataset.flowers.test()
+        if args.data_set == 'imagenet' else paddle.dataset.cifar.test10(),
+        batch_size=args.inf_batch_size)
+
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+
+    exe.run(fluid.default_startup_program())
+    main_program = fluid.default_main_program()
+
+    for pass_id in range(PASS_NUM):
+        for batch_id, data in enumerate(train_reader()):
+            train_image = np.array(
+                map(lambda x: x[0].reshape(data_shape), data)).astype("float32")
+            train_label = np.array(map(lambda x: x[1], data)).astype("int64")
+            train_label = train_label.reshape([-1, 1])
+
+            exe.run(main_program,
+                    feed={'pixel': train_image,
+                          'label': train_label})
+
+            if (batch_id % 100) == 0:
+                acc_list = []
+                avg_loss_list = []
+                for tid, test_data in enumerate(test_reader()):
+                    test_image = np.array(
+                        map(lambda x: x[0].reshape(data_shape),
+                            test_data)).astype("float32")
+                    test_label = np.array(map(lambda x: x[1],
+                                              test_data)).astype("int64")
+                    test_label = test_label.reshape([-1, 1])
+
+                    loss_t, acc_t = exe.run(
+                        program=test_program,
+                        feed={"pixel": test_image,
+                              "label": test_label},
+                        fetch_list=[avg_cost, acc])
+                    if math.isnan(float(loss_t)):
+                        sys.exit("got NaN loss, training failed.")
+                    acc_list.append(float(acc_t))
+                    avg_loss_list.append(float(loss_t))
+
+                acc_value = np.array(acc_list).mean()
+                avg_loss_value = np.array(avg_loss_list).mean()
+
+                print(
+                    'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Accuracy {3:2.2}'.
+                    format(pass_id, batch_id + 1,
+                           float(avg_loss_value), float(acc_value)))
+
+                if acc_value > args.threshold:
+                    print(
+                        'Save inference model with test accuracy of {0} at {1}'.
+                        format(float(acc_value), save_dirname))
+                    fluid.io.save_inference_model(save_dirname, ["pixel"],
+                                                  [predict], exe)
+                    return
+
+
+def test_accuracy(executor, inference_program, feed_target_names,
+                  fetch_targets):
+    if args.data_set == "cifar10":
+        data_shape = [3, 32, 32]
+    elif args.data_set == "imagenet":
+        data_shape = [3, 224, 224]
+    else:
+        raise ValueError("%s dataset is not supported" % data_set)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10()
+        if args.data_set == "cifar10" else paddle.dataset.flowers.test(),
+        batch_size=args.inf_batch_size)
+
+    test_num = 0
+    correct_num = 0
+
+    for test_data in test_reader():
+        test_image = np.array(
+            map(lambda x: x[0].reshape(data_shape), test_data)).astype(
+                "float32")
+        test_label = np.array(map(lambda x: x[1], test_data)).astype("int64")
+        test_label = test_label.reshape([-1, 1])
+
+        results = executor.run(program=inference_program,
+                               feed={feed_target_names[0]: test_image},
+                               fetch_list=fetch_targets)
+
+        prediction = np.argmax(results[0], axis=1).reshape([-1, 1])
+        correct_num += np.sum(prediction == test_label)
+        test_num += test_label.size
+
+    print("{0} out of {1} predictions are correct.".format(correct_num,
+                                                           test_num))
+    print("Test accuray is {0}.".format(float(correct_num) / float(test_num)))
+
+
+def infer(place, save_dirname):
+    exe = fluid.Executor(place)
+    inference_scope = fluid.core.Scope()
+
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be feeded
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        print("Load inference model from {0}".format(save_dirname))
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+        print("The test set accuracy of inference in float mode is:")
+        test_accuracy(exe, inference_program, feed_target_names, fetch_targets)
+
+        float16_inference_program = inference_program.clone()
+        t = Float16Transpiler()
+        t.transpile(float16_inference_program, place)
+
+        print("The test set accuracy of inference in float16 mode is:")
+        test_accuracy(exe, float16_inference_program, feed_target_names,
+                      fetch_targets)
+
+        fp16_save_dirname = "float16_" + save_dirname
+        fluid.io.save_inference_model(fp16_save_dirname, feed_target_names,
+                                      fetch_targets, exe,
+                                      float16_inference_program)
+
+
+@contextlib.contextmanager
+def scope_prog_guard():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
+
+
+if __name__ == "__main__":
+    if not fluid.core.is_compiled_with_cuda():
+        raise Exception("This test requires CUDA GPUs!")
+
+    place = fluid.CUDAPlace(0)
+    if not fluid.core.is_float16_supported(place):
+        raise Exception(
+            "This test requires compute capability of CUDA GPU >= 5.3!")
+
+    for i in range(args.repeat):
+        with scope_prog_guard():
+            save_dirname = "image_classification_" + args.data_set + "_" + args.model + ".inference.model"
+            train(place, save_dirname)
+            infer(place, save_dirname)
diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..91ba101edb65cd45bd5e37a0c6ad25e515593a81
--- /dev/null
+++ b/paddle/contrib/float16/float16_transpiler.py
@@ -0,0 +1,256 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.framework import Program
+from paddle.fluid.executor import global_scope
+
+
+class Float16Transpiler:
+    def transpile(self, program, place, scope=None):
+        '''
+        Transpile the program desc and cast the weights to float16 data type to
+        enable float16 inference.
+
+        Since the operator in a program desc will automatically choose the
+        right compute kernel to run based on the data type of the input tensor.
+        We actually don't need to change the program desc to run in float16 mode.
+
+        However, in this way, users who are used to feeding and fetching tensors 
+        of float32 data type when running typical inference may find it confusing
+        and difficult to run inference in float16 mode as they need to convert
+        input data to float16 dtype and then convert the results back to float32 
+        dtype to match the rest of code.
+
+        So this function appends cast ops to the program desc where necessary so 
+        that users are able to run inference in float16 mode while providing input 
+        tensor (feed_holder) of float data type and obtaining output tensor 
+        (fetch_holder) of float data type. 
+
+        Moreover, it is desired that when we have the scope and program desc to run
+        inference in float32 mode, we can use a single API to do the necessary 
+        modification and then user can run float16 inference on the fly. To make 
+        this happen, this function also create new parameters in the scope to have the 
+        converted float16 weights and change the operators in program desc to use 
+        these new parameters.
+
+        :param program: program to transpile 
+        :type program: Program
+        :param place: inference place 
+        :type place: Place
+        :param scope: inference scope 
+        :type scope: Scope         
+        '''
+        if not isinstance(program, Program):
+            raise TypeError("program should be as Program type")
+        if not isinstance(place, core.CPUPlace) and not isinstance(
+                place, core.CUDAPlace):
+            raise TypeError("place should be as CPUPlace/CUDAPlace type")
+        if scope is None:
+            scope = global_scope()
+        if not isinstance(scope, core.Scope):
+            raise TypeError("scope should be as Scope type or None")
+
+        self.scope = scope
+        self.place = place
+        self.block = program.block(0)
+        self.input_map = {}  # store the input names should be adjusted 
+
+        self._modify_feed_fetch()
+        self._convert_param_to_float16()
+        self._adjust_input(skip=True)
+        self._remove_unused_var()
+
+        # TODO(luotao): use clone() method to flush the program.desc in force, 
+        # since some large program.desc will not be flushed immediately. 
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    # ====================== private transpiler functions =====================
+    def _adjust_input(self, skip=False):
+        '''
+        Change the input variable name in operators.
+
+        When we are in the process of modifying a program desc, we usually 
+        replace some variables with some other variables, where we create 
+        a dictionary input_map to record the one-to-one correspondence
+        between each old variable and the new one. 
+
+        After that, this function will search all the operators that use the 
+        old variables and change the info in op to use the new variables. There 
+        maybe some exceptions to this rule when we are using the float16 transpiler
+        and insert cast ops to cast float32 variable to float16 one. After we 
+        insert the cast op to cast var_1 to var_1_fp16, we don't want to change 
+        the input of cast op to var_1_fp16 after using this function.     
+        '''
+        skip_ops = {"cast"}
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            if skip and current_op.type in skip_ops:
+                continue
+            for input_arg in current_op.input_arg_names:
+                if input_arg in self.input_map:
+                    current_op.rename_input(input_arg,
+                                            self.input_map[input_arg])
+
+    def _remove_unused_var(self):
+        '''
+        remove unused varibles in program
+        '''
+        args = []
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            args += current_op.input_arg_names
+            args += current_op.output_arg_names
+        args = list(set(args))  # unique the input and output arguments
+
+        for var in self.block.vars.keys():
+            if var not in args:
+                self.block.remove_var(var)
+
+    def _modify_feed_fetch(self):
+        '''
+        Modify feed fetch op/vars for float16 inference.
+
+        For each feed op:
+        feed_op->feed_target_var
+        
+        Change it to:
+        feed_op->feed_target_var->cast_op(from other dtype to float16)->tmp_var
+
+        For each fetch op:
+        fetch_target_var->fetch_op
+
+        Change it to:
+        tmp_var->cast_op(from float16 to other dtype)->fetch_target_var->fetch_op
+
+        :return: None
+        '''
+
+        def find_op(var):
+            # It is possible that var.op is not up to date after some 
+            # modifications to program desc. Here we force to make it up to date.
+            var.op = None
+            for op in self.block.ops:
+                if var.name in op.output_arg_names:
+                    var.op = op
+                    break
+
+            if var.op is None:
+                raise ValueError("The target variable must have an "
+                                 "associated operator that generates it.")
+
+        i = 0
+        while i < len(self.block.ops):
+            cur_op = self.block.ops[i]
+            if cur_op.type == "feed":
+                var_name = cur_op.output("Out")[0]
+                tmp_var_name = var_name + ".fp16"
+                var = self.block.vars[var_name]
+                tmp_var = self.block.create_var(
+                    name=tmp_var_name.encode('ascii'),
+                    type=var.type,
+                    dtype=core.VarDesc.VarType.FP16,
+                    shape=var.shape,
+                    persistable=var.persistable)
+                self.block.insert_op(
+                    i + 1,
+                    type="cast",
+                    inputs={"X": var},
+                    outputs={"Out": tmp_var},
+                    attrs={
+                        'in_dtype': int(var.dtype),
+                        'out_dtype': int(tmp_var.dtype)
+                    })
+                self.input_map[var_name] = tmp_var_name
+                i = i + 1
+            elif cur_op.type == "fetch":
+                var_name = cur_op.input("X")[0]
+                tmp_var_name = var_name + ".fp16"
+                var = self.block.vars[var_name]
+                tmp_var = self.block.create_var(
+                    name=tmp_var_name.encode('ascii'),
+                    type=var.type,
+                    dtype=core.VarDesc.VarType.FP16,
+                    shape=var.shape,
+                    persistable=var.persistable)
+                find_op(var)
+                var.op.rename_output(var_name, tmp_var_name)
+                self.block.insert_op(
+                    i,
+                    type="cast",
+                    inputs={"X": tmp_var},
+                    outputs={"Out": var},
+                    attrs={
+                        'in_dtype': int(tmp_var.dtype),
+                        'out_dtype': int(var.dtype)
+                    })
+                i = i + 1
+            i = i + 1
+
+    def _convert_param_to_float16(self):
+        def _get_no_fp16_conversion_var_names():
+            '''
+            Get the set of input variable names that shouldn't be converted to float16.
+
+            When we want to run inference in float16 mode, most parameters need to be 
+            firstly converted to float16. However, there are some parameters that 
+            shouldn't be converted to float16 because the corresponding operator 
+            requires float32 parameters even in float16 mode (when the input data is 
+            of float16 data type). Currently, the only operator that has this exclusion 
+            is the batch norm op.
+
+            :return: set of input variable names 
+            :type var_names: set         
+            '''
+            op_names = {'batch_norm'}
+            var_names = []
+            for op in self.block.ops:
+                if op.type in op_names:
+                    var_names += op.input_arg_names
+            return set(var_names)
+
+        def _should_be_converted(var):
+            return var.persistable and \
+                   var.name not in self.no_conversion_vars and \
+                   var.type != core.VarDesc.VarType.FEED_MINIBATCH and \
+                   var.type != core.VarDesc.VarType.FETCH_LIST
+
+        self.no_conversion_vars = _get_no_fp16_conversion_var_names()
+        conversion_var_list = filter(_should_be_converted,
+                                     self.block.vars.values())
+        for var in conversion_var_list:
+            fp16_var_name = var.name + ".fp16"
+            fp16_var = self.block.create_parameter(
+                name=fp16_var_name.encode('ascii'),
+                type=var.type,
+                dtype=core.VarDesc.VarType.FP16,
+                shape=var.shape)
+
+            # cast the data in the tensor of the original var to float16
+            # data type and store it in the tensor of the new float16 var
+            self.scope.var(fp16_var_name)
+            fp16_tensor = self.scope.find_var(fp16_var_name).get_tensor()
+            tensor = np.array(self.scope.find_var(var.name).get_tensor())
+            # After the old tensor data is converted to np.float16, view(np.uint16)
+            # is used so that the internal memory of the numpy array will be 
+            # reinterpreted to be of np.uint16 data type, which is binded to fluid 
+            # float16 data type via the help of pybind in tensor_py.h. 
+            fp16_tensor.set(
+                tensor.astype(np.float16).view(np.uint16), self.place)
+
+            # old var will be replaced by the fp16 var in program desc
+            self.input_map[var.name] = fp16_var_name
+            self.block.remove_var(var.name)
diff --git a/paddle/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh
new file mode 100755
index 0000000000000000000000000000000000000000..031225a85dabb26e5d9ea06f58909c049e7f0c08
--- /dev/null
+++ b/paddle/contrib/float16/run_float16_demo.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+BUILD_PATH=/paddle/fp16_build
+WHEEL_PATH=$BUILD_PATH/python/dist
+INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book
+DEMO_PATH=/paddle/paddle/contrib/float16
+
+# Use the single most powerful CUDA GPU on your machine
+export CUDA_VISIBLE_DEVICES=0
+
+# Build the PaddlePaddle Fluid wheel package and install it.
+mkdir -p $BUILD_PATH && cd $BUILD_PATH
+cmake .. -DWITH_AVX=OFF \
+         -DWITH_MKL=OFF \
+         -DWITH_GPU=ON \
+         -DWITH_TESTING=ON \
+         -DWITH_TIMER=ON \
+         -DWITH_PROFILER=ON \
+         -DWITH_FLUID_ONLY=ON
+make -j `nproc`
+pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
+
+cd $DEMO_PATH
+# Clear previous log results
+rm -f *.log
+
+# Test the float16 inference accuracy of resnet32 on cifar10 data set
+stdbuf -oL python float16_inference_demo.py \
+       --data_set=cifar10 \
+       --model=resnet \
+       --threshold=0.6 \
+       --repeat=10 \
+       2>&1 | tee -a float16_inference_accuracy.log
+
+# Sleep to cool down the GPU for consistent benchmarking
+sleep 2m
+
+# benchmarking parameters
+REPEAT=1000
+MAXIMUM_BATCH_SIZE=512
+
+for ((batch_size = 1; batch_size <= MAXIMUM_BATCH_SIZE; batch_size *= 2)); 
+do
+
+  # Test inference benchmark of vgg16 on imagenet
+  stdbuf -oL python float16_inference_demo.py \
+         --data_set=imagenet \
+         --model=vgg \
+         --threshold=0.001 \
+         --repeat=1 \
+
+  $INFER_PATH/test_inference_image_classification_vgg \
+      --dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \
+      --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \
+      --repeat=$REPEAT \
+      --batch_size=$batch_size \
+      --skip_cpu=true \
+      2>&1 | tee -a imagenet_vgg16_benchmark.log
+
+  sleep 2m
+
+  # Test inference benchmark of resnet50 on imagenet
+  stdbuf -oL python float16_inference_demo.py \
+         --data_set=imagenet \
+         --model=resnet \
+         --threshold=0.001 \
+         --repeat=1 \
+
+  $INFER_PATH/test_inference_image_classification_resnet \
+      --dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \
+      --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \
+      --repeat=$REPEAT \
+      --batch_size=$batch_size \
+      --skip_cpu=true \
+      2>&1 | tee -a imagenet_resnet50_benchmark.log
+
+  sleep 2m
+
+  # Test inference benchmark of vgg16 on cifar10
+  stdbuf -oL python float16_inference_demo.py \
+         --data_set=cifar10 \
+         --model=vgg \
+         --threshold=0.001 \
+         --repeat=1 \
+
+  $INFER_PATH/test_inference_image_classification_vgg \
+      --dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \
+      --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \
+      --repeat=$REPEAT \
+      --batch_size=$batch_size \
+      --skip_cpu=true \
+      2>&1 | tee -a cifar10_vgg16_benchmark.log
+
+  sleep 1m
+
+  # Test inference benchmark of resnet32 on cifar10
+  stdbuf -oL python float16_inference_demo.py \
+         --data_set=cifar10 \
+         --model=resnet \
+         --threshold=0.001 \
+         --repeat=1 \
+
+  $INFER_PATH/test_inference_image_classification_vgg \
+      --dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \
+      --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \
+      --repeat=$REPEAT \
+      --batch_size=$batch_size \
+      --skip_cpu=true \
+      2>&1 | tee -a cifar10_resnet32_benchmark.log
+
+  sleep 1m
+
+done
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a8bbb4eb8081420ae0bbaf761bd27303c0d043cb
--- /dev/null
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -0,0 +1,88 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+if(APPLE)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
+endif(APPLE)
+
+
+set(inference_deps paddle_inference_api paddle_fluid_api)
+if(WITH_GPU AND TENSORRT_FOUND)
+    set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine)
+endif()
+
+function(inference_api_test TARGET_NAME)
+    if (WITH_TESTING)
+        set(options "")
+        set(oneValueArgs "")
+        set(multiValueArgs ARGS)
+        cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+        set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+        cc_test(${TARGET_NAME}
+                SRCS ${TARGET_NAME}.cc
+                DEPS "${inference_deps}"
+                ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
+        if(inference_test_ARGS)
+            set_tests_properties(${TARGET_NAME}
+                    PROPERTIES DEPENDS "${inference_test_ARGS}")
+        endif()
+    endif(WITH_TESTING)
+endfunction(inference_api_test)
+
+cc_library(paddle_inference_api
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+cc_library(paddle_inference_api_shared SHARED
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+cc_test(test_paddle_inference_api
+        SRCS test_paddle_inference_api.cc
+        DEPS paddle_inference_api)
+
+inference_api_test(test_paddle_inference_api_impl
+                    ARGS test_word2vec test_image_classification)
+
+if(WITH_GPU AND TENSORRT_FOUND)
+cc_library(paddle_inference_tensorrt_subgraph_engine
+        SRCS paddle_inference_api_tensorrt_subgraph_engine.cc
+        DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api)
+
+inference_api_test(test_paddle_inference_api_tensorrt_subgraph_engine ARGS test_word2vec)
+endif()
+
+if (WITH_ANAKIN) # only needed in CI
+    # Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
+    # so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
+    # compile the libinference_anakin_api.a and compile with anakin.so.
+    nv_library(inference_anakin_api SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    nv_library(inference_anakin_api_shared SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+    target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+    target_compile_options(inference_anakin_api_shared BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+    target_link_libraries(inference_anakin_api anakin anakin_saber_common)
+    target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common)
+    if (WITH_TESTING)
+        cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
+                                  ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
+                                  DEPS inference_anakin_api)
+        target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
+     endif(WITH_TESTING)
+endif()
+
+if(WITH_TESTING)
+    add_subdirectory(demo)
+endif()
diff --git a/paddle/contrib/inference/README.md b/paddle/contrib/inference/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..20969fac6c8f894ffb4a02b48f795e2a0dcbd096
--- /dev/null
+++ b/paddle/contrib/inference/README.md
@@ -0,0 +1,27 @@
+# Embed Paddle Inference in Your Application
+
+Paddle inference offers the APIs in `C` and `C++` languages.
+
+One can easily deploy a model trained by Paddle following the steps as below:
+
+1. Optimize the native model;
+2. Write some codes for deployment.
+
+
+Let's explain the steps in detail.
+
+## Optimize the native Fluid Model
+
+The native model that get from the training phase needs to be optimized for that.
+
+- Clean the noise such as the cost operators that do not need inference;
+- Prune unnecessary computation fork that has nothing to do with the output;
+- Remove extraneous variables;
+- Memory reuse for native Fluid executor;
+- Translate the model storage format to some third-party engine's, so that the inference API can utilize the engine for acceleration;
+
+We have an official tool to do the optimization, call `paddle_inference_optimize --help` for more information.
+
+## Write some codes
+
+Read `paddle_inference_api.h` for more information.
diff --git a/paddle/contrib/inference/demo/CMakeLists.txt b/paddle/contrib/inference/demo/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ecece6fe3471ad7b89c84c3e2b67af4ae9eb3c36
--- /dev/null
+++ b/paddle/contrib/inference/demo/CMakeLists.txt
@@ -0,0 +1,61 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+inference_api_test(simple_on_word2vec ARGS test_word2vec)
+
+option(WITH_INFERENCE_DEMO "Compile with Inference demo" OFF)
+if(NOT WITH_INFERENCE_DEMO)
+  return()
+endif()
+
+set(DEMO_INSTALL_DIR "${PADDLE_BINARY_DIR}/inference_demo")
+set(URL_ROOT http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F)
+
+function(inference_download_test_demo TARGET)
+    if (NOT WITH_TESTING)
+        return()
+    endif()
+    set(options "")
+    set(oneValueArgs URL)
+    set(multiValueArgs SRCS)
+    cmake_parse_arguments(tests "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    set(test_dir "${DEMO_INSTALL_DIR}/${TARGET}")
+    message(STATUS "inference demo ${test_dir}")
+
+    if(NOT EXISTS "${test_dir}")
+        message(STATUS "Download ${TARGET} model from ${tests_URL}")
+        execute_process(COMMAND bash -c "mkdir -p ${test_dir}")
+        execute_process(COMMAND bash -c "cd ${test_dir}; wget -q ${tests_URL}")
+        execute_process(COMMAND bash -c "cd ${test_dir}; tar xzf *.tar.gz")
+    endif()
+
+    cc_test(${TARGET} SRCS "${tests_SRCS}"
+        DEPS paddle_inference_api paddle_fluid
+        ARGS --data=${test_dir}/data.txt
+             --modeldir=${test_dir}/model
+             --refer=${test_dir}/result.txt)
+endfunction()
+
+# disable mobilenet test
+#inference_download_test_demo(mobilenet_inference_demo
+#    SRCS vis_demo.cc
+#    URL ${URL_ROOT}mobilenet.tar.gz)
+inference_download_test_demo(se_resnext50_inference_demo
+    SRCS vis_demo.cc
+    URL ${URL_ROOT}se_resnext50.tar.gz)
+inference_download_test_demo(ocr_inference_demo
+    SRCS vis_demo.cc
+    URL ${URL_ROOT}ocr.tar.gz)
diff --git a/paddle/contrib/inference/demo/README.md b/paddle/contrib/inference/demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f1d256660299a68dc5d9d73dbe4a401a0e7d9680
--- /dev/null
+++ b/paddle/contrib/inference/demo/README.md
@@ -0,0 +1,36 @@
+# Infernce Demos
+
+Input data format:
+
+- Each line contains a single record
+- Each record's format is
+
+```
+<space splitted floats as data>\t<space splitted ints as shape>
+```
+
+Follow the C++ codes in `vis_demo.cc`.
+
+## MobileNet
+
+To execute the demo, simply run
+
+```sh
+./mobilenet_inference_demo --modeldir <model> --data <datafile>
+```
+
+## SE-ResNeXt-50
+
+To execute the demo, simply run
+
+```sh
+./se_resnext50_inference_demo --modeldir <model> --data <datafile>
+```
+
+## OCR
+
+To execute the demo, simply run
+
+```sh
+./ocr_inference_demo --modeldir <model> --data <datafile>
+```
diff --git a/paddle/contrib/inference/demo/simple_on_word2vec.cc b/paddle/contrib/inference/demo/simple_on_word2vec.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c253014642f39a042430992548a285cc7078a959
--- /dev/null
+++ b/paddle/contrib/inference/demo/simple_on_word2vec.cc
@@ -0,0 +1,125 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains a simple demo for how to take a model for inference.
+ */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <memory>
+#include <thread>
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+namespace demo {
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+void Main(bool use_gpu) {
+  //# 1. Create PaddlePredictor with a config.
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  for (int batch_id = 0; batch_id < 3; batch_id++) {
+    //# 2. Prepare input.
+    int64_t data[4] = {1, 2, 3, 4};
+
+    PaddleTensor tensor{.name = "",
+                        .shape = std::vector<int>({4, 1}),
+                        .data = PaddleBuf(data, sizeof(data)),
+                        .dtype = PaddleDType::INT64};
+
+    // For simplicity, we set all the slots with the same data.
+    std::vector<PaddleTensor> slots(4, tensor);
+
+    //# 3. Run
+    std::vector<PaddleTensor> outputs;
+    CHECK(predictor->Run(slots, &outputs));
+
+    //# 4. Get output.
+    ASSERT_EQ(outputs.size(), 1UL);
+    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    const size_t num_elements = outputs.front().data.length() / sizeof(float);
+    // The outputs' buffers are in CPU memory.
+    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+    }
+  }
+}
+
+void MainThreads(int num_threads, bool use_gpu) {
+  // Multi-threads only support on CPU
+  // 0. Create PaddlePredictor with a config.
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto main_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // 1. clone a predictor which shares the same parameters
+      auto predictor = main_predictor->Clone();
+      constexpr int num_batches = 3;
+      for (int batch_id = 0; batch_id < num_batches; ++batch_id) {
+        // 2. Dummy Input Data
+        int64_t data[4] = {1, 2, 3, 4};
+        PaddleTensor tensor{.name = "",
+                            .shape = std::vector<int>({4, 1}),
+                            .data = PaddleBuf(data, sizeof(data)),
+                            .dtype = PaddleDType::INT64};
+        std::vector<PaddleTensor> inputs(4, tensor);
+        std::vector<PaddleTensor> outputs;
+        // 3. Run
+        CHECK(predictor->Run(inputs, &outputs));
+
+        // 4. Get output.
+        ASSERT_EQ(outputs.size(), 1UL);
+        LOG(INFO) << "TID: " << tid << ", "
+                  << "output buffer size: " << outputs.front().data.length();
+        const size_t num_elements =
+            outputs.front().data.length() / sizeof(float);
+        // The outputs' buffers are in CPU memory.
+        for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+          LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+        }
+      }
+    });
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i].join();
+  }
+}
+
+TEST(demo, word2vec_cpu) { Main(false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_1) { MainThreads(1, false /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_cpu_4) { MainThreads(4, false /*use_gpu*/); }
+
+#ifdef PADDLE_WITH_CUDA
+TEST(demo, word2vec_gpu) { Main(true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_1) { MainThreads(1, true /*use_gpu*/); }
+TEST(demo_multi_threads, word2vec_gpu_4) { MainThreads(4, true /*use_gpu*/); }
+#endif
+
+}  // namespace demo
+}  // namespace paddle
diff --git a/paddle/contrib/inference/demo/utils.h b/paddle/contrib/inference/demo/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5330d8d9d89260cfe3d5214e5a4ceb720cffdf1
--- /dev/null
+++ b/paddle/contrib/inference/demo/utils.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+namespace demo {
+
+static void split(const std::string& str,
+                  char sep,
+                  std::vector<std::string>* pieces) {
+  pieces->clear();
+  if (str.empty()) {
+    return;
+  }
+  size_t pos = 0;
+  size_t next = str.find(sep, pos);
+  while (next != std::string::npos) {
+    pieces->push_back(str.substr(pos, next - pos));
+    pos = next + 1;
+    next = str.find(sep, pos);
+  }
+  if (!str.substr(pos).empty()) {
+    pieces->push_back(str.substr(pos));
+  }
+}
+
+/*
+ * Get a summary of a PaddleTensor content.
+ */
+static std::string SummaryTensor(const PaddleTensor& tensor) {
+  std::stringstream ss;
+  int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype);
+
+  ss << "data[:10]\t";
+  switch (tensor.dtype) {
+    case PaddleDType::INT64: {
+      for (int i = 0; i < std::min(num_elems, 10); i++) {
+        ss << static_cast<int64_t*>(tensor.data.data())[i] << " ";
+      }
+      break;
+    }
+    case PaddleDType::FLOAT32:
+      for (int i = 0; i < std::min(num_elems, 10); i++) {
+        ss << static_cast<float*>(tensor.data.data())[i] << " ";
+      }
+      break;
+  }
+  return ss.str();
+}
+
+}  // namespace demo
+}  // namespace paddle
diff --git a/paddle/contrib/inference/demo/vis_demo.cc b/paddle/contrib/inference/demo/vis_demo.cc
new file mode 100644
index 0000000000000000000000000000000000000000..45575f9a862de430236ae20cf498e542a45b1f4b
--- /dev/null
+++ b/paddle/contrib/inference/demo/vis_demo.cc
@@ -0,0 +1,149 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains demo for mobilenet, se-resnext50 and ocr.
+ */
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
+#include <gtest/gtest.h>
+#include <fstream>
+#include <iostream>
+#include "paddle/contrib/inference/demo/utils.h"
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+#endif
+
+namespace paddle {
+namespace demo {
+
+DEFINE_string(modeldir, "", "Directory of the inference model.");
+DEFINE_string(refer, "", "path to reference result for comparison.");
+DEFINE_string(
+    data,
+    "",
+    "path of data; each line is a record, format is "
+    "'<space splitted floats as data>\t<space splitted ints as shape'");
+
+struct Record {
+  std::vector<float> data;
+  std::vector<int32_t> shape;
+};
+
+void split(const std::string& str, char sep, std::vector<std::string>* pieces);
+
+Record ProcessALine(const std::string& line) {
+  LOG(INFO) << "process a line";
+  std::vector<std::string> columns;
+  split(line, '\t', &columns);
+  CHECK_EQ(columns.size(), 2UL)
+      << "data format error, should be <data>\t<shape>";
+
+  Record record;
+  std::vector<std::string> data_strs;
+  split(columns[0], ' ', &data_strs);
+  for (auto& d : data_strs) {
+    record.data.push_back(std::stof(d));
+  }
+
+  std::vector<std::string> shape_strs;
+  split(columns[1], ' ', &shape_strs);
+  for (auto& s : shape_strs) {
+    record.shape.push_back(std::stoi(s));
+  }
+  LOG(INFO) << "data size " << record.data.size();
+  LOG(INFO) << "data shape size " << record.shape.size();
+  return record;
+}
+
+void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
+  std::string line;
+  std::ifstream file(referfile);
+  std::getline(file, line);
+  auto refer = ProcessALine(line);
+  file.close();
+
+  size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
+  LOG(INFO) << "predictor output numel " << numel;
+  LOG(INFO) << "reference output numel " << refer.data.size();
+  EXPECT_EQ(numel, refer.data.size());
+  switch (output.dtype) {
+    case PaddleDType::INT64: {
+      for (size_t i = 0; i < numel; ++i) {
+        EXPECT_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
+      }
+      break;
+    }
+    case PaddleDType::FLOAT32:
+      for (size_t i = 0; i < numel; ++i) {
+        EXPECT_NEAR(
+            static_cast<float*>(output.data.data())[i], refer.data[i], 1e-5);
+      }
+      break;
+  }
+}
+
+/*
+ * Use the native fluid engine to inference the demo.
+ */
+void Main(bool use_gpu) {
+  NativeConfig config;
+  config.param_file = FLAGS_modeldir + "/__params__";
+  config.prog_file = FLAGS_modeldir + "/__model__";
+  config.use_gpu = use_gpu;
+  config.device = 0;
+#ifdef PADDLE_WITH_CUDA
+  config.fraction_of_gpu_memory = FLAGS_fraction_of_gpu_memory_to_use;
+#endif
+
+  LOG(INFO) << "init predictor";
+  auto predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+
+  LOG(INFO) << "begin to process data";
+  // Just a single batch of data.
+  std::string line;
+  std::ifstream file(FLAGS_data);
+  std::getline(file, line);
+  auto record = ProcessALine(line);
+  file.close();
+
+  // Inference.
+  PaddleTensor input{
+      .name = "xx",
+      .shape = record.shape,
+      .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)),
+      .dtype = PaddleDType::FLOAT32};
+
+  LOG(INFO) << "run executor";
+  std::vector<PaddleTensor> output;
+  predictor->Run({input}, &output);
+
+  LOG(INFO) << "output.size " << output.size();
+  auto& tensor = output.front();
+  LOG(INFO) << "output: " << SummaryTensor(tensor);
+
+  // compare with reference result
+  CheckOutput(FLAGS_refer, tensor);
+}
+
+TEST(demo, vis_demo_cpu) { Main(false /*use_gpu*/); }
+#ifdef PADDLE_WITH_CUDA
+TEST(demo, vis_demo_gpu) { Main(true /*use_gpu*/); }
+#endif
+}  // namespace demo
+}  // namespace paddle
diff --git a/paddle/contrib/inference/high_level_api.md b/paddle/contrib/inference/high_level_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..eb92885052a453d8c837bbf6f6e984efb509332a
--- /dev/null
+++ b/paddle/contrib/inference/high_level_api.md
@@ -0,0 +1,60 @@
+# Inference High-level APIs
+This document describes the high-level inference APIs, one can use them to deploy a Paddle model for an application quickly.
+
+The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed for a deployment.
+
+## PaddleTensor
+We provide the `PaddleTensor` data structure to give a general tensor interface.
+
+The definition is 
+
+```c++
+struct PaddleTensor {
+  std::string name;  // variable name.
+  std::vector<int> shape;
+  PaddleBuf data;  // blob of data.
+  PaddleDType dtype;
+};
+```
+
+The data is stored in a continuous memory `PaddleBuf,` and a `PaddleDType` specifies tensor's data type. 
+The `name` field is used to specify the name of an input variable, 
+that is important when there are multiple inputs and need to distinguish which variable to set.
+
+## engine
+The inference APIs has two different underlying engines
+
+- the native engine, which is consists of the native operators and framework,
+- the Anakin engine, which has an Anakin library embedded.
+
+The native engine takes a native Paddle model as input, and supports any model that trained by Paddle, 
+the Anakin engine is faster for some model, 
+but it can only take the Anakin model as input(user need to transform the format first manually) and currently not all Paddle models are supported.
+
+```c++
+enum class PaddleEngineKind {
+  kNative = 0,  // Use the native Fluid facility.
+  kAnakin,      // Use Anakin for inference.
+};
+```
+
+## PaddlePredictor and how to create one
+The main interface is `PaddlePredictor,` there are following methods 
+
+- `bool Run(const std::vector<PaddleTensor>& inputs, std::vector<PaddleTensor>* output_data)`
+  - take inputs and output `output_data.`
+- `Clone` to clone a predictor from an existing one, with model parameter shared.
+
+There is a factory method to help create a predictor, and the user takes the ownership of this object.
+
+```c++
+template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+```
+
+By specifying the engine kind and config, one can get a specific implementation.
+
+## Reference
+
+- [paddle_inference_api.h](./paddle_inference_api.h)
+- [some demos](./demo)
diff --git a/paddle/contrib/inference/high_level_api_cn.md b/paddle/contrib/inference/high_level_api_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..a57f015a4e44d43ee4e475cf606faa6f05e095fa
--- /dev/null
+++ b/paddle/contrib/inference/high_level_api_cn.md
@@ -0,0 +1,87 @@
+# Paddle 预测 API
+
+为了更简单方便的预测部署，Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。
+
+预测库包含:
+
+- 头文件 `paddle_inference_api.h` 定义了所有的接口
+- 库文件`libpaddle_fluid.so` 或 `libpaddle_fluid.a`
+- 库文件 `libpaddle_inference_api.so` 或 `libpaddle_inference_api.a`
+
+下面是详细的一些 API 概念介绍
+
+## PaddleTensor
+
+PaddleTensor 定义了预测最基本的输入输出的数据格式，其定义是
+
+```c++
+struct PaddleTensor {
+  std::string name;  // variable name.
+  std::vector<int> shape;
+  PaddleBuf data;  // blob of data.
+  PaddleDType dtype;
+};
+```
+
+- `name` 用于指定输入数据对应的 模型中variable 的名字 （暂时没有用，但会在后续支持任意 target 时启用）
+- `shape` 表示一个 Tensor 的 shape
+- `data`  数据以连续内存的方式存储在`PaddleBuf` 中，`PaddleBuf` 可以接收外面的数据或者独立`malloc`内存，详细可以参考头文件中相关定义。
+- `dtype` 表示 Tensor 的数据类型
+
+## engine
+
+高层 API 底层有多种优化实现，我们称之为 engine，目前有三种 engine
+
+- 原生 engine，由 paddle 原生的 forward operator 组成，可以天然支持所有paddle 训练出的模型，
+- Anakin engine，封装了 [Anakin](https://github.com/PaddlePaddle/Anakin) ，在某些模型上性能不错，但只能接受自带模型格式，无法支持所有 paddle 模型，
+- TensorRT mixed engine，用子图的方式支持了 [TensorRT](https://developer.nvidia.com/tensorrt) ，支持所有paddle 模型，并自动切割部分计算子图到 TensorRT 上加速（WIP）
+
+其实现为
+
+```c++
+enum class PaddleEngineKind {
+  kNative = 0,       // Use the native Fluid facility.
+  kAnakin,           // Use Anakin for inference.
+  kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops.
+};
+```
+
+## 预测部署过程
+
+总体上分为以下步骤
+
+1. 用合适的配置创建 `PaddlePredictor`
+2. 创建输入用的 `PaddleTensor`，传入到 `PaddlePredictor` 中
+3. 获取输出的 `PaddleTensor` ，将结果取出
+
+下面完整演示一个简单的模型，部分细节代码隐去
+
+```c++
+#include "paddle_inference_api.h"
+
+// 创建一个 config，并修改相关设置
+paddle::NativeConfig config;
+config.model_dir = "xxx";
+config.use_gpu = false;
+// 创建一个原生的 PaddlePredictor
+auto predictor =
+      paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+// 创建输入 tensor
+int64_t data[4] = {1, 2, 3, 4};
+paddle::PaddleTensor tensor{.name = "",
+                            .shape = std::vector<int>({4, 1}),
+                            .data = PaddleBuf(data, sizeof(data)),
+                            .dtype = PaddleDType::INT64};
+// 创建输出 tensor，输出 tensor 的内存可以复用
+std::vector<paddle::PaddleTensor> outputs;
+// 执行预测
+CHECK(predictor->Run(slots, &outputs));
+// 获取 outputs ...
+```
+
+编译时，联编 `libpaddle_fluid.a/.so` 和 `libpaddle_inference_api.a/.so` 便可。 
+
+## 详细代码参考
+
+- [inference demos](./demo)
+- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc)
diff --git a/paddle/contrib/inference/paddle_inference_api.cc b/paddle/contrib/inference/paddle_inference_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea46b3006f8d0964cc8229d3683ee7b602d6ef0d
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.cc
@@ -0,0 +1,78 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+int PaddleDtypeSize(PaddleDType dtype) {
+  switch (dtype) {
+    case PaddleDType::FLOAT32:
+      return sizeof(float);
+    case PaddleDType::INT64:
+      return sizeof(int64_t);
+    default:
+      //
+      assert(false);
+      return -1;
+  }
+}
+
+PaddleBuf::PaddleBuf(PaddleBuf&& other)
+    : data_(other.data_),
+      length_(other.length_),
+      memory_owned_(other.memory_owned_) {
+  other.memory_owned_ = false;
+  other.data_ = nullptr;
+  other.length_ = 0;
+}
+
+PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; }
+
+PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) {
+  // only the buffer with external memory can be copied
+  assert(!other.memory_owned_);
+  data_ = other.data_;
+  length_ = other.length_;
+  memory_owned_ = other.memory_owned_;
+  return *this;
+}
+
+void PaddleBuf::Resize(size_t length) {
+  // Only the owned memory can be reset, the external memory can't be changed.
+  if (length_ == length) return;
+  assert(memory_owned_);
+  Free();
+  data_ = new char[length];
+  length_ = length;
+  memory_owned_ = true;
+}
+
+void PaddleBuf::Reset(void* data, size_t length) {
+  Free();
+  memory_owned_ = false;
+  data_ = data;
+  length_ = length;
+}
+
+void PaddleBuf::Free() {
+  if (memory_owned_ && data_) {
+    assert(length_ > 0);
+    delete static_cast<char*>(data_);
+    data_ = nullptr;
+    length_ = 0;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api.h b/paddle/contrib/inference/paddle_inference_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8ba2d14a5c161d491d838888ea14b776f769f23
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the definition of a simple Inference API for Paddle.
+ *
+ * ATTENTION: It requires some C++11 features, for lower version C++ or C, we
+ * might release another API.
+ */
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+
+enum PaddleDType {
+  FLOAT32,
+  INT64,
+};
+
+class PaddleBuf {
+ public:
+  PaddleBuf() = default;
+  PaddleBuf(PaddleBuf&& other);
+  // Copy only available when memory is managed externally.
+  explicit PaddleBuf(const PaddleBuf&);
+  PaddleBuf& operator=(const PaddleBuf&);
+  // Do not own the memory.
+  PaddleBuf(void* data, size_t length)
+      : data_(data), length_(length), memory_owned_{false} {}
+  // Own memory.
+  PaddleBuf(size_t length)
+      : data_(new char[length]), length_(length), memory_owned_(true) {}
+  // Resize to `length` bytes.
+  void Resize(size_t length);
+  // Reset to external memory.
+  void Reset(void* data, size_t length);
+  bool empty() const { return length_ == 0; }
+  void* data() const { return data_; }
+  size_t length() const { return length_; }
+
+  ~PaddleBuf() { Free(); }
+
+ private:
+  void Free();
+  void* data_{nullptr};  // pointer to the data memory.
+  size_t length_{0};     // number of memory bytes.
+  bool memory_owned_{true};
+};
+
+struct PaddleTensor {
+  PaddleTensor() = default;
+  std::string name;  // variable name.
+  std::vector<int> shape;
+  // TODO(Superjomn) for LoD support, add a vector<vector<int>> field if needed.
+  PaddleBuf data;  // blob of data.
+  PaddleDType dtype;
+};
+
+enum class PaddleEngineKind {
+  kNative = 0,         // Use the native Fluid facility.
+  kAnakin,             // Use Anakin for inference.
+  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
+  // TODO(Superjomn) support following engines latter.
+  // kTensorRT,           // Use TensorRT for inference.
+  // kAutoMixedAnakin,    // Automatically mix Fluid with Anakin.
+};
+
+/*
+ * A simple Inference API for Paddle. Currently this API can be used by
+ * non-sequence scenerios.
+ */
+class PaddlePredictor {
+ public:
+  struct Config;
+  PaddlePredictor() = default;
+  PaddlePredictor(const PaddlePredictor&) = delete;
+  PaddlePredictor& operator=(const PaddlePredictor&) = delete;
+
+  // Predict an record.
+  // The caller should be responsible for allocating and releasing the memory of
+  // `inputs`. `inputs` should be available until Run returns. Caller should be
+  // responsible for the output tensor's buffer, either allocated or passed from
+  // outside.
+  virtual bool Run(const std::vector<PaddleTensor>& inputs,
+                   std::vector<PaddleTensor>* output_data) = 0;
+
+  // Clone a predictor that share the model weights, the Cloned predictor should
+  // be thread-safe.
+  virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
+
+  // Destroy the Predictor.
+  virtual ~PaddlePredictor() = default;
+
+  // The common configs for all the predictors.
+  struct Config {
+    std::string model_dir;  // path to the model directory.
+  };
+};
+
+struct NativeConfig : public PaddlePredictor::Config {
+  // GPU related fields.
+  bool use_gpu{false};
+  int device{0};
+  float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
+
+  std::string prog_file;
+  std::string param_file;
+};
+
+// Configurations for Anakin engine.
+struct AnakinConfig : public PaddlePredictor::Config {
+  int device;
+  std::string model_file;
+  int max_batch_size{-1};
+};
+
+struct TensorRTConfig : public NativeConfig {
+  // Determine whether a subgraph will be executed by TRT.
+  int min_subgraph_size{1};
+};
+
+// A factory to help create different predictors.
+//
+// FOR EXTENSION DEVELOPER:
+// Different predictors are designated by config type and engine kind. Similar
+// configs can be merged, but there shouldn't be a huge config containing
+// different fields for more than one kind of predictors.
+//
+// Similarly, each engine kind should map to a unique predictor implementation.
+template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+
+int PaddleDtypeSize(PaddleDType dtype);
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ba2d30314715a57c5ab85e5ae1d8ac0512bbc74f
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.cc
@@ -0,0 +1,116 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/contrib/inference/paddle_inference_api_anakin_engine.h"
+#include <cuda.h>
+
+namespace paddle {
+
+PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
+    const AnakinConfig &config) {
+  CHECK(Init(config));
+}
+
+bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
+  if (!(graph_.load(config.model_file))) {
+    return false;
+  }
+  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  // optimization for graph
+  if (!(graph_.Optimize())) {
+    return false;
+  }
+  // construct executer
+  executor_.init(graph_);
+  return true;
+}
+
+bool PaddleInferenceAnakinPredictor::Run(
+    const std::vector<PaddleTensor> &inputs,
+    std::vector<PaddleTensor> *output_data) {
+  for (const auto &input : inputs) {
+    if (input.dtype != PaddleDType::FLOAT32) {
+      LOG(ERROR) << "Only support float type inputs. " << input.name
+                 << "'s type is not float";
+      return false;
+    }
+    auto d_tensor_in_p = executor_.get_in(input.name);
+    float *d_data_p = d_tensor_in_p->mutable_data();
+    if (cudaMemcpy(d_data_p,
+                   static_cast<float *>(input.data.data()),
+                   d_tensor_in_p->valid_size() * sizeof(float),
+                   cudaMemcpyHostToDevice) != 0) {
+      LOG(ERROR) << "copy data from CPU to GPU error";
+      return false;
+    }
+  }
+
+  executor_.prediction();
+
+  if (output_data->empty()) {
+    LOG(ERROR) << "At least one output should be set with tensors' names.";
+    return false;
+  }
+  for (auto &output : *output_data) {
+    auto *tensor = executor_.get_out(output.name);
+    output.shape = tensor->shape();
+    if (output.data.length() < tensor->valid_size() * sizeof(float)) {
+      output.data.Resize(tensor->valid_size() * sizeof(float));
+    }
+    // Copy data from GPU -> CPU
+    if (cudaMemcpy(output.data.data(),
+                   tensor->mutable_data(),
+                   tensor->valid_size() * sizeof(float),
+                   cudaMemcpyDeviceToHost) != 0) {
+      LOG(ERROR) << "copy data from GPU to CPU error";
+      return false;
+    }
+  }
+  return true;
+}
+
+anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor::get_executer() {
+  return executor_;
+}
+
+// the cloned new Predictor of anakin share the same net weights from original
+// Predictor
+std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
+  VLOG(3) << "Anakin Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  // construct executer from other graph
+  auto anakin_predictor_p =
+      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+  if (!anakin_predictor_p) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  anakin_predictor_p->get_executer().init(graph_);
+
+  return std::move(cls);
+}
+
+// A factory to help create difference predictor.
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
+    const AnakinConfig &config) {
+  VLOG(3) << "Anakin Predictor create.";
+  std::unique_ptr<PaddlePredictor> x(
+      new PaddleInferenceAnakinPredictor(config));
+  return x;
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine.h b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..212ba41cdf8ff2feccb6b6498f9679d76a2efe7c
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the implementation of inference API with Anakin engine
+ * embeded, this API can only support Anakin models.
+ */
+
+#pragma once
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+// from anakin
+#include "framework/core/net/net.h"
+#include "saber/saber_types.h"
+
+namespace paddle {
+
+class PaddleInferenceAnakinPredictor : public PaddlePredictor {
+ public:
+  PaddleInferenceAnakinPredictor() {}
+
+  PaddleInferenceAnakinPredictor(const AnakinConfig& config);
+
+  // NOTE Unlike the native engine, the buffers of anakin engine's output_data
+  // should be allocated first.
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data) override;
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  get_executer();
+
+  ~PaddleInferenceAnakinPredictor() override{};
+
+ private:
+  bool Init(const AnakinConfig& config);
+
+  anakin::graph::Graph<anakin::NV,
+                       anakin::saber::AK_FLOAT,
+                       anakin::Precision::FP32>
+      graph_;
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+      executor_;
+  AnakinConfig config_;
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f92e9d4190412f5847e353ef1dc0324cad668c9a
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+DEFINE_string(model, "", "Directory of the inference model.");
+
+namespace paddle {
+
+AnakinConfig GetConfig() {
+  AnakinConfig config;
+  config.model_file = FLAGS_model;
+  config.device = 0;
+  config.max_batch_size = 1;
+  return config;
+}
+
+TEST(inference, anakin) {
+  AnakinConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
+
+  float data[1 * 3 * 224 * 224] = {1.0f};
+
+  PaddleTensor tensor{.name = "input_0",
+                      .shape = std::vector<int>({1, 3, 224, 224}),
+                      .data = PaddleBuf(data, sizeof(data)),
+                      .dtype = PaddleDType::FLOAT32};
+
+  // For simplicity, we set all the slots with the same data.
+  std::vector<PaddleTensor> paddle_tensor_feeds;
+  paddle_tensor_feeds.emplace_back(std::move(tensor));
+
+  PaddleTensor tensor_out{.name = "prob_out",
+                          .shape = std::vector<int>({1000, 1}),
+                          .data = PaddleBuf(),
+                          .dtype = PaddleDType::FLOAT32};
+
+  std::vector<PaddleTensor> outputs;
+  outputs.emplace_back(std::move(tensor_out));
+
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+
+  float* data_o = static_cast<float*>(outputs[0].data.data());
+  for (size_t j = 0; j < 1000; ++j) {
+    LOG(INFO) << "output[" << j << "]: " << data_o[j];
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.cc b/paddle/contrib/inference/paddle_inference_api_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b1e5b875981e0142f6970cf6864b7b598743654b
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_impl.cc
@@ -0,0 +1,290 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sys/time.h>
+#include <algorithm>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+
+namespace paddle {
+namespace {
+
+// Timer for timer
+class Timer {
+ public:
+  double start;
+  double startu;
+  void tic() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    start = tp.tv_sec;
+    startu = tp.tv_usec;
+  }
+  double toc() {
+    struct timeval tp;
+    gettimeofday(&tp, NULL);
+    double used_time_ms =
+        (tp.tv_sec - start) * 1000.0 + (tp.tv_usec - startu) / 1000.0;
+    return used_time_ms;
+  }
+};
+
+template <class T>
+std::string num2str(T a) {
+  std::stringstream istr;
+  istr << a;
+  return istr.str();
+}
+}  // namespace
+
+bool NativePaddlePredictor::Init(
+    std::shared_ptr<framework::Scope> parent_scope) {
+  VLOG(3) << "Predictor::init()";
+
+  if (config_.use_gpu) {
+    place_ = paddle::platform::CUDAPlace(config_.device);
+  } else {
+    place_ = paddle::platform::CPUPlace();
+  }
+  if (parent_scope) {
+    scope_ = parent_scope;
+    sub_scope_ = &(parent_scope->NewScope());
+  } else {
+    paddle::framework::InitDevices(false);
+    scope_.reset(new paddle::framework::Scope());
+  }
+
+  executor_.reset(new paddle::framework::Executor(place_));
+
+  // Initialize the inference program
+  if (!config_.model_dir.empty()) {
+    // Parameters are saved in separate files sited in
+    // the specified `dirname`.
+    inference_program_ = paddle::inference::Load(
+        executor_.get(), scope_.get(), config_.model_dir);
+  } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+    // All parameters are saved in a single file.
+    // The file names should be consistent with that used
+    // in Python API `fluid.io.save_inference_model`.
+    inference_program_ = paddle::inference::Load(
+        executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
+  } else {
+    LOG(ERROR) << "fail to load inference model.";
+    return false;
+  }
+
+  ctx_ = executor_->Prepare(*inference_program_, 0);
+  executor_->CreateVariables(
+      *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
+
+  // Get the feed_target_names and fetch_target_names
+  feed_target_names_ = inference_program_->GetFeedTargetNames();
+  fetch_target_names_ = inference_program_->GetFetchTargetNames();
+  return true;
+}
+
+NativePaddlePredictor::~NativePaddlePredictor() {
+  if (sub_scope_) {
+    PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
+    scope_->DeleteScope(sub_scope_);
+  }
+};
+
+bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
+                                std::vector<PaddleTensor> *output_data) {
+  VLOG(3) << "Predictor::predict";
+  Timer timer;
+  timer.tic();
+  // set feed variable
+  std::map<std::string, const framework::LoDTensor *> feed_targets;
+  std::vector<framework::LoDTensor> feeds;
+  if (!SetFeed(inputs, &feeds)) {
+    LOG(ERROR) << "fail to set feed";
+    return false;
+  }
+  for (size_t i = 0; i < feed_target_names_.size(); ++i) {
+    VLOG(4) << "setting " << i << "-th target";
+    feed_targets[feed_target_names_[i]] = &feeds[i];
+  }
+  // get fetch variable
+  std::map<std::string, framework::LoDTensor *> fetch_targets;
+  std::vector<framework::LoDTensor> fetchs;
+  fetchs.resize(fetch_target_names_.size());
+  for (size_t i = 0; i < fetch_target_names_.size(); ++i) {
+    fetch_targets[fetch_target_names_[i]] = &fetchs[i];
+  }
+  // Run the inference program
+  // if share variables, we need not create variables
+  VLOG(4) << "Run prepared context";
+  executor_->RunPreparedContext(
+      ctx_.get(),
+      sub_scope_ != nullptr ? sub_scope_ : scope_.get(),
+      &feed_targets,
+      &fetch_targets,
+      false /* don't create variable eatch time */);
+  VLOG(4) << "Finish prepared context";
+  if (!GetFetch(fetchs, output_data)) {
+    LOG(ERROR) << "fail to get fetches";
+    return false;
+  }
+  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  return true;
+}
+
+std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
+  VLOG(3) << "Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
+
+  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(scope_)) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  // fix manylinux compile error.
+  return std::move(cls);
+}
+
+bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
+                                    std::vector<framework::LoDTensor> *feeds) {
+  VLOG(3) << "Predictor::set_feed";
+  if (inputs.size() != feed_target_names_.size()) {
+    LOG(ERROR) << "wrong feed input size.";
+    return false;
+  }
+  for (size_t i = 0; i < feed_target_names_.size(); ++i) {
+    framework::LoDTensor input;
+    framework::DDim ddim = framework::make_ddim(inputs[i].shape);
+    void *input_ptr;
+    if (inputs[i].dtype == PaddleDType::INT64) {
+      input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
+    } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
+      input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
+    } else {
+      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
+      return false;
+    }
+
+    // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
+    std::memcpy(static_cast<void *>(input_ptr),
+                inputs[i].data.data(),
+                inputs[i].data.length());
+    feeds->push_back(input);
+  }
+  return true;
+}
+
+bool NativePaddlePredictor::GetFetch(
+    const std::vector<framework::LoDTensor> &fetchs,
+    std::vector<PaddleTensor> *outputs) {
+  VLOG(3) << "Predictor::get_fetch";
+  outputs->resize(fetchs.size());
+  for (size_t i = 0; i < fetchs.size(); ++i) {
+    // TODO(panyx0718): Support fetch of other types.
+    if (fetchs[i].type() != typeid(float)) {
+      LOG(ERROR) << "only support fetching float now.";
+      return false;
+    }
+    std::vector<int> shape;
+    auto dims_i = fetchs[i].dims();
+    auto lod = fetchs[i].lod();
+    const float *output_ptr = fetchs[i].data<float>();
+    // const int64_t* output_ptr = fetchs[i].data<int64_t>();
+    auto num = fetchs[i].numel();
+    std::vector<float> data;
+    if (0 == lod.size()) {
+      std::copy(output_ptr, output_ptr + num, std::back_inserter(data));
+      for (int j = 0; j < dims_i.size(); ++j) {
+        shape.push_back(dims_i[j]);
+      }
+    } else {
+      // for batch detection
+      // image[0] -> output[0] shape {145, 6}
+      // image[1] -> output[1] shape {176, 6}
+      // then,
+      // the batch output shape {321, 6}
+      // the lod {{0, 145, 321}}
+      // so we should append output[0] to {176, 6}
+      size_t max_dim = 0;
+      for (size_t j = 1; j < lod[0].size(); j++) {
+        max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]);
+      }
+      size_t common_dim = lod[0].back() == 0 ? 0 : num / lod[0].back();
+      if (max_dim > 0) {
+        data.resize((lod[0].size() - 1) * max_dim * common_dim, 0);
+      }
+      for (size_t j = 1; j < lod[0].size(); j++) {
+        size_t start = lod[0][j - 1] * common_dim;
+        size_t end = lod[0][j] * common_dim;
+        if (end > start) {
+          std::copy(output_ptr + start,
+                    output_ptr + end,
+                    data.begin() + (j - 1) * max_dim * common_dim);
+        }
+      }
+      shape.push_back(lod[0].size() - 1);
+      shape.push_back(max_dim);
+      for (int j = 1; j < dims_i.size(); ++j) {
+        shape.push_back(dims_i[j]);
+      }
+    }
+
+    outputs->at(i).shape = shape;
+    auto &buffer = outputs->at(i).data;
+    if (buffer.empty() || buffer.length() < sizeof(float) * data.size()) {
+      buffer.Resize(sizeof(float) * data.size());
+    }
+    std::memcpy(buffer.data(), data.data(), buffer.length());
+    outputs->at(i).dtype = PaddleDType::FLOAT32;
+    // TODO(panyx0718): support other types? fill tensor name? avoid a copy.
+  }
+  return true;
+}
+
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+    const NativeConfig &config) {
+  VLOG(3) << "create NativePaddlePredictor";
+  if (config.use_gpu) {
+    // 1. GPU memeroy
+    PADDLE_ENFORCE_GT(
+        config.fraction_of_gpu_memory,
+        0.f,
+        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
+    std::vector<std::string> flags;
+    if (config.fraction_of_gpu_memory >= 0.0f ||
+        config.fraction_of_gpu_memory <= 0.95f) {
+      flags.push_back("dummpy");
+      std::string flag = "--fraction_of_gpu_memory_to_use=" +
+                         num2str<float>(config.fraction_of_gpu_memory);
+      flags.push_back(flag);
+      VLOG(3) << "set flag: " << flag;
+      framework::InitGflags(flags);
+    }
+  }
+
+  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
+  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
+    return nullptr;
+  }
+  return std::move(predictor);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_impl.h b/paddle/contrib/inference/paddle_inference_api_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba266b608da342fb71faf05d02ddf74330e21e98
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_impl.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+
+class NativePaddlePredictor : public PaddlePredictor {
+ public:
+  explicit NativePaddlePredictor(const NativeConfig &config)
+      : config_(config) {}
+
+  // will only create sub scope if have global scope
+  bool Init(std::shared_ptr<framework::Scope> parent_scope);
+
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data) override;
+
+  std::unique_ptr<PaddlePredictor> Clone() override;
+
+  ~NativePaddlePredictor() override;
+
+ protected:
+  bool SetFeed(const std::vector<PaddleTensor> &input_datas,
+               std::vector<framework::LoDTensor> *feeds);
+  bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
+                std::vector<PaddleTensor> *output_data);
+
+  NativeConfig config_;
+  platform::Place place_;
+  std::unique_ptr<framework::Executor> executor_;
+  std::shared_ptr<framework::Scope> scope_;
+  std::unique_ptr<framework::ExecutorPrepareContext> ctx_;
+  std::unique_ptr<framework::ProgramDesc> inference_program_;
+  std::vector<std::string> feed_target_names_;
+  std::vector<std::string> fetch_target_names_;
+  // Do not use unique_ptr, use parent scope to delete
+  framework::Scope *sub_scope_{nullptr};
+};
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a11396cee91a758e86af2efd9e58b9da68442590
--- /dev/null
+++ b/paddle/contrib/inference/paddle_inference_api_tensorrt_subgraph_engine.cc
@@ -0,0 +1,126 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+
+using inference::analysis::Argument;
+using inference::Singleton;
+using inference::analysis::Analyzer;
+using framework::proto::ProgramDesc;
+
+class TensorRTSubgraphPredictor : public NativePaddlePredictor {
+ public:
+  explicit TensorRTSubgraphPredictor(const TensorRTConfig& config)
+      : NativePaddlePredictor(config), config_(config) {}
+
+  bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
+    VLOG(3) << "Predictor::init()";
+
+    if (config_.use_gpu) {
+      place_ = paddle::platform::CUDAPlace(config_.device);
+    } else {
+      place_ = paddle::platform::CPUPlace();
+    }
+    if (parent_scope) {
+      scope_ = parent_scope;
+      sub_scope_ = &(parent_scope->NewScope());
+    } else {
+      paddle::framework::InitDevices(false);
+      scope_.reset(new paddle::framework::Scope());
+    }
+
+    executor_.reset(new paddle::framework::Executor(place_));
+
+    // Initialize the inference program
+    if (!config_.model_dir.empty()) {
+      // Parameters are saved in separate files sited in
+      // the specified `dirname`.
+      inference_program_ = paddle::inference::Load(
+          executor_.get(), scope_.get(), config_.model_dir);
+    } else if (!config_.prog_file.empty() && !config_.param_file.empty()) {
+      // All parameters are saved in a single file.
+      // The file names should be consistent with that used
+      // in Python API `fluid.io.save_inference_model`.
+      inference_program_ = paddle::inference::Load(
+          executor_.get(), scope_.get(), config_.prog_file, config_.param_file);
+    } else {
+      LOG(ERROR) << "fail to load inference model.";
+      return false;
+    }
+
+    // Analyze inference_program
+    Argument argument;
+    argument.origin_program_desc.reset(
+        new ProgramDesc(*inference_program_->Proto()));
+    Singleton<Analyzer>::Global().Run(&argument);
+    CHECK(argument.transformed_program_desc);
+    VLOG(5) << "transformed program:\n"
+            << argument.transformed_program_desc->SerializeAsString();
+    VLOG(5) << "to prepare executor";
+    *inference_program_->Proto() = *argument.transformed_program_desc;
+    ctx_ = executor_->Prepare(*inference_program_, 0);
+
+    VLOG(5) << "to create variables";
+    executor_->CreateVariables(
+        *inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0);
+
+    // Get the feed_target_names and fetch_target_names
+    feed_target_names_ = inference_program_->GetFeedTargetNames();
+    fetch_target_names_ = inference_program_->GetFetchTargetNames();
+    return true;
+  }
+
+ private:
+  TensorRTConfig config_;
+};
+
+template <>
+std::unique_ptr<PaddlePredictor>
+CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
+    const TensorRTConfig& config) {
+  VLOG(3) << "create TensorRTSubgraphPredictor";
+  if (config.use_gpu) {
+    // 1. GPU memeroy
+    PADDLE_ENFORCE_GT(
+        config.fraction_of_gpu_memory,
+        0.f,
+        "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
+    PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device);
+    std::vector<std::string> flags;
+    if (config.fraction_of_gpu_memory >= 0.0f ||
+        config.fraction_of_gpu_memory <= 0.95f) {
+      flags.push_back("dummpy");
+      std::string flag = "--fraction_of_gpu_memory_to_use=" +
+                         std::to_string(config.fraction_of_gpu_memory);
+      flags.push_back(flag);
+      VLOG(3) << "set flag: " << flag;
+      framework::InitGflags(flags);
+    }
+  }
+
+  std::unique_ptr<PaddlePredictor> predictor(
+      new TensorRTSubgraphPredictor(config));
+  if (!dynamic_cast<TensorRTSubgraphPredictor*>(predictor.get())
+           ->Init(nullptr)) {
+    return nullptr;
+  }
+  return std::move(predictor);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api.cc b/paddle/contrib/inference/test_paddle_inference_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc7faab6e208a66d7a56e41a56bd743c7644eea2
--- /dev/null
+++ b/paddle/contrib/inference/test_paddle_inference_api.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+/*
+ * Do not use this, just a demo indicating how to customize a config for a
+ * specific predictor.
+ */
+struct DemoConfig : public PaddlePredictor::Config {
+  float other_config;
+};
+
+/*
+ * Do not use this, just a demo indicating how to customize a Predictor.
+ */
+class DemoPredictor : public PaddlePredictor {
+ public:
+  explicit DemoPredictor(const DemoConfig &config) {
+    LOG(INFO) << "I get other_config " << config.other_config;
+  }
+  bool Run(const std::vector<PaddleTensor> &inputs,
+           std::vector<PaddleTensor> *output_data) override {
+    LOG(INFO) << "Run";
+    return false;
+  }
+
+  std::unique_ptr<PaddlePredictor> Clone() override { return nullptr; }
+
+  ~DemoPredictor() override {}
+};
+
+template <>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<DemoConfig>(
+    const DemoConfig &config) {
+  std::unique_ptr<PaddlePredictor> x(new DemoPredictor(config));
+  return x;
+}
+
+TEST(paddle_inference_api, demo) {
+  DemoConfig config;
+  config.other_config = 1.7;
+  auto predictor = CreatePaddlePredictor(config);
+  std::vector<PaddleTensor> outputs;
+  predictor->Run({}, &outputs);
+}
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88c4e665a3daed0ed34b23b75d360acbd586401f
--- /dev/null
+++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc
@@ -0,0 +1,288 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <thread>
+
+#include "gflags/gflags.h"
+#include "paddle/contrib/inference/paddle_inference_api_impl.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+namespace paddle {
+
+PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
+  PaddleTensor pt;
+
+  if (t->type() == typeid(int64_t)) {
+    pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
+    pt.dtype = PaddleDType::INT64;
+  } else if (t->type() == typeid(float)) {
+    pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
+    pt.dtype = PaddleDType::FLOAT32;
+  } else {
+    LOG(FATAL) << "unsupported type.";
+  }
+  pt.shape = framework::vectorize2int(t->dims());
+  return pt;
+}
+
+NativeConfig GetConfig() {
+  NativeConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  LOG(INFO) << "dirname  " << config.model_dir;
+  config.fraction_of_gpu_memory = 0.15;
+#ifdef PADDLE_WITH_CUDA
+  config.use_gpu = true;
+#else
+  config.use_gpu = false;
+#endif
+  config.device = 0;
+  return config;
+}
+
+void MainWord2Vec(bool use_gpu) {
+  NativeConfig config = GetConfig();
+  auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+  config.use_gpu = use_gpu;
+
+  framework::LoDTensor first_word, second_word, third_word, fourth_word;
+  framework::LoD lod{{0, 1}};
+  int64_t dict_size = 2073;  // The size of dictionary
+
+  SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
+
+  std::vector<PaddleTensor> paddle_tensor_feeds;
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&first_word));
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&second_word));
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&third_word));
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&fourth_word));
+
+  std::vector<PaddleTensor> outputs;
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+  ASSERT_EQ(outputs.size(), 1UL);
+  size_t len = outputs[0].data.length();
+  float* data = static_cast<float*>(outputs[0].data.data());
+  for (size_t j = 0; j < len / sizeof(float); ++j) {
+    ASSERT_LT(data[j], 1.0);
+    ASSERT_GT(data[j], -1.0);
+  }
+
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&first_word);
+  cpu_feeds.push_back(&second_word);
+  cpu_feeds.push_back(&third_word);
+  cpu_feeds.push_back(&fourth_word);
+
+  framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
+
+  float* lod_data = output1.data<float>();
+  for (int i = 0; i < output1.numel(); ++i) {
+    EXPECT_LT(lod_data[i] - data[i], 1e-3);
+    EXPECT_GT(lod_data[i] - data[i], -1e-3);
+  }
+}
+
+void MainImageClassification(bool use_gpu) {
+  int batch_size = 2;
+  bool repeat = false;
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  config.model_dir =
+      FLAGS_dirname + "image_classification_resnet.inference.model";
+
+  const bool is_combined = false;
+  std::vector<std::vector<int64_t>> feed_target_shapes =
+      GetFeedTargetShapes(config.model_dir, is_combined);
+
+  framework::LoDTensor input;
+  // Use normilized image pixels as input data,
+  // which should be in the range [0.0, 1.0].
+  feed_target_shapes[0][0] = batch_size;
+  framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
+  SetupTensor<float>(
+      &input, input_dims, static_cast<float>(0), static_cast<float>(1));
+  std::vector<framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  framework::LoDTensor output1;
+  std::vector<framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  TestInference<platform::CPUPlace, false, true>(
+      config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined);
+
+  auto predictor = CreatePaddlePredictor(config);
+  std::vector<PaddleTensor> paddle_tensor_feeds;
+  paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input));
+
+  std::vector<PaddleTensor> outputs;
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+  ASSERT_EQ(outputs.size(), 1UL);
+  size_t len = outputs[0].data.length();
+  float* data = static_cast<float*>(outputs[0].data.data());
+  float* lod_data = output1.data<float>();
+  for (size_t j = 0; j < len / sizeof(float); ++j) {
+    EXPECT_NEAR(lod_data[j], data[j], 1e-3);
+  }
+}
+
+void MainThreadsWord2Vec(bool use_gpu) {
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+
+  // prepare inputs data and reference results
+  constexpr int num_jobs = 3;
+  std::vector<std::vector<framework::LoDTensor>> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // each job has 4 words
+    jobs[i].resize(4);
+    for (size_t j = 0; j < 4; ++j) {
+      framework::LoD lod{{0, 1}};
+      int64_t dict_size = 2073;  // The size of dictionary
+      SetupLoDTensor(&jobs[i][j], lod, static_cast<int64_t>(0), dict_size - 1);
+      paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j]));
+    }
+
+    // get reference result of each job
+    std::vector<paddle::framework::LoDTensor*> ref_feeds;
+    std::vector<paddle::framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    for (auto& word : jobs[i]) {
+      ref_feeds.push_back(&word);
+    }
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs range
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length();
+      float* data = static_cast<float*>(local_outputs[0].data.data());
+      for (size_t j = 0; j < len / sizeof(float); ++j) {
+        ASSERT_LT(data[j], 1.0);
+        ASSERT_GT(data[j], -1.0);
+      }
+
+      // check outputs correctness
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+void MainThreadsImageClassification(bool use_gpu) {
+  constexpr int num_jobs = 4;  // each job run 1 batch
+  constexpr int batch_size = 1;
+  NativeConfig config = GetConfig();
+  config.use_gpu = use_gpu;
+  config.model_dir =
+      FLAGS_dirname + "image_classification_resnet.inference.model";
+
+  auto main_predictor = CreatePaddlePredictor<NativeConfig>(config);
+  std::vector<framework::LoDTensor> jobs(num_jobs);
+  std::vector<std::vector<PaddleTensor>> paddle_tensor_feeds(num_jobs);
+  std::vector<framework::LoDTensor> refs(num_jobs);
+  for (size_t i = 0; i < jobs.size(); ++i) {
+    // prepare inputs
+    std::vector<std::vector<int64_t>> feed_target_shapes =
+        GetFeedTargetShapes(config.model_dir, /*is_combined*/ false);
+    feed_target_shapes[0][0] = batch_size;
+    framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
+    SetupTensor<float>(&jobs[i], input_dims, 0.f, 1.f);
+    paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i]));
+
+    // get reference result of each job
+    std::vector<framework::LoDTensor*> ref_feeds(1, &jobs[i]);
+    std::vector<framework::LoDTensor*> ref_fetches(1, &refs[i]);
+    TestInference<platform::CPUPlace>(config.model_dir, ref_feeds, ref_fetches);
+  }
+
+  // create threads and each thread run 1 job
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      auto predictor = main_predictor->Clone();
+      auto& local_inputs = paddle_tensor_feeds[tid];
+      std::vector<PaddleTensor> local_outputs;
+      ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs));
+
+      // check outputs correctness
+      ASSERT_EQ(local_outputs.size(), 1UL);
+      const size_t len = local_outputs[0].data.length();
+      float* data = static_cast<float*>(local_outputs[0].data.data());
+      float* ref_data = refs[tid].data<float>();
+      EXPECT_EQ(refs[tid].numel(), len / sizeof(float));
+      for (int i = 0; i < refs[tid].numel(); ++i) {
+        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+      }
+    });
+  }
+  for (int i = 0; i < num_jobs; ++i) {
+    threads[i].join();
+  }
+}
+
+TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); }
+TEST(inference_api_native, word2vec_cpu_threads) {
+  MainThreadsWord2Vec(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_cpu_threads) {
+  MainThreadsImageClassification(false /*use_gpu*/);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); }
+TEST(inference_api_native, word2vec_gpu_threads) {
+  MainThreadsWord2Vec(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+TEST(inference_api_native, image_classification_gpu_threads) {
+  MainThreadsImageClassification(true /*use_gpu*/);
+}
+
+#endif
+
+}  // namespace paddle
diff --git a/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b100630dbe412ca811f1a8f2b8191356f5ebec2f
--- /dev/null
+++ b/paddle/contrib/inference/test_paddle_inference_api_tensorrt_subgraph_engine.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "paddle/contrib/inference/paddle_inference_api.h"
+
+namespace paddle {
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+void Main(bool use_gpu) {
+  //# 1. Create PaddlePredictor with a config.
+  TensorRTConfig config;
+  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config.use_gpu = use_gpu;
+  config.fraction_of_gpu_memory = 0.15;
+  config.device = 0;
+  auto predictor =
+      CreatePaddlePredictor<TensorRTConfig,
+                            PaddleEngineKind::kAutoMixedTensorRT>(config);
+
+  for (int batch_id = 0; batch_id < 3; batch_id++) {
+    //# 2. Prepare input.
+    int64_t data[4] = {1, 2, 3, 4};
+
+    PaddleTensor tensor{.name = "",
+                        .shape = std::vector<int>({4, 1}),
+                        .data = PaddleBuf(data, sizeof(data)),
+                        .dtype = PaddleDType::INT64};
+
+    // For simplicity, we set all the slots with the same data.
+    std::vector<PaddleTensor> slots(4, tensor);
+
+    //# 3. Run
+    std::vector<PaddleTensor> outputs;
+    CHECK(predictor->Run(slots, &outputs));
+
+    //# 4. Get output.
+    ASSERT_EQ(outputs.size(), 1UL);
+    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
+    const size_t num_elements = outputs.front().data.length() / sizeof(float);
+    // The outputs' buffers are in CPU memory.
+    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
+      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+    }
+  }
+}
+
+TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
+
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
deleted file mode 100755
index efd1b7a73e1655f95eb83a5e2f59e82cbf7eba16..0000000000000000000000000000000000000000
--- a/paddle/cuda/CMakeLists.txt
+++ /dev/null
@@ -1,94 +0,0 @@
-set(AVX_SOURCES
-    src/hl_math.cc
-    src/hl_avx_functions.cc
-)
-
-if(WITH_AVX)
-    set(CUDA_SOURCES
-        src/hl_time.cc
-        src/hl_cpu_functions.cc
-        ${AVX_SOURCES})
-else()
-    set(CUDA_SOURCES
-        src/hl_time.cc
-        src/hl_cpu_functions.cc)
-endif()
-
-set(CUDA_CXX_WITH_GPU_SOURCES
-    src/hl_cuda_cublas.cc
-    src/hl_cuda_cudnn.cc
-    src/hl_cuda_device.cc)
-
-if(WITH_GPU)
-    set(CUDA_CXX_SOURCES
-        src/hl_warpctc_wrap.cc
-        ${CUDA_CXX_WITH_GPU_SOURCES})
-
-    set_source_files_properties(${CUDA_CXX_SOURCES}
-                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
-else()
-    if (NOT MOBILE_INFERENCE)
-    set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
-    endif()
-endif()
-
-set(CUDA_CU_SOURCES
-    src/hl_perturbation_util.cu
-    src/hl_cuda_aggregate.cu
-    src/hl_cuda_matrix.cu
-    src/hl_cuda_sparse.cu
-    src/hl_cuda_cnn.cu
-    src/hl_cuda_lstm.cu
-    src/hl_top_k.cu
-    src/hl_batch_transpose.cu
-    src/hl_batch_norm.cu
-    src/hl_cuda_sequence.cu
-    src/hl_table_apply.cu)
-
-set(CUDA_HEADERS
-    include/hl_time.h
-    include/hl_warpctc_wrap.h
-    include/hl_sequence.h
-    include/hl_cuda_cublas.h
-    include/hl_batch_transpose.h
-    include/hl_avx_functions.h
-    include/hl_sparse.h
-    include/hl_functions.h
-    include/hl_cuda_cudnn.h
-    include/hl_activation_functions.h
-    include/hl_base.h
-    include/stub/hl_cuda_cudnn_stub.h
-    include/stub/hl_cuda_stub.h
-    include/stub/hl_cuda_cublas_stub.h
-    include/stub/hl_cnn_stub.h
-    include/stub/hl_lstm_stub.h
-    include/stub/hl_sequence_stub.h
-    include/stub/hl_aggregate_stub.h
-    include/stub/hl_sparse_stub.h
-    include/stub/hl_matrix_stub.h
-    include/hl_aggregate.h
-    include/hl_cuda.h
-    include/hl_lstm.h
-    include/hl_table_apply.h
-    include/hl_gpu.h
-    include/hl_top_k.h
-    include/hl_matrix.h
-    include/hl_cnn.h)
-
-if(WITH_GPU)
-    cuda_add_library(paddle_cuda
-        ${CUDA_SOURCES}
-        ${CUDA_CU_SOURCES}
-        ${CUDA_CXX_SOURCES})
-else()
-    add_library(paddle_cuda
-                ${CUDA_SOURCES}
-                ${CUDA_CXX_SOURCES})
-endif()
-
-add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
-
-add_style_check_target(paddle_cuda
-                       ${CUDA_SOURCES}
-                       ${CUDA_HEADERS}
-                       ${CUDA_CXX_SOURCES})
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
deleted file mode 100644
index 29ec248420058db08bd1932f702d26074d49f38c..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_activation_functions.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_ACTIVATION_FUNCTIONS_H_
-#define HL_ACTIVATION_FUNCTIONS_H_
-
-#include "hl_functions.h"
-
-/**
- * Active functions: sigmoid, relu, tanh and linear.
- */
-#define HPPL_ACTIVE_FUNCTION \
-  { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
-
-namespace hppl {
-
-/**
- * Hppl supports sigmoid, relu, tanh, linear active functions
- * for neural networks' forward and backward activation.
- */
-template <class T>
-class Active {
-public:
-  typedef T (*forward)(T);
-  typedef T (*backward)(T, T);
-};
-
-#ifdef __NVCC__
-namespace gpu {
-static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
-static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}  // namespace gpu
-#else
-namespace cpu {
-static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
-static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}  // namespace cpu
-
-#ifdef __AVX__
-namespace avx {
-static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
-static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}  // namespace avx
-#endif
-#endif
-
-}  // namespace hppl
-
-#endif  // HL_ACTIVATION_FUNCTIONS_H_
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
deleted file mode 100644
index 6c4f09dacb47c431db2e2610a3e61390a82dcba0..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_base.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_BASE_H_
-#define HL_BASE_H_
-
-#include <cstddef>
-
-#ifdef PADDLE_TYPE_DOUBLE
-#define HL_FLOAT_MAX 3.40282347e+38F
-#define HL_FLOAT_MIN 1.17549435e-38F
-using real = double;
-#else
-#define HL_FLOAT_MAX 1.7976931348623157e+308
-#define HL_FLOAT_MIN 2.2250738585072014e-308
-using real = float;
-#endif
-
-/**
- * The maximum input value for exp, used to avoid overflow problem.
- * currently only used for tanh function.
- */
-#define EXP_MAX_INPUT 40.0
-
-/**
- * @brief DIVUP(x, y) is similar to ceil(x / y).
- * @note  For CUDA, DIVUP will be used to specify
- *        the size of blockDim.
- */
-#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y)-1) / (y))
-#endif
-
-/**
- * HPPL is an internal high performance parallel computing library
- * for high-level neural network routines, which can support many
- * heterogeneous compute architectures, such as GPU, FPGA, etc.
- */
-
-/**
- * @brief   HPPL CUDA Stream.
- *
- * @note    Each thread can use HPPL_STREAM_* after calling hl_init.
- *          HPPL_STREAM_DEFAULT is HPPL default stream.
- */
-typedef enum {
-  HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
-  HPPL_STREAM_1 = 1,
-  HPPL_STREAM_2 = 2,
-  HPPL_STREAM_3 = 3,
-  HPPL_STREAM_4 = 4,
-  HPPL_THREAD_STREAM_1 = 5,
-  HPPL_THREAD_STREAM_2 = 6,
-  HPPL_THREAD_STREAM_3 = 7,
-  HPPL_THREAD_STREAM_4 = 8,
-  HPPL_STREAM_END
-} hl_stream_t;
-
-/**
- * @brief HPPL activation mode.
- */
-typedef enum {
-  HL_ACTIVATION_SIGMOID = 0,
-  HL_ACTIVATION_RELU = 1,
-  HL_ACTIVATION_TANH = 2,
-  HL_ACTIVATION_LINEAR = 3,
-  HL_ACTIVATION_END
-} hl_activation_mode_t;
-
-/**
- * @brief Transpose type.
- */
-typedef enum {
-  HPPL_OP_N = 0, /* transpose */
-  HPPL_OP_T = 1, /* non transpose */
-  HPPL_OP_END
-} hl_trans_op_t;
-
-/**
- * @brief Lstm value.
- *
- * @param  gateValue         input value.
- * @param  prevStateValue    previous state value.
- * @param  stateValue        state value.
- * @param  stateActiveValue  state active value.
- * @param  outputValue       output value.
- */
-typedef struct {
-  real *gateValue;
-  real *prevStateValue;
-  real *stateValue;
-  real *stateActiveValue;
-  real *outputValue;
-  real *checkIg;
-  real *checkFg;
-  real *checkOg;
-} hl_lstm_value;
-
-/**
- * @brief Lstm gradient.
- *
- * @param  gateGrad          input gradient.
- * @param  prevStateGrad     previous state gradient.
- * @param  stateGrad         state gradient.
- * @param  stateActiveGrad   state active gradient.
- * @param  outputGrad        output gradient.
- */
-typedef struct {
-  real *gateGrad;
-  real *prevStateGrad;
-  real *stateGrad;
-  real *stateActiveGrad;
-  real *outputGrad;
-  real *checkIgGrad;
-  real *checkFgGrad;
-  real *checkOgGrad;
-} hl_lstm_grad;
-
-/**
- * @brief Gru value.
- *
- * @param  gateWeight           gate weight (updateGate + resetGate).
- * @param  stateWeight          frame state weight.
- * @param  gateValue            gate value results.
- * @param  resetOutputValue     resetOutput value.
- * @param  outputValue          output value.
- * @param  prevOutValue         previous output value.
- *
- */
-typedef struct {
-  real *gateWeight;
-  real *stateWeight;
-  real *gateValue;
-  real *resetOutputValue;
-  real *outputValue;
-  real *prevOutValue;
-} hl_gru_value;
-
-/**
- * @brief Gru gradient.
- *
- * @param  gateWeightGrad       gate weight gradient.
- * @param  stateWeightGrad      frame state weight gradient.
- * @param  gateGrad             gate gradient results.
- * @param  resetOutputGrad      resetOutput gradient.
- * @param  outputGrad           output gradient.
- * @param  prevOutGrad          previous output gradient.
- */
-typedef struct {
-  real *gateWeightGrad;
-  real *stateWeightGrad;
-  real *gateGrad;
-  real *resetOutputGrad;
-  real *outputGrad;
-  real *prevOutGrad;
-} hl_gru_grad;
-
-/**
- * @brief  Sparse matrix value type.
- */
-typedef enum {
-  HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
-  HL_FLOAT_VALUE = 1,
-  HL_VALUE_END
-} hl_matrix_value_t;
-
-/**
- * @brief  HPPL matrix format.
- */
-typedef enum {
-  HL_SPARSE_CSR = 0,
-  HL_SPARSE_CSC = 1,
-  HL_SPARSE_END
-} hl_matrix_format_t;
-
-typedef struct _hl_matrix_s *hl_matrix_s;
-
-/**
- * @brief   HPPL sparse matrix.
- *
- * @param  matrix     sparse matrix.
- * @param  format     matrix format.
- * @param  type       the type of matrix values.
- * @param  rows       matrix rows.
- * @param  cols       matrix columns.
- * @param  nnz        nonzero values of sparse matrix.
- */
-typedef struct {
-  hl_matrix_s matrix;
-  hl_matrix_format_t format;
-  hl_matrix_value_t type;
-  int rows;
-  int cols;
-  size_t nnz;
-} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
-
-#ifdef __NVCC__
-
-#include "cuda_runtime.h"
-#include "hl_cuda.h"
-#include "paddle/utils/Logging.h"
-
-extern __thread bool g_sync_flag;
-extern __thread cudaStream_t default_stream;
-#define STREAM_DEFAULT default_stream
-
-/**
- * @brief   Check cuda kernel execution.
- * @param   msg   error string
- */
-#define CHECK_SYNC(msg)                                               \
-  if (true == g_sync_flag) {                                          \
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);                       \
-    cudaError_t err = (cudaError_t)hl_get_device_last_error();        \
-    CHECK_EQ(cudaSuccess, err)                                        \
-        << "[" << msg << "] "                                         \
-        << "CUDA error: " << hl_get_device_error_string((size_t)err); \
-  }
-
-#endif /* __NVCC__ */
-
-#endif /* HL_BASE_H_ */
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
deleted file mode 100644
index 63ec51564793ca2255032d0efbe2c47326f8b698..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_cnn.h
+++ /dev/null
@@ -1,373 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CNN_H_
-#define HL_CNN_H_
-
-#include "hl_base.h"
-
-/**
- * @brief   Maximum pool forward with Mask output.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inputData   input data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[out]  tgtData     output data.
- * @param[in]   tgtStride   stride between output data samples.
- * @param[out]  maskData    the location indices of select max data.
- */
-extern void hl_maxpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               real* maskData = NULL);
-
-/**
- * @brief   Maximum pool backward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inputData   input data.
- * @param[out]  outData     output data.
- * @param[out]  outGrad     output grad data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   scaleA      scale.
- * @param[in]   scaleB      scale.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[out]  targetGrad  output grad.
- * @param[in]   outStride   stride between output data samples.
- *
- */
-extern void hl_maxpool_backward(const int frameCnt,
-                                const real* inputData,
-                                const real* outData,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                const int paddingH,
-                                const int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* targetGrad,
-                                const int outStride);
-
-/**
- * @brief   Averge pool forward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   inputData   input data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[out]  tgtData     output data.
- * @param[in]   tgtStride   stride between output data samples.
- * @param[in]   excludeMode whether to consider paddings for size.
- *
- */
-extern void hl_avgpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               bool excludeMode);
-
-/**
- * @brief   Maximum pool backward.
- *
- * @param[in]   frameCnt    batch size of input image.
- * @param[in]   outGrad     output grad data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   pooledH     output image height.
- * @param[in]   pooledW     output image width.
- * @param[in]   sizeX       width of pooling window.
- * @param[in]   sizeY       height of pooling window.
- * @param[in]   strideH     pooling stride height.
- * @param[in]   strideW     pooling stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   scaleA      scale.
- * @param[in]   scaleB      scale.
- * @param[out]  backGrad    output grad.
- * @param[in]   outStride   stride between output data samples.
- * @param[in]   excludeMode whether to consider paddings for size.
- *
- */
-extern void hl_avgpool_backward(const int frameCnt,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                int paddingH,
-                                int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* backGrad,
-                                const int outStride,
-                                bool excludeMode);
-
-extern void hl_maxpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 real* maxPoolIdxData,
-                                 const int tgtStride);
-
-extern void hl_maxpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int paddingD,
-                                  const int paddingH,
-                                  const int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* targetGrad,
-                                  real* maxPoolIdxData,
-                                  const int outStride);
-
-extern void hl_avgpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 const int tgtStride);
-
-extern void hl_avgpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  int paddingD,
-                                  int paddingH,
-                                  int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* backGrad,
-                                  const int outStride);
-
-/**
- * @brief   Bilinear interpolation forward.
- *
- * @param[in]   inData      input value.
- * @param[in]   inImgH      input image height.
- * @param[in]   inImgW      input image width.
- * @param[in]   inputH      input batchSize.
- * @param[in]   inputW      input image data dim.
- * @param[out]  outData     output value.
- * @param[in]   outImgH     output image height.
- * @param[in]   outImgW     output image width.
- * @param[in]   outputH     output batchSize.
- * @param[in]   outputW     output image data dim.
- * @param[in]   numChannels number of channels.
- * @param[in]   ratioH      inImgH / outImgH.
- * @param[in]   ratioW      inImgW / outImgW.
- *
- */
-extern void hl_bilinear_forward(const real* inData,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t inputH,
-                                const size_t inputW,
-                                real* outData,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t outputH,
-                                const size_t outputW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW);
-
-/**
- * @brief   Bilinear interpolation backward.
- *
- * @param[out]  inGrad      input gradient.
- * @param[in]   inImgH      input image height.
- * @param[in]   inImgW      input image width.
- * @param[in]   inputH      input batchSize.
- * @param[in]   inputW      input image data dim.
- * @param[in]   outGrad     output gradient.
- * @param[in]   outImgH     output image height.
- * @param[in]   outImgW     output image width.
- * @param[in]   outputH     output batchSize.
- * @param[in]   outputW     output image data dim.
- * @param[in]   numChannels number of channels.
- * @param[in]   ratioH      inImgH / outImgH.
- * @param[in]   ratioW      inImgW / outImgW.
- *
- */
-extern void hl_bilinear_backward(real* inGrad,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t inputH,
-                                 const size_t inputW,
-                                 const real* outGrad,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t outputH,
-                                 const size_t outputW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW);
-
-/**
- * @brief   MaxOut forward.
- *
- * @param[in]   inData      input data.
- * @param[out]  outData     output data.
- * @param[out]  idData      output maxId.
- * @param[in]   batchSize   batchSize.
- * @param[in]   size        number of channels * image height * image width.
- * @param[in]   featLen     feature length = image height * image width.
- * @param[in]   groups      number of groups.
- */
-extern void hl_maxout_forward(const real* inData,
-                              real* outData,
-                              int* idData,
-                              size_t batchSize,
-                              size_t size,
-                              size_t featLen,
-                              size_t groups);
-
-/**
- * @brief   MaxOut backward.
- *
- * @param[out]  inGrad      input grad data.
- * @param[in]   outGrad     output grad data.
- * @param[in]   idData      output maxId.
- * @param[in]   batchSize   batchSize.
- * @param[in]   size        number of channels * image height * image width.
- * @param[in]   featLen     feature length = image height * image width.
- * @param[in]   groups      number of groups.
- */
-extern void hl_maxout_backward(real* inGrad,
-                               const real* outGrad,
-                               const int* idData,
-                               size_t batchSize,
-                               size_t size,
-                               size_t featLen,
-                               size_t groups);
-
-#endif  // HL_CNN_H_
diff --git a/paddle/cuda/include/hl_gpu_gru.cuh b/paddle/cuda/include/hl_gpu_gru.cuh
deleted file mode 100644
index 9fcad2c3bc2fa255e3d7cd3e7940a32fd286751b..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_gpu_gru.cuh
+++ /dev/null
@@ -1,393 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GPU_GRU_CUH_
-#define HL_GPU_GRU_CUH_
-
-#ifdef __NVCC__
-
-#include "paddle/utils/Logging.h"
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpResetOutput, bool isBatch>
-__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
-                                        real *gateValue,
-                                        real *resetOutputValue,
-                                        real *prevOutputValue,
-                                        int frameSize,
-                                        int batchSize,
-                                        hl_activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    resetOutputValue += batchIdx * frameSize;
-  }
-
-  real rPrevOut = 0;
-  real rValueResetOutput;
-  real rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  real rValueResetGate  = gateValue[frameIdx + frameSize * 1];
-
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
-  }
-
-  opResetOutput(rValueUpdateGate,
-                rValueResetGate,
-                rPrevOut,
-                rValueResetOutput,
-                hppl::gpu::forward[active_gate]);
-
-  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
-  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
-  resetOutputValue[frameIdx] = rValueResetOutput;
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpFinalOutput, bool isBatch>
-__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
-                                        real *gateValue,
-                                        real *prevOutputValue,
-                                        real *outputValue,
-                                        int frameSize,
-                                        int batchSize,
-                                        hl_activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    outputValue += batchIdx * frameSize;
-  }
-
-  real rOutput;
-  real rPrevOut = 0;
-  real rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
-  real rValueFrameState = gateValue[frameIdx + frameSize * 2];
-
-  if (prevOutputValue) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
-    rPrevOut = prevOutputValue[frameIdx];
-  }
-
-  opFinalOutput(rValueUpdateGate,
-                rValueFrameState,
-                rPrevOut,
-                rOutput,
-                hppl::gpu::forward[active_node]);
-
-  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
-  outputValue[frameIdx] = rOutput;
-}
-
-template<class OpResetOutput, class OpFinalOutput>
-void hl_gpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (value.prevOutValue) {
-    hl_matrix_mul(value.prevOutValue, HPPL_OP_N,
-                  value.gateWeight, HPPL_OP_N,
-                  value.gateValue,
-                  batchSize, 2*frameSize, frameSize,
-                  /*alpha = */ 1, /*beta = */ 1,
-                  frameSize, 2* frameSize, 3*frameSize);
-  }
-
-  if (batchSize == 1) {
-    KeGruForwardResetOutput<OpResetOutput, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, batchSize, active_gate);
-  } else {
-    KeGruForwardResetOutput<OpResetOutput, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetOutput,
-        value.gateValue, value.resetOutputValue, value.prevOutValue,
-        frameSize, batchSize, active_gate);
-  }
-
-  if (value.prevOutValue) {
-    hl_matrix_mul(value.resetOutputValue, HPPL_OP_N,
-                  value.stateWeight, HPPL_OP_N,
-                  value.gateValue + 2*frameSize,
-                  batchSize, frameSize, frameSize,
-                  /*alpha = */ 1, /*beta = */ 1,
-                  frameSize, frameSize, 3*frameSize);
-  }
-
-  if (batchSize == 1) {
-    KeGruForwardFinalOutput<OpFinalOutput, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, batchSize, active_node);
-  } else {
-    KeGruForwardFinalOutput<OpFinalOutput, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opFinalOutput,
-        value.gateValue, value.prevOutValue, value.outputValue,
-        frameSize, batchSize, active_node);
-  }
-
-  CHECK_SYNC("hl_gpu_gru_forward failed");
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpStateGrad, bool isBatch>
-__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad,
-                                       real *gateValue,
-                                       real *gateGrad,
-                                       real *prevOutValue,
-                                       real *prevOutGrad,
-                                       real *outputGrad,
-                                       int frameSize,
-                                       int batchSize,
-                                       hl_activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad  += batchIdx * 3 * frameSize;
-    outputGrad += batchIdx * frameSize;
-  }
-
-  real rUpdateGateGrad;
-  real rFrameStateGrad;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  real rFrameStateValue = gateValue[frameIdx + frameSize * 2];
-  real rOutGrad  = outputGrad[frameIdx];
-
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
-
-    if (isBatch) prevOutGrad  += batchIdx * frameSize;
-    rPrevOutGrad  = prevOutGrad[frameIdx];
-  }
-
-  opStateGrad(rUpdateGateValue,
-              rUpdateGateGrad,
-              rFrameStateValue,
-              rFrameStateGrad,
-              rPrevOutValue,
-              rPrevOutGrad,
-              rOutGrad,
-              hppl::gpu::backward[active_node]);
-
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
-  }
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class OpResetGrad, bool isBatch>
-__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad,
-                                       real *gateValue,
-                                       real *gateGrad,
-                                       real *prevOutValue,
-                                       real *prevOutGrad,
-                                       real *resetOutputGrad,
-                                       int frameSize,
-                                       int batchSize,
-                                       hl_activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    gateValue += batchIdx * 3 * frameSize;
-    gateGrad  += batchIdx * 3 * frameSize;
-    resetOutputGrad += batchIdx * frameSize;
-  }
-
-  real rResetGateGrad;
-  real rPrevOutValue = 0;
-  real rPrevOutGrad  = 0;
-  real rResetOutputGrad = 0;
-  real rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
-  real rUpdateGateGrad  = gateGrad[frameIdx + frameSize * 0];
-  real rResetGateValue  = gateValue[frameIdx + frameSize * 1];
-
-  if (prevOutValue && prevOutGrad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
-    if (isBatch) prevOutGrad  += batchIdx * frameSize;
-    rPrevOutValue = prevOutValue[frameIdx];
-    rPrevOutGrad  = prevOutGrad[frameIdx];
-    rResetOutputGrad = resetOutputGrad[frameIdx];
-  }
-
-  opResetGrad(rUpdateGateValue,
-              rUpdateGateGrad,
-              rResetGateValue,
-              rResetGateGrad,
-              rPrevOutValue,
-              rPrevOutGrad,
-              rResetOutputGrad,
-              hppl::gpu::backward[active_gate]);
-
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
-  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
-  if (prevOutGrad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
-  }
-}
-
-template<class OpStateGrad, class OpResetGrad>
-void hl_gpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (batchSize == 1) {
-    KeGruBackwardStateGrad<OpStateGrad, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, batchSize, active_node);
-  } else {
-    KeGruBackwardStateGrad<OpStateGrad, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opStateGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.outputGrad, frameSize, batchSize, active_node);
-  }
-
-  if (value.prevOutValue && grad.prevOutGrad) {
-    hl_matrix_mul(grad.gateGrad + 2*frameSize, HPPL_OP_N,
-                  value.stateWeight, HPPL_OP_T,
-                  grad.resetOutputGrad,
-                  batchSize, frameSize, frameSize,
-                  /*alpha = */ 1, /*beta = */ 0,
-                  3*frameSize, frameSize, frameSize);
-    if (grad.stateWeightGrad) {
-      hl_matrix_mul(value.resetOutputValue, HPPL_OP_T,
-                    grad.gateGrad + 2*frameSize, HPPL_OP_N,
-                    grad.stateWeightGrad,
-                    frameSize, frameSize, batchSize,
-                    /*alpha = */ 1, /*beta = */ 1,
-                    frameSize, 3*frameSize, frameSize);
-    }
-  }
-
-  if (batchSize == 1) {
-    KeGruBackwardResetGrad<OpResetGrad, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, batchSize, active_gate);
-  } else {
-    KeGruBackwardResetGrad<OpResetGrad, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetGrad,
-        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
-        grad.resetOutputGrad, frameSize, batchSize, active_gate);
-  }
-
-  if (grad.prevOutGrad && value.prevOutValue) {
-    hl_matrix_mul(grad.gateGrad, HPPL_OP_N,
-                  value.gateWeight, HPPL_OP_T,
-                  grad.prevOutGrad,
-                  batchSize, frameSize, 2*frameSize,
-                  /*alpha = */ 1, /*beta = */ 1,
-                  3*frameSize, 2*frameSize, frameSize);
-    if (grad.gateWeightGrad) {
-      hl_matrix_mul(value.prevOutValue, HPPL_OP_T,
-                    grad.gateGrad, HPPL_OP_N,
-                    grad.gateWeightGrad,
-                    frameSize, 2*frameSize, batchSize,
-                    /*alpha = */ 1, /*beta = */ 1,
-                    frameSize, 3*frameSize, 2*frameSize);
-    }
-  }
-
-  CHECK_SYNC("hl_gpu_gru_backward failed");
-}
-
-#else
-
-template<class OpResetOutput, class OpFinalOutput>
-void hl_gpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {}
-
-template<class OpStateGrad, class OpResetGrad>
-void hl_gpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {}
-
-#endif
-
-#endif /* HL_GPU_GRU_CUH_ */
diff --git a/paddle/cuda/include/hl_gpu_lstm.cuh b/paddle/cuda/include/hl_gpu_lstm.cuh
deleted file mode 100644
index 92517a44d2353a42d905708fc9aa98727a13a9e9..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_gpu_lstm.cuh
+++ /dev/null
@@ -1,300 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-#ifndef HL_GPU_LSTM_CUH_
-#define HL_GPU_LSTM_CUH_
-
-#ifdef __NVCC__
-
-#include "paddle/utils/Logging.h"
-#include "hl_device_functions.cuh"
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class Op, bool isBatch>
-__global__ void KeLstmForward(Op op,
-                              hl_lstm_value value,
-                              int frameSize,
-                              int batchSize,
-                              hl_activation_mode_t active_node,
-                              hl_activation_mode_t active_gate,
-                              hl_activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    value.gateValue += batchIdx * frameSize * 4;
-    value.outputValue += batchIdx * frameSize;
-    value.stateValue  += batchIdx * frameSize;
-    value.stateActiveValue += batchIdx * frameSize;
-  }
-
-  real rState;
-  real rPrevState = 0;
-  real rStateAtv;
-  real rOut;
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rCheckI = value.checkIg[frameIdx];
-  real rCheckF = value.checkFg[frameIdx];
-  real rCheckO = value.checkOg[frameIdx];
-
-  rValueIn = value.gateValue[frameIdx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
-
-  if (value.prevStateValue) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
-    rPrevState = value.prevStateValue[frameIdx];
-  }
-
-  op(rValueIn,
-     rValueIg,
-     rValueFg,
-     rValueOg,
-     rPrevState,
-     rState,
-     rStateAtv,
-     rOut,
-     rCheckI,
-     rCheckF,
-     rCheckO,
-     hppl::gpu::forward[active_node],
-     hppl::gpu::forward[active_gate],
-     hppl::gpu::forward[active_state]);
-
-  value.gateValue[frameIdx] = rValueIn;
-  value.gateValue[frameIdx + frameSize] = rValueIg;
-  value.gateValue[frameIdx + frameSize * 2] = rValueFg;
-  value.gateValue[frameIdx + frameSize * 3] = rValueOg;
-
-  value.stateValue[frameIdx] = rState;
-  value.stateActiveValue[frameIdx] = rStateAtv;
-  value.outputValue[frameIdx] = rOut;
-}
-
-/*
- * threads(framePerBlock, batchPerBlock)
- * grid(frameBlocks, batchBlocks)
- */
-template<class Op, bool isBatch>
-__global__ void KeLstmBackward(Op op,
-                               hl_lstm_value value,
-                               hl_lstm_grad grad,
-                               int frameSize,
-                               int batchSize,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate,
-                               hl_activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
-
-  int batchIdx = 0;
-  if (isBatch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
-    value.gateValue += batchIdx * frameSize * 4;
-    value.stateValue += batchIdx * frameSize;
-    value.stateActiveValue += batchIdx * frameSize;
-    grad.gateGrad += batchIdx * frameSize * 4;
-    grad.stateGrad += batchIdx * frameSize;
-    grad.outputGrad += batchIdx * frameSize;
-  }
-
-  real rValueIn;
-  real rValueIg;
-  real rValueFg;
-  real rValueOg;
-  real rGradIn;
-  real rGradIg;
-  real rGradFg;
-  real rGradOg;
-  real rPrevState = 0;
-  real rPrevStateGrad;
-  real rState;
-  real rStateGrad;
-  real rStateAtv;
-  real rOutputGrad;
-  real rCheckI = value.checkIg[frameIdx];
-  real rCheckF = value.checkFg[frameIdx];
-  real rCheckO = value.checkOg[frameIdx];
-  real rCheckIGrad;
-  real rCheckFGrad;
-  real rCheckOGrad;
-
-  rValueIn = value.gateValue[frameIdx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
-  rState = value.stateValue[frameIdx];
-  rStateAtv = value.stateActiveValue[frameIdx];
-  rOutputGrad = grad.outputGrad[frameIdx];
-  rStateGrad = grad.stateGrad[frameIdx];
-
-  if (value.prevStateValue) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
-    rPrevState = value.prevStateValue[frameIdx];
-  }
-
-  op(rValueIn,
-     rValueIg,
-     rValueFg,
-     rValueOg,
-     rGradIn,
-     rGradIg,
-     rGradFg,
-     rGradOg,
-     rPrevState,
-     rPrevStateGrad,
-     rState,
-     rStateGrad,
-     rStateAtv,
-     rOutputGrad,
-     rCheckI,
-     rCheckF,
-     rCheckO,
-     rCheckIGrad,
-     rCheckFGrad,
-     rCheckOGrad,
-     hppl::gpu::backward[active_node],
-     hppl::gpu::backward[active_gate],
-     hppl::gpu::backward[active_state]);
-
-  grad.gateGrad[frameIdx] = rGradIn;
-  grad.gateGrad[frameIdx + frameSize    ] = rGradIg;
-  grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
-  grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
-  grad.stateGrad[frameIdx] = rStateGrad;
-  if (grad.prevStateGrad) {
-    if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
-    grad.prevStateGrad[frameIdx] = rPrevStateGrad;
-  }
-
-  if (isBatch) {
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
-      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
-    }
-    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
-  } else {
-    if (value.prevStateValue) {
-      if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
-      if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
-    }
-    if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
-  }
-}
-
-template<class Op>
-void hl_gpu_lstm_forward(Op op,
-                         hl_lstm_value value,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate,
-                         hl_activation_mode_t active_state) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    /* framePerBlock = 32 batchPerBlock = 32 */
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (batchSize == 1) {
-    KeLstmForward<Op, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  } else {
-    KeLstmForward<Op, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  }
-
-  CHECK_SYNC("hl_gpu_lstm_forward failed");
-}
-
-template<class Op>
-void hl_gpu_lstm_backward(Op op,
-                          hl_lstm_value value,
-                          hl_lstm_grad grad,
-                          int frameSize,
-                          int batchSize,
-                          hl_activation_mode_t active_node,
-                          hl_activation_mode_t active_gate,
-                          hl_activation_mode_t active_state) {
-  dim3 threads;
-  dim3 grid;
-  if (batchSize == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
-    grid = dim3(frameBlocks, 1);
-  } else {
-    /* framePerBlock = 32 batchPerBlock = 32 */
-    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
-  }
-
-  if (batchSize == 1) {
-    KeLstmBackward<Op, /* isBatch= */false>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value, grad,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  } else {
-    KeLstmBackward<Op, /* isBatch= */true>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value, grad,
-      frameSize, batchSize, active_node, active_gate, active_state);
-  }
-
-  CHECK_SYNC("hl_gpu_lstm_backward failed");
-}
-
-#else
-
-template<class Op>
-void hl_gpu_lstm_forward(Op op,
-                         hl_lstm_value value,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate,
-                         hl_activation_mode_t active_state) {}
-
-template<class Op>
-void hl_gpu_lstm_backward(Op op,
-                          hl_lstm_value value,
-                          hl_lstm_grad grad,
-                          int frameSize,
-                          int batchSize,
-                          hl_activation_mode_t active_node,
-                          hl_activation_mode_t active_gate,
-                          hl_activation_mode_t active_state) {}
-
-#endif
-
-#endif /* HL_GPU_LSTM_CUH_ */
diff --git a/paddle/cuda/include/hl_gpu_matrix_kernel.cuh b/paddle/cuda/include/hl_gpu_matrix_kernel.cuh
deleted file mode 100644
index 0db023ce3745f95ced8b3a33a1d6bcb20066b2ef..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_gpu_matrix_kernel.cuh
+++ /dev/null
@@ -1,629 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-
-
-#ifndef HL_GPU_MATRIX_KERNEL_CUH_
-#define HL_GPU_MATRIX_KERNEL_CUH_
-
-#include <algorithm>
-#include "paddle/utils/Logging.h"
-#include "hl_base.h"
-
-#ifdef __NVCC__
-/* gpu apply interface */
-
-template<class T, class Op>
-__global__ void KeEltWiseUnaryOp(T* A_d, const int border, Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx]);
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseUnaryOp(T* A_d,
-                                 int dimM,
-                                 int dimN,
-                                 int lda,
-                                 Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      op.gpuOperator(A_d[i * lda + j]);
-    }
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseBinaryOp(T* A_d, T *B_d, const int border, Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx], B_d[idx]);
-  }
-}
-
-template<class T, class Op, bool BAsRowVector, bool BAsColVector>
-__global__ void KeEltWiseBinaryOp(T *A_d,
-                                  T *B_d,
-                                  int dimM,
-                                  int dimN,
-                                  int lda,
-                                  int ldb,
-                                  Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      if (BAsRowVector == 0 && BAsColVector == 0) {
-        op.gpuOperator(A_d[i * lda + j], B_d[i * ldb + j]);
-      } else if (BAsRowVector == 1 && BAsColVector == 0) {
-        op.gpuOperator(A_d[i * lda + j], B_d[j]);
-      } else if (BAsRowVector == 0 && BAsColVector == 1) {
-        op.gpuOperator(A_d[i * lda + j], B_d[i * ldb]);
-      } else {
-        op.gpuOperator(A_d[i * lda + j], B_d[0]);
-      }
-    }
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseTernaryOp(T* A_d,
-                                   T *B_d,
-                                   T *C_d,
-                                   const int border,
-                                   Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx], B_d[idx], C_d[idx]);
-  }
-}
-
-template<class T, class Op, bool CAsRowVector, bool CAsColVector>
-__global__ void KeEltWiseTernaryOp(T* A_d,
-                                   T* B_d,
-                                   T* C_d,
-                                   int dimM,
-                                   int dimN,
-                                   int lda,
-                                   int ldb,
-                                   int ldc,
-                                   Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      if (CAsRowVector == 0 && CAsColVector == 0) {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc + j]);
-      } else if (CAsRowVector == 1 && CAsColVector == 0) {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[j]);
-      } else if (CAsRowVector == 0 && CAsColVector == 1) {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc]);
-      } else {
-        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[0]);
-      }
-    }
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseQuaternaryOp(T* A_d,
-                                      T* B_d,
-                                      T* C_d,
-                                      T* D_d,
-                                      const int border,
-                                      Op op) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    op.gpuOperator(A_d[idx], B_d[idx], C_d[idx], D_d[idx]);
-  }
-}
-
-template<class T, class Op>
-__global__ void KeEltWiseQuaternaryOp(T* A_d,
-                                      T* B_d,
-                                      T* C_d,
-                                      T* D_d,
-                                      int dimM,
-                                      int dimN,
-                                      int lda,
-                                      int ldb,
-                                      int ldc,
-                                      int ldd,
-                                      Op op) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
-      op.gpuOperator(A_d[i*lda + j],
-        B_d[i*ldb + j], C_d[i*ldc + j], D_d[i*ldd + j]);
-    }
-  }
-}
-
-/**
- * @brief   gpu element wise unary operator.
- */
-template <class T, class Op>
-void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {
-  CHECK_NOTNULL(A_d);
-
-  if (dimM == 1 || dimN == lda) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseUnaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseUnaryOp<T, Op><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, dimM, dimN, lda, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_unary_op failed");
-}
-
-/**
- * @brief   gpu element wise binary operator.
- */
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-void hl_gpu_apply_binary_op(Op op,
-                            T* A_d,
-                            T* B_d,
-                            int dimM,
-                            int dimN,
-                            int lda,
-                            int ldb) {
-  CHECK_NOTNULL(A_d);
-
-  if ((BAsRowVector == 0 && BAsColVector == 0) &&
-      ((dimM == 1) || (dimN == lda && dimN == ldb))) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseBinaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseBinaryOp<T, Op, BAsRowVector, BAsColVector>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, dimM, dimN, lda, ldb, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_binary_op failed");
-}
-
-/**
- * @brief   gpu element wise ternary operator.
- */
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-void hl_gpu_apply_ternary_op(Op op,
-                             T* A_d,
-                             T* B_d,
-                             T* C_d,
-                             int dimM,
-                             int dimN,
-                             int lda,
-                             int ldb,
-                             int ldc) {
-  CHECK_NOTNULL(A_d);
-
-  if ((CAsRowVector == 0 && CAsColVector == 0) &&
-      ((dimM == 1) || (dimN == lda && dimN == ldb && dimN == ldc))) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseTernaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseTernaryOp<T, Op, CAsRowVector, CAsColVector>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, dimM, dimN, lda, ldb, ldc, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_ternary_op failed");
-}
-
-
-/**
- * @brief   gpu element wise quaternary operator.
- */
-template <class T, class Op>
-void hl_gpu_apply_quaternary_op(Op op,
-                                T* A_d,
-                                T* B_d,
-                                T* C_d,
-                                T* D_d,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldb,
-                                int ldc,
-                                int ldd) {
-  CHECK_NOTNULL(A_d);
-
-  if ((dimM == 1) ||
-      (dimN == lda && dimN == ldb && dimN == ldc && dimN == ldd)) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    KeEltWiseQuaternaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, D_d, size, op);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    KeEltWiseQuaternaryOp<T, Op><<<grid, threads, 0, STREAM_DEFAULT>>>
-      (A_d, B_d, C_d, D_d, dimM, dimN, lda, ldb, ldc, ldd, op);
-  }
-
-  CHECK_SYNC("hl_gpu_apply_quaternary_op failed");
-}
-
-#else
-
-template <class T, class Op>
-void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {}
-
-template <class T, class Op, bool BAsRowVector, bool BAsColVector>
-void hl_gpu_apply_binary_op(Op op,
-                            T* A_d,
-                            T* B_d,
-                            int dimM,
-                            int dimN,
-                            int lda,
-                            int ldb) {}
-
-template <class T, class Op, bool CAsRowVector, bool CAsColVector>
-void hl_gpu_apply_ternary_op(Op op,
-                             T* A_d,
-                             T* B_d,
-                             T* C_d,
-                             int dimM,
-                             int dimN,
-                             int lda,
-                             int ldb,
-                             int ldc) {}
-
-template <class T, class Op>
-void hl_gpu_apply_quaternary_op(Op op,
-                                T* A_d,
-                                T* B_d,
-                                T* C_d,
-                                T* D_d,
-                                int dimM,
-                                int dimN,
-                                int lda,
-                                int ldb,
-                                int ldc,
-                                int ldd) {}
-#endif
-
-#ifdef __NVCC__
-/**
- * @brief   matrix row operator.
- */
-
-template<class Agg, class Op>
-__device__ __inline__ real sumRow(Agg agg, Op op,
-                                  int idx, int blockSize,
-                                  int dimN, real *A) {
-  real tmp = agg.init();
-  int cnt = (dimN + blockSize -1) / blockSize;
-  for (int i = 0; i < cnt && idx < dimN; i++) {
-      tmp = agg(tmp, op(A[idx]));
-      idx += blockSize;
-  }
-  return tmp;
-}
-
-template<class Agg, class Op>
-__device__ __inline__ real sumRow(Agg agg, Op op,
-                                  int idx, int blockSize,
-                                  int dimN, real *A, real *B) {
-  real tmp = agg.init();
-  int cnt = (dimN + blockSize -1) / blockSize;
-  for (int i = 0; i < cnt && idx < dimN; i++) {
-    tmp = agg(tmp, op(A[idx], B[idx]));
-    idx += blockSize;
-  }
-  return tmp;
-}
-
-template<class Agg>
-__device__ __inline__ void aggRow(Agg agg, real *row, int size, int tid) {
-  for (int stride = size/2; stride > 0; stride = stride/2) {
-    if (tid < stride) {
-      row[tid] = agg(row[tid], row[tid + stride]);
-    }
-    __syncthreads();
-  }
-}
-
-template<class Agg, class Op, class Saver, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv,
-                              int dimN,
-                              real *dst, int ld,
-                              real *A, int lda) {
-  __shared__ real row_s[blockSize];
-  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
-  int tid = threadIdx.x;
-
-  A += rowId*lda;
-  row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A);
-  __syncthreads();
-
-  aggRow(agg, row_s, blockSize, tid);
-  __syncthreads();
-
-  if (tid == 0) {
-    dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]);
-  }
-}
-
-template<class Agg, class Op, class Saver, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv,
-                              int dimN,
-                              real *dst, int ld,
-                              real *A, int lda,
-                              real *B, int ldb) {
-  __shared__ real row_s[blockSize];
-  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
-  int tid = threadIdx.x;
-
-  A += rowId*lda;
-  B += rowId*ldb;
-  row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A, B);
-  __syncthreads();
-
-  aggRow(agg, row_s, blockSize, tid);
-  __syncthreads();
-
-  if (tid == 0) {
-    dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]);
-  }
-}
-
-/**
- * @brief   matrix column operator.
- */
-template <class Agg, class Op>
-__device__ __inline__ real sumCol(Agg agg, Op op,
-                                  int index, int stride,
-                                  int dimM, real *A, int lda) {
-  real tmp = agg.init();
-  for (; index < dimM;) {
-    tmp = agg(tmp, op(A[index*lda]));
-    index += stride;
-  }
-  return tmp;
-}
-
-template <class Agg, class Op>
-__device__ __inline__ real sumCol(Agg agg, Op op,
-                                  int index, int stride, int dimM,
-                                  real *A, int lda, real *B, int ldb) {
-  real tmp = agg.init();
-  for (; index < dimM;) {
-    tmp = agg(tmp, op(A[index*lda], B[index*ldb]));
-    index += stride;
-  }
-  return tmp;
-}
-
-template <class Agg, class Op, class Saver>
-__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst,
-                                 real *A, int lda) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    real tmp = sumCol(agg, op, 0, 1, dimM, A, lda);
-    dst[rowIdx] = sv(dst[rowIdx], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv,
-                                   int dimM, int dimN,
-                                   real *dst,
-                                   real *A, int lda) {
-  __shared__ real col_s[blockDimX*blockDimY];
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    real tmp = sumCol(agg, op, threadIdx.y, blockDimY, dimM, A, lda);
-    col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp;
-  }
-  __syncthreads();
-
-  if (rowIdx < dimN) {
-    if (threadIdx.y ==0) {
-      real tmp = agg.init();
-      for (int i=0; i < blockDimY; i++) {
-        tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]);
-      }
-      dst[rowIdx] = sv(dst[rowIdx], tmp);
-    }
-  }
-}
-
-template <class Agg, class Op, class Saver>
-__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv,
-                                 int dimM, int dimN,
-                                 real *dst,
-                                 real *A, int lda,
-                                 real *B, int ldb) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    B += rowIdx;
-    real tmp = sumCol(agg, op, 0, 1, dimM, A, lda, B, ldb);
-    dst[rowIdx] = sv(dst[rowIdx], tmp);
-  }
-}
-
-template <class Agg, class Op, class Saver, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv,
-                                   int dimM, int dimN,
-                                   real *dst,
-                                   real *A, int lda,
-                                   real *B, int ldb) {
-  __shared__ real col_s[blockDimX*blockDimY];
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-  if (rowIdx < dimN) {
-    A += rowIdx;
-    B += rowIdx;
-    real tmp = sumCol(agg, op,
-        threadIdx.y, blockDimY, dimM, A, lda, B, ldb);
-    col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp;
-  }
-  __syncthreads();
-
-  if (rowIdx < dimN) {
-    if (threadIdx.y ==0) {
-      real tmp = agg.init();
-      for (int i=0; i < blockDimY; i++) {
-        tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]);
-      }
-      dst[rowIdx] = sv(dst[rowIdx], tmp);
-    }
-  }
-}
-
-#endif
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda) {
-#ifdef __NVCC__
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(A);
-
-  int blocksX = dimM;
-  int blocksY = 1;
-  dim3 threads(128, 1);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixRowOp<Agg, Op, Saver, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (agg, op, sv, dimN, dst, ld, A, lda);
-
-  CHECK_SYNC("hl_matrix_row_op failed");
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
-                          int dimM, int dimN,
-                          real *dst, int ld,
-                          real *A, int lda,
-                          real *B, int ldb) {
-#ifdef __NVCC__
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(A);
-
-  int blocksX = dimM;
-  int blocksY = 1;
-  dim3 threads(128, 1);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixRowOp<Agg, Op, Saver, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
-    (agg, op, sv, dimN, dst, ld, A, lda, B, ldb);
-
-  CHECK_SYNC("hl_matrix_row_op failed");
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda) {
-#ifdef __NVCC__
-  if (dimN >= 8192) {
-    int blocksX = (dimN + 128 -1) / 128;
-    int blocksY = 1;
-    dim3 threads(128, 1);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg, Op, Saver>
-        <<< grid, threads, 0, STREAM_DEFAULT >>>
-        (agg, op, sv, dimM, dimN, dst, A, lda);
-  } else {
-    int blocksX = (dimN + 32 -1) / 32;
-    int blocksY = 1;
-    dim3 threads(32, 32);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, Op, Saver, 32, 32>
-        <<< grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, op, sv, dimM, dimN, dst, A, lda);
-  }
-
-  CHECK_SYNC("hl_matrix_column_op failed");
-#endif
-}
-
-template <class Agg, class Op, class Saver>
-void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
-                             int dimM, int dimN,
-                             real *dst,
-                             real *A, int lda,
-                             real *B, int ldb) {
-#ifdef __NVCC__
-  if (dimN >= 8192) {
-    int blocksX = (dimN + 128 -1) / 128;
-    int blocksY = 1;
-    dim3 threads(128, 1);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg, Op, Saver>
-        <<< grid, threads, 0, STREAM_DEFAULT >>>
-        (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  } else {
-    int blocksX = (dimN + 32 -1) / 32;
-    int blocksY = 1;
-    dim3 threads(32, 32);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, Op, Saver, 32, 32>
-        <<< grid, threads, 0, STREAM_DEFAULT>>>
-        (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
-  }
-
-  CHECK_SYNC("hl_matrix_column_op failed");
-#endif
-}
-
-#endif /* HL_GPU_MATRIX_KERNEL_CUH_ */
diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h
deleted file mode 100644
index 85a022ff5e26daab97be52b7ea9814c6b8078561..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ /dev/null
@@ -1,536 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_TENSOR_OPS_H_
-#define HL_TENSOR_OPS_H_
-
-#include <cmath>
-#include "hl_matrix_type.cuh"
-
-namespace hppl {
-namespace unary {
-
-template <class T>
-class add_scale {
-private:
-  const T p;
-
-public:
-  INLINE add_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a + p; }
-};
-
-template <class T>
-class sub_scale {
-private:
-  const T p;
-
-public:
-  INLINE sub_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a - p; }
-};
-
-template <class T>
-class mul_scale {
-private:
-  const T p;
-
-public:
-  INLINE mul_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a * p; }
-};
-
-template <class T>
-class div_scale {
-private:
-  const T p;
-
-public:
-  INLINE div_scale(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a / p; }
-};
-
-template <class T>
-class neg {
-public:
-  INLINE T operator()(const T a) const { return -a; }
-};
-
-template <class T>
-class exp_op {
-public:
-  INLINE T operator()(const T a) const { return std::exp(a); }
-};
-
-template <class T>
-class log_op {
-public:
-  INLINE T operator()(const T a) const { return std::log(a); }
-};
-
-template <class T>
-class sqrt_op {
-public:
-  INLINE T operator()(const T a) const { return std::sqrt(a); }
-};
-
-template <class T>
-class square {
-public:
-  INLINE T operator()(const T a) const { return a * a; }
-};
-
-template <class T>
-class reciprocal {
-public:
-  INLINE T operator()(const T a) const { return T(1) / a; }
-};
-
-template <class T>
-class abs {
-public:
-  INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
-};
-
-template <class T>
-class sign {
-public:
-  INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
-};
-
-template <class T>
-class min {
-private:
-  const T p;
-
-public:
-  INLINE min(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a > p ? p : a; }
-};
-
-template <class T>
-class max {
-private:
-  const T p;
-
-public:
-  INLINE max(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return a < p ? p : a; }
-};
-
-template <class T>
-class pow_op {
-private:
-  const T p;
-
-public:
-  INLINE pow_op(const T s) : p(s) {}
-  INLINE T operator()(const T a) const { return std::pow(a, p); }
-};
-
-template <class T>
-class constant {
-private:
-  const T p;
-
-public:
-  INLINE constant(const T s) : p(s) {}
-  INLINE T operator()(int i) const { return p; }
-  INLINE T operator()(int i, int j) const { return p; }
-};
-
-template <class T>
-class cmp_eq {
-private:
-  const T p;
-
-public:
-  INLINE cmp_eq(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a == p; }
-};
-
-template <class T>
-class cmp_ne {
-private:
-  const T p;
-
-public:
-  INLINE cmp_ne(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a != p; }
-};
-
-template <class T>
-class cmp_le {
-private:
-  const T p;
-
-public:
-  INLINE cmp_le(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a <= p; }
-};
-
-template <class T>
-class cmp_lt {
-private:
-  const T p;
-
-public:
-  INLINE cmp_lt(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a < p; }
-};
-
-template <class T>
-class cmp_ge {
-private:
-  const T p;
-
-public:
-  INLINE cmp_ge(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a >= p; }
-};
-
-template <class T>
-class cmp_gt {
-private:
-  const T p;
-
-public:
-  INLINE cmp_gt(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a > p; }
-};
-
-template <class T>
-class and_op {
-private:
-  const T p;
-
-public:
-  INLINE and_op(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a && p; }
-};
-
-template <class T>
-class or_op {
-private:
-  const T p;
-
-public:
-  INLINE or_op(const T s) : p(s) {}
-  INLINE bool operator()(const T a) const { return a || p; }
-};
-
-}  // namespace unary
-
-namespace binary {
-template <class T>
-class add {
-public:
-  INLINE T operator()(const T a, const T b) const { return a + b; }
-};
-
-template <class T>
-class add_scale {
-private:
-  const T p1;
-  const T p2;
-
-public:
-  INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
-  INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
-};
-
-template <class T>
-class sub {
-public:
-  INLINE T operator()(const T a, const T b) const { return a - b; }
-};
-
-template <class T>
-class mul {
-public:
-  INLINE T operator()(const T a, const T b) const { return a * b; }
-};
-
-template <class T>
-class div {
-public:
-  INLINE T operator()(const T a, const T b) const { return a / b; }
-};
-
-template <class T>
-class cmp_eq {
-public:
-  INLINE bool operator()(const T a, const T b) const { return a == b; }
-};
-
-template <class T>
-class cmp_ne {
-public:
-  INLINE bool operator()(const T a, const T b) const { return a != b; }
-};
-
-template <class T>
-class cmp_le {
-public:
-  INLINE bool operator()(const T a, const T b) const { return a <= b; }
-};
-
-template <class T>
-class cmp_lt {
-public:
-  INLINE bool operator()(const T a, const T b) const { return a < b; }
-};
-
-template <class T>
-class cmp_ge {
-public:
-  INLINE bool operator()(const T a, const T b) const { return a >= b; }
-};
-
-template <class T>
-class cmp_gt {
-public:
-  INLINE bool operator()(const T a, const T b) const { return a > b; }
-};
-
-template <class T>
-class and_op {
-public:
-  INLINE bool operator()(const T a, const T b) const { return a && b; }
-};
-
-template <class T>
-class or_op {
-public:
-  INLINE bool operator()(const T a, const T b) const { return a || b; }
-};
-
-template <class T>
-class min {
-public:
-  INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
-};
-
-template <class T>
-class max {
-public:
-  INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
-};
-
-#ifdef PADDLE_USE_SSE3
-#ifndef PADDLE_TYPE_DOUBLE
-template <>
-class add<__m128> {
-public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(a, b);
-  }
-};
-
-template <>
-class add_scale<__m128> {
-private:
-  const __m128 p1;
-  const __m128 p2;
-
-public:
-  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
-  }
-};
-
-template <>
-class sub<__m128> {
-public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_sub_ps(a, b);
-  }
-};
-
-template <>
-class mul<__m128> {
-public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_mul_ps(a, b);
-  }
-};
-
-template <>
-class div<__m128> {
-public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_div_ps(a, b);
-  }
-};
-
-template <>
-class min<__m128> {
-public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_min_ps(a, b);
-  }
-};
-
-template <>
-class max<__m128> {
-public:
-  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
-    return _mm_max_ps(a, b);
-  }
-};
-#else
-template <>
-class add<__m128d> {
-public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(a, b);
-  }
-};
-
-template <>
-class add_scale<__m128d> {
-private:
-  const __m128d p1;
-  const __m128d p2;
-
-public:
-  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
-  }
-};
-
-template <>
-class sub<__m128d> {
-public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_sub_pd(a, b);
-  }
-};
-
-template <>
-class mul<__m128d> {
-public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_mul_pd(a, b);
-  }
-};
-
-template <>
-class div<__m128d> {
-public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_div_pd(a, b);
-  }
-};
-
-template <>
-class min<__m128d> {
-public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_min_pd(a, b);
-  }
-};
-
-template <>
-class max<__m128d> {
-public:
-  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
-    return _mm_max_pd(a, b);
-  }
-};
-#endif  // PADDLE_TYPE_DOUBLE
-#endif  // PADDLE_USE_SSE3
-
-#ifdef PADDLE_USE_NEON
-#ifndef PADDLE_TYPE_DOUBLE
-template <>
-class add<float32x4_t> {
-public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vaddq_f32(a, b);
-  }
-};
-
-template <>
-class add_scale<float32x4_t> {
-private:
-  const float32x4_t p1;
-  const float32x4_t p2;
-
-public:
-  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
-      : p1(s1), p2(s2) {}
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
-  }
-};
-
-template <>
-class sub<float32x4_t> {
-public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vsubq_f32(a, b);
-  }
-};
-
-template <>
-class mul<float32x4_t> {
-public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vmulq_f32(a, b);
-  }
-};
-
-template <>
-class div<float32x4_t> {
-public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    float32x4_t tmp = vrecpeq_f32(b);
-    return vmulq_f32(a, tmp);
-  }
-};
-
-template <>
-class min<float32x4_t> {
-public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vminq_f32(a, b);
-  }
-};
-
-template <>
-class max<float32x4_t> {
-public:
-  INLINE float32x4_t operator()(const float32x4_t a,
-                                const float32x4_t b) const {
-    return vmaxq_f32(a, b);
-  }
-};
-#else
-#error To be implemented
-#endif  // PADDLE_TYPE_DOUBLE
-#endif  // PADDLE_USE_NEON
-
-}  // namespace binary
-}  // namespace hppl
-
-#endif  // HL_TENSOR_OPS_H_
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
deleted file mode 100644
index c39bd3228d3f2ea7495cd21f5ff60bdfbbd2b51d..0000000000000000000000000000000000000000
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef HL_CNN_STUB_H_
-#define HL_CNN_STUB_H_
-
-#include "hl_cnn.h"
-
-inline void hl_maxpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               real* MaskData) {}
-
-inline void hl_maxpool_backward(const int frameCnt,
-                                const real* inputData,
-                                const real* outData,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                const int paddingH,
-                                const int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* targetGrad,
-                                const int outStride) {}
-
-inline void hl_avgpool_forward(const int frameCnt,
-                               const real* inputData,
-                               const int channels,
-                               const int height,
-                               const int width,
-                               const int pooledH,
-                               const int pooledW,
-                               const int sizeX,
-                               const int sizeY,
-                               const int strideH,
-                               const int strideW,
-                               const int paddingH,
-                               const int paddingW,
-                               real* tgtData,
-                               const int tgtStride,
-                               const bool excludeMode) {}
-
-inline void hl_avgpool_backward(const int frameCnt,
-                                const real* outGrad,
-                                const int channels,
-                                const int height,
-                                const int width,
-                                const int pooledH,
-                                const int pooledW,
-                                const int sizeX,
-                                const int sizeY,
-                                const int strideH,
-                                const int strideW,
-                                int paddingH,
-                                int paddingW,
-                                real scaleA,
-                                real scaleB,
-                                real* backGrad,
-                                const int outStride,
-                                const bool excludeMode) {}
-
-inline void hl_maxpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 real* maxPoolIdxData,
-                                 const int tgtStride) {}
-
-inline void hl_maxpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int paddingD,
-                                  const int paddingH,
-                                  const int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* targetGrad,
-                                  real* maxPoolIdxData,
-                                  const int outStride) {}
-
-inline void hl_avgpool3D_forward(const int frameCnt,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int depth,
-                                 const int height,
-                                 const int width,
-                                 const int pooledD,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeZ,
-                                 const int sizeY,
-                                 const int sizeX,
-                                 const int strideD,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int paddingD,
-                                 const int paddingH,
-                                 const int paddingW,
-                                 real* tgtData,
-                                 const int tgtStride) {}
-
-inline void hl_avgpool3D_backward(const int frameCnt,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int depth,
-                                  const int height,
-                                  const int width,
-                                  const int pooledD,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeZ,
-                                  const int sizeY,
-                                  const int sizeX,
-                                  const int strideD,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int paddingD,
-                                  const int paddingH,
-                                  const int paddingW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* backGrad,
-                                  const int outStride) {}
-
-inline void hl_bilinear_forward(const real* inData,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t inputH,
-                                const size_t inputW,
-                                real* outData,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t outputH,
-                                const size_t outputW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {}
-
-inline void hl_bilinear_backward(real* inGrad,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t inputH,
-                                 const size_t inputW,
-                                 const real* outGrad,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t outputH,
-                                 const size_t outputW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW) {}
-
-inline void hl_maxout_forward(const real* inData,
-                              real* outData,
-                              int* idData,
-                              size_t batchSize,
-                              size_t size,
-                              size_t featLen,
-                              size_t group) {}
-
-inline void hl_maxout_backward(real* inGrad,
-                               const real* outGrad,
-                               const int* idData,
-                               size_t batchSize,
-                               size_t size,
-                               size_t featLen,
-                               size_t group) {}
-
-#endif  // HL_CNN_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/cuda/src/hl_cuda_aggregate.cu
deleted file mode 100644
index d30c264127f47da9a48acb71c59cb9e134ced127..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cuda_aggregate.cu
+++ /dev/null
@@ -1,293 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_aggregate.h"
-#include "hl_base.h"
-#include "hl_cuda.h"
-#include "hl_cuda.ph"
-#include "hl_matrix_base.cuh"
-#include "hl_thread.ph"
-#include "paddle/utils/Logging.h"
-
-/**
- * @brief   matrix row operator.
- */
-template <class Agg, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
-  __shared__ real sum_s[blockSize];
-  int cnt = (dimN + blockSize - 1) / blockSize;
-  int rowId = blockIdx.x + blockIdx.y * gridDim.x;
-  int index = rowId * dimN;
-  int tid = threadIdx.x;
-  int lmt = tid;
-
-  real tmp = agg.init();
-  for (int ii = 0; ii < cnt && lmt < dimN; ii++) {
-    tmp = agg(tmp, E[index + lmt]);
-    lmt += blockSize;
-  }
-  sum_s[tid] = tmp;
-  __syncthreads();
-
-  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
-    if (tid < stride) {
-      sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    Sum[rowId] = sum_s[0];
-  }
-}
-
-template <class Agg>
-void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
-  int blocksX = dimM;
-  int blocksY = 1;
-  dim3 threads(128, 1);
-  dim3 grid(blocksX, blocksY);
-
-  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      agg, A_d, C_d, dimN);
-}
-
-void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_row_sum failed");
-}
-
-void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_row_max failed");
-}
-
-void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_row_min failed");
-}
-
-/**
- * @brief   matrix column operator.
- */
-template <class Agg>
-__global__ void KeMatrixColumnOp(
-    Agg agg, real *E, real *Sum, int dimM, int dimN) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  real tmp = agg.init();
-  if (rowIdx < dimN) {
-    for (int index = 0; index < dimM; index++) {
-      tmp = agg(tmp, E[dimN * index + rowIdx]);
-    }
-    Sum[rowIdx] = tmp;
-  }
-}
-
-template <class Agg, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(
-    Agg agg, real *E, real *Sum, int dimM, int dimN) {
-  __shared__ real _sum[blockDimX * blockDimY];
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  int index = threadIdx.y;
-
-  real tmp = agg.init();
-  if (rowIdx < dimN) {
-    for (; index < dimM;) {
-      tmp = agg(tmp, E[dimN * index + rowIdx]);
-      index += blockDimY;
-    }
-  }
-  _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
-  __syncthreads();
-
-  if (rowIdx < dimN) {
-    if (threadIdx.y == 0) {
-      real tmp = agg.init();
-      for (int i = 0; i < blockDimY; i++) {
-        tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
-      }
-      Sum[rowIdx] = tmp;
-    }
-  }
-}
-
-template <class Agg>
-void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
-  if (dimN >= 8192) {
-    int blocksX = (dimN + 128 - 1) / 128;
-    int blocksY = 1;
-    dim3 threads(128, 1);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        agg, A_d, C_d, dimM, dimN);
-  } else {
-    int blocksX = (dimN + 32 - 1) / 32;
-    int blocksY = 1;
-    dim3 threads(32, 32);
-    dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        agg, A_d, C_d, dimM, dimN);
-  }
-
-  return;
-}
-
-void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
-
-  CHECK_SYNC("hl_matrix_column_sum failed");
-}
-
-void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
-
-  CHECK_SYNC("hl_matrix_column_max failed");
-}
-
-void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
-
-  CHECK_SYNC("hl_matrix_column_min failed");
-}
-
-template <int blockSize>
-__global__ void KeVectorSum(real *E, real *Sum, int dimM) {
-  __shared__ double sum_s[blockSize];
-  int tid = threadIdx.x;
-  int index = blockIdx.y * blockDim.x + threadIdx.x;
-
-  sum_s[tid] = 0.0f;
-  while (index < dimM) {
-    sum_s[tid] += E[index];
-    index += blockDim.x * gridDim.y;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
-    if (tid < stride) {
-      sum_s[tid] += sum_s[tid + stride];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    Sum[blockIdx.y] = sum_s[0];
-  }
-}
-
-void hl_vector_sum(real *A_d, real *C_h, int dimM) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_h);
-
-  int blockSize = 128;
-  int gridSize = 128;
-  int blocksX = 1;
-  int blocksY = gridSize;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
-  hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {
-  }
-
-  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      A_d, t_resource.gpu_mem, dimM);
-  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
-      t_resource.gpu_mem, t_resource.cpu_mem, 128);
-
-  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
-  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
-
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
-                             << hl_get_device_error_string((size_t)err);
-}
-
-template <int blockSize>
-__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
-  __shared__ double sum_s[blockSize];
-  int tid = threadIdx.x;
-  int index = blockIdx.y * blockDim.x + threadIdx.x;
-
-  sum_s[tid] = 0.0f;
-  while (index < dimM) {
-    sum_s[tid] += abs(E[index]);
-    index += blockDim.x * gridDim.y;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
-    if (tid < stride) {
-      sum_s[tid] += sum_s[tid + stride];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    Sum[blockIdx.y] = sum_s[0];
-  }
-}
-
-void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_h);
-
-  int blockSize = 128;
-  int gridSize = 128;
-  int blocksX = 1;
-  int blocksY = gridSize;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
-  hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {
-  }
-
-  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      A_d, t_resource.gpu_mem, dimM);
-  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
-      t_resource.gpu_mem, t_resource.cpu_mem, 128);
-
-  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
-  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
-
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
-                             << hl_get_device_error_string((size_t)err);
-}
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
deleted file mode 100644
index a4459243e8a7c8be58be2255faf89e29817fbdf5..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ /dev/null
@@ -1,1030 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <float.h>
-#include "hl_base.h"
-#include "hl_cnn.h"
-#include "hl_device_functions.cuh"
-
-__global__ void KeMaxPoolForward(const int nthreads,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int height,
-                                 const int width,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int ksizeW,
-                                 const int ksizeH,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int offsetH,
-                                 const int offsetW,
-                                 real* tgtData,
-                                 const int tgtStride,
-                                 real* maskData) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int c = (index / pooledW / pooledH) % channels;
-    int frameNum = index / pooledW / pooledH / channels;
-    int hstart = ph * strideH - offsetH;
-    int wstart = pw * strideW - offsetW;
-    int hend = min(hstart + ksizeH, height);
-    int wend = min(wstart + ksizeW, width);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    real maxval = -FLT_MAX;
-    int max_index = -1;
-    inputData += (frameNum * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        if (maxval < inputData[h * width + w]) {
-          max_index = h * width + w;
-          maxval = inputData[max_index];
-        }
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = maxval;
-    if (maskData != NULL) {
-      maskData[tgtIndex] = max_index;
-    }
-  }
-}
-
-void hl_maxpool_forward(const int frameCnt,
-                        const real* inputData,
-                        const int channels,
-                        const int height,
-                        const int width,
-                        const int pooledH,
-                        const int pooledW,
-                        const int sizeX,
-                        const int sizeY,
-                        const int strideH,
-                        const int strideW,
-                        const int paddingH,
-                        const int paddingW,
-                        real* tgtData,
-                        const int tgtStride,
-                        real* maskData) {
-  int num_kernels = pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-
-  KeMaxPoolForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                         inputData,
-                                                         channels,
-                                                         height,
-                                                         width,
-                                                         pooledH,
-                                                         pooledW,
-                                                         sizeX,
-                                                         sizeY,
-                                                         strideH,
-                                                         strideW,
-                                                         paddingH,
-                                                         paddingW,
-                                                         tgtData,
-                                                         tgtStride,
-                                                         maskData);
-  CHECK_SYNC("hl_maxpool_forward failed");
-}
-
-__global__ void KeMaxPoolBackward(const int nthreads,
-                                  const real* inputData,
-                                  const real* outData,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int height,
-                                  const int width,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeX,
-                                  const int sizeY,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int padH,
-                                  const int padW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* targetGrad,
-                                  const int outStride) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    // find out the local index
-    // find out the local offset
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetC = (index / width / height) % channels;
-
-    int frameNum = index / width / height / channels;
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0;
-    int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
-    real gradient = 0;
-    real input = inputData[index];
-    outData += (frameNum * outStride + offsetC * pooledH * pooledW);
-    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
-    for (int ph = phstart; ph < phend; ++ph) {
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        if (input == outData[ph * pooledW + pw]) {
-          gradient += outGrad[ph * pooledW + pw];
-        }
-      }
-    }
-    targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient;
-  }
-}
-
-void hl_maxpool_backward(const int frameCnt,
-                         const real* inputData,
-                         const real* outData,
-                         const real* outGrad,
-                         const int channels,
-                         const int height,
-                         const int width,
-                         const int pooledH,
-                         const int pooledW,
-                         const int sizeX,
-                         const int sizeY,
-                         const int strideH,
-                         const int strideW,
-                         const int paddingH,
-                         const int paddingW,
-                         real scaleA,
-                         real scaleB,
-                         real* targetGrad,
-                         const int outStride) {
-  int num_kernels = height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeMaxPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                         inputData,
-                                                         outData,
-                                                         outGrad,
-                                                         channels,
-                                                         height,
-                                                         width,
-                                                         pooledH,
-                                                         pooledW,
-                                                         sizeX,
-                                                         sizeY,
-                                                         strideH,
-                                                         strideW,
-                                                         paddingH,
-                                                         paddingW,
-                                                         scaleA,
-                                                         scaleB,
-                                                         targetGrad,
-                                                         outStride);
-  CHECK_SYNC("hl_maxpool_backward");
-}
-
-__global__ void KeAvgPoolForward(const int nthreads,
-                                 const real* inputData,
-                                 const int channels,
-                                 const int height,
-                                 const int width,
-                                 const int pooledH,
-                                 const int pooledW,
-                                 const int sizeX,
-                                 const int sizeY,
-                                 const int strideH,
-                                 const int strideW,
-                                 const int padH,
-                                 const int padW,
-                                 real* tgtData,
-                                 const int tgtStride,
-                                 const bool excludeMode) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int c = (index / pooledW / pooledH) % channels;
-    int frameNum = index / pooledW / pooledH / channels;
-
-    int hstart = ph * strideH - padH;
-    int wstart = pw * strideW - padW;
-    int hend = min(hstart + sizeY, height);
-    int wend = min(wstart + sizeX, width);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    int poolSize =
-        excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-
-    real aveval = 0;
-    inputData += (frameNum * channels + c) * height * width;
-    for (int h = hstart; h < hend; ++h) {
-      for (int w = wstart; w < wend; ++w) {
-        aveval += inputData[h * width + w];
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / poolSize;
-  }
-}
-
-void hl_avgpool_forward(const int frameCnt,
-                        const real* inputData,
-                        const int channels,
-                        const int height,
-                        const int width,
-                        const int pooledH,
-                        const int pooledW,
-                        const int sizeX,
-                        const int sizeY,
-                        const int strideH,
-                        const int strideW,
-                        const int paddingH,
-                        const int paddingW,
-                        real* tgtData,
-                        const int tgtStride,
-                        const bool excludeMode) {
-  int num_kernels = pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                        inputData,
-                                                        channels,
-                                                        height,
-                                                        width,
-                                                        pooledH,
-                                                        pooledW,
-                                                        sizeX,
-                                                        sizeY,
-                                                        strideH,
-                                                        strideW,
-                                                        paddingH,
-                                                        paddingW,
-                                                        tgtData,
-                                                        tgtStride,
-                                                        excludeMode);
-  CHECK_SYNC("hl_avgpool_forward failed");
-}
-
-__global__ void KeAvgPoolBackward(const int nthreads,
-                                  const real* outGrad,
-                                  const int channels,
-                                  const int height,
-                                  const int width,
-                                  const int pooledH,
-                                  const int pooledW,
-                                  const int sizeX,
-                                  const int sizeY,
-                                  const int strideH,
-                                  const int strideW,
-                                  const int padH,
-                                  const int padW,
-                                  real scaleA,
-                                  real scaleB,
-                                  real* tgtGrad,
-                                  const int outStride,
-                                  const bool excludeMode) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetC = (index / width / height) % channels;
-    int frameNum = index / width / height / channels;
-
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0;
-    int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
-    real gradient = 0;
-    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
-
-    for (int ph = phstart; ph < phend; ++ph) {
-      int hstart = ph * strideH - padH;
-      int hend = min(hstart + sizeY, height);
-      hstart = max(hstart, 0);
-      for (int pw = pwstart; pw < pwend; ++pw) {
-        // figure out the pooling size
-        int wstart = pw * strideW - padW;
-        int wend = min(wstart + sizeX, width);
-        wstart = max(wstart, 0);
-        int poolSize =
-            excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-        gradient += outGrad[ph * pooledW + pw] / poolSize;
-      }
-    }
-    tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
-  }
-}
-
-void hl_avgpool_backward(const int frameCnt,
-                         const real* outGrad,
-                         const int channels,
-                         const int height,
-                         const int width,
-                         const int pooledH,
-                         const int pooledW,
-                         const int sizeX,
-                         const int sizeY,
-                         const int strideH,
-                         const int strideW,
-                         const int paddingH,
-                         const int paddingW,
-                         real scaleA,
-                         real scaleB,
-                         real* backGrad,
-                         const int outStride,
-                         const bool excludeMode) {
-  int num_kernels = height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeAvgPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                         outGrad,
-                                                         channels,
-                                                         height,
-                                                         width,
-                                                         pooledH,
-                                                         pooledW,
-                                                         sizeX,
-                                                         sizeY,
-                                                         strideH,
-                                                         strideW,
-                                                         paddingH,
-                                                         paddingW,
-                                                         scaleA,
-                                                         scaleB,
-                                                         backGrad,
-                                                         outStride,
-                                                         excludeMode);
-  CHECK_SYNC("hl_avgpool_backward failed");
-}
-
-__global__ void KeMaxPool3DForward(const int nthreads,
-                                   const real* inputData,
-                                   const int channels,
-                                   const int depth,
-                                   const int height,
-                                   const int width,
-                                   const int pooledD,
-                                   const int pooledH,
-                                   const int pooledW,
-                                   const int ksizeD,
-                                   const int ksizeH,
-                                   const int ksizeW,
-                                   const int strideD,
-                                   const int strideH,
-                                   const int strideW,
-                                   const int padD,
-                                   const int padH,
-                                   const int padW,
-                                   real* tgtData,
-                                   real* maxPoolIdxData,
-                                   const int tgtStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int pd = (index / pooledW / pooledH) % pooledD;
-    int c = (index / pooledW / pooledH / pooledD) % channels;
-    int frameNum = index / pooledW / pooledH / pooledD / channels;
-    int dstart = pd * strideD - padD;
-    int hstart = ph * strideH - padH;
-    int wstart = pw * strideW - padW;
-    int dend = min(dstart + ksizeD, depth);
-    int hend = min(hstart + ksizeH, height);
-    int wend = min(wstart + ksizeW, width);
-    dstart = max(dstart, 0);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    real maxval = -FLT_MAX;
-    int maxIdx = -1;
-    inputData += (frameNum * channels + c) * depth * height * width;
-    for (int d = dstart; d < dend; ++d) {
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          if (maxval < inputData[(d * height + h) * width + w]) {
-            maxval = inputData[(d * height + h) * width + w];
-            maxIdx = (d * height + h) * width + w;
-          }
-        }
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = maxval;
-    maxPoolIdxData[tgtIndex] = maxIdx;
-  }
-}
-
-void hl_maxpool3D_forward(const int frameCnt,
-                          const real* inputData,
-                          const int channels,
-                          const int depth,
-                          const int height,
-                          const int width,
-                          const int pooledD,
-                          const int pooledH,
-                          const int pooledW,
-                          const int sizeZ,
-                          const int sizeY,
-                          const int sizeX,
-                          const int strideD,
-                          const int strideH,
-                          const int strideW,
-                          const int padD,
-                          const int padH,
-                          const int padW,
-                          real* tgtData,
-                          real* maxPoolIdxData,
-                          const int tgtStride) {
-  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-
-  KeMaxPool3DForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           inputData,
-                                                           channels,
-                                                           depth,
-                                                           height,
-                                                           width,
-                                                           pooledD,
-                                                           pooledH,
-                                                           pooledW,
-                                                           sizeZ,
-                                                           sizeY,
-                                                           sizeX,
-                                                           strideD,
-                                                           strideH,
-                                                           strideW,
-                                                           padD,
-                                                           padH,
-                                                           padW,
-                                                           tgtData,
-                                                           maxPoolIdxData,
-                                                           tgtStride);
-  CHECK_SYNC("hl_maxpool3D_forward failed");
-}
-
-__global__ void KeMaxPool3DBackward(const int nthreads,
-                                    const real* outGrad,
-                                    const int channels,
-                                    const int depth,
-                                    const int height,
-                                    const int width,
-                                    const int pooledD,
-                                    const int pooledH,
-                                    const int pooledW,
-                                    const int sizeZ,
-                                    const int sizeY,
-                                    const int sizeX,
-                                    const int strideD,
-                                    const int strideH,
-                                    const int strideW,
-                                    const int padD,
-                                    const int padH,
-                                    const int padW,
-                                    real scaleA,
-                                    real scaleB,
-                                    real* targetGrad,
-                                    real* maxPoolIdxData,
-                                    const int outStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int offsetW = index % width;
-    int offsetH = (index / width) % height;
-    int offsetD = (index / width / height) % depth;
-    int offsetC = (index / width / height / depth) % channels;
-    int frameNum = index / width / height / depth / channels;
-
-    int pdstart =
-        (offsetD + padD < sizeZ) ? 0 : (offsetD + padD - sizeZ) / strideD + 1;
-    int phstart =
-        (offsetH + padH < sizeY) ? 0 : (offsetH + padH - sizeY) / strideH + 1;
-    int pwstart =
-        (offsetW + padW < sizeX) ? 0 : (offsetW + padW - sizeX) / strideW + 1;
-    int pdend = min((offsetD + padD) / strideD + 1, pooledD);
-    int phend = min((offsetH + padH) / strideH + 1, pooledH);
-    int pwend = min((offsetW + padW) / strideW + 1, pooledW);
-
-    real gradient = 0;
-    outGrad += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
-    maxPoolIdxData +=
-        ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
-    for (int pd = pdstart; pd < pdend; ++pd) {
-      for (int ph = phstart; ph < phend; ++ph) {
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          if (((offsetD * height + offsetH) * width + offsetW) ==
-              maxPoolIdxData[(pd * pooledH + ph) * pooledW + pw])
-            gradient += outGrad[(pd * pooledH + ph) * pooledW + pw];
-        }
-      }
-    }
-    targetGrad[index] = scaleA * gradient + scaleB * targetGrad[index];
-  }
-}
-
-void hl_maxpool3D_backward(const int frameCnt,
-                           const real* outGrad,
-                           const int channels,
-                           const int depth,
-                           const int height,
-                           const int width,
-                           const int outputD,
-                           const int outputH,
-                           const int outputW,
-                           const int sizeZ,
-                           const int sizeY,
-                           const int sizeX,
-                           const int strideD,
-                           const int strideH,
-                           const int strideW,
-                           const int paddingD,
-                           const int paddingH,
-                           const int paddingW,
-                           real scaleA,
-                           real scaleB,
-                           real* targetGrad,
-                           real* maxPoolIdxData,
-                           const int outStride) {
-  int num_kernels = depth * height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeMaxPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           outGrad,
-                                                           channels,
-                                                           depth,
-                                                           height,
-                                                           width,
-                                                           outputD,
-                                                           outputH,
-                                                           outputW,
-                                                           sizeZ,
-                                                           sizeY,
-                                                           sizeX,
-                                                           strideD,
-                                                           strideH,
-                                                           strideW,
-                                                           paddingD,
-                                                           paddingH,
-                                                           paddingW,
-                                                           scaleA,
-                                                           scaleB,
-                                                           targetGrad,
-                                                           maxPoolIdxData,
-                                                           outStride);
-  CHECK_SYNC("hl_maxpool3D_backward");
-}
-
-__global__ void KeAvgPool3DForward(const int nthreads,
-                                   const real* inputData,
-                                   const int channels,
-                                   const int depth,
-                                   const int height,
-                                   const int width,
-                                   const int pooledD,
-                                   const int pooledH,
-                                   const int pooledW,
-                                   const int sizeZ,
-                                   const int sizeY,
-                                   const int sizeX,
-                                   const int strideD,
-                                   const int strideH,
-                                   const int strideW,
-                                   const int padD,
-                                   const int padH,
-                                   const int padW,
-                                   real* tgtData,
-                                   const int tgtStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int pw = index % pooledW;
-    int ph = (index / pooledW) % pooledH;
-    int pd = (index / pooledW / pooledH) % pooledD;
-    int c = (index / pooledW / pooledH / pooledD) % channels;
-    int frameNum = index / pooledW / pooledH / pooledD / channels;
-    int dstart = pd * strideD - padD;
-    int hstart = ph * strideH - padH;
-    int wstart = pw * strideW - padW;
-    int dend = min(dstart + sizeZ, depth);
-    int hend = min(hstart + sizeY, height);
-    int wend = min(wstart + sizeX, width);
-    dstart = max(dstart, 0);
-    hstart = max(hstart, 0);
-    wstart = max(wstart, 0);
-    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-
-    real aveval = 0;
-    inputData += (frameNum * channels + c) * depth * height * width;
-    for (int d = dstart; d < dend; ++d) {
-      for (int h = hstart; h < hend; ++h) {
-        for (int w = wstart; w < wend; ++w) {
-          aveval += inputData[(d * height + h) * width + w];
-        }
-      }
-    }
-    int tgtIndex =
-        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / pool_size;
-  }
-}
-
-void hl_avgpool3D_forward(const int frameCnt,
-                          const real* inputData,
-                          const int channels,
-                          const int depth,
-                          const int height,
-                          const int width,
-                          const int pooledD,
-                          const int pooledH,
-                          const int pooledW,
-                          const int sizeZ,
-                          const int sizeY,
-                          const int sizeX,
-                          const int strideD,
-                          const int strideH,
-                          const int strideW,
-                          const int paddingD,
-                          const int paddingH,
-                          const int paddingW,
-                          real* tgtData,
-                          const int tgtStride) {
-  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  KeAvgPool3DForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                          inputData,
-                                                          channels,
-                                                          depth,
-                                                          height,
-                                                          width,
-                                                          pooledD,
-                                                          pooledH,
-                                                          pooledW,
-                                                          sizeZ,
-                                                          sizeY,
-                                                          sizeX,
-                                                          strideD,
-                                                          strideH,
-                                                          strideW,
-                                                          paddingD,
-                                                          paddingH,
-                                                          paddingW,
-                                                          tgtData,
-                                                          tgtStride);
-  CHECK_SYNC("hl_avgpool3D_forward failed");
-}
-
-__global__ void KeAvgPool3DBackward(const int nthreads,
-                                    const real* outGrad,
-                                    const int channels,
-                                    const int depth,
-                                    const int height,
-                                    const int width,
-                                    const int pooledD,
-                                    const int pooledH,
-                                    const int pooledW,
-                                    const int sizeZ,
-                                    const int sizeY,
-                                    const int sizeX,
-                                    const int strideD,
-                                    const int strideH,
-                                    const int strideW,
-                                    const int padD,
-                                    const int padH,
-                                    const int padW,
-                                    real scaleA,
-                                    real scaleB,
-                                    real* tgtGrad,
-                                    const int outStride) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
-       index += blockDim.x * gridDim.x) {
-    int offsetW = index % width + padW;
-    int offsetH = (index / width) % height + padH;
-    int offsetD = (index / width / height) % depth + padD;
-    int offsetC = (index / width / height / depth) % channels;
-    int frameNum = index / width / height / depth / channels;
-
-    int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1;
-    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
-    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
-    int pdend = min(offsetD / strideD + 1, pooledD);
-    int phend = min(offsetH / strideH + 1, pooledH);
-    int pwend = min(offsetW / strideW + 1, pooledW);
-
-    real gradient = 0;
-    outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
-
-    for (int pd = pdstart; pd < pdend; ++pd) {
-      int dstart = pd * strideD - padD;
-      int dend = min(dstart + sizeZ, depth);
-      dstart = max(dstart, 0);
-      for (int ph = phstart; ph < phend; ++ph) {
-        int hstart = ph * strideH - padH;
-        int hend = min(hstart + sizeY, height);
-        hstart = max(hstart, 0);
-        for (int pw = pwstart; pw < pwend; ++pw) {
-          // figure out the pooling size
-          int wstart = pw * strideW - padW;
-          int wend = min(wstart + sizeX, width);
-          wstart = max(wstart, 0);
-          int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
-          gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
-        }
-      }
-    }
-    tgtGrad[index] = scaleA * gradient + scaleB * tgtGrad[index];
-  }
-}
-
-void hl_avgpool3D_backward(const int frameCnt,
-                           const real* outGrad,
-                           const int channels,
-                           const int depth,
-                           const int height,
-                           const int width,
-                           const int outputD,
-                           const int outputH,
-                           const int outputW,
-                           const int sizeZ,
-                           const int sizeY,
-                           const int sizeX,
-                           const int strideD,
-                           const int strideH,
-                           const int strideW,
-                           int paddingD,
-                           int paddingH,
-                           int paddingW,
-                           real scaleA,
-                           real scaleB,
-                           real* backGrad,
-                           const int outStride) {
-  int num_kernels = depth * height * width * channels * frameCnt;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-
-  KeAvgPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                           outGrad,
-                                                           channels,
-                                                           depth,
-                                                           height,
-                                                           width,
-                                                           outputD,
-                                                           outputH,
-                                                           outputW,
-                                                           sizeZ,
-                                                           sizeY,
-                                                           sizeX,
-                                                           strideD,
-                                                           strideH,
-                                                           strideW,
-                                                           paddingD,
-                                                           paddingH,
-                                                           paddingW,
-                                                           scaleA,
-                                                           scaleB,
-                                                           backGrad,
-                                                           outStride);
-  CHECK_SYNC("hl_avgpool3D_backward failed");
-}
-
-__global__ void KeBilinearInterpFw(const real* in,
-                                   const size_t inImgH,
-                                   const size_t inImgW,
-                                   const size_t inputH,
-                                   const size_t inputW,
-                                   real* out,
-                                   const size_t outImgH,
-                                   const size_t outImgW,
-                                   const size_t outputH,
-                                   const size_t outputW,
-                                   const size_t numChannels,
-                                   const real ratioH,
-                                   const real ratioW) {
-  int nthreads = outputH * outputW;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
-    int outIdH = tid / outputW;
-    int outIdW = tid % outputW;
-    int inImgSize = inputW / numChannels;
-    int outImgSize = outputW / numChannels;
-    int channelId = outIdW / outImgSize;
-
-    int outImgIdy = (outIdW % outImgSize) / outImgW;
-    int inImgIdy = ratioH * outImgIdy;
-    int hId = (inImgIdy < inImgH - 1) ? 1 : 0;
-    real h1lambda = ratioH * outImgIdy - inImgIdy;
-    real h2lambda = 1.f - h1lambda;
-
-    int outImgIdx = tid % outImgW;
-    int inImgIdx = ratioW * outImgIdx;
-    int wId = (inImgIdx < inImgW - 1) ? 1 : 0;
-    real w1lambda = ratioW * outImgIdx - inImgIdx;
-    real w2lambda = 1.f - w1lambda;
-
-    const real* inPos = &in[outIdH * inputW + channelId * inImgSize +
-                            inImgIdy * inImgW + inImgIdx];
-
-    // bilinear interpolation
-    out[outIdH * outputW + outIdW] =
-        h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) +
-        h1lambda * (w2lambda * inPos[hId * inImgW] +
-                    w1lambda * inPos[hId * inImgW + wId]);
-  }
-}
-
-void hl_bilinear_forward(const real* inData,
-                         const size_t inImgH,
-                         const size_t inImgW,
-                         const size_t inputH,
-                         const size_t inputW,
-                         real* outData,
-                         const size_t outImgH,
-                         const size_t outImgW,
-                         const size_t outputH,
-                         const size_t outputW,
-                         const size_t numChannels,
-                         const real ratioH,
-                         const real ratioW) {
-  int threadNum = outputH * outputW;
-  int blocks = (threadNum + 1024 - 1) / 1024;
-
-  KeBilinearInterpFw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inData,
-                                                          inImgH,
-                                                          inImgW,
-                                                          inputH,
-                                                          inputW,
-                                                          outData,
-                                                          outImgH,
-                                                          outImgW,
-                                                          outputH,
-                                                          outputW,
-                                                          numChannels,
-                                                          ratioH,
-                                                          ratioW);
-  CHECK_SYNC("hl_bilinear_forward failed");
-}
-
-__global__ void KeBilinearInterpBw(real* in,
-                                   const size_t inImgH,
-                                   const size_t inImgW,
-                                   const size_t inputH,
-                                   const size_t inputW,
-                                   const real* out,
-                                   const size_t outImgH,
-                                   const size_t outImgW,
-                                   const size_t outputH,
-                                   const size_t outputW,
-                                   const size_t numChannels,
-                                   const real ratioH,
-                                   const real ratioW) {
-  int nthreads = outputH * outputW;
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < nthreads) {
-    int outIdH = tid / outputW;
-    int outIdW = tid % outputW;
-    int inImgSize = inputW / numChannels;
-    int outImgSize = outputW / numChannels;
-    int channelId = outIdW / outImgSize;
-
-    int outImgIdy = (outIdW % outImgSize) / outImgW;
-    int inImgIdy = ratioH * outImgIdy;
-    int hId = (inImgIdy < inImgH - 1) ? 1 : 0;
-    real h1lambda = ratioH * outImgIdy - inImgIdy;
-    real h2lambda = 1.f - h1lambda;
-
-    int outImgIdx = tid % outImgW;
-    int inImgIdx = ratioW * outImgIdx;
-    int wId = (inImgIdx < inImgW - 1) ? 1 : 0;
-    real w1lambda = ratioW * outImgIdx - inImgIdx;
-    real w2lambda = 1.f - w1lambda;
-
-    real* inPos = &in[outIdH * inputW + channelId * inImgSize +
-                      inImgIdy * inImgW + inImgIdx];
-    const real* outPos = &out[outIdH * outputW + outIdW];
-    paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[hId * inImgW],
-                            h1lambda * w2lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId],
-                            h1lambda * w1lambda * outPos[0]);
-  }
-}
-
-void hl_bilinear_backward(real* inGrad,
-                          const size_t inImgH,
-                          const size_t inImgW,
-                          const size_t inputH,
-                          const size_t inputW,
-                          const real* outGrad,
-                          const size_t outImgH,
-                          const size_t outImgW,
-                          const size_t outputH,
-                          const size_t outputW,
-                          const size_t numChannels,
-                          const real ratioH,
-                          const real ratioW) {
-  int threadNum = outputH * outputW;
-  int blocks = (threadNum + 1024 - 1) / 1024;
-
-  KeBilinearInterpBw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inGrad,
-                                                          inImgH,
-                                                          inImgW,
-                                                          inputH,
-                                                          inputW,
-                                                          outGrad,
-                                                          outImgH,
-                                                          outImgW,
-                                                          outputH,
-                                                          outputW,
-                                                          numChannels,
-                                                          ratioH,
-                                                          ratioW);
-  CHECK_SYNC("hl_bilinear_backward failed");
-}
-
-__global__ void maxoutFpCompute(size_t nthreads,
-                                const real* inData,
-                                real* outData,
-                                int* idData,
-                                size_t size,
-                                size_t featLen,
-                                size_t groups) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    size_t batch_idx = index / size;
-    size_t i = index % size;
-    size_t channel_idx = i / featLen;
-    size_t feat_idx = i % featLen;
-    size_t data_idx =
-        (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
-    real max = inData[data_idx];
-    int maxId = 0;
-    for (size_t g = 1; g < groups; ++g) {
-      real tmp = inData[data_idx + g * featLen];
-      if (tmp > max) {
-        max = tmp;
-        maxId = g;
-      }
-    }
-    outData[index] = max;
-    idData[index] = maxId;
-  }
-}
-
-void hl_maxout_forward(const real* inData,
-                       real* outData,
-                       int* idData,
-                       size_t batchSize,
-                       size_t size,
-                       size_t featLen,
-                       size_t groups) {
-  int num_kernels = size * batchSize;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  maxoutFpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
-      num_kernels, inData, outData, idData, size, featLen, groups);
-  CHECK_SYNC("hl_maxout_forward failed");
-}
-
-__global__ void maxoutBpCompute(size_t nthreads,
-                                real* inGrad,
-                                const real* outGrad,
-                                const int* idData,
-                                size_t size,
-                                size_t featLen,
-                                size_t groups) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    size_t batch_idx = index / size;
-    size_t i = index % size;
-    size_t channel_idx = i / featLen;
-    size_t feat_idx = i % featLen;
-    size_t newIndex = batch_idx * size;
-    size_t gradIdx =
-        (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
-    (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
-  }
-}
-
-void hl_maxout_backward(real* inGrad,
-                        const real* outGrad,
-                        const int* idData,
-                        size_t batchSize,
-                        size_t size,
-                        size_t featLen,
-                        size_t groups) {
-  int num_kernels = size * batchSize;
-  int blocks = (num_kernels + 1024 - 1) / 1024;
-  maxoutBpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
-      num_kernels, inGrad, outGrad, idData, size, featLen, groups);
-  CHECK_SYNC("hl_maxout_backward failed");
-}
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
deleted file mode 100644
index 975df4287894090799c44bc0a4e9e08e4144e68f..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ /dev/null
@@ -1,400 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_cuda_cublas.h"
-#include <sys/time.h>
-#include "hl_cuda.h"
-#include "hl_thread.ph"
-#include "paddle/utils/DynamicLoader.h"
-#include "paddle/utils/Logging.h"
-
-namespace dynload {
-
-std::once_flag cublas_dso_flag;
-void *cublas_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cublas routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    cublasStatus_t operator()(Args... args) {                                  \
-      typedef cublasStatus_t (*cublasFunc)(Args...);                           \
-      std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \
-      void *p_##__name = dlsym(cublas_dso_handle, #__name);                    \
-      return reinterpret_cast<cublasFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    cublasStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name;  // struct DynLoad__##__name
-#endif
-
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
-
-// include all needed cublas functions in HPPL
-// clang-format off
-#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
-  __macro(cublasSgemv)                    \
-  __macro(cublasDgemv)                    \
-  __macro(cublasSgemm)                    \
-  __macro(cublasDgemm)                    \
-  __macro(cublasSgeam)                    \
-  __macro(cublasDgeam)                    \
-
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
-CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
-
-#undef DYNAMIC_LOAD_CUBLAS_WRAP
-#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP
-#undef CUBLAS_BLAS_ROUTINE_EACH
-
-} /* namespace dynload */
-
-// clang-format on
-#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM dynload::cublasSgeam
-#define CUBLAS_GEMV dynload::cublasSgemv
-#define CUBLAS_GEMM dynload::cublasSgemm
-#define CUBLAS_GETRF dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI dynload::cublasSgetriBatched
-#else
-#define CUBLAS_GEAM dynload::cublasDgeam
-#define CUBLAS_GEMV dynload::cublasDgemv
-#define CUBLAS_GEMM dynload::cublasDgemm
-#define CUBLAS_GETRF dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI dynload::cublasDgetriBatched
-#endif
-
-const char *hl_cublas_get_error_string(cublasStatus_t status) {
-  switch (status) {
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-      return "[cublas status]: not initialized";
-    case CUBLAS_STATUS_ALLOC_FAILED:
-      return "[cublas status]: allocate failed";
-    case CUBLAS_STATUS_INVALID_VALUE:
-      return "[cublas status]: invalid value";
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-      return "[cublas status]: arch mismatch";
-    case CUBLAS_STATUS_MAPPING_ERROR:
-      return "[cublas status]: mapping error";
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-      return "[cublas status]: execution failed";
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-      return "[cublas status]: internal error";
-    case CUBLAS_STATUS_SUCCESS:
-      return "[cublas status]: success";
-    default:
-      return "[cublas status]: unknown error";
-  }
-}
-
-/**
- * Check build-in cublas function using glog and it also
- * support << operator for more details error info.
- */
-cublasStatus_t g_cublasStat;
-#define CHECK_CUBLAS(cublas_func)               \
-  g_cublasStat = cublas_func;                   \
-  CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
-      << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " "
-
-void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) {
-  CHECK_CUBLAS(dynload::cublasCreate(cublas_handle))
-      << "[cublas init] Cublas create handle faild!";
-
-  CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
-      << "[cublas init] Cublas set stream faild!";
-}
-
-void hl_matrix_transpose(
-    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {
-  real alpha = 1.0;
-  real beta = 0.0;
-
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle,
-                           CUBLAS_OP_T,
-                           CUBLAS_OP_N,
-                           dimM,
-                           dimN,
-                           &alpha,
-                           A_d,
-                           lda,
-                           &beta,
-                           nullptr,
-                           dimM,
-                           C_d,
-                           ldc));
-  CHECK_SYNC("hl_matrix_transpose failed");
-}
-
-void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {
-  hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM);
-}
-
-void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
-  /* Solve Ax = I */
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  /* Step 1: Compute the LU decomposition of matrix A */
-  real **inout_h = &A_d;
-  real **inout_d = (real **)hl_malloc_device(sizeof(real *));
-  hl_memcpy(inout_d, inout_h, sizeof(real *));
-
-  int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
-  int *info_d = (int *)t_resource.gpu_mem;
-
-  /* Note: cublasSgetrfBatched is used to calculate a number of
-     small-sized matrices. There may be a better way to reconstruct
-     the API for better performance.
-   */
-  CHECK_CUBLAS(
-      CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1));
-
-  int info_h;
-  hl_memcpy(&info_h, info_d, sizeof(int));
-  if (info_h != 0) {
-    LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
-  }
-
-  /* Step 2: Compute the inverse of the matrix given its LU decomposition */
-  real **out_h = &C_d;
-  real **out_d = (real **)hl_malloc_device(sizeof(real *));
-  hl_memcpy(out_d, out_h, sizeof(real *));
-
-  CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
-                            dimN,
-                            (const real **)inout_d,
-                            lda,
-                            pivot_d,
-                            out_d,
-                            ldc,
-                            info_d,
-                            1));
-
-  hl_memcpy(&info_h, info_d, sizeof(int));
-  if (info_h != 0) {
-    LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
-  }
-
-  hl_free_mem_device(inout_d);
-  hl_free_mem_device(pivot_d);
-  hl_free_mem_device(out_d);
-
-  CHECK_SYNC("hl_matrix_inverse failed");
-}
-
-void hl_matrix_mul(real *A_d,
-                   hl_trans_op_t transa,
-                   real *B_d,
-                   hl_trans_op_t transb,
-                   real *C_d,
-                   int dimM,
-                   int dimN,
-                   int dimK,
-                   real alpha,
-                   real beta,
-                   int lda,
-                   int ldb,
-                   int ldc) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) {
-    int m = (transa == HPPL_OP_N) ? dimM : dimK;
-    int n = (transa == HPPL_OP_N) ? dimK : dimM;
-    hl_matrix_mul_vector(
-        A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc);
-    return;
-  }
-
-  if (dimM == 1 && dimN != 1 && dimK != 1 && transa == HPPL_OP_N) {
-    int m = (transb == HPPL_OP_N) ? dimK : dimN;
-    int n = (transb == HPPL_OP_N) ? dimN : dimK;
-    hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N;
-    hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1);
-    return;
-  }
-
-  cublasStatus_t stat;
-  if ((HPPL_OP_N == transa) && (HPPL_OP_N == transb)) {
-    stat = CUBLAS_GEMM(t_resource.handle,
-                       CUBLAS_OP_N,
-                       CUBLAS_OP_N,
-                       dimN,
-                       dimM,
-                       dimK,
-                       &alpha,
-                       B_d,
-                       ldb,
-                       A_d,
-                       lda,
-                       &beta,
-                       C_d,
-                       ldc);
-  } else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) {
-    stat = CUBLAS_GEMM(t_resource.handle,
-                       CUBLAS_OP_N,
-                       CUBLAS_OP_T,
-                       dimN,
-                       dimM,
-                       dimK,
-                       &alpha,
-                       B_d,
-                       ldb,
-                       A_d,
-                       lda,
-                       &beta,
-                       C_d,
-                       ldc);
-  } else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) {
-    stat = CUBLAS_GEMM(t_resource.handle,
-                       CUBLAS_OP_T,
-                       CUBLAS_OP_N,
-                       dimN,
-                       dimM,
-                       dimK,
-                       &alpha,
-                       B_d,
-                       ldb,
-                       A_d,
-                       lda,
-                       &beta,
-                       C_d,
-                       ldc);
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
-  CHECK_SYNC("hl_matrix_mul failed");
-}
-
-void hl_matrix_mul(real *A_d,
-                   hl_trans_op_t transa,
-                   real *B_d,
-                   hl_trans_op_t transb,
-                   real *C_d,
-                   int dimM,
-                   int dimN,
-                   int dimK,
-                   real alpha,
-                   real beta) {
-  int lda = (HPPL_OP_N == transa) ? dimK : dimM;
-  int ldb = (HPPL_OP_N == transb) ? dimN : dimK;
-  int ldc = dimN;
-
-  hl_matrix_mul(A_d,
-                transa,
-                B_d,
-                transb,
-                C_d,
-                dimM,
-                dimN,
-                dimK,
-                alpha,
-                beta,
-                lda,
-                ldb,
-                ldc);
-}
-
-void hl_matrix_mul_vector(real *A_d,
-                          hl_trans_op_t trans,
-                          real *B_d,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          real alpha,
-                          real beta,
-                          int lda,
-                          int incb,
-                          int incc) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  cublasStatus_t stat;
-  if (HPPL_OP_N == trans) {
-    stat = CUBLAS_GEMV(t_resource.handle,
-                       CUBLAS_OP_T,
-                       dimN,
-                       dimM,
-                       &alpha,
-                       A_d,
-                       lda,
-                       B_d,
-                       incb,
-                       &beta,
-                       C_d,
-                       incc);
-  } else if (HPPL_OP_T == trans) {
-    stat = CUBLAS_GEMV(t_resource.handle,
-                       CUBLAS_OP_N,
-                       dimN,
-                       dimM,
-                       &alpha,
-                       A_d,
-                       lda,
-                       B_d,
-                       incb,
-                       &beta,
-                       C_d,
-                       incc);
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
-  CHECK_SYNC("hl_matrix_mul_vector");
-}
-
-void hl_matrix_mul_vector(real *A_d,
-                          hl_trans_op_t trans,
-                          real *B_d,
-                          real *C_d,
-                          int dimM,
-                          int dimN,
-                          real alpha,
-                          real beta) {
-  hl_matrix_mul_vector(
-      A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1);
-}
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
deleted file mode 100644
index dfa935dcff9f7ae9f710d0f01a0217298d8cec04..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ /dev/null
@@ -1,1117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_cuda_cudnn.h"
-#include <cudnn.h>
-#include <gflags/gflags.h>
-#include "hl_cuda_cudnn.ph"
-#include "hl_thread.ph"
-#include "paddle/utils/DynamicLoader.h"
-#include "paddle/utils/Logging.h"
-
-DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
-             4096,
-             "Specify cuDNN max workspace limit, in units MB, "
-             "4096MB=4GB by default.");
-
-namespace dynload {
-
-std::once_flag cudnn_dso_flag;
-void* cudnn_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cudbnn routine
- * via operator overloading: operator ()
- *
- * note: default dynamic linked libs
- **/
-
-#ifdef PADDLE_USE_DSO
-
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                                     \
-  struct DynLoad__##__name {                                                \
-    template <typename... Args>                                             \
-    auto operator()(Args... args) -> decltype(__name(args...)) {            \
-      using cudnn_func = decltype(__name(args...)) (*)(Args...);            \
-      std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \
-      void* p_##__name = dlsym(cudnn_dso_handle, #__name);                  \
-      return reinterpret_cast<cudnn_func>(p_##__name)(args...);             \
-    }                                                                       \
-  } __name; /* struct DynLoad__##__name */
-
-#else
-
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                          \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name; /* struct DynLoad__##__name */
-
-#endif
-
-/**
- * include all needed cudnn functions in HPPL
- * different cudnn version has different interfaces
- **/
-// clang-format off
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnSetTensor4dDescriptor)                     \
-  __macro(cudnnSetTensor4dDescriptorEx)                   \
-  __macro(cudnnGetConvolutionNdForwardOutputDim)          \
-  __macro(cudnnGetConvolutionForwardAlgorithm)            \
-  __macro(cudnnCreateTensorDescriptor)                    \
-  __macro(cudnnDestroyTensorDescriptor)                   \
-  __macro(cudnnCreateFilterDescriptor)                    \
-  __macro(cudnnSetFilter4dDescriptor)                     \
-  __macro(cudnnSetPooling2dDescriptor)                    \
-  __macro(cudnnDestroyFilterDescriptor)                   \
-  __macro(cudnnCreateConvolutionDescriptor)               \
-  __macro(cudnnCreatePoolingDescriptor)                   \
-  __macro(cudnnDestroyPoolingDescriptor)                  \
-  __macro(cudnnSetConvolution2dDescriptor)                \
-  __macro(cudnnDestroyConvolutionDescriptor)              \
-  __macro(cudnnCreate)                                    \
-  __macro(cudnnDestroy)                                   \
-  __macro(cudnnSetStream)                                 \
-  __macro(cudnnActivationForward)                         \
-  __macro(cudnnConvolutionForward)                        \
-  __macro(cudnnConvolutionBackwardBias)                   \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
-  __macro(cudnnTransformTensor)                           \
-  __macro(cudnnPoolingForward)                            \
-  __macro(cudnnPoolingBackward)                           \
-  __macro(cudnnSoftmaxBackward)                           \
-  __macro(cudnnSoftmaxForward)                            \
-  __macro(cudnnGetVersion)                                \
-  __macro(cudnnGetErrorString)
-CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
-
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
-  __macro(cudnnAddTensor)                                 \
-  __macro(cudnnConvolutionBackwardData)                   \
-  __macro(cudnnConvolutionBackwardFilter)
-CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP)
-
-// APIs available after R3:
-#if CUDNN_VERSION >= 3000
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)              \
-  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
-  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
-#endif
-
-
-// APIs available after R4:
-#if CUDNN_VERSION >= 4007
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)             \
-  __macro(cudnnBatchNormalizationForwardTraining)            \
-  __macro(cudnnBatchNormalizationForwardInference)           \
-  __macro(cudnnBatchNormalizationBackward)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
-#endif
-
-// APIs in R5
-#if CUDNN_VERSION >= 5000
-#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)                    \
-  __macro(cudnnCreateActivationDescriptor)                    \
-  __macro(cudnnSetActivationDescriptor)                       \
-  __macro(cudnnGetActivationDescriptor)                       \
-  __macro(cudnnDestroyActivationDescriptor)
-CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R5
-#endif
-
-#undef CUDNN_DNN_ROUTINE_EACH
-// clang-format on
-} /* namespace dynload */
-
-/**
- * Check build-in cudnn function using glog and it **does not**
- * support << operator for more details error info.
- */
-#define CHECK_CUDNN(cudnnFunc)                                         \
-  do {                                                                 \
-    cudnnStatus_t cudnnStat = cudnnFunc;                               \
-    CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat)                          \
-        << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \
-  } while (0)
-
-bool g_is_libcudnn_init = false;
-int g_cudnn_lib_version = 0;
-
-void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
-  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
-}
-
-void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
-  size_t cudnn_dso_ver = dynload::cudnnGetVersion();
-  size_t cudnn_dso_major = cudnn_dso_ver / 1000;
-  size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
-
-  // Compare cudnn header version with that of cudnn.so.
-  CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
-        (cudnn_cuh_major == cudnn_dso_major))
-      << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v"
-      << cudnn_cuh_major << " unmatched!\n"
-      << "PaddlePaddle Requirement: "
-      << "(header v[2-3] with libcudnn v[2-3]) Or "
-      << "(header v4 with libcudnn v4) Or "
-      << "(header v5 with libcudnn v5) Or"
-      << "(header v6 with libcudnn v6).";
-
-  CHECK(!(CUDNN_VERSION < 6000 && CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
-      << "cudnn v5 requires cuda version >= 7.5";
-
-  CHECK(!(CUDNN_VERSION >= 6000 && CUDA_VERSION < 8000))
-      << "cudnn v6 requires cuda version >= 8.0";
-
-  CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
-  CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
-
-  g_is_libcudnn_init = true;
-  g_cudnn_lib_version = cudnn_dso_ver;
-}
-
-int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; }
-
-void hl_conv_workspace(hl_tensor_descriptor input,
-                       hl_tensor_descriptor output,
-                       hl_filter_descriptor filter,
-                       hl_convolution_descriptor conv,
-                       int* convFwdAlgo,
-                       size_t* fwdLimitBytes,
-                       int* convBwdDataAlgo,
-                       size_t* bwdDataLimitBytes,
-                       int* convBwdFilterAlgo,
-                       size_t* bwdFilterLimitBytes,
-                       bool useDilation) {
-#if CUDNN_VERSION >= 4000
-
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(filter);
-  CHECK_NOTNULL(conv);
-
-  // Specify workspace limit directly
-  size_t memoryLimitBytes =
-      (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
-
-  // For dilation
-  int algo = 0;
-
-  // cudnn convolution forward configuration
-  cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  // cudnn convolution backward data configuration
-  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  // cudnn convolution backward filter configuration
-  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-  if (useDilation) {
-    convFwdAlgo = &algo;
-    convBwdDataAlgo = &algo;
-    convBwdFilterAlgo = &algo;
-  } else {
-    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
-        t_resource.cudnn_handle,
-        fwd_src_desc,
-        fwd_filter_desc,
-        fwd_conv_desc,
-        fwd_dest_desc,
-        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-        t_resource.cudnn_handle,
-        bwd_data_filter_desc,
-        bwd_data_diff_desc,
-        bwd_data_conv_desc,
-        bwd_data_grad_desc,
-        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-        t_resource.cudnn_handle,
-        bwd_filter_src_desc,
-        bwd_filter_diff_desc,
-        bwd_filter_conv_desc,
-        bwd_filter_grad_desc,
-        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-        memoryLimitBytes,
-        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-  }
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
-      t_resource.cudnn_handle,
-      fwd_src_desc,
-      fwd_filter_desc,
-      fwd_conv_desc,
-      fwd_dest_desc,
-      static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
-      fwdLimitBytes));
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
-      t_resource.cudnn_handle,
-      bwd_data_filter_desc,
-      bwd_data_diff_desc,
-      bwd_data_conv_desc,
-      bwd_data_grad_desc,
-      static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
-      bwdDataLimitBytes));
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
-      t_resource.cudnn_handle,
-      bwd_filter_src_desc,
-      bwd_filter_diff_desc,
-      bwd_filter_conv_desc,
-      bwd_filter_grad_desc,
-      static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
-      bwdFilterLimitBytes));
-
-#endif
-}
-
-void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
-                                 int batch_size,
-                                 int feature_maps,
-                                 int height,
-                                 int width) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc =
-      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
-  CHECK_NOTNULL(hl_desc);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc,
-                                                  CUDNN_TENSOR_NCHW,
-                                                  data_type,
-                                                  batch_size,
-                                                  feature_maps,
-                                                  height,
-                                                  width));
-
-  hl_desc->format = CUDNN_TENSOR_NCHW;
-  hl_desc->data_type = data_type;
-  hl_desc->batch_size = batch_size;
-  hl_desc->feature_maps = feature_maps;
-  hl_desc->height = height;
-  hl_desc->width = width;
-
-  *image_desc = (hl_tensor_descriptor)hl_desc;
-}
-
-void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc =
-      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
-  CHECK_NOTNULL(hl_desc);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
-
-  hl_desc->data_type = data_type;
-
-  *image_desc = (hl_tensor_descriptor)hl_desc;
-}
-
-void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                       int batch_size,
-                       int feature_maps,
-                       int height,
-                       int width) {
-  const int stride_w = 1;
-  const int stride_h = width * stride_w;
-  const int stride_c = height * stride_h;
-  const int stride_n = feature_maps * stride_c;
-  return hl_tensor_reshape(image_desc,
-                           batch_size,
-                           feature_maps,
-                           height,
-                           width,
-                           stride_n,
-                           stride_c,
-                           stride_h,
-                           stride_w);
-}
-
-void hl_tensor_reshape(hl_tensor_descriptor image_desc,
-                       int batch_size,
-                       int feature_maps,
-                       int height,
-                       int width,
-                       int nStride,
-                       int cStride,
-                       int hStride,
-                       int wStride) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
-  CHECK_NOTNULL(hl_desc->desc);
-
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
-                                                    hl_desc->data_type,
-                                                    batch_size,
-                                                    feature_maps,
-                                                    height,
-                                                    width,
-                                                    nStride,
-                                                    cStride,
-                                                    hStride,
-                                                    wStride));
-
-  hl_desc->batch_size = batch_size;
-  hl_desc->feature_maps = feature_maps;
-  hl_desc->height = height;
-  hl_desc->width = width;
-}
-
-void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
-  CHECK_NOTNULL(image_desc);
-
-  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
-  CHECK_NOTNULL(hl_desc->desc);
-
-  CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
-
-  hl_desc->desc = NULL;
-
-  free(image_desc);
-}
-
-void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
-                                  hl_pooling_mode_t mode,
-                                  int height,
-                                  int width,
-                                  int height_padding,
-                                  int width_padding,
-                                  int stride_height,
-                                  int stride_width) {
-  cudnnPoolingMode_t cudnn_mode;
-  switch (mode) {
-    case HL_POOLING_MAX:
-      cudnn_mode = CUDNN_POOLING_MAX;
-      break;
-    case HL_POOLING_AVERAGE:
-      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
-      break;
-    case HL_POOLING_AVERAGE_INCLUDE_PADDING:
-      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-      break;
-    default:
-      LOG(FATAL) << "parameter mode error";
-  }
-
-  CHECK_NOTNULL(pooling_desc);
-
-  cudnn_pooling_descriptor hl_pooling_desc =
-      (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
-  CHECK_NOTNULL(hl_pooling_desc);
-
-  CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
-
-  CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc,
-                                                   cudnn_mode,
-#if CUDNN_VERSION >= 5000
-                                                   CUDNN_PROPAGATE_NAN,
-#endif
-                                                   height,
-                                                   width,
-                                                   height_padding,
-                                                   width_padding,
-                                                   stride_height,
-                                                   stride_width));
-
-  hl_pooling_desc->mode = cudnn_mode;
-  hl_pooling_desc->window_height = height;
-  hl_pooling_desc->window_width = width;
-  hl_pooling_desc->stride_height = stride_height;
-  hl_pooling_desc->stride_width = stride_width;
-
-  *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
-}
-
-void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
-  CHECK_NOTNULL(pooling_desc);
-
-  cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
-
-  CHECK_NOTNULL(hl_pooling->desc);
-  CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
-
-  hl_pooling->desc = NULL;
-
-  free(pooling_desc);
-}
-
-void hl_pooling_forward(hl_tensor_descriptor input,
-                        real* input_image,
-                        hl_tensor_descriptor output,
-                        real* output_image,
-                        hl_pooling_descriptor pooling) {
-  cudnnPoolingDescriptor_t pooling_desc;
-  cudnnTensorDescriptor_t input_desc;
-  cudnnTensorDescriptor_t output_desc;
-
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(pooling);
-  CHECK_NOTNULL(input_image);
-  CHECK_NOTNULL(output_image);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  input_desc = ((cudnn_tensor_descriptor)input)->desc;
-  output_desc = ((cudnn_tensor_descriptor)output)->desc;
-  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
-  CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle,
-                                           pooling_desc,
-                                           &alpha,
-                                           input_desc,
-                                           input_image,
-                                           &beta,
-                                           output_desc,
-                                           output_image));
-  CHECK_SYNC("hl_pooling_forward failed");
-}
-
-void hl_pooling_backward(hl_tensor_descriptor input,
-                         real* input_image,
-                         real* input_image_grad,
-                         hl_tensor_descriptor output,
-                         real* output_image,
-                         real* output_image_grad,
-                         hl_pooling_descriptor pooling) {
-  cudnnPoolingDescriptor_t pooling_desc;
-  cudnnTensorDescriptor_t input_desc;
-  cudnnTensorDescriptor_t output_desc;
-
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(pooling);
-  CHECK_NOTNULL(input_image);
-  CHECK_NOTNULL(input_image_grad);
-  CHECK_NOTNULL(output_image);
-  CHECK_NOTNULL(output_image_grad);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  input_desc = ((cudnn_tensor_descriptor)input)->desc;
-  output_desc = ((cudnn_tensor_descriptor)output)->desc;
-  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
-  CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle,
-                                            pooling_desc,
-                                            &alpha,
-                                            output_desc,
-                                            output_image,
-                                            output_desc,
-                                            output_image_grad,
-                                            input_desc,
-                                            input_image,
-                                            &beta,
-                                            input_desc,
-                                            input_image_grad));
-  CHECK_SYNC("hl_pooling_backward failed");
-}
-
-void hl_create_filter_descriptor(hl_filter_descriptor* filter,
-                                 int input_feature_maps,
-                                 int output_feature_maps,
-                                 int height,
-                                 int width) {
-  CHECK_NOTNULL(filter);
-
-  cudnn_filter_descriptor hl_filter =
-      (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
-  CHECK_NOTNULL(hl_filter);
-
-  CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
-
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc,
-                                                  data_type,
-#if CUDNN_VERSION >= 5000
-                                                  CUDNN_TENSOR_NCHW,
-#endif
-                                                  output_feature_maps,
-                                                  input_feature_maps,
-                                                  height,
-                                                  width));
-
-  hl_filter->data_type = data_type;
-  hl_filter->output_feature_maps = output_feature_maps;
-  hl_filter->input_feature_maps = input_feature_maps;
-  hl_filter->filter_height = height;
-  hl_filter->filter_width = width;
-
-  *filter = (hl_filter_descriptor)hl_filter;
-}
-
-void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
-  CHECK_NOTNULL(filter);
-
-  cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
-  CHECK_NOTNULL(hl_filter->desc);
-
-  CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
-
-  hl_filter->desc = NULL;
-
-  free(filter);
-}
-
-void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
-                                      hl_tensor_descriptor image,
-                                      hl_filter_descriptor filter,
-                                      int padding_height,
-                                      int padding_width,
-                                      int stride_height,
-                                      int stride_width,
-                                      int dilation_h,
-                                      int dilation_w) {
-  CHECK_NOTNULL(conv);
-
-  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
-      sizeof(_cudnn_convolution_descriptor));
-
-  CHECK_NOTNULL(hl_conv);
-  CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
-
-  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
-
-#if CUDNN_VERSION >= 6000
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode,
-                                                       data_type));
-#else
-  if (dilation_h > 1 || dilation_w > 1) {
-    LOG(FATAL)
-        << "Current cuDNN version does't support for dilation convolution. "
-        << "The dilation convolution requires cuDNN >= v6.0.";
-  }
-
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode));
-#endif
-
-  hl_conv->input_image = image;
-  hl_conv->filter = filter;
-  hl_conv->padding_height = padding_height;
-  hl_conv->padding_width = padding_width;
-  hl_conv->stride_height = stride_height;
-  hl_conv->stride_width = stride_width;
-  hl_conv->upscalex = 1;
-  hl_conv->upscaley = 1;
-  hl_conv->mode = mode;
-
-  *conv = (hl_convolution_descriptor)hl_conv;
-}
-
-void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
-                                     hl_tensor_descriptor image,
-                                     hl_filter_descriptor filter,
-                                     int padding_height,
-                                     int padding_width,
-                                     int stride_height,
-                                     int stride_width,
-                                     int dilation_h,
-                                     int dilation_w) {
-  CHECK_NOTNULL(conv);
-  CHECK_NOTNULL(image);
-  CHECK_NOTNULL(filter);
-
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
-
-#if CUDNN_VERSION >= 6000
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode,
-                                                       data_type));
-#else
-  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
-                                                       padding_height,
-                                                       padding_width,
-                                                       stride_height,
-                                                       stride_width,
-                                                       dilation_h,
-                                                       dilation_w,
-                                                       mode));
-#endif
-
-  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
-  hl_conv->input_image = image;
-  hl_conv->filter = filter;
-  hl_conv->padding_height = padding_height;
-  hl_conv->padding_width = padding_width;
-  hl_conv->stride_height = stride_height;
-  hl_conv->stride_width = stride_width;
-  hl_conv->upscalex = 1;
-  hl_conv->upscaley = 1;
-  hl_conv->mode = mode;
-}
-
-void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
-  CHECK_NOTNULL(conv);
-
-  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
-  CHECK_NOTNULL(hl_conv->desc);
-
-  CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
-  hl_conv->desc = NULL;
-
-  free(conv);
-}
-
-void hl_convolution_forward(hl_tensor_descriptor input,
-                            real* input_data,
-                            hl_tensor_descriptor output,
-                            real* output_data,
-                            hl_filter_descriptor filter,
-                            real* filter_data,
-                            hl_convolution_descriptor conv,
-                            void* gpuWorkSpace,
-                            size_t sizeInBytes,
-                            int convFwdAlgo) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(filter);
-  CHECK_NOTNULL(conv);
-  CHECK_NOTNULL(input_data);
-  CHECK_NOTNULL(output_data);
-  CHECK_NOTNULL(filter_data);
-  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  CHECK_CUDNN(dynload::cudnnConvolutionForward(
-      t_resource.cudnn_handle,
-      &alpha,
-      src_desc,
-      input_data,
-      filter_desc,
-      filter_data,
-      conv_desc,
-      static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
-      gpuWorkSpace,
-      sizeInBytes,
-      &beta,
-      dest_desc,
-      output_data));
-  CHECK_SYNC("hl_convolution_forward failed");
-}
-
-void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
-                                     real* bias_data,
-                                     hl_tensor_descriptor output,
-                                     real* output_data) {
-  CHECK_NOTNULL(bias);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(bias_data);
-  CHECK_NOTNULL(output_data);
-
-  cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-
-  CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle,
-#if CUDNN_VERSION < 4000
-                                      CUDNN_ADD_SAME_C,
-#endif
-                                      &alpha,
-                                      bias_desc,
-                                      bias_data,
-                                      &beta,
-                                      output_desc,
-                                      output_data));
-  CHECK_SYNC("hl_convolution_forward_add_bias failed");
-}
-
-void hl_convolution_backward_bias(hl_tensor_descriptor bias,
-                                  real* bias_grad_data,
-                                  hl_tensor_descriptor output,
-                                  real* output_grad_data) {
-  CHECK_NOTNULL(bias);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(bias_grad_data);
-  CHECK_NOTNULL(output_grad_data);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
-  CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle,
-                                                    &alpha,
-                                                    diff_desc,
-                                                    output_grad_data,
-                                                    &beta,
-                                                    bias_desc,
-                                                    bias_grad_data));
-  CHECK_SYNC("hl_convolution_backward_bias failed");
-}
-
-void hl_convolution_backward_filter(hl_tensor_descriptor input,
-                                    real* input_data,
-                                    hl_tensor_descriptor output,
-                                    real* output_grad_data,
-                                    hl_filter_descriptor filter,
-                                    real* filter_grad_data,
-                                    hl_convolution_descriptor conv,
-                                    void* gpuWorkSpace,
-                                    size_t sizeInBytes,
-                                    int convBwdFilterAlgo) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(filter);
-  CHECK_NOTNULL(conv);
-  CHECK_NOTNULL(input_data);
-  CHECK_NOTNULL(output_grad_data);
-  CHECK_NOTNULL(filter_grad_data);
-
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-  CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
-      t_resource.cudnn_handle,
-      &alpha,
-      src_desc,
-      input_data,
-      diff_desc,
-      output_grad_data,
-      conv_desc,
-#if CUDNN_VERSION >= 4000
-      static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
-      gpuWorkSpace,
-      sizeInBytes,
-#endif
-      &beta,
-      grad_desc,
-      filter_grad_data));
-  CHECK_SYNC("hl_convolution_backward_filter failed");
-}
-
-void hl_convolution_backward_data(hl_tensor_descriptor input,
-                                  real* input_data_grad,
-                                  hl_tensor_descriptor output,
-                                  real* output_grad_data,
-                                  hl_filter_descriptor filter,
-                                  real* filter_data,
-                                  hl_convolution_descriptor conv,
-                                  void* gpuWorkSpace,
-                                  size_t sizeInBytes,
-                                  int convBwdDataAlgo) {
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
-
-  CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
-      t_resource.cudnn_handle,
-      &alpha,
-      filter_desc,
-      filter_data,
-      diff_desc,
-      output_grad_data,
-      conv_desc,
-#if CUDNN_VERSION >= 4000
-      static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
-      gpuWorkSpace,
-      sizeInBytes,
-#endif
-      &beta,
-      grad_desc,
-      input_data_grad));
-  CHECK_SYNC("hl_convolution_backward_data failed");
-}
-
-void hl_softmax_forward(real* input, real* output, int height, int width) {
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
-                                                  CUDNN_TENSOR_NCHW,
-                                                  data_type,
-                                                  height,
-                                                  width,
-                                                  1,
-                                                  1));
-
-  real alpha = 1.0f;
-  real beta = 0.0f;
-  CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle,
-                                           CUDNN_SOFTMAX_ACCURATE,
-                                           CUDNN_SOFTMAX_MODE_CHANNEL,
-                                           &alpha,
-                                           t_resource.cudnn_desc,
-                                           input,
-                                           &beta,
-                                           t_resource.cudnn_desc,
-                                           output));
-  CHECK_SYNC("hl_softmax_forward failed");
-}
-
-void hl_softmax_backward(real* output_value,
-                         real* output_grad,
-                         int height,
-                         int width) {
-#ifndef PADDLE_TYPE_DOUBLE
-  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#else
-  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
-#endif
-  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
-                                                  CUDNN_TENSOR_NCHW,
-                                                  data_type,
-                                                  height,
-                                                  width,
-                                                  1,
-                                                  1));
-
-  real alpha = 1.0f;
-  real beta = 0.0f;
-  CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle,
-                                            CUDNN_SOFTMAX_ACCURATE,
-                                            CUDNN_SOFTMAX_MODE_CHANNEL,
-                                            &alpha,
-                                            t_resource.cudnn_desc,
-                                            output_value,
-                                            t_resource.cudnn_desc,
-                                            output_grad,
-                                            &beta,
-                                            t_resource.cudnn_desc,
-                                            output_grad));
-  CHECK_SYNC("hl_softmax_backward failed");
-}
-
-void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-                                    real* input,
-                                    hl_tensor_descriptor outputDesc,
-                                    real* output,
-                                    hl_tensor_descriptor bnParamDesc,
-                                    real* scale,
-                                    real* bias,
-                                    double factor,
-                                    real* runningMean,
-                                    real* runningInvVar,
-                                    double epsilon,
-                                    real* savedMean,
-                                    real* savedVar) {
-#if CUDNN_VERSION >= 4007
-  if ((NULL != runningMean && NULL == runningInvVar) ||
-      (NULL == runningMean && NULL != runningInvVar)) {
-    LOG(FATAL) << "runningMean and runningInvVar can be NULL "
-               << "but only at the same time.";
-  }
-  if ((NULL != savedMean && NULL == savedVar) ||
-      (NULL == savedMean && NULL != savedVar)) {
-    LOG(FATAL) << "savedMean and savedVar can be NULL "
-               << "but only at the same time.";
-  }
-
-  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
-  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
-  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(
-      dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle,
-                                                      mode,
-                                                      &alpha,
-                                                      &beta,
-                                                      xDesc,
-                                                      input,
-                                                      yDesc,
-                                                      output,
-                                                      bnDesc,
-                                                      scale,
-                                                      bias,
-                                                      factor,
-                                                      runningMean,
-                                                      runningInvVar,
-                                                      epsilon,
-                                                      savedMean,
-                                                      savedVar));
-
-  CHECK_SYNC("hl_batch_norm_forward_training failed");
-#else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
-             << "But cudnn lib version is " << g_cudnn_lib_version;
-#endif
-}
-
-void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-                                     real* input,
-                                     hl_tensor_descriptor outputDesc,
-                                     real* output,
-                                     hl_tensor_descriptor bnParamDesc,
-                                     real* scale,
-                                     real* bias,
-                                     real* estimatedMean,
-                                     real* estimatedInvVar,
-                                     double epsilon) {
-#if CUDNN_VERSION >= 4007
-  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
-  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
-  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-
-  CHECK_CUDNN(
-      dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
-                                                       mode,
-                                                       &alpha,
-                                                       &beta,
-                                                       xDesc,
-                                                       input,
-                                                       yDesc,
-                                                       output,
-                                                       bnDesc,
-                                                       scale,
-                                                       bias,
-                                                       estimatedMean,
-                                                       estimatedInvVar,
-                                                       epsilon));
-
-  CHECK_SYNC("hl_batch_norm_forward_inference failed");
-#else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
-             << "But cudnn lib version is " << g_cudnn_lib_version;
-#endif
-}
-
-void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                            real* input,
-                            hl_tensor_descriptor outGradDesc,
-                            real* outGrad,
-                            hl_tensor_descriptor inGradDesc,
-                            real* inGrad,
-                            hl_tensor_descriptor dBnParamDesc,
-                            real* scale,
-                            real* scaleGrad,
-                            real* biasGrad,
-                            double epsilon,
-                            real* savedMean,
-                            real* savedInvVar) {
-#if CUDNN_VERSION >= 4007
-  if ((NULL != savedMean && NULL == savedInvVar) ||
-      (NULL == savedMean && NULL != savedInvVar)) {
-    LOG(FATAL) << "savedMean and savedVar can be NULL "
-               << "but only at the same time.";
-  }
-
-  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
-  cudnnTensorDescriptor_t dyDesc = GET_TENSOR_DESCRIPTOR(outGradDesc);
-  cudnnTensorDescriptor_t dxDesc = GET_TENSOR_DESCRIPTOR(inGradDesc);
-  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(dBnParamDesc);
-  real alpha = 1.0f;
-  real beta = 1.0f;
-  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
-                                                       mode,
-                                                       &alpha,
-                                                       &beta,
-                                                       &alpha,
-                                                       &beta,
-                                                       xDesc,
-                                                       input,
-                                                       dyDesc,
-                                                       outGrad,
-                                                       dxDesc,
-                                                       inGrad,
-                                                       bnDesc,
-                                                       scale,
-                                                       scaleGrad,
-                                                       biasGrad,
-                                                       epsilon,
-                                                       savedMean,
-                                                       savedInvVar));
-
-  CHECK_SYNC("hl_batch_norm_backward failed");
-#else
-  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
-             << "But cudnn lib version is " << g_cudnn_lib_version;
-#endif
-}
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
deleted file mode 100644
index 3025aa48523d67fe3d7ed03f44252d1211d2a46a..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ /dev/null
@@ -1,677 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// clang-format off
-// Because clang-format 4.X and clang-format 3.8+ format
-// following lines in different. So disable clang-format.
-#include "hl_cuda.h"
-#include <cuda_profiler_api.h>
-#include <string.h>
-#include <sys/syscall.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include "hl_cuda.ph"
-#include "hl_thread.ph"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/DynamicLoader.h"
-// clang-format on
-
-namespace dynload {
-
-std::once_flag curand_dso_flag;
-void *curand_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load curand routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    curandStatus_t operator()(Args... args) {                                  \
-      typedef curandStatus_t (*curandFunc)(Args...);                           \
-      std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
-      void *p_##__name = dlsym(curand_dso_handle, #__name);                    \
-      return reinterpret_cast<curandFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-#else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    curandStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name; /* struct DynLoad__##__name */
-#endif
-
-/* include all needed curand functions in HPPL */
-// clang-format off
-#define CURAND_RAND_ROUTINE_EACH(__macro)    \
-  __macro(curandCreateGenerator)             \
-  __macro(curandSetStream)                   \
-  __macro(curandSetPseudoRandomGeneratorSeed)\
-  __macro(curandGenerateUniform)             \
-  __macro(curandGenerateUniformDouble)
-// clang-format on
-
-CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
-
-#undef CURAND_RAND_ROUTINE_EACH
-#undef DYNAMIC_LOAD_CURAND_WRAP
-
-} /* namespace dynload */
-
-/**
- * @brief   global resource.
- */
-int g_system_device_num = 0;                /* system device number */
-int device_num = 0;                         /* use    device number */
-hl_device_prop *g_device;                   /* device info table */
-__thread thread_device_resources *t_device; /* device resources table */
-int g_cuda_lib_version = 0;
-
-/* number of global stream */
-#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
-/* number of thread stream */
-#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
-/* sizeof of device memory */
-#define HPPL_GPU_MEMORY_SIZE (256 * 4)
-
-/**
- * Check build-in cuda function using glog and it **does not**
- * support << operator for more details error info.
- */
-#define CHECK_CUDA(cudaFunc)                                         \
-  do {                                                               \
-    cudaError_t cudaStat = cudaFunc;                                 \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
-                                    << cudaGetErrorString(cudaStat); \
-  } while (0)
-
-/**
- * @brief   thread resource.
- */
-__thread _hl_thread_resource t_resource = {{0},    /* stream */
-                                           0,      /* handle */
-                                           0,      /* gen */
-                                           0,      /* cudnn_handle */
-                                           0,      /* cudnn_desc */
-                                           NULL,   /* gen_mutex */
-                                           NULL,   /* gpu_mem */
-                                           NULL,   /* cpu_mem */
-                                           0,      /* event */
-                                           -1,     /* device */
-                                           0,      /* major */
-                                           false}; /* is_init */
-
-__thread cudaStream_t default_stream = 0;
-__thread bool g_sync_flag = true;
-bool hl_start_flag = false;
-
-inline pid_t gettid() {
-#if defined(__APPLE__) || defined(__OSX__)
-  // syscall is deprecated: first deprecated in macOS 10.12.
-  // syscall is unsupported;
-  // syscall pid_t tid = syscall(SYS_thread_selfid);
-  uint64_t tid;
-  pthread_threadid_np(NULL, &tid);
-#else
-#ifndef __NR_gettid
-#define __NR_gettid 224
-#endif
-  pid_t tid = syscall(__NR_gettid);
-#endif
-  CHECK_NE((int)tid, -1);
-  return tid;
-}
-
-void hl_init(int device) {
-  CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
-
-  /* thread has been initialized */
-  if (true == t_resource.is_init) {
-    hl_set_device(device);
-    return;
-  }
-
-  /* create thread devcie resources */
-  char *tmp;
-  thread_device_resources device_res;
-  tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
-                       device_num * sizeof(_thread_device_resources));
-  CHECK_NOTNULL(tmp);
-  t_device = (thread_device_resources *)tmp;
-  device_res = (thread_device_resources)(
-      (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
-  memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
-
-  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
-                                    sizeof(cudaStream_t));
-  CHECK_NOTNULL(tmp_stream);
-
-  int num = 0;
-  for (int dev = 0; dev < g_system_device_num; dev++) {
-    if (!g_device[dev]) {
-      continue;
-    }
-
-    t_device[dev] = &device_res[num];
-    t_device[dev]->stream =
-        (cudaStream_t *)(tmp_stream +
-                         num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
-
-    hl_create_thread_resources(dev, t_device[dev]);
-    num++;
-  }
-
-  hl_cudnn_desc_init(&t_resource.cudnn_desc);
-
-  /* thread initialization is complete */
-  t_resource.is_init = true;
-  /* set device */
-  t_resource.device = -1;
-  hl_set_device(device);
-}
-
-void hl_fini() {
-  if (false == t_resource.is_init) {
-    return;
-  }
-
-  /* hppl stream fini */
-  t_resource.device = -1;
-  for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
-    t_resource.stream[i] = 0;
-  }
-
-  char *tmp = (char *)t_device;
-  char *tmp_stream = NULL;
-  for (int dev = 0; dev < g_system_device_num; dev++) {
-    if (!t_device[dev]) {
-      continue;
-    }
-    if (!tmp_stream) {
-      tmp_stream = (char *)t_device[dev]->stream;
-    }
-    for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-      CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
-    }
-
-    /* free device memory */
-    hl_free_mem_device(t_device[dev]->gpu_mem);
-    hl_free_mem_host(t_device[dev]->cpu_mem);
-    CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
-  }
-
-  free(tmp);
-  free(tmp_stream);
-  t_resource.is_init = false;
-}
-
-int hl_get_device_count() { return device_num; }
-
-void hl_set_device(int device) {
-  if (device == t_resource.device) {
-    return;
-  }
-
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device: " << device << " is not specified in startup.";
-
-  CHECK_CUDA(cudaSetDevice(device));
-
-  /* switch thread stream */
-  for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
-    t_resource.stream[i] = g_device[device]->device_resources->stream[i];
-  }
-
-  if (true == t_resource.is_init) {
-    for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
-      t_resource.stream[i] =
-          t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
-    }
-    t_resource.gpu_mem = t_device[device]->gpu_mem;
-    t_resource.cpu_mem = t_device[device]->cpu_mem;
-    t_resource.event = t_device[device]->mem_event;
-  }
-
-  t_resource.handle = g_device[device]->device_resources->handle;
-  t_resource.gen = g_device[device]->device_resources->gen;
-  t_resource.cudnn_handle = g_device[device]->device_resources->cudnn_handle;
-  t_resource.gen_mutex = g_device[device]->device_resources->gen_mutex;
-  t_resource.device = device;
-  t_resource.major = g_device[device]->major;
-  default_stream = t_resource.stream[0];
-}
-
-int hl_get_device() {
-  int device;
-  CHECK_CUDA(cudaGetDevice(&device));
-  return device;
-}
-
-void *hl_malloc_device(size_t size) {
-  void *dest_d;
-
-  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
-
-  return dest_d;
-}
-
-void hl_free_mem_device(void *dest_d) {
-  CHECK_NOTNULL(dest_d);
-
-  cudaError_t err = cudaFree(dest_d);
-  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
-      << hl_get_device_error_string();
-}
-
-void *hl_malloc_host(size_t size) {
-  void *dest_h;
-
-  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
-
-  return dest_h;
-}
-
-void hl_free_mem_host(void *dest_h) {
-  CHECK_NOTNULL(dest_h);
-
-  cudaError_t err = cudaFreeHost(dest_h);
-  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
-      << hl_get_device_error_string();
-}
-
-void hl_memcpy(void *dst, void *src, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
-}
-
-void hl_memset_device(void *dest_d, int value, size_t size) {
-  CHECK_CUDA(cudaMemset(dest_d, value, size));
-}
-
-void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(src_h);
-  CHECK_NOTNULL(dest_d);
-  CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
-}
-
-void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dest_h);
-  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
-}
-
-void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dest_d);
-  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
-}
-
-void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
-  cudaStream_t cu_stream;
-
-  if (0 == size) {
-    return;
-  }
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_LT(stream, HPPL_STREAM_END);
-  cu_stream = t_resource.stream[stream];
-
-  CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
-}
-
-void hl_start() {
-  hl_specify_devices_start(NULL, 0);
-  /* set default device */
-  hl_set_device(0);
-}
-
-bool hl_device_can_access_peer(int device, int peerDevice) {
-  int canAccessPeer;
-  CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
-
-  if (canAccessPeer == 1) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-void hl_device_enable_peer_access(int peerDevice) {
-  cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
-  if (cudaErrorPeerAccessAlreadyEnabled == err) {
-    cudaGetLastError();
-  } else {
-    CHECK_CUDA(err);
-  }
-}
-
-void hl_create_global_resources(hl_device_prop device_prop) {
-  struct cudaDeviceProp cu_prop;
-  int device = device_prop->device;
-  global_device_resources device_res = device_prop->device_resources;
-
-  CHECK_CUDA(cudaSetDevice(device));
-  /* device properties */
-  CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
-
-  device_prop->major = cu_prop.major;
-  device_prop->minor = cu_prop.minor;
-  strncpy(device_prop->device_name, cu_prop.name, 256);
-  device_prop->device_mem = cu_prop.totalGlobalMem;
-
-  /* create device stream */
-  for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
-    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
-  }
-
-  /* cublas init */
-  hl_cublas_init(&device_res->handle, device_res->stream[0]);
-
-  /* create curand gen */
-  CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen,
-                                          CURAND_RNG_PSEUDO_DEFAULT),
-           CURAND_STATUS_SUCCESS)
-      << "[Start failed] Curand init failed.";
-
-  CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]),
-           CURAND_STATUS_SUCCESS)
-      << "[Start failed] Curand set stream failed!";
-
-  /* create cudnn handle */
-  hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]);
-
-  int seed = gettid();
-  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen,
-                                                       seed + device),
-           CURAND_STATUS_SUCCESS);
-
-  device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
-  pthread_mutex_init(device_res->gen_mutex, NULL);
-
-  CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
-}
-
-int hl_get_cuda_version() { return g_cuda_lib_version; }
-
-void hl_create_thread_resources(int device,
-                                thread_device_resources device_res) {
-  CHECK_CUDA(cudaSetDevice(device));
-
-  /* create thread stream */
-  for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
-  }
-
-  /* allocation device memory */
-  device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
-
-  /* allocation host memory */
-  device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
-
-  CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
-}
-
-void hl_specify_devices_start(int *device, int number) {
-  if (hl_start_flag) return;
-
-  /* 1. get the number of devices */
-  CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
-  CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
-  if (device == NULL) {
-    number = g_system_device_num;
-  }
-
-  /* 2. check device & create device property table */
-  CHECK_LE(number, g_system_device_num)
-      << "[Start failed] System does not have enough device. "
-      << "Device number: " << g_system_device_num << "Input number: " << number;
-
-  char *tmp;
-  hl_device_prop device_prop;
-  tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) +
-                       number * sizeof(_hl_device_prop));
-  CHECK(tmp) << "[Start failed] System memory is not enough.";
-
-  g_device = (hl_device_prop *)tmp;
-  device_prop = (hl_device_prop)(
-      (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
-  memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
-  int num = 0;
-  for (int i = 0; i < number; i++) {
-    int dev;
-    if (device == NULL) {
-      dev = i;
-    } else {
-      dev = device[i];
-    }
-
-    CHECK_LT(dev, g_system_device_num)
-        << "[Start failed] The specified device number is "
-        << "out of range. Max device number: " << g_system_device_num - 1
-        << " Specified devcie number: " << dev;
-
-    if (g_device[dev]) {
-      /* Warning */
-      LOG(WARNING) << "[Warning] Repeat specify device: " << dev;
-      continue;
-    }
-
-    g_device[dev] = &device_prop[num];
-    g_device[dev]->device = dev;
-    num++;
-  }
-  device_num = num;
-
-  /* 3.  create global device resources */
-  char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources));
-  CHECK_NOTNULL(tmp_res);
-
-  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM *
-                                    sizeof(cudaStream_t));
-  CHECK_NOTNULL(tmp_stream);
-
-  num = 0;
-  for (int i = 0; i < g_system_device_num; i++) {
-    if (!g_device[i]) {
-      continue;
-    }
-
-    g_device[i]->device_resources = (global_device_resources)(
-        tmp_res + num * sizeof(_global_device_resources));
-    g_device[i]->device_resources->stream =
-        (cudaStream_t *)(tmp_stream +
-                         num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t));
-
-    hl_create_global_resources(g_device[i]);
-    num++;
-  }
-
-  /* hl_start() is ok */
-  hl_start_flag = true;
-  /* set default device */
-  if (device == NULL) {
-    hl_set_device(0);
-  } else {
-    hl_set_device(device[0]);
-  }
-}
-
-void hl_rand(real *dest_d, size_t num) {
-  pthread_mutex_lock(t_resource.gen_mutex);
-  CHECK_EQ(
-#ifndef PADDLE_TYPE_DOUBLE
-      dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
-#else
-      dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
-#endif
-      CURAND_STATUS_SUCCESS);
-  pthread_mutex_unlock(t_resource.gen_mutex);
-  CHECK_SYNC("hl_rand failed");
-}
-
-void hl_srand(unsigned int seed) {
-  pthread_mutex_lock(t_resource.gen_mutex);
-  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed),
-           CURAND_STATUS_SUCCESS);
-  pthread_mutex_unlock(t_resource.gen_mutex);
-}
-
-void hl_set_sync_flag(bool flag) { g_sync_flag = flag; }
-
-bool hl_get_sync_flag() { return g_sync_flag; }
-
-void hl_stream_synchronize(hl_stream_t stream) {
-  cudaStream_t cu_stream;
-
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
-
-  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
-}
-
-void hl_create_event(hl_event_t *event) {
-  CHECK_NOTNULL(event);
-
-  struct _hl_event_st *st_event =
-      (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
-
-  CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
-
-  *event = st_event;
-}
-
-float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
-  float time;
-  CHECK_NOTNULL(start);
-  CHECK_NOTNULL(end);
-
-  CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
-  return time;
-}
-
-void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
-  cudaStream_t cu_stream;
-
-  CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
-
-  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
-}
-
-void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
-  cudaStream_t cu_stream;
-
-  CHECK_NOTNULL(event);
-  CHECK_LT(stream, HPPL_STREAM_END) << __func__
-                                    << ": the parameter stream is error.";
-
-  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
-}
-
-void hl_destroy_event(hl_event_t event) {
-  CHECK_NOTNULL(event);
-  CHECK_CUDA(cudaEventDestroy(event->cu_event));
-
-  free(event);
-  event = NULL;
-}
-
-void hl_event_synchronize(hl_event_t event) {
-  CHECK_NOTNULL(event);
-  CHECK_CUDA(cudaEventSynchronize(event->cu_event));
-}
-
-void hl_get_device_name(char *name, int len, int device) {
-  CHECK_NOTNULL(name);
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device(" << device << ") is not specified in startup.";
-
-  strncpy(name, g_device[device]->device_name, len);
-}
-
-void hl_get_device_memory(size_t *mem_size, int device) {
-  CHECK_NOTNULL(mem_size);
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device(" << device << ") is not specified in startup.";
-
-  *mem_size = g_device[device]->device_mem;
-}
-
-void hl_get_device_compute_capability(int *major, int *minor, int device) {
-  CHECK_NOTNULL(major);
-  CHECK_NOTNULL(minor);
-  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
-      << "Device(" << device << ") is not specified in startup.";
-
-  *major = g_device[device]->major;
-  *minor = g_device[device]->minor;
-}
-
-int hl_get_device_last_error() { return (int)cudaGetLastError(); }
-
-const char *hl_get_device_error_string() {
-  cudaError_t err = cudaGetLastError();
-  return cudaGetErrorString(err);
-}
-
-const char *hl_get_device_error_string(size_t err) {
-  return cudaGetErrorString((cudaError_t)err);
-}
-
-void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
-void hl_set_device_flags_block() {
-  CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
-}
-
-bool hl_cuda_event_is_ready(hl_event_t event) {
-  cudaError_t err = cudaEventQuery(event->cu_event);
-  CHECK(cudaSuccess == err || cudaErrorNotReady == err);
-
-  if (cudaErrorNotReady == err) {
-    return false;
-  }
-  return true;
-}
-
-void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
-
-void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu
deleted file mode 100644
index 21c0c26b6ef0420b1a719736a66eeb8114ed9680..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ /dev/null
@@ -1,872 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_activation_functions.h"
-#include "hl_base.h"
-#include "hl_cuda_cublas.h"
-#include "hl_device_functions.cuh"
-#include "paddle/utils/Logging.h"
-
-typedef hppl::Active<real>::forward t_forward;
-typedef hppl::Active<real>::backward t_backward;
-
-bool hl_lstm_sequence_parallel(int frameSize) {
-  if (frameSize == 32 || frameSize == 64) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-class frameValue {
-public:
-  real *value_;
-  __device__ frameValue(real *value) : value_(value) {}
-  template <int reversed, int frameSize>
-  __device__ inline void init(int start, int length, int idx) {
-    if (reversed == 0) {
-      value_ += start * frameSize + idx;
-    } else {
-      value_ += (start + length - 1) * frameSize + idx;
-    }
-  }
-  __device__ inline real *getPtr() const { return value_; }
-  __device__ inline real getValue() { return *value_; }
-  __device__ inline void setValue(real value) { *value_ = value; }
-  template <int reversed, int frameSize>
-  __device__ inline void nextFrame() {
-    if (reversed == 0) {
-      value_ += frameSize;
-    } else {
-      value_ -= frameSize;
-    }
-  }
-};
-
-__device__ __forceinline__ void ptx_sync(const int id, const int barriers) {
-  asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory");
-}
-
-__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) {
-  asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory");
-}
-
-template <int valueSize, int frameSize>
-__device__ __forceinline__ real forward_sequence(real value,
-                                                 real *shValue,
-                                                 real *state,
-                                                 real *preOutput,
-                                                 real *output,
-                                                 real check,
-                                                 int index,
-                                                 t_forward activeNode,
-                                                 t_forward activeGate,
-                                                 t_forward activeState) {
-  real out;
-  real prevOut;
-  real state_r;
-  const int idx = index % frameSize;
-  const int idy = index / frameSize;
-  // assert(index < valueSize);
-
-  if (idy == 0) {
-    value = activeNode(value);
-    shValue[index] = value;
-  }
-  if (idy == 1 || idy == 2) {
-    state_r = state[idx];
-    value += state_r * check;
-    value = activeGate(value);
-    shValue[index] = value;
-  }
-  ptx_sync(1, valueSize);
-  if (idy == 3) {
-    state_r = state[idx];
-    state_r = state_r * shValue[idx + frameSize * 2];
-    state_r += shValue[idx] * shValue[idx + frameSize];
-    state[idx] = state_r;
-    ptx_arrive(2, frameSize * 2);
-    value += state_r * check;
-    value = activeGate(value);
-    shValue[index] = value;
-    ptx_sync(3, frameSize * 2);
-    prevOut = preOutput[idx];
-    out = prevOut * value;
-    output[idx] = out;
-  }
-  if (idy == 0) {
-    ptx_sync(2, frameSize * 2);
-    prevOut = state[idx];
-    prevOut = activeState(prevOut);
-    preOutput[idx] = prevOut;
-    ptx_arrive(3, frameSize * 2);
-  }
-  return value;
-}
-
-#define OUTPUT_BARRIER_ID 10
-#define OUTPUT_BARRIER_ID2 11
-template <int valueSize,
-          int frameSize,
-          int reversed,
-          int computeThreads,
-          int blockSize>
-__global__ void KeLstmForward(real *gateValue,
-                              real *state,
-                              real *output,
-                              real *preOutput,
-                              real *checkIg,
-                              real *checkFg,
-                              real *checkOg,
-                              real *weight,
-                              const int *starts,
-                              hl_activation_mode_t active_node,
-                              hl_activation_mode_t active_gate,
-                              hl_activation_mode_t active_state) {
-  __shared__ real shValue[valueSize];
-  __shared__ real shState[frameSize];
-  __shared__ real shPrevOutput[frameSize];
-  __shared__ real shOutput[frameSize];
-
-  const int index = threadIdx.x;
-  int start = starts[blockIdx.x];
-  int length = starts[blockIdx.x + 1] - start;
-
-  /* init */
-  real check;
-  real value;
-  frameValue frameGate(gateValue);
-  frameValue frameState(state);
-  frameValue frameOutput(output);
-  frameValue framePreOutput(preOutput);
-  if (index < valueSize) {
-    const int idx = index % frameSize;
-    const int idy = index / frameSize;
-    frameGate.init<reversed, valueSize>(start, length, index);
-    value = frameGate.getValue();
-    if (idy == 0) {
-      shState[idx] = 0.0;
-    } else if (idy == 1) {
-      check = checkIg[idx];
-    } else if (idy == 2) {
-      check = checkFg[idx];
-    } else if (idy == 3) {
-      check = checkOg[idx];
-    }
-
-    if (idy == 3) {
-      frameState.init<reversed, frameSize>(start, length, idx);
-      frameOutput.init<reversed, frameSize>(start, length, idx);
-      framePreOutput.init<reversed, frameSize>(start, length, idx);
-    }
-
-    ptx_sync(1, valueSize);
-  }
-
-  for (int i = 0; i < length; ++i) {
-    if (index < valueSize) {
-      if (valueSize == 128) {
-        if (i != 0) {
-          ptx_sync(OUTPUT_BARRIER_ID2, blockSize);
-          value += shValue[index];
-        }
-      }
-      value = forward_sequence<valueSize, frameSize>(
-          value,
-          shValue,
-          shState,
-          shPrevOutput,
-          shOutput,
-          check,
-          index,
-          hppl::gpu::forward[active_node],
-          hppl::gpu::forward[active_gate],
-          hppl::gpu::forward[active_state]);
-      const int idx = index % frameSize;
-      const int idy = index / frameSize;
-      if (valueSize == 128) {
-        if (idy == 3) {
-          ptx_arrive(OUTPUT_BARRIER_ID, frameSize + 128);
-        }
-      }
-      if (valueSize == 256) {
-        ptx_sync(OUTPUT_BARRIER_ID, valueSize);
-      }
-      frameGate.setValue(value);
-      if (idy == 3) {
-        frameState.setValue(shState[idx]);
-        frameOutput.setValue(shOutput[idx]);
-        framePreOutput.setValue(shPrevOutput[idx]);
-        frameState.nextFrame<reversed, frameSize>();
-        frameOutput.nextFrame<reversed, frameSize>();
-        framePreOutput.nextFrame<reversed, frameSize>();
-      }
-      if (i != length - 1) {
-        frameGate.nextFrame<reversed, valueSize>();
-        value = frameGate.getValue();
-      }
-    }
-    if (i != length - 1) {
-      if (valueSize == 128) {
-        if (valueSize <= index) {
-          real B_r[frameSize];
-          const int computeIdx = index - valueSize;
-          if (i == 0) {
-#pragma unroll
-            for (int n = 0; n < frameSize; n++) {
-              B_r[n] = weight[n * valueSize + computeIdx];
-            }
-          }
-          ptx_sync(OUTPUT_BARRIER_ID, frameSize + 128);
-          real A_r[frameSize];
-          for (int n = 0; n < frameSize; n++) {
-            A_r[n] = shOutput[n];
-          }
-          real sum = 0.0f;
-          for (int n = 0; n < frameSize; n++) {
-            sum += A_r[n] * B_r[n];
-          }
-          shValue[computeIdx] = sum;
-          ptx_arrive(OUTPUT_BARRIER_ID2, blockSize);
-        }
-      }
-      if (valueSize == 256) {
-        real B_r[frameSize];
-        if (i == 0) {
-#pragma unroll
-          for (int n = 0; n < frameSize; n++) {
-            B_r[n] = weight[n * valueSize + index];
-          }
-        }
-        real sum = 0.0f;
-        for (int n = 0; n < frameSize; n++) {
-          sum += shOutput[n] * B_r[n];
-        }
-        value += sum;
-      }
-    }
-  }
-}
-
-void hl_lstm_parallel_forward(real *gateValue,
-                              real *stateValue,
-                              real *preOutputValue,
-                              real *outputValue,
-                              real *checkIg,
-                              real *checkFg,
-                              real *checkOg,
-                              real *weight,
-                              const int *sequence,
-                              int frameSize,
-                              int numSequences,
-                              bool reversed,
-                              hl_activation_mode_t active_node,
-                              hl_activation_mode_t active_gate,
-                              hl_activation_mode_t active_state) {
-  CHECK(frameSize == 32 || frameSize == 64);
-  dim3 grid(numSequences, 1);
-  if (!reversed) {
-    if (frameSize == 32) {
-      KeLstmForward<128, 32, 0, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmForward<256, 64, 0, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  } else {
-    if (frameSize == 32) {
-      KeLstmForward<128, 32, 1, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmForward<256, 64, 1, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          stateValue,
-          outputValue,
-          preOutputValue,
-          checkIg,
-          checkFg,
-          checkOg,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  }
-  CHECK_SYNC("hl_lstm_parallel_forward failed");
-}
-
-__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
-  int addr = idx % 32;
-#pragma unroll
-  for (int k = 1; k < 32; k++) {
-    // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
-    addr = __shfl(addr, (idx + 1) % 32, 32);
-    a[k] = __shfl(a[k], addr, 32);
-  }
-
-#pragma unroll
-  for (int tid = 0; tid < 31; tid++) {
-    real tmp = (idx > tid) ? a[0] : a[1];
-#pragma unroll
-    for (int k = 31; k > 0; k--) {
-      a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32];
-    }
-    a[1] = tmp;
-  }
-
-  addr = (32 - idx) % 32;
-#pragma unroll
-  for (int k = 0; k < 32; k++) {
-    a[k] = __shfl(a[k], addr, 32);
-    addr = __shfl(addr, (idx + 31) % 32, 32);
-  }
-}
-
-template <int valueSize, int frameSize>
-__device__ void backward_sequence(real rGateValue,
-                                  real rOutputGrad,
-                                  real rPreOutputValue,
-                                  real &rGateGrad,
-                                  real &rStateGrad,
-                                  real *shStateGrad,
-                                  real *shStateValue,
-                                  real *shGateValue,
-                                  real rCheck,
-                                  real &rGateValuePrev,
-                                  int index,
-                                  t_backward activeNode,
-                                  t_backward activeGate,
-                                  t_backward activeState) {
-  const int frameIdx = index % frameSize;
-  const int frameIdy = index / frameSize;
-  if (frameIdy == 3) {
-    real rPrevOutputGrad;
-    rPrevOutputGrad = rOutputGrad * rGateValue;
-    rStateGrad = activeState(rPrevOutputGrad, rPreOutputValue);
-    rGateGrad = rOutputGrad * rPreOutputValue;
-    rGateGrad = activeGate(rGateGrad, rGateValue);
-    rStateGrad += rGateGrad * rCheck;
-    shStateGrad[index] = rStateGrad;
-    ptx_arrive(3, valueSize);
-  } else if (frameIdy == 1) {
-    shGateValue[frameIdx + frameSize] = rGateValue;
-    rStateGrad = rGateGrad * rCheck;
-    shStateGrad[index] = rStateGrad;
-    ptx_sync(3, valueSize);
-    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
-    rGateGrad = rStateGrad * shGateValue[frameIdx];
-    rGateGrad = activeGate(rGateGrad, rGateValue);
-  } else if (frameIdy == 2) {
-    rStateGrad = rStateGrad * rGateValuePrev;
-    rStateGrad += rGateGrad * rCheck;
-    shStateGrad[index] = rStateGrad;
-    ptx_sync(3, valueSize);
-    rStateGrad += shStateGrad[frameIdx + frameSize];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
-    rGateValuePrev = rGateValue;
-    rGateGrad = rStateGrad * shStateValue[frameIdx];
-    rGateGrad = activeGate(rGateGrad, rGateValue);
-  } else if (frameIdy == 0) {
-    shGateValue[frameIdx] = rGateValue;
-    ptx_sync(3, valueSize);
-    rStateGrad = shStateGrad[frameIdx + frameSize];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
-    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
-    rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize];
-    rGateGrad = activeNode(rGateGrad, rGateValue);
-  }
-}
-
-template <int valueSize, int frameSize>
-__device__ void load_weight(real rWeight[], real *weight, const int index) {
-  if (valueSize == 128) {
-    weight += index;
-#pragma unroll
-    for (int n = 0; n < frameSize; n++) {
-      rWeight[n] = weight[n * valueSize];
-    }
-    transpose_32x32(rWeight, index % 32);
-  }
-  if (valueSize == 256) {
-    int id = (index / 32) % 2;
-    weight += index - id * 32 + id * 32 * valueSize;
-#pragma unroll
-    for (int n = 0; n < 32; n++) {
-      rWeight[n] = weight[n * valueSize];
-      rWeight[n + 32] = weight[n * valueSize + 32];
-    }
-    transpose_32x32(rWeight, index % 32);
-    transpose_32x32(&rWeight[32], index % 32);
-  }
-}
-
-template <int valueSize, int frameSize, int reversed>
-__global__ void KeLstmBackward(real *gateValue,
-                               real *gateGrad,
-                               real *stateValue,
-                               real *stateGrad, /* do not need save */
-                               real *preOutputValue,
-                               real *preOutputGrad, /* do not need save */
-                               real *checkIg,
-                               real *checkIgGrad,
-                               real *checkFg,
-                               real *checkFgGrad,
-                               real *checkOg,
-                               real *checkOgGrad,
-                               real *outputGrad,
-                               real *weightValue,
-                               const int *starts,
-                               hl_activation_mode_t active_node,
-                               hl_activation_mode_t active_gate,
-                               hl_activation_mode_t active_state) {
-  __shared__ real shGateValue[valueSize];
-  __shared__ real shStateGrad[valueSize];
-  __shared__ real shStateValue[frameSize];
-  __shared__ real shGateGrad[4][frameSize];
-  __shared__ real shOutputGrad[4][frameSize];
-  const int index = threadIdx.x;
-  int start = starts[blockIdx.x];
-  int length = starts[blockIdx.x + 1] - start;
-
-  const int frameIdx = index % frameSize;
-  const int frameIdy = index / frameSize;
-  real rCheck;
-  real rCheckGrad;
-  real rGateGrad;
-  real rStateGrad;
-  real rGateValuePrev;
-  real rPreOutputValue;
-  real rOutputGrad;
-  real rGateValue;
-  real rStateValue;
-
-  frameValue frameGateValue(gateValue);
-  frameValue frameGateGrad(gateGrad);
-  frameValue framePreOutputValue(preOutputValue);
-  frameValue frameStateValue(stateValue);
-  frameValue frameOutputGrad(outputGrad);
-  if (frameIdy == 0) {
-  } else if (frameIdy == 1) {
-    rCheck = checkIg[frameIdx];
-  } else if (frameIdy == 2) {
-    rCheck = checkFg[frameIdx];
-    rGateValuePrev = 0.0;
-    rStateGrad = 0.0;
-  } else if (frameIdy == 3) {
-    rCheck = checkOg[frameIdx];
-    framePreOutputValue.init<!reversed, frameSize>(start, length, frameIdx);
-    frameOutputGrad.init<!reversed, frameSize>(start, length, frameIdx);
-    rOutputGrad = frameOutputGrad.getValue();
-    rPreOutputValue = framePreOutputValue.getValue();
-    frameStateValue.init<!reversed, frameSize>(start, length, frameIdx);
-    rStateValue = frameStateValue.getValue();
-  }
-
-  frameGateValue.init<!reversed, valueSize>(start, length, index);
-  frameGateGrad.init<!reversed, valueSize>(start, length, index);
-  rGateValue = frameGateValue.getValue();
-  rGateGrad = 0.0;
-  rCheckGrad = 0.0;
-
-  real B_r[frameSize];
-  load_weight<valueSize, frameSize>(B_r, weightValue, index);
-
-  for (int i = 0; i < length; ++i) {
-    if (frameIdy == 3) {
-      if (i != length - 1) {
-        frameStateValue.nextFrame<!reversed, frameSize>();
-        shStateValue[frameIdx] = frameStateValue.getValue();
-      } else {
-        shStateValue[frameIdx] = 0.0;
-      }
-    }
-    backward_sequence<valueSize, frameSize>(rGateValue,
-                                            rOutputGrad,
-                                            rPreOutputValue,
-                                            rGateGrad,
-                                            rStateGrad,
-                                            shStateGrad,
-                                            shStateValue,
-                                            shGateValue,
-                                            rCheck,
-                                            rGateValuePrev,
-                                            index,
-                                            hppl::gpu::backward[active_node],
-                                            hppl::gpu::backward[active_gate],
-                                            hppl::gpu::backward[active_state]);
-    if (frameIdy == 3) {
-      rCheckGrad += rGateGrad * rStateValue;
-      rStateValue = shStateValue[frameIdx];
-    }
-
-    frameGateGrad.setValue(rGateGrad);
-    frameGateGrad.nextFrame<!reversed, valueSize>();
-
-    if (i != length - 1) {
-      if (frameIdy == 3) {
-        framePreOutputValue.nextFrame<!reversed, frameSize>();
-        rPreOutputValue = framePreOutputValue.getValue();
-        frameOutputGrad.nextFrame<!reversed, frameSize>();
-        rOutputGrad = frameOutputGrad.getValue();
-      } else if (frameIdy == 2) {
-        rCheckGrad += rGateGrad * shStateValue[frameIdx];
-      } else if (frameIdy == 1) {
-        rCheckGrad += rGateGrad * shStateValue[frameIdx];
-      }
-
-      frameGateValue.nextFrame<!reversed, valueSize>();
-      rGateValue = frameGateValue.getValue();
-      shGateGrad[frameIdy][frameIdx] = rGateGrad;
-      if (valueSize == 128) {
-        real sum = 0.0f;
-#pragma unroll
-        for (int n = 0; n < frameSize; n++) {
-          sum += shGateGrad[frameIdy][n] * B_r[n];
-        }
-        if (frameIdy == 3) {
-          rOutputGrad += sum;
-        } else {
-          shOutputGrad[frameIdy][frameIdx] = sum;
-        }
-      }
-      if (valueSize == 256) {
-        ptx_sync(5, valueSize);
-        real A_r[frameSize];
-        for (int n = 0; n < frameSize; n++) {
-          A_r[n] = shGateGrad[frameIdy][n];
-        }
-        real sum = 0.0f;
-        for (int n = 0; n < frameSize; n++) {
-          sum += A_r[n] * B_r[n];
-        }
-        if (frameIdy == 3) {
-          rOutputGrad += sum;
-        } else {
-          shOutputGrad[frameIdy][frameIdx] = sum;
-        }
-      }
-
-      if (frameIdy == 3) {
-        ptx_sync(6, valueSize);
-#pragma unroll
-        for (int i = 0; i < 3; i++) {
-          rOutputGrad += shOutputGrad[i][frameIdx];
-        }
-      } else {
-        ptx_arrive(6, valueSize);
-      }
-    }
-  }
-
-  /* TODO: Temporary save & merger in another kernel */
-  if (frameIdy == 1) {
-    if (checkIgGrad)
-      paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad);
-  } else if (frameIdy == 2) {
-    if (checkFgGrad)
-      paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad);
-  } else if (frameIdy == 3) {
-    if (checkOgGrad)
-      paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad);
-  }
-}
-
-void hl_lstm_parallel_backward_data(real *gateValue,
-                                    real *gateGrad,
-                                    real *stateValue,
-                                    real *stateGrad,
-                                    real *preOutputValue,
-                                    real *preOutputGrad,
-                                    real *outputGrad,
-                                    real *checkIg,
-                                    real *checkIgGrad,
-                                    real *checkFg,
-                                    real *checkFgGrad,
-                                    real *checkOg,
-                                    real *checkOgGrad,
-                                    real *weight,
-                                    const int *sequence,
-                                    int frameSize,
-                                    int numSequences,
-                                    bool reversed,
-                                    hl_activation_mode_t active_node,
-                                    hl_activation_mode_t active_gate,
-                                    hl_activation_mode_t active_state) {
-  CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
-        frameSize == 256);
-  dim3 grid(numSequences, 1);
-  if (!reversed) {
-    if (frameSize == 32) {
-      KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 128) {
-      KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 256) {
-      KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  } else {
-    if (frameSize == 32) {
-      KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 64) {
-      KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 128) {
-      KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    } else if (frameSize == 256) {
-      KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>(
-          gateValue,
-          gateGrad,
-          stateValue,
-          stateGrad,
-          preOutputValue,
-          preOutputGrad,
-          checkIg,
-          checkIgGrad,
-          checkFg,
-          checkFgGrad,
-          checkOg,
-          checkOgGrad,
-          outputGrad,
-          weight,
-          sequence,
-          active_node,
-          active_gate,
-          active_state);
-    }
-  }
-  CHECK_SYNC("hl_lstm_parallel_backward_data");
-}
-
-template <int B_X, int B_Y>
-__global__ void KeSetGradZero(real *gateGrad,
-                              const int *starts,
-                              int valueSize,
-                              int numSequences,
-                              bool reversed) {
-  // const int tid = threadIdx.x;
-
-  const int frameIdx = blockIdx.x * B_X + threadIdx.x;
-  const int numSeqId = blockIdx.y * B_Y + threadIdx.y;
-
-  if (numSeqId >= numSequences || frameIdx >= valueSize) return;
-
-  if (!reversed) {
-    int seqId = starts[numSeqId];
-    gateGrad[seqId * valueSize + frameIdx] = 0.0;
-  } else {
-    int seqId = starts[numSeqId + 1] - 1;
-    gateGrad[seqId * valueSize + frameIdx] = 0.0;
-  }
-}
-
-void hl_lstm_parallel_backward_weight(real *weightGrad,
-                                      real *outputValue,
-                                      real *gateGrad,
-                                      const int *sequence,
-                                      int frameSize,
-                                      int batchSize,
-                                      int numSequences,
-                                      bool reversed) {
-  int valueSize = 4 * frameSize;
-  dim3 threads(32, 32);
-  dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32);
-  KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      gateGrad, sequence, valueSize, numSequences, reversed);
-
-  if (!reversed) {
-    hl_matrix_mul(outputValue,
-                  HPPL_OP_T,
-                  gateGrad + valueSize,
-                  HPPL_OP_N,
-                  weightGrad,
-                  frameSize,
-                  valueSize,
-                  batchSize - 1,
-                  1.0,
-                  1.0);
-  } else {
-    hl_matrix_mul(outputValue + frameSize,
-                  HPPL_OP_T,
-                  gateGrad,
-                  HPPL_OP_N,
-                  weightGrad,
-                  frameSize,
-                  valueSize,
-                  batchSize - 1,
-                  1.0,
-                  1.0);
-  }
-  CHECK_SYNC("hl_lstm_parallel_backward_weight");
-}
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
deleted file mode 100644
index 3e17c8090c5036037e936af1d6feaa2239251679..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ /dev/null
@@ -1,806 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_device_functions.cuh"
-#include "hl_gpu_matrix_kernel.cuh"
-#include "hl_matrix.h"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_ops.cuh"
-#include "hl_sequence.h"
-#include "hl_sparse.ph"
-#include "paddle/utils/Logging.h"
-
-DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
-void hl_matrix_add(real* A_d,
-                   real* B_d,
-                   real* C_d,
-                   int dimM,
-                   int dimN,
-                   real alpha,
-                   real beta) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
-      ternary::_add<real>(alpha, beta),
-      A_d,
-      B_d,
-      C_d,
-      dimM,
-      dimN,
-      dimN,
-      dimN,
-      dimN);
-  CHECK_SYNC("hl_matrix_add failed");
-}
-
-#ifdef PADDLE_TYPE_DOUBLE
-#define THRESHOLD 128
-#else
-#define THRESHOLD 64
-#endif
-__device__ __forceinline__ void findMax(real* I,
-                                        real* dfMax_s,
-                                        int blockSize,
-                                        int base,
-                                        int curIdx,
-                                        int nextIdx,
-                                        int dimN,
-                                        real* max) {
-  dfMax_s[base] = -1.0e20;
-  while (curIdx < dimN) {
-    if (dfMax_s[base] < I[nextIdx]) {
-      dfMax_s[base] = I[nextIdx];
-    }
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
-    __syncthreads();
-    if (base < stride) {
-      nextIdx = base + stride;
-      if (dfMax_s[base] < dfMax_s[nextIdx]) {
-        dfMax_s[base] = dfMax_s[nextIdx];
-      }
-    }
-  }
-
-  if (0 == base) {
-    max[0] = dfMax_s[0];
-  }
-  __syncthreads();
-}
-
-__device__ __forceinline__ void subMaxAndExp(real* I,
-                                             real* O,
-                                             int curIdx,
-                                             int nextIdx,
-                                             int blockSize,
-                                             int dimN,
-                                             real max) {
-  real val;
-  while (curIdx < dimN) {
-    val = I[nextIdx] - max;
-    if (val < -THRESHOLD) {
-      val = -THRESHOLD;
-    }
-    I[nextIdx] = val;
-#ifndef PADDLE_TYPE_DOUBLE
-    O[nextIdx] = __expf(val);
-#else
-    O[nextIdx] = exp(val);
-#endif
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-  __syncthreads();
-}
-
-__device__ __forceinline__ void valueSum(real* O,
-                                         real* dfMax_s,
-                                         int blockSize,
-                                         int base,
-                                         int curIdx,
-                                         int nextIdx,
-                                         int dimN) {
-  dfMax_s[base] = 0;
-  while (curIdx < dimN) {
-    dfMax_s[base] += O[nextIdx];
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-  __syncthreads();
-
-  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
-    __syncthreads();
-    if (base < stride) {
-      nextIdx = base + stride;
-      dfMax_s[base] += dfMax_s[nextIdx];
-    }
-  }
-  __syncthreads();
-}
-
-__device__ __forceinline__ void divSum(
-    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
-  while (curIdx < dimN) {
-    O[nextIdx] /= sum;
-    nextIdx += blockSize;
-    curIdx += blockSize;
-  }
-}
-
-__device__ __forceinline__ void softmax(real* I,
-                                        real* O,
-                                        real* dfMax_s,
-                                        int blockSize,
-                                        int base,
-                                        int curIdx,
-                                        int nextIdx,
-                                        int dimN) {
-  __shared__ real max;
-
-  // find the max number
-  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
-
-  // sub max Value and do Exp operation
-  subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
-
-  // add dimN values into blockDim.x buffer
-  // sum is in dfMax_s[0]
-  valueSum(O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
-
-  // divided by sum
-  divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
-}
-
-template <int blockSize>
-__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
-  int base = threadIdx.x;
-  __shared__ real dfMax_s[blockSize];
-  int nextIdx = blockIdx.x * dimN + base;
-  int curIdx = base;
-
-  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
-}
-
-void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  dim3 block(512, 1);
-  dim3 grid(dimM, 1);
-  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
-  CHECK_SYNC("hl_matrix_softmax failed");
-}
-
-template <int blockSize>
-__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
-  int base = threadIdx.x;
-  int bid = blockIdx.x;
-  __shared__ real dfMax_s[blockSize];
-
-  int start = index[bid];
-  int dimN = index[bid + 1] - start;
-
-  int nextIdx = start + base;
-  int curIdx = base;
-
-  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
-}
-
-void hl_sequence_softmax_forward(real* A_d,
-                                 real* C_d,
-                                 const int* index,
-                                 int numSequence) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  dim3 block(512, 1);
-  dim3 grid(numSequence, 1);
-  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
-  CHECK_SYNC("hl_sequence_softmax_forward failed");
-}
-
-__global__ void KeMatrixDerivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  int index;
-
-  if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx * dimN + colIdx;
-    grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
-  }
-}
-
-void hl_matrix_softmax_derivative(
-    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
-  CHECK_NOTNULL(grad_d);
-  CHECK_NOTNULL(output_d);
-  CHECK_NOTNULL(sftmaxSum_d);
-
-  int blocksX = (dimM + 0) / 1;
-  int blocksY = (dimN + 1024 - 1) / 1024;
-  dim3 threads(1, 1024);
-  dim3 grid(blocksX, blocksY);
-
-  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_d, output_d, sftmaxSum_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_softmax_derivative failed");
-}
-
-__global__ void KeMatrixMultiBinaryCrossEntropy(
-    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if (index < dimM) {
-    for (int i = 0; i < dimN; i++) {
-      entropy[index] -= log(1 - output[index * dimN + i]);
-    }
-    int* row_col = col + row[index];
-    int col_num = row[index + 1] - row[index];
-    for (int i = 0; i < col_num; i++) {
-      real o = output[index * dimN + row_col[i]];
-      entropy[index] -= log(o / (1 - o));
-    }
-  }
-}
-
-void hl_matrix_multi_binary_cross_entropy(real* output,
-                                          real* entropy,
-                                          hl_sparse_matrix_s csr_mat,
-                                          int dimM,
-                                          int dimN) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(entropy);
-  CHECK_NOTNULL(csr_mat);
-  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
-  int n_threads = 1024;
-  int blocks = (dimM + n_threads - 1) / n_threads;
-  dim3 threads(n_threads);
-  dim3 grid(blocks);
-  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
-  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
-}
-
-__global__ void KeMatrixMultiBinaryCrossEntropyBp(
-    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
-  int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (row_idx < dimM) {
-    for (int i = 0; i < dimN; i++) {
-      int index = row_idx * dimN + i;
-      grad[index] += 1.0 / (1 - output[index]);
-    }
-    int col_num = row[row_idx + 1] - row[row_idx];
-    int* row_col = col + row[row_idx];
-    for (int i = 0; i < col_num; i++) {
-      int index = row_idx * dimN + row_col[i];
-      grad[index] -= 1.0 / (output[index] * (1 - output[index]));
-    }
-  }
-}
-
-void hl_matrix_multi_binary_cross_entropy_bp(
-    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(grad);
-  CHECK_NOTNULL(csr_mat);
-  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
-  int n_threads = 1024;
-  int blocks = (dimM + n_threads - 1) / n_threads;
-  dim3 threads(n_threads);
-  dim3 grid(blocks);
-  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
-  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
-}
-
-__global__ void KeMatrixCrossEntropy(
-    real* O, real* E, int* label, int dimM, int dimN) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int newBase;
-  if (index < dimM) {
-    newBase = label[index];
-    newBase = newBase % dimN;
-    E[index] = -log(O[index * dimN + newBase]);
-  }
-}
-
-void hl_matrix_cross_entropy(
-    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-
-  int blocks = (dimM + 1024 - 1) / 1024;
-  dim3 threads(1024, 1);
-  dim3 grid(blocks, 1);
-  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      A_d, C_d, label_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_cross_entropy failed");
-}
-
-__global__ void KeMatrixCrossEntropyBp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
-  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  int index;
-  if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx * dimN + colIdx;
-    if (label_d[rowIdx] == colIdx) {
-      grad_d[index] -= 1.0f / output_d[index];
-    }
-  }
-}
-
-void hl_matrix_cross_entropy_bp(
-    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
-  CHECK_NOTNULL(grad_d);
-  CHECK_NOTNULL(output_d);
-  CHECK_NOTNULL(label_d);
-
-  int blocksX = (dimM + 0) / 1;
-  int blocksY = (dimN + 1024 - 1) / 1024;
-  dim3 threads(1, 1024);
-  dim3 grid(blocksX, blocksY);
-  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_d, output_d, label_d, dimM, dimN);
-  CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
-}
-
-void hl_matrix_zero_mem(real* data, int num) {
-  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
-}
-
-__global__ void KeParamReluForward(real* output,
-                                   real* input,
-                                   real* w,
-                                   int width,
-                                   int height,
-                                   int partial_sum) {
-  int tx = blockIdx.x * blockDim.x + threadIdx.x;
-  int ty = blockIdx.y * blockDim.y + threadIdx.y;
-  if (tx < width && ty < height) {
-    int index = ty * width + tx;
-    output[index] =
-        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
-  }
-}
-
-void hl_param_relu_forward(real* output,
-                           real* input,
-                           real* w,
-                           int width,
-                           int height,
-                           int partial_sum) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(w);
-  dim3 threads(16, 16);
-  int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 - 1) / 16;
-  dim3 grid(blockX, blockY);
-  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, input, w, width, height, partial_sum);
-  CHECK_SYNC("hl_param_relu_forward failed");
-}
-
-template <int blockSize>
-__global__ void KeParamReluBackWardW(real* grad_w,
-                                     real* grad_o,
-                                     real* input,
-                                     int width,
-                                     int height,
-                                     int partial_sum) {
-  const int tid = threadIdx.x;
-  __shared__ real temp[blockSize];
-  grad_o += partial_sum * blockIdx.x;
-  input += partial_sum * blockIdx.x;
-  real tmp = 0.0;
-  for (int index = tid; index < partial_sum * height; index += blockSize) {
-    int row = index / partial_sum;
-    int offset = row * width + (index - row * partial_sum);
-    if (input[offset] < 0) {
-      tmp += grad_o[offset] * input[offset];
-    }
-  }
-  temp[tid] = tmp;
-  __syncthreads();
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      temp[tid] += temp[tid + s];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    grad_w[blockIdx.x] += temp[0];
-  }
-}
-
-void hl_param_relu_backward_w(real* grad_w,
-                              real* grad_o,
-                              real* input,
-                              int width,
-                              int height,
-                              int partial_sum) {
-  CHECK_NOTNULL(grad_w);
-  CHECK_NOTNULL(grad_o);
-  CHECK_NOTNULL(input);
-  const int blockSize = 1024;
-  int grid_num = width / partial_sum;
-  dim3 threads(blockSize, 1);
-  dim3 grid(grid_num, 1);
-  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_w, grad_o, input, width, height, partial_sum);
-  CHECK_SYNC("hl_param_relu_backward_w failed");
-}
-
-__global__ void KeParamReluBackwardDiff(real* grad_o,
-                                        real* input,
-                                        real* w,
-                                        real* diff,
-                                        int width,
-                                        int height,
-                                        int partial_sum) {
-  int tx = blockIdx.x * blockDim.x + threadIdx.x;
-  int ty = blockIdx.y * blockDim.y + threadIdx.y;
-  if (tx < width && ty < height) {
-    int index = ty * width + tx;
-    diff[index] += grad_o[index] * (input[index] > 0 ? 1 : w[tx / partial_sum]);
-  }
-}
-
-void hl_param_relu_backward_diff(real* grad_o,
-                                 real* data,
-                                 real* w,
-                                 real* diff,
-                                 int width,
-                                 int height,
-                                 int partial_sum) {
-  CHECK_NOTNULL(grad_o);
-  CHECK_NOTNULL(data);
-  CHECK_NOTNULL(w);
-  CHECK_NOTNULL(diff);
-  dim3 threads(16, 16);
-  int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 - 1) / 16;
-  dim3 grid(blockX, blockY);
-  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      grad_o, data, w, diff, width, height, partial_sum);
-  CHECK_SYNC("hl_param_relu_backward_diff failed");
-}
-
-__global__ void KeMatrixAddSharedBias(
-    real* A, real* B, const int channel, const int M, const int N, real scale) {
-  int index = blockIdx.x * blockDim.x + threadIdx.x;
-  int dim = N / channel;
-  if (index < M * N) {
-    int i = index % N;
-    i = i / dim;
-    A[index] += scale * B[i];
-  }
-}
-
-void hl_matrix_add_shared_bias(real* A_d,
-                               real* B_d,
-                               const int channel,
-                               const int dimM,
-                               const int dimN,
-                               real scale) {
-  const int blocks = 512;
-  const int grids = DIVUP(dimM * dimN, blocks);
-  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
-      A_d, B_d, channel, dimM, dimN, scale);
-  CHECK_SYNC("hl_matrix_add_shared_bias failed");
-}
-
-template <int blockSize>
-__global__ void KeMatrixCollectSharedBias(real* B,
-                                          real* A,
-                                          const int channel,
-                                          const int M,
-                                          const int N,
-                                          const int dim,
-                                          const int limit,
-                                          real scale) {
-  if (dim < limit) {
-    int index = blockIdx.x * blockDim.x + threadIdx.x;
-    if (index < channel) {
-      real sum = 0.0;
-      for (int i = 0; i < M; ++i) {
-        for (int j = 0; j < dim; ++j) {
-          sum += A[i * N + index * dim + j];
-        }
-      }
-      B[index] += scale * sum;
-    }
-  } else {
-    const int tid = threadIdx.x;
-    const int bid = blockIdx.x;
-    __shared__ real smem[blockSize];
-    real sum = 0.0;
-    for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
-      int n = j * blockSize + tid;
-      int m = n / dim;
-      int w = n % dim;
-      smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
-      __syncthreads();
-      simpleReduce(smem, tid, blockSize);
-      sum += smem[0];
-    }
-    if (tid == 0) {
-      B[bid] += scale * sum;
-    }
-  }
-}
-
-void hl_matrix_collect_shared_bias(real* B_d,
-                                   real* A_d,
-                                   const int channel,
-                                   const int dimM,
-                                   const int dimN,
-                                   real scale) {
-  const int dim = dimN / channel;
-  const int blocks = 256;
-  const int limit = 64;
-  int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
-
-  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
-      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
-  CHECK_SYNC("hl_matrix_collect_shared_bias failed");
-}
-
-__global__ void keMatrixRotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
-  int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < dimM * dimN) {
-    int i = idx / dimN;
-    int j = idx % dimN;
-    if (clockWise) {
-      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
-    } else {
-      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
-    }
-  }
-}
-
-void hl_matrix_rotate(
-    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
-  CHECK_NOTNULL(mat);
-  CHECK_NOTNULL(matRot);
-  const int threads = 512;
-  const int blocks = DIVUP(dimM * dimN, threads);
-  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
-      mat, matRot, dimM, dimN, clockWise);
-  CHECK_SYNC("hl_matrix_rotate failed");
-}
-
-__global__ void keMatrixVol2Col(int num_kernels,
-                                const real* dataSrc,
-                                real* dataDst,
-                                int depth,
-                                int height,
-                                int width,
-                                int filterD,
-                                int filterH,
-                                int filterW,
-                                int strideD,
-                                int strideH,
-                                int strideW,
-                                int paddingD,
-                                int paddingH,
-                                int paddingW,
-                                int depth_col,
-                                int height_col,
-                                int width_col) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
-       index += blockDim.x * gridDim.x) {
-    int w_out = index % width_col;
-    int h_out = (index / width_col) % height_col;
-    int d_out = (index / width_col / height_col) % depth_col;
-    int channel_in = index / width_col / height_col / depth_col;
-    int channel_out = channel_in * filterD * filterH * filterW;
-    int w_in = w_out * strideW - paddingW;
-    int h_in = h_out * strideH - paddingH;
-    int d_in = d_out * strideD - paddingD;
-
-    dataDst +=
-        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
-        w_out;
-    dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
-    for (int k = 0; k < filterD; ++k) {
-      for (int i = 0; i < filterH; ++i) {
-        for (int j = 0; j < filterW; ++j) {
-          int d = d_in + k;
-          int h = h_in + i;
-          int w = w_in + j;
-          *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
-                      w < width)
-                         ? dataSrc[(k * height + i) * width + j]
-                         : 0;
-          dataDst += depth_col * height_col * width_col;
-        }
-      }
-    }
-  }
-}
-
-void hl_matrix_vol2Col(const real* dataSrc,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW,
-                       real* dataDst) {
-  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
-  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
-  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
-  int num_kernels = channels * depth_col * height_col * width_col;
-
-  const int threads = 512;
-  const int blocks = DIVUP(num_kernels, threads);
-
-  keMatrixVol2Col<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                          dataSrc,
-                                                          dataDst,
-                                                          depth,
-                                                          height,
-                                                          width,
-                                                          filterD,
-                                                          filterH,
-                                                          filterW,
-                                                          strideD,
-                                                          strideH,
-                                                          strideW,
-                                                          paddingD,
-                                                          paddingH,
-                                                          paddingW,
-                                                          depth_col,
-                                                          height_col,
-                                                          width_col);
-  CHECK_SYNC("hl_matrix_vol2Col failed");
-}
-
-__global__ void keMatrixCol2Vol(int num_kernels,
-                                real* dataDst,
-                                const real* dataSrc,
-                                int depth,
-                                int height,
-                                int width,
-                                int filterD,
-                                int filterH,
-                                int filterW,
-                                int strideD,
-                                int strideH,
-                                int strideW,
-                                int paddingD,
-                                int paddingH,
-                                int paddingW,
-                                int depth_col,
-                                int height_col,
-                                int width_col,
-                                real alpha,
-                                real beta) {
-  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
-       index += blockDim.x * gridDim.x) {
-    real srcVal = 0;
-    real dstVal = dataDst[index];
-    int w = index % width + paddingW;
-    int h = (index / width) % height + paddingH;
-    int d = (index / width / height) % depth + paddingD;
-    int c = index / width / height / depth;
-    // compute the start and end of the output
-    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
-    int w_col_end = min(w / strideW + 1, width_col);
-    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
-    int h_col_end = min(h / strideH + 1, height_col);
-    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
-    int d_col_end = min(d / strideD + 1, depth_col);
-
-    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
-                  h * filterW + w) *
-                 depth_col * height_col * width_col;
-
-    int coeff_d_col =
-        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
-    int coeff_h_col =
-        (1 - strideH * filterW * depth_col * height_col) * width_col;
-    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
-
-    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
-                            w_col * coeff_w_col];
-        }
-      }
-    }
-    dataDst[index] = alpha * srcVal + beta * dstVal;
-  }
-}
-
-void hl_matrix_col2Vol(real* dataDst,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW,
-                       const real* dataSrc,
-                       real alpha,
-                       real beta) {
-  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
-  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
-  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
-  int num_kernels = channels * depth * height * width;
-
-  const int threads = 512;
-  const int blocks = DIVUP(num_kernels, threads);
-
-  keMatrixCol2Vol<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
-                                                          dataDst,
-                                                          dataSrc,
-                                                          depth,
-                                                          height,
-                                                          width,
-                                                          filterD,
-                                                          filterH,
-                                                          filterW,
-                                                          strideD,
-                                                          strideH,
-                                                          strideW,
-                                                          paddingD,
-                                                          paddingH,
-                                                          paddingW,
-                                                          depth_col,
-                                                          height_col,
-                                                          width_col,
-                                                          alpha,
-                                                          beta);
-
-  CHECK_SYNC("hl_matrix_col2Vol failed");
-}
-
-__global__ void keVectorCast2Int(int* out, real* vec, int size) {
-  for (int i = threadIdx.x; i < (size); i += blockDim.x) {
-    out[i] = int(vec[i]);
-  }
-}
-
-void hl_vector_cast2int(int* out, real* vec, int size) {
-  keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size);
-  CHECK_SYNC("hl_vector_cast2int failed");
-}
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
deleted file mode 100644
index a3a5f038de7c0a68ee2e387d83b2272907164e90..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ /dev/null
@@ -1,408 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_device_functions.cuh"
-#include "paddle/utils/Logging.h"
-
-__global__ void KeMaxSequenceForward(real* input,
-                                     const int* sequence,
-                                     real* output,
-                                     int* index,
-                                     int numSequences,
-                                     int dim) {
-  int dimIdx = threadIdx.x;
-  int sequenceId = blockIdx.x;
-  if (sequenceId >= numSequences) return;
-  int start = sequence[sequenceId];
-  int end = sequence[sequenceId + 1];
-
-  for (int i = dimIdx; i < dim; i += blockDim.x) {
-    real tmp = -HL_FLOAT_MAX;
-    int tmpId = -1;
-    for (int insId = start; insId < end; insId++) {
-      if (tmp < input[insId * dim + i]) {
-        tmp = input[insId * dim + i];
-        tmpId = insId;
-      }
-    }
-    output[sequenceId * dim + i] = tmp;
-    index[sequenceId * dim + i] = tmpId;
-  }
-}
-
-void hl_max_sequence_forward(real* input,
-                             const int* sequence,
-                             real* output,
-                             int* index,
-                             int numSequences,
-                             int dim) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(index);
-
-  dim3 threads(256, 1);
-  dim3 grid(numSequences, 1);
-  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      input, sequence, output, index, numSequences, dim);
-  CHECK_SYNC("hl_max_sequence_forward failed");
-}
-
-__global__ void KeMaxSequenceBackward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  int colIdx = idx % dim;
-  if (idx < numSequences * dim) {
-    int insId = index[idx];
-    inputGrad[insId * dim + colIdx] += outputGrad[idx];
-  }
-}
-
-void hl_max_sequence_backward(
-    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(index);
-  CHECK_NOTNULL(inputGrad);
-
-  unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
-  dim3 threads(128, 1);
-  dim3 grid(blocks, 1);
-  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
-      outputGrad, index, inputGrad, numSequences, dim);
-  CHECK_SYNC("hl_max_sequence_backward failed");
-}
-
-template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output,
-                                real* table,
-                                int* ids,
-                                int numSamples,
-                                int tableSize,
-                                int dim) {
-  int idx = threadIdx.x;
-  int idy = threadIdx.y;
-  int sampleId = blockIdx.x + idy * gridDimX;
-
-  while (sampleId < numSamples) {
-    int tableId = ids[sampleId];
-    if ((0 <= tableId) && (tableId < tableSize)) {
-      real* outputData = output + sampleId * dim;
-      real* tableData = table + tableId * dim;
-      for (int i = idx; i < dim; i += blockDimX) {
-        if (AddRow == 0) {
-          outputData[i] += tableData[i];
-        } else {
-          paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
-        }
-      }
-    }
-    sampleId += blockDimY * gridDimX;
-  }
-}
-
-template <int blockDimX,
-          int blockDimY,
-          int gridDimX,
-          bool seq2batch,
-          bool isAdd>
-__global__ void KeSequence2Batch(real* batch,
-                                 real* sequence,
-                                 const int* batchIndex,
-                                 int seqWidth,
-                                 int batchCount) {
-  int idx = threadIdx.x;
-  int idy = threadIdx.y;
-  int id = blockIdx.x + idy * gridDimX;
-  while (id < batchCount) {
-    int seqId = batchIndex[id];
-    real* batchData = batch + id * seqWidth;
-    real* seqData = sequence + seqId * seqWidth;
-    for (int i = idx; i < seqWidth; i += blockDimX) {
-      if (seq2batch) {
-        if (isAdd) {
-          batchData[i] += seqData[i];
-        } else {
-          batchData[i] = seqData[i];
-        }
-      } else {
-        if (isAdd) {
-          seqData[i] += batchData[i];
-        } else {
-          seqData[i] = batchData[i];
-        }
-      }
-    }
-    id += blockDimY * gridDimX;
-  }
-}
-
-void hl_sequence2batch_copy(real* batch,
-                            real* sequence,
-                            const int* batchIndex,
-                            int seqWidth,
-                            int batchCount,
-                            bool seq2batch) {
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(batch);
-  CHECK_NOTNULL(batchIndex);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  } else {
-    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  }
-  CHECK_SYNC("hl_sequence2batch_copy failed");
-}
-
-void hl_sequence2batch_add(real* batch,
-                           real* sequence,
-                           int* batchIndex,
-                           int seqWidth,
-                           int batchCount,
-                           bool seq2batch) {
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(batch);
-  CHECK_NOTNULL(batchIndex);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  } else {
-    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        batch, sequence, batchIndex, seqWidth, batchCount);
-  }
-  CHECK_SYNC("hl_sequence2batch_add failed");
-}
-
-template <bool normByTimes, bool seq2batch>
-__global__ void KeSequence2BatchPadding(real* batch,
-                                        real* sequence,
-                                        const int* sequenceStartPositions,
-                                        const size_t sequenceWidth,
-                                        const size_t maxSequenceLength,
-                                        const size_t numSequences) {
-  int batchIdx = blockIdx.y;
-  int sequenceStart = sequenceStartPositions[batchIdx];
-  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
-
-  int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y;
-  int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth;
-  int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth;
-
-  real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
-
-  if (sequenceIdx < sequenceLength) {
-    if (seq2batch) {
-      /* sequence -> batch */
-      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
-        batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i];
-      }
-    } else {
-      /* batch -> sequence */
-      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
-        sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i];
-      }
-    }
-  } else if (sequenceIdx < maxSequenceLength) {
-    if (seq2batch) {
-      /* sequence -> batch */
-      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
-        batch[batchBaseIdx + i] = 0;
-      }
-    }
-  }
-}
-
-void hl_sequence2batch_copy_padding(real* batch,
-                                    real* sequence,
-                                    const int* sequenceStartPositions,
-                                    const size_t sequenceWidth,
-                                    const size_t maxSequenceLength,
-                                    const size_t numSequences,
-                                    bool normByTimes,
-                                    bool seq2batch) {
-  CHECK_NOTNULL(batch);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(sequenceStartPositions);
-
-  if (!normByTimes && numSequences == 1) {
-    size_t elementCount = maxSequenceLength * sequenceWidth;
-    if (seq2batch) {
-      /* sequence -> batch */
-      hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount);
-    } else {
-      /* batch -> sequence */
-      hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount);
-    }
-    return;
-  }
-
-  const int CUDA_BLOCK_SIZE = 512;
-
-  /* At least use 32 threads to copy sequenceWidth elements,
-     and at least 8 elements for each thread. */
-  int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5;
-  blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE;
-
-  int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
-  dim3 threads(blockDimX, blockDimY);
-
-  int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY;
-  int gridDimY = numSequences;
-  dim3 grid(gridDimX, gridDimY);
-
-  if (seq2batch) {
-    /* sequence -> batch */
-    if (normByTimes) {
-      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    } else {
-      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    }
-  } else {
-    /* batch -> sequence */
-    if (normByTimes) {
-      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    } else {
-      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          batch,
-          sequence,
-          sequenceStartPositions,
-          sequenceWidth,
-          maxSequenceLength,
-          numSequences);
-    }
-  }
-
-  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
-}
-
-__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
-
-__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
-
-__global__ void KeSequenceAvgForward(real* dst,
-                                     real* src,
-                                     const int* starts,
-                                     int height,
-                                     int width,
-                                     const int mode) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int row = gid / width;
-  int col = gid % width;
-
-  if (gid < height * width) {
-    int start = starts[row];
-    int end = starts[row + 1];
-    int seqLength = end - start;
-    if (seqLength == 0) return;
-    real sum = 0.0;
-    for (int i = start; i < end; i++) {
-      sum += src[i * width + col];
-    }
-    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
-                                       : sum * my_rsqrt((real)seqLength));
-    dst[gid] += sum;
-  }
-}
-
-void hl_sequence_avg_forward(real* dst,
-                             real* src,
-                             const int* starts,
-                             int height,
-                             int width,
-                             const int mode) {
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_NOTNULL(starts);
-
-  int block = 512;
-  int grid = DIVUP(width * height, 512);
-
-  CHECK(mode == 0 || mode == 1 || mode == 2)
-      << "mode error in hl_sequence_avg_forward!";
-
-  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
-      dst, src, starts, height, width, mode);
-  CHECK_SYNC("hl_sequence_avg_forward failed");
-}
-
-__global__ void KeSequenceAvgBackward(real* dst,
-                                      real* src,
-                                      const int* starts,
-                                      int height,
-                                      int width,
-                                      const int mode) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  int row = gid / width;
-  int col = gid % width;
-
-  if (gid < height * width) {
-    int start = starts[row];
-    int end = starts[row + 1];
-    int seqLength = end - start;
-    if (seqLength == 0) return;
-    real grad = src[gid];
-    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
-                                         : grad * my_rsqrt((real)seqLength));
-    for (int i = start; i < end; i++) {
-      dst[i * width + col] += grad;
-    }
-  }
-}
-
-void hl_sequence_avg_backward(real* dst,
-                              real* src,
-                              const int* starts,
-                              int height,
-                              int width,
-                              const int mode) {
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_NOTNULL(starts);
-
-  int block = 512;
-  int grid = DIVUP(width * height, 512);
-
-  CHECK(mode == 0 || mode == 1 || mode == 2)
-      << "mode error in hl_sequence_avg_backward!";
-
-  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
-      dst, src, starts, height, width, mode);
-  CHECK_SYNC("hl_sequence_avg_backward failed");
-}
diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/cuda/src/hl_cuda_sparse.cu
deleted file mode 100644
index 432041fed5ab1ffc02dabcd4644fa70a6473fba1..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cuda_sparse.cu
+++ /dev/null
@@ -1,1262 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_cuda.h"
-#include "hl_cuda_sparse.cuh"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_ops.cuh"
-#include "hl_sparse.h"
-#include "hl_sparse.ph"
-#include "paddle/utils/Logging.h"
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p);
-DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-
-void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN);
-  CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!";
-
-  if (A_d->nnz == 0) {
-    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row &&
-        A_d2->csr_col)
-      << "parameter transa error!";
-
-  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X);
-  dim3 grid(blocksX, blocksY);
-
-  if (A_d->type == HL_NO_VALUE) {
-    KeSMatrixCsr2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
-  } else if (A_d->type == HL_FLOAT_VALUE) {
-    KeSMatrixCsr2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
-  } else {
-  }
-  CHECK_SYNC("hl_matrix_csr2dense failed");
-}
-
-void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN);
-  CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!";
-
-  if (A_d->nnz == 0) {
-    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
-  CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row &&
-        A_d2->csc_col)
-      << "parameter transa error!";
-
-  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
-  dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X);
-  dim3 grid(blocksX, blocksY);
-
-  if (A_d->type == HL_NO_VALUE) {
-    KeSMatrixCsc2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
-  } else if (A_d->type == HL_FLOAT_VALUE) {
-    KeSMatrixCsc2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
-  } else {
-  }
-  CHECK_SYNC("hl_matrix_csc2dense failed");
-}
-
-void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
-                             hl_matrix_format_t format,
-                             hl_matrix_value_t value_type,
-                             int dimM,
-                             int dimN,
-                             int nnz) {
-  CHECK_NOTNULL(A_d);
-  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-  CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE)
-      << "sparse matrix value type error!";
-  /* avoid malloc 0 bytes */
-  int nnz_s = (nnz == 0 ? 1 : nnz);
-
-  if (format == HL_SPARSE_CSR) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csr->sparsity = -1.0;
-
-    if (value_type == HL_NO_VALUE) {
-      csr->csr_val = NULL;
-      csr->nnz_s = nnz_s;
-      csr->row_s = dimM + 1;
-      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
-      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csr;
-    } else if (value_type == HL_FLOAT_VALUE) {
-      csr->nnz_s = nnz_s;
-      csr->row_s = dimM + 1;
-      csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
-      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
-      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csr;
-    }
-  } else if (format == HL_SPARSE_CSC) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csc->sparsity = -1.0f;
-
-    if (value_type == HL_NO_VALUE) {
-      csc->csc_val = NULL;
-      csc->nnz_s = nnz_s;
-      csc->col_s = dimN + 1;
-      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csc;
-    } else if (value_type == HL_FLOAT_VALUE) {
-      csc->nnz_s = nnz_s;
-      csc->col_s = dimN + 1;
-      csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
-      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
-      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
-
-      *A_d = (hl_sparse_matrix_s)tmp;
-      (*A_d)->matrix = (hl_matrix_s)csc;
-    }
-  }
-
-  (*A_d)->format = format;
-  (*A_d)->type = value_type;
-  (*A_d)->rows = dimM;
-  (*A_d)->cols = dimN;
-  (*A_d)->nnz = nnz;
-}
-
-void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
-  CHECK_NOTNULL(A_d);
-  CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-
-  if (A_d->matrix == NULL) {
-    free(A_d);
-    return;
-  }
-
-  if (A_d->format == HL_SPARSE_CSR) {
-    hl_csr_matrix csr = (hl_csr_matrix)A_d->matrix;
-    if (csr->csr_val != NULL) {
-      hl_free_mem_device(csr->csr_val);
-      csr->csr_val = NULL;
-    }
-
-    if (csr->csr_row != NULL) {
-      hl_free_mem_device(csr->csr_row);
-      csr->csr_row = NULL;
-    }
-
-    if (csr->csr_col != NULL) {
-      hl_free_mem_device(csr->csr_col);
-      csr->csr_col = NULL;
-    }
-
-    A_d->matrix = NULL;
-    free(A_d);
-  } else if (A_d->format == HL_SPARSE_CSC) {
-    hl_csc_matrix csc = (hl_csc_matrix)A_d->matrix;
-    if (csc->csc_val != NULL) {
-      hl_free_mem_device(csc->csc_val);
-      csc->csc_val = NULL;
-    }
-
-    if (csc->csc_row != NULL) {
-      hl_free_mem_device(csc->csc_row);
-      csc->csc_row = NULL;
-    }
-
-    if (csc->csc_col != NULL) {
-      hl_free_mem_device(csc->csc_col);
-      csc->csc_col = NULL;
-    }
-
-    A_d->matrix = NULL;
-    free(A_d);
-  }
-}
-
-void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                void *dest_d,
-                                size_t size,
-                                hl_matrix_format_t format,
-                                hl_matrix_value_t value_type,
-                                int dimM,
-                                int dimN,
-                                int nnz) {
-  CHECK_NOTNULL(A_d);
-  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-
-  if (format == HL_SPARSE_CSR) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int);
-    if (value_type != HL_NO_VALUE) {
-      size_ += nnz * sizeof(real);
-    }
-    CHECK_LE(size_, size) << "dest_d size(" << size
-                          << ") too small, should bigger than(" << size_
-                          << ")!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-
-    if (value_type == HL_NO_VALUE) {
-      csr->csr_val = NULL;
-      csr->csr_row = (int *)dest_d;
-      csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int));
-    } else {
-      csr->csr_val = (real *)dest_d;
-      csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real));
-      csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) +
-                             (dimM + 1) * sizeof(int));
-    }
-    csr->nnz_s = nnz;
-    csr->row_s = dimM + 1;
-    csr->sparsity = -1.0;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csr;
-  } else if (format == HL_SPARSE_CSC) {
-    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-    size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int);
-    if (value_type != HL_NO_VALUE) {
-      size_ += nnz * sizeof(real);
-    }
-    CHECK_LE(size_, size) << "dest_d size(" << size
-                          << ") too small, should bigger than(" << size_
-                          << ")!";
-
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    if (value_type == HL_NO_VALUE) {
-      csc->csc_val = NULL;
-      csc->csc_col = (int *)dest_d;
-      csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int));
-    } else {
-      csc->csc_val = (real *)dest_d;
-      csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real));
-      csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) +
-                             (dimN + 1) * sizeof(int));
-    }
-    csc->nnz_s = nnz;
-    csc->col_s = dimN + 1;
-    csc->sparsity = -1.0f;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csc;
-  }
-
-  (*A_d)->format = format;
-  (*A_d)->type = value_type;
-  (*A_d)->rows = dimM;
-  (*A_d)->cols = dimN;
-  (*A_d)->nnz = nnz;
-}
-
-void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                real *value_d,
-                                int *rows_d,
-                                int *cols_d,
-                                hl_matrix_format_t format,
-                                hl_matrix_value_t value_type,
-                                int dimM,
-                                int dimN,
-                                int nnz) {
-  CHECK_NOTNULL(A_d);
-  CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
-
-  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-      << "sparse matrix format error!";
-
-  if (format == HL_SPARSE_CSR) {
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csr->csr_row = rows_d;
-    csr->csr_col = cols_d;
-    csr->csr_val = value_d;
-    csr->nnz_s = nnz;
-    csr->row_s = dimM + 1;
-    csr->sparsity = -1.0;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csr;
-  } else if (format == HL_SPARSE_CSC) {
-    char *tmp =
-        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
-    CHECK_NOTNULL(tmp);
-
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
-    csc->csc_row = rows_d;
-    csc->csc_col = cols_d;
-    csc->csc_val = value_d;
-    csc->nnz_s = nnz;
-    csc->col_s = dimN + 1;
-    csc->sparsity = -1.0f;
-    *A_d = (hl_sparse_matrix_s)tmp;
-    (*A_d)->matrix = (hl_matrix_s)csc;
-  }
-
-  (*A_d)->format = format;
-  (*A_d)->type = value_type;
-  (*A_d)->rows = dimM;
-  (*A_d)->cols = dimN;
-  (*A_d)->nnz = nnz;
-}
-
-void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d) {
-  CHECK_NOTNULL(A_d);
-  free(A_d);
-}
-
-void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
-                          real *csr_val,
-                          int *csr_row,
-                          int *csr_col,
-                          hl_stream_t stream) {
-  CHECK_NOTNULL(csr_matrix);
-  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
-      << "csr_matrix is not csr format!";
-  CHECK_NOTNULL(csr_matrix->matrix);
-
-  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
-  CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz
-                                        << " is big than alloc size "
-                                        << csr->nnz_s;
-
-  CHECK_LE((csr_matrix->rows + 1), csr->row_s)
-      << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size "
-      << csr->row_s;
-
-  CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE)
-      << "sparse matrix value type error!";
-
-  if (csr_matrix->type == HL_NO_VALUE) {
-    if (csr_row == NULL && csr_col == NULL) {
-      return;
-    } else if (csr_row != NULL && csr_col != NULL) {
-      hl_memcpy_async(
-          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
-
-      hl_memcpy_async(
-          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
-    }
-  } else if (csr_matrix->type == HL_FLOAT_VALUE) {
-    if (csr_val == NULL && csr_row == NULL && csr_col == NULL) {
-      return;
-    } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) {
-      hl_memcpy_async(
-          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
-    } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) {
-      hl_memcpy_async(
-          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
-      hl_memcpy_async(
-          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
-      hl_memcpy_async(
-          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
-    }
-  }
-
-  csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) /
-                  ((float)csr_matrix->cols);
-}
-
-void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
-                          real *csc_val,
-                          int *csc_row,
-                          int *csc_col,
-                          hl_stream_t stream) {
-  CHECK_NOTNULL(csc_matrix);
-  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
-      << "csc_matrix is not csc format error!";
-
-  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
-  CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz
-                                        << " is big than alloc size "
-                                        << csc->nnz_s;
-
-  CHECK_LE((csc_matrix->cols + 1), csc->col_s)
-      << "copy size " << (csc_matrix->cols + 1) << " is big than alloc size "
-      << csc->col_s;
-
-  CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE)
-      << "sparse matrix value type error!";
-
-  if (csc_matrix->type == HL_NO_VALUE) {
-    if (csc_row == NULL && csc_col == NULL) {
-      return;
-    } else if (csc_row != NULL && csc_col != NULL) {
-      hl_memcpy_async(
-          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
-      hl_memcpy_async(
-          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
-    }
-  } else if (csc_matrix->type == HL_FLOAT_VALUE) {
-    if (csc_val == NULL && csc_row == NULL && csc_col == NULL) {
-      return;
-    } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) {
-      hl_memcpy_async(
-          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
-    } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) {
-      hl_memcpy_async(
-          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
-      hl_memcpy_async(
-          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
-      hl_memcpy_async(
-          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
-    } else {
-      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
-    }
-  }
-
-  csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) /
-                  ((float)csc_matrix->cols);
-}
-
-void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
-                             hl_sparse_matrix_s src,
-                             hl_stream_t stream) {
-  CHECK(dst && src && dst->matrix && src->matrix)
-      << "parameter dst or src is null pointer!";
-  CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!";
-  CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE)
-      << "src sparse matrix is no value, dst sparse matrix has value!";
-
-  if (dst->format == HL_SPARSE_CSR) {
-    dst->rows = src->rows;
-    dst->cols = src->cols;
-    dst->nnz = src->nnz;
-    hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
-    hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream);
-  } else if (dst->format == HL_SPARSE_CSC) {
-    dst->rows = src->rows;
-    dst->cols = src->cols;
-    dst->nnz = src->nnz;
-    hl_csc_matrix csc = (hl_csc_matrix)src->matrix;
-    hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream);
-  } else {
-    LOG(FATAL) << "sparse matrix format error!";
-  }
-}
-
-/**
- * Calculate beta * C, if beta is zero, C does not have to be a valid input.
- */
-static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
-  if (beta == 0.0) {
-    hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
-  } else {
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
-    }
-  }
-
-  return;
-}
-
-void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
-                             hl_trans_op_t transa,
-                             real *B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transb, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && dimK > 0);
-  CHECK_EQ(A_d->format, HL_SPARSE_CSR) << "matrix format error!";
-
-  if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
-      (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (A_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (HPPL_OP_N == transa) {
-    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
-    int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y;
-    dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-
-    /* sparsity pattern */
-    // A_d->sparsity;
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (HPPL_OP_T == transa) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-
-    int blocksX =
-        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
-    int blocksY =
-        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
-    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csr_val,
-          A_d2->csr_col,
-          A_d2->csr_row,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  CHECK_SYNC("hl_matrix_csr_mul_dense failed");
-}
-
-void hl_matrix_dense_mul_csc(real *A_d,
-                             hl_trans_op_t transa,
-                             hl_sparse_matrix_s B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transa, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
-      ((transb == HPPL_OP_N) && (B_d->rows != dimK || B_d->cols != dimN)) ||
-      ((transb == HPPL_OP_T) && (B_d->rows != dimN || B_d->cols != dimK))) {
-    LOG(FATAL) << "parameter dims error!";
-  }
-
-  CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!";
-
-  if (B_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix);
-  if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) ||
-      B_d2->csc_row == NULL || B_d2->csc_col == NULL) {
-    LOG(FATAL) << "parameter B is null!";
-  }
-
-  if (transb == HPPL_OP_N) {
-    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
-    int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST;
-    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
-    dim3 grid(blocksX, blocksY);
-
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_row,
-          B_d2->csc_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_row,
-          B_d2->csc_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (transb == HPPL_OP_T) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
-    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
-    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_col,
-          B_d2->csc_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csc_val,
-          B_d2->csc_col,
-          B_d2->csc_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transb error!";
-  }
-
-  CHECK_SYNC("hl_matrix_dense_mul_csc failed");
-}
-
-void hl_matrix_dense_mul_csr(real *A_d,
-                             hl_trans_op_t transa,
-                             hl_sparse_matrix_s B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transa, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
-      (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) ||
-      (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
-    LOG(FATAL) << "parameter dims error!";
-  }
-
-  CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!";
-
-  if (B_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
-  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
-      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  if (transb == HPPL_OP_N) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
-    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
-    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_row,
-          B_d2->csr_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_row,
-          B_d2->csr_col,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (transb == HPPL_OP_T) {
-    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
-    int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST;
-    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
-    dim3 grid(blocksX, blocksY);
-    if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_col,
-          B_d2->csr_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d,
-          B_d2->csr_val,
-          B_d2->csr_col,
-          B_d2->csr_row,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transb error!";
-  }
-
-  CHECK_SYNC("hl_matrix_dense_mul_csr failed");
-}
-
-void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
-                             hl_trans_op_t transa,
-                             real *B_d,
-                             hl_trans_op_t transb,
-                             real *C_d,
-                             int dimM,
-                             int dimN,
-                             int dimK,
-                             real alpha,
-                             real beta) {
-  CHECK_EQ(transb, HPPL_OP_N);
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!";
-  CHECK_EQ(A_d->format, HL_SPARSE_CSC) << "matrix format error!";
-
-  if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
-      (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (A_d->nnz == 0) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-    return;
-  }
-
-  /* nnz != 0 */
-  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
-  if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csc_row == NULL || A_d2->csc_col == NULL) {
-    LOG(FATAL) << "parameter error!";
-  }
-
-  if (HPPL_OP_N == transa) {
-    _beta_mul_c(C_d, dimM, dimN, beta);
-
-    int blocksX =
-        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
-    int blocksY =
-        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
-    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else if (HPPL_OP_T == transa) {
-    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
-    int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y;
-    dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y);
-    dim3 grid(blocksX, blocksY);
-
-    /* sparsity pattern */
-    // A_d->sparsity;
-    if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    } else {
-      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d,
-          A_d2->csc_val,
-          A_d2->csc_row,
-          A_d2->csc_col,
-          B_d,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-    }
-  } else {
-    LOG(FATAL) << "parameter transa error!";
-  }
-
-  CHECK_SYNC("hl_matrix_csc_mul_dense failed");
-}
-
-void hl_sparse_matrix_mul(real *A_d,
-                          hl_trans_op_t transa,
-                          real *B_d,
-                          hl_trans_op_t transb,
-                          hl_sparse_matrix_s C_d,
-                          int dimM,
-                          int dimN,
-                          int dimK,
-                          real alpha,
-                          real beta) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-  CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!";
-  CHECK_NE(C_d->type, HL_NO_VALUE) << "C value type error!";
-
-  if (C_d->nnz == 0) return;
-
-  if (C_d->format == HL_SPARSE_CSC) {
-    hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix);
-    if (C_d2->csc_val == NULL || C_d2->csc_row == NULL ||
-        C_d2->csc_col == NULL) {
-      LOG(FATAL) << "parameter error!";
-    }
-
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz);
-    }
-
-    int blocksX = dimN;
-    int blocksY = 1;
-    dim3 threads(CU_CSCMM_DMD2CSC_THREAD_X, 1);
-    dim3 grid(blocksX, blocksY);
-    bool transA = transa == HPPL_OP_T ? 1 : 0;
-    bool transB = transb == HPPL_OP_T ? 1 : 0;
-    KeSMatrixDenseMulDense2CSC<<<grid, threads, 0, STREAM_DEFAULT>>>(
-        C_d2->csc_val,
-        C_d2->csc_row,
-        C_d2->csc_col,
-        A_d,
-        B_d,
-        transA,
-        transB,
-        dimM,
-        dimN,
-        dimK,
-        alpha,
-        beta);
-    CHECK_SYNC("hl_sparse_matrix_mul failed");
-  } else {
-    hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix);
-    if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) ||
-        C_d2->csr_row == NULL || C_d2->csr_col == NULL) {
-      LOG(FATAL) << "parameter error!";
-    }
-
-    if (beta != 1.0) {
-      hl_gpu_apply_unary_op(
-          unary::mul_scalar<real>(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz);
-    }
-
-    bool transA = transa == HPPL_OP_T ? 1 : 0;
-    bool transB = transb == HPPL_OP_T ? 1 : 0;
-    if (!transB) {
-      int blocksX = dimM;
-      int blocksY = 1;
-      dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1);
-      dim3 grid(blocksX, blocksY);
-
-      KeSMatrixDenseMulDense2CSR<<<grid, threads, 0, STREAM_DEFAULT>>>(
-          C_d2->csr_val,
-          C_d2->csr_row,
-          C_d2->csr_col,
-          A_d,
-          B_d,
-          transA,
-          transB,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-      CHECK_SYNC("hl_sparse_matrix_mul failed");
-    } else {
-      CHECK(!transA) << "Not supported A is trans and B is not trans!";
-
-      dim3 block(CU_BLOCK_SIZE, 1);
-      int avgNnzPerRow = C_d->nnz / dimM;
-      avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
-      int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
-      dim3 grid(gridx, dimM);
-      KeSMatrixDenseMulDenseTrans2CSR<<<grid, block, 0, STREAM_DEFAULT>>>(
-          C_d2->csr_val,
-          C_d2->csr_row,
-          C_d2->csr_col,
-          A_d,
-          B_d,
-          transA,
-          transB,
-          dimM,
-          dimN,
-          dimK,
-          alpha,
-          beta);
-      CHECK_SYNC("hl_sparse_matrix_mul failed");
-    }
-  }
-}
-
-void hl_memcpy_from_csc_matrix(real *csc_val,
-                               size_t val_size,
-                               int *csc_row,
-                               size_t row_size,
-                               int *csc_col,
-                               size_t col_size,
-                               hl_sparse_matrix_s csc_matrix,
-                               hl_stream_t stream) {
-  CHECK_NOTNULL(csc_matrix);
-  CHECK_NOTNULL(csc_row);
-  CHECK_NOTNULL(csc_col);
-
-  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
-      << "csc_matrix is not csc format error!";
-
-  if (csc_matrix->nnz > row_size ||
-      csc_matrix->cols + 1 > static_cast<int>(col_size)) {
-    LOG(FATAL) << "size not match!";
-  }
-
-  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
-  hl_memcpy_async((void *)csc_row,
-                  (void *)csc->csc_row,
-                  (csc_matrix->nnz) * sizeof(int),
-                  stream);
-  hl_memcpy_async((void *)csc_col,
-                  (void *)csc->csc_col,
-                  (csc_matrix->cols + 1) * sizeof(int),
-                  stream);
-  if (csc_matrix->type == HL_FLOAT_VALUE) {
-    if (csc_val != NULL) {
-      CHECK_LE(csc_matrix->nnz, val_size) << "size not match!";
-      hl_memcpy_async((void *)csc_val,
-                      (void *)csc->csc_val,
-                      (csc_matrix->nnz) * sizeof(real),
-                      stream);
-    } else {
-      LOG(FATAL) << "parameter csr_val is null pointer!";
-    }
-  }
-}
-
-void hl_memcpy_from_csr_matrix(real *csr_val,
-                               size_t val_size,
-                               int *csr_row,
-                               size_t row_size,
-                               int *csr_col,
-                               size_t col_size,
-                               hl_sparse_matrix_s csr_matrix,
-                               hl_stream_t stream) {
-  CHECK_NOTNULL(csr_matrix);
-  CHECK_NOTNULL(csr_row);
-  CHECK_NOTNULL(csr_col);
-  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
-      << "csr_matrix is not csr format error!";
-
-  if (csr_matrix->nnz > col_size ||
-      csr_matrix->rows + 1 > static_cast<int>(row_size)) {
-    LOG(FATAL) << "size not match!";
-  }
-
-  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
-  hl_memcpy_async((void *)csr_row,
-                  (void *)csr->csr_row,
-                  (csr_matrix->rows + 1) * sizeof(int),
-                  stream);
-  hl_memcpy_async((void *)csr_col,
-                  (void *)csr->csr_col,
-                  (csr_matrix->nnz) * sizeof(int),
-                  stream);
-  if (csr_matrix->type == HL_FLOAT_VALUE) {
-    if (csr_val != NULL) {
-      CHECK_LE(csr_matrix->nnz, val_size) << "size not match!";
-      hl_memcpy_async((void *)csr_val,
-                      (void *)csr->csr_val,
-                      (csr_matrix->nnz) * sizeof(real),
-                      stream);
-    } else {
-      LOG(FATAL) << "parameter csr_val is null pointer!";
-    }
-  }
-}
-
-void hl_sparse_matrix_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
-  if (B_d->format == HL_SPARSE_CSR) {
-    hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale);
-  } else {
-    LOG(FATAL) << "Not support CSC format error!";
-  }
-}
-
-void hl_matrix_csr_column_sum(
-    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-
-  if (dimM <= 0 || dimN <= 0 || (B_d->rows != dimM || B_d->cols != dimN)) {
-    LOG(FATAL) << "parameter dims error!";
-  }
-
-  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
-  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
-      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter B is null!";
-  }
-
-  if (B_d->nnz == 0) return;
-
-  int nnz = B_d->nnz;
-  int block = 512;
-  int grid = DIVUP(nnz, 512);
-  KeSMatrixCsrColumnSum<<<grid, block, 0, STREAM_DEFAULT>>>(
-      A_d, B_d2->csr_val, B_d2->csr_col, nnz);
-
-  CHECK_SYNC("hl_matrix_csr_column_sum failed");
-}
-
-void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
-  if (A_d->format == HL_SPARSE_CSR) {
-    hl_matrix_csr_add_bias(A_d, B_d, scale);
-  } else {
-    LOG(FATAL) << "Not support CSC format error!";
-  }
-}
-
-void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter A_d is null!";
-  }
-
-  if (A_d->nnz == 0) return;
-
-  int nnz = A_d->nnz;
-  int block = 512;
-  int grid = DIVUP(nnz, 512);
-  KeSMatrixCsrAddBias<<<grid, block, 0, STREAM_DEFAULT>>>(
-      A_d2->csr_val, A_d2->csr_col, B_d, scale, nnz);
-
-  CHECK_SYNC("hl_sparse_matrix_add_bias failed");
-}
-
-void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
-                                real *B_d,
-                                int dimM,
-                                int dimN,
-                                real alpha,
-                                real beta) {
-  if (A_d->format == HL_SPARSE_CSR) {
-    hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta);
-  } else {
-    LOG(FATAL) << "Not support CSC format error!";
-  }
-}
-
-void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
-                             real *B_d,
-                             int dimM,
-                             int dimN,
-                             real alpha,
-                             real beta) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-
-  if (dimM <= 0 || dimN <= 0 || A_d->rows != dimM || A_d->cols != dimN) {
-    LOG(FATAL) << "parameter dim error!";
-  }
-
-  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
-    LOG(FATAL) << "parameter A_d is null!";
-  }
-
-  if (A_d->nnz == 0) return;
-
-  int gridX = DIVUP((A_d->nnz / dimM), 512);
-  gridX = gridX > 0 ? gridX : 1;
-  dim3 block(512, 1);
-  dim3 grid(gridX, dimM);
-  KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>(A_d2->csr_val,
-                                                           A_d2->csr_row,
-                                                           A_d2->csr_col,
-                                                           B_d,
-                                                           alpha,
-                                                           beta,
-                                                           dimM,
-                                                           dimN);
-
-  CHECK_SYNC("hl_sparse_matrix_add_dense failed");
-}
-
-int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
-  __sparse_get_return__(sMat, row);
-}
-
-int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
-  __sparse_get_return__(sMat, col);
-}
-
-real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
-  __sparse_get_return__(sMat, val);
-}
diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu
deleted file mode 100644
index efa4bef02ba5f5fe9ae449b44bbdc844e5745307..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_table_apply.cu
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_cuda.h"
-#include "hl_device_functions.cuh"
-#include "paddle/utils/Logging.h"
-
-template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output,
-                                int ldo,
-                                real* table,
-                                int ldt,
-                                int* ids,
-                                int numSamples,
-                                int tableSize,
-                                int dim) {
-  int idx = threadIdx.x;
-  int idy = blockIdx.x + threadIdx.y * gridDimX;
-
-  while (idy < numSamples) {
-    int tableId = ids[idy];
-    if ((0 <= tableId) && (tableId < tableSize)) {
-      real* out = output + idy * ldo;
-      real* tab = table + tableId * ldt;
-      for (int i = idx; i < dim; i += blockDimX) {
-        if (AddRow) {
-          paddle::paddleAtomicAdd(&tab[i], out[i]);
-        } else {
-          out[i] += tab[i];
-        }
-      }
-    }
-    idy += blockDimY * gridDimX;
-  }
-}
-
-void hl_matrix_select_rows(real* output,
-                           int ldo,
-                           real* table,
-                           int ldt,
-                           int* ids,
-                           int numSamples,
-                           int tableSize,
-                           int dim) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(table);
-  CHECK_NOTNULL(ids);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      output, ldo, table, ldt, ids, numSamples, tableSize, dim);
-
-  CHECK_SYNC("hl_matrix_select_rows failed");
-}
-
-void hl_matrix_add_to_rows(real* table,
-                           int ldt,
-                           real* input,
-                           int ldi,
-                           int* ids,
-                           int numSamples,
-                           int tableSize,
-                           int dim) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(table);
-  CHECK_NOTNULL(ids);
-
-  dim3 threads(128, 8);
-  dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      input, ldi, table, ldt, ids, numSamples, tableSize, dim);
-
-  CHECK_SYNC("hl_matrix_add_to_rows failed");
-}
-
-template <class T, int blockDimX, int gridDimX>
-__global__ void KeVectorSelect(
-    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
-  int idx = threadIdx.x + blockDimX * blockIdx.x;
-  while (idx < sizei) {
-    int index = ids[idx];
-    // check(index < sizes);
-    dst[idx] = src[index];
-    idx += blockDimX * gridDimX;
-  }
-}
-
-template <class T>
-void hl_vector_select_from(
-    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
-  CHECK_NOTNULL(dst);
-  CHECK_NOTNULL(src);
-  CHECK_NOTNULL(ids);
-  CHECK_EQ(sized, sizei);
-
-  dim3 threads(512, 1);
-  dim3 grid(8, 1);
-  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      dst, sized, src, sizes, ids, sizei);
-
-  CHECK_SYNC("hl_vector_select_from failed");
-}
-
-template void hl_vector_select_from(real* dst,
-                                    int sized,
-                                    const real* src,
-                                    int sizes,
-                                    const int* ids,
-                                    int sizei);
-template void hl_vector_select_from(
-    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu
deleted file mode 100644
index fea8712a773b1524022f4bba626cf5044edebef6..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_top_k.cu
+++ /dev/null
@@ -1,477 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_base.h"
-#include "hl_sparse.ph"
-#include "hl_top_k.h"
-#include "paddle/utils/Logging.h"
-
-// using namespace hppl;
-
-struct Pair {
-  __device__ __forceinline__ Pair() {}
-
-  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}
-
-  __device__ __forceinline__ void set(real value, int id) {
-    v_ = value;
-    id_ = id;
-  }
-
-  __device__ __forceinline__ void operator=(const Pair& in) {
-    v_ = in.v_;
-    id_ = in.id_;
-  }
-
-  __device__ __forceinline__ bool operator<(const real value) const {
-    return (v_ < value);
-  }
-
-  __device__ __forceinline__ bool operator<(const Pair& in) const {
-    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
-  }
-
-  __device__ __forceinline__ bool operator>(const Pair& in) const {
-    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
-  }
-
-  real v_;
-  int id_;
-};
-
-__device__ __forceinline__ void addTo(Pair topK[],
-                                      const Pair& p,
-                                      int beamSize) {
-  for (int k = beamSize - 2; k >= 0; k--) {
-    if (topK[k] < p) {
-      topK[k + 1] = topK[k];
-    } else {
-      topK[k + 1] = p;
-      return;
-    }
-  }
-  topK[0] = p;
-}
-
-template <int beamSize>
-__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
-  for (int k = beamSize - 2; k >= 0; k--) {
-    if (topK[k] < p) {
-      topK[k + 1] = topK[k];
-    } else {
-      topK[k + 1] = p;
-      return;
-    }
-  }
-  topK[0] = p;
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(
-    Pair topK[], real* src, int idx, int dim, int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < src[idx]) {
-      Pair tmp(src[idx], idx);
-      addTo(topK, tmp, beamSize);
-    }
-    idx += blockSize;
-  }
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(
-    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < src[idx]) {
-      Pair tmp(src[idx], idx);
-      if (tmp < max) {
-        addTo(topK, tmp, beamSize);
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(
-    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < val[idx]) {
-      Pair tmp(val[idx], col[idx]);
-      addTo(topK, tmp, beamSize);
-    }
-    idx += blockSize;
-  }
-}
-
-template <int blockSize>
-__device__ __forceinline__ void getTopK(Pair topK[],
-                                        real* val,
-                                        int* col,
-                                        int idx,
-                                        int dim,
-                                        const Pair& max,
-                                        int beamSize) {
-  while (idx < dim) {
-    if (topK[beamSize - 1] < val[idx]) {
-      Pair tmp(val[idx], col[idx]);
-      if (tmp < max) {
-        addTo(topK, tmp, beamSize);
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-template <int maxLength, int blockSize>
-__device__ __forceinline__ void threadGetTopK(Pair topK[],
-                                              int& beam,
-                                              int beamSize,
-                                              real* src,
-                                              bool& firstStep,
-                                              bool& isEmpty,
-                                              Pair& max,
-                                              int dim,
-                                              const int tid) {
-  if (beam > 0) {
-    int length = beam < beamSize ? beam : beamSize;
-    if (firstStep) {
-      firstStep = false;
-      getTopK<blockSize>(topK, src, tid, dim, length);
-    } else {
-      for (int k = 0; k < maxLength; k++) {
-        if (k < maxLength - beam) {
-          topK[k] = topK[k + beam];
-        } else {
-          topK[k].set(-HL_FLOAT_MAX, -1);
-        }
-      }
-      if (!isEmpty) {
-        getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
-      }
-    }
-
-    max = topK[maxLength - 1];
-    if (max.id_ == -1) isEmpty = true;
-    beam = 0;
-  }
-}
-
-template <int maxLength, int blockSize>
-__device__ __forceinline__ void threadGetTopK(Pair topK[],
-                                              int& beam,
-                                              int beamSize,
-                                              real* val,
-                                              int* col,
-                                              bool& firstStep,
-                                              bool& isEmpty,
-                                              Pair& max,
-                                              int dim,
-                                              const int tid) {
-  if (beam > 0) {
-    int length = beam < beamSize ? beam : beamSize;
-    if (firstStep) {
-      firstStep = false;
-      getTopK<blockSize>(topK, val, col, tid, dim, length);
-    } else {
-      for (int k = 0; k < maxLength; k++) {
-        if (k < maxLength - beam) {
-          topK[k] = topK[k + beam];
-        } else {
-          topK[k].set(-HL_FLOAT_MAX, -1);
-        }
-      }
-      if (!isEmpty) {
-        getTopK<blockSize>(
-            topK + maxLength - beam, val, col, tid, dim, max, length);
-      }
-    }
-
-    max = topK[maxLength - 1];
-    if (max.id_ == -1) isEmpty = true;
-    beam = 0;
-  }
-}
-
-template <int maxLength, int blockSize>
-__device__ __forceinline__ void blockReduce(Pair* shTopK,
-                                            int* maxId,
-                                            Pair topK[],
-                                            real** topVal,
-                                            int** topIds,
-                                            int& beam,
-                                            int& beamSize,
-                                            const int tid,
-                                            const int warp) {
-  while (true) {
-    __syncthreads();
-    if (tid < blockSize / 2) {
-      if (shTopK[tid] < shTopK[tid + blockSize / 2]) {
-        maxId[tid] = tid + blockSize / 2;
-      } else {
-        maxId[tid] = tid;
-      }
-    }
-    __syncthreads();
-    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
-      if (tid < stride) {
-        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
-          maxId[tid] = maxId[tid + stride];
-        }
-      }
-      __syncthreads();
-    }
-    __syncthreads();
-
-    if (tid == 0) {
-      **topVal = shTopK[maxId[0]].v_;
-      **topIds = shTopK[maxId[0]].id_;
-      (*topVal)++;
-      (*topIds)++;
-    }
-    if (tid == maxId[0]) beam++;
-    if (--beamSize == 0) break;
-    __syncthreads();
-
-    if (tid == maxId[0]) {
-      if (beam < maxLength) {
-        shTopK[tid] = topK[beam];
-      }
-    }
-    if (maxId[0] / 32 == warp) {
-      if (__shfl(beam, (maxId[0]) % 32, 32) == maxLength) break;
-    }
-  }
-}
-
-/**
- * Each block compute one sample.
- * In a block:
- * 1. every thread get top maxLength value;
- * 2. merge to shTopK, block reduce and get max value;
- * 3. go to the second setp, until one thread's topK value is null;
- * 4. go to the first setp, until get the topK value.
- */
-template <int maxLength, int blockSize>
-__global__ void KeMatrixTopK(real* topVal,
-                             int ldv,
-                             int* topIds,
-                             real* src,
-                             int lds,
-                             int dim,
-                             int beamSize) {
-  __shared__ Pair shTopK[blockSize];
-  __shared__ int maxId[blockSize / 2];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-  src += blockIdx.x * lds;
-  topVal += blockIdx.x * ldv;
-  topIds += blockIdx.x * beamSize;
-
-  Pair topK[maxLength];  // NOLINT
-  int beam = maxLength;
-  Pair max;
-  bool isEmpty = false;
-  bool firstStep = true;
-
-  for (int k = 0; k < maxLength; k++) {
-    topK[k].set(-HL_FLOAT_MAX, -1);
-  }
-  while (beamSize) {
-    threadGetTopK<maxLength, blockSize>(
-        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
-
-    shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>(
-        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
-  }
-}
-
-template <int maxLength, int blockSize>
-__global__ void KeSMatrixTopK(real* topVal,
-                              int ldv,
-                              int* topIds,
-                              real* val,
-                              int* row,
-                              int* col,
-                              int beamSize) {
-  __shared__ Pair shTopK[blockSize];
-  __shared__ int maxId[blockSize / 2];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-  topVal += blockIdx.x * ldv;
-  topIds += blockIdx.x * beamSize;
-
-  Pair topK[maxLength];  // NOLINT
-  int beam = maxLength;
-  Pair max;
-  bool isEmpty = false;
-  bool firstStep = true;
-
-  int start = row[blockIdx.x];
-  int end = row[blockIdx.x + 1];
-  int dim = end - start;
-  val += start;
-  col += start;
-
-  if (beamSize > dim) {
-    // if the number of values to sort are less than the output size,
-    // use -1 to indicate the end of valid sorted values.
-    if (tid == 0) {
-      topIds[dim] = -1;
-    }
-
-    beamSize = dim;
-  }
-
-  for (int k = 0; k < maxLength; k++) {
-    topK[k].set(-HL_FLOAT_MAX, -1);
-  }
-  while (beamSize) {
-    threadGetTopK<maxLength, blockSize>(
-        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
-
-    shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>(
-        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
-  }
-}
-
-void hl_matrix_top_k(real* topVal,
-                     int ldv,
-                     int* topIds,
-                     real* src,
-                     int lds,
-                     int dim,
-                     int beamSize,
-                     int numSamples) {
-  CHECK_NOTNULL(topVal);
-  CHECK_NOTNULL(topIds);
-  CHECK_NOTNULL(src);
-
-  if (beamSize > dim) beamSize = dim;
-
-  dim3 threads(256, 1);
-  dim3 grid(numSamples, 1);
-  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      topVal, ldv, topIds, src, lds, dim, beamSize);
-
-  CHECK_SYNC("hl_matrix_top_k failed");
-}
-
-void hl_sparse_matrix_top_k(real* topVal,
-                            int ldv,
-                            int* topIds,
-                            hl_sparse_matrix_s src,
-                            int beamSize,
-                            int numSamples) {
-  CHECK_NOTNULL(topVal);
-  CHECK_NOTNULL(topIds);
-  CHECK_NOTNULL(src);
-  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";
-
-  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
-  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
-    LOG(FATAL) << "parameter src is null!";
-  }
-
-  dim3 threads(256, 1);
-  dim3 grid(numSamples, 1);
-  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
-
-  CHECK_SYNC("hl_sparse_matrix_top_k failed");
-}
-
-/**
- * Each block compute one sample.
- * In a block:
- * 1. every thread get top maxLength value;
- * 2. merge to shTopK, block reduce and get max value;
- * 3. go to the second setp, until one thread's topK value is null;
- * 4. go to the first setp, until get the topK value.
- */
-template <int maxLength, int blockSize>
-__global__ void KeMatrixTopKClassificationError(real* topVal,
-                                                int ldv,
-                                                int* topIds,
-                                                real* src,
-                                                int lds,
-                                                int dim,
-                                                int beamSize,
-                                                int* label,
-                                                real* recResult) {
-  __shared__ Pair shTopK[blockSize];
-  __shared__ int maxId[blockSize / 2];
-  const int tid = threadIdx.x;
-  const int warp = threadIdx.x / 32;
-  src += blockIdx.x * lds;
-  topVal += blockIdx.x * ldv;
-  topIds += blockIdx.x * beamSize;
-
-  Pair topK[maxLength];  // NOLINT
-  int beam = maxLength;
-  Pair max;
-  bool isEmpty = false;
-  bool firstStep = true;
-  int topkSize = beamSize;
-
-  for (int k = 0; k < maxLength; k++) {
-    topK[k].set(-HL_FLOAT_MAX, -1);
-  }
-
-  while (beamSize) {
-    threadGetTopK<maxLength, blockSize>(
-        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
-
-    shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>(
-        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
-  }
-
-  __syncthreads();
-  if (tid == 0) {
-    for (int i = 0; i < topkSize; i++) {
-      if (*--topIds == label[blockIdx.x]) {
-        recResult[blockIdx.x] = 0;
-        break;
-      }
-      recResult[blockIdx.x] = 1.0f;
-    }
-  }
-}
-
-void hl_matrix_classification_error(real* topVal,
-                                    int ldv,
-                                    int* topIds,
-                                    real* src,
-                                    int lds,
-                                    int dim,
-                                    int topkSize,
-                                    int numSamples,
-                                    int* label,
-                                    real* recResult) {
-  CHECK_NOTNULL(topVal);
-  CHECK_NOTNULL(topIds);
-  CHECK_NOTNULL(src);
-
-  if (topkSize > dim) topkSize = dim;
-
-  dim3 threads(256, 1);
-  dim3 grid(numSamples, 1);
-  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
-      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
-
-  CHECK_SYNC("hl_matrix_top_k classification error failed");
-}
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
deleted file mode 100644
index 5111bceaff224f2467fe1b6c92daed03414dd12e..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "hl_warpctc_wrap.h"
-#include <mutex>
-#include "paddle/utils/DynamicLoader.h"
-#include "paddle/utils/Logging.h"
-
-namespace dynload {
-
-std::once_flag warpctc_dso_flag;
-void* warpctc_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load warpctc routine
- * via operator overloading. When PADDLE_USE_DSO is
- * false, you need to add the path of libwarp-ctc.so to
- * the linked-libs of paddle or to LD_PRELOAD.
- */
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
-  struct DynLoad__##__name {                                           \
-    template <typename... Args>                                        \
-    auto operator()(Args... args) -> decltype(__name(args...)) {       \
-      using warpctcFunc = decltype(__name(args...)) (*)(Args...);      \
-      std::call_once(                                                  \
-          warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \
-      void* p_##_name = dlsym(warpctc_dso_handle, #__name);            \
-      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
-    }                                                                  \
-  } __name;  // struct DynLoad__##__name
-
-// include all needed warp-ctc functions
-DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
-DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString)
-DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss)
-DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
-
-#undef DYNAMIC_LOAD_WARPCTC_WRAP
-
-} /* namespace dynload */
-
-#define WARPCTC_GET_VERSION dynload::get_warpctc_version
-#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
-
-static int g_warpctcVersion = -1;
-#ifndef PADDLE_TYPE_DOUBLE
-#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
-#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
-#else
-hl_warpctc_status_t fatal(...) {
-  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion
-             << "] Error: not support double precision.";
-  // both of get_warpctc_version() and get_workspace_size() return an ctcStatus
-  // type value
-  return CTC_STATUS_EXECUTION_FAILED;
-}
-#define WARPCTC_COMPUTE_LOSS fatal
-#define WARPCTC_GET_WORKSPACE_SIZE fatal
-#endif
-
-/**
- * Check build-in warp-ctc function using glog and it also
- * support << operator for more details error info.
- */
-#define CHECK_WARPCTC(warpctcStat)                \
-  CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
-      << "warp-ctc [version " << g_warpctcVersion \
-      << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " "
-
-void hl_warpctc_init(const size_t blank,
-                     bool useGpu,
-                     hl_warpctc_options_t* options) {
-  CHECK_NOTNULL(options);
-
-  g_warpctcVersion = WARPCTC_GET_VERSION();
-
-  if (useGpu) {
-#ifdef __NVCC__
-    options->loc = CTC_GPU;
-    options->stream = STREAM_DEFAULT;
-#else
-    LOG(FATAL) << "[warpctc init] GPU is not enabled.";
-#endif
-  } else {
-    options->loc = CTC_CPU;
-    options->num_threads = 1;
-  }
-
-  options->blank_label = blank;
-}
-
-void hl_warpctc_compute_loss(const real* batchInput,
-                             real* batchGrad,
-                             const int* cpuLabels,
-                             const int* cpuLabelLengths,
-                             const int* cpuInputLengths,
-                             const size_t numClasses,
-                             const size_t numSequences,
-                             real* cpuCosts,
-                             void* workspace,
-                             hl_warpctc_options_t* options) {
-  CHECK_NOTNULL(batchInput);
-  CHECK_NOTNULL(cpuLabels);
-  CHECK_NOTNULL(cpuLabelLengths);
-  CHECK_NOTNULL(cpuInputLengths);
-  CHECK_NOTNULL(cpuCosts);
-  CHECK_NOTNULL(workspace);
-  CHECK_NOTNULL(options);
-
-  CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput,
-                                     batchGrad,
-                                     cpuLabels,
-                                     cpuLabelLengths,
-                                     cpuInputLengths,
-                                     numClasses,
-                                     numSequences,
-                                     cpuCosts,
-                                     workspace,
-                                     *options));
-}
-
-void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
-                                   const int* cpuInputLengths,
-                                   const size_t numClasses,
-                                   const size_t numSequences,
-                                   hl_warpctc_options_t* options,
-                                   size_t* bytes) {
-  CHECK_NOTNULL(cpuLabelLengths);
-  CHECK_NOTNULL(cpuInputLengths);
-  CHECK_NOTNULL(options);
-  CHECK_NOTNULL(bytes);
-
-  CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths,
-                                           cpuInputLengths,
-                                           numClasses,
-                                           numSequences,
-                                           *options,
-                                           bytes));
-}
diff --git a/paddle/fluid/framework/.clang-format b/paddle/fluid/.clang-format
similarity index 100%
rename from paddle/fluid/framework/.clang-format
rename to paddle/fluid/.clang-format
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index d725763b01d5953985f8e090605f68a8419b5498..d274d96c29bdbf5973d568d783369c3975bdc436 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
-add_subdirectory(inference)
 add_subdirectory(string)
 add_subdirectory(recordio)
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 15e5574ecfd406b87db8370948352b7e736937ea..6286dda4a54991b7a1042aed9886fdcb694198ba 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -1,14 +1,15 @@
+add_subdirectory(details)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
 
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-
+cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type)
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -20,9 +21,9 @@ endif()
 
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
-nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place paddle_memory device_context init)
+nv_test(mixed_vector_test SRCS mixed_vector_test.cu DEPS place memory device_context init)
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
 
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
@@ -56,7 +57,7 @@ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
 cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
-cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
+cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
@@ -73,19 +74,25 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 add_custom_command(TARGET framework_py_proto POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto
-    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+    COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
     COMMENT "Copy generated python proto into directory paddle/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
-cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto backward glog lod_rank_table feed_fetch_method)
+if(WITH_DISTRIBUTE)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+else()
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
+endif()
+
+
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
@@ -100,7 +107,8 @@ cc_test(init_test SRCS init_test.cc DEPS init)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
       
-cc_test(channel_test SRCS channel_test.cc)
+# cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
-        channel_send_op channel_recv_op sum_op elementwise_add_op executor proto_desc)
+        channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
+        conditional_block_op while_op assign_op print_op executor proto_desc)
diff --git a/paddle/fluid/framework/backward.cc b/paddle/fluid/framework/backward.cc
deleted file mode 100644
index 1314af2b3dab281bd201e6a77bfbe87e0bd58ffb..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/backward.cc
+++ /dev/null
@@ -1,585 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/backward.h"
-#include "paddle/fluid/operators/net_op.h"
-
-#include <deque>
-#include <list>
-#include <memory>
-#include <unordered_set>
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/net_op.h"
-
-namespace paddle {
-namespace framework {
-
-static std::unordered_set<std::string>* g_ctrl_flow_ops_ = nullptr;
-// Control Flow operators's backward is significantly different from
-// computational operators. Hack Code here.
-// We should design a better way to backward CtrlFlowOps.
-static std::unordered_set<std::string>& CtrlFlowOps() {
-  if (g_ctrl_flow_ops_ == nullptr) {
-    g_ctrl_flow_ops_ = new std::unordered_set<std::string>{
-        "increment", "lod_rank_table", "less_than"};
-  }
-  return *g_ctrl_flow_ops_;
-}
-
-static inline std::unique_ptr<OperatorBase> CreateGradOp(
-    const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
-    std::unordered_map<std::string, std::string>* grad_to_var) {
-  OpDesc op_desc;
-  op_desc.SetInputMap(op.Inputs());
-  op_desc.SetOutputMap(op.Outputs());
-  op_desc.SetType(op.Type());
-  op_desc.SetAttrMap(op.Attrs());
-  auto& info = OpInfoMap::Instance().Get(op.Type());
-  auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {});
-  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
-  grad_ops.reserve(grad_descs.size());
-  std::transform(grad_descs.begin(), grad_descs.end(),
-                 std::back_inserter(grad_ops),
-                 [](const std::unique_ptr<OpDesc>& grad_desc) {
-                   return OpRegistry::CreateOp(*grad_desc);
-                 });
-  PADDLE_ENFORCE(!grad_ops.empty());
-  if (grad_ops.size() == 1) {
-    return std::move(grad_ops[0]);
-  } else {
-    auto net_op = new operators::NetOp();
-    for (auto& grad_op : grad_ops) {
-      net_op->AppendOp(std::move(grad_op));
-    }
-    net_op->CompleteAddOp();
-    return std::unique_ptr<OperatorBase>(net_op);
-  }
-}
-
-template <typename Map, typename T>
-static void ForEachVarName(const Map& names, T callback) {
-  for (auto& name : names) {
-    for (auto& n : name.second) {
-      if (callback(n)) return;
-    }
-  }
-}
-
-// return whether all the names + suffixes in the set
-static bool AllInSet(
-    const std::map<std::string, std::vector<std::string>>& names,
-    const std::string& suffix, const std::unordered_set<std::string>& set) {
-  bool all_in_set = true;
-  ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) {
-    all_in_set = set.find(n + suffix) != set.end();
-    return !all_in_set;
-  });
-  return all_in_set;
-}
-
-static std::unique_ptr<OperatorBase> NOP() {
-  auto net_op = new operators::NetOp();
-  net_op->SetType("@NOP@");
-  net_op->CompleteAddOp();
-  return std::unique_ptr<OperatorBase>(net_op);
-}
-
-//  Get backward operator from a forward operator, a recursive implementation.
-//
-//  no_grad_names the gradient variable names without gradient calculating.
-//
-//  uniq_id is a unique index used inside recursively calling
-//  BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and
-//  pass `uniq_id` through recursive calling.
-//
-//  returns The backward operator. In a simple situation, it may be a simple
-//  operator, in a complex situation, it maybe a NetOp.
-//
-//  See Backward.h for details
-static std::unique_ptr<OperatorBase> BackwardRecursive(
-    const OperatorBase& forwardOp,
-    std::unordered_set<std::string>& no_grad_names,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    size_t& uniq_id) {
-  //  If all input gradients of forwarding operator do not need to calculate,
-  //  just return an NOP. Not return null ptr because NOP does not take
-  //  too much time for calculation, but it is useful for simplifying logic.
-  if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/,
-               no_grad_names /*set*/)) {
-    return NOP();
-  }
-
-  //  All output gradients of forwarding operator do not need to calculate.
-  //  Then all input gradients cannot be computed at all, and we put them into
-  //  `no_grad_names` set. Return an NOP.
-  if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/,
-               no_grad_names /*set*/)) {
-    ForEachVarName(forwardOp.Inputs(),
-                   [&no_grad_names](const std::string& name) -> bool {
-                     no_grad_names.insert(GradVarName(name));
-                     return false;
-                   });
-    return NOP();
-  }
-
-  // Returned gradient network
-  auto net = std::unique_ptr<operators::NetOp>(new operators::NetOp());
-
-  if (forwardOp.IsNetOp()) {
-    // Because forwardOp is a net op, it can static_cast.
-    auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
-
-    // Map from output gradient variable name to operator's indices in
-    // backward net's ops_. That operator generates that variable.
-    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
-
-    size_t local_op_id = 0;
-    // reversely travel forwardNet and collect all duplicate outputs.
-    for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
-         ++it, ++local_op_id) {
-      auto& fwd = *it;
-      auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id);
-      ForEachVarName(bwd->Outputs(),
-                     [&dup_output_ops, local_op_id](const std::string& out) {
-                       dup_output_ops[out].emplace_back(local_op_id);
-                       return false;
-                     });
-      net->AppendOp(std::move(bwd));
-    }
-    // Get unique ID for this method.
-    auto uid = uniq_id++;
-    // TODO(dzh): more comment
-    // multiple operators which have the same output (y for example) may
-    // overwrite the same y variable when backward, special operations are token
-    // to handle this case. For each duplicate output, rename it to an alias
-    // (original name with a offset), append an `add` op for its operator,
-    // and finally sum all the alias variable to the final output variable y.
-    using Pos = std::pair<size_t, std::unique_ptr<OperatorBase>>;
-    std::list<Pos> insert_position;
-    for (auto& dup_output_op : dup_output_ops) {
-      const std::string& name = dup_output_op.first;
-      // duplicate @Empty@ don't need to be added
-      if (name == kEmptyVarName) continue;
-
-      auto& dup_op = dup_output_op.second;
-      // no duplicate output
-      if (dup_op.size() == 1) continue;
-
-      // process the duplicate outputs
-      std::vector<std::string> dup_outputs;
-      for (size_t i = 0; i < dup_op.size(); ++i) {
-        // rename each duplicate output to an alias
-        auto op_offset = dup_op[i];
-        dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
-                              std::to_string(i));
-        net->ops_[op_offset]->Rename(name, dup_outputs.back());
-      }
-      // collect all the offset for each alias,
-      // insert a sum operator to add all aliases to output
-      insert_position.push_back(
-          {dup_op.back(),
-           OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}},
-                                AttributeMap{})});
-    }
-
-    // make sure the inserted `sum` ops follow the BFS order.
-    insert_position.sort(
-        [](const Pos& l, const Pos& r) { return l.first > r.first; });
-
-    for (auto& pos : insert_position) {
-      net->InsertOp(pos.first + 1, std::move(pos.second));
-    }
-  } else {
-    std::unique_ptr<OperatorBase> grad_op(
-        CreateGradOp(forwardOp, no_grad_names, grad_to_var));
-
-    ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
-                                          const std::string& grad_input) {
-      if (no_grad_names.count(grad_input)) {
-        // +1 for \0
-        std::string prefix = grad_input.substr(
-            0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
-        grad_op->Rename(grad_input, prefix + kZeroVarSuffix);
-
-        // If part of input gradient of that operator is not calculated, fill
-        // zero variables to that input gradient.
-        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
-                                           {{"Out", {grad_input}}},
-                                           AttributeMap{}));
-      }
-      return false;
-    });
-
-    ForEachVarName(grad_op->Outputs(),
-                   [&no_grad_names, &grad_op](const std::string& grad_output) {
-                     if (no_grad_names.count(grad_output)) {
-                       grad_op->Rename(grad_output, kEmptyVarName);
-                     }
-                     return false;
-                   });
-
-    if (net->ops_.empty()) {  // Current no aux op is added to network
-      return grad_op;
-    }
-    net->AppendOp(std::move(grad_op));
-  }
-  net->SetType("@GENERATED_BACKWARD@");
-  net->CompleteAddOp();
-  return std::unique_ptr<OperatorBase>(
-      static_cast<OperatorBase*>(net.release()));
-}
-
-// See header for comments
-std::unique_ptr<OperatorBase> Backward(
-    const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars) {
-  std::unordered_set<std::string> no_grad_names;
-  no_grad_names.reserve(no_grad_vars.size() + 1);
-
-  no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
-
-  for (auto& name : no_grad_vars) {
-    no_grad_names.insert(name + kGradVarSuffix);
-  }
-  size_t uid = 0;
-  std::unordered_map<std::string, std::string> grad_to_var;
-  return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid);
-}
-
-// ====================================  //
-
-static bool AllGradInSet(const std::vector<std::string>& names,
-                         const std::unordered_set<std::string>& set) {
-  for (const std::string& name : names) {
-    if (!set.count(GradVarName(name))) {
-      return false;
-    }
-  }
-  if (VLOG_IS_ON(10)) {
-    std::ostringstream sout;
-    sout << "All input {";
-    for (auto& name : names) {
-      sout << name << ",";
-    }
-    sout << "} is in {";
-    for (auto& name : set) {
-      sout << name << ",";
-    }
-    sout << "}";
-    VLOG(10) << sout.str();
-  }
-  return true;
-}
-
-static std::string FwdName(const std::string& grad_name) {
-  auto pos = grad_name.find("@GRAD");
-  if (pos == std::string::npos) {
-    return "";
-  } else {
-    return grad_name.substr(0, pos);
-  }
-}
-
-static void CreateGradVarInBlock(
-    size_t grad_op_start_index,
-    const std::unordered_map<std::string, std::string>& param_name_map,
-    BlockDesc* block_desc,
-    std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
-  auto ops = block_desc->AllOps();
-  for (size_t op_index = grad_op_start_index; op_index < ops.size();
-       ++op_index) {
-    std::unordered_set<std::string> new_vars;
-    auto& ctrl_flow_ops = CtrlFlowOps();
-    ForEachVarName(ops[op_index]->Outputs(),
-                   [&](const std::string& grad_var_name) {
-                     if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
-                         ctrl_flow_ops.end()) {
-                       if (block_desc->HasVarRecursive(grad_var_name)) {
-                         return false;
-                       }
-                     } else {
-                       if (block_desc->HasVar(grad_var_name)) {
-                         return false;
-                       }
-                     }
-                     if (grad_var_name == framework::kEmptyVarName) {
-                       return false;
-                     }
-                     auto var = block_desc->Var(grad_var_name);
-                     VLOG(10) << "Creating Variable " << grad_var_name;
-                     new_vars.insert(var->Name());
-                     auto it = param_name_map.find(grad_var_name);
-                     if (it == param_name_map.end()) {
-                       return false;
-                     }
-                     auto param_var_name = it->second;
-                     auto& grad_record = (*grad_var_record)[param_var_name];
-                     grad_record.name_ = grad_var_name;
-                     grad_record.block_idx_ = block_desc->ID();
-                     grad_record.op_idx_ = static_cast<int>(op_index);
-                     return false; /* not break */
-                   });
-    ops[op_index]->InferVarType(block_desc);
-    for (auto& arg : ops[op_index]->OutputArgumentNames()) {
-      if (new_vars.find(arg) == new_vars.end()) {
-        continue;
-      }
-      auto pname = FwdName(arg);
-      auto* param = block_desc->FindVarRecursive(pname);
-      auto* grad = block_desc->FindVar(arg);
-      if (param == nullptr) {
-        grad->SetDataType(proto::VarType::FP32);
-      } else {
-        grad->SetDataType(param->GetDataType());
-      }
-    }
-    ops[op_index]->InferShape(*block_desc);
-  }
-}
-
-std::vector<std::unique_ptr<OpDesc>> MakeOpGrad(
-    const OpDesc* op_desc, std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) {
-  std::vector<std::unique_ptr<OpDesc>> grad_op_descs;
-  // All input gradients of forwarding operator do not need to calculate.
-  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
-  if (AllGradInSet(inputs, *no_grad_vars)) {
-    VLOG(10) << "Drop operator  " << op_desc->Type();
-    return grad_op_descs;  // empty vector
-  }
-
-  // All output gradients of forwarding operator do not need to calculate.
-  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
-
-  if (AllGradInSet(outputs, *no_grad_vars)) {
-    VLOG(10) << "Drop operator " << op_desc->Type();
-    // FIXME: Hack code here
-    auto& ctrl_flow_ops = CtrlFlowOps();
-    if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
-      // Only computational op need drop input's gradient.
-      for (const std::string& name : inputs) {
-        no_grad_vars->insert(GradVarName(name));
-        VLOG(10) << " Also drop " << GradVarName(name);
-      }
-    }
-
-    return grad_op_descs;  // empty vector
-  }
-
-  grad_op_descs =
-      OpInfoMap::Instance()
-          .Get(op_desc->Type())
-          .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);
-
-  std::list<std::unique_ptr<OpDesc>> pending_fill_zeros_ops;
-  for (auto& desc : grad_op_descs) {
-    for (const std::string& in_name : desc->InputArgumentNames()) {
-      if (no_grad_vars->count(in_name)) {
-        std::string prefix = in_name.substr(
-            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
-        std::string new_name = prefix + kZeroVarSuffix;
-        desc->Rename(in_name, new_name);
-        std::unique_ptr<OpDesc> fill_zeros_op(
-            new OpDesc("fill_zeros_like", {{"X", {prefix}}},
-                       {{"Out", {new_name}}}, AttributeMap{}));
-        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
-      }
-    }
-  }
-
-  for (auto& p : pending_fill_zeros_ops) {
-    grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
-  }
-  return grad_op_descs;
-}
-
-static BlockDesc* CreateStepBlock(
-    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    int step_block_idx);
-
-std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
-    ProgramDesc& program_desc, int block_idx,
-    std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var) {
-  VLOG(5) << "MakeBlockBackward";
-  BlockDesc* cur_block = program_desc.MutableBlock(block_idx);
-  std::vector<OpDesc*> op_descs = cur_block->AllOps();
-  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
-  size_t grad_desc_idx = 0;
-  std::vector<std::unique_ptr<OpDesc>> backward_descs;
-
-  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
-    VLOG(5) << "Making backward " << (*it)->Type() << " op";
-    std::vector<std::unique_ptr<OpDesc>> op_grads;
-
-    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" ||
-        (*it)->Type() == "parallel_do") {
-      int step_block_idx = (*it)->GetBlockAttr("sub_block");
-      BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
-                                                  grad_to_var, step_block_idx);
-      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
-    } else if ((*it)->Type() == "conditional_block") {
-      BlockDesc* backward_block =
-          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
-                          (*it)->GetBlockAttr("sub_block"));
-      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
-    } else {
-      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
-    }
-
-    if (VLOG_IS_ON(10)) {
-      std::ostringstream sout;
-      sout << "Made ";
-      for (auto& op_grad : op_grads) {
-        sout << op_grad->Type() << " ";
-      }
-      VLOG(10) << sout.str();
-    }
-
-    for (const auto& desc : op_grads) {
-      for (const std::string& out_name : desc->OutputArgumentNames()) {
-        if (out_name.find("@GRAD") == std::string::npos) {
-          // Not all outputs of a backward operator is a gradient. Only gradient
-          // need to be sum. Skip variables are not gradient.
-          continue;
-        }
-        dup_out_ops[out_name].emplace_back(grad_desc_idx);
-      }
-      ++grad_desc_idx;
-    }
-    std::transform(op_grads.begin(), op_grads.end(),
-                   std::back_inserter(backward_descs),
-                   [](std::unique_ptr<OpDesc>& ptr) { return std::move(ptr); });
-  }
-
-  VLOG(5) << "Appending Sums";
-  // Check whether some variables are written more than once
-  std::list<std::pair<size_t, std::unique_ptr<OpDesc>>> pending_sum_ops;
-  for (const auto& dup : dup_out_ops) {
-    const std::string& out_name = dup.first;
-    const std::vector<size_t> dup_op = dup.second;
-    if (out_name != kEmptyVarName && dup_op.size() > 1) {
-      std::vector<std::string> sum_op_inputs;
-      std::string next_g_name = out_name;
-      for (size_t i = 0; i < dup_op.size(); ++i) {
-        VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
-                 << " duplicated";
-        std::string new_name = out_name + "@RENAME@" + std::to_string(i);
-        backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
-        backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
-        sum_op_inputs.emplace_back(new_name);
-        next_g_name = sum_op_inputs.back();
-      }
-      std::unique_ptr<OpDesc> sum_op(new OpDesc("sum", {{"X", sum_op_inputs}},
-                                                {{"Out", {out_name}}},
-                                                AttributeMap{}));
-      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
-    }
-  }
-
-  pending_sum_ops.sort([](const std::pair<size_t, std::unique_ptr<OpDesc>>& a,
-                          const std::pair<size_t, std::unique_ptr<OpDesc>>& b) {
-    return a.first > b.first;
-  });
-  for (auto& p : pending_sum_ops) {
-    backward_descs.insert(backward_descs.begin() + p.first + 1,
-                          std::move(p.second));
-  }
-
-  VLOG(5) << "MakeBlockBackward Finished";
-
-  return backward_descs;
-}
-
-static BlockDesc* CreateStepBlock(
-    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    int step_block_idx) {
-  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
-                                                   no_grad_vars, grad_to_var);
-  BlockDesc* backward_block =
-      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
-  for (auto& ptr : backward_block_op_descs) {
-    backward_block->AppendAllocatedOp(move(ptr));
-  }
-  return backward_block;
-}
-
-ParamGradInfoMap AppendBackward(
-    ProgramDesc& program_desc, const VarDesc& target,
-    const std::unordered_set<std::string>& no_grad_vars) {
-  std::unordered_set<std::string> no_grad_var_names;
-  no_grad_var_names.reserve(no_grad_vars.size() + 1);
-  no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
-  for (auto& name : no_grad_vars) {
-    no_grad_var_names.insert(GradVarName(name));
-  }
-
-  const int root_block_idx = 0;
-  auto root_block = program_desc.MutableBlock(root_block_idx);
-
-  std::string fill_one_op_out = GradVarName(target.Name());
-  bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
-  PADDLE_ENFORCE(is_scalar, "target should be scalar");
-  VLOG(3) << "backward from loss=" << target.Name()
-          << " data_type=" << target.GetDataType();
-  std::unique_ptr<OpDesc> fill_one_op(
-      new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                 {{"shape", std::vector<int>{1}},
-                  {"value", static_cast<float>(1.0)},
-                  {"dtype", target.GetDataType()}}));
-  // infer var type of fill_one_op
-  fill_one_op->InferVarType(root_block);
-
-  root_block->AppendAllocatedOp(std::move(fill_one_op));
-  size_t forward_op_num = root_block->OpSize();
-  size_t forward_block_num = program_desc.Size();
-
-  // Insert backward operators
-  std::unordered_map<std::string, std::string> grad_to_var;
-  auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
-                                             &no_grad_var_names, &grad_to_var);
-
-  for (auto& ptr : backward_op_descs) {
-    root_block->AppendAllocatedOp(std::move(ptr));
-  }
-  // Create Variable
-
-  // Create target gradient variable
-  std::unordered_map<std::string, GradVarInfo> retv;
-
-  auto var = root_block->Var(fill_one_op_out);
-  var->SetDataType(target.GetDataType());
-  var->SetShape(target.GetShape());
-  auto& target_grad = retv[target.Name()];
-  target_grad.name_ = fill_one_op_out;
-  target_grad.block_idx_ = root_block_idx;
-  target_grad.op_idx_ = static_cast<int>(forward_op_num);
-
-  // create grad_var for all blocks in this program
-  CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
-  for (size_t block_index = forward_block_num;
-       block_index < program_desc.Size(); ++block_index) {
-    CreateGradVarInBlock(0, grad_to_var, program_desc.MutableBlock(block_index),
-                         &retv);
-  }
-  return retv;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/backward.h b/paddle/fluid/framework/backward.h
deleted file mode 100644
index 3a971090c25c85efbf976532c364371baba9a870..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/backward.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-
-// Create the backward operator from a forward operator.
-// TODO(yuyang18): Add more API reference comment.
-extern std::unique_ptr<OperatorBase> Backward(
-    const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars);
-
-struct GradVarInfo {
-  GradVarInfo() {}
-  GradVarInfo(const std::string& name, int block_idx, int op_idx)
-      : name_(name), block_idx_(block_idx), op_idx_(op_idx) {}
-
-  bool operator==(const GradVarInfo& b) const {
-    return name_ == b.name_ && block_idx_ == b.block_idx_ &&
-           op_idx_ == b.op_idx_;
-  }
-
-  std::string name_;
-  int block_idx_;
-  int op_idx_;
-};
-
-using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
-                                            GradVarInfo /*grad_var_info*/>;
-
-ParamGradInfoMap AppendBackward(
-    ProgramDesc& program_desc, const VarDesc& target,
-    const std::unordered_set<std::string>& no_grad_vars);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/backward_test.cc b/paddle/fluid/framework/backward_test.cc
deleted file mode 100644
index cc1f871360ed3f7071364dbb0f932bfd997cadb0..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/backward_test.cc
+++ /dev/null
@@ -1,918 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/backward.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/operators/net_op.h"
-
-USE_NO_KERNEL_OP(fill_constant);
-
-namespace paddle {
-namespace framework {
-
-using DeviceContext = platform::DeviceContext;
-
-class NoneOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {}
-};
-
-template <typename Place, typename T>
-class NoneKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {}
-};
-
-class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
- public:
-  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input X of Add");
-    AddInput("b", "Bias of Add");
-    AddOutput("Out", "Out of Add");
-    AddComment("Add Op");
-  }
-};
-
-class RowWiseAddGradMaker : public SingleGradOpDescMaker {
- public:
-  using SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<OpDesc> Apply() const override {
-    auto grad_op = new OpDesc();
-    grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
-    grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
-    grad_op->SetType("rowwise_add_grad");
-    return std::unique_ptr<OpDesc>(grad_op);
-  }
-};
-
-class MulOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "A");
-    AddInput("Y", "B");
-    AddOutput("Out", "Out");
-    AddAttr<int>("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
-    AddAttr<int>("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
-    AddComment("Mul");
-  }
-};
-
-class SigmoidOpMaker : public OpProtoAndCheckerMaker {
- public:
-  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "X");
-    AddOutput("Out", "Y");
-    AddComment("Sigmoid");
-  }
-};
-
-class NoGradOpMaker : public OpProtoAndCheckerMaker {
- public:
-  NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "X input");
-    AddOutput("Out", "Y output");
-    AddComment("NoGradOp, same input output. no Grad");
-  }
-};
-
-class FcOp : public operators::NetOp {
- public:
-  FcOp(const std::string &type, const VariableNameMap &inputs,
-       const VariableNameMap &outputs, const AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    AppendOp(OpRegistry::CreateOp(
-        "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}},
-        {{"Out", {Output("mul_result")}}}, AttributeMap{}));
-    auto input_b = Inputs("b");
-    std::string before_act = "mul_result";
-    if (input_b.size() != 0) {
-      AppendOp(OpRegistry::CreateOp(
-          "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
-          {{"Out", {Output("add_result")}}}, AttributeMap{}));
-      before_act = "add_result";
-    } else {
-      auto out_varname = Output("add_result");
-      if (out_varname != kEmptyVarName) {
-        this->Rename(out_varname, kEmptyVarName);
-      }
-    }
-
-    AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
-                                  {{"Out", {Output("Out")}}}, AttributeMap{}));
-    CompleteAddOp(false);
-  }
-};
-
-class FcOpMaker : public OpProtoAndCheckerMaker {
- public:
-  FcOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x");
-    AddInput("W", "w");
-    AddInput("b", "b");
-    AddOutput("mul_result", "").AsIntermediate();
-    AddOutput("add_result", "").AsIntermediate();
-    AddOutput("Out", "");
-    AddComment("");
-  }
-};
-
-class ManyOutputOpMaker : public OpProtoAndCheckerMaker {
- public:
-  ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("x", "x");
-    AddOutput("y", "y");
-    AddOutput("z", "z");
-    AddComment("");
-  }
-};
-
-class FillZeroOpMaker : public OpProtoAndCheckerMaker {
- public:
-  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x");
-    AddOutput("Out", "out");
-    AddComment("");
-  }
-};
-
-class SumOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
-    AddOutput("Out", "the output tensor of sum operator.");
-    AddComment("");
-  }
-};
-
-class MultInOutOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x");
-    AddInput("H", "h");
-    AddOutput("Y", "y");
-    AddOutput("Z", "z");
-    AddComment("");
-  }
-};
-
-class MinusGradOpDescMaker : public GradOpDescMakerBase {
- public:
-  using GradOpDescMakerBase::GradOpDescMakerBase;
-
-  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
-    std::vector<std::unique_ptr<OpDesc>> retv;
-    auto x_g = InputGrad("X");
-    if (!x_g.empty()) {
-      auto *op_desc = new OpDesc();
-      op_desc->SetType("scale");
-      op_desc->SetInput("X", OutputGrad("Out"));
-      op_desc->SetOutput("Out", x_g);
-      op_desc->SetAttr("scale", 1.0f);
-      retv.emplace_back(op_desc);
-    }
-
-    auto y_g = InputGrad("Y");
-    if (!y_g.empty()) {
-      auto *op_desc = new OpDesc();
-      op_desc->SetType("scale");
-      op_desc->SetInput("X", OutputGrad("Out"));
-      op_desc->SetOutput("Out", y_g);
-      op_desc->SetAttr("scale", -1.0f);
-      retv.emplace_back(op_desc);
-    }
-    return retv;
-  }
-};
-
-class MinusOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "");
-    AddInput("Y", "");
-    AddOutput("Out", "");
-    AddComment("minus for unittest");
-  }
-};
-}  // namespace framework
-}  // namespace paddle
-
-namespace f = paddle::framework;
-namespace ops = paddle::operators;
-using EnforceNotMet = paddle::platform::EnforceNotMet;
-// rowwise_add
-REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker,
-                  f::RowWiseAddGradMaker);
-REGISTER_OP_CPU_KERNEL(rowwise_add,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(rowwise_add_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// mul
-REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(mul_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// sigmoid
-REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(sigmoid,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker);
-// fill_zeros_like
-REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker);
-REGISTER_OP_CPU_KERNEL(fill_zeros_like,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// sum
-REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(sum_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// fc
-REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
-// many_output_op
-REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker,
-            many_output_op_grad, f::NoneOp);
-// mult_in_out
-REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad,
-            f::NoneOp);
-REGISTER_OP_CPU_KERNEL(mult_in_out,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(mult_in_out_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// minus
-REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker);
-REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel<paddle::platform::CPUPlace, float>);
-// scale
-REGISTER_OPERATOR(scale, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel<paddle::platform::CPUPlace, float>);
-
-TEST(Backward, simple_op_not_need_grad) {
-  auto fwd =
-      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
-                              {{"Out", {"out"}}}, f::AttributeMap{});
-  ASSERT_NE(fwd, nullptr);
-  auto gop = f::Backward(*fwd, {"x"});
-  ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName);
-
-  auto no_input_gop = f::Backward(*fwd, {"x", "b"});
-  ASSERT_NE(no_input_gop, nullptr);
-  ASSERT_TRUE(no_input_gop->IsNetOp());
-  ASSERT_EQ(0UL, static_cast<ops::NetOp *>(no_input_gop.get())->ops_.size());
-}
-
-TEST(Backward, net_fc_backward_normal) {
-  std::shared_ptr<f::OperatorBase> fwd =
-      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
-                              {{"mul_result", {"mul_res"}},
-                               {"add_result", {"add_re"}},
-                               {"Out", {"out"}}},
-                              f::AttributeMap{});
-  ASSERT_NE(fwd, nullptr);
-  std::shared_ptr<f::OperatorBase> gop =
-      f::Backward(*fwd, std::unordered_set<std::string>{});
-  ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(gop.get());
-
-  ASSERT_NO_THROW(net->DebugString());
-
-  ASSERT_EQ(3UL, net->ops_.size());
-
-  f::OperatorBase &d_sigmoid = *net->ops_[0];
-  ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
-
-  f::OperatorBase &d_add = *net->ops_[1];
-  ASSERT_EQ("rowwise_add_grad", d_add.Type());
-
-  f::OperatorBase &d_mul = *net->ops_[2];
-  ASSERT_EQ("mul_grad", d_mul.Type());
-}
-
-TEST(Backward, net_fc_backward_not_have_b) {
-  std::shared_ptr<f::OperatorBase> fwd =
-      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}},
-                              {{"mul_result", {"mul_res"}},
-                               {"add_result", {"add_res"}},
-                               {"Out", {"tmp"}}},
-                              f::AttributeMap{});
-  ASSERT_NE(fwd, nullptr);
-  std::shared_ptr<f::OperatorBase> gop =
-      f::Backward(*fwd, std::unordered_set<std::string>{});
-  ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(gop.get());
-
-  ASSERT_NO_THROW(net->DebugString());
-
-  ASSERT_EQ(2UL, net->ops_.size());
-
-  f::OperatorBase &d_sigmoid = *net->ops_[0];
-  ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
-
-  f::OperatorBase &d_mul = *net->ops_[1];
-  ASSERT_EQ("mul_grad", d_mul.Type());
-}
-
-TEST(Backward, net_input_of_network_not_need_grad) {
-  ops::NetOp net;
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
-      {{"mul_result", {"mul_tmp_0"}},
-       {"add_result", {"add_tmp_0"}},
-       {"Out", {"hidden0"}}},
-      f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
-      {{"mul_result", {"mul_tmp_1"}},
-       {"add_result", {"add_tmp_1"}},
-       {"Out", {"hidden1"}}},
-      f::AttributeMap{}));
-  net.CompleteAddOp();
-  auto bwd = Backward(net, {"x"});  // x@GRAD is not need.
-  ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
-
-  auto output_vars = bwd_net->OutputVars(true);
-  std::unordered_set<std::string> all_outputs =
-      std::unordered_set<std::string>(output_vars.begin(), output_vars.end());
-  all_outputs.erase(f::kEmptyVarName);
-
-  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
-    ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end());
-  }
-
-  // Not Generated X
-  ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end());
-
-  ASSERT_EQ(2UL, bwd_net->ops_.size());
-  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
-  auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
-  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
-  ASSERT_EQ(f::kEmptyVarName,
-            first_fc_grad->ops_[2]->Output(f::GradVarName("X")));
-}
-
-TEST(Backward, net_shared_weight) {
-  ops::NetOp net;
-  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
-                                       {{"Out", {"out"}}}, f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
-                                       {{"Out", {"FinalOut"}}},
-                                       f::AttributeMap{}));
-  net.CompleteAddOp();
-
-  auto bwd = f::Backward(net, std::unordered_set<std::string>{});
-  ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
-  ASSERT_EQ(3UL, bwd_net->ops_.size());
-  ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
-}
-
-TEST(Backward, op_all_input_are_not_need) {
-  auto fwd =
-      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
-                              {{"Out", {"out"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"x", "b"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_TRUE(net->ops_.empty());
-}
-
-TEST(Backward, op_all_output_are_not_need) {
-  auto fwd =
-      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
-                              {{"Out", {"out"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"out"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_TRUE(net->ops_.empty());
-}
-
-TEST(Backward, op_part_of_output_are_not_need) {
-  auto fwd =
-      f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
-                              {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"Z"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_EQ(net->ops_.size(), 2UL);
-
-  auto &fill_zero = *net->ops_[0];
-  ASSERT_EQ("fill_zeros_like", fill_zero.Type());
-  ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
-  ASSERT_EQ("Z", fill_zero.Input("X"));
-  ASSERT_EQ(1UL, fill_zero.Outputs("Out").size());
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out"));
-
-  auto &d_many_out = *net->ops_[1];
-  ASSERT_EQ("many_output_op_grad", d_many_out.Type());
-  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size());  // I/O/OG
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix,
-            d_many_out.Input(f::GradVarName("z")));
-  ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y")));
-  ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x")));
-}
-
-TEST(Backward, op_part_of_input_are_not_need) {
-  auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}},
-                                     {{"Out", {"out"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"a"});
-  auto &grad_mul = *backward;
-  ASSERT_EQ(grad_mul.Type(), "mul_grad");
-  ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL);
-  ASSERT_EQ(grad_mul.Outputs().size(), 2UL);
-  ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName);
-  ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b"));
-  ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out"));
-  ASSERT_EQ(grad_mul.Input("X"), "a");
-  ASSERT_EQ(grad_mul.Input("Y"), "b");
-  ASSERT_EQ(grad_mul.Input("Out"), "out");
-}
-
-TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
-  ops::NetOp net;
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-      {{"mul_result", {"mul_out1"}},
-       {"add_result", {"add_out1"}},
-       {"Out", {"out1"}}},
-      f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-      {{"mul_result", {"mul_out2"}},
-       {"add_result", {"tmp_out2"}},
-       {"Out", {"out2"}}},
-      f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
-      {{"mul_result", {"mul_out3"}},
-       {"add_result", {"tmp_out3"}},
-       {"Out", {"out3"}}},
-      f::AttributeMap{}));
-  net.CompleteAddOp();
-
-  auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto bwd_net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_EQ(bwd_net->ops_.size(), 3UL);
-  auto &grad_fc = *bwd_net->ops_[0];
-
-  const char *all = paddle::operators::NetOp::kAll;
-  EXPECT_EQ(grad_fc.Inputs(all).size(),
-            2UL       /* external input number */
-                + 1UL /* external output number*/
-                + 1UL /* number of gradient of external output*/
-                + 2UL /* internal variable number*/
-            );
-  EXPECT_EQ(grad_fc.Outputs(all).size(),
-            2UL       /* input number of mul*/
-                + 2UL /* input number of rowwise_add*/
-                + 1UL /* input number of sigmod */
-                - 1UL /* out2 is not needed*/);
-  EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL);
-  EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL);
-  EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
-  EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
-}
-
-TEST(Backward, simple_single_op) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-
-  f::OpDesc *op = block->AppendOp();
-  op->SetType("rowwise_add");
-  op->SetInput("X", {"x"});
-  op->SetInput("b", {"b"});
-  op->SetOutput("Out", {"out"});
-
-  auto target = f::VarDesc("out");
-  target.SetShape({1});
-  auto var_to_grad =
-      AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 3UL);
-  f::OpDesc *fill_op = block->AllOps()[1];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op = block->AllOps()[2];
-  EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b")}));
-
-  EXPECT_EQ(var_to_grad.size(), 3UL);
-  EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2));
-  EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x")));
-}
-
-TEST(Backward, default_attribute) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op = block->AppendOp();
-  op->SetType("mul");
-  op->SetInput("X", {"x"});
-  op->SetInput("Y", {"y"});
-  op->SetOutput("Out", {"out"});
-  op->CheckAttrs();
-
-  auto target = f::VarDesc("out");
-  target.SetShape({1});
-  AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 3UL);
-  EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
-  EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
-
-  f::OpDesc *fill_op = block->AllOps()[1];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op = block->AllOps()[2];
-  ASSERT_EQ(grad_op->Type(), "mul_grad");
-  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
-  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
-}
-
-TEST(Backward, simple_mult_op) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("rowwise_add");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("b", {"b1"});
-  op1->SetOutput("Out", {"out1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mul");
-  op2->SetInput("X", {"out1"});
-  op2->SetInput("Y", {"y2"});
-  op2->SetOutput("Out", {"out2"});
-
-  f::OpDesc *op3 = block->AppendOp();
-  op3->SetType("rowwise_add");
-  op3->SetInput("X", {"out2"});
-  op3->SetInput("b", {"b3"});
-  op3->SetOutput("Out", {"out3"});
-
-  auto target = f::VarDesc("out3");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad =
-      AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 6UL + 1);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op1 = block->AllOps()[6];
-  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b1")}));
-
-  f::OpDesc *grad_op2 = block->AllOps()[5];
-  EXPECT_EQ(grad_op2->Type(), "mul_grad");
-  ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
-  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
-  EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
-  EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y2")}));
-
-  f::OpDesc *grad_op3 = block->AllOps()[4];
-  EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out3")}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b3")}));
-
-  EXPECT_EQ(var_to_grad.size(), 7UL);
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("out1"),
-            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("out2"),
-            f::GradVarInfo(f::GradVarName("out2"), 0, 4));
-  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out2")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
-}
-
-TEST(Backward, intermedia_var_no_grad) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("rowwise_add");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("b", {"b1"});
-  op1->SetOutput("Out", {"out1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mul");
-  op2->SetInput("X", {"x2"});
-  op2->SetInput("Y", {"y2"});
-  op2->SetOutput("Out", {"out2"});
-
-  f::OpDesc *op3 = block->AppendOp();
-  op3->SetType("rowwise_add");
-  op3->SetInput("X", {"out2"});
-  op3->SetInput("b", {"b3"});
-  op3->SetOutput("Out", {"out3"});
-
-  f::OpDesc *op4 = block->AppendOp();
-  op4->SetType("mul");
-  op4->SetInput("X", {"out1"});
-  op4->SetInput("Y", {"out3"});
-  op4->SetOutput("Out", {"out4"});
-
-  auto target = f::VarDesc("out4");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {"out3"});
-
-  ASSERT_EQ(block->AllOps().size(), 7UL);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op1 = block->AllOps()[6];
-  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b1")}));
-
-  f::OpDesc *grad_op4 = block->AllOps()[5];
-  EXPECT_EQ(grad_op4->Type(), "mul_grad");
-  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
-  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"out3"}));
-  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out4"}));
-  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out4")}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector<std::string>());
-
-  EXPECT_EQ(var_to_grad.size(), 4UL);
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("out1"),
-            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
-}
-
-TEST(Backward, var_no_grad) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("mult_in_out");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("H", {"h1"});
-  op1->SetOutput("Y", {"y1"});
-  op1->SetOutput("Z", {"z1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mult_in_out");
-  op2->SetInput("X", {"y1"});
-  op2->SetInput("H", {"z1"});
-  op2->SetOutput("Y", {"y2"});
-  op2->SetOutput("Z", {"z2"});
-
-  auto target = f::VarDesc("z2");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {"z1"});
-
-  ASSERT_EQ(block->AllOps().size(), 6UL);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op2 = block->AllOps()[3];
-  ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
-  ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
-  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"y1"}));
-  EXPECT_EQ(grad_op2->Input("H"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
-  EXPECT_EQ(grad_op2->Input("Z"), std::vector<std::string>({"z2"}));
-  EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y2")}));
-  EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")),
-            std::vector<std::string>({f::GradVarName("z2")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("y1")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());
-
-  f::OpDesc *fill_zero_op = block->AllOps()[4];
-  ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
-  ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
-  ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
-  EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(fill_zero_op->Output("Out"),
-            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
-
-  f::OpDesc *grad_op1 = block->AllOps()[5];
-  ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input("X"), std::vector<std::string>({"x1"}));
-  EXPECT_EQ(grad_op1->Input("H"), std::vector<std::string>({"h1"}));
-  EXPECT_EQ(grad_op1->Input("Y"), std::vector<std::string>({"y1"}));
-  EXPECT_EQ(grad_op1->Input("Z"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y1")}));
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")),
-            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
-            std::vector<std::string>({f::GradVarName("h1")}));
-
-  EXPECT_EQ(var_to_grad.size(), 4UL);
-  EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3));
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("y1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("h1")));
-}
-
-TEST(Backward, shared_var) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("rowwise_add");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("b", {"b1"});
-  op1->SetOutput("Out", {"out1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mul");
-  op2->SetInput("X", {"out1"});
-  op2->SetInput("Y", {"y2"});
-  op2->SetOutput("Out", {"out2"});
-
-  f::OpDesc *op3 = block->AppendOp();
-  op3->SetType("rowwise_add");
-  op3->SetInput("X", {"out1"});
-  op3->SetInput("b", {"b3"});
-  op3->SetOutput("Out", {"out3"});
-
-  auto target = f::VarDesc("out3");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad =
-      AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 8UL);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op3 = block->AllOps()[4];
-  ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out3")}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0"}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b3")}));
-
-  f::OpDesc *grad_op4 = block->AllOps()[5];
-  ASSERT_EQ(grad_op4->Type(), "mul_grad");
-  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
-  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"y2"}));
-  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out2"}));
-  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@1"}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y2")}));
-
-  f::OpDesc *sum_op = block->AllOps()[6];
-  ASSERT_EQ(sum_op->Type(), "sum");
-  ASSERT_EQ(sum_op->InputNames().size(), 1UL);
-  ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
-  EXPECT_EQ(sum_op->Input("X"),
-            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0",
-                                      f::GradVarName("out1") + "@RENAME@1"}));
-  EXPECT_EQ(sum_op->Output("Out"),
-            std::vector<std::string>({f::GradVarName("out1")}));
-
-  f::OpDesc *grad_op1 = block->AllOps()[7];
-  ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b1")}));
-
-  EXPECT_EQ(var_to_grad.size(), 6UL);
-  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
-  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("out1"),
-            f::GradVarInfo(f::GradVarName("out1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7));
-  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
-}
-
-TEST(Backward, half_backward) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  auto *op1 = block->AppendOp();
-  op1->SetType("minus");
-  op1->SetInput("X", {"a"});
-  op1->SetInput("Y", {"b"});
-  op1->SetOutput("Out", {"out"});
-
-  auto target = f::VarDesc("out");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {"b"});
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-  auto ops = block->AllOps();
-  ASSERT_EQ(3UL, ops.size());
-
-  EXPECT_EQ(var_to_grad.size(), 2UL);
-  EXPECT_EQ(var_to_grad.at("a"),
-            f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1));
-}
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index 3693bc25d81a8309df1a6ddf3d9b08d484596ea9..f537e4b9e569dd4c513ac0efde7240833bcf04b6 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/block_desc.h"
+#include <queue>
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-#include <queue>
-
 namespace paddle {
 namespace framework {
 
@@ -135,6 +134,11 @@ OpDesc *BlockDesc::PrependOp() {
   return ops_.front().get();
 }
 
+void BlockDesc::PrependAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
+  need_update_ = true;
+  ops_.emplace_front(std::move(op_desc));
+}
+
 OpDesc *BlockDesc::InsertOp(size_t index) {
   need_update_ = true;
   auto it = ops_.begin() + index;
@@ -144,17 +148,10 @@ OpDesc *BlockDesc::InsertOp(size_t index) {
 }
 
 void BlockDesc::RemoveOp(size_t s, size_t e) {
-  if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
+  if (ops_.begin() + s >= ops_.end() || ops_.begin() + e > ops_.end()) {
     return;
   }
   need_update_ = true;
-  for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
-    auto names = (*it)->InputArgumentNames();
-    for (auto n : names) {
-      // TODO(typhoonzero): delete vars if no other op use it.
-      VLOG(3) << "deleting var " << n;
-    }
-  }
   ops_.erase(ops_.begin() + s, ops_.begin() + e);
 }
 
@@ -172,17 +169,13 @@ void BlockDesc::Flush() {
   }
 
   if (need_update_) {
-    auto &op_field = *this->desc_->mutable_ops();
-    this->ClearPBOps();
-    op_field.Reserve(static_cast<int>(ops_.size()));
+    this->desc_->mutable_ops()->Clear();
     for (auto &op_desc : ops_) {
-      op_field.AddAllocated(op_desc->Proto());
+      this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto());
     }
-    auto &var_field = *this->desc_->mutable_vars();
-    this->ClearPBVars();
-    var_field.Reserve(static_cast<int>(vars_.size()));
+    this->desc_->mutable_vars()->Clear();
     for (auto &var_desc : vars_) {
-      var_field.AddAllocated(var_desc.second->Proto());
+      this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto());
     }
     need_update_ = false;
   }
@@ -203,7 +196,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
     vars_[var_desc.name()].reset(new VarDesc(var_desc));
   }
   for (const proto::OpDesc &op_desc : desc_->ops()) {
-    ops_.emplace_back(new OpDesc(op_desc, prog, this));
+    ops_.emplace_back(new OpDesc(op_desc, this));
   }
 }
 
@@ -212,7 +205,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
     : prog_(prog), desc_(desc) {
   need_update_ = true;
   for (auto &op : other.ops_) {
-    ops_.emplace_back(new OpDesc(*op->Proto(), prog, this));
+    ops_.emplace_back(new OpDesc(*op, this));
   }
   for (auto &it : other.vars_) {
     auto *var = new VarDesc(*it.second);
@@ -220,22 +213,6 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
   }
 }
 
-void BlockDesc::ClearPBOps() {
-  auto ops = this->desc_->mutable_ops();
-  while (!ops->empty()) {
-    // we do not own the OpDesc, so release the ownership.
-    ops->ReleaseLast();
-  }
-}
-
-void BlockDesc::ClearPBVars() {
-  auto vars = this->desc_->mutable_vars();
-  while (!vars->empty()) {
-    // we do not own the VarDesc, so release the ownership.
-    vars->ReleaseLast();
-  }
-}
-
 void BlockDesc::SetForwardBlockID(int32_t forward_block_id) {
   PADDLE_ENFORCE(!desc_->has_forward_block_idx(),
                  "Parent block ID has been set to %d. Cannot set to %d",
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 185f018ac1b5863e0ee86fdaa17df1ccbc6e030e..ce48548418478cc5c9f9ca1244df9e66dca884e6 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <deque>
 #include <memory>
 #include <set>
+#include <string>
 #include <unordered_map>
 #include <vector>
 
@@ -40,11 +41,6 @@ class BlockDesc {
 
   BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
 
-  ~BlockDesc() {
-    this->ClearPBVars();
-    this->ClearPBOps();
-  }
-
   int32_t ID() const { return desc_->idx(); }
 
   int32_t Parent() const { return desc_->parent_idx(); }
@@ -87,15 +83,24 @@ class BlockDesc {
 
   OpDesc *PrependOp();
 
+  void PrependAllocatedOp(std::unique_ptr<OpDesc> &&op_desc);
+
   OpDesc *InsertOp(size_t index);
 
+  /*
+   * Remove Op and its input/output variables.
+   * Note that for either input or output variable, if it is also an input or
+   * output variable of other ops, we should remain it.
+   */
   void RemoveOp(size_t s, size_t e);
 
+  void RemoveVar(const std::string &name) { vars_.erase(name); }
+
   std::vector<OpDesc *> AllOps() const;
 
   size_t OpSize() const { return ops_.size(); }
 
-  OpDesc *Op(int idx) { return ops_.at(idx).get(); }
+  OpDesc *Op(int idx) const { return ops_.at(idx).get(); }
 
   void Flush();
 
@@ -103,10 +108,6 @@ class BlockDesc {
 
   ProgramDesc *Program() const { return this->prog_; }
 
- private:
-  void ClearPBOps();
-  void ClearPBVars();
-
  private:
   ProgramDesc *prog_;       // not_own
   proto::BlockDesc *desc_;  // not_own
diff --git a/paddle/fluid/framework/blocking_queue.h b/paddle/fluid/framework/blocking_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..a19558c0ae59005bee575e8c469c7f95d8780ab1
--- /dev/null
+++ b/paddle/fluid/framework/blocking_queue.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>  // NOLINT
+#include <deque>
+#include <mutex>  // NOLINT
+#include <utility>
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+class BlockingQueue {
+ public:
+  void Push(const T &item) {
+    {
+      std::lock_guard<std::mutex> g(mutex_);
+      q_.emplace_back(item);
+    }
+    cv_.notify_one();
+  }
+
+  template <typename U>
+  void Extend(const U &items) {
+    {
+      std::lock_guard<std::mutex> g(mutex_);
+      for (auto &item : items) {
+        q_.emplace_back(item);
+      }
+    }
+    cv_.notify_all();
+  }
+
+  std::deque<T> PopAll(size_t ms, bool *timeout) {
+    auto time =
+        std::chrono::system_clock::now() + std::chrono::milliseconds(ms);
+    std::unique_lock<std::mutex> lock(mutex_);
+    *timeout = !cv_.wait_until(lock, time, [this] { return !q_.empty(); });
+    std::deque<T> ret;
+    if (!*timeout) {
+      std::swap(ret, q_);
+    }
+    return ret;
+  }
+
+  T Pop() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cv_.wait(lock, [=] { return !q_.empty(); });
+    T rc(std::move(q_.front()));
+    q_.pop_front();
+    return rc;
+  }
+
+ private:
+  std::mutex mutex_;
+  std::condition_variable cv_;
+  std::deque<T> q_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
index 9f8fb12098d622058a86f83c1c42a1feb1cfb2e2..722bf8e8ecba0c9cbc5e3ad737dbf73148d2873c 100644
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
@@ -14,24 +14,44 @@ limitations under the License. */
 
 #pragma once
 
-#include <stddef.h>  // for size_t
+#include <stddef.h>            // for size_t
+#include <condition_variable>  // NOLINT
 #include <typeindex>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
+enum class ChannelAction {
+  SEND = 0,
+  RECEIVE = 1,
+  CLOSE = 2,
+};
+
 // Channel is the abstract class of buffered and un-buffered channels.
 template <typename T>
 class Channel {
  public:
-  virtual bool Send(T*) = 0;
+  virtual bool CanSend() = 0;
+  virtual bool CanReceive() = 0;
+  virtual void Send(T*) = 0;
   virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
   virtual void Lock() = 0;
+
   virtual void Unlock() = 0;
+  virtual bool IsClosed() = 0;
   virtual void Close() = 0;
   virtual ~Channel() {}
+
+  virtual void AddToSendQ(const void* referrer, T* data,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb) = 0;
+  virtual void AddToReceiveQ(const void* referrer, T* data,
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb) = 0;
+  virtual void RemoveFromSendQ(const void* referrer) = 0;
+  virtual void RemoveFromReceiveQ(const void* referrer) = 0;
 };
 
 // Forward declaration of channel implementations.
@@ -64,43 +84,113 @@ class ChannelHolder {
   }
 
   template <typename T>
-  bool Send(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+  void Send(T* data) {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
     // Static cast should be safe because we have ensured that types are same
     Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    return channel != nullptr ? channel->Send(data) : false;
+    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+    channel->Send(data);
   }
 
   template <typename T>
   bool Receive(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
     Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    return channel != nullptr ? channel->Receive(data) : false;
+    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+    return channel->Receive(data);
+  }
+
+  bool IsClosed() {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->IsClosed();
+  }
+
+  bool CanSend() {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->CanSend();
+  }
+
+  bool CanReceive() {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->CanReceive();
   }
 
   void close() {
-    if (IsInitialized()) holder_->Close();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Close();
   }
 
   size_t Cap() {
-    if (IsInitialized()) return holder_->Cap();
-    return -1;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->Cap();
   }
 
   void Lock() {
-    if (IsInitialized()) holder_->Lock();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Lock();
   }
 
   void Unlock() {
-    if (IsInitialized()) holder_->Unlock();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Unlock();
+  }
+
+  template <typename T>
+  void AddToSendQ(const void* referrer, T* data,
+                  std::shared_ptr<std::condition_variable_any> cond,
+                  std::function<bool(ChannelAction)> cb) {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+    if (channel != nullptr) {
+      channel->AddToSendQ(referrer, data, cond, cb);
+    }
+  }
+
+  template <typename T>
+  void AddToReceiveQ(const void* referrer, T* data,
+                     std::shared_ptr<std::condition_variable_any> cond,
+                     std::function<bool(ChannelAction)> cb) {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+    if (channel != nullptr) {
+      channel->AddToReceiveQ(referrer, data, cond, cb);
+    }
+  }
+
+  void RemoveFromSendQ(const void* referrer) {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->RemoveFromSendQ(referrer);
+  }
+
+  void RemoveFromReceiveQ(const void* referrer) {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->RemoveFromReceiveQ(referrer);
   }
 
   inline bool IsInitialized() const { return holder_ != nullptr; }
 
   inline const std::type_index Type() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
     return holder_->Type();
   }
 
@@ -113,6 +203,11 @@ class ChannelHolder {
     virtual ~Placeholder() {}
     virtual const std::type_index Type() const = 0;
     virtual void* Ptr() const = 0;
+    virtual bool IsClosed() = 0;
+    virtual bool CanSend() = 0;
+    virtual bool CanReceive() = 0;
+    virtual void RemoveFromSendQ(const void* referrer) = 0;
+    virtual void RemoveFromReceiveQ(const void* referrer) = 0;
     virtual void Close() = 0;
     virtual void Lock() = 0;
     virtual void Unlock() = 0;
@@ -121,7 +216,8 @@ class ChannelHolder {
 
   template <typename T>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(size_t buffer_size) : type_(std::type_index(typeid(T))) {
+    explicit PlaceholderImpl(size_t buffer_size)
+        : type_(std::type_index(typeid(T))) {
       channel_.reset(MakeChannel<T>(buffer_size));
     }
 
@@ -129,6 +225,39 @@ class ChannelHolder {
 
     virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
 
+    virtual bool IsClosed() {
+      if (channel_) {
+        return channel_->IsClosed();
+      }
+      return false;
+    }
+
+    virtual bool CanSend() {
+      if (channel_) {
+        return channel_->CanSend();
+      }
+      return false;
+    }
+
+    virtual bool CanReceive() {
+      if (channel_) {
+        return channel_->CanReceive();
+      }
+      return false;
+    }
+
+    virtual void RemoveFromSendQ(const void* referrer) {
+      if (channel_) {
+        channel_->RemoveFromSendQ(referrer);
+      }
+    }
+
+    virtual void RemoveFromReceiveQ(const void* referrer) {
+      if (channel_) {
+        channel_->RemoveFromReceiveQ(referrer);
+      }
+    }
+
     virtual void Close() {
       if (channel_) channel_->Close();
     }
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
index a4561031fd8c49613269e7008ce558f25f9765e4..26d454534e1ae38c4f83376c0836a45781ea9101 100644
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stddef.h>  // for size_t
 #include <atomic>
-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <deque>
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -29,39 +29,56 @@ class ChannelImpl : public paddle::framework::Channel<T> {
   friend void paddle::framework::CloseChannel<T>(Channel<T> *);
 
  public:
-  virtual bool Send(T *);
+  virtual bool CanSend();
+  virtual bool CanReceive();
+  virtual void Send(T *);
   virtual bool Receive(T *);
   virtual size_t Cap() { return cap_; }
   virtual void Lock();
   virtual void Unlock();
+  virtual bool IsClosed();
   virtual void Close();
-
-  ChannelImpl(size_t);
+  explicit ChannelImpl(size_t);
   virtual ~ChannelImpl();
 
+  virtual void AddToSendQ(const void *referrer, T *data,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(ChannelAction)> cb);
+  virtual void AddToReceiveQ(const void *referrer, T *data,
+                             std::shared_ptr<std::condition_variable_any> cond,
+                             std::function<bool(ChannelAction)> cb);
+
+  virtual void RemoveFromSendQ(const void *referrer);
+  virtual void RemoveFromReceiveQ(const void *referrer);
+
  private:
   struct QueueMessage {
     T *data;
-    std::condition_variable_any cond;
+    std::shared_ptr<std::condition_variable_any> cond;
     bool chan_closed = false;
     bool completed = false;
+    const void *referrer;  // TODO(thuan): figure out better way to do this
+    std::function<bool(ChannelAction)> callback;
+
+    explicit QueueMessage(T *item)
+        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
 
-    QueueMessage(T *item) : data(item) {}
+    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
+        : data(item), cond(cond) {}
 
     void Wait(std::unique_lock<std::recursive_mutex> &lock) {
-      cond.wait(lock, [this]() { return completed; });
+      cond->wait(lock, [this]() { return completed; });
     }
 
     void Notify() {
       completed = true;
-      cond.notify_all();
+      cond->notify_all();
     }
   };
 
-  bool send_return(bool value) {
+  void send_return() {
     send_ctr--;
     destructor_cond_.notify_all();
-    return value;
   }
 
   bool recv_return(bool value) {
@@ -70,6 +87,21 @@ class ChannelImpl : public paddle::framework::Channel<T> {
     return value;
   }
 
+  std::shared_ptr<QueueMessage> get_first_message(
+      std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
+    while (!queue->empty()) {
+      // Check whether this message was added by Select
+      // If this was added by Select then execute the callback
+      // to check if you can execute this message. The callback
+      // can return false if some other case was executed in Select.
+      // In that case just discard this QueueMessage and process next.
+      std::shared_ptr<QueueMessage> m = queue->front();
+      queue->pop_front();
+      if (m->callback == nullptr || m->callback(action)) return m;
+    }
+    return nullptr;
+  }
+
   size_t cap_;
   std::recursive_mutex mu_;
   bool closed_;
@@ -88,28 +120,45 @@ ChannelImpl<T>::ChannelImpl(size_t capacity)
 }
 
 template <typename T>
-bool ChannelImpl<T>::Send(T *item) {
+bool ChannelImpl<T>::CanSend() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return !closed_ && (!recvq.empty() || buf_.size() < cap_);
+}
+
+template <typename T>
+bool ChannelImpl<T>::CanReceive() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
+}
+
+template <typename T>
+void ChannelImpl<T>::Send(T *item) {
   send_ctr++;
   std::unique_lock<std::recursive_mutex> lock{mu_};
 
-  // If channel is closed, do nothing
+  // If channel is closed, throw exception
   if (closed_) {
+    send_return();
     lock.unlock();
-    // TODO(abhinavarora) Should panic on closed channel
-    return send_return(false);
+    PADDLE_THROW("Cannot send on closed channel");
   }
 
   // If there is a receiver, directly pass the value we want
   // to send to the receiver, bypassing the channel buffer if any
   if (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m = recvq.front();
-    recvq.pop_front();
-    // Do the data transfer
-    *(m->data) = std::move(*item);
-    // Wake up the blocked process and unlock
-    m->Notify();
-    lock.unlock();
-    return send_return(true);
+    std::shared_ptr<QueueMessage> m =
+        get_first_message(&recvq, ChannelAction::SEND);
+
+    if (m != nullptr) {
+      *(m->data) = std::move(*item);
+      m->Notify();
+      send_return();
+      return;
+    } else {
+      Send(item);
+      send_return();
+      return;
+    }
   }
 
   // Unbuffered channel will always bypass this
@@ -118,9 +167,8 @@ bool ChannelImpl<T>::Send(T *item) {
   if (buf_.size() < cap_) {
     // Copy to buffer
     buf_.push_back(std::move(*item));
-    // Release lock and return true
-    lock.unlock();
-    return send_return(true);
+    send_return();
+    return;
   }
 
   // Block on channel, because some receiver will complete
@@ -128,8 +176,12 @@ bool ChannelImpl<T>::Send(T *item) {
   auto m = std::make_shared<QueueMessage>(item);
   sendq.push_back(m);
   m->Wait(lock);
-  // TODO(abhinavarora) Should panic on closed channel
-  return send_return(!m->chan_closed);
+  if (m->chan_closed) {
+    send_return();
+    lock.unlock();
+    PADDLE_THROW("Cannot send on closed channel");
+  }
+  send_return();
 }
 
 template <typename T>
@@ -139,21 +191,38 @@ bool ChannelImpl<T>::Receive(T *item) {
 
   // If channel is closed and buffer is empty or
   // channel is unbuffered
-  if (closed_ && buf_.empty()) {
-    lock.unlock();
-    return recv_return(false);
-  }
+  if (closed_ && buf_.empty()) return recv_return(false);
 
   // If there is a sender, directly receive the value we want
-  // from the sender, bypassing the channel buffer if any
+  // from the sender. In case of a buffered channel, read from
+  // buffer and move front of send queue to the buffer
   if (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m = sendq.front();
-    sendq.pop_front();
-    // Do the data transfer
-    *item = std::move(*(m->data));
-    // Wake up the blocked process and unlock
-    m->Notify();
-    lock.unlock();
+    std::shared_ptr<QueueMessage> m =
+        get_first_message(&sendq, ChannelAction::RECEIVE);
+    if (buf_.size() > 0) {
+      // Case 1 : Channel is Buffered
+      // Do Data transfer from front of buffer
+      // and add a QueueMessage to the buffer
+      *item = std::move(buf_.front());
+      buf_.pop_front();
+      // If first message from sendq is not null
+      // add it to the buffer and notify it
+      if (m != nullptr) {
+        // Copy to buffer
+        buf_.push_back(std::move(*(m->data)));
+        m->Notify();
+      }  // Ignore if there is no first message
+    } else {
+      // Case 2: Channel is Unbuffered
+      // Do data transfer from front of SendQ
+      // If front is nullptr, then recursively call itself
+      if (m != nullptr) {
+        *item = std::move(*(m->data));
+        m->Notify();
+      } else {
+        return recv_return(Receive(item));
+      }
+    }
     return recv_return(true);
   }
 
@@ -162,8 +231,7 @@ bool ChannelImpl<T>::Receive(T *item) {
     // Directly read from buffer
     *item = std::move(buf_.front());
     buf_.pop_front();
-    // Release lock and return true
-    lock.unlock();
+    // return true
     return recv_return(true);
   }
 
@@ -186,6 +254,12 @@ void ChannelImpl<T>::Unlock() {
   mu_.unlock();
 }
 
+template <typename T>
+bool ChannelImpl<T>::IsClosed() {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  return closed_;
+}
+
 template <typename T>
 void ChannelImpl<T>::Close() {
   std::unique_lock<std::recursive_mutex> lock{mu_};
@@ -203,6 +277,12 @@ void ChannelImpl<T>::Close() {
     std::shared_ptr<QueueMessage> m = recvq.front();
     recvq.pop_front();
     m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
     m->Notify();
   }
 
@@ -211,10 +291,70 @@ void ChannelImpl<T>::Close() {
     std::shared_ptr<QueueMessage> m = sendq.front();
     sendq.pop_front();
     m->chan_closed = true;
+
+    // Execute callback function (if any)
+    if (m->callback != nullptr) {
+      m->callback(ChannelAction::CLOSE);
+    }
+
     m->Notify();
   }
 }
 
+template <typename T>
+void ChannelImpl<T>::AddToSendQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  auto m = std::make_shared<QueueMessage>(data, cond);
+  m->referrer = referrer;
+  m->callback = cb;
+  sendq.push_back(m);
+}
+
+template <typename T>
+void ChannelImpl<T>::AddToReceiveQ(
+    const void *referrer, T *data,
+    std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(ChannelAction)> cb) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+  auto m = std::make_shared<QueueMessage>(data, cond);
+  m->referrer = referrer;
+  m->callback = cb;
+  recvq.push_back(m);
+}
+
+template <typename T>
+void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+
+  for (auto it = sendq.begin(); it != sendq.end();) {
+    std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
+
+    if (sendMsg->referrer == referrer) {
+      it = sendq.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+template <typename T>
+void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
+  std::lock_guard<std::recursive_mutex> lock{mu_};
+
+  for (auto it = recvq.begin(); it != recvq.end();) {
+    std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
+
+    if (recvMsg->referrer == referrer) {
+      it = recvq.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
 template <typename T>
 ChannelImpl<T>::~ChannelImpl() {
   Close();
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index edfb41c72489113d9803c2957baed1ce44f8296d..542d791f6bbdf7d68a4786998ccc0233fff6473d 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -14,9 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/channel.h"
 
-#include <chrono>
-#include <thread>
-
+#include <chrono>  // NOLINT
+#include <thread>  // NOLINT
 #include "gtest/gtest.h"
 
 using paddle::framework::Channel;
@@ -37,23 +36,25 @@ TEST(Channel, ChannelCapacityTest) {
   delete ch;
 }
 
-void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
+void RecevingOrderEqualToSendingOrder(Channel<int> *ch, int num_items) {
   unsigned sum_send = 0;
   std::thread t([&]() {
-    for (int i = 0; i < 5; i++) {
-      EXPECT_EQ(ch->Send(&i), true);
+    for (int i = 0; i < num_items; i++) {
+      ch->Send(&i);
       sum_send += i;
     }
   });
-  for (int i = 0; i < 5; i++) {
-    int recv = 999;
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));
+  for (int i = 0; i < num_items; i++) {
+    int recv = -1;
     EXPECT_EQ(ch->Receive(&recv), true);
     EXPECT_EQ(recv, i);
   }
   std::this_thread::sleep_for(std::chrono::milliseconds(200));
   CloseChannel(ch);
   t.join();
-  EXPECT_EQ(sum_send, 10U);
+  unsigned expected_sum = (num_items * (num_items - 1)) / 2;
+  EXPECT_EQ(sum_send, expected_sum);
   delete ch;
 }
 
@@ -61,7 +62,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
   const size_t buffer_size = 10;
   auto ch = MakeChannel<size_t>(buffer_size);
   for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Send(&i), true);  // should not block
+    ch->Send(&i);
   }
 
   size_t out;
@@ -82,7 +83,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
   const size_t data = 5;
   std::thread send_thread{[&]() {
     size_t i = data;
-    EXPECT_EQ(ch->Send(&i), true);  // should not block
+    ch->Send(&i);  // should not block
   }};
 
   std::thread recv_thread{[&]() {
@@ -94,12 +95,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
   send_thread.join();
   recv_thread.join();
 
-  // After closing send should return false. Receive should
-  // also return false as there is no data in queue.
+  // After closing send should panic. Receive should
+  // also  false as there is no data in queue.
   CloseChannel(ch);
   send_thread = std::thread{[&]() {
     size_t i = data;
-    EXPECT_EQ(ch->Send(&i), false);  // should return false
+    bool is_exception = false;
+    try {
+      ch->Send(&i);
+    } catch (paddle::platform::EnforceNotMet e) {
+      is_exception = true;
+    }
+    EXPECT_EQ(is_exception, true);
   }};
   recv_thread = std::thread{[&]() {
     size_t i;
@@ -129,7 +136,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
   auto ch = MakeChannel<size_t>(buffer_size);
 
   for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Send(&i), true);  // sending should not block
+    ch->Send(&i);  // sending should not block
   }
 
   size_t out;
@@ -159,10 +166,17 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
   std::thread t([&]() {
     // Try to write more than buffer size.
     for (size_t i = 0; i < 2 * buffer_size; ++i) {
-      if (i < buffer_size)
-        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
-      else
-        EXPECT_EQ(ch->Send(&i), false);
+      if (i < buffer_size) {
+        ch->Send(&i);  // should block after 10 iterations
+      } else {
+        bool is_exception = false;
+        try {
+          ch->Send(&i);
+        } catch (paddle::platform::EnforceNotMet e) {
+          is_exception = true;
+        }
+        EXPECT_EQ(is_exception, true);
+      }
     }
   });
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
@@ -173,21 +187,37 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
 
 TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
   auto ch = MakeChannel<int>(0);
-  RecevingOrderEqualToSendingOrder(ch);
+  RecevingOrderEqualToSendingOrder(ch, 20);
 }
 
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) {
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) {
+  // Test that Receive Order is same as Send Order when number of items
+  // sent is less than size of buffer
   auto ch = MakeChannel<int>(10);
-  RecevingOrderEqualToSendingOrder(ch);
+  RecevingOrderEqualToSendingOrder(ch, 5);
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) {
+  // Test that Receive Order is same as Send Order when number of items
+  // sent is equal to size of buffer
+  auto ch = MakeChannel<int>(10);
+  RecevingOrderEqualToSendingOrder(ch, 10);
+}
+
+TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
+  // Test that Receive Order is same as Send Order when number of items
+  // sent is greater than the size of buffer
+  auto ch = MakeChannel<int>(10);
+  RecevingOrderEqualToSendingOrder(ch, 20);
 }
 
 void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
 
   // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -200,7 +230,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
 
   // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], false);
   }
 
@@ -211,27 +241,33 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];
 
   // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     send_success[i] = false;
     t[i] = std::thread(
         [&](bool *ended, bool *success) {
           int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
           *ended = true;
         },
         &thread_ended[i], &send_success[i]);
@@ -241,13 +277,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
   if (isBuffered) {
     // If ch is Buffered, atleast 4 threads must be blocked.
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (!thread_ended[i]) ct++;
     }
     EXPECT_GE(ct, 4);
   } else {
     // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       EXPECT_EQ(thread_ended[i], false);
     }
   }
@@ -258,21 +294,21 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
   if (isBuffered) {
     // Verify that only 1 send was successful
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (send_success[i]) ct++;
     }
     // Only 1 send must be successful
     EXPECT_EQ(ct, 1);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 // This tests that closing a buffered channel also unblocks
@@ -316,8 +352,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
     // Try to send more number of times
     // than receivers
     for (int i = 0; i < 4; i++) {
-      ch->Send(&i);
-      sum_send += i;
+      try {
+        ch->Send(&i);
+        sum_send += i;
+      } catch (paddle::platform::EnforceNotMet e) {
+      }
     }
   });
   for (int i = 0; i < 3; i++) {
@@ -370,19 +409,25 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
 // This tests that destroying a channel unblocks
 //  any senders waiting for channel to have write space
 void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];
 
   // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     send_success[i] = false;
     t[i] = std::thread(
         [&](bool *ended, bool *success) {
           int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
           *ended = true;
         },
         &thread_ended[i], &send_success[i]);
@@ -393,14 +438,14 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
   if (isBuffered) {
     // If channel is buffered, verify that atleast 4 threads are blocked
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (thread_ended[i] == false) ct++;
     }
     // Atleast 4 threads must be blocked
     EXPECT_GE(ct, 4);
   } else {
     // Verify that all the threads are blocked
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       EXPECT_EQ(thread_ended[i], false);
     }
   }
@@ -409,13 +454,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
   // Count number of successful sends
   int ct = 0;
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     if (send_success[i]) ct++;
   }
 
@@ -428,18 +473,18 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
   }
 
   // Join all threads
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 // This tests that destroying a channel also unblocks
 //  any receivers waiting on the channel
 void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
 
   // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -453,18 +498,18 @@ void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
 
   // Verify that all threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], false);
   }
   // delete the channel
   delete ch;
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
@@ -508,7 +553,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) {
   unsigned sum_send = 0;
   std::thread t([&]() {
     for (int i = 0; i < 5; i++) {
-      EXPECT_EQ(ch->Send(&i), true);
+      ch->Send(&i);
       sum_send += i;
     }
   });
@@ -541,8 +586,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) {
   ChannelHolder *ch = new ChannelHolder();
   EXPECT_EQ(ch->IsInitialized(), false);
   int i = 10;
-  EXPECT_EQ(ch->Send(&i), false);
-  EXPECT_EQ(ch->Receive(&i), false);
+  bool send_exception = false;
+  try {
+    ch->Send(&i);
+  } catch (paddle::platform::EnforceNotMet e) {
+    send_exception = true;
+  }
+  EXPECT_EQ(send_exception, true);
+
+  bool recv_exception = false;
+  try {
+    ch->Receive(&i);
+  } catch (paddle::platform::EnforceNotMet e) {
+    recv_exception = true;
+  }
+  EXPECT_EQ(recv_exception, true);
+
   bool is_exception = false;
   try {
     ch->Type();
@@ -620,12 +679,12 @@ TEST(ChannelHolder, TypeMismatchReceiveTest) {
 }
 
 void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
 
   // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -638,7 +697,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
 
   // Verify that all the threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], false);
   }
 
@@ -649,27 +708,33 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];
 
   // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     send_success[i] = false;
     t[i] = std::thread(
         [&](bool *ended, bool *success) {
           int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
           *ended = true;
         },
         &thread_ended[i], &send_success[i]);
@@ -679,13 +744,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
   if (isBuffered) {
     // If ch is Buffered, atleast 4 threads must be blocked.
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (!thread_ended[i]) ct++;
     }
     EXPECT_GE(ct, 4);
   } else {
     // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       EXPECT_EQ(thread_ended[i], false);
     }
   }
@@ -696,21 +761,21 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
   if (isBuffered) {
     // Verify that only 1 send was successful
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (send_success[i]) ct++;
     }
     // Only 1 send must be successful
     EXPECT_EQ(ct, 1);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 // This tests that closing a channelholder unblocks
@@ -748,19 +813,25 @@ TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
 // This tests that destroying a channelholder unblocks
 //  any senders waiting for channel
 void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
-  bool send_success[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+  bool send_success[kNumThreads];
 
   // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     send_success[i] = false;
     t[i] = std::thread(
         [&](bool *ended, bool *success) {
           int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
           *ended = true;
         },
         &thread_ended[i], &send_success[i]);
@@ -770,14 +841,14 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
   if (isBuffered) {
     // If channel is buffered, verify that atleast 4 threads are blocked
     int ct = 0;
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       if (thread_ended[i] == false) ct++;
     }
     // Atleast 4 threads must be blocked
     EXPECT_GE(ct, 4);
   } else {
     // Verify that all the threads are blocked
-    for (size_t i = 0; i < num_threads; i++) {
+    for (size_t i = 0; i < kNumThreads; i++) {
       EXPECT_EQ(thread_ended[i], false);
     }
   }
@@ -786,13 +857,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
   // Count number of successfuld sends
   int ct = 0;
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     if (send_success[i]) ct++;
   }
 
@@ -805,18 +876,18 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
   }
 
   // Join all threads
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 // This tests that destroying a channelholder also unblocks
 //  any receivers waiting on the channel
 void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
-  size_t num_threads = 5;
-  std::thread t[num_threads];
-  bool thread_ended[num_threads];
+  const size_t kNumThreads = 5;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
 
   // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     thread_ended[i] = false;
     t[i] = std::thread(
         [&](bool *p) {
@@ -830,18 +901,18 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
 
   // Verify that all threads are blocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], false);
   }
   // delete the channel
   delete ch;
   std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
   // Verify that all threads got unblocked
-  for (size_t i = 0; i < num_threads; i++) {
+  for (size_t i = 0; i < kNumThreads; i++) {
     EXPECT_EQ(thread_ended[i], true);
   }
 
-  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
 }
 
 TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
@@ -871,3 +942,67 @@ TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
   ch->Reset<int>(0);
   ChannelHolderDestroyUnblockSenders(ch, false);
 }
+
+// This tests that closing a channelholder many times.
+void ChannelHolderManyTimesClose(ChannelHolder *ch) {
+  const int kNumThreads = 15;
+  std::thread t[kNumThreads];
+  bool thread_ended[kNumThreads];
+
+  // Launches threads that try to send data to channel.
+  for (size_t i = 0; i < kNumThreads / 3; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended) {
+          int data = 10;
+          ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i]);
+  }
+
+  // Launches threads that try to receive data to channel.
+  for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          if (ch->Receive(&data)) {
+            EXPECT_EQ(data, 10);
+          }
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+
+  // Launches threads that try to close the channel.
+  for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          if (!ch->IsClosed()) {
+            ch->close();
+          }
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are unblocked
+  for (size_t i = 0; i < kNumThreads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+  EXPECT_TRUE(ch->IsClosed());
+  // delete the channel
+  delete ch;
+  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
+}
+
+TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
+  // Check for Buffered Channel
+  ChannelHolder *ch = new ChannelHolder();
+  ch->Reset<int>(10);
+  ChannelHolderManyTimesClose(ch);
+}
diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc
index 5770b0a5a18659e615e80a7c48113d8b543b69ec..bbf67f5ba92150f70cf45d49e3f4ca0a16393541 100644
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thread>
+#include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
 
 USE_NO_KERNEL_OP(go);
 USE_NO_KERNEL_OP(channel_close);
@@ -27,6 +26,12 @@ USE_NO_KERNEL_OP(channel_create);
 USE_NO_KERNEL_OP(channel_recv);
 USE_NO_KERNEL_OP(channel_send);
 USE_NO_KERNEL_OP(elementwise_add);
+USE_NO_KERNEL_OP(select);
+USE_NO_KERNEL_OP(conditional_block);
+USE_NO_KERNEL_OP(equal);
+USE_NO_KERNEL_OP(assign);
+USE_NO_KERNEL_OP(while);
+USE_NO_KERNEL_OP(print);
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
@@ -35,27 +40,15 @@ namespace paddle {
 namespace framework {
 
 template <typename T>
-void CreateIntVariable(Scope &scope, p::CPUPlace &place, std::string name,
-                       T value) {
-  // Create LoDTensor<int> of dim [1,1]
-  auto var = scope.Var(name);
+LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place,
+                          std::string name, T value) {
+  // Create LoDTensor<int> of dim [1]
+  auto var = scope->Var(name);
   auto tensor = var->GetMutable<LoDTensor>();
-  tensor->Resize({1, 1});
+  tensor->Resize({1});
   T *expect = tensor->mutable_data<T>(place);
   expect[0] = value;
-}
-
-void InitTensorsInScope(Scope &scope, p::CPUPlace &place) {
-  p::CPUDeviceContext ctx(place);
-
-  // Create channel variable
-  scope.Var("Channel");
-
-  // Create Variables, x0 will be put into channel,
-  // result will be pulled from channel
-  CreateIntVariable(scope, place, "Status", false);
-  CreateIntVariable(scope, place, "x0", 99);
-  CreateIntVariable(scope, place, "result", 0);
+  return tensor;
 }
 
 void AddOp(const std::string &type, const VariableNameMap &inputs,
@@ -73,12 +66,117 @@ void AddOp(const std::string &type, const VariableNameMap &inputs,
   op->SetAttrMap(attrs);
 }
 
+void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place,
+             BlockDesc *casesBlock, int caseId, int caseType,
+             std::string caseChannel, std::string caseVarName,
+             std::function<void(BlockDesc *, Scope *)> func) {
+  std::string caseCondName = std::string("caseCond") + std::to_string(caseId);
+  std::string caseCondXVarName =
+      std::string("caseCondX") + std::to_string(caseId);
+
+  BlockDesc *caseBlock = program->AppendBlock(*casesBlock);
+  func(caseBlock, scope);
+
+  CreateVariable(scope, *place, caseCondName, false);
+  CreateVariable(scope, *place, caseCondXVarName, caseId);
+  CreateVariable(scope, *place, caseVarName, caseId);
+
+  scope->Var("step_scope");
+
+  AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}},
+        {{"Out", {caseCondName}}}, {}, casesBlock);
+
+  AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}},
+        {{"Out", {}}, {"Scope", {"step_scope"}}},
+        {{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock);
+}
+
+void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
+                        BlockDesc *parentBlock, std::string dataChanName,
+                        std::string quitChanName) {
+  BlockDesc *whileBlock = program->AppendBlock(*parentBlock);
+
+  CreateVariable(scope, *place, "whileExitCond", true);
+  CreateVariable(scope, *place, "caseToExecute", -1);
+  CreateVariable(scope, *place, "case1var", 0);
+
+  CreateVariable(scope, *place, "xtemp", 0);
+
+  // TODO(thuan): Need to create fibXToSend, since channel send moves the actual
+  // data,
+  // which causes the data to be no longer accessible to do the fib calculation
+  // TODO(abhinav): Change channel send to do a copy instead of a move!
+  CreateVariable(scope, *place, "fibXToSend", 0);
+
+  CreateVariable(scope, *place, "fibX", 0);
+  CreateVariable(scope, *place, "fibY", 1);
+  CreateVariable(scope, *place, "quitVar", 0);
+
+  BlockDesc *casesBlock = program->AppendBlock(*whileBlock);
+  std::function<void(BlockDesc * caseBlock)> f = [](BlockDesc *caseBlock) {};
+
+  // TODO(thuan): Remove this once we change channel send to do a copy instead
+  // of move
+  AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock);
+
+  // Case 0: Send to dataChanName
+  std::function<void(BlockDesc * caseBlock, Scope * scope)> case0Func = [&](
+      BlockDesc *caseBlock, Scope *scope) {
+    AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock);
+    AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock);
+    AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}},
+          {{"Out", {"fibY"}}}, {}, caseBlock);
+  };
+  AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend",
+          case0Func);
+  std::string case0Config =
+      std::string("0,1,") + dataChanName + std::string(",fibXToSend");
+
+  // Case 1: Receive from quitChanName
+  std::function<void(BlockDesc * caseBlock, Scope * scope)> case2Func = [&](
+      BlockDesc *caseBlock, Scope *scope) {
+    // Exit the while loop after we receive from quit channel.
+    // We assign a false to "whileExitCond" variable, which will
+    // break out of while_op loop
+    CreateVariable(scope, *place, "whileFalse", false);
+    AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {},
+          caseBlock);
+  };
+  AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar",
+          case2Func);
+  std::string case1Config =
+      std::string("1,2,") + quitChanName + std::string(",quitVar");
+
+  // Select block
+  AddOp("select", {{"X", {dataChanName, quitChanName}},
+                   {"case_to_execute", {"caseToExecute"}}},
+        {{"Out", {}}},
+        {{"sub_block", casesBlock},
+         {"cases", std::vector<std::string>{case0Config, case1Config}}},
+        whileBlock);
+
+  scope->Var("stepScopes");
+  AddOp("while",
+        {{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}},
+        {{"Out", {}}, {"StepScopes", {"stepScopes"}}},
+        {{"sub_block", whileBlock}}, parentBlock);
+}
+
 TEST(Concurrency, Go_Op) {
   Scope scope;
   p::CPUPlace place;
 
   // Initialize scope variables
-  InitTensorsInScope(scope, place);
+  p::CPUDeviceContext ctx(place);
+
+  // Create channel variable
+  scope.Var("Channel");
+
+  // Create Variables, x0 will be put into channel,
+  // result will be pulled from channel
+  CreateVariable(&scope, place, "Status", false);
+  CreateVariable(&scope, place, "x0", 99);
+  CreateVariable(&scope, place, "result", 0);
 
   framework::Executor executor(place);
   ProgramDesc program;
@@ -112,11 +210,83 @@ TEST(Concurrency, Go_Op) {
 
   executor.Run(program, &scope, 0, true, true);
 
-  // After we call executor.run, the Go operator should do a channel_send to set
-  // the
-  // "result" variable to 99
+  // After we call executor.run, the Go operator should do a channel_send to
+  // set the "result" variable to 99.
   auto *finalData = tensor.data<int>();
   EXPECT_EQ(finalData[0], 99);
 }
+
+/**
+ * This test implements the fibonacci function using go_op and select_op
+ */
+TEST(Concurrency, Select) {
+  Scope scope;
+  p::CPUPlace place;
+
+  // Initialize scope variables
+  p::CPUDeviceContext ctx(place);
+
+  CreateVariable(&scope, place, "Status", false);
+  CreateVariable(&scope, place, "result", 0);
+  CreateVariable(&scope, place, "currentXFib", 0);
+
+  framework::Executor executor(place);
+  ProgramDesc program;
+  BlockDesc *block = program.MutableBlock(0);
+
+  // Create channel OP
+  std::string dataChanName = "Channel";
+  scope.Var(dataChanName);
+  AddOp("channel_create", {}, {{"Out", {dataChanName}}},
+        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
+
+  std::string quitChanName = "Quit";
+  scope.Var(quitChanName);
+  AddOp("channel_create", {}, {{"Out", {quitChanName}}},
+        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
+
+  // Create Go Op routine, which loops 10 times over fibonacci sequence
+  CreateVariable(&scope, place, "xReceiveVar", 0);
+
+  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
+  for (int i = 0; i < 10; ++i) {
+    AddOp("channel_recv", {{"Channel", {dataChanName}}},
+          {{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock);
+    AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}},
+          {{"first_n", 100},
+           {"summarize", -1},
+           {"print_tensor_name", false},
+           {"print_tensor_type", true},
+           {"print_tensor_shape", false},
+           {"print_tensor_lod", false},
+           {"print_phase", std::string("FORWARD")},
+           {"message", std::string("X: ")}},
+          goOpBlock);
+  }
+
+  CreateVariable(&scope, place, "quitSignal", 0);
+  AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}},
+        {{"Status", {"Status"}}}, {}, goOpBlock);
+
+  // Create Go Op
+  AddOp("go", {{"X", {dataChanName, quitChanName}}}, {},
+        {{"sub_block", goOpBlock}}, block);
+
+  AddFibonacciSelect(&scope, &place, &program, block, dataChanName,
+                     quitChanName);
+
+  // Create Channel Close Op
+  AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block);
+  AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block);
+
+  executor.Run(program, &scope, 0, true, true);
+
+  // After we call executor.run, "result" variable should be equal to 34
+  // (which is 10 loops through fibonacci sequence)
+  const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get<LoDTensor>();
+  auto *finalData = tensor.data<int>();
+  EXPECT_EQ(finalData[0], 34);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc
index 85dbb39e6fba735471446b5e5e71a612282c498a..6bcfc6cd55f02f0d4f0f6e3170e7cc19ce666a28 100644
--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -16,29 +16,25 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-static const platform::DeviceContext* GetDeviceContext(
-    const platform::Place& src_place, const platform::Place& dst_place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-
-  if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    return pool.Get(src_place);
-  } else if (platform::is_cpu_place(src_place) &&
-             platform::is_gpu_place(dst_place)) {
-    return pool.Get(dst_place);
-  } else {
-    PADDLE_THROW(
-        "Currently, model parallelism is only supported between CPU and CUDA");
-  }
-}
-
-void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
-                     Tensor* out) {
+void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
+                     Tensor *out) {
   VLOG(3) << "DeviceTransform in, src_place " << in.place()
           << " dst_place: " << dst_place;
-  auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
-  dev_ctx->Wait();
-  TensorCopy(in, dst_place, *dev_ctx, out);
-  dev_ctx->Wait();
+
+  PADDLE_ENFORCE_NE(
+      in.place().which(), dst_place.which(),
+      "Currently, model parallelism is only supported between CPU and CUDA");
+
+  // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and
+  // the enforced checkings have been done in GetDeviceContext, so the
+  // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program
+  // slow, especially when the number of elements is little, for example,
+  // the elements of learning rate are one and it's CPU side.
+  // One solution is to use a CUDA kernel to complete the copy operation when
+  // the transforming is from CPU to GPU and the number of elements is little.
+  // But the embarrassment is that this solution this solution makes training
+  // slower.
+  TensorCopySync(in, dst_place, out);
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index e896a06162527ed0289767901f4b4a33fcd2875f..a91fe5c99d397ef1bf04f6d22e988b6d3f33e500 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -32,8 +32,7 @@ struct AddFunctor {
 
 class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input1 of test op");
     AddOutput("output", "output of test op");
     AddAttr<bool>("use_gpu", "force to use gpu kernel").SetDefault(false);
@@ -103,9 +102,7 @@ static void BuildVar(const std::string& param_name,
 }
 
 TEST(Operator, CPUtoGPU) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  InitDevices();
+  paddle::framework::InitDevices(true);
 
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace cpu_place;
@@ -118,8 +115,9 @@ TEST(Operator, CPUtoGPU) {
 
   auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
   // prepare input
-  auto* in_t = scope.Var("IN1")->GetMutable<LoDTensor>();
-  auto* src_ptr = in_t->mutable_data<float>({2, 3}, CPUPlace());
+  auto* in_t = scope.Var("IN1")->GetMutable<paddle::framework::LoDTensor>();
+  auto* src_ptr =
+      in_t->mutable_data<float>({2, 3}, paddle::platform::CPUPlace());
   for (int i = 0; i < 2 * 3; ++i) {
     src_ptr[i] = static_cast<float>(i);
   }
@@ -128,7 +126,7 @@ TEST(Operator, CPUtoGPU) {
   auto* output = scope.Var("OUT1");
   cpu_op->Run(scope, cpu_place);
 
-  auto* output_ptr = output->Get<LoDTensor>().data<float>();
+  auto* output_ptr = output->Get<paddle::framework::LoDTensor>().data<float>();
   for (int i = 0; i < 2 * 3; ++i) {
     ASSERT_EQ(output_ptr[i], static_cast<float>(i) * 2);
   }
@@ -153,12 +151,14 @@ TEST(Operator, CPUtoGPU) {
   VLOG(3) << "after gpu_op run";
 
   // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
-  DeviceContextPool& pool = DeviceContextPool::Instance();
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
   auto dev_ctx = pool.Get(cuda_place);
 
   paddle::framework::Tensor output_tensor;
-  TensorCopy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
-             &output_tensor);
+  paddle::framework::TensorCopy(output2->Get<paddle::framework::LoDTensor>(),
+                                paddle::platform::CPUPlace(), *dev_ctx,
+                                &output_tensor);
 
   dev_ctx->Wait();
   float* output2_ptr = output_tensor.data<float>();
diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h
index 39222fc4ed6656dac4773c0c8829608bb954b4c6..b611bb77b4e1ec05b8bd029ac37cefba346c6eb0 100644
--- a/paddle/fluid/framework/data_layout.h
+++ b/paddle/fluid/framework/data_layout.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <cctype>
 #include <ostream>
+#include <string>
 
 #include "paddle/fluid/platform/enforce.h"
 
@@ -26,6 +27,7 @@ enum class DataLayout {
   kNHWC = 0,
   kNCHW = 1,
   kAnyLayout = 2,
+  kMKLDNN = 3,  // all layouts supported by MKLDNN internally
 };
 
 inline DataLayout StringToDataLayout(const std::string& str) {
@@ -40,6 +42,8 @@ inline DataLayout StringToDataLayout(const std::string& str) {
     return DataLayout::kNCHW;
   } else if (s == "ANYLAYOUT") {
     return DataLayout::kAnyLayout;
+  } else if (s == "MKLDNNLAYOUT") {
+    return DataLayout::kMKLDNN;
   } else {
     PADDLE_THROW("Unknown storage order string: %s", s);
   }
@@ -53,8 +57,10 @@ inline std::string DataLayoutToString(const DataLayout& data_layout) {
       return "NCHW";
     case DataLayout::kAnyLayout:
       return "ANY_LAYOUT";
+    case DataLayout::kMKLDNN:
+      return "MKLDNNLAYOUT";
     default:
-      PADDLE_THROW("unknown DataLayou %d", data_layout);
+      PADDLE_THROW("unknown DataLayout %d", data_layout);
   }
 }
 
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 4ca447d50a7262f44e5feb3739dce653604a6ed8..cd00b7de7338982308acfa1f1e8c38e010c6a43b 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -13,8 +13,12 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/data_layout_transform.h"
+#include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -87,5 +91,84 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var,
   out->set_layout(expected_kernel_type.data_layout_);
 }
 
+#ifdef PADDLE_WITH_MKLDNN
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+
+void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) {
+  switch (type) {
+    case mkldnn::memory::data_type::f32:
+      return platform::to_void_cast(tensor.data<float>());
+    case mkldnn::memory::data_type::s8:
+      return platform::to_void_cast(tensor.data<char>());
+    case mkldnn::memory::data_type::u8:
+      return platform::to_void_cast(tensor.data<unsigned char>());
+    case mkldnn::memory::data_type::s16:
+      return platform::to_void_cast(tensor.data<int16_t>());
+    case mkldnn::memory::data_type::s32:
+      return platform::to_void_cast(tensor.data<int32_t>());
+    default:
+      PADDLE_THROW("wrong mkldnn type provided");
+  }
+}
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out) {
+  auto in_layout = kernel_type_for_var.data_layout_;
+  auto out_layout = expected_kernel_type.data_layout_;
+
+  PADDLE_ENFORCE(
+      in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN,
+      "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to "
+      "non-MKLDNN");
+
+#ifdef PADDLE_WITH_MKLDNN
+  PADDLE_ENFORCE(in.format() != memory::format::format_undef &&
+                     in.format() != memory::format::any,
+                 "Input tensor should have specified memory format");
+
+  // Set default as NCHW in case not specified
+  out_layout =
+      out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout;
+
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = dynamic_cast<platform::MKLDNNDeviceContext*>(
+      pool.Get(expected_kernel_type.place_));
+  auto& cpu_engine = dev_ctx->GetEngine();
+
+  std::vector<int> in_tz = paddle::framework::vectorize2int(in.dims());
+  std::vector<int> out_tz = in_tz;
+
+  memory::data_type in_type = ToMKLDNNDataType(in.type());
+  PADDLE_ENFORCE(in_type != memory::data_type::data_undef,
+                 "Input tensor type is not supported: ", in.type().name());
+  memory::data_type out_type = in_type;
+
+  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
+  auto out_format =
+      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
+
+  void* in_data = GetDataFromTensor(in, in_type);
+
+  // output tensor has the same dims as input. Reorder don't change dims
+  out->Resize(in.dims());
+
+  auto out_data = out->mutable_data(expected_kernel_type.place_, in.type());
+
+  auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data);
+  auto out_memory =
+      memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data);
+
+  platform::Reorder(in_memory, out_memory);
+
+  out->set_layout(out_layout);
+  // reset format since the out tensor will be feed to non-MKLDNN OPkernel
+  out->set_format(memory::format::format_undef);
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index ba15be9fc77b8405cb4bbca3f62a8be44a3f604e..90bb206ec6b698bc23ad1a5c9609a25186ec6de8 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <map>
+#include <vector>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
@@ -21,6 +23,51 @@
 namespace paddle {
 namespace framework {
 
+#ifdef PADDLE_WITH_MKLDNN
+using MKLDNNFormat = mkldnn::memory::format;
+using MKLDNNDataType = mkldnn::memory::data_type;
+
+inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) {
+  switch (layout) {
+    case DataLayout::kNHWC:
+      return MKLDNNFormat::nhwc;
+    case DataLayout::kNCHW:
+      return MKLDNNFormat::nchw;
+    default:
+      PADDLE_THROW("Fail to convert layout %s to MKLDNN format",
+                   DataLayoutToString(layout));
+  }
+}
+
+inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) {
+  switch (format) {
+    case MKLDNNFormat::nhwc:
+      return DataLayout::kNHWC;
+    case MKLDNNFormat::nchw:
+      return DataLayout::kNCHW;
+    default:
+      PADDLE_THROW("Fail to convert MKLDNN format to paddle layout");
+  }
+}
+
+inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
+  static const std::map<std::type_index, MKLDNNDataType> dict{
+      {std::type_index(typeid(float)), MKLDNNDataType::f32},  // NOLINT
+      {std::type_index(typeid(char)), MKLDNNDataType::s8},    // NOLINT
+      {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8},
+      {std::type_index(typeid(int16_t)), MKLDNNDataType::s16},
+      {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}};
+  auto iter = dict.find(type);
+  if (iter != dict.end()) return iter->second;
+  return MKLDNNDataType::data_undef;
+}
+
+#endif
+
+void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
+                               const OpKernelType& expected_kernel_type,
+                               const Tensor& in, Tensor* out);
+
 std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
 
 void TransDataLayout(const OpKernelType& kernel_type_for_var,
diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc
index dd17cac0e10db0d058d399cc725e18dcb14be507..a0d08826b854fea9256382f0e065fd59dda8c8b3 100644
--- a/paddle/fluid/framework/data_layout_transform_test.cc
+++ b/paddle/fluid/framework/data_layout_transform_test.cc
@@ -18,27 +18,28 @@
 #include "paddle/fluid/platform/device_context.h"
 
 TEST(DataTransform, DataLayoutFunction) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  auto place = CPUPlace();
-  Tensor in = Tensor();
-  Tensor out = Tensor();
-  in.mutable_data<double>(make_ddim({2, 3, 1, 2}), place);
-  in.set_layout(DataLayout::kNHWC);
-
-  auto kernel_nhwc = OpKernelType(proto::VarType::FP32, place,
-                                  DataLayout::kNHWC, LibraryType::kPlain);
-  auto kernel_ncwh = OpKernelType(proto::VarType::FP32, place,
-                                  DataLayout::kNCHW, LibraryType::kPlain);
-
-  TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
-
-  EXPECT_TRUE(out.layout() == DataLayout::kNCHW);
-  EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1}));
+  auto place = paddle::platform::CPUPlace();
+  paddle::framework::Tensor in = paddle::framework::Tensor();
+  paddle::framework::Tensor out = paddle::framework::Tensor();
+  in.mutable_data<double>(paddle::framework::make_ddim({2, 3, 1, 2}), place);
+  in.set_layout(paddle::framework::DataLayout::kNHWC);
+
+  auto kernel_nhwc = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP32, place,
+      paddle::framework::DataLayout::kNHWC,
+      paddle::framework::LibraryType::kPlain);
+  auto kernel_ncwh = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP32, place,
+      paddle::framework::DataLayout::kNCHW,
+      paddle::framework::LibraryType::kPlain);
+
+  paddle::framework::TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
+
+  EXPECT_TRUE(out.layout() == paddle::framework::DataLayout::kNCHW);
+  EXPECT_TRUE(out.dims() == paddle::framework::make_ddim({2, 2, 3, 1}));
 
   TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out);
 
-  EXPECT_TRUE(in.layout() == DataLayout::kNHWC);
-  EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2}));
+  EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC);
+  EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2}));
 }
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index bfad9ac1e9cad1936ed961ad1da55787d2faa23e..82872224501709080ff02a13464d58543a0abda8 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -18,26 +18,57 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace framework {
 
-static void PassTensorData(Tensor* from, Tensor* to) {
+static void PassTensorData(Tensor *from, Tensor *to) {
   to->ShareDataWith(*from);
   *from = Tensor();
 }
 
-void DataTransform(const OpKernelType& expected_kernel_type,
-                   const OpKernelType& kernel_type_for_var,
-                   const Tensor& input_tensor, Tensor* output_tensor) {
+void TransformData(const OpKernelType &expected_kernel_type,
+                   const OpKernelType &kernel_type_for_var,
+                   const Tensor &input_tensor, Tensor *output_tensor) {
   bool transformed = false;
   Tensor in;
   in.ShareDataWith(input_tensor);
   Tensor out;
+  DataLayout lin = kernel_type_for_var.data_layout_;
+  DataLayout lout = expected_kernel_type.data_layout_;
 
   // do layout transform
-  if (NeedTransformLayout(expected_kernel_type.data_layout_,
-                          kernel_type_for_var.data_layout_)) {
-    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+  if (NeedTransformLayout(lout, lin)) {
+    if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) {
+      PADDLE_ENFORCE(
+          !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN),
+          "No layout transform needed between two MKLDNN OPKernels");
+
+      if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) {
+#ifdef PADDLE_WITH_MKLDNN
+        // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
+        // Just set layout/format. No real transform occur
+
+        auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
+                                                        ToMKLDNNFormat(lin));
+
+        out.ShareDataWith(input_tensor);
+        out.set_layout(DataLayout::kMKLDNN);
+        out.set_format(out_format);
+#endif
+      } else {
+        // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel
+        // Do transform via MKLDNN lib
+        TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in,
+                                  &out);
+      }
+    } else {
+      // Case3 - transfrom between Non-MKLDNN OPKernels
+      TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    }
     transformed = true;
     PassTensorData(&out, &in);
   }
@@ -62,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type,
   output_tensor->ShareDataWith(in);
 }
 
-void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
-                            Variable& out_var) {
+void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
+                         Variable *out_var) {
   if (in_var.IsType<LoDTensor>()) {
-    auto& in_lod_tensor = in_var.Get<LoDTensor>();
-    auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
+    auto &in_lod_tensor = in_var.Get<LoDTensor>();
+    auto *tran_lod_tensor = out_var->GetMutable<LoDTensor>();
     tran_lod_tensor->set_lod(in_lod_tensor.lod());
     tran_lod_tensor->set_layout(in_lod_tensor.layout());
     tran_lod_tensor->ShareDataWith(tensor);
   } else if (in_var.IsType<SelectedRows>()) {
-    auto& in_selected_rows = in_var.Get<SelectedRows>();
-    auto* trans_selected_rows = out_var.GetMutable<SelectedRows>();
+    auto &in_selected_rows = in_var.Get<SelectedRows>();
+    auto *trans_selected_rows = out_var->GetMutable<SelectedRows>();
     trans_selected_rows->set_height(in_selected_rows.height());
     trans_selected_rows->set_rows(in_selected_rows.rows());
     trans_selected_rows->mutable_value()->ShareDataWith(tensor);
diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h
index 9ec67e6f3d6358cd658e198602f5e802a0ba4cc9..ae3ab051bda2e698801cc6fe6e3ddddf039f5385 100644
--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -30,12 +30,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void DataTransform(const OpKernelType& expected_kernel_type,
-                   const OpKernelType& kernel_type_for_var,
-                   const Tensor& input_tensor, Tensor* out);
-
-void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
-                            Variable& out_var);
+void TransformData(const OpKernelType &expected_kernel_type,
+                   const OpKernelType &kernel_type_for_var,
+                   const Tensor &input_tensor, Tensor *out);
+
+/**
+ * Set OutVar from InVar, except the tensor is shared with `tensor`
+ */
+void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
+                         Variable *out_var);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
new file mode 100644
index 0000000000000000000000000000000000000000..60382faffb8e53870658b2d1ff83abc4008cb4cf
--- /dev/null
+++ b/paddle/fluid/framework/data_type.cc
@@ -0,0 +1,105 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/data_type.h"
+#include <stdint.h>
+#include <string>
+#include <unordered_map>
+
+namespace paddle {
+namespace framework {
+
+struct DataTypeMap {
+  std::unordered_map<std::type_index, proto::VarType::Type> cpp_to_proto_;
+  std::unordered_map<int, std::type_index> proto_to_cpp_;
+  std::unordered_map<int, std::string> proto_to_str_;
+  std::unordered_map<std::type_index, size_t> cpp_to_size_;
+};
+
+static DataTypeMap* InitDataTypeMap();
+// C++11 removes the need for manual locking. Concurrent execution shall wait if
+// a static local variable is already being initialized.
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
+static DataTypeMap& gDataTypeMap() {
+  static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
+  return *g_data_type_map_;
+}
+
+template <typename T>
+static inline void RegisterType(DataTypeMap* map,
+                                proto::VarType::Type proto_type,
+                                const std::string& name) {
+  map->proto_to_cpp_.emplace(static_cast<int>(proto_type), typeid(T));
+  map->cpp_to_proto_.emplace(typeid(T), proto_type);
+  map->proto_to_str_.emplace(static_cast<int>(proto_type), name);
+  map->cpp_to_size_.emplace(typeid(T), sizeof(T));
+}
+
+static DataTypeMap* InitDataTypeMap() {
+  auto retv = new DataTypeMap();
+
+#define RegType(cc_type, proto_type) \
+  RegisterType<cc_type>(retv, proto_type, #cc_type)
+
+  // NOTE: Add your customize type here.
+  RegType(platform::float16, proto::VarType::FP16);
+  RegType(float, proto::VarType::FP32);
+  RegType(double, proto::VarType::FP64);
+  RegType(int, proto::VarType::INT32);
+  RegType(int64_t, proto::VarType::INT64);
+  RegType(bool, proto::VarType::BOOL);
+  RegType(size_t, proto::VarType::SIZE_T);
+  RegType(int16_t, proto::VarType::INT16);
+  RegType(uint8_t, proto::VarType::UINT8);
+
+#undef RegType
+  return retv;
+}
+
+proto::VarType::Type ToDataType(std::type_index type) {
+  auto it = gDataTypeMap().cpp_to_proto_.find(type);
+  if (it != gDataTypeMap().cpp_to_proto_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW("Not support %s as tensor type", type.name());
+}
+
+std::type_index ToTypeIndex(proto::VarType::Type type) {
+  auto it = gDataTypeMap().proto_to_cpp_.find(static_cast<int>(type));
+  if (it != gDataTypeMap().proto_to_cpp_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type",
+               static_cast<int>(type));
+}
+
+std::string DataTypeToString(const proto::VarType::Type type) {
+  auto it = gDataTypeMap().proto_to_str_.find(static_cast<int>(type));
+  if (it != gDataTypeMap().proto_to_str_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type",
+               static_cast<int>(type));
+}
+
+size_t SizeOfType(std::type_index type) {
+  auto it = gDataTypeMap().cpp_to_size_.find(type);
+  if (it != gDataTypeMap().cpp_to_size_.end()) {
+    return it->second;
+  }
+  PADDLE_THROW("Not support %s as tensor type", type.name());
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 4c1b3e7581fe716271c62389c6053a24158913d2..491413db8c8d66fd907801131e89d9303bdef9f2 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -13,56 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
+
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace framework {
 
-inline proto::VarType::Type ToDataType(std::type_index type) {
-  using namespace paddle::framework::proto;
-  if (typeid(platform::float16).hash_code() == type.hash_code()) {
-    return proto::VarType::FP16;
-  } else if (typeid(float).hash_code() == type.hash_code()) {
-    return proto::VarType::FP32;
-  } else if (typeid(double).hash_code() == type.hash_code()) {
-    return proto::VarType::FP64;
-  } else if (typeid(int).hash_code() == type.hash_code()) {
-    return proto::VarType::INT32;
-  } else if (typeid(int64_t).hash_code() == type.hash_code()) {
-    return proto::VarType::INT64;
-  } else if (typeid(bool).hash_code() == type.hash_code()) {
-    return proto::VarType::BOOL;
-  } else {
-    PADDLE_THROW("Not supported");
-  }
-}
-
-inline std::type_index ToTypeIndex(proto::VarType::Type type) {
-  using namespace paddle::framework::proto;
-  switch (type) {
-    case proto::VarType::FP16:
-      return typeid(platform::float16);
-    case proto::VarType::FP32:
-      return typeid(float);
-    case proto::VarType::FP64:
-      return typeid(double);
-    case proto::VarType::INT32:
-      return typeid(int);
-    case proto::VarType::INT64:
-      return typeid(int64_t);
-    case proto::VarType::BOOL:
-      return typeid(bool);
-    default:
-      PADDLE_THROW("Not support type %d", type);
-  }
-}
+extern proto::VarType::Type ToDataType(std::type_index type);
+extern std::type_index ToTypeIndex(proto::VarType::Type type);
 
 template <typename Visitor>
 inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
-  using namespace paddle::framework::proto;
   switch (type) {
     case proto::VarType::FP16:
       visitor.template operator()<platform::float16>();
@@ -82,38 +47,23 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
     case proto::VarType::BOOL:
       visitor.template operator()<bool>();
       break;
-    default:
-      PADDLE_THROW("Not supported");
-  }
-}
-
-inline std::string DataTypeToString(const proto::VarType::Type type) {
-  using namespace paddle::framework::proto;
-  switch (type) {
-    case proto::VarType::FP16:
-      return "float16";
-    case proto::VarType::FP32:
-      return "float32";
-    case proto::VarType::FP64:
-      return "float64";
+    case proto::VarType::UINT8:
+      visitor.template operator()<uint8_t>();
+      break;
     case proto::VarType::INT16:
-      return "int16";
-    case proto::VarType::INT32:
-      return "int32";
-    case proto::VarType::INT64:
-      return "int64";
-    case proto::VarType::BOOL:
-      return "bool";
+      visitor.template operator()<int16_t>();
+      break;
     default:
-      PADDLE_THROW("Not support type %d", type);
+      PADDLE_THROW("Not supported %d", type);
   }
 }
 
+extern std::string DataTypeToString(const proto::VarType::Type type);
+extern size_t SizeOfType(std::type_index type);
 inline std::ostream& operator<<(std::ostream& out,
                                 const proto::VarType::Type& type) {
   out << DataTypeToString(type);
   return out;
 }
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index c0523f3c795b103c0c27081ec5dc717f6a0f11e0..5a57ec20585c26dbcd4251464718fc819148a7a5 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -91,6 +91,12 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
     case proto::VarType::BOOL:
       framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
       break;
+    case proto::VarType::INT16:
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      break;
+    case proto::VarType::UINT8:
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      break;
     default:
       PADDLE_THROW("Not support type %d", src_type);
   }
diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h
index e75da2588d07a754783f052173c3e0dce118f1b8..1c281b03ed61ac70e16a43d75a79854bdafd8836 100644
--- a/paddle/fluid/framework/data_type_transform.h
+++ b/paddle/fluid/framework/data_type_transform.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <utility>
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc
index 6b9a8f5e28b372c45abfaa2c20575a55d9a9dd03..bbebea9f13fd37469a0e9b7be9719aca128f5687 100644
--- a/paddle/fluid/framework/data_type_transform_test.cc
+++ b/paddle/fluid/framework/data_type_transform_test.cc
@@ -17,43 +17,58 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
 TEST(DataTypeTransform, CPUTransform) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  auto place = CPUPlace();
-
-  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto place = paddle::platform::CPUPlace();
+
+  auto kernel_fp16 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP16, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_fp32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP32, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_fp64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP64, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_int32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT32, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_int64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT64, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_bool = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::BOOL, place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
 
   // data type transform from float32
   {
-    Tensor in;
-    Tensor out;
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor out;
 
-    float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
+    float* ptr =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
     int data_number = 2 * 3;
 
     for (int i = 0; i < data_number; ++i) {
       ptr[i] = i / 3;
     }
 
-    TransDataType(kernel_fp32, kernel_fp64, in, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in, &out);
     double* out_data_double = out.data<double>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
     }
 
-    TransDataType(kernel_fp32, kernel_int32, in, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_int32, in, &out);
     int* out_data_int = out.data<int>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
@@ -62,10 +77,11 @@ TEST(DataTypeTransform, CPUTransform) {
 
   // data type transform from/to float16
   {
-    Tensor in;
-    Tensor out;
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor out;
 
-    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
+    paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
+        paddle::framework::make_ddim({2, 3}), place);
     int data_number = 2 * 3;
 
     for (int i = 0; i < data_number; ++i) {
@@ -73,94 +89,104 @@ TEST(DataTypeTransform, CPUTransform) {
     }
 
     // transform from float16 to other data types
-    TransDataType(kernel_fp16, kernel_fp32, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in, &out);
     float* out_data_float = out.data<float>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_fp64, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in, &out);
     double* out_data_double = out.data<double>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_int32, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int32, in, &out);
     int* out_data_int = out.data<int>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_int64, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int64, in, &out);
     int64_t* out_data_int64 = out.data<int64_t>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_bool, in, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_bool, in, &out);
     bool* out_data_bool = out.data<bool>();
     for (int i = 0; i < data_number; ++i) {
       EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
     }
 
     // transform float to float16
-    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
+    float* in_data_float =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_float[i] = i;
     }
 
-    TransDataType(kernel_fp32, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_float[i]).x);
     }
 
     // transform double to float16
-    double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
+    double* in_data_double =
+        in.mutable_data<double>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_double[i] = i;
     }
 
-    TransDataType(kernel_fp64, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_double[i]).x);
     }
 
     // transform int to float16
-    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
+    int* in_data_int =
+        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_int[i] = i;
     }
 
-    TransDataType(kernel_int32, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_int32, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int[i]).x);
     }
 
     // transform int64 to float16
-    int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
+    int64_t* in_data_int64 =
+        in.mutable_data<int64_t>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_int64[i] = i;
     }
 
-    TransDataType(kernel_int64, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_int64, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int64[i]).x);
     }
 
     // transform bool to float16
-    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
+    bool* in_data_bool =
+        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), place);
     for (int i = 0; i < data_number; ++i) {
       in_data_bool[i] = i;
     }
 
-    TransDataType(kernel_bool, kernel_fp16, in, &out);
-    ptr = out.data<float16>();
+    paddle::framework::TransDataType(kernel_bool, kernel_fp16, in, &out);
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_bool[i]).x);
     }
   }
 }
diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu
index de389ddabcb86de0155757406a406e44086c5474..0874509a8797cd2ff1b1fcb347b4ef3b74a39047 100644
--- a/paddle/fluid/framework/data_type_transform_test.cu
+++ b/paddle/fluid/framework/data_type_transform_test.cu
@@ -18,42 +18,58 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
 TEST(DataTypeTransform, GPUTransform) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  auto cpu_place = CPUPlace();
-  auto gpu_place = CUDAPlace(0);
-  CUDADeviceContext context(gpu_place);
-
-  auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
-                                   DataLayout::kAnyLayout, LibraryType::kPlain);
-  auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
-                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto cpu_place = paddle::platform::CPUPlace();
+  auto gpu_place = paddle::platform::CUDAPlace(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
+
+  auto kernel_fp16 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP16, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_fp32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP32, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_fp64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::FP64, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_int32 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT32, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_int64 = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::INT64, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
+
+  auto kernel_bool = paddle::framework::OpKernelType(
+      paddle::framework::proto::VarType::BOOL, gpu_place,
+      paddle::framework::DataLayout::kAnyLayout,
+      paddle::framework::LibraryType::kPlain);
 
   // data type transform from float32
   {
-    Tensor in;
-    Tensor in_gpu;
-    Tensor out_gpu;
-    Tensor out;
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor in_gpu;
+    paddle::framework::Tensor out_gpu;
+    paddle::framework::Tensor out;
 
-    float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
+    float* in_ptr =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
     float arr[6] = {0, 1, 2, 3, 4, 5};
     int data_number = sizeof(arr) / sizeof(arr[0]);
     memcpy(in_ptr, arr, sizeof(arr));
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     double* out_data_double = out.data<double>();
@@ -61,8 +77,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
     }
 
-    TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_int32, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     int* out_data_int = out.data<int>();
@@ -73,22 +90,27 @@ TEST(DataTypeTransform, GPUTransform) {
 
   // data type transform from/to float16
   {
-    Tensor in;
-    Tensor in_gpu;
-    Tensor out_gpu;
-    Tensor out;
-
-    float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
-    float16 arr[6] = {float16(0), float16(1), float16(2),
-                      float16(3), float16(4), float16(5)};
+    paddle::framework::Tensor in;
+    paddle::framework::Tensor in_gpu;
+    paddle::framework::Tensor out_gpu;
+    paddle::framework::Tensor out;
+
+    paddle::platform::float16* ptr = in.mutable_data<paddle::platform::float16>(
+        paddle::framework::make_ddim({2, 3}), cpu_place);
+    paddle::platform::float16 arr[6] = {
+        paddle::platform::float16(0), paddle::platform::float16(1),
+        paddle::platform::float16(2), paddle::platform::float16(3),
+        paddle::platform::float16(4), paddle::platform::float16(5)};
+
     int data_number = sizeof(arr) / sizeof(arr[0]);
     memcpy(ptr, arr, sizeof(arr));
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
 
     // transform from float16 to other data types
-    TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     float* out_data_float = out.data<float>();
@@ -96,8 +118,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     double* out_data_double = out.data<double>();
@@ -105,8 +128,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int32, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     int* out_data_int = out.data<int>();
@@ -114,8 +138,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_int64, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     int64_t* out_data_int64 = out.data<int64_t>();
@@ -123,8 +148,9 @@ TEST(DataTypeTransform, GPUTransform) {
       EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
     }
 
-    TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp16, kernel_bool, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
     bool* out_data_bool = out.data<bool>();
@@ -133,90 +159,103 @@ TEST(DataTypeTransform, GPUTransform) {
     }
 
     // transform float to float16
-    float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
+    float* in_data_float =
+        in.mutable_data<float>(paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_float[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_float[i]).x);
     }
 
     // transform double to float16
-    double* in_data_double =
-        in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
+    double* in_data_double = in.mutable_data<double>(
+        paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_double[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_double[i]).x);
     }
 
     // transform int to float16
-    int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
+    int* in_data_int =
+        in.mutable_data<int>(paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_int[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_int32, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int[i]).x);
     }
 
     // transform int64 to float16
-    int64_t* in_data_int64 =
-        in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
+    int64_t* in_data_int64 = in.mutable_data<int64_t>(
+        paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_int64[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_int64, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_int64[i]).x);
     }
 
     // transform bool to float16
-    bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
+    bool* in_data_bool =
+        in.mutable_data<bool>(paddle::framework::make_ddim({2, 3}), cpu_place);
     for (int i = 0; i < data_number; ++i) {
       in_data_bool[i] = i;
     }
 
-    TensorCopy(in, gpu_place, context, &in_gpu);
+    paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu);
     context.Wait();
-    TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
-    TensorCopy(out_gpu, cpu_place, context, &out);
+    paddle::framework::TransDataType(kernel_bool, kernel_fp16, in_gpu,
+                                     &out_gpu);
+    paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out);
     context.Wait();
 
-    ptr = out.data<float16>();
+    ptr = out.data<paddle::platform::float16>();
     for (int i = 0; i < data_number; ++i) {
-      EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
+      EXPECT_EQ(ptr[i].x,
+                static_cast<paddle::platform::float16>(in_data_bool[i]).x);
     }
   }
 }
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4fb4ec38ee965a2790d11378a1ce6befa0ef5a00
--- /dev/null
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -0,0 +1,48 @@
+cc_library(var_handle SRCS var_handle.cc DEPS place)
+cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
+cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
+cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
+cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
+
+cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
+cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
+cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
+cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
+
+cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
+
+if(WITH_GPU)
+    nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+            dynload_cuda variable_visitor)
+    nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda)
+    nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda)
+
+else()
+    cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
+             variable_visitor)
+    cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim)
+    cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+endif()
+
+cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
+cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
+cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
+
+cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)
+
+
+cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
+
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
+cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
+        simple_threadpool device_context)
+
+cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context broadcast_op_handle)
+cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+        device_context gather_op_handle)
+cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
+#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
+#        device_context reduce_op_handle )
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b335d3a0d364c916e19574de8d3ed89aaec7de41
--- /dev/null
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -0,0 +1,137 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <algorithm>
+
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+#ifdef PADDLE_WITH_CUDA
+AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places,
+                                     const platform::NCCLContextMap *ctxs)
+    : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
+  if (nccl_ctxs_) {
+    for (auto &p : places_) {
+      this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
+    }
+  }
+}
+#else
+AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+#endif
+
+void AllReduceOpHandle::RunImpl() {
+  if (NoDummyInputSize() == 1) {
+    return;  // No need to all reduce when GPU count = 1;
+  } else {
+    // Wait input done
+    WaitInputVarGenerated();
+    auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+    auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), places_.size(),
+        "The NoDummyInputSize should be equal to the number of places.");
+    PADDLE_ENFORCE_EQ(
+        in_var_handles.size(), out_var_handles.size(),
+        "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+
+    std::vector<const LoDTensor *> lod_tensors;
+    for (size_t i = 0; i < local_scopes_.size(); ++i) {
+      auto *s = local_scopes_[i];
+      auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get<Scope *>();
+      auto &lod_tensor =
+          local_scope.FindVar(in_var_handles[i]->name_)->Get<LoDTensor>();
+      lod_tensors.emplace_back(&lod_tensor);
+      PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                        "The name of input and output should be equal.");
+    }
+
+    if (platform::is_gpu_place(lod_tensors[0]->place())) {
+#ifdef PADDLE_WITH_CUDA
+      PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+      int dtype = -1;
+      size_t numel = 0;
+      std::vector<std::function<void()>> all_reduce_calls;
+      for (size_t i = 0; i < local_scopes_.size(); ++i) {
+        auto &p = places_[i];
+        auto &lod_tensor = *lod_tensors[i];
+        void *buffer = const_cast<void *>(lod_tensor.data<void>());
+
+        if (dtype == -1) {
+          dtype = platform::ToNCCLDataType(lod_tensor.type());
+        }
+
+        if (numel == 0) {
+          numel = static_cast<size_t>(lod_tensor.numel());
+        }
+
+        int dev_id = boost::get<platform::CUDAPlace>(p).device;
+        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+        auto stream = nccl_ctx.stream();
+        auto comm = nccl_ctx.comm_;
+        all_reduce_calls.emplace_back([=] {
+          PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+              buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
+              ncclSum, comm, stream));
+        });
+      }
+      this->RunAndRecordEvent([&] {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : all_reduce_calls) {
+          call();
+        }
+      });
+#else
+      PADDLE_THROW("Not compiled with CUDA");
+#endif
+    } else {  // Special handle CPU only Operator's gradient. Like CRF
+      auto &trg = *this->local_scopes_[0]
+                       ->FindVar(kLocalExecScopeName)
+                       ->Get<Scope *>()
+                       ->FindVar(out_var_handles[0]->name_)
+                       ->GetMutable<framework::LoDTensor>();
+
+      // Reduce All Tensor to trg in CPU
+      ReduceLoDTensor func(lod_tensors, &trg);
+      VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+
+      for (size_t i = 1; i < local_scopes_.size(); ++i) {
+        auto &scope =
+            *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+        auto &p = places_[i];
+        auto *var = scope.FindVar(out_var_handles[i]->name_);
+        auto *dev_ctx = dev_ctxes_[p];
+
+        RunAndRecordEvent(p, [&trg, var, dev_ctx, p] {
+          auto &tensor_gpu = *var->GetMutable<framework::LoDTensor>();
+          auto &tensor_cpu = trg;
+          TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu);
+        });
+      }
+    }
+  }
+}
+
+std::string AllReduceOpHandle::Name() const { return "all_reduce"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..fdd250b0d3eb166249271a95f7592b9fadee5265
--- /dev/null
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -0,0 +1,59 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct AllReduceOpHandle : public OpHandleBase {
+#ifdef PADDLE_WITH_CUDA
+  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places,
+                    const platform::NCCLContextMap *ctxs);
+#else
+  AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places);
+#endif
+  std::string Name() const override;
+
+  // Delay and buffer nccl_all_reduce together can significantly increase
+  // performance. Disable this feature by returning false.
+  bool IsMultiDeviceTransfer() override { return true; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+#ifdef PADDLE_WITH_CUDA
+  const platform::NCCLContextMap *nccl_ctxs_;
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d9f1bd6e417e30f0799f0bbed1739cedb4e8fbf
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -0,0 +1,166 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void BroadcastOpHandle::RunImpl() {
+  if (places_.size() == 1) return;
+
+  // The input and output may have dummy vars.
+  VarHandle *in_var_handle;
+  {
+    auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+    PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
+                      "The number of input should be one.");
+    in_var_handle = in_var_handles[0];
+  }
+
+  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+
+  PADDLE_ENFORCE_EQ(
+      out_var_handles.size(), places_.size(),
+      "The number of output should equal to the number of places.");
+
+  WaitInputVarGenerated();
+
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
+
+  auto *in_var =
+      var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(in_var);
+  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
+
+  InitOutputValue(*in_var_handle, out_var_handles);
+
+  if (platform::is_cpu_place(in_tensor.place())) {
+    for (auto *out_var_handle : out_var_handles) {
+      if (out_var_handle->IsTheSameVar(*in_var_handle)) {
+        continue;
+      }
+      auto &out_p = out_var_handle->place_;
+      auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
+                          ->FindVar(out_var_handle->name_);
+
+      RunAndRecordEvent(out_p, [in_tensor, out_var] {
+        paddle::framework::TensorCopy(
+            in_tensor, platform::CPUPlace(),
+            &VariableVisitor::GetMutableTensor(out_var));
+      });
+    }
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    VarHandle *out_handle = nullptr;
+    int root_id = boost::get<platform::CUDAPlace>(in_tensor.place()).device;
+    std::vector<std::function<void()>> broadcast_calls;
+
+    int type = platform::ToNCCLDataType(in_tensor.type());
+    size_t numel = static_cast<size_t>(in_tensor.numel());
+
+    for (auto out_var_handle : out_var_handles) {
+      Variable *out_var = var_scopes.at(out_var_handle->scope_idx_)
+                              ->FindVar(out_var_handle->name_);
+
+      int dst_id =
+          boost::get<platform::CUDAPlace>(out_var_handle->place_).device;
+
+      auto &nccl_ctx = nccl_ctxs_->at(dst_id);
+
+      void *send_recv_buffer = nullptr;
+      if (root_id == dst_id) {
+        send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
+        out_handle = out_var_handle;
+      } else {
+        send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
+                               .Resize(in_tensor.dims())
+                               .mutable_data(out_var_handle->place_);
+      }
+
+      broadcast_calls.emplace_back(
+          [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
+            PADDLE_ENFORCE(platform::dynload::ncclBcast(
+                send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
+                root_id, nccl_ctx.comm_, nccl_ctx.stream()));
+          });
+    }
+
+    this->RunAndRecordEvent([&] {
+      {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : broadcast_calls) {
+          call();
+        }
+      }
+
+      if (!out_handle->IsTheSameVar(*in_var_handle)) {
+        auto out_var = var_scopes.at(in_var_handle->scope_idx_)
+                           ->FindVar(out_var_handles[0]->name_);
+        paddle::framework::TensorCopy(
+            in_tensor, in_var_handle->place_,
+            *(dev_ctxes_.at(in_var_handle->place_)),
+            &VariableVisitor::GetMutableTensor(out_var));
+      }
+    });
+#else
+    PADDLE_THROW("CUDA is not enabled.");
+#endif
+  }
+}
+
+void BroadcastOpHandle::InitOutputValue(
+    const VarHandle &in_var_handle,
+    const std::vector<VarHandle *> &out_var_handles) const {
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
+  auto *in_var =
+      var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_);
+
+  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
+
+  // NOTE: The tensors' Place of input and output must be all on GPU or all on
+  // CPU.
+  for (auto *out_var_handle : out_var_handles) {
+    if (out_var_handle->IsTheSameVar(in_var_handle)) {
+      continue;
+    }
+    auto t_out_p = out_var_handle->place_;
+    auto *out_var = var_scopes.at(out_var_handle->scope_idx_)
+                        ->FindVar(out_var_handle->name_);
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    if (is_gpu_place(in_tensor.place())) {
+      PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
+                     "Places of input and output must be all on GPU.");
+    } else {
+      t_out_p = platform::CPUPlace();
+    }
+    VariableVisitor::ShareDimsAndLoD(*in_var, out_var);
+    VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p,
+                                                            in_tensor.type());
+  }
+}
+
+std::string BroadcastOpHandle::Name() const { return "broadcast"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..8036f756b6d6506684c109ab881d546f38176a10
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -0,0 +1,73 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct BroadcastOpHandle : public OpHandleBase {
+ public:
+#ifdef PADDLE_WITH_CUDA
+  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places,
+                    const platform::NCCLContextMap *nccl_ctxs)
+      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+    if (nccl_ctxs_) {
+      for (auto &p_ctx : nccl_ctxs_->contexts_) {
+        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+      }
+    }
+  }
+#else
+  BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places)
+      : local_scopes_(local_scopes), places_(places) {}
+#endif
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+#ifdef PADDLE_WITH_CUDA
+  const platform::NCCLContextMap *nccl_ctxs_;
+#endif
+
+  void InitOutputValue(const VarHandle &in_var_handle,
+                       const std::vector<VarHandle *> &out_var_handles) const;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c6e923ef77ff03413eefe4f26457a5322747618e
--- /dev/null
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -0,0 +1,264 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestBroadcastOpHandle {
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+  std::vector<Scope*> local_scopes_;
+  std::vector<Scope*> param_scopes_;
+  Scope g_scope_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+  bool use_gpu_;
+#ifdef PADDLE_WITH_CUDA
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+#ifdef PADDLE_WITH_CUDA
+    if (nccl_ctxs_) {
+      nccl_ctxs_->WaitAll();
+    }
+#endif
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    use_gpu_ = use_gpu;
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+#ifdef PADDLE_WITH_CUDA
+      nccl_ctxs_.reset(nullptr);
+#endif
+    }
+  }
+
+  void InitBroadcastOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      local_scope.Var("out");
+      param_scopes_.emplace_back(&local_scope);
+    }
+    param_scopes_[input_scope_idx]->Var("input");
+
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
+#endif
+    }
+
+    auto* in_var_handle =
+        new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
+    vars_.emplace_back(in_var_handle);
+    op_handle_->AddInput(in_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(dummy_var_handle);
+
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      if (!use_gpu_) {
+        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
+      }
+      VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
+      vars_.emplace_back(out_var_handle);
+      op_handle_->AddOutput(out_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* out_dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    out_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddOutput(out_dummy_var_handle);
+  }
+
+  void TestBroadcastLodTensor(size_t input_scope_idx) {
+    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+    PADDLE_ENFORCE_NOT_NULL(in_var);
+    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+    in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    f::LoD lod{{0, 10, 20}};
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    in_lod_tensor->set_lod(lod);
+    in_lod_tensor->Resize(kDims);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = param_scopes_[j]->FindVar("out");
+      PADDLE_ENFORCE_NOT_NULL(out_var);
+      auto out_tensor = out_var->Get<f::LoDTensor>();
+      PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal.");
+
+      f::Tensor result_tensor;
+      f::TensorCopySync(out_tensor, cpu_place, &result_tensor);
+      float* ct = result_tensor.mutable_data<float>(cpu_place);
+
+      for (int64_t i = 0; i < f::product(kDims); ++i) {
+        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+      }
+    }
+  }
+
+  void TestBroadcastSelectedRows(size_t input_scope_idx) {
+    auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+    PADDLE_ENFORCE_NOT_NULL(in_var);
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+    auto value = in_selected_rows->mutable_value();
+    value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+    int height = static_cast<int>(kDims[0]) * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    in_selected_rows->set_height(height);
+    in_selected_rows->set_rows(rows);
+
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    paddle::framework::TensorFromVector<float>(
+        send_vector, *(ctxs_[input_scope_idx]), value);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      auto out_var = param_scopes_[j]->FindVar("out");
+      PADDLE_ENFORCE_NOT_NULL(out_var);
+      auto& out_select_rows = out_var->Get<f::SelectedRows>();
+      auto rt = out_select_rows.value();
+
+      PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
+                        "height is not equal.");
+      for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+        PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]);
+      }
+
+      f::Tensor result_tensor;
+      f::TensorCopySync(rt, cpu_place, &result_tensor);
+      float* ct = result_tensor.data<float>();
+
+      for (int64_t i = 0; i < f::product(kDims); ++i) {
+        ASSERT_NEAR(ct[i], send_vector[i], 1e-5);
+      }
+    }
+  }
+};
+
+TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastLodTensor(input_scope_idx);
+}
+
+TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastSelectedRows(input_scope_idx);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastLodTensor(input_scope_idx);
+}
+
+TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastSelectedRows(input_scope_idx);
+}
+#endif
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c2c845c6efb206fb1ad5150189430b9a6fe9ea3
--- /dev/null
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct BuildStrategy {
+  enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
+
+  enum class GradientScaleStrategy {
+    kCoeffNumDevice = 0,
+    kOne = 1,
+    kCustomized = 2,
+  };
+
+  ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
+  GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+
+  std::string debug_graphviz_path_{""};
+
+  bool enable_data_balance_{true};
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..df05bb06333d6b964f2f5434c3d43214e5d2cb7a
--- /dev/null
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -0,0 +1,46 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+                                         platform::Place place)
+    : op_(framework::OpRegistry::CreateOp(op_desc)),
+      scope_(scope),
+      place_(place) {}
+
+void ComputationOpHandle::RunImpl() {
+  WaitInputVarGenerated(place_);
+
+  this->RunAndRecordEvent([this] {
+    op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
+  });
+}
+
+bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
+  bool need_wait =
+      in_var && in_var->generated_op_ &&
+      in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_];
+  return need_wait;
+}
+
+std::string ComputationOpHandle::Name() const { return op_->Type(); }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..f048f973fdeb6cf7d1485cda8cea7d530d9ba465
--- /dev/null
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -0,0 +1,48 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+struct ComputationOpHandle : public OpHandleBase {
+ public:
+  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
+                      platform::Place place);
+
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+
+  bool NeedWait(VarHandleBase *in_var) override;
+
+ private:
+  std::unique_ptr<OperatorBase> op_;
+  Scope *scope_;
+  platform::Place place_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/container_cast.h b/paddle/fluid/framework/details/container_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..a42ae78dc45c2a885f98315a21f1d5558725bca3
--- /dev/null
+++ b/paddle/fluid/framework/details/container_cast.h
@@ -0,0 +1,40 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <type_traits>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename ResultType, typename ElemType>
+std::vector<ResultType*> DynamicCast(const std::vector<ElemType*>& container) {
+  static_assert(std::is_base_of<ElemType, ResultType>::value,
+                "ElementType must be a base class of ResultType");
+  std::vector<ResultType*> res;
+  for (auto* ptr : container) {
+    auto* derived = dynamic_cast<ResultType*>(ptr);
+    if (derived) {
+      res.emplace_back(derived);
+    }
+  }
+  return res;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h
index 69bcea625288eba897e761a1d634f19c41dc0f79..21f75957be5f33f3dfc09c41fa9a1e1ca590f99e 100644
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -14,7 +14,7 @@
 
 #pragma once
 #include <memory>
-#include <thread>
+#include <thread>  // NOLINT
 
 namespace paddle {
 namespace framework {
@@ -23,7 +23,7 @@ namespace details {
 // Change it to thread safe flags if needed.
 class ThreadUnsafeOwnershipFlags {
  public:
-  ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
+  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
 
   ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
   ThreadUnsafeOwnershipFlags& operator=(
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d07235df5856591f8ad707c86fa5b3b65868c3d1
--- /dev/null
+++ b/paddle/fluid/framework/details/data_balance_op_handle.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/data_balance_op_handle.h"
+#include <algorithm>
+#include "paddle/fluid/framework/details/container_cast.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+#ifdef PADDLE_WITH_CUDA
+DataBalanceOpHandle::DataBalanceOpHandle(
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    const platform::NCCLContextMap *ctxs)
+    : local_scopes_(local_scopes), places_(places) {
+  if (ctxs) {
+    for (auto &p : places_) {
+      this->dev_ctxes_[p] = ctxs->DevCtx(p);
+    }
+  }
+}
+#else
+DataBalanceOpHandle::DataBalanceOpHandle(
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+#endif
+
+std::string DataBalanceOpHandle::Name() const { return "data balance"; }
+
+std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
+    const std::vector<int> &device_sizes) {
+  int device_num = device_sizes.size();
+  int total_size = 0;
+  int empty_num = 0;
+  std::vector<std::array<int, 2>> size_device_vec;
+  size_device_vec.reserve(device_num);
+  for (int i = 0; i < device_num; ++i) {
+    if (device_sizes[i] == 0) {
+      ++empty_num;
+    }
+    total_size += device_sizes[i];
+    size_device_vec.push_back({{device_sizes[i], i}});
+  }
+  std::vector<std::array<int, 3>> res;
+  if (empty_num == 0) {
+    // No need to do data balance.
+    return res;
+  }
+  if (total_size < device_num) {
+    // No enough data.
+    PADDLE_THROW_EOF();
+  }
+  std::sort(size_device_vec.begin(), size_device_vec.end(),
+            [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
+              return a[0] > b[0];
+            });
+  int expected_device_size = total_size / device_num;
+  int src_idx = 0;
+  for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
+    if (size_device_vec[src_idx][0] <= expected_device_size) {
+      ++src_idx;
+      PADDLE_ENFORCE_LT(
+          src_idx, device_num - empty_num,
+          "In current srategy an empty tensor should not be copy source.");
+    }
+    size_device_vec[src_idx][0] -= expected_device_size;
+    size_device_vec[dst_idx][0] += expected_device_size;
+    res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
+                    expected_device_size}});
+  }
+  return res;
+}
+
+void DataBalanceOpHandle::RunImpl() {
+  if (places_.size() == 1) {
+    return;
+  }
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+  auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+  PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), out_var_handles.size(),
+      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+  int data_num = in_var_handles.size() / places_.size();
+  WaitInputVarGenerated();
+  std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
+  std::vector<int> device_sizes;
+  for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
+    PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_,
+                      "The name of input and output should be equal.");
+    int place_idx = i / data_num;
+    int data_idx = i % data_num;
+    auto *local_scope =
+        local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name_);
+    PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
+    auto *tensor = tensor_var->GetMutable<LoDTensor>();
+    lod_tensors[data_idx].push_back(tensor);
+    int ins_size =
+        tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
+    if (data_idx == 0) {
+      device_sizes.emplace_back(ins_size);
+    } else {
+      PADDLE_ENFORCE_EQ(
+          ins_size, device_sizes.at(place_idx),
+          "All data on the same device shall have the same batch size.");
+    }
+  }
+  const auto &balance_plan = GetBalancePlan(device_sizes);
+
+  for (const auto &trans : balance_plan) {
+    for (int data_idx = 0; data_idx < data_num; ++data_idx) {
+      LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
+      LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
+      int trans_ins_size = trans[2];
+      LoD src_lod = src_tensor->lod();
+      int src_ins_size =
+          src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
+      int cut_point = src_ins_size - trans_ins_size;
+      if (!src_lod.empty()) {
+        for (auto &level : src_lod) {
+          cut_point = level[cut_point];
+        }
+      }
+      TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
+                     dst_tensor->place(), dst_tensor);
+      src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
+      if (!src_lod.empty()) {
+        dst_tensor->set_lod(SliceInLevel(
+            src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
+        src_tensor->set_lod(
+            SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
+      }
+    }
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..76a407e3610e8bb48facf1f814779f4c23f92d98
--- /dev/null
+++ b/paddle/fluid/framework/details/data_balance_op_handle.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct DataBalanceOpHandle : public OpHandleBase {
+ public:
+#ifdef PADDLE_WITH_CUDA
+  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+                      const std::vector<platform::Place> &places,
+                      const platform::NCCLContextMap *ctxs);
+#else
+  DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
+                      const std::vector<platform::Place> &places);
+#endif
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  // std::vector<(src_dev_id, dst_dev_id, trans_size)>
+  std::vector<std::array<int, 3>> GetBalancePlan(
+      const std::vector<int> &batch_size_per_device);
+
+  const std::vector<Scope *> local_scopes_;
+  const std::vector<platform::Place> places_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
new file mode 100644
index 0000000000000000000000000000000000000000..716d674fa29bad9321fc20979775c06f26bf4679
--- /dev/null
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ExecutionStrategy {
+  size_t num_threads_{0};
+  bool use_cuda_{true};
+  bool allow_op_delay_{false};
+  size_t num_iteration_per_drop_scope_{100};
+};
+
+}  //  namespace details
+}  //  namespace framework
+}  //  namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d646c944601e81477787740189d7ac60ae97fa80
--- /dev/null
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -0,0 +1,90 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
+
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
+                             std::vector<Scope *> *local_scopes)
+    : data_(data), offset_(offset), local_scopes_(local_scopes) {}
+
+FetchOpHandle::~FetchOpHandle() {
+  for (auto *input_var : inputs_) {
+    input_var->pending_ops_.erase(this);
+  }
+}
+
+void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
+  PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error");
+}
+
+void FetchOpHandle::WaitAndMergeCPUTensors() const {
+  std::vector<const LoDTensor *> tensors_ptr;
+  tensors_ptr.reserve(tensors_.size());
+  for (auto &t : tensors_) {
+    tensors_ptr.emplace_back(&t);
+  }
+  data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
+}
+
+void FetchOpHandle::RunImpl() {
+  WaitInputVarGenerated(platform::CPUPlace());
+
+  tensors_.resize(inputs_.size());
+  platform::CPUPlace cpu;
+  auto &scopes = *local_scopes_;
+
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
+    auto &scope = scopes.at(var_handle->scope_idx_);
+    auto *var = scope->FindVar(kLocalExecScopeName)
+                    ->Get<Scope *>()
+                    ->FindVar(var_handle->name_);
+    PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
+                            var_handle->name_);
+
+    auto &t = var->Get<framework::LoDTensor>();
+    if (platform::is_gpu_place(t.place())) {
+#ifdef PADDLE_WITH_CUDA
+      TensorCopySync(t, cpu, &tensors_[i]);
+#endif
+    } else {
+      tensors_[i].ShareDataWith(t);
+    }
+    tensors_[i].set_lod(t.lod());
+  }
+
+  this->WaitAndMergeCPUTensors();
+}
+
+void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
+  auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place);
+  for (auto *input : inputs_) {
+    if (input->generated_op_) {
+      input->generated_op_->RecordWaitEventOnCtx(cpu_ctx);
+    }
+  }
+}
+
+std::string FetchOpHandle::Name() const { return "Fetch"; }
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..e09bdd1d3338bb175c1ddae35b53f98197b68e9a
--- /dev/null
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -0,0 +1,56 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FetchOpHandle : public OpHandleBase {
+ public:
+  FetchOpHandle(FeedFetchList *data, size_t offset,
+                std::vector<Scope *> *local_scopes);
+
+  ~FetchOpHandle();
+
+  void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override;
+
+  void WaitAndMergeCPUTensors() const;
+
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+
+  void WaitInputVarGenerated(const platform::Place &place) override;
+
+ private:
+  FeedFetchList *data_;
+  size_t offset_;
+  std::vector<Scope *> *local_scopes_;
+  std::vector<LoDTensor> tensors_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..018c9bff71e553d8a3641f06f10b350453676b24
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
@@ -0,0 +1,51 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void FuseVarsOpHandle::RunImpl() {
+  WaitInputVarGenerated(place_);
+
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
+  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
+
+  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+
+  auto out_var_handle = out_var_handles[0];
+  auto out_var = scope->Var(out_var_handle->name_);
+
+  auto out_tensor = out_var->GetMutable<LoDTensor>();
+  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
+
+  int64_t s = 0;
+  for (size_t i = 1; i < out_var_handles.size(); ++i) {
+    auto out_name = out_var_handles[i]->name_;
+    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
+    auto numel = this->inputs_numel_.at(out_name);
+    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
+    s += numel;
+  }
+  this->RunAndRecordEvent([] {});
+}
+
+std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..140fb5bb49a33146de974b6d79559b4cf15bdd7b
--- /dev/null
+++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h
@@ -0,0 +1,63 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct FuseVarsOpHandle : public OpHandleBase {
+ public:
+  FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
+                   const std::unordered_map<std::string, int64_t> &inputs_numel,
+                   const std::type_index &var_type)
+      : local_scope_(local_scope),
+        place_(place),
+        inputs_numel_(inputs_numel),
+        type_(var_type) {
+    total_numel_ = 0;
+    for (auto in_numel : inputs_numel) {
+      PADDLE_ENFORCE_GT(in_numel.second, 0);
+      total_numel_ += in_numel.second;
+    }
+  }
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  Scope *local_scope_;
+  const platform::Place place_;
+  const std::unordered_map<std::string, int64_t> inputs_numel_;
+  const std::type_index type_;
+  int64_t total_numel_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2be02304566cf5dbe348fa01fc4171990eafd158
--- /dev/null
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -0,0 +1,117 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/gather_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
+                               const std::vector<platform::Place> &places)
+    : local_scopes_(local_scopes), places_(places) {}
+
+void GatherOpHandle::RunImpl() {
+  if (places_.size() == 1) return;
+  // the input and output may have dummy var.
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), places_.size(),
+      "The number of output should equal to the number of places.");
+
+  VarHandle *out_var_handle;
+  {
+    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
+                      "The number of output should be one.");
+    out_var_handle = out_var_handles.front();
+  }
+
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
+
+  auto in_0_handle = in_var_handles[0];
+  auto pre_in_var =
+      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
+
+  PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
+                 "Currently, gather_op only can gather SelectedRows.");
+
+  // Wait input done, this Wait is asynchronous operation
+  WaitInputVarGenerated();
+
+  auto &pre_in_value = pre_in_var->Get<framework::SelectedRows>();
+  std::vector<int64_t> out_rows;
+  std::vector<Tensor> in_tensors;
+
+  // Gather the inputs
+  for (auto *in_handle : in_var_handles) {
+    auto *in_var =
+        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+    PADDLE_ENFORCE_NOT_NULL(in_var);
+    VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
+
+    auto &in_sr_value = in_var->Get<framework::SelectedRows>();
+
+    auto &in_sr_rows = in_sr_value.rows();
+    out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end());
+    in_tensors.emplace_back(in_sr_value.value());
+  }
+
+  // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
+  platform::Place t_out_p = out_var_handle->place_;
+  if (platform::is_gpu_place(pre_in_value.place())) {
+    PADDLE_ENFORCE(platform::is_gpu_place(t_out_p),
+                   "Places of input and output must be all on GPU.");
+  } else {
+    t_out_p = platform::CPUPlace();
+  }
+
+  auto out_var =
+      var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(out_var);
+  auto out_value = out_var->GetMutable<framework::SelectedRows>();
+  out_value->set_height(pre_in_value.height());
+  out_value->set_rows(out_rows);
+  size_t rows = out_rows.size();
+  DDim out_dim = pre_in_value.GetCompleteDims();
+  out_dim[0] = static_cast<int64_t>(rows);
+  out_value->mutable_value()->Resize(out_dim).mutable_data(
+      t_out_p, pre_in_value.value().type());
+  Tensor *out_tensor = out_value->mutable_value();
+
+  // copy
+  auto dev_ctx = dev_ctxes_[out_var_handle->place_];
+  RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx,
+                                             t_out_p] {
+    int s = 0, e = 0;
+    for (size_t j = 0; j < in_tensors.size(); ++j) {
+      e += in_tensors[j].dims()[0];
+      auto sub_out = out_tensor->Slice(s, e);
+      paddle::framework::TensorCopy(in_tensors[j], t_out_p, *dev_ctx, &sub_out);
+      s = e;
+    }
+  });
+}
+
+std::string GatherOpHandle::Name() const { return "gather"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..d11ef8556aa8840949ca8dc7aa176413f70b9f22
--- /dev/null
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -0,0 +1,50 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct GatherOpHandle : public OpHandleBase {
+ public:
+  GatherOpHandle(const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places);
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  const std::vector<Scope *> &local_scopes_;
+  const std::vector<platform::Place> &places_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3cce2cc1640b3866130126424ff8fef18b8befc6
--- /dev/null
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -0,0 +1,193 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/gather_op_handle.h"
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestGatherOpHandle {
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+  std::vector<Scope*> local_scopes_;
+  std::vector<Scope*> param_scopes_;
+  Scope g_scope_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+    }
+  }
+
+  void InitGatherOp(size_t input_scope_idx) {
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      Scope& local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope*>() = &local_scope;
+      local_scope.Var("input");
+      param_scopes_.emplace_back(&local_scope);
+    }
+    param_scopes_[input_scope_idx]->Var("out");
+
+    op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
+    // add input
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
+      auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      vars_.emplace_back(in_var_handle);
+      op_handle_->AddInput(in_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* in_dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    in_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_dummy_var_handle);
+
+    // add output
+    auto* out_var_handle =
+        new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
+    vars_.emplace_back(out_var_handle);
+    op_handle_->AddOutput(out_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle* dummy_var_handle =
+        static_cast<DummyVarHandle*>(vars_.back().get());
+    op_handle_->AddOutput(dummy_var_handle);
+  }
+
+  void TestGatherSelectedRows(size_t output_scope_idx) {
+    int height = kDims[0] * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    std::vector<float> send_vector(f::product(kDims));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
+      PADDLE_ENFORCE_NOT_NULL(in_var);
+      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+      auto value = in_selected_rows->mutable_value();
+      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+      in_selected_rows->set_height(height);
+      in_selected_rows->set_rows(rows);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), value);
+      value->Resize(kDims);
+    }
+
+    auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
+
+    auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+
+    out_selected_rows->mutable_value()->ShareDataWith(
+        in_selected_rows->value());
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto& out_select_rows = out_var->Get<f::SelectedRows>();
+    auto rt = out_select_rows.value();
+
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+    }
+
+    f::Tensor result_tensor;
+    f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor);
+    float* ct = result_tensor.data<float>();
+
+    for (int64_t j = 0;
+         j < f::product(kDims) * static_cast<int64_t>(gpu_list_.size()); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
+    }
+  }
+};
+
+TEST(GatherTester, TestCPUGatherTestSelectedRows) {
+  TestGatherOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitGatherOp(input_scope_idx);
+  test_op.TestGatherSelectedRows(input_scope_idx);
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+TEST(GatherTester, TestGPUGatherTestSelectedRows) {
+  TestGatherOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitGatherOp(input_scope_idx);
+  test_op.TestGatherSelectedRows(input_scope_idx);
+}
+#endif
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..46d0c2769cb334f5cb75ae0ef5e48da45448c48f
--- /dev/null
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -0,0 +1,597 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <algorithm>
+#include <fstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/broadcast_op_handle.h"
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/data_balance_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/reduce_op_handle.h"
+#include "paddle/fluid/framework/details/rpc_op_handle.h"
+#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+#ifdef PADDLE_WITH_CUDA
+MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
+    const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &params,
+    const std::vector<Scope *> &local_scopes,
+    platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy)
+    : loss_var_name_(loss_var_name),
+      places_(places),
+      local_scopes_(local_scopes),
+      nccl_ctxs_(nccl_ctxs),
+      strategy_(strategy) {
+#else
+MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
+    const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &params,
+    const std::vector<Scope *> &local_scopes, const BuildStrategy &strategy)
+    : loss_var_name_(loss_var_name),
+      places_(places),
+      local_scopes_(local_scopes),
+      strategy_(strategy) {
+#endif
+  for (auto &p : params) {
+    grad_names_.insert(GradVarName(p));
+  }
+  balance_vars_.resize(places_.size(), 0);
+}
+
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
+                                                const OpDesc &op,
+                                                size_t place_id) const {
+  auto p = places_[place_id];
+  auto *op_handle = result->ops_.back().get();
+  op_handle->SetDeviceContext(p,
+                              platform::DeviceContextPool::Instance().Get(p));
+
+  for (auto &each_var_name : op.InputArgumentNames()) {
+    VarHandle *var =
+        CreateOrGetLatestVarHandle(result, each_var_name, p, place_id);
+    op_handle->AddInput(var);
+  }
+
+  for (auto &each_var_name : op.OutputArgumentNames()) {
+    CreateOpOutput(result, op_handle, each_var_name, p, place_id);
+  }
+}
+
+std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
+    const ProgramDesc &program) const {
+  std::vector<std::string> send_vars;
+  // since parameters are all in block 0,
+  // it's enough to only scan send ops in block 0
+  for (auto *op : program.Block(0).AllOps()) {
+    // TODO(Yancey1989): use a graceful method to find send op,
+    // instead of the the hard code string
+    if (op->Type() == "send") {
+      auto op_vars = op->InputArgumentNames();
+      send_vars.reserve(send_vars.size() +
+                        std::distance(op_vars.begin(), op_vars.end()));
+      send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
+    }
+  }
+  return send_vars;
+}
+
+std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
+    const ProgramDesc &program) const {
+  std::vector<std::string> recv_vars;
+  for (auto *op : program.Block(0).AllOps()) {
+    // TODO(Yancey1989): use a graceful method to find recv op,
+    // instead of the hard code string
+    if (op->Type() == "recv") {
+      auto op_vars = op->OutputArgumentNames();
+      recv_vars.reserve(recv_vars.size() +
+                        std::distance(op_vars.begin(), op_vars.end()));
+      recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
+    }
+  }
+  return recv_vars;
+}
+
+bool MultiDevSSAGraphBuilder::IsDistTrainOp(
+    const OpDesc &op, const std::vector<std::string> &send_vars,
+    const std::vector<std::string> &recv_vars) const {
+  if (send_vars.size() == 0 || recv_vars.size() == 0) {
+    return false;
+  }
+
+  /**
+   * Check any of opvars contains `.block` and in sendvars
+   */
+  auto checker = [](const std::vector<std::string> &opvars,
+                    const std::vector<std::string> &rpc_vars) -> bool {
+    for (auto &var : opvars) {
+      // a variable name with the suffix `.block` means it's a splited
+      // variable by (DistributeTranspiler)
+      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
+      if (var.find(".block") != std::string::npos &&
+          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  return checker(op.OutputArgumentNames(), send_vars) ||
+         checker(op.InputArgumentNames(), recv_vars);
+}
+
+size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
+    const std::vector<std::string> &var_names) const {
+  int64_t numel_sum = 0;
+  for (auto var_name : var_names) {
+    auto var_desc = all_vars_.at(var_name);
+    PADDLE_ENFORCE_NOT_NULL(var_desc);
+    auto dim = framework::make_ddim(var_desc->GetShape());
+    int64_t numel = framework::product(dim);
+    PADDLE_ENFORCE_GT(numel, 0);
+    numel_sum += numel;
+  }
+
+  auto smallest =
+      std::min_element(std::begin(balance_vars_), std::end(balance_vars_));
+  size_t dev_id =
+      static_cast<size_t>(std::distance(std::begin(balance_vars_), smallest));
+  balance_vars_[dev_id] += numel_sum;
+  return dev_id;
+}
+
+std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
+    const ProgramDesc &program) const {
+  for (auto *var : program.Block(0).AllVars()) {
+    all_vars_.emplace(var->Name(), var);
+  }
+
+  auto graph = new SSAGraph();
+  SSAGraph &result = *graph;
+  std::unordered_set<std::string> og_has_been_broadcast;
+
+  // We cannot invoke resize. It is a bug of GCC 4.8
+  result.vars_ = std::vector<
+      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
+      places_.size());
+
+  // find send/recv vars so that we can place the distributed training
+  // realted op in the place 0
+  auto send_vars = FindDistTrainSendVars(program);
+  auto recv_vars = FindDistTrainRecvVars(program);
+
+  std::vector<std::unordered_set<std::string>> bcast_var_name_set;
+  bcast_var_name_set.resize(places_.size());
+
+  size_t cur_device_id = 0;
+  bool is_forwarding = true;
+
+  for (auto *op : program.Block(0).AllOps()) {
+    if (boost::get<int>(
+            op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+        static_cast<int>(OpRole::kRPC)) {
+      CreateRPCOp(&result, *op);
+    } else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
+      CreateDistTrainOp(&result, *op);
+    } else if (IsScaleLossOp(*op)) {
+      // user can customize loss@grad if not use_default_grad_scale_
+      if (strategy_.gradient_scale_ !=
+          BuildStrategy::GradientScaleStrategy::kCustomized) {
+        CreateScaleLossGradOp(&result);
+      }
+      // This assumes the backward generating code will ensure IsScaleLossOp
+      // is true only for the op that scale the final scalar loss.
+      // It also assumes backward op will always follow the forward op in
+      // the block.
+      is_forwarding = false;
+    } else {
+      int op_dev_id = GetOpDeviceID(*op);
+      if (op_dev_id != -1) {  // This op only runs on one specific device.
+        CreateComputationalOp(&result, *op, op_dev_id);
+        for (auto &var_name : op->OutputArgumentNames()) {
+          var_name_on_devices_.emplace(var_name, op_dev_id);
+        }
+      } else {
+        // This op runs on all devices, and its output may have parameter's
+        // gradients.
+        if (op->Type() == "read" && strategy_.enable_data_balance_) {
+          op->SetAttr("throw_eof_exp", false);
+          CreateComputationalOps(&result, *op, places_.size());
+          const auto &data_var_names = op->Output("Out");
+          InsertDataBalanceOp(&result, data_var_names);
+        } else {
+          CreateComputationalOps(&result, *op, places_.size());
+        }
+
+        if (!is_forwarding && places_.size() > 1) {
+          // Currently, we assume that once gradient is generated, it can be
+          // broadcast, and each gradient is only broadcast once.
+          if (static_cast<bool>(boost::get<int>(op->GetAttr(
+                                    OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                                static_cast<int>(OpRole::kBackward))) {
+            try {
+              auto backward_vars =
+                  boost::get<std::vector<std::string>>(op->GetNullableAttr(
+                      OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+
+              PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+
+              for (size_t i = 0; i < backward_vars.size(); i += 2) {
+                auto &p_name = backward_vars[i];
+                auto &g_name = backward_vars[i + 1];
+                VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
+
+                switch (strategy_.reduce_) {
+                  case BuildStrategy::ReduceStrategy::kReduce:
+                    cur_device_id = GetAppropriateDeviceID({g_name});
+                    CreateReduceOp(&result, g_name, cur_device_id);
+                    var_name_on_devices_.emplace(g_name, cur_device_id);
+                    bcast_var_name_set[cur_device_id].emplace(p_name);
+                    break;
+                  case BuildStrategy::ReduceStrategy::kAllReduce:
+                    if (IsSparseGradient(g_name)) {
+                      CreateReduceOp(&result, g_name, 0);
+                      CreateBroadcastOp(&result, g_name, 0);
+                    } else {
+                      InsertAllReduceOp(&result, g_name);
+                    }
+                    break;
+                  default:
+                    LOG(FATAL) << "Unknown reduce strategy ";
+                    break;
+                }
+              }
+            } catch (boost::bad_get e) {
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Insert BCast Ops
+  for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
+    auto &to_bcast_set = bcast_var_name_set[dev_id];
+    for (auto &bcast_name : to_bcast_set) {
+      CreateBroadcastOp(&result, bcast_name, dev_id);
+    }
+  }
+  /*
+    Dependency graph has been constructed. However, there are still data
+    hazards need to be handled.
+   */
+  PolishGraphToSupportDataHazards(&result);
+
+  /*
+   * Only variables should be the leaves of graph.
+   */
+  AddOutputToLeafOps(&result);
+
+  return std::unique_ptr<SSAGraph>(graph);
+}
+
+bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
+  PADDLE_ENFORCE(all_vars_.count(og) != 0);
+  if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) {
+    return true;
+  }
+  return false;
+}
+
+void MultiDevSSAGraphBuilder::SetCommunicationContext(
+    OpHandleBase *op_handle, const platform::Place &p) const {
+#ifdef PADDLE_WITH_CUDA
+  if (nccl_ctxs_ == nullptr) {
+    op_handle->SetDeviceContext(p,
+                                platform::DeviceContextPool::Instance().Get(p));
+  }
+#else
+  op_handle->SetDeviceContext(p,
+                              platform::DeviceContextPool::Instance().Get(p));
+#endif
+}
+
+void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
+                                                const std::string &p_name,
+                                                size_t src_dev_id) const {
+#ifdef PADDLE_WITH_CUDA
+  auto *op_handle = new BroadcastOpHandle(local_scopes_, places_, nccl_ctxs_);
+#else
+  auto *op_handle = new BroadcastOpHandle(local_scopes_, places_);
+#endif
+
+  result->ops_.emplace_back(op_handle);
+  auto *in = result->vars_.at(src_dev_id).at(p_name).back().get();
+  op_handle->AddInput(in);
+
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
+    auto &vars = result->vars_.at(i).at(p_name);
+    auto *out_var = new VarHandle(vars.size(), i, p_name, p);
+    vars.emplace_back(out_var);
+    op_handle->AddOutput(out_var);
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
+                                                    const OpDesc &op,
+                                                    int dev_id) const {
+  result->ops_.emplace_back(
+      new ComputationOpHandle(op, local_scopes_[dev_id], places_[dev_id]));
+  CreateOpHandleIOs(result, op, dev_id);
+}
+
+void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
+                                                const std::string &og) const {
+#ifdef PADDLE_WITH_CUDA
+  result->ops_.emplace_back(
+      new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
+#else
+  result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_));
+#endif
+  auto *op_handle = result->ops_.back().get();
+
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
+    auto &vars = result->vars_[i][og];
+    PADDLE_ENFORCE(!vars.empty());
+    auto &prev_grad = vars.back();
+    op_handle->AddInput(prev_grad.get());
+
+    auto var = new VarHandle(vars.size(), i, og, p);
+    vars.emplace_back(var);
+    op_handle->AddOutput(var);
+  }
+}
+
+void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
+    SSAGraph *result, const std::vector<std::string> &datas) const {
+#ifdef PADDLE_WITH_CUDA
+  result->ops_.emplace_back(
+      new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_));
+#else
+  result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_));
+#endif
+  auto *op_handle = result->ops_.back().get();
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
+    for (const std::string &d_name : datas) {
+      auto &vars = result->vars_[i][d_name];
+      PADDLE_ENFORCE(!vars.empty());
+      op_handle->AddInput(vars.back().get());
+      auto var = new VarHandle(vars.size(), i, d_name, p);
+      vars.emplace_back(var);
+      op_handle->AddOutput(var);
+    }
+  }
+}
+
+bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
+    const std::string &og,
+    std::unordered_set<std::string> *og_has_been_broadcast) const {
+  bool is_pg_once =
+      grad_names_.count(og) != 0 && og_has_been_broadcast->count(og) == 0;
+  if (is_pg_once) {
+    // Insert NCCL AllReduce Op
+    og_has_been_broadcast->insert(og);
+  }
+  return is_pg_once;
+}
+
+int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
+  if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
+    return -1;
+  }
+
+  for (auto &varname : op.InputArgumentNames()) {
+    int dev_id = GetVarDeviceID(varname);
+    if (dev_id != -1) {
+      return dev_id;
+    }
+  }
+  return -1;
+}
+
+int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
+  auto got = var_name_on_devices_.find(varname);
+  return got == var_name_on_devices_.end() ? -1 : got->second;
+}
+
+void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
+  for (size_t i = 0; i < places_.size(); ++i) {
+// Insert ScaleCost OpHandle
+#ifdef PADDLE_WITH_CUDA
+    auto *communication_dev_ctx =
+        nccl_ctxs_ ? nccl_ctxs_->DevCtx(places_[i])
+                   : platform::DeviceContextPool::Instance().Get(places_[i]);
+#else
+    auto *communication_dev_ctx =
+        platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+#endif
+
+    auto *op_handle =
+        new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
+                                  places_[i], communication_dev_ctx);
+    result->ops_.emplace_back(op_handle);
+
+    // FIXME: Currently ScaleLossGradOp only use device_count as scale
+    // factor. So it does not depend on any other operators.
+    // VarHandle *loss = GetVarHandle(loss_var_name, place);
+    // loss->pending_ops_.emplace_back(op_handle);
+    // op_handle->inputs_.emplace_back(loss);
+
+    CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
+                   i);
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
+                                                     const OpDesc &op,
+                                                     size_t num_places) const {
+  for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
+    auto p = places_[scope_idx];
+    auto s = local_scopes_[scope_idx];
+    result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
+    CreateOpHandleIOs(result, op, scope_idx);
+  }
+}
+
+VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
+                                                   const std::string &og,
+                                                   int dst_dev_id) const {
+#ifdef PADDLE_WITH_CUDA
+  result->ops_.emplace_back(
+      new ReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
+#else
+  result->ops_.emplace_back(new ReduceOpHandle(local_scopes_, places_));
+#endif
+  auto *op_handle = result->ops_.back().get();
+
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto &p = places_[i];
+    SetCommunicationContext(op_handle, p);
+    auto &vars = result->vars_[i][og];
+    PADDLE_ENFORCE(!vars.empty());
+    auto &prev_grad = vars.back();
+    op_handle->AddInput(prev_grad.get());
+  }
+  auto &vars = result->vars_[dst_dev_id][og];
+  auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]);
+  vars.emplace_back(var);
+  op_handle->AddOutput(var);
+  return var;
+}
+
+// Find the first occurence of `prev_op_name` and make current `op` depend
+// on it.
+void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
+                                        const std::string &prev_op_name) const {
+  for (auto &prev_op : result->ops_) {
+    if (prev_op->Name() == prev_op_name) {
+      auto *dep_var = new DummyVarHandle();
+      prev_op->AddOutput(dep_var);
+      result->dep_vars_.emplace(dep_var);
+      op->AddInput(dep_var);
+    }
+  }
+}
+
+void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
+                                                const OpDesc &op) const {
+  int op_dev_id = -1;
+  if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") {
+    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
+      for (auto &varname : op.InputArgumentNames()) {
+        var_name_on_devices_.emplace(varname, op_dev_id);
+      }
+    }
+    for (auto &varname : op.OutputArgumentNames()) {
+      var_name_on_devices_.emplace(varname, op_dev_id);
+    }
+  } else if (op.Type() == "concat") {
+    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+    for (auto &varname : op.OutputArgumentNames()) {
+      var_name_on_devices_.emplace(varname, op_dev_id);
+    }
+  } else {
+    PADDLE_ENFORCE(
+        "the distribute training related op should be in [split_byref, "
+        "concat].");
+  }
+
+  PADDLE_ENFORCE(op_dev_id != -1,
+                 "can not find right place for distributed op: %s", op.Type());
+
+  CreateComputationalOp(result, op, op_dev_id);
+  if (op.Type() == "concat") {
+    ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
+  }
+}
+
+// Create RPC related op handles that connects its in ops and out ops.
+void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
+                                          const OpDesc &op) const {
+  int op_dev_id = -1;
+  if (op.Type() == "send") {
+    op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
+    // the variable name which contains .block means it was splited by
+    // split_byref op
+    // so that we can balance the variable blocks to all the pserver
+    // instances.
+    if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
+        op.InputArgumentNames()[0].find(".block") == std::string::npos) {
+      op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
+      for (auto &varname : op.InputArgumentNames()) {
+        var_name_on_devices_.emplace(varname, op_dev_id);
+      }
+    }
+  } else if (op.Type() == "recv") {
+    op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames());
+    for (auto &varname : op.OutputArgumentNames()) {
+      var_name_on_devices_.emplace(varname, op_dev_id);
+    }
+  } else {
+    // send_barrier and fetch_barrier op can be scheduled on device 0
+    op_dev_id = 0;
+  }
+
+  PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
+                 op.Type());
+
+  result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id],
+                                            op.Type(), places_[op_dev_id]));
+
+  if (op.Type() == "send_barrier") {
+    ConnectOp(result, result->ops_.back().get(), "send");
+  } else if (op.Type() == "recv") {
+    ConnectOp(result, result->ops_.back().get(), "send_barrier");
+  } else if (op.Type() == "fetch_barrier") {
+    ConnectOp(result, result->ops_.back().get(), "recv");
+  } else if (op.Type() == "send") {
+    // do nothing
+  } else {
+    PADDLE_THROW(
+        "rpc op should be in ["
+        "send, send_barrier. recv, fetch_barrier]");
+  }
+
+  CreateOpHandleIOs(result, op, op_dev_id);
+}
+
+bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
+  return boost::get<int>(
+             op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+             (static_cast<int>(OpRole::kBackward) |
+              static_cast<int>(OpRole::kLoss)) &&
+         !loss_var_name_.empty();  // If loss_var is empty. This is test mode
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..a964e024885e56693224a6199e00ff30beaa1df4
--- /dev/null
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -0,0 +1,126 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+namespace paddle {
+namespace platform {
+class NCCLContextMap;
+}
+
+namespace framework {
+class Scope;
+namespace details {
+
+class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
+ public:
+#ifdef PADDLE_WITH_CUDA
+  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
+                          const std::string &loss_var_name,
+                          const std::unordered_set<std::string> &params,
+                          const std::vector<Scope *> &local_scopes,
+                          platform::NCCLContextMap *nccl_ctxs,
+                          const BuildStrategy &strategy);
+#else
+  MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
+                          const std::string &loss_var_name,
+                          const std::unordered_set<std::string> &params,
+                          const std::vector<Scope *> &local_scopes,
+                          const BuildStrategy &strategy);
+#endif
+
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
+  int GetVarDeviceID(const std::string &varname) const override;
+
+ private:
+  void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
+                         size_t device_id) const;
+
+ private:
+  std::string loss_var_name_;
+  const std::vector<platform::Place> &places_;
+  const std::vector<Scope *> &local_scopes_;
+  std::unordered_set<std::string> grad_names_;
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap *nccl_ctxs_;
+#endif
+
+  bool IsScaleLossOp(const OpDesc &op) const;
+
+  void CreateRPCOp(SSAGraph *result, const OpDesc &op) const;
+  void CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const;
+
+  /**
+   * Is this operator as the end-point operator before/after send operator.
+   */
+  bool IsDistTrainOp(const OpDesc &op,
+                     const std::vector<std::string> &send_vars,
+                     const std::vector<std::string> &recv_vars) const;
+
+  std::vector<std::string> FindDistTrainSendVars(
+      const ProgramDesc &program) const;
+
+  std::vector<std::string> FindDistTrainRecvVars(
+      const ProgramDesc &program) const;
+
+  void ConnectOp(SSAGraph *result, OpHandleBase *op,
+                 const std::string &prev_op_name) const;
+
+  void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
+                              size_t num_places) const;
+
+  void CreateScaleLossGradOp(SSAGraph *result) const;
+  VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og,
+                            int dst_dev_id) const;
+  void CreateComputationalOp(SSAGraph *result, const OpDesc &op,
+                             int dev_id) const;
+
+  bool IsParameterGradientOnce(
+      const std::string &og,
+      std::unordered_set<std::string> *og_has_been_broadcast) const;
+
+  int GetOpDeviceID(const OpDesc &op) const;
+
+  void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
+
+  void InsertDataBalanceOp(SSAGraph *result,
+                           const std::vector<std::string> &datas) const;
+
+  void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
+                         size_t src_dev_id) const;
+
+  bool IsSparseGradient(const std::string &og) const;
+
+  size_t GetAppropriateDeviceID(
+      const std::vector<std::string> &var_names) const;
+
+ private:
+  BuildStrategy strategy_;
+  mutable std::unordered_map<std::string, VarDesc *> all_vars_;
+  mutable std::unordered_map<std::string, int> var_name_on_devices_;
+  mutable std::vector<int64_t> balance_vars_;
+
+  void SetCommunicationContext(OpHandleBase *op_handle,
+                               const platform::Place &p) const;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d80bdcf15d798925c137460125964d3d7e65f67e
--- /dev/null
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -0,0 +1,161 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include <map>
+
+namespace paddle {
+namespace framework {
+namespace details {
+std::string OpHandleBase::DebugString() const {
+  std::stringstream ss;
+  ss << "(";
+  for (auto *var : inputs_) {
+    ss << var->DebugString() << ", ";
+  }
+  ss << ") --> (";
+  for (auto *var : outputs_) {
+    ss << var->DebugString() << ", ";
+  }
+  ss << ")\n";
+  return ss.str();
+}
+
+OpHandleBase::~OpHandleBase() {
+#ifdef PADDLE_WITH_CUDA
+  for (auto &ev : events_) {
+    PADDLE_ENFORCE(cudaEventDestroy(ev.second));
+  }
+#endif
+}
+
+void OpHandleBase::Run(bool use_cuda) {
+#ifdef PADDLE_WITH_CUDA
+  if (events_.empty() && use_cuda) {
+    for (auto &p : dev_ctxes_) {
+      int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
+      PADDLE_ENFORCE(cudaSetDevice(dev_id));
+      PADDLE_ENFORCE(
+          cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
+    }
+  }
+#else
+  PADDLE_ENFORCE(!use_cuda);
+#endif
+
+  RunImpl();
+}
+
+void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE_NOT_NULL(waited_ctx);
+  if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
+    for (auto &dev_ctx : dev_ctxes_) {
+      PADDLE_ENFORCE_NOT_NULL(dev_ctx.second);
+      dev_ctx.second->Wait();
+    }
+  } else {
+    auto stream =
+        static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
+    for (auto &ev : events_) {
+      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0));
+    }
+  }
+#else
+  for (auto &dev_ctx : dev_ctxes_) {
+    dev_ctx.second->Wait();
+  }
+#endif
+}
+
+void OpHandleBase::AddInput(VarHandleBase *in) {
+  this->inputs_.emplace_back(in);
+  in->pending_ops_.insert(this);
+}
+
+void OpHandleBase::AddOutput(VarHandleBase *out) {
+  outputs_.emplace_back(out);
+  out->generated_op_ = this;
+}
+
+void OpHandleBase::WaitInputVarGenerated() {
+  for (auto in_var : inputs_) {
+    if (NeedWait(in_var)) {
+      for (auto &pair : dev_ctxes_) {
+        in_var->generated_op_->RecordWaitEventOnCtx(pair.second);
+      }
+    }
+  }
+}
+
+void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
+  for (auto *in : inputs_) {
+    if (NeedWait(in)) {
+      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[place]);
+    }
+  }
+}
+
+size_t OpHandleBase::NoDummyInputSize() const {
+  size_t cnt = 0;
+  for (auto *in : inputs_) {
+    if (dynamic_cast<DummyVarHandle *>(in) == nullptr) {
+      ++cnt;
+    }
+  }
+  return cnt;
+}
+
+bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
+  return in_var && in_var->generated_op_;
+}
+
+void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
+#ifdef PADDLE_WITH_CUDA
+  if (!events_.empty()) {  // Use event
+    std::function<void()> method = callback;
+    for (auto &p : dev_ctxes_) {
+      method = [method, p, this]() {
+        static_cast<platform::CUDADeviceContext *>(p.second)->RecordEvent(
+            events_.at(boost::get<platform::CUDAPlace>(p.first).device),
+            method);
+      };
+    }
+    method();
+  } else {
+#endif
+    callback();
+#ifdef PADDLE_WITH_CUDA
+  }
+#endif
+}
+
+void OpHandleBase::RunAndRecordEvent(platform::Place p,
+                                     const std::function<void()> &callback) {
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_cpu_place(p) || events_.empty()) {
+    callback();
+  } else {
+    auto *ctx = dev_ctxes_.at(p);
+    auto *cuda_ctx = static_cast<platform::CUDADeviceContext *>(ctx);
+    cuda_ctx->RecordEvent(events_.at(boost::get<platform::CUDAPlace>(p).device),
+                          callback);
+  }
+#else
+  callback();
+#endif
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..6aec178831161f8ac1306fc3ed72e3267ca3c7e5
--- /dev/null
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -0,0 +1,106 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <map>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+
+class OpHandleBase {
+ public:
+  OpHandleBase() {}
+
+  virtual ~OpHandleBase();
+
+  std::string DebugString() const;
+
+  virtual std::string Name() const = 0;
+
+  void Run(bool use_cuda);
+
+  virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);
+
+  void AddInput(VarHandleBase *in);
+
+  void AddOutput(VarHandleBase *out);
+
+  // This method adds the wait events of all the input on all the device
+  // context.
+  // NODE: This Wait is asynchronous operation.
+  virtual void WaitInputVarGenerated();
+
+  // This method adds the wait events of all the input on the specified device
+  // context.
+  // NODE: This Wait is asynchronous operation.
+  virtual void WaitInputVarGenerated(const platform::Place &place);
+
+  virtual bool NeedWait(VarHandleBase *in_var);
+
+  // If the Op involves data transfer of multiple devices that
+  // will likely block other computations.
+  virtual bool IsMultiDeviceTransfer() { return false; }
+
+  const platform::DeviceContext *DeviceContext(platform::Place place) {
+    return dev_ctxes_[place];
+  }
+
+  void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
+    dev_ctxes_[place] = ctx_;
+  }
+
+  const std::vector<VarHandleBase *> &Inputs() const { return inputs_; }
+
+  size_t NoDupInputSize() const {
+    std::unordered_set<VarHandleBase *> res;
+    for (auto *var : inputs_) {
+      res.emplace(var);
+    }
+    return res.size();
+  }
+
+  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }
+
+  size_t NoDummyInputSize() const;
+
+ protected:
+  void RunAndRecordEvent(const std::function<void()> &callback);
+
+  void RunAndRecordEvent(platform::Place p,
+                         const std::function<void()> &callback);
+
+  virtual void RunImpl() = 0;
+
+  std::vector<VarHandleBase *> inputs_;
+  std::vector<VarHandleBase *> outputs_;
+  std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::unordered_map<int, cudaEvent_t> events_;
+#endif
+
+  DISABLE_COPY_AND_ASSIGN(OpHandleBase);
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index d73604ad185a66ade0168f585d1951d0d7d4a5f9..eea7e712f8f6e187cdceedce77cc76d1d4ca2101 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <tuple>
+#include <vector>
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_proto_maker.h"
@@ -92,8 +95,8 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
   void operator()(const char* op_type, OpInfo* info) const {
     info->proto_ = new proto::OpProto;
     info->checker_ = new OpAttrChecker();
-    auto maker = T(info->proto_, info->checker_);
-    maker.Validate();
+    T maker;
+    maker(info->proto_, info->checker_);
     info->proto_->set_type(op_type);
     PADDLE_ENFORCE(
         info->proto_->IsInitialized(),
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0cd873a1d83fa8c2c7b7cd5acfaad9949bcff7d
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -0,0 +1,95 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <map>
+#include <vector>
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ReduceLoDTensor {
+  const std::vector<const LoDTensor *> &src_tensors_;
+  LoDTensor &dst_tensor_;
+
+  ReduceLoDTensor(const std::vector<const LoDTensor *> &src, LoDTensor *dst)
+      : src_tensors_(src), dst_tensor_(*dst) {}
+
+  template <typename T>
+  void operator()() const {
+    PADDLE_ENFORCE(!src_tensors_.empty());
+    auto &t0 = *src_tensors_[0];
+    PADDLE_ENFORCE_NE(t0.numel(), 0);
+    dst_tensor_.Resize(t0.dims());
+    T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
+    if (dst != t0.data<T>()) {
+      std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
+    }
+
+    for (size_t i = 1; i < src_tensors_.size(); ++i) {
+      auto &t = *src_tensors_[i];
+      PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
+      PADDLE_ENFORCE_EQ(t.type(), t0.type());
+      std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
+                     [](T a, T b) -> T { return a + b; });
+    }
+  }
+};
+
+inline void GatherSelectedRows(
+    const std::vector<const SelectedRows *> &src_selecte_rows_,
+    const std::vector<platform::Place> &in_places,
+    const std::map<platform::Place, platform::DeviceContext *> &dev_ctxes,
+    const platform::Place &out_place, SelectedRows *dst_selecte_rows) {
+  PADDLE_ENFORCE(!src_selecte_rows_.empty());
+
+  std::vector<Tensor> in_tensors;
+  std::vector<int64_t> out_rows;
+
+  for (auto in_sr_ptr : src_selecte_rows_) {
+    auto &in_sr = *in_sr_ptr;
+    in_tensors.emplace_back(in_sr.value());
+    out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end());
+  }
+
+  auto &pre_in = src_selecte_rows_[0];
+
+  auto &dst_tensor = *dst_selecte_rows;
+  dst_tensor.set_height(pre_in->height());
+  dst_tensor.set_rows(out_rows);
+  size_t rows = out_rows.size();
+  DDim out_dim = pre_in->GetCompleteDims();
+  out_dim[0] = static_cast<int64_t>(rows);
+  dst_tensor.mutable_value()->Resize(out_dim);
+  dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type());
+  Tensor *out_tensor = dst_tensor.mutable_value();
+
+  // copy
+  int s = 0, e = 0;
+  for (size_t j = 0; j < in_tensors.size(); ++j) {
+    e += in_tensors[j].dims()[0];
+    auto sub_out = out_tensor->Slice(s, e);
+    paddle::framework::TensorCopy(in_tensors[j], out_place,
+                                  *(dev_ctxes.at(in_places[j])), &sub_out);
+    s = e;
+  }
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7160e346dad0615e2fd32b70c096880af0359e1a
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -0,0 +1,165 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/reduce_op_handle.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void ReduceOpHandle::RunImpl() {
+  if (places_.size() == 1) return;
+  // the input and output may have dummy var.
+  auto in_var_handles = DynamicCast<VarHandle>(inputs_);
+
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), places_.size(),
+      "The number of output should equal to the number of places.");
+
+  VarHandle *out_var_handle;
+  {
+    auto out_var_handles = DynamicCast<VarHandle>(outputs_);
+
+    PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
+                      "The number of output should be one.");
+    out_var_handle = out_var_handles.front();
+  }
+
+  auto in_0_handle = in_var_handles[0];
+
+  std::vector<const Scope *> var_scopes;
+  for (auto *s : local_scopes_) {
+    var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get<Scope *>());
+  }
+
+  auto pre_in_var =
+      var_scopes.at(in_0_handle->scope_idx_)->FindVar(in_0_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(pre_in_var);
+
+  // Wait input done, this Wait is asynchronous operation
+  WaitInputVarGenerated();
+
+  // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
+  std::vector<platform::Place> in_places;  // used to get dev_ctx
+  for (auto *in_handle : in_var_handles) {
+    in_places.emplace_back(in_handle->place_);
+    auto in_var =
+        var_scopes.at(in_handle->scope_idx_)->FindVar(in_handle->name_);
+    PADDLE_ENFORCE_NOT_NULL(in_var);
+    VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
+  }
+
+  auto out_var =
+      var_scopes.at(out_var_handle->scope_idx_)->FindVar(out_var_handle->name_);
+  PADDLE_ENFORCE_NOT_NULL(out_var);
+
+  // NOTE: The tensors' Place of input and output must be all on GPU or all on
+  // CPU.
+  auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
+  platform::Place t_out_p;
+  if (platform::is_gpu_place(in_p)) {
+    PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place_),
+                   "Places of input and output must be all on GPU.");
+    t_out_p = out_var_handle->place_;
+  } else {
+    t_out_p = platform::CPUPlace();
+  }
+
+  if (pre_in_var->IsType<framework::SelectedRows>()) {
+    this->RunAndRecordEvent([&] {
+      std::vector<const SelectedRows *> in_selected_rows =
+          GetInputValues<SelectedRows>(in_var_handles, var_scopes);
+      GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p,
+                         out_var->GetMutable<framework::SelectedRows>());
+    });
+  } else {
+    std::vector<const LoDTensor *> lod_tensors =
+        GetInputValues<LoDTensor>(in_var_handles, var_scopes);
+    if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) {
+      this->RunAndRecordEvent([&] {
+        ReduceLoDTensor func(lod_tensors,
+                             out_var->GetMutable<framework::LoDTensor>());
+        VisitDataType(ToDataType(lod_tensors[0]->type()), func);
+      });
+    } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) {
+#ifdef PADDLE_WITH_CUDA
+      auto pre_in = pre_in_var->Get<framework::LoDTensor>();
+      VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
+      VariableVisitor::GetMutableTensor(out_var).mutable_data(
+          out_var_handle->place_, pre_in.type());
+
+      auto out_p = out_var_handle->place_;
+      int root_id = boost::get<platform::CUDAPlace>(out_p).device;
+      std::vector<std::function<void()>> all_reduce_calls;
+      for (size_t i = 0; i < var_scopes.size(); ++i) {
+        auto &p = in_places[i];
+        auto &lod_tensor = *lod_tensors[i];
+
+        int dev_id = boost::get<platform::CUDAPlace>(p).device;
+        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+
+        void *buffer = const_cast<void *>(lod_tensor.data<void>());
+        void *recvbuffer = nullptr;
+        if (root_id == dev_id) {
+          recvbuffer =
+              out_var->GetMutable<framework::LoDTensor>()->mutable_data(
+                  out_var_handle->place_);
+        }
+
+        int type = platform::ToNCCLDataType(lod_tensor.type());
+        size_t numel = static_cast<size_t>(lod_tensor.numel());
+        all_reduce_calls.emplace_back(
+            [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
+              PADDLE_ENFORCE(platform::dynload::ncclReduce(
+                  buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
+                  ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
+            });
+      }
+
+      this->RunAndRecordEvent([&] {
+        platform::NCCLGroupGuard guard;
+        for (auto &call : all_reduce_calls) {
+          call();
+        }
+      });
+#else
+      PADDLE_THROW("CUDA is not enabled.");
+#endif
+    } else {
+      PADDLE_THROW("Place should be CPUPlace or CUDAPlace.");
+    }
+  }
+}
+
+template <typename T>
+std::vector<const T *> ReduceOpHandle::GetInputValues(
+    const std::vector<VarHandle *> &in_var_handles,
+    const std::vector<const Scope *> &var_scopes) const {
+  std::vector<const T *> in_selected_rows;
+  for (auto *in_handle : in_var_handles) {
+    auto &in_sr = var_scopes.at(in_handle->scope_idx_)
+                      ->FindVar(in_handle->name_)
+                      ->Get<T>();
+    in_selected_rows.emplace_back(&in_sr);
+  }
+  return in_selected_rows;
+}
+
+std::string ReduceOpHandle::Name() const { return "reduce"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d14334cdfe06e2e805c2577458d6689e6324cc7
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -0,0 +1,71 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/platform/device_context.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ReduceOpHandle : public OpHandleBase {
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+
+#ifdef PADDLE_WITH_CUDA
+  const platform::NCCLContextMap *nccl_ctxs_;
+  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places,
+                 const platform::NCCLContextMap *nccl_ctxs)
+      : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
+    if (nccl_ctxs_) {
+      for (auto &p_ctx : nccl_ctxs_->contexts_) {
+        dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
+      }
+    }
+  }
+#else
+  ReduceOpHandle(const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places)
+      : local_scopes_(local_scopes), places_(places) {}
+#endif
+
+  std::string Name() const override;
+
+  bool IsMultiDeviceTransfer() override { return true; };
+
+ protected:
+  void RunImpl() override;
+
+  template <typename T>
+  std::vector<const T *> GetInputValues(
+      const std::vector<VarHandle *> &in_var_handles,
+      const std::vector<const Scope *> &var_scopes) const;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffdd7c14eb5097cc8285da090e4a72e1e3f43d86
--- /dev/null
+++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -0,0 +1,286 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/reduce_op_handle.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+// test data amount
+const f::DDim kDims = {20, 20};
+
+struct TestReduceOpHandle {
+  bool use_gpu_;
+  Scope g_scope_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<Scope *> param_scopes_;
+  std::unique_ptr<OpHandleBase> op_handle_;
+  std::vector<std::unique_ptr<VarHandleBase>> vars_;
+  std::vector<p::Place> gpu_list_;
+  std::vector<std::unique_ptr<p::DeviceContext>> ctxs_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
+
+  void WaitAll() {
+    for (size_t j = 0; j < ctxs_.size(); ++j) {
+      ctxs_[j]->Wait();
+    }
+#ifdef PADDLE_WITH_CUDA
+    if (nccl_ctxs_) {
+      nccl_ctxs_->WaitAll();
+    }
+#endif
+  }
+
+  void InitCtxOnGpu(bool use_gpu) {
+    use_gpu_ = use_gpu;
+    if (use_gpu) {
+#ifdef PADDLE_WITH_CUDA
+      int count = p::GetCUDADeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CUDAPlace(i);
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CUDADeviceContext(p));
+      }
+      nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+      int count = 8;
+      for (int i = 0; i < count; ++i) {
+        auto p = p::CPUPlace();
+        gpu_list_.push_back(p);
+        ctxs_.emplace_back(new p::CPUDeviceContext(p));
+      }
+#ifdef PADDLE_WITH_CUDA
+      nccl_ctxs_.reset(nullptr);
+#endif
+    }
+  }
+
+  void InitReduceOp(size_t out_scope_idx) {
+    // init scope
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      local_scopes_.push_back(&(g_scope_.NewScope()));
+      Scope &local_scope = local_scopes_.back()->NewScope();
+      *local_scopes_.back()
+           ->Var(details::kLocalExecScopeName)
+           ->GetMutable<Scope *>() = &local_scope;
+      local_scope.Var("input");
+      param_scopes_.emplace_back(&local_scope);
+    }
+    param_scopes_[out_scope_idx]->Var("out");
+
+    if (use_gpu_) {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      PADDLE_THROW("CUDA is not support.");
+#endif
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      op_handle_.reset(
+          new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
+#else
+      op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
+#endif
+    }
+
+    // init op handle
+    // add input
+    for (size_t j = 0; j < gpu_list_.size(); ++j) {
+      if (!use_gpu_) {
+        op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
+      }
+      auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
+      in_var_handle->generated_op_ = nullptr;
+      vars_.emplace_back(in_var_handle);
+      op_handle_->AddInput(in_var_handle);
+    }
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle *in_dummy_var_handle =
+        static_cast<DummyVarHandle *>(vars_.back().get());
+    in_dummy_var_handle->generated_op_ = nullptr;
+    op_handle_->AddInput(in_dummy_var_handle);
+
+    // add output
+    auto *out_var_handle =
+        new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]);
+    vars_.emplace_back(out_var_handle);
+    op_handle_->AddOutput(out_var_handle);
+
+    // add dummy var
+    vars_.emplace_back(new DummyVarHandle());
+    DummyVarHandle *dummy_var_handle =
+        static_cast<DummyVarHandle *>(vars_.back().get());
+    op_handle_->AddOutput(dummy_var_handle);
+  }
+
+  void TestReduceSelectedRows(size_t output_scope_idx) {
+    int height = kDims[0] * 2;
+    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
+                              2, 4, 6, 3, 1, 1, 1,  1, 3, 7};
+    std::vector<float> send_vector(f::product(kDims));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+      PADDLE_ENFORCE_NOT_NULL(in_var);
+      auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+      auto value = in_selected_rows->mutable_value();
+      value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+
+      in_selected_rows->set_height(height);
+      in_selected_rows->set_rows(rows);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), value);
+      value->Resize(kDims);
+    }
+
+    auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
+
+    auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
+    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
+
+    out_selected_rows->mutable_value()->ShareDataWith(
+        in_selected_rows->value());
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto &out_select_rows = out_var->Get<f::SelectedRows>();
+    auto rt = out_select_rows.value();
+
+    PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal.");
+    for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
+      PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]);
+    }
+
+    f::Tensor result_tensor;
+    f::TensorCopySync(rt, cpu_place, &result_tensor);
+    float *ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
+    }
+  }
+
+  void TestReduceLodTensors(size_t output_scope_idx) {
+    std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
+    for (size_t k = 0; k < send_vector.size(); ++k) {
+      send_vector[k] = k;
+    }
+    f::LoD lod{{0, 10, 20}};
+
+    for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
+         ++input_scope_idx) {
+      auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
+      PADDLE_ENFORCE_NOT_NULL(in_var);
+      auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
+      in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
+      in_lod_tensor->set_lod(lod);
+
+      paddle::framework::TensorFromVector<float>(
+          send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor);
+    }
+
+    auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
+    PADDLE_ENFORCE_NOT_NULL(out_var);
+    auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
+
+    auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
+    auto in_lodtensor = in_var->Get<f::LoDTensor>();
+
+    out_lodtensor->ShareDataWith(in_lodtensor);
+
+    op_handle_->Run(false);
+
+    WaitAll();
+
+    p::CPUPlace cpu_place;
+
+    auto &rt = out_var->Get<f::LoDTensor>();
+
+    f::Tensor result_tensor;
+    f::TensorCopySync(rt, cpu_place, &result_tensor);
+    float *ct = result_tensor.data<float>();
+
+    for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
+      ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
+    }
+  }
+};
+
+TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
+  TestReduceOpHandle test_op;
+  size_t out_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitReduceOp(out_scope_idx);
+  test_op.TestReduceSelectedRows(out_scope_idx);
+}
+TEST(ReduceTester, TestCPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t out_scope_idx = 0;
+  test_op.InitCtxOnGpu(false);
+  test_op.InitReduceOp(out_scope_idx);
+  test_op.TestReduceLodTensors(out_scope_idx);
+}
+#ifdef PADDLE_WITH_CUDA
+
+TEST(ReduceTester, TestGPUReduceTestSelectedRows) {
+  TestReduceOpHandle test_op;
+  size_t out_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(out_scope_idx);
+  test_op.TestReduceSelectedRows(out_scope_idx);
+}
+
+TEST(ReduceTester, TestGPUReduceTestLodTensor) {
+  TestReduceOpHandle test_op;
+  size_t out_scope_idx = 0;
+  test_op.InitCtxOnGpu(true);
+  test_op.InitReduceOp(out_scope_idx);
+  test_op.TestReduceLodTensors(out_scope_idx);
+}
+#endif
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..586465f99fd94117c821be2952bffda385fbcf75
--- /dev/null
+++ b/paddle/fluid/framework/details/rpc_op_handle.cc
@@ -0,0 +1,51 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/rpc_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
+                         const Scope *local_scope, const std::string &name,
+                         const platform::Place &place)
+    : op_(framework::OpRegistry::CreateOp(op_desc)),
+      local_scope_(local_scope),
+      name_(name),
+      place_(place) {}
+
+void RPCOpHandle::RunImpl() {
+  // TODO(wuyi): need further analysis whether wait VarDummyHandle.
+  // Wait input done
+  for (auto *in : inputs_) {
+    auto &p = static_cast<VarHandle *>(in)->place_;
+    // FIXME(Yancey1989): need a better solution instead of use DebugString()
+    if (in->DebugString() == "dummy") {  // HACK
+      continue;
+    }
+    if (in->generated_op_) {
+      in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[p]);
+    }
+  }
+  auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  // FIXME(wuyi): can not use RunAndRecordEvent here, for it will cause dead
+  // lock.
+  op_->Run(*tmp_scope, place_);
+}
+
+std::string RPCOpHandle::Name() const { return name_; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae38c7fe19e102a330455d89a1068414a7835fab
--- /dev/null
+++ b/paddle/fluid/framework/details/rpc_op_handle.h
@@ -0,0 +1,52 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct RPCOpHandle : public OpHandleBase {
+  RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
+              const std::string& name, const platform::Place& place);
+
+  std::string Name() const override;
+
+  // Delay and buffer nccl_all_reduce together can significantly increase
+  // performance. Disable this feature by returning false.
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  std::unique_ptr<OperatorBase> op_;
+  const Scope* local_scope_;
+  const std::string name_;
+  platform::Place place_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9c387e79dc71288e7330597fed57171d447f31b
--- /dev/null
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -0,0 +1,59 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope,
+                                             platform::Place place,
+                                             platform::DeviceContext *dev_ctx)
+    : coeff_(static_cast<float>(1.0 / num_dev)), scope_(scope), place_(place) {
+  dev_ctxes_[place_] = dev_ctx;
+}
+
+ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {}
+
+void ScaleLossGradOpHandle::RunImpl() {
+  // Doesn't wait any event
+  std::string var_name = static_cast<VarHandle *>(this->outputs_[0])->name_;
+  auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+
+  float *tmp = local_scope.FindVar(var_name)
+                   ->GetMutable<LoDTensor>()
+                   ->mutable_data<float>(make_ddim({1}), place_);
+
+  if (platform::is_cpu_place(place_)) {
+    *tmp = coeff_;
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    this->RunAndRecordEvent([&] {
+      auto stream =
+          static_cast<platform::CUDADeviceContext *>(this->dev_ctxes_[place_])
+              ->stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
+                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
+      VLOG(1) << place_ << "RUN Scale loss grad op";
+    });
+#endif
+  }
+}
+
+std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..d93d599d46f130cf98f39f15697ce994a31e20c3
--- /dev/null
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h
@@ -0,0 +1,46 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ScaleLossGradOpHandle : public OpHandleBase {
+  ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
+                        platform::DeviceContext *context);
+
+  ~ScaleLossGradOpHandle() final;
+
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+
+ private:
+  float coeff_;
+  Scope *scope_;
+  platform::Place place_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb4e7ec52f907f9403e21ec2734d61824f51a58b
--- /dev/null
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/executor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
+    ExecutionStrategy strategy, std::vector<Scope *> local_scopes,
+    std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+    std::unique_ptr<SSAGraphExecutor> &&underlying_executor)
+    : strategy_(std::move(strategy)),
+      underlying_executor_(std::move(underlying_executor)),
+      local_scopes_(std::move(local_scopes)),
+      var_infos_(std::move(var_infos)),
+      places_(std::move(places)) {}
+
+FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  if (drop_scope_counter_ == 0) {
+    // Create local scopes.
+    for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) {
+      auto &scope = *it;
+      Scope &local_scope = scope->NewScope();
+      *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+          &local_scope;
+
+      for (auto &info : var_infos_) {
+        if (scope->FindVar(info.name_) != nullptr) {
+          continue;
+        }
+
+        if (info.persistable_) {  // Persistable
+          InitializeVariable(scope->Var(info.name_), info.type_);
+        } else {
+          InitializeVariable(local_scope.Var(info.name_), info.type_);
+        }
+      }
+    }
+  }
+
+  auto fetch_data = underlying_executor_->Run(fetch_tensors);
+  drop_scope_counter_ += 1;
+  if (!fetch_tensors.empty() ||
+      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    drop_scope_counter_ = 0;
+    // Wait All computational streams
+    for (auto p : places_) {
+      platform::DeviceContextPool::Instance().Get(p)->Wait();
+    }
+    for (auto &scope : local_scopes_) {
+      auto &local_scope =
+          *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+      scope->DeleteScope(local_scope);
+    }
+  }
+  return fetch_data;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..20df7a4722d589ffd168f842e927cff8411096bb
--- /dev/null
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct VariableInfo {
+  std::string name_;
+  proto::VarType::Type type_;
+  bool persistable_;
+};
+
+class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  ScopeBufferedSSAGraphExecutor(
+      ExecutionStrategy strategy, std::vector<Scope*> local_scopes,
+      std::vector<VariableInfo> var_infos, std::vector<platform::Place> places,
+      std::unique_ptr<SSAGraphExecutor>&& underlying_executor);
+  FeedFetchList Run(const std::vector<std::string>& fetch_tensors) override;
+
+ private:
+  size_t drop_scope_counter_{0};
+
+  ExecutionStrategy strategy_;
+  std::unique_ptr<SSAGraphExecutor> underlying_executor_;
+  std::vector<Scope*> local_scopes_;
+  std::vector<VariableInfo> var_infos_;
+  std::vector<platform::Place> places_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph.cc b/paddle/fluid/framework/details/ssa_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1b8c889449059c563ea39f86250075ac2537cdbe
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph.cc
@@ -0,0 +1,15 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph.h"
diff --git a/paddle/fluid/framework/details/ssa_graph.h b/paddle/fluid/framework/details/ssa_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..e996a00c162186e47e77d007503ac67caa9f8024
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph.h
@@ -0,0 +1,49 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// A SSA graph used by parallel executor.
+struct SSAGraph {
+  // all variable in each devices.
+  // The outside vector is the device vector. Each element of this vector is a
+  // map from variable name to variables. The variables, who have the same name,
+  // will have a different version. The offset in the
+  // `std::vector<std::unique_ptr<VarHandle>>` is the version of varaibles.
+  std::vector<
+      std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
+      vars_;
+
+  // aux variables to represent dependency. Useful to resolve data hazard.
+  std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
+
+  // all operators. NOTE that even we use a vector here, the operators is
+  // unordered.
+  std::vector<std::unique_ptr<OpHandleBase>> ops_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88a21f48879a15450051ad94ed76e1c48bf23014
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -0,0 +1,88 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include <utility>
+
+namespace paddle {
+namespace framework {
+namespace details {
+void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
+  for (auto &var_map : graph->vars_) {
+    for (auto &name_pair : var_map) {
+      if (name_pair.second.size() <= 1) {
+        continue;
+      }
+      auto it_new = name_pair.second.rbegin();
+      auto it_old = name_pair.second.rbegin();
+      ++it_old;
+      for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
+        auto *write_op = (*it_new)->generated_op_;
+        auto &read_ops = (*it_old)->pending_ops_;
+
+        for (auto *read_op : read_ops) {
+          // Manually add a dependency var from read_op to write_op;
+          if (read_op == write_op) {
+            // Read Write is the same op.
+            continue;
+          }
+
+          auto *dep_var = new DummyVarHandle();
+          read_op->AddOutput(dep_var);
+          write_op->AddInput(dep_var);
+          graph->dep_vars_.emplace(dep_var);
+        }
+      }
+    }
+  }
+}
+
+VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
+    SSAGraph *graph, const std::string &each_var_name,
+    const platform::Place &place, size_t place_offset) {
+  auto &var_holders = graph->vars_[place_offset];
+  auto &var_holder = var_holders[each_var_name];
+  VarHandle *var = nullptr;
+  if (var_holder.empty()) {
+    var = new VarHandle(0, place_offset, each_var_name, place);
+    var_holder.emplace_back(var);
+  } else {
+    var = var_holder.rbegin()->get();
+  }
+  return var;
+}
+
+void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
+                                     const std::string &each_var_name,
+                                     const platform::Place &place,
+                                     size_t place_offset) {
+  auto &vars = graph->vars_[place_offset][each_var_name];
+  size_t version = vars.size();
+  auto var = new VarHandle(version, place_offset, each_var_name, place);
+  vars.emplace_back(var);
+  op_handle->AddOutput(var);
+}
+
+void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
+  for (auto &op : graph->ops_) {
+    if (!op->Outputs().empty()) {
+      continue;
+    }
+    auto *dummy_leaf = new DummyVarHandle();
+    graph->dep_vars_.emplace(dummy_leaf);
+    op->AddOutput(dummy_leaf);
+  }
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..18612c3c1b62cf4c2ebdc221c301c59ec81c2da7
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_builder.h
@@ -0,0 +1,62 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/framework/details/ssa_graph.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class SSAGraphBuilder {
+ public:
+  SSAGraphBuilder() {}
+  virtual ~SSAGraphBuilder() {}
+  virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
+  virtual int GetVarDeviceID(const std::string &var_name) const = 0;
+
+  DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
+
+ protected:
+  /**
+   * We only handle write after read(WAR), since it should not have a write
+   * after write in program. If there are write after write operators, we need
+   * prune them.
+   *
+   * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
+   */
+  static void PolishGraphToSupportDataHazards(SSAGraph *graph);
+
+  static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph,
+                                               const std::string &each_var_name,
+                                               const platform::Place &place,
+                                               size_t place_offset);
+
+  // Add an output variable (each_var_name, place, place_offset) to op_handle,
+  // which belongs to graph
+  static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
+                             const std::string &each_var_name,
+                             const platform::Place &place, size_t place_offset);
+
+  static void AddOutputToLeafOps(SSAGraph *graph);
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4b49d3de6da2e5fd7836668619e42d10bb6b35a
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
+#include <fstream>
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+std::unique_ptr<SSAGraphBuilder> SSAGraphBuilderFactory::Create() {
+  std::unique_ptr<SSAGraphBuilder> res(
+#ifdef PADDLE_WITH_CUDA
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, nccl_ctxs_, strategy_)
+#else
+      new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_,
+                                  local_scopes_, strategy_)
+#endif
+          );  // NOLINT
+
+  if (!strategy_.debug_graphviz_path_.empty()) {
+    std::unique_ptr<std::ostream> fout(
+        new std::ofstream(strategy_.debug_graphviz_path_));
+    PADDLE_ENFORCE(fout->good());
+    std::unique_ptr<GraphvizSSAGraphPrinter> graphviz_printer(
+        new GraphvizSSAGraphPrinter());
+    res.reset(new SSAGraghBuilderWithPrinter(
+        std::move(fout), std::move(graphviz_printer), std::move(res)));
+  }
+  res.reset(new SSAGraghBuilderWithChecker(std::move(res)));
+
+  return res;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..91a119de83ed3d1573803e48faf86c874eed98d6
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/place.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+
+class SSAGraphBuilderFactory {
+ public:
+  SSAGraphBuilderFactory(const std::vector<platform::Place>& places,
+                         const std::string& loss_var_name,
+                         const std::unordered_set<std::string>& param_names,
+                         const std::vector<Scope*>& local_scopes,
+                         const BuildStrategy& strategy)
+      : places_(places),
+        loss_var_name_(loss_var_name),
+        param_names_(param_names),
+        local_scopes_(local_scopes),
+        strategy_(strategy) {
+#ifdef PADDLE_WITH_CUDA
+    nccl_ctxs_ = nullptr;
+#endif
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) {
+    nccl_ctxs_ = nccl_ctxs;
+  }
+#endif
+
+  std::unique_ptr<SSAGraphBuilder> Create();
+
+ private:
+  std::vector<platform::Place> places_;
+  std::string loss_var_name_;
+  std::unordered_set<std::string> param_names_;
+  std::vector<Scope*> local_scopes_;
+  BuildStrategy strategy_;
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap* nccl_ctxs_;
+#endif
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da5428946ee588e8eac1f78929dc0432df532975
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_checker.cc
@@ -0,0 +1,87 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
+  std::unordered_map<OpHandleBase *, size_t> pending_ops;
+  std::unordered_set<VarHandleBase *> pending_vars;
+  std::unordered_set<VarHandleBase *> ready_vars;
+  std::unordered_set<OpHandleBase *> ready_ops;
+
+  auto insert_pending_var = [&](VarHandleBase *var) {
+    pending_vars.insert(var);
+    if (var->generated_op_ == nullptr) {
+      ready_vars.emplace(var);
+    }
+  };
+
+  for (auto &var_map : graph->vars_) {
+    for (auto &name_pair : var_map) {
+      for (auto &version_pair : name_pair.second) {
+        insert_pending_var(version_pair.get());
+      }
+    }
+  }
+
+  for (auto &var : graph->dep_vars_) {
+    insert_pending_var(var.get());
+  }
+
+  for (auto &op : graph->ops_) {
+    if (op->Inputs().empty()) {
+      ready_ops.insert(op.get());
+    } else {
+      pending_ops.insert({op.get(), op.get()->NoDupInputSize()});
+    }
+  }
+
+  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
+    for (auto *op : set) {
+      for (auto out : op->Outputs()) {
+        ready_vars.emplace(out);
+      }
+    }
+    set.clear();
+  };
+
+  while (!pending_vars.empty()) {
+    run_all_ops(ready_ops);
+
+    if (ready_vars.empty()) {
+      return false;
+    }
+
+    for (auto ready_var : ready_vars) {
+      pending_vars.erase(ready_var);
+      for (auto *op : ready_var->pending_ops_) {
+        auto &deps = --pending_ops[op];
+        if (deps == 0) {
+          ready_ops.insert(op);
+        }
+      }
+    }
+    ready_vars.clear();
+  }
+  return true;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h
new file mode 100644
index 0000000000000000000000000000000000000000..331aa9d2b5864c470dbd5e29ef6faccffdcf781c
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_checker.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+struct SSAGraph;
+
+class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
+ public:
+  explicit SSAGraghBuilderWithChecker(
+      std::unique_ptr<SSAGraphBuilder>&& builder)
+      : builder_(std::move(builder)) {}
+
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    PADDLE_ENFORCE(IsValidGraph(graph.get()));
+    return graph;
+  }
+
+  int GetVarDeviceID(const std::string& var_name) const override {
+    return builder_->GetVarDeviceID(var_name);
+  }
+
+  bool IsValidGraph(const SSAGraph* graph) const;
+
+ private:
+  std::unique_ptr<SSAGraphBuilder> builder_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..09b97bd0d98dc4ad1124dcbc495cff921bf03efc
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -0,0 +1,24 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+SSAGraphExecutor::~SSAGraphExecutor() {}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..958086033607a4ed8fb840f5b14fe5779625bd82
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -0,0 +1,39 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/ssa_graph.h"
+#include "paddle/fluid/framework/feed_fetch_type.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+class SSAGraphExecutor {
+  DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
+
+ public:
+  SSAGraphExecutor() {}
+
+  virtual ~SSAGraphExecutor();
+
+  virtual FeedFetchList Run(const std::vector<std::string> &fetch_tensors) = 0;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22a40ca4b25cdd8ed9856b6c71bffc79561edcac
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename Callback>
+static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
+  for (auto &each : graph.vars_) {
+    for (auto &pair1 : each) {
+      for (auto &pair2 : pair1.second) {
+        callback(*pair2);
+      }
+    }
+  }
+
+  for (auto &var : graph.dep_vars_) {
+    callback(*var);
+  }
+}
+
+void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
+                                    std::ostream &sout) const {
+  size_t var_id = 0;
+  std::unordered_map<const VarHandleBase *, size_t> vars;
+
+  sout << "digraph G {\n";
+
+  IterAllVar(graph, [&](const VarHandleBase &var) {
+    auto *var_ptr = &var;
+    auto *var_handle_ptr = dynamic_cast<const VarHandle *>(var_ptr);
+    auto *dummy_ptr = dynamic_cast<const DummyVarHandle *>(var_ptr);
+
+    size_t cur_var_id = var_id++;
+    vars[var_ptr] = cur_var_id;
+
+    if (var_handle_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_
+           << "\\n"
+           << var_handle_ptr->place_ << "\\n"
+           << var_handle_ptr->version_ << "\"]" << std::endl;
+    } else if (dummy_ptr) {
+      sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl;
+    }
+  });
+
+  size_t op_id = 0;
+  for (auto &op : graph.ops_) {
+    std::string op_name = "op_" + std::to_string(op_id++);
+    sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
+         << std::endl;
+    for (auto in : op->Inputs()) {
+      std::string var_name = "var_" + std::to_string(vars[in]);
+      sout << var_name << " -> " << op_name << std::endl;
+    }
+
+    for (auto out : op->Outputs()) {
+      std::string var_name = "var_" + std::to_string(vars[out]);
+      sout << op_name << " -> " << var_name << std::endl;
+    }
+  }
+
+  sout << "}\n";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h
new file mode 100644
index 0000000000000000000000000000000000000000..09b0333ef2cb43a306133aa5af98d37c11454d4d
--- /dev/null
+++ b/paddle/fluid/framework/details/ssa_graph_printer.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include <string>
+#include "paddle/fluid/framework/details/ssa_graph_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+struct SSAGraph;
+class SSAGraphPrinter {
+ public:
+  virtual ~SSAGraphPrinter() {}
+  virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
+};
+
+class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
+ public:
+  void Print(const SSAGraph& graph, std::ostream& sout) const override;
+};
+
+class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
+ public:
+  SSAGraghBuilderWithPrinter(std::ostream& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ref_(sout) {}
+
+  SSAGraghBuilderWithPrinter(std::unique_ptr<std::ostream>&& sout,
+                             std::unique_ptr<SSAGraphPrinter>&& printer,
+                             std::unique_ptr<SSAGraphBuilder>&& builder)
+      : printer_(std::move(printer)),
+        builder_(std::move(builder)),
+        stream_ptr_(std::move(sout)),
+        stream_ref_(*stream_ptr_) {}
+
+  std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
+    auto graph = builder_->Build(program);
+    printer_->Print(*graph, stream_ref_);
+    return graph;
+  }
+
+  int GetVarDeviceID(const std::string& var_name) const override {
+    return builder_->GetVarDeviceID(var_name);
+  }
+
+ private:
+  std::unique_ptr<SSAGraphPrinter> printer_;
+  std::unique_ptr<SSAGraphBuilder> builder_;
+  std::unique_ptr<std::ostream> stream_ptr_;
+  std::ostream& stream_ref_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99b10254a7961bf7b27b256acaece573a71c4115
--- /dev/null
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -0,0 +1,232 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<SSAGraph> &&graph)
+    : graph_(std::move(graph)),
+      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
+                                       : nullptr),
+      local_scopes_(local_scopes),
+      places_(places),
+      fetch_ctxs_(places),
+      running_ops_(0),
+      strategy_(strategy) {}
+
+FeedFetchList ThreadedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  std::unordered_map<OpHandleBase *, size_t> pending_ops;
+  std::unordered_set<VarHandleBase *> pending_vars;
+  BlockingQueue<VarHandleBase *> ready_vars;
+  std::unordered_set<OpHandleBase *> ready_ops;
+  // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
+  // streams from multiple GPUs, it's faster to buffer them and schedule
+  // together since we currently cannot overlap computation and memcpy streams.
+  // Should revisit it if overlapping is available.
+  std::unordered_set<OpHandleBase *> delayed_ops;
+
+  // Transform SSAGraph to pending_ops & pending_vars
+  for (auto &var_map : graph_->vars_) {
+    for (auto &name_pair : var_map) {
+      for (auto &version_pair : name_pair.second) {
+        InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
+      }
+    }
+  }
+  for (auto &var : graph_->dep_vars_) {
+    InsertPendingVar(&pending_vars, &ready_vars, var.get());
+  }
+
+  for (auto &op : graph_->ops_) {
+    if (op->Inputs().empty()) {  // Special case, Op has no input.
+      ready_ops.insert(op.get());
+    } else {
+      InsertPendingOp(&pending_ops, op.get());
+    }
+  }
+
+  // Step 2. Insert FetchOps
+  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+  std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
+  FeedFetchList fetch_data(fetch_tensors.size());
+
+  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
+                 &pending_vars, &ready_vars, &fetch_data);
+
+  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
+    for (auto *op : set) {
+      running_ops_++;
+      RunOp(&ready_vars, op);
+    }
+    set.clear();
+  };
+
+  // Step 3. Execution
+  while (!pending_vars.empty()) {
+    // 1. Run All Ready ops
+    // Keep loop until all vars are ready.
+    //
+    // NOTE: DelayedOps have a lower priority. It will be scheduled after all
+    // ready_ops have been performed.
+    if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) {
+      run_all_ops(delayed_ops);
+    } else {
+      run_all_ops(ready_ops);
+    }
+
+    // 2. Find ready variable
+    bool timeout;
+    auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
+
+    if (timeout) {
+      std::lock_guard<std::mutex> l(exception_mu_);
+      if (exception_) {
+        std::exception *exp = exception_.get();
+        if (dynamic_cast<platform::EOFException *>(exp)) {
+          auto e = *static_cast<platform::EOFException *>(exp);
+          exception_.reset();
+          throw e;
+        } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
+          auto e = *static_cast<platform::EnforceNotMet *>(exp);
+          exception_.reset();
+          throw e;
+        } else {
+          LOG(FATAL) << "Unknown exception.";
+        }
+      } else {
+        continue;
+      }
+    }
+    // 3. Remove the dependency of ready_var.
+    // Find the ready_ops after the ready_var.
+    for (auto ready_var : cur_ready_vars) {
+      pending_vars.erase(ready_var);
+      for (auto *op : ready_var->pending_ops_) {
+        auto &deps = pending_ops[op];
+        --deps;
+        if (deps == 0) {
+          if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) {
+            delayed_ops.insert(op);
+          } else {
+            ready_ops.insert(op);
+          }
+        }
+      }
+    }
+  }
+  PADDLE_ENFORCE(ready_ops.empty());
+
+  // Wait FetchOps.
+  if (!fetch_ops.empty()) {
+    fetch_ops.clear();
+  }
+
+  return fetch_data;
+}
+
+void ThreadedSSAGraphExecutor::InsertFetchOps(
+    const std::vector<std::string> &fetch_tensors,
+    std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+    std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
+    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+    std::unordered_set<VarHandleBase *> *pending_vars,
+    BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data) {
+  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
+
+  for (auto &fetch_var_name : fetch_tensors) {
+    for (auto &var_map : graph_->vars_) {
+      auto it = var_map.find(fetch_var_name);
+      if (it != var_map.end()) {
+        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
+      }
+    }
+  }
+
+  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
+    auto &var_name = fetch_tensors[i];
+    auto &vars = fetched_vars.at(var_name);
+    auto *op = new FetchOpHandle(fetch_data, i, &local_scopes_);
+    fetch_ops->emplace_back(op);
+
+    for (auto &p : places_) {
+      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
+    }
+
+    for (auto *var : vars) {
+      op->AddInput(var);
+    }
+
+    auto *fetch_dummy = new DummyVarHandle();
+    op->AddOutput(fetch_dummy);
+    fetch_dependencies->emplace(fetch_dummy);
+    this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
+    this->InsertPendingOp(pending_ops, op);
+  }
+}
+
+void ThreadedSSAGraphExecutor::InsertPendingOp(
+    std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+    OpHandleBase *op_instance) const {
+  pending_ops->insert({op_instance, op_instance->NoDupInputSize()});
+}
+
+void ThreadedSSAGraphExecutor::InsertPendingVar(
+    std::unordered_set<VarHandleBase *> *pending_vars,
+    BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
+  pending_vars->insert(var);
+  if (var->generated_op_ == nullptr) {
+    ready_vars->Push(var);
+  }
+}
+
+void ThreadedSSAGraphExecutor::RunOp(
+    BlockingQueue<VarHandleBase *> *ready_var_q, details::OpHandleBase *op) {
+  auto op_run = [ready_var_q, op, this] {
+    try {
+      if (VLOG_IS_ON(10)) {
+        VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      }
+      op->Run(strategy_.use_cuda_);
+      VLOG(10) << op << " " << op->Name() << " Done ";
+      running_ops_--;
+      ready_var_q->Extend(op->Outputs());
+      VLOG(10) << op << " " << op->Name() << "Signal posted";
+    } catch (platform::EOFException ex) {
+      std::lock_guard<std::mutex> l(exception_mu_);
+      // EOFException will not cover up existing EnforceNotMet.
+      if (exception_.get() == nullptr) {
+        exception_.reset(new platform::EOFException(ex));
+      }
+    } catch (platform::EnforceNotMet ex) {
+      std::lock_guard<std::mutex> l(exception_mu_);
+      exception_.reset(new platform::EnforceNotMet(ex));
+    } catch (...) {
+      LOG(FATAL) << "Unknown exception catched";
+    }
+  };
+  if (pool_) {
+    pool_->enqueue(op_run);
+  } else {
+    op_run();
+  }
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..c69e0487e2e503a0d445300aa2fd6bb9c30b06c9
--- /dev/null
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -0,0 +1,84 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <functional>
+#include "ThreadPool.h"  // ThreadPool in thrird party
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace details {
+
+class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
+                           const std::vector<Scope *> &local_scopes,
+                           const std::vector<platform::Place> &places,
+                           std::unique_ptr<SSAGraph> &&graph);
+
+  // Run a SSAGraph by a thread pool
+  // Use topological sort algorithm
+  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
+
+  ~ThreadedSSAGraphExecutor() {}
+
+ private:
+  void RunOp(BlockingQueue<VarHandleBase *> *ready_var_q,
+             details::OpHandleBase *op);
+
+ private:
+  std::unique_ptr<SSAGraph> graph_;
+  std::unique_ptr<::ThreadPool> pool_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+  platform::DeviceContextPool fetch_ctxs_;
+  std::mutex exception_mu_;
+  std::unique_ptr<std::exception> exception_;
+  std::atomic<int> running_ops_;
+
+  void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+                       OpHandleBase *op_instance) const;
+
+  void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars,
+                        BlockingQueue<VarHandleBase *> *ready_vars,
+                        VarHandleBase *var) const;
+
+  void InsertFetchOps(
+      const std::vector<std::string> &fetch_tensors,
+      std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
+      std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
+      std::unordered_map<OpHandleBase *, size_t> *pending_ops,
+      std::unordered_set<VarHandleBase *> *pending_vars,
+      BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data);
+
+ private:
+  ExecutionStrategy strategy_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6f00abd9473a84a77ed1a39015e2ae079e00be79
--- /dev/null
+++ b/paddle/fluid/framework/details/var_handle.cc
@@ -0,0 +1,32 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/var_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+VarHandleBase::~VarHandleBase() {}
+
+std::string VarHandle::DebugString() const {
+  std::stringstream ss;
+  ss << name_ << ":" << place_;
+  return ss.str();
+}
+
+std::string DummyVarHandle::DebugString() const { return "dummy"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..cae9af7217660fb7e4b8535ee8e022fb3a127668
--- /dev/null
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -0,0 +1,78 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <utility>
+
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+class OpHandleBase;
+
+// VarHandleBase is the var node in the dependency graph.
+// A variable can only be generated by a single operator. i.e.
+// This is a single assignment graph.
+struct VarHandleBase {
+  virtual ~VarHandleBase();
+  virtual std::string DebugString() const = 0;
+
+  // The operator who generate this variable. nullptr if the variable
+  // is a root node.
+  OpHandleBase* generated_op_{nullptr};
+
+  // Operators which depend on this variable ready.
+  std::unordered_set<OpHandleBase*> pending_ops_;
+};
+
+// VarHandle is actually a single version of Runtime Variable.
+// Variable in Runtime mapped to many VarHandles in Graph.
+// Each assignment will generate a new var handle with newer version.
+//
+// NOTE: runtime variables have place.
+struct VarHandle : public VarHandleBase {
+  std::string DebugString() const override;
+
+  VarHandle(size_t version, size_t scope_index, std::string name,
+            platform::Place place)
+      : version_(version),
+        scope_idx_(scope_index),
+        name_(std::move(name)),
+        place_(std::move(place)) {}
+
+  // version field currently is not used, however, just store the version to
+  // debug easily.
+  size_t version_;
+  size_t scope_idx_;
+  std::string name_;
+  platform::Place place_;
+
+  bool IsTheSameVar(const VarHandle& o) const {
+    return o.generated_op_ == generated_op_ && o.name_ == name_ &&
+           o.scope_idx_ == scope_idx_;
+  }
+};
+
+// Dummy Variable. It is used to represent dependencies between operators
+struct DummyVarHandle : public VarHandleBase {
+  std::string DebugString() const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3dfd14419d94379a0bf79f55d7a139acd77cbd7e
--- /dev/null
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -0,0 +1,139 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+namespace paddle {
+namespace framework {
+namespace details {
+template <typename Func>
+static void VisitVariable(Variable* var, Func* func) {
+  if (var->IsType<LoDTensor>()) {
+    (*func)(var->GetMutable<LoDTensor>());
+  } else if (var->IsType<SelectedRows>()) {
+    (*func)(var->GetMutable<SelectedRows>());
+  } else {
+    PADDLE_THROW("Not supported type %s", var->Type().name());
+  }
+}
+
+template <typename Func>
+static void VisitVariable(const Variable& var, Func* func) {
+  if (var.IsType<LoDTensor>()) {
+    (*func)(var.Get<LoDTensor>());
+  } else if (var.IsType<SelectedRows>()) {
+    (*func)(var.Get<SelectedRows>());
+  } else {
+    PADDLE_THROW("Not supported type %s", var.Type().name());
+  }
+}
+
+struct TensorVisitor {
+  Tensor* result_{nullptr};
+
+  void operator()(LoDTensor* tensor) { result_ = tensor; }
+
+  void operator()(SelectedRows* selected_rows) {
+    result_ = selected_rows->mutable_value();
+  }
+
+  template <typename T>
+  void operator()() {
+    PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
+  }
+};
+
+Tensor& VariableVisitor::GetMutableTensor(Variable* var) {
+  TensorVisitor vistor;
+  VisitVariable(var, &vistor);
+  return *vistor.result_;
+}
+
+struct ShareDimsAndLoDVisitor {
+  Variable* trg_;
+  void operator()(const LoDTensor& val) {
+    auto* tensor = trg_->GetMutable<LoDTensor>();
+    tensor->set_layout(val.layout());
+    tensor->set_lod(val.lod());
+    tensor->Resize(val.dims());
+  }
+
+  void operator()(const SelectedRows& val) {
+    auto* selected_rows = trg_->GetMutable<SelectedRows>();
+    selected_rows->set_rows(val.rows());
+    selected_rows->set_height(val.height());
+    selected_rows->mutable_value()->Resize(val.value().dims());
+  }
+
+  template <typename T>
+  void operator()(const T&) {
+    PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
+                   typeid(T).name());
+  }
+};
+
+void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
+  ShareDimsAndLoDVisitor visitor{trg};
+  VisitVariable(src, &visitor);
+}
+
+struct EnforceShapeAndDTypeEQVisitor {
+  const Variable* trg_;
+
+  void operator()(const LoDTensor& src) {
+    auto& tensor = trg_->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(
+        src.place().which(), tensor.place().which(),
+        "The Places of the two Variable must be all on CPU or all on GPU.");
+    PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
+                      "The dtype of the two Variable is not equal.");
+    PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
+                      "The dims of the two Variable is not equal.");
+    PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
+                      "The lod of the two Variable is not equal.");
+    PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
+                      "The layout of the two Variable's tensor is not equal.");
+  }
+
+  void operator()(const SelectedRows& src) {
+    auto& selected_rows = trg_->Get<SelectedRows>();
+    PADDLE_ENFORCE_EQ(
+        src.place().which(), selected_rows.place().which(),
+        "The Places of the two Variable must be all on CPU or all on GPU.");
+    PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
+                      "The dtype of the two Variable is not equal.");
+    PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
+                      "The layout of the two Variable's tensor is not equal.");
+    PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
+                      "The height of the two Variable is not equal.");
+    PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
+                      "The dims of the two Variable is not equal.");
+  }
+
+  template <typename T>
+  void operator()(const T&) {
+    PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
+                   typeid(T).name());
+  }
+};
+
+void VariableVisitor::EnforceShapeAndDTypeEQ(const Variable& var1,
+                                             const Variable& var2) {
+  EnforceShapeAndDTypeEQVisitor visitor{&var1};
+  VisitVariable(var2, &visitor);
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca9a19bdcf1be7bf0e1d2b0de560a38f528a2d2c
--- /dev/null
+++ b/paddle/fluid/framework/details/variable_visitor.h
@@ -0,0 +1,36 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class VariableVisitor {
+ public:
+  static Tensor &GetMutableTensor(Variable *var);
+
+  static void ShareDimsAndLoD(const Variable &src, Variable *trg);
+
+  static void EnforceShapeAndDTypeEQ(const Variable &var1,
+                                     const Variable &var2);
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 5cae38b2a857b2037f0e5ae4da50d1591da0c11a..84f67fafa19ac545ebb7a1019059e3c74c363c56 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -14,38 +14,52 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/executor.h"
 
-#include <set>
-
-#include "gflags/gflags.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
-#include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/operators/detail/macros.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(benchmark);
-DEFINE_bool(check_nan_inf, false,
-            "Checking whether operator produce NAN/INF or not. It will be "
-            "extremely slow so please use this flag wisely.");
+DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
 
 namespace paddle {
 namespace framework {
+namespace {
+// block id starts from 0. This id is used to represent the codeblock
+// wrapping the first block 0.
+int kProgramId = -1;
+}  // namespace
+
+ExecutorPrepareContext::ExecutorPrepareContext(
+    const framework::ProgramDesc& prog, size_t block_id)
+    : prog_(prog), block_id_(block_id) {}
+
+ExecutorPrepareContext::~ExecutorPrepareContext() {
+  VLOG(5) << "destroy ExecutorPrepareContext";
+}
 
-struct ExecutorPrepareContext {
-  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id)
-      : prog_(prog), block_id_(block_id) {}
+Executor::Executor(const platform::Place& place) : place_(place) {}
 
-  framework::ProgramDesc prog_;
-  size_t block_id_;
-  std::vector<std::unique_ptr<OperatorBase>> ops_;
-};
+#ifdef PADDLE_WITH_DISTRIBUTE
+void Executor::BeginPass() {
+  ::paddle::operators::distributed::RPCClient::GetInstance<
+      ::paddle::operators::distributed::GRPCClient>()
+      ->SendBeginPass();
+}
 
-Executor::Executor(const platform::Place& place) : place_(place) {}
+void Executor::EndPass() {
+  ::paddle::operators::distributed::RPCClient::GetInstance<
+      ::paddle::operators::distributed::GRPCClient>()
+      ->SendEndPass();
+}
+#endif
 
-static void CreateTensor(Variable* var, proto::VarType::Type var_type) {
+void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
   if (var_type == proto::VarType::LOD_TENSOR) {
     var->GetMutable<LoDTensor>();
   } else if (var_type == proto::VarType::SELECTED_ROWS) {
@@ -77,26 +91,49 @@ static void CreateTensor(Variable* var, proto::VarType::Type var_type) {
   }
 }
 
-static void CheckTensorNANOrInf(const std::string& name,
-                                const framework::Tensor& tensor) {
-  if (tensor.memory_size() == 0) {
-    return;
+void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
+                               int block_id) {
+  auto& global_block = pdesc.Block(block_id);
+
+  const Scope* ancestor_scope = scope;
+  while (ancestor_scope->parent()) {
+    ancestor_scope = ancestor_scope->parent();
   }
-  if (tensor.type().hash_code() != typeid(float).hash_code() &&
-      tensor.type().hash_code() != typeid(double).hash_code()) {
-    return;
+
+  if (ancestor_scope != scope) {
+    for (auto& var : global_block.AllVars()) {
+      if (var->Name() == framework::kEmptyVarName) {
+        continue;
+      }
+
+      if (var->Persistable()) {
+        auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " global, which pointer is " << ptr;
+      } else {
+        auto* ptr = scope->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " locally, which pointer is " << ptr;
+      }
+    }
+  } else {
+    for (auto& var : global_block.AllVars()) {
+      auto* ptr = scope->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+              << ptr;
+    }
   }
-  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
-                 "Tensor %s contains Inf", name);
-  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
-                 "Tensor %s contains NAN", name);
 }
 
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                    bool create_local_scope, bool create_vars) {
-  auto* ctx = Prepare(pdesc, block_id);
-  RunPreparedContext(ctx, scope, create_local_scope, create_vars);
-  delete ctx;
+  platform::RecordBlock b(block_id);
+  if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
+  auto ctx = Prepare(pdesc, block_id);
+  RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
 }
 
 // Check whether the block already has feed operators and feed_holder.
@@ -106,12 +143,14 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
 // and feed_holder_name. Raise exception when any mismatch is found.
 // Return true if the block has feed operators and holder of matching info.
 static bool has_feed_operators(
-    BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
+    const BlockDesc& block,
+    const std::map<std::string, const LoDTensor*>& feed_targets,
     const std::string& feed_holder_name) {
   size_t feed_count = 0;
-  for (auto* op : block->AllOps()) {
+  for (auto* op : block.AllOps()) {
     if (op->Type() == kFeedOpType) {
       feed_count++;
+      // The input variable's name of feed_op should be feed_holder_name.
       PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
                         "Input to feed op should be '%s'", feed_holder_name);
       std::string feed_target_name = op->Output("Out")[0];
@@ -127,13 +166,15 @@ static bool has_feed_operators(
         feed_count, feed_targets.size(),
         "The number of feed operators should match 'feed_targets'");
 
-    // When feed operator are present, so should be feed_holder
-    auto var = block->FindVar(feed_holder_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                            feed_holder_name);
-    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
-                      "'%s' variable should be 'FEED_MINIBATCH' type",
-                      feed_holder_name);
+    if (!feed_holder_name.empty()) {
+      // When feed operator are present, so should be feed_holder.
+      auto var = block.FindVar(feed_holder_name);
+      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                              feed_holder_name);
+      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH,
+                        "'%s' variable should be 'FEED_MINIBATCH' type",
+                        feed_holder_name);
+    }
   }
 
   return feed_count > 0;
@@ -146,12 +187,14 @@ static bool has_feed_operators(
 // and fetch_holder_name. Raise exception when any mismatch is found.
 // Return true if the block has fetch operators and holder of matching info.
 static bool has_fetch_operators(
-    BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
+    const BlockDesc& block,
+    const std::map<std::string, LoDTensor*>& fetch_targets,
     const std::string& fetch_holder_name) {
   size_t fetch_count = 0;
-  for (auto* op : block->AllOps()) {
+  for (auto* op : block.AllOps()) {
     if (op->Type() == kFetchOpType) {
       fetch_count++;
+      // The output variable's name of fetch_op should be fetch_holder_name.
       PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
                         "Output of fetch op should be '%s'", fetch_holder_name);
       std::string fetch_target_name = op->Input("X")[0];
@@ -167,34 +210,49 @@ static bool has_fetch_operators(
         fetch_count, fetch_targets.size(),
         "The number of fetch operators should match 'fetch_targets'");
 
-    // When fetch operator are present, so should be fetch_holder
-    auto var = block->FindVar(fetch_holder_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
-                            fetch_holder_name);
-    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
-                      "'%s' variable should be 'FETCH_LIST' type",
-                      fetch_holder_name);
+    if (!fetch_holder_name.empty()) {
+      // When fetch operator are present, so should be fetch_holder.
+      auto var = block.FindVar(fetch_holder_name);
+      PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                              fetch_holder_name);
+      PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST,
+                        "'%s' variable should be 'FETCH_LIST' type",
+                        fetch_holder_name);
+    }
   }
 
   return fetch_count > 0;
 }
 
 void Executor::Run(const ProgramDesc& program, Scope* scope,
-                   std::map<std::string, const LoDTensor*>& feed_targets,
-                   std::map<std::string, LoDTensor*>& fetch_targets,
+                   std::map<std::string, const LoDTensor*>* feed_targets,
+                   std::map<std::string, LoDTensor*>* fetch_targets,
+                   bool create_local_scope, bool create_vars,
                    const std::string& feed_holder_name,
                    const std::string& fetch_holder_name) {
-  auto* copy_program = new ProgramDesc(program);
+  platform::RecordBlock b(kProgramId);
+  if (FLAGS_use_mkldnn) EnableMKLDNN(program);
+  bool has_feed_ops =
+      has_feed_operators(program.Block(0), *feed_targets, feed_holder_name);
+  bool has_fetch_ops =
+      has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name);
+
+  ProgramDesc* copy_program = const_cast<ProgramDesc*>(&program);
+  std::unique_ptr<ProgramDesc> unique_ptr_of_copy_program;
+  if (!has_feed_ops || !has_fetch_ops) {
+    unique_ptr_of_copy_program.reset(new ProgramDesc(program));
+    copy_program = unique_ptr_of_copy_program.get();
+  }
   auto* global_block = copy_program->MutableBlock(0);
 
-  if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
+  if (!has_feed_ops) {
     // create feed_holder variable
     auto* feed_holder = global_block->Var(feed_holder_name);
     feed_holder->SetType(proto::VarType::FEED_MINIBATCH);
     feed_holder->SetPersistable(true);
 
     int i = 0;
-    for (auto& feed_target : feed_targets) {
+    for (auto& feed_target : (*feed_targets)) {
       std::string var_name = feed_target.first;
       VLOG(3) << "feed target's name: " << var_name;
 
@@ -210,24 +268,14 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }
 
-  // map the data of feed_targets to feed_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFeedOpType) {
-      std::string feed_target_name = op->Output("Out")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
-                      idx);
-    }
-  }
-
-  if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
+  if (!has_fetch_ops) {
     // create fetch_holder variable
     auto* fetch_holder = global_block->Var(fetch_holder_name);
     fetch_holder->SetType(proto::VarType::FETCH_LIST);
     fetch_holder->SetPersistable(true);
 
     int i = 0;
-    for (auto& fetch_target : fetch_targets) {
+    for (auto& fetch_target : (*fetch_targets)) {
       std::string var_name = fetch_target.first;
       VLOG(3) << "fetch target's name: " << var_name;
 
@@ -243,24 +291,16 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
     }
   }
 
-  Run(*copy_program, scope, 0, true, true);
-
-  // obtain the data of fetch_targets from fetch_holder
-  for (auto* op : global_block->AllOps()) {
-    if (op->Type() == kFetchOpType) {
-      std::string fetch_target_name = op->Input("X")[0];
-      int idx = boost::get<int>(op->GetAttr("col"));
-      *fetch_targets[fetch_target_name] =
-          GetFetchVariable(*scope, fetch_holder_name, idx);
-    }
-  }
-
-  delete copy_program;
+  auto ctx = Prepare(*copy_program, 0);
+  RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets,
+                     create_local_scope, create_vars, feed_holder_name,
+                     fetch_holder_name);
 }
 
-ExecutorPrepareContext* Executor::Prepare(const ProgramDesc& program,
-                                          int block_id) {
-  auto* ctx = new ExecutorPrepareContext(program, block_id);
+std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
+    const ProgramDesc& program, int block_id) {
+  std::unique_ptr<ExecutorPrepareContext> ctx(
+      new ExecutorPrepareContext(program, block_id));
   PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   auto& block = program.Block(block_id);
   for (auto& op_desc : block.AllOps()) {
@@ -269,63 +309,60 @@ ExecutorPrepareContext* Executor::Prepare(const ProgramDesc& program,
   return ctx;
 }
 
-void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
-                                  bool create_local_scope, bool create_vars) {
-  auto& block = ctx->prog_.Block(ctx->block_id_);
+std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
+    const ProgramDesc& program, const std::vector<int>& block_ids) {
+  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
+  for (auto& bid : block_ids) {
+    auto* ctx = new ExecutorPrepareContext(program, bid);
+    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
+    auto& block = program.Block(bid);
+    for (auto& op_desc : block.AllOps()) {
+      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
+    }
+    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
+  }
+  return result;
+}
 
+void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
+                                  bool create_local_scope, bool create_vars,
+                                  bool keep_kids) {
   Scope* local_scope = scope;
   if (create_vars) {
     if (create_local_scope) {
       local_scope = &scope->NewScope();
-      for (auto& var : block.AllVars()) {
-        if (var->Name() == framework::kEmptyVarName) {
-          continue;
-        }
-
-        if (var->Persistable()) {
-          auto* ptr = scope->Var(var->Name());
-          CreateTensor(ptr, var->GetType());
-          VLOG(3) << "Create Variable " << var->Name()
-                  << " global, which pointer is " << ptr;
-        } else {
-          auto* ptr = local_scope->Var(var->Name());
-          CreateTensor(ptr, var->GetType());
-          VLOG(3) << "Create Variable " << var->Name()
-                  << " locally, which pointer is " << ptr;
-        }
-      }
-    } else {
-      for (auto& var : block.AllVars()) {
-        auto* ptr = local_scope->Var(var->Name());
-        CreateTensor(ptr, var->GetType());
-        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-                << ptr;
-      }
-    }  // if (create_local_scope)
-  }    // if (create_vars)
+    }
+    CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
+  }
 
   for (auto& op : ctx->ops_) {
     VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
+    // NOTE! Please do not delete this line, it's usefull because the debug
+    // string before and after op.run are different, after run the output
+    // will have right shape which is usefull for debug.
     VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
 
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
     }
-    if (FLAGS_check_nan_inf) {
-      for (auto& vname : op->OutputVars(true)) {
-        auto* var = local_scope->FindVar(vname);
-        if (var == nullptr) continue;
-        if (var->IsType<framework::LoDTensor>()) {
-          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
-        }
-      }
-    }
   }
-  if (create_vars && create_local_scope) {
+  platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  if (local_scope != scope) {
     scope->DeleteScope(local_scope);
+  } else {
+    if (!keep_kids) {
+      // By default, we should delete all kid scopes after run executor because
+      // some operators may create local scope when running, such as while_op.
+      // But when while_op also create a local executor to run it's sub block,
+      // the sub scopes it created should not be dropped immediately, because
+      // while_grad_op will use some variables created during while_op run, so
+      // we need to keep the kids and wait for the outer executor to drop them.
+      scope->DropKids();
+    }
   }
+
   if (FLAGS_benchmark) {
     VLOG(2) << "-------------------------------------------------------";
     VLOG(2) << "Memory used after deleting local scope: "
@@ -334,5 +371,60 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }
 }
 
+void Executor::RunPreparedContext(
+    ExecutorPrepareContext* ctx, Scope* scope,
+    std::map<std::string, const LoDTensor*>* feed_targets,
+    std::map<std::string, LoDTensor*>* fetch_targets, bool create_local_scope,
+    bool create_vars, const std::string& feed_holder_name,
+    const std::string& fetch_holder_name) {
+  auto& global_block = ctx->prog_.Block(ctx->block_id_);
+
+  PADDLE_ENFORCE(
+      has_feed_operators(global_block, *feed_targets, feed_holder_name),
+      "Program in ExecutorPrepareContext should has feed_ops.");
+  PADDLE_ENFORCE(
+      has_fetch_operators(global_block, *fetch_targets, fetch_holder_name),
+      "Program in the prepared context should has fetch_ops.");
+
+  // map the data of feed_targets to feed_holder
+  for (auto* op : global_block.AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      std::string feed_target_name = op->Output("Out")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      SetFeedVariable(scope, *(*feed_targets)[feed_target_name],
+                      feed_holder_name, idx);
+    }
+  }
+
+  RunPreparedContext(ctx, scope, create_local_scope, create_vars);
+
+  // obtain the data of fetch_targets from fetch_holder
+  for (auto* op : global_block.AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      std::string fetch_target_name = op->Input("X")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      *(*fetch_targets)[fetch_target_name] =
+          GetFetchVariable(*scope, fetch_holder_name, idx);
+    }
+  }
+}
+
+void Executor::EnableMKLDNN(const ProgramDesc& program) {
+#ifdef PADDLE_WITH_MKLDNN
+  VLOG(3) << "use_mkldnn=True";
+  for (size_t bid = 0; bid < program.Size(); ++bid) {
+    auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
+    for (auto* op : block->AllOps()) {
+      if (op->HasAttr("use_mkldnn")) {
+        op->SetAttr("use_mkldnn", true);
+      }
+    }
+  }
+#else
+  LOG(WARNING)
+      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 28ce3315154cea45412984df4daf7385ce2cf572..563a4b2bb65dad481a755f67c7f23939816ce8e8 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -22,7 +25,17 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
-struct ExecutorPrepareContext;
+extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
+
+struct ExecutorPrepareContext {
+  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
+  ~ExecutorPrepareContext();
+
+  const framework::ProgramDesc& prog_;
+  size_t block_id_;
+  std::vector<std::unique_ptr<OperatorBase>> ops_;
+};
+
 class Executor {
  public:
   // TODO(dzhwinter) : Do not rely on this function, it will be removed
@@ -31,6 +44,18 @@ class Executor {
 
   explicit Executor(const platform::Place& place);
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+  /*
+   * Sending signal to pserver to mark current pass started.
+   */
+  void BeginPass();
+
+  /*
+   * Sending signal to pserver to mark current pass finished.
+   */
+  void EndPass();
+#endif
+
   /* @Brief
    * Runtime evaluation of the given ProgramDesc under certain Scope
    *
@@ -42,17 +67,33 @@ class Executor {
            bool create_local_scope = true, bool create_vars = true);
 
   void Run(const ProgramDesc& program, Scope* scope,
-           std::map<std::string, const LoDTensor*>& feed_targets,
-           std::map<std::string, LoDTensor*>& fetch_targets,
+           std::map<std::string, const LoDTensor*>* feed_targets,
+           std::map<std::string, LoDTensor*>* fetch_targets,
+           bool create_local_scope = true, bool create_vars = true,
            const std::string& feed_holder_name = "feed",
            const std::string& fetch_holder_name = "fetch");
 
-  static ExecutorPrepareContext* Prepare(const ProgramDesc& program,
-                                         int block_id);
+  static std::unique_ptr<ExecutorPrepareContext> Prepare(
+      const ProgramDesc& program, int block_id);
+
+  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
+      const ProgramDesc& program, const std::vector<int>& block_ids);
+
+  void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id);
 
   void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
                           bool create_local_scope = true,
-                          bool create_vars = true);
+                          bool create_vars = true, bool keep_kids = false);
+
+  void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
+                          std::map<std::string, const LoDTensor*>* feed_targets,
+                          std::map<std::string, LoDTensor*>* fetch_targets,
+                          bool create_local_scope = true,
+                          bool create_vars = true,
+                          const std::string& feed_holder_name = "feed",
+                          const std::string& fetch_holder_name = "fetch");
+
+  void EnableMKLDNN(const ProgramDesc& program);
 
  private:
   const platform::Place place_;
diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index a8c3e227db3f8f3781d0acd5e233d7bea1123df1..8e1f93c5ebd448903d70f9668539e077875836e4 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/feed_fetch_method.h"
+#include <string>
+#include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/variable.h"
 
diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h
index d6130f421e122047c2f4ed315e6f2fb7484cda1a..7f504bfd232862c014cb59b6e8301eec74e0351f 100644
--- a/paddle/fluid/framework/feed_fetch_method.h
+++ b/paddle/fluid/framework/feed_fetch_method.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/scope.h"
 
diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h
index b0d1e9f0a7074da33af1cd279ab913ab604150b1..fae792ad9fa766f456ed706cc9adeb4e34d20123 100644
--- a/paddle/fluid/framework/feed_fetch_type.h
+++ b/paddle/fluid/framework/feed_fetch_type.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <string>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 
@@ -22,7 +21,8 @@ namespace framework {
 using FeedFetchType = LoDTensor;
 using FeedFetchList = std::vector<FeedFetchType>;
 
-static const std::string kFeedOpType = "feed";
-static const std::string kFetchOpType = "fetch";
+static const char kFeedOpType[] = "feed";
+static const char kFetchOpType[] = "fetch";
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 96f53dc1bc8747e1b8ea84166614f98ff363ae5e..2cf14bd371831ab682166f4256d6966b5ab278c8 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -27,6 +27,7 @@ enum AttrType {
   BOOLEANS = 7;
   BLOCK = 8;
   LONG = 9;
+  BLOCKS = 10;
 }
 
 // OpDesc describes an instance of a C++ framework::OperatorBase
@@ -46,6 +47,7 @@ message OpDesc {
     repeated bool bools = 11;
     optional int32 block_idx = 12;
     optional int64 l = 13;
+    repeated int32 blocks_idx = 14;
   };
 
   message Var {
@@ -71,6 +73,7 @@ message OpProto {
     optional bool duplicable = 3 [ default = false ];
     optional bool intermediate = 4 [ default = false ];
     optional bool dispensable = 5 [ default = false ];
+    optional string reuse = 6;
   }
 
   // AttrProto describes the C++ type Attribute.
@@ -101,6 +104,9 @@ message VarType {
     FP16 = 4;
     FP32 = 5;
     FP64 = 6;
+    // Tensor<size_t> is used in C++.
+    SIZE_T = 19;
+    UINT8 = 20;
 
     // Other types that may need additional descriptions
     LOD_TENSOR = 7;
diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
index cf697187d6225f3a1d2506120eebe14d4a41dff9..b4d3fa25c35fbf25b3d2fdd9fa1045dda0f773ec 100644
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -69,8 +70,7 @@ class GradOpDescMakerBase {
                       " for input argument with a list of variables, "
                       " drop_empty_grad is not allowed because it makes"
                       " the correspondence bewteen a variable and its gradient"
-                      " ambiguous. Use REGISTER_OP_EX to register the op"
-                      " or call InputGrad(?,false) in GradOpDescMaker."
+                      " ambiguous."
                       " Op type %s",
                       fwd_op_.Type());
 
diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc
index 2e0a224ff5df749fd8c809dc88a85a1643542abf..a1094976f6c0965ac0a601d7e37575969146fdab 100644
--- a/paddle/fluid/framework/init.cc
+++ b/paddle/fluid/framework/init.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
@@ -26,9 +27,11 @@ namespace paddle {
 namespace framework {
 
 std::once_flag gflags_init_flag;
+std::once_flag p2p_init_flag;
 
-void InitGflags(std::vector<std::string> &argv) {
+void InitGflags(std::vector<std::string> argv) {
   std::call_once(gflags_init_flag, [&]() {
+    argv.insert(argv.begin(), "dummy");
     int argc = argv.size();
     char **arr = new char *[argv.size()];
     std::string line;
@@ -42,13 +45,52 @@ void InitGflags(std::vector<std::string> &argv) {
   });
 }
 
-void InitDevices() {
-  /*Init all avaiable devices by default */
+void InitP2P(std::vector<int> devices) {
+#ifdef PADDLE_WITH_CUDA
+  std::call_once(p2p_init_flag, [&]() {
+    int count = devices.size();
+    for (int i = 0; i < count; ++i) {
+      for (int j = 0; j < count; ++j) {
+        if (devices[i] == devices[j]) continue;
+        int can_acess = -1;
+        PADDLE_ENFORCE(
+            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]),
+            "Failed to test P2P access.");
+        if (can_acess != 1) {
+          LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
+                       << " to " << devices[j];
+        } else {
+          cudaSetDevice(devices[i]);
+          cudaDeviceEnablePeerAccess(devices[j], 0);
+        }
+      }
+    }
+  });
+#endif
+}
+
+void InitDevices(bool init_p2p) {
+  /*Init all available devices by default */
+  std::vector<int> devices;
+#ifdef PADDLE_WITH_CUDA
+  try {
+    int count = platform::GetCUDADeviceCount();
+    for (int i = 0; i < count; ++i) {
+      devices.push_back(i);
+    }
+  } catch (const std::exception &exp) {
+    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
+  }
+#else
+  LOG(WARNING)
+      << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
+#endif
+  InitDevices(init_p2p, devices);
+}
 
+void InitDevices(bool init_p2p, const std::vector<int> devices) {
   std::vector<platform::Place> places;
-  places.emplace_back(platform::CPUPlace());
   int count = 0;
-
 #ifdef PADDLE_WITH_CUDA
   try {
     count = platform::GetCUDADeviceCount();
@@ -60,11 +102,21 @@ void InitDevices() {
       << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
 #endif
 
-  for (int i = 0; i < count; ++i) {
-    places.emplace_back(platform::CUDAPlace(i));
+  for (size_t i = 0; i < devices.size(); ++i) {
+    if (devices[i] >= count || devices[i] < 0) {
+      LOG(WARNING) << "Invalid devices id.";
+      continue;
+    }
+    places.emplace_back(platform::CUDAPlace(devices[i]));
   }
-
+  if (init_p2p) {
+    InitP2P(devices);
+  }
+  places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
+#ifndef PADDLE_WITH_MKLDNN
+  operators::math::SetNumThreads(1);
+#endif
 }
 
 void InitGLOG(const std::string &prog_name) {
diff --git a/paddle/fluid/framework/init.h b/paddle/fluid/framework/init.h
index 7d86d1581190780f513776c69b18ad41eb2ce14d..0e30594672927253cc8083dcb88bb867d63ec729 100644
--- a/paddle/fluid/framework/init.h
+++ b/paddle/fluid/framework/init.h
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include <mutex>
+#include <mutex>  // NOLINT
+#include <string>
+#include <vector>
 
 #include "gflags/gflags.h"
 #include "glog/logging.h"
@@ -20,11 +22,13 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void InitGflags(std::vector<std::string> &argv);
+void InitGflags(std::vector<std::string> argv);
 
 void InitGLOG(const std::string &prog_name);
 
-void InitDevices();
+void InitDevices(bool init_p2p);
+
+void InitDevices(bool init_p2p, const std::vector<int> devices);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/init_test.cc b/paddle/fluid/framework/init_test.cc
index 2a03f0afe657e4b3ac173e8718dd6f6f81ee5e6a..928e2d14abea604cf483f4bc1e1c58fbae04dd21 100644
--- a/paddle/fluid/framework/init_test.cc
+++ b/paddle/fluid/framework/init_test.cc
@@ -21,7 +21,7 @@ TEST(InitDevices, CPU) {
   using paddle::platform::DeviceContextPool;
 
 #ifndef PADDLE_WITH_CUDA
-  InitDevices();
+  InitDevices(true);
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U);
 #endif
@@ -33,7 +33,7 @@ TEST(InitDevices, CUDA) {
 
 #ifdef PADDLE_WITH_CUDA
   int count = paddle::platform::GetCUDADeviceCount();
-  InitDevices();
+  InitDevices(true);
   DeviceContextPool& pool = DeviceContextPool::Instance();
   ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
 #endif
diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h
index ea538731b469901a3357d624c5bb0fddc4058488..904cc013012b9c3ea8054816446844f6d2cda26b 100644
--- a/paddle/fluid/framework/library_type.h
+++ b/paddle/fluid/framework/library_type.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <cctype>
+#include <string>
 
 namespace paddle {
 namespace framework {
@@ -67,5 +68,5 @@ inline std::ostream& operator<<(std::ostream& out, LibraryType l) {
   return out;
 }
 
-}  // namespace
-}  // framework
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h
index ef83e71160e0e52071b033ea8b86e6da91bbfad2..8c6e8b0c66ead96f0e53b56ee951887730b0d77f 100644
--- a/paddle/fluid/framework/lod_rank_table.h
+++ b/paddle/fluid/framework/lod_rank_table.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <iosfwd>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 8155cb55a468a09320b1196b49fc3e34cea261b1..cba0064f38f89c1dd27cfac1ddb2339a5ee6c93f 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -12,9 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/lod_tensor.h"
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <iterator>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/var_type.h"
 
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
@@ -22,11 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
 
-#include <stdint.h>
-#include <string.h>
-#include <algorithm>
-#include <iterator>
-
 namespace paddle {
 namespace framework {
 
@@ -51,8 +52,6 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
 }
 
 std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
-  PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
-
   if (!platform::is_cpu_place(t.place())) {
     LoDTensor tt;
     framework::TensorCopy(t, platform::CPUPlace(), &tt);
@@ -70,7 +69,13 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
   // only print first ten elements
   int64_t size = t.numel() < 10 ? t.numel() : 10;
   for (int64_t i = 0; i < size; ++i) {
-    os << t.data<float>()[i] << " ";
+    if (IsType<float>(t.type())) {
+      os << t.data<float>()[i] << " ";
+    } else if (IsType<int64_t>(t.type())) {
+      os << t.data<int64_t>()[i] << " ";
+    } else {
+      PADDLE_THROW("LoDTensor data type not in [float, int64_t]");
+    }
   }
 
   return os;
@@ -85,6 +90,7 @@ std::string LoDToString(const LoD &lod) {
 LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                  size_t elem_end) {
   PADDLE_ENFORCE_LT(level, in.size());
+  PADDLE_ENFORCE_LT(elem_begin, elem_end);
   PADDLE_ENFORCE_LT(elem_end, in[level].size());
 
   LoD res;
@@ -294,7 +300,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
-void WriteToRecordIO(recordio::Writer &writer,
+void WriteToRecordIO(recordio::Writer *writer,
                      const std::vector<LoDTensor> &tensor,
                      const platform::DeviceContext &dev_ctx) {
   std::stringstream buffer;
@@ -303,18 +309,20 @@ void WriteToRecordIO(recordio::Writer &writer,
   for (auto &each : tensor) {
     SerializeToStream(buffer, each, dev_ctx);
   }
-  writer.Write(buffer.str());
+  writer->Write(buffer.str());
 }
 
 std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner &scanner, const platform::DeviceContext &dev_ctx) {
-  std::istringstream sin(scanner.Next());
-  uint32_t sz;
-  sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+    recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
   std::vector<LoDTensor> result;
-  result.resize(sz);
-  for (uint32_t i = 0; i < sz; ++i) {
-    DeserializeFromStream(sin, &result[i], dev_ctx);
+  if (scanner->HasNext()) {
+    std::istringstream sin(scanner->Next());
+    uint32_t sz;
+    sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+    result.resize(sz);
+    for (uint32_t i = 0; i < sz; ++i) {
+      DeserializeFromStream(sin, &result[i], dev_ctx);
+    }
   }
   return result;
 }
@@ -378,7 +386,7 @@ void LoDTensor::MergeLoDTensor(
   LoD new_lod = lod_tensors[0]->lod();
   for (size_t i = 1; i < lod_tensors.size(); ++i) {
     auto *t = lod_tensors[i];
-    PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code());
+    PADDLE_ENFORCE_EQ(new_type, t->type());
     PADDLE_ENFORCE_EQ(new_layout, t->layout());
 
     PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0],
@@ -386,6 +394,7 @@ void LoDTensor::MergeLoDTensor(
     new_dim[0] += t->dims()[0];
 
     auto &lod = t->lod();
+    PADDLE_ENFORCE_EQ(new_lod.size(), lod.size());
     for (size_t j = 0; j < lod.size(); ++j) {
       auto &sub_lod = new_lod[j];
       auto &offset = sub_lod.back();
@@ -408,5 +417,38 @@ void LoDTensor::MergeLoDTensor(
   }
 }
 
+LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
+  LoD length_lod;
+  length_lod.reserve(offset_lod.size());
+  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    if (offset_lod[lvl].size() > 0) {
+      level.reserve(offset_lod[lvl].size() - 1);
+    }
+    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
+      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
+    }
+    length_lod.push_back(level);
+  }
+  return length_lod;
+}
+
+LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
+  LoD offset_lod;
+  offset_lod.reserve(length_lod.size());
+  for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) {
+    std::vector<size_t> level;
+    level.reserve(length_lod[lvl].size() + 1);
+    size_t tmp = 0;
+    level.push_back(tmp);
+    for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) {
+      tmp += length_lod[lvl][idx];
+      level.push_back(tmp);
+    }
+    offset_lod.push_back(level);
+  }
+  return offset_lod;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index dee505fee0dccd8d60bb290a8bec4df243e504a2..4a2729373b5c63176ed1e856f4acf29fd1e73254 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -15,6 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
+#include <string>
+#include <utility>
+#include <vector>
 #ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
@@ -142,6 +145,7 @@ class LoDTensor : public Tensor {
     return (lod_)[level].size() - 1;
   }
 
+  // Split LoDTensor and copy to each place specified in places.
   std::vector<LoDTensor> SplitLoDTensor(
       const std::vector<platform::Place> places) const;
 
@@ -215,12 +219,26 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
 void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
                            const platform::DeviceContext& dev_ctx);
 
-extern void WriteToRecordIO(recordio::Writer& writer,
+extern void WriteToRecordIO(recordio::Writer* writer,
                             const std::vector<LoDTensor>& tensor,
                             const platform::DeviceContext& dev_ctx);
 
 extern std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner& scanner, const platform::DeviceContext& dev_ctx);
+    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
+
+/*
+ * Convert between length-based LoD and offset-based LoD.
+ * The implementation of LoDTensor class use offset-based LoD.
+ * However, we want to expose the more user-friendly length-based
+ * LoD to the Python side instead.
+ *
+ * Example:
+ * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]]
+ * then length_lod = [[2, 1], [3, 2, 4]]
+ */
+LoD ConvertToLengthBasedLoD(const LoD& offset_lod);
+
+LoD ConvertToOffsetBasedLoD(const LoD& length_lod);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor.md b/paddle/fluid/framework/lod_tensor.md
deleted file mode 100644
index 10a8a7867fbf072f585fe3bfb1243e4e6bef4ec8..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/lod_tensor.md
+++ /dev/null
@@ -1,165 +0,0 @@
-# Design Doc: LoD (Level-of-Detail) Tensor
-
-Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represent a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length. Thus no need for padding zeros.
-
-|                       | TensorFlow | PaddlePaddle |
-|-----------------------|------------|--------------|
-| RNN                   | Support    | Support      |
-| recursive RNN         | Support    | Support      |
-| padding zeros         | Must       | No need      |
-| blob data type        | Tensor     | LoDTensor    |
-
-PaddlePaddle achieves this flexibility by passing through a new data type, *LoD Tensor*, which is a Tensor attached with segmentation index known as *LoD*, between operators.  The LoD index doesn't only segment a tensor, but also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
-
-
-## The Challenge: Variable-length Sequences
-
-Most deep learning systems represent a mini-batch as a Tensor.  For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector.  Suppose that all sentences have the same length L, we can represent this mini-batch by a NxLxD tensor.
-
-Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
-
-The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
-
-
-## A Solution: The LoD Index
-
-To understand our solution, it is best to look at some examples.
-
-### A Mini-Batch of Sentences
-
-Let's imagine a mini-batch of 3 variable lengths sentences composed of 3, 1, and 2 words, respectively.  We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
-
-```
-3   1 2
-||| | ||
-```
-
-where each `|` represents a D-dimensional word vector.  The numbers, 3, 1, and 2, form a 1-level LoD.
-
-### Recursive Sequences
-
-Let check another example of a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, and each sentence consists of a variable number of words:
-
-```
-3           1  2
-3   2  4    1  2  3
-||| || |||| |  || |||
-```
-
-### A Mini-Batch of Videos
-
-LoD tensors generalize to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
-
-```
-3     1  2
-口口口 口 口口
-```
-
-The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
-
-### A Mini-Batch of Images
-
-In traditional cases like a mini-batch with N fixed-sized images,  the LoD Tensor representation is as
-
-```
-1 1 1 1     1
-口口口口 ... 口
-```
-
-In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
-
-```
-口口口口 ... 口
-```
-
-### Model Parameters
-
-A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
-
-
-## The LoD Tensor
-
-Let us revisit above example of the 2-level LoD Tensor
-
-```
-3           1  2
-3   2  4    1  2  3
-||| || |||| |  || |||
-```
-
-It is indeed a tree, where leaves are elementary sequences identified by **branches**.
-
-For example, the third sentence in above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4.
-
-### The LoD Index
-
-We can save the LoD index in the above example
-
-```
-3           1  2
-3   2  4    1  2  3
-```
-
-in a not-full 2D matrix:
-
-```c++
-typedef std::vector<std::vector<int> > LoD;
-```
-
-where
-
-- `LoD.size()` is the number of levels, or the maximum length of branches,
-- `LoD[i][j]` is the length of the j-th segment at the i-th level.
-
-## The Offset Representation
-
-To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
-
-In the above example, we accumulate the length of elementary sequences:
-
-```
-3 2 4 1 2 3
-```
-
-into offsets
-
-```
-0  3  5   9   10  12   15
-   =  =   =   =   =    =
-   3  2+3 4+5 1+9 2+10 3+12
-```
-
-so we know that the first sentence is from word 0 to word 3, and the second sentence from work 3 to word 5.
-
-Similarly, the lengths in the top level LoD
-
-```
-3 1 2
-```
-
-are transformed into offsets of elements/words as follows:
-
-```
-0 3 4   6
-  = =   =
-  3 3+1 4+2
-```
-
-## Slicing of LoD Tensors
-
-When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
-
-For example, the <2>-slice of above example is
-
-```
-10      15
-10  12  15
-  || |||
-```
-
-and the <2,0>-slice of above slice is
-
-```
-10  12
-  ||
-```
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index e691e29383d4842b80769021e0e494967d38e9bb..38d3cd96d65f0a54b0ea87b4c677013f3802adfb 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -12,20 +12,34 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/lod_tensor.h"
-
-#include "paddle/fluid/recordio/scanner.h"
-#include "paddle/fluid/recordio/writer.h"
-
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <algorithm>
 #include <memory>
 #include <vector>
 
+#include "paddle/fluid/framework/lod_tensor.h"
+
+#include "paddle/fluid/recordio/scanner.h"
+#include "paddle/fluid/recordio/writer.h"
+
 namespace paddle {
 namespace framework {
 
+TEST(LoD, PrintLoDTensor) {
+  LoDTensor tensor1;
+  tensor1.mutable_data<float>(platform::CPUPlace());
+  tensor1.data<float>()[0] = 0.2;
+  tensor1.data<float>()[1] = 0.5;
+  LOG(INFO) << tensor1;
+
+  LoDTensor tensor2;
+  tensor2.mutable_data<int64_t>(platform::CPUPlace());
+  tensor2.data<int64_t>()[0] = 1;
+  tensor2.data<int64_t>()[1] = 2;
+  LOG(INFO) << tensor2;
+}
+
 TEST(LoD, data) {
   LoD lod{{0, 1, 2}};
   lod.push_back({0, 2, 4, 5});
@@ -37,7 +51,7 @@ TEST(LoD, data) {
   }
 }
 
-TEST(LodExpand, test) {
+TEST(LoD, ExpandLoD) {
   LoD lod{{0, 2}};
   LoDTensor tensor;
   tensor.set_lod(lod);
@@ -228,11 +242,44 @@ TEST(LoD, CheckAbsLoD) {
   ASSERT_FALSE(CheckAbsLoD(abs_lod0));
 }
 
-TEST(LoDTensor, RecordIO) {
+TEST(LoD, ConvertToLengthBasedLoD) {
+  LoD offset_lod;
+  offset_lod.push_back(std::vector<size_t>({0, 2}));
+  offset_lod.push_back(std::vector<size_t>({0, 1, 3}));
+  offset_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  LoD length_lod = ConvertToLengthBasedLoD(offset_lod);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({2}));
+  expected.push_back(std::vector<size_t>({1, 2}));
+  expected.push_back(std::vector<size_t>({2, 2, 1}));
+
+  EXPECT_EQ(length_lod, expected);
+}
+
+TEST(LoD, ConvertToOffsetBasedLoD) {
+  LoD length_lod;
+  length_lod.push_back(std::vector<size_t>({2}));
+  length_lod.push_back(std::vector<size_t>({1, 2}));
+  length_lod.push_back(std::vector<size_t>({2, 2, 1}));
+
+  LoD offset_lod = ConvertToOffsetBasedLoD(length_lod);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({0, 2}));
+  expected.push_back(std::vector<size_t>({0, 1, 3}));
+  expected.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  EXPECT_EQ(offset_lod, expected);
+}
+
+template <typename T>
+static void TestRecordIO() {
   LoDTensor tensor;
-  int* tmp = tensor.mutable_data<int>(make_ddim({4, 5}), platform::CPUPlace());
+  T* tmp = tensor.mutable_data<T>(make_ddim({4, 5}), platform::CPUPlace());
   for (int i = 0; i < 20; ++i) {
-    tmp[i] = i;
+    tmp[i] = static_cast<T>(i);
   }
 
   std::stringstream* stream = new std::stringstream();
@@ -240,30 +287,38 @@ TEST(LoDTensor, RecordIO) {
       *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
   {
     recordio::Writer writer(stream, recordio::Compressor::kSnappy);
-    WriteToRecordIO(writer, {tensor, tensor}, ctx);
-    WriteToRecordIO(writer, {tensor, tensor}, ctx);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
     writer.Flush();
   }
 
   auto assert_tensor_ok = [](const LoDTensor& tensor) {
     for (int i = 0; i < 20; ++i) {
-      ASSERT_EQ(tensor.data<int>()[i], i);
+      ASSERT_EQ(tensor.data<T>()[i], static_cast<T>(i));
     }
   };
 
   {
     std::unique_ptr<std::istream> stream_ptr(stream);
     recordio::Scanner scanner(std::move(stream_ptr));
-    auto tensors = ReadFromRecordIO(scanner, ctx);
-    ASSERT_EQ(tensors.size(), 2);
+    auto tensors = ReadFromRecordIO(&scanner, ctx);
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
-    tensors = ReadFromRecordIO(scanner, ctx);
-    ASSERT_EQ(tensors.size(), 2);
+    tensors = ReadFromRecordIO(&scanner, ctx);
+    ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
   }
 }
 
+TEST(LoDTensor, RecordIO) {
+  TestRecordIO<int>();
+  TestRecordIO<int16_t>();
+  TestRecordIO<uint8_t>();
+  TestRecordIO<float>();
+  TestRecordIO<double>();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu
index be65da5ba230e4bb15b09a07431d3107ffe19522..e3efbe4c464493af87e33510647d8c67d457a76d 100644
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
@@ -30,7 +30,7 @@ __global__ void test(size_t* a, int size) {
 }
 
 TEST(LoD, data) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
 
   paddle::framework::LoD lod{{0, 1, 2}};
   lod.push_back({0, 2, 4, 5});
@@ -46,7 +46,7 @@ TEST(LoD, data) {
 }
 
 TEST(LoDTensor, LoDInGPU) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
 
   paddle::framework::LoDTensor lod_tensor;
   paddle::platform::CUDAPlace place(0);
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index 6a6fa538718837a958b7d82c37f583f62f4bf96e..29b3396bc9854cd3d3ac8d4283f48019c9a9c55f 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <algorithm>
 #include <initializer_list>
 #include <vector>
 
@@ -176,7 +177,7 @@ class Vector {
 
   // resize the vector
   void resize(size_t size) {
-    if (size + 1 < capacity()) {
+    if (size + 1 <= capacity()) {
       size_ = size;
     } else {
       MutableCPU();
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
index 4bf78499f2fda2d2631e05ddcbbd0bc49498af1a..d57f82510833d6a0cea7009cf1f0b49543812f8d 100644
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -104,3 +104,11 @@ TEST(mixed_vector, ForEach) {
   for (auto& v : tmp) {
   }
 }
+
+TEST(mixed_vector, Reserve) {
+  paddle::framework::Vector<int> vec;
+  vec.reserve(1);
+  vec.push_back(0);
+  vec.push_back(0);
+  vec.push_back(0);
+}
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index eabfdc11a8b314c4af9626ded3edd1bcba212de1..a190199f1cb1361f67f20c755b8e7ef52c284adc 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_desc.h"
+#include <algorithm>
 #include <functional>
-#include <mutex>
+#include <mutex>  // NOLINT
+#include <string>
 #include <unordered_map>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/shape_inference.h"
@@ -100,7 +103,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
   need_update_ = true;
 }
 
-OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
+OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
     : desc_(desc), need_update_(false) {
   // restore inputs_
   int input_size = desc_.inputs_size();
@@ -203,8 +206,14 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
   need_update_ = true;
 }
 
-void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) {
-  this->attrs_[name] = &block;
+void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) {
+  this->attrs_[name] = block;
+  need_update_ = true;
+}
+
+void OpDesc::SetBlocksAttr(const std::string &name,
+                           std::vector<BlockDesc *> blocks) {
+  this->attrs_[name] = blocks;
   need_update_ = true;
 }
 
@@ -220,6 +229,15 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
   return it->second;
 }
 
+Attribute OpDesc::GetNullableAttr(const std::string &name) const {
+  auto it = attrs_.find(name);
+  if (it != attrs_.end()) {
+    return it->second;
+  } else {
+    return Attribute();
+  }
+}
+
 int OpDesc::GetBlockAttr(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
@@ -231,13 +249,8 @@ const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
 }
 
 void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
-  for (auto &input : inputs_) {
-    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
-  }
-  for (auto &output : outputs_) {
-    std::replace(output.second.begin(), output.second.end(), old_name,
-                 new_name);
-  }
+  RenameInput(old_name, new_name);
+  RenameOutput(old_name, new_name);
   need_update_ = true;
 }
 
@@ -247,6 +260,13 @@ void OpDesc::RenameOutput(const std::string &old_name,
     std::replace(output.second.begin(), output.second.end(), old_name,
                  new_name);
   }
+
+  auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
+  if (it != attrs_.end()) {
+    auto &op_vars = boost::get<std::vector<std::string>>(it->second);
+    std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
+  }
+
   need_update_ = true;
 }
 
@@ -255,6 +275,13 @@ void OpDesc::RenameInput(const std::string &old_name,
   for (auto &input : inputs_) {
     std::replace(input.second.begin(), input.second.end(), old_name, new_name);
   }
+
+  auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
+  if (it != attrs_.end()) {
+    auto &op_vars = boost::get<std::vector<std::string>>(it->second);
+    std::replace(op_vars.begin(), op_vars.end(), old_name, new_name);
+  }
+
   need_update_ = true;
 }
 
@@ -284,6 +311,13 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
   void operator()(const std::vector<bool> &v) const {
     VectorToRepeated(v, attr_->mutable_bools());
   }
+  void operator()(const std::vector<BlockDesc *> &v) const {
+    std::vector<int> blocks_idx;
+    for (auto blk : v) {
+      blocks_idx.push_back(blk->ID());
+    }
+    VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx());
+  }
   void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
   void operator()(int64_t v) const { attr_->set_l(v); }
   void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index 614dd8cd00eb866cb8cbc41c3e03c25f968a7d2b..74dd8ec002005dd080424b48b5db1a2574a6974f 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/attribute.h"
@@ -32,13 +33,14 @@ class OpDesc {
   OpDesc(const std::string &type, const VariableNameMap &inputs,
          const VariableNameMap &outputs, const AttributeMap &attrs);
 
-  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
+  OpDesc(const proto::OpDesc &desc, BlockDesc *block);
 
   explicit OpDesc(BlockDesc *block) : block_(block) {}
 
   OpDesc(const OpDesc &other, BlockDesc *block) {
     *this = other;
     block_ = block;
+    need_update_ = true;
   }
 
   void CopyFrom(const OpDesc &op_desc);
@@ -73,10 +75,14 @@ class OpDesc {
 
   void SetAttr(const std::string &name, const Attribute &v);
 
-  void SetBlockAttr(const std::string &name, BlockDesc &block);
+  void SetBlockAttr(const std::string &name, BlockDesc *block);
+
+  void SetBlocksAttr(const std::string &name, std::vector<BlockDesc *> blocks);
 
   Attribute GetAttr(const std::string &name) const;
 
+  Attribute GetNullableAttr(const std::string &name) const;
+
   int GetBlockAttr(const std::string &name) const;
 
   void Rename(const std::string &old_name, const std::string &new_name);
@@ -119,7 +125,7 @@ class OpDesc {
 
   void InferVarType(BlockDesc *block) const;
 
-  void MarkAsTarget() { desc_.set_is_target(true); }
+  void SetIsTarget(bool is_target) { desc_.set_is_target(is_target); }
 
   void Flush();
 
diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc
index b99e82f8c4358b60a014c6fc7c61c9bbb8683834..f1261dee0319440995951d1bee145404186a8ad4 100644
--- a/paddle/fluid/framework/op_info.cc
+++ b/paddle/fluid/framework/op_info.cc
@@ -17,12 +17,11 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-static OpInfoMap* g_op_info_map = nullptr;
-
+// C++11 removes the need for manual locking. Concurrent execution shall wait if
+// a static local variable is already being initialized.
+// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
 OpInfoMap& OpInfoMap::Instance() {
-  if (g_op_info_map == nullptr) {
-    g_op_info_map = new OpInfoMap();
-  }
+  static OpInfoMap* g_op_info_map = new OpInfoMap();
   return *g_op_info_map;
 }
 }  // namespace framework
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
index 3a1036742c206961fe52660106ae947153e9b244..c59b232191c49ccb47bb9f51dcaf2fd9280fae19 100644
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/library_type.h"
@@ -86,10 +87,17 @@ inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
 }
 
 inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
-  return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
+  bool ret =
+      (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r);
+#ifdef PADDLE_WITH_MKLDNN
+  // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa
+  ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN);
+  ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN);
+#endif
+  return ret;
 }
 
-inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
+inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) {
   return (!platform::places_are_same_class(l.place_, r.place_)) ||
          (l.data_type_ != r.data_type_) ||
          NeedTransformLayout(l.data_layout_, r.data_layout_);
diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc
index d37ce149ce3df63692b41289bb03448d54e392f5..db95861c510b52a5b52229541434e6437d3fb9f4 100644
--- a/paddle/fluid/framework/op_kernel_type_test.cc
+++ b/paddle/fluid/framework/op_kernel_type_test.cc
@@ -27,7 +27,7 @@ TEST(OpKernelType, ToString) {
                               LibraryType::kCUDNN);
 
   ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
-            "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type["
+            "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type["
             "CUDNN]");
 }
 
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 3116b03d0433ddf98613796b272238e5fe72ce6a..001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -12,6 +12,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_proto_maker.h"
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace framework {
@@ -19,6 +21,7 @@ namespace framework {
 void OpProtoAndCheckerMaker::Validate() {
   validated_ = true;
   CheckNoDuplicatedInOutAttrs();
+  CheckReuseVars();
 }
 
 OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
@@ -54,5 +57,46 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   }
 }
 
+void OpProtoAndCheckerMaker::CheckReuseVars() {
+  std::unordered_set<std::string> names;
+  for (auto& input : proto_->inputs()) {
+    names.insert(input.name());
+  }
+  auto checker = [&](const std::string& name, const std::string& reused) {
+    PADDLE_ENFORCE(
+        names.count(reused),
+        "Output [%s] reuse Input [%s], but the input is not registered.", name,
+        reused);
+  };
+  for (auto& output : proto_->outputs()) {
+    if (output.has_reuse()) {
+      checker(output.name(), output.reuse());
+    }
+  }
+}
+
+void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
+                                        OpAttrChecker* attr_checker) {
+  proto_ = proto;
+  op_checker_ = attr_checker;
+  Make();
+
+  AddAttr<int>(OpRoleAttrName(), "The role of this operator")
+      .InEnum(
+          {static_cast<int>(OpRole::kForward),
+           static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
+           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
+           static_cast<int>(OpRole::kLoss) |
+               static_cast<int>(OpRole::kBackward),
+           static_cast<int>(OpRole::kNotSpecified)})
+      .SetDefault(static_cast<int>(OpRole::kNotSpecified));
+  AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),
+                                    "Optimized for variable")
+      .SetDefault({});
+
+  Validate();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index cf56b0fa1894374956b3011c88bc70acdba1e464..92f86bb5de520878d0a7b8d7214620580242c061 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -13,56 +13,76 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <unordered_set>
+
+#include "glog/logging.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
-
 namespace paddle {
 namespace framework {
 
+enum class OpRole {
+  kForward = 0x0000,
+  kBackward = 0x0001,
+  kOptimize = 0x0002,
+  kRPC = 0x0003,
+
+  kLoss = 0x0100,
+  // The default value of op's role. This should be only used for unittests and
+  // CreateOp inside a operator.
+  kNotSpecified = 0x1000,
+};
+
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
  public:
-  using OpProto = proto::OpProto;
-  using OpAttrChecker = framework::OpAttrChecker;
-  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : proto_(proto), op_checker_(op_checker) {}
+  static const char *OpRoleAttrName() { return "op_role"; }
+  static const char *OpRoleVarAttrName() { return "op_role_var"; }
+
+  void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
+
+  virtual void Make() = 0;
 
   virtual ~OpProtoAndCheckerMaker() {
-    PADDLE_ENFORCE(validated_, "should call Validate after build");
+    CHECK(validated_) << "should call Validate after build";
   }
 
-  void Validate();
-
  protected:
   struct VariableBuilder {
-    OpProto::Var* var_;
+    proto::OpProto::Var *var_;
 
-    VariableBuilder& AsDuplicable() {
+    VariableBuilder &AsDuplicable() {
       var_->set_duplicable(true);
       return *this;
     }
 
-    VariableBuilder& AsIntermediate() {
+    VariableBuilder &AsIntermediate() {
       var_->set_intermediate(true);
       return *this;
     }
 
-    VariableBuilder& AsDispensable() {
+    VariableBuilder &AsDispensable() {
       var_->set_dispensable(true);
       return *this;
     }
+
+    VariableBuilder &Reuse(const std::string &name) {
+      var_->set_reuse(name);
+      return *this;
+    }
   };
 
-  VariableBuilder AddInput(const std::string& name, const std::string& comment);
+  VariableBuilder AddInput(const std::string &name, const std::string &comment);
 
-  VariableBuilder AddOutput(const std::string& name,
-                            const std::string& comment);
+  VariableBuilder AddOutput(const std::string &name,
+                            const std::string &comment);
 
   template <typename T>
-  TypedAttrChecker<T>& AddAttr(const std::string& name,
-                               const std::string& comment,
+  TypedAttrChecker<T> &AddAttr(const std::string &name,
+                               const std::string &comment,
                                bool generated = false) {
-    auto* attr = proto_->add_attrs();
+    auto *attr = proto_->add_attrs();
     attr->set_name(name);
     attr->set_comment(comment);
     attr->set_generated(generated);
@@ -70,21 +90,17 @@ class OpProtoAndCheckerMaker {
     return op_checker_->AddAttrChecker<T>(name);
   }
 
-  void AddComment(const std::string& comment) { proto_->set_comment(comment); }
+  void AddComment(const std::string &comment) { proto_->set_comment(comment); }
 
  private:
   void CheckNoDuplicatedInOutAttrs();
+  void Validate();
 
-  OpProto* proto_;
-  OpAttrChecker* op_checker_;
-  bool validated_{false};
-};
+  void CheckReuseVars();
 
-class NOPMaker : public OpProtoAndCheckerMaker {
- public:
-  NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {}
+  proto::OpProto *proto_;
+  OpAttrChecker *op_checker_;
+  bool validated_{false};
 };
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc
index a8d8c6386af940d4a14016b30de344e1c7877b22..58f70cb39c0d96ed3b9ff35ea132ba75a37f5405 100644
--- a/paddle/fluid/framework/op_proto_maker_test.cc
+++ b/paddle/fluid/framework/op_proto_maker_test.cc
@@ -18,9 +18,7 @@ limitations under the License. */
 
 class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
  public:
-  TestAttrProtoMaker(paddle::framework::proto::OpProto* proto,
-                     paddle::framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddAttr<float>("scale", "scale of test op");
     AddAttr<float>("scale", "scale of test op");
   }
@@ -29,15 +27,14 @@ class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 TEST(ProtoMaker, DuplicatedAttr) {
   paddle::framework::proto::OpProto op_proto;
   paddle::framework::OpAttrChecker op_checker;
-  auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
-  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+  TestAttrProtoMaker proto_maker;
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
 }
 
 class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
  public:
-  TestInOutProtoMaker(paddle::framework::proto::OpProto* proto,
-                      paddle::framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input of test op");
     AddInput("input", "input of test op");
   }
@@ -46,6 +43,27 @@ class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
 TEST(ProtoMaker, DuplicatedInOut) {
   paddle::framework::proto::OpProto op_proto;
   paddle::framework::OpAttrChecker op_checker;
-  auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
-  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+  TestAttrProtoMaker proto_maker;
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
+}
+
+class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "input of test op");
+    AddOutput("XOut", "output of test op").Reuse("X");
+    AddOutput("NoOut", "output of test op").Reuse("NotExists");
+  }
+};
+
+TEST(ProtoMaker, InplaceOutput) {
+  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::OpAttrChecker op_checker;
+  TestInplaceProtoMaker proto_maker;
+  ASSERT_THROW(proto_maker(&op_proto, &op_checker),
+               paddle::platform::EnforceNotMet);
+  // proto_maker(&op_proto, &op_checker);
+  // proto_maker.Make();
+  // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index f1424f13b445155fe4f28732408a2445ab1aa9b7..3314e41cc51d74f87be0e2cd5eba9bb260c16be7 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include <algorithm>
 #include <atomic>
+#include <string>
+#include <tuple>
 #include <type_traits>
 #include <typeinfo>
 #include <unordered_map>
@@ -74,6 +76,20 @@ class OpRegistry {
 template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
 struct OpKernelRegistrarFunctor;
 
+template <typename PlaceType, typename T, typename Func>
+inline void RegisterKernelClass(const char* op_type, const char* library_type,
+                                Func func) {
+  std::string library(library_type);
+  std::string data_layout = "ANYLAYOUT";
+  if (library == "MKLDNN") {
+    data_layout = "MKLDNNLAYOUT";
+  }
+  OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
+                   StringToDataLayout(data_layout),
+                   StringToLibraryType(library_type));
+  OperatorWithKernel::AllOpKernels()[op_type][key] = func;
+}
+
 template <typename PlaceType, size_t I, typename... KernelTypes>
 struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
   using KERNEL_TYPE =
@@ -81,10 +97,10 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
 
   void operator()(const char* op_type, const char* library_type) const {
     using T = typename KERNEL_TYPE::ELEMENT_TYPE;
-    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
-                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
-    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
-
+    RegisterKernelClass<PlaceType, T>(
+        op_type, library_type, [](const framework::ExecutionContext& ctx) {
+          KERNEL_TYPE().Compute(ctx);
+        });
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
     OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
         func;
@@ -97,7 +113,8 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
   void operator()(const char* op_type, const char* library_type) const {}
 };
 
-// User can register many kernel in one place. The data type could be different.
+// User can register many kernel in one place. The data type could be
+// different.
 template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
  public:
@@ -107,6 +124,47 @@ class OpKernelRegistrar : public Registrar {
   }
 };
 
+template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctorEx;
+
+template <typename PlaceType, typename... DataTypeAndKernelType>
+class OpKernelRegistrarEx : public Registrar {
+ public:
+  explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) {
+    OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...>
+        func;
+    func(op_type, library_type);
+  }
+};
+
+template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
+struct OpKernelRegistrarFunctorEx<PlaceType, true, I,
+                                  DataTypeAndKernelType...> {
+  void operator()(const char* op_type, const char* library_type) const {}
+};
+
+template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
+struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+                                  DataTypeAndKernelType...> {
+  using Functor =
+      typename std::tuple_element<I + 1,
+                                  std::tuple<DataTypeAndKernelType...>>::type;
+  using T =
+      typename std::tuple_element<I,
+                                  std::tuple<DataTypeAndKernelType...>>::type;
+
+  void operator()(const char* op_type, const char* library_type) const {
+    RegisterKernelClass<PlaceType, T>(op_type, library_type, Functor());
+
+    constexpr auto size =
+        std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
+    OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2,
+                               DataTypeAndKernelType...>
+        func;
+    func(op_type, library_type);
+  }
+};
+
 /**
  * check if MACRO is used in GLOBAL NAMESPACE.
  */
@@ -141,51 +199,21 @@ class OpKernelRegistrar : public Registrar {
     return 0;                                                          \
   }
 
-/**
- * Macro to register Operator. When the input is duplicable, you should
- * use REGISTER_OP_EX with drop_empty_grad=false instead.
- */
-#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
-                    grad_op_class)                                   \
-  REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,    \
-                 grad_op_class, true)
-
-// When an argument is duplicable, we need to use this version.
-// Perhaps we can omit DropEmptyIG template parameter and
-// only have one version of REGISTER_OP.
-#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,       \
-                       grad_op_class, drop_empty_grad)                        \
-  REGISTER_OPERATOR(grad_op_type, grad_op_class);                             \
-  class _GradOpDescMaker_##grad_op_type##_                                    \
-      : public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
-    using ::paddle::framework::DefaultGradOpDescMaker<                        \
-        drop_empty_grad>::DefaultGradOpDescMaker;                             \
-                                                                              \
-   protected:                                                                 \
-    virtual std::string GradOpType() const { return #grad_op_type; }          \
-  };                                                                          \
-  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_,    \
-                    op_maker_class);
-
-#define REGISTER_OP_WITH_KERNEL(op_type, ...)                         \
-  REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \
-                    ##__VA_ARGS__)
-
 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
   REGISTER_OPERATOR(op_type, op_class, op_maker_class)
 
 /**
  * Macro to register OperatorKernel.
  */
-#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...)        \
+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)        \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__,                      \
+      __reg_op_kernel_##op_type##_##library_type##__,                      \
       "REGISTER_OP_KERNEL must be called in global namespace");            \
   static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
-      __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type,       \
-                                                           #LIBRARY_TYPE); \
-  int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() {                \
-    __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch();          \
+      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,       \
+                                                           #library_type); \
+  int TouchOpKernelRegistrar_##op_type##_##library_type() {                \
+    __op_kernel_registrar_##op_type##_##library_type##__.Touch();          \
     return 0;                                                              \
   }
 
@@ -195,6 +223,25 @@ class OpKernelRegistrar : public Registrar {
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
 
+#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...)      \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
+      __reg_op_kernel_##op_type##_##library_type##__,                       \
+      "REGISTER_OP_KERNEL_EX must be called in global namespace");          \
+  static ::paddle::framework::OpKernelRegistrarEx<place_class, __VA_ARGS__> \
+      __op_kernel_registrar_##op_type##_##library_type##__(#op_type,        \
+                                                           #library_type);  \
+  int TouchOpKernelRegistrar_##op_type##_##library_type() {                 \
+    __op_kernel_registrar_##op_type##_##library_type##__.Touch();           \
+    return 0;                                                               \
+  }
+
+#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...)                 \
+  REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \
+                        __VA_ARGS__)
+
+#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \
+  REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+
 /**
  * Macro to mark what Operator and Kernel
  * we will use and tell the compiler to
diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
index 0d791c8583537d410b838c1662755938353052a9..18b1649cc71d5edd5b07740bbad1fe8f81128898 100644
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
@@ -33,8 +33,7 @@ class CosineOp : public OperatorBase {
 
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input of cosine op");
     AddOutput("output", "output of cosine op");
     AddAttr<float>("scale", "scale of cosine op")
@@ -55,8 +54,7 @@ class MyTestOp : public OperatorBase {
 
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input of cosine op").AsDuplicable();
     AddOutput("output", "output of cosine op").AsIntermediate();
     auto my_checker = [](int i) {
@@ -202,8 +200,9 @@ class CosineOpComplete : public paddle::framework::CosineOp {
 };
 
 TEST(OperatorRegistrar, Test) {
-  using namespace paddle::framework;
-  OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
+  paddle::framework::OperatorRegistrar<
+      CosineOpComplete, paddle::framework::CosineOpProtoAndCheckerMaker>
+      reg("cos");
 }
 
 namespace paddle {
@@ -211,10 +210,7 @@ namespace framework {
 
 class OpKernelTestMaker : public OpProtoAndCheckerMaker {
  public:
-  OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddComment("NoGradOp, same input output. no Grad");
-  }
+  void Make() { AddComment("NoGradOp, same input output. no Grad"); }
 };
 
 class OpWithKernelTest : public OperatorWithKernel {
@@ -274,9 +270,9 @@ TEST(OperatorRegistrar, CUDA) {
 
 static int op_test_value = 0;
 
-using paddle::platform::DeviceContext;
 using paddle::platform::CPUDeviceContext;
 using paddle::platform::CUDADeviceContext;
+using paddle::platform::DeviceContext;
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 371c2fad97b1efd06eea9ac631122f194e65d656..3cf8e8696d739e3f2894e490161b9fb5b459bc41 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -24,6 +24,9 @@ limitations under the License. */
 #include "paddle/fluid/platform/profiler.h"
 
 DECLARE_bool(benchmark);
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");
 
 namespace paddle {
 namespace framework {
@@ -35,7 +38,19 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
     std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
 };
 
-static DDim GetDims(const Scope& scope, const std::string& name) {
+proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
+  if (var->IsType<framework::LoDTensor>()) {
+    return framework::ToDataType(var->Get<framework::LoDTensor>().type());
+  } else if (var->IsType<framework::SelectedRows>()) {
+    return framework::ToDataType(
+        var->Get<framework::SelectedRows>().value().type());
+  } else {
+    PADDLE_THROW("Var should be LoDTensor or SelectedRows");
+  }
+}
+
+static DDim GetDims(const Scope& scope, const std::string& name,
+                    bool get_actual_dim = false) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return DDim({-1});
@@ -44,12 +59,29 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>().dims();
   } else if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().GetCompleteDims();
+    if (get_actual_dim) {
+      return var->Get<SelectedRows>().value().dims();
+    } else {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    }
   } else {
     return DDim({-1});
   }
 }
 
+static int GetRowSize(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) {
+    return -1;
+  }
+
+  if (var->IsType<SelectedRows>()) {
+    return var->Get<SelectedRows>().rows().size();
+  }
+
+  return -1;
+}
+
 static LoD GetLoD(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   auto default_lod = LoD({{}});
@@ -66,6 +98,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }
 
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
+  VLOG(10) << "- " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
     PADDLE_THROW("Cannot run operator on place %s", place);
@@ -74,10 +107,16 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
-  // profile
-  auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-  platform::RecordEvent record_event(Type(), dev_ctx);
   RunImpl(scope, place);
+  VLOG(10) << "+ " << DebugStringEx(&scope);
+}
+
+bool OperatorBase::HasInputs(const std::string& name) const {
+  if (inputs_.find(name) != inputs_.end()) {
+    return true;
+  } else {
+    return false;
+  }
 }
 
 std::string OperatorBase::Input(const std::string& name) const {
@@ -96,6 +135,14 @@ const std::vector<std::string>& OperatorBase::Inputs(
   return it->second;
 }
 
+bool OperatorBase::HasOutputs(const std::string& name) const {
+  if (outputs_.find(name) != outputs_.end()) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
 std::string OperatorBase::Output(const std::string& name) const {
   auto& outs = Outputs(name);
   PADDLE_ENFORCE_LE(outs.size(), 1UL,
@@ -121,7 +168,11 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < input.second.size(); ++i) {
       ss << input.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, input.second[i]) << "]";
+        int row_size = GetRowSize(*scope, input.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
+        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
       if (i != input.second.size() - 1) {
@@ -141,7 +192,11 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < output.second.size(); ++i) {
       ss << output.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, output.second[i]) << "]";
+        int row_size = GetRowSize(*scope, output.second[i]);
+        if (row_size >= 0) {
+          ss << "[row_size=" << row_size << "]";
+        }
+        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
       if (i != output.second.size() - 1) {
@@ -158,17 +213,6 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
   return ss.str();
 }
 
-void OperatorBase::Rename(const std::string& old_name,
-                          const std::string& new_name) {
-  for (auto& input : inputs_) {
-    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
-  }
-  for (auto& output : outputs_) {
-    std::replace(output.second.begin(), output.second.end(), old_name,
-                 new_name);
-  }
-}
-
 OperatorBase::OperatorBase(const std::string& type,
                            const VariableNameMap& inputs,
                            const VariableNameMap& outputs,
@@ -218,13 +262,18 @@ void OperatorBase::CheckAllInputOutputSet() const {
   if (op_info == nullptr || op_info->proto_ == nullptr) return;
 
   for (auto& in : op_info->Proto().inputs()) {
-    PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
-                   "Type %s's input %s is not set", Type(), in.name());
+    if (!in.dispensable()) {
+      PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
+                     "Operator %s's input, %s, is not set", Type(), in.name());
+    }
   }
 
   for (auto& out : op_info->Proto().outputs()) {
-    PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
-                   "Type %s's output %s is not set", Type(), out.name());
+    if (!out.dispensable()) {
+      PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
+                     "Operator %s's output, %s, is not set", Type(),
+                     out.name());
+    }
   }
 }
 
@@ -267,6 +316,38 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
   }
 }
 
+bool ExecutionContext::HasInput(const std::string& name) const {
+  if (!op_.HasInputs(name)) {
+    return false;
+  }
+  auto& ins = Inputs(name);
+  size_t length = ins.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input %s should not have more than one inputs", name);
+  auto arg = ins[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
+bool ExecutionContext::HasOutput(const std::string& name) const {
+  if (!op_.HasOutputs(name)) {
+    return false;
+  }
+  auto& outs = Outputs(name);
+  size_t length = outs.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output %s should not have more than one inputs", name);
+  auto arg = outs[0];
+  auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg);
+  return var != nullptr;
+}
+
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   auto* var = InputVar(name);
@@ -314,7 +395,6 @@ bool OpSupportGPU(const std::string& op_type) {
   auto it = all_kernels.find(op_type);
   if (it == all_kernels.end()) {
     // All control operator must support GPU
-
     return true;
   }
   for (auto& kern_pair : it->second) {
@@ -331,6 +411,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
       : op_(op), scope_(scope) {}
 
   bool HasInput(const std::string& name) const override {
+    if (!op_.HasInputs(name)) {
+      return false;
+    }
     auto& ins = Inputs(name);
     size_t length = ins.size();
     if (length == 0) {
@@ -344,6 +427,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }
 
   bool HasOutput(const std::string& name) const override {
+    if (!op_.HasOutputs(name)) {
+      return false;
+    }
     auto& outs = Outputs(name);
     size_t length = outs.size();
     if (length == 0) {
@@ -357,6 +443,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }
 
   bool HasInputs(const std::string& name) const override {
+    if (!op_.HasInputs(name)) {
+      return false;
+    }
     auto inputs = op_.Inputs(name);
     if (inputs.empty()) {
       return false;
@@ -370,6 +459,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }
 
   bool HasOutputs(const std::string& name) const override {
+    if (!op_.HasOutputs(name)) {
+      return false;
+    }
     auto outputs = op_.Outputs(name);
     if (outputs.empty()) {
       return false;
@@ -407,10 +499,25 @@ class RuntimeInferShapeContext : public InferShapeContext {
     auto* out_tensor = out_var->GetMutable<LoDTensor>();
     out_tensor->set_lod(in_tensor.lod());
 
-    // TODO(dzhwinter) : reuse ShareLoD in most operators.
-    // Need to call ShareLayout explicitly in sequence related ops.
-    // Shall we have a better method to shared info between in/out Tensor?
-    out_tensor->set_layout(in_tensor.layout());
+// TODO(dzhwinter) : reuse ShareLoD in most operators.
+// Need to call ShareLayout explicitly in sequence related ops.
+// Shall we have a better method to shared info between in/out Tensor?
+#ifdef PADDLE_WITH_MKLDNN
+    // Fix me: ugly workaround below
+    // Correct solution:
+    //    set_layout() should NOT be called here (i.e. ShareLoD). Instead,
+    //    layout of output tensor should be set "manually" in Compute()
+    //    of each OPKernel. The reason layout should NOT be shared between
+    //    input and output "automatically" (now by InferShape()->ShareLoD())
+    //    is that layout transform may occur after InferShape().
+    // Workaround:
+    //    Skip set_layout() when input layout is kMKLDNN
+    //    This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN
+    //    OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called
+    //    in Compute()
+    if (in_tensor.layout() != DataLayout::kMKLDNN)
+#endif
+      out_tensor->set_layout(in_tensor.layout());
   }
 
   void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
@@ -432,6 +539,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
  protected:
   DDim GetDim(const std::string& name) const override {
     Variable* var = scope_.FindVar(name);
+    PADDLE_ENFORCE_NOT_NULL(var);
     if (var->IsType<LoDTensor>()) {
       return var->Get<LoDTensor>().dims();
     } else if (var->IsType<SelectedRows>()) {
@@ -445,15 +553,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
   }
 
   std::vector<DDim> GetRepeatedDims(const std::string& name) const override {
-    Variable* var = scope_.FindVar(name);
-    if (var->IsType<ReaderHolder>()) {
-      return var->Get<ReaderHolder>().shapes();
-    } else {
-      PADDLE_THROW(
-          "Only ReaderHolder support 'GetRepeatedDims', but Variable %s's "
-          "type_id is %s.",
-          name, var->Type().name());
-    }
+    PADDLE_THROW("Only compile time support this method");
   }
 
   void SetDim(const std::string& name, const DDim& dim) override {
@@ -470,15 +570,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
 
   void SetRepeatedDims(const std::string& name,
                        const std::vector<DDim>& dims) override {
-    Variable* var = scope_.FindVar(name);
-    if (var->IsType<ReaderHolder>()) {
-      var->GetMutable<ReaderHolder>()->set_shapes(dims);
-    } else {
-      PADDLE_THROW(
-          "Only ReaderHolder support 'SetRepeatedDims', but Variable %s's "
-          "type_id is %s.",
-          name, var->Type().name());
-    }
+    PADDLE_THROW("Only compile time support this method");
   }
 
   proto::VarType::Type GetVarType(const std::string& name) const override {
@@ -495,12 +587,30 @@ class RuntimeInferShapeContext : public InferShapeContext {
   const Scope& scope_;
 };
 
+static void CheckTensorNANOrInf(const std::string& name,
+                                const framework::Tensor& tensor) {
+  if (tensor.memory_size() == 0) {
+    return;
+  }
+  if (!IsType<float>(tensor.type()) && !IsType<double>(tensor.type())) {
+    return;
+  }
+  PADDLE_ENFORCE(!framework::TensorContainsInf(tensor),
+                 "Tensor %s contains Inf", name);
+  PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor),
+                 "Tensor %s contains NAN", name);
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
+
+  // For profiling, don't move out of this function because that will result
+  // in the failure of multi-GPU profiling.
+  platform::RecordEvent record_event(Type(), dev_ctx);
   // check if op[type] has kernel registered.
   auto& all_op_kernels = AllOpKernels();
   auto kernels_iter = all_op_kernels.find(type_);
@@ -509,8 +619,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
         "There are no kernels which are registered in the %s operator.", type_);
   }
 
-  ExecutionContext ctx(*this, scope, *dev_ctx);
-
   OpKernelMap& kernels = kernels_iter->second;
 
   // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
@@ -520,7 +628,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   //   Do selection
   // }
 
-  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+  auto expected_kernel_key =
+      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
@@ -529,47 +638,98 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
                  KernelTypeToString(expected_kernel_key));
   }
 
-  // do data transform
-  Scope& new_scope = scope.NewScope();
+  // do data transformScope &transfer_scope;
+  std::vector<std::string> transfered_inplace_vars;
+  auto* transfer_scope =
+      TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
 
-  for (auto& var_name_item : this->Inputs()) {
-    for (auto& var_name : var_name_item.second) {
-      auto* var = scope.FindVar(var_name);
-      if (var && VarIsTensor(var)) {
-        auto* tensor_in = GetTensorFromVar(var);
-        if (tensor_in->IsInitialized()) {
-          auto kernel_type_for_var = this->GetKernelTypeForVar(
-              var_name_item.first, *tensor_in, expected_kernel_key);
-          if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) {
-            auto out_var_names = OutputVars(true);
-            if (std::find(out_var_names.begin(), out_var_names.end(),
-                          var_name) != out_var_names.end()) {
-              PADDLE_THROW(
-                  "var %s is both input and output, "
-                  "does not support transform",
-                  var_name);
-            }
-            VLOG(3) << "Transform Variable " << var_name << " from "
-                    << kernel_type_for_var << " to " << expected_kernel_key;
-            auto* trans_var = new_scope.Var(var_name);
-            std::shared_ptr<Tensor> out(new Tensor);
-            DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in,
-                          out.get());
-            CopyVariableWithTensor(*var, *(out.get()), *trans_var);
-          }
-        }
-      }
-    }
+  // exec scope is the scope that kernel actually executed on.
+  const Scope& exec_scope =
+      (transfer_scope == nullptr ? scope : *transfer_scope);
+
+  if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
+    dev_ctx = pool.Get(expected_kernel_key.place_);
   }
 
-  auto* new_dev_ctx = pool.Get(expected_kernel_key.place_);
-  kernel_iter->second->Compute(
-      ExecutionContext(*this, new_scope, *new_dev_ctx));
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+
+  if (!transfered_inplace_vars.empty()) {
+    // there is inplace variable has been transfered.
+    TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
+  }
 
   /*For profiling/benchmark only*/
   if (FLAGS_benchmark) {
-    new_dev_ctx->Wait();
+    dev_ctx->Wait();
+  }
+
+  if (FLAGS_check_nan_inf) {
+    for (auto& vname : OutputVars(true)) {
+      auto* var = exec_scope.FindVar(vname);
+      if (var == nullptr) continue;
+      if (var->IsType<framework::LoDTensor>()) {
+        CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+      }
+    }
+  }
+}
+void OperatorWithKernel::TransferInplaceVarsBack(
+    const Scope& scope, const std::vector<std::string>& inplace_vars,
+    const Scope& transfer_scope) const {
+  for (auto& var_name : inplace_vars) {
+    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
+    auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
+    auto* transformed_tensor =
+        GetTensorFromVar(transfer_scope.FindVar(var_name));
+    original_tensor->ShareDataWith(*transformed_tensor);
+  }
+}
+
+Scope* OperatorWithKernel::TryTransferData(
+    const Scope& scope, const OpKernelType& expected_kernel_key,
+    std::vector<std::string>* transfered_inplace_vars) const {
+  Scope* new_scope = nullptr;
+  for (auto& var_name_item : Inputs()) {
+    for (auto& var_name : var_name_item.second) {
+      auto* var = scope.FindVar(var_name);
+      // Only tensor can be tranfer to another device.
+      if (var == nullptr || !VarIsTensor(var)) {
+        continue;
+      }
+
+      auto* tensor_in = GetTensorFromVar(var);
+      if (!tensor_in->IsInitialized()) {
+        continue;
+      }
+
+      auto kernel_type_for_var = GetKernelTypeForVar(
+          var_name_item.first, *tensor_in, expected_kernel_key);
+
+      if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
+        continue;
+      }
+
+      auto out_var_names = OutputVars(true);
+      if (std::find(out_var_names.begin(), out_var_names.end(), var_name) !=
+          out_var_names.end()) {
+        transfered_inplace_vars->emplace_back(var_name);
+      }
+
+      VLOG(3) << "Transform Variable " << var_name << " from "
+              << kernel_type_for_var << " to " << expected_kernel_key;
+
+      if (new_scope == nullptr) {
+        new_scope = &scope.NewScope();
+      }
+
+      auto* trans_var = new_scope->Var(var_name);
+      Tensor out;
+      TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
+      SetTensorToVariable(*var, out, trans_var);
+    }
   }
+
+  return new_scope;
 }
 
 proto::VarType::Type OperatorWithKernel::IndicateDataType(
@@ -590,8 +750,10 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
         }
         if (t != nullptr) {
           int tmp = static_cast<int>(ToDataType(t->type()));
-          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
-                         "DataType of Paddle Op %s must be the same.", Type());
+          PADDLE_ENFORCE(
+              tmp == data_type || data_type == -1,
+              "DataType of Paddle Op %s must be the same. Get %d != %d", Type(),
+              data_type, tmp);
           data_type = tmp;
         }
       }
@@ -609,7 +771,8 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
 OpKernelType OperatorWithKernel::GetKernelTypeForVar(
     const std::string& var_name, const Tensor& tensor,
     const OpKernelType& expected_kernel_type) const {
-  return OpKernelType(expected_kernel_type.data_type_, tensor.place());
+  return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
+                      tensor.layout());
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 41214b41cb68cbd7049552f39195ae5257e0d06f..01d750efbb8aaa35701f6caa7ec103ec21dd529e 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -33,7 +33,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/variant.h"
-#include "paddle/utils/Error.h"
 
 namespace paddle {
 namespace framework {
@@ -61,6 +60,8 @@ inline std::string GradVarName(const std::string& var_name) {
   return var_name + kGradVarSuffix;
 }
 
+proto::VarType::Type GetDataTypeOfVar(const Variable* var);
+
 class OperatorBase;
 class ExecutionContext;
 
@@ -77,54 +78,49 @@ class OperatorBase {
 
   virtual ~OperatorBase() {}
 
-  template <typename T>
-  inline const T& Attr(const std::string& name) const {
-    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
-                   name);
-    return boost::get<T>(attrs_.at(name));
-  }
-
-  /// if scope is not null, also show dimensions of arguments
-  virtual std::string DebugStringEx(const Scope* scope) const;
-
-  std::string DebugString() const { return DebugStringEx(nullptr); }
-
-  /// Net will call this interface function to Run an op.
+  /// Executor will call this interface function to Run an op.
   //  The implementation should be written at RunImpl
   void Run(const Scope& scope, const platform::Place& place);
 
   // FIXME(typhoonzero): this is only used for recv_op to stop event_loop.
   virtual void Stop() {}
 
-  virtual bool IsNetOp() const { return false; }
+  /// if scope is not null, also show dimensions of arguments
+  virtual std::string DebugStringEx(const Scope* scope) const;
+  std::string DebugString() const { return DebugStringEx(nullptr); }
 
   virtual bool SupportGPU() const { return false; }
 
-  /// rename inputs outputs name
-  void Rename(const std::string& old_name, const std::string& new_name);
+  const std::string& Type() const { return type_; }
+
+  template <typename T>
+  inline const T& Attr(const std::string& name) const {
+    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
+                   name);
+    return boost::get<T>(attrs_.at(name));
+  }
+  const AttributeMap& Attrs() const { return attrs_; }
 
   const VariableNameMap& Inputs() const { return inputs_; }
   const VariableNameMap& Outputs() const { return outputs_; }
 
+  bool HasInputs(const std::string& name) const;
   //! Get a input with argument's name described in `op_proto`
   std::string Input(const std::string& name) const;
   //! Get a input which has multiple variables.
   const std::vector<std::string>& Inputs(const std::string& name) const;
-
+  //! Get all inputs variable names
   std::vector<std::string> InputVars() const;
 
+  bool HasOutputs(const std::string& name) const;
   //! Get a output with argument's name described in `op_proto`
   std::string Output(const std::string& name) const;
   //! Get an output which has multiple variables.
   //! TODO add a vector_view to prevent memory copy.
   const std::vector<std::string>& Outputs(const std::string& name) const;
-
+  //! Get all outputs variable names
   virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
 
-  const std::string& Type() const { return type_; }
-  void SetType(const std::string& type) { type_ = type; }
-  const AttributeMap& Attrs() const { return attrs_; }
-
   // Return a new operator instance, which is as same as this.
   // Use unique_ptr to prevent caller forget to delete this pointer.
   virtual std::unique_ptr<OperatorBase> Clone() const = 0;
@@ -195,6 +191,10 @@ class ExecutionContext {
     return op_.Attr<T>(name);
   }
 
+  bool HasInput(const std::string& name) const;
+
+  bool HasOutput(const std::string& name) const;
+
   size_t InputSize(const std::string& name) const {
     return op_.Inputs(name).size();
   }
@@ -276,20 +276,6 @@ class ExecutionContext {
     return res;
   }
 
-  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) const {
-    PADDLE_ENFORCE_LT(i, InputSize(in));
-    PADDLE_ENFORCE_LT(j, OutputSize(out));
-    auto* in_var = MultiInputVar(in)[i];
-    auto* out_var = MultiOutputVar(out)[j];
-    if (!in_var->IsType<LoDTensor>()) return;
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto in_tensor = in_var->Get<LoDTensor>();
-    auto* out_tensor = out_var->GetMutable<LoDTensor>();
-    out_tensor->set_lod(in_tensor.lod());
-  }
-
   platform::Place GetPlace() const { return device_context_.GetPlace(); }
 
   template <typename DeviceContextType>
@@ -361,9 +347,9 @@ class OpKernel : public OpKernelBase {
 
 class OperatorWithKernel : public OperatorBase {
  public:
+  using OpKernelFunc = std::function<void(const ExecutionContext&)>;
   using OpKernelMap =
-      std::unordered_map<OpKernelType, std::unique_ptr<OpKernelBase>,
-                         OpKernelType::Hash>;
+      std::unordered_map<OpKernelType, OpKernelFunc, OpKernelType::Hash>;
 
   OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
                      const VariableNameMap& outputs, const AttributeMap& attrs)
@@ -398,6 +384,20 @@ class OperatorWithKernel : public OperatorBase {
   // same.
   proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
+
+  /**
+   * Transfer data from scope to a transfered scope. If there is no data need to
+   * be tranfered, it returns nullptr.
+   *
+   * * transfered_inplace_vars is a output vector.
+   */
+  Scope* TryTransferData(
+      const Scope& scope, const OpKernelType& expected_kernel_key,
+      std::vector<std::string>* transfered_inplace_vars) const;
+
+  void TransferInplaceVarsBack(const Scope& scope,
+                               const std::vector<std::string>& inplace_vars,
+                               const Scope& exec_scope) const;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index 44ca4d7ca564515ae267c5949d29feaf22790251..74043b5d7990178976baf2fad991ae03f9c8dd25 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -46,8 +46,7 @@ class OpWithoutKernelTest : public OperatorBase {
 
 class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("input", "input of test op");
     AddOutput("output", "output of test op");
     AddAttr<float>("scale", "scale of cosine op");
@@ -72,7 +71,7 @@ REGISTER_OP_WITHOUT_GRADIENT(test_operator,
                              paddle::framework::OpWithoutKernelCheckerMaker);
 
 TEST(OperatorBase, all) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("test_operator");
   BuildVar("input", {"IN1"}, op_desc.add_inputs());
@@ -98,8 +97,7 @@ namespace framework {
 
 class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  public:
-  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("x", "input of test op");
     AddOutput("y", "output of test op");
     AddAttr<float>("scale", "scale of cosine op")
@@ -137,9 +135,7 @@ class CPUKernelTest : public OpKernel<float> {
 class OpKernelTestMultiInputsProtoAndCheckerMaker
     : public OpProtoAndCheckerMaker {
  public:
-  OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
-                                              OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("xs", "inputs of test op").AsDuplicable();
     AddInput("k", "input of test op");
     AddOutput("ys", "outputs of test op").AsDuplicable();
@@ -198,7 +194,7 @@ REGISTER_OP_CPU_KERNEL(op_with_kernel,
 
 // test with single input
 TEST(OpKernel, all) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
   paddle::framework::proto::OpDesc op_desc;
   op_desc.set_type("op_with_kernel");
   BuildVar("x", {"IN1"}, op_desc.add_inputs());
@@ -226,10 +222,8 @@ REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
 
 // test with multi inputs
 TEST(OpKernel, multi_inputs) {
-  using namespace paddle::framework;
-
-  paddle::framework::InitDevices();
-  proto::OpDesc op_desc;
+  paddle::framework::InitDevices(true);
+  paddle::framework::proto::OpDesc op_desc;
 
   op_desc.set_type("op_multi_inputs_with_kernel");
   BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
@@ -243,12 +237,12 @@ TEST(OpKernel, multi_inputs) {
 
   paddle::platform::CPUPlace cpu_place;
   paddle::framework::Scope scope;
-  scope.Var("x0")->GetMutable<LoDTensor>();
-  scope.Var("x1")->GetMutable<LoDTensor>();
-  scope.Var("x2")->GetMutable<LoDTensor>();
-  scope.Var("k0")->GetMutable<LoDTensor>();
-  scope.Var("y0")->GetMutable<LoDTensor>();
-  scope.Var("y1")->GetMutable<LoDTensor>();
+  scope.Var("x0")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("x1")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("x2")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("k0")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("y0")->GetMutable<paddle::framework::LoDTensor>();
+  scope.Var("y1")->GetMutable<paddle::framework::LoDTensor>();
 
   auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
   op->Run(scope, cpu_place);
@@ -269,7 +263,7 @@ class OperatorClone : public paddle::framework::OperatorBase {
 };
 
 TEST(Operator, Clone) {
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
   OperatorClone a("ABC", paddle::framework::VariableNameMap{},
                   paddle::framework::VariableNameMap{},
                   paddle::framework::AttributeMap{});
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3
--- /dev/null
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -0,0 +1,267 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/parallel_executor.h"
+
+#include <string>
+#include <tuple>
+#include <vector>
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
+#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace framework {
+
+class ParallelExecutorPrivate {
+ public:
+  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
+      : places_(places) {}
+
+  std::vector<platform::Place> places_;
+  std::vector<Scope *> local_scopes_;
+  Scope *global_scope_;
+  std::unique_ptr<details::SSAGraphExecutor> executor_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
+#endif
+  bool own_local_scope_;
+  bool use_cuda_;
+};
+
+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+  return member_->local_scopes_;
+}
+
+ParallelExecutor::ParallelExecutor(
+    const std::vector<platform::Place> &places,
+    const std::unordered_set<std::string> &params,
+    const std::unordered_set<std::string> &bcast_vars,
+    const ProgramDesc &main_program, const std::string &loss_var_name,
+    Scope *scope, const std::vector<Scope *> &local_scopes,
+    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy,
+    size_t num_trainers, size_t trainer_id)
+    : member_(new ParallelExecutorPrivate(places)) {
+  member_->global_scope_ = scope;
+  member_->use_cuda_ = exec_strategy.use_cuda_;
+
+  // Step 1. Bcast the params to devs.
+  // Create local scopes
+  if (local_scopes.empty()) {
+    member_->own_local_scope_ = true;
+    member_->local_scopes_.emplace_back(member_->global_scope_);
+    for (size_t i = 1; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.emplace_back(&scope->NewScope());
+    }
+  } else {
+    member_->own_local_scope_ = false;
+    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope());
+    }
+  }
+
+  if (member_->use_cuda_) {
+// Bcast Parameters to all GPUs
+#ifdef PADDLE_WITH_CUDA
+    auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
+    ncclUniqueId *nccl_id = nullptr;
+    if (nccl_id_var != nullptr) {
+      nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
+    }
+    member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
+        member_->places_, nccl_id, num_trainers, trainer_id));
+#else
+    PADDLE_THROW("Not compiled with CUDA");
+#endif
+  }
+
+  if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
+    BCastParamsToGPUs(bcast_vars);
+  }
+  // Startup Program has been run. All local scopes has correct parameters.
+
+  // Step 2. Create vars in each scope;
+  std::vector<details::VariableInfo> var_infos;
+  for (auto *var : main_program.Block(0).AllVars()) {
+    var_infos.emplace_back();
+    var_infos.back().name_ = var->Name();
+    var_infos.back().type_ = var->GetType();
+    var_infos.back().persistable_ = var->Persistable();
+  }
+
+  // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
+  // ncclOp
+  details::SSAGraphBuilderFactory builder_factory(
+      member_->places_, loss_var_name, params, member_->local_scopes_,
+      build_strategy);
+  if (member_->use_cuda_) {
+#ifdef PADDLE_WITH_CUDA
+    builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
+#else
+    PADDLE_THROW("Not compiled with CUDA");
+#endif
+  }
+
+  builder_ = builder_factory.Create();
+  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
+      exec_strategy, member_->local_scopes_, places,
+      builder_->Build(main_program)));
+
+  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
+      exec_strategy, member_->local_scopes_, std::move(var_infos),
+      member_->places_, std::move(member_->executor_)));
+}
+
+void ParallelExecutor::BCastParamsToGPUs(
+    const std::unordered_set<std::string> &vars) const {
+  // the the initializing bcast, all vars would be bcast from device(0),
+  // otherwise
+  // bcast from the specified device.
+  bool initializing = builder_.get() == nullptr ? true : false;
+
+  for (auto &var : vars) {
+    int var_dev_id =
+        builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var);
+    if (!initializing && var_dev_id == -1) continue;
+
+    framework::Variable *main_var = nullptr;
+    if (initializing) {
+      main_var = member_->local_scopes_[0]->FindVar(var);
+    } else {
+      main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
+    }
+
+    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
+      continue;
+    }
+
+    auto &main_tensor = main_var->Get<LoDTensor>();
+    auto &dims = main_tensor.dims();
+    if (paddle::platform::is_gpu_place(main_tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      std::vector<void *> buffers;
+      size_t numel = main_tensor.numel();
+      ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
+      for (size_t i = 0; i < member_->places_.size(); ++i) {
+        auto place = member_->places_[i];
+        void *buffer;
+
+        if ((initializing && i == 0) ||
+            (!initializing && static_cast<int>(i) == var_dev_id)) {
+          buffer = const_cast<void *>(main_tensor.data<void>());
+        } else {
+          auto local_scope = member_->local_scopes_[i];
+          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
+          t->Resize(dims);
+          buffer = t->mutable_data(place, main_tensor.type());
+        }
+        buffers.push_back(buffer);
+      }
+
+      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                        "variables' buffer size to bcast NOT equal to places");
+      {
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
+          if (initializing) {
+            platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                         nccl_ctx.comm_, nccl_ctx.stream());
+          } else {
+            if (var_dev_id >= 0) {
+              platform::dynload::ncclBcast(buffers[i], numel, data_type,
+                                           var_dev_id, nccl_ctx.comm_,
+                                           nccl_ctx.stream());
+            }
+          }
+        }
+        member_->nccl_ctxs_->WaitAll();
+      }
+
+#else
+      PADDLE_THROW("Not compiled with CUDA");
+#endif
+    } else {
+      platform::CPUPlace cpu;
+      for (size_t i = 1; i < member_->places_.size(); ++i) {
+        auto local_scope = member_->local_scopes_[i];
+        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
+        t->Resize(dims);
+        t->mutable_data(cpu, main_tensor.type());
+        paddle::framework::TensorCopy(main_tensor, cpu, t);
+      }
+    }
+  }
+}
+
+void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
+                           const std::string &fetched_var_name) {
+  platform::RecordBlock b(0);
+  auto fetch_data = member_->executor_->Run(fetch_tensors);
+  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
+      fetch_data;
+}
+
+void ParallelExecutor::FeedTensorsIntoLocalScopes(
+    const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors) {
+  PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size());
+
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    auto &map = tensors[i];
+    auto *scope = member_->local_scopes_[i];
+    for (auto &pair : map) {
+      auto *trg = scope->Var(pair.first)->GetMutable<LoDTensor>();
+      trg->ShareDataWith(pair.second);
+      trg->set_lod(pair.second.lod());
+    }
+  }
+}
+
+void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
+    const std::unordered_map<std::string, LoDTensor> &tensors) {
+  for (auto pair : tensors) {
+    auto lod_tensors = pair.second.SplitLoDTensor(member_->places_);
+    PADDLE_ENFORCE_EQ(
+        member_->places_.size(), lod_tensors.size(),
+        "The number of samples of current batch is less than the count of "
+        "devices, currently, it is not allowed. (%d vs %d)",
+        member_->places_.size(), lod_tensors.size());
+    for (size_t j = 0; j < member_->places_.size(); ++j) {
+      // TODO(panxy0718): Do I need to delete this var?
+      auto t =
+          member_->local_scopes_[j]->Var(pair.first)->GetMutable<LoDTensor>();
+      t->ShareDataWith(lod_tensors[j]);
+      t->set_lod(lod_tensors[j].lod());
+    }
+  }
+}
+
+ParallelExecutor::~ParallelExecutor() {
+  if (member_->own_local_scope_) {
+    for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
+      member_->global_scope_->DeleteScope(member_->local_scopes_[i]);
+    }
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..058f83f07c26224e3180d140630c08a24c40cd80
--- /dev/null
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <paddle/fluid/framework/details/build_strategy.h>
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+class ParallelExecutorPrivate;
+
+using details::BuildStrategy;
+using details::ExecutionStrategy;
+
+class ParallelExecutor {
+  DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
+
+ public:
+  explicit ParallelExecutor(const std::vector<platform::Place> &places,
+                            const std::unordered_set<std::string> &params,
+                            const std::unordered_set<std::string> &bcast_vars,
+                            const ProgramDesc &main_program,
+                            const std::string &loss_var_name, Scope *scope,
+                            const std::vector<Scope *> &local_scopes,
+                            const ExecutionStrategy &exec_strategy,
+                            const BuildStrategy &build_strategy,
+                            size_t num_trainers = 1, size_t trainer_id = 0);
+
+  ~ParallelExecutor();
+
+  std::vector<Scope *> &GetLocalScopes();
+
+  /**
+   * Feed tensors to local scopes. The size of tensors should be equal to the
+   * size of local scopes.
+   */
+  void FeedTensorsIntoLocalScopes(
+      const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors);
+
+  void FeedAndSplitTensorIntoLocalScopes(
+      const std::unordered_map<std::string, LoDTensor> &tensors);
+
+  void Run(const std::vector<std::string> &fetch_tensors,
+           const std::string &fetched_var_name);
+
+  void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const;
+
+ private:
+  ParallelExecutorPrivate *member_;
+  std::unique_ptr<details::SSAGraphBuilder> builder_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc
index 049731c7216e542dedcf8754eef79f0a672291d6..1e01a6e900404990e16674755367d2fc6d832725 100644
--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -27,10 +27,14 @@ BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
   return blocks_.back().get();
 }
 
-proto::ProgramDesc *ProgramDesc::Proto() {
+void ProgramDesc::Flush() {
   for (auto &block : blocks_) {
     block->Flush();
   }
+}
+
+proto::ProgramDesc *ProgramDesc::Proto() {
+  Flush();
   return &desc_;
 }
 
@@ -47,12 +51,15 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
     auto *block = desc_.mutable_blocks(i);
     blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
   }
-  for (auto &block : blocks_) {
-    for (auto *op : block->AllOps()) {
-      for (const auto &attr : op->Proto()->attrs()) {
-        if (attr.type() == proto::AttrType::BLOCK) {
-          size_t blk_idx = attr.block_idx();
-          op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
+  for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) {
+    auto all_ops = blocks_[block_id]->AllOps();
+    for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
+      auto &op = all_ops[op_id];
+      for (const std::string &attr_name : op->AttrNames()) {
+        if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
+          int sub_block_id =
+              o.Block(block_id).Op(op_id)->GetBlockAttr(attr_name);
+          op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
         }
       }
     }
@@ -69,7 +76,7 @@ ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
       for (const auto &attr : op->Proto()->attrs()) {
         if (attr.type() == proto::AttrType::BLOCK) {
           size_t blk_idx = attr.block_idx();
-          op->SetBlockAttr(attr.name(), *this->MutableBlock(blk_idx));
+          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
         }
       }
     }
@@ -82,12 +89,22 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
   for (auto &block_desc : *desc_.mutable_blocks()) {
     blocks_.emplace_back(new BlockDesc(this, &block_desc));
   }
+  for (auto &block : blocks_) {
+    for (auto *op : block->AllOps()) {
+      for (const auto &attr : op->Proto()->attrs()) {
+        if (attr.type() == proto::AttrType::BLOCK) {
+          size_t blk_idx = attr.block_idx();
+          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+        }
+      }
+    }
+  }
 }
 
 const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
-  BlockDesc *global_block = blocks_[0].get();
+  auto &global_block = Block(0);
   std::vector<std::string> feed_target_names;
-  for (auto *op : global_block->AllOps()) {
+  for (auto *op : global_block.AllOps()) {
     if (op->Type() == kFeedOpType) {
       feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
     }
@@ -96,9 +113,9 @@ const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
 }
 
 const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
-  BlockDesc *global_block = blocks_[0].get();
+  auto &global_block = Block(0);
   std::vector<std::string> fetch_target_names;
-  for (auto *op : global_block->AllOps()) {
+  for (auto *op : global_block.AllOps()) {
     if (op->Type() == kFetchOpType) {
       fetch_target_names.push_back(op->Input("X")[0]);
     }
@@ -106,5 +123,43 @@ const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
   return fetch_target_names;
 }
 
+void ProgramDesc::SetFeedHolderName(const std::string &feed_holder_name) {
+  auto *global_block = MutableBlock(0);
+  int index = 0;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      // Unify the input's name of all feed_ops to feed_holder_name
+      global_block->RemoveVar(op->Input("X")[0]);
+      op->SetInput("X", {feed_holder_name});
+      op->SetAttr("col", {index});
+      op->CheckAttrs();
+      index++;
+    }
+  }
+
+  auto *feed_holder = global_block->Var(feed_holder_name);
+  feed_holder->SetType(proto::VarType::FEED_MINIBATCH);
+  feed_holder->SetPersistable(true);
+}
+
+void ProgramDesc::SetFetchHolderName(const std::string &fetch_holder_name) {
+  auto *global_block = MutableBlock(0);
+  int index = 0;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      // Unify the output's name of all fetch_ops to fetch_holder_name
+      global_block->RemoveVar(op->Output("Out")[0]);
+      op->SetOutput("Out", {fetch_holder_name});
+      op->SetAttr("col", {index});
+      op->CheckAttrs();
+      index++;
+    }
+  }
+
+  auto *fetch_holder = global_block->Var(fetch_holder_name);
+  fetch_holder->SetType(proto::VarType::FETCH_LIST);
+  fetch_holder->SetPersistable(true);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h
index 538a0372116e6f90fd2fae5f00097b8efc5dcb5c..65fa0a0cfd5ba6d9b8765cee1309e118cb74348a 100644
--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -50,11 +51,30 @@ class ProgramDesc {
 
   size_t Size() const { return blocks_.size(); }
 
+  void Flush();
+
   proto::ProgramDesc *Proto();
 
+  // The output variable of feed_op is referenced as feed_target.
+  // This function is used to collect the output variable's name of all
+  // feed_ops.
   const std::vector<std::string> GetFeedTargetNames();
+
+  // The input variable of fetch_op is referenced as fetch_target.
+  // This function is used to collect the input variable's name of all
+  // fetch_ops.
   const std::vector<std::string> GetFetchTargetNames();
 
+  // The input variable of feed_op that holds input Tensor provided by users is
+  // referenced as feed_holder.
+  // This function is used to change or unify the feed_holder variables' name.
+  void SetFeedHolderName(const std::string &feed_holder_name);
+
+  // The output variable of fetch_op that holds output Tensor needed by users is
+  // referenced as fetch_holder.
+  // This function is used to change or unify the fetch_holder variables' name.
+  void SetFetchHolderName(const std::string &fetch_holder_name);
+
  private:
   proto::ProgramDesc desc_;
 
diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc
index 66618a291b59996836e822587af618927a4263c7..6c46e9aad5b7fbf67fdcc07a12e7932ac8b6412b 100644
--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -66,7 +66,7 @@ TEST(ProgramDesc, copy_ctor) {
 
   for (size_t i = 0; i < global_block->OpSize(); ++i) {
     auto op_origin = global_block->Op(i);
-    auto op_copy = global_block->Op(i);
+    auto op_copy = global_block_copy->Op(i);
 
     ASSERT_EQ(op_origin->Type(), op_copy->Type());
     ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
@@ -131,7 +131,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
 
   for (size_t i = 0; i < global_block->OpSize(); ++i) {
     auto op_origin = global_block->Op(i);
-    auto op_restored = global_block->Op(i);
+    auto op_restored = global_block_restored->Op(i);
 
     ASSERT_EQ(op_origin->Type(), op_restored->Type());
     ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs());
diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc
index 107c5bf8ecbc3b46dd5fae87c73d0be4f74d1587..57c1b822d8d4f095f33cba2bfd5210f7ee19dd9f 100644
--- a/paddle/fluid/framework/prune.cc
+++ b/paddle/fluid/framework/prune.cc
@@ -14,19 +14,19 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/prune.h"
 
+#include <glog/logging.h>
+
 #include <algorithm>
 #include <set>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include <glog/logging.h>
-
 namespace paddle {
 namespace framework {
 
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
+const char kFeedOpType[] = "feed";
+const char kFetchOpType[] = "fetch";
 
 bool HasDependentVar(const proto::OpDesc& op_desc,
                      const std::set<std::string>& dependent_vars) {
@@ -68,7 +68,7 @@ bool HasSubBlock(const proto::OpDesc& op_desc) {
 // the child block to help pruning
 void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
                 int block_id, int parent_block_id,
-                std::set<std::string>& dependent_vars) {
+                std::set<std::string>* dependent_vars) {
   auto& block = input.blocks(block_id);
   auto& ops = block.ops();
 
@@ -90,11 +90,11 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
   std::vector<bool> should_run;
   for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
     auto& op_desc = *op_iter;
-    if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) {
+    if (IsTarget(op_desc) || HasDependentVar(op_desc, *dependent_vars)) {
       // insert its input to the dependency graph
       for (auto& var : op_desc.inputs()) {
         for (auto& argu : var.arguments()) {
-          dependent_vars.insert(argu);
+          dependent_vars->insert(argu);
         }
       }
       should_run.push_back(true);
@@ -138,7 +138,7 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
         // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc
         // output_block_id is the idx of the current block in the output desc
         prune_impl(input, output, GetSubBlockIndex(*op), output_block_id,
-                   sub_block_dependent_vars);
+                   &sub_block_dependent_vars);
       }
     }
   }
@@ -181,7 +181,7 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
 void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
   std::set<std::string> dependent_vars;
   output->clear_blocks();
-  prune_impl(input, output, 0, -1, dependent_vars);
+  prune_impl(input, output, 0, -1, &dependent_vars);
 }
 
 void inference_optimize_impl(proto::ProgramDesc* input, int block_id) {
diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc
index 0e44b34383027ef58a033eb082f4bb2118b5d8a3..8af7d2d510d36e4c24ce3ae8dbc13c24ad5d4a0f 100644
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -14,18 +14,17 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/prune.h"
 
+#include <gtest/gtest.h>
+#include <string>
+
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/net_op.h"
 
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-#include <gtest/gtest.h>
-
 namespace f = paddle::framework;
-namespace ops = paddle::operators;
 
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
            const f::VariableNameMap &outputs, f::AttributeMap attrs,
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index 91879d6d45868bb37ca44baafb8b0e8677cd6d1a..0b36f1116d15004b355e854e101abb9ad3297836 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -16,14 +16,26 @@
 
 namespace paddle {
 namespace framework {
+ReaderBase::~ReaderBase() {}
 
-DDim ReaderBase::shape(size_t idx) const {
-  PADDLE_ENFORCE_LT(
-      idx, shapes_.size(),
-      "Cannot get the %d'th shape, 'shapes_' only has %d elements.", idx,
-      shapes_.size());
-  return shapes_[idx];
-}
+FileReader::FileReader(const std::vector<DDim> &dims) : dims_(dims) {}
+
+void FileReader::ReadNext(std::vector<LoDTensor> *out) {
+  ReadNextImpl(out);
+  if (out->empty()) {
+    return;
+  }
 
+  PADDLE_ENFORCE_EQ(out->size(), dims_.size());
+  for (size_t i = 0; i < dims_.size(); ++i) {
+    auto &actual = (*out)[i].dims();
+    auto &expect = dims_[i];
+
+    PADDLE_ENFORCE_EQ(actual.size(), expect.size());
+    for (int j = 0; j < actual.size(); ++j) {
+      //      PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1);
+    }
+  }
+}
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
index 18064ddc669aad7dda98d502119e56e7ddedcff3..64d4ceab624312ed366d7e835072899f1f033a88 100644
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -14,51 +14,49 @@
 
 #pragma once
 
+#include <memory>
+#include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace framework {
 
 class ReaderBase {
  public:
-  explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
-    PADDLE_ENFORCE(!shapes_.empty());
-  }
   virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
 
   virtual void ReInit() = 0;
 
-  DDim shape(size_t idx) const;
-  std::vector<DDim> shapes() const { return shapes_; }
-  void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
-
-  virtual bool HasNext() const = 0;
-
-  virtual ~ReaderBase() {}
-
- protected:
-  std::vector<DDim> shapes_;
-};
-
-class FileReader : public ReaderBase {
- public:
-  explicit FileReader(const std::vector<DDim>& shapes) : ReaderBase(shapes) {}
+  virtual ~ReaderBase();
 };
 
 class DecoratedReader : public ReaderBase {
  public:
-  explicit DecoratedReader(ReaderBase* reader)
-      : ReaderBase(reader->shapes()), reader_(reader) {
+  explicit DecoratedReader(const std::shared_ptr<ReaderBase>& reader)
+      : ReaderBase(), reader_(reader) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
   }
 
   void ReInit() override { reader_->ReInit(); }
 
-  bool HasNext() const override { return reader_->HasNext(); }
+ protected:
+  std::shared_ptr<ReaderBase> reader_;
+};
+
+class FileReader : public ReaderBase {
+ public:
+  explicit FileReader(const std::vector<DDim>& dims);
+
+  void ReadNext(std::vector<LoDTensor>* out) override;
 
  protected:
-  ReaderBase* reader_;
+  virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
+
+ private:
+  std::vector<DDim> dims_;
 };
 
 // The ReaderHolder is used as reader' unified wrapper,
@@ -67,7 +65,7 @@ class ReaderHolder {
  public:
   void Reset(ReaderBase* reader) { reader_.reset(reader); }
 
-  ReaderBase* Get() const { return reader_.get(); }
+  std::shared_ptr<ReaderBase> Get() const { return reader_; }
 
   void ReadNext(std::vector<LoDTensor>* out) {
     PADDLE_ENFORCE_NOT_NULL(reader_);
@@ -78,23 +76,8 @@ class ReaderHolder {
     reader_->ReInit();
   }
 
-  DDim shape(size_t idx) const {
-    PADDLE_ENFORCE_NOT_NULL(reader_);
-    return reader_->shape(idx);
-  }
-  std::vector<DDim> shapes() const {
-    PADDLE_ENFORCE_NOT_NULL(reader_);
-    return reader_->shapes();
-  }
-  void set_shapes(const std::vector<DDim>& shapes) {
-    PADDLE_ENFORCE_NOT_NULL(reader_);
-    reader_->set_shapes(shapes);
-  }
-
-  bool HasNext() const { return reader_->HasNext(); }
-
  private:
-  std::unique_ptr<ReaderBase> reader_;
+  std::shared_ptr<ReaderBase> reader_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 17e38b1cf042657834b4d0d1c12cbbb92f19fa45..50f374e3703a97f6c1fdb4b14fdeb0b603f9ac86 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 
 #include <memory>  // for unique_ptr
-#include <mutex>   // for call_once
 #include <set>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/threadpool.h"
@@ -27,62 +26,54 @@ DEFINE_bool(benchmark, false,
             "Default cuda is asynchronous device, set to True will"
             "force op run in synchronous mode.");
 
+DEFINE_bool(
+    eager_delete_scope, true,
+    "Delete local scope eagerly. It will reduce GPU memory usage but "
+    "slow down the destruction of variables.(around 1% performance harm)");
+
 namespace paddle {
 namespace framework {
 
-Scope::~Scope() {
-  DropKids();
-  for (auto& kv : vars_) {
-    VLOG(3) << "Destroy variable " << kv.first;
-    delete kv.second;
-  }
-}
+Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
+  std::unique_lock<std::mutex> lock(mutex_);
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }
 
 Variable* Scope::Var(const std::string& name) {
-  auto* v = FindVarLocally(name);
-  if (v != nullptr) return v;
-  v = new Variable();
-  vars_[name] = v;
-  VLOG(3) << "Create variable " << name;
-  v->name_ = &(vars_.find(name)->first);
-  return v;
+  std::unique_lock<std::mutex> lock(mutex_);
+  return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
-    *name = var_name;
+    *name = new_name;
   }
-  return Var(var_name);
+  return VarInternal(new_name);
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  auto var = FindVarLocally(name);
-  if (var != nullptr) {
-    return var;
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+  std::unique_lock<std::mutex> lock(mutex_);
+  return FindVarInternal(name);
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
-  for (auto& kv : vars_) {
-    if (kv.second == var) {
-      return this;
-    }
-  }
-  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+  std::unique_lock<std::mutex> lock(mutex_);
+  return FindScopeInternal(var);
 }
+
 void Scope::DropKids() {
+  std::unique_lock<std::mutex> lock(mutex_);
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
+  std::unique_lock<std::mutex> lock(mutex_);
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
@@ -91,23 +82,24 @@ std::vector<std::string> Scope::LocalVarNames() const {
   return known_vars;
 }
 
-void Scope::DeleteScope(Scope* scope) {
+void Scope::DeleteScope(Scope* scope) const {
+  std::unique_lock<std::mutex> lock(mutex_);
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
   this->kids_.erase(it);
   // When making memory benchmark on Fluid, we have to delete scope sync.
-  if (FLAGS_benchmark) {
+  if (FLAGS_benchmark || FLAGS_eager_delete_scope) {
     delete scope;
   } else {
     Async([scope] { delete scope; });
   }
 }
 
-void Scope::EraseVars(std::vector<std::string>& var_names) {
+void Scope::EraseVars(const std::vector<std::string>& var_names) {
+  std::unique_lock<std::mutex> lock(mutex_);
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
-      delete it->second;
       it = vars_.erase(it);
     } else {
       ++it;
@@ -117,25 +109,60 @@ void Scope::EraseVars(std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  RenameInternal(origin_name, new_name);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+  std::unique_lock<std::mutex> lock(mutex_);
+  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
+  RenameInternal(origin_name, new_name);
+  return new_name;
+}
+
+Variable* Scope::VarInternal(const std::string& name) {
+  auto* v = FindVarLocally(name);
+  if (v != nullptr) return v;
+
+  v = new Variable();
+  vars_[name].reset(v);
+  VLOG(3) << "Create variable " << name;
+  v->name_ = &(vars_.find(name)->first);
+  return v;
+}
+
+const Scope* Scope::FindScopeInternal(const Variable* var) const {
+  for (auto& kv : vars_) {
+    if (kv.second.get() == var) {
+      return this;
+    }
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+}
+
+void Scope::RenameInternal(const std::string& origin_name,
+                           const std::string& new_name) const {
   auto origin_it = vars_.find(origin_name);
   PADDLE_ENFORCE(origin_it != vars_.end(),
                  "Cannot find original variable with name %s", origin_name);
   auto new_it = vars_.find(new_name);
   PADDLE_ENFORCE(new_it == vars_.end(),
                  "The variable with name %s is already in the scope", new_name);
-  vars_[new_name] = origin_it->second;
+  vars_[new_name].reset(origin_it->second.release());
   vars_.erase(origin_it);
 }
 
-std::string Scope::Rename(const std::string& origin_name) const {
-  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
-  Rename(origin_name, var_name);
-  return var_name;
+Variable* Scope::FindVarInternal(const std::string& name) const {
+  auto var = FindVarLocally(name);
+  if (var != nullptr) {
+    return var;
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
 }
 
 Variable* Scope::FindVarLocally(const std::string& name) const {
   auto it = vars_.find(name);
-  if (it != vars_.end()) return it->second;
+  if (it != vars_.end()) return it->second.get();
   return nullptr;
 }
 
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index c1e1f49caaa5a60df0e97289aada465b45213971..e246241c0abfbc7bdcaf38d073cc58fc36a4f737 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <list>
+#include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -46,23 +47,26 @@ class Scope {
   Scope& NewScope() const;
 
   /// Create a variable with given name if it doesn't exist.
+  /// Caller doesn't own the returned Variable.
   Variable* Var(const std::string& name);
 
   /// Create a variable with a scope-unique name.
+  /// Caller doesn't own the returned Variable.
   Variable* Var(std::string* name = nullptr);
 
-  void EraseVars(std::vector<std::string>& var_names);
+  void EraseVars(const std::vector<std::string>& var_names);
 
   /// Find a variable in the scope or any of its ancestors.  Returns
   /// nullptr if cannot find.
+  /// Caller doesn't own the returned Variable.
   Variable* FindVar(const std::string& name) const;
 
-  const Scope& parent() const { return *parent_; }
+  const Scope* parent() const { return parent_; }
 
   /// Find the scope or an ancestor scope that contains the given variable.
   const Scope* FindScope(const Variable* var) const;
 
-  void DeleteScope(Scope* scope);
+  void DeleteScope(Scope* scope) const;
 
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
@@ -77,17 +81,37 @@ class Scope {
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;
 
-  Variable* FindVarLocally(const std::string& name) const;
+ protected:
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
 
  private:
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
 
-  mutable std::unordered_map<std::string, Variable*> vars_;
+  // Called by Var.
+  Variable* VarInternal(const std::string& name);
+
+  // Called by FindScope.
+  const Scope* FindScopeInternal(const Variable* var) const;
+
+  // Called by Rename.
+  void RenameInternal(const std::string& origin_name,
+                      const std::string& new_name) const;
+
+  // Called by FindVar recursively.
+  Variable* FindVarInternal(const std::string& name) const;
+
+  // Called by FindVarInternal and Var.
+  Variable* FindVarLocally(const std::string& name) const;
+
+  // Scope in `kids_` are owned by this class.
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
 
   DISABLE_COPY_AND_ASSIGN(Scope);
+
+ private:
+  mutable std::mutex mutex_;
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc
index 504344e937dfdc362cdc22298a5f963d87011e9d..06ed87e7e8a2d5324b48a466b05207042ec1b7fa 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -1,8 +1,11 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,6 +16,53 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+struct ReAllocateVisitor {
+  ReAllocateVisitor(const framework::DDim& dims, framework::Tensor* tensor)
+      : dims_(dims), tensor_(tensor) {}
+
+  template <typename T>
+  void operator()() const {
+    framework::Tensor cpu_tensor;
+    platform::CPUPlace cpu;
+    T* ptr = cpu_tensor.mutable_data<T>(dims_, cpu);
+    const T* old_ptr =
+        tensor_->memory_size() == 0 ? nullptr : tensor_->data<T>();
+    if (old_ptr != nullptr) {
+      std::copy(old_ptr, old_ptr + tensor_->numel(), ptr);
+    }
+    tensor_->ShareDataWith(cpu_tensor);
+  }
+
+  framework::DDim dims_;
+  framework::Tensor* tensor_;
+};
+
+struct TensorCopyVisitor {
+  TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset,
+                    const framework::Tensor src, int64_t src_offset,
+                    int64_t size)
+      : dst_(dst),
+        dst_offset_(dst_offset),
+        src_(src),
+        src_offset_(src_offset),
+        size_(size) {}
+
+  template <typename T>
+  void operator()() const {
+    // TODO(Yancey1989): support other place
+    platform::CPUPlace cpu;
+    memory::Copy(cpu, dst_->mutable_data<T>(cpu) + dst_offset_, cpu,
+                 src_.data<T>() + src_offset_, size_ * sizeof(T));
+  }
+
+  framework::Tensor* dst_;
+  int64_t dst_offset_;
+  framework::Tensor src_;
+  int64_t src_offset_;
+  int64_t size_;
+};
+
 void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
                        const platform::DeviceContext& dev_ctx) {
   {  // the 1st field, uint32_t version
@@ -65,5 +115,72 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
   TensorFromStream(is, selected_rows->mutable_value(), dev_ctx);
 }
 
+bool SelectedRows::HasKey(int64_t key) const {
+  return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false
+                                                                   : true;
+}
+
+std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
+    const std::vector<int64_t>& keys, framework::Tensor* value) const {
+  PADDLE_ENFORCE(value->IsInitialized(),
+                 "The value tensor should be initialized.");
+  std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
+  if (keys.empty()) {
+    VLOG(3) << "keys is empty, please check data!";
+  } else {
+    int64_t value_width = value_->numel() / value_->dims()[0];
+    PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
+                      "output tensor should have the same shape with table "
+                      "except the dims[0].");
+
+    for (size_t i = 0; i < keys.size(); ++i) {
+      int64_t index = Index(keys[i]);
+      if (index == -1) {
+        non_keys_pair.push_back(
+            std::make_pair(keys[i], static_cast<int64_t>(i)));
+      } else {
+        framework::VisitDataType(
+            framework::ToDataType(value_->type()),
+            TensorCopyVisitor(value, i * value_width, *value_.get(),
+                              index * value_width, value_width));
+      }
+    }
+  }
+  return non_keys_pair;
+}
+
+bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
+  PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized.");
+  if (value_->IsInitialized()) {
+    PADDLE_ENFORCE_EQ(
+        value.type(), value_->type(),
+        "The type of the value should be same with the original value");
+  }
+  PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
+                    "The first dim of value should be 1.");
+  std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
+  auto index = Index(key);
+  bool is_new_key = false;
+  if (index == -1) {
+    rows_.push_back(key);
+    index = rows_.size() - 1;
+    is_new_key = true;
+    // whether need to resize the table
+    if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
+      auto dims = value_->dims();
+      dims[0] = (dims[0] + 1) << 1;
+      framework::VisitDataType(framework::ToDataType(value.type()),
+                               ReAllocateVisitor(dims, value_.get()));
+    }
+  }
+
+  framework::VisitDataType(
+      framework::ToDataType(value.type()),
+      TensorCopyVisitor(value_.get(),
+                        index * value_->numel() / value_->dims()[0], value,
+                        static_cast<int64_t>(0), value.numel()));
+  return is_new_key;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h
index c9c2c1bb721f2c527fa52f45cc54883f639f4ef8..7160670ddd204c20021ea87cdd67ee4721d03451 100644
--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -1,8 +1,11 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -10,22 +13,48 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#include <algorithm>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/memory/memcpy.h"
 
 namespace paddle {
 namespace framework {
 
 class SelectedRows {
+  /*
+   * @brief We can use the SelectedRows structure to reproduce a sparse table.
+   *  A sparse table is a key-value structure that the key is an `int64_t`
+   * number,
+   *  and the value is a Tensor which the first dimension is 0.
+   *  You can use the following interface to operate the sparse table, and you
+   * can find
+   *  some detail information from the comments of each interface:
+   *
+   *  HasKey(key), whether the sparse table has the specified key.
+   *  Set(key, value), set a key-value pair into the sparse table.
+   *  Get(keys, value*), get value by given key list and apply it to the given
+   * value pointer
+   *    with the specified offset.
+   *
+   */
  public:
   SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
       : rows_(rows), height_(height) {
     value_.reset(new Tensor());
+    auto_grown_mutex_.reset(new std::mutex);
   }
 
   SelectedRows() {
     height_ = 0;
     value_.reset(new Tensor());
+    auto_grown_mutex_.reset(new std::mutex);
   }
 
   platform::Place place() const { return value_->place(); }
@@ -44,6 +73,49 @@ class SelectedRows {
 
   void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
 
+  /*
+   * @brief wheter has the specified key in the table.
+   *
+   * @return true if the key is exists.
+   */
+  bool HasKey(int64_t key) const;
+
+  /*
+   * @brief Get value by the key list, if the
+   *
+   * @return a list of pair which contains the non-exists key and the index in
+   * the value
+   */
+  std::vector<std::pair<int64_t, int64_t>> Get(const std::vector<int64_t>& keys,
+                                               framework::Tensor* value) const;
+
+  /*
+   * @brief Set a key-value pair into the table.
+   *  This function will double the value memory if it's not engouth.
+   *
+   * @note:
+   *    1. The first dim of the value should be 1
+   *    2. The value should be initialized and the data type
+   *       should be the same with the table.
+   *
+   * @return true if the key is a new one, otherwise false
+   *
+   */
+  bool Set(int64_t key, const Tensor& value);
+
+  /*
+   * @brief Get the index of key in rows
+   *
+   * @return -1 if the key does not exists.
+   */
+  int64_t Index(int64_t key) const {
+    auto it = std::find(rows_.begin(), rows_.end(), key);
+    if (it == rows_.end()) {
+      return static_cast<int64_t>(-1);
+    }
+    return static_cast<int64_t>(std::distance(rows_.begin(), it));
+  }
+
   DDim GetCompleteDims() const {
     std::vector<int64_t> dims = vectorize(value_->dims());
     dims[0] = height_;
@@ -52,11 +124,12 @@ class SelectedRows {
 
  private:
   // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
-  // SelectedRows are simplely concated when adding together. Until a
+  // SelectedRows are simply concated when adding together. Until a
   // SelectedRows add a Tensor, will the duplicate rows be handled.
   Vector<int64_t> rows_;
   std::unique_ptr<Tensor> value_{nullptr};
   int64_t height_;
+  std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr};
 };
 
 /*
diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc
index 960d8d64f04a819217413ff881977ce5fb5a30f2..eefcaa5672c5a3debf162f5c8eda653408dcf221 100644
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -17,7 +17,7 @@ namespace framework {
 
 class SelectedRowsTester : public ::testing::Test {
  public:
-  virtual void SetUp() override {
+  void SetUp() override {
     std::vector<int64_t> rows{0, 4, 7};
     int64_t height = 10;
     int64_t row_numel = 100;
@@ -59,5 +59,40 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
   ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
 }
 
+TEST_F(SelectedRowsTester, SparseTable) {
+  platform::CPUPlace cpu;
+  SelectedRows table;
+  // initialize a sparse table
+  table.mutable_value()->Resize(framework::make_ddim({1, 100}));
+  table.mutable_value()->mutable_data<float>(cpu);
+  table.mutable_rows()->push_back(1);
+
+  int64_t key = 10000;
+  int64_t non_key = 999;
+  framework::Tensor value;
+  value.Resize(framework::make_ddim({1, 100}));
+  auto ptr = value.mutable_data<float>(cpu);
+  ptr[0] = static_cast<float>(10);
+
+  ASSERT_EQ(table.rows().size(), static_cast<size_t>(1));
+  ASSERT_EQ(table.HasKey(key), false);
+
+  table.Set(key, value);
+
+  ASSERT_EQ(table.rows().size(), static_cast<size_t>(2));
+  ASSERT_EQ(table.HasKey(key), true);
+  // check re-allocate
+  ASSERT_EQ(table.value().dims()[0], static_cast<int64_t>(4));
+
+  framework::Tensor get_value;
+  get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
+  std::vector<int64_t> keys({non_key, key});
+  auto non_key_pairs = table.Get(keys, &get_value);
+
+  ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
+  ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1));
+  ASSERT_EQ(non_key_pairs[0].first, non_key);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc
index dc9a79020f103dadfd9837cffb18ad5946f95f31..ddff2c7c261746ac9986e79cff3da7e0a9654adc 100644
--- a/paddle/fluid/framework/shape_inference.cc
+++ b/paddle/fluid/framework/shape_inference.cc
@@ -11,8 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+
 #include "paddle/fluid/framework/shape_inference.h"
-#include "grad_op_desc_maker.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index bc02d700da5186cea5f370b9676e408f62a66a68..5f497cafa0f75f7c23d550ef767d55274de7c900 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -61,6 +63,7 @@ class InferShapeContext {
 
   std::vector<InferShapeVarPtr> GetInputVarPtrs(const std::string &name);
   std::vector<InferShapeVarPtr> GetOutputVarPtrs(const std::string &name);
+  virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
 
   // Note: In while op, we need this to be public
   void SetDims(const std::vector<std::string> &names,
@@ -79,8 +82,6 @@ class InferShapeContext {
       const std::vector<std::string> &names) const;
 
   virtual proto::VarType::Type GetVarType(const std::string &name) const = 0;
-
-  virtual InferShapeVarPtr GetVarPtr(const std::string &name) = 0;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index e97ada06f06d0538f17160220e3aa3f4ffc55520..c7286dacf01659f3af0927a71856e5a6496cb877 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -15,5 +15,102 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
-namespace framework {}
+namespace framework {
+extern size_t SizeOfType(std::type_index type);
+void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_LE(
+      numel() * SizeOfType(type()), memory_size(),
+      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+      "first to re-allocate memory.\n"
+      "or maybe the required data-type mismatches the data already stored.");
+}
+
+size_t Tensor::memory_size() const {
+  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
+}
+
+void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
+  PADDLE_ENFORCE_GE(numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
+                    "equal or larger than zero. "
+                    "Please check Tensor::Resize has been called first.");
+  int64_t size = numel() * SizeOfType(type);
+  /* some versions of boost::variant don't have operator!= */
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size, type));
+    } else if (platform::is_gpu_place(place) ||
+               platform::is_cuda_pinned_place(place)) {
+#ifndef PADDLE_WITH_CUDA
+      PADDLE_THROW(
+          "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
+    }
+#else
+      if (platform::is_gpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+            boost::get<platform::CUDAPlace>(place), size, type));
+      } else if (platform::is_cuda_pinned_place(place)) {
+        holder_.reset(new PlaceholderImpl<platform::CUDAPinnedPlace>(
+            boost::get<platform::CUDAPinnedPlace>(place), size, type));
+      }
+    }
+#endif
+    offset_ = 0;
+  }
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+
+void* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable data if current hold nothing.");
+  return mutable_data(place, holder_->type());
+}
+
+Tensor& Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size();
+  *this = src;
+  return *this;
+}
+
+Tensor Tensor::Slice(int begin_idx, int end_idx) const {
+  check_memory_size();
+  PADDLE_ENFORCE_GE(begin_idx, 0,
+                    "The start row index must be greater than 0.");
+  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
+  PADDLE_ENFORCE_LT(
+      begin_idx, end_idx,
+      "The start row index must be lesser than the end row index.");
+
+  if (dims_[0] == 1) {
+    return *this;
+  } else {
+    size_t base = numel() / dims_[0];
+    Tensor dst;
+    dst.holder_ = holder_;
+    dst.set_layout(layout_);
+    DDim dst_dims = dims_;
+    dst_dims[0] = end_idx - begin_idx;
+    dst.Resize(dst_dims);
+    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
+    return dst;
+  }
+}
+
+Tensor& Tensor::Resize(const DDim& dims) {
+  dims_ = dims;
+  return *this;
+}
+
+const DDim& Tensor::dims() const { return dims_; }
+
+int64_t Tensor::numel() const { return product(dims_); }
+}  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 6f878541e6de1deec1829145b1b325ecd176a034..ef224d68f1fc561f45e9d7a81425e62655457648 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -34,6 +34,28 @@ namespace framework {
 class LoDTensor;
 
 class Tensor {
+#ifdef PADDLE_WITH_MKLDNN
+
+ public:
+  inline mkldnn::memory::format format() const { return format_; }
+
+  inline void set_format(const mkldnn::memory::format format) {
+    format_ = format;
+  }
+
+ protected:
+  /**
+   * @brief the detail format of memory block which have layout as kMKLDNN
+   *
+   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
+   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+   *       this field.
+   */
+
+  mkldnn::memory::format format_ = mkldnn::memory::format::format_undef;
+#endif
+
  public:
   template <typename T, size_t D, int MajorType, typename IndexType>
   friend struct EigenTensor;
@@ -54,26 +76,24 @@ class Tensor {
 
   /*! Return a pointer to mutable memory block. */
   template <typename T>
-  inline T* data();
+  T* data();
 
   /*! Return a pointer to constant memory block. */
   template <typename T>
-  inline const T* data() const;
+  const T* data() const;
 
-  inline bool IsInitialized() const;
-
-  inline void switch_place(platform::Place new_place);
+  bool IsInitialized() const;
 
   /**
    * @brief   Return a pointer to mutable memory block.
    * @note    If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(platform::Place place);
+  T* mutable_data(platform::Place place);
 
-  inline void* mutable_data(platform::Place place, std::type_index type);
+  void* mutable_data(platform::Place place, std::type_index type);
 
-  inline void* mutable_data(platform::Place place);
+  void* mutable_data(platform::Place place);
 
   /**
    * @brief     Return a pointer to mutable memory block.
@@ -84,19 +104,19 @@ class Tensor {
    * @note      If not exist, then allocation.
    */
   template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
+  T* mutable_data(DDim dims, platform::Place place);
 
   /*! Return the dimensions of the memory block. */
-  inline const DDim& dims() const;
+  const DDim& dims() const;
 
   /*! Return the numel of the memory block. */
-  inline int64_t numel() const;
+  int64_t numel() const;
 
   /*! Resize the dimensions of the memory block. */
-  inline Tensor& Resize(const DDim& dims);
+  Tensor& Resize(const DDim& dims);
 
   /*! The internal of two tensors share the same memory block. */
-  inline Tensor& ShareDataWith(const Tensor& src);
+  Tensor& ShareDataWith(const Tensor& src);
 
   /**
    * @brief  Return a sub-tensor of the given tensor.
@@ -106,7 +126,7 @@ class Tensor {
    * @param[in] end_idx     The index of the end row(exclusive) to slice.
    *                        The index number begins from 0.
    */
-  inline Tensor Slice(int begin_idx, int end_idx) const;
+  Tensor Slice(int begin_idx, int end_idx) const;
 
   platform::Place place() const {
     PADDLE_ENFORCE_NOT_NULL(
@@ -123,11 +143,11 @@ class Tensor {
   // memory size returns the holding memory size in byte.
   size_t memory_size() const;
 
-  inline void check_memory_size() const;
+  void check_memory_size() const;
 
-  inline DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return layout_; }
 
-  inline void set_layout(const DataLayout layout) { layout_ = layout; }
+  void set_layout(const DataLayout layout) { layout_ = layout; }
 
  private:
   /**
@@ -197,8 +217,10 @@ class Tensor {
    *       N,C,H,W for respectively the batch size, the number of
    *       feature maps, the height.
    */
-
-  DataLayout layout_ = DataLayout::kNHWC;
+  // Fix me: here just change the default layout to kNCHW
+  // it doesn't fix the real issue, i.e. feeder should set up tensor layout
+  // according to actual input data
+  DataLayout layout_ = DataLayout::kNCHW;
 
   /**
    * @brief   A PlaceHolder may be shared by more than one tensor.
@@ -210,15 +232,6 @@ class Tensor {
   size_t offset_;
 };
 
-inline void Tensor::switch_place(platform::Place new_place) {
-  if (holder_->place() == new_place) {
-    return;
-  }
-
-  // TODO(tonyyang-svail): do memcpy here.
-  PADDLE_THROW("Not Implemented");
-}
-
 }  // namespace framework
 }  // namespace paddle
 
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 638bd0db9d7025199c31a9327b96062512aa5adb..7f678f869aac4616c8bca440d0431f765da41dd6 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -13,74 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace framework {
-
-template <typename... T>
-struct SizeOfTypeFunctor;
-
-template <typename T>
-struct SizeOfTypeFunctor<T> {
-  size_t operator()(std::type_index type) const {
-    if (typeid(T).hash_code() == type.hash_code()) {
-      return sizeof(T);
-    } else {
-      return 0UL;
-    }
-  }
-};
-
-template <>
-struct SizeOfTypeFunctor<> {
-  size_t operator()(std::type_index type) const { return 0UL; }
-};
-
-template <typename HEAD, typename... TAIL>
-struct SizeOfTypeFunctor<HEAD, TAIL...> {
-  size_t operator()(std::type_index type) const {
-    SizeOfTypeFunctor<HEAD> head;
-    size_t head_size = head(type);
-    if (head_size != 0) {
-      return head_size;
-    }
-    SizeOfTypeFunctor<TAIL...> tail;
-    return tail(type);
-  }
-};
-
-static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t,
-                    platform::float16>
-      functor;
-  size_t size = functor(type);
-  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
-  return size;
-}
-
-inline void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_LE(
-      numel() * SizeOfType(type()), memory_size(),
-      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-      "first to re-allocate memory.\n"
-      "or maybe the required data-type mismatches the data already stored.");
-}
-
-inline size_t Tensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
-}
-
 template <typename T>
 inline const T* Tensor::data() const {
   check_memory_size();
-  PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                     holder_->type().hash_code() == typeid(T).hash_code(),
-                 "Tensor holds the wrong type, it holds %s",
+  bool valid = std::is_same<T, void>::value ||
+               holder_->type() == std::type_index(typeid(T));
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
                  this->holder_->type().name());
 
   return reinterpret_cast<const T*>(
@@ -92,9 +37,9 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
 template <typename T>
 inline T* Tensor::data() {
   check_memory_size();
-  PADDLE_ENFORCE(std::is_same<T, void>::value ||
-                     holder_->type().hash_code() == typeid(T).hash_code(),
-                 "Tensor holds the wrong type, it holds %s",
+  bool valid = std::is_same<T, void>::value ||
+               holder_->type() == std::type_index(typeid(T));
+  PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s",
                  this->holder_->type().name());
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
@@ -113,81 +58,6 @@ inline T* Tensor::mutable_data(platform::Place place) {
   return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
 }
 
-inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
-  if (holder_ != nullptr) {
-    holder_->set_type(type);
-  }
-  PADDLE_ENFORCE_GT(
-      numel(), 0,
-      "When calling this method, the Tensor's numel must be larger than zero. "
-      "Please check Tensor::Resize has been called first.");
-  int64_t size = numel() * SizeOfType(type);
-  /* some versions of boost::variant don't have operator!= */
-  if (holder_ == nullptr || !(holder_->place() == place) ||
-      holder_->size() < size + offset_) {
-    if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
-    } else if (platform::is_gpu_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
-    }
-#else
-      holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type));
-    }
-#endif
-    offset_ = 0;
-  }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                 offset_);
-}
-
-inline void* Tensor::mutable_data(platform::Place place) {
-  PADDLE_ENFORCE(this->holder_ != nullptr,
-                 "Cannot invoke mutable data if current hold nothing");
-  return mutable_data(place, holder_->type());
-}
-
-inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size();
-  *this = src;
-  return *this;
-}
-
-inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
-  check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx, 0,
-                    "The start row index must be greater than 0.");
-  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
-  PADDLE_ENFORCE_LT(
-      begin_idx, end_idx,
-      "The start row index must be lesser than the end row index.");
-
-  if (dims_[0] == 1) {
-    return *this;
-  } else {
-    size_t base = numel() / dims_[0];
-    Tensor dst;
-    dst.holder_ = holder_;
-    dst.set_layout(layout_);
-    DDim dst_dims = dims_;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
-    return dst;
-  }
-}
-
-inline Tensor& Tensor::Resize(const DDim& dims) {
-  dims_ = dims;
-  return *this;
-}
-
-inline const DDim& Tensor::dims() const { return dims_; }
-
-inline int64_t Tensor::numel() const { return product(dims_); }
-
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
   res.ShareDataWith(src);
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index e1012de2ec36eb4a858202d56a678b6a204c2f0a..0a1cb6d5703dace5e6be73285655ecd9d2ad89fb 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -209,7 +209,7 @@ TEST(Tensor, ReshapeToMatrix) {
 
 TEST(Tensor, Layout) {
   framework::Tensor src;
-  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW);
   src.set_layout(framework::DataLayout::kAnyLayout);
   ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
 }
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 8b7533ce712b0a01060842b6f71449ed6bd23e2c..f98011e896f4033ef210e0eb69f93ce7800a3cd6 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -11,8 +11,10 @@
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License. */
-
 #include "paddle/fluid/framework/tensor_util.h"
+#include <algorithm>
+#include <limits>
+#include <vector>
 
 namespace paddle {
 namespace framework {
@@ -45,9 +47,9 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
     auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
     PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
   } else if (platform::is_cpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
     auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
@@ -56,20 +58,33 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
     auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
     PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
-    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
-    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
-    memory::Copy(
-        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
+    if (platform::is_same_place(src_place, dst_place)) {
+      memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+                   stream);
+    } else {
+      if (platform::is_same_place(ctx_place, src_place)) {
+        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+                     stream);
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
+      } else if (platform::is_same_place(ctx_place, dst_place)) {
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
+        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+                     stream);
+      } else {
+        PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
+      }
+    }
   }
 #endif
 }
@@ -78,14 +93,49 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext* dev_ctx;
-  if (platform::is_gpu_place(src.place())) {
-    dev_ctx = pool.Get(src.place());
-  } else {
+  if (platform::is_gpu_place(dst_place)) {
     dev_ctx = pool.Get(dst_place);
+  } else {
+    dev_ctx = pool.Get(src.place());
   }
   TensorCopy(src, dst_place, *dev_ctx, dst);
 }
 
+void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
+                    Tensor* dst) {
+  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
+          << " to " << dst_place;
+  src.check_memory_size();
+  dst->Resize(src.dims());
+  dst->set_layout(src.layout());
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+  auto size = src.numel() * SizeOfType(src.type());
+  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr);
+  } else if (platform::is_gpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
+  }
+#endif
+}
+
 template <typename Predicate, typename DevCtx>
 struct AnyDTypeVisitor {
   Predicate predicate_;
@@ -148,6 +198,11 @@ struct AnyVisitor : public boost::static_visitor<bool> {
                  const platform::CPUPlace& cpu) const {
     return *out.data<bool>();
   }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CUDAPinnedPlace& cpu) const {
+    return *out.data<bool>();
+  }
 };
 
 template <typename Predicate>
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 38b6d1c5c46dcce718f91d574ceea5de2099b787..4457382ade37a12f5f3613fc4113fbf1f6f91124 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -22,11 +23,28 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+// NOTE(zcd): Because TensorCopy is an async operation, when the src_place
+// and dst_place are two different GPU, to ensure that the operation can
+// be carried out correctly, there is a src_ctx wait operation in TensorCopy.
+// If ctx_place and src_place are the same, src_ctx.Wait() is added
+// after memory::Copy; if ctx_place and dst_place are the same,
+// src_ctx.Wait() is added before memory::Copy.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst);
+
+// NOTE(zcd): If the src.place() and dst_place are two different GPU,
+// the copy operation is carried out on the dst_place's stream. This is
+// very important, because TensorCopy is an async operator, and in most
+// case, once this copy operator returns, dst is to be used in dst_place's
+// stream, if this copy operation is carried out on the src_place's stream,
+// when dst is used in dst_place's stream the copy operation may be
+// not completed.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst);
 
+void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
+                    Tensor* dst);
+
 template <typename T>
 void TensorFromVector(const std::vector<T>& src,
                       const platform::DeviceContext& ctx, Tensor* dst);
diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
index 9687a86ca25be7886e67028a38e54b3065c8e4b5..6e10885890cd2d4a0d77834944b37e291197b637 100644
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -105,16 +105,14 @@ TEST(TensorCopy, Tensor) {
 }
 
 TEST(TensorFromVector, Tensor) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
+    paddle::framework::Tensor cpu_tensor;
 
     // Copy to CPU Tensor
-    cpu_tensor.Resize(make_ddim({3, 3}));
+    cpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
-    TensorFromVector<int>(src_vec, &cpu_tensor);
+    paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor);
 
     // Compare Tensors
     const int* cpu_ptr = cpu_tensor.data<int>();
@@ -125,8 +123,8 @@ TEST(TensorFromVector, Tensor) {
     }
 
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    TensorFromVector<int>(src_vec, &cpu_tensor);
+    cpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));
+    paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor);
     cpu_ptr = cpu_tensor.data<int>();
     src_ptr = src_vec.data();
     ASSERT_NE(src_ptr, cpu_ptr);
@@ -140,23 +138,23 @@ TEST(TensorFromVector, Tensor) {
 #ifdef PADDLE_WITH_CUDA
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor cpu_tensor;
-    Tensor gpu_tensor;
-    Tensor dst_tensor;
+    paddle::framework::Tensor cpu_tensor;
+    paddle::framework::Tensor gpu_tensor;
+    paddle::framework::Tensor dst_tensor;
 
     // Copy to CPU Tensor
     cpu_tensor.Resize(make_ddim({3, 3}));
     auto cpu_place = new paddle::platform::CPUPlace();
-    CPUDeviceContext cpu_ctx(*cpu_place);
-    TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
+    paddle::framework::TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
 
     // Copy to GPUTensor
-    gpu_tensor.Resize(make_ddim({3, 3}));
+    gpu_tensor.Resize(paddle::framework::make_ddim({3, 3}));
     auto gpu_place = new paddle::platform::CUDAPlace();
-    CUDADeviceContext gpu_ctx(*gpu_place);
-    TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place);
+    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
     // Copy from GPU to CPU tensor for comparison
-    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -172,11 +170,11 @@ TEST(TensorFromVector, Tensor) {
 
     src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
 
-    cpu_tensor.Resize(make_ddim({2, 2}));
-    TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
-    gpu_tensor.Resize(make_ddim({2, 2}));
-    TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
-    TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+    cpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));
+    paddle::framework::TensorFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    gpu_tensor.Resize(paddle::framework::make_ddim({2, 2}));
+    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
 
     // Sync before Compare Tensors
     gpu_ctx.Wait();
@@ -197,18 +195,16 @@ TEST(TensorFromVector, Tensor) {
 }
 
 TEST(TensorToVector, Tensor) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src;
-    int* src_ptr = src.mutable_data<int>({3, 3}, CPUPlace());
+    paddle::framework::Tensor src;
+    int* src_ptr = src.mutable_data<int>({3, 3}, paddle::platform::CPUPlace());
     for (int i = 0; i < 3 * 3; ++i) {
       src_ptr[i] = i;
     }
 
-    CPUPlace place;
+    paddle::platform::CPUPlace place;
     std::vector<int> dst;
-    TensorToVector<int>(src, &dst);
+    paddle::framework::TensorToVector<int>(src, &dst);
 
     for (int i = 0; i < 3 * 3; ++i) {
       EXPECT_EQ(src_ptr[i], dst[i]);
@@ -217,13 +213,13 @@ TEST(TensorToVector, Tensor) {
 #ifdef PADDLE_WITH_CUDA
   {
     std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-    Tensor gpu_tensor;
-    CUDAPlace place;
-    CUDADeviceContext gpu_ctx(place);
-    TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    paddle::framework::Tensor gpu_tensor;
+    paddle::platform::CUDAPlace place;
+    paddle::platform::CUDADeviceContext gpu_ctx(place);
+    paddle::framework::TensorFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
 
     std::vector<int> dst;
-    TensorToVector<int>(gpu_tensor, gpu_ctx, &dst);
+    paddle::framework::TensorToVector<int>(gpu_tensor, gpu_ctx, &dst);
 
     for (int i = 0; i < 3 * 3; ++i) {
       EXPECT_EQ(src_vec[i], dst[i]);
@@ -233,54 +229,54 @@ TEST(TensorToVector, Tensor) {
 }
 
 TEST(TensorContainsNAN, CPU) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src;
-    float* buf = src.mutable_data<float>({3}, CPUPlace());
+    paddle::framework::Tensor src;
+    float* buf = src.mutable_data<float>({3}, paddle::platform::CPUPlace());
     buf[0] = 0.0;
     buf[1] = NAN;
     buf[2] = 0.0;
-    ASSERT_TRUE(TensorContainsNAN(src));
+    ASSERT_TRUE(paddle::framework::TensorContainsNAN(src));
     buf[1] = 0.0;
-    ASSERT_FALSE(TensorContainsNAN(src));
+    ASSERT_FALSE(paddle::framework::TensorContainsNAN(src));
   }
 
   {
-    Tensor src;
-    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
+    paddle::framework::Tensor src;
+    paddle::platform::float16* buf =
+        src.mutable_data<paddle::platform::float16>(
+            {3}, paddle::platform::CPUPlace());
     buf[0] = 0.0;
     buf[1].x = 0x7fff;
     buf[2] = 0.0;
-    ASSERT_TRUE(TensorContainsNAN(src));
+    ASSERT_TRUE(paddle::framework::TensorContainsNAN(src));
     buf[1] = 0.0;
-    ASSERT_FALSE(TensorContainsNAN(src));
+    ASSERT_FALSE(paddle::framework::TensorContainsNAN(src));
   }
 }
 
 TEST(TensorContainsInf, CPU) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
   {
-    Tensor src;
-    double* buf = src.mutable_data<double>({3}, CPUPlace());
+    paddle::framework::Tensor src;
+    double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
     buf[0] = 1.0;
     buf[1] = INFINITY;
     buf[2] = 0.0;
-    ASSERT_TRUE(TensorContainsInf(src));
+    ASSERT_TRUE(paddle::framework::TensorContainsInf(src));
     buf[1] = 1.0;
-    ASSERT_FALSE(TensorContainsInf(src));
+    ASSERT_FALSE(paddle::framework::TensorContainsInf(src));
   }
 
   {
-    Tensor src;
-    float16* buf = src.mutable_data<float16>({3}, CPUPlace());
+    paddle::framework::Tensor src;
+    paddle::platform::float16* buf =
+        src.mutable_data<paddle::platform::float16>(
+            {3}, paddle::platform::CPUPlace());
     buf[0] = 1.0;
     buf[1].x = 0x7c00;
     buf[2] = 0.0;
-    ASSERT_TRUE(TensorContainsInf(src));
+    ASSERT_TRUE(paddle::framework::TensorContainsInf(src));
     buf[1] = 1.0;
-    ASSERT_FALSE(TensorContainsInf(src));
+    ASSERT_FALSE(paddle::framework::TensorContainsInf(src));
   }
 }
 
diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu
index 4766ec28aa3cff6be3259f258f1c9543ae471f5d..b4cff1e6c2293fa44f0fd0bb398a538c08dd4fb1 100644
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
@@ -45,9 +45,8 @@ static __global__ void FillInf(platform::float16* buf) {
 }
 
 TEST(TensorContainsNAN, GPU) {
-  using namespace paddle::platform;
-  CUDAPlace gpu(0);
-  auto& pool = DeviceContextPool::Instance();
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
   auto* cuda_ctx = pool.GetByPlace(gpu);
   {
     Tensor tensor;
@@ -58,7 +57,8 @@ TEST(TensorContainsNAN, GPU) {
   }
   {
     Tensor tensor;
-    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    paddle::platform::float16* buf =
+        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
@@ -66,9 +66,8 @@ TEST(TensorContainsNAN, GPU) {
 }
 
 TEST(TensorContainsInf, GPU) {
-  using namespace paddle::platform;
-  CUDAPlace gpu(0);
-  auto& pool = DeviceContextPool::Instance();
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
   auto* cuda_ctx = pool.GetByPlace(gpu);
   {
     Tensor tensor;
@@ -79,7 +78,8 @@ TEST(TensorContainsInf, GPU) {
   }
   {
     Tensor tensor;
-    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    paddle::platform::float16* buf =
+        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc
index 9854d618d2b29ed123833f55198179638c95d6db..f26f212d4d5793b88fd1e6d782cdf983bf341879 100644
--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -14,8 +14,12 @@
 
 #include "paddle/fluid/framework/threadpool.h"
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
 
+DEFINE_int32(io_threadpool_size, 100,
+             "number of threads used for doing IO, default 100");
+
 namespace paddle {
 namespace framework {
 
@@ -91,5 +95,20 @@ void ThreadPool::TaskLoop() {
   }
 }
 
+std::unique_ptr<ThreadPool> ThreadPoolIO::io_threadpool_(nullptr);
+std::once_flag ThreadPoolIO::io_init_flag_;
+
+ThreadPool* ThreadPoolIO::GetInstanceIO() {
+  std::call_once(io_init_flag_, &ThreadPoolIO::InitIO);
+  return io_threadpool_.get();
+}
+
+void ThreadPoolIO::InitIO() {
+  if (io_threadpool_.get() == nullptr) {
+    // TODO(typhoonzero1986): make this configurable
+    io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size));
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h
index df51fb24a588c84788d7d0b671f932ff4c40f9c2..94111ee335b1a5df327b3e46d62069b4735c54f6 100644
--- a/paddle/fluid/framework/threadpool.h
+++ b/paddle/fluid/framework/threadpool.h
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <functional>
-#include <future>
-#include <mutex>
+#include <future>  // NOLINT
+#include <mutex>   // NOLINT
 #include <queue>
-#include <thread>
+#include <thread>  // NOLINT
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -28,10 +28,28 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+struct ExceptionHandler {
+  mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
+  explicit ExceptionHandler(
+      std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
+      : future_(std::move(f)) {}
+  void operator()() const {
+    auto ex = this->future_.get();
+    if (ex != nullptr) {
+      LOG(FATAL) << "The exception is thrown inside the thread pool. You "
+                    "should use RunAndGetException to handle the exception.\n"
+                    "The default exception handler is LOG(FATAL)."
+                 << ex->what();
+    }
+  }
+};
+
 // ThreadPool maintains a queue of tasks, and runs them using a fixed
 // number of threads.
 class ThreadPool {
  public:
+  explicit ThreadPool(int num_threads);
+
   using Task = std::packaged_task<std::unique_ptr<platform::EnforceNotMet>()>;
 
   // Returns the singleton of ThreadPool.
@@ -85,26 +103,8 @@ class ThreadPool {
   void Wait();
 
  private:
-  struct ExceptionHandler {
-    mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
-    explicit ExceptionHandler(
-        std::future<std::unique_ptr<platform::EnforceNotMet>>&& f)
-        : future_(std::move(f)) {}
-    void operator()() const {
-      auto ex = this->future_.get();
-      if (ex != nullptr) {
-        LOG(FATAL) << "The exception is thrown inside the thread pool. You "
-                      "should use RunAndGetException to handle the exception.\n"
-                      "The default exception handler is LOG(FATAL)."
-                   << ex->what();
-      }
-    }
-  };
-
   DISABLE_COPY_AND_ASSIGN(ThreadPool);
 
-  explicit ThreadPool(int num_threads);
-
   // If the task queue is empty and avaialbe is equal to the number of
   // threads, means that all tasks are completed.  Note: this function
   // is not thread-safe.  Returns true if all tasks are completed.
@@ -135,6 +135,17 @@ class ThreadPool {
   std::condition_variable completed_;
 };
 
+class ThreadPoolIO : ThreadPool {
+ public:
+  static ThreadPool* GetInstanceIO();
+  static void InitIO();
+
+ private:
+  // NOTE: threadpool in base will be inhereted here.
+  static std::unique_ptr<ThreadPool> io_threadpool_;
+  static std::once_flag io_init_flag_;
+};
+
 // Run a function asynchronously.
 // NOTE: The function must return void. If the function need to return a value,
 // you can use lambda to capture a value pointer.
@@ -143,5 +154,10 @@ std::future<void> Async(Callback callback) {
   return ThreadPool::GetInstance()->Run(callback);
 }
 
+template <typename Callback>
+std::future<void> AsyncIO(Callback callback) {
+  return ThreadPoolIO::GetInstanceIO()->Run(callback);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc
index 4da83d630a5632233ddff6f08174dcabc1c696f8..27a4ffd4fcbf293a3dea1744b29384d0bee0c137 100644
--- a/paddle/fluid/framework/threadpool_test.cc
+++ b/paddle/fluid/framework/threadpool_test.cc
@@ -15,14 +15,14 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <atomic>
 
-#include "threadpool.h"
+#include "paddle/fluid/framework/threadpool.h"
 
 namespace framework = paddle::framework;
 
-void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
+void do_sum(framework::ThreadPool* pool, std::atomic<int>* sum, int cnt) {
   std::vector<std::future<void>> fs;
   for (int i = 0; i < cnt; ++i) {
-    fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
+    fs.push_back(framework::Async([sum]() { sum->fetch_add(1); }));
   }
 }
 
@@ -46,7 +46,7 @@ TEST(ThreadPool, ConcurrentRun) {
   int n = 50;
   // sum = (n * (n + 1)) / 2
   for (int i = 1; i <= n; ++i) {
-    std::thread t(do_sum, pool, std::ref(sum), i);
+    std::thread t(do_sum, pool, &sum, i);
     threads.push_back(std::move(t));
   }
   for (auto& t : threads) {
diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h
index 78996908b18a5a0935d8de9920e8ccef9069e74b..f6c6a1fec13d8b12efd1fa71a7a93316e484d045 100644
--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
@@ -35,24 +35,25 @@ class Tuple {
  public:
   using ElementVars = std::vector<ElementVar>;
 
-  Tuple(std::vector<ElementVar>& var, std::vector<VarDesc>& var_desc)
+  Tuple(const std::vector<ElementVar>& var,
+        const std::vector<VarDesc>& var_desc)
       : var_(var), var_desc_(var_desc) {}
-  Tuple(std::vector<ElementVar>& var) : var_(var) {}
+  explicit Tuple(std::vector<ElementVar>& var) : var_(var) {}
 
-  ElementVar get(int idx) const { return var_[idx]; };
+  ElementVar get(int idx) const { return var_[idx]; }
 
-  ElementVar& get(int idx) { return var_[idx]; };
+  ElementVar& get(int idx) { return var_[idx]; }
 
-  bool isSameType(Tuple& t) const;
+  bool isSameType(const Tuple& t) const;
 
-  size_t getSize() const { return var_.size(); };
+  size_t getSize() const { return var_.size(); }
 
  private:
   ElementVars var_;
   std::vector<VarDesc> var_desc_;
 };
 
-bool Tuple::isSameType(Tuple& t) const {
+bool Tuple::isSameType(const Tuple& t) const {
   size_t tuple_size = getSize();
   if (tuple_size != t.getSize()) {
     return false;
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 4879209ece9fdfea91e484a4118c00a2a2a2b4f7..e099e40f121ff13657e563eb608feecbca0551be 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -35,7 +35,8 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using Attribute =
     boost::variant<boost::blank, int, float, std::string, std::vector<int>,
                    std::vector<float>, std::vector<std::string>, bool,
-                   std::vector<bool>, BlockDesc*, int64_t>;
+                   std::vector<bool>, BlockDesc*, int64_t,
+                   std::vector<BlockDesc*>>;
 
 using AttributeMap = std::unordered_map<std::string, Attribute>;
 
diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h
index f62415fda67a506763494886eb499fbb09c5caa6..9f7a21ef42b8d3e74b6e211d6254294ba1fa2341 100644
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <string>
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/framework.pb.h"
diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h
index 2b646d78f0b23ec3e065c891826856c2341d4ac1..429997c8b89fef7aa164e878095ab3b5c9998e5b 100644
--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -24,18 +24,24 @@ limitations under the License. */
 
 namespace paddle {
 namespace framework {
+
+template <typename T>
+bool IsType(const std::type_index& type_index) {
+  return type_index == std::type_index(typeid(T));
+}
+
 inline proto::VarType::Type ToVarType(std::type_index type) {
-  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
+  if (IsType<LoDTensor>(type)) {
     return proto::VarType_Type_LOD_TENSOR;
-  } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
+  } else if (IsType<LoDRankTable>(type)) {
     return proto::VarType_Type_LOD_RANK_TABLE;
-  } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
+  } else if (IsType<LoDTensorArray>(type)) {
     return proto::VarType_Type_LOD_TENSOR_ARRAY;
-  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
+  } else if (IsType<SelectedRows>(type)) {
     return proto::VarType_Type_SELECTED_ROWS;
-  } else if (type.hash_code() == typeid(ReaderHolder).hash_code()) {
+  } else if (IsType<ReaderHolder>(type)) {
     return proto::VarType_Type_READER;
-  } else if (type.hash_code() == typeid(ChannelHolder).hash_code()) {
+  } else if (IsType<ChannelHolder>(type)) {
     return proto::VarType_Type_CHANNEL;
   } else {
     PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc
index 1dced845ed7849d9f5a6de16dfe627d52fdb5488..14b81ddfecb8c996ae8709910c022a074e91eb3c 100644
--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/var_type_inference.h"
+#include <string>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -23,8 +24,7 @@ namespace framework {
 
 class SumOpMaker : public OpProtoAndCheckerMaker {
  public:
-  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("X", "").AsDuplicable();
     AddOutput("Out", "");
     AddComment("");
diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h
index 87ddfe2ff9abfa3f4d99033686b197b10d8231fa..067e0c2b8389f88639fd9b95bd680702517efee1 100644
--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <memory>
+#include <string>
 #include <typeindex>
 #include <typeinfo>
 
@@ -67,7 +68,7 @@ class Variable {
   // parameter of Variable.
   template <typename T>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
+    explicit PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
 
     virtual const std::type_info& Type() const { return type_; }
     virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 17ccca8cdcbcaabaddbbc0ca1d3ca4fdf054b0fb..7071eea19c355c04711a11c224985be96c6589f4 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,19 +1,37 @@
-set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
 
+# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal?
 cc_library(paddle_fluid_api
     SRCS io.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
-# Create static library
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-cc_library(paddle_fluid DEPS ${fluid_modules})
 
+if(WITH_CONTRIB)
+  set(fluid_modules "${fluid_modules}" paddle_inference_api)
+endif()
+
+# Create static library
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
     SRCS io.cc
-    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+    DEPS ${fluid_modules} paddle_fluid_api)
+
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
+if(NOT APPLE)
+  # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
+  set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+endif()
 
 if(WITH_TESTING)
+  # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
 endif()
+
+add_subdirectory(analysis)
+
+if (TENSORRT_FOUND)
+  add_subdirectory(tensorrt)
+endif()
diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cdd67fdc929851979fe0a38afe1af74ec7321b8a
--- /dev/null
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -0,0 +1,38 @@
+cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc
+  fluid_to_data_flow_graph_pass.cc
+  data_flow_graph_to_fluid_pass.cc
+  dfg_graphviz_draw_pass.cc
+  tensorrt_subgraph_pass.cc
+  tensorrt_subgraph_node_mark_pass.cc
+  analyzer.cc
+  helper.cc
+  DEPS framework_proto proto_desc)
+cc_test(test_node SRCS node_tester.cc DEPS analysis)
+cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
+
+set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
+
+function (inference_analysis_test TARGET)
+    if(WITH_TESTING)
+        set(options "")
+        set(oneValueArgs "")
+        set(multiValueArgs SRCS)
+        cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+        cc_test(${TARGET}
+                SRCS "${analysis_test_SRCS}"
+                DEPS analysis
+                ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
+        set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
+    endif(WITH_TESTING)
+endfunction(inference_analysis_test)
+
+inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc)
+inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc)
+inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc)
+inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc)
+inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc)
+inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc)
+inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc)
+inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc)
+inference_analysis_test(test_analyzer SRCS analyzer_tester.cc)
diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6fd73958bc480fe3983b9622c03ac77fba9ec8a7
--- /dev/null
+++ b/paddle/fluid/inference/analysis/README.md
@@ -0,0 +1,57 @@
+# Inference Analysis
+
+The `inference/analysis` module is used to analyze and optimize the inference program,
+it references some philosophy from `LLVM/analysis`, 
+and make the various optimization features be pluggable and co-exist in a pipeline.
+
+We borrowed some concepts from LLVM, such as
+
+- [Pass](./pass.h)es to implement optimization that traverse the inference program,
+- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
+- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
+
+There are some other basic concepts here
+
+- [Node](./node.h), the node in a `DataFlowGraph`,
+  - `Function`, the Operator in Fluid,
+  - `Value`, the Variable in Fluid;
+- [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline,
+
+## How it works
+
+The `inference/analysis` module make all the passes in a pipeline, and works in such way:
+
+1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
+2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes,
+3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
+
+The new optimization features can be added as an independent `Pass` and controlled by gflags,
+each pass will generate unified debug information or visualization for better debugging.
+
+## Supported Passes
+
+### `FluidToDataFlowGraphPass`
+Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes, 
+this should be the first pass of the pipeline.
+
+### `DataFlowGraphToFluidPass`
+Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline.
+
+### `TensorRTSubgraphNodeMarkPass`
+Mark the `Node` that are supported by TensorRT, 
+this pass will generate a visualization file which can be used for debugging.
+
+### `TensorRTSubGraphPass`
+Split the sub-graph that are can be accelerated by TensorRT.
+
+### `DFG_GraphvizDrawPass`
+This pass is just for debug, it will visualize the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool.
+
+It can be used as a helper class that draws the modified graph after each pass.
+
+## Utilities
+
+There is some helper legacy/function/class for analysis.
+
+- [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes,
+- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms, it uses `iterator` to make the algorithms easy to share across different passes.
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a4625f008c15300b88ef0bce71cd7d8aa473c9a8
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include <string>
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
+            "Enable subgraph to TensorRT engine for acceleration");
+
+DEFINE_string(inference_analysis_graphviz_log_root, "./",
+              "Graphviz debuger for data flow graphs.");
+
+class DfgPassManagerImpl final : public DfgPassManager {
+ public:
+  DfgPassManagerImpl() {
+    // TODO(Superjomn) set the key with pass reprs.
+    AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass);
+    if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) {
+      auto trt_teller = [](const Node* node) {
+        if (!node->IsFunction()) return false;
+        return static_cast<const Function*>(node)->func_type() == "mul";
+      };
+      AddPass("tensorrt-subgraph-marker",
+              new TensorRTSubgraphNodeMarkPass(trt_teller));
+      AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller));
+    }
+    AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass);
+  }
+
+  std::string repr() const override { return "dfg-pass-manager"; }
+  std::string description() const override { return "DFG pass manager."; }
+
+ private:
+  void AddPass(const std::string& name, Pass* pass) {
+    LOG(INFO) << "Adding pass " << name;
+    Register(name, pass);
+    AddGraphvizDebugerPass(pass);
+  }
+
+  // Add the graphviz debuger pass if the parent pass has one.
+  void AddGraphvizDebugerPass(Pass* pass) {
+    auto* debuger_pass = pass->CreateGraphvizDebugerPass();
+    if (debuger_pass) {
+      LOG(INFO) << " - register debug pass [" << debuger_pass->repr() << "]";
+      Register(debuger_pass->repr(), debuger_pass);
+    }
+  }
+};
+
+Analyzer::Analyzer() { Register("manager1", new DfgPassManagerImpl); }
+
+void Analyzer::Run(Argument* argument) {
+  for (auto& x : data_) {
+    PADDLE_ENFORCE(x->Initialize(argument));
+    x->RunAll();
+    PADDLE_ENFORCE(x->Finalize());
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9e14fb1947da059c8d126d3da182ce446f6421e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+/*
+ * This file contains Analyzer, an class that exposed as a library that analyze
+ * and optimize
+ * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to
+ * control whether
+ * an process is applied on the program.
+ *
+ * The processes are called Passes in analysis, the Passes are placed in a
+ * pipeline, the first
+ * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to
+ * a data flow
+ * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow
+ * graph to a
+ * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes
+ * which take a
+ * node or data flow graph as input.
+ *
+ * The Analyzer can be used in two methods, the first is a executable file which
+ * can be used to
+ * pre-process the inference model and can be controlled by passing difference
+ * command flags;
+ * the other way is to compose inside the inference API as a runtime pre-process
+ * phase in the
+ * inference service.
+ */
+
+#include <gflags/gflags.h>
+#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
+// flag if not available.
+DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
+DECLARE_string(inference_analysis_graphviz_log_root);
+
+class Analyzer : public OrderedRegistry<PassManager> {
+ public:
+  // Register all the pass-managers.
+  Analyzer();
+
+  void Run(Argument* argument);
+
+  DISABLE_COPY_AND_ASSIGN(Analyzer);
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d7c1a72932a39f878add2bb884e280b91d3c38c0
--- /dev/null
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, main) {
+  Analyzer analyser;
+  analyser.Run(&argument);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/argument.cc b/paddle/fluid/inference/analysis/argument.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb0263d5d98e86b612696ebde66d17fb2543809b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/argument.cc
@@ -0,0 +1,15 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/argument.h"
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d316f20bff7a68754b0afec6463bd5d7579227f
--- /dev/null
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines the class Argument, which is the input and output of the
+ * analysis module. All the fields that needed either by Passes or PassManagers
+ * are contained in Argument.
+ *
+ * TODO(Superjomn) Find some way better to contain the fields when it grow too
+ * big.
+ */
+
+#pragma once
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * The argument definition of both Pass and PassManagers.
+ *
+ * All the fields should be registered here for clearness.
+ */
+struct Argument {
+  // The graph that process by the Passes or PassManagers.
+  std::unique_ptr<DataFlowGraph> main_dfg;
+
+  // The original program desc.
+  std::unique_ptr<framework::proto::ProgramDesc> origin_program_desc;
+
+  // The processed program desc.
+  std::unique_ptr<framework::proto::ProgramDesc> transformed_program_desc;
+};
+
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+#define ANALYSIS_ARGUMENT_CHECK_FIELD(field__)               \
+  if (UNLIKELY(!(field__))) {                                \
+    LOG(ERROR) << "field " << #field__ << " should be set."; \
+    return false;                                            \
+  }
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d09bf3ed161703b0cf273522921e157c7360a0bc
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -0,0 +1,213 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/dot.h"
+#include "paddle/fluid/inference/analysis/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// It is a better idea that the inputs and outputs of this graph is set manually
+// before, but there must be a Pass that helps to prune the unnecessary ops that
+// do not contribute to the given targets, so in this pass, analysis and get the
+// inputs and outputs is OK.
+void DataFlowGraph::Build() {
+  inputs.clear();
+  outputs.clear();
+  std::unordered_set<Node *> ins;
+  std::unordered_set<Node *> outs;
+  for (auto &node : nodes.nodes()) {
+    for (auto *in : node->inlinks) {
+      ins.insert(in);
+    }
+    for (auto *out : node->outlinks) {
+      outs.insert(out);
+    }
+  }
+
+  // The nodes that in ins but not in outs is the graph's inputs
+  // similarly, the nodes that in outs but not in ins is the graphs' outputs
+  for (auto *in : ins) {
+    if (!outs.count(in)) {
+      inputs.push_back(in);
+    }
+  }
+  for (auto *out : outs) {
+    if (!outs.count(out)) {
+      outputs.push_back(out);
+    }
+  }
+
+  Clean();
+}
+
+void DataFlowGraph::Clean() {
+  for (auto &node : nodes.nodes()) {
+    std::unordered_set<Node *> inlinks_set(node->inlinks.begin(),
+                                           node->inlinks.end());
+    std::unordered_set<Node *> outlinks_set(node->outlinks.begin(),
+                                            node->outlinks.end());
+    if (inlinks_set.size() < node->inlinks.size()) {
+      LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
+      node->inlinks.assign(inlinks_set.begin(), inlinks_set.end());
+    }
+    if (outlinks_set.size() < node->outlinks.size()) {
+      LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs";
+      node->outlinks.assign(outlinks_set.begin(), outlinks_set.end());
+    }
+  }
+}
+
+std::string DataFlowGraph::DotString() const {
+  Dot dot;
+
+  // Add nodes
+  for (size_t i = 0; i < nodes.size(); i++) {
+    const Node &node = nodes.Get(i);
+    dot.AddNode(node.repr(), node.dot_attrs());
+  }
+
+  // Add edges
+  for (size_t i = 0; i < nodes.size(); i++) {
+    const Node &node = nodes.Get(i);
+    for (auto &in : node.inlinks) {
+      dot.AddEdge(in->repr(), node.repr(), {});
+    }
+  }
+  return dot.Build();
+}
+
+//
+// NodesBFSIterator
+//
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    const std::vector<Node *> &source)
+    : queue_(source.begin(), source.end()) {}
+
+// GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+//     GraphTraits<DataFlowGraph>::NodesBFSIterator &&other) noexcept
+//     : queue_(std::move(other.queue_)),
+//       visited_(std::move(other.visited_)) {}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator::NodesBFSIterator(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other)
+    : queue_(other.queue_), visited_(other.visited_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator*() {
+  PADDLE_ENFORCE(!queue_.empty());
+  return *queue_.front();
+}
+
+Node *GraphTraits<DataFlowGraph>::NodesBFSIterator::operator->() {
+  PADDLE_ENFORCE(!queue_.empty());
+  return queue_.front();
+}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator &
+GraphTraits<DataFlowGraph>::NodesBFSIterator::operator=(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
+  queue_ = other.queue_;
+  visited_ = other.visited_;
+  return *this;
+}
+
+GraphTraits<DataFlowGraph>::NodesBFSIterator
+    &GraphTraits<DataFlowGraph>::NodesBFSIterator::operator++() {
+  PADDLE_ENFORCE(!queue_.empty());
+  auto *cur = queue_.front();
+  visited_.insert(cur);
+  queue_.pop_front();
+  for (auto *output : cur->outlinks) {
+    if (!visited_.count(output)) {
+      queue_.push_back(output);
+      visited_.insert(output);
+    }
+  }
+  return *this;
+}
+
+bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
+    const GraphTraits<DataFlowGraph>::NodesBFSIterator &other) {
+  if (queue_.empty()) return other.queue_.empty();
+  if ((!queue_.empty()) && (!other.queue_.empty())) {
+    return queue_.front() == other.queue_.front() &&
+           visited_.size() == other.visited_.size();  // here need to check the
+                                                      // equality of queue and
+    // visited. Just a light but week implementation.
+  }
+  return false;
+}
+
+//
+// NodesDFSIterator
+//
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    const std::vector<Node *> &source) {
+  for (auto *x : source) stack_.push(x);
+}
+
+// GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+//     GraphTraits<DataFlowGraph>::NodesDFSIterator &&other) noexcept
+//     : stack_(std::move(other.stack_)),
+//       visited_(std::move(other.visited_)) {}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator::NodesDFSIterator(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other)
+    : stack_(other.stack_), visited_(other.visited_) {}
+
+Node &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator*() {
+  PADDLE_ENFORCE(!stack_.empty());
+  return *stack_.top();
+}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator
+    &GraphTraits<DataFlowGraph>::NodesDFSIterator::operator++() {
+  if (stack_.empty()) return *this;
+  visited_.insert(stack_.top());
+  auto *cur = stack_.top();
+  stack_.pop();
+  for (auto *x : cur->outlinks) {
+    if (!visited_.count(x)) {
+      stack_.push(x);
+      visited_.insert(x);
+    }
+  }
+  return *this;
+}
+bool GraphTraits<DataFlowGraph>::NodesDFSIterator::operator==(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
+  if (stack_.empty()) return other.stack_.empty();
+  if ((!stack_.empty()) && (!other.stack_.empty())) {
+    return stack_.top() == other.stack_.top();
+  }
+  return false;
+}
+
+GraphTraits<DataFlowGraph>::NodesDFSIterator &
+GraphTraits<DataFlowGraph>::NodesDFSIterator::operator=(
+    const GraphTraits<DataFlowGraph>::NodesDFSIterator &other) {
+  stack_ = other.stack_;
+  visited_ = other.visited_;
+  return *this;
+}
+Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
+  return stack_.top();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4fefc83e0c551d52bec87299bcbc966e7a2dbf7
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -0,0 +1,173 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * Data flow graph is an pass that build the basic graph. It contains a graph
+ * and the iterators that enable the iteration over the graph.
+ */
+
+#pragma once
+
+#include <deque>
+#include <stack>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/graph_traits.h"
+#include "paddle/fluid/inference/analysis/node.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * DataFlowGraph - A container of Value and Function Nodes.
+ */
+struct DataFlowGraph {
+  NodeMap nodes;
+  std::vector<Node *> inputs;
+  std::vector<Node *> outputs;
+
+  // Extract inputs and outputs of the graph.
+  void Build();
+
+  // Output a DOT graph file for debug.
+  std::string DotString() const;
+
+ private:
+  // Remove duplicate edges and so on.
+  void Clean();
+};
+
+/*
+ * An graph trait help to traverse the graph using BFS.
+ * The BFS start from a graph's inputs, the graph should be fully-connected, so
+ * that the iterator can reach the end.
+ */
+template <>
+struct GraphTraits<DataFlowGraph> {
+  // BFS iterator on nodes.
+  struct NodesBFSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesBFSIterator() = default;
+    explicit NodesBFSIterator(const std::vector<Node *> &source);
+    // NodesBFSIterator(NodesBFSIterator &&other) noexcept;
+    // NOTE Heavy to use.
+    NodesBFSIterator(const NodesBFSIterator &other);
+
+    Node &operator*();
+    NodesBFSIterator &operator++();
+    Node *operator->();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesBFSIterator &operator=(const NodesBFSIterator &other);
+    bool operator==(const NodesBFSIterator &other);
+    bool operator!=(const NodesBFSIterator &other) { return !(*this == other); }
+
+   private:
+    std::deque<Node *> queue_;
+    std::unordered_set<Node *> visited_;
+  };
+
+  // DFS iterator on nodes.
+  struct NodesDFSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesDFSIterator() = default;
+    explicit NodesDFSIterator(const std::vector<Node *> &source);
+    // NodesDFSIterator(NodesDFSIterator &&other) noexcept;
+    NodesDFSIterator(const NodesDFSIterator &other);
+
+    Node &operator*();
+    NodesDFSIterator &operator++();
+    // TODO(Superjomn) current implementation just compare the first
+    // element, need to compare the graph and all the elements in the queue and
+    // set.
+    NodesDFSIterator &operator=(const NodesDFSIterator &other);
+    bool operator==(const NodesDFSIterator &other);
+    bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
+    Node *operator->();
+
+   private:
+    std::stack<Node *> stack_;
+    std::unordered_set<Node *> visited_;
+  };
+
+  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
+
+  // default use BFS to visit the nodes.
+  iterator_range<NodesBFSIterator> nodes() {
+    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
+  }
+  iterator_range<NodesBFSIterator> nodes_in_BFS() {
+    return iterator_range<NodesBFSIterator>(nodes_bfs_begin(), nodes_bfs_end());
+  }
+  iterator_range<NodesDFSIterator> nodes_in_DFS() {
+    return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
+  }
+
+ private:
+  NodesBFSIterator nodes_bfs_begin() {
+    return NodesBFSIterator(graph_->inputs);
+  }
+  NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
+  NodesDFSIterator nodes_dfs_begin() {
+    return NodesDFSIterator(graph_->inputs);
+  }
+  NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
+
+ private:
+  DataFlowGraph *graph_;
+};
+
+// Extract the inputs and outputs of a graph. The inputs and outputs of a
+// sub-graph is the inputs nodes and output nodes that doesn't inside the
+// sub-graph.
+static std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  // Input a Value, check whether its inlink is in the subgraph.
+  auto inlink_in_subgraph = [&](Node *n) {
+    for (auto *in : n->inlinks) {
+      if (nodes.count(in)) return true;
+    }
+    return false;
+  };
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      // The Value that is written by nodes inside a sub-graph shouldn't be the
+      // input of the sub-graph.
+      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
+          !inlink_in_subgraph(in)) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9d7cceeb65888b8ba3fdf39e88fc2877abd82d11
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST(DataFlowGraph, BFS) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  dfg.Build();
+
+  for (auto* in : dfg.inputs) {
+    LOG(INFO) << "inputs: " << in->name() << " "
+              << static_cast<int>(in->type());
+  }
+  for (auto* out : dfg.outputs) {
+    LOG(INFO) << "outputs: " << out->name() << " "
+              << static_cast<int>(out->type());
+  }
+
+  GraphTraits<DataFlowGraph> trait(&dfg);
+  auto nodes = trait.nodes();
+  size_t count = 0;
+  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+    LOG(INFO) << "visiting " << it->name();
+    ++count;
+  }
+  ASSERT_EQ(count, dfg.nodes.size());
+}
+
+TEST(DataFlowGraph, DFS) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  dfg.Build();
+  GraphTraits<DataFlowGraph> trait(&dfg);
+  auto nodes = trait.nodes_in_DFS();
+  size_t count = 0;
+  for (auto it = nodes.begin(); it != nodes.end(); ++it) {
+    LOG(INFO) << "visiting " << it->name();
+    ++count;
+  }
+  ASSERT_EQ(count, dfg.nodes.size());
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..29ca008123addf07959b965a4b54bf55b18c401d
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -0,0 +1,171 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include <vector>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/proto_desc.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+using framework::proto::ProgramDesc;
+
+std::vector<std::string> ExtractParameters(
+    const std::vector<std::unique_ptr<Node>>& nodes);
+
+bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
+  PADDLE_ENFORCE(!argument->transformed_program_desc);
+  // The transformed_program_desc should inherit all the VarDesc and BlockDesc
+  // from the original program desc. The operators of the main block(the first
+  // block) should rewritten by data flow graph.
+  argument->transformed_program_desc.reset(
+      new ProgramDesc(*argument->origin_program_desc));
+  argument->transformed_program_desc->mutable_blocks(framework::kRootBlockIndex)
+      ->clear_ops();
+  desc_ = argument->transformed_program_desc.get();
+  argument_ = argument;
+  return true;
+}
+
+bool DataFlowGraphToFluidPass::Finalize() { return true; }
+
+void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
+  auto traits = GraphTraits<DataFlowGraph>(graph);
+  for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
+    if (it->deleted()) continue;
+
+    switch (it->type()) {
+      case Node::Type::kFunction: {
+        LOG(INFO) << "add function " << it->repr();
+        AddFluidOp(&(*it));
+      } break;
+      case Node::Type::kFunctionBlock: {
+        LOG(INFO) << "add engine op " << it->repr() << " , "
+                  << static_cast<FunctionBlock*>(&(*it))->subgraph.size();
+        AddEngineOp(&(*it));
+      } break;
+      default:
+        continue;
+    }
+  }
+}
+
+void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
+  auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
+  // currently only the main block is analyzed.
+  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto* op = main_block->add_ops();
+  *op = *ori_op;  // copy the attributes, by default, these will not be changed
+                  // by analysis phrase.
+  // The inputs and outputs of the existing ops are not changed by tensorrt
+  // subgraph pass.
+  // NOTE It might be changed by other passes in the long run.
+}
+
+void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
+                       const framework::proto::BlockDesc& block) {
+  static int counter{0};
+  PADDLE_ENFORCE(node->IsFunctionBlock());
+  framework::OpDesc desc;
+  auto* func = static_cast<FunctionBlock*>(node);
+
+  // collect inputs
+  std::vector<std::string> io;
+  for (auto* x : func->inlinks) {
+    io.push_back(x->name());
+  }
+  desc.SetInput("Xs", io);
+
+  // collect outputs
+  io.clear();
+  for (auto* x : func->outlinks) {
+    io.push_back(x->name());
+  }
+  desc.SetOutput("Ys", io);
+
+  desc.SetType("tensorrt_engine");
+  // Set attrs
+  SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
+  SetAttr(desc.Proto(), "engine_unique_key",
+          "trt-" + std::to_string(counter++));
+  SetAttr(desc.Proto(), "max_batch", 100);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "max_workspace",
+          1024);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
+  node->SetPbMsg(desc.Proto()->SerializeAsString());
+}
+
+std::vector<std::string> ExtractParameters(
+    const std::vector<std::unique_ptr<Node>>& nodes) {
+  std::vector<std::string> parameters;
+  for (const auto& node : nodes) {
+    if (!node->IsValue()) continue;
+    PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
+    framework::proto::VarDesc var;
+    var.ParseFromString(node->pb_msg());
+    if (var.persistable()) {
+      parameters.push_back(var.name());
+    }
+  }
+  return parameters;
+}
+
+void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
+  // TODO(Superjomn) Here need to expose some arguments for default setting.
+  PADDLE_ENFORCE(node->IsFunctionBlock());
+  auto* block_node = static_cast<FunctionBlock*>(node);
+  framework::proto::BlockDesc proto;
+  framework::BlockDesc block_desc(nullptr, &proto);
+  // copy ops.
+  for (auto* node : block_node->subgraph) {
+    auto* op = block_desc.AppendOp();
+    PADDLE_ENFORCE(!node->pb_msg().empty());
+    op->Proto()->ParseFromString(node->pb_msg());
+  }
+  CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
+  auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
+  auto* op = main_block->add_ops();
+  PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
+  op->ParseFromString(node->pb_msg());
+}
+
+namespace {
+class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
+ public:
+  using Config = DFG_GraphvizDrawPass::Config;
+  explicit DFG_DebuggerPass(const Config& config)
+      : DFG_GraphvizDrawPass(config) {}
+
+  std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
+
+  bool Finalize() override { return true; }
+};
+}  // namespace
+
+Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
+  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
+      FLAGS_inference_analysis_graphviz_log_root,
+      "data_flow_graph_to_fluid_graphviz_debugger"));
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..edc84b02ed20991e3e7c6c437d2b1fac169bae03
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+/*
+ * This file implements the transformation from fluid ProgramDesc to data flow
+ * graph.
+ */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
+ public:
+  DataFlowGraphToFluidPass() = default;
+
+  bool Initialize(Argument *argument) override;
+  bool Finalize() override;
+
+  void Run(DataFlowGraph *graph) override;
+
+  std::string repr() const override { return "DFG to fluid"; }
+  std::string description() const override {
+    return "Transform a DFG to a Fluid ProgramDesc";
+  }
+
+  Pass *CreateGraphvizDebugerPass() const override;
+
+ protected:
+  // Add a Fluid Op into the ProgramDesc.
+  void AddFluidOp(Node *node);
+  // Add a EngineOp into the ProgramDesc.
+  void AddEngineOp(Node *node);
+
+ private:
+  framework::proto::ProgramDesc *desc_;
+  Argument *argument_;
+};
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8fc5e580a98f76233f01fdc4d7987311f78ee45
--- /dev/null
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc
@@ -0,0 +1,48 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+
+#include <glog/logging.h>
+#include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/io.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Test) {
+  DataFlowGraph graph;
+
+  FluidToDataFlowGraphPass pass0;
+  DataFlowGraphToFluidPass pass1;
+  ASSERT_TRUE(pass0.Initialize(&argument));
+  ASSERT_TRUE(pass1.Initialize(&argument));
+
+  pass0.Run(&graph);
+  pass1.Run(&graph);
+
+  pass0.Finalize();
+  pass1.Finalize();
+
+  LOG(INFO) << graph.nodes.size();
+}
+
+};  // namespace analysis
+};  // namespace inference
+};  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/device.h b/paddle/fluid/inference/analysis/device.h
new file mode 100644
index 0000000000000000000000000000000000000000..585c9923291e5f9cb6e50dbc4bcd28c374191048
--- /dev/null
+++ b/paddle/fluid/inference/analysis/device.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+enum class Device { CPU, GPU };
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6f85484756417e103cbb60bcb664e8b800b9f28
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
@@ -0,0 +1,59 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+int DFG_GraphvizDrawPass::counter_{0};
+
+void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {
+  auto content = Draw(graph);
+  auto dot_path = GenDotPath();
+  std::ofstream file(dot_path);
+  file.write(content.c_str(), content.size());
+  file.close();
+
+  auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
+  std::string message;
+  LOG(INFO) << "draw to " << png_path;
+  ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
+}
+
+std::string DFG_GraphvizDrawPass::Draw(DataFlowGraph *graph) {
+  Dot dot;
+  // Add nodes
+  for (size_t i = 0; i < graph->nodes.size(); i++) {
+    const Node &node = graph->nodes.Get(i);
+    if (config_.display_deleted_node || !node.deleted()) {
+      dot.AddNode(node.repr(), node.dot_attrs());
+    }
+  }
+  // Add edges
+  for (size_t i = 0; i < graph->nodes.size(); i++) {
+    const Node &node = graph->nodes.Get(i);
+    if (!config_.display_deleted_node && node.deleted()) continue;
+    for (auto &in : node.inlinks) {
+      if (!config_.display_deleted_node && in->deleted()) continue;
+      dot.AddEdge(in->repr(), node.repr(), {});
+    }
+  }
+  return dot.Build();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..17445ab4407a159ca11345bc9a9226b3ad0044f0
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file create an DFG_GraphvizDrawPass which helps to draw a data flow
+ * graph's structure using graphviz.
+ */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include "paddle/fluid/inference/analysis/dot.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Output a dot file and write to some place.
+ */
+class DFG_GraphvizDrawPass : public DataFlowGraphPass {
+ public:
+  struct Config {
+    Config(const std::string &dir, const std::string &id,
+           bool display_deleted_node = false)
+        : dir(dir), id(id), display_deleted_node(display_deleted_node) {}
+
+    // The directory to store the .dot or .png files.
+    const std::string dir;
+    // The identifier for this dot file.
+    const std::string id;
+    // Whether to display deleted nodes, default false.
+    const bool display_deleted_node;
+  };
+
+  explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
+
+  bool Initialize(Argument *argument) override { return true; }
+  void Run(DataFlowGraph *graph) override;
+  bool Finalize() override { return true; }
+
+  std::string repr() const override { return "DFG graphviz drawer"; }
+  std::string description() const override {
+    return "Debug a DFG by draw with graphviz";
+  }
+
+ protected:
+  // A counter to add a number prefix to the debugger image output so that they
+  // will sort in the triggered order.
+  static int counter_;
+
+  // Path of the dot file to output.
+  std::string GenDotPath() const {
+    return config_.dir + "/" + std::to_string(counter_++) + "-graph_" +
+           config_.id + ".dot";
+  }
+
+  virtual std::string Draw(DataFlowGraph *graph);
+
+  Config config_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..162455b9c4e06b7fbb4bdede30444faf6a8a1509
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc
@@ -0,0 +1,48 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+
+#include <gtest/gtest.h>
+#include <fstream>
+#include <string>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
+  auto dfg = ProgramDescToDFG(*argument.origin_program_desc);
+  DFG_GraphvizDrawPass::Config config("./", "test");
+  DFG_GraphvizDrawPass pass(config);
+  pass.Initialize(&argument);
+  pass.Run(&dfg);
+
+  // test content
+  std::ifstream file("./0-graph_test.dot");
+  ASSERT_TRUE(file.is_open());
+
+  std::string line;
+  int no{0};
+  while (std::getline(file, line)) {
+    no++;
+  }
+  // DFG is sensitive to ProgramDesc, be careful to change the existing models.
+  ASSERT_EQ(no, 82);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dot.cc b/paddle/fluid/inference/analysis/dot.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5471ffcb594a6915e9e65c0fee5adc5f5bdf40c
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dot.cc
@@ -0,0 +1,23 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/dot.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+size_t Dot::counter = 0;
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bf1840fdda8508b52d7274a338c5b1c95baf354
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dot.h
@@ -0,0 +1,155 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file implements some helper classes and methods for DOT programming
+ * support. It will give a visualization of the graph and that helps to debug
+ * the logics of each Pass.
+ */
+#pragma once
+
+#include <glog/logging.h>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * A Dot template that helps to build a DOT graph definition.
+ */
+class Dot {
+ public:
+  static size_t counter;
+
+  struct Attr {
+    std::string key;
+    std::string value;
+
+    Attr(const std::string& key, const std::string& value)
+        : key(key), value(value) {}
+
+    std::string repr() const {
+      std::stringstream ss;
+      ss << key << "=" << '"' << value << '"';
+      return ss.str();
+    }
+  };
+
+  struct Node {
+    std::string name;
+    std::vector<Attr> attrs;
+
+    Node(const std::string& name, const std::vector<Attr>& attrs)
+        : name(name),
+          attrs(attrs),
+          id_("node_" + std::to_string(Dot::counter++)) {}
+
+    std::string id() const { return id_; }
+
+    std::string repr() const {
+      std::stringstream ss;
+      CHECK(!name.empty());
+      ss << id_;
+      for (size_t i = 0; i < attrs.size(); i++) {
+        if (i == 0) {
+          ss << "[label=" << '"' << name << '"' << " ";
+        }
+        ss << attrs[i].repr();
+        ss << ((i < attrs.size() - 1) ? " " : "]");
+      }
+      return ss.str();
+    }
+
+   private:
+    std::string id_;
+  };
+
+  struct Edge {
+    std::string source;
+    std::string target;
+    std::vector<Attr> attrs;
+
+    Edge(const std::string& source, const std::string& target,
+         const std::vector<Attr>& attrs)
+        : source(source), target(target), attrs(attrs) {}
+
+    std::string repr() const {
+      std::stringstream ss;
+      CHECK(!source.empty());
+      CHECK(!target.empty());
+      ss << source << "->" << target;
+      for (size_t i = 0; i < attrs.size(); i++) {
+        if (i == 0) {
+          ss << "[";
+        }
+        ss << attrs[i].repr();
+        ss << ((i < attrs.size() - 1) ? " " : "]");
+      }
+      return ss.str();
+    }
+  };
+
+  Dot() = default;
+
+  explicit Dot(const std::vector<Attr>& attrs) : attrs_(attrs) {}
+
+  void AddNode(const std::string& name, const std::vector<Attr>& attrs) {
+    CHECK(!nodes_.count(name)) << "duplicate Node '" << name << "'";
+    nodes_.emplace(name, Node{name, attrs});
+  }
+
+  void AddEdge(const std::string& source, const std::string& target,
+               const std::vector<Attr>& attrs) {
+    CHECK(!source.empty());
+    CHECK(!target.empty());
+    auto sid = nodes_.at(source).id();
+    auto tid = nodes_.at(target).id();
+    edges_.emplace_back(sid, tid, attrs);
+  }
+
+  // Compile to DOT language codes.
+  std::string Build() const {
+    std::stringstream ss;
+    const std::string indent = "   ";
+    ss << "digraph G {" << '\n';
+
+    // Add graph attrs
+    for (const auto& attr : attrs_) {
+      ss << indent << attr.repr() << '\n';
+    }
+    // add nodes
+    for (auto& item : nodes_) {
+      ss << indent << item.second.repr() << '\n';
+    }
+    // add edges
+    for (auto& edge : edges_) {
+      ss << indent << edge.repr() << '\n';
+    }
+    ss << "} // end G";
+    return ss.str();
+  }
+
+ private:
+  std::unordered_map<std::string, Node> nodes_;
+  std::vector<Edge> edges_;
+  std::vector<Attr> attrs_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..56ceb9bd5d6f41a601d66f6124fb7b4099c9337e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/dot_tester.cc
@@ -0,0 +1,62 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/dot.h"
+
+#include <gtest/gtest.h>
+#include <memory>
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class DotTester : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    std::vector<Dot::Attr> attrs({{"title", "hello"}});
+    dot.reset(new Dot(attrs));
+    dot->AddNode("a", {Dot::Attr{"shape", "box"}, Dot::Attr("color", "blue")});
+    dot->AddNode("b", {});
+    dot->AddNode("c", {});
+    dot->AddEdge("a", "b", {});
+    dot->AddEdge("b", "c", {});
+    dot->AddEdge("a", "c", {});
+  }
+
+  std::unique_ptr<Dot> dot;
+};
+
+TEST_F(DotTester, Build) {
+  auto codes = dot->Build();
+  // Output the DOT language code, the generated codes are too long to compare
+  // the string.
+  //
+  // The output is
+  //
+  // digraph G {
+  //   title="hello"
+  //   node_1
+  //   node_2
+  //   node_0[label="a" shape="box" color="blue"]
+  //   node_0->node_1
+  //   node_1->node_2
+  //   node_0->node_2
+  // } // end G
+  LOG(INFO) << '\n' << codes;
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e918622d74cfb11d83090555be2a768cc14e7742
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument);
+  ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
+  PADDLE_ENFORCE(argument);
+  if (!argument->main_dfg) {
+    LOG(INFO) << "Init DFG";
+    argument->main_dfg.reset(new DataFlowGraph);
+  }
+  desc_ = argument->origin_program_desc.get();
+  return true;
+}
+
+bool FluidToDataFlowGraphPass::Finalize() { return true; }
+
+void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
+  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE(desc_);
+  // insert vars
+  std::unordered_map<std::string, size_t> var2id;
+  auto &main_block = desc_->blocks(framework::kRootBlockIndex);
+  for (int i = 0; i < main_block.vars_size(); i++) {
+    const auto &var = main_block.vars(i);
+    auto *v = graph->nodes.Create(Node::Type::kValue);
+    v->SetName(var.name());
+    v->SetPbDesc(const_cast<void *>(static_cast<const void *>(&var)));
+    v->SetPbMsg(var.SerializeAsString());
+    var2id[var.name()] = v->id();
+  }
+  for (int i = 0; i < main_block.ops_size(); i++) {
+    const auto &op = main_block.ops(i);
+    auto *o = graph->nodes.Create(Node::Type::kFunction);
+    o->SetName(op.type());
+    static_cast<Function *>(o)->SetFuncType(op.type());
+    // Link to the original protobuf message's memory, make it easier to
+    // generate from a data flow graph to fluid ProgramDesc.
+    o->SetPbDesc(const_cast<void *>(static_cast<const void *>(&op)));
+    o->SetPbMsg(op.SerializeAsString());
+
+    // set inputs and outputs
+    // TODO(Superjomn) make sure the InputNames is the real variable name.
+    for (int j = 0; j < op.inputs_size(); j++) {
+      auto &in_var = op.inputs(j);
+      for (int k = 0; k < in_var.arguments_size(); k++) {
+        auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
+        in->outlinks.push_back(o);
+        o->inlinks.push_back(in);
+      }
+    }
+    for (int j = 0; j < op.outputs_size(); j++) {
+      auto &out_var = op.outputs(j);
+      for (int k = 0; k < out_var.arguments_size(); k++) {
+        auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        out->inlinks.push_back(o);
+        o->outlinks.push_back(out);
+      }
+    }
+  }
+  // Analysis and extract the inputs and outputs of this graph.
+  graph->Build();
+}
+
+namespace {
+class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
+ public:
+  using Config = DFG_GraphvizDrawPass::Config;
+  explicit DFG_DebuggerPass(const Config &config)
+      : DFG_GraphvizDrawPass(config) {}
+  std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
+  bool Finalize() override { return true; }
+};
+}
+
+Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const {
+  return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
+      FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger"));
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..da8463b63bd0bb1633bfcb9d7d41a884ddd632c7
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+/*
+ * This file implements the transformation from data flow graph to fluid
+ * ProgramDesc.
+ */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Transform a FluidDesc to a data flow graph.
+ */
+class FluidToDataFlowGraphPass final : public DataFlowGraphPass {
+ public:
+  FluidToDataFlowGraphPass() = default;
+
+  bool Initialize(Argument *argument) override;
+  bool Finalize() override;
+
+  void Run(DataFlowGraph *graph) override;
+
+  std::string repr() const override { return "fluid-to-data-flow-graph"; }
+  std::string description() const override {
+    return "transform a fluid ProgramDesc to a data flow graph.";
+  }
+
+  Pass *CreateGraphvizDebugerPass() const override;
+
+ private:
+  framework::proto::ProgramDesc const *desc_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfbbc284e491bd62a6108d6d14e7896a57d1b63e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc
@@ -0,0 +1,37 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, Init) {
+  FluidToDataFlowGraphPass pass;
+  pass.Initialize(&argument);
+  DataFlowGraph graph;
+  pass.Run(&graph);
+  // Analysis is sensitive to ProgramDesc, careful to change the original model.
+  ASSERT_EQ(graph.nodes.size(), 37);
+  pass.Finalize();
+  LOG(INFO) << '\n' << graph.DotString();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/graph_traits.cc b/paddle/fluid/inference/analysis/graph_traits.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ea70a1d2060e03769d67060dc6f008207342b52
--- /dev/null
+++ b/paddle/fluid/inference/analysis/graph_traits.cc
@@ -0,0 +1,15 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/graph_traits.h"
diff --git a/paddle/fluid/inference/analysis/graph_traits.h b/paddle/fluid/inference/analysis/graph_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..aed2b1e8e27d94b430201d70ecf09d4acc33c8fa
--- /dev/null
+++ b/paddle/fluid/inference/analysis/graph_traits.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the GraphTraits<X> template class that should be specified
+ * by classes that want to be iteratable by generic graph iterators.
+ *
+ * This file also defines the marker class Inverse that is used to iterate over
+ * graphs in a graph defined, inverse ordering...
+ */
+
+#pragma once
+
+#include "paddle/fluid/inference/analysis/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * This class should be specialized by different graph types...
+ * That's why the base class is empty.
+ */
+template <typename GraphType>
+struct GraphTraits {
+  // using NodesBFSIterator = xxx
+
+  // NodesBFSIterator nodes_begin();
+  // NodesBFSIterator nodes_end();
+};
+
+/*
+ * Inverse - This class is used as a marker class to tell the graph iterator to
+ * iterate in a graph defined Inverse order.
+ */
+template <typename GraphType>
+struct Inverse {
+  const GraphType &graph;
+
+  explicit Inverse(const GraphType &graph) : graph(graph) {}
+};
+
+/*
+ * Provide a partial specialization of GraphTraits so that the inverse of an
+ * inverse turns into the original graph.
+ */
+template <typename GraphType>
+struct GraphTraits<Inverse<Inverse<GraphType>>> : GraphTraits<GraphType> {};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ca40c01fc57dbcc2ca16770a1b7d798de8b5625b
--- /dev/null
+++ b/paddle/fluid/inference/analysis/helper.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/framework/framework.pb.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+template <>
+void SetAttr<std::string>(framework::proto::OpDesc *op, const std::string &name,
+                          const std::string &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRING);
+  attr->set_s(data);
+}
+template <>
+void SetAttr<int>(framework::proto::OpDesc *op, const std::string &name,
+                  const int &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(data);
+}
+template <>
+void SetAttr<int64_t>(framework::proto::OpDesc *op, const std::string &name,
+                      const int64_t &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::LONG);
+  attr->set_l(data);
+}
+template <>
+void SetAttr<std::vector<std::string>>(framework::proto::OpDesc *op,
+                                       const std::string &name,
+                                       const std::vector<std::string> &data) {
+  auto *attr = op->add_attrs();
+  attr->set_name(name);
+  attr->set_type(paddle::framework::proto::AttrType::STRINGS);
+  for (const auto &s : data) {
+    attr->add_strings(s.c_str());
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..f1064cd20f28092d80d3fd23a862da080b6cc2f3
--- /dev/null
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -0,0 +1,145 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdio>
+#include <string>
+#include <typeindex>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+template <typename T>
+void SetAttr(framework::proto::OpDesc *op, const std::string &name,
+             const T &data);
+
+template <typename Vec>
+int AccuDims(Vec &&vec, int size) {
+  int res = 1;
+  for (int i = 0; i < size; i++) {
+    res *= std::forward<Vec>(vec)[i];
+  }
+  return res;
+}
+
+#define SET_TYPE(type__) dic_[std::type_index(typeid(type__))] = #type__;
+/*
+ * Map typeid to representation.
+ */
+struct DataTypeNamer {
+  static const DataTypeNamer &Global() {
+    static auto *x = new DataTypeNamer();
+    return *x;
+  }
+
+  template <typename T>
+  const std::string &repr() const {
+    auto x = std::type_index(typeid(T));
+    PADDLE_ENFORCE(dic_.count(x), "unknown type for representation");
+    return dic_.at(x);
+  }
+
+  const std::string &repr(const std::type_index &type) const {  // NOLINT
+    PADDLE_ENFORCE(dic_.count(type), "unknown type for representation");
+    return dic_.at(type);
+  }
+
+ private:
+  DataTypeNamer() {
+    SET_TYPE(int);
+    SET_TYPE(bool);
+    SET_TYPE(float);
+    SET_TYPE(void *);
+  }
+
+  std::unordered_map<std::type_index, std::string> dic_;
+};
+#undef SET_TYPE
+
+template <typename IteratorT>
+class iterator_range {
+  IteratorT begin_, end_;
+
+ public:
+  template <typename Container>
+  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
+
+  iterator_range(const IteratorT &begin, const IteratorT &end)
+      : begin_(begin), end_(end) {}
+
+  const IteratorT &begin() const { return begin_; }
+  const IteratorT &end() const { return end_; }
+};
+
+/*
+ * An registry helper class, with its records keeps the order they registers.
+ */
+template <typename T>
+class OrderedRegistry {
+ public:
+  T *Register(const std::string &name, T *x) {
+    PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name);
+    dic_[name] = data_.size();
+    data_.emplace_back(std::unique_ptr<T>(x));
+    return data_.back().get();
+  }
+
+  T *Lookup(const std::string &name) {
+    auto it = dic_.find(name);
+    if (it == dic_.end()) return nullptr;
+    return data_[it->second].get();
+  }
+
+ protected:
+  std::unordered_map<std::string, int> dic_;
+  std::vector<std::unique_ptr<T>> data_;
+};
+
+template <typename T>
+T &GetFromScope(const framework::Scope &scope, const std::string &name) {
+  framework::Variable *var = scope.FindVar(name);
+  PADDLE_ENFORCE(var != nullptr);
+  return *var->GetMutable<T>();
+}
+
+static void ExecShellCommand(const std::string &cmd, std::string *message) {
+  char buffer[128];
+  std::shared_ptr<FILE> pipe(popen(cmd.c_str(), "r"), pclose);
+  if (!pipe) {
+    LOG(ERROR) << "error running command: " << cmd;
+    return;
+  }
+  while (!feof(pipe.get())) {
+    if (fgets(buffer, 128, pipe.get()) != nullptr) {
+      *message += buffer;
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
+
+#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \
+  type__(const type__ &) = delete;              \
+  void operator=(const type__ &) = delete;
diff --git a/paddle/fluid/inference/analysis/node.cc b/paddle/fluid/inference/analysis/node.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f2e918f3ff41d9db0c3ec38561015967bed26f4e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/node.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/node.h"
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+template <>
+std::string &NodeAttr::As<std::string>() {
+  if (data_.empty()) {
+    type_index_ = std::type_index(typeid(std::string));
+  }
+  PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string)));
+  return data_;
+}
+
+std::string &NodeAttr::String() { return As<std::string>(); }
+
+std::vector<Dot::Attr> Value::dot_attrs() const {
+  return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
+                                 Dot::Attr("shape", "box"),
+                                 Dot::Attr("fillcolor", "red")});
+}
+
+std::vector<Dot::Attr> Function::dot_attrs() const {
+  return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
+                                 Dot::Attr("shape", "diamond"),
+                                 Dot::Attr("fillcolor", "yellow")});
+}
+
+Node *NodeMap::Create(Node::Type type) {
+  switch (type) {
+    case Node::Type::kFunction:
+      nodes_.emplace_back(new Function);
+      break;
+    case Node::Type::kValue:
+      nodes_.emplace_back(new Value);
+      break;
+    case Node::Type::kFunctionBlock:
+      nodes_.emplace_back(new FunctionBlock);
+      break;
+    default:
+      PADDLE_THROW("Not supported node type.");
+  }
+  nodes_.back()->id_ = size() - 1;
+  return nodes_.back().get();
+}
+
+Node *NodeMap::GetMutable(size_t id) {
+  PADDLE_ENFORCE_GT(size(), id);
+  return nodes_[id].get();
+}
+
+const Node &NodeMap::Get(size_t id) const {
+  PADDLE_ENFORCE_GT(size(), id);
+  return *nodes_[id].get();
+}
+
+void NodeMap::Delete(size_t id) {
+  PADDLE_ENFORCE_LT(id, size());
+  nodes_[id]->SetDeleted();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h
new file mode 100644
index 0000000000000000000000000000000000000000..47e524bc5c4a6b1324d5f182053129311487522d
--- /dev/null
+++ b/paddle/fluid/inference/analysis/node.h
@@ -0,0 +1,248 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the Node class and its subclasses. A Node is the basis
+ * analysis element in a computation graph.
+ * There are basically two kinds of nodes, the function node and value node.
+ */
+#pragma once
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/inference/analysis/device.h"
+#include "paddle/fluid/inference/analysis/dot.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class NodeMap;
+
+// A helper class to maintain the status from Pass.
+struct NodeAttr {
+  // NOTE T should be a primary type or a struct combined by several primary
+  // types.
+  // NOTE the STL containers should not use here.
+  // Some usages
+  //   Attr attr;
+  //   attr.Bool() = true;
+
+  bool &Bool() { return As<bool>(); }
+  float &Float() { return As<float>(); }
+  int32_t &Int32() { return As<int32_t>(); }
+  int64_t &Int64() { return As<int64_t>(); }
+  void *&Pointer() { return As<void *>(); }
+  std::string &String();
+
+ private:
+  template <typename T>
+  T &As() {
+    // init storage in the first usage.
+    if (data_.empty()) {
+      VLOG(4) << "resize data to " << sizeof(T);
+      type_index_ = std::type_index(typeid(T));
+      data_.resize(sizeof(T));
+    }
+    PADDLE_ENFORCE(framework::IsType<T>(type_index_),
+                   "type not matched, origin is %s, want %s",
+                   DataTypeNamer::Global().repr(type_index_),
+                   DataTypeNamer::Global().repr<T>());
+    PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
+    return *reinterpret_cast<T *>(&data_[0]);
+  }
+
+ private:
+  std::string data_;
+  std::type_index type_index_{typeid(NodeAttr)};
+};
+
+/*
+ * Node Representation.
+ *
+ * This is a very important class for analysis. It is the base class of all
+ * nodes computed by a program that may be used as operands to other nodes.
+ * Node is the super class of other important classes such as Function and
+ * Value, some nodes can have a name.
+ */
+class Node {
+ public:
+  // Node type. NOTE the new node types should add here.
+  enum class Type { kNone = -1, kFunction, kValue, kFunctionBlock };
+
+  Node() = default;
+
+  // Cast to a subclass type, Function for example.
+  template <typename Subclass>
+  Subclass &As() {
+    return *dynamic_cast<Subclass *>(this);
+  }
+
+  // Formatted representation of this Node.
+  virtual std::string repr() const {
+    return name() + "(" + std::to_string(id()) + ")";
+  }
+
+  // DOT node representation. One Node type can customize its own node
+  // representation.
+  virtual std::vector<Dot::Attr> dot_attrs() const {
+    return std::vector<Dot::Attr>({Dot::Attr("style", "filled")});
+  }
+
+  // Get an additional attribute and convert it to T data type. NOTE this will
+  // silently create a new attribute if not exists.
+  NodeAttr &attr(const std::string &name) const { return attrs_[name]; }
+
+  int id() const { return id_; }
+
+  // The Protobuf description is set/get with a void* to decouple Node interface
+  // from a specific kind of Protobuf message.
+  void SetPbDesc(void *pb) { attr("pb_desc").Pointer() = pb; }
+  void *pb_desc() const { return attr("pb_desc").Pointer(); }
+
+  void SetPbMsg(const std::string &s) { attr("pb_msg").String() = s; }
+  const std::string &pb_msg() const { return attr("pb_msg").String(); }
+
+  void SetDeleted() { deleted_ = true; }
+  bool deleted() const { return deleted_; }
+
+  void SetName(const std::string &name) { name_ = name; }
+  const std::string &name() const { return name_; }
+
+  void SetType(Type type) { type_ = type; }
+  Type type() const { return type_; }
+
+  // Input links.
+  std::vector<Node *> inlinks;
+  // Output links.
+  std::vector<Node *> outlinks;
+
+  // Type checks.
+  bool IsFunction() const { return type_ == Node::Type::kFunction; }
+  bool IsValue() const { return type_ == Node::Type::kValue; }
+  bool IsFunctionBlock() const { return type_ == Node::Type::kFunctionBlock; }
+
+  virtual ~Node() {}
+
+  friend class NodeMap;
+
+  PADDLE_DISALLOW_COPY_AND_ASSIGN(Node);
+
+ protected:
+  // The id number not the name is a node's unique identifier in the computation
+  // graph.
+  int id_{-1};
+  std::string name_;
+  Type type_{Type::kNone};
+  // Mark this node is deleted by some pass.
+  bool deleted_{false};
+  mutable std::unordered_map<std::string, NodeAttr> attrs_;
+};
+
+class Function;
+/*
+ * Value represents a value node, it has some attributes including dims, data
+ * type and so on.
+ */
+class Value : public Node {
+ public:
+  enum class DataType { kInt32, kInt64, kFloat32, kFloat64 };
+  using Dims = std::vector<int>;
+
+  void SetDataType(DataType data_type) { data_type_ = data_type; }
+  DataType data_type() const { return data_type_; }
+
+  void SetDims(const Dims &dims) { dims_ = dims; }
+  const Dims &dims() const { return dims_; }
+
+  Device device() const { return device_; }
+  void SetDevice(Device device) { device_ = device; }
+
+  std::vector<Dot::Attr> dot_attrs() const override;
+
+  PADDLE_DISALLOW_COPY_AND_ASSIGN(Value);
+
+ protected:
+  Value() { SetType(Node::Type::kValue); }
+  friend class NodeMap;
+
+ private:
+  DataType data_type_;
+  Dims dims_;
+  Device device_;
+};
+
+/*
+ * Function represents any kind of executable concepts that takes several Values
+ * as input, and outputs several Values.
+ */
+class Function : public Node {
+ public:
+  std::vector<Dot::Attr> dot_attrs() const override;
+
+  // Get the operator's type from Desc.
+  const std::string &func_type() const { return func_type_; }
+  // Set the operator's type.
+  void SetFuncType(const std::string &func_type) { func_type_ = func_type; }
+
+  PADDLE_DISALLOW_COPY_AND_ASSIGN(Function);
+
+ protected:
+  std::string func_type_;
+  Function() { SetType(Node::Type::kFunction); }
+  friend class NodeMap;
+};
+
+/*
+ * FunctionBlock is a Node that contains a sub-graph multiple Node.
+ */
+struct FunctionBlock : public Node {
+  std::string repr() const override { return "block-" + std::to_string(id()); }
+  std::vector<Node *> subgraph;
+
+ protected:
+  FunctionBlock() { SetType(Node::Type::kFunctionBlock); }
+  friend class NodeMap;
+};
+
+class NodeMap {
+ public:
+  // Create a new node with type.
+  Node *Create(Node::Type type);
+
+  // Get a node by its id.
+  Node *GetMutable(size_t id);
+
+  const Node &Get(size_t id) const;
+
+  void Delete(size_t id);
+
+  const std::vector<std::unique_ptr<Node>> &nodes() const { return nodes_; }
+
+  size_t size() const { return nodes_.size(); }
+
+ private:
+  std::vector<std::unique_ptr<Node>> nodes_;
+  std::unordered_map<std::string, Node *> map_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node_attr_flags.h b/paddle/fluid/inference/analysis/node_attr_flags.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3f70e5419a66969e8fb20152a8a8ace39316f57
--- /dev/null
+++ b/paddle/fluid/inference/analysis/node_attr_flags.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file contains all the flags that declared in Node::Attr.
+ *
+ * The Node::Attr is designed to share information between different passes, one
+ * can get other's attributes in a Node by the flags in this file.
+ */
+#pragma once
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+#define DECLARE_NODE_ATTR(flag__) const char ATTR_##flag__[] = #flag__;
+
+DECLARE_NODE_ATTR(supported_by_tensorrt)  // bool
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea832a3a7e47758be9b6bd59a4325ddb576ec446
--- /dev/null
+++ b/paddle/fluid/inference/analysis/node_tester.cc
@@ -0,0 +1,34 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/node.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST(Node, Attr) {
+  // Node is an abstract class, use Value instead for they share the same Attr
+  // logic.
+  NodeMap nodes;
+  auto* node = nodes.Create(Node::Type::kValue);
+  node->attr("v0").Int32() = 2008;
+  ASSERT_EQ(node->attr("v0").Int32(), 2008);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass.cc b/paddle/fluid/inference/analysis/pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..121b72c0a0aa9a0c568b04f7ee9a5bc5c1d6f5f8
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass.cc
@@ -0,0 +1,15 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/pass.h"
diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..25c566ebfa41abe3a247bc6c6e5583c8620a6abb
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <iosfwd>
+#include <string>
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/analysis/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class Pass {
+ public:
+  Pass() = default;
+  virtual ~Pass() = default;
+  // Virtual method overridden by subclasses to do only necessary initialization
+  // before any pass is run.
+  // virtual bool Initialize() { return false; }
+  // There is some passes such as FlowToDataFlowGraphPass that needs a
+  // ProgramDesc. Here use the native ProgramDesc ProtoBuf message, so that it
+  // only couple with the proto file.
+  // virtual bool Initialize(const framework::proto::ProgramDesc &desc) { return
+  // false; }
+  // There are some Passes such as DataFlowGraphToFluidPass that will output a
+  // ProgramDesc.
+  // virtual bool Initialize(framework::proto::ProgramDesc *desc) { return
+  // false; }
+
+  // Mutable Pass.
+  virtual bool Initialize(Argument *argument) { return false; }
+  // Readonly Pass.
+  virtual bool Initialize(const Argument &argument) { return false; }
+
+  // Virtual method overriden by subclasses to do any necessary clean up after
+  // all passes have run.
+  virtual bool Finalize() { return false; }
+
+  // Get a Pass appropriate to print the Node this pass operates on.
+  virtual Pass *CreatePrinterPass(std::ostream &os,
+                                  const std::string &banner) const {
+    return nullptr;
+  }
+
+  // Create a debugger Pass that draw the DFG by graphviz toolkit.
+  virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; }
+
+  // Run on a single Node.
+  virtual void Run(Node *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single Function.
+  virtual void Run(Function *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single FunctionBlock.
+  virtual void Run(FunctionBlock *x) { LOG(FATAL) << "not valid"; }
+  // Run on a single DataFlowGraph.
+  virtual void Run(DataFlowGraph *x) { LOG(FATAL) << "not valid"; }
+
+  // Human-readable short representation.
+  virtual std::string repr() const = 0;
+  // Human-readable long description.
+  virtual std::string description() const = 0;
+};
+
+// NodePass process on any Node types.
+class NodePass : public Pass {
+ public:
+  virtual void Run(Node *node) = 0;
+};
+
+// NodePass process on any Function node types.
+class FunctionPass : public Pass {
+ public:
+  virtual void Run(Function *node) = 0;
+};
+
+// NodePass process on any FunctionBlock node types.
+class FunctionBlockPass : public Pass {
+ public:
+  virtual void Run(FunctionBlock *node) = 0;
+};
+
+// GraphPass processes on any GraphType.
+class DataFlowGraphPass : public Pass {
+ public:
+  virtual void Run(DataFlowGraph *graph) = 0;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b428bb22b1f0c5c1a47fc4c46c9070c1ace4a228
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+bool PassManager::Initialize(Argument* argument) {
+  argument_ = argument;
+  for (auto& pass : data_) {
+    LOG(INFO) << "Initializing pass " << pass->repr();
+    if (!pass->Initialize(argument)) {
+      LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
+      return false;
+    }
+  }
+  return true;
+}
+
+void DfgPassManager::RunAll() {
+  PADDLE_ENFORCE(argument_);
+  for (auto& pass : data_) {
+    VLOG(4) << "Running pass [" << pass->repr() << "]";
+    pass->Run(argument_->main_dfg.get());
+  }
+}
+
+void NodePassManager::RunAll() {
+  PADDLE_ENFORCE(argument_);
+  PADDLE_ENFORCE(argument_->main_dfg.get());
+  auto trait =
+      GraphTraits<DataFlowGraph>(argument_->main_dfg.get()).nodes_in_DFS();
+  for (auto& node : trait) {
+    for (auto& pass : data_) {
+      pass->Run(&node);
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager.h b/paddle/fluid/inference/analysis/pass_manager.h
new file mode 100644
index 0000000000000000000000000000000000000000..81a17e0287a5aef8a328e43380ee3691f5a32379
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass_manager.h
@@ -0,0 +1,106 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the logic of pass management. The analysis for inference is
+ * a pipeline of Passes, a PassManager is a agency that helps to manage the
+ * executation of the Passes.
+ *
+ * There are two modes of Passes, the first one is called NodePass and takes
+ * an Node as input and output; the second one is called DFGPass and takes a
+ * DFG(Data Flow Graph) as input and output. It is hard to put all the passes in
+ * the same pipeline, there are two kinds of PassManagers, both takes a DFG as
+ * input and output a DFG, but the Passes inside are different:
+ *
+ *   1. NodePassManager: the passes inside are all NodePasses, it can have
+ *      different graph trivial algorithm, for example, DFS_NodePassManager will
+ *      trigger the passes in depth first order;
+ *   2. DfgPassManager: the passes inside are all DfgPasses.
+ */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * PassManager is the base class for all pass managers, a pass manager has
+ * several Pass-es registered, and execute them in the linear order.
+ */
+class PassManager : public OrderedRegistry<Pass> {
+ public:
+  PassManager() = default;
+  // Call all the passes' Initialize methods. The desc and data_flow_graph are
+  // globally shared, so pass them as the arguemnts for all the pass managers.
+  virtual bool Initialize(const Argument& argument) { return false; }
+
+  virtual bool Initialize(Argument* argument);
+
+  // Call all the passes' Finalize methods.
+  virtual bool Finalize() {
+    for (auto& pass : data_) {
+      if (!pass->Finalize()) {
+        LOG(ERROR) << "Failed to finalize pass [" << pass->repr() << "]";
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Run all the passes.
+  virtual void RunAll() = 0;
+
+  // Short identifier.
+  virtual std::string repr() const = 0;
+  // Long description.
+  virtual std::string description() const = 0;
+
+  virtual ~PassManager() = default;
+
+ protected:
+  Argument* argument_{nullptr};
+};
+
+/*
+ * A pass manager that process a DFG.
+ */
+class DfgPassManager : public PassManager {
+ public:
+  DfgPassManager() = default;
+
+  void RunAll() override;
+
+  virtual ~DfgPassManager() = default;
+};
+
+/*
+ * A pass manager that process a Node each time.
+ */
+class NodePassManager : public PassManager {
+ public:
+  NodePassManager() = default;
+
+  void RunAll() override;
+
+  virtual ~NodePassManager() = default;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dac1c509d728114bd24a2ea1150c407646026fd4
--- /dev/null
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/pass_manager.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+class TestDfgPassManager final : public DfgPassManager {
+ public:
+  TestDfgPassManager() = default;
+  virtual ~TestDfgPassManager() = default;
+  // Short identifier.
+  std::string repr() const override { return "test-pass-manager"; }
+  // Long description.
+  std::string description() const override { return "test doc"; }
+};
+
+class TestNodePassManager final : public NodePassManager {
+ public:
+  virtual ~TestNodePassManager() = default;
+
+  std::string repr() const override { return "test-node-pass-manager"; }
+  std::string description() const override { return "test doc"; }
+};
+
+class TestNodePass final : public NodePass {
+ public:
+  virtual ~TestNodePass() = default;
+
+  bool Initialize(Argument* argument) override { return true; }
+
+  void Run(Node* node) override {
+    LOG(INFO) << "- Processing node " << node->repr();
+  }
+
+  std::string repr() const override { return "test-node"; }
+  std::string description() const override { return "some doc"; }
+};
+
+TEST_F(DFG_Tester, DFG_pass_manager) {
+  TestDfgPassManager manager;
+  DFG_GraphvizDrawPass::Config config("./", "dfg.dot");
+
+  manager.Register("fluid-to-flow-graph", new FluidToDataFlowGraphPass);
+  manager.Register("graphviz", new DFG_GraphvizDrawPass(config));
+  manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass);
+
+  ASSERT_TRUE(&argument);
+  ASSERT_TRUE(manager.Initialize(&argument));
+  manager.RunAll();
+}
+
+TEST_F(DFG_Tester, Node_pass_manager) {
+  // Pre-process: initialize the DFG with the ProgramDesc first.
+  FluidToDataFlowGraphPass pass0;
+  pass0.Initialize(&argument);
+  pass0.Run(argument.main_dfg.get());
+
+  TestNodePassManager manager;
+  manager.Register("test-node-pass", new TestNodePass);
+  ASSERT_TRUE(manager.Initialize(&argument));
+  manager.RunAll();
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..389f9e1a9148a4daf0e5b751cce5cb6325252a4e
--- /dev/null
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -0,0 +1,160 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+const char *SubGraphSplitter::kMarkerAttrName =
+    "_sub_graph_splitter_inside_sub_graph";
+
+std::vector<std::vector<Node *>> SubGraphSplitter::operator()() {
+  MarkNodesInsideSubGraph();
+  return ExtractSubGraphs();
+}
+
+// Mark the output variables inside a subgraph with the func.
+inline void MarkOutLinksInSubGraph(const Function *func) {
+  for (auto *var : func->outlinks) {
+    var->attr(SubGraphSplitter::kMarkerAttrName).Bool() = true;
+  }
+}
+
+void SubGraphSplitter::MarkNodesInsideSubGraph() {
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+    if (node_inside_subgraph_teller_(&node)) {
+      node.attr(kMarkerAttrName).Bool() = true;
+      if (node.type() == Node::Type::kFunction) {
+        // If a function is inside the sub-graph, mark all the output variables
+        // to be inside too, so that two marked functions will be inside a same
+        // sub-graph, lets take a example:  A_function->var->B_function, if
+        // A_function is marked, var should also be marked, so that B_function
+        // will be in the same sub-graph with A_function if B_function is
+        // marked.
+        MarkOutLinksInSubGraph(static_cast<const Function *>(&node));
+      }
+    }
+  }
+}
+
+const char *kUnionFindParent = "_sub_graph_splitter_union_find_parent_";
+
+// Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node
+// a's output is node b, that is a and b is in the same sub-graph. The UF
+// algorithm will group them to the same cluster.
+using node_map_t = std::unordered_map<int, Node *>;
+// Find the ancestor id of a node.
+int UnionFindGetAncestor(const node_map_t &node_map, size_t id) {
+  int tmp = id;
+  do {
+    tmp = node_map.at(tmp)->attr(kUnionFindParent).Int32();
+  } while (node_map.at(tmp)->attr(kUnionFindParent).Int32() != tmp);
+  return tmp;
+}
+// Make this two node share the same ancestor.
+// TODO(Superjom) bad performance, make a balanced tree latter.
+void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
+  int a_ancestor = UnionFindGetAncestor(node_map, a);
+  int b_ancestor = UnionFindGetAncestor(node_map, b);
+  node_map.at(b_ancestor)->attr(kUnionFindParent).Int32() = a_ancestor;
+  node_map.at(a)->attr(kUnionFindParent).Int32() = a_ancestor;
+  node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
+}
+
+std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
+  std::vector<Node *> marked_nodes;
+  for (auto &node : GraphTraits<DataFlowGraph>(graph_).nodes()) {
+    if (node.attr(kMarkerAttrName).Bool()) {
+      marked_nodes.push_back(&node);
+    }
+  }
+  // extract sub-graphs in the marked node set, use Union Find algorithm.
+  node_map_t node_map;  // id to ptr
+  for (auto *n : marked_nodes) {
+    // n's parent == n.id means it is the ancestor
+    n->attr(kUnionFindParent).Int32() = n->id();
+    node_map[n->id()] = n;
+  }
+  std::unordered_set<Node *> visited;
+  for (auto *n : marked_nodes) {
+    for (auto *out : n->outlinks) {
+      if (node_map.count(out->id())) {
+        UnionFindCombine(node_map, n->id(), out->id());
+      }
+    }
+  }
+
+  std::unordered_map<int /*ancestor*/, std::vector<Node *>> clusters;
+  for (auto *n : marked_nodes) {
+    if (n->type() == Node::Type::kFunction) {
+      clusters[UnionFindGetAncestor(node_map,
+                                    n->attr(kUnionFindParent).Int32())]
+          .push_back(n);
+    }
+  }
+  std::vector<std::vector<Node *>> result;
+  std::for_each(clusters.begin(), clusters.end(),
+                [&](const decltype(clusters)::value_type &it) {
+                  result.push_back(it.second);
+                });
+
+  return result;
+}
+
+void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); }
+
+void SubGraphFuse::ReplaceNodesWithSubGraphs() {
+  auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)();
+  for (auto &subgraph : subgraphs) {
+    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
+    // replace this sub-graph with the first node. Two steps: 1. Create a Block
+    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
+    // as deleted. 3. Replace the deleted node with the new Block Node.
+    auto *block_node = static_cast<FunctionBlock *>(
+        graph_->nodes.Create(Node::Type::kFunctionBlock));
+    auto io = ExtractInputAndOutputOfSubGraph(subgraph);
+    block_node->inlinks = std::move(io.first);
+    block_node->outlinks = std::move(io.second);
+    for (auto *node : subgraph) {
+      // TODO(Superjomn) need a unified mechanism to treat deleted node in each
+      // pass.
+      node->SetDeleted();
+      block_node->subgraph.push_back(node);
+    }
+
+    // Change all the sub-graph's inputs and outputs corresponding inlink and
+    // outlink to this sub-graph node.
+    auto inlink_or_outlink_cleaner = [&](std::vector<Node *> &nodes) {
+      for (auto *&n : nodes) {
+        if (subgraph_uniq.count(n)) {
+          n = block_node;
+        }
+      }
+      std::unordered_set<Node *> uniq(nodes.begin(), nodes.end());
+      nodes.assign(uniq.begin(), uniq.end());
+    };
+    for (auto *i : block_node->inlinks) {
+      inlink_or_outlink_cleaner(i->outlinks);
+    }
+    for (auto *&o : block_node->outlinks) {
+      inlink_or_outlink_cleaner(o->inlinks);
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h
new file mode 100644
index 0000000000000000000000000000000000000000..a31afbe6933da8d3c7a88142cc12d63b98b55796
--- /dev/null
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file defines the the class to partition a graph.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/node.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Detect the nodes in a sub-graph that meet some conditions. This class doesn't
+ * modify the graph.
+ */
+class SubGraphSplitter {
+ public:
+  static const char *kMarkerAttrName;
+  // Tell whether a node is inside a sub-graph.
+  using NodeInsideSubgraphTeller = std::function<bool(const Node *)>;
+
+  SubGraphSplitter(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
+      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+
+  std::vector<std::vector<Node *>> operator()();
+
+ protected:
+  // Mark the nodes inside the accepted sub-graph using
+  // node_inside_subgraph_teller.
+  void MarkNodesInsideSubGraph();
+
+  // Merge the marked nodes into sub-graphs and return the sub-graphs.
+  std::vector<std::vector<Node *>> ExtractSubGraphs();
+
+ private:
+  DataFlowGraph *graph_;
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+/*
+ * SubGraphFuse - Replace some nodes with the sub-graph node they are inside. To
+ * some extent, the TensorRT engine is just a fusion op for a model.
+ */
+class SubGraphFuse {
+ public:
+  using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller;
+
+  SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller)
+      : graph_(graph), node_inside_subgraph_teller_(teller) {}
+
+  // The main method which run all the logic.
+  void operator()();
+
+ protected:
+  // Remove the nodes inside sub-graphs and replace with the SubGraphNode.
+  void ReplaceNodesWithSubGraphs();
+
+ private:
+  DataFlowGraph *graph_;
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8134494f8bccb132f2ed7d1ba1fb615a298596ed
--- /dev/null
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -0,0 +1,90 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) {
+  if (node->type() != Node::Type::kFunction) return false;
+  const auto* func = static_cast<const Function*>(node);
+  if (func->func_type() == "elementwise_add" || func->func_type() == "relu" ||
+      func->func_type() == "conv2d" || func->func_type() == "mul" ||
+      func->func_type() == "sigmoid" || func->func_type() == "softmax") {
+    LOG(INFO) << "sub-graph marked " << node->repr();
+    return true;
+  }
+  return false;
+};
+
+TEST_F(DFG_Tester, Split) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+  LOG(INFO) << "spliter\n" << dfg.DotString();
+
+  ASSERT_GT(dfg.nodes.size(), 5UL);
+
+  auto subgraphs = SubGraphSplitter(&dfg, teller)();
+
+  // Check the number of the marked nodes.
+  int marked_nodes = 0;
+  for (auto& node : dfg.nodes.nodes()) {
+    if (node->IsFunction() &&
+        node->attr(SubGraphSplitter::kMarkerAttrName).Bool()) {
+      ++marked_nodes;
+    }
+  }
+  EXPECT_EQ(marked_nodes, 6);
+
+  // For human debug.
+  for (auto& subgraph : subgraphs) {
+    LOG(INFO) << "subgraph size " << subgraph.size();
+    for (auto* node : subgraph) {
+      LOG(INFO) << "node " << node->repr();
+    }
+  }
+
+  ASSERT_EQ(subgraphs.size(), 1UL);
+  // The last sub-graph has 5 Functions.
+  ASSERT_EQ(subgraphs.back().size(), 6UL);
+}
+
+TEST_F(DFG_Tester, Fuse) {
+  auto desc = LoadProgramDesc();
+  auto dfg = ProgramDescToDFG(desc);
+
+  size_t count0 = dfg.nodes.size();
+
+  SubGraphFuse fuse(&dfg, teller);
+  fuse();
+
+  int count1 = 0;
+  for (auto& node : dfg.nodes.nodes()) {
+    if (node->deleted()) {
+      LOG(INFO) << "deleted " << node->repr();
+    }
+    count1 += node->deleted();
+  }
+
+  // At least one nodes should be deleted.
+  ASSERT_EQ(dfg.nodes.size(), count0 + 1);  // added a new FunctionBlock
+  ASSERT_EQ(6UL, count1);
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f736e385c11add152dc9ab9485bf1de40f80b2f3
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/node_attr_flags.h"
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
+  for (auto &node : graph->nodes.nodes()) {
+    node->attr(ATTR_supported_by_tensorrt).Bool() = teller_(node.get());
+  }
+}
+
+class DfgDebuggerPass : public DFG_GraphvizDrawPass {
+ public:
+  explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
+      : DFG_GraphvizDrawPass(config) {}
+
+  std::string repr() const override {
+    return "tensorrt-subgraph-node-mark-debugger";
+  }
+
+  bool Finalize() override { return true; }
+
+ protected:
+  std::string Draw(DataFlowGraph *graph) override {
+    Dot dot;
+    // Add nodes
+    for (size_t i = 0; i < graph->nodes.size(); i++) {
+      const Node &node = graph->nodes.Get(i);
+      if (config_.display_deleted_node || !node.deleted()) {
+        auto dot_attr = node.dot_attrs();
+        if (node.attr(ATTR_supported_by_tensorrt).Bool()) {
+          dot_attr.assign(
+              {Dot::Attr{"color", "green"}, Dot::Attr{"style", "filled"}});
+        }
+        dot.AddNode(node.repr(), dot_attr);
+      }
+    }
+    // Add edges
+    for (size_t i = 0; i < graph->nodes.size(); i++) {
+      const Node &node = graph->nodes.Get(i);
+      if (!config_.display_deleted_node && node.deleted()) continue;
+      for (auto &in : node.inlinks) {
+        if (!config_.display_deleted_node && in->deleted()) continue;
+        dot.AddEdge(in->repr(), node.repr(), {});
+      }
+    }
+    return dot.Build();
+  }
+};
+
+Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const {
+  DFG_GraphvizDrawPass::Config config(
+      FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node");
+  return new DfgDebuggerPass(config);
+}
+bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; }
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..c558a6ebbde371071c7330a14cc986bf764d1773
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops
+ * that supported by TensorRT engine.
+ */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Mark the operators that TensorRT engine supports.
+ */
+class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass {
+ public:
+  using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller;
+
+  explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller)
+      : teller_(teller) {}
+
+  bool Initialize(Argument* argument) override { return true; }
+
+  // This class get a sub-graph as input and determine whether to transform this
+  // sub-graph into TensorRT.
+  void Run(DataFlowGraph* graph) override;
+
+  std::string repr() const override { return "tensorrt-sub-subgraph-mark"; }
+  std::string description() const override {
+    return "tensorrt sub-graph mark pass";
+  }
+
+  Pass* CreateGraphvizDebugerPass() const override;
+  bool Finalize() override;
+
+ private:
+  teller_t teller_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6c15e848b99ca318f4583e3d4b88345fe8e5ebc
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/node_attr_flags.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) {
+  // init
+  FluidToDataFlowGraphPass pass;
+  ASSERT_TRUE(pass.Initialize(&argument));
+  argument.main_dfg.reset(new DataFlowGraph);
+  pass.Run(argument.main_dfg.get());
+
+  TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) {
+    return node->IsFunction() &&
+           static_cast<const Function*>(node)->func_type() == "mul";
+  };
+  TensorRTSubgraphNodeMarkPass pass1(teller);
+  ASSERT_TRUE(pass1.Initialize(&argument));
+  pass1.Run(argument.main_dfg.get());
+
+  int counter{0};
+  for (auto& node : argument.main_dfg->nodes.nodes()) {
+    counter += node->attr(ATTR_supported_by_tensorrt).Bool();
+  }
+
+  LOG(INFO) << counter << " nodes marked";
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9993de22800bc0aafdcbf46618e6b479ac1eb187
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -0,0 +1,33 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+TensorRTSubGraphPass::TensorRTSubGraphPass(
+    const TensorRTSubGraphPass::NodeInsideSubgraphTeller &teller)
+    : node_inside_subgraph_teller_(teller) {}
+
+void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
+  SubGraphFuse(graph, node_inside_subgraph_teller_)();
+}
+
+}  // namespace analysis
+}  // namespace inference
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6741a92095d33d261a4e1667c87a8ca02e51a9f
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/inference/analysis/node.h"
+#include "paddle/fluid/inference/analysis/pass.h"
+#include "paddle/fluid/inference/analysis/subgraph_splitter.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * Parse the graph and replace TensorRT supported nodes with SubGraphNode
+ */
+class TensorRTSubGraphPass : public DataFlowGraphPass {
+ public:
+  // Tell whether to transform a sub-graph into TensorRT.
+  using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller;
+
+  explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller);
+
+  bool Initialize(Argument* argument) override { return true; }
+
+  // This class get a sub-graph as input and determine whether to transform this
+  // sub-graph into TensorRT.
+  void Run(DataFlowGraph* graph) override;
+
+  bool Finalize() override { return true; }
+
+  std::string repr() const override { return "tensorrt-sub-graph"; }
+  std::string description() const override { return "tensorrt sub graph pass"; }
+
+ private:
+  NodeInsideSubgraphTeller node_inside_subgraph_teller_;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d749d3fa3f39b351ccee6ebeb82467f7220a0b6
--- /dev/null
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+DEFINE_string(dot_dir, "./", "");
+
+TEST_F(DFG_Tester, tensorrt_single_pass) {
+  std::unordered_set<std::string> teller_set(
+      {"elementwise_add", "mul", "sigmoid"});
+  SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) {
+    if (node->type() != Node::Type::kFunction) return false;
+    const auto* func = static_cast<const Function*>(node);
+    if (teller_set.count(func->func_type())) return true;
+    return false;
+  };
+
+  LOG(INFO) << "init";
+  DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"};
+  DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"};
+
+  DFG_GraphvizDrawPass dfg_pass(config);
+  DFG_GraphvizDrawPass dfg_pass1(config1);
+  FluidToDataFlowGraphPass pass0;
+  TensorRTSubGraphPass trt_pass(std::move(teller));
+
+  LOG(INFO) << "Initialize";
+  dfg_pass.Initialize(&argument);
+  dfg_pass1.Initialize(&argument);
+  pass0.Initialize(&argument);
+  trt_pass.Initialize(&argument);
+
+  LOG(INFO) << "Run";
+  argument.main_dfg.reset(new DataFlowGraph);
+  pass0.Run(argument.main_dfg.get());
+  dfg_pass.Run(argument.main_dfg.get());
+  trt_pass.Run(argument.main_dfg.get());
+  dfg_pass1.Run(argument.main_dfg.get());
+
+  // Check the TRT op's block desc
+  for (auto& node : argument.main_dfg->nodes.nodes()) {
+    if (node->IsFunctionBlock()) {
+      LOG(INFO) << "get function block";
+    }
+  }
+}
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce1191a567a4198f003520c40bf02487c48c56eb
--- /dev/null
+++ b/paddle/fluid/inference/analysis/ut_helper.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <fstream>
+#include <string>
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/inference/analysis/data_flow_graph.h"
+#include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+
+// Read ProgramDesc from a __model__ file, defined in io.cc
+extern void ReadBinaryFile(const std::string& filename, std::string* contents);
+
+namespace analysis {
+
+DEFINE_string(inference_model_dir, "", "inference test model dir");
+
+static framework::proto::ProgramDesc LoadProgramDesc(
+    const std::string& model_dir = FLAGS_inference_model_dir) {
+  std::string msg;
+  std::string net_file = FLAGS_inference_model_dir + "/__model__";
+  std::ifstream fin(net_file, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", net_file);
+  fin.seekg(0, std::ios::end);
+  msg.resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(msg.at(0)), msg.size());
+  fin.close();
+  framework::proto::ProgramDesc program_desc;
+  program_desc.ParseFromString(msg);
+  return program_desc;
+}
+
+static DataFlowGraph ProgramDescToDFG(
+    const framework::proto::ProgramDesc& desc) {
+  DataFlowGraph graph;
+  FluidToDataFlowGraphPass pass;
+  Argument argument;
+  argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
+  pass.Initialize(&argument);
+  pass.Run(&graph);
+  pass.Finalize();
+  return graph;
+}
+
+class DFG_Tester : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    auto desc = LoadProgramDesc(FLAGS_inference_model_dir);
+    argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc));
+  }
+
+  Argument argument;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce2b8161715a3fa2278ce950dbac82c6d0042bef
--- /dev/null
+++ b/paddle/fluid/inference/engine.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/framework.pb.h"
+
+namespace paddle {
+namespace inference {
+
+struct Buffer;
+enum class DeviceType { UNK = -1, CPU, GPU };
+
+/*
+ * EngineBase is the base class of all inference engines. An inference engine
+ * takes a paddle program as input, and outputs the result in fluid Tensor
+ * format. It can be used to optimize performance of computation sub-blocks, for
+ * example, break down the original block into sub-blocks and execute each
+ * sub-blocks in different engines.
+ *
+ * For example:
+ *   When inference, the resnet50 model can put most of the model into subgraph
+ * and run it on a TensorRT engine.
+ *
+ * There are several engines such as TensorRT and other frameworks, so an
+ * EngineBase is put forward to give an unified interface for all the
+ * different engine implemention.
+ */
+class EngineBase {
+ public:
+  using DescType = ::paddle::framework::proto::BlockDesc;
+
+  // Build the model and do some preparation, for example, in TensorRT, run
+  // createInferBuilder, buildCudaEngine.
+  virtual void Build(const DescType& paddle_model) = 0;
+
+  // Execute the engine, that will run the inference network.
+  virtual void Execute(int batch_size) = 0;
+
+  // Return the IO buffer that allocated in engine. One can read/write directly
+  // on the buffer. If the buffer's buffer is nullptr, one can also allocate
+  // memory and maintain it outside the engine.
+  virtual Buffer& buffer(const std::string& name) = 0;
+
+  virtual ~EngineBase() {}
+};  // class EngineBase
+
+struct Buffer {
+  void* buffer{nullptr};               // buffer should be allocated only once.
+  size_t max_size;                     // buffer allocated space.
+  size_t size;                         // data size.
+  DeviceType device{DeviceType::UNK};  // tells which device this buffer is on.
+};
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 52e9c0baa64508f82d0a86a88c8c5f8d23f9f7f2..6b03ac7119b117e442e6af34c719c8a4f736bde9 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -14,21 +14,44 @@ limitations under the License. */
 
 #include "paddle/fluid/inference/io.h"
 
+#include <algorithm>
 #include <fstream>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/pybind/pybind.h"
+
+DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
+DEFINE_bool(init_p2p, false, "Whether to init p2p.");
+DEFINE_int32(math_num_threads, 1,
+             "Number of threads used to run math functions.");
 
 namespace paddle {
 namespace inference {
 
-void ReadBinaryFile(const std::string& filename, std::string& contents) {
+void Init(const std::vector<std::string> argv) {
+  framework::InitGflags(argv);
+  operators::math::SetNumThreads(FLAGS_math_num_threads);
+  // init devices
+  std::vector<int> devices;
+  std::string token;
+  std::istringstream tokenStream(FLAGS_devices);
+  while (std::getline(tokenStream, token, ',')) {
+    devices.push_back(std::stoi(token));
+  }
+  framework::InitDevices(FLAGS_init_p2p, devices);
+}
+
+void ReadBinaryFile(const std::string& filename, std::string* contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
   fin.seekg(0, std::ios::end);
-  contents.clear();
-  contents.resize(fin.tellg());
+  contents->clear();
+  contents->resize(fin.tellg());
   fin.seekg(0, std::ios::beg);
-  fin.read(&contents[0], contents.size());
+  fin.read(&(contents->at(0)), contents->size());
   fin.close();
 }
 
@@ -41,8 +64,7 @@ bool IsPersistable(const framework::VarDesc* var) {
   return false;
 }
 
-void LoadPersistables(framework::Executor& executor,
-                      framework::Scope& scope,
+void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
                       const std::string& param_filename) {
@@ -87,18 +109,18 @@ void LoadPersistables(framework::Executor& executor,
     op->CheckAttrs();
   }
 
-  executor.Run(*load_program, &scope, 0, true, true);
+  executor->Run(*load_program, scope, 0, true, true);
 
   delete load_program;
 }
 
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
-                                             framework::Scope& scope,
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
+                                             framework::Scope* scope,
                                              const std::string& dirname) {
   std::string model_filename = dirname + "/__model__";
   std::string program_desc_str;
   VLOG(3) << "loading model from " << model_filename;
-  ReadBinaryFile(model_filename, program_desc_str);
+  ReadBinaryFile(model_filename, &program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
@@ -108,13 +130,11 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
 }
 
 std::unique_ptr<framework::ProgramDesc> Load(
-    framework::Executor& executor,
-    framework::Scope& scope,
-    const std::string& prog_filename,
-    const std::string& param_filename) {
+    framework::Executor* executor, framework::Scope* scope,
+    const std::string& prog_filename, const std::string& param_filename) {
   std::string model_filename = prog_filename;
   std::string program_desc_str;
-  ReadBinaryFile(model_filename, program_desc_str);
+  ReadBinaryFile(model_filename, &program_desc_str);
 
   std::unique_ptr<framework::ProgramDesc> main_program(
       new framework::ProgramDesc(program_desc_str));
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index 6817a6fca047c9336233697a7bee4e5e16eedd5e..caf599b1a68783f155cd134c2a29e9ffa49a0895 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -18,24 +18,26 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
 namespace inference {
 
-void LoadPersistables(framework::Executor& executor,
-                      framework::Scope& scope,
+void Init(const std::vector<std::string> argv);
+
+void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
                       const std::string& param_filename);
 
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
-                                             framework::Scope& scope,
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
+                                             framework::Scope* scope,
                                              const std::string& dirname);
 
-std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
-                                             framework::Scope& scope,
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
+                                             framework::Scope* scope,
                                              const std::string& prog_filename,
                                              const std::string& param_filename);
 
diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map
new file mode 100644
index 0000000000000000000000000000000000000000..5203784dc1fcb672eb6a26d9dfd3ffbe02e08038
--- /dev/null
+++ b/paddle/fluid/inference/paddle_fluid.map
@@ -0,0 +1,6 @@
+{
+	global:
+		*paddle*;
+	local:
+		*;
+};
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b52d083f280e5e7713600a7b748dedd37aca0a1e
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -0,0 +1,4 @@
+nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto)
+nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
+nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
+add_subdirectory(convert)
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..748f5a084e8c880df215a60fe51c835ba5cd3110
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Add TRT tests
+nv_library(tensorrt_converter
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc
+  DEPS tensorrt_engine mul_op)
+
+nv_test(test_op_converter SRCS test_op_converter.cc DEPS
+  ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
+
+nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
+nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
+nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e1cace9cc1b06f036f52e82b7b86c99a02d50f50
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class ReluOpConverter : public OpConverter {
+ public:
+  ReluOpConverter() {}
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    // Here the two nullptr looks strange, that's because the
+    // framework::OpDesc's constructor is strange.
+    framework::OpDesc op_desc(op, nullptr);
+    LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
+                 "type is Relu";
+    const nvinfer1::ITensor* input_tensor =
+        engine_->GetITensor(op_desc.Input("X")[0]);
+    nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
+        nvinfer1::ActivationType::kRELU);
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e7e23377d4b2fe7afd51f1f58048fc4ed3c6d99
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class Conv2dOpConverter : public OpConverter {
+ public:
+  Conv2dOpConverter() {}
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    LOG(INFO)
+        << "convert a fluid conv2d op to tensorrt conv layer without bias";
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb603efaf30bb72d74b5583abc45d01a16c076a3
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Reorder the elements from istrides to ostrides, borrowed from TRT convert in
+// tensorflow.
+// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318
+template <typename T>
+void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
+              T* odata, nvinfer1::DimsHW ostrides) {
+  for (int h = 0; h < shape.h(); ++h) {
+    for (int w = 0; w < shape.w(); ++w) {
+      odata[h * ostrides.h() + w * ostrides.w()] =
+          idata[h * ostrides.h() + w * ostrides.w()];
+    }
+  }
+}
+
+// Reorder the data layout from CK to KC.
+void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
+                   TensorRTEngine::Weight* oweights) {
+  int c = iweights.dims[0];
+  int k = iweights.dims[1];
+  oweights->dims.assign({k, c});
+  nvinfer1::DimsHW istrides = {1, k};
+  nvinfer1::DimsHW ostrides = {c, 1};
+  Reorder2({k, c}, static_cast<float const*>(iweights.get().values), istrides,
+           static_cast<float*>(const_cast<void*>(oweights->get().values)),
+           ostrides);
+}
+
+/*
+ * FC converter convert a MUL op in Fluid to a FC layer in TRT.
+ */
+class FcOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1);  // Y is a weight
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+    // Declare inputs
+    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+
+    // Declare weights
+    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
+    PADDLE_ENFORCE_NOT_NULL(Y_v);
+    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+    // This may trigger a GPU->CPU copy, because TRT's weight can only be
+    // assigned from CPU memory, that can't be avoided.
+    auto* weight_data = Y_t->mutable_data<float>(platform::CPUPlace());
+    PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL);  // a matrix
+    size_t n_output = Y_t->dims()[1];
+
+    framework::LoDTensor tmp;
+    tmp.Resize(Y_t->dims());
+    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
+           Y_t->dims()[0] * Y_t->dims()[1]);
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(weight_data),
+                                  Y_t->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
+                                      static_cast<void*>(tmp.data<float>()),
+                                      Y_t->memory_size() / sizeof(float));
+    weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
+    tmp_weight.dims = weight.dims;
+
+    // The data layout of TRT FC layer's weight is different from fluid's FC,
+    // need to reorder the elements.
+    ReorderCKtoKC(tmp_weight, &weight);
+
+    // Currently, the framework can only handle one fluid op -> one TRT layer,
+    // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
+    // handle `mul`, leave `add` as another layer.
+    // DEBUG
+    TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
+                                       *const_cast<nvinfer1::ITensor*>(X),
+                                       n_output, weight.get(), bias.get());
+
+    auto output_name = op_desc.Output("Out").front();
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..854f434d93e81237dc85c5df62debcf3b3824b78
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
+#include <cuda.h>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+using platform::is_gpu_place;
+using platform::is_cpu_place;
+
+class DefaultIOConverter : public EngineIOConverter {
+ public:
+  DefaultIOConverter() {}
+  // NOTE out is GPU memory.
+  virtual void operator()(const LoDTensor& in, void* out,
+                          size_t max_size) override {
+    PADDLE_ENFORCE(out != nullptr);
+    PADDLE_ENFORCE(stream_ != nullptr);
+    const auto& place = in.place();
+    size_t size = in.memory_size();
+    PADDLE_ENFORCE_LE(size, max_size);
+    if (is_cpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
+                                           cudaMemcpyHostToDevice, *stream_));
+    } else if (is_gpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
+                                           cudaMemcpyDeviceToDevice, *stream_));
+    } else {
+      PADDLE_THROW("Unknown device for converter");
+    }
+    cudaStreamSynchronize(*stream_);
+  }
+  // NOTE in is GPU memory.
+  virtual void operator()(const void* in, LoDTensor* out,
+                          size_t max_size) override {
+    PADDLE_ENFORCE(in != nullptr);
+    PADDLE_ENFORCE(stream_ != nullptr);
+    const auto& place = out->place();
+    size_t size = out->memory_size();
+    PADDLE_ENFORCE_LE(size, max_size);
+    if (is_cpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
+                                           cudaMemcpyDeviceToHost, *stream_));
+    } else if (is_gpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
+                                           cudaMemcpyDeviceToDevice, *stream_));
+    } else {
+      PADDLE_THROW("Unknown device for converter");
+    }
+    cudaStreamSynchronize(*stream_);
+  }
+};
+
+// fluid LodTensor <-> tensorrt ITensor
+REGISTER_TENSORRT_IO_CONVERTER(default, DefaultIOConverter);
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..71c48e085d25d2bc6720d93735f661f9e3af7b40
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+using framework::LoDTensor;
+
+/*
+ * Convert Input from Fluid to TensorRT Engine.
+ * Convert Output from TensorRT Engine to Fluid.
+ *
+ * Note that TensorRT's ITensor follows row major, NCHW. Fluid is also row
+ * major,
+ * so in the default case just need to copy the data.
+ */
+class EngineIOConverter {
+ public:
+  EngineIOConverter() {}
+
+  virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
+  virtual void operator()(const void* in, LoDTensor* out, size_t max_size) {}
+
+  void SetStream(cudaStream_t* stream) { stream_ = stream; }
+
+  static void ConvertInput(const std::string& op_type, const LoDTensor& in,
+                           void* out, size_t max_size, cudaStream_t* stream) {
+    PADDLE_ENFORCE(stream != nullptr);
+    auto* converter = Registry<EngineIOConverter>::Lookup(
+        op_type, "default" /* default_type */);
+    PADDLE_ENFORCE_NOT_NULL(converter);
+    converter->SetStream(stream);
+    (*converter)(in, out, max_size);
+  }
+
+  static void ConvertOutput(const std::string& op_type, const void* in,
+                            LoDTensor* out, size_t max_size,
+                            cudaStream_t* stream) {
+    PADDLE_ENFORCE(stream != nullptr);
+    auto* converter = Registry<EngineIOConverter>::Lookup(
+        op_type, "default" /* default_type */);
+    PADDLE_ENFORCE_NOT_NULL(converter);
+    converter->SetStream(stream);
+    (*converter)(in, out, max_size);
+  }
+
+  virtual ~EngineIOConverter() {}
+
+ protected:
+  cudaStream_t* stream_{nullptr};
+};
+
+#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__)        \
+  struct trt_io_##op_type__##_converter {                             \
+    trt_io_##op_type__##_converter() {                                \
+      Registry<EngineIOConverter>::Register<Converter__>(#op_type__); \
+    }                                                                 \
+  };                                                                  \
+  trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3c342957360ad4192d838147bf37e84d233c2629
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights.
+ */
+class MulOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
+
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
+    // Both the input1 and input2 do not need transpose.
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, MatrixMultiply, *const_cast<nvinfer1::ITensor*>(input1), false,
+        *const_cast<nvinfer1::ITensor*>(input2), false);
+
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {  // the test framework can not determine which is the
+                      // output, so place the declaration inside.
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(mul);
+REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..6697952051c4b1997ca6b550da17a52e64cb3454
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -0,0 +1,117 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Convert Op from Fluid to TensorRT Engine.
+ */
+class OpConverter {
+ public:
+  OpConverter() {}
+
+  // Converter logic for an op.
+  virtual void operator()(const framework::proto::OpDesc& op,
+                          const framework::Scope& scope,
+                          bool test_mode = false) {}
+
+  // Convert a single fluid operator and add the corresponding layer to TRT.
+  // test_mode: whether the instance executes in an unit test.
+  void ConvertOp(const framework::proto::OpDesc& op,
+                 const std::unordered_set<std::string>& parameters,
+                 const framework::Scope& scope, TensorRTEngine* engine,
+                 bool test_mode = false) {
+    framework::OpDesc op_desc(op, nullptr);
+
+    OpConverter* it{nullptr};
+
+    if (op_desc.Type() == "mul") {
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL);
+      std::string Y = op_desc.Input("Y")[0];
+      if (parameters.count(Y)) {
+        it = Registry<OpConverter>::Lookup("fc");
+      }
+    }
+    if (!it) {
+      it = Registry<OpConverter>::Lookup(op_desc.Type());
+    }
+    PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
+                            op_desc.Type());
+    it->SetEngine(engine);
+    (*it)(op, scope, test_mode);
+  }
+
+  // Convert a fluid block to tensorrt network, NOTE it just convert operators,
+  // the INetwork's inputs and outputs should specified in some other modules.
+  void ConvertBlock(const framework::proto::BlockDesc& block,
+                    const std::unordered_set<std::string>& parameters,
+                    const framework::Scope& scope, TensorRTEngine* engine) {
+    for (int i = 0; i < block.ops_size(); i++) {
+      const auto& op = block.ops(i);
+      ConvertOp(op, parameters, scope, engine);
+    }
+  }
+
+  void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
+
+  virtual ~OpConverter() {}
+
+  // TensorRT engine
+  TensorRTEngine* engine_{nullptr};
+
+ protected:
+  bool test_mode_;
+
+ private:
+  // registered op converter map, whose key is the fluid op type, and value is
+  // the pointer position of corresponding OpConverter class.
+  std::unordered_map<std::string, OpConverter*> converters_;
+  // fluid inference scope
+  framework::Scope* scope_{nullptr};
+};
+
+#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__)                      \
+  struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
+    trt_##op_type__##_converter() {                                            \
+      ::paddle::inference::                                                    \
+          Registry<paddle::inference::tensorrt::OpConverter>::Register<        \
+              ::paddle::inference::tensorrt::Converter__>(#op_type__);         \
+    }                                                                          \
+  };                                                                           \
+  trt_##op_type__##_converter trt_##op_type__##_converter__;                   \
+  int TouchConverterRegister_##op_type__() {                                   \
+    trt_##op_type__##_converter__.Touch();                                     \
+    return 0;                                                                  \
+  }
+
+#define USE_TRT_CONVERTER(op_type__)                                    \
+  extern int TouchConverterRegister_##op_type__();                      \
+  static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
+      TouchConverterRegister_##op_type__();
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a02a7bebf9efbd0555707e6cfa701ef1e7d9659
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(ReluOpConverter, main) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
+  validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("relu");
+  desc.SetInput("X", {"relu-X"});
+  desc.SetOutput("Out", {"relu-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(10);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(relu);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a30253072ac581ceca85ca10151a176f87a7cb39
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(fc_op, test) {
+  std::unordered_set<std::string> parameters({"mul-Y"});
+  framework::Scope scope;
+  TRTConvertValidation validator(20, parameters, scope, 1000);
+
+  validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
+  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul-X"});
+  desc.SetInput("Y", {"mul-Y"});
+  desc.SetOutput("Out", {"mul-Out"});
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(10);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f91309a0a00d5131268f026c319e25ba3cb964a
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+void IOConverterTester(const platform::DeviceContext& ctx) {
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+
+  // init fluid in_tensor
+  framework::LoDTensor in_tensor;
+  in_tensor.Resize({10, 10});
+  auto place = ctx.GetPlace();
+  in_tensor.mutable_data<float>(place);
+  std::vector<float> init;
+  for (int64_t i = 0; i < 10 * 10; ++i) {
+    init.push_back(i);
+  }
+  framework::TensorFromVector(init, ctx, &in_tensor);
+
+  // init tensorrt buffer
+  void* buffer;
+  size_t size = in_tensor.memory_size();
+  ASSERT_EQ(cudaMalloc(&buffer, size), 0);
+
+  // convert fluid in_tensor to tensorrt buffer
+  EngineIOConverter::ConvertInput("test", in_tensor, buffer, size, &stream);
+
+  // convert tensorrt buffer to fluid out_tensor
+  framework::LoDTensor out_tensor;
+  out_tensor.Resize({10, 10});
+  out_tensor.mutable_data<float>(place);
+  EngineIOConverter::ConvertOutput("test", buffer, &out_tensor, size, &stream);
+
+  // compare in_tensor and out_tensor
+  std::vector<float> result;
+  framework::TensorToVector(out_tensor, ctx, &result);
+  EXPECT_EQ(init.size(), result.size());
+  for (size_t i = 0; i < init.size(); i++) {
+    EXPECT_EQ(init[i], result[i]);
+  }
+  cudaStreamDestroy(stream);
+}
+
+TEST(EngineIOConverterTester, DefaultCPU) {
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  IOConverterTester(ctx);
+}
+
+TEST(EngineIOConverterTester, DefaultGPU) {
+  platform::CUDAPlace place;
+  platform::CUDADeviceContext ctx(place);
+  IOConverterTester(ctx);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1ce1130e5d660d717a1262a1fbdb4b620462c0b3
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(MulOpConverter, main) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
+  validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul-X"});
+  desc.SetInput("Y", {"mul-Y"});
+  desc.SetOutput("Out", {"mul-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(10);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(mul);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9b79f86b0edba983019bd932f52b08711ff36d41
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -0,0 +1,40 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(OpConverter, ConvertBlock) {
+  framework::ProgramDesc prog;
+  auto* block = prog.MutableBlock(0);
+  auto* conv2d_op = block->AppendOp();
+  conv2d_op->SetType("conv2d");
+
+  OpConverter converter;
+  framework::Scope scope;
+  converter.ConvertBlock(*block->Proto(), {}, scope,
+                         nullptr /*TensorRTEngine*/);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_TRT_CONVERTER(conv2d)
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b1f531adc5d756259df1c350f7f44bf71ee1f93
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -0,0 +1,174 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file implements a UT framework to make the validation of transforming
+ * Fluid Op to TRT Layer.
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Get a random float value between [low, high]
+ */
+float random(float low, float high) {
+  static std::random_device rd;
+  static std::mt19937 mt(rd());
+  std::uniform_real_distribution<double> dist(1.0, 10.0);
+  return dist(mt);
+}
+
+void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
+                     const platform::DeviceContext& ctx) {
+  auto dims = tensor->dims();
+  size_t num_elements = analysis::AccuDims(dims, dims.size());
+  PADDLE_ENFORCE_GT(num_elements, 0);
+  auto* data = tensor->mutable_data<float>(place);
+  for (size_t i = 0; i < num_elements; i++) {
+    *(data + i) = random(0., 1.);
+  }
+}
+
+/*
+ * Help to validate the correctness between Fluid Op and the corresponding TRT
+ * layer.
+ */
+class TRTConvertValidation {
+ public:
+  TRTConvertValidation() = delete;
+
+  TRTConvertValidation(int batch_size,
+                       const std::unordered_set<std::string>& parameters,
+                       framework::Scope& scope,  // NOLINT
+                       int workspace_size = 1 << 10)
+      : parameters_(parameters), scope_(scope) {
+    // create engine.
+    engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
+    engine_->InitNetwork();
+
+    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+  }
+
+  // Declare a Variable as input with random initialization.
+  void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+    // Declare TRT inputs.
+    engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims);
+  }
+
+  // Declare a parameter varaible in the scope.
+  void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+  }
+
+  void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
+    DeclVar(name, dims);
+  }
+
+  // Declare a variable in a fluid Scope.
+  void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
+    platform::CPUPlace place;
+    platform::CPUDeviceContext ctx(place);
+
+    // Init Fluid tensor.
+    std::vector<int> dim_vec(dims.d, dims.d + dims.nbDims);
+    auto* x = scope_.Var(name);
+    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
+    x_tensor->Resize(framework::make_ddim(dim_vec));
+    RandomizeTensor(x_tensor, place, ctx);
+  }
+
+  void SetOp(const framework::proto::OpDesc& desc) {
+    op_ = framework::OpRegistry::CreateOp(desc);
+
+    Singleton<OpConverter>::Global().ConvertOp(
+        desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
+
+    engine_->FreezeNetwork();
+
+    // Declare outputs.
+    op_desc_.reset(new framework::OpDesc(desc, nullptr));
+
+    // Set Inputs.
+    for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
+      auto* var = scope_.FindVar(input);
+      PADDLE_ENFORCE(var);
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+
+      engine_->SetInputFromCPU(
+          input, static_cast<void*>(tensor->data<void>()),
+          sizeof(float) *
+              analysis::AccuDims(tensor->dims(), tensor->dims().size()));
+    }
+  }
+
+  void Execute(int batch_size) {
+    // Execute Fluid Op
+    platform::CPUPlace place;
+    platform::CPUDeviceContext ctx(place);
+    op_->Run(scope_, place);
+    // Execute TRT.
+    engine_->Execute(batch_size);
+    cudaStreamSynchronize(*engine_->stream());
+
+    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+    const size_t output_space_size = 200;
+    for (const auto& output : op_desc_->OutputArgumentNames()) {
+      std::vector<float> fluid_out;
+      std::vector<float> trt_out(output_space_size);
+      engine_->GetOutputInCPU(output, &trt_out[0],
+                              output_space_size * sizeof(float));
+      cudaStreamSynchronize(*engine_->stream());
+
+      auto* var = scope_.FindVar(output);
+      auto tensor = var->GetMutable<framework::LoDTensor>();
+      framework::TensorToVector(*tensor, ctx, &fluid_out);
+      // Compare two output
+      ASSERT_FALSE(fluid_out.empty());
+      for (size_t i = 0; i < fluid_out.size(); i++) {
+        // Loose the threshold for CI in different machine model.
+        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
+      }
+    }
+  }
+
+  framework::Scope& scope() { return scope_; }
+
+ private:
+  std::unique_ptr<TensorRTEngine> engine_;
+  cudaStream_t stream_;
+  std::unique_ptr<framework::OperatorBase> op_;
+  std::unique_ptr<framework::OpDesc> op_desc_;
+  const std::unordered_set<std::string>& parameters_;
+  framework::Scope& scope_;
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -0,0 +1,212 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/engine.h"
+
+#include <NvInfer.h>
+#include <cuda.h>
+#include <glog/logging.h>
+#include <string>
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+void TensorRTEngine::Build(const DescType& paddle_model) {
+  PADDLE_ENFORCE(false, "not implemented");
+}
+
+void TensorRTEngine::Execute(int batch_size) {
+  std::vector<void*> buffers;
+  for (auto& buf : buffers_) {
+    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
+    PADDLE_ENFORCE_GT(buf.max_size, 0);
+    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+    buffers.push_back(buf.buffer);
+  }
+  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
+  cudaStreamSynchronize(*stream_);
+}
+
+TensorRTEngine::~TensorRTEngine() {
+  cudaStreamSynchronize(*stream_);
+  // clean buffer
+  for (auto& buf : buffers_) {
+    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
+      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
+      buf.buffer = nullptr;
+      buf.max_size = 0;
+    }
+  }
+}
+
+void TensorRTEngine::FreezeNetwork() {
+  PADDLE_ENFORCE(infer_builder_ != nullptr,
+                 "Call InitNetwork first to initialize network.");
+  PADDLE_ENFORCE(infer_network_ != nullptr,
+                 "Call InitNetwork first to initialize network.");
+  // build engine.
+  infer_builder_->setMaxBatchSize(max_batch_);
+  infer_builder_->setMaxWorkspaceSize(max_workspace_);
+
+  infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
+  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
+
+  infer_context_.reset(infer_engine_->createExecutionContext());
+
+  // allocate GPU buffers.
+  buffers_.resize(buffer_sizes_.size());
+  for (auto& item : buffer_sizes_) {
+    if (item.second == 0) {
+      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
+      auto dims = infer_engine_->getBindingDimensions(slot_offset);
+      item.second = kDataTypeSize[static_cast<int>(
+                        infer_engine_->getBindingDataType(slot_offset))] *
+                    analysis::AccuDims(dims.d, dims.nbDims);
+    }
+    auto& buf = buffer(item.first);
+    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
+    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
+            << buf.buffer;
+    buf.size = buf.max_size = item.second;
+    buf.device = DeviceType::GPU;
+  }
+}
+
+nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
+                                                nvinfer1::DataType dtype,
+                                                const nvinfer1::Dims& dims) {
+  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
+                    name);
+
+  PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
+  auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
+  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
+  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
+                        analysis::AccuDims(dims.d, dims.nbDims);
+  PADDLE_ENFORCE(input->isNetworkInput());
+  TensorRTEngine::SetITensor(name, input);
+  return input;
+}
+
+void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
+                                   const std::string& name) {
+  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
+                    name);
+
+  auto* output = layer->getOutput(offset);
+  SetITensor(name, output);
+  PADDLE_ENFORCE(output != nullptr);
+  output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
+  infer_network_->markOutput(*output);
+  PADDLE_ENFORCE(output->isNetworkOutput());
+  // output buffers' size can only be decided latter, set zero here to mark this
+  // and will reset latter.
+  buffer_sizes_[name] = 0;
+}
+
+void TensorRTEngine::DeclareOutput(const std::string& name) {
+  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
+                    name);
+
+  auto* output = TensorRTEngine::GetITensor(name);
+  PADDLE_ENFORCE(output != nullptr);
+  output->setName(name.c_str());
+  PADDLE_ENFORCE(!output->isNetworkInput());
+  infer_network_->markOutput(*output);
+  // output buffers' size can only be decided latter, set zero here to mark this
+  // and will reset latter.
+  buffer_sizes_[name] = 0;
+}
+
+void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
+  return buffer(name).buffer;
+}
+
+void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
+                                    size_t max_size) {
+  // determine data size
+  auto it = buffer_sizes_.find(name);
+  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  PADDLE_ENFORCE_GT(it->second, 0);
+  PADDLE_ENFORCE_GE(max_size, it->second);
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
+  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
+                                    cudaMemcpyDeviceToDevice, *stream_),
+                    0);
+}
+
+void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
+                                    size_t max_size) {
+  // determine data size
+  auto it = buffer_sizes_.find(name);
+  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  PADDLE_ENFORCE_GT(it->second, 0);
+  PADDLE_ENFORCE_GE(max_size, it->second);
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
+                                       cudaMemcpyDeviceToHost, *stream_));
+}
+
+Buffer& TensorRTEngine::buffer(const std::string& name) {
+  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
+  auto it = buffer_sizes_.find(name);
+  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
+  return buffers_[slot_offset];
+}
+
+void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
+                                     size_t size) {
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
+  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
+                                       cudaMemcpyHostToDevice, *stream_));
+}
+
+void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
+                                     size_t size) {
+  auto& buf = buffer(name);
+  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
+  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
+                                       cudaMemcpyDeviceToDevice, *stream_));
+}
+
+void TensorRTEngine::SetITensor(const std::string& name,
+                                nvinfer1::ITensor* tensor) {
+  PADDLE_ENFORCE(tensor != nullptr);
+  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
+                    name);
+  itensor_map_[name] = tensor;
+}
+
+nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
+  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
+  return itensor_map_[name];
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
new file mode 100644
index 0000000000000000000000000000000000000000..b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -0,0 +1,200 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <NvInfer.h>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/inference/engine.h"
+#include "paddle/fluid/inference/tensorrt/helper.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * TensorRT Engine.
+ *
+ * There are two alternative ways to use it, one is  to build from a paddle
+ * protobuf model, another way is to manully construct the network.
+ */
+class TensorRTEngine : public EngineBase {
+ public:
+  // Weight is model parameter.
+  class Weight {
+   public:
+    Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) {
+      w_.type = dtype;
+      w_.values = value;
+      w_.count = num_elem;
+    }
+    const nvinfer1::Weights& get() { return w_; }
+
+    std::vector<int64_t> dims;
+
+   private:
+    nvinfer1::Weights w_;
+  };
+
+  TensorRTEngine(int max_batch, int max_workspace,
+                 cudaStream_t* stream = nullptr,
+                 nvinfer1::ILogger& logger = NaiveLogger::Global())
+      : max_batch_(max_batch),
+        max_workspace_(max_workspace),
+        stream_(stream ? stream : &default_stream_),
+        logger_(logger) {}
+
+  virtual ~TensorRTEngine();
+
+  // TODO(Superjomn) implement it later when graph segmentation is supported.
+  void Build(const DescType& paddle_model) override;
+
+  void Execute(int batch_size) override;
+
+  // Initialize the inference network, so that TensorRT layers can add to this
+  // network.
+  void InitNetwork() {
+    infer_builder_.reset(createInferBuilder(&logger_));
+    infer_network_.reset(infer_builder_->createNetwork());
+  }
+  // After finishing adding ops, freeze this network and creates the executation
+  // environment.
+  void FreezeNetwork();
+
+  // Add an input and set its name, data type and dimention.
+  nvinfer1::ITensor* DeclareInput(const std::string& name,
+                                  nvinfer1::DataType dtype,
+                                  const nvinfer1::Dims& dim);
+  // Set the offset-th output from a layer as the network's output, and set its
+  // name.
+  void DeclareOutput(const nvinfer1::ILayer* layer, int offset,
+                     const std::string& name);
+  // Set the itensor_map_[name] as the network's output, and set its name.
+  void DeclareOutput(const std::string& name);
+
+  // GPU memory address for an ITensor with specific name. One can operate on
+  // these memory directly for acceleration, for example, output the converted
+  // data directly to the buffer to save data copy overhead.
+  // NOTE this should be used after calling `FreezeNetwork`.
+  Buffer& buffer(const std::string& name) override;
+
+  cudaStream_t* stream() { return stream_; }
+
+  // Fill an input from CPU memory with name and size.
+  void SetInputFromCPU(const std::string& name, const void* data, size_t size);
+  // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
+  // accessed directly. Fill an input from GPU memory with name and size.
+  void SetInputFromGPU(const std::string& name, const void* data, size_t size);
+  // Get an output called name, the output of tensorrt is in GPU, so this method
+  // Return the output's GPU memory address without copy.
+  void* GetOutputInGPU(const std::string& name);
+  // Copy data into dst inside the GPU device.
+  void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
+  // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
+  // to CPU.
+  void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
+  // Fill an ITensor into map itensor_map_.
+  void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
+  // Get an ITensor called name.
+  nvinfer1::ITensor* GetITensor(const std::string& name);
+
+  nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
+  nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
+
+ private:
+  // the max batch size
+  int max_batch_;
+  // the max memory size the engine uses
+  int max_workspace_;
+  cudaStream_t* stream_;
+  // If stream_ is not set from outside, hold its own stream.
+  cudaStream_t default_stream_;
+  nvinfer1::ILogger& logger_;
+
+  std::vector<Buffer> buffers_;
+  // max data size for the buffers.
+  std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
+  std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
+      itensor_map_;
+
+  // TensorRT related internal members
+  template <typename T>
+  struct Destroyer {
+    void operator()(T* x) {
+      if (x) {
+        x->destroy();
+      }
+    }
+  };
+  template <typename T>
+  using infer_ptr = std::unique_ptr<T, Destroyer<T>>;
+  infer_ptr<nvinfer1::IBuilder> infer_builder_;
+  infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
+  infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
+  infer_ptr<nvinfer1::IExecutionContext> infer_context_;
+};  // class TensorRTEngine
+
+// Add an layer__ into engine__ with args ARGS.
+// For example:
+//   TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
+//
+// Reference
+// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
+//
+// will add a fully connected layer into the engine.
+// TensorRT has too many layers, so that is not wise to add member functions for
+// them, and an macro like this is more extensible when underlying TensorRT
+// library add new layer supports.
+#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
+  engine__->network()->add##layer__(ARGS);
+
+/*
+ * Helper to control the TensorRT engine's creation and deletion.
+ */
+class TRT_EngineManager {
+ public:
+  bool HasEngine(const std::string& name) const {
+    return engines_.count(name) != 0;
+  }
+
+  // Get an engine called `name`.
+  TensorRTEngine* Get(const std::string& name) const {
+    return engines_.at(name).get();
+  }
+
+  // Create or get an engine called `name`
+  TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
+                         const std::string& name) {
+    auto* p = new TensorRTEngine(max_batch, max_workspace, stream);
+    engines_[name].reset(p);
+    return p;
+  }
+
+  void DeleteALl() {
+    for (auto& item : engines_) {
+      item.second.reset(nullptr);
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6e7968108403c9c9c192759c44eac040d1c5073
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <NvInfer.h>
+#include <cuda.h>
+#include <glog/logging.h>
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+namespace dy = paddle::platform::dynload;
+
+// TensorRT data type to size
+const int kDataTypeSize[] = {
+    4,  // kFLOAT
+    2,  // kHALF
+    1,  // kINT8
+    4   // kINT32
+};
+
+// The following two API are implemented in TensorRT's header file, cannot load
+// from the dynamic library. So create our own implementation and directly
+// trigger the method from the dynamic library.
+static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
+  return static_cast<nvinfer1::IBuilder*>(
+      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
+}
+static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
+  return static_cast<nvinfer1::IRuntime*>(
+      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
+}
+
+// A logger for create TensorRT infer builder.
+class NaiveLogger : public nvinfer1::ILogger {
+ public:
+  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
+    switch (severity) {
+      case Severity::kINFO:
+        LOG(INFO) << msg;
+        break;
+      case Severity::kWARNING:
+        LOG(WARNING) << msg;
+        break;
+      case Severity::kINTERNAL_ERROR:
+      case Severity::kERROR:
+        LOG(ERROR) << msg;
+        break;
+      default:
+        break;
+    }
+  }
+
+  static nvinfer1::ILogger& Global() {
+    static nvinfer1::ILogger* x = new NaiveLogger;
+    return *x;
+  }
+
+  ~NaiveLogger() override {}
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e635f0f87d577a1f1ac74687ee60f762be525418
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class TensorRTEngineTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    ASSERT_EQ(0, cudaStreamCreate(&stream_));
+    engine_ = new TensorRTEngine(1, 1 << 10, &stream_);
+    engine_->InitNetwork();
+  }
+
+  void TearDown() override {
+    delete engine_;
+    cudaStreamDestroy(stream_);
+  }
+
+ protected:
+  TensorRTEngine* engine_;
+  cudaStream_t stream_;
+};
+
+TEST_F(TensorRTEngineTest, add_layer) {
+  const int size = 1;
+
+  float raw_weight[size] = {2.};  // Weight in CPU memory.
+  float raw_bias[size] = {3.};
+
+  LOG(INFO) << "create weights";
+  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
+  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
+  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+                                  nvinfer1::DimsCHW{1, 1, 1});
+  auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
+                                        weight.get(), bias.get());
+  PADDLE_ENFORCE(fc_layer != nullptr);
+
+  engine_->DeclareOutput(fc_layer, 0, "y");
+  LOG(INFO) << "freeze network";
+  engine_->FreezeNetwork();
+  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
+
+  // fill in real data
+  float x_v = 1234;
+  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
+                           1 * sizeof(float));
+  LOG(INFO) << "to execute";
+  engine_->Execute(1);
+
+  LOG(INFO) << "to get output";
+  float y_cpu;
+  engine_->GetOutputInCPU("y", &y_cpu, sizeof(float));
+
+  LOG(INFO) << "to checkout output";
+  ASSERT_EQ(y_cpu, x_v * 2 + 3);
+}
+
+TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
+  // Weight in CPU memory.
+  // It seems tensorrt FC use col-major: [[1.0, 3.3], [1.1, 4.4]]
+  // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
+  float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
+  float raw_bias[2] = {1.3, 2.4};
+
+  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
+  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
+  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+                                  nvinfer1::DimsCHW{1, 2, 1});
+  auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
+                                        weight.get(), bias.get());
+  PADDLE_ENFORCE(fc_layer != nullptr);
+
+  engine_->DeclareOutput(fc_layer, 0, "y");
+  engine_->FreezeNetwork();
+  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
+
+  float x_v[2] = {1.0, 2.0};
+  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
+                           2 * sizeof(float));
+  engine_->Execute(1);
+
+  LOG(INFO) << "to get output";
+  float y_cpu[2] = {-1., -1.};
+  engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
+  ASSERT_EQ(y_cpu[0], 4.5);
+  ASSERT_EQ(y_cpu[1], 14.5);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a07537985738ab0ad4092b794f3b62ba53dfa866
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
@@ -0,0 +1,155 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "NvInfer.h"
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+
+namespace dy = paddle::platform::dynload;
+
+class Logger : public nvinfer1::ILogger {
+ public:
+  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
+    switch (severity) {
+      case Severity::kINFO:
+        LOG(INFO) << msg;
+        break;
+      case Severity::kWARNING:
+        LOG(WARNING) << msg;
+        break;
+      case Severity::kINTERNAL_ERROR:
+      case Severity::kERROR:
+        LOG(ERROR) << msg;
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+class ScopedWeights {
+ public:
+  explicit ScopedWeights(float value) : value_(value) {
+    w.type = nvinfer1::DataType::kFLOAT;
+    w.values = &value_;
+    w.count = 1;
+  }
+  const nvinfer1::Weights& get() { return w; }
+
+ private:
+  float value_;
+  nvinfer1::Weights w;
+};
+
+// The following two API are implemented in TensorRT's header file, cannot load
+// from the dynamic library. So create our own implementation and directly
+// trigger the method from the dynamic library.
+nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) {
+  return static_cast<nvinfer1::IBuilder*>(
+      dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION));
+}
+nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
+  return static_cast<nvinfer1::IRuntime*>(
+      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
+}
+
+const char* kInputTensor = "input";
+const char* kOutputTensor = "output";
+
+// Creates a network to compute y = 2x + 3
+nvinfer1::IHostMemory* CreateNetwork() {
+  Logger logger;
+  // Create the engine.
+  nvinfer1::IBuilder* builder = createInferBuilder(&logger);
+  ScopedWeights weights(2.);
+  ScopedWeights bias(3.);
+
+  nvinfer1::INetworkDefinition* network = builder->createNetwork();
+  // Add the input
+  auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
+                                 nvinfer1::DimsCHW{1, 1, 1});
+  EXPECT_NE(input, nullptr);
+  // Add the hidden layer.
+  auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
+  EXPECT_NE(layer, nullptr);
+  // Mark the output.
+  auto output = layer->getOutput(0);
+  output->setName(kOutputTensor);
+  network->markOutput(*output);
+  // Build the engine.
+  builder->setMaxBatchSize(1);
+  builder->setMaxWorkspaceSize(1 << 10);
+  auto engine = builder->buildCudaEngine(*network);
+  EXPECT_NE(engine, nullptr);
+  // Serialize the engine to create a model, then close.
+  nvinfer1::IHostMemory* model = engine->serialize();
+  network->destroy();
+  engine->destroy();
+  builder->destroy();
+  return model;
+}
+
+void Execute(nvinfer1::IExecutionContext* context, const float* input,
+             float* output) {
+  const nvinfer1::ICudaEngine& engine = context->getEngine();
+  // Two binds, input and output
+  ASSERT_EQ(engine.getNbBindings(), 2);
+  const int input_index = engine.getBindingIndex(kInputTensor);
+  const int output_index = engine.getBindingIndex(kOutputTensor);
+  // Create GPU buffers and a stream
+  void* buffers[2];
+  ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
+  ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+  // Copy the input to the GPU, execute the network, and copy the output back.
+  ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
+                               cudaMemcpyHostToDevice, stream));
+  context->enqueue(1, buffers, stream, nullptr);
+  ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
+                               cudaMemcpyDeviceToHost, stream));
+  cudaStreamSynchronize(stream);
+
+  // Release the stream and the buffers
+  cudaStreamDestroy(stream);
+  ASSERT_EQ(0, cudaFree(buffers[input_index]));
+  ASSERT_EQ(0, cudaFree(buffers[output_index]));
+}
+
+TEST(TensorrtTest, BasicFunction) {
+  // Create the network serialized model.
+  nvinfer1::IHostMemory* model = CreateNetwork();
+
+  // Use the model to create an engine and an execution context.
+  Logger logger;
+  nvinfer1::IRuntime* runtime = createInferRuntime(&logger);
+  nvinfer1::ICudaEngine* engine =
+      runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
+  model->destroy();
+  nvinfer1::IExecutionContext* context = engine->createExecutionContext();
+
+  // Execute the network.
+  float input = 1234;
+  float output;
+  Execute(context, &input, &output);
+  EXPECT_EQ(output, input * 2 + 3);
+
+  // Destroy the engine.
+  context->destroy();
+  engine->destroy();
+  runtime->destroy();
+}
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index e7ffb00ec8d8926193fe510ebdb7185f75c90906..2fa5a9540ba1311c7f87e6675a53044b23dd8276 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -4,7 +4,7 @@ function(inference_test TARGET_NAME)
   set(multiValueArgs ARGS)
   cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
-  set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/fluid/tests)
+  set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
   set(arg_list "")
   if(inference_test_ARGS)
     foreach(arg ${inference_test_ARGS})
@@ -17,18 +17,32 @@ function(inference_test TARGET_NAME)
     string(REGEX REPLACE "^_$" "" arg "${arg}")
     cc_test(test_inference_${TARGET_NAME}${arg}
         SRCS test_inference_${TARGET_NAME}.cc
-        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+        DEPS paddle_fluid
         ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
     set_tests_properties(test_inference_${TARGET_NAME}${arg}
         PROPERTIES DEPENDS test_${TARGET_NAME})
   endforeach()
 endfunction(inference_test)
 
-inference_test(fit_a_line)
+####################
+# Inference tests here depend on fluid/tests/book. If users want to run
+# individual test with ctest, they need to run tests in fluid/tests/book
+# first to generate saved model.
+####################
+# This unittest is buggy!
+#inference_test(fit_a_line)
 inference_test(image_classification ARGS vgg resnet)
 inference_test(label_semantic_roles)
 inference_test(recognize_digits ARGS mlp conv)
 inference_test(recommender_system)
 #inference_test(rnn_encoder_decoder)
-inference_test(understand_sentiment ARGS conv)
+#inference_test(understand_sentiment ARGS conv)
 inference_test(word2vec)
+
+# This is an unly work around to make this test run
+# TODO(TJ): clean me up
+cc_test(test_inference_nlp
+  SRCS test_inference_nlp.cc
+  DEPS paddle_fluid
+  ARGS
+  --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
index 9ab808efec3abdb86724fb16725962958c5cf55c..2c5b66a32903f4ffdedb074b31aec53ae6cacaf3 100644
--- a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc
@@ -9,9 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
+#include "paddle/fluid/inference/tests/test_multi_thread_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 
@@ -26,32 +27,63 @@ TEST(inference, fit_a_line) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
-  paddle::framework::LoDTensor input;
-  // The second dim of the input tensor should be 13
-  // The input data should be >= 0
-  int64_t batch_size = 10;
-  SetupTensor<float>(
-      input, {batch_size, 13}, static_cast<float>(0), static_cast<float>(10));
-  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
-  cpu_feeds.push_back(&input);
+  for (int num_threads : {1, 2}) {
+    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_feeds;
+    cpu_feeds.resize(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      auto* input = new paddle::framework::LoDTensor();
+      // The second dim of the input tensor should be 13
+      // The input data should be >= 0
+      int64_t batch_size = 10;
+      SetupTensor<float>(input, {batch_size, 13}, static_cast<float>(0),
+                         static_cast<float>(10));
+      cpu_feeds[i].push_back(input);
+    }
 
-  paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
+    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs1;
+    cpu_fetchs1.resize(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      auto* output = new paddle::framework::LoDTensor();
+      cpu_fetchs1[i].push_back(output);
+    }
 
-  // Run inference on CPU
-  TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
-  LOG(INFO) << output1.dims();
+    // Run inference on CPU
+    LOG(INFO) << "--- CPU Runs (num_threads: " << num_threads << "): ---";
+    if (num_threads == 1) {
+      TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds[0],
+                                                cpu_fetchs1[0]);
+    } else {
+      TestMultiThreadInference<paddle::platform::CPUPlace>(
+          dirname, cpu_feeds, cpu_fetchs1, num_threads);
+    }
 
 #ifdef PADDLE_WITH_CUDA
-  paddle::framework::LoDTensor output2;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
-  cpu_fetchs2.push_back(&output2);
+    std::vector<std::vector<paddle::framework::LoDTensor*>> cpu_fetchs2;
+    cpu_fetchs2.resize(num_threads);
+    for (int i = 0; i < num_threads; ++i) {
+      auto* output = new paddle::framework::LoDTensor();
+      cpu_fetchs2[i].push_back(output);
+    }
 
-  // Run inference on CUDA GPU
-  TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
-  LOG(INFO) << output2.dims();
+    // Run inference on CUDA GPU
+    LOG(INFO) << "--- GPU Runs (num_threads: " << num_threads << "): ---";
+    if (num_threads == 1) {
+      TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds[0],
+                                                 cpu_fetchs2[0]);
+    } else {
+      TestMultiThreadInference<paddle::platform::CUDAPlace>(
+          dirname, cpu_feeds, cpu_fetchs2, num_threads);
+    }
 
-  CheckError<float>(output1, output2);
+    for (int i = 0; i < num_threads; ++i) {
+      CheckError<float>(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]);
+      delete cpu_fetchs2[i][0];
+    }
 #endif
+
+    for (int i = 0; i < num_threads; ++i) {
+      delete cpu_feeds[i][0];
+      delete cpu_fetchs1[i][0];
+    }
+  }  // num_threads-loop
 }
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
index e9a27171f1cd68e7b10c860fb4a1417b930ed565..60c761c5281e2f535aab0200c93fb738addcdb87 100644
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model.");
 DEFINE_int32(batch_size, 1, "Batch size of input data");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times");
+DEFINE_bool(skip_cpu, false, "Skip the cpu test");
 
 TEST(inference, image_classification) {
   if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
@@ -32,25 +34,34 @@ TEST(inference, image_classification) {
   // 0. Call `paddle::framework::InitDevices()` initialize all the devices
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
+  const bool is_combined = false;
+  std::vector<std::vector<int64_t>> feed_target_shapes =
+      GetFeedTargetShapes(dirname, is_combined);
+
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [0.0, 1.0].
-  SetupTensor<float>(input,
-                     {FLAGS_batch_size, 3, 32, 32},
-                     static_cast<float>(0),
+  feed_target_shapes[0][0] = FLAGS_batch_size;
+  paddle::framework::DDim input_dims =
+      paddle::framework::make_ddim(feed_target_shapes[0]);
+  LOG(INFO) << input_dims;
+  SetupTensor<float>(&input, input_dims, static_cast<float>(0),
                      static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
   paddle::framework::LoDTensor output1;
-  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
-  cpu_fetchs1.push_back(&output1);
+  if (!FLAGS_skip_cpu) {
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+    cpu_fetchs1.push_back(&output1);
 
-  // Run inference on CPU
-  LOG(INFO) << "--- CPU Runs: ---";
-  TestInference<paddle::platform::CPUPlace>(
-      dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
-  LOG(INFO) << output1.dims();
+    // Run inference on CPU
+    LOG(INFO) << "--- CPU Runs: ---";
+    LOG(INFO) << "Batch size is " << FLAGS_batch_size;
+    TestInference<paddle::platform::CPUPlace, false, true>(
+        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
+    LOG(INFO) << output1.dims();
+  }
 
 #ifdef PADDLE_WITH_CUDA
   paddle::framework::LoDTensor output2;
@@ -59,10 +70,29 @@ TEST(inference, image_classification) {
 
   // Run inference on CUDA GPU
   LOG(INFO) << "--- GPU Runs: ---";
-  TestInference<paddle::platform::CUDAPlace>(
-      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
+  LOG(INFO) << "Batch size is " << FLAGS_batch_size;
+  TestInference<paddle::platform::CUDAPlace, false, true>(
+      dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
   LOG(INFO) << output2.dims();
 
-  CheckError<float>(output1, output2);
+  if (!FLAGS_skip_cpu) {
+    CheckError<float>(output1, output2);
+  }
+
+  // float16 inference requires cuda GPUs with >= 5.3 compute capability
+  if (!FLAGS_fp16_dirname.empty() &&
+      paddle::platform::GetCUDAComputeCapability(0) >= 53) {
+    paddle::framework::LoDTensor output3;
+    std::vector<paddle::framework::LoDTensor*> cpu_fetchs3;
+    cpu_fetchs3.push_back(&output3);
+
+    LOG(INFO) << "--- GPU Runs in float16 mode: ---";
+    LOG(INFO) << "Batch size is " << FLAGS_batch_size;
+
+    TestInference<paddle::platform::CUDAPlace, false, true>(
+        FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat);
+
+    CheckError<float>(output2, output3);
+  }
 #endif
 }
diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
index 184924016634bba26204d937744ca5fa87cd443c..84bb855fea5fa397ff71e2c922fea3302951b7ca 100644
--- a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -36,37 +36,21 @@ TEST(inference, label_semantic_roles) {
   int64_t predicate_dict_len = 3162;
   int64_t mark_dict_len = 2;
 
-  SetupLoDTensor(word,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&word, lod, static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(predicate,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&predicate, lod, static_cast<int64_t>(0),
                  static_cast<int64_t>(predicate_dict_len - 1));
-  SetupLoDTensor(ctx_n2,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_n2, lod, static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_n1,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_n1, lod, static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_0,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_0, lod, static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_p1,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_p1, lod, static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(ctx_p2,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&ctx_p2, lod, static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
-  SetupLoDTensor(mark,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&mark, lod, static_cast<int64_t>(0),
                  static_cast<int64_t>(mark_dict_len - 1));
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03b0b6946339772ac535b3471d50fbd74554239d
--- /dev/null
+++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc
@@ -0,0 +1,228 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sys/time.h>
+#include <time.h>
+#include <fstream>
+#include <thread>  // NOLINT
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/inference/tests/test_helper.h"
+#include "paddle/fluid/operators/math/blas.h"
+#ifdef PADDLE_WITH_MKLML
+#include <omp.h>
+#endif
+
+DEFINE_string(model_path, "", "Directory of the inference model.");
+DEFINE_string(data_file, "", "File of input index data.");
+DEFINE_int32(repeat, 100, "Running the inference program repeat times");
+DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
+DEFINE_int32(num_threads, 1, "Number of threads should be used");
+DECLARE_bool(use_mkldnn);
+
+inline double GetCurrentMs() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec;
+}
+
+// This function just give dummy data for recognize_digits model.
+size_t DummyData(std::vector<paddle::framework::LoDTensor>* out) {
+  paddle::framework::LoDTensor input;
+  SetupTensor<float>(&input, {1, 1, 28, 28}, -1.f, 1.f);
+  out->emplace_back(input);
+  return 1;
+}
+
+// Load the input word index data from file and save into LodTensor.
+// Return the size of words.
+size_t LoadData(std::vector<paddle::framework::LoDTensor>* out,
+                const std::string& filename) {
+  if (filename.empty()) {
+    return DummyData(out);
+  }
+
+  size_t sz = 0;
+  std::fstream fin(filename);
+  std::string line;
+  out->clear();
+  while (getline(fin, line)) {
+    std::istringstream iss(line);
+    std::vector<int64_t> ids;
+    std::string field;
+    while (getline(iss, field, ' ')) {
+      ids.push_back(stoi(field));
+    }
+    if (ids.size() >= 1024) {
+      // Synced with NLP guys, they will ignore input larger then 1024
+      continue;
+    }
+
+    paddle::framework::LoDTensor words;
+    paddle::framework::LoD lod{{0, ids.size()}};
+    words.set_lod(lod);
+    int64_t* pdata = words.mutable_data<int64_t>(
+        {static_cast<int64_t>(ids.size()), 1}, paddle::platform::CPUPlace());
+    memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t));
+    out->emplace_back(words);
+    sz += ids.size();
+  }
+  return sz;
+}
+
+// Split input data samples into small pieces jobs as balanced as possible,
+// according to the number of threads.
+void SplitData(
+    const std::vector<paddle::framework::LoDTensor>& datasets,
+    std::vector<std::vector<const paddle::framework::LoDTensor*>>* jobs,
+    const int num_threads) {
+  size_t s = 0;
+  jobs->resize(num_threads);
+  while (s < datasets.size()) {
+    for (auto it = jobs->begin(); it != jobs->end(); it++) {
+      it->emplace_back(&datasets[s]);
+      s++;
+      if (s >= datasets.size()) {
+        break;
+      }
+    }
+  }
+}
+
+void ThreadRunInfer(
+    const int tid, paddle::framework::Scope* scope,
+    const std::vector<std::vector<const paddle::framework::LoDTensor*>>& jobs) {
+  // maybe framework:ProgramDesc is not thread-safe
+  paddle::platform::CPUPlace place;
+  paddle::framework::Executor executor(place);
+  auto& sub_scope = scope->NewScope();
+  auto inference_program =
+      paddle::inference::Load(&executor, scope, FLAGS_model_path);
+
+  auto ctx = executor.Prepare(*inference_program, /*block_id*/ 0);
+  executor.CreateVariables(*inference_program, &sub_scope, /*block_id*/ 0);
+
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  paddle::framework::LoDTensor outtensor;
+  fetch_targets[fetch_target_names[0]] = &outtensor;
+
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
+
+  auto& inputs = jobs[tid];
+  auto start_ms = GetCurrentMs();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    feed_targets[feed_target_names[0]] = inputs[i];
+    executor.RunPreparedContext(ctx.get(), &sub_scope, &feed_targets,
+                                &fetch_targets, false /*create_local_scope*/);
+  }
+  auto stop_ms = GetCurrentMs();
+  scope->DeleteScope(&sub_scope);
+  LOG(INFO) << "Tid: " << tid << ", process " << inputs.size()
+            << " samples, avg time per sample: "
+            << (stop_ms - start_ms) / inputs.size() << " ms";
+}
+
+TEST(inference, nlp) {
+  if (FLAGS_model_path.empty()) {
+    LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model";
+  }
+  if (FLAGS_data_file.empty()) {
+    LOG(WARNING) << "No data file provided, will use dummy data!"
+                 << "Note: if you use nlp model, please provide data file.";
+  }
+  LOG(INFO) << "Model Path: " << FLAGS_model_path;
+  LOG(INFO) << "Data File: " << FLAGS_data_file;
+
+  std::vector<paddle::framework::LoDTensor> datasets;
+  size_t num_total_words = LoadData(&datasets, FLAGS_data_file);
+  LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size();
+  LOG(INFO) << "Total number of words: " << num_total_words;
+
+  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+  std::unique_ptr<paddle::framework::Scope> scope(
+      new paddle::framework::Scope());
+
+#ifdef PADDLE_WITH_MKLML
+  // only use 1 thread number per std::thread
+  omp_set_dynamic(0);
+  omp_set_num_threads(1);
+  paddle::operators::math::SetNumThreads(1);
+#endif
+
+  double start_ms = 0, stop_ms = 0;
+  if (FLAGS_num_threads > 1) {
+    std::vector<std::vector<const paddle::framework::LoDTensor*>> jobs;
+    SplitData(datasets, &jobs, FLAGS_num_threads);
+    std::vector<std::unique_ptr<std::thread>> threads;
+    start_ms = GetCurrentMs();
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads.emplace_back(
+          new std::thread(ThreadRunInfer, i, scope.get(), std::ref(jobs)));
+    }
+    for (int i = 0; i < FLAGS_num_threads; ++i) {
+      threads[i]->join();
+    }
+    stop_ms = GetCurrentMs();
+  } else {
+    // 1. Define place, executor, scope
+    paddle::platform::CPUPlace place;
+    paddle::framework::Executor executor(place);
+
+    // 2. Initialize the inference_program and load parameters
+    std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+    inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path,
+                                    /*model combined*/ false);
+    // always prepare context
+    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
+    ctx = executor.Prepare(*inference_program, 0);
+    if (FLAGS_prepare_vars) {
+      executor.CreateVariables(*inference_program, scope.get(), 0);
+    }
+    // preapre fetch
+    const std::vector<std::string>& fetch_target_names =
+        inference_program->GetFetchTargetNames();
+    PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL);
+    std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+    paddle::framework::LoDTensor outtensor;
+    fetch_targets[fetch_target_names[0]] = &outtensor;
+
+    // prepare feed
+    const std::vector<std::string>& feed_target_names =
+        inference_program->GetFeedTargetNames();
+    PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL);
+    std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+
+    // feed data and run
+    start_ms = GetCurrentMs();
+    for (size_t i = 0; i < datasets.size(); ++i) {
+      feed_targets[feed_target_names[0]] = &(datasets[i]);
+      executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets,
+                                  &fetch_targets, !FLAGS_prepare_vars);
+    }
+    stop_ms = GetCurrentMs();
+    LOG(INFO) << "Tid: 0, process " << datasets.size()
+              << " samples, avg time per sample: "
+              << (stop_ms - start_ms) / datasets.size() << " ms";
+  }
+  LOG(INFO) << "Total inference time with " << FLAGS_num_threads
+            << " threads : " << (stop_ms - start_ms) / 1000.0
+            << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000);
+}
diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
index 1fb0f9e77797cf6e61e918700763ee33a495cb96..f12828a2685305c20d26492dbf04fa9ddacf9317 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -35,10 +35,8 @@ TEST(inference, recognize_digits) {
   paddle::framework::LoDTensor input;
   // Use normilized image pixels as input data,
   // which should be in the range [-1.0, 1.0].
-  SetupTensor<float>(input,
-                     {FLAGS_batch_size, 1, 28, 28},
-                     static_cast<float>(-1),
-                     static_cast<float>(1));
+  SetupTensor<float>(&input, {FLAGS_batch_size, 1, 28, 28},
+                     static_cast<float>(-1), static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
@@ -49,8 +47,8 @@ TEST(inference, recognize_digits) {
 
     // Run inference on CPU
     LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
-    TestInference<paddle::platform::CPUPlace>(
-        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
+    TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1,
+                                              FLAGS_repeat, is_combined);
     LOG(INFO) << output1.dims();
 
 #ifdef PADDLE_WITH_CUDA
@@ -60,8 +58,8 @@ TEST(inference, recognize_digits) {
 
     // Run inference on CUDA GPU
     LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
-    TestInference<paddle::platform::CUDAPlace>(
-        dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
+    TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2,
+                                               FLAGS_repeat, is_combined);
     LOG(INFO) << output2.dims();
 
     CheckError<float>(output1, output2);
diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
index b42a33c9a90b5feafaed343a197da0e4db11b7ea..70aa6b194d4417fc85384cc3f615089f024f928e 100644
--- a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -36,25 +36,25 @@ TEST(inference, recommender_system) {
 
   // Use the first data from paddle.dataset.movielens.test() as input
   std::vector<int64_t> user_id_data = {1};
-  SetupTensor<int64_t>(user_id, {batch_size, 1}, user_id_data);
+  SetupTensor<int64_t>(&user_id, {batch_size, 1}, user_id_data);
 
   std::vector<int64_t> gender_id_data = {1};
-  SetupTensor<int64_t>(gender_id, {batch_size, 1}, gender_id_data);
+  SetupTensor<int64_t>(&gender_id, {batch_size, 1}, gender_id_data);
 
   std::vector<int64_t> age_id_data = {0};
-  SetupTensor<int64_t>(age_id, {batch_size, 1}, age_id_data);
+  SetupTensor<int64_t>(&age_id, {batch_size, 1}, age_id_data);
 
   std::vector<int64_t> job_id_data = {10};
-  SetupTensor<int64_t>(job_id, {batch_size, 1}, job_id_data);
+  SetupTensor<int64_t>(&job_id, {batch_size, 1}, job_id_data);
 
   std::vector<int64_t> movie_id_data = {783};
-  SetupTensor<int64_t>(movie_id, {batch_size, 1}, movie_id_data);
+  SetupTensor<int64_t>(&movie_id, {batch_size, 1}, movie_id_data);
 
   std::vector<int64_t> category_id_data = {10, 8, 9};
-  SetupLoDTensor<int64_t>(category_id, {3, 1}, {{0, 3}}, category_id_data);
+  SetupLoDTensor<int64_t>(&category_id, {3, 1}, {{0, 3}}, category_id_data);
 
   std::vector<int64_t> movie_title_data = {1069, 4140, 2923, 710, 988};
-  SetupLoDTensor<int64_t>(movie_title, {5, 1}, {{0, 5}}, movie_title_data);
+  SetupLoDTensor<int64_t>(&movie_title, {5, 1}, {{0, 5}}, movie_title_data);
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&user_id);
diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
index a0523905bd1631cd8768b1601e459cb9d110a84d..e15c3f59acb1eac535120554a3799c37e9d4e951 100644
--- a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -32,10 +32,10 @@ TEST(inference, rnn_encoder_decoder) {
   paddle::framework::LoDTensor word_data, trg_word;
   paddle::framework::LoD lod{{0, 4, 10}};
 
-  SetupLoDTensor(
-      word_data, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
-  SetupLoDTensor(
-      trg_word, lod, static_cast<int64_t>(0), static_cast<int64_t>(1));
+  SetupLoDTensor(&word_data, lod, static_cast<int64_t>(0),
+                 static_cast<int64_t>(1));
+  SetupLoDTensor(&trg_word, lod, static_cast<int64_t>(0),
+                 static_cast<int64_t>(1));
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&word_data);
diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
index 824b3274ebc7ba046e61798b3f61ef9924a75679..0dbb6a30405eb64133613052ad57b1f705a9e7b4 100644
--- a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -33,9 +33,7 @@ TEST(inference, understand_sentiment) {
   paddle::framework::LoD lod{{0, 4, 10}};
   int64_t word_dict_len = 5147;
 
-  SetupLoDTensor(words,
-                 lod,
-                 static_cast<int64_t>(0),
+  SetupLoDTensor(&words, lod, static_cast<int64_t>(0),
                  static_cast<int64_t>(word_dict_len - 1));
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
index 1481760c529c29a7290f476e2a22e1ded5ab7787..c9328eb21b4fdb06c5f65ba0f7337b1e79fa1927 100644
--- a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
 #include "gflags/gflags.h"
+#include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
@@ -33,10 +33,10 @@ TEST(inference, word2vec) {
   paddle::framework::LoD lod{{0, 1}};
   int64_t dict_size = 2073;  // The size of dictionary
 
-  SetupLoDTensor(first_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(second_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(third_word, lod, static_cast<int64_t>(0), dict_size - 1);
-  SetupLoDTensor(fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&first_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&second_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
+  SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
 
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&first_word);
diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h
index dce541c0971a6ff9a3728e915fe8c3d009c23550..44c36b1683b037832a218df02184e7cd2ba143e9 100644
--- a/paddle/fluid/inference/tests/test_helper.h
+++ b/paddle/fluid/inference/tests/test_helper.h
@@ -11,59 +11,62 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#pragma once
+
+#include <map>
+#include <random>
+#include <string>
+#include <vector>
 
-#include <time.h>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/profiler.h"
 
+DECLARE_bool(use_mkldnn);
+
 template <typename T>
-void SetupTensor(paddle::framework::LoDTensor& input,
-                 paddle::framework::DDim dims,
-                 T lower,
-                 T upper) {
-  srand(time(0));
-  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
-  for (int i = 0; i < input.numel(); ++i) {
-    input_ptr[i] =
-        (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
-        lower;
+void SetupTensor(paddle::framework::LoDTensor* input,
+                 paddle::framework::DDim dims, T lower, T upper) {
+  static unsigned int seed = 100;
+  std::mt19937 rng(seed++);
+  std::uniform_real_distribution<double> uniform_dist(0, 1);
+
+  T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
+  for (int i = 0; i < input->numel(); ++i) {
+    input_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
   }
 }
 
 template <typename T>
-void SetupTensor(paddle::framework::LoDTensor& input,
-                 paddle::framework::DDim dims,
-                 std::vector<T>& data) {
+void SetupTensor(paddle::framework::LoDTensor* input,
+                 paddle::framework::DDim dims, const std::vector<T>& data) {
   CHECK_EQ(paddle::framework::product(dims), static_cast<int64_t>(data.size()));
-  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());
-  memcpy(input_ptr, data.data(), input.numel() * sizeof(T));
+  T* input_ptr = input->mutable_data<T>(dims, paddle::platform::CPUPlace());
+  memcpy(input_ptr, data.data(), input->numel() * sizeof(T));
 }
 
 template <typename T>
-void SetupLoDTensor(paddle::framework::LoDTensor& input,
-                    paddle::framework::LoD& lod,
-                    T lower,
-                    T upper) {
-  input.set_lod(lod);
+void SetupLoDTensor(paddle::framework::LoDTensor* input,
+                    const paddle::framework::LoD& lod, T lower, T upper) {
+  input->set_lod(lod);
   int dim = lod[0][lod[0].size() - 1];
   SetupTensor<T>(input, {dim, 1}, lower, upper);
 }
 
 template <typename T>
-void SetupLoDTensor(paddle::framework::LoDTensor& input,
+void SetupLoDTensor(paddle::framework::LoDTensor* input,
                     paddle::framework::DDim dims,
-                    paddle::framework::LoD lod,
-                    std::vector<T>& data) {
+                    const paddle::framework::LoD lod,
+                    const std::vector<T>& data) {
   const size_t level = lod.size() - 1;
   CHECK_EQ(dims[0], static_cast<int64_t>((lod[level]).back()));
-  input.set_lod(lod);
+  input->set_lod(lod);
   SetupTensor<T>(input, dims, data);
 }
 
 template <typename T>
-void CheckError(paddle::framework::LoDTensor& output1,
-                paddle::framework::LoDTensor& output2) {
+void CheckError(const paddle::framework::LoDTensor& output1,
+                const paddle::framework::LoDTensor& output2) {
   // Check lod information
   EXPECT_EQ(output1.lod(), output2.lod());
 
@@ -88,12 +91,55 @@ void CheckError(paddle::framework::LoDTensor& output1,
   EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
 }
 
-template <typename Place>
+std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
+    paddle::framework::Executor* executor, paddle::framework::Scope* scope,
+    const std::string& dirname, const bool is_combined = false) {
+  std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
+  if (is_combined) {
+    // All parameters are saved in a single file.
+    // Hard-coding the file names of program and parameters in unittest.
+    // The file names should be consistent with that used in Python API
+    //  `fluid.io.save_inference_model`.
+    std::string prog_filename = "__model_combined__";
+    std::string param_filename = "__params_combined__";
+    inference_program =
+        paddle::inference::Load(executor, scope, dirname + "/" + prog_filename,
+                                dirname + "/" + param_filename);
+  } else {
+    // Parameters are saved in separate files sited in the specified
+    // `dirname`.
+    inference_program = paddle::inference::Load(executor, scope, dirname);
+  }
+  return inference_program;
+}
+
+std::vector<std::vector<int64_t>> GetFeedTargetShapes(
+    const std::string& dirname, const bool is_combined = false) {
+  auto place = paddle::platform::CPUPlace();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  auto inference_program = InitProgram(&executor, scope, dirname, is_combined);
+  auto& global_block = inference_program->Block(0);
+
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  std::vector<std::vector<int64_t>> feed_target_shapes;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    auto* var = global_block.FindVar(feed_target_names[i]);
+    std::vector<int64_t> var_shape = var->GetShape();
+    feed_target_shapes.push_back(var_shape);
+  }
+
+  delete scope;
+  return feed_target_shapes;
+}
+
+template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
-                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
-                   const int repeat = 1,
-                   const bool is_combined = false) {
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
+                   const int repeat = 1, const bool is_combined = false) {
   // 1. Define place, executor, scope
   auto place = Place();
   auto executor = paddle::framework::Executor(place);
@@ -105,7 +151,7 @@ void TestInference(const std::string& dirname,
     state = paddle::platform::ProfilerState::kCPU;
   } else {
 #ifdef PADDLE_WITH_CUDA
-    state = paddle::platform::ProfilerState::kCUDA;
+    state = paddle::platform::ProfilerState::kAll;
     // The default device_id of paddle::platform::CUDAPlace is 0.
     // Users can get the device_id using:
     //   int device_id = place.GetDeviceId();
@@ -124,28 +170,11 @@ void TestInference(const std::string& dirname,
     paddle::platform::RecordEvent record_event(
         "init_program",
         paddle::platform::DeviceContextPool::Instance().Get(place));
-
-    if (is_combined) {
-      // All parameters are saved in a single file.
-      // Hard-coding the file names of program and parameters in unittest.
-      // The file names should be consistent with that used in Python API
-      //  `fluid.io.save_inference_model`.
-      std::string prog_filename = "__model_combined__";
-      std::string param_filename = "__params_combined__";
-      inference_program =
-          paddle::inference::Load(executor,
-                                  *scope,
-                                  dirname + "/" + prog_filename,
-                                  dirname + "/" + param_filename);
-    } else {
-      // Parameters are saved in separate files sited in the specified
-      // `dirname`.
-      inference_program = paddle::inference::Load(executor, *scope, dirname);
-    }
+    inference_program = InitProgram(&executor, scope, dirname, is_combined);
   }
   // Disable the profiler and print the timing information
   paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
-                                    "load_program_profiler.txt");
+                                    "load_program_profiler");
   paddle::platform::ResetProfiler();
 
   // 3. Get the feed_target_names and fetch_target_names
@@ -167,10 +196,28 @@ void TestInference(const std::string& dirname,
     fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
   }
 
-  // 6. Run the inference program
+  // 6. If export Flags_use_mkldnn=True, use mkldnn related ops.
+  if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program);
+
+  // 7. Run the inference program
   {
+    if (!CreateVars) {
+      // If users don't want to create and destroy variables every time they
+      // run, they need to set `create_vars` to false and manually call
+      // `CreateVariables` before running.
+      executor.CreateVariables(*inference_program, scope, 0);
+    }
+
     // Ignore the profiling results of the first run
-    executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+    std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx;
+    if (PrepareContext) {
+      ctx = executor.Prepare(*inference_program, 0);
+      executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
+                                  &fetch_targets, true, CreateVars);
+    } else {
+      executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
+                   true, CreateVars);
+    }
 
     // Enable the profiler
     paddle::platform::EnableProfiler(state);
@@ -181,13 +228,20 @@ void TestInference(const std::string& dirname,
           "run_inference",
           paddle::platform::DeviceContextPool::Instance().Get(place));
 
-      executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+      if (PrepareContext) {
+        // Note: if you change the inference_program, you need to call
+        // executor.Prepare() again to get a new ExecutorPrepareContext.
+        executor.RunPreparedContext(ctx.get(), scope, &feed_targets,
+                                    &fetch_targets, CreateVars);
+      } else {
+        executor.Run(*inference_program, scope, &feed_targets, &fetch_targets,
+                     CreateVars);
+      }
     }
 
     // Disable the profiler and print the timing information
     paddle::platform::DisableProfiler(
-        paddle::platform::EventSortingKey::kDefault,
-        "run_inference_profiler.txt");
+        paddle::platform::EventSortingKey::kDefault, "run_inference_profiler");
     paddle::platform::ResetProfiler();
   }
 
diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..56745f115db231d4350da72b7de7967175ac9fe8
--- /dev/null
+++ b/paddle/fluid/inference/tests/test_multi_thread_helper.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/inference/io.h"
+
+void ThreadedRunInference(
+    const std::unique_ptr<paddle::framework::ProgramDesc>& inference_program,
+    paddle::framework::Executor* executor, paddle::framework::Scope* scope,
+    const int thread_id,
+    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
+    const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+  auto copy_program = std::unique_ptr<paddle::framework::ProgramDesc>(
+      new paddle::framework::ProgramDesc(*inference_program));
+
+  std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id);
+  std::string fetch_holder_name =
+      "fetch_" + paddle::string::to_string(thread_id);
+  copy_program->SetFeedHolderName(feed_holder_name);
+  copy_program->SetFetchHolderName(fetch_holder_name);
+
+  // 3. Get the feed_target_names and fetch_target_names
+  const std::vector<std::string>& feed_target_names =
+      copy_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      copy_program->GetFetchTargetNames();
+
+  // 4. Prepare inputs: set up maps for feed targets
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
+  }
+
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+  }
+
+  // 6. Run the inference program
+  executor->Run(*copy_program, scope, feed_targets, fetch_targets, true,
+                feed_holder_name, fetch_holder_name);
+}
+
+template <typename Place>
+void TestMultiThreadInference(
+    const std::string& dirname,
+    const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_feeds,
+    const std::vector<std::vector<paddle::framework::LoDTensor*>>& cpu_fetchs,
+    const int num_threads) {
+  // 1. Define place, executor, scope
+  auto place = Place();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  // 2. Initialize the inference_program and load parameters
+  std::unique_ptr<paddle::framework::ProgramDesc> inference_program =
+      paddle::inference::Load(executor, *scope, dirname);
+
+  std::vector<std::thread*> threads;
+  for (int i = 0; i < num_threads; ++i) {
+    threads.push_back(new std::thread(
+        ThreadedRunInference, std::ref(inference_program), &executor, scope, i,
+        std::ref(cpu_feeds[i]), std::ref(cpu_fetchs[i])));
+  }
+  for (int i = 0; i < num_threads; ++i) {
+    threads[i]->join();
+    delete threads[i];
+  }
+
+  delete scope;
+}
diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfb89e704457a11a3cd6e89dba5efad5acae0bce
--- /dev/null
+++ b/paddle/fluid/inference/utils/singleton.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+
+// NOTE not thread-safe.
+template <typename T>
+struct Singleton {
+  static T& Global() {
+    static T* x = new T;
+    return *x;
+  }
+
+  Singleton() = delete;
+  Singleton& operator=(const Singleton&) = delete;
+};
+
+/*
+ * An registor for any type.
+ * NOTE not thread-safe.
+ */
+template <typename ItemParent>
+struct Registry {
+  static Registry& Global() {
+    static auto* x = new Registry<ItemParent>;
+    return *x;
+  }
+
+  template <typename ItemChild>
+  static void Register(const std::string& name) {
+    PADDLE_ENFORCE_EQ(items_.count(name), 0);
+    items_[name] = new ItemChild;
+  }
+
+  static ItemParent* Lookup(const std::string& name,
+                            const std::string& default_name = "") {
+    auto it = items_.find(name);
+    if (it == items_.end()) {
+      if (default_name == "")
+        return nullptr;
+      else
+        return items_.find(default_name)->second;
+    }
+    return it->second;
+  }
+
+  ~Registry() {
+    for (auto& item : items_) {
+      delete item.second;
+    }
+  }
+
+ private:
+  Registry() = default;
+  static std::unordered_map<std::string, ItemParent*> items_;
+};
+
+template <typename ItemParent>
+std::unordered_map<std::string, ItemParent*> Registry<ItemParent>::items_;
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/memory/.clang-format b/paddle/fluid/memory/.clang-format
deleted file mode 100644
index 29282dc87e2c499988c17d90d47d44cd5cf7f115..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/.clang-format
+++ /dev/null
@@ -1,5 +0,0 @@
----
-Language:        Cpp
-BasedOnStyle:  Google
-Standard:  Cpp11 
-...
diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt
index 1a61c484823b292234d4758cdc1959d7a21510e6..709fc7e12e1db537ceece30c405c0e8a2582e8ca 100644
--- a/paddle/fluid/memory/CMakeLists.txt
+++ b/paddle/fluid/memory/CMakeLists.txt
@@ -1,16 +1,15 @@
 add_subdirectory(detail)
 
-cc_library(memory SRCS memory.cc DEPS place enforce)
+cc_library(malloc SRCS malloc.cc DEPS buddy_allocator place enforce)
 cc_library(memcpy SRCS memcpy.cc DEPS place)
 
-cc_library(paddle_memory
-    DEPS
-    memory
-    memcpy
-    meta_data
-    meta_cache
-    memory_block
-    buddy_allocator
-    system_allocator)
+cc_library(memory
+        DEPS
+        malloc
+        memcpy)
 
-cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
+cc_test(malloc_test SRCS malloc_test.cc DEPS malloc)
+
+#if (WITH_GPU)
+#   nv_test(pinned_memory_test SRCS pinned_memory_test.cu  DEPS place memory)
+#endif()
diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
index b9c3fc31c1523abf3acbd116745bbf1596454aac..c725dba5e98c200c2542d97cb8f53a938f6b614a 100644
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -1,3 +1,5 @@
+cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)
+
 if(${WITH_GPU})
   nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
@@ -6,10 +8,4 @@ endif(${WITH_GPU})
 
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
 
-cc_library(meta_data SRCS meta_data.cc)
-
-cc_library(meta_cache SRCS meta_cache.cc)
-
-cc_library(memory_block SRCS memory_block.cc)
-
-cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
+cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 876837838648d6733b104a5496454f5dc58bbb71..4194ba197948b47003863196efdac1c08a7ae4f6 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -46,7 +46,8 @@ inline size_t align(size_t size, size_t alignment) {
 
 void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // adjust allocation alignment
-  size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
+  size_t size =
+      align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_);
 
   // acquire the allocator lock
   std::lock_guard<std::mutex> lock(mutex_);
@@ -103,7 +104,7 @@ void BuddyAllocator::Free(void* p) {
     return;
   }
 
-  block->mark_as_free(cache_);
+  block->mark_as_free(&cache_);
 
   total_used_ -= block->total_size(cache_);
   total_free_ += block->total_size(cache_);
@@ -122,7 +123,7 @@ void BuddyAllocator::Free(void* p) {
                                    right_buddy));
 
       // merge its right buddy to the block
-      block->merge(cache_, right_buddy);
+      block->merge(&cache_, right_buddy);
     }
   }
 
@@ -139,7 +140,7 @@ void BuddyAllocator::Free(void* p) {
                                    left_buddy->total_size(cache_), left_buddy));
 
       // merge the block to its left buddy
-      left_buddy->merge(cache_, block);
+      left_buddy->merge(&cache_, block);
       block = left_buddy;
     }
   }
@@ -163,13 +164,13 @@ size_t BuddyAllocator::Used() { return total_used_; }
 
 void* BuddyAllocator::SystemAlloc(size_t size) {
   size_t index = 0;
-  void* p = system_allocator_->Alloc(index, size);
+  void* p = system_allocator_->Alloc(&index, size);
 
   VLOG(10) << "Allocated " << p << " from system allocator.";
 
   if (p == nullptr) return nullptr;
 
-  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index,
+  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
                                      size, nullptr, nullptr);
 
   return static_cast<MemoryBlock*>(p)->data();
@@ -187,14 +188,14 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
 
   // Allocate a new maximum sized block
   size_t index = 0;
-  void* p = system_allocator_->Alloc(index, max_chunk_size_);
+  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
 
   if (p == nullptr) return pool_.end();
 
   VLOG(10) << "Creating and inserting new block " << p
            << " from system allocator";
 
-  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
+  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
                                      max_chunk_size_, nullptr, nullptr);
 
   // gpu fallback allocation
@@ -238,11 +239,11 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
 
   VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
            << ") into";
-  block->split(cache_, size);
+  block->split(&cache_, size);
 
   VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
            << ")";
-  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
+  block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);
 
   // the rest of memory if exist
   if (block->has_right_buddy(cache_)) {
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index a4ee70c2586f37e3b2328dedfe28135e14d8b18d..2f39d774d6fb6a2bc37877eb2f8b90bebd3cda28 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -14,18 +14,18 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/memory/detail/meta_cache.h"
-#include "paddle/fluid/memory/detail/meta_data.h"
+#include <mutex>  // NOLINT
+#include <set>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
-#include <mutex>
-#include <set>
-#include <unordered_map>
-#include <vector>
-
 namespace paddle {
 namespace memory {
 namespace detail {
diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc
index 07123f2669c3a829ff28e9fab5a404047c5a09c7..f34b922b25a0110690671d487f190e1b977a67bb 100644
--- a/paddle/fluid/memory/detail/memory_block.cc
+++ b/paddle/fluid/memory/detail/memory_block.cc
@@ -13,143 +13,142 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/memory/detail/meta_cache.h"
-#include "paddle/fluid/memory/detail/meta_data.h"
 #include "paddle/fluid/platform/assert.h"
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
+void MemoryBlock::init(MetadataCache* cache, Type t, size_t index, size_t size,
                        void* left_buddy, void* right_buddy) {
-  cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
-                             static_cast<MemoryBlock*>(left_buddy),
-                             static_cast<MemoryBlock*>(right_buddy)));
+  cache->save(
+      this, MemoryBlock::Desc(t, index, size - sizeof(MemoryBlock::Desc), size,
+                              static_cast<MemoryBlock*>(left_buddy),
+                              static_cast<MemoryBlock*>(right_buddy)));
 }
 
-MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
+MemoryBlock::Type MemoryBlock::type(const MetadataCache& cache) const {
   return cache.load(this).type;
 }
 
-size_t MemoryBlock::size(MetadataCache& cache) const {
+size_t MemoryBlock::size(const MetadataCache& cache) const {
   return cache.load(this).size;
 }
 
-size_t MemoryBlock::total_size(MetadataCache& cache) const {
+size_t MemoryBlock::index(const MetadataCache& cache) const {
+  return cache.load(this).index;
+}
+
+size_t MemoryBlock::total_size(const MetadataCache& cache) const {
   return cache.load(this).total_size;
 }
 
-MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
+bool MemoryBlock::has_left_buddy(const MetadataCache& cache) const {
+  return left_buddy(cache) != nullptr;
+}
+
+bool MemoryBlock::has_right_buddy(const MetadataCache& cache) const {
+  return right_buddy(cache) != nullptr;
+}
+
+MemoryBlock* MemoryBlock::left_buddy(const MetadataCache& cache) const {
   return cache.load(this).left_buddy;
 }
 
-MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
+MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const {
   return cache.load(this).right_buddy;
 }
 
-void MemoryBlock::split(MetadataCache& cache, size_t size) {
+void MemoryBlock::split(MetadataCache* cache, size_t size) {
   // make sure the split fits
-  PADDLE_ASSERT(total_size(cache) >= size);
+  PADDLE_ASSERT(total_size(*cache) >= size);
 
   // bail out if there is no room for another partition
-  if (total_size(cache) - size <= sizeof(Metadata)) {
+  if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) {
     return;
   }
 
   // find the position of the split
   void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
 
-  size_t remaining_size = total_size(cache) - size;
+  size_t remaining_size = total_size(*cache) - size;
 
   // Add the new block as a buddy
-  auto metadata = cache.load(this);
+  auto metadata = cache->load(this);
 
   // Write the metadata for the new block
   auto new_block_right_buddy = metadata.right_buddy;
 
-  cache.store(
-      static_cast<MemoryBlock*>(right_partition),
-      Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
-               remaining_size, this, new_block_right_buddy));
+  cache->save(static_cast<MemoryBlock*>(right_partition),
+              MemoryBlock::Desc(FREE_CHUNK, index(*cache),
+                                remaining_size - sizeof(MemoryBlock::Desc),
+                                remaining_size, this, new_block_right_buddy));
 
   metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
-  metadata.size = size - sizeof(Metadata);
+  metadata.size = size - sizeof(MemoryBlock::Desc);
   metadata.total_size = size;
 
-  cache.store(this, metadata);
+  cache->save(this, metadata);
 
   // Write metadata for the new block's right buddy
   if (new_block_right_buddy != nullptr) {
-    auto buddy_metadata = cache.load(new_block_right_buddy);
+    auto buddy_metadata = cache->load(new_block_right_buddy);
 
     buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
 
-    cache.store(new_block_right_buddy, buddy_metadata);
+    cache->save(new_block_right_buddy, buddy_metadata);
   }
 }
 
-void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
+void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
   // only free blocks can be merged
-  PADDLE_ASSERT(type(cache) == FREE_CHUNK);
-  PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK);
+  PADDLE_ASSERT(type(*cache) == FREE_CHUNK);
+  PADDLE_ASSERT(right_buddy->type(*cache) == FREE_CHUNK);
 
-  auto metadata = cache.load(this);
+  auto metadata = cache->load(this);
 
   // link this->buddy's buddy
-  metadata.right_buddy = right_buddy->right_buddy(cache);
+  metadata.right_buddy = right_buddy->right_buddy(*cache);
 
   // link buddy's buddy -> this
   if (metadata.right_buddy != nullptr) {
-    auto buddy_metadata = cache.load(metadata.right_buddy);
+    auto buddy_metadata = cache->load(metadata.right_buddy);
 
     buddy_metadata.left_buddy = this;
 
-    cache.store(metadata.right_buddy, buddy_metadata);
+    cache->save(metadata.right_buddy, buddy_metadata);
   }
 
-  metadata.size += right_buddy->total_size(cache);
-  metadata.total_size += right_buddy->total_size(cache);
+  metadata.size += right_buddy->total_size(*cache);
+  metadata.total_size += right_buddy->total_size(*cache);
 
-  cache.store(this, metadata);
-  cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
+  cache->save(this, metadata);
+  cache->save(right_buddy,
+              MemoryBlock::Desc(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
 }
 
-void MemoryBlock::mark_as_free(MetadataCache& cache) {
+void MemoryBlock::mark_as_free(MetadataCache* cache) {
   // check for double free or corruption
-  PADDLE_ASSERT(type(cache) != FREE_CHUNK);
-  PADDLE_ASSERT(type(cache) != INVALID_CHUNK);
-
+  PADDLE_ASSERT(type(*cache) != FREE_CHUNK);
+  PADDLE_ASSERT(type(*cache) != INVALID_CHUNK);
   set_type(cache, FREE_CHUNK);
 }
 
-void MemoryBlock::set_type(MetadataCache& cache, Type t) {
-  auto metadata = cache.load(this);
-
+void MemoryBlock::set_type(MetadataCache* cache, Type t) {
+  auto metadata = cache->load(this);
   metadata.type = t;
-
-  cache.store(this, metadata);
-}
-
-bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
-  return left_buddy(cache) != nullptr;
-}
-
-bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
-  return right_buddy(cache) != nullptr;
-}
-
-size_t MemoryBlock::index(MetadataCache& cache) const {
-  return cache.load(this).index;
+  cache->save(this, metadata);
 }
 
 void* MemoryBlock::data() const {
-  return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
+  return const_cast<MemoryBlock::Desc*>(
+             reinterpret_cast<const MemoryBlock::Desc*>(this)) +
+         1;
 }
 
 MemoryBlock* MemoryBlock::metadata() const {
   return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
-      reinterpret_cast<const Metadata*>(this) - 1));
+      reinterpret_cast<const MemoryBlock::Desc*>(this) - 1));
 }
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h
index 72b40b73177d086aa912416e7f9cb3cd4ad5b45e..5cceba659beeec1b3c986dc43229f6725e3e11de 100644
--- a/paddle/fluid/memory/detail/memory_block.h
+++ b/paddle/fluid/memory/detail/memory_block.h
@@ -11,21 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
-#include <cstddef>
+#include <cstdint>
+#include <unordered_map>
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-// Forward Declarations
+// Forward declaration.
 class MetadataCache;
 
-/*! \brief A class used to interpret the contents of a memory block */
-class MemoryBlock {
- public:
+// MemoryBlock represents Each allocated memory block, which contains
+// MemoryBlock::Desc and the payload.
+struct MemoryBlock {
   enum Type {
     FREE_CHUNK,    // memory is free and idle
     ARENA_CHUNK,   // memory is being occupied
@@ -33,57 +33,96 @@ class MemoryBlock {
     INVALID_CHUNK  // memory is invalid
   };
 
- public:
-  void init(MetadataCache& cache, Type t, size_t index, size_t size,
+  // init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
+  // If it is a CPU memory block, the MetadataCache writes the
+  // MemoryBlock::Desc to the beginning of the block; or, if it is a GPU memory
+  // block, the MetadataCache writes the Meatadata to a std::map in
+  // the CPU.
+  void init(MetadataCache* cache, Type t, size_t index, size_t size,
             void* left_buddy, void* right_buddy);
 
- public:
-  /*! \brief The type of the allocation */
-  Type type(MetadataCache& cache) const;
-
-  /*! \brief The size of the data region */
-  size_t size(MetadataCache& cache) const;
+  // All these accessors returns fields in the MemoryBlock::Desc of the memory
+  // block.  They all need a MetadataCache instance as their first
+  // parameter because they read the MemoryBlock::Desc from the cache.
+  Type type(const MetadataCache& cache) const;
+  size_t size(const MetadataCache& cache) const;
+  size_t index(const MetadataCache& cache) const;
+  size_t total_size(const MetadataCache& cache) const;
+  bool has_left_buddy(const MetadataCache& cache) const;
+  bool has_right_buddy(const MetadataCache& cache) const;
+  MemoryBlock* left_buddy(const MetadataCache& cache) const;
+  MemoryBlock* right_buddy(const MetadataCache& cache) const;
 
-  /*! \brief An index to track the allocator */
-  size_t index(MetadataCache& cache) const;
+  // Split the allocation into left/right blocks.
+  void split(MetadataCache* cache, size_t size);
 
-  /*! \brief The total size of the block */
-  size_t total_size(MetadataCache& cache) const;
+  // Merge left and right blocks together.
+  void merge(MetadataCache* cache, MemoryBlock* right_buddy);
 
-  /*! \brief Check the left buddy of the block */
-  bool has_left_buddy(MetadataCache& cache) const;
+  // Mark the allocation as free.
+  void mark_as_free(MetadataCache* cache);
 
-  /*! \brief Check the right buddy of the block */
-  bool has_right_buddy(MetadataCache& cache) const;
-
-  /*! \brief Get the left buddy */
-  MemoryBlock* left_buddy(MetadataCache& cache) const;
-
-  /*! \brief Get the right buddy */
-  MemoryBlock* right_buddy(MetadataCache& cache) const;
-
- public:
-  /*! \brief Split the allocation into left/right blocks */
-  void split(MetadataCache& cache, size_t size);
+  // Change the type of the allocation.
+  void set_type(MetadataCache* cache, Type t);
 
-  /*! \brief Merge left and right blocks together */
-  void merge(MetadataCache& cache, MemoryBlock* right_buddy);
-
-  /*! \brief Mark the allocation as free */
-  void mark_as_free(MetadataCache& cache);
-
-  /*! \brief Change the type of the allocation */
-  void set_type(MetadataCache& cache, Type t);
-
- public:
-  /*! \brief Get a pointer to the memory block's data */
   void* data() const;
-
-  /*! \brief Get a pointer to the memory block's metadata */
   MemoryBlock* metadata() const;
 
+  // MemoryBlock::Desc describes a MemoryBlock.
+  struct Desc {
+    Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
+         MemoryBlock* r);
+    Desc();
+
+    // Updates guard_begin and guard_end by hashes of the Metadata object.
+    void update_guards();
+
+    // Checks that guard_begin and guard_end are hashes of the Metadata object.
+    bool check_guards() const;
+
+    // TODO(gangliao): compress this
+    size_t guard_begin = 0;
+    MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK;
+    size_t index = 0;
+    size_t size = 0;
+    size_t total_size = 0;
+    MemoryBlock* left_buddy = nullptr;
+    MemoryBlock* right_buddy = nullptr;
+    size_t guard_end = 0;
+  };
+};
+
+// A cache for accessing memory block meta-data that may be expensive
+// to access directly.  This class exists to unify the
+// MemoryBlock::Desc format between GPU and CPU allocations. It should
+// be removed when the CPU can access all GPU allocations directly via
+// UVM.
+class MetadataCache {
  public:
-  static size_t overhead();
+  explicit MetadataCache(bool uses_gpu);
+
+  // Disable copying and assignment.
+  MetadataCache(const MetadataCache&) = delete;
+  MetadataCache& operator=(const MetadataCache&) = delete;
+
+  // Returns the MemoryBlock::Desc for a memory block.  When MetadataCache is
+  // used to manage CPU memory, the MemoryBlock::Desc resides at the beginning
+  // of the memory block; when used to manage GPU memory, the
+  // Meatadata resides in CPU memory indexed by cache_.
+  MemoryBlock::Desc load(const MemoryBlock* memory_block) const;
+
+  // Saves the MemoryBlock::Desc of a memory block into the cache.  For CPU
+  // memory block, writes the MemoryBlock::Desc to the beginning of the memory
+  // block; whereas for GPU memory, writes it to cache_.
+  void save(MemoryBlock* memory_block, const MemoryBlock::Desc& meta_data);
+
+  // For GPU memory block, erases its MemoryBlock::Desc from cache_.
+  void invalidate(MemoryBlock* memory_block);
+
+ private:
+  typedef std::unordered_map<const MemoryBlock*, MemoryBlock::Desc> MetadataMap;
+  MetadataMap cache_;
+  bool uses_gpu_;
 };
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/detail/memory_block_desc.cc b/paddle/fluid/memory/detail/memory_block_desc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..393dd9209c0aa443cd17c29b2f9de6eafb48bac9
--- /dev/null
+++ b/paddle/fluid/memory/detail/memory_block_desc.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <functional>
+
+#include "paddle/fluid/memory/detail/memory_block.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
+                        MemoryBlock* l, MemoryBlock* r)
+    : type(t),
+      index(i),
+      size(s),
+      total_size(ts),
+      left_buddy(l),
+      right_buddy(r) {}
+
+MemoryBlock::Desc::Desc()
+    : type(MemoryBlock::INVALID_CHUNK),
+      index(0),
+      size(0),
+      total_size(0),
+      left_buddy(nullptr),
+      right_buddy(nullptr) {}
+
+namespace {
+
+template <class T>
+inline void hash_combine(std::size_t* seed, const T& v) {
+  std::hash<T> hasher;
+  (*seed) ^= hasher(v) + 0x9e3779b9 + ((*seed) << 6) + ((*seed) >> 2);
+}
+
+inline size_t hash(const MemoryBlock::Desc& metadata, size_t initial_seed) {
+  size_t seed = initial_seed;
+
+  hash_combine(&seed, static_cast<size_t>(metadata.type));
+  hash_combine(&seed, metadata.index);
+  hash_combine(&seed, metadata.size);
+  hash_combine(&seed, metadata.total_size);
+  hash_combine(&seed, metadata.left_buddy);
+  hash_combine(&seed, metadata.right_buddy);
+
+  return seed;
+}
+
+}  // namespace
+
+void MemoryBlock::Desc::update_guards() {
+  guard_begin = hash(*this, 1);
+  guard_end = hash(*this, 2);
+}
+
+bool MemoryBlock::Desc::check_guards() const {
+  return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2);
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc
index 43249e842ad4d2419fed041e6c9056021e9663cd..b86e4f38c42a26e155f276f9b73cbed1d0d83f7d 100644
--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/memory/detail/meta_cache.h"
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/platform/assert.h"
@@ -23,29 +22,28 @@ namespace detail {
 
 MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
 
-Metadata MetadataCache::load(const MemoryBlock* block) {
+MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
   if (uses_gpu_) {
-    auto existing_metadata = cache_.find(block);
-    PADDLE_ASSERT(existing_metadata->second.check_guards());
-    return existing_metadata->second;
+    auto existing_desc = cache_.find(block);
+    PADDLE_ASSERT(existing_desc->second.check_guards());
+    return existing_desc->second;
   } else {
-    auto* meta = reinterpret_cast<const Metadata*>(block);
-    VLOG(10) << "Load MetaData type=" << meta->type;
-    PADDLE_ASSERT(meta->check_guards());
-    return *reinterpret_cast<const Metadata*>(block);
+    auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
+    VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
+    PADDLE_ASSERT(desc->check_guards());
+    return *reinterpret_cast<const MemoryBlock::Desc*>(block);
   }
 }
 
-void MetadataCache::store(MemoryBlock* block,
-                          const Metadata& original_metadata) {
-  auto metadata = original_metadata;
-
-  metadata.update_guards();
+void MetadataCache::save(MemoryBlock* block,
+                         const MemoryBlock::Desc& original_desc) {
+  auto desc = original_desc;
+  desc.update_guards();
 
   if (uses_gpu_) {
-    cache_[block] = metadata;
+    cache_[block] = desc;
   } else {
-    *reinterpret_cast<Metadata*>(block) = metadata;
+    *reinterpret_cast<MemoryBlock::Desc*>(block) = desc;
   }
 }
 
diff --git a/paddle/fluid/memory/detail/meta_cache.h b/paddle/fluid/memory/detail/meta_cache.h
deleted file mode 100644
index 3283d756a6e7f7f1750442039797846bdad51125..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/meta_cache.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/memory/detail/meta_data.h"
-
-#include <unordered_map>
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-/**
- *  \brief A cache for accessing memory block meta-data that may be expensive
- *         to access directly.
- *
- *  \note  This class exists to unify the metadata format between GPU and CPU
- *         allocations. It should be removed when the CPU can access all GPU
- *         allocations directly via UVM.
- */
-class MetadataCache {
- public:
-  explicit MetadataCache(bool uses_gpu);
-
- public:
-  /*! \brief Load the associated metadata for the specified memory block. */
-  Metadata load(const MemoryBlock* memory_block);
-
-  /*! \brief Store the associated metadata for the specified memory block. */
-  void store(MemoryBlock* memory_block, const Metadata& meta_data);
-
-  /*! \brief Indicate that the specified metadata will no longer be used. */
-  void invalidate(MemoryBlock* memory_block);
-
- public:
-  MetadataCache(const MetadataCache&) = delete;
-  MetadataCache& operator=(const MetadataCache&) = delete;
-
- private:
-  bool uses_gpu_;
-
- private:
-  typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
-
- private:
-  MetadataMap cache_;
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/meta_data.cc b/paddle/fluid/memory/detail/meta_data.cc
deleted file mode 100644
index ad862af1705835c495a30232aa2bba2d2a56ad89..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/meta_data.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/detail/meta_data.h"
-
-#include <functional>
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
-                   MemoryBlock* l, MemoryBlock* r)
-    : type(t),
-      index(i),
-      size(s),
-      total_size(ts),
-      left_buddy(l),
-      right_buddy(r) {}
-
-Metadata::Metadata()
-    : type(MemoryBlock::INVALID_CHUNK),
-      index(0),
-      size(0),
-      total_size(0),
-      left_buddy(nullptr),
-      right_buddy(nullptr) {}
-
-template <class T>
-inline void hash_combine(std::size_t& seed, const T& v) {
-  std::hash<T> hasher;
-  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
-}
-
-inline size_t hash(const Metadata* metadata, size_t initial_seed) {
-  size_t seed = initial_seed;
-
-  hash_combine(seed, (size_t)metadata->type);
-  hash_combine(seed, metadata->index);
-  hash_combine(seed, metadata->size);
-  hash_combine(seed, metadata->total_size);
-  hash_combine(seed, metadata->left_buddy);
-  hash_combine(seed, metadata->right_buddy);
-
-  return seed;
-}
-
-void Metadata::update_guards() {
-  guard_begin = hash(this, 1);
-  guard_end = hash(this, 2);
-}
-
-bool Metadata::check_guards() const {
-  return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
-}
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/meta_data.h b/paddle/fluid/memory/detail/meta_data.h
deleted file mode 100644
index 14895ee8727e98186b1f1295321951e12753fef6..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/detail/meta_data.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/memory/detail/memory_block.h"
-
-#include <stddef.h>
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-class Metadata {
- public:
-  Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
-           MemoryBlock* r);
-  Metadata();
-
- public:
-  /*! \brief Update the guards when metadata is changed */
-  void update_guards();
-
-  /*! \brief Check consistency to previous modification */
-  bool check_guards() const;
-
- public:
-  // TODO(gangliao): compress this
-  // clang-format off
-  size_t            guard_begin = 0;
-  MemoryBlock::Type type        = MemoryBlock::INVALID_CHUNK;
-  size_t            index       = 0;
-  size_t            size        = 0;
-  size_t            total_size  = 0;
-  MemoryBlock*      left_buddy  = nullptr;
-  MemoryBlock*      right_buddy = nullptr;
-  size_t            guard_end   = 0;
-  // clang-format on
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 8ac8978120ad5930cd80272189ac0a83a77b2617..9b1ab1e228dd758b52975abc4c4aa0bdeadbe2de 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -13,15 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/detail/system_allocator.h"
-#include "paddle/fluid/platform/assert.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/gpu_info.h"
 
 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
 #include <algorithm>   // for std::max
 
 #include "gflags/gflags.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
 
 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
@@ -34,28 +35,30 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-void* CPUAllocator::Alloc(size_t& index, size_t size) {
+void* CPUAllocator::Alloc(size_t* index, size_t size) {
   // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
   // malloc might not return nullptr if size is zero, but the returned
   // pointer shall not be dereferenced -- so we make it nullptr.
   if (size <= 0) return nullptr;
 
-  index = 0;  // unlock memory
+  *index = 0;  // unlock memory
 
-  void* p;
+  void* p = nullptr;
 
 #ifdef PADDLE_WITH_MKLDNN
   // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
   // memory alignment
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!",
+                    size);
 #else
-  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0);
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!",
+                    size);
 #endif
   PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
 
   if (p != nullptr) {
     if (FLAGS_use_pinned_memory) {
-      index = 1;
+      *index = 1;
       mlock(p, size);  // lock memory
     }
   }
@@ -74,14 +77,25 @@ bool CPUAllocator::UseGpu() const { return false; }
 
 #ifdef PADDLE_WITH_CUDA
 
-void* GPUAllocator::Alloc(size_t& index, size_t size) {
+void* GPUAllocator::Alloc(size_t* index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0.  We just make sure it does.
   if (size <= 0) return nullptr;
   void* p;
+  int prev_id;
+  cudaGetDevice(&prev_id);
+  if (prev_id != gpu_id_) {
+    cudaSetDevice(gpu_id_);
+  }
+
   cudaError_t result = cudaMalloc(&p, size);
+
+  if (prev_id != gpu_id_) {
+    cudaSetDevice(prev_id);
+  }
+
   if (result == cudaSuccess) {
-    index = 0;
+    *index = 0;
     gpu_alloc_size_ += size;
     return p;
   } else {
@@ -119,6 +133,60 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
 
 bool GPUAllocator::UseGpu() const { return true; }
 
+// PINNED memory allows direct DMA transfers by the GPU to and from system
+// memory. It’s locked to a physical address.
+void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
+  if (size <= 0) return nullptr;
+
+  // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size
+  // of host pinned allocation. Allocates too much would reduce
+  // the amount of memory available to the underlying system for paging.
+  size_t usable =
+      paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
+
+  if (size > usable) {
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB pinned memory."
+                 << ", available " << usable / 1024.0 / 1024.0 << " MB";
+    return nullptr;
+  }
+
+  void* p;
+  // PINNED memory is visible to all CUDA contexts.
+  cudaError_t result = cudaMallocHost(&p, size);
+
+  if (result == cudaSuccess) {
+    *index = 1;  // PINNED memory
+    cuda_pinnd_alloc_size_ += size;
+    return p;
+  } else {
+    LOG(WARNING) << "cudaMallocHost failed.";
+    return nullptr;
+  }
+
+  return nullptr;
+}
+
+void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+  PADDLE_ASSERT(index == 1);
+
+  PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size);
+  cuda_pinnd_alloc_size_ -= size;
+  err = cudaFreeHost(p);
+
+  // Purposefully allow cudaErrorCudartUnloading, because
+  // that is returned if you ever call cudaFreeHost after the
+  // driver has already shutdown. This happens only if the
+  // process is terminating, in which case we don't care if
+  // cudaFreeHost succeeds.
+  if (err != cudaErrorCudartUnloading) {
+    PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free.");
+  }
+}
+
+bool CUDAPinnedAllocator::UseGpu() const { return false; }
+
 #endif
 
 }  // namespace detail
diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h
index e93c2c1e3231f7f42794dd78121072dbdb6abc41..a0386a2dad1bb7faf54197a47ca7a5b6d9488817 100644
--- a/paddle/fluid/memory/detail/system_allocator.h
+++ b/paddle/fluid/memory/detail/system_allocator.h
@@ -21,21 +21,22 @@ namespace memory {
 namespace detail {
 
 /**
- * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
- *        A BuddyAllocator object uses a SystemAllocator* pointing to the
+ * \brief SystemAllocator is the parent class of CPUAllocator,
+ *        CUDAPinnedAllocator and GPUAllocator. A BuddyAllocator
+ *        object uses a SystemAllocator* pointing to the
  *        underlying system allocator.
  */
 class SystemAllocator {
  public:
   virtual ~SystemAllocator() {}
-  virtual void* Alloc(size_t& index, size_t size) = 0;
+  virtual void* Alloc(size_t* index, size_t size) = 0;
   virtual void Free(void* p, size_t size, size_t index) = 0;
   virtual bool UseGpu() const = 0;
 };
 
 class CPUAllocator : public SystemAllocator {
  public:
-  virtual void* Alloc(size_t& index, size_t size);
+  virtual void* Alloc(size_t* index, size_t size);
   virtual void Free(void* p, size_t size, size_t index);
   virtual bool UseGpu() const;
 };
@@ -43,13 +44,26 @@ class CPUAllocator : public SystemAllocator {
 #ifdef PADDLE_WITH_CUDA
 class GPUAllocator : public SystemAllocator {
  public:
-  virtual void* Alloc(size_t& index, size_t size);
+  explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
+
+  virtual void* Alloc(size_t* index, size_t size);
   virtual void Free(void* p, size_t size, size_t index);
   virtual bool UseGpu() const;
 
  private:
   size_t gpu_alloc_size_ = 0;
   size_t fallback_alloc_size_ = 0;
+  int gpu_id_;
+};
+
+class CUDAPinnedAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t* index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t cuda_pinnd_alloc_size_ = 0;
 };
 #endif
 
diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
index d5df9e6897e9e788f14d2625e424c13949eeaa26..268260142c579ea9301d89fcec1613ce5b0e15a5 100644
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -22,11 +22,11 @@ limitations under the License. */
 
 DECLARE_bool(use_pinned_memory);
 
-void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
+void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) {
   bool freed = false;
   {
     size_t index;
-    void* p = a.Alloc(index, size);
+    void* p = a->Alloc(&index, size);
     if (size > 0) {
       EXPECT_NE(p, nullptr);
     } else {
@@ -36,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
     int* i = static_cast<int*>(p);
     std::shared_ptr<int> ptr(i, [&](void* p) {
       freed = true;
-      a.Free(p, size, index);
+      a->Free(p, size, index);
     });
   }
   EXPECT_TRUE(freed);
@@ -45,21 +45,21 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
 TEST(CPUAllocator, NoLockMem) {
   FLAGS_use_pinned_memory = false;
   paddle::memory::detail::CPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 
 TEST(CPUAllocator, LockMem) {
   FLAGS_use_pinned_memory = true;
   paddle::memory::detail::CPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 
 #ifdef PADDLE_WITH_CUDA
 TEST(GPUAllocator, Alloc) {
-  paddle::memory::detail::GPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
+  paddle::memory::detail::GPUAllocator a(0);
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
 }
 #endif
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd98ed81899440a46415d30b6d74fec2dac4c155
--- /dev/null
+++ b/paddle/fluid/memory/malloc.cc
@@ -0,0 +1,189 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/memory/malloc.h"
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/memory/detail/buddy_allocator.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+DEFINE_bool(init_allocated_mem, false,
+            "It is a mistake that the values of the memory allocated by "
+            "BuddyAllocator are always zeroed in some op's implementation. "
+            "To find this error in time, we use init_allocated_mem to indicate "
+            "that initializing the allocated memory with a small value "
+            "during unit testing.");
+DECLARE_double(fraction_of_gpu_memory_to_use);
+
+namespace paddle {
+namespace memory {
+
+using BuddyAllocator = detail::BuddyAllocator;
+
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static detail::BuddyAllocator* a = nullptr;
+  if (a == nullptr) {
+    a = new detail::BuddyAllocator(new detail::CPUAllocator,
+                                   platform::CpuMinChunkSize(),
+                                   platform::CpuMaxChunkSize());
+  }
+  return a;
+}
+
+template <>
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  void* p = GetCPUBuddyAllocator()->Alloc(size);
+  if (FLAGS_init_allocated_mem) {
+    memset(p, 0xEF, size);
+  }
+  VLOG(10) << "  pointer=" << p;
+  return p;
+}
+
+template <>
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+template <>
+size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
+  return GetCPUBuddyAllocator()->Used();
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetCUDADeviceCount();
+    as = new BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      as[gpu] = nullptr;
+    }
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator(gpu_id),
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
+    VLOG(10) << "\n\nNOTE: each GPU device use "
+             << FLAGS_fraction_of_gpu_memory_to_use * 100
+             << "% of GPU memory.\n"
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
+             << "' to change the fraction of GPU usage.\n\n";
+  }
+  return as[gpu_id];
+}
+
+template <>
+size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
+}
+
+template <>
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  auto* ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    int cur_dev = platform::GetCurrentDeviceId();
+    platform::SetDeviceId(place.device);
+    size_t avail, total;
+    platform::GpuMemoryUsage(&avail, &total);
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
+                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "total " << total;
+    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
+    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
+    platform::SetDeviceId(cur_dev);
+  }
+  if (FLAGS_init_allocated_mem) {
+    cudaMemset(ptr, 0xEF, size);
+  }
+  return ptr;
+}
+
+template <>
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
+}
+
+BuddyAllocator* GetCUDAPinnedBuddyAllocator() {
+  static BuddyAllocator* ba = NULL;
+  if (ba == NULL) {
+    ba = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                            platform::CUDAPinnedMinChunkSize(),
+                            platform::CUDAPinnedMaxChunkSize());
+  }
+  return ba;
+}
+
+template <>
+size_t Used<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place) {
+  return GetCUDAPinnedBuddyAllocator()->Used();
+}
+
+template <>
+void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
+                                       size_t size) {
+  auto* buddy_allocator = GetCUDAPinnedBuddyAllocator();
+  void* ptr = buddy_allocator->Alloc(size);
+
+  if (ptr == nullptr) {
+    LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
+                 << " bytes in CUDAPinnedPlace";
+  }
+  if (FLAGS_init_allocated_mem) {
+    memset(ptr, 0xEF, size);
+  }
+  return ptr;
+}
+
+template <>
+void Free<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place, void* p) {
+  GetCUDAPinnedBuddyAllocator()->Free(p);
+}
+#endif
+
+size_t Usage::operator()(const platform::CPUPlace& cpu) const {
+  return Used(cpu);
+}
+
+size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
+#ifdef PADDLE_WITH_CUDA
+  return Used(gpu);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
+}
+
+size_t Usage::operator()(const platform::CUDAPinnedPlace& cuda_pinned) const {
+#ifdef PADDLE_WITH_CUDA
+  return Used(cuda_pinned);
+#else
+  PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device.");
+#endif
+}
+
+size_t memory_usage(const platform::Place& p) {
+  return boost::apply_visitor(Usage(), p);
+}
+
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e6bfddd69cb16edf323d040ea5369cd551f299e
--- /dev/null
+++ b/paddle/fluid/memory/malloc.h
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace memory {
+
+/**
+ * \brief   Allocate memory block in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ * \param[in]  size   Allocation size.
+ *
+ * \return  Allocated memory block address.
+ *
+ * \note    If return nullptr, it indicates memory allocation failed
+ *          because insufficient memory in current system. When Alloc
+ *          function is invoked, you must check the returned memory
+ *          address is valid or not.
+ */
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+/**
+ * \brief   Free memory block in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ * \param[in]  ptr    Memory block address to free.
+ *
+ */
+template <typename Place>
+void Free(Place place, void* ptr);
+
+/**
+ * \brief   Total size of used memory in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ *
+ */
+template <typename Place>
+size_t Used(Place place);
+
+struct Usage : public boost::static_visitor<size_t> {
+  size_t operator()(const platform::CPUPlace& cpu) const;
+  size_t operator()(const platform::CUDAPlace& gpu) const;
+  size_t operator()(const platform::CUDAPinnedPlace& cuda_pinned) const;
+};
+
+size_t memory_usage(const platform::Place& p);
+
+/**
+ * \brief   Free memory block in one place.
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PODDeleter {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+
+ public:
+  explicit PODDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
+/**
+ * \brief   Free memory block in one place does not meet POD
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PlainDeleter {
+ public:
+  explicit PlainDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/malloc_test.cc b/paddle/fluid/memory/malloc_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d39466ef60c3750600dea726a6570397423d42f6
--- /dev/null
+++ b/paddle/fluid/memory/malloc_test.cc
@@ -0,0 +1,198 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/memory/malloc.h"
+
+#include <unordered_map>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/detail/memory_block.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
+
+inline bool is_aligned(void const *p) {
+  return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
+}
+
+size_t align(size_t size, paddle::platform::CPUPlace place) {
+  size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
+  size_t alignment = paddle::platform::CpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, CPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CPUPlace cpu;
+  p = paddle::memory::Alloc(cpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::platform::Place place = cpu;
+  EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
+
+  paddle::memory::Free(cpu, p);
+}
+
+TEST(BuddyAllocator, CPUMultAlloc) {
+  paddle::platform::CPUPlace cpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(cpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(cpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(size, cpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(cpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, cpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+size_t align(size_t size, paddle::platform::CUDAPlace place) {
+  size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
+  size_t alignment = paddle::platform::GpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, GPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CUDAPlace gpu(0);
+  p = paddle::memory::Alloc(gpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::platform::Place place = gpu;
+  EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place));
+
+  paddle::memory::Free(gpu, p);
+}
+
+TEST(BuddyAllocator, GPUMultAlloc) {
+  paddle::platform::CUDAPlace gpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(gpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(gpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(size, gpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(gpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, gpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
+  }
+}
+
+size_t align(size_t size, paddle::platform::CUDAPinnedPlace place) {
+  size += sizeof(paddle::memory::detail::MemoryBlock::Desc);
+  size_t alignment = paddle::platform::CUDAPinnedMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, CUDAPinnedAllocator) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CUDAPinnedPlace cpu;
+  p = paddle::memory::Alloc(cpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::platform::Place place = cpu;
+  EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
+
+  paddle::memory::Free(cpu, p);
+}
+
+TEST(BuddyAllocator, CUDAPinnedMultAllocator) {
+  paddle::platform::CUDAPinnedPlace cpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(cpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(cpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(size, cpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(cpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, cpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+}
+#endif
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index b991360d0442ec2d258443a931a9dcf10b332f1e..a177d4985fd0e2cca983b6873af89c60f526b811 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -32,7 +32,11 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
     const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(src_place.device);
-  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+  if (stream) {
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+  } else {
+    platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+  }
 }
 
 template <>
@@ -40,7 +44,11 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
     platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
     const void* src, size_t num, cudaStream_t stream) {
   platform::SetDeviceId(dst_place.device);
-  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+  if (stream) {
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+  } else {
+    platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
+  }
 }
 
 template <>
@@ -49,10 +57,66 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
     const void* src, size_t num, cudaStream_t stream) {
   if (dst_place == src_place) {
     platform::SetDeviceId(src_place.device);
-    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
+    if (stream) {
+      platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
+    } else {
+      platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
+    }
+  } else {
+    if (stream) {
+      platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
+                                   num, stream);
+    } else {
+      platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
+                                  num);
+    }
+  }
+}
+
+template <>
+void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
+    platform::CPUPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CPUPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+template <>
+void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
+    platform::CUDAPinnedPlace dst_place, void* dst,
+    platform::CUDAPlace src_place, const void* src, size_t num,
+    cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  if (stream) {
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+  } else {
+    platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost);
+  }
+}
+
+template <>
+void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
+    platform::CUDAPlace dst_place, void* dst,
+    platform::CUDAPinnedPlace src_place, const void* src, size_t num,
+    cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  if (stream) {
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
   } else {
-    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
-                            stream);
+    platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice);
   }
 }
 
diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc
deleted file mode 100644
index d07f89439a1ec37682f79799d5569cad2ab75818..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/memory.cc
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/memory.h"
-
-#include "glog/logging.h"
-
-#include "paddle/fluid/memory/detail/buddy_allocator.h"
-#include "paddle/fluid/memory/detail/system_allocator.h"
-#include "paddle/fluid/platform/gpu_info.h"
-
-DECLARE_double(fraction_of_gpu_memory_to_use);
-
-namespace paddle {
-namespace memory {
-
-using BuddyAllocator = detail::BuddyAllocator;
-
-BuddyAllocator* GetCPUBuddyAllocator() {
-  static detail::BuddyAllocator* a = nullptr;
-  if (a == nullptr) {
-    a = new detail::BuddyAllocator(new detail::CPUAllocator,
-                                   platform::CpuMinChunkSize(),
-                                   platform::CpuMaxChunkSize());
-  }
-  return a;
-}
-
-template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
-  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
-  void* p = GetCPUBuddyAllocator()->Alloc(size);
-  VLOG(10) << "  pointer=" << p;
-  return p;
-}
-
-template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
-  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
-  GetCPUBuddyAllocator()->Free(p);
-}
-
-template <>
-size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
-  return GetCPUBuddyAllocator()->Used();
-}
-
-#ifdef PADDLE_WITH_CUDA
-
-BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator** as = NULL;
-  if (as == NULL) {
-    int gpu_num = platform::GetCUDADeviceCount();
-    as = new BuddyAllocator*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = nullptr;
-    }
-  }
-  platform::SetDeviceId(gpu_id);
-  if (!as[gpu_id]) {
-    as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator,
-                                    platform::GpuMinChunkSize(),
-                                    platform::GpuMaxChunkSize());
-    VLOG(10) << "\n\nNOTE: each GPU device use "
-             << FLAGS_fraction_of_gpu_memory_to_use * 100
-             << "% of GPU memory.\n"
-             << "You can set GFlags environment variable '"
-             << "FLAGS_fraction_of_gpu_memory_to_use"
-             << "' to change the fraction of GPU usage.\n\n";
-  }
-  return as[gpu_id];
-}
-
-template <>
-size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
-  return GetGPUBuddyAllocator(place.device)->Used();
-}
-
-template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
-  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  auto* ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr) {
-    int cur_dev = platform::GetCurrentDeviceId();
-    platform::SetDeviceId(place.device);
-    size_t avail, total;
-    platform::GpuMemoryUsage(avail, total);
-    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
-                 << place.device << ", available " << avail << " bytes";
-    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
-    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
-    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
-    platform::SetDeviceId(cur_dev);
-  }
-  return ptr;
-}
-
-template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
-}
-
-#endif
-
-size_t Usage::operator()(const platform::CPUPlace& cpu) const {
-  return Used(cpu);
-}
-
-size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
-#ifdef PADDLE_WITH_CUDA
-  return Used(gpu);
-#else
-  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
-#endif
-}
-
-size_t memory_usage(const platform::Place& p) {
-  return boost::apply_visitor(Usage(), p);
-}
-
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h
index 7c5db815d6543f026ab99f7cf895a87db4e5a3d8..8d904e3be56abf0974ba7379f7ca1b676fcb0409 100644
--- a/paddle/fluid/memory/memory.h
+++ b/paddle/fluid/memory/memory.h
@@ -14,90 +14,5 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/platform/place.h"
-
-namespace paddle {
-namespace memory {
-
-/**
- * \brief   Allocate memory block in one place.
- *
- * \param[in]  place  Allocation place (CPU or GPU).
- * \param[in]  size   Allocation size.
- *
- * \return  Allocated memory block address.
- *
- * \note    If return nullptr, it indicates memory allocation failed
- *          because insufficient memory in current system. When Alloc
- *          function is invoked, you must check the returned memory
- *          address is valid or not.
- */
-template <typename Place>
-void* Alloc(Place place, size_t size);
-
-/**
- * \brief   Free memory block in one place.
- *
- * \param[in]  place  Allocation place (CPU or GPU).
- * \param[in]  ptr    Memory block address to free.
- *
- */
-template <typename Place>
-void Free(Place place, void* ptr);
-
-/**
- * \brief   Total size of used memory in one place.
- *
- * \param[in]  place  Allocation place (CPU or GPU).
- *
- */
-template <typename Place>
-size_t Used(Place place);
-
-struct Usage : public boost::static_visitor<size_t> {
-  size_t operator()(const platform::CPUPlace& cpu) const;
-  size_t operator()(const platform::CUDAPlace& gpu) const;
-};
-
-size_t memory_usage(const platform::Place& p);
-
-/**
- * \brief   Free memory block in one place.
- *
- * \note    In some cases, custom deleter is used to
- *          deallocate the memory automatically for
- *          std::unique_ptr<T> in tensor.h.
- *
- */
-template <typename T, typename Place>
-class PODDeleter {
-  static_assert(std::is_pod<T>::value, "T must be POD");
-
- public:
-  explicit PODDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
-
- private:
-  Place place_;
-};
-
-/**
- * \brief   Free memory block in one place does not meet POD
- *
- * \note    In some cases, custom deleter is used to
- *          deallocate the memory automatically for
- *          std::unique_ptr<T> in tensor.h.
- *
- */
-template <typename T, typename Place>
-class PlainDeleter {
- public:
-  explicit PlainDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, reinterpret_cast<void*>(ptr)); }
-
- private:
-  Place place_;
-};
-
-}  // namespace memory
-}  // namespace paddle
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc
deleted file mode 100644
index ae98d0d52542c49620a5d598b1089c168d39ede4..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/memory_test.cc
+++ /dev/null
@@ -1,144 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/memory/detail/meta_data.h"
-
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
-
-#include <gtest/gtest.h>
-#include <unordered_map>
-
-inline bool is_aligned(void const *p) {
-  return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
-}
-
-size_t align(size_t size, paddle::platform::CPUPlace place) {
-  size += sizeof(paddle::memory::detail::Metadata);
-  size_t alignment = paddle::platform::CpuMinChunkSize();
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-
-TEST(BuddyAllocator, CPUAllocation) {
-  void *p = nullptr;
-
-  EXPECT_EQ(p, nullptr);
-
-  paddle::platform::CPUPlace cpu;
-  p = paddle::memory::Alloc(cpu, 4096);
-
-  EXPECT_NE(p, nullptr);
-
-  paddle::platform::Place place = cpu;
-  EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
-
-  paddle::memory::Free(cpu, p);
-}
-
-TEST(BuddyAllocator, CPUMultAlloc) {
-  paddle::platform::CPUPlace cpu;
-
-  std::unordered_map<void *, size_t> ps;
-
-  size_t total_size = paddle::memory::Used(cpu);
-  EXPECT_EQ(total_size, 0UL);
-
-  for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
-    ps[paddle::memory::Alloc(cpu, size)] = size;
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(cpu) == total_size) continue;
-
-    size_t aligned_size = align(size, cpu);
-    total_size += aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
-  }
-
-  for (auto p : ps) {
-    EXPECT_EQ(is_aligned(p.first), true);
-    paddle::memory::Free(cpu, p.first);
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(cpu) == total_size) continue;
-
-    size_t aligned_size = align(p.second, cpu);
-    total_size -= aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
-  }
-}
-
-#ifdef PADDLE_WITH_CUDA
-
-size_t align(size_t size, paddle::platform::CUDAPlace place) {
-  size += sizeof(paddle::memory::detail::Metadata);
-  size_t alignment = paddle::platform::GpuMinChunkSize();
-  size_t remaining = size % alignment;
-  return remaining == 0 ? size : size + (alignment - remaining);
-}
-
-TEST(BuddyAllocator, GPUAllocation) {
-  void *p = nullptr;
-
-  EXPECT_EQ(p, nullptr);
-
-  paddle::platform::CUDAPlace gpu(0);
-  p = paddle::memory::Alloc(gpu, 4096);
-
-  EXPECT_NE(p, nullptr);
-
-  paddle::platform::Place place = gpu;
-  EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place));
-
-  paddle::memory::Free(gpu, p);
-}
-
-TEST(BuddyAllocator, GPUMultAlloc) {
-  paddle::platform::CUDAPlace gpu;
-
-  std::unordered_map<void *, size_t> ps;
-
-  size_t total_size = paddle::memory::Used(gpu);
-  EXPECT_EQ(total_size, 0UL);
-
-  for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
-    ps[paddle::memory::Alloc(gpu, size)] = size;
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(gpu) == total_size) continue;
-
-    size_t aligned_size = align(size, gpu);
-    total_size += aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
-  }
-
-  for (auto p : ps) {
-    EXPECT_EQ(is_aligned(p.first), true);
-    paddle::memory::Free(gpu, p.first);
-
-    // Buddy Allocator doesn't manage too large memory chunk
-    if (paddle::memory::Used(gpu) == total_size) continue;
-
-    size_t aligned_size = align(p.second, gpu);
-    total_size -= aligned_size;
-    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
-  }
-}
-
-#endif
diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0d898f59ee1b8c783c5357aa7e27581a993a6d30
--- /dev/null
+++ b/paddle/fluid/memory/pinned_memory_test.cu
@@ -0,0 +1,146 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include <unordered_map>
+
+#include "paddle/fluid/memory/detail/memory_block.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/memory/memory.h"
+
+#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/place.h"
+
+// This unit test is an example comparing the performance between using pinned
+// memory and not. In general, using pinned memory will be faster.
+template <typename T>
+__global__ void Kernel(T* output, int dim) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < dim) {
+    output[tid] = output[tid] * output[tid] / 100;
+  }
+}
+
+template <typename Place>
+float test_pinned_memory() {
+  Place cpu_place;
+  paddle::platform::CUDAPlace cuda_place;
+
+  const int data_size = 4096;
+  const int iteration = 10;
+
+  // create event start and end
+  cudaEvent_t start_e, stop_e, copying_e;
+  float elapsedTime = 0;
+  cudaEventCreate(&start_e);
+  cudaEventCreate(&stop_e);
+  cudaEventCreate(&copying_e);
+
+  // create computation stream, data copying stream
+  cudaStream_t computation_stream, copying_stream;
+  cudaStreamCreate(&computation_stream);
+  cudaStreamCreate(&copying_stream);
+
+  // create record event, pinned memory, gpu memory
+  std::vector<cudaEvent_t> record_event(iteration);
+  std::vector<float*> input_pinned_mem(iteration);
+  std::vector<float*> gpu_mem(iteration);
+  std::vector<float*> output_pinned_mem(iteration);
+
+  // initial data
+  for (int j = 0; j < iteration; ++j) {
+    cudaEventCreateWithFlags(&record_event[j], cudaEventDisableTiming);
+    cudaEventCreate(&(record_event[j]));
+    input_pinned_mem[j] = static_cast<float*>(
+        paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
+    output_pinned_mem[j] = static_cast<float*>(
+        paddle::memory::Alloc(cpu_place, data_size * sizeof(float)));
+    gpu_mem[j] = static_cast<float*>(
+        paddle::memory::Alloc(cuda_place, data_size * sizeof(float)));
+
+    for (int k = 0; k < data_size; ++k) {
+      input_pinned_mem[j][k] = k;
+    }
+  }
+
+  cudaEventRecord(start_e, computation_stream);
+
+  // computation
+  for (int m = 0; m < 30; ++m) {
+    for (int i = 0; i < iteration; ++i) {
+      // cpu -> GPU on computation stream.
+      // note: this operation is async for pinned memory.
+      paddle::memory::Copy(cuda_place, gpu_mem[i], cpu_place,
+                           input_pinned_mem[i], data_size * sizeof(float),
+                           computation_stream);
+
+      // call kernel on computation stream.
+      Kernel<<<4, 1024, 0, computation_stream>>>(gpu_mem[i], data_size);
+
+      // record event_computation on computation stream
+      cudaEventRecord(record_event[i], computation_stream);
+
+      // wait event_computation on copy stream.
+      // note: this operation is async.
+      cudaStreamWaitEvent(copying_stream, record_event[i], 0);
+
+      // copy data GPU->CPU, on copy stream.
+      // note: this operation is async for pinned memory.
+      paddle::memory::Copy(cpu_place, output_pinned_mem[i], cuda_place,
+                           gpu_mem[i], data_size * sizeof(float),
+                           copying_stream);
+    }
+  }
+
+  cudaEventRecord(copying_e, copying_stream);
+  cudaStreamWaitEvent(computation_stream, copying_e, 0);
+
+  cudaEventRecord(stop_e, computation_stream);
+
+  cudaEventSynchronize(start_e);
+  cudaEventSynchronize(stop_e);
+  cudaEventElapsedTime(&elapsedTime, start_e, stop_e);
+
+  // std::cout << cpu_place << " "
+  //          << "time consume:" << elapsedTime / 30 << std::endl;
+
+  for (int l = 0; l < iteration; ++l) {
+    for (int k = 0; k < data_size; ++k) {
+      float temp = input_pinned_mem[l][k];
+      temp = temp * temp / 100;
+      EXPECT_FLOAT_EQ(temp, output_pinned_mem[l][k]);
+    }
+  }
+
+  // destroy resource
+  cudaEventDestroy(copying_e);
+  cudaEventDestroy(start_e);
+  cudaEventDestroy(stop_e);
+  for (int j = 0; j < 10; ++j) {
+    cudaEventDestroy((record_event[j]));
+    paddle::memory::Free(cpu_place, input_pinned_mem[j]);
+    paddle::memory::Free(cpu_place, output_pinned_mem[j]);
+    paddle::memory::Free(cuda_place, gpu_mem[j]);
+  }
+  return elapsedTime / 30;
+}
+
+TEST(CPUANDCUDAPinned, CPUAllocatorAndCUDAPinnedAllocator) {
+  // Generally speaking, operation on pinned_memory is faster than that on
+  // unpinned-memory, but if this unit test fails frequently, please close this
+  // test for the time being.
+  float time1 = test_pinned_memory<paddle::platform::CPUPlace>();
+  float time2 = test_pinned_memory<paddle::platform::CUDAPinnedPlace>();
+  EXPECT_GT(time1, time2);
+}
diff --git a/paddle/fluid/operators/.clang-format b/paddle/fluid/operators/.clang-format
deleted file mode 100644
index 29282dc87e2c499988c17d90d47d44cd5cf7f115..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/.clang-format
+++ /dev/null
@@ -1,5 +0,0 @@
----
-Language:        Cpp
-BasedOnStyle:  Google
-Standard:  Cpp11 
-...
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 625e0f7561899d30b40f9daa56f743a37bdaa27f..ab1d2143330fb8cbfd535758a83bc71de939c4e0 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -3,8 +3,8 @@ string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
 list(REMOVE_DUPLICATES GENERAL_OPS)
 set(DEPS_OPS "")
-set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
-file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
+set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h)
+file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
 function(op_library TARGET)
     # op_library is a function to create op library. The interface is same as
     # cc_library. But it handle split GPU/CPU code and link some common library
@@ -12,6 +12,8 @@ function(op_library TARGET)
     set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
     set(cc_srcs)
     set(cu_srcs)
+    set(hip_cu_srcs)
+    set(miopen_hip_cc_srcs)
     set(cu_cc_srcs)
     set(cudnn_cu_cc_srcs)
     set(CUDNN_FILE)
@@ -36,10 +38,19 @@ function(op_library TARGET)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
             list(APPEND cu_srcs ${TARGET}.cu)
         endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
+            list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
+        endif()
         string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
             list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
         endif()
+        if(WITH_AMD_GPU)
+            string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
+                list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
+            endif()
+        endif()
         if(WITH_MKLDNN)
             string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
             if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
@@ -48,10 +59,14 @@ function(op_library TARGET)
         endif()
     else()
         foreach(src ${op_library_SRCS})
-            if (${src} MATCHES ".*\\.cu$")
+            if (${src} MATCHES ".*\\.hip.cu$")
+                list(APPEND hip_cu_srcs ${src})
+            elseif (${src} MATCHES ".*\\.cu$")
                 list(APPEND cu_srcs ${src})
             elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
                 list(APPEND cudnn_cu_cc_srcs ${src})
+            elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
+                list(APPEND miopen_hip_cc_srcs ${src})
             elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
                 list(APPEND mkldnn_cc_srcs ${src})
             elseif(${src} MATCHES ".*\\.cu.cc$")
@@ -76,28 +91,31 @@ function(op_library TARGET)
     if (WITH_GPU)
         nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
                 ${op_common_deps})
+    elseif (WITH_AMD_GPU)
+        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
     else()
         cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
             ${op_common_deps})
     endif()
 
     # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
     endforeach()
 
-    # The registration of USE_OP, please refer to paddle/framework/op_registry.h.
+    # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
     # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
     # And for detail pybind information, please see generated paddle/pybind/pybind.h.
     file(READ ${TARGET}.cc TARGET_CONTENT)
-    string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
-    string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
+    string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
+    string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
     if (one_register STREQUAL "")
         string(REPLACE "_op" "" TARGET "${TARGET}")
     else ()
-        string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
+        string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
         string(REPLACE "," "" TARGET "${TARGET}")
     endif()
 
@@ -114,7 +132,10 @@ function(op_library TARGET)
     list(LENGTH cu_srcs cu_srcs_len)
     list(LENGTH cu_cc_srcs cu_cc_srcs_len)
     list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
+    list(LENGTH hip_cu_srcs hip_cu_srcs_len)
+    list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
+    if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
+        ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
         file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
         set(pybind_flag 1)
     endif()
@@ -125,14 +146,31 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
     endif()
 
+    # pybind USE_OP_DEVICE_KERNEL for MIOPEN
+    if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
+    endif()
+
     # pybind USE_OP_DEVICE_KERNEL for MKLDNN
     if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
+      # Append first implemented MKLDNN activation operator
+      if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
+      else()
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
+      endif()
     endif()
 
     # pybind USE_OP
     if (${pybind_flag} EQUAL 0)
+      # NOTE(*): activation use macro to regist the kernels, set use_op manually.
+      if(${TARGET} STREQUAL "activation")
+        file(APPEND ${pybind_file} "USE_OP(relu);\n")
+      elseif(${TARGET} STREQUAL "fake_dequantize")
+        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+      else()
         file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
+      endif()
     endif()
 endfunction()
 
@@ -146,27 +184,66 @@ else()
     set(DEPS_OPS ${DEPS_OPS} nccl_op)
 endif()
 
-add_subdirectory(detail)
+set(DISTRIBUTE_DEPS "")
 if(WITH_DISTRIBUTE)
-    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+    add_subdirectory(distributed)
+    
+    set(DISTRIBUTE_DEPS "")
+    if(WITH_GRPC)
+        set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
+    else()
+        set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib)
+        if(WITH_BRPC_RDMA)
+            find_library(IBVERBS_LIBRARY NAMES ibverbs)
+            ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
+            SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY})
+
+
+            find_library(RDMACM_LIBRARY NAMES rdmacm)
+            ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL)
+            SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY})
+
+            set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm)
+        endif()
+    endif()
+
     set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
-    set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
+    foreach(dist_op "prefetch_op" "checkpoint_notify_op" "listen_and_serv_op" "send_op" "recv_op" "send_barrier_op" "fetch_barrier_op")
+        op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS})
+        set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    endforeach()
+    
+    #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
+    #        listen_and_serv_op sum_op executor SERIAL)
+    if(WITH_GPU)
+        set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL)
+        if(WITH_GRPC)
+            op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
+        else()
+            op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc)
+        endif()
+        set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    else()
+        set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
+    endif()
 else()
-    set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
+    set(DEPS_OPS ${DEPS_OPS}  checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
 endif()
 
-op_library(cond_op DEPS framework_proto tensor net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
-op_library(detection_output_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
+if (WITH_GPU AND TENSORRT_FOUND)
+    op_library(tensorrt_engine_op DEPS tensorrt_engine)
+    nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
+      DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
+      analysis)
+else()
+    set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
+endif()
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
 op_library(print_op DEPS lod_tensor)
@@ -203,10 +280,23 @@ op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
 op_library(concat_op DEPS concat)
 
+# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
+add_subdirectory(concurrency)
+op_library(channel_send_op DEPS concurrency)
+op_library(channel_recv_op DEPS concurrency)
+
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
+
+# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
+# Because the fully connected layer has only one MKLDNN's operator
+if(NOT WITH_MKLDNN)
+    list(REMOVE_ITEM GENERAL_OPS fc_op)
+endif(NOT WITH_MKLDNN)
+
 foreach(src ${GENERAL_OPS})
     op_library(${src})
 endforeach()
+
 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
 
 add_subdirectory(reader)
@@ -214,14 +304,20 @@ foreach(src ${READER_LIBRARY})
     set(OP_LIBRARY ${src} ${OP_LIBRARY})
 endforeach()
 
+add_subdirectory(detection)
+foreach(src ${DETECTION_LIBRARY})
+    set(OP_LIBRARY ${src} ${OP_LIBRARY})
+endforeach()
+
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
+set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency")
 
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
-cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
+cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
 nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
diff --git a/paddle/fluid/operators/accuracy_op.cc b/paddle/fluid/operators/accuracy_op.cc
index ac10d759fecb56635d1303fd383a5f9ea18f0a4d..42fcace17926641b5caf677eb3c8ba5222e37190 100644
--- a/paddle/fluid/operators/accuracy_op.cc
+++ b/paddle/fluid/operators/accuracy_op.cc
@@ -63,8 +63,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
 
 class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     // TODO(typhoonzero): support both inference value and indices.
     AddInput("Out", "The network output of topk (inferences)");
     AddInput("Indices", "The the network output of topk (indices)");
diff --git a/paddle/fluid/operators/accuracy_op.cu b/paddle/fluid/operators/accuracy_op.cu
index 630a4a2df2ca8f6afe81be3c455d255a0693fcc3..23b48c6fdf427348879de07c671c65327d6436d7 100644
--- a/paddle/fluid/operators/accuracy_op.cu
+++ b/paddle/fluid/operators/accuracy_op.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <thrust/execution_policy.h>
 #include <thrust/reduce.h>
 #include "paddle/fluid/operators/accuracy_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..137bca5e2b8e2754aed274970e08b03ee816a7f2
--- /dev/null
+++ b/paddle/fluid/operators/activation_mkldnn_op.cc
@@ -0,0 +1,349 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::stream;
+using platform::GetMKLDNNFormat;
+using platform::MKLDNNDeviceContext;
+using platform::to_void_cast;
+
+namespace {
+std::string gethash(const mkldnn::memory::dims &operand_dims,
+                    const mkldnn::algorithm algorithm) {
+  auto dim2str = [](const mkldnn::memory::dims &operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
+  };
+  return dim2str(operand_dims) + std::to_string(algorithm);
+}
+}  // namespace
+
+template <typename Functor>
+class MKLDNNActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
+
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};
+
+template <typename Functor>
+class MKLDNNActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input OutGrad tensor");
+
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto &attr : attrs) {
+      *attr.second = ctx.Attr<float>(attr.first);
+    }
+    functor(ctx);
+  }
+};
+
+template <typename T>
+void eltwise_forward(const framework::ExecutionContext &ctx,
+                     mkldnn::algorithm algorithm, const T alpha = 0,
+                     const T beta = 0) {
+  PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                 "It must use CPUPlace.");
+  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
+
+  const auto *x = ctx.Input<Tensor>("X");
+  auto *y = ctx.Output<Tensor>("Out");
+
+  const T *x_data = x->data<T>();
+  T *y_data = y->mutable_data<T>(ctx.GetPlace());
+
+  PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4,
+                 "Input dim must be with 2 or 4");
+
+  std::vector<int> src_tz = framework::vectorize2int(x->dims());
+
+  auto src_format =
+      src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format();
+
+  const std::string key = gethash(src_tz, algorithm);
+  const std::string key_src_data =
+      key + ctx.op().Output("Out") + "@eltwise_fwd_src_data";
+  const std::string key_src_layout =
+      key + ctx.op().Output("Out") + "@eltwise_fwd_src_layout";
+  const std::string key_with_layout = key + std::to_string(src_format);
+  const std::string key_src_mem = key_with_layout + "@eltwise_fwd_src_mem";
+  const std::string key_dst_mem = key_with_layout + "@eltwise_fwd_dst_mem";
+  const std::string key_fwd = key_with_layout + "@eltwise_fwd";
+  const std::string key_fwd_pd = key_with_layout + "@eltwise_fwd_pd";
+
+  // save input data and layout to be referred in backward path
+  auto p_src_data = std::make_shared<const T *>(x_data);
+  dev_ctx.SetBlob(key_src_data, p_src_data);
+  auto p_src_layout = std::make_shared<memory::format>(src_format);
+  dev_ctx.SetBlob(key_src_layout, p_src_layout);
+
+  auto p_fwd = std::static_pointer_cast<mkldnn::eltwise_forward>(
+      dev_ctx.GetBlob(key_fwd));
+
+  std::shared_ptr<memory> dst_memory;
+
+  if (p_fwd == nullptr) {
+    // create mkldnn memory for input X
+    auto src_md = platform::MKLDNNMemDesc(
+        src_tz, platform::MKLDNNGetDataType<T>(), src_format);
+    auto src_memory = std::shared_ptr<memory>(
+        new memory({src_md, mkldnn_engine}, to_void_cast(x_data)));
+    // save src_memory to be referred in backward path
+    dev_ctx.SetBlob(key_src_mem, src_memory);
+
+    // create primitive descriptor for activation forward and save it
+    auto forward_desc = mkldnn::eltwise_forward::desc(
+        mkldnn::prop_kind::forward_training, algorithm,
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
+        forward_desc, mkldnn_engine);
+
+    // save prim desc into global device context to be referred in backward path
+    dev_ctx.SetBlob(key_fwd_pd, forward_pd);
+
+    // create mkldnn memory for output y
+    dst_memory =
+        std::make_shared<memory>(forward_pd->dst_primitive_desc(), y_data);
+
+    dev_ctx.SetBlob(key_dst_mem, dst_memory);
+
+    // create activation primitive
+    p_fwd = std::make_shared<mkldnn::eltwise_forward>(*forward_pd, *src_memory,
+                                                      *dst_memory);
+    dev_ctx.SetBlob(key_fwd, p_fwd);
+  } else {
+    // primitives already exist
+    auto src_memory =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
+    PADDLE_ENFORCE(src_memory != nullptr,
+                   "Fail to find eltwise src_memory in device context.");
+    dst_memory =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_dst_mem));
+    PADDLE_ENFORCE(dst_memory != nullptr,
+                   "Fail to find eltwise dst_memory in device context.");
+
+    src_memory->set_data_handle(platform::to_void_cast(x_data));
+    dst_memory->set_data_handle(y_data);
+  }
+
+  // push primitive to stream and wait until it's executed
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_fwd);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  y->set_layout(DataLayout::kMKLDNN);
+  y->set_format(GetMKLDNNFormat(*dst_memory));
+}
+
+template <typename T>
+void eltwise_grad(const framework::ExecutionContext &ctx,
+                  mkldnn::algorithm algorithm, const T alpha = 0,
+                  const T beta = 0) {
+  auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+  const auto &mkldnn_engine = dev_ctx.GetEngine();
+
+  const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Out"));
+  auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+  const T *diff_y_data = diff_y->data<T>();
+  T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
+
+  std::vector<int> diff_dst_tz = framework::vectorize2int(diff_y->dims());
+
+  auto diff_y_format =
+      diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format();
+
+  const std::string key = gethash(diff_dst_tz, algorithm);
+  const std::string key_src_data =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_data";
+  const std::string key_src_layout =
+      key + ctx.op().Input("Out") + "@eltwise_fwd_src_layout";
+  const auto p_src_layout =
+      std::static_pointer_cast<memory::format>(dev_ctx.GetBlob(key_src_layout));
+  const std::string key_src_mem =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem";
+  const std::string key_fwd_pd =
+      key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd";
+  const std::string key_with_layouts =
+      key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format);
+  const std::string key_diff_src_mem =
+      key_with_layouts + "@eltwise_diff_src_mem";
+  const std::string key_diff_dst_mem =
+      key_with_layouts + "@eltwise_diff_dst_mem";
+  const std::string key_grad = key_with_layouts + "@eltwise_grad";
+
+  const auto p_src_data =
+      std::static_pointer_cast<T *>(dev_ctx.GetBlob(key_src_data));
+
+  auto src_memory =
+      std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(key_src_mem));
+  PADDLE_ENFORCE(src_memory != nullptr,
+                 "Fail to find src_memory in device context");
+  src_memory->set_data_handle(*p_src_data.get());
+
+  std::shared_ptr<memory> diff_src_memory;
+
+  auto p_grad = std::static_pointer_cast<mkldnn::eltwise_backward>(
+      dev_ctx.GetBlob(key_grad));
+
+  if (p_grad == nullptr) {
+    // create mkldnn memory for input diff_y
+    auto diff_dst_md = platform::MKLDNNMemDesc(
+        diff_dst_tz, platform::MKLDNNGetDataType<T>(), diff_y_format);
+    auto diff_dst_memory = std::shared_ptr<memory>(
+        new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data)));
+    dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory);
+
+    // retrieve eltwise primitive desc from device context
+    auto forward_pd =
+        std::static_pointer_cast<mkldnn::eltwise_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_fwd_pd));
+    PADDLE_ENFORCE(forward_pd != nullptr,
+                   "Fail to find eltwise_fwd_pd in device context");
+
+    // ceate primitive descriptor for activation backward
+    auto backward_desc = mkldnn::eltwise_backward::desc(
+        algorithm, diff_dst_memory->get_primitive_desc().desc(),
+        src_memory->get_primitive_desc().desc(), alpha, beta);
+    auto backward_pd = mkldnn::eltwise_backward::primitive_desc(
+        backward_desc, mkldnn_engine, *forward_pd);
+
+    // create mkldnn memory for output diff_src
+    diff_src_memory = std::make_shared<memory>(
+        backward_pd.diff_src_primitive_desc(), diff_x_data);
+    dev_ctx.SetBlob(key_diff_src_mem, diff_src_memory);
+
+    // create activation backward primitive
+    p_grad = std::make_shared<mkldnn::eltwise_backward>(
+        backward_pd, *src_memory, *diff_dst_memory, *diff_src_memory);
+    dev_ctx.SetBlob(key_grad, p_grad);
+  } else {
+    // primitives already exist
+    diff_src_memory = std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx.GetBlob(key_diff_src_mem));
+    auto diff_dst_memory = std::static_pointer_cast<mkldnn::memory>(
+        dev_ctx.GetBlob(key_diff_dst_mem));
+
+    diff_src_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_x_data));
+    diff_dst_memory->set_data_handle(
+        platform::to_void_reinterpret_cast(diff_y_data));
+  }
+
+  // push primitive to stream and wait until it's executed
+  std::vector<primitive> pipeline;
+  pipeline.push_back(*p_grad);
+  stream(stream::kind::eager).submit(pipeline).wait();
+
+  diff_x->set_layout(DataLayout::kMKLDNN);
+  diff_x->set_format(GetMKLDNNFormat(*diff_src_memory));
+}
+
+template <typename T, mkldnn::algorithm algorithm>
+struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    eltwise_forward<T>(ctx, algorithm);
+  }
+};
+
+template <typename T, mkldnn::algorithm algorithm>
+struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
+  void operator()(const framework::ExecutionContext &ctx) const {
+    eltwise_grad<T>(ctx, algorithm);
+  }
+};
+
+template <typename T>
+using ReluMKLDNNFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
+
+template <typename T>
+using TanhMKLDNNFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
+
+template <typename T>
+using SqrtMKLDNNFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
+
+template <typename T>
+using AbsMKLDNNFunctor =
+    MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
+
+template <typename T>
+using ReluMKLDNNGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
+
+template <typename T>
+using TanhMKLDNNGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
+
+template <typename T>
+using SqrtMKLDNNGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
+
+template <typename T>
+using AbsMKLDNNGradFunctor =
+    MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
+  REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace,       \
+                     ops::MKLDNNActivationKernel<ops::functor<float>>);    \
+  REGISTER_OP_KERNEL(                                                      \
+      act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace,               \
+      ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
+
+#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro)            \
+  __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \
+  __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \
+  __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \
+  __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor);
+
+FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index d74c47b981e51f12d99098818c71f3f6ec455d98..286b03d7b7d11a50f33f0190c1a5b9097ed0f4a2 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,327 +13,278 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/activation_op.h"
+#include <string>
+#include "paddle/fluid/operators/mkldnn_activation_op.h"
 
 namespace paddle {
 namespace operators {
 
+using paddle::framework::Tensor;
+
+#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)               \
+  class OP_NAME##OpMaker                                                \
+      : public ::paddle::framework::OpProtoAndCheckerMaker {            \
+   public:                                                              \
+    void Make() override {                                              \
+      AddInput("X", "Input of " #OP_NAME " operator");                  \
+      AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X");   \
+      AddAttr<bool>("use_mkldnn",                                       \
+                    "(bool, default false) Only used in mkldnn kernel") \
+          .SetDefault(false);                                           \
+      AddComment(#OP_COMMENT);                                          \
+    }                                                                   \
+  }
+
+#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)              \
+  class OP_NAME##GradMaker                                                   \
+      : public ::paddle::framework::SingleGradOpDescMaker {                  \
+   public:                                                                   \
+    using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
+                                                                             \
+   protected:                                                                \
+    std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {    \
+      auto* op = new ::paddle::framework::OpDesc();                          \
+      op->SetType(#KERNEL_TYPE "_grad");                                     \
+      op->SetInput("Out", Output("Out"));                                    \
+      op->SetInput(::paddle::framework::GradVarName("Out"),                  \
+                   OutputGrad("Out"));                                       \
+                                                                             \
+      op->SetAttrMap(Attrs());                                               \
+                                                                             \
+      op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X"));  \
+      return std::unique_ptr<::paddle::framework::OpDesc>(op);               \
+    }                                                                        \
+  }
+
+framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
+                                      const framework::OperatorWithKernel& oper,
+                                      const std::string& name) {
+  framework::LibraryType library{framework::LibraryType::kPlain};
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+#ifdef PADDLE_WITH_MKLDNN
+  auto it = oper.Attrs().find("use_mkldnn");
+  if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
+#endif
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()),
+      ctx.GetPlace(), layout, library);
+}
+
 class ActivationOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetKernelType(ctx, *this, "X");
+  }
 };
 
 class ActivationOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetKernelType(ctx, *this, "Out");
+  }
 };
 
-class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Sigmoid operator");
-    AddOutput("Out", "Output of Sigmoid operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC(
 Sigmoid Activation Operator
 
 $$out = \frac{1}{1 + e^{-x}}$$
 
-)DOC");
-  }
-};
+)DOC";
 
-class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of LogSigmoid operator");
-    AddOutput("Out", "Output of LogSigmoid operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC(
 Logsigmoid Activation Operator
 
-$$out = \log \frac{1}{1 + e^{-x}}$$
+$$out = \\log \\frac{1}{1 + e^{-x}}$$
 
-)DOC");
-  }
-};
+)DOC";
 
-class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Exp operator");
-    AddOutput("Out", "Output of Exp operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char ExpDoc[] = R"DOC(
 Exp Activation Operator.
 
 $out = e^x$
 
-)DOC");
-  }
-};
+)DOC";
 
-class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Relu operator");
-    AddOutput("Out", "Output of Relu operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char ReluDoc[] = R"DOC(
 Relu Activation Operator.
 
 $out = \max(x, 0)$
 
-)DOC");
-  }
-};
+)DOC";
 
-class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of LeakyRelu operator");
-    AddOutput("Out", "Output of LeakyRelu operator");
-    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
-    AddComment(R"DOC(
-LeakyRelu Activation Operator.
-
-$out = \max(x, \alpha * x)$
-
-)DOC");
-  }
-};
-
-class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Softshrink operator");
-    AddOutput("Out", "Output of Softshrink operator");
-    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
-    AddComment(R"DOC(
-Softshrink Activation Operator.
-
-$$
-out = \begin{cases} 
-    x - \lambda, \text{if } x > \lambda \\
-    x + \lambda, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
-
-)DOC");
-  }
-};
-
-class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Tanh operator");
-    AddOutput("Out", "Output of Tanh operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char TanhDoc[] = R"DOC(
 Tanh Activation Operator.
 
-$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
-)DOC");
-  }
-};
+)DOC";
 
-class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of TanhShrink operator");
-    AddOutput("Out", "Output of TanhShrink operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC(
 TanhShrink Activation Operator.
 
-$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
 
-)DOC");
-  }
-};
+)DOC";
 
-class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of HardShrink operator");
-    AddOutput("Out", "Output of HardShrink operator");
-    AddAttr<float>("threshold", "The value of threshold for HardShrink")
-        .SetDefault(0.5f);
-    AddComment(R"DOC(
-HardShrink Activation Operator.
-
-$$
-out = \begin{cases} 
-    x, \text{if } x > \lambda \\
-    x, \text{if } x < -\lambda \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
-
-)DOC");
-  }
-};
-
-class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Sqrt operator");
-    AddOutput("Out", "Output of Sqrt operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC(
 Sqrt Activation Operator.
 
 $out = \sqrt{x}$
 
-)DOC");
-  }
-};
+)DOC";
 
-class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Abs operator");
-    AddOutput("Out", "Output of Abs operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char AbsDoc[] = R"DOC(
 Abs Activation Operator.
 
 $out = |x|$
 
-)DOC");
-  }
-};
+)DOC";
 
-class CeilOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Ceil operator");
-    AddOutput("Out", "Output of Ceil operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char CeilDoc[] = R"DOC(
 Ceil Activation Operator.
 
 $out = ceil(x)$
 
-)DOC");
-  }
-};
+)DOC";
 
-class FloorOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Floor operator");
-    AddOutput("Out", "Output of Floor operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char FloorDoc[] = R"DOC(
 Floor Activation Operator.
 
 $out = floor(x)$
 
-)DOC");
-  }
-};
+)DOC";
 
-class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Round operator");
-    AddOutput("Out", "Output of Round operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char CosDoc[] = R"DOC(
+Cosine Activation Operator.
+
+$out = cos(x)$
+
+)DOC";
+
+__attribute__((unused)) constexpr char SinDoc[] = R"DOC(
+Sine Activation Operator.
+
+$out = sin(x)$
+
+)DOC";
+
+__attribute__((unused)) constexpr char RoundDoc[] = R"DOC(
 Round Activation Operator.
 
 $out = [x]$
 
-)DOC");
-  }
-};
+)DOC";
 
-class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Reciprocal operator");
-    AddOutput("Out", "Output of Reciprocal operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC(
 Reciprocal Activation Operator.
 
-$$out = \frac{1}{x}$$
+$$out = \\frac{1}{x}$$
 
-)DOC");
-  }
-};
+)DOC";
 
-class LogOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  LogOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Log operator");
-    AddOutput("Out", "Output of Log operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char LogDoc[] = R"DOC(
 Log Activation Operator.
 
 $out = \ln(x)$
 
 Natural logarithm of x.
 
-)DOC");
-  }
-};
+)DOC";
 
-class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Square operator");
-    AddOutput("Out", "Output of Square operator");
-    AddComment(R"DOC(
+__attribute__((unused)) constexpr char SquareDoc[] = R"DOC(
 Square Activation Operator.
 
 $out = x^2$
 
+)DOC";
+
+__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC(
+Softplus Activation Operator.
+
+$out = \ln(1 + e^{x})$
+
+)DOC";
+
+__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC(
+Softsign Activation Operator.
+
+$$out = \frac{x}{1 + |x|}$$
+
+)DOC";
+
+class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of LeakyRelu operator");
+    AddOutput("Out", "Output of LeakyRelu operator");
+    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
+    AddComment(R"DOC(
+LeakyRelu Activation Operator.
+
+$out = \max(x, \alpha * x)$
+
 )DOC");
   }
 };
 
-class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
+class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Softplus operator");
-    AddOutput("Out", "Output of Softplus operator");
+  void Make() override {
+    AddInput("X", "Input of Softshrink operator");
+    AddOutput("Out", "Output of Softshrink operator");
+    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
     AddComment(R"DOC(
-Softplus Activation Operator.
+:strong:`Softshrink Activation Operator`
 
-$out = \ln(1 + e^{x})$
+..  math::
+    out = \begin{cases} 
+         x - \lambda, \text{if } x > \lambda \\
+         x + \lambda, \text{if } x < -\lambda \\
+         0,  \text{otherwise}
+         \end{cases}
 
 )DOC");
   }
 };
 
-class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
+class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Softsign operator");
-    AddOutput("Out", "Output of Softsign operator");
+  void Make() override {
+    AddInput("X", "Input of HardShrink operator");
+    AddOutput("Out", "Output of HardShrink operator");
+    AddAttr<float>("threshold",
+                   "The value of threshold for HardShrink. [default: 0.5]")
+        .SetDefault(0.5f);
     AddComment(R"DOC(
-Softsign Activation Operator.
+:strong:`HardShrink activation operator`
 
-$$out = \frac{x}{1 + |x|}$$
+..  math::
+    out = \begin{cases}
+            x, \text{if } x > \lambda \\
+            x, \text{if } x < -\lambda \\
+            0,  \text{otherwise}
+          \end{cases}
 
 )DOC");
   }
@@ -341,8 +292,7 @@ $$out = \frac{x}{1 + |x|}$$
 
 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of BRelu operator");
     AddOutput("Out", "Output of BRelu operator");
     AddAttr<float>("t_min", "The min marginal value of BRelu")
@@ -360,8 +310,7 @@ $out = \max(\min(x, t_{min}), t_{max})$
 
 class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of SoftRelu operator");
     AddOutput("Out", "Output of SoftRelu operator");
     AddAttr<float>("threshold", "The threshold value of SoftRelu")
@@ -377,8 +326,7 @@ $out = \ln(1 + \exp(\max(\min(x, threshold), threshold))$
 
 class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of ELU operator");
     AddOutput("Out", "Output of ELU operator");
     AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
@@ -396,8 +344,7 @@ $out = \max(0, x) + \min(0, \alpha * (e^x - 1))$
 
 class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of Relu6 operator");
     AddOutput("Out", "Output of Relu6 operator");
     AddAttr<float>("threshold", "The threshold value of Relu6")
@@ -413,8 +360,7 @@ $out = \min(\max(0, x), 6)$
 
 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PowOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of Pow operator");
     AddOutput("Out", "Output of Pow operator");
     AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
@@ -429,8 +375,7 @@ $out = x^{factor}$
 
 class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of STanh operator");
     AddOutput("Out", "Output of STanh operator");
     AddAttr<float>("scale_a", "The scale parameter of a for the input")
@@ -440,7 +385,7 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 STanh Activation Operator.
 
-$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
+$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
 
 )DOC");
   }
@@ -448,30 +393,28 @@ $$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
 
 class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of ThresholdedRelu operator");
     AddOutput("Out", "Output of ThresholdedRelu operator");
-    AddAttr<float>("threshold", "The threshold location of activation")
+    AddAttr<float>("threshold",
+                   "The threshold location of activation. [default 1.0].")
         .SetDefault(1.0f);
     AddComment(R"DOC(
-ThresholdedRelu Activation Operator.
+:strong:`ThresholdedRelu activation operator`
 
-$$
-out = \begin{cases} 
-    x, \text{if } x > threshold \\
-    0,  \text{otherwise}
-    \end{cases}
-$$
+..  math::
 
+    out = \begin{cases}
+             x,  \text{if } x > threshold \\
+             0,  \text{otherwise}
+          \end{cases}
 )DOC");
   }
 };
 
 class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of HardSigmoid operator");
     AddOutput("Out", "Output of HardSigmoid operator");
     AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
@@ -496,108 +439,99 @@ It is recommended to use the defaults for this activation.
 
 class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input of Swish operator");
     AddOutput("Out", "Output of Swish operator");
     AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);
     AddComment(R"DOC(
 Swish Activation Operator.
 
-$$out = \frac{x}{1 + e^{- \beta x}}$$
+$$out = \\frac{x}{1 + e^{- \beta x}}$$
 
 )DOC");
   }
 };
 
+REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc);
+REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc);
+REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc);
+REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc);
+REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc);
+REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc);
+REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc);
+REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc);
+REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc);
+REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc);
+REGISTER_ACTIVATION_OP_MAKER(Cos, CosDoc);
+REGISTER_ACTIVATION_OP_MAKER(Sin, SinDoc);
+REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc);
+REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc);
+REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc);
+REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
+REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
+REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
+
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Floor, floor);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Sqrt, sqrt);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(SoftRelu, soft_relu);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu6, relu6);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(Reciprocal, reciprocal);
+REGISTER_ACTIVATION_OP_GRAD_MAKER(HardSigmoid, hard_sigmoid);
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
-            logsigmoid_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
-            tanh_shrink_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
-            softshrink_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
-            reciprocal_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker,
-            leaky_relu_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad,
-            ops::ActivationOpGrad);
-
-REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker,
-            hard_shrink_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker,
-            thresholded_relu_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
-            hard_sigmoid_grad, ops::ActivationOpGrad);
-
-REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
-            ops::ActivationOpGrad);
+#define FOR_EACH_INPLACE_OP_FUNCTOR(__macro) \
+  __macro(Sigmoid, sigmoid);                 \
+  __macro(Relu, relu);                       \
+  __macro(Exp, exp);                         \
+  __macro(Tanh, tanh);                       \
+  __macro(Ceil, ceil);                       \
+  __macro(Floor, floor);                     \
+  __macro(Sqrt, sqrt);                       \
+  __macro(SoftRelu, soft_relu);              \
+  __macro(Relu6, relu6);                     \
+  __macro(Reciprocal, reciprocal);           \
+  __macro(HardSigmoid, hard_sigmoid);
+
+#define FOR_EACH_OP_FUNCTOR(__macro) \
+  __macro(LogSigmoid, logsigmoid);   \
+  __macro(SoftShrink, softshrink);   \
+  __macro(Abs, abs);                 \
+  __macro(Cos, cos);                 \
+  __macro(Sin, sin);                 \
+  __macro(Round, round);             \
+  __macro(Log, log);                 \
+  __macro(Square, square);           \
+  __macro(BRelu, brelu);             \
+  __macro(Pow, pow);                 \
+  __macro(STanh, stanh);             \
+  __macro(Softplus, softplus);       \
+  __macro(Softsign, softsign);       \
+  __macro(LeakyRelu, leaky_relu);    \
+  __macro(TanhShrink, tanh_shrink);  \
+  __macro(ELU, elu);                 \
+  __macro(HardShrink, hard_shrink);  \
+  __macro(Swish, swish);             \
+  __macro(ThresholdedRelu, thresholded_relu);
+
+#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE)        \
+  REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
+                    ::paddle::operators::OP_NAME##OpMaker,          \
+                    ::paddle::operators::OP_NAME##GradMaker);       \
+  REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
+
+#define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE)                    \
+  REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp,     \
+                    ::paddle::operators::OP_NAME##OpMaker,              \
+                    ::paddle::framework::DefaultGradOpDescMaker<true>); \
+  REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
 
 #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)   \
   REGISTER_OP_CPU_KERNEL(                                                 \
@@ -612,4 +546,6 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
       ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
                                 ops::grad_functor<double>>);
 
+FOR_EACH_OP_FUNCTOR(REGISTER_ACTIVATION_OP);
+FOR_EACH_INPLACE_OP_FUNCTOR(REGISTER_INPLACE_ACTIVATION_OP);
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index b2633d017623c3a6a3bab2b416009d6d7c8fc1d4..27487b396ccf63d962defa6b270063ccb409164e 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1,33 +1,31 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 
-#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor)   \
-  REGISTER_OP_CUDA_KERNEL(                                                 \
-      act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
-                                      ops::functor<float>>,                \
-      ops::ActivationKernel<paddle::platform::CUDADeviceContext,           \
-                            ops::functor<double>>);                        \
-  REGISTER_OP_CUDA_KERNEL(                                                 \
-      act_type##_grad,                                                     \
-      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
-                                ops::grad_functor<float>>,                 \
-      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor)    \
+  REGISTER_OP_CUDA_KERNEL(                                                  \
+      act_type,                                                             \
+      ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<float>>,  \
+      ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<double>>, \
+      ops::ActivationKernel<plat::CUDADeviceContext,                        \
+                            ops::functor<plat::float16>>);                  \
+  REGISTER_OP_CUDA_KERNEL(                                                  \
+      act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext,   \
+                                                 ops::grad_functor<float>>, \
+      ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
                                 ops::grad_functor<double>>);
 
 FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index 8f791a6ca81c13a92fd8adf0d1620203bd4cf7d6..912415192659dc004f54a76e9cd1a20581d512a6 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -1,11 +1,8 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,13 +10,34 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <glog/logging.h>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/float16.h"
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
 
+/* Use ugly global variable, for the using in python layer side
+   Please refer to the layer_helper.py and get the details.
+ */
+static std::unordered_set<std::string> InplaceOpSet = {
+    "sigmoid", "exp",        "relu",  "tanh",      "sqrt",         "ceil",
+    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
+};
+
+static bool IsInplace(std::string op) { return InplaceOpSet.count(op); }
+
 template <typename DeviceContext, typename Functor>
 class ActivationKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -55,7 +73,6 @@ class ActivationGradKernel
  public:
   using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
     auto* Out = context.Input<framework::Tensor>("Out");
     auto* dOut =
         context.Input<framework::Tensor>(framework::GradVarName("Out"));
@@ -63,7 +80,6 @@ class ActivationGradKernel
     dX->mutable_data<T>(context.GetPlace());
 
     auto dout = framework::EigenVector<T>::Flatten(*dOut);
-    auto x = framework::EigenVector<T>::Flatten(*X);
     auto out = framework::EigenVector<T>::Flatten(*Out);
     auto dx = framework::EigenVector<T>::Flatten(*dX);
     auto* place =
@@ -73,7 +89,16 @@ class ActivationGradKernel
     for (auto& attr : attrs) {
       *attr.second = context.Attr<float>(attr.first);
     }
-    functor(*place, x, out, dout, dx);
+    bool inplace = functor.Inplace();
+    if (!inplace) {
+      auto* X = context.Input<framework::Tensor>("X");
+      auto x = framework::EigenVector<T>::Flatten(*X);
+      functor(*place, x, out, dout, dx);
+    } else {
+      VLOG(10) << " Inplace activation ";
+      auto x = framework::EigenVector<T>::Flatten(*dX);
+      functor(*place, x, out, dout, dx);
+    }
   }
 };
 
@@ -84,6 +109,14 @@ struct BaseActivationFunctor {
   using AttrPair = std::vector<std::pair<const char*, float*>>;
 
   AttrPair GetAttrs() { return AttrPair(); }
+
+  /* NOTE(*): Output reuse X memory if X is not dependented by its Gradient.
+     For example, sigmoid op's gradient didn't involve x, so its output can
+     reuse
+     input memory. But abs op's gradient use x, it can not be inplaced.
+     gradient did use x.
+   */
+  bool Inplace() const { return false; }
 };
 
 // sigmoid(x) = 1 / (1 + exp(-x))
@@ -97,6 +130,7 @@ struct SigmoidFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("sigmoid"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -151,6 +185,7 @@ struct ExpFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ExpGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("exp"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -169,10 +204,11 @@ struct ReluFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ReluGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("relu"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>();
+    dx.device(d) = dout * (out > static_cast<T>(0)).template cast<T>();
   }
 };
 
@@ -187,6 +223,7 @@ struct TanhFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct TanhGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("tanh"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -292,6 +329,7 @@ struct SqrtFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct SqrtGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("sqrt"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -311,10 +349,11 @@ struct CeilFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ZeroGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("ceil"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = static_cast<T>(0) / x;
+    dx.device(d) = static_cast<T>(0) / out;
   }
 };
 
@@ -327,6 +366,68 @@ struct FloorFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct Sine {
+  HOSTDEVICE T operator()(const T& val) const { return sin(val); }
+};
+
+template <>
+struct Sine<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(sin(static_cast<float>(val)));
+  }
+};
+
+template <typename T>
+struct Cosine {
+  HOSTDEVICE T operator()(const T& val) const { return cos(val); }
+};
+
+template <>
+struct Cosine<platform::float16> {
+  HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
+    return platform::float16(cos(static_cast<float>(val)));
+  }
+};
+
+// cosine'(x) = -sin(x)
+template <typename T>
+struct CosGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = -dout * x.unaryExpr(Sine<T>());
+  }
+};
+
+// cosine(x) = cos(x)
+template <typename T>
+struct CosFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Cosine<T>());
+  }
+};
+
+// sine'(x) = cos(x)
+template <typename T>
+struct SinGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * x.unaryExpr(Cosine<T>());
+  }
+};
+
+// sine(x) = sin(x)
+template <typename T>
+struct SinFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.unaryExpr(Sine<T>());
+  }
+};
+
 // round(x) = [x]
 template <typename T>
 struct RoundFunctor : public BaseActivationFunctor<T> {
@@ -365,6 +466,7 @@ struct ReciprocalFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
+  bool Inplace() const { return IsInplace("reciprocal"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -464,12 +566,14 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
+  bool Inplace() const { return IsInplace("relu6"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout *
-                   ((x > static_cast<T>(0)) * (x < static_cast<T>(threshold)))
-                       .template cast<T>();
+    dx.device(d) =
+        dout *
+        ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
+            .template cast<T>();
   }
 };
 
@@ -544,11 +648,12 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"threshold", &threshold}};
   }
+  bool Inplace() const { return IsInplace("soft_relu"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     auto tmp = static_cast<T>(threshold);
-    auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval();
+    auto temp = ((out > -tmp) * (out < tmp)).template cast<T>().eval();
     dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
   }
 };
@@ -724,7 +829,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
     return {{"slope", &slope}, {"offset", &offset}};
   }
-
+  bool Inplace() { return IsInplace("hard_sigmoid"); }
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
@@ -779,6 +884,8 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
   __macro(abs, AbsFunctor, AbsGradFunctor);                          \
   __macro(ceil, CeilFunctor, ZeroGradFunctor);                       \
   __macro(floor, FloorFunctor, ZeroGradFunctor);                     \
+  __macro(cos, CosFunctor, CosGradFunctor);                          \
+  __macro(sin, SinFunctor, SinGradFunctor);                          \
   __macro(round, RoundFunctor, ZeroGradFunctor);                     \
   __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
   __macro(log, LogFunctor, LogGradFunctor);                          \
diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc
index c9ed221a6e662e8c213fe1d34ff85a3f77483a3c..d1970515f58969948b1d2db5847e4344112f77f9 100644
--- a/paddle/fluid/operators/adadelta_op.cc
+++ b/paddle/fluid/operators/adadelta_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
 class AdadeltaOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -55,12 +56,17 @@ class AdadeltaOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
     ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
diff --git a/paddle/fluid/operators/adagrad_op.cc b/paddle/fluid/operators/adagrad_op.cc
index c990fe784380bf78a7f3594c0f49ef5e06e6caea..a3ef9ad9f91f1f626bd33876693ecc17ad76b96b 100644
--- a/paddle/fluid/operators/adagrad_op.cc
+++ b/paddle/fluid/operators/adagrad_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/adagrad_op.h"
+#include <vector>
 
 #include <cmath>
 
@@ -22,6 +23,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
 class AdagradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -55,12 +57,17 @@ class AdagradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("MomentOut", param_dims);
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("Moment", "(Tensor) Second moment");
diff --git a/paddle/fluid/operators/adagrad_op.cu b/paddle/fluid/operators/adagrad_op.cu
index e798101ca6a3a44de749a2d2219295bd8911dfac..b25268786d622bc7a94117849763833e528bef48 100644
--- a/paddle/fluid/operators/adagrad_op.cu
+++ b/paddle/fluid/operators/adagrad_op.cu
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/adagrad_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc
index 267dcab8104c337c8590180c8093098c756ab27d..5d670fe3b9d99a31a628ff707ff860564eca952e 100644
--- a/paddle/fluid/operators/adam_op.cc
+++ b/paddle/fluid/operators/adam_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
 class AdamOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -55,9 +56,12 @@ class AdamOp : public framework::OperatorWithKernel {
                       "Beta2 power accumulator should have 1 dimension");
 
     auto param_dims = ctx->GetInputDim("Param");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Grad"),
-        "Param and Grad input of AdamOp should have same dimension");
+    if (ctx->GetInputsVarType("Grad")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(
+          param_dims, ctx->GetInputDim("Grad"),
+          "Param and Grad input of AdamOp should have same dimension");
+    }
     PADDLE_ENFORCE_EQ(
         param_dims, ctx->GetInputDim("Moment1"),
         "Param and Moment1 input of AdamOp should have same dimension");
@@ -69,12 +73,17 @@ class AdamOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Moment1Out", param_dims);
     ctx->SetOutputDim("Moment2Out", param_dims);
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("LearningRate", "(Tensor) Learning rate");
@@ -83,9 +92,9 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
     AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
 
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-    AddOutput("Moment1Out", "(Tensor) Output first moment");
-    AddOutput("Moment2Out", "(Tensor) Output second moment");
+    AddOutput("ParamOut", "(Tensor) Output parameter").Reuse("Param");
+    AddOutput("Moment1Out", "(Tensor) Output first moment").Reuse("Moment1");
+    AddOutput("Moment2Out", "(Tensor) Output second moment").Reuse("Moment2");
 
     AddAttr<float>("beta1",
                    "(float, default 0.9) "
diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
index b332b6716369ca355454dc57fbd6cc2cbc71c658..a7a28b02b67f2ef180ec0e273dbe7ef555f88ce2 100644
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <math.h>  // for sqrt in CPU and CUDA
+#include <Eigen/Dense>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
@@ -24,8 +25,14 @@ namespace operators {
 
 namespace scatter = paddle::operators::math::scatter;
 
+struct GPUAdam;
+struct CPUAdam;
+
+template <typename T, typename Flavour>
+struct AdamFunctor;
+
 template <typename T>
-struct AdamFunctor {
+struct AdamFunctor<T, GPUAdam> {
   T beta1_;
   T beta2_;
   T epsilon_;
@@ -71,6 +78,7 @@ struct AdamFunctor {
 
     // Calculation
     lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+
     mom1 = beta1_ * mom1 + (1 - beta1_) * g;
     mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
     p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
@@ -82,6 +90,71 @@ struct AdamFunctor {
   }
 };
 
+template <typename T>
+struct AdamFunctor<T, CPUAdam> {
+  T beta1_;
+  T beta2_;
+  T epsilon_;
+
+  const T* beta1_pow_;
+  const T* beta2_pow_;
+  const T* moment1_;
+  T* moment1_out_;
+  const T* moment2_;
+  T* moment2_out_;
+  const T* lr_;
+  const T* grad_;
+  const T* param_;
+  T* param_out_;
+
+  AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
+              const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2,
+              T* mom2_out, const T* lr, const T* grad, const T* param,
+              T* param_out)
+      : beta1_(beta1),
+        beta2_(beta2),
+        epsilon_(epsilon),
+        beta1_pow_(beta1_pow),
+        beta2_pow_(beta2_pow),
+        moment1_(mom1),
+        moment1_out_(mom1_out),
+        moment2_(mom2),
+        moment2_out_(mom2_out),
+        lr_(lr),
+        grad_(grad),
+        param_(param),
+        param_out_(param_out) {}
+
+  void operator()(size_t numel) const {
+    Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>> g{
+        grad_, static_cast<Eigen::Index>(numel)};
+    Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>> mom1{
+        moment1_, static_cast<Eigen::Index>(numel)};
+    Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>> mom2{
+        moment2_, static_cast<Eigen::Index>(numel)};
+    Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>> param{
+        param_, static_cast<Eigen::Index>(numel)};
+
+    Eigen::Map<Eigen::Array<T, 1, Eigen::Dynamic>> param_out{
+        param_out_, static_cast<Eigen::Index>(numel)};
+    Eigen::Map<Eigen::Array<T, 1, Eigen::Dynamic>> moment1_out{
+        moment1_out_, static_cast<Eigen::Index>(numel)};
+    Eigen::Map<Eigen::Array<T, 1, Eigen::Dynamic>> moment2_out{
+        moment2_out_, static_cast<Eigen::Index>(numel)};
+
+    T lr = *lr_;
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
+
+    // Calculation
+    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+
+    moment1_out = beta1_ * mom1 + (1 - beta1_) * g;
+    moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g;
+    param_out = param - lr * (moment1_out / (moment2_out.sqrt() + epsilon_));
+  }
+};
+
 template <typename T>
 struct SparseAdamFunctor {
   T beta1_;
@@ -134,6 +207,7 @@ struct SparseAdamFunctor {
       T p = param_[rows_[i] * row_numel_ + j];
 
       lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+
       mom1 = beta1_ * mom1 + (1 - beta1_) * g;
       mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
       p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
@@ -177,22 +251,41 @@ class AdamOpKernel : public framework::OpKernel<T> {
 
     if (grad_var->IsType<framework::LoDTensor>()) {
       auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
-      AdamFunctor<T> functor(
-          beta1, beta2, epsilon, beta1_pow.template data<T>(),
-          beta2_pow.template data<T>(), mom1.template data<T>(),
-          mom1_out.template mutable_data<T>(ctx.GetPlace()),
-          mom2.template data<T>(),
-          mom2_out.template mutable_data<T>(ctx.GetPlace()),
-          lr.template data<T>(), grad.template data<T>(),
-          param.template data<T>(),
-          param_out.template mutable_data<T>(ctx.GetPlace()));
-      platform::ForRange<DeviceContext> for_range(
-          static_cast<const DeviceContext&>(ctx.device_context()),
-          param.numel());
-      for_range(functor);
+
+      if (platform::is_cpu_place(ctx.GetPlace())) {
+        AdamFunctor<T, CPUAdam> functor(
+            beta1, beta2, epsilon, beta1_pow.template data<T>(),
+            beta2_pow.template data<T>(), mom1.template data<T>(),
+            mom1_out.template mutable_data<T>(ctx.GetPlace()),
+            mom2.template data<T>(),
+            mom2_out.template mutable_data<T>(ctx.GetPlace()),
+            lr.template data<T>(), grad.template data<T>(),
+            param.template data<T>(),
+            param_out.template mutable_data<T>(ctx.GetPlace()));
+        functor(param.numel());
+      } else if (platform::is_gpu_place(ctx.GetPlace())) {
+        AdamFunctor<T, GPUAdam> functor(
+            beta1, beta2, epsilon, beta1_pow.template data<T>(),
+            beta2_pow.template data<T>(), mom1.template data<T>(),
+            mom1_out.template mutable_data<T>(ctx.GetPlace()),
+            mom2.template data<T>(),
+            mom2_out.template mutable_data<T>(ctx.GetPlace()),
+            lr.template data<T>(), grad.template data<T>(),
+            param.template data<T>(),
+            param_out.template mutable_data<T>(ctx.GetPlace()));
+
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(ctx.device_context()),
+            param.numel());
+        for_range(functor);
+      }
     } else if (grad_var->IsType<framework::SelectedRows>()) {
       auto& grad =
           Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
+      if (grad.rows().size() == 0) {
+        VLOG(3) << "grad row size is 0!!";
+        return;
+      }
       // merge duplicated rows if any.
       scatter::MergeAdd<DeviceContext, T> merge_func;
       auto grad_merge =
diff --git a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc
index 7e2f1cc66ebf8b7deebf55057a27129844129d5d..32062574bcf71ff96e451eaa6865b6bbfc3b1c80 100644
--- a/paddle/fluid/operators/adamax_op.cc
+++ b/paddle/fluid/operators/adamax_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
 class AdamaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -63,12 +64,17 @@ class AdamaxOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("MomentOut", param_dims);
     ctx->SetOutputDim("InfNormOut", param_dims);
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("LearningRate", "(Tensor) Learning rate");
diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8174d3735859b1fac40cd4c07545f34874d31ab7
--- /dev/null
+++ b/paddle/fluid/operators/arg_max_op.cc
@@ -0,0 +1,33 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.h"
+
+REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp,
+                  paddle::operators::ArgMaxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    arg_max,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, float>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, double>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    int64_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext, size_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CPUDeviceContext,
+                                    uint8_t>);
diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a147d77a9e9c577984028e1a6ed9582dda622069
--- /dev/null
+++ b/paddle/fluid/operators/arg_max_op.cu
@@ -0,0 +1,31 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    arg_max,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    int64_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    size_t>,
+    paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
+                                    uint8_t>);
diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cbdaefeda099c36a864289ef8195c20d09c55e6
--- /dev/null
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -0,0 +1,160 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include <type_traits>
+#include <vector>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace paddle {
+namespace operators {
+
+enum ArgMinMaxType { kArgMin, kArgMax };
+
+template <typename DeviceContext, typename T, typename Tout, int64_t Rank,
+          ArgMinMaxType argMinMaxValue>
+struct ArgMinMaxFunctor {};
+
+#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value)      \
+  template <typename DeviceContext, typename T, typename Tout, int64_t Rank>  \
+  struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank,                       \
+                          enum_argminmax_value> {                             \
+    void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
+                    framework::LoDTensor* out, int64_t axis) {                \
+      auto in_eigen = framework::EigenTensor<T, Rank>::From(in);              \
+      auto out_eigen = framework::EigenTensor<Tout, Rank - 1>::From(*out);    \
+      out_eigen.device(*(ctx.eigen_device())) =                               \
+          in_eigen.eigen_op_type(axis).template cast<Tout>();                 \
+    }                                                                         \
+  }
+
+DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin);
+DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax);
+
+template <typename DeviceContext, typename T, typename Tout,
+          ArgMinMaxType EnumArgMinMaxValue>
+class ArgMinMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& x = *(ctx.Input<framework::LoDTensor>("X"));
+    auto& out = *(ctx.Output<framework::LoDTensor>("Out"));
+    out.mutable_data<Tout>(ctx.GetPlace());
+    auto axis = ctx.Attr<int64_t>("axis");
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+#define CALL_ARG_MINMAX_FUNCTOR(rank)                                \
+  ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
+      functor##rank;                                                 \
+  functor##rank(dev_ctx, x, &out, axis)
+
+    switch (x.dims().size()) {
+      case 1:
+        CALL_ARG_MINMAX_FUNCTOR(1);
+        break;
+      case 2:
+        CALL_ARG_MINMAX_FUNCTOR(2);
+        break;
+      case 3:
+        CALL_ARG_MINMAX_FUNCTOR(3);
+        break;
+      case 4:
+        CALL_ARG_MINMAX_FUNCTOR(4);
+        break;
+      case 5:
+        CALL_ARG_MINMAX_FUNCTOR(5);
+        break;
+      case 6:
+        CALL_ARG_MINMAX_FUNCTOR(6);
+        break;
+      default:
+        PADDLE_THROW(
+            "%s operator doesn't supports tensors whose ranks are greater "
+            "than 6.",
+            (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"));
+        break;
+#undef CALL_ARG_MINMAX_FUNCTOR
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+using ArgMinKernel =
+    ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMin>;
+
+template <typename DeviceContext, typename T>
+using ArgMaxKernel =
+    ArgMinMaxKernel<DeviceContext, T, int64_t, ArgMinMaxType::kArgMax>;
+
+class ArgMinMaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    const auto& x_dims = ctx->GetInputDim("X");
+    int64_t axis = ctx->Attrs().Get<int64_t>("axis");
+    PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(),
+                   "'axis' must be inside [-Rank(X), Rank(X))");
+
+    auto x_rank = x_dims.size();
+    if (axis < 0) axis += x_rank;
+
+    std::vector<int64_t> vec;
+    for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]);
+    for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
+    ctx->SetOutputDim("Out", framework::make_ddim(vec));
+  }
+};
+
+class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ protected:
+  virtual const char* OpName() const = 0;
+  virtual const char* Name() const = 0;
+
+ public:
+  void Make() override {
+    AddInput("X", "Input tensor.");
+    AddOutput("Out", "Output tensor.");
+    AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
+    AddComment(string::Sprintf(R"DOC(
+      %s Operator.
+
+      Computes the indices of the %s elements of the input tensor's element
+      along the provided axis.
+)DOC",
+                               OpName(), Name()));
+  }
+};
+
+class ArgMinOpMaker : public BaseArgMinMaxOpMaker {
+ protected:
+  const char* OpName() const override { return "ArgMin"; }
+  const char* Name() const override { return "min"; }
+};
+
+class ArgMaxOpMaker : public BaseArgMinMaxOpMaker {
+ protected:
+  const char* OpName() const override { return "ArgMax"; }
+  const char* Name() const override { return "max"; }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..41f188029f17dbe8717afc0ca0760a39edc24b54
--- /dev/null
+++ b/paddle/fluid/operators/arg_min_op.cc
@@ -0,0 +1,33 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.h"
+
+REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp,
+                  paddle::operators::ArgMinOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    arg_min,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, float>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, double>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    int64_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext, size_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CPUDeviceContext,
+                                    uint8_t>);
diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4d020508505a6ebac8be41ce1e4f99d436b67ab5
--- /dev/null
+++ b/paddle/fluid/operators/arg_min_op.cu
@@ -0,0 +1,31 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/arg_min_max_op_base.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    arg_min,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    double>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    int64_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    int32_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    int16_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    size_t>,
+    paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
+                                    uint8_t>);
diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2f5a2545701991263c1ef842e9275b1edbfd2ca
--- /dev/null
+++ b/paddle/fluid/operators/argsort_op.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/argsort_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ArgsortOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ArgsortOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ArgsortOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Indices"),
+                   "Output(Indices) of ArgsortOp should not be null.");
+
+    auto in_dims = ctx->GetInputDim("X");
+    int axis = ctx->Attrs().Get<int>("axis");
+
+    auto num_dims = in_dims.size();
+    PADDLE_ENFORCE(axis < num_dims,
+                   "Attr(axis) %d of ArgsortOp is out of bounds for Input(X)'s "
+                   "rank %d.",
+                   axis, num_dims);
+    PADDLE_ENFORCE(axis >= -num_dims,
+                   "Attr(axis) %d of ArgsortOp must be not less than "
+                   "-rank(Input(X)) (%d).",
+                   axis, num_dims);
+
+    ctx->SetOutputDim("Out", in_dims);
+    ctx->SetOutputDim("Indices", in_dims);
+    ctx->ShareLoD("X", "Out");
+    ctx->ShareLoD("X", "Indices");
+  }
+};
+
+class ArgsortOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input of Argsort op.");
+    AddOutput("Out",
+              "(Tensor) The sorted tensor of Argsort op, with the same "
+              "shape as Input(X).");
+    AddOutput("Indices",
+              "(Tensor) The indices of a tensor giving the sorted order, with "
+              "the same shape as Input(X).");
+    AddComment(R"DOC(
+Argsort operator
+
+Performs sorting on the input tensor along the given axis and outputs two 
+tensors, Output(Out) and Output(Indices). They reserve the same shape 
+with Input(X), and Output(Out) represents the sorted tensor while 
+Output(Indices) gives the sorted order along the given axis Attr(axis).
+
+ )DOC");
+    AddAttr<int>("axis",
+                 "(int, default -1) The axis along which to sort the tensor. "
+                 "When axis < 0, the actual axis will be the |axis|'th "
+                 "counting backwards. Default -1, the last dimension.")
+        .SetDefault(-1);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(argsort,
+                       ops::ArgsortKernel<paddle::platform::CPUPlace, float>,
+                       ops::ArgsortKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7d5199aae7da4eed5afa6b8bd64c04a540b915d4
--- /dev/null
+++ b/paddle/fluid/operators/argsort_op.cu
@@ -0,0 +1,151 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thrust/execution_policy.h>
+#include <thrust/sort.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/argsort_op.h"
+#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+const int kMaxRank = 9;  // The max rank of a tensor allowed in Fluid
+
+__global__ void ComputeTargetIdx(const int64_t* in_dims, int dims_size,
+                                 int axis, int64_t n, int64_t* trg_idx,
+                                 int64_t* med_ids) {
+  int64_t index = threadIdx.x + blockDim.x * blockIdx.x;
+  if (index < n) {
+    int64_t shape_out_axis[kMaxRank - 1] = {0};
+    int64_t dims_out_axis[kMaxRank - 1] = {0};
+    int64_t tmp = index;
+    int64_t pos_in_axis = 0;
+    int64_t i = dims_size - 2;
+    int64_t dim_axis = 0;
+    for (int64_t j = dims_size - 1; j >= 0; --j) {
+      int64_t dim = in_dims[j];
+      if (j != axis) {
+        shape_out_axis[i] = tmp % dim;
+        dims_out_axis[i] = dim;
+        i--;
+      } else {
+        dim_axis = dim;
+        pos_in_axis = tmp % dim_axis;
+      }
+      tmp /= dim;
+    }
+    int64_t group = (dims_size > 1) ? shape_out_axis[0] : 0;
+    for (int64_t j = 0; j < dims_size - 2; ++j) {
+      group = group * dims_out_axis[j + 1] + shape_out_axis[j + 1];
+    }
+
+    int64_t traget_idx = group * dim_axis + pos_in_axis;
+    trg_idx[index] = traget_idx;
+    med_ids[traget_idx] = pos_in_axis;
+  }
+}
+
+template <typename T>
+__global__ void PermuteInData(const T* in, const int64_t* trg_idx, int64_t n,
+                              T* med_out) {
+  int index = threadIdx.x + blockDim.x * blockIdx.x;
+  if (index < n) {
+    med_out[trg_idx[index]] = in[index];
+  }
+}
+
+template <typename T>
+__global__ void Sort(int64_t axis_dim, int64_t groups, T* med_out,
+                     int64_t* med_ids) {
+  int index = threadIdx.x + blockDim.x * blockIdx.x;
+  if (index < groups) {
+    thrust::sort_by_key(thrust::device, med_out + index * axis_dim,
+                        med_out + axis_dim * (1 + index),
+                        med_ids + index * axis_dim);
+  }
+}
+
+template <typename T>
+__global__ void PermuteMediateData(const T* med_out, const int64_t* med_ids,
+                                   const int64_t* trg_idx, int64_t n, T* out,
+                                   int64_t* indices) {
+  int index = threadIdx.x + blockDim.x * blockIdx.x;
+  if (index < n) {
+    out[index] = med_out[trg_idx[index]];
+    indices[index] = med_ids[trg_idx[index]];
+  }
+}
+
+template <typename T>
+class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* indices = ctx.Output<Tensor>("Indices");
+    int axis = ctx.Attr<int>("axis");
+
+    auto in_dims = input->dims();
+    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+
+    const T* in_data = input->data<T>();
+    T* out_data = output->mutable_data<T>(ctx.GetPlace());
+    int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    int64_t numel = input->numel();
+    int64_t groups = numel / in_dims[axis];
+
+    std::vector<int64_t> in_dims_vec = vectorize(in_dims);
+    thrust::device_vector<int64_t> in_dims_dev(in_dims_vec.begin(),
+                                               in_dims_vec.end());
+    int64_t* in_dims_data = thrust::raw_pointer_cast(in_dims_dev.data());
+    // Mediate tensor for sorting data and indices
+    Tensor mediate_output, mediate_indices;
+    T* med_out_data =
+        mediate_output.mutable_data<T>(input->dims(), ctx.GetPlace());
+    int64_t* med_ids_data =
+        mediate_indices.mutable_data<int64_t>(in_dims, ctx.GetPlace());
+    // Target index of each element along the given axis in the mediate tensors
+    Tensor trg_idx_t;
+    int64_t* trg_idx = trg_idx_t.mutable_data<int64_t>(in_dims, ctx.GetPlace());
+
+    auto stream = ctx.cuda_device_context().stream();
+    const int num_threads = PADDLE_CUDA_NUM_THREADS;
+
+    ComputeTargetIdx<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>(
+        in_dims_data, in_dims.size(), axis, numel, trg_idx, med_ids_data);
+
+    PermuteInData<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>(
+        in_data, trg_idx, numel, med_out_data);
+
+    Sort<<<(groups - 1) / num_threads + 1, num_threads, 0, stream>>>(
+        in_dims[axis], groups, med_out_data, med_ids_data);
+
+    PermuteMediateData<<<(numel - 1) / num_threads + 1, num_threads, 0,
+                         stream>>>(med_out_data, med_ids_data, trg_idx, numel,
+                                   out_data, ids_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(argsort, paddle::operators::ArgsortOpCUDAKernel<float>,
+                        paddle::operators::ArgsortOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e9112cfb7cbe5f783b04729fb4dff3676c922bc
--- /dev/null
+++ b/paddle/fluid/operators/argsort_op.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class ArgsortKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<framework::Tensor>("X");
+    auto* output = ctx.Output<framework::Tensor>("Out");
+    auto* indices = ctx.Output<framework::Tensor>("Indices");
+    int axis = ctx.Attr<int>("axis");
+
+    auto in_dims = input->dims();
+    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
+
+    const T* in_data = input->data<T>();
+    T* out_data = output->mutable_data<T>(ctx.GetPlace());
+    int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    int64_t groups = input->numel() / in_dims[axis];
+    int64_t stride = (axis == in_dims.size() - 1)
+                         ? 1
+                         : framework::product(framework::slice_ddim(
+                               in_dims, axis + 1, in_dims.size()));
+
+    for (int64_t i = 0; i < groups; ++i) {
+      int64_t idx = i;
+      std::vector<int64_t> shape_vec(in_dims.size(), 0);
+      for (int64_t dim = in_dims.size() - 1; dim >= 0; --dim) {
+        if (dim != axis) {
+          shape_vec[dim] = idx % in_dims[dim];
+          idx /= in_dims[dim];
+        }
+      }
+
+      int64_t start_index = shape_vec[0];
+      for (int64_t dim = 0; dim < in_dims.size() - 1; ++dim) {
+        start_index = start_index * in_dims[dim + 1] + shape_vec[dim + 1];
+      }
+
+      std::vector<int64_t> org_index_vec(in_dims[axis], start_index);
+      for (int64_t j = 1; j < in_dims[axis]; ++j) {
+        org_index_vec[j] += j * stride;
+      }
+
+      std::sort(org_index_vec.begin(), org_index_vec.end(),
+                [in_data](const int64_t v1, const int64_t v2) {
+                  return in_data[v1] < in_data[v2];
+                });
+
+      for (size_t j = 0; j < org_index_vec.size(); ++j) {
+        int64_t index = start_index + j * stride;
+        out_data[index] = in_data[org_index_vec[j]];
+        ids_data[index] = (org_index_vec[j] - start_index) / stride;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h
index dbcc7abb0996268b5a3571ba113d9cc56f6f65a3..4309f0a5497456065e5c43bc8f7b265fa711f699 100644
--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index 5db2e4540ef170079328f24ac8d30f7b1901fa1e..149226e92d4d08a25c211bce686ff03c5d7ddf40 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -123,8 +123,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
 
 class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(std::vector<LodTensor>) A vector of tensors that is going to "
              "be casted to a big LoDTensor.");
diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc
index 39ae3c0040d04a6d901f1d6c992d547a6778c28e..d9294048a9e89662958fd5c6af4fcbe5da3814c2 100644
--- a/paddle/fluid/operators/assign_op.cc
+++ b/paddle/fluid/operators/assign_op.cc
@@ -56,6 +56,7 @@ class AssignFunctor {
  private:
   void copy_tensor(const framework::LoDTensor &lod_tensor,
                    framework::LoDTensor *out) const {
+    if (lod_tensor.numel() == 0) return;
     auto &out_tensor = *out;
     TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
     out_tensor.set_lod(lod_tensor.lod());
@@ -93,8 +94,7 @@ class AssignOp : public framework::OperatorBase {
 
 class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
              "could be LoDTensor, SelectedRows or LoDTensorArray.")
diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc
index e8123cb1a490be642d1061bba8129f63e681d3c3..a757916be7f6ece9b783d51d1051aac6a276795b 100644
--- a/paddle/fluid/operators/assign_value_op.cc
+++ b/paddle/fluid/operators/assign_value_op.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/assign_value_op.h"
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -43,8 +45,7 @@ class AssignValueOp : public framework::OperatorWithKernel {
 
 class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AssignValueOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput("Out", "(Tensor) Output tensor of assign_value operator.");
     AddAttr<std::vector<int>>("shape",
                               "(vector<int>) "
@@ -69,6 +70,7 @@ $$Out = values$$
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker);
+REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>,
                        ops::AssignValueKernel<float>);
diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h
index c7b1a55a5cd52bd2bacbdea3ee22c75c2a2c12d5..e749d6f6d3685f207f0ad4f2ebc7c3c7ae32992c 100644
--- a/paddle/fluid/operators/assign_value_op.h
+++ b/paddle/fluid/operators/assign_value_op.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc
index 71de78b1181daf4bd0b6d73508638857bafcf560..c9871a9fe6b3b0d0cf671c2d155715f92c94fd8f 100644
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/auc_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -49,8 +50,7 @@ class AucOp : public framework::OperatorWithKernel {
 
 class AucOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AucOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Out",
              "A floating point 2D tensor, values are in the range [0, 1]."
              "Each row is sorted in descending order. This input should be the"
diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h
index f4e8208c3f2e238a4acecab4579fc955092d5978..8b016c3d31ad83e66baeb298c61840cc529efa1e 100644
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -40,7 +42,7 @@ class AucKernel : public framework::OpKernel<T> {
     std::vector<float> thresholds_list;
     thresholds_list.reserve(num_thresholds);
     for (int i = 1; i < num_thresholds - 1; i++) {
-      thresholds_list[i] = (float)i / (num_thresholds - 1);
+      thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
     }
     const float kEpsilon = 1e-7;
     thresholds_list[0] = 0.0f - kEpsilon;
@@ -105,11 +107,12 @@ class AucKernel : public framework::OpKernel<T> {
     float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
     float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
     for (int i = 0; i < num_thresholds; i++) {
-      tp_rate_data[i] =
-          ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon);
-      fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon);
-      rec_rate_data[i] =
-          ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon);
+      tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+                        (tp_data[i] + fn_data[i] + epsilon);
+      fp_rate_data[i] =
+          static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
+      rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
+                         (tp_data[i] + fp_data[i] + epsilon);
     }
     *auc_data = 0.0f;
     if (curve == "ROC") {
diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f389eab605e087c535b9918264e6502217062505
--- /dev/null
+++ b/paddle/fluid/operators/average_accumulates_op.cc
@@ -0,0 +1,215 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/average_accumulates_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <>
+void GetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t* num_updates,
+    int64_t* num_accumulates, int64_t* old_num_accumulates) {
+  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
+  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
+  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
+
+  *old_num_accumulates = in_old_num_accumulates->data<int64_t>()[0];
+  *num_accumulates = in_num_accumulates->data<int64_t>()[0];
+  *num_updates = in_num_updates->data<int64_t>()[0];
+}
+
+template <>
+void SetAccumulators<paddle::platform::CPUDeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates,
+    int64_t num_accumulates, int64_t old_num_accumulates) {
+  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
+  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
+  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
+
+  out_old_num_accumulates->data<int64_t>()[0] = old_num_accumulates;
+  out_num_accumulates->data<int64_t>()[0] = num_accumulates;
+  out_num_updates->data<int64_t>()[0] = num_updates;
+}
+
+class AverageAccumulatesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("param"),
+        "Input (param) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_1"),
+        "Input (sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_2"),
+        "Input (sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_sum_3"),
+        "Input (sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_accumulates"),
+        "Input (in_num_accumulates) of average_accumulates op should "
+        "not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"),
+                   "Input (old_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("in_num_updates"),
+        "Input (num_updates) of average_accumulates op should not be null.");
+
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_1"),
+        "Output (sum_1) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_2"),
+        "Output (sum_2) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_sum_3"),
+        "Output (sum_3) of average_accumulates op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"),
+                   "Output (num_accumulates) of average_accumulates op should "
+                   "not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"),
+                   "Output (old_num_accumulates) of average_accumulates op "
+                   "should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("out_num_updates"),
+        "Output (num_updates) of average_accumulates op should not be null.");
+
+    auto in_dim = ctx->GetInputDim("param");
+
+    ctx->SetOutputDim("out_sum_1", in_dim);
+    ctx->SetOutputDim("out_sum_2", in_dim);
+    ctx->SetOutputDim("out_sum_3", in_dim);
+    ctx->SetOutputDim("out_num_accumulates", {1});
+    ctx->SetOutputDim("out_old_num_accumulates", {1});
+    ctx->SetOutputDim("out_num_updates", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("param")->type()),
+        ctx.GetPlace());
+  }
+};
+
+class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("param", "(Tensor), The parameter to be accumulated.");
+    AddInput("in_sum_1",
+             "(Tensor), A tensor used to store the parameter "
+             "sums with the same shape as input(param).");
+    AddInput("in_sum_2",
+             "(Tensor), A auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param). It is used to avoid loss of precision due to too "
+             "many sums.");
+    AddInput("in_sum_3",
+             "(Tensor), A auxiliary tensor to help "
+             "accumulating sums of parameter values with the same shape as "
+             "input(param).");
+    AddInput("in_num_accumulates",
+             "(Tensor<int64_t>), The accumulating times of current window with "
+             "shape [1].");
+    AddInput(
+        "in_old_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of previous window with "
+        "shape [1].");
+    AddInput("in_num_updates",
+             "(Tensor<int64_t>), The total number of batches used by trainning "
+             "before this batch with shape [1].");
+
+    AddOutput("out_sum_1",
+              "(Tensor), A tensor used to store the "
+              "parameter sums with the same shape as input(param).");
+    AddOutput("out_sum_2",
+              "(Tensor), A auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param). It is used to avoid loss of precision due to too "
+              "many sums.");
+    AddOutput("out_sum_3",
+              "(Tensor), A auxiliary tensor to help "
+              "accumulating sums of parameter values with the same shape as "
+              "input(param).");
+    AddOutput(
+        "out_num_accumulates",
+        "(Tensor<int64_t>), The accumulating times of current window with "
+        "shape [1].");
+    AddOutput(
+        "out_old_num_accumulates",
+        "(Tensor<int64_t>) The accumulating times of previous window with "
+        "shape [1].");
+    AddOutput(
+        "out_num_updates",
+        "(Tensor<int64_t>), The total number of batches used by trainning "
+        "before this batch with shape [1].");
+
+    AddAttr<float>("average_window",
+                   "(float, default 0) "
+                   "The rate of average window size relative to num_updates.")
+        .SetDefault(0);
+    AddAttr<int64_t>("max_average_window",
+                     "(int64_t) "
+                     "Maximum size of average window. It suggests that the "
+                     "number of mini-batches "
+                     "in one pass is appropriate value to set.");
+    AddAttr<int64_t>("min_average_window",
+                     "(int64_t, default 10000L) "
+                     "Minimu size of average window.")
+        .SetDefault(10000L);
+
+    AddComment(R"DOC(
+AverageAccumulates Operator.
+Accumulate the sum of parameter within sliding window. The size of sliding window is
+determined by 'average_window', 'max_average_window' and 'min_average_window'.
+Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'.
+'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'.
+
+All the accumulators were inited to zero before training.
+
+And for a mini-batch in training, accumulators were computed as below steps:
+    num_updates += 1
+    num_accumulates += 1
+    sum_1 += param
+    if num_updates % kMaxNumAccumulates == 0:
+        sum_2 += sum_1
+        sum_1 = 0
+    if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window):
+        sum_3 = sum_1 + sum_2
+        sum_1 = 0
+        sum_2 = 0
+        old_num_accumulates = num_accumulates
+        num_accumulates = 0
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp,
+                  ops::AverageAccumulatesOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..104e24f6ee2e2503d98f3a3991a903d8dbc4bdfe
--- /dev/null
+++ b/paddle/fluid/operators/average_accumulates_op.cu
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/average_accumulates_op.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+template <>
+void GetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t* num_updates_,
+    int64_t* num_accumulates_, int64_t* old_num_accumulates_) {
+  auto* in_old_num_accumulates = ctx.Input<Tensor>("in_old_num_accumulates");
+  auto* in_num_accumulates = ctx.Input<Tensor>("in_num_accumulates");
+  auto* in_num_updates = ctx.Input<Tensor>("in_num_updates");
+  auto stream = ctx.cuda_device_context().stream();
+  auto cuda_place =
+      boost::get<platform::CUDAPlace>(in_old_num_accumulates->place());
+  memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place,
+               in_old_num_accumulates->data<int64_t>(), sizeof(int64_t),
+               stream);
+  memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place,
+               in_num_accumulates->data<int64_t>(), sizeof(int64_t), stream);
+  memory::Copy(platform::CPUPlace(), num_updates_, cuda_place,
+               in_num_updates->data<int64_t>(), sizeof(int64_t), stream);
+}
+
+template <>
+void SetAccumulators<paddle::platform::CUDADeviceContext>(
+    const framework::ExecutionContext& ctx, int64_t num_updates_,
+    int64_t num_accumulates_, int64_t old_num_accumulates_) {
+  auto stream = ctx.cuda_device_context().stream();
+  auto* out_old_num_accumulates = ctx.Output<Tensor>("out_old_num_accumulates");
+  auto* out_num_accumulates = ctx.Output<Tensor>("out_num_accumulates");
+  auto* out_num_updates = ctx.Output<Tensor>("out_num_updates");
+  auto cuda_place =
+      boost::get<platform::CUDAPlace>(out_old_num_accumulates->place());
+
+  memory::Copy(cuda_place, out_old_num_accumulates->data<int64_t>(),
+               platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t),
+               stream);
+  memory::Copy(cuda_place, out_num_accumulates->data<int64_t>(),
+               platform::CPUPlace(), &num_accumulates_, sizeof(int64_t),
+               stream);
+  memory::Copy(cuda_place, out_num_updates->data<int64_t>(),
+               platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    average_accumulates,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AverageAccumulatesKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3958d3f685470f2505abf0e8bfd269d3834970ae
--- /dev/null
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext>
+void GetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t* num_updates, int64_t* num_accumulates,
+                     int64_t* old_num_accumulates);
+
+template <typename DeviceContext>
+void SetAccumulators(const framework::ExecutionContext& ctx,
+                     int64_t num_updates, int64_t num_accumulates,
+                     int64_t old_num_accumulates);
+
+template <typename DeviceContext, typename T>
+class AverageAccumulatesKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // It is used to avoid loss of precision
+    static const int64_t kMaxNumAccumulates = 16384;
+    // Get accumulators from input
+    int64_t num_updates = 0;
+    int64_t num_accumulates = 0;
+    int64_t old_num_accumulates = 0;
+    GetAccumulators<DeviceContext>(ctx, &num_updates, &num_accumulates,
+                                   &old_num_accumulates);
+
+    // Get attrs
+    float average_window = ctx.Attr<float>("average_window");
+    int64_t max_average_window = ctx.Attr<int64_t>("max_average_window");
+    int64_t min_average_window = ctx.Attr<int64_t>("min_average_window");
+    PADDLE_ENFORCE_LE(min_average_window, max_average_window,
+                      "min_average_window shouldn't be larger than "
+                      "max_average_window");
+
+    // Get inputs
+    auto* param = ctx.Input<Tensor>("param");
+    auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
+    auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
+    auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
+    auto param_tensor = EigenVector<T>::Flatten(*param);
+    auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1);
+    auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2);
+    auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3);
+
+    // Get outputs
+    auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
+    auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
+    auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
+    auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1);
+    auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2);
+    auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3);
+
+    // Compute
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    math::SetConstant<DeviceContext, T> constant_functor;
+    ++num_updates;
+    ++num_accumulates;
+    out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
+    out_sum_2_tensor.device(place) = in_sum_2_tensor;
+    out_sum_3_tensor.device(place) = in_sum_3_tensor;
+    if (num_updates % kMaxNumAccumulates == 0) {
+      // Move the sum to a different buffer to avoid loss of precision due to
+      // too many sums.
+      out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+    }
+    if (num_accumulates >= min_average_window &&
+        num_accumulates >= std::min<int64_t>(max_average_window,
+                                             num_updates * average_window)) {
+      //  Now the average window is too long, discard the old sum.
+      out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor;
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_1,
+                       0.0);
+      constant_functor(ctx.template device_context<DeviceContext>(), out_sum_2,
+                       0.0);
+      old_num_accumulates = num_accumulates;
+      num_accumulates = 0;
+    }
+
+    // Set accumulators to output
+    SetAccumulators<DeviceContext>(ctx, num_updates, num_accumulates,
+                                   old_num_accumulates);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ab2179b5fe689762704039c5f67dd080e530aa5
--- /dev/null
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -0,0 +1,356 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/batch_norm_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using batch_norm_bwd = mkldnn::batch_normalization_backward;
+using batch_norm_fwd = mkldnn::batch_normalization_forward;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::MKLDNNMemDesc;
+using platform::to_void_cast;
+
+namespace {
+template <typename T>
+struct bn_type_traits {
+  using op_type = T;
+  using op_desc = typename op_type::desc;
+  using op_prim = typename op_type::primitive_desc;
+};
+
+template <typename T, typename Container>
+void copy_to_weights(T scale_begin, T scale_end, T shift_begin, T shift_end,
+                     Container *c) {
+  auto it = std::begin(*c);
+
+  std::copy(scale_begin, scale_end, std::inserter(*c, it));
+  std::copy(
+      shift_begin, shift_end,
+      std::inserter(*c, std::next(it, std::distance(scale_begin, scale_end))));
+}
+
+template <typename Op, typename... Args>
+void run_batch_norm_op(Args &&... args) {
+  Op batch_norm_op{args...};
+
+  std::vector<mkldnn::primitive> pipeline;
+  pipeline.push_back(batch_norm_op);
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+
+}  // namespace
+
+template <typename T>
+class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const float momentum = ctx.Attr<float>("momentum");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *mean = ctx.Input<Tensor>("Mean");
+    const auto *variance = ctx.Input<Tensor>("Variance");
+
+    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean_out = ctx.Output<Tensor>("MeanOut");
+    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+    auto *batch_mean = ctx.Output<Tensor>("SavedMean");
+    auto *batch_variance = ctx.Output<Tensor>("SavedVariance");
+
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *shift = ctx.Input<Tensor>("Bias");
+
+    PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                       x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input x tensor");
+
+    const T *x_data = x->data<T>();
+    const T *mean_data = mean->data<T>();
+    const T *variance_data = variance->data<T>();
+    T *y_data = y->mutable_data<T>(ctx.GetPlace());
+    T *mean_out_data = mean_out->mutable_data<T>(ctx.GetPlace());
+    T *variance_out_data = variance_out->mutable_data<T>(ctx.GetPlace());
+    T *batch_mean_data = nullptr;
+    T *batch_variance_data = nullptr;
+
+    if (!is_test) {
+      batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
+      batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
+    }
+
+    auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
+                                       : mkldnn::prop_kind::forward_training;
+
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+    const unsigned int ic = scale_tz[0];
+
+    unsigned flags = mkldnn::use_scale_shift;
+    if (is_test) flags |= mkldnn::use_global_stats;
+    if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
+
+    // create mkldnn memory from input x tensor
+    mkldnn::memory::format input_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
+
+    auto src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
+        to_void_cast(x_data));
+
+    // create primitive descriptor for batch norm forward
+    using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
+    auto batch_norm_fwd_desc = bn_fwd_types::op_desc{
+        propagation, src_memory.get_primitive_desc().desc(), epsilon, flags};
+    std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_fwd_pd =
+        std::shared_ptr<batch_norm_fwd::primitive_desc>(
+            new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc,
+                                               mkldnn_engine));
+
+    // Save the pd to be used in backward pass
+    const std::string key = ctx.op().Output("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);
+
+    // MKLDNN requires a single piece of memory for scale and shift/bias data
+    const size_t scaleshift_size = 2 * ic;
+    std::vector<T> scaleshift_data;
+    scaleshift_data.reserve(scaleshift_size);
+
+    copy_to_weights(scale->data<T>(), scale->data<T>() + ic, shift->data<T>(),
+                    shift->data<T>() + ic, &scaleshift_data);
+
+    // crate mkldnn memory for weights(scale/shift)
+    auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(),
+                                    scaleshift_data.data());
+
+    // create mkldnn memory for output y tensor
+    auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data);
+
+    if (is_test) {
+      // create mkldnn memory for stats (as input)
+      auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(),
+                                to_void_cast(mean_data));
+      auto variance_memory =
+          memory(batch_norm_fwd_pd->variance_primitive_desc(),
+                 to_void_cast(variance_data));
+
+      run_batch_norm_op<typename bn_fwd_types::op_type>(
+          *batch_norm_fwd_pd, src_memory,
+          (const mkldnn::primitive::at &)mean_memory,
+          (const mkldnn::primitive::at &)variance_memory, scaleshift_memory,
+          dst_memory);
+    } else {
+      // create mkldnn memory for stats (as output)
+      auto mean_memory =
+          memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data);
+      auto variance_memory = memory(
+          batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data);
+
+      run_batch_norm_op<bn_fwd_types::op_type>(*batch_norm_fwd_pd, src_memory,
+                                               scaleshift_memory, dst_memory,
+                                               mean_memory, variance_memory);
+    }
+
+    if (!is_test) {
+      // mkldnn only compute stats for current batch
+      // so we need compute momentum stats via Eigen lib
+      EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
+      EigenVectorArrayMap<T> batch_variance_e(batch_variance_data, ic);
+      ConstEigenVectorArrayMap<T> mean_e(mean_data, ic);
+      ConstEigenVectorArrayMap<T> variance_e{variance_data, ic};
+
+      EigenVectorArrayMap<T> running_mean_e(mean_out_data, ic);
+      EigenVectorArrayMap<T> running_variance_e(variance_out_data, ic);
+
+      auto one_minus_momentum = 1. - momentum;
+      running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum;
+      running_variance_e =
+          variance_e * momentum + batch_variance_e * one_minus_momentum;
+    }
+
+    y->set_layout(DataLayout::kMKLDNN);
+    y->set_format(
+        (memory::format)dst_memory.get_primitive_desc().desc().data.format);
+  }
+};
+
+template <typename T>
+class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
+    auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+
+    const float epsilon = ctx.Attr<float>("epsilon");
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *shift = ctx.Input<Tensor>("Bias");
+    const auto *batch_mean = ctx.Input<Tensor>("SavedMean");
+    const auto *batch_variance = ctx.Input<Tensor>("SavedVariance");
+
+    const auto *diff_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto *diff_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *diff_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *diff_shift = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN &&
+                       diff_y->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input diff_y tensor");
+
+    const T *x_data = x->data<T>();
+    const T *diff_y_data = diff_y->data<T>();
+    const T *batch_mean_data = batch_mean->data<T>();
+    const T *batch_variance_data = batch_variance->data<T>();
+    const T *scale_data = scale->data<T>();
+    const T *shift_data = shift->data<T>();
+    T *diff_x_data = diff_x->mutable_data<T>(ctx.GetPlace());
+    T *diff_scale_data = diff_scale->mutable_data<T>(ctx.GetPlace());
+    T *diff_shift_data = diff_shift->mutable_data<T>(ctx.GetPlace());
+
+    auto src_tz = paddle::framework::vectorize2int(x->dims());
+    auto diff_src_tz = src_tz;
+    auto dst_tz = src_tz;
+    auto diff_dst_tz = dst_tz;
+    auto scale_tz = paddle::framework::vectorize2int(scale->dims());
+    PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1");
+
+    const unsigned int ic = scale_tz[0];
+
+    // Retrieve bn_fwd_pd from device context
+    const std::string key = ctx.op().Input("SavedMean");
+    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    auto batch_norm_fwd_pd =
+        std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
+            dev_ctx.GetBlob(key_batch_norm_fwd_pd));
+    PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr,
+                   "Fail to find batch_norm_fwd_pd in device context");
+
+    using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
+
+    // create mkldnn memory from input diff_y tensor
+
+    mkldnn::memory::format dst_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
+
+    auto user_diff_dst_memory = memory(
+        {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
+        to_void_cast(diff_y_data));
+
+    // create mkldnn memory from input x tensor
+    mkldnn::memory::format input_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
+
+    auto src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
+        to_void_cast(x_data));
+
+    // for diff_dst, try to use same format as dst in forward pass
+    auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
+    auto diff_dst_md = diff_dst_pd.desc();
+
+    // create primitive descriptor for batch norm backward
+    unsigned flags = mkldnn::use_scale_shift;
+    auto batch_norm_bwd_desc = bn_bwd_types::op_desc{
+        mkldnn::prop_kind::backward, diff_dst_md,
+        src_memory.get_primitive_desc().desc(), epsilon, flags};
+    auto batch_norm_bwd_pd = bn_bwd_types::op_prim{
+        batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd};
+
+    // reorder user_diff_dst if it's not in preferred format
+    auto diff_dst_memory = user_diff_dst_memory;
+    primitive reorder_diff_dst;
+    bool is_diff_dst_reordered = false;
+    if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) {
+      diff_dst_memory = memory(diff_dst_pd);
+      reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory);
+      is_diff_dst_reordered = true;
+    }
+
+    // create mkldnn memory for input tensors (src/mean/variance)
+    auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(),
+                              to_void_cast(batch_mean_data));
+    auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(),
+                                  to_void_cast(batch_variance_data));
+
+    // MKLDNN requires a single piece of memory for scale and shift/bias data
+    const size_t scaleshift_size = 2 * ic;
+
+    std::vector<T> scaleshift_data;
+    scaleshift_data.reserve(scaleshift_size);
+    copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic,
+                    &scaleshift_data);
+
+    // create mkldnn memory for input tensors (scale/shift)
+    auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(),
+                                    scaleshift_data.data());
+
+    // create mkldnn memory for output diff weights (combined scale/shift)
+    std::vector<T> diff_scaleshift_data;
+    diff_scaleshift_data.reserve(scaleshift_size);
+    auto diff_scaleshift_memory =
+        memory(batch_norm_bwd_pd.diff_weights_primitive_desc(),
+               diff_scaleshift_data.data());
+
+    // here assume diff_src is in the same format of src
+    auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data);
+
+    // finally create batch_norm backward primitive
+    auto batch_norm_bwd_prim =
+        batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory,
+                       variance_memory, diff_dst_memory, scaleshift_memory,
+                       diff_src_memory, diff_scaleshift_memory);
+
+    // execute optional reorder and batch_norm backward primitive
+    std::vector<primitive> pipeline;
+    if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst);
+    pipeline.push_back(batch_norm_bwd_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    // copy back diff sacle/shift to output tensors (diff scale/shift)
+    diff_scaleshift_data.resize(scaleshift_size);
+    auto it = std::begin(diff_scaleshift_data);
+    std::copy(it, std::next(it, ic), diff_scale_data);
+    std::copy(std::next(it, ic), std::end(diff_scaleshift_data),
+              diff_shift_data);
+
+    // set layout/format of output tensors
+    diff_x->set_layout(DataLayout::kMKLDNN);
+    diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc()
+                           .desc()
+                           .data.format);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::BatchNormMKLDNNOpKernel<float>);
+REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::BatchNormMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 215ae229aff96d76fc948e19bdb42db319af65dc..693bf973c2b8790d2c50cee9b86b365493e8c754 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -13,27 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/batch_norm_op.h"
+#include <string>
 #include "paddle/fluid/framework/data_layout.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DataLayout = framework::DataLayout;
-
-template <typename T>
-using EigenArrayMap =
-    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using ConstEigenArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
-template <typename T>
-using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
-template <typename T>
-using ConstEigenVectorArrayMap =
-    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
-
 class BatchNormOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -80,12 +68,51 @@ class BatchNormOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("SavedVariance", {C});
     ctx->ShareLoD("X", "Y");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    // By default, the type of the scale, bias, mean,
+    // and var tensors should both be float. (For float or float16 input tensor)
+    // or double (For double input tensor).
+    auto bn_param_type = framework::proto::VarType::FP32;
+    if (input_data_type == framework::proto::VarType::FP64) {
+      bn_param_type = framework::proto::VarType::FP64;
+    }
+    PADDLE_ENFORCE_EQ(bn_param_type,
+                      framework::ToDataType(ctx.Input<Tensor>("Scale")->type()),
+                      "Scale input should be of float type");
+    PADDLE_ENFORCE_EQ(bn_param_type,
+                      framework::ToDataType(ctx.Input<Tensor>("Bias")->type()),
+                      "Bias input should be of float type");
+    PADDLE_ENFORCE_EQ(bn_param_type,
+                      framework::ToDataType(ctx.Input<Tensor>("Mean")->type()),
+                      "Mean input should be of float type");
+    PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType(
+                                         ctx.Input<Tensor>("Variance")->type()),
+                      "Variance input should be of float type");
+
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
+
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                   library);
+  }
 };
 
 class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddAttr<bool>("is_test", "").SetDefault(false);
     AddAttr<float>("momentum", "").SetDefault(0.9);
     AddAttr<float>("epsilon", "")
@@ -108,13 +135,15 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Variance",
              "The global variance (for training) "
              "or estimated Variance (for testing)");
-    AddOutput("Y", "result after normalization");
+    AddOutput("Y", "result after normalization").Reuse("X");
     AddOutput("MeanOut",
               "Share memory with Mean. "
-              "Store the global mean when training");
+              "Store the global mean when training")
+        .Reuse("Mean");
     AddOutput("VarianceOut",
               "Share memory with Variance. "
-              "Store the global Variance when training");
+              "Store the global Variance when training")
+        .Reuse("Variance");
     AddOutput("SavedMean",
               "Mean of the current mini batch, "
               "will apply to output when training")
@@ -123,6 +152,12 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
               "Variance of the current mini batch, "
               "will apply to output when training")
         .AsIntermediate();
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<bool>("fuse_with_relu",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Batch Normalization.
 
@@ -321,8 +356,22 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
     if (t == nullptr) {
       PADDLE_THROW("can't find Y@GRAD");
     }
-    return framework::OpKernelType(framework::ToDataType(t->type()),
-                                   ctx.GetPlace());
+
+    // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
+
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        layout, library);
   }
 };
 
@@ -434,15 +483,44 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
   }
 };
 
+class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op = new framework::OpDesc();
+    op->SetType("batch_norm_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+
+    op->SetInput("Scale", Input("Scale"));
+    op->SetInput("Bias", Input("Bias"));
+    op->SetInput("SavedMean", Output("SavedMean"));
+    op->SetInput("SavedVariance", Output("SavedVariance"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-            batch_norm_grad, ops::BatchNormGradOp);
+REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
+                  ops::BatchNormGradMaker);
+REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
+
 REGISTER_OP_CPU_KERNEL(
-    batch_norm,
-    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
+    batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc
index 2d1556efc66826ea9847de8311ccecdee0ea7871..550dd32d36767f90e880415bfffaf01aeb623609 100644
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/framework/data_layout.h"
-
 #include <cfloat>
+#include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -26,6 +26,8 @@ using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;
 template <typename T>
 using CudnnDataType = platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 
 void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
                   int *N, int *C, int *H, int *W, int *D) {
@@ -104,29 +106,19 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
         data_desc_, CudnnDataType<T>::type,
         x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
+    // Note: PERSISTENT not implemented for inference
     CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
-        bn_param_desc_, data_desc_, mode_));
+        bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
 
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *bias = ctx.Input<Tensor>("Bias");
 
     auto *y = ctx.Output<Tensor>("Y");
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
 
     // alloc memory
     y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<T>(ctx.GetPlace());
-    variance_out->mutable_data<T>(ctx.GetPlace());
-    saved_mean->mutable_data<T>(ctx.GetPlace());
-    saved_variance->mutable_data<T>(ctx.GetPlace());
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    math::SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(dev_ctx, saved_mean, 0);
-    functor(dev_ctx, saved_variance, 0);
 
     auto handle = dev_ctx.cudnn_handle();
 
@@ -147,23 +139,45 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
           CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
           CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
           data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
-          bn_param_desc_, scale->template data<T>(), bias->template data<T>(),
-          est_mean->template data<T>(), est_var->template data<T>(), epsilon));
+          bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+          bias->template data<BatchNormParamType<T>>(),
+          est_mean->template data<BatchNormParamType<T>>(),
+          est_var->template data<BatchNormParamType<T>>(), epsilon));
     } else {
       // Run training mode.
       // obtain running mean and running inv var, and see if we need to
       // initialize them.
+
+      auto *mean_out = ctx.Output<Tensor>("MeanOut");
+      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+      mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+      auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+      saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+          functor;
+      functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+      functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
+
       double this_factor = 1. - momentum;
 
       CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
           handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
           data_desc_, x->template data<T>(), data_desc_,
           y->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
-          scale->template data<T>(), bias->template data<T>(), this_factor,
-          mean_out->template mutable_data<T>(ctx.GetPlace()),
-          variance_out->template mutable_data<T>(ctx.GetPlace()), epsilon,
-          saved_mean->template mutable_data<T>(ctx.GetPlace()),
-          saved_variance->template mutable_data<T>(ctx.GetPlace())));
+          scale->template data<BatchNormParamType<T>>(),
+          bias->template data<BatchNormParamType<T>>(), this_factor,
+          mean_out->template mutable_data<BatchNormParamType<T>>(
+              ctx.GetPlace()),
+          variance_out->template mutable_data<BatchNormParamType<T>>(
+              ctx.GetPlace()),
+          epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
+                       ctx.GetPlace()),
+          saved_variance->template mutable_data<BatchNormParamType<T>>(
+              ctx.GetPlace())));
     }
 
     // clean when exit.
@@ -270,9 +284,11 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    batch_norm,
-    ops::BatchNormKernel<paddle::platform::CUDADeviceContext, float>);
+    batch_norm, ops::BatchNormKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormKernel<plat::CUDADeviceContext, double>,
+    ops::BatchNormKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    batch_norm_grad,
-    ops::BatchNormGradKernel<paddle::platform::CUDADeviceContext, float>);
+    batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
+    ops::BatchNormGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h
index 9e5fc41598f29336074335f3624a2300ad018d09..5e3d630d6889e445c5e84fa836d2d81bb7266779 100644
--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
@@ -19,6 +19,22 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
 template <typename DeviceContext, typename T>
 class BatchNormKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h
index 0bdf27e620a3a7c7b62b955f708a5e2aad1a6986..fc15d56891cf7af10a91ca22a09c84fa2e52d465 100644
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -52,22 +53,25 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel {
 
 class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Input",
-             "(Tensor) Tensor "
-             "whose input_dim_idx'th dimension specifies the batch_size");
+  void Make() final {
+    AddInput(
+        "Input",
+        "Tensor whose input_dim_idx'th dimension specifies the batch_size");
     AddOutput("Out",
-              "(Tensor) Tensor of specified shape will be filled "
+              "Tensor of specified shape will be filled "
               "with the specified value");
-    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
+    AddAttr<std::vector<int>>("shape", "The shape of the output");
     AddAttr<int>("input_dim_idx",
-                 "(int, default 0) The index of input's batch size dimension")
+                 "default 0. The index of input's batch size dimension")
         .SetDefault(0);
     AddAttr<int>("output_dim_idx",
-                 "(int, default 0) The index of output's batch size dimension")
+                 "default 0. The index of output's batch size dimension")
         .SetDefault(0);
+    Apply();
   }
+
+ protected:
+  virtual void Apply() = 0;
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index 718f469d38c3c6b7272c1531fae0a1e9ad2e8e3e..10d678111f5325e495b24286e6ecf651230393fe 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+#include <string>
+
 #include "paddle/fluid/operators/beam_search_decode_op.h"
 #include "paddle/fluid/platform/device_context.h"
 
@@ -21,26 +24,80 @@ namespace operators {
 struct BeamSearchDecodeFunctor {
   BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
                           const LoDTensorArray& step_scores,
-                          LoDTensor* id_tensor, LoDTensor* score_tensor)
-      : step_ids_(step_ids),
-        step_scores_(step_scores),
+                          LoDTensor* id_tensor, LoDTensor* score_tensor,
+                          size_t beam_size, int end_id)
+      : beam_size_(beam_size),
+        end_id_(end_id),
+        step_ids_origin_(step_ids),
+        step_scores_origin_(step_scores),
         id_tensor_(id_tensor),
-        score_tensor_(score_tensor) {}
+        score_tensor_(score_tensor) {
+    tensor_on_gpu_ = false;
+    // First make a copy of GPU data on CPU
+    if (platform::is_gpu_place(step_ids_origin_[0].place())) {
+      tensor_on_gpu_ = true;
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      auto* dev_ctx = pool.Get(step_ids_origin_[0].place());
+      // Copy all tensors in the input tensor array
+      for (auto& step_id : step_ids_origin_) {
+        framework::LoDTensor out;
+        if (step_id.numel() > 0) {
+          dev_ctx->Wait();
+          framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out);
+          dev_ctx->Wait();
+        }
+
+        out.set_lod(step_id.lod());
+        step_ids_.push_back(out);
+      }
+    }
+    if (platform::is_gpu_place(step_scores_origin_[0].place())) {
+      tensor_on_gpu_ = true;
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      auto* dev_ctx = pool.Get(step_scores_origin_[0].place());
+      // Copy all tensors in the input tensor array
+      for (auto& step_score : step_scores_origin_) {
+        framework::LoDTensor out;
+        if (step_score.numel() > 0) {
+          dev_ctx->Wait();
+          framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx,
+                                &out);
+          dev_ctx->Wait();
+        }
+
+        out.set_lod(step_score.lod());
+        step_scores_.push_back(out);
+      }
+    }
+  }
 
   template <typename T>
   void operator()() const;
 
-  const LoDTensorArray& step_ids_;
-  const LoDTensorArray& step_scores_;
+  bool tensor_on_gpu_;
+  size_t beam_size_;
+  int end_id_;
+  const LoDTensorArray& step_ids_origin_;
+  const LoDTensorArray& step_scores_origin_;
+  LoDTensorArray step_ids_ = LoDTensorArray();
+  LoDTensorArray step_scores_ = LoDTensorArray();
   LoDTensor* id_tensor_;
   LoDTensor* score_tensor_;
 };
 
 template <typename T>
 void BeamSearchDecodeFunctor::operator()() const {
-  BeamSearchDecoder<T> beam_search_decoder;
-  beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_,
-                                   score_tensor_);
+  BeamSearchDecoder<T> beam_search_decoder(beam_size_, end_id_);
+  // Check if the tensor is on GPU. If so, use the CPU copy instead
+  if (tensor_on_gpu_) {
+    beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_,
+                                  score_tensor_);
+  } else {
+    beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_,
+                                  id_tensor_, score_tensor_);
+  }
 }
 
 template <>
@@ -77,34 +134,51 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
                         "Level of LodTensor should be 2");
     }
 
+    size_t beam_size = ctx.Attr<int>("beam_size");
+    int end_id = ctx.Attr<int>("end_id");
+
     // prepare output
     LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
     LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
 
     framework::VisitDataType(
         framework::ToDataType(scores->at(0).type()),
-        BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores));
+        BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores,
+                                beam_size, end_id));
   }
 };
 
 class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Ids",
              "(LodTensorArray)"
-             "score of the candidate words in each step");
+             "The LodTensorArray containing the selected ids of all steps");
     AddInput("Scores",
              "(LodTensorArray)"
-             "score of the candidate words in each step");
-    AddOutput("SentenceIds",
-              "(LodTensor)"
-              "All possible result sentences of word ids");
-    AddOutput("SentenceScores",
-              "(LodTensor)"
-              "All possible result sentences of word scores");
+             "The LodTensorArray containing the selected scores of all steps");
+    AddOutput(
+        "SentenceIds",
+        "(LodTensor)"
+        "An LodTensor containing all generated id sequences for all source "
+        "sentences");
+    AddOutput(
+        "SentenceScores",
+        "(LodTensor)"
+        "An LodTensor containing scores corresponding to Output(SentenceIds)");
+    AddAttr<int>("beam_size", "beam size for beam search");
+    AddAttr<int>("end_id",
+                 "the token id which indicates the end of a sequence");
     AddComment(R"DOC(
-Pack the result of Beam search op into SentenceIds and SentenceScores.
+Beam Search Decode Operator. This Operator constructs the full hypotheses for
+each source sentence by walking back along the LoDTensorArray Input(ids)
+whose lods can be used to restore the path in the beam search tree.
+
+The Output(SentenceIds) and Output(SentenceScores) separately contain the 
+generated id sequences and the corresponding scores. The shapes and lods of the 
+two LodTensor are same. The lod level is 2 and the two levels separately 
+indicate how many hypotheses each source sentence has and how many ids each 
+hypothesis has.
 )DOC");
   }
 };
@@ -128,10 +202,12 @@ class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
     for (auto& o : op_desc.Output("SentenceIds")) {
-      block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR);
+      auto& sentence_ids = block->FindRecursiveOrCreateVar(o);
+      sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR);
     }
     for (auto& o : op_desc.Output("SentenceScores")) {
-      block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR);
+      auto& sentence_scores = block->FindRecursiveOrCreateVar(o);
+      sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR);
     }
   }
 };
diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h
index 3cc6ed310575473fae8e91a8507fb9146107e841..6aefc5446f167eebb0da673b3fbdf7ed128daa98 100644
--- a/paddle/fluid/operators/beam_search_decode_op.h
+++ b/paddle/fluid/operators/beam_search_decode_op.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <vector>
+
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -24,42 +27,12 @@ using LoDTensor = framework::LoDTensor;
 using LoDTensorArray = framework::LoDTensorArray;
 
 // all the lod have 2 levels.
-// The First is source level, the second is sentence level.
-// source level describe how many candidate words for this source.
-// sentence level describe these candidates belong to which prefix
+// The first is source level, the second is sentence level.
+// source level describe how many prefixes (branchs) for each source sentece
+// (beam). sentence level describe how these candidates belong to the prefixes.
 const size_t kSourceLevel = 0;
 const size_t kSentenceLevel = 1;
 
-template <typename T>
-struct BeamNode {
-  BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {}
-
-  ~BeamNode() {
-    if (parent_) {
-      parent_->DropKid(this);
-      if (parent_->kids_.size() == 0UL) {
-        delete parent_;
-      }
-    }
-    VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_;
-  }
-
-  void AppendTo(BeamNode* parent) {
-    parent_ = parent;
-    parent->kids_.insert(this);
-  }
-
-  void DropKid(BeamNode* kid) { kids_.erase(kid); }
-
-  BeamNode* parent_ = nullptr;
-  std::unordered_set<BeamNode*> kids_;
-  int64_t word_id_;
-  T score_;
-};
-
-template <typename T>
-using BeamNodeVector = std::vector<std::unique_ptr<BeamNode<T>>>;
-
 template <typename T>
 struct Sentence {
   std::vector<int64_t> word_ids;
@@ -71,24 +44,8 @@ using SentenceVector = std::vector<Sentence<T>>;
 
 template <typename T>
 struct BeamSearchDecoder {
-  /**
-   * make a BeamNode and all it's related prefix BeanNode into a Sentence.
-   */
-  Sentence<T> MakeSentence(const BeamNode<T>* node) const;
-
-  /**
-   * Param:
-   *  cur_ids: LoDTensor of One step for word ID
-   *  cur_scores: LoDTensor of One Step for word score
-   *  prefixes_list: prefixes for each source sentence.
-   *  sentence_vector_list: result sentence_vector for each source sentence.
-   * Return:
-   *  a new prefixes list for each source of current step
-   */
-  std::vector<BeamNodeVector<T>> PackTwoSteps(
-      const LoDTensor& cur_ids, const LoDTensor& cur_scores,
-      std::vector<BeamNodeVector<T>>& prefixes_list,
-      std::vector<SentenceVector<T>>* sentence_vector_list) const;
+  BeamSearchDecoder(size_t beam_size, int end_id)
+      : beam_size_(beam_size), end_id_(end_id) {}
 
   /**
    * convert the result sentence_vector for each source sentence into two
@@ -99,107 +56,30 @@ struct BeamSearchDecoder {
    *  sentence_vector_list: sentence_vector for each source sentence.
    *  id_tensor: result LoDTensor for sentences of id.
    *  score_tensor: result LoDTensor for sentences of score.
+   *  reverse: whether ids of sentence in sentence_vector_list is reversed
+   *  sort_by_score: whether to sort hypotheses of each sentence by scores.
    */
   void ConvertSentenceVectorToLodTensor(
       std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
-      LoDTensor* score_tensor) const;
+      LoDTensor* score_tensor, bool reverse = true,
+      bool sort_by_score = true) const;
 
   /**
-   * Pack all steps of id/score LodTensor into sentence LoDTensor
-   * it's main logic is:
-   * ```python
-   *   prefix
-   *   result_sentence
-   *   result_lod_tensor
-   *
-   *   for (step in steps):
-   *     prefix = PackTwoSteps(prefix, step, &result_sentence)
-   *   ConvertSentenceVector<T>ToLodTensor(result_sentence, &result_lod_tensor)
-   * ```
+   * Gather the hypotheses for each source sentence by backtrace though the
+   * LoDTensorArray step_ids whose lods reserve the path in the tree.
    */
-  void PackAllSteps(const LoDTensorArray& step_ids,
-                    const LoDTensorArray& step_scores, LoDTensor* id_tensor,
-                    LoDTensor* score_tensor) const;
-};
-
-template <typename T>
-Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
-  Sentence<T> sentence;
-  while (node != nullptr) {
-    sentence.word_ids.emplace_back(node->word_id_);
-    sentence.scores.emplace_back(node->score_);
-    node = node->parent_;
-  }
-
-  std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids));
-  std::reverse(std::begin(sentence.scores), std::end(sentence.scores));
-
-  return sentence;
-}
-
-template <typename T>
-std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
-    const LoDTensor& cur_ids, const LoDTensor& cur_scores,
-    std::vector<BeamNodeVector<T>>& prefixes_list,
-    std::vector<SentenceVector<T>>* sentence_vector_list) const {
-  std::vector<BeamNodeVector<T>> result;
+  void Backtrace(const LoDTensorArray& step_ids,
+                 const LoDTensorArray& step_scores, LoDTensor* id_tensor,
+                 LoDTensor* score_tensor) const;
 
-  for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1;
-       ++src_idx) {
-    size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx];
-    size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
-
-    BeamNodeVector<T> beam_nodes;
-
-    // if prefixes size is 0, it means this is the first step. In this step,
-    // all candidate id is the start of candidate sentences.
-    if (prefixes_list.empty()) {
-      PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
-                        cur_ids.lod().at(kSentenceLevel).back(),
-                        "in the first step");
-      for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) {
-        beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(new BeamNode<T>(
-            cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
-      }
-    } else {
-      BeamNodeVector<T>& prefixes = prefixes_list[src_idx];
-      SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
-
-      PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
-                        "prefix and candidate set number should be the same");
-
-      auto candidate_offset = cur_ids.lod()[kSentenceLevel];
-      for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) {
-        std::unique_ptr<BeamNode<T>>& prefix = prefixes[prefix_idx];
-        size_t candidate_start = candidate_offset[src_start + prefix_idx];
-        size_t candidate_end = candidate_offset[src_start + prefix_idx + 1];
-        if (candidate_start == candidate_end) {
-          VLOG(3) << "this sentence has no more candidate, "
-                     "add to result sentence and rm it from beam tree";
-          sentence_vector.push_back(MakeSentence(prefix.get()));
-          prefix.reset();
-        } else {
-          for (size_t candidate_idx = candidate_start;
-               candidate_idx < candidate_end; ++candidate_idx) {
-            auto* candidate =
-                new BeamNode<T>(cur_ids.data<int64_t>()[candidate_idx],
-                                cur_scores.data<T>()[candidate_idx]);
-            candidate->AppendTo(prefix.get());
-            beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(candidate));
-          }
-          prefix.release();
-        }
-      }
-    }
-    result.push_back(std::move(beam_nodes));
-  }
-  return result;
-}
+  size_t beam_size_;
+  int end_id_;
+};
 
 template <typename T>
 void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
     std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
-    LoDTensor* score_tensor) const {
+    LoDTensor* score_tensor, bool reverse, bool sort_by_score) const {
   size_t src_num = sentence_vector_list.size();
 
   PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0");
@@ -210,11 +90,29 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
   std::vector<T> score_data;
 
   for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+    if (sort_by_score) {
+      sort(sentence_vector_list[src_idx].begin(),
+           sentence_vector_list[src_idx].end(),
+           [reverse](const Sentence<T>& a, const Sentence<T>& b) {
+             if (reverse)
+               return a.scores.front() > b.scores.front();
+             else
+               return a.scores.back() > b.scores.back();
+           });
+    }
     for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
-      id_data.insert(id_data.end(), sentence.word_ids.begin(),
-                     sentence.word_ids.end());
-      score_data.insert(score_data.end(), sentence.scores.begin(),
-                        sentence.scores.end());
+      if (reverse) {
+        id_data.insert(id_data.end(), sentence.word_ids.rbegin(),
+                       sentence.word_ids.rend());
+        score_data.insert(score_data.end(), sentence.scores.rbegin(),
+                          sentence.scores.rend());
+      } else {
+        id_data.insert(id_data.end(), sentence.word_ids.begin(),
+                       sentence.word_ids.end());
+        score_data.insert(score_data.end(), sentence.scores.begin(),
+                          sentence.scores.end());
+      }
+
       sentence_level_lod.push_back(sentence_level_lod.back() +
                                    sentence.word_ids.size());
     }
@@ -222,8 +120,9 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
                                sentence_vector_list[src_idx].size());
   }
 
-  auto cpu_place = new paddle::platform::CPUPlace();
-  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
+  auto cpu_place = std::unique_ptr<paddle::platform::CPUPlace>(
+      new paddle::platform::CPUPlace());
+  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place.get());
 
   framework::LoD lod;
   lod.push_back(source_level_lod);
@@ -241,39 +140,75 @@ void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
 }
 
 template <typename T>
-void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
-                                        const LoDTensorArray& step_scores,
-                                        LoDTensor* id_tensor,
-                                        LoDTensor* score_tensor) const {
+void BeamSearchDecoder<T>::Backtrace(const LoDTensorArray& step_ids,
+                                     const LoDTensorArray& step_scores,
+                                     LoDTensor* id_tensor,
+                                     LoDTensor* score_tensor) const {
   PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
   PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
                     "step_ids and step_scores should be the same");
   const size_t step_num = step_ids.size();
   const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
+  std::vector<SentenceVector<T>> sentence_vector_list(
+      src_num, SentenceVector<T>(beam_size_));
+  std::vector<std::vector<size_t>> prefix_idx_vector_list(src_num);
+  for (int step_id = step_num - 1; step_id >= 0; --step_id) {
+    auto& cur_ids = step_ids.at(step_id);
+    auto& cur_scores = step_scores.at(step_id);
+    for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+      // for each source sentence
+      auto& sentence_vector = sentence_vector_list.at(src_idx);
+      auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx);
+      size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx];
+      size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
+      if (prefix_idx_vector.empty()) {  // be finished and pruned at this step
+                                        // or the last time step
+        for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end;
+             ++prefix_idx) {
+          size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx];
+          size_t candidate_end =
+              cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1];
+          for (size_t candidate_idx = candidate_start;
+               candidate_idx < candidate_end; ++candidate_idx) {
+            prefix_idx_vector.push_back(prefix_idx);
+            size_t idx = prefix_idx_vector.size() - 1;
+            auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
+            auto cur_score = cur_scores.data<T>()[candidate_idx];
+            sentence_vector.at(idx).word_ids.push_back(cur_id);
+            sentence_vector.at(idx).scores.push_back(cur_score);
+          }
+        }
+      } else {  // use prefix_idx_vector to backtrace
+        size_t src_candidate_start =
+            cur_ids.lod().at(kSentenceLevel)[src_prefix_start];
+        size_t prefix_idx = src_prefix_start;
+        size_t candidate_num =
+            cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
+            cur_ids.lod().at(kSentenceLevel)[prefix_idx];
+        for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) {
+          auto candidate_idx = prefix_idx_vector.at(idx);
+          auto cur_id = cur_ids.data<int64_t>()[candidate_idx];
+          auto cur_score = cur_scores.data<T>()[candidate_idx];
+          if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) {
+            // to skip redundant end tokens
+            sentence_vector.at(idx).word_ids.push_back(cur_id);
+            sentence_vector.at(idx).scores.push_back(cur_score);
+          }
 
-  PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0");
-
-  // previous prefixes for each step,
-  // the init length is 0, means this is the first step.
-  std::vector<BeamNodeVector<T>> beamnode_vector_list(0);
-  std::vector<SentenceVector<T>> sentence_vector_list(src_num);
-
-  // pack all steps for one batch first, then another batch
-  for (size_t step_id = 0; step_id < step_num; ++step_id) {
-    beamnode_vector_list =
-        PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
-                     beamnode_vector_list, &sentence_vector_list);
-  }
-  // append last beam_node to result
-  for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
-    for (auto& beam_node : beamnode_vector_list.at(src_idx)) {
-      sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get()));
-      beam_node.reset();
+          while (src_candidate_start + candidate_num <=
+                 candidate_idx) {  // search the corresponding prefix
+            prefix_idx++;
+            candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] -
+                             cur_ids.lod().at(kSentenceLevel)[prefix_idx];
+          }
+          prefix_idx_vector.at(idx) = prefix_idx;
+        }
+      }
     }
   }
 
   ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
-                                   score_tensor);
+                                   score_tensor, true, true);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc
index c3faf46e09bb40d01049fd9cfd79836c1d2bd5bb..88339e38d89db3f79abf232d6b0d035b759739a6 100644
--- a/paddle/fluid/operators/beam_search_decode_op_test.cc
+++ b/paddle/fluid/operators/beam_search_decode_op_test.cc
@@ -20,15 +20,11 @@ using LoD = paddle::framework::LoD;
 using LoDTensor = paddle::framework::LoDTensor;
 using LoDTensorArray = paddle::framework::LoDTensorArray;
 
-template <typename T>
-using BeamNode = paddle::operators::BeamNode<T>;
 template <typename T>
 using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
 template <typename T>
 using Sentence = paddle::operators::Sentence<T>;
 template <typename T>
-using BeamNodeVector = paddle::operators::BeamNodeVector<T>;
-template <typename T>
 using SentenceVector = paddle::operators::SentenceVector<T>;
 
 namespace paddle {
@@ -77,138 +73,50 @@ void GenerateExample(const std::vector<size_t>& level_0,
 }  // namespace test
 }  // namespace paddle
 
-TEST(BeamSearchDecodeOp, DeleteBeamNode) {
-  auto* root = new BeamNode<float>(0, 0);
-  auto* b1 = new BeamNode<float>(1, 1);
-  auto* b2 = new BeamNode<float>(2, 2);
-  auto* b3 = new BeamNode<float>(3, 3);
-
-  b1->AppendTo(root);
-  b2->AppendTo(root);
-  b3->AppendTo(b1);
-
-  delete b3;
-  delete b2;
-}
-
-TEST(BeamSearchDecodeOp, MakeSentence) {
-  auto* root = new BeamNode<float>(0, 0);
-  auto* b1 = new BeamNode<float>(1, 1);
-  auto* end = new BeamNode<float>(2, 2);
-  b1->AppendTo(root);
-  end->AppendTo(b1);
-
-  BeamSearchDecoder<float> helper;
-  Sentence<float> sentence = helper.MakeSentence(end);
-  delete end;
-
-  std::vector<int64_t> expect_ids = {0, 1, 2};
-  ASSERT_EQ(sentence.word_ids, expect_ids);
-
-  std::vector<float> expect_scores = {0, 1, 2};
-  ASSERT_EQ(sentence.scores, expect_scores);
-}
-
-TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
-  CPUPlace place;
-
-  LoDTensorArray ids;
-  LoDTensorArray scores;
-
-  paddle::test::GenerateExample(
-      std::vector<size_t>{0, 2, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
-      std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
-
-  std::vector<BeamNodeVector<float>> beamnode_vector_list;
-  std::vector<SentenceVector<float>> sentence_vector_list(
-      2, SentenceVector<float>());
-
-  BeamSearchDecoder<float> helper;
-  beamnode_vector_list = helper.PackTwoSteps(
-      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
-  ASSERT_EQ(beamnode_vector_list.size(), 2UL);
-  ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
-  ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
-}
-
-TEST(BeamSearchDecodeOp, PackTwoSteps) {
-  CPUPlace place;
-
-  // first source has three prefix
-  BeamNodeVector<float> source0_prefixes;
-  source0_prefixes.push_back(
-      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(1, 1)));
-  source0_prefixes.push_back(
-      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(0, 0)));
-  source0_prefixes.push_back(
-      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(3, 3)));
-
-  // second source has two prefix
-  BeamNodeVector<float> source1_prefixes;
-  source1_prefixes.push_back(
-      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(4, 4)));
-  source1_prefixes.push_back(
-      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(5, 5)));
-
-  std::vector<BeamNodeVector<float>> beamnode_vector_list;
-  std::vector<SentenceVector<float>> sentence_vector_list(
-      2, SentenceVector<float>());
-
-  beamnode_vector_list.push_back(std::move(source0_prefixes));
-  beamnode_vector_list.push_back(std::move(source1_prefixes));
-
-  // generate data for one step
-  LoDTensorArray ids;
-  LoDTensorArray scores;
-
-  paddle::test::GenerateExample(std::vector<size_t>{0, 3, 5},
-                                std::vector<size_t>{0, 1, 1, 3, 4, 5},
-                                std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
-
-  BeamSearchDecoder<float> helper1;
-  beamnode_vector_list = helper1.PackTwoSteps(
-      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
-
-  ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
-  ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
-  ASSERT_EQ(beamnode_vector_list[0].size(), 3UL);
-  ASSERT_EQ(beamnode_vector_list[1].size(), 2UL);
-}
-
-TEST(BeamSearchDecodeOp, PackAllSteps) {
+TEST(BeamSearchDecodeOp, Backtrace) {
   CPUPlace place;
 
-  // we will constuct a sample data with 3 steps and 2 source sentences
+  // Construct sample data with 5 steps and 2 source sentences
+  // beam_size = 2, start_id = 0, end_id = 1
   LoDTensorArray ids;
   LoDTensorArray scores;
 
   paddle::test::GenerateExample(
-      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
-      std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
+      std::vector<size_t>{0, 1, 2}, std::vector<size_t>{0, 1, 2},
+      std::vector<int>{0, 0}, &ids, &scores);  // start with start_id
+  paddle::test::GenerateExample(std::vector<size_t>{0, 1, 2},
+                                std::vector<size_t>{0, 2, 4},
+                                std::vector<int>{2, 3, 4, 5}, &ids, &scores);
+  paddle::test::GenerateExample(std::vector<size_t>{0, 2, 4},
+                                std::vector<size_t>{0, 2, 2, 4, 4},
+                                std::vector<int>{3, 1, 5, 4}, &ids, &scores);
+  paddle::test::GenerateExample(std::vector<size_t>{0, 2, 4},
+                                std::vector<size_t>{0, 1, 2, 3, 4},
+                                std::vector<int>{1, 1, 3, 5}, &ids, &scores);
   paddle::test::GenerateExample(
-      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 1, 3, 5, 5, 6},
-      std::vector<int>{0, 1, 2, 3, 4, 5}, &ids, &scores);
-  paddle::test::GenerateExample(std::vector<size_t>{0, 3, 6},
-                                std::vector<size_t>{0, 0, 1, 2, 3, 4, 5},
-                                std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
+      std::vector<size_t>{0, 2, 4},
+      std::vector<size_t>{0, 0, 0, 2,
+                          2},  // the branchs of the first source sentence
+                               // are pruned since finished
+      std::vector<int>{5, 1},
+      &ids, &scores);
 
-  ASSERT_EQ(ids.size(), 3UL);
-  ASSERT_EQ(scores.size(), 3UL);
+  ASSERT_EQ(ids.size(), 5UL);
+  ASSERT_EQ(scores.size(), 5UL);
 
-  BeamSearchDecoder<float> helper;
+  BeamSearchDecoder<float> helper(2, 1);  // beam_size = 2, end_id = 1
 
   LoDTensor id_tensor;
   LoDTensor score_tensor;
-  helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor);
+  helper.Backtrace(ids, scores, &id_tensor, &score_tensor);
 
   LoD lod = id_tensor.lod();
-  std::vector<size_t> expect_source_lod = {0, 4, 8};
+  std::vector<size_t> expect_source_lod = {0, 2, 4};
   EXPECT_EQ(lod[0], expect_source_lod);
-  std::vector<size_t> expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19};
+  std::vector<size_t> expect_sentence_lod = {0, 4, 7, 12, 17};
   EXPECT_EQ(lod[1], expect_sentence_lod);
-  // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4
-  std::vector<int> expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5,
-                                  4, 3, 2, 4, 4, 3, 6, 5, 4};
+  std::vector<int> expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4,
+                                  5, 3, 5, 0, 4, 5, 3, 1};
   ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size()));
   for (size_t i = 0; i < expect_data.size(); ++i) {
     ASSERT_EQ(id_tensor.data<int64_t>()[i],
diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc
index e848b1f12cb9f1ce1d37e0e0233bfc361dc35a33..62771d09f112785ca1ba741a0ba239b1f0234633 100644
--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -12,22 +12,26 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/beam_search_op.h"
-
+#include <algorithm>
 #include <map>
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/beam_search_op.h"
 
 namespace paddle {
 namespace operators {
 
 void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
+                            const framework::LoDTensor &pre_scores,
                             framework::LoDTensor *selected_ids,
                             framework::LoDTensor *selected_scores) {
   auto abs_lod = framework::ToAbsOffset(ids_->lod());
   auto &high_level = abs_lod[lod_level_];
 
-  auto items = SelectTopBeamSizeItems();
+  auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
   auto selected_items = ToMap(items, high_level.back());
   VLOG(3) << "selected_items:";
   for (size_t i = 0; i < selected_items.size(); ++i) {
@@ -36,7 +40,8 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
       VLOG(3) << ItemToString(item);
     }
   }
-  PruneEndidCandidates(pre_ids, &selected_items);
+
+  PruneEndBeams(pre_ids, &selected_items);
   // calculate the output tensor's height
   size_t num_instances = std::accumulate(
       std::begin(selected_items), std::end(selected_items), 0,
@@ -58,12 +63,6 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
   size_t low_offset = 0;
   for (auto &items : selected_items) {
     low_level.push_back(low_offset);
-    sort(items.begin(), items.end(), [](const Item &a, const Item &b) {
-      if (a.offset < b.offset) {
-        return true;
-      }
-      return a.id < b.id;
-    });
     for (auto &item : items) {
       ids_data[low_offset] = item.id;
       scores_data[low_offset] = item.score;
@@ -83,21 +82,31 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
   selected_scores->set_lod(lod);
 }
 
-int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
-                                     std::vector<std::vector<Item>> *items) {
+void BeamSearch::PruneEndBeams(const framework::LoDTensor &pre_ids,
+                               std::vector<std::vector<Item>> *items) {
   auto *pre_ids_data = pre_ids.data<int64_t>();
-
-  int res = 0;
-  for (size_t offset = 0; offset < items->size(); offset++) {
-    auto prefix_id = pre_ids_data[offset];
-    if (prefix_id == end_id_) {
-      items->at(offset).clear();
-    } else {
-      res++;
+  auto abs_lod = framework::ToAbsOffset(ids_->lod());
+  auto &high_level = abs_lod[lod_level_];
+  for (size_t src_idx = 0; src_idx < high_level.size() - 1; ++src_idx) {
+    size_t src_prefix_start = high_level[src_idx];
+    size_t src_prefix_end = high_level[src_idx + 1];
+    bool finish_flag = true;
+    for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++) {
+      for (auto &item : items->at(offset)) {
+        if (item.id != static_cast<size_t>(end_id_) ||
+            pre_ids_data[offset] != end_id_) {
+          finish_flag = false;
+          break;
+        }
+      }
+      if (!finish_flag) break;
+    }
+    if (finish_flag) {  // all branchs of the beam (source sentence) end and
+                        // prune this beam
+      for (size_t offset = src_prefix_start; offset < src_prefix_end; offset++)
+        items->at(offset).clear();
     }
   }
-
-  return res;
 }
 
 std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
@@ -112,19 +121,17 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
   return result;
 }
 
-std::vector<std::vector<BeamSearch::Item>>
-BeamSearch::SelectTopBeamSizeItems() {
+std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
+    const framework::LoDTensor &pre_ids,
+    const framework::LoDTensor &pre_scores) {
   std::vector<std::vector<Item>> result;
   std::vector<Item> items;
   // for each source sentence, select the top beam_size items across all
   // candidate sets.
-  while (NextItemSet(&items)) {
-    std::nth_element(std::begin(items), std::begin(items) + beam_size_,
-                     std::end(items), [](const Item &a, const Item &b) {
-                       // TODO(superjom) make score's comparation customizable.
-                       // partial sort in descending order
-                       return a.score > b.score;
-                     });
+  while (NextItemSet(pre_ids, pre_scores, &items)) {
+    std::nth_element(
+        std::begin(items), std::begin(items) + beam_size_, std::end(items),
+        [](const Item &a, const Item &b) { return a.score > b.score; });
     // prune the top beam_size items.
     if (items.size() > beam_size_) {
       items.resize(beam_size_);
@@ -143,7 +150,9 @@ BeamSearch::SelectTopBeamSizeItems() {
 }
 
 // the candidates of a source
-bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
+bool BeamSearch::NextItemSet(const framework::LoDTensor &pre_ids,
+                             const framework::LoDTensor &pre_scores,
+                             std::vector<BeamSearch::Item> *items) {
   if (sent_offset_ >= ids_->NumElements(lod_level_)) {
     return false;
   }
@@ -161,14 +170,24 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
     instance_dim *= ids.dims()[i];
   }
 
+  auto *pre_ids_data = pre_ids.data<int64_t>();
+  auto *pre_scores_data = pre_scores.data<float>();
   items->clear();
   items->reserve(framework::product(ids.dims()));
   for (size_t offset = abs_lod[lod_level_][sent_offset_];
        offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
-    for (size_t d = 0; d < instance_dim; d++) {
-      const size_t dim_offset = offset * instance_dim + d;
-      items->emplace_back(offset, ids_data[dim_offset],
-                          scores_data[dim_offset]);
+    auto pre_id = pre_ids_data[offset];
+    auto pre_score = pre_scores_data[offset];
+    if (pre_id == end_id_) {
+      // Allocate all probability mass to eos_id for finished branchs and the
+      // other candidate ids can be ignored.
+      items->emplace_back(offset, end_id_, pre_score);
+    } else {
+      for (size_t d = 0; d < instance_dim; d++) {
+        const size_t dim_offset = offset * instance_dim + d;
+        items->emplace_back(offset, ids_data[dim_offset],
+                            scores_data[dim_offset]);
+      }
     }
   }
 
@@ -192,21 +211,31 @@ std::string ItemToString(const BeamSearch::Item &item) {
   return stream.str();
 }
 
-class BeamSearchProtoAndCheckerMaker
-    : public framework::OpProtoAndCheckerMaker {
+class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     // inputs and outputs stored in proto
-    AddInput("pre_ids", "ids in previous step");
-    AddInput("ids", "a LoDTensor of shape of [None,k]");
+    AddInput("pre_ids",
+             "(LoDTensor) The LoDTensor containing the selected ids at the "
+             "previous step. It should be a tensor with shape (batch_size, 1) "
+             "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at "
+             "thefirst step.");
+    AddInput("pre_scores",
+             "(LoDTensor) The LoDTensor containing the accumulated "
+             "scores corresponding to the selected ids at the previous step.");
+    AddInput("ids",
+             "(LoDTensor) The LoDTensor containing the candidates ids. Its "
+             "shape should be (batch_size * beam_size, K), where K supposed to "
+             "be beam_size.");
     AddInput("scores",
-             "a LoDTensor that has the same shape and LoD with `ids`");
+             "(LoDTensor) The LodTensor containing the accumulated scores "
+             "corresponding to Input(ids) and its shape is the same as the "
+             "shape of Input(ids).");
     AddOutput("selected_ids",
-              "a LoDTensor that stores the IDs selected by beam search");
-    AddOutput(
-        "selected_scores",
-        "a LoDTensor that has the same shape and LoD with `selected_ids`");
+              "A LodTensor that stores the IDs selected by beam search.");
+    AddOutput("selected_scores",
+              "A LoDTensor containing the accumulated scores corresponding to "
+              "Output(selected_ids).");
 
     // Attributes stored in AttributeMap
     AddAttr<int>("level", "the level of LoDTensor");
@@ -214,25 +243,50 @@ class BeamSearchProtoAndCheckerMaker
     AddAttr<int>("end_id",
                  "the token id which indicates the end of a sequence");
 
-    AddComment(
-        "This is a beam search operator that help to generate sequences.");
+    AddComment(R"DOC(
+This operator does the search in beams for one time step. 
+Specifically, it selects the top-K candidate word ids of current step from
+Input(ids) according to their Input(scores) for all source sentences,
+where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results
+from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores)
+are the output of beam_search at previous step, they are needed for special use
+to handle ended candidate translations. The paths linking prefixes and selected
+candidates are organized and reserved in lod.
+
+Note that the Input(scores) passed in should be accumulated scores, and
+length penalty should be done with extra operators before calculating the
+accumulated scores if needed, also suggest finding top-K before it and
+using the top-K candidates following.
+)DOC");
   }
 };
 
-class BeamSearchInferShape : public framework::InferShapeBase {
+class BeamSearchOp : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext *context) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
     for (const std::string &arg :
          std::vector<std::string>({"pre_ids", "ids", "scores"})) {
-      PADDLE_ENFORCE(context->HasInput(arg),
-                     "BeamSearch need input argument '%s'", arg);
+      PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'",
+                     arg);
     }
     for (const std::string &arg :
          std::vector<std::string>({"selected_ids", "selected_scores"})) {
-      PADDLE_ENFORCE(context->HasOutput(arg),
+      PADDLE_ENFORCE(ctx->HasOutput(arg),
                      "BeamSearch need output argument '%s'", arg);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("pre_ids")->type()),
+        platform::CPUPlace());
+    return kt;
+  }
 };
 
 class BeamSearchInferVarType : public framework::VarTypeInference {
@@ -240,10 +294,12 @@ class BeamSearchInferVarType : public framework::VarTypeInference {
   void operator()(const framework::OpDesc &op_desc,
                   framework::BlockDesc *block) const override {
     for (auto &o : op_desc.Output("selected_ids")) {
-      block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR);
+      auto &selected_ids = block->FindRecursiveOrCreateVar(o);
+      selected_ids.SetType(framework::proto::VarType::LOD_TENSOR);
     }
     for (auto &o : op_desc.Output("selected_scores")) {
-      block->Var(o)->SetType(framework::proto::VarType::LOD_TENSOR);
+      auto &selected_scores = block->FindRecursiveOrCreateVar(o);
+      selected_scores.SetType(framework::proto::VarType::LOD_TENSOR);
     }
   }
 };
@@ -251,8 +307,13 @@ class BeamSearchInferVarType : public framework::VarTypeInference {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
-                  paddle::operators::BeamSearchProtoAndCheckerMaker,
-                  paddle::operators::BeamSearchInferShape,
-                  paddle::operators::BeamSearchInferVarType,
-                  paddle::framework::EmptyGradOpMaker);
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(beam_search, ops::BeamSearchOp, ops::BeamSearchOpMaker,
+                  ops::BeamSearchInferVarType);
+REGISTER_OP_CPU_KERNEL(
+    beam_search,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::BeamSearchOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h
index b333ef4e6c73be15dfea2cadb153d2484b3daaf7..b5e2ed05924cc8b7bc06058b9b1103ba10be486e 100644
--- a/paddle/fluid/operators/beam_search_op.h
+++ b/paddle/fluid/operators/beam_search_op.h
@@ -14,10 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_TESTING
-#include "gtest/gtest.h"
-#endif
-
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/operator.h"
 
@@ -134,6 +132,7 @@ class BeamSearch {
    * that means no candidates is provided, and the task will stop running.
    */
   void operator()(const framework::LoDTensor& pre_ids,
+                  const framework::LoDTensor& pre_scores,
                   framework::LoDTensor* selected_ids,
                   framework::LoDTensor* selected_scores);
   /*
@@ -155,14 +154,16 @@ class BeamSearch {
 
  protected:
   /*
-   * Delete all the records that follows the end token.
+   * Prune the source sentences all branchs finished, and it is optional.
+   * Pruning must one step later than finishing (thus pre_ids is needed here),
+   * since the end tokens must be writed out.
    */
-  int PruneEndidCandidates(const framework::LoDTensor& pre_ids,
-                           std::vector<std::vector<Item>>* items);
+  void PruneEndBeams(const framework::LoDTensor& pre_ids,
+                     std::vector<std::vector<Item>>* items);
 
   /*
    * Transform the items into a map whose key is offset, value is the items.
-   * NOTE low performance
+   * NOTE low performance.
    */
   std::vector<std::vector<Item>> ToMap(
       const std::vector<std::vector<Item>>& inputs, size_t element_num);
@@ -170,12 +171,16 @@ class BeamSearch {
   /*
    * For each source, select top beam_size records.
    */
-  std::vector<std::vector<Item>> SelectTopBeamSizeItems();
+  std::vector<std::vector<Item>> SelectTopBeamSizeItems(
+      const framework::LoDTensor& pre_ids,
+      const framework::LoDTensor& pre_scores);
 
   /*
    * Get the items of next source sequence, return false if no remaining items.
    */
-  bool NextItemSet(std::vector<Item>* items);
+  bool NextItemSet(const framework::LoDTensor& pre_ids,
+                   const framework::LoDTensor& pre_scores,
+                   std::vector<Item>* items);
 
  private:
   size_t beam_size_;
@@ -190,49 +195,30 @@ std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
 
 std::string ItemToString(const BeamSearch::Item& item);
 
-class BeamSearchOp : public framework::OperatorBase {
+template <typename DeviceContext, typename T>
+class BeamSearchOpKernel : public framework::OpKernel<T> {
  public:
-  BeamSearchOp(const std::string& type,
-               const framework::VariableNameMap& inputs,
-               const framework::VariableNameMap& outputs,
-               const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  BeamSearchOp(const BeamSearchOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    PADDLE_THROW("Not Implemented");
-  }
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    auto ids_var = scope.FindVar(Input("ids"));
-    auto scores_var = scope.FindVar(Input("scores"));
-    auto pre_ids_var = scope.FindVar(Input("pre_ids"));
-    PADDLE_ENFORCE_NOT_NULL(ids_var);
-    PADDLE_ENFORCE_NOT_NULL(scores_var);
-    PADDLE_ENFORCE_NOT_NULL(pre_ids_var);
-
-    auto& ids = ids_var->Get<framework::LoDTensor>();
-    auto& scores = scores_var->Get<framework::LoDTensor>();
-    auto& pre_ids = pre_ids_var->Get<framework::LoDTensor>();
-    size_t level = Attr<int>("level");
-    size_t beam_size = Attr<int>("beam_size");
-    int end_id = Attr<int>("end_id");
-    BeamSearch alg(ids, scores, level, beam_size, end_id);
-
-    auto selected_ids_var = scope.FindVar(Output("selected_ids"));
-    auto selected_scores_var = scope.FindVar(Output("selected_scores"));
-    PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
-    PADDLE_ENFORCE_NOT_NULL(selected_scores_var);
-    auto& selected_ids_tensor =
-        *selected_ids_var->GetMutable<framework::LoDTensor>();
-    auto& selected_scores_tensor =
-        *selected_scores_var->GetMutable<framework::LoDTensor>();
-    alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* ids = context.Input<framework::LoDTensor>("ids");
+    auto* scores = context.Input<framework::LoDTensor>("scores");
+    auto* pre_ids = context.Input<framework::LoDTensor>("pre_ids");
+    auto* pre_scores = context.Input<framework::LoDTensor>("pre_scores");
+    PADDLE_ENFORCE_NOT_NULL(ids);
+    PADDLE_ENFORCE_NOT_NULL(scores);
+    PADDLE_ENFORCE_NOT_NULL(pre_ids);
+    PADDLE_ENFORCE_NOT_NULL(pre_scores);
+
+    size_t level = context.Attr<int>("level");
+    size_t beam_size = context.Attr<int>("beam_size");
+    int end_id = context.Attr<int>("end_id");
+    BeamSearch alg(*ids, *scores, level, beam_size, end_id);
+    auto selected_ids = context.Output<framework::LoDTensor>("selected_ids");
+    auto selected_scores =
+        context.Output<framework::LoDTensor>("selected_scores");
+    PADDLE_ENFORCE_NOT_NULL(selected_ids);
+    PADDLE_ENFORCE_NOT_NULL(selected_scores);
+    alg(*pre_ids, *pre_scores, selected_ids, selected_scores);
   }
 };
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc
index ec666359aa2bd81f1323b54f9a03235740c3a696..c4f4b478fbfc87e4178155132781214575c1e6b0 100644
--- a/paddle/fluid/operators/beam_search_op_test.cc
+++ b/paddle/fluid/operators/beam_search_op_test.cc
@@ -30,7 +30,7 @@ using std::endl;
 
 void CreateInput(LoDTensor* ids, LoDTensor* scores) {
   LoD lod;
-  vector<size_t> level0({0, 1, 4});
+  vector<size_t> level0({0, 2, 4});
   vector<size_t> level1({0, 1, 2, 3, 4});
   lod.push_back(level0);
   lod.push_back(level1);
@@ -64,17 +64,22 @@ TEST(beam_search_op, run) {
   for (int i = 0; i < 4; i++) {
     pre_ids.mutable_data<int64_t>(place)[i] = i + 1;
   }
+  LoDTensor pre_scores;
+  pre_scores.Resize(framework::make_ddim(vector<int64_t>(4, 1)));
+  for (int i = 0; i < 4; i++) {
+    pre_scores.mutable_data<float>(place)[i] = 0.1 * (i + 1);
+  }
 
-  BeamSearch beamsearch(ids, scores, (int64_t)0, (int64_t)2, 0);
+  BeamSearch beamsearch(ids, scores, (size_t)0, (size_t)2, 0);
   LoDTensor sids, sscores;
-  beamsearch(pre_ids, &sids, &sscores);
+  beamsearch(pre_ids, pre_scores, &sids, &sscores);
 
   LOG(INFO) << "score: " << sscores << endl;
 
   ASSERT_EQ(sids.lod(), sscores.lod());
 
-  vector<int> tids({2, 4, 3, 8});
-  vector<float> tscores({0.3, 0.5, 0.9, 0.7});
+  vector<int> tids({4, 2, 3, 8});
+  vector<float> tscores({0.5, 0.6, 0.9, 0.7});
 
   for (int i = 0; i < 4; i++) {
     ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2dc3399da183fbcf7664066f6f7ce12db3dc6d5e
--- /dev/null
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/bilinear_interp_op.h"
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class BilinearInterpOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of BilinearInterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of BilinearInterOp should not be null.");
+
+    auto dim_x = ctx->GetInputDim("X");  // NCHW format
+    int out_h = ctx->Attrs().Get<int>("out_h");
+    int out_w = ctx->Attrs().Get<int>("out_w");
+    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
+
+    if (ctx->HasInput("OutSize")) {
+      auto out_size_dim = ctx->GetInputDim("OutSize");
+      PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
+                        "OutSize's dimension size must be 1");
+      PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2");
+    }
+    std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
+    ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
+};
+
+class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of bilinear interpolation, "
+             "This is a 4-D tensor with shape of (N x C x h x w)");
+    AddInput("OutSize",
+             "This is a 1-D tensor with two number. "
+             "The first number is height and the second number is width.")
+        .AsDispensable();
+    AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)");
+
+    AddAttr<int>("out_h", "output height of bilinear interpolation op.");
+    AddAttr<int>("out_w", "output width of bilinear interpolation op.");
+    AddComment(R"DOC(
+          Bilinear interpolation is an extension of linear interpolation for 
+          interpolating functions of two variables (e.g. H-direction and 
+          W-direction in this op) on a rectilinear 2D grid. 
+          
+          The key idea is to perform linear interpolation first in one 
+          direction, and then again in the other direction.
+            
+          For details, please refer to Wikipedia: 
+          https://en.wikipedia.org/wiki/Bilinear_interpolation
+         )DOC");
+  }
+};
+
+class BilinearInterpOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto dim_x = ctx->GetInputDim("X");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(bilinear_interp, ops::BilinearInterpOp,
+                  ops::BilinearInterpOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(bilinear_interp_grad, ops::BilinearInterpOpGrad);
+REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::BilinearInterpKernel<float>,
+                       ops::BilinearInterpKernel<uint8_t>);
+REGISTER_OP_CPU_KERNEL(bilinear_interp_grad,
+                       ops::BilinearInterpGradKernel<float>);
diff --git a/paddle/fluid/operators/bilinear_interp_op.cu b/paddle/fluid/operators/bilinear_interp_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4c1971538495c6f111e9db18f4014786f6f0dd58
--- /dev/null
+++ b/paddle/fluid/operators/bilinear_interp_op.cu
@@ -0,0 +1,207 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/bilinear_interp_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+__global__ void KeBilinearInterpFw(
+    const T* in, const size_t in_img_h, const size_t in_img_w,
+    const size_t input_h, const size_t input_w, T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const T ratio_h, const T ratioW) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < nthreads) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+    int channel_id = out_id_w / out_img_size;
+
+    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
+    int in_img_idy = ratio_h * out_img_idy;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T h1lambda = ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+
+    int out_img_idx = tid % out_img_w;
+    int in_img_idx = ratioW * out_img_idx;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T w1lambda = ratioW * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                          in_img_idy * in_img_w + in_img_idx];
+
+    // bilinear interpolation
+    out[out_id_h * output_w + out_id_w] =
+        h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) +
+        h1lambda * (w2lambda * in_pos[h_id * in_img_w] +
+                    w1lambda * in_pos[h_id * in_img_w + w_id]);
+  }
+}
+
+template <typename T>
+__global__ void KeBilinearInterpBw(
+    T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h,
+    const size_t input_w, const T* out, const size_t out_img_h,
+    const size_t out_img_w, const size_t output_h, const size_t output_w,
+    const size_t num_channels, const T ratio_h, const T ratioW) {
+  int nthreads = output_h * output_w;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < nthreads) {
+    int out_id_h = tid / output_w;
+    int out_id_w = tid % output_w;
+    int in_img_size = input_w / num_channels;
+    int out_img_size = output_w / num_channels;
+    int channel_id = out_id_w / out_img_size;
+
+    int out_img_idy = (out_id_w % out_img_size) / out_img_w;
+    int in_img_idy = ratio_h * out_img_idy;
+    int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
+    T h1lambda = ratio_h * out_img_idy - in_img_idy;
+    T h2lambda = 1.f - h1lambda;
+
+    int out_img_idx = tid % out_img_w;
+    int in_img_idx = ratioW * out_img_idx;
+    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
+    T w1lambda = ratioW * out_img_idx - in_img_idx;
+    T w2lambda = 1.f - w1lambda;
+
+    T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size +
+                    in_img_idy * in_img_w + in_img_idx];
+    const T* out_pos = &out[out_id_h * output_w + out_id_w];
+    atomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]);
+    atomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]);
+    atomicAdd(&in_pos[h_id * in_img_w], h1lambda * w2lambda * out_pos[0]);
+    atomicAdd(&in_pos[h_id * in_img_w + w_id],
+              h1lambda * w1lambda * out_pos[0]);
+  }
+}
+
+template <typename T>
+class BilinearInterpOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
+    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
+    auto* input = input_t->data<T>();
+
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    auto out_dims = output_t->dims();
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+    auto* output = output_t->mutable_data<T>(
+        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
+
+    int batch_size = input_t->dims()[0];
+    int channels = input_t->dims()[1];
+    int in_h = input_t->dims()[2];
+    int in_w = input_t->dims()[3];
+
+    int in_hw = in_h * in_w;
+    int out_hw = out_h * out_w;
+    int in_chw = channels * in_hw;
+    int out_chw = channels * out_hw;
+
+    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
+    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+
+    if (in_h == out_h && in_w == out_w) {
+      memcpy(output, input, input_t->numel() * sizeof(T));
+    } else {
+      int threadNum = batch_size * out_chw;
+      int blocks = (threadNum + 1024 - 1) / 1024;
+
+      KeBilinearInterpFw<
+          T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
+          input, in_h, in_w, batch_size, in_chw, output, out_h, out_w,
+          batch_size, out_chw, channels, ratio_h, ratio_w);
+    }
+  }
+};
+
+template <typename T>
+class BilinearInterpGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_output = d_output_t->data<T>();
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
+
+    auto& device_ctx =
+        ctx.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> zero;
+    zero(device_ctx, d_input_t, static_cast<T>(0.0));
+
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      Tensor sizes;
+      framework::TensorCopy(*out_size_t, platform::CPUPlace(), &sizes);
+      auto size_data = sizes.data<int>();
+      out_h = size_data[0];
+      out_w = size_data[1];
+    }
+
+    int batch_size = d_input_t->dims()[0];
+    int channels = d_input_t->dims()[1];
+    int in_h = d_input_t->dims()[2];
+    int in_w = d_input_t->dims()[3];
+
+    int in_hw = in_h * in_w;
+    int out_hw = out_h * out_w;
+    int in_chw = channels * in_hw;
+    int out_chw = channels * out_hw;
+
+    T ratio_h = (out_h > 1) ? static_cast<T>(in_h - 1) / (out_h - 1) : 0.f;
+    T ratio_w = (out_w > 1) ? static_cast<T>(in_w - 1) / (out_w - 1) : 0.f;
+
+    if (in_h == out_h && in_w == out_w) {
+      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
+    } else {
+      int threadNum = batch_size * out_chw;
+      int blocks = (threadNum + 1024 - 1) / 1024;
+
+      KeBilinearInterpBw<
+          T><<<blocks, 1024, 0, ctx.cuda_device_context().stream()>>>(
+          d_input, in_h, in_w, batch_size, in_chw, d_output, out_h, out_w,
+          batch_size, out_chw, channels, ratio_h, ratio_w);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(bilinear_interp,
+                        ops::BilinearInterpOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(bilinear_interp_grad,
+                        ops::BilinearInterpGradOpCUDAKernel<float>);
diff --git a/paddle/fluid/operators/bilinear_interp_op.h b/paddle/fluid/operators/bilinear_interp_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..70847cb8c1abe2e94bc844ab8117d1f23fea533b
--- /dev/null
+++ b/paddle/fluid/operators/bilinear_interp_op.h
@@ -0,0 +1,163 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class BilinearInterpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
+    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
+    auto out_dims = output_t->dims();
+    auto* input = input_t->data<T>();
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      auto out_size_data = out_size_t->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+    auto* output = output_t->mutable_data<T>(
+        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
+    int batch_size = input_t->dims()[0];
+    int channels = input_t->dims()[1];
+    int in_h = input_t->dims()[2];
+    int in_w = input_t->dims()[3];
+
+    int in_hw = in_h * in_w;
+    int out_hw = out_h * out_w;
+    int in_chw = channels * in_hw;
+    int out_chw = channels * out_hw;
+
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+
+    if (in_h == out_h && in_w == out_w) {
+      memcpy(output, input, input_t->numel() * sizeof(T));
+    } else {
+      for (int k = 0; k < batch_size; ++k) {  // loop for batches
+        for (int i = 0; i < out_h; ++i) {     // loop for images
+          int h = ratio_h * i;
+          int hid = (h < in_h - 1) ? 1 : 0;
+          float h1lambda = ratio_h * i - h;
+          float h2lambda = 1.f - h1lambda;
+
+          for (int j = 0; j < out_w; ++j) {
+            int w = ratio_w * j;
+            int wid = (w < in_w - 1) ? 1 : 0;
+            float w1lambda = ratio_w * j - w;
+            float w2lambda = 1.f - w1lambda;
+            // calculate four position for bilinear interpolation
+            const T* in_pos = &input[k * in_chw + h * in_w + w];
+            T* out_pos = &output[k * out_chw + i * out_w + j];
+
+            for (int c = 0; c < channels; ++c) {  // loop for channels
+              // bilinear interpolation
+              out_pos[0] = static_cast<T>(
+                  h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
+                  h1lambda * (w2lambda * in_pos[hid * in_w] +
+                              w1lambda * in_pos[hid * in_w + wid]));
+              in_pos += in_hw;
+              out_pos += out_hw;
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class BilinearInterpGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_output = d_output_t->data<T>();
+    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
+    auto& device_ctx =
+        ctx.template device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, T> zero;
+    zero(device_ctx, d_input_t, static_cast<T>(0.0));
+
+    int out_h = ctx.Attr<int>("out_h");
+    int out_w = ctx.Attr<int>("out_w");
+
+    auto out_size_t = ctx.Input<Tensor>("OutSize");
+    if (out_size_t != nullptr) {
+      auto out_size_data = out_size_t->data<int>();
+      out_h = out_size_data[0];
+      out_w = out_size_data[1];
+    }
+
+    int batch_size = d_input_t->dims()[0];
+    int channels = d_input_t->dims()[1];
+    int in_h = d_input_t->dims()[2];
+    int in_w = d_input_t->dims()[3];
+
+    int in_hw = in_h * in_w;
+    int out_hw = out_h * out_w;
+    int in_chw = channels * in_hw;
+    int out_chw = channels * out_hw;
+
+    float ratio_h =
+        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
+    float ratio_w =
+        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
+
+    if (in_h == out_h && in_w == out_w) {
+      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
+    } else {
+      for (int k = 0; k < batch_size; ++k) {  // loop for batches
+        for (int i = 0; i < out_h; ++i) {     // loop for images
+          int h = ratio_h * i;
+          int hid = (h < in_h - 1) ? 1 : 0;
+          float h1lambda = ratio_h * i - h;
+          float h2lambda = 1 - h1lambda;
+
+          for (int j = 0; j < out_w; ++j) {
+            int w = ratio_w * j;
+            int wid = (w < in_w - 1) ? 1 : 0;
+            float w1lambda = ratio_w * j - w;
+            float w2lambda = 1 - w1lambda;
+            T* in_pos = &d_input[k * in_chw + h * in_w + w];
+            const T* out_pos = &d_output[k * out_chw + i * out_w + j];
+
+            for (int c = 0; c < channels; ++c) {  // loop for channels
+              in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
+              in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
+              in_pos[hid * in_w] +=
+                  static_cast<T>(h1lambda * w2lambda * out_pos[0]);
+              in_pos[hid * in_w + wid] +=
+                  static_cast<T>(h1lambda * w1lambda * out_pos[0]);
+              in_pos += in_hw;
+              out_pos += out_hw;
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc
index 2ec984d8e0f07b741f5e36f281134c0469079afd..8d261a118a75ee16027faf60341cefd30c3cdbba 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
@@ -65,8 +65,7 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel {
 
 class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The first input of bilinear_tensor_product operator.");
     AddInput("Y", "The second input of bilinear_tensor_product operator.");
     AddInput("Weight",
@@ -153,9 +152,11 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
-            ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
-            ops::BilinearTensorProductOpGrad);
+REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp,
+                  ops::BilinearTensorProductOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(bilinear_tensor_product_grad,
+                  ops::BilinearTensorProductOpGrad);
 REGISTER_OP_CPU_KERNEL(
     bilinear_tensor_product,
     ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h
index ca80e6085c4f8b242cd26803202475cd50474bcd..f23336f7b98d6d71d155373cff3515a8463aecbe 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.h
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -61,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
       auto output_col_vec = output_mat.chip(i, 1);
       Tensor weight_mat =
           weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-      math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasNoTrans,
-                                   batch_size, y_dim, x_dim, 1, x->data<T>(),
-                                   weight_mat.data<T>(), 0, left_mul.data<T>());
+      math::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
+          CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data<T>(),
+          weight_mat.data<T>(), 0, left_mul.data<T>());
       output_col_vec.device(place) =
           (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
     }
@@ -125,6 +125,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
       set_zero(dev_ctx, d_y, static_cast<T>(0));
     }
 
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+
     // Caculate the Output(X@Grad) and Output(Y@Grad).
     if (d_x || d_y) {
       Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
@@ -138,18 +140,16 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
               output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                   .broadcast(bcast_for_x) *
               y_mat;
-          math::gemm<DeviceContext, T>(
-              dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
-              y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
+          blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
+                    y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
         }
         if (d_y) {
           x_scale_mat.device(place) =
               output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                   .broadcast(bcast_for_y) *
               x_mat;
-          math::gemm<DeviceContext, T>(
-              dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
-              x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
+          blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
+                    x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
         }
       }
     }
@@ -166,9 +166,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
             output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
                 .broadcast(bcast_for_weight) *
             x_mat;
-        math::gemm<DeviceContext, T>(dev_ctx, CblasTrans, CblasNoTrans, x_dim,
-                                     y_dim, batch_size, 1, x_scale.data<T>(),
-                                     y->data<T>(), 0, d_weight_i.data<T>());
+        blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1,
+                  x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>());
       }
     }
 
diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/bipartite_match_op.cc
deleted file mode 100644
index 1218d9fdc1e6101d17bc09a4ae769f5fbf8e7b15..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/bipartite_match_op.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-class BipartiteMatchOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
-                   "Input(DistMat) of BipartiteMatch should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("ColToRowMatchIndices"),
-        "Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("ColToRowMatchDist"),
-        "Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
-
-    auto dims = ctx->GetInputDim("DistMat");
-    PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
-
-    ctx->SetOutputDim("ColToRowMatchIndices", dims);
-    ctx->SetOutputDim("ColToRowMatchDist", dims);
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("DistMat")->type()),
-        platform::CPUPlace());
-  }
-};
-
-template <typename T>
-class BipartiteMatchKernel : public framework::OpKernel<T> {
- public:
-  // The match_indices must be initialized to -1 at first.
-  // The match_dist must be initialized to 0 at first.
-  void BipartiteMatch(const Tensor& dist, int* match_indices,
-                      T* match_dist) const {
-    constexpr T kEPS = static_cast<T>(1e-6);
-    PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
-    int64_t row = dist.dims()[0];
-    int64_t col = dist.dims()[1];
-    auto* dist_data = dist.data<T>();
-    std::vector<int> row_pool;
-    for (int i = 0; i < row; ++i) {
-      row_pool.push_back(i);
-    }
-    while (row_pool.size() > 0) {
-      int max_idx = -1;
-      int max_row_idx = -1;
-      T max_dist = -1;
-      for (int64_t j = 0; j < col; ++j) {
-        if (match_indices[j] != -1) {
-          continue;
-        }
-        for (size_t k = 0; k < row_pool.size(); ++k) {
-          int m = row_pool[k];
-          // distance is 0 between m-th row and j-th column
-          if (dist_data[m * col + j] < kEPS) {
-            continue;
-          }
-          if (dist_data[m * col + j] > max_dist) {
-            max_idx = j;
-            max_row_idx = m;
-            max_dist = dist_data[m * col + j];
-          }
-        }
-      }
-      if (max_idx == -1) {
-        // Cannot find good match.
-        break;
-      } else {
-        PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
-        match_indices[max_idx] = max_row_idx;
-        match_dist[max_idx] = max_dist;
-        // Erase the row index.
-        row_pool.erase(
-            std::find(row_pool.begin(), row_pool.end(), max_row_idx));
-      }
-    }
-  }
-
-  void ArgMaxMatch(const Tensor& dist, int* match_indices, T* match_dist,
-                   T overlap_threshold) const {
-    constexpr T kEPS = static_cast<T>(1e-6);
-    int64_t row = dist.dims()[0];
-    int64_t col = dist.dims()[1];
-    auto* dist_data = dist.data<T>();
-    for (int64_t j = 0; j < col; ++j) {
-      if (match_indices[j] != -1) {
-        // the j-th column has been matched to one entity.
-        continue;
-      }
-      int max_row_idx = -1;
-      T max_dist = -1;
-      for (int i = 0; i < row; ++i) {
-        T dist = dist_data[i * col + j];
-        if (dist < kEPS) {
-          // distance is 0 between m-th row and j-th column
-          continue;
-        }
-        if (dist >= overlap_threshold && dist > max_dist) {
-          max_row_idx = i;
-          max_dist = dist;
-        }
-      }
-      if (max_row_idx != -1) {
-        PADDLE_ENFORCE_EQ(match_indices[j], -1);
-        match_indices[j] = max_row_idx;
-        match_dist[j] = max_dist;
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dist_mat = context.Input<LoDTensor>("DistMat");
-    auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
-    auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
-
-    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
-
-    auto col = dist_mat->dims()[1];
-
-    int64_t n = dist_mat->lod().size() == 0UL
-                    ? 1
-                    : static_cast<int64_t>(dist_mat->lod().back().size() - 1);
-    if (dist_mat->lod().size()) {
-      PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL,
-                        "Only support 1 level of LoD.");
-    }
-    match_indices->mutable_data<int>({n, col}, context.GetPlace());
-    match_dist->mutable_data<T>({n, col}, context.GetPlace());
-
-    math::SetConstant<platform::CPUDeviceContext, int> iset;
-    iset(dev_ctx, match_indices, static_cast<int>(-1));
-    math::SetConstant<platform::CPUDeviceContext, T> tset;
-    tset(dev_ctx, match_dist, static_cast<T>(0));
-
-    int* indices = match_indices->data<int>();
-    T* dist = match_dist->data<T>();
-    auto type = context.Attr<std::string>("match_type");
-    auto threshold = context.Attr<float>("dist_threshold");
-    if (n == 1) {
-      BipartiteMatch(*dist_mat, indices, dist);
-      if (type == "per_prediction") {
-        ArgMaxMatch(*dist_mat, indices, dist, threshold);
-      }
-    } else {
-      auto lod = dist_mat->lod().back();
-      for (size_t i = 0; i < lod.size() - 1; ++i) {
-        Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
-        BipartiteMatch(one_ins, indices + i * col, dist + i * col);
-        if (type == "per_prediction") {
-          ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
-        }
-      }
-    }
-  }
-};
-
-class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  BipartiteMatchOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "DistMat",
-        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
-        "[K, M]. It is pair-wise distance matrix between the entities "
-        "represented by each row and each column. For example, assumed one "
-        "entity is A with shape [K], another entity is B with shape [M]. The "
-        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
-        "the distance is, the better macthing the pairs are. Please note, "
-        "This tensor can contain LoD information to represent a batch of "
-        "inputs. One instance of this batch can contain different numbers of "
-        "entities.");
-    AddAttr<std::string>(
-        "match_type",
-        "(string, defalut: per_prediction) "
-        "The type of matching method, should be 'bipartite' or "
-        "'per_prediction', 'bipartite' by defalut.")
-        .SetDefault("bipartite")
-        .InEnum({"bipartite", "per_prediction"});
-    AddAttr<float>(
-        "dist_threshold",
-        "(float, defalut: 0.5) "
-        "If `match_type` is 'per_prediction', this threshold is to determine "
-        "the extra matching bboxes based on the maximum distance.")
-        .SetDefault(0.5);
-    AddOutput("ColToRowMatchIndices",
-              "(Tensor) A 2-D Tensor with shape [N, M] in int type. "
-              "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it "
-              "means B[j] does not match any entity in i-th instance. "
-              "Otherwise, it means B[j] is matched to row "
-              "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
-              "i-th instance is saved in ColToRowMatchIndices[i][j].");
-    AddOutput("ColToRowMatchDist",
-              "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
-              "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
-              "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
-              "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
-              "instance are called LoD. Then "
-              "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
-    AddComment(R"DOC(
-This operator is a greedy bipartite matching algorithm, which is used to
-obtain the matching with the maximum distance based on the input
-distance matrix. For input 2D matrix, the bipartite matching algorithm can
-find the matched column for each row, also can find the matched row for
-each column. And this operator only calculate matched indices from column
-to row. For each instance, the number of matched indices is the number of
-of columns of the input distance matrix.
-
-There are two outputs to save matched indices and distance.
-A simple description, this algorithm matched the best (maximum distance)
-row entity to the column entity and the matched indices are not duplicated
-in each row of ColToRowMatchIndices. If the column entity is not matched
-any row entity, set -1 in ColToRowMatchIndices.
-
-Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
-If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
-If Tensor, the height of ColToRowMatchIndices is 1.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp,
-                  ops::BipartiteMatchOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel<float>,
-                       ops::BipartiteMatchKernel<double>);
diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/box_coder_op.cc
deleted file mode 100644
index eccdd408a17a07a541480705242b137f8207c139..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/box_coder_op.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/box_coder_op.h"
-
-namespace paddle {
-namespace operators {
-
-class BoxCoderOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
-                   "Input(PriorBox) of BoxCoderOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
-                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
-                   "Input(TargetBox) of BoxCoderOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
-                   "Output(OutputBox) of BoxCoderOp should not be null.");
-
-    auto prior_box_dims = ctx->GetInputDim("PriorBox");
-    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
-    auto target_box_dims = ctx->GetInputDim("TargetBox");
-
-    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
-                      "The rank of Input of PriorBoxVar must be 2");
-    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
-    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
-
-    auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
-    if (code_type == BoxCodeType::kEncodeCenterSize) {
-      PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
-                        "The rank of Input of TargetBox must be 2");
-      PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
-                        "The shape of TargetBox is [M, 4]");
-    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
-                        "The rank of Input of TargetBox must be 3");
-      PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
-      PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
-    }
-
-    ctx->SetOutputDim(
-        "OutputBox",
-        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
-    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
-  }
-};
-
-class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "PriorBox",
-        "(Tensor, default Tensor<float>) "
-        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
-        "each box is represented as [xmin, ymin, xmax, ymax], "
-        "[xmin, ymin] is the left top coordinate of the anchor box, "
-        "if the input is image feature map, they are close to the origin "
-        "of the coordinate system. [xmax, ymax] is the right bottom "
-        "coordinate of the anchor box.");
-    AddInput("PriorBoxVar",
-             "(Tensor, default Tensor<float>) "
-             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
-             "of variance.");
-    AddInput(
-        "TargetBox",
-        "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
-        "[N, 4] when code_type is 'encode_center_size'. This input also can "
-        "be a 3-D Tensor with shape [N, M, 4] when code_type is "
-        "'decode_center_size'. [N, 4], each box is represented as "
-        "[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate "
-        "of the box if the input is image feature map, they are close to "
-        "the origin of the coordinate system. [xmax, ymax] is the right "
-        "bottom coordinate of the box. This tensor can contain LoD "
-        "information to represent a batch of inputs. One instance of this "
-        "batch can contain different numbers of entities.");
-    AddAttr<std::string>("code_type",
-                         "(string, default encode_center_size) "
-                         "the code type used with the target box")
-        .SetDefault("encode_center_size")
-        .InEnum({"encode_center_size", "decode_center_size"});
-    AddOutput("OutputBox",
-              "(LoDTensor or Tensor) "
-              "When code_type is 'encode_center_size', the output tensor of "
-              "box_coder_op with shape [N, M, 4] representing the result of N "
-              "target boxes encoded with M Prior boxes and variances. When "
-              "code_type is 'decode_center_size', N represents the batch size "
-              "and M represents the number of deocded boxes.");
-
-    AddComment(R"DOC(
-Bounding Box Coder Operator.
-Encode/Decode the target bounding box with the priorbox information.
-The Encoding schema described below:
-ox = (tx - px) / pw / pxv
-oy = (ty - py) / ph / pyv
-ow = log(abs(tw / pw)) / pwv 
-oh = log(abs(th / ph)) / phv 
-The Decoding schema described below:
-ox = (pw * pxv * tx * + px) - tw / 2
-oy = (ph * pyv * ty * + py) - th / 2
-ow = exp(pwv * tw) * pw + tw / 2
-oh = exp(phv * th) * ph + th / 2
-where tx, ty, tw, th denote the target box's center coordinates, width and
-height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
-center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
-of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
-width and height.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
-REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
-                       ops::BoxCoderKernel<double>);
diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/box_coder_op.cu
deleted file mode 100644
index 0944e9c95d4a66cc4a51751a8c70cd7a3fefaf1a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/box_coder_op.cu
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/box_coder_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
-                                       const T* prior_box_var_data,
-                                       const T* target_box_data, const int row,
-                                       const int col, const int len,
-                                       T* output) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < row * col) {
-    const int row_idx = idx / col;
-    const int col_idx = idx % col;
-    T prior_box_width =
-        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
-    T prior_box_height =
-        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
-    T prior_box_center_x =
-        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
-    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
-                            prior_box_data[col_idx * len + 1]) /
-                           2;
-
-    T target_box_center_x =
-        (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
-        2;
-    T target_box_center_y = (target_box_data[row_idx * len + 3] +
-                             target_box_data[row_idx * len + 1]) /
-                            2;
-    T target_box_width =
-        target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
-    T target_box_height =
-        target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
-
-    output[idx * len] = (target_box_center_x - prior_box_center_x) /
-                        prior_box_width / prior_box_var_data[col_idx * len];
-    output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
-                            prior_box_height /
-                            prior_box_var_data[col_idx * len + 1];
-    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
-                            prior_box_var_data[col_idx * len + 2];
-    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
-                            prior_box_var_data[col_idx * len + 3];
-  }
-}
-
-template <typename T>
-__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
-                                       const T* prior_box_var_data,
-                                       const T* target_box_data, const int row,
-                                       const int col, const int len,
-                                       T* output) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < row * col) {
-    const int col_idx = idx % col;
-    T prior_box_width =
-        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
-    T prior_box_height =
-        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
-    T prior_box_center_x =
-        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
-    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
-                            prior_box_data[col_idx * len + 1]) /
-                           2;
-
-    T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
-                             target_box_data[idx * len + 2]) *
-                         prior_box_width;
-    T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
-                              target_box_data[idx * len + 3]) *
-                          prior_box_height;
-    T target_box_center_x = prior_box_var_data[col_idx * len] *
-                                target_box_data[idx * len] * prior_box_width +
-                            prior_box_center_x;
-    T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
-                                target_box_data[idx * len + 1] *
-                                prior_box_height +
-                            prior_box_center_y;
-
-    output[idx * len] = target_box_center_x - target_box_width / 2;
-    output[idx * len + 1] = target_box_center_y - target_box_height / 2;
-    output[idx * len + 2] = target_box_center_x + target_box_width / 2;
-    output[idx * len + 3] = target_box_center_y + target_box_height / 2;
-  }
-}
-
-template <typename T>
-class BoxCoderCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
-    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
-    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto* output_box = context.Output<framework::Tensor>("OutputBox");
-
-    if (target_box->lod().size()) {
-      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
-                        "Only support 1 level of LoD.");
-    }
-    auto row = target_box->dims()[0];
-    auto col = prior_box->dims()[0];
-    auto len = prior_box->dims()[1];
-    int block = 512;
-    int grid = (row * col + block - 1) / block;
-    auto& device_ctx = context.cuda_device_context();
-
-    const T* prior_box_data = prior_box->data<T>();
-    const T* prior_box_var_data = prior_box_var->data<T>();
-    const T* target_box_data = target_box->data<T>();
-
-    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
-    T* output = output_box->data<T>();
-
-    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
-    if (code_type == BoxCodeType::kEncodeCenterSize) {
-      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
-          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          output);
-    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
-          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
-          output);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
-                        ops::BoxCoderCUDAKernel<double>);
diff --git a/paddle/fluid/operators/box_coder_op.h b/paddle/fluid/operators/box_coder_op.h
deleted file mode 100644
index 3c7cac1cd17042994287effc31a918ebd4353c4c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/box_coder_op.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
-
-inline BoxCodeType GetBoxCodeType(const std::string& type) {
-  if (type == "encode_center_size") {
-    return BoxCodeType::kEncodeCenterSize;
-  } else if (type == "decode_center_size") {
-    return BoxCodeType::kDecodeCenterSize;
-  }
-  PADDLE_THROW("Not support type %s.", type);
-}
-
-template <typename T>
-class BoxCoderKernel : public framework::OpKernel<T> {
- public:
-  void EncodeCenterSize(const framework::Tensor& target_box,
-                        const framework::Tensor& prior_box,
-                        const framework::Tensor& prior_box_var,
-                        T* output) const {
-    int64_t row = target_box.dims()[0];
-    int64_t col = prior_box.dims()[0];
-    int64_t len = prior_box.dims()[1];
-    auto* target_box_data = target_box.data<T>();
-    auto* prior_box_data = prior_box.data<T>();
-    auto* prior_box_var_data = prior_box_var.data<T>();
-
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        T prior_box_width =
-            prior_box_data[j * len + 2] - prior_box_data[j * len];
-        T prior_box_height =
-            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
-        T prior_box_center_x =
-            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
-        T prior_box_center_y =
-            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
-
-        T target_box_center_x =
-            (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
-        T target_box_center_y =
-            (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
-        T target_box_width =
-            target_box_data[i * len + 2] - target_box_data[i * len];
-        T target_box_height =
-            target_box_data[i * len + 3] - target_box_data[i * len + 1];
-
-        size_t offset = i * col * len + j * len;
-        output[offset] = (target_box_center_x - prior_box_center_x) /
-                         prior_box_width / prior_box_var_data[j * len];
-        output[offset + 1] = (target_box_center_y - prior_box_center_y) /
-                             prior_box_height / prior_box_var_data[j * len + 1];
-        output[offset + 2] =
-            std::log(std::fabs(target_box_width / prior_box_width)) /
-            prior_box_var_data[j * len + 2];
-        output[offset + 3] =
-            std::log(std::fabs(target_box_height / prior_box_height)) /
-            prior_box_var_data[j * len + 3];
-      }
-    }
-  }
-  void DecodeCenterSize(const framework::Tensor& target_box,
-                        const framework::Tensor& prior_box,
-                        const framework::Tensor& prior_box_var,
-                        T* output) const {
-    int64_t row = target_box.dims()[0];
-    int64_t col = prior_box.dims()[0];
-    int64_t len = prior_box.dims()[1];
-
-    auto* target_box_data = target_box.data<T>();
-    auto* prior_box_data = prior_box.data<T>();
-    auto* prior_box_var_data = prior_box_var.data<T>();
-
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        size_t offset = i * col * len + j * len;
-        T prior_box_width =
-            prior_box_data[j * len + 2] - prior_box_data[j * len];
-        T prior_box_height =
-            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
-        T prior_box_center_x =
-            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
-        T prior_box_center_y =
-            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
-
-        T target_box_center_x = prior_box_var_data[j * len] *
-                                    target_box_data[offset] * prior_box_width +
-                                prior_box_center_x;
-        T target_box_center_y = prior_box_var_data[j * len + 1] *
-                                    target_box_data[offset + 1] *
-                                    prior_box_height +
-                                prior_box_center_y;
-        T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
-                                      target_box_data[offset + 2]) *
-                             prior_box_width;
-        T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
-                                       target_box_data[offset + 3]) *
-                              prior_box_height;
-
-        output[offset] = target_box_center_x - target_box_width / 2;
-        output[offset + 1] = target_box_center_y - target_box_height / 2;
-        output[offset + 2] = target_box_center_x + target_box_width / 2;
-        output[offset + 3] = target_box_center_y + target_box_height / 2;
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
-    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
-    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto* output_box = context.Output<framework::Tensor>("OutputBox");
-
-    if (target_box->lod().size()) {
-      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
-                        "Only support 1 level of LoD.");
-    }
-    auto row = target_box->dims()[0];
-    auto col = prior_box->dims()[0];
-    auto len = prior_box->dims()[1];
-
-    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
-
-    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
-    T* output = output_box->data<T>();
-    if (code_type == BoxCodeType::kEncodeCenterSize) {
-      EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
-    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
-      DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc
index 72f8cb04f2de3af4ee526c3d9b86ff96e34f0b0a..8d6a498dc941e44688ec8a2b49a6e080608f9b85 100644
--- a/paddle/fluid/operators/cast_op.cc
+++ b/paddle/fluid/operators/cast_op.cc
@@ -14,14 +14,14 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
 class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input tensor of cast op");
     AddOutput("Out", "The output tensor of cast op");
     AddAttr<int>("out_dtype", "output data type");
@@ -88,4 +88,6 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
                        ops::CastOpKernel<CPU, double>,
                        ops::CastOpKernel<CPU, int>,
                        ops::CastOpKernel<CPU, int64_t>,
-                       ops::CastOpKernel<CPU, bool>);
+                       ops::CastOpKernel<CPU, bool>,
+                       ops::CastOpKernel<CPU, uint8_t>,
+                       ops::CastOpKernel<CPU, paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu
index 507e9a531aae70e60bc6748bfab800310d6e0c21..657d162878c108760585ca9bd58e2fd34bf1fef3 100644
--- a/paddle/fluid/operators/cast_op.cu
+++ b/paddle/fluid/operators/cast_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cast_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 template <typename T>
 using CastOpKernel =
@@ -20,4 +21,5 @@ using CastOpKernel =
 
 REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
                         CastOpKernel<int>, CastOpKernel<int64_t>,
-                        CastOpKernel<bool>);
+                        CastOpKernel<bool>, CastOpKernel<uint8_t>,
+                        CastOpKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/channel_close_op.cc b/paddle/fluid/operators/channel_close_op.cc
index 5892650c49e2e9d7345fb94465d124cff57f0a6f..8e2db250a069c488ee98f618bc03df6485022456 100644
--- a/paddle/fluid/operators/channel_close_op.cc
+++ b/paddle/fluid/operators/channel_close_op.cc
@@ -50,8 +50,7 @@ class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
 
 class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChannelCloseOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kChannel,
              "The Channel Variable that should be closed by"
              " the ChannelClose Op.");
diff --git a/paddle/fluid/operators/channel_create_op.cc b/paddle/fluid/operators/channel_create_op.cc
index b2fdfd0e1f24ed071bb57b7de8f99b2d5e1d3196..a7f59e4088e3fb328e5b5a83eed65f0f90edb9f0 100644
--- a/paddle/fluid/operators/channel_create_op.cc
+++ b/paddle/fluid/operators/channel_create_op.cc
@@ -91,8 +91,7 @@ class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
 
 class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChannelCreateOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput(kOutput,
               "The object of a Channel type created by ChannelCreate Op.");
     AddAttr<int>("capacity", "The size of the buffer of Channel.")
diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc
index c12b88e7a91c4ea7044223464a2f902db494d1a8..101015e837e28b504b71d919abd5f908a102c812 100644
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <paddle/fluid/framework/reader.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/concurrency/channel_util.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 static constexpr char Channel[] = "Channel";
@@ -28,33 +29,14 @@ namespace paddle {
 namespace operators {
 
 void SetReceiveStatus(const platform::Place &dev_place,
-                      framework::Variable &status_var, bool status) {
+                      framework::Variable *status_var, bool status) {
   auto cpu = platform::CPUPlace();
   auto status_tensor =
-      status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
-                                                                        cpu);
+      status_var->GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
+                                                                         cpu);
   status_tensor[0] = status;
 }
 
-bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var) {
-  // Get type of channel and use that to call mutable data for Variable
-  auto type = framework::ToVarType(ch->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR)
-    return ch->Receive(var->GetMutable<framework::LoDTensor>());
-  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
-    return ch->Receive(var->GetMutable<framework::LoDRankTable>());
-  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
-    return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
-  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
-    return ch->Receive(var->GetMutable<framework::SelectedRows>());
-  else if (type == framework::proto::VarType_Type_READER)
-    return ch->Receive(var->GetMutable<framework::ReaderHolder>());
-  else if (type == framework::proto::VarType_Type_CHANNEL)
-    return ch->Receive(var->GetMutable<framework::ChannelHolder>());
-  else
-    PADDLE_THROW("ChannelReceive:Unsupported type");
-}
-
 class ChannelRecvOp : public framework::OperatorBase {
  public:
   ChannelRecvOp(const std::string &type,
@@ -81,17 +63,16 @@ class ChannelRecvOp : public framework::OperatorBase {
         scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
     auto output_var = scope.FindVar(Output(Out));
     // Receive the data from the channel.
-    bool ok = ChannelReceive(ch, output_var);
+    bool ok = concurrency::ChannelReceive(ch, output_var);
 
     // Set the status output of the `ChannelReceive` call.
-    SetReceiveStatus(dev_place, *scope.FindVar(Output(Status)), ok);
+    SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok);
   }
 };
 
 class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChannelRecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(Channel,
              "(Channel) A variable which \"receives\" the a value sent"
              "to it by a channel_send op.")
diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc
index 6d7715ad229e821f02437246e3326063cb1ee757..67d6deb511d883ac69426ddd34be2199367cd4c7 100644
--- a/paddle/fluid/operators/channel_send_op.cc
+++ b/paddle/fluid/operators/channel_send_op.cc
@@ -18,43 +18,15 @@ limitations under the License. */
 #include <paddle/fluid/framework/reader.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/concurrency/channel_util.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 static constexpr char Channel[] = "Channel";
 static constexpr char X[] = "X";
-static constexpr char Status[] = "Status";
-static constexpr char copy[] = "copy";
 
 namespace paddle {
 namespace operators {
 
-void SetSendStatus(const platform::Place &dev_place,
-                   framework::Variable &status_var, bool status) {
-  auto cpu = platform::CPUPlace();
-  auto status_tensor =
-      status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
-                                                                        cpu);
-  status_tensor[0] = status;
-}
-
-bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR)
-    return ch->Send(var->GetMutable<framework::LoDTensor>());
-  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
-    return ch->Send(var->GetMutable<framework::LoDRankTable>());
-  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
-    return ch->Send(var->GetMutable<framework::LoDTensorArray>());
-  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
-    return ch->Send(var->GetMutable<framework::SelectedRows>());
-  else if (type == framework::proto::VarType_Type_READER)
-    return ch->Send(var->GetMutable<framework::ReaderHolder>());
-  else if (type == framework::proto::VarType_Type_CHANNEL)
-    return ch->Send(var->GetMutable<framework::ChannelHolder>());
-  else
-    PADDLE_THROW("ChannelSend:Unsupported type");
-}
-
 class ChannelSendOp : public framework::OperatorBase {
  public:
   ChannelSendOp(const std::string &type,
@@ -68,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase {
                    "Input(Channel) of ChannelSendOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(X),
                    "Input(X) of ChannelSendOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(Status),
-                   "Output(Status) of ChannelSendOp should not be null.");
-    ctx->SetOutputDim("Status", {1});
   }
 
  private:
@@ -82,29 +51,19 @@ class ChannelSendOp : public framework::OperatorBase {
     auto input_var = scope.FindVar(Input(X));
 
     // Send the input data through the channel.
-    bool ok = ChannelSend(ch, input_var);
-
-    // Set the status output of the `ChannelSend` call.
-    SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
+    concurrency::ChannelSend(ch, input_var);
   }
 };
 
 class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChannelSendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(Channel,
              "(Channel) A variable which \"sends\" the passed in value to "
              "a listening receiver.")
         .AsDuplicable();
     AddInput(X, "(Variable) The value which gets sent by the channel.")
         .AsDuplicable();
-    AddOutput(Status,
-              "(Tensor) An LoD Tensor that returns a boolean status of the"
-              "result of the send operation.")
-        .AsDuplicable();
-    AddAttr<bool>(copy, "(bool, default false) Should copy before send")
-        .SetDefault(false);
     AddComment(R"DOC(
 )DOC");
   }
diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4219a429a53eb4869426a2674109555fb784b85
--- /dev/null
+++ b/paddle/fluid/operators/checkpoint_notify_op.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/send_recv_util.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace paddle {
+namespace operators {
+
+class CheckpointNotifyOp : public framework::OperatorBase {
+ public:
+  CheckpointNotifyOp(const std::string& type,
+                     const framework::VariableNameMap& inputs,
+                     const framework::VariableNameMap& outputs,
+                     const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    std::string dir = Attr<std::string>("dir");
+    std::string lookup_table_name = Attr<std::string>("lookup_table");
+
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+    for (size_t i = 0; i < epmap.size(); i++) {
+      auto lookup_table_save_dir =
+          string::Sprintf("%s/%s_%d", dir, lookup_table_name, i);
+      rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir);
+      VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
+              << " and dir:" << dir << " to " << epmap[i];
+    }
+    rpc_client->Wait();
+  }
+};
+
+class CheckpointNotifyOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddAttr<std::vector<std::string>>("epmap",
+                                      "(string vector, default  127.0.0.1:6164)"
+                                      "Parameter Server endpoints in the order")
+        .SetDefault({"127.0.0.1:6164"});
+    AddAttr<std::string>(
+        "dir", "(string, default '') indicate the folder checkpoint will use");
+    AddAttr<std::string>("lookup_table",
+                         "(string, default '') the lookup table name");
+    AddComment(R"DOC(
+CheckpointNotify operator
+
+This operator will send lookup table and it's checkpoint direcoty to listen_and_serve op at
+the parameter server.
+)DOC");
+  }
+};
+
+class CheckpointNotifyOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(checkpoint_notify, ops::CheckpointNotifyOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::CheckpointNotifyOpMaker,
+                  ops::CheckpointNotifyOpShapeInference);
diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc
index 77d3cffe7c19affe66223363eba26e2d77cdcd43..dc43c69be0bcea2b82e1d61a9a5b2e03129d4f8e 100644
--- a/paddle/fluid/operators/chunk_eval_op.cc
+++ b/paddle/fluid/operators/chunk_eval_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/chunk_eval_op.h"
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -64,8 +66,7 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
 
 class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Inference",
              "(Tensor, default: Tensor<int64_t>). "
              "Predictions from the network.");
@@ -90,32 +91,31 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
         "(int64_t). The number of chunks both in Inference and Label on the "
         "given mini-batch.");
     AddAttr<int>("num_chunk_types",
-                 "(int). The number of chunk type. See below for details.");
-    AddAttr<std::string>(
-        "chunk_scheme",
-        "(string, default IOB). The labeling scheme indicating "
-        "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
-        "for details.")
+                 "The number of chunk type. See the description for details.");
+    AddAttr<std::string>("chunk_scheme",
+                         "The labeling scheme indicating "
+                         "how to encode the chunks. Must be IOB, IOE, IOBES or "
+                         "plain. See the description"
+                         "for details.")
         .SetDefault("IOB");
     AddAttr<std::vector<int>>("excluded_chunk_types",
-                              "(list<int>) A list including chunk type ids "
+                              "A list including chunk type ids "
                               "indicating chunk types that are not counted. "
-                              "See below for details.")
+                              "See the description for details.")
         .SetDefault(std::vector<int>{});
     AddComment(R"DOC(
 For some basics of chunking, please refer to
-‘Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>’.
+'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
 
-
-CheckEvalOp computes the precision, recall, and F1-score of chunk detection,
+ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
 and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
 Here is a NER example of labeling for these tagging schemes:
-
- 	     Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
-  IO:    I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
-  IOB:   B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
-  IOE:   I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
-  IOBES: B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
+   
+          Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
+   IO     I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
+   IOB    B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
+   IOE    I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
+   IOBES  B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
 
 There are three chunk types(named entity types) including PER(person), ORG(organization)
 and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
@@ -123,31 +123,31 @@ and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chun
 Since the calculations actually use label ids rather than labels, extra attention
 should be paid when mapping labels to ids to make CheckEvalOp work. The key point
 is that the listed equations are satisfied by ids.
-
-    tag_type = label % num_tag_type
-    chunk_type = label / num_tag_type
+   
+   tag_type = label % num_tag_type
+   chunk_type = label / num_tag_type
 
 where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
 is the num of chunk types, and `tag_type` get its value from the following table.
-
-    Scheme Begin Inside End   Single
-     plain   0     -      -     -
-     IOB     0     1      -     -
-     IOE     -     0      1     -
-     IOBES   0     1      2     3
+   
+   Scheme Begin Inside End   Single
+    plain   0     -      -     -
+    IOB     0     1      -     -
+    IOE     -     0      1     -
+    IOBES   0     1      2     3
 
 Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
 PER and LOC. To satisfy the above equations, the label map can be like this:
 
-    B-ORG  0
-    I-ORG  1
-    B-PER  2
-    I-PER  3
-    B-LOC  4
-    I-LOC  5
-    O      6
+   B-ORG  0
+   I-ORG  1
+   B-PER  2
+   I-PER  3
+   B-LOC  4
+   I-LOC  5
+   O      6
 
-It’s not hard to verify the equations noting that the num of chunk types
+It's not hard to verify the equations noting that the num of chunk types
 is 3 and the num of tag types in IOB scheme is 2. For example, the label
 id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
 I-LOC is 2, which consistent with the results from the equations.
diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h
index 9e97f7c7762ed6bded94be35ae8a094466e0aec0..8631415062db839476e2536a9836e4b9f069a3e2 100644
--- a/paddle/fluid/operators/chunk_eval_op.h
+++ b/paddle/fluid/operators/chunk_eval_op.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 #include <set>
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -36,11 +39,11 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
   };
 
   void GetSegments(const int64_t* label, int length,
-                   std::vector<Segment>& segments, int num_chunk_types,
+                   std::vector<Segment>* segments, int num_chunk_types,
                    int num_tag_types, int other_chunk_type, int tag_begin,
                    int tag_inside, int tag_end, int tag_single) const {
-    segments.clear();
-    segments.reserve(length);
+    segments->clear();
+    segments->reserve(length);
     int chunk_start = 0;
     bool in_chunk = false;
     int tag = -1;
@@ -58,7 +61,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
             i - 1,        // end
             prev_type,
         };
-        segments.push_back(segment);
+        segments->push_back(segment);
         in_chunk = false;
       }
       if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
@@ -73,7 +76,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
           length - 1,   // end
           type,
       };
-      segments.push_back(segment);
+      segments->push_back(segment);
     }
   }
 
@@ -177,8 +180,8 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     for (int i = 0; i < num_sequences; ++i) {
       int seq_length = lod[0][i + 1] - lod[0][i];
       EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
-                 output_segments, label_segments, *num_infer_chunks_data,
-                 *num_label_chunks_data, *num_correct_chunks_data,
+                 &output_segments, &label_segments, num_infer_chunks_data,
+                 num_label_chunks_data, num_correct_chunks_data,
                  num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
                  tag_inside, tag_end, tag_single, excluded_chunk_types);
     }
@@ -197,10 +200,10 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
   }
 
   void EvalOneSeq(const int64_t* output, const int64_t* label, int length,
-                  std::vector<Segment>& output_segments,
-                  std::vector<Segment>& label_segments,
-                  int64_t& num_output_segments, int64_t& num_label_segments,
-                  int64_t& num_correct, int num_chunk_types, int num_tag_types,
+                  std::vector<Segment>* output_segments,
+                  std::vector<Segment>* label_segments,
+                  int64_t* num_output_segments, int64_t* num_label_segments,
+                  int64_t* num_correct, int num_chunk_types, int num_tag_types,
                   int other_chunk_type, int tag_begin, int tag_inside,
                   int tag_end, int tag_single,
                   const std::set<int>& excluded_chunk_types) const {
@@ -209,25 +212,29 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
                 other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
     size_t i = 0, j = 0;
-    while (i < output_segments.size() && j < label_segments.size()) {
-      if (output_segments[i] == label_segments[j] &&
-          excluded_chunk_types.count(output_segments[i].type) != 1) {
-        ++num_correct;
+    while (i < output_segments->size() && j < label_segments->size()) {
+      if (output_segments->at(i) == label_segments->at(j) &&
+          excluded_chunk_types.count(output_segments->at(i).type) != 1) {
+        ++(*num_correct);
       }
-      if (output_segments[i].end < label_segments[j].end) {
+      if (output_segments->at(i).end < label_segments->at(j).end) {
         ++i;
-      } else if (output_segments[i].end > label_segments[j].end) {
+      } else if (output_segments->at(i).end > label_segments->at(j).end) {
         ++j;
       } else {
         ++i;
         ++j;
       }
     }
-    for (auto& segment : label_segments) {
-      if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments;
+    for (auto& segment : (*label_segments)) {
+      if (excluded_chunk_types.count(segment.type) != 1) {
+        ++(*num_label_segments);
+      }
     }
-    for (auto& segment : output_segments) {
-      if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments;
+    for (auto& segment : (*output_segments)) {
+      if (excluded_chunk_types.count(segment.type) != 1) {
+        ++(*num_output_segments);
+      }
     }
   }
 };
diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc
index f43726b4793f284f14226f90c94ac6eebf632bd5..eae86a373be278cbb3ea9425b2ff0169f8faa99e 100644
--- a/paddle/fluid/operators/clip_by_norm_op.cc
+++ b/paddle/fluid/operators/clip_by_norm_op.cc
@@ -37,8 +37,7 @@ class ClipByNormOp : public framework::OperatorWithKernel {
 
 class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor) The input of clip_by_norm op."
              "The number of dimensions must be between [1, 9].");
@@ -55,10 +54,19 @@ be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
 shown in the following formula:
 
 $$
-Out = \frac{max\_norm * X}{norm(X)},
+Out = \\frac{max\\_norm * X}{norm(X)},
 $$
 
 where $norm(X)$ represents the L2 norm of $X$.
+
+Examples:
+        .. code-block:: python
+
+            data = fluid.layer.data(
+                name='data', shape=[2, 4, 6], dtype='float32')
+            reshaped = fluid.layers.clip_by_norm(
+                x=data, max_norm=0.5)
+
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc
index a3b67964c79268e6ce07018501c46163847897ad..a679f7e2536a0a44148193f423f5ffe11b5e35fc 100644
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
@@ -38,8 +38,7 @@ class ClipOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor)The input of clip op."
              "The number of dimensions must be between [1, 9].");
@@ -81,8 +80,9 @@ class ClipOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
-            ops::ClipOpGrad);
+REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad);
 REGISTER_OP_CPU_KERNEL(
     clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc
index 86f7046058c7001fcaa588727b1cdc0f3f20c35f..f40b1ba338d429c248103eeb930ac7e1bb690218 100644
--- a/paddle/fluid/operators/compare_op.cc
+++ b/paddle/fluid/operators/compare_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/compare_op.h"
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -20,28 +21,28 @@ namespace operators {
 template <typename OpComment>
 class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     OpComment comment;
-    AddInput("X",
-             string::Sprintf("(LoDTensor) the left hand operand of %s operator",
-                             comment.type));
-    AddInput("Y", string::Sprintf(
-                      "(LoDTensor) the right hand operand of %s operator",
-                      comment.type));
-    AddOutput("Out", string::Sprintf(
-                         "(LoDTensor) n-dim bool tensor. Each element is %s",
-                         comment.equation));
-    AddComment(string::Sprintf(R"DOC(%s Operator
-
+    AddInput("X", string::Sprintf("the left hand operand of %s operator",
+                                  comment.type));
+    AddInput("Y", string::Sprintf("the right hand operand of %s operator",
+                                  comment.type));
+    AddAttr<bool>("force_cpu",
+                  "Force fill output variable to cpu "
+                  "memory. Otherwise, fill output variable to the running "
+                  "device [default true].")
+        .SetDefault(true);
+    AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s",
+                                     comment.equation));
+    AddComment(string::Sprintf(R"DOC(
 It operates element-wise on X and Y, and returns the Out. Each of them is a
 N-dim tensor. X and Y could be any type.  The each element of the Out tensor is
-calculated by %s
+calculated by $%s$
 )DOC",
-                               comment.type, comment.equation));
-    AddAttr<int>("axis",
-                 "(int, default -1). The start dimension index "
-                 "for broadcasting Y onto X.")
+                               comment.equation));
+    AddAttr<int>(
+        "axis",
+        "The start dimension index for broadcasting Y onto X. [default -1]")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
   }
@@ -75,7 +76,9 @@ class CompareOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
     // CompareOp kernel's device type is decided by input tensor place
-    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    bool force_cpu = ctx.Attr<bool>("force_cpu");
+    kt.place_ = force_cpu ? platform::CPUPlace()
+                          : ctx.Input<framework::LoDTensor>("X")->place();
     return kt;
   }
 };
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 0eedd8ee51ebfff6f553d8e19e97c3a45a95fa6a..c72405593788493e10a1293b0c722e2d11c6e312 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/concat_op.h"
+
+#include <string>
 #include <vector>
 
 namespace paddle {
@@ -33,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel {
     size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
     const size_t n = ins.size();
 
-    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
+    if (n == 1) {
+      VLOG(3) << "Warning: concat op have only one input, may waste memory";
+    }
 
     auto out_dims = ins[0];
     size_t in_zero_dims_size = out_dims.size();
@@ -58,8 +63,7 @@ class ConcatOp : public framework::OperatorWithKernel {
 
 class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "Input tensors of concat operator.").AsDuplicable();
     AddOutput("Out", "Output tensor of concat operator.");
     AddAttr<int>("axis",
@@ -98,10 +102,18 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
-               ops::ConcatOpGrad, false)
+REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<
+                      false> /* set false to disable empty grad */);
+REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
+    concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CPUDeviceContext, int>);
 REGISTER_OP_CPU_KERNEL(
     concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, int>);
diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc
index 590eca9d066ff7549939e62ddbfedc8ab76bb5e7..8e38e5231fbf6955ff8a9680a241a4a4ba1b924d 100644
--- a/paddle/fluid/operators/concat_op.cu.cc
+++ b/paddle/fluid/operators/concat_op.cu.cc
@@ -15,7 +15,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/concat_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>);
+    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatKernel<paddle::platform::CUDADeviceContext, int>);
 REGISTER_OP_CUDA_KERNEL(
     concat_grad,
-    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index 92c8ab6d9ff11ec6acd46a39877eb67d624748a9..a496301526f58875ff51aeaa5b2094c3c656531c 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -60,34 +60,45 @@ template <typename DeviceContext, typename T>
 class ConcatGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
     auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
 
+    // get output tensor that the name is not kEmptyVarName
+    std::vector<framework::Tensor*> outputs;
+    for (size_t j = 0; j < outs.size(); ++j) {
+      if (out_var_names[j] != framework::kEmptyVarName) {
+        outs[j]->mutable_data<T>(ctx.GetPlace());
+        outputs.push_back(outs[j]);
+      } else {
+        outputs.push_back(nullptr);
+      }
+    }
+
     // Sometimes direct copies will be faster, this maybe need deeply analysis.
     if (axis == 0 && outs.size() < 10) {
       size_t input_offset = 0;
-      auto in_stride = framework::stride_numel(in->dims());
+      const auto in_stride = framework::stride_numel(out_grad->dims());
 
-      for (auto& out : outs) {
-        out->mutable_data<T>(ctx.GetPlace());
-        auto out_stride = framework::stride_numel(out->dims());
-        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
-                                    out_stride, in->data<T>() + input_offset,
-                                    in_stride, out_stride[axis]);
+      for (size_t i = 0; i < outs.size(); ++i) {
+        auto out_stride = framework::stride_numel(ins[i]->dims());
+        auto* out = outputs[i];
+        if (out != nullptr) {
+          StridedNumelCopyWithAxis<T>(
+              ctx.device_context(), axis, out->data<T>(), out_stride,
+              out_grad->data<T>() + input_offset, in_stride, out_stride[axis]);
+        }
         input_offset += out_stride[axis];
       }
     } else {
-      std::vector<framework::Tensor> outputs(outs.size());
-      for (size_t j = 0; j < outs.size(); ++j) {
-        outs[j]->mutable_data<T>(ctx.GetPlace());
-        outputs[j] = *outs[j];
-      }
-
       auto& dev_ctx = ctx.template device_context<DeviceContext>();
       paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
           concat_grad_functor;
-      concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
+      concat_grad_functor(dev_ctx, *out_grad, ins, static_cast<int>(axis),
+                          &outputs);
     }
   }
 };
diff --git a/paddle/fluid/operators/concurrency/CMakeLists.txt b/paddle/fluid/operators/concurrency/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e4617440d152b4c15d09e81cd19c76739b95b979
--- /dev/null
+++ b/paddle/fluid/operators/concurrency/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3)
diff --git a/paddle/fluid/operators/concurrency/channel_util.cc b/paddle/fluid/operators/concurrency/channel_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fba4abf1897bceea615222b2438700085ed8e551
--- /dev/null
+++ b/paddle/fluid/operators/concurrency/channel_util.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/concurrency/channel_util.h"
+#include "paddle/fluid/framework/var_type.h"
+
+namespace poc = paddle::operators::concurrency;
+
+void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
+  auto type = framework::ToVarType(var->Type());
+  if (type == framework::proto::VarType_Type_LOD_TENSOR)
+    ch->Send(var->GetMutable<framework::LoDTensor>());
+  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
+    ch->Send(var->GetMutable<framework::LoDRankTable>());
+  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
+    ch->Send(var->GetMutable<framework::LoDTensorArray>());
+  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
+    ch->Send(var->GetMutable<framework::SelectedRows>());
+  else if (type == framework::proto::VarType_Type_READER)
+    ch->Send(var->GetMutable<framework::ReaderHolder>());
+  else if (type == framework::proto::VarType_Type_CHANNEL)
+    ch->Send(var->GetMutable<framework::ChannelHolder>());
+  else
+    PADDLE_THROW("ChannelSend:Unsupported type");
+}
+
+bool poc::ChannelReceive(framework::ChannelHolder *ch,
+                         framework::Variable *var) {
+  // Get type of channel and use that to call mutable data for Variable
+  auto type = framework::ToVarType(ch->Type());
+  if (type == framework::proto::VarType_Type_LOD_TENSOR)
+    return ch->Receive(var->GetMutable<framework::LoDTensor>());
+  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
+    return ch->Receive(var->GetMutable<framework::LoDRankTable>());
+  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
+    return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
+  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
+    return ch->Receive(var->GetMutable<framework::SelectedRows>());
+  else if (type == framework::proto::VarType_Type_READER)
+    return ch->Receive(var->GetMutable<framework::ReaderHolder>());
+  else if (type == framework::proto::VarType_Type_CHANNEL)
+    return ch->Receive(var->GetMutable<framework::ChannelHolder>());
+  else
+    PADDLE_THROW("ChannelReceive:Unsupported type");
+}
+
+void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
+                            framework::Variable *var,
+                            std::shared_ptr<std::condition_variable_any> cond,
+                            std::function<bool(framework::ChannelAction)> cb) {
+  auto type = framework::ToVarType(var->Type());
+  if (type == framework::proto::VarType_Type_LOD_TENSOR) {
+    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensor>(), cond, cb);
+  } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
+    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDRankTable>(), cond,
+                   cb);
+  } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
+    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensorArray>(), cond,
+                   cb);
+  } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
+    ch->AddToSendQ(referrer, var->GetMutable<framework::SelectedRows>(), cond,
+                   cb);
+  } else if (type == framework::proto::VarType_Type_READER) {
+    ch->AddToSendQ(referrer, var->GetMutable<framework::ReaderHolder>(), cond,
+                   cb);
+  } else if (type == framework::proto::VarType_Type_CHANNEL) {
+    ch->AddToSendQ(referrer, var->GetMutable<framework::ChannelHolder>(), cond,
+                   cb);
+  } else {
+    PADDLE_THROW("ChannelAddToSendQ:Unsupported type");
+  }
+}
+
+void poc::ChannelAddToReceiveQ(
+    framework::ChannelHolder *ch, const void *referrer,
+    framework::Variable *var, std::shared_ptr<std::condition_variable_any> cond,
+    std::function<bool(framework::ChannelAction)> cb) {
+  auto type = framework::ToVarType(var->Type());
+  if (type == framework::proto::VarType_Type_LOD_TENSOR) {
+    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensor>(), cond,
+                      cb);
+  } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
+    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDRankTable>(),
+                      cond, cb);
+  } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
+    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensorArray>(),
+                      cond, cb);
+  } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
+    ch->AddToReceiveQ(referrer, var->GetMutable<framework::SelectedRows>(),
+                      cond, cb);
+  } else if (type == framework::proto::VarType_Type_READER) {
+    ch->AddToReceiveQ(referrer, var->GetMutable<framework::ReaderHolder>(),
+                      cond, cb);
+  } else if (type == framework::proto::VarType_Type_CHANNEL) {
+    ch->AddToReceiveQ(referrer, var->GetMutable<framework::ChannelHolder>(),
+                      cond, cb);
+  } else {
+    PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type");
+  }
+}
diff --git a/paddle/fluid/operators/concurrency/channel_util.h b/paddle/fluid/operators/concurrency/channel_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd18ca78c6fdecdc6c72748611ccdd9c2690ef46
--- /dev/null
+++ b/paddle/fluid/operators/concurrency/channel_util.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace operators {
+namespace concurrency {
+
+void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var);
+bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var);
+
+void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
+                       framework::Variable *var,
+                       std::shared_ptr<std::condition_variable_any> cond,
+                       std::function<bool(framework::ChannelAction)> cb);
+void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer,
+                          framework::Variable *var,
+                          std::shared_ptr<std::condition_variable_any> cond,
+                          std::function<bool(framework::ChannelAction)> cb);
+
+}  // namespace concurrency
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc
deleted file mode 100644
index 15dce9e3e28fa0200e332534f42752838da4db92..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cond_op.cc
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-using Scope = framework::Scope;
-using Variable = framework::Variable;
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DDim = framework::DDim;
-
-framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
-  auto& sub_scope = scope.NewScope();
-  sub_scopes->push_back(&sub_scope);
-  return sub_scope;
-}
-
-std::vector<framework::Scope*>& CondOp::GetSubScopes(
-    const framework::Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
-}
-
-LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
-  auto index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  auto& index_tensors =
-      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
-  index_tensors.push_back(LoDTensor());
-  return index_tensors.back();
-}
-
-std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
-    const framework::Scope& scope) const {
-  auto* index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
-}
-
-void CondOp::PrepareDataForSubnet(
-    const framework::Scope& scope,
-    const platform::DeviceContext& dev_ctx) const {
-  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
-
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    // Create two sub scopes for true and false branches
-    //   sub_scopes[0] for the true branch
-    //   sub_scopes[1] for the false branch
-    AddSubScope(scope);
-    // Create two tensors for true and false indices:
-    //   index_tensors[0] for the true branch
-    //   index_tensors[1] for the false branch
-    AddIndexTensor(scope);
-  }
-
-  Variable* cond_var = scope.FindVar(Input("Cond"));
-  PADDLE_ENFORCE_NOT_NULL(cond_var,
-                          "Input(Cond) of CondOp should not be null.");
-  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
-
-  // get the true/false index at runtime according to cond tensor
-  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
-  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
-  std::vector<std::vector<int>> index_vectors;
-  index_vectors.resize(BRANCH_NUM);
-
-  const int* cond_data = cond->data<int>();
-  for (int i = 0; i < cond->dims()[0]; ++i) {
-    if (cond_data[i])
-      index_vectors[TRUE_BRANCH].push_back(i);
-    else
-      index_vectors[FALSE_BRANCH].push_back(i);
-  }
-
-  // put index_vectors[0] and index_vectors[1] into two tensors:
-  // index_tensors[0] and index_tensors[1]
-  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
-    int* index_tensor_data_ptr =
-        index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
-    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
-           dim[0] * sizeof(int));
-  }
-
-  // create input in subscopes according to index_vectors
-  for (auto& input : Inputs("Xs")) {
-    Variable* var_parent = scope.FindVar(input);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    const auto* tensor_parent = &var_parent->Get<LoDTensor>();
-
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = var_child->GetMutable<LoDTensor>();
-
-      // Resize child
-      DDim dim = tensor_parent->dims();
-      dim[0] = index_tensors[i].dims()[0];
-      tensor_child->mutable_data<float>(dim, platform::CPUPlace());
-
-      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
-    }
-  }
-
-  // create output_tensors in subscope for sub_net
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    for (auto& output : (*sub_net_op_[i]).Outputs()) {
-      for (auto& var_name : output.second) {
-        sub_scopes[i]->Var(var_name);
-      }
-    }
-  }
-}
-
-void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
-                                 const platform::DeviceContext& dev_ctx) const {
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  const std::vector<framework::LoDTensor>& index_tensors =
-      GetIndexTensors(scope);
-
-  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
-  PADDLE_ENFORCE(!Outputs("Outs").empty(),
-                 "Outputs(Outs) of CondOp can't be empty.");
-  for (auto& output : Outputs("Outs")) {
-    const LoDTensor* tensor_t_out =
-        &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
-    const LoDTensor* tensor_f_out =
-        &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
-
-    auto* var_out = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
-    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
-                            "True output tensor should not be NULL");
-
-    DDim true_dim = tensor_t_out->dims();
-    DDim false_dim = tensor_f_out->dims();
-    true_dim[0] = 0;
-    false_dim[0] = 0;
-    PADDLE_ENFORCE_EQ(true_dim, false_dim,
-                      "Outputs not of the same shape except the first dim");
-
-    DDim out_dim = tensor_t_out->dims();
-    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
-    tensor_out->Resize(out_dim);
-    tensor_out->mutable_data<float>(platform::CPUPlace());
-  }
-
-  // merge output results:
-  // output_tensor = true_output_tensor + false_output_tensor
-  for (auto& output : Outputs("Outs")) {
-    Variable* var_parent = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
-
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = &var_child->Get<LoDTensor>();
-      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
-                           tensor_parent);
-    }
-  }
-}
-
-void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const {
-  // get device context from pool
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& dev_ctx = *pool.Get(place);
-
-  PrepareDataForSubnet(scope, dev_ctx);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], place);
-  }
-  MergeDataFromSubnet(scope, dev_ctx);
-}
-
-class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Cond", "The condition, which is a bool vector");
-    AddInput("Xs", "Inputs of Subnets").AsDuplicable();
-    AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable();
-
-    AddOutput("SubScopes", "sub scopes for true and false branches");
-    AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
-
-    AddComment(R"DOC(
-Sample Dependent Conditional Operator.
-
-Given Cond[i] as a 1/0 vector to indicate true/false:
-Out[i] = subnet_true[i], if Cond[i] == true
-Out[i] = subnet_false[i], if Cond[i] == false
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp,
-                             paddle::operators::CondOpProtoAndCheckerMaker);
diff --git a/paddle/fluid/operators/cond_op.h b/paddle/fluid/operators/cond_op.h
deleted file mode 100644
index a04fae2182005d4eb08305e943449977bfb637f9..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/cond_op.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/net_op.h"
-
-namespace paddle {
-namespace operators {
-
-/*
- * @brief CondOp is a dynamic if-else Operator
- *
- * It has a input tensor named cond indicating which netop each instance will
- * run.
- *
- * if cond == 1, it will run true_net, which is a NetOp.
- *
- * if cond == 0, it will run false_net, which is another NetOp.
- */
-class CondOp : public framework::OperatorBase {
- public:
-  CondOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {
-    sub_net_op_.resize(BRANCH_NUM);
-  }
-
-  CondOp(const CondOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    // TODO(yuyang18): Implement copy ctor well.
-    PADDLE_THROW("Not implemented");
-  }
-
-  framework::Scope& AddSubScope(const framework::Scope& scope) const;
-  std::vector<framework::Scope*>& GetSubScopes(
-      const framework::Scope& scope) const;
-
-  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
-  std::vector<framework::LoDTensor>& GetIndexTensors(
-      const framework::Scope& scope) const;
-
-  void PrepareDataForSubnet(const framework::Scope& scope,
-                            const platform::DeviceContext& dev_ctx) const;
-  void MergeDataFromSubnet(const framework::Scope& scope,
-                           const platform::DeviceContext& dev_ctx) const;
-
-  /*
-   * Set True Block
-   */
-  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[TRUE_BRANCH] = std::move(net);
-  }
-
-  /*
-   * Set False Block
-   */
-  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[FALSE_BRANCH] = std::move(net);
-  }
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override;
-
- private:
-  const int TRUE_BRANCH = 0;
-  const int FALSE_BRANCH = 1;
-  const int BRANCH_NUM = 2;
-
-  // sub_net_op_[0]: subnet_t
-  // sub_net_op_[1]: subnet_f
-  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc
index 337b34e8f0bf4cb89753235205be9eb058dd01ab..8cc1d94260baccfe28d213b7e021956819e2e79e 100644
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <algorithm>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
 
 namespace paddle {
 namespace operators {
@@ -47,14 +48,25 @@ class ConditionalOp : public framework::OperatorBase {
     if (!(ips.size() == 1UL && ips[0]->IsInitialized())) {
       PADDLE_THROW("should have one initialized input as condition");
     }
-    if (!(ips[0]->type().hash_code() == typeid(bool).hash_code() &&
+    if (!(framework::IsType<bool>(ips[0]->type()) &&  // NOLINT
           ips[0]->numel() == 1)) {
       PADDLE_THROW(
           "condition input's data type should be bool, "
           "numel should be 1, actual numel is %d",
           ips[0]->numel());
     }
-    return ips[0]->data<bool>()[0];
+    bool res = false;
+    if (platform::is_gpu_place(ips[0]->place())) {
+#ifdef PADDLE_WITH_CUDA
+      framework::LoDTensor cpu_tensor;
+      framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
+      platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
+      res = cpu_tensor.data<bool>()[0];
+#endif
+    } else {
+      res = ips[0]->data<bool>()[0];
+    }
+    return res;
   }
 };
 
@@ -97,8 +109,7 @@ class ConditionalBlockOp : public ConditionalOp {
 
 class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The conditional variable of this operator. If X is empty, the "
              "whole sub-block will not be executed.")
@@ -216,7 +227,7 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
     grad_op->SetOutput(framework::GradVarName("Params"),
                        InputGrad("Params", false));
-    grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
+    grad_op->SetBlockAttr("sub_block", this->grad_block_[0]);
     grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition"));
     return std::unique_ptr<framework::OpDesc>(grad_op);
   }
diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index ff0fbf21f86269885df5491afab7443df813f13f..1828be57b5a54005a0066b18ebebdb740726f67a 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -18,6 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/float16.h"
+
+DEFINE_bool(cudnn_deterministic, true,
+            "Whether allow using an autotuning algorithm for convolution "
+            "operator. The autotuning algorithm may be non-deterministic. If "
+            "false, the algorithm is deterministic.");
 
 namespace paddle {
 namespace operators {
@@ -27,6 +33,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
 using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
 using DataLayout = platform::DataLayout;
+template <typename T>
+using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
     static_cast<size_t>(1024) * 1024 * 1024;
@@ -125,15 +133,37 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
         workspace_size_limit, &algo));
+
+#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
+    // Tensor core is supported since the volta GPU and
+    // is only enabled when input and filter data are float16
+    if (dev_ctx.GetComputeCapability() >= 70 &&
+        std::type_index(typeid(T)) ==
+            std::type_index(typeid(platform::float16))) {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
+      // Currently tensor core is only enabled using this algo
+      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+    } else {
+      PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
+          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
+    }
+#endif
+
     // get workspace size able to allocate
     PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
         handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
         cudnn_output_desc, algo, &workspace_size_in_bytes));
+    // It is possible for float16 on Volta GPU to allocate more memory than
+    // the limit because the algo is overrided to use tensor core.
+    PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
+                      "workspace_size to be allocated exceeds the limit");
+
     // Allocate on GPU memory
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv forward ---------------------
-    T alpha = 1.0f, beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     for (int i = 0; i < groups; i++) {
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
@@ -242,17 +272,23 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     auto handle = dev_ctx.cudnn_handle();
     if (input_grad) {
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-              handle, cudnn_filter_desc,
-              // dyDesc: Handle to the previously initialized input differential
-              // tensor descriptor.
-              cudnn_output_grad_desc, cudnn_conv_desc,
-              // dxDesc: Handle to the previously initialized output tensor
-              // descriptor.
-              cudnn_input_desc,
-              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &data_algo));
+      if (FLAGS_cudnn_deterministic) {
+        PADDLE_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+                handle, cudnn_filter_desc,
+                // dyDesc: Handle to the previously initialized input
+                // differential
+                // tensor descriptor.
+                cudnn_output_grad_desc, cudnn_conv_desc,
+                // dxDesc: Handle to the previously initialized output tensor
+                // descriptor.
+                cudnn_input_desc,
+                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &data_algo));
+      } else {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+      }
+
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
               handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -261,12 +297,16 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     }
 
     if (filter_grad) {
-      PADDLE_ENFORCE(
-          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
-              cudnn_filter_desc,
-              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-              workspace_size_limit, &filter_algo));
+      if (FLAGS_cudnn_deterministic) {
+        PADDLE_ENFORCE(
+            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+                handle, cudnn_input_desc, cudnn_output_grad_desc,
+                cudnn_conv_desc, cudnn_filter_desc,
+                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+                workspace_size_limit, &filter_algo));
+      } else {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+      }
 
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
@@ -280,7 +320,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
-    T alpha = 1.0f, beta = 0.0f;
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
@@ -315,16 +355,19 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<double>);
-REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   paddle::operators::CUDNNConvOpKernel<double>,
+                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
+REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
                    paddle::operators::CUDNNConvGradOpKernel<double>);
 
-REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace,
+REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<double>);
-REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   paddle::operators::CUDNNConvOpKernel<double>,
+                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
+REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
                    paddle::operators::CUDNNConvGradOpKernel<double>);
diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc
index 0a8a5d4c71c4510f04eea2f7ef12f836d1fd9c9b..6b06913d1c83f4534238ac3dd22ac4035c0f0fbf 100644
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -18,6 +18,17 @@
 namespace paddle {
 namespace operators {
 
+using conv_bwd_data = mkldnn::convolution_backward_data;
+using conv_bwd_weights = mkldnn::convolution_backward_weights;
+using conv_fwd = mkldnn::convolution_forward;
+using framework::DataLayout;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
+using platform::to_void_cast;
+using platform::GetMKLDNNFormat;
+
 template <typename T>
 class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -25,6 +36,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                    "It must use CPUPlace.");
 
+    // Get unique name for index
+    const std::string key = ctx.op().Output("Output");
+    const std::string key_conv_pd = key + "@conv_pd";
+
     auto& dev_ctx =
         ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
@@ -33,10 +48,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* filter = ctx.Input<Tensor>("Filter");
     auto* output = ctx.Output<Tensor>("Output");
 
-    // Get an unique name from "argument" name of "Output" variable
-    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Output("Output");
-    const std::string key_conv_pd = key + "@conv_pd";
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
 
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
@@ -63,58 +80,86 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // TODO(pzelazko-intel): support more formats
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-    auto weights_md =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
-                                mkldnn::memory::format::oihw);
-    auto dst_md = platform::MKLDNNMemDesc(
-        dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-
-    auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
-    auto weights_memory =
-        mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
-    auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
-
-    std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
-        ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
-                             mkldnn_engine);
-
-    // save conv_pd into global device context to be referred in backward path
-    dev_ctx.SetBlob(key_conv_pd, conv_pd);
+    // create mkldnn memory from input tensors (data/weights)
+    auto user_src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
+        to_void_cast(input_data));
+    auto user_weights_memory =
+        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
+                mkldnn_engine},
+               to_void_cast(filter_data));
+
+    /* create memory descriptor for convolution without specified format
+     * ('any') which lets a primitive (convolution in this case) choose
+     * the memory format preferred for best performance
+     */
+    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                          memory::format::any);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
+                                          memory::format::any);
+
+    // create a conv primitive descriptor and save it for usage in backward
+    std::shared_ptr<conv_fwd::primitive_desc> conv_pd = ConvFwdPrimitiveDesc(
+        src_md, weights_md, dst_md, strides, paddings, mkldnn_engine);
+
+    // create reorder primitive if the input format is not the preferred one
+    auto src_memory = user_src_memory;
+    primitive reorder_src;
+    bool is_src_reordered = false;
+    if (memory::primitive_desc(conv_pd->src_primitive_desc()) !=
+        user_src_memory.get_primitive_desc()) {
+      src_memory = memory(conv_pd->src_primitive_desc());
+      reorder_src = reorder(user_src_memory, src_memory);
+      is_src_reordered = true;
+    }
+    auto weights_memory = user_weights_memory;
+    primitive reorder_weights;
+    bool is_weights_reordered = false;
+    if (memory::primitive_desc(conv_pd->weights_primitive_desc()) !=
+        user_weights_memory.get_primitive_desc()) {
+      weights_memory = memory(conv_pd->weights_primitive_desc());
+      reorder_weights = reorder(user_weights_memory, weights_memory);
+      is_weights_reordered = true;
+    }
+
+    // create memory primitive for conv dst
+    auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data);
 
     // create convolution op primitive
-    auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory,
-                                                 weights_memory, dst_memory);
+    auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory);
 
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{conv_prim};
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+    std::vector<primitive> pipeline;
+    if (is_src_reordered) pipeline.push_back(reorder_src);
+    if (is_weights_reordered) pipeline.push_back(reorder_weights);
+    pipeline.push_back(conv_prim);
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    // Save conv_pd/src_memory/weights_memory for backward pass
+    dev_ctx.SetBlob(key_conv_pd, conv_pd);
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetMKLDNNFormat(dst_memory));
   }
 
  private:
-  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
-  ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src,
-                       const mkldnn::memory::desc& weights,
-                       const mkldnn::memory::desc& dst,
-                       const std::vector<int>& strides,
-                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine) const {
-    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
-    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
-
-    auto conv_desc = mkldnn::convolution_forward::desc(
-        mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
-        dst, stride_dims, padding_dims, padding_dims,
-        mkldnn::padding_kind::zero);
-
-    auto p_conv_pd =
-        new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
-
-    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
-        p_conv_pd);
+  std::unique_ptr<conv_fwd::primitive_desc> ConvFwdPrimitiveDesc(
+      const memory::desc& src, const memory::desc& weights,
+      const memory::desc& dst, const std::vector<int>& strides,
+      const std::vector<int>& paddings, const mkldnn::engine& engine) const {
+    memory::dims stride_dims = {strides[0], strides[1]};
+    memory::dims padding_dims = {paddings[0], paddings[1]};
+
+    auto conv_desc =
+        conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct,
+                       src, weights, dst, stride_dims, padding_dims,
+                       padding_dims, mkldnn::padding_kind::zero);
+
+    auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine);
+
+    return std::unique_ptr<conv_fwd::primitive_desc>(p_conv_pd);
   }
 };
 
@@ -137,6 +182,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
 
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
+    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
+                       filter->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Filter tensor");
+    PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN &&
+                       output->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Output tensor");
+    PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN &&
+                       output_grad->format() != memory::format::format_undef,
+                   "Wrong layout/format set for output_grad tensor");
+
     if (!input_grad && !filter_grad) return;
 
     // Get an unique name from "argument" name of "Output" variable
@@ -165,103 +223,147 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         paddle::framework::vectorize2int(filter->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // TODO(pzelazko-intel): support more formats
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-    auto diff_src_md = platform::MKLDNNMemDesc(
-        src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-    auto weights_md =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
-                                mkldnn::memory::format::oihw);
-    auto diff_weights_md =
-        platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
-                                mkldnn::memory::format::oihw);
-    auto diff_dst_md = platform::MKLDNNMemDesc(
-        dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
-
-    // create memory
-    auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine},
-                                          (void*)output_grad_data);
+    // create mkldnn memory from input tensors (input/weights/output_grad)
+    auto user_src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine},
+        to_void_cast(input_data));
+    auto user_weights_memory =
+        memory({{{weights_tz}, memory::data_type::f32, filter->format()},
+                mkldnn_engine},
+               to_void_cast(filter_data));
+    auto user_diff_dst_memory =
+        memory({{{dst_tz}, memory::data_type::f32, output_grad->format()},
+                mkldnn_engine},
+               to_void_cast(output_grad_data));
+
+    /* create memory descriptor for conv backward without specified format
+     * ('any') which lets a primitive (conv backward in this case) choose
+     * the memory format preferred for best performance
+     */
+    auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                          memory::format::any);
+    auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32,
+                                               memory::format::any);
+    auto weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto diff_weights_md = platform::MKLDNNMemDesc(
+        weights_tz, memory::data_type::f32, memory::format::any);
+    auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32,
+                                               memory::format::any);
+
     // Retrieve conv_pd from device context
-    auto conv_pd =
-        std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
-            dev_ctx.GetBlob(key_conv_pd));
+    auto conv_pd = std::static_pointer_cast<conv_fwd::primitive_desc>(
+        dev_ctx.GetBlob(key_conv_pd));
     PADDLE_ENFORCE(conv_pd != nullptr,
                    "Fail to find conv_pd in device context");
 
     // create backward conv primitive for weights
     if (filter_grad) {
-      // create primitive descriptor
-      mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
-          ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md,
-                                      strides, paddings, *conv_pd,
-                                      mkldnn_engine);
-
-      // create memory
-      auto diff_weights_memory = mkldnn::memory(
-          {diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
-      auto src_memory =
-          mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
+      // create backward convolution primitive descriptor
+      auto conv_bwd_weights_desc = conv_bwd_weights::desc(
+          mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md,
+          strides, paddings, paddings, mkldnn::padding_kind::zero);
+      auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc(
+          conv_bwd_weights_desc, mkldnn_engine, *conv_pd);
+
+      // create reorder primitive if the input format is not the preferred one
+      auto src_memory = user_src_memory;
+      primitive reorder_src;
+      bool is_src_reordered = false;
+      if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) !=
+          user_src_memory.get_primitive_desc()) {
+        src_memory = memory(conv_bwd_weights_pd.src_primitive_desc());
+        reorder_src = reorder(user_src_memory, src_memory);
+        is_src_reordered = true;
+      }
+
+      auto diff_dst_memory_4filter = user_diff_dst_memory;
+      primitive reorder_diff_dst_4filter;
+      bool is_diff_dst_reordered_4filter = false;
+      if (memory::primitive_desc(
+              conv_bwd_weights_pd.diff_dst_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory_4filter =
+            memory(conv_bwd_weights_pd.diff_dst_primitive_desc());
+        reorder_diff_dst_4filter =
+            reorder(user_diff_dst_memory, diff_dst_memory_4filter);
+        is_diff_dst_reordered_4filter = true;
+      }
+
+      // create mkldnn memory for output (i.e. diff weights)
+      auto diff_weights_memory =
+          memory(conv_bwd_weights_pd.diff_weights_primitive_desc(),
+                 reinterpret_cast<void*>(filter_grad_data));
 
       // create backward conv primitive for weights
-      auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
-          conv_bwd_weights_pd, src_memory, diff_dst_memory,
-          diff_weights_memory);
+      auto conv_bwd_weights_prim =
+          conv_bwd_weights(conv_bwd_weights_pd, src_memory,
+                           diff_dst_memory_4filter, diff_weights_memory);
 
       // push primitive and execute it
-      std::vector<mkldnn::primitive> pipeline{conv_bwd_weights_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+      std::vector<primitive> pipeline;
+      if (is_src_reordered) pipeline.push_back(reorder_src);
+      if (is_diff_dst_reordered_4filter)
+        pipeline.push_back(reorder_diff_dst_4filter);
+      pipeline.push_back(conv_bwd_weights_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      filter_grad->set_layout(DataLayout::kMKLDNN);
+      filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory));
     }
 
     if (input_grad) {
-      // create primitive descriptor
-      mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd =
-          ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md,
-                                   strides, paddings, *conv_pd, mkldnn_engine);
-
-      // create memory
-      auto diff_src_memory =
-          mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data);
-      auto weights_memory =
-          mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
+      // create backward convolution primitive descriptor
+      auto conv_bwd_data_desc = conv_bwd_data::desc(
+          mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md,
+          strides, paddings, paddings, mkldnn::padding_kind::zero);
+      auto conv_bwd_data_pd = conv_bwd_data::primitive_desc(
+          conv_bwd_data_desc, mkldnn_engine, *conv_pd);
+
+      // create reorder primitive if the input format is not the preferred one
+      auto weights_memory = user_weights_memory;
+      primitive reorder_weights;
+      bool is_weights_reordered = false;
+      if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) !=
+          user_weights_memory.get_primitive_desc()) {
+        weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc());
+        reorder_weights = reorder(user_weights_memory, weights_memory);
+        is_weights_reordered = true;
+      }
+
+      auto diff_dst_memory_4data = user_diff_dst_memory;
+      primitive reorder_diff_dst_4data;
+      bool is_diff_dst_reordered_4data = false;
+      if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory_4data =
+            memory(conv_bwd_data_pd.diff_dst_primitive_desc());
+        reorder_diff_dst_4data =
+            reorder(user_diff_dst_memory, diff_dst_memory_4data);
+        is_diff_dst_reordered_4data = true;
+      }
+
+      // create mkldnn memory for output (i.e. diff src)
+      auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(),
+                                    reinterpret_cast<void*>(input_grad_data));
 
       // create backward conv primitive for data
-      auto conv_bwd_data_prim = mkldnn::convolution_backward_data(
-          conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory);
+      auto conv_bwd_data_prim =
+          conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory,
+                        diff_src_memory);
 
-      // push primitive to stream and wait until it's executed
-      std::vector<mkldnn::primitive> pipeline{conv_bwd_data_prim};
-      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+      // push primitive and execute it
+      std::vector<primitive> pipeline;
+      if (is_weights_reordered) pipeline.push_back(reorder_weights);
+      if (is_diff_dst_reordered_4data)
+        pipeline.push_back(reorder_diff_dst_4data);
+      pipeline.push_back(conv_bwd_data_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      input_grad->set_layout(DataLayout::kMKLDNN);
+      input_grad->set_format(GetMKLDNNFormat(diff_src_memory));
     }
   }  // Compute()
-
- private:
-  mkldnn::convolution_backward_weights::primitive_desc
-  ConvBwdWeightsPrimitiveDesc(
-      const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights,
-      const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings,
-      const mkldnn::convolution_forward::primitive_desc& conv_pd,
-      const mkldnn::engine& engine) const {
-    auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
-        mkldnn::convolution_direct, src, diff_weights, diff_dst, strides,
-        paddings, paddings, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_weights::primitive_desc(
-        conv_bwd_weights_desc, engine, conv_pd);
-  }
-
-  mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
-      const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights,
-      const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings,
-      const mkldnn::convolution_forward::primitive_desc& conv_pd,
-      const mkldnn::engine& engine) const {
-    auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
-        mkldnn::convolution_direct, diff_src, weights, diff_dst, strides,
-        paddings, paddings, mkldnn::padding_kind::zero);
-    return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc,
-                                                             engine, conv_pd);
-  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 4b02b80d7772fa15d2333692551da5e59d93765f..37153d58439a90190eb2ad82d5dcc145e22dfa48 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/conv_op.h"
+
+#include <string>
+#include <vector>
+
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
@@ -70,29 +74,41 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 
 framework::OpKernelType ConvOp::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library_{framework::LibraryType::kPlain};
+  framework::LibraryType library{framework::LibraryType::kPlain};
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kCUDNN;
+    library = framework::LibraryType::kCUDNN;
   }
 #endif
 #ifdef PADDLE_WITH_MKLDNN
-  if (library_ == framework::LibraryType::kPlain &&
+  if (library == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
-    library_ = framework::LibraryType::kMKLDNN;
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
-      layout_, library_);
+  auto input_data_type =
+      framework::ToDataType(ctx.Input<Tensor>("Input")->type());
+  auto filter_data_type =
+      framework::ToDataType(ctx.Input<Tensor>("Filter")->type());
+  PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
+                    "input and filter data type should be consistent");
+
+  if (input_data_type == framework::proto::VarType::FP16) {
+    PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
+                      "float16 can only be used when CUDNN is used");
+  }
+
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                 library);
 }
 
-Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Conv2DOpMaker::Make() {
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution operator. "
@@ -108,7 +124,8 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
            "input image channels divided by the groups.");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator. "
-            "The format of output tensor is also NCHW.");
+            "The format of output tensor is also NCHW.")
+      .Reuse("Input");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int> default:{1, 1}), the "
                             "strides(h_stride, w_stride) of "
@@ -185,8 +202,7 @@ $$
 )DOC");
 }
 
-Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Conv3DOpMaker::Make() {
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution operator. "
@@ -204,7 +220,8 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
            "input image channels divided by the groups.");
   AddOutput("Output",
             "(Tensor) The output tensor of convolution operator."
-            "The format of output tensor is also NCDHW.");
+            "The format of output tensor is also NCDHW.")
+      .Reuse("Input");
   AddAttr<std::vector<int>>("strides",
                             "(vector<int>, default:{1, 1, 1}), the "
                             "strides(d_stride, h_stride, w_stride) of "
@@ -296,6 +313,10 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext& ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -305,12 +326,10 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
       layout_, library_);
@@ -320,14 +339,17 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
-            ops::ConvOpGrad);
+REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
 
 // depthwise convolution op
-REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
-            depthwise_conv2d_grad, ops::ConvOpGrad);
-REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
-            ops::ConvOpGrad);
+REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
+REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);
 
 // depthwise conv kernel
 // TODO(xingzhaolong): neon kernel for mobile
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index 12b45f1d65019f623268cb9da9004bac5e1f72a3..b3140116dfe6a17a400bb88219ff43b249ecb32a 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -14,11 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/vol2col.h"
 
 namespace paddle {
@@ -41,9 +42,10 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation,
 
   return output_size;
 }
-inline bool IsExpand(std::vector<int64_t>& filter_dim,
-                     std::vector<int>& strides, std::vector<int>& paddings,
-                     std::vector<int>& dilations) {
+inline bool IsExpand(const std::vector<int64_t>& filter_dim,
+                     const std::vector<int>& strides,
+                     const std::vector<int>& paddings,
+                     const std::vector<int>& dilations) {
   bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
   for (size_t j = 0; j < strides.size(); ++j) {
     filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
@@ -58,12 +60,12 @@ inline bool IsExpand(std::vector<int64_t>& filter_dim,
 // operator implementations can reuse the code.
 class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class ConvOp : public framework::OperatorWithKernel {
@@ -159,6 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
     math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     for (int i = 0; i < batch_size; i++) {
       Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
       Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
@@ -184,8 +187,8 @@ class GemmConvKernel : public framework::OpKernel<T> {
         // gemm
         Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
         Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-        math::matmul<DeviceContext, T>(dev_ctx, filter_slice, false, col_matrix,
-                                       false, T(1.0), &out_slice, T(0.0));
+        blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice,
+                    T(0.0));
       }
     }
   }
@@ -272,6 +275,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
 
     math::SetConstant<DeviceContext, T> set_zero;
     auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
 
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
@@ -301,9 +305,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
             col_matrix.ShareDataWith(in_grad_slice);
             col_matrix.Resize(col_matrix_shape);
           }
-          math::matmul<DeviceContext, T>(dev_ctx, filter_slice, true,
-                                         out_grad_slice, false, T(1.0),
-                                         &col_matrix, T(0.0));
+          blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0),
+                      &col_matrix, T(0.0));
 
           if (is_expand && data_dim == 2U) {
             col2im(dev_ctx, col, dilations, strides,
@@ -350,9 +353,8 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
           // gemm
           Tensor filter_grad_slice =
               filter_grad_.Slice(g * out_step, (g + 1) * out_step);
-          math::matmul<DeviceContext, T>(dev_ctx, out_grad_slice, false,
-                                         col_matrix, true, T(1.0),
-                                         &filter_grad_slice, T(1.0));
+          blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0),
+                      &filter_grad_slice, T(1.0));
         }
       }
     }
diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc
index a1a0b00208fe77ad462062b5d0cb0c5f3065f584..f2549e814d6f3b5674fe2eec1139f1c3dc6fa0b4 100644
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
@@ -75,8 +75,7 @@ class ConvShiftGradOp : public framework::OperatorWithKernel {
 
 class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
              "where B is the batch size and M is the data dimension.");
@@ -193,8 +192,9 @@ class ConvShiftGradKernel<platform::CPUPlace, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
-            conv_shift_grad, ops::ConvShiftGradOp);
+REGISTER_OPERATOR(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp);
 REGISTER_OP_CPU_KERNEL(conv_shift,
                        ops::ConvShiftKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu
index 344bbade7055aa8e0aede61dd31dab246bddd169..314d33310588ed960eecaf1a0319ebf56d925c55 100644
--- a/paddle/fluid/operators/conv_shift_op.cu
+++ b/paddle/fluid/operators/conv_shift_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/conv_shift_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
index 901682edbb01c563be6ea407228336b14f942778..038ea8999072f562104c5386ed18b6b275816345 100644
--- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc
@@ -44,6 +44,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
 
     const T* input_data = input->data<T>();
@@ -64,13 +65,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
 
     // (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output->dims()));
+        layout, framework::vectorize2int(output->dims()), groups);
     // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
 
@@ -104,11 +105,17 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
 
     // ------------------- cudnn conv transpose forward ---------------------
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_offset = output->numel() / output->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
-    PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
-        handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
-        input_data, cudnn_conv_desc, algo, cudnn_workspace,
-        workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+    for (int g = 0; g < groups; g++) {
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
+          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
+          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
+          cudnn_output_desc, output_data + output_offset * g));
+    }
 
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
@@ -134,6 +141,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     // cudnn v5 does not support dilations
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
     int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
 
     // ------------------- cudnn descriptors ---------------------
@@ -145,13 +153,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
 
     // Input: (N, M, H, W) or (N, M, D, H, W)
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
-        layout, framework::vectorize2int(input->dims()));
+        layout, framework::vectorize2int(input->dims()), groups);
     // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
     cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
-        layout, framework::vectorize2int(output_grad->dims()));
+        layout, framework::vectorize2int(output_grad->dims()), groups);
     // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w)
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
-        layout, framework::vectorize2int(filter->dims()));
+        layout, framework::vectorize2int(filter->dims()), groups);
 
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
@@ -205,15 +213,22 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv backward data ---------------------
     // FIXME(typhoonzero): template type T may not be the same as cudnn call.
+    int input_offset = input->numel() / input->dims()[0] / groups;
+    int output_grad_offset =
+        output_grad->numel() / output_grad->dims()[0] / groups;
+    int filter_offset = filter->numel() / groups;
     T alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
-          handle, &alpha, cudnn_output_desc, output_grad_data,
-          cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
-          cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
-          input_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
+            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + input_offset * g));
+      }
     }
 
     // ------------------- cudnn conv backward filter ---------------------
@@ -221,11 +236,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
       T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset filter_grad.
       // Gradient with respect to the filter
-      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
-          handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
-          input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
-          workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
+      for (int g = 0; g < groups; g++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_output_desc,
+            output_grad_data + output_grad_offset * g, cudnn_input_desc,
+            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
+            filter_grad_data + filter_offset * g));
+      }
     }
+
     // Release the cudnn workspace
     paddle::memory::Free(gpu, cudnn_workspace);
   }
diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc
index b2a3cfc89f18eff24c941c664b1184b4485ab895..eeb98ee44f206dbfbe1f61689aa9843122ae3f92 100644
--- a/paddle/fluid/operators/conv_transpose_op.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/conv_transpose_op.h"
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -30,6 +32,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
   std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
   std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
   std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
+  int groups = ctx->Attrs().Get<int>("groups");
 
   PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
                  "ConvTransposeOp intput should be 4-D or 5-D tensor.");
@@ -46,10 +49,10 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                     "ConvTransposeOp paddings dimension and dilations "
                     "dimension should be the same.");
   PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
-                    "In ConvTransposeOp, The input channel should be the same "
-                    "as the number of filters.");
+                    "In ConvTransposeOp, The number of input channels should "
+                    "be equal to the number of filter's channels.");
 
-  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1] * groups});
   for (size_t i = 0; i < strides.size(); ++i) {
     auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
     output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
@@ -82,9 +85,7 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
       layout_, library_);
 }
 
-Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
-                                               OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Conv2DTransposeOpMaker::Make() {
   AddInput(
       "Input",
       "(Tensor) The input tensor of convolution transpose operator. "
@@ -102,7 +103,10 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
   AddOutput("Output",
             "(Tensor) The output tensor of convolution transpose operator. "
             "The format of output tensor is also NCHW.");
-
+  AddAttr<int>("groups",
+               "(int default:1), the groups number of the convolution "
+               "transpose operator. ")
+      .SetDefault(1);
   AddAttr<std::vector<int>>("dilations",
                             "(vector<int> default:{1, 1}), the "
                             "dilations(h_dilation, w_dilation) of convolution "
@@ -152,7 +156,7 @@ Parameters(strides, paddings) are two elements. These two elements represent hei
 and width, respectively.
 The input(X) size and output(Out) size may be different.
 
-Example:
+For an example:
   Input:
        Input shape: $(N, C_{in}, H_{in}, W_{in})$
        Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
@@ -166,9 +170,7 @@ Example:
 )DOC");
 }
 
-Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
-                                               OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Conv3DTransposeOpMaker::Make() {
   AddInput("Input",
            "(Tensor) The input tensor of convolution transpose operator."
            "The format of input tensor is NCDHW. Where N is batch size, C is "
@@ -206,6 +208,10 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
                             "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
                             "h_pad, w_pad) of convolution transpose operator.")
       .SetDefault({0, 0, 0});
+  AddAttr<int>("groups",
+               "(int default:1), the groups number of the convolution3d "
+               "transpose operator. ")
+      .SetDefault(1);
   AddAttr<bool>(
       "use_cudnn",
       "(bool, default false) Only used in cudnn kernel, need install cudnn")
@@ -296,8 +302,11 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
-            conv2d_transpose_grad, ops::ConvTransposeOpGrad);
+// conv2d_transpose
+REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp,
+                  ops::Conv2DTransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv2d_transpose,
@@ -309,8 +318,11 @@ REGISTER_OP_CPU_KERNEL(
     ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
                                      double>);
 
-REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
-            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
+// conv3d_transpose
+REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp,
+                  ops::Conv3DTransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     conv3d_transpose,
@@ -321,3 +333,19 @@ REGISTER_OP_CPU_KERNEL(
     ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
                                      double>);
+
+// depthwise conv2d_transpose
+REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp,
+                  ops::Conv2DTransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    depthwise_conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
diff --git a/paddle/fluid/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc
index 640fa7d14a079debeceb54d8775c4ede7da1b536..a6d5665df83ae5c89d42840e91a6abd853fedd12 100644
--- a/paddle/fluid/operators/conv_transpose_op.cu.cc
+++ b/paddle/fluid/operators/conv_transpose_op.cu.cc
@@ -15,25 +15,28 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_transpose_op.h"
 
 namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
 
-REGISTER_OP_CUDA_KERNEL(
-    conv2d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    conv2d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
-                                     float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
-                                     double>);
-
-REGISTER_OP_CUDA_KERNEL(
-    conv3d_transpose,
-    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
-REGISTER_OP_CUDA_KERNEL(
-    conv3d_transpose_grad,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
-                                     float>,
-    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
-                                     double>);
+// conv2d
+REGISTER_OP_CUDA_KERNEL(conv2d_transpose,
+                        ops::GemmConvTransposeKernel<CUDA, float>,
+                        ops::GemmConvTransposeKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad,
+                        ops::GemmConvTransposeGradKernel<CUDA, float>,
+                        ops::GemmConvTransposeGradKernel<CUDA, double>);
+
+// conv3d
+REGISTER_OP_CUDA_KERNEL(conv3d_transpose,
+                        ops::GemmConvTransposeKernel<CUDA, float>,
+                        ops::GemmConvTransposeKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad,
+                        ops::GemmConvTransposeGradKernel<CUDA, float>,
+                        ops::GemmConvTransposeGradKernel<CUDA, double>);
+
+// depthwise conv2d
+REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose,
+                        ops::DepthwiseConvTransposeKernel<CUDA, float>,
+                        ops::DepthwiseConvTransposeKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad,
+                        ops::DepthwiseConvTransposeGradKernel<CUDA, float>,
+                        ops::DepthwiseConvTransposeGradKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h
index d4e4b641ece9ed120904ded6f8baed65a2666213..0d9c6a62fec1ea24bee5c24b4a7b792781f14d9e 100644
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
@@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/vol2col.h"
 
 namespace paddle {
@@ -30,12 +31,12 @@ using DDim = framework::DDim;
 // operator implementations can reuse the code.
 class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class ConvTransposeOp : public framework::OperatorWithKernel {
@@ -70,7 +71,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
-    // groups will alway be disabled in conv2dtranspose.
+    int groups = context.Attr<int>("groups");
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
@@ -81,10 +82,10 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
 
     // use col_shape in the im2col and col2im (or vol2col and col2vol)
     // calculation
-    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    // col_shape_vec: {c/g, k_h, k_w, h, w} or {c/g, k_d, k_h, k_w, d, h, w}
     size_t data_dim = filter_shape_vec.size() - 2;
     std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
-    col_shape_vec[0] = output->dims()[1];
+    col_shape_vec[0] = output->dims()[1] / groups;
     for (size_t j = 0; j < data_dim; ++j) {
       col_shape_vec[j + 1] = filter_shape_vec[j + 2];
       col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
@@ -92,7 +93,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     DDim col_shape(framework::make_ddim(col_shape_vec));
 
     // use col_matrix_shape in the gemm calculation
-    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    // size: (c/g * k_h * k_w, h * w) or (c/g * k_d * k_h * k_w, d * h * w)
     DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
 
     Tensor col;
@@ -111,15 +112,18 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     // input matrix size: (m, h * w) or (m, d * h * w)
     DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
 
-    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
     DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
     filter.Resize(filter_matrix_shape);
 
     output->mutable_data<T>(context.GetPlace());
     math::SetConstant<DeviceContext, T> set_zero;
     auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     set_zero(dev_ctx, output, static_cast<T>(0));
 
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
     math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
     math::Col2VolFunctor<DeviceContext, T> col2vol;
 
@@ -132,23 +136,29 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
       // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
       Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
 
-      // col_matrix = filter * input_batch
-      // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
-      math::matmul<DeviceContext, T>(dev_ctx, filter, true, input_batch, false,
-                                     static_cast<T>(1.0), &col_matrix,
-                                     static_cast<T>(0.0));
-
-      if (data_dim == 2U) {
-        // col2im: col_matrix -> dy
-        // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
-        col2im(dev_ctx, col, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
-               &output_batch);
-      } else if (data_dim == 3U) {
-        // col2vol: col_matrix -> dy
-        // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
-        col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch);
+      for (int g = 0; g < groups; g++) {
+        Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step);
+        Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
+        Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step);
+
+        // col_matrix = filter_slice * input_slice
+        // of shape (c/g * k_h * k_w, h * w)
+        // or (c/g * k_d * k_h * k_w, d * h * w)
+        blas.MatMul(filter_slice, true, in_slice, false, static_cast<T>(1.0),
+                    &col_matrix, static_cast<T>(0.0));
+
+        if (data_dim == 2U) {
+          // col2im: col_matrix -> dy
+          // from (c/g * k_h * k_w, h * w) to (c/g, o_h, o_w)
+          col2im(dev_ctx, col, dilations, strides,
+                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                  paddings[1]},
+                 &out_slice);
+        } else if (data_dim == 3U) {
+          // col2vol: col_matrix -> dy
+          // from (c/g * k_d * k_h * k_w, d * h * w) to (c/g, o_d, o_h, o_w)
+          col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice);
+        }
       }
     }
   }
@@ -174,6 +184,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    int groups = context.Attr<int>("groups");
 
     const int batch_size = static_cast<int>(input->dims()[0]);
 
@@ -205,14 +216,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     // input matrix size: (m, h * w) or (m, d * h * w)
     DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
 
-    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
-    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0] / groups};
     filter.Resize(filter_matrix_shape);
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int col_step = static_cast<int>(col_matrix_shape[0]) / groups;
 
     // convolution transpose grad on input:
     // im2col + gemm (similar to conv-forward)
     // input need to compute gradient
     auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     if (input_grad || filter_grad) {
       Tensor col;
       col.mutable_data<T>(col_shape, context.GetPlace());
@@ -232,7 +246,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
       if (input_grad) {
         input_grad->mutable_data<T>(context.GetPlace());
       }
-      if (filter_grad) {  // filter size (m, c, k_h, k_w)
+      if (filter_grad) {  // filter size (m, c/g, k_h, k_w)
         filter_grad->mutable_data<T>(context.GetPlace());
         set_zero(dev_ctx, filter_grad, static_cast<T>(0));
         filter_grad_ = *filter_grad;
@@ -267,9 +281,17 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
           // or
           // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m,
           // d, h, w)
-          math::matmul<DeviceContext, T>(
-              dev_ctx, filter, false, col_matrix, false, static_cast<T>(1.0),
-              &input_grad_batch, static_cast<T>(0.0));
+          for (int g = 0; g < groups; g++) {
+            Tensor input_grad_slice =
+                input_grad_batch.Slice(g * in_step, (g + 1) * in_step);
+            Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step);
+            Tensor col_matrix_slice =
+                col_matrix.Slice(g * col_step, (g + 1) * col_step);
+
+            blas.MatMul(filter_slice, false, col_matrix_slice, false,
+                        static_cast<T>(1.0), &input_grad_slice,
+                        static_cast<T>(0.0));
+          }
         }
         if (filter_grad) {
           // input batch
@@ -279,13 +301,90 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
           // or
           // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
           // k_h * k_w)
-          math::matmul<DeviceContext, T>(dev_ctx, in_batch, false, col_matrix,
-                                         true, static_cast<T>(1.0),
-                                         &filter_grad_, static_cast<T>(1.0));
+          for (int g = 0; g < groups; g++) {
+            Tensor in_batch_slice =
+                in_batch.Slice(g * in_step, (g + 1) * in_step);
+            Tensor filter_grad_slice =
+                filter_grad_.Slice(g * in_step, (g + 1) * in_step);
+            Tensor col_matrix_slice =
+                col_matrix.Slice(g * col_step, (g + 1) * col_step);
+            blas.MatMul(in_batch_slice, false, col_matrix_slice, true,
+                        static_cast<T>(1.0), &filter_grad_slice,
+                        static_cast<T>(1.0));
+          }
         }
       }
     }
   }
 };
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvTransposeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+
+    int groups = context.Attr<int>("groups");
+    PADDLE_ENFORCE_EQ(groups, filter.dims()[0]);
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    for (auto v : dilations) {
+      PADDLE_ENFORCE_EQ(v, 1);
+    }
+
+    output->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> set_zero;
+    set_zero(dev_ctx, output, static_cast<T>(0));
+
+    math::DepthwiseConvInputGradFunctor<DeviceContext, T>
+        depthwiseConvInputGrad;
+    depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings,
+                           output);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvTransposeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+    Tensor filter = *context.Input<Tensor>("Filter");
+
+    if (!input_grad && !filter_grad) return;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    if (input_grad) {
+      math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
+      depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings,
+                    input_grad);
+    }
+
+    if (filter_grad) {
+      math::SetConstant<DeviceContext, T> set_zero;
+      filter_grad->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+
+      math::DepthwiseConvFilterGradFunctor<DeviceContext, T>
+          depthwiseConvFilterGrad;
+      depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings,
+                              filter_grad);
+    }
+  }
+};
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
index 4c8af408f62453eaf22cc23d19844e8ca7625bfa..8f3644039f9950a8a70e2fd66c20837a5f52bd7f 100644
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -62,8 +62,7 @@ class CosSimOp : public framework::OperatorWithKernel {
 
 class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The 1st input of cos_sim op.");
     AddInput("Y", "The 2nd input of cos_sim op.");
     AddOutput("Out", "The output of cos_sim op.");
@@ -77,9 +76,9 @@ class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsIntermediate();
 
     AddComment(R"DOC(
-Cosine Similarity Operator.
+**Cosine Similarity Operator**
 
-$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$
+$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$
 
 The input X and Y must have the same shape, except that the 1st dimension
 of input Y could be just 1 (different from input X), which will be
@@ -153,8 +152,9 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad,
-            ops::CosSimOpGrad);
+REGISTER_OPERATOR(cos_sim, ops::CosSimOp, ops::CosSimOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad);
 REGISTER_OP_CPU_KERNEL(
     cos_sim, ops::CosSimKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index a83013c428a77a0ead545d87852e1017bc927edf..c27befe1143baa68add4b56f3572eab75272c3a5 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -18,8 +18,7 @@ namespace paddle {
 namespace operators {
 class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Emission",
              "(LoDTensor, default: LoDTensor<float>). A LoDTensor with shape "
              "[N x D] where N is the size of the mini-batch and D is the total "
@@ -54,21 +53,18 @@ sequence of observed tags.
 The output of this operator changes according to whether Input(Label) is given:
 
 1. Input(Label) is given:
-
-This happens in training. This operator is used to co-work with the chunk_eval
-operator.
-
-When Input(Label) is given, the crf_decoding operator returns a row vector
-with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
-prediction, or 1 indicating a tag is correctly predicted. Such an output is the
-input to chunk_eval operator.
+   This happens in training. This operator is used to co-work with the chunk_eval
+   operator.
+   When Input(Label) is given, the crf_decoding operator returns a row vector
+   with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
+   prediction, or 1 indicating a tag is correctly predicted. Such an output is the
+   input to chunk_eval operator.
 
 2. Input(Label) is not given:
-
-This is the standard decoding process.
+   This is the standard decoding process.
 
 The crf_decoding operator returns a row vector with shape [N x 1] whose values
-range from 0 to maximum tag number - 1. Each element indicates an index of a
+range from 0 to maximum tag number - 1, Each element indicates an index of a
 predicted tag.
 )DOC");
   }
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
index 2b2a733fb9f162755e5c548fec617937d86689dd..3f5fab3b382bea97f43e4bc1b2cd436c956ba264 100644
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc
index fd7ea70c64fafd0a7ea55ec1e3a29eb66d84a2c6..5b5a220cf90e7813f914ae35733e7a4103391b2d 100644
--- a/paddle/fluid/operators/crop_op.cc
+++ b/paddle/fluid/operators/crop_op.cc
@@ -48,12 +48,18 @@ class CropOp : public framework::OperatorWithKernel {
       ctx->SetOutputDim("Out", y_dim);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
 };
 
 class CropOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CropOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input of pad op. "
              "The input should be a k-D tensor(k > 0 and k < 7).");
@@ -61,13 +67,19 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input used as reference for cropping, "
              "which is of the same dimensions as X.")
         .AsDispensable();
+    AddInput("Offsets",
+             "The input used to describe offsets in runtime, which is a "
+             "1-D vector whose size equals to the rank of input 'X'. The "
+             "elements data type must be int.")
+        .AsDispensable();
     AddOutput("Out",
               "The output of crop op, "
               "which is of the same dimensions as X.");
     AddAttr<std::vector<int>>("offsets",
                               "A list<int> describing offsets to be cropped. "
                               "The size of offsets list should be the same as "
-                              "the dimension size of input X.");
+                              "the dimension size of input X.")
+        .SetDefault(std::vector<int>());
     AddAttr<std::vector<int>>("shape",
                               "A list<int> describing the shape of output. "
                               "The size of shape list should be the same as "
@@ -78,6 +90,17 @@ Crop Operator.
 
 Crop input into output, as specified by offsets and shape.
 
+There are two ways to set the offsets:
+1. In runtime: Using the input 'Offsets', which is a Vairbale and can be 
+               output of other operators. This way is suitable for 
+               dynamic offsets.
+2. In network configuration: Using the attribute 'offsets', which will be 
+                             set in Python configure script. This way is 
+                             suitable for fixed offsets.
+You CANNOT use these two ways at the same time. An exception will be raised 
+if input 'Offset' is configured and meanwhile the attribute 'offsets' is 
+not empty.
+
 There are two ways to set shape:
 1. reference input: crop input X into the same shape as reference input.
                     The dimension of reference input should
@@ -147,13 +170,24 @@ class CropOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))
+                ->type()),
+        ctx.device_context());
+  }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad);
+REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(crop_grad, ops::CropOpGrad);
 REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
 REGISTER_OP_CPU_KERNEL(
     crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h
index c5ac6849789587f2f41588f79bd538f7b79a7478..772e80bbea4f2db654cefd0dcb404bc33803bd7a 100644
--- a/paddle/fluid/operators/crop_op.h
+++ b/paddle/fluid/operators/crop_op.h
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
@@ -26,6 +27,37 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using framework::Tensor;
 
+static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
+  std::vector<int> res;
+  int rank = ctx.Input<Tensor>("X")->dims().size();
+  if (ctx.HasInput("Offsets")) {
+    PADDLE_ENFORCE(ctx.Attr<std::vector<int>>("offsets").empty(),
+                   "Input 'Offsets' and attribute 'offsets' should not be used "
+                   "at the same time.");
+    const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
+    PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
+    PADDLE_ENFORCE_EQ(
+        rank, offsets_tensor->dims()[0],
+        "Offsets size should be equal to dimension size of input tensor.");
+    const int* offsets_data;
+    framework::Tensor cpu_tmp_tensor;
+    if (platform::is_cpu_place(offsets_tensor->place())) {
+      offsets_data = offsets_tensor->data<int>();
+    } else {
+      framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
+                                &cpu_tmp_tensor);
+      offsets_data = cpu_tmp_tensor.data<int>();
+    }
+    res = std::vector<int>(offsets_data, offsets_data + rank);
+  } else {
+    res = ctx.Attr<std::vector<int>>("offsets");
+    PADDLE_ENFORCE_EQ(
+        rank, static_cast<int>(res.size()),
+        "Offsets size should be equal to dimension size of input tensor.");
+  }
+  return res;
+}
+
 template <typename T>
 class CropKernel : public framework::OpKernel<T> {
  public:
@@ -36,10 +68,7 @@ class CropKernel : public framework::OpKernel<T> {
     T* out_data = out->mutable_data<T>(context.GetPlace());
     auto x_stride = framework::stride(x->dims());
     auto out_stride = framework::stride(out->dims());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
-    PADDLE_ENFORCE_EQ(
-        x->dims().size(), static_cast<int64_t>(offsets.size()),
-        "Offsets size should be equal to dimension size of input tensor.");
+    auto offsets = GetOffsets(context);
     int64_t offset = 0;
     for (size_t i = 0; i < offsets.size(); ++i) {
       offset += (x_stride[i] * offsets[i]);
@@ -55,7 +84,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
   if (d_x != nullptr) {
     auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
     d_x->mutable_data<T>(context.GetPlace());
-    auto offsets = context.Attr<std::vector<int>>("offsets");
+    auto offsets = GetOffsets(context);
     Eigen::array<std::pair<int, int>, D> paddings;
     for (size_t i = 0; i < D; ++i) {
       paddings[i].first = offsets[i];
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index 55810371c8d354483138b0673721a1ea39fa6f35..d5e095f9cad95b74b8ff79e4a60ccbdf11512a5a 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -111,8 +111,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
 
 class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
              " where N is the batch size and D is the number of classes. "
@@ -125,7 +124,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
              "Tensor<float/double> with shape [N x D].");
     AddOutput("Y",
               "(Tensor, default Tensor<float>), a 2-D tensor with shape "
-              "[N x 1]. The cross entropy loss.");
+              "[N x 1]. The cross entropy loss.")
+        .Reuse("X");
     AddAttr<bool>("soft_label",
                   "(bool, default false), a flag indicating whether to "
                   "interpretate the given labels as soft labels.")
@@ -164,10 +164,13 @@ or not. But the output only shares the LoD information with input X.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
-            cross_entropy_grad, ops::CrossEntropyGradientOp);
-REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>,
-                       ops::CrossEntropyOpKernel<double>);
+using CPUCtx = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
+REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
+                       ops::CrossEntropyOpKernel<CPUCtx, double>);
 REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
-                       ops::CrossEntropyGradientOpKernel<float>,
-                       ops::CrossEntropyGradientOpKernel<double>);
+                       ops::CrossEntropyGradientOpKernel<CPUCtx, float>,
+                       ops::CrossEntropyGradientOpKernel<CPUCtx, double>);
diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu
index 6449149d4b55962e84baafffc0c2c03f8108866f..30dbd5bd3d39dd2992c3dd91364003bb7715a2eb 100644
--- a/paddle/fluid/operators/cross_entropy_op.cu
+++ b/paddle/fluid/operators/cross_entropy_op.cu
@@ -14,98 +14,11 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/cross_entropy_op.h"
 
-namespace paddle {
-namespace operators {
-
-namespace {
-
-template <typename T>
-__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
-                                           const int64_t* label, const int N,
-                                           const int D) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    int idx = i * D + label[i];
-    dX[idx] = -dY[i] / X[idx];
-  }
-}
-
-template <typename T>
-__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
-                                               const T* label, const int N,
-                                               const int D) {
-  int ids = blockIdx.x * blockDim.x + threadIdx.x;
-  if (ids < N * D) {
-    int row_ids = ids / D;
-    dX[ids] = -label[ids] * dY[row_ids] / X[ids];
-  }
-}
-}  // namespace
-
-template <typename T>
-class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* label = ctx.Input<Tensor>("Label");
-    Tensor* y = ctx.Output<Tensor>("Y");
-    y->mutable_data<T>(ctx.GetPlace());
-
-    math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-        ctx.template device_context<platform::CUDADeviceContext>(), y, x, label,
-        ctx.Attr<bool>("soft_label"));
-  }
-};
-
-template <typename T>
-class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
-
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* label = ctx.Input<Tensor>("Label");
-    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    dx->mutable_data<T>(ctx.GetPlace());
-
-    const T* dy_data =
-        ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
-    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    const T* x_data = x->data<T>();
-
-    int64_t batch_size = x->dims()[0];
-    int64_t class_num = x->dims()[1];
-
-    int block = 512;
-    int grid = (batch_size * class_num + block - 1) / block;
-
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    auto stream = dev_ctx.stream();
-
-    if (ctx.Attr<bool>("soft_label")) {
-      auto* label_data = label->data<T>();
-      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
-          dx_data, dy_data, x_data, label_data, batch_size, class_num);
-    } else {
-      math::SetConstant<platform::CUDADeviceContext, T> functor;
-      functor(dev_ctx, dx, 0);
-      auto* label_data = label->data<int64_t>();
-      grid = (batch_size + block - 1) / block;
-      CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
-          dx_data, dy_data, x_data, label_data, batch_size, class_num);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>,
-                        ops::CrossEntropyOpCUDAKernel<double>);
+using CUDACtx = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(cross_entropy,
+                        ops::CrossEntropyOpKernel<CUDACtx, float>,
+                        ops::CrossEntropyOpKernel<CUDACtx, double>);
 REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
-                        ops::CrossEntropyGradientOpCUDAKernel<float>,
-                        ops::CrossEntropyGradientOpCUDAKernel<double>);
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
+                        ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index ec315695a68befc2e3de798fdb3fa146a903aaff..19a2aec92b267ece94685ce34604b7d1cfa5d209 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -17,69 +17,106 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename T>
+template <typename DeviceContext, typename T>
 class CrossEntropyOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "This kernel only runs on CPU.");
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* labels = ctx.Input<Tensor>("Label");
-    Tensor* y = ctx.Output<Tensor>("Y");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* labels = ctx.Input<Tensor>("Label");
+    auto* y = ctx.Output<Tensor>("Y");
     y->mutable_data<T>(ctx.GetPlace());
 
-    math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
-        ctx.template device_context<platform::CPUDeviceContext>(), y, x, labels,
+    math::CrossEntropyFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), y, x, labels,
         ctx.Attr<bool>("soft_label"));
   }
 };
 
 template <typename T>
+class XeSoftlabelGradFunctor {
+ public:
+  XeSoftlabelGradFunctor(T* dx,
+                         const T* dy,     // NOLINT
+                         const T* x,      // NOLINT
+                         const T* label,  // NOLINT
+                         size_t num_classes)
+      : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
+
+  HOSTDEVICE void operator()(size_t i) {
+    auto row_ids = i / num_classes_;
+    dx_[i] = -label_[i] * dy_[row_ids] / x_[i];
+  }
+
+ private:
+  T* dx_;
+  const T* dy_;
+  const T* x_;
+  const T* label_;
+  size_t num_classes_;
+};
+
+template <typename T>
+class XeGradFunctor {
+ public:
+  XeGradFunctor(T* dx,
+                const T* dy,           // NOLINT
+                const T* x,            // NOLINT
+                const int64_t* label,  // NOLINT
+                size_t num_classes)
+      : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {}
+
+  HOSTDEVICE void operator()(size_t sample_id) {
+    auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id];
+    for (size_t x_offset = sample_id * num_classes_;
+         x_offset < (sample_id + 1) * num_classes_; ++x_offset) {
+      dx_[x_offset] = x_offset != x_is_true_offset
+                          ? static_cast<T>(0)
+                          : -dy_[sample_id] / x_[x_offset];
+    }
+  }
+
+ private:
+  T* dx_;
+  const T* dy_;
+  const T* x_;
+  const int64_t* label_;
+  size_t num_classes_;
+};
+
+template <typename DeviceContext, typename T>
 class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "This kernel only runs on CPU.");
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
-    const Tensor* label = ctx.Input<Tensor>("Label");
-    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
 
     int64_t class_num = x->dims()[1];
     if (ctx.Attr<bool>("soft_label")) {
-      auto x_mat = EigenMatrix<T>::From(*x);
-      auto dy_mat = EigenMatrix<T>::From(*dy);
-      auto lbl_mat = EigenMatrix<T>::From(*label);
-      auto dx_mat = EigenMatrix<T>::From(*dx);
-
-      dx_mat.device(*ctx.template device_context<platform::CPUDeviceContext>()
-                         .eigen_device()) =
-          -(lbl_mat *
-            dy_mat.broadcast(Eigen::DSizes<int64_t, 2>(1, class_num)) / x_mat);
+      XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
+                                        label->data<T>(),
+                                        static_cast<size_t>(class_num));
+      platform::ForRange<DeviceContext> for_range(
+          ctx.template device_context<DeviceContext>(),
+          static_cast<size_t>(dx->numel()));
+      for_range(functor);
     } else {
-      int64_t batch_size = x->dims()[0];
-      const T* dy_data = dy->data<T>();
-      const T* x_data = x->data<T>();
-      const int64_t* label_data = label->data<int64_t>();
-
-      math::SetConstant<platform::CPUDeviceContext, T> functor;
-      functor(ctx.template device_context<platform::CPUDeviceContext>(), dx, 0);
-
-      for (int64_t i = 0; i < batch_size; ++i) {
-        PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
-        int64_t index = i * class_num + label_data[i];
-        dx_data[index] = -dy_data[i] / x_data[index];
-      }
+      XeGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
+                               label->data<int64_t>(),
+                               static_cast<size_t>(class_num));
+      platform::ForRange<DeviceContext> for_range(
+          ctx.template device_context<DeviceContext>(),
+          static_cast<size_t>(dy->numel()));
+      for_range(functor);
     }
   }
 };
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
index 19e7649660edd0bc90bc6a9537b1cdbb2e7e8ebc..d2b440d9d2e50340af7a7bb4e76e55beea1bcb46 100644
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
@@ -44,8 +44,7 @@ class CTCAlignOp : public framework::OperatorWithKernel {
 
 class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CTCAlignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(LodTensor, default: LoDTensor<int>), Its shape is "
              "[Lp, 1], where Lp is the sum of all input sequences' length.");
diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
index 54e0b1d9ad83c5f01f3f0dfbc2a95c642c0aaadc..bbad74e96d9c6c1be24639b63e472f18a599cfab 100644
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <stdio.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+#include <vector>
 #include "paddle/fluid/operators/ctc_align_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
index 70698d99589ae9e2e18ec8b1c1bb3bc8c7476131..9c5c6f5aa03632fe3079074d4b164f871fad634d 100644
--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string.h>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc
index 0da6f188523a78693929307a08601e04002bc8ec..5302b822d6b9f232e9ccd0d03cc549d7d5044ebf 100644
--- a/paddle/fluid/operators/cumsum_op.cc
+++ b/paddle/fluid/operators/cumsum_op.cc
@@ -29,21 +29,20 @@ class CumOp : public framework::OperatorWithKernel {
 
 class CumsumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  CumsumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input of Cumsum operator");
-    AddOutput("Out", "Output of Cumsum operator");
+  void Make() override {
+    AddInput("X", "Input of cumsum operator");
+    AddOutput("Out", "Output of cumsum operator");
     AddAttr<int>("axis",
-                 "(int, default -1). The dimenstion to accumulate along. "
-                 "-1 means the last dimenstion")
+                 "The dimenstion to accumulate along. -1 means the last "
+                 "dimenstion [default -1].")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
     AddAttr<bool>("exclusive",
-                  "bool, default false). Whether to perform exclusive cumsum")
+                  "Whether to perform exclusive cumsum. [default false].")
         .SetDefault(false);
     AddAttr<bool>("reverse",
-                  "bool, default false). If true, the cumsum is performed in "
-                  "the reversed direction")
+                  "If true, the cumsum is performed in the reversed direction. "
+                  "[default false].")
         .SetDefault(false);
     AddComment(R"DOC(
 The cumulative sum of the elements along a given axis.
@@ -79,4 +78,4 @@ using CPU = paddle::platform::CPUDeviceContext;
 REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker);
 REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel<CPU, ops::CumsumFunctor<float>>,
                        ops::CumKernel<CPU, ops::CumsumFunctor<double>>,
-                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>)
+                       ops::CumKernel<CPU, ops::CumsumFunctor<int>>);
diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu
index 70e2a1de5e24302646611cfea3b8dbe1562274e2..eb5fd99ccb844b1f1717b818e7807a384d6515eb 100644
--- a/paddle/fluid/operators/cumsum_op.cu
+++ b/paddle/fluid/operators/cumsum_op.cu
@@ -19,4 +19,4 @@ using CUDA = paddle::platform::CUDADeviceContext;
 
 REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel<CUDA, ops::CumsumFunctor<float>>,
                         ops::CumKernel<CUDA, ops::CumsumFunctor<double>>,
-                        ops::CumKernel<CUDA, ops::CumsumFunctor<int>>)
+                        ops::CumKernel<CUDA, ops::CumsumFunctor<int>>);
diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc
index 5eeb3dee095e330174f35fa8eebd418bf764b132..c0f2b49a04d9e88502c4b63bca493cd2b7ad1c5c 100644
--- a/paddle/fluid/operators/decayed_adagrad_op.cc
+++ b/paddle/fluid/operators/decayed_adagrad_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
 class DecayedAdagradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -51,12 +52,17 @@ class DecayedAdagradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("MomentOut", param_dims);
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param", "(Tensor) Input parameter");
     AddInput("Grad", "(Tensor) Input gradient");
     AddInput("Moment", "(Tensor) Second moment");
diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc
index 1fe9404c00335edbe3594486f8c403e69f2ab08f..d7a9bfbc437dbf4c723b9c87ff62ec6b62c38638 100644
--- a/paddle/fluid/operators/delete_var_op.cc
+++ b/paddle/fluid/operators/delete_var_op.cc
@@ -34,8 +34,7 @@ class DeleteVarOp : public framework::OperatorBase {
 
 class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DeleteVarOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of delete op").AsDuplicable();
     AddComment(R"DOC(
 Delete Operator.
diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt
deleted file mode 100644
index 94395ccfbcbd74ee40552a5c70dc8b8063a5f851..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-if(WITH_DISTRIBUTE)
-  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
-  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
-  set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-  cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc)
-endif()
diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.cc b/paddle/fluid/operators/detail/bytebuffer_stream.cc
deleted file mode 100644
index 741dd51de9e75feb608161579e56cb160b058ebb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/bytebuffer_stream.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-//       (https://github.com/tensorflow/tensorflow/) we borrow this
-//       file and did some modifications so that we can send gRPC
-//       requests without too much copying of the tensor data.
-
-#include "bytebuffer_stream.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-GrpcByteBufferSource::GrpcByteBufferSource() {}
-
-bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
-  cur_ = -1;
-  left_ = 0;
-  ptr_ = nullptr;
-  byte_count_ = 0;
-  bool ok = src.Dump(&slices_).ok();
-  if (!ok) {
-    slices_.clear();
-  }
-  return ok;
-}
-
-bool GrpcByteBufferSource::Next(const void** data, int* size) {
-  // Use loop instead of if in case buffer contained empty slices.
-  while (left_ == 0) {
-    // Advance to next slice.
-    cur_++;
-    if (cur_ >= slices_.size()) {
-      return false;
-    }
-    const ::grpc::Slice& s = slices_[cur_];
-    left_ = s.size();
-    ptr_ = reinterpret_cast<const char*>(s.begin());
-  }
-
-  *data = ptr_;
-  *size = left_;
-  byte_count_ += left_;
-  ptr_ += left_;
-  left_ = 0;
-  return true;
-}
-
-void GrpcByteBufferSource::BackUp(int count) {
-  ptr_ -= count;
-  left_ += count;
-  byte_count_ -= count;
-}
-
-bool GrpcByteBufferSource::Skip(int count) {
-  const void* data;
-  int size;
-  while (Next(&data, &size)) {
-    if (size >= count) {
-      BackUp(size - count);
-      return true;
-    }
-    // size < count;
-    count -= size;
-  }
-  // error or we have too large count;
-  return false;
-}
-
-google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
-  return byte_count_;
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h
deleted file mode 100644
index 099deb12d0e436427c147ab9b1eb553b712e14fb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/bytebuffer_stream.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-//       (https://github.com/tensorflow/tensorflow/) we borrow this
-//       file and did some modifications so that we can send gRPC
-//       requests without too much copying of the tensor data.
-
-#pragma once
-
-#include <grpc++/grpc++.h>
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
-class GrpcByteBufferSource
-    : public ::google::protobuf::io::ZeroCopyInputStream {
- public:
-  GrpcByteBufferSource();
-  bool Init(const ::grpc::ByteBuffer& src);  // Can be called multiple times.
-  bool Next(const void** data, int* size) override;
-  void BackUp(int count) override;
-  bool Skip(int count) override;
-  ::google::protobuf::int64 ByteCount() const override;
-
- private:
-  std::vector<::grpc::Slice> slices_;
-  size_t cur_;       // Current slice index.
-  int left_;         // Number of bytes in slices_[cur_] left to yield.
-  const char* ptr_;  // Address of next byte in slices_[cur_] to yield.
-  ::google::protobuf::int64 byte_count_;
-};
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
deleted file mode 100644
index ddeeebec58e02f1686fd2e3d3e5ac1a4c4fd3c59..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "grpc_client.h"
-#include "paddle/fluid/framework/threadpool.h"
-namespace paddle {
-namespace operators {
-namespace detail {
-
-bool RPCClient::AsyncSendVariable(const std::string& ep,
-                                  const platform::DeviceContext& ctx,
-                                  const framework::Scope& scope,
-                                  const std::string& var_name,
-                                  int64_t time_out) {
-  const platform::DeviceContext* p_ctx = &ctx;
-  const std::string ep_val = ep;
-  const std::string var_name_val = var_name;
-  const framework::Scope* p_scope = &scope;
-  const auto ch = GetChannel(ep_val);
-
-  framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
-    auto* var = p_scope->FindVar(var_name_val);
-    sendrecv::VariableMessage req;
-    SerializeToMessage(var_name_val, var, *p_ctx, &req);
-
-    // varhandle
-    VarHandle var_h;
-    var_h.ep = ep_val;
-    var_h.scope = p_scope;
-    var_h.name = var_name_val;
-    var_h.ctx = p_ctx;
-
-    // stub context
-    SendProcessor* s = new SendProcessor(ch);
-    s->Prepare(var_h, time_out);
-    s->response_call_back_ = NULL;
-
-    auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-    rpc->Finish(&s->reply_, &s->status_, (void*)s);
-  });
-
-  req_count_++;
-
-  return true;
-}
-
-void ProcGetResponse(const VarHandle& var_h,
-                     const sendrecv::VariableMessage& ret_msg) {
-  auto* outvar = var_h.scope->FindVar(var_h.name);
-  DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
-}
-
-bool RPCClient::AsyncGetVariable(const std::string& ep,
-                                 const platform::DeviceContext& ctx,
-                                 const framework::Scope& scope,
-                                 const std::string& var_name,
-                                 int64_t time_out) {
-  const platform::DeviceContext* p_ctx = &ctx;
-  const std::string ep_val = ep;
-  const std::string var_name_val = var_name;
-  const framework::Scope* p_scope = &scope;
-  const auto ch = GetChannel(ep_val);
-
-  framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
-    sendrecv::VariableMessage req;
-    req.set_varname(var_name_val);
-
-    // varhandle
-    VarHandle var_h;
-    var_h.ep = ep_val;
-    var_h.scope = p_scope;
-    var_h.name = var_name_val;
-    var_h.ctx = p_ctx;
-
-    // stub context
-    GetProcessor* s = new GetProcessor(ch);
-    s->Prepare(var_h, time_out);
-    s->response_call_back_ = ProcGetResponse;
-
-    auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-    rpc->Finish(&s->reply_, &s->status_, (void*)s);
-  });
-
-  req_count_++;
-
-  return true;
-}
-
-void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
-  const auto ch = GetChannel(ep);
-
-  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
-  s->Prepare(time_out);
-
-  sendrecv::VariableMessage req;
-  req.set_varname(BATCH_BARRIER_MESSAGE);
-  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
-  req_count_++;
-}
-
-void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) {
-  const auto ch = GetChannel(ep);
-  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
-  s->Prepare(time_out);
-
-  sendrecv::VariableMessage req;
-  req.set_varname(FETCH_BARRIER_MESSAGE);
-  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
-  rpc->Finish(&s->reply_, &s->status_, (void*)s);
-  req_count_++;
-}
-
-bool RPCClient::Wait() {
-  if (req_count_ <= 0) {
-    return true;
-  }
-  const size_t kReqCnt = req_count_;
-  bool a[kReqCnt];
-  std::vector<std::future<void>> waits(req_count_);
-
-  for (int i = 0; i < req_count_; i++) {
-    waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
-  }
-
-  for (int i = 0; i < req_count_; i++) {
-    waits[i].wait();
-  }
-
-  int last_req_count = req_count_;
-  req_count_ = 0;
-
-  for (int i = 0; i < last_req_count; i++) {
-    if (!a[i]) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-bool RPCClient::Proceed() {
-  void* tag = NULL;
-  bool ok = false;
-
-  // request counts.
-  if (!cq_.Next(&tag, &ok)) {
-    LOG(ERROR) << "Get meets CompletionQueue error";
-    return false;
-  }
-
-  GPR_ASSERT(ok);
-  PADDLE_ENFORCE(tag);
-
-  // TODO(gongwb): add more retries.
-  BaseProcessor* c = static_cast<BaseProcessor*>(tag);
-  if (!c->status_.ok()) {
-    LOG(ERROR) << "proc param error:" << c->var_h_.String()
-               << " grpc error:" << c->status_.error_message();
-    delete c;
-    return false;
-  }
-
-  c->Process();
-  delete c;
-  return true;
-}
-
-std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
-  auto it = channels_.find(ep);
-  if (it != channels_.end()) {
-    return it->second;
-  }
-
-  grpc::ChannelArguments args;
-  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 5000);
-  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
-  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
-  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
-
-  auto ch =
-      grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args);
-
-  channels_[ep] = ch;
-  return ch;
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h
deleted file mode 100644
index f520367dd981288416631fdad15241fb5d811d07..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/grpc_client.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <grpc++/grpc++.h>
-#include <grpc/support/log.h>
-#include <time.h>
-#include <chrono>
-#include <ctime>
-#include <functional>
-#include <iostream>
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-#include "paddle/fluid/operators/detail/simple_block_queue.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-struct VarHandle {
-  std::string ep;
-  const platform::DeviceContext* ctx;
-  const framework::Scope* scope;
-  std::string name;
-
-  std::string String() const {
-    std::ostringstream s;
-    s << "name:[" << name << "] ep:[" << ep << "]";
-    return s.str();
-  }
-};
-
-void ProcGetResponse(const VarHandle& var_h,
-                     const sendrecv::VariableMessage& msg);
-
-class BaseProcessor {
- public:
-  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
-    stub_ = sendrecv::SendRecvService::NewStub(ch);
-    context_ = NULL;
-  }
-
-  virtual ~BaseProcessor() {}
-
-  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
-    context_.reset(new grpc::ClientContext());
-    var_h_ = var_info;
-
-    std::chrono::system_clock::time_point deadline =
-        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
-
-    context_->set_deadline(deadline);
-  }
-
-  virtual void Prepare(int64_t time_out) {
-    context_.reset(new grpc::ClientContext());
-
-    std::chrono::system_clock::time_point deadline =
-        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
-
-    context_->set_deadline(deadline);
-  }
-
-  virtual void Process() = 0;
-
-  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
-  std::unique_ptr<grpc::ClientContext> context_;
-  grpc::Status status_;
-  VarHandle var_h_;
-};
-
-typedef std::function<void(const VarHandle&, const sendrecv::VoidMessage&)>
-    RequestSendCallBack;
-
-class SendProcessor : public BaseProcessor {
- public:
-  explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
-
-  virtual ~SendProcessor() {}
-
-  virtual void Process() {
-    if (response_call_back_) {
-      response_call_back_(var_h_, reply_);
-    }
-  }
-
-  sendrecv::VoidMessage reply_;
-  RequestSendCallBack response_call_back_ = NULL;
-};
-
-typedef std::function<void(const VarHandle&, const sendrecv::VariableMessage&)>
-    RequestGetCallBack;
-
-class GetProcessor : public BaseProcessor {
- public:
-  explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
-
-  virtual ~GetProcessor() {}
-
-  virtual void Process() {
-    if (response_call_back_) {
-      response_call_back_(var_h_, reply_);
-    }
-  }
-
-  sendrecv::VariableMessage reply_;
-  RequestGetCallBack response_call_back_ = ProcGetResponse;
-};
-
-class BatchBarrierProcessor : public BaseProcessor {
- public:
-  explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
-
-  virtual ~BatchBarrierProcessor() {}
-
-  virtual void Process() {}
-  sendrecv::VoidMessage reply_;
-};
-
-class FetchBarrierProcessor : public BaseProcessor {
- public:
-  explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
-      : BaseProcessor(ch) {}
-
-  virtual ~FetchBarrierProcessor() {}
-
-  virtual void Process() {}
-  sendrecv::VariableMessage reply_;
-};
-
-class RPCClient {
- public:
-  bool AsyncSendVariable(const std::string& ep,
-                         const platform::DeviceContext& ctx,
-                         const framework::Scope& scope,
-                         const std::string& var_name,
-                         int64_t time_out = 600 * 1000);
-
-  bool AsyncGetVariable(const std::string& ep,
-                        const platform::DeviceContext& ctx,
-                        const framework::Scope& scope,
-                        const std::string& var_name,
-                        int64_t time_out = 600 * 1000);
-
-  void AsyncSendBatchBarrier(const std::string& ep,
-                             int64_t time_out = 600 * 1000);
-
-  void AsyncSendFetchBarrier(const std::string& ep,
-                             int64_t time_out = 600 * 1000);
-
-  bool Wait();
-
- private:
-  bool Proceed();
-  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
-
- private:
-  grpc::CompletionQueue cq_;
-  std::map<std::string, std::shared_ptr<grpc::Channel>> channels_;
-  int64_t req_count_ = 0;
-};
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
deleted file mode 100644
index 8fff430cc4890925e4edba2fadb8eb7fc647d181..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detail/grpc_server.h"
-
-using grpc::ServerAsyncResponseWriter;
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-enum CallStatus { PROCESS = 0, FINISH };
-
-// reference:
-// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
-class RequestBase {
- public:
-  explicit RequestBase(sendrecv::SendRecvService::AsyncService* service,
-                       grpc::ServerCompletionQueue* cq)
-      : service_(service), cq_(cq), status_(PROCESS) {
-    PADDLE_ENFORCE(cq_);
-  }
-  virtual ~RequestBase() {}
-  virtual void Process() { assert(false); }
-
-  CallStatus Status() { return status_; }
-  void SetStatus(CallStatus status) { status_ = status; }
-  virtual std::string GetReqName() {
-    assert(false);
-    return "";
-  }
-
- protected:
-  grpc::ServerContext ctx_;
-  sendrecv::SendRecvService::AsyncService* service_;
-  grpc::ServerCompletionQueue* cq_;
-  CallStatus status_;
-};
-
-typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
-
-class RequestSend final : public RequestBase {
- public:
-  explicit RequestSend(sendrecv::SendRecvService::AsyncService* service,
-                       grpc::ServerCompletionQueue* cq,
-                       SimpleBlockQueue<MessageWithName>* queue)
-      : RequestBase(service, cq), queue_(queue), responder_(&ctx_) {
-    service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_,
-                                  this);
-  }
-
-  virtual ~RequestSend() {}
-
-  virtual std::string GetReqName() { return request_.varname(); }
-
-  virtual void Process() {
-    MessageWithName msg_with_name =
-        std::make_pair(request_.varname(), std::move(request_));
-    queue_->Push(std::move(msg_with_name));
-    responder_.Finish(reply_, grpc::Status::OK, this);
-    status_ = FINISH;
-  }
-
- protected:
-  sendrecv::VariableMessage request_;
-  sendrecv::VoidMessage reply_;
-  SimpleBlockQueue<MessageWithName>* queue_;
-  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
-};
-
-class RequestGet final : public RequestBase {
- public:
-  explicit RequestGet(sendrecv::SendRecvService::AsyncService* service,
-                      grpc::ServerCompletionQueue* cq, framework::Scope* scope,
-                      const platform::DeviceContext* dev_ctx,
-                      SimpleBlockQueue<MessageWithName>* queue)
-      : RequestBase(service, cq),
-        responder_(&ctx_),
-        scope_(scope),
-        dev_ctx_(dev_ctx),
-        queue_(queue) {
-    service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this);
-  }
-
-  virtual ~RequestGet() {}
-
-  virtual std::string GetReqName() { return request_.varname(); }
-
-  virtual void Process() {
-    // proc request.
-    std::string var_name = request_.varname();
-    auto* var = scope_->FindVar(var_name);
-    if (var_name != FETCH_BARRIER_MESSAGE) {
-      SerializeToMessage(var_name, var, *dev_ctx_, &reply_);
-    }
-    // TODO(gongwb): check var's info.
-    responder_.Finish(reply_, grpc::Status::OK, this);
-    status_ = FINISH;
-    MessageWithName msg_with_name =
-        //          request name    reply
-        std::make_pair(var_name, std::move(reply_));
-    queue_->Push(msg_with_name);
-  }
-
- protected:
-  sendrecv::VariableMessage request_;
-  sendrecv::VariableMessage reply_;
-  ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
-  framework::Scope* scope_;
-  const platform::DeviceContext* dev_ctx_;
-  SimpleBlockQueue<MessageWithName>* queue_;
-};
-
-void AsyncGRPCServer::WaitClientGet(int count) {
-  int fetch_barriers = 0;
-  while (fetch_barriers < count) {
-    auto msg = var_get_queue_.Pop();
-    if (msg.first == FETCH_BARRIER_MESSAGE) {
-      fetch_barriers++;
-    }
-  }
-}
-
-void AsyncGRPCServer::RunSyncUpdate() {
-  grpc::ServerBuilder builder;
-  builder.AddListeningPort(address_, grpc::InsecureServerCredentials());
-  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
-  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
-  builder.RegisterService(&service_);
-
-  cq_send_ = builder.AddCompletionQueue();
-  cq_get_ = builder.AddCompletionQueue();
-
-  server_ = builder.BuildAndStart();
-  LOG(INFO) << "Server listening on " << address_ << std::endl;
-
-  std::function<void()> send_register =
-      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
-  std::function<void()> get_register =
-      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
-
-  t_send_.reset(
-      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
-                                cq_send_.get(), "cq_send", send_register)));
-
-  t_get_.reset(
-      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
-                                cq_get_.get(), "cq_get", get_register)));
-
-  // wait server
-  server_->Wait();
-  t_send_->join();
-  t_get_->join();
-}
-
-void AsyncGRPCServer::ShutdownQueue() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  cq_send_->Shutdown();
-  cq_get_->Shutdown();
-  is_shut_down_ = true;
-}
-
-// This URL explains why shutdown is complicate:
-void AsyncGRPCServer::ShutDown() {
-  server_->Shutdown();
-  ShutdownQueue();
-}
-
-void AsyncGRPCServer::TryToRegisterNewSendOne() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  if (is_shut_down_) {
-    return;
-  }
-  RequestSend* send =
-      new RequestSend(&service_, cq_send_.get(), &var_recv_queue_);
-  VLOG(4) << "Create RequestSend status:" << send->Status();
-}
-
-void AsyncGRPCServer::TryToRegisterNewGetOne() {
-  std::unique_lock<std::mutex> lock(cq_mutex_);
-  if (is_shut_down_) {
-    return;
-  }
-  RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
-                                   &var_get_queue_);
-  VLOG(4) << "Create RequestGet status:" << get->Status();
-}
-
-// FIXME(typhoonzero): change cq_name to enum.
-void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq,
-                                    std::string cq_name,
-                                    std::function<void()> TryToRegisterNewOne) {
-  TryToRegisterNewOne();
-
-  void* tag = NULL;
-  bool ok = false;
-  while (true) {
-    if (!cq->Next(&tag, &ok)) {
-      LOG(INFO) << cq_name << " get CompletionQueue shutdown!";
-      break;
-    }
-
-    PADDLE_ENFORCE(tag);
-    // FIXME(typhoonzero): de-couple the barriers with recv_op
-    if (cq_name == "cq_get") WaitCond(1);
-    if (cq_name == "cq_send") WaitCond(0);
-
-    RequestBase* base = (RequestBase*)tag;
-    // reference:
-    // https://github.com/tensorflow/tensorflow/issues/5596
-    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
-    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
-    if (!ok) {
-      LOG(WARNING) << cq_name << " recv no regular event:argument name"
-                   << base->GetReqName();
-      TryToRegisterNewOne();
-      delete base;
-      continue;
-    }
-
-    switch (base->Status()) {
-      case PROCESS: {
-        VLOG(4) << cq_name << " status:" << base->Status();
-        TryToRegisterNewOne();
-        base->Process();
-        break;
-      }
-      case FINISH: {
-        VLOG(4) << cq_name << " status:" << base->Status();
-        delete base;
-        break;
-      }
-      default: { assert(false); }
-    }
-  }
-}
-
-void AsyncGRPCServer::WaitCond(int cond) {
-  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
-  barrier_condition_.wait(lock,
-                          [=] { return this->barrier_cond_step_ == cond; });
-}
-
-void AsyncGRPCServer::SetCond(int cond) {
-  {
-    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
-    barrier_cond_step_ = cond;
-  }
-  barrier_condition_.notify_all();
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h
deleted file mode 100644
index b6666bcf96e484b0b17b935c0efb2930f19b19f2..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/grpc_server.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/detail/simple_block_queue.h"
-
-#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
-
-#include <grpc++/grpc++.h>
-#include <grpc/support/log.h>
-#include <thread>
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
-class RequestBase;
-
-class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
- public:
-  explicit AsyncGRPCServer(const std::string &address) : address_(address) {}
-
-  void RunSyncUpdate();
-
-  // functions to sync server barrier status.
-  void WaitCond(int cond);
-  void SetCond(int cond);
-  void WaitClientGet(int count);
-
-  void SetScope(framework::Scope *scope) { scope_ = scope; }
-
-  void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
-
-  const MessageWithName Get() { return this->var_recv_queue_.Pop(); }
-
-  void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); }
-
-  void ShutDown();
-
- protected:
-  void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name,
-                     std::function<void()> TryToRegisterNewOne);
-  void TryToRegisterNewSendOne();
-  void TryToRegisterNewGetOne();
-  void ShutdownQueue();
-
- private:
-  std::mutex cq_mutex_;
-  volatile bool is_shut_down_ = false;
-  std::unique_ptr<grpc::ServerCompletionQueue> cq_send_;
-  std::unique_ptr<grpc::ServerCompletionQueue> cq_get_;
-
-  sendrecv::SendRecvService::AsyncService service_;
-  std::unique_ptr<grpc::Server> server_;
-
-  std::string address_;
-  framework::Scope *scope_;
-  const platform::DeviceContext *dev_ctx_;
-  // received variable from RPC, operators fetch variable from this queue.
-  SimpleBlockQueue<MessageWithName> var_recv_queue_;
-  SimpleBlockQueue<MessageWithName> var_get_queue_;
-
-  // condition of the sub program
-  std::mutex barrier_mutex_;
-  mutable int barrier_cond_step_;
-  std::condition_variable barrier_condition_;
-
-  std::unique_ptr<std::thread> t_send_;
-  std::unique_ptr<std::thread> t_get_;
-};
-
-};  // namespace detail
-};  // namespace operators
-};  // namespace paddle
diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f4a15caa5542a45cd8e26a72b055ca8948069d0
--- /dev/null
+++ b/paddle/fluid/operators/detail/macros.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#ifdef PADDLE_WITH_DISTRIBUTE
+
+#ifdef PADDLE_WITH_GRPC
+
+#include "paddle/fluid/operators/distributed/grpc_client.h"
+#include "paddle/fluid/operators/distributed/grpc_server.h"
+#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
+#define RPCCLIENT_T paddle::operators::distributed::GRPCClient
+
+#else  // PADDLE_WITH_GRPC
+
+#include "paddle/fluid/operators/distributed/brpc_client.h"
+#include "paddle/fluid/operators/distributed/brpc_server.h"
+#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
+#define RPCCLIENT_T paddle::operators::distributed::BRPCClient
+
+#endif  // PADDLE_WITH_GRPC
+
+#endif  // PADDLE_WITH_DISTRIBUTE
diff --git a/paddle/fluid/operators/detail/proto_encoder_helper.h b/paddle/fluid/operators/detail/proto_encoder_helper.h
deleted file mode 100644
index 4a7bfb8bd586fe84c9243bc64117d146c4386674..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/proto_encoder_helper.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-//       (https://github.com/tensorflow/tensorflow/) we borrow this
-//       file and did some modifications so that we can send gRPC
-//       requests without too much copying of the tensor data.
-
-#pragma once
-
-#include <grpc++/grpc++.h>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-char* EncodeVarint32(char* dst, uint32_t v) {
-  // Operate on characters as unsigneds
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
-  static const int B = 128;
-  if (v < (1 << 7)) {
-    *(ptr++) = v;
-  } else if (v < (1 << 14)) {
-    *(ptr++) = v | B;
-    *(ptr++) = v >> 7;
-  } else if (v < (1 << 21)) {
-    *(ptr++) = v | B;
-    *(ptr++) = (v >> 7) | B;
-    *(ptr++) = v >> 14;
-  } else if (v < (1 << 28)) {
-    *(ptr++) = v | B;
-    *(ptr++) = (v >> 7) | B;
-    *(ptr++) = (v >> 14) | B;
-    *(ptr++) = v >> 21;
-  } else {
-    *(ptr++) = v | B;
-    *(ptr++) = (v >> 7) | B;
-    *(ptr++) = (v >> 14) | B;
-    *(ptr++) = (v >> 21) | B;
-    *(ptr++) = v >> 28;
-  }
-  return reinterpret_cast<char*>(ptr);
-}
-
-char* EncodeVarint64(char* dst, uint64_t v) {
-  static const int B = 128;
-  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
-  while (v >= B) {
-    *(ptr++) = (v & (B - 1)) | B;
-    v >>= 7;
-  }
-  *(ptr++) = static_cast<unsigned char>(v);
-  return reinterpret_cast<char*>(ptr);
-}
-
-int VarintLength(uint64_t v) {
-  int len = 1;
-  while (v >= 128) {
-    v >>= 7;
-    len++;
-  }
-  return len;
-}
-
-class ProtoEncodeHelper {
- public:
-  ProtoEncodeHelper(char* buf, int max_size)
-      : base_(buf), p_(buf), limit_(base_ + max_size) {}
-
-  ~ProtoEncodeHelper() {
-    // Make sure callers didn't do operations that went over max_size promised
-    PADDLE_ENFORCE_LE(p_, limit_);
-  }
-
-  const char* data() const { return base_; }
-  size_t size() const { return p_ - base_; }
-
-  void WriteUint64(int tag, uint64_t v) {
-    Encode32(combine(tag, WIRETYPE_VARINT));
-    Encode64(v);
-  }
-  void WriteBool(int tag, bool v) {
-    Encode32(combine(tag, WIRETYPE_VARINT));
-    EncodeBool(v);
-  }
-  void WriteString(int tag, const std::string& v) {
-    Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
-    Encode32(v.size());
-    EncodeBytes(v.data(), v.size());
-  }
-  void WriteVarlengthBeginning(int tag, uint32_t len) {
-    Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
-    Encode32(len);
-  }
-  void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); }
-
- private:
-  // Note: this module's behavior must match the protocol buffer wire encoding
-  // format.
-  enum {
-    WIRETYPE_VARINT = 0,
-    WIRETYPE_LENGTH_DELIMITED = 2,
-  };
-  static uint32_t combine(uint32_t tag, uint32_t type) {
-    return ((tag << 3) | type);
-  }
-  inline void Encode32(uint32_t v) {
-    if (v < 128) {
-      // Fast path for single-byte values.  Many of the calls will use a
-      // constant value for v, so the comparison will get optimized away
-      // when Encode32 is inlined into the caller.
-      *p_ = v;
-      p_++;
-    } else {
-      p_ = EncodeVarint32(p_, v);
-    }
-  }
-  void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); }
-  void EncodeBool(bool v) {
-    *p_ = (v ? 1 : 0);  // Equal to varint32 encoding of 0 or 1
-    p_++;
-  }
-  void EncodeBytes(const char* bytes, int N) {
-    memcpy(p_, bytes, N);
-    p_ += N;
-  }
-
-  char* base_;
-  char* p_;
-  char* limit_;  // Just for CHECKs
-};
-
-}  // detail
-}  // operators
-}  // paddle
diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto
deleted file mode 100644
index b0215d4a80c9440f09c35434903fd6166b03e8b0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/send_recv.proto
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
-the Apache License, Version 2.0 (the "License"); you may not use this file
-except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-syntax = "proto3";
-package sendrecv;
-
-service SendRecvService {
-  // For parameter server round-robin like hashing, do not split tensors.
-  // Send and recv only one tensor
-  // TODO(typhoonzero): add streaming API
-  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
-  // Argument VariableMessage for GetVariable should only contain varname.
-  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
-}
-
-// VariableMessage is serialized paddle variable message.
-// It can be:
-// LoDTensor
-// SelectedRows
-enum VarType {
-  LOD_TENSOR = 0;
-  SELECTED_ROWS = 1;
-}
-
-message VariableMessage {
-  enum Type {
-    // Pod Types
-    BOOL = 0;
-    INT16 = 1;
-    INT32 = 2;
-    INT64 = 3;
-    FP16 = 4;
-    FP32 = 5;
-    FP64 = 6;
-  }
-
-  message LodData { repeated int64 lod_data = 1; }
-
-  string varname = 1;
-  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
-  VarType type = 2;
-  // bool persistable is not needed for sending.
-  // tensor info:
-  Type data_type = 3;
-  repeated int64 dims = 4;
-
-  // lod details:
-  int64 lod_level = 5;
-  repeated LodData lod = 6;
-  // tensor data
-  bytes serialized = 7;
-  // selected_rows data
-  bytes rows = 8;
-}
-
-message VoidMessage {}
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
deleted file mode 100644
index 39117eeeb611b025c426938c60ddf82c6af232ca..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ /dev/null
@@ -1,300 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-#include "google/protobuf/io/coded_stream.h"
-#include "google/protobuf/io/zero_copy_stream.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
-#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-void SerializeToMessage(const std::string& name, const framework::Variable* var,
-                        const platform::DeviceContext& ctx,
-                        sendrecv::VariableMessage* msg) {
-  msg->set_varname(name);
-  std::ostringstream oss;
-  switch (framework::ToVarType(var->Type())) {
-    case framework::proto::VarType_Type_LOD_TENSOR:
-      msg->set_type(sendrecv::VarType::LOD_TENSOR);
-      framework::SerializeToStream(oss, var->Get<framework::LoDTensor>(), ctx);
-      break;
-    case framework::proto::VarType_Type_SELECTED_ROWS:
-      msg->set_type(sendrecv::VarType::SELECTED_ROWS);
-      framework::SerializeToStream(oss, var->Get<framework::SelectedRows>(),
-                                   ctx);
-      break;
-    default: {
-      PADDLE_THROW("Serialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
-    }
-  }
-  msg->set_serialized(oss.str());
-}
-
-void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
-                            const platform::DeviceContext& ctx,
-                            framework::Variable* var) {
-  std::istringstream iss(msg.serialized());
-  switch (msg.type()) {
-    case sendrecv::VarType::LOD_TENSOR:
-      DeserializeFromStream(iss, var->GetMutable<framework::LoDTensor>(), ctx);
-      break;
-    case sendrecv::VarType::SELECTED_ROWS: {
-      DeserializeFromStream(iss, var->GetMutable<framework::SelectedRows>(),
-                            ctx);
-      break;
-    }
-    default: {
-      PADDLE_THROW("Deserialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
-    }
-  }
-}
-
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg) {
-  using VarMsg = sendrecv::VariableMessage;
-  sendrecv::VariableMessage request;
-  std::string header;
-  request.AppendToString(&header);
-  // When using GPU, need to free the copied CPU buffer
-  // when the ByteBuffer destroies
-  // TODO(typhoonzero): add unref here, if we have dependent
-  // parallelism execution, need to know when to free the tensor.
-  DestroyCallback destroy_callback = [](void* backing) {};
-
-  void* buf = malloc(1024);
-  void* payload = nullptr;
-  size_t payload_size;
-  ProtoEncodeHelper e((char*)buf, 1024);
-  e.WriteString(VarMsg::kVarnameFieldNumber, name);
-  if (var->IsType<framework::LoDTensor>()) {
-    e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
-  } else if (var->IsType<framework::SelectedRows>()) {
-    e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
-  }
-
-  switch (framework::ToVarType(var->Type())) {
-    case framework::proto::VarType_Type_LOD_TENSOR: {
-      auto tensor = var->Get<framework::LoDTensor>();
-      e.WriteUint64(VarMsg::kDataTypeFieldNumber,
-                    framework::ToDataType(tensor.type()));
-      for (auto& dim : framework::vectorize(tensor.dims())) {
-        e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
-      }
-      auto lod = tensor.lod();  // std::vector<Vector<size_t>>
-      if (lod.size() > 0) {
-        e.WriteUint64(VarMsg::kLodLevelFieldNumber, lod.size());
-
-        for (auto& each : lod) {
-          e.WriteVarlengthBeginning(VarMsg::kLodFieldNumber,
-                                    2 +      // tag + varintlength of submessage
-                                        1 +  // kLodDataFieldNumber
-                                        each.size());
-          // auto copied from GPU
-          for (auto& d : each) {
-            e.WriteUint64(VarMsg::LodData::kLodDataFieldNumber, d);
-          }
-        }
-      }
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-        PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
-        platform::CPUPlace cpu;
-        auto& gpu_dev_ctx =
-            static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size = tensor.memory_size();
-        payload = memory::Alloc(cpu, copy_size);
-        memory::Copy(cpu, payload,
-                     boost::get<platform::CUDAPlace>(tensor.place()),
-                     reinterpret_cast<const void*>(tensor.data<void>()),
-                     copy_size, gpu_dev_ctx.stream());
-        ctx.Wait();
-        destroy_callback = [](void* backing) {
-          platform::CPUPlace cpu;
-          memory::Free(cpu, backing);
-        };
-#endif
-      } else {
-        payload = tensor.data<void>();
-      }
-      payload_size = tensor.memory_size();
-      e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
-    } break;
-    case framework::proto::VarType_Type_SELECTED_ROWS: {
-      // TODO(typhoonzero): selectedrows implement should not use unique_ptr
-      auto* slr = var->GetMutable<framework::SelectedRows>();
-      e.WriteUint64(VarMsg::kDataTypeFieldNumber,
-                    framework::ToDataType(slr->value().type()));
-      for (auto& dim : framework::vectorize(slr->value().dims())) {
-        e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
-      }
-      e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
-      auto* tensor = slr->mutable_value();
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-        platform::CPUPlace cpu;
-        auto& gpu_dev_ctx =
-            static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size = tensor->memory_size();
-        payload = memory::Alloc(cpu, copy_size);
-        memory::Copy(cpu, payload,
-                     boost::get<platform::CUDAPlace>(tensor->place()),
-                     reinterpret_cast<const void*>(tensor->data<void>()),
-                     copy_size, gpu_dev_ctx.stream());
-        ctx.Wait();
-        destroy_callback = [](void* backing) {
-          platform::CPUPlace cpu;
-          memory::Free(cpu, backing);
-        };
-#endif
-      } else {
-        payload = slr->mutable_value()->data<void>();
-      }
-      payload_size = tensor->memory_size();
-      e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
-    } break;
-    default:
-      PADDLE_THROW("Serialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
-  }
-  // steal reference of tensor data
-  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
-  int num_slices = 2;       // only SelectedRows have rows buffer
-  slices[0] = ::grpc::Slice(e.size());
-  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
-  slices[1] = ::grpc::Slice(
-      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
-                                    static_cast<char*>(payload)),
-      ::grpc::Slice::STEAL_REF);
-
-  if (framework::ToVarType(var->Type()) ==
-      framework::proto::VarType_Type_SELECTED_ROWS) {
-    auto* slr = var->GetMutable<framework::SelectedRows>();
-
-    ProtoEncodeHelper e2((char*)buf, 128);
-    // NOTE: rows is of type int64_t
-    size_t rows_memory_size =
-        slr->rows().capacity() * framework::SizeOfType(typeid(int64_t));
-    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
-    slices[2] = ::grpc::Slice(e2.size());
-    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
-
-    slices[3] = ::grpc::Slice(
-        grpc_slice_new_with_user_data(
-            const_cast<void*>(
-                reinterpret_cast<const void*>(slr->rows().data())),
-            rows_memory_size,
-            [](void* backing) {
-              // TODO(typhoonzero): add unref here, same as above.
-            },
-            const_cast<char*>(
-                reinterpret_cast<const char*>(slr->rows().data()))),
-        ::grpc::Slice::STEAL_REF);
-    num_slices = 4;
-  }
-
-  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
-  msg->Swap(&tmp);
-}
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               framework::Variable* var) {
-  sendrecv::VariableMessage meta;
-  GrpcByteBufferSource source;
-  source.Init(msg);
-  ::google::protobuf::io::CodedInputStream input(&source);
-  // do zerocopy parsing
-  PADDLE_ENFORCE(meta.ParseFromCodedStream(&input));
-  PADDLE_ENFORCE(input.ConsumedEntireMessage());
-  // dims is needed by both tensor and selectedrows
-  std::vector<int> vecdims;
-  for (auto& d : meta.dims()) {
-    vecdims.push_back(d);
-  }
-  framework::DDim dims = framework::make_ddim(vecdims);
-
-  if (meta.type() == sendrecv::LOD_TENSOR) {
-    auto* tensor = var->GetMutable<framework::LoDTensor>();
-    tensor->Resize(dims);
-    void* tensor_data = tensor->mutable_data(
-        ctx.GetPlace(),
-        paddle::operators::detail::ToTypeIndex(meta.data_type()));
-    framework::LoD lod;
-    for (int i = 0; i < meta.lod_level(); ++i) {
-      framework::Vector<size_t> v;
-      for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) {
-        v.push_back(meta.lod(i).lod_data(j));
-      }
-      lod.push_back(v);
-    }
-    tensor->set_lod(lod);
-    // How to avoid copying and use the message buffer directly?
-    // Maybe need to find a way to release all memory except tensor content.
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      platform::CPUPlace cpu;
-      auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
-      memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
-                   tensor_data, cpu,
-                   reinterpret_cast<const void*>(meta.serialized().data()),
-                   meta.serialized().size(), gpu_dev_ctx.stream());
-      ctx.Wait();
-#endif
-    } else {
-      memcpy(tensor_data,
-             reinterpret_cast<const void*>(meta.serialized().data()),
-             meta.serialized().size());
-    }
-  } else if (meta.type() == sendrecv::SELECTED_ROWS) {
-    auto* slr = var->GetMutable<framework::SelectedRows>();
-    auto* tensor = slr->mutable_value();
-    int64_t* rows_data = slr->mutable_rows()->data();
-    tensor->Resize(dims);
-    void* tensor_data = tensor->mutable_data(
-        ctx.GetPlace(),
-        paddle::operators::detail::ToTypeIndex(meta.data_type()));
-    if (platform::is_gpu_place(ctx.GetPlace())) {
-#ifdef PADDLE_WITH_CUDA
-      platform::CPUPlace cpu;
-      auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
-      memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
-                   tensor_data, cpu,
-                   reinterpret_cast<const void*>(meta.serialized().data()),
-                   meta.serialized().size(), gpu_dev_ctx.stream());
-      ctx.Wait();
-#endif
-    } else {
-      memcpy(tensor_data,
-             reinterpret_cast<const void*>(meta.serialized().data()),
-             meta.serialized().size());
-    }
-    // copy rows CPU data, GPU data will be copied lazly
-    memcpy(rows_data, reinterpret_cast<const void*>(meta.rows().data()),
-           meta.rows().size());
-  }
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h
deleted file mode 100644
index 4fa6aefd3e0b1bd45ac52b1eff3b29126d79f03a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/sendrecvop_utils.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/var_type.h"
-
-#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
-#include "paddle/fluid/operators/detail/send_recv.pb.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
-#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
-#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
-
-typedef void (*DestroyCallback)(void*);
-
-void SerializeToMessage(const std::string& name, const framework::Variable* var,
-                        const platform::DeviceContext& ctx,
-                        sendrecv::VariableMessage* msg);
-
-void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
-                            const platform::DeviceContext& ctx,
-                            framework::Variable* var);
-
-void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
-                           const platform::DeviceContext& ctx,
-                           ::grpc::ByteBuffer* msg);
-
-void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
-                               const platform::DeviceContext& ctx,
-                               framework::Variable* var);
-
-inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
-  switch (type) {
-    case sendrecv::VariableMessage::FP32:
-      return typeid(float);  // NOLINT
-    case sendrecv::VariableMessage::FP64:
-      return typeid(double);  // NOLINT
-    case sendrecv::VariableMessage::INT32:
-      return typeid(int);  // NOLINT
-    case sendrecv::VariableMessage::INT64:
-      return typeid(int64_t);  // NOLINT
-    case sendrecv::VariableMessage::BOOL:
-      return typeid(bool);  // NOLINT
-    default:
-      PADDLE_THROW("Not support type %d", type);
-  }
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/simple_block_queue.h b/paddle/fluid/operators/detail/simple_block_queue.h
deleted file mode 100644
index 36b58b0c6700b5af7eaea92d2b0c32adaba35bb8..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/simple_block_queue.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <condition_variable>
-#include <deque>
-#include <mutex>
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-template <typename T>
-class SimpleBlockQueue {
- private:
-  std::mutex mutex_;
-  std::condition_variable condition_;
-  std::deque<T> queue_;
-
- public:
-  void Push(T const& value) {
-    {
-      std::unique_lock<std::mutex> lock(this->mutex_);
-      queue_.push_front(value);
-    }
-    this->condition_.notify_one();
-  }
-
-  T Pop() {
-    std::unique_lock<std::mutex> lock(this->mutex_);
-    this->condition_.wait(lock, [=] { return !this->queue_.empty(); });
-    T rc(std::move(this->queue_.back()));
-    this->queue_.pop_back();
-    return rc;
-  }
-};
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc
deleted file mode 100644
index 2f06e5a686b996858d21930a1afa2861efca4a9b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detail/test_serde.cc
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-#include <string>
-#include <thread>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/string/printf.h"
-
-namespace framework = paddle::framework;
-namespace platform = paddle::platform;
-namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
-namespace memory = paddle::memory;
-
-void RunSerdeTestTensor(platform::Place place) {
-  // serialize var to ByteBuffer
-  framework::Variable var;
-  auto* tensor = var.GetMutable<framework::LoDTensor>();
-  tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
-  framework::LoD lod;
-  lod.push_back(framework::Vector<size_t>({1, 3, 8}));
-  tensor->set_lod(lod);
-  int tensor_numel = 4 * 8 * 4 * 2;
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-  tensor->mutable_data<float>(place);
-  math::set_constant(ctx, tensor, 31.9);
-
-  ::grpc::ByteBuffer msg;
-  operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
-  EXPECT_GT(msg.Length(), 0);
-
-  // deserialize
-  std::vector<::grpc::Slice> slices;
-  (void)msg.Dump(&slices);
-  std::string tmp;
-  for (const auto& s : slices) {
-    tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
-  }
-  sendrecv::VariableMessage varmsg;
-  EXPECT_TRUE(varmsg.ParseFromString(tmp));
-  EXPECT_EQ(varmsg.varname(), "myvar");
-  EXPECT_EQ(varmsg.type(), 0);
-  EXPECT_EQ(varmsg.dims()[0], 4);
-  EXPECT_EQ(varmsg.dims()[1], 8);
-  EXPECT_EQ(varmsg.dims()[2], 4);
-  EXPECT_EQ(varmsg.dims()[3], 2);
-  EXPECT_EQ(varmsg.lod_level(), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
-  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
-
-  const float* tensor_data =
-      reinterpret_cast<const float*>(varmsg.serialized().data());
-  for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
-  }
-
-  // deserialize zero-copy
-  framework::Variable var2;
-  operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
-  auto tensor2 = var2.Get<framework::LoDTensor>();
-  float* tensor_data2 = nullptr;
-  framework::Tensor tmp_tensor;
-
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    platform::CPUPlace cpu;
-    framework::TensorCopy(tensor2, cpu, &tmp_tensor);
-    tensor_data2 = tmp_tensor.data<float>();
-  } else {
-    tensor_data2 = const_cast<float*>(tensor2.data<float>());
-  }
-
-  EXPECT_EQ(varmsg.lod_level(), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
-  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
-  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
-  for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
-}
-
-void RunSerdeTestSelectedRows(platform::Place place) {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& ctx = *pool.Get(place);
-
-  // serialize var to ByteBuffer
-  framework::Variable var;
-  auto* slr = var.GetMutable<framework::SelectedRows>();
-  auto* tensor = slr->mutable_value();
-  auto* rows = slr->mutable_rows();
-  tensor->Resize(framework::make_ddim({2, 10}));
-  tensor->mutable_data<float>(place);
-  int tensor_numel = 2 * 10;
-  math::set_constant(ctx, tensor, 32.7);
-  rows->push_back(3);
-  rows->push_back(10);
-
-  ::grpc::ByteBuffer msg;
-  operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
-  EXPECT_GT(msg.Length(), 0);
-
-  // deserialize
-  std::vector<::grpc::Slice> slices;
-  (void)msg.Dump(&slices);
-  std::string tmp;
-  for (const auto& s : slices) {
-    tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
-  }
-  sendrecv::VariableMessage varmsg;
-  EXPECT_TRUE(varmsg.ParseFromString(tmp));
-
-  EXPECT_EQ(varmsg.varname(), "myvar");
-  EXPECT_EQ(varmsg.type(), 1);
-
-  const float* tensor_data =
-      reinterpret_cast<const float*>(varmsg.serialized().data());
-  const int64_t* rows_data =
-      reinterpret_cast<const int64_t*>(varmsg.rows().data());
-  for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
-  }
-  EXPECT_EQ(rows_data[0], 3);
-  EXPECT_EQ(rows_data[1], 10);
-  // deserialize zero-copy
-  framework::Variable var2;
-  operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
-
-  auto* slr2 = var2.GetMutable<framework::SelectedRows>();
-  auto* tensor2 = slr2->mutable_value();
-  auto* rows2 = slr2->mutable_rows();
-  float* tensor_data2 = nullptr;
-  framework::Tensor tmp_tensor;
-
-  if (platform::is_gpu_place(ctx.GetPlace())) {
-    platform::CPUPlace cpu;
-    framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
-    tensor_data2 = tmp_tensor.data<float>();
-  } else {
-    tensor_data2 = const_cast<float*>(tensor2->data<float>());
-  }
-  const int64_t* rows_data2 = rows2->data();
-
-  for (int i = 0; i < tensor_numel; ++i) {
-    EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
-  }
-  EXPECT_EQ(rows_data2[0], 3);
-  EXPECT_EQ(rows_data2[1], 10);
-}
-
-TEST(SelectedRows, CPU) {
-  platform::CPUPlace place;
-  RunSerdeTestSelectedRows(place);
-}
-
-TEST(SelectedRows, GPU) {
-  platform::CUDAPlace place;
-  RunSerdeTestSelectedRows(place);
-}
-
-TEST(Tensor, CPU) {
-  platform::CPUPlace place;
-  RunSerdeTestTensor(place);
-}
-
-TEST(Tensor, GPU) {
-  platform::CUDAPlace place;
-  RunSerdeTestTensor(place);
-}
\ No newline at end of file
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6d296ff7bf14de9175dc589dfa8b46c534127ca1
--- /dev/null
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -0,0 +1,33 @@
+set(LOCAL_DETECTION_LIBS)
+
+function(detection_library TARGET_NAME)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    set(options "")
+    set(common_deps op_registry)
+    set(pybind_flag 0)
+    cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+    op_library(${TARGET_NAME} SRCS ${detection_library_SRCS} DEPS ${common_deps} ${detection_library_DEPS})
+    set(LOCAL_DETECTION_LIBS
+            ${TARGET_NAME}
+            ${LOCAL_DETECTION_LIBS}
+        PARENT_SCOPE)
+endfunction()
+
+detection_library(bipartite_match_op SRCS bipartite_match_op.cc)
+detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu)
+detection_library(iou_similarity_op SRCS iou_similarity_op.cc
+iou_similarity_op.cu)
+detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc)
+detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc)
+detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu)
+detection_library(anchor_generator_op SRCS anchor_generator_op.cc
+anchor_generator_op.cu)
+detection_library(target_assign_op SRCS target_assign_op.cc
+target_assign_op.cu)
+detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc
+    polygon_box_transform_op.cu)
+
+# Export local libraries to parent
+set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c0155a0a977846b1300d93b4c3fef0e71fc1d26
--- /dev/null
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/anchor_generator_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AnchorGeneratorOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of AnchorGeneratorOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Anchors"),
+                   "Output(Anchors) of AnchorGeneratorOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Variances"),
+        "Output(Variances) of AnchorGeneratorOp should not be null.");
+
+    auto input_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
+
+    auto anchor_sizes = ctx->Attrs().Get<std::vector<float>>("anchor_sizes");
+    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
+    auto stride = ctx->Attrs().Get<std::vector<float>>("stride");
+    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
+
+    size_t num_anchors = aspect_ratios.size() * anchor_sizes.size();
+
+    std::vector<int64_t> dim_vec(4);
+    dim_vec[0] = input_dims[2];
+    dim_vec[1] = input_dims[3];
+    dim_vec[2] = num_anchors;
+    dim_vec[3] = 4;
+    ctx->SetOutputDim("Anchors", framework::make_ddim(dim_vec));
+    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input",
+             "(Tensor, default Tensor<float>), "
+             "the input feature is a tensor with a rank of 4. "
+             "The layout is NCHW.");
+    AddOutput("Anchors",
+              "(Tensor, default Tensor<float>), the output is a "
+              "tensor with a rank of 4. The layout is [H, W, num_anchors, 4]. "
+              "H is the height of input, W is the width of input, num_anchors "
+              "is the box count of each position. "
+              "Each anchor is in (xmin, ymin, xmax, ymax) format");
+    AddOutput("Variances",
+              "(Tensor, default Tensor<float>), the expanded variances for "
+              "normalizing bbox regression targets. The layout is [H, W, "
+              "num_anchors, 4]. "
+              "H is the height of input, W is the width of input, num_anchors "
+              "is the box count of each position. "
+              "Each variance is in (xcenter, ycenter, w, h) format");
+
+    AddAttr<std::vector<float>>(
+        "anchor_sizes",
+        "(vector<float>) List of Region Proposal Network(RPN) anchor sizes "
+        " given in absolute pixels e.g. (64, 128, 256, 512)."
+        " For instance, the anchor size of 64 means the area of this anchor "
+        "equals to 64**2.")
+        .AddCustomChecker([](const std::vector<float>& anchor_sizes) {
+          PADDLE_ENFORCE_GT(anchor_sizes.size(), 0,
+                            "Size of anchor_sizes must be at least 1.");
+          for (size_t i = 0; i < anchor_sizes.size(); ++i) {
+            PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0,
+                              "anchor_sizes[%d] must be positive.", i);
+          }
+        });
+    AddAttr<std::vector<float>>(
+        "aspect_ratios",
+        "(vector<float>) List of Region Proposal Network(RPN) anchor aspect "
+        "ratios, e.g. (0.5, 1, 2)."
+        "For instacne, the aspect ratio of 0.5 means the height / width of "
+        "this anchor equals 0.5.");
+
+    AddAttr<std::vector<float>>("variances",
+                                "(vector<float>) List of variances to be used "
+                                "in box regression deltas")
+        .AddCustomChecker([](const std::vector<float>& variances) {
+          PADDLE_ENFORCE_EQ(variances.size(), 4,
+                            "Must and only provide 4 variance.");
+          for (size_t i = 0; i < variances.size(); ++i) {
+            PADDLE_ENFORCE_GT(variances[i], 0.0,
+                              "variance[%d] must be greater than 0.", i);
+          }
+        });
+
+    AddAttr<std::vector<float>>("stride",
+                                "Anchors stride across width and height, "
+                                "with a default of (16, 16)")
+        .SetDefault(std::vector<float>(2, 16.0))
+        .AddCustomChecker([](const std::vector<float>& stride) {
+          PADDLE_ENFORCE_EQ(
+              stride.size(), 2,
+              "Must and only provide 2 stride for width and height.");
+          for (size_t i = 0; i < stride.size(); ++i) {
+            PADDLE_ENFORCE_GT(stride[i], 0.0,
+                              "stride[%d] should be larger than 0.", i);
+          }
+        });
+
+    AddAttr<float>("offset",
+                   "(float) "
+                   "Anchor center offset, with a default of 0.5")
+        .SetDefault(0.5);
+    AddComment(R"DOC(
+AnchorGenerator operator
+Generates anchors for Faster RCNN, FPN etc. algorithm.
+Each position of the input produce N anchors, N =
+ size(anchor_sizes) * size(aspect_ratios).
+
+Please get more information from the following papers:
+https://arxiv.org/abs/1506.01497.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(anchor_generator, ops::AnchorGeneratorOp,
+                  ops::AnchorGeneratorOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(anchor_generator, ops::AnchorGeneratorOpKernel<float>,
+                       ops::AnchorGeneratorOpKernel<double>);
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3cc9bbeee1eeed17142a6b1bd23b45aff9cf745f
--- /dev/null
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cu
@@ -0,0 +1,132 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/anchor_generator_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
+                           const T* anchor_sizes, const int as_num,
+                           const T* stride, const int sd_num, const int height,
+                           const int width, const T offset) {
+  int num_anchors = as_num * ar_num;
+  int box_num = height * width * num_anchors;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
+       i += blockDim.x * gridDim.x) {
+    int h_idx = i / (num_anchors * width);
+    int w_idx = (i / num_anchors) % width;
+    T stride_width = stride[0];
+    T stride_height = stride[1];
+    T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1);
+    T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1);
+    T area, area_ratios;
+    T base_w, base_h;
+    T scale_w, scale_h;
+    T anchor_width, anchor_height;
+    int anch_idx = i % num_anchors;
+    int ar_idx = anch_idx / as_num;
+    int as_idx = anch_idx % as_num;
+    T aspect_ratio = aspect_ratios[ar_idx];
+    T anchor_size = anchor_sizes[as_idx];
+    area = stride_width * stride_height;
+    area_ratios = area / aspect_ratio;
+    base_w = round(sqrt(area_ratios));
+    base_h = round(base_w * aspect_ratio);
+    scale_w = anchor_size / stride_width;
+    scale_h = anchor_size / stride_height;
+    anchor_width = scale_w * base_w;
+    anchor_height = scale_h * base_h;
+
+    T xmin = (x_ctr - 0.5 * (anchor_width - 1));
+    T ymin = (y_ctr - 0.5 * (anchor_height - 1));
+    T xmax = (x_ctr + 0.5 * (anchor_width - 1));
+    T ymax = (y_ctr + 0.5 * (anchor_height - 1));
+    out[i * 4] = xmin;
+    out[i * 4 + 1] = ymin;
+    out[i * 4 + 2] = xmax;
+    out[i * 4 + 3] = ymax;
+  }
+}
+
+template <typename T>
+__global__ void SetVariance(T* out, const T* var, const int vnum,
+                            const int num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = var[i % vnum];
+  }
+}
+
+template <typename T>
+class AnchorGeneratorOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* anchors = ctx.Output<paddle::framework::Tensor>("Anchors");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto anchor_sizes = ctx.Attr<std::vector<float>>("anchor_sizes");
+    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto stride = ctx.Attr<std::vector<float>>("stride");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto width = input->dims()[3];
+    auto height = input->dims()[2];
+
+    int num_anchors = aspect_ratios.size() * anchor_sizes.size();
+
+    int box_num = width * height * num_anchors;
+
+    int block = 512;
+    int grid = (box_num + block - 1) / block;
+
+    auto stream =
+        ctx.template device_context<platform::CUDADeviceContext>().stream();
+
+    anchors->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    framework::Tensor ar;
+    framework::TensorFromVector(aspect_ratios, ctx.device_context(), &ar);
+
+    framework::Tensor as;
+    framework::TensorFromVector(anchor_sizes, ctx.device_context(), &as);
+
+    framework::Tensor sd;
+    framework::TensorFromVector(stride, ctx.device_context(), &sd);
+
+    GenAnchors<T><<<grid, block, 0, stream>>>(
+        anchors->data<T>(), ar.data<T>(), aspect_ratios.size(), as.data<T>(),
+        anchor_sizes.size(), sd.data<T>(), stride.size(), height, width,
+        offset);
+
+    framework::Tensor v;
+    framework::TensorFromVector(variances, ctx.device_context(), &v);
+    grid = (box_num * 4 + block - 1) / block;
+    SetVariance<T><<<grid, block, 0, stream>>>(vars->data<T>(), v.data<T>(),
+                                               variances.size(), box_num * 4);
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(anchor_generator,
+                        ops::AnchorGeneratorOpCUDAKernel<float>,
+                        ops::AnchorGeneratorOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0e499d76a19ba5f6b91ba4c8797684fb53c7caa
--- /dev/null
+++ b/paddle/fluid/operators/detection/anchor_generator_op.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class AnchorGeneratorOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* anchors = ctx.Output<paddle::framework::Tensor>("Anchors");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto anchor_sizes = ctx.Attr<std::vector<float>>("anchor_sizes");
+    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto stride = ctx.Attr<std::vector<float>>("stride");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T stride_width, stride_height;
+    stride_width = stride[0];
+    stride_height = stride[1];
+
+    int num_anchors = aspect_ratios.size() * anchor_sizes.size();
+
+    anchors->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    auto e_anchors = framework::EigenTensor<T, 4>::From(*anchors);
+    for (int h_idx = 0; h_idx < feature_height; ++h_idx) {
+      for (int w_idx = 0; w_idx < feature_width; ++w_idx) {
+        T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1);
+        T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1);
+        T area, area_ratios;
+        T base_w, base_h;
+        T scale_w, scale_h;
+        T anchor_width, anchor_height;
+        int idx = 0;
+        for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+          auto ar = aspect_ratios[r];
+          for (size_t s = 0; s < anchor_sizes.size(); ++s) {
+            auto anchor_size = anchor_sizes[s];
+            area = stride_width * stride_height;
+            area_ratios = area / ar;
+            base_w = round(sqrt(area_ratios));
+            base_h = round(base_w * ar);
+            scale_w = anchor_size / stride_width;
+            scale_h = anchor_size / stride_height;
+            anchor_width = scale_w * base_w;
+            anchor_height = scale_h * base_h;
+            e_anchors(h_idx, w_idx, idx, 0) =
+                (x_ctr - 0.5 * (anchor_width - 1));
+            e_anchors(h_idx, w_idx, idx, 1) =
+                (y_ctr - 0.5 * (anchor_height - 1));
+            e_anchors(h_idx, w_idx, idx, 2) =
+                (x_ctr + 0.5 * (anchor_width - 1));
+            e_anchors(h_idx, w_idx, idx, 3) =
+                (y_ctr + 0.5 * (anchor_height - 1));
+            idx++;
+          }
+        }
+      }
+    }
+
+    framework::Tensor var_t;
+    var_t.mutable_data<T>(
+        framework::make_ddim({1, static_cast<int>(variances.size())}),
+        ctx.GetPlace());
+    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
+    for (size_t i = 0; i < variances.size(); ++i) {
+      var_et(0, i) = variances[i];
+    }
+
+    int anchor_num = feature_height * feature_width * num_anchors;
+    auto var_dim = vars->dims();
+    vars->Resize({anchor_num, static_cast<int>(variances.size())});
+
+    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
+    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(anchor_num, 1));
+
+    vars->Resize(var_dim);
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c23b65fe4dead3ca01a447d03877e3359b19e656
--- /dev/null
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -0,0 +1,291 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+class BipartiteMatchOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
+                   "Input(DistMat) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchIndices"),
+        "Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchDist"),
+        "Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
+
+    auto dims = ctx->GetInputDim("DistMat");
+    PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
+
+    ctx->SetOutputDim("ColToRowMatchIndices", dims);
+    ctx->SetOutputDim("ColToRowMatchDist", dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("DistMat")->type()),
+        platform::CPUPlace());
+  }
+};
+
+template <class T>
+bool DistPairDescend(std::tuple<int, int, T> pair1,
+                     std::tuple<int, int, T> pair2) {
+  return std::get<2>(pair1) > std::get<2>(pair2);
+}
+
+template <typename T>
+class BipartiteMatchKernel : public framework::OpKernel<T> {
+ public:
+  // The match_indices must be initialized to -1 at first.
+  // The match_dist must be initialized to 0 at first.
+  void BipartiteMatch(const Tensor& dist, int* match_indices,
+                      T* match_dist) const {
+    PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
+    int64_t row = dist.dims()[0];
+    int64_t col = dist.dims()[1];
+    auto* dist_data = dist.data<T>();
+    // Test result: When row==130 the speed of these two methods almost the same
+    if (row >= 130) {
+      std::vector<std::tuple<int, int, T>> match_pair;
+
+      for (int64_t i = 0; i < row; ++i) {
+        for (int64_t j = 0; j < col; ++j) {
+          match_pair.push_back(std::make_tuple(i, j, dist_data[i * col + j]));
+        }
+      }
+      std::sort(match_pair.begin(), match_pair.end(), DistPairDescend<T>);
+      std::vector<int> row_indices(row, -1);
+
+      int64_t idx = 0;
+      for (int64_t k = 0; k < row * col; ++k) {
+        int64_t i = std::get<0>(match_pair[k]);
+        int64_t j = std::get<1>(match_pair[k]);
+        T dist = std::get<2>(match_pair[k]);
+
+        if (idx >= row) {
+          break;
+        }
+        if (match_indices[j] == -1 && row_indices[i] == -1 && dist > 0) {
+          match_indices[j] = i;
+          row_indices[i] = j;
+          match_dist[j] = dist;
+          idx += 1;
+        }
+      }
+    } else {
+      constexpr T kEPS = static_cast<T>(1e-6);
+      std::vector<int> row_pool;
+      for (int i = 0; i < row; ++i) {
+        row_pool.push_back(i);
+      }
+      while (row_pool.size() > 0) {
+        int max_idx = -1;
+        int max_row_idx = -1;
+        T max_dist = -1;
+        for (int64_t j = 0; j < col; ++j) {
+          if (match_indices[j] != -1) {
+            continue;
+          }
+          for (size_t k = 0; k < row_pool.size(); ++k) {
+            int m = row_pool[k];
+            // distance is 0 between m-th row and j-th column
+            if (dist_data[m * col + j] < kEPS) {
+              continue;
+            }
+            if (dist_data[m * col + j] > max_dist) {
+              max_idx = j;
+              max_row_idx = m;
+              max_dist = dist_data[m * col + j];
+            }
+          }
+        }
+        if (max_idx == -1) {
+          // Cannot find good match.
+          break;
+        } else {
+          PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
+          match_indices[max_idx] = max_row_idx;
+          match_dist[max_idx] = max_dist;
+          // Erase the row index.
+          row_pool.erase(
+              std::find(row_pool.begin(), row_pool.end(), max_row_idx));
+        }
+      }
+    }
+  }
+
+  void ArgMaxMatch(const Tensor& dist, int* match_indices, T* match_dist,
+                   T overlap_threshold) const {
+    constexpr T kEPS = static_cast<T>(1e-6);
+    int64_t row = dist.dims()[0];
+    int64_t col = dist.dims()[1];
+    auto* dist_data = dist.data<T>();
+    for (int64_t j = 0; j < col; ++j) {
+      if (match_indices[j] != -1) {
+        // the j-th column has been matched to one entity.
+        continue;
+      }
+      int max_row_idx = -1;
+      T max_dist = -1;
+      for (int i = 0; i < row; ++i) {
+        T dist = dist_data[i * col + j];
+        if (dist < kEPS) {
+          // distance is 0 between m-th row and j-th column
+          continue;
+        }
+        if (dist >= overlap_threshold && dist > max_dist) {
+          max_row_idx = i;
+          max_dist = dist;
+        }
+      }
+      if (max_row_idx != -1) {
+        PADDLE_ENFORCE_EQ(match_indices[j], -1);
+        match_indices[j] = max_row_idx;
+        match_dist[j] = max_dist;
+      }
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dist_mat = context.Input<LoDTensor>("DistMat");
+    auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
+    auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
+
+    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
+
+    auto col = dist_mat->dims()[1];
+
+    int64_t n = dist_mat->lod().size() == 0UL
+                    ? 1
+                    : static_cast<int64_t>(dist_mat->lod().back().size() - 1);
+    if (dist_mat->lod().size()) {
+      PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    match_indices->mutable_data<int>({n, col}, context.GetPlace());
+    match_dist->mutable_data<T>({n, col}, context.GetPlace());
+
+    math::SetConstant<platform::CPUDeviceContext, int> iset;
+    iset(dev_ctx, match_indices, static_cast<int>(-1));
+    math::SetConstant<platform::CPUDeviceContext, T> tset;
+    tset(dev_ctx, match_dist, static_cast<T>(0));
+
+    int* indices = match_indices->data<int>();
+    T* dist = match_dist->data<T>();
+    auto type = context.Attr<std::string>("match_type");
+    auto threshold = context.Attr<float>("dist_threshold");
+    if (n == 1) {
+      BipartiteMatch(*dist_mat, indices, dist);
+      if (type == "per_prediction") {
+        ArgMaxMatch(*dist_mat, indices, dist, threshold);
+      }
+    } else {
+      auto lod = dist_mat->lod().back();
+      for (size_t i = 0; i < lod.size() - 1; ++i) {
+        Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
+        BipartiteMatch(one_ins, indices + i * col, dist + i * col);
+        if (type == "per_prediction") {
+          ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold);
+        }
+      }
+    }
+  }
+};
+
+class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "DistMat",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[K, M]. It is pair-wise distance matrix between the entities "
+        "represented by each row and each column. For example, assumed one "
+        "entity is A with shape [K], another entity is B with shape [M]. The "
+        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
+        "the distance is, the better macthing the pairs are. Please note, "
+        "This tensor can contain LoD information to represent a batch of "
+        "inputs. One instance of this batch can contain different numbers of "
+        "entities.");
+    AddAttr<std::string>(
+        "match_type",
+        "(string, defalut: per_prediction) "
+        "The type of matching method, should be 'bipartite' or "
+        "'per_prediction', 'bipartite' by defalut.")
+        .SetDefault("bipartite")
+        .InEnum({"bipartite", "per_prediction"});
+    AddAttr<float>(
+        "dist_threshold",
+        "(float, defalut: 0.5) "
+        "If `match_type` is 'per_prediction', this threshold is to determine "
+        "the extra matching bboxes based on the maximum distance.")
+        .SetDefault(0.5);
+    AddOutput("ColToRowMatchIndices",
+              "(Tensor) A 2-D Tensor with shape [N, M] in int type. "
+              "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it "
+              "means B[j] does not match any entity in i-th instance. "
+              "Otherwise, it means B[j] is matched to row "
+              "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
+              "i-th instance is saved in ColToRowMatchIndices[i][j].");
+    AddOutput("ColToRowMatchDist",
+              "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
+              "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
+              "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
+              "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
+              "instance are called LoD. Then "
+              "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
+    AddComment(R"DOC(
+This operator is a greedy bipartite matching algorithm, which is used to
+obtain the matching with the maximum distance based on the input
+distance matrix. For input 2D matrix, the bipartite matching algorithm can
+find the matched column for each row, also can find the matched row for
+each column. And this operator only calculate matched indices from column
+to row. For each instance, the number of matched indices is the number of
+of columns of the input distance matrix.
+
+There are two outputs to save matched indices and distance.
+A simple description, this algorithm matched the best (maximum distance)
+row entity to the column entity and the matched indices are not duplicated
+in each row of ColToRowMatchIndices. If the column entity is not matched
+any row entity, set -1 in ColToRowMatchIndices.
+
+Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
+If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
+If Tensor, the height of ColToRowMatchIndices is 1.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp,
+                  ops::BipartiteMatchOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel<float>,
+                       ops::BipartiteMatchKernel<double>);
diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d0f95f727fdbc82777147e3e8ada6ad4f7a35e60
--- /dev/null
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -0,0 +1,151 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/box_coder_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BoxCoderOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+                   "Input(PriorBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
+                   "Input(TargetBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
+                   "Output(OutputBox) of BoxCoderOp should not be null.");
+
+    auto prior_box_dims = ctx->GetInputDim("PriorBox");
+    auto target_box_dims = ctx->GetInputDim("TargetBox");
+
+    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
+                      "The rank of Input of PriorBoxVar must be 2");
+    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
+    if (ctx->HasInput("PriorBoxVar")) {
+      auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
+      PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+    }
+
+    auto code_type = GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                        "The rank of Input of TargetBox must be 2");
+      PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+                        "The shape of TargetBox is [M, 4]");
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
+                        "The rank of Input of TargetBox must be 3");
+      PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
+      PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
+    }
+
+    ctx->SetOutputDim(
+        "OutputBox",
+        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
+    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
+  }
+};
+
+class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "PriorBox",
+        "(Tensor, default Tensor<float>) "
+        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
+        "each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the anchor box, "
+        "if the input is image feature map, they are close to the origin "
+        "of the coordinate system. [xmax, ymax] is the right bottom "
+        "coordinate of the anchor box.");
+    AddInput("PriorBoxVar",
+             "(Tensor, default Tensor<float>, optional) "
+             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
+             "of variance. PriorBoxVar will set all elements to 1 by "
+             "default.")
+        .AsDispensable();
+    AddInput(
+        "TargetBox",
+        "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape "
+        "[N, 4] when code_type is 'encode_center_size'. This input also can "
+        "be a 3-D Tensor with shape [N, M, 4] when code_type is "
+        "'decode_center_size'. [N, 4], each box is represented as "
+        "[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate "
+        "of the box if the input is image feature map, they are close to "
+        "the origin of the coordinate system. [xmax, ymax] is the right "
+        "bottom coordinate of the box. This tensor can contain LoD "
+        "information to represent a batch of inputs. One instance of this "
+        "batch can contain different numbers of entities.");
+    AddAttr<std::string>("code_type",
+                         "(string, default encode_center_size) "
+                         "the code type used with the target box")
+        .SetDefault("encode_center_size")
+        .InEnum({"encode_center_size", "decode_center_size"});
+    AddAttr<bool>("box_normalized",
+                  "(bool, default true) "
+                  "whether treat the priorbox as a noramlized box")
+        .SetDefault(true);
+    AddOutput("OutputBox",
+              "(LoDTensor or Tensor) "
+              "When code_type is 'encode_center_size', the output tensor of "
+              "box_coder_op with shape [N, M, 4] representing the result of N "
+              "target boxes encoded with M Prior boxes and variances. When "
+              "code_type is 'decode_center_size', N represents the batch size "
+              "and M represents the number of deocded boxes.");
+
+    AddComment(R"DOC(
+
+Bounding Box Coder.
+
+Encode/Decode the target bounding box with the priorbox information.
+
+The Encoding schema described below:
+
+    ox = (tx - px) / pw / pxv
+
+    oy = (ty - py) / ph / pyv
+
+    ow = log(abs(tw / pw)) / pwv 
+
+    oh = log(abs(th / ph)) / phv 
+
+The Decoding schema described below:
+
+    ox = (pw * pxv * tx * + px) - tw / 2
+
+    oy = (ph * pyv * ty * + py) - th / 2
+
+    ow = exp(pwv * tw) * pw + tw / 2
+
+    oh = exp(phv * th) * ph + th / 2
+
+where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
+and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
+priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
+`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the
+encoded/decoded coordinates, width and height.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    box_coder, ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BoxCoderKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a7af111f63d654319dd1d90d2032956951dfe49e
--- /dev/null
+++ b/paddle/fluid/operators/detection/box_coder_op.cu
@@ -0,0 +1,171 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/box_coder_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       const bool normalized, T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < row * col) {
+    const int row_idx = idx / col;
+    const int col_idx = idx % col;
+    T prior_box_width = prior_box_data[col_idx * len + 2] -
+                        prior_box_data[col_idx * len] + (normalized == false);
+    T prior_box_height = prior_box_data[col_idx * len + 3] -
+                         prior_box_data[col_idx * len + 1] +
+                         (normalized == false);
+    T prior_box_center_x =
+        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
+    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
+                            prior_box_data[col_idx * len + 1]) /
+                           2;
+
+    T target_box_center_x =
+        (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
+        2;
+    T target_box_center_y = (target_box_data[row_idx * len + 3] +
+                             target_box_data[row_idx * len + 1]) /
+                            2;
+    T target_box_width = target_box_data[row_idx * len + 2] -
+                         target_box_data[row_idx * len] + (normalized == false);
+    T target_box_height = target_box_data[row_idx * len + 3] -
+                          target_box_data[row_idx * len + 1] +
+                          (normalized == false);
+
+    output[idx * len] =
+        (target_box_center_x - prior_box_center_x) / prior_box_width;
+    output[idx * len + 1] =
+        (target_box_center_y - prior_box_center_y) / prior_box_height;
+    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width));
+    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height));
+    if (prior_box_var_data) {
+      output[idx * len] /= prior_box_var_data[col_idx * len];
+      output[idx * len + 1] /= prior_box_var_data[col_idx * len + 1];
+      output[idx * len + 2] /= prior_box_var_data[col_idx * len + 2];
+      output[idx * len + 3] /= prior_box_var_data[col_idx * len + 3];
+    }
+  }
+}
+
+template <typename T>
+__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       const bool normalized, T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < row * col) {
+    const int col_idx = idx % col;
+    T prior_box_width = prior_box_data[col_idx * len + 2] -
+                        prior_box_data[col_idx * len] + (normalized == false);
+    T prior_box_height = prior_box_data[col_idx * len + 3] -
+                         prior_box_data[col_idx * len + 1] +
+                         (normalized == false);
+    T prior_box_center_x =
+        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
+    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
+                            prior_box_data[col_idx * len + 1]) /
+                           2;
+    T target_box_width, target_box_height;
+    T target_box_center_x, target_box_center_y;
+    if (prior_box_var_data) {
+      target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
+                             target_box_data[idx * len + 2]) *
+                         prior_box_width;
+      target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
+                              target_box_data[idx * len + 3]) *
+                          prior_box_height;
+      target_box_center_x = prior_box_var_data[col_idx * len] *
+                                target_box_data[idx * len] * prior_box_width +
+                            prior_box_center_x;
+      target_box_center_y = prior_box_var_data[col_idx * len + 1] *
+                                target_box_data[idx * len + 1] *
+                                prior_box_height +
+                            prior_box_center_y;
+    } else {
+      target_box_width = exp(target_box_data[idx * len + 2]) * prior_box_width;
+      target_box_height =
+          exp(target_box_data[idx * len + 3]) * prior_box_height;
+      target_box_center_x =
+          target_box_data[idx * len] * prior_box_width + prior_box_center_x;
+      target_box_center_y = target_box_data[idx * len + 1] * prior_box_height +
+                            prior_box_center_y;
+    }
+
+    output[idx * len] = target_box_center_x - target_box_width / 2;
+    output[idx * len + 1] = target_box_center_y - target_box_height / 2;
+    output[idx * len + 2] =
+        target_box_center_x + target_box_width / 2 - (normalized == false);
+    output[idx * len + 3] =
+        target_box_center_y + target_box_height / 2 - (normalized == false);
+  }
+}
+
+template <typename DeviceContext, typename T>
+class BoxCoderCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+
+    const T* prior_box_data = prior_box->data<T>();
+    const T* target_box_data = target_box->data<T>();
+    const T* prior_box_var_data = nullptr;
+    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+
+    if (target_box->lod().size()) {
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
+                        "Only support 1 level of LoD.");
+    }
+    auto row = target_box->dims()[0];
+    auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
+    int block = 512;
+    int grid = (row * col + block - 1) / block;
+    auto& device_ctx = context.cuda_device_context();
+
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
+    T* output = output_box->data<T>();
+
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
+          normalized, output);
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
+          normalized, output);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    box_coder,
+    ops::BoxCoderCUDAKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::BoxCoderCUDAKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ed8520acddfa8fe2105a7c1615bcb3243cb130f
--- /dev/null
+++ b/paddle/fluid/operators/detection/box_coder_op.h
@@ -0,0 +1,179 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
+
+inline BoxCodeType GetBoxCodeType(const std::string& type) {
+  if (type == "encode_center_size") {
+    return BoxCodeType::kEncodeCenterSize;
+  } else if (type == "decode_center_size") {
+    return BoxCodeType::kDecodeCenterSize;
+  }
+  PADDLE_THROW("Not support type %s.", type);
+}
+
+template <typename DeviceContext, typename T>
+class BoxCoderKernel : public framework::OpKernel<T> {
+ public:
+  void EncodeCenterSize(const framework::Tensor* target_box,
+                        const framework::Tensor* prior_box,
+                        const framework::Tensor* prior_box_var,
+                        const bool normalized, T* output) const {
+    int64_t row = target_box->dims()[0];
+    int64_t col = prior_box->dims()[0];
+    int64_t len = prior_box->dims()[1];
+    auto* target_box_data = target_box->data<T>();
+    auto* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = nullptr;
+    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+
+    for (int64_t i = 0; i < row; ++i) {
+      for (int64_t j = 0; j < col; ++j) {
+        T prior_box_width = prior_box_data[j * len + 2] -
+                            prior_box_data[j * len] + (normalized == false);
+        T prior_box_height = prior_box_data[j * len + 3] -
+                             prior_box_data[j * len + 1] +
+                             (normalized == false);
+        T prior_box_center_x =
+            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
+        T prior_box_center_y =
+            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
+
+        T target_box_center_x =
+            (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
+        T target_box_center_y =
+            (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
+        T target_box_width = target_box_data[i * len + 2] -
+                             target_box_data[i * len] + (normalized == false);
+        T target_box_height = target_box_data[i * len + 3] -
+                              target_box_data[i * len + 1] +
+                              (normalized == false);
+
+        size_t offset = i * col * len + j * len;
+        output[offset] =
+            (target_box_center_x - prior_box_center_x) / prior_box_width;
+        output[offset + 1] =
+            (target_box_center_y - prior_box_center_y) / prior_box_height;
+        output[offset + 2] =
+            std::log(std::fabs(target_box_width / prior_box_width));
+        output[offset + 3] =
+            std::log(std::fabs(target_box_height / prior_box_height));
+        if (prior_box_var) {
+          output[offset] /= prior_box_var_data[j * len];
+          output[offset + 1] /= prior_box_var_data[j * len + 1];
+          output[offset + 2] /= prior_box_var_data[j * len + 2];
+          output[offset + 3] /= prior_box_var_data[j * len + 3];
+        }
+      }
+    }
+  }
+  void DecodeCenterSize(const framework::Tensor* target_box,
+                        const framework::Tensor* prior_box,
+                        const framework::Tensor* prior_box_var,
+                        const bool normalized, T* output) const {
+    int64_t row = target_box->dims()[0];
+    int64_t col = prior_box->dims()[0];
+    int64_t len = prior_box->dims()[1];
+
+    auto* target_box_data = target_box->data<T>();
+    auto* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = nullptr;
+    if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
+
+    for (int64_t i = 0; i < row; ++i) {
+      for (int64_t j = 0; j < col; ++j) {
+        size_t offset = i * col * len + j * len;
+        T prior_box_width = prior_box_data[j * len + 2] -
+                            prior_box_data[j * len] + (normalized == false);
+        T prior_box_height = prior_box_data[j * len + 3] -
+                             prior_box_data[j * len + 1] +
+                             (normalized == false);
+        T prior_box_center_x =
+            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
+        T prior_box_center_y =
+            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
+
+        T target_box_center_x = 0, target_box_center_y = 0;
+        T target_box_width = 0, target_box_height = 0;
+        if (prior_box_var) {
+          target_box_center_x = prior_box_var_data[j * len] *
+                                    target_box_data[offset] * prior_box_width +
+                                prior_box_center_x;
+          target_box_center_y = prior_box_var_data[j * len + 1] *
+                                    target_box_data[offset + 1] *
+                                    prior_box_height +
+                                prior_box_center_y;
+          target_box_width = std::exp(prior_box_var_data[j * len + 2] *
+                                      target_box_data[offset + 2]) *
+                             prior_box_width;
+          target_box_height = std::exp(prior_box_var_data[j * len + 3] *
+                                       target_box_data[offset + 3]) *
+                              prior_box_height;
+        } else {
+          target_box_center_x =
+              target_box_data[offset] * prior_box_width + prior_box_center_x;
+          target_box_center_y = target_box_data[offset + 1] * prior_box_height +
+                                prior_box_center_y;
+          target_box_width =
+              std::exp(target_box_data[offset + 2]) * prior_box_width;
+          target_box_height =
+              std::exp(target_box_data[offset + 3]) * prior_box_height;
+        }
+
+        output[offset] = target_box_center_x - target_box_width / 2;
+        output[offset + 1] = target_box_center_y - target_box_height / 2;
+        output[offset + 2] =
+            target_box_center_x + target_box_width / 2 - (normalized == false);
+        output[offset + 3] =
+            target_box_center_y + target_box_height / 2 - (normalized == false);
+      }
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
+
+    if (target_box->lod().size()) {
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    auto row = target_box->dims()[0];
+    auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
+
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
+
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    bool normalized = context.Attr<bool>("box_normalized");
+    T* output = output_box->data<T>();
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
+                       output);
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      DecodeCenterSize(target_box, prior_box, prior_box_var, normalized,
+                       output);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9c89b7ca9af1b235659554afc805600d31ef8ea6
--- /dev/null
+++ b/paddle/fluid/operators/detection/iou_similarity_op.cc
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/iou_similarity_op.h"
+
+namespace paddle {
+namespace operators {
+
+class IOUSimilarityOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IOUSimilarityOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of IOUSimilarityOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2.");
+    PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]");
+
+    ctx->ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]}));
+  }
+};
+
+class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, "
+             "each box is represented as [xmin, ymin, xmax, ymax], "
+             "the shape of X is [N, 4]. [xmin, ymin] is the left top "
+             "coordinate of the box if the input is image feature map, they "
+             "are close to the origin of the coordinate system. "
+             "[xmax, ymax] is the right bottom coordinate of the box. "
+             "This tensor can contain LoD information to represent a batch "
+             "of inputs. One instance of this batch can contain different "
+             "numbers of entities.");
+    AddInput("Y",
+             "(Tensor, default Tensor<float>) "
+             "Box list Y holds M boxes, each box is represented as "
+             "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. "
+             "[xmin, ymin] is the left top coordinate of the box if the "
+             "input is image feature map, and [xmax, ymax] is the right "
+             "bottom coordinate of the box.");
+
+    AddOutput("Out",
+              "(LoDTensor, the lod is same as input X) The output of "
+              "iou_similarity op, a tensor with shape [N, M] "
+              "representing pairwise iou scores.");
+
+    AddComment(R"DOC(
+**IOU Similarity Operator**
+
+Computes intersection-over-union (IOU) between two box lists.
+Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
+boxes in 'Y' are shared by all instance of the batched inputs of X.
+Given two boxes A and B, the calculation of IOU is as follows:
+
+$$
+IOU(A, B) = 
+\\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)}
+$$
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(iou_similarity, ops::IOUSimilarityOp,
+                  ops::IOUSimilarityOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    iou_similarity,
+    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8342b4138c87e6ea1803146bac6d6954a569ef5f
--- /dev/null
+++ b/paddle/fluid/operators/detection/iou_similarity_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/iou_similarity_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    iou_similarity,
+    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.h b/paddle/fluid/operators/detection/iou_similarity_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f193ebc59b7be44b987db7d068c209ef7f5a8da
--- /dev/null
+++ b/paddle/fluid/operators/detection/iou_similarity_op.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/for_range.h"
+
+template <typename T>
+inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2,
+                                  T ymin2, T xmax2, T ymax2) {
+  constexpr T zero = static_cast<T>(0);
+  T area1 = (ymax1 - ymin1) * (xmax1 - xmin1);
+  T area2 = (ymax2 - ymin2) * (xmax2 - xmin2);
+  T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1;
+  T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1;
+  T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2;
+  T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2;
+  T inter_height = inter_ymax - inter_ymin;
+  T inter_width = inter_xmax - inter_xmin;
+  inter_height = inter_height > zero ? inter_height : zero;
+  inter_width = inter_width > zero ? inter_width : zero;
+  T inter_area = inter_width * inter_height;
+  T union_area = area1 + area2 - inter_area;
+  T sim_score = inter_area / union_area;
+  return sim_score;
+}
+
+template <typename T>
+struct IOUSimilarityFunctor {
+  IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols)
+      : x_(x), y_(y), z_(z), cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t tid) const {
+    size_t row_id = tid / cols_;
+    size_t col_id = tid % cols_;
+
+    T x_min1 = x_[row_id * 4];
+    T y_min1 = x_[row_id * 4 + 1];
+    T x_max1 = x_[row_id * 4 + 2];
+    T y_max1 = x_[row_id * 4 + 3];
+
+    T x_min2 = y_[col_id * 4];
+    T y_min2 = y_[col_id * 4 + 1];
+    T x_max2 = y_[col_id * 4 + 2];
+    T y_max2 = y_[col_id * 4 + 3];
+
+    T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2,
+                          x_max2, y_max2);
+
+    z_[row_id * cols_ + col_id] = sim;
+  }
+  const T* x_;
+  const T* y_;
+  T* z_;
+  const size_t cols_;
+};
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class IOUSimilarityKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::LoDTensor* in_x = ctx.Input<framework::LoDTensor>("X");
+    const framework::Tensor* in_y = ctx.Input<framework::Tensor>("Y");
+    framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
+
+    int x_n = in_x->dims()[0];
+    int y_n = in_y->dims()[0];
+    IOUSimilarityFunctor<T> functor(in_x->data<T>(), in_y->data<T>(),
+                                    out->mutable_data<T>(ctx.GetPlace()), y_n);
+
+    platform::ForRange<DeviceContext> for_range(
+        static_cast<const DeviceContext&>(ctx.device_context()), x_n * y_n);
+    for_range(functor);
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d4a09bae3a98e4518f9885c1e9182f7033a0d262
--- /dev/null
+++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
@@ -0,0 +1,333 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+enum MiningType { kNone = 0, kMaxNegative, kHardExample };
+
+template <typename T>
+bool SortScoreDescend(const std::pair<float, T>& pair1,
+                      const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
+                             const float match_dist,
+                             const float neg_dist_threshold) {
+  if (mining_type == MiningType::kMaxNegative) {
+    return match_idx == -1 && match_dist < neg_dist_threshold;
+  } else if (mining_type == MiningType::kHardExample) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+inline MiningType GetMiningType(std::string str) {
+  if (str == "max_negative") {
+    return MiningType::kMaxNegative;
+  } else if (str == "hard_example") {
+    return MiningType::kHardExample;
+  } else {
+    return MiningType::kNone;
+  }
+}
+
+template <typename DeviceContext, typename T>
+class MineHardExamplesKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
+    auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
+    auto* in_matched_indices = ctx.Input<framework::Tensor>("MatchIndices");
+    auto* in_match_dist = ctx.Input<framework::Tensor>("MatchDist");
+    float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
+    T neg_dist_threshold =
+        static_cast<T>(ctx.Attr<float>("neg_dist_threshold"));
+    int sample_size = ctx.Attr<int>("sample_size");
+    MiningType mining_type =
+        GetMiningType(ctx.Attr<std::string>("mining_type"));
+
+    auto out_neg_indices = ctx.Output<framework::LoDTensor>("NegIndices");
+    auto out_match_indices =
+        ctx.Output<framework::Tensor>("UpdatedMatchIndices");
+
+    framework::TensorCopy(*in_matched_indices, ctx.GetPlace(),
+                          out_match_indices);
+
+    int batch_size = in_matched_indices->dims()[0];
+    int prior_num = in_matched_indices->dims()[1];
+
+    auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indices);
+
+    auto match_indices_et =
+        framework::EigenMatrix<int>::From(*out_match_indices);
+
+    auto match_dist = framework::EigenMatrix<T>::From(*in_match_dist);
+
+    const T* cls_loss = in_cls_loss->data<T>();
+    const T* loc_loss = nullptr;
+    if (in_loc_loss) {
+      loc_loss = in_loc_loss->data<T>();
+    }
+
+    std::vector<std::vector<int>> all_neg_indices;
+    std::vector<size_t> batch_starts = {0};
+    for (int n = 0; n < batch_size; ++n) {
+      std::vector<std::pair<T, size_t>> loss_idx;
+      int neg_sel = 0;
+      for (int m = 0; m < prior_num; ++m) {
+        if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
+                             neg_dist_threshold)) {
+          T loss = cls_loss[n * prior_num + m];
+          if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
+            loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
+          }
+          loss_idx.push_back(std::make_pair(loss, m));
+          ++neg_sel;
+        }
+      }
+
+      if (mining_type == MiningType::kMaxNegative) {
+        int num_pos = 0;
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) != -1) ++num_pos;
+        }
+        neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
+      } else if (mining_type == MiningType::kHardExample) {
+        neg_sel = std::min(sample_size, neg_sel);
+      }
+
+      std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<size_t>);
+      std::set<int> sel_indices;
+      std::vector<int> neg_indices;
+      std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
+                     std::inserter(sel_indices, sel_indices.begin()),
+                     [](std::pair<T, size_t>& l) -> int {
+                       return static_cast<int>(l.second);
+                     });
+
+      if (mining_type == MiningType::kHardExample) {
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) > -1) {
+            if (sel_indices.find(m) == sel_indices.end()) {
+              match_indices_et(n, m) = -1;
+            }
+          } else {
+            if (sel_indices.find(m) != sel_indices.end()) {
+              neg_indices.push_back(m);
+            }
+          }
+        }
+      } else {
+        neg_indices.resize(sel_indices.size());
+        std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
+      }
+
+      all_neg_indices.push_back(neg_indices);
+      batch_starts.push_back(batch_starts.back() + neg_indices.size());
+    }
+
+    framework::LoD out_neg_indices_lod;
+    out_neg_indices_lod.emplace_back(batch_starts);
+    int neg_offset = 0;
+    auto neg_data = out_neg_indices->mutable_data<int>(
+        framework::make_ddim({static_cast<int>(batch_starts.back()), 1}),
+        ctx.GetPlace());
+
+    for (auto neg_indices : all_neg_indices) {
+      std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
+      neg_offset += neg_indices.size();
+    }
+    out_neg_indices->set_lod(out_neg_indices_lod);
+    return;
+  }
+};
+
+class MineHardExamplesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
+                   "Input(ClsLoss) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("MatchIndices"),
+        "Input(MatchIndices) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("MatchDist"),
+        "Input(MatchDist) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NegIndices"),
+        "Output(NegIndices) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
+                   "Output(UpdatedMatchIndices) of MineHardExamplesOp should "
+                   "not be null.");
+
+    auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
+    auto idx_dims = ctx->GetInputDim("MatchIndices");
+    auto dis_dims = ctx->GetInputDim("MatchDist");
+
+    PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
+                      "The shape of ClsLoss is [N, Np].");
+    PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
+                      "The shape of MatchIndices is [N, Np].");
+    PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
+                      "The shape of MatchDist is [N, Np].");
+
+    if (ctx->HasInput("LocLoss")) {
+      auto loc_loss_dims = ctx->GetInputDim("LocLoss");
+      PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
+                        "The shape of LocLoss is [N, Np].");
+      PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0],
+                        "Batch size of ClsLoss and LocLoss must be the same.");
+      PADDLE_ENFORCE_EQ(
+          cls_loss_dims[1], loc_loss_dims[1],
+          "Prior box number of ClsLoss and LocLoss must be the same.");
+    }
+
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[0], idx_dims[0],
+        "Batch size of ClsLoss and MatchIndices must be the same.");
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], idx_dims[1],
+        "Prior box number of ClsLoss and MatchIndices must be the same.");
+
+    PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
+                      "Batch size of ClsLoss and MatchDist must be the same.");
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], idx_dims[1],
+        "Prior box number of ClsLoss and MatchDist must be the same.");
+
+    auto mining_type =
+        GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));
+
+    PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
+                      "mining_type must be hard_example or max_negative");
+
+    if (mining_type == MiningType::kMaxNegative) {
+      auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
+      auto neg_dist_threshold = ctx->Attrs().Get<float>("neg_dist_threshold");
+      PADDLE_ENFORCE_GT(
+          neg_pos_ratio, 0.0f,
+          "neg_pos_ratio must greater than zero in max_negative mode");
+      PADDLE_ENFORCE_GT(
+          neg_dist_threshold, 0.0f,
+          "neg_dist_threshold must greater than zero in max_negative mode");
+    } else if (mining_type == MiningType::kHardExample) {
+      auto sample_size = ctx->Attrs().Get<int>("sample_size");
+      PADDLE_ENFORCE_GT(
+          sample_size, 0,
+          "sample_size must greater than zero in hard_example mode");
+    }
+
+    ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
+    // The first dimension of NegIndices will be set correcttly in Compute.
+    ctx->SetOutputDim("NegIndices", {-1, 1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "ClsLoss",
+        "(Tensor, default Tensor<float>), The classification loss with shape "
+        "[N, Np], N is the batch size and Np is the number of prior box.");
+    AddInput("LocLoss",
+             "(Tensor, optional, default Tensor<float>), The localization loss "
+             "with shape [N, Np], N is the batch size and Np is the number of "
+             "prior box.")
+        .AsDispensable();
+    AddInput("MatchIndices",
+             "(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
+             "the batch size and Np is the number of prior box. "
+             "MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
+             "instance does not match any entity, otherwise means it is "
+             "matched to row.");
+    AddInput("MatchDist",
+             "(Tensor, default Tensor<float>) Matched indices with shape [N, "
+             "Np], N is the batch size and Np is the number of prior box.");
+    AddAttr<float>("neg_pos_ratio",
+                   "(float) The ratio of the negative box to the positive "
+                   "box. Use only when mining_type is max_negative.")
+        .SetDefault(1.0);
+    AddAttr<float>("neg_dist_threshold",
+                   "(float) The negative overlap upper bound for the unmatched "
+                   "predictions. Use only when mining_type is max_negative.")
+        .SetDefault(0.5);
+    AddAttr<int>("sample_size",
+                 "(float) The max sample size of negative box. Use only when "
+                 "mining_type is hard_example.")
+        .SetDefault(0);
+    AddAttr<std::string>("mining_type",
+                         "(float) The mining algorithm name, the value is "
+                         "hard_example or max_negative.")
+        .SetDefault("max_negative")
+        .InEnum({"hard_example", "max_negative"});
+
+    AddOutput(
+        "NegIndices",
+        "(LoDTensor<int>) The output of negative example indices. a LoDTensor "
+        "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
+        "and each element is the prior box index. "
+        "For example, the batch size is 2, the lod is [[0, 1, 2]], "
+        "the sample 0's box 1(MatchIndices[0][1]) is selected, "
+        "and sample 1's box 0 is selected. The output NegIndices is "
+        "[[1], [0]].");
+
+    AddOutput("UpdatedMatchIndices",
+              "(Tensor<int>) The output of updated MatchIndices, a tensor with "
+              "shape [N, Np]. Only update when mining_type is "
+              "hard_example. The input MatchIndices elements will be update to "
+              "-1 when it is not in the candidate high loss list of negative "
+              "examples.");
+
+    AddComment(R"DOC(
+Mine hard examples Operator.
+This operator implements hard example mining to select a subset of negative box indices.
+For each image, selects the box with highest losses. subject to the condition that the 
+box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. 
+The selected number is min(sample_size, max_negative_box_number) when mining_type is 
+hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) 
+when mining_type is max_negative, where the max_negative_box_number is the count of 
+MatchIndices elements with value -1.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(mine_hard_examples, ops::MineHardExamplesOp,
+                  ops::MineHardExamplesOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    mine_hard_examples,
+    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..60b93efdce810f8552374449fe5a6fc79b1a92c1
--- /dev/null
+++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -0,0 +1,392 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+constexpr int64_t kOutputDim = 6;
+constexpr int64_t kBBoxSize = 4;
+
+class MultiClassNMSOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("BBoxes"),
+                   "Input(BBoxes) of MultiClassNMS should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scores"),
+                   "Input(Scores) of MultiClassNMS should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MultiClassNMS should not be null.");
+
+    auto box_dims = ctx->GetInputDim("BBoxes");
+    auto score_dims = ctx->GetInputDim("Scores");
+
+    PADDLE_ENFORCE_EQ(box_dims.size(), 3,
+                      "The rank of Input(BBoxes) must be 3.");
+    PADDLE_ENFORCE_EQ(score_dims.size(), 3,
+                      "The rank of Input(Scores) must be 3.");
+    PADDLE_ENFORCE_EQ(box_dims[2], 4,
+                      "The 2nd dimension of Input(BBoxes) must be 4, "
+                      "represents the layout of coordinate "
+                      "[xmin, ymin, xmax, ymax]");
+    PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2],
+                      "The 1st dimensiong of Input(BBoxes) must be equal to "
+                      "3rd dimension of Input(Scores), which represents the "
+                      "predicted bboxes.");
+
+    // Here the box_dims[0] is not the real dimension of output.
+    // It will be rewritten in the computing kernel.
+    ctx->SetOutputDim("Out", {box_dims[1], 6});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("Scores")->type()),
+        platform::CPUPlace());
+  }
+};
+
+template <class T>
+bool SortScorePairDescend(const std::pair<float, T>& pair1,
+                          const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+template <class T>
+static inline void GetMaxScoreIndex(
+    const std::vector<T>& scores, const T threshold, int top_k,
+    std::vector<std::pair<T, int>>* sorted_indices) {
+  for (size_t i = 0; i < scores.size(); ++i) {
+    if (scores[i] > threshold) {
+      sorted_indices->push_back(std::make_pair(scores[i], i));
+    }
+  }
+  // Sort the score pair according to the scores in descending order
+  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                   SortScorePairDescend<int>);
+  // Keep top_k scores if needed.
+  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
+    sorted_indices->resize(top_k);
+  }
+}
+
+template <class T>
+static inline T BBoxArea(const T* box, const bool normalized) {
+  if (box[2] < box[0] || box[3] < box[1]) {
+    // If coordinate values are is invalid
+    // (e.g. xmax < xmin or ymax < ymin), return 0.
+    return static_cast<T>(0.);
+  } else {
+    const T w = box[2] - box[0];
+    const T h = box[3] - box[1];
+    if (normalized) {
+      return w * h;
+    } else {
+      // If coordinate values are not within range [0, 1].
+      return (w + 1) * (h + 1);
+    }
+  }
+}
+
+template <class T>
+static inline T JaccardOverlap(const T* box1, const T* box2,
+                               const bool normalized) {
+  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+      box2[3] < box1[1]) {
+    return static_cast<T>(0.);
+  } else {
+    const T inter_xmin = std::max(box1[0], box2[0]);
+    const T inter_ymin = std::max(box1[1], box2[1]);
+    const T inter_xmax = std::min(box1[2], box2[2]);
+    const T inter_ymax = std::min(box1[3], box2[3]);
+    const T inter_w = inter_xmax - inter_xmin;
+    const T inter_h = inter_ymax - inter_ymin;
+    const T inter_area = inter_w * inter_h;
+    const T bbox1_area = BBoxArea<T>(box1, normalized);
+    const T bbox2_area = BBoxArea<T>(box2, normalized);
+    return inter_area / (bbox1_area + bbox2_area - inter_area);
+  }
+}
+
+template <typename T>
+class MultiClassNMSKernel : public framework::OpKernel<T> {
+ public:
+  void NMSFast(const Tensor& bbox, const Tensor& scores,
+               const T score_threshold, const T nms_threshold, const T eta,
+               const int64_t top_k, std::vector<int>* selected_indices) const {
+    // The total boxes for each instance.
+    int64_t num_boxes = bbox.dims()[0];
+    // 4: [xmin ymin xmax ymax]
+    int64_t box_size = bbox.dims()[1];
+
+    std::vector<T> scores_data(num_boxes);
+    std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
+    std::vector<std::pair<T, int>> sorted_indices;
+    GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
+
+    selected_indices->clear();
+    T adaptive_threshold = nms_threshold;
+    const T* bbox_data = bbox.data<T>();
+
+    while (sorted_indices.size() != 0) {
+      const int idx = sorted_indices.front().second;
+      bool keep = true;
+      for (size_t k = 0; k < selected_indices->size(); ++k) {
+        if (keep) {
+          const int kept_idx = (*selected_indices)[k];
+          T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
+                                        bbox_data + kept_idx * box_size, true);
+          keep = overlap <= adaptive_threshold;
+        } else {
+          break;
+        }
+      }
+      if (keep) {
+        selected_indices->push_back(idx);
+      }
+      sorted_indices.erase(sorted_indices.begin());
+      if (keep && eta < 1 && adaptive_threshold > 0.5) {
+        adaptive_threshold *= eta;
+      }
+    }
+  }
+
+  void MultiClassNMS(const framework::ExecutionContext& ctx,
+                     const Tensor& scores, const Tensor& bboxes,
+                     std::map<int, std::vector<int>>* indices,
+                     int* num_nmsed_out) const {
+    int64_t background_label = ctx.Attr<int>("background_label");
+    int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
+    int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
+    T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
+    T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
+    T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
+
+    int64_t class_num = scores.dims()[0];
+    int64_t predict_dim = scores.dims()[1];
+    int num_det = 0;
+    for (int64_t c = 0; c < class_num; ++c) {
+      if (c == background_label) continue;
+      Tensor score = scores.Slice(c, c + 1);
+      NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
+              &((*indices)[c]));
+      num_det += (*indices)[c].size();
+    }
+
+    *num_nmsed_out = num_det;
+    const T* scores_data = scores.data<T>();
+    if (keep_top_k > -1 && num_det > keep_top_k) {
+      std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+      for (const auto& it : *indices) {
+        int label = it.first;
+        const T* sdata = scores_data + label * predict_dim;
+        const std::vector<int>& label_indices = it.second;
+        for (size_t j = 0; j < label_indices.size(); ++j) {
+          int idx = label_indices[j];
+          PADDLE_ENFORCE_LT(idx, predict_dim);
+          score_index_pairs.push_back(
+              std::make_pair(sdata[idx], std::make_pair(label, idx)));
+        }
+      }
+      // Keep top k results per image.
+      std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                       SortScorePairDescend<std::pair<int, int>>);
+      score_index_pairs.resize(keep_top_k);
+
+      // Store the new indices.
+      std::map<int, std::vector<int>> new_indices;
+      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
+        int label = score_index_pairs[j].second.first;
+        int idx = score_index_pairs[j].second.second;
+        new_indices[label].push_back(idx);
+      }
+      new_indices.swap(*indices);
+      *num_nmsed_out = keep_top_k;
+    }
+  }
+
+  void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
+                        const std::map<int, std::vector<int>>& selected_indices,
+                        Tensor* outs) const {
+    int predict_dim = scores.dims()[1];
+    auto* scores_data = scores.data<T>();
+    auto* bboxes_data = bboxes.data<T>();
+    auto* odata = outs->data<T>();
+
+    int count = 0;
+    for (const auto& it : selected_indices) {
+      int label = it.first;
+      const T* sdata = scores_data + label * predict_dim;
+      const std::vector<int>& indices = it.second;
+      for (size_t j = 0; j < indices.size(); ++j) {
+        int idx = indices[j];
+        const T* bdata = bboxes_data + idx * kBBoxSize;
+        odata[count * kOutputDim] = label;           // label
+        odata[count * kOutputDim + 1] = sdata[idx];  // score
+        // xmin, ymin, xmax, ymax
+        std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
+        count++;
+      }
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* boxes = ctx.Input<Tensor>("BBoxes");
+    auto* scores = ctx.Input<Tensor>("Scores");
+    auto* outs = ctx.Output<LoDTensor>("Out");
+
+    auto score_dims = scores->dims();
+
+    int64_t batch_size = score_dims[0];
+    int64_t class_num = score_dims[1];
+    int64_t predict_dim = score_dims[2];
+    int64_t box_dim = boxes->dims()[2];
+
+    std::vector<std::map<int, std::vector<int>>> all_indices;
+    std::vector<size_t> batch_starts = {0};
+    for (int64_t i = 0; i < batch_size; ++i) {
+      Tensor ins_score = scores->Slice(i, i + 1);
+      ins_score.Resize({class_num, predict_dim});
+
+      Tensor ins_boxes = boxes->Slice(i, i + 1);
+      ins_boxes.Resize({predict_dim, box_dim});
+
+      std::map<int, std::vector<int>> indices;
+      int num_nmsed_out = 0;
+      MultiClassNMS(ctx, ins_score, ins_boxes, &indices, &num_nmsed_out);
+      all_indices.push_back(indices);
+      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+    }
+
+    int num_kept = batch_starts.back();
+    if (num_kept == 0) {
+      T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
+      od[0] = -1;
+    } else {
+      outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
+      for (int64_t i = 0; i < batch_size; ++i) {
+        Tensor ins_score = scores->Slice(i, i + 1);
+        ins_score.Resize({class_num, predict_dim});
+
+        Tensor ins_boxes = boxes->Slice(i, i + 1);
+        ins_boxes.Resize({predict_dim, box_dim});
+
+        int64_t s = batch_starts[i];
+        int64_t e = batch_starts[i + 1];
+        if (e > s) {
+          Tensor out = outs->Slice(s, e);
+          MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out);
+        }
+      }
+    }
+
+    framework::LoD lod;
+    lod.emplace_back(batch_starts);
+
+    outs->set_lod(lod);
+  }
+};
+
+class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("BBoxes",
+             "(Tensor) A 3-D Tensor with shape [N, M, 4] represents the "
+             "predicted locations of M bounding bboxes, N is the batch size. "
+             "Each bounding box has four coordinate values and the layout is "
+             "[xmin, ymin, xmax, ymax].");
+    AddInput("Scores",
+             "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
+             "predicted confidence predictions. N is the batch size, C is the "
+             "class number, M is number of bounding boxes. For each category "
+             "there are total M scores which corresponding M bounding boxes. "
+             " Please note, M is equal to the 1st dimension of BBoxes. ");
+    AddAttr<int>(
+        "background_label",
+        "(int, defalut: 0) "
+        "The index of background label, the background label will be ignored. "
+        "If set to -1, then all categories will be considered.")
+        .SetDefault(0);
+    AddAttr<float>("score_threshold",
+                   "(float) "
+                   "Threshold to filter out bounding boxes with low "
+                   "confidence score. If not provided, consider all boxes.");
+    AddAttr<int>("nms_top_k",
+                 "(int64_t) "
+                 "Maximum number of detections to be kept according to the "
+                 "confidences aftern the filtering detections based on "
+                 "score_threshold");
+    AddAttr<float>("nms_threshold",
+                   "(float, defalut: 0.3) "
+                   "The threshold to be used in NMS.")
+        .SetDefault(0.3);
+    AddAttr<float>("nms_eta",
+                   "(float) "
+                   "The parameter for adaptive NMS.")
+        .SetDefault(1.0);
+    AddAttr<int>("keep_top_k",
+                 "(int64_t) "
+                 "Number of total bboxes to be kept per image after NMS "
+                 "step. -1 means keeping all bboxes after NMS step.");
+    AddOutput("Out",
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
+              "detections. Each row has 6 values: "
+              "[label, confidence, xmin, ymin, xmax, ymax], No is the total "
+              "number of detections in this mini-batch. For each instance, "
+              "the offsets in first dimension are called LoD, the number of "
+              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
+              "no detected bbox.");
+    AddComment(R"DOC(
+This operator is to do multi-class non maximum suppression (NMS) on a batched
+of boxes and scores.
+
+In the NMS step, this operator greedily selects a subset of detection bounding
+boxes that have high scores larger than score_threshold, if providing this
+threshold, then selects the largest nms_top_k confidences scores if nms_top_k
+is larger than -1. Then this operator pruns away boxes that have high IOU
+(intersection over union) overlap with already selected boxes by adaptive
+threshold NMS based on parameters of nms_threshold and nms_eta.
+
+Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
+per image if keep_top_k is larger than -1.
+
+This operator support multi-class and batched inputs. It applying NMS
+independently for each class. The outputs is a 2-D LoDTenosr, for each
+image, the offsets in first dimension of LoDTensor are called LoD, the number
+of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
+means there is no detected bbox for this image. If there is no detected boxes
+for all images, all the elements in LoD are 0, and the Out only contains one
+value which is -1.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp,
+                  ops::MultiClassNMSOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel<float>,
+                       ops::MultiClassNMSKernel<double>);
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..568d50d457d838d5f11605710c0d3b987af01d10
--- /dev/null
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class PolygonBoxTransformCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* in = ctx.Input<Tensor>("Input");
+    auto in_dims = in->dims();
+    const T* in_data = in->data<T>();
+    auto* out = ctx.Output<Tensor>("Output");
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int batch_size = in_dims[0];
+    int geo_channel = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int id = 0;
+    for (int id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
+      for (int id_h = 0; id_h < height; ++id_h) {
+        for (int id_w = 0; id_w < width; ++id_w) {
+          id = id_n * height * width + width * id_h + id_w;
+          if (id_n % 2 == 0) {
+            out_data[id] = id_w - in_data[id];
+          } else {
+            out_data[id] = id_h - in_data[id];
+          }
+        }
+      }
+    }
+  }
+};
+
+class PolygonBoxTransformOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("Input"),
+        "Input (Input) of polygon_box transform op should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Output"),
+        "Output (Output) of polygon_box transform op should not be null.");
+
+    auto in_dim = ctx->GetInputDim("Input");
+
+    PADDLE_ENFORCE_EQ(in_dim.size(), 4, "input's rank must be 4.");
+    PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0,
+                      "input's second dimension must be even.");
+
+    ctx->SetOutputDim("Output", in_dim);
+  }
+};
+
+class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "Input",
+        "The input with shape [batch_size, geometry_channels, height, width]");
+    AddOutput("Output", "The output with the same shape as input");
+
+    AddComment(R"DOC(
+PolygonBoxTransform Operator.
+
+PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate.
+
+The input is the final geometry output in detection network.
+We use 2*n numbers to denote the coordinate shift from n corner vertices of
+the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi),
+the geometry output contains 2*n channels.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(polygon_box_transform, ops::PolygonBoxTransformOp,
+                  ops::PolygonBoxTransformOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    polygon_box_transform,
+    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, float>,
+    ops::PolygonBoxTransformCPUKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6187ac6622c65d2bbc525c3fe2cb397cf74ac612
--- /dev/null
+++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu
@@ -0,0 +1,76 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using platform::PADDLE_CUDA_NUM_THREADS;
+#define CUDA_BLOCK_SIZE 16
+
+template <typename T>
+__global__ void PolygonBoxTransformKernel(const int n, const int h, const int w,
+                                          const T* input, T* output) {
+  int id_n = threadIdx.x + blockDim.x * blockIdx.x;
+  int id_h = threadIdx.y + blockDim.y * blockIdx.y;
+  int id_w = threadIdx.z + blockDim.z * blockIdx.z;
+  if (id_n < n && id_h < h && id_w < w) {
+    int id = id_n * h * w + w * id_h + id_w;
+    if (id_n % 2 == 0) {
+      output[id] = id_w - input[id];
+    } else {
+      output[id] = id_h - input[id];
+    }
+  }
+}
+
+template <typename T>
+class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* in = ctx.Input<Tensor>("Input");
+    auto in_dims = in->dims();
+    const T* in_data = in->data<T>();
+    auto* out = ctx.Output<Tensor>("Output");
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int batch_size = in_dims[0];
+    int geo_channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    dim3 threadsPerBlock(
+        PADDLE_CUDA_NUM_THREADS / (CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE),
+        CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE);
+    dim3 numBlocks((batch_size * geo_channels) / threadsPerBlock.x,
+                   (height + threadsPerBlock.y - 1) / threadsPerBlock.y,
+                   (width + threadsPerBlock.z - 1) / threadsPerBlock.z);
+    auto stream = ctx.cuda_device_context().stream();
+    PolygonBoxTransformKernel<T><<<numBlocks, threadsPerBlock, 0, stream>>>(
+        batch_size * geo_channels, height, width, in_data, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(
+    polygon_box_transform,
+    paddle::operators::PolygonBoxTransformOpCUDAKernel<float>,
+    paddle::operators::PolygonBoxTransformOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e35c38e4e03d4d0f00601812fdc4803519b89ae
--- /dev/null
+++ b/paddle/fluid/operators/detection/prior_box_op.cc
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PriorBoxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of PriorBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Image"),
+                   "Input(Image) of PriorBoxOp should not be null.");
+
+    auto image_dims = ctx->GetInputDim("Image");
+    auto input_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW.");
+    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
+
+    PADDLE_ENFORCE_LT(input_dims[2], image_dims[2],
+                      "The height of input must smaller than image.");
+
+    PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
+                      "The width of input must smaller than image.");
+
+    auto min_sizes = ctx->Attrs().Get<std::vector<float>>("min_sizes");
+    auto max_sizes = ctx->Attrs().Get<std::vector<float>>("max_sizes");
+    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
+    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
+    bool flip = ctx->Attrs().Get<bool>("flip");
+
+    std::vector<float> aspect_ratios_vec;
+    ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
+
+    size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(),
+                        "The number of min_size and max_size must be equal.");
+      num_priors += max_sizes.size();
+      for (size_t i = 0; i < max_sizes.size(); ++i) {
+        PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i],
+                          "max_size[%d] must be greater than min_size[%d].", i,
+                          i);
+      }
+    }
+
+    std::vector<int64_t> dim_vec(4);
+    dim_vec[0] = input_dims[2];
+    dim_vec[1] = input_dims[3];
+    dim_vec[2] = num_priors;
+    dim_vec[3] = 4;
+    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
+    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input",
+             "(Tensor, default Tensor<float>), "
+             "the input feature data of PriorBoxOp, The layout is NCHW.");
+    AddInput("Image",
+             "(Tensor, default Tensor<float>), "
+             "the input image data of PriorBoxOp, The layout is NCHW.");
+    AddOutput("Boxes",
+              "(Tensor, default Tensor<float>), the output prior boxes of "
+              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+    AddOutput("Variances",
+              "(Tensor, default Tensor<float>), the expanded variances of "
+              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+
+    AddAttr<std::vector<float>>("min_sizes",
+                                "(vector<float>) List of min sizes "
+                                "of generated prior boxes.")
+        .AddCustomChecker([](const std::vector<float>& min_sizes) {
+          PADDLE_ENFORCE_GT(min_sizes.size(), 0,
+                            "Size of min_sizes must be at least 1.");
+          for (size_t i = 0; i < min_sizes.size(); ++i) {
+            PADDLE_ENFORCE_GT(min_sizes[i], 0.0,
+                              "min_sizes[%d] must be positive.", i);
+          }
+        });
+    AddAttr<std::vector<float>>(
+        "max_sizes",
+        "(vector<float>) List of max sizes of generated prior boxes.")
+        .SetDefault(std::vector<float>{});
+    AddAttr<std::vector<float>>(
+        "aspect_ratios",
+        "(vector<float>) List of aspect ratios of generated prior boxes.");
+
+    AddAttr<std::vector<float>>(
+        "variances",
+        "(vector<float>) List of variances to be encoded in prior boxes.")
+        .AddCustomChecker([](const std::vector<float>& variances) {
+          PADDLE_ENFORCE_EQ(variances.size(), 4,
+                            "Must and only provide 4 variance.");
+          for (size_t i = 0; i < variances.size(); ++i) {
+            PADDLE_ENFORCE_GT(variances[i], 0.0,
+                              "variance[%d] must be greater than 0.", i);
+          }
+        });
+    AddAttr<bool>("flip", "(bool) Whether to flip aspect ratios.")
+        .SetDefault(true);
+    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
+        .SetDefault(true);
+
+    AddAttr<float>("step_w",
+                   "Prior boxes step across width, 0.0 for auto calculation.")
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_w) {
+          PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0.");
+        });
+    AddAttr<float>("step_h",
+                   "Prior boxes step across height, 0.0 for auto calculation.")
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_h) {
+          PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0.");
+        });
+
+    AddAttr<float>("offset",
+                   "(float) "
+                   "Prior boxes center offset.")
+        .SetDefault(0.5);
+    AddComment(R"DOC(
+Prior box operator
+Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
+Each position of the input produce N prior boxes, N is determined by
+ the count of min_sizes, max_sizes and aspect_ratios, The size of the
+ box is in range(min_size, max_size) interval, which is generated in
+ sequence according to the aspect_ratios.
+
+Please get more information from the following papers:
+https://arxiv.org/abs/1512.02325.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(prior_box, ops::PriorBoxOpKernel<float>,
+                       ops::PriorBoxOpKernel<double>);
diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f67e6ca91c0852b5a3be35d23246884d1157caa4
--- /dev/null
+++ b/paddle/fluid/operators/detection/prior_box_op.cu
@@ -0,0 +1,167 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__device__ inline T clip(T in) {
+  return min(max(in, 0.), 1.);
+}
+
+template <typename T>
+__global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
+                            const int width, const int im_height,
+                            const int im_width, const int as_num,
+                            const T offset, const T step_width,
+                            const T step_height, const T* min_sizes,
+                            const T* max_sizes, const int min_num,
+                            bool is_clip) {
+  int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
+  int box_num = height * width * num_priors;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
+       i += blockDim.x * gridDim.x) {
+    int h = i / (num_priors * width);
+    int w = (i / num_priors) % width;
+    int p = i % num_priors;
+    int m = max_sizes ? p / (as_num + 1) : p / as_num;
+    T cx = (w + offset) * step_width;
+    T cy = (h + offset) * step_height;
+    T bw, bh;
+    T min_size = min_sizes[m];
+    if (max_sizes) {
+      int s = p % (as_num + 1);
+      if (s < as_num) {
+        T ar = aspect_ratios[s];
+        bw = min_size * sqrt(ar) / 2.;
+        bh = min_size / sqrt(ar) / 2.;
+      } else {
+        T max_size = max_sizes[m];
+        bw = sqrt(min_size * max_size) / 2.;
+        bh = bw;
+      }
+    } else {
+      int s = p % as_num;
+      T ar = aspect_ratios[s];
+      bw = min_size * sqrt(ar) / 2.;
+      bh = min_size / sqrt(ar) / 2.;
+    }
+    T xmin = (cx - bw) / im_width;
+    T ymin = (cy - bh) / im_height;
+    T xmax = (cx + bw) / im_width;
+    T ymax = (cy + bh) / im_height;
+    out[i * 4] = is_clip ? clip<T>(xmin) : xmin;
+    out[i * 4 + 1] = is_clip ? clip<T>(ymin) : ymin;
+    out[i * 4 + 2] = is_clip ? clip<T>(xmax) : xmax;
+    out[i * 4 + 3] = is_clip ? clip<T>(ymax) : ymax;
+  }
+}
+
+template <typename T>
+__global__ void SetVariance(T* out, const T* var, const int vnum,
+                            const int num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    out[i] = var[i % vnum];
+  }
+}
+
+template <typename T>
+class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
+    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
+    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
+    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+    auto flip = ctx.Attr<bool>("flip");
+    auto clip = ctx.Attr<bool>("clip");
+
+    std::vector<float> aspect_ratios;
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
+
+    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
+    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto im_width = image->dims()[3];
+    auto im_height = image->dims()[2];
+
+    auto width = input->dims()[3];
+    auto height = input->dims()[2];
+
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast<T>(im_width) / width;
+      step_height = static_cast<T>(im_height) / height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = aspect_ratios.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      num_priors += max_sizes.size();
+    }
+    int min_num = static_cast<int>(min_sizes.size());
+    int box_num = width * height * num_priors;
+
+    int block = 512;
+    int grid = (box_num + block - 1) / block;
+
+    auto stream =
+        ctx.template device_context<platform::CUDADeviceContext>().stream();
+
+    boxes->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    framework::Tensor r;
+    framework::TensorFromVector(aspect_ratios, ctx.device_context(), &r);
+
+    framework::Tensor min;
+    framework::TensorFromVector(min_sizes, ctx.device_context(), &min);
+
+    T* max_data = nullptr;
+    framework::Tensor max;
+    if (max_sizes.size() > 0) {
+      framework::TensorFromVector(max_sizes, ctx.device_context(), &max);
+      max_data = max.data<T>();
+    }
+
+    GenPriorBox<T><<<grid, block, 0, stream>>>(
+        boxes->data<T>(), r.data<T>(), height, width, im_height, im_width,
+        aspect_ratios.size(), offset, step_width, step_height, min.data<T>(),
+        max_data, min_num, clip);
+
+    framework::Tensor v;
+    framework::TensorFromVector(variances, ctx.device_context(), &v);
+    grid = (box_num * 4 + block - 1) / block;
+    SetVariance<T><<<grid, block, 0, stream>>>(vars->data<T>(), v.data<T>(),
+                                               variances.size(), box_num * 4);
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(prior_box, ops::PriorBoxOpCUDAKernel<float>,
+                        ops::PriorBoxOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c62fd8d2c4d4e4deba4ca6442efbaff83e36c35
--- /dev/null
+++ b/paddle/fluid/operators/detection/prior_box_op.h
@@ -0,0 +1,165 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
+                               bool flip,
+                               std::vector<float>* output_aspect_ratior) {
+  constexpr float epsilon = 1e-6;
+  output_aspect_ratior->clear();
+  output_aspect_ratior->push_back(1.0f);
+  for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
+    float ar = input_aspect_ratior[i];
+    bool already_exist = false;
+    for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
+      if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
+        already_exist = true;
+        break;
+      }
+    }
+    if (!already_exist) {
+      output_aspect_ratior->push_back(ar);
+      if (flip) {
+        output_aspect_ratior->push_back(1.0f / ar);
+      }
+    }
+  }
+}
+
+template <typename T>
+struct ClipFunctor {
+  HOSTDEVICE inline T operator()(T in) const {
+    return std::min<T>(std::max<T>(in, 0.), 1.);
+  }
+};
+
+template <typename T>
+class PriorBoxOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
+    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
+    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
+    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+    auto flip = ctx.Attr<bool>("flip");
+    auto clip = ctx.Attr<bool>("clip");
+
+    std::vector<float> aspect_ratios;
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
+
+    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
+    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto img_width = image->dims()[3];
+    auto img_height = image->dims()[2];
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast<T>(img_width) / feature_width;
+      step_height = static_cast<T>(img_height) / feature_height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = aspect_ratios.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      num_priors += max_sizes.size();
+    }
+
+    boxes->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
+    for (int h = 0; h < feature_height; ++h) {
+      for (int w = 0; w < feature_width; ++w) {
+        T center_x = (w + offset) * step_width;
+        T center_y = (h + offset) * step_height;
+        T box_width, box_height;
+        int idx = 0;
+        for (size_t s = 0; s < min_sizes.size(); ++s) {
+          auto min_size = min_sizes[s];
+          // priors with different aspect ratios
+          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+            float ar = aspect_ratios[r];
+            box_width = min_size * sqrt(ar) / 2.;
+            box_height = min_size / sqrt(ar) / 2.;
+            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+            idx++;
+          }
+          if (max_sizes.size() > 0) {
+            auto max_size = max_sizes[s];
+            // square prior with size sqrt(minSize * maxSize)
+            box_width = box_height = sqrt(min_size * max_size) / 2.;
+            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
+            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
+            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
+            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
+            idx++;
+          }
+        }
+      }
+    }
+
+    if (clip) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      ClipFunctor<T> clip_func;
+      trans(ctx.template device_context<platform::CPUDeviceContext>(),
+            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
+            boxes->data<T>(), clip_func);
+    }
+
+    framework::Tensor var_t;
+    var_t.mutable_data<T>(
+        framework::make_ddim({1, static_cast<int>(variances.size())}),
+        ctx.GetPlace());
+    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
+    for (size_t i = 0; i < variances.size(); ++i) {
+      var_et(0, i) = variances[i];
+    }
+
+    int box_num = feature_height * feature_width * num_priors;
+    auto var_dim = vars->dims();
+    vars->Resize({box_num, static_cast<int>(variances.size())});
+
+    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
+    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
+
+    vars->Resize(var_dim);
+  }
+};  // namespace operators
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..367001939251114a9cf442fd85c734958ccb2da8
--- /dev/null
+++ b/paddle/fluid/operators/detection/target_assign_op.cc
@@ -0,0 +1,160 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/target_assign_op.h"
+
+namespace paddle {
+namespace operators {
+
+class TargetAssignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of TargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("MatchIndices"),
+                   "Input(MatchIndices) of TargetAssignOp should not be null");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of TargetAssignOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutWeight"),
+                   "Output(OutWeight) of TargetAssignOp should not be null.");
+
+    auto in_dims = ctx->GetInputDim("X");
+    auto mi_dims = ctx->GetInputDim("MatchIndices");
+
+    PADDLE_ENFORCE_EQ(in_dims.size(), 3, "The rank of Input(X) must be 3.");
+    PADDLE_ENFORCE_EQ(mi_dims.size(), 2,
+                      "The rank of Input(MatchIndices) must be 2.");
+
+    if (ctx->HasInput("NegIndices")) {
+      auto neg_dims = ctx->GetInputDim("NegIndices");
+      PADDLE_ENFORCE_EQ(neg_dims.size(), 2,
+                        "The rank of Input(NegIndices) must be 2.");
+      PADDLE_ENFORCE_EQ(neg_dims[1], 1,
+                        "The last dimenstion of Out(NegIndices) must be 1.");
+    }
+
+    auto n = mi_dims[0];
+    auto m = mi_dims[1];
+    auto k = in_dims[in_dims.size() - 1];
+    ctx->SetOutputDim("Out", {n, m, k});
+    ctx->SetOutputDim("OutWeight", {n, m, 1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(LoDTensor), This input is a 3D LoDTensor with shape [M, P, K]. "
+             "Some elements in X will be assigned to Out based on the "
+             "MatchIndices and NegIndices.");
+    AddInput("MatchIndices",
+             "(Tensor, default Tensor<int>), The input matched indices "
+             "with shape [N, P], If MatchIndices[i][j] is -1, the j-th entity "
+             "of column is not matched to any entity of row in i-th instance.");
+    AddInput("NegIndices",
+             "(LoDTensor, default LoDTensor<int>), The input negative example "
+             "indices are an optional input with shape [Neg, 1], where Neg is "
+             "the total number of negative example indices.")
+        .AsDispensable();
+    AddAttr<int>("mismatch_value",
+                 "(int, default 0), Fill this value to the "
+                 "mismatched location.")
+        .SetDefault(0);
+    AddOutput("Out",
+              "(Tensor), The output is a 3D Tensor with shape [N, P, K], "
+              "N and P is the same as they are in NegIndices, K is the "
+              "same as it in input of X. If MatchIndices[i][j] "
+              "is -1, the Out[i][j][0 : K] is the mismatch_value.");
+    AddOutput("OutWeight",
+              "(Tensor), The weight for output with the shape of [N, P, 1]");
+    AddComment(R"DOC(
+This operator can be, for given the target bounding boxes or labels,
+to assign classification and regression targets to each prediction as well as
+weights to prediction. The weights is used to specify which prediction would
+not contribute to training loss.
+
+For each instance, the output `Out` and`OutWeight` are assigned based on
+`MatchIndices` and `NegIndices`.
+Assumed that the row offset for each instance in `X` is called lod,
+this operator assigns classification/regression targets by performing the
+following steps:
+
+1. Assigning all outpts based on `MatchIndices`:
+
+If id = MatchIndices[i][j] > 0,
+
+    Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
+    OutWeight[i][j] = 1.
+
+Otherwise, 
+
+    Out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
+    OutWeight[i][j] = 0.
+
+2. Assigning OutWeight based on `NegIndices` if `NegIndices` is provided:
+
+Assumed that the row offset for each instance in `NegIndices` is called neg_lod,
+for i-th instance and each `id` of NegIndices in this instance:
+
+    Out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
+    OutWeight[i][id] = 1.0
+
+    )DOC");
+  }
+};
+
+template <typename T, typename WT>
+struct NegTargetAssignFunctor<platform::CPUDeviceContext, T, WT> {
+  void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices,
+                  const size_t* lod, const int N, const int M, const int K,
+                  const int mismatch_value, T* out, WT* out_wt) {
+    for (int i = 0; i < N; ++i) {
+      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
+        int id = neg_indices[j];
+        int off = (i * M + id) * K;
+        for (int k = 0; k < K; ++k) {
+          out[off + k] = mismatch_value;
+          out_wt[off + k] = static_cast<WT>(1.0);
+        }
+      }
+    }
+  }
+};
+
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, int, float>;
+template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float,
+                                       float>;
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(target_assign, ops::TargetAssignOp, ops::TargetAssignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    target_assign,
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>,
+    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float, float>);
diff --git a/paddle/fluid/operators/detection/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ddf6889942355457fb281b6c33430ab8337db3ed
--- /dev/null
+++ b/paddle/fluid/operators/detection/target_assign_op.cu
@@ -0,0 +1,63 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/target_assign_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, typename WT>
+__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
+                                      const int N, const int M, const int K,
+                                      const int mismatch_value, T* out,
+                                      WT* out_wt) {
+  int bidx = blockIdx.x;
+  int st = lod[bidx];
+  int ed = lod[bidx + 1];
+
+  int row_start = bidx * M;
+  for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
+    int id = row_start + neg_indices[i];
+    for (int k = 0; k < K; ++k) {
+      out[id * K + k] = T(mismatch_value);
+      out_wt[id * K + k] = WT(1.);
+    }
+  }
+}
+
+template <typename T, typename WT>
+struct NegTargetAssignFunctor<platform::CUDADeviceContext, T, WT> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const int* neg_indices, const size_t* lod, const int N,
+                  const int M, const int K, const int mismatch_value, T* out,
+                  WT* out_wt) {
+    const int block_size = 256;
+    const int grid_size = N;
+    NegTargetAssignKernel<T, WT><<<grid_size, block_size, 0, ctx.stream()>>>(
+        neg_indices, lod, N, M, K, mismatch_value, out, out_wt);
+  }
+};
+
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, int, float>;
+template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float,
+                                       float>;
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    target_assign,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, int, float>,
+    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float, float>);
diff --git a/paddle/fluid/operators/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h
similarity index 100%
rename from paddle/fluid/operators/target_assign_op.h
rename to paddle/fluid/operators/detection/target_assign_op.h
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
index 73c84c2fe0155d21d7059938330e44fa3668c6df..d7f49a9590e4ef4ca4d2ad5a92572c70e6bfb6ac 100644
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection_map_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -50,7 +51,8 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(label_dims.size(), 2,
                       "The rank of Input(Label) must be 2, "
                       "the shape is [N, 6].");
-    PADDLE_ENFORCE_EQ(label_dims[1], 6, "The shape is of Input(Label) [N, 6].");
+    PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5,
+                   "The shape of Input(Label) is [N, 6] or [N, 5].");
 
     if (ctx->HasInput("PosCount")) {
       PADDLE_ENFORCE(ctx->HasInput("TruePos"),
@@ -77,8 +79,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
 
 class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DetectionMAPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("DetectRes",
              "(LoDTensor) A 2-D LoDTensor with shape [M, 6] represents the "
              "detections. Each row has 6 values: "
@@ -88,9 +89,10 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
              "no detected data.");
     AddInput("Label",
-             "(LoDTensor) A 2-D LoDTensor with shape[N, 6] represents the"
+             "(LoDTensor) A 2-D LoDTensor represents the"
              "Labeled ground-truth data. Each row has 6 values: "
-             "[label, is_difficult, xmin, ymin, xmax, ymax], N is the total "
+             "[label, xmin, ymin, xmax, ymax, is_difficult] or 5 values: "
+             "[label, xmin, ymin, xmax, ymax], where N is the total "
              "number of ground-truth data in this mini-batch. For each "
              "instance, the offsets in first dimension are called LoD, "
              "the number of offset is N + 1, if LoD[i + 1] - LoD[i] == 0, "
@@ -173,12 +175,12 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Detection mAP evaluate operator.
 The general steps are as follows. First, calculate the true positive and
- false positive according to the input of detection and labels, then
- calculate the mAP evaluate value.
- Supporting '11 point' and 'integral' mAP algorithm. Please get more information
- from the following articles:
- https://sanchom.wordpress.com/tag/average-precision/
- https://arxiv.org/abs/1512.02325
+false positive according to the input of detection and labels, then
+calculate the mAP evaluate value.
+Supporting '11 point' and 'integral' mAP algorithm. Please get more information
+from the following articles:
+https://sanchom.wordpress.com/tag/average-precision/
+https://arxiv.org/abs/1512.02325
 
 )DOC");
   }
@@ -188,8 +190,8 @@ The general steps are as follows. First, calculate the true positive and
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(detection_map, ops::DetectionMAPOp,
-                             ops::DetectionMAPOpMaker);
+REGISTER_OPERATOR(detection_map, ops::DetectionMAPOp, ops::DetectionMAPOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(
     detection_map, ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, float>,
     ops::DetectionMAPOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
index 8c15bfa36bfe72586cfcbdbd8efc4542253adaca..dd1ab85fd8d0c8170afcd9dd2a49ee55c41dc8be 100644
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -67,7 +72,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto* out_false_pos = ctx.Output<framework::LoDTensor>("AccumFalsePos");
 
     float overlap_threshold = ctx.Attr<float>("overlap_threshold");
-    float evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
+    bool evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
     auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
     int class_num = ctx.Attr<int>("class_num");
 
@@ -82,7 +87,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     std::vector<std::map<int, std::vector<Box>>> gt_boxes;
     std::vector<std::map<int, std::vector<std::pair<T, Box>>>> detect_boxes;
 
-    GetBoxes(*in_label, *in_detect, gt_boxes, detect_boxes);
+    GetBoxes(*in_label, *in_detect, &gt_boxes, detect_boxes);
 
     std::map<int, int> label_pos_count;
     std::map<int, std::vector<std::pair<T, int>>> true_pos;
@@ -95,20 +100,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     }
 
     if (in_pos_count != nullptr && state) {
-      GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, label_pos_count,
-                  true_pos, false_pos, class_num);
+      GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, &label_pos_count,
+                  &true_pos, &false_pos, class_num);
     }
 
     CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult,
-                             overlap_threshold, label_pos_count, true_pos,
-                             false_pos);
+                             overlap_threshold, &label_pos_count, &true_pos,
+                             &false_pos);
 
     int background_label = ctx.Attr<int>("background_label");
     T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos,
                     background_label);
 
-    GetOutputPos(ctx, label_pos_count, true_pos, false_pos, *out_pos_count,
-                 *out_true_pos, *out_false_pos, class_num);
+    GetOutputPos(ctx, label_pos_count, true_pos, false_pos, out_pos_count,
+                 out_true_pos, out_false_pos, class_num);
 
     T* map_data = out_map->mutable_data<T>(ctx.GetPlace());
     map_data[0] = map;
@@ -155,7 +160,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
 
   void GetBoxes(const framework::LoDTensor& input_label,
                 const framework::LoDTensor& input_detect,
-                std::vector<std::map<int, std::vector<Box>>>& gt_boxes,
+                std::vector<std::map<int, std::vector<Box>>>* gt_boxes,
                 std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
                     detect_boxes) const {
     auto labels = framework::EigenTensor<T, 2>::From(input_label);
@@ -170,16 +175,22 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     for (int n = 0; n < batch_size; ++n) {
       std::map<int, std::vector<Box>> boxes;
       for (size_t i = label_index[n]; i < label_index[n + 1]; ++i) {
-        Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
         int label = labels(i, 0);
-        auto is_difficult = labels(i, 1);
-        if (std::abs(is_difficult - 0.0) < 1e-6)
-          box.is_difficult = false;
-        else
-          box.is_difficult = true;
-        boxes[label].push_back(box);
+        if (input_label.dims()[1] == 6) {
+          Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5));
+          auto is_difficult = labels(i, 1);
+          if (std::abs(is_difficult - 0.0) < 1e-6)
+            box.is_difficult = false;
+          else
+            box.is_difficult = true;
+          boxes[label].push_back(box);
+        } else {
+          PADDLE_ENFORCE_EQ(input_label.dims()[1], 5);
+          Box box(labels(i, 1), labels(i, 2), labels(i, 3), labels(i, 4));
+          boxes[label].push_back(box);
+        }
       }
-      gt_boxes.push_back(boxes);
+      gt_boxes->push_back(boxes);
     }
 
     auto detect_index = detect_lod[0];
@@ -200,9 +211,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       const std::map<int, int>& label_pos_count,
       const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
       const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
-      framework::Tensor& output_pos_count,
-      framework::LoDTensor& output_true_pos,
-      framework::LoDTensor& output_false_pos, const int class_num) const {
+      framework::Tensor* output_pos_count,
+      framework::LoDTensor* output_true_pos,
+      framework::LoDTensor* output_false_pos, const int class_num) const {
     int true_pos_count = 0;
     int false_pos_count = 0;
     for (auto it = true_pos.begin(); it != true_pos.end(); ++it) {
@@ -214,12 +225,12 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       false_pos_count += fp.size();
     }
 
-    int* pos_count_data = output_pos_count.mutable_data<int>(
+    int* pos_count_data = output_pos_count->mutable_data<int>(
         framework::make_ddim({class_num, 1}), ctx.GetPlace());
 
-    T* true_pos_data = output_true_pos.mutable_data<T>(
+    T* true_pos_data = output_true_pos->mutable_data<T>(
         framework::make_ddim({true_pos_count, 2}), ctx.GetPlace());
-    T* false_pos_data = output_false_pos.mutable_data<T>(
+    T* false_pos_data = output_false_pos->mutable_data<T>(
         framework::make_ddim({false_pos_count, 2}), ctx.GetPlace());
     true_pos_count = 0;
     false_pos_count = 0;
@@ -261,21 +272,21 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     framework::LoD false_pos_lod;
     false_pos_lod.emplace_back(false_pos_starts);
 
-    output_true_pos.set_lod(true_pos_lod);
-    output_false_pos.set_lod(false_pos_lod);
+    output_true_pos->set_lod(true_pos_lod);
+    output_false_pos->set_lod(false_pos_lod);
     return;
   }
 
   void GetInputPos(const framework::Tensor& input_pos_count,
                    const framework::LoDTensor& input_true_pos,
                    const framework::LoDTensor& input_false_pos,
-                   std::map<int, int>& label_pos_count,
-                   std::map<int, std::vector<std::pair<T, int>>>& true_pos,
-                   std::map<int, std::vector<std::pair<T, int>>>& false_pos,
+                   std::map<int, int>* label_pos_count,
+                   std::map<int, std::vector<std::pair<T, int>>>* true_pos,
+                   std::map<int, std::vector<std::pair<T, int>>>* false_pos,
                    const int class_num) const {
     const int* pos_count_data = input_pos_count.data<int>();
     for (int i = 0; i < class_num; ++i) {
-      label_pos_count[i] = pos_count_data[i];
+      (*label_pos_count)[i] = pos_count_data[i];
     }
 
     auto SetData = [](const framework::LoDTensor& pos_tensor,
@@ -291,8 +302,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       }
     };
 
-    SetData(input_true_pos, true_pos);
-    SetData(input_false_pos, false_pos);
+    SetData(input_true_pos, *true_pos);
+    SetData(input_false_pos, *false_pos);
     return;
   }
 
@@ -301,9 +312,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       const std::vector<std::map<int, std::vector<std::pair<T, Box>>>>&
           detect_boxes,
       bool evaluate_difficult, float overlap_threshold,
-      std::map<int, int>& label_pos_count,
-      std::map<int, std::vector<std::pair<T, int>>>& true_pos,
-      std::map<int, std::vector<std::pair<T, int>>>& false_pos) const {
+      std::map<int, int>* label_pos_count,
+      std::map<int, std::vector<std::pair<T, int>>>* true_pos,
+      std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
     int batch_size = gt_boxes.size();
     for (int n = 0; n < batch_size; ++n) {
       auto image_gt_boxes = gt_boxes[n];
@@ -320,10 +331,10 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
           continue;
         }
         int label = it->first;
-        if (label_pos_count.find(label) == label_pos_count.end()) {
-          label_pos_count[label] = count;
+        if (label_pos_count->find(label) == label_pos_count->end()) {
+          (*label_pos_count)[label] = count;
         } else {
-          label_pos_count[label] += count;
+          (*label_pos_count)[label] += count;
         }
       }
     }
@@ -338,8 +349,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
           int label = it->first;
           for (size_t i = 0; i < pred_boxes.size(); ++i) {
             auto score = pred_boxes[i].first;
-            true_pos[label].push_back(std::make_pair(score, 0));
-            false_pos[label].push_back(std::make_pair(score, 1));
+            (*true_pos)[label].push_back(std::make_pair(score, 0));
+            (*false_pos)[label].push_back(std::make_pair(score, 1));
           }
         }
         continue;
@@ -351,8 +362,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
         if (image_gt_boxes.find(label) == image_gt_boxes.end()) {
           for (size_t i = 0; i < pred_boxes.size(); ++i) {
             auto score = pred_boxes[i].first;
-            true_pos[label].push_back(std::make_pair(score, 0));
-            false_pos[label].push_back(std::make_pair(score, 1));
+            (*true_pos)[label].push_back(std::make_pair(score, 0));
+            (*false_pos)[label].push_back(std::make_pair(score, 1));
           }
           continue;
         }
@@ -381,17 +392,17 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
                 (!evaluate_difficult && !matched_bboxes[max_idx].is_difficult);
             if (match_evaluate_difficult) {
               if (!visited[max_idx]) {
-                true_pos[label].push_back(std::make_pair(score, 1));
-                false_pos[label].push_back(std::make_pair(score, 0));
+                (*true_pos)[label].push_back(std::make_pair(score, 1));
+                (*false_pos)[label].push_back(std::make_pair(score, 0));
                 visited[max_idx] = true;
               } else {
-                true_pos[label].push_back(std::make_pair(score, 0));
-                false_pos[label].push_back(std::make_pair(score, 1));
+                (*true_pos)[label].push_back(std::make_pair(score, 0));
+                (*false_pos)[label].push_back(std::make_pair(score, 1));
               }
             }
           } else {
-            true_pos[label].push_back(std::make_pair(score, 0));
-            false_pos[label].push_back(std::make_pair(score, 1));
+            (*true_pos)[label].push_back(std::make_pair(score, 0));
+            (*false_pos)[label].push_back(std::make_pair(score, 1));
           }
         }
       }
diff --git a/paddle/fluid/operators/detection_output_op.cc b/paddle/fluid/operators/detection_output_op.cc
deleted file mode 100644
index f7520475917ff23535f11ccfde0ee915112bba30..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection_output_op.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection_output_op.h"
-namespace paddle {
-namespace operators {
-
-class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Loc",
-             "(Tensor) The input tensor of detection_output operator."
-             "The input predict locations"
-             "The format of input tensor is kNCHW. Where K is priorbox point "
-             "numbers,"
-             "N is How many boxes are there on each point, "
-             "C is 4, H and W both are 1.");
-    AddInput("Conf",
-             "(Tensor) The input tensor of detection_output operator."
-             "The input priorbox confidence."
-             "The format of input tensor is kNCHW. Where K is priorbox point "
-             "numbers,"
-             "N is How many boxes are there on each point, "
-             "C is the number of classes, H and W both are 1.");
-    AddInput("PriorBox",
-             "(Tensor) The input tensor of detection_output operator."
-             "The format of input tensor is the position and variance "
-             "of the boxes");
-    AddOutput("Out",
-              "(Tensor) The output tensor of detection_output operator.");
-    AddAttr<int>("background_label_id", "(int), The background class index.");
-    AddAttr<int>("num_classes", "(int), The number of the classification.");
-    AddAttr<float>("nms_threshold",
-                   "(float), The Non-maximum suppression threshold.");
-    AddAttr<float>("confidence_threshold",
-                   "(float), The classification confidence threshold.");
-    AddAttr<int>("top_k", "(int), The bbox number kept of the layer’s output.");
-    AddAttr<int>("nms_top_k",
-                 "(int), The bbox number kept of the NMS’s output.");
-    AddComment(R"DOC(
-          detection output for SSD(single shot multibox detector)
-          Apply the NMS to the output of network and compute the predict
-          bounding box location. The output’s shape of this layer could
-          be zero if there is no valid bounding box.
-        )DOC");
-  }
-};
-
-class DetectionOutputOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Loc"),
-                   "Input(X) of DetectionOutputOp"
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Conf"),
-                   "Input(X) of DetectionOutputOp"
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
-                   "Input(X) of DetectionOutputOp"
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of DetectionOutputOp should not be null.");
-    std::vector<int64_t> output_shape({1, 7});
-    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp,
-                             ops::DetectionOutputOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    detection_output,
-    ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/detection_output_op.cu.cc b/paddle/fluid/operators/detection_output_op.cu.cc
deleted file mode 100644
index 0f48765c9c67c1d3fa32b19d5e87b2acaa3c486a..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection_output_op.cu.cc
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-Indicesou may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/detection_output_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    detection_output,
-    ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/detection_output_op.h b/paddle/fluid/operators/detection_output_op.h
deleted file mode 100644
index af9081c93436776b6ca6ee7139e340054111e440..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/detection_output_op.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   Indicesou may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/detection_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-inline void transpose_fun(const framework::ExecutionContext& context,
-                          const framework::Tensor& src,
-                          framework::Tensor* dst) {
-  int input_nums = src.dims()[0];
-  int offset = 0;
-  for (int j = 0; j < input_nums; ++j) {
-    framework::Tensor in_p_tensor = src.Slice(j, j + 1);
-    std::vector<int64_t> shape_vec(
-        {in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3],
-         in_p_tensor.dims()[4], in_p_tensor.dims()[2]});
-    framework::DDim shape(framework::make_ddim(shape_vec));
-    framework::Tensor in_p_tensor_transpose;
-    in_p_tensor_transpose.mutable_data<T>(shape, context.GetPlace());
-    std::vector<int> shape_axis({0, 1, 3, 4, 2});
-    math::Transpose<DeviceContext, T, 5> trans5;
-    trans5(context.template device_context<DeviceContext>(), in_p_tensor,
-           &in_p_tensor_transpose, shape_axis);
-    auto dst_stride = framework::stride(dst->dims());
-    auto src_stride = framework::stride(in_p_tensor_transpose.dims());
-    StridedMemcpy<T>(context.device_context(), in_p_tensor_transpose.data<T>(),
-                     src_stride, in_p_tensor_transpose.dims(), dst_stride,
-                     dst->data<T>() + offset);
-    offset += in_p_tensor_transpose.dims()[4] * src_stride[4];
-  }
-}
-template <typename DeviceContext, typename T>
-class DetectionOutputKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_loc = context.Input<framework::Tensor>("Loc");
-    const framework::Tensor* in_conf = context.Input<framework::Tensor>("Conf");
-    const framework::Tensor* in_priorbox =
-        context.Input<framework::Tensor>("PriorBox");
-    auto* out = context.Output<framework::Tensor>("Out");
-    int num_classes = context.template Attr<int>("num_classes");
-    int top_k = context.template Attr<int>("top_k");
-    int nms_top_k = context.template Attr<int>("nms_top_k");
-    int background_label_id = context.template Attr<int>("background_label_id");
-    float nms_threshold = context.template Attr<float>("nms_threshold");
-    float confidence_threshold =
-        context.template Attr<float>("confidence_threshold");
-    size_t batch_size = in_conf->dims()[1];
-    int conf_sum_size = in_conf->numel();
-    // for softmax
-    std::vector<int64_t> conf_shape_softmax_vec(
-        {conf_sum_size / num_classes, num_classes});
-    framework::DDim conf_shape_softmax(
-        framework::make_ddim(conf_shape_softmax_vec));
-    // for knchw => nhwc
-    std::vector<int64_t> loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3],
-                                        in_loc->dims()[4],
-                                        in_loc->dims()[2] * in_loc->dims()[0]});
-    std::vector<int64_t> conf_shape_vec(
-        {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4],
-         in_conf->dims()[2] * in_conf->dims()[0]});
-    framework::DDim loc_shape(framework::make_ddim(loc_shape_vec));
-    framework::DDim conf_shape(framework::make_ddim(conf_shape_vec));
-    framework::Tensor loc_tensor;
-    framework::Tensor conf_tensor;
-    loc_tensor.mutable_data<T>(loc_shape, context.GetPlace());
-    conf_tensor.mutable_data<T>(conf_shape, context.GetPlace());
-    // for cpu
-    framework::Tensor loc_cpu;
-    framework::Tensor conf_cpu;
-    framework::Tensor priorbox_cpu;
-    const T* priorbox_data = in_priorbox->data<T>();
-    transpose_fun<DeviceContext, T>(context, *in_loc, &loc_tensor);
-    transpose_fun<DeviceContext, T>(context, *in_conf, &conf_tensor);
-    conf_tensor.Resize(conf_shape_softmax);
-    math::SoftmaxFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), &conf_tensor,
-        &conf_tensor);
-    T* loc_data = loc_tensor.data<T>();
-    T* conf_data = conf_tensor.data<T>();
-    if (platform::is_gpu_place(context.GetPlace())) {
-      loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace());
-      framework::TensorCopy(loc_tensor, platform::CPUPlace(),
-                            context.device_context(), &loc_cpu);
-      loc_data = loc_cpu.data<T>();
-      conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace());
-      framework::TensorCopy(conf_tensor, platform::CPUPlace(),
-                            context.device_context(), &conf_cpu);
-      conf_data = conf_cpu.data<T>();
-      priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
-      framework::TensorCopy(*in_priorbox, platform::CPUPlace(),
-                            context.device_context(), &priorbox_cpu);
-      priorbox_data = priorbox_cpu.data<T>();
-    }
-    // get decode bboxes
-    size_t num_priors = in_priorbox->numel() / 8;
-    std::vector<std::vector<operators::math::BBox<T>>> all_decoded_bboxes;
-    for (size_t n = 0; n < batch_size; ++n) {
-      std::vector<operators::math::BBox<T>> decoded_bboxes;
-      for (size_t i = 0; i < num_priors; ++i) {
-        size_t prior_offset = i * 8;
-        size_t loc_pred_offset = n * num_priors * 4 + i * 4;
-        std::vector<math::BBox<T>> prior_bbox_vec;
-        math::GetBBoxFromPriorData<T>(priorbox_data + prior_offset, 1,
-                                      prior_bbox_vec);
-        std::vector<std::vector<T>> prior_bbox_var;
-        math::GetBBoxVarFromPriorData<T>(priorbox_data + prior_offset, 1,
-                                         prior_bbox_var);
-        std::vector<T> loc_pred_data;
-        for (size_t j = 0; j < 4; ++j)
-          loc_pred_data.push_back(*(loc_data + loc_pred_offset + j));
-        math::BBox<T> bbox = math::DecodeBBoxWithVar<T>(
-            prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data);
-        decoded_bboxes.push_back(bbox);
-      }
-      all_decoded_bboxes.push_back(decoded_bboxes);
-    }
-    std::vector<std::map<size_t, std::vector<size_t>>> all_indices;
-    int num_kept = math::GetDetectionIndices<T>(
-        conf_data, num_priors, num_classes, background_label_id, batch_size,
-        confidence_threshold, nms_top_k, nms_threshold, top_k,
-        all_decoded_bboxes, &all_indices);
-
-    if (num_kept <= 0) {
-      std::vector<int64_t> out_shape_vec({0, 0});
-      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
-      out->Resize(out_shape);
-      return;
-    }
-    std::vector<int64_t> out_shape_vec({num_kept, 7});
-    framework::DDim out_shape(framework::make_ddim(out_shape_vec));
-    out->mutable_data<T>(out_shape, context.GetPlace());
-    framework::Tensor out_cpu;
-    T* out_data = out->data<T>();
-    if (platform::is_gpu_place(context.GetPlace())) {
-      out_cpu.mutable_data<T>(out->dims(), platform::CPUPlace());
-      out_data = out_cpu.data<T>();
-    }
-    math::GetDetectionOutput<T>(conf_data, num_kept, num_priors, num_classes,
-                                batch_size, all_indices, all_decoded_bboxes,
-                                out_data);
-    if (platform::is_gpu_place(context.GetPlace())) {
-      framework::TensorCopy(out_cpu, platform::CUDAPlace(),
-                            context.device_context(), out);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..312f80e09077f21a47985c1c936c2ac41c292ead
--- /dev/null
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -0,0 +1,33 @@
+if(WITH_GRPC)
+  grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
+      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
+      selected_rows memory)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
+          cares zlib protobuf sendrecvop_grpc SERIAL)
+  cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
+          grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+          proto_desc lookup_table_op SERIAL)
+  return()
+endif()
+
+
+set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc  rpc_client.cc request_handler_impl.cc
+  PROTO send_recv.proto
+  DEPS lod_tensor selected_rows memory)
+
+find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so)
+ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC})
+
+
+find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so)
+ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC})
+
+cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc 
+       brpc protobuf leveldb gflags glog
+       protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL)
diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b394c678fb6503eb73a1e11e6feb814251e9e940
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_client.cc
@@ -0,0 +1,180 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/distributed/brpc_client.h"
+#include "paddle/fluid/framework/threadpool.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+DEFINE_int32(brpc_channel_num, 24,
+             "Number of channels to send requests connected to one server");
+DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds");
+DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");
+
+BRPCClient::~BRPCClient() { Wait(); }
+
+void HandleSendResponse(brpc::Controller* cntl,
+                        sendrecv::VoidMessage* response) {
+  // std::unique_ptr makes sure cntl/response will be deleted before returning.
+  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
+  std::unique_ptr<sendrecv::VoidMessage> response_guard(response);
+
+  if (cntl->Failed()) {
+    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
+    return;
+  }
+  LOG(INFO) << "Received response from " << cntl->remote_side()
+            << " latency=" << cntl->latency_us() << "us";
+}
+
+bool BRPCClient::AsyncSendVar(const std::string& ep,
+                              const platform::DeviceContext& ctx,
+                              const framework::Scope& scope,
+                              const std::string& var_name, int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch_ptr = GetChannel(ep_val);
+
+  framework::AsyncIO(
+      [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] {
+        auto ch_ctx = ch_ptr->Pop();
+        brpc::Controller* cntl = new brpc::Controller();
+        sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
+        cntl->set_timeout_ms(time_out);
+
+        google::protobuf::Closure* done =
+            brpc::NewCallback(&HandleSendResponse, cntl, response);
+
+        sendrecv::VariableMessage request;
+        ch_ctx->stub->SendVariable(cntl, &request, response, done);
+      });
+  req_count_++;
+
+  return true;
+}
+
+void HandleGetResponse(brpc::Controller* cntl,
+                       sendrecv::VariableMessage* response) {
+  // std::unique_ptr makes sure cntl/response will be deleted before returning.
+  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
+  std::unique_ptr<sendrecv::VariableMessage> response_guard(response);
+
+  if (cntl->Failed()) {
+    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
+    return;
+  }
+  LOG(INFO) << "Received response from " << cntl->remote_side()
+            << " latency=" << cntl->latency_us() << "us";
+
+  // framework::Variable* outvar = nullptr;
+  // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
+}
+
+bool BRPCClient::AsyncGetVar(const std::string& ep,
+                             const platform::DeviceContext& ctx,
+                             const framework::Scope& scope,
+                             const std::string& var_name, int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::AsyncIO(
+      [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {});
+
+  req_count_++;
+
+  return true;
+}
+
+bool BRPCClient::AsyncPrefetchVar(const std::string& ep,
+                                  const platform::DeviceContext& ctx,
+                                  const framework::Scope& scope,
+                                  const std::string& in_var_name,
+                                  const std::string& out_var_name,
+                                  int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string in_var_name_val = in_var_name;
+  const std::string out_var_name_val = out_var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
+                      time_out, ch, this] {});
+
+  req_count_++;
+  return true;
+}
+
+void BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+                                       int64_t time_out) {
+  req_count_++;
+}
+
+void BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+                                       int64_t time_out) {
+  req_count_++;
+}
+
+void BRPCClient::Wait() {
+  std::unique_lock<std::mutex> lk(sync_mutex_);
+  sync_cond_.wait(lk, [this] { return req_count_ == 0; });
+}
+
+ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
+  {
+    std::lock_guard<std::mutex> guard(chan_mutex_);
+    auto it = channels_.find(ep);
+    if (it != channels_.end()) {
+      return it->second;
+    }
+  }
+
+  ChannelQueuePtr q(new framework::BlockingQueue<ChannelContextPtr>());
+
+  brpc::ChannelOptions options;
+  options.protocol = "baidu_std";
+  options.connection_type = "pooled";
+  options.connect_timeout_ms = 100;
+  options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/;
+  options.max_retry = FLAGS_max_retry;
+  for (int i = 0; i < FLAGS_brpc_channel_num; ++i) {
+    std::shared_ptr<ChannelContext> c(new ChannelContext());
+    if (c->channel.Init(ep.c_str(), &options) != 0) {
+      LOG(ERROR) << "Fail to initialize channel";
+      return nullptr;
+    }
+
+    c->stub.reset(new sendrecv::SendRecvService_Stub(
+        static_cast<google::protobuf::RpcChannel*>(&c->channel)));
+    q->Push(c);
+  }
+
+  {
+    std::lock_guard<std::mutex> guard(chan_mutex_);
+    channels_[ep] = q;
+  }
+
+  return q;
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ff1f0a6076b3574c42065edcbac50eb75b3b483
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_client.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <time.h>
+
+#include <chrono>  // NOLINT
+#include <ctime>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <mutex>  // NOLINT
+#include <string>
+#include <vector>
+
+#include "brpc/channel.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+struct ChannelContext {
+  brpc::Channel channel;
+  std::shared_ptr<sendrecv::SendRecvService_Stub> stub;
+};
+
+typedef std::shared_ptr<ChannelContext> ChannelContextPtr;
+typedef std::shared_ptr<framework::BlockingQueue<ChannelContextPtr>>
+    ChannelQueuePtr;
+
+class BRPCClient : public RPCClient {
+ public:
+  BRPCClient() {}
+  virtual ~BRPCClient();
+
+  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
+                    const framework::Scope& scope, const std::string& var_name,
+                    int64_t time_out = FLAGS_rpc_deadline) override;
+
+  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
+                   const framework::Scope& scope, const std::string& var_name,
+                   int64_t time_out = FLAGS_rpc_deadline) override;
+
+  bool AsyncPrefetchVar(const std::string& ep,
+                        const platform::DeviceContext& ctx,
+                        const framework::Scope& scope,
+                        const std::string& in_var_name,
+                        const std::string& out_var_name,
+                        int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void AsyncSendBatchBarrier(const std::string& ep,
+                             int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void AsyncSendFetchBarrier(const std::string& ep,
+                             int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void Wait() override;
+
+ private:
+  void Proceed();
+  ChannelQueuePtr GetChannel(const std::string& ep);
+
+ private:
+  std::unordered_map<std::string, ChannelQueuePtr> channels_;
+
+  // mutex for Wait client sync
+  std::mutex sync_mutex_;
+  std::condition_variable sync_cond_;
+  std::atomic<int64_t> req_count_{0};
+
+  // mutex for GetChannel thread safety
+  std::mutex chan_mutex_;
+  DISABLE_COPY_AND_ASSIGN(BRPCClient);
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..862167f02084cfe81db1c0936bbfb0415fa85721
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_server.cc
@@ -0,0 +1,144 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/distributed/brpc_server.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+
+namespace sendrecv {
+
+typedef std::unordered_map<std::string,
+                           paddle::operators::distributed::RequestHandler*>
+    HandlerMap;
+
+class BRPCServiceImpl : public SendRecvService {
+ public:
+  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map)
+      : request_send_h_(nullptr),
+        request_get_h_(nullptr),
+        request_prefetch_h_(nullptr) {
+    auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
+    if (it != rpc_call_map.end()) {
+      request_send_h_ = it->second;
+    }
+
+    it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
+    if (it != rpc_call_map.end()) {
+      request_get_h_ = it->second;
+    }
+
+    it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
+    if (it != rpc_call_map.end()) {
+      request_prefetch_h_ = it->second;
+    }
+  }
+
+  virtual ~BRPCServiceImpl() {}
+
+  void SendVariable(google::protobuf::RpcController* cntl_butil,
+                    const VariableMessage* request, VoidMessage* response,
+                    google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(request_send_h_ != nullptr,
+                   "RequestSend handler should be registed first!");
+    brpc::ClosureGuard done_guard(done);
+
+    paddle::framework::Scope* local_scope = request_send_h_->scope();
+    paddle::framework::Variable* outvar = nullptr;
+    paddle::framework::Variable* invar = nullptr;
+
+    std::string varname = request->varname();
+
+    if (!request_send_h_->sync_mode()) {
+      local_scope = &request_send_h_->scope()->NewScope();
+      invar = local_scope->Var(varname);
+    } else {
+      invar = local_scope->FindVar(varname);
+    }
+
+    request_send_h_->Handle(varname, local_scope, invar, &outvar);
+
+    if (!request_send_h_->sync_mode()) {
+      request_send_h_->scope()->DeleteScope(local_scope);
+    }
+  }
+
+  void GetVariable(google::protobuf::RpcController* cntl_butil,
+                   const VariableMessage* request, VariableMessage* response,
+                   google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(request_get_h_ != nullptr,
+                   "RequestGet handler should be registed first!");
+  }
+
+  void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
+                        const VariableMessage* request,
+                        VariableMessage* response,
+                        google::protobuf::Closure* done) override {
+    PADDLE_ENFORCE(request_prefetch_h_ != nullptr,
+                   "kRequestPrefetch handler should be registed first!");
+  }
+
+ private:
+  paddle::operators::distributed::RequestHandler* request_send_h_;
+  paddle::operators::distributed::RequestHandler* request_get_h_;
+  paddle::operators::distributed::RequestHandler* request_prefetch_h_;
+};
+}  // namespace sendrecv
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+void AsyncBRPCServer::StartServer() {
+  // Instance of your service.
+  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_);
+
+  // Add the service into server. Notice the second parameter, because the
+  // service is put on stack, we don't want server to delete it, otherwise
+  // use brpc::SERVER_OWNS_SERVICE.
+  if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) {
+    LOG(FATAL) << "Fail to add service";
+    return;
+  }
+
+  brpc::ServerOptions options;
+  options.idle_timeout_sec = idle_timeout_s_;
+  options.max_concurrency = max_concurrency_;
+  if (server_.Start(bind_address_.c_str(), &options) != 0) {
+    LOG(FATAL) << "Fail to start EchoServer" << bind_address_;
+    return;
+  }
+
+  butil::EndPoint ep = server_.listen_address();
+  selected_port_ = ep.port;
+
+  {
+    std::lock_guard<std::mutex> lock(this->mutex_ready_);
+    ready_ = 1;
+  }
+  condition_ready_.notify_all();
+
+  server_.Join();
+}
+
+void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); }
+
+void AsyncBRPCServer::WaitServerReady() {
+  VLOG(3) << "AsyncGRPCServer is wait server ready";
+  std::unique_lock<std::mutex> lock(this->mutex_ready_);
+  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
+  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
+}
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc_server.h b/paddle/fluid/operators/distributed/brpc_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..85a7ad0dfe843dad483d43631b69a79d75211ce9
--- /dev/null
+++ b/paddle/fluid/operators/distributed/brpc_server.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>  // NOLINT
+#include <mutex>               // NOLINT
+#include <string>
+
+#include "brpc/server.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class AsyncBRPCServer final : public RPCServer {
+ public:
+  explicit AsyncBRPCServer(const std::string& address, int client_num)
+      : RPCServer(address, client_num), ready_(0) {}
+
+  virtual ~AsyncBRPCServer() {}
+  void StartServer() override;
+  void WaitServerReady() override;
+
+ private:
+  void ShutDownImpl() override;
+
+  brpc::Server server_;
+
+  static constexpr int idle_timeout_s_ = -1;
+  static constexpr int max_concurrency_ = 0;
+
+  std::mutex mutex_ready_;
+  std::condition_variable condition_ready_;
+  int ready_;
+};
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.cc b/paddle/fluid/operators/distributed/bytebuffer_stream.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6e91b447db838c9095432eda22e9e1171e938d31
--- /dev/null
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.cc
@@ -0,0 +1,88 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// NOTE: This file was originally created by tensorflow
+//       (https://github.com/tensorflow/tensorflow/) we borrow this
+//       file and did some modifications so that we can send gRPC
+//       requests without too much copying of the tensor data.
+
+#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+GrpcByteBufferSource::GrpcByteBufferSource() {}
+
+bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
+  cur_ = -1;
+  left_ = 0;
+  ptr_ = nullptr;
+  byte_count_ = 0;
+  bool ok = src.Dump(&slices_).ok();
+  if (!ok) {
+    slices_.clear();
+  }
+  return ok;
+}
+
+bool GrpcByteBufferSource::Next(const void** data, int* size) {
+  // Use loop instead of if in case buffer contained empty slices.
+  while (left_ == 0) {
+    // Advance to next slice.
+    cur_++;
+    if (cur_ >= slices_.size()) {
+      return false;
+    }
+    const ::grpc::Slice& s = slices_[cur_];
+    left_ = s.size();
+    ptr_ = reinterpret_cast<const char*>(s.begin());
+  }
+
+  *data = ptr_;
+  *size = left_;
+  byte_count_ += left_;
+  ptr_ += left_;
+  left_ = 0;
+  return true;
+}
+
+void GrpcByteBufferSource::BackUp(int count) {
+  ptr_ -= count;
+  left_ += count;
+  byte_count_ -= count;
+}
+
+bool GrpcByteBufferSource::Skip(int count) {
+  const void* data;
+  int size;
+  while (Next(&data, &size)) {
+    if (size >= count) {
+      BackUp(size - count);
+      return true;
+    }
+    // size < count;
+    count -= size;
+  }
+  // error or we have too large count;
+  return false;
+}
+
+google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
+  return byte_count_;
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/bytebuffer_stream.h b/paddle/fluid/operators/distributed/bytebuffer_stream.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7de172c79c30761483b5d96f5bad19860208832
--- /dev/null
+++ b/paddle/fluid/operators/distributed/bytebuffer_stream.h
@@ -0,0 +1,188 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// NOTE: This file was originally created by tensorflow
+//       (https://github.com/tensorflow/tensorflow/) we borrow this
+//       file and did some modifications so that we can send gRPC
+//       requests without too much copying of the tensor data.
+
+#pragma once
+
+#include <vector>
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "grpc++/grpc++.h"
+
+namespace grpc {
+// A ZeroCopyInputStream that reads from grpc_byte_buffer
+class GrpcBufferReader final
+    : public ::google::protobuf::io::ZeroCopyInputStream {
+  typedef void (CoreCodegenInterface::*OldReaderInitAPI)(
+      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
+  typedef int (CoreCodegenInterface::*NewReaderInitAPI)(
+      grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer);
+  void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
+                  grpc_byte_buffer* buffer) {
+    (g_core_codegen_interface->*ptr)(reader, buffer);
+  }
+  void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader,
+                  grpc_byte_buffer* buffer) {
+    int result = (g_core_codegen_interface->*ptr)(reader, buffer);
+    (void)result;
+  }
+
+ public:
+  explicit GrpcBufferReader(grpc_byte_buffer* buffer)
+      : byte_count_(0), backup_count_(0) {
+    ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_,
+               buffer);
+  }
+  ~GrpcBufferReader() override {
+    g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_);
+  }
+
+  bool Next(const void** data, int* size) override {
+    if (backup_count_ > 0) {
+      *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
+              backup_count_;
+      GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
+      *size = static_cast<int>(backup_count_);
+      backup_count_ = 0;
+      return true;
+    }
+    if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_,
+                                                                &slice_)) {
+      return false;
+    }
+    g_core_codegen_interface->grpc_slice_unref(slice_);
+    *data = GRPC_SLICE_START_PTR(slice_);
+    // On win x64, int is only 32bit
+    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
+    byte_count_ += * size = static_cast<int>(GRPC_SLICE_LENGTH(slice_));
+    return true;
+  }
+
+  void BackUp(int count) override { backup_count_ = count; }
+
+  bool Skip(int count) override {
+    const void* data;
+    int size;
+    while (Next(&data, &size)) {
+      if (size >= count) {
+        BackUp(size - count);
+        return true;
+      }
+      // size < count;
+      count -= size;
+    }
+    // error or we have too large count;
+    return false;
+  }
+
+  ::google::protobuf::int64 ByteCount() const override {
+    return byte_count_ - backup_count_;
+  }
+
+ private:
+  int64_t byte_count_;
+  int64_t backup_count_;
+  grpc_byte_buffer_reader reader_;
+  grpc_slice slice_;
+};
+
+};  // namespace grpc
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+// Source provides a way for a particular RPC implementation to provide
+// received data to ParseFrom.
+class Source {
+ public:
+  virtual ~Source() {}
+
+  // Return the stream that contains the data to be parsed.
+  // Note that this method might be invoked more than once if
+  // ParseFrom needs to fall back to a more expensive parsing method.
+  // Every call must return a stream pointing at the beginning of
+  // the serialized RecvTensorResponse.
+  //
+  // Note that a subsequent call to contents() invalidates previous
+  // results of contents().
+  //
+  // Ownership of the returned stream is retained by the Source and
+  // should not be deleted by the caller.
+  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
+};
+
+// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
+class GrpcByteBufferSource
+    : public ::google::protobuf::io::ZeroCopyInputStream {
+ public:
+  GrpcByteBufferSource();
+  bool Init(const ::grpc::ByteBuffer& src);  // Can be called multiple times.
+  bool Next(const void** data, int* size) override;
+  void BackUp(int count) override;
+  bool Skip(int count) override;
+  ::google::protobuf::int64 ByteCount() const override;
+
+ private:
+  std::vector<::grpc::Slice> slices_;
+  size_t cur_;       // Current slice index.
+  int left_;         // Number of bytes in slices_[cur_] left to yield.
+  const char* ptr_;  // Address of next byte in slices_[cur_] to yield.
+  ::google::protobuf::int64 byte_count_;
+};
+
+class GrpcByteBufferSourceWrapper : public Source {
+ public:
+  explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source)
+      : source_(source) {}
+  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
+    return source_;
+  }
+
+ private:
+  GrpcByteBufferSource* source_;
+};
+
+class GrpcByteSource : public Source {
+ public:
+  explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {}
+  ~GrpcByteSource() override { DeleteStream(); }
+
+  typedef ::grpc::GrpcBufferReader Reader;
+
+  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
+    DeleteStream();
+    stream_ = new (&space_) Reader(buffer_);
+    return stream_;
+  }
+
+ private:
+  void DeleteStream() {
+    if (stream_) {
+      stream_->~Reader();
+    }
+  }
+
+  grpc_byte_buffer* buffer_;  // Not owned
+  Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
+  char space_[sizeof(Reader)];
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a09f3870d64d8e14b2db41ff3ea7c2f9e67b558
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -0,0 +1,333 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/distributed/grpc_client.h"
+
+#include <sys/time.h>
+
+#include <limits>
+
+#include "glog/logging.h"  // For VLOG
+#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+void GRPCClient::InitImpl() { InitEventLoop(); }
+
+void GRPCClient::InitEventLoop() {
+  // start the client process thread
+  // TODO(wuyi): can make this in a threadpool
+  client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
+}
+
+void GRPCClient::SendBeginPass() {
+  for (auto& it : channels_) {
+    VLOG(3) << "send begin pass to: " << it.first;
+    this->AsyncSendBeginPass(it.first);
+  }
+  this->Wait();
+}
+
+void GRPCClient::SendEndPass() {
+  for (auto& it : channels_) {
+    VLOG(3) << "send end pass to " << it.first;
+    this->AsyncSendEndPass(it.first);
+  }
+  this->Wait();
+}
+
+GRPCClient::~GRPCClient() {
+  Wait();
+  cq_.Shutdown();
+  {
+    std::lock_guard<std::mutex> guard(chan_mutex_);
+    for (auto& it : channels_) {
+      it.second.reset();
+    }
+  }
+  client_thread_->join();
+}
+
+bool GRPCClient::AsyncSendVar(const std::string& ep,
+                              const platform::DeviceContext& ctx,
+                              const framework::Scope& scope,
+                              const std::string& var_name, int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::AsyncIO([var_name_val, p_ctx, ep_val, p_scope, time_out, ch,
+                      this] {
+    auto* var = p_scope->FindVar(var_name_val);
+
+    ::grpc::ByteBuffer req;
+    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req);
+
+    // varhandle
+    VarHandle var_h;
+    var_h.ep = ep_val;
+    var_h.scope = p_scope;
+    var_h.name = var_name_val;
+    var_h.ctx = p_ctx;
+    var_h.method = "Send";
+
+    VLOG(3) << var_h.String() << " begin";
+
+    // stub context
+    SendProcessor* s = new SendProcessor(ch);
+    s->Prepare(var_h, time_out);
+    s->response_call_back_ = nullptr;
+
+    auto call = s->stub_g_.PrepareUnaryCall(
+        s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
+    call->StartCall();
+    call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  });
+  req_count_++;
+
+  return true;
+}
+
+void ProcGetResponse(const VarHandle& var_h,
+                     const ::grpc::ByteBuffer& ret_msg) {
+  framework::Variable* outvar = nullptr;
+  DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
+}
+
+template <typename T>
+void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) {
+  ::grpc::Slice slice(proto.ByteSizeLong());
+  proto.SerializeWithCachedSizesToArray(const_cast<uint8_t*>(slice.begin()));
+  ::grpc::ByteBuffer tmp(&slice, 1);
+  result->Swap(&tmp);
+}
+
+bool GRPCClient::AsyncGetVar(const std::string& ep,
+                             const platform::DeviceContext& ctx,
+                             const framework::Scope& scope,
+                             const std::string& var_name, int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::AsyncIO([var_name_val, ep_val, p_scope, p_ctx, time_out, ch,
+                      this] {
+    // prepare input
+    sendrecv::VariableMessage req;
+    req.set_varname(var_name_val);
+    ::grpc::ByteBuffer buf;
+    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
+
+    // var handle
+    VarHandle var_h;
+    var_h.ep = ep_val;
+    var_h.scope = p_scope;
+    var_h.name = var_name_val;
+    var_h.ctx = p_ctx;
+    var_h.method = "Get";
+
+    VLOG(3) << var_h.String() << " begin";
+
+    // stub context
+    GetProcessor* s = new GetProcessor(ch);
+    s->Prepare(var_h, time_out);
+    s->response_call_back_ = ProcGetResponse;
+
+    auto call = s->stub_g_.PrepareUnaryCall(
+        s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
+    call->StartCall();
+    call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  });
+
+  req_count_++;
+
+  return true;
+}
+
+bool GRPCClient::AsyncPrefetchVar(const std::string& ep,
+                                  const platform::DeviceContext& ctx,
+                                  const framework::Scope& scope,
+                                  const std::string& in_var_name,
+                                  const std::string& out_var_name,
+                                  int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string in_var_name_val = in_var_name;
+  const std::string out_var_name_val = out_var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
+                      time_out, ch, this] {
+    auto* var = p_scope->FindVar(in_var_name_val);
+
+    ::grpc::ByteBuffer req;
+    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
+
+    // var handle
+    VarHandle var_h;
+    var_h.ep = ep_val;
+    var_h.scope = p_scope;
+    var_h.name = out_var_name_val;
+    var_h.ctx = p_ctx;
+    var_h.method = "Prefetch";
+
+    VLOG(3) << var_h.String() << " begin";
+
+    // stub context
+    GetProcessor* s = new GetProcessor(ch);
+    s->Prepare(var_h, time_out);
+    s->response_call_back_ = ProcGetResponse;
+
+    auto call = s->stub_g_.PrepareUnaryCall(
+        s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
+        &cq_);
+    call->StartCall();
+    call->Finish(&s->reply_, &s->status_, static_cast<void*>(s));
+  });
+
+  req_count_++;
+  return true;
+}
+
+void GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
+                                       int64_t time_out) {
+  const auto ch = GetChannel(ep);
+
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  s->Prepare(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(BATCH_BARRIER_MESSAGE);
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
+
+void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
+                                       int64_t time_out) {
+  const auto ch = GetChannel(ep);
+  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
+  s->Prepare(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(FETCH_BARRIER_MESSAGE);
+  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
+
+void GRPCClient::AsyncSendBeginPass(const std::string& ep, int64_t time_out) {
+  const auto ch = GetChannel(ep);
+
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  s->Prepare(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(BEGIN_PASS_MESSAGE);
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
+
+void GRPCClient::AsyncSendEndPass(const std::string& ep, int64_t time_out) {
+  const auto ch = GetChannel(ep);
+
+  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
+  s->Prepare(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(END_PASS_MESSAGE);
+  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
+
+void GRPCClient::AsyncCheckpointNotify(const std::string& ep,
+                                       const std::string& dir,
+                                       int64_t time_out) {
+  const auto ch = GetChannel(ep);
+
+  CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch);
+  s->Prepare(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(CHECKPOINT_SAVE_MESSAGE);
+  req.set_out_varname(dir);
+
+  auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
+
+void GRPCClient::Wait() {
+  std::unique_lock<std::mutex> lk(sync_mutex_);
+  sync_cond_.wait(lk, [this] { return req_count_ == 0; });
+}
+
+void GRPCClient::Proceed() {
+  void* tag = nullptr;
+  bool ok = false;
+
+  while (cq_.Next(&tag, &ok)) {
+    BaseProcessor* c = static_cast<BaseProcessor*>(tag);
+    GPR_ASSERT(ok);
+    PADDLE_ENFORCE(c);
+    if (c->status_.ok()) {
+      VLOG(3) << c->var_h_.String() << " process";
+      c->Process();
+    } else {
+      LOG(FATAL) << c->var_h_.String()
+                 << " meets grpc error:" << c->status_.error_message();
+    }
+    delete c;
+    {
+      std::lock_guard<std::mutex> lk(sync_mutex_);
+      req_count_--;
+    }
+    sync_cond_.notify_all();
+  }
+}
+
+std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {
+  std::lock_guard<std::mutex> guard(chan_mutex_);
+  auto it = channels_.find(ep);
+  if (it != channels_.end()) {
+    return it->second;
+  }
+
+  // Channel configurations:
+  grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000);
+  args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE);
+  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
+  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
+
+  auto ch =
+      grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args);
+  channels_[ep] = ch;
+  return ch;
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..5dae20155edcf9edd746a5d9a9bbe0ccd789f431
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -0,0 +1,258 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <time.h>
+
+#include <chrono>              // NOLINT
+#include <condition_variable>  // NOLINT
+#include <ctime>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <mutex>  // NOLINT
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "grpc++/channel.h"
+#include "grpc++/generic/generic_stub.h"
+#include "grpc++/grpc++.h"
+#include "grpc++/support/byte_buffer.h"
+#include "grpc++/support/slice.h"
+#include "grpc/support/log.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+struct VarHandle {
+  // RPC endpoint.
+  std::string ep;
+  const platform::DeviceContext* ctx;
+  const framework::Scope* scope;
+  // Variable name.
+  std::string name;
+  // RPC method name.
+  std::string method;
+
+  std::string String() const {
+    std::ostringstream s;
+    s << method << " name:[" << name << "], ep:[" << ep << "]";
+    return s.str();
+  }
+};
+
+void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
+
+class BaseProcessor {
+ public:
+  explicit BaseProcessor(std::shared_ptr<grpc::Channel> ch) {
+    context_ = nullptr;
+  }
+
+  virtual ~BaseProcessor() {}
+
+  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
+    context_.reset(new grpc::ClientContext());
+    var_h_ = var_info;
+    context_->set_wait_for_ready(true);
+    if (time_out) {
+      std::chrono::system_clock::time_point deadline =
+          std::chrono::system_clock::now() +
+          std::chrono::milliseconds(time_out);
+      context_->set_deadline(deadline);
+    }
+  }
+
+  virtual void Prepare(int64_t time_out) {
+    context_.reset(new grpc::ClientContext());
+    context_->set_wait_for_ready(true);
+
+    std::chrono::system_clock::time_point deadline =
+        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
+
+    context_->set_deadline(deadline);
+  }
+
+  virtual void Process() = 0;
+
+  std::unique_ptr<grpc::ClientContext> context_;
+  grpc::Status status_;
+  VarHandle var_h_;
+};
+
+typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
+    RequestSendCallBack;
+
+class SendProcessor : public BaseProcessor {
+ public:
+  explicit SendProcessor(std::shared_ptr<grpc::Channel> ch)
+      : BaseProcessor(ch), stub_g_(ch) {}
+
+  virtual ~SendProcessor() {}
+
+  virtual void Process() {
+    if (response_call_back_) {
+      response_call_back_(var_h_, reply_);
+    }
+  }
+
+  ::grpc::GenericStub stub_g_;
+  ::grpc::ByteBuffer reply_;
+  RequestSendCallBack response_call_back_ = nullptr;
+};
+
+typedef std::function<void(const VarHandle&, const ::grpc::ByteBuffer&)>
+    RequestGetCallBack;
+
+class GetProcessor : public BaseProcessor {
+ public:
+  explicit GetProcessor(std::shared_ptr<grpc::Channel> ch)
+      : BaseProcessor(ch), stub_g_(ch) {}
+
+  virtual ~GetProcessor() {}
+
+  virtual void Process() {
+    if (response_call_back_) {
+      response_call_back_(var_h_, reply_);
+    }
+  }
+
+  ::grpc::ByteBuffer reply_;
+  ::grpc::GenericStub stub_g_;
+  RequestGetCallBack response_call_back_ = ProcGetResponse;
+};
+
+class BatchBarrierProcessor : public BaseProcessor {
+ public:
+  explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
+      : BaseProcessor(ch) {
+    stub_ = sendrecv::SendRecvService::NewStub(ch);
+  }
+
+  virtual ~BatchBarrierProcessor() {}
+
+  virtual void Process() {}
+  sendrecv::VoidMessage reply_;
+  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
+};
+
+class FetchBarrierProcessor : public BaseProcessor {
+ public:
+  explicit FetchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
+      : BaseProcessor(ch) {
+    stub_ = sendrecv::SendRecvService::NewStub(ch);
+  }
+
+  virtual ~FetchBarrierProcessor() {}
+
+  virtual void Process() {}
+  sendrecv::VariableMessage reply_;
+  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
+};
+
+class CheckpointNotifyProcessor : public BaseProcessor {
+ public:
+  explicit CheckpointNotifyProcessor(std::shared_ptr<grpc::Channel> ch)
+      : BaseProcessor(ch) {
+    stub_ = sendrecv::SendRecvService::NewStub(ch);
+  }
+
+  virtual ~CheckpointNotifyProcessor() {}
+
+  virtual void Process() {}
+  sendrecv::VoidMessage reply_;
+  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
+};
+
+class GRPCClient : public RPCClient {
+ public:
+  GRPCClient() {}
+  virtual ~GRPCClient();
+
+  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
+                    const framework::Scope& scope, const std::string& var_name,
+                    int64_t time_out = FLAGS_rpc_deadline) override;
+
+  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
+                   const framework::Scope& scope, const std::string& var_name,
+                   int64_t time_out = FLAGS_rpc_deadline) override;
+
+  bool AsyncPrefetchVar(const std::string& ep,
+                        const platform::DeviceContext& ctx,
+                        const framework::Scope& scope,
+                        const std::string& in_var_name,
+                        const std::string& out_var_name,
+                        int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void AsyncSendBatchBarrier(const std::string& ep,
+                             int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void AsyncSendFetchBarrier(const std::string& ep,
+                             int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void AsyncCheckpointNotify(const std::string& ep, const std::string& dir,
+                             int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void AsyncSendBeginPass(const std::string& ep,
+                          int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void AsyncSendEndPass(const std::string& ep,
+                        int64_t time_out = FLAGS_rpc_deadline) override;
+
+  void Wait() override;
+
+  void SendBeginPass() override;
+
+  void SendEndPass() override;
+
+ protected:
+  void InitImpl() override;
+
+ private:
+  // InitEventLoop should only be called by Init()
+  void InitEventLoop();
+
+  void Proceed();
+
+  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
+
+ private:
+  grpc::CompletionQueue cq_;
+  std::unordered_map<std::string, std::shared_ptr<grpc::Channel>> channels_;
+  std::unique_ptr<std::thread> client_thread_;
+
+  // mutex for Wait client sync
+  std::mutex sync_mutex_;
+  std::condition_variable sync_cond_;
+  std::atomic<int64_t> req_count_{0};
+
+  // mutex for GetChannel thread safety
+  std::mutex chan_mutex_;
+  DISABLE_COPY_AND_ASSIGN(GRPCClient);
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d107b533bcb7bfef3f9b13ec99afbd579a62e52
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "google/protobuf/text_format.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace operators = paddle::operators;
+namespace math = paddle::operators::math;
+namespace memory = paddle::memory;
+
+void RunSerdeTestSelectedRows(platform::Place place) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
+
+  // serialize var to ByteBuffer
+  framework::Variable var;
+  auto* slr = var.GetMutable<framework::SelectedRows>();
+  slr->set_height(1000);
+  auto* tensor = slr->mutable_value();
+  auto* rows = slr->mutable_rows();
+  tensor->Resize(framework::make_ddim({564, 128}));
+  tensor->mutable_data<float>(place);
+  int tensor_numel = 564 * 128;
+  math::set_constant(ctx, tensor, 32.7);
+  for (int i = 0; i < 564; ++i) rows->push_back(i);
+
+  ::grpc::ByteBuffer msg;
+  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
+  EXPECT_GT(msg.Length(), static_cast<size_t>(0));
+
+  // deserialize
+  std::vector<::grpc::Slice> slices;
+  (void)msg.Dump(&slices);
+  std::string tmp;
+  for (const auto& s : slices) {
+    tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
+  }
+
+  sendrecv::VariableMessage varmsg;
+  EXPECT_TRUE(varmsg.ParseFromString(tmp));
+
+  // deserialize bytebuffer
+  EXPECT_EQ(varmsg.varname(), "myvar");
+  EXPECT_EQ(varmsg.type(), 1);
+
+  const float* tensor_data =
+      reinterpret_cast<const float*>(varmsg.serialized().data());
+  const int64_t* rows_data =
+      reinterpret_cast<const int64_t*>(varmsg.rows().data());
+  for (int i = 0; i < tensor_numel; ++i) {
+    EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
+  }
+  for (int i = 0; i < 564; ++i) {
+    EXPECT_EQ(rows_data[i], i);
+  }
+
+  // deserialize zero-copy
+  // framework::Variable var2;
+  // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
+  framework::Scope scope;
+  scope.Var("myvar");
+  operators::distributed::VariableResponse resp(&scope, &ctx);
+  EXPECT_EQ(resp.Parse(msg), 0);
+
+  framework::Variable* var2 = resp.GetVar();
+
+  auto* slr2 = var2->GetMutable<framework::SelectedRows>();
+  auto* tensor2 = slr2->mutable_value();
+  auto* rows2 = slr2->mutable_rows();
+  float* tensor_data2 = nullptr;
+  framework::Tensor tmp_tensor;
+
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    platform::CPUPlace cpu;
+    framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
+    tensor_data2 = tmp_tensor.data<float>();
+  } else {
+    tensor_data2 = const_cast<float*>(tensor2->data<float>());
+  }
+  const int64_t* rows_data2 = rows2->data();
+
+  for (int i = 0; i < tensor_numel; ++i) {
+    EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
+  }
+  for (size_t i = 0; i < rows2->size(); ++i) {
+    EXPECT_EQ(rows_data2[i], static_cast<int64_t>(i));
+  }
+  EXPECT_EQ(slr2->height(), 1000);
+}
+
+void RunTestLodTensor(platform::Place place, int from_type = 0) {
+  // serialize var to ByteBuffer
+  framework::Variable var;
+  auto* tensor = var.GetMutable<framework::LoDTensor>();
+  tensor->Resize(framework::make_ddim({512, 8, 4, 2}));
+  framework::LoD lod;
+  lod.push_back(framework::Vector<size_t>({1, 3, 8}));
+  tensor->set_lod(lod);
+  int tensor_numel = 512 * 8 * 4 * 2;
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& ctx = *pool.Get(place);
+  tensor->mutable_data<float>(place);
+  math::set_constant(ctx, tensor, 31.9);
+
+  ::grpc::ByteBuffer msg;
+  operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
+  EXPECT_GT(msg.Length(), static_cast<size_t>(0));
+
+  // deserialize
+  std::vector<::grpc::Slice> slices;
+  (void)msg.Dump(&slices);
+  std::string tmp;
+  for (const auto& s : slices) {
+    tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
+  }
+  sendrecv::VariableMessage varmsg;
+  EXPECT_TRUE(varmsg.ParseFromString(tmp));
+  EXPECT_EQ(varmsg.varname(), "myvar");
+  EXPECT_EQ(varmsg.type(), 0);
+  EXPECT_EQ(varmsg.dims()[0], 512);
+  EXPECT_EQ(varmsg.dims()[1], 8);
+  EXPECT_EQ(varmsg.dims()[2], 4);
+  EXPECT_EQ(varmsg.dims()[3], 2);
+  EXPECT_EQ(varmsg.lod_level(), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
+  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
+
+  const float* tensor_data =
+      reinterpret_cast<const float*>(varmsg.serialized().data());
+  for (int i = 0; i < tensor_numel; ++i) {
+    EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
+  }
+
+  // message binary
+  std::string str;
+  varmsg.SerializeToString(&str);
+
+  // message bytebuffer
+  ::grpc::Slice slices_2[1];
+  int num_slices = 1;
+  slices_2[0] = ::grpc::Slice(str.length());
+  memcpy(const_cast<uint8_t*>(slices_2[0].begin()), str.c_str(), str.length());
+  ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices);
+
+  // deserialize zero-copy
+  framework::Scope scope;
+  scope.Var("myvar");
+  operators::distributed::VariableResponse resp(&scope, &ctx);
+  if (from_type == 0) {
+    EXPECT_EQ(resp.Parse(msg), 0);
+  } else {
+    EXPECT_EQ(resp.Parse(bytebuffer2), 0);
+  }
+
+  framework::Variable* var2 = resp.GetVar();
+
+  auto tensor2 = var2->Get<framework::LoDTensor>();
+  float* tensor_data2 = nullptr;
+  framework::Tensor tmp_tensor;
+
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    platform::CPUPlace cpu;
+    framework::TensorCopy(tensor2, cpu, &tmp_tensor);
+    tensor_data2 = tmp_tensor.data<float>();
+  } else {
+    tensor_data2 = const_cast<float*>(tensor2.data<float>());
+  }
+
+  EXPECT_EQ(varmsg.lod_level(), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
+  EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
+  EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
+  for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
+}
+
+TEST(LodTensor, Run) {
+  platform::CPUPlace place;
+  RunTestLodTensor(place);
+  RunTestLodTensor(place, 1);
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu(0);
+  RunTestLodTensor(gpu);
+  RunTestLodTensor(gpu, 1);
+#endif
+}
+
+TEST(SelectedRows, Run) {
+  platform::CPUPlace place;
+  RunSerdeTestSelectedRows(place);
+
+#ifdef PADDLE_WITH_CUDA
+  platform::CUDAPlace gpu;
+  RunSerdeTestSelectedRows(gpu);
+#endif
+}
diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f35e268f6ad36da02f17db2feb3fbf1fdf6c1e41
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -0,0 +1,414 @@
+/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <limits>
+#include <string>
+
+#include "paddle/fluid/operators/distributed/grpc_server.h"
+
+using ::grpc::ServerAsyncResponseWriter;
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+enum CallStatus { PROCESS = 0, FINISH };
+
+// reference:
+// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
+class RequestBase {
+ public:
+  explicit RequestBase(GrpcService::AsyncService* service,
+                       ::grpc::ServerCompletionQueue* cq,
+                       RequestHandler* request_handler, int req_id)
+      : service_(service),
+        cq_(cq),
+        status_(PROCESS),
+        request_handler_(request_handler),
+        req_id_(req_id) {
+    PADDLE_ENFORCE(cq_);
+  }
+  virtual ~RequestBase() {}
+  virtual void Process() = 0;
+
+  std::string Status2String(const std::string& method) {
+    std::string status = "Process";
+    if (status_ == FINISH) {
+      status = "Finish";
+    }
+
+    std::ostringstream s;
+    s << method << " name:[" << GetReqName() << "]"
+      << ", ep:[" << ctx_.peer() << "]"
+      << " " << status << " using req_id:" << req_id_;
+    return s.str();
+  }
+
+  CallStatus Status() const {
+    std::lock_guard<std::mutex> l(status_mu_);
+    return status_;
+  }
+
+  template <typename T>
+  void Finish(const T& reply, ServerAsyncResponseWriter<T>* responder) {
+    std::lock_guard<std::mutex> l(status_mu_);
+    status_ = FINISH;
+    responder->Finish(reply, ::grpc::Status::OK,
+                      reinterpret_cast<void*>(static_cast<intptr_t>(req_id_)));
+  }
+  virtual std::string GetReqName() = 0;
+
+ protected:
+  mutable std::mutex status_mu_;
+  ::grpc::ServerContext ctx_;
+  GrpcService::AsyncService* service_;
+  ::grpc::ServerCompletionQueue* cq_;
+  CallStatus status_;
+  RequestHandler* request_handler_;
+  int req_id_;
+};
+
+class RequestSend final : public RequestBase {
+ public:
+  explicit RequestSend(GrpcService::AsyncService* service,
+                       ::grpc::ServerCompletionQueue* cq,
+                       RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+    request_.reset(new VariableResponse(request_handler->scope(),
+                                        request_handler->dev_ctx(),
+                                        !request_handler->sync_mode()));
+    int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+  virtual ~RequestSend() {}
+  std::string GetReqName() override { return request_->Varname(); }
+
+  void Process() override {
+    std::string varname = GetReqName();
+    VLOG(4) << "RequestSend var_name:" << varname;
+
+    auto scope = request_->GetMutableLocalScope();
+    auto invar = request_->GetVar();
+    framework::Variable* outvar = nullptr;
+
+    request_handler_->Handle(varname, scope, invar, &outvar);
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VoidMessage reply_;
+  std::shared_ptr<VariableResponse> request_;
+  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
+};
+
+class RequestGet final : public RequestBase {
+ public:
+  explicit RequestGet(GrpcService::AsyncService* service,
+                      ::grpc::ServerCompletionQueue* cq,
+                      RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+    auto method_id = static_cast<int>(distributed::GrpcMethod::kGetVariable);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, &request_, &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestGet() {}
+
+  std::string GetReqName() override { return request_.varname(); }
+
+  void Process() override {
+    // proc request.
+    std::string varname = request_.varname();
+    VLOG(4) << "RequestGet " << varname;
+
+    auto scope = request_handler_->scope();
+    auto invar = scope->FindVar(varname);
+    framework::Variable* outvar = nullptr;
+
+    request_handler_->Handle(varname, scope, invar, &outvar);
+
+    if (outvar) {
+      SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+                            &reply_);
+    }
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  ::grpc::ByteBuffer reply_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+};
+
+class RequestPrefetch final : public RequestBase {
+ public:
+  explicit RequestPrefetch(GrpcService::AsyncService* service,
+                           ::grpc::ServerCompletionQueue* cq,
+                           RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id),
+        responder_(&ctx_),
+        local_scope_(nullptr) {
+    request_.reset(new VariableResponse(request_handler->scope(),
+                                        request_handler->dev_ctx(), true));
+    int method_id =
+        static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestPrefetch() {}
+
+  std::string GetReqName() override { return request_->Varname(); }
+
+  void Process() override {
+    // prefetch process...
+    std::string in_var_name = request_->Varname();
+    std::string out_var_name = request_->OutVarname();
+    VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
+            << " out_var_name: " << out_var_name;
+
+    auto scope = request_->GetMutableLocalScope();
+    auto invar = scope->FindVar(in_var_name);
+    // out var must be created in local scope!
+    framework::Variable* outvar = scope->Var(out_var_name);
+
+    request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
+
+    SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
+                          &reply_);
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  std::shared_ptr<VariableResponse> request_;
+  ::grpc::ByteBuffer reply_;
+  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
+  framework::Scope* local_scope_;
+};
+
+class RequestCheckpointNotify final : public RequestBase {
+ public:
+  explicit RequestCheckpointNotify(GrpcService::AsyncService* service,
+                                   ::grpc::ServerCompletionQueue* cq,
+                                   RequestHandler* request_handler, int req_id)
+      : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
+    request_.reset(new VariableResponse(request_handler->scope(),
+                                        request_handler->dev_ctx()));
+    int method_id =
+        static_cast<int>(distributed::GrpcMethod::kCheckpointNotify);
+    service_->RequestAsyncUnary(
+        method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
+        reinterpret_cast<void*>(static_cast<intptr_t>(req_id)));
+  }
+
+  virtual ~RequestCheckpointNotify() {}
+
+  std::string GetReqName() override { return request_->Varname(); }
+
+  void Process() override {
+    auto scope = request_->GetMutableLocalScope();
+
+    std::string checkpoint_notify = request_->Varname();
+    std::string checkpoint_dir = request_->OutVarname();
+
+    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
+            << ", dir: " << checkpoint_dir;
+
+    request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr,
+                             checkpoint_dir);
+    Finish(reply_, &responder_);
+  }
+
+ protected:
+  std::shared_ptr<VariableResponse> request_;
+  sendrecv::VoidMessage reply_;
+  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
+};
+
+void AsyncGRPCServer::WaitServerReady() {
+  VLOG(4) << "AsyncGRPCServer is wait server ready";
+  std::unique_lock<std::mutex> lock(this->mutex_ready_);
+  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
+  VLOG(4) << "AsyncGRPCServer WaitSeverReady";
+}
+
+void AsyncGRPCServer::StartServer() {
+  ::grpc::ServerBuilder builder;
+  builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(),
+                           &selected_port_);
+
+  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
+  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
+  builder.RegisterService(&service_);
+
+  for (auto t : rpc_call_map_) {
+    rpc_cq_[t.first].reset(builder.AddCompletionQueue().release());
+  }
+
+  server_ = builder.BuildAndStart();
+  LOG(INFO) << "Server listening on " << bind_address_
+            << " selected port: " << selected_port_;
+
+  std::function<void(const std::string&, int)> f =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this,
+                std::placeholders::_1, std::placeholders::_2);
+
+  for (auto& t : rpc_call_map_) {
+    auto& rpc_name = t.first;
+    auto& cq = rpc_cq_[rpc_name];
+    auto threadnum = rpc_thread_num_[rpc_name];
+    auto& reqs = rpc_reqs_[rpc_name];
+
+    reqs.reserve(kRequestBufSize);
+
+    for (int i = 0; i < kRequestBufSize; i++) {
+      VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i;
+      TryToRegisterNewOne(rpc_name, i);
+    }
+
+    for (int i = 0; i < threadnum; i++) {
+      rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
+          &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
+      VLOG(4) << t.first << " creates threads!";
+    }
+  }
+
+  {
+    std::lock_guard<std::mutex> lock(this->mutex_ready_);
+    ready_ = 1;
+  }
+  condition_ready_.notify_all();
+
+  // wait server
+  server_->Wait();
+
+  for (auto& t : rpc_threads_) {
+    auto& threads = t.second;
+    for (size_t i = 0; i < threads.size(); ++i) {
+      threads[i]->join();
+      VLOG(4) << t.first << " threads ends!";
+    }
+  }
+}
+
+void AsyncGRPCServer::ShutdownQueue() {
+  for (auto& t : rpc_cq_) {
+    t.second->Shutdown();
+    VLOG(4) << t.first << " queue shutdown!";
+  }
+}
+
+void AsyncGRPCServer::ShutDownImpl() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  is_shut_down_ = true;
+  ShutdownQueue();
+
+  VLOG(4) << "server_ shutdown!";
+  server_->Shutdown();
+}
+
+void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
+                                          int req_id) {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  if (is_shut_down_) {
+    VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
+    return;
+  }
+
+  VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
+          << " REQ ID: " << req_id;
+
+  auto& reqs = rpc_reqs_[rpc_name];
+  auto& handler = rpc_call_map_[rpc_name];
+  auto& cq = rpc_cq_[rpc_name];
+
+  RequestBase* b = nullptr;
+  if (rpc_name == kRequestSend) {
+    b = new RequestSend(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestGet) {
+    b = new RequestGet(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestPrefetch) {
+    b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
+  } else if (rpc_name == kRequestCheckpoint) {
+    b = new RequestCheckpointNotify(&service_, cq.get(), handler, req_id);
+  } else {
+    PADDLE_ENFORCE(false, "not supported rpc");
+  }
+
+  reqs[req_id] = b;
+
+  VLOG(4) << "Create RequestSend status:" << b->Status();
+}
+
+void AsyncGRPCServer::HandleRequest(
+    ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
+    std::function<void(const std::string&, int)> TryToRegisterNewOne) {
+  void* tag = NULL;
+  bool ok = false;
+
+  while (true) {
+    VLOG(4) << "HandleRequest " << rpc_name << " wait next";
+    if (!cq->Next(&tag, &ok)) {
+      VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!";
+      break;
+    }
+
+    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
+    VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+            << " get next";
+
+    auto& reqs = rpc_reqs_[rpc_name];
+    RequestBase* base = nullptr;
+    {
+      PADDLE_ENFORCE(req_id >= 0 && req_id < kRequestBufSize);
+      std::unique_lock<std::mutex> lock(cq_mutex_);
+      base = reqs[req_id];
+    }
+
+    VLOG(3) << base->Status2String(rpc_name);
+
+    // reference:
+    // https://github.com/tensorflow/tensorflow/issues/5596
+    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
+    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
+    if (!ok) {
+      LOG(WARNING) << "completion queue:" << rpc_name
+                   << " recv no regular event"
+                   << " context:" << base->Status2String(rpc_name);
+      TryToRegisterNewOne(rpc_name, req_id);
+      delete base;
+      continue;
+    }
+
+    switch (base->Status()) {
+      case PROCESS: {
+        base->Process();
+        break;
+      }
+      case FINISH: {
+        TryToRegisterNewOne(rpc_name, req_id);
+        delete base;
+        break;
+      }
+      default: { assert(false); }
+    }
+  }
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_server.h b/paddle/fluid/operators/distributed/grpc_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2524f5e65db6dedab78f45e17380359b58a3d11
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_server.h
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "grpc++/grpc++.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/distributed/grpc_service.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class RequestBase;
+
+class AsyncGRPCServer final : public RPCServer {
+ public:
+  explicit AsyncGRPCServer(const std::string& address, int client_num)
+      : RPCServer(address, client_num), ready_(0) {}
+
+  virtual ~AsyncGRPCServer() {}
+  void WaitServerReady() override;
+  void StartServer() override;
+
+ private:
+  // HandleRequest needs to be thread-safe.
+  void HandleRequest(
+      ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name,
+      std::function<void(const std::string&, int)> TryToRegisterNewOne);
+
+  void TryToRegisterNewOne(const std::string& rpc_name, int req_id);
+  void ShutdownQueue();
+  void ShutDownImpl() override;
+
+ private:
+  static const int kRequestBufSize = 100;
+
+  std::mutex cq_mutex_;
+  volatile bool is_shut_down_ = false;
+
+  GrpcService::AsyncService service_;
+  std::unique_ptr<::grpc::Server> server_;
+
+  // condition of the sub program
+  std::condition_variable barrier_condition_;
+
+  std::mutex mutex_ready_;
+  std::condition_variable condition_ready_;
+
+  int ready_;
+
+  std::map<std::string, std::unique_ptr<::grpc::ServerCompletionQueue>> rpc_cq_;
+  std::map<std::string, std::vector<std::unique_ptr<std::thread>>> rpc_threads_;
+  std::map<std::string, std::vector<RequestBase*>> rpc_reqs_;
+};
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..cdc4e7b79276d6aac55aeac8ac121ca28d2cc1f0
--- /dev/null
+++ b/paddle/fluid/operators/distributed/grpc_service.h
@@ -0,0 +1,127 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <grpc++/impl/codegen/async_stream.h>
+#include <grpc++/impl/codegen/async_unary_call.h>
+#include <grpc++/impl/codegen/proto_utils.h>
+#include <grpc++/impl/codegen/rpc_method.h>
+#include <grpc++/impl/codegen/service_type.h>
+#include <grpc++/impl/codegen/status.h>
+#include <grpc++/impl/codegen/stub_options.h>
+#include <grpc++/impl/codegen/sync_stream.h>
+#include <grpc++/support/byte_buffer.h>
+#include "paddle/fluid/operators/distributed/variable_response.h"
+
+#include "paddle/fluid/platform/profiler.h"
+
+// NOTE: This method was originally created by tensorflow
+//       (https://github.com/tensorflow/tensorflow/) we borrow this
+//       method and did some modifications so that we can parse gRPC
+//       requests without too much copying of the tensor data.
+
+namespace grpc {
+class CompletionQueue;
+class Channel;
+class RpcService;
+class ServerCompletionQueue;
+class ServerContext;
+
+// Support parsing/unparsing of tensorflow::VariableResponse.
+// Wire-format is identical to RecvVariableResponse.
+template <>
+class SerializationTraits<paddle::operators::distributed::VariableResponse> {
+ public:
+  static Status Serialize(
+      const paddle::operators::distributed::VariableResponse& msg,
+      grpc_byte_buffer** bp, bool* own_buffer) {
+    PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
+    return Status();
+  }
+  static Status Deserialize(
+      grpc_byte_buffer* buffer,
+      paddle::operators::distributed::VariableResponse* msg,
+      int max_message_size = INT_MAX) {
+    if (buffer == nullptr) {
+      return Status(StatusCode::INTERNAL, "No payload");
+    }
+
+    Status result = g_core_codegen_interface->ok();
+    if (result.ok()) {
+      paddle::operators::distributed::GrpcByteSource source(buffer);
+      int ret = msg->Parse(&source);
+      if (ret != 0) {
+        result = Status(StatusCode::INTERNAL, "VariableResponse parse error");
+      }
+    }
+    g_core_codegen_interface->grpc_byte_buffer_destroy(buffer);
+    return result;
+  }
+};
+}  // namespace grpc
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+enum class GrpcMethod {
+  kSendVariable,
+  kGetVariable,
+  kPrefetchVariable,
+  kCheckpointNotify,
+};
+
+static const int kGrpcNumMethods =
+    static_cast<int>(GrpcMethod::kCheckpointNotify) + 1;
+
+inline const char* GrpcMethodName(GrpcMethod id) {
+  switch (id) {
+    case GrpcMethod::kSendVariable:
+      return "/sendrecv.SendRecvService/SendVariable";
+    case GrpcMethod::kGetVariable:
+      return "/sendrecv.SendRecvService/GetVariable";
+    case GrpcMethod::kPrefetchVariable:
+      return "/sendrecv.SendRecvService/PrefetchVariable";
+    case GrpcMethod::kCheckpointNotify:
+      return "/sendrecv.SendRecvService/CheckpointNotify";
+  }
+
+  // Shouldn't be reached.
+  PADDLE_ENFORCE(false, "Invalid id: not found valid method name");
+  return nullptr;
+}
+
+class GrpcService final {
+ public:
+  class AsyncService : public ::grpc::Service {
+   public:
+    AsyncService() {
+      for (int i = 0; i < kGrpcNumMethods; ++i) {
+        AddMethod(new ::grpc::internal::RpcServiceMethod(
+            GrpcMethodName(static_cast<GrpcMethod>(i)),
+            ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
+        ::grpc::Service::MarkMethodAsync(i);
+      }
+    }
+    virtual ~AsyncService() {}
+
+    // Make RequestAsyncUnary public for grpc_call.h
+    using ::grpc::Service::RequestAsyncUnary;
+  };
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..2fab02e32fe18ee04f86a69bb5bae1cbe7c6762c
--- /dev/null
+++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// NOTE: This file was originally created by tensorflow
+//       (https://github.com/tensorflow/tensorflow/) we borrow this
+//       file and did some modifications so that we can send gRPC
+//       requests without too much copying of the tensor data.
+
+#pragma once
+
+#include <string>
+
+#include "grpc++/grpc++.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+char* EncodeVarint32(char* dst, uint32_t v) {
+  // Operate on characters as unsigneds
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  static const int B = 128;
+  if (v < (1 << 7)) {
+    *(ptr++) = v;
+  } else if (v < (1 << 14)) {
+    *(ptr++) = v | B;
+    *(ptr++) = v >> 7;
+  } else if (v < (1 << 21)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = v >> 14;
+  } else if (v < (1 << 28)) {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = v >> 21;
+  } else {
+    *(ptr++) = v | B;
+    *(ptr++) = (v >> 7) | B;
+    *(ptr++) = (v >> 14) | B;
+    *(ptr++) = (v >> 21) | B;
+    *(ptr++) = v >> 28;
+  }
+  return reinterpret_cast<char*>(ptr);
+}
+
+char* EncodeVarint64(char* dst, uint64_t v) {
+  static const int B = 128;
+  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
+  while (v >= B) {
+    *(ptr++) = (v & (B - 1)) | B;
+    v >>= 7;
+  }
+  *(ptr++) = static_cast<unsigned char>(v);
+  return reinterpret_cast<char*>(ptr);
+}
+
+int VarintLength(uint64_t v) {
+  int len = 1;
+  while (v >= 128) {
+    v >>= 7;
+    len++;
+  }
+  return len;
+}
+
+class ProtoEncodeHelper {
+ public:
+  ProtoEncodeHelper(char* buf, int max_size)
+      : base_(buf), p_(buf), limit_(base_ + max_size) {}
+
+  ~ProtoEncodeHelper() {
+    // Make sure callers didn't do operations that went over max_size promised
+    PADDLE_ENFORCE_LE(p_, limit_);
+  }
+
+  const char* data() const { return base_; }
+  size_t size() const { return p_ - base_; }
+
+  void WriteUint64(int tag, uint64_t v) {
+    Encode32(combine(tag, WIRETYPE_VARINT));
+    Encode64(v);
+  }
+  void WriteBool(int tag, bool v) {
+    Encode32(combine(tag, WIRETYPE_VARINT));
+    EncodeBool(v);
+  }
+  void WriteString(int tag, const std::string& v) {
+    Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
+    Encode32(v.size());
+    EncodeBytes(v.data(), v.size());
+  }
+  void WriteVarlengthBeginning(int tag, uint32_t len) {
+    Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
+    Encode32(len);
+  }
+  void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); }
+
+ private:
+  // Note: this module's behavior must match the protocol buffer wire encoding
+  // format.
+  enum {
+    WIRETYPE_VARINT = 0,
+    WIRETYPE_LENGTH_DELIMITED = 2,
+  };
+  static uint32_t combine(uint32_t tag, uint32_t type) {
+    return ((tag << 3) | type);
+  }
+  inline void Encode32(uint32_t v) {
+    if (v < 128) {
+      // Fast path for single-byte values.  Many of the calls will use a
+      // constant value for v, so the comparison will get optimized away
+      // when Encode32 is inlined into the caller.
+      *p_ = v;
+      p_++;
+    } else {
+      p_ = EncodeVarint32(p_, v);
+    }
+  }
+  void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); }
+  void EncodeBool(bool v) {
+    *p_ = (v ? 1 : 0);  // Equal to varint32 encoding of 0 or 1
+    p_++;
+  }
+  void EncodeBytes(const char* bytes, int N) {
+    memcpy(p_, bytes, N);
+    p_ += N;
+  }
+
+  char* base_;
+  char* p_;
+  char* limit_;  // Just for CHECKs
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..271306d5d20f1b849a81a9bfa6436f2faf261204
--- /dev/null
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -0,0 +1,143 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <time.h>
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+constexpr char kRequestSend[] = "RequestSend";
+constexpr char kRequestGet[] = "RequestGet";
+constexpr char kRequestPrefetch[] = "RequestPrefetch";
+constexpr char kRequestCheckpoint[] = "RequestCheckpoint";
+constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
+
+#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
+#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
+#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
+#define COMPLETE_MESSAGE "COMPLETE@RECV"
+#define BEGIN_PASS_MESSAGE "BEGIN_PASS@RECV"
+#define END_PASS_MESSAGE "END_PASS@RECV"
+
+#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY"
+#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY"
+
+class RPCServer;
+
+class RequestHandler {
+ public:
+  explicit RequestHandler(bool sync_mode)
+      : sync_mode_(sync_mode),
+        dev_ctx_(nullptr),
+        executor_(nullptr),
+        scope_(nullptr),
+        program_(nullptr),
+        rpc_server_(nullptr) {}
+
+  virtual ~RequestHandler() {}
+
+  // Set attributes.
+  void SetScope(framework::Scope* scope) { scope_ = scope; }
+  void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
+  void SetProgram(framework::ProgramDesc* program) { program_ = program; }
+  void SetExecutor(framework::Executor* executor) { executor_ = executor; }
+
+  // Used for dist lookup table prefetch
+  void SetPrefetchPreparedCtx(
+      std::unordered_map<
+          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
+    prefetch_var_name_to_prepared_ctx_ = g;
+  }
+
+  void SetCheckpointNotifyPreparedCtx(
+      std::shared_ptr<framework::ExecutorPrepareContext> g) {
+    checkpoint_prepared_ctx_ = g;
+  }
+
+  // Used for async.
+  void SetGradToPreparedCtx(
+      std::unordered_map<
+          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
+    grad_to_prepared_ctx_ = g;
+  }
+
+  void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; }
+
+  // Get attributes.
+  bool sync_mode() { return sync_mode_; }
+  framework::Scope* scope() { return scope_; }
+  const platform::DeviceContext* dev_ctx() { return dev_ctx_; }
+  framework::ProgramDesc* program() { return program_; }
+  framework::Executor* executor() { return executor_; }
+
+  // This function processes user's rpc request.
+  // The implemention is in request_handler_impl.
+  // example:
+  //    std::string varname = request_.varname();
+  //
+  //    auto scope = request_handler_->scope();
+  //    auto invar = scope->FindVar(varname);
+  //    framework::Variable* outvar = nullptr;
+  //
+  //    request_handler_->Handle(varname, scope, invar, &outvar);
+  //    if (outvar) {
+  //        SerializeToByteBuffer(varname, outvar,
+  //           *request_handler_->dev_ctx(), &reply_);
+  //    }
+  virtual bool Handle(const std::string& varname, framework::Scope* scope,
+                      framework::Variable* var, framework::Variable** outvar,
+                      const std::string& out_var_name = "") = 0;
+
+ protected:
+  const bool sync_mode_;
+
+  const platform::DeviceContext* dev_ctx_;
+  framework::Executor* executor_;
+  framework::Scope* scope_;
+  framework::ProgramDesc* program_;
+
+  // used for distribute lookup table prefetch
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>*
+      prefetch_var_name_to_prepared_ctx_;
+  // used for checkpoint notify
+  std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_prepared_ctx_;
+
+  // Used for async.
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>*
+      grad_to_prepared_ctx_;
+
+  RPCServer* rpc_server_;
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e6bff20f5f8c06e1497c697e3aabf7b9cb94ad6
--- /dev/null
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
+// to directory specified.
+constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
+
+bool RequestSendHandler::Handle(const std::string& varname,
+                                framework::Scope* scope,
+                                framework::Variable* invar,
+                                framework::Variable** outvar,
+                                const std::string& out_var_name) {
+  VLOG(4) << "RequestSendHandler:" << varname;
+
+  // Async
+  if (!sync_mode_) {
+    try {
+      executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
+                                    scope);
+    } catch (std::exception& e) {
+      LOG(ERROR) << "async: run sub program error " << e.what();
+      return false;
+    }
+    return true;
+  }
+
+  // Sync
+  if (varname == BATCH_BARRIER_MESSAGE) {
+    VLOG(3) << "sync: recv batch barrier message";
+    rpc_server_->IncreaseBatchBarrier(kRequestSend);
+  } else if (varname == BEGIN_PASS_MESSAGE) {
+    VLOG(3) << "sync: recv begin pass message";
+    rpc_server_->WaitCond(kRequestSend);
+    rpc_server_->BeginPass();
+  } else {
+    VLOG(3) << "sync: received var_name: " << varname;
+    rpc_server_->WaitCond(kRequestSend);
+    VLOG(3) << "sync: processing received var: " << varname;
+
+    if (invar == nullptr) {
+      LOG(ERROR) << "sync: Can not find server side var: " << varname;
+      PADDLE_THROW("sync: Can not find server side var");
+      return false;
+    }
+    if (invar->IsType<framework::SelectedRows>()) {
+      std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+      sparse_vars_.push_back(invar);
+    }
+  }
+  return true;
+}
+
+void RequestSendHandler::ResetSparseVarRecorder() {
+  std::unique_lock<std::mutex> lock(mutex_sparse_vars_);
+  for (auto* var : sparse_vars_) {
+    var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
+  }
+  sparse_vars_.clear();
+}
+
+bool RequestGetHandler::Handle(const std::string& varname,
+                               framework::Scope* scope,
+                               framework::Variable* invar,
+                               framework::Variable** outvar,
+                               const std::string& out_var_name) {
+  VLOG(4) << "RequestGetHandler:" << varname;
+  if (sync_mode_) {
+    if (varname == FETCH_BARRIER_MESSAGE) {
+      VLOG(3) << "sync: recv fetch barrier message";
+      rpc_server_->IncreaseBatchBarrier(kRequestGet);
+    } else if (varname == END_PASS_MESSAGE) {
+      rpc_server_->EndPass();
+    } else {
+      rpc_server_->WaitCond(kRequestGet);
+      *outvar = scope_->FindVar(varname);
+    }
+  } else {
+    if (varname != FETCH_BARRIER_MESSAGE && varname != END_PASS_MESSAGE) {
+      *outvar = scope_->FindVar(varname);
+    }
+  }
+  return true;
+}
+
+bool RequestPrefetchHandler::Handle(const std::string& varname,
+                                    framework::Scope* scope,
+                                    framework::Variable* invar,
+                                    framework::Variable** outvar,
+                                    const std::string& out_var_name) {
+  VLOG(4) << "RequestPrefetchHandler " << varname;
+
+  auto var_desc = program_->Block(0).FindVar(out_var_name);
+  InitializeVariable(*outvar, var_desc->GetType());
+  executor_->RunPreparedContext(
+      (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
+
+  return true;
+}
+
+bool RequestCheckpointHandler::Handle(const std::string& varname,
+                                      framework::Scope* scope,
+                                      framework::Variable* invar,
+                                      framework::Variable** outvar,
+                                      const std::string& out_var_name) {
+  PADDLE_ENFORCE(
+      checkpoint_notify_id != -1,
+      "when checkpoint_notify_id = -1, there should be no RPC invoke.");
+
+  auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
+  lt_var->clear();
+  lt_var->append(out_var_name);
+  VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
+          << out_var_name;
+  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope);
+  return true;
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..87185500f2ffc3a8578eea339cc7a1e2b0e46631
--- /dev/null
+++ b/paddle/fluid/operators/distributed/request_handler_impl.h
@@ -0,0 +1,86 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <time.h>
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class RequestSendHandler final : public RequestHandler {
+ public:
+  explicit RequestSendHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  virtual ~RequestSendHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
+  void ResetSparseVarRecorder();
+
+ private:
+  std::mutex mutex_sparse_vars_;
+  std::vector<framework::Variable*> sparse_vars_;
+};
+
+class RequestGetHandler final : public RequestHandler {
+ public:
+  explicit RequestGetHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  virtual ~RequestGetHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
+};
+
+class RequestPrefetchHandler final : public RequestHandler {
+ public:
+  explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
+  virtual ~RequestPrefetchHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
+};
+
+class RequestCheckpointHandler final : public RequestHandler {
+ public:
+  explicit RequestCheckpointHandler(bool sync_mode, int checkpoint_notify_id)
+      : RequestHandler(sync_mode) {
+    this->checkpoint_notify_id = checkpoint_notify_id;
+  }
+  virtual ~RequestCheckpointHandler() {}
+  bool Handle(const std::string& varname, framework::Scope* scope,
+              framework::Variable* var, framework::Variable** outvar,
+              const std::string& out_var_name = "") override;
+
+ private:
+  int checkpoint_notify_id;
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b5ec9fe5367beb97b3cc7298102deff1e8ca4ec9
--- /dev/null
+++ b/paddle/fluid/operators/distributed/rpc_client.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "gflags/gflags.h"
+
+// default to 3min to avoid temprary network failures.
+DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc");
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+std::once_flag RPCClient::init_flag_;
+std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr);
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..6479d3a97bafba37b74a1d1c04852a6e60e01be8
--- /dev/null
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "gflags/gflags.h"
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+
+DECLARE_int32(rpc_deadline);
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class RPCClient {
+ public:
+  RPCClient() {}
+  virtual ~RPCClient() {}
+  virtual bool AsyncSendVar(const std::string& ep,
+                            const platform::DeviceContext& ctx,
+                            const framework::Scope& scope,
+                            const std::string& var_name,
+                            int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual bool AsyncGetVar(const std::string& ep,
+                           const platform::DeviceContext& ctx,
+                           const framework::Scope& scope,
+                           const std::string& var_name,
+                           int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual bool AsyncPrefetchVar(const std::string& ep,
+                                const platform::DeviceContext& ctx,
+                                const framework::Scope& scope,
+                                const std::string& in_var_name,
+                                const std::string& out_var_name,
+                                int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual void AsyncSendBatchBarrier(const std::string& ep,
+                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual void AsyncSendFetchBarrier(const std::string& ep,
+                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual void AsyncCheckpointNotify(const std::string& ep,
+                                     const std::string& dir,
+                                     int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual void AsyncSendBeginPass(const std::string& ep,
+                                  int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  virtual void AsyncSendEndPass(const std::string& ep,
+                                int64_t time_out = FLAGS_rpc_deadline) = 0;
+
+  // BeginePass/EndPass tells all the pserver that start/end a pass, so that
+  // the pserver can increase/reduce it's barrier count, and continue to train
+  // with other trainers.
+  virtual void SendBeginPass() = 0;
+  virtual void SendEndPass() = 0;
+
+  virtual void Wait() = 0;
+
+  template <typename T>
+  static RPCClient* GetInstance() {
+    std::call_once(init_flag_, &RPCClient::Init<T>);
+    return rpc_client_.get();
+  }
+
+  // Init is called by GetInstance.
+  template <typename T>
+  static void Init() {
+    if (rpc_client_.get() == nullptr) {
+      rpc_client_.reset(new T());
+      rpc_client_->InitImpl();
+    }
+  }
+
+ protected:
+  virtual void InitImpl() {}
+
+ private:
+  static std::once_flag init_flag_;
+  static std::unique_ptr<RPCClient> rpc_client_;
+};
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d49ee34eeaf4e80f6fd4f8cdc548cc2b938d0f2a
--- /dev/null
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+void RPCServer::ShutDown() {
+  LOG(INFO) << "RPCServer ShutDown ";
+  ShutDownImpl();
+
+  exit_flag_ = true;
+  barrier_cond_.notify_all();
+  rpc_cond_.notify_all();
+}
+
+void RPCServer::SavePort() const {
+  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
+  std::ofstream port_file;
+  port_file.open(file_path);
+  port_file << selected_port_;
+  port_file.close();
+  VLOG(4) << "selected port written to " << file_path;
+}
+
+void RPCServer::WaitBarrier(const std::string& rpc_name) {
+  std::unique_lock<std::mutex> lock(this->mutex_);
+  barrier_cond_.wait(lock, [this, &rpc_name] {
+    return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) ||
+            exit_flag_.load());
+  });
+
+  VLOG(3) << "batch_barrier_: " << rpc_name << " "
+          << barrier_counter_[rpc_name];
+}
+
+void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
+  VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  int b = 0;
+  std::unique_lock<std::mutex> lock(mutex_);
+  b = ++barrier_counter_[rpc_name];
+  if (b >= client_num_) {
+    lock.unlock();
+    barrier_cond_.notify_all();
+    lock.lock();
+  }
+}
+
+void RPCServer::BeginPass() {
+  VLOG(4) << "RPCServer begin increase pass barrier";
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    client_num_++;
+    VLOG(4) << "increase client_num to: " << client_num_;
+  }
+  barrier_cond_.notify_all();
+}
+
+void RPCServer::EndPass() {
+  VLOG(4) << "RPCServer begin increase pass barrier";
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    client_num_--;
+    VLOG(4) << "decrease client_num to: " << client_num_;
+    if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
+      barrier_counter_[kRequestGet]--;
+    }
+  }
+  barrier_cond_.notify_all();
+}
+
+void RPCServer::ResetBarrierCounter() {
+  VLOG(3) << "RPCServer ResetBarrierCounter ";
+  std::unique_lock<std::mutex> lock(mutex_);
+  for (auto& t : barrier_counter_) {
+    t.second = 0;
+  }
+}
+
+void RPCServer::RegisterRPC(const std::string& rpc_name,
+                            RequestHandler* handler, int thread_num) {
+  rpc_call_map_[rpc_name] = handler;
+  rpc_thread_num_[rpc_name] = thread_num;
+
+  static int cond = -1;
+  rpc_cond_map_[rpc_name] = ++cond;
+  VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
+          << ", cond:" << rpc_cond_map_[rpc_name];
+}
+
+void RPCServer::SetCond(const std::string& rpc_name) {
+  VLOG(3) << "RPCServer SetCond " << rpc_name;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cur_cond_ = rpc_cond_map_[rpc_name];
+  }
+
+  rpc_cond_.notify_all();
+}
+
+void RPCServer::WaitCond(const std::string& rpc_name) {
+  VLOG(4) << "RPCServer WaitCond " << rpc_name;
+  int cond = 0;
+  {
+    std::unique_lock<std::mutex> lock(mutex_);
+    cond = rpc_cond_map_[rpc_name];
+  }
+
+  std::unique_lock<std::mutex> lock(mutex_);
+  rpc_cond_.wait(
+      lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); });
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..833991c8aa6e7cfd10f2aa52f9218be7ff8ccebf
--- /dev/null
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <set>
+#include <string>
+#include <thread>  // NOLINT
+#include <utility>
+#include <vector>
+#include "paddle/fluid/operators/distributed/request_handler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class RPCServer {
+ public:
+  explicit RPCServer(const std::string& address, int client_num)
+      : cur_cond_(0),
+        bind_address_(address),
+        exit_flag_(false),
+        selected_port_(0),
+        client_num_(client_num) {}
+
+  virtual ~RPCServer() {}
+  virtual void StartServer() = 0;
+  virtual void WaitServerReady() = 0;
+
+  void ShutDown();
+
+  bool IsExit() { return exit_flag_.load(); }
+
+  int GetSelectedPort() const { return selected_port_; }
+
+  int GetClientNum() const;
+
+  void SavePort() const;
+
+  // RegisterRPC, register the rpc method name to a handler
+  // class, and auto generate a condition id for this call
+  // to be used for the barrier.
+  void RegisterRPC(const std::string& rpc_name, RequestHandler* handler,
+                   int thread_num = 5);
+
+  // Wait util all the clients have reached the barrier for one
+  // rpc method. This function should be called in the
+  // RequestHandler if you want to run the server/client in a
+  // synchronous mode.
+  void WaitBarrier(const std::string& rpc_name);
+
+  void SetCond(const std::string& rpc_name);
+  void WaitCond(const std::string& rpc_name);
+  void IncreaseBatchBarrier(const std::string rpc_name);
+
+  void BeginPass();
+  void EndPass();
+
+  void ResetBarrierCounter();
+
+ protected:
+  virtual void ShutDownImpl() = 0;
+
+ private:
+  std::mutex mutex_;
+  std::unordered_map<std::string, int> barrier_counter_;
+  std::condition_variable barrier_cond_;
+
+  std::unordered_map<std::string, int> rpc_cond_map_;
+  std::atomic<int> cur_cond_;
+  std::condition_variable rpc_cond_;
+
+ protected:
+  std::string bind_address_;
+  std::atomic<int> exit_flag_;
+  int selected_port_;
+  int client_num_;
+
+  std::unordered_map<std::string, RequestHandler*> rpc_call_map_;
+  std::unordered_map<std::string, int> rpc_thread_num_;
+  friend class RequestHandler;
+};
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a0693cffabcc561b0adfafc2c49027a890dd5efc
--- /dev/null
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -0,0 +1,164 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed/rpc_client.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+namespace distributed = paddle::operators::distributed;
+
+USE_OP(lookup_table);
+
+std::unique_ptr<distributed::RPCServer> g_rpc_service;
+std::unique_ptr<distributed::RequestHandler> g_req_handler;
+
+framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) {
+  auto root_block = program->MutableBlock(0);
+  auto* block = program->AppendBlock(*root_block);
+
+  framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}});
+  framework::VariableNameMap output({{"Output", {"out"}}});
+  auto op = block->AppendOp();
+  op->SetType("lookup_table");
+  op->SetInput("W", {"w"});
+  op->SetInput("Ids", {"ids"});
+  op->SetOutput("Out", {"out"});
+
+  auto& out = *root_block->Var("out");
+  out.SetType(framework::proto::VarType::SELECTED_ROWS);
+  out.SetShape({10, 10});
+
+  return block;
+}
+
+void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) {
+  auto w_var = scope->Var("w");
+  w_var->GetMutable<framework::SelectedRows>();
+
+  auto out_var = scope->Var("out");
+  out_var->GetMutable<framework::SelectedRows>();
+
+  auto ids_var = scope->Var("ids");
+  ids_var->GetMutable<framework::SelectedRows>();
+}
+
+void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+  auto ids_var = scope->Var("ids")->GetMutable<framework::SelectedRows>();
+  auto rows = ids_var->mutable_rows();
+  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2);
+  ids_var->mutable_value()->Resize({rows_numel, 1});
+  ids_var->mutable_value()->mutable_data<float>(*place);
+}
+
+void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
+                         int64_t rows_numel) {
+  CreateVarsOnScope(scope, place);
+  auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
+  auto rows = w->mutable_rows();
+  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
+  auto w_value = w->mutable_value();
+  w_value->Resize({rows_numel, 10});
+
+  auto ptr = w_value->mutable_data<float>(*place);
+
+  for (int64_t i = 0; i < w_value->numel(); ++i) {
+    ptr[i] = static_cast<float>(i / 10);
+  }
+}
+
+void StartServer() {
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  framework::Executor exe(place);
+  platform::CPUDeviceContext ctx(place);
+  auto* block = AppendPrefetchBlcok(&program);
+  std::string in_var_name("ids");
+  std::vector<int> prefetch_block_ids{block->ID()};
+  auto prepared = exe.Prepare(program, prefetch_block_ids);
+  InitTensorsOnServer(&scope, &place, 10);
+
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared;
+  prefetch_var_name_to_prepared[in_var_name] = prepared[0];
+  g_req_handler->SetProgram(&program);
+  g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared);
+  g_req_handler->SetDevCtx(&ctx);
+  g_req_handler->SetScope(&scope);
+  g_req_handler->SetExecutor(&exe);
+
+  g_rpc_service->RegisterRPC(distributed::kRequestPrefetch,
+                             g_req_handler.get());
+  g_req_handler->SetRPCServer(g_rpc_service.get());
+
+  std::thread server_thread(
+      std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
+
+  server_thread.join();
+}
+
+TEST(PREFETCH, CPU) {
+  g_req_handler.reset(new distributed::RequestPrefetchHandler(true));
+  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
+  distributed::RPCClient* client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+
+  std::thread server_thread(StartServer);
+  g_rpc_service->WaitServerReady();
+
+  int port = g_rpc_service->GetSelectedPort();
+  std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  {
+    // create var on local scope
+    int64_t rows_numel = 5;
+    InitTensorsOnClient(&scope, &place, rows_numel);
+    std::string in_var_name("ids");
+    std::string out_var_name("out");
+
+    client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name);
+    client->Wait();
+    auto var = scope.Var(out_var_name);
+    auto value = var->GetMutable<framework::SelectedRows>()->value();
+    auto ptr = value.mutable_data<float>(place);
+
+    for (int64_t i = 0; i < rows_numel; ++i) {
+      EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
+    }
+  }
+
+  g_rpc_service->ShutDown();
+  server_thread.join();
+  LOG(INFO) << "begin reset";
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/distributed/send_recv.proto b/paddle/fluid/operators/distributed/send_recv.proto
new file mode 100644
index 0000000000000000000000000000000000000000..e0902320cff003797b12ed0204f7f99c44554b62
--- /dev/null
+++ b/paddle/fluid/operators/distributed/send_recv.proto
@@ -0,0 +1,83 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
+the Apache License, Version 2.0 (the "License"); you may not use this file
+except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto3";
+package sendrecv;
+
+// option cc_generic_services = true;
+
+service SendRecvService {
+  // For parameter server round-robin like hashing, do not split tensors.
+  // Send and recv only one tensor
+  // TODO(typhoonzero): add streaming API
+  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
+  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+  // pre-fetch variable by given variable name and Ids
+  rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {}
+
+  rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {}
+}
+
+// VariableMessage is serialized paddle variable message.
+// It can be:
+// LoDTensor
+// SelectedRows
+enum VarType {
+  LOD_TENSOR = 0;
+  SELECTED_ROWS = 1;
+  NCCL_ID = 2;
+}
+
+// NOTICE(gongwb):don't modify this proto if you are not
+//   not familar with how we serialize in sendrecvop_utils.h
+//   and deserilize it in  variable_response.h.
+message VariableMessage {
+  enum Type {
+    // Pod Types
+    BOOL = 0;
+    INT16 = 1;
+    INT32 = 2;
+    INT64 = 3;
+    FP16 = 4;
+    FP32 = 5;
+    FP64 = 6;
+  }
+
+  message LodData { repeated int64 lod_data = 1; }
+  string varname = 1;
+  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
+  VarType type = 2;
+  // bool persistable is not needed for sending.
+  // tensor info:
+  Type data_type = 3;
+  repeated int64 dims = 4;
+
+  // lod details:
+  int64 lod_level = 5;
+  repeated LodData lod = 6;
+  // selected_rows height, aka. original dim0
+  int64 slr_height = 7;
+  // tensor data
+  bytes serialized = 8;
+  // selected_rows data
+  bytes rows = 9;
+  // Look up table block execution output variable name.
+  string out_varname = 10;
+  // If 1, the ps server will start profiling, the ps
+  // server stops profiling and generates a profile to /tmp/profile_ps_*
+  // when profile switches from 1 to 2.
+  int64 profile = 11;
+}
+
+message VoidMessage {}
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98129d9f1014c39347e3409533f2bc10092611d2
--- /dev/null
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc
@@ -0,0 +1,232 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
+#include <sys/time.h>
+#include <thread>  // NOLINT
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
+#include "paddle/fluid/operators/distributed/variable_response.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+using VarMsg = sendrecv::VariableMessage;
+
+void GetTensorPayload(framework::Variable* var,
+                      const platform::DeviceContext& ctx, VarMsg* request,
+                      void** payload, size_t* payload_size) {
+  auto tensor = var->Get<framework::LoDTensor>();
+  // FIXME(wuyi): data types in send_recv.proto is copied from
+  // framework.proto
+  request->set_data_type(
+      static_cast<VarMsg::Type>(framework::ToDataType(tensor.type())));
+  for (auto& dim : framework::vectorize(tensor.dims())) {
+    request->add_dims(dim);
+  }
+  const framework::LoD lod = tensor.lod();
+  if (lod.size() > 0) {
+    request->set_lod_level(lod.size());
+    for (auto& each : lod) {
+      VarMsg::LodData* lod_inner = request->add_lod();
+      for (auto& d : each) {
+        lod_inner->add_lod_data(d);
+      }
+    }
+  }
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+    PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
+    platform::CUDAPinnedPlace cuda_pinned;
+    auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
+    auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
+    *payload = memory::Alloc(cuda_pinned, copy_size);
+
+    memory::Copy(cuda_pinned, *payload,
+                 boost::get<platform::CUDAPlace>(tensor.place()),
+                 reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
+                 gpu_dev_ctx.stream());
+    ctx.Wait();
+#endif
+  } else {
+    *payload = tensor.data<void>();
+  }
+  *payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
+}
+
+void GetSelectedRowsPayload(framework::Variable* var,
+                            const platform::DeviceContext& ctx, VarMsg* request,
+                            void** payload, size_t* payload_size) {
+  auto* slr = var->GetMutable<framework::SelectedRows>();
+  request->set_data_type(
+      static_cast<VarMsg::Type>(framework::ToDataType(slr->value().type())));
+  request->set_lod_level(0);
+  request->set_slr_height(slr->height());
+
+  for (auto& dim : framework::vectorize(slr->value().dims())) {
+    request->add_dims(dim);
+  }
+
+  auto* tensor = slr->mutable_value();
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+    platform::CUDAPinnedPlace cuda_pinned;
+    auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
+    auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
+    *payload = memory::Alloc(cuda_pinned, copy_size);
+    memory::Copy(cuda_pinned, *payload,
+                 boost::get<platform::CUDAPlace>(tensor->place()),
+                 reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
+                 gpu_dev_ctx.stream());
+    ctx.Wait();
+#endif
+  } else {
+    *payload = slr->mutable_value()->data<void>();
+  }
+  *payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
+}
+
+void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
+                           const platform::DeviceContext& ctx,
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_name) {
+  // Default DestroyCallback does nothing, When using GPU
+  // the CPU buffer need to be freed.
+  DestroyCallback destroy_callback = [](void* backing) {};
+  VarMsg request;
+  void* payload = nullptr;
+  size_t payload_size;
+
+  request.set_varname(name);
+  // Note: normally the profiler is enabled in 1 trainer, hence only
+  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
+  // servers the trainer's profiling state so that PS can follow the
+  // trainer.
+  if (platform::ShouldSendProfileState()) {
+    if (platform::IsProfileEnabled()) {
+      request.set_profile(platform::kEnableProfiler);
+    } else {
+      request.set_profile(platform::kDisableProfiler);
+    }
+  }
+  if (!out_name.empty()) {
+    request.set_out_varname(out_name);
+  }
+  if (var->IsType<framework::LoDTensor>()) {
+    request.set_type(::sendrecv::LOD_TENSOR);
+    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
+  } else if (var->IsType<framework::SelectedRows>()) {
+    request.set_type(::sendrecv::SELECTED_ROWS);
+    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
+#ifdef PADDLE_WITH_CUDA
+  } else if (var->IsType<ncclUniqueId>()) {
+    request.set_type(::sendrecv::NCCL_ID);
+#endif
+  } else {
+    PADDLE_THROW("Serialize does not support type: %s",
+                 typeid(var->Type()).name());
+  }
+
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+    // GPU data is copied to CPU buffer when sending,
+    // free the buffer when possible.
+    destroy_callback = [](void* backing) {
+      platform::CUDAPinnedPlace cuda_pinned;
+      memory::Free(cuda_pinned, backing);
+    };
+#endif
+  }
+
+  std::string header;
+  request.AppendToString(&header);
+  auto buffer = std::unique_ptr<char[]>(new char[1024]);
+  void* buf = buffer.get();
+  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
+  e.WriteRawBytes(std::string(header.data(), header.size()));
+// NCCLID is copied directly to the message, return bytebuffer
+// with only one slice if serializing NCCLID.
+#ifdef PADDLE_WITH_CUDA
+  if (var->IsType<ncclUniqueId>()) {
+    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
+                              NCCL_UNIQUE_ID_BYTES);
+    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
+    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));
+
+    // for serialize NCCL_ID
+    ::grpc::Slice slices(e.size());
+    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
+    ::grpc::ByteBuffer tmp(&slices, 1);
+    msg->Swap(&tmp);
+    return;
+  }
+#endif
+
+  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
+  // steal reference of tensor data
+  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
+  int num_slices = 2;       // only SelectedRows have rows buffer
+  slices[0] = ::grpc::Slice(e.size());
+  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
+  slices[1] = ::grpc::Slice(
+      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
+                                    static_cast<char*>(payload)),
+      ::grpc::Slice::STEAL_REF);
+
+  if (var->IsType<framework::SelectedRows>()) {
+    auto* slr = var->GetMutable<framework::SelectedRows>();
+    ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
+    size_t rows_memory_size =
+        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
+    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
+    slices[2] = ::grpc::Slice(e2.size());
+    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
+
+    slices[3] = ::grpc::Slice(
+        grpc_slice_new_with_user_data(
+            const_cast<void*>(
+                reinterpret_cast<const void*>(slr->rows().data())),
+            rows_memory_size, [](void* backing) {},
+            const_cast<char*>(
+                reinterpret_cast<const char*>(slr->rows().data()))),
+        ::grpc::Slice::STEAL_REF);
+    num_slices = 4;
+  }
+
+  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
+  msg->Swap(&tmp);
+}
+
+void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                               const platform::DeviceContext& ctx,
+                               const framework::Scope* scope,
+                               framework::Variable** var) {
+  operators::distributed::VariableResponse resp(scope, &ctx);
+  PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
+  *var = resp.GetVar();
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe25e73fa608727ba0bb912a82776b330ec8d83a
--- /dev/null
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <sys/time.h>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+typedef void (*DestroyCallback)(void*);
+
+void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
+                           const platform::DeviceContext& ctx,
+                           ::grpc::ByteBuffer* msg,
+                           const std::string& out_varname = std::string());
+
+void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
+                               const platform::DeviceContext& ctx,
+                               const framework::Scope* scope,
+                               framework::Variable** var);
+
+inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
+  switch (type) {
+    case sendrecv::VariableMessage::FP32:
+      return typeid(float);  // NOLINT
+    case sendrecv::VariableMessage::FP64:
+      return typeid(double);  // NOLINT
+    case sendrecv::VariableMessage::INT32:
+      return typeid(int);  // NOLINT
+    case sendrecv::VariableMessage::INT64:
+      return typeid(int64_t);  // NOLINT
+    case sendrecv::VariableMessage::BOOL:
+      return typeid(bool);  // NOLINT
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
new file mode 100644
index 0000000000000000000000000000000000000000..45832c60bf9172497afabac927ba39a7cbfb9a52
--- /dev/null
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -0,0 +1,489 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/distributed/variable_response.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+#ifdef PADDLE_WITH_CUDA
+#include <nccl.h>
+#endif
+#include "paddle/fluid/platform/profiler.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+enum WireType {
+  WIRETYPE_VARINT = 0,
+  WIRETYPE_LENGTH_DELIMITED = 2,
+};
+
+inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
+
+inline WireType GetTagWireType(uint32_t tag) {
+  return static_cast<WireType>(tag & 0x7);
+}
+
+bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
+                         int* result) {
+  uint64_t v;
+  if (input->ReadVarint64(&v) && v <= static_cast<uint64_t>(INT_MAX)) {
+    *result = static_cast<int>(v);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
+             const platform::DeviceContext& dev_ctx, platform::Place place,
+             void* dest, int size) {
+  const void* data = NULL;
+  int size_to_write = 0;
+  int length = size;
+  int total_written = 0;
+
+  if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+    auto& gpu_dev_ctx =
+        static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+    platform::CPUPlace cpu;
+
+    char* p = reinterpret_cast<char*>(dest);
+    while (total_written < length) {
+      if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
+        return false;
+      }
+      // NOTE: if raw buffer is large and have two neighbor fields of raw
+      // buffers GetDirectBufferPointer can get all of them, use length to
+      // truncate it.
+      if (total_written + size_to_write > length) {
+        size_to_write = length - total_written;
+      }
+      // This log is useful to see how long a internal block size is of rpc.
+      VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
+      memory::Copy(boost::get<platform::CUDAPlace>(place),
+                   reinterpret_cast<void*>(p), cpu, data, size_to_write,
+                   gpu_dev_ctx.stream());
+      p += size_to_write;
+      total_written += size_to_write;
+
+      input->Skip(size_to_write);
+    }
+    gpu_dev_ctx.Wait();
+#else
+    PADDLE_THROW("Unexpected branch");
+#endif
+    return true;
+  }
+
+  char* p = reinterpret_cast<char*>(dest);
+  while (total_written < length) {
+    if (!input->GetDirectBufferPointer(&data, &size_to_write)) {
+      return false;
+    }
+    // NOTE: if raw buffer is large and have two neighbor fields of raw buffers
+    // GetDirectBufferPointer can get all of them, use length to truncate it.
+    if (total_written + size_to_write > length) {
+      size_to_write = length - total_written;
+    }
+    // TODO(gongwb): can we avoid copy?
+    platform::CPUPlace cpu;
+    // This log is useful to see how long a internal block size is of rpc.
+    VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
+    memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);
+
+    p += size_to_write;
+    total_written += size_to_write;
+
+    input->Skip(size_to_write);
+  }
+
+  return true;
+}
+
+bool VariableResponse::CopyLodTensorData(
+    ::google::protobuf::io::CodedInputStream* input,
+    const platform::DeviceContext& ctx, const framework::DDim& dims,
+    int length) {
+  auto* tensor = GetVar()->GetMutable<framework::LoDTensor>();
+  tensor->Resize(dims);
+
+  framework::LoD lod;
+  for (int i = 0; i < meta_.lod_level(); ++i) {
+    framework::Vector<size_t> v;
+    for (int j = 0; j < meta_.lod(i).lod_data_size(); ++j) {
+      v.push_back(meta_.lod(i).lod_data(j));
+    }
+    lod.push_back(v);
+  }
+  tensor->set_lod(lod);
+
+  void* tensor_data =
+      tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
+
+  if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
+    return false;
+  }
+
+  return true;
+}
+
+inline framework::DDim GetDims(
+    const ::google::protobuf::RepeatedField<::google::protobuf::int64>& dims) {
+  std::vector<int> vecdims;
+  for (auto& d : dims) {
+    vecdims.push_back(d);
+  }
+  return framework::make_ddim(vecdims);
+}
+
+bool VariableResponse::CopySelectRowsTensorData(
+    ::google::protobuf::io::CodedInputStream* input,
+    const platform::DeviceContext& ctx, const framework::DDim& dims,
+    int length) {
+  auto* slr = GetVar()->GetMutable<framework::SelectedRows>();
+  slr->set_height(meta_.slr_height());
+  auto* tensor = slr->mutable_value();
+  tensor->Resize(dims);
+  PADDLE_ENFORCE_EQ(static_cast<size_t>(tensor->numel()),
+                    length / framework::SizeOfType(
+                                 paddle::operators::distributed::ToTypeIndex(
+                                     meta_.data_type())));
+  void* tensor_data = tensor->mutable_data(
+      ctx.GetPlace(),
+      paddle::operators::distributed::ToTypeIndex(meta_.data_type()));
+
+  if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool VariableResponse::CopySelectRowsData(
+    ::google::protobuf::io::CodedInputStream* input,
+    const platform::DeviceContext& ctx, int length) {
+  auto* slr = GetVar()->GetMutable<framework::SelectedRows>();
+  slr->mutable_rows()->resize(length /
+                              framework::SizeOfType(typeid(int64_t)));  // int64
+  int64_t* rows_data = slr->mutable_rows()->data();
+
+  // copy rows CPU data, GPU data will be copied lazily.
+  platform::CPUPlace cpu;
+  if (!ReadRaw(input, ctx, cpu, rows_data, length)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
+                  std::vector<int64_t>* lod) {
+  while (true) {
+    auto p = input->ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+
+    if (!p.second) {
+      return (tag == 0);
+    }
+
+    switch (tag) {
+      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
+        uint64_t v;
+        if (wt == WIRETYPE_VARINT) {
+          if (!input->ReadVarint64(&v)) {
+            return false;
+          }
+          lod->push_back(v);
+          break;
+        }
+
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int num_bytes = 0;
+          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
+            return tag;
+          }
+          int start_pos = input->CurrentPosition();
+          while (input->CurrentPosition() - start_pos < num_bytes) {
+            uint64_t v;
+            if (!input->ReadVarint64(&v)) {
+              return tag;
+            }
+            lod->push_back(v);
+          }
+          break;
+        }
+
+        return false;
+      }
+      default: { return false; }
+    }
+  }
+
+  return true;
+}
+
+int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
+  GrpcByteBufferSource source;
+  source.Init(byte_buffer);
+  GrpcByteBufferSourceWrapper r(&source);
+
+  return Parse(&r);
+}
+
+int VariableResponse::Parse(Source* source) {
+  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
+      source->contents();
+  ::google::protobuf::io::CodedInputStream input(input_stream);
+  input.SetTotalBytesLimit(INT_MAX, INT_MAX);
+
+  while (true) {
+    auto p = input.ReadTagWithCutoff(127);
+    int tag = GetTagFieldNumber(p.first);
+    WireType wt = GetTagWireType(p.first);
+    if (!p.second) {
+      if (tag != 0) {
+        return -1;
+      }
+      return 0;
+    }
+
+    switch (tag) {
+      case sendrecv::VariableMessage::kVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kTypeFieldNumber: {
+        uint32_t v;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
+          return tag;
+        }
+
+        meta_.set_type(static_cast<::sendrecv::VarType>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
+        uint32_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
+          return tag;
+        }
+
+        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kDimsFieldNumber: {
+        // not packed
+        if (wt == WIRETYPE_VARINT) {
+          uint64_t v;
+          if (!input.ReadVarint64(&v)) {
+            return tag;
+          }
+          meta_.add_dims(v);
+          break;
+        }
+
+        // packed
+        if (wt == WIRETYPE_LENGTH_DELIMITED) {
+          int num_bytes = 0;
+          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
+            return tag;
+          }
+          int start_pos = input.CurrentPosition();
+          while (input.CurrentPosition() - start_pos < num_bytes) {
+            uint64_t v;
+            if (!input.ReadVarint64(&v)) {
+              return tag;
+            }
+            meta_.add_dims(v);
+          }
+          break;
+        }
+        return tag;
+      }
+      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_lod_level(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kLodFieldNumber: {
+        int length = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &length)) {
+          return tag;
+        }
+
+        std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
+            input.IncrementRecursionDepthAndPushLimit(length);
+
+        std::vector<int64_t> lod_data;
+        if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
+          return tag;
+        }
+
+        if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
+          return false;
+        }
+
+        if (lod_data.size() == 0) {
+          break;
+        }
+
+        auto lod = meta_.add_lod();
+        for (uint32_t i = 0; i < lod_data.size(); i++) {
+          lod->add_lod_data(lod_data[i]);
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
+        uint64_t v = 0;
+        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
+          return tag;
+        }
+        meta_.set_slr_height(static_cast<int64_t>(v));
+        break;
+      }
+      case sendrecv::VariableMessage::kSerializedFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR ||
+                        meta_.type() == sendrecv::NCCL_ID) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+
+        int num_bytes = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
+          return tag;
+        }
+
+        if (meta_.type() == sendrecv::NCCL_ID) {
+#ifdef PADDLE_WITH_CUDA
+          auto* var = scope_->FindVar(meta_.varname());
+          if (var != nullptr) {
+            ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
+            if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal,
+                         num_bytes)) {
+              return tag;
+            }
+          }
+          break;
+#else
+          PADDLE_THROW("Not compiled with CUDA!");
+#endif
+        }
+
+        framework::DDim dims = GetDims(meta_.dims());
+        if (meta_.type() == sendrecv::LOD_TENSOR) {
+          PADDLE_ENFORCE(meta_.lod_size() >= 0,
+                         "lod info should be got first!");
+          if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) {
+            return tag;
+          }
+          break;
+        }
+
+        if (meta_.type() == sendrecv::SELECTED_ROWS) {
+          if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) {
+            return tag;
+          }
+          break;
+        }
+
+        return tag;
+      }
+      case sendrecv::VariableMessage::kRowsFieldNumber: {
+        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
+                        meta_.type() == sendrecv::LOD_TENSOR) &&
+                           meta_.varname() != "",
+                       "meta info should be got first!");
+
+        int num_bytes = 0;
+        if (wt != WIRETYPE_LENGTH_DELIMITED ||
+            !ReadVarintSizeAsInt(&input, &num_bytes)) {
+          return tag;
+        }
+
+        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
+          return tag;
+        }
+        break;
+      }
+      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
+        uint32_t length;
+        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
+          return tag;
+        }
+
+        std::string temp;
+        if (!input.ReadString(&temp, length)) {
+          return tag;
+        }
+
+        meta_.set_out_varname(temp);
+        break;
+      }
+      case sendrecv::VariableMessage::kProfileFieldNumber: {
+        uint64_t profiling = 0;
+        if (!input.ReadVarint64(&profiling)) {
+          return tag;
+        }
+        meta_.set_profile(profiling);
+        int64_t listener_id = platform::ListenerId();
+        if (listener_id <= 0) {
+          break;
+        }
+        if (profiling == platform::kEnableProfiler &&
+            !platform::IsProfileEnabled()) {
+          platform::EnableProfiler(platform::ProfilerState::kCPU);
+        } else if (profiling == platform::kDisableProfiler &&
+                   platform::IsProfileEnabled()) {
+          // TODO(panyx0718): Should we allow to customize file dir.
+          platform::DisableProfiler(
+              platform::EventSortingKey::kDefault,
+              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
+        }
+        break;
+      }
+      default: {
+        // Unknown tag, return unknown error.
+        return -1;
+      }
+    }
+  }
+
+  return 0;
+}
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h
new file mode 100644
index 0000000000000000000000000000000000000000..1db4a0a522654ff2497b8bd9ee1381b5ab64067a
--- /dev/null
+++ b/paddle/fluid/operators/distributed/variable_response.h
@@ -0,0 +1,104 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/var_type.h"
+
+#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
+#include "paddle/fluid/operators/distributed/send_recv.pb.h"
+
+#include "google/protobuf/io/coded_stream.h"
+#include "google/protobuf/io/zero_copy_stream.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class VariableResponse {
+ public:
+  VariableResponse(const framework::Scope* scope,
+                   const platform::DeviceContext* dev_ctx,
+                   bool create_scope = false)
+      : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) {
+    if (create_scope) {
+      local_scope_ = &scope->NewScope();
+    }
+  }
+
+  virtual ~VariableResponse() {
+    if (create_scope_) {
+      scope_->DeleteScope(local_scope_);
+    }
+  }
+
+  // return:
+  // 0:ok.
+  // -1: unkown error.
+  // other: number of error field.
+  int Parse(Source* source);
+
+  // return:
+  // 0:ok.
+  // -1: unkown error.
+  // other: number of error field.
+  int Parse(const ::grpc::ByteBuffer& byte_buffer);
+
+  const framework::Scope& GetLocalScope() const { return *local_scope_; }
+
+  framework::Scope* GetMutableLocalScope() const { return local_scope_; }
+
+  inline std::string Varname() const { return meta_.varname(); }
+  inline std::string OutVarname() const { return meta_.out_varname(); }
+
+  // should call parse first.
+  framework::Variable* GetVar() {
+    if (create_scope_) {
+      return local_scope_->Var(meta_.varname());
+    }
+    return scope_->FindVar(meta_.varname());
+  }
+
+ private:
+  bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
+                                const platform::DeviceContext& ctx,
+                                const framework::DDim& dims, int length);
+
+  bool CopySelectRowsData(::google::protobuf::io::CodedInputStream* input,
+                          const platform::DeviceContext& ctx, int length);
+
+  bool CopyLodTensorData(::google::protobuf::io::CodedInputStream* input,
+                         const platform::DeviceContext& ctx,
+                         const framework::DDim& dims, int length);
+
+ private:
+  const framework::Scope* scope_;
+  const platform::DeviceContext* dev_ctx_;
+  bool create_scope_ = false;
+  framework::Scope* local_scope_ = nullptr;
+  // only Skeleton
+  sendrecv::VariableMessage meta_;
+};
+
+};  // namespace distributed
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc
index 1074ed6acc22a81f46c466d917ef973945a12898..07322e720f26213ea777be3cd22f2fead28507f0 100644
--- a/paddle/fluid/operators/dropout_op.cc
+++ b/paddle/fluid/operators/dropout_op.cc
@@ -35,11 +35,9 @@ class DropoutOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of dropout op.");
     AddOutput("Out", "The output of dropout op.");
     AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
@@ -73,7 +71,6 @@ are set equal to their corresponding inputs.
   }
 };
 
-template <typename AttrType>
 class DropoutOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -103,11 +100,11 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
-            ops::DropoutOpGrad<float>);
+REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    dropout,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
     dropout_grad,
     ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu
index d6f9c04359d733cb4f3f0586e9239ee67deb7078..1dd66e0280c46c0624ff70e822cb6fa6f06b7aa9 100644
--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -18,43 +18,54 @@ limitations under the License. */
 #include <thrust/random.h>
 #include <thrust/transform.h>
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T, typename AttrType>
+template <typename T>
 __global__ void RandomGenerator(const size_t n, const int seed,
-                                const AttrType dropout_prob, const T* src,
+                                const float dropout_prob, const T* src,
                                 T* mask_data, T* dst) {
   thrust::minstd_rand rng;
   rng.seed(seed);
-  thrust::uniform_real_distribution<AttrType> dist(0, 1);
+  thrust::uniform_real_distribution<float> dist(0, 1);
 
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int step_size = 0;
+
+  T mask;
+  T dest;
   for (; idx < n; idx += blockDim.x * gridDim.x) {
+    T s = src[idx];
+    if (step_size == 0) {
+      rng.discard(idx);
+      step_size = blockDim.x * gridDim.x;
+    } else {
+      rng.discard(step_size);
+    }
     if (dist(rng) < dropout_prob) {
-      mask_data[idx] = static_cast<T>(0);
+      mask = static_cast<T>(0);
     } else {
-      mask_data[idx] = static_cast<T>(1);
+      mask = static_cast<T>(1);
     }
-    dst[idx] = mask_data[idx] * src[idx];
+    dest = s * mask;
+    mask_data[idx] = mask;
+    dst[idx] = dest;
   }
 }
 
 // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
-template <typename Place, typename T, typename AttrType>
+template <typename Place, typename T>
 class GPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
     auto* y = context.Output<Tensor>("Out");
     y->mutable_data<T>(context.GetPlace());
-    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
-
-    auto X = EigenMatrix<T>::Reshape(*x, 1);
-    auto Y = EigenMatrix<T>::Reshape(*y, 1);
+    float dropout_prob = context.Attr<float>("dropout_prob");
 
     auto& place = *context.template device_context<Place>().eigen_device();
     if (!context.Attr<bool>("is_test")) {
@@ -70,11 +81,13 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 
       int threads = 512;
       int grid = (x->numel() + threads - 1) / threads;
-      RandomGenerator<T, AttrType><<<grid, threads, 0,
-                                     context.cuda_device_context().stream()>>>(
+      RandomGenerator<
+          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
           size, seed, dropout_prob, x_data, mask_data, y_data);
     } else {
-      Y.device(place) = X * (1.0f - dropout_prob);
+      auto X = EigenMatrix<T>::Reshape(*x, 1);
+      auto Y = EigenMatrix<T>::Reshape(*y, 1);
+      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
     }
   }
 };
@@ -83,9 +96,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    dropout,
-    ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>);
-REGISTER_OP_CUDA_KERNEL(
-    dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
+    dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(dropout_grad,
+                        ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h
index 209e4dec1756dc65fbf147c4dbbf0913d3c6ef7e..0628b4b826d2730a8e3fb4842e4ae550b8c00569 100644
--- a/paddle/fluid/operators/dropout_op.h
+++ b/paddle/fluid/operators/dropout_op.h
@@ -11,9 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
+
 #include <random>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -25,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
-template <typename DeviceContext, typename T, typename AttrType>
+template <typename DeviceContext, typename T>
 class CPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..424d273c34b7e8d70c88b591c4fe45db61465f38
--- /dev/null
+++ b/paddle/fluid/operators/dropout_op_test.cc
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(dropout);
+
+void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto var = scope->Var("X");
+  auto tensor = var->GetMutable<f::LoDTensor>();
+  tensor->Resize({10, 10});
+
+  std::vector<float> init;
+  for (int64_t i = 0; i < 10 * 10; ++i) {
+    init.push_back(1.0);
+  }
+
+  TensorFromVector(init, ctx, tensor);
+
+  auto place = ctx.GetPlace();
+  auto out_var = scope->Var("Out");
+  auto out_tensor = out_var->GetMutable<f::LoDTensor>();
+  out_tensor->Resize({10, 10});
+  out_tensor->mutable_data<float>(place);  // allocate
+
+  auto mask_var = scope->Var("Mask");
+  auto mask_tensor = mask_var->GetMutable<f::LoDTensor>();
+  mask_tensor->Resize({10, 10});
+  mask_tensor->mutable_data<float>(place);  // allocate
+
+  // run
+  f::AttributeMap attrs;
+  float dropout_prob = 0.5;
+  attrs.insert({"fix_seed", 1});
+  attrs.insert({"seed", 3});
+  attrs.insert({"dropout_prob", dropout_prob});
+  auto dropout_op = f::OpRegistry::CreateOp(
+      "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs);
+
+  dropout_op->Run(*scope, place);
+
+  std::vector<float> out_vec;
+  TensorToVector(*out_tensor, ctx, &out_vec);
+
+  std::vector<float> std_out = {
+      0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
+      1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
+      1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
+      1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
+      1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1};
+
+  EXPECT_EQ(out_vec.size(), std_out.size());
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], std_out[i]);
+  }
+}
+
+// TODO(wyi): Due to
+// https://github.com/PaddlePaddle/Paddle/issues/9507, I temporarily
+// disable this test to remove the prevention of the merge of
+// unrelated PRs.
+/*
+TEST(Dropout, CPUDense) {
+  f::Scope scope;
+  p::CPUPlace place;
+  p::CPUDeviceContext ctx(place);
+  Compare(scope, ctx);
+}
+
+TEST(Dropout, GPUDense) {
+  f::Scope scope;
+  p::CUDAPlace place;
+  p::CUDADeviceContext ctx(place);
+  Compare(scope, ctx);
+}
+*/
diff --git a/paddle/fluid/operators/edit_distance_op.cc b/paddle/fluid/operators/edit_distance_op.cc
index c7f037d2df4372d0c4e3a261c0dff1fd6704d182..de25a3dab53492e38a92fbcf07ccbe43f7546950 100644
--- a/paddle/fluid/operators/edit_distance_op.cc
+++ b/paddle/fluid/operators/edit_distance_op.cc
@@ -49,8 +49,7 @@ class EditDistanceOp : public framework::OperatorWithKernel {
 
 class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Hyps",
              "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
              "The indices for hypothesis strings.");
diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu
index 3b89ad5d49c339cf05abc0f8577e895f30dddfd4..c25b7d2f9ec32bcef44db239de43feefd855bfe5 100644
--- a/paddle/fluid/operators/edit_distance_op.cu
+++ b/paddle/fluid/operators/edit_distance_op.cu
@@ -14,8 +14,9 @@ limitations under the License. */
 
 #include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/edit_distance_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f612256840825a75f49944ab97ff957d572a863
--- /dev/null
+++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
@@ -0,0 +1,190 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/operators/elementwise_op_function.h"
+
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::reorder;
+using mkldnn::primitive;
+using mkldnn::stream;
+using mkldnn::sum;
+
+template <typename T>
+class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    const T* x_data = x->data<T>();
+    const T* y_data = y->data<T>();
+    T* z_data = z->mutable_data<T>(ctx.GetPlace());
+
+    int axis = ctx.Attr<int>("axis");
+
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+    auto z_dims = z->dims();
+
+    // Execute default elementwise_add operator when
+    // broadcast operations need to performed.
+    if (x_dims != y_dims) {
+      auto sum_func = [](T a, T b) -> T { return a + b; };
+
+      TransformFunctor<decltype(sum_func), T,
+                       paddle::platform::CPUDeviceContext, T>
+          functor(
+              x, y, z,
+              ctx.template device_context<paddle::platform::CPUDeviceContext>(),
+              sum_func);
+
+      axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+      PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                     "Axis should be in range [0, x_dims)");
+
+      trim_trailing_singular_dims(&y_dims);
+      axis = (y_dims.size() == 0) ? x_dims.size() : axis;
+
+      int pre, n, post;
+      get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
+
+      if (post == 1) {
+        functor.RunRowWise(n, pre);
+      } else {
+        functor.RunMidWise(n, pre, post);
+      }
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(x->format());
+    } else {
+      PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN &&
+                         x->format() != memory::format::format_undef,
+                     "Wrong layout/format set for X tensor");
+      PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN &&
+                         y->format() != memory::format::format_undef,
+                     "Wrong layout/format set for X tensor");
+
+      std::vector<int> src_x_tz = framework::vectorize2int(x_dims);
+      std::vector<int> src_y_tz = framework::vectorize2int(y_dims);
+      std::vector<int> dst_tz = framework::vectorize2int(z_dims);
+
+      std::vector<memory::primitive_desc> srcs_pd;
+      std::vector<memory> srcs;
+      std::vector<float> scales = {1.0f, 1.0f};
+
+      auto src_x_pd = memory::primitive_desc(
+          {{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine);
+      auto src_y_pd = memory::primitive_desc(
+          {{src_y_tz}, memory::data_type::f32, y->format()}, mkldnn_engine);
+      auto src_x_memory =
+          memory(src_x_pd, paddle::platform::to_void_cast(x_data));
+      auto src_y_memory =
+          memory(src_y_pd, paddle::platform::to_void_cast(y_data));
+
+      srcs_pd.push_back(src_x_pd);
+      srcs_pd.push_back(src_y_pd);
+      srcs.push_back(src_x_memory);
+      srcs.push_back(src_y_memory);
+
+      auto dst_md =
+          memory::desc({dst_tz}, memory::data_type::f32, memory::format::any);
+
+      // create primitive descriptor for sum
+      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd);
+
+      // create mkldnn memory for dst
+      memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data);
+
+      std::vector<primitive::at> inputs;
+      inputs.push_back(srcs[0]);
+      inputs.push_back(srcs[1]);
+
+      // create sum primitive
+      auto sum_prim = sum(sum_pd, inputs, dst_memory);
+
+      std::vector<primitive> pipeline;
+      pipeline.push_back(sum_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      z->set_layout(DataLayout::kMKLDNN);
+      z->set_format(
+          (memory::format)dst_memory.get_primitive_desc().desc().data.format);
+    }
+  }
+};
+
+template <typename T>
+class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
+
+    auto set_mkldnn_format = [](Tensor* in, const Tensor* out) {
+      in->set_layout(DataLayout::kMKLDNN);
+      in->set_format(out->format());
+    };
+
+    if (x->dims() == y->dims()) {
+      auto blas = math::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
+      if (dx) {
+        blas.VCOPY(dout->numel(), dout->data<T>(),
+                   dx->mutable_data<T>(ctx.GetPlace()));
+        set_mkldnn_format(dx, dout);
+      }
+
+      if (dy) {
+        blas.VCOPY(dout->numel(), dout->data<T>(),
+                   dy->mutable_data<T>(ctx.GetPlace()));
+        set_mkldnn_format(dy, dout);
+      }
+    } else {
+      // Execute default kernel when broadcast is needed
+      ElemwiseGradCompute<paddle::platform::CPUDeviceContext, T,
+                          IdentityGrad<T>, IdentityGrad<T>>(
+          ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
+          IdentityGrad<T>());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseAddMKLDNNKernel<float>)
+
+REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::EltwiseAddMKLDNNGradKernel<float>)
diff --git a/paddle/fluid/operators/elementwise_add_op.cc b/paddle/fluid/operators/elementwise_add_op.cc
index 4aab54f60236ecc5fa7f70e22f1553c3bfe68198..d2c20537136fc3ac9d1bece24a2238f26215c922 100644
--- a/paddle/fluid/operators/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise_add_op.cc
@@ -14,26 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_add_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseAddOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Add", "Out = X + Y");
-    AddComment(comment_);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(elementwise_add, ops::ElementwiseOp,
-                  ops::ElementwiseAddOpMaker, ops::ElementwiseOpInferVarType,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpGrad);
-
+REGISTER_ELEMWISE_OP(elementwise_add, "Add", "Out = X + Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_add,
     ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_add_op.cu b/paddle/fluid/operators/elementwise_add_op.cu
index 19dc4a52152e2a7aa71476d4f0ef692d0af97b4a..dfff518f170b56d180b6883c363effb8dbd677b6 100644
--- a/paddle/fluid/operators/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise_add_op.cu
@@ -14,19 +14,20 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise_add_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
-    elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int64_t>);
+    elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
+    ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext,
-                                  int64_t>);
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
index 253964562c8d34e0fda3b4760761206895f749aa..baf04c30b17cb333fc8a6544afd6c479442f835b 100644
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -24,19 +26,57 @@ struct AddFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
 };
 
+template <typename DeviceContext, typename T>
+void default_elementwise_add(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y, framework::Tensor* z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                        AddFunctor<T>(), z);
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  auto eigen_x = framework::EigenVector<T>::Flatten(*x);
+  auto eigen_y = framework::EigenVector<T>::Flatten(*y);
+  auto eigen_z = framework::EigenVector<T>::Flatten(*z);
+
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+  blas.VADD(x->numel(), eigen_x.data(), eigen_y.data(), eigen_z.data());
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add(const framework::ExecutionContext& ctx,
+                const framework::Tensor* x, const framework::Tensor* y,
+                framework::Tensor* z) {
+  default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
+}
+
 template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     using Tensor = framework::Tensor;
 
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* z = ctx.Output<Tensor>("Out");
+    const auto x = ctx.Input<Tensor>("X");
+    const auto y = ctx.Input<Tensor>("Y");
+    auto z = ctx.Output<Tensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
-    int axis = ctx.Attr<int>("axis");
-    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
-                                                          AddFunctor<T>(), z);
+
+    auto dims_equal = x->dims() == y->dims();
+    if (dims_equal) {
+      elementwise_add<DeviceContext, T>(ctx, x, y, z);
+    } else {
+      default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
+    }
   }
 };
 
@@ -45,6 +85,55 @@ struct IdentityGrad {
   HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
 };
 
+template <typename DeviceContext, typename T>
+void default_elementwise_add_grad(const framework::ExecutionContext& ctx,
+                                  const framework::Tensor* x,
+                                  const framework::Tensor* y,
+                                  const framework::Tensor* out,
+                                  const framework::Tensor* dout,
+                                  framework::Tensor* dx,
+                                  framework::Tensor* dy) {
+  int axis = ctx.Attr<int>("axis");
+
+  ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
+      ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
+      IdentityGrad<T>());
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_grad(const framework::ExecutionContext& ctx,
+                     const framework::Tensor* x, const framework::Tensor* y,
+                     const framework::Tensor* out,
+                     const framework::Tensor* dout, framework::Tensor* dx,
+                     framework::Tensor* dy) {
+  auto blas = math::GetBlas<DeviceContext, T>(ctx);
+
+  if (dx) {
+    blas.VCOPY(dout->numel(), dout->data<T>(),
+               dx->mutable_data<T>(ctx.GetPlace()));
+  }
+
+  if (dy) {
+    blas.VCOPY(dout->numel(), dout->data<T>(),
+               dy->mutable_data<T>(ctx.GetPlace()));
+  }
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value ||
+    !std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+elementwise_add_grad(const framework::ExecutionContext& ctx,
+                     const framework::Tensor* x, const framework::Tensor* y,
+                     const framework::Tensor* out,
+                     const framework::Tensor* dout, framework::Tensor* dx,
+                     framework::Tensor* dy) {
+  default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
+}
+
 template <typename DeviceContext, typename T>
 class ElementwiseAddGradKernel : public framework::OpKernel<T> {
  public:
@@ -57,10 +146,13 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
-    ElemwiseGradCompute<DeviceContext, T, IdentityGrad<T>, IdentityGrad<T>>(
-        ctx, *x, *y, *out, *dout, axis, dx, dy, IdentityGrad<T>(),
-        IdentityGrad<T>());
+
+    if (platform::is_cpu_place(ctx.GetPlace()) && (x->dims() == y->dims())) {
+      elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
+    } else {
+      default_elementwise_add_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
+                                                     dy);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise_div_op.cc b/paddle/fluid/operators/elementwise_div_op.cc
index 6f9a090c8ea660d023acece096b48d29aa2f35f7..824b1221e5a77c8799dc34820b7f0db180c2439e 100644
--- a/paddle/fluid/operators/elementwise_div_op.cc
+++ b/paddle/fluid/operators/elementwise_div_op.cc
@@ -14,24 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_div_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseDivOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Div", "Out = X / Y");
-    AddComment(comment_);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
-            elementwise_div_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_div, "Div", "Out = X / Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_div,
     ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_max_op.cc b/paddle/fluid/operators/elementwise_max_op.cc
index 61da7c59441df22d71316b13f131399d3cd55f3a..411671335a19ae2283ca9db8b8f6bcbb6a6b630a 100644
--- a/paddle/fluid/operators/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise_max_op.cc
@@ -14,23 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_max_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseMaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Max", "Out = max(X, Y)");
-    AddComment(comment_);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker,
-            elementwise_max_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_max, "Max", "Out = max(X, Y)");
 REGISTER_OP_CPU_KERNEL(
     elementwise_max,
     ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_min_op.cc b/paddle/fluid/operators/elementwise_min_op.cc
index c74ff36db17579182e3c7e93a5adc5fe79fbcadd..816192083d2275b26e6dd9afc76f2c021a01cf73 100644
--- a/paddle/fluid/operators/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise_min_op.cc
@@ -14,23 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_min_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseMinOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseMinOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Max", "Out = min(X, Y)");
-    AddComment(comment_);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker,
-            elementwise_min_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_min, "Min", "Out = min(X, Y)");
 REGISTER_OP_CPU_KERNEL(
     elementwise_min,
     ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise_mul_op.cc
index 5d7f2cdffd11dfef8df22175dd0570b277c0e13a..7cd67e74de6b9c4fbc718f60b4f671ccab2f9956 100644
--- a/paddle/fluid/operators/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise_mul_op.cc
@@ -14,25 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_mul_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-
-class ElementwiseMulOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Mul", "Out = X \\odot\\ Y");
-    AddComment(comment_);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
-            elementwise_mul_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_mul, "Mul", "Out = X \\\\odot Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
     ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index f04d8d8fd82ed2336dff9c5b88808dc32de6630a..bb88970e42c194d9437609b62435f1a89e2b446b 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -13,8 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
+#include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -39,86 +44,92 @@ class ElementwiseOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", x_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class ElementwiseOpInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
-    auto x_var = op_desc.Input("X")[0];
-    auto out_var = op_desc.Output("Out")[0];
-    block->Var(out_var)->SetType(block->Var(x_var)->GetType());
+    auto x_name = op_desc.Input("X")[0];
+    auto out_name = op_desc.Output("Out")[0];
+    auto& x = block->FindRecursiveOrCreateVar(x_name);
+    auto& out = block->FindRecursiveOrCreateVar(out_name);
+    out.SetType(x.GetType());
   }
 };
 
 class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() final {
     AddInput("X", "(Tensor), The first input tensor of elementwise op.");
     AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
-    AddOutput("Out", "The output of elementwise op.");
+    AddOutput("Out", "The output of elementwise op.").Reuse("X");
     AddAttr<int>("axis",
                  "(int, default -1). The start dimension index "
                  "for broadcasting Y onto X.")
         .SetDefault(-1)
         .EqualGreaterThan(-1);
-    comment_ = R"DOC(
-Limited Elementwise {name} Operator.
+    AddAttr<bool>("use_mkldnn", "(bool, default false). Used by MKLDNN.")
+        .SetDefault(false);
+    AddComment(string::Sprintf(R"DOC(
+Limited Elementwise %s Operator
 
 The equation is:
 
-$${equation}$$
+$$%s$$
 
-$X$ is a tensor of any dimension and the dimensions of tensor $Y$ must be
-smaller than or equal to the dimensions of $X$.
+- $X$: a tensor of any dimension. 
+- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.
 
 There are two cases for this operator:
-1. The shape of $Y$ is same with $X$;
-2. The shape of $Y$ is a congiguous subsequencet of $X$. The trailing dimensions
-   of size 1 for $Y$ will be ignored for the consideration of subsequence.
 
+1. The shape of $Y$ is the same with $X$.
+2. The shape of $Y$ is a continuous subsequence of $X$.
 
 For case 2:
 
-$Y$ will be broadcasted to match the shape of $X$ and axis should be
-set to index of the start dimension to broadcast $Y$ onto $X$.
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index 
+   for broadcasting $Y$ onto $X$. 
+2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of 
+   subsequence, such as shape(Y) = (2, 1) => (2).
 
-If axis is -1, it is treated as axis=rank(X)-rank(Y).
+For example:
 
-For example
   .. code-block:: python
 
     shape(X) = (2, 3, 4, 5), shape(Y) = (,)
     shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2
     shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
     shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
 
-Either of the inputs $X$ and $Y$ or none can carry the LoD (Level of Details)
-information. However, the output only shares the LoD information with input $X$.
+The inputs $X$ and $Y$ can carry the different LoD information. 
+But the output only shares the LoD information with the input $X$.
 
-)DOC";
-    AddComment(comment_);
+)DOC",
+                               GetName(), GetEquation()));
   }
 
  protected:
-  std::string comment_;
-
-  void Replace(std::string& src, std::string from, std::string to) {
-    std::size_t len_from = std::strlen(from.c_str());
-    std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src.find(from); pos != std::string::npos;
-         pos = src.find(from, pos + len_to)) {
-      src.replace(pos, len_from, to);
-    }
-  }
-
-  void SetComment(std::string name, std::string equation) {
-    Replace(comment_, "{name}", name);
-    Replace(comment_, "{equation}", equation);
-  }
+  virtual std::string GetName() const = 0;
+  virtual std::string GetEquation() const = 0;
 };
 
 class ElementwiseOpGrad : public framework::OperatorWithKernel {
@@ -148,6 +159,34 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(y_grad_name, y_dims);
     }
   }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 }  // namespace operators
 }  // namespace paddle
+
+#define REGISTER_ELEMWISE_OP(op_type, op_name, equation)                \
+  class __ElemwiseOp##op_type##Maker__                                  \
+      : public ::paddle::operators::ElementwiseOpMaker {                \
+   protected:                                                           \
+    virtual std::string GetName() const { return op_name; }             \
+    virtual std::string GetEquation() const { return equation; }        \
+  };                                                                    \
+  REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp,        \
+                    __ElemwiseOp##op_type##Maker__,                     \
+                    ::paddle::operators::ElementwiseOpInferVarType,     \
+                    ::paddle::framework::DefaultGradOpDescMaker<true>); \
+  REGISTER_OPERATOR(op_type##_grad, ::paddle::operators::ElementwiseOpGrad)
diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h
index 0b4238436ffcc586fe8bc7abbe4cfbc1654dcb88..8b052611f80ddf874ca48c1c58e13346528a834e 100644
--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -13,14 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/transform.h"
 
 #ifdef __NVCC__
+#include <cuda.h>
 #include <thrust/iterator/iterator_adaptor.h>
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
 #endif
 
@@ -43,35 +46,35 @@ namespace operators {
  */
 inline void get_mid_dims(const framework::DDim& x_dims,
                          const framework::DDim& y_dims, const int axis,
-                         int& pre, int& n, int& post) {
-  pre = 1;
-  n = 1;
-  post = 1;
+                         int* pre, int* n, int* post) {
+  *pre = 1;
+  *n = 1;
+  *post = 1;
   for (int i = 0; i < axis; ++i) {
-    pre *= x_dims[i];
+    (*pre) *= x_dims[i];
   }
 
   for (int i = 0; i < y_dims.size(); ++i) {
     PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
                       "Broadcast dimension mismatch.");
-    n *= y_dims[i];
+    (*n) *= y_dims[i];
   }
 
   for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    post *= x_dims[i];
+    (*post) *= x_dims[i];
   }
 }
 
-inline void trim_trailing_singular_dims(framework::DDim& dims) {
+inline void trim_trailing_singular_dims(framework::DDim* dims) {
   // Remove trailing dimensions of size 1 for y
-  auto actual_dims_size = dims.size();
+  auto actual_dims_size = dims->size();
   for (; actual_dims_size != 0; --actual_dims_size) {
-    if (dims[actual_dims_size - 1] != 1) break;
+    if ((*dims)[actual_dims_size - 1] != 1) break;
   }
-  if (actual_dims_size != dims.size()) {
-    auto actual_dims = framework::vectorize(dims);
+  if (actual_dims_size != dims->size()) {
+    auto actual_dims = framework::vectorize(*dims);
     actual_dims.resize(actual_dims_size);
-    dims = framework::make_ddim(actual_dims);
+    *dims = framework::make_ddim(actual_dims);
   }
 }
 
@@ -159,7 +162,7 @@ class RowwiseTransformIterator<T, platform::CUDADeviceContext>
       RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
       super_t;
   HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
-      : super_t(x), begin_(x), n_(n){};
+      : super_t(x), begin_(x), n_(n) {}
   friend class thrust::iterator_core_access;
 
  private:
@@ -179,7 +182,7 @@ class MidWiseTransformIterator<T, platform::CUDADeviceContext>
       MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
       super_t;
   HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
-      : super_t(x), begin_(x), n_(n), post_(post){};
+      : super_t(x), begin_(x), n_(n), post_(post) {}
   friend class thrust::iterator_core_access;
 
  private:
@@ -332,6 +335,7 @@ static void ElemwiseGradBroadcast1CPU(const T* x, const T* y, const T* out,
     }
   }
 }
+
 #ifdef __NVCC__
 template <typename T, typename DX_OP, typename DY_OP>
 static __global__ void ElemwiseGradBroadcast1CUDAKernel(
@@ -355,7 +359,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
 
   if (dy) {
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-    val = platform::reduceSum(val, tid, h);
+    val = paddle::platform::reduceSum(val, tid, h);
     if (threadIdx.x == 0) {
       dy[j] = val;
     }
@@ -432,7 +436,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
   if (dy) {
     int h = pre * post;
     h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
-    val = platform::reduceSum(val, tid, h);
+    val = paddle::platform::reduceSum(val, tid, h);
     if (threadIdx.x == 0) {
       dy[j] = val;
     }
@@ -472,11 +476,11 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
     auto y_dim = y.dims();
 
     axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
-    trim_trailing_singular_dims(y_dim);
+    trim_trailing_singular_dims(&y_dim);
     axis = (y_dim.size() == 0) ? x_dim.size() : axis;
 
     int pre, n, post;
-    get_mid_dims(x_dim, y_dim, axis, pre, n, post);
+    get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
     if (post == 1) {
       int h = pre;
       int w = n;
@@ -514,7 +518,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
       }
     }
   }
-};
+}
 
 template <typename DeviceContext, typename T, typename functor,
           typename broadcastfunctor, typename broadcast2functor>
@@ -543,11 +547,11 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
   }
 
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  trim_trailing_singular_dims(y_dims);
+  trim_trailing_singular_dims(&y_dims);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
 
   if (post == 1) {
     broadcastfunctor f;
@@ -582,11 +586,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                  "Axis should be in range [0, x_dims)");
-  trim_trailing_singular_dims(y_dims);
+  trim_trailing_singular_dims(&y_dims);
   axis = (y_dims.size() == 0) ? x_dims.size() : axis;
 
   int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
   if (post == 1) {
     functor.RunRowWise(n, pre);
     return;
diff --git a/paddle/fluid/operators/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise_pow_op.cc
index 60302c5e59f8ce595861405713045b05d90002e3..5fd6bde9ba0930e29f2161f1ff23ff9f5e7dc85d 100644
--- a/paddle/fluid/operators/elementwise_pow_op.cc
+++ b/paddle/fluid/operators/elementwise_pow_op.cc
@@ -13,17 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_pow_op.h"
+#include <string>
 #include "paddle/fluid/operators/elementwise_op.h"
 
 namespace paddle {
 namespace operators {
 class ElementwisePowOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Pow", "Out = X ^ Y");
-    AddComment(comment_);
-  }
+ protected:
+  std::string GetName() const override { return "Pow"; }
+  std::string GetEquation() const override { return "Out = X ^ Y"; }
 };
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise_sub_op.cc
index 6f770820c80310a183018b586cb7545ca1e9de51..a7562b166b373ee2a8c9b6f379431d88d3e45fcb 100644
--- a/paddle/fluid/operators/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise_sub_op.cc
@@ -14,23 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise_sub_op.h"
 #include "paddle/fluid/operators/elementwise_op.h"
-
-namespace paddle {
-namespace operators {
-class ElementwiseSubOpMaker : public ElementwiseOpMaker {
- public:
-  ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Sub", "Out = X - Y");
-    AddComment(comment_);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
-            elementwise_sub_grad, ops::ElementwiseOpGrad);
+REGISTER_ELEMWISE_OP(elementwise_sub, "Sub", "Out = X - Y");
 REGISTER_OP_CPU_KERNEL(
     elementwise_sub,
     ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc
index 51a66bd832fbdface953d9b7b509b32ce26d33ca..5ad0ec251328cc1ba580026bb47bf05316e7dc77 100644
--- a/paddle/fluid/operators/expand_op.cc
+++ b/paddle/fluid/operators/expand_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/expand_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -55,8 +56,7 @@ class ExpandOp : public framework::OperatorWithKernel {
 
 class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
              "X is the input to be expanded.");
@@ -128,8 +128,9 @@ class ExpandGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
-            ops::ExpandGradOp);
+REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp);
 REGISTER_OP_CPU_KERNEL(
     expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h
index 2c2d5c7c42c0cc918199eff054d1656f01a281e8..75dbf1d8bf5cb692dcf7b88e9f4c486ab3839701 100644
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -14,13 +14,14 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
+
 #include <boost/preprocessor/arithmetic/div.hpp>
 #include <boost/preprocessor/arithmetic/mod.hpp>
 #include <boost/preprocessor/comparison/greater.hpp>
 #include <boost/preprocessor/comparison/greater_equal.hpp>
 #include <boost/preprocessor/control/if.hpp>
 #include <boost/preprocessor/repetition/repeat.hpp>
-#include <iostream>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..43f949111104ee56efc8625bdd609e412ef7f37d
--- /dev/null
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fake_dequantize_op.h"
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
+ public:
+  FakeDequantizeMaxAbsOp(const std::string &type,
+                         const framework::VariableNameMap &inputs,
+                         const framework::VariableNameMap &outputs,
+                         const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FakeDequantizeMaxAbsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FakeDequantizeMaxAbsOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input with float-32/64 type is the "
+             "low precision tensor.");
+    AddOutput("Out",
+              "(Tensor) The output is the dequantized high "
+              "precision tensor.");
+    AddAttr<int>("num_bits",
+                 "(int) `num_bits` is the quantization level bits, "
+                 "such as 2, 5, 8.");
+    AddAttr<float>("scale",
+                   "(float) The maximum absolute value of low precision tensor."
+                   "It is usually calculated by the fake_quantize_max_abs_op.");
+    AddComment(R"DOC(
+FakeDequantizeMaxAbsOp operator.
+
+This calculation is an opposite operation of FakeQuantizeMaxAbsOp:
+
+$$Out = \frac{scale*X}{2^{num_bits} - 1}$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
+                  ops::FakeDequantizeMaxAbsOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
+                       ops::FakeDequantizeMaxAbsKernel<CPU, float>,
+                       ops::FakeDequantizeMaxAbsKernel<CPU, double>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1bd38d1bd2c3a6f90d2fbad415d61efaead3afe9
--- /dev/null
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fake_dequantize_op.h"
+
+namespace ops = paddle::operators;
+using CUDA = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
+                        ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
+                        ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0901e68b3761159c3cc9c6684567bee38ec3f16d
--- /dev/null
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(in->place());
+
+    int num_bits = ctx.Attr<int>("num_bits");
+    T scale = static_cast<T>(ctx.Attr<float>("scale"));
+    int range = std::pow(2, num_bits) - 1;
+
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    eigen_out.device(dev) = (scale / range) * eigen_in;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..99fa659a351249a4a93f71700e1c646465861aba
--- /dev/null
+++ b/paddle/fluid/operators/fc_mkldnn_op.cc
@@ -0,0 +1,304 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/fc_op.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+
+template <typename T>
+class MKLDNNMD {
+ public:
+  explicit MKLDNNMD(const T* in, const T* w, bool bias)
+      : in(paddle::framework::vectorize2int(in->dims())),
+        w(paddle::framework::vectorize2int(w->dims())) {
+    with_bias_ = bias;
+  }
+
+  mkldnn::memory::desc dst() const {
+    return platform::MKLDNNMemDesc({in[0], w[1]},
+                                   mkldnn::memory::data_type::f32,
+                                   mkldnn::memory::format::nc);
+  }
+
+  mkldnn::memory::desc src() const {
+    return is_spatial()
+               ? platform::MKLDNNMemDesc({in[0], in[1], in[2], in[3]},
+                                         mkldnn::memory::data_type::f32,
+                                         mkldnn::memory::format::nchw)
+               : platform::MKLDNNMemDesc({in[0], in[1]},
+                                         mkldnn::memory::data_type::f32,
+                                         mkldnn::memory::format::nc);
+  }
+
+  mkldnn::memory::desc weights() const {
+    return is_spatial()
+               ? platform::MKLDNNMemDesc({w[1], in[1], in[2], in[3]},
+                                         mkldnn::memory::data_type::f32,
+                                         mkldnn::memory::format::oihw)
+               : platform::MKLDNNMemDesc({w[1], in[1]},
+                                         mkldnn::memory::data_type::f32,
+                                         mkldnn::memory::format::oi);
+  }
+
+  mkldnn::memory::desc bias() const {
+    return with_bias_
+               ? platform::MKLDNNMemDesc({w[1]}, mkldnn::memory::data_type::f32,
+                                         mkldnn::memory::format::format_undef)
+               : platform::MKLDNNMemDesc({}, mkldnn::memory::data_type::f32,
+                                         mkldnn::memory::format::format_undef);
+  }
+
+ private:
+  bool is_spatial() const { return in.size() > 1 && w.size() > 1; }
+
+  std::vector<int> in;
+  std::vector<int> w;
+  bool with_bias_;
+  bool is_spatial_;
+};
+
+class MKLDNNMemory {
+ public:
+  MKLDNNMemory(MKLDNNMD<Tensor>* t, const mkldnn::engine& e)
+      : md_(t), engine_(e) {}
+  virtual ~MKLDNNMemory() = default;
+
+  template <typename Output>
+  mkldnn::memory dst(const Output* out) {
+    return mkldnn::memory({md_->dst(), engine_},
+                          static_cast<void*>(const_cast<float*>(out)));
+  }
+
+  template <typename Output>
+  mkldnn::memory dst(Output* out) {
+    return mkldnn::memory({md_->dst(), engine_}, out);
+  }
+
+  template <typename Input>
+  mkldnn::memory src(const Input* in) {
+    return mkldnn::memory({md_->src(), engine_},
+                          static_cast<void*>(const_cast<float*>(in)));
+  }
+
+  template <typename Weight>
+  mkldnn::memory weights(const Weight* w) {
+    return mkldnn::memory({md_->weights(), engine_},
+                          static_cast<void*>(const_cast<float*>(w)));
+  }
+
+  mkldnn::memory bias() {
+    return mkldnn::memory(mkldnn::memory::primitive_desc(md_->bias(), engine_));
+  }
+
+ private:
+  MKLDNNMD<Tensor>* md_;
+  const mkldnn::engine& engine_;
+};
+
+template <typename T>
+class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto input = ctx.Input<Tensor>("Input");
+    auto w = ctx.Input<Tensor>("W");
+
+    PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4,
+                   "Input must be with 2 or 4 dimensions, i.e. NCHW");
+    PADDLE_ENFORCE(w->dims().size() == 2 || w->dims().size() == 4,
+                   "Weights must be with 2 or 4 dimensions, i.e. OI or OIHW");
+
+    bool with_bias = ctx.Attr<bool>("bias_attr");
+    MKLDNNMD<Tensor> md(input, w, with_bias);
+
+    std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> pd =
+        FcFwdPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(),
+                           with_bias, mkldnn_engine);
+
+    const std::string key = ctx.op().Output("Out");
+    const std::string key_fc_pd = key + "@fc_pd";
+
+    dev_ctx.SetBlob(key_fc_pd, pd);
+
+    MKLDNNMemory mem(&md, mkldnn_engine);
+
+    const T* input_data = input->data<T>();
+    const T* w_data = w->data<T>();
+
+    auto output = ctx.Output<Tensor>("Out");
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    auto dst_memory = mem.dst(output_data);
+    auto src_memory = mem.src(input_data);
+    auto weights_memory = mem.weights(w_data);
+    auto bias_memory = mem.bias();
+
+    auto forward = with_bias ? mkldnn::inner_product_forward(
+                                   *pd, src_memory, weights_memory, bias_memory,
+                                   dst_memory)
+                             : mkldnn::inner_product_forward(
+                                   *pd, src_memory, weights_memory, dst_memory);
+
+    std::vector<mkldnn::primitive> pipeline = {forward};
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  }
+
+ private:
+  std::unique_ptr<mkldnn::inner_product_forward::primitive_desc>
+  FcFwdPrimitiveDesc(const mkldnn::memory::desc& src,
+                     const mkldnn::memory::desc& weights,
+                     const mkldnn::memory::desc& dst,
+                     const mkldnn::memory::desc& bias, const bool with_bias,
+                     const mkldnn::engine& engine) const {
+    auto desc = with_bias
+                    ? mkldnn::inner_product_forward::desc(
+                          mkldnn::prop_kind::forward, src, weights, bias, dst)
+                    : mkldnn::inner_product_forward::desc(
+                          mkldnn::prop_kind::forward, src, weights, dst);
+
+    auto pd = new mkldnn::inner_product_forward::primitive_desc(desc, engine);
+    return std::unique_ptr<mkldnn::inner_product_forward::primitive_desc>(pd);
+  }
+};
+
+template <typename T>
+class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    T* input_grad_data = nullptr;
+    T* w_grad_data = nullptr;
+
+    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
+
+    if (input_grad) {
+      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+    }
+    if (w_grad) {
+      w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
+    }
+
+    const Tensor* input = ctx.Input<Tensor>("Input");
+    const T* input_data = input->data<T>();
+
+    const Tensor* w = ctx.Input<Tensor>("W");
+    const T* w_data = w->data<T>();
+
+    const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    const T* out_grad_data = out_grad->data<T>();
+
+    bool with_bias = ctx.Attr<bool>("bias_attr");
+
+    MKLDNNMD<Tensor> md(input, w, with_bias);
+    MKLDNNMemory mem(&md, mkldnn_engine);
+
+    auto dst_memory = mem.dst(out_grad_data);
+    auto src_memory = mem.src(input_data);
+    auto weights_memory = mem.weights(w_data);
+    auto bias_memory = mem.bias();
+
+    const std::string key = ctx.op().Input("Out");
+    const std::string key_fc_pd = key + "@fc_pd";
+
+    auto pd =
+        std::static_pointer_cast<mkldnn::inner_product_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_fc_pd));
+
+    PADDLE_ENFORCE(pd != nullptr, "Fail to find key_fc_pd in device context");
+
+    if (w_grad) {
+      auto weights_grad_memory = mem.weights(w_grad_data);
+
+      mkldnn::inner_product_backward_weights::primitive_desc bwd_weight_pd =
+          FcBwdWeightsPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(),
+                                    with_bias, *pd, mkldnn_engine);
+
+      auto bwd_weights_prim = mkldnn::inner_product_backward_weights(
+          bwd_weight_pd, src_memory, dst_memory, weights_grad_memory,
+          bias_memory);
+
+      std::vector<mkldnn::primitive> pipeline{bwd_weights_prim};
+      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+    }
+
+    if (input_grad) {
+      auto src_grad_memory = mem.src(input_grad_data);
+
+      mkldnn::inner_product_backward_data::primitive_desc bwd_data_pd =
+          FcBwdDataPrimitiveDesc(md.src(), md.weights(), md.dst(), *pd,
+                                 mkldnn_engine);
+
+      auto bwd_data_prim = mkldnn::inner_product_backward_data(
+          bwd_data_pd, dst_memory, weights_memory, src_grad_memory);
+
+      std::vector<mkldnn::primitive> pipeline{bwd_data_prim};
+      mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+    }
+  }
+
+ private:
+  mkldnn::inner_product_backward_weights::primitive_desc
+  FcBwdWeightsPrimitiveDesc(
+      const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights,
+      const mkldnn::memory::desc& diff_dst, const mkldnn::memory::desc& bias,
+      const bool with_bias,
+      const mkldnn::inner_product_forward::primitive_desc& pd,
+      const mkldnn::engine& engine) const {
+    auto bwd_weight_desc = with_bias
+                               ? mkldnn::inner_product_backward_weights::desc(
+                                     src, diff_weights, bias, diff_dst)
+                               : mkldnn::inner_product_backward_weights::desc(
+                                     src, diff_weights, bias, diff_dst);
+
+    return mkldnn::inner_product_backward_weights::primitive_desc(
+        bwd_weight_desc, engine, pd);
+  }
+
+  mkldnn::inner_product_backward_data::primitive_desc FcBwdDataPrimitiveDesc(
+      const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights,
+      const mkldnn::memory::desc& diff_dst,
+      const mkldnn::inner_product_forward::primitive_desc& pd,
+      const mkldnn::engine& engine) const {
+    auto bwd_data_desc =
+        mkldnn::inner_product_backward_data::desc(diff_src, weights, diff_dst);
+    return mkldnn::inner_product_backward_data::primitive_desc(bwd_data_desc,
+                                                               engine, pd);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_KERNEL(fc, MKLDNN, ::paddle::platform::CPUPlace,
+                   paddle::operators::FCMKLDNNOpKernel<float>);
+
+REGISTER_OP_KERNEL(fc_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   paddle::operators::FCMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a9ae1396db8d7dab0364779e506d5c0a3e2ff6ed
--- /dev/null
+++ b/paddle/fluid/operators/fc_op.cc
@@ -0,0 +1,102 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fc_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+void FCOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Input"),
+                 "X(Input) of Fully Connected should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                 "Out(Output) of Fully Connected should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("W"),
+                 "W(Input) of Fully Connected should not be null.");
+
+  auto in_dims = ctx->GetInputDim("Input");
+  auto w_dims = ctx->GetInputDim("W");
+  std::vector<int64_t> output_shape({in_dims[0], w_dims[1]});
+
+  PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
+                 "Fully Connected input should be 2-D or 4-D tensor.");
+
+  PADDLE_ENFORCE(w_dims.size() == 2 || w_dims.size() == 4,
+                 "Fully Connected input should be 2-D or 4-D tensor.");
+
+  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  ctx->ShareLoD("Input", "Out");
+}
+
+framework::OpKernelType FCOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  framework::LibraryType library{framework::LibraryType::kMKLDNN};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
+
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
+      layout, library);
+}
+
+void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+  auto in_dims = ctx->GetInputDim("Input");
+  auto w_dims = ctx->GetInputDim("W");
+
+  if (ctx->HasOutput(framework::GradVarName("Input"))) {
+    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+  }
+  if (ctx->HasOutput(framework::GradVarName("W"))) {
+    ctx->SetOutputDim(framework::GradVarName("W"), w_dims);
+  }
+}
+
+framework::OpKernelType FCOpGrad::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  framework::LibraryType library{framework::LibraryType::kMKLDNN};
+  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
+
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
+      layout, library);
+}
+
+void FCOpMaker::Make() {
+  AddInput("Input", "(Tensor) The input tensor of fully connected operator. ");
+  AddInput("W", "(Tensor), The second input tensor of fc op.");
+  AddOutput("Out", "(Tensor) The output tensor of fully connected operator. ");
+  AddAttr<bool>("use_mkldnn",
+                "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
+  AddAttr<bool>("bias_attr", "(bool, default false) Only used in mkldnn kernel")
+      .SetDefault(false);
+  AddComment(R"DOC(
+  Fully Connected Operator.
+
+  The fully connected operation calculates the output based on the input, weights and bias attribute.
+  The size of each dimension of the parameters checked in the infer-shape.
+  The matrix of bias is generated by the mkldnn framework, when the bias_attr is True.
+  Additional parametrs are use_mkldnn and bias_attr.
+  The input(X) size and output(Out) size may be diffrent.
+
+  The fully connected layer only supports MKLDNN version
+)DOC");
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad);
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e1b780fc0c401fbf34a9db03aa31137cbc016939
--- /dev/null
+++ b/paddle/fluid/operators/fc_op.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class FCOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FCOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FCOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc
index 90c31877f6a87d1e237283d489353b4aba26c97b..bcb3e63ed7dbc775c1de6c4522f0548ea48a6cf0 100644
--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -28,6 +29,10 @@ class FeedOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
+    // get device context from pool
+    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    platform::RecordEvent record_event(Type(), dev_ctx);
+
     auto feed_var_name = Input("X");
     auto *feed_var = scope.FindVar(feed_var_name);
 
@@ -50,14 +55,10 @@ class FeedOp : public framework::OperatorBase {
     auto &feed_item = feed_list.at(static_cast<size_t>(col));
     auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
 
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
     if (platform::is_same_place(feed_item.place(), place)) {
       out_item->ShareDataWith(feed_item);
     } else {
-      framework::TensorCopy(feed_item, place, dev_ctx, out_item);
+      framework::TensorCopy(feed_item, place, *dev_ctx, out_item);
     }
     out_item->set_lod(feed_item.lod());
   }
@@ -65,8 +66,7 @@ class FeedOp : public framework::OperatorBase {
 
 class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of feed op");
     AddOutput("Out", "The output of feed op");
     AddAttr<int>("col", "(int) The column of feed");
diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..02beb80fc8a9f451393dcdd54492c4f88f908497
--- /dev/null
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+class FetchBarrierOp : public framework::OperatorBase {
+ public:
+  FetchBarrierOp(const std::string& type,
+                 const framework::VariableNameMap& inputs,
+                 const framework::VariableNameMap& outputs,
+                 const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+
+    rpc_client->Wait();
+
+    for (auto& ep : eps) {
+      VLOG(3) << "fetch barrier, ep: " << ep;
+      rpc_client->AsyncSendFetchBarrier(ep);
+    }
+    rpc_client->Wait();
+  }
+};
+
+class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddComment(R"DOC(
+SendBarrier operator
+
+This operator will send a send barrier signal to list_and_serv op, so that
+the Parameter Server would knew all variables have been sent.
+)DOC");
+
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.")
+        .SetDefault({"127.0.0.1:6164"});
+  }
+};
+
+class FetchBarrierOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(fetch_barrier, ops::FetchBarrierOp,
+                  paddle::framework::EmptyGradOpMaker, ops::FetchBarrierOpMaker,
+                  ops::FetchBarrierOpShapeInference);
diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc
index d66f01d1b7ce8528a7c0177b2889aff7e0c5a12b..1640a2a22c69a0e3ab81a2889d6105b2cf4162b7 100644
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -29,6 +30,9 @@ class FetchOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(Type(), pool.Get(place));
+
     auto fetch_var_name = Input("X");
     auto *fetch_var = scope.FindVar(fetch_var_name);
     PADDLE_ENFORCE(fetch_var != nullptr,
@@ -53,11 +57,7 @@ class FetchOp : public framework::OperatorBase {
 
     // FIXME(yuyang18): Should we assume the fetch operator always generate
     // CPU outputs?
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(src_item.place());
-
-    TensorCopy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
-    dev_ctx.Wait();
+    TensorCopySync(src_item, platform::CPUPlace(), &dst_item);
     dst_item.set_lod(src_item.lod());
 
     VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
@@ -66,8 +66,7 @@ class FetchOp : public framework::OperatorBase {
 
 class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of fetch op");
     AddOutput("Out", "The output of fetch op");
     AddAttr<int>("col", "(int) The column of fetch");
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
index 72da80baaf9bb3286f09b7ae5fcf24326b391906..453a1b32a0171a2ca88879ab3287e89c4d3c7759 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
@@ -30,19 +30,18 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
 };
 
 class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
- public:
-  FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : BatchSizeLikeOpMaker(proto, op_checker) {
-    AddAttr<int>("dtype",
-                 "(int, default 5 (FP32)) "
-                 "Output data type")
+ protected:
+  void Apply() override {
+    AddAttr<int>(
+        "dtype",
+        "It could be numpy.dtype. Output data type. Default is float32")
         .SetDefault(framework::proto::VarType::FP32);
-    AddAttr<float>("value", "(float, default 0) The value to be filled")
+    AddAttr<float>("value", "default 0. The value to be filled")
         .SetDefault(0.0f);
     AddComment(R"DOC(
-FillConstantBatchSizeLike Operator.
-
-Fill up a variable with specified constant value.
+This function creates a tensor of specified *shape*, *dtype* and batch size,
+and initializes this with a constant supplied in *value*. The batch size is
+obtained from the `input` tensor.
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
index 2a7df149a9f4b03676f172da980c927d7fa5e8a4..63ea60678f80708f5a8340edd22588553b9ec139 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
@@ -24,6 +24,14 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* in = ctx.Input<framework::LoDTensor>("Input");
+    if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
+      // set the correct batch size for the LoDTensor.
+      auto odims = out->dims();
+      int output_dim_idx = ctx.Attr<int>("output_dim_idx");
+      odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
+      out->mutable_data<T>(odims, ctx.GetPlace());
+    }
     out->mutable_data<T>(ctx.GetPlace());
     auto value = ctx.Attr<float>("value");
 
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 07e0a80f8d644d4d011f2821785d49ece6cecfb5..130f18dde4f979a6a9925ede9cbf745fcec14d48 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -59,8 +59,7 @@ class FillConstantOp : public framework::OperatorBase {
 
 class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddAttr<int>("dtype",
                  "(int, default 5 (FP32)) "
                  "Output data type")
diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc
index ee8a2fc353f86cdabd35459a9195c3aa35f63e31..925dc19061e2196a40411f415eb6e5ad59ab52ff 100644
--- a/paddle/fluid/operators/fill_op.cc
+++ b/paddle/fluid/operators/fill_op.cc
@@ -82,8 +82,7 @@ class FillOp : public framework::OperatorBase {
 
 class FillOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FillOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddComment(R"DOC(Fill operator
 
 Fill an tensor with `value` and `shape`. The type of the tensor is specify by
diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc
index 58c814ba6413626a48310da595a13238994f5ef1..d67bec36b3248be8602da562a88aeb58f5effe39 100644
--- a/paddle/fluid/operators/fill_zeros_like_op.cc
+++ b/paddle/fluid/operators/fill_zeros_like_op.cc
@@ -33,8 +33,7 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
 
 class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of fill-zeros-like op.");
     AddOutput("Out", "The variable will be filled up with zeros.");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc
index 0a456f0981e5753a7de5a6f2ba029681beb347a5..70ba25c213046cc934f46be067080d5fdbb42f9e 100644
--- a/paddle/fluid/operators/ftrl_op.cc
+++ b/paddle/fluid/operators/ftrl_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
 class FTRLOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -53,12 +54,17 @@ class FTRLOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("SquaredAccumOut", param_dim);
     ctx->SetOutputDim("LinearAccumOut", param_dim);
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter value that has to be updated.");
diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc
index 6be06b8816ce65641b49d7b7b3861cdd8460feaa..aa3e05b83b23569a4dd9c83294916e289f993abc 100644
--- a/paddle/fluid/operators/gather_op.cc
+++ b/paddle/fluid/operators/gather_op.cc
@@ -33,7 +33,6 @@ class GatherOp : public framework::OperatorWithKernel {
     auto index_dims = ctx->GetInputDim("Index");
     PADDLE_ENFORCE(index_dims.size() == 1);
     int batch_size = ctx->GetInputDim("Index")[0];
-    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
     framework::DDim output_dims(ctx->GetInputDim("X"));
     output_dims[0] = batch_size;
     ctx->SetOutputDim("Out", output_dims);
@@ -67,8 +66,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
 
 class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The source input of gather op");
     AddInput("Index", "The index input of gather op");
     AddOutput("Out", "The output of gather op");
@@ -100,7 +98,8 @@ Out = [[3, 4],
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
-            ops::GatherGradOp);
+REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(gather_grad, ops::GatherGradOp);
 REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
 REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu
index 3819549c7112c5e4a6de1a9aee54e469dd5a4618..7e014dd1cb47ee0575308dc13ba7bc7617baebff 100644
--- a/paddle/fluid/operators/gather_op.cu
+++ b/paddle/fluid/operators/gather_op.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "gather.cu.h"
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "scatter.cu.h"
+#include "paddle/fluid/operators/scatter.cu.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h
index 5a8b1ebbe3fe5f242a4d6395c921c75247587c6a..2dd726bebb1bc2e4d83844c0b98df01c390e622f 100644
--- a/paddle/fluid/operators/gather_op.h
+++ b/paddle/fluid/operators/gather_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "gather.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "scatter.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/scatter.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc
index 7625bd45d968720099a973a6988484ec8332d1c1..f6b156eb30dae154395b34dcfc26319cd89edbca 100644
--- a/paddle/fluid/operators/gather_test.cc
+++ b/paddle/fluid/operators/gather_test.cc
@@ -12,39 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
-
 #include <gtest/gtest.h>
 #include <iostream>
 #include <string>
 
-TEST(Gather, GatherData) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators;
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/platform/place.h"
 
-  Tensor* src = new Tensor();
-  Tensor* index = new Tensor();
-  Tensor* output = new Tensor();
+TEST(Gather, GatherData) {
+  paddle::framework::Tensor* src = new paddle::framework::Tensor();
+  paddle::framework::Tensor* index = new paddle::framework::Tensor();
+  paddle::framework::Tensor* output = new paddle::framework::Tensor();
 
   int* p_src = nullptr;
   int* p_index = nullptr;
-  p_src = src->mutable_data<int>(make_ddim({3, 4}), CPUPlace());
-  p_index = index->mutable_data<int>(make_ddim({2}), CPUPlace());
+  p_src = src->mutable_data<int>(paddle::framework::make_ddim({3, 4}),
+                                 paddle::platform::CPUPlace());
+  p_index = index->mutable_data<int>(paddle::framework::make_ddim({2}),
+                                     paddle::platform::CPUPlace());
 
   for (int i = 0; i < 12; ++i) p_src[i] = i;
   p_index[0] = 1;
   p_index[1] = 0;
 
-  int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
+  int* p_output = output->mutable_data<int>(
+      paddle::framework::make_ddim({2, 4}), paddle::platform::CPUPlace());
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  CPUGather<int>(ctx, *src, *index, output);
-
+  paddle::operators::CPUGather<int>(ctx, *src, *index, output);
+  delete cpu_place;
+  cpu_place = NULL;
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
 
diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
index 53c706a83e5bfb9e93d485141314e8b652d73593..4a974281481c8bc02589b428098475d73b8a0ba5 100644
--- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc
@@ -32,16 +32,16 @@ class GaussianRandomBatchSizeLikeOp : public BatchSizeLikeOp {
 };
 
 class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
- public:
-  GaussianRandomBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : BatchSizeLikeOpMaker(proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<float>("mean",
                    "(float, default 0.0) "
-                   "mean of random tensor.")
+                   "The mean (or center) of the gaussian distribution.")
         .SetDefault(.0f);
     AddAttr<float>("std",
                    "(float, default 1.0) "
-                   "std of random tensor.")
+                   "The standard deviation (std, or spread) of the "
+                   "gaussian distribution.")
         .SetDefault(1.0f);
     AddAttr<int>("seed",
                  "(int, default 0) "
@@ -56,9 +56,11 @@ class GaussianRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
         .SetDefault(framework::proto::VarType::FP32);
 
     AddComment(R"DOC(
-GaussianRandom Operator.
 
 Used to initialize tensors with gaussian random generator.
+The defalut mean of the distribution is 0. and defalut standard
+deviation (std) of the distribution is 1.. Uers can set mean and std
+by input arguments.
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/gaussian_random_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..76b00b396c1349eff5db1059268e7cf280a8fc64
--- /dev/null
+++ b/paddle/fluid/operators/gaussian_random_mkldnn_op.cc
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include "paddle/fluid/operators/mean_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::DataLayout;
+template <typename T>
+class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.Attr<float>("mean");
+    float std = context.Attr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    std::minstd_rand engine;
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::normal_distribution<T> dist(mean, std);
+    int64_t size = tensor->numel();
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(engine);
+    }
+
+    // The format of output is set as the mkldnn's format
+    // TODO(@mozga-intel) The format of matrix sets inside the another layers.
+    tensor->set_layout(DataLayout::kMKLDNN);
+    tensor->set_format(mkldnn::memory::format::oihw);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(gaussian_random, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::GaussianMKLDNNKernel<float>);
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 4d197637b3f49f7e63f5b1a5cba212d1bf774f7e..1488aab1926b5b4ba7bceed582700f5a11fc6c93 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -15,6 +15,10 @@ limitations under the License. */
 #include <random>
 #include "paddle/fluid/framework/op_registry.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -62,16 +66,26 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library{framework::LibraryType::kPlain};
+    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
+
     return framework::OpKernelType(
         static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
-        ctx.device_context());
+        ctx.device_context(), layout, library);
   }
 };
 
 class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput("Out", "Output matrix of gaussian random op");
 
     AddAttr<std::vector<int>>("shape",
@@ -96,7 +110,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(int, default 5(FP32)) "
                  "Output data type.")
         .SetDefault(framework::proto::VarType::FP32);
-
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 GaussianRandom Operator.
 
diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..697c239e59d158428ae9ba9f7feded19637dff28
--- /dev/null
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <nccl.h>
+#include <stdint.h>
+#include <ostream>
+#include <string>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+
+namespace paddle {
+namespace operators {
+
+class GenNCCLIdOp : public framework::OperatorBase {
+ public:
+  GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
+              const framework::VariableNameMap& outputs,
+              const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    // put nccl id in CPUPlace
+    auto& dev_ctx = *pool.Get(platform::CPUPlace());
+    int trainer_id = Attr<int>("trainer_id");
+    framework::Scope& local_scope = scope.NewScope();
+
+    if (trainer_id == 0) {
+      GenerateAndSend(&local_scope, dev_ctx);
+    } else {
+      GetIdByServer(&local_scope, dev_ctx);
+    }
+  }
+
+ private:
+  void GenerateAndSend(framework::Scope* scope,
+                       const platform::DeviceContext& dev_ctx) const {
+    auto var = scope->FindVar(NCCL_ID_VARNAME);
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto id = var->GetMutable<ncclUniqueId>();
+    PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(id));
+
+    std::vector<std::string> endpoint_list =
+        Attr<std::vector<std::string>>("endpoint_list");
+    distributed::RPCClient* client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+
+    for (auto& ep : endpoint_list) {
+      VLOG(3) << "sending nccl id to " << ep;
+      client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME);
+    }
+    client->Wait();
+    for (auto& ep : endpoint_list) {
+      client->AsyncSendBatchBarrier(ep);
+    }
+    client->Wait();
+    VLOG(3) << "sending completed...";
+  }
+
+  void GetIdByServer(framework::Scope* scope,
+                     const platform::DeviceContext& dev_ctx) const {
+    std::string endpoint = Attr<std::string>("endpoint");
+    // NOTE: Can not use unique_ptr here because the default
+    // deleter will call GRPC Server's base class's dtor and
+    // that will cause a wired crash.
+    distributed::RequestSendHandler rpc_h(true);
+    std::unique_ptr<distributed::RPCServer> rpc_service(
+        new RPCSERVER_T(endpoint, 1));
+
+    rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
+    rpc_h.SetRPCServer(rpc_service.get());
+
+    framework::ProgramDesc empty_program;
+    framework::Executor executor(dev_ctx.GetPlace());
+    rpc_h.SetScope(scope);
+    rpc_h.SetDevCtx(&dev_ctx);
+    rpc_h.SetProgram(&empty_program);
+    rpc_h.SetExecutor(&executor);
+
+    std::thread server_thread(
+        std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
+
+    rpc_service->SetCond(distributed::kRequestSend);
+    VLOG(3) << "start getting nccl id from trainer 0...";
+    rpc_service->WaitBarrier(distributed::kRequestSend);
+    VLOG(3) << "got nccl id and stop server...";
+    rpc_service->ShutDown();
+    VLOG(3) << "rpc server stopped";
+    server_thread.join();
+  }
+};
+
+class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddOutput("NCCLID", "Raw variable contains a NCCL UniqueId instaces.");
+    AddComment(R"DOC(
+GenNCCLId operator
+
+For trainer 0: generate a new UniqueId and send it to all the other trainers.
+For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string), e.g. 127.0.0.1:6175 "
+                         "current listen endpoint");
+    AddAttr<std::vector<std::string>>(
+        "endpoint_list",
+        "['trainer1_ip:port', 'trainer2_ip:port', ...] "
+        "list of trainer endpoints start from trainer 1")
+        .SetDefault({});
+    AddAttr<int>("trainer_id",
+                 "(int default 0) "
+                 "The index of the trainer in distributed training.")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(gen_nccl_id, ops::GenNCCLIdOp, ops::GenNCCLIdOpMaker);
diff --git a/paddle/fluid/operators/get_places_op.cc b/paddle/fluid/operators/get_places_op.cc
index 9002ce4717c6e75e7204ef62094e4680bba3f88b..db6ff7825690176ded0ab957764ed8411d3cd804 100644
--- a/paddle/fluid/operators/get_places_op.cc
+++ b/paddle/fluid/operators/get_places_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thread>
+#include <thread>  // NOLINT
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/place.h"
@@ -78,15 +78,14 @@ class GetPlacesOp : public framework::OperatorBase {
 
 class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GetPlacesOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput("Out", "vector of Place");
     AddAttr<int>("device_count", "device count").SetDefault(0);
     AddAttr<std::string>("device_type", "device type")
         .InEnum({"CUDA", "CPU", "AUTO"})
         .SetDefault("AUTO");
     AddComment(R"DOC(
-Returns a list of places based on flags. The list will be used for parallel
+Returns a list of places based on arguments. The list will be used for parallel
 execution.
 )DOC");
   }
diff --git a/paddle/fluid/operators/go_op.cc b/paddle/fluid/operators/go_op.cc
index cfa797717d78aa72e1b931b6db6e153270b3424e..48f9d967adc90838dc4c7a09bfaf5a5a1ac9c99b 100644
--- a/paddle/fluid/operators/go_op.cc
+++ b/paddle/fluid/operators/go_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thread>
+#include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -56,11 +56,11 @@ class GoOp : public framework::OperatorBase {
 
     // TODO(varunarora): Consider moving this root scope lookup to scope.h.
     const framework::Scope *root_scope = &scope;
-    const framework::Scope *parent_scope = &(root_scope->parent());
+    const framework::Scope *parent_scope = root_scope->parent();
 
     while (parent_scope != nullptr) {
       root_scope = parent_scope;
-      parent_scope = &(parent_scope->parent());
+      parent_scope = parent_scope->parent();
     }
 
     framework::BlockDesc *block = Attr<framework::BlockDesc *>(kBlock);
@@ -89,8 +89,7 @@ class GoOp : public framework::OperatorBase {
 
 class GoOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GoOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kX,
              "A set of variables, which are required by operators inside the "
              "block of Go Op.")
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 2a91dcbcd418fcd61445b7d744789bdeee11d2f2..5c746878823b3dcde2573feec00d3d9dac5ceab8 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/gru_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -70,8 +71,7 @@ class GRUOp : public framework::OperatorWithKernel {
 
 class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(LoDTensor) The first input is a LodTensor, which supports "
              "variable-time length input sequence. The underlying tensor in "
@@ -215,7 +215,9 @@ class GRUGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
+REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(gru_grad, ops::GRUGradOp);
 REGISTER_OP_CPU_KERNEL(
     gru, ops::GRUKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GRUKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
index 0886bebc41d8b0f28745e88685f3954f86c823a1..3b0d93e54b72910de1429ddf41eb6b0fe9646942 100644
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/gru_compute.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
 
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
 namespace paddle {
 namespace operators {
 
@@ -35,7 +34,7 @@ inline void ReorderInitState(const DeviceContext& ctx,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -57,14 +56,12 @@ class GRUKernel : public framework::OpKernel<T> {
     auto* hidden = context.Output<LoDTensor>("Hidden");
     hidden->mutable_data<T>(context.GetPlace());
 
-    context.ShareLoD("Input", "Hidden");
-
     auto hidden_dims = hidden->dims();
 
     bool is_reverse = context.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    to_batch(dev_ctx, *input, *batch_gate, true, is_reverse);
+    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
 
     if (bias) {
       math::RowwiseAdd<DeviceContext, T> add_bias;
@@ -116,7 +113,7 @@ class GRUKernel : public framework::OpKernel<T> {
 
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_hidden->set_lod(batch_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, *hidden);
+    to_seq(dev_ctx, *batch_hidden, hidden);
   }
 
   void Compute(const framework::ExecutionContext& context) const override {
@@ -177,7 +174,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
 
     bool is_reverse = context.Attr<bool>("is_reverse");
     batch_hidden_grad.set_lod(batch_hidden->lod());
-    to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
+    to_batch(dev_ctx, *hidden_grad, &batch_hidden_grad, false, is_reverse);
 
     math::GRUMetaValue<T> gru_value;
     gru_value.gate_weight = const_cast<T*>(weight_data);
@@ -239,7 +236,7 @@ class GRUGradKernel : public framework::OpKernel<T> {
       input_grad->mutable_data<T>(context.GetPlace());
       math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
       batch_gate_grad.set_lod(batch_gate->lod());
-      to_seq(dev_ctx, batch_gate_grad, *input_grad);
+      to_seq(dev_ctx, batch_gate_grad, input_grad);
     }
     if (bias_grad) {
       bias_grad->mutable_data<T>(context.GetPlace());
diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc
index f4c766db0a12b9d2167b0ee3b1d7666c4f1813f1..82a808b01e99ec33b0ca00a065fb301d3c633b19 100644
--- a/paddle/fluid/operators/gru_unit_op.cc
+++ b/paddle/fluid/operators/gru_unit_op.cc
@@ -71,8 +71,7 @@ class GRUUnitOp : public framework::OperatorWithKernel {
 
 class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
              "input.");
@@ -124,7 +123,7 @@ $$
 
 which is same as one time step of GRU Operator.
 
-@note To implement the complete GRU unit, fully-connected operator must be 
+@note To implement the complete GRU unit, fully-connected operator must be
 used before to feed xu, xr and xc as the Input of GRUUnit operator.
 
 )DOC");
@@ -194,12 +193,45 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class GRUUnitGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("gru_unit_grad");
+
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("HiddenPrev", Input("HiddenPrev"));
+    op->SetInput("Weight", Input("Weight"));
+    op->SetInput("Bias", Input("Bias"));
+
+    op->SetInput("Hidden", Output("Hidden"));
+    op->SetInput("Gate", Output("Gate"));
+    op->SetInput("ResetHiddenPrev", Output("ResetHiddenPrev"));
+    op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    op->SetOutput(framework::GradVarName("HiddenPrev"),
+                  InputGrad("HiddenPrev"));
+    op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
+    op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
-            ops::GRUUnitGradOp);
+
+REGISTER_OPERATOR(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker,
+                  ops::GRUUnitGradOpMaker);
+REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp);
+
 REGISTER_OP_CPU_KERNEL(
     gru_unit, ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h
index 15d91ca30593871e2b343eb0e5c0b76aa8055968..2d9faed648aef78da60706e13db3862080c96514 100644
--- a/paddle/fluid/operators/gru_unit_op.h
+++ b/paddle/fluid/operators/gru_unit_op.h
@@ -14,11 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -87,10 +86,10 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     const T* weight_data = weight->data<T>();
     T* gate_data = gate->data<T>();
     T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    math::gemm<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), false, false,
-        batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size,
-        weight_data, frame_size * 2, 1, gate_data, frame_size * 3);
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1,
+              hidden_prev_data, frame_size, weight_data, frame_size * 2, 1,
+              gate_data, frame_size * 3);
 
     // calculate activited gate
     Eigen::array<int, 2> extents({{batch_size, frame_size}});
@@ -103,11 +102,10 @@ class GRUUnitKernel : public framework::OpKernel<T> {
                g.slice(r_offsets, extents), g.slice(r_offsets, extents));
     auto r = g.slice(r_offsets, extents);  // reset gate
     r_h_p.device(place) = r * h_p;         // reset previous hidden state
-    math::gemm<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), false, false,
-        batch_size, frame_size, frame_size, 1, reset_hidden_prev_data,
-        frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1,
-        gate_data + frame_size * 2, frame_size * 3);
+    blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
+              reset_hidden_prev_data, frame_size,
+              weight_data + frame_size * frame_size * 2, frame_size, 1,
+              gate_data + frame_size * 2, frame_size * 3);
 
     Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
     ActCompute(context.Attr<int>("activation"), place,
@@ -188,11 +186,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     ActGradCompute(context.Attr<int>("activation"), place, c, c,
                    d_g.slice(c_offsets, extents), d_h * u);
     // backward for reset_hidden_prev
-    math::gemm<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), false, true,
-        batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2,
-        frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size,
-        0, reset_hidden_prev_grad_data, frame_size);
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
+              gate_grad_data + frame_size * 2, frame_size * 3,
+              weight_data + frame_size * frame_size * 2, frame_size, 0,
+              reset_hidden_prev_grad_data, frame_size);
     // backward for unactivated reset gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
                    d_g.slice(r_offsets, extents), d_r_h_p * h_p);
@@ -200,18 +198,15 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     if (weight_grad) {
       T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
       // backward for state_weight
-      math::gemm<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), true, false,
-          frame_size, frame_size, batch_size, 1, reset_hidden_prev_data,
-          frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0,
-          weight_grad_data + frame_size * frame_size * 2, frame_size);
+      blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
+                reset_hidden_prev_data, frame_size,
+                gate_grad_data + frame_size * 2, frame_size * 3, 0,
+                weight_grad_data + frame_size * frame_size * 2, frame_size);
 
       // backward for update_gate_weight and reset_gate_weight
-      math::gemm<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), true, false,
-          frame_size, frame_size * 2, batch_size, 1, hidden_prev_data,
-          frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data,
-          frame_size * 2);
+      blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
+                hidden_prev_data, frame_size, gate_grad_data, frame_size * 3, 0,
+                weight_grad_data, frame_size * 2);
     }
     // backward for hidden_prev
     if (hidden_prev_grad) {
@@ -219,11 +214,9 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
           hidden_prev_grad->mutable_data<T>(context.GetPlace());
       auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
       d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
-      math::gemm<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), false, true,
-          batch_size, frame_size, frame_size * 2, 1, gate_grad_data,
-          frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data,
-          frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
+                gate_grad_data, frame_size * 3, weight_data, frame_size * 2, 1,
+                hidden_prev_grad_data, frame_size);
     }
     // backward for input
     if (input_grad) {
diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc
index efe84f14098028675cb332efd9545c9709528cb3..69e7fa4490b892373d85898b13b976a474a6096a 100644
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@@ -46,8 +46,7 @@ class HingeLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Logits",
              "The input value (Logits) of Hinge loss op."
              "Logits is a 2-D tensor with shape [batch_size, 1].");
@@ -103,8 +102,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
-            hinge_loss_grad, ops::HingeLossGradOp);
+REGISTER_OPERATOR(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     hinge_loss,
     ops::HingeLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
index 134b23b4612b478f9aeb06454c9fd9a6c25fffb4..4ecd8634ff41ff4eba6b5ed1d0fc78068190dce5 100644
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -45,8 +45,7 @@ class HuberLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input value of huber loss op."
              "X is a 2-D tensor with shape [batch_size, 1].");
@@ -121,8 +120,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
-            huber_loss_grad, ops::HuberLossGradOp);
+REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     huber_loss,
     ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index 048391549dd8df24cc215d04431c306ac4c7e5be..0669661d225c664010fce97f0a526b62988b92c5 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -53,8 +54,7 @@ class Im2SequenceOp : public framework::OperatorWithKernel {
 
 class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor) The input tensor has NCHW format."
              "N: batch size"
@@ -147,8 +147,9 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
-            im2sequence_grad, ops::Im2SequenceGradOp);
+REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp);
 REGISTER_OP_CPU_KERNEL(
     im2sequence,
     ops::Im2SequenceKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
index a6a83fefbc6266fa718dcad78b3a018526f124db..d792c68f784d8ffec0eb303a6ab9b59c9f121fa7 100644
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,7 +13,7 @@
    limitations under the License. */
 
 #pragma once
-
+#include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc
index 6b5c3db13c0929ae0dd2fb2c981867df0a36c1ce..f0ffc9706689f5afe4546c3483114b38bc2b7872 100644
--- a/paddle/fluid/operators/increment_op.cc
+++ b/paddle/fluid/operators/increment_op.cc
@@ -1,78 +1,53 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/increment_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
 
-class IncrementInferShape : public framework::InferShapeBase {
+class IncrementOp : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext *ctx) const override {
+  IncrementOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of IncrementOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of IncrementOp should not be null.");
     PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X")));
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", "Out");
   }
-};
-
-struct IncrementFunctor {
-  IncrementFunctor(const framework::LoDTensor &x, framework::LoDTensor *out,
-                   float value)
-      : x_(x), out_(out), value_(value) {}
-
-  template <typename T>
-  void operator()() const {
-    *out_->data<T>() = *x_.data<T>() + static_cast<T>(value_);
-  }
-
-  const framework::LoDTensor &x_;
-  framework::LoDTensor *out_;
-  float value_;
-};
-
-class IncrementOp : public framework::OperatorBase {
- public:
-  IncrementOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
-    auto &out =
-        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
 
-    PADDLE_ENFORCE(platform::is_cpu_place(x.place()));
-    out.Resize(x.dims());
-    out.mutable_data(x.place(), x.type());
-    float value = Attr<float>("step");
-    VLOG(10) << Output("Out") << " increase " << Input("X") << " with "
-             << value;
-    framework::VisitDataType(framework::ToDataType(out.type()),
-                             IncrementFunctor(x, &out, value));
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
+    // IncrementOp kernel's device type is decided by input tensor place
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
   }
 };
 
 class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) The input tensor of increment operator");
     AddOutput("Out", "(Tensor) The output tensor of increment operator.");
     AddAttr<float>("step",
@@ -108,5 +83,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementInferShape,
-                  ops::IncrementOpMaker, ops::IncrementGradOpMaker);
+REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker,
+                  ops::IncrementGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    increment, ops::IncrementKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::IncrementKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::IncrementKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/operators/increment_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..228063bf3d4b24bbd03649189f6ddba9a5f0ca30
--- /dev/null
+++ b/paddle/fluid/operators/increment_op.cu
@@ -0,0 +1,22 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/increment_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    increment, ops::IncrementKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::IncrementKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/increment_op.h b/paddle/fluid/operators/increment_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0e8c66255ef68b975701fb6b3c145be2590e271
--- /dev/null
+++ b/paddle/fluid/operators/increment_op.h
@@ -0,0 +1,39 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class IncrementKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x_tensor = context.Input<framework::Tensor>("X");
+    auto* out_tensor = context.Output<framework::Tensor>("Out");
+    float step = context.Attr<float>("step");
+
+    out_tensor->mutable_data<T>(context.GetPlace());
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
+    framework::EigenScalar<T>::From(*out_tensor).device(dev) =
+        framework::EigenScalar<T>::From(*x_tensor) + static_cast<T>(step);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc
deleted file mode 100755
index ffbd7c7814c3fdec9fef0580ccd1ea3661ac0012..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/iou_similarity_op.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/iou_similarity_op.h"
-
-namespace paddle {
-namespace operators {
-
-class IOUSimilarityOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of IOUSimilarityOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of IOUSimilarityOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]");
-    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2.");
-    PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]");
-
-    ctx->ShareLoD("X", /*->*/ "Out");
-    ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]}));
-  }
-};
-
-class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-             "(LoDTensor, default LoDTensor<float>) "
-             "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, "
-             "each box is represented as [xmin, ymin, xmax, ymax], "
-             "the shape of X is [N, 4]. [xmin, ymin] is the left top "
-             "coordinate of the box if the input is image feature map, they "
-             "are close to the origin of the coordinate system. "
-             "[xmax, ymax] is the right bottom coordinate of the box. "
-             "This tensor can contain LoD information to represent a batch "
-             "of inputs. One instance of this batch can contain different "
-             "numbers of entities.");
-    AddInput("Y",
-             "(Tensor, default Tensor<float>) "
-             "Box list Y holds M boxes, each box is represented as "
-             "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. "
-             "[xmin, ymin] is the left top coordinate of the box if the "
-             "input is image feature map, and [xmax, ymax] is the right "
-             "bottom coordinate of the box.");
-
-    AddOutput("Out",
-              "(LoDTensor, the lod is same as input X) The output of "
-              "iou_similarity op, a tensor with shape [N, M] "
-              "representing pairwise iou scores.");
-
-    AddComment(R"DOC(
-IOU Similarity Operator.
-Computes intersection-over-union (IOU) between two box lists.
- Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
- boxes in 'Y' are shared by all instance of the batched inputs of X.
- Given two boxes A and B, the calculation of IOU is as follows:
-
-$$
-IOU(A, B) = 
-\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)}
-$$
-
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp,
-                             ops::IOUSimilarityOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    iou_similarity,
-    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/iou_similarity_op.cu b/paddle/fluid/operators/iou_similarity_op.cu
deleted file mode 100755
index f40a388d62e66a110656ebb71094d46b5ac147eb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/iou_similarity_op.cu
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/iou_similarity_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    iou_similarity,
-    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/iou_similarity_op.h b/paddle/fluid/operators/iou_similarity_op.h
deleted file mode 100644
index c76448c736847a536be2021a8a5fef23bef23a50..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/iou_similarity_op.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/for_range.h"
-
-template <typename T>
-inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2,
-                                  T ymin2, T xmax2, T ymax2) {
-  constexpr T zero = static_cast<T>(0);
-  T area1 = (ymax1 - ymin1) * (xmax1 - xmin1);
-  T area2 = (ymax2 - ymin2) * (xmax2 - xmin2);
-  T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1;
-  T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1;
-  T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2;
-  T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2;
-  T inter_height = inter_ymax - inter_ymin;
-  T inter_width = inter_xmax - inter_xmin;
-  inter_height = inter_height > zero ? inter_height : zero;
-  inter_width = inter_width > zero ? inter_width : zero;
-  T inter_area = inter_width * inter_height;
-  T union_area = area1 + area2 - inter_area;
-  T sim_score = inter_area / union_area;
-  return sim_score;
-}
-
-template <typename T>
-struct IOUSimilarityFunctor {
-  IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols)
-      : x_(x), y_(y), z_(z), cols_(static_cast<size_t>(cols)) {}
-
-  inline HOSTDEVICE void operator()(size_t row_id) const {
-    T x_min1 = x_[row_id * 4];
-    T y_min1 = x_[row_id * 4 + 1];
-    T x_max1 = x_[row_id * 4 + 2];
-    T y_max1 = x_[row_id * 4 + 3];
-    for (size_t i = 0; i < cols_; ++i) {
-      T x_min2 = y_[i * 4];
-      T y_min2 = y_[i * 4 + 1];
-      T x_max2 = y_[i * 4 + 2];
-      T y_max2 = y_[i * 4 + 3];
-
-      T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2,
-                            x_max2, y_max2);
-
-      z_[row_id * cols_ + i] = sim;
-    }
-  }
-  const T* x_;
-  const T* y_;
-  T* z_;
-  const size_t cols_;
-};
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class IOUSimilarityKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::LoDTensor* in_x = ctx.Input<framework::LoDTensor>("X");
-    const framework::Tensor* in_y = ctx.Input<framework::Tensor>("Y");
-    framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
-
-    int x_n = in_x->dims()[0];
-    int y_n = in_y->dims()[0];
-    IOUSimilarityFunctor<T> functor(in_x->data<T>(), in_y->data<T>(),
-                                    out->mutable_data<T>(ctx.GetPlace()), y_n);
-
-    platform::ForRange<DeviceContext> for_range(
-        static_cast<const DeviceContext&>(ctx.device_context()), x_n);
-    for_range(functor);
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc
index 2a7be90dab1cc23ffe5e1c296c37a4bbeacb7d8e..29b73951bbddd9bfd73c932d7801797590de5e8e 100644
--- a/paddle/fluid/operators/is_empty_op.cc
+++ b/paddle/fluid/operators/is_empty_op.cc
@@ -12,46 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/is_empty_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
 
-constexpr char kInput[] = "X";
-constexpr char kOutput[] = "Out";
-
-class IsEmptyOp : public framework::OperatorBase {
+class IsEmptyOp : public framework::OperatorWithKernel {
  public:
-  IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs,
-            const framework::VariableNameMap &outputs,
-            const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // get input
-    auto *var = scope.FindVar(Input(kInput));
-    PADDLE_ENFORCE_NOT_NULL(var);
-    auto &tensor = var->Get<framework::LoDTensor>();
-    // get output
-    auto *out = scope.FindVar(Output(kOutput));
-    PADDLE_ENFORCE_NOT_NULL(out);
-    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IsEmptyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of IsEmptyOp should not be null.");
+    ctx->SetOutputDim("Out", {1});
+  }
 
-    out_tensor->Resize({1});
-    out_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
-        framework::product(tensor.dims()) == 0;
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        platform::CPUPlace());
+    return kt;
   }
 };
 
-class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+class IsEmptyOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(kInput, "(Tensor) Tensor which is to be checked.");
-    AddOutput(kOutput, "(Tensor) a boolean Tensor that indicate empty or not.");
+  void Make() override {
+    AddInput("X", "(LoDTensor) Tensor which is to be checked.");
+    AddOutput("Out",
+              "(LoDTensor) a boolean Tensor that indicate empty or not.");
     AddComment(R"DOC(
 IsEmpty Operator which checks whether a tensor is empty.
 
@@ -63,5 +58,12 @@ It will just return product(tensor.ddims()) > 0;
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp,
-                             paddle::operators::IsEmptyOpProtoMaker);
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    is_empty, ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::IsEmptyOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/is_empty_op.h b/paddle/fluid/operators/is_empty_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e3af22fa8d842b6a1e67418446f1a40949e046b
--- /dev/null
+++ b/paddle/fluid/operators/is_empty_op.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class IsEmptyOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // get input
+    auto* input_tensor = context.Input<framework::LoDTensor>("X");
+    // get output
+    auto* output_tensor = context.Output<framework::LoDTensor>("Out");
+
+    output_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
+        framework::product(input_tensor->dims()) == 0;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc
index 963b0587c386c72c05f8cc5d0b63074e9e726579..bc115090acb473ac3175999ca96c5e00c0aeaeae 100644
--- a/paddle/fluid/operators/l1_norm_op.cc
+++ b/paddle/fluid/operators/l1_norm_op.cc
@@ -48,8 +48,7 @@ class L1NormGradOp : public framework::OperatorWithKernel {
 
 class L1NormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) The input of l1_norm op.");
     AddOutput("Out", "(Scalar) The output of l1_norm op.");
     AddComment(R"DOC(
@@ -67,8 +66,9 @@ $$Out = \sum{|X|}$$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad,
-            ops::L1NormGradOp);
+REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp);
 REGISTER_OP_CPU_KERNEL(
     l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
index eef25f8a06ddb3311f3cfea21b64d8f7d7e58f24..da59bd53bce010d0d6ad2ab14acaffb9cc2f99e6 100644
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/label_smooth_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -46,8 +47,7 @@ class LabelSmoothOp : public framework::OperatorWithKernel {
 
 class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor) The input labels of LabelSmooth operator. This "
              "input can be batched labels in one-hot encoding or output from "
@@ -116,8 +116,9 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
-            label_smooth_grad, ops::LabelSmoothGradOp);
+REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp);
 REGISTER_OP_CPU_KERNEL(
     label_smooth,
     ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index 88b3b08af57eaf2d1086d778e3313c3dea6300fb..14ce1da2e97186a50ed8bd52223a500c4c57b328 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -61,38 +61,34 @@ class LayerNormOp : public framework::OperatorWithKernel {
 
 class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor) The input tensor.");
+  void Make() override {
+    AddInput("X", "The input tensor.");
     AddInput("Scale",
-             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "(optional) Scale is a 1-dimensional tensor of size "
              "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
              "It is applied to the output.")
         .AsDispensable();
     AddInput("Bias",
-             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "(optional) Bias is a 1-dimensional tensor of size "
              "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
              "It is applied to the output.")
         .AsDispensable();
-    AddOutput("Y", "(LoDTensor) Result after normalization.");
-    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
-        .AsIntermediate();
-    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+    AddOutput("Y", "Result after normalization.");
+    AddOutput("Mean", "Mean of the current mini batch.").AsIntermediate();
+    AddOutput("Variance", "Variance of the current mini batch.")
         .AsIntermediate();
 
     AddAttr<float>("epsilon",
-                   "(float, default 1e-5) Constant for "
-                   "numerical stability")
+                   "Constant for numerical stability [default 1e-5].")
         .SetDefault(1e-5)
         .AddCustomChecker([](const float &epsilon) {
           PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                          "'epsilon' should be between 0.0 and 0.001.");
         });
     AddAttr<int>("begin_norm_axis",
-                 "(int default:1), the "
-                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "the axis of `begin_norm_axis ... Rank(X) - 1` will be "
                  "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
-                 "matrix [N,H].")
+                 "matrix [N,H]. [default 1].")
         .SetDefault(1)
         .AddCustomChecker([](const int &begin_norm_axis) {
           PADDLE_ENFORCE_GT(begin_norm_axis, 0,
@@ -100,10 +96,14 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
         });
 
     AddComment(R"DOC(
-Layer Normalization.
-Layer Norm has been implemented as discussed in the paper:
-https://arxiv.org/abs/1607.06450
-...
+Assume feature vectors exist on dimensions
+:attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
+along these dimensions for each feature vector :math:`a` with size
+:math:`H`, then normalize each feature vector using the corresponding
+statistics. After that, apply learnable gain and bias on the normalized
+tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
+
+Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
 )DOC");
   }
 };
@@ -162,8 +162,9 @@ class LayerNormGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
-            layer_norm_grad, ops::LayerNormGradOp);
+REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp);
 REGISTER_OP_CPU_KERNEL(
     layer_norm, ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LayerNormKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h
index 605b5c258ca57b1a63c9b741a1a30dcb9fca2248..2e54bb497dec11eaeda03a1aa6acfd4cc261dbfe 100644
--- a/paddle/fluid/operators/layer_norm_op.h
+++ b/paddle/fluid/operators/layer_norm_op.h
@@ -15,13 +15,110 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-
 #include "paddle/fluid/operators/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 
+// Wrap RowwiseMean and ColwiseMean.
+// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is
+// significantly faster. Unlike the RowwiseMean and ColwiseMean, the
+// implementation only considers 2D.
+template <typename DeviceContext, typename T>
+struct RowwiseMean2D {
+  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx);
+
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* vec);
+};
+
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+class RowwiseMean2D<platform::CUDADeviceContext, T> {
+ public:
+  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx)
+      : left_(left), right_(right) {
+    framework::DDim ones_dim({right_});
+    divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
+    math::set_constant(dev_ctx, &divisor_, 1.0 / right);
+  }
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
+        false, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
+        out->data<T>());
+  }
+
+ private:
+  int left_;
+  int right_;
+  framework::Tensor divisor_;
+};
+#endif
+
+template <typename T>
+class RowwiseMean2D<platform::CPUDeviceContext, T> {
+ public:
+  RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
+
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    row_mean_(context, input, out);
+  }
+
+ private:
+  math::RowwiseMean<platform::CPUDeviceContext, T> row_mean_;
+};
+
+template <typename DeviceContext, typename T>
+struct ColwiseSum2D {
+  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx);
+
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* vec);
+};
+
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+class ColwiseSum2D<platform::CUDADeviceContext, T> {
+ public:
+  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx)
+      : left_(left), right_(right) {
+    framework::DDim ones_dim({left_});
+    divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
+    math::set_constant(dev_ctx, &divisor_, 1.0);
+  }
+
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
+        true, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
+        out->data<T>());
+  }
+
+ private:
+  int left_;
+  int right_;
+  framework::Tensor divisor_;
+};
+#endif
+
+template <typename T>
+class ColwiseSum2D<platform::CPUDeviceContext, T> {
+ public:
+  ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
+
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    col_wise_(context, input, out);
+  }
+
+ private:
+  math::ColwiseSum<platform::CPUDeviceContext, T> col_wise_;
+};
+
 template <typename T>
 struct SubAndSquareFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
@@ -67,15 +164,15 @@ using DataLayout = framework::DataLayout;
 template <typename DeviceContext, typename T>
 class LayerNormKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
     const float epsilon = ctx.Attr<float>("epsilon");
-    auto *scale = ctx.Input<Tensor>("Scale");
-    auto *bias = ctx.Input<Tensor>("Bias");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
     auto x = *ctx.Input<Tensor>("X");
 
-    auto *y = ctx.Output<Tensor>("Y");
-    auto *mean = ctx.Output<Tensor>("Mean");
-    auto *var = ctx.Output<Tensor>("Variance");
+    auto* y = ctx.Output<Tensor>("Y");
+    auto* mean = ctx.Output<Tensor>("Mean");
+    auto* var = ctx.Output<Tensor>("Variance");
     const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
 
     const auto x_dims = x.dims();
@@ -94,8 +191,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
     out.ShareDataWith(*y);
     out.Resize(matrix_shape);
 
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    math::RowwiseMean<DeviceContext, T> row_mean;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context());
 
     // get mean
     row_mean(dev_ctx, x, mean);
@@ -126,31 +223,32 @@ class LayerNormKernel : public framework::OpKernel<T> {
 template <typename DeviceContext, typename T>
 class LayerNormGradKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
     const float epsilon = ctx.Attr<float>("epsilon");
     auto x = *ctx.Input<Tensor>("X");
-    auto *y = ctx.Input<Tensor>("Y");
-    auto *mean = ctx.Input<Tensor>("Mean");
-    auto *var = ctx.Input<Tensor>("Variance");
-    auto *scale = ctx.Input<Tensor>("Scale");
-    auto *bias = ctx.Input<Tensor>("Bias");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* var = ctx.Input<Tensor>("Variance");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
     auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
     const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
 
     // init output
-    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
-    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
 
-    const auto &x_dims = x.dims();
+    const auto& x_dims = x.dims();
     auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
     int left = static_cast<int>(matrix_dim[0]);
     int right = static_cast<int>(matrix_dim[1]);
     framework::DDim matrix_shape({left, right});
 
     d_y.Resize(matrix_shape);
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    math::ColwiseSum<DeviceContext, T> colwise_sum;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    ColwiseSum2D<DeviceContext, T> colwise_sum(left, right,
+                                               ctx.device_context());
 
     Tensor temp;
     Tensor temp_norm;
@@ -190,7 +288,8 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
       Tensor temp_vec;
       temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
 
-      math::RowwiseMean<DeviceContext, T> row_mean;
+      RowwiseMean2D<DeviceContext, T> row_mean(left, right,
+                                               ctx.device_context());
 
       if (d_scale) {
         // dy_dx
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index ef568a578b0b97ea402a2a521f0fe1431013d1b7..ea1ca7f59db22bee973a8827a88e2fb80265fa51 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -19,8 +19,7 @@ namespace operators {
 
 class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Emission",
              "(LoDTensor, default LoDTensor<float>) "
              "A 2-D LoDTensor with shape [N x D], where N is the size of the "
@@ -68,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
         "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
         "The output is no longer a LoDTensor.");
     AddComment(R"DOC(
-LinearChainCRF Operator.
-
 Conditional Random Field defines an undirected probabilistic graph with nodes
 denoting random variables and edges denoting dependencies between these
 variables. CRF learns the conditional probability $P(Y|X)$, where
@@ -87,6 +84,7 @@ CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
 http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
 
 Equation:
+
 1. Denote Input(Emission) to this operator as $x$ here.
 2. The first D values of Input(Transition) to this operator are for starting
 weights, denoted as $a$ here.
@@ -109,6 +107,7 @@ Finally, the linear chain CRF operator outputs the logarithm of the conditional
 likelihood of each training sample in a mini-batch.
 
 NOTE:
+
 1. The feature function for a CRF is made up of the emission features and the
 transition features. The emission feature weights are NOT computed in
 this operator. They MUST be computed first before this operator is called.
@@ -256,8 +255,10 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker,
-            linear_chain_crf_grad, ops::LinearChainCRFGradOp);
+REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp,
+                  ops::LinearChainCRFOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp);
 REGISTER_OP_CPU_KERNEL(
     linear_chain_crf,
     ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
index 800a1303e1a427e7bd5e6c04354b8a5fbd816712..d5162bcd742c05980c89394b5d011bd078b61211 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -100,7 +100,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     auto x_row_max = EigenMatrix<T>::From(emission_row_max);
     x_row_max.device(place) =
         x.maximum(Eigen::DSizes<int, 1>(1))
-            .reshape(Eigen::DSizes<int, 2>(int(batch_size), 1));
+            .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
 
     auto x_exps = EigenMatrix<T>::From(*emission_exps);
     x_exps.device(place) =
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 4253300788462a3704076fc79241a864f2f130a0..56e39649b409f7eed108027f6df58c19dd3c8ab8 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -12,175 +12,377 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <stdint.h>
-#include <sys/stat.h>
-#include <ostream>
-#include <thread>
-
-#include <unistd.h>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/proto_desc.h"
-#include "paddle/fluid/operators/detail/grpc_server.h"
-#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
-#include "paddle/fluid/operators/detail/simple_block_queue.h"
-#include "paddle/fluid/string/printf.h"
+#include <stdio.h>  // for removing the port file
+#include <csignal>
+#include <cstdlib>
+#include <fstream>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "paddle/fluid/operators/detail/macros.h"
+
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/listen_and_serv_op.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
 
-constexpr char kOptimizeBlock[] = "OptimizeBlock";
-
-void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
-  service->RunSyncUpdate();
+void RunServer(std::shared_ptr<distributed::RPCServer> service) {
+  service->StartServer();
   VLOG(4) << "RunServer thread end";
 }
+static void split(const std::string &str, char sep,
+                  std::vector<std::string> *pieces) {
+  pieces->clear();
+  if (str.empty()) {
+    return;
+  }
+  size_t pos = 0;
+  size_t next = str.find(sep, pos);
+  while (next != std::string::npos) {
+    pieces->push_back(str.substr(pos, next - pos));
+    pos = next + 1;
+    next = str.find(sep, pos);
+  }
+  if (!str.substr(pos).empty()) {
+    pieces->push_back(str.substr(pos));
+  }
+}
 
-static void CreateTensorFromMessageType(framework::Variable *var,
-                                        sendrecv::VarType var_type) {
-  if (var_type == sendrecv::VarType::LOD_TENSOR) {
-    var->GetMutable<framework::LoDTensor>();
-  } else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
-    var->GetMutable<framework::SelectedRows>();
-  } else {
-    PADDLE_THROW(
-        "VariableMessage type %d is not in "
-        "[LoDTensor, SelectedRows]",
-        var_type);
+static void ParallelExecuteBlocks(
+    const std::vector<size_t> &parallel_blkids, framework::Executor *executor,
+    const std::vector<std::shared_ptr<framework::ExecutorPrepareContext>>
+        &prepared,
+    framework::ProgramDesc *program, framework::Scope *scope) {
+  std::vector<std::future<void>> fs;
+  for (size_t idx : parallel_blkids) {
+    fs.push_back(
+        framework::Async([&executor, &prepared, &program, &scope, idx]() {
+          int run_block = idx;  // thread local
+          try {
+            executor->RunPreparedContext(prepared[run_block].get(), scope);
+          } catch (const std::exception &e) {
+            LOG(ERROR) << "run sub program error " << e.what();
+          }
+        }));
   }
+  for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
 }
 
-class ListenAndServOp : public framework::OperatorBase {
- public:
-  ListenAndServOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {
-    if (!rpc_service_) {
-      std::string endpoint = Attr<std::string>("endpoint");
-      rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
-      server_thread_.reset(new std::thread(RunServer, rpc_service_));
+ListenAndServOp::ListenAndServOp(const std::string &type,
+                                 const framework::VariableNameMap &inputs,
+                                 const framework::VariableNameMap &outputs,
+                                 const framework::AttributeMap &attrs)
+    : OperatorBase(type, inputs, outputs, attrs) {}
+
+ListenAndServOp::~ListenAndServOp() { Stop(); }
+
+void ListenAndServOp::Stop() {
+  rpc_service_->ShutDown();
+  server_thread_->join();
+  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
+  remove(file_path.c_str());
+}
+
+void ListenAndServOp::SavePort() const {
+  // NOTE: default write file to /tmp/paddle.selected_port
+  rpc_service_->SavePort();
+}
+
+static int64_t GetTimestamp() {
+  struct timeval tp;
+  gettimeofday(&tp, NULL);
+  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
+}
+
+void ListenAndServOp::RunSyncLoop(
+    framework::Executor *executor, framework::ProgramDesc *program,
+    framework::Scope *recv_scope,
+    const std::vector<int> &prefetch_block_id_list,
+    const int checkpoint_point_block_id) const {
+  size_t num_blocks = program->Size();
+  auto optimize_blocks =
+      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
+  PADDLE_ENFORCE_GE(num_blocks, 2,
+                    "server program should have at least 2 blocks");
+
+  std::vector<int> optimize_blocks_idx;
+  for (auto blk : optimize_blocks) {
+    optimize_blocks_idx.push_back(blk->ID());
+  }
+  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_idx);
+  // Insert placeholder for block0 which holds current op itself.
+  optimize_prepared.insert(
+      optimize_prepared.begin(),
+      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
+
+  rpc_service_->ResetBarrierCounter();
+  while (true) {
+    // Get from multiple trainers, we don't care about the order in which
+    // the gradients arrives, just add suffix 0~n and merge the gradient.
+    rpc_service_->SetCond(distributed::kRequestSend);
+    rpc_service_->WaitBarrier(distributed::kRequestSend);
+
+    if (rpc_service_->IsExit()) {
+      LOG(WARNING) << "get exit!rpc_processor break!";
+      rpc_service_->SetCond(distributed::kRequestGet);
+      break;
     }
+
+    // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
+    // and this will still work.
+    // The optimize blocks which have the same parent ID would run parallel
+    // TODO(Yancey1989): need to use ParallelExecutor for future
+    int32_t last_parent_blkid = optimize_blocks[0]->Parent();
+    std::vector<size_t> parallel_blkids;
+    parallel_blkids.push_back(optimize_blocks[0]->ID());
+    double ts = GetTimestamp();
+    for (size_t i = 1; i < optimize_blocks.size(); ++i) {
+      // skip the first optimize block because it is already in the
+      // parallel_blkids.
+      int blkid = optimize_blocks[i]->ID();
+      if (program->Block(blkid).Parent() != last_parent_blkid) {
+        ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
+                              program, recv_scope);
+        parallel_blkids.clear();
+        last_parent_blkid = program->Block(blkid).Parent();
+      }
+      parallel_blkids.push_back(blkid);
+    }
+    ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program,
+                          recv_scope);
+    VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
+
+    rpc_service_->SetCond(distributed::kRequestGet);
+    rpc_service_->WaitBarrier(distributed::kRequestGet);
+    rpc_service_->ResetBarrierCounter();
+    // reset received sparse vars to avoid reuse it in the next mini-batch
+    dynamic_cast<distributed::RequestSendHandler *>(request_send_handler_.get())
+        ->ResetSparseVarRecorder();
+  }  // while(true)
+}
+
+void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
+                                   framework::ProgramDesc *program,
+                                   framework::Scope *recv_scope) const {
+  // grad name to block id
+  std::unordered_map<std::string, int32_t> grad_to_block_id;
+  std::unordered_map<int32_t, std::string> id_to_grad;
+
+  auto grad_to_block_id_str =
+      Attr<std::vector<std::string>>("grad_to_block_id");
+  for (const auto &grad_and_id : grad_to_block_id_str) {
+    std::vector<std::string> pieces;
+    split(grad_and_id, ':', &pieces);
+    VLOG(3) << "after split, grad = " << pieces[0] << ", id=" << pieces[1];
+    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    PADDLE_ENFORCE_EQ(grad_to_block_id.count(pieces[0]), 0);
+
+    int block_id = std::stoi(pieces[1]);
+    grad_to_block_id[pieces[0]] = block_id;
+    id_to_grad[block_id] = pieces[0];
   }
+  size_t num_blocks = program->Size();
+  PADDLE_ENFORCE_GE(num_blocks, 2,
+                    "server program should have at least 2 blocks");
 
-  void Stop() override {
-    detail::MessageWithName term_msg;
-    term_msg.first = LISTEN_TERMINATE_MESSAGE;
-    rpc_service_->Push(term_msg);
-    rpc_service_->ShutDown();
-    server_thread_->join();
+  std::vector<int> block_list;
+  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
+    block_list.push_back(blkid);
+  }
+  auto optimize_prepared = executor->Prepare(*program, block_list);
+  // execute global block if needed
+  if (block_list[0] == 1 && id_to_grad.count(1) == 0) {
+    executor->RunPreparedContext(optimize_prepared[0].get(), recv_scope);
+  }
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      grad_to_prepared_ctx;
+  for (size_t i = 0; i < block_list.size(); ++i) {
+    grad_to_prepared_ctx[id_to_grad[block_list[i]]] = optimize_prepared[i];
   }
 
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-    framework::Scope &recv_scope = scope.NewScope();
-
-    // FIXME(Yancey1989): initialize rpc server with lazy mode.
-    rpc_service_->SetScope(&recv_scope);
-    rpc_service_->SetDevCtx(&dev_ctx);
-    auto ins = Inputs("X");
-    auto fan_in = Attr<int>("Fanin");
-
-    auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-    auto *program = block->Program();
-    framework::Executor executor(dev_place);
-
-    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
-    bool exit_flag = false;
-    // Record received sparse variables, so that
-    // we could reset those after execute optimize program
-    std::vector<framework::Variable *> sparse_vars;
-    while (!exit_flag) {
-      // Get from multiple trainers, we don't care about the order in which
-      // the gradients arrives, just add suffix 0~n and merge the gradient.
-      rpc_service_->SetCond(0);
-      size_t recv_var_cnt = 0;
-      int batch_barrier = 0;
-      while (batch_barrier != fan_in) {
-        const detail::MessageWithName &v = rpc_service_->Get();
-        auto recv_var_name = v.first;
-        if (recv_var_name == LISTEN_TERMINATE_MESSAGE) {
-          LOG(INFO) << "received terminate message and exit";
-          exit_flag = true;
-          break;
-        } else if (recv_var_name == BATCH_BARRIER_MESSAGE) {
-          VLOG(3) << "recv batch barrier message";
-          batch_barrier++;
-          continue;
-        } else {
-          VLOG(3) << "received grad: " << recv_var_name;
-          recv_var_cnt++;
-          auto *var = recv_scope.FindVar(recv_var_name);
-          if (var == nullptr) {
-            LOG(ERROR) << "Can not find server side var: " << recv_var_name;
-            PADDLE_THROW("Can not find server side var");
-          }
-          detail::DeserializeFromMessage(v.second, dev_ctx, var);
-          if (var->IsType<framework::SelectedRows>()) {
-            sparse_vars.push_back(var);
-          }
-        }
-      }
-      if (exit_flag) {
-        rpc_service_->SetCond(1);
-        rpc_service_->ShutDown();
-        break;
-      }
-      try {
-        executor.Run(*program, &recv_scope, block->ID(), /*global_block*/
-                     false /*create_local_scope*/, false /*create_vars*/);
-      } catch (std::exception &e) {
-        LOG(ERROR) << "run sub program error " << e.what();
-      }
-      // Reset the received sparse variables, the sum operator would not
-      // sum the input sparse variables which rows is empty at the next
-      // mini-batch.
-      // TODO(Yancey1989): move the reset action into an operator, we couldn't
-      // have any hide logic in the operator.
-      for (auto &var : sparse_vars) {
-        var->GetMutable<framework::SelectedRows>()->mutable_rows()->clear();
-      }
-      rpc_service_->SetCond(1);
-      // FIXME(typhoonzero): use another condition to sync wait clients get.
-      rpc_service_->WaitClientGet(fan_in);
-      sparse_vars.clear();
-    }  // while(true)
+  request_send_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  request_get_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+  request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
+
+  while (true) {
+    if (rpc_service_->IsExit()) {
+      VLOG(4) << "get exit!rpc_processor break!";
+      break;
+    }
+
+    sleep(1);
+  }  // while(true)
+}
+
+static void FillRequestCtx(
+    distributed::RequestHandler *h, framework::Scope *scope,
+    platform::DeviceContext *dev_ctx, framework::Executor *executor,
+    framework::ProgramDesc *program,
+    std::unordered_map<std::string,
+                       std::shared_ptr<framework::ExecutorPrepareContext>>
+        *prefetch_ctx,
+    std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx,
+    distributed::RPCServer *rpc_server) {
+  h->SetScope(scope);
+  h->SetDevCtx(dev_ctx);
+  h->SetExecutor(executor);
+  h->SetProgram(program);
+  h->SetPrefetchPreparedCtx(prefetch_ctx);
+  h->SetRPCServer(rpc_server);
+  h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx);
+}
+
+void ListenAndServOp::RunImpl(const framework::Scope &scope,
+                              const platform::Place &dev_place) const {
+  // Mark this as PS that it should decide profiling by listening from trainer.
+  platform::SetProfileListener();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto &dev_ctx = *pool.Get(dev_place);
+  framework::Scope &recv_scope = scope.NewScope();
+
+  bool sync_mode = Attr<bool>("sync_mode");
+  auto fan_in = Attr<int>("Fanin");
+
+  PADDLE_ENFORCE(!rpc_service_);
+  std::string endpoint = Attr<std::string>("endpoint");
+  int checkpoint_block_id = Attr<int>(kCheckpointBlockId);
+
+  VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
+          << ", end_point:" << endpoint
+          << ", checkpoint_block_id: " << checkpoint_block_id;
+
+  rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
+
+  request_send_handler_.reset(new distributed::RequestSendHandler(sync_mode));
+  request_get_handler_.reset(new distributed::RequestGetHandler(sync_mode));
+  request_prefetch_handler_.reset(
+      new distributed::RequestPrefetchHandler(sync_mode));
+  request_checkpoint_handler_.reset(new distributed::RequestCheckpointHandler(
+      sync_mode, checkpoint_block_id));
+
+  rpc_service_->RegisterRPC(distributed::kRequestSend,
+                            request_send_handler_.get());
+  rpc_service_->RegisterRPC(distributed::kRequestGet,
+                            request_get_handler_.get());
+  rpc_service_->RegisterRPC(distributed::kRequestPrefetch,
+                            request_prefetch_handler_.get());
+  rpc_service_->RegisterRPC(distributed::kRequestCheckpoint,
+                            request_checkpoint_handler_.get());
+
+  auto optimize_blocks =
+      Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
+  PADDLE_ENFORCE(optimize_blocks.size() >= 1,
+                 "optimize blocks should be 1 at least on the pserver side.");
+  auto *program = optimize_blocks[0]->Program();
+  framework::Executor executor(dev_place);
+
+  std::shared_ptr<framework::ExecutorPrepareContext> ckpt_pre_context = nullptr;
+  if (checkpoint_block_id != -1) {
+    auto ctx = executor.Prepare(*program, checkpoint_block_id);
+    // see: https://stackoverflow.com/a/14856553
+    ckpt_pre_context = std::move(ctx);
   }
 
- protected:
-  std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
-  std::shared_ptr<std::thread> server_thread_;
-};
+  // prepare for prefetch
+  std::vector<int> prefetch_block_id_list;
+  std::unordered_map<int, std::string> block_id_to_prefetch_var_name;
+
+  auto prefetch_var_name_to_block_id_str =
+      Attr<std::vector<std::string>>(kPrefetchVarNameToBlockId);
+  for (const auto &prefetch_var_name_and_id :
+       prefetch_var_name_to_block_id_str) {
+    std::vector<std::string> pieces;
+    split(prefetch_var_name_and_id, ':', &pieces);
+    VLOG(3) << "after split, prefetch_var = " << pieces[0]
+            << ", id=" << pieces[1];
+    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+
+    int block_id = std::stoi(pieces[1]);
+    prefetch_block_id_list.push_back(block_id);
+    block_id_to_prefetch_var_name[block_id] = pieces[0];
+  }
+
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block_id_list);
+
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>
+      prefetch_var_name_to_prepared_ctx;
+  for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) {
+    auto block_id = prefetch_block_id_list[i];
+    auto prefetch_var_name = block_id_to_prefetch_var_name[block_id];
+    prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
+  }
+
+  auto f =
+      std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx,
+                &executor, program, &prefetch_var_name_to_prepared_ctx,
+                ckpt_pre_context, rpc_service_.get());
+
+  f(request_send_handler_.get());
+  f(request_get_handler_.get());
+  f(request_prefetch_handler_.get());
+  f(request_checkpoint_handler_.get());
+
+  // start the server listening after all member initialized.
+  server_thread_.reset(new std::thread(RunServer, rpc_service_));
+  VLOG(3) << "wait server thread to become ready...";
+  rpc_service_->WaitServerReady();
+
+  // register SIGINT(from ctrl+C) and SIGTERM(from kill) signal handlers
+  signal(SIGINT, SignalHandler::StopAndExit);
+  signal(SIGTERM, SignalHandler::StopAndExit);
+
+  // Write to a file of server selected port for python use.
+  SavePort();
+  if (sync_mode) {
+    RunSyncLoop(&executor, program, &recv_scope, prefetch_block_id_list,
+                checkpoint_block_id);
+  } else {
+    RunAsyncLoop(&executor, program, &recv_scope);
+  }
+}
 
 class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ListenAndServOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddInput("X", "(Tensor) Variables that server recv.").AsDuplicable();
-    AddComment(R"DOC(
-ListenAndServ operator
-
-This operator will start a RPC server which can receive variables
-from send_op and send back variables to recv_op.
-)DOC");
+    AddComment(R"DOC(" + "ListenAndServ operator" + "\n" + "This operator" +
+" will start a RPC server which can receive variables from send_op and send" +
+"back variables to recv_op.)DOC");
     AddAttr<std::string>("endpoint",
                          "(string, default 127.0.0.1:6164)"
                          "IP address to listen on.")
         .SetDefault("127.0.0.1:6164")
         .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
-    AddAttr<framework::BlockDesc *>(kOptimizeBlock,
-                                    "BlockID to run on server side.");
+    AddAttr<std::vector<std::string>>(
+        "grad_to_block_id",
+        "['param1@GRAD.block0:1', 'param2@GRAD.blockn:2'] "
+        "a map from grad name to it's optimize block id")
+        .SetDefault({});
+    AddAttr<bool>("sync_mode", "if works at sync_mode or not").SetDefault(true);
+    AddAttr<std::vector<framework::BlockDesc *>>(
+        kOptimizeBlocks, "Optimize blocks to run on server side.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
+                                      "prefetch blocks to run on server side.")
+        .SetDefault({});
     AddAttr<int>("Fanin", "How many clients send to this server.")
         .SetDefault(1);
+    AddAttr<int>(kCheckpointBlockId,
+                 "BolckID to run save checkpoint on pserer.")
+        .SetDefault(-1);
   }
 };
 
+void SignalHandler::StopAndExit(int signal_num) {
+  // Do not use VLOG here for the device for printing maybe already released.
+  // exit will release interal allocated resoureces.
+  exit(0);
+}
+
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..978969cc515c7954b59f2bf7a4f2c0e1b13f9bc0
--- /dev/null
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <atomic>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/request_handler.h"
+#include "paddle/fluid/operators/distributed/rpc_server.h"
+
+namespace paddle {
+namespace operators {
+
+constexpr char kOptimizeBlocks[] = "optimize_blocks";
+constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
+constexpr char kCheckpointBlockId[] = "checkpint_block_id";
+
+void RunServer(std::shared_ptr<distributed::RPCServer> service);
+
+class ListenAndServOp : public framework::OperatorBase {
+ public:
+  ListenAndServOp(const std::string& type,
+                  const framework::VariableNameMap& inputs,
+                  const framework::VariableNameMap& outputs,
+                  const framework::AttributeMap& attrs);
+
+  virtual ~ListenAndServOp();
+
+  void RunSyncLoop(framework::Executor* executor,
+                   framework::ProgramDesc* program,
+                   framework::Scope* recv_scope,
+                   const std::vector<int>& prefetch_block_id_list,
+                   const int checkpoint_point_block_id) const;
+
+  void RunAsyncLoop(framework::Executor* executor,
+                    framework::ProgramDesc* program,
+                    framework::Scope* recv_scope) const;
+
+  void SavePort() const;
+
+  int GetSelectedPort() { return rpc_service_->GetSelectedPort(); }
+
+  void Stop() override;
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override;
+
+ protected:
+  mutable std::shared_ptr<distributed::RPCServer> rpc_service_;
+  mutable std::shared_ptr<distributed::RequestHandler> request_send_handler_;
+  mutable std::shared_ptr<distributed::RequestHandler> request_get_handler_;
+  mutable std::shared_ptr<distributed::RequestHandler>
+      request_prefetch_handler_;
+  mutable std::shared_ptr<distributed::RequestHandler>
+      request_checkpoint_handler_;
+
+  mutable std::shared_ptr<std::thread> server_thread_;
+};
+
+class SignalHandler {
+ public:
+  static void StopAndExit(int signal_num);
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(SignalHandler);
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index e5353144e91455fc71460459e6e799b54f750f71..0522a94195786c767194ec727d982a60451e7c62 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
-
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
 
@@ -31,6 +31,7 @@ class LoadCombineOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
+    auto load_as_fp16 = Attr<bool>("load_as_fp16");
 
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin),
@@ -59,17 +60,25 @@ class LoadCombineOp : public framework::OperatorBase {
       // Get data from fin to tensor
       DeserializeFromStream(fin, tensor, dev_ctx);
 
-      if (platform::is_gpu_place(place)) {
-        // copy CPU to GPU
-        framework::LoDTensor cpu_tensor;
-        cpu_tensor.ShareDataWith(*tensor);
-        cpu_tensor.set_lod(tensor->lod());
-
-        // reset tensor
+      auto in_dtype = framework::ToDataType(tensor->type());
+      auto out_dtype =
+          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        // convert to float16 tensor
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor fp16_tensor;
+        // copy LoD info to the new tensor
+        fp16_tensor.set_lod(tensor->lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                                 &fp16_tensor);
+
+        // reset output tensor
         out_var->Clear();
         tensor = out_var->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(cpu_tensor.lod());
-        TensorCopy(cpu_tensor, place, dev_ctx, tensor);
+        tensor->set_lod(fp16_tensor.lod());
+        tensor->ShareDataWith(fp16_tensor);
       }
     }
   }
@@ -77,12 +86,18 @@ class LoadCombineOp : public framework::OperatorBase {
 
 class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddOutput(
         "Out",
         "(vector) The output LoDTensors that will be read from the input file.")
         .AsDuplicable();
+    AddAttr<bool>(
+        "load_as_fp16",
+        "(boolean, default false)"
+        "If true, the tensor will be first loaded and then "
+        "converted to float16 data type. Otherwise, the tensor will be "
+        "directly loaded without data type conversion.")
+        .SetDefault(false);
     AddAttr<std::string>("file_path",
                          "(string) "
                          "LoDTensors will be loaded from \"file_path\".")
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 05f809ac5628420251957116bb2390b4502f11b8..ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -13,8 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
 
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -29,6 +31,11 @@ class LoadOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
+    auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+    platform::RecordEvent record_event(Type(), dev_ctx);
+
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
     auto filename = Attr<std::string>("file_path");
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
@@ -39,43 +46,75 @@ class LoadOp : public framework::OperatorBase {
     PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
                    out_var_name);
 
-    auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoadLodTensor(fin, place, out_var);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      LoadSelectedRows(fin, place, out_var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "Load only support LoDTensor and SelectedRows, %s has wrong type",
+          out_var_name);
+    }
+  }
 
+  void LoadLodTensor(std::istream &fin, const platform::Place &place,
+                     framework::Variable *var) const {
+    // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
     DeserializeFromStream(fin, tensor, dev_ctx);
 
-    if (platform::is_gpu_place(place)) {
-      // copy CPU to GPU
-      framework::LoDTensor cpu_tensor;
-      cpu_tensor.ShareDataWith(*tensor);
-      cpu_tensor.set_lod(tensor->lod());
-
-      // reset tensor
-      out_var->Clear();
-      tensor = out_var->GetMutable<framework::LoDTensor>();
-      tensor->set_lod(cpu_tensor.lod());
-      TensorCopy(cpu_tensor, place, dev_ctx, tensor);
+    auto load_as_fp16 = Attr<bool>("load_as_fp16");
+    auto in_dtype = framework::ToDataType(tensor->type());
+    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      // convert to float16 tensor
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor fp16_tensor;
+      // copy LoD info to the new tensor
+      fp16_tensor.set_lod(tensor->lod());
+      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                               &fp16_tensor);
+
+      // reset output tensor
+      var->Clear();
+      tensor = var->GetMutable<framework::LoDTensor>();
+      tensor->set_lod(fp16_tensor.lod());
+      tensor->ShareDataWith(fp16_tensor);
     }
   }
+
+  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
+                        framework::Variable *var) const {
+    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
+  }
 };
 
 class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddOutput("Out", "(Tensor) The tensor need to be loaded");
+  void Make() override {
+    AddOutput("Out", "The LoDTensor / SelectedRows need to be loaded");
+    AddAttr<bool>(
+        "load_as_fp16",
+        "If true, the tensor will be first loaded and then "
+        "converted to float16 data type. Otherwise, the tensor will be "
+        "directly loaded without data type conversion. Default is false.")
+        .SetDefault(false);
     AddAttr<std::string>("file_path",
-                         "(string) "
-                         "Variable will be loaded from \"file_path\".")
+                         R"(Variable will be loaded from "file_path")")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
-    AddComment(R"DOC(
-Load Operator.
-
-Load operator will load a tensor variable from disk file.
-
-)DOC");
+    AddComment(
+        "Load operator will load a LoDTensor / SelectedRows variable from disk "
+        "file.");
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/lod_array_length_op.cc b/paddle/fluid/operators/lod_array_length_op.cc
index e6212405770093455ec89bde9dc0a092b956fc83..e4551b8ba681fe92ac5f21bb0b509f43439f6b66 100644
--- a/paddle/fluid/operators/lod_array_length_op.cc
+++ b/paddle/fluid/operators/lod_array_length_op.cc
@@ -40,8 +40,7 @@ class LoDArrayLengthOp : public framework::OperatorBase {
 
 class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(LoDTensorArray) The input tensor array.");
     AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc
index 590b44e14f518c3c60c141c9a0dfe7f2b96f69c6..166952fe23192799443ef9c9d1f7ba5056d19290 100644
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -38,8 +38,7 @@ class LoDRankTableOp : public framework::OperatorBase {
 
 class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor) input lod tensor, must contain lod information.");
     AddOutput("Out", "(LoDRankTable) The rank table of specific level.");
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 6a66297cb843ead1a507a6867c1c562224861cbf..0d4e84e85083399e3803d0648dc7a10aa276d536 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -22,17 +22,16 @@ class LoDResetOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    // input check
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of LoDResetOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of LoDResetOp should not be null.");
-    // If target LoD is not set form Input(), then it must be set from Attr().
-    if (!ctx->HasInput("TargetLoD")) {
+
+    if (!ctx->HasInput("Y")) {
       auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
-      PADDLE_ENFORCE(level0.size() > 1,
-                     "Target LoD is not found, should be set to be a valid one "
-                     "through Input() or Attr().");
+      PADDLE_ENFORCE_GT(level0.size(), 1,
+                        "If Input(Y) not provided, the target lod should be "
+                        "specified by attribute `target_lod`.");
     }
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
   }
@@ -48,38 +47,78 @@ class LoDResetOp : public framework::OperatorWithKernel {
 
 class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
-    AddInput("TargetLoD",
-             "(Tensor, optional) The target level 0 LoD from Input().")
+  void Make() override {
+    AddInput("X",
+             "(Tensor, LoDTensor) Input variable of LoDResetOp which "
+             "could be a Tensor or LoDTensor, where the data of output "
+             "variable inherits from.");
+    AddInput("Y",
+             "(Tensor, LoDTensor, optional) If provided and Y is LoDTensor, "
+             "lod of Input(Y) would be considered as the target lod first, "
+             "otherwise data of Input(Y) would be considered as the "
+             "target lod.")
         .AsDispensable();
-    AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator.");
+    AddOutput("Out",
+              "(LoDTensor) Output variable of LoDResetOp which should be a "
+              "LoDTensor.");
     AddAttr<std::vector<int>>("target_lod",
                               "The target level 0 LoD from Attr().")
         .SetDefault(std::vector<int>{});
     AddComment(R"DOC(LoDReset operator
 
-Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or
-Attr(target_lod), or set LoD for Input(X) if it doesn't have one.
-Currently the lod_reset operator only supports the reset of level 0 LoD.
-At least one of Input(TargetLoD) and Attr(target_lod) must be set,
-and if both of them are set, Input(TargetLoD) will be chosen as the
-target LoD.
+Set LoD of `X` to a new one specified by `Y` or attribute `target_lod`. When `Y`
+provided and `Y` is a LoDTensor, `Y.lod` would be considered as target LoD
+first, otherwise `Y.data` would be considered as target LoD. If `Y` is not
+provided, target LoD should be specified by attribute `target_lod`.
+If target LoD is specified by `Y.data` or `target_lod`, only one level LoD
+is supported.
+
+Example 1:
+
+Given a 1-level LoDTensor input(X):
+    X.lod =  [[ 0,     2,                   5      6 ]]
+    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    X.dims = [6, 1]
+
+attr(target_lod): [0, 4, 6]
+
+then we get a 1-level LoDTensor:
+    Out.lod =  [[ 0,                   4,            6 ]]
+    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    Out.dims = [6, 1]
+
+Example 2:
 
-An example:
-Given a float LoDTensor X with shape (6, 1), its transpose form represents
+Given a 1-level LoDTensor input(X):
+    X.lod =  [[ 0,     2,                   5      6 ]]
+    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    X.dims = [6, 1]
 
-    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+input(Y) is a Tensor:
+    Y.data = [[0, 2, 6]]
+    Y.dims = [1, 3]
 
-with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like
+then we get a 1-level LoDTensor:
+    Out.lod =  [[ 0,     2,                          6 ]]
+    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    Out.dims = [6, 1]
 
-    [1.0, 2.0], [3.0, 4.0, 5.0], [6.0].
+Example 3:
 
-If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and
-the sequences that the LoDTensor Output(Out) contains becomes:
+Given a 1-level LoDTensor input(X):
+    X.lod =  [[ 0,      2,                   5     6 ]]
+    X.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    X.dims = [6, 1]
 
-    [1.0, 2.0, 3.0, 4.0], [5.0, 6.0].
+input(Y) is a 2-level LoDTensor:
+    Y.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+    Y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]]
+    Y.dims = [6, 1]
+
+then we get a 2-level LoDTensor:
+    Out.lod =  [[0, 2, 4], [0, 2, 5, 6]]
+    Out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+    Out.dims = [6, 1]
 
 )DOC");
   }
@@ -90,10 +129,16 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LoDResetGradOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) shouldn't be null.");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+                   "Input(Out@Grad) of LoDResetGradOp should not be null.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
+    }
   }
 
  protected:
@@ -109,11 +154,16 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
-            ops::LoDResetGradOp);
-REGISTER_OP_CPU_KERNEL(lod_reset,
-                       ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
-                       ops::LoDResetKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp);
+REGISTER_OP_CPU_KERNEL(
+    lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
+    ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
+    ops::LoDResetKernel<paddle::platform::CPUPlace, int>,
+    ops::LoDResetKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
-    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>);
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, int>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/fluid/operators/lod_reset_op.cu b/paddle/fluid/operators/lod_reset_op.cu
index b0e87a851a77a1cc98d419a63d4d9e5e1b9dd163..888d4c12eb4e3f4fd94d8dd4178c59acd0abb23b 100644
--- a/paddle/fluid/operators/lod_reset_op.cu
+++ b/paddle/fluid/operators/lod_reset_op.cu
@@ -18,8 +18,12 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(
     lod_reset, ops::LoDResetKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     lod_reset_grad,
     ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h
index 8186d4f8262101edc723af390eee1aec4fa6f3a5..d36aa0ce025a1c0f717913131fcc75040d16afac 100644
--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -26,35 +28,45 @@ class LoDResetKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out = ctx.Output<framework::LoDTensor>("Out");
     auto* in = ctx.Input<framework::LoDTensor>("X");
-    auto* lod_t = ctx.Input<framework::Tensor>("TargetLoD");
+    auto* lod_t = ctx.Input<framework::LoDTensor>("Y");
+
+    out->ShareDataWith(*in);
 
     std::vector<int> level0;
     if (lod_t) {
-      auto* lod = lod_t->data<int>();
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        framework::Tensor lod_cpu;
-        framework::TensorCopy(*lod_t, platform::CPUPlace(),
-                              ctx.device_context(), &lod_cpu);
-        lod = lod_cpu.data<int>();
+      if (lod_t->lod().size() > 0) {
+        auto y_lod = lod_t->lod();
+        auto last_level = y_lod[y_lod.size() - 1];
+        PADDLE_ENFORCE_EQ((int64_t)(last_level.back()), in->dims()[0],
+                          "Last value of `Y`'s last level LoD should be equal "
+                          "to the first dimension of `X`");
+        out->set_lod(y_lod);
+        return;  // early return, since lod already set
+      } else {
+        auto* lod = lod_t->data<int>();
+        if (platform::is_gpu_place(ctx.GetPlace())) {
+          framework::Tensor lod_cpu;
+          framework::TensorCopySync(*lod_t, platform::CPUPlace(), &lod_cpu);
+          lod = lod_cpu.data<int>();
+        }
+        level0 = std::vector<int>(lod, lod + lod_t->numel());
       }
-      level0 = std::vector<int>(lod, lod + lod_t->numel());
     } else {
       level0 = ctx.Attr<std::vector<int>>("target_lod");
     }
 
-    PADDLE_ENFORCE(level0.size() > 1UL,
-                   "The size of target LoD should be greater than 1.");
-    PADDLE_ENFORCE(level0[0] == 0,
-                   "Target LoD should be a vector starting from 0.");
-    PADDLE_ENFORCE(level0.back() == in->dims()[0],
-                   "Target LoD should be a vector end with the "
-                   "first dimension of Input(X).");
+    PADDLE_ENFORCE_GT(level0.size(), 1UL,
+                      "Size of target LoD should be greater than 1.");
+    PADDLE_ENFORCE_EQ(level0[0], 0,
+                      "Target LoD should be a vector starting from 0.");
+    PADDLE_ENFORCE_EQ(level0.back(), in->dims()[0],
+                      "Target LoD should be a vector end with the "
+                      "first dimension of Input(X).");
     for (size_t i = 0; i < level0.size() - 1; ++i) {
       PADDLE_ENFORCE(level0[i + 1] > level0[i],
                      "Target LoD should be an ascending vector.");
     }
 
-    out->ShareDataWith(*in);
     // cast level0 to size_t
     std::vector<size_t> ulevel0(level0.size(), 0);
     std::transform(level0.begin(), level0.end(), ulevel0.begin(),
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index 543495ce4e66c0955c9ce1b0db480088069b36db..00ba5ce8ee5e4084c8af204cfc37fe80c437f0d7 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -105,8 +105,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
 
 class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "");
     AddInput("RankTable", "");
     AddOutput("Out", "");
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index f44996d8ac746a33750a979eff2cbbc84e10214b..9d248e03218b83a65b9786cb317aafbe3dbb67ee 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -46,8 +46,7 @@ class LogLossOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Predicted",
              "The input value (Predicted) of Log loss op."
              "Predicted is a 2-D tensor with shape [batch_size, 1].");
@@ -106,8 +105,9 @@ class LogLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
-            ops::LogLossGradOp);
+REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc
index 6a7db31cf36f31064259abeb0348e682be9f917c..26970db8d2af62bb06fce4eb1a1f21fd41617bd1 100644
--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/logical_op.h"
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
@@ -20,8 +21,7 @@ namespace operators {
 template <typename OpComment>
 class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     OpComment comment;
     AddInput("X",
              string::Sprintf("(LoDTensor) Left hand operand of %s operator",
@@ -44,8 +44,7 @@ Each element of Out is calculated by %s
 template <typename OpComment>
 class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     OpComment comment;
     AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator",
                                   comment.type));
@@ -147,6 +146,6 @@ REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
 REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
                               paddle::operators::LogicalNotFunctor);
 REGISTER_BINARY_LOGICAL_OP(logical_xor,
-                           "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$");
+                           "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$");
 REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
                                paddle::operators::LogicalXorFunctor);
diff --git a/paddle/fluid/operators/lookup_sparse_table_op.cc b/paddle/fluid/operators/lookup_sparse_table_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ce11e712fb1a8aa9748313ec7cf4e895a931465
--- /dev/null
+++ b/paddle/fluid/operators/lookup_sparse_table_op.cc
@@ -0,0 +1,164 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+constexpr int64_t kNoPadding = -1;
+
+class LookupSparseTableInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LookupSparseTableOp should not be null.");
+    auto shape_w = ctx->GetInputDim("W");
+    auto shape_ids = ctx->GetInputDim("Ids");
+    shape_w[0] = shape_ids.size();
+    ctx->SetOutputDim("Out", shape_w);
+  }
+};
+
+class LookupSparseTableOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    auto out_var = scope.FindVar(Output("Out"));
+    auto w_var = scope.FindVar(Input("W"));
+    auto ids_var = scope.FindVar(Input("Ids"));
+    unsigned int seed = static_cast<unsigned int>(Attr<int>("seed"));
+    float min = Attr<float>("min");
+    float max = Attr<float>("max");
+    bool auto_grown_table = Attr<bool>("auto_grown_table");
+
+    PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
+                   "The type of Out var should be LodTensor.");
+    PADDLE_ENFORCE(w_var->IsType<framework::SelectedRows>(),
+                   "The type of W var should be SelectedRows.");
+    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
+                   "The type of Ids var should be LoDTensor.");
+    auto &ids_t = ids_var->Get<framework::LoDTensor>();
+    auto out_t = out_var->GetMutable<framework::LoDTensor>();
+    auto w_t = w_var->GetMutable<framework::SelectedRows>();
+    std::vector<int64_t> keys;
+    keys.resize(ids_t.numel());
+    for (int64_t i = 0; i < ids_t.numel(); ++i) {
+      keys[i] = ids_t.data<int64_t>()[i];
+    }
+
+    // TODO(Yancey1989): support CUDA Place for the sparse table
+    platform::CPUPlace cpu;
+    auto out_shape = w_t->value().dims();
+    out_shape[0] = keys.size();
+    out_t->Resize(out_shape);
+    out_t->mutable_data(cpu, w_t->value().type());
+    PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
+                      framework::proto::VarType::FP32,
+                      "The sparse table only support FP32");
+    auto non_keys_pair = w_t->Get(keys, out_t);
+    if (!auto_grown_table) {
+      PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast<size_t>(0),
+                        "there is some keys does exists in the sparse table.");
+    }
+    auto value_shape = w_t->value().dims();
+    value_shape[0] = 1;
+    for (const auto &it : non_keys_pair) {
+      const auto key = it.first;
+      const auto index = it.second;
+      framework::Tensor value;
+      value.Resize(value_shape);
+      auto data = value.mutable_data<float>(cpu);
+
+      std::minstd_rand engine;
+      engine.seed(seed);
+      std::uniform_real_distribution<float> dist(min, max);
+      int64_t size = value.numel();
+      for (int64_t i = 0; i < size; ++i) {
+        data[i] = dist(engine);
+      }
+      w_t->Set(key, value);
+      memory::Copy(cpu, out_t->mutable_data<float>(cpu) + index * value.numel(),
+                   cpu, value.data<float>(), value.numel() * sizeof(float));
+    }
+  }
+};
+
+class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("W",
+             "(SelectedRows) The input represents embedding table, "
+             "which is a learnable parameter.");
+    AddInput("Ids",
+             "(LoDTensor) Ids's type should be LoDTensor"
+             "THe ids to be looked up in W.");
+    AddOutput("Out",
+              "(LoDTensor) The lookup results, which have the "
+              "same type as W.");
+    AddAttr<int64_t>("padding_idx",
+                     "(int64, default -1) "
+                     "If the value is -1, it makes no effect to lookup. "
+                     "Otherwise the given value indicates padding the output "
+                     "with zeros whenever lookup encounters it in Ids.")
+        .SetDefault(kNoPadding);
+    AddAttr<float>("min",
+                   "(float, default -1.0) "
+                   "Minimum value of uniform random")
+        .SetDefault(-1.0f);
+    AddAttr<float>("max",
+                   "(float, default 1.0) "
+                   "Maximum value of uniform random")
+        .SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "(int, default 0) "
+                 "Random seed used for generating samples. "
+                 "0 means use a seed generated by the system."
+                 "Note that if seed is not 0, this operator will always "
+                 "generate the same random numbers every time.")
+        .SetDefault(0);
+    AddAttr<bool>("auto_grown_table",
+                  "(bool default false)"
+                  "Whether create new value if for nonexistent key.")
+        .SetDefault(true);
+    AddComment(R"DOC(
+Lookup Sprase Tablel Operator.
+
+This operator is used to perform lookup on parameter W,
+then concatenated into a sparse tensor.
+
+The type of Ids(Input) is SelectedRows, the rows of Ids contains
+the ids to be looked up in W;
+if the Id is not in the sparse table, this operator will return a
+random value and set the value into the table for the next looking up.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lookup_sparse_table, ops::LookupSparseTableOp,
+                  ops::LookupSparseTableInferShape,
+                  ops::LookupSparseTableOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 50eeadab72e71f39325c5eda69e9a3c3e6517d7d..bda499432214b8841c8dfc406ee45ca0367920e7 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -51,16 +51,14 @@ class LookupTableOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
-        ctx.device_context());
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
+    return framework::OpKernelType(data_type, ctx.device_context());
   }
 };
 
 class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("W",
              "(Tensor) The input represents embedding tensors, "
              "which is a learnable parameter.");
@@ -79,12 +77,15 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(boolean, default false) "
                   "Sparse update.")
         .SetDefault(false);
+    AddAttr<bool>("is_distributed",
+                  "(boolean, default false) distributed lookup table.")
+        .SetDefault(false);
     AddAttr<int64_t>("padding_idx",
                      "(int64, default -1) "
                      "If the value is -1, it makes no effect to lookup. "
                      "Otherwise the given value indicates padding the output "
                      "with zeros whenever lookup encounters it in Ids.")
-        .SetDefault(-1);
+        .SetDefault(kNoPadding);
     AddComment(R"DOC(
 Lookup Table Operator.
 
@@ -124,9 +125,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
-        ctx.device_context());
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
+    return framework::OpKernelType(data_type, ctx.device_context());
   }
 };
 
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index 6d81fccd2059c511f71d403229e04587e553e93d..77722c50d39003d9342afb04a61ae3aaf6b21100 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/lookup_table_op.h"
 #include "paddle/fluid/platform/assert.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index c92ce78eeffb8f1517e61c6d6624d406e04d974d..d482506bf0361c11a019e32efbf348a64aaf5164 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -25,16 +28,33 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 using SelectedRows = framework::SelectedRows;
+using DDim = framework::DDim;
+
+constexpr int64_t kNoPadding = -1;
 
 template <typename T>
 class LookupTableKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* table_t = context.Input<LoDTensor>("W");
-    auto* ids_var = context.InputVar("Ids");
-    Tensor* output_t = context.Output<Tensor>("Out");
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_var = context.InputVar("W");
+    auto *ids_var = context.InputVar("Ids");
+    Tensor *output_t = context.Output<Tensor>("Out");
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+
+    DDim table_dim;
 
-    int64_t* ids;
+    if (table_var->IsType<LoDTensor>()) {
+      table_dim = context.Input<LoDTensor>("W")->dims();
+    } else if (table_var->IsType<SelectedRows>()) {
+      auto *table_t = context.Input<SelectedRows>("W");
+      table_dim = table_t->value().dims();
+    } else {
+      PADDLE_THROW(
+          "The parameter W of a LookupTable "
+          "must be either LoDTensor or SelectedRows");
+    }
+
+    int64_t *ids;
     int64_t ids_numel;
 
     // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type
@@ -42,39 +62,51 @@ class LookupTableKernel : public framework::OpKernel<T> {
     // when Ids's type is SelectedRows, the rows of Ids contains the
     // ids to be looked up in W.
     if (ids_var->IsType<LoDTensor>()) {
-      auto* ids_t = context.Input<LoDTensor>("Ids");
-      ids = const_cast<int64_t*>(ids_t->data<int64_t>());
+      auto *ids_t = context.Input<LoDTensor>("Ids");
+      ids = const_cast<int64_t *>(ids_t->data<int64_t>());
       ids_numel = ids_t->numel();
     } else if (ids_var->IsType<SelectedRows>()) {
-      auto* ids_t = context.Input<SelectedRows>("Ids");
-      ids = const_cast<int64_t*>(ids_t->rows().data());
+      auto *ids_t = context.Input<SelectedRows>("Ids");
+      ids = const_cast<int64_t *>(ids_t->rows().data());
       ids_numel = ids_t->rows().size();
-      output_t->Resize({ids_numel, table_t->dims()[1]});
+      output_t->Resize({ids_numel, table_dim[1]});
     } else {
       PADDLE_THROW("Unsupported Variable Type of Ids");
     }
 
-    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+    if (table_var->IsType<LoDTensor>()) {
+      auto *table_t = context.Input<LoDTensor>("W");
+      int64_t row_number = table_t->dims()[0];
+      int64_t row_width = table_t->dims()[1];
 
-    int N = table_t->dims()[0];
-    int D = table_t->dims()[1];
-    auto* table = table_t->data<T>();
-    auto* output = output_t->mutable_data<T>(context.GetPlace());
+      auto *table = table_t->data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
 
-    if (padding_idx == -1) {
       for (int64_t i = 0; i < ids_numel; ++i) {
-        PADDLE_ENFORCE_LT(ids[i], N);
-        PADDLE_ENFORCE_GE(ids[i], 0);
-        memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_LT(ids[i], row_number);
+          PADDLE_ENFORCE_GE(ids[i], 0);
+          memcpy(output + i * row_width, table + ids[i] * row_width,
+                 row_width * sizeof(T));
+        }
       }
-    } else {
+    } else if (table_var->IsType<SelectedRows>()) {
+      const auto &table_t = table_var->Get<SelectedRows>();
+      int64_t row_width = table_t.value().dims()[1];
+      const auto *table = table_t.value().data<T>();
+      auto *output = output_t->mutable_data<T>(context.GetPlace());
+
       for (int64_t i = 0; i < ids_numel; ++i) {
-        if (ids[i] == padding_idx) {
-          memset(output + i * D, 0, D * sizeof(T));
+        if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+          memset(output + i * row_width, 0, row_width * sizeof(T));
         } else {
-          PADDLE_ENFORCE_LT(ids[i], N);
           PADDLE_ENFORCE_GE(ids[i], 0);
-          memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+          auto id_index = table_t.Index(ids[i]);
+          PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists.");
+          memcpy(output + i * row_width, table + id_index * row_width,
+                 row_width * sizeof(T));
         }
       }
     }
@@ -84,17 +116,29 @@ class LookupTableKernel : public framework::OpKernel<T> {
 template <typename T>
 class LookupTableGradKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_var = context.InputVar("W");
+    DDim table_dim;
+    if (table_var->IsType<LoDTensor>()) {
+      table_dim = context.Input<LoDTensor>("W")->dims();
+    } else if (table_var->IsType<SelectedRows>()) {
+      auto *table_t = context.Input<SelectedRows>("W");
+      table_dim = table_t->value().dims();
+    } else {
+      PADDLE_THROW(
+          "The parameter W of a LookupTable "
+          "must be either LoDTensor or SelectedRows");
+    }
+
     bool is_sparse = context.Attr<bool>("is_sparse");
     // Since paddings are not trainable and fixed in forward, the gradient of
     // paddings makes no sense and we don't deal with it in backward.
     if (is_sparse) {
-      auto* ids = context.Input<LoDTensor>("Ids");
-      auto* table = context.Input<LoDTensor>("W");
-      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
 
-      auto* ids_data = ids->data<int64_t>();
+      auto *ids_data = ids->data<int64_t>();
       auto ids_dim = ids->dims();
 
       framework::Vector<int64_t> new_rows;
@@ -104,31 +148,30 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       }
       d_table->set_rows(new_rows);
 
-      auto* d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_dim[0], table->dims()[1]});
+      auto *d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_dim[0], table_dim[1]});
       d_table_value->mutable_data<T>(context.GetPlace());
 
-      d_table->set_height(table->dims()[0]);
+      d_table->set_height(table_dim[0]);
 
-      auto* d_output_data = d_output->data<T>();
-      auto* d_table_data = d_table_value->data<T>();
+      auto *d_output_data = d_output->data<T>();
+      auto *d_table_data = d_table_value->data<T>();
 
       PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
       memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
     } else {
-      auto* ids = context.Input<LoDTensor>("Ids");
-      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
-      auto* d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
-      auto* table = context.Input<LoDTensor>("W");
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
 
-      auto* ids_data = ids->data<int64_t>();
+      auto *ids_data = ids->data<int64_t>();
       auto ids_dim = ids->dims();
 
-      int N = table->dims()[0];
+      int N = table_dim[0];
       int D = d_output->dims()[1];
 
-      auto* d_output_data = d_output->data<T>();
-      auto* d_table_data = d_table->mutable_data<T>(context.GetPlace());
+      auto *d_output_data = d_output->data<T>();
+      auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
 
       memset(d_table_data, 0, d_table->numel() * sizeof(T));
 
diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a18882e8199c2a375a230a693b8b01d12aabfa0
--- /dev/null
+++ b/paddle/fluid/operators/lrn_mkldnn_op.cc
@@ -0,0 +1,212 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/lrn_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+
+namespace {
+template <typename T, typename... Args>
+std::shared_ptr<T> insert_to_context(const std::string& key,
+                                     const MKLDNNDeviceContext& dev_ctx,
+                                     Args&&... args) {
+  auto p = std::static_pointer_cast<T, void>(dev_ctx.GetBlob(key));
+
+  if (!p) {
+    p = std::make_shared<T>(args...);
+    dev_ctx.SetBlob(key, std::static_pointer_cast<void, T>(p));
+  }
+
+  return p;
+}
+
+template <typename... Args>
+void run_primitive(Args&&... args) {
+  auto forward_op = mkldnn::lrn_forward{args...};
+
+  std::vector<mkldnn::primitive> pipeline = {forward_op};
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+}  // namespace
+
+template <typename T>
+class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(std::is_same<T, float>::value,
+                   "MKLDNN LRN must use float data.");
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "MKLDNN LRN must use CPUPlace.");
+
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto x = ctx.Input<Tensor>("X");
+    auto out = ctx.Output<Tensor>("Out");
+    auto mid = ctx.Output<Tensor>("MidOut");
+
+    auto input_data = x->data<T>();
+    auto output_data = out->mutable_data<T>(ctx.GetPlace());
+    mid->mutable_data<T>(ctx.GetPlace());
+
+    const int n = ctx.Attr<int>("n");
+    const float alpha = ctx.Attr<float>("alpha");
+    const float beta = ctx.Attr<float>("beta");
+    const float k = ctx.Attr<float>("k");
+    const bool is_test = ctx.Attr<bool>("is_test");
+
+    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
+    e_mid = e_mid.constant(k);
+
+    auto dims = paddle::framework::vectorize2int(x->dims());
+
+    auto src_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto dst_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
+                                                  mkldnn::lrn_across_channels,
+                                                  src_md,
+                                                  n,
+                                                  alpha,
+                                                  beta,
+                                                  k};
+
+    auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
+    auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
+                                     static_cast<void*>(output_data)};
+
+    if (!is_test) {
+      const std::string key = ctx.op().Output("Out");
+      const std::string key_src_memory = key + "@lrn_src_memory";
+      const std::string key_pd = key + "@lrn_pd";
+      const std::string key_workspace_memory = key + "@lrn_workspace_memory";
+
+      auto forward_pd = insert_to_context<mkldnn::lrn_forward::primitive_desc>(
+          key_pd, dev_ctx, forward_desc, mkldnn_engine);
+
+      auto src_memory = insert_to_context<mkldnn::memory>(
+          key_src_memory, dev_ctx, src_memory_pd);
+
+      src_memory->set_data_handle(
+          static_cast<void*>(const_cast<T*>(input_data)));
+
+      auto workspace_memory = insert_to_context<mkldnn::memory>(
+          key_workspace_memory, dev_ctx,
+          forward_pd->workspace_primitive_desc());
+
+      run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
+    } else {
+      auto forward_pd =
+          mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
+      auto src_memory = mkldnn::memory{
+          src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
+      auto workspace_memory =
+          mkldnn::memory{forward_pd.workspace_primitive_desc()};
+
+      run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
+    }
+  }
+};
+
+template <typename T>
+class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(std::is_same<T, float>::value,
+                   "MKLDNN LRN must use float data.");
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "MKLDNN LRN must use CPUPlace.");
+    PADDLE_ENFORCE(
+        !ctx.Attr<bool>("is_test"),
+        "is_test attribute should be set to False in training phase.");
+
+    auto x = ctx.Input<Tensor>("X");
+
+    auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    const std::string key = ctx.op().Input("Out");
+    const std::string key_src_memory = key + "@lrn_src_memory";
+    const std::string key_pd = key + "@lrn_pd";
+    const std::string key_workspace_memory = key + "@lrn_workspace_memory";
+
+    const int n = ctx.Attr<int>("n");
+    const float alpha = ctx.Attr<float>("alpha");
+    const float beta = ctx.Attr<float>("beta");
+    const float k = ctx.Attr<float>("k");
+
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
+    auto out_grad_data = out_grad->data<T>();
+
+    auto dims = paddle::framework::vectorize2int(x->dims());
+
+    auto src_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto diff_src_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto diff_dst_md = paddle::platform::MKLDNNMemDesc(
+        dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
+
+    auto diff_dst_memory =
+        mkldnn::memory{{diff_dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<float*>(out_grad_data))};
+
+    auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine},
+                                          static_cast<void*>(x_grad_data)};
+
+    auto backward_desc = mkldnn::lrn_backward::desc{
+        mkldnn::lrn_across_channels, src_md, diff_src_md, n, alpha, beta, k};
+
+    auto forward_pd = dev_ctx.GetBlob(key_pd);
+
+    auto backward_pd = mkldnn::lrn_backward::primitive_desc{
+        backward_desc, mkldnn_engine,
+        *static_cast<mkldnn::lrn_forward::primitive_desc*>(forward_pd.get())};
+
+    std::shared_ptr<void> workspace_memory =
+        dev_ctx.GetBlob(key_workspace_memory);
+
+    auto src_memory = dev_ctx.GetBlob(key_src_memory);
+    auto backward_op = mkldnn::lrn_backward{
+        backward_pd, *static_cast<mkldnn::memory*>(src_memory.get()),
+        diff_dst_memory, *static_cast<mkldnn::memory*>(workspace_memory.get()),
+        diff_src_memory};
+
+    std::vector<mkldnn::primitive> pipeline = {backward_op};
+    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(lrn, MKLDNN, paddle::platform::CPUPlace,
+                   ops::LRNMKLDNNOpKernel<float>);
+REGISTER_OP_KERNEL(lrn_grad, MKLDNN, paddle::platform::CPUPlace,
+                   ops::LRNMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index b0c213d637c244e4cbacbe75218537973efed047..52b459a6a2e56b7c256efdb535b4652c64bae23c 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lrn_op.h"
+#include <string>
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -36,7 +40,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> {
     auto e_x = framework::EigenTensor<T, 4>::From(input);
     for (int m = 0; m < N; m++) {
       for (int i = 0; i < C; i++) {
-        for (int c = start; c <= end; c++) {
+        for (int c = start; c < end; c++) {
           int ch = i + c;
           if (ch >= 0 && ch < C) {
             auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
@@ -92,7 +96,7 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> {
                                  Eigen::array<int, 4>({{1, 1, H, W}}));
 
         i_x_g = i_mid.pow(-beta) * i_out_g;
-        for (int c = start; c <= end; c++) {
+        for (int c = start; c < end; c++) {
           int ch = i + c;
           if (ch < 0 || ch >= C) {
             continue;
@@ -116,6 +120,27 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> {
 template struct LRNGradFunctor<platform::CPUDeviceContext, float>;
 template struct LRNGradFunctor<platform::CPUDeviceContext, double>;
 
+namespace {
+framework::OpKernelType GetExpectedLRNKernel(
+    const framework::ExecutionContext& ctx) {
+  framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+  if (library_ == framework::LibraryType::kPlain &&
+      platform::CanMKLDNNBeUsed(ctx)) {
+    library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
+  }
+#endif
+
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+      layout_, library_);
+}
+}  // namespace
+
 class LRNOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -132,16 +157,20 @@ class LRNOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4.");
 
     ctx->SetOutputDim("Out", x_dim);
-    ctx->SetOutputDim("MidOut", x_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("MidOut", x_dim);
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetExpectedLRNKernel(ctx);
   }
 };
 
 template <typename T>
 class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor) The input of LRN operator. "
              "It must be a 4D tenor with NCHW format.");
@@ -176,6 +205,20 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
                "beta is the power number.")
         .SetDefault(0.75)
         .GreaterThan(0.0);
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
+    AddAttr<bool>("is_test",
+                  "Turns on memory optimization that optimizes away "
+                  "unnecessary memory allocations. Used by MKLDNN.")
+        .SetDefault(false);
 
     AddComment(R"DOC(
 Local Response Normalization Operator.
@@ -223,13 +266,19 @@ class LRNOpGrad : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
-};
 
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return GetExpectedLRNKernel(ctx);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker<float>, lrn_grad, ops::LRNOpGrad);
+REGISTER_OPERATOR(lrn, ops::LRNOp, ops::LRNOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad);
 REGISTER_OP_CPU_KERNEL(
     lrn, ops::LRNKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h
index 95796f7eecd2bcd61aab7944f557ca568b03e027..0fd3175e8579df9e61368cc151a94fa45e433884 100644
--- a/paddle/fluid/operators/lrn_op.h
+++ b/paddle/fluid/operators/lrn_op.h
@@ -121,6 +121,10 @@ class LRNGradKernel : public framework::OpKernel<T> {
     T alpha = ctx.Attr<T>("alpha");
     T beta = ctx.Attr<T>("beta");
 
+    PADDLE_ENFORCE(
+        !ctx.Attr<bool>("is_test"),
+        "is_test attribute should be set to False in training phase.");
+
     LRNGradFunctor<DeviceContext, T> f;
     f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta);
   }
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index d75537741ef1d13b61ad6e244b2bba1ae5509da5..3225bf9bb63d57969ce9ae0e4a74e8f466c8c2d0 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstm_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -102,8 +103,7 @@ class LSTMOp : public framework::OperatorWithKernel {
 
 class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(LoDTensor) the first input is a LodTensor, which support "
              "variable-time length input sequence. The underlying tensor in "
@@ -184,34 +184,32 @@ Long-Short Term Memory (LSTM) Operator.
 The defalut implementation is diagonal/peephole connection
 (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
 
-$$
-i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\
+$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) $$
 
-f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\
+$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) $$
 
-\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\
+$$ \\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) $$
 
-o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\
+$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) $$
 
-c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
+$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
 
-h_t = o_t \odot act_h(c_t)
-$$
+$$ h_t = o_t \\odot act_h(c_t) $$
 
-where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
-of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
-are diagonal weight matrices for peephole connections. In our implementation,
-we use vectors to reprenset these diagonal weight matrices. The b terms
-denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
-is the non-line activations, such as logistic sigmoid function, and
-$i, f, o$ and $c$ are the input gate, forget gate, output gate,
-and cell activation vectors, respectively, all of which have the same size as
-the cell output activation vector $h$.
-
-The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
-are the cell input and cell output activation functions and `tanh` is usually
-used for them. $\tilde{c_t}$ is also called candidate hidden state,
-which is computed based on the current input and the previous hidden state.
+- W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
+  of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
+  are diagonal weight matrices for peephole connections. In our implementation,
+  we use vectors to reprenset these diagonal weight matrices.
+- The b terms denote bias vectors ($b_i$ is the input gate bias vector).
+- $\sigma$ is the non-line activations, such as logistic sigmoid function.
+- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
+  and cell activation vectors, respectively, all of which have the same size as
+  the cell output activation vector $h$.
+- The $\odot$ is the element-wise product of the vectors.
+- $act_g$ and $act_h$ are the cell input and cell output activation functions
+  and `tanh` is usually used for them.
+- $\tilde{c_t}$ is also called candidate hidden state,
+  which is computed based on the current input and the previous hidden state.
 
 Set `use_peepholes` False to disable peephole connection. The formula
 is omitted here, please refer to the paper
@@ -272,7 +270,9 @@ class LSTMGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
+REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp);
 REGISTER_OP_CPU_KERNEL(
     lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
index 11f9f223b5d9a8091c51c93cee3f9c23b62e5573..7d62d2d020ec2e3a29ad8720a8f04fead3a90a63 100644
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
 
 namespace paddle {
@@ -32,7 +33,7 @@ inline void ReorderInitState(const DeviceContext& ctx,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, *dst, indexed_src);
+  row_shuffle(ctx, src, index_lod, dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -56,7 +57,7 @@ class LSTMKernel : public framework::OpKernel<T> {
     bool is_reverse = ctx.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& device_ctx = ctx.template device_context<DeviceContext>();
-    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+    to_batch(device_ctx, *input, batch_gate, true, is_reverse);
 
     auto in_dims = input->dims();
     int frame_size = static_cast<int>(in_dims[1] / 4);
@@ -113,6 +114,7 @@ class LSTMKernel : public framework::OpKernel<T> {
     auto cand_act = math::detail::GetActivationType(
         ctx.Attr<std::string>("candidate_activation"));
 
+    auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
     for (size_t n = 0; n < num_batch; n++) {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);
@@ -128,9 +130,8 @@ class LSTMKernel : public framework::OpKernel<T> {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
         int pre_h_end = pre_h_start + cur_batch_size;
         auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
-        math::matmul<DeviceContext, T>(device_ctx, pre_hidden_t, false, *weight,
-                                       false, static_cast<T>(1.0), &gate_t,
-                                       static_cast<T>(1.0));
+        blas.MatMul(pre_hidden_t, false, *weight, false, static_cast<T>(1.0),
+                    &gate_t, static_cast<T>(1.0));
       } else if (hidden_t0) {
         // If n == 0 and there is no initialized hidden state, that is to say
         // the H0 is zeros, the calculation W_h * H0 will be skiped.
@@ -142,9 +143,8 @@ class LSTMKernel : public framework::OpKernel<T> {
         Tensor ordered_h0;
         ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
                                            &ordered_h0, true);
-        math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false, *weight,
-                                       false, static_cast<T>(1.0), &gate_t,
-                                       static_cast<T>(1.0));
+        blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
+                    &gate_t, static_cast<T>(1.0));
       }
 
       lstm_value.gate_value = gate_t.data<T>();
@@ -160,11 +160,11 @@ class LSTMKernel : public framework::OpKernel<T> {
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_hidden.set_lod(batch_gate->lod());
     // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(device_ctx, batch_hidden, *hidden_out);
+    to_seq(device_ctx, batch_hidden, hidden_out);
 
     batch_cell.set_lod(batch_gate->lod());
     // restore the output cell state in LoDTensor from the batch cell
-    to_seq(device_ctx, batch_cell, *cell_out);
+    to_seq(device_ctx, batch_cell, cell_out);
   }
 };
 
@@ -256,7 +256,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
         const framework::DDim& dims, framework::LoDTensor& dst) {
       dst.mutable_data<T>(dims, ctx.GetPlace());
       dst.set_lod(batch_gate->lod());
-      to_batch(ctx, src, dst, false);
+      to_batch(ctx, src, &dst, false);
     };
 
     LoDTensor batch_hidden, batch_hidden_g, batch_cell;
@@ -281,6 +281,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
 
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
+    auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
     for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);
@@ -319,29 +320,25 @@ class LSTMGradKernel : public framework::OpKernel<T> {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
         int pre_h_end = pre_h_start + cur_batch_size;
         auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
-        math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
-                                       static_cast<T>(1.0), &pre_hidden_g,
-                                       static_cast<T>(1.0));
+        blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
+                    &pre_hidden_g, static_cast<T>(1.0));
         if (weight_g) {
           /* backward weight */
           auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end);
-          math::matmul<DeviceContext, T>(device_ctx, pre_hidden, true, gate_g,
-                                         false, static_cast<T>(1.0), weight_g,
-                                         static_cast<T>(1.0));
+          blas.MatMul(pre_hidden, true, gate_g, false, static_cast<T>(1.0),
+                      weight_g, static_cast<T>(1.0));
         }
       } else {
         if (h0 && weight_g) {
           ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
                                              &ordered_h0, true);
-          math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true, gate_g,
-                                         false, static_cast<T>(1.0), weight_g,
-                                         static_cast<T>(1.0));
+          blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
+                      weight_g, static_cast<T>(1.0));
         }
         if (h0 && h0_g) {
           ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
-          math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
-                                         true, static_cast<T>(1.0),
-                                         &ordered_h0_g, static_cast<T>(0.0));
+          blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
+                      &ordered_h0_g, static_cast<T>(0.0));
         }
       }
     }
@@ -350,7 +347,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
     if (in_g) {
       /* backward data */
       in_g->mutable_data<T>(ctx.GetPlace());
-      to_seq(device_ctx, batch_gate_g, *in_g);
+      to_seq(device_ctx, batch_gate_g, in_g);
     }
     if (bias && bias_g) {
       /* backward bias */
diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc
index b3c9d7c34d1ac54fb3e15a60bcc470f392bf5027..0895c58f5f58afd444000ebeac7a92e3eb7778d3 100644
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
@@ -48,8 +48,7 @@ class LstmUnitOp : public framework::OperatorWithKernel {
 
 class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "Lstm unit only applies non-linear activations, please make sure"
              "that linear tranformation has already been applied to `X`. "
@@ -97,8 +96,9 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
-            ops::LstmUnitGradOp);
+REGISTER_OPERATOR(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp);
 REGISTER_OP_CPU_KERNEL(lstm_unit,
                        ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
                        ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu
index 76245a1b5a9c8ba9c7ee7d7c03a95e2595a01591..acf094238fff92711edf00b4180266138362add1 100644
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -18,6 +18,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/operators/lstm_unit_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index a881ef82ec3cefa826f5f0856cc4fc13c7d7afc0..e398b51480f6fc0c6c568770b3b2a9746360744e 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstmp_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -119,8 +120,7 @@ class LSTMPOp : public framework::OperatorWithKernel {
 
 class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input",
              "(LoDTensor) the input for sequence data, which supports "
              "variable-time length input sequence. The underlying tensor in "
@@ -321,8 +321,9 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
-            ops::LSTMPGradOp);
+REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp);
 REGISTER_OP_CPU_KERNEL(
     lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
     ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index dfa7f74d5116b4e3f1508f8bef94c598711e8124..370dd04d1449a8e211febf9a4f9e90e6f5008e20 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
 
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
 namespace paddle {
 namespace operators {
 
@@ -39,7 +39,7 @@ inline void ReorderInitState(const DeviceContext& ctx,
                              framework::Tensor* dst, bool indexed_src) {
   math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
   dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index, *dst, indexed_src);
+  row_shuffle(ctx, src, index, dst, indexed_src);
 }
 
 template <typename DeviceContext, typename T>
@@ -80,7 +80,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     bool is_reverse = ctx.Attr<bool>("is_reverse");
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
     auto& device_ctx = ctx.template device_context<DeviceContext>();
-    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+    to_batch(device_ctx, *input, batch_gate, true, is_reverse);
 
     auto in_dims = input->dims();
     int frame_size = static_cast<int>(in_dims[1] / 4);
@@ -142,7 +142,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
     auto proj_act = math::detail::GetActivationType(
         ctx.Attr<std::string>("proj_activation"));
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
+    auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
     for (size_t n = 0; n < num_batch; n++) {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);
@@ -159,9 +159,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
         int pre_h_end = pre_h_start + cur_batch_size;
         auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end);
-        math::matmul<DeviceContext, T>(device_ctx, pre_proj_t, false, *weight,
-                                       false, static_cast<T>(1.0), &gate_t,
-                                       static_cast<T>(1.0));
+        blas.MatMul(pre_proj_t, false, *weight, false, static_cast<T>(1.0),
+                    &gate_t, static_cast<T>(1.0));
       } else if (hidden_t0) {
         // If n == 0 and there is no initialized hidden state, that is to say
         // the H0 is zeros, the calculation W_h * H0 will be skiped.
@@ -175,16 +174,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
         ordered_proj0->mutable_data<T>(ctx.GetPlace());
         ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
                                            &ordered_h0, true);
-        math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
-                                       *proj_weight, false, static_cast<T>(1.0),
-                                       ordered_proj0, static_cast<T>(0.0));
+        blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast<T>(1.0),
+                    ordered_proj0, static_cast<T>(0.0));
         if (proj_act != math::detail::ActivationType::kIdentity) {
           auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
           ActCompute(cell_act, place, proj0_dev, proj0_dev);
         }
-        math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, false,
-                                       *weight, false, static_cast<T>(1.0),
-                                       &gate_t, static_cast<T>(1.0));
+        blas.MatMul(*ordered_proj0, false, *weight, false, static_cast<T>(1.0),
+                    &gate_t, static_cast<T>(1.0));
       }
 
       lstmp_value.gate_value = gate_t.data<T>();
@@ -195,9 +192,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
           device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
           cell_act, cand_act);
       lstmp_value.prev_state_value = lstmp_value.state_value;
-      math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
-                                     false, static_cast<T>(1.0), &proj_t,
-                                     static_cast<T>(0.0));
+      blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
+                  &proj_t, static_cast<T>(0.0));
       if (proj_act != math::detail::ActivationType::kIdentity) {
         auto proj_t_dev = EigenMatrix<T>::From(proj_t);
         ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
@@ -207,11 +203,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
     batch_proj.set_lod(batch_gate->lod());
     // restore the output hidden in LoDTensor from the batch hidden
-    to_seq(device_ctx, batch_proj, *proj_out);
+    to_seq(device_ctx, batch_proj, proj_out);
 
     batch_cell.set_lod(batch_gate->lod());
     // restore the output cell state in LoDTensor from the batch cell
-    to_seq(device_ctx, batch_cell, *cell_out);
+    to_seq(device_ctx, batch_cell, cell_out);
   }
 };
 
@@ -331,7 +327,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
         const framework::DDim& dims, framework::LoDTensor& dst) {
       dst.mutable_data<T>(dims, ctx.GetPlace());
       dst.set_lod(batch_gate->lod());
-      to_batch(ctx, src, dst, false);
+      to_batch(ctx, src, &dst, false);
     };
 
     LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
@@ -360,6 +356,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
 
     auto batch_starts = batch_gate->lod()[0];
     size_t num_batch = batch_starts.size() - 1;
+    auto blas = math::GetBlas<DeviceContext, T>(device_ctx);
     for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
       int bstart = static_cast<int>(batch_starts[n]);
       int bend = static_cast<int>(batch_starts[n + 1]);
@@ -374,15 +371,13 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       }
       /* hidden state backwarad */
       Tensor out_g = batch_hidden_g.Slice(bstart, bend);
-      math::matmul<DeviceContext, T>(device_ctx, proj_g, false, *proj_weight,
-                                     true, static_cast<T>(1.0), &out_g,
-                                     static_cast<T>(0.0));
+      blas.MatMul(proj_g, false, *proj_weight, true, static_cast<T>(1.0),
+                  &out_g, static_cast<T>(0.0));
       /* projection weight backward*/
       if (proj_weight_g) {
         Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        math::matmul<DeviceContext, T>(device_ctx, hidden_t, true, proj_g,
-                                       false, static_cast<T>(1.0),
-                                       proj_weight_g, static_cast<T>(1.0));
+        blas.MatMul(hidden_t, true, proj_g, false, static_cast<T>(1.0),
+                    proj_weight_g, static_cast<T>(1.0));
       }
 
       Tensor gate = batch_gate->Slice(bstart, bend);
@@ -418,24 +413,21 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
         int pre_h_start = static_cast<int>(batch_starts[n - 1]);
         int pre_h_end = pre_h_start + cur_batch_size;
         auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end);
-        math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
-                                       static_cast<T>(1.0), &pre_proj_g,
-                                       static_cast<T>(1.0));
+        blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
+                    &pre_proj_g, static_cast<T>(1.0));
         if (weight_g) {
           /* weight backward*/
           auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end);
-          math::matmul<DeviceContext, T>(device_ctx, pre_proj, true, gate_g,
-                                         false, static_cast<T>(1.0), weight_g,
-                                         static_cast<T>(1.0));
+          blas.MatMul(pre_proj, true, gate_g, false, static_cast<T>(1.0),
+                      weight_g, static_cast<T>(1.0));
         }
       } else {
         if (h0 && weight_g) {
           ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
                                              &ordered_h0, true);
           if (weight_g) {
-            math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, true,
-                                           gate_g, false, static_cast<T>(1.0),
-                                           weight_g, static_cast<T>(1.0));
+            blas.MatMul(*ordered_proj0, true, gate_g, false,
+                        static_cast<T>(1.0), weight_g, static_cast<T>(1.0));
           }
         }
         if (h0 && (h0_g || proj_weight_g)) {
@@ -443,9 +435,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
           Tensor proj0_g;
           proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
           proj0_g.mutable_data<T>(ctx.GetPlace());
-          math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
-                                         true, static_cast<T>(1.0), &proj0_g,
-                                         static_cast<T>(0.0));
+          blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
+                      &proj0_g, static_cast<T>(0.0));
           if (proj_act != math::detail::ActivationType::kIdentity) {
             auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
             auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
@@ -453,14 +444,12 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
                            proj0_g_dev);
           }
           if (h0_g) {
-            math::matmul<DeviceContext, T>(
-                device_ctx, proj0_g, false, *proj_weight, true,
-                static_cast<T>(1.0), &ordered_h0_g, static_cast<T>(0.0));
+            blas.MatMul(proj0_g, false, *proj_weight, true, static_cast<T>(1.0),
+                        &ordered_h0_g, static_cast<T>(0.0));
           }
           if (proj_weight_g) {
-            math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true,
-                                           proj0_g, false, static_cast<T>(1.0),
-                                           proj_weight_g, static_cast<T>(1.0));
+            blas.MatMul(ordered_h0, true, proj0_g, false, static_cast<T>(1.0),
+                        proj_weight_g, static_cast<T>(1.0));
           }
         }
       }
@@ -470,7 +459,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
     if (in_g) {
       /* backward data */
       in_g->mutable_data<T>(ctx.GetPlace());
-      to_seq(device_ctx, batch_gate_g, *in_g);
+      to_seq(device_ctx, batch_gate_g, in_g);
     }
     if (bias && bias_g) {
       /* backward bias */
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
index b146b5088321efcee5a4511b3fedd047a0d54f00..b643ba9d7fa61d758e871ebe7a463c22e937fa2c 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
@@ -42,8 +42,7 @@ class MarginRankLossOp : public framework::OperatorWithKernel {
 template <typename T>
 class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X1",
              "(2-D tensor with shape [batch_size x 1]) The score for "
              "one item X1 to be ranked, from pairwise ranking model.");
@@ -111,9 +110,10 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp,
-            ops::MarginRankLossOpMaker<float>, margin_rank_loss_grad,
-            ops::MarginRankLossGradOp);
+REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp,
+                  ops::MarginRankLossOpMaker<float>,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     margin_rank_loss,
     ops::MarginRankLossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index a181d802262d15b188060dae4330cec0e24714ab..53a478c1ac0bdf8c0a3f3721161779ef10cb14f8 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -6,6 +6,7 @@ function(math_library TARGET)
     # But it handle split GPU/CPU code and link some common library.
     set(cc_srcs)
     set(cu_srcs)
+    set(hip_srcs)
     set(math_common_deps device_context framework_proto)
     set(multiValueArgs DEPS)
     cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
@@ -17,10 +18,15 @@ function(math_library TARGET)
     if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
         list(APPEND cu_srcs ${TARGET}.cu)
     endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
+        list(APPEND hip_srcs ${TARGET}.hip.cu)
+    endif()
 
     list(LENGTH cc_srcs cc_srcs_len)
     if (WITH_GPU)
         nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+    elseif (WITH_AMD_GPU)
+        hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
     elseif(${cc_srcs_len} GREATER 0)
         cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
     endif()
@@ -35,15 +41,16 @@ math_library(depthwise_conv)
 math_library(gru_compute DEPS activation_functions math_function)
 math_library(im2col)
 math_library(lstm_compute DEPS activation_functions)
-math_library(math_function DEPS cblas)
+cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
+math_library(math_function DEPS blas)
 math_library(maxouting)
 math_library(pooling)
-math_library(selected_rows_functor DEPS selected_rows)
+math_library(selected_rows_functor DEPS selected_rows math_function)
 math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
-math_library(softmax)
+math_library(softmax DEPS math_function)
 math_library(unpooling)
 math_library(vol2col)
 
diff --git a/paddle/fluid/operators/math/blas.cc b/paddle/fluid/operators/math/blas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6a143b3c056455595fdedc131b0c5f4ee756e1e0
--- /dev/null
+++ b/paddle/fluid/operators/math/blas.cc
@@ -0,0 +1,52 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/math/blas.h"
+
+#include <utility>
+namespace paddle {
+namespace operators {
+namespace math {
+MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim,
+                                     int num_flatten_cols, bool trans) {
+  PADDLE_ENFORCE_GT(tensor_dim.size(), 1);
+  MatDescriptor retv;
+  if (num_flatten_cols > 1) {
+    auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols);
+    retv.height_ = flatten_dim[0];
+    retv.width_ = flatten_dim[1];
+  } else {
+    if (tensor_dim.size() == 2) {
+      retv.height_ = tensor_dim[0];
+      retv.width_ = tensor_dim[1];
+    } else {
+      auto dim_vec = framework::vectorize(tensor_dim);
+      retv.batch_size_ = 1;
+      for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
+        retv.batch_size_ *= dim_vec[i];
+      }
+      retv.height_ = dim_vec[dim_vec.size() - 2];
+      retv.width_ = dim_vec[dim_vec.size() - 1];
+      retv.stride_ = retv.height_ * retv.width_;
+    }
+  }
+  if (trans) {
+    std::swap(retv.width_, retv.height_);
+  }
+  retv.trans_ = trans;
+  return retv;
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
new file mode 100644
index 0000000000000000000000000000000000000000..a907d6a71b7a16983e601073b039b48406853a0b
--- /dev/null
+++ b/paddle/fluid/operators/math/blas.h
@@ -0,0 +1,229 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tensor.h"
+
+#ifdef PADDLE_WITH_MKLML
+#include "paddle/fluid/platform/dynload/mklml.h"
+#endif
+
+#ifdef PADDLE_USE_OPENBLAS
+#include <cblas.h>
+#ifdef LAPACK_FOUND
+#include <lapacke.h>
+#endif
+#endif
+
+#ifndef LAPACK_FOUND
+extern "C" {
+#include <cblas.h>  // NOLINT
+int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda,
+                   int* ipiv);
+int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda,
+                   int* ipiv);
+int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda,
+                   const int* ipiv);
+int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
+                   const int* ipiv);
+}
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+static void SetNumThreads(int num_threads) {
+#ifdef PADDLE_USE_OPENBLAS
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  openblas_set_num_threads(real_num_threads);
+#elif defined(PADDLE_WITH_MKLML)
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  platform::dynload::MKL_Set_Num_Threads(real_num_threads);
+#else
+  PADDLE_ENFORCE(false, "To be implemented.");
+#endif
+}
+
+/**
+ * Matrix Descriptor of a memory buffer.
+ *
+ * It is used for Blas::MatMul. MatMul operator can be batched.
+ * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a
+ * `batch_size` times of GEMM. The batched GEMM could be faster base on the
+ * implementation of the blas library. The batch size could be zero. If any
+ * matrix of `matmul` has a batch size, the will be a batched GEMM, too. e.g.,
+ * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be
+ * [BatchSize, H1, W2]
+ *
+ * The boolean flag, `trans`, describe the memory is the transpose of matrix or
+ * not. If the trans is true, the last two dims of matrix are transposed. The
+ * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height].
+ *
+ * The MatDescriptor is not only the dimension or shape of a matrix, it also
+ * contains the layout, stride of matrix. It is clearer to have a structure than
+ * reuse `DDim`.
+ */
+struct MatDescriptor {
+  int64_t height_;
+  int64_t width_;
+  int64_t stride_{0};
+  int64_t batch_size_{0};
+  bool trans_;
+};
+
+/**
+ * Create Matrix Descriptor from a tensor dim, num_flatten_cols, and transpose
+ * flag
+ *
+ * @param tensor_dim: The dimension of the tensor. The rank of this dimension
+ * must larger than 1.
+ *
+ * @param num_flatten_cols:  Reshape a tensor to a matrix. The matrix's first
+ * dimension(column length) will be the product of tensor's first `num_col_dims`
+ * dimensions. If num_flatten_cols is zero, the first N-2 dimension will be the
+ * batch_size of descriptor.
+ *
+ * @param trans: True if the matrix is transposed.
+ */
+extern MatDescriptor CreateMatrixDescriptor(const framework::DDim& tensor_dim,
+                                            int num_flatten_cols, bool trans);
+
+template <typename DeviceContext>
+class Blas {
+ public:
+  explicit Blas(const DeviceContext& context) : context_(context) {}
+
+  template <typename T>
+  void GEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+            T alpha, const T* A, const T* B, T beta, T* C) const;
+
+  template <typename T>
+  void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A,
+            int lda, const T* B, int ldb, T beta, T* C, int ldc) const;
+
+  template <typename T>
+  void MatMul(const framework::Tensor& mat_a, bool trans_a,
+              const framework::Tensor& mat_b, bool trans_b, T alpha,
+              framework::Tensor* mat_out, T beta) const;
+
+  template <typename T>
+  void MatMul(const framework::Tensor& mat_a, bool trans_a,
+              const framework::Tensor& mat_b, bool trans_b,
+              framework::Tensor* mat_out) const {
+    MatMul(mat_a, trans_a, mat_b, trans_b, static_cast<T>(1.0), mat_out,
+           static_cast<T>(0.0));
+  }
+
+  template <typename T>
+  void MatMul(const framework::Tensor& mat_a, const framework::Tensor& mat_b,
+              framework::Tensor* mat_out) const {
+    this->template MatMul<T>(mat_a, false, mat_b, false, mat_out);
+  }
+
+  template <typename T>
+  void AXPY(int n, T alpha, const T* x, T* y) const;
+
+  template <typename T>
+  void VADD(int n, const T* x, const T* y, T* z) const;
+
+  template <typename T>
+  void VCOPY(int n, const T* x, T* y) const;
+
+  template <typename T>
+  void GEMV(bool trans_a, int M, int N, T alpha, const T* A, const T* B, T beta,
+            T* C) const;
+
+  template <typename T>
+  void BatchedGEMM(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N,
+                   int K, T alpha, const T* A, const T* B, T beta, T* C,
+                   int batchCount, int64_t strideA, int64_t strideB) const;
+
+  template <typename T>
+  void MatMul(const framework::Tensor& mat_a, const MatDescriptor& dim_a,
+              const framework::Tensor& mat_b, const MatDescriptor& dim_b,
+              T alpha, framework::Tensor* mat_out, T beta) const;
+
+ private:
+  const DeviceContext& context_;
+};
+
+template <typename DeviceContext, typename T>
+class BlasT : private Blas<DeviceContext> {
+ public:
+  using Blas<DeviceContext>::Blas;
+
+  template <typename... ARGS>
+  void GEMM(ARGS... args) const {
+    Base()->template GEMM<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void MatMul(ARGS... args) const {
+    Base()->template MatMul<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void AXPY(ARGS... args) const {
+    Base()->template AXPY<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void VADD(ARGS... args) const {
+    Base()->template VADD<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void VCOPY(ARGS... args) const {
+    Base()->template VCOPY<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void GEMV(ARGS... args) const {
+    Base()->template GEMV<T>(args...);
+  }
+
+  template <typename... ARGS>
+  void BatchedGEMM(ARGS... args) const {
+    Base()->template BatchedGEMM<T>(args...);
+  }
+
+ private:
+  const Blas<DeviceContext>* Base() const {
+    return static_cast<const Blas<DeviceContext>*>(this);
+  }
+};
+
+template <typename DeviceContext, typename T>
+inline BlasT<DeviceContext, T> GetBlas(
+    const framework::ExecutionContext& exe_ctx) {
+  return BlasT<DeviceContext, T>(
+      exe_ctx.template device_context<DeviceContext>());
+}
+
+template <typename DeviceContext, typename T>
+inline BlasT<DeviceContext, T> GetBlas(const DeviceContext& dev_ctx) {
+  return BlasT<DeviceContext, T>(dev_ctx);
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
+
+#include "paddle/fluid/operators/math/blas_impl.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/operators/math/blas_impl.cu.h"
+#endif
diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..d84c88cb3bc1a13acb83b3444dbd1bfca3cba503
--- /dev/null
+++ b/paddle/fluid/operators/math/blas_impl.cu.h
@@ -0,0 +1,248 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/dynload/cublas.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct CUBlas;
+
+template <>
+struct CUBlas<float> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    PADDLE_ENFORCE(platform::dynload::cublasSgemm(args...));
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    PADDLE_ENFORCE(platform::dynload::cublasSaxpy(args...));
+  }
+
+  template <typename... ARGS>
+  static void GEMV(ARGS... args) {
+    PADDLE_ENFORCE(platform::dynload::cublasSgemv(args...));
+  }
+
+  template <typename... ARGS>
+  static void GEMM_BATCH(ARGS... args) {
+#if CUDA_VERSION >= 8000
+    PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...));
+#else
+    PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
+  }
+};
+
+template <>
+struct CUBlas<double> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    PADDLE_ENFORCE(platform::dynload::cublasDgemm(args...));
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    PADDLE_ENFORCE(platform::dynload::cublasDaxpy(args...));
+  }
+
+  template <typename... ARGS>
+  static void GEMV(ARGS... args) {
+    PADDLE_ENFORCE(platform::dynload::cublasDgemv(args...));
+  }
+
+  template <typename... ARGS>
+  static void GEMM_BATCH(ARGS... args) {
+#if CUDA_VERSION >= 8000
+    PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...));
+#else
+    PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
+  }
+};
+
+template <>
+struct CUBlas<platform::float16> {
+  using float16 = platform::float16;
+
+  static void GEMM(cublasHandle_t handle, cublasOperation_t transa,
+                   cublasOperation_t transb, int m, int n, int k,
+                   const float16 *alpha, const float16 *A, int lda,
+                   const float16 *B, int ldb, const float16 *beta, float16 *C,
+                   int ldc) {
+    PADDLE_ENFORCE(
+        platform::dynload::cublasHgemm(handle, transa, transb, m, n, k,
+                                       reinterpret_cast<const __half *>(alpha),
+                                       reinterpret_cast<const __half *>(A), lda,
+                                       reinterpret_cast<const __half *>(B), ldb,
+                                       reinterpret_cast<const __half *>(beta),
+                                       reinterpret_cast<__half *>(C), ldc));
+  }
+
+  static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa,
+                         cublasOperation_t transb, int m, int n, int k,
+                         const float16 *alpha, const float16 *A, int lda,
+                         long long int strideA, const float16 *B,  // NOLINT
+                         int ldb, long long int strideB,           // NOLINT
+                         const float16 *beta, float16 *C, int ldc,
+                         long long int strideC,  // NOLINT
+                         int batchCount) {
+#if CUDA_VERSION >= 8000
+    PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
+        handle, transa, transb, m, n, k,
+        reinterpret_cast<const __half *>(alpha),
+        reinterpret_cast<const __half *>(A), lda, strideA,
+        reinterpret_cast<const __half *>(B), ldb, strideB,
+        reinterpret_cast<const __half *>(beta), reinterpret_cast<__half *>(C),
+        ldc, strideC, batchCount));
+#else
+    PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
+  }
+};
+
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                             CBLAS_TRANSPOSE transB, int M,
+                                             int N, int K, T alpha, const T *A,
+                                             const T *B, T beta, T *C) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
+                  B, ldb, A, lda, &beta, C, N);
+}
+
+template <>
+template <>
+inline void Blas<platform::CUDADeviceContext>::GEMM(
+    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+    platform::float16 alpha, const platform::float16 *A,
+    const platform::float16 *B, platform::float16 beta,
+    platform::float16 *C) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  // TODO(kexinzhao): add processing code for compute capability < 53 case
+  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
+                    "cublas fp16 gemm requires GPU compute capability >= 53");
+
+#if CUDA_VERSION >= 8000
+  float h_alpha = static_cast<float>(alpha);
+  float h_beta = static_cast<float>(beta);
+
+  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
+#if CUDA_VERSION >= 9000
+  if (context_.GetComputeCapability() >= 70) {
+    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
+        context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH));
+    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
+  } else {
+    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
+        context_.cublas_handle(), CUBLAS_DEFAULT_MATH));
+  }
+#endif  // CUDA_VERSION >= 9000
+
+  // cublasHgemm does true FP16 computation which is slow for non-Volta
+  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
+  // input/output in fp16, computation in fp32, which can also be accelerated
+  // using tensor cores in volta GPUs.
+  PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
+      context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
+      CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
+      CUDA_R_32F, algo));
+#else
+  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
+  CUBlas<platform::float16>::GEMM(context_.cublas_handle(), cuTransB, cuTransA,
+                                  N, M, K, &h_alpha, h_B, ldb, h_A, lda,
+                                  &h_beta, h_C, N);
+#endif  // CUDA_VERSION >= 8000
+}
+
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::GEMM(bool transA, bool transB, int M,
+                                             int N, int K, T alpha, const T *A,
+                                             int lda, const T *B, int ldb,
+                                             T beta, T *C, int ldc) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+  CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha,
+                  B, ldb, A, lda, &beta, C, ldc);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::AXPY(int n, T alpha, const T *x,
+                                             T *y) const {
+  CUBlas<T>::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::GEMV(bool trans_a, int M, int N,
+                                             T alpha, const T *A, const T *B,
+                                             T beta, T *C) const {
+  cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+  CUBlas<T>::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1,
+                  &beta, C, 1);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CUDADeviceContext>::BatchedGEMM(
+    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+    T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
+    int64_t strideA, int64_t strideB) const {
+  // Note that cublas follows fortran order, so the order is different from
+  // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  const int64_t strideC = M * N;
+
+  CUBlas<T>::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
+                        &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc,
+                        strideC, batchCount);
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ce94cfc93823aa891114ef8fd1e851727ebc623
--- /dev/null
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -0,0 +1,293 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <vector>
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct CBlas;
+
+#ifdef PADDLE_WITH_MKLML
+template <>
+struct CBlas<float> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    platform::dynload::cblas_sgemm(args...);
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    platform::dynload::cblas_saxpy(args...);
+  }
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    platform::dynload::cblas_scopy(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMV(ARGS... args) {
+    platform::dynload::cblas_sgemv(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_BATCH(ARGS... args) {
+    platform::dynload::cblas_sgemm_batch(args...);
+  }
+
+  template <typename... ARGS>
+  static void VADD(ARGS... args) {
+    platform::dynload::vsAdd(args...);
+  }
+};
+
+template <>
+struct CBlas<double> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    platform::dynload::cblas_dgemm(args...);
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    platform::dynload::cblas_daxpy(args...);
+  }
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    platform::dynload::cblas_dcopy(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMV(ARGS... args) {
+    platform::dynload::cblas_dgemv(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMM_BATCH(ARGS... args) {
+    platform::dynload::cblas_dgemm_batch(args...);
+  }
+
+  template <typename... ARGS>
+  static void VADD(ARGS... args) {
+    platform::dynload::vdAdd(args...);
+  }
+};
+
+#else
+
+template <>
+struct CBlas<float> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    cblas_sgemm(args...);
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    cblas_saxpy(args...);
+  }
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    cblas_scopy(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMV(ARGS... args) {
+    cblas_sgemv(args...);
+  }
+};
+
+template <>
+struct CBlas<double> {
+  template <typename... ARGS>
+  static void GEMM(ARGS... args) {
+    cblas_dgemm(args...);
+  }
+
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    cblas_daxpy(args...);
+  }
+
+  template <typename... ARGS>
+  static void VCOPY(ARGS... args) {
+    cblas_dcopy(args...);
+  }
+
+  template <typename... ARGS>
+  static void GEMV(ARGS... args) {
+    cblas_dgemv(args...);
+  }
+};
+#endif
+template <>
+struct CBlas<platform::float16> {
+  static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
+#ifdef PADDLE_WITH_MKLML
+  static void GEMM_BATCH(...) {
+    PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
+  }
+#endif
+};
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
+                                            CBLAS_TRANSPOSE transB, int M,
+                                            int N, int K, T alpha, const T *A,
+                                            const T *B, T beta, T *C) const {
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+                 beta, C, ldc);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
+                                            int N, int K, T alpha, const T *A,
+                                            int lda, const T *B, int ldb,
+                                            T beta, T *C, int ldc) const {
+  CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+                 transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+                 lda, B, ldb, beta, C, ldc);
+}
+
+template <typename DeviceContext>
+template <typename T>
+void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a, bool trans_a,
+                                 const framework::Tensor &mat_b, bool trans_b,
+                                 T alpha, framework::Tensor *mat_out,
+                                 T beta) const {
+  auto dim_a = mat_a.dims();
+  auto dim_b = mat_b.dims();
+  auto dim_out = mat_out->dims();
+  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
+                 "The input and output of matmul be matrix");
+  PADDLE_ENFORCE(
+      mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(),
+      "The places of matrices must be same");
+
+  int M = dim_out[0];
+  int N = dim_out[1];
+  int K = !trans_a ? dim_a[1] : dim_a[0];
+
+  CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE transB = !trans_b ? CblasNoTrans : CblasTrans;
+
+  this->GEMM(transA, transB, M, N, K, alpha, mat_a.data<T>(), mat_b.data<T>(),
+             beta, mat_out->data<T>());
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::AXPY(int n, T alpha, const T *x,
+                                            T *y) const {
+  CBlas<T>::AXPY(n, alpha, x, 1, y, 1);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VCOPY(int n, const T *x, T *y) const {
+  CBlas<T>::VCOPY(n, x, 1, y, 1);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::VADD(int n, const T *x, const T *y,
+                                            T *z) const {
+#ifdef PADDLE_WITH_MKLML
+  CBlas<T>::VADD(n, x, y, z);
+#else
+  this->template VCOPY<T>(n, y, z);
+  this->template AXPY<T>(n, 1., x, z);
+#endif
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMV(bool trans_a, int M, int N, T alpha,
+                                            const T *A, const T *B, T beta,
+                                            T *C) const {
+  CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans;
+  CBlas<T>::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
+}
+
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::BatchedGEMM(
+    CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
+    T alpha, const T *A, const T *B, T beta, T *C, int batchCount,
+    int64_t strideA, int64_t strideB) const {
+#ifdef PADDLE_WITH_MKLML
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  auto a_array = std::vector<const T *>(batchCount);
+  auto b_array = std::vector<const T *>(batchCount);
+  auto c_array = std::vector<T *>(batchCount);
+  for (int k = 0; k < batchCount; ++k) {
+    a_array[k] = &A[k * strideA];
+    b_array[k] = &B[k * strideB];
+    c_array[k] = &C[k * M * N];
+  }
+
+  CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
+                       a_array.data(), &lda, b_array.data(), &ldb, &beta,
+                       c_array.data(), &ldc, 1 /* group_count */, &batchCount);
+#else
+  for (int k = 0; k < batchCount; ++k) {
+    auto *Ak = &A[k * strideA];
+    auto *Bk = &B[k * strideB];
+    auto *Ck = &C[k * M * N];
+    this->template GEMM<T>(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck);
+  }
+#endif
+}
+
+template <typename DeviceContext>
+template <typename T>
+void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
+                                 const MatDescriptor &dim_a,
+                                 const framework::Tensor &mat_b,
+                                 const MatDescriptor &dim_b, T alpha,
+                                 framework::Tensor *mat_out, T beta) const {
+  PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_);
+  CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans;
+  if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) {
+    this->template GEMM<T>(transA, transB, dim_a.height_, dim_b.width_,
+                           dim_a.width_, alpha, mat_a.data<T>(),
+                           mat_b.data<T>(), beta, mat_out->data<T>());
+  } else {
+    PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
+                   dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
+    this->template BatchedGEMM<T>(
+        transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
+        mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
+        dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
+        dim_a.stride_, dim_b.stride_);
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
index b542143419e05e9baf29e9a2322447f32ddd9829..55c8a472aca7fe700ef6a3f96bed1496d7b12b80 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/concat.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -20,7 +21,7 @@ namespace math {
 
 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatFunctor<platform::CPUDeviceContext, T> {
@@ -44,7 +45,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
       out_cols += t_cols;
       input_cols[i] = t_cols;
     }
-    auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
+    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
 
     // computation
     for (int k = 0; k < out_rows; ++k) {
@@ -63,41 +64,46 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
 
 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>& outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int num = outputs.size();
+    size_t num = outputs->size();
 
     int input_rows = 1;
-    auto dim_0 = outputs[0].dims();
+    auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
       input_rows *= dim_0[i];
     }
+
     int input_cols = 0;
 
-    std::vector<int64_t> output_cols(outputs.size());
-    for (int i = 0; i < num; ++i) {
-      int t_cols = outputs[i].numel() / input_rows;
+    std::vector<int64_t> output_cols(outputs->size());
+    for (size_t i = 0; i < num; ++i) {
+      int t_cols = ref_inputs[i]->numel() / input_rows;
       input_cols += t_cols;
       output_cols[i] = t_cols;
     }
-    auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
+    auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
 
     // computation
     for (int k = 0; k < input_rows; ++k) {
       const T* src_ptr = input.data<T>() + k * input_cols;
       int col_idx = 0;
-      for (int j = 0; j < num; ++j) {
+      for (size_t j = 0; j < num; ++j) {
         int col_len = output_cols[j];
-        T* dst_ptr = outputs[j].data<T>() + k * col_len;
-        memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
-                     sizeof(T) * col_len);
+        auto* out_tensor = outputs->at(j);
+        if (out_tensor != nullptr) {
+          T* dst_ptr = out_tensor->data<T>() + k * col_len;
+          memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
+                       sizeof(T) * col_len);
+        }
         col_idx += col_len;
       }
     }
diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu
index 60b266f08fb2d4217c5933902d69de96fc2abe22..5863d74fca21de8b77bc208fb95d8fd52562f7a7 100644
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -12,51 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/math/concat.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename T>
-__device__ T upper_bound(const T* first, T count, T val) {
-  const T* orig = first;
-  const T* it = nullptr;
-  T step = 0;
-  while (count > 0) {
-    it = first;
-    step = count / 2;
-    it += step;
-    if (!(val < *it)) {
-      first = ++it;
-      count -= step + 1;
-    } else {
-      count = step;
-    }
-  }
-  return first - orig;
-}
-
 template <typename T>
 __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
                              const int output_rows, const int output_cols,
                              T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;
-
-  int curr_offset = input_cols[segment];
-  int curr_segment = segment;
+  int curr_segment = 0;
+  int curr_offset = input_cols[0];
   for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
-    T curr_col_offset;
-    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
+    int curr_col_offset = input_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
       curr_offset = curr_col_offset;
       ++curr_segment;
+      curr_col_offset = input_cols[curr_segment + 1];
     }
 
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
+
     T* input_ptr = inputs[curr_segment];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
     for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
@@ -66,68 +49,70 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
 }
 
 template <typename T>
-__global__ void KernelConcat(T** inputs, const int input_col,
-                             const int output_rows, const int output_cols,
-                             T* output) {
+__global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
+                             const int out_rows, const int out_cols,
+                             T* output_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  double inv_input_col = 1.0 / input_col;
-  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x * inv_input_col;
-    int in_offset = tid_x - split * input_col;
-    T* input_ptr = inputs[split];
+  for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x * 1.0 / fixed_in_col;
+    int in_offset = tid_x - split * fixed_in_col;
+    T* input_ptr = inputs_data[split];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
-      output[tid_y * output_cols + tid_x] =
-          input_ptr[tid_y * input_col + in_offset];
+    for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
+      output_data[tid_y * out_cols + tid_x] =
+          input_ptr[tid_y * fixed_in_col + in_offset];
     }
   }
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input, const int input_row,
-                                 const int input_col, const int* output_cols,
-                                 int col_size, T** outputs) {
+__global__ void KernelConcatGrad(const T* input_data, const int in_row,
+                                 const int in_col, const int* out_cols,
+                                 int out_cols_size, T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
-  int curr_offset = output_cols[segment];
-  int curr_segment = segment;
-  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
-    T curr_col_offset;
-    while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
+  int curr_segment = 0;
+  int curr_offset = out_cols[0];
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int curr_col_offset = out_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {
       curr_offset = curr_col_offset;
       ++curr_segment;
+      curr_col_offset = out_cols[curr_segment + 1];
     }
 
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
-    T* output_ptr = outputs[curr_segment];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * segment_width + local_col] =
-          input[tid_y * input_col + tid_x];
+    T* output_ptr = outputs_data[curr_segment];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }
 
 template <typename T>
-__global__ void KernelConcatGrad(const T* input, const int input_row,
-                                 const int input_col, const int output_cols,
-                                 T** outputs) {
+__global__ void KernelConcatGrad(const T* input_data, const int in_row,
+                                 const int in_col, const int fixed_out_col,
+                                 T** outputs_data) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  double inv_input_col = 1.0 / input_col;
-  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x * inv_input_col;
-    int in_offset = tid_x - split * input_col;
-    T* output_ptr = outputs[split];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
-      output_ptr[tid_y * output_cols + in_offset] =
-          input[tid_y * input_col + tid_x];
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x / fixed_out_col;
+    int in_offset = tid_x - split * fixed_out_col;
+    T* output_ptr = outputs_data[split];
+    if (output_ptr != nullptr) {
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
   }
 }
 
 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatFunctor<platform::CUDADeviceContext, T> {
@@ -136,41 +121,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
                   const std::vector<framework::Tensor>& input, const int axis,
                   framework::Tensor* output) {
     // TODO(zcd): Add input data validity checking
-    int num = input.size();
-    int rows = 1;
+    int in_num = input.size();
+    int in_row = 1;
     auto dim_0 = input[0].dims();
     for (int i = 0; i < axis; ++i) {
-      rows *= dim_0[i];
+      in_row *= dim_0[i];
     }
-    int cols = input[0].numel() / rows;
-    int out_rows = rows, out_cols = 0;
+    int in_col = input[0].numel() / in_row;
+    int out_row = in_row, out_col = 0;
 
-    framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
-    framework::Vector<int> inputs_cols(num + 1);
-    inputs_cols[0] = 0;
+    framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
+    framework::Vector<int> inputs_col(in_num + 1);
     T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
 
+    inputs_col[0] = 0;
     bool sameShape = true;
-    for (int i = 0; i < num; ++i) {
-      int t_cols = input[i].numel() / rows;
+    for (int i = 0; i < in_num; ++i) {
+      int t_cols = input[i].numel() / in_row;
       if (sameShape) {
-        if (t_cols != cols) sameShape = false;
+        if (t_cols != in_col) sameShape = false;
       }
-      out_cols += t_cols;
-      inputs_cols[i + 1] = out_cols;
+      out_col += t_cols;
+      inputs_col[i + 1] = out_col;
       inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
     }
 
-    T** ins_gpu =
+    T** dev_ins_data =
         reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
-    const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
 
     // computation
     // set the thread block and grid according to CurrentDeviceId
     const int kThreadsPerBlock = 1024;
     int block_cols = kThreadsPerBlock;
-    if (out_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
-      block_cols = ((out_cols + 31) >> 5) << 5;
+    if (out_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      block_cols = ((out_col + 31) >> 5) << 5;
     }
     int block_rows = kThreadsPerBlock / block_cols;
     dim3 block_size = dim3(block_cols, block_rows, 1);
@@ -179,68 +163,73 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
 
     int grid_cols =
-        std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
+        std::min((out_col + block_cols - 1) / block_cols, max_blocks);
     int grid_rows =
-        std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
+        std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
       KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          ins_gpu, cols, out_rows, out_cols, output->data<T>());
+          dev_ins_data, in_col, out_row, out_col, output->data<T>());
     } else {
+      const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
       KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
-          out_cols, output->data<T>());
+          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
+          out_row, out_col, output->data<T>());
     }
   }
 };
 
 /*
  * All tensors' dimension should be the same and the values of
- * each dimension are the same, except the axis dimension.
+ * each dimension must be the same, except the axis dimension.
  */
 template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, const int axis,
-                  std::vector<framework::Tensor>& outputs) {
+                  const framework::Tensor& input,
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
-    int num = outputs.size();
-    int input_row = 1;
-    auto dim_0 = outputs[0].dims();
+    int o_num = outputs->size();
+    int out_row = 1;
+    auto dim_0 = ref_inputs[0]->dims();
     for (int i = 0; i < axis; ++i) {
-      input_row *= dim_0[i];
+      out_row *= dim_0[i];
     }
 
-    int output_col_0 = outputs[0].numel() / input_row;
-    int input_col = 0;
+    int out0_col = ref_inputs[0]->numel() / out_row;
+    int in_col = 0, in_row = out_row;
     bool sameShape = true;
 
-    framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
-    framework::Vector<int> outputs_cols(num + 1);
-    outputs_cols[0] = 0;
+    framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2);
+    framework::Vector<int> outputs_cols(o_num + 1);
     T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
 
-    for (int i = 0; i < num; ++i) {
-      int t_col = outputs[i].numel() / input_row;
+    outputs_cols[0] = 0;
+    for (int i = 0; i < o_num; ++i) {
+      int t_col = ref_inputs.at(i)->numel() / out_row;
       if (sameShape) {
-        if (t_col != output_col_0) sameShape = false;
+        if (t_col != out0_col) sameShape = false;
+      }
+      in_col += t_col;
+      outputs_cols[i + 1] = in_col;
+      if (outputs->at(i) != nullptr) {
+        outputs_ptr[i] = outputs->at(i)->data<T>();
+      } else {
+        outputs_ptr[i] = nullptr;
       }
-      input_col += t_col;
-      outputs_cols[i + 1] = input_col;
-      outputs_ptr[i] = outputs[i].data<T>();
     }
 
-    T** outs_gpu =
+    T** dev_out_gpu_data =
         reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
-    const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
 
     // computation
     const int kThreadsPerBlock = 1024;
     int block_cols = kThreadsPerBlock;
-    if (input_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
-      block_cols = ((input_col + 31) >> 5) << 5;
+    if (in_col < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      block_cols = ((in_col + 31) >> 5) << 5;
     }
     int block_rows = kThreadsPerBlock / block_cols;
     dim3 block_size = dim3(block_cols, block_rows, 1);
@@ -249,18 +238,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
     int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
 
     int grid_cols =
-        std::min((input_col + block_cols - 1) / block_cols, max_blocks);
+        std::min((in_col + block_cols - 1) / block_cols, max_blocks);
     int grid_rows =
-        std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
+        std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
     dim3 grid_size = dim3(grid_cols, grid_rows, 1);
 
     if (sameShape) {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
+          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
     } else {
+      const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, outs_col_gpu,
-          static_cast<int>(outputs_cols.size()), outs_gpu);
+          input.data<T>(), in_row, in_col, dev_outs_col_data,
+          static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
     }
   }
 };
diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h
index 22147d79e4b1eeee76f7445dd963bf5062049a34..9e080f2e8be23768dcea47b577043beef37b2eaf 100644
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/tensor.h"
 
 namespace paddle {
@@ -55,7 +57,8 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const int axis, std::vector<framework::Tensor>& outputs);
+                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const int axis, std::vector<framework::Tensor*>* outputs);
 };
 
 }  // namespace math
diff --git a/paddle/fluid/operators/math/concat.hip.cu b/paddle/fluid/operators/math/concat.hip.cu
new file mode 100644
index 0000000000000000000000000000000000000000..eacef0438883891671fec6e4001f862f619723cb
--- /dev/null
+++ b/paddle/fluid/operators/math/concat.hip.cu
@@ -0,0 +1,15 @@
+/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <hip/hip_runtime.h>
diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc
index 1741af8148bb90863f294ba4930006a58b5ddbf9..a46f2d51ca64501a622b5b48b424dffa16efc5b4 100644
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
@@ -17,17 +17,14 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/tensor_util.h"
 
-using namespace paddle::framework;
-using namespace paddle::platform;
-
 template <typename DeviceContext, typename Place>
 void testConcat() {
-  Tensor input_a_cpu;
-  Tensor input_b_cpu;
-  Tensor out_cpu;
-  Tensor input_a;
-  Tensor input_b;
-  Tensor out;
+  paddle::framework::Tensor input_a_cpu;
+  paddle::framework::Tensor input_b_cpu;
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::Tensor input_a;
+  paddle::framework::Tensor input_b;
+  paddle::framework::Tensor out;
 
   DeviceContext* context = new DeviceContext(Place());
   //  DeviceContext context(Place());
@@ -40,18 +37,18 @@ void testConcat() {
    *    output:
    *        out.shape: [5, 3, 4]
    */
-  auto dim_a = make_ddim({2, 3, 4});
-  auto dim_b = make_ddim({3, 3, 4});
-  auto dim_out = make_ddim({5, 3, 4});
+  auto dim_a = paddle::framework::make_ddim({2, 3, 4});
+  auto dim_b = paddle::framework::make_ddim({3, 3, 4});
+  auto dim_out = paddle::framework::make_ddim({5, 3, 4});
 
   input_a.mutable_data<int>(dim_a, Place());
   input_b.mutable_data<int>(dim_b, Place());
   out.mutable_data<int>(dim_out, Place());
 
   if (paddle::platform::is_gpu_place(Place())) {
-    input_a_cpu.mutable_data<int>(dim_a, CPUPlace());
-    input_b_cpu.mutable_data<int>(dim_b, CPUPlace());
-    out_cpu.mutable_data<int>(dim_out, CPUPlace());
+    input_a_cpu.mutable_data<int>(dim_a, paddle::platform::CPUPlace());
+    input_b_cpu.mutable_data<int>(dim_b, paddle::platform::CPUPlace());
+    out_cpu.mutable_data<int>(dim_out, paddle::platform::CPUPlace());
   }
 
   int* a_ptr;
@@ -72,11 +69,11 @@ void testConcat() {
   }
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopy(input_a_cpu, Place(), *context, &input_a);
-    TensorCopy(input_b_cpu, Place(), *context, &input_b);
+    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
+    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
   }
 
-  std::vector<Tensor> input;
+  std::vector<paddle::framework::Tensor> input;
   input.push_back(input_a);
   input.push_back(input_b);
 
@@ -89,7 +86,8 @@ void testConcat() {
 
   int* out_ptr;
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopy(out, CPUPlace(), *context, &out_cpu);
+    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
+                                      &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -115,9 +113,9 @@ void testConcat() {
     *    output:
     *        out.shape: [2, 7, 4]
     */
-  dim_a = make_ddim({2, 3, 4});
-  dim_b = make_ddim({2, 4, 4});
-  dim_out = make_ddim({2, 7, 4});
+  dim_a = paddle::framework::make_ddim({2, 3, 4});
+  dim_b = paddle::framework::make_ddim({2, 4, 4});
+  dim_out = paddle::framework::make_ddim({2, 7, 4});
 
   input_a.Resize(dim_a);
   input_b.Resize(dim_b);
@@ -144,8 +142,8 @@ void testConcat() {
   }
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopy(input_a_cpu, Place(), *context, &input_a);
-    TensorCopy(input_b_cpu, Place(), *context, &input_b);
+    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
+    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
   }
 
   input.clear();
@@ -159,7 +157,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopy(out, CPUPlace(), *context, &out_cpu);
+    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
+                                      &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -187,9 +186,9 @@ void testConcat() {
     *    output:
     *        out.shape: [2, 3, 9]
     */
-  dim_a = make_ddim({2, 3, 4});
-  dim_b = make_ddim({2, 3, 5});
-  dim_out = make_ddim({2, 3, 9});
+  dim_a = paddle::framework::make_ddim({2, 3, 4});
+  dim_b = paddle::framework::make_ddim({2, 3, 5});
+  dim_out = paddle::framework::make_ddim({2, 3, 9});
 
   input_a.Resize(dim_a);
   input_b.Resize(dim_b);
@@ -216,8 +215,8 @@ void testConcat() {
   }
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopy(input_a_cpu, Place(), *context, &input_a);
-    TensorCopy(input_b_cpu, Place(), *context, &input_b);
+    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
+    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
   }
 
   input.clear();
@@ -231,7 +230,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopy(out, CPUPlace(), *context, &out_cpu);
+    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
+                                      &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
@@ -261,9 +261,9 @@ void testConcat() {
     *    output:
     *        out.shape: [2, 6, 4]
     */
-  dim_a = make_ddim({2, 3, 4});
-  dim_b = make_ddim({2, 3, 4});
-  dim_out = make_ddim({2, 6, 4});
+  dim_a = paddle::framework::make_ddim({2, 3, 4});
+  dim_b = paddle::framework::make_ddim({2, 3, 4});
+  dim_out = paddle::framework::make_ddim({2, 6, 4});
 
   input_a.Resize(dim_a);
   input_b.Resize(dim_b);
@@ -290,8 +290,8 @@ void testConcat() {
   }
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopy(input_a_cpu, Place(), *context, &input_a);
-    TensorCopy(input_b_cpu, Place(), *context, &input_b);
+    paddle::framework::TensorCopySync(input_a_cpu, Place(), &input_a);
+    paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b);
   }
 
   input.clear();
@@ -305,7 +305,8 @@ void testConcat() {
   PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
 
   if (paddle::platform::is_gpu_place(Place())) {
-    TensorCopy(out, CPUPlace(), *context, &out_cpu);
+    paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(),
+                                      &out_cpu);
     out_ptr = out_cpu.data<int>();
   } else {
     out_ptr = out.data<int>();
diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h
index 4da94383af6c927a2db3337be6e82ca95a8cb036..bc0df3f3551c7a100d5d285cab585bb81c07fc5e 100644
--- a/paddle/fluid/operators/math/context_project.h
+++ b/paddle/fluid/operators/math/context_project.h
@@ -14,9 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -209,6 +211,7 @@ class ContextProjectGradFunctor {
     int input_row_begin, input_row_end;
     int sequence_height, sequence_width;
     sequence_width = in.dims()[1];
+    auto blas = math::GetBlas<DeviceContext, T>(context);
 
     if (input_grad) {
       for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
@@ -260,8 +263,8 @@ class ContextProjectGradFunctor {
               Tensor out_t_sub = out_t.Slice(k * context_length,
                                              k * context_length + padding_size);
               Tensor w_sub = padding_data->Slice(k, k + padding_size);
-              axpy<DeviceContext, T>(context, w_sub.numel(), static_cast<T>(1),
-                                     out_t_sub.data<T>(), w_sub.data<T>());
+              blas.AXPY(w_sub.numel(), static_cast<T>(1), out_t_sub.data<T>(),
+                        w_sub.data<T>());
             }
           }
           if (down_pad > 0) {
@@ -292,8 +295,8 @@ class ContextProjectGradFunctor {
                   (down_pad_begin_row + t) * context_length);
               Tensor w_sub = padding_data->Slice(
                   up_pad + padding_idx, up_pad + padding_idx + padding_size);
-              axpy<DeviceContext, T>(context, w_sub.numel(), static_cast<T>(1),
-                                     out_t_sub.data<T>(), w_sub.data<T>());
+              blas.AXPY(w_sub.numel(), static_cast<T>(1), out_t_sub.data<T>(),
+                        w_sub.data<T>());
             }
           }
           out_t.Resize({sequence_height, context_length * sequence_width});
diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu
index 55c1e726335dfe010e39847ac90b84cc49955360..4e6ff5ee0a449b42762748ba1a103876beee01f2 100644
--- a/paddle/fluid/operators/math/cos_sim_functor.cu
+++ b/paddle/fluid/operators/math/cos_sim_functor.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/cos_sim_functor.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc
index fc0fca5ad3370633b2f60db65fdb7c01c417dc50..caff35e03ae3a144f799d982c859ded62cb3e93d 100644
--- a/paddle/fluid/operators/math/cross_entropy.cc
+++ b/paddle/fluid/operators/math/cross_entropy.cc
@@ -46,7 +46,10 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
 
       const int64_t* label_data = labels->data<int64_t>();
       for (int i = 0; i < batch_size; ++i) {
-        int index = i * class_num + label_data[i];
+        int lbl = label_data[i];
+        PADDLE_ENFORCE_GE(lbl, 0);
+        PADDLE_ENFORCE_LT(lbl, class_num);
+        int index = i * class_num + lbl;
         loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));
       }
     }
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index f4935c2813c9f84699f1182df6a9adb613190506..0de58d5fddd84d33f708c4c73e5a19dc2fe8a86b 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
@@ -29,66 +31,22 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
   }
 }
 
-template <typename T>
-__device__ __forceinline__ T sum_single_warp(T val) {
-  val += __shfl_down(val, 16);
-  val += __shfl_down(val, 8);
-  val += __shfl_down(val, 4);
-  val += __shfl_down(val, 2);
-  val += __shfl_down(val, 1);
-  return val;
-}
-
-// CUDA do not support dynamic arrary in template
-// https://stackoverflow.com/questions/20497209
-template <typename T>
-struct SharedMemory {
-  // Ensure that we won't compile any un-specialized types
-  __device__ T* GetPointer() { return NULL; }
-};
-
-template <>
-struct SharedMemory<float> {
-  __device__ float* GetPointer() {
-    extern __shared__ float s_float[];
-    return s_float;
-  }
-};
-
-template <>
-struct SharedMemory<double> {
-  __device__ double* GetPointer() {
-    extern __shared__ double s_double[];
-    return s_double;
-  }
-};
-
 template <typename T>
 __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
                                        const int class_num) {
   int tid = threadIdx.x;
-  SharedMemory<T> d_sum_shared;
-  T* d_sum = d_sum_shared.GetPointer();
-  d_sum[tid] = 0;
+  T val = 0;
 
-  int cur_idx = tid;
-  int next_idx = blockIdx.x * class_num + tid;
-  while (cur_idx < class_num) {
-    d_sum[tid] +=
-        math::TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx];
-    next_idx += blockDim.x;
-    cur_idx += blockDim.x;
+  int idx = blockIdx.x * class_num + tid;
+  int end = blockIdx.x * class_num + class_num;
+  for (; idx < end; idx += blockDim.x) {
+    val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
   }
-  __syncthreads();
 
-  for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) {
-    if (tid < stride) d_sum[tid] += d_sum[tid + stride];
-    __syncthreads();
+  val = paddle::platform::reduceSum(val, tid, blockDim.x);
+  if (threadIdx.x == 0) {
+    Y[blockIdx.x] = -val;
   }
-
-  T val = d_sum[tid];
-  val = sum_single_warp<T>(val);
-  if (tid == 0) Y[blockIdx.x] = -val;
 }
 }  // namespace
 
@@ -108,11 +66,11 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
 
     if (softLabel) {
       const T* label_data = labels->data<T>();
-      int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num)));
+      int block = class_num > 512
+                      ? 512
+                      : pow(2, static_cast<int>(std::log2(class_num)));
 
-      SoftCrossEntropyKernel<T><<<
-          batch_size, block, block * sizeof(T),
-          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      SoftCrossEntropyKernel<T><<<batch_size, block, 0, ctx.stream()>>>(
           loss_data, prob_data, label_data, class_num);
     } else {
       const int64_t* label_data = labels->data<int64_t>();
diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu
index a5e6e4031bbaddc2d09c660d34a975b2a496bc63..027e2de48d229761f12f974dc73625c8ea1b3567 100644
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
@@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <vector>
 #include "paddle/fluid/operators/math/depthwise_conv.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h
index 081bda891d044041f6814964e7076e28e812039c..97aec401889a56d3fc9ac08e766d931bb3725b01 100644
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/hostdevice.h"
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index d205ebf210818b91c3cf6d4563fca1e56702bcf9..b127fbe8c8515e7fe57b07ea1d4291675ec4efca 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include <math.h>
+#include <string>
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
diff --git a/paddle/fluid/operators/math/detail/avx_functions.cc b/paddle/fluid/operators/math/detail/avx_functions.cc
index b95109d3f73505fa6b5438326804a2b348fb3668..5641f914523771f47bd7f814bfd39964a53deefc 100644
--- a/paddle/fluid/operators/math/detail/avx_functions.cc
+++ b/paddle/fluid/operators/math/detail/avx_functions.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <immintrin.h>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 // TODO(qingqing) refine this dependence
-#include "paddle/cuda/src/avx_mathfun.h"
+#include "paddle/legacy/cuda/src/avx_mathfun.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
index 1e5ff8ef46db960ddf88ebf03041893b176c1950..b6f4ab93777f2b5eb13a4a3172028d87c4546017 100644
--- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
@@ -43,8 +43,8 @@ void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
       r_prev_out = prev_output_value[i];
     }
 
-    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
-                    r_value_reset_output, active_gate);
+    op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
+                    &r_value_reset_output, active_gate);
 
     update_gate[i] = r_value_update_gate;
     reset_gate[i] = r_value_reset_gate;
@@ -71,8 +71,8 @@ void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
       r_prev_out = prev_output_value[i];
     }
 
-    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
-                    r_output, active_node);
+    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
+                    &r_output, active_node);
 
     frame_state[i] = r_value_frame_state;
     output_value[i] = r_output;
@@ -89,22 +89,22 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
   __m256 r_value_reset_gate;
   __m256 r_value_reset_output;
   __m256 r_prev_out = _mm256_set1_ps(0.0f);
-  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
+  __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *reset_gate = reinterpret_cast<__m256 *>(gate_value + frame_size);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_update_gate = update_gate[i];
     r_value_reset_gate = reset_gate[i];
     if (prev_output_value) {
-      r_prev_out = ((__m256 *)prev_output_value)[i];
+      r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
     }
 
-    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
-                    r_value_reset_output, active_gate);
+    op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
+                    &r_value_reset_output, active_gate);
 
     update_gate[i] = r_value_update_gate;
     reset_gate[i] = r_value_reset_gate;
-    ((__m256 *)reset_output_value)[i] = r_value_reset_output;
+    (reinterpret_cast<__m256 *>(reset_output_value))[i] = r_value_reset_output;
   }
 #endif
 }
@@ -119,21 +119,21 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
   __m256 r_value_frame_state;
   __m256 r_prev_out = _mm256_set1_ps(0.0f);
   __m256 r_output;
-  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
+  __m256 *update_gate = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *frame_state = reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_update_gate = update_gate[i];
     r_value_frame_state = frame_state[i];
     if (prev_output_value) {
-      r_prev_out = ((__m256 *)prev_output_value)[i];
+      r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i];
     }
 
-    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
-                    r_output, active_node);
+    op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
+                    &r_output, active_node);
 
     frame_state[i] = r_value_frame_state;
-    ((__m256 *)output_value)[i] = r_output;
+    (reinterpret_cast<__m256 *>(output_value))[i] = r_output;
   }
 #endif
 }
@@ -213,9 +213,9 @@ void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
       r_prev_out_grad = prev_out_grad[i];
     }
 
-    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
-                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
-                  r_out_grad, active_node);
+    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
+                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
+                  &r_prev_out_grad, &r_out_grad, active_node);
 
     update_gate_grad[i] = r_update_gate_grad;
     frame_state_grad[i] = r_frame_state_grad;
@@ -258,9 +258,9 @@ void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
       r_prev_out_grad = prev_out_grad[i];
     }
 
-    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
-                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
-                  r_reset_output_grad, active_gate);
+    op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
+                  &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
+                  &r_prev_out_grad, &r_reset_output_grad, active_gate);
 
     update_gate_grad[i] = r_update_gate_grad;
     reset_gate_grad[i] = r_reset_gate_grad;
@@ -284,30 +284,32 @@ void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
   __m256 r_out_grad;
   __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
-  __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
+  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
+  __m256 *frame_state_value =
+      reinterpret_cast<__m256 *>(gate_value + frame_size * 2);
+  __m256 *frame_state_grad =
+      reinterpret_cast<__m256 *>(gate_grad + frame_size * 2);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_update_gate_value = update_gate_value[i];
     r_frame_state_value = frame_state_value[i];
-    r_out_grad = ((__m256 *)output_grad)[i];
+    r_out_grad = (reinterpret_cast<__m256 *>(output_grad))[i];
     if (prev_out_value) {
-      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
     }
     if (prev_out_grad) {
-      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
     }
 
-    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
-                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
-                  r_out_grad, active_node);
+    op_state_grad(&r_update_gate_value, &r_update_gate_grad,
+                  &r_frame_state_value, &r_frame_state_grad, &r_prev_out_value,
+                  &r_prev_out_grad, &r_out_grad, active_node);
 
     update_gate_grad[i] = r_update_gate_grad;
     frame_state_grad[i] = r_frame_state_grad;
     if (prev_out_grad) {
-      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
     }
   }
 #endif
@@ -327,10 +329,11 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
   __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
   __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
-  __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
+  __m256 *update_gate_value = reinterpret_cast<__m256 *>(gate_value);
+  __m256 *update_gate_grad = reinterpret_cast<__m256 *>(gate_grad);
+  __m256 *reset_gate_value =
+      reinterpret_cast<__m256 *>(gate_value + frame_size);
+  __m256 *reset_gate_grad = reinterpret_cast<__m256 *>(gate_grad + frame_size);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_update_gate_value = update_gate_value[i];
@@ -338,23 +341,23 @@ void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
     r_reset_gate_value = reset_gate_value[i];
 
     if (prev_out_value && prev_out_grad) {
-      r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
+      r_reset_output_grad = (reinterpret_cast<__m256 *>(reset_output_grad))[i];
     }
     if (prev_out_value) {
-      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+      r_prev_out_value = (reinterpret_cast<__m256 *>(prev_out_value))[i];
     }
     if (prev_out_grad) {
-      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+      r_prev_out_grad = (reinterpret_cast<__m256 *>(prev_out_grad))[i];
     }
 
-    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
-                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
-                  r_reset_output_grad, active_gate);
+    op_reset_grad(&r_update_gate_value, &r_update_gate_grad,
+                  &r_reset_gate_value, &r_reset_gate_grad, &r_prev_out_value,
+                  &r_prev_out_grad, &r_reset_output_grad, active_gate);
 
     update_gate_grad[i] = r_update_gate_grad;
     reset_gate_grad[i] = r_reset_gate_grad;
     if (prev_out_grad) {
-      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+      (reinterpret_cast<__m256 *>(prev_out_grad))[i] = r_prev_out_grad;
     }
   }
 #endif
diff --git a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
index 657652562780ae9932a4394335bfa3c3b397bb80..813d69f6aba722609a0523a5be71d32f91f76d59 100644
--- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <type_traits>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
@@ -55,8 +55,8 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
     r_prev_out = prev_output_value[frame_idx];
   }
 
-  op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
-                  r_value_reset_output, active_gate);
+  op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
+                  &r_value_reset_output, active_gate);
 
   gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
   gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
@@ -93,8 +93,8 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
     r_prev_out = prev_output_value[frame_idx];
   }
 
-  op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
-                  r_output, active_node);
+  op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
+                  &r_output, active_node);
 
   gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
   output_value[frame_idx] = r_output;
@@ -137,9 +137,9 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
     r_prev_out_grad = prev_out_grad[frame_idx];
   }
 
-  op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
-                r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
-                r_out_grad, active_node);
+  op_state_grad(&r_update_gate_value, &r_update_gate_grad, &r_frame_state_value,
+                &r_frame_state_grad, &r_prev_out_value, &r_prev_out_grad,
+                &r_out_grad, active_node);
 
   gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
   gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
@@ -185,9 +185,9 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
     r_reset_output_grad = reset_output_grad[frame_idx];
   }
 
-  op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
-                r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
-                r_reset_output_grad, active_gate);
+  op_reset_grad(&r_update_gate_value, &r_update_gate_grad, &r_reset_gate_value,
+                &r_reset_gate_grad, &r_prev_out_value, &r_prev_out_grad,
+                &r_reset_output_grad, active_gate);
 
   gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
   gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h
index 991f2e758c2c3b4acc93c1eb84dbd196d623b5cc..f6d192358bd84eb56a2e01eb36f28d8832ef271f 100644
--- a/paddle/fluid/operators/math/detail/gru_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_kernel.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+#include <type_traits>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
-#include <type_traits>
-
 // TODO(guosheng): refine code style in gru_kernel
 namespace paddle {
 namespace operators {
@@ -28,25 +28,25 @@ namespace forward {
 template <typename T>
 class gru_resetOutput {
  public:
-  HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
-                             T &prev_out, T &value_reset_output,
+  HOSTDEVICE void operator()(T *value_update_gate, T *value_reset_gate,
+                             T *prev_out, T *value_reset_output,
                              ActivationType act_gate) {
-    value_update_gate = activation(value_update_gate, act_gate);
-    value_reset_gate = activation(value_reset_gate, act_gate);
-    value_reset_output = prev_out * value_reset_gate;
+    *value_update_gate = activation(*value_update_gate, act_gate);
+    *value_reset_gate = activation(*value_reset_gate, act_gate);
+    *value_reset_output = (*prev_out) * (*value_reset_gate);
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &value_update_gate,
-                             __m256 &value_reset_gate, __m256 &prev_out,
-                             __m256 &value_reset_output,
+  HOSTDEVICE void operator()(__m256 *value_update_gate,
+                             __m256 *value_reset_gate, __m256 *prev_out,
+                             __m256 *value_reset_output,
                              ActivationType act_gate) {
-    value_update_gate = activation(value_update_gate, act_gate);
-    value_reset_gate = activation(value_reset_gate, act_gate);
-    value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
+    *value_update_gate = activation(*value_update_gate, act_gate);
+    *value_reset_gate = activation(*value_reset_gate, act_gate);
+    *value_reset_output = _mm256_mul_ps(*prev_out, *value_reset_gate);
   }
 #endif
 #endif
@@ -55,25 +55,25 @@ class gru_resetOutput {
 template <typename T>
 class gru_finalOutput {
  public:
-  HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
-                             T &prev_out, T &value_output,
+  HOSTDEVICE void operator()(T *value_update_gate, T *value_frame_state,
+                             T *prev_out, T *value_output,
                              ActivationType act_input) {
-    value_frame_state = activation(value_frame_state, act_input);
-    value_output = prev_out - (value_update_gate * prev_out) +
-                   (value_update_gate * value_frame_state);
+    *value_frame_state = activation(*value_frame_state, act_input);
+    *value_output = *prev_out - ((*value_update_gate) * (*prev_out)) +
+                    ((*value_update_gate) * (*value_frame_state));
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &value_update_gate,
-                             __m256 &value_frame_state, __m256 &prev_out,
-                             __m256 &value_output, ActivationType act_input) {
-    value_frame_state = activation(value_frame_state, act_input);
-    value_output = _mm256_add_ps(
-        _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
-        _mm256_mul_ps(value_update_gate, value_frame_state));
+  HOSTDEVICE void operator()(__m256 *value_update_gate,
+                             __m256 *value_frame_state, __m256 *prev_out,
+                             __m256 *value_output, ActivationType act_input) {
+    *value_frame_state = activation(*value_frame_state, act_input);
+    *value_output = _mm256_add_ps(
+        _mm256_sub_ps(*prev_out, _mm256_mul_ps(*value_update_gate, *prev_out)),
+        _mm256_mul_ps(*value_update_gate, *value_frame_state));
   }
 #endif
 #endif
@@ -85,37 +85,38 @@ namespace backward {
 template <typename T>
 class gru_stateGrad {
  public:
-  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
-                             T &value_frame_state, T &grad_frame_state,
-                             T &value_prev_out, T &grad_prev_out,
-                             T &grad_output, ActivationType act_input) {
-    grad_update_gate = (grad_output * value_frame_state);
-    grad_update_gate -= (grad_output * value_prev_out);
-    grad_prev_out -= (grad_output * value_update_gate);
-    grad_prev_out += grad_output;
-    grad_frame_state = activation(grad_output * value_update_gate,
-                                  value_frame_state, act_input);
+  HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate,
+                             T *value_frame_state, T *grad_frame_state,
+                             T *value_prev_out, T *grad_prev_out,
+                             T *grad_output, ActivationType act_input) {
+    *grad_update_gate = (*grad_output * (*value_frame_state));
+    *grad_update_gate -= (*grad_output * (*value_prev_out));
+    *grad_prev_out -= (*grad_output * (*value_update_gate));
+    *grad_prev_out += *grad_output;
+    *grad_frame_state = activation(*grad_output * (*value_update_gate),
+                                   *value_frame_state, act_input);
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &value_update_gate,
-                             __m256 &grad_update_gate,
-                             __m256 &value_frame_state,
-                             __m256 &grad_frame_state, __m256 &value_prev_out,
-                             __m256 &grad_prev_out, __m256 &grad_output,
+  HOSTDEVICE void operator()(__m256 *value_update_gate,
+                             __m256 *grad_update_gate,
+                             __m256 *value_frame_state,
+                             __m256 *grad_frame_state, __m256 *value_prev_out,
+                             __m256 *grad_prev_out, __m256 *grad_output,
                              ActivationType act_input) {
-    grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
-    grad_update_gate = _mm256_sub_ps(
-        grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
-    grad_prev_out = _mm256_add_ps(
-        _mm256_sub_ps(grad_prev_out,
-                      _mm256_mul_ps(grad_output, value_update_gate)),
-        grad_output);
-    grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate),
-                                  value_frame_state, act_input);
+    *grad_update_gate = _mm256_mul_ps(*grad_output, *value_frame_state);
+    *grad_update_gate = _mm256_sub_ps(
+        *grad_update_gate, _mm256_mul_ps(*grad_output, *value_prev_out));
+    *grad_prev_out = _mm256_add_ps(
+        _mm256_sub_ps(*grad_prev_out,
+                      _mm256_mul_ps(*grad_output, *value_update_gate)),
+        *grad_output);
+    *grad_frame_state =
+        activation(_mm256_mul_ps(*grad_output, *value_update_gate),
+                   *value_frame_state, act_input);
   }
 #endif
 #endif
@@ -124,32 +125,34 @@ class gru_stateGrad {
 template <typename T>
 class gru_resetGrad {
  public:
-  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
-                             T &value_reset_gate, T &grad_reset_gate,
-                             T &value_prev_out, T &grad_prev_out,
-                             T &grad_reset_output, ActivationType act_gate) {
-    grad_reset_gate = (grad_reset_output * value_prev_out);
-    grad_prev_out += (grad_reset_output * value_reset_gate);
-    grad_update_gate =
-        activation(grad_update_gate, value_update_gate, act_gate);
-    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
+  HOSTDEVICE void operator()(T *value_update_gate, T *grad_update_gate,
+                             T *value_reset_gate, T *grad_reset_gate,
+                             T *value_prev_out, T *grad_prev_out,
+                             T *grad_reset_output, ActivationType act_gate) {
+    *grad_reset_gate = (*grad_reset_output * (*value_prev_out));
+    *grad_prev_out += (*grad_reset_output * (*value_reset_gate));
+    *grad_update_gate =
+        activation(*grad_update_gate, *value_update_gate, act_gate);
+    *grad_reset_gate =
+        activation(*grad_reset_gate, *value_reset_gate, act_gate);
   }
 #ifndef __NVCC__
 #ifndef __AVX__
   static const bool avx = false;
 #else
   static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &value_update_gate,
-                             __m256 &grad_update_gate, __m256 &value_reset_gate,
-                             __m256 &grad_reset_gate, __m256 &value_prev_out,
-                             __m256 &grad_prev_out, __m256 &grad_reset_output,
+  HOSTDEVICE void operator()(__m256 *value_update_gate,
+                             __m256 *grad_update_gate, __m256 *value_reset_gate,
+                             __m256 *grad_reset_gate, __m256 *value_prev_out,
+                             __m256 *grad_prev_out, __m256 *grad_reset_output,
                              ActivationType act_gate) {
-    grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
-    grad_prev_out = _mm256_add_ps(
-        grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
-    grad_update_gate =
-        activation(grad_update_gate, value_update_gate, act_gate);
-    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
+    *grad_reset_gate = _mm256_mul_ps(*grad_reset_output, *value_prev_out);
+    *grad_prev_out = _mm256_add_ps(
+        *grad_prev_out, _mm256_mul_ps(*grad_reset_output, *value_reset_gate));
+    *grad_update_gate =
+        activation(*grad_update_gate, *value_update_gate, act_gate);
+    *grad_reset_gate =
+        activation(*grad_reset_gate, *value_reset_gate, act_gate);
   }
 #endif
 #endif
diff --git a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
index 6ad77830fd7a9809c4922878cc8ccdff1e8e0ef7..ccbd05c82ad6a880d21269092088be9656b35c99 100644
--- a/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_cpu_kernel.h
@@ -59,9 +59,9 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
       r_prev_state = value.prev_state_value[i];
     }
 
-    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
-       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
-       active_gate, active_state);
+    op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
+       &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
+       active_node, active_gate, active_state);
 
     value_in[i] = r_value_in;
     value_ig[i] = r_value_ig;
@@ -125,11 +125,11 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
       r_prev_state = value.prev_state_value[i];
     }
 
-    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
-       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
-       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
-       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
-       active_state);
+    op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in,
+       &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
+       &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
+       &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
+       active_node, active_gate, active_state);
 
     grad_in[i] = r_grad_in;
     grad_ig[i] = r_grad_ig;
@@ -164,10 +164,12 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
   __m256 r_state_atv;
   __m256 r_out;
 
-  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_in = value_in[i];
@@ -175,26 +177,26 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
     r_value_fg = value_fg[i];
     r_value_og = value_og[i];
     if (value.check_ig) {
-      r_checkI = ((__m256 *)value.check_ig)[i];
-      r_checkF = ((__m256 *)value.check_fg)[i];
-      r_checkO = ((__m256 *)value.check_og)[i];
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
     }
 
     if (value.prev_state_value) {
-      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
     }
 
-    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
-       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
-       active_gate, active_state);
+    op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
+       &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
+       active_node, active_gate, active_state);
 
     value_in[i] = r_value_in;
     value_ig[i] = r_value_ig;
     value_fg[i] = r_value_fg;
     value_og[i] = r_value_og;
-    ((__m256 *)value.state_value)[i] = r_state;
-    ((__m256 *)value.state_active_value)[i] = r_state_atv;
-    ((__m256 *)value.output_value)[i] = r_out;
+    (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state;
+    (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv;
+    (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out;
   }
 #endif
 }
@@ -227,14 +229,16 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
   __m256 r_checkFGrad;
   __m256 r_checkOGrad;
 
-  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
-  __m256 *grad_in = (__m256 *)grad.gate_grad;
-  __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size);
-  __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2);
-  __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3);
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
+  __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad);
+  __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size);
+  __m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2);
+  __m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3);
 
   for (int i = 0; i < frame_size / 8; i++) {
     r_value_in = value_in[i];
@@ -242,37 +246,40 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
     r_value_fg = value_fg[i];
     r_value_og = value_og[i];
     if (value.check_ig) {
-      r_checkI = ((__m256 *)value.check_ig)[i];
-      r_checkF = ((__m256 *)value.check_fg)[i];
-      r_checkO = ((__m256 *)value.check_og)[i];
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
     }
-    r_state = ((__m256 *)value.state_value)[i];
-    r_state_atv = ((__m256 *)value.state_active_value)[i];
-    r_output_grad = ((__m256 *)grad.output_grad)[i];
-    r_state_grad = ((__m256 *)grad.state_grad)[i];
+    r_state = (reinterpret_cast<__m256 *>(value.state_value))[i];
+    r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i];
+    r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i];
+    r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i];
     if (value.prev_state_value) {
-      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
     }
 
-    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
-       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
-       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
-       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
-       active_state);
+    op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in,
+       &r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
+       &r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
+       &r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
+       active_node, active_gate, active_state);
 
     grad_in[i] = r_grad_in;
     grad_ig[i] = r_grad_ig;
     grad_fg[i] = r_grad_fg;
     grad_og[i] = r_grad_og;
-    ((__m256 *)grad.state_grad)[i] = r_state_grad;
+    (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad;
 
     if (grad.prev_state_grad)
-      ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad;
+      (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad;
     if (value.prev_state_value) {
-      if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad;
-      if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad;
+      if (grad.check_ig_grad)
+        (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad;
+      if (grad.check_fg_grad)
+        (reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad;
     }
-    if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad;
+    if (grad.check_og_grad)
+      (reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad;
   }
 #endif
 }
diff --git a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
index ee7b16da4187e0f7f7839eff5c8753d2eb4f9c6d..2aecb69237fdf344ebc0bfe72d9c7c147f06358d 100644
--- a/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_gpu_kernel.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <type_traits>
+
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/device_context.h"
 
-#include <type_traits>
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -70,9 +70,9 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
     r_prev_state = value.prev_state_value[frame_idx];
   }
 
-  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
-     r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate,
-     active_state);
+  op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
+     &r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
+     active_node, active_gate, active_state);
 
   value.gate_value[frame_idx] = r_value_in;
   value.gate_value[frame_idx + frame_size] = r_value_ig;
@@ -145,11 +145,11 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
     r_prev_state = value.prev_state_value[frame_idx];
   }
 
-  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
-     r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
-     r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
-     r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
-     active_state);
+  op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig,
+     &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state,
+     &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF,
+     &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, active_node,
+     active_gate, active_state);
 
   grad.gate_grad[frame_idx] = r_grad_in;
   grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h
index 9080634f2b3fc122a420e049314f53abd50376e0..cbe73d62938d7c4c03a2c8731665260624417fd7 100644
--- a/paddle/fluid/operators/math/detail/lstm_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_kernel.h
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+#include <type_traits>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
-#include <type_traits>
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -27,19 +27,19 @@ namespace forward {
 template <class T>
 class lstm {
  public:
-  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
-                             T &prev_state, T &state, T &state_atv, T &output,
-                             T &checkI, T &checkF, T &checkO,
+  HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
+                             T *prev_state, T *state, T *state_atv, T *output,
+                             T *checkI, T *checkF, T *checkO,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
-    value_in = activation(value_in, active_node);
-    value_ig = activation(value_ig + prev_state * checkI, active_gate);
-    value_fg = activation(value_fg + prev_state * checkF, active_gate);
-    state = value_in * value_ig + prev_state * value_fg;
-    value_og = activation(value_og + state * checkO, active_gate);
-    state_atv = activation(state, active_state);
-    output = value_og * state_atv;
+    *value_in = activation(*value_in, active_node);
+    *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
+    *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
+    *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
+    *value_og = activation(*value_og + (*state) * (*checkO), active_gate);
+    *state_atv = activation(*state, active_state);
+    *output = (*value_og) * (*state_atv);
   }
 #ifndef __NVCC__
 #ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
@@ -48,27 +48,27 @@ class lstm {
   // Only float support AVX optimization
   static const bool avx = std::is_same<T, float>::value;
 
-  HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig,
-                             __m256 &value_fg, __m256 &value_og,
-                             __m256 &prev_state, __m256 &state,
-                             __m256 &state_atv, __m256 &output, __m256 &checkI,
-                             __m256 &checkF, __m256 &checkO,
+  HOSTDEVICE void operator()(__m256 *value_in, __m256 *value_ig,
+                             __m256 *value_fg, __m256 *value_og,
+                             __m256 *prev_state, __m256 *state,
+                             __m256 *state_atv, __m256 *output, __m256 *checkI,
+                             __m256 *checkF, __m256 *checkO,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
-    value_in = activation(value_in, active_node);
-    value_ig =
-        activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)),
-                   active_gate);
-    value_fg =
-        activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)),
-                   active_gate);
-    state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig),
-                          _mm256_mul_ps(prev_state, value_fg));
-    value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)),
-                          active_gate);
-    state_atv = activation(state, active_state);
-    output = _mm256_mul_ps(value_og, state_atv);
+    *value_in = activation(*value_in, active_node);
+    *value_ig = activation(
+        _mm256_add_ps(*value_ig, _mm256_mul_ps(*prev_state, *checkI)),
+        active_gate);
+    *value_fg = activation(
+        _mm256_add_ps(*value_fg, _mm256_mul_ps(*prev_state, *checkF)),
+        active_gate);
+    *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
+                           _mm256_mul_ps(*prev_state, *value_fg));
+    *value_og = activation(
+        _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
+    *state_atv = activation(*state, active_state);
+    *output = _mm256_mul_ps(*value_og, *state_atv);
   }
 #endif
 #endif
@@ -81,26 +81,29 @@ namespace backward {
 template <class T>
 class lstm {
  public:
-  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
-                             T &grad_in, T &grad_ig, T &grad_fg, T &grad_og,
-                             T &prev_state, T &prev_state_grad, T &state,
-                             T &state_grad, T &state_atv, T &output_grad,
-                             T &checkI, T &checkF, T &checkO, T &checkIGrad,
-                             T &checkFGrad, T &checkOGrad,
+  HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
+                             T *grad_in, T *grad_ig, T *grad_fg, T *grad_og,
+                             T *prev_state, T *prev_state_grad, T *state,
+                             T *state_grad, T *state_atv, T *output_grad,
+                             T *checkI, T *checkF, T *checkO, T *checkIGrad,
+                             T *checkFGrad, T *checkOGrad,
                              ActivationType active_node,
                              ActivationType active_gate,
                              ActivationType active_state) {
-    grad_og = activation(output_grad * state_atv, value_og, active_gate);
-    state_grad += activation(output_grad * value_og, state_atv, active_state) +
-                  grad_og * checkO;
-    grad_in = activation(state_grad * value_ig, value_in, active_node);
-    grad_ig = activation(state_grad * value_in, value_ig, active_gate);
-    grad_fg = activation(state_grad * prev_state, value_fg, active_gate);
-    prev_state_grad =
-        grad_ig * checkI + grad_fg * checkF + state_grad * value_fg;
-    checkIGrad = grad_ig * prev_state;
-    checkFGrad = grad_fg * prev_state;
-    checkOGrad = grad_og * state;
+    *grad_og =
+        activation((*output_grad) * (*state_atv), *value_og, active_gate);
+    *state_grad +=
+        activation((*output_grad) * (*value_og), *state_atv, active_state) +
+        (*grad_og) * (*checkO);
+    *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
+    *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
+    *grad_fg =
+        activation((*state_grad) * (*prev_state), *value_fg, active_gate);
+    *prev_state_grad = (*grad_ig) * (*checkI) + (*grad_fg) * (*checkF) +
+                       (*state_grad) * (*value_fg);
+    *checkIGrad = (*grad_ig) * (*prev_state);
+    *checkFGrad = (*grad_fg) * (*prev_state);
+    *checkOGrad = (*grad_og) * (*state);
   }
 #ifndef __NVCC__
 #ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
@@ -109,32 +112,33 @@ class lstm {
   // Only float support AVX optimization
   static const bool avx = std::is_same<T, float>::value;
   HOSTDEVICE void operator()(
-      __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og,
-      __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og,
-      __m256 &prev_state, __m256 &prev_state_grad, __m256 &state,
-      __m256 &state_grad, __m256 &state_atv, __m256 &output_grad,
-      __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad,
-      __m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node,
+      __m256 *value_in, __m256 *value_ig, __m256 *value_fg, __m256 *value_og,
+      __m256 *grad_in, __m256 *grad_ig, __m256 *grad_fg, __m256 *grad_og,
+      __m256 *prev_state, __m256 *prev_state_grad, __m256 *state,
+      __m256 *state_grad, __m256 *state_atv, __m256 *output_grad,
+      __m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad,
+      __m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node,
       ActivationType active_gate, ActivationType active_state) {
-    grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og,
-                         active_gate);
-    state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og),
-                                          state_atv, active_state),
-                               state_grad);
-    state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad);
-    grad_in =
-        activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node);
-    grad_ig =
-        activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate);
-    grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg,
-                         active_gate);
-    prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI),
-                                    _mm256_mul_ps(grad_fg, checkF));
-    prev_state_grad =
-        _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad);
-    checkIGrad = _mm256_mul_ps(grad_ig, prev_state);
-    checkFGrad = _mm256_mul_ps(grad_fg, prev_state);
-    checkOGrad = _mm256_mul_ps(grad_og, state);
+    *grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og,
+                          active_gate);
+    *state_grad =
+        _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
+                                 *state_atv, active_state),
+                      *state_grad);
+    *state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+    *grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in,
+                          active_node);
+    *grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig,
+                          active_gate);
+    *grad_fg = activation(_mm256_mul_ps(*state_grad, *prev_state), *value_fg,
+                          active_gate);
+    *prev_state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_ig, *checkI),
+                                     _mm256_mul_ps(*grad_fg, *checkF));
+    *prev_state_grad =
+        _mm256_add_ps(_mm256_mul_ps(*state_grad, *value_fg), *prev_state_grad);
+    *checkIGrad = _mm256_mul_ps(*grad_ig, *prev_state);
+    *checkFGrad = _mm256_mul_ps(*grad_fg, *prev_state);
+    *checkOGrad = _mm256_mul_ps(*grad_og, *state);
   }
 #endif
 #endif
diff --git a/paddle/fluid/operators/math/detection_util.h b/paddle/fluid/operators/math/detection_util.h
deleted file mode 100644
index c31764cfaf5bbdfea2f3ed06f31f97965a8858ed..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/detection_util.h
+++ /dev/null
@@ -1,300 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <map>
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-template <typename T>
-struct BBox {
-  BBox(T x_min, T y_min, T x_max, T y_max)
-      : x_min(x_min),
-        y_min(y_min),
-        x_max(x_max),
-        y_max(y_max),
-        is_difficult(false) {}
-
-  BBox() {}
-
-  T get_width() const { return x_max - x_min; }
-
-  T get_height() const { return y_max - y_min; }
-
-  T get_center_x() const { return (x_min + x_max) / 2; }
-
-  T get_center_y() const { return (y_min + y_max) / 2; }
-
-  T get_area() const { return get_width() * get_height(); }
-
-  // coordinate of bounding box
-  T x_min;
-  T y_min;
-  T x_max;
-  T y_max;
-  // whether difficult object (e.g. object with heavy occlusion is difficult)
-  bool is_difficult;
-};
-// KNCHW ==> NHWC
-// template <typename T>
-template <typename T>
-void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
-                          std::vector<BBox<T>>& bbox_vec);
-template <typename T>
-void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
-                             std::vector<std::vector<T>>& var_vec);
-template <typename T>
-BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
-                          const std::vector<T>& prior_bbox_var,
-                          const std::vector<T>& loc_pred_data);
-template <typename T1, typename T2>
-bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
-                          const std::pair<T1, T2>& pair2);
-template <typename T>
-bool SortScorePairDescend(const std::pair<T, BBox<T>>& pair1,
-                          const std::pair<T, BBox<T>>& pair2);
-template <typename T>
-T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2);
-
-template <typename T>
-void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
-                  size_t class_idx, size_t top_k, T conf_threshold,
-                  T nms_threshold, size_t num_priors, size_t num_classes,
-                  std::vector<size_t>* indices);
-template <typename T>
-int GetDetectionIndices(
-    const T* conf_data, const size_t num_priors, const size_t num_classes,
-    const size_t background_label_id, const size_t batch_size,
-    const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
-    const size_t top_k,
-    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
-    std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices);
-template <typename T>
-BBox<T> ClipBBox(const BBox<T>& bbox);
-template <typename T>
-void GetDetectionOutput(
-    const T* conf_data, const size_t num_kept, const size_t num_priors,
-    const size_t num_classes, const size_t batch_size,
-    const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
-    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data);
-template <typename T>
-void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
-                          std::vector<BBox<T>>& bbox_vec) {
-  size_t out_offset = bbox_vec.size();
-  bbox_vec.resize(bbox_vec.size() + num_bboxes);
-  for (size_t i = 0; i < num_bboxes; ++i) {
-    BBox<T> bbox;
-    bbox.x_min = *(prior_data + i * 8);
-    bbox.y_min = *(prior_data + i * 8 + 1);
-    bbox.x_max = *(prior_data + i * 8 + 2);
-    bbox.y_max = *(prior_data + i * 8 + 3);
-    bbox_vec[out_offset + i] = bbox;
-  }
-}
-template <typename T>
-void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
-                             std::vector<std::vector<T>>& var_vec) {
-  size_t out_offset = var_vec.size();
-  var_vec.resize(var_vec.size() + num);
-  for (size_t i = 0; i < num; ++i) {
-    std::vector<T> var;
-    var.push_back(*(prior_data + i * 8 + 4));
-    var.push_back(*(prior_data + i * 8 + 5));
-    var.push_back(*(prior_data + i * 8 + 6));
-    var.push_back(*(prior_data + i * 8 + 7));
-    var_vec[out_offset + i] = var;
-  }
-}
-template <typename T>
-BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
-                          const std::vector<T>& prior_bbox_var,
-                          const std::vector<T>& loc_pred_data) {
-  T prior_bbox_width = prior_bbox.get_width();
-  T prior_bbox_height = prior_bbox.get_height();
-  T prior_bbox_center_x = prior_bbox.get_center_x();
-  T prior_bbox_center_y = prior_bbox.get_center_y();
-
-  T decoded_bbox_center_x =
-      prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width +
-      prior_bbox_center_x;
-  T decoded_bbox_center_y =
-      prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height +
-      prior_bbox_center_y;
-  T decoded_bbox_width =
-      std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width;
-  T decoded_bbox_height =
-      std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height;
-
-  BBox<T> decoded_bbox;
-  decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2;
-  decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2;
-  decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2;
-  decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2;
-
-  return decoded_bbox;
-}
-template <typename T1, typename T2>
-bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
-                          const std::pair<T1, T2>& pair2) {
-  return pair1.first > pair2.first;
-}
-template <typename T>
-T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2) {
-  if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min ||
-      bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) {
-    return 0.0;
-  } else {
-    T inter_x_min = std::max(bbox1.x_min, bbox2.x_min);
-    T inter_y_min = std::max(bbox1.y_min, bbox2.y_min);
-    T interX_max = std::min(bbox1.x_max, bbox2.x_max);
-    T interY_max = std::min(bbox1.y_max, bbox2.y_max);
-
-    T inter_width = interX_max - inter_x_min;
-    T inter_height = interY_max - inter_y_min;
-    T inter_area = inter_width * inter_height;
-
-    T bbox_area1 = bbox1.get_area();
-    T bbox_area2 = bbox2.get_area();
-
-    return inter_area / (bbox_area1 + bbox_area2 - inter_area);
-  }
-}
-
-template <typename T>
-void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
-                  size_t class_idx, size_t top_k, T conf_threshold,
-                  T nms_threshold, size_t num_priors, size_t num_classes,
-                  std::vector<size_t>* indices) {
-  std::vector<std::pair<T, size_t>> scores;
-  for (size_t i = 0; i < num_priors; ++i) {
-    size_t conf_offset = i * num_classes + class_idx;
-    if (conf_score_data[conf_offset] > conf_threshold)
-      scores.push_back(std::make_pair(conf_score_data[conf_offset], i));
-  }
-  std::stable_sort(scores.begin(), scores.end(),
-                   SortScorePairDescend<T, size_t>);
-  if (top_k > 0 && top_k < scores.size()) scores.resize(top_k);
-  while (scores.size() > 0) {
-    const size_t idx = scores.front().second;
-    bool keep = true;
-    for (size_t i = 0; i < indices->size(); ++i) {
-      if (keep) {
-        const size_t saved_idx = (*indices)[i];
-        T overlap = jaccard_overlap<T>(bboxes[idx], bboxes[saved_idx]);
-        keep = overlap <= nms_threshold;
-      } else {
-        break;
-      }
-    }
-    if (keep) indices->push_back(idx);
-    scores.erase(scores.begin());
-  }
-}
-template <typename T>
-int GetDetectionIndices(
-    const T* conf_data, const size_t num_priors, const size_t num_classes,
-    const size_t background_label_id, const size_t batch_size,
-    const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
-    const size_t top_k,
-    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
-    std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices) {
-  int total_keep_num = 0;
-  for (size_t n = 0; n < batch_size; ++n) {
-    const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
-    size_t num_detected = 0;
-    std::map<size_t, std::vector<size_t>> indices;
-    size_t conf_offset = n * num_priors * num_classes;
-    for (size_t c = 0; c < num_classes; ++c) {
-      if (c == background_label_id) continue;
-      ApplyNmsFast<T>(decoded_bboxes, conf_data + conf_offset, c, nms_top_k,
-                      conf_threshold, nms_threshold, num_priors, num_classes,
-                      &(indices[c]));
-      num_detected += indices[c].size();
-    }
-    if (top_k > 0 && num_detected > top_k) {
-      // std::vector<pair<T,T>> score_index_pairs;
-      std::vector<std::pair<T, std::pair<size_t, size_t>>> score_index_pairs;
-      for (size_t c = 0; c < num_classes; ++c) {
-        const std::vector<size_t>& label_indices = indices[c];
-        for (size_t i = 0; i < label_indices.size(); ++i) {
-          size_t idx = label_indices[i];
-          score_index_pairs.push_back(
-              std::make_pair((conf_data + conf_offset)[idx * num_classes + c],
-                             std::make_pair(c, idx)));
-        }
-      }
-      std::sort(score_index_pairs.begin(), score_index_pairs.end(),
-                SortScorePairDescend<T, std::pair<size_t, size_t>>);
-      score_index_pairs.resize(top_k);
-      std::map<size_t, std::vector<size_t>> new_indices;
-      for (size_t i = 0; i < score_index_pairs.size(); ++i) {
-        size_t label = score_index_pairs[i].second.first;
-        size_t idx = score_index_pairs[i].second.second;
-        new_indices[label].push_back(idx);
-      }
-      all_detection_indices->push_back(new_indices);
-      total_keep_num += top_k;
-    } else {
-      all_detection_indices->push_back(indices);
-      total_keep_num += num_detected;
-    }
-  }
-  return total_keep_num;
-}
-template <typename T>
-BBox<T> ClipBBox(const BBox<T>& bbox) {
-  T one = static_cast<T>(1.0);
-  T zero = static_cast<T>(0.0);
-  BBox<T> clipped_bbox;
-  clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero);
-  clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero);
-  clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero);
-  clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero);
-  return clipped_bbox;
-}
-template <typename T>
-void GetDetectionOutput(
-    const T* conf_data, const size_t num_kept, const size_t num_priors,
-    const size_t num_classes, const size_t batch_size,
-    const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
-    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data) {
-  size_t count = 0;
-  for (size_t n = 0; n < batch_size; ++n) {
-    for (std::map<size_t, std::vector<size_t>>::const_iterator it =
-             all_indices[n].begin();
-         it != all_indices[n].end(); ++it) {
-      size_t label = it->first;
-      const std::vector<size_t>& indices = it->second;
-      const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
-      for (size_t i = 0; i < indices.size(); ++i) {
-        size_t idx = indices[i];
-        size_t conf_offset = n * num_priors * num_classes + idx * num_classes;
-        out_data[count * 7] = n;
-        out_data[count * 7 + 1] = label;
-        out_data[count * 7 + 2] = (conf_data + conf_offset)[label];
-        BBox<T> clipped_bbox = ClipBBox<T>(decoded_bboxes[idx]);
-        out_data[count * 7 + 3] = clipped_bbox.x_min;
-        out_data[count * 7 + 4] = clipped_bbox.y_min;
-        out_data[count * 7 + 5] = clipped_bbox.x_max;
-        out_data[count * 7 + 6] = clipped_bbox.y_max;
-        ++count;
-      }
-    }
-  }
-}
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc
index 3f044b775138c495052bec3d19121bf26c37cb37..0e15b81deef43a932d4b2d3f545393b0ad9e080c 100644
--- a/paddle/fluid/operators/math/gru_compute.cc
+++ b/paddle/fluid/operators/math/gru_compute.cc
@@ -10,9 +10,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/gru_compute.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
 #include "paddle/fluid/operators/math/detail/gru_kernel.h"
-#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -25,21 +25,21 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
                       const detail::ActivationType active_node,
                       const detail::ActivationType active_gate) {
 #ifndef __NVCC__
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     if (value.prev_out_value) {
-      math::gemm<platform::CPUDeviceContext, T>(
-          context, false, false, batch_size, frame_size * 2, frame_size, 1,
-          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
-          1, value.gate_value, frame_size * 3);
+      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
+                value.prev_out_value, frame_size, value.gate_weight,
+                frame_size * 2, 1, value.gate_value, frame_size * 3);
     }
 
     detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
                                  frame_size, batch_size, active_gate);
 
     if (value.prev_out_value) {
-      math::gemm<platform::CPUDeviceContext, T>(
-          context, false, false, batch_size, frame_size, frame_size, 1,
-          value.reset_output_value, frame_size, value.state_weight, frame_size,
-          1, value.gate_value + frame_size * 2, frame_size * 3);
+      blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
+                value.reset_output_value, frame_size, value.state_weight,
+                frame_size, 1, value.gate_value + frame_size * 2,
+                frame_size * 3);
     }
 
     detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
@@ -58,36 +58,32 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
 #ifndef __NVCC__
     detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
                                 grad, frame_size, batch_size, active_node);
-
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     if (value.prev_out_value && grad.prev_out_grad) {
-      math::gemm<platform::CPUDeviceContext, T>(
-          context, false, true, batch_size, frame_size, frame_size, 1,
-          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
-          frame_size, 0, grad.reset_output_grad, frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
+                grad.gate_grad + frame_size * 2, frame_size * 3,
+                value.state_weight, frame_size, 0, grad.reset_output_grad,
+                frame_size);
 
       if (grad.state_weight_grad) {
-        math::gemm<platform::CPUDeviceContext, T>(
-            context, true, false, frame_size, frame_size, batch_size, 1,
-            value.reset_output_value, frame_size,
-            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
-            grad.state_weight_grad, frame_size);
+        blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
+                  value.reset_output_value, frame_size,
+                  grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+                  grad.state_weight_grad, frame_size);
       }
     }
 
     detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
                                 grad, frame_size, batch_size, active_gate);
-
     if (grad.prev_out_grad && value.prev_out_value) {
-      math::gemm<platform::CPUDeviceContext, T>(
-          context, false, true, batch_size, frame_size, frame_size * 2, 1,
-          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
-          grad.prev_out_grad, frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
+                grad.gate_grad, frame_size * 3, value.gate_weight,
+                frame_size * 2, 1, grad.prev_out_grad, frame_size);
 
       if (grad.gate_weight_grad) {
-        math::gemm<platform::CPUDeviceContext, T>(
-            context, true, false, frame_size, frame_size * 2, batch_size, 1,
-            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
-            grad.gate_weight_grad, frame_size * 2);
+        blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
+                  value.prev_out_value, frame_size, grad.gate_grad,
+                  frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2);
       }
     }
 #endif
diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu
index 27caf3383dd6cd94779391b722bba1d6b74772c0..1327d914952d57aab6e5d17090d0ea976a6d4755 100644
--- a/paddle/fluid/operators/math/gru_compute.cu
+++ b/paddle/fluid/operators/math/gru_compute.cu
@@ -9,10 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <paddle/fluid/platform/device_context.h>
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
 #include "paddle/fluid/operators/math/detail/gru_kernel.h"
 #include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -36,12 +37,11 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
       threads = dim3(32, 32);
       grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
     }
-
+    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
     if (value.prev_out_value) {
-      math::gemm<platform::CUDADeviceContext, T>(
-          context, false, false, batch_size, frame_size * 2, frame_size, 1,
-          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
-          1, value.gate_value, frame_size * 3);
+      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
+                value.prev_out_value, frame_size, value.gate_weight,
+                frame_size * 2, 1, value.gate_value, frame_size * 3);
     }
 
     if (batch_size == 1) {
@@ -61,10 +61,10 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
     }
 
     if (value.prev_out_value) {
-      math::gemm<platform::CUDADeviceContext, T>(
-          context, false, false, batch_size, frame_size, frame_size, 1,
-          value.reset_output_value, frame_size, value.state_weight, frame_size,
-          1, value.gate_value + frame_size * 2, frame_size * 3);
+      blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
+                value.reset_output_value, frame_size, value.state_weight,
+                frame_size, 1, value.gate_value + frame_size * 2,
+                frame_size * 3);
     }
 
     if (batch_size == 1) {
@@ -121,18 +121,19 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
           grad.output_grad, frame_size, batch_size, active_node);
     }
 
+    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+
     if (value.prev_out_value && grad.prev_out_grad) {
-      math::gemm<platform::CUDADeviceContext, T>(
-          context, false, true, batch_size, frame_size, frame_size, 1,
-          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
-          frame_size, 0, grad.reset_output_grad, frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
+                grad.gate_grad + frame_size * 2, frame_size * 3,
+                value.state_weight, frame_size, 0, grad.reset_output_grad,
+                frame_size);
 
       if (grad.state_weight_grad) {
-        math::gemm<platform::CUDADeviceContext, T>(
-            context, true, false, frame_size, frame_size, batch_size, 1,
-            value.reset_output_value, frame_size,
-            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
-            grad.state_weight_grad, frame_size);
+        blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
+                  value.reset_output_value, frame_size,
+                  grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+                  grad.state_weight_grad, frame_size);
       }
     }
 
@@ -153,16 +154,14 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
     }
 
     if (grad.prev_out_grad && value.prev_out_value) {
-      math::gemm<platform::CUDADeviceContext, T>(
-          context, false, true, batch_size, frame_size, frame_size * 2, 1,
-          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
-          grad.prev_out_grad, frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
+                grad.gate_grad, frame_size * 3, value.gate_weight,
+                frame_size * 2, 1, grad.prev_out_grad, frame_size);
 
       if (grad.gate_weight_grad) {
-        math::gemm<platform::CUDADeviceContext, T>(
-            context, true, false, frame_size, frame_size * 2, batch_size, 1,
-            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
-            grad.gate_weight_grad, frame_size * 2);
+        blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
+                  value.prev_out_value, frame_size, grad.gate_grad,
+                  frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2);
       }
     }
   }
diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc
index 123e10586f60f65d6c31a7c1f837bf32610d4ea5..336d6febc2ce3a55e82ed613bbc1081101f822f0 100644
--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu
index f41c78140fb601cd27a4e9997a736f46e903efc7..eecb233d22cea06da016b2671fd606b70eddf5a5 100644
--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/operators/math/im2col.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/im2col.h b/paddle/fluid/operators/math/im2col.h
index 451ec9d53498caa6d53a28dbfc764775d3ed3ecf..26d94e0f2e6163eb7452cf1fbea5966b4344ace1 100644
--- a/paddle/fluid/operators/math/im2col.h
+++ b/paddle/fluid/operators/math/im2col.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc
index b3978536bca7dc276bf5417bed36761a2d496536..8e3f0f286823c383bb0c44d0e7887040ec9b20a0 100644
--- a/paddle/fluid/operators/math/im2col_test.cc
+++ b/paddle/fluid/operators/math/im2col_test.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/im2col.h"
 #include <gtest/gtest.h>
+#include <vector>
 
 template <typename DeviceContext, typename Place>
 void testIm2col() {
@@ -62,7 +63,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    TensorCopy(input_tmp, *place, *context, &input);
+    TensorCopySync(input_tmp, *place, &input);
   }
   output_cfo.mutable_data<float>(
       {1, filter_size, filter_size, output_height, output_width}, *place);
@@ -87,7 +88,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output_cfo.data<float>();
   } else {
-    TensorCopy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
+    TensorCopySync(output_cfo, paddle::platform::CPUPlace(), &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -98,7 +99,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_ocf_ptr = output_ocf.data<float>();
   } else {
-    TensorCopy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
+    TensorCopySync(output_ocf, paddle::platform::CPUPlace(), &output_tmp);
     out_ocf_ptr = output_tmp.data<float>();
   }
 
@@ -119,7 +120,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    TensorCopy(input_tmp, *place, *context, &input);
+    TensorCopySync(input_tmp, *place, &input);
   }
 
   col2im(*context, output_cfo, dilation, stride, padding, &input);
@@ -128,7 +129,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+    TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
@@ -140,7 +141,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    TensorCopy(input_tmp, *place, *context, &input);
+    TensorCopySync(input_tmp, *place, &input);
   }
 
   col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
@@ -148,7 +149,7 @@ void testIm2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+    TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
   for (int i = 0; i < 6; ++i) {
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 35d251f71a0cb631d5900498ea3188b5ddeae334..c3387be6daa3bd34a6e3410ced23fce5d65f2cf7 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/math_function.h"
+#include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
 #include "paddle/fluid/platform/float16.h"
@@ -23,266 +24,13 @@ namespace math {
 
 using float16 = paddle::platform::float16;
 
-template <>
-void gemm<platform::CPUDeviceContext, float16>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const float16* B, const float16 beta,
-    float16* C) {
-  PADDLE_THROW("float16 GEMM not supported on CPU");
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, float>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, double>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, float16>(
-    const platform::CPUDeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const int lda, const float16* B,
-    const int ldb, const float16 beta, float16* C, const int ldc) {
-  PADDLE_THROW("float16 GEMM not supported on CPU");
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, float>(
-    const platform::CPUDeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K, const float alpha,
-    const float* A, const int lda, const float* B, const int ldb,
-    const float beta, float* C, const int ldc) {
-  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, double>(
-    const platform::CPUDeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const int lda, const double* B,
-    const int ldb, const double beta, double* C, const int ldc) {
-  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-
-template <>
-void matmul<platform::CPUDeviceContext, float16>(
-    const platform::CPUDeviceContext& context,
-    const framework::Tensor& matrix_a, bool trans_a,
-    const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
-    framework::Tensor* matrix_out, float16 beta) {
-  PADDLE_THROW("float16 matmul not supported on CPU");
-}
-
-template <>
-void matmul<platform::CPUDeviceContext, float>(
-    const platform::CPUDeviceContext& context,
-    const framework::Tensor& matrix_a, bool trans_a,
-    const framework::Tensor& matrix_b, bool trans_b, float alpha,
-    framework::Tensor* matrix_out, float beta) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-                 "The input and output of matmul be matrix");
-
-  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-                     platform::is_cpu_place(matrix_b.place()) &&
-                     platform::is_cpu_place(matrix_out->place()),
-                 "Matrix must all be in CPUPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<platform::CPUDeviceContext, float>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-      matrix_b.data<float>(), beta, matrix_out->data<float>());
-}
-
-template <>
-void matmul<platform::CPUDeviceContext, double>(
-    const platform::CPUDeviceContext& context,
-    const framework::Tensor& matrix_a, bool trans_a,
-    const framework::Tensor& matrix_b, bool trans_b, double alpha,
-    framework::Tensor* matrix_out, double beta) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-                 "The input and output of matmul be matrix");
-
-  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
-                     platform::is_cpu_place(matrix_b.place()) &&
-                     platform::is_cpu_place(matrix_out->place()),
-                 "Matrix must all be in CPUPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<platform::CPUDeviceContext, double>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-      matrix_b.data<double>(), beta, matrix_out->data<double>());
-}
-
-template <>
-void batched_gemm<platform::CPUDeviceContext, float16>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const float16* B, const float16 beta,
-    float16* C, const int batchCount, const int strideA, const int strideB) {
-  PADDLE_THROW("float16 batched_gemm not supported on CPU");
-}
-
-#ifdef PADDLE_WITH_MKLML
-// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
-template <>
-void batched_gemm<platform::CPUDeviceContext, float>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C, const int batchCount, const int strideA, const int strideB) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  auto a_array = std::vector<const float*>(batchCount);
-  auto b_array = std::vector<const float*>(batchCount);
-  auto c_array = std::vector<float*>(batchCount);
-  for (int k = 0; k < batchCount; ++k) {
-    a_array[k] = &A[k * strideA];
-    b_array[k] = &B[k * strideB];
-    c_array[k] = &C[k * M * N];
-  }
-  cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
-                    a_array.data(), &lda, b_array.data(), &ldb, &beta,
-                    c_array.data(), &ldc, 1 /* group_count */, &batchCount);
-}
-
-template <>
-void batched_gemm<platform::CPUDeviceContext, double>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C, const int batchCount, const int strideA, const int strideB) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  auto a_array = std::vector<const double*>(batchCount);
-  auto b_array = std::vector<const double*>(batchCount);
-  auto c_array = std::vector<double*>(batchCount);
-  for (int k = 0; k < batchCount; ++k) {
-    a_array[k] = &A[k * strideA];
-    b_array[k] = &B[k * strideB];
-    c_array[k] = &C[k * M * N];
-  }
-  cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
-                    a_array.data(), &lda, b_array.data(), &ldb, &beta,
-                    c_array.data(), &ldc, 1 /* group_count */, &batchCount);
-}
-#else
-// The below is a naive but correct serial implementation that just loops
-// over the batch dimension. This is a fallback for when the batched gemm
-// functions of Intel MKL are not available. In the future, this computation
-// should be parallelized.
-template <>
-void batched_gemm<platform::CPUDeviceContext, float>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C, const int batchCount, const int strideA, const int strideB) {
-  for (int k = 0; k < batchCount; ++k) {
-    const float* Ak = &A[k * strideA];
-    const float* Bk = &B[k * strideB];
-    float* Ck = &C[k * M * N];
-    gemm<platform::CPUDeviceContext, float>(context, transA, transB, M, N, K,
-                                            alpha, Ak, Bk, beta, Ck);
-  }
-}
-
-template <>
-void batched_gemm<platform::CPUDeviceContext, double>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C, const int batchCount, const int strideA, const int strideB) {
-  for (int k = 0; k < batchCount; ++k) {
-    const double* Ak = &A[k * strideA];
-    const double* Bk = &B[k * strideB];
-    double* Ck = &C[k * M * N];
-    gemm<platform::CPUDeviceContext, double>(context, transA, transB, M, N, K,
-                                             alpha, Ak, Bk, beta, Ck);
-  }
-}
-#endif
-
-template <>
-void gemv<platform::CPUDeviceContext, float>(
-    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
-    const int N, const float alpha, const float* A, const float* B,
-    const float beta, float* C) {
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
-}
-
-template <>
-void gemv<platform::CPUDeviceContext, double>(
-    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
-    const int N, const double alpha, const double* A, const double* B,
-    const double beta, double* C) {
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
-}
-
-template <>
-void axpy<platform::CPUDeviceContext, float>(
-    const platform::CPUDeviceContext& context, const int n, const float alpha,
-    const float* x, float* y) {
-  cblas_saxpy(n, alpha, x, 1, y, 1);
-}
-
-template <>
-void axpy<platform::CPUDeviceContext, double>(
-    const platform::CPUDeviceContext& context, const int n, const double alpha,
-    const double* x, double* y) {
-  cblas_daxpy(n, alpha, x, 1, y, 1);
-}
-
+template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
 template struct SetConstant<platform::CPUDeviceContext, float>;
 template struct SetConstant<platform::CPUDeviceContext, double>;
 template struct SetConstant<platform::CPUDeviceContext, int>;
 template struct SetConstant<platform::CPUDeviceContext, int64_t>;
 template struct SetConstant<platform::CPUDeviceContext, bool>;
+template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
 
 #define DEFINE_CPU_TRANS(RANK)                                             \
   template struct Transpose<platform::CPUDeviceContext, platform::float16, \
@@ -291,7 +39,9 @@ template struct SetConstant<platform::CPUDeviceContext, bool>;
   template struct Transpose<platform::CPUDeviceContext, double, RANK>;     \
   template struct Transpose<platform::CPUDeviceContext, int, RANK>;        \
   template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;    \
-  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;       \
+  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>;    \
+  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;
 
 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);
@@ -321,6 +71,14 @@ void set_constant_with_place<platform::CPUPlace>(
                            TensorSetConstantCPU(tensor, value));
 }
 
+template <>
+void set_constant_with_place<platform::CUDAPinnedPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(framework::ToDataType(tensor->type()),
+                           TensorSetConstantCPU(tensor, value));
+}
+
 struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
   TensorSetConstantWithPlace(const platform::DeviceContext& context,
                              framework::Tensor* tensor, float value)
@@ -371,6 +129,8 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>;
 
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;
+template struct ColwiseSum<platform::CPUDeviceContext, int>;
+template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
 
 template struct RowwiseSum<platform::CPUDeviceContext, float>;
 template struct RowwiseSum<platform::CPUDeviceContext, double>;
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 3abbcdb71d03eaf6f8eba3d97150d27ac5a5405e..d5af718723e8d44da0971ea7756b8c36e771cca2 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
+#include <vector>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
 #include "paddle/fluid/platform/float16.h"
@@ -24,339 +26,17 @@ namespace math {
 
 using float16 = paddle::platform::float16;
 
-template <>
-void gemm<platform::CUDADeviceContext, float16>(
-    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const float16* B, const float16 beta,
-    float16* C) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  const half h_alpha = static_cast<const half>(alpha);
-  const half h_beta = static_cast<const half>(beta);
-  const half* h_A = reinterpret_cast<const half*>(A);
-  const half* h_B = reinterpret_cast<const half*>(B);
-  half* h_C = reinterpret_cast<half*>(C);
-
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
-                    "cublas Hgemm requires GPU compute capability >= 53");
-  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
-      h_A, lda, &h_beta, h_C, N));
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, float>(
-    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
-      lda, &beta, C, N));
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, double>(
-    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
-      lda, &beta, C, N));
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, float16>(
-    const platform::CUDADeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const int lda, const float16* B,
-    const int ldb, const float16 beta, float16* C, const int ldc) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  const half h_alpha = static_cast<const half>(alpha);
-  const half h_beta = static_cast<const half>(beta);
-  const half* h_A = reinterpret_cast<const half*>(A);
-  const half* h_B = reinterpret_cast<const half*>(B);
-  half* h_C = reinterpret_cast<half*>(C);
-
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
-                    "cublas Hgemm requires GPU compute capability >= 53");
-  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
-      h_A, lda, &h_beta, h_C, ldc));
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, float>(
-    const platform::CUDADeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K, const float alpha,
-    const float* A, const int lda, const float* B, const int ldb,
-    const float beta, float* C, const int ldc) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
-      lda, &beta, C, ldc));
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, double>(
-    const platform::CUDADeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const int lda, const double* B,
-    const int ldb, const double beta, double* C, const int ldc) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
-      lda, &beta, C, ldc));
-}
-
-template <>
-void matmul<platform::CUDADeviceContext, float16>(
-    const platform::CUDADeviceContext& context,
-    const framework::Tensor& matrix_a, bool trans_a,
-    const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
-    framework::Tensor* matrix_out, float16 beta) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-                 "The input and output of matmul be matrix");
-
-  PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
-                     platform::is_gpu_place(matrix_b.place()) &&
-                     platform::is_gpu_place(matrix_out->place()),
-                 "Matrix must all be in CUDAPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<platform::CUDADeviceContext, float16>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<float16>(),
-      matrix_b.data<float16>(), beta, matrix_out->data<float16>());
-}
-
-template <>
-void matmul<platform::CUDADeviceContext, float>(
-    const platform::CUDADeviceContext& context,
-    const framework::Tensor& matrix_a, bool trans_a,
-    const framework::Tensor& matrix_b, bool trans_b, float alpha,
-    framework::Tensor* matrix_out, float beta) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-                 "The input and output of matmul be matrix");
-
-  PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
-                     platform::is_gpu_place(matrix_b.place()) &&
-                     platform::is_gpu_place(matrix_out->place()),
-                 "Matrix must all be in CUDAPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<platform::CUDADeviceContext, float>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-      matrix_b.data<float>(), beta, matrix_out->data<float>());
-}
-
-template <>
-void matmul<platform::CUDADeviceContext, double>(
-    const platform::CUDADeviceContext& context,
-    const framework::Tensor& matrix_a, bool trans_a,
-    const framework::Tensor& matrix_b, bool trans_b, double alpha,
-    framework::Tensor* matrix_out, double beta) {
-  auto dim_a = matrix_a.dims();
-  auto dim_b = matrix_b.dims();
-  auto dim_out = matrix_out->dims();
-  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
-                 "The input and output of matmul be matrix");
-
-  PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
-                     platform::is_gpu_place(matrix_b.place()) &&
-                     platform::is_gpu_place(matrix_out->place()),
-                 "Matrix must all be in CUDAPlace");
-
-  int M = dim_out[0];
-  int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
-
-  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-  gemm<platform::CUDADeviceContext, double>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-      matrix_b.data<double>(), beta, matrix_out->data<double>());
-}
-
-template <>
-void batched_gemm<platform::CUDADeviceContext, float16>(
-    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const float16* B, const float16 beta,
-    float16* C, const int batchCount, const int strideA, const int strideB) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  const int strideC = M * N;
-
-  const half h_alpha = static_cast<const half>(alpha);
-  const half h_beta = static_cast<const half>(beta);
-  const half* h_A = reinterpret_cast<const half*>(A);
-  const half* h_B = reinterpret_cast<const half*>(B);
-  half* h_C = reinterpret_cast<half*>(C);
-
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
-                    "cublas Hgemm requires GPU compute capability >= 53");
-  PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
-      strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
-}
-
-template <>
-void batched_gemm<platform::CUDADeviceContext, float>(
-    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C, const int batchCount, const int strideA, const int strideB) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  const int strideC = M * N;
-
-  PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
-      strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
-}
-
-template <>
-void batched_gemm<platform::CUDADeviceContext, double>(
-    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C, const int batchCount, const int strideA, const int strideB) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  const int strideC = M * N;
-
-  PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
-      strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
-}
-
-template <>
-void gemv<platform::CUDADeviceContext, float>(
-    const platform::CUDADeviceContext& context, const bool trans_a, const int M,
-    const int N, const float alpha, const float* A, const float* B,
-    const float beta, float* C) {
-  cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-  PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(),
-                                                cuTransA, N, M, &alpha, A, N, B,
-                                                1, &beta, C, 1));
-}
-
-template <>
-void gemv<platform::CUDADeviceContext, double>(
-    const platform::CUDADeviceContext& context, const bool trans_a, const int M,
-    const int N, const double alpha, const double* A, const double* B,
-    const double beta, double* C) {
-  cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
-  PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(),
-                                                cuTransA, N, M, &alpha, A, N, B,
-                                                1, &beta, C, 1));
-}
-
-template <>
-void axpy<platform::CUDADeviceContext, float>(
-    const platform::CUDADeviceContext& context, const int n, const float alpha,
-    const float* x, float* y) {
-  PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n,
-                                                &alpha, x, 1, y, 1));
-}
-
-template <>
-void axpy<platform::CUDADeviceContext, double>(
-    const platform::CUDADeviceContext& context, const int n, const double alpha,
-    const double* x, double* y) {
-  PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n,
-                                                &alpha, x, 1, y, 1));
-}
-
+template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
 template struct SetConstant<platform::CUDADeviceContext, float>;
 template struct SetConstant<platform::CUDADeviceContext, double>;
 template struct SetConstant<platform::CUDADeviceContext, int>;
 template struct SetConstant<platform::CUDADeviceContext, int64_t>;
 template struct SetConstant<platform::CUDADeviceContext, bool>;
 
-#define DEFINE_GPU_TRANS(RANK)                                         \
-  template struct Transpose<platform::CUDADeviceContext, float, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, double, RANK>;
+#define DEFINE_GPU_TRANS(RANK)                                          \
+  template struct Transpose<platform::CUDADeviceContext, float, RANK>;  \
+  template struct Transpose<platform::CUDADeviceContext, double, RANK>; \
+  template struct Transpose<platform::CUDADeviceContext, float16, RANK>;
 
 DEFINE_GPU_TRANS(1);
 DEFINE_GPU_TRANS(2);
@@ -422,6 +102,8 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> {
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
 template struct RowwiseAdd<platform::CUDADeviceContext, double>;
 template struct ColwiseSum<platform::CUDADeviceContext, float>;
+template struct ColwiseSum<platform::CUDADeviceContext, int>;
+template struct ColwiseSum<platform::CUDADeviceContext, int64_t>;
 // template struct ColwiseSum<platform::CUDADeviceContext, double>;
 // The ColwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
 // and only failed for this case. So reimplemented it.
@@ -436,10 +118,9 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
   one.mutable_data<double>({in_dims[0]}, context.GetPlace());
   SetConstant<platform::CUDADeviceContext, double> set;
   set(context, &one, static_cast<double>(1.0));
-  gemv<platform::CUDADeviceContext, double>(
-      context, true, static_cast<int>(in_dims[0]), static_cast<int>(in_dims[1]),
-      1.0, input.data<double>(), one.data<double>(), 0.0,
-      vector->data<double>());
+  GetBlas<platform::CUDADeviceContext, double>(context).GEMV(
+      true, static_cast<int>(in_dims[0]), static_cast<int>(in_dims[1]), 1.0,
+      input.data<double>(), one.data<double>(), 0.0, vector->data<double>());
 }
 
 template struct RowwiseSum<platform::CUDADeviceContext, float>;
@@ -458,10 +139,9 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
   one.mutable_data<double>({size}, context.GetPlace());
   SetConstant<platform::CUDADeviceContext, double> set;
   set(context, &one, static_cast<double>(1.0));
-  gemv<platform::CUDADeviceContext, double>(
-      context, true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]),
-      1.0, one.data<double>(), input.data<double>(), 0.0,
-      vector->data<double>());
+  GetBlas<platform::CUDADeviceContext, double>(context).GEMV(
+      true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]), 1.0,
+      one.data<double>(), input.data<double>(), 0.0, vector->data<double>());
 }
 
 template struct RowwiseMean<platform::CUDADeviceContext, float>;
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
index 47e2386d0578265330088eeac6c57fe2518f951a..56a039d3cec7375517573c9429801945bf99741e 100644
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -14,26 +14,19 @@ limitations under the License. */
 
 #pragma once
 #ifdef PADDLE_WITH_MKLML
-#include <mkl_cblas.h>
-#include <mkl_lapacke.h>
-#include <mkl_vml_functions.h>
-#endif
-
-#ifdef PADDLE_USE_ATLAS
-extern "C" {
-#include <cblas.h>
-#include <clapack.h>
-}
+#include "paddle/fluid/platform/dynload/mklml.h"
 #endif
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {
-#include <cblas.h>
+#include <cblas.h>  // NOLINT
 int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda,
                    int* ipiv);
 int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda,
@@ -46,8 +39,10 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
 #endif
 
 #include <cmath>
+#include <vector>
 
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -56,47 +51,6 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
 namespace paddle {
 namespace operators {
 namespace math {
-
-// Support continuous memory now
-// If transA = N, and transB = N
-// Then matrixA: M * K, matrixB: K * N, matrixC : M * N
-// For more detailed info, please refer to
-// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
-template <typename DeviceContext, typename T>
-void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA,
-          const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-          const T alpha, const T* A, const T* B, const T beta, T* C);
-
-// gemm wrapper with stride args for matrix uncontinuous in memory
-template <typename DeviceContext, typename T>
-void gemm(const DeviceContext& context, const bool transA, const bool transB,
-          const int M, const int N, const int K, const T alpha, const T* A,
-          const int lda, const T* B, const int ldb, const T beta, T* C,
-          const int ldc);
-
-// matrix multiply with continuous memory
-template <typename DeviceContext, typename T>
-void matmul(const DeviceContext& context, const framework::Tensor& matrix_a,
-            bool trans_a, const framework::Tensor& matrix_b, bool trans_b,
-            T alpha, framework::Tensor* matrix_out, T beta);
-
-// Batched gemm
-template <typename DeviceContext, typename T>
-void batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE transB, const int M, const int N,
-                  const int K, const T alpha, const T* A, const T* B,
-                  const T beta, T* C, const int batchCount, const int strideA,
-                  const int strideB);
-
-template <typename DeviceContext, typename T>
-void gemv(const DeviceContext& context, const bool trans_a, const int M,
-          const int N, const T alpha, const T* A, const T* B, const T beta,
-          T* C);
-
-template <typename DeviceContext, typename T>
-void axpy(const DeviceContext& context, const int n, const T alpha, const T* x,
-          T* y);
-
 template <typename DeviceContext, typename T, int Rank>
 struct Transpose {
   void operator()(const DeviceContext& context, const framework::Tensor& in,
diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h
index f9d4e4532428e3387ae39621d155434f05eb89d3..b9bd49d77d935e985705f78402ffe1ea90f24cb3 100644
--- a/paddle/fluid/operators/math/math_function_impl.h
+++ b/paddle/fluid/operators/math/math_function_impl.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
index 25a9d0111eee45b28adff012b705cbfa2407d2b6..b545671b43d3a453ab03e4774427179617f62db0 100644
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -13,6 +13,14 @@
 // limitations under the License.
 #include "paddle/fluid/operators/math/math_function.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+template <typename T>
+inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
+GetBlas(const paddle::platform::CPUDeviceContext& context) {
+  return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
+                                          T>(context);
+}
 
 TEST(math_function, gemm_notrans_cblas) {
   paddle::framework::Tensor input1;
@@ -34,9 +42,8 @@ TEST(math_function, gemm_notrans_cblas) {
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
   paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::gemm<paddle::platform::CPUDeviceContext, float>(
-      context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1,
-      input3_ptr + 1, 4);
+  GetBlas<float>(context).GEMM(false, false, m, n, k, 1, input1_ptr, 3,
+                               input2_ptr + 1, 4, 1, input3_ptr + 1, 4);
 
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
@@ -68,9 +75,10 @@ TEST(math_function, gemm_trans_clbas) {
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
   paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::gemm<paddle::platform::CPUDeviceContext, float>(
-      context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1,
-      input3_ptr + 1, 4);
+  GetBlas<float>(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3,
+                               input2_ptr + 3, 3, 1, input3_ptr + 1, 4);
+  delete cpu_place;
+  cpu_place = NULL;
 
   EXPECT_EQ(input3_ptr[0], 0);
   EXPECT_EQ(input3_ptr[1], 24);
@@ -124,9 +132,8 @@ void GemvTest(int m, int n, bool trans) {
   }
 
   paddle::platform::CPUDeviceContext context(*cpu_place);
-  paddle::operators::math::gemv<paddle::platform::CPUDeviceContext, T>(
-      context, trans, static_cast<int>(m), static_cast<int>(n), 1., data_a,
-      data_b, 0., data_c);
+  GetBlas<T>(context).GEMV(trans, static_cast<int>(m), static_cast<int>(n), 1.,
+                           data_a, data_b, 0., data_c);
 
   if (!trans) {
     for (int i = 0; i < m; ++i) {
diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu
index 8982d9d066165a9da0461288685baa0c60e5f114..bcbb4a8274f149240b9f0990f38d9f38bdd0e5b1 100644
--- a/paddle/fluid/operators/math/math_function_test.cu
+++ b/paddle/fluid/operators/math/math_function_test.cu
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "gtest/gtest.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/device_context.h"
 
 void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
                     const std::vector<float>& data) {
@@ -22,33 +24,36 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
   }
 }
 
-TEST(math_function, notrans_mul_trans_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
+template <typename T>
+inline paddle::operators::math::BlasT<paddle::platform::CUDADeviceContext, T>
+GetBlas(const paddle::platform::CUDADeviceContext& context) {
+  return paddle::operators::math::GetBlas<paddle::platform::CUDADeviceContext,
+                                          T>(context);
+}
 
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
+TEST(math_function, notrans_mul_trans_fp32) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu);
-  TensorCopy(input1, gpu_place, context, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float>({2, 2}, gpu_place);
+  GetBlas<float>(context).MatMul(input1_gpu, false, input2_gpu, true, 1,
+                                 &out_gpu, 0);
 
-  paddle::operators::math::matmul<CUDADeviceContext, float>(
-      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
-
-  TensorCopy(out_gpu, cpu_place, context, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -59,39 +64,37 @@ TEST(math_function, notrans_mul_trans_fp32) {
 }
 
 TEST(math_function, notrans_mul_trans_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
     return;
   }
 
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu);
-  TensorCopy(input1, gpu_place, context, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
-  out_gpu.mutable_data<float16>({2, 2}, gpu_place);
+  out_gpu.mutable_data<paddle::platform::float16>({2, 2}, gpu_place);
 
-  paddle::operators::math::matmul<CUDADeviceContext, float16>(
-      context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
-      float16(0));
+  GetBlas<paddle::platform::float16>(context).MatMul(
+      input1_gpu, false, input2_gpu, true, paddle::platform::float16(1),
+      &out_gpu, paddle::platform::float16(0));
 
-  TensorCopy(out_gpu, cpu_place, context, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
-  float16* out_ptr = out.data<float16>();
+  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
   context.Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
@@ -100,32 +103,29 @@ TEST(math_function, notrans_mul_trans_fp16) {
 }
 
 TEST(math_function, trans_mul_notrans_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
   float arr[6] = {0, 1, 2, 3, 4, 5};
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu);
-  TensorCopy(input1, gpu_place, context, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
   out_gpu.mutable_data<float>({3, 3}, gpu_place);
 
-  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
-      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
+  GetBlas<float>(context).MatMul(input1_gpu, true, input2_gpu, false, 1,
+                                 &out_gpu, 0);
 
-  TensorCopy(out_gpu, cpu_place, context, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
   float* out_ptr = out.data<float>();
   context.Wait();
@@ -141,39 +141,37 @@ TEST(math_function, trans_mul_notrans_fp32) {
 }
 
 TEST(math_function, trans_mul_notrans_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
 
-  Tensor input1;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor out_gpu;
-  Tensor out;
-
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
     return;
   }
 
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu);
-  TensorCopy(input1, gpu_place, context, &input2_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu);
 
-  out_gpu.mutable_data<float16>({3, 3}, gpu_place);
+  out_gpu.mutable_data<paddle::platform::float16>({3, 3}, gpu_place);
 
-  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float16>(
-      context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
-      float16(0));
+  GetBlas<paddle::platform::float16>(context).MatMul(
+      input1_gpu, true, input2_gpu, false, paddle::platform::float16(1),
+      &out_gpu, paddle::platform::float16(0));
 
-  TensorCopy(out_gpu, cpu_place, context, &out);
+  paddle::framework::TensorCopySync(out_gpu, cpu_place, &out);
 
-  float16* out_ptr = out.data<float16>();
+  paddle::platform::float16* out_ptr = out.data<paddle::platform::float16>();
   context.Wait();
   EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
   EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
@@ -187,19 +185,16 @@ TEST(math_function, trans_mul_notrans_fp16) {
 }
 
 TEST(math_function, gemm_notrans_cublas_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   int m = 2;
   int n = 3;
@@ -214,17 +209,17 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu);
-  TensorCopy(input2, gpu_place, context, &input2_gpu);
-  TensorCopy(input3, gpu_place, context, &input3_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
 
-  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
-      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
+  GetBlas<float>(context).GEMM(false, false, m, n, k, 1, a, 3, b + 1, 4, 1,
+                               c + 1, 4);
 
-  TensorCopy(input3_gpu, cpu_place, context, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -244,19 +239,16 @@ TEST(math_function, gemm_notrans_cublas_fp32) {
 }
 
 TEST(math_function, gemm_notrans_cublas_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
@@ -266,26 +258,30 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
   int m = 2;
   int n = 3;
   int k = 3;
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
+  paddle::platform::float16* input2_ptr =
+      input2.mutable_data<paddle::platform::float16>({3, 4}, cpu_place);
   fill_fp16_data(input2_ptr, input2.numel(),
                  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
-  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
+  paddle::platform::float16* input3_ptr =
+      input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
   fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu);
-  TensorCopy(input2, gpu_place, context, &input2_gpu);
-  TensorCopy(input3, gpu_place, context, &input3_gpu);
-  float16* a = input1_gpu.data<float16>();
-  float16* b = input2_gpu.data<float16>();
-  float16* c = input3_gpu.mutable_data<float16>(gpu_place);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* c =
+      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
 
-  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
-      context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
-      c + 1, 4);
+  GetBlas<paddle::platform::float16>(context).GEMM(
+      false, false, m, n, k, static_cast<paddle::platform::float16>(1), a, 3,
+      b + 1, 4, static_cast<paddle::platform::float16>(1), c + 1, 4);
 
-  TensorCopy(input3_gpu, cpu_place, context, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
   // numpy code:
   // a = np.arange(6).reshape(2, 3)
@@ -305,19 +301,16 @@ TEST(math_function, gemm_notrans_cublas_fp16) {
 }
 
 TEST(math_function, gemm_trans_cublas_fp32) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
 
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
-
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   int m = 2;
   int n = 3;
@@ -332,17 +325,17 @@ TEST(math_function, gemm_trans_cublas_fp32) {
   float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   memcpy(input3_ptr, arr3, 8 * sizeof(float));
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu);
-  TensorCopy(input2, gpu_place, context, &input2_gpu);
-  TensorCopy(input3, gpu_place, context, &input3_gpu);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
   float* a = input1_gpu.data<float>();
   float* b = input2_gpu.data<float>();
   float* c = input3_gpu.mutable_data<float>(gpu_place);
 
-  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
-      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
+  GetBlas<float>(context).GEMM(false, true, m, n, k, 1, a, 3, b + 3, 3, 1,
+                               c + 1, 4);
 
-  TensorCopy(input3_gpu, cpu_place, context, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
   context.Wait();
   EXPECT_EQ(input3_ptr[0], 0);
@@ -356,19 +349,16 @@ TEST(math_function, gemm_trans_cublas_fp32) {
 }
 
 TEST(math_function, gemm_trans_cublas_fp16) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor input1;
-  Tensor input2;
-  Tensor input3;
-  Tensor input1_gpu;
-  Tensor input2_gpu;
-  Tensor input3_gpu;
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   // fp16 GEMM in cublas requires GPU compute capability >= 53
   if (context.GetComputeCapability() < 53) {
@@ -378,26 +368,30 @@ TEST(math_function, gemm_trans_cublas_fp16) {
   int m = 2;
   int n = 3;
   int k = 3;
-  float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
+  paddle::platform::float16* input1_ptr =
+      input1.mutable_data<paddle::platform::float16>({2, 3}, cpu_place);
   fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
-  float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
+  paddle::platform::float16* input2_ptr =
+      input2.mutable_data<paddle::platform::float16>({4, 3}, cpu_place);
   fill_fp16_data(input2_ptr, input2.numel(),
                  {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
-  float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
+  paddle::platform::float16* input3_ptr =
+      input3.mutable_data<paddle::platform::float16>({2, 4}, cpu_place);
   fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
 
-  TensorCopy(input1, gpu_place, context, &input1_gpu);
-  TensorCopy(input2, gpu_place, context, &input2_gpu);
-  TensorCopy(input3, gpu_place, context, &input3_gpu);
-  float16* a = input1_gpu.data<float16>();
-  float16* b = input2_gpu.data<float16>();
-  float16* c = input3_gpu.mutable_data<float16>(gpu_place);
+  paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu);
+  paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu);
+  paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu);
+  paddle::platform::float16* a = input1_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* b = input2_gpu.data<paddle::platform::float16>();
+  paddle::platform::float16* c =
+      input3_gpu.mutable_data<paddle::platform::float16>(gpu_place);
 
-  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
-      context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
-      c + 1, 4);
+  GetBlas<paddle::platform::float16>(context).GEMM(
+      false, true, m, n, k, static_cast<paddle::platform::float16>(1), a, 3,
+      b + 3, 3, static_cast<paddle::platform::float16>(1), c + 1, 4);
 
-  TensorCopy(input3_gpu, cpu_place, context, &input3);
+  paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3);
 
   context.Wait();
   EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
@@ -412,24 +406,21 @@ TEST(math_function, gemm_trans_cublas_fp16) {
 
 template <typename T>
 void GemvTest(int m, int n, bool trans) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-
-  Tensor mat_a;
-  Tensor vec_b;
-  Tensor vec_c;
+  paddle::framework::Tensor mat_a;
+  paddle::framework::Tensor vec_b;
+  paddle::framework::Tensor vec_c;
 
-  CPUPlace cpu_place;
-  CUDAPlace gpu_place(0);
-  CUDADeviceContext context(gpu_place);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
 
   T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
   T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
   T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
 
-  Tensor g_mat_a;
-  Tensor g_vec_b;
-  Tensor g_vec_c;
+  paddle::framework::Tensor g_mat_a;
+  paddle::framework::Tensor g_vec_b;
+  paddle::framework::Tensor g_vec_c;
   T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
   T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
   T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
@@ -441,14 +432,13 @@ void GemvTest(int m, int n, bool trans) {
     data_b[i] = static_cast<T>(i);
   }
 
-  TensorCopy(mat_a, gpu_place, context, &g_mat_a);
-  TensorCopy(vec_b, gpu_place, context, &g_vec_b);
+  paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a);
+  paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b);
 
-  paddle::operators::math::gemv<CUDADeviceContext, T>(
-      context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
-      g_data_b, 0., g_data_c);
+  GetBlas<T>(context).GEMV(trans, static_cast<int>(m), static_cast<int>(n), 1.,
+                           g_data_a, g_data_b, 0., g_data_c);
 
-  TensorCopy(g_vec_c, cpu_place, context, &vec_c);
+  paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c);
 
   if (!trans) {
     for (int i = 0; i < m; ++i) {
diff --git a/paddle/fluid/operators/math/matmul.h b/paddle/fluid/operators/math/matmul.h
deleted file mode 100644
index 6e2d35cd0f3581c742197a66842696a8e3a936b1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/math/matmul.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-// Implements the logic of numpy matmul:
-// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
-//
-// but allowing also for a, b to be transposed
-//
-// Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported
-// yet.
-template <typename DeviceContext, typename T>
-class MatMulFunctor {
- public:
-  void operator()(const DeviceContext& context, const framework::Tensor& a,
-                  bool trans_a, const framework::Tensor& b, bool trans_b,
-                  T alpha, framework::Tensor* out, T beta) {
-    auto dim_a = a.dims();
-    auto dim_b = b.dims();
-
-    PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(),
-                   "Tensors must all be in the same place.");
-    PADDLE_ENFORCE_GE(dim_a.size(), 1,
-                      "Input tensor a must be at least 1-dimensional.");
-    PADDLE_ENFORCE_GE(dim_b.size(), 1,
-                      "Input tensor b must be at least 1-dimensional.");
-
-    std::vector<int64_t> out_dim;
-    int64_t batch_count = 1;
-    if (dim_a.size() > 3) {
-      PADDLE_ENFORCE(dim_b.size() == dim_a.size(),
-                     "The dimensions of X and Y must be the same, and both of "
-                     "them should be %d-dimensional.",
-                     dim_b.size());
-      // The first rank-2 dimensions are accumulated on the batch_count, and the
-      // last two dimensions are used for matrix multiplication.
-      for (int j = 0; j < dim_a.size() - 2; ++j) {
-        PADDLE_ENFORCE_EQ(dim_b[j], dim_a[j],
-                          "The %d-th dimension of X and Y must be the same.",
-                          j);
-        out_dim.push_back(dim_a[j]);
-        batch_count *= dim_a[j];
-      }
-    }
-
-    int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0,
-        strideA = 0, strideB = 0;
-
-    switch (dim_a.size()) {
-      case 1:
-        // similar to np.matmul:
-        // prepend dimension 1 (no transpose) or append dimension 1 (transpose)
-        M = trans_a ? dim_a[0] : 1;
-        kA = trans_a ? 1 : dim_a[0];
-        break;
-      case 2:
-        M = trans_a ? dim_a[1] : dim_a[0];
-        kA = trans_a ? dim_a[0] : dim_a[1];
-        break;
-      case 3:
-        batchCountA = dim_a[0];
-        M = trans_a ? dim_a[2] : dim_a[1];
-        kA = trans_a ? dim_a[1] : dim_a[2];
-        strideA = M * kA;
-        break;
-      default:
-        batchCountA = batch_count;
-        size_t mat_s = dim_a.size() - 2;
-        M = trans_a ? dim_a[mat_s + 1] : dim_a[mat_s];
-        kA = trans_a ? dim_a[mat_s] : dim_a[mat_s + 1];
-        strideA = M * kA;
-    }
-
-    switch (dim_b.size()) {
-      case 1:
-        // similar to np.matmul:
-        // append dimension 1 (no transpose) or prepend dimension 1 (transpose)
-        kB = trans_b ? 1 : dim_b[0];
-        N = trans_b ? dim_b[0] : 1;
-        break;
-      case 2:
-        kB = trans_b ? dim_b[1] : dim_b[0];
-        N = trans_b ? dim_b[0] : dim_b[1];
-        break;
-      case 3:
-        batchCountB = dim_b[0];
-        kB = trans_b ? dim_b[2] : dim_b[1];
-        N = trans_b ? dim_b[1] : dim_b[2];
-        strideB = kB * N;
-        break;
-      default:
-        batchCountB = batch_count;
-        size_t mat_s = dim_b.size() - 2;
-        kB = trans_b ? dim_b[mat_s + 1] : dim_b[mat_s];
-        N = trans_b ? dim_b[mat_s] : dim_b[mat_s + 1];
-        strideB = kB * N;
-    }
-
-    PADDLE_ENFORCE_EQ(
-        kA, kB,
-        "First matrix's width must be equal with second matrix's height.");
-    if (batchCountA && batchCountB) {
-      PADDLE_ENFORCE_EQ(
-          batchCountA, batchCountB,
-          "When input tensors a and b are both batched, they must have the "
-          "same batch dimension.");
-    }
-    int batchCount = std::max(batchCountA, batchCountB);
-
-    CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
-    CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
-
-    if (!batchCount) {
-      // regular matrix multiplication
-      gemm<DeviceContext, T>(context, transA, transB, M, N, kA, alpha,
-                             a.data<T>(), b.data<T>(), beta, out->data<T>());
-    } else {
-      // batched matrix multiplication
-      batched_gemm<DeviceContext, T>(
-          context, transA, transB, M, N, kA, alpha, a.data<T>(), b.data<T>(),
-          beta, out->data<T>(), batchCount, strideA, strideB);
-    }
-  }
-};
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu
index 1e1a6a221c71c9d9cb9fda468360cb502c5ea52f..d9a23299a4d5750fc8c7fe3e5d1f8cd94bcb9cae 100644
--- a/paddle/fluid/operators/math/maxouting.cu
+++ b/paddle/fluid/operators/math/maxouting.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/maxouting.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc
index 97a2e81c84c060a8be57db6274839ee39edf466c..b871851798e48e6b598cb4ab8e2e42db478a3820 100644
--- a/paddle/fluid/operators/math/pooling.cc
+++ b/paddle/fluid/operators/math/pooling.cc
@@ -11,8 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/operators/math/pooling.h"
+#include <algorithm>
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -27,9 +28,10 @@ template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* output) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -63,11 +65,11 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
             T ele = pool_process.initial();
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
-                pool_process.compute(ele, input_data[h * input_width + w]);
+                pool_process.compute(input_data[h * input_width + w], &ele);
               }
             }
             int pool_size = (hend - hstart) * (wend - wstart);
-            pool_process.finalize(ele, (static_cast<T>(pool_size)));
+            pool_process.finalize(static_cast<T>(pool_size), &ele);
             output_data[ph * output_width + pw] = ele;
           }
         }
@@ -86,13 +88,12 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 template <typename PoolProcess, class T>
 class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process,
-                  framework::Tensor* input_grad) {
+  void operator()(
+      const platform::CPUDeviceContext& context, const framework::Tensor& input,
+      const framework::Tensor& output, const framework::Tensor& output_grad,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, PoolProcess pool_grad_process,
+      framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -131,8 +132,8 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                     input_data[h * input_width + w],
                     output_data[ph * output_width + pw],
                     output_grad_data[ph * output_width + pw],
-                    input_grad_data[h * input_width + w],
-                    static_cast<T>(scale));
+                    static_cast<T>(scale),
+                    input_grad_data + h * input_width + w);
               }
             }
           }
@@ -154,12 +155,11 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 template <class T>
 class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* input_grad) {
+  void operator()(
+      const platform::CPUDeviceContext& context, const framework::Tensor& input,
+      const framework::Tensor& output, const framework::Tensor& output_grad,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -246,9 +246,10 @@ template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* output) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -293,14 +294,14 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                 for (int h = hstart; h < hend; ++h) {
                   for (int w = wstart; w < wend; ++w) {
                     pool_process.compute(
-                        ele,
-                        input_data[(d * input_height + h) * input_width + w]);
+                        input_data[(d * input_height + h) * input_width + w],
+                        &ele);
                   }
                 }
               }
               int pool_size =
                   (dend - dstart) * (hend - hstart) * (wend - wstart);
-              pool_process.finalize(ele, static_cast<T>(pool_size));
+              pool_process.finalize(static_cast<T>(pool_size), &ele);
               output_data[output_idx] = ele;
             }
           }
@@ -320,13 +321,12 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process,
-                  framework::Tensor* input_grad) {
+  void operator()(
+      const platform::CPUDeviceContext& context, const framework::Tensor& input,
+      const framework::Tensor& output, const framework::Tensor& output_grad,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, PoolProcess pool_grad_process,
+      framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -379,8 +379,8 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                         (pd * output_height + ph) * output_width + pw;
                     pool_grad_process.compute(
                         input_data[input_idx], output_data[output_idx],
-                        output_grad_data[output_idx],
-                        input_grad_data[input_idx], static_cast<T>(scale));
+                        output_grad_data[output_idx], static_cast<T>(scale),
+                        input_grad_data + input_idx);
                   }
                 }
               }
@@ -404,12 +404,11 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
 template <class T>
 class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* input_grad) {
+  void operator()(
+      const platform::CPUDeviceContext& context, const framework::Tensor& input,
+      const framework::Tensor& output, const framework::Tensor& output_grad,
+      const std::vector<int>& ksize, const std::vector<int>& strides,
+      const std::vector<int>& paddings, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -510,9 +509,10 @@ template <typename T1, typename T2>
 class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -576,8 +576,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_height = input_grad->dims()[2];
@@ -628,9 +629,10 @@ template <typename T1, typename T2>
 class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -708,8 +710,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_depth = input_grad->dims()[2];
diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu
index 274263c69c535249fceee11075c5948b1fc34358..b1c76350d1724629bae175abf47e6671a1532242 100644
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/operators/math/pooling.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
@@ -47,11 +49,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
     T ele = pool_process.initial();
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        pool_process.compute(ele, input_data[h * input_width + w]);
+        pool_process.compute(input_data[h * input_width + w], &ele);
       }
     }
     int pool_size = (hend - hstart) * (wend - wstart);
-    pool_process.finalize(ele, (static_cast<T>(pool_size)));
+    pool_process.finalize(static_cast<T>(pool_size), &ele);
     output_data[index] = ele;
   }
 }
@@ -96,8 +98,8 @@ __global__ void KernelPool2DGrad(
         int pool_size = (hend - hstart) * (wend - wstart);
         int output_sub_idx = ph * output_width + pw;
         pool_process.compute(input, output_data[output_sub_idx],
-                             output_grad[output_sub_idx], gradient,
-                             static_cast<T>(1.0 / pool_size));
+                             output_grad[output_sub_idx],
+                             static_cast<T>(1.0 / pool_size), &gradient);
       }
     }
     input_grad[index] = gradient;
@@ -158,9 +160,10 @@ template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* output) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -201,9 +204,11 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* input_grad) {
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -246,8 +251,10 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
@@ -340,12 +347,12 @@ __global__ void KernelPool3D(const int nthreads, const T* input_data,
       for (int h = hstart; h < hend; ++h) {
         for (int w = wstart; w < wend; ++w) {
           pool_process.compute(
-              ele, input_data[(d * input_height + h) * input_width + w]);
+              input_data[(d * input_height + h) * input_width + w], &ele);
         }
       }
     }
     int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
-    pool_process.finalize(ele, static_cast<T>(pool_size));
+    pool_process.finalize(static_cast<T>(pool_size), &ele);
     output_data[index] = ele;
   }
 }
@@ -405,8 +412,8 @@ __global__ void KernelPool3DGrad(
           int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
           int output_sub_idx = (pd * output_height + ph) * output_width + pw;
           pool_process.compute(input, output_data[output_sub_idx],
-                               output_grad[output_sub_idx], gradient,
-                               static_cast<T>(1.0 / pool_size));
+                               output_grad[output_sub_idx],
+                               static_cast<T>(1.0 / pool_size), &gradient);
         }
       }
     }
@@ -474,9 +481,10 @@ template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* output) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -525,9 +533,11 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process, framework::Tensor* input_grad) {
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_process,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -578,8 +588,10 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
@@ -736,9 +748,10 @@ template <typename T1, typename T2>
 class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -779,8 +792,9 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_channels = input_grad->dims()[1];
@@ -937,9 +951,10 @@ template <typename T1, typename T2>
 class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  framework::Tensor* output, framework::Tensor* mask) {
+                  const framework::Tensor& input, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -987,8 +1002,9 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad) {
     const int batch_size = input_grad->dims()[0];
     const int input_channels = input_grad->dims()[1];
diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
index 74cb42f0d02086a6776b22d57832757ae3ffc470..2538d739cce95d1b2fc5b3f905af5e6d94cf7af5 100644
--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -23,8 +24,8 @@ namespace operators {
 namespace math {
 
 #define FLT_MAX \
-  __FLT_MAX__  // It might need to be placed in another file, but I'm still
-               // wondering where to put it.
+  __FLT_MAX__  // TODO(zcd) :It might need to be placed in another file, but I'm
+               // still wondering where to put it.
 
 /*
  * \brief Extracting simple operations from pooling.
@@ -40,33 +41,33 @@ template <class T>
 class MaxPool {
  public:
   DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
-  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
-  DEVICE inline void finalize(T& y, const T& pool_field) {}
+  DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
+  DEVICE inline void finalize(const T& pool_field, T* y) {}
 };
 
 template <class T>
 class AvgPool {
  public:
   DEVICE inline T initial() { return static_cast<T>(0); }
-  DEVICE inline void compute(T& y, const T& x) { y += x; }
-  DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; }
+  DEVICE inline void compute(const T& x, T* y) { *y += x; }
+  DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; }
 };
 
 template <class T>
 class MaxPoolGrad {
  public:
-  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
-                             T scale) {
-    dx += dy * (x == y);
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
+                             T* dx) {
+    *dx += dy * (x == y);
   }
 };
 
 template <class T>
 class AvgPoolGrad {
  public:
-  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
-                             T scale) {
-    dx += (scale * dy);
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
+                             T* dx) {
+    *dx += (scale * dy);
   }
 };
 
@@ -88,8 +89,9 @@ template <typename DeviceContext, typename PoolProcess, typename T>
 class Pool2dFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_compute,
                   framework::Tensor* output);
 };
 
@@ -98,9 +100,11 @@ class Pool2dGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute, framework::Tensor* input_grad);
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* input_grad);
 };
 
 template <typename DeviceContext, class T>
@@ -108,8 +112,10 @@ class MaxPool2dGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad);
 };
 
@@ -117,8 +123,9 @@ template <typename DeviceContext, typename PoolProcess, typename T>
 class Pool3dFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_compute,
                   framework::Tensor* output);
 };
 
@@ -127,9 +134,11 @@ class Pool3dGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute, framework::Tensor* input_grad);
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* input_grad);
 };
 
 template <typename DeviceContext, class T>
@@ -137,8 +146,10 @@ class MaxPool3dGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& output,
-                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& output_grad,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad);
 };
 
@@ -153,8 +164,9 @@ template <typename DeviceContext, typename T1, typename T2>
 class MaxPool2dWithIndexFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, framework::Tensor* output,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
                   framework::Tensor* mask);
 };
 
@@ -163,8 +175,9 @@ class MaxPool2dWithIndexGradFunctor {
  public:
   void operator()(const DeviceContext& context,
                   const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad);
 };
 
@@ -172,8 +185,9 @@ template <typename DeviceContext, typename T1, typename T2>
 class MaxPool3dWithIndexFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, framework::Tensor* output,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output,
                   framework::Tensor* mask);
 };
 
@@ -182,8 +196,9 @@ class MaxPool3dWithIndexGradFunctor {
  public:
   void operator()(const DeviceContext& context,
                   const framework::Tensor& output_grad,
-                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const framework::Tensor& mask, const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad);
 };
 
diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc
index 3ec6538d7fb0a902f468aa848c3064bd0260fabb..3066dc0ba284611af89c4927f45089a570ab88bc 100644
--- a/paddle/fluid/operators/math/sampler.cc
+++ b/paddle/fluid/operators/math/sampler.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "sampler.h"
+#include "paddle/fluid/operators/math/sampler.h"
 
 namespace paddle {
 namespace random {
diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h
index 9d6a6c28c4304019d0347a30be605e1374c169ee..b82691f269c5d0f267ca98c78646efe9b26f0b34 100644
--- a/paddle/fluid/operators/math/sampler.h
+++ b/paddle/fluid/operators/math/sampler.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <cstdint>
 #include <memory>
 #include <random>
-typedef long int64;
 namespace paddle {
 namespace operators {
 namespace math {
@@ -27,25 +27,25 @@ namespace math {
 */
 class Sampler {
  public:
-  explicit Sampler(int64 range) : range_(range) {
+  explicit Sampler(int64_t range) : range_(range) {
     PADDLE_ENFORCE_GT(range, 0);
     std::random_device r;
     seed_ = r();
   }
-  explicit Sampler(int64 range, unsigned int seed)
+  explicit Sampler(int64_t range, unsigned int seed)
       : range_(range), seed_(seed) {
     PADDLE_ENFORCE_GT(range, 0);
   }
   virtual ~Sampler();
   // Sample a single value
-  virtual int64 Sample() const = 0;
+  virtual int64_t Sample() const = 0;
   // The probability that a single call to Sample() returns the given value.
-  virtual float Probability(int64 value) const = 0;
+  virtual float Probability(int64_t value) const = 0;
 
-  int64 range() { return range_; };
+  int64 range() { return range_; }
 
  protected:
-  const int64 range_;
+  const int64_t range_;
   unsigned int seed_;
 };
 
@@ -56,15 +56,15 @@ class Sampler {
  */
 class UniformSampler : public Sampler {
  public:
-  explicit UniformSampler(int64 range);
+  explicit UniformSampler(int64_t range);
 
-  explicit UniformSampler(int64 range, unsigned int seed);
+  explicit UniformSampler(int64_t range, unsigned int seed);
 
   ~UniformSampler() override {}
 
   int64 Sample() const override;
 
-  float Probability(int64 value) const override;
+  float Probability(int64_t value) const override;
 
  private:
   const float inv_range_;
@@ -79,15 +79,15 @@ class UniformSampler : public Sampler {
  */
 class LogUniformSampler : public Sampler {
  public:
-  explicit LogUniformSampler(int64 range);
+  explicit LogUniformSampler(int64_t range);
 
-  explicit LogUniformSampler(int64 range, unsigned int seed);
+  explicit LogUniformSampler(int64_t range, unsigned int seed);
 
   ~LogUniformSampler() override {}
 
   int64 Sample() const override;
 
-  float Probability(int64 value) const override;
+  float Probability(int64_t value) const override;
 
  private:
   const float log_range_;
@@ -95,6 +95,6 @@ class LogUniformSampler : public Sampler {
   std::shared_ptr<std::uniform_real_distribution<>> dist_;
 };
 
-}  // math
+}  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 5da3d15277cff8b413a116819b17f50061632b5d..a830dc5250a6aea7e622da4046b512d0c7c5d6f9 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <set>
+#include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index 5d78fd9d213556204d56087128dc84fe6a91e97d..a92762c7fea865fad2c7784736cce93a8af21892 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <set>
+#include <vector>
 
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc
index 679b6568ad09ce3a42a7ad4222310711e707d9a8..70bed820ee58885861fa8c5535c931f258625572 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
@@ -13,41 +13,50 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 TEST(selected_rows_functor, cpu_add) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-
-  CPUPlace cpu_place;
-  CPUDeviceContext ctx(cpu_place);
-  SetConstant<CPUDeviceContext, float> functor;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CPUDeviceContext ctx(cpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
+                                       float>
+      functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
   std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
   auto* in1_value = selected_rows1->mutable_value();
   in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), cpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      cpu_place);
   functor(ctx, in1_value, 1.0);
 
   std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
   auto* in2_value = selected_rows2->mutable_value();
   in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), cpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      cpu_place);
   functor(ctx, in2_value, 2.0);
 
-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
   auto* out_value = output->mutable_value();
 
   // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), cpu_place);
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 cpu_place);
 
-  SelectedRowsAdd<CPUDeviceContext, float> add_functor;
+  paddle::operators::math::SelectedRowsAdd<paddle::platform::CPUDeviceContext,
+                                           float>
+      add_functor;
   add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
 
   auto out_height = output->height();
@@ -78,14 +87,20 @@ TEST(selected_rows_functor, cpu_add) {
   EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
   EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);
 
-  std::unique_ptr<Tensor> tensor1{new Tensor()};
-  tensor1->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor1{
+      new paddle::framework::Tensor()};
+  tensor1->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), cpu_place);
   functor(ctx, tensor1.get(), 3.0);
 
-  std::unique_ptr<Tensor> tensor2{new Tensor()};
-  tensor2->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor2{
+      new paddle::framework::Tensor()};
+  tensor2->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), cpu_place);
 
-  SelectedRowsAddTensor<CPUDeviceContext, float> add_tensor_functor;
+  paddle::operators::math::SelectedRowsAddTensor<
+      paddle::platform::CPUDeviceContext, float>
+      add_tensor_functor;
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
 
   auto* tensor2_data = tensor2->data<float>();
@@ -106,38 +121,46 @@ TEST(selected_rows_functor, cpu_add) {
 }
 
 TEST(selected_rows_functor, cpu_add_to) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-
-  CPUPlace cpu_place;
-  CPUDeviceContext ctx(cpu_place);
-  SetConstant<CPUDeviceContext, float> functor;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CPUDeviceContext ctx(cpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
+                                       float>
+      functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
   std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
   auto* in1_value = selected_rows1->mutable_value();
   in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), cpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      cpu_place);
   functor(ctx, in1_value, 1.0);
 
   std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
   auto* in2_value = selected_rows2->mutable_value();
   in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), cpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      cpu_place);
   functor(ctx, in2_value, 2.0);
 
-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
   output->set_height(height);
   auto* out_value = output->mutable_value();
 
   // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), cpu_place);
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 cpu_place);
 
-  SelectedRowsAddTo<CPUDeviceContext, float> add_to_functor;
+  paddle::operators::math::SelectedRowsAddTo<paddle::platform::CPUDeviceContext,
+                                             float>
+      add_to_functor;
   add_to_functor(ctx, *selected_rows1, 0, output.get());
   add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
 
@@ -169,11 +192,15 @@ TEST(selected_rows_functor, cpu_add_to) {
   EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
   EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);
 
-  std::unique_ptr<Tensor> tensor1{new Tensor()};
-  tensor1->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor1{
+      new paddle::framework::Tensor()};
+  tensor1->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), cpu_place);
   functor(ctx, tensor1.get(), 3.0);
 
-  SelectedRowsAddToTensor<CPUDeviceContext, float> add_to_tensor_functor;
+  paddle::operators::math::SelectedRowsAddToTensor<
+      paddle::platform::CPUDeviceContext, float>
+      add_to_tensor_functor;
   add_to_tensor_functor(ctx, *output, tensor1.get());
 
   auto* tensor1_data = tensor1->data<float>();
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu
index 942d9b13fc1a8f578da779351be9ba9de7fcce33..e89b27855bdeba3a5189feff94eb063ddfb9b9b8 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
@@ -12,43 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 
 TEST(selected_rows_functor, gpu_add) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-
-  CUDAPlace gpu_place(0);
-  CPUPlace cpu_place;
-  CUDADeviceContext ctx(gpu_place);
-  SetConstant<CUDADeviceContext, float> functor;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
   std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
   auto* in1_value = selected_rows1->mutable_value();
   in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
   functor(ctx, in1_value, 1.0);
 
   std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
   auto* in2_value = selected_rows2->mutable_value();
   in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
   functor(ctx, in2_value, 2.0);
 
-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
   auto* out_value = output->mutable_value();
 
-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+  // simply concat two SelectedRows
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 gpu_place);
 
-  SelectedRowsAdd<CUDADeviceContext, float> add_functor;
+  paddle::operators::math::SelectedRowsAdd<paddle::platform::CUDADeviceContext,
+                                           float>
+      add_functor;
   add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
 
   auto out_height = output->height();
@@ -66,8 +75,8 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_rows[5], 7);
   EXPECT_EQ(out_rows[6], 9);
 
-  Tensor out_cpu;
-  TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
 
   auto* out_cpu_data = out_cpu.data<float>();
@@ -83,18 +92,24 @@ TEST(selected_rows_functor, gpu_add) {
   EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
   EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
 
-  std::unique_ptr<Tensor> tensor1{new Tensor()};
-  tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor1{
+      new paddle::framework::Tensor()};
+  tensor1->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
   functor(ctx, tensor1.get(), 3.0);
 
-  std::unique_ptr<Tensor> tensor2{new Tensor()};
-  tensor2->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor2{
+      new paddle::framework::Tensor()};
+  tensor2->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
 
-  SelectedRowsAddTensor<CUDADeviceContext, float> add_tensor_functor;
+  paddle::operators::math::SelectedRowsAddTensor<
+      paddle::platform::CUDADeviceContext, float>
+      add_tensor_functor;
   add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
 
-  Tensor tensor2_cpu;
-  TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
+  paddle::framework::Tensor tensor2_cpu;
+  paddle::framework::TensorCopy(*tensor2, cpu_place, ctx, &tensor2_cpu);
   ctx.Wait();
 
   auto* tensor2_cpu_data = tensor2_cpu.data<float>();
@@ -115,39 +130,47 @@ TEST(selected_rows_functor, gpu_add) {
 }
 
 TEST(selected_rows_functor, gpu_add_to) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators::math;
-
-  CUDAPlace gpu_place(0);
-  CPUPlace cpu_place;
-  CUDADeviceContext ctx(gpu_place);
-  SetConstant<CUDADeviceContext, float> functor;
+  paddle::platform::CUDAPlace gpu_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CUDADeviceContext ctx(gpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
+                                       float>
+      functor;
   int64_t height = 10;
   int64_t row_numel = 10;
 
   std::vector<int64_t> rows1{0, 4, 7};
-  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
   auto* in1_value = selected_rows1->mutable_value();
   in1_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      gpu_place);
   functor(ctx, in1_value, 1.0);
 
   std::vector<int64_t> rows2{0, 5, 7, 9};
-  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
   auto* in2_value = selected_rows2->mutable_value();
   in2_value->mutable_data<float>(
-      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      gpu_place);
   functor(ctx, in2_value, 2.0);
 
-  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
   output->set_height(height);
   auto* out_value = output->mutable_value();
 
-  // simplely concat two SelectedRows
-  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+  // simply concat two SelectedRows
+  out_value->mutable_data<float>(paddle::framework::make_ddim({7, 10}),
+                                 gpu_place);
 
-  SelectedRowsAddTo<CUDADeviceContext, float> add_to_functor;
+  paddle::operators::math::SelectedRowsAddTo<
+      paddle::platform::CUDADeviceContext, float>
+      add_to_functor;
   add_to_functor(ctx, *selected_rows1, 0, output.get());
   add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
 
@@ -166,8 +189,8 @@ TEST(selected_rows_functor, gpu_add_to) {
   EXPECT_EQ(out_rows[5], 7);
   EXPECT_EQ(out_rows[6], 9);
 
-  Tensor out_cpu;
-  TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
+  paddle::framework::Tensor out_cpu;
+  paddle::framework::TensorCopy(*out_value, cpu_place, ctx, &out_cpu);
   ctx.Wait();
 
   auto* out_cpu_data = out_cpu.data<float>();
@@ -183,15 +206,19 @@ TEST(selected_rows_functor, gpu_add_to) {
   EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
   EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
 
-  std::unique_ptr<Tensor> tensor1{new Tensor()};
-  tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  std::unique_ptr<paddle::framework::Tensor> tensor1{
+      new paddle::framework::Tensor()};
+  tensor1->mutable_data<float>(
+      paddle::framework::make_ddim({height, row_numel}), gpu_place);
   functor(ctx, tensor1.get(), 3.0);
 
-  SelectedRowsAddToTensor<CUDADeviceContext, float> add_to_tensor_functor;
+  paddle::operators::math::SelectedRowsAddToTensor<
+      paddle::platform::CUDADeviceContext, float>
+      add_to_tensor_functor;
   add_to_tensor_functor(ctx, *output, tensor1.get());
 
-  Tensor tensor1_cpu;
-  TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
+  paddle::framework::Tensor tensor1_cpu;
+  paddle::framework::TensorCopy(*tensor1, cpu_place, ctx, &tensor1_cpu);
   ctx.Wait();
 
   auto* tensor1_cpu_data = tensor1_cpu.data<float>();
diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc
index 8899abff360ea867872d3433722cdb37ef358500..b546b8728217ed6013247555dcd5d7180ddeae74 100644
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
@@ -23,11 +23,11 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
                   bool is_src_index) {
     size_t* index = index_lod.data();
     auto src_dims = src.dims();
-    auto dst_dims = dst.dims();
+    auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
                       "The src must be matrix with rank 2.");
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
@@ -37,7 +37,7 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
-    auto* dst_data = dst.data<T>();
+    auto* dst_data = dst->data<T>();
     for (int i = 0; i < height; ++i) {
       if (is_src_index) {
         memcpy(dst_data + i * width, src_data + index[i] * width,
diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu
index 3185f10d4180437ab5a3f78df8583613edd9ed43..be73adfc0cbe37ed8831b5ad34e66bc95e342e9d 100644
--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
@@ -43,10 +43,10 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
                   bool is_src_index) {
     auto src_dims = src.dims();
-    auto dst_dims = dst.dims();
+    auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
                       "The src must be matrix with rank 2.");
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
@@ -56,7 +56,7 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
-    auto* dst_data = dst.data<T>();
+    auto* dst_data = dst->data<T>();
 
     dim3 threads(128, 8);
     dim3 grid(8, 1);
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
index e78aafd37d1dda91a035f3ed850537e80f188cb2..62e6307ae9f4236a38c49daaf09fc05c54268159 100644
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -35,7 +37,7 @@ class CopyMatrixRowsFunctor {
   // copy the input src to the indexed rows of output dst.
   // The indexed rows are based on the input index.
   void operator()(const DeviceContext& context, const framework::Tensor& src,
-                  framework::Vector<size_t> index_lod, framework::Tensor& dst,
+                  framework::Vector<size_t> index_lod, framework::Tensor* dst,
                   bool is_src_index);
 };
 
@@ -58,22 +60,26 @@ class LoDTensor2BatchFunctor {
  public:
   void operator()(const DeviceContext& context,
                   const framework::LoDTensor& lod_tensor,
-                  framework::LoDTensor& batch, bool is_cal_batch_lod,
+                  framework::LoDTensor* batch, bool is_cal_batch_lod,
                   bool is_reverse = false) const {
     if (!is_cal_batch_lod) {
-      auto lods = batch.lod();
-      PADDLE_ENFORCE_GT(lods.size(), 2UL);
-      PADDLE_ENFORCE_EQ(lods[1].size(),
-                        static_cast<size_t>(lod_tensor.dims()[0]));
+      auto lods = batch->lod();
+      PADDLE_ENFORCE_GT(lods.size(), 2UL,
+                        "The LoD of LoDTensor should inlcude at least 2-level "
+                        "sequence information.");
+      PADDLE_ENFORCE_EQ(
+          lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]),
+          "The LoD information should be consistent with the dims.");
       CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
       to_batch(context, lod_tensor, lods[1], batch, true);
       return;
     }
 
     auto lods = lod_tensor.lod();
-    auto lod = lods[0];
     PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
 
+    auto lod = lods[0];
+
     std::vector<SeqInfo> seq_info;
     for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
       int length = lod[seq_id + 1] - lod[seq_id];
@@ -141,7 +147,7 @@ class LoDTensor2BatchFunctor {
     for (size_t i = 0; i < seq_info.size(); ++i) {
       seq_order[i] = seq_info[i].seq_idx;
     }
-    batch.set_lod(batch_lods);
+    batch->set_lod(batch_lods);
 
     CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
     to_batch(context, lod_tensor, batch_lods[1], batch, true);
@@ -153,11 +159,14 @@ class Batch2LoDTensorFunctor {
  public:
   void operator()(const DeviceContext& context,
                   const framework::LoDTensor& batch,
-                  framework::LoDTensor& lod_tensor) const {
+                  framework::LoDTensor* lod_tensor) const {
     auto in_lod = batch.lod();
-    PADDLE_ENFORCE_GT(in_lod.size(), 2UL);
-    PADDLE_ENFORCE_EQ(in_lod[1].size(),
-                      static_cast<size_t>(lod_tensor.dims()[0]));
+    PADDLE_ENFORCE_GT(in_lod.size(), 2UL,
+                      "The LoD of LoDTensor should inlcude at least 2-level "
+                      "sequence information.");
+    PADDLE_ENFORCE_EQ(
+        in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]),
+        "The LoD information should be consistent with the dims.");
     CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
     to_seq(context, batch, in_lod[1], lod_tensor, false);
   }
diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc
index 38bd3b99758555a24b3b8eb0de06cca8e424fcb2..d63c6c4ed55331235188c1c750468d4e75b9b7f2 100644
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
@@ -22,7 +22,7 @@ template <typename T>
 class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  const framework::LoDTensor& seq, framework::Tensor& padding,
+                  const framework::LoDTensor& seq, framework::Tensor* padding,
                   bool norm_by_times) {
     auto lod = seq.lod();
     PADDLE_ENFORCE_GT(lod.size(), 0UL,
@@ -37,7 +37,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                       "The first dimension of LoDTensor seq should be "
                       "equal to the sum of all sequences's length.");
 
-    auto padding_dims = padding.dims();
+    auto padding_dims = padding->dims();
     PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
                       "The input padding should be a 3-D Tensor of shape "
                       "[max_sequence_length, num_sequences, sequence_width].");
@@ -58,7 +58,7 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                       "width of sequence in LoDTensor seq.");
 
     const T* seq_data = seq.data<T>();
-    T* padding_data = padding.data<T>();
+    T* padding_data = padding->data<T>();
     for (int64_t i = 0; i < max_sequence_length; ++i) {
       for (int64_t j = 0; j < num_sequences; ++j) {
         int64_t start_pos = abs_offset_lod[level][j];
@@ -84,16 +84,16 @@ template <typename T>
 class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  framework::LoDTensor& seq, const framework::Tensor& padding,
+                  framework::LoDTensor* seq, const framework::Tensor& padding,
                   bool norm_by_times) {
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     PADDLE_ENFORCE_GT(lod.size(), 0UL,
                       "The LoD of LoDTensor seq should not be null.");
 
     const size_t level = 0;
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
 
-    auto seq_dims = seq.dims();
+    auto seq_dims = seq->dims();
     PADDLE_ENFORCE_EQ(seq_dims[0],
                       static_cast<int64_t>(abs_offset_lod[level].back()),
                       "The first dimension of LoDTensor seq should be "
@@ -114,13 +114,13 @@ class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
                       "The second dimension of Tensor padding should be "
                       "the number of sequences in LoDTensor seq.");
 
-    const int64_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq->numel() / seq_dims[0];
     PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                       "The third dimension of Tensor padding should be the "
                       "width of sequence in LoDTensor seq.");
 
     const T* padding_data = padding.data<T>();
-    T* seq_data = seq.data<T>();
+    T* seq_data = seq->data<T>();
     for (int64_t i = 0; i < num_sequences; ++i) {
       int64_t start_pos = abs_offset_lod[level][i];
       int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu
index c044e6fc32bab8f72a0dce45b4abdb1174a0d72f..0956a0c17d387f4a174c7ed4e9b1b1f816dcf4ae 100644
--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
 #include "paddle/fluid/operators/math/sequence_padding.h"
 
 namespace paddle {
@@ -61,7 +62,7 @@ template <typename T>
 class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& seq, framework::Tensor& padding,
+                  const framework::LoDTensor& seq, framework::Tensor* padding,
                   bool norm_by_times) {
     auto lod = seq.lod();
     PADDLE_ENFORCE_GT(lod.size(), 0UL,
@@ -76,7 +77,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                       "The first dimension of LoDTensor seq should be "
                       "equal to the sum of all sequences's length.");
 
-    auto padding_dims = padding.dims();
+    auto padding_dims = padding->dims();
     PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
                       "The input padding should be a 3-D Tensor of shape "
                       "[max_sequence_length, num_sequences, sequence_width].");
@@ -97,8 +98,8 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                       "width of sequence in LoDTensor seq.");
 
     if (!norm_by_times && num_sequences == 1UL) {
-      TensorCopy(seq, context.GetPlace(), context, &padding);
-      padding.Resize(padding_dims);
+      TensorCopy(seq, context.GetPlace(), context, padding);
+      padding->Resize(padding_dims);
       return;
     }
 
@@ -117,7 +118,7 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(grid_dim_x, grid_dim_y);
 
     const T* seq_data = seq.data<T>();
-    T* padding_data = padding.data<T>();
+    T* padding_data = padding->data<T>();
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
           padding_data, const_cast<T*>(seq_data),
@@ -136,16 +137,16 @@ template <typename T>
 class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  framework::LoDTensor& seq, const framework::Tensor& padding,
+                  framework::LoDTensor* seq, const framework::Tensor& padding,
                   bool norm_by_times) {
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     PADDLE_ENFORCE_GT(lod.size(), 0UL,
                       "The lod of LoDTensor seq should not be null.");
 
     const size_t level = 0;
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
 
-    auto seq_dims = seq.dims();
+    auto seq_dims = seq->dims();
     PADDLE_ENFORCE_EQ(seq_dims[0],
                       static_cast<int64_t>(abs_offset_lod[level].back()),
                       "The first dimension of LoDTensor seq should be "
@@ -166,14 +167,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
                       "The second dimension of Tensor padding should be "
                       "the number of sequences in LoDTensor seq.");
 
-    const int64_t sequence_width = seq.numel() / seq_dims[0];
+    const int64_t sequence_width = seq->numel() / seq_dims[0];
     PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
                       "The third dimension of Tensor padding should be the "
                       "width of sequence in LoDTensor seq.");
 
     if (!norm_by_times && num_sequences == 1UL) {
-      TensorCopy(padding, context.GetPlace(), context, &seq);
-      seq.Resize(seq_dims);
+      TensorCopy(padding, context.GetPlace(), context, seq);
+      seq->Resize(seq_dims);
       return;
     }
 
@@ -192,7 +193,7 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(grid_dim_x, grid_dim_y);
 
     const T* padding_data = padding.data<T>();
-    T* seq_data = seq.data<T>();
+    T* seq_data = seq->data<T>();
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
           const_cast<T*>(padding_data), seq_data,
diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h
index 17f044b9d6667ed6a45bf5a0c2362c351d2c2beb..b56e6db1ebdac1a00561c07845c03bb8fbd8d35a 100644
--- a/paddle/fluid/operators/math/sequence_padding.h
+++ b/paddle/fluid/operators/math/sequence_padding.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/platform/device_context.h"
 
@@ -64,13 +65,13 @@ template <typename DeviceContext, typename T>
 class PaddingLoDTensorFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::LoDTensor& seq,
-                  framework::Tensor& padding, bool norm_by_times);
+                  framework::Tensor* padding, bool norm_by_times);
 };
 
 template <typename DeviceContext, typename T>
 class UnpaddingLoDTensorFunctor {
  public:
-  void operator()(const DeviceContext& context, framework::LoDTensor& seq,
+  void operator()(const DeviceContext& context, framework::LoDTensor* seq,
                   const framework::Tensor& padding, bool norm_by_times);
 };
 
diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc
index bece46e75374cc38512d77c9d6b2fc6584e39db2..b0c201db0ccbe81d8f57cd984d2cdfd2f6a48f25 100644
--- a/paddle/fluid/operators/math/sequence_padding_test.cc
+++ b/paddle/fluid/operators/math/sequence_padding_test.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence_padding.h"
 #include <gtest/gtest.h>
+#include <vector>
 
 template <typename DeviceContext, typename Place, typename T>
 void TestSequencePadding(const paddle::framework::LoD& lod,
@@ -40,7 +41,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
   if (paddle::platform::is_cpu_place(*place)) {
     seq = cpu_seq;
   } else {
-    TensorCopy(cpu_seq, *place, *context, &seq);
+    TensorCopySync(cpu_seq, *place, &seq);
     seq.set_lod(lod);
   }
 
@@ -53,17 +54,17 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
                                     static_cast<int64_t>(sequence_width)});
   padding.mutable_data<T>(padding_dims, *place);
   paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-      *context, seq, padding, false);
+      *context, seq, &padding, false);
 
   seq_back.set_lod(lod);
   seq_back.mutable_data<T>(seq_dims, *place);
   paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-      *context, seq_back, padding, false);
+      *context, &seq_back, padding, false);
 
   if (paddle::platform::is_cpu_place(*place)) {
     cpu_seq_back = seq_back;
   } else {
-    TensorCopy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back);
+    TensorCopySync(seq_back, paddle::platform::CPUPlace(), &cpu_seq_back);
     cpu_seq_back.set_lod(lod);
   }
 
@@ -75,7 +76,7 @@ void TestSequencePadding(const paddle::framework::LoD& lod,
 
   delete place;
   delete context;
-};
+}
 
 TEST(Seq2BatchPadding, CPU) {
   paddle::framework::LoD lod1;
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
index f7a6f2bdf4e3b7896df39acfa51fa20577b20f3b..f25d3d3f1ee1f89d46b8e7c88ca68048f5203544 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -13,14 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence_pooling.h"
+#include <string>
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename T>
-class MaxSeqPoolFunctor<platform::CPUDeviceContext, T> {
+class MaxSeqPoolFunctor {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::LoDTensor& input, framework::Tensor* output,
@@ -60,7 +70,7 @@ class MaxSeqPoolFunctor<platform::CPUDeviceContext, T> {
 };
 
 template <typename T>
-class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, T> {
+class MaxSeqPoolGradFunctor {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& out_grad,
@@ -93,10 +103,101 @@ class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, T> {
   }
 };
 
-template class MaxSeqPoolFunctor<platform::CPUDeviceContext, float>;
-template class MaxSeqPoolFunctor<platform::CPUDeviceContext, double>;
-template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, float>;
-template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, double>;
+template <typename T>
+class SequencePoolFunctor<platform::CPUDeviceContext, T> {
+ public:
+  /* max pool has index output */
+  void operator()(const platform::CPUDeviceContext& context,
+                  const std::string pooltype, const framework::LoDTensor& input,
+                  framework::Tensor* output,
+                  framework::Tensor* index = nullptr) {
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolFunctor<T> max_pool;
+      max_pool(context, input, output, index);
+      return;
+    }
+    auto lod = input.lod()[0];
+    auto& place = *context.eigen_device();
+    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+      Tensor in_t =
+          input.Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
+      Tensor out_t = output->Slice(i, i + 1);
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      int64_t w = input.numel() / input.dims()[0];
+      auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
+      auto out_e = EigenVector<T>::Flatten(out_t);
+      if (pooltype == "AVERAGE") {
+        out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
+      } else if (pooltype == "SUM") {
+        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
+      } else if (pooltype == "SQRT") {
+        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
+                              std::sqrt(static_cast<T>(h));
+      } else if (pooltype == "LAST") {
+        out_e.device(place) = in_e.chip(h - 1, 0);
+      } else if (pooltype == "FIRST") {
+        out_e.device(place) = in_e.chip(0, 0);
+      } else {
+        PADDLE_THROW("unsupported pooling pooltype");
+      }
+    }
+  }
+};
+
+template <typename T>
+class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const std::string pooltype, const framework::Tensor& out_grad,
+                  framework::LoDTensor* in_grad,
+                  /* max pool has index */
+                  const framework::Tensor* index = nullptr) {
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolGradFunctor<T> max_pool_grad;
+      max_pool_grad(context, out_grad, *index, in_grad);
+      return;
+    }
+
+    if (pooltype == "LAST" || pooltype == "FIRST") {
+      // set X@Grad be zero at first when pooltype is LAST/FIRST
+      math::SetConstant<platform::CPUDeviceContext, T> functor;
+      functor(context, in_grad, 0);
+    }
+    auto lod = in_grad->lod()[0];
+    auto& place = *context.eigen_device();
+    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+      auto in_g_t = in_grad->Slice(static_cast<int>(lod[i]),
+                                   static_cast<int>(lod[i + 1]));
+      auto out_g_t = out_grad.Slice(i, i + 1);
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      int64_t w = in_grad->numel() / in_grad->dims()[0];
+      auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
+      auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
+      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
+      Eigen::DSizes<int, 2> bcast(h, 1);
+
+      if (pooltype == "AVERAGE") {
+        in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
+      } else if (pooltype == "SUM") {
+        in_g_e.device(place) = (out_g_e).broadcast(bcast);
+      } else if (pooltype == "SQRT") {
+        in_g_e.device(place) =
+            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
+      } else if (pooltype == "LAST") {
+        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
+      } else if (pooltype == "FIRST") {
+        in_g_e.chip(0, 0).device(place) = out_g_e_v;
+      } else {
+        PADDLE_THROW("unsupported pooling pooltype");
+      }
+    }
+  }
+};
+
+template class SequencePoolFunctor<platform::CPUDeviceContext, float>;
+template class SequencePoolFunctor<platform::CPUDeviceContext, double>;
+template class SequencePoolGradFunctor<platform::CPUDeviceContext, float>;
+template class SequencePoolGradFunctor<platform::CPUDeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
index d61407c020142f046f41f71a56702fd6106df628..97c2e69fe5327956fc2828781fe3a37b88cc1b99 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <string>
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_pooling.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
@@ -22,113 +24,331 @@ namespace math {
 #define FLT_MAX __FLT_MAX__
 
 template <typename T>
-__global__ void KeMaxSequencePool(const T* input, const size_t* starts,
-                                  T* output, int* index, int64_t num_seq,
-                                  int64_t dim) {
-  int dim_idx = threadIdx.x;
-  int seq_id = blockIdx.x;
-  if (seq_id >= num_seq) return;
-  size_t start = starts[seq_id];
-  size_t end = starts[seq_id + 1];
-
-  for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
-    T max_val = static_cast<T>(-FLT_MAX);
-    int max_id = -1;
-    for (size_t step_id = start; step_id < end; step_id++) {
-      if (max_val < input[step_id * dim + i]) {
-        max_val = input[step_id * dim + i];
-        max_id = step_id;
+struct MaxPoolFunctor {
+  HOSTDEVICE void operator()(const T* input, const size_t start,
+                             const size_t end, const size_t item_dim, T* output,
+                             int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      T max_val = static_cast<T>(-FLT_MAX);
+      int max_index = -1;
+      for (int i = start; i < end; ++i) {
+        if (max_val < input[item_dim * i + tid]) {
+          max_val = input[item_dim * i + tid];
+          max_index = i;
+        }
       }
+      output[tid] = max_val;
+      index[tid] = max_index;
     }
-    output[seq_id * dim + i] = max_val;
-    index[seq_id * dim + i] = max_id;
   }
-}
+};
 
 template <typename T>
-class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::LoDTensor& input, framework::Tensor* output,
-                  framework::Tensor* index) {
-    auto in_dims = input.dims();
-    auto out_dims = output->dims();
-    auto idx_dims = index->dims();
-    PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1));
-    PADDLE_ENFORCE_GT(out_dims.size(), 1);
-    for (int64_t i = 1; i < in_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+struct AvgPoolFunctor {
+  HOSTDEVICE void operator()(const T* input, const size_t start,
+                             const size_t end, const size_t item_dim, T* output,
+                             int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      T val = static_cast<T>(0);
+      for (int i = start; i < end; ++i) {
+        val += input[item_dim * i + tid];
+      }
+      // end, start is lod, so end - start != 0
+      output[tid] = val / static_cast<T>(end - start);
     }
-    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+  }
+};
 
-    auto starts = input.lod()[0];
-    const T* in_data = input.data<T>();
-    T* out_data = output->data<T>();
-    int* max_index = index->data<int>();
+template <typename T>
+struct SumPoolFunctor {
+  HOSTDEVICE void operator()(const T* input, const size_t start,
+                             const size_t end, const size_t item_dim, T* output,
+                             int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      T val = static_cast<T>(0);
+      for (int i = start; i < end; ++i) {
+        val += input[item_dim * i + tid];
+      }
+      output[tid] = val;
+    }
+  }
+};
 
-    int64_t num_seq = out_dims[0];
-    int64_t dim = output->numel() / num_seq;
+template <typename T>
+struct SqrtPoolFunctor {
+  HOSTDEVICE void operator()(const T* input, const size_t start,
+                             const size_t end, const size_t item_dim, T* output,
+                             int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      T val = static_cast<T>(0);
+      for (int i = start; i < end; ++i) {
+        val += input[item_dim * i + tid];
+      }
+      // end, start is lod, so end - start != 0
+      output[tid] = val / sqrt(end - start);
+    }
+  }
+};
 
-    dim3 threads(256, 1);
-    dim3 grid(num_seq, 1);
-    auto stream = context.stream();
-    KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
-        in_data, starts.CUDAData(context.GetPlace()), out_data, max_index,
-        num_seq, dim);
+template <typename T>
+struct LastPoolFunctor {
+  HOSTDEVICE void operator()(const T* input, const size_t start,
+                             const size_t end, const size_t item_dim, T* output,
+                             int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      output[tid] = input[item_dim * (end - 1) + tid];
+    }
   }
 };
 
 template <typename T>
-__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
-                                      T* in_grad, int64_t num_seq,
-                                      int64_t dim) {
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  int col_idx = idx % dim;
-  if (idx < num_seq * dim) {
-    int step_id = max_index[idx];
-    in_grad[step_id * dim + col_idx] = out_grad[idx];
+struct FirstPoolFunctor {
+  HOSTDEVICE void operator()(const T* input, const size_t start,
+                             const size_t end, const size_t item_dim, T* output,
+                             int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      output[tid] = input[item_dim * start + tid];
+    }
   }
+};
+
+template <typename T, typename Range_OP>
+__global__ void sequence_pool_kernel(Range_OP op, const T* input,
+                                     const size_t* lod, const size_t lod_size,
+                                     const size_t item_dim, T* output,
+                                     int* index) {
+  int bid = blockIdx.x;
+  if (bid >= lod_size - 1) return;
+  size_t start = lod[bid];
+  size_t end = lod[bid + 1];
+  int* index_offset = nullptr;
+  if (index != nullptr) {
+    index_offset = &index[bid * item_dim];
+  }
+  op(input, start, end, item_dim, &output[bid * item_dim], index_offset);
 }
 
 template <typename T>
-class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, T> {
+class SequencePoolFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& out_grad,
-                  const framework::Tensor& index,
-                  framework::LoDTensor* in_grad) {
-    auto og_dims = out_grad.dims();
-    auto idx_dims = index.dims();
-    auto ig_dims = in_grad->dims();
-    PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1));
-    PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1));
-    for (int64_t i = 1; i < og_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+                  const std::string pooltype, const framework::LoDTensor& input,
+                  framework::Tensor* output,
+                  framework::Tensor* index = nullptr) {
+    auto lod = input.lod()[0];
+    const size_t item_dim = output->numel() / output->dims()[0];
+    dim3 threads(1024, 1);
+    dim3 grid(lod.size(), 1);
+    if (pooltype == "MAX") {
+      sequence_pool_kernel<
+          T, MaxPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          MaxPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), index->data<int>());
+    } else if (pooltype == "AVERAGE") {
+      sequence_pool_kernel<
+          T, AvgPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          AvgPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "SUM") {
+      sequence_pool_kernel<
+          T, SumPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          SumPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "SQRT") {
+      sequence_pool_kernel<
+          T, SqrtPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          SqrtPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "LAST") {
+      sequence_pool_kernel<
+          T, LastPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          LastPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "FIRST") {
+      sequence_pool_kernel<
+          T, FirstPoolFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          FirstPoolFunctor<T>(), input.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          output->mutable_data<T>(context.GetPlace()), nullptr);
+    } else {
+      PADDLE_THROW("unsupported pooling pooltype");
     }
-    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+  }
+};
 
-    const T* og_data = out_grad.data<T>();
-    const int* max_index = index.data<int>();
-    T* ig_data = in_grad->data<T>();
+template <typename T>
+struct MaxPoolGradFunctor {
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      for (int i = start; i < end; ++i) {
+        if (i == index[tid]) {
+          in_grad[item_dim * i + tid] = out_grad[tid];
+        } else {
+          in_grad[item_dim * i + tid] = static_cast<T>(0);
+        }
+      }
+    }
+  }
+};
 
-    SetConstant<platform::CUDADeviceContext, T> set_zero;
-    set_zero(context, in_grad, static_cast<T>(0.0));
-    int64_t num_seq = og_dims[0];
-    int64_t dim = out_grad.numel() / num_seq;
+template <typename T>
+struct AvgPoolGradFunctor {
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      for (int i = start; i < end; ++i) {
+        in_grad[item_dim * i + tid] = out_grad[tid] / (end - start);
+      }
+    }
+  }
+};
 
-    unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
-    dim3 threads(128, 1);
-    dim3 grid(blocks, 1);
-    auto stream = context.stream();
-    KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
-        og_data, max_index, ig_data, num_seq, dim);
+template <typename T>
+struct SumPoolGradFunctor {
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      for (int i = start; i < end; ++i) {
+        in_grad[item_dim * i + tid] = out_grad[tid];
+      }
+    }
+  }
+};
+
+template <typename T>
+struct SqrtPoolGradFunctor {
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      for (int i = start; i < end; ++i) {
+        in_grad[item_dim * i + tid] =
+            out_grad[tid] / (sqrt(static_cast<T>(end - start)));
+      }
+    }
+  }
+};
+
+template <typename T>
+struct LastPoolGradFunctor {
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      for (int i = start; i < end; ++i) {
+        if (i == end - 1) {
+          in_grad[item_dim * i + tid] = out_grad[tid];
+        } else {
+          in_grad[item_dim * i + tid] = static_cast<T>(0);
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+struct FirstPoolGradFunctor {
+  HOSTDEVICE void operator()(const T* out_grad, const size_t start,
+                             const size_t end, const size_t item_dim,
+                             T* in_grad, const int* index) {
+    for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) {
+      for (int i = start; i < end; ++i) {
+        if (i == start) {
+          in_grad[item_dim * i + tid] = out_grad[tid];
+        } else {
+          in_grad[item_dim * i + tid] = static_cast<T>(0);
+        }
+      }
+    }
+  }
+};
+
+template <typename T, typename Range_OP>
+__global__ void sequence_pool_grad_kernel(Range_OP op, const T* out_grad,
+                                          const size_t* lod,
+                                          const size_t lod_size,
+                                          const size_t item_dim, T* in_grad,
+                                          const int* index) {
+  int bid = blockIdx.x;
+  if (bid >= lod_size - 1) return;
+  size_t start = lod[bid];
+  size_t end = lod[bid + 1];
+  const int* index_offset = nullptr;
+  if (index != nullptr) {
+    index_offset = &index[bid * item_dim];
+  }
+  op(&out_grad[bid * item_dim], start, end, item_dim, in_grad, index_offset);
+}
+
+template <typename T>
+class SequencePoolGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const std::string pooltype, const framework::Tensor& out_grad,
+                  framework::LoDTensor* in_grad,
+                  /* max pool has index */
+                  const framework::Tensor* index = nullptr) {
+    auto lod = in_grad->lod()[0];
+    const size_t item_dim = in_grad->numel() / in_grad->dims()[0];
+    dim3 threads(1024, 1);
+    dim3 grid(lod.size(), 1);
+    if (pooltype == "MAX") {
+      sequence_pool_grad_kernel<
+          T, MaxPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          MaxPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), index->data<int>());
+    } else if (pooltype == "AVERAGE") {
+      sequence_pool_grad_kernel<
+          T, AvgPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          AvgPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "SUM") {
+      sequence_pool_grad_kernel<
+          T, SumPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          SumPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "SQRT") {
+      sequence_pool_grad_kernel<
+          T, SqrtPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          SqrtPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "LAST") {
+      sequence_pool_grad_kernel<
+          T, LastPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          LastPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+    } else if (pooltype == "FIRST") {
+      sequence_pool_grad_kernel<
+          T, FirstPoolGradFunctor<T>><<<grid, threads, 0, context.stream()>>>(
+          FirstPoolGradFunctor<T>(), out_grad.data<T>(),
+          lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
+          in_grad->mutable_data<T>(context.GetPlace()), nullptr);
+
+    } else {
+      PADDLE_THROW("unsupported pooling pooltype");
+    }
   }
 };
 
-template class MaxSeqPoolFunctor<platform::CUDADeviceContext, float>;
-template class MaxSeqPoolFunctor<platform::CUDADeviceContext, double>;
-template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, float>;
-template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, double>;
+// sequence pooling
+template class SequencePoolFunctor<platform::CUDADeviceContext, float>;
+template class SequencePoolFunctor<platform::CUDADeviceContext, double>;
+template class SequencePoolGradFunctor<platform::CUDADeviceContext, float>;
+template class SequencePoolGradFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h
index ecb76884f670df1aee64ed65c3bb0cf09c5beaff..8dcbee65d0b63a137e5f422ec8667cc950641b4a 100644
--- a/paddle/fluid/operators/math/sequence_pooling.h
+++ b/paddle/fluid/operators/math/sequence_pooling.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -21,23 +22,23 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-#define FLT_MAX __FLT_MAX__
-
 template <typename DeviceContext, typename T>
-class MaxSeqPoolFunctor {
+class SequencePoolFunctor {
  public:
-  void operator()(const DeviceContext& context,
+  /* max pool has index output */
+  void operator()(const DeviceContext& context, const std::string pooltype,
                   const framework::LoDTensor& input, framework::Tensor* output,
-                  framework::Tensor* index);
+                  framework::Tensor* index = nullptr);
 };
 
-template <typename DeviceContext, class T>
-class MaxSeqPoolGradFunctor {
+template <typename DeviceContext, typename T>
+class SequencePoolGradFunctor {
  public:
-  void operator()(const DeviceContext& context,
+  void operator()(const DeviceContext& context, const std::string pooltype,
                   const framework::Tensor& out_grad,
-                  const framework::Tensor& index,
-                  framework::LoDTensor* in_grad);
+                  framework::LoDTensor* in_grad,
+                  /* max pool has index */
+                  const framework::Tensor* index = nullptr);
 };
 
 }  // namespace math
diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc
index 2c46d4183b5ccd6db909e4142797f97a626c43d5..ee5b22ca855b4fa26e9626aadb84fa9b93b72952 100644
--- a/paddle/fluid/operators/math/sequence_scale.cc
+++ b/paddle/fluid/operators/math/sequence_scale.cc
@@ -21,15 +21,15 @@ namespace math {
 template <typename T>
 class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
  public:
-  void operator()(const platform::CPUDeviceContext& context,
-                  framework::LoDTensor& seq, const T* scales) {
+  void operator()(const platform::CPUDeviceContext& context, const T* scales,
+                  framework::LoDTensor* seq) {
     const size_t level = 0;
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     const size_t num_seq = lod[level].size() - 1;
-    size_t seq_width = seq.dims()[1];
+    size_t seq_width = seq->dims()[1];
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
 
-    T* seq_data = seq.mutable_data<T>(context.GetPlace());
+    T* seq_data = seq->mutable_data<T>(context.GetPlace());
     for (size_t i = 0; i < num_seq; ++i) {
       for (size_t j = lod[level][i] * seq_width;
            j < lod[level][i + 1] * seq_width; ++j) {
diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu
index 74085153c62354771f6126b58746229b5564f2d0..079338c1d3dac6a9403c5871f3face9f1f8e77d2 100644
--- a/paddle/fluid/operators/math/sequence_scale.cu
+++ b/paddle/fluid/operators/math/sequence_scale.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/sequence_scale.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
@@ -35,14 +35,14 @@ __global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales,
 template <typename T>
 class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  framework::LoDTensor& seq, const T* scales) {
+  void operator()(const platform::CUDADeviceContext& context, const T* scales,
+                  framework::LoDTensor* seq) {
     const size_t level = 0;
-    auto lod = seq.lod();
+    auto lod = seq->lod();
     const size_t num_seq = lod[level].size() - 1;
-    const size_t seq_width = seq.numel() / seq.dims()[0];
+    const size_t seq_width = seq->numel() / seq->dims()[0];
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
-    T* seq_data = seq.mutable_data<T>(context.GetPlace());
+    T* seq_data = seq->mutable_data<T>(context.GetPlace());
 
     SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h
index 6cdcbe21cbf82881679d90de470f342f75b3e2f3..202243985c125cd518a27477eb370bf1a325fe16 100644
--- a/paddle/fluid/operators/math/sequence_scale.h
+++ b/paddle/fluid/operators/math/sequence_scale.h
@@ -46,8 +46,8 @@ namespace math {
 template <typename DeviceContext, typename T>
 class ScaleLoDTensorFunctor {
  public:
-  void operator()(const DeviceContext& context, framework::LoDTensor& seq,
-                  const T* scales);
+  void operator()(const DeviceContext& context, const T* scales,
+                  framework::LoDTensor* seq);
 };
 
 }  // namespace math
diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu
index 38e93fdf15d99eb447948378a599891074c10fc5..a579182ec1bd5d10d95bbf8c6f5a0e70ceaaaf4b 100644
--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -14,13 +14,90 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 
+#include <vector>
+
+#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/operators/math/softmax_impl.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
 namespace operators {
 namespace math {
 
+using Tensor = framework::Tensor;
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using DataLayout = platform::DataLayout;
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+
+template <typename T>
+void SoftmaxCUDNNFunctor<T>::operator()(
+    const platform::CUDADeviceContext& context, const framework::Tensor* X,
+    framework::Tensor* Y) {
+  // ------------------- cudnn descriptors ---------------------
+  ScopedTensorDescriptor xDesc;
+  ScopedTensorDescriptor yDesc;
+  std::vector<int> cudnn_tensor_dims = framework::vectorize2int(X->dims());
+  DataLayout layout = DataLayout::kNCHW;
+  if (cudnn_tensor_dims.size() == 5) {
+    layout = DataLayout::kNCDHW;
+  }
+  // NOTE(*) : cudnn softmax only support >= 4D Tensor,
+  // fill 1 at unused dims
+  if (cudnn_tensor_dims.size() <= 2) {
+    cudnn_tensor_dims.resize(4, 1);
+  }
+  cudnnTensorDescriptor_t cudnn_x_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  cudnnTensorDescriptor_t cudnn_y_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxForward(
+      context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
+      CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_x_desc,
+      X->data<T>(), CudnnDataType<T>::kZero(), cudnn_y_desc,
+      Y->mutable_data<T>(context.GetPlace())));
+}
+
+template <typename T>
+void SoftmaxGradCUDNNFunctor<T>::operator()(
+    const platform::CUDADeviceContext& context, const framework::Tensor* Y,
+    const framework::Tensor* YGrad, framework::Tensor* XGrad) {
+  // ------------------- cudnn descriptors ---------------------
+  ScopedTensorDescriptor yDesc;
+  ScopedTensorDescriptor dyDesc;
+  ScopedTensorDescriptor dxDesc;
+  std::vector<int> cudnn_tensor_dims = framework::vectorize2int(Y->dims());
+  DataLayout layout = DataLayout::kNCHW;
+  if (cudnn_tensor_dims.size() == 5) {
+    layout = DataLayout::kNCDHW;
+  }
+  // NOTE(*) : cudnn softmax only support >= 4D Tensor,
+  // fill 1 at unused dims
+  if (cudnn_tensor_dims.size() <= 2) {
+    cudnn_tensor_dims.resize(4, 1);
+  }
+  cudnnTensorDescriptor_t cudnn_y_desc =
+      yDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  cudnnTensorDescriptor_t cudnn_xgrad_desc =
+      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  cudnnTensorDescriptor_t cudnn_ygrad_desc =
+      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE(platform::dynload::cudnnSoftmaxBackward(
+      context.cudnn_handle(), CUDNN_SOFTMAX_ACCURATE,
+      CUDNN_SOFTMAX_MODE_INSTANCE, CudnnDataType<T>::kOne(), cudnn_y_desc,
+      Y->data<T>(), cudnn_ygrad_desc, YGrad->data<T>(),
+      CudnnDataType<T>::kZero(), cudnn_xgrad_desc,
+      XGrad->mutable_data<T>(context.GetPlace())));
+}
+
+template class SoftmaxCUDNNFunctor<platform::float16>;
+template class SoftmaxCUDNNFunctor<float>;
+template class SoftmaxCUDNNFunctor<double>;
+template class SoftmaxGradCUDNNFunctor<float>;
+template class SoftmaxGradCUDNNFunctor<double>;
+
+template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
 template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
 template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h
index 14b2690c2a4e764058270953214a07aee8053444..da1f0b672d3a5fb5da8f4d72892be21964bdbc0d 100644
--- a/paddle/fluid/operators/math/softmax.h
+++ b/paddle/fluid/operators/math/softmax.h
@@ -33,6 +33,23 @@ class SoftmaxGradFunctor {
                   const framework::Tensor* y_grad, framework::Tensor* x_grad);
 };
 
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+class SoftmaxCUDNNFunctor {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor* X, framework::Tensor* Y);
+};
+
+template <typename T>
+class SoftmaxGradCUDNNFunctor {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor* Y, const framework::Tensor* y_grad,
+                  framework::Tensor* x_grad);
+};
+#endif
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 3e123f7bf5512618538fd35aa7e74b82586a5448..dd9971ba091cc3ece86654f65c335b98087f45ed 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -27,7 +27,7 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
 struct ValueClip {
   HOSTDEVICE T operator()(const T& x) const {
-    const T kThreshold = -64.;
+    const T kThreshold = static_cast<T>(-64.);
     return x < kThreshold ? kThreshold : x;
   }
 };
diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu
index 367f343d51712d38edbb7eb50b41433258cf8c9d..c467ae8427d8f461b332eed8075631ed7e47b96e 100644
--- a/paddle/fluid/operators/math/unpooling.cu
+++ b/paddle/fluid/operators/math/unpooling.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/unpooling.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc
index 09e9f85cca349be1dc46a3f3a0b2d919485d6fa1..e92adc09ba01b032aba8eba94bcb4ba96524c641 100644
--- a/paddle/fluid/operators/math/vol2col.cc
+++ b/paddle/fluid/operators/math/vol2col.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/vol2col.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu
index 619730d394d075d05a016e421fd75c4549015216..28e1a752e34cf0171785a0341d8f0d8d3712fc7b 100644
--- a/paddle/fluid/operators/math/vol2col.cu
+++ b/paddle/fluid/operators/math/vol2col.cu
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/operators/math/vol2col.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/vol2col.h b/paddle/fluid/operators/math/vol2col.h
index dbc2ed7a6939cfb5dca61082decb8960e5f9ebda..5f59de8f02a52209a3901ca03680eb2d0dbc2658 100644
--- a/paddle/fluid/operators/math/vol2col.h
+++ b/paddle/fluid/operators/math/vol2col.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc
index eb91f862e39d9f158200f69fd48e7245dde47171..aa979c4f10907e604758c3e2cfb776cb994c9ceb 100644
--- a/paddle/fluid/operators/math/vol2col_test.cc
+++ b/paddle/fluid/operators/math/vol2col_test.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/vol2col.h"
 #include <gtest/gtest.h>
 #include <iostream>
+#include <vector>
 
 template <typename DeviceContext, typename Place>
 void testVol2col() {
@@ -71,7 +72,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    paddle::framework::TensorCopy(input_tmp, *place, *context, &input);
+    paddle::framework::TensorCopySync(input_tmp, *place, &input);
   }
   output.mutable_data<float>({1, filter_size, filter_size, filter_size,
                               output_depth, output_height, output_width},
@@ -85,7 +86,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output.data<float>();
   } else {
-    TensorCopy(output, paddle::platform::CPUPlace(), *context, &output_tmp);
+    TensorCopySync(output, paddle::platform::CPUPlace(), &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
 
@@ -99,7 +100,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    TensorCopy(input_tmp, *place, *context, &input);
+    TensorCopySync(input_tmp, *place, &input);
   }
 
   paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
@@ -109,7 +110,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    TensorCopy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+    TensorCopySync(input, paddle::platform::CPUPlace(), &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
 
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 85855928521b8b4cc5e8746b0b5f841cc2587618..7182149164854038bb67a9f06cdbec8a4a0f1fb2 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -12,19 +12,264 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/matmul_op.h"
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
+/**
+ * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
+ * original x_dim is returned.
+ */
+static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) {
+  if (x_dim.size() > 1) {
+    return x_dim;
+  }
+  return framework::make_ddim({1, x_dim[0]});
+}
+
+/**
+ * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the
+ * original y_dim is returned.
+ */
+static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
+  if (y_dim.size() > 1) {
+    return y_dim;
+  }
+  return framework::make_ddim({y_dim[0], 1});
+}
+
+template <typename DeviceContext, typename T>
+class MatMulKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto &x =
+        detail::Ref(context.Input<framework::Tensor>("X"), "Cannot find X");
+    auto &y =
+        detail::Ref(context.Input<framework::Tensor>("Y"), "Cannot find Y");
+    auto *out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = math::CreateMatrixDescriptor(
+        RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
+    auto mat_dim_b = math::CreateMatrixDescriptor(
+        ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y"));
+    blas.MatMul(x, mat_dim_a, y, mat_dim_b, T(1), out, T(0));
+  }
+};
+
+// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
+// Identity op if the tensor is not of rank 3.
+static framework::Tensor FoldInitDims(const framework::Tensor &input) {
+  auto output = input;
+  auto in_dims = input.dims();
+  if (in_dims.size() == 3) {
+    output.Resize({in_dims[0] * in_dims[1], in_dims[2]});
+  }
+  return output;
+}
+
+// Reshape a rank-3 tensor from P x M x N to M x (P * N).
+// (Warning: This requires transposing data and writes into new memory.)
+// Identity op if the tensor is not of rank 3.
+template <typename DeviceContext, typename T>
+static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context,
+                                             const framework::Tensor &input) {
+  auto in_dims = input.dims();
+  if (in_dims.size() != 3) {
+    return input;
+  }
+  framework::Tensor output;
+  output.Resize({in_dims[1], in_dims[0], in_dims[2]});
+  output.mutable_data<T>(context.GetPlace());
+  std::vector<int> axis = {1, 0, 2};
+  math::Transpose<DeviceContext, T, 3> trans;
+  trans(context, input, &output, axis);
+  output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
+
+  return output;
+}
+
+/**
+ * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor.
+ *
+ * The shape would be [BatchSize, H, W] or [H, W].
+ * If transposed, `H,W` will be swapped.
+ */
+static void ReshapeTensorIntoMatrixSequence(
+    framework::Tensor *x, const math::MatDescriptor &descriptor) {
+  int64_t h, w;
+  h = descriptor.height_;
+  w = descriptor.width_;
+  if (descriptor.trans_) {
+    std::swap(w, h);
+  }
+  if (descriptor.batch_size_) {
+    x->Resize({descriptor.batch_size_, h, w});
+  } else {
+    x->Resize({h, w});
+  }
+}
+
+/**
+ * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor
+ * Out = matmul(x, y)
+ *
+ * This method will first calculate X,Y matrix sequence, and then calculate
+ * the out shape.
+ *
+ * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2]
+ * The out = [BatchSize, H1, W2]
+ *
+ * If there is no batch size in `X` and `Y`, the out will be [H1, W2]
+ * If any of `X` and `Y` has batch size BatchSize, the out will have the
+ * BatchSize.
+ */
+static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
+                                           framework::Tensor *y,
+                                           framework::Tensor *out, bool trans_x,
+                                           bool trans_y) {
+  auto x_dim = RowMatrixFromVector(x->dims());
+  auto y_dim = ColumnMatrixFromVector(y->dims());
+  auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x);
+  auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y);
+  if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
+    out->Resize({mat_dim_x.height_, mat_dim_y.width_});
+  } else {
+    out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
+                 mat_dim_x.height_, mat_dim_y.width_});
+  }
+
+  ReshapeTensorIntoMatrixSequence(x, mat_dim_x);
+  ReshapeTensorIntoMatrixSequence(y, mat_dim_y);
+}
+
+// Using dimensional constraints on matrix multiplication, it is
+// straight-forward to check the following table for when X and Y
+// are both matrices.
+//
+// transpose_X | False    | True     | False    | True
+// transpose_Y | False    | False    | True     | True
+// -----------+----------+----------+----------+-----------
+//        dX = | dOut Y^T | Y dOut^T | dOut Y   | Y^T dOut^T
+//        dY = | X^T dOut | X dOut   | dOut^T X | dOut^T X^T
+//
+// When X is a vector of size K, we treat it instead as a matrix of shape
+// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
+// a matrix of shape (K, 1).
+//
+// When X and Y are both 3-dimensional tensors, then the first dimension
+// the batch dimension can be ignored and the exact same formulas apply
+// as for two matrices.
+//
+// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
+// up with formulas like
+//
+//   dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
+//
+// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
+// to X: (P * M) x K, dOut: (P * M) x N.
+template <typename DeviceContext, typename T>
+class MatMulGradKernel : public framework::OpKernel<T> {
+ public:
+  void MatMul(const framework::ExecutionContext &context,
+              const framework::Tensor &a, bool trans_a,
+              const framework::Tensor &b, bool trans_b,
+              framework::Tensor *out) const {
+    out->mutable_data<T>(context.GetPlace());
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
+    auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
+    blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0));
+  }
+
+  void CalcInputGrad(const framework::ExecutionContext &context,
+                     const framework::Tensor &a, bool trans_a,
+                     bool is_fold_init_dims_a, const framework::Tensor &b,
+                     bool trans_b, bool is_fold_init_dims_b,
+                     framework::Tensor *out) const {
+    if (out == nullptr) return;
+    bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) &&
+                        out->dims().size() == 2;
+    if (!need_combine) {
+      MatMul(context, a, trans_a, b, trans_b, out);
+    } else {
+      auto &ctx = context.template device_context<DeviceContext>();
+      MatMul(context, is_fold_init_dims_a
+                          ? FoldInitDims(a)
+                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, a),
+             trans_a, is_fold_init_dims_b
+                          ? FoldInitDims(b)
+                          : FoldHeadAndLastDims<DeviceContext, T>(ctx, b),
+             trans_b, out);
+    }
+  }
+
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto x = *context.Input<framework::Tensor>("X");
+    auto y = *context.Input<framework::Tensor>("Y");
+    auto dout =
+        *context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto *dy = context.Output<framework::Tensor>(framework::GradVarName("Y"));
+    bool transpose_x = context.Attr<bool>("transpose_X");
+    bool transpose_y = context.Attr<bool>("transpose_Y");
+
+    ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y);
+    framework::DDim dx_dims;
+    if (dx) {
+      dx_dims = dx->dims();
+      if (dx_dims != x.dims()) {
+        dx->Resize(x.dims());
+      }
+    }
+
+    framework::DDim dy_dims;
+    if (dy) {
+      dy_dims = dy->dims();
+      if (dy_dims != y.dims()) {
+        dy->Resize(y.dims());
+      }
+    }
 
-using framework::Tensor;
+    if (transpose_x && transpose_y) {
+      CalcInputGrad(context, y, true, true, dout, true, false, dx);
+      CalcInputGrad(context, dout, true, true, x, true, false, dy);
+    } else if (transpose_x) {
+      CalcInputGrad(context, y, false, false, dout, true, false, dx);
+      CalcInputGrad(context, x, false, false, dout, false, true, dy);
+    } else if (transpose_y) {
+      CalcInputGrad(context, dout, false, false, y, false, true, dx);
+      CalcInputGrad(context, dout, true, true, x, false, true, dy);
+    } else {
+      CalcInputGrad(context, dout, false, false, y, true, false, dx);
+      CalcInputGrad(context, x, true, true, dout, false, true, dy);
+    }
+
+    if (dx) {
+      if (dx_dims != x.dims()) {
+        dx->Resize(dx_dims);
+      }
+    }
+    if (dy) {
+      if (dy_dims != y.dims()) {
+        dy->Resize(dy_dims);
+      }
+    }
+  }
+};
 
 class MatMulOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContext* context) const override {
+  void InferShape(framework::InferShapeContext *context) const override {
     PADDLE_ENFORCE(context->HasInput("X"),
                    "Input(X) of MatMulOp should not be null.");
     PADDLE_ENFORCE(context->HasInput("Y"),
@@ -34,121 +279,41 @@ class MatMulOp : public framework::OperatorWithKernel {
 
     auto dim_x = context->GetInputDim("X");
     auto dim_y = context->GetInputDim("Y");
-    bool transpose_x = context->Attrs().Get<bool>("transpose_X");
-    bool transpose_y = context->Attrs().Get<bool>("transpose_Y");
-
-    PADDLE_ENFORCE_GE(dim_x.size(), 1,
-                      "Input tensor X must be at least 1-dimensional.");
-    PADDLE_ENFORCE_GE(dim_y.size(), 1,
-                      "Input tensor Y must be at least 1-dimensional.");
-
-    std::vector<int64_t> out_dim;
-    int64_t batch_count = 1;
-    if (dim_x.size() > 3) {
-      PADDLE_ENFORCE_EQ(
-          dim_y.size(), dim_x.size(),
-          "The dimensions of X and Y must be the same, and both of "
-          "them should be %d-dimensional.",
-          dim_x.size());
-
-      // The first rank-2 dimensions are accumulated on the batch_count, and the
-      // last two dimensions are used for matrix multiplication.
-      for (int j = 0; j < dim_x.size() - 2; ++j) {
-        PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j],
-                          "The %d-th dimension of X and Y must be the same.",
-                          j);
-        out_dim.push_back(dim_x[j]);
-        batch_count *= dim_x[j];
-      }
-    }
 
-    int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0;
-    bool remove_initial_dim = false, remove_final_dim = false;
-
-    switch (dim_x.size()) {
-      case 1:
-        if (transpose_x) {
-          M = dim_x[0];
-          KX = 1;
-        } else {
-          M = 1;
-          KX = dim_x[0];
-          remove_initial_dim = true;
-        }
-        break;
-      case 2:
-        M = transpose_x ? dim_x[1] : dim_x[0];
-        KX = transpose_x ? dim_x[0] : dim_x[1];
-        break;
-      case 3:
-        batchCountX = dim_x[0];
-        M = transpose_x ? dim_x[2] : dim_x[1];
-        KX = transpose_x ? dim_x[1] : dim_x[2];
-        break;
-      default:
-        batchCountX = batch_count;
-        size_t mat_s = dim_x.size() - 2;
-        M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s];
-        KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1];
-        break;
-    }
+    auto mat_dim_x =
+        math::CreateMatrixDescriptor(RowMatrixFromVector(dim_x), 0,
+                                     context->Attrs().Get<bool>("transpose_X"));
+    auto mat_dim_y =
+        math::CreateMatrixDescriptor(ColumnMatrixFromVector(dim_y), 0,
+                                     context->Attrs().Get<bool>("transpose_Y"));
 
-    switch (dim_y.size()) {
-      case 1:
-        if (transpose_y) {
-          N = dim_y[0];
-          KY = 1;
-        } else {
-          N = 1;
-          KY = dim_y[0];
-          remove_final_dim = true;
-        }
-        break;
-      case 2:
-        KY = transpose_y ? dim_y[1] : dim_y[0];
-        N = transpose_y ? dim_y[0] : dim_y[1];
-        break;
-      case 3:
-        batchCountY = dim_y[0];
-        KY = transpose_y ? dim_y[2] : dim_y[1];
-        N = transpose_y ? dim_y[1] : dim_y[2];
-        break;
-      default:
-        batchCountY = batch_count;
-        size_t mat_s = dim_y.size() - 2;
-        KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s];
-        N = transpose_y ? dim_y[mat_s] : dim_y[mat_s + 1];
+    PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_);
+    PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ ||
+                   mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0);
+    std::vector<int64_t> dim_out;
+    if (mat_dim_x.batch_size_ != 0) {
+      dim_out = framework::vectorize(dim_x);
+      dim_out[dim_out.size() - 2] = mat_dim_x.height_;
+      dim_out[dim_out.size() - 1] = mat_dim_y.width_;
+    } else if (mat_dim_y.batch_size_ != 0) {
+      dim_out = framework::vectorize(dim_y);
+      dim_out[dim_out.size() - 2] = mat_dim_x.height_;
+      dim_out[dim_out.size() - 1] = mat_dim_y.width_;
+    } else {
+      dim_out = {mat_dim_x.height_, mat_dim_y.width_};
     }
 
-    PADDLE_ENFORCE_EQ(
-        KX, KY,
-        "First matrix's width must be equal with second matrix's height.");
-    if (batchCountX && batchCountY) {
-      PADDLE_ENFORCE_EQ(
-          batchCountX, batchCountY,
-          "When Input(X) and Input(Y) are both three dimensional, they "
-          "must have the same batch dimension.");
+    if (dim_x.size() == 1 && dim_out[dim_out.size() - 2] == 1) {
+      std::swap(dim_out[dim_out.size() - 2], dim_out[dim_out.size() - 1]);
+      dim_out.resize(dim_out.size() - 1);
     }
-    int batchCount = std::max(batchCountX, batchCountY);
 
-    std::vector<int64_t> dim_out;
-    if (batchCount) {
-      if (dim_x.size() > 3) {
-        dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end());
-      } else {
-        dim_out.push_back(batchCount);
-      }
+    if (dim_y.size() == 1 && dim_out[dim_out.size() - 1] == 1) {
+      dim_out.resize(dim_out.size() - 1);
     }
-    if (!remove_initial_dim) {
-      dim_out.push_back(M);
-    }
-    if (!remove_final_dim) {
-      dim_out.push_back(N);
-    }
-    if (dim_out.size() == 0) {
-      // We don't support 0-dimensional Tensors (scalars), so instead
-      // treat the output as a Tensor of shape (1, ) in this case.
-      dim_out.push_back(1);
+
+    if (dim_out.empty()) {
+      dim_out = {1};
     }
     context->SetOutputDim("Out", framework::make_ddim(dim_out));
     context->ShareLoD("X", /*->*/ "Out");
@@ -157,8 +322,7 @@ class MatMulOp : public framework::OperatorWithKernel {
 
 class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The first input of MatMul op");
     AddInput("Y", "The second input of MatMul op");
     AddOutput("Out", "The output of MatMul op");
@@ -211,7 +375,7 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContext* context) const override {
+  void InferShape(framework::InferShapeContext *context) const override {
     PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null");
     PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
@@ -231,14 +395,52 @@ class MatMulOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+class MatMulOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *retv = new framework::OpDesc();
+    retv->SetType("matmul_grad");
+    retv->SetInput("X", Input("X"));
+    retv->SetInput("Y", Input("Y"));
+    retv->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    retv->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    retv->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    retv->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(retv);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad,
-            ops::MatMulOpGrad);
+REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker,
+                  ops::MatMulOpGradMaker);
+REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>);
+    matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MatMulKernel<paddle::platform::CPUDeviceContext,
+                      paddle::platform::float16>);
 REGISTER_OP_CPU_KERNEL(
     matmul_grad,
-    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext,
+                          paddle::platform::float16>);
+
+#ifdef PADDLE_WITH_CUDA
+REGISTER_OP_CUDA_KERNEL(
+    matmul, ops::MatMulKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MatMulKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MatMulKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    matmul_grad,
+    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext,
+                          paddle::platform::float16>);
+#endif
diff --git a/paddle/fluid/operators/matmul_op.cu.cc b/paddle/fluid/operators/matmul_op.cu.cc
deleted file mode 100644
index e021bbe645399e410cde5c3ff7035d4d68c71744..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/matmul_op.cu.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/matmul_op.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    matmul, ops::MatMulKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    matmul_grad,
-    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/matmul_op.h b/paddle/fluid/operators/matmul_op.h
deleted file mode 100644
index 1cd8fe55dcbd23eae771550a363bf0a07e9bf585..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/matmul_op.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/matmul.h"
-
-namespace paddle {
-namespace operators {
-namespace matmul_detail {
-
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-using framework::make_ddim;
-using framework::vectorize;
-
-template <typename DeviceContext, typename T>
-class MatMulKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor& x = *context.Input<Tensor>("X");
-    const Tensor& y = *context.Input<Tensor>("Y");
-    Tensor* out = context.Output<Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    bool transpose_x = context.Attr<bool>("transpose_X");
-    bool transpose_y = context.Attr<bool>("transpose_Y");
-
-    math::MatMulFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), x, transpose_x, y,
-        transpose_y, T(1), out, T(0));
-  }
-};
-
-template <typename T>
-inline Tensor Reshape(const Tensor& input, const DDim& dims) {
-  Tensor output;
-  output.ShareDataWith(input);
-  output.Resize(dims);
-  return output;
-}
-
-// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
-// Identity op if the tensor is not of rank 3.
-template <typename T>
-Tensor CombineBatchAndM(const Tensor& input) {
-  Tensor output;
-  output.ShareDataWith(input);
-  auto in_dims = input.dims();
-  if (in_dims.size() == 3) {
-    std::vector<int64_t> out_dims = {in_dims[0] * in_dims[1], in_dims[2]};
-    output.Resize(make_ddim(out_dims));
-  }
-  return output;
-}
-
-// Reshape a rank-3 tensor from P x M x N to M x (P * N).
-// (Warning: This requires transposing data and writes into new memory.)
-// Identity op if the tensor is not of rank 3.
-template <typename DeviceContext, typename T>
-Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) {
-  Tensor output;
-  auto in_dims = input.dims();
-  if (in_dims.size() == 3) {
-    output.Resize({in_dims[1], in_dims[0], in_dims[2]});
-    output.mutable_data<T>(context.GetPlace());
-    std::vector<int> axis = {1, 0, 2};
-    math::Transpose<DeviceContext, T, 3> trans;
-    trans(context, input, &output, axis);
-    std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
-    output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
-  } else {
-    output.ShareDataWith(input);
-  }
-  return output;
-}
-
-// Using dimensional constraints on matrix multiplication, it is
-// straight-forward to check the following table for when X and Y
-// are both matrices.
-//
-// transpose_X | False    | True     | False    | True
-// transpose_Y | False    | False    | True     | True
-// -----------+----------+----------+----------+-----------
-//        dX = | dOut Y^T | Y dOut^T | dOut Y   | Y^T dOut^T
-//        dY = | X^T dOut | X dOut   | dOut^T X | dOut^T X^T
-//
-// When X is a vector of size K, we treat it instead as a matrix of shape
-// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
-// a matrix of shape (K, 1).
-//
-// When X and Y are both 3-dimensional tensors, then the first dimension
-// the batch dimension can be ignored and the exact same formulas apply
-// as for two matrices.
-//
-// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
-// up with formulas like
-//
-//   dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
-//
-// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
-// to X: (P * M) x K, dOut: (P * M) x N.
-template <typename DeviceContext, typename T>
-class MatMulGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const Tensor& x = *context.Input<Tensor>("X");
-    const Tensor& y = *context.Input<Tensor>("Y");
-    const Tensor& dout = *context.Input<Tensor>(framework::GradVarName("Out"));
-    Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    Tensor* dy = context.Output<Tensor>(framework::GradVarName("Y"));
-    bool transpose_x = context.Attr<bool>("transpose_X");
-    bool transpose_y = context.Attr<bool>("transpose_Y");
-
-    std::vector<int64_t> x_dims = vectorize(x.dims());
-    std::vector<int64_t> y_dims = vectorize(y.dims());
-
-    // If X is a vector, reshape it to a matrix.
-    if (x_dims.size() == 1) {
-      x_dims.insert(x_dims.begin(), 1);
-    }
-
-    // If Y is a vector, reshape it to a matrix.
-    if (y_dims.size() == 1) {
-      y_dims.push_back(1);
-    }
-
-    int batch_count = 0;
-    // The first rank-2 dimensions are accumulated on the batch_count, and the
-    // last two dimensions are used for matrix multiplication.
-    if (x_dims.size() > 3) {
-      batch_count = accumulate(x_dims.begin(), x_dims.end() - 2, 1,
-                               std::multiplies<int>());
-    }
-    // Fix the dOut dimensions.
-    int M = 0, N = 0, batchCountX = 0, batchCountY = 0;
-
-    switch (x_dims.size()) {
-      case 2:
-        M = transpose_x ? x_dims[1] : x_dims[0];
-        break;
-      case 3:
-        batchCountX = x_dims[0];
-        M = transpose_x ? x_dims[2] : x_dims[1];
-        break;
-      default:
-        batchCountX = batch_count;
-        size_t mat_s = x_dims.size() - 2;
-        M = transpose_x ? x_dims[mat_s + 1] : x_dims[mat_s];
-    }
-
-    switch (y_dims.size()) {
-      case 2:
-        N = transpose_y ? y_dims[0] : y_dims[1];
-        break;
-      case 3:
-        batchCountY = y_dims[0];
-        N = transpose_y ? y_dims[1] : y_dims[2];
-        break;
-      default:
-        batchCountY = batch_count;
-        size_t mat_s = y_dims.size() - 2;
-        N = transpose_y ? y_dims[mat_s] : y_dims[mat_s + 1];
-    }
-    if (batchCountX && batchCountY) {
-      PADDLE_ENFORCE_EQ(
-          batchCountX, batchCountY,
-          "When Input(X) and Input(Y) are both three dimensional, they "
-          "must have the same batch dimension.");
-    }
-    int batchCount = std::max(batchCountX, batchCountY);
-    std::vector<int64_t> dout_dims = {M, N};
-    if (batchCount) {
-      if (x_dims.size() > 3) {
-        dout_dims.insert(dout_dims.begin(), x_dims.begin(), x_dims.end() - 2);
-      } else {
-        dout_dims.insert(dout_dims.begin(), batchCount);
-      }
-    }
-    Tensor X = Reshape<T>(x, make_ddim(x_dims));
-    Tensor Y = Reshape<T>(y, make_ddim(y_dims));
-    Tensor dOut = Reshape<T>(dout, make_ddim(dout_dims));
-
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    if (dx) {
-      dx->mutable_data<T>(context.GetPlace());
-      const Tensor& dOut_for_dX =
-          (x_dims.size() == 2 && y_dims.size() == 3)
-              ? CombineBatchAndN<DeviceContext, T>(dev_ctx, dOut)
-              : dOut;
-      if (x_dims.size() == 2 && y_dims.size() == 3) {
-        Y = transpose_y ? CombineBatchAndM<T>(Y)
-                        : CombineBatchAndN<DeviceContext, T>(dev_ctx, Y);
-      }
-      if (transpose_x) {
-        math::MatMulFunctor<DeviceContext, T>()(
-            dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0));
-      } else {
-        math::MatMulFunctor<DeviceContext, T>()(
-            dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0));
-      }
-    }
-
-    if (dy) {
-      dy->mutable_data<T>(context.GetPlace());
-      const Tensor& dOut_for_dY = (y_dims.size() == 2 && x_dims.size() == 3)
-                                      ? CombineBatchAndM<T>(dOut)
-                                      : dOut;
-      if (y_dims.size() == 2 && x_dims.size() == 3) {
-        X = transpose_x ? CombineBatchAndN<DeviceContext, T>(dev_ctx, X)
-                        : CombineBatchAndM<T>(X);
-        dOut = CombineBatchAndM<T>(dOut);
-      }
-      if (transpose_y) {
-        math::MatMulFunctor<DeviceContext, T>()(
-            dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0));
-      } else {
-        math::MatMulFunctor<DeviceContext, T>()(
-            dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0));
-      }
-    }
-  }
-};
-}  // namespace matmul_detail
-
-using matmul_detail::MatMulKernel;
-using matmul_detail::MatMulGradKernel;
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc
index 4cd7c89b48a2442ee7a5074abbf0f3dd9ea3bcb4..b1e69f375d3274aade3184af02f7f914dba5db71 100644
--- a/paddle/fluid/operators/max_sequence_len_op.cc
+++ b/paddle/fluid/operators/max_sequence_len_op.cc
@@ -41,12 +41,16 @@ class MaxSeqenceLenOp : public framework::OperatorBase {
 
 class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxSeqenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("RankTable", "The lod_rank_table.");
-    AddOutput("Out", "The max sequence length.");
-    AddComment(
-        R"DOC(Calculate the max sequence length through lod_rank_table.)DOC");
+  void Make() override {
+    AddInput("RankTable", "Input variable which is a LoDRankTable object");
+    AddOutput("Out", "The max sequence length");
+    AddComment(R"DOC(
+    Given a LoDRankTable object, this layer returns the max length of
+    a batch of sequences. In fact, a LoDRankTable object contains a list of
+    tuples(<sequence index, sequence length>) and the list is already sorted by
+    sequence length in descending order, so the operator just returns the
+    sequence length of the first tuple element
+)DOC");
   }
 };
 
diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
index efaae7d5f2d20484d90f79d9e13ec2f5ed6e06c9..058115cb624627d81b31d0903f7d615d19708c77 100644
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -13,6 +13,8 @@
  *     limitations under the License. */
 
 #include "paddle/fluid/operators/maxout_op.h"
+#include <vector>
+
 namespace paddle {
 namespace operators {
 
@@ -20,8 +22,7 @@ using framework::Tensor;
 
 class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor of maxout operator. "
@@ -99,8 +100,9 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
-            ops::MaxOutOpGrad);
+REGISTER_OPERATOR(maxout, ops::MaxOutOp, ops::MaxOutOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad);
 REGISTER_OP_CPU_KERNEL(
     maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/mean_iou_op.cc b/paddle/fluid/operators/mean_iou_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a60f245f53e342fd9c1382fdda33a011a7fb06d6
--- /dev/null
+++ b/paddle/fluid/operators/mean_iou_op.cc
@@ -0,0 +1,110 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mean_iou_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MeanIoUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predictions"),
+                   "Input (Predictions) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input (labels) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutMeanIou"),
+                   "Output (OutMeanIou) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutWrong"),
+                   "Output (OutWrong) of MeanIoU op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutCorrect"),
+                   "Output (OutWrong) of MeanIoU op should not be null.");
+
+    int64_t num_classes =
+        static_cast<int64_t>(ctx->Attrs().Get<int>("num_classes"));
+
+    ctx->SetOutputDim("OutMeanIou", {1});
+    ctx->SetOutputDim("OutWrong", {num_classes});
+    ctx->SetOutputDim("OutCorrect", {num_classes});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Predictions")->type()),
+        ctx.GetPlace());
+  }
+};
+
+class MeanIoUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Predictions",
+             "(Tensor), A Tensor of prediction results for semantic labels"
+             " with type int32 or int64. The rank should be greater than 1.");
+    AddInput(
+        "Labels",
+        "(Tensor), A Tensor of ground truth labels with type int32 or int64."
+        "Its shape should be the same as Input(Predictions).");
+    AddInput("InWrongs",
+             "(vector<Tensor>), A list of Tensor with shape "
+             "[num_classes]. They are used to collect wrong number among "
+             "batches. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput(
+        "InCorrects",
+        "(vector<Tensor>), A list of Tensor with shape "
+        "[num_classes]. They are used to collect correct number among batches. "
+        "Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput("InMeanIou",
+             "(vector<Tensor>), A list of Tensor that Output(mean_iou) should "
+             "be added to. Empty list is also valid here.")
+        .AsDuplicable()
+        .AsDispensable();
+    AddOutput("OutMeanIou",
+              "(vector<Tensor>), A Tensor representing the"
+              " mean intersection-over-union with shape [1].");
+    AddOutput("OutWrong", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddOutput("OutCorrect", "(Tensor), A Tensor with shape [num_classes]. ");
+    AddAttr<int>("num_classes", "(int), The possible number of labels.");
+
+    AddComment(R"DOC(
+mean-IOU Operator.
+Mean Intersection-Over-Union is a common evaluation metric for
+semantic image segmentation, which first computes the IOU for each
+semantic class and then computes the average over classes. 
+IOU is defined as follows: 
+    IOU = true_positive / (true_positive + false_positive + false_negative).
+It is based on pixel level area while "IOU Similarity Operator" 
+is based on area of rectangle.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(mean_iou, ops::MeanIoUOp, ops::MeanIoUOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(mean_iou, ops::MeanIoUKernel<int>,
+                       ops::MeanIoUKernel<int32_t>,
+                       ops::MeanIoUKernel<int64_t>);
diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..83bb4dde46fa241affad3788e3381b6ecd8aa098
--- /dev/null
+++ b/paddle/fluid/operators/mean_iou_op.cu
@@ -0,0 +1,164 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/mean_iou_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void CountCUDAKernel(const int num_classes, const int count,
+                                const T* predictions, const T* labels,
+                                int* wrong, int* correct) {
+  extern __shared__ int blcok_cache[];
+  int* wrong_c = blcok_cache;
+  int* correct_c = blcok_cache + num_classes;
+  // init cache
+  for (int i = threadIdx.x; i < num_classes * 2; i += blockDim.x) {
+    blcok_cache[i] = 0;
+  }
+  __syncthreads();
+
+  T pred;
+  T label;
+  CUDA_1D_KERNEL_LOOP(i, count) {
+    pred = predictions[i];
+    label = labels[i];
+    if (pred == label) {
+      atomicAdd(correct_c + pred, 1);
+    } else {
+      atomicAdd(wrong_c + pred, 1);
+      atomicAdd(wrong_c + label, 1);
+    }
+  }
+
+  __syncthreads();
+
+  for (int i = threadIdx.x; i < num_classes; i += blockDim.x) {
+    atomicAdd(wrong + i, wrong_c[i]);
+    atomicAdd(correct + i, correct_c[i]);
+  }
+}
+
+__global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
+                                     int* correct, float* ious, float* iou) {
+  __shared__ int valid_count_c;
+  if (threadIdx.x == 0) {
+    valid_count_c = 0;
+  }
+  __syncthreads();
+  CUDA_1D_KERNEL_LOOP(i, num_classes) {
+    int wrong_n = wrong[i];
+    int correct_n = correct[i];
+    int denominator = wrong_n + correct_n;
+    if (denominator > 0) {
+      atomicAdd(&valid_count_c, 1);
+      ious[i] = static_cast<float>(correct_n) / denominator;
+    } else {
+      ious[i] = 0;
+    }
+  }
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    float iou_sum = 0;
+    for (int i = 0; i < num_classes; ++i) {
+      iou_sum += ious[i];
+    }
+    iou[0] += iou_sum / valid_count_c;
+  }
+}
+
+template <typename T>
+class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
+                       .eigen_device();
+    // get input and output tensor
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+
+    // Get data ptr
+    const T* predictions_data = predictions->data<T>();
+    const T* labels_data = labels->data<T>();
+    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
+    float* out_mean_iou_data =
+        out_mean_iou->mutable_data<float>(ctx.GetPlace());
+
+    // Get Eigen tensor
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+
+    // Temporary tensor
+    Tensor ious;
+    float* ious_data = ious.mutable_data<float>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    auto ious_t = EigenTensor<float, 1>::From(ious);
+
+    // Init out_wrong, out_correct and out_mean_iou
+    out_wrong_t.device(place) = out_wrong_t.constant(0);
+    out_correct_t.device(place) = out_correct_t.constant(0);
+    out_mean_iou_t.device(place) = out_mean_iou_t.constant(0.0f);
+
+    // collect pre wrong, correct and mean_iou
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (int i = 0; i < in_mean_ious.size(); ++i) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[i]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (int i = 0; i < in_wrongs.size(); ++i) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (int i = 0; i < in_corrects.size(); ++i) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
+    }
+    // compute
+    auto stream = ctx.cuda_device_context().stream();
+    int block = PADDLE_CUDA_NUM_THREADS;
+    int grid = (predictions->numel() + block - 1) / block;
+    int cache_size = (num_classes * 2 + 1) * sizeof(int);
+    CountCUDAKernel<T><<<grid, block, cache_size, stream>>>(
+        num_classes, predictions->numel(), predictions_data, labels_data,
+        out_wrong_data, out_correct_data);
+    ctx.device_context().Wait();
+    ComputeIoUCUDAKernel<<<1, block, 0, stream>>>(num_classes, out_wrong_data,
+                                                  out_correct_data, ious_data,
+                                                  out_mean_iou_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(mean_iou, ops::MeanIoUCUDAOpKernel<int>,
+                        ops::MeanIoUCUDAOpKernel<int64_t>,
+                        ops::MeanIoUCUDAOpKernel<int32_t>);
diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fa00e60e05504e0bb8658c6908e4d4ac46b2ca4
--- /dev/null
+++ b/paddle/fluid/operators/mean_iou_op.h
@@ -0,0 +1,117 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+template <typename T, int D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename T>
+class MeanIoUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    // get input and output tensor
+    auto* predictions = ctx.Input<Tensor>("Predictions");
+    auto* labels = ctx.Input<Tensor>("Labels");
+    auto* out_mean_iou = ctx.Output<Tensor>("OutMeanIou");
+    auto* out_wrong = ctx.Output<Tensor>("OutWrong");
+    auto* out_correct = ctx.Output<Tensor>("OutCorrect");
+    int num_classes = static_cast<int>(ctx.Attr<int>("num_classes"));
+
+    // get data ptr
+    const T* predictions_data = predictions->data<T>();
+    const T* labels_data = labels->data<T>();
+    float* out_mean_iou_data =
+        out_mean_iou->mutable_data<float>(ctx.GetPlace());
+    int* out_wrong_data = out_wrong->mutable_data<int>(ctx.GetPlace());
+    int* out_correct_data = out_correct->mutable_data<int>(ctx.GetPlace());
+
+    // get eigen tensor
+    auto out_mean_iou_t = EigenTensor<float, 1>::From(*out_mean_iou);
+    auto out_wrong_t = EigenTensor<int, 1>::From(*out_wrong);
+    auto out_correct_t = EigenTensor<int, 1>::From(*out_correct);
+
+    // Tmp tensor
+    Tensor denominator;
+    Tensor valid_count;
+    Tensor iou_sum;
+
+    // get data ptr of tmp tensor
+    int* denominator_data = denominator.mutable_data<int>(
+        {static_cast<int64_t>(num_classes)}, ctx.GetPlace());
+    int* valid_count_data = valid_count.mutable_data<int>({1}, ctx.GetPlace());
+    float* iou_sum_data = iou_sum.mutable_data<float>({1}, ctx.GetPlace());
+
+    // get eigen tensor of tmp tensor
+    auto denominator_t = EigenTensor<int, 1>::From(denominator);
+    auto valid_count_t = EigenTensor<int, 1>::From(valid_count);
+    auto iou_sum_t = EigenTensor<float, 1>::From(iou_sum);
+
+    // init out_wrong, out_correct and out_mean_iou
+    out_wrong_t = out_wrong_t.constant(0);
+    out_correct_t = out_correct_t.constant(0);
+    out_mean_iou_t = out_mean_iou_t.constant(0);
+
+    // collect pre wrong, correct and mean_iou
+    auto in_mean_ious = ctx.MultiInput<Tensor>("InMeanIou");
+    for (size_t i = 0; i < in_mean_ious.size(); ++i) {
+      out_mean_iou_t.device(place) +=
+          EigenTensor<float, 1>::From(*in_mean_ious[i]);
+    }
+    auto in_wrongs = ctx.MultiInput<Tensor>("InWrongs");
+    for (size_t i = 0; i < in_wrongs.size(); ++i) {
+      out_wrong_t.device(place) += EigenTensor<int, 1>::From(*in_wrongs[i]);
+    }
+    auto in_corrects = ctx.MultiInput<Tensor>("InCorrects");
+    for (size_t i = 0; i < in_corrects.size(); ++i) {
+      out_correct_t.device(place) += EigenTensor<int, 1>::From(*in_corrects[i]);
+    }
+
+    // compute
+    for (int64_t i = 0; i < predictions->numel(); ++i) {
+      if (predictions_data[i] == labels_data[i]) {
+        out_correct_data[predictions_data[i]] += 1;
+      } else {
+        out_wrong_data[labels_data[i]] += 1;
+        out_wrong_data[predictions_data[i]] += 1;
+      }
+    }
+
+    denominator_t = out_wrong_t + out_correct_t;
+    valid_count_t =
+        (denominator_t > denominator_t.constant(0.0f)).cast<int>().sum();
+
+    for (int i = 0; i < num_classes; ++i) {
+      if (denominator_data[i] == 0) {
+        denominator_data[i] = 1;
+      }
+    }
+
+    iou_sum_t =
+        (out_correct_t.cast<float>() / denominator_t.cast<float>()).sum();
+    out_mean_iou_data[0] += (iou_sum_data[0] / valid_count_data[0]);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index a134796bfcaa9dea2483ace9f5045e257916daba..9e0bebd17c02a3ce010b77142757b8789cfbcdd9 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -32,14 +32,11 @@ class MeanOp : public framework::OperatorWithKernel {
 
 class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op");
+  void Make() override {
+    AddInput("X", "(Tensor) The input of mean op");
+    AddOutput("Out", "(Tensor) The output of mean op").Reuse("X");
     AddComment(R"DOC(
-Mean Operator.
-
-Out is a scalar which is the mean of all elements in X. 
+Mean Operator calculates the mean of all elements in X.
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/merge_ids_op.cc b/paddle/fluid/operators/merge_ids_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c6ec4ab047d5e91625e646fd26108d2e477cdce5
--- /dev/null
+++ b/paddle/fluid/operators/merge_ids_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/merge_ids_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
+    AddInput(
+        "X",
+        "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the "
+        "size of embedding table")
+        .AsDuplicable();
+    AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.");
+
+    AddComment(R"DOC(
+Merge multi LoDTensor's into one according to Ids's shard num.
+
+
+split_ids_op -> prefetch_op -> merge_ids_op
+
+
+merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op
+ will split input Ids into multiple tensors according to Id's shard number.
+prefetch_op will send them to parameter server to prefetch embedding value
+back. During split, the order of ids is disordered. In merge_ids_op we use
+the original Ids to restore the order of the fetched embedding value and
+ also pass the lod information to the merged output.
+
+
+Example:
+
+    Ids = [1,2,3,4,5,6] # 3 shared
+
+split_ids_op ->
+
+    Id0 = [3, 6] # id % 3 == 0
+    Id1 = [1, 4] # id % 3 == 1
+    Id2 = [2, 5] # id % 3 == 2
+
+prefetch_op ->
+
+    X0 = [[0.3 0.3]   # 3
+          [0.6 0.6]]  # 6
+    X1 = [[0.1 0.1]   # 1
+          [0.4 0.4]]  # 4
+    X2 = [[0.2 0.2]   # 2
+          [0.5 0.5]]  # 5
+
+merge_ids_op ->
+
+    Out = [[0.1 0.1]  # 1
+           [0.2 0.2]  # 2
+           [0.3 0.3]  # 3
+           [0.4 0.4]  # 4
+           [0.5 0.5]  # 5
+           [0.6 0.6]] # 6
+)DOC");
+  }
+};
+
+class MergeIdsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids.");
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out.");
+
+    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
+    auto ids_dims = ctx->GetInputDim("Ids");
+    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
+    auto x_var_type = ctx->GetInputsVarType("X");
+    for (auto &var_type : x_var_type) {
+      PADDLE_ENFORCE_EQ(var_type, framework::proto::VarType::LOD_TENSOR,
+                        "input X only support lod tensors");
+    }
+    ctx->ShareLoD("Ids", "Out");
+  }
+
+ private:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.MultiInput<framework::Tensor>("X").front()->type()),
+        ctx.GetPlace());
+  }
+};
+
+class MergeIdsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
+    for (auto &out_var : op_desc.Output("Out")) {
+      block->Var(out_var)->SetType(input_var->GetType());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker,
+                  ops::MergeIdsOpInferVarType);
+REGISTER_OP_CPU_KERNEL(
+    merge_ids, ops::MergeIdsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..83712a8519c6817151e1922c606c0fdd4682a2db
--- /dev/null
+++ b/paddle/fluid/operators/merge_ids_op.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class MergeIdsOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    if (!platform::is_cpu_place(place)) {
+      PADDLE_THROW("MergeIds do not support GPU kernel");
+    }
+    VLOG(3) << "run in MergeIdsOpKernel";
+
+    const auto *ids_var = ctx.InputVar("Ids");
+    PADDLE_ENFORCE(ids_var->IsType<framework::LoDTensor>(),
+                   "only support to merge Ids of LoDTensor");
+
+    const auto &ids_tensor = ids_var->Get<framework::LoDTensor>();
+    const auto &ids_dims = ids_tensor.dims();
+    const int64_t *ids = ids_tensor.data<int64_t>();
+
+    auto x_tensors = ctx.MultiInput<framework::LoDTensor>("X");
+
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+
+    int batch_size = 0;
+    int embedding_size = 0;
+    for (auto &input : x_tensors) {
+      if (framework::product(input->dims()) != 0) {
+        if (embedding_size == 0) {
+          embedding_size = input->dims()[1];
+        }
+        PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1],
+                          "embedding size of all input should be the same");
+        batch_size += input->dims()[0];
+      }
+    }
+    PADDLE_ENFORCE_EQ(
+        batch_size, ids_dims[0],
+        "the batch size of ids and merged embedding value should be the same");
+
+    const size_t shard_num = x_tensors.size();
+
+    if (shard_num == 1) {
+      VLOG(3) << "only one shard, we can copy the data directly";
+      TensorCopy(*x_tensors[0], place, out);
+    } else {
+      std::vector<int> in_indexs(shard_num, 0);
+      auto *out_data = out->mutable_data<T>(
+          framework::make_ddim({batch_size, embedding_size}), place);
+      // copy data from ins[shard_num] to out.
+      for (int i = 0; i < ids_dims[0]; ++i) {
+        int64_t id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        int index = in_indexs[shard_id];
+        memcpy(out_data + embedding_size * i,
+               x_tensors[shard_id]->data<T>() + index * embedding_size,
+               sizeof(T) * embedding_size);
+        in_indexs[shard_id] += 1;
+      }
+
+      for (size_t i = 0; i < shard_num; ++i) {
+        PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0],
+                          "after merge, all data in x_tensor should be used");
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index 4ebf20cbba69bee09dfddb8e928ddc95665e4731..a16861b3b77fc980ab932b9d88859b38ec36108b 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -121,8 +121,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
 
 class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input LoDTensor, contains complete lod information to "
              "construct the output");
diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc
deleted file mode 100644
index 0e81d60878dce747b047abbe4641b71462373b2b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/mine_hard_examples_op.cc
+++ /dev/null
@@ -1,333 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-enum MiningType { kNone = 0, kMaxNegative, kHardExample };
-
-template <typename T>
-bool SortScoreDescend(const std::pair<float, T>& pair1,
-                      const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
-                             const float match_dist,
-                             const float neg_dist_threshold) {
-  if (mining_type == MiningType::kMaxNegative) {
-    return match_idx == -1 && match_dist < neg_dist_threshold;
-  } else if (mining_type == MiningType::kHardExample) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-inline MiningType GetMiningType(std::string str) {
-  if (str == "max_negative") {
-    return MiningType::kMaxNegative;
-  } else if (str == "hard_example") {
-    return MiningType::kHardExample;
-  } else {
-    return MiningType::kNone;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class MineHardExamplesKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
-    auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
-    auto* in_matched_indices = ctx.Input<framework::Tensor>("MatchIndices");
-    auto* in_match_dist = ctx.Input<framework::Tensor>("MatchDist");
-    float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
-    T neg_dist_threshold =
-        static_cast<T>(ctx.Attr<float>("neg_dist_threshold"));
-    int sample_size = ctx.Attr<int>("sample_size");
-    MiningType mining_type =
-        GetMiningType(ctx.Attr<std::string>("mining_type"));
-
-    auto out_neg_indices = ctx.Output<framework::LoDTensor>("NegIndices");
-    auto out_match_indices =
-        ctx.Output<framework::Tensor>("UpdatedMatchIndices");
-
-    framework::TensorCopy(*in_matched_indices, ctx.GetPlace(),
-                          out_match_indices);
-
-    int batch_size = in_matched_indices->dims()[0];
-    int prior_num = in_matched_indices->dims()[1];
-
-    auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indices);
-
-    auto match_indices_et =
-        framework::EigenMatrix<int>::From(*out_match_indices);
-
-    auto match_dist = framework::EigenMatrix<T>::From(*in_match_dist);
-
-    const T* cls_loss = in_cls_loss->data<T>();
-    const T* loc_loss = nullptr;
-    if (in_loc_loss) {
-      loc_loss = in_loc_loss->data<T>();
-    }
-
-    std::vector<std::vector<int>> all_neg_indices;
-    std::vector<size_t> batch_starts = {0};
-    for (int n = 0; n < batch_size; ++n) {
-      std::vector<std::pair<T, size_t>> loss_idx;
-      int neg_sel = 0;
-      for (int m = 0; m < prior_num; ++m) {
-        if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
-                             neg_dist_threshold)) {
-          T loss = cls_loss[n * prior_num + m];
-          if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
-            loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
-          }
-          loss_idx.push_back(std::make_pair(loss, m));
-          ++neg_sel;
-        }
-      }
-
-      if (mining_type == MiningType::kMaxNegative) {
-        int num_pos = 0;
-        for (int m = 0; m < prior_num; ++m) {
-          if (match_indices(n, m) != -1) ++num_pos;
-        }
-        neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
-      } else if (mining_type == MiningType::kHardExample) {
-        neg_sel = std::min(sample_size, neg_sel);
-      }
-
-      std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<size_t>);
-      std::set<int> sel_indices;
-      std::vector<int> neg_indices;
-      std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
-                     std::inserter(sel_indices, sel_indices.begin()),
-                     [](std::pair<T, size_t>& l) -> int {
-                       return static_cast<int>(l.second);
-                     });
-
-      if (mining_type == MiningType::kHardExample) {
-        for (int m = 0; m < prior_num; ++m) {
-          if (match_indices(n, m) > -1) {
-            if (sel_indices.find(m) == sel_indices.end()) {
-              match_indices_et(n, m) = -1;
-            }
-          } else {
-            if (sel_indices.find(m) != sel_indices.end()) {
-              neg_indices.push_back(m);
-            }
-          }
-        }
-      } else {
-        neg_indices.resize(sel_indices.size());
-        std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
-      }
-
-      all_neg_indices.push_back(neg_indices);
-      batch_starts.push_back(batch_starts.back() + neg_indices.size());
-    }
-
-    framework::LoD out_neg_indices_lod;
-    out_neg_indices_lod.emplace_back(batch_starts);
-    int neg_offset = 0;
-    auto neg_data = out_neg_indices->mutable_data<int>(
-        framework::make_ddim({static_cast<int>(batch_starts.back()), 1}),
-        ctx.GetPlace());
-
-    for (auto neg_indices : all_neg_indices) {
-      std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
-      neg_offset += neg_indices.size();
-    }
-    out_neg_indices->set_lod(out_neg_indices_lod);
-    return;
-  }
-};
-
-class MineHardExamplesOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
-                   "Input(ClsLoss) of MineHardExamplesOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("MatchIndices"),
-        "Input(MatchIndices) of MineHardExamplesOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasInput("MatchDist"),
-        "Input(MatchDist) of MineHardExamplesOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->HasOutput("NegIndices"),
-        "Output(NegIndices) of MineHardExamplesOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
-                   "Output(UpdatedMatchIndices) of MineHardExamplesOp should "
-                   "not be null.");
-
-    auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
-    auto idx_dims = ctx->GetInputDim("MatchIndices");
-    auto dis_dims = ctx->GetInputDim("MatchDist");
-
-    PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
-                      "The shape of ClsLoss is [N, Np].");
-    PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
-                      "The shape of MatchIndices is [N, Np].");
-    PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
-                      "The shape of MatchDist is [N, Np].");
-
-    if (ctx->HasInput("LocLoss")) {
-      auto loc_loss_dims = ctx->GetInputDim("LocLoss");
-      PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
-                        "The shape of LocLoss is [N, Np].");
-      PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0],
-                        "Batch size of ClsLoss and LocLoss must be the same.");
-      PADDLE_ENFORCE_EQ(
-          cls_loss_dims[1], loc_loss_dims[1],
-          "Prior box number of ClsLoss and LocLoss must be the same.");
-    }
-
-    PADDLE_ENFORCE_EQ(
-        cls_loss_dims[0], idx_dims[0],
-        "Batch size of ClsLoss and MatchIndices must be the same.");
-    PADDLE_ENFORCE_EQ(
-        cls_loss_dims[1], idx_dims[1],
-        "Prior box number of ClsLoss and MatchIndices must be the same.");
-
-    PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
-                      "Batch size of ClsLoss and MatchDist must be the same.");
-    PADDLE_ENFORCE_EQ(
-        cls_loss_dims[1], idx_dims[1],
-        "Prior box number of ClsLoss and MatchDist must be the same.");
-
-    auto mining_type =
-        GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));
-
-    PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
-                      "mining_type must be hard_example or max_negative");
-
-    if (mining_type == MiningType::kMaxNegative) {
-      auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
-      auto neg_dist_threshold = ctx->Attrs().Get<float>("neg_dist_threshold");
-      PADDLE_ENFORCE_GT(
-          neg_pos_ratio, 0.0f,
-          "neg_pos_ratio must greater than zero in max_negative mode");
-      PADDLE_ENFORCE_GT(
-          neg_dist_threshold, 0.0f,
-          "neg_dist_threshold must greater than zero in max_negative mode");
-    } else if (mining_type == MiningType::kHardExample) {
-      auto sample_size = ctx->Attrs().Get<int>("sample_size");
-      PADDLE_ENFORCE_GT(
-          sample_size, 0,
-          "sample_size must greater than zero in hard_example mode");
-    }
-
-    ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
-    // The first dimension of NegIndices will be set correcttly in Compute.
-    ctx->SetOutputDim("NegIndices", {-1, 1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
-        platform::CPUPlace());
-  }
-};
-
-class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "ClsLoss",
-        "(Tensor, default Tensor<float>), The classification loss with shape "
-        "[N, Np], N is the batch size and Np is the number of prior box.");
-    AddInput("LocLoss",
-             "(Tensor, optional, default Tensor<float>), The localization loss "
-             "with shape [N, Np], N is the batch size and Np is the number of "
-             "prior box.")
-        .AsDispensable();
-    AddInput("MatchIndices",
-             "(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
-             "the batch size and Np is the number of prior box. "
-             "MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
-             "instance does not match any entity, otherwise means it is "
-             "matched to row.");
-    AddInput("MatchDist",
-             "(Tensor, default Tensor<float>) Matched indices with shape [N, "
-             "Np], N is the batch size and Np is the number of prior box.");
-    AddAttr<float>("neg_pos_ratio",
-                   "(float) The ratio of the negative box to the positive "
-                   "box. Use only when mining_type is max_negative.")
-        .SetDefault(1.0);
-    AddAttr<float>("neg_dist_threshold",
-                   "(float) The negative overlap upper bound for the unmatched "
-                   "predictions. Use only when mining_type is max_negative.")
-        .SetDefault(0.5);
-    AddAttr<int>("sample_size",
-                 "(float) The max sample size of negative box. Use only when "
-                 "mining_type is hard_example.")
-        .SetDefault(0);
-    AddAttr<std::string>("mining_type",
-                         "(float) The mining algorithm name, the value is "
-                         "hard_example or max_negative.")
-        .SetDefault("max_negative")
-        .InEnum({"hard_example", "max_negative"});
-
-    AddOutput(
-        "NegIndices",
-        "(LoDTensor<int>) The output of negative example indices. a LoDTensor "
-        "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
-        "and each element is the prior box index. "
-        "For example, the batch size is 2, the lod is [[0, 1, 2]], "
-        "the sample 0's box 1(MatchIndices[0][1]) is selected, "
-        "and sample 1's box 0 is selected. The output NegIndices is "
-        "[[1], [0]].");
-
-    AddOutput("UpdatedMatchIndices",
-              "(Tensor<int>) The output of updated MatchIndices, a tensor with "
-              "shape [N, Np]. Only update when mining_type is "
-              "hard_example. The input MatchIndices elements will be update to "
-              "-1 when it is not in the candidate high loss list of negative "
-              "examples.");
-
-    AddComment(R"DOC(
-Mine hard examples Operator.
-This operator implements hard example mining to select a subset of negative box indices.
-For each image, selects the box with highest losses. subject to the condition that the 
-box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. 
-The selected number is min(sample_size, max_negative_box_number) when mining_type is 
-hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) 
-when mining_type is max_negative, where the max_negative_box_number is the count of 
-MatchIndices elements with value -1.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
-                             ops::MineHardExamplesOpMaker);
-
-REGISTER_OP_CPU_KERNEL(
-    mine_hard_examples,
-    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
index 7de9d94979fdc3f3352c556cc8b655ad4bc7e201..34571a38a14795a98ac8454cec606077727b5ffa 100644
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/minus_op.h"
-#include "paddle/fluid/operators/net_op.h"
+
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -46,8 +48,7 @@ class MinusOp : public framework::OperatorWithKernel {
 
 class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The left tensor of minus operator.");
     AddInput("Y", "The right tensor of minus operator.");
     AddOutput("Out", "The output tensor of minus operator.");
diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..85664623d7330e9473286d995bec67879510dbd7
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn_activation_op.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename Functor>
+class MKLDNNActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(context.Input<framework::Tensor>("X") != nullptr,
+                   "Cannot get input tensor X, variable name = %s",
+                   context.op().Input("X"));
+    PADDLE_ENFORCE(context.Output<framework::Tensor>("Out") != nullptr,
+                   "Cannot find output tensor Out, variable name = %s",
+                   context.op().Output("Out"));
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
+    functor(context);
+  }
+};
+
+template <typename Functor>
+class MKLDNNActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
+    functor(context);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
index a8fbd48c4da5b2d0585688e3100f9fe62ac5aa1f..35db4c1ad1f6c6481eca397e99fc8c1f0bc7164c 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -39,8 +39,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
 
 class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input tensor of modified huber loss op. "
              "X is 2-D tensor with shape [batch_size, 1].");
@@ -108,9 +107,10 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp,
-            ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad,
-            ops::ModifiedHuberLossGradOp);
+REGISTER_OPERATOR(modified_huber_loss, ops::ModifiedHuberLossOp,
+                  ops::ModifiedHuberLossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp);
 
 REGISTER_OP_CPU_KERNEL(
     modified_huber_loss,
diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc
index 6c70970e15f0d63ebe2134c6bc8163339ba30e75..dcd73e3c3e40f80e07b73944d1f0cc57fea010d3 100644
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -17,6 +17,8 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 class MomentumOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -50,12 +52,17 @@ class MomentumOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("VelocityOut", param_dim);
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter that has to be updated");
diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu
index da4a6af298f61a20e60ff1b8358f30bb0aca2280..5eb9d9950248bb50bb823f071c7fff0ddcc47234 100644
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/momentum_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index e7bed2c39735b66c19e738c91f4977e46571143b..51993398bd3427e1f0da155918395bc50fa65e45 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -13,15 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mul_op.h"
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
 
+using framework::OpKernelType;
 using framework::Tensor;
 
-class MulOpShapeInference : public framework::InferShapeBase {
+class MulOp : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext* ctx) const override {
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -71,8 +76,7 @@ class MulOpShapeInference : public framework::InferShapeBase {
 
 class MulOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor), The first input tensor of mul op.");
     AddInput("Y", "(Tensor), The second input tensor of mul op.");
     AddOutput("Out", "(Tensor), The output tensor of mul op.");
@@ -122,7 +126,7 @@ or not. But the output only shares the LoD information with input $X$.
   }
 };
 
-class MulOpGrad : public framework::OperatorWithKernel {
+class MulGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
@@ -156,11 +160,12 @@ class MulOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker,
-                  ops::MulOpShapeInference,
+REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
+REGISTER_OPERATOR(mul_grad, ops::MulGradOp);
 REGISTER_OP_CPU_KERNEL(
-    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MulKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
-    mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
+    mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MulGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc
index 0667530e943856576ae8c9fe4856cb6aa1448e4e..81f3e42bf412fa4d2cb48405f2f8ee49b6aa0b67 100644
--- a/paddle/fluid/operators/mul_op.cu.cc
+++ b/paddle/fluid/operators/mul_op.cu.cc
@@ -13,9 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mul_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
-REGISTER_OP_CUDA_KERNEL(
-    mul_grad, ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
+                        ops::MulKernel<plat::CUDADeviceContext, double>,
+                        ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
+REGISTER_OP_CUDA_KERNEL(mul_grad,
+                        ops::MulGradKernel<plat::CUDADeviceContext, float>,
+                        ops::MulGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h
index 38311cf87265ad0f1f815734cbf69bd682d62e62..15dd975e3bbf80b2e616e6628555e812d025f70a 100644
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/operators/math/math_function.h"
-
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -46,9 +46,10 @@ class MulKernel : public framework::OpKernel<T> {
     if (z_dim.size() != 2) {
       z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
     }
-    math::matmul<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), x_matrix, false,
-        y_matrix, false, 1, z, 0);
+
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+
+    blas.MatMul(x_matrix, y_matrix, z);
     if (z_dim.size() != 2) {
       z->Resize(z_dim);
     }
@@ -79,6 +80,7 @@ class MulGradKernel : public framework::OpKernel<T> {
     Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     if (dx) {
       dx->mutable_data<T>(ctx.GetPlace());
       Tensor dx_matrix = dx->dims().size() > 2
@@ -86,8 +88,7 @@ class MulGradKernel : public framework::OpKernel<T> {
                              : *dx;
 
       // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-      math::matmul<DeviceContext, T>(dev_ctx, dout_mat, false, y_matrix, true,
-                                     1, &dx_matrix, 0);
+      blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix);
     }
     if (dy) {
       dy->mutable_data<T>(ctx.GetPlace());
@@ -95,8 +96,7 @@ class MulGradKernel : public framework::OpKernel<T> {
                              ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
                              : *dy;
       // dy = x' * dout. dy K x N, dout : M x N, x : M x K
-      math::matmul<DeviceContext, T>(dev_ctx, x_matrix, true, dout_mat, false,
-                                     1, &dy_matrix, 0);
+      blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix);
     }
   }
 };
diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc
deleted file mode 100644
index 0f80f752c95e97ed4d6d299788734de9d29713db..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/multiclass_nms_op.cc
+++ /dev/null
@@ -1,393 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
-constexpr int64_t kOutputDim = 6;
-constexpr int64_t kBBoxSize = 4;
-
-class MultiClassNMSOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("BBoxes"),
-                   "Input(BBoxes) of MultiClassNMS should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Scores"),
-                   "Input(Scores) of MultiClassNMS should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of MultiClassNMS should not be null.");
-
-    auto box_dims = ctx->GetInputDim("BBoxes");
-    auto score_dims = ctx->GetInputDim("Scores");
-
-    PADDLE_ENFORCE_EQ(box_dims.size(), 3,
-                      "The rank of Input(BBoxes) must be 3.");
-    PADDLE_ENFORCE_EQ(score_dims.size(), 3,
-                      "The rank of Input(Scores) must be 3.");
-    PADDLE_ENFORCE_EQ(box_dims[2], 4,
-                      "The 2nd dimension of Input(BBoxes) must be 4, "
-                      "represents the layout of coordinate "
-                      "[xmin, ymin, xmax, ymax]");
-    PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2],
-                      "The 1st dimensiong of Input(BBoxes) must be equal to "
-                      "3rd dimension of Input(Scores), which represents the "
-                      "predicted bboxes.");
-
-    // Here the box_dims[0] is not the real dimension of output.
-    // It will be rewritten in the computing kernel.
-    ctx->SetOutputDim("Out", {box_dims[1], 6});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>("Scores")->type()),
-        platform::CPUPlace());
-  }
-};
-
-template <class T>
-bool SortScorePairDescend(const std::pair<float, T>& pair1,
-                          const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-template <class T>
-static inline void GetMaxScoreIndex(
-    const std::vector<T>& scores, const T threshold, int top_k,
-    std::vector<std::pair<T, int>>* sorted_indices) {
-  for (size_t i = 0; i < scores.size(); ++i) {
-    if (scores[i] > threshold) {
-      sorted_indices->push_back(std::make_pair(scores[i], i));
-    }
-  }
-  // Sort the score pair according to the scores in descending order
-  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
-                   SortScorePairDescend<int>);
-  // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
-    sorted_indices->resize(top_k);
-  }
-}
-
-template <class T>
-static inline T BBoxArea(const T* box, const bool normalized) {
-  if (box[2] < box[0] || box[3] < box[1]) {
-    // If coordinate values are is invalid
-    // (e.g. xmax < xmin or ymax < ymin), return 0.
-    return static_cast<T>(0.);
-  } else {
-    const T w = box[2] - box[0];
-    const T h = box[3] - box[1];
-    if (normalized) {
-      return w * h;
-    } else {
-      // If coordinate values are not within range [0, 1].
-      return (w + 1) * (h + 1);
-    }
-  }
-}
-
-template <class T>
-static inline T JaccardOverlap(const T* box1, const T* box2,
-                               const bool normalized) {
-  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
-      box2[3] < box1[1]) {
-    return static_cast<T>(0.);
-  } else {
-    const T inter_xmin = std::max(box1[0], box2[0]);
-    const T inter_ymin = std::max(box1[1], box2[1]);
-    const T inter_xmax = std::min(box1[2], box2[2]);
-    const T inter_ymax = std::min(box1[3], box2[3]);
-    const T inter_w = inter_xmax - inter_xmin;
-    const T inter_h = inter_ymax - inter_ymin;
-    const T inter_area = inter_w * inter_h;
-    const T bbox1_area = BBoxArea<T>(box1, normalized);
-    const T bbox2_area = BBoxArea<T>(box2, normalized);
-    return inter_area / (bbox1_area + bbox2_area - inter_area);
-  }
-}
-
-template <typename T>
-class MultiClassNMSKernel : public framework::OpKernel<T> {
- public:
-  void NMSFast(const Tensor& bbox, const Tensor& scores,
-               const T score_threshold, const T nms_threshold, const T eta,
-               const int64_t top_k, std::vector<int>* selected_indices) const {
-    // The total boxes for each instance.
-    int64_t num_boxes = bbox.dims()[0];
-    // 4: [xmin ymin xmax ymax]
-    int64_t box_size = bbox.dims()[1];
-
-    std::vector<T> scores_data(num_boxes);
-    std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
-    std::vector<std::pair<T, int>> sorted_indices;
-    GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
-
-    selected_indices->clear();
-    T adaptive_threshold = nms_threshold;
-    const T* bbox_data = bbox.data<T>();
-
-    while (sorted_indices.size() != 0) {
-      const int idx = sorted_indices.front().second;
-      bool keep = true;
-      for (size_t k = 0; k < selected_indices->size(); ++k) {
-        if (keep) {
-          const int kept_idx = (*selected_indices)[k];
-          T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
-                                        bbox_data + kept_idx * box_size, true);
-          keep = overlap <= adaptive_threshold;
-        } else {
-          break;
-        }
-      }
-      if (keep) {
-        selected_indices->push_back(idx);
-      }
-      sorted_indices.erase(sorted_indices.begin());
-      if (keep && eta < 1 && adaptive_threshold > 0.5) {
-        adaptive_threshold *= eta;
-      }
-    }
-  }
-
-  void MultiClassNMS(const framework::ExecutionContext& ctx,
-                     const Tensor& scores, const Tensor& bboxes,
-                     std::map<int, std::vector<int>>& indices,
-                     int& num_nmsed_out) const {
-    int64_t background_label = ctx.Attr<int>("background_label");
-    int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
-    int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
-    T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
-    T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
-    T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
-
-    int64_t class_num = scores.dims()[0];
-    int64_t predict_dim = scores.dims()[1];
-    int num_det = 0;
-    for (int64_t c = 0; c < class_num; ++c) {
-      if (c == background_label) continue;
-      Tensor score = scores.Slice(c, c + 1);
-      NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
-              &(indices[c]));
-      num_det += indices[c].size();
-    }
-
-    num_nmsed_out = num_det;
-    const T* scores_data = scores.data<T>();
-    if (keep_top_k > -1 && num_det > keep_top_k) {
-      std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-      for (const auto& it : indices) {
-        int label = it.first;
-        const T* sdata = scores_data + label * predict_dim;
-        const std::vector<int>& label_indices = it.second;
-        for (size_t j = 0; j < label_indices.size(); ++j) {
-          int idx = label_indices[j];
-          PADDLE_ENFORCE_LT(idx, predict_dim);
-          score_index_pairs.push_back(
-              std::make_pair(sdata[idx], std::make_pair(label, idx)));
-        }
-      }
-      // Keep top k results per image.
-      std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
-                       SortScorePairDescend<std::pair<int, int>>);
-      score_index_pairs.resize(keep_top_k);
-
-      // Store the new indices.
-      std::map<int, std::vector<int>> new_indices;
-      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
-        int label = score_index_pairs[j].second.first;
-        int idx = score_index_pairs[j].second.second;
-        new_indices[label].push_back(idx);
-      }
-      new_indices.swap(indices);
-      num_nmsed_out = keep_top_k;
-    }
-  }
-
-  void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
-                        std::map<int, std::vector<int>>& selected_indices,
-                        Tensor* outs) const {
-    int predict_dim = scores.dims()[1];
-    auto* scores_data = scores.data<T>();
-    auto* bboxes_data = bboxes.data<T>();
-    auto* odata = outs->data<T>();
-
-    int count = 0;
-    for (const auto& it : selected_indices) {
-      int label = it.first;
-      const T* sdata = scores_data + label * predict_dim;
-      const std::vector<int>& indices = it.second;
-      for (size_t j = 0; j < indices.size(); ++j) {
-        int idx = indices[j];
-        const T* bdata = bboxes_data + idx * kBBoxSize;
-        odata[count * kOutputDim] = label;           // label
-        odata[count * kOutputDim + 1] = sdata[idx];  // score
-        // xmin, ymin, xmax, ymax
-        std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
-        count++;
-      }
-    }
-  }
-
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* boxes = ctx.Input<Tensor>("BBoxes");
-    auto* scores = ctx.Input<Tensor>("Scores");
-    auto* outs = ctx.Output<LoDTensor>("Out");
-
-    auto score_dims = scores->dims();
-
-    int64_t batch_size = score_dims[0];
-    int64_t class_num = score_dims[1];
-    int64_t predict_dim = score_dims[2];
-    int64_t box_dim = boxes->dims()[2];
-
-    std::vector<std::map<int, std::vector<int>>> all_indices;
-    std::vector<size_t> batch_starts = {0};
-    for (int64_t i = 0; i < batch_size; ++i) {
-      Tensor ins_score = scores->Slice(i, i + 1);
-      ins_score.Resize({class_num, predict_dim});
-
-      Tensor ins_boxes = boxes->Slice(i, i + 1);
-      ins_boxes.Resize({predict_dim, box_dim});
-
-      std::map<int, std::vector<int>> indices;
-      int num_nmsed_out = 0;
-      MultiClassNMS(ctx, ins_score, ins_boxes, indices, num_nmsed_out);
-      all_indices.push_back(indices);
-      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
-    }
-
-    int num_kept = batch_starts.back();
-    if (num_kept == 0) {
-      T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
-      od[0] = -1;
-    } else {
-      outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
-      for (int64_t i = 0; i < batch_size; ++i) {
-        Tensor ins_score = scores->Slice(i, i + 1);
-        ins_score.Resize({class_num, predict_dim});
-
-        Tensor ins_boxes = boxes->Slice(i, i + 1);
-        ins_boxes.Resize({predict_dim, box_dim});
-
-        int64_t s = batch_starts[i];
-        int64_t e = batch_starts[i + 1];
-        if (e > s) {
-          Tensor out = outs->Slice(s, e);
-          MultiClassOutput(ins_score, ins_boxes, all_indices[i], &out);
-        }
-      }
-    }
-
-    framework::LoD lod;
-    lod.emplace_back(batch_starts);
-
-    outs->set_lod(lod);
-  }
-};
-
-class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("BBoxes",
-             "(Tensor) A 3-D Tensor with shape [N, M, 4] represents the "
-             "predicted locations of M bounding bboxes, N is the batch size. "
-             "Each bounding box has four coordinate values and the layout is "
-             "[xmin, ymin, xmax, ymax].");
-    AddInput("Scores",
-             "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
-             "predicted confidence predictions. N is the batch size, C is the "
-             "class number, M is number of bounding boxes. For each category "
-             "there are total M scores which corresponding M bounding boxes. "
-             " Please note, M is equal to the 1st dimension of BBoxes. ");
-    AddAttr<int>(
-        "background_label",
-        "(int, defalut: 0) "
-        "The index of background label, the background label will be ignored. "
-        "If set to -1, then all categories will be considered.")
-        .SetDefault(0);
-    AddAttr<float>("score_threshold",
-                   "(float) "
-                   "Threshold to filter out bounding boxes with low "
-                   "confidence score. If not provided, consider all boxes.");
-    AddAttr<int>("nms_top_k",
-                 "(int64_t) "
-                 "Maximum number of detections to be kept according to the "
-                 "confidences aftern the filtering detections based on "
-                 "score_threshold");
-    AddAttr<float>("nms_threshold",
-                   "(float, defalut: 0.3) "
-                   "The threshold to be used in NMS.")
-        .SetDefault(0.3);
-    AddAttr<float>("nms_eta",
-                   "(float) "
-                   "The parameter for adaptive NMS.")
-        .SetDefault(1.0);
-    AddAttr<int>("keep_top_k",
-                 "(int64_t) "
-                 "Number of total bboxes to be kept per image after NMS "
-                 "step. -1 means keeping all bboxes after NMS step.");
-    AddOutput("Out",
-              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
-              "detections. Each row has 6 values: "
-              "[label, confidence, xmin, ymin, xmax, ymax], No is the total "
-              "number of detections in this mini-batch. For each instance, "
-              "the offsets in first dimension are called LoD, the number of "
-              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
-              "no detected bbox.");
-    AddComment(R"DOC(
-This operator is to do multi-class non maximum suppression (NMS) on a batched
-of boxes and scores.
-
-In the NMS step, this operator greedily selects a subset of detection bounding
-boxes that have high scores larger than score_threshold, if providing this
-threshold, then selects the largest nms_top_k confidences scores if nms_top_k
-is larger than -1. Then this operator pruns away boxes that have high IOU
-(intersection over union) overlap with already selected boxes by adaptive
-threshold NMS based on parameters of nms_threshold and nms_eta.
-
-Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
-per image if keep_top_k is larger than -1.
-
-This operator support multi-class and batched inputs. It applying NMS
-independently for each class. The outputs is a 2-D LoDTenosr, for each
-image, the offsets in first dimension of LoDTensor are called LoD, the number
-of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
-means there is no detected bbox for this image. If there is no detected boxes
-for all images, all the elements in LoD are 0, and the Out only contains one
-value which is -1.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp,
-                  ops::MultiClassNMSOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc
index b698c1bf8a05e053db07db34712a13c8074ee4d0..18ad46cb5eeeab2169136e40cebdaa53c0bfd587 100644
--- a/paddle/fluid/operators/multiplex_op.cc
+++ b/paddle/fluid/operators/multiplex_op.cc
@@ -61,28 +61,47 @@ class MultiplexOp : public framework::OperatorWithKernel {
 
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Ids", "The index tensor of multiplex operator.");
-    AddInput("X", "The candidate tensors of multiplex operator.")
+  void Make() override {
+    AddInput("Ids",
+             "Tensor<int32>, index variable which is a 2-D tensor with shape "
+             "[M, 1] where M is the batch size.");
+    AddInput("X",
+             "A list of variables to gather from. All variables have the same "
+             "shape and the rank is at least 2.")
         .AsDuplicable();
     AddOutput("Out", "The output tensor of multiplex operator.");
     AddComment(R"DOC(
-Multiplex Operator.
-
-Multiplex multiple tensors according to the index provided by the index tensor.
-
-Ids: the index tensor.
-X[0 : N - 1]: the candidate tensors for output (N >= 2).
-For each index i from 0 to batchSize - 1, the output is the i-th row of the
+Referring to the given index variable, this layer selects rows from the
+input variables to construct a multiplex variable. Assuming that there are
+:math:`m` input variables and :math:`I_i` represents the i-th input
+variable and :math:`i` is in [0, :math:`m`). All input variables are
+tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
+Please note that rank of the input tensor should be at least 2. Each input
+variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
+where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
+* ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
+variable. The given index variable should be a 2-D tensor with shape
+[:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
+Then the output variable will be a tensor with shape [:math:`d_0`,
+:math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
+matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
+row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
+
+* Ids: the index tensor.
+
+* X[0 : N - 1]: the candidate tensors for output (N >= 2).
+
+* For each index i from 0 to batchSize - 1, the output is the i-th row of the
 the (Ids[i])-th tensor.
 
 For i-th row of the output tensor:
 
-$$y[i] = x_{k}[i]$$
+$$
+y[i] = x_{k}[i]
+$$
 
-where `y` is the output tensor, `x_{k}` is the k-th input tensor,
-and `k = Ids[i]`.
+where $y$ is the output tensor, $x_{k}$ is the k-th input tensor,
+and $k = Ids[i]$.
 
 )DOC");
   }
diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu
index 45a2550793511f7cb8c20644ac79e9e88629ce7b..2f8a602f3c5c0a7c262235f99943ce336e20a7b4 100644
--- a/paddle/fluid/operators/multiplex_op.cu
+++ b/paddle/fluid/operators/multiplex_op.cu
@@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+    TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = ctx.cuda_device_context().stream();
     platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
@@ -69,7 +69,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    TensorCopy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+    TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
 
     auto stream = ctx.cuda_device_context().stream();
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h
index 113f93e346681e568524f9fb6a0ab9a56de8569e..558ff4cc09603eebbcd95a234ff1aa63ada7fbb2 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
-#include <condition_variable>
+#include <condition_variable>  // NOLINT
 #include <memory>
-#include <mutex>
+#include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
 #include <vector>
diff --git a/paddle/fluid/operators/nccl_op.cc b/paddle/fluid/operators/nccl_op.cc
index 5e4ed886b10bd48bf991ce84a9099611cf5d1d26..0018139cb06fe0573565c920849843e674df6f4c 100644
--- a/paddle/fluid/operators/nccl_op.cc
+++ b/paddle/fluid/operators/nccl_op.cc
@@ -76,8 +76,7 @@ class NCCLInitOpShapeInference : public framework::InferShapeBase {
 
 class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kParallelScopes, "The working place of parallel do.");
     AddOutput("Communicator",
               "Create Communicator for communicating between gpus");
@@ -118,8 +117,7 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
 // AllReduceOp
 class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of AllReduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of AllReduce op");
@@ -165,8 +163,7 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
 // ReduceOp
 class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of Reduce op");
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Reduce op");
@@ -214,8 +211,7 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
 // BcastOp
 class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input of BcastSend op");
     AddInput("Communicator", "Communicator for communicating between gpus");
     AddOutput("Out", "The output of Bcast");
diff --git a/paddle/fluid/operators/nccl_op.cu.cc b/paddle/fluid/operators/nccl_op.cu.cc
index 4d83a70e7334a84bb98bd52f0172f6b7ecedb58d..8de974bc2b333fb6ccc5b5f0bb1af86533139925 100644
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
@@ -106,6 +106,8 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     T* recvbuffer = nullptr;
     if (root == gpu_id) {
       recvbuffer = out->mutable_data<T>(ctx.GetPlace());
+    } else {
+      out->Resize(framework::make_ddim({0}));
     }
     VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
             << " recv " << out->numel();
@@ -133,8 +135,9 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
       auto* x = ctx.Input<LoDTensor>("X");
       VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
       PADDLE_ENFORCE(platform::dynload::ncclBcast(
-          (void*)x->data<T>(), x->numel(), NCCLTypeWrapper<T>::type, root,
-          comm->comms().at(idx), ctx.cuda_device_context().stream()));
+          reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
+          NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
+          ctx.cuda_device_context().stream()));
       VLOG(3) << "gpu : " << gpu_id << " finished Bcast.";
     } else {
       auto* out = ctx.Output<LoDTensor>("Out");
diff --git a/paddle/fluid/operators/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl_op_test.cu.cc
index 90f6f955cea51ded2dbb2bde459113458d7749a4..ef54d79fdf2becde98c68044d14bd4347773b975 100644
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
@@ -15,8 +15,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <memory>
-#include <mutex>
-#include <thread>
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
 #include <vector>
 
 #include "paddle/fluid/framework/init.h"
@@ -43,7 +43,7 @@ const f::DDim kDims = {20, 20};
 // nccl op common tester, init communicator.
 class NCCLTester : public ::testing::Test {
  public:
-  virtual void SetUp() override {
+  void SetUp() override {
     int count = p::GetCUDADeviceCount();
     if (count <= 1) {
       LOG(WARNING)
@@ -64,7 +64,7 @@ class NCCLTester : public ::testing::Test {
     NCCLInitOp();
   }
 
-  virtual void TearDown() override {
+  void TearDown() override {
     for (auto &device_context : dev_ctxs_) {
       delete device_context;
     }
@@ -137,6 +137,8 @@ class NCCLTester : public ::testing::Test {
 TEST_F(NCCLTester, ncclInitOp) {}
 
 // ncclAllReduceOp with desc
+// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
+/*
 TEST_F(NCCLTester, ncclAllReduceOp) {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   op2->SetType("ncclAllReduce");
@@ -184,6 +186,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
     }
   }
 }
+*/
 
 // ncclReduceOp with desc
 TEST_F(NCCLTester, ncclReduceOp) {
@@ -225,10 +228,8 @@ TEST_F(NCCLTester, ncclReduceOp) {
   result_tensor->Resize(kDims);
   auto *ct = result_tensor->mutable_data<float>(cpu_place);
 
-  paddle::memory::Copy(
-      cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
-      recv_tensor.numel() * sizeof(float),
-      static_cast<p::CUDADeviceContext *>(dev_ctxs_[kRoot])->stream());
+  paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[kRoot]), rt,
+                       recv_tensor.numel() * sizeof(float), nullptr);
 
   for (int64_t j = 0; j < f::product(kDims); ++j) {
     ASSERT_NEAR(ct[j], expected_result, 1e-5);
@@ -236,6 +237,8 @@ TEST_F(NCCLTester, ncclReduceOp) {
 }
 
 // ncclBcastOp with desc
+// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
+/*
 TEST_F(NCCLTester, ncclBcastOp) {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   const int kRoot = 0;
@@ -281,3 +284,4 @@ TEST_F(NCCLTester, ncclBcastOp) {
     ASSERT_NEAR(ct[j], result, 1e-5);
   }
 }
+*/
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 99f38529bbb5a36cd944a01940b5579195f2d601..e471f04662a1fa3e8e77a2db37f0da4521682018 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/nce_op.h"
 
+#include <vector>
+
 namespace paddle {
 namespace operators {
 
@@ -73,8 +75,7 @@ class NCEOp : public framework::OperatorWithKernel {
 
 class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
     AddInput(
         "Label",
@@ -127,8 +128,10 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
                               "user should avoid setting this attribute.")
         .SetDefault({});
     AddComment(R"DOC(
-Compute and return the noise-contrastive estimation training loss.
-See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+Compute and return the noise-contrastive estimation training loss. See 
+`Noise-contrastive estimation: A new estimation principle for unnormalized 
+statistical models 
+ <http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf>`_.
 By default this operator uses a uniform distribution for sampling.
 )DOC");
   }
@@ -179,7 +182,9 @@ class NCEOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
+REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad);
 REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
                        ops::NCEKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(nce_grad,
diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
index 94207638473374ddf7e23d211d6cde93f112f492..2c4c97f28bc0b511d6eaa8f79a3a4efc9be8a5da 100644
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <math.h>
 #include <random>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -108,7 +109,7 @@ class NCEKernel : public framework::OpKernel<T> {
     auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
     for (int64_t i = 0; i < sample_labels->numel(); ++i) {
       Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
-          (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) *
+          (input_mat.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
            weight_mat.chip(sample_labels_data[i], 0))
               .sum();
       sample_out_data[i] += result(0);
@@ -190,7 +191,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
       auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
       for (int64_t i = 0; i < sample_labels->numel(); ++i) {
         d_w_matrix.chip(sample_labels_data[i], 0) +=
-            x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) *
+            x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) *
             sample_grad_data[i];
       }
     }
@@ -202,7 +203,7 @@ class NCEGradKernel : public framework::OpKernel<T> {
       auto d_x_matrix = EigenMatrix<T>::From(*d_x);
       auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
       for (int64_t i = 0; i < sample_labels->numel(); ++i) {
-        d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) +=
+        d_x_matrix.chip(static_cast<int>(i / sample_labels->dims()[1]), 0) +=
             w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
       }
     }
diff --git a/paddle/fluid/operators/net_op.cc b/paddle/fluid/operators/net_op.cc
deleted file mode 100644
index 0c2da744177b602246719d701257fc1b509ad81e..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/net_op.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/net_op.h"
-#include <set>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-const char NetOp::kAll[] = "all";
-
-void NetOp::CompleteAddOp(bool calc) {
-  add_op_done_ = true;
-  if (!calc) return;
-  std::set<std::string> input_set;
-  std::set<std::string> output_set;
-  for (auto& op : ops_) {
-    for (auto& ipt : op->Inputs()) {
-      for (auto& var_name : ipt.second) {
-        // If input variable has been in output set, then it will be
-        // added into intermediate_outputs_. Otherwise, it will be
-        // added into input set.
-        if (Contains(output_set, var_name)) {
-          intermediate_outputs_.insert(var_name);
-        } else {
-          input_set.insert(var_name);
-        }
-      }
-    }
-
-    for (auto& opt : op->Outputs()) {
-      for (auto& var_name : opt.second) {
-        output_set.insert(var_name);
-      }
-    }
-  }
-  auto& inputs = inputs_[kAll];
-  inputs.reserve(input_set.size());
-  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs));
-  auto& outputs = outputs_[kAll];
-  outputs.reserve(output_set.size());
-  std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs));
-}
-
-std::string NetOp::DebugStringEx(const framework::Scope* scope) const {
-  std::ostringstream os;
-  os << OperatorBase::DebugStringEx(scope) << std::endl;
-  for (auto& op : ops_) {
-    std::istringstream is(op->DebugStringEx(scope));
-    for (std::string line; std::getline(is, line);) {
-      os << "    " << line << std::endl;
-    }
-  }
-  return os.str();
-}
-
-bool NetOp::IsNetOp() const { return true; }
-
-std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
-  std::vector<std::string> all;
-  for (auto& pair : this->outputs_) {
-    for (auto& var_name : pair.second) {
-      all.push_back(var_name);
-    }
-  }
-  if (has_intermediate) {
-    return all;
-  }
-  std::vector<std::string> ret_val;
-  for (auto& each : all) {
-    if (!Contains(intermediate_outputs_, each)) {
-      ret_val.push_back(each);
-    }
-  }
-  return ret_val;
-}
-
-NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-             const framework::VariableNameMap& outputs,
-             const framework::AttributeMap& attrs)
-    : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
-std::unique_ptr<framework::OperatorBase> NetOp::Clone() const {
-  PADDLE_ENFORCE(
-      add_op_done_,
-      "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone");
-  return std::unique_ptr<OperatorBase>(new NetOp(*this));
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/net_op.h b/paddle/fluid/operators/net_op.h
deleted file mode 100644
index cbf8820cf4991bc24893f13646364dea0955a128..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/net_op.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <set>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-/**
- * @brief Network is also a type of Operator
- *
- * It will manage the operators it has.
- *
- * Network is the container and controller of a set of operators.
-
- * A network object knows all Operators belonging to this network. Variables,
- * which are inputs and outputs of these operators, are created and managed by a
- * hierarchy of Scope objects.
- *
- * This is the base class of network, all the networks should implement the APIs
- * it defines.
- */
-class NetOp : public framework::OperatorBase {
- public:
-  static const char kAll[];
-  NetOp()
-      : framework::OperatorBase("plain_net", framework::VariableNameMap{},
-                                framework::VariableNameMap{},
-                                framework::AttributeMap{}) {}
-
-  NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-        const framework::VariableNameMap& outputs,
-        const framework::AttributeMap& attrs);
-
-  NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
-    this->ops_.reserve(o.ops_.size());
-    std::transform(
-        o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_),
-        [](const std::unique_ptr<framework::OperatorBase>& op) {
-          return std::unique_ptr<framework::OperatorBase>(op->Clone());
-        });
-    this->CompleteAddOp();
-  }
-
-  bool SupportGPU() const override {
-    for (auto& op : ops_) {
-      if (!op->SupportGPU()) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
-
-  /**
-   * @brief Add an operator by ptr
-   */
-  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot AppendOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    ops_.push_back(std::move(op));
-  }
-
-  void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot InsertOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");
-    ops_.insert(ops_.begin() + pos, std::move(op));
-  }
-
-  void InsertOp(size_t pos, const framework::OperatorBase& op) {
-    InsertOp(pos, op.Clone());
-  }
-
-  void CompleteAddOp(bool calculate = true);
-
-  std::string DebugStringEx(
-      const framework::Scope* scope = nullptr) const override;
-
-  bool IsNetOp() const override;
-  std::vector<std::string> OutputVars(bool has_intermediate) const override;
-
-  std::unique_ptr<framework::OperatorBase> Clone() const override;
-
-  std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
-
- private:
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators with the `scope`, if no scope is provided, default
-   * scope will be used instead. If no OpContext is provicded, default context
-   * will be used.
-   */
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    for (auto& op : ops_) {
-      op->Run(scope, place);
-    }
-  }
-
-  bool add_op_done_{false};
-  std::set<std::string> intermediate_outputs_;
-
-  template <typename T, typename KeyType>
-  static bool Contains(T container, KeyType key) {
-    return container.find(key) != container.end();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc
deleted file mode 100644
index 3b5f57548585398c441fd8038ba8b053c27392cf..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/net_op_test.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/net_op.h"
-
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace operators {
-using Scope = framework::Scope;
-using DeviceContext = platform::DeviceContext;
-
-static int run_cnt = 0;
-
-class TestOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-  DEFINE_OP_CLONE_METHOD(TestOp);
-
- private:
-  void RunImpl(const Scope& scope,
-               const platform::Place& place) const override {
-    ++run_cnt;
-  }
-};
-
-template <typename T>
-void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
-                                  const std::vector<T>& actual) {
-  ASSERT_EQ(expected.size(), actual.size());
-  std::unordered_set<T> expected_set;
-  for (auto& tmp : expected) {
-    expected_set.insert(tmp);
-  }
-  for (auto& act : actual) {
-    ASSERT_NE(expected_set.end(), expected_set.find(act));
-  }
-}
-
-TEST(OpKernel, all) {
-  auto net = std::make_shared<NetOp>();
-  ASSERT_NE(net, nullptr);
-
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                 {{"Out", {"y"}}}, framework::AttributeMap{})));
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-                 {{"Out", {"z"}}}, framework::AttributeMap{})));
-
-  net->CompleteAddOp();
-  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
-                               net->Inputs(NetOp::kAll));
-  AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll));
-
-  auto final_outs = net->OutputVars(false);
-
-  ASSERT_EQ(final_outs.size(), 1UL);
-  ASSERT_EQ(final_outs[0], "z");
-}
-
-TEST(NetOp, insert_op) {
-  NetOp net;
-  auto op1 = std::unique_ptr<framework::NOP>(
-      new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                         {{"Out", {"y"}}}, framework::AttributeMap{}));
-  net.AppendOp(*op1);
-  net.InsertOp(0, *op1);
-  ASSERT_EQ(2UL, net.ops_.size());
-  net.InsertOp(2, std::move(op1));
-  ASSERT_EQ(3UL, net.ops_.size());
-}
-
-TEST(NetOp, Clone) {
-  NetOp net;
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty2", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.CompleteAddOp(true);
-  auto new_net_op = net.Clone();
-  ASSERT_NE(new_net_op, nullptr);
-  ASSERT_TRUE(new_net_op->IsNetOp());
-  auto* new_net = static_cast<NetOp*>(new_net_op.get());
-  ASSERT_EQ(2UL, new_net->ops_.size());
-  ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
-  ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
index 5345c5bdb0f1e2d96233595f89028993606d2399..aa19c62c83648814e86b1e7062424be3693e4b98 100644
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -16,41 +16,34 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename AttrType>
 class NormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "X",
-        "(Tensor) The input tensor of norm operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of feature.");
-    AddInput("Scale",
-             "(Tensor) The input tensor of norm operator. "
-             "The format of input tensor is C * 1.");
-    AddAttr<AttrType>("epsilon",
-                      "(float, default 1e-10) Constant "
-                      "for numerical stability.")
+  void Make() override {
+    AddInput("X", "(Tensor) A tensor of rank >= axis.");
+    AddAttr<int>("axis",
+                 "The axis on which to apply normalization. If axis < 0, "
+                 "the dimension to normalization is rank(X) + axis. -1 is "
+                 "the last dimension.");
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-10) The epsilon value is used "
+                   "to avoid division by zero.")
         .SetDefault(1.0e-10f);
-    AddOutput("Out",
-              "(Tensor) The output tensor of norm operator."
-              "N * M."
-              "M = C * H * W");
+    AddOutput("Norm",
+              "(Tensor) A tensor saved the `sqrt(sum(x) + epsion)` will "
+              "be used in backward kernel.")
+        .AsIntermediate();
+    AddOutput("Out", "(Tensor) A tensor of the same shape as X.");
     AddComment(R"DOC(
-       "Input shape: $(N, C, H, W)$
-        Scale shape: $(C, 1)$
-        Output shape: $(N, C, H, W)$
-        Where
-        forward
-          $$
-            [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot  \cdot  \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
-          $$
-        backward
-          $$
-            \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
-          $$
-        )DOC");
+
+Given a tensor, apply 2-normalization along the provided axis.
+
+$$
+y = \frac{x}{ \sqrt{\sum {x^2} + epsion }}
+$$
+
+where, $\sum {x^2}$ is calculated along the `axis` dimension.
+        
+)DOC");
   }
 };
 
@@ -59,15 +52,15 @@ class NormOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of NormOp"
-                   "should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Scale"),
-                   "Input(Scale) of NormOp"
-                   "should not be null.");
+                   "Input(X) of NormOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of NormOp should not be null.");
-    auto in_x_dims = ctx->GetInputDim("X");
-    ctx->SetOutputDim("Out", in_x_dims);
+    auto xdim = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", xdim);
+    int axis = ctx->Attrs().Get<int>("axis");
+    if (axis < 0) axis = xdim.size() + axis;
+    xdim[axis] = 1;
+    ctx->SetOutputDim("Norm", xdim);
   }
 };
 
@@ -85,11 +78,12 @@ class NormOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
-            ops::NormOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
-REGISTER_OP_CPU_KERNEL(
-    norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(norm_grad, ops::NormOpGrad);
+REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel<CPU, float>,
+                       ops::NormKernel<CPU, double>);
+REGISTER_OP_CPU_KERNEL(norm_grad, ops::NormGradKernel<CPU, float>,
+                       ops::NormGradKernel<CPU, double>);
diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu
index d1d9be50742b54a3b6f068fd43ec4b16696183bf..1d0021d33ff9ee65c3366183466b94266e6c2999 100644
--- a/paddle/fluid/operators/norm_op.cu
+++ b/paddle/fluid/operators/norm_op.cu
@@ -16,9 +16,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/norm_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
-REGISTER_OP_CUDA_KERNEL(
-    norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
+using CUDA = paddle::platform::CUDADeviceContext;
+
+REGISTER_OP_CUDA_KERNEL(norm, ops::NormKernel<CUDA, float>,
+                        ops::NormKernel<CUDA, double>);
+REGISTER_OP_CUDA_KERNEL(norm_grad, ops::NormGradKernel<CUDA, float>,
+                        ops::NormGradKernel<CUDA, double>);
diff --git a/paddle/fluid/operators/norm_op.h b/paddle/fluid/operators/norm_op.h
index 0ad29e8a0385c46a07842930378ed7a040564437..3167bdc8ac718b23435690577e4163826d14a332 100644
--- a/paddle/fluid/operators/norm_op.h
+++ b/paddle/fluid/operators/norm_op.h
@@ -19,156 +19,110 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T, typename AttrType = T>
+inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
+                    int* post) {
+  *pre = 1;
+  *post = 1;
+  *n = dim[axis];
+  for (int i = 0; i < axis; ++i) {
+    (*pre) *= dim[i];
+  }
+  for (int i = axis + 1; i < dim.size(); ++i) {
+    (*post) *= dim[i];
+  }
+}
+
+template <typename DeviceContext, typename T>
 class NormKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
-    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
-    out->mutable_data<T>(context.GetPlace());
-    int batch_size = in_x->dims()[0];
-    int channels = in_x->dims()[1];
-    int height = in_x->dims()[2];
-    int width = in_x->dims()[3];
-    int fea_len = height * width;
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    auto x =
-        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
-    // get square
-    framework::Tensor x_square;
-    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
-    auto x_square_eigen =
-        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            x_square, framework::make_ddim({batch_size, fea_len * channels}));
-    x_square_eigen.device(*place) = x.square();
-    auto scale_eigen =
-        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
-            *scale);
-    for (int n = 0; n < batch_size; ++n) {
-      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
-      auto in_x_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              in_x_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
-      auto x_square_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              x_square_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor out_batch = out->Slice(n, n + 1);
-      auto out_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              out_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor tmp_tensor;
-      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
-                                 context.GetPlace());
-      auto tmp = framework::EigenVector<T, Eigen::RowMajor,
-                                        Eigen::DenseIndex>::Flatten(tmp_tensor);
-      // get colsum and sqrt , inverse
-      auto dim = Eigen::array<int, 1>({{0}});
-      tmp.device(*place) = x_square_batch_eigen.sum(dim);
-      tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
-      Eigen::array<int, 2> broadcast_dim_col;
-      broadcast_dim_col[1] = 1;
-      broadcast_dim_col[0] = channels;
-      out_batch_eigen.device(*place) =
-          in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col));
-      Eigen::array<int, 2> broadcast_dim_row;
-      broadcast_dim_row[1] = fea_len;
-      broadcast_dim_row[0] = 1;
-      out_batch_eigen.device(*place) =
-          out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
-    }
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_x = ctx.Input<framework::Tensor>("X");
+    auto* out_y = ctx.Output<framework::Tensor>("Out");
+    auto* out_norm = ctx.Output<framework::Tensor>("Norm");
+    out_y->mutable_data<T>(ctx.GetPlace());
+    out_norm->mutable_data<T>(ctx.GetPlace());
+
+    auto xdim = in_x->dims();
+    auto ndim = out_norm->dims();
+    T eps = static_cast<T>(ctx.Attr<float>("epsilon"));
+    int axis = ctx.Attr<int>("axis");
+    if (axis < 0) axis = xdim.size() + axis;
+    int pre, n, post;
+    GetDims(xdim, axis, &pre, &n, &post);
+
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+
+    Eigen::DSizes<int, 3> shape(pre, n, post);
+    Eigen::DSizes<int, 2> norm_shape(pre, post);
+
+    auto x_e = framework::EigenVector<T>::Flatten(*in_x);
+    auto y_e = framework::EigenVector<T>::Flatten(*out_y);
+    auto norm_e = framework::EigenVector<T>::Flatten(*out_norm);
+    auto x = x_e.reshape(shape);
+    auto y = y_e.reshape(shape);
+    auto norm = norm_e.reshape(norm_shape);
+
+    Eigen::DSizes<int, 1> rdim(1);
+    // y = x / sqrt((sum(x * x) + epsilon))
+    // norm = sqrt(sum(x * x) + epsilon)
+    auto sum = x.pow(2).sum(rdim) + eps;
+    norm.device(*place) = sum.sqrt();
+    // y = x / norm
+    Eigen::DSizes<int, 3> rshape(pre, 1, post);
+    Eigen::DSizes<int, 3> bcast(1, n, 1);
+    y.device(*place) = x / norm.reshape(rshape).broadcast(bcast);
   }
 };
 template <typename DeviceContext, typename T, typename AttrType = T>
 class NormGradKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
-    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
-    const framework::Tensor* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
-    framework::Tensor* in_x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    in_x_grad->mutable_data<T>(context.GetPlace());
-    int batch_size = in_x->dims()[0];
-    int channels = in_x->dims()[1];
-    int height = in_x->dims()[2];
-    int width = in_x->dims()[3];
-    int fea_len = height * width;
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-
-    auto scale_eigen =
-        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
-            *scale);
-    auto x =
-        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
-    // get square
-    framework::Tensor x_square;
-    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
-    auto x_square_eigen =
-        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-            x_square, framework::make_ddim({batch_size, fea_len * channels}));
-    x_square_eigen.device(*place) = x.square();
-
-    for (int n = 0; n < batch_size; ++n) {
-      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
-      auto in_x_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              in_x_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
-      auto in_g_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              in_g_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
-      auto x_square_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              x_square_batch, framework::make_ddim({channels, fea_len}));
-      framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
-      auto outg_batch_eigen =
-          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
-              outg_batch, framework::make_ddim({channels, fea_len}));
-
-      framework::Tensor tmp_tensor;
-      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
-                                 context.GetPlace());
-      auto tmp_eigen =
-          framework::EigenVector<T, Eigen::RowMajor,
-                                 Eigen::DenseIndex>::Flatten(tmp_tensor);
-      auto dim = Eigen::array<int, 1>({{0}});
-      tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
-      framework::Tensor norm_tmp_tensor;
-      norm_tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
-                                      context.GetPlace());
-      auto norm_tmp_eigen =
-          framework::EigenVector<T, Eigen::RowMajor,
-                                 Eigen::DenseIndex>::Flatten(norm_tmp_tensor);
-      norm_tmp_eigen.device(*place) =
-          (x_square_batch_eigen.sum(dim) + epsilon).sqrt();
-      Eigen::array<int, 2> broadcast_dim_col;
-      broadcast_dim_col[1] = 1;
-      broadcast_dim_col[0] = channels;
-      in_g_batch_eigen.device(*place) =
-          in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col);
-      in_g_batch_eigen.device(*place) =
-          in_g_batch_eigen /
-          (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col);
-      in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen;
-      // outg_batch_eigen + (in_g_batch_eigen * -1);
-      in_g_batch_eigen.device(*place) =
-          in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col);
-      Eigen::array<int, 2> broadcast_dim_row;
-      broadcast_dim_row[1] = fea_len;
-      broadcast_dim_row[0] = 1;
-      in_g_batch_eigen.device(*place) =
-          in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
-    }
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_x = ctx.Input<framework::Tensor>("X");
+    auto* in_norm = ctx.Input<framework::Tensor>("Norm");
+    auto* in_dy = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* out_dx = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    out_dx->mutable_data<T>(ctx.GetPlace());
+
+    auto xdim = in_x->dims();
+    int axis = ctx.Attr<int>("axis");
+    if (axis < 0) axis = xdim.size() + axis;
+    int pre, n, post;
+    GetDims(xdim, axis, &pre, &n, &post);
+
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+
+    auto x_e = framework::EigenVector<T>::Flatten(*in_x);
+    auto dy_e = framework::EigenVector<T>::Flatten(*in_dy);
+    auto norm_e = framework::EigenVector<T>::Flatten(*in_norm);
+    auto dx_e = framework::EigenVector<T>::Flatten(*out_dx);
+
+    Eigen::DSizes<int, 3> shape(pre, n, post);
+    Eigen::DSizes<int, 2> norm_shape(pre, post);
+    auto x = x_e.reshape(shape);
+    auto dy = dy_e.reshape(shape);
+    auto norm = norm_e.reshape(norm_shape);
+    auto dx = dx_e.reshape(shape);
+
+    framework::Tensor rsum;
+    rsum.mutable_data<T>({pre, post}, ctx.GetPlace());
+    auto sum = framework::EigenTensor<T, 2>::From(rsum);
+
+    Eigen::DSizes<int, 1> rdim(1);
+    Eigen::DSizes<int, 3> bcast(1, n, 1);
+    Eigen::DSizes<int, 3> rshape(pre, 1, post);
+
+    // dx = ( dy/sqrt(sum(x*x)) ) * [1 - x*sum(x) / (sum(x*x) + e)]
+    //    = [dy - dy * x * sum(x) / (sum(x*x) + e)] / sqrt(sum(x*x))
+    //    = [dy - x * sum(x*dy) / (sum(x*x) + e)] / sqrt(sum(x*x))
+    // 1. sum = sum(x*dy)
+    sum.device(*place) = (x * dy).sum(rdim);
+    // 2. dx = x * sum
+    dx.device(*place) = sum.reshape(rshape).broadcast(bcast) * x;
+    // 3. dx / (sum(x*x) + e)
+    // where, norm.pow(2) = sum(x*x) + e, which is calculated in forward.
+    dx.device(*place) = dx / norm.pow(2).broadcast(bcast);
+    // 4. [dy - dx] / sqrt(sum(x*x))
+    dx.device(*place) = (dy - dx) / norm.broadcast(bcast);
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc
index 1d42dfdd765166c9596abc08ce8abd534453bc63..4fcb1d69935175c3f643db7a4da04db34492f8fb 100644
--- a/paddle/fluid/operators/one_hot_op.cc
+++ b/paddle/fluid/operators/one_hot_op.cc
@@ -46,8 +46,7 @@ class OneHotOp : public framework::OperatorWithKernel {
 
 class OneHotOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  OneHotOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor, LoDTensor<int>) Input variable with rank at least 2. "
              "The last dimension of X should be 1. Each value of X is an index "
diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu
index 240ac895e2c8391322411d347384f4834995eb7c..625065692c1f32c89d9e566d00051e237ac9a3af 100644
--- a/paddle/fluid/operators/one_hot_op.cu
+++ b/paddle/fluid/operators/one_hot_op.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/one_hot_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/op_documentation/batch_norm_op.md b/paddle/fluid/operators/op_documentation/batch_norm_op.md
deleted file mode 100644
index d1392619c42d9206bf4bddcd33ad11b033e6cbdb..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/op_documentation/batch_norm_op.md
+++ /dev/null
@@ -1,134 +0,0 @@
-# Batch Normalization
-
-## What is batch normalization
-
-Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and make the data distribution easier for next layer's training. 
-
-The principle of batch normalization can be summarized into a simple function:
-
-```
-y = (x - E[x]) / STD[x]) * scale + bias
-```
-
-`x` is a batch of output data of a certain layer. `E[x]` and `STD[x]` is the mean and standard deviation of `x`, respectively。 `scale` and `bias` are two trainable parameters. The training of batch normalization layer equals to the learning of best values of `scale` and `bias`.
-
-In our design, we use a single operator(`batch_norm_op`) to implement the whole batch normalization in C++, and wrap it as a layer in Python.
-
-## Differences with normal operators
-
-`batch_norm_op` is a single operator. However, there are a few differences between `BatchNormOp` and normal operators, which we shall take into consideration in our design.
-
-1. `batch_norm_op` shall behave differently in training and inferencing. For example, during inferencing, there is no batch data and it's impossible to compute `E[x]` and `STD[x]`, so we have to use an `estimated_mean` and an `estimated_variance` instead of them. These require our framework to be able to inform operators current running type (training/inferencing), then operators can switch their behaviors.
-
-2. `batch_norm_op` shall have the ability to maintain `estimated_mean` and `estimated_variance` across mini-batch. In each mini-batch, `estimated_mean` is iterated by the following equations:
-
-```
-if batch_id == 0
-  estimated_mean = E[x]
-else
-  estimated_mean = estimated_mean * momentum + (1.0 - momentum_) * E[x]
-```
-
-The iterating of `estimated_variance` is similar. `momentum` is an attribute, which controls estimated_mean updating speed.
-
-## Implementation
-
-Batch normalization is designed as a single operator is C++, and then wrapped as a layer in Python.
-
-### C++
-
-As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes and compute kernels.
-
-#### Inputs
-
-- `x`: The inputs data, which is generated by the previous layer.
-- `estimated_mean`: The estimated mean of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `E[x]`.
-- `estimated_var`: The estimated standard deviation of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `STD[x]`.
-- `scale`: trainable parameter 'scale'
-- `bias`: trainable parameter 'bias'
-
-#### Outputs
-
-- `y`: The output data.
-- `batch_mean`: The mean value of batch data.
-- `batch_var`: The standard deviation value of batch data.
-- `saved_mean`: Updated `estimated_mean` with current batch data. It's supposed to share the memory with input `estimated_mean`.
-- `saved_var`: Updated `estimated_var` with current batch data. It's supposed to share the memory with input `estimated_var`.
-
-#### Attributes
-
-- `is_infer`: *bool*. If true, run `batch_norm_op` in inferencing mode.
-- `use_global_est`: *bool*. If true, use `saved_mean` and `saved_var` instead of `E[x]` and `STD[x]` in trainning.
-- `epsilon`: *float*. The epsilon value to avoid division by zero.
-- `momentum`: *float*. Factor used in `estimated_mean` and `estimated_var` updating. The usage is shown above.
-
-#### Kernels
-
-The following graph showes the training computational process of `batch_norm_op`:
-
-<img src="../images/batch_norm_op_kernel.png" width="800"/>
-
-cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
-
-### Python
-
-`batch_norm_op` is warpped as a layer in Python:
-
-```python 
-def batch_norm_layer(net, 
-                     input,
-                     output, 
-                     scale, 
-                     bias, 
-                     use_global_est = False, 
-                     epsilon = 1e-6,
-                     momentum = 0.99):
-	mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
-	var_cache = scop.new_var(name = 'estimated_var', trainable = False)
-	batch_mean = scope.new_var(name = 'batch_mean')
-	batch_var = scope.new_var(name = 'batch_var')
-	batch_norm_op = Operator('batch_norm_op',
-	                         x = input,
-	                         estimated_mean = mean_cache,
-	                         estimated_mean = var_cache,
-	                         scale = scale,
-	                         bias = bias,
-	                         y = output,
-	                         batch_mean = batch_mean,
-	                         batch_var = batch_var,
-	                         saved_mean = mean_cache,
-	                         saved_var = var_cache,
-	                         is_infer = False,
-	                         use_global_est = use_global_est,
-	                         epsilon = epsilon,
-	                         momentum = momentum)
-	net.append_op(batch_norm_op)
-	return output
-```
-
-Because Python API has not been finally decided, the code above can be regarded as pseudo code. There are a few key points we shall note:
-
-1. `estimated_mean` and `estimated_var` are assigned the same variables with `saved_mean` and `saved_var` respectively. So they share same the memories. The output mean and variance values(`saved_mean` and `saved_var`) of a certain batch will be the inputs(`estimated_mean` and `estimated_var`) of the next batch.
-
-2. `is_infer` decided whether `batch_norm_op` will run in training mode or inferencing mode. However, a network may contains both training and inferencing parts. And user may switch `batch_norm_op`'s running mode in Python `for` loop like this:
-
-```python
-for pass_id in range(PASS_NUM):
-    # ...
-    net.train()  # run training model
-    if pass_id % 100 == 0:
-        net.infer(test_image)    # run inferencing model
-    # ...
-``` 
-
-`is_infer` is an attribute. Once an operator is created, its attributes can not be changed. It suggests us that we shall maintain two `batch_norm_op` in the model, one's `is_infer` is `True`(we call it `infer_batch_norm_op`) and the other one's is `False`(we call it `train_batch_norm_op`). They share all parameters and variables, but be placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches, one go through `train_batch_norm_op` and the other one go through `infer_batch_norm_op`:
-
-<div align=center>
-<img src="../images/batch_norm_fork.png" width="500"/>
-</div>
-
-Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate. 
-
-When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore right branch automatically. When the net runs in inferencing mode, the process is reversed.
-
-How to set a target is related to Python API design, so I will leave it here waiting for more discussions.
diff --git a/paddle/fluid/operators/op_documentation/name_convention.md b/paddle/fluid/operators/op_documentation/name_convention.md
deleted file mode 100644
index a02b356f058da68442516c2705d0bac140f8ef18..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/op_documentation/name_convention.md
+++ /dev/null
@@ -1,65 +0,0 @@
-## Operator's Parameter Name Convention
-
-To make the operator document itself more clear, we recommend operator names obey the listing conventions.
-
-### OpProtoMaker names
-
-When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator.
-
-- Input/Output.
-  - Input/Output names follow the **CamelCase**. e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Input/Output much more like Variables, we prefer to meaningful English words.
-  - If an operator's Input/Output are tensors in math, not match to any meaningful words, input name should starts from `X`. e.g. `X`, `Y`, and output name should starts from `Out`. e.g. `Out`. This rule intends making operators which have few inputs/outputs unified.
-
-- Attribute.
-  - Attribute name follows the **snake_case**. e.g. `x`, `y`, `axis`, `rowwise_matrix`. Also, attribute name prefers to meaningful English words.
-
-- Comments.
-  - Input/Output/Attr comment follow the format of **(type,default value) usage**, corresponding to which type it can be and how it will be used in the operator. e.g.  Attribute in Accumulator`"gamma" `,`(float, default 1.0) Accumulation multiplier`.
-  - Operator comment format of` R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is math calculation in this operator, you should write the equation in the comment. e.g. `Out = X + Y`.
-
-- Order.
-  - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
-
-### Best Practice
-
-Here we give some examples to show how these rules will be used.
-
-- The operator has one input, one output. e.g.`relu`, inputs: `X`, outputs: `Out`.
-
-- The operator has two input, one output. e.g. `rowwise_add`, inputs : `X`, `Y`, outputs : `Out`.
-
-- The operator contains attribute. e.g. `cosine`, inputs : `X`, `axis`, outputs : `Out`.
-
-  We give a full example of Accumulator Operator.
-
-```c++
-class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
-public:
-  AccumulateOpMaker(OpProto *proto,
-                    OpAttrChecker *op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
-    If the output size is not the same as input size,
-    the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
-    AddOutput("Out", "(Tensor) Accumulated output tensor");
-    AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
-    AddComment(R"DOC(
-Accumulate Operator.
-
-This operator accumulates the input tensor to the output tensor. If the
-output tensor already has the right size, we add to it; otherwise, we first
-initialize the output tensor to all zeros, and then do accumulation. Any
-further calls to the operator, given that no one else fiddles with the output
-in the interim, will do simple accumulations.
-
-Accumulation is done as follows:
-
-Out = 1*X + gamma*Out
-
-where X is the input tensor, Out is the output tensor and gamma is the multiplier
-argument.
-
-)DOC");
-  }
-};
-```
diff --git a/paddle/fluid/operators/op_documentation/net_op_design.md b/paddle/fluid/operators/op_documentation/net_op_design.md
deleted file mode 100644
index a5f0483081e8a03b2d001a551fcc02bbd392016d..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/op_documentation/net_op_design.md
+++ /dev/null
@@ -1,250 +0,0 @@
-# Network Design
-
-`Network` is the container and controller of a set of operators,
-user can build a real network from a `NetDesc` which is a protobuf message 
-and use `Network.Run()` to run all the operators in the network.
-
-A network object knows all Operators belonging to this network. Variables, 
-which are inputs and outputs of these operators, 
-are created and managed by a hierarchy of Scope objects.
-
-# API
-
-## Net
-To make the `Network` extendable, a base class is defined like this
-
-```c++
-// operator's index stored in a network.
-typedef int OpIndex;
-
-// The minimum a network should be implemented.
-class Net {
- public:
-  // run all the operators and return success(true) or not, with all the
-  // variables are located in `scope`. `context` describes the detail execution
-  // environment for ops. `begin` and `end` specify the scope of `ops_` to run,
-  // If no positive indexes are provided, all operators in `ops_` will run.
-  virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1,
-                   OpIndex end = -1) const = 0;
-
-  // Add an Operator according to `def`.
-  virtual OpIndex AddOp(const proto::OpDef &def) = 0;
-
-  // Add optimizer operators acctording to `attrs`.
-  virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
-
-  // Add backward operators.
-  virtual Error AddBackwardOps() = 0;
-
-  // Infer the shapes of variables required by operators in the network. The
-  // `scope` will be mutated according to the inferred shapes.
-
-  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
-};
-```
-
-All network implementations should build networks from a protobuf message which 
-describes the structure of a real network; `Run` method should be implemented by 
-all implementations to offer a universal method to forward or backward compute a network.
-
-`Net::Create` is a method of factory pattern and can be implemented like
-
-```c++
-std::unique<Net> Net::Create(const NetDesc& def) {
-  switch (def.model_type()) {
-    case NN:
-      return new Network(def);
-    case Recursive:
-      return new RecursiveNet(def);
-    case Recurrent:
-      return new RecurrentNet(def);
-  }
-  return nullptr;
-}
-```
-
-Network is designed as the container of operators. to make it more extendable,
-we decouple it from the related variable resources. 
-
-`Run(Scope* scope)` takes the scope as a argument so that it can run in different scopes.
-
-Finally, `Net` can be used as followed
-
-```c++
-Scope default_scope;
-OpContext default_context;
-auto net = Net::CreateNet(def);
-
-if (net) {
-  net.Run(&default_scope, &default_context);
-}
-```
-
-## `PlainNet` as a simple implementation of `BaseNet`
-
-A very basic implementation is as follows. All it does is simply to run every operators in sequence.
-
-```c++
-class PlainNet : public Net {
- public:
-  // Create a network describe by `def`.  NetDesc is the definition of a network.
-  PlainNet(const NetDesc &def);
-
-  // Infer all the operators' input and output varialbes' shapes, will be called before every mini-batch
-  training.
-  virtual Error InferShape(Scope *scope) override;
-
-  // Run all the operators with the `scope`, if no scope is provided, default
-  // scope will be used instead. If no OpContext is provicded, default context will be used.
-  virtual Error Run(Scope *scope = nullptr, OpContext *context=nullptr, OpIndex begin = -1,
-                   OpIndex end = -1) const override;
-
-  virtual OpIndex AddOp(const proto::OpDef &def) override;
-
-  virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
-
-  virtual Error AddBackwardOps() override;
-
- protected:
-  // Create operators accordding to `def`, will be called by the constructor.
-  Error BuildNet(const NetDesc &def);
-
-  // Add a operator which is identified as `type` and has attributes described
-  // in `attrs`, the `inputs` are the keys of readonly input variables,
-  // `outputs` are keys of mutable output variables. An `OpIndex` will be
-  // returned to indicate the offset of the new operator in `ops_`.
-  OpIndex AddOp(const std::string &type, const std::vector<string> &inputs,
-                const std::vector<string> &outputs,
-                const OprAttr &attrs = OprAttr());
-
- private:
-  // the operators owned by `Network`.
-  std::vector<Operator> ops_;
-};
-```
-
-`PlainNet` will create operators so that a private member `ops_` is defined,
-the operators are created by `CreateNet`, and each operator is created by `AddOp`.
-
-
-## PlainNet Usage
-`PlainNet` can be used to define and run a network as follows
-
-```c++
-// create an empty scope located on CPU device.
-Scope scope(CPUPlace());
-
-// create and init variables described in `net_desc`.
-scope.CreateVariables(net_desc);
-scope.InitVariables(net_desc);
-
-// create a network according to `net_desc`
-auto net = Net::CreateNet(net_desc);
-// Add more operators if needed.
-net->AddOp(add...);
-net->AddOp(fc...);
-
-net->AddBackwardOps();
-net->AddOptimizerOps();
-
-// run the network providing the `scope`.
-net.Run(&scope);
-```
-
-## `NetBuilder` as a C++ syntax wrapper
-This is a detailed description of the user-related C++ network API, and may not needed in the prototype development stage.
-
-The `NetBuilder` will give users a much simpler syntax as follows to create a network, and demonstrates how to use the `BaseNet`'s raw interfaces.
-
-```c++
-Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid");
-Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid");
-Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label);
-Variable* avg_loss = builder.AddOp("mean", loss);
-
-builder.BackwardFrom(avg_loss)
-builder.AddOptimization(1e-4, "adam");
-builder.Run();
-```
-
-`NetBuilder` will call `Net` 's virtual functions to change the real network structure, here is a sample definition
-
-```c++
-class NetBuilder final {
- public:
-  NetBuilder(Net* net) : net_(net) {}
-
-  Variable* AddOp(const string& type, const vector<Variable>& inputs,
-                  size_t size, Activation act) {
-    // much code here.
-    // ...
-    net_->AddOp(def);
-    need_rebuild_net_ = true;
-    net_->InferShape();
-    // ...
-  }
-
-  Error BackwardFrom(const Variable& cost);
-
-  Error Run(Scope* scope, OpContext* context, bool need_backward = true) {
-    // backward.
-    if (need_backward) {
-      if (need_rebuild_net_) {
-        AddBackwardOps();
-        AddOptimizerOps();
-      }
-      net_->Run(scope, context);
-      return;
-    }
-    // just forward.
-    net_->Run(scope, context, 0, last_forward_op_);
-  }
-
- protected:
-  Error AddBackwardOps();
-  Error AddOptimizerOps();
-
- private:
-  Net* net_;
-  OpIndex last_forward_op_{-1};
-  bool need_rebuild_net_{true};
-}
-```
-
-## Compatibility with RNN
-
-Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design, 
-for example we can implement a simple recurrent neural network as follows
-
-```c++
-// copy some `vars` form `source` to `target`
-void Copy(const Scope &source, Scope &target,
-          const std::vector<std::string> &vars);
-
-Scope default_scope;
-// some initial mutations on `default_scope` here.
-
-auto rnn_step_net = PlainNet(rnn_step_net_def);
-
-// Create rnn's states, the last scope is used to store rnn outputs.
-Scope *rnn_states = new Scope[num_states + 1];
-
-for (int i = 0; i < num_states + 1; i++) {
-  // Initialize all rnn state scopes, copy parameters and so on.
-  rnn_states[i].CreateVars(rnn_step_net_def);
-  Copy(default_scope, rnn_states[i], rnn_related_vars);
-  // Prepare rnn's inlinks, just copy inlink variables to each state.
-  Copy(default_scope, rnn_states[i], inlink_vars);
-}
-
-// Run the rnn.
-for (int i = 0; i < num_states; i++) {
-  rnn_step_net.Run(rnn_states[i]);
-  // Copy current state's state variables to next state, the related variables
-  // are named like "previous_state_xxx".
-  Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars)
-}
-
-// Copy rnn's final outputs to `default_scope`.
-Copy(rnn_states[num_states], default_scope, outlink_vars);
-```
diff --git a/paddle/fluid/operators/op_documentation/op_markdown_format.md b/paddle/fluid/operators/op_documentation/op_markdown_format.md
deleted file mode 100644
index 0ee804d592252c727622cbe59b0644813db3c4fd..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/op_documentation/op_markdown_format.md
+++ /dev/null
@@ -1,64 +0,0 @@
-# Standard Markdown Format for Operators
-The following should be the standard format for documentation for all the operators that will get rendered in the `html`:
-
-```
-Operator Name (In PaddlePaddle)
-
-Operator Name (Standard)
-
-Operator description.
-
-LaTeX equation of how the operator performs an update.
-
-The signature of the operator.
-```
-
-Each section mentioned above has been covered in further detail in the rest of the document.
-
-# PaddlePaddle Operator Name
-This should be in all small letters, in case of multiple words, we separate them with an underscore. For example:
-`array to lod tensor` should be written as `array_to_lod_tensor`.
-
-This naming convention should be standard across all PaddlePaddle operators.
-
-# Standard Operator Name
-This is the standard name of the operator as used in the community. The general standard is usually:
-- Standard abbreviations like `SGD` are written in all capital letters.
-- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
-- Keep numbers inside a word as is, with no boundary delimiters.
-- Follow the name of the operator with the keyword: `Activation Operator.`
-
-# Operator description
-This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
-
-# LaTeX equation
-This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`).
-
-# The signature
-This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
-`Section :
-VariableName : (VariableType) VariableDescription
-...
-...
-`
-
-
-The following example for an `sgd` operator covers the above mentioned sections as they would ideally look like in the `html`:
-
-```
-sgd
-
-SGD operator
-
-This operator implements one step of the stochastic gradient descent algorithm.
-
-param_out = param_learning_rate * grad
-
-Inputs:
-Param : (Tensor) Input parameter
-LearningRate : (Tensor) Learning rate of SGD
-Grad : (Tensor) Input gradient
-
-Outputs:
-ParamOut : (Tensor) Output parameter
-```
diff --git a/paddle/fluid/operators/op_documentation/rnn_design.md b/paddle/fluid/operators/op_documentation/rnn_design.md
deleted file mode 100644
index 3d38b9a0ad225fd8e0c1bb037474b292b1887f5b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/op_documentation/rnn_design.md
+++ /dev/null
@@ -1,239 +0,0 @@
-# RNN 变长输入设计
-对变长序列的学习，现有主流框架比如 tensorflow, pytorch, caffe2, mxnet 等均使用了padding的方式，
-即将一个mini-batch内不同长度的序列补0到固定长度参与计算。
-
-现有Paddle包括 `RecurrentLayerGroup` 在内的RNN均实现了无padding的变长序列支持，本文也将基于该模块的思路，设计重构后的变长序列支持。
-
-## 背景介绍
-由于tensor必须有明确的shape，因此基于tensor 的主流框架在存储变长序列时，
-必须用zero-padding的方式将变长序列补全为固定shape的tensor。
-
-由于padding是一种框架实现变长序列的妥协， 从用户角度，在使用RNN类模型时自然会比较介意padding的存在，
-因此会有pytorch中对非padding方式变长序列支持长篇的讨论[3]。
-
-由于padding对内存和计算会有额外的消耗，tensorflow和mxnet均使用了bucketing来进行优化[1][2]，
-但不管是padding还是bucket，对于用户都是额外的使用负担。
-
-因此，**paddle原生支持变长序列的方式，能直接满足用户对变长序列的最直接的需求，在当前主流平台中可以算是一大优势**。
-
-但对变长序列的支持，需要对目前框架做一些修改，下面讨论如何在最小修改下支持变长序列。
-
-## 多层序列数据格式 `LODTensor`
-目前 Paddle 会将一个mini-batch内的数据存储在一维的内存上，
-额外使用 `Argument.sequenceStartPositions` 来存储每个句子的信息。
-
-Paddle里使用 `Argument.subSequenceStartPositions` 来存储2层的序列信息，更高维度的序列则无法直接支持；
-
-为了支持 `N-level` 序列的存储，本文将序列信息定义成如下数据结构:
-
-```c++
-std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
-```
-
-或者更明确的定义
-
-```c++
-typedef std::vector<int> level_t;
-std::vector<level_t> lod_start_pos;
-```
-
-这里的每一个 `level_t` 存储一个粒度(level)的偏移信息，和paddle目前做法一致。
-
-为了更透明地传递序列信息，我们引入了一种新的tensor 称为 `LODTensor`[4]，
-其关于tensor相关的接口都直接继承自 `Tensor`，但另外添加了序列相关接口。
-如此，在操作一个 `LODTensor` 时，普通 `Op` 直接当成 `Tensor` 使用，
-而操作序列的 `Op` 会额外操作 `LODTensor` 的变长序列操作的相关接口。
-
-`LODTensor` 具体定义如下：
-
-```c++
-class LODTensor : public Tensor {
-public:
-  size_t Levels() const { return seq_start_positions_.size(); }
-  size_t Elements(int level = 0) const {
-    return seq_start_positions_[level].size();
-  }
-  // slice of level[elem_begin: elem_end]
-  // NOTE low performance in slice seq_start_positions_.
-  // TODO should call Tensor's Slice.
-  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
-
-  // slice with tensor's data shared with this.
-  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
-
-  // copy other's lod_start_pos_, to share LOD info.
-  // NOTE the LOD info sould not be changed.
-  void ShareConstLODFrom(const LODTensor &other) {
-    lod_start_pos_ = other.lod_start_pos_;
-  }
-  // copy other's lod_start_pos_'s content, free to mutate.
-  void ShareMutableLODFrom(const LODTensor &other) {
-    lod_start_pos_ = std::make_shared <
-                     std::vector<std::vector<int>>(other.lod_start_pos_.begin(),
-                                                   other.lod_start_pos_.end());
-  }
-
-private:
-  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
-};
-```
-
-其中， `lod_start_pos_` 使用了 `shared_ptr` 来减少存储和复制的代价，
-可以认为 `LODTensor` 是 `Tensor` 的扩展，几乎完全兼容原始 `Tensor` 的使用。
-
-## 框架支持
-### 框架现有的 `Tensor` 调用替换为 `LODTensor`
-为了实现 `LODTensor` 的传递，框架里很多 `Tensor` 都需要变成 `LODTensor`，
-简单实现，直接 **把之前所有的`Tensor` 全部替换成 `LODTensor`，这里可以直接修改 `pybind.cc` 里面创建`Tensor`的接口**。
-
-此外，用户有可能需要感知序列的存在（比如序列的可视化需要解析模型中输出的序列），因此一些序列操作的API也需要暴露到 python 层。
-
-### `lod_start_pos` 随着Op调用链传递
-框架需要支持下列特性，以实现`lod_start_pos`的传递：
-
-1. 以 `shared_ptr` 的方式实现传递
-    - 不修改 `lod_start_pos` 内容的作为 consumer
-    - 修改 `lod_start_pos` 的作为 producer
-    - 约定 consumer 只需要复制传递过来的 `shared_ptr`
-      - producer 需要创建自己的独立的内存，以存储自己独立的修改，并暴露 `shared_ptr` 给后续 consumer
-    - 由于传递过程是以复制`shared_ptr`的方式实现，因此框架只需要传递一次 `lod_start_pos`
-
-2. 对于不感知 `lod_start_pos` 的Op足够透明
-3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据 
-
-具体的设计分为以下3小节
-
-#### `load_start_pos` 的传递
-
-- 对于不需要修改 `lod_start_pos` 的情况，调用 LODTensor的 `ShareConstLODFrom` 接口实现复制
-- 需要修改的，调用`ShareMutableLODFrom` 接口自己分配内存以存储修改
-
-#### 框架透明
-传递这一步需要加入到网络跑之前的初始化操作中，并且只需要初始化一次，基于当前框架设计的初步方案如下
-
-- 在 Op 的 `attrs` 中添加一项 `do_mutate_lod_info` 的属性，默认为 `false`
-  - 有需要修改 `lod_start_pos` 的Op需要在定义 `OpProto` 时设置为 `true`
-- `OperatorBase` 的 `InferShape` 中会读取 `do_mutate_lod_info` ，并且调用 `LODTensor` 相关的方法实现 `lod_start_pos` 的复制。
-- `OperatorBase` 中添加一个 member `is_lod_inited{false}` 来保证传递只进行一次
-
-一些逻辑如下
-
-```c++
-class OperatorBase {
-public:
-  // ...
-  void InferShape() {
-    if (!is_load_inited) {
-      bool do_mutate_lod_info = GetAttr<bool>("do_mutate_load_info");
-      // find a input having LOD to copy
-      auto lod_input = ValidLODInput();
-      for (auto &output : outputs) {
-        if (do_mutate_load_info) {
-          output.ShareMutableLODFrom(lod_input);
-        } else {
-          output.ShareConstLODFrom(load_input);
-        }
-      }
-      is_pod_inited = true;
-    }
-
-    // call op's InferShape
-    // ...
-  }
-
-private:
-  // ...
-  bool is_lod_inited{false};
-};
-```
-
-如此，`lod_start_pos` 的信息的传递对非OLD的Op的实现是完全透明的。
-
-#### `lod_start_pos` 的更新
-上一小节介绍到，对于需要修改 `load_start_pos` 的Op，`OperatorBase` 会分配一块自己的内存以存储修改，
-Op在 `Run` 的实现中，操作更新自己的 `load_start_pos` ，
-而所有依赖其 outputs 的 op 会通过共享的指针自动获取到其更新。
-
-## 根据长度排序
-按照长度排序后，从前往后的时间步的batch size会自然地递减，可以直接塞入 Net 做batch计算
-
-比如原始的输入：
-
-```
-origin:
-xxxx
-xx
-xxx
-
--> sorted:
-xxxx
-xxx
-xx
-```
-
-经过 `SegmentInputs` 之后，每个会有4个时间步，每个时间步的输入如下（纵向排列）
-
-```
-0    1    2    3
-x    x    x    x
-x    x    x
-x    x
-```
-
-为了追踪排序前后序列的变化，这里用
-```c++
-struct SortedSeqItem {
-   void *start{nullptr};
-   void *end{nullptr};
-};
-
-std::vector<SortedSeqItem> sorted_seqs;
-```
-来追踪序列排序后的位置，并添加一个新的接口 
-
-```c++
-std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
-```
-
-由于输入序列的顺序变化，以下现有的接口需要针对性地修改：
-
-- InitMemories, memory需要根据 `sorted_seqs` 重新排列
-- SetmentInputs
-- ConcatOutputs
-
-此外，由于 `sorted_seqs` 需要被 `RecurrentGradientOp` 复用，因此会变成 `RecurrentOp` 一个新的output输出，
-之后作为 `RecurrentGradientOp` 的一个输入传入。
-
-## InitMemories
-由于序列顺序的变化，`boot_memories` 的batch上的element的顺序也需要对应重新排列。
-
-## SegmentInputs
-`SegmentInputs` 会依赖 `sorted_seqs` 的信息，将原始的序列按照排序后的序列顺序，从横向切割，转为每个step中的inputs。
-
-即下面的转变：
-```
-origin:
-xxxx
-xx
-xxx
-
-   |
-   |
-  \ /
-   !
-0    1    2    3
-x    x    x    x
-x    x    x
-x    x
-```
-## ConcatOutputs
-`ConcatOutputs` 需要
-
-- 将每个时间步的输出重新还原为原始输入的序列顺序（以防止Infer阶段顺序打乱）
-- 将每个序列concat 为规则的mini-batch表示
-
-## 参考文献
-1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
-2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
-3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
-4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
index d2a0106f80144e3550d73ea22f8e012426eb01ae..d4b631a6f5bf9332f4ed1d1a4bda529fbb6ada0a 100644
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -48,8 +48,7 @@ class PadOp : public framework::OperatorWithKernel {
 
 class PadOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PadOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input of pad op. "
              "The input should be a k-D tensor(k > 0 and k < 7)");
diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h
index a36abe3789574cb64f05001e34d534cf352a60b2..c93c096575a30dd9344894ead4b81acc16930e21 100644
--- a/paddle/fluid/operators/pad_op.h
+++ b/paddle/fluid/operators/pad_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc
index bf4d0476df32d7454d4064cb6ee454e6ad5d6fc5..c9744db3d0654ef63357963d9a9a3cb946f56e2d 100644
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -143,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase {
       PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
                      "Only support parameter type as LoDTensor");
       auto &src = scope.FindVar(param)->Get<LoDTensor>();
-      for (size_t i = 0; i < sub_scopes.size(); ++i) {
+
+      auto *sub_scope0 = sub_scopes[0];
+      auto *dst0 = sub_scope0->Var(param)->GetMutable<LoDTensor>();
+      dst0->ShareDataWith(src);
+
+      for (size_t i = 1; i < sub_scopes.size(); ++i) {
         auto &place = places[i];
         auto *sub_scope = sub_scopes[i];
         auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
@@ -158,11 +164,14 @@ class ParallelDoOp : public framework::OperatorBase {
       auto &place = places[place_idx];
       auto *cur_scope = sub_scopes[place_idx];
 
-      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
-        framework::Executor executor(place);
-        executor.Run(*program, cur_scope, block->ID(),
-                     false /*create_local_scope*/);
-      }));
+      workers.emplace_back(
+          framework::Async([program, cur_scope, place, block, place_idx] {
+            // Give the thread an id to distinguish parallel block with same id.
+            platform::RecordThread rt(static_cast<int>(place_idx) + 1);
+            framework::Executor executor(place);
+            executor.Run(*program, cur_scope, block->ID(),
+                         false /*create_local_scope*/);
+          }));
     }
     for (auto &worker : workers) {
       worker.wait();
@@ -187,8 +196,7 @@ class ParallelDoOp : public framework::OperatorBase {
 
 class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kInputs, "").AsDuplicable();
     AddInput(kParameters, "").AsDuplicable();
     AddInput(kPlaces, "");
@@ -234,11 +242,14 @@ class ParallelDoGradOp : public framework::OperatorBase {
       auto *cur_scope = sub_scopes[i];
 
       // execute
-      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
-        framework::Executor executor(place);
-        executor.Run(*program, cur_scope, block->ID(),
-                     false /*create_local_scope*/);
-      }));
+      workers.emplace_back(
+          framework::Async([program, cur_scope, place, block, i] {
+            // Give the thread an id to distinguish parallel block with same id.
+            platform::RecordThread rt(static_cast<int>(i) + 1);
+            framework::Executor executor(place);
+            executor.Run(*program, cur_scope, block->ID(),
+                         false /*create_local_scope*/);
+          }));
     }
     for (auto &worker : workers) {
       worker.wait();
@@ -284,7 +295,7 @@ class ParallelDoGradOp : public framework::OperatorBase {
 
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
-            framework::AttributeMap{});
+            framework::AttributeMap{{"use_mkldnn", {false}}});
         VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]);
         sum_op->Run(*sub_scopes[0], places[0]);
         WaitOnPlace(places[0]);
@@ -352,7 +363,7 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
       }
     }
     grad->SetAttrMap(this->Attrs());
-    grad->SetBlockAttr(kParallelBlock, *grad_block_[0]);
+    grad->SetBlockAttr(kParallelBlock, grad_block_[0]);
 
     return std::unique_ptr<framework::OpDesc>(grad);
   }
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index 781d96981e4c033d9287ab3de9860dfd9fcd2875..be55bc43b14f1e6211f71b4080d1676838ad508c 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -24,6 +24,8 @@ using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
 using DataLayout = platform::DataLayout;
 using PoolingMode = platform::PoolingMode;
+template <typename T>
+using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
 template <typename T>
 class PoolCUDNNOpKernel : public framework::OpKernel<T> {
@@ -78,8 +80,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel<T> {
 
     // ------------------- cudnn pool algorithm ---------------------
     auto handle = ctx.cuda_device_context().cudnn_handle();
-    T alpha = 1.0f, beta = 0.0f;
-
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward(
         handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
         cudnn_output_desc, output_data));
@@ -134,7 +135,11 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
 
     PoolingMode pooling_mode;
     if (pooling_type == "max") {
-      pooling_mode = PoolingMode::kMaximum;
+      if (FLAGS_cudnn_deterministic) {
+        pooling_mode = PoolingMode::kMaximumDeterministic;
+      } else {
+        pooling_mode = PoolingMode::kMaximum;
+      }
     } else {
       pooling_mode = PoolingMode::kAverage;
     }
@@ -144,8 +149,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
 
     // ------------------- cudnn pool algorithm ---------------------
     auto handle = ctx.cuda_device_context().cudnn_handle();
-    T alpha = 1.0f, beta = 0.0f;
-
+    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
     if (input_grad) {
       T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
       // Because beta is zero, it is unnecessary to reset input_grad.
@@ -162,17 +166,20 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 
-REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace,
+REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<double>);
-REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::PoolCUDNNOpKernel<double>,
+                   ops::PoolCUDNNOpKernel<plat::float16>);
+REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNGradOpKernel<float>,
                    ops::PoolCUDNNGradOpKernel<double>);
 
-REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace,
+REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNOpKernel<float>,
-                   ops::PoolCUDNNOpKernel<double>);
-REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::PoolCUDNNOpKernel<double>,
+                   ops::PoolCUDNNOpKernel<plat::float16>);
+REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
                    ops::PoolCUDNNGradOpKernel<float>,
                    ops::PoolCUDNNGradOpKernel<double>);
diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc
index c88578570c1acdecaa97dd8b12a702778fef2b7e..5341187d1ce9400ac34750ab691608e76158ae0d 100644
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -18,6 +18,34 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using framework::DataLayout;
+using mkldnn::memory;
+using mkldnn::pooling_backward;
+using mkldnn::pooling_forward;
+using mkldnn::primitive;
+using mkldnn::reorder;
+using mkldnn::stream;
+using platform::to_void_cast;
+
+// Generate keys for storing/retriving primitives for this operator
+// TODO(jczaja): Make hashing function more optimial
+static std::string gethash(const memory::dims& input_dims,
+                           const std::string& pooling_type,
+                           const std::vector<int>& ksize,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::string& suffix) {
+  auto dims2str = [](const memory::dims& operand_dims) {
+    std::string dstr = "";
+    for (size_t i = 0; i < operand_dims.size(); ++i) {
+      dstr += std::to_string(operand_dims[i]) + "-";
+    }
+    return dstr;
+  };
+  return dims2str(input_dims) + dims2str(ksize) + dims2str(strides) +
+         dims2str(paddings) + pooling_type + suffix;
+}
+
 template <typename T>
 class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
@@ -32,12 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* input = ctx.Input<Tensor>("X");
     Tensor* output = ctx.Output<Tensor>("Out");
 
-    // Get an unique name from "argument" name of "Out" variable
-    // This name will be used as key when saving info into device context
-    const std::string key = ctx.op().Output("Out");
-    const std::string key_pool_pd = key + "@pool_pd";
-    const std::string key_pool_workspace_memory =
-        key + "@pool_workspace_memory";
+    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
+                       input->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input tensor");
 
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
@@ -63,36 +88,84 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
     std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
 
-    // TODO(pzelazko-intel): support more formats
-    auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
-                                          mkldnn::memory::format::nchw);
-    auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
-                                          mkldnn::memory::format::nchw);
-
-    std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
-        CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
-                            pooling_type, mkldnn_engine);
-
-    // save pool_pd into global device context to be referred in backward path
-    dev_ctx.SetBlob(key_pool_pd, pool_pd);
-
-    std::shared_ptr<mkldnn::memory> workspace_memory =
-        CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
-
-    // save pool_workspace_memory to be referred in backward path
-    dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
+    auto input_format = input->format();
+    memory::format output_format{memory::format::format_undef};
 
-    auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
-    auto dst_memory =
-        mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
+    const std::string key = gethash(src_tz, pooling_type, ksize, strides,
+                                    paddings, ctx.op().Output("Out"));
+    const std::string key_pool_p = key + "@pool_p";
+    const std::string key_pool_pd = key + "@pool_pd";
+    const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
+    const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p";
+    const std::string key_pool_workspace_memory =
+        key + "@pool_workspace_memory";
 
-    auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
-                                             *workspace_memory);
+    auto pool_p =
+        std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
+    if (pool_p == nullptr) {
+      auto src_md = platform::MKLDNNMemDesc(
+          src_tz, platform::MKLDNNGetDataType<T>(), input_format);
+
+      /* create memory descriptor for pooling without specified format
+       * ('any') which lets a primitive (pooling in this case) choose
+       * the memory format preferred for best performance
+       */
+      auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
+                                            mkldnn::memory::format::any);
+
+      std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
+          CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
+                              pooling_type, mkldnn_engine);
+
+      // save pool_pd into global device context to be referred in backward path
+      dev_ctx.SetBlob(key_pool_pd, pool_pd);
+
+      std::shared_ptr<mkldnn::memory> workspace_memory =
+          CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
+
+      // save pool_workspace_memory to be referred in backward path
+      dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
+
+      auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
+                                                 to_void_cast<T>(input_data));
+      auto dst_memory =
+          std::make_shared<memory>(pool_pd->dst_primitive_desc(), output_data);
+
+      dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
+      dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory);
+
+      pool_p = std::make_shared<pooling_forward>(*pool_pd, *(src_memory.get()),
+                                                 *(dst_memory.get()),
+                                                 *workspace_memory);
+
+      dev_ctx.SetBlob(key_pool_p, pool_p);
+
+      output_format =
+          (memory::format)dst_memory->get_primitive_desc().desc().data.format;
+    } else {
+      // Primitives already exist
+      auto pool_src_memory_p =
+          std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
+      PADDLE_ENFORCE(pool_src_memory_p != nullptr,
+                     "Fail to find pooling src mem_p in device context");
+      auto pool_dst_memory_p =
+          std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
+      PADDLE_ENFORCE(pool_dst_memory_p != nullptr,
+                     "Fail to find pooling dst mem_p in device context");
+      pool_src_memory_p->set_data_handle(to_void_cast<T>(input_data));
+      pool_dst_memory_p->set_data_handle(output_data);
+
+      output_format = (memory::format)pool_dst_memory_p->get_primitive_desc()
+                          .desc()
+                          .data.format;
+    }
 
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{pool_prim};
-    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+    std::vector<mkldnn::primitive> pipeline{*(pool_p.get())};
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(output_format);
   }
 
  private:
@@ -118,9 +191,10 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     mkldnn::memory::primitive_desc workspace_md =
         pooling_type == "max"
             ? pool_pd->workspace_primitive_desc()
-            : mkldnn::memory::primitive_desc(
-                  {{}, mkldnn::memory::f32, mkldnn::memory::format::nchw},
-                  engine);
+            : mkldnn::memory::primitive_desc({{},
+                                              platform::MKLDNNGetDataType<T>(),
+                                              mkldnn::memory::format::nchw},
+                                             engine);
 
     auto p_workspace_memory = new mkldnn::memory(workspace_md);
     return std::unique_ptr<mkldnn::memory>(p_workspace_memory);
@@ -138,12 +212,12 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
 
-    // Get an unique name from "argument" name of "Out" variable
-    // This name will be used as key when referring info from device context
-    const std::string key = ctx.op().Input("Out");
-    const std::string key_pool_pd = key + "@pool_pd";
-    const std::string key_pool_workspace_memory =
-        key + "@pool_workspace_memory";
+    PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN &&
+                       in_x->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input X tensor");
+    PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN &&
+                       out_grad->format() != memory::format::format_undef,
+                   "Wrong layout/format set for Input output_grad tensor");
 
     std::string pooling_type = ctx.Attr<std::string>("pooling_type");
     std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
@@ -163,55 +237,139 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 
     const T* out_grad_data = out_grad->data<T>();
     T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace());
+    memory::format in_x_grad_format{memory::format::format_undef};
 
     std::vector<int> diff_src_tz =
         paddle::framework::vectorize2int(in_x_grad->dims());
     std::vector<int> diff_dst_tz =
         paddle::framework::vectorize2int(out_grad->dims());
 
-    auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-    auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32,
-                                               mkldnn::memory::format::nchw);
-
-    // Retrieve pool_pd/pool_workspace_memory from device context
-    auto pool_pd =
-        std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
-            dev_ctx.GetBlob(key_pool_pd));
-    PADDLE_ENFORCE(pool_pd != nullptr,
-                   "Fail to find pool_pd in device context");
+    // Get an unique name from "argument" name of "Out" variable
+    // This name will be used as key when referring info from device context
+    const std::string key = gethash(diff_src_tz, pooling_type, ksize, strides,
+                                    paddings, ctx.op().Input("Out"));
+    const std::string key_pool_bwd_p = key + "@pool_bwd_p";
+    const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p";
+    const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p";
+    const std::string key_pool_src_mem_p = key + "@pool_src_mem_p";
+    const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p";
+    const std::string key_pool_pd = key + "@pool_pd";
+    const std::string key_pool_workspace_memory =
+        key + "@pool_workspace_memory";
 
-    auto workspace_memory = std::static_pointer_cast<mkldnn::memory>(
-        dev_ctx.GetBlob(key_pool_workspace_memory));
-    PADDLE_ENFORCE(workspace_memory != nullptr,
-                   "Fail to find workspace_memory in device context");
+    auto user_diff_dst_memory =
+        memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()},
+                mkldnn_engine},
+               to_void_cast<T>(out_grad_data));
 
-    auto pool_bwd_desc = mkldnn::pooling_backward::desc(
-        pooling_type == "max" ? mkldnn::algorithm::pooling_max
-                              : mkldnn::algorithm::pooling_avg,
-        diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
-        mkldnn::padding_kind::zero);
-    auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
-        pool_bwd_desc, mkldnn_engine, *pool_pd);
+    std::shared_ptr<memory> diff_src_memory;
+    std::shared_ptr<memory> diff_dst_memory;
+    auto dst_memory =
+        std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_dst_mem_p));
+    PADDLE_ENFORCE(dst_memory != nullptr,
+                   "Fail to find dst_memory in device context");
+
+    primitive reorder_diff_dst;
+    bool is_diff_dst_reordered = false;
+    auto pool_bwd_p = std::static_pointer_cast<pooling_backward>(
+        dev_ctx.GetBlob(key_pool_bwd_p));
+    if (pool_bwd_p == nullptr) {
+      // Retrieve src_memory/dst_memory saved in forward pass
+      auto src_memory =
+          std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
+      PADDLE_ENFORCE(src_memory != nullptr,
+                     "Fail to find src_memory in device context");
+      // Retrieve pool_pd/pool_workspace_memory from device context
+      auto pool_pd =
+          std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
+              dev_ctx.GetBlob(key_pool_pd));
+      PADDLE_ENFORCE(pool_pd != nullptr,
+                     "Fail to find pool_pd in device context");
+      auto workspace_memory = std::static_pointer_cast<memory>(
+          dev_ctx.GetBlob(key_pool_workspace_memory));
+      PADDLE_ENFORCE(workspace_memory != nullptr,
+                     "Fail to find workspace_memory in device context");
+
+      // create memory descriptors for pooling
+      auto diff_src_md = src_memory.get()->get_primitive_desc().desc();
+      auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc();
+
+      auto pool_bwd_desc = mkldnn::pooling_backward::desc(
+          pooling_type == "max" ? mkldnn::algorithm::pooling_max
+                                : mkldnn::algorithm::pooling_avg,
+          diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
+          mkldnn::padding_kind::zero);
+      auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
+          pool_bwd_desc, mkldnn_engine, *pool_pd);
+
+      // reorder between user_diff_dst and pool diff_dst if needed
+      diff_dst_memory = std::make_shared<memory>(user_diff_dst_memory);
+      if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory =
+            std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
+        reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
+        is_diff_dst_reordered = true;
+      }
 
-    auto diff_src_memory =
-        mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
-    auto diff_dst_memory =
-        mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
+      diff_src_memory = std::make_shared<memory>(
+          pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data);
+
+      dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory);
+      dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory);
+
+      pool_bwd_p = std::make_shared<pooling_backward>(
+          pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory,
+          *(diff_src_memory));
+      dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p);
+
+    } else {
+      // Primitives already exist
+      diff_src_memory = std::static_pointer_cast<memory>(
+          dev_ctx.GetBlob(key_pool_diff_src_mem_p));
+      PADDLE_ENFORCE(diff_src_memory != nullptr,
+                     "Fail to find pooling src mem_p in device context");
+      diff_dst_memory = std::static_pointer_cast<memory>(
+          dev_ctx.GetBlob(key_pool_diff_dst_mem_p));
+      PADDLE_ENFORCE(diff_dst_memory != nullptr,
+                     "Fail to find pooling dst mem_p in device context");
+
+      diff_src_memory->set_data_handle(reinterpret_cast<void*>(in_x_grad_data));
+      diff_dst_memory->set_data_handle(const_cast<T*>(out_grad_data));
+
+      // reorder between user_diff_dst and pool diff_dst if needed
+      if (memory::primitive_desc(dst_memory->get_primitive_desc()) !=
+          user_diff_dst_memory.get_primitive_desc()) {
+        diff_dst_memory =
+            std::make_shared<memory>(dst_memory.get()->get_primitive_desc());
+        reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory);
+        is_diff_dst_reordered = true;
+      }
+    }
 
-    auto bwd_prim = mkldnn::pooling_backward(
-        pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
+    in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc()
+                           .desc()
+                           .data.format;
 
     // push primitive to stream and wait until it's executed
-    std::vector<mkldnn::primitive> pipeline{bwd_prim};
+    std::vector<mkldnn::primitive> pipeline;
+    if (is_diff_dst_reordered) {
+      pipeline.push_back(reorder_diff_dst);
+    }
+    pipeline.push_back(*(pool_bwd_p.get()));
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+
+    in_x_grad->set_layout(DataLayout::kMKLDNN);
+    in_x_grad->set_format(in_x_grad_format);
   }  // Compute()
 };
 
 }  // namespace operators
 }  // namespace paddle
 
+namespace ops = paddle::operators;
+
 REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::PoolMKLDNNOpKernel<float>);
+                   ops::PoolMKLDNNOpKernel<float>);
 REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::PoolMKLDNNGradOpKernel<float>);
+                   ops::PoolMKLDNNGradOpKernel<float>);
diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc
index d78da10016a0e2b1d9a0ca9f3dfe4e8009bbe61d..f8ad63690e84339da0390d4ddd2db45f25db385a 100644
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -83,6 +83,9 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOp::GetExpectedKernelType(
     const framework::ExecutionContext &ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -92,11 +95,10 @@ framework::OpKernelType PoolOp::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
   return framework::OpKernelType(
       framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
       layout_, library_);
@@ -112,6 +114,9 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
 framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
     const framework::ExecutionContext &ctx) const {
   framework::LibraryType library_{framework::LibraryType::kPlain};
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
 #ifdef PADDLE_WITH_CUDA
   if (platform::CanCUDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kCUDNN;
@@ -121,18 +126,20 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
   if (library_ == framework::LibraryType::kPlain &&
       platform::CanMKLDNNBeUsed(ctx)) {
     library_ = framework::LibraryType::kMKLDNN;
+    layout_ = framework::DataLayout::kMKLDNN;
   }
 #endif
 
-  std::string data_format = ctx.Attr<std::string>("data_format");
-  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
-      layout_, library_);
+  auto input_data_type = framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  if (input_data_type == framework::proto::VarType::FP16) {
+    PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
+                      "float16 can only be used when CUDNN is used");
+  }
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
+                                 library_);
 }
 
-Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Pool2dOpMaker::Make() {
   AddInput(
       "X",
       "(Tensor) The input tensor of pooling operator. "
@@ -144,7 +151,8 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
             "The format of output tensor is also NCHW, "
             "where N is batch size, C is the number of channels, "
             "H is the height of the feature, "
-            "and W is the width of the feature.");
+            "and W is the width of the feature.")
+      .Reuse("X");
 
   AddAttr<std::string>("pooling_type",
                        "(string), pooling type, can be \"max\" for max-pooling "
@@ -196,8 +204,6 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
   // TODO(dzhwinter): need to registered layout transform function
 
   AddComment(R"DOC(
-Pool2d Operator.
-
 The pooling2d operation calculates the output based on
 the input, pooling_type and ksize, strides, paddings parameters.
 Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
@@ -207,26 +213,34 @@ These two elements represent height and width, respectively.
 The input(X) size and output(Out) size may be different.
 
 Example:
+
   Input:
+
        X shape: $(N, C, H_{in}, W_{in})$
+
   Output:
+
        Out shape: $(N, C, H_{out}, W_{out})$
+
   For ceil_mode = false:
        $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
        $$
   For ceil_mode = true:
        $$
-       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\
-       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
+       H_{out} = \\frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1
+       $$
+       $$
+       W_{out} = \\frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
        $$
 
 )DOC");
 }
 
-Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-    : OpProtoAndCheckerMaker(proto, op_checker) {
+void Pool3dOpMaker::Make() {
   AddInput("X",
            "(Tensor) The input tensor of pooling operator. "
            "The format of input tensor is NCDHW, where N is batch size, C is "
@@ -238,7 +252,8 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
             "The format of output tensor is also NCDHW, "
             "where N is batch size, C is "
             "the number of channels, and D, H and W is the depth, height and "
-            "width of the feature, respectively.");
+            "width of the feature, respectively.")
+      .Reuse("X");
 
   AddAttr<std::string>("pooling_type",
                        "(string) Pooling type, can be \"max\" for max-pooling "
@@ -329,18 +344,20 @@ Example:
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
-            ops::PoolOpGrad);
+REGISTER_OPERATOR(pool2d, ops::PoolOp, ops::Pool2dOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
     ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>)
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
 
-REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
-            ops::PoolOpGrad);
+REGISTER_OPERATOR(pool3d, ops::PoolOp, ops::Pool3dOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index 2fec50ef25e0d2621a87963acdf142d24970329d..a63963ca926bb94ff99e5cfe6dbcb2b15075bcb8 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -48,12 +50,12 @@ class PoolOpGrad : public framework::OperatorWithKernel {
 
 class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+  void Make() override;
 };
 
 template <typename DeviceContext, typename T>
diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc
index 4df0a14577ca13ddd79424fc324eb689913b20a0..873706593e4c856f0079738654a9e7e59a1c0cd8 100644
--- a/paddle/fluid/operators/pool_with_index_op.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cc
@@ -100,8 +100,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
 
 class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor of pooling operator. "
@@ -177,8 +176,7 @@ Example:
 
 class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor) The input tensor of pooling operator. "
              "The format of input tensor is NCDHW, where N is batch size, C is "
@@ -258,9 +256,10 @@ Example:
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
-            ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad,
-            ops::MaxPoolWithIndexOpGrad);
+REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
+                  ops::MaxPool2dWithIndexOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     max_pool2d_with_index,
@@ -272,11 +271,12 @@ REGISTER_OP_CPU_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>)
+                                    int>);
 
-REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
-            ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
-            ops::MaxPoolWithIndexOpGrad);
+REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
+                  ops::MaxPool3dWithIndexOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     max_pool3d_with_index,
@@ -288,4 +288,4 @@ REGISTER_OP_CPU_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
-                                    int>)
+                                    int>);
diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc
index 5fc418b6fdd19eddfd27b4a1b3e2554d7b2f37e6..5497dcbd9ce255f833df24989d7a76c40bcbca06 100644
--- a/paddle/fluid/operators/pool_with_index_op.cu.cc
+++ b/paddle/fluid/operators/pool_with_index_op.cu.cc
@@ -27,7 +27,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>)
+                                    int>);
 
 REGISTER_OP_CUDA_KERNEL(
     max_pool3d_with_index,
@@ -40,4 +40,4 @@ REGISTER_OP_CUDA_KERNEL(
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
                                     int>,
     ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
-                                    int>)
+                                    int>);
diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h
index 83e7bd138ae25c6d3e09c3d01178d6887205bf98..b55fa76eae34c3179d40f31ed6a57d3ecbbaaccf 100644
--- a/paddle/fluid/operators/pool_with_index_op.h
+++ b/paddle/fluid/operators/pool_with_index_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc
index d237da25a00de13057e009b6705d3241b8b26539..4d865b7f17b050ac6f04addc9949f3f65da06ded 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
@@ -95,8 +95,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
 
 class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Score",
              "(Tensor, float) Model Score on an item (with "
              "respect to QueryID). It's a 2-D tensor with shape [batch_size, "
diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h
index f20f33bbeb19766d6974ea17b155cac363c01fb2..db0a1002f47944c5d926fb5a51b84536dcf446b8 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.h
+++ b/paddle/fluid/operators/positive_negative_pair_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/utils/Logging.h"
+#include "paddle/legacy/utils/Logging.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/precision_recall_op.cc b/paddle/fluid/operators/precision_recall_op.cc
index c34b0d072bdb2f5b97dd4615ff9338d98f2bfbe5..e7ce16f33fb5052ffb41fc05bd1538e2f0dc35be 100644
--- a/paddle/fluid/operators/precision_recall_op.cc
+++ b/paddle/fluid/operators/precision_recall_op.cc
@@ -90,8 +90,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
 
 class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("MaxProbs",
              "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
              "where N is the batch size. Each row contains the max probability "
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8734282fe496b8e90af19abd5549566d62316fc3
--- /dev/null
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -0,0 +1,94 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/send_recv_util.h"
+
+namespace paddle {
+namespace operators {
+
+class PrefetchOp : public framework::OperatorBase {
+ public:
+  PrefetchOp(const std::string& type, const framework::VariableNameMap& inputs,
+             const framework::VariableNameMap& outputs,
+             const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    auto ins = Inputs("X");
+    auto outs = Outputs("Out");
+
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+
+    for (size_t i = 0; i < ins.size(); i++) {
+      if (NeedSend(scope, ins[i])) {
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
+                << outs[i] << " back";
+        rpc_client->AsyncPrefetchVar(epmap[i], ctx, scope, ins[i], outs[i]);
+      } else {
+        VLOG(3) << "don't send no-initialied variable: " << ins[i];
+      }
+    }
+    rpc_client->Wait();
+  }
+};
+
+class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(LoDTensor) Input Id variables to be sent").AsDuplicable();
+    AddOutput("Out",
+              "(LoDTensor) result "
+              "to be fetched from parameter server")
+        .AsDuplicable();
+    AddAttr<std::vector<std::string>>(
+        "epmap",
+        "(string vector, default 127.0.0.1:6164)"
+        "Server endpoints in the order of input variables for mapping")
+        .SetDefault({"127.0.0.1:6164"});
+    AddComment(R"DOC(
+Prefetch operator
+
+This operator will send Ids variables to listen_and_serve op at
+the parameter server and fetch result back.
+)DOC");
+  }
+};
+
+class PrefetchOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(prefetch, ops::PrefetchOp,
+                  paddle::framework::EmptyGradOpMaker, ops::PrefetchOpMaker,
+                  ops::PrefetchOpShapeInference);
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index 447b854544b72043ea09c09c134af3a48c305561..db040509bc08c3f6ad031c5b97c93574e31337e0 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/prelu_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -38,8 +38,7 @@ class PReluOp : public framework::OperatorWithKernel {
 
 class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input tensor of prelu operator.");
     AddInput("Alpha", "The alpha weight of prelu operator.");
     AddOutput("Out", "The output tensor of prelu operator.");
@@ -83,8 +82,9 @@ class PReluGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad,
-            ops::PReluGradOp);
+REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp);
 REGISTER_OP_CPU_KERNEL(
     prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc
index fc09b4aa1da87e56678790785467e9f4080a20ea..cceac402951ae6bf3fe0b4c96af5b7ce9ca1ba0e 100644
--- a/paddle/fluid/operators/print_op.cc
+++ b/paddle/fluid/operators/print_op.cc
@@ -16,6 +16,7 @@
 #include <ctime>
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
@@ -23,15 +24,15 @@ namespace operators {
 
 #define CLOG std::cout
 
-const std::string kForward = "FORWARD";
-const std::string kBackward = "BACKWARD";
-const std::string kBoth = "BOTH";
+const char kForward[] = "FORWARD";
+const char kBackward[] = "BACKWARD";
+const char kBoth[] = "BOTH";
 
 struct Formater {
   std::string message;
   std::string name;
   std::vector<int> dims;
-  std::type_index dtype{typeid(char)};
+  std::type_index dtype{typeid(const char)};
   framework::LoD lod;
   int summarize;
   void* data{nullptr};
@@ -62,7 +63,7 @@ struct Formater {
     }
   }
   void PrintDtype() {
-    if (dtype.hash_code() != typeid(char).hash_code()) {
+    if (!framework::IsType<const char>(dtype)) {
       CLOG << "\tdtype: " << dtype.name() << std::endl;
     }
   }
@@ -83,15 +84,15 @@ struct Formater {
   void PrintData(size_t size) {
     PADDLE_ENFORCE_NOT_NULL(data);
     // print float
-    if (dtype.hash_code() == typeid(float).hash_code()) {
+    if (framework::IsType<const float>(dtype)) {
       Display<float>(size);
-    } else if (dtype.hash_code() == typeid(double).hash_code()) {
+    } else if (framework::IsType<const double>(dtype)) {
       Display<double>(size);
-    } else if (dtype.hash_code() == typeid(int).hash_code()) {
+    } else if (framework::IsType<const int>(dtype)) {
       Display<int>(size);
-    } else if (dtype.hash_code() == typeid(int64_t).hash_code()) {
+    } else if (framework::IsType<const int64_t>(dtype)) {
       Display<int64_t>(size);
-    } else if (dtype.hash_code() == typeid(bool).hash_code()) {
+    } else if (framework::IsType<const bool>(dtype)) {
       Display<bool>(size);
     } else {
       CLOG << "\tdata: unprintable type: " << dtype.name() << std::endl;
@@ -100,7 +101,7 @@ struct Formater {
 
   template <typename T>
   void Display(size_t size) {
-    auto* d = (T*)data;
+    auto* d = reinterpret_cast<T*>(data);
     CLOG << "\tdata: ";
     if (summarize != -1) {
       summarize = std::min(size, (size_t)summarize);
@@ -135,7 +136,7 @@ class TensorPrintOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
     const framework::Variable* in_var_ptr = nullptr;
-    std::string phase = kForward;
+    std::string phase(kForward);
     std::string printed_var_name = "";
 
     auto& inputs = Inputs();
@@ -146,7 +147,7 @@ class TensorPrintOp : public framework::OperatorBase {
                !Inputs("In@GRAD").empty()) {
       in_var_ptr = scope.FindVar(Input("In@GRAD"));
       printed_var_name = Inputs("In@GRAD").front();
-      phase = kBackward;
+      phase = std::string(kBackward);
     } else {
       PADDLE_THROW("Unknown phase, should be forward or backward.");
     }
@@ -163,7 +164,7 @@ class TensorPrintOp : public framework::OperatorBase {
     out_tensor.set_lod(in_tensor.lod());
 
     std::string print_phase = Attr<std::string>("print_phase");
-    if (print_phase != phase && print_phase != kBoth) {
+    if (print_phase != phase && print_phase != std::string(kBoth)) {
       return;
     }
 
@@ -199,7 +200,7 @@ class TensorPrintOp : public framework::OperatorBase {
       formater.lod = printed_tensor.lod();
     }
     formater.summarize = Attr<int>("summarize");
-    formater.data = (void*)printed_tensor.data<void>();
+    formater.data = reinterpret_cast<void*>(printed_tensor.data<void>());
     formater(printed_tensor.numel());
   }
 
@@ -209,8 +210,7 @@ class TensorPrintOp : public framework::OperatorBase {
 
 class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  PrintOpProtoAndCheckMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("In", "Input tensor to be displayed.");
     AddAttr<int>("first_n", "Only log `first_n` number of times.");
     AddAttr<std::string>("message", "A string message to print as a prefix.");
@@ -223,8 +223,9 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
         "print_phase",
         "(string, default 'BOTH') Which phase to display including 'FORWARD' "
         "'BACKWARD' and 'BOTH'.")
-        .SetDefault(kBoth)
-        .InEnum({kForward, kBackward, kBoth});
+        .SetDefault(std::string(kBoth))
+        .InEnum({std::string(kForward), std::string(kBackward),
+                 std::string(kBoth)});
     AddOutput("Out", "Output tensor with same data as input tensor.");
     AddComment(R"DOC(
 Creates a print op that will print when a tensor is accessed.
diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc
deleted file mode 100644
index 7ba55437cb20f802cc12ceea7777d7d78bba62a6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/prior_box_op.cc
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/prior_box_op.h"
-
-namespace paddle {
-namespace operators {
-
-class PriorBoxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of PriorBoxOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Image"),
-                   "Input(Image) of PriorBoxOp should not be null.");
-
-    auto image_dims = ctx->GetInputDim("Image");
-    auto input_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW.");
-    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
-
-    PADDLE_ENFORCE_LT(input_dims[2], image_dims[2],
-                      "The height of input must smaller than image.");
-
-    PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
-                      "The width of input must smaller than image.");
-
-    auto min_sizes = ctx->Attrs().Get<std::vector<float>>("min_sizes");
-    auto max_sizes = ctx->Attrs().Get<std::vector<float>>("max_sizes");
-    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
-    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
-    bool flip = ctx->Attrs().Get<bool>("flip");
-
-    std::vector<float> aspect_ratios_vec;
-    ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
-
-    size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
-    if (max_sizes.size() > 0) {
-      PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(),
-                        "The number of min_size and max_size must be equal.");
-      num_priors += max_sizes.size();
-      for (size_t i = 0; i < max_sizes.size(); ++i) {
-        PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i],
-                          "max_size[%d] must be greater than min_size[%d].", i,
-                          i);
-      }
-    }
-
-    std::vector<int64_t> dim_vec(4);
-    dim_vec[0] = input_dims[2];
-    dim_vec[1] = input_dims[3];
-    dim_vec[2] = num_priors;
-    dim_vec[3] = 4;
-    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
-    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::Tensor>("Input")->type()),
-        platform::CPUPlace());
-  }
-};
-
-class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Input",
-             "(Tensor, default Tensor<float>), "
-             "the input feature data of PriorBoxOp, The layout is NCHW.");
-    AddInput("Image",
-             "(Tensor, default Tensor<float>), "
-             "the input image data of PriorBoxOp, The layout is NCHW.");
-    AddOutput("Boxes",
-              "(Tensor, default Tensor<float>), the output prior boxes of "
-              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
-              "H is the height of input, W is the width of input, num_priors "
-              "is the box count of each position.");
-    AddOutput("Variances",
-              "(Tensor, default Tensor<float>), the expanded variances of "
-              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
-              "H is the height of input, W is the width of input, num_priors "
-              "is the box count of each position.");
-
-    AddAttr<std::vector<float>>("min_sizes",
-                                "(vector<float>) List of min sizes "
-                                "of generated prior boxes.")
-        .AddCustomChecker([](const std::vector<float>& min_sizes) {
-          PADDLE_ENFORCE_GT(min_sizes.size(), 0,
-                            "Size of min_sizes must be at least 1.");
-          for (size_t i = 0; i < min_sizes.size(); ++i) {
-            PADDLE_ENFORCE_GT(min_sizes[i], 0.0,
-                              "min_sizes[%d] must be positive.", i);
-          }
-        });
-    AddAttr<std::vector<float>>(
-        "max_sizes",
-        "(vector<float>) List of max sizes of generated prior boxes.")
-        .SetDefault(std::vector<float>{});
-    AddAttr<std::vector<float>>(
-        "aspect_ratios",
-        "(vector<float>) List of aspect ratios of generated prior boxes.");
-
-    AddAttr<std::vector<float>>(
-        "variances",
-        "(vector<float>) List of variances to be encoded in prior boxes.")
-        .AddCustomChecker([](const std::vector<float>& variances) {
-          PADDLE_ENFORCE_EQ(variances.size(), 4,
-                            "Must and only provide 4 variance.");
-          for (size_t i = 0; i < variances.size(); ++i) {
-            PADDLE_ENFORCE_GT(variances[i], 0.0,
-                              "variance[%d] must be greater than 0.", i);
-          }
-        });
-    AddAttr<bool>("flip", "(bool) Whether to flip aspect ratios.")
-        .SetDefault(true);
-    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
-        .SetDefault(true);
-
-    AddAttr<float>("step_w",
-                   "Prior boxes step across width, 0.0 for auto calculation.")
-        .SetDefault(0.0)
-        .AddCustomChecker([](const float& step_w) {
-          PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0.");
-        });
-    AddAttr<float>("step_h",
-                   "Prior boxes step across height, 0.0 for auto calculation.")
-        .SetDefault(0.0)
-        .AddCustomChecker([](const float& step_h) {
-          PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0.");
-        });
-
-    AddAttr<float>("offset",
-                   "(float) "
-                   "Prior boxes center offset.")
-        .SetDefault(0.5);
-    AddComment(R"DOC(
-Prior box operator
-Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
-Each position of the input produce N prior boxes, N is determined by
- the count of min_sizes, max_sizes and aspect_ratios, The size of the
- box is in range(min_size, max_size) interval, which is generated in
- sequence according to the aspect_ratios.
-
-Please get more information from the following papers:
-https://arxiv.org/abs/1512.02325.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
-    ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h
deleted file mode 100644
index 18bb2deb6b5acf626dfb2883a5771d9d195d45c0..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/prior_box_op.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/transform.h"
-
-namespace paddle {
-namespace operators {
-
-inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
-                               bool flip,
-                               std::vector<float>& output_aspect_ratior) {
-  constexpr float epsilon = 1e-6;
-  output_aspect_ratior.clear();
-  output_aspect_ratior.push_back(1.0f);
-  for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
-    float ar = input_aspect_ratior[i];
-    bool already_exist = false;
-    for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
-      if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
-        already_exist = true;
-        break;
-      }
-    }
-    if (!already_exist) {
-      output_aspect_ratior.push_back(ar);
-      if (flip) {
-        output_aspect_ratior.push_back(1.0f / ar);
-      }
-    }
-  }
-}
-
-template <typename T>
-struct ClipFunctor {
-  HOSTDEVICE inline T operator()(T in) const {
-    return std::min<T>(std::max<T>(in, 0.), 1.);
-  }
-};
-
-template <typename Place, typename T>
-class PriorBoxOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
-    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
-    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
-    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
-
-    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
-    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
-    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
-    auto variances = ctx.Attr<std::vector<float>>("variances");
-    auto flip = ctx.Attr<bool>("flip");
-    auto clip = ctx.Attr<bool>("clip");
-
-    std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
-
-    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
-    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
-    T offset = static_cast<T>(ctx.Attr<float>("offset"));
-
-    auto img_width = image->dims()[3];
-    auto img_height = image->dims()[2];
-
-    auto feature_width = input->dims()[3];
-    auto feature_height = input->dims()[2];
-
-    T step_width, step_height;
-    if (step_w == 0 || step_h == 0) {
-      step_width = static_cast<T>(img_width) / feature_width;
-      step_height = static_cast<T>(img_height) / feature_height;
-    } else {
-      step_width = step_w;
-      step_height = step_h;
-    }
-
-    int num_priors = aspect_ratios.size() * min_sizes.size();
-    if (max_sizes.size() > 0) {
-      num_priors += max_sizes.size();
-    }
-
-    boxes->mutable_data<T>(ctx.GetPlace());
-    vars->mutable_data<T>(ctx.GetPlace());
-
-    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
-    for (int h = 0; h < feature_height; ++h) {
-      for (int w = 0; w < feature_width; ++w) {
-        T center_x = (w + offset) * step_width;
-        T center_y = (h + offset) * step_height;
-        T box_width, box_height;
-        int idx = 0;
-        for (size_t s = 0; s < min_sizes.size(); ++s) {
-          auto min_size = min_sizes[s];
-          // first prior: aspect_ratio = 1, size = min_size
-          box_width = box_height = min_size / 2.;
-          // xmin
-          e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-          // ymin
-          e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-          // xmax
-          e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-          // ymax
-          e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-
-          idx++;
-          if (max_sizes.size() > 0) {
-            auto max_size = max_sizes[s];
-            // second prior: aspect_ratio = 1,
-            // size = sqrt(min_size * max_size)
-            box_width = box_height = sqrt(min_size * max_size) / 2.;
-            // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-            idx++;
-          }
-
-          // rest of priors
-          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
-            float ar = aspect_ratios[r];
-            if (fabs(ar - 1.) < 1e-6) {
-              continue;
-            }
-            box_width = min_size * sqrt(ar) / 2.;
-            box_height = min_size / sqrt(ar) / 2.;
-            // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width) / img_width;
-            // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height) / img_height;
-            // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width) / img_width;
-            // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height) / img_height;
-            idx++;
-          }
-        }
-      }
-    }
-
-    if (clip) {
-      platform::Transform<platform::CPUDeviceContext> trans;
-      ClipFunctor<T> clip_func;
-      trans(ctx.template device_context<platform::CPUDeviceContext>(),
-            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
-            boxes->data<T>(), clip_func);
-    }
-
-    framework::Tensor var_t;
-    var_t.mutable_data<T>(
-        framework::make_ddim({1, static_cast<int>(variances.size())}),
-        ctx.GetPlace());
-    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
-    for (size_t i = 0; i < variances.size(); ++i) {
-      var_et(0, i) = variances[i];
-    }
-
-    int box_num = feature_height * feature_width * num_priors;
-    auto var_dim = vars->dims();
-    vars->Resize({box_num, static_cast<int>(variances.size())});
-
-    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
-    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
-
-    vars->Resize(var_dim);
-  }
-};  // namespace operators
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/proximal_adagrad_op.cc b/paddle/fluid/operators/proximal_adagrad_op.cc
index 38cd97c17b16a4cc64f7e6d52150fae392df6036..8d8075d76111928ec9855eb0b70fe6dbd90a979b 100644
--- a/paddle/fluid/operators/proximal_adagrad_op.cc
+++ b/paddle/fluid/operators/proximal_adagrad_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
 class ProximalAdagradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -55,12 +56,17 @@ class ProximalAdagradOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("ParamOut", param_dim);
     ctx->SetOutputDim("MomentOut", param_dim);
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter that has to be updated.");
diff --git a/paddle/fluid/operators/proximal_gd_op.cc b/paddle/fluid/operators/proximal_gd_op.cc
index efb4e1ac204ce79bfad7d77038f342be09e8f0e8..baf9cbcba2ed89f62afc9816e0ab9e0f112e6008 100644
--- a/paddle/fluid/operators/proximal_gd_op.cc
+++ b/paddle/fluid/operators/proximal_gd_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
 class ProximalGDOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -43,12 +44,17 @@ class ProximalGDOp : public framework::OperatorWithKernel {
 
     ctx->SetOutputDim("ParamOut", param_dim);
   }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("Param")->type());
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 
 class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter value that has to be updated.");
diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..123fa44fa3ddbc9343b9629be63fdefdf12b4646
--- /dev/null
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/operators/random_crop_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RandomCropOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "A batch of instances to random crop.");
+    AddInput("Seed", "The random seed.");
+    AddOutput("Out", "The cropped instance batch.");
+    AddOutput("SeedOut", "The random seed after random cropping.")
+        .AsIntermediate();
+    AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
+    AddAttr<int>("startup_seed",
+                 "If the input 'Seed' is not initialized, the 'startup_seed' "
+                 "will be used to replace it. Even so, the seed after random "
+                 "crop will also be outputed to the 'SeedOut'.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+      This operator takes a batch of instance, and do random cropping on each instance.
+      It means that cropping positions differs on each instance, which is determined
+      by an uniform random generator. All cropped instances have the same shape, which 
+      is determined by the operator's attribute 'shape'.
+    )DOC");
+  }
+};
+
+class RandomCropOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    auto x_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));
+    auto out_dim = framework::vectorize2int(x_dim);
+    for (size_t i = 1; i <= shape.size(); ++i) {
+      size_t x_i = x_dim.size() - i;
+      size_t shape_i = shape.size() - i;
+      PADDLE_ENFORCE_GE(x_dim[x_i], shape[shape_i]);
+      out_dim[x_i] = shape[shape_i];
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace f = paddle::framework;
+REGISTER_OPERATOR(random_crop, ops::RandomCropOp, ops::RandomCropOpMaker,
+                  ops::RandomCropOpInferShape, f::EmptyGradOpMaker);
+
+template <typename T>
+using Kernel = ops::RandomCropKernel<paddle::platform::CPUDeviceContext, T>;
+REGISTER_OP_CPU_KERNEL(random_crop, Kernel<float>, Kernel<int>, Kernel<double>,
+                       Kernel<uint8_t>, Kernel<int16_t>);
diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/fluid/operators/random_crop_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6fc9bedc55b4d349ddf3d109c7f9049113235f0c
--- /dev/null
+++ b/paddle/fluid/operators/random_crop_op.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/random_crop_op.h"
+
+namespace ops = paddle::operators;
+template <typename T>
+using Kernel = ops::RandomCropKernel<paddle::platform::CUDADeviceContext, T>;
+REGISTER_OP_CUDA_KERNEL(random_crop, Kernel<float>, Kernel<int>, Kernel<double>,
+                        Kernel<uint8_t>, Kernel<int16_t>);
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d68ba9d661698bb0d33b139f5748daec2ead6595
--- /dev/null
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -0,0 +1,187 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/for_range.h"
+#ifdef PADDLE_WITH_CUDA
+#include <thrust/random.h>
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext>
+struct Random;
+
+template <>
+struct Random<platform::CPUDeviceContext> {
+  using Engine = std::minstd_rand;
+
+  template <typename T>
+  using UniformIntDist = std::uniform_int_distribution<T>;
+};
+
+#ifdef PADDLE_WITH_CUDA
+template <>
+struct Random<platform::CUDADeviceContext> {
+  using Engine = thrust::minstd_rand;
+
+  template <typename T>
+  using UniformIntDist = thrust::uniform_int_distribution<T>;
+};
+#endif
+
+template <typename T>
+HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out,
+                                     const size_t* out_dims, int i, int rank,
+                                     size_t prod_x_remain,
+                                     size_t prod_out_remain,
+                                     const size_t* offsets) {
+  size_t x_dim_i = x_dims[i];
+  size_t out_dim_i = out_dims[i];
+  size_t x_stride = prod_x_remain / x_dim_i;
+  size_t out_stride = prod_out_remain / out_dim_i;
+  size_t offset_i = offsets[i];
+
+  if (i == rank - 1) {
+    PADDLE_ASSERT(x_stride == 1 && out_stride == 1);
+    x += offset_i;
+    for (size_t j = 0; j < out_dim_i; ++j) {
+      *out++ = *x++;
+    }
+  } else {
+    x += offset_i * x_stride;
+    for (size_t j = 0; j < out_dim_i; ++j) {
+      StridedMemcpy<T>(x, x_dims, out, out_dims, i + 1, rank, x_stride,
+                       out_stride, offsets);
+      x += x_stride;
+      out += out_stride;
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+struct RandomCropFunctor {
+  const T* x_;
+  T* out_;
+  size_t x_dims_[9];
+  size_t out_dims_[9];
+  int num_batchsize_dims_;
+  int rank_;
+  int64_t seed_;
+
+  size_t prod_batchsize_dims_;
+  size_t prod_x_ins_dims_;
+  size_t prod_out_ins_dims_;
+
+  RandomCropFunctor(const T* x, T* out, const framework::DDim& x_dims,
+                    const framework::DDim& out_dims, int num_batchsize_dims,
+                    int64_t seed)
+      : x_(x),
+        out_(out),
+        num_batchsize_dims_(num_batchsize_dims),
+        rank_(x_dims.size()),
+        seed_(seed) {
+    PADDLE_ENFORCE_EQ(x_dims.size(), out_dims.size());
+    PADDLE_ENFORCE_GT(rank_, num_batchsize_dims_);
+    prod_batchsize_dims_ = 1;
+    prod_x_ins_dims_ = 1;
+    prod_out_ins_dims_ = 1;
+    for (size_t i = 0; i < static_cast<size_t>(rank_); ++i) {
+      size_t x_dim_i = x_dims[i];
+      size_t out_dim_i = out_dims[i];
+      x_dims_[i] = x_dim_i;
+      out_dims_[i] = out_dim_i;
+      if (i < static_cast<size_t>(num_batchsize_dims_)) {
+        PADDLE_ENFORCE_EQ(x_dim_i, out_dim_i);
+        prod_batchsize_dims_ *= x_dim_i;
+      } else {
+        prod_x_ins_dims_ *= x_dim_i;
+        prod_out_ins_dims_ *= out_dim_i;
+      }
+    }
+  }
+
+  HOSTDEVICE void operator()(size_t ins_idx) {
+    typename Random<DeviceContext>::Engine engine(seed_);
+    engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
+    size_t offsets[9];
+    for (int i = num_batchsize_dims_; i < rank_; ++i) {
+      typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
+          0, x_dims_[i] - out_dims_[i]);
+      offsets[i - num_batchsize_dims_] = dist(engine);
+    }
+
+    const T* x = x_ + ins_idx * prod_x_ins_dims_;
+    T* out = out_ + ins_idx * prod_out_ins_dims_;
+
+    StridedMemcpy<T>(x, x_dims_ + num_batchsize_dims_, out,
+                     out_dims_ + num_batchsize_dims_, 0,
+                     rank_ - num_batchsize_dims_, prod_x_ins_dims_,
+                     prod_out_ins_dims_, offsets);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class RandomCropKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    int64_t seed = 0;
+    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
+    if (seed_tensor.IsInitialized()) {
+      if (platform::is_cpu_place(seed_tensor.place())) {
+        seed = *seed_tensor.data<int64_t>();
+      } else {
+        LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
+                        "your program";
+        framework::LoDTensor cpu_seed;
+        framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
+        seed = *cpu_seed.data<int64_t>();
+      }
+    } else {
+      VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute "
+                 "'startup_seed' instead.";
+      seed = ctx.Attr<int>("startup_seed");
+    }
+    auto shape = ctx.Attr<std::vector<int>>("shape");
+    auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
+    auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out"));
+
+    int num_batchsize_dims = x.dims().size() - shape.size();
+    RandomCropFunctor<DeviceContext, T> functor(
+        x.data<T>(), out.mutable_data<T>(ctx.GetPlace()), x.dims(), out.dims(),
+        num_batchsize_dims, seed);
+    platform::ForRange<DeviceContext> for_range(
+        ctx.template device_context<DeviceContext>(),
+        functor.prod_batchsize_dims_);
+
+    for_range(functor);
+
+    Random<platform::CPUDeviceContext>::Engine engine(seed);
+    engine.discard(functor.prod_batchsize_dims_ *
+                   (functor.rank_ - functor.num_batchsize_dims_));
+    *ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
+        framework::make_ddim({1}), platform::CPUPlace()) = engine();
+  }
+};
+
+// TODO(fengjiayi): Backward of random crop op
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
index 767eef56861ea075ec2450b1456e7c5c807ce25d..313cf01541dd88a0f4f8bf54fe4436984c2cbcf8 100644
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/rank_loss_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -45,8 +46,7 @@ class RankLossOp : public framework::OperatorWithKernel {
 
 class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Label",
              "(2-D Tensor with shape [batch_size x 1]) "
              "The label indicating A ranked higher than B or not.");
@@ -120,8 +120,9 @@ class RankLossGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad,
-            ops::RankLossGradOp);
+REGISTER_OPERATOR(rank_loss, ops::RankLossOp, ops::RankLossOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp);
 REGISTER_OP_CPU_KERNEL(
     rank_loss, ops::RankLossKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index 2a5605e0d378a184ae132e657b2872279784855d..695d7ea83df952d9f2212cc0aaca5c90c7b47ee7 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 
 namespace paddle {
 namespace operators {
@@ -59,19 +60,25 @@ class ReadOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
     framework::ReaderHolder* reader =
-        scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
+        detail::Ref(scope.FindVar(Input("Reader")),
+                    "Cannot find reader variable %s", Input("Reader"))
+            .GetMutable<framework::ReaderHolder>();
     std::vector<std::string> out_arg_names = Outputs("Out");
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
     if (ins.empty()) {
-      reader->ReInit();
-      reader->ReadNext(&ins);
-      PADDLE_ENFORCE(
-          !ins.empty(),
-          "Reader can not read the next data even it has been re-initialized.");
+      if (Attr<bool>("throw_eof_exp")) {
+        PADDLE_THROW_EOF();
+      } else {
+        ins.resize(out_arg_names.size());
+        for (auto& tensor : ins) {
+          // data type is not important for subsequent DataBalanceOpHandle
+          tensor.mutable_data<float>(framework::make_ddim({0}), dev_place);
+        }
+      }
     }
     PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
-    for (size_t i = 0; i < ins.size(); ++i) {
+    for (size_t i = 0; i < out_arg_names.size(); ++i) {
       auto* out =
           scope.FindVar(out_arg_names[i])->GetMutable<framework::LoDTensor>();
       out->ShareDataWith(ins[i]);
@@ -82,10 +89,13 @@ class ReadOp : public framework::OperatorBase {
 
 class ReadOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReadOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(op_proto, op_checker) {
+  void Make() override {
     AddInput("Reader", "(ReaderHolder) The executed reader.");
     AddOutput("Out", "(LoDTensor) The output data.").AsDuplicable();
+    AddAttr<bool>("throw_eof_exp",
+                  "If set true, an exception will be thrown when the Reader "
+                  "yields empty (which means there is no next data).")
+        .SetDefault(true);
     AddComment(R"DOC(
       Read Operator
 
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 744bd3b7ef71f83ad82979eb966369c2e9456a7d..a39c8a00538875e4e3284898230a6cb0693b7a12 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -15,10 +15,17 @@ function(reader_library TARGET_NAME)
         PARENT_SCOPE)
 endfunction()
 
+reader_library(open_files_op SRCS open_files_op.cc)
 reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
 reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
 reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
 reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
+reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
+reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
+reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
+reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
+
+cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
 # Export local libraries to parent
 set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..db8cf3b605c9175eeda4548b1e7c8203f26c5d89
--- /dev/null
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -0,0 +1,117 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <condition_variable>  // NOLINT
+#include <deque>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+template <typename T>
+class BlockingQueue {
+  // BlockingQueue is for buffered reading and is supposed to use only the
+  // reader package. It is true that we could and we should have been using
+  // framework::Channel, but which has currently a deadlock bug. BlockingQueue
+  // is a workaround and a simplified version of framework::Channel as it
+  // doesn't support GPU and it implements on buffered blocking queue.
+ public:
+  explicit BlockingQueue(size_t capacity)
+      : capacity_(capacity), closed_(false) {
+    PADDLE_ENFORCE_GT(
+        capacity_, 0,
+        "The capacity of a reader::BlockingQueue must be greater than 0.");
+  }
+
+  bool Send(const T& elem) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
+    if (closed_) {
+      VLOG(5)
+          << "WARNING: Sending an element to a closed reader::BlokcingQueue.";
+      return false;
+    }
+    PADDLE_ENFORCE_LT(queue_.size(), capacity_);
+    queue_.push_back(elem);
+    receive_cv_.notify_one();
+    return true;
+  }
+
+  bool Send(T&& elem) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    send_cv_.wait(lock, [&] { return queue_.size() < capacity_ || closed_; });
+    if (closed_) {
+      VLOG(5)
+          << "WARNING: Sending an element to a closed reader::BlokcingQueue.";
+      return false;
+    }
+    PADDLE_ENFORCE_LT(queue_.size(), capacity_);
+    queue_.emplace_back(std::move(elem));
+    receive_cv_.notify_one();
+    return true;
+  }
+
+  bool Receive(T* elem) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    receive_cv_.wait(lock, [&] { return !queue_.empty() || closed_; });
+    if (!queue_.empty()) {
+      PADDLE_ENFORCE_NOT_NULL(elem);
+      *elem = queue_.front();
+      queue_.pop_front();
+      send_cv_.notify_one();
+      return true;
+    } else {
+      PADDLE_ENFORCE(closed_);
+      return false;
+    }
+  }
+
+  void Close() {
+    std::lock_guard<std::mutex> lock(mutex_);
+    closed_ = true;
+    send_cv_.notify_all();
+    receive_cv_.notify_all();
+  }
+
+  bool IsClosed() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return closed_;
+  }
+
+  size_t Cap() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return capacity_;
+  }
+
+  size_t Size() const {
+    std::lock_guard<std::mutex> lock(mutex_);
+    return queue_.size();
+  }
+
+ private:
+  size_t capacity_;
+  bool closed_;
+  std::deque<T> queue_;
+
+  mutable std::mutex mutex_;
+  mutable std::condition_variable receive_cv_;
+  mutable std::condition_variable send_cv_;
+};
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
index 277f2856c07b3fec2113486539aec1d9139fae92..ecbae3894d551186f53625a6cc9cfdb36adc8d2d 100644
--- a/paddle/fluid/operators/reader/create_batch_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -20,7 +20,7 @@ namespace reader {
 
 class BatchReader : public framework::DecoratedReader {
  public:
-  BatchReader(ReaderBase* reader, int batch_size)
+  BatchReader(const std::shared_ptr<ReaderBase>& reader, int batch_size)
       : DecoratedReader(reader), batch_size_(batch_size) {
     buffer_.reserve(batch_size_);
   }
@@ -39,19 +39,21 @@ class CreateBatchReaderOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
     out->Reset(
         new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
   }
 };
 
 class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
- public:
-  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : DecoratedReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<int>("batch_size",
                  "How many instances the batch reader yields each time.")
         .GreaterThan(0);
diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a75c6d4c567ac93f37b38070421133af305f20a3
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -0,0 +1,190 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class CustomReader : public framework::DecoratedReader {
+ public:
+  CustomReader(const std::shared_ptr<ReaderBase>& reader,
+               const framework::BlockDesc& sub_block,
+               const std::vector<std::string>& source_var_names,
+               const std::vector<std::string>& sink_var_names)
+      : DecoratedReader(reader),
+        program_(*sub_block.Program()),
+        sub_block_id_(sub_block.ID()),
+        exe_(framework::Executor(platform::CPUPlace())),
+        source_var_names_(source_var_names),
+        sink_var_names_(sink_var_names) {}
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+
+ private:
+  const framework::ProgramDesc program_;
+  int sub_block_id_;
+  framework::Executor exe_;
+  framework::Scope scope_;
+
+  std::vector<std::string> source_var_names_;
+  std::vector<std::string> sink_var_names_;
+};
+
+class CreateCustomReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    auto* sub_block = Attr<framework::BlockDesc*>("sub_block");
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    out->Reset(
+        new CustomReader(underlying_reader.Get(), *sub_block,
+                         Attr<std::vector<std::string>>("source_var_names"),
+                         Attr<std::vector<std::string>>("sink_var_names")));
+  }
+};
+
+class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<framework::BlockDesc*>(
+        "sub_block", "The block to hold all preprocessing operators.");
+    AddAttr<std::vector<std::string>>(
+        "source_var_names",
+        "Source variables are starting points of data preprocessing. They hold "
+        "preprocessing's input tensors. Each source variable corresponds to "
+        "one of underlying reader's output datas.");
+    AddAttr<std::vector<std::string>>(
+        "sink_var_names",
+        "Sink variables are ending points of data preprocessing. They hold "
+        "preprocessing's output tensors. Each sink variable corresponds to "
+        "one of custom reader's output datas.");
+    AddComment(R"DOC(
+      CreateCustomReader Operator
+
+      A custom reader can be used for input data preprocessing. 
+      A custom reader holds its own sub-block, which will be executed in CPU 
+      in its 'ReadNext()' function. Users can configurate their own 
+      preprocessing pipelines by inserting operators into custom reader's 
+      sub-block.
+    )DOC");
+  }
+};
+
+class CustomReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(!ctx->IsRuntime(),
+                   "'CustomReaderInferShape' should only be invoked during "
+                   "compile time.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "The output decorated reader should not be null.");
+    const auto* sub_block =
+        ctx->Attrs().Get<framework::BlockDesc*>("sub_block");
+    const auto sink_var_names =
+        ctx->Attrs().Get<std::vector<std::string>>("sink_var_names");
+    std::vector<std::vector<int64_t>> res_dims;
+    std::vector<int32_t> res_lod_levels;
+    for (const std::string& var_name : sink_var_names) {
+      auto* sink_var = sub_block->FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(sink_var);
+      res_dims.emplace_back(sink_var->GetShape());
+      res_lod_levels.push_back(sink_var->GetLoDLevel());
+    }
+    auto* out_reader =
+        boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
+    out_reader->SetShapes(res_dims);
+    out_reader->SetLoDLevels(res_lod_levels);
+  }
+};
+
+class CustomReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]);
+    PADDLE_ENFORCE_NOT_NULL(out_reader);
+    out_reader->SetType(framework::proto::VarType::READER);
+
+    auto sink_var_names =
+        boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names"));
+    const auto* sub_block =
+        boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block"));
+    std::vector<framework::proto::VarType::Type> res_data_types;
+    for (const std::string& var_name : sink_var_names) {
+      framework::VarDesc* var = sub_block->FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(var);
+      res_data_types.emplace_back(var->GetDataType());
+    }
+    out_reader->SetDataTypes(res_data_types);
+  }
+};
+
+void CustomReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  out->clear();
+  std::vector<framework::LoDTensor> underlying_outs;
+  reader_->ReadNext(&underlying_outs);
+  if (underlying_outs.empty()) {
+    // There is not next data.
+    return;
+  }
+  PADDLE_ENFORCE(source_var_names_.size() == underlying_outs.size(),
+                 "The size of source_var_names(%d) and the size of "
+                 "underlying_outs(%d) are not consistent. Each feeding element "
+                 "must have its own source variable.",
+                 source_var_names_.size(), underlying_outs.size());
+  // The scope for CustomReader's sub-block should be independent and shouldn't
+  // be any other computation scope's child. Otherwise, data preprocessing and
+  // compution cannot be concurrent.
+  framework::Scope* exe_scope = &scope_.NewScope();
+  // 1. Copy LoDTensors from underlying reader's output to source variables.
+  for (size_t i = 0; i < source_var_names_.size(); ++i) {
+    framework::Variable* var = exe_scope->Var(source_var_names_[i]);
+    framework::LoDTensor* tensor = var->GetMutable<framework::LoDTensor>();
+    tensor->ShareDataWith(underlying_outs[i]);
+    tensor->set_lod(underlying_outs[i].lod());
+  }
+  // 2. Run the sub-block.
+  exe_.Run(program_, exe_scope, sub_block_id_, false, true);
+  // 3. Copy LoDTensors from sink variables to out.
+  out->resize(sink_var_names_.size());
+  for (size_t i = 0; i < sink_var_names_.size(); ++i) {
+    const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i]))
+                             .Get<framework::LoDTensor>();
+    framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
+  }
+  scope_.DeleteScope(exe_scope);
+}
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_OPERATOR(create_custom_reader, ops::CreateCustomReaderOp,
+                  ops::CreateCustomReaderOpMaker, ops::CustomReaderInferShape,
+                  ops::CustomReaderInferVarType,
+                  paddle::framework::EmptyGradOpMaker)
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index ba08ea12e2486aaba8c57a9fe23592bd1738592d..5f734489a81764875988f440696682570ff4d1d7 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -12,37 +12,72 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <thread>
-#include "paddle/fluid/framework/channel.h"
+#include <thread>  // NOLINT
+
+#include "paddle/fluid/operators/reader/blocking_queue.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
 namespace operators {
 namespace reader {
 
-static constexpr size_t kDoubleBufferSize = 2;
+// 'Double buffer' means we shall maintain two batches of input data at the same
+// time. So the kCacheSize shoul be at least 2.
+static constexpr size_t kCacheSize = 5;
+// There will be two bacthes out of the channel during training:
+// 1. the one waiting to be sent to the channel
+// 2. the one just be received from the channel, which is also being used by
+// subsequent operators.
+// So the channel size should be kChacheSize - 2
+static constexpr size_t kChannelSize = 3;  // kCacheSize - 2
 
 class DoubleBufferReader : public framework::DecoratedReader {
  public:
-  explicit DoubleBufferReader(ReaderBase* reader)
-      : DecoratedReader(reader),
-        buffer_(framework::MakeChannel<std::vector<framework::LoDTensor>>(
-            kDoubleBufferSize)) {
-    std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
-    prefetch.detach();
+  explicit DoubleBufferReader(
+      const std::shared_ptr<ReaderBase>& reader,
+      platform::Place target_place = platform::CPUPlace())
+      : DecoratedReader(reader), place_(target_place) {
+    cpu_tensor_cache_.resize(kCacheSize);
+    gpu_tensor_cache_.resize(kCacheSize);
+#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(place_)) {
+      for (size_t i = 0; i < kCacheSize; ++i) {
+        ctxs_.emplace_back(new platform::CUDADeviceContext(
+            boost::get<platform::CUDAPlace>(place_)));
+      }
+    }
+#endif
+    StartPrefetcher();
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
   void ReInit() override;
 
-  ~DoubleBufferReader() { buffer_->Close(); }
-
-  bool HasNext() const override;
+  ~DoubleBufferReader() { EndPrefetcher(); }
 
  private:
+  void StartPrefetcher() {
+    channel_ = new reader::BlockingQueue<size_t>(kChannelSize);
+    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
+  }
+
+  void EndPrefetcher() {
+    channel_->Close();
+    if (prefetcher_.joinable()) {
+      prefetcher_.join();
+    }
+    delete channel_;
+    channel_ = nullptr;
+  }
+
   void PrefetchThreadFunc();
 
-  framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
+  std::thread prefetcher_;
+  reader::BlockingQueue<size_t>* channel_;
+  platform::Place place_;
+  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache_;
+  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache_;
+  std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
 };
 
 class CreateDoubleBufferReaderOp : public framework::OperatorBase {
@@ -52,64 +87,106 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
-    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
-                                        ->Get<framework::ReaderHolder>();
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new DoubleBufferReader(underlying_reader.Get()));
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+
+    auto place_str = Attr<std::string>("place");
+    platform::Place place;
+    if (place_str == "AUTO") {
+      place = dev_place;
+    } else if (place_str == "CPU") {
+      place = platform::CPUPlace();
+    } else {
+      std::istringstream sin(place_str);
+      sin.seekg(std::string("CUDA:").size(), std::ios::beg);
+      size_t num;
+      sin >> num;
+      place = platform::CUDAPlace(static_cast<int>(num));
+    }
+
+    out->Reset(new DoubleBufferReader(underlying_reader.Get(), place));
   }
 };
 
 class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
- public:
-  CreateDoubleBufferReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : DecoratedReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddComment(R"DOC(
       CreateDoubleBufferReader Operator
 
       A double buffer reader takes another reader as its 'underlying reader'.
-      It launches another thread to execute the 'underlying reader' asynchronously, 
+      It launches another thread to execute the 'underlying reader' asynchronously,
       which prevents reading process from blocking subsequent training.
     )DOC");
+    std::unordered_set<std::string> enum_range;
+    constexpr size_t kMaxCUDADevs = 128;
+    for (size_t i = 0; i < kMaxCUDADevs; ++i) {
+      enum_range.insert(string::Sprintf("CUDA:%d", i));
+    }
+    enum_range.insert("CPU");
+    enum_range.insert("AUTO");
+    AddAttr<std::string>("place", "The double buffer place")
+        .SetDefault("AUTO")
+        .InEnum({enum_range});
   }
 };
 
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  out->clear();
-  buffer_->Receive(out);
+  size_t cached_tensor_id;
+  if (channel_->Receive(&cached_tensor_id)) {
+    if (platform::is_gpu_place(place_)) {
+      *out = gpu_tensor_cache_[cached_tensor_id];
+    } else {
+      // CPU place
+      *out = cpu_tensor_cache_[cached_tensor_id];
+    }
+  } else {
+    out->clear();
+  }
 }
 
 void DoubleBufferReader::ReInit() {
   reader_->ReInit();
-  buffer_->Close();
-  // The existing prefetch thread will terminate for the buffer_ is closed.
-  buffer_ = framework::MakeChannel<std::vector<framework::LoDTensor>>(
-      kDoubleBufferSize);
-  std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
-  prefetch.detach();
+  EndPrefetcher();
+  StartPrefetcher();
 }
 
 void DoubleBufferReader::PrefetchThreadFunc() {
   VLOG(5) << "A new prefetch thread starts.";
+  size_t cached_tensor_id = 0;
   while (true) {
-    std::vector<framework::LoDTensor> batch;
-    reader_->ReadNext(&batch);
-    if (batch.empty()) {
-      // EOF
-      buffer_->Close();
-      VLOG(5) << "Reached the end of the file. The prefetch thread terminates.";
+    auto& cpu_batch = cpu_tensor_cache_[cached_tensor_id];
+    reader_->ReadNext(&cpu_batch);
+    if (cpu_batch.empty()) {
+      // The underlying reader have no next data.
       break;
     }
-    if (!buffer_->Send(&batch)) {
+    if (platform::is_gpu_place(place_)) {
+      auto& gpu_batch = gpu_tensor_cache_[cached_tensor_id];
+      gpu_batch.resize(cpu_batch.size());
+      for (size_t i = 0; i < cpu_batch.size(); ++i) {
+        // TODO(fengjiayi): Use asynchronous TensorCopy instead
+        framework::TensorCopySync(cpu_batch[i], place_, &gpu_batch[i]);
+        gpu_batch[i].set_lod(cpu_batch[i].lod());
+      }
+    }
+    if (!channel_->Send(cached_tensor_id)) {
       VLOG(5) << "WARNING: The double buffer channel has been closed. The "
-                 "prefetch thread terminates.";
+                 "prefetch thread will terminate.";
       break;
     }
+    ++cached_tensor_id;
+    cached_tensor_id %= kCacheSize;
   }
+  channel_->Close();
+  VLOG(5) << "Prefetch thread terminates.";
 }
 
-bool DoubleBufferReader::HasNext() const { PADDLE_THROW("Not Implemented"); }
-
 }  // namespace reader
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..19b54110b9aeece33b8d6c73612ae0e12dbfafbd
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -0,0 +1,93 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class MultiPassReader : public framework::DecoratedReader {
+ public:
+  MultiPassReader(const std::shared_ptr<ReaderBase>& reader, int pass_num)
+      : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    reader_->ReadNext(out);
+    if (out->empty()) {
+      ++pass_count_;
+      if (pass_count_ < pass_num_) {
+        reader_->ReInit();
+        reader_->ReadNext(out);
+      }
+    }
+  }
+
+  void ReInit() override {
+    pass_count_ = 0;
+    reader_->ReInit();
+  }
+
+ private:
+  int pass_num_;
+  mutable int pass_count_;
+};
+
+class CreateMultiPassReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    int pass_num = Attr<int>("pass_num");
+    out->Reset(new MultiPassReader(underlying_reader.Get(), pass_num));
+  }
+};
+
+class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<int>("pass_num", "The number of pass to run.").GreaterThan(0);
+    AddComment(R"DOC(
+      CreateMultiPassReader Operator
+
+      This operator creates a multi-pass reader. A multi-pass reader
+      is used to yield data for several pass training continuously.
+      It takes the number of passes to run as one of its attributes
+      ('pass_num'), and maintains a pass counter to record how many
+      passes it has completed. When the underlying reader reaches the
+      EOF, the multi-pass reader checks whether it has completed training
+      of the given number of pass. If not, the underlying reader will
+      be re-initialized and starts a new pass automatically.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_multi_pass_reader,
+                                   ops::CreateMultiPassReaderOp,
+                                   ops::CreateMultiPassReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..36587360f7347a10e01d4e994482027d9a9bb5d0
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -0,0 +1,84 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class PyReader : public framework::ReaderBase {
+ public:
+  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue) {
+    PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
+    queue_ = queue;
+  }
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    bool success;
+    *out = queue_->Pop(&success);
+    if (!success) out->clear();
+  }
+
+  void ReInit() override {}
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+
+class CreatePyReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) return;
+
+    const std::string& queue_name = Input("blocking_queue");
+    auto* queue_holder_var = scope.FindVar(queue_name);
+    PADDLE_ENFORCE(
+        queue_holder_var != nullptr,
+        "No LoDTensorBlockingQueueHolder variable with name %s found",
+        queue_name);
+    auto* queue_holder =
+        queue_holder_var->template GetMutable<LoDTensorBlockingQueueHolder>();
+
+    out->Reset(new PyReader(queue_holder->GetQueue()));
+  }
+};
+
+class CreatePyReaderOpMaker : public FileReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddInput("blocking_queue",
+             "Name of the `LoDTensorBlockingQueueHolder` variable");
+
+    AddComment(R"DOC(
+			Create PyReader to support LoDTensor data feeding in Python side.
+      )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace reader = ::paddle::operators::reader;
+
+REGISTER_FILE_READER_OPERATOR(create_py_reader, reader::CreatePyReaderOp,
+                              reader::CreatePyReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
index e62f952d0e89561c3eed56112dc9d1d78801b59e..5b7e8a063a034f0be056065826fca0fe807bc9a7 100644
--- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
@@ -19,16 +19,17 @@ namespace operators {
 namespace reader {
 
 template <typename T>
-class RandomDataGenerator : public framework::FileReader {
+class RandomDataGenerator : public framework::ReaderBase {
  public:
-  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
-                      float max)
-      : FileReader(shapes), min_(min), max_(max) {
-    PADDLE_ENFORCE_LE(
-        min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
+  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float low,
+                      float high)
+      : framework::ReaderBase(), low_(low), high_(high), shapes_(shapes) {
+    PADDLE_ENFORCE_LE(low, high,
+                      "'low' shouldn't be greater than 'high'.(%f vs %f)", low,
+                      high);
     unsigned int seed = std::random_device()();
     engine_.seed(seed);
-    dist_ = std::uniform_real_distribution<float>(min_, max_);
+    dist_ = std::uniform_real_distribution<float>(low_, high_);
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
@@ -52,13 +53,12 @@ class RandomDataGenerator : public framework::FileReader {
 
   void ReInit() override { return; }
 
-  bool HasNext() const override { return true; }
-
  private:
-  float min_;
-  float max_;
+  float low_;
+  float high_;
   std::minstd_rand engine_;
   std::uniform_real_distribution<float> dist_;
+  std::vector<framework::DDim> shapes_;
 };
 
 template <typename T>
@@ -73,29 +73,28 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase {
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
     PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      int(shape_concat.size()),
+                      static_cast<int>(shape_concat.size()),
                       "The accumulate of all ranks should be equal to the "
                       "shape concat's length.");
     std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("min"),
-                                          Attr<float>("max")));
+    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("low"),
+                                          Attr<float>("high")));
   }
 };
 
 class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
- public:
-  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : FileReaderMakerBase(op_proto, op_checker) {
-    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
-    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
+ protected:
+  void Apply() override {
+    AddAttr<float>("low", "The lower bound of reader's uniform distribution.");
+    AddAttr<float>("high", "The upper bound of reader's uniform distribution.");
     AddComment(R"DOC(
       CreateRandomDataGenerator Operator
 
       This Op creates a random reader.
       The reader generates random data instead of really reading from files.
-      Generated data follow an uniform distribution between 'min' and 'max'.
+      Generated data follow an uniform distribution between 'low' and 'high'.
     )DOC");
   }
 };
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
index c3eb247bbe2041ae5a673c4fd3c1284c71276f91..559827f08494af6730aafa1e67c46a47c21dedf6 100644
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -18,24 +18,35 @@
 namespace paddle {
 namespace operators {
 namespace reader {
+template <bool ThreadSafe>
 class RecordIOFileReader : public framework::FileReader {
  public:
-  RecordIOFileReader(const std::string& filename,
-                     const std::vector<framework::DDim>& shapes)
-      : FileReader(shapes),
+  explicit RecordIOFileReader(const std::string& filename,
+                              const std::vector<framework::DDim>& dims)
+      : FileReader(dims),
         scanner_(filename),
         dev_ctx_(*platform::DeviceContextPool::Instance().Get(
-            platform::CPUPlace())) {}
-
-  void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+            platform::CPUPlace())) {
+    if (ThreadSafe) {
+      mutex_.reset(new std::mutex());
+    }
+    LOG(INFO) << "Creating file reader" << filename;
   }
 
-  bool HasNext() const override { return scanner_.HasNext(); }
-
   void ReInit() override { scanner_.Reset(); }
 
+ protected:
+  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
+    if (ThreadSafe) {
+      std::lock_guard<std::mutex> guard(*mutex_);
+      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
+    } else {
+      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
+    }
+  }
+
  private:
+  std::unique_ptr<std::mutex> mutex_;
   recordio::Scanner scanner_;
   const platform::DeviceContext& dev_ctx_;
 };
@@ -51,27 +62,31 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
     PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      int(shape_concat.size()),
+                      static_cast<int>(shape_concat.size()),
                       "The accumulate of all ranks should be equal to the "
                       "shape concat's length.");
-    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
     std::string filename = Attr<std::string>("filename");
 
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new RecordIOFileReader(filename, shapes));
+
+    out->Reset(new RecordIOFileReader<true>(
+        filename, RestoreShapes(shape_concat, ranks)));
   }
 };
 
 class CreateRecordIOReaderOpMaker : public FileReaderMakerBase {
- public:
-  CreateRecordIOReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : FileReaderMakerBase(op_proto, op_checker) {
-    AddAttr<std::string>("filename", "The filename of record io reader");
+ protected:
+  void Apply() override {
+    AddAttr<std::string>(
+        "filename",
+        "The filename of record file. This file will given to reader.");
     AddComment(R"DOC(
-      CreateRecordIOReader Operator
+Open a recordio file and return the reader object. The returned reader object
+is thread-safe.
 
-      Create a reader from a record io file
+NOTE: This is a very low-level API. It is used for debugging data file or
+training. Please use `open_files` instead of this API for production usage.
     )DOC");
   }
 };
@@ -85,3 +100,5 @@ namespace reader = paddle::operators::reader;
 REGISTER_FILE_READER_OPERATOR(create_recordio_file_reader,
                               reader::CreateRecordIOReaderOp,
                               reader::CreateRecordIOReaderOpMaker);
+
+REGISTER_FILE_READER(recordio, reader::RecordIOFileReader<false>);
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index 4dac3831109beeed660d32f08fb27c7adf62ac2b..57e8e21214b7c99e52550fe51a67c9b5201cb46f 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <random>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
@@ -20,43 +23,54 @@ namespace reader {
 
 class ShuffleReader : public framework::DecoratedReader {
  public:
-  ShuffleReader(ReaderBase* reader, int buffer_size)
-      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
-    buffer_.reserve(buffer_size);
+  ShuffleReader(const std::shared_ptr<ReaderBase>& reader, size_t buffer_size,
+                size_t seed = 0)
+      : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) {
+    VLOG(10) << "Create shuffle reader of " << reader_;
+    if (seed_ == 0) {
+      std::random_device device;
+      seed_ = device();
+    }
+    ReloadBuffer();
   }
 
-  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    out->clear();
+    if (iteration_pos_ >= buffer_.size()) {
+      VLOG(10) << "Resetting shuffle buffer";
+      ReloadBuffer();
+      if (buffer_.empty()) {
+        return;
+      }
+    }
+    *out = buffer_[iteration_pos_++];
+  }
 
  private:
-  int buffer_size_;
-  std::vector<std::vector<framework::LoDTensor>> buffer_;
-  size_t iteration_pos_;
-};
-
-void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  if (iteration_pos_ >= buffer_.size()) {
-    // Reload buffer with new data
+  void ReloadBuffer() {
     buffer_.clear();
     buffer_.reserve(buffer_size_);
-    for (int i = 0; i < buffer_size_; ++i) {
-      buffer_.push_back(std::vector<framework::LoDTensor>());
-      reader_->ReadNext(&buffer_.back());
-      if (buffer_.back().empty()) {
-        buffer_.pop_back();
+    iteration_pos_ = 0;
+    for (size_t i = 0; i < buffer_size_; ++i) {
+      std::vector<framework::LoDTensor> ins;
+      reader_->ReadNext(&ins);
+      if (ins.empty()) {
         break;
       }
+      buffer_.emplace_back(ins);
     }
-    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
-    // optimize.
-    std::random_shuffle(buffer_.begin(), buffer_.end());
-    iteration_pos_ = 0;
-  }
-  out->clear();
-  if (!buffer_.empty()) {
-    std::swap(*out, buffer_[iteration_pos_++]);
+    std::mt19937 g(seed_);
+    std::shuffle(buffer_.begin(), buffer_.end(), g);
+    seed_ = g();  // update seed_;
+    VLOG(10) << "random buffer size = " << buffer_.size();
   }
-  // if buffer_ is empty, the 'out' will return as an empty vector.
-}
+
+  size_t buffer_size_;
+  std::vector<std::vector<framework::LoDTensor>> buffer_;
+
+  size_t iteration_pos_;
+  size_t seed_;
+};
 
 class CreateShuffleReaderOp : public framework::OperatorBase {
  public:
@@ -65,19 +79,22 @@ class CreateShuffleReaderOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope& scope,
                const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
     const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
                                         ->Get<framework::ReaderHolder>();
-    auto* out = scope.FindVar(Output("Out"))
-                    ->template GetMutable<framework::ReaderHolder>();
     out->Reset(
-        new ShuffleReader(underlying_reader.Get(), Attr<int>("buffer_size")));
+        new ShuffleReader(underlying_reader.Get(),
+                          static_cast<size_t>(Attr<int>("buffer_size"))));
   }
 };
 
 class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase {
- public:
-  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
-      : DecoratedReaderMakerBase(op_proto, op_checker) {
+ protected:
+  void Apply() override {
     AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
     AddComment(R"DOC(
       CreateShuffleReader Operator
diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3798015146f4ffb085aa82e23ca3f1fb3c5cf5a4
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
@@ -0,0 +1,79 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class ThreadedReader : public framework::DecoratedReader {
+ public:
+  explicit ThreadedReader(const std::shared_ptr<ReaderBase>& reader)
+      : DecoratedReader(reader) {}
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    reader_->ReadNext(out);
+  }
+
+  void ReInit() override { reader_->ReInit(); }
+
+ private:
+  std::mutex mutex_;
+};
+
+class CreateThreadedReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    out->Reset(new ThreadedReader(underlying_reader.Get()));
+  }
+};
+
+class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddComment(R"DOC(
+      CreateThreadedReader Operator
+
+      This operator creates a threaded reader. A threaded reader's
+      'ReadNext()' can be invoked by several threads at the same
+      time.
+      When the attribute 'safe_mode' is true, the threaded reader's
+      'ReInit()' is disabled to avoid unexpected bugs in multi-thread
+      environment.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace reader = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader,
+                                   reader::CreateThreadedReaderOp,
+                                   reader::CreateThreadedReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..30d962ba10a954a837f9771d21cedf0feb643439
--- /dev/null
+++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h
@@ -0,0 +1,103 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/operators/reader/blocking_queue.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class LoDTensorBlockingQueueHolder;
+
+class LoDTensorBlockingQueue {
+  friend class LoDTensorBlockingQueueHolder;
+
+ private:
+  LoDTensorBlockingQueue(size_t capacity,
+                         const std::vector<framework::DDim>& dims)
+      : queue_(capacity), dims_(dims) {}
+
+ public:
+  bool Push(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
+    CheckDims(lod_tensor_vec);
+    return queue_.Send(lod_tensor_vec);
+  }
+
+  bool Push(std::vector<framework::LoDTensor>&& lod_tensor_vec) {
+    CheckDims(lod_tensor_vec);
+    return queue_.Send(std::move(lod_tensor_vec));
+  }
+
+  std::vector<framework::LoDTensor> Pop(bool* ok = nullptr) {
+    std::vector<framework::LoDTensor> lod_tensor_vec;
+    bool success = queue_.Receive(&lod_tensor_vec);
+    if (ok != nullptr) *ok = success;
+    return lod_tensor_vec;
+  }
+
+  inline size_t Cap() const { return queue_.Cap(); }
+
+  inline size_t Size() const { return queue_.Size(); }
+
+  inline void Close() { return queue_.Close(); }
+
+  inline bool IsClosed() const { return queue_.IsClosed(); }
+
+ private:
+  void CheckDims(const std::vector<framework::LoDTensor>& lod_tensor_vec) {
+    PADDLE_ENFORCE(dims_.size() == lod_tensor_vec.size(),
+                   "Expect input size is %d but found %s", dims_.size(),
+                   lod_tensor_vec.size());
+    for (size_t i = 0; i < dims_.size(); ++i) {
+      const auto& in_dims = framework::slice_ddim(
+          lod_tensor_vec[i].dims(), 1, lod_tensor_vec[i].dims().size());
+      const auto& expect_dims =
+          framework::slice_ddim(dims_[i], 1, dims_[i].size());
+      PADDLE_ENFORCE(in_dims == expect_dims,
+                     "Dims of the %d-th input tensor do not match", i);
+    }
+  }
+
+  BlockingQueue<std::vector<framework::LoDTensor>> queue_;
+  std::vector<framework::DDim> dims_;
+};
+
+class LoDTensorBlockingQueueHolder {
+ public:
+  void InitOnce(size_t capacity, const std::vector<framework::DDim>& dims) {
+    PADDLE_ENFORCE(
+        queue_ == nullptr,
+        "LoDTensorBlockingQueueHolder::InitOnce() can only be called once");
+    queue_.reset(new LoDTensorBlockingQueue(capacity, dims));
+  }
+
+  inline const std::shared_ptr<LoDTensorBlockingQueue>& GetQueue() const {
+    return queue_;
+  }
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31e5d81e55ed9703eb3a9ef2595fa2a280f1a734
--- /dev/null
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -0,0 +1,213 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <thread>  // NOLINT
+
+#include "paddle/fluid/operators/reader/blocking_queue.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class MultiFileReader : public framework::ReaderBase {
+ public:
+  MultiFileReader(const std::vector<std::string>& file_names,
+                  const std::vector<framework::DDim>& dims, size_t thread_num,
+                  size_t buffer_size)
+      : buffer_size_(buffer_size) {
+    readers_.reserve(file_names.size());
+    for (const std::string& f_name : file_names) {
+      readers_.emplace_back(CreateReaderByFileName(f_name, dims));
+    }
+    prefetchers_.resize(thread_num);
+    StartNewScheduler();
+  }
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+  void ReInit() override;
+
+  ~MultiFileReader() { EndScheduler(); }
+
+ private:
+  void StartNewScheduler();
+  void EndScheduler();
+  void ScheduleThreadFunc();
+  void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx);
+
+  std::vector<std::unique_ptr<framework::ReaderBase>> readers_;
+  std::thread scheduler_;
+  std::vector<std::thread> prefetchers_;
+  size_t buffer_size_;
+  reader::BlockingQueue<size_t>* waiting_reader_idx_;
+  reader::BlockingQueue<size_t>* available_thread_idx_;
+  reader::BlockingQueue<std::vector<framework::LoDTensor>>* buffer_;
+};
+
+void MultiFileReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  if (!buffer_->Receive(out)) {
+    out->clear();
+  }
+}
+
+void MultiFileReader::ReInit() {
+  EndScheduler();
+  StartNewScheduler();
+}
+
+void MultiFileReader::StartNewScheduler() {
+  size_t thread_num = prefetchers_.size();
+  waiting_reader_idx_ = new reader::BlockingQueue<size_t>(readers_.size());
+  available_thread_idx_ = new reader::BlockingQueue<size_t>(thread_num);
+  buffer_ = new reader::BlockingQueue<std::vector<framework::LoDTensor>>(
+      buffer_size_);
+
+  for (size_t i = 0; i < readers_.size(); ++i) {
+    waiting_reader_idx_->Send(i);
+  }
+  waiting_reader_idx_->Close();
+  for (size_t i = 0; i < thread_num; ++i) {
+    available_thread_idx_->Send(i);
+  }
+
+  scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
+}
+
+void MultiFileReader::EndScheduler() {
+  available_thread_idx_->Close();
+  buffer_->Close();
+  waiting_reader_idx_->Close();
+  if (scheduler_.joinable()) {
+    scheduler_.join();
+  }
+  delete buffer_;
+  delete available_thread_idx_;
+  delete waiting_reader_idx_;
+}
+
+void MultiFileReader::ScheduleThreadFunc() {
+  VLOG(5) << "MultiFileReader schedule thread starts.";
+  size_t completed_thread_num = 0;
+  size_t thread_idx;
+  while (available_thread_idx_->Receive(&thread_idx)) {
+    std::thread& prefetcher = prefetchers_[thread_idx];
+    if (prefetcher.joinable()) {
+      prefetcher.join();
+    }
+    size_t reader_idx;
+    if (waiting_reader_idx_->Receive(&reader_idx)) {
+      // Still have files to read. Start a new prefetch thread.
+      prefetcher = std::thread([this, reader_idx, thread_idx] {
+        PrefetchThreadFunc(reader_idx, thread_idx);
+      });
+    } else {
+      // No more file to read.
+      ++completed_thread_num;
+      if (completed_thread_num == prefetchers_.size()) {
+        buffer_->Close();
+        break;
+      }
+    }
+  }
+  // If users invoke ReInit() when scheduler is running, it will close the
+  // 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler
+  // to release their resource. So a check is needed before scheduler ends.
+  for (auto& p : prefetchers_) {
+    if (p.joinable()) {
+      p.join();
+    }
+  }
+  VLOG(5) << "MultiFileReader schedule thread terminates.";
+}
+
+void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) {
+  VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts.";
+  std::unique_ptr<framework::ReaderBase>& reader = readers_[reader_idx];
+  while (true) {
+    std::vector<framework::LoDTensor> ins;
+    reader->ReadNext(&ins);
+    if (ins.empty()) {
+      reader->ReInit();
+      break;
+    }
+    try {
+      buffer_->Send(std::move(ins));
+    } catch (paddle::platform::EnforceNotMet e) {
+      VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
+                 "thread of file idx '"
+              << reader_idx << "' will terminate.";
+      break;
+    }
+  }
+
+  if (!available_thread_idx_->Send(thread_idx)) {
+    VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
+               "Fail to send thread_idx.";
+  }
+  VLOG(5) << "The prefetch thread of file idx '" << reader_idx
+          << "' terminates.";
+}
+
+class OpenFilesOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
+    const auto& ranks = Attr<std::vector<int>>("ranks");
+    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
+    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
+                      static_cast<int>(shape_concat.size()),
+                      "The accumulate of all ranks should be equal to the "
+                      "shape concat's length.");
+    const auto& file_names = Attr<std::vector<std::string>>("file_names");
+    PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
+    const size_t thread_num = Attr<int>("thread_num");
+    const size_t buffer_size = Attr<int>("buffer_size");
+
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new MultiFileReader(file_names,
+                                   RestoreShapes(shape_concat, ranks),
+                                   thread_num, buffer_size));
+  }
+};
+
+class OpenFilesOpMaker : public FileReaderMakerBase {
+ protected:
+  void Apply() override {
+    AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
+    AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
+        .GreaterThan(0);
+    AddAttr<int>("buffer_size", "The size of prefetch buffer.").GreaterThan(0);
+
+    AddComment(R"DOC(
+      OpenFiles Operator
+
+      An OpenFilesOp creates a MultiFileReader, which is able to
+      read data multi-threaded from multiple files.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace reader = paddle::operators::reader;
+
+REGISTER_FILE_READER_OPERATOR(open_files, reader::OpenFilesOp,
+                              reader::OpenFilesOpMaker);
diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d1b381d56c8cdc1e79e594b18c1a1ed59ab5284
--- /dev/null
+++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc
@@ -0,0 +1,219 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <chrono>  // NOLINT
+#include <set>
+#include <thread>  // NOLINT
+#include <vector>
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/operators/reader/blocking_queue.h"
+
+using paddle::operators::reader::BlockingQueue;
+
+TEST(BlockingQueue, CapacityTest) {
+  size_t cap = 10;
+  BlockingQueue<int> q(cap);
+  EXPECT_EQ(q.Cap(), cap);
+}
+
+void FirstInFirstOut(size_t queue_cap, size_t elem_num, size_t send_time_gap,
+                     size_t receive_time_gap) {
+  BlockingQueue<size_t> q(queue_cap);
+  std::thread sender([&]() {
+    for (size_t i = 0; i < elem_num; ++i) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(send_time_gap));
+      EXPECT_TRUE(q.Send(i));
+    }
+    q.Close();
+  });
+  size_t count = 0;
+  while (true) {
+    std::this_thread::sleep_for(std::chrono::milliseconds(receive_time_gap));
+    size_t elem;
+    if (!q.Receive(&elem)) {
+      break;
+    }
+    EXPECT_EQ(elem, count++);
+  }
+  sender.join();
+  EXPECT_EQ(count, elem_num);
+  EXPECT_TRUE(q.IsClosed());
+}
+
+TEST(BlockingQueue, FirstInFirstOutTest) {
+  FirstInFirstOut(2, 5, 2, 50);
+  FirstInFirstOut(2, 5, 50, 2);
+  FirstInFirstOut(10, 3, 50, 2);
+  FirstInFirstOut(10, 3, 2, 50);
+}
+
+TEST(BlockingQueue, SenderBlockingTest) {
+  const size_t queue_cap = 2;
+  BlockingQueue<size_t> q(queue_cap);
+  size_t send_count = 0;
+  std::thread sender([&]() {
+    for (size_t i = 0; i < 5; ++i) {
+      if (!q.Send(i)) {
+        break;
+      }
+      ++send_count;
+    }
+  });
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));
+  q.Close();
+  sender.join();
+  EXPECT_EQ(send_count, queue_cap);
+  std::vector<size_t> res;
+  while (true) {
+    size_t elem;
+    if (!q.Receive(&elem)) {
+      break;
+    }
+    res.push_back(elem);
+  }
+  EXPECT_EQ(res.size(), queue_cap);
+  for (size_t i = 0; i < res.size(); ++i) {
+    EXPECT_EQ(res[i], i);
+  }
+}
+
+TEST(BlockingQueue, ReceiverBlockingTest) {
+  const size_t queue_cap = 5;
+  BlockingQueue<size_t> q(queue_cap);
+  std::vector<size_t> receive_res;
+  std::thread receiver([&]() {
+    size_t elem;
+    while (true) {
+      if (!q.Receive(&elem)) {
+        break;
+      }
+      receive_res.push_back(elem);
+    }
+  });
+  std::vector<size_t> to_send{2, 1, 7};
+  for (auto e : to_send) {
+    q.Send(e);
+  }
+  q.Close();
+  receiver.join();
+  EXPECT_EQ(receive_res.size(), to_send.size());
+  for (size_t i = 0; i < to_send.size(); ++i) {
+    EXPECT_EQ(receive_res[i], to_send[i]);
+  }
+}
+
+void CheckIsUnorderedSame(const std::vector<std::vector<size_t>>& v1,
+                          const std::vector<std::vector<size_t>>& v2) {
+  std::set<size_t> s1;
+  std::set<size_t> s2;
+  for (auto vec : v1) {
+    for (size_t elem : vec) {
+      s1.insert(elem);
+    }
+  }
+  for (auto vec : v2) {
+    for (size_t elem : vec) {
+      s2.insert(elem);
+    }
+  }
+  EXPECT_EQ(s1.size(), s2.size());
+  auto it1 = s1.begin();
+  auto it2 = s2.begin();
+  while (it1 != s1.end()) {
+    EXPECT_EQ(*it1, *it2);
+    ++it1;
+    ++it2;
+  }
+}
+
+void MultiSenderMultiReceiver(const size_t queue_cap,
+                              const std::vector<std::vector<size_t>>& to_send,
+                              size_t receiver_num, size_t send_time_gap,
+                              size_t receive_time_gap) {
+  BlockingQueue<size_t> q(queue_cap);
+  size_t sender_num = to_send.size();
+  std::vector<std::thread> senders;
+  for (size_t s_idx = 0; s_idx < sender_num; ++s_idx) {
+    senders.emplace_back(std::thread([&, s_idx] {
+      for (size_t elem : to_send[s_idx]) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(send_time_gap));
+        EXPECT_TRUE(q.Send(elem));
+      }
+    }));
+  }
+  std::vector<std::thread> receivers;
+  std::mutex mu;
+  std::vector<std::vector<size_t>> res;
+  for (size_t r_idx = 0; r_idx < receiver_num; ++r_idx) {
+    receivers.emplace_back(std::thread([&] {
+      std::vector<size_t> receiver_res;
+      while (true) {
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(receive_time_gap));
+        size_t elem;
+        if (!q.Receive(&elem)) {
+          break;
+        }
+        receiver_res.push_back(elem);
+      }
+      std::lock_guard<std::mutex> lock(mu);
+      res.push_back(receiver_res);
+    }));
+  }
+  for (auto& t : senders) {
+    t.join();
+  }
+  q.Close();
+  for (auto& t : receivers) {
+    t.join();
+  }
+  CheckIsUnorderedSame(to_send, res);
+}
+
+TEST(BlockingQueue, MultiSenderMultiReaderTest) {
+  std::vector<std::vector<size_t>> to_send_1{{2, 3, 4}, {9}, {0, 7, 15, 6}};
+  MultiSenderMultiReceiver(2, to_send_1, 2, 0, 0);
+  MultiSenderMultiReceiver(10, to_send_1, 2, 0, 0);
+  MultiSenderMultiReceiver(2, to_send_1, 20, 0, 0);
+  MultiSenderMultiReceiver(2, to_send_1, 2, 50, 0);
+  MultiSenderMultiReceiver(2, to_send_1, 2, 0, 50);
+
+  std::vector<std::vector<size_t>> to_send_2{
+      {2, 3, 4}, {}, {0, 7, 15, 6, 9, 32}};
+  MultiSenderMultiReceiver(2, to_send_2, 3, 0, 0);
+  MultiSenderMultiReceiver(20, to_send_2, 3, 0, 0);
+  MultiSenderMultiReceiver(2, to_send_2, 30, 0, 0);
+  MultiSenderMultiReceiver(2, to_send_2, 3, 50, 0);
+  MultiSenderMultiReceiver(2, to_send_2, 3, 0, 50);
+}
+
+struct MyClass {
+  MyClass() : val_(0) {}
+  explicit MyClass(int val) : val_(val) {}
+  MyClass(const MyClass& b) { val_ = b.val_; }
+  MyClass(MyClass&& b) { val_ = b.val_; }
+  void operator=(const MyClass& b) { val_ = b.val_; }
+
+  int val_;
+};
+
+TEST(BlockingQueue, MyClassTest) {
+  BlockingQueue<MyClass> q(2);
+  MyClass a(200);
+  q.Send(std::move(a));
+  MyClass b;
+  q.Receive(&b);
+  EXPECT_EQ(a.val_, b.val_);
+}
diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc
index 33d4ff4099a509daeaab83032c5d382718904dc7..e11256a49ffa6adc9410376cc8a71fa017df7e9c 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -12,7 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "reader_op_registry.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -31,11 +33,28 @@ std::vector<framework::DDim> RestoreShapes(const std::vector<int>& shape_concat,
   return res;
 }
 
-FileReaderMakerBase::FileReaderMakerBase(
-    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
-    framework::OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(op_proto, op_checker) {
-  AddOutput("Out", "(ReaderHolder) The created random reader.").AsDuplicable();
+std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry() {
+  static std::unordered_map<std::string, FileReaderCreator> regs;
+  return regs;
+}
+
+std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
+    const std::string& file_name, const std::vector<framework::DDim>& dims) {
+  size_t separator_pos = file_name.find_last_of(kFileFormatSeparator);
+  PADDLE_ENFORCE_NE(separator_pos, std::string::npos,
+                    "File name illegal! A legal file name should be like: "
+                    "[file_name].[file_format] (e.g., 'data_file.recordio').");
+  std::string filetype = file_name.substr(separator_pos + 1);
+
+  auto itor = FileReaderRegistry().find(filetype);
+  PADDLE_ENFORCE(itor != FileReaderRegistry().end(),
+                 "No file reader registered for '%s' format.", filetype);
+  framework::ReaderBase* reader = (itor->second)(file_name, dims);
+  return std::unique_ptr<framework::ReaderBase>(reader);
+}
+
+void FileReaderMakerBase::Make() {
+  AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable();
   AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
   AddAttr<std::vector<int>>(
       "ranks",
@@ -46,6 +65,7 @@ FileReaderMakerBase::FileReaderMakerBase(
       "It means the reader will generate two data each time,"
       "whose shapes are [2,3,4] and [5,6] respectively.");
   AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
+  Apply();
 }
 
 void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
@@ -95,6 +115,7 @@ void DecoratedReaderInferShape::operator()(
       boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
   out_reader->SetLoDLevels(in_reader->GetLoDLevels());
 }
+
 void DecoratedReaderInferVarType::operator()(
     const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
   std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
@@ -105,13 +126,11 @@ void DecoratedReaderInferVarType::operator()(
   out_reader->SetDataTypes(in_reader->GetDataTypes());
 }
 
-DecoratedReaderMakerBase::DecoratedReaderMakerBase(
-    framework::OpProtoAndCheckerMaker::OpProto* op_proto,
-    framework::OpAttrChecker* op_checker)
-    : OpProtoAndCheckerMaker(op_proto, op_checker) {
+void DecoratedReaderMakerBase::Make() {
   AddInput("UnderlyingReader",
            "(ReaderHolder) The underlying reader for creating a batch reader.");
   AddOutput("Out", "(ReaderHolder) The created batch reader.");
+  Apply();
 }
 
 }  // namespace reader
diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h
index d1f0498f4692247cda72fbcbdd5070ddfaa11553..244bf15f068a47efc29ee54492cdbdeb10025020 100644
--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 
@@ -21,12 +23,34 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
+static constexpr char kFileFormatSeparator[] = ".";
+
+using FileReaderCreator = std::function<framework::ReaderBase*(
+    const std::string&, const std::vector<framework::DDim>&)>;
+
+std::unordered_map<std::string, FileReaderCreator>& FileReaderRegistry();
+
+template <typename Reader>
+int RegisterFileReader(const std::string& filetype) {
+  FileReaderRegistry()[filetype] = [](
+      const std::string& fn, const std::vector<framework::DDim>& dims) {
+    return new Reader(fn, dims);
+  };
+  return 0;
+}
+
+std::unique_ptr<framework::ReaderBase> CreateReaderByFileName(
+    const std::string& file_name, const std::vector<framework::DDim>& dims);
+
 extern std::vector<framework::DDim> RestoreShapes(
     const std::vector<int>& shape_concat, const std::vector<int>& ranks);
 
 class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
  public:
-  FileReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+  void Make() final;
+
+ protected:
+  virtual void Apply() = 0;
 };
 
 class FileReaderInferShape : public framework::InferShapeBase {
@@ -55,7 +79,10 @@ class DecoratedReaderInferVarType : public framework::VarTypeInference {
 
 class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
  public:
-  DecoratedReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+  void Make() final;
+
+ protected:
+  virtual void Apply() = 0;
 };
 
 }  // namespace reader
@@ -73,3 +100,15 @@ class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
                     paddle::operators::reader::DecoratedReaderInferShape, \
                     paddle::framework::EmptyGradOpMaker,                  \
                     paddle::operators::reader::DecoratedReaderInferVarType)
+
+#define REGISTER_FILE_READER(_filetype, _reader)            \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                           \
+      _reg_file_reader_##_filetype,                         \
+      "Must use REGISTER_FILE_READER in global namespace"); \
+  int TouchFileReader##_filetype() { return 0; }            \
+  int _reg_file_reader_entry_##filetype =                   \
+      paddle::operators::reader::RegisterFileReader<_reader>(#_filetype)
+
+#define USE_FILE_READER(filetype)         \
+  extern int TouchFileReader##filetype(); \
+  static int _use_##filetype = TouchFileReader##filetype()
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 00241e768217db0a611c00bbc72e2fb83ade73b4..162bfcbb0844d29385d0f8ad5d25a3f8de6bd41b 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -429,7 +429,8 @@ class RecurrentGradOp : public RecurrentBase {
 
           auto sum_op = framework::OpRegistry::CreateOp(
               "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-              {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+              {{"Out", {pg_names[param_id]}}},
+              framework::AttributeMap{{"use_mkldnn", {false}}});
           sum_op->Run(cur_scope, place);
 
           cur_scope.Rename(new_inside_name, inside_grad_name);
@@ -508,8 +509,7 @@ class RecurrentGradOp : public RecurrentBase {
 
 class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kInputs, "rnn inputs").AsDuplicable();
     AddInput(kInitialStates, "rnn initial states").AsDuplicable();
     AddInput(kParameters,
@@ -596,7 +596,7 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
       }
     }
     grad->SetAttrMap(this->Attrs());
-    grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
+    grad->SetBlockAttr(kStepBlock, grad_block_[0]);
 
     return std::unique_ptr<framework::OpDesc>(grad);
   }
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 083c1fae5e2016ada6309aba78bdfa6ad7fef89c..9854a31f5b10f5ecd940c0d41c2c3e468fc17bad 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-
-#include <future>
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -36,25 +36,29 @@ class RecvOp : public framework::OperatorBase {
                const platform::Place& place) const override {
     auto outs = Outputs("Out");
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    int sync_mode = Attr<int>("sync_mode");
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
     for (size_t i = 0; i < outs.size(); i++) {
-      VLOG(3) << "getting " << outs[i];
-      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+      rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]);
+    }
+    if (sync_mode) {
+      rpc_client->Wait();
     }
-    PADDLE_ENFORCE(client_.Wait());
   }
-
- private:
-  mutable detail::RPCClient client_;
 };
 
 class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RecvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() {
     AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
     AddComment(R"DOC(
 Recv operator
@@ -66,12 +70,22 @@ This operator can get variables from server side.
                                       "Server endpoints in the order of input "
                                       "variables for mapping")
         .SetDefault({});
+    AddAttr<int>("sync_mode",
+                 "(int, default 0)"
+                 "sync recv or async recv.")
+        .SetDefault(0);
   }
 };
 
+class RecvOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
+REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker,
+                  ops::RecvOpMaker, ops::RecvOpShapeInference);
diff --git a/paddle/fluid/operators/reduce_max_op.cc b/paddle/fluid/operators/reduce_max_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95d3768e1fdf6947659c7b3a1c9d57fad741472a
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_REDUCE_OP(reduce_max);
+REGISTER_OP_CPU_KERNEL(
+    reduce_max, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MaxFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::MaxFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_max_op.cu b/paddle/fluid/operators/reduce_max_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0d86b3127e42f7ee14ba57b1c762e8128a0f2d54
--- /dev/null
+++ b/paddle/fluid/operators/reduce_max_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_max,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MaxFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MaxFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_max_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cc b/paddle/fluid/operators/reduce_mean_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc258c2496340b47d24dc89f16f7419dbb4b0d95
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_REDUCE_OP(reduce_mean);
+REGISTER_OP_CPU_KERNEL(reduce_mean,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int, ops::MeanFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int64_t, ops::MeanFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::MeanGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..960cb3235be7f4cc98b97d3b088ceaeb3d4a4209
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_mean_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_mean,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MeanFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MeanFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_mean_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MeanGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MeanGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_mean_op.h b/paddle/fluid/operators/reduce_mean_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1359679c4767d2032bf3e3a90849ad2a2ef3e829
--- /dev/null
+++ b/paddle/fluid/operators/reduce_mean_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct MeanFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->mean(dim);
+  }
+};
+
+struct MeanGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) / dx->constant(size);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_min_max_op.h b/paddle/fluid/operators/reduce_min_max_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec59f3e71c1c702655a3feed10935b2f5a29d8a8
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_max_op.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct MaxFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->maximum(dim);
+  }
+};
+
+struct MinFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->minimum(dim);
+  }
+};
+
+struct MaxOrMinGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    auto equals = (*x) == y->broadcast(dim);
+    auto ones = dx->constant(1);
+    auto zeros = dx->constant(0);
+    // If there are multiple minimum or maximum elements, the subgradient of
+    // each is the set [0, 1], and we pass gradient to all of them here.
+    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_min_op.cc b/paddle/fluid/operators/reduce_min_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..330a86d2e4237a10d8cf6fd40025540edf08d897
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_REDUCE_OP(reduce_min);
+REGISTER_OP_CPU_KERNEL(
+    reduce_min, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::MinFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::MinFunctor>);
+REGISTER_OP_CPU_KERNEL(
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_min_op.cu b/paddle/fluid/operators/reduce_min_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..da466f805eff4709dc23471baef03e94052ee6c1
--- /dev/null
+++ b/paddle/fluid/operators/reduce_min_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_min_max_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_min,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::MinFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::MinFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_min_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::MaxOrMinGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::MaxOrMinGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_op.cc b/paddle/fluid/operators/reduce_op.cc
deleted file mode 100644
index 69e8f8081e93cb74177eac1a57e0eaf284951e3f..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_op.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/reduce_op.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class ReduceOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of ReduceOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of ReduceOp should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    int dim = ctx->Attrs().Get<int>("dim");
-    if (dim < 0) dim = x_rank + dim;
-    PADDLE_ENFORCE_LT(
-        dim, x_rank,
-        "The dim should be in the range [-rank(input), rank(input)).");
-    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
-    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
-    if (reduce_all) {
-      if (keep_dim)
-        ctx->SetOutputDim(
-            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
-      else
-        ctx->SetOutputDim("Out", {1});
-    } else {
-      auto dims_vector = vectorize(x_dims);
-      if (keep_dim || x_rank == 1) {
-        dims_vector[dim] = 1;
-      } else {
-        dims_vector.erase(dims_vector.begin() + dim);
-      }
-      auto out_dims = framework::make_ddim(dims_vector);
-      ctx->SetOutputDim("Out", out_dims);
-      if (dim != 0) {
-        // Only pass LoD when not reducing on the first dim.
-        ctx->ShareLoD("X", /*->*/ "Out");
-      }
-    }
-  }
-};
-
-class ReduceGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null.");
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_rank = x_dims.size();
-    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
-    int dim = ctx->Attrs().Get<int>("dim");
-    if (dim < 0) dim = x_rank + dim;
-    PADDLE_ENFORCE_LT(
-        dim, x_rank,
-        "The dim should be in the range [-rank(input), rank(input)).");
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-      ctx->ShareLoD("X", /*->*/ x_grad_name);
-    }
-  }
-};
-
-class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-             "(Tensor) The input tensor. Tensors with rank at most 6 are "
-             "supported.");
-    AddOutput("Out", "(Tensor) The result tensor.");
-    AddAttr<int>(
-        "dim",
-        "(int, default 0) The dimension to reduce. "
-        "Must be in the range [-rank(input), rank(input)). "
-        "If `dim < 0`, the dim to reduce is `rank + dim`. "
-        "Note that reducing on the first dim will make the LoD info lost.")
-        .SetDefault(0);
-    AddAttr<bool>("keep_dim",
-                  "(bool, default false) "
-                  "If true, retain the reduced dimension with length 1.")
-        .SetDefault(false);
-    AddAttr<bool>("reduce_all",
-                  "(bool, default false) "
-                  "If true, output a scalar reduced along all dimensions.")
-        .SetDefault(false);
-    comment_ = R"DOC(
-{ReduceOp} Operator.
-
-This operator computes the {reduce} of input tensor along the given dimension. 
-The result tensor has 1 fewer dimension than the input unless keep_dim is true.
-If reduce_all is true, just reduce along all dimensions and output a scalar.
-
-)DOC";
-    AddComment(comment_);
-  }
-
- protected:
-  std::string comment_;
-
-  void Replace(std::string &src, std::string from, std::string to) {
-    std::size_t len_from = std::strlen(from.c_str());
-    std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src.find(from); pos != std::string::npos;
-         pos = src.find(from, pos + len_to)) {
-      src.replace(pos, len_from, to);
-    }
-  }
-
-  void SetComment(std::string name, std::string op) {
-    Replace(comment_, "{ReduceOp}", name);
-    Replace(comment_, "{reduce}", op);
-  }
-};
-
-class ReduceSumOpMaker : public ReduceOpMaker {
- public:
-  ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : ReduceOpMaker(proto, op_checker) {
-    SetComment("ReduceSum", "sum");
-    AddComment(comment_);
-  }
-};
-
-class ReduceMeanOpMaker : public ReduceOpMaker {
- public:
-  ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : ReduceOpMaker(proto, op_checker) {
-    SetComment("ReduceMean", "mean");
-    AddComment(comment_);
-  }
-};
-
-class ReduceMaxOpMaker : public ReduceOpMaker {
- public:
-  ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : ReduceOpMaker(proto, op_checker) {
-    SetComment("ReduceMax", "max");
-    AddComment(comment_);
-  }
-};
-
-class ReduceMinOpMaker : public ReduceOpMaker {
- public:
-  ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : ReduceOpMaker(proto, op_checker) {
-    SetComment("ReduceMin", "min");
-    AddComment(comment_);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad,
-            ops::ReduceGradOp);
-
-REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
-            reduce_mean_grad, ops::ReduceGradOp);
-
-REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
-            ops::ReduceGradOp);
-
-REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
-            ops::ReduceGradOp);
-
-#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
-  REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           float, ops::functor>,               \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           double, ops::functor>,              \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           int, ops::functor>,                 \
-                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
-                                           int64_t, ops::functor>);            \
-  REGISTER_OP_CPU_KERNEL(                                                      \
-      reduce_type##_grad,                                                      \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,         \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,        \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,           \
-                            ops::grad_functor>,                                \
-      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,       \
-                            ops::grad_functor>);
-
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
diff --git a/paddle/fluid/operators/reduce_op.cu b/paddle/fluid/operators/reduce_op.cu
deleted file mode 100644
index ae29587f55847315b1d84f1344677e753fe01a9b..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reduce_op.cu
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#define EIGEN_USE_GPU
-#include "paddle/fluid/operators/reduce_op.h"
-
-namespace ops = paddle::operators;
-
-#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor)    \
-  REGISTER_OP_CUDA_KERNEL(                                                \
-      reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
-                                     float, ops::functor>,                \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, double,      \
-                        ops::functor>,                                    \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int,         \
-                        ops::functor>,                                    \
-      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t,     \
-                        ops::functor>);                                   \
-  REGISTER_OP_CUDA_KERNEL(                                                \
-      reduce_type##_grad,                                                 \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,   \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,  \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,     \
-                            ops::grad_functor>,                           \
-      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
-                            ops::grad_functor>);
-
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
index ec23325e571277d6dc78c18303692f5b483d278b..72b6cf1773d5bcc42e40e72111179d454d2bb4a9 100644
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -14,158 +14,63 @@ limitations under the License. */
 
 #pragma once
 
-#include "glog/logging.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/reduce_op_function.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using DDim = framework::DDim;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-struct SumFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.sum(dim);
-  }
-};
-
-struct SumGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
-                  const Dim& dim, int size) {
-    dx.device(place) = dy.broadcast(dim);
-  }
-};
-
-struct MeanFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.mean(dim);
-  }
-};
-
-struct MeanGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
-                  const Dim& dim, int size) {
-    dx.device(place) = dy.broadcast(dim) / dx.constant(size);
+#define HANDLE_DIM(NDIM, RDIM)                                            \
+  if (ndim == NDIM && rdim == RDIM) {                                     \
+    ReduceFunctor<DeviceContext, T, NDIM, RDIM, Functor>(                 \
+        context.template device_context<DeviceContext>(), *input, output, \
+        dims, keep_dim);                                                  \
   }
-};
-
-struct MaxFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.maximum(dim);
-  }
-};
-
-struct MinFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
-    y.device(place) = x.minimum(dim);
-  }
-};
-
-struct MaxOrMinGradFunctor {
-  template <typename DeviceContext, typename X, typename Y, typename DX,
-            typename DY, typename Dim>
-  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
-                  const Dim& dim, int size) {
-    auto equals = x == y.broadcast(dim);
-    auto ones = dx.constant(1);
-    auto zeros = dx.constant(0);
-    // If there are multiple minimum or maximum elements, the subgradient of
-    // each is the set [0, 1], and we pass gradient to all of them here.
-    dx.device(place) = dy.broadcast(dim) * equals.select(ones, zeros);
-  }
-};
 
 template <typename DeviceContext, typename T, typename Functor>
 class ReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     bool reduce_all = context.Attr<bool>("reduce_all");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+
+    auto dims = context.Attr<std::vector<int>>("dim");
+    bool keep_dim = context.Attr<bool>("keep_dim");
+
     if (reduce_all) {
       // Flatten and reduce 1-D tensor
-      auto* input = context.Input<Tensor>("X");
-      auto* output = context.Output<Tensor>("Out");
-      output->mutable_data<T>(context.GetPlace());
       auto x = EigenVector<T>::Flatten(*input);
       auto out = EigenScalar<T>::From(*output);
       auto& place =
           *context.template device_context<DeviceContext>().eigen_device();
       auto reduce_dim = Eigen::array<int, 1>({{0}});
       Functor functor;
-      functor(place, x, out, reduce_dim);
-    } else {
-      int rank = context.Input<Tensor>("X")->dims().size();
-      switch (rank) {
-        case 1:
-          ReduceCompute<1>(context);
-          break;
-        case 2:
-          ReduceCompute<2>(context);
-          break;
-        case 3:
-          ReduceCompute<3>(context);
-          break;
-        case 4:
-          ReduceCompute<4>(context);
-          break;
-        case 5:
-          ReduceCompute<5>(context);
-          break;
-        case 6:
-          ReduceCompute<6>(context);
-          break;
-      }
-    }
-  }
-
- private:
-  template <size_t D>
-  void ReduceCompute(const framework::ExecutionContext& context) const {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-
-    auto x = EigenTensor<T, D>::From(*input);
-    auto x_rank = static_cast<int>(x.dimensions().size());
-    int dim = static_cast<int>(context.Attr<int>("dim"));
-    if (dim < 0) dim = x_rank + dim;
-    auto reduce_dim = Eigen::array<int, 1>({{dim}});
-    // construct the squeezed output tensor
-    bool keep_dim = context.Attr<bool>("keep_dim");
-    DDim dims = output->dims();
-    auto dims_vector = vectorize(dims);
-    if (keep_dim && x_rank > 1) {
-      dims_vector.erase(dims_vector.begin() + dim);
-      dims = framework::make_ddim(dims_vector);
-    }
-
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-
-    if (D == 1) {
-      auto out = EigenScalar<T>::From(*output);
-      functor(place, x, out, reduce_dim);
+      functor(place, &x, &out, reduce_dim);
     } else {
-      auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
-      functor(place, x, out, reduce_dim);
+      int ndim = input->dims().size();
+      int rdim = dims.size();
+      // comments for accelerating compiling temporarily.
+      //      HANDLE_DIM(6, 5);
+      //      HANDLE_DIM(6, 4);
+      //      HANDLE_DIM(6, 3);
+      //      HANDLE_DIM(6, 2);
+      //      HANDLE_DIM(6, 1);
+      //      HANDLE_DIM(5, 4);
+      //      HANDLE_DIM(5, 3);
+      //      HANDLE_DIM(5, 2);
+      //      HANDLE_DIM(5, 1);
+      HANDLE_DIM(4, 3);
+      HANDLE_DIM(4, 2);
+      HANDLE_DIM(4, 1);
+      HANDLE_DIM(3, 2);
+      HANDLE_DIM(3, 1);
+      HANDLE_DIM(2, 1);
+      HANDLE_DIM(1, 1);
     }
   }
 };
@@ -175,12 +80,15 @@ class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     bool reduce_all = context.Attr<bool>("reduce_all");
+    auto dims = context.Attr<std::vector<int>>("dim");
+
+    auto* input0 = context.Input<Tensor>("X");
+    auto* input1 = context.Input<Tensor>("Out");
+    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+    output->mutable_data<T>(context.GetPlace());
+
     if (reduce_all) {
-      auto* input0 = context.Input<Tensor>("X");
-      auto* input1 = context.Input<Tensor>("Out");
-      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
-      output->mutable_data<T>(context.GetPlace());
       auto x = EigenVector<T>::Flatten(*input0);
       auto x_reduce = EigenVector<T>::From(*input1);
       auto x_reduce_grad = EigenVector<T>::From(*input2);
@@ -190,68 +98,175 @@ class ReduceGradKernel : public framework::OpKernel<T> {
       auto broadcast_dim =
           Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
       Functor functor;
-      functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+      functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
               broadcast_dim[0]);
     } else {
-      int rank = context.Input<Tensor>("X")->dims().size();
+      int rank = input0->dims().size();
       switch (rank) {
         case 1:
-          ReduceGradCompute<1>(context);
+          ReduceGradFunctor<DeviceContext, T, 1, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 2:
-          ReduceGradCompute<2>(context);
+          ReduceGradFunctor<DeviceContext, T, 2, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 3:
-          ReduceGradCompute<3>(context);
+          ReduceGradFunctor<DeviceContext, T, 3, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 4:
-          ReduceGradCompute<4>(context);
+          ReduceGradFunctor<DeviceContext, T, 4, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 5:
-          ReduceGradCompute<5>(context);
+          ReduceGradFunctor<DeviceContext, T, 5, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
         case 6:
-          ReduceGradCompute<6>(context);
+          ReduceGradFunctor<DeviceContext, T, 6, Functor>(
+              context.template device_context<DeviceContext>(), *input0,
+              *input1, *input2, output, dims);
           break;
       }
     }
   }
+};
 
- private:
-  template <size_t D>
-  void ReduceGradCompute(const framework::ExecutionContext& context) const {
-    auto* input0 = context.Input<Tensor>("X");
-    auto* input1 = context.Input<Tensor>("Out");
-    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+class ReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    output->mutable_data<T>(context.GetPlace());
-    auto x = EigenTensor<T, D>::From(*input0);
-    auto x_grad = EigenTensor<T, D>::From(*output);
-    auto x_rank = static_cast<int>(x.dimensions().size());
-    int dim = static_cast<int>(context.Attr<int>("dim"));
-    if (dim < 0) dim = x_rank + dim;
-    DDim dims = input0->dims();
-    dims[dim] = 1;
-    auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
-    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReduceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReduceOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+    }
+    sort(dims.begin(), dims.end());
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+    if (reduce_all) {
+      if (keep_dim)
+        ctx->SetOutputDim(
+            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
+    } else {
+      auto dims_vector = vectorize(x_dims);
+      if (keep_dim) {
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = 1;
+        }
+      } else {
+        const int kDelFlag = -2;
+        for (size_t i = 0; i < dims.size(); ++i) {
+          dims_vector[dims[i]] = kDelFlag;
+        }
+        dims_vector.erase(
+            remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+            dims_vector.end());
+      }
+      auto out_dims = framework::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (dims[0] != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
+    }
+  }
+};
+
+class ReduceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    Eigen::array<int, D> broadcast_dim;
-    for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
-    broadcast_dim[dim] = input0->dims()[dim];
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    Functor functor;
-    functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
-            broadcast_dim[dim]);
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    auto dims = ctx->Attrs().Get<std::vector<int>>("dim");
+    for (size_t i = 0; i < dims.size(); ++i) {
+      if (dims[i] < 0) dims[i] = x_rank + dims[i];
+      PADDLE_ENFORCE_LT(
+          dims[i], x_rank,
+          "The dim should be in the range [-rank(input), rank(input)).");
+    }
+    sort(dims.begin(), dims.end());
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
+    }
   }
 };
 
+class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() final {
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 6 are "
+             "supported.");
+    AddOutput("Out", "(Tensor) The result tensor.");
+    AddAttr<std::vector<int>>(
+        "dim",
+        "(list<int>, default {0}) The dimensions to reduce. "
+        "Must be in the range [-rank(input), rank(input)). "
+        "If `dim[i] < 0`, the dims[i] to reduce is `rank + dims[i]`. "
+        "Note that reducing on the first dim will make the LoD info lost.")
+        .SetDefault({0});
+    AddAttr<bool>("keep_dim",
+                  "(bool, default false) "
+                  "If true, retain the reduced dimension with length 1.")
+        .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
+    AddComment(string::Sprintf(R"DOC(
+%s Operator.
+
+This operator computes the %s of input tensor along the given dimension.
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
+
+)DOC",
+                               GetOpType(), GetName()));
+  }
+
+ protected:
+  virtual std::string GetName() const = 0;
+  virtual std::string GetOpType() const = 0;
+};
+
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_EACH_KERNEL_FUNCTOR(__macro)                \
-  __macro(reduce_sum, SumFunctor, SumGradFunctor);      \
-  __macro(reduce_mean, MeanFunctor, MeanGradFunctor);   \
-  __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \
-  __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor);
+namespace ops = paddle::operators;
+
+#define REGISTER_REDUCE_OP(op_name)                                      \
+  class __##op_name##Maker__ : public ops::ReduceOpMaker {               \
+   protected:                                                            \
+    virtual std::string GetName() const { return #op_name; }             \
+    virtual std::string GetOpType() const { return "Reduce " #op_name; } \
+  };                                                                     \
+  REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__,        \
+                    paddle::framework::DefaultGradOpDescMaker<true>);    \
+  REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
diff --git a/paddle/fluid/operators/reduce_op_function.h b/paddle/fluid/operators/reduce_op_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..3da27bc8ac8d448471b9ff3779ac6aca59fac523
--- /dev/null
+++ b/paddle/fluid/operators/reduce_op_function.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T, size_t D, size_t R_D,
+          typename Functor>
+void ReduceFunctor(const DeviceContext& context, const framework::Tensor& input,
+                   framework::Tensor* output, const std::vector<int>& dims,
+                   bool keep_dim) {
+  auto x = EigenTensor<T, D>::From(input);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto reduce_dim = Eigen::array<int, R_D>();
+  std::vector<int> dims_ref = dims;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) dims_ref[i] = x_rank + dims_ref[i];
+    reduce_dim[i] = dims_ref[i];
+  }
+  // construct the squeezed output tensor
+  DDim out_dims = output->dims();
+  if (keep_dim && x_rank > 1) {
+    const int kDelFlag = -2;
+    auto dims_vector = framework::vectorize(out_dims);
+    for (size_t i = 0; i < dims_ref.size(); ++i) {
+      dims_vector[dims_ref[i]] = kDelFlag;
+    }
+    dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
+                      dims_vector.end());
+    out_dims = framework::make_ddim(dims_vector);
+  }
+  auto& place = *context.eigen_device();
+  Functor functor;
+
+  if (D == 1) {
+    auto out = EigenScalar<T>::From(*output);
+    functor(place, &x, &out, reduce_dim);
+  } else {
+    auto out = EigenTensor<T, (D - R_D)>::From(*output, out_dims);
+    functor(place, &x, &out, reduce_dim);
+  }
+}
+
+template <typename DeviceContext, typename T, size_t D, typename Functor>
+void ReduceGradFunctor(const DeviceContext& context,
+                       const framework::Tensor& input0,
+                       const framework::Tensor& input1,
+                       const framework::Tensor& input2,
+                       framework::Tensor* output,
+                       const std::vector<int>& dims) {
+  auto x = EigenTensor<T, D>::From(input0);
+  auto x_grad = EigenTensor<T, D>::From(*output);
+  auto x_rank = static_cast<int>(x.dimensions().size());
+  auto x_dims = input0.dims();
+  auto reduced_dims_v = framework::vectorize(x_dims);
+  std::vector<int> dims_ref = dims;
+  Eigen::array<int, D> broadcast_dim;
+  for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
+
+  int broad_cats_times = 1;
+  for (size_t i = 0; i < dims_ref.size(); ++i) {
+    if (dims_ref[i] < 0) {
+      dims_ref[i] = x_rank + dims_ref[i];
+    }
+    reduced_dims_v[dims_ref[i]] = 1;
+    broadcast_dim[dims_ref[i]] = x_dims[dims_ref[i]];
+    broad_cats_times *= x_dims[dims_ref[i]];
+  }
+  auto reduced_dims = framework::make_ddim(reduced_dims_v);
+  auto x_reduce = EigenTensor<T, D>::From(input1, reduced_dims);
+  auto x_reduce_grad = EigenTensor<T, D>::From(input2, reduced_dims);
+
+  auto& place = *context.eigen_device();
+
+  Functor functor;
+  functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
+          broad_cats_times);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_prod_op.cc b/paddle/fluid/operators/reduce_prod_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..713728b99757a6f3bb128f665d5576ac64eef8ec
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_REDUCE_OP(reduce_prod);
+REGISTER_OP_CPU_KERNEL(reduce_prod,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         float, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         double, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int, ops::ProdFunctor>,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         int64_t, ops::ProdFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_prod_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::ProdGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.cu b/paddle/fluid/operators/reduce_prod_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d62e677d92cffecf629d1684026b0c7bcfec29e3
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_prod_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_prod,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::ProdFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::ProdFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_prod_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                            float, ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::ProdGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::ProdGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_prod_op.h b/paddle/fluid/operators/reduce_prod_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..97748113e092719aceed9d806ca6242077111532
--- /dev/null
+++ b/paddle/fluid/operators/reduce_prod_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct ProdFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->prod(dim);
+  }
+};
+
+struct ProdGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim) * y->broadcast(dim) * x->inverse();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c5b5398787b44e658b0f8390162df0e6c3006651
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_REDUCE_OP(reduce_sum);
+REGISTER_OP_CPU_KERNEL(
+    reduce_sum, ops::ReduceKernel<paddle::platform::CPUDeviceContext, float,
+                                  ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, double,
+                      ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
+    ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
+                      ops::SumFunctor>);
+REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             float, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             double, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int, ops::SumGradFunctor>,
+                       ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
+                                             int64_t, ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.cu
@@ -0,0 +1,34 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_sum_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_sum,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          float, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          double, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int, ops::SumFunctor>,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          int64_t, ops::SumFunctor>);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_sum_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
+                                           float, ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
+                          ops::SumGradFunctor>,
+    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
+                          ops::SumGradFunctor>);
diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e67d7e1da5f0244d2dee346873692a80cbad2fc4
--- /dev/null
+++ b/paddle/fluid/operators/reduce_sum_op.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct SumFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->sum(dim);
+  }
+};
+
+struct SumGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
+                  const Dim& dim, int size) {
+    dx->device(place) = dy->broadcast(dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
index 5c3e1f5678df0270c837ed407d1e6cc662276880..e4f4fe358e0e8cd2080525227f14a3d40f3c1411 100644
--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
@@ -23,9 +23,7 @@ namespace operators {
 class ReorderLoDTensorByRankTableOpProtoMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto,
-                                          OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor), the input lod tensor to be reordered according to "
              "Input(RankTable).");
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 832509641cc3d5178ff090e05437484d395bfe51..918f3be533d51367eade5f5108ad2eab954a9303 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/reshape_op.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -25,82 +27,152 @@ class ReshapeOp : public framework::OperatorWithKernel {
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    // input check
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of ReshapeOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ReshapeOp should not be null.");
 
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
+    const std::vector<int> &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE(!shape.empty(),
+                   "The shape information must be set by Attr(shape).");
+
+    if (ctx->HasInput("Shape") && ctx->IsRuntime()) {
+      // If true, set the shape of Output(Out) according to Input(Shape) in
+      // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel.
+      ctx->ShareLoD("X", /*->*/ "Out");
+      return;
+    }
+
     auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = ValidateShape(shape, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+
+  static framework::DDim ValidateShape(const std::vector<int> shape,
+                                       const framework::DDim &in_dims) {
+    const int64_t in_size = framework::product(in_dims);
+    // only one dimension can be set to -1, whose size will be automatically
+    // infered.
+    const int64_t unk_dim_val = -1;
+    const int64_t copy_dim_val = 0;
 
-    std::vector<size_t> neg_dims_idx;
-    // set some dimension to -1 if it is unknown
-    const int unknown_size = -1;
+    std::vector<int64_t> output_shape(shape.size(), 0);
+    int64_t capacity = 1;
+    int unk_dim_idx = -1;
     for (size_t i = 0; i < shape.size(); ++i) {
-      PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size,
-                     "Each dimension of Attr(shape) must be positive or %d.",
-                     unknown_size);
-      if (shape[i] == unknown_size) {
-        neg_dims_idx.push_back(i);
-        PADDLE_ENFORCE(neg_dims_idx.size() <= 1,
-                       "Only one dimension of Attr(shape) can be unknown.");
+      if (shape[i] == unk_dim_val) {
+        PADDLE_ENFORCE(
+            unk_dim_idx == -1,
+            "Only one input dimension of Attr(shape) can be unknown.");
+        unk_dim_idx = i;
+      } else if (shape[i] == copy_dim_val) {
+        PADDLE_ENFORCE(
+            static_cast<int>(i) < in_dims.size(),
+            "The index of dimension to copy from input shape must be less "
+            "than the size of input shape.");
+      } else {
+        PADDLE_ENFORCE(
+            shape[i] > 0,
+            "Each input dimension of Attr(shape) must not be negtive except "
+            "one unknown dimension.");
       }
-    }
 
-    int64_t capacity =
-        std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
-    int64_t in_size = framework::product(x_dims);
-    if (neg_dims_idx.size() == 1) {
-      // dim infer
-      shape[neg_dims_idx[0]] = in_size / (-capacity);
-      // recalculate capacity
-      capacity = shape[neg_dims_idx[0]] * (-capacity);
+      capacity *= (shape[i] ? shape[i] : in_dims[i]);
+      output_shape[i] =
+          (shape[i] ? static_cast<int64_t>(shape[i]) : in_dims[i]);
     }
-    // capacity check
-    PADDLE_ENFORCE(capacity == in_size,
-                   "The size of Input(X) mismatches with Attr(shape).");
-    // resize output
-    std::vector<int64_t> shape_int64(shape.size(), 0);
-    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
-                   [](int a) { return static_cast<int64_t>(a); });
-    auto out_dims = framework::make_ddim(shape_int64);
-    ctx->SetOutputDim("Out", out_dims);
-    if (shape[0] == x_dims[0]) {
-      // Only pass LoD when the first dimension is equal between
-      // output and input.
-      ctx->ShareLoD("X", /*->*/ "Out");
+
+    if (unk_dim_idx != -1) {
+      if (in_size > 0) {
+        // in_size < 0 and is un-determinate in compile time, skip the check,
+        // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
+        // capacity = -24, in_size = -8, output_shape[0] = 0
+        // the following check will fail.
+        output_shape[unk_dim_idx] = -in_size / capacity;
+        PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size,
+                          "Invalid shape is given.");
+      } else {
+        output_shape[unk_dim_idx] = -1;
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given.");
     }
+    return framework::make_ddim(output_shape);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
   }
 };
 
 class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of reshape operator.");
-    AddOutput("Out", "The output tensor of reshape operator.");
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) "
-                              "Target shape of reshape operator.");
+  void Make() override {
+    AddInput("X", "(Tensor). The input tensor of reshape operator.");
+    AddInput("Shape",
+             "(Tensor<int32>, optional). If provided, reshape according to "
+             "this given shape. That is to say it has a higher priority than "
+             "the shape attribute, while the shape attribute still should be "
+             "set correctly to gurantee shape inference in compile time.")
+        .AsDispensable();
+    AddOutput("Out", "(Tensor). The output tensor of reshape operator.");
+    AddAttr<std::vector<int>>(
+        "shape", "(std::vector<int>) Target shape of reshape operator.");
     AddAttr<bool>("inplace",
-                  "Change the source tensor's shape without copy memory.")
-        .SetDefault(true);
+                  "(default: false) Change the source tensor's shape without "
+                  "memory copy. When Attr(inplace) is set true, the output "
+                  "tensor shares memory with Input(X), otherwise, a new output "
+                  "tensor is created, and its data are copied from Input(x).")
+        .SetDefault(false);
     AddComment(R"DOC(
 Reshape Operator.
 
-Reshape Input(X) into the shape specified by Attr(shape).
+Reshape Input(X) into the shape specified by Attr(shape) or Input(Shape). The
+data in Input(X) are unchanged.
+
+Examples:
+
+1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X)
+into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged.
 
-An example:
-Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]]
+2. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform
+Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data
+unchanged. In this case, one and only dimension of Attr(shape) can be set to -1,
+the value of this dimension is inferred from the total element number of
+Input(X) and remaining dimensions.
 
-and target shape = [1, 4], the reshape operator will transform
-the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
+3. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape
+specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform
+Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data
+unchanged. In this case, besides -1, 0 means the actual dimension value is going
+to be copied from the corresponding dimension of Input(X).
+
+Note:
+
+1. One and only one dimension in Attr(shape) can be set -1. In this case,
+the actual dimension value will be infered from the total element number of
+Input(X) and remaining dimensions.
+
+2. More than one dimensions in Attr(shape) can be set to 0, which means the real
+dimension value will be copied from Input(X) at runtime. Note that the index of
+0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape
+[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input.
+
+3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while
+Attr(shape) still should be set correctly to gurantee shape inference in 
+compile-time.
 
-One dimension in the target shape can be set -1, representing that its
-size is unknown. In this case, the real dimension will be infered from 
-the original shape of Input(X) and other dimensions in the target shape.
 )DOC");
   }
 };
@@ -119,20 +191,103 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
                    "Input(Out@GRAD) shouldn't be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ReshapeKernel {
+ public:
+  void operator()(const framework::ExecutionContext &ctx) const {
+    auto *out = ctx.Output<framework::LoDTensor>("Out");
+    auto *in = ctx.Input<framework::LoDTensor>("X");
+
+    auto *shape_tensor = ctx.HasInput("Shape")
+                             ? ctx.Input<framework::LoDTensor>("Shape")
+                             : nullptr;
+
+    framework::DDim out_dims = out->dims();
+
+    if (shape_tensor) {
+      auto *shape_data = shape_tensor->data<int>();
+      framework::Tensor cpu_shape_tensor;
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
+        shape_data = cpu_shape_tensor.data<int>();
+      }
+      auto shape =
+          std::vector<int>(shape_data, shape_data + shape_tensor->numel());
+      out_dims = ReshapeOp::ValidateShape(shape, in->dims());
+    }
+    if (!in->lod().empty()) {
+      PADDLE_ENFORCE_EQ(
+          out_dims[0], in->dims()[0],
+          "Reshape operator cannot reshape an input sequence batch "
+          "into an output sequence batch that has a different "
+          "number of time steps. Please consider using "
+          "sequence_reshape op.");
+    }
+
+    bool inplace = ctx.Attr<bool>("inplace");
+    out->Resize(out_dims);
+    if (!inplace) {
+      out->mutable_data(ctx.GetPlace(), in->type());
+      framework::TensorCopySync(*in, ctx.GetPlace(), out);
+      out->Resize(out_dims);
+    } else {
+      out->ShareDataWith(*in);
+      out->Resize(out_dims);
+    }
+  }
+};
+
+class ReshapeGradKernel {
+ public:
+  void operator()(const framework::ExecutionContext &ctx) const {
+    auto *d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    d_x->mutable_data(ctx.GetPlace(), d_out->type());
+    bool inplace = ctx.Attr<bool>("inplace");
+
+    auto in_dims = d_x->dims();
+    if (!inplace) {
+      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+      ctx.device_context().Wait();
+      d_x->Resize(in_dims);
+    } else {
+      d_x->ShareDataWith(*d_out);
+      d_x->Resize(in_dims);
+    }
+  }
 };
 
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
-using CPU = paddle::platform::CPUDeviceContext;
-
-REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad,
-            ops::ReshapeGradOp);
-REGISTER_OP_CPU_KERNEL(reshape, ops::ReshapeKernel<CPU, float>,
-                       ops::ReshapeKernel<CPU, double>,
-                       ops::ReshapeKernel<CPU, int>,
-                       ops::ReshapeKernel<CPU, int64_t>);
-REGISTER_OP_CPU_KERNEL(reshape_grad, ops::ReshapeGradKernel<CPU, float>,
-                       ops::ReshapeGradKernel<CPU, double>,
-                       ops::ReshapeGradKernel<CPU, int>,
-                       ops::ReshapeGradKernel<CPU, int64_t>);
+
+REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
+                               ops::ReshapeKernel, int, ops::ReshapeKernel,
+                               int64_t, ops::ReshapeKernel);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
+                               double, ops::ReshapeGradKernel, int,
+                               ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel);
+
+#ifdef PADDLE_WITH_CUDA
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
+                                ops::ReshapeKernel, int, ops::ReshapeKernel,
+                                int64_t, ops::ReshapeKernel);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
+                                double, ops::ReshapeGradKernel, int,
+                                ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel);
+#endif
diff --git a/paddle/fluid/operators/reshape_op.cu b/paddle/fluid/operators/reshape_op.cu
deleted file mode 100644
index c628c634e2bc9ae260948a6e7ccf786cbd6c5c3c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reshape_op.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/reshape_op.h"
-using CUDA = paddle::platform::CUDADeviceContext;
-
-REGISTER_OP_CUDA_KERNEL(reshape, paddle::operators::ReshapeKernel<CUDA, float>,
-                        paddle::operators::ReshapeKernel<CUDA, double>,
-                        paddle::operators::ReshapeKernel<CUDA, int>,
-                        paddle::operators::ReshapeKernel<CUDA, int64_t>);
-REGISTER_OP_CUDA_KERNEL(reshape_grad,
-                        paddle::operators::ReshapeGradKernel<CUDA, float>,
-                        paddle::operators::ReshapeGradKernel<CUDA, double>,
-                        paddle::operators::ReshapeGradKernel<CUDA, int>,
-                        paddle::operators::ReshapeGradKernel<CUDA, int64_t>);
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
deleted file mode 100644
index eacb0a0cf21a60ffbdef5787434859ac549388bc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/reshape_op.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ReshapeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto* in = ctx.Input<framework::Tensor>("X");
-    bool inplace = ctx.Attr<bool>("inplace");
-    auto out_dims = out->dims();
-    if (!inplace) {
-      out->mutable_data<T>(ctx.GetPlace());
-      framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
-      out->Resize(out_dims);
-    } else {
-      out->ShareDataWith(*in);
-      out->Resize(out_dims);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class ReshapeGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
-    d_x->mutable_data<T>(ctx.GetPlace());
-    bool inplace = ctx.Attr<bool>("inplace");
-
-    auto in_dims = d_x->dims();
-    if (!inplace) {
-      framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
-      d_x->Resize(in_dims);
-    } else {
-      d_x->ShareDataWith(*d_out);
-      d_x->Resize(in_dims);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a20f7d231fa9ea313581ac0629a87fa5f4a88ce5
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reverse_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class ReverseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    const auto& x_dims = ctx->GetInputDim("X");
+    const auto& axis = ctx->Attrs().Get<std::vector<int>>("axis");
+    PADDLE_ENFORCE(!axis.empty(), "'axis' can not be empty.");
+    for (int a : axis) {
+      PADDLE_ENFORCE_LT(a, x_dims.size(),
+                        "The axis must be less than input tensor's rank.");
+    }
+    ctx->SetOutputDim("Out", x_dims);
+  }
+};
+
+class ReverseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The LoDTensor to be flipped.");
+    AddOutput("Out", "The LoDTensor after flipping.");
+    AddAttr<std::vector<int>>(
+        "axis", "The axises that along which order of elements is reversed.");
+    AddComment(R"DOC(
+      Reverse Operator.
+
+      Reverse the order of elements in the input LoDTensor along given axises.
+
+      Case 1:
+        Given
+            X = [[1, 2, 3, 4, 5]
+                 [6, 7, 8, 9, 10]
+                 [11, 12, 13, 14, 15]],
+        and
+            axis = [0],
+        we get:
+            Out = [[11, 12, 13, 14, 15]
+                   [6, 7, 8, 9, 10]
+                   [1, 2, 3, 4, 5]].
+        
+      Case 2:
+        Given
+            X = [[[1, 2, 3, 4]
+                  [5, 6, 7, 8]]
+                 [[9, 10, 11, 12]
+                  [13, 14, 15, 16]]],
+        and
+            axis = [0, 2],
+        we get:
+            Out = [[[12, 11, 10, 9]
+                    [16, 15, 14, 13]]
+                   [[4, 3, 2, 1]
+                    [8, 7, 6, 5]]],
+    )DOC");
+  }
+};
+
+class ReverseGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("reverse");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("axis", GetAttr("axis"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(reverse, ops::ReverseOp, ops::ReverseOpMaker,
+                  ops::ReverseGradMaker);
+REGISTER_OPERATOR(reverse_grad, ops::ReverseOp);
+REGISTER_OP_CPU_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CPUDeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/operators/reverse_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..635c41529b38f2dd287b00ed2e5659e11f619e78
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.cu
@@ -0,0 +1,24 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reverse_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    reverse, ops::ReverseKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, uint8_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, bool>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ReverseKernel<paddle::platform::CUDADeviceContext, double>)
diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9063cd59bba5c6307b55a500455908a5fd278390
--- /dev/null
+++ b/paddle/fluid/operators/reverse_op.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T, int Rank>
+struct ReverseFunctor {
+  void operator()(const DeviceContext& context, const framework::LoDTensor& in,
+                  framework::LoDTensor* out, const std::vector<int>& axis) {
+    Eigen::array<bool, Rank> reverse_axis;
+    for (int i = 0; i < Rank; ++i) {
+      reverse_axis[i] = false;
+    }
+    for (int a : axis) {
+      reverse_axis[a] = true;
+    }
+
+    auto in_eigen = framework::EigenTensor<T, Rank>::From(in);
+    auto out_eigen = framework::EigenTensor<T, Rank>::From(*out);
+    auto* dev = context.eigen_device();
+
+    out_eigen.device(*dev) = in_eigen.reverse(reverse_axis);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ReverseKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::LoDTensor>("X");
+    auto* out = context.Output<framework::LoDTensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+    const auto& axis = context.Attr<std::vector<int>>("axis");
+    int rank = x->dims().size();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    switch (rank) {
+      case 1:
+        ReverseFunctor<DeviceContext, T, 1> functor1;
+        functor1(dev_ctx, *x, out, axis);
+        break;
+      case 2:
+        ReverseFunctor<DeviceContext, T, 2> functor2;
+        functor2(dev_ctx, *x, out, axis);
+        break;
+      case 3:
+        ReverseFunctor<DeviceContext, T, 3> functor3;
+        functor3(dev_ctx, *x, out, axis);
+        break;
+      case 4:
+        ReverseFunctor<DeviceContext, T, 4> functor4;
+        functor4(dev_ctx, *x, out, axis);
+        break;
+      case 5:
+        ReverseFunctor<DeviceContext, T, 5> functor5;
+        functor5(dev_ctx, *x, out, axis);
+        break;
+      case 6:
+        ReverseFunctor<DeviceContext, T, 6> functor6;
+        functor6(dev_ctx, *x, out, axis);
+        break;
+      default:
+        PADDLE_THROW(
+            "Reserve operator doesn't supports tensors whose ranks are greater "
+            "than 6.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc
index a8855b3ccd1686c75953e762ce42cc27b26202e6..919ebe48ca38040274bd2052b95ef96eccff4db6 100644
--- a/paddle/fluid/operators/rmsprop_op.cc
+++ b/paddle/fluid/operators/rmsprop_op.cc
@@ -63,8 +63,7 @@ class RmspropOp : public framework::OperatorWithKernel {
 
 class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Param",
              "(Tensor, default Tensor<float>) "
              "Input parameter value that has to be updated.");
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index 70f205d887ef710aeed02905713200ce32988987..23e5fc1112d0b1e634d0ab288721cbba57b3ffe5 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -59,8 +59,7 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
 
 class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "");
     AddOutput("Out", "");
     AddAttr<int>("dtype",
@@ -117,8 +116,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
 class RNNMemoryHelperGradOpInfoMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(framework::GradVarName("Out"), "");
     AddInput("X", "");
     AddInput("Out", "");
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 6d4861f0428834b1893c3a10a83920f0a62b5455..d6d209d5de041500a9b4893d70800a58e8ee1e1d 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -18,8 +18,7 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-
-static constexpr int kROISize = 5;
+using LoDTensor = framework::LoDTensor;
 
 class ROIPoolOp : public framework::OperatorWithKernel {
  public:
@@ -40,11 +39,11 @@ class ROIPoolOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(input_dims.size() == 4,
                    "The format of input tensor is NCHW.");
     PADDLE_ENFORCE(rois_dims.size() == 2,
-                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
-                   "given as [[batch_id, x1, y1, x2, y2], …].");
+                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
+                   "given as [[x1, y1, x2, y2], …].");
     PADDLE_ENFORCE(rois_dims[1] == kROISize,
-                   "ROIs should be a 2-D tensor of shape (num_rois, 5)"
-                   "given as [[batch_id, x1, y1, x2, y2], …].");
+                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
+                   "given as [[x1, y1, x2, y2], …].");
 
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
@@ -99,8 +98,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
 
 class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor), "
              "the input of ROIPoolOp. "
@@ -109,10 +107,10 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "H is the height of the feature, and "
              "W is the width of the feature.");
     AddInput("ROIs",
-             "(Tensor), "
+             "(LoDTensor), "
              "ROIs (Regions of Interest) to pool over. "
-             "should be a 2-D tensor of shape (num_rois, 5)"
-             "given as [[batch_id, x1, y1, x2, y2], …]. "
+             "should be a 2-D LoDTensor of shape (num_rois, 4)"
+             "given as [[x1, y1, x2, y2], …]. "
              "Where batch_id is the id of the data, "
              "(x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates.");
@@ -141,7 +139,20 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
                  "The pooled output width.")
         .SetDefault(1);
     AddComment(R"DOC(
-ROIPool operator
+**ROIPool Operator**
+
+Region of interest pooling (also known as RoI pooling) is to perform
+is to perform max pooling on inputs of nonuniform sizes to obtain
+fixed-size feature maps (e.g. 7*7).
+
+The operator has three steps:
+
+1. Dividing each region proposal into equal-sized sections with
+   the pooled_width and pooled_height
+
+2. Finding the largest value in each section
+
+3. Copying these max values to the output buffer
 
 ROI Pooling for Faster-RCNN. The link below is a further introduction: 
 https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
@@ -153,8 +164,9 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
-            ops::ROIPoolGradOp);
+REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp);
 REGISTER_OP_CPU_KERNEL(
     roi_pool,
     ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
index 1931629d1340758edb6664a5e3ffdba126b33717..50450b62f7b1c0b2b5abf01a43581a0e2d2cd01e 100644
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/roi_pool_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
 
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaxinumNumBlocks = 4096;
-static constexpr int kROISize = 5;
 
 static inline int NumBlocks(const int N) {
   return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
@@ -30,38 +30,41 @@ static inline int NumBlocks(const int N) {
 }
 
 template <typename T>
-__global__ void GPUROIPoolForward(const int nthreads, const T* input_data,
-                                  const int64_t* input_rois,
-                                  const float spatial_scale, const int channels,
-                                  const int height, const int width,
-                                  const int pooled_height,
-                                  const int pooled_width, T* output_data,
-                                  int64_t* argmax_data) {
+__global__ void GPUROIPoolForward(
+    const int nthreads, const T* input_data, const int64_t* input_rois,
+    const float spatial_scale, const int channels, const int height,
+    const int width, const int pooled_height, const int pooled_width,
+    int* roi_batch_id_data, T* output_data, int64_t* argmax_data) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
   for (size_t i = index; i < nthreads; i += offset) {
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
 
     const int64_t* offset_input_rois = input_rois + n * kROISize;
-    int roi_batch_ind = offset_input_rois[0];
-    int roi_start_w = round(offset_input_rois[1] * spatial_scale);
-    int roi_start_h = round(offset_input_rois[2] * spatial_scale);
-    int roi_end_w = round(offset_input_rois[3] * spatial_scale);
-    int roi_end_h = round(offset_input_rois[4] * spatial_scale);
+    int roi_batch_ind = roi_batch_id_data[n];
+    int roi_start_w = round(offset_input_rois[0] * spatial_scale);
+    int roi_start_h = round(offset_input_rois[1] * spatial_scale);
+    int roi_end_w = round(offset_input_rois[2] * spatial_scale);
+    int roi_end_h = round(offset_input_rois[3] * spatial_scale);
 
     int roi_width = max(roi_end_w - roi_start_w + 1, 1);
     int roi_height = max(roi_end_h - roi_start_h + 1, 1);
-    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
-
-    int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
-    int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
-    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
-    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
 
+    int hstart = static_cast<int>(floor(static_cast<double>(ph) *
+                                        static_cast<double>(roi_height) /
+                                        static_cast<double>(pooled_height)));
+    int wstart = static_cast<int>(floor(static_cast<double>(pw) *
+                                        static_cast<double>(roi_width) /
+                                        static_cast<double>(pooled_width)));
+    int hend = static_cast<int>(ceil(static_cast<double>(ph + 1) *
+                                     static_cast<double>(roi_height) /
+                                     static_cast<double>(pooled_height)));
+    int wend = static_cast<int>(ceil(static_cast<double>(pw + 1) *
+                                     static_cast<double>(roi_width) /
+                                     static_cast<double>(pooled_width)));
     hstart = min(max(hstart + roi_start_h, 0), height);
     hend = min(max(hend + roi_start_h, 0), height);
     wstart = min(max(wstart + roi_start_w, 0), width);
@@ -81,9 +84,9 @@ __global__ void GPUROIPoolForward(const int nthreads, const T* input_data,
         }
       }
     }
-    output_data[index] = maxval;
+    output_data[i] = maxval;
     if (argmax_data) {
-      argmax_data[index] = maxidx;
+      argmax_data[i] = maxidx;
     }
   }
 }
@@ -93,17 +96,17 @@ __global__ void GPUROIPoolBackward(
     const int nthreads, const int64_t* input_rois, const T* output_grad,
     const int64_t* argmax_data, const int num_rois, const float spatial_scale,
     const int channels, const int height, const int width,
-    const int pooled_height, const int pooled_width, T* input_grad) {
+    const int pooled_height, const int pooled_width, int* roi_batch_id_data,
+    T* input_grad) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
   for (int i = index; i < nthreads; i += offset) {
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
+    int pw = i % pooled_width;
+    int ph = (i / pooled_width) % pooled_height;
+    int c = (i / pooled_width / pooled_height) % channels;
+    int n = i / pooled_width / pooled_height / channels;
 
-    const int64_t* offset_input_rois = input_rois + n * kROISize;
-    int roi_batch_ind = offset_input_rois[0];
+    int roi_batch_ind = roi_batch_id_data[n];
     int input_offset = (roi_batch_ind * channels + c) * height * width;
     int output_offset = (n * channels + c) * pooled_height * pooled_width;
     const T* offset_output_grad = output_grad + output_offset;
@@ -124,7 +127,7 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* rois = ctx.Input<LoDTensor>("ROIs");
     auto* out = ctx.Output<Tensor>("Out");
     auto* argmax = ctx.Output<Tensor>("Argmax");
 
@@ -133,23 +136,47 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
     auto spatial_scale = ctx.Attr<float>("spatial_scale");
 
     auto in_dims = in->dims();
+    int batch_size = in_dims[0];
     auto in_stride = framework::stride(in_dims);
     int channels = in_dims[1];
     int height = in_dims[2];
     int width = in_dims[3];
 
-    size_t rois_num = rois->dims()[0];
+    int rois_num = rois->dims()[0];
+
     if (rois_num == 0) return;
 
     int output_size = out->numel();
     int blocks = NumBlocks(output_size);
     int threads = kNumCUDAThreads;
 
+    framework::Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({rois_num});
+    int* roi_batch_id_data =
+        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch_size,
+        "The rois_batch_size and imgs batch_size must be the same.");
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
+                      "The rois_num from input and lod must be the same.");
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
+    }
+
+    framework::Tensor roi_batch_id_list_gpu;
+    framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(),
+                          ctx.device_context(), &roi_batch_id_list_gpu);
+
     GPUROIPoolForward<
         T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
         output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
         channels, height, width, pooled_height, pooled_width,
-        out->mutable_data<T>(ctx.GetPlace()),
+        roi_batch_id_list_gpu.data<int>(), out->mutable_data<T>(ctx.GetPlace()),
         argmax->mutable_data<int64_t>(ctx.GetPlace()));
   }
 };
@@ -159,7 +186,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<Tensor>("X");
-    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* rois = ctx.Input<LoDTensor>("ROIs");
     auto* argmax = ctx.Input<Tensor>("Argmax");
 
     auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -169,12 +196,27 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
     auto pooled_width = ctx.Attr<int>("pooled_width");
     auto spatial_scale = ctx.Attr<float>("spatial_scale");
 
-    size_t rois_num = rois->dims()[0];
+    int rois_num = rois->dims()[0];
     int channels = in->dims()[1];
     int height = in->dims()[2];
     int width = in->dims()[3];
 
     if (x_grad) {
+      framework::Tensor roi_batch_id_list;
+      roi_batch_id_list.Resize({rois_num});
+      int* roi_batch_id_data =
+          roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
+      auto rois_lod = rois->lod().back();
+      int rois_batch_size = rois_lod.size() - 1;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+          roi_batch_id_data[i] = n;
+        }
+      }
+      framework::Tensor roi_batch_id_list_gpu;
+      framework::TensorCopy(roi_batch_id_list, ctx.GetPlace(),
+                            ctx.device_context(), &roi_batch_id_list_gpu);
+
       x_grad->mutable_data<T>(ctx.GetPlace());
       math::SetConstant<Place, T> set_zero;
       set_zero(ctx.cuda_device_context(), x_grad, static_cast<T>(0));
@@ -189,6 +231,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
             output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
             argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
             width, pooled_height, pooled_width,
+            roi_batch_id_list_gpu.data<int>(),
             x_grad->mutable_data<T>(ctx.GetPlace()));
       }
     }
diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
index f38c5a3c0c9952b37f7db468ea00470a00b5ff6f..c4f739b2c6b2d62ebebcc15fd627ebad040e7b3f 100644
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
@@ -13,18 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
+#include <limits>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 
+static constexpr int kROISize = 4;
+
 template <typename DeviceContext, typename T>
 class CPUROIPoolOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
     auto* out = ctx.Output<framework::Tensor>("Out");
     auto* argmax = ctx.Output<framework::Tensor>("Argmax");
 
@@ -45,24 +49,36 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
     auto out_stride = framework::stride(out->dims());
 
     const T* input_data = in->data<T>();
-    const int64_t* rois_data = rois->data<int64_t>();
-    T* output_data = out->mutable_data<T>(ctx.GetPlace());
-    int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
 
-    for (int n = 0; n < rois_num; ++n) {
-      int roi_batch_id = rois_data[0];
-      PADDLE_ENFORCE_GE(roi_batch_id, 0);
-      PADDLE_ENFORCE_LT(roi_batch_id, batch_size);
-      rois_data += roi_stride[0];
+    framework::Tensor roi_batch_id_list;
+    roi_batch_id_list.Resize({rois_num});
+    int* roi_batch_id_data =
+        roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
+
+    auto rois_lod = rois->lod().back();
+    int rois_batch_size = rois_lod.size() - 1;
+    PADDLE_ENFORCE_EQ(
+        rois_batch_size, batch_size,
+        "The rois_batch_size and imgs batch_size must be the same.");
+    int rois_num_with_lod = rois_lod[rois_batch_size];
+    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
+                      "The rois_num from input and lod must be the same.");
+    for (int n = 0; n < rois_batch_size; ++n) {
+      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        roi_batch_id_data[i] = n;
+      }
     }
 
-    rois_data = rois->data<int64_t>();
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
+
+    const int64_t* rois_data = rois->data<int64_t>();
     for (int n = 0; n < rois_num; ++n) {
-      int roi_batch_id = rois_data[0];
-      int roi_start_w = round(rois_data[1] * spatial_scale);
-      int roi_start_h = round(rois_data[2] * spatial_scale);
-      int roi_end_w = round(rois_data[3] * spatial_scale);
-      int roi_end_h = round(rois_data[4] * spatial_scale);
+      int roi_batch_id = roi_batch_id_data[n];
+      int roi_start_w = round(rois_data[0] * spatial_scale);
+      int roi_start_h = round(rois_data[1] * spatial_scale);
+      int roi_end_w = round(rois_data[2] * spatial_scale);
+      int roi_end_h = round(rois_data[3] * spatial_scale);
 
       // Force malformed ROIs to be 1x1
       int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
@@ -131,7 +147,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
-    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
     auto* argmax = ctx.Input<framework::Tensor>("Argmax");
     auto* out_grad =
         ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
@@ -141,6 +157,20 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
     auto pooled_width = ctx.Attr<int>("pooled_width");
 
     if (in_grad) {
+      int rois_num = rois->dims()[0];
+      framework::Tensor roi_batch_id_list;
+      roi_batch_id_list.Resize({rois_num});
+      int* roi_batch_id_data =
+          roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
+
+      auto rois_lod = rois->lod().back();
+      int rois_batch_size = rois_lod.size() - 1;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+          roi_batch_id_data[i] = n;
+        }
+      }
+
       const int64_t* rois_data = rois->data<int64_t>();
       const T* out_grad_data = out_grad->data<T>();
       const int64_t* argmax_data = argmax->data<int64_t>();
@@ -154,11 +184,10 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
       auto roi_stride = framework::stride(rois->dims());
       auto out_stride = framework::stride(out_grad->dims());
 
-      int rois_num = rois->dims()[0];
       int channels = in->dims()[1];
 
       for (int n = 0; n < rois_num; ++n) {
-        int roi_batch_idx = rois_data[0];
+        int roi_batch_idx = roi_batch_id_data[n];
         T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
         for (int c = 0; c < channels; ++c) {
           for (int ph = 0; ph < pooled_height; ++ph) {
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index d34beeb6508084f4d680fad9bac99ea474d274d3..10b1b0c899d833d70fa6afe51998fe210899e3c3 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -76,26 +76,25 @@ class RowConvGradOp : public framework::OperatorWithKernel {
 
 class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
-             "(LoDTensor), the input(X) is a LodTensor, which supports "
+             "the input(X) is a LodTensor, which supports "
              "variable time-length input sequences. The underlying tensor "
              "in this LoDTensor is a matrix with shape (T x N), where T "
              "is the total time steps in this mini-batch and N is the input "
              "data dimension.");
     AddInput("Filter",
-             "(Tensor), the input(Filter) is a learnable parameter. It "
+             "the input(Filter) is a learnable parameter. It "
              "is a 2-D tensor with shape (future_context x N), where, "
              "future_context is the future context length and N is the data "
              "dimension.");
     AddOutput("Out",
-              "(LoDTensor), the output(Out) is a LodTensor, which supports "
+              "the output(Out) is a LodTensor, which supports "
               "variable time-length input sequences. The underlying tensor "
               "in this LodTensor is a matrix with shape T x N, i.e., the "
               "same shape as X.");
     AddComment(R"DOC(
-Row-convolution Operator.
+:strong:`Row-convolution operator`
 
 The row convolution is called lookahead convolution.  This operator was 
 introduced in the following paper for DeepSpeech2:
@@ -115,9 +114,23 @@ and a filter ($W$) of size $context \times d$,
 the output sequence is convolved as:
 
 $$
-out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :}
+out_{i, :} = \\sum_{j=i}^{i + context} in_{j,:} \\cdot W_{i-j, :}
 $$
 
+In the above equation:
+
+* $Out_{i}$: The i-th row of output variable with shape [1, D].
+
+* $\\tau$: Future context size.
+
+* $X_{j}$: The j-th row of input variable with shape [1, D].
+
+* $W_{i-j}$: The (i-j)-th row of parameters with shape [1, D].
+
+More details about row_conv please refer to
+the design document
+https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 .
+
 )DOC");
   }
 };
@@ -250,8 +263,9 @@ class RowConvGradKernel<platform::CPUDeviceContext, T>
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad,
-            ops::RowConvGradOp);
+REGISTER_OPERATOR(row_conv, ops::RowConvOp, ops::RowConvOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp);
 REGISTER_OP_CPU_KERNEL(
     row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu
index 67083455a7579a4bbb6d9598a77b68a8375cf815..9ae80da6550bcef39c07f05e35d4153c24738f09 100644
--- a/paddle/fluid/operators/row_conv_op.cu
+++ b/paddle/fluid/operators/row_conv_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/row_conv_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 
 namespace paddle {
 namespace operators {
@@ -189,6 +189,10 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout,
   }
   __syncthreads();
 
+  // NOTE(zcd): temporary solution
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, true);
+
   for (int i = 0; i < num_sequence; i++) {
     int start = static_cast<int>(batch_indices[i]);
     int end = static_cast<int>(batch_indices[i + 1]);
@@ -220,7 +224,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout,
 
         for (int offset = 16; offset > 0;
              offset = offset / 2) {  // blockDim.x is 32.
-          val += __shfl_down(val, offset);
+          val += platform::CudaShuffleDownSync(mask, val, offset);
         }
         __syncthreads();
 
@@ -251,6 +255,10 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
   T *sh_in = mem;
   T *sh_dout = &mem[block_x * block_y];
 
+  // NOTE(zcd): temporary solution
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, true);
+
   for (int i = 0; i < num_sequence; i++) {
     int start = static_cast<int>(batch_indices[i]);
     int end = static_cast<int>(batch_indices[i + 1]);
@@ -276,7 +284,7 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
 
         for (int offset = 16; offset > 0;
              offset = offset / 2) {  // blockDim.x is 32.
-          val += __shfl_down(val, offset);
+          val += platform::CudaShuffleDownSync(mask, val, offset);
         }
         __syncthreads();
 
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index 94703393bfa53124d16e34ae4373773eece5f11f..cfee9207083b46f7c27354f22e82a7d3c38a027c 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <numeric>
 #include <sstream>
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -69,6 +70,7 @@ class SaveCombineOp : public framework::OperatorBase {
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
+    auto save_as_fp16 = Attr<bool>("save_as_fp16");
 
     bool is_present = FileExists(filename);
     if (is_present && !overwrite) {
@@ -100,8 +102,24 @@ class SaveCombineOp : public framework::OperatorBase {
                      inp_var_names[i]);
 
       auto &tensor = var->Get<framework::LoDTensor>();
-      // Serialize tensor
-      framework::SerializeToStream(fout, tensor, dev_ctx);
+      // Serialize tensors one by one
+
+      // Check types to see if a fp16 transformation is required
+      auto in_dtype = framework::ToDataType(tensor.type());
+      auto out_dtype =
+          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor out;
+        // copy LoD info to the new tensor
+        out.set_lod(tensor.lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+        framework::SerializeToStream(fout, out, dev_ctx);
+      } else {
+        framework::SerializeToStream(fout, tensor, dev_ctx);
+      }
     }
     fout.close();
   }
@@ -109,8 +127,7 @@ class SaveCombineOp : public framework::OperatorBase {
 
 class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(vector) Input LoDTensors that need to be saved together in a file.")
@@ -125,6 +142,12 @@ to a file on disk.
                   "(boolean, default true)"
                   "Overwrite the output file if it exists.")
         .SetDefault(true);
+    AddAttr<bool>("save_as_fp16",
+                  "(boolean, default false)"
+                  "If true, the tensor will be converted to float16 data "
+                  "type and then saved. Otherwise, the tensor will be "
+                  "directly saved without data type conversion.")
+        .SetDefault(false);
     AddAttr<std::string>(
         "file_path",
         "(string)"
diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
index 286f75df4ca2daff24b696c6bcb0c3df32875875..4743e0d9499b111d8baa921dbb245431713fd7a8 100644
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -17,49 +17,54 @@ limitations under the License. */
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
 
 USE_NO_KERNEL_OP(save_combine);
 USE_NO_KERNEL_OP(load_combine);
 
-int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
-                            std::string var_name,
-                            paddle::platform::CPUPlace& place,
-                            paddle::framework::Scope& scope,
-                            paddle::framework::LoD& expect_lod) {
-  auto var = scope.Var(var_name);
+template <typename T, typename U>
+T* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
+                          std::string var_name,
+                          const paddle::platform::CPUPlace& place,
+                          paddle::framework::Scope* scope,
+                          paddle::framework::LoD* expect_lod) {
+  auto var = scope->Var(var_name);
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
   tensor->Resize({x, y});
-  expect_lod.resize(1);
+  expect_lod->resize(1);
   for (size_t i = 0; i < lod_info.size(); i++) {
-    expect_lod[0].push_back(lod_info[i]);
+    (*expect_lod)[0].push_back(lod_info[i]);
   }
-  tensor->set_lod(expect_lod);
-  int* expect = tensor->mutable_data<int>(place);
+  tensor->set_lod(*expect_lod);
+  T* expect = tensor->mutable_data<T>(place);
   for (int64_t i = 0; i < tensor->numel(); ++i) {
-    expect[i] = static_cast<int>(i);
+    expect[i] = static_cast<T>(
+        static_cast<U>(i));  // For FP16, we intend to do float(float16(i))
   }
   return expect;
 }
 
 paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
-    const std::string out_var_name, paddle::framework::Scope& scope) {
-  auto load_var = scope.Var(out_var_name);
+    const std::string out_var_name, paddle::framework::Scope* scope) {
+  auto load_var = scope->Var(out_var_name);
   auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
   return target;
 }
 
-int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
-                                 paddle::framework::Scope& scope,
-                                 paddle::framework::LoD& actual_lod) {
-  int* actual = target->data<int>();
-  actual_lod = target->lod();
+template <typename T>
+T* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
+                               const paddle::framework::Scope& scope,
+                               paddle::framework::LoD* actual_lod) {
+  T* actual = target->data<T>();
+  *actual_lod = target->lod();
   return actual;
 }
 
-void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
-                 paddle::framework::LoD actual_lod, const int& numel) {
-  for (int64_t i = 0; i < numel; ++i) {
-    EXPECT_EQ(expect[i], actual[i]);
+template <typename T, typename U>
+void CheckValues(T* expect, U* actual, const paddle::framework::LoD& expect_lod,
+                 const paddle::framework::LoD& actual_lod, const int& numel) {
+  for (int i = 0; i < numel; ++i) {
+    EXPECT_EQ(expect[i], static_cast<T>(actual[i]));
   }
   EXPECT_EQ(expect_lod.size(), actual_lod.size());
   for (size_t i = 0; i < expect_lod.size(); ++i) {
@@ -78,26 +83,26 @@ TEST(SaveLoadCombineOp, CPU) {
   std::vector<int> lod1 = {0, 1, 2, 3, 10};
   int numel1 = 100;
   paddle::framework::LoD expect_lod1;
-  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
-                                        expect_lod1);
+  int* expect1 = CreateForSaveCombineOp<int, int>(10, 10, lod1, "test_var1",
+                                                  place, &scope, &expect_lod1);
 
   std::vector<int> lod2 = {0, 2, 5, 10};
   int numel2 = 200;
   paddle::framework::LoD expect_lod2;
-  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
-                                        expect_lod2);
+  int* expect2 = CreateForSaveCombineOp<int, int>(10, 20, lod2, "test_var2",
+                                                  place, &scope, &expect_lod2);
 
   std::vector<int> lod3 = {0, 2, 3, 20};
   int numel3 = 4000;
   paddle::framework::LoD expect_lod3;
-  int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
-                                        scope, expect_lod3);
+  int* expect3 = CreateForSaveCombineOp<int, int>(20, 200, lod3, "test_var3",
+                                                  place, &scope, &expect_lod3);
 
   std::vector<int> lod4 = {0, 1, 20};
   int numel4 = 1000;
   paddle::framework::LoD expect_lod4;
-  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
-                                        expect_lod4);
+  int* expect4 = CreateForSaveCombineOp<int, int>(20, 50, lod4, "test_var4",
+                                                  place, &scope, &expect_lod4);
 
   // Set attributes
   std::string filename = "check_tensor.ls";
@@ -111,27 +116,188 @@ TEST(SaveLoadCombineOp, CPU) {
   save_combine_op->Run(scope, place);
 
   // Set up output vars
-  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
-  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
-  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
-  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);
+  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", &scope);
+  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", &scope);
+  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", &scope);
+  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", &scope);
+
+  // Run the load_combine_op
+  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {},
+      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
+  load_combine_op->Run(scope, place);
+
+  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
+  int* actual1 = GetValuesAfterLoadCombineOp<int>(target1, scope, &actual_lod1);
+  int* actual2 = GetValuesAfterLoadCombineOp<int>(target2, scope, &actual_lod2);
+  int* actual3 = GetValuesAfterLoadCombineOp<int>(target3, scope, &actual_lod3);
+  int* actual4 = GetValuesAfterLoadCombineOp<int>(target4, scope, &actual_lod4);
+
+  CheckValues<int, int>(expect1, actual1, expect_lod1, actual_lod1, numel1);
+  CheckValues<int, int>(expect2, actual2, expect_lod2, actual_lod2, numel2);
+  CheckValues<int, int>(expect3, actual3, expect_lod3, actual_lod3, numel3);
+  CheckValues<int, int>(expect4, actual4, expect_lod4, actual_lod4, numel4);
+}
+
+// FP16 version of SaveLoadCombineOp Test, only altering the saving aspect
+// to save as FP16.
+TEST(SaveCombineFP16Op, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  std::vector<int> lod1 = {0, 1, 2, 3, 10};
+  int numel1 = 100;
+  paddle::framework::LoD expect_lod1;
+  float* expect1 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 10, lod1, "test_var1", place, &scope, &expect_lod1);
+
+  std::vector<int> lod2 = {0, 2, 5, 10};
+  int numel2 = 200;
+  paddle::framework::LoD expect_lod2;
+  float* expect2 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 20, lod2, "test_var2", place, &scope, &expect_lod2);
+
+  std::vector<int> lod3 = {0, 20};
+  int numel3 = 4000;
+  paddle::framework::LoD expect_lod3;
+  float* expect3 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 200, lod3, "test_var3", place, &scope, &expect_lod3);
+
+  std::vector<int> lod4 = {0, 1, 20};
+  int numel4 = 1000;
+  paddle::framework::LoD expect_lod4;
+  float* expect4 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 50, lod4, "test_var4", place, &scope, &expect_lod4);
+
+  // Set attributes
+  std::string filename = "check_tensor_fp16_save.ls";
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string(filename)});
+  attrs.insert({"save_as_fp16", true});
+
+  // Run the save_combine_op
+  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine",
+      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
+  save_combine_op->Run(scope, place);
+
+  // Set up output vars
+  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", &scope);
+  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", &scope);
+  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", &scope);
+  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", &scope);
+
+  // Run the load_combine_op
+  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {},
+      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
+  load_combine_op->Run(scope, place);
+
+  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
+  paddle::platform::float16* actual1 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target1, scope,
+                                                             &actual_lod1);
+  paddle::platform::float16* actual2 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target2, scope,
+                                                             &actual_lod2);
+  paddle::platform::float16* actual3 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target3, scope,
+                                                             &actual_lod3);
+  paddle::platform::float16* actual4 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target4, scope,
+                                                             &actual_lod4);
+
+  CheckValues<float, paddle::platform::float16>(expect1, actual1, expect_lod1,
+                                                actual_lod1, numel1);
+  CheckValues<float, paddle::platform::float16>(expect2, actual2, expect_lod2,
+                                                actual_lod2, numel2);
+  CheckValues<float, paddle::platform::float16>(expect3, actual3, expect_lod3,
+                                                actual_lod3, numel3);
+  CheckValues<float, paddle::platform::float16>(expect4, actual4, expect_lod4,
+                                                actual_lod4, numel4);
+}
+
+// FP16 version of SaveLoadCombineOp Test, only altering the loading aspect
+// to load tensors with FP16 precision.
+TEST(LoadCombineFP16Op, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  std::vector<int> lod1 = {0, 1, 2, 3, 10};
+  int numel1 = 100;
+  paddle::framework::LoD expect_lod1;
+  float* expect1 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 10, lod1, "test_var1", place, &scope, &expect_lod1);
+
+  std::vector<int> lod2 = {0, 2, 5, 10};
+  int numel2 = 200;
+  paddle::framework::LoD expect_lod2;
+  float* expect2 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 20, lod2, "test_var2", place, &scope, &expect_lod2);
+
+  std::vector<int> lod3 = {0, 20};
+  int numel3 = 4000;
+  paddle::framework::LoD expect_lod3;
+  float* expect3 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 200, lod3, "test_var3", place, &scope, &expect_lod3);
+
+  std::vector<int> lod4 = {0, 1, 20};
+  int numel4 = 1000;
+  paddle::framework::LoD expect_lod4;
+  float* expect4 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 50, lod4, "test_var4", place, &scope, &expect_lod4);
+
+  // Set attributes
+  std::string filename = "check_tensor_fp16_load.ls";
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string(filename)});
+
+  // Run the save_combine_op
+  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine",
+      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
+  save_combine_op->Run(scope, place);
+
+  // Set up output vars
+  auto load_var1 = scope.Var("out_var1");
+  auto load_var2 = scope.Var("out_var2");
+  auto load_var3 = scope.Var("out_var3");
+  auto load_var4 = scope.Var("out_var4");
 
+  attrs.insert({"load_as_fp16", true});
   // Run the load_combine_op
   auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
       "load_combine", {},
       {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
   load_combine_op->Run(scope, place);
 
+  auto* target1 = load_var1->GetMutable<paddle::framework::LoDTensor>();
+  auto* target2 = load_var2->GetMutable<paddle::framework::LoDTensor>();
+  auto* target3 = load_var3->GetMutable<paddle::framework::LoDTensor>();
+  auto* target4 = load_var4->GetMutable<paddle::framework::LoDTensor>();
+
   paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
-  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
-  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
-  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
-  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);
-
-  CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
-  CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
-  CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
-  CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
+  paddle::platform::float16* actual1 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target1, scope,
+                                                             &actual_lod1);
+  paddle::platform::float16* actual2 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target2, scope,
+                                                             &actual_lod2);
+  paddle::platform::float16* actual3 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target3, scope,
+                                                             &actual_lod3);
+  paddle::platform::float16* actual4 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target4, scope,
+                                                             &actual_lod4);
+
+  CheckValues<float, paddle::platform::float16>(expect1, actual1, expect_lod1,
+                                                actual_lod1, numel1);
+  CheckValues<float, paddle::platform::float16>(expect2, actual2, expect_lod2,
+                                                actual_lod2, numel2);
+  CheckValues<float, paddle::platform::float16>(expect3, actual3, expect_lod3,
+                                                actual_lod3, numel3);
+  CheckValues<float, paddle::platform::float16>(expect4, actual4, expect_lod4,
+                                                actual_lod4, numel4);
 }
 
 // Test with original SaveLoadTest
@@ -141,7 +307,7 @@ TEST(SaveLoadTestWithCombineOp, CPU) {
 
   auto var = scope.Var("test_var");
   auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
-  tensor->Resize({3, 10});
+  tensor->Resize({3, 4000});
   paddle::framework::LoD expect_lod;
   expect_lod.resize(1);
   expect_lod[0].push_back(0);
diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc
index a7ba1e0ae1d22a22cf2943c9aaf0c394ef4ae326..ccaea0eef2906953d922e097348b6c0a86dad6f1 100644
--- a/paddle/fluid/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/float16.h"
 
 USE_NO_KERNEL_OP(save);
 USE_NO_KERNEL_OP(load);
@@ -61,3 +62,99 @@ TEST(SaveLoadOp, CPU) {
     }
   }
 }
+
+TEST(SaveFP16Op, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  float* expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(paddle::platform::float16(i));
+  }
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("tensor.save")});
+  attrs.insert({"save_as_fp16", true});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+
+  auto load_var = scope.Var("out_var");
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, place);
+  paddle::platform::float16* actual = target->data<paddle::platform::float16>();
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
+  }
+  auto& actual_lod = target->lod();
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
+
+TEST(LoadFP16Op, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  float* expect = tensor->mutable_data<float>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<float>(paddle::platform::float16(i));
+  }
+
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("tensor.save")});
+  attrs.insert({"load_as_fp16", true});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+
+  auto load_var = scope.Var("out_var");
+  load_var->GetMutable<paddle::framework::LoDTensor>();
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, place);
+
+  auto target = load_var->Get<paddle::framework::LoDTensor>();
+  paddle::platform::float16* actual = target.data<paddle::platform::float16>();
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    EXPECT_EQ(expect[i], static_cast<float>(actual[i]));
+  }
+
+  auto& actual_lod = target.lod();
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index 4a715c4baab2da7b7af86ada22ee88a16b05a814..201a51130d6b6f94104e2dabf9e7facffa672ae0 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -18,14 +18,21 @@ limitations under the License. */
 #include <numeric>
 
 #include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
 
+// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
+// to directory specified.
+constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
+
 // TODO(yuyang18): If the functions below are needed by other files, move them
 // to paddle::filesystem namespace.
 constexpr char kSEP = '/';
@@ -66,6 +73,25 @@ class SaveOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
+    auto iname = Input("X");
+    auto *var = scope.FindVar(iname);
+    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
+                   iname);
+
+    if (var->IsType<framework::LoDTensor>()) {
+      SaveLodTensor(place, var);
+    } else if (var->IsType<framework::SelectedRows>()) {
+      SaveSelectedRows(scope, place, var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
+          iname);
+    }
+  }
+
+  void SaveLodTensor(const platform::Place &place,
+                     framework::Variable *var) const {
     auto filename = Attr<std::string>("file_path");
     auto overwrite = Attr<bool>("overwrite");
 
@@ -76,44 +102,81 @@ class SaveOp : public framework::OperatorBase {
 
     MkDirRecursively(DirName(filename).c_str());
 
+    auto &tensor = var->Get<framework::LoDTensor>();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
     // FIXME(yuyang18): We save variable to local file now, but we should change
     // it to save an output stream.
     std::ofstream fout(filename);
     PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
                    filename);
 
-    auto iname = Input("X");
-    auto *var = scope.FindVar(iname);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
-                   iname);
+    auto save_as_fp16 = Attr<bool>("save_as_fp16");
+    auto in_dtype = framework::ToDataType(tensor.type());
+    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor out;
+      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+      // copy LoD info to the new tensor
+      out.set_lod(tensor.lod());
+      framework::SerializeToStream(fout, out, dev_ctx);
+    } else {
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
+    fout.close();
+  }
 
-    PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                   "SaveOp only support LoDTensor, %s has wrong type", iname);
+  void SaveSelectedRows(const framework::Scope &scope,
+                        const platform::Place &place,
+                        framework::Variable *var) const {
+    auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
+    PADDLE_ENFORCE(
+        lt_var != nullptr,
+        "Can not find variable kLookupTablePath for SaveSelectedRows");
+    std::string filename = lt_var->data();
+    VLOG(4) << "SaveSelectedRows get File name: " << filename;
 
-    auto &tensor = var->Get<framework::LoDTensor>();
+    auto &selectedRows = var->Get<framework::SelectedRows>();
 
     // get device context from pool
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
 
-    framework::SerializeToStream(fout, tensor, dev_ctx);
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    std::ofstream fout(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+    framework::SerializeToStream(fout, selectedRows, dev_ctx);
+    fout.close();
   }
 };
 
 class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor ) Input tensor to be saved");
+  void Make() override {
+    AddInput("X", "(Tensor ) Input LoDTensor and SelectedRows to be saved");
     AddComment(R"DOC(
 Save operator
 
-This operator will serialize and write a tensor variable to file on disk.
+This operator will serialize and write LoDTensor / SelectedRows variable to file on disk.
 )DOC");
     AddAttr<bool>("overwrite",
                   "(boolean, default true)"
                   "Overwrite the output file if exist")
         .SetDefault(true);
+    AddAttr<bool>("save_as_fp16",
+                  "(boolean, default false)"
+                  "If true, the tensor will be converted to float16 data "
+                  "type and then saved. Otherwise, the tensor will be "
+                  "directly saved without data type conversion.")
+        .SetDefault(false);
     AddAttr<std::string>("file_path",
                          "(string)"
                          "The \"file_path\" where the variable will be saved.")
@@ -122,9 +185,26 @@ This operator will serialize and write a tensor variable to file on disk.
   }
 };
 
+class SaveOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front();
+    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    auto var_type = framework::proto::VarType::RAW;
+    out_var.SetType(var_type);
+  }
+};
+
+class SaveOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {}
+};
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker);
+REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker,
+                  ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference,
+                  ops::SaveOpShapeInference);
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index b16d06df8d0f7f57a5ec2f2be9a2cbb12a8ba55d..7f8822e40053b5bcd394f446138a2292d80b69bf 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -35,21 +35,19 @@ class ScaleOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) Input tensor of scale operator.");
     AddOutput("Out", "(Tensor) Output tensor of scale operator.");
     AddComment(R"DOC(
-Scale operator
+**Scale operator**
+
+Multiply the input tensor with a float scalar to scale the input tensor.
 
 $$Out = scale*X$$
 )DOC");
-    AddAttr<AttrType>("scale",
-                      "(float, default 1.0)"
-                      "The scaling factor of the scale operator.")
+    AddAttr<float>("scale", "The scaling factor of the scale operator.")
         .SetDefault(1.0);
   }
 };
@@ -73,8 +71,7 @@ class ScaleGradMaker : public framework::SingleGradOpDescMaker {
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
-                  ops::ScaleGradMaker);
+REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker);
 REGISTER_OP_CPU_KERNEL(
     scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
     ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index 3fb8b56d2676f90ff7e1cefa46c459ee37f63ca8..bf5e0d864495ce3a651a31c9d5a7664fe9eb2396 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -23,24 +23,24 @@ class ScatterOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Ref"),
-                   "Input(Ref) of ScatterOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Index"),
-                   "Input(Index) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                   "Input(Ids) of ScatterOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Updates"),
                    "Input(Updates) of ScatterOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ScatterOp should not be null.");
 
     auto updates_dims = ctx->GetInputDim("Updates");
-    auto ref_dims = ctx->GetInputDim("Ref");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1,
-                      "Update Index should be 1-D.");
+    auto ref_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Ids").size(), 1,
+                      "Update Ids should be 1-D.");
     PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
-                      "Reference and Updates should have the same shape size");
+                      "Xerence and Updates should have the same shape size");
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
-                      ctx->GetInputDim("Index")[0],
-                      "Updates and Index should have same batch-size.");
+                      ctx->GetInputDim("Ids")[0],
+                      "Updates and Ids should have same batch-size.");
     framework::DDim data_dim(updates_dims);
     for (int i = 1; i < data_dim.size(); ++i) {
       PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
@@ -52,7 +52,7 @@ class ScatterOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
         ctx.device_context());
   }
 };
@@ -64,25 +64,23 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("Updates"),
                       ctx->GetInputDim("Updates"));
-    ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
         ctx.device_context());
   }
 };
 
 class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Ref", "The source input of scatter op");
-    AddInput("Index",
-             "The index input of scatter op where Ref will be updated");
+  void Make() override {
+    AddInput("X", "The source input of scatter op");
+    AddInput("Ids", "The index input of scatter op where X will be updated");
     AddInput("Updates", "The updated value of updates op");
     AddOutput("Out", "The output of add op");
     AddComment(R"DOC(
@@ -91,8 +89,8 @@ Scatter Operator.
 This operator obtains output by updating the input on selected indices on the first axis:
 
 $$
-Out = Ref \\
-Out[Index] = Ref[Index] + Updates
+Out = X \\
+Out[Ids] = X[Ids] + Updates
 $$
 
 )DOC");
@@ -103,7 +101,8 @@ $$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
-            ops::ScatterGradOp);
+REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp);
 REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
 REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
index bdabb29fa680f8f87873b4381acf0dbd2b6195d0..a70b9091727935ddcbb83dd5775729969f7d64e5 100644
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "gather.cu.h"
+#include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/gather_op.h"
-#include "scatter.cu.h"
+#include "paddle/fluid/operators/scatter.cu.h"
+#include "paddle/fluid/operators/scatter_op.h"
 
 namespace paddle {
 namespace operators {
@@ -25,14 +26,14 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
 
-    Out->ShareDataWith(*Ref);
+    Out->ShareDataWith(*X);
 
-    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
 
@@ -42,16 +43,16 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                    "This kernel only runs on GPU device.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates = dO[Index]
-    GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates = dO[Ids]
+    GPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
 
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
index 3c6e7ece320229e1a311ef6d7a27387d40be3c2a..d29947b55e751a3e7993f765198364f4debe2472 100644
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "gather.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "scatter.h"
+#include "paddle/fluid/operators/gather.h"
+#include "paddle/fluid/operators/scatter.h"
 
 namespace paddle {
 namespace operators {
@@ -29,15 +29,15 @@ class ScatterOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto *Ref = ctx.Input<Tensor>("Ref");
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *X = ctx.Input<Tensor>("X");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
 
-    // In place output: Out = Ref, Out[Index] += Updates
-    Out->ShareDataWith(*Ref);
+    // In place output: Out = X, Out[Ids] += Updates
+    Out->ShareDataWith(*X);
     // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+    ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
   }
 };
 
@@ -47,16 +47,16 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext &ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                    "This kernel only runs on CPU.");
-    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
-    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Ids = ctx.Input<Tensor>("Ids");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
-    // In place gradient: dRef = dO
-    dRef->ShareDataWith(*dOut);
+    // In place gradient: dX = dO
+    dX->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
-    // Gradient by Gather: dUpdates += dO[Index]
-    CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+    // Gradient by Gather: dUpdates += dO[Ids]
+    CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);
   }
 };
 
diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc
index b67af3c3710eafc57b660a48e4c340d5eefe7e5b..750245153a7df6c4a7ce088038005dcab1685b5f 100644
--- a/paddle/fluid/operators/scatter_test.cc
+++ b/paddle/fluid/operators/scatter_test.cc
@@ -13,44 +13,48 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scatter.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/place.h"
-
 #include <gtest/gtest.h>
 #include <iostream>
 #include <string>
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
 
 TEST(scatter, ScatterUpdate) {
-  using namespace paddle::framework;
-  using namespace paddle::platform;
-  using namespace paddle::operators;
+  // using namespace paddle::framework;
+  // using namespace paddle::platform;
+  // using namespace paddle::operators;
 
-  Tensor* src = new Tensor();
-  Tensor* index = new Tensor();
-  Tensor* output = new Tensor();
+  paddle::framework::Tensor* src = new paddle::framework::Tensor();
+  paddle::framework::Tensor* index = new paddle::framework::Tensor();
+  paddle::framework::Tensor* output = new paddle::framework::Tensor();
 
   float* p_src = nullptr;
   int* p_index = nullptr;
-  p_src = src->mutable_data<float>(make_ddim({1, 4}), CPUPlace());
-  p_index = index->mutable_data<int>(make_ddim({1}), CPUPlace());
+  p_src = src->mutable_data<float>(paddle::framework::make_ddim({1, 4}),
+                                   paddle::platform::CPUPlace());
+  p_index = index->mutable_data<int>(paddle::framework::make_ddim({1}),
+                                     paddle::platform::CPUPlace());
 
-  for (size_t i = 0; i < 4; ++i) p_src[i] = float(i);
+  for (size_t i = 0; i < 4; ++i) p_src[i] = static_cast<float>(i);
   p_index[0] = 1;
 
-  float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
+  float* p_output = output->mutable_data<float>(
+      paddle::framework::make_ddim({4, 4}), paddle::platform::CPUPlace());
 
   auto* cpu_place = new paddle::platform::CPUPlace();
   paddle::platform::CPUDeviceContext ctx(*cpu_place);
-  ScatterAssign<float>(ctx, *src, *index, output);
+  paddle::operators::ScatterAssign<float>(ctx, *src, *index, output);
 
-  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
-  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
-  for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4));
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f);
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
+  for (size_t i = 4; i < 8; ++i) {
+    EXPECT_EQ(p_output[i], static_cast<float>(i - 4));
+  }
   for (size_t i = 4; i < 8; ++i)
-    EXPECT_EQ(output->data<float>()[i], float(i - 4));
-  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0));
-  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
+    EXPECT_EQ(output->data<float>()[i], static_cast<float>(i - 4));
+  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f);
+  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], 0.0f);
 
   delete src;
   delete index;
diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e71841d4d1815d50cd9800910c9db34e121beffc
--- /dev/null
+++ b/paddle/fluid/operators/select_op.cc
@@ -0,0 +1,419 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/concurrency/channel_util.h"
+
+#include <boost/tokenizer.hpp>
+
+namespace paddle {
+namespace operators {
+
+static constexpr char kX[] = "X";
+static constexpr char kCaseToExecute[] = "case_to_execute";
+static constexpr char kOutputs[] = "Out";
+
+static constexpr char kCases[] = "cases";
+static constexpr char kCasesBlock[] = "sub_block";
+
+class SelectOp : public framework::OperatorBase {
+ public:
+  SelectOp(const std::string &type, const framework::VariableNameMap &inputs,
+           const framework::VariableNameMap &outputs,
+           const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  enum class SelectOpCaseType {
+    DEFAULT = 0,
+    SEND = 1,
+    RECEIVE = 2,
+  };
+
+  struct SelectOpCase {
+    int caseIndex;
+    SelectOpCaseType caseType;
+    std::string channelName;
+    std::string varName;
+
+    SelectOpCase() {}
+
+    SelectOpCase(int caseIndex, SelectOpCaseType caseType,
+                 std::string channelName, std::string varName)
+        : caseIndex(caseIndex),
+          caseType(caseType),
+          channelName(channelName),
+          varName(varName) {}
+  };
+
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    std::vector<std::string> casesConfigs =
+        Attr<std::vector<std::string>>(kCases);
+
+    framework::BlockDesc *casesBlock =
+        Attr<framework::BlockDesc *>(kCasesBlock);
+
+    framework::Scope &casesBlockScope = scope.NewScope();
+
+    std::string caseToExecuteVarName = Input(kCaseToExecute);
+    framework::Variable *caseToExecuteVar =
+        casesBlockScope.FindVar(caseToExecuteVarName);
+
+    // Construct cases from "conditional_block_op"(s) in the casesBlock
+    std::vector<std::shared_ptr<SelectOpCase>> cases =
+        ParseAndShuffleCases(&casesConfigs);
+
+    // Get all unique channels involved in select
+    std::set<framework::ChannelHolder *> channelsSet;
+    for (auto c : cases) {
+      if (!c->channelName.empty()) {
+        auto channelVar = scope.FindVar(c->channelName);
+        framework::ChannelHolder *ch =
+            channelVar->GetMutable<framework::ChannelHolder>();
+
+        if (channelsSet.find(ch) == channelsSet.end()) {
+          channelsSet.insert(ch);
+        }
+      }
+    }
+
+    // Order all channels by their pointer address
+    std::vector<framework::ChannelHolder *> channels(channelsSet.begin(),
+                                                     channelsSet.end());
+    std::sort(channels.begin(), channels.end());
+
+    // Poll all cases
+    int32_t caseToExecute = pollCases(&scope, &cases, channels);
+
+    // At this point, the case to execute has already been determined,
+    // so we can proceed with executing the cases block
+    framework::LoDTensor *caseToExecuteTensor =
+        caseToExecuteVar->GetMutable<framework::LoDTensor>();
+    caseToExecuteTensor->data<int32_t>()[0] = caseToExecute;
+
+    // Execute the cases block, only one case will be executed since we set the
+    // case_to_execute value to the index of the case we want to execute
+    framework::Executor executor(dev_place);
+    framework::ProgramDesc *program = casesBlock->Program();
+    executor.Run(*program, &casesBlockScope, casesBlock->ID(),
+                 false /*create_local_scope*/);
+  }
+
+  /**
+   * Goes through all operators in the casesConfigs and processes
+   * "conditional_block" operators.  These operators are mapped to our
+   * SelectOpCase objects.  We randomize the case orders, and set the
+   * default case (if any exists) as the last case)
+   * @param casesBlock
+   * @return
+   */
+  std::vector<std::shared_ptr<SelectOpCase>> ParseAndShuffleCases(
+      std::vector<std::string> *casesConfigs) const {
+    std::vector<std::shared_ptr<SelectOpCase>> cases;
+    std::shared_ptr<SelectOpCase> defaultCase;
+
+    if (casesConfigs != nullptr) {
+      boost::char_delimiters_separator<char> sep(false, ",", "");
+      for (std::vector<std::string>::iterator itr = casesConfigs->begin();
+           itr < casesConfigs->end(); ++itr) {
+        std::string caseConfig = *itr;
+        boost::tokenizer<> tokens(caseConfig, sep);
+
+        boost::tokenizer<>::iterator tok_iter = tokens.begin();
+        PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case index");
+        std::string caseIndexString = *tok_iter;
+        int caseIndex = std::stoi(caseIndexString);
+
+        ++tok_iter;
+        PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case type");
+        std::string caseTypeString = *tok_iter;
+        SelectOpCaseType caseType = (SelectOpCaseType)std::stoi(caseTypeString);
+
+        std::string caseChannel;
+        std::string caseChannelVar;
+
+        ++tok_iter;
+        if (caseType != SelectOpCaseType::DEFAULT) {
+          PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case channel");
+          caseChannel = *tok_iter;
+
+          ++tok_iter;
+          PADDLE_ENFORCE(tok_iter != tokens.end(),
+                         "Cannot get case channel variable");
+          caseChannelVar = *tok_iter;
+        }
+
+        auto c = std::make_shared<SelectOpCase>(caseIndex, caseType,
+                                                caseChannel, caseChannelVar);
+
+        if (caseType == SelectOpCaseType::DEFAULT) {
+          PADDLE_ENFORCE(defaultCase == nullptr,
+                         "Select can only contain one default case.");
+          defaultCase = c;
+        } else {
+          cases.push_back(c);
+        }
+      }
+    }
+
+    // Randomly sort cases, with default case being last
+    std::random_shuffle(cases.begin(), cases.end());
+    if (defaultCase != nullptr) {
+      cases.push_back(defaultCase);
+    }
+
+    return cases;
+  }
+
+  /**
+   * This method will recursively poll the cases and determines if any case
+   * condition is true.
+   * If none of the cases conditions are true (and there is no default case),
+   * then block
+   * the thread.  The thread may be woken up by a channel operation, at which
+   * point we
+   * execute the case.
+   * @param scope
+   * @param cases
+   * @param channels
+   * @return
+   */
+  int32_t pollCases(const framework::Scope *scope,
+                    std::vector<std::shared_ptr<SelectOpCase>> *cases,
+                    std::vector<framework::ChannelHolder *> channels) const {
+    // Lock all involved channels
+    lockChannels(channels);
+
+    std::atomic<int> caseToExecute(-1);
+
+    std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
+    while (it != cases->end()) {
+      std::shared_ptr<SelectOpCase> c = *it;
+
+      auto chVar = scope->FindVar(c->channelName);
+      framework::ChannelHolder *ch =
+          chVar->GetMutable<framework::ChannelHolder>();
+
+      switch (c->caseType) {
+        case SelectOpCaseType::SEND:
+          PADDLE_ENFORCE(!ch->IsClosed(), "Cannot send to a closed channel");
+          if (ch->CanSend()) {
+            // We can send to channel directly, send the data to channel
+            // and execute case
+            auto chVar = scope->FindVar(c->varName);
+            concurrency::ChannelSend(ch, chVar);
+            caseToExecute = c->caseIndex;
+          }
+          break;
+        case SelectOpCaseType::RECEIVE:
+          if (ch->CanReceive()) {
+            // We can receive from channel directly, send the data to channel
+            // and execute case
+            auto chVar = scope->FindVar(c->varName);
+            concurrency::ChannelReceive(ch, chVar);
+            caseToExecute = c->caseIndex;
+          }
+          break;
+        case SelectOpCaseType::DEFAULT:
+          caseToExecute = c->caseIndex;
+          break;
+      }
+
+      if (caseToExecute != -1) {
+        // We found a case to execute, stop looking at other case statements
+        break;
+      }
+
+      ++it;
+    }
+
+    if (caseToExecute == -1) {
+      // None of the cases are eligible to execute, enqueue current thread
+      // into all the sending/receiving queue of each involved channel
+      std::atomic<bool> completed(false);
+      std::recursive_mutex mutex;
+      std::unique_lock<std::recursive_mutex> lock{mutex};
+      // std::condition_variable_any selectCond;
+      auto selectCond = std::make_shared<std::condition_variable_any>();
+
+      std::recursive_mutex callbackMutex;
+      pushThreadOnChannelQueues(scope, cases, selectCond, &caseToExecute,
+                                &completed, &callbackMutex);
+
+      // TODO(thuan): Atomically unlock all channels and sleep current thread
+      unlockChannels(channels);
+      selectCond->wait(lock, [&completed]() { return completed.load(); });
+
+      // Select has been woken up by case operation
+      lockChannels(channels);
+      removeThreadOnChannelQueues(scope, cases);
+
+      if (caseToExecute == -1) {
+        // Recursively poll cases, since we were woken up by a channel close
+        // TODO(thuan): Need to test if this is a valid case
+        unlockChannels(channels);
+        return pollCases(scope, cases, channels);
+      }
+    }
+
+    // At this point, caseToExecute != -1, and we can proceed with executing
+    // the case block
+    unlockChannels(channels);
+
+    return caseToExecute;
+  }
+
+  void lockChannels(std::vector<framework::ChannelHolder *> chs) const {
+    std::vector<framework::ChannelHolder *>::iterator it = chs.begin();
+    while (it != chs.end()) {
+      framework::ChannelHolder *ch = *it;
+      ch->Lock();
+      ++it;
+    }
+  }
+
+  void unlockChannels(std::vector<framework::ChannelHolder *> chs) const {
+    std::vector<framework::ChannelHolder *>::reverse_iterator it = chs.rbegin();
+    while (it != chs.rend()) {
+      framework::ChannelHolder *ch = *it;
+      ch->Unlock();
+      ++it;
+    }
+  }
+
+  void pushThreadOnChannelQueues(
+      const framework::Scope *scope,
+      std::vector<std::shared_ptr<SelectOpCase>> *cases,
+      std::shared_ptr<std::condition_variable_any> rCond,
+      std::atomic<int> *caseToExecute, std::atomic<bool> *completed,
+      std::recursive_mutex *callbackMutex) const {
+    std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
+    while (it != cases->end()) {
+      std::shared_ptr<SelectOpCase> c = *it;
+
+      auto chVar = scope->FindVar(c->channelName);
+      framework::ChannelHolder *ch =
+          chVar->GetMutable<framework::ChannelHolder>();
+
+      std::function<bool(framework::ChannelAction channelAction)> cb =
+          [&caseToExecute, &completed, &callbackMutex,
+           c](framework::ChannelAction channelAction) {
+            std::lock_guard<std::recursive_mutex> lock{*callbackMutex};
+
+            bool canProcess = false;
+            if (!(*completed)) {
+              // If the channel wasn't closed, we set the caseToExecute index
+              // as this current case
+              if (channelAction != framework::ChannelAction::CLOSE) {
+                *caseToExecute = c->caseIndex;
+              }
+              // This will allow our conditional variable to break out of wait
+              *completed = true;
+              canProcess = true;
+            }
+
+            return canProcess;
+          };
+
+      switch (c->caseType) {
+        case SelectOpCaseType::SEND: {
+          auto chOutputVar = scope->FindVar(c->varName);
+          concurrency::ChannelAddToSendQ(ch, this, chOutputVar, rCond, cb);
+          break;
+        }
+        case SelectOpCaseType::RECEIVE: {
+          auto chOutputVar = scope->FindVar(c->varName);
+          concurrency::ChannelAddToReceiveQ(ch, this, chOutputVar, rCond, cb);
+          break;
+        }
+        default:
+          break;
+      }
+      ++it;
+    }
+  }
+
+  void removeThreadOnChannelQueues(
+      const framework::Scope *scope,
+      std::vector<std::shared_ptr<SelectOpCase>> *cases) const {
+    std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
+    while (it != cases->end()) {
+      std::shared_ptr<SelectOpCase> c = *it;
+
+      auto chVar = scope->FindVar(c->channelName);
+      framework::ChannelHolder *ch =
+          chVar->GetMutable<framework::ChannelHolder>();
+      switch (c->caseType) {
+        case SelectOpCaseType::SEND: {
+          ch->RemoveFromSendQ(this);
+          break;
+        }
+        case SelectOpCaseType::RECEIVE: {
+          ch->RemoveFromReceiveQ(this);
+          break;
+        }
+        default:
+          break;
+      }
+      ++it;
+    }
+  }
+};
+
+class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(kX,
+             "A set of variables, which are required by operators inside the "
+             "cases of Select Op")
+        .AsDuplicable();
+    AddInput(kCaseToExecute,
+             "(Int) The variable the sets the index of the case to execute, "
+             "after evaluating the channels being sent to and received from")
+        .AsDuplicable();
+    AddOutput(kOutputs,
+              "A set of variables, which will be assigned with values "
+              "generated by the operators inside the cases of Select Op.")
+        .AsDuplicable();
+    AddAttr<std::vector<std::string>>(kCases,
+                                      "(String vector) Serialized list of"
+                                      "all cases in the select op. Each"
+                                      "case is serialized as: "
+                                      "'<index>,<type>,<channel>,<value>'"
+                                      "where type is 0 for default, 1 for"
+                                      "send, and 2 for receive"
+                                      "No channel and values are needed for"
+                                      "default cases.");
+    AddAttr<framework::BlockDesc *>(kCasesBlock,
+                                    "The cases block inside select_op");
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+
+// TODO(thuan): Implement Gradient Operator for SELECT_OP
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(select, paddle::operators::SelectOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::SelectOpMaker);
diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6b4572dcccc21e783f1df0b9bcde11d532ff4ba8
--- /dev/null
+++ b/paddle/fluid/operators/send_barrier_op.cc
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <future>  // NOLINT
+#include <ostream>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detail/macros.h"
+
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+class SendBarrierOp : public framework::OperatorBase {
+ public:
+  SendBarrierOp(const std::string& type,
+                const framework::VariableNameMap& inputs,
+                const framework::VariableNameMap& outputs,
+                const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
+    bool sync_mode = Attr<bool>("sync_mode");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+
+    VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
+
+    // need to wait before sending send_barrier message
+    rpc_client->Wait();
+    if (sync_mode) {
+      for (auto& ep : eps) {
+        VLOG(3) << "send barrier, ep: " << ep;
+        rpc_client->AsyncSendBatchBarrier(ep);
+      }
+      rpc_client->Wait();
+    }
+  }
+};
+
+class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddComment(R"DOC(
+SendBarrier operator
+
+This operator will send a send barrier signal to list_and_serv op, so that
+the Parameter Server would knew all variables have been sent.
+)DOC");
+
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.")
+        .SetDefault({"127.0.0.1:6164"});
+    AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
+  }
+};
+
+class SendBarrierOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp,
+                  paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker,
+                  ops::SendBarrierOpShapeInference);
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index 443f40e803ea31c3961ed77842bd0775e0f74f35..0cac329aafa8c4c67cae48ba62a48575f5edba92 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -12,34 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-
-#include <future>
-#include "paddle/fluid/operators/detail/grpc_client.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/send_recv_util.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
-static bool NeedSend(const framework::Scope& scope,
-                     const std::string& varname) {
-  auto* var = scope.FindVar(varname);
-  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
-                          varname);
-  if (var->IsType<framework::LoDTensor>()) {
-    return var->Get<framework::LoDTensor>().IsInitialized();
-  } else if (var->IsType<framework::SelectedRows>()) {
-    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
-  } else {
-    PADDLE_THROW(
-        "Variable type in send side should be in "
-        "[LodTensor, SelectedRows]");
-  }
-  return false;
-}
 
 class SendOp : public framework::OperatorBase {
  public:
@@ -51,90 +35,54 @@ class SendOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
     auto ins = Inputs("X");
-    auto outs = Outputs("Out");
+
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    std::vector<std::string> endpoints =
-        Attr<std::vector<std::string>>("endpoints");
+    int sync_send = Attr<int>("sync_mode");
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
 
-    auto client_var_name = Output("RPCClient");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
-                            "Can not find variable '%s' in the scope.",
-                            client_var_name);
-    auto* client_var = scope.FindVar(client_var_name);
-    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+    // For profiling
+    platform::RecordEvent record_event(Type(), &ctx);
+
+    distributed::RPCClient* rpc_client =
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>();
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
-        rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+        // TODO(Yancey1989): we need to use an IO threadpool which has
+        // a larger number of threads than the computing threadpool.
+        rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]);
       } else {
         VLOG(3) << "don't send no-initialied variable: " << ins[i];
       }
     }
-    PADDLE_ENFORCE(rpc_client->Wait());
-
-    for (auto& ep : endpoints) {
-      VLOG(3) << "batch barrier, ep: " << ep;
-      rpc_client->AsyncSendBatchBarrier(ep);
-    }
-    PADDLE_ENFORCE(rpc_client->Wait());
-
-    if (outs.size() > 0) {
-      for (size_t i = 0; i < outs.size(); i++) {
-        VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
-        rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
-      }
-      PADDLE_ENFORCE(rpc_client->Wait());
-      // tell pservers that current trainer have called fetch
-      for (auto& ep : endpoints) {
-        VLOG(3) << "send fetch barrier, ep: " << ep;
-        rpc_client->AsyncSendFetchBarrier(ep);
-      }
-      PADDLE_ENFORCE(rpc_client->Wait());
+    if (sync_send) {
+      rpc_client->Wait();
     }
   }
 };
 
 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SendOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
-    AddOutput("Out", "(Tensor) Output tensor to be received from server")
+  void Make() {
+    AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
         .AsDuplicable();
-    AddOutput("RPCClient",
-              "(RPCClient) The RPC client object which is"
-              "initialized at most once.");
     AddComment(R"DOC(
 Send operator
 
-This operator will send tensor to recv_op at the parameter server.
+This operator will send variables to listen_and_serve op at the parameter server.
 )DOC");
-    // TODO(typhoonzero): remove this attr generate de-duplicated vector from
-    // epmap when initializing.
-    AddAttr<std::vector<std::string>>("endpoints",
-                                      "(string vector, default 127.0.0.1:6164)"
-                                      "Server endpoints to send variables to.")
-        .SetDefault({});
+    AddAttr<int>("sync_mode",
+                 "(int, default 0)"
+                 "sync send or async send.")
+        .SetDefault(0);
     AddAttr<std::vector<std::string>>("epmap",
                                       "(string vector, default 127.0.0.1:6164)"
                                       "Server endpoints in the order of input "
                                       "variables for mapping")
-        .SetDefault({});
-  }
-};
-
-class SendOpVarTypeInference : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output("RPCClient").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
+        .SetDefault({"127.0.0.1:6164"});
   }
 };
 
@@ -149,5 +97,4 @@ class SendOpShapeInference : public framework::InferShapeBase {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(send, ops::SendOp, paddle::framework::EmptyGradOpMaker,
-                  ops::SendOpMaker, ops::SendOpVarTypeInference,
-                  ops::SendOpShapeInference);
+                  ops::SendOpMaker, ops::SendOpShapeInference);
diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
index e9fb845b475ff5776bf948ab120a44c16ed87aa0..aee6180add5708d31f7ce927b37c4524a291fe3c 100644
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -14,12 +14,13 @@ limitations under the License. */
 
 #include <unistd.h>
 #include <string>
-#include <thread>
+#include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/listen_and_serv_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/string/printf.h"
@@ -34,12 +35,13 @@ namespace m = paddle::operators::math;
 
 // global for simplicity.
 std::unique_ptr<f::OperatorBase> listen_and_serv_op;
+int selected_port;
 
-void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   for (int i = 0; i < 2; ++i) {
     auto var_name = paddle::string::Sprintf("x%d", i);
-    auto var = scope.Var(var_name);
+    auto var = scope->Var(var_name);
     auto tensor = var->GetMutable<f::LoDTensor>();
     tensor->Resize({10, 10});
     float *expect = tensor->mutable_data<float>(place);
@@ -48,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
     }
   }
 
-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out_tensor = out_var->GetMutable<f::LoDTensor>();
   out_tensor->Resize({10, 10});
   out_tensor->mutable_data<float>(place);  // allocate
 }
 
-void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   int64_t height = 10;
   int64_t row_numel = 10;
   m::SetConstant<p::CPUDeviceContext, float> set_one;
   // init x0
   std::vector<int64_t> rows0{0, 4, 7};
-  auto x0_var = scope.Var("x0");
+  auto x0_var = scope->Var("x0");
   auto x0 = x0_var->GetMutable<f::SelectedRows>();
   x0->set_rows(rows0);
   x0->set_height(height);
@@ -72,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
 
   // init x1
   std::vector<int64_t> rows1{2, 9};
-  auto x1_var = scope.Var("x1");
+  auto x1_var = scope->Var("x1");
   auto x1 = x1_var->GetMutable<f::SelectedRows>();
   x1->set_rows(rows1);
   x1->set_height(height);
@@ -81,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
       f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
   set_one(ctx, x1_value, 1.0);
 
-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out = out_var->GetMutable<f::SelectedRows>();
   auto out_value = out->mutable_value();
   out->set_height(height);
@@ -90,12 +92,16 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
 
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
            const f::VariableNameMap &outputs, f::AttributeMap attrs,
-           f::BlockDesc *block) {
+           f::BlockDesc *block, bool is_sparse) {
   // insert output
   for (auto kv : outputs) {
     for (auto v : kv.second) {
       auto var = block->Var(v);
       var->SetDataType(f::proto::VarType::FP32);
+      var->SetPersistable(true);
+      if (is_sparse) {
+        var->SetType(f::proto::VarType::SELECTED_ROWS);
+      }
     }
   }
 
@@ -111,48 +117,72 @@ void AddOp(const std::string &type, const f::VariableNameMap &inputs,
   op->SetAttrMap(attrs);
 }
 
-void StartServerNet(bool is_sparse) {
+void StartServerNet(bool is_sparse, std::atomic<bool> *initialized) {
   f::Scope scope;
   p::CPUPlace place;
+  VLOG(4) << "before init tensor";
   if (is_sparse) {
-    InitSelectedRowsInScope(scope, place);
+    InitSelectedRowsInScope(place, &scope);
   } else {
-    InitTensorsInScope(scope, place);
+    InitTensorsInScope(place, &scope);
   }
-
   // sub program run in listen_and_serv_op, for simple test we use sum
   f::ProgramDesc program;
-  f::BlockDesc *optimize_block = program.MutableBlock(0);
-  // X for server side tensors, RX for received tensers, must be of same shape.
-  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
-
+  const auto &root_block = program.Block(0);
+  std::vector<framework::BlockDesc *> optimize_blocks;
+  auto *optimize_block = program.AppendBlock(root_block);
+  optimize_blocks.push_back(optimize_block);
+
+  auto *prefetch_block = program.AppendBlock(root_block);
+  // X for server side tensors, RX for received tensors, must be of same shape.
+  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block,
+        is_sparse);
   f::AttributeMap attrs;
-  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+  attrs.insert({"endpoint", std::string("127.0.0.1:0")});
   attrs.insert({"Fanin", 1});
   attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
   attrs.insert({"GradList", std::vector<std::string>({"x1"})});
-  attrs.insert({"OptimizeBlock", optimize_block});
+  attrs.insert({"optimize_blocks", optimize_blocks});
+  attrs.insert({"PrefetchBlock", prefetch_block});
+  attrs.insert({"grad_to_block_id", std::vector<std::string>({""})});
+  attrs.insert({"sync_mode", true});
+  VLOG(4) << "before init op";
   listen_and_serv_op =
       f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
+  *initialized = true;
   listen_and_serv_op->Run(scope, place);
+  LOG(INFO) << "server exit";
 }
 
 TEST(SendRecvOp, CPUDense) {
-  std::thread server_thread(StartServerNet, false);
-  sleep(5);  // wait server to start
+  std::atomic<bool> initialized{false};
+  std::thread server_thread(StartServerNet, false, &initialized);
+  while (!initialized) {
+  }
+
+  static_cast<paddle::operators::ListenAndServOp *>(listen_and_serv_op.get())
+      ->WaitServerReady();
+
   // local net
   f::Scope scope;
   p::CPUPlace place;
-  InitTensorsInScope(scope, place);
+  InitTensorsInScope(place, &scope);
   // create rpc client var
   scope.Var("RPC_CLIENT_VAR");
 
   f::AttributeMap attrs;
-  attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
-  attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
-  auto send_op = f::OpRegistry::CreateOp(
-      "send", {{"X", {"x1"}}},
-      {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
+  auto *listen_and_serv_op_ptr =
+      static_cast<paddle::operators::ListenAndServOp *>(
+          listen_and_serv_op.get());
+  ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
+  selected_port = listen_and_serv_op_ptr->GetSelectedPort();
+  std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
+  attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
+  attrs.insert({"epmap", std::vector<std::string>({endpoint})});
+  const f::VariableNameMap &inputs = {{"X", {"x1"}}};
+  const f::VariableNameMap &outputs = {{"Out", {"Out"}}};
+
+  auto send_op = f::OpRegistry::CreateOp("send", inputs, outputs, attrs);
   send_op->Run(scope, place);
 
   auto in_var = scope.Var("x1");
@@ -169,23 +199,34 @@ TEST(SendRecvOp, CPUDense) {
   listen_and_serv_op->Stop();
   server_thread.join();
   listen_and_serv_op.reset(nullptr);
+  paddle::operators::ListenAndServOp::ResetPort();
 }
 
 TEST(SendRecvOp, CPUSparse) {
-  std::thread server_thread(StartServerNet, true);
-  sleep(3);  // wait server to start
+  std::atomic<bool> initialized;
+  initialized = false;
+  std::thread server_thread(StartServerNet, true, &initialized);
+  while (!initialized) {
+  }
+  auto *listen_and_serv_op_ptr =
+      static_cast<paddle::operators::ListenAndServOp *>(
+          listen_and_serv_op.get());
+  ASSERT_TRUE(listen_and_serv_op_ptr != nullptr);
+  listen_and_serv_op_ptr->WaitServerReady();
+
   // local net
   f::Scope scope;
   p::CPUPlace place;
   p::CPUDeviceContext ctx(place);
-  InitSelectedRowsInScope(scope, place);
+  InitSelectedRowsInScope(place, &scope);
   scope.Var("RPC_CLIENT_VAR");
   f::AttributeMap attrs;
-  attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
-  attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
-  auto send_op = f::OpRegistry::CreateOp(
-      "send", {{"X", {"x1"}}},
-      {{"Out", {"Out"}}, {"RPCClient", {"RPC_CLIENT_VAR"}}}, attrs);
+  selected_port = listen_and_serv_op_ptr->GetSelectedPort();
+  std::string endpoint = paddle::string::Sprintf("127.0.0.1:%d", selected_port);
+  attrs.insert({"endpoints", std::vector<std::string>({endpoint})});
+  attrs.insert({"epmap", std::vector<std::string>({endpoint})});
+  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
+                                         {{"Out", {"Out"}}}, attrs);
   send_op->Run(scope, place);
 
   auto x0 = scope.Var("x0")->GetMutable<f::SelectedRows>();
@@ -210,4 +251,5 @@ TEST(SendRecvOp, CPUSparse) {
   listen_and_serv_op->Stop();
   server_thread.join();
   listen_and_serv_op.reset();
+  paddle::operators::ListenAndServOp::ResetPort();
 }
diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..deab005149027caffa962783df944fad7110382f
--- /dev/null
+++ b/paddle/fluid/operators/send_recv_util.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+inline bool NeedSend(const framework::Scope& scope,
+                     const std::string& varname) {
+  // dummy variable is only used in parallel executor to represent
+  // some dependency relationship, we don't need to send/recv it.
+  if (varname == "dummy") return false;
+  auto* var = scope.FindVar(varname);
+  PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
+                          varname);
+  if (var->IsType<framework::LoDTensor>()) {
+    return var->Get<framework::LoDTensor>().IsInitialized();
+  } else if (var->IsType<framework::SelectedRows>()) {
+    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
+  } else {
+    PADDLE_THROW(
+        "Variable type in send side should be in "
+        "[LodTensor, SelectedRows]");
+  }
+  return false;
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/sequence_concat_op.cc b/paddle/fluid/operators/sequence_concat_op.cc
index 126753edd09e8bd0f9d5a08936afbc6326b29ace..077b9a5f7d935a39706ef3c2b710522bf1b713ed 100644
--- a/paddle/fluid/operators/sequence_concat_op.cc
+++ b/paddle/fluid/operators/sequence_concat_op.cc
@@ -43,8 +43,7 @@ class SequenceConcatOp : public framework::OperatorWithKernel {
 
 class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LodTensorArray) Input is a vector of LoDTensor, "
              "each of which is a variable-length sequence or nested sequence.")
@@ -124,9 +123,11 @@ class SequenceConcatGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
-               ops::SequenceConcatOpMaker, sequence_concat_grad,
-               ops::SequenceConcatGradOp, false);
+REGISTER_OPERATOR(sequence_concat, ops::SequenceConcatOp,
+                  ops::SequenceConcatOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<
+                      false> /* set false to disable empty grad */);
+REGISTER_OPERATOR(sequence_concat_grad, ops::SequenceConcatGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_concat,
     ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_concat_op.h b/paddle/fluid/operators/sequence_concat_op.h
index 9f04c4199130de3cead6f23ef111453ca752c0e3..71c9f45287c29628a2f2c8c649e9e5270317ef6a 100644
--- a/paddle/fluid/operators/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_concat_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 
diff --git a/paddle/fluid/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc
index ec1f3a5da8c1fc8933b3720802ea901695195dec..ec6cb24350ae276724aae339590d40be1e9ea400 100644
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_conv_op.h"
 
+#include <algorithm>
+
 namespace paddle {
 namespace operators {
 
@@ -100,8 +102,7 @@ class SequenceConvGradOp : public framework::OperatorWithKernel {
 
 class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(LoDTensor) the input(X) is a LodTensor, which supports "
@@ -174,8 +175,9 @@ context_length, context_stride and context_start.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
-            sequence_conv_grad, ops::SequenceConvGradOp);
+REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp);
 
 REGISTER_OP_CPU_KERNEL(
     sequence_conv,
diff --git a/paddle/fluid/operators/sequence_conv_op.h b/paddle/fluid/operators/sequence_conv_op.h
index ee48339c52e348e7b3060bbdd462177375aee9f5..ee70281d51673b94a1451f636e607fad3404863b 100644
--- a/paddle/fluid/operators/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_conv_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/context_project.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -32,7 +33,6 @@ class SequenceConvKernel : public framework::OpKernel<T> {
     auto filter = *context.Input<Tensor>("Filter");
 
     out->mutable_data<T>(context.GetPlace());
-    context.ShareLoD("X", "Out");
 
     int context_start = context.Attr<int>("contextStart");
     int context_length = context.Attr<int>("contextLength");
@@ -58,17 +58,15 @@ class SequenceConvKernel : public framework::OpKernel<T> {
     // Because if padding_trainable is false, padding data should be zeros.
     math::SetConstant<DeviceContext, T> set_zero;
     auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     set_zero(dev_ctx, &col, static_cast<T>(0));
-
     math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
 
     seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable,
                         context_start, context_length, context_stride, up_pad,
                         down_pad, &col);
 
-    math::matmul<DeviceContext, T>(dev_ctx, col, false, filter, false,
-                                   static_cast<T>(1.0), out,
-                                   static_cast<T>(0.0));
+    blas.MatMul(col, filter, out);
   }
 };
 
@@ -99,6 +97,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
 
     math::SetConstant<DeviceContext, T> set_zero;
     auto& dev_ctx = context.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     // use col_shape in the im2col calculation
     framework::DDim col_shape = {in->dims()[0],
                                  sequence_width * context_length};
@@ -108,8 +107,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
       col.mutable_data<T>(col_shape, context.GetPlace());
       // Because if padding_trainable is false, padding data should be zeros.
       set_zero(dev_ctx, &col, static_cast<T>(0));
-      math::matmul<DeviceContext, T>(dev_ctx, *out_g, false, *filter, true,
-                                     T(1.0), &col, T(1.0));
+      blas.MatMul(*out_g, false, *filter, true, &col);
     }
     math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
     math::ContextProjectGradFunctor<DeviceContext, T> seq_project_grad_functor;
@@ -150,8 +148,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
                           context_start, context_length, context_stride, up_pad,
                           down_pad, &col);
 
-      math::matmul<DeviceContext, T>(dev_ctx, col, true, out_grad, false,
-                                     T(1.0), &filter_grad, T(1.0));
+      blas.MatMul(col, true, out_grad, false, &filter_grad);
     }
   }
 };
diff --git a/paddle/fluid/operators/sequence_erase_op.cc b/paddle/fluid/operators/sequence_erase_op.cc
index 32b9d7f7c1528a365cd21122e4df0e3c1407a49e..1c86486157a02c3b78ed61e840fd8e452b9cb452 100644
--- a/paddle/fluid/operators/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_erase_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_erase_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -36,8 +37,7 @@ class SequenceEraseOp : public framework::OperatorWithKernel {
 
 class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(2-D LoDTensor with the 2nd dim. equal to 1) "
              "Input LoDTensor of SequenceEraseOp.");
diff --git a/paddle/fluid/operators/sequence_erase_op.cu b/paddle/fluid/operators/sequence_erase_op.cu
index fc9b91c351defb92246e0966b9993fd1e288aaac..3a58e47f1132cd1ac85584b2470e8c6cddcfb28a 100644
--- a/paddle/fluid/operators/sequence_erase_op.cu
+++ b/paddle/fluid/operators/sequence_erase_op.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include "paddle/fluid/operators/sequence_erase_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sequence_erase_op.h b/paddle/fluid/operators/sequence_erase_op.h
index b490c34f543875f73e0862c08c25bcb57611e2f4..265390528a15aa060900276f98128d754fc907fe 100644
--- a/paddle/fluid/operators/sequence_erase_op.h
+++ b/paddle/fluid/operators/sequence_erase_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc
index a5d84d629b2e50763dac9bc571ac490414a8a406..944c7f85e5f43679e1875fcce813382be2ba5526 100644
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using framework::Tensor;
+using framework::LoDTensor;
 
 class SequenceExpandOp : public framework::OperatorWithKernel {
  public:
@@ -25,100 +25,152 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"));
-    PADDLE_ENFORCE(ctx->HasOutput("Out"));
-    PADDLE_ENFORCE(ctx->HasInput("Y"));
-    framework::DDim out_dim;
-    auto y_dim = ctx->GetInputDim("Y");
-    out_dim = ctx->GetInputDim("X");
-    out_dim[0] = y_dim[0];
-    ctx->ShareLoD("Y", "Out");
-    ctx->SetOutputDim("Out", out_dim);
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceExpandOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of SequenceExpandOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceExpandOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = x_dims;
+    int ref_level = ctx->Attrs().Get<int>("ref_level");
+
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Dimension number of Input(X) should be at least 2.");
+
+    if (ctx->IsRuntime()) {
+      framework::Variable* x_var =
+          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
+      framework::Variable* y_var =
+          boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
+
+      auto& x_lod = x_var->Get<LoDTensor>().lod();
+      auto& y_lod = y_var->Get<LoDTensor>().lod();
+
+      PADDLE_ENFORCE_LE(x_lod.size(), 1,
+                        "Level number of Input(X)'s lod should not be "
+                        "greater than 1.");
+      PADDLE_ENFORCE_GT(y_lod.size(), 0,
+                        "Level number of Input(Y)'s lod should be "
+                        "greater than 0.");
+      PADDLE_ENFORCE(
+          ref_level == -1 ||
+              (ref_level >= 0 && ref_level < static_cast<int>(y_lod.size())),
+          "Invlid `ref_level`, which should be either equal to -1 "
+          "or in [0, %d)",
+          y_lod.size());
+
+      if (ref_level == -1) ref_level = y_lod.size() - 1;
+
+      if (x_lod.size() > 0) {
+        PADDLE_ENFORCE(x_lod[0].size() == y_lod[ref_level].size(),
+                       "Level number of Input(X)'s lod could be 0. Otherwise "
+                       "size of Input(X)'s first level lod should be equal to "
+                       "size of Input(Y)'s referred level lod.");
+      }
+
+      int64_t out_first_dim = 0;
+      if (y_lod[ref_level].size() <= 1) {
+        out_first_dim = x_dims[0];
+      } else {
+        for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+          int x_seq_len = 1;
+          if (x_lod.size() == 1) {
+            x_seq_len = x_lod[0][i] - x_lod[0][i - 1];
+          }
+          out_first_dim +=
+              (y_lod[ref_level][i] - y_lod[ref_level][i - 1]) * x_seq_len;
+        }
+      }
+      out_dims[0] = out_first_dim;
+    } else {
+      out_dims[0] = -1;
+    }
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
 class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
-             "(Tensor or LoDTensor) The input(X) of this operator can be a "
-             "LoDTensor or a base Tensor.");
+             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor whose lod "
+             "level is at most 1.");
     AddInput("Y",
-             "(LoDTensor)The reference input(Y) of sequence_expand op."
-             "It must be a LoDTensor with k-level(k>0)."
-             "The input(X) will be expanded according to LOD of input(Y)."
-             "The element numbers of last level in input(Y) "
-             "must be equal to dims[0] of input(X).");
+             "(LoDTensor, default LoDTensor<float>) Referred LoDTensor whose "
+             "lod (specified level) is referred by Input(X).");
     AddOutput("Out",
-              "(LodTensor)The output of sequence_expand op."
-              "The lod of output will be as same as input(Y)'s lod.");
+              "(LodTensor, default LoDTensor<float>) Output LoDTensor which is "
+              "generated from Input(X) by referring lod of Input(Y).");
+    AddAttr<int>("ref_level", "Specify lod level of Input(Y).").SetDefault(-1);
     AddComment(R"DOC(
 Sequence Expand Operator.
 
-This operator expands input(X) according to LOD of input(Y).
+This operator expands `X` according to specified level lod of `Y`. Current
+implementation constaints that lod level of `X` should be at most 1. Attribute
+`ref_level` is used to specify which level lod of `Y` is referred to expand `X`.
+If set `ref_level` to -1, then last level lod of `Y` would be referred.
+Please note, rank of `X` should be at least 2, when the rank exceeds 2, `X`
+would be viewed as a 2-D tensor.
+
 Following are cases to better explain how this works:
+
 Case 1:
 
-Given a 2-level LoDTensor input(X)
-    X.lod = [[0,       2, 3],
-             [0, 1,    3, 4]]
-    X.data = [a, b, c, d]
+Given a 1-level LoDTensor input(X)
+    X.lod =  [[0,   2,        4]]
+    X.data = [[a], [b], [c], [d]]
     X.dims = [4, 1]
 and input(Y)
     Y.lod = [[0,    2,    4],
              [0, 3, 6, 7, 8]]
-with condition len(Y.lod[-1]) -1 == X.dims[0]
-then we get 2-level LoDTensor
-    Out.lod = [[0,                2,    4],
-               [0,       3,       6, 7, 8]]
-    Out.data = [a, a, a, b, b, b, c, d]
+ref_level: 0
+then we get 1-level LoDTensor
+    Out.lod =  [[0,   2,        4,        6,        8]]
+    Out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
     Out.dims = [8, 1]
 
 Case 2:
 
+Given 1-level LoDTensor input(X)
+    X.lod =  [[0,   1,        4]]
+    X.data = [[a], [b], [c], [d]]
+    X.dims = [4, 1]
+and input(Y)
+    Y.lod = [[0,    2,    4],
+             [0, 3, 6, 6, 8]]
+ref_level: 0
+then we get 1-level LoDTensor
+    Out.lod =  [[0,   1,   2,        5,             8]]
+    Out.data = [[a], [a], [b], [c], [d], [b], [c], [d]]
+    Out.dims = [8, 1]
+
+Case 3:
+
 Given a common Tensor input(X)
-    X.data = [a, b, c]
+    X.data = [[a], [b], [c]]
     X.dims = [3, 1]
 and input(Y)
     Y.lod = [[0, 2, 3, 6]]
-with condition len(Y.lod[-1]) -1 == X.dims[0]
-then we get 1-level LoDTensor
-    Out.lod = [[0,    2, 3,      6]]
-    Out.data = [a, a, b, c, c, c]
+ref_level: -1
+then we get a common Tensor
+    Out.data = [[a], [a], [b], [c], [c], [c]]
     Out.dims = [6, 1]
 
-Case 3:
+Case 4:
 
 Given a common Tensor input(X)
     X.data = [[a, b], [c, d], [e, f]]
     X.dims = [3, 2]
 and input(Y)
     Y.lod = [[0, 2, 3, 6]]
-with condition len(Y.lod[-1]) -1 == X.dims[0]
-then we get 1-level LoDTensor
-    Out.lod = [[0,           2,     3,                     6]]
-    Out.data = [[a,b], [a,b] [c,d], [e, f], [e, f], [e, f]]
+ref_level: 0
+then we get a common LoDTensor
+    Out.data = [[a, b], [a, b] [c, d], [e, f], [e, f], [e, f]]
     Out.dims = [6, 2]
 
-Case 4:
-
-Given 2-level a LoDTensor input(X)
-    X.lod = [[0,       2, 3],
-             [0, 1,    3, 4]]
-    X.data = [a, b, c, d]
-    X.dims = [4, 1]
-and input(Y)
-    Y.lod = [[0,    2,    4],
-             [0, 3, 6, 6, 8]]
-with condition len(Y.lod[-1]) -1 == X.dims[0]
-then we get 2-level LoDTensor
-    Out.lod = [[0,                2,    4],
-               [0,       3,       6, 6, 8]]
-    Out.data = [a, a, a, b, b, b, d, d]
-    Out.dims = [8, 1]
-
-
 )DOC");
   }
 };
@@ -129,12 +181,14 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel {
 
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"));
-    PADDLE_ENFORCE(ctx->HasInput("Out"));
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "The input(Out@GRAD) should not be null");
+                   "Input(Out@GRAD) should not be null.");
+
     auto x_dims = ctx->GetInputDim("X");
     auto x_grad_name = framework::GradVarName("X");
+
     if (ctx->HasOutput(x_grad_name)) {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
@@ -145,11 +199,19 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
-            sequence_expand_grad, ops::SequenceExpandOpGrad);
+REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp,
+                  ops::SequenceExpandOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad);
 REGISTER_OP_CPU_KERNEL(
     sequence_expand,
-    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     sequence_expand_grad,
-    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 26622d23afa1c703e237628bcb11db8f1da73210..550677b22694085059e914678a5361d914b455bc 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -12,13 +12,147 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
+#include <algorithm>
 #include "paddle/fluid/operators/sequence_expand_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+__global__ void sequence_expand_kernel(const T* x_data, const size_t* x_lod,
+                                       const size_t* ref_lod,
+                                       const size_t* offset,
+                                       const size_t lod_size,
+                                       /* default=1,
+                                          the instance length*/
+                                       const int x_item_length, T* out_data) {
+  int bid = blockIdx.x;
+  if (bid >= lod_size - 1) return;
+
+  int x_item_count = x_lod[bid + 1] - x_lod[bid];
+  int repeats = ref_lod[bid + 1] - ref_lod[bid];
+  int out_offset = static_cast<int>(offset[bid]);
+  int x_offset = x_lod[bid];
+  for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
+    for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
+      for (int tid_x = threadIdx.x; tid_x < x_item_length;
+           tid_x += blockDim.x) {
+        out_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length +
+                 tid_x] = x_data[(x_offset + tid_y) * x_item_length + tid_x];
+      }
+    }
+  }
+}
+
+template <typename T>
+__global__ void sequence_expand_grad_kernel(
+    const T* dout_data, const size_t* ref_lod, const size_t* dx_lod,
+    const size_t* offset, const size_t lod_size,
+    /* default=1,
+       the instance length*/
+    const int x_item_length, T* dx_data) {
+  int bid = blockIdx.x;
+  if (bid >= lod_size - 1) return;
+  int x_item_count = dx_lod[bid + 1] - dx_lod[bid];
+  int repeats = ref_lod[bid + 1] - ref_lod[bid];
+  int out_offset = static_cast<int>(offset[bid]);
+  int x_offset = dx_lod[bid];
+
+  for (int tid_z = threadIdx.z; tid_z < repeats; tid_z += blockDim.z) {
+    for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) {
+      for (int tid_x = threadIdx.x; tid_x < x_item_length;
+           tid_x += blockDim.x) {
+        platform::CudaAtomicAdd(
+            &dx_data[(x_offset + tid_y) * x_item_length + tid_x],
+            dout_data[(out_offset + tid_z * x_item_count + tid_y) *
+                          x_item_length +
+                      tid_x]);
+      }
+    }
+  }
+}
+
+void GetOutputOffset(const framework::Vector<size_t>& x_lod,
+                     const framework::Vector<size_t>& ref_lod,
+                     framework::Vector<size_t>* out_offset) {
+  size_t offset = 0;
+  int lod_size = static_cast<int>(x_lod.size());
+  for (int i = 0; i < static_cast<int>(x_lod.size()); ++i) {
+    (*out_offset)[i] = offset;
+    if (i < lod_size - 1) {
+      offset += (ref_lod[i + 1] - ref_lod[i]) * (x_lod[i + 1] - x_lod[i]);
+    }
+  }
+}
+
+template <typename T>
+struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
+  void operator()(
+      const platform::CUDADeviceContext& context, const LoDTensor& x,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* out) {
+    int x_item_length = x.numel() / x.dims()[0];
+    framework::Vector<size_t> out_offset(x_lod.size());
+    GetOutputOffset(x_lod, ref_lod, &out_offset);
+
+    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+    int thread_y = 16;
+    int thread_z = 1024 / thread_x / thread_y;
+    int block_x = static_cast<int>(ref_lod.size());
+    dim3 block_size(thread_x, thread_y, thread_z);
+    dim3 grid_size(block_x, 1);
+
+    sequence_expand_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+        x.data<T>(), x_lod.CUDAData(context.GetPlace()),
+        ref_lod.CUDAData(context.GetPlace()),
+        out_offset.CUDAData(context.GetPlace()), x_lod.size(), x_item_length,
+        out->mutable_data<T>(context.GetPlace()));
+  }
+};
+
+template <typename T>
+struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const LoDTensor& dout,
+                  const framework::Vector<size_t>& x_lod, /*expand source lod*/
+                  const framework::Vector<size_t>& ref_lod, /*expand based lod*/
+                  LoDTensor* dx) {
+    int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
+    framework::Vector<size_t> out_offset(x_lod.size());
+    GetOutputOffset(x_lod, ref_lod, &out_offset);
+
+    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+    int thread_y = 16;
+    int thread_z = 1024 / thread_x / thread_y;
+    int block_x = static_cast<int>(ref_lod.size());
+    dim3 block_size(thread_x, thread_y, thread_z);
+    dim3 grid_size(block_x, 1);
+    sequence_expand_grad_kernel<<<grid_size, block_size, 0, context.stream()>>>(
+        dout.data<T>(), ref_lod.CUDAData(context.GetPlace()),
+        x_lod.CUDAData(context.GetPlace()),
+        out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length,
+        dx->mutable_data<T>(context.GetPlace()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     sequence_expand,
-    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, int64_t>);
 REGISTER_OP_CUDA_KERNEL(
     sequence_expand_grad,
-    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 76dde976db2d19e307ae7406be8280f9b4987187..39301e1ac0971dfe0ca7854257f10ddeb60f1000 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -13,49 +13,122 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <numeric>  // std::iota
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
 
 using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+struct SequenceExpandFunctor {
+  void operator()(
+      const DeviceContext& ctx, const LoDTensor& x,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* out);
+};
+
+template <typename DeviceContext, typename T>
+struct SequenceExpandGradFunctor {
+  void operator()(
+      const DeviceContext& ctx, const LoDTensor& dout,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* dx);
+};
+
+template <typename T>
+struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
+  void operator()(
+      const platform::CPUDeviceContext& context, const LoDTensor& x,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* out) {
+    int out_offset = 0;
+    auto& eigen_place = *context.eigen_device();
+    for (size_t i = 1; i < ref_lod.size(); ++i) {
+      int repeat_num = ref_lod[i] - ref_lod[i - 1];
+      int x_start = x_lod[i - 1];
+      int x_end = x_lod[i];
+      int x_seq_len = x_end - x_start;
+      if (repeat_num > 0) {
+        auto x_sub_tensor = x.Slice(x_start, x_end);
+        x_sub_tensor.Resize({1, x_sub_tensor.numel()});
+        int out_start = out_offset;
+        if (out->lod().size() == 1) {
+          out_start = out->lod()[0][out_offset];
+        }
+        auto out_sub_tensor =
+            out->Slice(out_start, out_start + x_seq_len * repeat_num);
+        out_sub_tensor.Resize({repeat_num, x_sub_tensor.dims()[1]});
+        EigenMatrix<T>::From(out_sub_tensor).device(eigen_place) =
+            EigenMatrix<T>::From(x_sub_tensor)
+                .broadcast(Eigen::array<int, 2>({{repeat_num, 1}}));
+      }
+      out_offset += repeat_num;
+    }
+  }
+};
 
 template <typename DeviceContext, typename T>
 class SequenceExpandKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<LoDTensor>("X");
-    auto* out = context.Output<LoDTensor>("Out");
-    const T* x_data = x->data<T>();
-    auto x_dims = x->dims();
     auto* y = context.Input<LoDTensor>("Y");
-    PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
-    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]),
-                      y->lod().back().size() - 1,
-                      "The size of last lod level in Input(Y)"
-                      "must be equal to dims[0] of Input(X).");
-    out->set_lod(y->lod());
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    size_t element_len = framework::product(x_dims) / x_dims[0];
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    auto out_starts = out->lod().back();
-
-    for (size_t i = 0; i < out_starts.size() - 1; i++) {
-      int scale = out_starts[i + 1] - out_starts[i];
-      Eigen::TensorMap<
-          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          x_t(x_data, 1, element_len);
-      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          out_t(out_data, scale, element_len);
-      Eigen::array<int, 2> cast({{scale, 1}});
-      out_t.device(*place) = x_t.broadcast(cast);
-      x_data += element_len;
-      out_data += element_len * scale;
+    auto* out = context.Output<LoDTensor>("Out");
+
+    int ref_level = context.Attr<int>("ref_level");
+    auto& x_lod = x->lod();
+    auto& y_lod = y->lod();
+
+    if (ref_level == -1) ref_level = y_lod.size() - 1;
+
+    out->mutable_data<T>(context.GetPlace());
+
+    if (y_lod[ref_level].size() <= 1) {
+      framework::TensorCopy(*x, context.GetPlace(), out);
+      return;
+    }
+
+    // x lod level is at most 1.
+    framework::Vector<size_t> out_lod;
+    if (x_lod.size() == 1) {
+      out_lod.push_back(0);
+      int out_offset = 0;
+      for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+        int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+        int x_start = x_lod[0][i - 1];
+        int x_end = x_lod[0][i];
+        int x_seq_len = x_end - x_start;
+        for (int j = 0; j < repeat_num; ++j) {
+          out_lod.push_back(out_lod.back() + x_seq_len);
+          out_offset++;
+        }
+      }
+      // write lod to out if x has lod
+      auto& ref_lod = *out->mutable_lod();
+      ref_lod[0] = out_lod;
+    }
+    framework::Vector<size_t> ref_x_lod;
+    if (x->lod().size() == 1) {
+      ref_x_lod = x->lod()[0];
+    } else {
+      // x_lod doesn't has lod, use fake x lod, level = 0
+      ref_x_lod.resize(x->dims()[0] + 1);
+      std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
     }
+    SequenceExpandFunctor<DeviceContext, T> functor;
+    functor(context.template device_context<DeviceContext>(), *x, ref_x_lod,
+            y_lod[ref_level], out);
   }
 };
 
@@ -71,32 +144,70 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
  *    Grad(X).lod = Input(X).lod
  *
  * */
+template <typename T>
+struct SequenceExpandGradFunctor<platform::CPUDeviceContext, T> {
+  void operator()(
+      const platform::CPUDeviceContext& context, const LoDTensor& dout,
+      const framework::Vector<size_t>& x_lod,   /*expand source lod*/
+      const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
+      LoDTensor* dx) {
+    int dout_offset = 0;
+    for (size_t i = 1; i < ref_lod.size(); ++i) {
+      int repeat_num = ref_lod[i] - ref_lod[i - 1];
+      if (repeat_num > 0) {
+        int x_start = x_lod[i - 1];
+        int x_end = x_lod[i];
+        int x_seq_len = x_end - x_start;
+        auto dx_sub = dx->Slice(x_start, x_end);
+        dx_sub.Resize(flatten_to_1d(dx_sub.dims()));
+        int dout_end = dout_offset + repeat_num * x_seq_len;
+        auto dout_sub = dout.Slice(dout_offset, dout_end);
+        dout_sub.Resize({repeat_num, dx_sub.dims()[0]});
+        math::ColwiseSum<platform::CPUDeviceContext, T> col_sum;
+        col_sum(context, dout_sub, &dx_sub);
+        dout_offset += repeat_num * x_seq_len;
+      }
+    }
+  }
+};
+
 template <typename DeviceContext, typename T>
 class SequenceExpandGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* g_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* x = context.Input<LoDTensor>("X");
-    auto* out = context.Input<LoDTensor>("Out");
-    auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto out_last_level = out->lod().back();
-    d_x->set_lod(x->lod());
-    const T* d_out_data = d_out->data<T>();
-    T* d_x_data = d_x->mutable_data<T>(context.GetPlace());
-    size_t element_len = d_out->numel() / d_out->dims()[0];
-    for (size_t i = 0; i < out_last_level.size() - 1; ++i) {
-      size_t repeat = out_last_level[i + 1] - out_last_level[i];
-      Eigen::TensorMap<
-          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-      d_out_t(d_out_data, static_cast<int>(repeat), element_len);
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>
-      d_x_t(d_x_data, static_cast<int>(element_len));
-      auto place =
-          context.template device_context<DeviceContext>().eigen_device();
-      d_x_t.device(*place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
-      d_out_data += (repeat * element_len);
-      d_x_data += element_len;
+    auto* y = context.Input<LoDTensor>("Y");
+    auto* g_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+    int ref_level = context.Attr<int>("ref_level");
+
+    g_x->mutable_data<T>(context.GetPlace());
+    g_x->set_lod(x->lod());
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> set_zero;
+    set_zero(dev_ctx, g_x, static_cast<T>(0));
+
+    auto& y_lod = y->lod();
+    if (ref_level == -1) ref_level = y_lod.size() - 1;
+    // just copy the gradient
+    if (y_lod[ref_level].size() <= 1) {
+      framework::TensorCopy(*g_out, context.GetPlace(), g_x);
+      return;
+    }
+
+    framework::Vector<size_t> ref_x_lod;
+    framework::Vector<size_t> ref_lod = y_lod[ref_level];
+    if (x->lod().size() == 1) {
+      ref_x_lod = x->lod()[0];
+    } else {
+      // x_lod doesn't has lod, use fake x lod, level = 0
+      ref_x_lod.resize(x->dims()[0] + 1);
+      std::iota(ref_x_lod.begin(), ref_x_lod.end(), 0);
     }
+    SequenceExpandGradFunctor<DeviceContext, T> functor;
+    functor(context.template device_context<DeviceContext>(), *g_out, ref_x_lod,
+            ref_lod, g_x);
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc
index 3d4d54a3a3f292d34b1a7645a0db4bdd3208ba6d..5c6fd13d42e43e3502a1cab85a56e019420c708d 100644
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_pool_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -37,8 +38,7 @@ class SequencePoolOp : public framework::OperatorWithKernel {
 
 class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
     AddOutput("Out",
               "(Tensor) The output of SequencePoolOp does not contain LoD "
diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h
index 8706ff14aa20714e77d5625fc1f6287ee9b4a8a6..2aa20792f24305a106c500a3d7a6e3d363bc31d8 100644
--- a/paddle/fluid/operators/sequence_pool_op.h
+++ b/paddle/fluid/operators/sequence_pool_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -23,12 +24,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename DeviceContext, typename T>
 class SequencePoolKernel : public framework::OpKernel<T> {
@@ -37,11 +32,13 @@ class SequencePoolKernel : public framework::OpKernel<T> {
     auto* in = context.Input<LoDTensor>("X");
     auto* out = context.Output<Tensor>("Out");
     std::string pooltype = context.Attr<std::string>("pooltype");
+    Tensor* index = nullptr;
+    if (pooltype == "MAX") {
+      index = context.Output<Tensor>("MaxIndex");
+    }
 
     auto dims = in->dims();
     auto lod = in->lod();
-    int64_t w = in->numel() / dims[0];
-
     // InferShape by lod
     PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
     PADDLE_ENFORCE_GE(
@@ -50,45 +47,14 @@ class SequencePoolKernel : public framework::OpKernel<T> {
         "The first dimension of Input(X) must be large than batch size.");
     dims[0] = lod[0].size() - 1;
     out->Resize({dims});
-
-    auto lod_level_0 = lod[0];
-
     out->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
     if (pooltype == "MAX") {
-      math::MaxSeqPoolFunctor<DeviceContext, T> max_pool;
-      auto* index = context.Output<Tensor>("MaxIndex");
       index->Resize({dims});
       index->mutable_data<int>(context.GetPlace());
-      max_pool(dev_ctx, *in, out, index);
-      return;
-    }
-
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-      Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
-                              static_cast<int>(lod_level_0[i + 1]));
-      Tensor out_t = out->Slice(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod_level_0[i + 1] - lod_level_0[i]);
-      auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
-      auto out_e = EigenVector<T>::Flatten(out_t);
-
-      if (pooltype == "AVERAGE") {
-        out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
-      } else if (pooltype == "SUM") {
-        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
-      } else if (pooltype == "SQRT") {
-        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
-                              std::sqrt(static_cast<T>(h));
-      } else if (pooltype == "LAST") {
-        out_e.device(place) = in_e.chip(h - 1, 0);
-      } else if (pooltype == "FIRST") {
-        out_e.device(place) = in_e.chip(0, 0);
-      } else {
-        PADDLE_THROW("unsupported pooling pooltype");
-      }
     }
+    math::SequencePoolFunctor<DeviceContext, T> pool;
+    pool(context.template device_context<DeviceContext>(), pooltype, *in, out,
+         index);
   }
 };
 
@@ -96,58 +62,17 @@ template <typename DeviceContext, typename T>
 class SequencePoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* in = context.Input<LoDTensor>("X");
     auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
     auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
     std::string pooltype = context.Attr<std::string>("pooltype");
-
-    auto dims = in->dims();
-    auto lod = in->lod()[0];
-    int64_t w = in->numel() / dims[0];
-
-    in_g->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-
+    const Tensor* index = nullptr;
     if (pooltype == "MAX") {
-      math::MaxSeqPoolGradFunctor<DeviceContext, T> max_pool_grad;
-      auto* index = context.Input<Tensor>("MaxIndex");
-      max_pool_grad(dev_ctx, *out_g, *index, in_g);
-      return;
-    }
-
-    if (pooltype == "LAST" || pooltype == "FIRST") {
-      // set X@Grad be zero at first when pooltype is LAST/FIRST
-      math::SetConstant<DeviceContext, T> functor;
-      functor(dev_ctx, in_g, 0);
-    }
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-
-    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-      auto in_g_t =
-          in_g->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
-      auto out_g_t = out_g->Slice(i, i + 1);
-      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
-      auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
-      auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
-      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
-      Eigen::DSizes<int, 2> bcast(h, 1);
-
-      if (pooltype == "AVERAGE") {
-        in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
-      } else if (pooltype == "SUM") {
-        in_g_e.device(place) = (out_g_e).broadcast(bcast);
-      } else if (pooltype == "SQRT") {
-        in_g_e.device(place) =
-            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
-      } else if (pooltype == "LAST") {
-        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
-      } else if (pooltype == "FIRST") {
-        in_g_e.chip(0, 0).device(place) = out_g_e_v;
-      } else {
-        PADDLE_THROW("unsupported pooling pooltype");
-      }
+      index = context.Input<Tensor>("MaxIndex");
     }
+    in_g->mutable_data<T>(context.GetPlace());
+    math::SequencePoolGradFunctor<DeviceContext, T> pool;
+    pool(context.template device_context<DeviceContext>(), pooltype, *out_g,
+         in_g, index);
   }
 };
 
diff --git a/paddle/fluid/operators/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_reshape_op.cc
index a2999650b8903f9d819a8e8011421349e098b219..ef5e6f3210234d59298fcf04c812390643c693d0 100644
--- a/paddle/fluid/operators/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_reshape_op.cc
@@ -42,8 +42,7 @@ class SequenceReshapeOp : public framework::OperatorWithKernel {
 
 class SequenceReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceReshapeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with shape "
              "being [N, M].");
diff --git a/paddle/fluid/operators/sequence_slice_op.cc b/paddle/fluid/operators/sequence_slice_op.cc
index d09e5bca56b226100d2d0cf3a030c77703bfa76e..df9243dc04c584d70dfa6ca78d5fac8423796466 100644
--- a/paddle/fluid/operators/sequence_slice_op.cc
+++ b/paddle/fluid/operators/sequence_slice_op.cc
@@ -79,8 +79,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
 
 class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor), "
              "the input of SequenceSliceOp.");
@@ -120,8 +119,10 @@ NOTE: The first dimension size of input, the size of offset and Length, should b
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
-            sequence_slice_grad, ops::SequenceSliceGradOp);
+REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp,
+                  ops::SequenceSliceOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_slice,
     ops::SequenceSliceOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sequence_slice_op.cu b/paddle/fluid/operators/sequence_slice_op.cu
old mode 100755
new mode 100644
diff --git a/paddle/fluid/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h
index b9c565cac9581a2e830697c1136919062eef345c..b5ea6ff49bbb29571f9a6ef6358ef881acd9be9e 100644
--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
@@ -66,13 +66,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
 
     if (platform::is_gpu_place(ctx.GetPlace())) {
       offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-      framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(),
-                            &offset_cpu);
+      framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu);
       offset_data = offset_cpu.data<int64_t>();
 
       length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-      framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(),
-                            &length_cpu);
+      framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu);
       length_data = length_cpu.data<int64_t>();
     }
 
@@ -127,13 +125,11 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
 
     if (platform::is_gpu_place(ctx.GetPlace())) {
       offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-      framework::TensorCopy(*offset, platform::CPUPlace(), ctx.device_context(),
-                            &offset_cpu);
+      framework::TensorCopySync(*offset, platform::CPUPlace(), &offset_cpu);
       offset_data = offset_cpu.data<int64_t>();
 
       length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-      framework::TensorCopy(*length, platform::CPUPlace(), ctx.device_context(),
-                            &length_cpu);
+      framework::TensorCopySync(*length, platform::CPUPlace(), &length_cpu);
       length_data = length_cpu.data<int64_t>();
     }
 
diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ddacb57106c090e8f4f9350a65a30ca102f8e0a
--- /dev/null
+++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+
+    auto lod = x->lod();
+    auto dims = x->dims();
+
+    const size_t level = lod.size() - 1;
+    PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
+                      "The first dimension of Input(X) should be equal to the "
+                      "sum of all sequences' lengths.");
+    PADDLE_ENFORCE_EQ(dims[0], x->numel(),
+                      "The width of each timestep in Input(X) of "
+                      "SequenceSoftmaxOp should be 1.");
+
+    out->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor x_i = x->Slice(start_pos, end_pos);
+      Tensor out_i = out->Slice(start_pos, end_pos);
+
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i =
+          // framework::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL});
+          framework::make_ddim({1UL, end_pos - start_pos});
+      x_i.Resize(dims_i);
+      out_i.Resize(dims_i);
+      math::SoftmaxCUDNNFunctor<T>()(
+          ctx.template device_context<platform::CUDADeviceContext>(), &x_i,
+          &out_i);
+    }
+  }
+};
+
+template <typename T>
+class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Input<LoDTensor>("Out");
+    auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+
+    auto lod = x->lod();
+    const size_t level = lod.size() - 1;
+
+    x_grad->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+
+      Tensor out_i = out->Slice(start_pos, end_pos);
+      Tensor out_grad_i = out_grad->Slice(start_pos, end_pos);
+      Tensor x_grad_i = x_grad->Slice(start_pos, end_pos);
+
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      out_i.Resize(dims_i);
+      out_grad_i.Resize(dims_i);
+      x_grad_i.Resize(dims_i);
+      math::SoftmaxGradCUDNNFunctor<T>()(
+          ctx.template device_context<platform::CUDADeviceContext>(), &out_i,
+          &out_grad_i, &x_grad_i);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(sequence_softmax, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::SequenceSoftmaxCUDNNKernel<float>,
+                   ops::SequenceSoftmaxCUDNNKernel<double>);
+REGISTER_OP_KERNEL(sequence_softmax_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::SequenceSoftmaxGradCUDNNKernel<float>,
+                   ops::SequenceSoftmaxGradCUDNNKernel<double>);
diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc
index 7e685eb3dc7b12ef38f06b37d99a1212cfbc992c..c44f8206eb5079fef969e3e527552512eebd0f1a 100644
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sequence_softmax_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
@@ -29,18 +30,51 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // choose cudnn kernel if the runtime supported.
+    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+    bool runtime_cudnn_support = false;
+#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      auto& dev_ctx =
+          ctx.template device_context<platform::CUDADeviceContext>();
+      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
+    }
+#endif
+    framework::LibraryType library_ = framework::LibraryType::kPlain;
+    if (use_cudnn && runtime_cudnn_support) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        framework::StringToDataLayout(data_format), library_);
+  }
 };
 
 class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
              "of length 1.");
     AddOutput("Out",
               "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
               "of length 1.");
+    AddAttr<bool>(
+        "use_cudnn",
+        "(bool, default false) Only used in cudnn kernel, need install cudnn")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
     AddComment(R"DOC(
 Sequence Softmax Operator.
 
@@ -91,18 +125,44 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
 
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // choose cudnn kernel if the runtime supported.
+    bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+    bool runtime_cudnn_support = false;
+#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      auto& dev_ctx =
+          ctx.template device_context<platform::CUDADeviceContext>();
+      runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false;
+    }
+#endif
+    framework::LibraryType library_ = framework::LibraryType::kPlain;
+    if (use_cudnn && runtime_cudnn_support) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+        framework::StringToDataLayout(data_format), library_);
+  }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
-            ops::SequenceSoftmaxOpMaker, sequence_softmax_grad,
-            ops::SequenceSoftmaxGradOp);
+REGISTER_OPERATOR(sequence_softmax, ops::SequenceSoftmaxOp,
+                  ops::SequenceSoftmaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sequence_softmax_grad, ops::SequenceSoftmaxGradOp);
 REGISTER_OP_CPU_KERNEL(
     sequence_softmax,
-    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     sequence_softmax_grad,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/sequence_softmax_op.cu.cc b/paddle/fluid/operators/sequence_softmax_op.cu.cc
index 295c68c5b936d6522666a4cc4e621db6f5f5f3ed..397df75415691e4f53bc399cd1868c3e37bc9110 100644
--- a/paddle/fluid/operators/sequence_softmax_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cu.cc
@@ -17,7 +17,10 @@ limitations under the License. */
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     sequence_softmax,
-    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>)
+    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
     sequence_softmax_grad,
-    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext,
+                                   double>);
diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
index d0aa2f9cbadaadf4e7e625628d9db5677d50d277..fef230e42d07a5ed73b7a7a6ab682694675bb9d2 100644
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -35,28 +35,47 @@ class SGDOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                       "Learning rate should have 1 element");
     auto param_dim = ctx->GetInputDim("Param");
-    // TODO(qijun): check dimensions of Param and Grad at complie
-    // and run time.
+    // TODO(qijun): check dimensions of Param and Grad at compile
+    // and runtime.
     ctx->SetOutputDim("ParamOut", param_dim);
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("Param")->type()),
-        ctx.GetPlace());
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+class SGDOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto input_var = op_desc.Input("Param")[0];
+    for (auto& out_var : op_desc.Output("ParamOut")) {
+      if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
+          framework::proto::VarType::SELECTED_ROWS) {
+        block->FindRecursiveOrCreateVar(out_var).SetType(
+            framework::proto::VarType::SELECTED_ROWS);
+      } else {
+        block->FindRecursiveOrCreateVar(out_var).SetType(
+            framework::proto::VarType::LOD_TENSOR);
+      }
+    }
   }
 };
 
 class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Param", "(Tensor) Input parameter");
+  void Make() override {
+    AddInput("Param", "(Tensor or SelectedRows) Input parameter");
     AddInput("LearningRate", "(Tensor) Learning rate of SGD");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddInput("Grad", "(Tensor or SelectedRows) Input gradient");
+    AddOutput("ParamOut",
+              "(Tensor or SelectedRows, same with Param) "
+              "Output parameter, should share the same memory with Param")
+        .Reuse("Param");
     AddComment(R"DOC(
 
 SGD operator
@@ -73,5 +92,6 @@ $$param\_out = param - learning\_rate * grad$$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker);
+REGISTER_OPERATOR(sgd, ops::SGDOp, ops::SGDOpMaker,
+                  paddle::framework::EmptyGradOpMaker, ops::SGDOpInferVarType);
 REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<float>, ops::SGDOpKernel<double>);
diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu
index 9d211541c0bf729393b8190edb18e101d5e07d1a..4722be7a666d3e8f3c25c9499f88ddda835f60e3 100644
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sgd_op.h"
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
index 0ad801079400f1830d85a945e57a434a86adeb00..2685ce217ee0f0d3e89f3751e96218dcd19bead4 100644
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -23,60 +23,104 @@ namespace operators {
 template <typename T>
 class SGDOpKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* param = ctx.Input<framework::Tensor>("Param");
-    auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
-    auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-
-    auto* grad_var = ctx.InputVar("Grad");
-    // Actually, all tensors are LoDTensor except SelectedRows.
-    if (grad_var->IsType<framework::LoDTensor>()) {
-      param_out->mutable_data<T>(ctx.GetPlace());
-      auto* grad = ctx.Input<framework::Tensor>("Grad");
-
-      auto p = framework::EigenVector<T>::Flatten(*param);
-      auto g = framework::EigenVector<T>::Flatten(*grad);
-      auto o = framework::EigenVector<T>::Flatten(*param_out);
-      auto* lr = learning_rate->data<T>();
-
-      o = p - lr[0] * g;
-    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
-      // This manual optimization brings difficulty to track data dependency.
-      // It's better to find a more elegant solution.
-      PADDLE_ENFORCE_EQ(param, param_out);
-      auto* grad = ctx.Input<framework::SelectedRows>("Grad");
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    const auto *param_var = ctx.InputVar("Param");
+    const auto *grad_var = ctx.InputVar("Grad");
+
+    if (param_var->IsType<framework::LoDTensor>()) {
+      const auto *param = ctx.Input<framework::Tensor>("Param");
+      auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
+
+      // Actually, all tensors are LoDTensor except SelectedRows.
+      if (grad_var->IsType<framework::LoDTensor>()) {
+        param_out->mutable_data<T>(ctx.GetPlace());
+        const auto *grad = ctx.Input<framework::Tensor>("Grad");
+
+        auto p = framework::EigenVector<T>::Flatten(*param);
+        auto g = framework::EigenVector<T>::Flatten(*grad);
+        auto o = framework::EigenVector<T>::Flatten(*param_out);
+        auto *lr = learning_rate->data<T>();
+
+        o = p - lr[0] * g;
+      } else if (grad_var->IsType<framework::SelectedRows>()) {
+        // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
+        // This manual optimization brings difficulty to track data dependency.
+        // It's better to find a more elegant solution.
+        PADDLE_ENFORCE_EQ(param, param_out);
+        const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
+
+        // for distributed training, a sparse var may be empty,
+        // just skip updating.
+        if (grad->rows().size() == 0) {
+          return;
+        }
+
+        auto grad_height = grad->height();
+        auto out_dims = param_out->dims();
+        PADDLE_ENFORCE_EQ(grad_height, out_dims[0]);
+
+        auto &grad_value = grad->value();
+        auto &grad_rows = grad->rows();
+
+        size_t grad_row_numel = grad_value.numel() / grad_rows.size();
+        PADDLE_ENFORCE_EQ(static_cast<int64_t>(grad_row_numel),
+                          param_out->numel() / grad_height);
+
+        auto *grad_data = grad_value.data<T>();
+        auto *out_data = param_out->data<T>();
+        auto *lr = learning_rate->data<T>();
+        for (size_t i = 0; i < grad_rows.size(); i++) {
+          PADDLE_ENFORCE(grad_rows[i] < grad_height,
+                         "Input rows index should less than height");
+          for (size_t j = 0; j < grad_row_numel; j++) {
+            out_data[grad_rows[i] * grad_row_numel + j] -=
+                lr[0] * grad_data[i * grad_row_numel + j];
+          }
+        }
+      } else {
+        PADDLE_THROW("Unsupported Variable Type of Grad");
+      }
+    } else if (param_var->IsType<framework::SelectedRows>()) {
+      PADDLE_ENFORCE(grad_var->IsType<framework::SelectedRows>(),
+                     "when param "
+                     "is SelectedRows, gradient should also be SelectedRows");
+      const auto &param = param_var->Get<framework::SelectedRows>();
+      auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
+      const auto &grad = grad_var->Get<framework::SelectedRows>();
 
       // for distributed training, a sparse var may be empty,
       // just skip updating.
-      if (grad->rows().size() == 0) {
+      if (grad.rows().size() == 0) {
         return;
       }
 
-      auto in_height = grad->height();
-      auto out_dims = param_out->dims();
-      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
-
-      auto& in_value = grad->value();
-      auto& in_rows = grad->rows();
+      auto param_row_width = param.value().dims()[1];
+      auto grad_row_width = grad.value().dims()[1];
+      VLOG(4) << " param rows: " << param.rows().size()
+              << " param memory rows: " << param.value().dims()[0]
+              << " grad rows: " << grad.rows().size()
+              << " grad memory rows: " << grad.value().dims()[0];
+      PADDLE_ENFORCE_EQ(param_row_width, grad_row_width,
+                        "param_row should have the same size with grad_row");
 
-      int64_t in_row_numel = in_value.numel() / in_rows.size();
-      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
-
-      auto* in_data = in_value.data<T>();
-      auto* out_data = param_out->data<T>();
-      auto* lr = learning_rate->data<T>();
-      for (size_t i = 0; i < in_rows.size(); i++) {
-        PADDLE_ENFORCE(in_rows[i] < in_height,
+      const auto *lr = learning_rate->data<T>();
+      const auto *grad_data = grad.value().data<T>();
+      auto *out_data = param_out->mutable_value()->data<T>();
+      for (size_t i = 0; i < grad.rows().size(); i++) {
+        PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
                        "Input rows index should less than height");
-        for (int64_t j = 0; j < in_row_numel; j++) {
-          out_data[in_rows[i] * in_row_numel + j] -=
-              lr[0] * in_data[i * in_row_numel + j];
+        int64_t id_index = param.Index(grad.rows()[i]);
+        PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
+                          "id should be in the table");
+        for (int64_t j = 0; j < grad_row_width; j++) {
+          out_data[id_index * grad_row_width + j] -=
+              lr[0] * grad_data[i * grad_row_width + j];
         }
       }
-
     } else {
-      PADDLE_THROW("Unsupported Variable Type of Grad");
+      PADDLE_THROW("Unsupported Variable Type of Parameter");
     }
   }
 };
diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b44d5f898013a5d27467bd80118c29a886d5e8b3
--- /dev/null
+++ b/paddle/fluid/operators/shape_op.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/shape_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class ShapeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input (Input) of get_shape op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Out) of get_shape op should not be null.");
+    auto in_dim = ctx->GetInputDim("Input");
+    ctx->SetOutputDim("Out", {in_dim.size()});
+  }
+};
+
+class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input", "(Tensor), The input tensor.");
+    AddOutput("Out",
+              "(Tensor), The shape of input tensor, the data type of the shape"
+              " is int64_t, will be on the same device with the input Tensor.");
+    AddComment(R"DOC(
+Shape Operator
+
+Get the shape of input tensor. Only support CPU input Tensor now.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int64_t>,
+                       ops::ShapeKernel<float>, ops::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7736a2a1e13cfa5d445411b3efac7669a7bf23a2
--- /dev/null
+++ b/paddle/fluid/operators/shape_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/shape_op.h"
+
+REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel<int>,
+                        paddle::operators::ShapeKernel<int64_t>,
+                        paddle::operators::ShapeKernel<float>,
+                        paddle::operators::ShapeKernel<double>);
diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3be86b66a538e7b38a5d59095fee7e7636364bce
--- /dev/null
+++ b/paddle/fluid/operators/shape_op.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class ShapeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_t = ctx.Input<Tensor>("Input");
+    auto* out_t = ctx.Output<Tensor>("Out");
+    auto out_data = out_t->mutable_data<int64_t>(platform::CPUPlace());
+    auto in_dims = in_t->dims();
+    for (int i = 0; i < in_dims.size(); ++i) {
+      out_data[i] = in_dims[i];
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
index a1871a8e7fb27d351f9d333966baa63c6f32ae01..8146c5f56104b7dec86b1c4491ed10fc2e94b58b 100644
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
@@ -69,8 +69,7 @@ class ShrinkRNNMemoryOp : public ArrayOp {
 
 class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
     AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
     AddInput("I",
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 7b93f19bb2f7102824852aa181e3728f79025121..c3b0fe32098cb4b41ccc155db58809ef9f1bf46b 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -86,9 +86,7 @@ class SigmoidCrossEntropyWithLogitsGradOp
 class SigmoidCrossEntropyWithLogitsOpMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto,
-                                       OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
              "where N is the batch size and D is the number of classes. "
@@ -115,14 +113,14 @@ The logistic loss is given as follows:
 
        $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$
 
-We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get:
+We know that $$\sigma(X) = \\frac{1}{1 + \exp(-X)}$$. By substituting this we get:
 
        $$loss = X - X * Labels + \log(1 + \exp(-X))$$
 
 For stability and to prevent overflow of $$\exp(-X)$$ when X < 0,
 we reformulate the loss as follows:
 
-       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$
+       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-\|X\|))$$
 
 Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
 However the output only shares the LoD with input `X`.
@@ -135,11 +133,12 @@ However the output only shares the LoD with input `X`.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sigmoid_cross_entropy_with_logits,
-            ops::SigmoidCrossEntropyWithLogitsOp,
-            ops::SigmoidCrossEntropyWithLogitsOpMaker,
-            sigmoid_cross_entropy_with_logits_grad,
-            ops::SigmoidCrossEntropyWithLogitsGradOp);
+REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits,
+                  ops::SigmoidCrossEntropyWithLogitsOp,
+                  ops::SigmoidCrossEntropyWithLogitsOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad,
+                  ops::SigmoidCrossEntropyWithLogitsGradOp);
 REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
                        ops::SigmoidCrossEntropyWithLogitsKernel<
                            paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index 8f8b7abd03212c12ca351e551621e63b4c7148c2..f3985dcc027f974e0213a73ea9a21e268d77615f 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -34,8 +34,7 @@ class SignOp : public framework::OperatorWithKernel {
 template <typename AttrType>
 class SignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) Input tensor of sign operator.");
     AddOutput("Out", "(Tensor) Output tensor of sign operator.");
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4bd23d594134f227e86b01fd75b7e202dd76c11b
--- /dev/null
+++ b/paddle/fluid/operators/slice_op.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/slice_op.h"
+#include <algorithm>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class SliceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input (Input) of slice op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Out) of slice op should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE(in_dims.size() < 7,
+                   "The rank of input should be less than 7.");
+    framework::DDim out_dims(in_dims);
+    auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    auto starts = ctx->Attrs().Get<std::vector<int>>("starts");
+    auto ends = ctx->Attrs().Get<std::vector<int>>("ends");
+
+    PADDLE_ENFORCE_EQ(starts.size(), ends.size());
+    PADDLE_ENFORCE_EQ(starts.size(), axes.size());
+    int dim_value, start, end;
+    for (size_t i = 0; i < axes.size(); ++i) {
+      dim_value = out_dims[axes[i]];
+      start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
+      end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
+      start = std::max(start, 0);
+      end = std::max(end, 0);
+      start = std::min(start, dim_value);
+      end = std::min(end, dim_value);
+      start = std::min(start, end);
+      out_dims[axes[i]] = end - start;
+    }
+    ctx->SetOutputDim("Out", out_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.GetPlace());
+  }
+};
+
+class SliceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input", "Tensor of data to extract slices from.");
+    AddOutput("Out", "Sliced data tensor.");
+
+    AddAttr<std::vector<int>>(
+        "axes",
+        "(list<int>) Axes that `starts` and `ends` apply to. It's optional."
+        "If not present, will be treated as [0, 1, ..., len(`starts`) - 1].");
+    AddAttr<std::vector<int>>(
+        "starts",
+        "(list<int>) Starting indices of corresponding axis in `axes`");
+    AddAttr<std::vector<int>>(
+        "ends",
+        "(list<int>) Starting indices of corresponding axis in `axes`.");
+
+    AddComment(R"DOC(
+Slice Operator.
+
+Produces a slice of the input tensor along multiple axes. Similar to numpy:
+https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
+Slice uses `axes`, `starts` and `ends` attributes to specify the start and 
+end dimension for each axis in the list of axes, it uses this information
+to slice the input data tensor. If a negative value is passed for any of 
+the start or end indices, it represents number of elements before the end 
+of that dimension. If the value passed to start or end is larger than
+the n (the number of elements in this dimension), it represents n. 
+For slicing to the end of a dimension with unknown size, it is recommended 
+to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1].
+Following examples will explain how slice works:
+
+    .. code-block:: text
+
+        Cast1:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [1, 0]
+                ends = [2, 3]
+            Then:
+                result = [ [5, 6, 7], ]
+
+        Cast2:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                starts = [0, 1]
+                ends = [-1, 1000]
+            Then:
+                result = [ [2, 3, 4], ]
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    slice, ops::SliceKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SliceKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SliceKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SliceKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8c1767c70b19d1386af9610ef3405eb487a39878
--- /dev/null
+++ b/paddle/fluid/operators/slice_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/slice_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba231aee176564b91a642912ce0b32bcdef8cfc1
--- /dev/null
+++ b/paddle/fluid/operators/slice_op.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SliceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    int rank = ctx.Input<framework::Tensor>("Input")->dims().size();
+    switch (rank) {
+      case 1:
+        SliceCompute<1>(ctx);
+        break;
+      case 2:
+        SliceCompute<2>(ctx);
+        break;
+      case 3:
+        SliceCompute<3>(ctx);
+        break;
+      case 4:
+        SliceCompute<4>(ctx);
+        break;
+      case 5:
+        SliceCompute<5>(ctx);
+        break;
+      case 6:
+        SliceCompute<6>(ctx);
+        break;
+    }
+  }
+
+ private:
+  template <size_t D>
+  void SliceCompute(const framework::ExecutionContext& context) const {
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    auto in = context.Input<framework::Tensor>("Input");
+    auto out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+    auto out_dims = out->dims();
+    auto in_dims = in->dims();
+    auto axes = context.Attr<std::vector<int>>("axes");
+    auto starts = context.Attr<std::vector<int>>("starts");
+
+    auto offsets = Eigen::array<int, D>();
+    auto extents = Eigen::array<int, D>();
+    for (size_t i = 0; i < D; ++i) {
+      offsets[i] = 0;
+      extents[i] = out_dims[i];
+    }
+    int start;
+    for (size_t i = 0; i < axes.size(); ++i) {
+      start = starts[i];
+      if (start < 0) {
+        start = (start + in_dims[axes[i]]);
+      }
+      start = std::max(start, 0);
+      offsets[axes[i]] = start;
+    }
+    auto in_t =
+        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in);
+    auto out_t =
+        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *out);
+    out_t.device(place) = in_t.slice(offsets, extents);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
index 658eb0195212cc3038fce6aab0ec3804efc59edf..622420c1c33a62994c81ad9534c4fa37a4a1fa1a 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -46,8 +46,7 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
 
 class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
              "The input value of smooth l1 loss op with shape "
@@ -106,7 +105,7 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim("X");
+    auto in_dims = ctx->GetInputDim("Diff");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
     PADDLE_ENFORCE_GE(out_dims.size(), 2,
@@ -128,12 +127,34 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
   }
 };
 
+class SmoothL1LossGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("smooth_l1_loss_grad");
+    op->SetInput("InsideWeight", Input("InsideWeight"));
+    op->SetInput("OutsideWeight", Input("OutsideWeight"));
+    op->SetInput("Diff", Output("Diff"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
-            smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
+REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker,
+                  ops::SmoothL1LossGradMaker);
+REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp);
 REGISTER_OP_CPU_KERNEL(
     smooth_l1_loss,
     ops::SmoothL1LossKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5596fa0648ccc151bc0d11de9c556599428a8d71
--- /dev/null
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class SoftmaxCUDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Out = context.Output<Tensor>("Out");
+
+    // allocate memory on device.
+    Out->mutable_data<T>(context.GetPlace());
+
+    math::SoftmaxCUDNNFunctor<T>()(
+        context.template device_context<platform::CUDADeviceContext>(), X, Out);
+  }
+};
+
+template <typename T>
+class SoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* Out = context.Input<Tensor>("Out");
+    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
+
+    // allocate memory on device.
+    dX->mutable_data<T>(context.GetPlace());
+
+    math::SoftmaxGradCUDNNFunctor<T>()(
+        context.template device_context<platform::CUDADeviceContext>(), Out,
+        dOut, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
+                   ops::SoftmaxCUDNNKernel<float>,
+                   ops::SoftmaxCUDNNKernel<plat::float16>);
+REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
+                   ops::SoftmaxGradCUDNNKernel<float>);
diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6668e6b9e917eea7ba4a80ac78917b73eb827208
--- /dev/null
+++ b/paddle/fluid/operators/softmax_mkldnn_op.cc
@@ -0,0 +1,246 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/softmax_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::MKLDNNMemDesc;
+
+using mkldnn::memory;  // Note: paddle has also "memory" namespace
+using mkldnn::primitive;
+using mkldnn::softmax_forward;
+using mkldnn::softmax_backward;
+using mkldnn::prop_kind;
+using mkldnn::stream;
+using platform::to_void_cast;
+
+class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
+ public:
+  SoftmaxMKLDNNHandler(
+      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        softmax_pd_(softmax_pd) {}
+
+  SoftmaxMKLDNNHandler(
+      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
+      std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd,
+      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+      const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
+        softmax_pd_(softmax_pd),
+        softmax_bwd_pd_(softmax_bwd_pd) {
+    // If we are in Grad operatgor then update a key with BWD suffix to
+    // distinguish from FWD memory primitives
+    key_ += "-BWD";
+  }
+
+  std::shared_ptr<mkldnn::softmax_forward> AcquireSoftmax(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p) {
+    /*Generate key*/
+    auto prim_key = key_ + "@softmax_p";
+
+    auto softmax_p = std::static_pointer_cast<mkldnn::softmax_forward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((softmax_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find softmax primitive in device context");
+    if (softmax_p == nullptr) {
+      softmax_p = std::make_shared<mkldnn::softmax_forward>(
+          *(softmax_pd_.get()),
+          *(static_cast<mkldnn::memory*>(src_memory_p.get())),
+          *(static_cast<mkldnn::memory*>(dst_memory_p.get())));
+      dev_ctx_.SetBlob(prim_key, softmax_p);
+    } else {
+      is_reusing_ = true;
+    }
+
+    return softmax_p;
+  }
+
+  std::shared_ptr<mkldnn::softmax_backward> AcquireSoftmaxBackward(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
+      std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
+    auto prim_key = key_ + "@softmax_bwd_p";
+    auto softmax_bwd_p = std::static_pointer_cast<mkldnn::softmax_backward>(
+        dev_ctx_.GetBlob(prim_key));
+    PADDLE_ENFORCE((softmax_bwd_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find softmax backward primitive in device context");
+    if (softmax_bwd_p == nullptr) {
+      softmax_bwd_p = std::make_shared<mkldnn::softmax_backward>(
+          *softmax_bwd_pd_, *(dst_memory_p.get()), *(diff_dst_memory_p.get()),
+          *(diff_src_memory_p.get()));
+      dev_ctx_.SetBlob(prim_key, softmax_bwd_p);
+    } else {
+      is_reusing_ = true;
+    }
+
+    return softmax_bwd_p;
+  }
+
+ private:
+  std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd_;
+  std::shared_ptr<mkldnn::softmax_backward::primitive_desc> softmax_bwd_pd_;
+};
+
+template <typename T>
+class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+    const Tensor* input = ctx.Input<Tensor>("X");
+    Tensor* output = ctx.Output<Tensor>("Out");
+    PADDLE_ENFORCE(input->dims().size() == 2UL,
+                   "The input of softmax op must be a 2D matrix.");
+    const T* input_data = input->data<T>();
+    // allocate memory for output
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    // MKL-DNN does support softmax over selected axis. Having 2D Tensor,
+    // we will make normalization after final eg. axis: 1
+    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
+                   "Softmax input and output dimensions should match");
+    // Same memory descriptor to be used for input and output
+    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
+    // Generate keys for storing/retriving primitives for this operator
+    const std::string key =
+        platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out"));
+    const std::string key_softmax_pd = key + "@softmax_pd";
+
+    // Currently only NC data format is supported
+    auto softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    // Normalization is made after innermost dimension eg. C out of NC
+    auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
+                                              softmax_md, 1 /*dim: C*/);
+    auto softmax_pd = std::make_shared<mkldnn::softmax_forward::primitive_desc>(
+        softmax_desc, mkldnn_engine);
+    dev_ctx.SetBlob(key_softmax_pd, softmax_pd);
+
+    SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key);
+    auto softmax_src_memory_p =
+        handler.AcquireSrcMemory(softmax_md, to_void_cast<T>(input_data));
+    auto softmax_dst_memory_p =
+        handler.AcquireDstMemory(softmax_md, to_void_cast<T>(output_data));
+    auto softmax_p =
+        handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p);
+
+    std::vector<primitive> pipeline{
+        *(static_cast<softmax_forward::primitive*>(softmax_p.get()))};
+    stream(stream::kind::eager).submit(pipeline).wait();
+
+    const bool is_test = ctx.Attr<bool>("is_test");
+    if (!is_test) {
+      T threshold = exp(-64);
+      for (int i = 0; i < dst_tz[0] * dst_tz[1]; ++i) {
+        output_data[i] =
+            output_data[i] < threshold ? threshold : output_data[i];
+      }
+    }
+  }
+};
+
+template <typename T>
+class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    auto mkldnn_engine = dev_ctx.GetEngine();
+    const Tensor* output = ctx.Input<Tensor>("Out");
+    const T* dst_data = output->data<T>();
+
+    auto* dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
+    const auto* diff_dst_ptr = dout->template data<T>();
+
+    auto* dx =
+        ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
+    T* diff_src_ptr = dx->template mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    std::vector<int> src_tz(dst_tz);
+    PADDLE_ENFORCE(output->dims().size() == 2UL,
+                   "The input of softmax op must be a 2D matrix.");
+    // MKL-DNN does support softmax over selected axis. Having 2D Tensor,
+    // we will make normalization after final eg. axis: 1
+    PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])),
+                   "Softmax input and output dimensions should match");
+    // Same memory descriptor to be used for input and output
+    memory::dims softmax_tz = {src_tz[0], src_tz[1]};
+    // Currently only supports NC data format
+    // retrieve eltwise primitive desc from device context
+    const std::string key =
+        platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Input("Out"));
+    const std::string key_softmax_pd = key + "@softmax_pd";
+
+    auto softmax_pd =
+        std::static_pointer_cast<mkldnn::softmax_forward::primitive_desc>(
+            dev_ctx.GetBlob(key_softmax_pd));
+    PADDLE_ENFORCE(softmax_pd != nullptr,
+                   "Fail to find softmax_pd in device context");
+
+    // TODO(jczaja): Add layouts support when there is a need to do so
+    // Two dimensional softmax does support NC format
+    auto data_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    auto diff_softmax_md = MKLDNNMemDesc(
+        {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
+    // Normalization is made after innermost dimension eg. C out of NC
+    auto softmax_bwd_desc =
+        softmax_backward::desc(diff_softmax_md, data_softmax_md, 1 /* dim: C*/);
+    auto softmax_bwd_pd =
+        std::make_shared<mkldnn::softmax_backward::primitive_desc>(
+            softmax_bwd_desc, mkldnn_engine, *softmax_pd);
+
+    SoftmaxMKLDNNHandler handler(softmax_pd, softmax_bwd_pd, dev_ctx,
+                                 mkldnn_engine, key);
+    auto dst_memory_p =
+        handler.AcquireDstMemory(data_softmax_md, to_void_cast<T>(dst_data));
+    auto diff_dst_memory_p = handler.AcquireDiffDstMemory(
+        diff_softmax_md, to_void_cast<T>(diff_dst_ptr));
+    auto diff_src_memory_p = handler.AcquireDiffSrcMemory(
+        diff_softmax_md, to_void_cast<T>(diff_src_ptr));
+
+    // Get primitve from device context
+    auto softmax_bwd_p = handler.AcquireSoftmaxBackward(
+        dst_memory_p, diff_dst_memory_p, diff_src_memory_p);
+
+    std::vector<primitive> pipeline{*softmax_bwd_p};
+    stream(stream::kind::eager).submit(pipeline).wait();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::SoftmaxMKLDNNKernel<float>);
+REGISTER_OP_KERNEL(softmax_grad, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::SoftmaxMKLDNNGradKernel<float>);
diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc
index 09275ef290e8c78dc0902033e904cc4e7ccd7adb..31a7458f637921c290fc71ac748143867b4aae19 100644
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
@@ -14,6 +14,16 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/softmax_op.h"
 
+#include <string>
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -33,16 +43,65 @@ class SoftmaxOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // choose cudnn kernel if the runtime supported.
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
+#endif
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                     "float16 can only be used on GPU place");
+    }
+
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
+                                   library_);
+  }
 };
 
 class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X",
              "The input tensor of softmax. "
              "2-D with shape [batch_size, input_feature_dimensions].");
-    AddOutput("Out", "The normalized values with the same shape as X.");
+    AddOutput("Out", "The normalized values with the same shape as X.")
+        .Reuse("X");
+    AddAttr<bool>(
+        "use_cudnn",
+        "(bool, default false) Only used in cudnn kernel, need install cudnn")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "Disable epsilon adding to softmax results. Used by MKLDNN.")
+        .SetDefault(false);
     AddComment(R"DOC(
 Softmax Operator.
 
@@ -80,6 +139,37 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
 
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // choose cudnn kernel if the runtime supported.
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+
+#ifdef PADDLE_WITH_CUDA
+    if (platform::CanCUDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kCUDNN;
+    }
+#endif
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    auto input_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    if (input_data_type == framework::proto::VarType::FP16) {
+      PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                     "float16 can only be used on GPU place");
+    }
+
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
+                                   library_);
+  }
 };
 
 }  // namespace operators
@@ -87,10 +177,13 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad,
-            ops::SoftmaxOpGrad);
+REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad);
 REGISTER_OP_CPU_KERNEL(
-    softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
+    softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     softmax_grad,
-    ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc
index dbd13fd38a33d4068a5b5d47cd92a81293f6e748..5fb4f011d9b47cebc4a23bcce47eada825263343 100644
--- a/paddle/fluid/operators/softmax_op.cu.cc
+++ b/paddle/fluid/operators/softmax_op.cu.cc
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/softmax_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace ops = paddle::operators;
-
+namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    softmax, ops::SoftmaxKernel<paddle::platform::CUDADeviceContext, float>);
+    softmax, ops::SoftmaxKernel<plat::CUDADeviceContext, float>,
+    ops::SoftmaxKernel<plat::CUDADeviceContext, double>,
+    ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
-    softmax_grad,
-    ops::SoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>);
+    softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
+    ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 857e5733573497b56520daa7860f4feb4e01cda7..53cb716a979229c99fcbdc12f1aeab4e21b320f3 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -20,8 +20,7 @@ namespace operators {
 class SoftmaxWithCrossEntropyOpMaker
     : public framework::OpProtoAndCheckerMaker {
  public:
-  SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Logits",
              "(Tensor, default: Tensor<float>), The unscaled log probabilities "
              "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 39b246a5bedb2819fc9b7fc407cfe03e59af0b68..8f7840cee1dd95a828fd4ac8815e335a5db47e3d 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -23,21 +23,21 @@ using Tensor = framework::Tensor;
 
 namespace {
 template <typename T>
-__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
-                                 const int64_t* labels, const int batch_size,
-                                 const int class_num) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  int sample_idx = tid / class_num;
-
-  if (tid < batch_size) {
-    PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num);
-    logit_grad[tid * class_num + labels[tid]] -= static_cast<T>(1.);
+__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
+                                 const int batch_size, const int class_num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size;
+       i += blockDim.x * gridDim.x) {
+    int idx = i * class_num + labels[i];
+    logit_grad[idx] -= static_cast<T>(1.);
   }
+}
 
-  __syncthreads();
-
-  if (tid < batch_size * class_num) {
-    logit_grad[tid] *= loss_grad[sample_idx];
+template <typename T>
+__global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
+                      const int class_num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    logit_grad[i] *= loss_grad[i / class_num];
   }
 }
 
@@ -94,22 +94,22 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
     const int batch_size = logit_grad->dims()[0];
     const int class_num = logit_grad->dims()[1];
     int block = 512;
-    int grid = (batch_size * class_num + block - 1) / block;
+    auto stream = context.cuda_device_context().stream();
 
     if (context.Attr<bool>("soft_label")) {
+      int grid = (batch_size * class_num + block - 1) / block;
       const T* label_data = labels->data<T>();
-      SoftCrossEntropyGradientKernel<
-          T><<<grid, block, 0,
-               context.template device_context<platform::CUDADeviceContext>()
-                   .stream()>>>(logit_grad_data, loss_grad_data, label_data,
-                                batch_size, class_num);
+      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          logit_grad_data, loss_grad_data, label_data, batch_size, class_num);
     } else {
+      int grid = (batch_size + block - 1) / block;
       const int64_t* label_data = labels->data<int64_t>();
-      CrossEntropyGrad<
-          T><<<grid, block, 0,
-               context.template device_context<platform::CUDADeviceContext>()
-                   .stream()>>>(logit_grad_data, loss_grad_data, label_data,
-                                batch_size, class_num);
+      CrossEntropyGrad<T><<<grid, block, 0, stream>>>(
+          logit_grad_data, label_data, batch_size, class_num);
+      int num = batch_size * class_num;
+      grid = (num + block - 1) / block;
+      Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
+                                           class_num);
     }
   }
 };
diff --git a/paddle/fluid/operators/split_byref_op.cc b/paddle/fluid/operators/split_byref_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc998e1abbd7131a7497288cc9d66315a6fedc85
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.cc
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_byref_op.h"
+#include "paddle/fluid/operators/split_op.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+class SplitByrefOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SplitOp should not be null.");
+    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
+                      "Outputs(Out) of SplitOp should not be empty.");
+    auto in_dims = ctx->GetInputDim("X");
+    auto outs_names = ctx->Outputs("Out");
+    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
+    std::vector<int> sections = static_cast<std::vector<int>>(
+        ctx->Attrs().Get<std::vector<int>>("sections"));
+    const size_t outs_number = outs_names.size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.reserve(outs_number);
+
+    if (num > 0) {
+      int64_t in_axis_dim = in_dims[0];
+      PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
+                        "tensor split does not result"
+                        " in an equal division");
+      size_t out_axis_dim = in_axis_dim / num;
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[0] = out_axis_dim;
+        outs_dims.push_back(dim);
+      }
+    } else if (sections.size() > 0) {
+      PADDLE_ENFORCE_EQ(sections.size(), outs_number,
+                        "tensor split sections size"
+                        "should be equal to output size.");
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[0] = sections[i];
+        outs_dims.push_back(dim);
+      }
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+  }
+};
+
+class SplitByrefOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input tensor of the split operator.");
+    AddOutput("Out", "(Tensor) Output tensors of the split operator.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SplitByref operator
+
+Split source tensor to sevaral tensors by axis 0. No copy in this operator
+is performed, output tensor shares the same blocks of memory.
+)DOC");
+    AddAttr<std::vector<int>>("sections",
+                              "(vector<int>) "
+                              "the length of each output along the "
+                              "specified axis.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("num",
+                 "(int, default 0)"
+                 "Number of sub-tensors. This must evenly divide "
+                 "Input.dims()[axis]")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// NOTE: concat op default axis must be 0!
+USE_CPU_ONLY_OP(concat);
+
+REGISTER_OPERATOR(split_byref, ops::SplitByrefOp, ops::SplitByrefOpMaker,
+                  ops::SplitGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    split_byref, ops::SplitByrefOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/split_byref_op.cu.cc b/paddle/fluid/operators/split_byref_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ee6186f3541b7dcb845ce0c6d28081685925da0
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.cu.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_byref_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    split_byref,
+    ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/split_byref_op.h b/paddle/fluid/operators/split_byref_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..fedd7218dd6cc9481e94a92a3820cafbe4157bd0
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SplitByrefOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    auto place = ctx.GetPlace();
+
+    size_t row_offset = 0;
+    for (size_t i = 0; i < outs.size(); ++i) {
+      // NOTE: no need to call mutable_data here to allocate memory.
+      auto* out = outs[i];
+      VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0];
+      *out = in->Slice(row_offset, row_offset + out->dims()[0]);
+      row_offset += out->dims()[0];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c867c46873ae7ddbdbda280351e4ab28235bcc08
--- /dev/null
+++ b/paddle/fluid/operators/split_ids_op.cc
@@ -0,0 +1,77 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_ids_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
+    AddOutput("Out", "(LoDTensor) The outputs of the input Ids.")
+        .AsDuplicable();
+
+    AddComment(R"DOC(
+Split a LoDTensor of Ids into multi LoDTensors, the number is pserver's number
+Example:
+  Input:
+    X = [1,2,3,4,5,6]
+
+  Out(3 output):
+    out0 = [3, 6]
+    out1 = [1, 4]
+    out2 = [2, 5]
+)DOC");
+  }
+};
+
+class SplitIdsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must has input Ids.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
+
+    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
+    auto ids_dims = ctx->GetInputDim("Ids");
+    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
+  }
+};
+
+class SplitIdsOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
+    for (auto &out_var : op_desc.Output("Out")) {
+      block->Var(out_var)->SetType(input_var->GetType());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
+                  ops::SplitIdsOpInferVarType);
+REGISTER_OP_CPU_KERNEL(
+    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
+    ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d263426e073d95ad6d584c7370baf596587a993d
--- /dev/null
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SplitIdsOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    if (!platform::is_cpu_place(place)) {
+      PADDLE_THROW("SplitIds do not support GPU kernel");
+    }
+
+    const auto *ids_var = ctx.InputVar("Ids");
+    if (ids_var->IsType<framework::LoDTensor>()) {
+      const auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
+      const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
+      auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+      const size_t shard_num = outs.size();
+
+      std::vector<std::vector<T>> out_ids;
+      out_ids.resize(outs.size());
+
+      // split id by their shard_num.
+      for (int i = 0; i < ids_dims[0]; ++i) {
+        T id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        out_ids[shard_id].push_back(id);
+      }
+
+      // create tensor for each shard and send to parameter server
+      for (size_t i = 0; i < out_ids.size(); ++i) {
+        auto *shard_t = outs[i];
+        std::vector<T> ids = out_ids[i];
+        auto *shard_data = shard_t->mutable_data<T>(
+            framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+        for (size_t i = 0; i < ids.size(); ++i) {
+          shard_data[i] = ids[i];
+        }
+      }
+    } else if (ids_var->IsType<framework::SelectedRows>()) {
+      const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
+      auto &ids_dims = ids_selected_rows->value().dims();
+      PADDLE_ENFORCE_EQ(ids_dims[0],
+                        static_cast<int64_t>(ids_selected_rows->rows().size()),
+                        "");
+      const T *ids = ids_selected_rows->value().data<T>();
+      const auto &ids_rows = ids_selected_rows->rows();
+      auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
+      const size_t shard_num = outs.size();
+      // get rows for outputs
+      for (auto &id : ids_rows) {
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(id);
+      }
+
+      int64_t row_width = ids_dims[1];
+      for (auto &out : outs) {
+        out->set_height(ids_selected_rows->height());
+        framework::DDim ddim = framework::make_ddim(
+            {static_cast<int64_t>(out->rows().size()), row_width});
+        T *output = out->mutable_value()->mutable_data<T>(ddim, place);
+        for (int64_t i = 0; i < ddim[0]; ++i) {
+          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+                 row_width * sizeof(T));
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
index 3222cce239988b170501f2b99e9f1253036b7fbc..767449cde981e5925b7144ff1038560c67651f3e 100644
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
@@ -125,8 +125,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
 
 class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input LoDTensor");
     AddInput("Mask", "A bool column vector which mask the input");
     AddOutput("OutTrue", "True branch of input LoDTensor");
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index dffac772f11bee2fa3dcdf469a86adc57369b54d..d661b276bc31bf0c3ab181d706ffdccec89f0632 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/split_op.h"
-#include "paddle/fluid/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
@@ -71,8 +70,7 @@ class SplitOp : public framework::OperatorWithKernel {
 
 class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) Input tensor of the split operator.");
     AddOutput("Out", "(Tensor) Output tensors of the split operator.")
         .AsDuplicable();
@@ -109,21 +107,6 @@ Example:
   }
 };
 
-class SplitGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto op = new framework::OpDesc();
-    op->SetType("concat");
-    op->SetInput("X", OutputGrad("Out"));
-    op->SetOutput("Out", InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -132,4 +115,7 @@ USE_CPU_ONLY_OP(concat);
 
 REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
 REGISTER_OP_CPU_KERNEL(split,
-                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>);
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, double>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, float>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, int64_t>,
+                       ops::SplitOpKernel<paddle::platform::CPUPlace, int>);
diff --git a/paddle/fluid/operators/split_op.cu.cc b/paddle/fluid/operators/split_op.cu.cc
index efa378af857a8881f25c76379ba7cf81e64c80bb..18e0904681753aff7f3deac96efb6d62f389a031 100644
--- a/paddle/fluid/operators/split_op.cu.cc
+++ b/paddle/fluid/operators/split_op.cu.cc
@@ -15,4 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/split_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>);
+    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SplitOpKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index ae8562c0c503fec13dff61e04845ba0832848f5f..f0c417c70521b1bb3816f884d6ab7393473999e4 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include <chrono>
+#include <chrono>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
@@ -44,5 +44,20 @@ class SplitOpKernel : public framework::OpKernel<T> {
   }
 };
 
+class SplitGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto op = new framework::OpDesc();
+    op->SetType("concat");
+    op->SetInput("X", OutputGrad("Out"));
+    op->SetOutput("Out", InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc
index e1ce3d0c1bf11e9a623e4e9adc8f08f5069f4d94..76615a9405d7a8e3fa9dba8d01a956209e02ae8f 100644
--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -19,8 +19,7 @@ namespace operators {
 
 class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "The input SelectedRows.");
     AddOutput("Out", "The outputs of the input SelectedRows.").AsDuplicable();
     AddAttr<std::vector<int>>("height_sections",
diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc
index f1c4415f27d54ad09e5cb3659bd16abd82e38215..a2a96b72f09df86790ad1f90ead9189ff9bd581c 100644
--- a/paddle/fluid/operators/spp_op.cc
+++ b/paddle/fluid/operators/spp_op.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/spp_op.h"
+#include <string>
+#include <vector>
 namespace paddle {
 namespace operators {
 
 class SppOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SppOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor of spp operator. "
@@ -90,7 +91,9 @@ class SppOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
+REGISTER_OPERATOR(spp, ops::SppOp, ops::SppOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(spp_grad, ops::SppOpGrad);
 REGISTER_OP_CPU_KERNEL(
     spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h
index 3d2f22632570fe2a28a822370a400390c78b533a..08cb7849d20443862b66ea6096c095b294c7242c 100644
--- a/paddle/fluid/operators/spp_op.h
+++ b/paddle/fluid/operators/spp_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/pooling.h"
diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc
index 1c5e87040a8dd74b98d8e31bfe351ea256e01f15..42532a294b2ef9ffdb240fac8596278047daf7fe 100644
--- a/paddle/fluid/operators/squared_l2_distance_op.cc
+++ b/paddle/fluid/operators/squared_l2_distance_op.cc
@@ -56,8 +56,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
 
 class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) Input of SquaredL2DistanceOp.");
     AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp.");
     AddOutput("sub_result",
@@ -109,9 +108,10 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp,
-            ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad,
-            ops::SquaredL2DistanceGradOp);
+REGISTER_OPERATOR(squared_l2_distance, ops::SquaredL2DistanceOp,
+                  ops::SquaredL2DistanceOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_distance,
     ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc
index b64df2a218860be3adb3954e07b036c05bf05c8e..7bd82e0ce4add6d4434e1defaee43da178a6f309 100644
--- a/paddle/fluid/operators/squared_l2_norm_op.cc
+++ b/paddle/fluid/operators/squared_l2_norm_op.cc
@@ -48,8 +48,7 @@ class SquaredL2NormGradOp : public framework::OperatorWithKernel {
 
 class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) The input of squared_l2_norm op.");
     AddOutput("Out", "(Scalar) The output of squared_l2_norm op.");
     AddComment(R"DOC(
@@ -67,8 +66,10 @@ $$Out = \sum_{i} X_{i}^2$$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker,
-            squared_l2_norm_grad, ops::SquaredL2NormGradOp);
+REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp,
+                  ops::SquaredL2NormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_norm,
     ops::SquaredL2NormKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 22c1db82e9f5aff6aa9a311cd1093b33fa7e6db9..7a10218e1556698f3e0a1828db5de8851dd1c90b 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
                           const framework::DDim& src_stride,
                           const framework::DDim& dst_dim,
                           const framework::DDim& dst_stride, T* dst) {
-  using namespace detail;
-  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
+  paddle::operators::detail::StridedCopyDimVisitor<T> func(
+      dev_ctx, src, src_stride, dst_stride, dst);
   boost::apply_visitor(func, dst_dim);
 }
 
diff --git a/paddle/fluid/operators/sum_mkldnn_op.cc b/paddle/fluid/operators/sum_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f78d977760f18c9eb1270e515e68acb208a7c9a4
--- /dev/null
+++ b/paddle/fluid/operators/sum_mkldnn_op.cc
@@ -0,0 +1,240 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*Licensed under the Apache License, Version 2.0(the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. */
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/operators/sum_op.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+using paddle::platform::MKLDNNDeviceContext;
+using paddle::platform::CPUDeviceContext;
+using framework::DataLayout;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::stream;
+using mkldnn::sum;
+using mkldnn::reorder;
+using platform::to_void_cast;
+
+template <typename T>
+class SumMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    auto in_vars = ctx.MultiInputVar("X");
+
+    const int N = in_vars.size();
+    auto out_var = ctx.OutputVar("Out");
+    bool in_place = out_var == in_vars[0];
+
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoDTensor* output = ctx.Output<LoDTensor>("Out");
+      T* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+      std::vector<int> dst_tz = framework::vectorize2int(output->dims());
+      auto src_tz = dst_tz;
+      memory::format output_format{memory::format::format_undef};
+      std::vector<float> scales;
+      std::vector<memory::primitive_desc> srcs_mpd;
+      std::vector<mkldnn::memory> srcs_mem;
+
+      PADDLE_ENFORCE(in_vars[0]->IsType<LoDTensor>(),
+                     "Input[0] must be LoDTensors");
+      auto& input0 = in_vars[0]->Get<LoDTensor>();
+      PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN &&
+                         input0.format() != memory::format::format_undef,
+                     "Wrong layout/format for inputs[0]");
+
+      memory::format input_format = input0.format();
+
+      if (src_tz.size() == 1 && (input_format == memory::format::nchw ||
+                                 input_format == memory::format::nhwc)) {
+        input_format = memory::format::x;
+      }
+      if (src_tz.size() == 2 && (input_format == memory::format::nchw ||
+                                 input_format == memory::format::nhwc)) {
+        input_format = memory::format::nc;
+      }
+
+      for (int i = in_place ? 1 : 0; i < N; i++) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<LoDTensor>(),
+                       "all inputs must be all LoDTensors");
+        auto& input = in_vars[i]->Get<LoDTensor>();
+        PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN &&
+                           input.format() != memory::format::format_undef,
+                       "Wrong layout/format for inputs");
+
+        if (input.numel() == 0) {
+          continue;
+        }
+
+        const T* input_data = input.data<T>();
+
+        auto src_md =
+            memory::desc(src_tz, memory::data_type::f32, input_format);
+        auto src_mpd = memory::primitive_desc(src_md, mkldnn_engine);
+        auto src_mem = memory(src_mpd, to_void_cast(input_data));
+        srcs_mpd.push_back(src_mpd);
+        srcs_mem.push_back(src_mem);
+        scales.push_back(1.0);
+      }
+
+      auto dst_md =
+          memory::desc(dst_tz, memory::data_type::f32, memory::format::any);
+
+      auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd);
+
+      std::shared_ptr<memory> dst_mem;
+      if (in_place) {
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc()));
+      } else {
+        dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data));
+      }
+      std::vector<mkldnn::primitive::at> inputs;
+      for (size_t i = 0; i < srcs_mem.size(); ++i) {
+        inputs.push_back(srcs_mem[i]);
+      }
+
+      auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem);
+      output_format = (memory::format)platform::GetMKLDNNFormat(sum_pd);
+
+      primitive reorder_prim;
+      std::shared_ptr<memory> target_mem;
+      if (in_place) {
+        output_format = input_format;
+        target_mem.reset(new memory(
+            {{{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine},
+            output_data));
+        reorder_prim = reorder(*dst_mem, *target_mem);
+      }
+
+      std::vector<primitive> pipeline;
+      pipeline.push_back(sum_prim);
+      if (in_place) pipeline.push_back(reorder_prim);
+      stream(stream::kind::eager).submit(pipeline).wait();
+
+      output->set_layout(DataLayout::kMKLDNN);
+      output->set_format(output_format);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      // TODO(@mozga-intel) Add MKLDNN SelectedRows support
+      std::unique_ptr<framework::SelectedRows> in0;
+      if (in_place) {
+        // If is in_place, we store the input[0] to in0
+        auto& in_sel0 = in_vars[0]->Get<SelectedRows>();
+        auto& rows = in_sel0.rows();
+        in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
+        in0->mutable_value()->ShareDataWith(in_sel0.value());
+      }
+
+      auto get_selected_row = [&](size_t i) -> const SelectedRows& {
+        if (i == 0 && in0) {
+          return *in0.get();
+        } else {
+          return in_vars[i]->Get<SelectedRows>();
+        }
+      };
+      auto* out = ctx.Output<SelectedRows>("Out");
+      out->mutable_rows()->clear();
+      auto* out_value = out->mutable_value();
+
+      // Runtime InferShape
+      size_t first_dim = 0;
+      for (int i = 0; i < N; i++) {
+        auto& sel_row = get_selected_row(i);
+        first_dim += sel_row.rows().size();
+      }
+      auto in_dim =
+          framework::vectorize(get_selected_row(N - 1).value().dims());
+      in_dim[0] = static_cast<int64_t>(first_dim);
+
+      out_value->Resize(framework::make_ddim(in_dim));
+
+      // if all the input sparse vars are empty, no need to
+      // merge these vars.
+      if (first_dim == 0UL) {
+        return;
+      }
+      out_value->mutable_data<T>(ctx.GetPlace());
+      math::SelectedRowsAddTo<CPUDeviceContext, T> functor;
+      int64_t offset = 0;
+      for (int i = 0; i < N; i++) {
+        auto& sel_row = get_selected_row(i);
+        if (sel_row.rows().size() == 0) {
+          continue;
+        }
+        PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
+        functor(ctx.template device_context<CPUDeviceContext>(), sel_row,
+                offset, out);
+        offset += sel_row.value().numel();
+      }
+    } else if (out_var->IsType<framework::LoDTensorArray>()) {
+      // TODO(@mozga-intel) Add MKLDNN LoDTensorArray support
+      auto& out_array = *out_var->GetMutable<framework::LoDTensorArray>();
+      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
+                       "Only support all inputs are TensorArray");
+        auto& in_array = in_vars[i]->Get<framework::LoDTensorArray>();
+
+        for (size_t i = 0; i < in_array.size(); ++i) {
+          if (in_array[i].numel() != 0) {
+            if (i >= out_array.size()) {
+              out_array.resize(i + 1);
+            }
+            if (out_array[i].numel() == 0) {
+              framework::TensorCopy(in_array[i], in_array[i].place(),
+                                    ctx.device_context(), &out_array[i]);
+              out_array[i].set_lod(in_array[i].lod());
+            } else {
+              PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
+              auto in = EigenVector<T>::Flatten(in_array[i]);
+              auto result = EigenVector<T>::Flatten(out_array[i]);
+              result.device(*ctx.template device_context<MKLDNNDeviceContext>()
+                                 .eigen_device()) = result + in;
+            }
+          }
+        }
+      }
+    } else {
+      PADDLE_THROW("Unexpected branch, output variable type is %s",
+                   out_var->Type().name());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace,
+                   paddle::operators::SumMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index d3d5c8a3429e2070c5472355b4440401eaa699cb..fe7c7039c7dec714e265ede1b7167fd800ddc2f7 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -10,10 +10,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sum_op.h"
+
+#include <algorithm>
+#include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 using framework::Tensor;
@@ -35,7 +43,10 @@ class SumOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputsDim("X");
     size_t N = x_dims.size();
-    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
+    if (N == 1) {
+      VLOG(3) << "Warning: sum have only one input, may waste memory";
+    }
 
     framework::DDim in_dim({0});
     for (auto& x_dim : x_dims) {
@@ -56,6 +67,18 @@ class SumOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto x_vars = ctx.MultiInputVar("X");
+
+    framework::LibraryType library{framework::LibraryType::kPlain};
+    framework::DataLayout layout{framework::DataLayout::kAnyLayout};
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (library == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library = framework::LibraryType::kMKLDNN;
+      layout = framework::DataLayout::kMKLDNN;
+    }
+#endif
+
     if (x_vars[0]->IsType<framework::LoDTensor>()) {
       int dtype = -1;
       for (auto& x_var : x_vars) {
@@ -73,26 +96,27 @@ class SumOp : public framework::OperatorWithKernel {
                         "Sum operator should have at least one tensor");
 
       return framework::OpKernelType(
-          static_cast<framework::proto::VarType::Type>(dtype),
-          ctx.device_context());
+          static_cast<framework::proto::VarType::Type>(dtype), ctx.GetPlace(),
+          layout, library);
     } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
       for (auto& var : x_vars) {
         auto& value = var->Get<framework::SelectedRows>().value();
         if (value.IsInitialized()) {
           return framework::OpKernelType(framework::ToDataType(value.type()),
-                                         ctx.device_context());
+                                         ctx.device_context(), layout, library);
         }
       }
       // if input sparse vars are not initialized, use an default kernel type.
       return framework::OpKernelType(framework::proto::VarType::FP32,
-                                     ctx.device_context());
+                                     ctx.device_context(), layout, library);
     } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
       for (auto& x_var : x_vars) {
         auto& array = x_var->Get<framework::LoDTensorArray>();
         for (auto& each : array) {
           if (each.numel() != 0) {
             return framework::OpKernelType(framework::ToDataType(each.type()),
-                                           ctx.device_context());
+                                           ctx.device_context(), layout,
+                                           library);
           }
         }
       }
@@ -105,11 +129,13 @@ class SumOp : public framework::OperatorWithKernel {
 
 class SumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SumOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
         .AsDuplicable();
-    AddOutput("Out", "(Tensor) The output tensor of sum operator.");
+    AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
     AddComment(R"DOC(
 Sum operator.
 
@@ -126,7 +152,6 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
                   framework::BlockDesc* block) const override {
     auto& inputs = op_desc.Input("X");
     auto var_type = framework::proto::VarType::SELECTED_ROWS;
-
     for (auto& name : op_desc.Input("X")) {
       VLOG(10) << name << " "
                << block->FindRecursiveOrCreateVar(name).GetType();
@@ -200,6 +225,7 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
                   ops::SumOpVarTypeInference);
+
 REGISTER_OP_CPU_KERNEL(
     sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index e7e5346cdca5efaf81c2b0fddedde7406e3b874d..49a4afb3a8a19c97e844e66477c6288772ece807 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/target_assign_op.cc b/paddle/fluid/operators/target_assign_op.cc
deleted file mode 100644
index a894b12fa35a121eff0b8f9d2d0eecc5ae5185f3..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/target_assign_op.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/target_assign_op.h"
-
-namespace paddle {
-namespace operators {
-
-class TargetAssignOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of TargetAssignOp should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("MatchIndices"),
-                   "Input(MatchIndices) of TargetAssignOp should not be null");
-
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of TargetAssignOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("OutWeight"),
-                   "Output(OutWeight) of TargetAssignOp should not be null.");
-
-    auto in_dims = ctx->GetInputDim("X");
-    auto mi_dims = ctx->GetInputDim("MatchIndices");
-
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3, "The rank of Input(X) must be 3.");
-    PADDLE_ENFORCE_EQ(mi_dims.size(), 2,
-                      "The rank of Input(MatchIndices) must be 2.");
-
-    if (ctx->HasInput("NegIndices")) {
-      auto neg_dims = ctx->GetInputDim("NegIndices");
-      PADDLE_ENFORCE_EQ(neg_dims.size(), 2,
-                        "The rank of Input(NegIndices) must be 2.");
-      PADDLE_ENFORCE_EQ(neg_dims[1], 1,
-                        "The last dimenstion of Out(NegIndices) must be 1.");
-    }
-
-    auto n = mi_dims[0];
-    auto m = mi_dims[1];
-    auto k = in_dims[in_dims.size() - 1];
-    ctx->SetOutputDim("Out", {n, m, k});
-    ctx->SetOutputDim("OutWeight", {n, m, 1});
-  }
-
- protected:
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
-        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
-        ctx.device_context());
-  }
-};
-
-class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  TargetAssignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-             "(LoDTensor), This input is a 3D LoDTensor with shape [M, P, K]. "
-             "Some elements in X will be assigned to Out based on the "
-             "MatchIndices and NegIndices.");
-    AddInput("MatchIndices",
-             "(Tensor, default Tensor<int>), The input matched indices "
-             "with shape [N, P], If MatchIndices[i][j] is -1, the j-th entity "
-             "of column is not matched to any entity of row in i-th instance.");
-    AddInput("NegIndices",
-             "(LoDTensor, default LoDTensor<int>), The input negative example "
-             "indices are an optional input with shape [Neg, 1], where Neg is "
-             "the total number of negative example indices.")
-        .AsDispensable();
-    AddAttr<int>("mismatch_value",
-                 "(int, default 0), Fill this value to the "
-                 "mismatched location.")
-        .SetDefault(0);
-    AddOutput("Out",
-              "(Tensor), The output is a 3D Tensor with shape [N, P, K], "
-              "N and P is the same as they are in NegIndices, K is the "
-              "same as it in input of X. If MatchIndices[i][j] "
-              "is -1, the Out[i][j][0 : K] is the mismatch_value.");
-    AddOutput("OutWeight",
-              "(Tensor), The weight for output with the shape of [N, P, 1]");
-    AddComment(R"DOC(
-This operator can be, for given the target bounding boxes or labels,
-to assign classification and regression targets to each prediction as well as
-weights to prediction. The weights is used to specify which prediction would
-not contribute to training loss.
-
-For each instance, the output `Out` and`OutWeight` are assigned based on
-`MatchIndices` and `NegIndices`.
-Assumed that the row offset for each instance in `X` is called lod,
-this operator assigns classification/regression targets by performing the
-following steps:
-
-1. Assigning all outpts based on `MatchIndices`:
-
-If id = MatchIndices[i][j] > 0,
-
-    Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
-    OutWeight[i][j] = 1.
-
-Otherwise, 
-
-    Out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
-    OutWeight[i][j] = 0.
-
-2. Assigning OutWeight based on `NegIndices` if `NegIndices` is provided:
-
-Assumed that the row offset for each instance in `NegIndices` is called neg_lod,
-for i-th instance and each `id` of NegIndices in this instance:
-
-    Out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
-    OutWeight[i][id] = 1.0
-
-    )DOC");
-  }
-};
-
-template <typename T, typename WT>
-struct NegTargetAssignFunctor<platform::CPUDeviceContext, T, WT> {
-  void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices,
-                  const size_t* lod, const int N, const int M, const int K,
-                  const int mismatch_value, T* out, WT* out_wt) {
-    for (int i = 0; i < N; ++i) {
-      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
-        int id = neg_indices[j];
-        int off = (i * M + id) * K;
-        for (int k = 0; k < K; ++k) {
-          out[off + k] = mismatch_value;
-          out_wt[off + k] = static_cast<WT>(1.0);
-        }
-      }
-    }
-  }
-};
-
-template struct NegTargetAssignFunctor<platform::CPUDeviceContext, int, float>;
-template struct NegTargetAssignFunctor<platform::CPUDeviceContext, float,
-                                       float>;
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(target_assign, ops::TargetAssignOp,
-                             ops::TargetAssignOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    target_assign,
-    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, int, float>,
-    ops::TargetAssignKernel<paddle::platform::CPUDeviceContext, float, float>);
diff --git a/paddle/fluid/operators/target_assign_op.cu b/paddle/fluid/operators/target_assign_op.cu
deleted file mode 100644
index 24664f99b20f92108220d27ec58e8fdf3ba6193c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/target_assign_op.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/target_assign_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T, typename WT>
-__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod,
-                                      const int N, const int M, const int K,
-                                      const int mismatch_value, T* out,
-                                      WT* out_wt) {
-  int bidx = blockIdx.x;
-  int st = lod[bidx];
-  int ed = lod[bidx + 1];
-
-  int row_start = bidx * M;
-  for (int i = st + threadIdx.x; i < ed; i += blockDim.x) {
-    int id = row_start + neg_indices[i];
-    for (int k = 0; k < K; ++k) {
-      out[id * K + k] = T(mismatch_value);
-      out_wt[id * K + k] = WT(1.);
-    }
-  }
-}
-
-template <typename T, typename WT>
-struct NegTargetAssignFunctor<platform::CUDADeviceContext, T, WT> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const int* neg_indices, const size_t* lod, const int N,
-                  const int M, const int K, const int mismatch_value, T* out,
-                  WT* out_wt) {
-    const int block_size = 256;
-    const int grid_size = N;
-    NegTargetAssignKernel<T, WT><<<grid_size, block_size, 0, ctx.stream()>>>(
-        neg_indices, lod, N, M, K, mismatch_value, out, out_wt);
-  }
-};
-
-template struct NegTargetAssignFunctor<platform::CUDADeviceContext, int, float>;
-template struct NegTargetAssignFunctor<platform::CUDADeviceContext, float,
-                                       float>;
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    target_assign,
-    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, int, float>,
-    ops::TargetAssignKernel<paddle::platform::CUDADeviceContext, float, float>);
diff --git a/paddle/fluid/operators/tensor_array_read_write_op.cc b/paddle/fluid/operators/tensor_array_read_write_op.cc
index 2636812c42985536e7ca3475c03bbd8d1638ece6..a2d44284e9de1ace42cabbce82e0b45929432d7b 100644
--- a/paddle/fluid/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
@@ -38,15 +38,14 @@ class WriteToArrayOp : public ArrayOp {
                << " to " << offset + 1;
       out->resize(offset + 1);
     }
+    auto *out_tensor = &out->at(offset);
+    out_tensor->set_lod(x_tensor.lod());
     if (x_tensor.memory_size() > 0) {
-      auto *out_tensor = &out->at(offset);
-
       platform::DeviceContextPool &pool =
           platform::DeviceContextPool::Instance();
       auto &dev_ctx = *pool.Get(place);
 
       TensorCopy(x_tensor, place, dev_ctx, out_tensor);
-      out_tensor->set_lod(x_tensor.lod());
     } else {
       VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
                   "nothing has been written to output array["
@@ -57,8 +56,7 @@ class WriteToArrayOp : public ArrayOp {
 
 class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
     AddInput(
         "I",
@@ -148,8 +146,7 @@ class ReadFromArrayOp : public ArrayOp {
 
 class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(TensorArray) the array will be read from.");
     AddInput("I",
              "(Tensor) the subscript index in tensor array. The number of "
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..647cfc0a0af2be85e2868c6f68cab962c6631a8d
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
+
+namespace paddle {
+namespace operators {
+
+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+
+namespace {
+
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+
+  switch (shape.size()) {
+    case 2:
+      return nvinfer1::Dims2(shape[0], shape[1]);
+    case 3:
+      return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
+    case 4:
+      return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
+    default:
+      return nvinfer1::Dims();
+  }
+  return nvinfer1::Dims();
+}
+
+}  // namespace
+
+template <typename DeviceContext, typename T>
+void TensorRTEngineKernel<DeviceContext, T>::Prepare(
+    const framework::ExecutionContext &context) const {
+  VLOG(4) << "Prepare engine";
+  // Get the ProgramDesc and pass to convert.
+  framework::proto::BlockDesc block_desc;
+  block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
+  int max_batch = context.Attr<int>("max_batch");
+  auto max_workspace = context.Attr<int>("max_workspace");
+  auto params = context.Attr<std::vector<std::string>>("parameters");
+  std::unordered_set<std::string> parameters;
+  for (const auto &param : params) {
+    parameters.insert(param);
+  }
+
+  // TODO(Superjomn) replace this with a different stream
+  auto *engine = Singleton<TRT_EngineManager>::Global().Create(
+      max_batch, max_workspace, nullptr /*engine hold its own stream*/,
+      context.Attr<std::string>("engine_uniq_key"));
+  engine->InitNetwork();
+
+  framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  // Add inputs
+  VLOG(4) << "declare inputs";
+  for (auto &input : context.Inputs("Xs")) {
+    VLOG(4) << "declare input " << input;
+    auto *var = block.FindVar(input);
+    PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                      "TensorRT engine only takes LoDTensor as input");
+    auto shape = var->GetShape();
+    engine->DeclareInput(
+        input, FluidDataType2TRT(
+                   var->Proto()->type().lod_tensor().tensor().data_type()),
+        Vec2TRT_Dims(var->GetShape()));
+  }
+
+  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
+      block_desc, parameters, context.scope(), engine);
+
+  // Add outputs
+  VLOG(4) << "declare outputs";
+  for (auto &output : context.Outputs("Ys")) {
+    VLOG(4) << "declare output " << output;
+    engine->DeclareOutput(output);
+  }
+
+  engine->FreezeNetwork();
+}
+
+class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Xs", "A list of inputs.").AsDuplicable();
+    AddOutput("Ys", "A list of outputs").AsDuplicable();
+    AddAttr<std::string>("subgraph", "the subgraph.");
+    AddAttr<std::string>("engine_uniq_key", "unique key for the TRT engine.");
+    AddAttr<int>("max_batch", "the maximum batch size.");
+    AddAttr<int>("max_workspace", "the maximum batch size.");
+    AddComment("TensorRT engine operator.");
+  }
+};
+
+class TensorRTEngineInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
+                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    tensorrt_engine,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
+
+#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1602a913aeebe43fabe2f9c9036edd18ac4c70fd
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -0,0 +1,122 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_CUDA
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
+
+namespace paddle {
+namespace operators {
+
+using inference::Singleton;
+using inference::tensorrt::TRT_EngineManager;
+
+class TensorRTEngineOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input0 = ctx.Inputs("Xs").front();
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::ToDataType(ctx.scope()
+                                  .FindVar(input0)
+                                  ->GetMutable<framework::LoDTensor>()
+                                  ->type()),
+        platform::CPUPlace());
+    return kt;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TensorRTEngineKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    VLOG(4) << "TensorRTEngineKernel executing";
+    auto engine_name = context.Attr<std::string>("engine_uniq_key");
+    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
+      Prepare(context);
+    }
+    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
+    auto input_names = context.op().Inputs("Xs");
+    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
+    // Try to determine a batch_size
+    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
+        context.scope(), input_names.front());
+    int batch_size = tensor0.dims()[0];
+    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
+
+    // Convert input tensor from fluid to engine.
+    for (const auto& x : context.Inputs("Xs")) {
+      // convert input and copy to TRT engine's buffer
+      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
+          context.scope(), x);
+      if (platform::is_cpu_place(t.place())) {
+        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
+      } else {
+        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+                                t.memory_size());
+      }
+    }
+    // Execute the engine.
+    PADDLE_ENFORCE_GT(batch_size, 0);
+    engine->Execute(batch_size);
+    // Convert output tensor from engine to fluid
+    for (const auto& y : context.Outputs("Ys")) {
+      // convert output and copy to fluid.
+      nvinfer1::ITensor* trt_t = engine->GetITensor(y);
+      auto dims = trt_t->getDimensions();
+      // Use the output ITensor's dims to reshape the Fluid Tensor.
+      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
+
+      auto* fluid_v = context.scope().FindVar(y);
+      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
+      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+      fluid_t->Resize(framework::make_ddim(ddim));
+      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
+      if (platform::is_cpu_place(fluid_t->place())) {
+        // TODO(Superjomn) change this float to dtype size.
+        engine->GetOutputInCPU(
+            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
+            size * sizeof(float));
+      } else {
+        engine->GetOutputInGPU(
+            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+            size * sizeof(float));
+      }
+    }
+
+    cudaStreamSynchronize(*engine->stream());
+  }
+
+ protected:
+  // Build the engine.
+  void Prepare(const framework::ExecutionContext& context) const;
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..82a16361e40513aeaf6f510e450f58989369fcdb
--- /dev/null
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
@@ -0,0 +1,211 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+USE_CPU_ONLY_OP(tensorrt_engine);
+
+namespace paddle {
+namespace operators {
+
+namespace {
+void CreateCPUTensor(framework::Scope* scope, const std::string& name,
+                     const std::vector<int64_t>& shape) {
+  auto* var = scope->Var(name);
+  auto* tensor = var->GetMutable<framework::LoDTensor>();
+  auto dims = framework::make_ddim(shape);
+  tensor->Resize(dims);
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  inference::tensorrt::RandomizeTensor(tensor, place, ctx);
+}
+
+void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
+                          const std::string& name,
+                          const std::vector<int64_t>& shape) {
+  using framework::proto::VarType;
+  auto* var = block->add_vars();
+  framework::VarDesc desc(name);
+  desc.SetType(VarType::LOD_TENSOR);
+  desc.SetDataType(VarType::FP32);
+  desc.SetShape(shape);
+  *var = *desc.Proto();
+}
+
+}  // namespace
+
+using inference::analysis::SetAttr;
+
+TEST(TensorRTEngineOp, manual) {
+  framework::ProgramDesc program;
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+  LOG(INFO) << "create mul op";
+  auto* mul = block_desc.AppendOp();
+  mul->SetType("mul");
+  mul->SetInput("X", std::vector<std::string>({"x"}));     // 2 x 4
+  mul->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
+  mul->SetOutput("Out", std::vector<std::string>({"z"}));  // 2 x 6
+
+  LOG(INFO) << "create fc op";
+  auto* fc = block_desc.AppendOp();
+  fc->SetType("mul");
+  fc->SetInput("X", std::vector<std::string>({"z"}));
+  fc->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
+  fc->SetOutput("Out", std::vector<std::string>({"z0"}));  // 2 x 8
+
+  // Set inputs' variable shape in BlockDesc
+  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}));
+  AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
+  AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
+  AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
+
+  // It is wired, need to copy manually.
+  *block_->add_ops() = *mul->Proto();
+  *block_->add_ops() = *fc->Proto();
+
+  ASSERT_EQ(block_->ops_size(), 2);
+
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", 100);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 1 << 10);
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
+  SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
+                                    std::vector<std::string>({}));
+
+  LOG(INFO) << "create engine op";
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+  LOG(INFO) << "engine_op " << engine_op.get();
+
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  // Prepare variables.
+  CreateCPUTensor(&scope, "x", std::vector<int64_t>({2, 4}));
+  CreateCPUTensor(&scope, "y", std::vector<int64_t>({4, 6}));
+  CreateCPUTensor(&scope, "z", std::vector<int64_t>({2, 6}));
+
+  CreateCPUTensor(&scope, "y0", std::vector<int64_t>({6, 8}));
+  CreateCPUTensor(&scope, "z0", std::vector<int64_t>({2, 8}));
+
+  // Execute them.
+  LOG(INFO) << "engine_op run";
+  engine_op->Run(scope, place);
+}
+
+void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
+  framework::ProgramDesc program;
+  framework::Scope scope;
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+
+  auto* block_ = program.Proto()->add_blocks();
+  block_->set_idx(0);
+  block_->set_parent_idx(-1);
+
+  using shape_t = std::vector<int64_t>;
+
+  LOG(INFO) << "create block desc";
+  framework::BlockDesc block_desc(&program, block_);
+
+  auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name,
+                        const std::string& z_name, bool x_created,
+                        const shape_t& x_shape, const shape_t& y_shape,
+                        const shape_t& z_shape) {
+    LOG(INFO) << "create fc op";
+    auto* fc = block_desc.AppendOp();
+    fc->SetType("mul");
+    fc->SetInput("X", std::vector<std::string>({x_name}));
+    fc->SetInput("Y", std::vector<std::string>({y_name}));
+    fc->SetOutput("Out", std::vector<std::string>({z_name}));
+
+    // Set inputs' variable shape in BlockDesc
+    if (!x_created) {
+      AddTensorToBlockDesc(block_, x_name,
+                           std::vector<int64_t>({batch_size, input_dim, 1, 1}));
+    }
+    AddTensorToBlockDesc(block_, y_name,
+                         std::vector<int64_t>({input_dim, output_dim}));
+    AddTensorToBlockDesc(block_, z_name,
+                         std::vector<int64_t>({batch_size, output_dim}));
+
+    // Prepare variables.
+    if (!x_created) {
+      CreateCPUTensor(&scope, x_name, std::vector<int64_t>(x_shape));
+    }
+    CreateCPUTensor(&scope, y_name, std::vector<int64_t>(y_shape));
+    CreateCPUTensor(&scope, z_name, std::vector<int64_t>(z_shape));
+
+    // It is wired, need to copy manually.
+    *block_->add_ops() = *fc->Proto();
+  };
+
+  // Test with 4 layer FC
+  AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim},
+             {input_dim, output_dim}, {batch_size, output_dim});
+  AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+  AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim},
+             {batch_size, output_dim});
+
+  LOG(INFO) << "create tensorrt desc";
+  framework::OpDesc engine_op_desc(nullptr);
+  engine_op_desc.SetType("tensorrt_engine");
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x0"}));
+  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z3"}));
+
+  SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
+                       block_->SerializeAsString());
+  SetAttr<int>(engine_op_desc.Proto(), "max_batch", batch_size);
+  SetAttr<int>(engine_op_desc.Proto(), "max_workspace", 2 << 10);
+  SetAttr<std::vector<std::string>>(
+      engine_op_desc.Proto(), "parameters",
+      std::vector<std::string>({"y0", "y1", "y2", "y3"}));
+  SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "b_engine");
+
+  auto engine_op = framework::OpRegistry::CreateOp(*engine_op_desc.Proto());
+
+  // Execute them.
+  engine_op->Run(scope, place);
+}
+
+// Test with a larger FC layer.
+TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
+
+}  // namespace operators
+}  // namespace paddle
+
+USE_TRT_CONVERTER(mul)
+USE_TRT_CONVERTER(fc)
diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2b7b6b8e447381229e4ad594b7974bc0aa159d5
--- /dev/null
+++ b/paddle/fluid/operators/test_send_nccl_id.cc
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>  // NOLINT
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/detail/macros.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/listen_and_serv_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#include "paddle/fluid/string/printf.h"
+
+#ifdef PADDLE_WITH_GRPC
+#include "paddle/fluid/operators/send_recv_util.h"
+#endif
+
+USE_NO_KERNEL_OP(listen_and_serv);
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+namespace distributed = paddle::operators::distributed;
+namespace string = paddle::string;
+
+std::unique_ptr<distributed::RPCServer> g_rpc_service;
+std::unique_ptr<distributed::RequestHandler> g_req_handler;
+
+void StartServer() {
+  f::Scope scope;
+  p::CPUPlace place;
+  scope.Var(NCCL_ID_VARNAME);
+  p::DeviceContextPool& pool = p::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(p::CPUPlace());
+
+  f::ProgramDesc empty_program;
+  f::Executor executor(dev_ctx.GetPlace());
+  g_req_handler->SetScope(&scope);
+  g_req_handler->SetDevCtx(&dev_ctx);
+  g_req_handler->SetProgram(&empty_program);
+  g_req_handler->SetExecutor(&executor);
+
+  g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get());
+  g_req_handler->SetRPCServer(g_rpc_service.get());
+
+  std::thread server_thread(
+      std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
+
+  g_rpc_service->SetCond(distributed::kRequestSend);
+  g_rpc_service->WaitBarrier(distributed::kRequestSend);
+
+  LOG(INFO) << "got nccl id and stop server...";
+  g_rpc_service->ShutDown();
+  server_thread.join();
+}
+
+TEST(SendNcclId, RPCServer) {
+  g_req_handler.reset(new distributed::RequestSendHandler(true));
+  g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
+
+  std::thread server_thread(StartServer);
+  g_rpc_service->WaitServerReady();
+
+  f::Scope scope;
+  p::CPUPlace place;
+  p::DeviceContextPool& pool = p::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(p::CPUPlace());
+
+  auto var = scope.Var(NCCL_ID_VARNAME);
+  auto id = var->GetMutable<ncclUniqueId>();
+  p::dynload::ncclGetUniqueId(id);
+
+  int port = g_rpc_service->GetSelectedPort();
+
+  std::string ep = string::Sprintf("127.0.0.1:%d", port);
+
+  distributed::RPCClient* client =
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>();
+
+  LOG(INFO) << "connect to server" << ep;
+  client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME);
+  client->Wait();
+  client->AsyncSendBatchBarrier(ep);
+  client->Wait();
+
+  server_thread.join();
+  g_rpc_service.reset(nullptr);
+  g_req_handler.reset(nullptr);
+}
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index 2e4e8caed5327f4ca9038c376de2ec831354917e..4a8ac441cfaf642fde58ee30865a22e83c065498 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -48,10 +48,9 @@ class TopkOp : public framework::OperatorWithKernel {
 
 class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("X", "(Tensor) The input of Topk op");
-    AddOutput("Out", "(Tensor) The output tensor of Topk op");
+    AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X");
     AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
     AddComment(R"DOC(
 Top K operator
@@ -75,4 +74,5 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker,
                   paddle::framework::EmptyGradOpMaker);
 REGISTER_OP_CPU_KERNEL(top_k,
-                       ops::TopkKernel<paddle::platform::CPUPlace, float>);
+                       ops::TopkKernel<paddle::platform::CPUPlace, float>,
+                       ops::TopkKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index bfd26c2f2294f954adc81a1719650c46372098c4..9da8551eb2d7ea66ad434c42b54522432095ce29 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/cuda_device_function.h"
 
 namespace paddle {
 namespace operators {
@@ -133,71 +135,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
 }
 
 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* src,
-                                              bool& firstStep, bool& is_empty,
-                                              Pair<T>& max, int dim,
+                                              bool* firstStep, bool* is_empty,
+                                              Pair<T>* max, int dim,
                                               const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, src, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - (*beam)) {
+          topk[k] = topk[k + *beam];
         } else {
           topk[k].set(-INFINITY, -1);
         }
       }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
                               length);
       }
     }
 
-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
   }
 }
 
 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* val,
-                                              int* col, bool& firstStep,
-                                              bool& is_empty, Pair<T>& max,
+                                              int* col, bool* firstStep,
+                                              bool* is_empty, Pair<T>* max,
                                               int dim, const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - *beam) {
+          topk[k] = topk[k + *beam];
         } else {
           topk[k].set(-INFINITY, -1);
         }
       }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
                               length);
       }
     }
 
-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
   }
 }
 
 template <typename T, int MaxLength, int BlockSize>
 __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
                                             Pair<T> topk[], T** topVal,
-                                            int64_t** topIds, int& beam, int& k,
+                                            int64_t** topIds, int* beam, int* k,
                                             const int tid, const int warp) {
   while (true) {
     __syncthreads();
@@ -225,17 +227,23 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
       (*topVal)++;
       (*topIds)++;
     }
-    if (tid == maxid[0]) beam++;
-    if (--k == 0) break;
+    if (tid == maxid[0]) (*beam)++;
+    if (--(*k) == 0) break;
     __syncthreads();
 
     if (tid == maxid[0]) {
-      if (beam < MaxLength) {
-        sh_topk[tid] = topk[beam];
+      if (*beam < MaxLength) {
+        sh_topk[tid] = topk[*beam];
       }
     }
+    // NOTE(zcd): temporary solution
+    unsigned mask = 0u;
+    CREATE_SHFL_MASK(mask, true);
+
     if (maxid[0] / 32 == warp) {
-      if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
+      if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) ==
+          MaxLength)
+        break;
     }
   }
 }
@@ -268,13 +276,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
     topk[k].set(-INFINITY, -1);
   }
   while (k) {
-    ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k,
-                                           src + blockIdx.x * lds, firststep,
-                                           is_empty, max, dim, tid);
+    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
+                                           src + blockIdx.x * lds, &firststep,
+                                           &is_empty, &max, dim, tid);
 
     sh_topk[tid] = topk[0];
     BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
-                                         &indices, beam, k, tid, warp);
+                                         &indices, &beam, &k, tid, warp);
   }
 }
 
@@ -308,13 +316,14 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     KeMatrixTopK<T, 5, 256><<<
         grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               ctx.device_context())
-                              .stream()>>>(output_data, output->dims()[1],
-                                           indices_data, input_data,
-                                           input_width, input_width, int(k));
+                              .stream()>>>(
+        output_data, output->dims()[1], indices_data, input_data, input_width,
+        input_width, static_cast<int>(k));
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
+                        paddle::operators::TopkOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 42828b7e6564d7da91d608d63fbc0615ef6c4f97..7ddb82ef6ff063868a4b9b603b8ab89700b9dd13 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <iostream>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -22,7 +24,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
 
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
@@ -34,9 +35,9 @@ class TopkKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
     // FIXME: only deal with matrix(2d tensor).
-    auto* input = ctx.Input<LoDTensor>("X");
-    auto* output = ctx.Output<LoDTensor>("Out");
-    auto* indices = ctx.Output<LoDTensor>("Indices");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* indices = ctx.Output<Tensor>("Indices");
     // k is determined by Attr
     const size_t k = static_cast<int>(ctx.Attr<int>("k"));
 
@@ -54,6 +55,9 @@ class TopkKernel : public framework::OpKernel<T> {
     // NOTE: eigen shape doesn't affect paddle tensor.
     eg_input.reshape(flat2dims);
 
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
     for (size_t i = 0; i < row; i++) {
       std::vector<std::pair<T, size_t>> vec;
       for (size_t j = 0; j < col; j++) {
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index 87b1f530e08df7022d112b26e28511a982052126..60556a564c25c08612447ebd47a4b432b8a12d29 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/transpose_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
@@ -55,8 +56,7 @@ class TransposeOp : public framework::OperatorWithKernel {
 
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
@@ -117,8 +117,9 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad,
-            ops::TransposeOpGrad);
+REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad);
 REGISTER_OP_CPU_KERNEL(
     transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
index 90f16499a6f52514bfed3dbeb4176ccc956b23d7..895d1ce2cca19c0c1e4aa03cc64eb1425e8bab1a 100644
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
index 00f00bb403db5e40939a1502b2219fb4d36d58e5..75d6181749e4e9bd81a3c02de69caf0acd81eef9 100644
--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
@@ -32,14 +32,13 @@ class UniformRandomBatchSizeLikeOp : public BatchSizeLikeOp {
 };
 
 class UniformRandomBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker {
- public:
-  UniformRandomBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : BatchSizeLikeOpMaker(proto, op_checker) {
+ protected:
+  void Apply() override {
     AddComment(R"DOC(
-Uniform random operator
+UniformRandomBatchSizeLike operator.
 
 This operator initializes a tensor with the same batch_size as the Input tensor
- with random values sampled from a uniform distribution.
+with random values sampled from a uniform distribution.
 
 )DOC");
     AddAttr<float>("min",
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index 87699362b2b5a14750a01345098ec5e6cc9be115..edd1baa4ace4e246190afcd12b0716f1dd38e243 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -24,7 +24,19 @@ template <typename T>
 class CPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* tensor = ctx.Output<framework::Tensor>("Out");
+    framework::Tensor* tensor = nullptr;
+    auto out_var = ctx.OutputVar("Out");
+    if (out_var->IsType<framework::LoDTensor>()) {
+      tensor = out_var->GetMutable<framework::LoDTensor>();
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      auto shape = ctx.Attr<std::vector<int>>("shape");
+      tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(framework::make_ddim(shape));
+    } else {
+      PADDLE_THROW(
+          "uniform_random_op's output only"
+          "supports SelectedRows and Tensor");
+    }
     T* data = tensor->mutable_data<T>(ctx.GetPlace());
     unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
     std::minstd_rand engine;
@@ -73,42 +85,53 @@ class UniformRandomOp : public framework::OperatorWithKernel {
 
 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
-    AddOutput("Out", "(Tensor) The output tensor of uniform random op");
+  void Make() override {
+    AddOutput("Out", "The output tensor of uniform random op");
     AddComment(R"DOC(
-Uniform random operator.
-
 This operator initializes a tensor with random values sampled from a
-uniform distribution.
+uniform distribution. The random result is in set [min, max].
 
 )DOC");
-    AddAttr<std::vector<int>>("shape",
-                              "(vector<int>) The shape of the output tensor");
-    AddAttr<float>("min",
-                   "(float, default -1.0) "
-                   "Minimum value of uniform random")
+    AddAttr<std::vector<int>>("shape", "The shape of the output tensor");
+    AddAttr<float>("min", "Minimum value of uniform random. [default -1.0].")
         .SetDefault(-1.0f);
-    AddAttr<float>("max",
-                   "(float, default 1.0) "
-                   "Maximun value of uniform random")
+    AddAttr<float>("max", "Maximun value of uniform random. [default 1.0].")
         .SetDefault(1.0f);
     AddAttr<int>("seed",
-                 "(int, default 0) "
                  "Random seed used for generating samples. "
                  "0 means use a seed generated by the system."
                  "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
+                 "generate the same random numbers every time. [default 0].")
         .SetDefault(0);
-    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
+    AddAttr<int>("dtype", "Output tensor data type. [default 5(FP32)].")
         .SetDefault(framework::proto::VarType::FP32);
   }
 };
+
+class UniformRandomOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto out_var_name = op_desc.Output("Out").front();
+    if (block->FindRecursiveOrCreateVar(out_var_name).GetType() ==
+        framework::proto::VarType::SELECTED_ROWS) {
+      block->FindRecursiveOrCreateVar(out_var_name)
+          .SetType(framework::proto::VarType::SELECTED_ROWS);
+    } else {
+      block->FindRecursiveOrCreateVar(out_var_name)
+          .SetType(framework::proto::VarType::LOD_TENSOR);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp,
-                             paddle::operators::UniformRandomOpMaker);
+REGISTER_OPERATOR(uniform_random, paddle::operators::UniformRandomOp,
+                  paddle::operators::UniformRandomOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::UniformRandomOpVarTypeInference);
+
 REGISTER_OP_CPU_KERNEL(uniform_random,
                        paddle::operators::CPUUniformRandomKernel<float>,
                        paddle::operators::CPUUniformRandomKernel<double>);
diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu
index 1232cd1eb332441b12e59a34b2c2f75669925fd0..e1c7323a30233f4ec4f60e46aa6088ee6d8601b7 100644
--- a/paddle/fluid/operators/uniform_random_op.cu
+++ b/paddle/fluid/operators/uniform_random_op.cu
@@ -43,7 +43,19 @@ template <typename T>
 class GPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>("Out");
+    framework::Tensor* tensor = nullptr;
+    auto out_var = context.OutputVar("Out");
+    if (out_var->IsType<framework::LoDTensor>()) {
+      tensor = out_var->GetMutable<framework::LoDTensor>();
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      auto shape = context.Attr<std::vector<int>>("shape");
+      tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
+      tensor->Resize(framework::make_ddim(shape));
+    } else {
+      PADDLE_THROW(
+          "uniform_random_op's output only"
+          "supports SelectedRows and Tensor");
+    }
     T* data = tensor->mutable_data<T>(context.GetPlace());
     unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
     if (seed == 0) {
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
index 0ca7ea00fafc5cf7ab240e1e41710d3b791dfbfb..1d441b43b14ea194152095874645f8133c423efd 100644
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/unpool_op.h"
+#include <string>
+#include <vector>
 namespace paddle {
 namespace operators {
 
 class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(
         "X",
         "(Tensor) The input tensor of unpool operator. "
@@ -130,8 +131,9 @@ class UnpoolOpGrad : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
-            ops::UnpoolOpGrad);
+REGISTER_OPERATOR(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad);
 REGISTER_OP_CPU_KERNEL(
     unpool, ops::UnpoolKernel<paddle::platform::CPUDeviceContext, float>,
     ops::UnpoolKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h
index a4421045756bd39728fc14c06efd11a56c7e55af..96abad3de9b959ee611355c67f1fa9e56c430b1b 100644
--- a/paddle/fluid/operators/unpool_op.h
+++ b/paddle/fluid/operators/unpool_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/unpooling.h"
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 940bf4fe7baa6a01a2143374b502c61d0b55fd77..e06c8c962f45a4e91b7efed7431571f0fc6870a3 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -53,8 +53,7 @@ class WarpCTCOp : public framework::OperatorWithKernel {
 
 class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput("Logits",
              "(LodTensor, default: LoDTensor<float>), the unscaled "
              "probabilities of variable-length sequences, which is a 2-D "
@@ -132,8 +131,9 @@ class WarpCTCGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad,
-            ops::WarpCTCGradOp);
+REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp);
 REGISTER_OP_CPU_KERNEL(
     warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h
index 3e3e3089315ab9365925c38b9bce5fb0120d37c3..ab70c1f0592d122ba248a101db487e64c0bdae6f 100644
--- a/paddle/fluid/operators/warpctc_op.h
+++ b/paddle/fluid/operators/warpctc_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence_padding.h"
@@ -161,7 +162,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
                               static_cast<int64_t>(sequence_width)});
     warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
     math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), *logits, warpctc_logits,
+        ctx.template device_context<DeviceContext>(), *logits, &warpctc_logits,
         false);
     const T* warpctc_logits_data = warpctc_logits.data<T>();
 
@@ -185,8 +186,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
 
     // warpctc accesses labels in CPU memory
     Tensor warpctc_label;
-    TensorCopy(*label, platform::CPUPlace(), ctx.device_context(),
-               &warpctc_label);
+    TensorCopySync(*label, platform::CPUPlace(), &warpctc_label);
     const int* warpctc_label_data = warpctc_label.data<int>();
     // warpctc stores loss in CPU memory
     Tensor warpctc_loss;
@@ -216,13 +216,13 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
     logits_grad->mutable_data<T>(ctx.GetPlace());
     bool norm_by_times = ctx.Attr<bool>("norm_by_times");
     math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), *logits_grad,
+        ctx.template device_context<DeviceContext>(), logits_grad,
         *warpctc_grad, norm_by_times);
 
     const T* loss_grad_data = loss_grad->data<T>();
     math::ScaleLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), *logits_grad,
-        loss_grad_data);
+        ctx.template device_context<DeviceContext>(), loss_grad_data,
+        logits_grad);
   }
 };
 
diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc
index 8b62b242cf8745378eb216db10605388b294ca75..733157ea05ed39434b9a750e3a94ea548f512ce6 100644
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 
 namespace paddle {
@@ -68,8 +69,7 @@ class WhileOp : public framework::OperatorBase {
 
 class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
+  void Make() override {
     AddInput(kX,
              "A set of variables, which are required by operators inside the "
              "block of While Op.")
@@ -136,15 +136,14 @@ class WhileGradOp : public framework::OperatorBase {
         auto &og_inside =
             detail::Ref(cur_scope.Var(inside_og_name),
                         "Cannot find inside gradient %s", inside_og_name);
-        if (og_outside.Type().hash_code() ==
-            typeid(framework::LoDTensor).hash_code()) {
+        if (framework::IsType<framework::LoDTensor>(og_outside.Type())) {
           auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
           auto &inside_tensor =
               detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
           inside_tensor.set_lod(outside_tensor.lod());
           inside_tensor.ShareDataWith(outside_tensor);
-        } else if (og_outside.Type().hash_code() ==
-                   typeid(framework::LoDTensorArray).hash_code()) {
+        } else if (framework::IsType<framework::LoDTensorArray>(
+                       og_outside.Type())) {
           auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
           auto &inside_array =
               detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
@@ -204,11 +203,11 @@ class WhileGradOp : public framework::OperatorBase {
                 ->set_lod(inside_tensor.lod());
           }
         }
-
         auto new_inside_name = cur_scope.Rename(inside_grad_name);
         auto sum_op = framework::OpRegistry::CreateOp(
             "sum", {{"X", {pg_names[param_id], new_inside_name}}},
-            {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+            {{"Out", {pg_names[param_id]}}},
+            framework::AttributeMap{{"use_mkldnn", {false}}});
         sum_op->Run(cur_scope, dev_place);
         cur_scope.Rename(new_inside_name, inside_grad_name);
       }
@@ -288,7 +287,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
     while_grad->SetInput(framework::GradVarName(kOutputs), output_grads_list);
 
     while_grad->SetAttrMap(this->Attrs());
-    while_grad->SetBlockAttr(kStepBlock, *grad_block);
+    while_grad->SetBlockAttr(kStepBlock, grad_block);
     // record the original output gradient names, since the gradient name of
     // while operator could be renamed.
     while_grad->SetAttr("original_output_grad", output_grads_list);
diff --git a/paddle/fluid/platform/.clang-format b/paddle/fluid/platform/.clang-format
deleted file mode 100644
index 29282dc87e2c499988c17d90d47d44cd5cf7f115..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/.clang-format
+++ /dev/null
@@ -1,5 +0,0 @@
----
-Language:        Cpp
-BasedOnStyle:  Google
-Standard:  Cpp11 
-...
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 7eec6ab657723c6390dfa14a78d6c49a76f2a279..b29035bafd34fa81dc6b59691142fe74439202b8 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,4 +1,4 @@
-proto_library(profiler_proto SRCS profiler.proto)
+proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)
 
 add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@@ -6,13 +6,13 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
 add_dependencies(profiler_py_proto profiler_py_proto_init)
 
 add_custom_command(TARGET profiler_py_proto POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
-        COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/fluid/proto/profiler
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
+        COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
         COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 if(WITH_GPU)
-  cc_library(enforce SRCS enforce.cc DEPS)
+  nv_library(enforce SRCS enforce.cc)
 else()
   cc_library(enforce SRCS enforce.cc)
 endif()
@@ -42,14 +42,14 @@ ENDIF()
 
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
-    system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+cc_library(device_context SRCS device_context.cc DEPS malloc
+    place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
+nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 
-cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS})
+cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h
index 123d3598f4f4753f70889e415aff0f41b7d212f7..2ce9b31bb81de867ff4ed6ee14afddecd95317b9 100644
--- a/paddle/fluid/platform/assert.h
+++ b/paddle/fluid/platform/assert.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 
-#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
+#if defined(__CUDA_ARCH__)
 #include <stdio.h>
 #define PADDLE_ASSERT(e)                                           \
   do {                                                             \
@@ -38,6 +38,9 @@ limitations under the License. */
   } while (0)
 #else
 #include <assert.h>
-#define PADDLE_ASSERT(e) assert(e)
+// For cuda, the assertions can affect performance and it is therefore
+// recommended to disable them in production code
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
+#define PADDLE_ASSERT(e) assert((e))
 #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
 #endif
diff --git a/paddle/fluid/platform/call_once.h b/paddle/fluid/platform/call_once.h
deleted file mode 100644
index fa34972c38d6e7f77a7e178d68592f9886748fa1..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/call_once.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mutex>
-
-namespace paddle {
-namespace platform {
-
-/*
- The current implementation of std::call_once has a bug described in
- https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
- This is likely caused by a deeper bug of pthread_once, which is discussed in
- https://patchwork.ozlabs.org/patch/482350/
-
- This wrap is a hack to avoid this bug.
-*/
-template <typename Callable, typename... Args>
-inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
-  bool good = true;
-  std::exception ex;
-  try {
-    std::call_once(flag,
-                   [&](Args&&... args) {
-                     try {
-                       f(args...);
-                     } catch (const std::exception& e) {
-                       ex = e;
-                       good = false;
-                     } catch (...) {
-                       ex = std::runtime_error("excption caught in call_once");
-                       good = false;
-                     }
-                   },
-                   args...);
-  } catch (std::system_error& x) {
-    throw std::runtime_error("call once failed");
-  }
-  if (!good) {
-    throw std::exception(ex);
-  }
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 8db08edba805e41d33ec6a6a4b338cca0d4906ef..f832d72b53e8d06a32d5c0ac2ecf7130aa28a666 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -21,12 +21,28 @@ limitations under the License. */
 #include <unistd.h>
 #endif
 
+#include <algorithm>
 #include "gflags/gflags.h"
 
 DEFINE_double(fraction_of_cpu_memory_to_use, 1,
               "Default use 100% of CPU memory for PaddlePaddle,"
               "reserve the rest for page tables, etc");
 
+DEFINE_uint64(initial_cpu_memory_in_mb,
+#ifdef PADDLE_WITH_MKLDNN
+              /* Aligned with mozga-intel, MKLDNN need at least 5000 MB
+               * to obtain the best performance*/
+              5000,
+#else
+              500,
+#endif
+              "Initial CPU memory for PaddlePaddle, in MD unit.");
+
+DEFINE_double(
+    fraction_of_cuda_pinned_memory_to_use, 0.5,
+    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
+    "reserve the rest for page tables, etc");
+
 namespace paddle {
 namespace platform {
 
@@ -58,8 +74,28 @@ size_t CpuMinChunkSize() {
 }
 
 size_t CpuMaxChunkSize() {
-  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
-  return CpuMaxAllocSize() / 32;
+  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
+  // or the initial_cpu_memory_in_mb.
+  return std::min(
+      static_cast<size_t>(CpuMaxAllocSize() / 32),
+      static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
+}
+
+size_t CUDAPinnedMaxAllocSize() {
+  // For distributed systems, it requires configuring and limiting
+  // the fraction of memory to use.
+  return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory();
+}
+
+size_t CUDAPinnedMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 64 KB.
+  return 1 << 16;
+}
+
+size_t CUDAPinnedMaxChunkSize() {
+  // Allow to allocate the maximum chunk size is roughly 1/256 of CUDA_PINNED
+  // memory.
+  return CUDAPinnedMaxAllocSize() / 256;
 }
 
 }  // namespace platform
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index a930151bd15a33d5b8861c6239e7dd964822f0f6..f06c2b67fe4385f427322e9bb2f3080fdd3acc94 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -22,11 +22,20 @@ namespace platform {
 //! Get the maximum allocation size for a machine.
 size_t CpuMaxAllocSize();
 
+//! Get the maximum allocation size for a machine.
+size_t CUDAPinnedMaxAllocSize();
+
 //! Get the minimum chunk size for buddy allocator.
 size_t CpuMinChunkSize();
 
 //! Get the maximum chunk size for buddy allocator.
 size_t CpuMaxChunkSize();
 
+//! Get the minimum chunk size for buddy allocator.
+size_t CUDAPinnedMinChunkSize();
+
+//! Get the maximum chunk size for buddy allocator.
+size_t CUDAPinnedMaxChunkSize();
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/cpu_info_test.cc b/paddle/fluid/platform/cpu_info_test.cc
index 78332f90cd96d80cca0cf865f4815aaf18463253..aac882e846309f23f49f68aba805da0857c7fb2d 100644
--- a/paddle/fluid/platform/cpu_info_test.cc
+++ b/paddle/fluid/platform/cpu_info_test.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/string/printf.h"
 
 #include <ostream>
 #include <sstream>
@@ -20,6 +19,7 @@
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/string/printf.h"
 
 DECLARE_double(fraction_of_cpu_memory_to_use);
 
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecec4178f2d9937920e52eb74bf9068b84e741a0
--- /dev/null
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda.h>
+
+namespace paddle {
+namespace platform {
+
+#if CUDA_VERSION < 9000
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+template <typename T>
+__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
+                                                 int delta, int width = 32) {
+#if CUDA_VERSION < 9000
+  return __shfl_down(val, delta, width);
+#else
+  return __shfl_down_sync(mask, val, delta, width);
+#endif
+}
+
+template <typename T>
+__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
+                                             int width = 32) {
+#if CUDA_VERSION < 9000
+  return __shfl(val, src_line, width);
+#else
+  return __shfl_sync(mask, val, src_line, width);
+#endif
+}
+
+template <typename T>
+__device__ T reduceSum(T val, int tid, int len) {
+  // NOTE(zcd): The warp size should be taken from the
+  // parameters of the GPU but not specified as 32 simply.
+  // To make the reduceSum more efficiently,
+  // I use Warp-Level Parallelism and assume the Warp size
+  // is 32 which may be different for different GPU,
+  // but most card's warp size is 32.
+  const int warpSize = 32;
+  __shared__ T shm[warpSize];
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, tid < len);
+
+  for (int offset = warpSize / 2; offset > 0; offset /= 2)
+    val += platform::CudaShuffleDownSync(mask, val, offset);
+
+  if (tid < warpSize) shm[tid] = 0;
+  __syncthreads();
+
+  if (tid % warpSize == 0) {
+    shm[tid / warpSize] = val;
+  }
+  __syncthreads();
+
+  CREATE_SHFL_MASK(mask, tid < warpSize);
+
+  if (tid < warpSize) {
+    val = shm[tid];
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+      val += platform::CudaShuffleDownSync(mask, val, offset);
+  }
+  return val;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
deleted file mode 100644
index a4ea4f21e3c16c9292cf67863616924e9d9f8aba..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/cuda_helper.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cuda.h>
-
-namespace paddle {
-namespace platform {
-
-#define CUDA_ATOMIC_WRAPPER(op, T) \
-  __device__ __forceinline__ T CudaAtomic##op(T* address, const T val)
-
-#define USE_CUDA_ATOMIC(op, T) \
-  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
-
-// Default thread count per block(or block size).
-// TODO(typhoonzero): need to benchmark against setting this value
-//                    to 1024.
-constexpr int PADDLE_CUDA_NUM_THREADS = 512;
-
-// For atomicAdd.
-USE_CUDA_ATOMIC(Add, float);
-USE_CUDA_ATOMIC(Add, int);
-USE_CUDA_ATOMIC(Add, unsigned int);
-USE_CUDA_ATOMIC(Add, unsigned long long int);
-
-CUDA_ATOMIC_WRAPPER(Add, int64_t) {
-  static_assert(sizeof(int64_t) == sizeof(long long int),
-                "long long should be int64");
-  return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address),
-                       static_cast<unsigned long long int>(val));
-}
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
-USE_CUDA_ATOMIC(Add, double);
-#else
-CUDA_ATOMIC_WRAPPER(Add, double) {
-  unsigned long long int* address_as_ull =
-      reinterpret_cast<unsigned long long int*>(address);
-  unsigned long long int old = *address_as_ull, assumed;
-
-  do {
-    assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val + __longlong_as_double(assumed)));
-
-    // Note: uses integer comparison to avoid hang in case of NaN
-  } while (assumed != old);
-
-  return __longlong_as_double(old);
-}
-#endif
-
-// __shfl_down has been deprecated as of CUDA 9.0.
-#if CUDA_VERSION < 9000
-template <typename T>
-__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
-  return __shfl_down(val, delta);
-}
-#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
-#else
-#define FULL_WARP_MASK 0xFFFFFFFF
-#define CREATE_SHFL_MASK(mask, predicate) \
-  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-#endif
-
-template <typename T>
-__device__ T reduceSum(T val, int tid, int len) {
-  // TODO(zcd): The warp size should be taken from the
-  // parameters of the GPU but not specified as 32 simply.
-  // To make the reduceSum more efficiently,
-  // I use Warp-Level Parallelism and assume the Warp size
-  // is 32 which may be different for different GPU,
-  // but most card's warp size is 32.
-  __shared__ T shm[32];
-  const int warpSize = 32;
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, tid < len);
-
-  for (int offset = warpSize / 2; offset > 0; offset /= 2)
-    val += __shfl_down_sync(mask, val, offset);
-
-  if (tid < warpSize) shm[tid] = 0;
-
-  __syncthreads();
-
-  if (tid % warpSize == 0) {
-    shm[tid / warpSize] = val;
-  }
-
-  CREATE_SHFL_MASK(mask, tid < warpSize);
-
-  if (tid < warpSize) {
-    val = shm[tid];
-    for (int offset = warpSize / 2; offset > 0; offset /= 2)
-      val += __shfl_down_sync(mask, val, offset);
-  }
-
-  return val;
-}
-
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
new file mode 100644
index 0000000000000000000000000000000000000000..d535ed2f89df6a0b311ec068ecd92c8e3183cee7
--- /dev/null
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda.h>
+
+namespace paddle {
+namespace platform {
+
+#define CUDA_ATOMIC_WRAPPER(op, T) \
+  __device__ __forceinline__ T CudaAtomic##op(T* address, const T val)
+
+#define USE_CUDA_ATOMIC(op, T) \
+  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
+
+// Default thread count per block(or block size).
+// TODO(typhoonzero): need to benchmark against setting this value
+//                    to 1024.
+constexpr int PADDLE_CUDA_NUM_THREADS = 512;
+
+// For atomicAdd.
+USE_CUDA_ATOMIC(Add, float);
+USE_CUDA_ATOMIC(Add, int);
+USE_CUDA_ATOMIC(Add, unsigned int);
+// CUDA API uses unsigned long long int, we cannot use uint64_t here.
+// It because unsigned long long int is not necessarily uint64_t
+USE_CUDA_ATOMIC(Add, unsigned long long int);  // NOLINT
+
+CUDA_ATOMIC_WRAPPER(Add, int64_t) {
+  // Here, we check long long int must be int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
+                "long long should be int64");
+  return CudaAtomicAdd(
+      reinterpret_cast<unsigned long long int*>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));           // NOLINT
+}
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+USE_CUDA_ATOMIC(Add, double);
+#else
+CUDA_ATOMIC_WRAPPER(Add, double) {
+  unsigned long long int* address_as_ull =                 // NOLINT
+      reinterpret_cast<unsigned long long int*>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;   // NOLINT
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+
+    // Note: uses integer comparison to avoid hang in case of NaN
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+}
+#endif
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
index ebd6aebd7688549c6fb14466cfa461b90a9fdde0..41d7c121469edd24c67b4288793cb95159fd4b62 100644
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
+
 #include <cuda_profiler_api.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+
+#include <string>
+
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 1842ecd745e3f5cb75600ce00d89018f81682632..6ea4f8b7cba18ce7f803dbd9b15a7ae70c3055f2 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -19,8 +19,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/macros.h"
 
+DECLARE_bool(cudnn_deterministic);
+
 namespace paddle {
 namespace platform {
 
@@ -75,16 +78,70 @@ enum class DataLayout {  // Not use
 enum class PoolingMode {
   kMaximum,
   kAverage,
+  kMaximumDeterministic,
 };
 
+#if CUDNN_VERSION < 6000
+#pragma message "CUDNN version under 6.0 is supported at best effort."
+#pragma message "We strongly encourage you to move to 6.0 and above."
+#pragma message "This message is intended to annoy you enough to update."
+#pragma message \
+    "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/"
+
+inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
+  switch (mode) {
+    case PoolingMode::kMaximumDeterministic:
+      return CUDNN_POOLING_MAX;
+    case PoolingMode::kAverage:
+      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kMaximum:
+      return CUDNN_POOLING_MAX;
+    default:
+      PADDLE_THROW("Unexpected pooling mode.");
+  }
+}
+#else
+
+inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
+  switch (mode) {
+    case PoolingMode::kMaximumDeterministic:
+      return CUDNN_POOLING_MAX_DETERMINISTIC;
+    case PoolingMode::kAverage:
+      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+    case PoolingMode::kMaximum:
+      return CUDNN_POOLING_MAX;
+    default:
+      PADDLE_THROW("Unexpected pooling mode.");
+  }
+}
+#endif  // CUDNN_VERSION < 6000
+
 template <typename T>
 class CudnnDataType;
 
+template <>
+class CudnnDataType<float16> {
+ public:
+  static const cudnnDataType_t type = CUDNN_DATA_HALF;
+  // The scaling param type is float for HALF and FLOAT tensors
+  using ScalingParamType = const float;
+  using BatchNormParamType = float;
+  static ScalingParamType* kOne() {
+    static ScalingParamType v = 1.0;
+    return &v;
+  }
+  static ScalingParamType* kZero() {
+    static ScalingParamType v = 0.0;
+    return &v;
+  }
+};
+
 template <>
 class CudnnDataType<float> {
  public:
   static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
-  typedef const float ScalingParamType;
+  using ScalingParamType = const float;
+  using BatchNormParamType = float;
   static ScalingParamType* kOne() {
     static ScalingParamType v = 1.0;
     return &v;
@@ -99,7 +156,8 @@ template <>
 class CudnnDataType<double> {
  public:
   static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
-  typedef const double ScalingParamType;
+  using ScalingParamType = const double;
+  using BatchNormParamType = double;
   static ScalingParamType* kOne() {
     static ScalingParamType v = 1.0;
     return &v;
@@ -237,9 +295,11 @@ class ScopedConvolutionDescriptor {
     }
 #endif
 
+    cudnnDataType_t compute_type =
+        (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
     PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
         desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
-        CUDNN_CROSS_CORRELATION, type));
+        CUDNN_CROSS_CORRELATION, compute_type));
     return desc_;
   }
 
@@ -271,9 +331,7 @@ class ScopedPoolingDescriptor {
     PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
     PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
     PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor(
-        desc_, (mode == PoolingMode::kMaximum
-                    ? CUDNN_POOLING_MAX
-                    : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
+        desc_, (GetPoolingMode(mode)),
         CUDNN_PROPAGATE_NAN,  // Always propagate nans.
         kernel.size(), kernel.data(), pads.data(), strides.data()));
     return desc_;
@@ -289,7 +347,7 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
   use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
 #ifdef PADDLE_WITH_CUDA
   if (use_cudnn) {
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto& dev_ctx = ctx.device_context<platform::CUDADeviceContext>();
     use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
   }
 #endif
diff --git a/paddle/fluid/platform/details/cuda_transform_iterator_cast.h b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..06afc44c257bbeb0729323e1a42e1eead23ff075
--- /dev/null
+++ b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef __NVCC__
+#error device_ptr_cast must be include by .cu file
+#endif
+
+#include <type_traits>  // For std::remove_pointer and std::is_pointer.
+
+#include "thrust/device_ptr.h"
+
+namespace paddle {
+namespace platform {
+namespace details {
+
+// PointerToThrustDevicePtr has two speicalizations, one casts a (CUDA
+// device) pointer into thrust::device_ptr, the other keeps rest types
+// un-casted.
+template <typename T, bool is_ptr>
+struct PointerToThrustDevicePtr;
+
+template <typename T>
+struct PointerToThrustDevicePtr<T, true> {
+  using ELEM = typename std::remove_pointer<T>::type;
+  using RTYPE = thrust::device_ptr<ELEM>;
+
+  inline thrust::device_ptr<ELEM> operator()(ELEM* ele) const {
+    return thrust::device_pointer_cast(ele);
+  }
+};
+
+template <typename T>
+struct PointerToThrustDevicePtr<T, false> {
+  using RTYPE = T;
+  inline RTYPE operator()(RTYPE it) const { return it; }
+};
+
+// CastToCUDATransformIterator casts a pointer to thrust::device_ptr
+// so it could be used as the iterator of thrust::transform.  It
+// doesn't cast other types.
+//
+// We need CastToCUDATransformIterator because it is often that we
+// want to use device memory pointers as transform iterators, e.g., to
+// transform a block of float32 to float16.  In this case, we want
+// CastToCUDATransformIterator to cast float16/32 pointers to
+// thrust::device_ptr, otherwise they cannot work as the iterator
+// required by thrust::transform.  At the same time, we don't want to
+// cast thrust::device_ptr to thrust::device_ptr repeatedly.
+template <typename T>
+auto CastToCUDATransformIterator(T t) ->
+    typename PointerToThrustDevicePtr<T, std::is_pointer<T>::value>::RTYPE {
+  PointerToThrustDevicePtr<T, std::is_pointer<T>::value> cast;
+  return cast(t);
+}
+
+}  // namespace details
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/details/device_ptr_cast.h b/paddle/fluid/platform/details/device_ptr_cast.h
deleted file mode 100644
index 1c502a19c056c7fe434e68d568a0f59bf6315b95..0000000000000000000000000000000000000000
--- a/paddle/fluid/platform/details/device_ptr_cast.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef __NVCC__
-#error device_ptr_cast must be include by .cu file
-#endif
-
-#include <thrust/device_ptr.h>
-
-namespace paddle {
-namespace platform {
-namespace details {
-template <typename T, bool is_ptr>
-struct DevicePtrCast;
-
-template <typename T>
-struct DevicePtrCast<T, true> {
-  using ELEM = typename std::remove_pointer<T>::type;
-  using RTYPE = thrust::device_ptr<ELEM>;
-
-  inline thrust::device_ptr<ELEM> operator()(ELEM* ele) const {
-    return thrust::device_pointer_cast(ele);
-  }
-};
-
-template <typename T>
-struct DevicePtrCast<T, false> {
-  using RTYPE = T;
-  inline RTYPE operator()(RTYPE it) const { return it; }
-};
-
-// Cast T to thrust::device_ptr if T is a pointer.
-// Otherwise, e.g., T is a iterator, return T itself.
-template <typename T>
-auto DevPtrCast(T t) ->
-    typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE {
-  DevicePtrCast<T, std::is_pointer<T>::value> cast;
-  return cast(t);
-}
-
-}  // namespace details
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 98b4178177b0a8bafd6fe34a92be2a07a2fbc5a7..2cc26da013f59f5b7ee1747d57baca9c1c0efe2c 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -8,8 +8,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/platform/device_context.h"
+
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
 #include "paddle/fluid/memory/memory.h"
 
 namespace paddle {
@@ -17,36 +22,48 @@ namespace platform {
 
 DeviceContextPool* DeviceContextPool::pool = nullptr;
 
-const platform::DeviceContext* DeviceContextPool::Get(
-    const platform::Place& place) {
+platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
     PADDLE_THROW(
         "'Place' is not supported, Please re-compile with WITH_GPU "
         "option");
   }
-  return it->second;
+  return it->second.get();
 }
 
 DeviceContextPool::DeviceContextPool(
     const std::vector<platform::Place>& places) {
   PADDLE_ENFORCE_GT(places.size(), 0);
-  for (size_t i = 0; i < places.size(); i++) {
-    if (platform::is_cpu_place(places[i])) {
+  using PtrType = std::unique_ptr<DeviceContext>;
+  std::set<Place> set;
+  for (auto& p : places) {
+    set.insert(p);
+  }
+
+  for (auto& p : set) {
+    if (platform::is_cpu_place(p)) {
 #ifdef PADDLE_WITH_MKLDNN
-      device_contexts_.emplace(places[i],
-                               new platform::MKLDNNDeviceContext(
-                                   boost::get<platform::CPUPlace>(places[i])));
+      device_contexts_.emplace(
+          p, PtrType(new MKLDNNDeviceContext(boost::get<CPUPlace>(p))));
 #else
-      device_contexts_.emplace(places[i],
-                               new platform::CPUDeviceContext(
-                                   boost::get<platform::CPUPlace>(places[i])));
+      device_contexts_.emplace(
+          p, PtrType(new CPUDeviceContext(boost::get<CPUPlace>(p))));
+#endif
+    } else if (platform::is_gpu_place(p)) {
+#ifdef PADDLE_WITH_CUDA
+      device_contexts_.emplace(
+          p, PtrType(new CUDADeviceContext(boost::get<CUDAPlace>(p))));
+#else
+      PADDLE_THROW(
+          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
+          "option");
 #endif
-    } else if (platform::is_gpu_place(places[i])) {
+    } else if (platform::is_cuda_pinned_place(p)) {
 #ifdef PADDLE_WITH_CUDA
-      device_contexts_.emplace(places[i],
-                               new platform::CUDADeviceContext(
-                                   boost::get<platform::CUDAPlace>(places[i])));
+      device_contexts_.emplace(
+          p,
+          PtrType(new CUDAPinnedDeviceContext(boost::get<CUDAPinnedPlace>(p))));
 #else
       PADDLE_THROW(
           "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
@@ -183,6 +200,20 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
 
 cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
+CUDAPinnedDeviceContext::CUDAPinnedDeviceContext() {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+CUDAPinnedDeviceContext::CUDAPinnedDeviceContext(CUDAPinnedPlace place)
+    : place_(place) {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 603b890af13b529c490c29112a73a09cc815d07a..88e0383146c1adf2752a362091996bad9cfcce5e 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -8,11 +8,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include <memory>
+#include <mutex>  // NOLINT
+#include <string>
 #include <unordered_map>
+#include <vector>
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cublas.h"
@@ -25,12 +27,12 @@ limitations under the License. */
 #include <mkldnn.hpp>
 #endif
 
+#include <map>
+#include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
-#include "glog/logging.h"
-
 namespace paddle {
 namespace platform {
 
@@ -97,12 +99,18 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return cuda stream in the device context. */
   cudaStream_t stream() const;
 
+  template <typename Callback>
+  void RecordEvent(cudaEvent_t ev, Callback callback) {
+    std::lock_guard<std::mutex> guard(mtx_);
+    callback();
+    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
+  }
+
  private:
   CUDAPlace place_;
 
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
-
   cudaStream_t stream_;
   cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
@@ -110,6 +118,8 @@ class CUDADeviceContext : public DeviceContext {
   int compute_capability;
   int multi_process;
   int max_threads_per_mp;
+
+  std::mutex mtx_;
 };
 
 template <>
@@ -117,6 +127,25 @@ struct DefaultDeviceContextType<platform::CUDAPlace> {
   using TYPE = CUDADeviceContext;
 };
 
+// Currently, CUDAPinnedDeviceContext is only used to data copying.
+class CUDAPinnedDeviceContext : public DeviceContext {
+ public:
+  CUDAPinnedDeviceContext();
+  explicit CUDAPinnedDeviceContext(CUDAPinnedPlace place);
+
+  Place GetPlace() const override;
+
+  Eigen::DefaultDevice* eigen_device() const;
+
+ private:
+  CUDAPinnedPlace place_;
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+};
+
+template <>
+struct DefaultDeviceContextType<platform::CUDAPinnedPlace> {
+  using TYPE = CUDAPinnedDeviceContext;
+};
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -159,7 +188,7 @@ class DeviceContextPool {
   }
 
   /*! \brief  Return handle of single device context. */
-  const platform::DeviceContext* Get(const platform::Place& place);
+  platform::DeviceContext* Get(const platform::Place& place);
 
   template <typename Place>
   const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
@@ -172,20 +201,7 @@ class DeviceContextPool {
 
  private:
   static DeviceContextPool* pool;
-  constexpr static int LEFT_SHIFT = 8;
-  struct Hash {
-    std::hash<int> hash_;
-    size_t operator()(const platform::Place& place) const {
-      int pre_hash = place.which() << LEFT_SHIFT;
-      if (platform::is_gpu_place(place)) {
-        pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
-      }
-      return hash_(pre_hash);
-    }
-  };
-  std::unordered_map<const platform::Place, const platform::DeviceContext*,
-                     Hash>
-      device_contexts_;
+  std::map<Place, std::unique_ptr<DeviceContext>> device_contexts_;
   DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
 };
 
diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu
index 9d8d07362ce3a0d0c2a009c9844db0a3bdaf01cb..fa806aba6d8747beebc3eed2c661b326dd62fd76 100644
--- a/paddle/fluid/platform/device_context_test.cu
+++ b/paddle/fluid/platform/device_context_test.cu
@@ -11,11 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#include "gtest/gtest.h"
 #include "paddle/fluid/platform/device_context.h"
 
+#include <vector>
+
 #include "glog/logging.h"
+#include "gtest/gtest.h"
 
 TEST(Device, Init) {
   using paddle::platform::DeviceContext;
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 78e00d5420bbea40c9bea4be919ec4ce5ececdcb..d9e2afadaf8ec439d158e57c94d3e6e684bce116 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -11,23 +11,33 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/platform/device_tracer.h"
-#include <google/protobuf/text_format.h>
+
+#include <deque>
 #include <fstream>
 #include <map>
-#include <mutex>
+#include <mutex>  // NOLINT
 #include <numeric>
-#include <thread>
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
 #include "glog/logging.h"
+#include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/string/printf.h"
 
 namespace paddle {
 namespace platform {
 namespace {
+// Current thread's id. Note, we don't distinguish nested threads
+// for now.
+thread_local int cur_thread_id = 0;
+// Tracking the nested block stacks of each thread.
+thread_local std::deque<int> block_id_stack;
+// Tracking the nested event stacks.
+thread_local std::deque<std::string> annotation_stack;
 
-thread_local const char *cur_annotation = nullptr;
 std::once_flag tracer_once_flag;
 DeviceTracer *tracer = nullptr;
 }  // namespace
@@ -117,7 +127,7 @@ void DisableActivity() {
 
 void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
                               size_t *maxNumRecords) {
-  uint8_t *buf = (uint8_t *)malloc(kBufSize + kAlignSize);
+  uint8_t *buf = reinterpret_cast<uint8_t *>(malloc(kBufSize + kAlignSize));
   *size = kBufSize;
   *buffer = ALIGN_BUFFER(buf, kAlignSize);
   *maxNumRecords = 0;
@@ -191,19 +201,19 @@ class DeviceTracerImpl : public DeviceTracer {
     correlations_[id] = anno;
   }
 
-  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
-    if (!anno) {
-      // TODO(panyx0718): Currently, it doesn't support nested situation
-      // Up-level can be cleared by low-level and therefore get nullptr
-      // here.
+  void AddCPURecords(const std::string &anno, uint64_t start_ns,
+                     uint64_t end_ns, int64_t device_id, int64_t thread_id) {
+    if (anno.empty()) {
+      VLOG(1) << "Empty timeline annotation.";
       return;
     }
     std::lock_guard<std::mutex> l(trace_mu_);
-    cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0});
+    cpu_records_.push_back(
+        CPURecord{anno, start_ns, end_ns, device_id, thread_id});
   }
 
   void AddMemRecords(const std::string &name, uint64_t start_ns,
-                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint64_t end_ns, int64_t device_id, int64_t stream_id,
                      uint32_t correlation_id, uint64_t bytes) {
     // 0 means timestamp information could not be collected for the kernel.
     if (start_ns == 0 || end_ns == 0) {
@@ -215,8 +225,8 @@ class DeviceTracerImpl : public DeviceTracer {
                                      stream_id, correlation_id, bytes});
   }
 
-  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
-                        uint32_t stream_id, uint32_t correlation_id) {
+  void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
+                        int64_t stream_id, uint32_t correlation_id) {
     // 0 means timestamp information could not be collected for the kernel.
     if (start == 0 || end == 0) {
       VLOG(3) << correlation_id << " cannot be traced";
@@ -235,7 +245,6 @@ class DeviceTracerImpl : public DeviceTracer {
   void Enable() {
     std::lock_guard<std::mutex> l(trace_mu_);
     if (enabled_) {
-      fprintf(stderr, "DeviceTracer already enabled\n");
       return;
     }
     EnableActivity();
@@ -270,27 +279,30 @@ class DeviceTracerImpl : public DeviceTracer {
         continue;
       }
       auto *event = profile_pb.add_events();
+      event->set_type(proto::Event::GPUKernel);
       event->set_name(correlations_.at(r.correlation_id));
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
-      event->set_stream_id(r.stream_id);
+      event->set_sub_device_id(r.stream_id);
       event->set_device_id(r.device_id);
     }
 
     for (const CPURecord &r : cpu_records_) {
       auto *event = profile_pb.add_events();
+      event->set_type(proto::Event::CPU);
       event->set_name(r.name);
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
-      event->set_stream_id(r.thread_id);
-      event->set_device_id(-1);
+      event->set_sub_device_id(r.thread_id);
+      event->set_device_id(r.device_id);
     }
     for (const MemRecord &r : mem_records_) {
       auto *event = profile_pb.add_events();
+      event->set_type(proto::Event::GPUKernel);
       event->set_name(r.name);
       event->set_start_ns(r.start_ns);
       event->set_end_ns(r.end_ns);
-      event->set_stream_id(r.stream_id);
+      event->set_sub_device_id(r.stream_id);
       event->set_device_id(r.device_id);
       event->mutable_memcopy()->set_bytes(r.bytes);
     }
@@ -310,7 +322,6 @@ class DeviceTracerImpl : public DeviceTracer {
     DisableActivity();
     dynload::cuptiUnsubscribe(subscriber_);
     CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
-    PADDLE_ENFORCE(dynload::cuptiFinalize());
     enabled_ = false;
   }
 
@@ -323,8 +334,9 @@ class DeviceTracerImpl : public DeviceTracer {
     if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
         (cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
       if (cbInfo->callbackSite == CUPTI_API_ENTER) {
-        const std::string anno =
-            cur_annotation ? cur_annotation : cbInfo->symbolName;
+        const std::string anno = !annotation_stack.empty()
+                                     ? annotation_stack.back()
+                                     : cbInfo->symbolName;
         tracer->AddAnnotation(cbInfo->correlationId, anno);
       }
     } else {
@@ -351,14 +363,15 @@ class DeviceTracerDummy : public DeviceTracer {
 
   void AddAnnotation(uint64_t id, const std::string &anno) {}
 
-  void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {}
+  void AddCPURecords(const std::string &anno, uint64_t start_ns,
+                     uint64_t end_ns, int64_t device_id, int64_t thread_id) {}
 
   void AddMemRecords(const std::string &name, uint64_t start_ns,
-                     uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
+                     uint64_t end_ns, int64_t device_id, int64_t stream_id,
                      uint32_t correlation_id, uint64_t bytes) {}
 
-  void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
-                        uint32_t stream_id, uint32_t correlation_id) {}
+  void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
+                        int64_t stream_id, uint32_t correlation_id) {}
 
   bool IsEnabled() { return false; }
 
@@ -384,11 +397,28 @@ DeviceTracer *GetDeviceTracer() {
   return tracer;
 }
 
-void SetCurAnnotation(const char *anno) { cur_annotation = anno; }
+void SetCurAnnotation(const std::string &anno) {
+  annotation_stack.push_back(anno);
+}
+
+void ClearCurAnnotation() { annotation_stack.pop_back(); }
+
+std::string CurAnnotation() {
+  if (annotation_stack.empty()) return "";
+  return annotation_stack.back();
+}
+
+void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
+
+void ClearCurBlock() { block_id_stack.pop_back(); }
+
+int BlockDepth() { return block_id_stack.size(); }
+
+void SetCurThread(int thread_id) { cur_thread_id = thread_id; }
 
-void ClearCurAnnotation() { cur_annotation = nullptr; }
+void ClearCurThread() { cur_thread_id = 0; }
 
-const char *CurAnnotation() { return cur_annotation; }
+int CurThread() { return cur_thread_id; }
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index 23f7cdbdffc9f48ac5555455bf745233c81dd0cb..0375c7439c29d4122e8ff6b58734dad4f504b7a2 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
+
+#include <string>
+
 #include "paddle/fluid/platform/dynload/cupti.h"
 #include "paddle/fluid/platform/profiler.pb.h"
 
@@ -32,22 +34,23 @@ class DeviceTracer {
   struct KernelRecord {
     uint64_t start_ns;
     uint64_t end_ns;
-    uint32_t device_id;
-    uint32_t stream_id;
+    int64_t device_id;
+    int64_t stream_id;
     uint32_t correlation_id;
   };
   struct CPURecord {
     std::string name;
     uint64_t start_ns;
     uint64_t end_ns;
-    uint64_t thread_id;
+    int64_t device_id;
+    int64_t thread_id;
   };
   struct MemRecord {
     std::string name;
     uint64_t start_ns;
     uint64_t end_ns;
-    uint32_t device_id;
-    uint32_t stream_id;
+    int64_t device_id;
+    int64_t stream_id;
     uint32_t correlation_id;
     uint64_t bytes;
   };
@@ -64,18 +67,18 @@ class DeviceTracer {
   virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
 
   virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
-                             uint64_t end_ns, uint32_t device_id,
-                             uint32_t stream_id, uint32_t correlation_id,
+                             uint64_t end_ns, int64_t device_id,
+                             int64_t stream_id, uint32_t correlation_id,
                              uint64_t bytes) = 0;
 
-  virtual void AddCPURecords(const char* anno, uint64_t start_ns,
-                             uint64_t end_ns) = 0;
+  virtual void AddCPURecords(const std::string& anno, uint64_t start_ns,
+                             uint64_t end_ns, int64_t device_id,
+                             int64_t thread_id) = 0;
 
   // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
   // added before for human readability.
-  virtual void AddKernelRecords(uint64_t start, uint64_t end,
-                                uint32_t device_id, uint32_t stream_id,
-                                uint32_t correlation_id) = 0;
+  virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id,
+                                int64_t stream_id, uint32_t correlation_id) = 0;
 
   // Generate a proto after done (Disabled).
   virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
@@ -87,10 +90,18 @@ class DeviceTracer {
 DeviceTracer* GetDeviceTracer();
 
 // Set a name for the cuda kernel operation being launched by the thread.
-void SetCurAnnotation(const char* anno);
+void SetCurAnnotation(const std::string& anno);
 // Clear the name after the operation is done.
 void ClearCurAnnotation();
 // Current name of the operation being run in the thread.
-const char* CurAnnotation();
+std::string CurAnnotation();
+
+void SetCurBlock(int block_id);
+void ClearCurBlock();
+int BlockDepth();
+
+void SetCurThread(int thread_id);
+void ClearCurThread();
+int CurThread();
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 84dac2937de02b3374156ebc83e19dac9f9a3e7a..9da787a4073fa002f75154f7c4fba54e9ed8efa6 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,9 +1,23 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 
-list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)
+
+# There is no macOS version of NCCL.
+if (NOT APPLE)
+  list(APPEND CUDA_SRCS nccl.cc)
+endif()
+
+if (TENSORRT_FOUND)
+  list(APPEND CUDA_SRCS tensorrt.cc)
+endif()
+
 configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
+if (WITH_MKLML)
+    cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
+endif()
+# TODO(TJ): add iomp, mkldnn?
diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc
index e90e3105f0809b3c7507a86fa5a3d61864290fcb..361d3439b844e9f68d3fba0a0e41ec457118a4a9 100644
--- a/paddle/fluid/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
@@ -24,6 +24,14 @@ void *cublas_dso_handle = nullptr;
 
 CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
 
+#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2
+CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
+#endif
+
+#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3
+CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index fa9041134d863ebfd8d1e00379da3b92323ae6e3..25bcda7eedc1ef42f75fb8fd1439f0c8f55015c3 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -1,22 +1,25 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+   http://www.apache.org/licenses/LICENSE-2.0
 
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 
 #pragma once
 
+#include <cublasXt.h>
 #include <cublas_v2.h>
+#include <cuda.h>
 #include <dlfcn.h>
-#include <mutex>
+#include <mutex>  // NOLINT
+#include <type_traits>
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -34,18 +37,18 @@ extern void *cublas_dso_handle;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                    \
-  struct DynLoad__##__name {                                        \
-    template <typename... Args>                                     \
-    inline cublasStatus_t operator()(Args... args) {                \
-      typedef cublasStatus_t (*cublasFunc)(Args...);                \
-      std::call_once(cublas_dso_flag,                               \
-                     paddle::platform::dynload::GetCublasDsoHandle, \
-                     &cublas_dso_handle);                           \
-      void *p_##__name = dlsym(cublas_dso_handle, #__name);         \
-      return reinterpret_cast<cublasFunc>(p_##__name)(args...);     \
-    }                                                               \
-  };                                                                \
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
+  struct DynLoad__##__name {                                                 \
+    using FUNC_TYPE = decltype(&::__name);                                   \
+    template <typename... Args>                                              \
+    inline cublasStatus_t operator()(Args... args) {                         \
+      std::call_once(cublas_dso_flag, []() {                                 \
+        cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \
+      });                                                                    \
+      static void *p_##__name = dlsym(cublas_dso_handle, #__name);           \
+      return reinterpret_cast<FUNC_TYPE>(p_##__name)(args...);               \
+    }                                                                        \
+  };                                                                         \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)     \
@@ -70,8 +73,8 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemm_v2);                \
   __macro(cublasHgemm);                   \
   __macro(cublasSgemmEx);                 \
-  __macro(cublasSgeam_v2);                \
-  __macro(cublasDgeam_v2);                \
+  __macro(cublasSgeam);                   \
+  __macro(cublasDgeam);                   \
   __macro(cublasCreate_v2);               \
   __macro(cublasDestroy_v2);              \
   __macro(cublasSetStream_v2);            \
@@ -81,17 +84,32 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemmBatched);            \
   __macro(cublasCgemmBatched);            \
   __macro(cublasZgemmBatched);            \
-  __macro(cublasSgemmStridedBatched);     \
-  __macro(cublasDgemmStridedBatched);     \
-  __macro(cublasCgemmStridedBatched);     \
-  __macro(cublasZgemmStridedBatched);     \
-  __macro(cublasHgemmStridedBatched);     \
   __macro(cublasSgetrfBatched);           \
   __macro(cublasSgetriBatched);           \
   __macro(cublasDgetrfBatched);           \
-  __macro(cublasDgetriBatched)
+  __macro(cublasDgetriBatched);
 
-CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP);
+CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+
+// APIs available after CUDA 8.0
+#if CUDA_VERSION >= 8000
+#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
+  __macro(cublasGemmEx);                     \
+  __macro(cublasSgemmStridedBatched);        \
+  __macro(cublasDgemmStridedBatched);        \
+  __macro(cublasCgemmStridedBatched);        \
+  __macro(cublasZgemmStridedBatched);        \
+  __macro(cublasHgemmStridedBatched);
+
+CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#endif
+
+// APIs available after CUDA 9.0
+#if CUDA_VERSION >= 9000
+#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode);
+
+CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#endif
 
 #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
 }  // namespace dynload
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index c65b060ab46cfcd38292be66dd5f2123f88bae63..f3cd3b2bbedef7c9140c2acddea0732972ff7fa0 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -44,7 +44,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 
 #ifdef PADDLE_USE_DSO
 bool HasCUDNN() {
-  std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle);
+  std::call_once(cudnn_dso_flag,
+                 []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
   return cudnn_dso_handle != nullptr;
 }
 
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 81acc445bd3803dede158ff09507a72fb6e293ac..77e46fa768b62c277d7b4027de7173e39a5672b4 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <cudnn.h>
 #include <dlfcn.h>
-#include <mutex>
+#include <mutex>  // NOLINT
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -30,19 +30,19 @@ extern bool HasCUDNN();
 #ifdef PADDLE_USE_DSO
 
 extern void EnforceCUDNNLoaded(const char* fn_name);
-#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                    \
-  struct DynLoad__##__name {                                       \
-    template <typename... Args>                                    \
-    auto operator()(Args... args) -> decltype(__name(args...)) {   \
-      using cudnn_func = decltype(__name(args...)) (*)(Args...);   \
-      std::call_once(cudnn_dso_flag,                               \
-                     paddle::platform::dynload::GetCUDNNDsoHandle, \
-                     &cudnn_dso_handle);                           \
-      EnforceCUDNNLoaded(#__name);                                 \
-      void* p_##__name = dlsym(cudnn_dso_handle, #__name);         \
-      return reinterpret_cast<cudnn_func>(p_##__name)(args...);    \
-    }                                                              \
-  };                                                               \
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
+  struct DynLoad__##__name {                                               \
+    template <typename... Args>                                            \
+    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+      using cudnn_func = decltype(&::__name);                              \
+      std::call_once(cudnn_dso_flag, []() {                                \
+        cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
+      });                                                                  \
+      EnforceCUDNNLoaded(#__name);                                         \
+      static void* p_##__name = dlsym(cudnn_dso_handle, #__name);          \
+      return reinterpret_cast<cudnn_func>(p_##__name)(args...);            \
+    }                                                                      \
+  };                                                                       \
   extern struct DynLoad__##__name __name
 
 #else
@@ -140,7 +140,8 @@ CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
 #if CUDNN_VERSION >= 7001
 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
-  __macro(cudnnSetConvolutionGroupCount);
+  __macro(cudnnSetConvolutionGroupCount);  \
+  __macro(cudnnSetConvolutionMathType);
 CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index c1bf88f8cb690861b97686d99d36410143445243..e8f4a82ef132be9e4ec3fb76f11766046a2ff638 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -11,14 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #ifdef PADDLE_WITH_CUPTI
+
 #include <cuda.h>
 #include <cupti.h>
 #include <dlfcn.h>
-#include <mutex>
+#include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -36,18 +37,18 @@ extern void *cupti_dso_handle;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                    \
-  struct DynLoad__##__name {                                       \
-    template <typename... Args>                                    \
-    inline CUptiResult CUPTIAPI operator()(Args... args) {         \
-      typedef CUptiResult CUPTIAPI (*cuptiFunc)(Args...);          \
-      std::call_once(cupti_dso_flag,                               \
-                     paddle::platform::dynload::GetCUPTIDsoHandle, \
-                     &cupti_dso_handle);                           \
-      void *p_##__name = dlsym(cupti_dso_handle, #__name);         \
-      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);     \
-    }                                                              \
-  };                                                               \
+#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                            \
+  struct DynLoad__##__name {                                               \
+    template <typename... Args>                                            \
+    inline CUptiResult CUPTIAPI operator()(Args... args) {                 \
+      using cuptiFunc = decltype(&::__name);                               \
+      std::call_once(cupti_dso_flag, []() {                                \
+        cupti_dso_handle = paddle::platform::dynload::GetCUPTIDsoHandle(); \
+      });                                                                  \
+      static void *p_##__name = dlsym(cupti_dso_handle, #__name);          \
+      return reinterpret_cast<cuptiFunc>(p_##__name)(args...);             \
+    }                                                                      \
+  };                                                                       \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)            \
@@ -71,7 +72,6 @@ extern void *cupti_dso_handle;
   __macro(cuptiGetResultString);              \
   __macro(cuptiActivityGetNumDroppedRecords); \
   __macro(cuptiActivityFlushAll);             \
-  __macro(cuptiFinalize);                     \
   __macro(cuptiSubscribe);                    \
   __macro(cuptiUnsubscribe);                  \
   __macro(cuptiEnableCallback);               \
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 1b3ff962d6edceb37deb94cc7daead7346d25352..5b9e0820e0b319fe7a636a57a0029caf038b4db3 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include <curand.h>
 #include <dlfcn.h>
-#include <mutex>
+
+#include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -25,18 +26,18 @@ namespace dynload {
 extern std::once_flag curand_dso_flag;
 extern void *curand_dso_handle;
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                    \
-  struct DynLoad__##__name {                                        \
-    template <typename... Args>                                     \
-    curandStatus_t operator()(Args... args) {                       \
-      typedef curandStatus_t (*curandFunc)(Args...);                \
-      std::call_once(curand_dso_flag,                               \
-                     paddle::platform::dynload::GetCurandDsoHandle, \
-                     &curand_dso_handle);                           \
-      void *p_##__name = dlsym(curand_dso_handle, #__name);         \
-      return reinterpret_cast<curandFunc>(p_##__name)(args...);     \
-    }                                                               \
-  };                                                                \
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                             \
+  struct DynLoad__##__name {                                                 \
+    template <typename... Args>                                              \
+    curandStatus_t operator()(Args... args) {                                \
+      using curandFunc = decltype(&::__name);                                \
+      std::call_once(curand_dso_flag, []() {                                 \
+        curand_dso_handle = paddle::platform::dynload::GetCurandDsoHandle(); \
+      });                                                                    \
+      static void *p_##__name = dlsym(curand_dso_handle, #__name);           \
+      return reinterpret_cast<curandFunc>(p_##__name)(args...);              \
+    }                                                                        \
+  };                                                                         \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index e590e81bab51fd9fe12309335522614263d8e21d..198d8566b1bd726c5b33d8af22a19cb30a280fa2 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -11,12 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+
 #include <dlfcn.h>
+
 #include <memory>
-#include <mutex>
+#include <mutex>  // NOLINT
 #include <string>
+
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 #include "paddle/fluid/platform/dynload/cupti_lib_path.h"
@@ -43,6 +45,12 @@ DEFINE_string(nccl_dir, "",
 
 DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
 
+DEFINE_string(
+    tensorrt_dir, "",
+    "Specify path for loading tensorrt library, such as libnvinfer.so.");
+
+DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
+
 namespace paddle {
 namespace platform {
 namespace dynload {
@@ -65,22 +73,22 @@ static inline std::string join(const std::string& part1,
   return ret;
 }
 
-static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
-                                               void** dso_handle,
-                                               int dynload_flags) {
+static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
+                                                int dynload_flags) {
   VLOG(3) << "Try to find library: " << dso_path
           << " from default system path.";
   // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
-  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+  // and /usr/local/lib path
+  void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
 
 // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
 // bring System Integrity Projection (SIP), if dso_handle
 // is null, search from default package path in Mac OS.
 #if defined(__APPLE__) || defined(__OSX__)
-  if (nullptr == *dso_handle) {
-    dso_path = join("/usr/local/cuda/lib/", dso_path);
-    *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-    if (nullptr == *dso_handle) {
+  if (nullptr == dso_handle) {
+    dso_handle =
+        dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
+    if (nullptr == dso_handle) {
       if (dso_path == "libcudnn.dylib") {
         LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
                         "For instance, sudo tar -xzf "
@@ -91,28 +99,33 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
     }
   }
 #endif
+
+  if (nullptr == dso_handle) {
+    LOG(WARNING) << "Can not find library: " << dso_path
+                 << ". Please try to add the lib path to LD_LIBRARY_PATH.";
+  }
+  return dso_handle;
 }
 
-static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
-                                              const std::string& dso_name,
-                                              void** dso_handle,
-                                              bool throw_on_error = true) {
+static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
+                                               const std::string& dso_name,
+                                               bool throw_on_error = true) {
   int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
-  *dso_handle = nullptr;
+  void* dso_handle = nullptr;
 
   std::string dlPath = dso_name;
   if (search_root.empty()) {
-    GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+    dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
   } else {
     // search xxx.so from custom path
     dlPath = join(search_root, dso_name);
-    *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+    dso_handle = dlopen(dlPath.c_str(), dynload_flags);
     // if not found, search from default path
-    if (nullptr == *dso_handle) {
+    if (nullptr == dso_handle) {
       LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
                    << dlerror() << ")";
       dlPath = dso_name;
-      GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+      dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
     }
   }
   auto error_msg =
@@ -124,70 +137,87 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
       "using the DYLD_LIBRARY_PATH is impossible unless System "
       "Integrity Protection (SIP) is disabled.";
   if (throw_on_error) {
-    PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror());
-  } else if (nullptr == *dso_handle) {
+    PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, dlerror());
+  } else if (nullptr == dso_handle) {
     LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror());
   }
+
+  return dso_handle;
 }
 
-void GetCublasDsoHandle(void** dso_handle) {
+void* GetCublasDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so");
 #endif
 }
 
-void GetCUDNNDsoHandle(void** dso_handle) {
+void* GetCUDNNDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle,
-                             false);
+  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
 #else
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false);
+  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false);
 #endif
 }
 
-void GetCUPTIDsoHandle(void** dso_handle) {
+void* GetCUPTIDsoHandle() {
   std::string cupti_path = cupti_lib_path;
   if (!FLAGS_cupti_dir.empty()) {
     cupti_path = FLAGS_cupti_dir;
   }
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", dso_handle, false);
+  return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
+#else
+  return GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", false);
+#endif
+}
+
+void* GetCurandDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so");
+#endif
+}
+
+void* GetWarpCTCDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
 #else
-  GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", dso_handle, false);
+  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
 #endif
 }
 
-void GetCurandDsoHandle(void** dso_handle) {
+void* GetLapackDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so");
 #endif
 }
 
-void GetWarpCTCDsoHandle(void** dso_handle) {
+void* GetNCCLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
 #endif
 }
 
-void GetLapackDsoHandle(void** dso_handle) {
+void* GetTensorRtDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
 #endif
 }
 
-void GetNCCLDsoHandle(void** dso_handle) {
+void* GetMKLMLDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib");
 #else
-  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle);
+  return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so");
 #endif
 }
 
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index b5b9c4af916241c1c7361b506f74563ebcf69b9a..ca87dc47f355a8a4fc840262044413414edf00a0 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -18,55 +18,15 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-/**
- * @brief    load the DSO of CUBLAS
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCublasDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CUDNN
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCUDNNDsoHandle(void** dso_handle);
-
-void GetCUPTIDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CURAND
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCurandDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of warp-ctc
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetWarpCTCDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of lapack
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetLapackDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of NVIDIA nccl
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetNCCLDsoHandle(void** dso_handle);
+void* GetCublasDsoHandle();
+void* GetCUDNNDsoHandle();
+void* GetCUPTIDsoHandle();
+void* GetCurandDsoHandle();
+void* GetWarpCTCDsoHandle();
+void* GetLapackDsoHandle();
+void* GetNCCLDsoHandle();
+void* GetTensorRtDsoHandle();
+void* GetMKLMLDsoHandle();
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0f61a5e09b3243cbdf570ba7c28a260f181d8848
--- /dev/null
+++ b/paddle/fluid/platform/dynload/mklml.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/dynload/mklml.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag mklml_dso_flag;
+void* mklml_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+MKLML_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
new file mode 100644
index 0000000000000000000000000000000000000000..17acefe8cde01809572e4c86cbdccfed9a477a51
--- /dev/null
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <dlfcn.h>
+#include <mkl.h>
+#include <mutex>  // NOLINT
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag mklml_dso_flag;
+extern void* mklml_dso_handle;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load mklml routine
+ * via operator overloading.
+ */
+#define DYNAMIC_LOAD_MKLML_WRAP(__name)                                    \
+  struct DynLoad__##__name {                                               \
+    template <typename... Args>                                            \
+    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+      using mklmlFunc = decltype(&::__name);                               \
+      std::call_once(mklml_dso_flag, []() {                                \
+        mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
+      });                                                                  \
+      static void* p_##_name = dlsym(mklml_dso_handle, #__name);           \
+      return reinterpret_cast<mklmlFunc>(p_##_name)(args...);              \
+    }                                                                      \
+  };                                                                       \
+  extern DynLoad__##__name __name
+
+#define DECLARE_DYNAMIC_LOAD_MKLML_WRAP(__name) DYNAMIC_LOAD_MKLML_WRAP(__name)
+
+#define MKLML_ROUTINE_EACH(__macro) \
+  __macro(cblas_sgemm);             \
+  __macro(cblas_saxpy);             \
+  __macro(cblas_scopy);             \
+  __macro(cblas_sgemv);             \
+  __macro(cblas_sgemm_batch);       \
+  __macro(cblas_dgemm);             \
+  __macro(cblas_daxpy);             \
+  __macro(cblas_dcopy);             \
+  __macro(cblas_dgemv);             \
+  __macro(cblas_dgemm_batch);       \
+  __macro(vsAdd);                   \
+  __macro(vdAdd);                   \
+  __macro(MKL_Set_Num_Threads)
+
+MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
+
+#undef DYNAMIC_LOAD_MKLML_WRAP
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc
index 3edc70c46d03ddcc751e865676928c47fcb48e69..2c40c48ee08497f9a2a414687b9c51d87ba574aa 100644
--- a/paddle/fluid/platform/dynload/nccl.cc
+++ b/paddle/fluid/platform/dynload/nccl.cc
@@ -25,11 +25,6 @@ void *nccl_dso_handle;
 
 NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
 
-void LoadNCCLDSO() {
-  platform::call_once(nccl_dso_flag,
-                      [] { GetNCCLDsoHandle(&nccl_dso_handle); });
-}
-
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index dc78bcb44d3316a1ecee0c8d70dcb4777a9e2de4..575516f81870fc9f7b92919ffc20a201cb5cbce8 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -11,13 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include <dlfcn.h>
 #include <nccl.h>
-#include <mutex>
-#include "paddle/fluid/platform/call_once.h"
+
+#include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -28,18 +28,19 @@ extern std::once_flag nccl_dso_flag;
 extern void* nccl_dso_handle;
 
 #ifdef PADDLE_USE_DSO
-extern void LoadNCCLDSO();
 
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                   \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      using nccl_func = decltype(__name(args...)) (*)(Args...);  \
-      paddle::platform::dynload::LoadNCCLDSO();                  \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);        \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);   \
-    }                                                            \
-  };                                                             \
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                           \
+  struct DynLoad__##__name {                                             \
+    template <typename... Args>                                          \
+    auto operator()(Args... args) -> decltype(__name(args...)) {         \
+      using nccl_func = decltype(&::__name);                             \
+      std::call_once(nccl_dso_flag, []() {                               \
+        nccl_dso_handle = paddle::platform::dynload::GetNCCLDsoHandle(); \
+      });                                                                \
+      static void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);           \
+    }                                                                    \
+  };                                                                     \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3c8e27944ca9b6419de87d752df3a83751039b1
--- /dev/null
+++ b/paddle/fluid/platform/dynload/tensorrt.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag tensorrt_dso_flag;
+void *tensorrt_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d67658b94af75680a100e13eed7b6b052162e00
--- /dev/null
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <NvInfer.h>
+#include <dlfcn.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag tensorrt_dso_flag;
+extern void* tensorrt_dso_handle;
+
+#ifdef PADDLE_USE_DSO
+
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
+  struct DynLoad__##__name {                                            \
+    template <typename... Args>                                         \
+    auto operator()(Args... args) -> decltype(__name(args...)) {        \
+      using tensorrt_func = decltype(__name(args...)) (*)(Args...);     \
+      std::call_once(tensorrt_dso_flag, []() {                          \
+        tensorrt_dso_handle =                                           \
+            paddle::platform::dynload::GetTensorRtDsoHandle();          \
+        PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
+      });                                                               \
+      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);    \
+      PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
+      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
+    }                                                                   \
+  };                                                                    \
+  extern DynLoad__##__name __name
+
+#else
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
+  struct DynLoad__##__name {                       \
+    template <typename... Args>                    \
+    tensorrtResult_t operator()(Args... args) {    \
+      return __name(args...);                      \
+    }                                              \
+  };                                               \
+  extern DynLoad__##__name __name
+#endif
+
+#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
+  __macro(createInferBuilder_INTERNAL);     \
+  __macro(createInferRuntime_INTERNAL);
+
+TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index f5ded0eb6b1107c886641e848f5040a7a2d806a5..d157c1fda789b98f06ad069d2a9c4f421ff82dcd 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -15,9 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include <dlfcn.h>
-#include <mutex>
-#include "ctc.h"
+#include <mutex>  // NOLINT
+
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "warpctc/include/ctc.h"
 
 namespace paddle {
 namespace platform {
@@ -31,18 +32,18 @@ extern void* warpctc_dso_handle;
  * (for each function) to dynamic load warpctc routine
  * via operator overloading.
  */
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                            \
-  struct DynLoad__##__name {                                         \
-    template <typename... Args>                                      \
-    auto operator()(Args... args) -> decltype(__name(args...)) {     \
-      using warpctcFunc = decltype(__name(args...)) (*)(Args...);    \
-      std::call_once(warpctc_dso_flag,                               \
-                     paddle::platform::dynload::GetWarpCTCDsoHandle, \
-                     &warpctc_dso_handle);                           \
-      void* p_##_name = dlsym(warpctc_dso_handle, #__name);          \
-      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);      \
-    }                                                                \
-  };                                                                 \
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                                      \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      using warpctcFunc = decltype(&::__name);                                 \
+      std::call_once(warpctc_dso_flag, []() {                                  \
+        warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
+      });                                                                      \
+      static void* p_##_name = dlsym(warpctc_dso_handle, #__name);             \
+      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);                \
+    }                                                                          \
+  };                                                                           \
   extern DynLoad__##__name __name
 
 #define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index d303fd6d63f8424c1c88a31eb3fa6f2136e0e430..566485cd3c383640047d97f40b452735e8c8c171 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -16,36 +16,38 @@ limitations under the License. */
 
 #include <dlfcn.h>     // for dladdr
 #include <execinfo.h>  // for backtrace
+
+#ifdef __GNUC__
+#include <cxxabi.h>  // for __cxa_demangle
+#endif               // __GNUC__
+
+#ifdef PADDLE_WITH_CUDA
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <curand.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+#endif  // PADDLE_WITH_CUDA
+
 #include <iomanip>
 #include <memory>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 
+#include "glog/logging.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/to_string.h"
 
-#ifdef __GNUC__
-#include <cxxabi.h>  // for __cxa_demangle
-#endif
-
-#include <glog/logging.h>
-
 #ifdef PADDLE_WITH_CUDA
-
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/curand.h"
+#ifndef __APPLE__
 #include "paddle/fluid/platform/dynload/nccl.h"
-
-#include <cublas_v2.h>
-#include <cudnn.h>
-#include <curand.h>
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-
-#endif
+#endif  // __APPLE__
+#endif  // PADDLE_WITH_CUDA
 
 namespace paddle {
 namespace platform {
@@ -100,6 +102,15 @@ struct EnforceNotMet : public std::exception {
   const char* what() const noexcept { return err_str_.c_str(); }
 };
 
+struct EOFException : public std::exception {
+  std::string err_str_;
+  EOFException(const char* err_msg, const char* f, int l) {
+    err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, f, l);
+  }
+
+  const char* what() const noexcept { return err_str_.c_str(); }
+};
+
 // Because most enforce conditions would evaluate to true, we can use
 // __builtin_expect to instruct the C++ compiler to generate code that
 // always forces branch prediction of true.
@@ -111,7 +122,11 @@ template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     bool stat, const Args&... args) {
   if (UNLIKELY(!(stat))) {
+#ifndef REPLACE_ENFORCE_GLOG
     throw std::runtime_error(string::Sprintf(args...));
+#else
+    LOG(FATAL) << string::Sprintf(args...);
+#endif
   }
 }
 
@@ -121,8 +136,12 @@ template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     cudaError_t e, const Args&... args) {
   if (UNLIKELY(e)) {
+#ifndef REPLACE_ENFORCE_GLOG
     throw thrust::system_error(e, thrust::cuda_category(),
                                string::Sprintf(args...));
+#else
+    LOG(FATAL) << string::Sprintf(args...);
+#endif
   }
 }
 
@@ -130,8 +149,12 @@ template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     curandStatus_t stat, const Args&... args) {
   if (stat != CURAND_STATUS_SUCCESS) {
+#ifndef REPLACE_ENFORCE_GLOG
     throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
                                string::Sprintf(args...));
+#else
+    LOG(FATAL) << string::Sprintf(args...);
+#endif
   }
 }
 
@@ -141,8 +164,12 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   if (stat == CUDNN_STATUS_SUCCESS) {
     return;
   } else {
+#ifndef REPLACE_ENFORCE_GLOG
     throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) +
                              string::Sprintf(args...));
+#else
+    LOG(FATAL) << string::Sprintf(args...);
+#endif
   }
 }
 
@@ -171,21 +198,31 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
     err = "CUBLAS: license error, ";
   }
+#ifndef REPLACE_ENFORCE_GLOG
   throw std::runtime_error(err + string::Sprintf(args...));
+#else
+  LOG(FATAL) << err << string::Sprintf(args...);
+#endif
 }
 
+#ifndef __APPLE__
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
     ncclResult_t stat, const Args&... args) {
   if (stat == ncclSuccess) {
     return;
   } else {
+#ifndef REPLACE_ENFORCE_GLOG
     throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
                              string::Sprintf(args...));
+#else
+    LOG(FATAL) << platform::dynload::ncclGetErrorString(stat)
+               << string::Sprintf(args...);
+#endif
   }
 }
-
-#endif  // PADDLE_ONLY_CPU
+#endif  // __APPLE__
+#endif  // PADDLE_WITH_CUDA
 
 template <typename T>
 inline void throw_on_error(T e) {
@@ -200,6 +237,7 @@ inline void throw_on_error(T e) {
         __FILE__, __LINE__);                                           \
   } while (false)
 
+#ifndef REPLACE_ENFORCE_GLOG
 #define PADDLE_ENFORCE(...)                                             \
   do {                                                                  \
     try {                                                               \
@@ -209,7 +247,15 @@ inline void throw_on_error(T e) {
                                               __FILE__, __LINE__);      \
     }                                                                   \
   } while (false)
+#else
+#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
+#endif
 
+#define PADDLE_THROW_EOF()                                                     \
+  do {                                                                         \
+    throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
+                                           __LINE__);                          \
+  } while (false)
 /*
  * Some enforce helpers here, usage:
  *    int a = 1;
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
index bb9a3543ff267dadf3dfee260a320d292a1ba3cb..0e8684581a93f076b1a077cc52e966d3c88cf078 100644
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -96,7 +96,6 @@ TEST(ENFORCE_GT, FAIL) {
   bool caught_exception = false;
   try {
     PADDLE_ENFORCE_GT(1, 2UL);
-
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
     EXPECT_TRUE(
@@ -115,7 +114,6 @@ TEST(ENFORCE_GE, FAIL) {
   bool caught_exception = false;
   try {
     PADDLE_ENFORCE_GE(1, 2UL);
-
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
     EXPECT_TRUE(
@@ -135,7 +133,6 @@ TEST(ENFORCE_LE, FAIL) {
   bool caught_exception = false;
   try {
     PADDLE_ENFORCE_GT(1, 2UL);
-
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
     EXPECT_TRUE(
@@ -171,7 +168,6 @@ TEST(ENFORCE_NOT_NULL, FAIL) {
   try {
     int* a = nullptr;
     PADDLE_ENFORCE_NOT_NULL(a);
-
   } catch (paddle::platform::EnforceNotMet error) {
     caught_exception = true;
     EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null"));
@@ -214,3 +210,14 @@ TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
   Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
   ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
 }
+
+TEST(EOF_EXCEPTION, THROW_EOF) {
+  bool caught_eof = false;
+  try {
+    PADDLE_THROW_EOF();
+  } catch (paddle::platform::EOFException error) {
+    caught_eof = true;
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "There is no next data."));
+  }
+  EXPECT_TRUE(caught_eof);
+}
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index 52fb8c2531357ad7a2b2f8613e5c7fbcef52c6bb..ffd183af68514dbb1a8b3de39000c9ca3f56ddc3 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
+#include <limits>
 
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
@@ -293,39 +294,39 @@ struct PADDLE_ALIGN(2) float16 {
   HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
 
   HOSTDEVICE inline explicit operator int8_t() const {
-    return static_cast<int8_t>(float(*this));
+    return static_cast<int8_t>(static_cast<float>(*this));
   }
 
   HOSTDEVICE inline explicit operator uint8_t() const {
-    return static_cast<uint8_t>(float(*this));
+    return static_cast<uint8_t>(static_cast<float>(*this));
   }
 
   HOSTDEVICE inline explicit operator int16_t() const {
-    return static_cast<int16_t>(float(*this));
+    return static_cast<int16_t>(static_cast<float>(*this));
   }
 
   HOSTDEVICE inline explicit operator uint16_t() const {
-    return static_cast<uint16_t>(float(*this));
+    return static_cast<uint16_t>(static_cast<float>(*this));
   }
 
   HOSTDEVICE inline explicit operator int32_t() const {
-    return static_cast<int32_t>(float(*this));
+    return static_cast<int32_t>(static_cast<float>(*this));
   }
 
   HOSTDEVICE inline explicit operator uint32_t() const {
-    return static_cast<uint32_t>(float(*this));
+    return static_cast<uint32_t>(static_cast<float>(*this));
   }
 
   HOSTDEVICE inline explicit operator int64_t() const {
-    return static_cast<int64_t>(float(*this));
+    return static_cast<int64_t>(static_cast<float>(*this));
   }
 
   HOSTDEVICE inline explicit operator uint64_t() const {
-    return static_cast<uint64_t>(float(*this));
+    return static_cast<uint64_t>(static_cast<float>(*this));
   }
 
   HOSTDEVICE inline explicit operator double() const {
-    return static_cast<double>(float(*this));
+    return static_cast<double>(static_cast<float>(*this));
   }
 
  private:
@@ -370,7 +371,7 @@ DEVICE inline half operator+(const half& a, const half& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hadd(a, b);
 #else
-  float res = float(float16(a)) + float(float16(b));
+  float res = static_cast<float>(float16(a)) + static_cast<float>(float16(b));
   return half(float16(res));
 #endif
 }
@@ -379,7 +380,7 @@ DEVICE inline half operator-(const half& a, const half& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hsub(a, b);
 #else
-  float res = float(float16(a)) - float(float16(b));
+  float res = static_cast<float>(float16(a)) - static_cast<float>(float16(b));
   return half(float16(res));
 #endif
 }
@@ -388,7 +389,7 @@ DEVICE inline half operator*(const half& a, const half& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hmul(a, b);
 #else
-  float res = float(float16(a)) * float(float16(b));
+  float res = static_cast<float>(float16(a)) * static_cast<float>(float16(b));
   return half(float16(res));
 #endif
 }
@@ -399,7 +400,7 @@ DEVICE inline half operator/(const half& a, const half& b) {
   float denom = __half2float(b);
   return __float2half(num / denom);
 #else
-  float res = float(float16(a)) / float(float16(b));
+  float res = static_cast<float>(float16(a)) / static_cast<float>(float16(b));
   return half(float16(res));
 #endif
 }
@@ -408,27 +409,27 @@ DEVICE inline half operator-(const half& a) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hneg(a);
 #else
-  float res = -float(float16(a));
+  float res = -static_cast<float>(float16(a));
   return half(float16(res));
 #endif
 }
 
-DEVICE inline half& operator+=(half& a, const half& b) {
+DEVICE inline half& operator+=(half& a, const half& b) {  // NOLINT
   a = a + b;
   return a;
 }
 
-DEVICE inline half& operator-=(half& a, const half& b) {
+DEVICE inline half& operator-=(half& a, const half& b) {  // NOLINT
   a = a - b;
   return a;
 }
 
-DEVICE inline half& operator*=(half& a, const half& b) {
+DEVICE inline half& operator*=(half& a, const half& b) {  // NOLINT
   a = a * b;
   return a;
 }
 
-DEVICE inline half& operator/=(half& a, const half& b) {
+DEVICE inline half& operator/=(half& a, const half& b) {  // NOLINT
   a = a / b;
   return a;
 }
@@ -437,7 +438,7 @@ DEVICE inline bool operator==(const half& a, const half& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __heq(a, b);
 #else
-  return float(float16(a)) == float(float16(b));
+  return static_cast<float>(float16(a)) == static_cast<float>(float16(b));
 #endif
 }
 
@@ -445,7 +446,7 @@ DEVICE inline bool operator!=(const half& a, const half& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hne(a, b);
 #else
-  return float(float16(a)) != float(float16(b));
+  return static_cast<float>(float16(a)) != static_cast<float>(float16(b));
 #endif
 }
 
@@ -453,7 +454,7 @@ DEVICE inline bool operator<(const half& a, const half& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hlt(a, b);
 #else
-  return float(float16(a)) < float(float16(b));
+  return static_cast<float>(float16(a)) < static_cast<float>(float16(b));
 #endif
 }
 
@@ -461,7 +462,7 @@ DEVICE inline bool operator<=(const half& a, const half& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hle(a, b);
 #else
-  return float(float16(a)) <= float(float16(b));
+  return static_cast<float>(float16(a)) <= static_cast<float>(float16(b));
 #endif
 }
 
@@ -469,7 +470,7 @@ DEVICE inline bool operator>(const half& a, const half& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hgt(a, b);
 #else
-  return float(float16(a)) > float(float16(b));
+  return static_cast<float>(float16(a)) > static_cast<float>(float16(b));
 #endif
 }
 
@@ -477,15 +478,130 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hge(a, b);
 #else
-  return float(float16(a)) >= float(float16(b));
+  return static_cast<float>(float16(a)) >= static_cast<float>(float16(b));
 #endif
 }
 
 #endif  // PADDLE_CUDA_FP16
 
-// Arithmetic operators on ARMv8.2-A CPU
-#if defined(PADDLE_WITH_NATIVE_FP16)
-HOST inline float16 operator+(const float16& a, const float16& b) {
+// Arithmetic operators for float16 on GPU
+#if defined(PADDLE_CUDA_FP16)
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hadd(half(a), half(b)));
+#else
+  return float16(static_cast<float>(a) + static_cast<float>(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hsub(half(a), half(b)));
+#else
+  return float16(static_cast<float>(a) - static_cast<float>(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hmul(half(a), half(b)));
+#else
+  return float16(static_cast<float>(a) * static_cast<float>(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  // TODO(kexinzhao): check which cuda version starts to support __hdiv
+  float num = __half2float(half(a));
+  float denom = __half2float(half(b));
+  return float16(num / denom);
+#else
+  return float16(static_cast<float>(a) / static_cast<float>(b));
+#endif
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hneg(half(a)));
+#else
+  float16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+#endif
+}
+
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {  // NOLINT
+  a = a + b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {  // NOLINT
+  a = a - b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {  // NOLINT
+  a = a * b;
+  return a;
+}
+
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
+  a = a / b;
+  return a;
+}
+
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __heq(half(a), half(b));
+#else
+  return static_cast<float>(a) == static_cast<float>(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hne(half(a), half(b));
+#else
+  return static_cast<float>(a) != static_cast<float>(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(half(a), half(b));
+#else
+  return static_cast<float>(a) < static_cast<float>(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hle(half(a), half(b));
+#else
+  return static_cast<float>(a) <= static_cast<float>(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hgt(half(a), half(b));
+#else
+  return static_cast<float>(a) > static_cast<float>(b);
+#endif
+}
+
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hge(half(a), half(b));
+#else
+  return static_cast<float>(a) >= static_cast<float>(b);
+#endif
+}
+
+// Arithmetic operators for float16 on ARMv8.2-A CPU
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+inline float16 operator+(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -501,7 +617,7 @@ HOST inline float16 operator+(const float16& a, const float16& b) {
   return res;
 }
 
-HOST inline float16 operator-(const float16& a, const float16& b) {
+inline float16 operator-(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -517,7 +633,7 @@ HOST inline float16 operator-(const float16& a, const float16& b) {
   return res;
 }
 
-HOST inline float16 operator*(const float16& a, const float16& b) {
+inline float16 operator*(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -533,7 +649,7 @@ HOST inline float16 operator*(const float16& a, const float16& b) {
   return res;
 }
 
-HOST inline float16 operator/(const float16& a, const float16& b) {
+inline float16 operator/(const float16& a, const float16& b) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -549,7 +665,7 @@ HOST inline float16 operator/(const float16& a, const float16& b) {
   return res;
 }
 
-HOST inline float16 operator-(const float16& a) {
+inline float16 operator-(const float16& a) {
   float16 res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -564,27 +680,27 @@ HOST inline float16 operator-(const float16& a) {
   return res;
 }
 
-HOST inline float16& operator+=(float16& a, const float16& b) {
+inline float16& operator+=(float16& a, const float16& b) {  // NOLINT
   a = a + b;
   return a;
 }
 
-HOST inline float16& operator-=(float16& a, const float16& b) {
+inline float16& operator-=(float16& a, const float16& b) {  // NOLINT
   a = a - b;
   return a;
 }
 
-HOST inline float16& operator*=(float16& a, const float16& b) {
+inline float16& operator*=(float16& a, const float16& b) {  // NOLINT
   a = a * b;
   return a;
 }
 
-HOST inline float16& operator/=(float16& a, const float16& b) {
+inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
   a = a / b;
   return a;
 }
 
-HOST inline bool operator==(const float16& a, const float16& b) {
+inline bool operator==(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -600,11 +716,9 @@ HOST inline bool operator==(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-HOST inline bool operator!=(const float16& a, const float16& b) {
-  return !(a == b);
-}
+inline bool operator!=(const float16& a, const float16& b) { return !(a == b); }
 
-HOST inline bool operator<(const float16& a, const float16& b) {
+inline bool operator<(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v1.h}[0], [%[a_ptr]]\n"
@@ -620,7 +734,7 @@ HOST inline bool operator<(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-HOST inline bool operator<=(const float16& a, const float16& b) {
+inline bool operator<=(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v1.h}[0], [%[a_ptr]]\n"
@@ -636,7 +750,7 @@ HOST inline bool operator<=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-HOST inline bool operator>(const float16& a, const float16& b) {
+inline bool operator>(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -652,7 +766,7 @@ HOST inline bool operator>(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-HOST inline bool operator>=(const float16& a, const float16& b) {
+inline bool operator>=(const float16& a, const float16& b) {
   uint16_t res;
   asm volatile(
       "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -668,75 +782,81 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
   return (res & 0xffff) != 0;
 }
 
-// Arithmetic operators, software emulated on other CPU
+// Arithmetic operators for float16, software emulated on other CPU
 #else
-HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
-  return float16(float(a) + float(b));
+inline float16 operator+(const float16& a, const float16& b) {
+  return float16(static_cast<float>(a) + static_cast<float>(b));
 }
 
-HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
-  return float16(float(a) - float(b));
+inline float16 operator-(const float16& a, const float16& b) {
+  return float16(static_cast<float>(a) - static_cast<float>(b));
 }
 
-HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
-  return float16(float(a) * float(b));
+inline float16 operator*(const float16& a, const float16& b) {
+  return float16(static_cast<float>(a) * static_cast<float>(b));
 }
 
-HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
-  return float16(float(a) / float(b));
+inline float16 operator/(const float16& a, const float16& b) {
+  return float16(static_cast<float>(a) / static_cast<float>(b));
 }
 
-HOSTDEVICE inline float16 operator-(const float16& a) {
+inline float16 operator-(const float16& a) {
   float16 res;
   res.x = a.x ^ 0x8000;
   return res;
 }
 
-HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
-  a = float16(float(a) + float(b));
+inline float16& operator+=(float16& a, const float16& b) {  // NOLINT
+  a = float16(static_cast<float>(a) + static_cast<float>(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
-  a = float16(float(a) - float(b));
+inline float16& operator-=(float16& a, const float16& b) {  // NOLINT
+  a = float16(static_cast<float>(a) - static_cast<float>(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
-  a = float16(float(a) * float(b));
+inline float16& operator*=(float16& a, const float16& b) {  // NOLINT
+  a = float16(static_cast<float>(a) * static_cast<float>(b));
   return a;
 }
 
-HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
-  a = float16(float(a) / float(b));
+inline float16& operator/=(float16& a, const float16& b) {  // NOLINT
+  a = float16(static_cast<float>(a) / static_cast<float>(b));
   return a;
 }
 
-HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
-  return float(a) == float(b);
+inline bool operator==(const float16& a, const float16& b) {
+  return static_cast<float>(a) == static_cast<float>(b);
 }
 
-HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
-  return float(a) != float(b);
+inline bool operator!=(const float16& a, const float16& b) {
+  return static_cast<float>(a) != static_cast<float>(b);
 }
 
-HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
-  return float(a) < float(b);
+inline bool operator<(const float16& a, const float16& b) {
+  return static_cast<float>(a) < static_cast<float>(b);
 }
 
-HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
-  return float(a) <= float(b);
+inline bool operator<=(const float16& a, const float16& b) {
+  return static_cast<float>(a) <= static_cast<float>(b);
 }
 
-HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
-  return float(a) > float(b);
+inline bool operator>(const float16& a, const float16& b) {
+  return static_cast<float>(a) > static_cast<float>(b);
 }
 
-HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
-  return float(a) >= float(b);
+inline bool operator>=(const float16& a, const float16& b) {
+  return static_cast<float>(a) >= static_cast<float>(b);
 }
 #endif
 
+HOSTDEVICE inline float16 raw_uint16_to_float16(uint16_t a) {
+  float16 res;
+  res.x = a;
+  return res;
+}
+
 HOSTDEVICE inline bool(isnan)(const float16& a) {
 #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
   return __hisnan(half(a));
@@ -753,6 +873,11 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) {
   return !((isnan)(a)) && !((isinf)(a));
 }
 
+inline std::ostream& operator<<(std::ostream& os, const float16& a) {
+  os << static_cast<float>(a);
+  return os;
+}
+
 }  // namespace platform
 }  // namespace paddle
 
@@ -773,28 +898,156 @@ struct is_pod<paddle::platform::float16> {
       is_standard_layout<paddle::platform::float16>::value;
 };
 
+template <>
+struct numeric_limits<paddle::platform::float16> {
+  static const bool is_specialized = true;
+  static const bool is_signed = true;
+  static const bool is_integer = false;
+  static const bool is_exact = false;
+  static const bool has_infinity = true;
+  static const bool has_quiet_NaN = true;
+  static const bool has_signaling_NaN = true;
+  static const float_denorm_style has_denorm = denorm_present;
+  static const bool has_denorm_loss = false;
+  static const std::float_round_style round_style = std::round_to_nearest;
+  static const bool is_iec559 = false;
+  static const bool is_bounded = false;
+  static const bool is_modulo = false;
+  static const int digits = 11;
+  static const int digits10 = 3;
+  static const int max_digits10 = 5;
+  static const int radix = 2;
+  static const int min_exponent = -13;
+  static const int min_exponent10 = -4;
+  static const int max_exponent = 16;
+  static const int max_exponent10 = 4;
+  static const bool traps = true;
+  static const bool tinyness_before = false;
+
+  static paddle::platform::float16(min)() {
+    return paddle::platform::raw_uint16_to_float16(0x400);
+  }
+  static paddle::platform::float16 lowest() {
+    return paddle::platform::raw_uint16_to_float16(0xfbff);
+  }
+  static paddle::platform::float16(max)() {
+    return paddle::platform::raw_uint16_to_float16(0x7bff);
+  }
+  static paddle::platform::float16 epsilon() {
+    return paddle::platform::raw_uint16_to_float16(0x0800);
+  }
+  static paddle::platform::float16 round_error() {
+    return paddle::platform::float16(0.5);
+  }
+  static paddle::platform::float16 infinity() {
+    return paddle::platform::raw_uint16_to_float16(0x7c00);
+  }
+  static paddle::platform::float16 quiet_NaN() {
+    return paddle::platform::raw_uint16_to_float16(0x7e00);
+  }
+  static paddle::platform::float16 signaling_NaN() {
+    return paddle::platform::raw_uint16_to_float16(0x7e00);
+  }
+  static paddle::platform::float16 denorm_min() {
+    return paddle::platform::raw_uint16_to_float16(0x1);
+  }
+};
+
 }  // namespace std
 
 namespace Eigen {
+
+using float16 = paddle::platform::float16;
+
+template <>
+struct NumTraits<float16> : GenericNumTraits<float16> {
+  enum {
+    IsSigned = true,
+    IsInteger = false,
+    IsComplex = false,
+    RequireInitialization = false
+  };
+
+  HOSTDEVICE static inline float16 epsilon() {
+    return paddle::platform::raw_uint16_to_float16(0x0800);
+  }
+  HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); }
+  HOSTDEVICE static inline float16 highest() {
+    return paddle::platform::raw_uint16_to_float16(0x7bff);
+  }
+  HOSTDEVICE static inline float16 lowest() {
+    return paddle::platform::raw_uint16_to_float16(0xfbff);
+  }
+  HOSTDEVICE static inline float16 infinity() {
+    return paddle::platform::raw_uint16_to_float16(0x7c00);
+  }
+  HOSTDEVICE static inline float16 quiet_NaN() {
+    return paddle::platform::raw_uint16_to_float16(0x7c01);
+  }
+};
+
 namespace numext {
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
-    const paddle::platform::float16& a) {
+HOSTDEVICE inline bool(isnan)(const float16& a) {
   return (paddle::platform::isnan)(a);
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
-    const paddle::platform::float16& a) {
+HOSTDEVICE inline bool(isinf)(const float16& a) {
   return (paddle::platform::isinf)(a);
 }
 
 template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
-    const paddle::platform::float16& a) {
+HOSTDEVICE inline bool(isfinite)(const float16& a) {
   return (paddle::platform::isfinite)(a);
 }
 
+template <>
+HOSTDEVICE inline float16 exp(const float16& a) {
+  return float16(::expf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 log(const float16& a) {
+  return float16(::logf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 tanh(const float16& a) {
+  return float16(::tanhf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 sqrt(const float16& a) {
+  return float16(::sqrtf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 ceil(const float16& a) {
+  return float16(::ceilf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 floor(const float16& a) {
+  return float16(::floorf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 round(const float16& a) {
+  return float16(::roundf(static_cast<float>(a)));
+}
+
+template <>
+HOSTDEVICE inline float16 pow(const float16& a, const float16& b) {
+  return float16(::powf(static_cast<float>(a), static_cast<float>(b)));
+}
+
+template <>
+HOSTDEVICE inline float16 abs(const float16& a) {
+  return float16(::fabs(static_cast<float>(a)));
+}
+
 }  // namespace numext
+
 }  // namespace Eigen
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index b716ad9df41330bd6e22937381d24e33fa3a7914..a589e32b61a9b6a44bdc4529eee715d987d6922c 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -8,13 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/platform/float16.h"
+
+#include <vector>
+
+#include "gtest/gtest.h"
 #include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 
-#include <gtest/gtest.h>
-
 namespace paddle {
 namespace platform {
 
@@ -74,24 +75,27 @@ TEST(float16, conversion_cpu) {
 
   // Conversion operator
   EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
-  EXPECT_EQ(float(float16(0.5f)), 0.5f);
-  EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001);
-  EXPECT_EQ(int(float16(-1)), -1);
-  EXPECT_EQ(bool(float16(true)), true);
+  EXPECT_EQ(static_cast<float>(float16(0.5f)), 0.5f);
+  EXPECT_NEAR(static_cast<double>(float16(0.33333)), 0.33333, 0.0001);
+  EXPECT_EQ(static_cast<int>(float16(-1)), -1);
+  EXPECT_EQ(static_cast<bool>(float16(true)), true);
 }
 
 TEST(float16, arithmetic_cpu) {
-  EXPECT_EQ(float(float16(1) + float16(1)), 2);
-  EXPECT_EQ(float(float16(5) + float16(-5)), 0);
-  EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001);
-  EXPECT_EQ(float(float16(3) - float16(5)), -2);
-  EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001);
-  EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
-  EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
-  EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001);
-  EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f);
-  EXPECT_EQ(float(-float16(512.0f)), -512.0f);
-  EXPECT_EQ(float(-float16(-512.0f)), 512.0f);
+  EXPECT_EQ(static_cast<float>(float16(1) + float16(1)), 2);
+  EXPECT_EQ(static_cast<float>(float16(5) + float16(-5)), 0);
+  EXPECT_NEAR(static_cast<float>(float16(0.33333f) + float16(0.66667f)), 1.0f,
+              0.001);
+  EXPECT_EQ(static_cast<float>(float16(3) - float16(5)), -2);
+  EXPECT_NEAR(static_cast<float>(float16(0.66667f) - float16(0.33333f)),
+              0.33334f, 0.001);
+  EXPECT_NEAR(static_cast<float>(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
+  EXPECT_NEAR(static_cast<float>(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
+  EXPECT_NEAR(static_cast<float>(float16(2.0f) / float16(3.0f)), 0.66667f,
+              0.001);
+  EXPECT_EQ(static_cast<float>(float16(1.0f) / float16(2.0f)), 0.5f);
+  EXPECT_EQ(static_cast<float>(-float16(512.0f)), -512.0f);
+  EXPECT_EQ(static_cast<float>(-float16(-512.0f)), 512.0f);
 }
 
 TEST(float16, comparison_cpu) {
@@ -137,5 +141,10 @@ TEST(float16, lod_tensor_cpu) {
   }
 }
 
+TEST(float16, print) {
+  float16 a = float16(1.0f);
+  std::cout << a << std::endl;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 567209df4edc483bcb5c6264c62034ddff50c413..1b9cf9b5d3fa2121b588c31d7cf2f4c50cb951bc 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/utils/Logging.h"
+#include "paddle/legacy/utils/Logging.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign)                                 \
   __global__ void op_type(const half* in1, const half* in2, half* out) { \
@@ -36,19 +36,19 @@ limitations under the License. */
     half *in1, *in2, *out;                                    \
     half *d_in1, *d_in2, *d_out;                              \
     int size = sizeof(half);                                  \
-    cudaMalloc((void**)&d_in1, size);                         \
-    cudaMalloc((void**)&d_in2, size);                         \
-    cudaMalloc((void**)&d_out, size);                         \
-    in1 = (half*)malloc(size);                                \
-    in2 = (half*)malloc(size);                                \
-    out = (half*)malloc(size);                                \
+    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);       \
+    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);       \
+    cudaMalloc(reinterpret_cast<void**>(&d_out), size);       \
+    in1 = reinterpret_cast<half*>(malloc(size));              \
+    in2 = reinterpret_cast<half*>(malloc(size));              \
+    out = reinterpret_cast<half*>(malloc(size));              \
     in1[0] = half(float16(v_in1));                            \
     in2[0] = half(float16(v_in2));                            \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
     op_type<<<1, 1>>>(d_in1, d_in2, d_out);                   \
     cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);     \
-    EXPECT_EQ(float(float16(out[0])), v_out);                 \
+    EXPECT_EQ(static_cast<float>(float16(out[0])), v_out);    \
     free(in1);                                                \
     free(in2);                                                \
     free(out);                                                \
@@ -63,17 +63,17 @@ limitations under the License. */
     half *in1, *in2;                                          \
     half *d_in1, *d_in2;                                      \
     int size = sizeof(half);                                  \
-    cudaMalloc((void**)&d_in1, size);                         \
-    cudaMalloc((void**)&d_in2, size);                         \
-    in1 = (half*)malloc(size);                                \
-    in2 = (half*)malloc(size);                                \
+    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);       \
+    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);       \
+    in1 = reinterpret_cast<half*>(malloc(size));              \
+    in2 = reinterpret_cast<half*>(malloc(size));              \
     in1[0] = half(float16(v_in1));                            \
     in2[0] = half(float16(v_in2));                            \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
     cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
     op_type<<<1, 1>>>(d_in1, d_in2);                          \
     cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost);     \
-    EXPECT_EQ(float(float16(in1[0])), v_out);                 \
+    EXPECT_EQ(static_cast<float>(float16(in1[0])), v_out);    \
     free(in1);                                                \
     free(in2);                                                \
     cudaFree(d_in1);                                          \
@@ -87,12 +87,12 @@ limitations under the License. */
     half *d_in1, *d_in2;                                     \
     bool *out, *d_out;                                       \
     int size = sizeof(half);                                 \
-    cudaMalloc((void**)&d_in1, size);                        \
-    cudaMalloc((void**)&d_in2, size);                        \
-    cudaMalloc((void**)&d_out, 1);                           \
-    in1 = (half*)malloc(size);                               \
-    in2 = (half*)malloc(size);                               \
-    out = (bool*)malloc(1);                                  \
+    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);      \
+    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);      \
+    cudaMalloc(reinterpret_cast<void**>(&d_out), 1);         \
+    in1 = reinterpret_cast<half*>(malloc(size));             \
+    in2 = reinterpret_cast<half*>(malloc(size));             \
+    out = reinterpret_cast<bool*>(malloc(1));                \
     in1[0] = half(float16(v_in1));                           \
     in2[0] = half(float16(v_in2));                           \
     cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);    \
@@ -130,13 +130,13 @@ void TestNeg(float v_in, float v_out) {
   LOG(INFO) << "Test Neg on GPU!";
   half *in, *d_in;
   int size = sizeof(half);
-  cudaMalloc((void**)&d_in, size);
-  in = (half*)malloc(size);
+  cudaMalloc(reinterpret_cast<void**>(&d_in), size);
+  in = reinterpret_cast<half*>(malloc(size));
   in[0] = half(float16(v_in));
   cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
   Neg<<<1, 1>>>(d_in);
   cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
-  EXPECT_EQ(float(float16(in[0])), v_out);
+  EXPECT_EQ(static_cast<float>(float16(in[0])), v_out);
   free(in);
   cudaFree(d_in);
 }
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index dd70ff9ff574b32bc96a9e8255b1bf77a5cc84e4..4cee93f3a4224cb97327254cd1679021d197a1b1 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -14,8 +14,9 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/gpu_info.h"
 
-#include "gflags/gflags.h"
+#include <algorithm>
 
+#include "gflags/gflags.h"
 #include "paddle/fluid/platform/enforce.h"
 
 DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
@@ -77,8 +78,8 @@ void SetDeviceId(int id) {
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }
 
-void GpuMemoryUsage(size_t &available, size_t &total) {
-  PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
+void GpuMemoryUsage(size_t *available, size_t *total) {
+  PADDLE_ENFORCE(cudaMemGetInfo(available, total),
                  "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
 
@@ -86,7 +87,7 @@ size_t GpuMaxAllocSize() {
   size_t total = 0;
   size_t available = 0;
 
-  GpuMemoryUsage(available, total);
+  GpuMemoryUsage(&available, &total);
 
   // Reserve the rest for page tables, etc.
   return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
@@ -101,7 +102,7 @@ size_t GpuMaxChunkSize() {
   size_t total = 0;
   size_t available = 0;
 
-  GpuMemoryUsage(available, total);
+  GpuMemoryUsage(&available, &total);
   VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
            << total / 1024 / 1024 << "M";
   size_t reserving = static_cast<size_t>(0.05 * total);
@@ -126,11 +127,24 @@ void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                  "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
 }
 
-void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
-                   size_t count, cudaStream_t stream) {
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind) {
+  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
+                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
+}
+
+void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
+                        int src_device, size_t count, cudaStream_t stream) {
   PADDLE_ENFORCE(
       cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
-      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
+      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeerAsync");
+}
+
+void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
+                       int src_device, size_t count) {
+  PADDLE_ENFORCE(
+      cudaMemcpyPeer(dst, dst_device, src, src_device, count),
+      "cudaMemcpyPeer failed in paddle::platform::GpuMemcpyPeerSync");
 }
 
 void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index fa469fa77f5ca780da153cc87da8d04f239711f3..f4640d3eaa2165c35e8e14690d83e9e7e7168c0b 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -23,10 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-//! Environment variable: fraction of GPU memory to use on each device.
-const std::string kEnvFractionGpuMemoryToUse =
-    "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
-
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();
 
@@ -46,7 +42,7 @@ int GetCurrentDeviceId();
 void SetDeviceId(int device_id);
 
 //! Get the memory usage of current GPU device.
-void GpuMemoryUsage(size_t &available, size_t &total);
+void GpuMemoryUsage(size_t *available, size_t *total);
 
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
@@ -61,9 +57,17 @@ size_t GpuMaxChunkSize();
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                     enum cudaMemcpyKind kind, cudaStream_t stream);
 
-//! Copy memory from one device to another device.
-void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
-                   size_t count, cudaStream_t stream);
+//! Copy memory from address src to dst synchronously.
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind);
+
+//! Copy memory from one device to another device asynchronously.
+void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
+                        int src_device, size_t count, cudaStream_t stream);
+
+//! Copy memory from one device to another device synchronously.
+void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
+                       int src_device, size_t count);
 
 //! Set memory dst with value count size asynchronously
 void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 90b78142b845e7e12c0c7dfb391f6aa3bd848436..33fec2c1073819d88d85a8872227adcb9df3e8f4 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -11,12 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
-#include <mkldnn.hpp>
-
+#include <mkldnn.h>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/place.h"
 
 namespace paddle {
 namespace platform {
@@ -34,6 +35,37 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
 typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
 typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
 
+template <typename Type>
+void* to_void_cast(const Type* t) {
+  return static_cast<void*>(const_cast<Type*>(t));
+}
+
+template <typename Type>
+void* to_void_reinterpret_cast(const Type* t) {
+  return reinterpret_cast<void*>(const_cast<Type*>(t));
+}
+
+template <class Type>
+using tf_desc = typename Type::desc;
+
+template <class Type>
+using tf_pd = typename Type::primitive_desc;
+
+template <typename Type, typename Engine, typename... Args>
+std::shared_ptr<tf_pd<Type>> MKLDNNFwdPrimitiveDesc(const Engine& e,
+                                                    Args&&... args) {
+  auto desc = tf_desc<Type>(mkldnn::prop_kind::forward, (args)...);
+  auto pd = new tf_pd<Type>(desc, e);
+  return std::shared_ptr<tf_pd<Type>>(pd);
+}
+
+template <typename Type, typename Engine, typename Primitive, typename... Args>
+tf_pd<Type> MKLDNNBwdPrimitiveDesc(const Engine& e, const Primitive& p,
+                                   Args&&... args) {
+  auto desc = tf_desc<Type>(args...);
+  return tf_pd<Type>(desc, e, p);
+}
+
 inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
                                           mkldnn::memory::data_type data_type,
                                           mkldnn::memory::format format) {
@@ -46,5 +78,177 @@ inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
   return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
 }
 
+template <typename Type>
+mkldnn::memory::data_type MKLDNNGetDataType() {
+  return mkldnn::memory::data_undef;
+}
+
+template <>
+inline mkldnn::memory::data_type MKLDNNGetDataType<float>() {
+  return mkldnn::memory::f32;
+}
+
+inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) {
+  auto reorder_prim = mkldnn::reorder(src, dst);
+  std::vector<mkldnn::primitive> pipeline;
+  pipeline.push_back(reorder_prim);
+  mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
+}
+
+inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) {
+  return static_cast<mkldnn::memory::format>(
+      memory.get_primitive_desc().desc().data.format);
+}
+
+inline mkldnn::memory::format GetMKLDNNFormat(
+    const mkldnn::sum::primitive_desc& memory) {
+  return static_cast<mkldnn::memory::format>(
+      memory.dst_primitive_desc().desc().data.format);
+}
+
+class MKLDNNHandler {
+ public:
+  MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
+                const std::string& base_key)
+      : dev_ctx_(dev_ctx),
+        engine_(engine),
+        key_(base_key),
+        is_reusing_(false) {}
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_weights_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
+      const mkldnn::memory::desc& md, void* ptr) {
+    return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::primitive_desc mdp, void* ptr,
+      const std::string& suffix) {
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happenned. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemory(const mkldnn::memory::desc& md,
+                                                void* ptr,
+                                                const std::string& suffix) {
+    /*Generate key*/
+    auto local_key = key_ + suffix;
+    auto mem_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (mem_p == nullptr) {
+      mem_p = std::make_shared<mkldnn::memory>(
+          mkldnn::memory::primitive_desc{md, engine_}, ptr);
+      dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+      // Mark that reusing happenned. All primitives from operator instance
+      // should be reused or none of them. So we check consistency
+      is_reusing_ = true;
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemory(
+      mkldnn::memory::primitive_desc& mpd,       // NOLINT
+      mkldnn::memory::primitive_desc& user_mpd,  // NOLINT
+      const std::shared_ptr<mkldnn::memory> user_memory_p,
+      const std::string& suffix,
+      std::vector<mkldnn::primitive>& pipeline) {  // NOLINT
+    // create reorder primitive if the input format is not the preferred one
+    auto local_key = key_ + suffix;
+    auto key_reorder_p = key_ + suffix + "reorder_p";
+
+    auto target_memory_p =
+        std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
+    PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false),
+                   "Fail to find mem primitive in device context");
+    if (target_memory_p == nullptr) {
+      target_memory_p = user_memory_p;
+      std::shared_ptr<mkldnn::primitive> reorder_p;
+      if (mpd != user_mpd) {
+        target_memory_p = std::make_shared<mkldnn::memory>(mpd);
+
+        auto reorder_p =
+            std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+        dev_ctx_.SetBlob(key_reorder_p, reorder_p);
+        pipeline.push_back(*reorder_p);
+      }
+      dev_ctx_.SetBlob(local_key, target_memory_p);
+    } else {
+      // Make reorder if needed
+      auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+          dev_ctx_.GetBlob(key_reorder_p));
+      if (reorder_p != nullptr) {
+        pipeline.push_back(*reorder_p);
+      }
+      is_reusing_ = true;
+    }
+    return target_memory_p;
+  }
+
+  static std::string GetHash(mkldnn::memory::dims& operand_dims,  // NOLINT
+                             const std::string& suffix) {
+    auto dims2str = [](const mkldnn::memory::dims& operand_dims) {
+      std::string dstr = "";
+      for (size_t i = 0; i < operand_dims.size(); ++i) {
+        dstr += std::to_string(operand_dims[i]) + "-";
+      }
+      return dstr;
+    };
+
+    return dims2str(operand_dims) + suffix;
+  }
+
+ protected:
+  const MKLDNNDeviceContext& dev_ctx_;
+  mkldnn::engine engine_;
+  std::string key_;
+  bool is_reusing_;
+};
+
+inline mkldnn::memory::format MKLDNNFormatForSize(
+    size_t dims_size, mkldnn::memory::format data_format) {
+  if (dims_size == 1) {
+    return mkldnn::memory::format::x;
+  } else if (dims_size == 2) {
+    return mkldnn::memory::format::nc;
+  }
+  return data_format;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc46c88fd1f9a5d1bacad26beed6fd0af6405310
--- /dev/null
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -0,0 +1,152 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdio.h>
+#include <string>
+#include <thread>  // NOLINT
+#include <typeindex>
+#include <vector>
+#include "paddle/fluid/platform/dynload/nccl.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#define NCCL_ID_VARNAME "NCCLID"
+
+namespace paddle {
+namespace platform {
+
+inline ncclDataType_t ToNCCLDataType(std::type_index type) {
+  if (type == typeid(float)) {  // NOLINT
+    return ncclFloat;
+  } else if (type == typeid(double)) {  // NOLINT
+    return ncclDouble;
+  } else if (type == typeid(int)) {  // NOLINT
+    return ncclInt;
+  } else if (type == typeid(int64_t)) {  // NOLINT
+    return ncclInt64;
+  } else {
+    PADDLE_THROW("Not supported");
+  }
+}
+
+// NOTE(minqiyang): according to the ncclGroupEnd documentations:
+// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
+// ncclGroupEnd will wait for all communicators to be initialized, which will
+// cause blocking problem when a runtime_error was thrown, so try only guard
+// NCCL actions when use it.
+class NCCLGroupGuard {
+ public:
+  static std::mutex &NCCLMutex() {
+    static std::mutex mtx;
+    return mtx;
+  }
+
+  inline NCCLGroupGuard() {
+    NCCLMutex().lock();
+    PADDLE_ENFORCE(dynload::ncclGroupStart());
+  }
+
+  inline ~NCCLGroupGuard() {
+    CHECK_EQ(dynload::ncclGroupEnd(), ncclSuccess);
+    NCCLMutex().unlock();
+  }
+};
+
+struct NCCLContext {
+  std::unique_ptr<CUDADeviceContext> ctx_;
+  ncclComm_t comm_;
+
+  explicit NCCLContext(int dev_id)
+      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
+
+  cudaStream_t stream() const { return ctx_->stream(); }
+
+  int device_id() const {
+    return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
+  }
+};
+
+struct NCCLContextMap {
+  std::unordered_map<int, NCCLContext> contexts_;
+  std::vector<int> order_;
+
+  explicit NCCLContextMap(const std::vector<platform::Place> &places,
+                          ncclUniqueId *nccl_id = nullptr,
+                          size_t num_trainers = 1, size_t trainer_id = 0) {
+    PADDLE_ENFORCE(!places.empty());
+    order_.reserve(places.size());
+    for (auto &p : places) {
+      int dev_id = boost::get<CUDAPlace>(p).device;
+      order_.emplace_back(dev_id);
+      contexts_.emplace(dev_id, NCCLContext(dev_id));
+    }
+    PADDLE_ENFORCE_EQ(
+        order_.size(), contexts_.size(),
+        "NCCL Context Map does not support contain two or more same device");
+
+    if (places.size() <= 1) {
+      return;
+    }
+    std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
+    // if pass nccl_id here, can assume we are doing multi node training
+    if (nccl_id == nullptr) {
+      std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
+      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+          comms.get(), static_cast<int>(order_.size()), order_.data()));
+    } else {
+      PADDLE_ENFORCE_GT(num_trainers, 1);
+      // TODO(wuyi): need to ensure each node have same number of GPUs
+      {
+        int nranks = num_trainers * order_.size();
+        NCCLGroupGuard gurad;
+        for (auto &gpu_id : order_) {
+          int rank = trainer_id * order_.size() + gpu_id;
+          VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks;
+          PADDLE_ENFORCE(cudaSetDevice(gpu_id));
+          PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+              comms.get() + gpu_id, nranks, *nccl_id, rank));
+        }
+      }
+    }
+    int i = 0;
+    for (auto &dev_id : order_) {
+      contexts_.at(dev_id).comm_ = comms[i++];
+    }
+  }
+
+  NCCLContextMap(const NCCLContextMap &other) = delete;
+  NCCLContextMap &operator=(const NCCLContextMap &other) = delete;
+
+  CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }
+
+  CUDADeviceContext *DevCtx(platform::Place p) const {
+    return DevCtx(boost::get<CUDAPlace>(p).device);
+  }
+
+  const NCCLContext &at(platform::Place p) const {
+    return this->at(boost::get<CUDAPlace>(p).device);
+  }
+
+  const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); }
+
+  void WaitAll() {
+    for (auto &p : contexts_) {
+      p.second.ctx_->Wait();
+    }
+  }
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index de8f958eb012cb1ac563cbbbac8951e439bf8f33..655ce8485d4584aa0955315b045da6bf541f7fe2 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -26,6 +26,7 @@ class PlacePrinter : public boost::static_visitor<> {
   void operator()(const CUDAPlace &p) {
     os_ << "CUDAPlace(" << p.device << ")";
   }
+  void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
 
  private:
   std::ostream &os_;
@@ -40,12 +41,19 @@ const Place &get_place() { return the_default_place; }
 
 const CUDAPlace default_gpu() { return CUDAPlace(0); }
 const CPUPlace default_cpu() { return CPUPlace(); }
+const CUDAPinnedPlace default_cuda_pinned() { return CUDAPinnedPlace(); }
 
 bool is_gpu_place(const Place &p) {
   return boost::apply_visitor(IsCUDAPlace(), p);
 }
 
-bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
+bool is_cpu_place(const Place &p) {
+  return boost::apply_visitor(IsCPUPlace(), p);
+}
+
+bool is_cuda_pinned_place(const Place &p) {
+  return boost::apply_visitor(IsCUDAPinnedPlace(), p);
+}
 
 bool places_are_same_class(const Place &p1, const Place &p2) {
   return p1.which() == p2.which();
@@ -53,7 +61,7 @@ bool places_are_same_class(const Place &p1, const Place &p2) {
 
 bool is_same_place(const Place &p1, const Place &p2) {
   if (places_are_same_class(p1, p2)) {
-    if (is_cpu_place(p1)) {
+    if (is_cpu_place(p1) || is_cuda_pinned_place(p1)) {
       return true;
     } else {
       return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2);
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index 501bddfc6ec8b5d0bf554b0911c32e47fd51ec15..e3ee504f3d042d6a99036e34507c4c8bee306750 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -11,10 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include <iostream>
+#include <vector>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/variant.h"
 
@@ -29,6 +30,7 @@ struct CPUPlace {
   // needed for variant equality comparison
   inline bool operator==(const CPUPlace &) const { return true; }
   inline bool operator!=(const CPUPlace &) const { return false; }
+  inline bool operator<(const CPUPlace &) const { return false; }
 };
 
 struct CUDAPlace {
@@ -41,16 +43,39 @@ struct CUDAPlace {
     return device == o.device;
   }
   inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
+  inline bool operator<(const CUDAPlace &o) const { return device < o.device; }
 
   int device;
 };
 
+struct CUDAPinnedPlace {
+  CUDAPinnedPlace() {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPinnedPlace &) const { return true; }
+  inline bool operator!=(const CUDAPinnedPlace &) const { return false; }
+  inline bool operator<(const CUDAPinnedPlace &) const { return false; }
+};
+
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &gpu) const { return true; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
-typedef boost::variant<CUDAPlace, CPUPlace> Place;
+struct IsCPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &cpu) const { return true; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
+};
+
+struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace, CUDAPinnedPlace> Place;
 
 using PlaceList = std::vector<Place>;
 
@@ -59,9 +84,11 @@ const Place &get_place();
 
 const CUDAPlace default_gpu();
 const CPUPlace default_cpu();
+const CUDAPinnedPlace default_cuda_pinned();
 
 bool is_gpu_place(const Place &);
 bool is_cpu_place(const Place &);
+bool is_cuda_pinned_place(const Place &);
 bool places_are_same_class(const Place &, const Place &);
 bool is_same_place(const Place &, const Place &);
 
@@ -83,6 +110,16 @@ struct PlaceVisitorWrapper
 #else
     PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device");
     return typename Visitor::result_type();
+#endif
+  }
+
+  typename Visitor::result_type operator()(
+      const CUDAPinnedPlace &cuda_pinned) const {
+#ifdef PADDLE_WITH_CUDA
+    return visitor_(cuda_pinned);
+#else
+    PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda_pinned");
+    return typename Visitor::result_type();
 #endif
   }
 };
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 28ef3e04b1c50e0d42eeb27608259c6449429da5..01de9d7041bf3eb40884e2a6295027cccfaebd2a 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,10 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+
 #include <sys/time.h>
 #include <time.h>
+#include <algorithm>
 #include <iomanip>
+#include <limits>
 #include <map>
+#include <mutex>  // NOLINT
+#include <random>
+#include <string>
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
@@ -28,10 +34,14 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+struct EventList;
+
+static int64_t profiler_lister_id = 0;
+static bool should_send_profile_state = false;
+std::mutex profiler_mu;
+
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
-// To record which timer the profiler used, CUDA or CPU.
-static std::string g_profiler_place = "";
 // The thread local event list only can be accessed by the specific thread
 // The thread index of each thread
 static thread_local int32_t g_thread_id;
@@ -45,6 +55,39 @@ static std::list<std::shared_ptr<EventList>> g_all_event_lists;
 // The thread local event list only can be accessed by the specific thread
 static thread_local std::shared_ptr<EventList> g_event_list;
 
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
+
+  template <typename... Args>
+  void Record(Args&&... args) {
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+  }
+
+  std::vector<Event> Reduce() {
+    std::vector<Event> result;
+    for (auto& block : event_blocks) {
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  void Clear() { event_blocks.clear(); }
+
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+
 inline uint64_t GetTimeInNsec() {
   using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
                                  std::chrono::high_resolution_clock,
@@ -60,9 +103,9 @@ inline uint64_t PosixInNsec() {
   return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }
 
-Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+Event::Event(EventType type, std::string name, uint32_t thread_id,
              const DeviceContext* dev_ctx)
-    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
+    : type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
 #ifdef PADDLE_WITH_CUDA
   has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
   if (has_cuda_) {
@@ -76,17 +119,7 @@ Event::Event(EventKind kind, std::string name, uint32_t thread_id,
   cpu_ns_ = GetTimeInNsec();
 }
 
-std::string Event::kind() const {
-  switch (kind_) {
-    case EventKind::kMark:
-      return "mark";
-    case EventKind::kPushRange:
-      return "push";
-    case EventKind::kPopRange:
-      return "pop";
-  }
-  PADDLE_THROW("Unknown EventKind.");
-}
+const EventType& Event::type() const { return type_; }
 
 double Event::CpuElapsedMs(const Event& e) const {
   return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
@@ -94,6 +127,7 @@ double Event::CpuElapsedMs(const Event& e) const {
 
 double Event::CudaElapsedMs(const Event& e) const {
 #ifdef PADDLE_WITH_CUDA
+  if (!has_cuda_) return 0.0;
   PADDLE_ENFORCE(e.has_cuda() && has_cuda());
   PADDLE_ENFORCE(e.device() == device());
   PADDLE_ENFORCE(cudaEventSynchronize(event_));
@@ -129,53 +163,81 @@ inline EventList& GetEventList() {
 }
 
 void Mark(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
+  GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
 }
 
 void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
+  GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
 }
 
 void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
-  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
+  GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
 }
 
 RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
-    : start_ns_(PosixInNsec()) {
+    : is_enabled_(false), start_ns_(PosixInNsec()) {
   if (g_state == ProfilerState::kDisabled) return;
+  is_enabled_ = true;
   dev_ctx_ = dev_ctx;
   name_ = name;
   PushEvent(name_, dev_ctx_);
   // Maybe need the same push/pop behavior.
-  SetCurAnnotation(name_.c_str());
+  SetCurAnnotation(name_);
 }
 
 RecordEvent::~RecordEvent() {
-  if (g_state == ProfilerState::kDisabled) return;
+  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
   DeviceTracer* tracer = GetDeviceTracer();
   if (tracer) {
-    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec());
+    tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
+                          BlockDepth(), CurThread());
   }
   ClearCurAnnotation();
   PopEvent(name_, dev_ctx_);
 }
 
+RecordBlock::RecordBlock(int block_id)
+    : is_enabled_(false), start_ns_(PosixInNsec()) {
+  if (g_state == ProfilerState::kDisabled) return;
+  is_enabled_ = true;
+  SetCurBlock(block_id);
+  name_ = string::Sprintf("block_%d", block_id);
+}
+
+RecordBlock::~RecordBlock() {
+  if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
+  DeviceTracer* tracer = GetDeviceTracer();
+  if (tracer) {
+    // We try to put all blocks at the same nested depth in the
+    // same timeline lane. and distinguish the using thread_id.
+    tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
+                          CurThread());
+  }
+  ClearCurBlock();
+}
+
+RecordThread::RecordThread(int thread_id) {
+  if (g_state == ProfilerState::kDisabled) return;
+  SetCurThread(thread_id);
+}
+
+RecordThread::~RecordThread() {
+  if (g_state == ProfilerState::kDisabled) return;
+  ClearCurThread();
+}
+
 void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enbale profling, since the input state is ",
                  "ProfilerState::kDisabled");
-  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
-                 "The profiling state should be disabled when calling ",
-                 "EnableProfiler.");
-  g_state = state;
-  if (g_state == ProfilerState::kCUDA) {
-    g_profiler_place = "CUDA";
-  } else if (g_state == ProfilerState::kCPU) {
-    g_profiler_place = "CPU";
-  } else {
-    g_profiler_place = "All";
-    GetDeviceTracer()->Enable();
+
+  std::lock_guard<std::mutex> l(profiler_mu);
+  if (state == g_state) {
+    return;
   }
+  g_state = state;
+  should_send_profile_state = true;
+  GetDeviceTracer()->Enable();
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
     // Generate some dummy events first to reduce the startup overhead.
@@ -211,27 +273,63 @@ std::vector<std::vector<Event>> GetAllEvents() {
   return result;
 }
 
-void DisableProfiler(EventSortingKey sorted_key,
-                     const std::string& profile_path) {
-  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
-                 "Can't disable profiling, since it's not starting.");
-  // Mark the profiling stop.
-  Mark("_stop_profiler_", nullptr);
-  g_state = ProfilerState::kDisabled;
+// The information of each event given in the profiling report
+struct EventItem {
+  std::string name;
+  int calls;
+  double total_time;
+  double min_time;
+  double max_time;
+  double ave_time;
+};
+
+// Print results
+void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
+                   const std::string& sorted_domain, const size_t name_width,
+                   const size_t data_width) {
+  // Output header information
+  std::cout << "\n------------------------->"
+            << "     Profiling Report     "
+            << "<-------------------------\n\n";
+  std::string place;
+  if (g_state == ProfilerState::kCPU) {
+    place = "CPU";
+  } else if (g_state == ProfilerState::kCUDA) {
+    place = "CUDA";
+  } else if (g_state == ProfilerState::kAll) {
+    place = "All";
+  } else {
+    PADDLE_THROW("Invalid profiler state", g_state);
+  }
 
-  std::vector<std::vector<Event>> all_events = GetAllEvents();
-  ParseEvents(all_events, sorted_key);
-  ResetProfiler();
-  DeviceTracer* tracer = GetDeviceTracer();
-  if (g_profiler_place == "All" && tracer && tracer->IsEnabled()) {
-    tracer->Disable();
-    tracer->GenProfile(profile_path);
+  std::cout << "Place: " << place << std::endl;
+  std::cout << "Time unit: ms" << std::endl;
+  std::cout << "Sorted by " << sorted_domain
+            << " in descending order in the same thread\n\n";
+  // Output events table
+  std::cout.setf(std::ios::left);
+  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
+            << "Calls" << std::setw(data_width) << "Total"
+            << std::setw(data_width) << "Min." << std::setw(data_width)
+            << "Max." << std::setw(data_width) << "Ave." << std::endl;
+  for (size_t i = 0; i < events_table.size(); ++i) {
+    for (size_t j = 0; j < events_table[i].size(); ++j) {
+      const EventItem& event_item = events_table[i][j];
+      std::cout << std::setw(name_width) << event_item.name
+                << std::setw(data_width) << event_item.calls
+                << std::setw(data_width) << event_item.total_time
+                << std::setw(data_width) << event_item.min_time
+                << std::setw(data_width) << event_item.max_time
+                << std::setw(data_width) << event_item.ave_time << std::endl;
+    }
   }
+  std::cout << std::endl;
 }
 
-void ParseEvents(std::vector<std::vector<Event>>& events,
-                 EventSortingKey sorted_by) {
-  if (g_profiler_place == "") return;
+// Parse the event list and output the profiling report
+void ParseEvents(const std::vector<std::vector<Event>>& events,
+                 EventSortingKey sorted_by = EventSortingKey::kDefault) {
+  if (g_state == ProfilerState::kDisabled) return;
 
   std::string sorted_domain;
   std::function<bool(const EventItem&, const EventItem&)> sorted_func;
@@ -278,9 +376,9 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
     std::unordered_map<std::string, int> event_idx;
 
     for (size_t j = 0; j < events[i].size(); j++) {
-      if (events[i][j].kind() == "push") {
+      if (events[i][j].type() == EventType::kPushRange) {
         pushed_events.push_back(events[i][j]);
-      } else if (events[i][j].kind() == "pop") {
+      } else if (events[i][j].type() == EventType::kPopRange) {
         std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
         while (rit != pushed_events.rend() &&
                rit->name() != events[i][j].name()) {
@@ -288,10 +386,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
         }
 
         if (rit != pushed_events.rend()) {
-          double event_time =
-              (g_profiler_place == "CUDA" || g_profiler_place == "All")
-                  ? rit->CudaElapsedMs(events[i][j])
-                  : rit->CpuElapsedMs(events[i][j]);
+          double event_time = (g_state == ProfilerState::kCUDA ||
+                               g_state == ProfilerState::kAll)
+                                  ? rit->CudaElapsedMs(events[i][j])
+                                  : rit->CpuElapsedMs(events[i][j]);
 
           std::string event_name =
               "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
@@ -347,36 +445,36 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
   PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
 }
 
-void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
-                   std::string& sorted_domain, const size_t name_width,
-                   const size_t data_width) {
-  // Output header information
-  std::cout << "\n------------------------->"
-            << "     Profiling Report     "
-            << "<-------------------------\n\n";
-  std::cout << "Place: " << g_profiler_place << std::endl;
-  std::cout << "Time unit: ms" << std::endl;
-  std::cout << "Sorted by " << sorted_domain
-            << " in descending order in the same thread\n\n";
-  // Output events table
-  std::cout.setf(std::ios::left);
-  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
-            << "Calls" << std::setw(data_width) << "Total"
-            << std::setw(data_width) << "Min." << std::setw(data_width)
-            << "Max." << std::setw(data_width) << "Ave." << std::endl;
-  for (size_t i = 0; i < events_table.size(); ++i) {
-    for (size_t j = 0; j < events_table[i].size(); ++j) {
-      EventItem& event_item = events_table[i][j];
-      std::cout << std::setw(name_width) << event_item.name
-                << std::setw(data_width) << event_item.calls
-                << std::setw(data_width) << event_item.total_time
-                << std::setw(data_width) << event_item.min_time
-                << std::setw(data_width) << event_item.max_time
-                << std::setw(data_width) << event_item.ave_time << std::endl;
-    }
+void DisableProfiler(EventSortingKey sorted_key,
+                     const std::string& profile_path) {
+  std::lock_guard<std::mutex> l(profiler_mu);
+  if (g_state == ProfilerState::kDisabled) return;
+  // Mark the profiling stop.
+  Mark("_stop_profiler_", nullptr);
+
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
+  DeviceTracer* tracer = GetDeviceTracer();
+  if (tracer->IsEnabled()) {
+    tracer->Disable();
+    tracer->GenProfile(profile_path);
   }
-  std::cout << std::endl;
+  g_state = ProfilerState::kDisabled;
+  should_send_profile_state = true;
+}
+
+bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
+bool ShouldSendProfileState() { return should_send_profile_state; }
+
+void SetProfileListener() {
+  std::mt19937 rng;
+  rng.seed(std::random_device()());
+  std::uniform_int_distribution<std::mt19937::result_type> dist6(
+      1, std::numeric_limits<int>::max());
+  profiler_lister_id = dist6(rng);
 }
+int64_t ListenerId() { return profiler_lister_id; }
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index 3542ce6cda87e3b013d60393e4ba93da61921940..bf43925373a12cd9ff2155d68c42d0266ba4df60 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -15,24 +15,23 @@ limitations under the License. */
 #pragma once
 #include <forward_list>
 #include <list>
-#include <mutex>
+#include <string>
 #include <vector>
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/profiler.pb.h"
 
 namespace paddle {
 namespace platform {
 
-enum EventKind { kMark, kPushRange, kPopRange };
+enum EventType { kMark, kPushRange, kPopRange };
 
 class Event {
  public:
   // The DeviceContext is used to get the cuda stream.
   // If CPU profiling mode, can pass nullptr.
-  Event(EventKind kind, std::string name, uint32_t thread_id,
+  Event(EventType type, std::string name, uint32_t thread_id,
         const DeviceContext* dev_ctx);
 
-  std::string kind() const;
+  const EventType& type() const;
   std::string name() const { return name_; }
   uint32_t thread_id() const { return thread_id_; }
   bool has_cuda() const { return has_cuda_; }
@@ -46,7 +45,7 @@ class Event {
   double CudaElapsedMs(const Event& e) const;
 
  private:
-  EventKind kind_;
+  EventType type_;
   std::string name_;
   uint32_t thread_id_;
   int64_t cpu_ns_;
@@ -57,39 +56,6 @@ class Event {
 #endif
 };
 
-struct EventList {
-  constexpr static size_t kMB = 1024 * 1024;
-  constexpr static size_t kEventBlockSize = 16 * kMB;
-  constexpr static size_t kEventSize = sizeof(Event);
-  constexpr static size_t kEventAlign = alignof(Event);
-  constexpr static size_t kNumBlock =
-      kEventBlockSize /
-      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
-
-  template <typename... Args>
-  void Record(Args&&... args) {
-    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
-      event_blocks.emplace_front();
-      event_blocks.front().reserve(kNumBlock);
-    }
-    event_blocks.front().emplace_back(std::forward<Args>(args)...);
-  }
-
-  std::vector<Event> Reduce() {
-    std::vector<Event> result;
-    for (auto& block : event_blocks) {
-      result.insert(result.begin(), std::make_move_iterator(block.begin()),
-                    std::make_move_iterator(block.end()));
-    }
-    event_blocks.clear();
-    return result;
-  }
-
-  void Clear() { event_blocks.clear(); }
-
-  std::forward_list<std::vector<Event>> event_blocks;
-};
-
 enum ProfilerState {
   kDisabled,  // disabled state
   kCPU,       // CPU profiling state
@@ -108,6 +74,7 @@ struct RecordEvent {
 
   ~RecordEvent();
 
+  bool is_enabled_;
   uint64_t start_ns_;
   // The device context is used by Event to get the current cuda stream.
   const DeviceContext* dev_ctx_;
@@ -118,20 +85,25 @@ struct RecordEvent {
   std::string full_name_;
 };
 
+struct RecordBlock {
+  explicit RecordBlock(int block_id);
+  ~RecordBlock();
+
+ private:
+  bool is_enabled_;
+  std::string name_;
+  uint64_t start_ns_;
+};
+
+struct RecordThread {
+  explicit RecordThread(int thread_id);
+  ~RecordThread();
+};
+
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
 std::vector<std::vector<Event>> GetAllEvents();
 
-// The information of each event given in the profiling report
-struct EventItem {
-  std::string name;
-  int calls;
-  double total_time;
-  double min_time;
-  double max_time;
-  double ave_time;
-};
-
 // Candidate keys to sort the profiling report
 enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
 
@@ -144,14 +116,15 @@ void ResetProfiler();
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path);
 
-// Parse the event list and output the profiling report
-void ParseEvents(std::vector<std::vector<Event>>&,
-                 EventSortingKey sorted_by = EventSortingKey::kDefault);
-
-// Print results
-void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
-                   std::string& sorted_domain, const size_t name_width,
-                   const size_t data_width);
+const int kEnableProfiler = 1;
+const int kDisableProfiler = 2;
+// Test if the profiler is currently enabled.
+bool IsProfileEnabled();
+// Whether the trainer should send profiling state to PS.
+bool ShouldSendProfileState();
+// Mark current process as PS by assigning a lister id.
+void SetProfileListener();
+int64_t ListenerId();
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto
index 71b5a9b12ef4a045ebfd3ee3d06ee25032083ff5..7b42aa785ec6ad5731e3adee1e9f189127a826a1 100644
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -18,12 +18,17 @@ package paddle.platform.proto;
 message MemCopy { optional uint64 bytes = 1; }
 
 message Event {
+  enum EventType {
+    CPU = 0;
+    GPUKernel = 1;
+  }
+  optional EventType type = 8;
   optional string name = 1;
   optional uint64 start_ns = 2;
   optional uint64 end_ns = 3;
   // When positive, it represents gpu id. When -1, it represents CPU.
   optional int64 device_id = 5;
-  optional uint32 stream_id = 6;
+  optional int64 sub_device_id = 6;
 
   optional MemCopy memcopy = 7;
 }
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index fc77e0f3213da776e0b05ad5b5da9081665cdf6e..61f467814ba4a24c8b73f1bc614cda0ab8c4debd 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -13,19 +13,23 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+#include <string>
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+#endif
 #include "gtest/gtest.h"
 
 TEST(Event, CpuElapsedTime) {
   using paddle::platform::Event;
-  using paddle::platform::EventKind;
+  using paddle::platform::EventType;
 
-  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
+  Event start_event(EventType::kPushRange, "test", 0, nullptr);
   EXPECT_TRUE(start_event.has_cuda() == false);
   int counter = 0;
   while (counter != 1000) {
     counter++;
   }
-  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
+  Event stop_event(EventType::kPopRange, "test", 0, nullptr);
   EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
 }
 
@@ -35,16 +39,16 @@ TEST(Event, CudaElapsedTime) {
   using paddle::platform::CUDADeviceContext;
   using paddle::platform::CUDAPlace;
   using paddle::platform::Event;
-  using paddle::platform::EventKind;
+  using paddle::platform::EventType;
 
   DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
-  Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
+  Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
   EXPECT_TRUE(start_event.has_cuda() == true);
   int counter = 0;
   while (counter != 1000) {
     counter++;
   }
-  Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
+  Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
   EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
 }
 #endif
@@ -52,7 +56,7 @@ TEST(Event, CudaElapsedTime) {
 TEST(RecordEvent, RecordEvent) {
   using paddle::platform::DeviceContext;
   using paddle::platform::Event;
-  using paddle::platform::EventKind;
+  using paddle::platform::EventType;
   using paddle::platform::RecordEvent;
   using paddle::platform::ProfilerState;
   using paddle::platform::EventSortingKey;
@@ -157,3 +161,13 @@ TEST(RecordEvent, RecordEvent) {
   // Will remove parsing-related code from test later
   DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler");
 }
+
+#ifdef PADDLE_WITH_CUDA
+TEST(TMP, stream_wait) {
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  cudaStreamSynchronize(stream);
+  cudaStreamSynchronize(stream);
+  cudaStreamSynchronize(stream);
+}
+#endif
diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h
index 917c48b47f8d70cd821d45dfbc6bafa494710ffa..7877d3e41c1c993662f5d91b263cbcb71db74c36 100644
--- a/paddle/fluid/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
@@ -14,29 +14,44 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+#include <type_traits>
+
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/place.h"
 
-#include <algorithm>
-#include <type_traits>
 #ifdef __NVCC__
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
-#include "paddle/fluid/platform/details/device_ptr_cast.h"
+#include "paddle/fluid/platform/details/cuda_transform_iterator_cast.h"
 #endif
 
 namespace paddle {
 namespace platform {
 
-// Transform on host or device. It provides the same API in std library.
+// Transform applys a unary or a binary functor on each element in a
+// range defined by a pair of iterators.
+//
+// - The specialization for CPU calls std::transform.
+// - The specialization for CUDA calls thrust::tranform.
+//
+// NOTE: We need to define InputIter and OutputIter defined as
+//       different types, because the InputIter points op's inputs and
+//       OutputIter pints to op's outputs.
+//
+// NOTE: We don't assume that InputIter to be const InputType* and
+//       OutputIter to be OutputType*, because we might use a iterator
+//       class, paddle::fluid::operators::RowwiseTRansformIterator.
 template <typename DeviceContext>
 struct Transform {
+  // The unary version.
   template <typename InputIter, typename OutputIter, typename UnaryOperation>
   void operator()(const DeviceContext& context, InputIter first, InputIter last,
                   OutputIter result, UnaryOperation op);
 
+  // The binary version.
   template <typename InputIter1, typename InputIter2, typename OutputIter,
             typename BinaryOperation>
   void operator()(const DeviceContext& context, InputIter1 first1,
@@ -70,8 +85,9 @@ struct Transform<platform::CUDADeviceContext> {
     auto place = context.GetPlace();
     PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
     thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::DevPtrCast(first), details::DevPtrCast(last),
-                      details::DevPtrCast(result), op);
+                      details::CastToCUDATransformIterator(first),
+                      details::CastToCUDATransformIterator(last),
+                      details::CastToCUDATransformIterator(result), op);
   }
 
   template <typename InputIter1, typename InputIter2, typename OutputIter,
@@ -82,9 +98,10 @@ struct Transform<platform::CUDADeviceContext> {
     auto place = context.GetPlace();
     PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
     thrust::transform(thrust::cuda::par.on(context.stream()),
-                      details::DevPtrCast(first1), details::DevPtrCast(last1),
-                      details::DevPtrCast(first2), details::DevPtrCast(result),
-                      op);
+                      details::CastToCUDATransformIterator(first1),
+                      details::CastToCUDATransformIterator(last1),
+                      details::CastToCUDATransformIterator(first2),
+                      details::CastToCUDATransformIterator(result), op);
   }
 };
 #endif
diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu
index 7b5cfd8f43473dc6bc784e98bd26fdd9e0ba9994..f65d1f60100edc85ba9745ed36f26a0ed160d80f 100644
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
@@ -18,11 +18,12 @@ limitations under the License. */
 #include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/transform.h"
 
+namespace {
+
 template <typename T>
 class Scale {
  public:
   explicit Scale(const T& scale) : scale_(scale) {}
-
   HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
 
  private:
@@ -35,11 +36,23 @@ class Multiply {
   HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
 };
 
+}  // namespace
+
+using paddle::memory::Alloc;
+using paddle::memory::Free;
+using paddle::memory::Copy;
+
+using paddle::platform::CPUPlace;
+using paddle::platform::CUDAPlace;
+using paddle::platform::CPUDeviceContext;
+using paddle::platform::CUDADeviceContext;
+
+using paddle::platform::Transform;
+
 TEST(Transform, CPUUnary) {
-  using namespace paddle::platform;
   CPUDeviceContext ctx;
   float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform<paddle::platform::CPUDeviceContext> trans;
+  Transform<CPUDeviceContext> trans;
   trans(ctx, buf, buf + 4, buf, Scale<float>(10));
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
@@ -47,14 +60,12 @@ TEST(Transform, CPUUnary) {
 }
 
 TEST(Transform, GPUUnary) {
-  using namespace paddle::platform;
-  using namespace paddle::memory;
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
   Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
-  Transform<paddle::platform::CUDADeviceContext> trans;
+  Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
   ctx.Wait();
   Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
@@ -65,10 +76,8 @@ TEST(Transform, GPUUnary) {
 }
 
 TEST(Transform, CPUBinary) {
-  using namespace paddle::platform;
-  using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  Transform<paddle::platform::CPUDeviceContext> trans;
+  Transform<CPUDeviceContext> trans;
   CPUDeviceContext ctx;
   trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
   for (int i = 0; i < 4; ++i) {
@@ -77,14 +86,12 @@ TEST(Transform, CPUBinary) {
 }
 
 TEST(Transform, GPUBinary) {
-  using namespace paddle::platform;
-  using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
   CUDAPlace gpu0(0);
   CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
-  Transform<paddle::platform::CUDADeviceContext> trans;
+  Transform<CUDADeviceContext> trans;
   trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
   ctx.Wait();
   Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index 05ca33137de8db5291c8e38fc03457d05092cea8..45f60fc9d76560b133fa06198a24c7eaccc24088 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -14,29 +14,25 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef __CUDACC__
-#ifdef __CUDACC_VER_MAJOR__
-// CUDA 9 define `__CUDACC_VER__` as a warning message, manually define
-// __CUDACC_VER__ instead.
+// Boost 1.41.0 requires __CUDACC_VER__, but in CUDA 9 __CUDACC_VER__
+// is removed, so we have to manually define __CUDACC_VER__ instead.
+// For details, please refer to
+// https://github.com/PaddlePaddle/Paddle/issues/6626
+#if defined(__CUDACC__) && defined(__CUDACC_VER_MAJOR__)
 #undef __CUDACC_VER__
-
-#define __CUDACC_VER__                                         \
-  (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
-   __CUDACC_VER_BUILD__)
-#endif
-
+#define __CUDACC_VER__                                  \
+  __CUDACC_VER_BUILD__ + __CUDACC_VER_MAJOR__ * 10000 + \
+      __CUDACC_VER_MINOR__ * 100
 #endif
 
-#include <boost/config.hpp>
+#include "boost/config.hpp"
 
-#ifdef PADDLE_WITH_CUDA
-
-// Because boost's variadic templates has bug on nvcc, boost will disable
-// variadic template support when GPU enabled on nvcc.
-// Define BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
-// function symbols.
-//
+// Because Boost 1.41.0's variadic templates has bug on nvcc, boost
+// will disable variadic template support in NVCC mode.  Define
+// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
+// function symbols.  For details,
 // https://github.com/PaddlePaddle/Paddle/issues/3386
+#ifdef PADDLE_WITH_CUDA
 #ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
 #define BOOST_NO_CXX11_VARIADIC_TEMPLATES
 #endif
diff --git a/paddle/fluid/pybind/.clang-format b/paddle/fluid/pybind/.clang-format
deleted file mode 100644
index 29282dc87e2c499988c17d90d47d44cd5cf7f115..0000000000000000000000000000000000000000
--- a/paddle/fluid/pybind/.clang-format
+++ /dev/null
@@ -1,5 +0,0 @@
----
-Language:        Cpp
-BasedOnStyle:  Google
-Standard:  Cpp11 
-...
diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..8f222791edb016df65be5db75831f5f83cf63726
--- /dev/null
+++ b/paddle/fluid/pybind/.gitignore
@@ -0,0 +1 @@
+pybind.h
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 8942b5c9430ffa4e499b0ad1d2b5acf6d18ec0ab..4fef351c2118e43697606c90a616cd870e78cd77 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,9 +1,20 @@
 if(WITH_PYTHON)
-  cc_library(paddle_pybind SHARED
-    SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-    DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
-    ${GLOB_OP_LIB})
-  if(NOT APPLE AND NOT ANDROID)
-    target_link_libraries(paddle_pybind rt)
-  endif(NOT APPLE AND NOT ANDROID)
+  if(WITH_AMD_GPU)
+    hip_library(paddle_pybind SHARED
+      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
+           parallel_executor
+      ${GLOB_OP_LIB})
+  else()
+    cc_library(paddle_pybind SHARED
+      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
+           parallel_executor
+      ${GLOB_OP_LIB})
+    if(NOT APPLE AND NOT ANDROID)
+      target_link_libraries(paddle_pybind rt)
+    endif(NOT APPLE AND NOT ANDROID)
+  endif(WITH_AMD_GPU)
+
+  cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python)
 endif(WITH_PYTHON)
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index 6657b25ed2443c1ac9cb0a09098968d3181fc6ba..76aa7d2010682416f68e982e9b89da9813abb078 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -12,17 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "const_value.h"
+#include "paddle/fluid/pybind/const_value.h"
+#include <paddle/fluid/framework/op_proto_maker.h>
 #include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace pybind {
 
-void BindConstValue(pybind11::module& m) {
-  m.def("kEmptyVarName", [] { return framework::kEmptyVarName; });
-  m.def("kTempVarName", [] { return framework::kTempVarName; });
-  m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
-  m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+void BindConstValue(pybind11::module* m) {
+  m->def("kEmptyVarName", [] { return framework::kEmptyVarName; });
+  m->def("kTempVarName", [] { return framework::kTempVarName; });
+  m->def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
+  m->def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+
+  auto op_proto_and_checker_maker =
+      m->def_submodule("op_proto_and_checker_maker");
+
+  pybind11::enum_<framework::OpRole>(op_proto_and_checker_maker, "OpRole")
+      .value("Forward", framework::OpRole::kForward)
+      .value("Backward", framework::OpRole::kBackward)
+      .value("Optimize", framework::OpRole::kOptimize)
+      .value("Loss", framework::OpRole::kLoss)
+      .value("RPC", framework::OpRole::kRPC);
+
+  op_proto_and_checker_maker.def(
+      "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);
+  op_proto_and_checker_maker.def(
+      "kOpRoleVarAttrName",
+      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName);
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/const_value.h b/paddle/fluid/pybind/const_value.h
index 79e71e039dea6585aaf8193f1417c6ab3fbf6f76..2fab3160d1d95af7f6a49c472c2e211c19e67cac 100644
--- a/paddle/fluid/pybind/const_value.h
+++ b/paddle/fluid/pybind/const_value.h
@@ -11,16 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
+
 #include <Python.h>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "pybind11/pybind11.h"
 
-namespace py = pybind11;
-
 namespace paddle {
 namespace pybind {
-extern void BindConstValue(pybind11::module& m);
+
+void BindConstValue(pybind11::module* m);
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc
index 4bd3ecf728dedaf74a554f77b114065f2d515786..831f30e35fd3e01ce0f0524f6f85dd59494f5353 100644
--- a/paddle/fluid/pybind/exception.cc
+++ b/paddle/fluid/pybind/exception.cc
@@ -17,17 +17,21 @@ limitations under the License. */
 namespace paddle {
 namespace pybind {
 
-void BindException(pybind11::module& m) {
-  static pybind11::exception<platform::EnforceNotMet> exc(m, "EnforceNotMet");
+void BindException(pybind11::module* m) {
+  static pybind11::exception<platform::EOFException> eof(*m, "EOFException");
+  static pybind11::exception<platform::EnforceNotMet> exc(*m, "EnforceNotMet");
   pybind11::register_exception_translator([](std::exception_ptr p) {
     try {
       if (p) std::rethrow_exception(p);
+    } catch (const platform::EOFException& e) {
+      eof(e.what());
     } catch (const platform::EnforceNotMet& e) {
       exc(e.what());
     }
   });
 
-  m.def("__unittest_throw_exception__", [] { PADDLE_THROW("test exception"); });
+  m->def("__unittest_throw_exception__",
+         [] { PADDLE_THROW("test exception"); });
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/exception.h b/paddle/fluid/pybind/exception.h
index bc6b0c067978959d4cdafec51db9574927b34b21..5e054267361f2c62b3ad36581be0ad17ce0718de 100644
--- a/paddle/fluid/pybind/exception.h
+++ b/paddle/fluid/pybind/exception.h
@@ -11,14 +11,17 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
+
 #include <Python.h>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "pybind11/pybind11.h"
+
 namespace paddle {
 namespace pybind {
 
-extern void BindException(pybind11::module& m);
+void BindException(pybind11::module* m);
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 45a64f43846e79c27295e52c59dca6bdfaa120a3..fcd3356d44ee592233c3883d439d0677714900b8 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -11,11 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/pybind/protobuf.h"
+
 #include <deque>
 #include <iostream>
-#include "paddle/fluid/framework/backward.h"
+#include <string>
+#include <tuple>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -95,10 +97,11 @@ struct type_caster<boost::variant<Args...>>
 namespace paddle {
 namespace pybind {
 
-using namespace paddle::framework;  // NOLINT
+namespace pd = paddle::framework;
 
 template <typename T>
-static py::bytes SerializeMessage(T &self) {
+static pybind11::bytes SerializeMessage(
+    T &self) {  // NOLINT due to pybind11 convention.
   // Check IsInitialized in Python
   std::string retv;
   PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv),
@@ -107,203 +110,205 @@ static py::bytes SerializeMessage(T &self) {
 }
 
 // Bind Methods
-void BindProgramDesc(py::module &m) {
-  py::class_<ProgramDesc>(m, "ProgramDesc", "")
-      .def(py::init<>())
+void BindProgramDesc(pybind11::module *m) {
+  pybind11::class_<pd::ProgramDesc>(*m, "ProgramDesc", "")
+      .def(pybind11::init<>())
       .def("__init__",
-           [](ProgramDesc &self, const ProgramDesc &other) {
-             new (&self) ProgramDesc(other);
+           [](pd::ProgramDesc &self, const pd::ProgramDesc &other) {
+             new (&self) pd::ProgramDesc(other);
            })
       .def("__init__",
-           [](ProgramDesc &self, const py::bytes &binary_str) {
+           [](pd::ProgramDesc &self, const pybind11::bytes &binary_str) {
              std::string str(binary_str);
-             new (&self) ProgramDesc(str);
+             new (&self) pd::ProgramDesc(str);
            })
-      .def("append_block", &ProgramDesc::AppendBlock,
-           py::return_value_policy::reference)
-      .def("append_backward",
-           [](ProgramDesc &program_desc, const VarDesc &target,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             ParamGradInfoMap param_grad_map =
-                 AppendBackward(program_desc, target, no_grad_vars);
-             std::unordered_map<
-                 std::string, std::tuple<std::string /* grad_var_name */,
-                                         int /* block_idx */, int /* op_idx */>>
-                 retv;
-             for (auto it = param_grad_map.begin(); it != param_grad_map.end();
-                  ++it) {
-               const auto &grad_info = it->second;
-               retv[it->first] = std::make_tuple(
-                   grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
-             }
-             return retv;
-           })
-      .def("block", &ProgramDesc::MutableBlock,
-           py::return_value_policy::reference)
-      .def("num_blocks", &ProgramDesc::Size)
-      .def("serialize_to_string", SerializeMessage<ProgramDesc>)
+      .def("append_block", &pd::ProgramDesc::AppendBlock,
+           pybind11::return_value_policy::reference)
+      .def("block", &pd::ProgramDesc::MutableBlock,
+           pybind11::return_value_policy::reference)
+      .def("num_blocks", &pd::ProgramDesc::Size)
+      .def("flush", &pd::ProgramDesc::Flush)
+      .def("get_feed_target_names", &pd::ProgramDesc::GetFeedTargetNames)
+      .def("get_fetch_target_names", &pd::ProgramDesc::GetFetchTargetNames)
+      .def("serialize_to_string", SerializeMessage<pd::ProgramDesc>)
       .def("parse_from_string",
-           [](ProgramDesc &program_desc, const std::string &data) {
-             proto::ProgramDesc *desc = program_desc.Proto();
+           [](pd::ProgramDesc &program_desc, const std::string &data) {
+             pd::proto::ProgramDesc *desc = program_desc.Proto();
              PADDLE_ENFORCE(desc->ParseFromString(data),
                             "Fail to parse ProgramDesc from string. This could "
                             "be a bug of Paddle.");
            });
 }
 
-void BindBlockDesc(py::module &m) {
-  py::class_<BlockDesc>(m, "BlockDesc", "")
-      .def_property_readonly("id", &BlockDesc::ID)
-      .def_property_readonly("parent", &BlockDesc::Parent)
-      .def("get_forward_block_idx", &BlockDesc::ForwardBlockID)
-      .def("set_forward_block_idx", &BlockDesc::SetForwardBlockID)
-      .def("append_op", &BlockDesc::AppendOp,
-           py::return_value_policy::reference)
-      .def("prepend_op", &BlockDesc::PrependOp,
-           py::return_value_policy::reference)
-      .def("insert_op", &BlockDesc::InsertOp,
-           py::return_value_policy::reference)
-      .def("remove_op", &BlockDesc::RemoveOp)
+void BindBlockDesc(pybind11::module *m) {
+  pybind11::class_<pd::BlockDesc>(*m, "BlockDesc", "")
+      .def_property_readonly("id", &pd::BlockDesc::ID)
+      .def_property_readonly("parent", &pd::BlockDesc::Parent)
+      .def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID)
+      .def("set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID)
+      .def("append_op", &pd::BlockDesc::AppendOp,
+           pybind11::return_value_policy::reference)
+      .def("prepend_op", &pd::BlockDesc::PrependOp,
+           pybind11::return_value_policy::reference)
+      .def("insert_op", &pd::BlockDesc::InsertOp,
+           pybind11::return_value_policy::reference)
+      .def("remove_op", &pd::BlockDesc::RemoveOp)
       .def("var",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
              std::string name = byte_name;
              return self.Var(name);
            },
-           py::return_value_policy::reference)
+           pybind11::return_value_policy::reference)
       .def("has_var",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
              std::string name = byte_name;
              return self.HasVar(name);
            },
-           py::return_value_policy::reference)
+           pybind11::return_value_policy::reference)
       .def("rename_var",
-           [](BlockDesc &self, const py::bytes &byte_name,
-              const py::bytes &byte_name_new) {
+           [](pd::BlockDesc &self, const pybind11::bytes &byte_name,
+              const pybind11::bytes &byte_name_new) {
              std::string name = byte_name;
              std::string new_name = byte_name_new;
              self.RenameVar(name, new_name);
            })
       .def("has_var_recursive",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
              std::string name = byte_name;
              return self.HasVarRecursive(name);
            })
       .def("find_var",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
              std::string name = byte_name;
              return self.FindVar(name);
            },
-           py::return_value_policy::reference)
+           pybind11::return_value_policy::reference)
       .def("find_var_recursive",
-           [](BlockDesc &self, py::bytes byte_name) {
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
              std::string name = byte_name;
              return self.FindVarRecursive(name);
            },
-           py::return_value_policy::reference)
-      .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
-      .def("op_size", &BlockDesc::OpSize)
-      .def("op", &BlockDesc::Op, py::return_value_policy::reference)
-      .def("serialize_to_string", SerializeMessage<BlockDesc>);
+           pybind11::return_value_policy::reference)
+      .def("remove_var",
+           [](pd::BlockDesc &self, pybind11::bytes byte_name) {
+             std::string name = byte_name;
+             return self.RemoveVar(name);
+           },
+           pybind11::return_value_policy::reference)
+      .def("all_vars", &pd::BlockDesc::AllVars,
+           pybind11::return_value_policy::reference)
+      .def("op_size", &pd::BlockDesc::OpSize)
+      .def("op", &pd::BlockDesc::Op, pybind11::return_value_policy::reference)
+      .def("serialize_to_string", SerializeMessage<pd::BlockDesc>);
 }
 
-void BindVarDsec(py::module &m) {
-  py::class_<VarDesc> var_desc(m, "VarDesc", "");
+void BindVarDsec(pybind11::module *m) {
+  pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
   var_desc
       .def("name",
-           [](VarDesc &self) {
-             py::bytes name = self.Name();
+           [](pd::VarDesc &self) {
+             pybind11::bytes name = self.Name();
              return name;
            },
-           py::return_value_policy::reference)
-      .def("set_name", &VarDesc::SetName)
-      .def("set_shape", &VarDesc::SetShape)
-      .def("set_shapes", &VarDesc::SetShapes)
-      .def("set_dtype", &VarDesc::SetDataType)
-      .def("set_dtypes", &VarDesc::SetDataTypes)
-      .def("set_capacity", &VarDesc::SetCapacity)
-      .def("shape", &VarDesc::GetShape, py::return_value_policy::reference)
-      .def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference)
-      .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
-      .def("dtypes", &VarDesc::GetDataTypes, py::return_value_policy::reference)
-      .def("lod_level", &VarDesc::GetLoDLevel)
-      .def("lod_levels", &VarDesc::GetLoDLevels,
-           py::return_value_policy::reference)
-      .def("set_lod_level", &VarDesc::SetLoDLevel)
-      .def("set_lod_levels", &VarDesc::SetLoDLevels)
-      .def("type", &VarDesc::GetType)
-      .def("set_type", &VarDesc::SetType)
-      .def("serialize_to_string", SerializeMessage<VarDesc>)
-      .def("persistable", &VarDesc::Persistable)
-      .def("set_persistable", &VarDesc::SetPersistable);
+           pybind11::return_value_policy::reference)
+      .def("set_name", &pd::VarDesc::SetName)
+      .def("set_shape", &pd::VarDesc::SetShape)
+      .def("set_shapes", &pd::VarDesc::SetShapes)
+      .def("set_dtype", &pd::VarDesc::SetDataType)
+      .def("set_dtypes", &pd::VarDesc::SetDataTypes)
+      .def("set_capacity", &pd::VarDesc::SetCapacity)
+      .def("shape", &pd::VarDesc::GetShape,
+           pybind11::return_value_policy::reference)
+      .def("shapes", &pd::VarDesc::GetShapes,
+           pybind11::return_value_policy::reference)
+      .def("dtype", &pd::VarDesc::GetDataType,
+           pybind11::return_value_policy::reference)
+      .def("dtypes", &pd::VarDesc::GetDataTypes,
+           pybind11::return_value_policy::reference)
+      .def("lod_level", &pd::VarDesc::GetLoDLevel)
+      .def("lod_levels", &pd::VarDesc::GetLoDLevels,
+           pybind11::return_value_policy::reference)
+      .def("set_lod_level", &pd::VarDesc::SetLoDLevel)
+      .def("set_lod_levels", &pd::VarDesc::SetLoDLevels)
+      .def("type", &pd::VarDesc::GetType)
+      .def("set_type", &pd::VarDesc::SetType)
+      .def("serialize_to_string", SerializeMessage<pd::VarDesc>)
+      .def("persistable", &pd::VarDesc::Persistable)
+      .def("set_persistable", &pd::VarDesc::SetPersistable);
 
-  py::enum_<proto::VarType::Type>(var_desc, "VarType", "")
-      .value("BOOL", proto::VarType::BOOL)
-      .value("INT16", proto::VarType::INT16)
-      .value("INT32", proto::VarType::INT32)
-      .value("INT64", proto::VarType::INT64)
-      .value("FP16", proto::VarType::FP16)
-      .value("FP32", proto::VarType::FP32)
-      .value("FP64", proto::VarType::FP64)
-      .value("LOD_TENSOR", proto::VarType::LOD_TENSOR)
-      .value("SELECTED_ROWS", proto::VarType::SELECTED_ROWS)
-      .value("FEED_MINIBATCH", proto::VarType::FEED_MINIBATCH)
-      .value("FETCH_LIST", proto::VarType::FETCH_LIST)
-      .value("STEP_SCOPES", proto::VarType::STEP_SCOPES)
-      .value("LOD_RANK_TABLE", proto::VarType::LOD_RANK_TABLE)
-      .value("LOD_TENSOR_ARRAY", proto::VarType::LOD_TENSOR_ARRAY)
-      .value("CHANNEL", proto::VarType::CHANNEL)
-      .value("PLACE_LIST", proto::VarType::PLACE_LIST)
-      .value("READER", proto::VarType::READER)
-      .value("RAW", proto::VarType::RAW);
+  pybind11::enum_<pd::proto::VarType::Type>(var_desc, "VarType", "")
+      .value("BOOL", pd::proto::VarType::BOOL)
+      .value("UINT8", pd::proto::VarType::UINT8)
+      .value("INT16", pd::proto::VarType::INT16)
+      .value("INT32", pd::proto::VarType::INT32)
+      .value("INT64", pd::proto::VarType::INT64)
+      .value("FP16", pd::proto::VarType::FP16)
+      .value("FP32", pd::proto::VarType::FP32)
+      .value("FP64", pd::proto::VarType::FP64)
+      .value("LOD_TENSOR", pd::proto::VarType::LOD_TENSOR)
+      .value("SELECTED_ROWS", pd::proto::VarType::SELECTED_ROWS)
+      .value("FEED_MINIBATCH", pd::proto::VarType::FEED_MINIBATCH)
+      .value("FETCH_LIST", pd::proto::VarType::FETCH_LIST)
+      .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES)
+      .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE)
+      .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
+      .value("CHANNEL", pd::proto::VarType::CHANNEL)
+      .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
+      .value("READER", pd::proto::VarType::READER)
+      .value("RAW", pd::proto::VarType::RAW);
 }
 
-void BindOpDesc(py::module &m) {
-  py::enum_<proto::AttrType>(m, "AttrType", "")
-      .value("INT", proto::AttrType::INT)
-      .value("INTS", proto::AttrType::INTS)
-      .value("FLOAT", proto::AttrType::FLOAT)
-      .value("FLOATS", proto::AttrType::FLOATS)
-      .value("STRING", proto::AttrType::STRING)
-      .value("STRINGS", proto::AttrType::STRINGS)
-      .value("BOOL", proto::AttrType::BOOLEAN)
-      .value("BOOLS", proto::AttrType::BOOLEANS)
-      .value("BLOCK", proto::AttrType::BLOCK);
+void BindOpDesc(pybind11::module *m) {
+  pybind11::enum_<pd::proto::AttrType>(*m, "AttrType", "")
+      .value("INT", pd::proto::AttrType::INT)
+      .value("INTS", pd::proto::AttrType::INTS)
+      .value("FLOAT", pd::proto::AttrType::FLOAT)
+      .value("FLOATS", pd::proto::AttrType::FLOATS)
+      .value("STRING", pd::proto::AttrType::STRING)
+      .value("STRINGS", pd::proto::AttrType::STRINGS)
+      .value("BOOL", pd::proto::AttrType::BOOLEAN)
+      .value("BOOLS", pd::proto::AttrType::BOOLEANS)
+      .value("BLOCK", pd::proto::AttrType::BLOCK)
+      .value("BLOCKS", pd::proto::AttrType::BLOCKS);
 
-  py::class_<OpDesc> op_desc(m, "OpDesc", "");
+  pybind11::class_<pd::OpDesc> op_desc(*m, "OpDesc", "");
   op_desc
-      .def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
-           py::return_value_policy::reference)
-      .def("copy_from", &OpDesc::CopyFrom)
-      .def("type", &OpDesc::Type)
-      .def("set_type", &OpDesc::SetType)
-      .def("input", &OpDesc::Input)
-      .def("input_names", &OpDesc::InputNames)
-      .def("output", &OpDesc::Output)
-      .def("output_names", &OpDesc::OutputNames)
-      .def("set_input", &OpDesc::SetInput)
-      .def("set_output", &OpDesc::SetOutput)
-      .def("input_arg_names", &OpDesc::InputArgumentNames)
-      .def("output_arg_names", &OpDesc::OutputArgumentNames)
-      .def("rename_input", &OpDesc::RenameInput)
-      .def("rename_output", &OpDesc::RenameOutput)
-      .def("has_attr", &OpDesc::HasAttr)
-      .def("attr_type", &OpDesc::GetAttrType)
-      .def("attr_names", &OpDesc::AttrNames)
-      .def("set_attr", &OpDesc::SetAttr)
-      .def("attr", &OpDesc::GetAttr)
-      .def("set_block_attr", &OpDesc::SetBlockAttr)
+      .def("__init__", [](pd::OpDesc &self) { new (&self) pd::OpDesc(); },
+           pybind11::return_value_policy::reference)
+      .def("copy_from", &pd::OpDesc::CopyFrom)
+      .def("type", &pd::OpDesc::Type)
+      .def("set_type", &pd::OpDesc::SetType)
+      .def("input", &pd::OpDesc::Input)
+      .def("input_names", &pd::OpDesc::InputNames)
+      .def("output", &pd::OpDesc::Output)
+      .def("output_names", &pd::OpDesc::OutputNames)
+      .def("set_input", &pd::OpDesc::SetInput)
+      .def("set_output", &pd::OpDesc::SetOutput)
+      .def("input_arg_names", &pd::OpDesc::InputArgumentNames)
+      .def("output_arg_names", &pd::OpDesc::OutputArgumentNames)
+      .def("rename_input", &pd::OpDesc::RenameInput)
+      .def("rename_output", &pd::OpDesc::RenameOutput)
+      .def("has_attr", &pd::OpDesc::HasAttr)
+      .def("attr_type", &pd::OpDesc::GetAttrType)
+      .def("attr_names", &pd::OpDesc::AttrNames)
+      .def("set_attr", &pd::OpDesc::SetAttr)
+      .def("attr", &pd::OpDesc::GetAttr)
+      .def("set_block_attr", &pd::OpDesc::SetBlockAttr)
+      .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr)
       .def("set_serialized_attr",
-           [](OpDesc &self, const std::string &name,
-              const py::bytes &seriralized) {
+           [](pd::OpDesc &self, const std::string &name,
+              const pybind11::bytes &seriralized) {
              std::string ser(seriralized);
              self.SetAttr(name, ser);
            })
-      .def("block_attr", &OpDesc::GetBlockAttr)
-      .def("check_attrs", &OpDesc::CheckAttrs)
-      .def("infer_shape", &OpDesc::InferShape)
-      .def("infer_var_type", &OpDesc::InferVarType)
-      .def("serialize_to_string", SerializeMessage<OpDesc>)
-      .def("block", &OpDesc::Block, py::return_value_policy::reference);
+      .def("block_attr", &pd::OpDesc::GetBlockAttr)
+      .def("check_attrs", &pd::OpDesc::CheckAttrs)
+      .def("infer_shape", &pd::OpDesc::InferShape)
+      .def("infer_var_type", &pd::OpDesc::InferVarType)
+      .def("set_is_target", &pd::OpDesc::SetIsTarget)
+      .def("serialize_to_string", SerializeMessage<pd::OpDesc>)
+      .def("block", &pd::OpDesc::Block,
+           pybind11::return_value_policy::reference);
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/protobuf.h b/paddle/fluid/pybind/protobuf.h
index d0dc8936b3df50ca12315f113fbb36b0f98bb53f..e7370672a88fcf9238cc88c6aae65c6ee643746b 100644
--- a/paddle/fluid/pybind/protobuf.h
+++ b/paddle/fluid/pybind/protobuf.h
@@ -11,25 +11,25 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include <Python.h>
+
 #include <fstream>
 #include <vector>
+
 #include "paddle/fluid/platform/variant.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
 
-namespace py = pybind11;
-
 namespace paddle {
 namespace pybind {
 
-void BindProgramDesc(py::module& m);
-void BindBlockDesc(py::module& m);
-void BindVarDsec(py::module& m);
-void BindOpDesc(py::module& m);
+void BindProgramDesc(pybind11::module* m);
+void BindBlockDesc(pybind11::module* m);
+void BindVarDsec(pybind11::module* m);
+void BindOpDesc(pybind11::module* m);
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d2e883caccdd34a9d662f06b83cf9a71d3d4a51e..3191f29fc3e5d2914e4b68be9e94ccc0d05f8f93 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -11,12 +11,15 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#include "paddle/fluid/pybind/protobuf.h"
-
-#include <mutex>  // for call_once
+#include <Python.h>
+#include <algorithm>
+#include <map>
+#include <mutex>  // NOLINT // for call_once
+#include <string>
 #include <unordered_map>
-#include "paddle/fluid/framework/backward.h"
+#include <utility>
+#include <vector>
+
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
@@ -25,17 +28,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include "paddle/fluid/operators/activation_op.h"
+#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/pybind.h"
+#include "paddle/fluid/pybind/protobuf.h"
+#include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 
@@ -67,7 +73,7 @@ PYBIND11_PLUGIN(core) {
   // not cause namespace pollution.
   using namespace paddle::framework;  // NOLINT
 
-  BindException(m);
+  BindException(&m);
 
   py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
       .def_buffer(
@@ -98,17 +104,36 @@ PYBIND11_PLUGIN(core) {
            [](Tensor &self, paddle::platform::CUDAPlace &place) {
              self.mutable_data<int>(place);
            })
+      .def("alloc_int",
+           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
+             self.mutable_data<int>(place);
+           })
+      .def("alloc_float",
+           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
+             self.mutable_data<float>(place);
+           })
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
       .def("set", PyCPUTensorSetFromArray<int64_t>)
       .def("set", PyCPUTensorSetFromArray<bool>)
+      .def("set", PyCPUTensorSetFromArray<uint16_t>)
+      .def("set", PyCPUTensorSetFromArray<uint8_t>)
 #ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
       .def("set", PyCUDATensorSetFromArray<int64_t>)
       .def("set", PyCUDATensorSetFromArray<bool>)
+      .def("set", PyCUDATensorSetFromArray<uint16_t>)
+      .def("set", PyCUDATensorSetFromArray<uint8_t>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<float>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<int>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<double>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<int64_t>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<bool>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
+      .def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
       .def("set_float_element", TensorSetElement<float>)
@@ -120,28 +145,75 @@ PYBIND11_PLUGIN(core) {
   py::class_<LoDTensor, Tensor>(m, "LoDTensor")
       .def_buffer(
           [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
-      .def(
-          "__init__",
-          [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-            LoD new_lod;
-            new_lod.reserve(lod.size());
-            std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-            new (&instance) LoDTensor(new_lod);
-          })
+      .def("__init__",
+           [](LoDTensor &instance, const std::vector<std::vector<size_t>>
+                                       &recursive_sequence_lengths) {
+             LoD new_lod;
+             new_lod.reserve(recursive_sequence_lengths.size());
+             std::copy(recursive_sequence_lengths.begin(),
+                       recursive_sequence_lengths.end(),
+                       std::back_inserter(new_lod));
+             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
+             PADDLE_ENFORCE(
+                 CheckLoD(new_offset_lod, -1),
+                 "the provided recursive_sequence_lengths info is invalid");
+             new (&instance) LoDTensor(new_offset_lod);
+           })
       .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
+      // We implement offset based LOD in C++ while we use length based with
+      // Python API. So we changed set_lod to set_recursive_sequence_lengths to
+      // avoid misuse.
+      // The discussion is here:
+      // https://github.com/PaddlePaddle/Paddle/issues/10855
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
+             // the input lod is offset-based level-of-detail info
              LoD new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             PADDLE_ENFORCE(CheckLoD(new_lod, vectorize(self.dims()).front()),
+                            "the provided lod info is invalid");
              self.set_lod(new_lod);
            })
-      .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-        auto lod = self.lod();
-        std::vector<std::vector<size_t>> new_lod;
-        new_lod.reserve(lod.size());
-        std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
-        return new_lod;
+      .def("set_recursive_sequence_lengths",
+           [](LoDTensor &self, const std::vector<std::vector<size_t>>
+                                   &recursive_sequence_lengths) {
+             // the input recursive_sequence_lengths is length-based
+             // level-of-detail info
+             LoD new_lod;
+             new_lod.reserve(recursive_sequence_lengths.size());
+             std::copy(recursive_sequence_lengths.begin(),
+                       recursive_sequence_lengths.end(),
+                       std::back_inserter(new_lod));
+             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
+             PADDLE_ENFORCE(
+                 CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
+                 "the provided recursive_sequence_lengths info is invalid");
+             self.set_lod(new_offset_lod);
+           })
+      .def("lod",
+           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
+             // output the offset-based lod info
+             LoD lod = self.lod();
+             std::vector<std::vector<size_t>> new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             return new_lod;
+           })
+      // Set above comments of set_lod.
+      .def("recursive_sequence_lengths",
+           [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
+             // output the length-based lod info
+             LoD lod = ConvertToLengthBasedLoD(self.lod());
+             std::vector<std::vector<size_t>> new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             return new_lod;
+           })
+      .def("has_valid_recursive_sequence_lengths", [](LoDTensor &self) -> bool {
+        // Check that the lod info is valid and match the outermost
+        // dimension of the LoDTensor data
+        return CheckLoD(self.lod(), vectorize(self.dims()).front());
       });
 
   py::class_<SelectedRows>(m, "SelectedRows")
@@ -216,11 +288,6 @@ All parameter, weight, gradient are variables in Paddle.
            },
            py::return_value_policy::reference)
 #endif
-      .def("get_net",
-           [](Variable &self) -> operators::NetOp * {
-             return self.GetMutable<operators::NetOp>();
-           },
-           py::return_value_policy::reference)
       .def("get_reader",
            [](Variable &self) -> framework::ReaderHolder * {
              PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
@@ -229,9 +296,39 @@ All parameter, weight, gradient are variables in Paddle.
            py::return_value_policy::reference);
 
   py::class_<framework::ReaderHolder>(m, "Reader", "")
-      .def("has_next", &framework::ReaderHolder::HasNext)
       .def("reset", &framework::ReaderHolder::ReInit);
 
+  using LoDTensorBlockingQueue =
+      ::paddle::operators::reader::LoDTensorBlockingQueue;
+  using LoDTensorBlockingQueueHolder =
+      ::paddle::operators::reader::LoDTensorBlockingQueueHolder;
+  py::class_<LoDTensorBlockingQueue>(m, "LoDTensorBlockingQueue", "")
+      .def("push",
+           [](LoDTensorBlockingQueue &self,
+              const std::vector<framework::LoDTensor> &lod_tensor_vec) {
+             pybind11::gil_scoped_release release;
+             return self.Push(lod_tensor_vec);
+           })
+      .def("size", &LoDTensorBlockingQueue::Size)
+      .def("capacity", &LoDTensorBlockingQueue::Cap)
+      .def("close", &LoDTensorBlockingQueue::Close)
+      .def("is_closed", &LoDTensorBlockingQueue::IsClosed);
+
+  m.def("init_lod_tensor_blocking_queue",
+        [](Variable &var, size_t capacity,
+           const std::vector<std::vector<int64_t>> &shapes)
+            -> LoDTensorBlockingQueue * {
+              std::vector<DDim> dims(shapes.size());
+              std::transform(shapes.begin(), shapes.end(), dims.begin(),
+                             [](const std::vector<int64_t> &shape) {
+                               return make_ddim(shape);
+                             });
+              auto *holder = var.GetMutable<LoDTensorBlockingQueueHolder>();
+              holder->InitOnce(capacity, dims);
+              return holder->GetQueue().get();
+            },
+        py::return_value_policy::reference);
+
   py::class_<Scope>(m, "Scope", "")
       .def("var",
            [](Scope &self, const std::string &name) -> Variable * {
@@ -280,7 +377,7 @@ All parameter, weight, gradient are variables in Paddle.
                     const std::vector<std::array<size_t, 2>> &targets) {
     ProgramDesc prog_with_targets(origin);
     for (const auto &t : targets) {
-      prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget();
+      prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true);
     }
     proto::ProgramDesc pruned_desc;
     Prune(*prog_with_targets.Proto(), &pruned_desc);
@@ -313,9 +410,18 @@ All parameter, weight, gradient are variables in Paddle.
 #else
                     return new paddle::platform::CUDADeviceContext(place);
 #endif
-                  });
+                  })
+          .def_static("create",
+                [](paddle::platform::CUDAPinnedPlace& place)
+                        -> paddle::platform::DeviceContext* {
+#ifndef PADDLE_WITH_CUDA
+                  PADDLE_THROW(
+                        "CUDAPinnedPlace is not supported in CPU device.");
+#else
+                  return new paddle::platform::CUDAPinnedDeviceContext(place);
+#endif
+                });;
 // clang-format on
-
 #ifdef PADDLE_WITH_CUDA
   py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
 #endif
@@ -327,6 +433,10 @@ All parameter, weight, gradient are variables in Paddle.
       .def(py::init<>())
       .def("__str__", string::to_string<const platform::CPUPlace &>);
 
+  py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
+      .def(py::init<>())
+      .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
+
   py::class_<platform::Place>(m, "Place")
       .def(py::init<>())
       .def("set_place",
@@ -336,7 +446,11 @@ All parameter, weight, gradient are variables in Paddle.
       .def("set_place",
            [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
              self = gpu_place;
-           });
+           })
+      .def("set_place", [](platform::Place &self,
+                           const platform::CUDAPinnedPlace &cuda_pinned_place) {
+        self = cuda_pinned_place;
+      });
 
   py::class_<OperatorBase>(m, "Operator")
       .def_static("create",
@@ -349,17 +463,17 @@ All parameter, weight, gradient are variables in Paddle.
                                    desc.InitializationErrorString());
                     return OpRegistry::CreateOp(desc);
                   })
-      .def("backward",
-           [](const OperatorBase &forwardOp,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             return Backward(forwardOp, no_grad_vars).release();
-           })
       .def("run",
            [](OperatorBase &self, const Scope &scope,
               const platform::CPUPlace &place) { self.Run(scope, place); })
       .def("run",
            [](OperatorBase &self, const Scope &scope,
               const platform::CUDAPlace &place) { self.Run(scope, place); })
+      .def("run",
+           [](OperatorBase &self, const Scope &scope,
+              const platform::CUDAPinnedPlace &place) {
+             self.Run(scope, place);
+           })
       .def("type",
            [](const OperatorBase &op) -> std::string { return op.Type(); })
       .def("outputs",
@@ -376,62 +490,39 @@ All parameter, weight, gradient are variables in Paddle.
            [](const OperatorBase &op) { return op.OutputVars(false); })
       .def("support_gpu", &OperatorBase::SupportGPU);
 
-  py::class_<operators::NetOp, OperatorBase>(m, "Net")
-      .def_static("create",
-                  []() -> operators::NetOp * {
-                    auto *retv = new operators::NetOp;
-                    retv->SetType("plain_net");
-                    return retv;
-                  })
-      .def("append_op", [](operators::NetOp &self,
-                           const OperatorBase &op) { self.AppendOp(op); })
-      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
-      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
-        self->CompleteAddOp();
-      });
-
-  // cond_op
-  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
-      .def_static("create",
-                  [](py::bytes protobin) -> operators::CondOp * {
-                    proto::OpDesc desc;
-                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
-                                   "Cannot parse user input to OpDesc");
-                    PADDLE_ENFORCE(desc.IsInitialized(),
-                                   "User OpDesc is not initialized, reason %s",
-                                   desc.InitializationErrorString());
-                    auto cond_op = OpRegistry::CreateOp(desc);
-                    return static_cast<operators::CondOp *>(cond_op.release());
-                  })
-      .def("set_truenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_truenet(net.Clone());
-           })
-      .def("set_falsenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_falsenet(net.Clone());
-           });
-
   py::class_<framework::Executor>(m, "Executor")
       .def(py::init<const platform::Place &>())
-      .def("run",
-           (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
-               Executor::Run);
+#ifdef PADDLE_WITH_DISTRIBUTE
+      .def("begin_pass", &Executor::BeginPass)
+      .def("end_pass", &Executor::EndPass)
+#endif
+      .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
+                     int block_id, bool create_local_scope, bool create_vars) {
+        pybind11::gil_scoped_release release;
+        self.Run(prog, scope, block_id, create_local_scope, create_vars);
+      });
 
   m.def("init_gflags", framework::InitGflags);
   m.def("init_glog", framework::InitGLOG);
-  m.def("init_devices", &framework::InitDevices);
+  m.def("init_devices",
+        [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+#ifdef PADDLE_WITH_CUDA
+  m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
+    // Only GPUs with Compute Capability >= 53 support float16
+    return platform::GetCUDAComputeCapability(place.device) >= 53;
+  });
+#endif
 
   m.def("set_feed_variable", framework::SetFeedVariable);
   m.def("get_fetch_variable", framework::GetFetchVariable);
 
-  BindProgramDesc(m);
-  BindBlockDesc(m);
-  BindVarDsec(m);
-  BindOpDesc(m);
-  BindConstValue(m);
+  BindProgramDesc(&m);
+  BindBlockDesc(&m);
+  BindVarDsec(&m);
+  BindOpDesc(&m);
+  BindConstValue(&m);
 
   py::class_<framework::LoDRankTable>(m, "LodRankTable")
       .def("items", [](framework::LoDRankTable &table) {
@@ -459,6 +550,9 @@ All parameter, weight, gradient are variables in Paddle.
         self.back().set_lod(t.lod());
       });
 
+  m.def("IsInplace",
+        [](std::string op) -> bool { return operators::IsInplace(op); });
+
   m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
   m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
@@ -486,9 +580,104 @@ All parameter, weight, gradient are variables in Paddle.
 
   m.def("enable_profiler", platform::EnableProfiler);
   m.def("disable_profiler", platform::DisableProfiler);
+  m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
 
-  BindRecordIOWriter(m);
+  // -- python binds for parallel executor.
+  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
+  py::class_<ExecutionStrategy>(pe, "ExecutionStrategy")
+      .def(py::init())
+      .def_property(
+          "num_threads",
+          [](const ExecutionStrategy &self) { return self.num_threads_; },
+          [](ExecutionStrategy &self, size_t num_threads) {
+            self.num_threads_ = num_threads;
+          })
+      .def_property(
+          "use_cuda",
+          [](const ExecutionStrategy &self) { return self.use_cuda_; },
+          [](ExecutionStrategy &self, bool use_cuda) {
+            self.use_cuda_ = use_cuda;
+          })
+      .def_property(
+          "allow_op_delay",
+          [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
+          [](ExecutionStrategy &self, bool allow_op_delay) {
+            self.allow_op_delay_ = allow_op_delay;
+          })
+      .def_property(
+          "num_iteration_per_drop_scope",
+          [](const ExecutionStrategy &self) {
+            return self.num_iteration_per_drop_scope_;
+          },
+          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
+            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
+          });
+  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
+
+  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
+      .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
+      .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce);
+  py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
+                                                  "GradientScaleStrategy")
+      .value("CoeffNumDevice",
+             BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
+      .value("One", BuildStrategy::GradientScaleStrategy::kOne)
+      .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
+
+  build_strategy.def(py::init())
+      .def_property(
+          "reduce_strategy",
+          [](const BuildStrategy &self) { return self.reduce_; },
+          [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
+            self.reduce_ = strategy;
+          })
+      .def_property(
+          "gradient_scale_strategy",
+          [](const BuildStrategy &self) { return self.gradient_scale_; },
+          [](BuildStrategy &self,
+             BuildStrategy::GradientScaleStrategy strategy) {
+            self.gradient_scale_ = strategy;
+          })
+      .def_property(
+          "debug_graphviz_path",
+          [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
+          [](BuildStrategy &self, const std::string &path) {
+            self.debug_graphviz_path_ = path;
+          })
+      .def_property(
+          "enable_data_balance",
+          [](const BuildStrategy &self) { return self.enable_data_balance_; },
+          [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; });
+
+  pe.def(py::init<const std::vector<platform::Place> &,
+                  const std::unordered_set<std::string> &,
+                  const std::unordered_set<std::string> &, const ProgramDesc &,
+                  const std::string &, Scope *, std::vector<Scope *> &,
+                  const ExecutionStrategy &, const BuildStrategy &, size_t,
+                  size_t>())
+      .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
+      // NOTE: even we return a vec<Scope*>* to Python use reference policy.
+      // We still cannot get local_scope from this vector, since the element
+      // of vec<Scope*> will be freed by Python GC. We can only return Scope*
+      // one by one and mark them as reference.
+      .def("local_scopes",
+           [](ParallelExecutor &self) -> std::vector<Scope *> * {
+             return &self.GetLocalScopes();
+           },
+           py::return_value_policy::reference)
+      .def("feed_tensors_into_local_scopes",
+           &ParallelExecutor::FeedTensorsIntoLocalScopes)
+      .def("feed_and_split_tensor_into_local_scopes",
+           &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes)
+      .def("run", [](ParallelExecutor &self,
+                     const std::vector<std::string> &fetch_tensors,
+                     const std::string &fetched_var_name) {
+        pybind11::gil_scoped_release release;
+        self.Run(fetch_tensors, fetched_var_name);
+      });
+
+  BindRecordIOWriter(&m);
   return m.ptr();
 }
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc
index 16f8bfb1a2e3a840670594d3cc2970e690dce891..330d104e0a774d905e463566f85bd2e64a080190 100644
--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
@@ -13,13 +13,19 @@
 // limitations under the License.
 
 #include "paddle/fluid/pybind/recordio.h"
+
 #include <fstream>
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/recordio/writer.h"
 
 namespace paddle {
 namespace pybind {
 
+namespace {
+
 class RecordIOWriter {
  public:
   RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
@@ -33,7 +39,7 @@ class RecordIOWriter {
   void CompleteAppendTensor() {
     auto& ctx =
         *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
-    framework::WriteToRecordIO(writer_, tensors_, ctx);
+    framework::WriteToRecordIO(&writer_, tensors_, ctx);
     tensors_.clear();
   }
 
@@ -49,8 +55,10 @@ class RecordIOWriter {
   recordio::Writer writer_;
 };
 
-void BindRecordIOWriter(py::module& m) {
-  py::class_<RecordIOWriter> writer(m, "RecordIOWriter", "");
+}  // namespace
+
+void BindRecordIOWriter(py::module* m) {
+  py::class_<RecordIOWriter> writer(*m, "RecordIOWriter", "");
   py::enum_<recordio::Compressor>(writer, "Compressor", "")
       .value("Snappy", recordio::Compressor::kSnappy)
       .value("NoCompress", recordio::Compressor::kNoCompress);
diff --git a/paddle/fluid/pybind/recordio.h b/paddle/fluid/pybind/recordio.h
index 60e6a9e8595614b38375fca8c13d520739af9aaf..2555f9b719af8f73fbac10d92b890afd99fac290 100644
--- a/paddle/fluid/pybind/recordio.h
+++ b/paddle/fluid/pybind/recordio.h
@@ -21,6 +21,7 @@ namespace py = pybind11;
 namespace paddle {
 namespace pybind {
 
-extern void BindRecordIOWriter(py::module& m);
+void BindRecordIOWriter(py::module* m);
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 1b0916ea0370d95a0c7dd149ee3f7b294c5e2351..3e2ea1ef88b03f5b2576c1cee2b5d26a439943da 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <Python.h>
 #include <string>
+#include <tuple>
+#include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/float16.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 
-namespace py = pybind11;
-
 namespace paddle {
-
 namespace pybind {
-
 namespace details {
 
 template <bool less, size_t I, typename... ARGS>
@@ -33,16 +33,16 @@ struct CastToPyBufferImpl;
 
 template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<false, I, ARGS...> {
-  py::buffer_info operator()(framework::Tensor &tensor) {
+  pybind11::buffer_info operator()(const framework::Tensor &tensor) {
     PADDLE_THROW("This type of tensor cannot be expose to Python");
-    return py::buffer_info();
+    return pybind11::buffer_info();
   }
 };
 
 template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
   using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
-  py::buffer_info operator()(framework::Tensor &tensor) {
+  pybind11::buffer_info operator()(const framework::Tensor &tensor) {
     if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
       auto dim_vec = framework::vectorize(tensor.dims());
       std::vector<size_t> dims_outside;
@@ -63,99 +63,182 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
             tensor.dims(), platform::CPUPlace()));
 
-        platform::DeviceContextPool &pool =
-            platform::DeviceContextPool::Instance();
-        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
-            pool.Get(tensor.place()));
-
-        paddle::platform::GpuMemcpyAsync(
-            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
-            cudaMemcpyDeviceToHost, dev_ctx->stream());
+        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
+                                        sizeof(CUR_TYPE) * tensor.numel(),
+                                        cudaMemcpyDeviceToHost);
 #else
         PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
 #endif
       } else if (paddle::platform::is_cpu_place(tensor.place())) {
         dst_tensor = tensor;
       }
-      return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
-                             py::format_descriptor<CUR_TYPE>::format(),
-                             (size_t)framework::arity(dst_tensor.dims()),
-                             dims_outside, strides);
+
+      if (std::type_index(typeid(CUR_TYPE)) ==
+          std::type_index(typeid(platform::float16))) {
+        return pybind11::buffer_info(
+            dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+            "e", /* np.dtype('e') == np.float16 */
+            (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
+      } else {
+        return pybind11::buffer_info(
+            dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+            pybind11::format_descriptor<CUR_TYPE>::format(),
+            (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
+      }
     } else {
       constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
       return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
     }
   }
 };
+
 }  // namespace details
-inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
+
+inline pybind11::buffer_info CastToPyBuffer(const framework::Tensor &tensor) {
   auto buffer_info =
-      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()(
-          tensor);
+      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool,
+                                  uint8_t, platform::float16>()(tensor);
   return buffer_info;
 }
 
 template <typename T>
-T TensorGetElement(framework::Tensor &self, size_t offset) {
+T TensorGetElement(const framework::Tensor &self, size_t offset) {
   if (platform::is_cpu_place(self.place())) {
     return self.data<T>()[offset];
   } else {
     std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
-    framework::TensorCopy(self, platform::CPUPlace(), dst.get());
+    framework::TensorCopySync(self, platform::CPUPlace(), dst.get());
     return dst->data<T>()[offset];
   }
 }
 
 // TODO(dzhwinter) : fix the redundent Tensor allocate and free
 template <typename T>
-void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
-  if (platform::is_gpu_place(self.place())) {
+void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
+  if (platform::is_gpu_place(self->place())) {
     std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
-    framework::TensorCopy(self, platform::CPUPlace(), dst.get());
+    framework::TensorCopySync(*self, platform::CPUPlace(), dst.get());
     dst->data<T>()[offset] = elem;
-    framework::TensorCopy(*dst.get(), self.place(), &self);
+    framework::TensorCopySync(*dst.get(), self->place(), self);
 
-  } else if (platform::is_cpu_place(self.place())) {
-    self.data<T>()[offset] = elem;
+  } else if (platform::is_cpu_place(self->place())) {
+    self->data<T>()[offset] = elem;
   }
 }
 
 template <typename T>
 void PyCPUTensorSetFromArray(
-    framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array,
-    paddle::platform::CPUPlace &place) {
+    framework::Tensor *self,
+    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
-    dims.push_back((int)array.shape()[i]);
+    dims.push_back(static_cast<int>(array.shape()[i]));
   }
 
-  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(place);
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<T>(place);
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 
+template <>
+// This following specialization maps uint16_t in the parameter type to
+// platform::float16.
+inline void PyCPUTensorSetFromArray(
+    framework::Tensor *self,
+    pybind11::array_t<uint16_t,
+                      pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    paddle::platform::CPUPlace place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back(static_cast<int>(array.shape()[i]));
+  }
+
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<platform::float16>(place);
+  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
+}
+
 #ifdef PADDLE_WITH_CUDA
 template <typename T>
 void PyCUDATensorSetFromArray(
-    framework::Tensor &self,
-    py::array_t<T, py::array::c_style | py::array::forcecast> array,
-    paddle::platform::CUDAPlace &place) {
+    framework::Tensor *self,
+    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    paddle::platform::CUDAPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
   for (size_t i = 0; i < array.ndim(); ++i) {
-    dims.push_back((int)array.shape()[i]);
+    dims.push_back(static_cast<int>(array.shape()[i]));
   }
 
-  self.Resize(framework::make_ddim(dims));
-  auto *dst = self.mutable_data<T>(place);
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<T>(place);
+  paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
+                                  cudaMemcpyHostToDevice);
+}
+
+template <>
+// This following specialization maps uint16_t in the parameter type to
+// platform::float16.
+inline void PyCUDATensorSetFromArray(
+    framework::Tensor *self,
+    pybind11::array_t<uint16_t,
+                      pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    paddle::platform::CUDAPlace place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back(static_cast<int>(array.shape()[i]));
+  }
+
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<platform::float16>(place);
+  paddle::platform::GpuMemcpySync(dst, array.data(),
+                                  sizeof(uint16_t) * array.size(),
+                                  cudaMemcpyHostToDevice);
+}
+
+template <typename T>
+void PyCUDAPinnedTensorSetFromArray(
+    framework::Tensor *self,
+    pybind11::array_t<T, pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    const paddle::platform::CUDAPinnedPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back(static_cast<int>(array.shape()[i]));
+  }
+
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<T>(place);
+  std::memcpy(dst, array.data(), sizeof(T) * array.size());
+}
+
+template <>
+// This following specialization maps uint16_t in the parameter type to
+// platform::float16.
+inline void PyCUDAPinnedTensorSetFromArray(
+    framework::Tensor *self,
+    pybind11::array_t<uint16_t,
+                      pybind11::array::c_style | pybind11::array::forcecast>
+        array,
+    const paddle::platform::CUDAPinnedPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back(static_cast<int>(array.shape()[i]));
+  }
 
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto dev_ctx =
-      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
-  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
-                                   cudaMemcpyHostToDevice, dev_ctx->stream());
+  self->Resize(framework::make_ddim(dims));
+  auto *dst = self->mutable_data<platform::float16>(place);
+  std::memcpy(dst, array.data(), sizeof(uint16_t) * array.size());
 }
 #endif
 
diff --git a/paddle/fluid/pybind/tensor_py_test.cc b/paddle/fluid/pybind/tensor_py_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1a0ae1d65833b1097bf69befe05884cab1317a89
--- /dev/null
+++ b/paddle/fluid/pybind/tensor_py_test.cc
@@ -0,0 +1,44 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pybind/tensor_py.h"
+
+#include <iostream>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/tensor.h"
+
+TEST(TensorPy, CastToPyBufferImpl) {
+  typedef int ElemType;
+
+  paddle::framework::Tensor t;
+  auto d = paddle::framework::make_ddim({1, 2, 3});
+  int* p = t.mutable_data<ElemType>(d, paddle::platform::CPUPlace());
+  for (int i = 0; i < paddle::framework::product(d); ++i) {
+    p[i] = i;
+  }
+
+  pybind11::buffer_info bi = paddle::pybind::CastToPyBuffer(t);
+  EXPECT_EQ(bi.itemsize, static_cast<size_t>(sizeof(ElemType)));
+  EXPECT_EQ(bi.size, static_cast<size_t>(paddle::framework::product(d)));
+  EXPECT_EQ(bi.ndim, static_cast<size_t>(3));  // 3-dimensional as d.
+  EXPECT_EQ(bi.shape.size(), 3U);              // as Dim d.
+  EXPECT_EQ(bi.shape[0], static_cast<size_t>(1));
+  EXPECT_EQ(bi.shape[1], static_cast<size_t>(2));
+  EXPECT_EQ(bi.shape[2], static_cast<size_t>(3));
+  EXPECT_EQ(bi.strides.size(), 3U);  // 3-dimensional as d.
+  EXPECT_EQ(bi.strides[2], static_cast<size_t>(sizeof(ElemType)));
+  EXPECT_EQ(bi.strides[1], static_cast<size_t>(sizeof(ElemType) * 3));
+  EXPECT_EQ(bi.strides[0], static_cast<size_t>(sizeof(ElemType) * 2 * 3));
+}
diff --git a/paddle/fluid/recordio/README.md b/paddle/fluid/recordio/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ef99c0cf0fa71d807a95898454d8fabb287324e9
--- /dev/null
+++ b/paddle/fluid/recordio/README.md
@@ -0,0 +1,13 @@
+## Background
+
+The RecordIO file format is a container for records.  This package is a C++ implementation of https://github.com/paddlepaddle/recordio, which originates from https://github.com/wangkuiyi/recordio.
+
+## Fault-tolerant Writing
+
+For the initial design purpose of RecordIO within Google, which was logging, RecordIO groups record into *chunks*, whose header contains an MD5 hash of the chunk.  A process that writes logs is supposed to call the Writer interface to add records.  Once the writer accumulates a handful of them, it groups a chunk, put the MD5 into the chunk header, and appends the chunk to the file.  In the event the process crashes unexpected, the last chunk in the RecordIO file could be incomplete/corrupt. The RecordIO reader is able to recover from these errors when the process restarts by identifying incomplete chucks and skipping over them.
+
+## Reading Ranges
+
+A side-effect of chunks is to make it easy to indexing records while reading, thus allows us to read a range of successive records.  This is good for distributed log process, where each MapReduce task handles only part of records in a big RecordIO file.
+
+The procedure that creates the index starts from reading the header of the first chunk. It indexes the offset (0) and the size of the chunk, and skips to the header of the next chunk by calling the `fseek` API. Please be aware that most distributed filesystems and all POSIX-compatible local filesystem provides `fseek`, and makes sure that `fseek` runs much faster than `fread`.  This procedure generates a map from chunks to their offsets, which allows the readers is to locate and read a range of records.
diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc
index 187a6a4ea7bd9d3a8ae48fa262e18f71b0f7d20d..6c65d9160c059ac143ee258b2bdaed5915a1dca1 100644
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
@@ -14,11 +14,13 @@
 
 #include "paddle/fluid/recordio/chunk.h"
 
+#include <zlib.h>
+#include <algorithm>
 #include <memory>
 #include <sstream>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "snappystream.hpp"
-#include "zlib.h"
 
 namespace paddle {
 namespace recordio {
@@ -58,8 +60,8 @@ static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) {
  * Copy stream in to another stream
  */
 static void PipeStream(std::istream& in, std::ostream& os) {
-  ReadStreamByBuf(
-      in, 0, [&os](const char* buf, size_t len) { os.write(buf, len); });
+  ReadStreamByBuf(in, 0,
+                  [&os](const char* buf, size_t len) { os.write(buf, len); });
 }
 
 /**
@@ -68,8 +70,8 @@ static void PipeStream(std::istream& in, std::ostream& os) {
 static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) {
   uint32_t crc = static_cast<uint32_t>(crc32(0, nullptr, 0));
   ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
-    crc = static_cast<uint32_t>(crc32(
-        crc, reinterpret_cast<const Bytef*>(buf), static_cast<uInt>(len)));
+    crc = static_cast<uint32_t>(crc32(crc, reinterpret_cast<const Bytef*>(buf),
+                                      static_cast<uInt>(len)));
   });
   return crc;
 }
@@ -117,40 +119,56 @@ bool Chunk::Write(std::ostream& os, Compressor ct) const {
 }
 
 bool Chunk::Parse(std::istream& sin) {
-  Header hdr;
-  bool ok = hdr.Parse(sin);
+  ChunkParser parser(sin);
+  if (!parser.Init()) {
+    return false;
+  }
+  Clear();
+  while (parser.HasNext()) {
+    Add(parser.Next());
+  }
+  return true;
+}
+
+ChunkParser::ChunkParser(std::istream& sin) : in_(sin) {}
+bool ChunkParser::Init() {
+  pos_ = 0;
+  bool ok = header_.Parse(in_);
   if (!ok) {
     return ok;
   }
-  auto beg_pos = sin.tellg();
-  uint32_t crc = Crc32Stream(sin, hdr.CompressSize());
-  PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
-  Clear();
-  sin.seekg(beg_pos, sin.beg);
-  std::unique_ptr<std::istream> compressed_stream;
-  switch (hdr.CompressType()) {
+  auto beg_pos = in_.tellg();
+  uint32_t crc = Crc32Stream(in_, header_.CompressSize());
+  PADDLE_ENFORCE_EQ(header_.Checksum(), crc);
+  in_.seekg(beg_pos, in_.beg);
+
+  switch (header_.CompressType()) {
     case Compressor::kNoCompress:
       break;
     case Compressor::kSnappy:
-      compressed_stream.reset(new snappy::iSnappyStream(sin));
+      compressed_stream_.reset(new snappy::iSnappyStream(in_));
       break;
     default:
       PADDLE_THROW("Not implemented");
   }
+  return true;
+}
 
-  std::istream& stream = compressed_stream ? *compressed_stream : sin;
+bool ChunkParser::HasNext() const { return pos_ < header_.NumRecords(); }
 
-  for (uint32_t i = 0; i < hdr.NumRecords(); ++i) {
-    uint32_t rec_len;
-    stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
-    std::string buf;
-    buf.resize(rec_len);
-    stream.read(&buf[0], rec_len);
-    PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
-    Add(buf);
+std::string ChunkParser::Next() {
+  if (!HasNext()) {
+    return "";
   }
-  return true;
+  ++pos_;
+  std::istream& stream = compressed_stream_ ? *compressed_stream_ : in_;
+  uint32_t rec_len;
+  stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
+  std::string buf;
+  buf.resize(rec_len);
+  stream.read(&buf[0], rec_len);
+  PADDLE_ENFORCE_EQ(rec_len, stream.gcount());
+  return buf;
 }
-
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/chunk.h b/paddle/fluid/recordio/chunk.h
index bf20ebd455c26ddeebeeea8db04cf7103b0c085f..cfb954a591679c2d2c4f42ecd99ca0c8bd1084cf 100644
--- a/paddle/fluid/recordio/chunk.h
+++ b/paddle/fluid/recordio/chunk.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -24,7 +25,7 @@ namespace recordio {
 
 // A Chunk contains the Header and optionally compressed records.
 class Chunk {
-public:
+ public:
   Chunk() : num_bytes_(0) {}
   void Add(const std::string& buf) {
     num_bytes_ += buf.size();
@@ -46,16 +47,27 @@ public:
 
   bool Empty() const { return records_.empty(); }
 
-private:
+ private:
   std::vector<std::string> records_;
   // sum of record lengths in bytes.
   size_t num_bytes_;
   DISABLE_COPY_AND_ASSIGN(Chunk);
 };
 
-size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out);
+class ChunkParser {
+ public:
+  explicit ChunkParser(std::istream& sin);
 
-void DeflateData(const char* in, size_t in_length, Compressor ct, char* out);
+  bool Init();
+  std::string Next();
+  bool HasNext() const;
+
+ private:
+  Header header_;
+  uint32_t pos_{0};
+  std::istream& in_;
+  std::unique_ptr<std::istream> compressed_stream_;
+};
 
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/chunk_test.cc b/paddle/fluid/recordio/chunk_test.cc
index 1f0e36a14d373ca96167199d4582bc8f17290ae8..5177475c016097d9a118aa79f855672354b3ef53 100644
--- a/paddle/fluid/recordio/chunk_test.cc
+++ b/paddle/fluid/recordio/chunk_test.cc
@@ -18,32 +18,30 @@
 
 #include "gtest/gtest.h"
 
-using namespace paddle::recordio;
-
 TEST(Chunk, SaveLoad) {
-  Chunk ch;
+  paddle::recordio::Chunk ch;
   ch.Add(std::string("12345", 6));
   ch.Add(std::string("123", 4));
   std::stringstream ss;
-  ch.Write(ss, Compressor::kNoCompress);
+  ch.Write(ss, paddle::recordio::Compressor::kNoCompress);
   ss.seekg(0);
   ch.Parse(ss);
   ASSERT_EQ(ch.NumBytes(), 10U);
 }
 
 TEST(Chunk, Compressor) {
-  Chunk ch;
+  paddle::recordio::Chunk ch;
   ch.Add(std::string("12345", 6));
   ch.Add(std::string("123", 4));
   ch.Add(std::string("123", 4));
   ch.Add(std::string("123", 4));
   std::stringstream ss;
-  ch.Write(ss, Compressor::kSnappy);
+  ch.Write(ss, paddle::recordio::Compressor::kSnappy);
   std::stringstream ss2;
-  ch.Write(ss2, Compressor::kNoCompress);
+  ch.Write(ss2, paddle::recordio::Compressor::kNoCompress);
   ASSERT_LE(ss.tellp(), ss2.tellp());  // Compress should contain less data;
 
   ch.Clear();
   ch.Parse(ss);
-  ASSERT_EQ(ch.NumBytes(), 18);
+  ASSERT_EQ(ch.NumBytes(), 18ul);
 }
diff --git a/paddle/fluid/recordio/header.cc b/paddle/fluid/recordio/header.cc
index e50de15b7c2b480357f5f6c7daa2b4a676749679..c4822329a43a79adc81f0b0cf145b22661ac6f50 100644
--- a/paddle/fluid/recordio/header.cc
+++ b/paddle/fluid/recordio/header.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/recordio/header.h"
+
+#include <string>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -29,8 +32,8 @@ Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs)
 
 bool Header::Parse(std::istream& is) {
   uint32_t magic;
-  size_t read_size =
-      is.readsome(reinterpret_cast<char*>(&magic), sizeof(uint32_t));
+  is.read(reinterpret_cast<char*>(&magic), sizeof(uint32_t));
+  size_t read_size = is.gcount();
   if (read_size < sizeof(uint32_t)) {
     return false;
   }
diff --git a/paddle/fluid/recordio/header.h b/paddle/fluid/recordio/header.h
index 9200ac090de4514bef3704ac502039222eef2284..245425990b93a90d7ac6b233cff54feb48308d48 100644
--- a/paddle/fluid/recordio/header.h
+++ b/paddle/fluid/recordio/header.h
@@ -37,7 +37,7 @@ enum class Compressor : uint32_t {
 
 // Header is the metadata of Chunk
 class Header {
-public:
+ public:
   Header();
   Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
 
@@ -51,7 +51,7 @@ public:
   Compressor CompressType() const { return compressor_; }
   uint32_t CompressSize() const { return compress_size_; }
 
-private:
+ private:
   uint32_t num_records_;
   uint32_t checksum_;
   Compressor compressor_;
diff --git a/paddle/fluid/recordio/header_test.cc b/paddle/fluid/recordio/header_test.cc
index a7d627c3eb4a7af1954795f77e5f24739edadae8..00f1887dc5e1188829ef4cd42754d161f041656d 100644
--- a/paddle/fluid/recordio/header_test.cc
+++ b/paddle/fluid/recordio/header_test.cc
@@ -18,14 +18,12 @@
 
 #include "gtest/gtest.h"
 
-using namespace paddle::recordio;
-
 TEST(Recordio, ChunkHead) {
-  Header hdr(0, 1, Compressor::kGzip, 3);
+  paddle::recordio::Header hdr(0, 1, paddle::recordio::Compressor::kGzip, 3);
   std::stringstream ss;
   hdr.Write(ss);
   ss.seekg(0, std::ios::beg);
-  Header hdr2;
+  paddle::recordio::Header hdr2;
   hdr2.Parse(ss);
   EXPECT_TRUE(hdr == hdr2);
 }
diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc
index d842f8fe5a4c9d1a2b564c738d97fffb02f3ccb5..06a13e6c5b6ea76456e231e3f7b1eb33492b16ea 100644
--- a/paddle/fluid/recordio/scanner.cc
+++ b/paddle/fluid/recordio/scanner.cc
@@ -13,39 +13,42 @@
 // limitations under the License.
 
 #include "paddle/fluid/recordio/scanner.h"
+
+#include <string>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace recordio {
+
 Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
-    : stream_(std::move(stream)) {
+    : stream_(std::move(stream)), parser_(*stream_) {
   Reset();
 }
 
-Scanner::Scanner(const std::string &filename) {
-  stream_.reset(new std::ifstream(filename));
+Scanner::Scanner(const std::string &filename)
+    : stream_(new std::ifstream(filename)), parser_(*stream_) {
   Reset();
 }
 
 void Scanner::Reset() {
+  stream_->clear();
   stream_->seekg(0, std::ios::beg);
-  ParseNextChunk();
+  parser_.Init();
 }
 
 std::string Scanner::Next() {
-  PADDLE_ENFORCE(!eof_, "StopIteration");
-  auto rec = cur_chunk_.Record(offset_++);
-  if (offset_ == cur_chunk_.NumRecords()) {
-    ParseNextChunk();
+  if (stream_->eof()) {
+    return "";
   }
-  return rec;
-}
 
-void Scanner::ParseNextChunk() {
-  eof_ = !cur_chunk_.Parse(*stream_);
-  offset_ = 0;
+  auto res = parser_.Next();
+  if (!parser_.HasNext() && HasNext()) {
+    parser_.Init();
+  }
+  return res;
 }
 
-bool Scanner::HasNext() const { return !eof_; }
+bool Scanner::HasNext() const { return !stream_->eof(); }
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/scanner.h b/paddle/fluid/recordio/scanner.h
index f3f17b69f195ddd92f5a39ead9755a7b8e2dd329..0d885dd87a2f819ba1d9f76259196f6cfff0b2a0 100644
--- a/paddle/fluid/recordio/scanner.h
+++ b/paddle/fluid/recordio/scanner.h
@@ -16,12 +16,15 @@
 
 #include <fstream>
 #include <memory>
+#include <string>
+
 #include "paddle/fluid/recordio/chunk.h"
+
 namespace paddle {
 namespace recordio {
 
 class Scanner {
-public:
+ public:
   explicit Scanner(std::unique_ptr<std::istream>&& stream);
 
   explicit Scanner(const std::string& filename);
@@ -32,13 +35,9 @@ public:
 
   bool HasNext() const;
 
-private:
+ private:
   std::unique_ptr<std::istream> stream_;
-  Chunk cur_chunk_;
-  size_t offset_;
-  bool eof_;
-
-  void ParseNextChunk();
+  ChunkParser parser_;
 };
 }  // namespace recordio
 }  // namespace paddle
diff --git a/paddle/fluid/recordio/writer.cc b/paddle/fluid/recordio/writer.cc
index 196d66edff8cc6000afcd74fb945c05dcab7106a..8046f4ff7896c897ebe1de2e2bb231cad5a0e410 100644
--- a/paddle/fluid/recordio/writer.cc
+++ b/paddle/fluid/recordio/writer.cc
@@ -12,9 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #include "paddle/fluid/recordio/writer.h"
+
+#include <string>
+
 #include "paddle/fluid/platform/enforce.h"
+
 namespace paddle {
 namespace recordio {
+
 void Writer::Write(const std::string& record) {
   cur_chunk_.Add(record);
   if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) {
diff --git a/paddle/fluid/recordio/writer.h b/paddle/fluid/recordio/writer.h
index 0c478d507547b10b8ebaaf5e512557a5c8c13e65..ac7e50ee90e6e8671d68e0d8065e0cf06c819ad0 100644
--- a/paddle/fluid/recordio/writer.h
+++ b/paddle/fluid/recordio/writer.h
@@ -11,16 +11,17 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
 #pragma once
+
+#include <string>
+
 #include "paddle/fluid/recordio/chunk.h"
 namespace paddle {
 namespace recordio {
 
 class Writer {
-public:
-  Writer(std::ostream* sout,
-         Compressor compressor,
+ public:
+  Writer(std::ostream* sout, Compressor compressor,
          size_t max_num_records_in_chunk = 1000)
       : stream_(*sout),
         max_num_records_in_chunk_(max_num_records_in_chunk),
@@ -32,7 +33,7 @@ public:
 
   ~Writer();
 
-private:
+ private:
   std::ostream& stream_;
   size_t max_num_records_in_chunk_;
   Chunk cur_chunk_;
diff --git a/paddle/fluid/recordio/writer_scanner_test.cc b/paddle/fluid/recordio/writer_scanner_test.cc
index 7e764f0d9439709ad101af2b8864dc0158bd359b..6583df21a20e9e034adc14b1d3eeb136899d659e 100644
--- a/paddle/fluid/recordio/writer_scanner_test.cc
+++ b/paddle/fluid/recordio/writer_scanner_test.cc
@@ -12,9 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "gtest/gtest.h"
-
 #include <sstream>
+#include <string>
+
+#include "gtest/gtest.h"
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
 
@@ -66,4 +67,4 @@ TEST(WriterScanner, TinyChunk) {
     ASSERT_EQ(scanner.Next(), "DEFG");
     ASSERT_FALSE(scanner.HasNext());
   }
-}
\ No newline at end of file
+}
diff --git a/paddle/fluid/string/.clang-format b/paddle/fluid/string/.clang-format
deleted file mode 120000
index 7d28cb3924707d39dafe20f4664fb17b5538996c..0000000000000000000000000000000000000000
--- a/paddle/fluid/string/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../framework/.clang-format
\ No newline at end of file
diff --git a/paddle/fluid/string/piece.cc b/paddle/fluid/string/piece.cc
index 454f5d8d38c5f02598cddaab555334a1e8a398da..8e8cfb0e91389490895835ed09ef36adf756d3ca 100644
--- a/paddle/fluid/string/piece.cc
+++ b/paddle/fluid/string/piece.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "piece.h"
+#include "paddle/fluid/string/piece.h"
 
 #include <string.h>
 
diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h
index 693cf9d6dfeea0735801e64fe74b9770c258c553..062095a1c3e977c0bcc89346ead765acb023bcf7 100644
--- a/paddle/fluid/string/printf.h
+++ b/paddle/fluid/string/printf.h
@@ -71,6 +71,8 @@
 
 #include <iostream>
 #include <sstream>
+#include <string>
+
 #include "tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
 
 namespace paddle {
diff --git a/paddle/fluid/string/printf_test.cc b/paddle/fluid/string/printf_test.cc
index b6a60c8d6b7f15f8e5572cf5bb1e7f04ee1c1598..678029f93534ab374bd29083f8991d632ccdd5a1 100644
--- a/paddle/fluid/string/printf_test.cc
+++ b/paddle/fluid/string/printf_test.cc
@@ -11,7 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "printf.h"
+
+#include "paddle/fluid/string/printf.h"
 
 #include <string>
 
@@ -21,7 +22,7 @@ TEST(StringPrintf, StringPrintf) {
   std::string weekday = "Wednesday";
   const char* month = "July";
   size_t day = 27;
-  long hour = 14;
+  int hour = 14;
   int min = 44;
   EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
             paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
diff --git a/paddle/fluid/string/to_string_test.cc b/paddle/fluid/string/to_string_test.cc
index 8fc293af0e473994ac13f6615d3f6195c8c5f04c..1d9c0e5e0c2b6e7f44c1622d2828b21b0a4380ee 100644
--- a/paddle/fluid/string/to_string_test.cc
+++ b/paddle/fluid/string/to_string_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "to_string.h"
+#include "paddle/fluid/string/to_string.h"
 #include <gtest/gtest.h>
 
 constexpr char kOutputString[] = "User Defined Output";
@@ -26,14 +26,13 @@ std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
 }
 
 TEST(to_string, normal) {
-  using namespace paddle::string;
+  using paddle::string::to_string;
   ASSERT_EQ("10", to_string(10));
   ASSERT_EQ("abc", to_string("abc"));
   ASSERT_EQ("1.2", to_string(1.2));
 }
 
 TEST(to_string, user_defined) {
-  using namespace paddle::string;
   UserDefinedClass instance;
-  ASSERT_EQ(kOutputString, to_string(instance));
+  ASSERT_EQ(kOutputString, paddle::string::to_string(instance));
 }
diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..78d6e5ff554b9cd9facae85be166a697e0b75337
--- /dev/null
+++ b/paddle/fluid/train/demo/CMakeLists.txt
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 3.0)
+
+project(cpp_train_demo CXX C)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir")
+endif()
+
+option(WITH_MKLDNN     "Compile PaddlePaddle with MKLDNN"                                   OFF)
+option(WITH_MKL        "Compile PaddlePaddle with MKL support, default use openblas."       OFF)
+
+include_directories("${PADDLE_LIB}")
+include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
+include_directories("${PADDLE_LIB}/third_party/install/glog/include")
+include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
+include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
+include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
+
+include_directories("${PADDLE_LIB}/third_party/boost")
+include_directories("${PADDLE_LIB}/third_party/eigen3")
+
+link_directories("${PADDLE_LIB}/third_party/install/snappy/lib")
+link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib")
+link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib")
+link_directories("${PADDLE_LIB}/third_party/install/glog/lib")
+link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
+link_directories("${PADDLE_LIB}/third_party/install/zlib/lib")
+
+add_executable(demo_trainer demo_trainer.cc)
+
+if(WITH_MKLDNN)
+  include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include")
+  set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0)
+endif()
+
+if(WITH_MKL)
+  include_directories("${PADDLE_LIB}/third_party/install/mklml/include")
+  set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so)
+else()
+  if(APPLE)
+    set(MATH_LIB cblas)
+  else(APPLE)
+    set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a)
+  endif(APPLE)
+endif()
+
+if(APPLE)
+  set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
+else(APPLE)
+  set(ARCHIVE_START "-Wl,--whole-archive")
+  set(ARCHIVE_END "-Wl,--no-whole-archive")
+  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+endif(APPLE)
+
+target_link_libraries(demo_trainer
+        ${MACOS_LD_FLAGS}
+        ${ARCHIVE_START}
+        ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a
+        ${ARCHIVE_END}
+        ${MATH_LIB}
+        ${MKLDNN_LIB}
+        glog gflags protobuf snappystream snappy z
+        ${EXTERNAL_LIB})
diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..41b01d33828f750f67bba5f82cb7ed6fe4d4ea0a
--- /dev/null
+++ b/paddle/fluid/train/demo/README.md
@@ -0,0 +1,66 @@
+
+### step 1. build paddle lib
+
+```
+
+# WITH_MKL=ON|OFF
+# WITH_MKLDNN=ON|OFF
+
+PADDLE_LIB=/paddle/lib/dir
+cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
+         -DCMAKE_BUILD_TYPE=Release \
+         -DWITH_FLUID_ONLY=ON \
+         -DWITH_GPU=OFF \
+         -DWITH_STYLE_CHECK=OFF \
+         -DWITH_MKL=OFF \
+         -DWITH_MKLDNN=OFF
+make -j8
+make -j8 inference_lib_dist
+```
+
+### step 2. generate program desc
+```
+# please install paddle before run this scripe
+pip install --upgrade paddlepaddle-*.whl
+python demo_network.py
+```
+
+This will generate two program desc files:
+  - startup_program: used to init all parameters
+  - main_program: main logic of the network
+
+### step 3. build demo_trainer and run it.
+
+
+```
+# Make a build dir at the same dir of this README.md document.
+# The demo dir can be put anywhere.
+mkdir build
+cd build
+
+# WITH_MKL=ON|OFF
+# WITH_MKLDNN=ON|OFF
+PADDLE_LIB=/paddle/lib/dir
+
+# PADDLE_LIB is the same with FLUID_INSTALL_DIR when building the lib
+cmake .. -DPADDLE_LIB=$PADDLE_LIB \
+         -DWITH_MKLDNN=OFF \
+         -DWITH_MKL=OFF
+make
+
+# copy startup_program and main_program to this dir
+cp ../startup_program .
+cp ../main_program .
+
+# run demo cpp trainer
+./demo_trainer
+
+```
+
+The output will be:
+```
+step: 0 loss: 1069.02
+step: 1 loss: 1069.02
+step: 2 loss: 1069.02
+....
+```
diff --git a/paddle/fluid/train/demo/demo_network.py b/paddle/fluid/train/demo/demo_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..41e98c6a24a750a9300b5c2a6d370303cc0e59c5
--- /dev/null
+++ b/paddle/fluid/train/demo/demo_network.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+
+
+def train_network(with_optimize):
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    if with_optimize:
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.00001)
+        sgd_optimizer.minimize(avg_cost)
+    else:
+        fluid.backward.append_backward(avg_cost)
+
+
+def save_program_desc(network_func):
+    startup_program = framework.Program()
+    train_program = framework.Program()
+
+    with framework.program_guard(train_program, startup_program):
+        network_func(with_optimize=False)
+
+    with open("startup_program", "w") as f:
+        f.write(startup_program.desc.serialize_to_string())
+    with open("main_program", "w") as f:
+        f.write(train_program.desc.serialize_to_string())
+
+
+save_program_desc(train_network)
diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..813d8386868558bd62a9d5670d540ddeddb2b77d
--- /dev/null
+++ b/paddle/fluid/train/demo/demo_trainer.cc
@@ -0,0 +1,103 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fstream>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/init.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace train {
+
+void ReadBinaryFile(const std::string& filename, std::string* contents) {
+  std::ifstream fin(filename, std::ios::in | std::ios::binary);
+  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
+  fin.seekg(0, std::ios::end);
+  contents->clear();
+  contents->resize(fin.tellg());
+  fin.seekg(0, std::ios::beg);
+  fin.read(&(contents->at(0)), contents->size());
+  fin.close();
+}
+
+std::unique_ptr<paddle::framework::ProgramDesc> Load(
+    paddle::framework::Executor* executor, const std::string& model_filename) {
+  VLOG(3) << "loading model from " << model_filename;
+  std::string program_desc_str;
+  ReadBinaryFile(model_filename, &program_desc_str);
+
+  std::unique_ptr<paddle::framework::ProgramDesc> main_program(
+      new paddle::framework::ProgramDesc(program_desc_str));
+  return main_program;
+}
+
+}  // namespace train
+}  // namespace paddle
+
+int main() {
+  paddle::framework::InitDevices(false);
+
+  const auto cpu_place = paddle::platform::CPUPlace();
+
+  paddle::framework::Executor executor(cpu_place);
+  paddle::framework::Scope scope;
+  auto startup_program = paddle::train::Load(&executor, "startup_program");
+  auto train_program = paddle::train::Load(&executor, "main_program");
+
+  std::string loss_name = "";
+  for (auto op_desc : train_program->Block(0).AllOps()) {
+    if (op_desc->Type() == "mean") {
+      loss_name = op_desc->Output("Out")[0];
+      break;
+    }
+  }
+
+  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
+
+  // init all parameters
+  executor.Run(*startup_program.get(), &scope, 0);
+
+  // prepare data
+  auto x_var = scope.Var("x");
+  auto x_tensor = x_var->GetMutable<paddle::framework::LoDTensor>();
+  x_tensor->Resize({2, 13});
+
+  auto x_data = x_tensor->mutable_data<float>(cpu_place);
+  for (int i = 0; i < 2 * 13; ++i) {
+    x_data[i] = static_cast<float>(i);
+  }
+
+  auto y_var = scope.Var("y");
+  auto y_tensor = y_var->GetMutable<paddle::framework::LoDTensor>();
+  y_tensor->Resize({2, 1});
+  auto y_data = y_tensor->mutable_data<float>(cpu_place);
+  for (int i = 0; i < 2 * 1; ++i) {
+    y_data[i] = static_cast<float>(i);
+  }
+
+  auto loss_var = scope.Var(loss_name);
+
+  for (int i = 0; i < 10; ++i) {
+    executor.Run(*train_program.get(), &scope, 0, false, true);
+    std::cout << "step: " << i << " loss: "
+              << loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
+              << std::endl;
+  }
+  return 0;
+}
diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
deleted file mode 100644
index aa53853e08716ff0dd8dce7c73766d9543bed2b9..0000000000000000000000000000000000000000
--- a/paddle/function/BlockExpandOp.cpp
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include "Im2Col.h"
-
-namespace paddle {
-
-/*
- * \brief Converts the image data of four dimensions(NCHW) into
- *        a sequence data of three dimensions(NST) in the forward calculation,
- *        which is reversed in the backward calculation.
- *        Where N is batch size, S is the length of the sequence after each
- *        image is expanded, T is the size of each time step in the sequence.
- *
- * Arguments in forward function:
- * \param inputs[0]  Image data of NCHW format.
- * \param outputs[0] Sequence data of NST format.
- *
- * Arguments in backward function:
- * \param inputs[0]  Sequence data of NST format.
- * \param outputs[0] Image data of NCHW format.
- */
-class BlockExpandFunction : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    strides_ = config.get<std::vector<size_t>>("strides");
-    paddings_ = config.get<std::vector<size_t>>("paddings");
-    blocks_ = config.get<std::vector<size_t>>("blocks");
-
-    // number of inputs and outputs
-    numInputs_ = 1;
-    numOutputs_ = 1;
-  }
-
-  void checkShape(const TensorShape& image, const TensorShape& sequence) const {
-    // image shape should be 4-dimensional.
-    CHECK_EQ(image.ndims(), (size_t)4);
-    // sequence shape should be 3-dimensional.
-    CHECK_EQ(sequence.ndims(), (size_t)3);
-    // The batchSize of the image needs to be equal to
-    // the batchSize of the sequence.
-    CHECK_EQ(image[0], sequence[0]);
-  }
-
-  // Calculate the shape of colData based on the shape of the image
-  // and the shape of the sequence.
-  TensorShape getColShape(const TensorShape& image,
-                          const TensorShape& sequence) const {
-    size_t inputChannels = image[1];
-    size_t inputHeight = image[2];
-    size_t inputWidth = image[3];
-    size_t seqLength = sequence[1];
-    size_t stepSize = sequence[2];
-    size_t outputHeight =
-        1 +
-        (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH();
-    size_t outputWidth =
-        1 +
-        (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW();
-    CHECK_EQ(seqLength, outputHeight * outputWidth);
-    CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
-
-    // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
-    return TensorShape({outputHeight,
-                        outputWidth,
-                        inputChannels,
-                        (size_t)blockH(),
-                        (size_t)blockW()});
-  }
-
-protected:
-  std::vector<size_t> strides_;
-  std::vector<size_t> paddings_;
-  std::vector<size_t> blocks_;
-
-  inline int strideH() const { return strides_[0]; }
-
-  inline int strideW() const { return strides_[1]; }
-
-  inline int paddingH() const { return paddings_[0]; }
-
-  inline int paddingW() const { return paddings_[1]; }
-
-  inline int blockH() const { return blocks_[0]; }
-
-  inline int blockW() const { return blocks_[1]; }
-};
-
-template <DeviceType Device>
-class BlockExpandForward : public BlockExpandFunction {
-public:
-  void init(const FuncConfig& config) override {
-    BlockExpandFunction::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& image = inputs[0].shape();
-    const TensorShape& sequence = outputs[0].shape();
-    checkShape(image, sequence);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    const TensorShape& image = inputs[0].shape();
-    const TensorShape& sequence = outputs[0].shape();
-
-    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
-    TensorShape colShape = getColShape(image, sequence);
-    size_t batchSize = image[0];
-
-    real* imageData = inputs[0].data<real>();
-    real* seqData = outputs[0].data<real>();
-    Im2ColFunctor<kOCF, Device, real> im2col;
-    for (size_t i = 0; i < batchSize; i++) {
-      // The result of im2col is [outputHeight, outputWidth,
-      // inputChannels, filterHeight, filterWidth], and it is easy to
-      // reshape into [seqLength, stepSize], where seqLength is equal
-      // output_height * output_width, stepSize is equal
-      // input_channels * filter_height * filter_width
-      im2col(imageData,
-             imShape,
-             seqData,
-             colShape,
-             strideH(),
-             strideW(),
-             paddingH(),
-             paddingW());
-      imageData += imShape.getElements();
-      seqData += colShape.getElements();
-    }
-  }
-};
-
-template <DeviceType Device>
-class BlockExpandBackward : public BlockExpandFunction {
-public:
-  void init(const FuncConfig& config) override {
-    BlockExpandFunction::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& image = outputs[0].shape();
-    const TensorShape& sequence = inputs[0].shape();
-    checkShape(image, sequence);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // Since the implementation of Col2ImFunctor is ADD_TO,
-    // this function only supports ADD_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    const TensorShape& image = outputs[0].shape();
-    const TensorShape& sequence = inputs[0].shape();
-
-    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
-    TensorShape colShape = getColShape(image, sequence);
-    size_t batchSize = image[0];
-
-    real* imageData = outputs[0].data<real>();
-    real* seqData = inputs[0].data<real>();
-    Col2ImFunctor<kOCF, Device, real> col2im;
-    for (size_t i = 0; i < batchSize; i++) {
-      col2im(imageData,
-             imShape,
-             seqData,
-             colShape,
-             strideH(),
-             strideW(),
-             paddingH(),
-             paddingW());
-      imageData += imShape.getElements();
-      seqData += colShape.getElements();
-    }
-  }
-};
-
-REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
-REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
-REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
deleted file mode 100644
index 2dc931c5d7e727679d435470544e60f9b5ce2bde..0000000000000000000000000000000000000000
--- a/paddle/function/BufferArg.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-
-#include "BufferArg.h"
-#include "paddle/math/SparseMatrix.h"
-
-namespace paddle {
-
-const SequenceArg& BufferArg::sequence() const {
-  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
-  return dynamic_cast<const SequenceArg&>(*this);
-}
-
-const SparseMatrixArg& BufferArg::sparse() const {
-  CHECK_EQ(bufferType_, TENSOR_SPARSE);
-  return dynamic_cast<const SparseMatrixArg&>(*this);
-}
-
-SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
-    : BufferArg(sparse, argType),
-      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      nnz_(sparse.getElementCnt()),
-      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
-      type_(static_cast<SparseDataType>(sparse.getValueType())) {
-  bufferType_ = TENSOR_SPARSE;
-}
-
-SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
-    : BufferArg(sparse, argType),
-      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
-      nnz_(sparse.getElementCnt()),
-      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
-      type_(static_cast<SparseDataType>(sparse.getValueType())) {
-  bufferType_ = TENSOR_SPARSE;
-}
-
-}  // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
deleted file mode 100644
index 89ee09837db69d79bbd678312f02f6dc87e8067c..0000000000000000000000000000000000000000
--- a/paddle/function/BufferArg.h
+++ /dev/null
@@ -1,364 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-
-#include "TensorShape.h"
-#include "TensorType.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-enum BufferType {
-  TENSOR_UNKNOWN = 0,
-  TENSOR_NORMAL = 1,
-  TENSOR_SEQUENCE_ID = 2,
-  TENSOR_SEQUENCE_DATA = 3,
-  TENSOR_SPARSE = 4
-};
-
-class BufferArg;
-class SequenceArg;
-class SparseMatrixArg;
-
-/**
- * \brief BufferArg used as the argument type of Function.
- *
- * The arguments of the Paddle Function have four Buffer types.
- * 1. BufferArg for a dense Buffer of any dimension.
- * 2. SequenceIdArg for a Buffer of sequence start positions.
- * 3. SequenceArg for a Buffer of sequence data.
- * 4. SparseMatrixArg for a Buffer of sparse matrix.
- *
- * Buffer shape
- * For most buffers, the first dimension `shape()[0]` represents
- * the size of the mini-batch.
- *
- * Buffer argType
- * There is an ArgType property for the BufferArg used as Function Output.
- * Whether the result of the Function calculation is assigned to the
- * output Buffer or added to the output Buffer is determined by the
- * argType_ property of the output BufferArg.
- */
-
-// ArgType is only used by output BufferArg.
-// For input argument, argType_ is ignored.
-// For output argument, need to set the argType_ of the BufferArg.
-enum ArgType {
-  UNSPECIFIED = 0,
-  ASSIGN_TO = 1,
-  ADD_TO = 2,
-};
-class BufferArg {
-public:
-  void setArgType(ArgType argType) { argType_ = argType; }
-
-  ArgType getArgType() const { return argType_; }
-
-public:
-  BufferArg(ValueType valueType,
-            const TensorShape& shape,
-            ArgType argType = UNSPECIFIED)
-      : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-  }
-
-  BufferArg(void* buf,
-            ValueType valueType,
-            const TensorShape& shape,
-            ArgType argType = UNSPECIFIED)
-      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-  }
-
-  BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {
-    bufferType_ = TENSOR_NORMAL;
-  }
-
-  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
-        valueType_(DataType<real>::value),
-        shape_(2),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    shape_.setDim(0, matrix.getHeight());
-    shape_.setDim(1, matrix.getWidth());
-  }
-
-  BufferArg(const Matrix& matrix,
-            const TensorShape& shape,
-            ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
-        valueType_(DataType<real>::value),
-        shape_(shape),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
-  }
-
-  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
-        valueType_(DataType<real>::value),
-        shape_(1),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    shape_.setDim(0, vector.getSize());
-  }
-
-  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
-      : buf_(
-            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
-        valueType_(VALUE_TYPE_INT32),
-        shape_(1),
-        argType_(argType) {
-    bufferType_ = TENSOR_NORMAL;
-    shape_.setDim(0, vector.getSize());
-  }
-
-  template <DeviceType DType>
-  typename Tensor<real, DType>::Matrix matrix() const {
-    CHECK(buf_);
-    CHECK(valueType_ == DataType<real>::value);
-    // CHECK(deviceType_ == DType);
-    CHECK_EQ((size_t)2, shape_.ndims());
-    return typename Tensor<real, DType>::Matrix(
-        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
-  }
-
-  template <typename VType, DeviceType DType>
-  typename Tensor<VType, DType>::Vector vector() const {
-    CHECK(buf_);
-    CHECK(valueType_ == DataType<VType>::value);
-    // CHECK(deviceType_ == DType);
-    CHECK_EQ((size_t)1, shape_.ndims());
-    return typename Tensor<VType, DType>::Vector(
-        shape_[0], reinterpret_cast<VType*>(buf_));
-  }
-
-  virtual ~BufferArg() {}
-
-  template <typename T>
-  T* data() const {
-    return reinterpret_cast<T*>(buf_);
-  }
-
-  void* data() const { return buf_; }
-  ValueType valueType() const { return valueType_; }
-  BufferType bufferType() const { return bufferType_; }
-  const TensorShape& shape() const { return shape_; }
-  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
-  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
-  virtual size_t numElements() const { return shape_.getElements(); }
-
-  const SequenceArg& sequence() const;
-  const SparseMatrixArg& sparse() const;
-
-protected:
-  void* buf_;
-  ValueType valueType_;
-  TensorShape shape_;
-  BufferType bufferType_{TENSOR_UNKNOWN};
-  ArgType argType_{UNSPECIFIED};
-  // TODO(tianbing), add deviceType_
-  // leading dimensions. The size is dims_.size()
-  // Dims lds_;
-};
-
-// sequence start positions in a mini-batch of sequences
-// shape_.ndims() == 1
-// valueType_ = int32
-// if a < b then value_.buf_[a] < value_.buf_[b]
-class SequenceIdArg : public BufferArg {
-public:
-  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
-      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
-    bufferType_ = TENSOR_SEQUENCE_ID;
-    CHECK_EQ(shape_.ndims(), 1UL);
-    CHECK_GE(shape_[0], 1UL);
-    numSeqs_ = shape_[0] - 1;
-  }
-
-  SequenceIdArg(void* buf,
-                const TensorShape& shape,
-                ArgType argType = UNSPECIFIED)
-      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
-    bufferType_ = TENSOR_SEQUENCE_ID;
-    CHECK_EQ(shape_.ndims(), 1UL);
-    numSeqs_ = shape_[0] - 1;
-  }
-
-  SequenceIdArg(const IVector& vector) : BufferArg(vector) {
-    bufferType_ = TENSOR_SEQUENCE_ID;
-    numSeqs_ = shape_[0] - 1;
-  }
-
-  ~SequenceIdArg() {}
-
-  size_t numSeqs() const { return numSeqs_; }
-
-private:
-  size_t numSeqs_;
-};
-
-// sequences data
-// For mini-batch calculate,
-// one batch can contain more than one sequence of data.
-// SequenceArg can be used to represent sequences that contain multiple
-// unequal lengths.
-class SequenceArg : public BufferArg {
-public:
-  SequenceArg(ValueType valueType,
-              const TensorShape& shape,
-              ArgType argType = UNSPECIFIED)
-      : BufferArg(valueType, shape, argType),
-        startPositions_(TensorShape({shape[0]})) {
-    bufferType_ = TENSOR_SEQUENCE_DATA;
-  }
-
-  SequenceArg(void* buf,
-              ValueType valueType,
-              const TensorShape& shape,
-              const SequenceIdArg& startPositions,
-              ArgType argType = UNSPECIFIED)
-      : BufferArg(buf, valueType, shape, argType),
-        startPositions_(startPositions) {
-    bufferType_ = TENSOR_SEQUENCE_DATA;
-  }
-
-  SequenceArg(const Matrix& matrix,
-              const IVector& vector,
-              ArgType argType = UNSPECIFIED)
-      : BufferArg(matrix, argType), startPositions_(vector) {
-    bufferType_ = TENSOR_SEQUENCE_DATA;
-  }
-
-  ~SequenceArg() {}
-
-  void* getIdBuf() const { return startPositions_.data(); }
-  size_t numSeqs() const { return startPositions_.numSeqs(); }
-  SequenceIdArg& getSequenceId() { return startPositions_; }
-  const SequenceIdArg& getSequenceId() const { return startPositions_; }
-
-private:
-  SequenceIdArg startPositions_;
-};
-
-// sparse matrix
-// valueType_ == float or double
-// shape_.ndims() == 2
-class SparseMatrixArg : public BufferArg {
-public:
-  SparseMatrixArg(void* buf,
-                  ValueType valueType,
-                  const TensorShape& shape,
-                  const BufferArg& row,
-                  const BufferArg& col,
-                  size_t nnz,
-                  SparseFormat format,
-                  SparseValueType type,
-                  ArgType argType = UNSPECIFIED)
-      : BufferArg(buf, valueType, shape, argType),
-        row_(row),
-        col_(col),
-        nnz_(nnz),
-        format_(static_cast<SparseDataFormat>(format)),
-        type_(static_cast<SparseDataType>(type)) {
-    bufferType_ = TENSOR_SPARSE;
-    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
-    CHECK_EQ(shape_.ndims(), 2UL);
-    CHECK_EQ(row_.shape().ndims(), 1UL);
-    CHECK_EQ(col_.shape().ndims(), 1UL);
-    if (format_ == T_SPARSE_CSR) {
-      CHECK_EQ(nnz, col.shape()[0]);
-    } else if (format_ == T_SPARSE_CSC) {
-      CHECK_EQ(nnz, row.shape()[0]);
-    }
-  }
-
-  SparseMatrixArg(ValueType valueType,
-                  const TensorShape& shape,
-                  size_t nnz,
-                  SparseFormat format,
-                  SparseValueType type,
-                  ArgType argType = UNSPECIFIED)
-      : BufferArg(valueType, shape, argType),
-        row_(BufferArg(nullptr, VALUE_TYPE_INT32)),
-        col_(BufferArg(nullptr, VALUE_TYPE_INT32)),
-        nnz_(nnz),
-        format_(static_cast<SparseDataFormat>(format)),
-        type_(static_cast<SparseDataType>(type)) {
-    bufferType_ = TENSOR_SPARSE;
-    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
-    CHECK_EQ(shape_.ndims(), 2UL);
-
-    /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr
-    row_ = (format_ == T_SPARSE_CSR
-                ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1})
-                : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}));
-    /// len of col_ :  width + 1 (CSC) or nnz (CSR), buf_ == nullptr
-    col_ = (format_ == T_SPARSE_CSR
-                ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})
-                : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1}));
-  }
-
-  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
-
-  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
-
-  template <DeviceType DType>
-  typename Tensor<real, DType>::SparseMatrix SparseMatrix() const {
-    CHECK(buf_);
-    CHECK(valueType_ == DataType<real>::value);
-    // CHECK(deviceType_ == DType);
-    CHECK_EQ(2UL, shape_.ndims());
-    return typename Tensor<real, DType>::SparseMatrix(
-        reinterpret_cast<real*>(buf_),
-        reinterpret_cast<int*>(row_.data()),
-        reinterpret_cast<int*>(col_.data()),
-        shape_[0],
-        shape_[1],
-        nnz_,
-        static_cast<SparseValueType>(type_),
-        static_cast<SparseFormat>(format_),
-        false);
-  }
-
-  ~SparseMatrixArg() {}
-
-  void* getRowBuf() const { return row_.data(); }
-
-  void* getColBuf() const { return col_.data(); }
-
-  size_t nnz() const { return nnz_; }
-
-  size_t numElements() const override { return nnz_; }
-
-  SparseDataFormat dataFormat() const { return format_; }
-
-  SparseDataType dataType() const { return type_; }
-
-private:
-  BufferArg row_;
-  BufferArg col_;
-  size_t nnz_;
-  SparseDataFormat format_;
-  SparseDataType type_;
-};
-
-}  // namespace paddle
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
deleted file mode 100644
index 1a6e0110afb64c8b4f164d71e31e5f9bfcdee4a8..0000000000000000000000000000000000000000
--- a/paddle/function/BufferArgTest.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BufferArg.h"
-#include <gtest/gtest.h>
-#include "paddle/math/MemoryHandle.h"
-
-namespace paddle {
-
-TEST(BufferTest, BufferArg) {
-  TensorShape shape({8, 10});
-  CpuMemoryHandle memory(shape.getElements() *
-                         sizeOfValuType(VALUE_TYPE_FLOAT));
-  BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape);
-  EXPECT_EQ(buffer.data(), memory.getBuf());
-}
-
-TEST(BufferTest, SequenceIdArg) {
-  TensorShape shape({10});
-  CpuMemoryHandle memory(shape.getElements() *
-                         sizeOfValuType(VALUE_TYPE_INT32));
-  SequenceIdArg buffer(memory.getBuf(), shape);
-  EXPECT_EQ(buffer.data(), memory.getBuf());
-  EXPECT_EQ(buffer.numSeqs(), 9U);
-}
-
-}  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
deleted file mode 100644
index 9b2779b42cad324253dadf27dbff20fd8e8c8e16..0000000000000000000000000000000000000000
--- a/paddle/function/CMakeLists.txt
+++ /dev/null
@@ -1,60 +0,0 @@
-file(GLOB h_files . *Op.h)
-file(GLOB cpp_files . *Op.cpp)
-
-list(APPEND h_files Function.h)
-list(APPEND cpp_files Function.cpp)
-list(APPEND cpp_files BufferArg.cpp)
-list(APPEND cpp_files GemmFunctor.cpp)
-if(USE_EIGEN_FOR_BLAS)
-  list(APPEND cpp_files EigenGemm.cpp)
-endif(USE_EIGEN_FOR_BLAS)
-
-if(WITH_GPU)
-    file(GLOB cu_files . *OpGpu.cu)
-    cuda_compile(cu_objs ${cu_files})
-endif()
-
-if(USE_NNPACK)
-  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
-  if(WITH_TESTING)
-    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
-  endif()
-endif()
-
-list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
-
-add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
-add_dependencies(paddle_function ${external_project_dependencies})
-add_dependencies(paddle_function paddle_proto)
-
-if(WITH_TESTING)
-if(WITH_GPU)
-    # TODO:
-    # file(GLOB test_files . *OpTest.cpp)
-    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    add_simple_unittest(CrossMapNormalOpTest)
-    add_simple_unittest(TensorShapeTest)
-    add_simple_unittest(TensorTypeTest)
-    add_simple_unittest(BufferArgTest)
-    add_simple_unittest(FunctionTest)
-    add_simple_unittest(ContextProjectionOpTest)
-    add_simple_unittest(PadOpTest)
-    add_simple_unittest(MulOpTest)
-    add_simple_unittest(CosSimOpTest)
-    add_simple_unittest(RowConvOpTest)
-    add_simple_unittest(BlockExpandOpTest)
-    add_simple_unittest(CropOpTest)
-    add_simple_unittest(SwitchOpTest)
-    add_simple_unittest(ScaleSubRegionOpTest)
-endif()
-
-add_simple_unittest(Im2ColTest)
-add_simple_unittest(GemmConvOpTest)
-add_simple_unittest(DepthwiseConvOpTest)
-endif()
-
-add_style_check_target(paddle_function ${h_files})
-add_style_check_target(paddle_function ${cpp_files})
-if(WITH_GPU)
-    add_style_check_target(paddle_function ${cu_files})
-endif()
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
deleted file mode 100644
index 904b0958e6f2c1b8fb8cf56f3cd7d07ad8e24f19..0000000000000000000000000000000000000000
--- a/paddle/function/ContextProjectionOp.cpp
+++ /dev/null
@@ -1,412 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ContextProjectionOp.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-/**
- * Context Projection Forward with CPU Matrix Device.
- *
- */
-template <>
-void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
-                                               const CpuMatrix& input_mat,
-                                               const CpuMatrix& weight_mat,
-                                               const CpuIVector& seq_vec,
-                                               size_t context_length,
-                                               int context_start,
-                                               size_t begin_pad) {
-  const int* starts = seq_vec.getData();
-  const size_t num_sequences = seq_vec.getSize() - 1;
-  for (size_t i = 0; i < num_sequences; ++i) {
-    for (size_t j = 0; j < context_length; ++j) {
-      int begin = starts[i] + context_start + j;
-      int end = starts[i + 1] + context_start + j;
-      int dst_begin = starts[i];
-      int dst_end = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t pad_size =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
-        if (weight_mat) {
-          MatrixPtr sub =
-              const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size);
-          mat->addAtOffset(*sub, j * input_mat.getWidth());
-        }
-        dst_begin = starts[i] + pad_size;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t pad_size =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
-        if (weight_mat) {
-          MatrixPtr sub =
-              const_cast<CpuMatrix&>(weight_mat)
-                  .subMatrix(begin_pad + context_start + j - pad_size,
-                             pad_size);
-          mat->addAtOffset(*sub, j * input_mat.getWidth());
-        }
-        dst_end = starts[i + 1] - pad_size;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      MatrixPtr src =
-          const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin);
-      MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
-      dst->addAtOffset(*src, j * input_mat.getWidth());
-    }
-  }
-}
-
-/**
- * Paddle Function for Context Projection Forward.
- * Calculate the output layer value sequence after context projection.
- *
- * What is Context Projection for a sequence?
- * For example, assumed input (x) has 4 words and the dimension of each word
- * representation is 2. If we use zero to pad instead of learned weight to pad,
- * and the context_lenth is 3, the output (y) is:
- *
- * @code
- *  x = [a1, a2;
- *       b1, b2;
- *       c1, c2;
- *       d1, d2]
- *  y = [0,  0,  a1, a2, b1, b2;
- *       a1, a2, b1, b2, c1, c2;
- *       b1, b2, c1, c2, d1, d2;
- *       c1, c2, d1, d2, 0,  0]
- * @endcode
- *
- * \param outputs[0].matrix   output layer value, n * (d * l)
- * \param outputs[0].vector   start position sequence, n * 1
- * \param inputs[0].matrix    input layer value, n * d
- * \param inputs[0].vector    start position sequence, n * 1
- * \param inputs[1].matrix    input layer weight, pad * d
- */
-template <DeviceType Device>
-class ContextProjectionForwardFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK(1UL == inputs.size() || 2UL == inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here";
-    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
-    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-
-    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
-    CHECK_EQ(out_seq.shape().ndims(), 2UL);
-    CHECK_EQ(val_seqs.shape().ndims(), 2UL);
-    /// dim of output = dim of input * context_length
-    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
-    /// input and output has the same batch_size
-    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
-    if (2UL == inputs.size()) {
-      CHECK_EQ(inputs[1].shape().ndims(), 2UL);
-      /// dim of input == dim of weight
-      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
-    }
-
-    CHECK_EQ(out_seq.getArgType(), ADD_TO);
-    auto out_mat = out_seq.matrix<Device>();
-    const auto in_mat = val_seqs.matrix<Device>();
-    const auto w_mat =
-        (2UL == inputs.size() && inputs[1].data())
-            ? inputs[1].matrix<Device>()
-            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
-
-    ContextProjectionForward<Device>(out_mat,
-                                     in_mat,
-                                     w_mat,
-                                     seq_vec,
-                                     context_length_,
-                                     context_start_,
-                                     begin_pad_);
-  }
-
-private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-};
-
-/**
- * Context Projection Backward with CPU Matrix Device.
- *
- */
-template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
-                                                CpuMatrix& in_grad_mat,
-                                                CpuMatrix& w_grad_mat,
-                                                const CpuIVector& seq_vec,
-                                                size_t context_length,
-                                                int context_start,
-                                                size_t begin_pad,
-                                                bool is_padding,
-                                                size_t total_pad) {
-  size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
-                                 : w_grad_mat ? w_grad_mat.getWidth() : 0;
-  const int* starts = seq_vec.getData();
-  size_t num_sequences = seq_vec.getSize() - 1;
-  for (size_t i = 0; i < num_sequences; ++i) {
-    for (size_t j = 0; j < context_length; ++j) {
-      int begin = starts[i] + context_start + j;
-      int end = starts[i + 1] + context_start + j;
-      int dst_begin = starts[i];
-      int dst_end = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t pad_size =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        if (is_padding && w_grad_mat) {
-          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
-                              .subMatrix(starts[i], pad_size);
-          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
-          sub->addAtOffset(*mat, j * input_dim);
-        }
-        dst_begin = starts[i] + pad_size;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t pad_size =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        if (is_padding && w_grad_mat) {
-          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
-                              .subMatrix(starts[i + 1] - pad_size, pad_size);
-          MatrixPtr sub = w_grad_mat.subMatrix(
-              begin_pad + context_start + j - pad_size, pad_size);
-          sub->addAtOffset(*mat, j * input_dim);
-        }
-        dst_end = starts[i + 1] - pad_size;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      if (!in_grad_mat) continue;
-      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
-      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
-                          .subMatrix(dst_begin, dst_end - dst_begin);
-      src->addAtOffset(*dst, j * input_dim);
-    }
-  }
-}
-
-/**
- * Context Projection Backward Function.
- * Update the weight gradient and input layer gradient with backprop
- *
- * \param inputs[0].matrix          output layer grad, n * (d * l)
- * \param inputs[0].vector          start position sequence, n * 1
- * \param outputs[0].matrix         input layer grad, n * d
- * \param outputs[0].vector         start position sequence, n * 1
- * \param outputs[1]                weight grad, pad * d
- */
-template <DeviceType Device>
-class ContextProjectionBackwardFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-    is_padding_ = config.get<bool>("is_padding");
-    total_pad_ = config.get<size_t>("total_pad");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK(1UL == outputs.size() || 2UL == outputs.size());
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here";
-    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-    CHECK(in_seq.data() && in_seq.getSequenceId().data());
-    CHECK_EQ(in_seq.shape().ndims(), 2UL);
-    CHECK_EQ(out_seq.shape().ndims(), 2UL);
-    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL);
-
-    /// input and output grad has the same batch_size
-    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
-    /// dim of output grad = dim of input grad * context_length
-    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
-    CHECK_EQ(out_seq.getArgType(), ADD_TO);
-
-    if (2UL == outputs.size()) {
-      CHECK_EQ(outputs[1].shape().ndims(), 2UL);
-      /// dim of input grad == dim of weight
-      CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
-      CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-    }
-
-    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    const auto out_grad_mat = in_seq.matrix<Device>();
-    auto in_grad_mat =
-        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                        : out_seq.matrix<Device>();
-    auto w_grad_mat =
-        (2UL == outputs.size() && outputs[1].data())
-            ? outputs[1].matrix<Device>()
-            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-
-    ContextProjectionBackward<Device>(out_grad_mat,
-                                      in_grad_mat,
-                                      w_grad_mat,
-                                      seq_vec,
-                                      context_length_,
-                                      context_start_,
-                                      begin_pad_,
-                                      is_padding_,
-                                      total_pad_);
-  }
-
-private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-  bool is_padding_;
-  size_t total_pad_;
-};
-
-/**
- * Context Projection Backward Data Function
- * Update input layer grad
- * input:  sequence of output layer grad
- * output: sequence of input layer grad
- *
- * \param outputs[0].matrix              input layer grad, n * d
- * \param outputs[0].vector              start position sequence, n * 1
- * \param inputs[0].matrix               output layer grad, n * (d * l)
- * \param inputs[0].vector               start positon sequence, n * 1
- */
-template <DeviceType Device>
-class ContextProjectionBackwardDataFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here";
-    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-
-    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
-    CHECK_EQ(out_seq.shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
-    /// output layer grad dim == input layer grad dim * context_length_
-    CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_);
-    /// input and output has the same batch_size
-    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    const auto out_grad_mat = in_seq.matrix<Device>();
-    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    auto in_grad_mat = out_seq.matrix<Device>();
-
-    ContextProjectionBackwardData<Device>(
-        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
-  }
-
-private:
-  size_t context_length_;
-  int context_start_;
-};
-
-/**
- * Context Projection Backward Weight Function
- * Update weight grad by backprop
- * input:  sequence of output layer grad
- * output: weight grad
- *
- * \param outputs[0]                   weight grad, pad * d
- * \param inputs[0].matrix             output layer grad, n * (d * l)
- * \param inputs[0].vecotr             start positon sequence, n * 1
- */
-template <DeviceType Device>
-class ContextProjectionBackwardWeightFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    context_length_ = config.get<size_t>("context_length");
-    context_start_ = config.get<int>("context_start");
-    begin_pad_ = config.get<size_t>("begin_pad");
-    total_pad_ = config.get<size_t>("total_pad");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
-    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
-    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.shape().ndims(), 2UL);
-    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
-    CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
-    /// output layer grad dim == weight dim * context_length_
-    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    const auto out_grad_mat = in_seq.matrix<Device>();
-    auto w_grad_mat = outputs[0].matrix<Device>();
-    ContextProjectionBackwardWeight<Device>(out_grad_mat,
-                                            w_grad_mat,
-                                            seq_vec,
-                                            context_length_,
-                                            context_start_,
-                                            total_pad_,
-                                            begin_pad_);
-  }
-
-private:
-  size_t context_length_;
-  int context_start_;
-  size_t begin_pad_;
-  size_t total_pad_;
-};
-
-REGISTER_TYPED_FUNC(ContextProjectionForward,
-                    CPU,
-                    ContextProjectionForwardFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackward,
-                    CPU,
-                    ContextProjectionBackwardFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(ContextProjectionForward,
-                    GPU,
-                    ContextProjectionForwardFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackward,
-                    GPU,
-                    ContextProjectionBackwardFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
-                    GPU,
-                    ContextProjectionBackwardDataFunc);
-REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
-                    GPU,
-                    ContextProjectionBackwardWeightFunc);
-#endif
-}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
deleted file mode 100644
index d805c3ae927321fc74946e202b98401b6b3cd0f7..0000000000000000000000000000000000000000
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-void testMatrixProjectionForward(int context_start,
-                                 size_t context_length,
-                                 bool is_padding,
-                                 size_t batch_size,
-                                 size_t input_dim) {
-  size_t pad = std::max(0, -context_start) +
-               std::max(0, (int)(context_start + context_length - 1));
-  if (pad == 0) is_padding = false;
-
-  CpuGpuFuncCompare test(
-      "ContextProjectionForward",
-      FuncConfig()
-          .set("context_length", context_length)
-          .set("context_start", context_start)
-          .set("begin_pad", (size_t)std::max(0, -context_start)));
-
-  // prepare input arguments
-  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
-  test.addInputs(
-      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}));
-  if (is_padding) {  // weight
-    test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}));
-  }
-  test.addOutputs(
-      SequenceArg(VALUE_TYPE_FLOAT,
-                  TensorShape{batch_size, input_dim * context_length}),
-      ADD_TO);
-
-  // run Function
-  test.run();
-}
-
-void testMatrixProjectionBackward(int context_start,
-                                  size_t context_length,
-                                  bool is_padding,
-                                  size_t batch_size,
-                                  size_t input_dim) {
-  size_t pad = std::max(0, -context_start) +
-               std::max(0, (int)(context_start + context_length - 1));
-  if (pad == 0) is_padding = false;
-
-  CpuGpuFuncCompare test(
-      "ContextProjectionBackward",
-      FuncConfig()
-          .set("context_length", context_length)
-          .set("context_start", context_start)
-          .set("begin_pad", (size_t)std::max(0, -context_start))
-          .set("is_padding", is_padding)
-          .set("total_pad", pad));
-
-  // prepare input arguments
-  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
-  test.addInputs(SequenceArg(
-      VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length}));
-  test.addOutputs(
-      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}),
-      ADD_TO);
-  if (is_padding) {  // weight
-    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}),
-                    ADD_TO);
-  }
-
-  // run Function
-  test.run();
-}
-
-TEST(ContextProjection, Projection) {
-  for (auto context_start : {-5, -3, -1, 0, 3}) {
-    for (auto context_length : {1, 2, 5, 7}) {
-      for (auto trainable_padding : {false, true}) {
-        for (auto batch_size : {1, 2, 5, 20, 100}) {
-          for (auto input_dim : {15, 32, 63, 128, 200}) {
-            VLOG(3) << " context_start=" << context_start
-                    << " context_length=" << context_length
-                    << " trainable_padding=" << trainable_padding
-                    << " batch_size=" << batch_size
-                    << " input_dim=" << input_dim;
-            testMatrixProjectionForward(context_start,
-                                        context_length,
-                                        trainable_padding,
-                                        batch_size,
-                                        input_dim);
-            testMatrixProjectionBackward(context_start,
-                                         context_length,
-                                         trainable_padding,
-                                         batch_size,
-                                         input_dim);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h
deleted file mode 100644
index 7d23d0079c8f62b2c8912dfcb9f191c622a60bc9..0000000000000000000000000000000000000000
--- a/paddle/function/ConvOp.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-
-namespace paddle {
-
-/*
- * \brief Based on the ConvFunctionBase class, the forward calculation,
- *        backward input calculation and backward filter calculation
- *        of convolution operations can be implemented.
- *
- * Arguments of forward and backward calculation:
- *   1. Forward calculation of convolution.
- *      inputs = {INPUT, FILTER}, outputs = {OUTPUT}
- *      The first and second input arguments are input image and filter data.
- *      The output argument is output image.
- *
- *   2. Backward input calculation of convolution.
- *      inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD}
- *      The first and second input arguments are output grad image
- *      and filter data.
- *      The output argument is input grad image.
- *
- *   3. Backward filter calculation of convolution.
- *      inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD}
- *      The first and second input arguments are output grad image
- *      and input image.
- *      The output argument is filter grad.
- *
- * Arguments format of input, filter and output:
- *   1. Input image, output image, input image gradient, output image gradient
- *      are all NCHW format. Where N is batch size, C is the number of channels,
- *      H and W is the height and width of image or image gradient.
- *
- *   2. The format of the filter data is MCHW, where M is the number of output
- *      image channels, C is the number of input image channels,
- *      H and W is height and width of filter.
- *
- *      If `groups` is greater than 1, the filter's data format should be GMCHW,
- *      where G is the `groups`, and G * M is the number of output image
- *      channels, G * C is the number of input image channels,
- *      H and W is height and width of filter.
- */
-class ConvFunctionBase : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    strides_ = config.get<std::vector<size_t>>("strides");
-    paddings_ = config.get<std::vector<size_t>>("paddings");
-    dilations_ = config.get<std::vector<size_t>>("dilations");
-    groups_ = config.get<size_t>("groups");
-
-    // number of inputs and outputs
-    numInputs_ = 2;
-    numOutputs_ = 1;
-  }
-
-  // input can be INPUT and INPUT_GRAD
-  // filter can be FILTER and FILTER_GRAD
-  // output can be OUTPUT and OUTPUT_GRAD
-  void checkShape(const TensorShape& input,
-                  const TensorShape& filter,
-                  const TensorShape& output) {
-    // inputs and outputs arguments should be 4-dimensional.
-    CHECK_EQ(input.ndims(), (size_t)4);
-    CHECK_EQ(output.ndims(), (size_t)4);
-    // The batchSize of the input needs to be equal to
-    // the batchSize of the output.
-    CHECK_EQ(input[0], output[0]);
-
-    if (filter.ndims() == (size_t)4) {
-      // If the filter's dimension is 4, groups convolution is not supported.
-      CHECK_EQ(groups_, (size_t)1);
-      // The input and output channel dimensions are the second and first
-      // dimensions of the filter shape.
-      CHECK_EQ(input[1], filter[1]);
-      CHECK_EQ(output[1], filter[0]);
-    } else {
-      // filter argument should be 5-dimensional.
-      CHECK_EQ(filter.ndims(), (size_t)5);
-      // The first dimension of the filter is the size of the group
-      CHECK_EQ(filter[0], groups_);
-      // The input and output channel dimensions are the third and second
-      // dimensions of the filter shape.
-      CHECK_EQ(input[1], filter[2] * groups_);
-      CHECK_EQ(output[1], filter[1] * groups_);
-    }
-  }
-
-protected:
-  size_t getFilterHeight(const TensorShape& filter) const {
-    return filter[filter.ndims() - 2];
-  }
-
-  size_t getFilterWidth(const TensorShape& filter) const {
-    return filter[filter.ndims() - 1];
-  }
-
-  // determine whether im2col needs to be performed
-  inline bool isNeedIm2col(const TensorShape& filter) const {
-    return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
-             strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
-             paddingW() == 0);
-  }
-
-  std::vector<size_t> strides_;
-  std::vector<size_t> paddings_;
-  std::vector<size_t> dilations_;
-
-  /// Group size, refer to grouped convolution in
-  /// Alex Krizhevsky's paper: when group=2, the first half of the
-  /// filters are only connected to the first half of the input channels,
-  /// and the second half only connected to the second half.
-  size_t groups_;
-
-  inline int strideH() const { return strides_[0]; }
-
-  inline int strideW() const { return strides_[1]; }
-
-  inline int paddingH() const { return paddings_[0]; }
-
-  inline int paddingW() const { return paddings_[1]; }
-
-  inline int dilationH() const { return dilations_[0]; }
-
-  inline int dilationW() const { return dilations_[1]; }
-
-  // A temporary memory in convolution calculation.
-  MemoryHandlePtr memory_;
-
-  template <DeviceType Device>
-  void resizeBuffer(size_t newSize) {
-    if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) {
-      if (Device == DEVICE_TYPE_CPU) {
-        memory_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
-      } else {
-        memory_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
-      }
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
deleted file mode 100644
index 81bccc1a9c7d614763a10e3838271b57eef2c603..0000000000000000000000000000000000000000
--- a/paddle/function/CosSimOp.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CosSimOp.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-/**
- * Cosine Similarity for CpuMatrix
- *
- * \param out_mat, output value, size: nSamples * 1.
- * \param in1_mat, input value 1, size: nSamples * dim.
- * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param scale, default 1.0
- *
- */
-template <>
-void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
-                                    const CpuMatrix& in1_mat,
-                                    const CpuMatrix& in2_mat,
-                                    real scale) {
-  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
-  size_t num_samples = out_mat.getHeight();
-  size_t dim = in1_mat.getWidth();
-  /// column vector [nSamples, 1]
-  real* out = out_mat.getData();
-  const real* x = in1_mat.getData();
-  const real* y = in2_mat.getData();
-
-  /// in2 might only have one row or full rows
-  CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples);
-  size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim;
-  for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
-    real square_sum_x = 0;
-    real square_sum_y = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      square_sum_x += x[j] * x[j];
-      square_sum_y += y[j] * y[j];
-      xy += x[j] * y[j];
-    }
-    CHECK(square_sum_x > 0 && square_sum_y > 0);
-    out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
-  }
-}
-
-/**
- * Cosine Similarity
- * for each row i,
- *   out[i] = scale * cos(input1[i], input2[i])
- *      = scale * <input1[i], input2[i]>/sqrt(|input1[i]|^2 * |input2[i]|^2)
- * when input2 only has one row, then for each row i,
- *   out[i] = cos(input1[i], input2[0])
- *
- * \param inputs[0] input matrix 1, size: nSamples * dim.
- * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param outputs[0] output matrix, size : nSamples * 1.
- */
-
-template <DeviceType Device>
-class CosSimForwardFunc : public FunctionBase {
-  void init(const FuncConfig& config) override {
-    scale_ = config.get<real>("scale");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(inputs.size(), 2UL);
-    CHECK_EQ(outputs.size(), 1UL);
-
-    CHECK_EQ(inputs[0].shape().ndims(), 2UL);
-    CHECK_EQ(inputs[1].shape().ndims(), 2UL);
-    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
-
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
-    CHECK_EQ(outputs[0].shape()[1], 1UL);
-
-    CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data());
-
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    auto out_mat = outputs[0].matrix<Device>();
-    const auto in1_mat = inputs[0].matrix<Device>();
-    const auto in2_mat = inputs[1].matrix<Device>();
-
-    CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
-  }
-
-private:
-  real scale_;
-};
-
-/**
- * Cosine Similarity Derivative for CpuMatrix
- *
- * \param in1_grad  forward input grad 1, size: nSamples * dim.
- * \param in2_grad  forward input grad 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- *
- * \param out_grad  backward loss output grad, size : nSamples * 1.
- * \param out_val   forward output value, size: nSamples * 1.
- * \param in1_val   forward input value 1, size: nSamples * dim.
- * \param in2_val   forward input value 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- * \param scale,    default 1.0
- */
-template <>
-void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad,
-                                     const CpuMatrix& out_val,
-                                     const CpuMatrix& in1_val,
-                                     const CpuMatrix& in2_val,
-                                     CpuMatrix& in1_grad,
-                                     CpuMatrix& in2_grad,
-                                     real scale) {
-  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
-        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
-  CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required";
-
-  const real* grad = out_grad.getData();
-  const real* out = out_val.getData();
-  const real* prev_out_x = in1_val.getData();
-  const real* prev_out_y = in2_val.getData();
-  real* prev_grad_x = in1_grad.getData();
-  real* prev_grad_y = in2_grad.getData();
-
-  size_t num_samples = out_grad.getHeight();
-  size_t dim = in1_val.getWidth();
-  CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight());
-  CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples);
-  size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim;
-  for (size_t i = 0; i < num_samples; ++i,
-              prev_out_x += dim,
-              prev_out_y += inc,
-              prev_grad_x += dim,
-              prev_grad_y += inc) {
-    real square_sum_x = 0;
-    real square_sum_y = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      square_sum_x += prev_out_x[j] * prev_out_x[j];
-      square_sum_y += prev_out_y[j] * prev_out_y[j];
-      xy += prev_out_x[j] * prev_out_y[j];
-    }
-    CHECK(square_sum_x > 0 && square_sum_y > 0);
-    if (xy == 0) {
-      real reciprocal =
-          1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
-      for (size_t j = 0; j < dim; ++j) {
-        prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
-        prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
-      }
-    } else {
-      real reciprocal_xy = 1.0f / xy;
-      real reciprocal_square_sum_x = 1.0f / square_sum_x;
-      real reciprocal_square_sum_y = 1.0f / square_sum_y;
-      for (size_t j = 0; j < dim; ++j) {
-        prev_grad_x[j] +=
-            out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
-                                prev_out_x[j] * reciprocal_square_sum_x);
-        prev_grad_y[j] +=
-            out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
-                                prev_out_y[j] * reciprocal_square_sum_y);
-      }
-    }
-  }
-}
-
-/**
- * Cosine Similarity backward Derivative
- *
- * \param outputs[0] forward input grad 1, size: nSamples * dim.
- * \param outputs[1] forward input grad 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- *
- * \param inputs[0] backward loss output grad, size : nSamples * 1.
- * \param inputs[1] forward output value, size: nSamples * 1.
- * \param inputs[2] forward input value 1, size: nSamples * dim.
- * \param inputs[3] forward input value 2,
- *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
- */
-template <DeviceType Device>
-class CosSimBackwardFunc : public FunctionBase {
-  void init(const FuncConfig& config) override {
-    scale_ = config.get<real>("scale");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(inputs.size(), 4UL);
-    CHECK_EQ(outputs.size(), 2UL);
-    /// dim of out_grad and out_val == 1, column vector
-    CHECK_EQ(inputs[0].shape()[1], 1UL);
-    CHECK_EQ(inputs[1].shape()[1], 1UL);
-    /// nSamples of out_grad == out_val == in_val1 == in_grad1
-    CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
-    CHECK_EQ(inputs[0].shape()[0], inputs[0].shape()[0]);
-    CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
-    /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2
-    CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
-    CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
-    CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);
-
-    CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
-          inputs[3].data() && outputs[0].data() && outputs[1].data());
-
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-
-    const auto out_grad = inputs[0].matrix<Device>();
-    const auto out_val = inputs[1].matrix<Device>();
-    const auto in1_val = inputs[2].matrix<Device>();
-    const auto in2_val = inputs[3].matrix<Device>();
-    auto in1_grad = outputs[0].matrix<Device>();
-    auto in2_grad = outputs[1].matrix<Device>();
-
-    CosSimBackward<Device>(
-        out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
-  }
-
-private:
-  real scale_;
-};
-
-REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
-REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
-REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
-#endif
-}  // namespace paddle
diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp
deleted file mode 100644
index 42b02da0cb07a57e030a3edb08bea23203efd688..0000000000000000000000000000000000000000
--- a/paddle/function/CosSimOpTest.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-#include "paddle/math/Matrix.h"
-
-using namespace paddle;  // NOLINT
-
-void testCosSimForward(size_t height_x,
-                       size_t height_y,
-                       size_t width,
-                       real scale) {
-  CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale));
-  // prepare input arguments
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}),
-                  ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-void testCosSimBackward(size_t height_x,
-                        size_t height_y,
-                        size_t width,
-                        real scale) {
-  CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale));
-  // prepare input arguments
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}),
-                  ADD_TO);
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}),
-                  ADD_TO);
-  // run Function
-  test.run();
-}
-
-TEST(Matrix, cosSim) {
-  for (auto height_x : {10, 100, 1000}) {
-    for (auto height_y : {1, height_x}) {
-      for (auto width : {10, 100, 1000}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSimForward(height_x, height_y, width, scale);
-          testCosSimBackward(height_x, height_y, width, scale);
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp
deleted file mode 100644
index 7aa527d21615e19257bd003d0563b5e26b2fcb2f..0000000000000000000000000000000000000000
--- a/paddle/function/CropOp.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CropOp.h"
-#include "paddle/function/TensorShape.h"
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void Crop<DEVICE_TYPE_CPU>(real* outputs,
-                           const real* inputs,
-                           const TensorShape inShape,
-                           const TensorShape outShape,
-                           const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cCrop = crop_corner[1];
-  int hCrop = crop_corner[2];
-  int wCrop = crop_corner[3];
-
-  int num = inShape[0];
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  for (int n = 0; n < num; n++) {
-    for (int c = 0; c < outC; c++) {
-      for (int h = 0; h < outH; h++) {
-        int outoff = ((n * outC + c) * outH + h) * outW;
-        int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop;
-        memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real));
-      }
-    }
-  }
-}
-
-template <>
-void CropGrad<DEVICE_TYPE_CPU>(const real* inGrad,
-                               real* outGrad,
-                               const TensorShape inShape,
-                               const TensorShape outShape,
-                               const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner =
-      conf.get<std::vector<uint32_t>>("crop_corner");
-  int cCrop = crop_corner[1];
-  int hCrop = crop_corner[2];
-  int wCrop = crop_corner[3];
-
-  int num = outShape[0];
-  int outC = outShape[1];
-  int outH = outShape[2];
-  int outW = outShape[3];
-
-  int inC = inShape[1];
-  int inH = inShape[2];
-  int inW = inShape[3];
-
-  for (int n = 0; n < num; n++) {
-    for (int c = 0; c < inC; c++) {
-      for (int h = 0; h < inH; h++) {
-        int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop;
-        int inoff = ((n * inC + c) * inH + h) * inW;
-        CpuVector inG = CpuVector(inW, const_cast<real*>(inGrad + inoff));
-        CpuVector outG = CpuVector(inW, outGrad + outoff);
-        outG += inG;
-      }
-    }
-  }
-}
-
-/**
- * \brief Crop input according to the specify corner and shape.
- *        The input and output is a 4D tensor. In CropFunc, we only
- *        crop the 2nd to 4th dimension.
- *
- * Argument in this Function:
- * \param pad_    A struct object contains the cropping corner and shape.
- * \param inputs  A 4D tensor, only one input.
- * \param outputs A 4D tensor, the output value after cropping.
- *
- * For example,
- * Input(2,2,2,3) = [
- *                    [ [[1,2,3], [3,4,5]],
- *                      [[2,3,5], [1,6,7]] ],
- *                    [ [[4,3,1], [1,8,7]],
- *                      [[3,8,9], [2,3,5]] ]
- *                  ] # the input shape is (2,2,2,3)
- *
- * pad_: if corner = (0,1,1) and crop_shape = (2,1,2)
- * Output(2,2,1,2) = [
- *                    [ [[4,5]],
- *                      [[6,7]] ],
- *                    [ [[8,7]],
- *                      [[3,5]] ]
- *                  ] # the input shape is (2,2,2,3)
- */
-template <DeviceType Device>
-class CropFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    TensorShape inShape = inputs[0].shape();
-    TensorShape outShape = outputs[0].shape();
-
-    Crop<Device>(outputs[0].data<real>(),
-                 inputs[0].data<real>(),
-                 inShape,
-                 outShape,
-                 conf_);
-  }
-
-private:
-  FuncConfig conf_;
-};
-
-/**
- * \brief The backward propagation of cropping Function.
- *
- * Argument in this Function:
- * \param crop_    The same meaning as it in CropFunc.
- * \param inputs  The gradient with respect to the output value of CropFunc.
- * \param outputs The gradient with respect to the input value of CropFunc.
- */
-
-template <DeviceType Device>
-class CropGradFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    TensorShape outShape = outputs[0].shape();
-    TensorShape inShape = inputs[0].shape();
-
-    CropGrad<Device>(inputs[0].data<real>(),
-                     outputs[0].data<real>(),
-                     inShape,
-                     outShape,
-                     conf_);
-  }
-
-private:
-  FuncConfig conf_;
-};
-
-REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
-REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
-REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
deleted file mode 100644
index 75c0fc2a3d047a9162d49809a717629f2270872d..0000000000000000000000000000000000000000
--- a/paddle/function/CrossMapNormalOp.cpp
+++ /dev/null
@@ -1,344 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CrossMapNormalOp.h"
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void CrossMapNormal<DEVICE_TYPE_CPU>(real* outputs,
-                                     real* denoms,
-                                     const real* inputs,
-                                     size_t numSamples,
-                                     size_t channels,
-                                     size_t height,
-                                     size_t width,
-                                     size_t size,
-                                     real scale,
-                                     real pow) {
-  size_t oneImage = height * width;
-  size_t oneSample = channels * oneImage;
-
-  CpuVector outputsV(numSamples * oneSample, outputs);
-  CpuVector inputsV(numSamples * oneSample, const_cast<real*>(inputs));
-  CpuVector denomsV(numSamples * oneSample, denoms);
-
-  // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow)
-  // x represents inputs
-  // f(x) represents outputs
-  // denoms save the intermediate result for backward
-  denomsV = denomsV.constant(1.0);
-  const int start = -((int)size - 1) / 2;
-  const int end = (int)size + start;
-  for (size_t i = 0; i < numSamples; i++) {
-    real* oneDenom = denoms + i * oneSample;
-    real* oneInput = const_cast<real*>(inputs) + i * oneSample;
-    for (int c = 0; c < (int)channels; c++) {
-      CpuVector denom(oneImage, oneDenom + c * oneImage);
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          CpuVector input(oneImage, oneInput + (c + s) * oneImage);
-          denom += input.square() * scale;
-        }
-      }
-    }
-  }
-
-  outputsV = inputsV * denomsV.pow(-pow);
-}
-
-template <>
-void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
-                                         const real* inputsValue,
-                                         const real* outputsValue,
-                                         const real* outputsGrad,
-                                         const real* denoms,
-                                         size_t numSamples,
-                                         size_t channels,
-                                         size_t height,
-                                         size_t width,
-                                         size_t size,
-                                         real scale,
-                                         real pow) {
-  size_t oneSample = channels * height * width;
-  std::function<CpuVector(real*, size_t)> oneImage = [=](real* data,
-                                                         size_t offset) {
-    return CpuVector(height * width, data + offset);
-  };
-
-  const int start = -((int)size) / 2;
-  const int end = (int)size + start;
-  const real ratio = -(real)2 * scale * pow;
-  for (size_t i = 0; i < numSamples; i++) {
-    size_t sOffset = i * oneSample;
-    real* oneInputGrad = inputsGrad + sOffset;
-    real* oneInputValue = const_cast<real*>(inputsValue) + sOffset;
-    real* oneDenom = const_cast<real*>(denoms) + sOffset;
-    real* oneOutputGrad = const_cast<real*>(outputsGrad) + sOffset;
-    real* oneOutputValue = const_cast<real*>(outputsValue) + sOffset;
-
-    for (int c = 0; c < (int)channels; c++) {
-      size_t cOffset = c * height * width;
-      CpuVector inputGrad = oneImage(oneInputGrad, cOffset);
-      CpuVector inputValue = oneImage(oneInputValue, cOffset);
-      CpuVector denom = oneImage(oneDenom, cOffset);
-      CpuVector outputGrad = oneImage(oneOutputGrad, cOffset);
-
-      inputGrad = inputGrad + denom.pow(-pow) * outputGrad;
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          size_t offset = (c + s) * height * width;
-          CpuVector output = oneImage(oneOutputValue, offset);
-          CpuVector outputGrad = oneImage(oneOutputGrad, offset);
-          CpuVector denom = oneImage(oneDenom, offset);
-
-          inputGrad += ((outputGrad * output * ratio) / denom) * inputValue;
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief Normalization with across maps.
- *
- * This Function comes from the paper
- * "ImageNet Classification with Deep Convolutional Neural Networks".
- *
- * The original formula is:
- *
- *                                Input(i, x, y)
- * Output(i, x, y) = ----------------------------------------------
- *                                 -- upper
- *                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
- *                                 -- j = lower
- *
- * upper is `min(C, c + N/2)`
- * lower if `max(0, c - N/2)`
- *
- * Function implementation:
- *
- * inputs and outpus is NCHW format, while input.shape.ndims() is equal 4.
- * And the meaning of each dimension(0-3) is respectively batch size,
- * feature maps, rows and columns.
- *
- * Input and Output in the above formula is for each map(i) of one image, and
- * Input(i, x, y), Output(i, x, y) represents an element in an image.
- *
- * C is the number of feature maps of one image, and N is a hyper-parameters
- * is configured when Function is initialized. The sum in the denominator
- * is the sum of the same position in the neighboring maps.
- *
- * In the implementation of Function, k is equal to 1,
- * so Function has no argument for k.
- *
- * Function Arguments:
- *
- * \param size_      represent N
- * \param scale_     represent alpha
- * \param pow_       represent beta
- * \param inputs[0]  represent Input
- * \param outputs[0] represent Output
- * \param outputs[1] represent The denominator in the formula(except beta)
- *
- * Note:
- * Save output[1] is to simplify the backward calculation.
- * TODO, if only consider the forward calculation, we can optimize to
- * remove the output[1].
- */
-template <DeviceType Device>
-class CrossMapNormalFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    size_ = config.get<size_t>("size");
-    scale_ = config.get<real>("scale");
-    pow_ = config.get<real>("pow");
-
-    // number of inputs and outputs
-    numInputs_ = 1;
-    numOutputs_ = 2;
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    check(inputs, outputs);
-    // ArgType check still on here,
-    // not sure whether it is better to put inside the check.
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    CrossMapNormal<Device>(outputs[0].data<real>(),
-                           outputs[1].data<real>(),
-                           inputs[0].data<real>(),
-                           batchSize,
-                           maps,
-                           rows,
-                           columns,
-                           size_,
-                           scale_,
-                           pow_);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK(inputs[0].shape() == outputs[0].shape());
-    CHECK(inputs[0].shape() == outputs[1].shape());
-  }
-
-  // Only need the shape of the input, can calculate the
-  // floating-point operation.
-  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)numInputs_, inputs.size());
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    // number of floating-point operations
-    // an approximate value
-    size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3);
-
-    return ops;
-  }
-
-private:
-  size_t size_;
-  real scale_;
-  real pow_;
-};
-
-/**
- * \brief Backward calculation for normalization with across maps.
- *
- * Function implementation:
- *
- * The implementation of this Function is derived from the
- * CrossMapNormalFunc implementation.
- *
- * InputGrad = OutputGrad * denoms ^ (-beta)
- *    -- upper
- *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
- *    -- lower
- *
- * The data of inputs/outputs format is the same as the forward interface
- * and is NCHW.
- *
- * The upper and lower is the same as forward. The logic of the sum
- * is also the same as forward.
- *
- * Function Arguments:
- *
- * \param size_      represent N
- * \param scale_     represent alpha
- * \param pow_       represent beta
- * \param inputs[0]  represent InputValue, inputs[0] of CrossMapNormalFunc
- * \param inputs[1]  represent OutputValue, outputs[0] of CrossMapNormalFunc
- * \param inputs[2]  represent OutputGrad
- * \param inputs[3]  represent denoms, outputs[1] of CrossMapNormalFunc
- *                   This is the intermediate result that is
- *                   preserved in the forward calculation.
- * \param outputs[0] represent InputGrad
- */
-template <DeviceType Device>
-class CrossMapNormalGradFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    // function arguments
-    size_ = config.get<size_t>("size");
-    scale_ = config.get<real>("scale");
-    pow_ = config.get<real>("pow");
-
-    // number of inputs and outputs
-    numInputs_ = 4;
-    numOutputs_ = 1;
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    check(inputs, outputs);
-    if (outputs[0].getArgType() != ADD_TO) {
-      // Currently, some algorithm implementations are ASSIGN_TO mode,
-      // if need to support the ADD_TO calculation, need to clear the output.
-      typename Tensor<real, Device>::Vector tmp(
-          outputs[0].shape().getElements(), outputs[0].data<real>());
-      tmp.zero();
-    }
-
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
-                               inputs[0].data<real>(),
-                               inputs[1].data<real>(),
-                               inputs[2].data<real>(),
-                               inputs[3].data<real>(),
-                               batchSize,
-                               maps,
-                               rows,
-                               columns,
-                               size_,
-                               scale_,
-                               pow_);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK(inputs[0].shape() == inputs[1].shape());
-    CHECK(inputs[0].shape() == inputs[2].shape());
-    CHECK(inputs[0].shape() == inputs[3].shape());
-    CHECK(inputs[0].shape() == outputs[0].shape());
-  }
-
-  // Only need the shape of one input, can calculate the
-  // floating-point operation.
-  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_LT((size_t)1, inputs.size());
-    size_t batchSize = inputs[0].shape()[0];
-    size_t maps = inputs[0].shape()[1];
-    size_t rows = inputs[0].shape()[2];
-    size_t columns = inputs[0].shape()[3];
-
-    // number of floating-point operations
-    // an approximate value
-    size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2);
-
-    return ops;
-  }
-
-private:
-  size_t size_;
-  real scale_;
-  real pow_;
-};
-
-REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
-REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
-REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
deleted file mode 100644
index 46651345b45e4ced9a3ef3373af437d939a66716..0000000000000000000000000000000000000000
--- a/paddle/function/DepthwiseConvOp.cpp
+++ /dev/null
@@ -1,305 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DepthwiseConvOp.h"
-#include "ConvOp.h"
-
-namespace paddle {
-
-template <class T>
-class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* inputData,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* outputData) {
-    // TODO(zhaolong) : cpu implementation of depthwise convolution
-  }
-};
-
-template <class T>
-class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* outputGrad,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* inputGrad) {}
-  // TODO(zhaolong) : cpu implementation of depthwise convolution
-};
-
-template <class T>
-class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* outputGrad,
-                  const T* inputData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* colData,
-                  T* filterGrad) {}
-  // TODO(zhaolong) : cpu implementation of depthwise convolution
-};
-
-/*
- * \brief Forward calculation of depthwise convolution.
- */
-template <DeviceType Device>
-class DepthwiseConvFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-    size_t filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-
-    DepthwiseConvFunctor<Device, real> depthwiseConv;
-    depthwiseConv(inputData,
-                  filterData,
-                  batchSize,
-                  outputChannels,
-                  outputHeight,
-                  outputWidth,
-                  inputChannels,
-                  inputHeight,
-                  inputWidth,
-                  filterMultiplier,
-                  filterHeight,
-                  filterWidth,
-                  strideH(),
-                  strideW(),
-                  paddingH(),
-                  paddingW(),
-                  outputData);
-  }
-};
-
-/*
- * \brief Backward input calculation of depthwise convolution.
- */
-template <DeviceType Device>
-class DepthwiseConvGradInputFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    check(inputs, outputs);
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-    size_t filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    real* outputGrad = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* inputGrad = outputs[0].data<real>();
-
-    DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
-    depthwiseConvGradInput(outputGrad,
-                           filterData,
-                           batchSize,
-                           outputChannels,
-                           outputHeight,
-                           outputWidth,
-                           inputChannels,
-                           inputHeight,
-                           inputWidth,
-                           filterMultiplier,
-                           filterHeight,
-                           filterWidth,
-                           strideH(),
-                           strideW(),
-                           paddingH(),
-                           paddingW(),
-                           inputGrad);
-  }
-};
-
-/*
- * \brief Backward filter calculation of depthwise convolution.
- */
-template <DeviceType Device>
-class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    check(inputs, outputs);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-    size_t filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    real* outputGrad = inputs[0].data<real>();
-    real* inputData = inputs[1].data<real>();
-    real* filterGrad = outputs[0].data<real>();
-
-    int size = outputChannels * filterHeight * filterWidth * outputHeight *
-               outputWidth;
-    resizeBuffer<Device>(size);
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
-
-    DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
-
-    depthwiseConvGradFilter(outputGrad,
-                            inputData,
-                            batchSize,
-                            outputChannels,
-                            outputHeight,
-                            outputWidth,
-                            inputChannels,
-                            inputHeight,
-                            inputWidth,
-                            filterMultiplier,
-                            filterHeight,
-                            filterWidth,
-                            strideH(),
-                            strideW(),
-                            paddingH(),
-                            paddingW(),
-                            colData,
-                            filterGrad);
-  }
-};
-
-REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
-                    CPU,
-                    DepthwiseConvGradInputFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
-                    CPU,
-                    DepthwiseConvGradFilterFunction);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
-                    GPU,
-                    DepthwiseConvGradInputFunction);
-REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
-                    GPU,
-                    DepthwiseConvGradFilterFunction);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h
deleted file mode 100644
index 6700747314fa8377828dab0c436eb4b2053f46f6..0000000000000000000000000000000000000000
--- a/paddle/function/DepthwiseConvOp.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TensorType.h"
-
-namespace paddle {
-
-/**
- *\brief   Depthwise convolution forward. The outputData
- *         of depthwise convolution is same with ExpandConvLayer
- *         when groups equals inputChannels in ExpandConvLayer.
- *
- * \param[in]   inputData         input data.
- * \param[in]   filterData        the Paramters of the depthwise conv layer..
- * \param[in]   batchSize         batch size of input data.
- * \param[in]   outputChannels    channels of outputData.
- * \param[in]   outputHeight      height of outputData.
- * \param[in]   outputWidth       width of outputData.
- * \param[in]   inputChannels     channels of inputData.
- * \param[in]   inputHeight       height of inputData.
- * \param[in]   inputWidth        width of inputData..
- * \param[in]   filterMultiplier  equals to outputChannels/groups_.
- * \param[in]   filterHeight      height of filter.
- * \param[in]   filterWidth       widht of filter.
- * \param[in]   strideH           stride size in height direction.
- * \param[in]   strideW           stride size in width direction.
- * \param[in]   paddingH          padding size in height direction.
- * \param[in]   paddingW          padding size in width direction.
- * \param[out]  outputData        outputData.
- *
- */
-template <DeviceType Device, class T>
-class DepthwiseConvFunctor {
-public:
-  void operator()(const T* inputData,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* outputData);
-};
-
-/**
- *\brief  Functor tot compute the depthwise convolution backprop w.r.t input.
- *
- *
- * \param[in]   outputGradData    the grad data of output.
- * \param[in]   filterData        the Paramters of the depthwise conv layer..
- * \param[in]   batchSize         batch size of input data.
- * \param[in]   outputChannels    channels of outputData.
- * \param[in]   outputHeight      height of outputData.
- * \param[in]   outputWidth       width of outputData.
- * \param[in]   inputChannels     channels of input data.
- * \param[in]   inputHeight       height of inputData.
- * \param[in]   inputWidth        width of inputData.
- * \param[in]   filterMultiplier  equals to outputChannels/groups_.
- * \param[in]   filterHeight      height of filter.
- * \param[in]   filterWidth       widht of filter.
- * \param[in]   strideH           stride size in height direction.
- * \param[in]   strideW           stride size in width direction.
- * \param[in]   paddingH          padding size in height direction.
- * \param[in]   paddingW          padding size in width direction.
- * \param[out]  inputGrad         the grad data of input.
- *
- */
-template <DeviceType Device, class T>
-class DepthwiseConvGradInputFunctor {
-public:
-  void operator()(const T* outputGrad,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* inputGrad);
-};
-
-/**
- *\brief  Functor tot compute the depthwise convolution backprop w.r.t filter.
- *
- * \param[in]   outputGradData    the grad data of output.
- * \param[in]   inputData         inputData.
- * \param[in]   batchSize         batch size of input data.
- * \param[in]   outputChannels    channels of outputData.
- * \param[in]   outputHeight      height of outputData.
- * \param[in]   outputWidth       width of outputData.
- * \param[in]   inputChannels     channels of input data.
- * \param[in]   inputHeight       height of inputData.
- * \param[in]   inputWidth        width of inputData.
- * \param[in]   filterMultiplier  equals to outputChannels/groups_.
- * \param[in]   filterHeight      height of filter.
- * \param[in]   filterWidth       widht of filter.
- * \param[in]   strideH           stride size in height direction.
- * \param[in]   strideW           stride size in width direction.
- * \param[in]   paddingH          padding size in height direction.
- * \param[in]   paddingW          padding size in width direction.
- * \param[in]   colData           Auxiliary data when calculating filterGrad.
- * \param[in]   multiplierData    Auxiliary data when calculating filterGrad.
- * \param[out]  filterGrad        the grad data of filter.
- *
- */
-template <DeviceType Device, class T>
-class DepthwiseConvGradFilterFunctor {
-public:
-  void operator()(const T* outputGrad,
-                  const T* inputData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* colData,
-                  T* filterGrad);
-};
-
-}  // namespace paddle
diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu
deleted file mode 100644
index cd1d55a416c84c6327226ffaae4d5d9d5be81038..0000000000000000000000000000000000000000
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ /dev/null
@@ -1,376 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DepthwiseConvOp.h"
-#include "paddle/math/BaseMatrix.h"
-
-namespace paddle {
-
-// CUDA kernel to compute the depthwise convolution forward pass
-template <class T>
-__global__ void ConvolutionDepthwiseForward(const int nthreads,
-                                            const T* const inputData,
-                                            const T* const filterData,
-                                            const int batchSize,
-                                            const int outputChannels,
-                                            const int outputHeight,
-                                            const int outputWidth,
-                                            const int inputChannels,
-                                            const int inputHeight,
-                                            const int inputWidth,
-                                            const int filterMultiplier,
-                                            const int filterHeight,
-                                            const int filterWidth,
-                                            const int strideH,
-                                            const int strideW,
-                                            const int paddingH,
-                                            const int paddingW,
-                                            T* const outputData) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-
-  if (index < nthreads) {
-    const int batch = index / outputChannels / outputHeight / outputWidth;
-    const int c_out = (index / outputHeight / outputWidth) % outputChannels;
-    const int h_out = (index / outputWidth) % outputHeight;
-    const int w_out = index % outputWidth;
-
-    const int c_in = c_out / filterMultiplier;
-    const T* weight = filterData + c_out * filterHeight * filterWidth;
-    T value = 0;
-    const int h_in_start = -paddingH + h_out * strideH;
-    const int w_in_start = -paddingW + w_out * strideW;
-    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
-    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
-    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
-        (w_in_end < inputWidth)) {
-      for (int kh = 0; kh < filterHeight; ++kh) {
-        for (int kw = 0; kw < filterWidth; ++kw) {
-          const int h_in = -paddingH + h_out * strideH + kh;
-          const int w_in = -paddingW + w_out * strideW + kw;
-          const int offset =
-              ((batch * inputChannels + c_in) * inputHeight + h_in) *
-                  inputWidth +
-              w_in;
-          value += (*weight) * inputData[offset];
-          ++weight;
-        }
-      }
-    } else {
-      for (int kh = 0; kh < filterHeight; ++kh) {
-        for (int kw = 0; kw < filterWidth; ++kw) {
-          const int h_in = -paddingH + h_out * strideH + kh;
-          const int w_in = -paddingW + w_out * strideW + kw;
-          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
-              (w_in < inputWidth)) {
-            const int offset =
-                ((batch * inputChannels + c_in) * inputHeight + h_in) *
-                    inputWidth +
-                w_in;
-            value += (*weight) * inputData[offset];
-          }
-          ++weight;
-        }
-      }
-    }
-    outputData[index] = value;
-  }
-}
-
-// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
-template <class T>
-__global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
-                                                  const T* const top_diff,
-                                                  const T* const weight_data,
-                                                  const int num,
-                                                  const int outputChannels,
-                                                  const int outputHeight,
-                                                  const int outputWidth,
-                                                  const int inputChannels,
-                                                  const int inputHeight,
-                                                  const int inputWidth,
-                                                  const int filterMultiplier,
-                                                  const int filterHeight,
-                                                  const int filterWidth,
-                                                  const int strideH,
-                                                  const int strideW,
-                                                  const int paddingH,
-                                                  const int paddingW,
-                                                  T* const bottom_diff) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int batch = index / inputChannels / inputHeight / inputWidth;
-    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
-    const int h_in = (index / inputWidth) % inputHeight;
-    const int w_in = index % inputWidth;
-
-    const int c_out_start = c_in * filterMultiplier;
-
-    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
-    h_out_start = 0 > h_out_start ? 0 : h_out_start;
-    int h_out_end = (h_in + paddingH) / strideH;
-    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
-    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
-    w_out_start = 0 > w_out_start ? 0 : w_out_start;
-    int w_out_end = (w_in + paddingW) / strideW;
-    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
-
-    T value = 0;
-
-    for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier;
-         c_out++) {
-      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
-        const int filter_h = h_in + paddingH - h_out * strideH;
-        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
-          const int filter_w = w_in + paddingW - w_out * strideW;
-          const int filter_offset = c_out * filterHeight * filterWidth +
-                                    filter_h * filterWidth + filter_w;
-          const int top_diff_offset =
-              ((batch * outputChannels + c_out) * outputHeight + h_out) *
-                  outputWidth +
-              w_out;
-          value += top_diff[top_diff_offset] * weight_data[filter_offset];
-        }
-      }
-    }
-    bottom_diff[index] += value;
-  }
-}
-
-// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
-template <class T>
-__global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
-                                                   const int nthreads,
-                                                   const T* const top_diff,
-                                                   const T* const inputData,
-                                                   const int num,
-                                                   const int outputChannels,
-                                                   const int outputHeight,
-                                                   const int outputWidth,
-                                                   const int inputChannels,
-                                                   const int inputHeight,
-                                                   const int inputWidth,
-                                                   const int filterMultiplier,
-                                                   const int filterHeight,
-                                                   const int filterWidth,
-                                                   const int strideH,
-                                                   const int strideW,
-                                                   const int paddingH,
-                                                   const int paddingW,
-                                                   T* const buffer_data) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < nthreads) {
-    const int h_out = (index / outputWidth) % outputHeight;
-    const int w_out = index % outputWidth;
-    const int kh =
-        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
-    const int kw = (index / outputHeight / outputWidth) % filterWidth;
-    const int h_in = -paddingH + h_out * strideH + kh;
-    const int w_in = -paddingW + w_out * strideW + kw;
-    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
-        (w_in < inputWidth)) {
-      const int c_out =
-          index / (filterHeight * filterWidth * outputHeight * outputWidth);
-      const int c_in = c_out / filterMultiplier;
-      const int batch = num_i;
-      const int top_offset =
-          ((batch * outputChannels + c_out) * outputHeight + h_out) *
-              outputWidth +
-          w_out;
-      const int bottom_offset =
-          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
-          w_in;
-      buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
-    } else {
-      buffer_data[index] = 0;
-    }
-  }
-}
-
-template <class T>
-class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const T* inputData,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* outputData) {
-    int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
-
-    size_t blocks = (outputSize + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        outputSize,
-        inputData,
-        filterData,
-        batchSize,
-        outputChannels,
-        outputHeight,
-        outputWidth,
-        inputChannels,
-        inputHeight,
-        inputWidth,
-        filterMultiplier,
-        filterHeight,
-        filterWidth,
-        strideH,
-        strideW,
-        paddingH,
-        paddingW,
-        outputData);
-  }
-};
-
-template <class T>
-class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const T* outputGrad,
-                  const T* filterData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* inputGrad) {
-    int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
-
-    size_t blocks = (inputSize + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    ConvolutionDepthwiseInputBackward<T>
-        // NOLINT_NEXT_LINE(whitespace/operators)
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(inputSize,
-                                               outputGrad,
-                                               filterData,
-                                               batchSize,
-                                               outputChannels,
-                                               outputHeight,
-                                               outputWidth,
-                                               inputChannels,
-                                               inputHeight,
-                                               inputWidth,
-                                               filterMultiplier,
-                                               filterHeight,
-                                               filterWidth,
-                                               strideH,
-                                               strideW,
-                                               paddingH,
-                                               paddingW,
-                                               inputGrad);
-  }
-};
-
-template <class T>
-class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const T* outputGrad,
-                  const T* inputData,
-                  int batchSize,
-                  int outputChannels,
-                  int outputHeight,
-                  int outputWidth,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterMultiplier,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  T* colData,
-                  T* filterGrad) {
-    int colDataSize = outputChannels * filterHeight * filterWidth *
-                      outputHeight * outputWidth;
-
-    size_t blocks = (colDataSize + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-    BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
-                                1,
-                                filterGrad,
-                                false,
-                                true);
-
-    for (int i = 0; i < batchSize; i++) {
-      ConvolutionDepthwiseFilterBackward<
-          T><<<grid, threads, 0, STREAM_DEFAULT>>>(i,
-                                                   colDataSize,
-                                                   outputGrad,
-                                                   inputData,
-                                                   batchSize,
-                                                   outputChannels,
-                                                   outputHeight,
-                                                   outputWidth,
-                                                   inputChannels,
-                                                   inputHeight,
-                                                   inputWidth,
-                                                   filterMultiplier,
-                                                   filterHeight,
-                                                   filterWidth,
-                                                   strideH,
-                                                   strideW,
-                                                   paddingH,
-                                                   paddingW,
-                                                   colData);
-      int K = outputHeight * outputWidth;
-      int M = colDataSize / K;
-
-      BaseMatrix colMatrix(M, K, colData, false, true);
-      filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
-    }
-  }
-};
-
-#ifdef PADDLE_TYPE_DOUBLE
-template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
-template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
-template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
-#else
-template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
-template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
-template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
deleted file mode 100644
index bac4659e62b107dd80ef95dd0907b3da4becffbc..0000000000000000000000000000000000000000
--- a/paddle/function/EigenGemm.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <glog/logging.h>
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace paddle {
-
-template <class T>
-struct EigenBlasGemm {
-  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
-                           Eigen::Aligned>
-      EigenMatrix;
-
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc) {
-    Eigen::array<int, 2> sizeA;
-    if (transA) {
-      sizeA[0] = K;
-      sizeA[1] = M;
-      CHECK_EQ(M, lda);
-    } else {
-      sizeA[0] = M;
-      sizeA[1] = K;
-      CHECK_EQ(K, lda);
-    }
-    Eigen::array<int, 2> sizeB;
-    if (transB) {
-      sizeB[0] = N;
-      sizeB[1] = K;
-      CHECK_EQ(K, ldb);
-    } else {
-      sizeB[0] = K;
-      sizeB[1] = N;
-      CHECK_EQ(N, ldb);
-    }
-    Eigen::array<int, 2> sizeC = {{M, ldc}};
-    Eigen::array<int, 2> offsetC = {{0, 0}};
-    Eigen::array<int, 2> extentC = {{M, N}};
-
-    const EigenMatrix a(const_cast<T*>(A), sizeA);
-    const EigenMatrix b(const_cast<T*>(B), sizeB);
-    EigenMatrix c(C, sizeC);
-
-    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
-    Eigen::array<DimPair, 1> dims;
-    dims[0] = DimPair(1, 0);
-    dims[0].first = transA ? 0 : 1;
-    dims[0].second = transB ? 1 : 0;
-
-    Eigen::DefaultDevice device;
-    if (N == ldc) {
-      if (alpha == T(1) && beta == T(0)) {
-        c.device(device) = a.contract(b, dims);
-      } else if (alpha == T(1) && beta == T(1)) {
-        c.device(device) += a.contract(b, dims);
-      } else {
-        c.device(device) = alpha * a.contract(b, dims) + beta * c;
-      }
-    } else {
-      if (alpha == T(1) && beta == T(0)) {
-        c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
-      } else if (alpha == T(1) && beta == T(1)) {
-        c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
-      } else {
-        c.slice(offsetC, extentC).device(device) =
-            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
-      }
-    }
-  }
-};
-
-#ifdef PADDLE_TYPE_DOUBLE
-template struct EigenBlasGemm<double>;
-#else
-template struct EigenBlasGemm<float>;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
deleted file mode 100644
index 01288ef92e7b59d7958e6e23daf641b30a60eed1..0000000000000000000000000000000000000000
--- a/paddle/function/Function.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <vector>
-#include "BufferArg.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Any.h"
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/utils/Error.h"
-
-namespace paddle {
-
-/**
- * Function Configuration.
- * The argument type of Function::init.
- */
-class FuncConfig {
-public:
-  template <typename T>
-  T get(const std::string& key, Error* err = nullptr) const {
-    try {
-      return any_cast<T>(valueMap_.at(key));
-    } catch (std::exception& e) {  // could be cast or out of range exception.
-      if (err) {
-        *err = Error(e.what());
-      } else {
-        LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
-      }
-      return T();
-    }
-  }
-
-  template <typename T>
-  FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
-    auto it = valueMap_.find(key);
-    if (it != valueMap_.end()) {  // already contains key.
-      if (err) {
-        *err = Error("Key %s is already set in FuncConfig", key.c_str());
-      } else {
-        LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
-      }
-      return *this;
-    }
-    valueMap_[key] = any(v);
-    return *this;
-  }
-
-protected:
-  mutable std::unordered_map<std::string, any> valueMap_;
-};
-
-/**
- * Argument type for Function::calc().
- * A BufferArgs contains a set of BufferArg,
- * because Function can have multiple inputs and outputs.
- *
- * addArg() with Matix object used to adapt Layer Argument.
- * Will create a BufferArg object in addArg(),
- * and free in destructor of BufferArgs.
- *
- * addArg() with BufferArg object, just save BufferArg object address,
- * and the caller needs to guarantee the validity of the BufferArg object
- * in the BufferArgs life time.
- */
-class BufferArgs {
-public:
-  BufferArgs() {}
-
-  ~BufferArgs() {
-    for (auto arg : _args_) {
-      delete arg;
-    }
-  }
-
-  size_t size() const { return args_.size(); }
-
-  // add argument into BufferArgs
-  // Tensor can be Matrix, Vector, IVector.
-  // For inputs, do not need argType.
-  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
-  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
-    _args_.push_back(new BufferArg(arg, argType));
-    addArg(*_args_.back());
-  }
-
-  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
-    _args_.push_back(new BufferArg(arg, argType));
-    addArg(*_args_.back());
-  }
-
-  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
-    _args_.push_back(new BufferArg(arg, argType));
-    addArg(*_args_.back());
-  }
-
-  // Add arg into BufferArgs and reshape the arg.
-  //
-  // For example, arg represents an image buffer,
-  // but Matrix can only represent a two-dimensional Tensor.
-  // So need an extra argument to describe the shape of the image buffer.
-  void addArg(const Matrix& arg,
-              const TensorShape& shape,
-              ArgType argType = UNSPECIFIED);
-
-  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
-  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
-
-  void addArg(const Matrix& matrix,
-              const IVector& vector,
-              ArgType argType = UNSPECIFIED);
-
-  // get argument
-  const BufferArg& operator[](size_t num) const {
-    CHECK_LT(num, args_.size());
-    return *args_[num];
-  }
-
-  void addArg(BufferArg& arg) { args_.push_back(&arg); }
-
-  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
-
-  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
-
-  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
-
-private:
-  std::vector<BufferArg*> args_;
-  // The BufferArg object is constructed and freed by BufferArgs.
-  std::vector<BufferArg*> _args_;
-};
-
-/**
- * \brief Base class for Function.
- * The basic Function implementation requires override init and calc interfaces.
- *
- * The caller needs to ensure the validity of the arguments
- * during Function execution.
- *
- * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
- * and ADD_TO.
- * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
- * result of Function assigned to the output BufferArg.
- * If output.getArgType() == ADD_TO, this is add mode, and the calculation
- * result of Function need added to the output BufferArg.
- *
- * For example:
- * ASSIGN_TO: output = Function(inputs)
- * ADD_TO: output += Function(inputs)
- * If Function has more than one output, each output can have different modes.
- */
-class FunctionBase {
-public:
-  virtual ~FunctionBase() {}
-
-  virtual void init(const FuncConfig& config) {}
-
-  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
-  // This member function is used to check whether the BufferType and shape of
-  // the inputs and outputs arguments of the Function are correct.
-  // General calc function which will call this check to do arguments check.
-  // And before the calc called, the caller can also check their own arguments.
-  virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {}
-
-  // Calculate the number of floating-point operations of this Function.
-  // The inputs and outputs arguments do not need to contain the actual data,
-  // only the shape.
-  // And some Functions have the same input and output shapes,
-  // so you may not need to enter the complete number of arguments.
-  // But entering the full arguments is always correct for this interface.
-  virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) {
-    return 0;
-  }
-
-  int getNumInputs() const { return numInputs_; }
-
-  int getNumOutputs() const { return numOutputs_; }
-
-  static ClassRegistrar<FunctionBase> funcRegistrar_;
-
-protected:
-  // numInputs_ and numOutputs_ represents the maximum
-  // input and output supported by Function.
-  // Some functions are optimized for input and output,
-  // so when comparing the number of arguments, for these functions
-  // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_
-  size_t numInputs_;
-  size_t numOutputs_;
-};
-
-#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
-
-#define REGISTER_TYPED_FUNC(typeName, deviceName, className)   \
-  static InitFunction __reg_type_##typeName##deviceName([]() { \
-    FunctionBase::funcRegistrar_                               \
-        .registerClass<className<DEVICE_TYPE_##deviceName>>(   \
-            FUNC_NAME(typeName, deviceName));                  \
-  })
-
-}  // namespace paddle
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
deleted file mode 100644
index f5e6ca3f515a7fcd1498979703a0a59ddca40742..0000000000000000000000000000000000000000
--- a/paddle/function/FunctionTest.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include <gtest/gtest.h>
-#include "paddle/math/SparseMatrix.h"
-
-namespace paddle {
-
-template <DeviceType DType>
-void FunctionApi(typename Tensor<real, DType>::Matrix& output,
-                 const typename Tensor<real, DType>::Matrix& input);
-
-template <>
-void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 100U);
-  EXPECT_EQ(output.getWidth(), 200U);
-}
-
-template <>
-void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 10U);
-  EXPECT_EQ(output.getWidth(), 20U);
-}
-
-template <DeviceType DType>
-void Function(const BufferArgs& arguments) {
-  const auto input = arguments[0].matrix<DType>();
-  auto output = arguments[1].matrix<DType>();
-  FunctionApi<DType>(output, input);
-}
-
-TEST(Function, BufferArgs) {
-  CpuMatrix cpuInput = CpuMatrix(100, 200);
-  CpuMatrix cpuOutput = CpuMatrix(100, 200);
-  BufferArgs cpuArgments;
-  cpuArgments.addArg(cpuInput);
-  cpuArgments.addArg(cpuOutput);
-  Function<DEVICE_TYPE_CPU>(cpuArgments);
-
-  GpuMatrix gpuInput = GpuMatrix(10, 20);
-  GpuMatrix gpuOutput = GpuMatrix(10, 20);
-  BufferArgs gpuArgments;
-  gpuArgments.addArg(gpuInput);
-  gpuArgments.addArg(gpuOutput);
-  Function<DEVICE_TYPE_GPU>(gpuArgments);
-}
-
-/**
- * Some tests case are used to check the consistency between the BufferArg type
- * argument received by Function and the original type argument.
- *
- * Use Case:
- *  TEST() {
- *    Matrix matrix(...);
- *    CheckBufferArg lambda = [=](const BufferArg& arg) {
- *      // check matrix and arg are equivalent
- *      EXPECT_EQ(matrix, arg);
- *    }
- *
- *   BufferArgs argments{matrix...};
- *   std::vector<CheckBufferArg> checkFunc{lambda...};
- *   testBufferArgs(argments, checkFunc);
- *  }
- */
-typedef std::function<void(const BufferArg&)> CheckBufferArg;
-
-void testBufferArgs(const BufferArgs& inputs,
-                    const std::vector<CheckBufferArg>& check) {
-  EXPECT_EQ(inputs.size(), check.size());
-  for (size_t i = 0; i < inputs.size(); i++) {
-    check[i](inputs[i]);
-  }
-}
-
-void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
-  EXPECT_EQ(inputs.size(), 1U);
-  check(inputs[0]);
-}
-
-TEST(Arguments, Matrix) {
-  MatrixPtr matrix = Matrix::create(100, 200);
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2U);
-    EXPECT_EQ(arg.shape()[0], 100U);
-    EXPECT_EQ(arg.shape()[1], 200U);
-    EXPECT_EQ(arg.data(), matrix->getData());
-
-    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
-    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
-    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
-  };
-
-  BufferArgs argments;
-  argments.addArg(*matrix);
-  std::vector<CheckBufferArg> checkFunc;
-  checkFunc.push_back(check);
-  testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, Vector) {
-  VectorPtr vector = Vector::create(100, false);
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 1U);
-    EXPECT_EQ(arg.shape()[0], 100U);
-    EXPECT_EQ(arg.data(), vector->getData());
-
-    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
-    EXPECT_EQ(inVector.getSize(), vector->getSize());
-    EXPECT_EQ(inVector.getData(), vector->getData());
-  };
-
-  BufferArgs argments;
-  argments.addArg(*vector);
-  std::vector<CheckBufferArg> checkFunc;
-  checkFunc.push_back(check);
-  testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, CpuSparseMatrix) {
-  CpuSparseMatrix sparse(200, 300, 50);
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2U);
-    EXPECT_EQ(arg.shape()[0], 200U);
-    EXPECT_EQ(arg.shape()[1], 300U);
-    EXPECT_EQ(arg.data(), sparse.getData());
-    // CHECK_EQ(arg.sparse().nnz(), 50);
-    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
-    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
-    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
-    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
-  };
-
-  BufferArgs argments;
-  argments.addArg(sparse);
-  std::vector<CheckBufferArg> checkFunc;
-  checkFunc.push_back(check);
-  testBufferArgs(argments, checkFunc);
-}
-
-TEST(Arguments, BufferArg) {
-  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
-  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 3U);
-    EXPECT_EQ(arg.shape()[0], 1U);
-    EXPECT_EQ(arg.shape()[1], 2U);
-    EXPECT_EQ(arg.shape()[2], 3U);
-  };
-
-  BufferArgs argments;
-  argments.addArg(arg);
-  testBufferArgs(argments, check);
-}
-
-}  // namespace paddle
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
deleted file mode 100644
index 56c3537b6a96c8042d172f8aca2163fa18c813c1..0000000000000000000000000000000000000000
--- a/paddle/function/FunctionTest.h
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Function.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/math/tests/TensorCheck.h"
-#include "paddle/testing/TestUtil.h"
-
-namespace paddle {
-
-typedef std::shared_ptr<BufferArg> BufferArgPtr;
-
-namespace test {
-template <DeviceType DType>
-struct Allocator;
-
-template <>
-struct Allocator<DEVICE_TYPE_CPU> {
-  using type = CpuMemoryHandle;
-};
-
-template <>
-struct Allocator<DEVICE_TYPE_GPU> {
-  using type = GpuMemoryHandle;
-};
-
-// Copy argument1 to argument2
-template <DeviceType DType1, DeviceType DType2>
-class CopyArgument {
-public:
-  void operator()(const BufferArg& arg1, BufferArg& arg2) {
-    CHECK_EQ(arg1.valueType(), arg2.valueType());
-    CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
-
-    if (arg1.valueType() == VALUE_TYPE_INT32) {
-      IVectorPtr vector1 =
-          IVector::create((int*)arg1.data(),
-                          arg1.shape().getElements(),
-                          DType1 == DEVICE_TYPE_CPU ? false : true);
-      IVectorPtr vector2 =
-          IVector::create((int*)arg2.data(),
-                          arg2.shape().getElements(),
-                          DType2 == DEVICE_TYPE_CPU ? false : true);
-      vector2->copyFrom(*vector1);
-    } else {
-      VectorPtr vector1 =
-          Vector::create((real*)arg1.data(),
-                         arg1.shape().getElements(),
-                         DType1 == DEVICE_TYPE_CPU ? false : true);
-      VectorPtr vector2 =
-          Vector::create((real*)arg2.data(),
-                         arg2.shape().getElements(),
-                         DType2 == DEVICE_TYPE_CPU ? false : true);
-      vector2->copyFrom(*vector1);
-    }
-  }
-};
-}  // namespace test
-
-/**
- * \brief A class for comparing two Functions of different implementations.
- *        For example, can be used to compare the CPU and GPU implementation
- *        of the function is consistent.
- *
- * Use case:
- *  // Initializes a test object, the corresponding cpu and gpu Function
- *  // are constructed according to FunctionName and FuncConfig.
- *  CpuGpuFuncCompare test(FunctionName, FuncConfig);
- *  // Prepare inputs and outputs arguments.
- *  // Here the input and output can not contain real data,
- *  // only contains the argument type and shape.
- *  test.addInputs(input1);
- *  test.addInputs(input2);
- *  test.addOutputs(output1);
- *  test.addOutputs(output2);
- *  // Run.
- *  // Will according to the type and shape of arguments(inputs_/outputs_),
- *  // automatic initialization cpu and gpu function required arguments
- *  // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_).
- *  // Call the CPU and GPU Function calculation results.
- *  // Compares CPU and GPU calculation results for consistency.
- *  test.run();
- */
-template <DeviceType DType1, DeviceType DType2>
-class Compare2Function {
-public:
-  typedef typename test::Allocator<DType1>::type Allocator1;
-  typedef typename test::Allocator<DType2>::type Allocator2;
-  typedef typename Tensor<real, DType1>::Vector Vector1;
-  typedef typename Tensor<real, DType2>::Vector Vector2;
-  typedef typename Tensor<real, DType1>::SparseMatrix SparseMatrix1;
-  typedef typename Tensor<real, DType2>::SparseMatrix SparseMatrix2;
-
-  Compare2Function(const std::string& name1,
-                   const std::string& name2,
-                   const FuncConfig& config)
-      : function1_(FunctionBase::funcRegistrar_.createByType(name1)),
-        function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
-    function1_->init(config);
-    function2_->init(config);
-    initArgsCallback_ = nullptr;
-  }
-
-  ~Compare2Function() {}
-
-  // input need only contains shape, do not contains data.
-  void addInputs(const BufferArg& input) {
-    size_t size =
-        input.shape().getElements() * sizeOfValuType(input.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    func1Inputs_.emplace_back(std::make_shared<BufferArg>(
-        func1Memory_.back()->getBuf(), input.valueType(), input.shape()));
-    func2Inputs_.emplace_back(std::make_shared<BufferArg>(
-        func2Memory_.back()->getBuf(), input.valueType(), input.shape()));
-  }
-
-  // assume one copy of sequence is shared by different SequenceArgs
-  void addSequence(const SequenceIdArg& input) {
-    CHECK_EQ(input.shape().ndims(), 1UL);
-    size_t batchSize = input.shape()[0];
-    size_t numSeqs = batchSize / 10 + 1;
-    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(sizeId));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(sizeId));
-    seq1_ = std::make_shared<SequenceIdArg>(func1Memory_.back()->getBuf(),
-                                            TensorShape{numSeqs + 1});
-    seq2_ = std::make_shared<SequenceIdArg>(func2Memory_.back()->getBuf(),
-                                            TensorShape{numSeqs + 1});
-    /// init sequence Id
-    initArg(*seq1_, batchSize);
-
-    copyArg_(*seq1_, *seq2_);
-  }
-
-  void addInputs(const SequenceArg& input) {
-    CHECK_EQ(input.shape().ndims(), 2UL);
-    size_t batchSize = input.shape()[0];
-    if (!seq1_ || !seq2_) {  // sequence not exist
-      addSequence(SequenceIdArg(TensorShape{batchSize}));
-    }
-
-    size_t size =
-        input.shape().getElements() * sizeOfValuType(input.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    /// SequenceArg
-    func1Inputs_.emplace_back(
-        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
-                                      input.valueType(),
-                                      input.shape(),
-                                      *seq1_));
-    func2Inputs_.emplace_back(
-        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
-                                      input.valueType(),
-                                      input.shape(),
-                                      *seq2_));
-  }
-
-  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
-    initArgsCallback_ = callback;
-  }
-
-  // output need only contains shape, do not contains data.
-  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
-    size_t size =
-        output.shape().getElements() * sizeOfValuType(output.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    func1Outputs_.emplace_back(
-        std::make_shared<BufferArg>(func1Memory_.back()->getBuf(),
-                                    output.valueType(),
-                                    output.shape(),
-                                    argType));
-    func2Outputs_.emplace_back(
-        std::make_shared<BufferArg>(func2Memory_.back()->getBuf(),
-                                    output.valueType(),
-                                    output.shape(),
-                                    argType));
-  }
-
-  /// add and init output sparse matrix
-  void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
-    sparse1_ = std::make_shared<SparseMatrix1>(
-        output.shape()[0],
-        output.shape()[1],
-        output.nnz(),
-        static_cast<SparseValueType>(output.dataType()),
-        static_cast<SparseFormat>(output.dataFormat()));
-
-    sparse2_ = std::make_shared<SparseMatrix2>(
-        output.shape()[0],
-        output.shape()[1],
-        output.nnz(),
-        static_cast<SparseValueType>(output.dataType()),
-        static_cast<SparseFormat>(output.dataFormat()));
-
-    /// init sparse matrix
-    hl_stream_t stream(HPPL_STREAM_1);
-    sparse1_->randomizeUniform();
-    sparse2_->copyFrom(*sparse1_, stream);
-    hl_stream_synchronize(stream);
-
-    func1Outputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*sparse1_, argType));
-    func2Outputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*sparse2_, argType));
-  }
-
-  void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
-    CHECK_EQ(output.shape().ndims(), 2UL);
-    size_t batchSize = output.shape()[0];
-
-    if (!seq1_ || !seq2_) {  // sequence not exist
-      addSequence(SequenceIdArg(TensorShape{batchSize}));
-    }
-    size_t size =
-        output.shape().getElements() * sizeOfValuType(output.valueType());
-    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
-    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
-
-    /// SequenceArg
-    func1Outputs_.emplace_back(
-        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
-                                      output.valueType(),
-                                      output.shape(),
-                                      *seq1_,
-                                      argType));
-    func2Outputs_.emplace_back(
-        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
-                                      output.valueType(),
-                                      output.shape(),
-                                      *seq2_,
-                                      argType));
-  }
-
-  void addInputs(const SparseMatrixArg& input) {
-    sparse1_ = std::make_shared<SparseMatrix1>(
-        input.shape()[0],
-        input.shape()[1],
-        input.nnz(),
-        static_cast<SparseValueType>(input.dataType()),
-        static_cast<SparseFormat>(input.dataFormat()));
-
-    sparse2_ = std::make_shared<SparseMatrix2>(
-        input.shape()[0],
-        input.shape()[1],
-        input.nnz(),
-        static_cast<SparseValueType>(input.dataType()),
-        static_cast<SparseFormat>(input.dataFormat()));
-
-    /// init sparse matrix
-    hl_stream_t stream(HPPL_STREAM_1);
-    sparse1_->randomizeUniform();
-    sparse2_->copyFrom(*sparse1_, stream);
-    hl_stream_synchronize(stream);
-
-    func1Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse1_));
-    func2Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse2_));
-  }
-
-  void run() {
-    // prepare cpu/gpu arguments
-    initInputs();
-
-    initOutputs();
-    // function calculate
-    auto callFunction = [](FunctionBase* function,
-                           std::vector<BufferArgPtr>& inputs,
-                           std::vector<BufferArgPtr>& outputs) {
-      BufferArgs inArgs;
-      BufferArgs outArgs;
-      for (auto arg : inputs) {
-        inArgs.addArg(*arg);
-      }
-      for (auto arg : outputs) {
-        outArgs.addArg(*arg);
-      }
-      function->calc(inArgs, outArgs);
-    };
-
-    callFunction(function1_.get(), func1Inputs_, func1Outputs_);
-    callFunction(function2_.get(), func2Inputs_, func2Outputs_);
-
-    // check outputs
-    compareOutputs();
-  }
-
-  std::shared_ptr<FunctionBase> getFunction1() const { return function1_; }
-
-  std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
-
-protected:
-  // only init cpu argument, gpu argument copy from cpu argument.
-  void initArg(BufferArg& arg) {
-    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
-    vector.uniform(0.001, 1);
-  }
-
-  void initArg(SequenceArg& arg) {
-    /// init only matrix
-    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
-    vector.uniform(0.001, 1);
-  }
-
-  void initArg(SequenceIdArg& arg, size_t batchSize) {
-    size_t numSeqs = arg.numSeqs();
-    int* buf = reinterpret_cast<int*>(arg.data());
-    int pos = 0;
-    size_t maxLen = 2 * batchSize / numSeqs;
-    for (int i = 0; i < (int)numSeqs; ++i) {
-      int len = 1 + uniformRandom(std::min<int64_t>(
-                        maxLen, batchSize - pos - numSeqs + i));
-      buf[i] = pos;
-      pos += len;
-      VLOG(1) << " len=" << len;
-    }
-    buf[numSeqs] = batchSize;
-  }
-
-  void initInputs() {
-    for (size_t i = 0; i < func1Inputs_.size(); i++) {
-      if (func1Inputs_[i]->isSparseArg()) {
-        continue;  /// sparse matrix already init
-      }
-
-      if (func1Inputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*func1Inputs_[i]));
-      } else {
-        initArg(*func1Inputs_[i]);
-      }
-
-      if (initArgsCallback_ != nullptr) {
-        initArgsCallback_(*func1Inputs_[i], i);
-      }
-
-      copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
-    }
-  }
-
-  void initOutputs() {
-    for (size_t i = 0; i < func1Outputs_.size(); i++) {
-      if (func1Outputs_[i]->isSparseArg()) {
-        continue;  /// sparse matrix already init
-      }
-
-      if (func1Outputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*func1Outputs_[i]));
-      } else {
-        initArg(*func1Outputs_[i]);
-      }
-
-      copyArg_(*func1Outputs_[i], *func2Outputs_[i]);
-    }
-  }
-
-  void compareOutputs() {
-    for (size_t i = 0; i < func1Outputs_.size(); i++) {
-      // TODO, Need a BufferCheck used to compare the two buffers.
-      const auto cpu = func1Outputs_[i];
-      const auto gpu = func2Outputs_[i];
-      CHECK_EQ(cpu->numElements(), gpu->numElements());
-      Vector1 cpuVector(cpu->numElements(), (real*)cpu->data());
-      Vector2 gpuVector(gpu->numElements(), (real*)gpu->data());
-      autotest::TensorCheckErr(cpuVector, gpuVector);
-    }
-  }
-
-protected:
-  std::shared_ptr<FunctionBase> function1_;
-  std::shared_ptr<FunctionBase> function2_;
-  std::vector<std::shared_ptr<Allocator1>> func1Memory_;
-  std::vector<std::shared_ptr<Allocator2>> func2Memory_;
-  std::vector<BufferArgPtr> func1Inputs_;
-  std::vector<BufferArgPtr> func1Outputs_;
-  std::vector<BufferArgPtr> func2Inputs_;
-  std::vector<BufferArgPtr> func2Outputs_;
-  std::shared_ptr<SparseMatrix1> sparse1_;
-  std::shared_ptr<SparseMatrix2> sparse2_;
-  std::shared_ptr<SequenceIdArg> seq1_;
-  std::shared_ptr<SequenceIdArg> seq2_;
-  test::CopyArgument<DType1, DType2> copyArg_;
-  std::function<void(BufferArg&, size_t)> initArgsCallback_;
-};
-
-class CpuGpuFuncCompare
-    : public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
-public:
-  CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
-      : Compare2Function(name + "-CPU", name + "-GPU", config) {}
-
-  ~CpuGpuFuncCompare() {}
-};
-
-}  // namespace paddle
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
deleted file mode 100644
index 2b7c6f9eab223c8d6a2107ff4605ac6e60295f7d..0000000000000000000000000000000000000000
--- a/paddle/function/GemmConvOp.cpp
+++ /dev/null
@@ -1,522 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOp.h"
-#include "GemmFunctor.h"
-#include "Im2Col.h"
-#include "paddle/math/MemoryHandle.h"
-
-namespace paddle {
-
-/*
- * \brief Forward calculation of convolution.
- */
-template <DeviceType Device>
-class GemmConvFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // TODO(hedaoyuan): Need to define some index macros,
-    // to avoid useing 0 and 1.
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    real beta;
-    if (outputs[0].getArgType() == ADD_TO) {
-      beta = 1.0;
-    } else {
-      beta = 0.0;
-    }
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
-    TensorShape colShape;
-    real* colData = NULL;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-      resizeBuffer<Device>(colShape.getElements());
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Im2ColFunctor<kCFO, Device, real> im2col;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    for (size_t i = 0; i < batchSize; i++) {
-      for (size_t g = 0; g < groups_; g++) {
-        if (needIm2col) {
-          im2col(inputData + g * inputOffset,
-                 imShape,
-                 colData,
-                 colShape,
-                 strideH(),
-                 strideW(),
-                 paddingH(),
-                 paddingW(),
-                 dilationH(),
-                 dilationW());
-        } else {
-          colData = inputData + g * inputOffset;
-        }
-        int M = outputChannels / groups_;
-        int N = outputHeight * outputWidth;
-        int K = inputChannels / groups_ * filterHeight * filterWidth;
-        BlasGemm<Device, real>::compute(false,
-                                        false,
-                                        M,
-                                        N,
-                                        K,
-                                        1.0f,
-                                        filterData + g * filterOffset,
-                                        K,
-                                        colData,
-                                        N,
-                                        beta,
-                                        outputData + g * outputOffset,
-                                        N);
-      }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifdef PADDLE_MOBILE_INFERENCE
-
-/*
- * \brief Forward calculation of convolution, optimized for mobile.
- */
-template <DeviceType Device>
-class GemmConvMobileFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // TODO(hedaoyuan): Need to define some index macros,
-    // to avoid useing 0 and 1.
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    real beta;
-    if (outputs[0].getArgType() == ADD_TO) {
-      beta = 1.0;
-    } else {
-      beta = 0.0;
-    }
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-    real* colData = NULL;
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-    TensorShape colShape;
-
-    // Max col matrix width 4096, Max col matrix size 4M.
-    size_t outputHeightSteps =
-        std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
-    size_t maxColWidth = outputHeightSteps * outputWidth;
-    size_t channelSteps =
-        std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
-                          (size_t)1),
-                 inputChannels / groups_);
-    size_t maxColHeight = channelSteps * filterHeight * filterWidth;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-
-      resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Im2ColMobileFunctor<real> im2col;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    int nStride = outputHeight * outputWidth;
-    int kStride = inputChannels / groups_ * filterHeight * filterWidth;
-    for (size_t i = 0; i < batchSize; i++) {
-      filterData = inputs[1].data<real>();
-      for (size_t g = 0; g < groups_; g++) {
-        if (needIm2col) {
-          real beta_ = beta;
-          for (size_t ic = 0; ic < inputChannels / groups_;
-               ic += channelSteps) {
-            int channels = std::min(inputChannels / groups_ - ic, channelSteps);
-            for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
-              int height = std::min(outputHeight - oh, outputHeightSteps);
-
-              int M = outputChannels / groups_;
-              int N = height * outputWidth;
-              int K = channels * filterHeight * filterWidth;
-              // im2col
-              im2col(inputData,
-                     imShape,
-                     colData,
-                     colShape,
-                     strideH(),
-                     strideW(),
-                     paddingH(),
-                     paddingW(),
-                     dilationH(),
-                     dilationW(),
-                     channels,
-                     oh,
-                     height,
-                     N);
-
-              // gemm
-              BlasGemm<Device, real>::compute(
-                  false,
-                  false,
-                  M,
-                  N,
-                  K,
-                  1.0f,
-                  filterData + ic * filterHeight * filterWidth,
-                  kStride,
-                  colData,
-                  N,
-                  beta_,
-                  outputData + oh * outputWidth,
-                  nStride);
-            }
-            beta_ = 1.0;
-          }
-        } else {
-          int M = outputChannels / groups_;
-          int N = outputHeight * outputWidth;
-          int K = inputChannels / groups_ * filterHeight * filterWidth;
-          BlasGemm<Device, real>::compute(false,
-                                          false,
-                                          M,
-                                          N,
-                                          K,
-                                          1.0f,
-                                          filterData,
-                                          K,
-                                          inputData,
-                                          N,
-                                          beta,
-                                          outputData,
-                                          N);
-        }
-        inputData += inputOffset;
-        outputData += outputOffset;
-        filterData += filterOffset;
-      }
-    }
-
-    memory_.reset();
-  }
-};
-
-#endif
-
-/*
- * \brief Backward input calculation of convolution.
- */
-template <DeviceType Device>
-class GemmConvGradInputFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    // Since the implementation of Col2ImFunctor is ADD_TO,
-    // this function only supports ADD_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& input = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* outputGrad = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* inputGrad = outputs[0].data<real>();
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
-    TensorShape colShape;
-    real* colData = NULL;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-      resizeBuffer<Device>(colShape.getElements());
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Col2ImFunctor<kCFO, Device, real> col2im;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    for (size_t i = 0; i < batchSize; i++) {
-      for (size_t g = 0; g < groups_; g++) {
-        int K = outputChannels / groups_;
-        int N = outputHeight * outputWidth;
-        int M = inputChannels / groups_ * filterHeight * filterWidth;
-        real scale = 0.0f;
-        if (!needIm2col) {
-          colData = inputGrad + g * inputOffset;
-          scale = 1.0f;
-        }
-        BlasGemm<Device, real>::compute(true,
-                                        false,
-                                        M,
-                                        N,
-                                        K,
-                                        1.0f,
-                                        filterData + g * filterOffset,
-                                        M,
-                                        outputGrad + g * outputOffset,
-                                        N,
-                                        scale,
-                                        colData,
-                                        N);
-        if (needIm2col) {
-          col2im(inputGrad + g * inputOffset,
-                 imShape,
-                 colData,
-                 colShape,
-                 strideH(),
-                 strideW(),
-                 paddingH(),
-                 paddingW(),
-                 dilationH(),
-                 dilationW());
-        }
-      }
-      inputGrad += inputChannels * inputHeight * inputWidth;
-      outputGrad += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-/*
- * \brief Backward filter calculation of convolution.
- */
-template <DeviceType Device>
-class GemmConvGradFilterFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-    const TensorShape& output = inputs[0].shape();
-    const TensorShape& input = inputs[1].shape();
-    const TensorShape& filter = outputs[0].shape();
-
-    real beta;
-    if (outputs[0].getArgType() == ADD_TO) {
-      beta = 1.0;
-    } else {
-      beta = 0.0;
-    }
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    real* outputGrad = inputs[0].data<real>();
-    real* inputData = inputs[1].data<real>();
-    real* filterGrad = outputs[0].data<real>();
-    bool needIm2col = isNeedIm2col(filter);
-
-    TensorShape imShape =
-        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
-    TensorShape colShape;
-    real* colData = NULL;
-
-    if (needIm2col) {
-      colShape = TensorShape({inputChannels / groups_,
-                              filterHeight,
-                              filterWidth,
-                              outputHeight,
-                              outputWidth});
-      resizeBuffer<Device>(colShape.getElements());
-      colData = reinterpret_cast<real*>(memory_->getBuf());
-    }
-
-    Im2ColFunctor<kCFO, Device, real> im2col;
-    size_t inputOffset = imShape.getElements();
-    size_t outputOffset =
-        (outputChannels / groups_) * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-    for (size_t i = 0; i < batchSize; i++) {
-      for (size_t g = 0; g < groups_; g++) {
-        if (needIm2col) {
-          im2col(inputData + g * inputOffset,
-                 imShape,
-                 colData,
-                 colShape,
-                 strideH(),
-                 strideW(),
-                 paddingH(),
-                 paddingW(),
-                 dilationH(),
-                 dilationW());
-        } else {
-          colData = inputData + g * inputOffset;
-        }
-        int M = outputChannels / groups_;
-        int K = outputHeight * outputWidth;
-        int N = inputChannels / groups_ * filterHeight * filterWidth;
-        BlasGemm<Device, real>::compute(false,
-                                        true,
-                                        M,
-                                        N,
-                                        K,
-                                        1.0f,
-                                        outputGrad + g * outputOffset,
-                                        K,
-                                        colData,
-                                        K,
-                                        i == 0 ? beta : 1.0f,
-                                        filterGrad + g * filterOffset,
-                                        N);
-      }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputGrad += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifdef PADDLE_MOBILE_INFERENCE
-REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
-#else
-REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
-#endif
-REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
-REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
-REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
-REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/GemmFunctor.cpp b/paddle/function/GemmFunctor.cpp
deleted file mode 100644
index 0b1fe1b67d8fd6caf86a08bc05e250b1936e9f85..0000000000000000000000000000000000000000
--- a/paddle/function/GemmFunctor.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GemmFunctor.h"
-#include "paddle/math/MathFunctions.h"
-
-namespace paddle {
-
-template <class T>
-struct BlasGemm<DEVICE_TYPE_CPU, T> {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc) {
-#ifdef PADDLE_USE_EIGEN_FOR_BLAS
-    EigenBlasGemm<T>::compute(
-        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
-#else
-    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
-            transB == false ? CblasNoTrans : CblasTrans,
-            M,
-            N,
-            K,
-            alpha,
-            A,
-            lda,
-            B,
-            ldb,
-            beta,
-            C,
-            ldc);
-#endif
-  }
-};
-
-template <class T>
-struct BlasGemm<DEVICE_TYPE_GPU, T> {
-  static void compute(const bool transA,
-                      const bool transB,
-                      const int M,
-                      const int N,
-                      const int K,
-                      const T alpha,
-                      const T* A,
-                      const int lda,
-                      const T* B,
-                      const int ldb,
-                      const T beta,
-                      T* C,
-                      const int ldc) {
-    hl_matrix_mul((T*)A,
-                  transA == false ? HPPL_OP_N : HPPL_OP_T,
-                  (T*)B,
-                  transB == false ? HPPL_OP_N : HPPL_OP_T,
-                  C,
-                  M,
-                  N,
-                  K,
-                  alpha,
-                  beta,
-                  lda,
-                  ldb,
-                  ldc);
-  }
-};
-
-template struct BlasGemm<DEVICE_TYPE_CPU, real>;
-template struct BlasGemm<DEVICE_TYPE_GPU, real>;
-
-}  // namespace paddle
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
deleted file mode 100644
index 6a0778700037c142d62fdb99667403ade806f7c1..0000000000000000000000000000000000000000
--- a/paddle/function/Im2Col.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TensorShape.h"
-#include "TensorType.h"
-#include "neon/neon_util.h"
-
-namespace paddle {
-
-/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
-enum ColFormat { kCFO = 0, kOCF = 1 };
-
-/*
- * \brief Converts the image data of three dimensions(CHW) into a colData of
- *        five dimensions in the Im2ColFunctor calculation,
- *        And in the Col2ImFunctor calculation, it is reversed.
- *
- * \param imData   Image data.
- * \param imShape  The shape of imData,
- *                 [inputChannels, inputHeight, inputWidth].
- * \param colData  Column data.
- * \param colShape The shape of colData.
- *
- * If the template argument Format is kCFO, the shape of colData is:
- * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- * So, it is easy to reshape into a convolution matrix for convolution
- * calculation based on matrix multiplication.
- * The shape of convolution matrix is [height, width], where the height is equal
- * inputChannels * filterHeight * filterWidth, and the width is equal
- * outputHeight * outputWidth.
- *
- * Reshape:
- *     shape of colData           shape of convolution matrix
- *     [inputChannels,
- *      filterHeight,
- *      filterWidth,      ======>      [height, width]
- *      outputHeight,
- *      outputWidth]
- *
- * If the template argument Format is kOCF, the shape of colData is:
- * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- * So, it is easy to reshape into a sequence matrix for rnn calculation.
- * The shape of sequence matrix is [seqLength, stepSize], where the seqLength
- * is equal outputHeight * outputWidth, and the stepSize is equal
- * inputChannels * filterHeight * filterWidth.
- *
- * Reshape:
- *     shape of colData             shape of sequence matrix
- *     [outputHeight,
- *      outputWidth,
- *      inputChannels,    ======>    [seqLength, stepSize]
- *      filterHeight,
- *      filterWidth]
- *
- * \note The caller needs to ensure that imShape.inputChannels is equal to
- *       colShape.inputChannels.
- */
-template <ColFormat Format, DeviceType Device, class T>
-class Im2ColFunctor {
-public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1);
-};
-
-template <ColFormat Format, DeviceType Device, class T>
-class Col2ImFunctor {
-public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1);
-};
-
-template <class T>
-class Im2ColMobileFunctor {
-public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth,
-                  int inputChannels,
-                  int colOffset,
-                  int colOutputHeight,
-                  int colWidth) {
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputWidth = colShape[4];
-
-    for (int ic = 0; ic < inputChannels; ic++) {
-      for (int oh = 0; oh < colOutputHeight; oh++) {
-        T* dstData = colData + oh * outputWidth;
-        for (int fh = 0; fh < filterHeight; fh++) {
-          for (int fw = 0; fw < filterWidth; fw++) {
-            int imRowIdx = (oh + colOffset) * strideHeight +
-                           fh * dilationHeight - paddingHeight;
-            if (imRowIdx < 0 || imRowIdx >= inputHeight) {
-              memset(dstData, 0, outputWidth * sizeof(T));
-            } else {
-              for (int ow = 0; ow < outputWidth; ow++) {
-                int imColIdx =
-                    ow * strideWidth + fw * dilationWidth - paddingWidth;
-                if (imColIdx < 0 || imColIdx >= inputWidth) {
-                  dstData[ow] = T(0);
-                } else {
-                  dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
-                }
-              }
-            }
-            dstData += colWidth;
-          }
-        }
-      }
-      colData += filterHeight * filterWidth * colWidth;
-      imData += inputHeight * inputWidth;
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
deleted file mode 100644
index ad2aed8f3c237cf9c0f7f0dcc4900cac807e25ea..0000000000000000000000000000000000000000
--- a/paddle/function/Im2ColOp.cpp
+++ /dev/null
@@ -1,245 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Im2Col.h"
-
-namespace paddle {
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-          int imColIdx = w * strideWidth + wOffset * dilationWidth;
-          if ((imRowIdx - paddingHeight) < 0 ||
-              (imRowIdx - paddingHeight) >= inputHeight ||
-              (imColIdx - paddingWidth) < 0 ||
-              (imColIdx - paddingWidth) >= inputWidth) {
-            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
-          } else {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            colData[(c * outputHeight + h) * outputWidth + w] =
-                imData[imRowIdx * inputWidth + imColIdx];
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-          int imColIdx = w * strideWidth + wOffset * dilationWidth;
-          if ((imRowIdx - paddingHeight) >= 0 &&
-              (imRowIdx - paddingHeight) < inputHeight &&
-              (imColIdx - paddingWidth) >= 0 &&
-              (imColIdx - paddingWidth) < inputWidth) {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            imData[imRowIdx * inputWidth + imColIdx] +=
-                colData[(c * outputHeight + h) * outputWidth + w];
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float>;
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, double>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-    for (int outputH = 0; outputH < outputHeight; ++outputH) {
-      for (int outputW = 0; outputW < outputWidth; ++outputW) {
-        for (int channel = 0; channel < inputChannels; ++channel) {
-          for (int filterH = 0; filterH < filterHeight; ++filterH) {
-            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset = outputH * strideHeight +
-                                filterH * dilationHeight - paddingHeight;
-              int imColOffset = outputW * strideWidth +
-                                filterW * dilationWidth - paddingWidth;
-              int colDataOffset =
-                  (((outputH * outputWidth + outputW) * inputChannels +
-                    channel) *
-                       filterHeight +
-                   filterH) *
-                      filterWidth +
-                  filterW;
-              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
-                  imColOffset < 0 || imColOffset >= inputWidth) {
-                colData[colDataOffset] = float(0);
-              } else {
-                int imDataOffset =
-                    (channel * inputHeight + imRowOffset) * inputWidth +
-                    imColOffset;
-                colData[colDataOffset] = imData[imDataOffset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight = 1,
-                  int dilationWidth = 1) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-    for (int outputH = 0; outputH < outputHeight; ++outputH) {
-      for (int outputW = 0; outputW < outputWidth; ++outputW) {
-        for (int channel = 0; channel < inputChannels; ++channel) {
-          for (int filterH = 0; filterH < filterHeight; ++filterH) {
-            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset = outputH * strideHeight +
-                                filterH * dilationHeight - paddingHeight;
-              int imColOffset = outputW * strideWidth +
-                                filterW * dilationWidth - paddingWidth;
-              int colDataOffset =
-                  (((outputH * outputWidth + outputW) * inputChannels +
-                    channel) *
-                       filterHeight +
-                   filterH) *
-                      filterWidth +
-                  filterW;
-              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
-                  imColOffset >= 0 && imColOffset < inputWidth) {
-                int imDataOffset =
-                    (channel * inputHeight + imRowOffset) * inputWidth +
-                    imColOffset;
-                imData[imDataOffset] += colData[colDataOffset];
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, float>;
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, double>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, float>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, double>;
-
-}  // namespace paddle
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
deleted file mode 100644
index a944a0ee687fefc5e002096b9c5b869495554167..0000000000000000000000000000000000000000
--- a/paddle/function/Im2ColOpGpu.cu
+++ /dev/null
@@ -1,464 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Im2Col.h"
-#include "hl_device_functions.cuh"
-
-namespace paddle {
-
-template <class T>
-__global__ void im2col(const T* data_im,
-                       int numOuts,
-                       int height,
-                       int width,
-                       int blockH,
-                       int blockW,
-                       int strideH,
-                       int strideW,
-                       int paddingH,
-                       int paddingW,
-                       int dilationH,
-                       int dilationW,
-                       int height_col,
-                       int width_col,
-                       T* data_col) {
-  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < numOuts) {
-    int w_out = index % width_col;
-    index /= width_col;
-    int h_out = index % height_col;
-    int channel_in = index / height_col;
-    int channel_out = channel_in * blockH * blockW;
-    int h_in = h_out * strideH;
-    int w_in = w_out * strideW;
-
-    data_col += (channel_out * height_col + h_out) * width_col + w_out;
-    for (int i = 0; i < blockH; ++i) {
-      for (int j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in + i * dilationH);
-        int cIdx = int(w_in + j * dilationW);
-        if ((rIdx - (int)paddingH) >= (int)height ||
-            (rIdx - (int)paddingH) < 0 ||
-            (cIdx - (int)paddingW) >= (int)width ||
-            (cIdx - (int)paddingW) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in * height - paddingH;
-          cIdx = cIdx - paddingW;
-          *data_col = data_im[rIdx * width + cIdx];
-        }
-        data_col += height_col * width_col;
-      }
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-
-    int numKernels = inputChannels * outputHeight * outputWidth;
-    int blocks = (numKernels + 1024 - 1) / 1024;
-    int blockX = 512;
-    int blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-    im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
-                                                    numKernels,
-                                                    inputHeight,
-                                                    inputWidth,
-                                                    filterHeight,
-                                                    filterWidth,
-                                                    strideHeight,
-                                                    strideWidth,
-                                                    paddingHeight,
-                                                    paddingWidth,
-                                                    dilationHeight,
-                                                    dilationWidth,
-                                                    outputHeight,
-                                                    outputWidth,
-                                                    colData);
-    CHECK_SYNC("Im2ColFunctor GPU failed");
-  }
-};
-
-template <class T>
-__global__ void col2im(size_t n,
-                       const T* data_col,
-                       size_t height,
-                       size_t width,
-                       size_t channels,
-                       size_t blockH,
-                       size_t blockW,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t paddingH,
-                       size_t paddingW,
-                       size_t dilationH,
-                       size_t dilationW,
-                       size_t height_col,
-                       size_t width_col,
-                       T* data_im) {
-  size_t index =
-      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    T val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    int filterH = (blockH - 1) * dilationH + 1;
-    int filterW = (blockW - 1) * dilationW + 1;
-
-    if ((w - (int)paddingW) >= 0 &&
-        (w - (int)paddingW) < (width - 2 * paddingW) &&
-        (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
-      // compute the start and end of the output
-      int w_col_start =
-          (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
-      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
-      int h_col_start =
-          (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
-      int h_col_end = min(int(h / strideH + 1), int(height_col));
-
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int h_k = (h - h_col * strideH);
-          int w_k = (w - w_col * strideW);
-          if (h_k % dilationH == 0 && w_k % dilationW == 0) {
-            h_k /= dilationH;
-            w_k /= dilationW;
-            int c_col =
-                (((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
-                    width_col +
-                w_col;
-            val += data_col[c_col];
-          }
-        }
-      }
-      h -= paddingH;
-      w -= paddingW;
-      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
-              h * (width - 2 * paddingW) + w] += val;
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
- */
-template <class T>
-class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputHeight = colShape[3];
-    int outputWidth = colShape[4];
-
-    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
-                        (inputWidth + 2 * paddingWidth);
-
-    size_t blocks = (numKernels + 1024 - 1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    // To avoid involving atomic operations, we will launch one kernel per
-    // bottom dimension, and then in the kernel add up the top dimensions.
-    col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
-        numKernels,
-        colData,
-        inputHeight + 2 * paddingHeight,
-        inputWidth + 2 * paddingWidth,
-        inputChannels,
-        filterHeight,
-        filterWidth,
-        strideHeight,
-        strideWidth,
-        paddingHeight,
-        paddingWidth,
-        dilationHeight,
-        dilationWidth,
-        outputHeight,
-        outputWidth,
-        imData);
-    CHECK_SYNC("Col2ImFunctor GPU failed");
-  }
-};
-
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
-template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
-template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
-
-template <class T>
-__global__ void im2colOCF(const T* imData,
-                          T* colData,
-                          int inputChannels,
-                          int inputHeight,
-                          int inputWidth,
-                          int filterHeight,
-                          int filterWidth,
-                          int strideHeight,
-                          int strideWidth,
-                          int paddingHeight,
-                          int paddingWidth,
-                          int dilationHeight,
-                          int dilationWidth,
-                          int outputHeight,
-                          int outputWidth) {
-  int swId = blockIdx.x;
-  int shId = blockIdx.y;
-  for (int channelId = threadIdx.z; channelId < inputChannels;
-       channelId += blockDim.z) {
-    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
-      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset =
-            idx * dilationHeight + swId * strideWidth - paddingWidth;
-        int heightOffset =
-            idy * dilationWidth + shId * strideHeight - paddingHeight;
-        int imOffset = widthOffset + heightOffset * inputWidth +
-                       channelId * inputHeight * inputWidth;
-
-        int colOffset = idx + idy * filterWidth +
-                        channelId * filterHeight * filterWidth +
-                        (shId * outputWidth + swId) *
-                            (inputChannels * filterHeight * filterWidth);
-
-        if (heightOffset >= inputHeight || heightOffset < 0 ||
-            widthOffset >= inputWidth || widthOffset < 0) {
-          colData[colOffset] = T(0);
-        } else {
-          colData[colOffset] = imData[imOffset];
-        }
-      }
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-
-    int blockDimX = 0;
-    int blockDimY = 0;
-    if (filterHeight <= 4 && filterWidth <= 4) {
-      blockDimX = 4;
-      blockDimY = 4;
-    } else if (filterHeight <= 8 && filterWidth <= 8) {
-      blockDimX = 8;
-      blockDimY = 8;
-    } else if (filterHeight <= 16 && filterWidth <= 16) {
-      blockDimX = 16;
-      blockDimY = 16;
-    } else {
-      blockDimX = 32;
-      blockDimY = 32;
-    }
-
-    int blockDimZ = 1024 / blockDimX / blockDimY;
-    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
-    dim3 grid(outputWidth, outputHeight);
-    im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
-                                                       colData,
-                                                       inputChannels,
-                                                       inputHeight,
-                                                       inputWidth,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       strideHeight,
-                                                       strideWidth,
-                                                       paddingHeight,
-                                                       paddingWidth,
-                                                       dilationHeight,
-                                                       dilationWidth,
-                                                       outputHeight,
-                                                       outputWidth);
-    CHECK_SYNC("Im2ColFunctor GPU failed");
-  }
-};
-
-template <class T>
-__global__ void col2imOCF(T* imData,
-                          const T* colData,
-                          int inputChannels,
-                          int inputHeight,
-                          int inputWidth,
-                          int filterHeight,
-                          int filterWidth,
-                          int strideHeight,
-                          int strideWidth,
-                          int paddingHeight,
-                          int paddingWidth,
-                          int dilationHeight,
-                          int dilationWidth,
-                          int outputHeight,
-                          int outputWidth) {
-  int swId = blockIdx.x;
-  int shId = blockIdx.y;
-  for (int channelId = threadIdx.z; channelId < inputChannels;
-       channelId += blockDim.z) {
-    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
-      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset =
-            idx * dilationWidth + swId * strideWidth - paddingWidth;
-        int heightOffset =
-            idy * dilationHeight + shId * strideHeight - paddingHeight;
-        int imOffset = widthOffset + heightOffset * inputWidth +
-                       channelId * inputHeight * inputWidth;
-
-        int colOffset = idx + idy * filterWidth +
-                        channelId * filterHeight * filterWidth +
-                        (shId * outputWidth + swId) *
-                            (inputChannels * filterHeight * filterWidth);
-
-        if (heightOffset >= 0 && heightOffset < inputHeight &&
-            widthOffset >= 0 && widthOffset < inputWidth) {
-          paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
-        }
-      }
-    }
-  }
-}
-
-/*
- * imShape = [inputChannels, inputHeight, inputWidth]
- * colShape =
- *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
- */
-template <class T>
-class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(T* imData,
-                  const TensorShape& imShape,
-                  const T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth) {
-    int inputChannels = imShape[0];
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[3];
-    int filterWidth = colShape[4];
-    int outputHeight = colShape[0];
-    int outputWidth = colShape[1];
-
-    int blockDimX = 0;
-    int blockDimY = 0;
-    if (filterHeight <= 4 && filterWidth <= 4) {
-      blockDimX = 4;
-      blockDimY = 4;
-    } else if (filterHeight <= 8 && filterWidth <= 8) {
-      blockDimX = 8;
-      blockDimY = 8;
-    } else if (filterHeight <= 16 && filterWidth <= 16) {
-      blockDimX = 16;
-      blockDimY = 16;
-    } else {
-      blockDimX = 32;
-      blockDimY = 32;
-    }
-
-    int blockDimZ = 1024 / blockDimX / blockDimY;
-    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
-    dim3 grid(outputWidth, outputHeight);
-    col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
-                                                       colData,
-                                                       inputChannels,
-                                                       inputHeight,
-                                                       inputWidth,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       strideHeight,
-                                                       strideWidth,
-                                                       paddingHeight,
-                                                       paddingWidth,
-                                                       dilationHeight,
-                                                       dilationWidth,
-                                                       outputHeight,
-                                                       outputWidth);
-    CHECK_SYNC("Col2ImFunctor GPU failed");
-  }
-};
-
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
-template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
-template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
-
-}  // namespace paddle
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
deleted file mode 100644
index 967c5b91536608364b4181707b843799b1764c3f..0000000000000000000000000000000000000000
--- a/paddle/function/Im2ColTest.cpp
+++ /dev/null
@@ -1,223 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Im2Col.h"
-#include <gtest/gtest.h>
-#include "Function.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/tests/TensorCheck.h"
-
-namespace paddle {
-
-template <DeviceType Device, class T>
-void TestIm2ColFunctor() {
-  for (size_t channels : {1, 5, 32}) {
-    for (size_t inputHeight : {5, 33, 100}) {
-      for (size_t inputWidth : {5, 32, 96}) {
-        for (size_t filterHeight : {1, 5}) {
-          for (size_t filterWidth : {3, 7}) {
-            for (size_t stride : {1, 2}) {
-              for (size_t padding : {0, 1}) {
-                for (size_t dilation : {1, 3}) {
-                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
-                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
-                  if (inputHeight + 2 * padding < filterSizeH ||
-                      inputWidth + 2 * padding < filterSizeW)
-                    break;
-                  if (padding >= filterSizeH || padding >= filterSizeW) break;
-                  size_t outputHeight =
-                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
-                  size_t outputWidth =
-                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
-
-                  TensorShape imShape =
-                      TensorShape({channels, inputHeight, inputWidth});
-                  TensorShape colShape1 = TensorShape({channels,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       outputHeight,
-                                                       outputWidth});
-                  TensorShape colShape2 = TensorShape({outputHeight,
-                                                       outputWidth,
-                                                       channels,
-                                                       filterHeight,
-                                                       filterWidth});
-
-                  size_t height = channels * filterHeight * filterWidth;
-                  size_t width = outputHeight * outputWidth;
-                  VectorPtr input1 =
-                      Vector::create(imShape.getElements(), false);
-                  VectorPtr input2 =
-                      Vector::create(imShape.getElements(), false);
-                  MatrixPtr output1 =
-                      Matrix::create(height, width, false, false);
-                  MatrixPtr output2 =
-                      Matrix::create(width, height, false, false);
-                  input1->uniform(0.001, 1);
-                  input2->copyFrom(*input1);
-
-                  Im2ColFunctor<kCFO, Device, T> im2Col1;
-                  Im2ColFunctor<kOCF, Device, T> im2Col2;
-                  im2Col1(input1->getData(),
-                          imShape,
-                          output1->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  im2Col2(input2->getData(),
-                          imShape,
-                          output2->getData(),
-                          colShape2,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-
-                  // The transposition of the result of ColFormat == kCFO
-                  // is equal to the result of ColFormat == kOCF.
-                  MatrixPtr test;
-                  output2->transpose(test, true);
-                  autotest::TensorCheckErr(*output1, *test);
-
-                  Col2ImFunctor<kCFO, Device, T> col2Im1;
-                  Col2ImFunctor<kOCF, Device, T> col2Im2;
-
-                  col2Im1(input1->getData(),
-                          imShape,
-                          output1->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  col2Im2(input2->getData(),
-                          imShape,
-                          output2->getData(),
-                          colShape2,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  autotest::TensorCheckErr(*input1, *input2);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
-
-#ifdef PADDLE_WITH_CUDA
-
-TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
-
-#endif
-
-template <class T>
-void TestIm2ColMobileFunctor() {
-  for (size_t channels : {32}) {
-    for (size_t inputHeight : {33, 100}) {
-      for (size_t inputWidth : {32, 96}) {
-        for (size_t filterHeight : {5}) {
-          for (size_t filterWidth : {7}) {
-            for (size_t stride : {2}) {
-              for (size_t padding : {1}) {
-                for (size_t dilation : {1, 3}) {
-                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
-                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
-                  if (inputHeight + 2 * padding < filterSizeH ||
-                      inputWidth + 2 * padding < filterSizeW)
-                    break;
-                  if (padding >= filterSizeH || padding >= filterSizeW) break;
-                  size_t outputHeight =
-                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
-                  size_t outputWidth =
-                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
-
-                  TensorShape imShape =
-                      TensorShape({channels, inputHeight, inputWidth});
-                  TensorShape colShape1 = TensorShape({channels,
-                                                       filterHeight,
-                                                       filterWidth,
-                                                       outputHeight,
-                                                       outputWidth});
-
-                  size_t height = channels * filterHeight * filterWidth;
-                  size_t width = outputHeight * outputWidth;
-                  VectorPtr input1 =
-                      Vector::create(imShape.getElements(), false);
-                  VectorPtr input2 =
-                      Vector::create(imShape.getElements(), false);
-                  MatrixPtr output1 =
-                      Matrix::create(height, width, false, false);
-                  MatrixPtr output2 =
-                      Matrix::create(height, width, false, false);
-                  input1->uniform(0.001, 1);
-                  input2->copyFrom(*input1);
-
-                  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
-                  Im2ColMobileFunctor<T> im2Col2;
-                  im2Col1(input1->getData(),
-                          imShape,
-                          output1->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation);
-                  im2Col2(input2->getData(),
-                          imShape,
-                          output2->getData(),
-                          colShape1,
-                          stride,
-                          stride,
-                          padding,
-                          padding,
-                          dilation,
-                          dilation,
-                          channels,
-                          0,
-                          outputHeight,
-                          outputHeight * outputWidth);
-
-                  autotest::TensorCheckEqual(*output1, *output2);
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
-
-}  // namespace paddle
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
deleted file mode 100644
index 90cd4a2b6d1bfb2529e1c966cf7a1fb904a844d7..0000000000000000000000000000000000000000
--- a/paddle/function/MulOp.cpp
+++ /dev/null
@@ -1,347 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MulOp.h"
-#include "GemmFunctor.h"
-#include "paddle/math/SIMDFunctions.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace {
-inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i];
-  }
-}
-
-inline void colVecAddTo(
-    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c;
-  }
-}
-}  // namespace
-
-namespace paddle {
-/// sparse matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuSparseMatrix& out,
-                            const CpuMatrix& a,
-                            const CpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK_EQ(out.getValueType(), FLOAT_VALUE);
-  if (scaleT == 0) {
-    out.zeroMem();
-  }
-  const real* A = a.getData();
-  const real* B = b.getData();
-  real* C = out.getValue();
-  int* rows = out.getRows();
-  int* cols = out.getCols();
-  size_t width = out.getWidth();
-  size_t height = out.getHeight();
-
-  /// SPARSE_CSC, {a any, b not trans}
-  if (out.getFormat() == SPARSE_CSC) {
-    /// b not trans and a any
-    CHECK(!bTrans);
-    size_t m = !aTrans ? a.getWidth() : a.getHeight();
-    for (size_t i = 0; i < width; i++) {
-      size_t start = out.getColStartIdx(i);
-      size_t end = out.getColStartIdx(i + 1);
-      for (size_t j = start; j < end; j++) {
-        real sum = 0;
-        size_t rowIdx = rows[j];
-        for (size_t k = 0; k < m; k++) {
-          sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
-                 B[k * width + i];
-        }
-        C[j] = scaleAB * sum + scaleT * C[j];
-      }
-    }
-    return;
-  }
-
-  /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans}
-  if (out.getFormat() == SPARSE_CSR) {
-    /// a and b can not both transpose
-    CHECK(!(aTrans && bTrans));
-    size_t m = a.getWidth();
-    for (size_t i = 0; i < height; i++) {
-      size_t start = out.getRowStartIdx(i);
-      size_t end = out.getRowStartIdx(i + 1);
-      for (size_t j = start; j < end; j++) {
-        real sum = 0;
-        size_t colIdx = cols[j];
-        for (size_t k = 0; k < m; k++) {
-          sum += (!aTrans ? A[i * m + k] : A[k * height + i]) *
-                 (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]);
-        }
-        C[j] = scaleAB * sum + scaleT * C[j];
-      }
-    }
-    return;
-  }
-}
-
-/// dense matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                            const CpuMatrix& a,
-                            const CpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
-      aTrans,
-      bTrans,
-      out.getHeight(),
-      out.getWidth(),
-      !aTrans ? a.getWidth() : a.getHeight(),
-      scaleAB,
-      a.getData(),
-      a.getStride(),
-      b.getData(),
-      b.getStride(),
-      scaleT,
-      out.getData(),
-      out.getStride());
-}
-
-/// dense matrix (+)= sparse matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                            const CpuSparseMatrix& a,
-                            const CpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  if (scaleT == 0) {
-    out.zeroMem();
-  }
-  const real* B = b.getData();
-  real* C = out.getData();
-  if (out.getWidth() % 32 == 0) {
-    CHECK_EQ((size_t)B % 32, 0UL);
-    CHECK_EQ((size_t)C % 32, 0UL);
-  }
-
-  int* cols = a.getCols();
-  real* values = a.getValue();
-  for (size_t i = 0; i < a.getHeight(); ++i) {
-    const int start = a.getRowStartIdx(i);
-    const int end = a.getRowStartIdx(i + 1);
-    for (int j = start; j < end; ++j) {
-      vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]),
-               !aTrans ? const_cast<CpuMatrix&>(b).getRow(cols[j])
-                       : const_cast<CpuMatrix&>(b).getRow(i),
-               (a.getValueType() == FLOAT_VALUE) ? values[j] : (real)1.0,
-               out.getWidth());
-    }
-  }
-}
-
-/// dense matrix (+)= dense matrix * sparse matrix
-template <>
-void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                            const CpuMatrix& a,
-                            const CpuSparseMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  if (scaleT == 0) {
-    out.zeroMem();
-  }
-  real* A = const_cast<real*>(a.getData());
-  real* B = const_cast<real*>(b.getValue());
-  real* C = out.getData();
-  int* rows = b.getRows();
-  int* cols = b.getCols();
-
-  /// SPARSE_CSC format
-  if (b.getFormat() == SPARSE_CSC) {
-    for (size_t j = 0; j < b.getWidth(); ++j) {
-      int start = b.getColStartIdx(j);
-      int end = b.getColStartIdx(j + 1);
-      for (int i = start; i < end; ++i) {
-        colVecAddTo(!bTrans ? C + j : C + rows[i],
-                    !bTrans ? A + rows[i] : A + j,
-                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
-                    out.getHeight(),
-                    out.getWidth(),
-                    a.getWidth());
-      }
-    }
-    return;
-  }
-
-  /// SPARSE_CSR format
-  if (b.getFormat() == SPARSE_CSR) {
-    for (size_t j = 0; j < b.getHeight(); ++j) {
-      int start = b.getRowStartIdx(j);
-      int end = b.getRowStartIdx(j + 1);
-      for (int i = start; i < end; ++i) {
-        colVecAddTo(!bTrans ? C + cols[i] : C + j,
-                    !bTrans ? A + j : A + cols[i],
-                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
-                    out.getHeight(),
-                    out.getWidth(),
-                    a.getWidth());
-      }
-    }
-    return;
-  }
-}
-
-/**
- * mul operator
- * out = scaleT * out + scaleAB * (A * B)
- * here, scaleT in {0, 1}, scaleAB == 1,
- * out = A * B, ASSIGN_TO
- * out += A * B, ADD_TO
- *
- *
- * \param outputs[0]      output matrix (out), M * N,
- *                        could be either Sparse or Dense Matrix
- *                        M is num of rows, N is num of columns
- * \param inputs[0]       first input matrix (A),  M * K (if non-trans)
- *                        could be either Sparse or Dense Matrix
- *                        M is num of rows, K is num of columns
- * \param inputs[1]       second input matrix (B), K * N (if non-trans)
- *                        could be either Sparse or Dense Matrix
- *                        K is num of rows, N is num of columns
- *
- * Support eight Mul operators, with both GPU and CPU devices
- * For each device, four Mul operators are supported:
- * 1. dense (out) = dense (A) * dense (B)
- * 2. dense (out) = sparse (A) * dense (B)
- *    sparse matrix only support SPARSE_CSR format
- * 3. dense (out) = dense (A) * sparse (B)
- *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
- * 4. sparse (out) = dense (A) * dense (B)
- *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
- *
- */
-template <DeviceType Device>
-class MulFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    aTrans_ = config.get<bool>("aTrans");
-    bTrans_ = config.get<bool>("bTrans");
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK(!aTrans_ || !bTrans_)
-        << "Not support both a and b are transpose matrices";
-
-    CHECK_EQ((size_t)2, inputs.size());
-    CHECK_EQ((size_t)1, outputs.size());
-    CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-
-    size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1];
-    size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0];
-    size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1];
-    size_t bCol = !bTrans_ ? inputs[1].shape()[1] : inputs[1].shape()[0];
-    /// C = A * B, or C += A * B, for matrix format
-    CHECK_EQ(aCol, bRow);
-    CHECK_EQ(aRow, outputs[0].shape()[0]);
-    CHECK_EQ(bCol, outputs[0].shape()[1]);
-
-    /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO)
-    real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0;
-
-    /// support dense = not both sparse * sparse
-    /// or sparse = dense * dense
-    CHECK((!outputs[0].isSparseArg() &&
-           !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) ||
-          (outputs[0].isSparseArg() && !inputs[0].isSparseArg() &&
-           !inputs[1].isSparseArg()));
-
-    auto outMat = outputs[0].matrix<Device>();
-    /// dense matrix = dense matrix * dense matrix
-    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
-        !outputs[0].isSparseArg()) {
-      MulOp<Device>(outMat,
-                    inputs[0].matrix<Device>(),
-                    inputs[1].matrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-
-    /// dense matrix = dense matrix * sparse matrix
-    if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() &&
-        !outputs[0].isSparseArg()) {
-      CHECK(!aTrans_) << "Not supported a transpose";
-      MulOp<Device>(outMat,
-                    inputs[0].matrix<Device>(),
-                    inputs[1].sparse().SparseMatrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-
-    /// dense matrix = sparse matrix * dense matrix
-    if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
-        !outputs[0].isSparseArg()) {
-      CHECK(!bTrans_) << "Not supported b transpose";
-      CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR)
-          << "Only supported SPARSE_CSR format for sparse matrix a";
-      MulOp<Device>(outMat,
-                    inputs[0].sparse().SparseMatrix<Device>(),
-                    inputs[1].matrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-
-    /// sparse matrix = dense matrix * dense matrix
-    auto outSparseMat = outputs[0].sparse().SparseMatrix<Device>();
-    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
-        outputs[0].isSparseArg()) {
-      MulOp<Device>(outSparseMat,
-                    inputs[0].matrix<Device>(),
-                    inputs[1].matrix<Device>(),
-                    1.0,  // scaleAB
-                    scaleT,
-                    aTrans_,
-                    bTrans_);
-      return;
-    }
-  }
-
-private:
-  bool aTrans_;
-  bool bTrans_;
-};
-
-REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
-#endif
-}  // namespace paddle
diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h
deleted file mode 100644
index e6057be4e54b3cc2b3502b9a93825d4b53037c91..0000000000000000000000000000000000000000
--- a/paddle/function/MulOp.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Function.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-
-namespace paddle {
-/// CPU, dense matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(CpuMatrix& out,
-           const CpuMatrix& a,
-           const CpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// CPU, dense matrix (+)= sparse matrix * dense matrix
-template <DeviceType DType>
-void MulOp(CpuMatrix& out,
-           const CpuSparseMatrix& a,
-           const CpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// CPU, dense matrix (+)= dense matrix * sparse matrix
-template <DeviceType DType>
-void MulOp(CpuMatrix& out,
-           const CpuMatrix& a,
-           const CpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// CPU, sparse matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(CpuSparseMatrix& out,
-           const CpuMatrix& a,
-           const CpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, dense matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(GpuMatrix& out,
-           const GpuMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, dense matrix (+)= sparse matrix * dense matrix
-template <DeviceType DType>
-void MulOp(GpuMatrix& out,
-           const GpuSparseMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, dense matrix (+)= dense matrix * sparse matrix
-template <DeviceType DType>
-void MulOp(GpuMatrix& out,
-           const GpuMatrix& a,
-           const GpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-/// GPU, sparse matrix (+)= dense matrix * dense matrix
-template <DeviceType DType>
-void MulOp(GpuSparseMatrix& out,
-           const GpuMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT,
-           bool aTrans,
-           bool bTrans);
-
-}  // namespace paddle
diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu
deleted file mode 100644
index d63416a8e45346089bac23100742b8afc99b8e77..0000000000000000000000000000000000000000
--- a/paddle/function/MulOpGpu.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MulOp.h"
-#include "hl_base.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-
-namespace paddle {
-/// dense matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                            const GpuMatrix& a,
-                            const GpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  hl_matrix_mul(const_cast<real*>(a.getData()),
-                !aTrans ? HPPL_OP_N : HPPL_OP_T,
-                const_cast<real*>(b.getData()),
-                !bTrans ? HPPL_OP_N : HPPL_OP_T,
-                const_cast<real*>(out.getData()),
-                out.getHeight(),
-                out.getWidth(),
-                !aTrans ? a.getWidth() : a.getHeight(),
-                scaleAB,
-                scaleT,
-                a.getStride(),
-                b.getStride(),
-                out.getStride());
-}
-
-/// dense matrix (+)= sparse matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                            const GpuSparseMatrix& a,
-                            const GpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(out.isContiguous());
-  CHECK(b.isContiguous());
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  hl_matrix_csr_mul_dense(a.sMatrix_.get(),
-                          aTrans ? HPPL_OP_T : HPPL_OP_N,
-                          const_cast<real*>(b.getData()),
-                          HPPL_OP_N,
-                          const_cast<real*>(out.getData()),
-                          out.getHeight(),
-                          out.getWidth(),
-                          b.getHeight(),
-                          scaleAB,
-                          scaleT);
-}
-
-/// dense matrix (+)= dense matrix * sparse matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                            const GpuMatrix& a,
-                            const GpuSparseMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(out.isContiguous());
-  CHECK(a.isContiguous());
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-
-  if (b.format_ == SPARSE_CSC) {
-    hl_matrix_dense_mul_csc(const_cast<real*>(a.getData()),
-                            HPPL_OP_N,
-                            b.sMatrix_.get(),
-                            bTrans ? HPPL_OP_T : HPPL_OP_N,
-                            const_cast<real*>(out.getData()),
-                            out.getHeight(),
-                            out.getWidth(),
-                            a.getWidth(),
-                            scaleAB,
-                            scaleT);
-  } else {
-    hl_matrix_dense_mul_csr(const_cast<real*>(a.getData()),
-                            HPPL_OP_N,
-                            b.sMatrix_.get(),
-                            bTrans ? HPPL_OP_T : HPPL_OP_N,
-                            const_cast<real*>(out.getData()),
-                            out.getHeight(),
-                            out.getWidth(),
-                            a.getWidth(),
-                            scaleAB,
-                            scaleT);
-  }
-}
-
-/// sparse matrix (+)= dense matrix * dense matrix
-template <>
-void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
-                            const GpuMatrix& a,
-                            const GpuMatrix& b,
-                            real scaleAB,
-                            real scaleT,
-                            bool aTrans,
-                            bool bTrans) {
-  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
-  hl_sparse_matrix_mul(const_cast<real*>(a.getData()),
-                       aTrans ? HPPL_OP_T : HPPL_OP_N,
-                       const_cast<real*>(b.getData()),
-                       bTrans ? HPPL_OP_T : HPPL_OP_N,
-                       out.sMatrix_.get(),
-                       out.getHeight(),
-                       out.getWidth(),
-                       !bTrans ? b.getHeight() : b.getWidth(),
-                       scaleAB,
-                       scaleT);
-}
-
-}  // namespace paddle
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
deleted file mode 100644
index 4e1ebd749c0cd083c025e43a321d6992a11786ff..0000000000000000000000000000000000000000
--- a/paddle/function/MulOpTest.cpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "FunctionTest.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/math/tests/test_matrixUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-/**
- *  C += A * B, A, B, C dense matrix
- *  dense = dense * dense
- */
-void testFuncDDDMatrix(
-    bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) {
-  real scaleT = 1.0;
-  size_t heightA = (transa == false) ? dimM : dimK;
-  size_t widthA = (transa == false) ? dimK : dimM;
-  size_t heightB = (transb == false) ? dimK : dimN;
-  size_t widthB = (transb == false) ? dimN : dimK;
-  size_t heightC = dimM;
-  size_t widthC = dimN;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
-  // prepare input arguments
-  /// matrix A : HA * WA
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
-  /// matrix B: HB * WB
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
-
-  /// output matrix C: HC * WC
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
-                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MulOp, DDDMatrixMul) {
-  LOG(INFO) << "function test for dense = dense * dense matrix";
-  for (const auto transa : {false, true}) {
-    for (const auto transb : {false, true}) {
-      for (const auto dimM : {1, 10, 100}) {
-        for (const auto dimN : {1, 10}) {
-          for (const auto dimK : {8}) {
-            if (transa && transb) {
-              continue;
-            }
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " transa=" << transa << " transb=" << transb
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK;
-            testFuncDDDMatrix(transa, transb, dimM, dimN, dimK);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * C += A * B, B, C dense, A sparse
- * dense = sparse * dense
- */
-void testFuncDSparseDMatrix(
-    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
-  real scaleT = 1.0;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
-  // prepare input arguments
-  /// sparse matrix A : M * K
-  test.addInputs(SparseMatrixArg(
-      VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
-  /// matrix B: K * N
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
-
-  /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
-                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MuLOp, DSparseDMul) {
-  LOG(INFO) << "function test for dense = sparse * dense matrix";
-  for (const auto dimM : {10, 100, 1000}) {
-    for (const auto dimN : {10, 100}) {
-      for (const auto dimK : {3, 10}) {
-        for (const auto nnz : {3, 10}) {
-          for (const auto FORMAT : {SPARSE_CSR}) {
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK
-                    << " nnz=" << std::setw(5) << nnz
-                    << " format=" << std::setw(5) << FORMAT;
-            testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * C += A * B, A, C dense, B sparse
- * dense = dense * sparse
- */
-void testFuncDDSparseMatrix(
-    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
-  real scaleT = 1.0;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
-  // prepare input arguments
-  /// matrix A : M * K
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
-
-  /// matrix B: K * N
-  test.addInputs(SparseMatrixArg(
-      VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
-
-  /// output matrix C: M * N
-  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
-                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MulOp, DDSparseMul) {
-  LOG(INFO) << "function test for dense = dense * sparse matrix";
-  for (const auto dimM : {10, 100, 1000}) {
-    for (const auto dimN : {10, 100}) {
-      for (const auto dimK : {3, 10}) {
-        for (const auto nnz : {3, 10}) {
-          for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) {
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK
-                    << " nnz=" << std::setw(5) << nnz
-                    << " format=" << std::setw(5) << FORMAT;
-            testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * C += A * B, A sparse, B, C dense
- * sparse = dense * dense
- */
-void testFuncSparseDDMatrix(
-    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
-  real scaleT = 1.0;
-  // init Test object
-  CpuGpuFuncCompare test(
-      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
-  // prepare input arguments
-  /// matrix A : M * K
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
-
-  /// matrix B: K * N
-  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
-
-  /// output sparse matrix C: M * N
-  test.addOutputs(
-      SparseMatrixArg(
-          VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
-      scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
-  // run Function
-  test.run();
-}
-
-TEST(MulOp, SparseDDMul) {
-  LOG(INFO) << "function test for sparse = dense * dense matrix";
-  for (const auto dimM : {10, 100, 1000}) {
-    for (const auto dimN : {10, 100}) {
-      for (const auto dimK : {3, 10}) {
-        for (const auto nnz : {3, 10}) {
-          for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) {
-            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
-                    << " dimM=" << std::setw(5) << dimM
-                    << " dimN=" << std::setw(5) << dimN
-                    << " dimK=" << std::setw(5) << dimK
-                    << " nnz=" << std::setw(5) << nnz
-                    << " format=" << std::setw(5) << FORMAT;
-            testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT);
-          }
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp
deleted file mode 100644
index 22d3b33d0f4a730691234c6c742978abd72294a6..0000000000000000000000000000000000000000
--- a/paddle/function/NaiveConvOp.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOp.h"
-
-namespace paddle {
-
-/*
- * The three arguments are stored in memory in row major order.
- * inputData  = [batchSize, inputChannels, inputHeight, inputWidth]
- * filterData = [outputChannels, inputChannels, filterHeight, filterWidth]
- * outputData = [batchSize, outputChannels, outputHeight, outputWidth]
- */
-template <class T>
-class NaiveConvFunctor {
-public:
-  void operator()(const T* inputData,
-                  size_t batchSize,
-                  size_t inputChannels,
-                  size_t inputHeight,
-                  size_t inputWidth,
-                  const T* filterData,
-                  size_t filterHeight,
-                  size_t filterWidth,
-                  T* outputData,
-                  size_t outputChannels,
-                  size_t outputHeight,
-                  size_t outputWidth,
-                  size_t paddingH,
-                  size_t paddingW,
-                  size_t strideH,
-                  size_t strideW) {
-    for (size_t batch = 0; batch < batchSize; batch++) {
-      for (size_t outC = 0; outC < outputChannels; outC++) {
-        for (size_t outH = 0; outH < outputHeight; outH++) {
-          for (size_t outW = 0; outW < outputWidth; outW++) {
-            const int inStartH = (outH * strideH) - paddingH;
-            const int inStartW = (outW * strideW) - paddingW;
-            T outValue = (T)0;
-            for (size_t inC = 0; inC < inputChannels; inC++) {
-              for (size_t fH = 0; fH < filterHeight; fH++) {
-                for (size_t fW = 0; fW < filterWidth; fW++) {
-                  T inValue;
-                  const int inH = inStartH + fH;
-                  const int inW = inStartW + fW;
-                  if ((inH >= 0 && inH < (int)inputHeight) &&
-                      (inW >= 0 && inW < (int)inputWidth)) {
-                    size_t offsetInput =
-                        batch * inputChannels * inputHeight * inputWidth +
-                        inC * inputHeight * inputWidth + inH * inputWidth + inW;
-                    inValue = inputData[offsetInput];
-                  } else {
-                    inValue = (T)0;
-                  }
-                  size_t offsetFilter =
-                      outC * inputChannels * filterHeight * filterWidth +
-                      inC * filterHeight * filterWidth + fH * filterWidth + fW;
-                  T filterValue = filterData[offsetFilter];
-                  outValue += (inValue * filterValue);
-                }
-              }
-            }
-
-            size_t offset =
-                batch * outputChannels * outputHeight * outputWidth +
-                outC * outputHeight * outputWidth + outH * outputWidth + outW;
-            outputData[offset] = outValue;
-          }
-        }
-      }
-    }
-  }
-};
-
-template <DeviceType Device>
-class NaiveConvFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    check(inputs, outputs);
-
-    size_t batchSize = inputs[0].shape()[0];
-    size_t inputChannels = inputs[0].shape()[1];
-    size_t inputHeight = inputs[0].shape()[2];
-    size_t inputWidth = inputs[0].shape()[3];
-    size_t filterHeight = inputs[1].shape()[2];
-    size_t filterWidth = inputs[1].shape()[3];
-    size_t outputChannels = outputs[0].shape()[1];
-    size_t outputHeight = outputs[0].shape()[2];
-    size_t outputWidth = outputs[0].shape()[3];
-
-    real* inputData = inputs[0].data<real>();
-    real* filterData = inputs[1].data<real>();
-    real* outputData = outputs[0].data<real>();
-    NaiveConvFunctor<real> conv;
-    conv(inputData,
-         batchSize,
-         inputChannels,
-         inputHeight,
-         inputWidth,
-         filterData,
-         filterHeight,
-         filterWidth,
-         outputData,
-         outputChannels,
-         outputHeight,
-         outputWidth,
-         paddingH(),
-         paddingW(),
-         strideH(),
-         strideW());
-  }
-};
-
-REGISTER_TYPED_FUNC(NaiveConv, CPU, NaiveConvFunction);
-
-}  // namespace paddle
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
deleted file mode 100644
index db6dd518ca5df9d852e545b37f61f1141c81f57c..0000000000000000000000000000000000000000
--- a/paddle/function/PadOp.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PadOp.h"
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void Pad<DEVICE_TYPE_CPU>(real* outputs,
-                          const real* inputs,
-                          const int num,
-                          const int inC,
-                          const int inH,
-                          const int inW,
-                          const PadConf& pad) {
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  for (int i = 0; i < num; i++) {
-    for (int c = 0; c < inC; c++) {
-      for (int h = 0; h < inH; h++) {
-        int inoff = ((i * inC + c) * inH + h) * inW;
-        int outoff =
-            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
-        memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real));
-      }
-    }
-  }
-}
-
-template <>
-void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
-                              const real* outGrad,
-                              const int num,
-                              const int inC,
-                              const int inH,
-                              const int inW,
-                              const PadConf& pad) {
-  int cstart = pad.channel[0], cend = pad.channel[1];
-  int hstart = pad.height[0], hend = pad.height[1];
-  int wstart = pad.width[0], wend = pad.width[1];
-  int outC = inC + cstart + cend;
-  int outH = inH + hstart + hend;
-  int outW = inW + wstart + wend;
-  for (int i = 0; i < num; i++) {
-    for (int c = 0; c < inC; c++) {
-      for (int h = 0; h < inH; h++) {
-        int inoff = ((i * inC + c) * inH + h) * inW;
-        int outoff =
-            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
-        CpuVector inG = CpuVector(inW, inGrad + inoff);
-        CpuVector outG = CpuVector(inW, const_cast<real*>(outGrad + outoff));
-        inG += outG;
-      }
-    }
-  }
-}
-
-static inline PadConf castToPadConf(const FuncConfig& conf) {
-  return {conf.get<std::vector<uint32_t>>("channel"),
-          conf.get<std::vector<uint32_t>>("height"),
-          conf.get<std::vector<uint32_t>>("width")};
-}
-
-/**
- * \brief Padding zeros to input according to the specify dimension.
- *        The struct pad_ contains the padding size in each dimension.
- *        The input and output is a 4D tensor. In PadFunc, we only
- *        pad zeros to the 2nd to 4th dimension.
- *
- * Argument in this Function:
- * \param pad_    A struct object contains the padding size in each dimension.
- *                It has six integers. The channelStart and channelEnd indicate
- *                how many zeros to add before and after the input in channel
- *                dimension. And the heightStart and heightEnd indicate padding
- *                in height dimension. The widthStart and widthEnd indicate the
- *                padding in width dimension.
- * \param inputs  A 4D tensor, only one input.
- * \param outputs A 4D tensor, the output value after padding.
- *
- * For example,
- * Input(2,2,2,3) = [
- *                    [ [[1,2,3], [3,4,5]],
- *                      [[2,3,5], [1,6,7]] ],
- *                    [ [[4,3,1], [1,8,7]],
- *                      [[3,8,9], [2,3,5]] ]
- *                  ] # the shape is (1,2,2,3)
- *
- * pad_: if channelStart = channelEnd = 1, others are 0.
- * Output(2,4,2,3) = [
- *                    [ [[0,0,0], [0,0,0]],
- *                      [[1,2,3], [3,4,5]],
- *                      [[2,3,5], [1,6,7]],
- *                      [[0,0,0], [0,0,0]] ],
- *                    [ [[0,0,0], [0,0,0]],
- *                      [[4,3,1], [1,8,7]],
- *                      [[3,8,9], [2,3,5]],
- *                      [[0,0,0], [0,0,0]] ]
- *                   ] # the shape is (2,4,2,3)
- *
- * pad_: if widthStart = 1, widthEnd = 2, others are 0.
- * Output(2,2,2,6) = [
- *                     [ [[0,1,2,3,0,0], [0,3,4,5,0,0]],
- *                       [[0,2,3,5,0,0], [0,1,6,7,0,0]] ],
- *                     [ [[0,4,3,1,0,0], [0,1,8,7,0,0]],
- *                       [[0,3,8,9,0,0], [0,2,3,5,0,0]] ],
- *                   ] # the shape is (2,2,2,6)
- *
- * pad_: if heightStart = 1, heightEnd = 1, others are 0.
- * Output(2,2,4,3) = [
- *                     [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]],
- *                       [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ],
- *                     [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]],
- *                       [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ],
- *                   ] # the shape is (2,2,4,3)
- */
-
-template <DeviceType Device>
-class PadFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    size_t num = inputs[0].shape()[0];
-    size_t inC = inputs[0].shape()[1];
-    size_t inH = inputs[0].shape()[2];
-    size_t inW = inputs[0].shape()[3];
-    typename Tensor<real, Device>::Vector vec(outputs[0].shape().getElements(),
-                                              outputs[0].data<real>());
-    vec.zero();
-
-    Pad<Device>(outputs[0].data<real>(),
-                inputs[0].data<real>(),
-                num,
-                inC,
-                inH,
-                inW,
-                pad_);
-  }
-
-private:
-  PadConf pad_;
-};
-
-/**
- * \brief The backward propagation of padding Function. Remove the elements
- *        in the padding positions of forward.
- *
- * Argument in this Function:
- * \param pad_    The same meaning as it in PadFunc.
- * \param inputs  The gradient with respect to the output value of PadFunc.
- * \param outputs The gradient with respect to the input value of PadFunc.
- */
-
-template <DeviceType Device>
-class PadGradFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-
-    size_t num = outputs[0].shape()[0];
-    size_t inC = outputs[0].shape()[1];
-    size_t inH = outputs[0].shape()[2];
-    size_t inW = outputs[0].shape()[3];
-
-    if (outputs[0].getArgType() != ADD_TO) {
-      // for unit test
-      typename Tensor<real, Device>::Vector tmp(
-          outputs[0].shape().getElements(), outputs[0].data<real>());
-      tmp.zero();
-    }
-
-    PadGrad<Device>(outputs[0].data<real>(),
-                    inputs[0].data<real>(),
-                    num,
-                    inC,
-                    inH,
-                    inW,
-                    pad_);
-  }
-
-private:
-  PadConf pad_;
-};
-
-REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
-REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
-REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
deleted file mode 100644
index 925860346e1a53065b0fe4ccbd26853afc8898a1..0000000000000000000000000000000000000000
--- a/paddle/function/RowConvOp.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RowConvOp.h"
-#include <iostream>
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void RowConv<DEVICE_TYPE_CPU>(CpuMatrix& out,
-                              const CpuMatrix& in,
-                              const CpuMatrix& filter,
-                              const CpuIVector& seq) {
-  const int* starts = seq.getData();
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  for (size_t i = 0; i < numSeq; ++i) {
-    size_t begin = starts[i];
-    size_t end = starts[i + 1];
-    for (size_t j = begin; j < end; ++j) {
-      MatrixPtr x;
-      MatrixPtr w;
-      if ((j + contextLength) < end) {
-        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, contextLength);
-        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, contextLength);
-      } else {
-        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, end - j);
-        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, end - j);
-      }
-      MatrixPtr y = out.subMatrix(j, 1);
-      y->addDotMulVMM(*x, *w);
-    }
-  }
-}
-
-template <>
-void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
-                                  const CpuMatrix& in,
-                                  const CpuMatrix& filter,
-                                  CpuMatrix& inG,
-                                  CpuMatrix& filterG,
-                                  const CpuIVector& seq) {
-  // gradient w.r.t filter
-  const int* starts = seq.getData();
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  if (filterG) {
-    for (size_t i = 0; i < numSeq; ++i) {
-      size_t begin = starts[i];
-      size_t end = starts[i + 1];
-      size_t steps = end - begin;
-      for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
-        MatrixPtr x =
-            (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
-        MatrixPtr dy =
-            (const_cast<CpuMatrix&>(outG)).subMatrix(begin, steps - j);
-        MatrixPtr dw = filterG.subMatrix(j, 1);
-        dw->addDotMulVMM(*dy, *x);
-      }
-    }
-  }
-
-  // gradient w.r.t input feature
-  if (inG) {
-    for (size_t i = 0; i < numSeq; ++i) {
-      size_t begin = starts[i];
-      size_t end = starts[i + 1];
-      size_t steps = end - begin;
-      for (size_t j = 0; j < steps; ++j) {
-        MatrixPtr dx = inG.subMatrix(begin + j, 1);
-        for (size_t t = 0; t < contextLength; ++t) {
-          if (int(j - t) >= 0) {
-            MatrixPtr dy =
-                (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
-            MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
-            dx->addDotMul(*dy, *w, 1.0, 1.0);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief The row convolution is called lookahead convolution. It is firstly
- * introduced in deep-speech2 system. The bidirectional RNN that learns
- * representation for a sequence by performing a forward and a backward pass
- * through the entire sequence. However, unlike unidirectional RNNs,
- * bidirectional RNNs are challenging to deploy in an online and low-latency
- * setting. The lookahead convolution incorporates information from future
- * subsequences in a computationally efficient manner to improve unidirectional
- * recurrent neural networks.
- *
- * The connection of row convolution is different form the 1D sequence
- * convolution. Assumed that, the future context-length is k, that is to say,
- * it can get the output at timestep t by using the the input feature from t-th
- * timestep to (t+k)-th timestep. Assumed that the hidden dim of input
- * activations are d, the activations r_t for the new layer at time-step t are:
- *
- *
- *            -- k + 1
- *  r(t,i) =  >       W(i,j) * h(t+j-1, i),  for (1 <= i <= d)
- *            -- j = 1
- *
- *
- * The weight shape is: (k + 1) x d
- * Function Arguments:
- *
- * \param inputs[0]  The input activations.
- * \param inputs[0]  The filter (or weight) and shape is (k+1) x d.
- * \param outputs[1] The output activations.
- *
- * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in
- * English
- *     and Mandarin. https://arxiv.org/abs/1512.02595
- */
-
-template <DeviceType Device>
-class RowConvFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    // check
-    CHECK_EQ(2UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    // TODO(qingqing): support ASSIGN_TO.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-        << "SequenceArg required here.";
-    const auto in = dynamic_cast<const SequenceArg&>(inputs[0]);
-    auto out = dynamic_cast<const SequenceArg&>(outputs[0]);
-    auto w = inputs[1];
-    CHECK(in.data() && out.data() && in.getSequenceId().data());
-    CHECK_EQ(in.shape().ndims(), 2UL);
-    CHECK(in.shape() == out.shape());
-    CHECK_EQ(w.shape()[1], in.shape()[1]);
-
-    auto outMat = out.matrix<Device>();
-    const auto inMat = in.matrix<Device>();
-    const auto wMat = w.matrix<Device>();
-    const auto seqId = in.getSequenceId().vector<int, Device>();
-
-    RowConv<Device>(outMat, inMat, wMat, seqId);
-  }
-};
-
-/**
- * \brief The backward of row convolution function. This function calculated
- * the gradient w.r.t filter and the gradient w.r.t input activations(or data).
- *
- * Argument in this Function:
- *
- * \param inputs[0]  The gradient w.r.t output activations.
- * \param inputs[1]  The input activations.
- * \param inputs[2]  The filter (or weight) and shape is (k+1) x d.
- * \param outputs[0] The gradient w.r.t input activations.
- * \param outputs[1] The gradient w.r.r filter.
- *
- * Abbreviation:
- * w.r.t: with respect to.
- */
-
-template <DeviceType Device>
-class RowConvGradFunc : public FunctionBase {
-  // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
-public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    // check
-    CHECK_EQ(3UL, inputs.size());
-    CHECK_EQ(2UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
-          outputs[0].isSequenceArg())
-        << "SequenceArg required here.";
-
-    const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
-    const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
-    const auto w = inputs[2];
-    auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
-    auto wGrad = outputs[1];
-
-    CHECK_EQ(in.shape().ndims(), 2UL);
-    CHECK(in.shape() == inGrad.shape());
-    CHECK(in.shape() == outGrad.shape());
-    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
-
-    const auto outGMat = outGrad.matrix<Device>();
-    const auto inMat = in.matrix<Device>();
-    const auto wMat = w.matrix<Device>();
-    auto inGMat = inGrad.data()
-                      ? inGrad.matrix<Device>()
-                      : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    auto wGMat = wGrad.data()
-                     ? wGrad.matrix<Device>()
-                     : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
-    const auto seqId = in.getSequenceId().vector<int, Device>();
-
-    RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
-  }
-};
-
-REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
-REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
-REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu
deleted file mode 100644
index 9d8a6d80bb2c95a9aa8eb3666fd637402c932559..0000000000000000000000000000000000000000
--- a/paddle/function/RowConvOpGpu.cu
+++ /dev/null
@@ -1,368 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RowConvOp.h"
-#include "hl_base.h"
-
-namespace paddle {
-
-template <int BLOCK_H, int BLOCK_W>
-__global__ void KeRowConv(real* y,
-                          const real* x,
-                          const real* w,
-                          const int* starts,
-                          const int height,
-                          const int width,
-                          const int numSeq,
-                          const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sw[BLOCK_H][BLOCK_W];
-
-  for (int i = tidy; i < context; i += blky) {
-    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
-  }
-
-  __syncthreads();
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      real sum = 0;
-      int off = (start + j) * width;
-      for (int t = 0; t < context; ++t) {
-        if ((start + j + t) < end) {
-          int xoff = off + t * width;
-          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
-          sum += sw[t][tidx] * xVal;
-        }
-      }
-      if (gidx + tidx < width) {
-        y[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-__global__ void KeRowConv2(real* y,
-                           const real* x,
-                           const real* w,
-                           const int* starts,
-                           const int height,
-                           const int width,
-                           const int numSeq,
-                           const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      int off = (start + j) * width;
-      real sum = 0;
-      for (int t = 0; t < context && (start + j + t) < end; ++t) {
-        int xoff = off + t * width;
-        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
-        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
-        sum += wd * xd;
-      }
-      if (gidx + tidx < width) {
-        y[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-template <>
-void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
-                              const GpuMatrix& in,
-                              const GpuMatrix& filter,
-                              const GpuIVector& seq) {
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  const size_t height = in.getHeight();
-  const size_t width = in.getWidth();
-
-  real* y = out.getData();
-  const real* x = in.getData();
-  const real* w = filter.getData();
-  const int* starts = seq.getData();
-
-  dim3 dimBlock(32, 32);
-  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-
-  if (contextLength <= 32) {
-    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-        y, x, w, starts, height, width, numSeq, contextLength);
-  } else {
-    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-        y, x, w, starts, height, width, numSeq, contextLength);
-  }
-  CHECK_SYNC("RowConv");
-}
-
-template <int BLOCK_H, int BLOCK_W, int CONTEXT>
-__global__ void KeRowConvBwWeight(real* dw,
-                                  const real* x,
-                                  const real* dy,
-                                  const int* starts,
-                                  const int height,
-                                  const int width,
-                                  const int numSeq,
-                                  const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sh_x[BLOCK_W][BLOCK_H];
-  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
-  __shared__ real sh_dw[CONTEXT][BLOCK_W];
-
-  if (tidy < context) {
-    sh_dw[tidy][tidx] = 0.0;
-  }
-  __syncthreads();
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
-    for (int j = tidy; j < size; j += BLOCK_H) {
-      int xoff = gidx + tidx;
-      int yoff = start + j;
-
-      // transpose
-      sh_x[tidx][tidy] =
-          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
-      sh_dy[tidx][tidy + context - 1] =
-          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
-      __syncthreads();
-      if (tidy < (context - 1)) {
-        yoff = yoff - context + 1;
-        sh_dy[tidx][tidy] =
-            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
-      }
-      __syncthreads();
-
-      for (int t = 0; t < context; t++) {
-        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
-        __syncthreads();
-        // warp size and blockDim.x is 32.
-        val += __shfl_down(val, 16);
-        val += __shfl_down(val, 8);
-        val += __shfl_down(val, 4);
-        val += __shfl_down(val, 2);
-        val += __shfl_down(val, 1);
-        __syncthreads();
-        if (tidx == 0) {
-          sh_dw[t][tidy] += val;
-        }
-        __syncthreads();
-      }
-    }
-  }
-
-  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
-    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
-  }
-}
-
-template <int BLOCK_H, int BLOCK_W>
-__global__ void KeRowConvBwWeight2(real* dw,
-                                   const real* x,
-                                   const real* dy,
-                                   const int* starts,
-                                   const int height,
-                                   const int width,
-                                   const int numSeq,
-                                   const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sh_x[BLOCK_H][BLOCK_W];
-  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-
-    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
-    for (int j = tidy; j < size; j += BLOCK_H) {
-      int xoff = gidx + tidx;
-      int yoff = start + j;
-
-      // transpose
-      sh_x[tidx][tidy] =
-          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
-      __syncthreads();
-
-      for (int t = 0; t < context; t++) {
-        sh_dy[tidx][tidy] =
-            (xoff < width && (yoff - t) >= start && yoff - t < end)
-                ? dy[(yoff - t) * width + xoff]
-                : 0.0;
-        __syncthreads();
-
-        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
-        __syncthreads();
-        // warp size and blockDim.x is 32.
-        val += __shfl_down(val, 16);
-        val += __shfl_down(val, 8);
-        val += __shfl_down(val, 4);
-        val += __shfl_down(val, 2);
-        val += __shfl_down(val, 1);
-        __syncthreads();
-
-        if (tidx == 0 && (gidx + tidy) < width) {
-          dw[t * width + gidx + tidy] += val;
-        }
-      }
-    }
-  }
-}
-
-template <int BLOCK_H, int BLOCK_W>
-__global__ void KeRowConvBwData(real* dx,
-                                const real* w,
-                                const real* dy,
-                                const int* starts,
-                                const int height,
-                                const int width,
-                                const int numSeq,
-                                const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  __shared__ real sw[BLOCK_H][BLOCK_W];
-
-  for (int i = tidy; i < context; i += blky) {
-    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
-  }
-
-  __syncthreads();
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      real sum = 0;
-      int off = (start + j) * width;
-      for (int t = 0; t < context && (j - t) >= 0; ++t) {
-        int dyOff = off - t * width;
-        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
-        sum += sw[t][tidx] * dyVal;
-      }
-      if (gidx + tidx < width) {
-        dx[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-__global__ void KeRowConvBwData2(real* dx,
-                                 const real* w,
-                                 const real* dy,
-                                 const int* starts,
-                                 const int height,
-                                 const int width,
-                                 const int numSeq,
-                                 const int context) {
-  const int tidx = threadIdx.x;
-  const int tidy = threadIdx.y;
-  const int blky = blockDim.y;
-  const int gidx = blockIdx.x * blockDim.x;
-
-  for (int i = 0; i < numSeq; ++i) {
-    const int start = starts[i];
-    const int end = starts[i + 1];
-    const int steps = end - start;
-    for (int j = tidy; j < steps; j += blky) {
-      real sum = 0;
-      int off = (start + j) * width;
-      for (int t = 0; t < context && (j - t) >= 0; ++t) {
-        int dyOff = off - t * width;
-        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
-        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
-        sum += wVal * dyVal;
-      }
-      if (gidx + tidx < width) {
-        dx[off + gidx + tidx] += sum;
-      }
-    }
-  }
-}
-
-template <>
-void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
-                                  const GpuMatrix& in,
-                                  const GpuMatrix& filter,
-                                  GpuMatrix& inG,
-                                  GpuMatrix& filterG,
-                                  const GpuIVector& seq) {
-  const size_t numSeq = seq.getSize() - 1;
-  const size_t contextLength = filter.getHeight();
-  const size_t height = in.getHeight();
-  const size_t width = in.getWidth();
-
-  const real* dy = outG.getData();
-  const real* x = in.getData();
-  const real* w = filter.getData();
-  const int* starts = seq.getData();
-
-  if (filterG) {
-    dim3 dimBlock(32, 32);
-    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-    real* dw = filterG.getData();
-    if (contextLength <= 32) {
-      KeRowConvBwWeight<32, 32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-          dw, x, dy, starts, height, width, numSeq, contextLength);
-    } else {
-      KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
-          dw, x, dy, starts, height, width, numSeq, contextLength);
-    }
-  }
-
-  if (inG) {
-    real* dx = inG.getData();
-    dim3 dimBlock2(32, 32);
-    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
-    if (contextLength <= 64) {
-      KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
-          dx, w, dy, starts, height, width, numSeq, contextLength);
-    } else {
-      KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
-          dx, w, dy, starts, height, width, numSeq, contextLength);
-    }
-  }
-
-  CHECK_SYNC("RowConvGrad");
-}
-
-}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
deleted file mode 100644
index 6ed6eb2dba477722664ca4a29f4689114f368846..0000000000000000000000000000000000000000
--- a/paddle/function/ScaleSubRegionOp.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ScaleSubRegionOp.h"
-#include "paddle/function/TensorShape.h"
-
-namespace paddle {
-
-template <>
-void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
-                                     const real* inputs,
-                                     const real* indices,
-                                     const TensorShape shape,
-                                     const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
-
-  for (int n = 0; n < number; ++n) {
-    // indices start from 1
-    int offset = n * 6;
-    for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
-      for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
-        for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
-          int idx = ((n * channel + c) * height + h) * width + w;
-          outputs[idx] *= value;
-        }
-      }
-    }
-  }
-}
-
-template <>
-void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
-                                         real* outGrad,
-                                         const real* indices,
-                                         const TensorShape shape,
-                                         const FuncConfig& conf) {
-  real value = conf.get<real>("value");
-
-  int number = shape[0];
-  int channel = shape[1];
-  int height = shape[2];
-  int width = shape[3];
-
-  for (int n = 0; n < number; ++n) {
-    for (int c = 0; c < channel; ++c) {
-      for (int h = 0; h < height; ++h) {
-        for (int w = 0; w < width; ++w) {
-          int idx = ((n * channel + c) * height + h) * width + w;
-          int offset = n * 6;
-          if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
-              h >= (indices[offset + 2] - 1) &&
-              h <= (indices[offset + 3] - 1) &&
-              w >= (indices[offset + 4] - 1) &&
-              w <= (indices[offset + 5] - 1)) {
-            outGrad[idx] += inGrad[idx] * value;
-          } else {
-            outGrad[idx] += inGrad[idx];
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief For each instance, ScaleSubRegion can be used to multiply a value to
- *        a specified sub continuous region. By providing start index and end
- *        index for C/H/W, you can specify the location and shape of the region.
- *
- * Argument in this Function:
- * \param inputs    A 4-D tensor with shape [N, C, H, W], only one input.
- * \param indices   A 2-D tensor with shape [N, 6], indicates the sub region.
- * \param outputs   A 4-D tensor with same shape as inputs, output value.
- */
-template <DeviceType Device>
-class ScaleSubRegionFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(2UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-
-    TensorShape shape = inputs[0].shape();
-
-    ScaleSubRegion<Device>(outputs[0].data<real>(),
-                           inputs[0].data<real>(),
-                           inputs[1].data<real>(),
-                           shape,
-                           conf_);
-  }
-
-private:
-  FuncConfig conf_;
-};
-
-/**
- * \brief The backward propagation of ScaleSubRegion Function.
- *
- * Argument in this Function:
- * \param inputs  A 4-D tensor with shape [N, C, H, W], output gradient.
- * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
- * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
- */
-
-template <DeviceType Device>
-class ScaleSubRegionGradFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override { conf_ = config; }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(2UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    TensorShape shape = inputs[0].shape();
-
-    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
-                               outputs[0].data<real>(),
-                               inputs[1].data<real>(),
-                               shape,
-                               conf_);
-  }
-
-private:
-  FuncConfig conf_;
-};
-
-REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
-REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
-REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp
deleted file mode 100644
index 50e1d6c04c54fed5b847aa10dbb253f00cfa42d4..0000000000000000000000000000000000000000
--- a/paddle/function/SwitchOp.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SwitchOp.h"
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inC,
-                                const int inH,
-                                const int inW,
-                                const int argType) {
-  for (int n = 0; n < num; ++n) {
-    for (int c = 0; c < inC; ++c) {
-      for (int h = 0; h < inH; ++h) {
-        for (int w = 0; w < inW; ++w) {
-          if (argType == ADD_TO) {
-            outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++);
-          } else {
-            outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++);
-          }
-        }
-      }
-    }
-  }
-}
-
-template <>
-void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
-                                const real* inputs,
-                                const int num,
-                                const int inH,
-                                const int inW,
-                                const int inC,
-                                const int argType) {
-  for (int n = 0; n < num; ++n) {
-    for (int h = 0; h < inH; ++h) {
-      for (int w = 0; w < inW; ++w) {
-        for (int c = 0; c < inC; ++c) {
-          if (argType == ADD_TO) {
-            outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++);
-          } else {
-            outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++);
-          }
-        }
-      }
-    }
-  }
-}
-
-/**
- * \brief  Switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order
- *         'batch_size,channels, height, width' to
- *         order 'batch_size, height, width, channels'.
- *
- * Argument in this Function:
- * \param inputs  input data with order 'batch_size,channels, height, width'.
- * \param outputs output data with order 'batch_size, height, width, channels'.
- */
-template <DeviceType Device>
-class NCHW2NHWCFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-
-    size_t num = inputs[0].shape()[0];
-    size_t inC = inputs[0].shape()[1];
-    size_t inH = inputs[0].shape()[2];
-    size_t inW = inputs[0].shape()[3];
-    NCHW2NHWC<Device>(outputs[0].data<real>(),
-                      inputs[0].data<real>(),
-                      num,
-                      inC,
-                      inH,
-                      inW,
-                      outputs[0].getArgType());
-  }
-};
-
-/**
- * \brief  Switch dimension order of image input.
- *         The input and output is a 4D tensor. Switch order
- *         'batch_size, height, width, channels' to
- *         order 'batch_size, channels, height, width'.
- *
- * Argument in this Function:
- * \param inputs  input data with order 'batch_size, height, width, channels'.
- * \param outputs output data with order 'batch_size, channels, height, width'.
- */
-template <DeviceType Device>
-class NHWC2NCHWFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {}
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(1UL, outputs.size());
-
-    size_t num = inputs[0].shape()[0];
-    size_t inH = inputs[0].shape()[1];
-    size_t inW = inputs[0].shape()[2];
-    size_t inC = inputs[0].shape()[3];
-
-    NHWC2NCHW<Device>(outputs[0].data<real>(),
-                      inputs[0].data<real>(),
-                      num,
-                      inH,
-                      inW,
-                      inC,
-                      outputs[0].getArgType());
-  }
-};
-
-REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
-REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
-#ifdef PADDLE_WITH_CUDA
-REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
-REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
deleted file mode 100644
index 02d38c32c007325a928910d136d48214ba5f6bc3..0000000000000000000000000000000000000000
--- a/paddle/function/TensorShape.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-
-namespace paddle {
-
-/**
- * TensorShape used to represent shape of normal tensor.
- */
-class TensorShape {
-public:
-  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
-
-  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
-
-  TensorShape(std::initializer_list<size_t> dims) {
-    ndims_ = dims.size();
-    initDims(ndims_);
-    dims_.assign(dims);
-    numElements();
-  };
-
-  TensorShape(const TensorShape& t)
-      : ndims_(t.ndims_), nelements_(t.nelements_) {
-    initDims(ndims_);
-    dims_.assign(t.dims_.begin(), t.dims_.end());
-  };
-
-  // get the size of specified dimension
-  size_t operator[](size_t dim) const {
-    CHECK_GE(dim, (size_t)0);
-    CHECK_LT(dim, ndims_);
-    return dims_[dim];
-  }
-
-  // set the size of specified dimension
-  void setDim(size_t dim, size_t size) {
-    CHECK_GE(dim, (size_t)0);
-    CHECK_LT(dim, ndims_);
-    dims_[dim] = size;
-    numElements();
-  }
-
-  void reshape(std::initializer_list<size_t> dims) {
-    ndims_ = dims.size();
-    if (ndims_ > kMinDims) {
-      dims_.resize(ndims_);
-    }
-    dims_.assign(dims);
-    numElements();
-  }
-
-  // number of dimensions of the tensor
-  size_t ndims() const { return ndims_; }
-
-  size_t getElements() const { return nelements_; }
-
-  bool operator==(const TensorShape& t) const {
-    if (ndims() != t.ndims()) return false;
-    for (size_t i = 0; i < ndims(); i++) {
-      if (dims_[i] != t.dims_[i]) return false;
-    }
-
-    return true;
-  }
-
-  bool operator!=(const TensorShape& t) const { return !(*this == t); }
-
-private:
-  // compute number of elements
-  void numElements() {
-    nelements_ = 1;
-    for (size_t n = 0; n < ndims_; n++) {
-      nelements_ *= dims_[n];
-    }
-  }
-
-  // init dims_
-  void initDims(size_t ndims) {
-    size_t count = ndims < kMinDims ? kMinDims : ndims;
-    dims_.assign(count, 1);
-  }
-
-  // number of dimensions
-  // ndims_ may be not equeal dims_.size()
-  size_t ndims_;
-  // number of elements
-  size_t nelements_;
-  std::vector<size_t> dims_;
-  static const size_t kMinDims = 4;
-};
-
-}  // namespace paddle
diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h
deleted file mode 100644
index b384591bd8852bbdc61bf9aa678ce613732c369a..0000000000000000000000000000000000000000
--- a/paddle/function/TensorType.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-enum ValueType {
-  VALUE_TYPE_INT32 = 0,
-  VALUE_TYPE_FLOAT = 1,
-  VALUE_TYPE_DOUBLE = 2,
-  VALUE_TYPE_BYTE = 3
-};
-
-enum DeviceType {
-  DEVICE_TYPE_UNSPECIFIED = 0,
-  DEVICE_TYPE_CPU = 1,
-  DEVICE_TYPE_GPU = 2
-};
-
-enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 };
-
-enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 };
-
-inline int sizeOfValuType(ValueType valueType) {
-  if (valueType == VALUE_TYPE_INT32) {
-    return 4;
-  } else if (valueType == VALUE_TYPE_FLOAT) {
-    return 4;
-  } else if (valueType == VALUE_TYPE_DOUBLE) {
-    return 8;
-  } else {
-    LOG(FATAL) << "Unknown type: " << valueType;
-    return 0;
-  }
-}
-
-template <typename T>
-struct DataType;
-
-template <>
-struct DataType<float> {
-  static const ValueType value = VALUE_TYPE_FLOAT;
-};
-
-template <>
-struct DataType<double> {
-  static const ValueType value = VALUE_TYPE_DOUBLE;
-};
-
-template <>
-struct DataType<int> {
-  static const ValueType value = VALUE_TYPE_INT32;
-};
-
-namespace detail {
-
-template <typename VType, DeviceType Device>
-struct MatrixT;
-
-template <>
-struct MatrixT<real, DEVICE_TYPE_CPU> {
-  using type = CpuMatrix;
-};
-
-template <>
-struct MatrixT<real, DEVICE_TYPE_GPU> {
-  using type = GpuMatrix;
-};
-
-template <>
-struct MatrixT<int, DEVICE_TYPE_CPU> {
-  using type = void;  // Not implemented
-};
-
-template <>
-struct MatrixT<int, DEVICE_TYPE_GPU> {
-  using type = void;  // Not implemented
-};
-
-template <typename VType, DeviceType Device>
-struct SparseMatrixT;
-
-template <>
-struct SparseMatrixT<real, DEVICE_TYPE_CPU> {
-  using type = CpuSparseMatrix;
-};
-
-template <>
-struct SparseMatrixT<real, DEVICE_TYPE_GPU> {
-  using type = GpuSparseMatrix;
-};
-
-template <>
-struct SparseMatrixT<int, DEVICE_TYPE_CPU> {
-  using type = void;  // Not implemented
-};
-
-template <>
-struct SparseMatrixT<int, DEVICE_TYPE_GPU> {
-  using type = void;  // Not implemented
-};
-
-template <typename VType, DeviceType Device>
-struct VectorT;
-
-template <>
-struct VectorT<real, DEVICE_TYPE_CPU> {
-  using type = CpuVector;
-};
-
-template <>
-struct VectorT<real, DEVICE_TYPE_GPU> {
-  using type = GpuVector;
-};
-
-template <>
-struct VectorT<int, DEVICE_TYPE_CPU> {
-  using type = CpuIVector;
-};
-
-template <>
-struct VectorT<int, DEVICE_TYPE_GPU> {
-  using type = GpuIVector;
-};
-
-}  // namespace detail
-
-template <typename VType, DeviceType DType>
-struct Tensor {
-  typedef typename detail::VectorT<VType, DType>::type Vector;
-  typedef typename detail::MatrixT<VType, DType>::type Matrix;
-  typedef typename detail::SparseMatrixT<VType, DType>::type SparseMatrix;
-};
-
-}  // namespace paddle
diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp
deleted file mode 100644
index d3298c753853ca6d212a619cf8d0bd9356a8dbd7..0000000000000000000000000000000000000000
--- a/paddle/function/neon/NeonDepthwiseConv.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NeonDepthwiseConv.h"
-#include "paddle/function/ConvOp.h"
-
-namespace paddle {
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-template <DeviceType Device>
-class NeonDepthwiseConvFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    int batchSize = input[0];
-    int inputChannels = input[1];
-    int inputHeight = input[2];
-    int inputWidth = input[3];
-    int filterHeight = getFilterHeight(filter);
-    int filterWidth = getFilterWidth(filter);
-    int outputChannels = output[1];
-    int outputHeight = output[2];
-    int outputWidth = output[3];
-    int filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(static_cast<size_t>(inputChannels), groups_);
-
-    // only support strideH() == strideW() and filterHeight == filterWidth.
-    CHECK_EQ(strideH(), strideW());
-    CHECK_EQ(filterHeight, filterWidth);
-
-    float* inputData = inputs[0].data<float>();
-    float* filterData = inputs[1].data<float>();
-    float* outputData = outputs[0].data<float>();
-
-    // padding the input
-    float* inputPadding = inputData;
-    int padInputHeight = inputHeight + 2 * paddingH();
-    int padInputWidth = inputWidth + 2 * paddingW();
-    if (paddingH() > 0 || paddingW() > 0) {
-      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
-      resizeBuffer<Device>(newSize);
-      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
-      neon::Padding<float>::run(inputData,
-                                inputPadding,
-                                batchSize * inputChannels,
-                                inputHeight,
-                                inputWidth,
-                                padInputHeight,
-                                padInputWidth);
-    }
-
-    std::function<void(
-        const float*, const float*, int, int, int, int, int, int, float*)>
-        DepthWiseConv;
-
-    if (filterWidth == 3 && strideW() == 1) {
-      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
-    } else if (filterWidth == 3 && strideW() == 2) {
-      DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run;
-    } else if (filterWidth == 4 && strideW() == 1) {
-      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
-    } else if (filterWidth == 4 && strideW() == 2) {
-      DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run;
-    } else {
-      LOG(FATAL) << "Not supported";
-    }
-
-    for (int i = 0; i < batchSize; i++) {
-      DepthWiseConv(inputPadding,
-                    filterData,
-                    padInputHeight,
-                    padInputWidth,
-                    outputChannels,
-                    outputHeight,
-                    outputWidth,
-                    filterMultiplier,
-                    outputData);
-      inputPadding += inputChannels * padInputHeight * padInputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifndef PADDLE_TYPE_DOUBLE
-REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction);
-#endif
-
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
deleted file mode 100644
index d443d3fa4902f998230651c5c64355d93c4c4f6a..0000000000000000000000000000000000000000
--- a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NeonDepthwiseConv.h"
-#include "paddle/function/ConvOp.h"
-
-namespace paddle {
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-
-template <DeviceType Device>
-class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    check(inputs, outputs);
-
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    int batchSize = input[0];
-    int inputChannels = input[1];
-    int inputHeight = input[2];
-    int inputWidth = input[3];
-    int filterHeight = getFilterHeight(filter);
-    int filterWidth = getFilterWidth(filter);
-    int outputChannels = output[1];
-    int outputHeight = output[2];
-    int outputWidth = output[3];
-    int filterMultiplier = outputChannels / groups_;
-    CHECK_EQ(inputChannels, groups_);
-
-    // only support strideH() == strideW() and filterHeight == filterWidth.
-    CHECK_EQ(strideH(), strideW());
-    CHECK_EQ(paddingH(), paddingW());
-    CHECK_EQ(filterHeight, filterWidth);
-
-    float* inputData = inputs[0].data<float>();
-    float* filterData = inputs[1].data<float>();
-    float* outputData = outputs[0].data<float>();
-
-    // padding the input, input -> inputPadding
-    float* inputPadding = inputData;
-    int padInputHeight =
-        (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
-    int padInputWidth =
-        (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
-
-    if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
-      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
-      resizeBuffer<Device>(newSize);
-      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
-      if (strideH() == 1) {
-        neon::Padding<float>::run(inputData,
-                                  inputPadding,
-                                  batchSize * inputChannels,
-                                  inputHeight,
-                                  inputWidth,
-                                  padInputHeight,
-                                  padInputWidth);
-      } else if (strideH() == 2) {
-        neon::StridePadding::run(inputData,
-                                 inputPadding,
-                                 batchSize * inputChannels,
-                                 inputHeight,
-                                 inputWidth,
-                                 padInputHeight,
-                                 padInputWidth);
-      } else {
-        LOG(FATAL) << "Not supported";
-      }
-    }
-
-    std::function<void(
-        const float*, const float*, int, int, int, int, int, int, float*)>
-        DepthWiseConv;
-
-    if (filterWidth == 3) {
-      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
-    } else if (filterWidth == 4) {
-      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
-    } else {
-      LOG(FATAL) << "Not supported";
-    }
-
-    for (int i = 0; i < batchSize; i++) {
-      DepthWiseConv(inputPadding,
-                    filterData,
-                    padInputHeight,
-                    padInputWidth,
-                    outputChannels,
-                    outputHeight,
-                    outputWidth,
-                    filterMultiplier,
-                    outputData);
-      inputPadding += inputChannels * padInputHeight * padInputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
-    }
-  }
-};
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
-                    CPU,
-                    NeonDepthwiseConvTransposeFunction);
-
-#endif
-
-#endif
-
-}  // namespace paddle
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
deleted file mode 100644
index 3cdba4f2ed0dad42035fe2d0de87ad5aeeef20ca..0000000000000000000000000000000000000000
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "nnpack.h"
-#include "paddle/function/ConvOp.h"
-
-DEFINE_bool(nnpack_allocate_outside,
-            true,
-            "Allocate and free workspace memory outside the NNPACK interface.");
-DEFINE_int32(nnpack_num_threads,
-             0,
-             "The number of nnpack threads"
-             "default: 0; 0 to disable threadpool.");
-
-namespace paddle {
-
-nnp_convolution_algorithm get_nnp_convolution_algorithm(
-    const std::string& algorithm) {
-  if (algorithm == "auto") {
-    return nnp_convolution_algorithm_auto;
-  } else if (algorithm == "ft8x8") {
-    return nnp_convolution_algorithm_ft8x8;
-  } else if (algorithm == "ft16x16") {
-    return nnp_convolution_algorithm_ft16x16;
-  } else if (algorithm == "wt8x8") {
-    return nnp_convolution_algorithm_wt8x8;
-  } else if (algorithm == "implicit-gemm") {
-    return nnp_convolution_algorithm_implicit_gemm;
-  } else if (algorithm == "direct") {
-    return nnp_convolution_algorithm_direct;
-  } else {
-    return nnp_convolution_algorithm_auto;
-  }
-}
-
-template <DeviceType Device>
-class NNPACKConvFunction : public ConvFunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    ConvFunctionBase::init(config);
-    algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
-    transform_strategy_ = nnp_convolution_transform_strategy_compute;
-    nnp_status status = nnp_initialize();
-    CHECK_EQ(status, nnp_status_success);
-    workspaceBuffer_ = nullptr;
-    workspaceSize_ = 0;
-
-    create_nnpack_threadpool();
-  }
-
-  ~NNPACKConvFunction() {
-    if (workspaceBuffer_) {
-      free(workspaceBuffer_);
-    }
-  }
-
-  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-    checkShape(input, filter, output);
-  }
-
-  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(numInputs_, inputs.size());
-    CHECK_EQ(numOutputs_, outputs.size());
-    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    check(inputs, outputs);
-    const TensorShape& input = inputs[0].shape();
-    const TensorShape& filter = inputs[1].shape();
-    const TensorShape& output = outputs[0].shape();
-
-    size_t batchSize = input[0];
-    size_t inputChannels = input[1];
-    size_t inputHeight = input[2];
-    size_t inputWidth = input[3];
-    size_t filterHeight = getFilterHeight(filter);
-    size_t filterWidth = getFilterWidth(filter);
-    size_t outputChannels = output[1];
-    size_t outputHeight = output[2];
-    size_t outputWidth = output[3];
-
-    nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
-    nnp_padding padding = {.top = (size_t)paddingH(),
-                           .right = (size_t)paddingW(),
-                           .bottom = (size_t)paddingH(),
-                           .left = (size_t)paddingW()};
-    nnp_size kernelSize = {.width = filterWidth, .height = filterHeight};
-    nnp_size outputSubsampling = {.width = (size_t)strideW(),
-                                  .height = (size_t)strideH()};
-
-    float* inputData = inputs[0].data<float>();
-    float* filterData = inputs[1].data<float>();
-    float* outputData = outputs[0].data<float>();
-
-    void* bufferPtr = nullptr;
-    size_t* sizePtr = nullptr;
-    size_t needSize;
-    if (FLAGS_nnpack_allocate_outside) {
-      if (batchSize == 1) {
-        nnp_status status = nnp_convolution_inference(algorithm_,
-                                                      transform_strategy_,
-                                                      inputChannels,
-                                                      outputChannels,
-                                                      inputSize,
-                                                      padding,
-                                                      kernelSize,
-                                                      outputSubsampling,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr,
-                                                      &needSize,
-                                                      nnp_activation_identity,
-                                                      nullptr,
-                                                      nullptr,
-                                                      nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      } else {
-        // only supports stride = 1
-        CHECK_EQ(strideH(), 1);
-        CHECK_EQ(strideW(), 1);
-        nnp_status status = nnp_convolution_output(algorithm_,
-                                                   batchSize,
-                                                   inputChannels,
-                                                   outputChannels,
-                                                   inputSize,
-                                                   padding,
-                                                   kernelSize,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr,
-                                                   &needSize,
-                                                   nnp_activation_identity,
-                                                   nullptr,
-                                                   nullptr,
-                                                   nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      }
-
-      VLOG(3) << "workspace size is " << needSize;
-      if (needSize > workspaceSize_) {
-        workspaceSize_ = needSize;
-        if (workspaceBuffer_) {
-          free(workspaceBuffer_);
-        } else {
-          posix_memalign(&workspaceBuffer_, 64, needSize);
-        }
-      }
-
-      if (needSize) {
-        bufferPtr = workspaceBuffer_;
-        sizePtr = &needSize;
-      }
-    }
-
-    size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth;
-    size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth;
-    size_t filterOffset = filter.getElements() / groups_;
-
-    if (batchSize == 1) {
-      for (size_t g = 0; g < groups_; g++) {
-        nnp_status status =
-            nnp_convolution_inference(algorithm_,
-                                      transform_strategy_,
-                                      inputChannels / groups_,
-                                      outputChannels / groups_,
-                                      inputSize,
-                                      padding,
-                                      kernelSize,
-                                      outputSubsampling,
-                                      inputData + inputOffset * g,
-                                      filterData + filterOffset * g,
-                                      nullptr, /* bias */
-                                      outputData + outputOffset * g,
-                                      bufferPtr,
-                                      sizePtr,
-                                      nnp_activation_identity,
-                                      nullptr,
-                                      threadpool_, /* threadpool */
-                                      nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      }
-    } else {
-      // only supports stride = 1
-      CHECK_EQ(strideH(), 1);
-      CHECK_EQ(strideW(), 1);
-
-      // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1.
-      CHECK_EQ(groups_, static_cast<size_t>(1));
-      nnp_status status = nnp_convolution_output(algorithm_,
-                                                 batchSize,
-                                                 inputChannels,
-                                                 outputChannels,
-                                                 inputSize,
-                                                 padding,
-                                                 kernelSize,
-                                                 inputData,
-                                                 filterData,
-                                                 nullptr, /* bias */
-                                                 outputData,
-                                                 bufferPtr,
-                                                 sizePtr,
-                                                 nnp_activation_identity,
-                                                 nullptr,
-                                                 threadpool_, /* threadpool */
-                                                 nullptr);
-      CHECK_EQ(status, nnp_status_success);
-    }
-  }
-
-  static void create_nnpack_threadpool() {
-    if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {
-      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
-      VLOG(3) << "Number of threads "
-              << pthreadpool_get_threads_count(threadpool_);
-    }
-  }
-
-private:
-  nnp_convolution_algorithm algorithm_;
-  nnp_convolution_transform_strategy transform_strategy_;
-  void* workspaceBuffer_;
-  size_t workspaceSize_;
-  static pthreadpool_t threadpool_;
-};
-
-template <DeviceType Device>
-pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
-
-REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
-
-}  // namespace paddle
diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp
deleted file mode 100644
index c80ffb5d5d255465e9a2fa251fb9a6c61f96e7ec..0000000000000000000000000000000000000000
--- a/paddle/function/nnpack/NNPACKConvOpTest.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/function/ConvOpTest.h"
-
-namespace paddle {
-
-TEST(NNPACK, Forward) {
-  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "GemmConv-CPU", "NNPACKConv-CPU", forward);
-}
-
-TEST(NNPACK, Depthwise) {
-  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
-      "GemmConv-CPU", "NNPACKConv-CPU", forward);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
deleted file mode 100644
index 3d6ced713f00bd72622d8aeed3967642b6774ffe..0000000000000000000000000000000000000000
--- a/paddle/gserver/CMakeLists.txt
+++ /dev/null
@@ -1,154 +0,0 @@
-# Gserver package contains:
-#   * Layers
-#   * Activations
-#   * DataProviders
-#   * Evaluators
-#   * GradientMachines(NeuralNetwork)
-file(GLOB_RECURSE GSERVER_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
-file(GLOB_RECURSE GSERVER_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cpp")
-set(GSERVER_SOURCES
-    layers/LstmCompute.cu
-    layers/GruCompute.cu
-    ${GSERVER_SOURCES})
-
-macro(filter_test VAR_NAME)
-    set(tmp)
-    foreach(p IN LISTS ${VAR_NAME})
-        if(NOT ${p} MATCHES ".*tests/.*")
-             set(tmp ${p} ${tmp})
-        endif()
-    endforeach()
-    set(${VAR_NAME} ${tmp})
-endmacro()
-
-filter_test(GSERVER_HEADER)
-filter_test(GSERVER_SOURCES)
-
-if(NOT WITH_MKLDNN)
-    file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h")
-    file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp")
-    list(REMOVE_ITEM GSERVER_HEADER ${DNN_HEADER})
-    list(REMOVE_ITEM GSERVER_SOURCES ${DNN_SOURCES})
-    message(STATUS "Skip compiling with MKLDNNLayers and MKLDNNActivations")
-else()
-    message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
-endif()
-
-if(NOT WITH_MKLML)
-    file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h")
-    file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp")
-    list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER})
-    list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES})
-    message(STATUS "Skip compiling with MKLPackedLayers")
-else()
-    message(STATUS "Compile with MKLPackedLayers")
-endif()
-
-if(NOT WITH_GPU)
-    list(REMOVE_ITEM GSERVER_HEADER
-        layers/CudnnConvBaseLayer.h
-        layers/CudnnConvLayer.h
-        layers/CudnnConvTransLayer.h
-        layers/CudnnPoolLayer.h
-        layers/CudnnBatchNormLayer.h)
-
-    list(REMOVE_ITEM GSERVER_SOURCES
-        layers/CudnnConvBaseLayer.cpp
-        layers/CudnnConvLayer.cpp
-        layers/CudnnConvTransLayer.cpp
-        layers/CudnnPoolLayer.cpp
-        layers/CudnnBatchNormLayer.cpp)
-    compile_cu_as_cpp(layers/LstmCompute.cu)
-    compile_cu_as_cpp(layers/GruCompute.cu)
-endif()
-
-if(NOT WITH_PYTHON)
-    list(REMOVE_ITEM GSERVER_SOURCES
-            dataproviders/PyDataProvider.cpp)
-    
-    list(REMOVE_ITEM GSERVER_HEADER
-            dataproviders/PyDataProvider.h)
-endif()
-
-if(MOBILE_INFERENCE)
-    # Remove evaluators
-    list(REMOVE_ITEM GSERVER_SOURCES
-         layers/ValidationLayer.cpp
-         evaluators/Evaluator.cpp
-         evaluators/DetectionMAPEvaluator.cpp
-         evaluators/CTCErrorEvaluator.cpp
-         evaluators/ChunkEvaluator.cpp)
-
-    # Remove dataproviders
-    list(REMOVE_ITEM GSERVER_SOURCES
-         dataproviders/DataProvider.cpp
-         dataproviders/MultiDataProvider.cpp
-         dataproviders/PyDataProvider2.cpp
-         dataproviders/PyDataProvider.cpp)
-
-    # Remove useless gradientmachines
-    list(REMOVE_ITEM GSERVER_SOURCES
-         gradientmachines/MultiNetwork.cpp
-         gradientmachines/RecurrentGradientMachine.cpp
-         gradientmachines/ParallelNeuralNetwork.cpp
-         gradientmachines/GradientMachineMode.cpp
-         gradientmachines/MultiGradientMachine.cpp)
-
-    # Remove layers that used in training
-    list(REMOVE_ITEM GSERVER_SOURCES
-    	 layers/RecurrentLayerGroup.cpp
-         layers/CostLayer.cpp
-         layers/MultiBoxLossLayer.cpp
-         layers/WarpCTCLayer.cpp
-         layers/CTCLayer.cpp
-         layers/LinearChainCTC.cpp
-         layers/PrintLayer.cpp)
-    list(REMOVE_ITEM GSERVER_SOURCES
-         layers/OuterProdLayer.cpp
-         layers/SumToOneNormLayer.cpp
-         layers/ConvShiftLayer.cpp
-         layers/InterpolationLayer.cpp
-         layers/AgentLayer.cpp
-         layers/DotMulOperator.cpp
-         layers/GruStepLayer.cpp
-         layers/LstmStepLayer.cpp
-         layers/ConvexCombinationLayer.cpp
-         layers/Conv3DLayer.cpp
-         layers/DeConv3DLayer.cpp
-         layers/CropLayer.cpp
-         layers/CrossEntropyOverBeam.cpp
-         layers/DataNormLayer.cpp
-         layers/FeatureMapExpandLayer.cpp
-         layers/HierarchicalSigmoidLayer.cpp
-         layers/MultinomialSampler.cpp
-         layers/NCELayer.cpp
-         layers/KmaxSeqScoreLayer.cpp
-         layers/MDLstmLayer.cpp
-         layers/MultiplexLayer.cpp
-         layers/PadLayer.cpp
-         layers/Pool3DLayer.cpp
-         layers/ResizeLayer.cpp
-         layers/RotateLayer.cpp
-         layers/RowConvLayer.cpp
-         layers/RowL2NormLayer.cpp
-         layers/SamplingIdLayer.cpp
-         layers/ScaleShiftLayer.cpp
-         layers/SelectiveFullyConnectedLayer.cpp
-         layers/SpatialPyramidPoolLayer.cpp
-         layers/BilinearInterpLayer.cpp
-         layers/ClipLayer.cpp)
-endif()
-
-if(WITH_GPU)
-    cuda_add_library(paddle_gserver ${GSERVER_SOURCES})
-else()
-    add_library(paddle_gserver STATIC
-        ${GSERVER_SOURCES})
-endif()
-
-add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
-add_style_check_target(paddle_gserver ${GSERVER_HEADER})
-add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
deleted file mode 100644
index 8d8f01234fe3859989e44fe6147105fb72b832ff..0000000000000000000000000000000000000000
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ /dev/null
@@ -1,509 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ActivationFunction.h"
-
-#include <algorithm>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <thread>
-#include <type_traits>
-#include "paddle/parameter/Argument.h"
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/utils/Logging.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "MKLDNNActivation.h"
-#endif
-
-namespace paddle {
-
-static ClassRegistrar<ActivationFunction> gActivationRegistrar;
-/**
- * @def ACTIVATION_CLASS_NAME
- * @brief Macro for getting derived activation class name
- * @note ACTIVATION_CLASS_NAME(softmax) softmax_;
- * means softmaxActivation softmax_;
- */
-#define ACTIVATION_CLASS_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Activation
-/**
- * @def BEGIN_DEFINE_ACTIVATION
- * @brief Macro for defining a devried activation class
- */
-#define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME)                             \
-  class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \
-  private:                                                                   \
-    static const std::string name;                                           \
-                                                                             \
-  public:                                                                    \
-    const std::string& getName() const { return name; }
-/**
- * @def END_DEFINE_ACTIVATION
- * @brief Macro for registering a derived activation class
- */
-#define END_DEFINE_ACTIVATION(ACTIVATION_NAME)                     \
-  }                                                                \
-  ;                                                                \
-  const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
-      #ACTIVATION_NAME;                                            \
-  static InitFunction __reg_activation__##ACTIVATION_NAME([] {     \
-    gActivationRegistrar                                           \
-        .registerClass<ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(    \
-            #ACTIVATION_NAME);                                     \
-  });
-
-/**
- * @brief The IdentityActivation class
- *
- * Do nothing when forward/backward.
- */
-class IdentityActivation : public ActivationFunction {
-public:
-  static const std::string name;
-  Error __must_check forward(Argument& act) {
-    (void)act;
-    return Error();
-  }
-  Error __must_check backward(Argument& act) {
-    (void)act;
-    return Error();
-  }
-  const std::string& getName() const { return name; }
-};
-const std::string IdentityActivation::name = "";
-static InitFunction __reg_activation__identity([] {
-  gActivationRegistrar.registerClass<IdentityActivation>("");
-  gActivationRegistrar.registerClass<IdentityActivation>("linear");
-});
-
-/**
- * @brief Sigmoid Activation
- * \f[
- * f(z) = \frac{1}{1+exp(-z)}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(sigmoid)
-Error __must_check forward(Argument& act) {
-  act.value->sigmoid(*act.value);
-  return Error();
-}
-Error __must_check backward(Argument& act) {
-  act.grad->sigmoidDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(sigmoid)
-
-/**
- * @brief Softmax Activation
- * \f[
- * P(y=j|x) = \frac{e^{x^Tw_j}}{\sum^K_{k=1}e^{x^Tw_k}}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(softmax)
-private:
-MatrixPtr sftMaxSum_;
-MatrixPtr sftMaxDot_;
-
-public:
-Error __must_check forward(Argument& act) {
-  act.value->softmax(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  MatrixPtr outputV = act.value;
-  MatrixPtr outputG = act.grad;
-
-  if (outputG->useGpu()) {
-    outputG->softmaxBackward(*outputV);
-  } else {
-    SetDevice device(act.deviceId);
-    Matrix::resizeOrCreate(sftMaxDot_,
-                           outputG->getHeight(),
-                           outputG->getWidth(),
-                           /* trans */ false,
-                           useGpu(act.deviceId));
-    Matrix::resizeOrCreate(sftMaxSum_,
-                           outputG->getHeight(),
-                           1,
-                           /* trans */ false,
-                           useGpu(act.deviceId));
-
-    sftMaxDot_->dotMul(*outputG, *outputV);
-    sftMaxSum_->colMerge(*sftMaxDot_);
-
-    act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
-  }
-  return Error();
-}
-END_DEFINE_ACTIVATION(softmax)
-
-/**
- * @brief Sequence_softmax Activation
- * @note Softmax on all frames of one sequence.
- * Width of frame must be one.
- */
-BEGIN_DEFINE_ACTIVATION(sequence_softmax)
-private:
-ACTIVATION_CLASS_NAME(softmax) softmax_;
-Argument argument_;
-
-public:
-Error __must_check forward(Argument& act) {
-  if (act.value->getWidth() != 1UL) {
-    return Error(
-        "Input width for each timestep of sequence softmax should be 1");
-  }
-
-  if (!argument_.value) {
-    argument_.value = Matrix::create(nullptr,
-                                     /* height= */ 1,
-                                     1,
-                                     /* trans= */ false,
-                                     useGpu(act.deviceId));
-    argument_.grad = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    1,
-                                    /* trans= */ false,
-                                    useGpu(act.deviceId));
-  }
-
-  auto starts =
-      act.hasSubseq()
-          ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId))
-          : act.sequenceStartPositions->getVector(useGpu(act.deviceId));
-  act.value->sequenceSoftmax(*act.value, *starts);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  if (act.value->getWidth() != 1UL) {
-    return Error(
-        "Input width for each timestep of sequence softmax should be 1");
-  }
-
-  size_t numSequences =
-      act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences();
-  const int* starts = act.getCpuStartPositions();
-
-  for (size_t i = 0; i < numSequences; ++i) {
-    // TODO(Dangqingqing) optimization for GPU
-    size_t offset = starts[i];
-    size_t size = starts[i + 1] - starts[i];
-    argument_.value->setData(act.value->getData() + offset, 1UL, size);
-    argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
-
-    Error err = softmax_.backward(argument_);
-    if (!err.isOK()) return err;
-  }
-  return Error();
-}
-END_DEFINE_ACTIVATION(sequence_softmax)
-
-/*
- * @brief SoftSign Activation.
- * \f[
- * f(z) = \frac{z}{1 + |z|}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(softsign)
-private:
-MatrixPtr denominator_;
-
-Error __must_check forward(Argument& act) {
-  size_t height = act.value->getHeight();
-  size_t width = act.value->getWidth();
-  Matrix::resizeOrCreate(
-      denominator_, height, width, false, useGpu(act.deviceId));
-  denominator_->assign(*act.value);
-  denominator_->abs2();
-  denominator_->add(1.);
-
-  act.value->dotDiv(*act.value, *denominator_);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  denominator_->square2();
-  denominator_->scalarDiv(*denominator_, 1.);
-  act.grad->dotMul(*act.grad, *denominator_);
-  return Error();
-}
-END_DEFINE_ACTIVATION(softsign)
-
-/**
- * @brief Relu Activation.
- * forward. y = max(0, z)
- *
- * derivative of relu is:
- *
- *    1 if z > 0
- *
- *    0 otherwise.
- */
-BEGIN_DEFINE_ACTIVATION(relu)
-Error __must_check forward(Argument& act) {
-  act.value->relu(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->reluDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(relu)
-
-/**
- * @brief BRelu Activation.
- *
- * forward. y = min(24, max(0, z))
- *
- * derivative of brelu is:
- *
- *    1 if 0 < z < 24
- *
- *    0 otherwise.
- *
- * TODO(yuyang18): Remove magic number 24 or make it configuable.
- */
-BEGIN_DEFINE_ACTIVATION(brelu)
-Error __must_check forward(Argument& act) {
-  act.value->brelu(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->breluDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(brelu)
-
-/**
- * @brief Tanh Activation.
- * \f[
- * f(z) = tanh(z)=\frac{e^z-e^{-z}}{e^z+e^{-z}}
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(tanh)
-Error __must_check forward(Argument& act) {
-  act.value->tanh(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->tanhDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(tanh)
-
-/**
- * @brief Scaled Tanh Activation
- * \f[
- * f(z) = 1.7159 * tanh(2/3*z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(stanh)
-private:
-real a, b;
-
-public:
-ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
-Error __must_check forward(Argument& act) {
-  act.value->scaledTanh(*act.value, a, b);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->scaledTanhDerivative(*act.value, a, b);
-  return Error();
-}
-END_DEFINE_ACTIVATION(stanh)
-
-/**
- * @brief Soft Relu Activation.
- * \f[
- * f(z) = ln(1+e^z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(softrelu)
-Error __must_check forward(Argument& act) {
-  act.value->softrelu(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->softreluDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(softrelu)
-
-/**
- * @brief Abs Activation.
- * Forward: f(z) = abs(z)
- *
- * Derivative:
- *
- *     1   if z>0
- *
- *    -1   if z<0
- *
- *     0   if z=0
- */
-BEGIN_DEFINE_ACTIVATION(abs)
-Error __must_check forward(Argument& act) {
-  SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in,
-                         act.value->getHeight(),
-                         act.value->getWidth(),
-                         /* trans */ false,
-                         useGpu(act.deviceId));
-
-  act.in->copyFrom(*act.value);
-  act.value->abs2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->absDerivative(*act.in);
-  return Error();
-}
-END_DEFINE_ACTIVATION(abs)
-
-/**
- * @brief Square Activation.
- * \f[
- * f(z) = z^2.
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(square)
-Error __must_check forward(Argument& act) {
-  SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in,
-                         act.value->getHeight(),
-                         act.value->getWidth(),
-                         /* trans */ false,
-                         useGpu(act.deviceId));
-
-  act.in->copyFrom(*act.value);
-  act.value->square2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->squareDerivative(*act.in);
-  return Error();
-}
-END_DEFINE_ACTIVATION(square)
-
-/**
- * @brief Exponential Activation.
- * \f[
- * f(z) = e^z
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(exponential)
-Error __must_check forward(Argument& act) {
-  act.value->exp2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->expDerivative(*act.value);
-  return Error();
-}
-END_DEFINE_ACTIVATION(exponential)
-
-/**
- * @brief Reciprocal Activation.
- * \f[
- * f(z) = 1/z
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(reciprocal)
-Error __must_check forward(Argument& act) {
-  act.value->reciprocal2();
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->dotMulSquare(*act.value);
-  act.grad->neg();
-  return Error();
-}
-END_DEFINE_ACTIVATION(reciprocal)
-
-/**
- * @brief Square Root Activation.
- * \f[
- * f(z) = sqrt(z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(sqrt)
-Error __must_check forward(Argument& act) {
-  act.value->sqrt2();
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->dotDiv(*act.grad, *act.value);
-  act.grad->mulScalar(0.5);
-  return Error();
-}
-END_DEFINE_ACTIVATION(sqrt)
-
-/**
- * @brief Logarithm Activation.
- * \f[
- * f(z) = log(z)
- * \f]
- */
-BEGIN_DEFINE_ACTIVATION(log)
-Error __must_check forward(Argument& act) {
-  SetDevice device(act.deviceId);
-  Matrix::resizeOrCreate(act.in,
-                         act.value->getHeight(),
-                         act.value->getWidth(),
-                         /* trans */ false,
-                         useGpu(act.deviceId));
-
-  act.in->copyFrom(*act.value);
-  act.value->log2(*act.value);
-  return Error();
-}
-
-Error __must_check backward(Argument& act) {
-  act.grad->dotDiv(*act.grad, *act.in);
-  return Error();
-}
-END_DEFINE_ACTIVATION(log)
-
-ActivationFunction* ActivationFunction::create(const std::string& type) {
-#ifdef PADDLE_WITH_MKLDNN
-  if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
-    return MKLDNNActivation::create(type);
-  }
-#endif
-
-  return gActivationRegistrar.createByType(type);
-}
-
-std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
-  std::vector<std::string> types;
-  gActivationRegistrar.forEachType(
-      [&](const std::string& type) { types.push_back(type); });
-  return types;
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
deleted file mode 100644
index 0f4b0fe0abb85403d42fc8a2ac28560e10058c20..0000000000000000000000000000000000000000
--- a/paddle/gserver/activations/ActivationFunction.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "paddle/utils/Error.h"
-
-namespace paddle {
-
-struct Argument;
-/**
- * @brief Activation function is a function that transforms a set of input
- * signals into an output signals. The purpose of the activation function
- * is to introduce non-liearilty into the network.
- *
- * @note Common activation function are provieded, including linear,
- * sigmoid, softmax, sequence_max, relu, brelu, tanh, stanh,
- * softrelu, abs, square, exponential.
- *
- */
-class ActivationFunction {
-public:
-  static ActivationFunction* create(const std::string& type);
-  static std::vector<std::string> getAllRegisteredTypes();
-
-  ActivationFunction() {}
-
-  virtual ~ActivationFunction() {}
-
-  /**
-   * @brief Foward propagation
-   *
-   * act.value <- f(act.value),
-   * where f is the activation function.
-   * Suppose that before calling forward(), act.value is x and
-   * after forward() is called, act.value is y, then y = f(x).
-   *
-   * Usually, act is Layer::output_
-   */
-  virtual Error __must_check forward(Argument& act) = 0;
-
-  /**
-   * @brief Backward propagaion
-   *
-   * x and y are defined in the above comment for forward().
-   * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
-   * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
-   */
-  virtual Error __must_check backward(Argument& act) = 0;
-
-  virtual const std::string& getName() const = 0;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp
deleted file mode 100644
index 56ffb839344aabe43eaae0bd46e6dbf95e4d8f20..0000000000000000000000000000000000000000
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNActivation.h"
-#include "mkldnn.hpp"
-#include "paddle/utils/ClassRegistrar.h"
-
-namespace paddle {
-
-static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
-/**
- * @def MKLDNN_ACTIVATION_CLASS_NAME
- * @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_;
- * means mkldnn_reluActivation relu_;
- */
-#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
-
-/**
- * @def BEGIN_MKLDNN_ACTIVATION
- */
-#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
-  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS {
-/**
- * @def END_MKLDNN_ACTIVATION
- */
-#define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
-private:                                                           \
-  static const std::string name;                                   \
-                                                                   \
-public:                                                            \
-  const std::string& getName() const { return name; }              \
-  }                                                                \
-  ;                                                                \
-  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \
-      "mkldnn_" #ACT_TYPE;                                         \
-  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {     \
-    gMKLDNNActivationRegistrar                                     \
-        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(    \
-            "mkldnn_" #ACT_TYPE);                                  \
-  });
-
-/**
- * @def DEFINE_MKLDNN_ACTIVATION
- */
-#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
-  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)        \
-  END_MKLDNN_ACTIVATION(ACT_TYPE)
-
-/**
- * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
- */
-#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
-    ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
-  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
-private:                                                             \
-  static const float alpha;                                          \
-  static const float bwdAlpha;                                       \
-                                                                     \
-public:                                                              \
-  float getAlpha() const { return alpha; }                           \
-  float getBwdAlpha() const { return bwdAlpha; }                     \
-  END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA;
-
-/**
- * @brief MKLDNN Relu Activation.
- * Actually mkldnn_relu is Leaky Relu.
- *  f(x) = x                   (x >= 0)
- *  f(x) = negative_slope * x  (x <  0)
- * @note the negative_slope should be -0.f in forward
- */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f)
-
-/**
- * @brief MKLDNN Tanh Activation.
- */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f)
-
-/**
- * @brief MKLDNN ELU(Exponential Linear Unit) Activation.
- *  f(x) = x                              (x >= 0)
- *  f(x) = negative_slope * (exp(x) - 1)  (x <  0)
- */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f)
-
-mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const {
-  const std::map<std::string, mkldnn::algorithm> algoMap = {
-      {"relu", algorithm::eltwise_relu},
-      {"tanh", algorithm::eltwise_tanh},
-      {"elu", algorithm::eltwise_elu}};
-  type.erase(0, 7);  // remove mkldnn_
-  algorithm algo = (algorithm)0;
-  mapGet(type, algoMap, &algo);
-  return algo;
-}
-
-void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
-  if (cnt_ == act.value->getElementCnt()) {
-    return;
-  }
-  MKLDNNActivation::resetFwd(act);
-  // note: alpha represents the NegativeSlope when used in relu.
-  float alpha = getAlpha();
-  float beta = getBeta();
-  algorithm algo = getAlgo(this->getName());
-  auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
-                                   algo,
-                                   val_->getMemoryDesc(),
-                                   alpha,
-                                   beta);
-  fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_));
-  // use inplace for forward but save input value before submit
-  inVal_ = val_;
-  copyInVal_ = nullptr;
-  if (act.grad && algo == algorithm::eltwise_tanh) {
-    // tanh need save src input for backward
-    inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
-    copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
-    CHECK(copyInVal_) << "should not be emptry";
-    pipelineFwd_.push_back(*copyInVal_);
-  }
-  fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
-  pipelineFwd_.push_back(*fwd_);
-  needResetBwd_ = true;
-}
-
-void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
-  if (!needResetBwd_) {
-    return;
-  }
-  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
-  needResetBwd_ = false;
-  algorithm algo = getAlgo(this->getName());
-  float alpha = getBwdAlpha();
-  float beta = getBeta();
-  grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
-  auto eng = CPUEngine::Instance().getEngine();
-  auto bwdDesc = eltwise_bwd::desc(
-      algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
-  auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
-  CHECK(inVal_);
-  bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
-  pipelineBwd_.clear();
-  pipelineBwd_.push_back(*bwd_);
-}
-
-/**
- * @brief MKLDNN Softmax Activation
- */
-DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation)
-
-void MKLDNNSoftmaxActivation::resetFwd(Argument& act) {
-  if (cnt_ == act.value->getElementCnt()) {
-    return;
-  }
-  MKLDNNActivation::resetFwd(act);
-  int axis = 1;
-  auto fwdDesc = softmax_fwd::desc(
-      mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis);
-  auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_);
-  fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_));
-  pipelineFwd_.push_back(*fwd_);
-}
-
-Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) {
-  resetFwd(act);
-  stream_->submit(pipelineFwd_);
-  real* v = act.value->getData();
-  real threshold = exp(-64);
-#pragma omp parallel for
-  for (size_t i = 0; i < act.value->getElementCnt(); ++i) {
-    v[i] = v[i] < threshold ? threshold : v[i];
-  }
-  return Error();
-}
-
-Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
-  MatrixPtr outputV = act.value;
-  MatrixPtr outputG = act.grad;
-  Matrix::resizeOrCreate(sftMaxDot_,
-                         outputG->getHeight(),
-                         outputG->getWidth(),
-                         /* trans */ false,
-                         /* useGpu */ false);
-  Matrix::resizeOrCreate(sftMaxSum_,
-                         outputG->getHeight(),
-                         1,
-                         /* trans */ false,
-                         /* useGpu */ false);
-  sftMaxDot_->dotMul(*outputG, *outputV);
-  sftMaxSum_->colMerge(*sftMaxDot_);
-  act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
-  return Error();
-}
-
-ActivationFunction* MKLDNNActivation::create(const std::string& type) {
-  return gMKLDNNActivationRegistrar.createByType(type);
-}
-
-std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
-  std::vector<std::string> types;
-  gMKLDNNActivationRegistrar.forEachType(
-      [&](const std::string& type) { types.push_back(type); });
-  return types;
-}
-
-void MKLDNNActivation::resetFwd(Argument& act) {
-  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-  cnt_ = act.value->getElementCnt();
-  pipelineFwd_.clear();
-  stream_.reset(new MKLDNNStream());
-  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
-  val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
-  if (val_ == nullptr) {
-    int bs = act.getBatchSize();
-    int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
-    int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
-    int ic = cnt_ / bs / ih / iw;
-    CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
-    val_ = MKLDNNMatrix::create(
-        {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
-    CHECK(val_);
-    val_->downSpatial();
-  }
-}
-
-Error __must_check MKLDNNActivation::forward(Argument& act) {
-  resetFwd(act);
-  stream_->submit(pipelineFwd_);
-  return Error();
-}
-Error __must_check MKLDNNActivation::backward(Argument& act) {
-  resetBwd(act);
-  stream_->submit(pipelineBwd_);
-  return Error();
-}
-}  // namespace paddle
diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h
deleted file mode 100644
index 392b32c70dae3728e13ee64f09f135c015c122cf..0000000000000000000000000000000000000000
--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "ActivationFunction.h"
-#include "mkldnn.hpp"
-#include "paddle/gserver/layers/MKLDNNBase.h"
-#include "paddle/math/MKLDNNMatrix.h"
-#include "paddle/parameter/Argument.h"
-
-namespace paddle {
-
-/**
- * @brief Base class of MKLDNN Activation.
- * Common activation function are provieded,
- * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax
- */
-class MKLDNNActivation : public ActivationFunction {
-protected:
-  // input value element count
-  size_t cnt_;
-  // should not merge the resetBwd into resetFwd,
-  // because the grad data would be changing before backward.
-  bool needResetBwd_;
-  // mkldnn matrix, primitive, stream and pipeline
-  MKLDNNMatrixPtr val_;
-  MKLDNNMatrixPtr grad_;
-  std::shared_ptr<mkldnn::engine> engine_;
-  std::shared_ptr<MKLDNNStream> stream_;
-  std::shared_ptr<mkldnn::primitive> fwd_;
-  std::shared_ptr<mkldnn::primitive> bwd_;
-  std::vector<mkldnn::primitive> pipelineFwd_;
-  std::vector<mkldnn::primitive> pipelineBwd_;
-
-public:
-  MKLDNNActivation() : cnt_(0), needResetBwd_(true) {}
-  ~MKLDNNActivation() {}
-  static ActivationFunction* create(const std::string& type);
-  static std::vector<std::string> getAllRegisteredTypes();
-  virtual const std::string& getName() const = 0;
-  /**
-   * reset the forward primitives
-   */
-  virtual void resetFwd(Argument& act);
-  /**
-   * reset the backward primitives,
-   * can not merge this functions into resetFwd as the grad data
-   * would be changing before backward.
-   */
-  virtual void resetBwd(Argument& act) {}
-  virtual Error __must_check forward(Argument& act);
-  virtual Error __must_check backward(Argument& act);
-};
-
-/**
- * @brief Base class of MKLDNN Eltwise Activation,
- * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh.
- */
-class MKLDNNEltwiseActivation : public MKLDNNActivation {
-  typedef mkldnn::eltwise_forward eltwise_fwd;
-  typedef mkldnn::eltwise_backward eltwise_bwd;
-  typedef mkldnn::algorithm algorithm;
-
-protected:
-  // save the forward primitive desc, which can be used backward
-  std::shared_ptr<eltwise_fwd::primitive_desc> fwdPD_;
-  // eltwise_bwd need src input value
-  MKLDNNMatrixPtr inVal_;
-  // use for copy data
-  std::shared_ptr<mkldnn::reorder> copyInVal_;
-
-public:
-  MKLDNNEltwiseActivation() {}
-  ~MKLDNNEltwiseActivation() {}
-  virtual const std::string& getName() const = 0;
-
-  // in common, the alpha of forward and backward should be equal.
-  // but for relu, to avoid negative value, they should be opposite
-  virtual float getAlpha() const = 0;
-  virtual float getBwdAlpha() const = 0;
-  virtual float getBeta() const { return 0.f; }
-  virtual algorithm getAlgo(std::string type) const;
-  void resetFwd(Argument& act) override;
-  void resetBwd(Argument& act) override;
-};
-
-/**
- * @brief Base class of MKLDNN softmax Activation,
- * only have mkldnn forward, use cpu implement for backward.
- */
-class MKLDNNSoftmaxActivation : public MKLDNNActivation {
-  typedef mkldnn::softmax_forward softmax_fwd;
-
-private:
-  // for backward
-  MatrixPtr sftMaxSum_;
-  MatrixPtr sftMaxDot_;
-
-public:
-  MKLDNNSoftmaxActivation() {}
-  ~MKLDNNSoftmaxActivation() {}
-  virtual const std::string& getName() const = 0;
-  void resetFwd(Argument& act) override;
-  Error __must_check forward(Argument& act) override;
-  Error __must_check backward(Argument& act) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
deleted file mode 100644
index 580cf821c685b3daf7f015bc137c6d5ea31ef100..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DataProvider.h"
-
-#include <unistd.h>
-#include <algorithm>
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-void BufferBatch::swap(BufferBatch* bufBatch) {
-  DataBatch* batchData = bufBatch->getDataBatch();
-  hl_event_t hlEvent = bufBatch->getCuEvent();
-  hl_stream_t hlStream = bufBatch->getCuStream();
-  bufBatch->setDataBatch(batchData_);
-  bufBatch->setCuStream(hlStream_);
-  bufBatch->setCuEvent(hlEvent_);
-
-  batchData_ = batchData;
-  hlEvent_ = hlEvent;
-  hlStream_ = hlStream;
-}
-
-void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
-  if (batchData_ == NULL) {
-    batchData_ = new DataBatch();
-  }
-  std::vector<Argument>& destData = batchData_->getStreams();
-  int numStreams = srcBatch->getNumStreams();
-  destData.resize(numStreams);
-  batchData_->setSize(srcBatch->getSize());
-  if (useGpu) {
-    createCuEvent();
-  }
-
-  for (int i = 0; i < numStreams; i++) {
-    destData[i].resizeAndCopyFrom(srcBatch->getStream(i), useGpu, hlStream_);
-  }
-  if (useGpu) {
-    hl_stream_record_event(hlStream_, hlEvent_);
-  }
-}
-
-DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
-                           bool useGpu,
-                           int64_t batchSize) {
-  batchSize_ = batchSize;
-  dataPool_ = dataPool;
-  useGpu_ = useGpu;
-  dataQueue_ = new BufferBatchQueue();
-  bufferQueue_ = new BufferBatchQueue();
-
-  // insert a empty buffer
-  bufferQueue_->enqueue(new BufferBatch());
-  stopping_ = false;
-  pending_ = true;
-}
-
-DoubleBuffer::~DoubleBuffer() {
-  finishAsyncLoad();
-  while (dataQueue_->size()) {
-    BufferBatch* dataBtch = dataQueue_->dequeue();
-    delete dataBtch;
-    dataBtch = NULL;
-  }
-  while (bufferQueue_->size()) {
-    BufferBatch* bufBtch = bufferQueue_->dequeue();
-    delete bufBtch;
-    bufBtch = NULL;
-  }
-  delete dataQueue_;
-  dataQueue_ = NULL;
-  delete bufferQueue_;
-  bufferQueue_ = NULL;
-}
-
-void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) {
-  // get data
-  BufferBatch* batch = dataQueue_->dequeue();
-  batch->syncEvent();  // when use GPU, need synchronized with the cuEvent
-  *dataBatch = *(batch->getDataBatch());
-
-  // push anothor buffer
-  if (*usingBatch_ == nullptr) {
-    *usingBatch_ = std::make_shared<BufferBatch>();
-  }
-
-  // Mark the using-batch
-  batch->swap((*usingBatch_).get());
-  bufferQueue_->enqueue(batch);
-
-  if (0 == dataBatch->getSize()) {
-    setPending(true);
-  }
-}
-
-void DoubleBuffer::insertOneBatch(DataBatch* batch) {
-  while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) {  // time out
-    if (stopping_) return;
-  }
-  BufferBatch* bufBatch = bufferQueue_->dequeue();
-  // clone and copy the data from an Threadlocal Variable
-  bufBatch->clone(batch, useGpu_);
-  dataQueue_->enqueue(bufBatch);
-}
-
-void DoubleBuffer::asyncLoadBatch() {
-  int64_t actualSize = 0;
-  if (useGpu_) {
-    hl_set_device(FLAGS_gpu_id);
-  }
-  setPending(false);
-
-  while (true) {
-    taskReadySem_.wait();
-    if (stopping_) break;
-
-    while (batchSize_ == 0 && !stopping_) {
-      usleep(5);
-    }
-    if (stopping_) break;
-
-    do {
-      DataBatch newBatch;
-      {
-        REGISTER_TIMER("getNextBatchInternal");
-        actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch);
-      }
-      insertOneBatch(&newBatch);
-    } while (actualSize > 0 && !stopping_);
-  }
-}
-
-void DoubleBuffer::startAsyncLoad() {
-  if (asyncLoader_ == nullptr) {
-    asyncLoader_.reset(new std::thread([this]() { this->asyncLoadBatch(); }));
-  }
-  taskReadySem_.post();
-}
-
-ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
-    DataProvider::registrar_;
-
-DataProvider* DataProvider::create(const DataConfig& config,
-                                   const ModelConfig& modelConfig,
-                                   bool useGpu) {
-  return registrar_.createByType(config.type(), config, modelConfig, useGpu);
-}
-
-REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
-REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-
-int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
-  int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
-                                    : getNextBatchInternal(size, batch);
-
-  if (!batchSize) return 0;
-
-  if (!config_.constant_slots_size()) return batchSize;
-
-  auto& constantSlots = *constantSlots_;
-  constantSlots.resize(config_.constant_slots_size());
-
-  for (int i = 0; i < config_.constant_slots_size(); ++i) {
-    MemoryHandlePtr handle =
-        constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
-    Matrix::resizeOrCreate(constantSlots[i],
-                           batchSize,
-                           1,         // = width
-                           false,     // = trans
-                           useGpu_);  // = useGpu
-    if (handle != constantSlots[i]->getMemoryHandle()) {
-      // memory buf was reallocated. We need to initialize the value
-      constantSlots[i]->assign(config_.constant_slots(i));
-    }
-    batch->appendData(constantSlots[i],
-                      batch->getStream(0).sequenceStartPositions);
-  }
-
-  return batchSize;
-}
-
-int64_t DataProvider::getNextBatchFromBuffer(int64_t size, DataBatch* batch) {
-  CHECK(doubleBuffer_ != nullptr);
-
-  if (doubleBuffer_->getBatchSize() != size) {
-    doubleBuffer_->setBatchSize(size);
-  }
-
-  doubleBuffer_->removeOneBatch(batch);
-  return batch->getSize();
-}
-
-void DataProvider::initAsyncLoader() {
-  if (doubleBuffer_ == nullptr) {
-    doubleBuffer_.reset(new DoubleBuffer(this, useGpu_));
-  }
-  useGpu_ = false;  // Avoid D2D copy, it will delay the computing performance
-}
-
-SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
-                                               bool useGpu,
-                                               bool withInfo)
-    : DataProvider(config, useGpu) {
-  /* initialize the size of a sample, and the buffer */
-  sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
-  bufferCapacity_ = config_.buffer_capacity();
-  withInfo_ = withInfo;
-  sampleNumInBuf_ = 0;
-  nextItemIndex_ = 0;
-
-  /* malloc buffer in cpu */
-  hInputDataBuf_ = std::make_shared<CpuMatrix>(bufferCapacity_, sampleDim_);
-  hInputLabelBuf_ = std::make_shared<CpuIVector>(bufferCapacity_);
-  hInputInfoBuf_ = std::make_shared<CpuIVector>(bufferCapacity_);
-}
-
-void SimpleDataProviderBase::shuffle() {
-  int i, t;
-  int len = sampleNumInBuf_;
-  std::vector<real> temp(sampleDim_);
-  real* data = hInputDataBuf_->getData();
-  int* label = hInputLabelBuf_->getData();
-  int* info = hInputInfoBuf_->getData();
-  int sampleSz = sizeof(real) * sampleDim_;
-  for (i = 0; i < len; i++) {
-    int randNum = rand();  // NOLINT TODO(yuyang18): Use rand_r instead?
-    t = randNum % (len - i) + i;
-    // swap
-    if (i != t) {
-      // swap data
-      memcpy(&temp[0], &data[i * sampleDim_], sampleSz);
-      memcpy(&data[i * sampleDim_], &data[t * sampleDim_], sampleSz);
-      memcpy(&data[t * sampleDim_], &temp[0], sampleSz);
-      std::swap(label[i], label[t]);
-      if (withInfo_) {
-        std::swap(info[i], info[t]);
-      }
-    }
-  }
-}
-
-int64_t SimpleDataProviderBase::getNextBatchInternal(int64_t size,
-                                                     DataBatch* batch) {
-  CHECK(batch != NULL);
-  batch->clear();
-
-  int64_t startIndex;
-  int64_t cpySize;
-
-  std::lock_guard<RWLock> guard(lock_);
-  if (sampleNumInBuf_ - nextItemIndex_ < size) {
-    int64_t n = fillBuffer();
-    VLOG(1) << "fillBuffer return " << n << " samples.\n";
-  }
-
-  startIndex = nextItemIndex_;
-  cpySize = std::min(size, sampleNumInBuf_ - nextItemIndex_);
-  nextItemIndex_ += cpySize;
-
-  if (cpySize > 0) {
-    real* data = hInputDataBuf_->getData() + startIndex * sampleDim_;
-    int* label = hInputLabelBuf_->getData() + startIndex;
-    int* info = hInputInfoBuf_->getData() + startIndex;
-
-    MatrixPtr& dataBatch = *dataBatch_;     // get the thread local object
-    IVectorPtr& labelBatch = *labelBatch_;  // get the thread local object
-    IVectorPtr& infoBatch = *infoBatch_;    // get the thread local object
-    if (!dataBatch) {
-      dataBatch = Matrix::create(cpySize, sampleDim_, false, useGpu_);
-      labelBatch = IVector::create(cpySize, useGpu_);
-      if (withInfo_) {
-        infoBatch = IVector::create(cpySize, 0);
-      }
-    } else {
-      dataBatch->resize(cpySize, sampleDim_);
-      labelBatch->resize(cpySize);
-      if (withInfo_) {
-        infoBatch->resize(cpySize);
-      }
-    }
-    dataBatch->copyFrom(data, cpySize * sampleDim_);
-    labelBatch->copyFrom(label, cpySize);
-    batch->appendData(dataBatch);
-    batch->appendLabel(labelBatch);
-    if (withInfo_) {
-      infoBatch->copyFrom(info, cpySize);
-      batch->appendLabel(infoBatch);
-    }
-  }
-
-  batch->setSize(cpySize);
-  return cpySize;
-}
-
-void SimpleDataProviderBase::reset() {
-  sampleNumInBuf_ = 0;
-  nextItemIndex_ = 0;
-  DataProvider::reset();
-}
-
-int64_t SimpleDataProviderBase::getSize() {
-  LOG(FATAL) << "Currently, not implemented";
-  return 0;
-}
-
-int64_t SimpleDataProviderBase::fillBuffer() {
-  int64_t n = sampleNumInBuf_ - nextItemIndex_;
-
-  /* flash the remaining data to the beginning of the buffer */
-  if (n > 0) {
-    hInputDataBuf_->copyFrom(
-        hInputDataBuf_->getData() + nextItemIndex_ * sampleDim_,
-        n * sampleDim_);
-    hInputLabelBuf_->copyFrom(hInputLabelBuf_->getData() + nextItemIndex_, n);
-    if (withInfo_) {
-      hInputInfoBuf_->copyFrom(hInputInfoBuf_->getData() + nextItemIndex_, n);
-    }
-  }
-
-  sampleNumInBuf_ =
-      n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
-                        hInputLabelBuf_->getData() + n,
-                        hInputInfoBuf_->getData() + n,
-                        bufferCapacity_ - n);
-
-  /* for stachastic gradient training */
-  if (!skipShuffle_) {
-    shuffle();
-  }
-
-  nextItemIndex_ = 0;
-
-  return sampleNumInBuf_;
-}
-
-SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
-    : SimpleDataProviderBase(config, useGpu, /* withInfo= */ false),
-      currentSampleIndex_(0) {
-  loadData(config_.files());
-}
-
-SimpleDataProvider::~SimpleDataProvider() {}
-
-int64_t SimpleDataProvider::fillBufferImp(real* data,
-                                          int* label,
-                                          int* info,
-                                          int64_t size) {
-  (void)info;
-  int64_t n = std::min<int64_t>(labels_.size() - currentSampleIndex_, size);
-  memcpy(data,
-         &data_[currentSampleIndex_ * sampleDim_],
-         n * sampleDim_ * sizeof(real));
-  memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
-  currentSampleIndex_ += n;
-
-  return n;
-}
-
-void SimpleDataProvider::reset() {
-  currentSampleIndex_ = 0;
-  SimpleDataProviderBase::reset();
-}
-
-void SimpleDataProvider::loadData(const std::string& fileName) {
-  std::ifstream is(fileName);
-  CHECK(is) << "Fail to open " << fileName;
-  std::string line;
-  while (is) {
-    if (!getline(is, line)) break;
-    LOG(INFO) << "load data file " << line;
-    loadDataFile(line);
-  }
-  LOG(INFO) << "read done, num of instance=" << labels_.size()
-            << " data size=" << data_.size();
-}
-
-void SimpleDataProvider::loadDataFile(const std::string& fileName) {
-  std::ifstream is(fileName);
-  std::string line;
-  std::vector<std::string> pieces;
-  while (is) {
-    if (!getline(is, line)) break;
-    str::split(line, ' ', &pieces);
-    CHECK_EQ((uint64_t)(sampleDim_ + 1), pieces.size())
-        << " Dimension mismatch, " << pieces.size() - 1 << " in " << fileName
-        << " " << sampleDim_ << " from config";
-    labels_.push_back(atoi(pieces[0].c_str()));
-    for (int i = 0; i < sampleDim_; ++i) {
-      data_.push_back(atof(pieces[i + 1].c_str()));
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
deleted file mode 100644
index 4851168abab7179d552648c88923a529d55e6a7e..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ /dev/null
@@ -1,480 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <fstream>
-#include <iostream>
-#include <memory>
-#include <mutex>
-#include <vector>
-
-#include "DataConfig.pb.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Argument.h"
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Queue.h"
-#include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-/**
- * @def REGISTER_DATA_PROVIDER
- * @brief Macro for registering a data provider. The class type should contain
- *        a consturctor with parameter (DataConfig, bool).
- */
-#define REGISTER_DATA_PROVIDER(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {                    \
-    DataProvider::registrar_.registerClass(                              \
-        #__type_name,                                                    \
-        [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
-          DataProvider* dp = new __class_name(conf, useGpu);             \
-          return dp;                                                     \
-        });                                                              \
-  })
-
-/**
- * @def REGISTER_DATA_PROVIDER_EX
- * @brief Macro for registering a data provider, which contains a constructor
- *        with parameter (DataConfig, ModelConfig, bool).
- */
-#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name)            \
-  static InitFunction __reg_type_##__type_name([] {                     \
-    DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-class DataBatch;
-class BufferBatch;
-typedef std::shared_ptr<DataBatch> DataBatchPtr;
-typedef std::shared_ptr<BufferBatch> BufferBatchPtr;
-/**
- * @brief Data for batch training a neural network
- */
-class DataBatch {
-public:
-  DataBatch() : size_(0) { data_.clear(); }
-  /**
-   * @brief Get batch size
-   * @return batch size
-   */
-  int64_t getSize() const { return size_; }
-  /**
-   * @brief Get num of sequences of sequence data
-   * @return num of sequences
-   */
-  int64_t getNumSequences() const {
-    if (data_.empty()) return size_;
-    return data_[0].sequenceStartPositions
-               ? data_[0].sequenceStartPositions->getSize() - 1
-               : size_;
-  }
-  /**
-   * @brief Set batch size
-   * @param[in] size size
-   */
-  void setSize(int64_t size) { size_ = size; }
-  /**
-   * @brief Get size of argument vector
-   * @return size of argument vector
-   * @note For usual supervised learning, input data and label is needed,
-   * then there will be two argument.
-   */
-  int64_t getNumStreams() const { return data_.size(); }
-
-  /**
-   * @brief Get a argument with index i
-   * @param[in] i index in argument vector
-   * @return a argument with index i
-   */
-  const Argument& getStream(int i) const { return data_[i]; }
-  /**
-   * @brief Get all argument
-   * @return an argument vector
-   */
-  std::vector<Argument>& getStreams() { return data_; }
-  /**
-   * @brief Get all argument const
-   * @return an argument vector
-   */
-  std::vector<Argument> getStreams() const { return data_; }
-  /**
-   * @brief Clear DataBatch
-   */
-  void clear() {
-    data_.clear();
-    size_ = 0;
-  }
-
-  /**
-   * @brief Append data to DataBatch
-   * @param[in] data  matrix data
-   * @note The order in which each data stream is appended must match the order
-   * specified in stream_names of DataConfig. The stream_names can be obtained
-   * using DataProvider::getStreamNames().
-   */
-  void appendData(MatrixPtr data) {
-    Argument argu;
-    argu.value = data;
-    data_.push_back(argu);
-  }
-
-  /**
-   * @brief Append sequence data to DataBatch
-   * @param[in] data                      matrix data
-   * @param[in] sequenceStartPositions    sequence data
-   * @note The order in which each data stream is appended must match the order
-   * specified in stream_names of DataConfig. The stream_names can be obtained
-   * using DataProvider::getStreamNames().
-   */
-  void appendData(const MatrixPtr& data,
-                  const ICpuGpuVectorPtr& sequenceStartPositions) {
-    Argument argu;
-    argu.value = data;
-    argu.sequenceStartPositions = sequenceStartPositions;
-    data_.push_back(argu);
-  }
-  /**
-   * @brief Append label data
-   * @param[in]  label    label data
-   * @param[in]  value    matrix data, default null
-   */
-  void appendLabel(IVectorPtr label, MatrixPtr value = nullptr) {
-    Argument argu;
-    argu.ids = label;
-    argu.value = value;
-    data_.push_back(argu);
-  }
-
-  /*
-   * @brief Append argument
-   * @param[in]  argus   DataBatch.getStreams()
-   * @param[in]  size    DataBatch.getSize()
-   * @param[in]  dataId  sub dataprovider id (in MultiDataProvider)
-   */
-  void appendArguments(const std::vector<Argument>& argus,
-                       int size,
-                       int dataId) {
-    size_ += size;
-    for (const auto& argu : argus) {
-      data_.push_back(argu);
-      data_.back().dataId = dataId;
-    }
-  }
-
-protected:
-  /**
-   * @brief batch size
-   */
-  int64_t size_;
-  /**
-   * @brief A batch data consist of a Argument vector,
-   * An argument corresponds to a type of input data.
-   */
-  std::vector<Argument> data_;
-};
-
-class BufferBatch {
-public:
-  BufferBatch() {
-    hlStream_ = HPPL_STREAM_DEFAULT;
-    hlEvent_ = NULL;
-    batchData_ = NULL;
-  }
-  ~BufferBatch() {
-    if (hlEvent_) {
-      hl_destroy_event(hlEvent_);
-      hlEvent_ = NULL;
-    }
-    delete batchData_;
-    batchData_ = NULL;
-  }
-
-  void setDataBatch(DataBatch* batchData) { batchData_ = batchData; }
-  DataBatch* getDataBatch() { return batchData_; }
-
-  void setCuStream(hl_stream_t stream) { hlStream_ = stream; }
-  hl_stream_t getCuStream() const { return hlStream_; }
-
-  void setCuEvent(hl_event_t event) { hlEvent_ = event; }
-
-  hl_event_t getCuEvent() const { return hlEvent_; }
-
-  void createCuEvent() {
-    if (!hlEvent_) {
-      hlStream_ = HPPL_STREAM_1;
-      hl_create_event(&hlEvent_);
-    }
-  }
-
-  void syncEvent() {
-    if (hlEvent_) {
-      hl_stream_wait_event(hlStream_, hlEvent_);
-    }
-  }
-
-  void swap(BufferBatch* bufBatch);
-  void clone(DataBatch* srcBatch, bool useGpu);
-
-protected:
-  DataBatch* batchData_;
-  hl_stream_t hlStream_;
-  hl_event_t hlEvent_;
-};
-
-class DataProvider;
-typedef std::shared_ptr<DataProvider> DataProviderPtr;
-
-typedef Queue<BufferBatch*> BufferBatchQueue;
-
-class DoubleBuffer {
-public:
-  DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
-  virtual ~DoubleBuffer();
-  void removeOneBatch(DataBatch* dataBatch);
-
-  void setBatchSize(int64_t newBatchSize) { batchSize_ = newBatchSize; }
-
-  int64_t getBatchSize() { return batchSize_; }
-
-  void startAsyncLoad();
-  void finishAsyncLoad() {
-    stopping_ = true;
-    taskReadySem_.post();
-    if (asyncLoader_) {
-      asyncLoader_->join();
-    }
-  }
-
-  void setPending(bool pending) { pending_ = pending; }
-
-protected:
-  virtual void asyncLoadBatch();
-  void insertOneBatch(DataBatch* batch);
-
-  DataProvider* dataPool_;
-  bool useGpu_;
-  int32_t batchSize_;
-  ThreadLocal<BufferBatchPtr> usingBatch_;
-  BufferBatchQueue* dataQueue_;
-  BufferBatchQueue* bufferQueue_;
-  std::unique_ptr<std::thread> asyncLoader_;
-  Semaphore taskReadySem_;
-  bool stopping_;
-  bool pending_;
-};
-
-/**
- * @brief Base class for DataProvider, which supplies data for training
- * @note It can supplies multiple streams of data.
- * For typical supervised training, there are two streams:
- * one is for input, one is for label.
- */
-class DataProvider {
-public:
-  static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_;
-  static DataProvider* create(const DataConfig& config,
-                              const ModelConfig& modelConfig,
-                              bool useGpu = FLAGS_use_gpu);
-
-  /**
-   * @brief create only used for unittest.
-   */
-  inline static DataProvider* create(const DataConfig& config,
-                                     bool useGpu = FLAGS_use_gpu) {
-    return create(config, ModelConfig(), useGpu);
-  }
-
-  DataProvider(const DataConfig& config, bool useGpu)
-      : config_(config),
-        skipShuffle_(false),
-        usageRatio_(config.usage_ratio()),
-        useGpu_(useGpu) {
-    if (config_.async_load_data()) {
-      initAsyncLoader();
-    }
-  }
-  virtual ~DataProvider() {}
-
-  const DataConfig& getConfig() const { return config_; }
-
-  void setSkipShuffle() { skipShuffle_ = true; }
-
-  /**
-   * @brief Get next batch of training samples
-   * @param[in]    size    size of training samples to get
-   * @param[out]   batch   a batch of training samples
-   * @return actual size of obtained training samples
-   */
-  int64_t getNextBatch(int64_t size, DataBatch* batch);
-
-  /**
-   * @brief Shuffle the data set
-   */
-  virtual void shuffle() = 0;
-
-  /**
-   * @brief reset all the value of index
-   * @note reset() must be called before any calls to getNextBatch()
-   * IMPORTANT: subclass reset() should always call the base class reset()
-   * at the end of the function
-   */
-  virtual void reset() {
-    if (doubleBuffer_ != nullptr) {
-      doubleBuffer_->startAsyncLoad();
-    }
-  }
-
-  /**
-   * @brief Get the size of training samples
-   * @return the number of training samples in the data set.
-   * @note return -1 to indicate unlimited number of samples.
-   */
-  virtual int64_t getSize() = 0;
-
-  /**
-   * @brief Get next batch training samples internally
-   * @param[in]    size      size of training samples to get
-   * @param[out]   batch     a batch of training samples
-   * @return actual size of obtained training samples
-   */
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0;
-
-protected:
-  DataConfig config_;
-  bool skipShuffle_;
-  float usageRatio_;
-  bool useGpu_;
-  std::unique_ptr<DoubleBuffer> doubleBuffer_;
-  ThreadLocal<std::vector<MatrixPtr>> constantSlots_;
-  /**
-   * @@brief Get next batch training samples from buffer
-   * @param[in]    size      size of training samples to get
-   * @param[out]   batch     a batch of training samples
-   * @return actual size of obtained training samples
-   */
-  int64_t getNextBatchFromBuffer(int64_t size, DataBatch* batch);
-
-  void initAsyncLoader();
-};
-
-/**
- * A data provider which does nothing. It only serves as providing
- * necessary configurations such as stream_names
- */
-class DummyDataProvider : public DataProvider {
-public:
-  DummyDataProvider(const DataConfig& config, bool useGpu)
-      : DataProvider(config, useGpu) {}
-  virtual void shuffle() {}
-  virtual void reset() { DataProvider::reset(); }
-  virtual int64_t getSize() { return 0; }
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) {
-    (void)size;
-    (void)batch;
-    return 0;
-  }
-};
-
-/**
- * Data provider for one input and one integer label.
- */
-class SimpleDataProviderBase : public DataProvider {
-protected:
-  /// sample feature dimension
-  int64_t sampleDim_;
-  /// the number of samples
-  int64_t bufferCapacity_;
-  int64_t sampleNumInBuf_;
-  /// next item to read in buffer
-  int64_t nextItemIndex_;
-  /// some user defined info for validation
-  bool withInfo_;
-
-  /// data buffer: bufferCapacity_ * nDataDim_
-  CpuMatrixPtr hInputDataBuf_;
-
-  /// label buffer:bufferCapacity_ * 1
-  CpuIVectorPtr hInputLabelBuf_;
-
-  /// info buffer:bufferCapacity_ * 1
-  CpuIVectorPtr hInputInfoBuf_;
-
-  ThreadLocal<MatrixPtr> dataBatch_;
-  ThreadLocal<IVectorPtr> labelBatch_;
-  ThreadLocal<IVectorPtr> infoBatch_;
-
-  RWLock lock_;
-
-public:
-  SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo);
-  ~SimpleDataProviderBase() {}
-
-  void shuffle();
-
-  virtual void reset();
-
-  virtual int64_t getSize();
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-  /// return the number of samples in the buffer
-  int64_t fillBuffer();
-
-protected:
-  /**
-   * @brief Fill at most size samples into data and label.
-   *
-   * Each input is stored in contiguous memory locations in data.
-   *
-   * data[n * sampleDim_] .. data[n * sampleDim_ + sampleDim_ - 1] is for
-   * the input of the n-th sample.
-   *
-   * label[n] is the label for the n-th sample.
-   */
-  virtual int64_t fillBufferImp(real* data,
-                                int* label,
-                                int* info,
-                                int64_t size) = 0;
-};
-
-class SimpleDataProvider : public SimpleDataProviderBase {
-public:
-  SimpleDataProvider(const DataConfig& config, bool useGpu);
-  ~SimpleDataProvider();
-  virtual void reset();
-
-protected:
-  void loadData(const std::string& fileName);
-  void loadDataFile(const std::string& fileName);
-  virtual int64_t fillBufferImp(real* data,
-                                int* label,
-                                int* info,
-                                int64_t size);
-
-protected:
-  size_t currentSampleIndex_;
-  std::vector<int> labels_;
-  std::vector<real> data_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
deleted file mode 100644
index 768e54fe82bedd6faca5ad9eb2b6f2ee0017dc3d..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "DataProvider.h"
-
-namespace paddle {
-
-template <class T>
-class DataProviderGroup : public DataProvider {
-protected:
-  typedef T ProviderType;
-  typedef std::shared_ptr<ProviderType> ProviderPtrType;
-  ProviderPtrType provider_;
-
-  std::vector<std::string> fileList_;
-  std::mutex lock_;
-  std::unique_ptr<MultiThreadWorker<ProviderType>> loader_;
-
-public:
-  DataProviderGroup(const DataConfig& config, bool useGpu);
-  ~DataProviderGroup() {}
-
-  virtual void reset();
-  virtual void shuffle() {}
-  virtual int64_t getSize() { return -1; }
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-private:
-  void startLoader();
-  void stopLoader();
-  void forceStopLoader();
-  ProviderPtrType loadFile(const std::vector<std::string>& fileList);
-};
-
-template <class T>
-DataProviderGroup<T>::DataProviderGroup(const DataConfig& config, bool useGpu)
-    : DataProvider(config, useGpu) {
-  // load file list
-  loadFileList(config_.files(), fileList_);
-  CHECK_GT(fileList_.size(), 0LU);
-  LOG(INFO) << "load file list, numfiles=" << fileList_.size()
-            << ", max_num_of_data_providers_in_memory="
-            << (1 + config_.file_group_conf().queue_capacity() +
-                config_.file_group_conf().load_thread_num());
-}
-
-template <class T>
-void DataProviderGroup<T>::reset() {
-  forceStopLoader();
-  CHECK(!loader_);
-  provider_ = nullptr;
-
-  // shuffle file list
-  std::shuffle(
-      fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
-
-  startLoader();
-  DataProvider::reset();
-}
-
-template <class T>
-int64_t DataProviderGroup<T>::getNextBatchInternal(int64_t size,
-                                                   DataBatch* batch) {
-  std::lock_guard<std::mutex> guard(lock_);
-
-  if (!loader_) {
-    return 0;
-  }
-  if (provider_) {
-    int64_t ret = provider_->getNextBatchInternal(size, batch);
-    if (ret > 0) {
-      return ret;
-    }
-  }
-
-  // else get data from next data provider
-  if (loader_->testResult()) {
-    LOG(INFO) << "WAIT provider";
-  }
-  provider_ = loader_->waitResult();
-  if (!provider_) {
-    stopLoader();  // All the data providers have been returned
-    return 0;
-  }
-  int64_t ret = provider_->getNextBatchInternal(size, batch);
-  CHECK(ret > 0) << "new data provider does not contain any valid samples!";
-  return ret;
-}
-
-template <class T>
-void DataProviderGroup<T>::startLoader() {
-  loader_.reset(new MultiThreadWorker<ProviderType>(
-      config_.file_group_conf().load_thread_num(),
-      config_.file_group_conf().queue_capacity()));
-
-  int loadFileCount = config_.file_group_conf().load_file_count();
-  for (size_t startPos = 0; startPos < fileList_.size();
-       startPos += loadFileCount) {
-    size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
-    std::vector<std::string> fileVec(fileList_.begin() + startPos,
-                                     fileList_.begin() + endPos);
-    loader_->addJob([this, fileVec]() -> ProviderPtrType {
-      return this->loadFile(fileVec);
-    });
-  }
-  loader_->stopAddJob();
-}
-
-template <class T>
-void DataProviderGroup<T>::stopLoader() {
-  if (loader_) {
-    loader_->stop();
-    loader_ = nullptr;
-  }
-}
-
-template <class T>
-void DataProviderGroup<T>::forceStopLoader() {
-  if (loader_) {
-    loader_->forceStop();
-    loader_ = nullptr;
-  }
-}
-
-template <class T>
-std::shared_ptr<T> DataProviderGroup<T>::loadFile(
-    const std::vector<std::string>& fileList) {
-  // disable async_load_data in sub dataprovider
-  DataConfig subConfig = config_;
-  subConfig.set_async_load_data(false);
-
-  CHECK(!fileList.empty()) << "fileList is empty";
-  ProviderPtrType provider =
-      std::make_shared<ProviderType>(subConfig, useGpu_, false);
-  provider->loadData(fileList);
-  provider->reset();
-  return provider;
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp
deleted file mode 100644
index f71947ef3946284b7ecfb50851100fe43bd78857..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultiDataProvider.h"
-#include <algorithm>
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-using namespace std;
-
-MultiDataProvider::MultiDataProvider(const DataConfig& config,
-                                     const ModelConfig& modelConfig,
-                                     bool useGpu)
-    : DataProvider(config, useGpu) {
-  bool atLeastOneMainDataFlag = false;
-  totalDataRatio_ = 0;
-  LOG(INFO) << "MultiDataProvider: sub data provider size: "
-            << config.sub_data_configs_size();
-  LOG(INFO) << "MultiDataProvider: for_test: " << config.for_test();
-  isTestMode_ = config.for_test();
-  for (int i = 0; i < config.sub_data_configs_size(); i++) {
-    LOG(INFO) << "dataRatio of sub(" << i
-              << ") is: " << config.sub_data_configs(i).data_ratio();
-    totalDataRatio_ += config.sub_data_configs(i).data_ratio();
-    if (config.sub_data_configs(i).is_main_data()) {
-      LOG(INFO) << "main data is [" << i << "]";
-      atLeastOneMainDataFlag = true;
-    }
-  }
-  CHECK(atLeastOneMainDataFlag) << "all sub dataproviders in MultiData do not"
-                                << " have is_main_data flag";
-  LOG(INFO) << "totalDataRatio_=" << totalDataRatio_;
-  DataConfig subConfig;
-  int subDataProviderCount = config.sub_data_configs_size();
-  if (isTestMode()) {
-    LOG(INFO) << "construct MultiDataProvider in test mode";
-  } else {
-    LOG(INFO) << "construct MultiDataProvider in train mode";
-  }
-  subDataProviders_.resize(subDataProviderCount);
-  for (int i = 0; i < subDataProviderCount; i++) {
-    subConfig = config.sub_data_configs(i);
-    if (subConfig.async_load_data()) {
-      LOG(INFO) << "can not use async_load_data in sub dataprovider of "
-                   "MultiDataProvider";
-      subConfig.set_async_load_data(false);
-    }
-    subDataProviders_[i] = std::unique_ptr<DataProvider>(
-        DataProvider::create(subConfig, modelConfig, useGpu_));
-  }
-}
-
-void MultiDataProvider::reset() {
-  for (auto& elem : subDataProviders_) {
-    elem->reset();
-  }
-  DataProvider::reset();
-}
-
-void MultiDataProvider::shuffle() {
-  for (auto& elem : subDataProviders_) {
-    elem->shuffle();
-  }
-}
-
-int64_t MultiDataProvider::getNextBatchInternal(int64_t size,
-                                                DataBatch* batch) {
-  batch->clear();
-  for (size_t i = 0; i < subDataProviders_.size(); ++i) {
-    // calc size according to data ratio
-    int64_t subSize =
-        (int64_t)(1.0 * size * config_.sub_data_configs(i).data_ratio() /
-                  totalDataRatio_);
-    DataBatch subBatch;
-    int64_t realSize =
-        subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch);
-    if (realSize == 0) {
-      // current subDataProvider has no data
-      if (!isTestMode()) {
-        // in train mode
-        if (config_.sub_data_configs(i).is_main_data()) {
-          // is main data provider. then return 0
-          batch->clear();
-          return 0;
-        } else {
-          // not main data provider, reset current subDataProvider and try again
-          subDataProviders_[i]->reset();
-          subBatch.clear();
-          realSize =
-              subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch);
-          CHECK_GT(realSize, 0);
-        }
-      } else {
-        // in test mode, make an empty argument
-        Argument emptyArgu;
-        std::vector<Argument> argus;
-        argus.push_back(emptyArgu);
-        batch->appendArguments(argus, 0, -1);
-        continue;
-      }
-    }
-    batch->appendArguments(subBatch.getStreams(), subBatch.getSize(), i);
-  }
-  return batch->getSize();
-}
-
-REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider);
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
deleted file mode 100644
index 9a863c896773d71a99e21660fc13e3dd477a0c12..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "DataProvider.h"
-
-namespace paddle {
-
-class MultiDataProvider : public DataProvider {
-protected:
-  std::vector<std::unique_ptr<DataProvider>> subDataProviders_;
-
-public:
-  MultiDataProvider(const DataConfig& config,
-                    const ModelConfig& modelConfig,
-                    bool useGpu);
-  ~MultiDataProvider() {}
-  virtual void reset();
-  virtual void shuffle();
-  virtual int64_t getSize() { return -1; }
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-  bool isTestMode() const { return isTestMode_; }
-
-private:
-  int totalDataRatio_;
-  bool isTestMode_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
deleted file mode 100644
index 786703f4dee4802bb967f9d15fb69ebcbc15d997..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include <google/protobuf/io/coded_stream.h>
-#include <google/protobuf/io/gzip_stream.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
-#include <google/protobuf/message_lite.h>
-
-namespace paddle {
-
-/**
- * ProtoReader/ProtoWriter are used to read/write a sequence of protobuf
- * messages from/to i/ostream.
- */
-class ProtoReader {
-public:
-  explicit ProtoReader(std::istream* s, bool dataCompression = false) {
-    CHECK(s) << "istream pointer is nullptr";
-    istreamInput_.reset(new google::protobuf::io::IstreamInputStream(s));
-    if (dataCompression) {
-      gzipInput_.reset(
-          new google::protobuf::io::GzipInputStream(istreamInput_.get()));
-      codedInput_.reset(
-          new google::protobuf::io::CodedInputStream(gzipInput_.get()));
-    } else {
-      codedInput_.reset(
-          new google::protobuf::io::CodedInputStream(istreamInput_.get()));
-    }
-    dataCompression_ = dataCompression;
-    approximateReadedBytes_ = 0;
-    codedInput_->SetTotalBytesLimit(kDefaultTotalBytesLimit,
-                                    kDefaultTotalBytesLimit);
-  }
-
-  /**
-   * read one message
-   */
-  bool read(google::protobuf::MessageLite* msg) {
-    if (approximateReadedBytes_ >= kMaxLimitBytes) {
-      // Once bytes we read get close to 64MB(larger than 55MB),
-      // we re-intialize the codedInputStream object.
-      approximateReadedBytes_ = 0;
-
-      /**
-       * Explicitly destroys the object owned by unique_ptr at first and then
-       * construct an new object.
-       *
-       * 1.reset()
-       *
-       * 2.reset(new ...)   <-- such sequence is EXTREAMLY important!
-       *
-       * Reason: (!!!Read me before you modify the following 2 lines of
-       * codes!!!)
-       *
-       * Otherwise, reset() method will ask the CodedInputStream constructor
-       * to construct the new object at first forcing the IstreamInputStream
-       * object to move its underlying pointer to the next 8192 bytes.
-       *
-       * Then the old object will be destroied calling
-       * IstreamInputStream::BackUp() to move the underlying pointer back.
-       * This means that the InstreamInputStream object is referenced by
-       * 2 different CodedInputStream object at the same time which "confuses"
-       * the position of istreamInput_'s underlying pointer. Such fatal
-       * confusion will lead to undefined behaviour when 'codedInput_' is
-       * used to read new data.
-       *
-       */
-      codedInput_.reset();
-      if (dataCompression_) {
-        codedInput_.reset(
-            new google::protobuf::io::CodedInputStream(gzipInput_.get()));
-      } else {
-        codedInput_.reset(
-            new google::protobuf::io::CodedInputStream(istreamInput_.get()));
-      }
-      codedInput_->SetTotalBytesLimit(kDefaultTotalBytesLimit,
-                                      kDefaultTotalBytesLimit);
-    }
-
-    uint32_t size;
-    if (!codedInput_->ReadVarint32(&size)) {
-      return false;
-    }
-    google::protobuf::io::CodedInputStream::Limit limit =
-        codedInput_->PushLimit(size);
-    CHECK(msg->ParseFromCodedStream(codedInput_.get()));
-    codedInput_->PopLimit(limit);
-
-    /**
-     * size is varint in the data file, we don't know the length.
-     * We assume every size takes 4 bytes in the data file.
-     */
-    approximateReadedBytes_ += 4 + size;
-    return true;
-  }
-
-protected:
-  std::unique_ptr<google::protobuf::io::ZeroCopyInputStream> istreamInput_;
-  std::unique_ptr<google::protobuf::io::GzipInputStream> gzipInput_;
-  std::unique_ptr<google::protobuf::io::CodedInputStream> codedInput_;
-  bool dataCompression_;
-
-  /**
-   * This is the maximum number of bytes that this CodedInputStream will read
-   * before refusing to continue.
-   */
-  static const int kDefaultTotalBytesLimit = 64 << 20;  // 64MB
-
-  /**
-   * If data readed by the reader is more than 55MB( << 64MB),
-   * we reset the CodedInputStream object.
-   * This can help avoid 64MB warning which will cause the ParseFromCodedStream
-   * to fail.
-   */
-  static const int kMaxLimitBytes = 55 << 20;
-
-  /**
-   * This variable dosen't store the exact bytes readed by CodedInputStream
-   * object since which is constructed. Instead, it store the approximate bytes
-   * because we can't tell how many bytes are readed by the object with the
-   * help of API.
-   *
-   * @note this code depends on protobuf 2.4.0. There is nothing like
-   * CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many
-   * bytes has the object readed so far. Therefore, we calculated bytes
-   * ourselves.
-   */
-  int approximateReadedBytes_;
-};
-
-class ProtoWriter {
-public:
-  explicit ProtoWriter(std::ostream* s, bool dataCompression = false) {
-    CHECK(s) << "ostream pointer is nullptr";
-    ostreamOutput_.reset(new google::protobuf::io::OstreamOutputStream(s));
-    if (dataCompression) {
-      gzipOutput_.reset(
-          new google::protobuf::io::GzipOutputStream(ostreamOutput_.get()));
-      codedOutput_.reset(
-          new google::protobuf::io::CodedOutputStream(gzipOutput_.get()));
-    } else {
-      codedOutput_.reset(
-          new google::protobuf::io::CodedOutputStream(ostreamOutput_.get()));
-    }
-  }
-
-  /**
-   * write one message.
-   */
-  bool write(const google::protobuf::MessageLite& msg) {
-    codedOutput_->WriteVarint32(msg.ByteSize());
-    bool ret = msg.SerializeToCodedStream(codedOutput_.get());
-    return ret;
-  }
-
-protected:
-  std::unique_ptr<google::protobuf::io::ZeroCopyOutputStream> ostreamOutput_;
-  std::unique_ptr<google::protobuf::io::GzipOutputStream> gzipOutput_;
-  std::unique_ptr<google::protobuf::io::CodedOutputStream> codedOutput_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
deleted file mode 100644
index dadf1b4cf27f248c7353aaad50dc22d4f6431cca..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ /dev/null
@@ -1,498 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PyDataProvider.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-#ifndef PADDLE_NO_PYTHON
-REGISTER_DATA_PROVIDER(py, PyDataProvider);
-#endif
-
-PyDataProvider::PyDataProvider(const DataConfig& config,
-                               bool useGpu,
-                               bool loadDataAll)
-    : DataProvider(config, useGpu), batchSize_(0) {
-  PyGuard guard;
-  pyModuleName_ = config_.load_data_module();
-  pyClassName_ = config_.load_data_object();
-  if (config_.load_data_args() != "") {
-    pyUserArgs_["load_data_args"] = config_.load_data_args();
-  }
-
-  if (loadDataAll) {
-    std::vector<std::string> fileList;
-    if (!config_.files().empty()) {
-      loadFileList(config_.files(), fileList);
-    }
-    loadData(fileList);
-  }
-}
-
-void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
-  VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_;
-  classInstance_ =
-      createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
-  CHECK(classInstance_) << "Create class instance failed.";
-  PyObjectPtr obj(PyObject_CallMethod(
-      classInstance_.get(), const_cast<char*>("getHeader"), NULL));
-  CHECK_PY(obj) << "Call function getHeader failed.";
-  std::string headerInfo =
-      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
-  parseHeaderData(headerInfo);
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-}
-
-void PyDataProvider::parseHeaderData(const std::string& headerData) {
-  char* pHeader = const_cast<char*>(headerData.c_str());
-  char* pHeaderEnd = pHeader + headerData.size();
-  slotNum_ = readT<unsigned int>(pHeader, pHeaderEnd);
-  unsigned int useSequenceFlag = readT<unsigned int>(pHeader, pHeaderEnd);
-  isIID_ = useSequenceFlag != 1;
-  slots_.clear();
-  slots_.reserve(slotNum_);
-  for (size_t i = 0; i < slotNum_; ++i) {
-    unsigned int slotType = readT<unsigned int>(pHeader, pHeaderEnd);
-    unsigned int slotDim = readT<unsigned int>(pHeader, pHeaderEnd);
-    slots_.emplace_back();
-    slots_.back().dim = slotDim;
-    slots_.back().type = static_cast<SlotDef_SlotType>(slotType);
-  }
-}
-
-void PyDataProvider::resetSlots() {
-  for (auto& slot : slots_) {
-    slot.indexData.clear();
-    slot.denseData.clear();
-    slot.sparseNonValueData.clear();
-    slot.sparseFloatValueData.clear();
-    slot.indices.clear();
-    slot.sequenceStartPositions.clear();
-    slot.sampleSequenceIdVec.clear();
-    slot.subSequenceStartPositions.clear();
-    slot.strData.clear();
-  }
-}
-
-void PyDataProvider::fillDenseSlot(ProtoSlot& slot,
-                                   char*& data,
-                                   const char* dataEnd) {
-  unsigned int dim = slot.dim;
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  slot.denseData.resize(slot.sampleNum * dim);
-#ifdef PADDLE_TYPE_DOUBLE
-  CHECK_LE(data + sizeof(real) * dim * slot.sampleNum, dataEnd)
-      << "std::copy data is out of range";
-  // PyDataProvider always provide data in float
-  float* dat = reinterpret_cast<float*>(data);
-  std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin());
-#else
-  memcpyWithCheck(slot.denseData.data(),
-                  data,
-                  sizeof(real) * dim * slot.sampleNum,
-                  dataEnd);
-#endif
-  // PyDataProvider always provide data in float
-  data += sizeof(float) * dim * slot.sampleNum;
-}
-
-void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot,
-                                            char*& data,
-                                            const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  unsigned int* indexPtr = (unsigned int*)data;
-  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
-      << "Vector assign value is out of range";
-  slot.indices.assign(indexPtr, indexPtr + slot.sampleNum);
-  data += sizeof(unsigned int) * slot.sampleNum;
-  unsigned int length = 0;
-  length = readT<unsigned int>(data, dataEnd);
-  slot.indices.push_back(length);
-  slot.sparseNonValueData.resize(length);
-  memcpyWithCheck(slot.sparseNonValueData.data(),
-                  data,
-                  sizeof(unsigned int) * length,
-                  dataEnd);
-  data += sizeof(unsigned int) * length;
-}
-
-void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot,
-                                         char*& data,
-                                         const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  unsigned int* indexPtr = (unsigned int*)data;
-  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
-      << "Vector assign value is out of range";
-  slot.indices.assign(indexPtr, indexPtr + slot.sampleNum);
-  data += sizeof(unsigned int) * slot.sampleNum;
-  unsigned int length = 0;
-  length = readT<unsigned int>(data, dataEnd);
-  unsigned int* colPtr = reinterpret_cast<unsigned int*>(data);
-  CHECK_LE(data + sizeof(unsigned int) * length, dataEnd)
-      << "Data is out of range";
-  data += sizeof(unsigned int) * length;
-  size_t colLen = readT<unsigned int>(data, dataEnd);
-  CHECK_EQ(colLen, length);
-  float* valuePtr = reinterpret_cast<float*>(data);
-  CHECK_LE(data + sizeof(real) * length, dataEnd) << "Data is out of range";
-  data += sizeof(real) * length;
-  slot.indices.push_back(length);
-  slot.sparseFloatValueData.resize(length);
-  for (unsigned int ii = 0; ii < length; ++ii) {
-    slot.sparseFloatValueData[ii].col = colPtr[ii];
-    slot.sparseFloatValueData[ii].value = valuePtr[ii];
-  }
-}
-
-void PyDataProvider::fillIndexSlot(ProtoSlot& slot,
-                                   char*& data,
-                                   const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
-      << "Vector assign is out of range";
-  slot.indexData.assign(reinterpret_cast<int*>(data),
-                        reinterpret_cast<int*>(data) + slot.sampleNum);
-  data += sizeof(unsigned int) * slot.sampleNum;
-}
-
-void PyDataProvider::fillStringSlot(ProtoSlot& slot,
-                                    char*& data,
-                                    const char* dataEnd) {
-  slot.sampleNum = readT<unsigned int>(data, dataEnd);
-  for (unsigned int i = 0; i < slot.sampleNum; ++i) {
-    size_t len = readT<uint32_t>(data, dataEnd);
-    auto str_begin = data;
-    data += len;
-    CHECK_LE(data, dataEnd) << "Data is out of range";
-    slot.strData.emplace_back(str_begin, len);
-  }
-}
-
-void PyDataProvider::fillSlotsByStr(const std::string& samples) {
-  char* data = const_cast<char*>(samples.c_str());
-  char* dataEnd = data + samples.size();
-  batchSize_ = readT<unsigned int>(data, dataEnd);
-  if (0 == batchSize_) {
-    return;
-  }
-
-  for (size_t j = 0; j < slotNum_; ++j) {
-    auto& slot = slots_[j];
-    CHECK(SlotDef::INDEX >= slot.type || SlotDef::STRING == slot.type)
-        << " Slot type:" << slot.type << " is out of range.";
-    CHECK_GE(slot.type, SlotDef::VECTOR_DENSE) << " Slot type:" << slot.type
-                                               << " is out of range.";
-    switch (slot.type) {
-      case SlotDef::VECTOR_DENSE:
-        fillDenseSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-        fillSparseNonValueSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::VECTOR_SPARSE_VALUE:
-        fillSparseValueSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::INDEX:
-        fillIndexSlot(slot, data, dataEnd);
-        break;
-      case SlotDef::VAR_MDIM_DENSE:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::VAR_MDIM_INDEX:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::STRING:
-        fillStringSlot(slot, data, dataEnd);
-        break;
-    }
-  }
-  // read sequenceStartPositions
-  for (size_t j = 0; j < slotNum_; ++j) {
-    auto& slot = slots_[j];
-    if (!iidData()) {
-      unsigned int sequenceNum = readT<unsigned int>(data, dataEnd);
-      slot.sequenceNum = sequenceNum;
-      for (size_t i = 0; i < sequenceNum; ++i) {
-        slot.sequenceStartPositions.push_back(
-            readT<unsigned int>(data, dataEnd));
-      }
-      for (size_t i = 0; i < sequenceNum; ++i) {
-        size_t begin = slot.sequenceStartPositions[i];
-        size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1]
-                                           : slot.sampleNum;
-        for (size_t ii = begin; ii < end; ++ii) {
-          slot.sampleSequenceIdVec.push_back(ii);
-        }
-      }
-    } else {
-      for (size_t i = 0; i < slot.sampleNum; ++i) {
-        slot.sampleSequenceIdVec.push_back(i);
-      }
-    }
-  }
-  // read subSequenceStartPositions, not all slots have this infomation.
-  for (size_t j = 0; j < slotNum_; ++j) {
-    auto& slot = slots_[j];
-    if (!iidData() && data != dataEnd) {
-      unsigned int subSequenceNum = readT<unsigned int>(data, dataEnd);
-      slot.subSequenceNum = subSequenceNum;
-      for (size_t i = 0; i < subSequenceNum; ++i) {
-        slot.subSequenceStartPositions.push_back(
-            readT<unsigned int>(data, dataEnd));
-      }
-    }
-  }
-}
-
-void PyDataProvider::reset() {
-  {  // Invoke PyDataProvider Reset
-    PyGuard guard;
-    PyObjectPtr obj(PyObject_CallMethod(
-        classInstance_.get(), const_cast<char*>("reset"), NULL));
-    CHECK_PY(obj) << "Call function reset failed.";
-  }
-
-  if (!skipShuffle_) {
-    // Invoke PyDataProvider Shuffle
-    shuffle();
-  }
-  DataProvider::reset();
-}
-
-void PyDataProvider::shuffle() {
-  // py shuffle
-  PyGuard guard;
-  PyObjectPtr obj(PyObject_CallMethod(
-      classInstance_.get(), const_cast<char*>("shuffle"), NULL));
-  CHECK_PY(obj) << "Call function shuffle failed.";
-}
-
-void PyDataProvider::handleDenseSlot(ProtoSlot& slot,
-                                     size_t slotIndex,
-                                     std::vector<Argument>& cpuArguments) {
-  unsigned int dim = slot.dim;
-  Matrix::resizeOrCreate(cpuArguments[slotIndex].value,
-                         slot.sampleNum,
-                         dim,
-                         false,   // trans = false
-                         false);  // useGpu = false
-  real* buf = cpuArguments[slotIndex].value->getData();
-  for (size_t i = 0; i < slot.sampleNum; ++i) {
-    memcpyWithCheck(buf + i * dim,
-                    slot.denseData.data() + slot.sampleSequenceIdVec[i] * dim,
-                    sizeof(real) * dim,
-                    slot.denseData.data() + slot.denseData.size());
-  }
-}
-
-void PyDataProvider::handleSparseNonValueSlot(
-    ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
-  unsigned int dim = slot.dim;
-  if (!(cpuArguments[slotIndex].value)) {
-    cpuArguments[slotIndex].value =
-        Matrix::createSparseMatrix(slot.sampleNum,
-                                   dim,
-                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
-                                   NO_VALUE,
-                                   SPARSE_CSR,
-                                   false,
-                                   useGpu_);
-  }
-  auto mat = cpuArguments[slotIndex].value;
-  mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
-  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseNonValueData.data(),
-        HPPL_STREAM_1);
-  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseNonValueData.data());
-  } else {
-    LOG(FATAL) << "Not Supported";
-  }
-}
-
-void PyDataProvider::handleSparseValueSlot(
-    ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
-  unsigned int dim = slot.dim;
-  if (!(cpuArguments[slotIndex].value)) {
-    cpuArguments[slotIndex].value =
-        Matrix::createSparseMatrix(slot.sampleNum,
-                                   dim,
-                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
-                                   FLOAT_VALUE,
-                                   SPARSE_CSR,
-                                   false,
-                                   useGpu_);
-  }
-  auto mat = cpuArguments[slotIndex].value;
-  mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
-  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseFloatValueData.data(),
-        HPPL_STREAM_DEFAULT);
-  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-        slot.sampleSequenceIdVec.data(),
-        slot.indices.data(),
-        slot.sparseFloatValueData.data());
-  } else {
-    LOG(FATAL) << "Not Supported";
-  }
-}
-
-void PyDataProvider::handleIndexSlot(ProtoSlot& slot,
-                                     size_t slotIndex,
-                                     std::vector<Argument>& cpuArguments) {
-  IVector::resizeOrCreate(cpuArguments[slotIndex].ids,
-                          slot.sampleNum,
-                          /*useGpu_*/ false);
-  int* buf = cpuArguments[slotIndex].ids->getData();
-  for (size_t i = 0; i < slot.sampleNum; ++i) {
-    buf[i] = slot.indexData[slot.sampleSequenceIdVec[i]];
-  }
-}
-
-void PyDataProvider::handleStringSlot(ProtoSlot& slot,
-                                      size_t slotIndex,
-                                      std::vector<Argument>& cpuArguments) {
-  if (cpuArguments[slotIndex].strs) {
-    cpuArguments[slotIndex].strs->resize(slot.sampleNum);
-  } else {
-    cpuArguments[slotIndex].strs =
-        std::make_shared<std::vector<std::string>>(slot.sampleNum);
-  }
-  for (size_t i = 0; i < slot.sampleNum; ++i) {
-    (*cpuArguments[slotIndex].strs)[i] =
-        slot.strData[slot.sampleSequenceIdVec[i]];
-  }
-}
-
-int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
-  PyGuard guard;
-  PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
-                                      const_cast<char*>("getNextBatch"),
-                                      const_cast<char*>("i"),
-                                      size));
-  CHECK_PY(obj) << "Call function getNextBatch failed.";
-  const std::string& samples =
-      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
-  resetSlots();
-  fillSlotsByStr(samples);
-  size = batchSize_;
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(slotNum_);
-
-  if (!iidData()) {
-    for (size_t j = 0; j < slotNum_; ++j) {
-      auto& slot = slots_[j];
-      ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions,
-                                    slot.sequenceNum + 1,
-                                    /* useGpu= */ false);
-      int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false);
-      std::copy(slot.sequenceStartPositions.begin(),
-                slot.sequenceStartPositions.end(),
-                buf);
-      buf[slot.sequenceStartPositions.size()] = slot.sampleNum;
-
-      if (slot.subSequenceStartPositions.size()) {
-        ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions,
-                                      slot.subSequenceNum + 1,
-                                      /*  useGpu= */ false);
-        int* buf =
-            cpuArguments[j].subSequenceStartPositions->getMutableData(false);
-        std::copy(slot.subSequenceStartPositions.begin(),
-                  slot.subSequenceStartPositions.end(),
-                  buf);
-        buf[slot.subSequenceNum] = slot.sampleNum;
-        // check subSequenceStartPositions and sequenceStartPositions
-        cpuArguments[j].checkSubset();
-      }
-    }
-  }
-
-  for (size_t slotIndex = 0; slotIndex < slotNum_; ++slotIndex) {
-    auto& slot = slots_[slotIndex];
-    SlotDef::SlotType slotType = slot.type;
-    switch (slotType) {
-      case SlotDef::VECTOR_DENSE:
-        handleDenseSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-        handleSparseNonValueSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::VECTOR_SPARSE_VALUE:
-        handleSparseValueSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::INDEX:
-        handleIndexSlot(slot, slotIndex, cpuArguments);
-        break;
-      case SlotDef::VAR_MDIM_DENSE:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::VAR_MDIM_INDEX:
-        LOG(FATAL) << "Not implemented";
-        break;
-      case SlotDef::STRING:
-        handleStringSlot(slot, slotIndex, cpuArguments);
-        break;
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (size_t i = 0; i < slotNum_; ++i) {
-      SlotDef::SlotType slotType = slots_[i].type;
-      if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
-          SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
-        gpuArguments[i] = cpuArguments[i];
-        gpuArguments[i].sequenceStartPositions =
-            cpuArguments[i].sequenceStartPositions;
-
-        if (slots_[i].subSequenceStartPositions.size()) {
-          gpuArguments[i].subSequenceStartPositions =
-              cpuArguments[i].subSequenceStartPositions;
-        }
-      } else {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  return batch->getSize();
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h
deleted file mode 100644
index e53354c9e43ea9dc58fd4bd38a533025b6f17482..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/PyDataProvider.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <paddle/utils/PythonUtil.h>
-#include "DataFormat.pb.h"
-#include "DataProvider.h"
-
-#include <vector>
-
-namespace paddle {
-
-class PyDataProvider : public DataProvider {
-public:
-  PyDataProvider(const DataConfig& config,
-                 bool useGpu,
-                 bool loadDataAll = true);
-
-  virtual void reset();
-
-  // Note this size includes the sequences which are skipped because they
-  // are longer than the batch size
-  virtual int64_t getSize() {
-    LOG(FATAL) << "Not implement yet";
-    return -1;
-  }
-  virtual void shuffle();
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-protected:
-  struct ProtoSlot;
-  // return false if each each sample is one sequence, i.e., independent
-  // of other samples.
-  inline bool iidData() const { return isIID_; }
-
-  void parseHeaderData(const std::string& headerData);
-  void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillSparseNonValueSlot(ProtoSlot& slot,
-                              char*& data,
-                              const char* dataEnd);
-  void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
-  void fillSlotsByStr(const std::string& samples);
-  void handleDenseSlot(ProtoSlot& slot,
-                       size_t slotIndex,
-                       std::vector<Argument>& cpuArguments);
-  void handleSparseNonValueSlot(ProtoSlot& slot,
-                                size_t slotIndex,
-                                std::vector<Argument>& cpuArguments);
-  void handleSparseValueSlot(ProtoSlot& slot,
-                             size_t slotIndex,
-                             std::vector<Argument>& cpuArguments);
-  void handleIndexSlot(ProtoSlot& slot,
-                       size_t slotIndex,
-                       std::vector<Argument>& cpuArguments);
-  void handleStringSlot(ProtoSlot& slot,
-                        size_t slotIndex,
-                        std::vector<Argument>& cpuArguments);
-  void resetSlots();
-  void loadData(const std::vector<std::string>& fileList);
-
-protected:
-  struct ProtoSlot {
-    SlotDef::SlotType type;
-    int dim;
-    unsigned int sampleNum;
-    unsigned int sequenceNum;
-    unsigned int subSequenceNum;
-    // Store the data of index type slot
-    std::vector<int> indexData;
-    // Store the data of dense type slot
-    std::vector<real> denseData;
-    // Store the data of sparseNonValue type slot
-    std::vector<sparse_non_value_t> sparseNonValueData;
-    // Store the data of sparseValue type slot
-    std::vector<sparse_float_value_t> sparseFloatValueData;
-    // Used to store the index of each sample in slot values
-    std::vector<int64_t> indices;
-    // The starting position of each sequence in samples
-    // The last element should be the number of samples
-    // If empty, each sample is one sequence.
-    std::vector<size_t> sequenceStartPositions;
-    // The index id of sequences in slot
-    std::vector<int64_t> sampleSequenceIdVec;
-    // The starting position of each subsequence in samples
-    // The last element should be the number of subsequence
-    // If empty, each sequence of sample has no subsequence.
-    std::vector<size_t> subSequenceStartPositions;
-    // Store the data of string type slot
-    std::vector<std::string> strData;
-  };
-  std::vector<ProtoSlot> slots_;
-
-  PyObjectPtr classInstance_;
-  unsigned int batchSize_;
-  unsigned int slotNum_;
-  // if use sequence, isIID_ equals false, otherwise it is true.
-  bool isIID_;
-  // The name of python module name
-  std::string pyModuleName_;
-  // The name of python class name
-  std::string pyClassName_;
-  // User args set in config
-  std::map<std::string, std::string> pyUserArgs_;
-
-  ThreadLocalD<DataBatch> cpuBatch_;
-  ThreadLocalD<DataBatch> gpuBatch_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
deleted file mode 100644
index e3e4457f9b72c5edb8082fdf378ae662b4aee42f..0000000000000000000000000000000000000000
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ /dev/null
@@ -1,1033 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_NO_PYTHON
-
-#include <Python.h>
-#include <numpy/numpyconfig.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <list>
-#include <unordered_set>
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
-#include <numpy/ndarrayobject.h>
-
-#include "DataProvider.h"
-
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-namespace unittest {
-
-static std::unique_ptr<std::function<void(size_t /*poolActualSize */)>>
-    OnPoolFilled;
-
-namespace pydp2 {
-
-void setOnPoolFilledHook(const std::function<void(size_t)>& callback) {
-  OnPoolFilled.reset(new std::function<void(size_t)>());
-  *OnPoolFilled = callback;
-}
-
-void clearOnPoolFilledHook() { OnPoolFilled.reset(); }
-
-}  // namespace pydp2
-}  // namespace unittest
-
-/**
- * Slot type
- */
-enum SlotType {
-  ST_DENSE = 0,
-  ST_NON_SPARSE_VALUE = 1,
-  ST_SPARSE_VALUE = 2,
-  ST_INDEX = 3
-};
-
-/**
- * Sequence type
- */
-enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ };
-
-/**
- * Cache Type.
- */
-enum CacheType {
-  NO_CACHE = 0,           // Each pass will load data from PyDataProvider2.
-  CACHE_PASS_IN_MEM = 1,  // First pass will load data from PyDataProvider2,
-                          // then cache all data in memory. Load data from
-                          // memory in rest passes.
-};
-
-struct SlotHeader {  // Slot Header will parse from python object's slots field.
-  size_t dim;
-  SlotType slotType;
-  SeqType seqType;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
-  os << "Dim = " << header.dim << " Type = " << header.slotType
-     << " SeqType = " << header.seqType;
-  return os;
-}
-
-/**
- * FieldScanner Interface.
- *
- * It will read python object, and fill to argument's each slot.
- * There are two steps, prepare and fill. Scanner will alloc memory during
- * prepare step, fill data into argument during fill step.
- */
-class IFieldScanner {
-public:
-  DISABLE_COPY(IFieldScanner);
-  /**
-   * Ctor.
-   * @param headerPtr slot header that scanner belong to.
-   */
-  explicit IFieldScanner(SlotHeader* headerPtr) : headerPtr_(headerPtr) {}
-  virtual ~IFieldScanner() {}
-
-  /**
-   * Start prepare step.
-   */
-  virtual void startPrepare(Argument& argument) {}
-
-  /**
-   * Prepare step.
-   *
-   * @note the obj could be a timestep of sample or whole sample. It depends
-   * what scanner it is.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) {}
-
-  /**
-   * Finish Prepare step.
-   */
-  virtual void finishPrepare(Argument& argument) {}
-
-  /**
-   * Start fill step.
-   */
-  virtual void startFill(Argument& argument) {}
-
-  /**
-   * Fill step.
-   *
-   * @note the obj could be a timestep of sample or whole sample. It depends
-   * what scanner it is.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {}
-
-  /**
-   * Finish fill step.
-   */
-  virtual void finishFill(Argument& argument) {}
-
-  /**
-   * Factory method. Create a scanner by header. The final scanner may be
-   * combine many scanners.
-   *
-   * @note Fatal if header is not support.
-   */
-  static IFieldScanner* create(SlotHeader* header);
-
-protected:
-  SlotHeader* headerPtr_;
-};
-
-/**
- * Py Data Provider Cache Interface.
- */
-class IPyDataProviderCache {
-public:
-  virtual ~IPyDataProviderCache() {}
-
-  /**
-   * invoke when DataProvider::reset()
-   * @return true if read data from python.
-   */
-  virtual bool reset() = 0;
-
-  /**
-   * invoke when these data are used by DataProvider, and need to clear.
-   * @param [inout] data used data.
-   *
-   * @note The implemented class must clear these data array. Or if you want to
-   * delete the PyObjectPtr later, you should make sure the paddle process only
-   * have one active thread calling python code (use PyGuard otherwise).
-   */
-  virtual void drop(std::deque<PyObjectPtr>* data) = 0;
-
-  /**
-   * Return whole data in cache.
-   */
-  virtual std::deque<PyObjectPtr>* load() = 0;
-
-  /**
-   * Factory method. Convert CacheType to IPyDataProviderCache*
-   */
-  static IPyDataProviderCache* create(CacheType ct);
-};
-
-/**
- * PyDataProvider2.
- *
- * For usage, please refer python module 'paddle.trainer.PyDataProvider2'
- *
- * Here, we start a thread to read data. It is totally asynchronous for reading
- * data. And it support cache strategies.
- */
-class PyDataProvider2 : public DataProvider {
-public:
-  /**
-   * Ctor
-   */
-  PyDataProvider2(const DataConfig& config,
-                  const ModelConfig& modelConfig,
-                  bool useGpu)
-      : DataProvider(config, useGpu), callingContextCreated_(2) {
-    if (PyArray_API == NULL) import_array();
-    auto& args = config.load_data_args();
-    PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
-    if (!args.empty()) {
-      kwargs = callPythonFuncRetPyObj(
-          "paddle.trainer.PyDataProvider2", "deserialize_args", {args});
-    }
-
-    py::DictHelper kwargsDict(kwargs);
-    kwargsDict.setBool("is_train", !config.for_test());
-    std::vector<std::string> inputs;
-    inputs.reserve(modelConfig.input_layer_names().size());
-    std::copy(modelConfig.input_layer_names().begin(),
-              modelConfig.input_layer_names().end(),
-              std::back_inserter(inputs));
-    kwargsDict.setStringList("input_order", inputs);
-
-    // kwargs is keyword arguemts to create object.
-    this->createPyDataObj(config.load_data_module(),
-                          config.load_data_object(),
-                          config.files(),
-                          std::move(kwargs));
-    DBG << "Instance " << instance_.get() << " loaded.";
-    this->readPyFields(config.for_test());
-    DBG << "Py Field Done";
-  }
-
-  /**
-   * Dtor
-   * @note will stop loading thread when destructing
-   */
-  virtual ~PyDataProvider2() { resetImpl(false); }
-
-private:
-  void createPyDataObj(const std::string& model,
-                       const std::string& className,
-                       const std::string& fileListName,
-                       PyObjectPtr&& kwargs  // NOLINT
-                       ) {
-    LOG(INFO) << "loading dataprovider " << model << "::" << className;
-
-    PyObjectPtr module = py::import(model);
-    PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
-    CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
-    PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str()));
-    CHECK_PY(cls) << "load class " << className.c_str() << "error";
-
-    // If there are multiple python instance share same module, the PyObjectPtr
-    // only for instance will make python reference-count error.
-    //
-    // So here, we increase reference count manually.
-    Py_XINCREF(module.get());
-    Py_XINCREF(moduleDict.get());
-    Py_XINCREF(cls.get());
-
-    PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
-    PyDict_SetItemString(kwargs.get(), "file_list", fileListInPy.get());
-    {
-      PyGuard guard;
-      instance_.reset(PyObject_Call(cls.get(), zeroTuple_.get(), kwargs.get()));
-    }
-    CHECK_PY(instance_) << "Cannot Create instance";
-  }
-
-  void readPyFields(bool testing) {
-    py::ObjectHelper self(this->instance_);
-    bool ok;
-
-    this->skipShuffle_ =
-        !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/);
-    if (!ok) {
-      this->skipShuffle_ = testing;  // shuffle when is training, skip shuffle
-                                     // when is testing.
-    }
-    DBG << "Provider Skip Shuffle " << this->skipShuffle_;
-
-    this->poolSize_ = self.getIntAttr<size_t>("pool_size", &ok);
-    if (!ok) {
-      this->poolSize_ = -1UL;
-    }
-    this->minPoolSize_ = self.getIntAttr<size_t>("min_pool_size", &ok);
-    if (!ok) {
-      this->minPoolSize_ = -1UL;
-    }
-    this->minPoolSize_ = std::min(this->poolSize_, this->minPoolSize_);
-
-    this->canOverBatchSize_ = self.getBoolAttr("can_over_batch_size");
-
-    calcBatchSize_.reset(self.getAttr("calc_batch_size"));
-    if (this->calcBatchSize_ && !py::isCallable(this->calcBatchSize_)) {
-      this->calcBatchSize_.reset();
-    }
-
-    generator_.reset(self.getAttr("generator"));
-    CHECK(py::isCallable(generator_));
-
-    // Reading slots.
-    PyObjectPtr slotsPtr(self.getAttr("slots"));
-    py::SequenceHelper slots(slotsPtr);
-    headers_.reserve(slots.size());
-    for (size_t i = 0; i < slots.size(); ++i) {
-      headers_.emplace_back();
-      auto& header = headers_.back();
-      PyObject* hdPtr = slots[i];
-      CHECK(hdPtr != nullptr);
-      Py_XINCREF(hdPtr);
-      PyObjectPtr headerPtrWrap(hdPtr);
-      py::ObjectHelper hd(headerPtrWrap);
-      header.dim = hd.getIntAttrWithError<size_t>("dim");
-      header.seqType = (SeqType)hd.getIntAttrWithError<int>("seq_type");
-      header.slotType = (SlotType)hd.getIntAttrWithError<int>("type");
-    }
-
-    DBG << "Data header size " << headers_.size();
-    for (auto& header : headers_) {
-      DBG << header;
-    }
-    cache_.reset(IPyDataProviderCache::create(
-        (CacheType)self.getIntAttrWithError<int>("cache")));
-  }
-
-  PyObjectPtr loadPyFileLists(const std::string& fileListName) {
-    loadFileList(fileListName, fileLists_);
-    PyObject* lst = PyList_New(fileLists_.size());
-    for (size_t i = 0; i < fileLists_.size(); ++i) {
-      PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str()));
-    }
-    return PyObjectPtr(lst);
-  }
-
-  void loadThread() {
-    DBG << "Creating context";
-    for (auto& filename : fileLists_) {
-      PyGuard g;
-      py::CallableHelper generator(this->generator_);
-      generator.setArgsSize(2);
-      generator.getArgs().set(0, instance_);
-      generator.getArgs().set(1, PyString_FromString(filename.c_str()), true);
-      callingContexts_.emplace_back(generator());
-      CHECK_PY(callingContexts_.back()) << "Generator error.";
-      CHECK(PyIter_Check(callingContexts_.back()));
-    }
-    DBG << "Create context done";
-    callingContextCreated_.wait();
-
-    PositionRandom p(skipShuffle_);
-
-    while (!exit_ && !callingContexts_.empty()) {
-      PyObject* data = nullptr;
-
-      {  // Read data.
-        size_t cid = p(callingContexts_.size());
-        bool atEnd;
-        data = py::iterNext(callingContexts_[cid], &atEnd);
-        if (atEnd || data == nullptr) {
-          if (cid != 0) {
-            std::swap(callingContexts_[cid], callingContexts_[0]);
-            cid = 0;
-          }
-
-          PyObjectPtr front;
-          {
-            std::unique_lock<std::mutex> l(mtx_);
-            front = pop_get_front(callingContexts_);
-          }
-          {
-            PyGuard g;
-            front.reset();
-          }
-          this->pullCV_.notify_all();
-          continue;
-        }
-      }
-
-      size_t additionalBatchSize = 1;
-      if (calcBatchSize_) {
-        PyGuard guard;
-        py::CallableHelper calcBatchSize(this->calcBatchSize_);
-        calcBatchSize.setArgsSize(1);
-        calcBatchSize.getArgs().set(0, data);
-        PyObjectPtr bs(calcBatchSize());
-        CHECK_PY(bs);
-        bool ok;
-        additionalBatchSize = py::castInt<size_t>(bs.get(), &ok);
-        CHECK(ok) << "CalcBatchSize must return int or long";
-      }
-
-      if (this->loadThread_) {  // wait poolActualSize < poolSize;
-        std::unique_lock<std::mutex> l(mtx_);
-        pushCV_.wait(l, [this, additionalBatchSize] {
-          return this->poolActualSize_ < poolSize_;
-        });
-      }
-
-      {
-        std::lock_guard<std::mutex> guard(mtx_);
-        poolActualSize_ += additionalBatchSize;
-        dataPool_.emplace_back(data);
-      }
-      pullCV_.notify_all();
-    }
-    DBG << "load thread end";
-  }
-
-  inline void resetImpl(bool startNewThread) {
-    DBG << "Reseting " << startNewThread;
-    exit_.store(true);
-    if (loadThread_) {  // is loading.
-      loadThread_->join();
-      loadThread_.reset();
-    }
-    {
-      PyGuard g;
-      callingContexts_.clear();
-      this->pullCV_.notify_one();
-    }
-
-    std::lock_guard<std::mutex> guard(mutexForReset_);
-    {
-      PyGuard g;
-      dataPool_.clear();
-    }
-    poolActualSize_ = 0;
-
-    if (startNewThread && cache_->reset()) {
-      DBG << "Start new thread.";
-      loadThread_.reset(new std::thread([this] {
-        exit_ = false;
-        loadThread();
-      }));
-      callingContextCreated_.wait();
-    }
-    DBG << "Reset done";
-    exit_ = false;
-  }
-
-private:
-  std::unique_ptr<std::thread> loadThread_;
-  std::atomic<bool> exit_;
-  std::deque<PyObjectPtr> callingContexts_;
-  std::deque<PyObjectPtr> dataPool_;
-  size_t poolActualSize_;
-  std::condition_variable pushCV_;
-  std::condition_variable pullCV_;
-  std::mutex mtx_;
-
-  std::mutex mutexForReset_;
-
-  ThreadBarrier callingContextCreated_;
-  std::unique_ptr<IPyDataProviderCache> cache_;
-
-  PyObjectPtr instance_;
-  size_t poolSize_;
-  size_t minPoolSize_;
-  bool canOverBatchSize_;
-  PyObjectPtr calcBatchSize_;
-  PyObjectPtr generator_;
-  std::vector<std::string> fileLists_;
-  std::vector<SlotHeader> headers_;
-  static PyObjectPtr zeroTuple_;
-
-  class PositionRandom {
-  public:
-    inline explicit PositionRandom(bool skipRand)
-        : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
-
-    inline size_t operator()(size_t len) {
-      if (!skipRand_) {
-        if (!dist_ || dist_->b() != len - 1) {
-          dist_.reset(new std::uniform_int_distribution<size_t>(0, len - 1));
-        }
-        return (*dist_)(eng_);
-      } else {
-        return 0;
-      }
-    }
-
-  private:
-    std::default_random_engine& eng_;
-    std::unique_ptr<std::uniform_int_distribution<size_t>> dist_;
-    bool skipRand_;
-  };
-
-  // DataProvider interface
-public:
-  /**
-   * Resetting the PyDataProvider. May start reading thread here.
-   */
-  virtual void reset() {
-    resetImpl(true);
-    DataProvider::reset();
-  }
-
-  /**
-   * Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random
-   * select data from datapool.
-   */
-  void shuffle() {}
-
-  /**
-   * Not limited size.
-   */
-  int64_t getSize() { return -1; }
-
-  /**
-   * Loading a batch of data.
-   */
-  int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) {
-    std::lock_guard<std::mutex> guard(mutexForReset_);
-    REGISTER_TIMER("PyDP2.getNextBatchInternal")
-    CHECK_GE(size_, 0);
-    size_t size = (size_t)size_;
-    if (loadThread_) {  // loading from thread should wait for data pool ready.
-                        // but, loading from cache, cache object should ensure
-                        // data pool ready.
-      std::unique_lock<std::mutex> l(mtx_);
-      pullCV_.wait(l, [this, &size] {
-        return this->poolActualSize_ >= std::max(size, this->minPoolSize_) ||
-               callingContexts_.empty();
-      });
-
-      if (unittest::OnPoolFilled) {
-        (*unittest::OnPoolFilled)(this->poolActualSize_);
-      }
-    }
-    std::deque<PyObjectPtr> data;
-    size_t bsize = 0;
-    std::deque<PyObjectPtr>* poolPtr = nullptr;
-
-    if (this->loadThread_) {  // loading from thread.
-      poolPtr = &this->dataPool_;
-    } else {  // loading from cache.
-      poolPtr = this->cache_->load();
-    }
-    if (exit_) {
-      // PyDataProvider is destructing.
-      return 0;
-    }
-    CHECK(poolPtr != nullptr);
-
-    std::deque<PyObjectPtr>& pool = *poolPtr;
-
-    while (bsize < size && !pool.empty()) {
-      {
-        // move data from pool to data
-        std::lock_guard<std::mutex> guard(mtx_);
-        if (skipShuffle_) {
-          size_t i = 0;
-          CHECK(pool[i] != nullptr);
-          data.emplace_back(std::move(pool[i]));
-          pool.pop_front();
-        } else {  // when shuffle, use swap to drop only last pool element.
-          size_t i = ThreadLocalRand::rand() % pool.size();
-          CHECK(pool[i] != nullptr);
-          if (i != 0) {
-            std::swap(pool[i], pool.front());
-          }
-          data.emplace_back(std::move(pool.front()));
-          pool.pop_front();
-        }
-
-        if (calcBatchSize_) {  // custom calc batch size.
-          PyGuard guard;
-          Py_INCREF(data.back().get());
-          py::CallableHelper calcBatchSize(calcBatchSize_);
-          calcBatchSize.setArgsSize(1);
-          calcBatchSize.getArgs().set(0, data.back());
-          PyObjectPtr customBatchSize(calcBatchSize());
-          bool ok;
-          size_t tmp = py::castInt<size_t>(customBatchSize.get(), &ok);
-          CHECK(ok) << "calc_batch_size must return int";
-
-          if (bsize + tmp > size && !canOverBatchSize_) {
-            // Put data back.
-            pool.push_front(std::move(data.back()));
-            data.pop_back();
-            break;
-          } else {
-            bsize += tmp;
-          }
-        } else {
-          bsize += 1;
-        }
-      }
-    }
-
-    if (this->loadThread_) {
-      {
-        std::lock_guard<std::mutex> g(mtx_);
-        poolActualSize_ -= bsize;
-      }
-      this->pushCV_.notify_all();
-    }
-
-    if (bsize == 0) {  // end of pass. In data pool, cannot get any data.
-      return 0;
-    }
-
-    DataBatch cpuBatch;
-    cpuBatch.setSize(bsize);
-    auto& inArgs = cpuBatch.getStreams();
-    inArgs.resize(headers_.size());
-    std::vector<std::unique_ptr<IFieldScanner>> scanners;
-    scanners.reserve(headers_.size());
-    for (auto& header : headers_) {
-      scanners.emplace_back(IFieldScanner::create(&header));
-    }
-    DBG << "Scanner created.";
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->startPrepare(inArgs[i]);
-    }
-    for (auto& d : data) {
-      py::SequenceHelper s(d);
-      for (size_t i = 0; i < headers_.size(); ++i) {
-        scanners[i]->prepare(inArgs[i], s[i]);
-      }
-    }
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->finishPrepare(inArgs[i]);
-    }
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->startFill(inArgs[i]);
-    }
-    for (auto& d : data) {
-      py::SequenceHelper s(d);
-      for (size_t i = 0; i < headers_.size(); ++i) {
-        scanners[i]->fill(inArgs[i], s[i]);
-      }
-    }
-
-    for (size_t i = 0; i < headers_.size(); ++i) {
-      scanners[i]->finishFill(inArgs[i]);
-    }
-
-    {
-      PyGuard g;
-      cache_->drop(&data);
-    }
-
-    DBG << "Reading CPU Batch Done.";
-
-    if (useGpu_) {
-      std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-      DataBatch& gpuBatch = *batch;
-      std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-      gpuArguments.resize(cpuArguments.size());
-      gpuBatch.setSize(bsize);
-      for (size_t i = 0; i < headers_.size(); ++i) {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-      hl_stream_synchronize(HPPL_STREAM_1);
-    } else {
-      *batch = cpuBatch;
-    }
-    return bsize;
-  }
-};
-
-PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
-
-REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
-
-/**
- * Scanner for dense slot.
- */
-class DenseScanner : public IFieldScanner {
-public:
-  explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
-
-  /**
-   * Prepare.
-   * @param argument target argument
-   * @param obj each timestep of a sample.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) { ++height_; }
-
-  virtual void finishPrepare(Argument& argument) {
-    Matrix::resizeOrCreate(
-        argument.value, height_, headerPtr_->dim, false, false);
-    height_ = 0;
-  }
-
-  /**
-   * Fill argument from obj.
-   * @param argument
-   * @param obj
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    real* dat = argument.value->getData() + height_ * headerPtr_->dim;
-    if (PyArray_Check(obj)) {
-      auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
-      if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
-        real* data = (real*)PyArray_DATA((PyArrayObject*)obj);
-        auto sz = PyArray_SIZE((PyArrayObject*)obj);
-        std::copy(data, data + sz, dat);
-      } else {
-        LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array";
-      }
-    } else {
-      py::SequenceHelper s(obj);
-      // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
-      for (size_t i = 0; i < headerPtr_->dim; ++i) {
-        dat[i] = (real)s.getDouble(i);
-      }
-    }
-    ++height_;
-  }
-
-private:
-  size_t height_;
-};
-
-/**
- * Scanner for index slot
- */
-class IndexScanner : public IFieldScanner {
-public:
-  explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
-
-  /**
-   * Prepare memory space.
-   *
-   * @note obj is a single timestep of sample
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; }
-
-  virtual void finishPrepare(Argument& argument) {
-    IVector::resizeOrCreate(argument.ids, cnt_, false);
-    cnt_ = 0;
-  }
-
-  /**
-   * Fill one index to argument.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    bool ok;
-    argument.ids->getData()[cnt_++] = py::castInt<int>(obj, &ok);
-    CHECK(ok) << "Cannot cast int " << py::repr(obj);
-  }
-
-private:
-  size_t cnt_;
-};
-
-class SparseNonValueScanner : public IFieldScanner {
-public:
-  explicit SparseNonValueScanner(SlotHeader* ptr)
-      : IFieldScanner(ptr), nnz_(0), height_(0) {}
-
-  /**
-   * Prepare memory space
-   * @note obj is a timestep of one sample.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) {
-    ++height_;
-    nnz_ += py::SequenceHelper(obj).size();
-  }
-
-  virtual void finishPrepare(Argument& argument) {
-    Matrix::resizeOrCreateSparseMatrix(
-        argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE);
-  }
-
-  virtual void startFill(Argument& argument) {
-    auto smat = (CpuSparseMatrix*)(argument.value.get());
-    smat->getRows()[0] = 0;
-    nnz_ = 0;
-    height_ = 1;
-  }
-
-  /**
-   * Fill one sparse vector to argument.
-   * @note obj is a timestep of one sample.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    py::SequenceHelper s(obj);
-    auto sz = s.size();
-    auto smat = (CpuSparseMatrix*)(argument.value.get());
-    int* row = smat->getRows();
-    int* col = smat->getCols();
-    real* dat = smat->getData();
-    row[height_] = row[height_ - 1] + (int)sz;
-
-    for (decltype(sz) i = 0; i < sz; ++i) {
-      setData(col + nnz_, dat + nnz_, s[i]);
-      ++nnz_;
-    }
-    ++height_;
-  }
-
-protected:
-  /**
-   * Set a single sparse index and value.
-   * @param [out] col sparse index
-   * @param [out] dat sparse value
-   * @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong.
-   *                 For sparse_value is a Tuple (int, float).
-   */
-  virtual void setData(int* col, real* dat, PyObject* obj) {
-    bool ok;
-    *col = py::castInt<int>(obj, &ok);
-    CHECK(ok);
-  }
-
-  size_t nnz_;
-  size_t height_;
-};
-
-class SparseValueScanner : public SparseNonValueScanner {
-public:
-  explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
-
-  virtual void finishPrepare(Argument& argument) {
-    Matrix::resizeOrCreateSparseMatrix(
-        argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
-  }
-
-protected:
-  virtual void setData(int* col, real* dat, PyObject* obj) {
-    py::SequenceHelper s(obj);
-    SparseNonValueScanner::setData(col, dat, s[0]);
-    *dat = (real)s.getDouble(1);
-  }
-};
-
-/**
- * Sequence Scanner. Scanner for sequence or sub-sequence.
- */
-class SequenceScanner : public IFieldScanner {
-public:
-  /**
-   * Ctor
-   * @param innerScanner inner scanner for each timestep or sub-sequence.
-   * @param getSeqStartPos A callback, (Argument) => ICpuGpuVectorPtr.
-   *                       return a sequence start position or a sub-sequence
-   *                       start position.
-   */
-  SequenceScanner(
-      std::unique_ptr<IFieldScanner>&& innerScanner,
-      const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
-      : IFieldScanner(nullptr),
-        inner_(std::move(innerScanner)),
-        cnt_(0),
-        getSeqStartPos_(getSeqStartPos) {}
-
-  /**
-   * Start prepare. Invoke inner->startPrepare too.
-   */
-  virtual void startPrepare(Argument& argument) {
-    inner_->startPrepare(argument);
-  }
-
-  /**
-   * Prepare. obj is a list or tuple. it will invoke inner_->prepare for each
-   * element of sequence obj.
-   */
-  virtual void prepare(Argument& argument, PyObject* obj) {
-    py::SequenceHelper s(obj);
-    ++cnt_;
-    for (size_t i = 0; i < s.size(); ++i) {
-      inner_->prepare(argument, s[i]);
-    }
-  }
-
-  /**
-   * Finish prepare. invoke inner_->finishPrepare too.
-   */
-  virtual void finishPrepare(Argument& argument) {
-    ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false);
-    inner_->finishPrepare(argument);
-  }
-
-  /**
-   * Start fill. invoke inner->startFill too.
-   */
-  virtual void startFill(Argument& argument) {
-    getSeqStartPos_(argument)->getMutableData(false)[0] = 0;
-    cnt_ = 1;
-    inner_->startFill(argument);
-  }
-
-  /**
-   * Fill. Obj is a tuple or list. invoke inner->fill for each element of
-   * sequence obj. And set seqStartPos at same time. The seqStartPos will be
-   * calculated by getSeqStartPos callback passed in ctor.
-   */
-  virtual void fill(Argument& argument, PyObject* obj) {
-    getSeqStartPos_(argument)->getMutableData(false)[cnt_] =
-        getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
-        (int)getSize(obj);
-    py::SequenceHelper s(obj);
-    ++cnt_;
-    for (size_t i = 0; i < s.size(); ++i) {
-      inner_->fill(argument, s[i]);
-    }
-  }
-
-  /**
-   * Finish fill. will invoke inner->finishFill too.
-   */
-  virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
-
-protected:
-  size_t getSize(PyObject* obj) {
-    py::SequenceHelper s(obj);
-    auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
-    if (sc) {
-      size_t sum = 0;
-      for (size_t i = 0; i < s.size(); ++i) {
-        sum += sc->getSize(s[i]);
-      }
-      return sum;
-    } else {
-      return s.size();
-    }
-  }
-
-private:
-  std::unique_ptr<IFieldScanner> inner_;
-  size_t cnt_;
-  std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
-};
-
-IFieldScanner* IFieldScanner::create(SlotHeader* header) {
-  IFieldScanner* retv = nullptr;
-  switch (header->slotType) {
-    case ST_DENSE:
-      retv = new DenseScanner(header);
-      break;
-    case ST_INDEX:
-      retv = new IndexScanner(header);
-      break;
-    case ST_NON_SPARSE_VALUE:
-      retv = new SparseNonValueScanner(header);
-      break;
-    case ST_SPARSE_VALUE:
-      retv = new SparseValueScanner(header);
-      break;
-    default:
-      LOG(FATAL) << "Not implemented " << header->slotType;
-  }
-
-  switch (header->seqType) {
-    case SQT_NONE:
-      break;
-    case SQT_SUBSEQ:
-      retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
-                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
-                                   return arg.subSequenceStartPositions;
-                                 });
-    // fall through, not break;
-    case SQT_SEQ:
-      retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
-                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
-                                   return arg.sequenceStartPositions;
-                                 });
-      break;
-    default:
-      LOG(FATAL) << "Not implemented";
-  }
-
-  return retv;
-}
-
-/**
- * No Cache Strategy. Will destruct old data immediately and load data from
- * python every pass.
- */
-class NoCacheStrategy : public IPyDataProviderCache {
-public:
-  virtual bool reset() { return true; }
-
-  virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
-
-  virtual std::deque<PyObjectPtr>* load() { return nullptr; }
-};
-
-/**
- * Cache One Pass In Memory strategy.
- *
- * In first pass, will load data from python and store them in memory.
- * The rest passes, will load data from memory.
- */
-class CacheOnePassInMemory : public IPyDataProviderCache {
-public:
-  CacheOnePassInMemory()
-      : objPool_(new std::deque<PyObjectPtr>()),
-        droppedPool_(new std::deque<PyObjectPtr>()) {}
-
-  virtual bool reset() {
-    if (objPool_->empty() && droppedPool_->empty()) {
-      return true;
-    } else if (objPool_->empty()) {
-      std::swap(objPool_, droppedPool_);
-      return false;
-    } else {
-      LOG(FATAL) << "Unexpected branch";
-    }
-  }
-
-  virtual void drop(std::deque<PyObjectPtr>* data) {
-    size_t orgSize = droppedPool_->size();
-    droppedPool_->resize(orgSize + data->size());
-    for (size_t i = 0; i < data->size(); ++i) {
-      std::swap((*droppedPool_)[orgSize + i], (*data)[i]);
-    }
-    data->clear();
-  }
-
-  virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
-
-private:
-  std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
-  std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
-};
-
-IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) {
-  switch (ct) {
-    case NO_CACHE:
-      return new NoCacheStrategy();
-    case CACHE_PASS_IN_MEM:
-      return new CacheOnePassInMemory();
-    default:
-      LOG(FATAL) << "Not implemented";
-  }
-}
-}  // namespace paddle
-
-#endif
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
deleted file mode 100644
index 0f680de776f4755ca5fe83c86ea759d88f93ed01..0000000000000000000000000000000000000000
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Evaluator.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/utils/StringUtil.h"
-
-namespace paddle {
-
-/**
- * calculate sequence-to-sequence edit distance
- */
-class CTCErrorEvaluator : public Evaluator {
-private:
-  MatrixPtr outActivations_;
-  int numTimes_, numClasses_, numSequences_, blank_;
-  real deletions_, insertions_, substitutions_;
-  int seqClassficationError_;
-  mutable std::unordered_map<std::string, real> evalResults_;
-
-  std::vector<int> path2String(const std::vector<int>& path) {
-    std::vector<int> str;
-    str.clear();
-    int prevLabel = -1;
-    for (std::vector<int>::const_iterator label = path.begin();
-         label != path.end();
-         label++) {
-      if (*label != blank_ &&
-          (str.empty() || *label != str.back() || prevLabel == blank_)) {
-        str.push_back(*label);
-      }
-      prevLabel = *label;
-    }
-    return str;
-  }
-
-  std::vector<int> bestLabelSeq() {
-    std::vector<int> path;
-    path.clear();
-    real* acts = outActivations_->getData();
-    for (int i = 0; i < numTimes_; ++i) {
-      path.push_back(std::max_element(acts + i * numClasses_,
-                                      acts + (i + 1) * numClasses_) -
-                     (acts + i * numClasses_));
-    }
-    return path2String(path);
-  }
-
-  /* "sp, dp, ip" is the weighting parameter of "substitution, deletion,
-   * insertion"
-   * in edit-distance error */
-  real stringAlignment(std::vector<int>& gtStr,
-                       std::vector<int>& recogStr,
-                       bool backtrace = true,
-                       real sp = 1.0,
-                       real dp = 1.0,
-                       real ip = 1.0) {
-    std::vector<std::vector<int>> matrix;
-    int substitutions, deletions, insertions;
-    real distance;
-    int n = gtStr.size();
-    int m = recogStr.size();
-
-    if (n == 0) {
-      substitutions = 0;
-      deletions = 0;
-      insertions = m;
-      distance = m;
-    } else if (m == 0) {
-      substitutions = 0;
-      deletions = n;
-      insertions = 0;
-      distance = n;
-    } else {
-      substitutions = 0;
-      deletions = 0;
-      insertions = 0;
-      distance = 0;
-      // initialize the matrix
-      matrix.resize(n + 1);
-      for (int i = 0; i < n + 1; ++i) {
-        matrix[i].resize(m + 1);
-        for (int j = 0; j < m + 1; ++j) {
-          matrix[i][j] = 0;
-        }
-      }
-      for (int i = 0; i < n + 1; ++i) {
-        matrix[i][0] = i;
-      }
-      for (int j = 0; j < m + 1; ++j) {
-        matrix[0][j] = j;
-      }
-
-      // calculate the insertions, substitutions and deletions
-      for (int i = 1; i < n + 1; ++i) {
-        int s_i = gtStr[i - 1];
-        for (int j = 1; j < m + 1; ++j) {
-          int t_j = recogStr[j - 1];
-          int cost = (s_i == t_j) ? 0 : 1;
-          const int above = matrix[i - 1][j];
-          const int left = matrix[i][j - 1];
-          const int diag = matrix[i - 1][j - 1];
-          const int cell = std::min(above + 1, std::min(left + 1, diag + cost));
-          matrix[i][j] = cell;
-        }
-      }
-
-      if (backtrace) {
-        size_t i = n;
-        size_t j = m;
-        substitutions = 0;
-        deletions = 0;
-        insertions = 0;
-
-        while (i != 0 && j != 0) {
-          if (matrix[i][j] == matrix[i - 1][j - 1]) {
-            --i;
-            --j;
-          } else if (matrix[i][j] == matrix[i - 1][j - 1] + 1) {
-            ++substitutions;
-            --i;
-            --j;
-          } else if (matrix[i][j] == matrix[i - 1][j] + 1) {
-            ++deletions;
-            --i;
-          } else {
-            ++insertions;
-            --j;
-          }
-        }
-        while (i != 0) {
-          ++deletions;
-          --i;
-        }
-        while (j != 0) {
-          ++insertions;
-          --j;
-        }
-        int diff = substitutions + deletions + insertions;
-        if (diff != matrix[n][m]) {
-          LOG(ERROR) << "Found path with distance " << diff
-                     << " but Levenshtein distance is " << matrix[n][m];
-        }
-
-        distance = (sp * substitutions) + (dp * deletions) + (ip * insertions);
-      } else {
-        distance = (real)matrix[n][m];
-      }
-    }
-    real maxLen = std::max(m, n);
-    deletions_ += deletions / maxLen;
-    insertions_ += insertions / maxLen;
-    substitutions_ += substitutions / maxLen;
-
-    if (distance != 0) {
-      seqClassficationError_ += 1;
-    }
-
-    return distance / maxLen;
-  }
-
-  real editDistance(
-      real* output, int numTimes, int numClasses, int* labels, int labelsLen) {
-    numTimes_ = numTimes;
-    numClasses_ = numClasses;
-    blank_ = numClasses_ - 1;
-    outActivations_ = Matrix::create(output, numTimes, numClasses);
-    std::vector<int> recogStr, gtStr;
-    recogStr = bestLabelSeq();
-    for (int i = 0; i < labelsLen; ++i) {
-      gtStr.push_back(labels[i]);
-    }
-
-    return stringAlignment(gtStr, recogStr);
-  }
-
-  void storeLocalValues() const {
-    evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0;
-    evalResults_["deletion_error"] =
-        numSequences_ ? deletions_ / numSequences_ : 0;
-    evalResults_["insertion_error"] =
-        numSequences_ ? insertions_ / numSequences_ : 0;
-    evalResults_["substitution_error"] =
-        numSequences_ ? substitutions_ / numSequences_ : 0;
-    evalResults_["sequence_error"] =
-        (real)seqClassficationError_ / numSequences_;
-  }
-
-public:
-  CTCErrorEvaluator()
-      : numTimes_(0),
-        numClasses_(0),
-        numSequences_(0),
-        blank_(0),
-        deletions_(0),
-        insertions_(0),
-        substitutions_(0),
-        seqClassficationError_(0) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_EQ(arguments.size(), (size_t)2);
-    Argument output, label;
-    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
-    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-    CHECK(label.sequenceStartPositions);
-    CHECK(label.ids);
-    size_t numSequences = label.sequenceStartPositions->getSize() - 1;
-    const int* labelStarts = label.sequenceStartPositions->getData(false);
-    const int* outputStarts = output.sequenceStartPositions->getData(false);
-    real totalErr = 0;
-    for (size_t i = 0; i < numSequences; ++i) {
-      real err = 0;
-      err = editDistance(
-          output.value->getData() + output.value->getWidth() * outputStarts[i],
-          outputStarts[i + 1] - outputStarts[i],
-          output.value->getWidth(),
-          label.ids->getData() + labelStarts[i],
-          labelStarts[i + 1] - labelStarts[i]);
-
-      totalErr += err;
-    }
-
-    return totalErr;
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    Evaluator::eval(nn);
-    std::vector<Argument> arguments;
-    arguments.reserve(config_.input_layers_size());
-    for (const std::string& name : config_.input_layers()) {
-      arguments.push_back(nn.getLayer(name)->getOutput());
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSequences_ += arguments[1].getNumSequences();
-  }
-
-  virtual void start() {
-    Evaluator::start();
-    numSequences_ = 0;
-    blank_ = 0;
-    deletions_ = 0;
-    insertions_ = 0;
-    substitutions_ = 0;
-    seqClassficationError_ = 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    storeLocalValues();
-    os << config_.name() << " error = " << evalResults_["error"];
-    os << " deletions error = " << evalResults_["deletion_error"];
-    os << " insertions error = " << evalResults_["insertion_error"];
-    os << " substitution error = " << evalResults_["substitution_error"];
-    os << " sequence error = " << evalResults_["sequence_error"];
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    double buf[6] = {totalScore_,
-                     (double)deletions_,
-                     (double)insertions_,
-                     (double)substitutions_,
-                     (double)seqClassficationError_,
-                     (double)numSequences_};
-    client->reduce(buf, buf, 6, FLAGS_trainer_id, 0);
-    totalScore_ = buf[0];
-    deletions_ = (real)buf[1];
-    insertions_ = (real)buf[2];
-    substitutions_ = (real)buf[3];
-    seqClassficationError_ = (int)buf[4];
-    numSequences_ = (int)buf[5];
-  }
-
-  void getNames(std::vector<std::string>* names) {
-    storeLocalValues();
-    names->reserve(names->size() + evalResults_.size());
-    for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) {
-      names->push_back(config_.name() + "." + it->first);
-    }
-  }
-
-  real getValue(const std::string& name, Error* err) const {
-    storeLocalValues();
-
-    std::vector<std::string> buffers;
-    paddle::str::split(name, '.', &buffers);
-    auto it = evalResults_.find(buffers[buffers.size() - 1]);
-
-    if (it == evalResults_.end()) {
-      *err = Error("Evaluator does not have the key %s", name.c_str());
-      return 0.0f;
-    }
-
-    return it->second;
-  }
-
-  std::string getType(const std::string& name, Error* err) const {
-    this->getValue(name, err);
-    if (!err->isOK()) {
-      return "";
-    }
-    return "ctc_edit_distance";
-  }
-};
-
-REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator);
-
-}  // namespace paddle
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
deleted file mode 100644
index 755b91d05caf33745e66415e7b111ba348c575d9..0000000000000000000000000000000000000000
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ /dev/null
@@ -1,296 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <set>
-#include <vector>
-
-#include "paddle/math/Vector.h"
-#include "paddle/utils/StringUtil.h"
-
-#include "Evaluator.h"
-
-namespace paddle {
-
-/**
- * Chunk evaluator is used to evaluate segment labelling accuracy for a
- * sequence. It calculates the chunk detection F1 score.
- *
- * A chunk is correctly detected if its beginning, end and type are correct.
- * Other chunk type is ignored.
- * For each label in the label sequence, we have
- *
- * @code
- * tagType = label % numTagType
- * chunkType = label / numTagType
- * otherChunkType = numChunkTypes
- * @endcode
- *
- * The total number of different labels is numTagType*numChunkTypes+1
- * We support 4 labelling scheme
- * The tag type for each of the scheme is shown as follows:
- *
- * @code
- *  Scheme Begin Inside End   Single
- *   plain  0     -      -     -
- *   IOB    0     1      -     -
- *   IOE    -     0      1     -
- *   IOBES  0     1      2     3
- * @endcode
- *
- * 'plain' means the whole chunk must contain exactly the same chunk label.
- */
-class ChunkEvaluator : public Evaluator {
-  int otherChunkType_;
-  int numChunkTypes_;  // number of chunk types besides other chunk type
-  int numTagTypes_;
-  int tagBegin_;
-  int tagInside_;
-  int tagEnd_;
-  int tagSingle_;
-
-  int64_t numLabelSegments_;
-  int64_t numOutputSegments_;
-  int64_t numCorrect_;
-
-  struct Segment {
-    int begin;
-    int end;
-    int type;
-    bool operator==(const Segment& y) const {
-      return begin == y.begin && end == y.end && type == y.type;
-    }
-  };
-
-  std::vector<Segment> labelSegments_;
-  std::vector<Segment> outputSegments_;
-  std::set<int> excludedChunkTypes_;
-  mutable std::unordered_map<std::string, real> values_;
-
-public:
-  virtual void init(const EvaluatorConfig& config) {
-    Evaluator::init(config);
-    if (config.chunk_scheme() == "IOB") {
-      numTagTypes_ = 2;
-      tagBegin_ = 0;
-      tagInside_ = 1;
-      tagEnd_ = -1;
-      tagSingle_ = -1;
-    } else if (config.chunk_scheme() == "IOE") {
-      numTagTypes_ = 2;
-      tagBegin_ = -1;
-      tagInside_ = 0;
-      tagEnd_ = 1;
-      tagSingle_ = -1;
-    } else if (config.chunk_scheme() == "IOBES") {
-      numTagTypes_ = 4;
-      tagBegin_ = 0;
-      tagInside_ = 1;
-      tagEnd_ = 2;
-      tagSingle_ = 3;
-    } else if (config.chunk_scheme() == "plain") {
-      numTagTypes_ = 1;
-      tagBegin_ = -1;
-      tagInside_ = -1;
-      tagEnd_ = -1;
-      tagSingle_ = -1;
-    } else {
-      LOG(FATAL) << "Unknown chunk scheme: " << config.chunk_scheme();
-    }
-    CHECK(config.has_num_chunk_types()) << "Missing num_chunk_types in config";
-    otherChunkType_ = numChunkTypes_ = config.num_chunk_types();
-
-    // the chunks of types in excludedChunkTypes_ will not be counted
-    auto& tmp = config.excluded_chunk_types();
-    excludedChunkTypes_.insert(tmp.begin(), tmp.end());
-  }
-
-  virtual void start() {
-    Evaluator::start();
-    numLabelSegments_ = 0;
-    numOutputSegments_ = 0;
-    numCorrect_ = 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    storeLocalValues();
-    os << config_.name() << "=" << values_["F1-score"]
-       << " true_chunks=" << numLabelSegments_
-       << " result_chunks=" << numOutputSegments_
-       << " correct_chunks=" << numCorrect_;
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    int64_t buf[3] = {numLabelSegments_, numOutputSegments_, numCorrect_};
-    client->reduce(buf, buf, 3, FLAGS_trainer_id, 0);
-    numLabelSegments_ = buf[0];
-    numOutputSegments_ = buf[1];
-    numCorrect_ = buf[2];
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_EQ(arguments.size(), (size_t)2);
-    IVectorPtr& output = arguments[0].ids;
-    IVectorPtr& label = arguments[1].ids;
-    CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
-    auto sequenceStartPositions =
-        arguments[1].sequenceStartPositions->getVector(false);
-    CHECK_EQ(output->getSize(), label->getSize());
-    CHECK(sequenceStartPositions);
-    size_t numSequences = sequenceStartPositions->getSize() - 1;
-    const int* starts = sequenceStartPositions->getData();
-    for (size_t i = 0; i < numSequences; ++i) {
-      eval1(output->getData() + starts[i],
-            label->getData() + starts[i],
-            starts[i + 1] - starts[i]);
-    }
-    return 0;
-  }
-
-  void eval1(int* output, int* label, int length) {
-    getSegments(output, length, outputSegments_);
-    getSegments(label, length, labelSegments_);
-    size_t i = 0, j = 0;
-    while (i < outputSegments_.size() && j < labelSegments_.size()) {
-      if (outputSegments_[i] == labelSegments_[j] &&
-          excludedChunkTypes_.count(outputSegments_[i].type) != 1) {
-        ++numCorrect_;
-      }
-      if (outputSegments_[i].end < labelSegments_[j].end) {
-        ++i;
-      } else if (outputSegments_[i].end > labelSegments_[j].end) {
-        ++j;
-      } else {
-        ++i;
-        ++j;
-      }
-    }
-    for (auto& segment : labelSegments_) {
-      if (excludedChunkTypes_.count(segment.type) != 1) ++numLabelSegments_;
-    }
-    for (auto& segment : outputSegments_) {
-      if (excludedChunkTypes_.count(segment.type) != 1) ++numOutputSegments_;
-    }
-  }
-
-  void getSegments(int* label, int length, std::vector<Segment>& segments) {
-    segments.clear();
-    segments.reserve(length);
-    int chunkStart = 0;
-    bool inChunk = false;
-    int tag = -1;
-    int type = otherChunkType_;
-    for (int i = 0; i < length; ++i) {
-      int prevTag = tag;
-      int prevType = type;
-      CHECK_LE(label[i], numChunkTypes_ * numTagTypes_);
-      tag = label[i] % numTagTypes_;
-      type = label[i] / numTagTypes_;
-      if (inChunk && isChunkEnd(prevTag, prevType, tag, type)) {
-        Segment segment{
-            chunkStart,  // begin
-            i - 1,       // end
-            prevType,
-        };
-        segments.push_back(segment);
-        inChunk = false;
-      }
-      if (isChunkBegin(prevTag, prevType, tag, type)) {
-        chunkStart = i;
-        inChunk = true;
-      }
-    }
-    if (inChunk) {
-      Segment segment{
-          chunkStart,  // begin
-          length - 1,  // end
-          type,
-      };
-      segments.push_back(segment);
-    }
-  }
-
-  // whether (prevTag, prevType) is the end of a chunk
-  bool isChunkEnd(int prevTag, int prevType, int tag, int type) {
-    if (prevType == otherChunkType_) return false;
-    if (type == otherChunkType_) return true;
-    if (type != prevType) return true;
-    if (prevTag == tagBegin_) return tag == tagBegin_ || tag == tagSingle_;
-    if (prevTag == tagInside_) return tag == tagBegin_ || tag == tagSingle_;
-    if (prevTag == tagEnd_) return true;
-    if (prevTag == tagSingle_) return true;
-    return false;
-  }
-
-  // whether (tag, type) is the beginning of a chunk
-  bool isChunkBegin(int prevTag, int prevType, int tag, int type) {
-    if (prevType == otherChunkType_) return type != otherChunkType_;
-    if (type == otherChunkType_) return false;
-    if (type != prevType) return true;
-    if (tag == tagBegin_) return true;
-    if (tag == tagInside_) return prevTag == tagEnd_ || prevTag == tagSingle_;
-    if (tag == tagEnd_) return prevTag == tagEnd_ || prevTag == tagSingle_;
-    if (tag == tagSingle_) return true;
-    return false;
-  }
-
-  // three metrics: precision, recall and F1-score
-  void getNames(std::vector<std::string>* names) {
-    storeLocalValues();
-    names->reserve(names->size() + values_.size());
-    for (auto it = values_.begin(); it != values_.end(); ++it) {
-      names->push_back(config_.name() + "." + it->first);
-    }
-  }
-
-  // get value by field name
-  real getValue(const std::string& name, Error* err) const {
-    storeLocalValues();
-    std::vector<std::string> buffers;
-    paddle::str::split(name, '.', &buffers);
-    auto it = values_.find(buffers.back());
-    if (it == values_.end()) {  // not found
-      *err = Error("No such key %s", name.c_str());
-      return 0.0f;
-    }
-
-    return it->second;
-  }
-
-  // get type of evaluator
-  std::string getType(const std::string& name, Error* err) const {
-    this->getValue(name, err);
-    if (!err->isOK()) {
-      return "";
-    }
-    return "chunk";
-  }
-
-private:
-  void storeLocalValues() const {
-    CHECK_GE(numOutputSegments_, 0);
-    CHECK_GE(numLabelSegments_, 0);
-    double precision =
-        !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_;
-    double recall =
-        !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_;
-    values_["precision"] = precision;
-    values_["recall"] = recall;
-    values_["F1-score"] =
-        !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
-  }
-};
-
-REGISTER_EVALUATOR(chunk, ChunkEvaluator);
-
-}  // namespace paddle
diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
deleted file mode 100644
index f43ef5dd51407236a3a36b300b33f92a9fad885a..0000000000000000000000000000000000000000
--- a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Evaluator.h"
-#include "paddle/gserver/layers/DetectionUtil.h"
-
-using std::map;
-using std::vector;
-using std::pair;
-using std::make_pair;
-
-namespace paddle {
-
-/**
- * @brief detection map Evaluator
- *
- * The config file api is detection_map_evaluator.
- */
-class DetectionMAPEvaluator : public Evaluator {
-public:
-  DetectionMAPEvaluator()
-      : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {}
-
-  virtual void start() {
-    Evaluator::start();
-    allTruePos_.clear();
-    allFalsePos_.clear();
-    numPos_.clear();
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    overlapThreshold_ = config_.overlap_threshold();
-    backgroundId_ = config_.background_id();
-    evaluateDifficult_ = config_.evaluate_difficult();
-    apType_ = config_.ap_type();
-
-    MatrixPtr detectTmpValue = arguments[0].value;
-    Matrix::resizeOrCreate(cpuOutput_,
-                           detectTmpValue->getHeight(),
-                           detectTmpValue->getWidth(),
-                           false,
-                           false);
-
-    MatrixPtr labelTmpValue = arguments[1].value;
-    Matrix::resizeOrCreate(cpuLabel_,
-                           labelTmpValue->getHeight(),
-                           labelTmpValue->getWidth(),
-                           false,
-                           false);
-
-    cpuOutput_->copyFrom(*detectTmpValue);
-    cpuLabel_->copyFrom(*labelTmpValue);
-
-    Argument label = arguments[1];
-    const int* labelIndex = label.sequenceStartPositions->getData(false);
-    size_t batchSize = label.getNumSequences();
-
-    vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes;
-    vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes;
-
-    for (size_t n = 0; n < batchSize; ++n) {
-      map<size_t, vector<NormalizedBBox>> bboxes;
-      for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) {
-        vector<NormalizedBBox> bbox;
-        getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox);
-        int c = cpuLabel_->getData()[i * 6];
-        bboxes[c].push_back(bbox[0]);
-      }
-      allGTBBoxes.push_back(bboxes);
-    }
-
-    size_t n = 0;
-    const real* cpuOutputData = cpuOutput_->getData();
-    for (size_t imgId = 0; imgId < batchSize; ++imgId) {
-      map<size_t, vector<pair<real, NormalizedBBox>>> bboxes;
-      size_t curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
-      while (curImgId == imgId && n < cpuOutput_->getHeight()) {
-        vector<real> label;
-        vector<real> score;
-        vector<NormalizedBBox> bbox;
-        getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox);
-        bboxes[label[0]].push_back(make_pair(score[0], bbox[0]));
-        ++n;
-        curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
-      }
-      allDetectBBoxes.push_back(bboxes);
-    }
-
-    for (size_t n = 0; n < batchSize; ++n) {
-      for (map<size_t, vector<NormalizedBBox>>::iterator it =
-               allGTBBoxes[n].begin();
-           it != allGTBBoxes[n].end();
-           ++it) {
-        size_t count = 0;
-        if (evaluateDifficult_) {
-          count = it->second.size();
-        } else {
-          for (size_t i = 0; i < it->second.size(); ++i)
-            if (!(it->second[i].isDifficult)) ++count;
-        }
-        if (numPos_.find(it->first) == numPos_.end() && count != 0) {
-          numPos_[it->first] = count;
-        } else {
-          numPos_[it->first] += count;
-        }
-      }
-    }
-
-    // calcTFPos
-    calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes);
-
-    return 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    real mAP = calcMAP();
-    os << "Detection mAP=" << mAP;
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    LOG(FATAL) << "Distribute detection evaluation not implemented.";
-  }
-
-protected:
-  void calcTFPos(const size_t batchSize,
-                 const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes,
-                 const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>&
-                     allDetectBBoxes) {
-    for (size_t n = 0; n < allDetectBBoxes.size(); ++n) {
-      if (allGTBBoxes[n].size() == 0) {
-        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
-                 it = allDetectBBoxes[n].begin();
-             it != allDetectBBoxes[n].end();
-             ++it) {
-          size_t label = it->first;
-          for (size_t i = 0; i < it->second.size(); ++i) {
-            allTruePos_[label].push_back(make_pair(it->second[i].first, 0));
-            allFalsePos_[label].push_back(make_pair(it->second[i].first, 1));
-          }
-        }
-      } else {
-        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
-                 it = allDetectBBoxes[n].begin();
-             it != allDetectBBoxes[n].end();
-             ++it) {
-          size_t label = it->first;
-          vector<pair<real, NormalizedBBox>> predBBoxes = it->second;
-          if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) {
-            for (size_t i = 0; i < predBBoxes.size(); ++i) {
-              allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
-              allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1));
-            }
-          } else {
-            vector<NormalizedBBox> gtBBoxes =
-                allGTBBoxes[n].find(label)->second;
-            vector<bool> visited(gtBBoxes.size(), false);
-            // Sort detections in descend order based on scores
-            std::sort(predBBoxes.begin(),
-                      predBBoxes.end(),
-                      sortScorePairDescend<NormalizedBBox>);
-            for (size_t i = 0; i < predBBoxes.size(); ++i) {
-              real maxOverlap = -1.0;
-              size_t maxIdx = 0;
-              for (size_t j = 0; j < gtBBoxes.size(); ++j) {
-                real overlap =
-                    jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]);
-                if (overlap > maxOverlap) {
-                  maxOverlap = overlap;
-                  maxIdx = j;
-                }
-              }
-              if (maxOverlap > overlapThreshold_) {
-                if (evaluateDifficult_ ||
-                    (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) {
-                  if (!visited[maxIdx]) {
-                    allTruePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 1));
-                    allFalsePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 0));
-                    visited[maxIdx] = true;
-                  } else {
-                    allTruePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 0));
-                    allFalsePos_[label].push_back(
-                        make_pair(predBBoxes[i].first, 1));
-                  }
-                }
-              } else {
-                allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
-                allFalsePos_[label].push_back(
-                    make_pair(predBBoxes[i].first, 1));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  real calcMAP() const {
-    real mAP = 0.0;
-    size_t count = 0;
-    for (map<size_t, size_t>::const_iterator it = numPos_.begin();
-         it != numPos_.end();
-         ++it) {
-      size_t label = it->first;
-      size_t labelNumPos = it->second;
-      if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end())
-        continue;
-      vector<pair<real, size_t>> labelTruePos = allTruePos_.find(label)->second;
-      vector<pair<real, size_t>> labelFalsePos =
-          allFalsePos_.find(label)->second;
-      // Compute average precision.
-      vector<size_t> tpCumSum;
-      getAccumulation(labelTruePos, &tpCumSum);
-      vector<size_t> fpCumSum;
-      getAccumulation(labelFalsePos, &fpCumSum);
-      std::vector<real> precision, recall;
-      size_t num = tpCumSum.size();
-      // Compute Precision.
-      for (size_t i = 0; i < num; ++i) {
-        CHECK_LE(tpCumSum[i], labelNumPos);
-        precision.push_back(static_cast<real>(tpCumSum[i]) /
-                            static_cast<real>(tpCumSum[i] + fpCumSum[i]));
-        recall.push_back(static_cast<real>(tpCumSum[i]) / labelNumPos);
-      }
-      // VOC2007 style
-      if (apType_ == "11point") {
-        vector<real> maxPrecisions(11, 0.0);
-        int startIdx = num - 1;
-        for (int j = 10; j >= 0; --j)
-          for (int i = startIdx; i >= 0; --i) {
-            if (recall[i] < j / 10.) {
-              startIdx = i;
-              if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j];
-              break;
-            } else {
-              if (maxPrecisions[j] < precision[i])
-                maxPrecisions[j] = precision[i];
-            }
-          }
-        for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11;
-        ++count;
-      } else if (apType_ == "Integral") {
-        // Nature integral
-        real averagePrecisions = 0.;
-        real prevRecall = 0.;
-        for (size_t i = 0; i < num; ++i) {
-          if (fabs(recall[i] - prevRecall) > 1e-6)
-            averagePrecisions += precision[i] * fabs(recall[i] - prevRecall);
-          prevRecall = recall[i];
-        }
-        mAP += averagePrecisions;
-        ++count;
-      } else {
-        LOG(FATAL) << "Unkown ap version: " << apType_;
-      }
-    }
-    if (count != 0) mAP /= count;
-    return mAP * 100;
-  }
-
-  void getAccumulation(vector<pair<real, size_t>> inPairs,
-                       vector<size_t>* accuVec) const {
-    std::stable_sort(
-        inPairs.begin(), inPairs.end(), sortScorePairDescend<size_t>);
-    accuVec->clear();
-    size_t sum = 0;
-    for (size_t i = 0; i < inPairs.size(); ++i) {
-      sum += inPairs[i].second;
-      accuVec->push_back(sum);
-    }
-  }
-
-  std::string getTypeImpl() const { return "detection_map"; }
-
-  real getValueImpl() const { return calcMAP(); }
-
-private:
-  real overlapThreshold_;  // overlap threshold when determining whether matched
-  bool evaluateDifficult_;  // whether evaluate difficult ground truth
-  size_t backgroundId_;     // class index of background
-  std::string apType_;      // how to calculate mAP (Integral or 11point)
-
-  MatrixPtr cpuOutput_;
-  MatrixPtr cpuLabel_;
-
-  map<size_t, size_t> numPos_;  // counts of true objects each classification
-  map<size_t, vector<pair<real, size_t>>>
-      allTruePos_;  // true positive prediction
-  map<size_t, vector<pair<real, size_t>>>
-      allFalsePos_;  // false positive prediction
-};
-
-REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator);
-
-}  // namespace paddle
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
deleted file mode 100644
index 79478e7fac63a49c494105d53a6944b4b89e6c63..0000000000000000000000000000000000000000
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ /dev/null
@@ -1,1361 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/gserver/evaluators/Evaluator.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/StringUtil.h"
-
-DECLARE_int32(trainer_id);
-
-namespace paddle {
-
-void Evaluator::eval(const NeuralNetwork& nn) {
-  std::vector<Argument> arguments;
-  arguments.reserve(config_.input_layers_size());
-  for (const std::string& name : config_.input_layers()) {
-    arguments.push_back(nn.getLayer(name)->getOutput());
-  }
-  SetDevice device(arguments[0].deviceId);
-  real score = evalImp(arguments);
-  totalScore_ += score;
-  updateSamplesNum(arguments);
-}
-/**
- * @brief classification error Evaluator
- *
- * The config file api is classification_error_evaluator.
- */
-class ClassificationErrorEvaluator : public Evaluator {
-public:
-  /*
-  ClassificationErrorEvaluator() : totalScore2_(0) {}
-
-  virtual void start() {
-    Evaluator::start();
-    totalScore2_ = 0;
-    } */
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (3 == arguments.size()) {
-      numSamples_ += arguments[2].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  MatrixPtr calcError(std::vector<Argument>& arguments) {
-    CHECK_GE(arguments.size(), (size_t)2);
-    CHECK_LE(arguments.size(), (size_t)3);
-    MatrixPtr& output = arguments[0].value;
-    IVectorPtr& label = arguments[1].ids;
-    MatrixPtr& multiBinaryLabel = arguments[1].value;  // For multi binary label
-    bool supportWeight = (3 == arguments.size()) ? true : false;
-    MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-    if (nullptr == output ||
-        (nullptr == label && nullptr == multiBinaryLabel) ||
-        (supportWeight && nullptr == weight)) {
-      return 0;
-    }
-
-    if (label != nullptr) {
-      CHECK_EQ(label->getSize(), output->getHeight());
-    } else {
-      CHECK_EQ(multiBinaryLabel->getHeight(), output->getHeight());
-      CHECK_EQ(multiBinaryLabel->getWidth(), output->getWidth());
-    }
-    if (supportWeight) {
-      CHECK_EQ(output->getHeight(), weight->getHeight());
-      CHECK_EQ((size_t)1, weight->getWidth());
-    }
-
-    const MatrixPtr errorMat = Matrix::create(output->getHeight(),
-                                              1,
-                                              /* trans= */ false,
-                                              useGpu(arguments[0].deviceId));
-
-    errorMat->zeroMem();
-
-    if (label != nullptr) {
-      errorMat->classificationError(*output, *label, config_.top_k());
-    } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
-               dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
-      errorMat->classificationErrorMulti(
-          *output, *multiBinaryLabel, config_.classification_threshold());
-    } else {
-      errorMat->binaryClassificationError(
-          0, *output, *multiBinaryLabel, config_.classification_threshold());
-    }
-
-    if (supportWeight) {
-      errorMat->dotMul(*errorMat, *weight);
-    }
-    return errorMat;
-  }
-
-  void printStats(std::ostream& os) const {
-    if (config_.top_k() == 1) {
-      os << config_.name() << "="
-         << (numSamples_ ? totalScore_ / numSamples_ : 0);
-    } else {
-      os << " top_" << config_.top_k()
-         << "_error=" << (numSamples_ ? totalScore_ / numSamples_ : 0);
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    MatrixPtr errorMat = calcError(arguments);
-    return errorMat->getSum();
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-  // Evaluator interface
-protected:
-  std::string getTypeImpl() const { return "classification_error"; }
-};
-
-/**
- * @brief sequence classification error Evaluator
- * @note sequence level classification error stats,
- * if any frame in one sequence has error, the sequence is error
- */
-class SequenceClassificationErrorEvaluator
-    : public ClassificationErrorEvaluator {
-public:
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSamples_ += arguments[0].getNumSequences();
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    auto sequenceStartPositions =
-        arguments[0].sequenceStartPositions->getVector(false);
-    CHECK(sequenceStartPositions != nullptr);
-    const int* starts = sequenceStartPositions->getData();
-
-    MatrixPtr errorMat = calcError(arguments);
-
-    int errCounter = 0;
-    CpuVector errorVec(0, nullptr);
-    for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) {
-      errorVec.subVecFrom(
-          errorMat->getData(), starts[i], starts[i + 1] - starts[i]);
-      if (errorVec.getSum() > 0) {
-        errCounter += 1;
-      }
-    }
-
-    return static_cast<real>(errCounter);
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-  // Evaluator interface
-protected:
-  std::string getTypeImpl() const { return "seq_classification_error"; }
-};
-REGISTER_EVALUATOR(seq_classification_error,
-                   SequenceClassificationErrorEvaluator);
-/**
- * @brief sum Evaluator
- * Calculate the sum of output or label
- *
- * The config file api is sum_evaluator.
- */
-class SumEvaluator : public Evaluator {
-public:
-  SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {}
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (2 == arguments.size()) {
-      numSamples_ += arguments[1].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    REGISTER_TIMER("SumEvaluator");
-    CHECK_GE(arguments.size(), (size_t)1);
-    CHECK_LE(arguments.size(), (size_t)2);
-    bool supportWeight = (2 == arguments.size()) ? true : false;
-    if (supportWeight) {
-      if (nullptr == arguments[1].value) {
-        return 0;
-      }
-      CHECK_EQ(arguments[1].value->getWidth(), (size_t)1);
-    }
-
-    // The sum of output
-    if (arguments[0].value) {
-      if (supportWeight) {
-        CHECK_EQ(arguments[0].value->getHeight(),
-                 arguments[1].value->getHeight());
-        MatrixPtr tmpMat = Matrix::create(arguments[0].value->getHeight(),
-                                          arguments[0].value->getWidth(),
-                                          /* trans= */ false,
-                                          arguments[0].value->useGpu());
-        tmpMat->copyFrom(*arguments[0].value);
-        tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
-        return tmpMat->getSum();
-      } else {
-        return arguments[0].value->getSum();
-      }
-      // The sum of label
-    } else if (arguments[0].ids) {
-      size_t insNum = arguments[0].ids->getSize();
-      IVectorPtr label = arguments[0].ids;
-      MatrixPtr weight = supportWeight ? arguments[1].value : nullptr;
-      if (dynamic_cast<GpuIVector*>(label.get())) {
-        IVector::resizeOrCreate(cpuLabel_, insNum, false);
-        cpuLabel_->copyFrom(*arguments[0].ids);
-
-        if (supportWeight) {
-          CHECK_EQ(insNum, arguments[1].value->getHeight());
-          Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-          cpuWeight_->copyFrom(*arguments[1].value);
-        }
-
-        label = cpuLabel_;
-        weight = cpuWeight_;
-      }
-
-      if (supportWeight) {
-        real score = 0.0;
-        int* labelD = label->getData();
-        real* weightD = weight->getData();
-        for (size_t i = 0; i < insNum; ++i) {
-          score += (labelD[i] * weightD[i]);
-        }
-        return score;
-      } else {
-        return label->getSum();
-      }
-    } else {
-      return 0;
-    }
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-private:
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  // Evaluator interface
-protected:
-  std::string getTypeImpl() const { return "sum"; }
-};
-/**
- * @brief column sum Evaluator
- * @note column sum for the colIdx-th column *
- * - colIdx = 0: the 0-th column.
- * - colIdx > 0: the colIdx-th column.
- * - colIdx < 0: the last colIdx-th column.
- *
- * The config file api is column_sum_evaluator.
- *
- */
-class ColumnSumEvaluator : public Evaluator {
-public:
-  explicit ColumnSumEvaluator(int32_t colIdx)
-      : colIdx_(colIdx), colNum_(0), sum_(nullptr) {}
-
-  virtual void start() {
-    Evaluator::start();
-    if (nullptr != sum_) {
-      sum_->zeroMem();
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    if (2 == arguments.size()) {
-      numSamples_ += arguments[1].value->getSum();
-    } else {
-      numSamples_ += arguments[0].getBatchSize();
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    REGISTER_TIMER("ColumnSumEvaluator");
-    CHECK_GE(arguments.size(), (size_t)1);
-    CHECK_LE(arguments.size(), (size_t)2);
-    bool supportWeight = (2 == arguments.size()) ? true : false;
-    if (nullptr == arguments[0].value ||
-        (supportWeight && nullptr == arguments[1].value)) {
-      return 0;
-    }
-
-    size_t insNum = arguments[0].value->getHeight();
-    size_t colNum = arguments[0].value->getWidth();
-    if (nullptr == sum_) {
-      sum_ = Matrix::create((size_t)1, colNum, false, /* useGpu */ false);
-      colNum_ = colNum;
-      sum_->zeroMem();
-    } else {
-      CHECK_EQ(colNum, sum_->getWidth());
-    }
-
-    if (supportWeight) {
-      CHECK_EQ(insNum, arguments[1].value->getHeight());
-      CHECK_EQ((size_t)1, arguments[1].value->getWidth());
-      MatrixPtr tmpMat = Matrix::create(insNum, colNum);
-      if (arguments[0].value->useGpu()) {
-        tmpMat->copyFrom(*arguments[0].value);
-      }
-      if (!arguments[1].value->useGpu()) {
-        if (!arguments[0].value->useGpu()) {
-          tmpMat->rowScale(0, *arguments[0].value, *arguments[1].value);
-        } else {
-          tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
-        }
-      } else {
-        MatrixPtr tmp2 = Matrix::create(insNum, 1);
-        tmp2->copyFrom(*arguments[1].value);
-        if (!arguments[0].value->useGpu()) {
-          tmpMat->rowScale(0, *arguments[0].value, *tmp2);
-        } else {
-          tmpMat->rowScale(0, *tmpMat, *tmp2);
-        }
-      }
-      sum_->accumulateColSum(*tmpMat);
-    } else {
-      if (!arguments[0].value->useGpu()) {
-        sum_->accumulateColSum(*arguments[0].value);
-      } else {
-        MatrixPtr tmpMat = Matrix::create(insNum, colNum);
-        tmpMat->copyFrom(*arguments[0].value);
-        sum_->accumulateColSum(*tmpMat);
-      }
-    }
-    return 0;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0)
-        << "column index [" << colIdx_ << "] out of range [-" << colNum_ << ", "
-        << colNum_ << ")";
-    size_t colIdx = 0;
-    if (colIdx_ >= 0) {
-      colIdx = colIdx_;
-    } else {
-      colIdx = colNum_ + colIdx_;
-    }
-    os << config_.name() << "="
-       << (numSamples_ ? sum_->getElement(0, colIdx) / numSamples_ : 0);
-  }
-
-  void distributeEval(ParameterClient2* client) {
-    client->reduce(
-        sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0);
-    client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0);
-  }
-
-private:
-  int32_t colIdx_;
-  size_t colNum_;
-  MatrixPtr sum_; /* cpu matrix */
-
-  // Evaluator interface
-protected:
-  std::string getTypeImpl() const {
-    if (colIdx_ == -1)
-      return "last-column-sum";
-    else
-      return "column-sum";
-  }
-};
-
-void AucEvaluator::start() {
-  Evaluator::start();
-  memset(statPos_, 0, sizeof(statPos_));
-  memset(statNeg_, 0, sizeof(statNeg_));
-}
-
-real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
-  REGISTER_TIMER("AucEvaluator");
-  CHECK_GE(arguments.size(), (size_t)2);
-  CHECK_LE(arguments.size(), (size_t)3);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  MatrixPtr labelval = arguments[1].value;
-  bool supportWeight = (3 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-
-  if (nullptr == output || (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-  size_t insNum = output->getHeight();
-  size_t outputDim = output->getWidth();
-  // Copy label from value to a vector.
-  if (nullptr == label && nullptr != labelval) {
-    // label width is 1
-    CHECK_EQ(1U, labelval->getWidth());
-    VectorPtr vec =
-        Vector::create(labelval->getData(), insNum, output->useGpu());
-    label = vec->castToInt();
-  }
-
-  CHECK_EQ(insNum, label->getSize());
-  if (supportWeight) {
-    CHECK_EQ(insNum, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  CHECK(colIdx_ + (int32_t)outputDim >= 0 && colIdx_ - (int32_t)outputDim < 0)
-      << "column index [" << colIdx_ << "] out of range [-" << outputDim << ", "
-      << outputDim << ")";
-  realColumnIdx_ = 0;
-  if (colIdx_ >= 0) {
-    realColumnIdx_ = colIdx_;
-  } else {
-    realColumnIdx_ = outputDim + colIdx_;
-  }
-
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    Matrix::resizeOrCreate(cpuOutput_,
-                           insNum,
-                           outputDim,
-                           /* trans=*/false,
-                           /* useGpu=*/false);
-    cpuOutput_->copyFrom(*output);
-    IVector::resizeOrCreate(cpuLabel_, insNum, false);
-    cpuLabel_->copyFrom(*label);
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-    }
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    weight = cpuWeight_;
-  }
-
-  real* outputD = output->getData();
-  int* labelD = label->getData();
-  real* weightD = supportWeight ? weight->getData() : nullptr;
-  size_t pos = realColumnIdx_;
-
-  for (size_t i = 0; i < insNum; ++i) {
-    real value = outputD[pos];
-    uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_);
-    CHECK(binIdx <= kBinNum_) << "bin index [" << binIdx
-                              << "] out of range, predict value[" << value
-                              << "]";
-    real w = supportWeight ? weightD[i] : 1.0;
-    if (labelD[i] == kNegativeLabel_) {
-      statNeg_[binIdx] += w;
-    } else {
-      statPos_[binIdx] += w;
-    }
-    pos += outputDim;
-  }
-  return 0;
-}
-
-void AucEvaluator::distributeEval(ParameterClient2* client) {
-  client->reduce(statPos_, statPos_, kBinNum_ + 1, FLAGS_trainer_id, 0);
-  client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0);
-}
-
-double AucEvaluator::calcAuc() const {
-  double totPos = 0.0;
-  double totNeg = 0.0;
-  double totPosPrev = 0.0;
-  double totNegPrev = 0.0;
-  double auc = 0.0;
-
-  int64_t idx = kBinNum_;
-  while (idx >= 0) {
-    totPosPrev = totPos;
-    totNegPrev = totNeg;
-    totPos += statPos_[idx];
-    totNeg += statNeg_[idx];
-    auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
-    --idx;
-  }
-
-  if (totPos > 0.0 && totNeg > 0.0) {
-    return auc / totPos / totNeg;
-  } else {
-    return 0.0;
-  }
-}
-
-real AucEvaluator::getValueImpl() const { return calcAuc(); }
-
-std::string AucEvaluator::getTypeImpl() const {
-  if (colIdx_ == -1) {
-    return "last-column-auc";
-  } else {
-    return "auc";
-  }
-}
-
-// class RankAucEvaluator
-REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
-
-void RankAucEvaluator::start() { Evaluator::start(); }
-void RankAucEvaluator::updateSamplesNum(
-    const std::vector<Argument>& arguments) {
-  numSamples_ += arguments[0].getNumSequences();
-}
-real RankAucEvaluator::evalImp(std::vector<Argument>& arguments) {
-  CHECK_GE(arguments.size(), 2U);
-  CHECK_LE(arguments.size(), 3U);
-  double batchAuc = 0.0;
-  output_ = arguments[0].value;
-  click_ = arguments[1].value;
-  size_t batchSize = output_->getHeight();
-  CHECK(!output_->useGpu()) << "RankAUC evaluator does not support GPU!";
-
-  if (arguments.size() == 3U) {
-    pv_ = arguments[2].value;
-  } else {
-    Matrix::resizeOrCreate(pv_, batchSize, 1, false, false);
-    std::fill(pv_->getData(), pv_->getData() + batchSize, 1.0);
-  }
-
-  real* outputData = output_->getData();
-  real* clickData = click_->getData();
-  real* pvData = pv_->getData();
-
-  auto startPos = arguments[0].sequenceStartPositions->getVector(false);
-  const int* startPosData = startPos->getData();
-  size_t batchNum = startPos->getSize() - 1;
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    batchAuc += calcRankAuc(outputData + beginPos,
-                            clickData + beginPos,
-                            pvData + beginPos,
-                            endPos - beginPos);
-  }
-  return batchAuc;
-}
-
-double RankAucEvaluator::calcRankAuc(real* outputData,
-                                     real* clickData,
-                                     real* pvData,
-                                     size_t size) {
-  outputPair_.clear();
-  for (size_t i = 0; i < size; ++i) {
-    outputPair_.push_back(std::make_pair(outputData[i], i));
-  }
-  std::sort(outputPair_.begin(),
-            outputPair_.end(),
-            [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-              return a.first > b.first;
-            });
-  double aucTmp = 0.0;
-  double clickSum = 0.0;
-  double oldClickSum = 0.0;
-  double noClick = 0.0;
-  double noClickSum = 0.0;
-
-  double lastScore = outputPair_[0].first + 1.0;
-  for (size_t i = 0; i < size; ++i) {
-    if (lastScore != outputPair_[i].first) {
-      aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
-      oldClickSum = clickSum;
-      noClick = 0.0;
-      lastScore = outputPair_[i].first;
-    }
-    size_t id = outputPair_[i].second;
-    noClick += pvData[id] - clickData[id];
-    noClickSum += noClick;
-    clickSum += clickData[id];
-  }
-  aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
-  return (clickSum * noClickSum) == 0.0 ? 0.0
-                                        : aucTmp / (clickSum * noClickSum);
-}
-
-std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
-
-// class PrecisionRecallEvaluator
-REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
-
-void PrecisionRecallEvaluator::start() {
-  Evaluator::start();
-  statsInfo_.clear();
-  values_.clear();
-}
-
-real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
-  REGISTER_TIMER("PrecisionRecallEvaluator");
-  CHECK_GE(arguments.size(), (size_t)2);
-  CHECK_LE(arguments.size(), (size_t)3);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  MatrixPtr multiBinaryLabel = arguments[1].value;
-  bool supportWeight = (3 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-  if (nullptr == output || (nullptr == label && nullptr == multiBinaryLabel) ||
-      (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-
-  size_t insNum = output->getHeight();
-  size_t outputDim = output->getWidth();
-  if (label != nullptr) {
-    CHECK_EQ(insNum, label->getSize());
-  } else {
-    CHECK_EQ(insNum, multiBinaryLabel->getHeight());
-    CHECK_EQ(outputDim, multiBinaryLabel->getWidth());
-  }
-  if (supportWeight) {
-    CHECK_EQ(insNum, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  if (statsInfo_.size() != outputDim) {
-    statsInfo_.clear();
-    statsInfo_.resize(outputDim);
-  }
-
-  isMultiBinaryLabel_ = (nullptr == label) ? true : false;
-  if (label != nullptr) {
-    if (dynamic_cast<GpuMatrix*>(output.get())) {
-      Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, false, false);
-      cpuOutput_->copyFrom(*output);
-      IVector::resizeOrCreate(cpuLabel_, insNum, false);
-      cpuLabel_->copyFrom(*label);
-      if (supportWeight) {
-        Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
-        cpuWeight_->copyFrom(*weight);
-      }
-
-      output = cpuOutput_;
-      label = cpuLabel_;
-      weight = cpuWeight_;
-    }
-    calcStatsInfo(output, label, weight);
-  } else {
-    // Not support GPU for multi binary labels
-    CHECK(dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()));
-    calcStatsInfoMulti(output, multiBinaryLabel, weight);
-  }
-  return 0;
-}
-
-void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
-  PrintStatsInfo info;
-  bool containMacroMicroInfo = getStatsInfo(&info);
-  os << "positive_label=" << config_.positive_label()
-     << " precision=" << info.precision << " recall=" << info.recall
-     << " F1-score=" << info.f1;
-  if (containMacroMicroInfo) {
-    os << "macro-average-precision=" << info.macroAvgPrecision
-       << " macro-average-recall=" << info.macroAvgRecall
-       << " macro-average-F1-score=" << info.macroAvgF1Score;
-    if (!isMultiBinaryLabel_) {
-      // precision and recall are equal in this case
-      os << " micro-average-precision=" << info.microAvgPrecision;
-    } else {
-      os << " micro-average-precision=" << info.microAvgPrecision
-         << " micro-average-recall=" << info.microAvgRecall
-         << " micro-average-F1-score=" << info.microAvgF1Score;
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::calcStatsInfo(const MatrixPtr& output,
-                                             const IVectorPtr& label,
-                                             const MatrixPtr& weight) {
-  size_t insNum = output->getHeight();
-  size_t dim = output->getWidth();
-  real* outputD = output->getData();
-  int* labelD = label->getData();
-  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
-  for (size_t i = 0; i < insNum; ++i) {
-    CHECK_GE(labelD[i], 0);
-    CHECK_LT((size_t)labelD[i], dim);
-    size_t maxIdx = 0;
-    real maxValue = outputD[i * dim];
-    for (size_t j = 1; j < dim; ++j) {
-      size_t idx = i * dim + j;
-      if (maxValue < outputD[idx]) {
-        maxIdx = j;
-        maxValue = outputD[idx];
-      }
-    }
-
-    real w = (weightD != nullptr) ? weightD[i] : 1.0;
-    if (maxIdx == (size_t)labelD[i]) {
-      statsInfo_[maxIdx].TP += w;  // true positive for labelD[i]
-      // true negative for all labels except for labelD[i]
-      for (size_t j = 0; j < dim; ++j) {
-        statsInfo_[j].TN += w;
-      }
-      statsInfo_[maxIdx].TN -= w;
-    } else {
-      statsInfo_[labelD[i]].FN += w;  // false negative for labelD[i]
-      statsInfo_[maxIdx].FP += w;     // false positive for maxIdx
-      // true negatives for all labels except for maxIdx and labelD[i]
-      for (size_t j = 0; j < dim; ++j) {
-        statsInfo_[j].TN += w;
-      }
-      statsInfo_[maxIdx].TN -= w;
-      statsInfo_[labelD[i]].TN -= w;
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
-                                                  const MatrixPtr& label,
-                                                  const MatrixPtr& weight) {
-  size_t insNum = output->getHeight();
-  size_t dim = output->getWidth();
-  real* outputD = output->getData();
-  auto labelD = dynamic_cast<CpuSparseMatrix*>(label.get());
-  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
-  real threshold = config_.classification_threshold();
-  for (size_t i = 0; i < insNum; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      real w = (weightD != nullptr) ? weightD[i] : 1.0;
-      size_t idx = i * dim + j;
-      if (outputD[idx] < threshold) {
-        statsInfo_[j].TN += w;  // true negative
-      } else {
-        statsInfo_[j].FP += w;  // false positive
-      }
-    }
-
-    const int* cols = labelD->getRowCols(i);
-    for (size_t j = 0; j < labelD->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      real w = (weightD != nullptr) ? weightD[i] : 1.0;
-      size_t idx = i * dim + cols[j];
-      if (outputD[idx] < threshold) {
-        statsInfo_[cols[j]].FN += w;  // false negative
-        statsInfo_[cols[j]].TN -= w;  // true negative
-      } else {
-        statsInfo_[cols[j]].TP += w;  // true positive
-        statsInfo_[cols[j]].FP -= w;  // false positive
-      }
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::storeLocalValues() const {
-  if (this->values_.size() == 0) {
-    PrintStatsInfo info;
-    bool containMacroMicroInfo = getStatsInfo(&info);
-    values_["precision"] = info.precision;
-    values_["recal"] = info.recall;
-    values_["F1-score"] = info.f1;
-    if (containMacroMicroInfo) {
-      values_["macro-average-precision"] = info.macroAvgPrecision;
-      values_["macro-average-recall"] = info.macroAvgRecall;
-      values_["macro-average-F1-score"] = info.macroAvgF1Score;
-      if (!isMultiBinaryLabel_) {
-        // precision and recall are equal in this case
-        values_["micro-average-precision"] = info.microAvgPrecision;
-      } else {
-        values_["micro-average-precision"] = info.microAvgPrecision;
-        values_["micro-average-recall"] = info.microAvgRecall;
-        values_["micro-average-F1-score"] = info.microAvgF1Score;
-      }
-    }
-  }
-}
-
-void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
-  this->storeLocalValues();
-  names->reserve(this->values_.size());
-  for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
-    names->push_back(this->config_.name() + "." + it->first);
-  }
-}
-
-real PrecisionRecallEvaluator::getValue(const std::string& name,
-                                        Error* err) const {
-  this->storeLocalValues();
-  std::vector<std::string> buffers;
-  paddle::str::split(name, '.', &buffers);
-  auto it = this->values_.find(buffers[buffers.size() - 1]);
-  if (it == this->values_.end()) {  // not found
-    *err = Error("No such key %s", name.c_str());
-    return .0f;
-  }
-
-  return it->second;
-}
-
-std::string PrecisionRecallEvaluator::getType(const std::string& name,
-                                              Error* err) const {
-  this->getValue(name, err);
-  if (!err->isOK()) {
-    return "";
-  }
-  return "precision_recall";
-}
-
-void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
-  size_t size = 4 * statsInfo_.size();
-  double* buf = new double[size];
-  for (size_t i = 0; i < statsInfo_.size(); ++i) {
-    buf[4 * i + 0] = statsInfo_[i].TP;
-    buf[4 * i + 1] = statsInfo_[i].TN;
-    buf[4 * i + 2] = statsInfo_[i].FP;
-    buf[4 * i + 3] = statsInfo_[i].FN;
-  }
-  client->reduce(buf, buf, size, FLAGS_trainer_id, 0);
-  for (size_t i = 0; i < statsInfo_.size(); ++i) {
-    statsInfo_[i].TP = buf[4 * i + 0];
-    statsInfo_[i].TN = buf[4 * i + 1];
-    statsInfo_[i].FP = buf[4 * i + 2];
-    statsInfo_[i].FN = buf[4 * i + 3];
-  }
-  delete[] buf;
-}
-
-bool PrecisionRecallEvaluator::getStatsInfo(
-    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
-  int label = config_.positive_label();
-  if (label != -1) {
-    CHECK(label >= 0 && label < (int)statsInfo_.size())
-        << "positive_label [" << label << "] should be in range [0, "
-        << statsInfo_.size() << ")";
-    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
-    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
-    info->f1 = calcF1Score(info->precision, info->recall);
-    return false;
-  }
-
-  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
-  // macro average method: precision = (precision1+precision2)/2
-  double microTotalTP = 0;
-  double microTotalFP = 0;
-  double microTotalFN = 0;
-  info->macroAvgPrecision = 0;
-  info->macroAvgRecall = 0;
-  size_t numLabels = statsInfo_.size();
-  for (size_t i = 0; i < numLabels; ++i) {
-    microTotalTP += statsInfo_[i].TP;
-    microTotalFP += statsInfo_[i].FP;
-    microTotalFN += statsInfo_[i].FN;
-    info->macroAvgPrecision +=
-        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
-    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
-  }
-  info->macroAvgPrecision /= numLabels;
-  info->macroAvgRecall /= numLabels;
-  info->macroAvgF1Score =
-      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
-
-  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
-  info->microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
-  info->microAvgF1Score =
-      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
-  return true;
-}
-
-REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
-void PnpairEvaluator::start() {
-  Evaluator::start();
-  memset(pairArray_, 0, sizeof(pairArray_));
-  predictArray_.clear();
-}
-
-real PnpairEvaluator::evalImp(std::vector<Argument>& arguments) {
-  CHECK_GE(arguments.size(), 3UL);
-  CHECK_LE(arguments.size(), 4UL);
-  MatrixPtr output = arguments[0].value;
-  IVectorPtr label = arguments[1].ids;
-  IVectorPtr info = arguments[2].ids;
-  bool supportWeight = (4 == arguments.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? arguments[3].value : nullptr;
-  if (nullptr == output || nullptr == label ||
-      (supportWeight && nullptr == weight)) {
-    return 0;
-  }
-  size_t height = output->getHeight();
-  size_t width = output->getWidth();
-  CHECK_EQ(height, label->getSize());
-  CHECK_EQ(height, info->getSize());
-  if (supportWeight) {
-    CHECK_EQ(height, weight->getHeight());
-    CHECK_EQ((size_t)1, weight->getWidth());
-  }
-
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    Matrix::resizeOrCreate(cpuOutput_, height, width, false, false);
-    IVector::resizeOrCreate(cpuLabel_, height, false);
-    IVector::resizeOrCreate(cpuInfo_, height, false);
-    cpuOutput_->copyFrom(*output);
-    cpuLabel_->copyFrom(*label);
-    cpuInfo_->copyFrom(*info);
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    info = cpuInfo_;
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-      weight = cpuWeight_;
-    }
-  }
-
-  real* outputs = output->getData();
-  int* labels = label->getData();
-  int* infos = info->getData();
-  real* weights = supportWeight ? weight->getData() : nullptr;
-  for (size_t i = 0; i < output->getHeight(); i++) {
-    real y1 = outputs[i * width + (width - 1)];
-    real w = supportWeight ? weights[i] : 1.0;
-    predictArray_.push_back(PredictionResult(y1, labels[i], infos[i], w));
-  }
-  return 0;
-}
-
-void PnpairEvaluator::stat(size_t start,
-                           size_t end,
-                           PredictionResult* answers,
-                           double& pos,
-                           double& neg,
-                           double& spe) {
-  for (size_t i = start; i < end; i++) {
-    for (size_t j = i + 1; j < end; j++) {
-      CHECK_EQ(answers[i].queryid, answers[j].queryid);
-      // The pair weight is the mean of the two samples' weight
-      double weight = (answers[i].weight + answers[j].weight) / 2.0;
-      if (answers[i].label != answers[j].label) {
-        if ((answers[i].out > answers[j].out &&
-             answers[i].label > answers[j].label) ||
-            (answers[i].out < answers[j].out &&
-             answers[i].label < answers[j].label)) {
-          pos += weight;
-        } else if ((answers[i].out > answers[j].out &&
-                    answers[i].label < answers[j].label) ||
-                   (answers[i].out < answers[j].out &&
-                    answers[i].label > answers[j].label)) {
-          neg += weight;
-        } else {
-          spe += weight;
-        }
-      }
-    }
-  }
-}
-
-void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
-  std::sort(predictArray.begin(),
-            predictArray.end(),
-            [](const PredictionResult& x, const PredictionResult& y) {
-              return x.queryid < y.queryid;
-            });
-
-  double pos = 0;
-  double neg = 0;
-  double special = 0;
-  auto start = predictArray.begin();
-  while (start != predictArray.end()) {
-    auto end = std::find_if(
-        start + 1, predictArray.end(), [=](const PredictionResult& x) {
-          return x.queryid != start->queryid;
-        });
-    CHECK(end != start);
-    stat(start - predictArray.begin(),
-         end - predictArray.begin(),
-         predictArray.data(),
-         pos,
-         neg,
-         special);
-
-    start = end;
-  }
-
-  pairArray_[0] += pos;
-  pairArray_[1] += neg;
-
-  LOG(INFO) << " calc total pos pair: " << pos
-            << " calc total neg pair: " << neg
-            << " calc total special pair: " << special;
-}
-
-std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
-
-ClassRegistrar<Evaluator> Evaluator::registrar_;
-Evaluator* Evaluator::create(const EvaluatorConfig& config) {
-  Evaluator* evaluator = registrar_.createByType(config.type());
-  evaluator->init(config);
-  return evaluator;
-}
-
-REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
-REGISTER_EVALUATOR(sum, SumEvaluator);
-static InitFunction __reg_type_auc_sum__([]() {
-  Evaluator::registrar_.registerClass(
-      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
-  Evaluator::registrar_.registerClass("last-column-auc",
-                                      [] { return new AucEvaluator(-1); });
-});
-
-/**
- * @brief print value of each layer.
- *
- * The config file api is value_printer_evaluator.
- */
-class ValuePrinter : public NotGetableEvaluator {
-public:
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
-                                                      "layer=" + name + " ");
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(value_printer, ValuePrinter);
-
-/**
- * @brief print gradient of each layer.
- *
- * The config file api is gradient_printer_evaluator.
- */
-class GradientPrinter : public NotGetableEvaluator {
-public:
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.grad) {
-        std::ostringstream os;
-        argu.grad->print(os);
-        LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
-      }
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
-/**
- * @brief print row max id vctor of each layer
- *
- * The config file api is maxid_printer_evaluator.
- */
-class MaxIdPrinter : public NotGetableEvaluator {
-private:
-  IVectorPtr maxIds_;
-  MatrixPtr maxValues_;
-
-public:
-  MaxIdPrinter() {}
-
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.value) {
-        size_t height = argu.value->getHeight();
-        size_t width = config_.num_results();
-        IVector::resizeOrCreate(maxIds_, height * width, false);
-        Matrix::resizeOrCreate(maxValues_, height, width, false);
-        argu.value->rowMax(*maxIds_, *maxValues_);
-        std::ostringstream os;
-        int* ids = maxIds_->getData();
-        real* values = maxValues_->getData();
-        for (size_t i = 0; i < height; ++i) {
-          for (size_t j = 0; j < width; ++j) {
-            size_t pos = i * width + j;
-            os << ids[pos] << " : " << values[pos] << ", ";
-          }
-          os << std::endl;
-        }
-        LOG(INFO) << "layer=" << name << " row max id vector:\n" << os.str();
-      }
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
-/**
- * @brief print sequence max frames of each layer
- *
- * The config file api is maxframe_printer_evaluator.
- */
-class MaxFramePrinter : public NotGetableEvaluator {
-private:
-  IVectorPtr maxIds_;
-  MatrixPtr maxValues_;
-  MatrixPtr value_;
-
-public:
-  MaxFramePrinter() {
-    value_ =
-        Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false);
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-
-      CHECK_EQ(argu.value->getWidth(), 1LU);
-      size_t numSequences = argu.getNumSequences();
-      const int* starts = argu.sequenceStartPositions->getData(false);
-
-      std::ostringstream os;
-      for (size_t i = 0; i < numSequences; ++i) {
-        size_t offset = starts[i];
-        size_t size = starts[i + 1] - starts[i];
-        value_->setData(argu.value->getData() + offset, 1LU, size);
-
-        size_t height = 1LU;
-        size_t width = std::min((size_t)config_.num_results(), size);
-        IVector::resizeOrCreate(maxIds_, height * width, false);
-        Matrix::resizeOrCreate(maxValues_, height, width, false);
-
-        value_->rowMax(*maxIds_, *maxValues_);
-
-        int* ids = maxIds_->getData();
-        real* values = maxValues_->getData();
-        for (size_t j = 0; j < width; ++j) {
-          os << ids[j] << " : " << values[j] << ", ";
-        }
-        os << "total " << size << " frames" << std::endl;
-      }
-      LOG(INFO) << "layer=" << name << " sequence max frames:\n" << os.str();
-    }
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
-};
-REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
-
-/**
- * @brief print text according to index matrix and a dictionary.
- *
- * There can be multiple input to this layer:
- * - If there is only one input, the input must be a matrix containing
- *      the sequence of indices;
- * - If there are more than one input, the first input should be ids,
- *      and are interpreted as sample ids.
- *
- * The output format will be:
- *
- * - sequence without sub-sequence, and there is probability.
- *
- *     @code
- *      id \t prob space_seperated_tokens_from_dictionary_according_to_seq
- *     @endcode
- *
- * - sequence without sub-sequence, and there is not probability.
- *
- *     @code
- *      id \t space_seperated_tokens_from_dictionary_according_to_seq
- *     @endcode
- *
- * - sequence with sub-sequence, and there is not probability.
- *
- *     @code
- *      id \t space_seperated_tokens_from_dictionary_according_to_sub_seq
- *      \t \t space_seperated_tokens_from_dictionary_according_to_sub_seq
- *      ...
- *     @endcode
- *
- * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
- * with maxid (when generating) as an input.
- *
- * The config file api is seqtext_printer_evaluator.
- *
- */
-class SequenceTextPrinter : public NotGetableEvaluator {
-private:
-  /// dict_file, which contains a list of tokens
-  std::vector<std::string> dict_;
-  /// result_file, which is the output file
-  std::ofstream os_;
-  /// True/False, to indicate whether to use space to separate output tokens.
-  /// Default is True. No space is added if set to False.
-  bool delimited_;
-  /// store the cpu version of argument.ids
-  std::vector<IVectorPtr> cpuIds_;
-  /// store the probability associated with each sequence
-  std::vector<MatrixPtr> cpuIn_;
-
-public:
-  SequenceTextPrinter() {}
-
-  virtual void init(const EvaluatorConfig& config) {
-    Evaluator::init(config);
-    if (!config.dict_file().empty()) {
-      loadFileList(config.dict_file(), dict_);
-    }
-
-    os_.open(config.result_file(), std::ofstream::trunc);
-    CHECK(os_.is_open()) << "Failed to open file " << config.result_file();
-    delimited_ = config.delimited();
-  }
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    CHECK_GE(arguments.size(), 1LU);
-    bool hasId = arguments.size() > 1;
-    size_t numSequences = arguments[0].getNumSequences();
-    if (hasId) {
-      CHECK_EQ(arguments[0].ids->getSize(), numSequences)
-          << "first input must be sample id.";
-    }
-    for (size_t i = hasId ? 1 : 0; i < arguments.size(); ++i) {
-      CHECK_EQ((size_t)arguments[i].getNumSequences(), numSequences);
-    }
-
-    auto resizeVector = [](IVectorPtr& dest, const IVectorPtr& src) {
-      if (src && src->useGpu()) {
-        IVector::resizeOrCreate(dest, src->getSize(), false);
-        dest->copyFrom(*src);
-      } else {
-        dest = src;
-      }
-    };
-
-    auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) {
-      if (src && src->useGpu()) {
-        Matrix::resizeOrCreate(
-            dest, src->getHeight(), src->getWidth(), false, false);
-        dest->copyFrom(*src);
-      } else {
-        dest = src;
-      }
-    };
-
-    cpuIds_.resize(arguments.size());
-    cpuIn_.resize(arguments.size());
-    for (size_t i = 0; i < arguments.size(); ++i) {
-      resizeVector(cpuIds_[i], arguments[i].ids);
-      resizeMatrix(cpuIn_[i], arguments[i].in);
-    }
-
-    int* sampleIds = nullptr;
-    if (hasId) {
-      sampleIds = cpuIds_[0]->getData();
-    }
-
-    for (size_t i = 0; i < numSequences; ++i) {
-      os_ << (hasId ? sampleIds[i] : i);
-      for (size_t j = hasId ? 1 : 0; j < arguments.size(); ++j) {
-        int* output = cpuIds_[j]->getData();
-        const int* starts = arguments[j].sequenceStartPositions->getData(false);
-
-        auto seqPrint = [&](int start, int end) {
-          os_ << "\t";
-          for (int k = start; k < end; k++) {
-            int id = output[k];
-            os_ << (delimited_ ? " " : "");
-            if (!dict_.empty()) {
-              CHECK_LT((size_t)id, dict_.size());
-              os_ << dict_[id];
-            } else {
-              os_ << id;
-            }
-          }
-        };
-
-        if (arguments[j].hasSubseq()) {
-          // print sequence with sub-sequence
-          const int* subStarts =
-              arguments[j].subSequenceStartPositions->getData(false);
-          int subSeqId_start = 0;
-          int subSeqId_end = 0;
-          for (size_t k = 0; k < (size_t)arguments[j].getNumSubSequences() + 1;
-               ++k) {
-            if (starts[i] == subStarts[k]) subSeqId_start = k;
-            if (starts[i + 1] == subStarts[k]) subSeqId_end = k;
-          }
-          for (int k = subSeqId_start; k < subSeqId_end; k++) {
-            seqPrint(subStarts[k], subStarts[k + 1]);
-            os_ << std::endl;
-          }
-
-        } else {
-          // print sequence without sub-sequence
-          if (arguments[j].in) {  // beam print
-            real* probs = cpuIn_[j]->rowBuf(i);
-            os_ << std::endl;
-            int start = starts[i];
-            int seqEnd = starts[i + 1];
-            for (size_t k = 0; k < arguments[j].in->getWidth(); ++k) {
-              if (start == seqEnd) {
-                break;
-              }
-              int end = start + output[start] + 2;
-              CHECK_LE(end, seqEnd);
-              CHECK_EQ(output[end - 1], -1);
-              os_ << k << "\t" << probs[k];
-              seqPrint(start + 1, end - 1);
-              os_ << std::endl;
-              start = end;
-            }
-          } else {
-            seqPrint(starts[i], starts[i + 1]);
-          }
-        }
-      }
-      os_ << std::endl;
-    }
-    return 0;
-  }
-};
-REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter);
-/**
- * @brief print classification error.
- *
- * The config file api is classification_error_printer_evaluator.
- */
-class ClassificationErrorPrinter : public ClassificationErrorEvaluator {
-public:
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    MatrixPtr errorMat = calcError(arguments);
-
-    std::ostringstream os;
-    errorMat->print(os);
-    LOG(INFO) << "Printer=" << config_.name() << " Classification Error:\n"
-              << os.str();
-
-    if (auto startPos = arguments[0].sequenceStartPositions) {
-      std::ostringstream os;
-      startPos->getVector(false)->print(os, startPos->getSize());
-      LOG(INFO) << "Printer=" << config_.name() << " sequence pos vector:\n"
-                << os.str();
-    }
-    return 0;
-  }
-};
-REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
-
-std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
-
-}  // namespace paddle
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
deleted file mode 100644
index be2032992c455fe2b442dbe05d84128ef8ebf82f..0000000000000000000000000000000000000000
--- a/paddle/gserver/evaluators/Evaluator.h
+++ /dev/null
@@ -1,510 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <fstream>
-#include "ModelConfig.pb.h"
-#include "paddle/parameter/Argument.h"
-#include "paddle/pserver/ParameterClient2.h"
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/utils/Error.h"
-
-namespace paddle {
-
-class NeuralNetwork;
-/**
- * @def REGISTER_EVALUATOR
- * @brief Macro for registering evaluator class
- */
-
-#define REGISTER_EVALUATOR(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {                \
-    Evaluator::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-/**
- * @brief Base class for Evaluator
- * Evaluating the performance of a model is very important.
- * It indicates how successful the scores(predictions) of a datasets
- * has been by a trained model.
- */
-class Evaluator {
-public:
-  static Evaluator* create(const EvaluatorConfig& config);
-
-  Evaluator() : numSamples_(0), totalScore_(0) {}
-
-  virtual ~Evaluator() {}
-
-  virtual void init(const EvaluatorConfig& config) { config_ = config; }
-
-  /**
-   * @brief start to evaluate some data
-   */
-  virtual void start() {
-    numSamples_ = 0;
-    totalScore_ = 0;
-  }
-
-  /**
-   * @brief Process a batch of data.
-   */
-  virtual void eval(const NeuralNetwork& nn);
-
-  /**
-   * @brief Process a batch of data.
-   * @return the score for the batch if it make sense to sum the score across
-   * batches.
-   * @note Otherwise evaluator should return 0 and override finish() and
-   * printStats() to do the right calculation.
-   */
-  virtual real evalImp(std::vector<Argument>& arguments) = 0;
-
-  /**
-   * @brief Update the number of processed samples
-   */
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
-    numSamples_ += arguments[0].getBatchSize();
-  }
-
-  /// finish() should be called before distributeEval
-  virtual void distributeEval(ParameterClient2* client) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  void mergeResultsOfAllClients(ParameterClient2* client) {
-    double data[2] = {totalScore_, numSamples_};
-    client->reduce(data, data, 2, FLAGS_trainer_id, 0);
-    totalScore_ = data[0];
-    numSamples_ = data[1];
-  }
-
-  /**
-   * @brief finish the evaluation.
-   */
-  virtual void finish() {}
-
-  /**
-   * @brief print the statistics of evaluate result
-   * @note finish() should be called before printStats
-   */
-  virtual void printStats(std::ostream& os) const {
-    os << config_.name() << "="
-       << (numSamples_ ? totalScore_ / numSamples_ : 0);
-  }
-
-  friend std::ostream& operator<<(std::ostream& os,
-                                  const Evaluator& evaluator) {
-    evaluator.printStats(os);
-    return os;
-  }
-
-  friend std::ostream&& operator<<(std::ostream&& os,  // NOLINT
-                                   const Evaluator& evaluator) {
-    evaluator.printStats(os);
-    return std::move(os);
-  }
-
-  static ClassRegistrar<Evaluator> registrar_;
-
-  /**
-   * @brief getNames will return all field names of current evaluator.
-   *
-   * The format of name is `evaluator_name.evaluator_fields`. If the evaluator
-   * has multiple field, the name could be `evaluator_name.field1`. For example
-   * the PrecisionRecallEvaluator contains `precision`, `recall` fields. The get
-   * names will return `precision_recall_evaluator.precision`,
-   * `precision_recall_evaluator.recal`, etc.
-   *
-   * Also, if current Evaluator is a combined evaluator. getNames will return
-   * all names of all evaluators inside the combined evaluator.
-   *
-   * @param names [out]: the field names of current evaluator.
-   * @note Never clear the names parameter inside getNames.
-   */
-  virtual void getNames(std::vector<std::string>* names) {
-    names->push_back(config_.name());
-  }
-
-  /**
-   * @brief getValue will return the current evaluate value of one field.
-   *
-   * @param name: The field name of current evaluator.
-   * @param err [out]: The error state.
-   *
-   * @return The evaluate value(metric).
-   */
-  virtual real getValue(const std::string& name, Error* err) const {
-    if (name != config_.name()) {
-      *err = Error("no such name of evaluator %s", name.c_str());
-      return .0f;
-    }
-    return this->getValueImpl();
-  }
-
-  /**
-   * @brief getType will return the evaluator type by field name.
-   *
-   * Evaluate Type is the current type of evaluator in string. Such as 'auc',
-   * 'precision_recall'. In combined evaluator, different name may get different
-   * evaluate type because it could be evaluated by different evaluator inside.
-   *
-   * @param name: The field name of current Evaluator.
-   * @param err: The error state. nullptr means don't care.
-   * @return the evaluator type string.
-   */
-  virtual std::string getType(const std::string& name, Error* err) const {
-    if (name != config_.name()) {
-      *err = Error("no such name of evaluator %s", name.c_str());
-      return std::string();
-    }
-    return this->getTypeImpl();
-  }
-
-protected:
-  /**
-   * @brief getValueImpl The simplest way to define getValue result. If this
-   * evaluator doesn't contain multiple fields, and do not throw any error, just
-   * implemented this method to get the evaluate result(metric).
-   * @return Evaluate result(metric).
-   */
-  virtual real getValueImpl() const {
-    return numSamples_ != .0 ? totalScore_ / numSamples_ : .0;
-  }
-
-  /**
-   * @brief getTypeImpl The simplest way to define getType result. If this
-   * evaluator doesn't combine many evaluators, the get type should only return
-   * itself type.
-   * @return Evaluator type.
-   */
-  virtual std::string getTypeImpl() const { return "base"; }
-
-protected:
-  EvaluatorConfig config_;
-  double numSamples_;
-  double totalScore_;
-};
-
-/**
- * @brief The NotGetableEvaluator class is the base class of evaluator that
- * cannot get value in runtime. The most NotGetableEvaluator is Printer
- * Evaluator, which is only used to debug network configuration.
- */
-class NotGetableEvaluator : public Evaluator {
-  // Evaluator interface
-public:
-  void getNames(std::vector<std::string>* names) {}
-
-  real getValue(const std::string& name, Error* err) const {
-    *err = Error("Not implemented");
-    return .0f;
-  }
-
-  std::string getType(const std::string& name, Error* err) const {
-    *err = Error("Not implemented");
-    return "";
-  }
-};
-
-class DummyEvaluator : public Evaluator {
-public:
-  DummyEvaluator() {}
-  virtual void init(const EvaluatorConfig&) {}
-  virtual void start() {}
-  virtual void eval(const NeuralNetwork&) {}
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    (void)arguments;
-    return -1;
-  }
-  virtual void finish() {}
-  virtual void printStats(std::ostream&) const {}
-
-  // Evaluator interface
-protected:
-  std::string getTypeImpl() const;
-};
-/**
- * @brief evaluate AUC using colIdx-th column as prediction.
- * The AUC(Area Under the Curve) is a common evaluation metric
- * for binary classification problems. It computes the area under
- * the receiver operating characteristic(ROC) curve.
- *
- * @note colIdx-th column
- *
- * - colIdx = 0: the 0-th column.
- * - colIdx > 0: the colIdx-th column.
- * - colIdx < 0: the last colIdx-th column.
- *
- * The config file api is auc_evaluator.
- *
- */
-class AucEvaluator : public Evaluator {
-public:
-  AucEvaluator(int32_t colIdx)
-      : colIdx_(colIdx),
-        realColumnIdx_(0),
-        cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  virtual void start();
-
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  virtual void printStats(std::ostream& os) const {
-    os << config_.name() << "=" << calcAuc();
-  }
-
-  virtual void distributeEval(ParameterClient2* client);
-
-private:
-  static const uint32_t kBinNum_ = (1 << 24) - 1;
-  static const int kNegativeLabel_ = 0;
-  double statPos_[kBinNum_ + 1];
-  double statNeg_[kBinNum_ + 1];
-  int32_t colIdx_;
-  uint32_t realColumnIdx_;
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  AucEvaluator() {}
-
-  inline static double trapezoidArea(double X1,
-                                     double X2,
-                                     double Y1,
-                                     double Y2) {
-    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
-  }
-
-  double calcAuc() const;
-
-  // Evaluator interface
-protected:
-  real getValueImpl() const;
-  std::string getTypeImpl() const;
-};
-
-/**
- * @brief RankAucEvaluator calculates the AUC of each list (i.e., titles
- * under the same query), and averages them. Each list should be organized
- * as a sequence. The inputs of this evaluator is [output, click, pv]. If pv
- * is not provided, it will be set to 1. The types of click and pv are
- * dense value.
- */
-class RankAucEvaluator : public Evaluator {
-public:
-  // evaluate ranking AUC
-  virtual void start();
-
-  virtual void updateSamplesNum(const std::vector<Argument>& arguments);
-
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  virtual void distributeEval(ParameterClient2* client) {
-    mergeResultsOfAllClients(client);
-  }
-
-private:
-  MatrixPtr output_;
-  MatrixPtr click_;
-  MatrixPtr pv_;
-  std::vector<std::pair<real, int>> outputPair_;
-
-  double calcRankAuc(real* outputData,
-                     real* clickData,
-                     real* pvData,
-                     size_t size);
-
-  // Evaluator interface
-protected:
-  std::string getTypeImpl() const;
-};
-
-/**
- * @brief precision, recall and f1 score Evaluator
- * \f[
- * precision = \frac{tp}{tp+tn} \\
- * recall=\frac{tp}{tp+fn} \\
- * f1=2*\frac{precsion*recall}{precision+recall}
- * \f]
- *
- * The config file api is precision_recall_evaluator.
- */
-class PrecisionRecallEvaluator : public Evaluator {
-public:
-  // Evaluate precision, recall and F1 score
-  PrecisionRecallEvaluator()
-      : isMultiBinaryLabel_(false),
-        cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  virtual void start();
-
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  virtual void printStats(std::ostream& os) const;
-
-  virtual void distributeEval(ParameterClient2* client);
-
-  void getNames(std::vector<std::string>* names);
-
-  real getValue(const std::string& name, Error* err) const;
-
-  std::string getType(const std::string& name, Error* err) const;
-
-  struct StatsInfo {
-    /// numbers of true positives
-    double TP;
-    /// numbers of true negatives
-    double TN;
-    /// numbers of false positives
-    double FP;
-    /// numbers of false negatives
-    double FN;
-
-    StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
-  };
-
-private:
-  bool isMultiBinaryLabel_;
-  std::vector<StatsInfo> statsInfo_;
-
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-
-  struct PrintStatsInfo {
-    double precision;
-    double recall;
-    double f1;
-    double macroAvgPrecision;
-    double macroAvgRecall;
-    double macroAvgF1Score;
-    double microAvgPrecision;
-    double microAvgRecall;
-    double microAvgF1Score;
-  };
-
-  bool getStatsInfo(PrintStatsInfo* info) const;
-
-  void calcStatsInfo(const MatrixPtr& output,
-                     const IVectorPtr& label,
-                     const MatrixPtr& weight);
-
-  void calcStatsInfoMulti(const MatrixPtr& output,
-                          const MatrixPtr& label,
-                          const MatrixPtr& weight);
-
-  inline static double calcPrecision(double TP, double FP) {
-    if (TP > 0.0 || FP > 0.0) {
-      return TP / (TP + FP);
-    } else {
-      return 1.0;
-    }
-  }
-
-  inline static double calcRecall(double TP, double FN) {
-    if (TP > 0.0 || FN > 0.0) {
-      return TP / (TP + FN);
-    } else {
-      return 1.0;
-    }
-  }
-
-  inline static double calcF1Score(double precision, double recall) {
-    if (precision > 0.0 || recall > 0.0) {
-      return 2 * precision * recall / (precision + recall);
-    } else {
-      return 0;
-    }
-  }
-
-  mutable std::unordered_map<std::string, real> values_;
-
-  void storeLocalValues() const;
-};
-
-/*
- * @brief positive-negative pair rate Evaluator
- *
- * The config file api is pnpair_evaluator.
- */
-class PnpairEvaluator : public Evaluator {
-public:
-  PnpairEvaluator()
-      : cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuInfo_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  virtual void start();
-  virtual real evalImp(std::vector<Argument>& arguments);
-
-  struct PredictionResult {
-    PredictionResult(real __out, int __label, int __queryid, real __weight)
-        : out(__out), label(__label), queryid(__queryid), weight(__weight) {}
-    real out;
-    int label;
-    int queryid;
-    real weight;
-  };
-  std::vector<PredictionResult> predictArray_;
-  void printPredictResults() {
-    std::ofstream fs(FLAGS_predict_file);
-    CHECK(fs) << "Fail to open " << FLAGS_predict_file;
-    for (auto& res : predictArray_) {
-      fs << res.out << " " << res.label << " " << res.queryid << std::endl;
-    }
-  }
-
-  void stat(size_t start,
-            size_t end,
-            PredictionResult* answers,
-            double& pos,
-            double& neg,
-            double& spe);
-  void calc(std::vector<PredictionResult>& predictArray);
-
-  virtual void finish() { calc(predictArray_); }
-
-  virtual void printStats(std::ostream& os) const {
-    os << " pos/neg=" << this->getValueImpl();
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    client->reduce(pairArray_, pairArray_, kPairArrayNum_, FLAGS_trainer_id, 0);
-    LOG(INFO) << " distribute eval calc total pos pair: " << pairArray_[0]
-              << " calc total neg pair: " << pairArray_[1];
-  }
-
-private:
-  static const uint32_t kPairArrayNum_ = 2;
-  double pairArray_[kPairArrayNum_];
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  IVectorPtr cpuInfo_;
-  MatrixPtr cpuWeight_;
-
-  // Evaluator interface
-protected:
-  real getValueImpl() const {
-    return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
-  }
-  std::string getTypeImpl() const;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp
deleted file mode 100644
index 654024e8a47c1e538f25823da78dce6a7a093975..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GradientMachine.h"
-
-#include <fstream>
-#include "paddle/utils/Logging.h"
-
-#include "NeuralNetwork.h"
-#include "hl_gpu.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "GradientMachineMode.h"
-#include "MultiGradientMachine.h"
-#include "MultiNetwork.h"
-#include "ParallelNeuralNetwork.h"
-#endif
-
-namespace paddle {
-
-GradientMachine* GradientMachine::create(
-    const ModelConfig& config,
-    int mode,
-    const std::vector<ParameterType>& parameterTypes) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) {
-    return gm;
-  }
-  if (FLAGS_trainer_count > 1) {
-    return new MultiGradientMachine(config, FLAGS_use_gpu);
-  }
-#endif
-  if (FLAGS_trainer_count == 1) {  // single
-#ifndef PADDLE_MOBILE_INFERENCE
-    NeuralNetwork* nn;
-    if (config.type() == "multi_nn") {
-      /* multi submodel calculate, thread(s) will be initialized inside */
-      nn = new MultiNetwork("root");
-    } else if (FLAGS_parallel_nn) {
-      /* multi threads calculate */
-      nn = new ParallelNeuralNetwork();
-    } else {
-      /* single thread calculate */
-      nn = NeuralNetwork::create(config);
-    }
-#else
-    NeuralNetwork* nn = NeuralNetwork::create(config);
-#endif
-    ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) {
-      para->enableType(PARAMETER_VALUE);
-    };
-    nn->init(
-        config, mode == kTesting ? testParamInitCb : nullptr, parameterTypes);
-    return nn;
-  }
-  LOG(FATAL) << "Unknown model type: " << config.type();
-  return nullptr;
-}
-
-void GradientMachine::saveParameters(const std::string& dir) const {
-  LOG(INFO) << "Saving parameters to " << dir;
-
-  for (auto& para : parameters_) {
-    std::string filename = dir + "/" + para->getName();
-    if (para->isFullSize()) {
-      para->save(filename);
-    }
-  }
-}
-
-void GradientMachine::loadParameters(const std::string& dir) {
-  LOG(INFO) << "Loading parameters from " << dir;
-
-  for (auto& para : parameters_) {
-    std::string filename = dir + "/" + para->getName();
-    if (para->isFullSize()) {
-      para->load(filename);
-    }
-  }
-}
-
-void GradientMachine::randParameters() {
-  LOG(INFO) << "Initing parameters..";
-
-  for (auto& para : parameters_) {
-    if (para->isFullSize()) {
-      para->randomize();
-    }
-  }
-  LOG(INFO) << "Init parameters done.";
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
deleted file mode 100644
index 60936c311d1b0119186c76d5c95b8819294446ce..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ /dev/null
@@ -1,250 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <iostream>
-#include <vector>
-
-#include "ModelConfig.pb.h"
-#include "TrainerConfig.pb.h"
-#include "paddle/gserver/dataproviders/DataProvider.h"
-#include "paddle/gserver/layers/Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/ParameterUpdaterBase.h"
-#include "paddle/utils/Thread.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "paddle/gserver/evaluators/Evaluator.h"
-#endif
-
-namespace paddle {
-/**
- * @brief A gradient machine is capable of calculating some outputs given
- *        some inputs and performing gradient calculation based on the
- *        derivative from the outputs.
- *
- * A gradient machine can be either a full neural network or part of a neural
- * network.
- *
- * Usage for training:
- *
- *  1. Prepare inArgs. Put your input data into inArgs[i].value.
- *
- *  2. Call forward(inArgs, &outArgs)
- *
- *  3. Calculate gradient with respect to outArgs[i]->value
- *     and fill them into outArgs[i]->grad.
- *     This step can be skipped if your the outputs are from cost layers.
- *
- *  4. Call backward(). After backward, gradient of each parameter is
- *     accumulated to getParameters()[i]->getBuf(PARAMETER_GRADIENT)
- *
- *  5. Update parameter value getParameters()[i]->getBuf(PARAMETER_VALUE) using
- *     gradients.
- *
- *  6. Clear gradients to zero.
- *
- * Usage for prediction:
- *
- *  1. Prepare inArgs. Put your input data into inArgs[i].value.
- *
- *  2. Call forward(inArgs, &outArgs)
- *
- *  3. Obtain the prediction result from outArgs[i]
- */
-
-typedef std::vector<LayerStatePtr> MachineState;
-
-class GradientMachine;
-
-typedef std::shared_ptr<GradientMachine> GradientMachinePtr;
-
-class GradientMachine {
-public:
-  enum CreateMode {
-    kNormal = 0,
-    kSgdSparseCpuTraining = 3,
-    kTesting = 4,
-    kCustom = 10
-  };
-
-  /**
-   * Create a gradient machine from ModelConfig
-   * Parameter will have parameterTypes
-   */
-  static GradientMachine* create(
-      const ModelConfig& config,
-      int mode = kNormal,
-      const std::vector<ParameterType>& parameterTypes =
-          std::vector<ParameterType>{
-              PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM});
-
-  virtual ~GradientMachine() {}
-
-  /**
-   * Prefetch row ids of sparse parameter.
-   */
-  virtual void prefetch(const std::vector<Argument>& inArgs) { (void)inArgs; }
-
-  /**
-   * @brief Forward propagation.
-   *
-   * Calculate outputs (outArgs) based the inputs (inArgs)
-   *
-   * @note: if passType==PASS_TEST, then backward() should not be called
-   */
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType) = 0;
-
-  /**
-   * @brief Backward propagation.
-   *
-   * Calculate the gradient of inArgs and parameter.
-   *
-   * This function should only be called after a corresponding forward() call.
-   * The caller is responsible for filling the correct grad for the outArgs
-   * obtained using forward().
-   *
-   * It may also change the grad field for the inArgs supplied at forward()
-   */
-  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
-
-  /**
-   * Combine forward() and backward(). For multithread training, this
-   * may be faster.
-   *
-   * @note: passType PASS_TEST is not allowed for forwardBackward().
-   */
-  virtual void forwardBackward(const std::vector<Argument>& inArgs,
-                               std::vector<Argument>* outArgs,
-                               PassType passType,
-                               const UpdateCallback& callback = nullptr) {
-    forward(inArgs, outArgs, passType);
-    backward(callback);
-  }
-
-  virtual Argument getLayerOutput(const std::string& layerName) = 0;
-
-  // see comment in Layer.h for the function with the same name
-  virtual void resetState() {}
-
-  // set machine state
-  virtual void setState(const MachineState& machineState) {}
-
-  // save machine state
-  virtual void getState(MachineState& machineState) {}
-
-  virtual void onPassEnd() = 0;
-
-#ifndef PADDLE_MOBILE_INFERENCE
-  /**
-   * Create an evaluator which can be used for eval()
-   */
-  virtual Evaluator* makeEvaluator() const = 0;
-
-  /**
-   * evaluate using the given evaluator
-   */
-  virtual void eval(Evaluator* evaluator) const = 0;
-#endif
-
-  std::vector<ParameterPtr>& getParameters() { return parameters_; }
-
-  std::vector<ParameterPtr>& getNonStaticParameters() {
-    if (nonStaticParameters_.empty()) {
-      for (auto para : parameters_) {
-        if (!para->isStatic()) {
-          nonStaticParameters_.push_back(para);
-        }
-      }
-    }
-    return nonStaticParameters_;
-  }
-
-  inline bool hasStaticParameters() {
-    return parameters_.size() != getNonStaticParameters().size();
-  }
-
-  /**
-   * @brief   Used before formal training, start work-threads and set
-   *          trainer Parameters;
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual void start() {}
-
-  /**
-   * @brief   check  each work-thread whether is failed/error/finish,
-   *          if not, return ture, and yes return false.
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual void finish() {}
-
-  /**
-   * @brief   set the training status a "finished" value, the sub_work_threads
-   *          will option the change, and then exit.
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual bool trainIsOn() { return true; }
-
-  /**
-   * @brief   when all or some of the sub-workThreads are suspended to waiting
-   *          controller's instructions, and after some processing done in the
-   *          controller, it will call this function to wake up all the pending
-   *          thread.
-   *
-   * @note    This function will only been implemented and used in a
-   *          multithreaded environment.
-   */
-  virtual void restart() {}
-
-  /// Set the gradient of the output from outside.
-  virtual void setOutputGrad(const std::vector<Argument>& args) {
-    LOG(FATAL) << "Not implemented!";
-  }
-
-  void saveParameters(const std::string& dir) const;
-
-  void loadParameters(const std::string& dir);
-
-  void randParameters();
-
-  virtual void getStats(real& cost, int64_t& numProcessed) {
-    (void)cost;
-    (void)numProcessed;
-  }
-
-  /**
-   * @brief   Release the middle layer's output memory.
-   *
-   * @note    This function is used for memory optimization in inference.
-   */
-  virtual void releaseOutput() {}
-
-protected:
-  virtual void onLoadParameter() {}
-
-  std::vector<ParameterPtr> parameters_;
-  std::vector<ParameterPtr> nonStaticParameters_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.h b/paddle/gserver/gradientmachines/GradientMachineMode.h
deleted file mode 100644
index 898b68fbbc329145109ad0ae4b97c872d4f9a37c..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/GradientMachineMode.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "GradientMachine.h"
-#include "unordered_map"
-
-namespace paddle {
-
-class IGradientMachineMode {
-public:
-  virtual ~IGradientMachineMode() {}
-
-public:  // interfaces
-         /**
-          * @brief create current mode's gradient machine by model config.
-          * @param config model config
-          */
-  virtual GradientMachine* create(const ModelConfig& config) = 0;
-
-  /**
-   * @brief shouldBeMe the current mode of GradientMachine should be this mode.
-   * @param algo training algorithm name.
-   * @param trainerCount trainer count.
-   * @param isLocal is local mode (without pserver)
-   * @param isGpu is using gpu.
-   * @return true if mode should be this mode.
-   */
-  virtual bool shouldBeMe(const std::string& algo,
-                          size_t trainerCount,
-                          bool isLocal,
-                          bool isGpu) const = 0;
-
-  /**
-   * @brief Is data must be in cpu even if using gpu mode.
-   * @param trainerCount trainer count
-   * @return true if data must be gpu.
-   */
-  virtual bool isDataMustInCpu(size_t trainerCount) const = 0;
-
-  /**
-   * @brief Need not to use mini-batch method, and should train all data in one
-   * batch in one pass.
-   */
-  virtual bool needTrainWholeDataInOneBatch() const = 0;
-
-public:  // static methods.
-         /**
-          * @brief register a custom gradient machine mode.
-          * @note For user to register a custom gradient machine mode, id should >=
-          * kCustom.
-          * @param mode mode id.
-          * @param ptr mode description object.
-          */
-  static void regGradientMachineMode(
-      int32_t mode, std::unique_ptr<IGradientMachineMode>&& ptr) {
-    modes_.insert(std::make_pair(mode, std::move(ptr)));
-  }
-
-  /**
-   * @brief get custom mode from mode id.
-   * @param mode mode id
-   * @return mode description object.
-   */
-  static IGradientMachineMode* mode(int32_t mode) {
-    if (modes_.find(mode) != modes_.end()) {
-      return modes_[mode].get();
-    } else {
-      return nullptr;
-    }
-  }
-
-  /**
-   * @brief helper function to test trainWholeDataInOneBatch or not for mode
-   */
-  static bool trainWholeDataInOneBatch(int32_t mode) {
-    if (modes_.find(mode) != modes_.end()) {
-      return modes_[mode]->needTrainWholeDataInOneBatch();
-    } else {
-      return false;
-    }
-  }
-
-  /**
-   * @brief Try to get custom mode if we can.
-   * @param [out] mode the custom mode id.
-   * @param [in] algo algorithm name
-   * @param [in] trainerCount trainer count.
-   * @param [in] isLocal is local or not
-   * @param [in] isGpu using gpu or not.
-   * @return true if there is a custom mode fit these conditions.
-   */
-  static bool tryGetMode(int* mode,
-                         const std::string& algo,
-                         int32_t trainerCount,
-                         bool isLocal,
-                         bool isGpu) {
-    for (auto it = modes_.begin(); it != modes_.end(); ++it) {
-      if (it->second->shouldBeMe(algo, trainerCount, isLocal, isGpu)) {
-        *mode = it->first;
-        return true;
-      }
-    }
-    return false;
-  }
-
-  /**
-   * @brief helper function for data must in cpu
-   */
-  static bool dataMustInCpu(int32_t mode, size_t trainerCount) {
-    if (modes_.find(mode) != modes_.end()) {
-      return modes_[mode]->isDataMustInCpu(trainerCount);
-    } else {
-      // provide data to cpu if using synchronized multi-gpu gradient machine.
-      return trainerCount > 1;
-    }
-  }
-
-  /**
-   * @brief try to create gradient machine by mode & config.
-   * @return nullptr if we cannot create a gradient machine by such mode.
-   */
-  static GradientMachine* tryCreateGradientMachine(int32_t mode,
-                                                   const ModelConfig& config) {
-    auto m = IGradientMachineMode::mode(mode);
-    if (m) {
-      return m->create(config);
-    } else {
-      return nullptr;
-    }
-  }
-
-private:
-  static std::unordered_map<int32_t, std::unique_ptr<IGradientMachineMode>>
-      modes_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
deleted file mode 100644
index 3f46cc98cdef17d14c253c732814bcba005fd667..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ /dev/null
@@ -1,894 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MultiGradientMachine.h"
-
-#include "paddle/utils/Logging.h"
-
-#include "paddle/utils/Stat.h"
-
-#include "NeuralNetwork.h"
-#include "ParallelNeuralNetwork.h"
-
-DEFINE_bool(allow_only_one_model_on_one_gpu,
-            true,
-            "If true, do not allow multiple models on one GPU device");
-
-namespace paddle {
-
-// get types of the parameters which need to be merged after backward()
-static void fillMergeTypes(PassType passType,
-                           std::vector<ParameterType>* mergeTypes) {
-  mergeTypes->clear();
-  if (passType != PASS_TEST) {
-    mergeTypes->push_back(PARAMETER_GRADIENT);
-  }
-}
-
-MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
-                                           bool useGpu)
-    : useGpu_(useGpu),
-      trainerBarrier_(FLAGS_trainer_count),
-      allBarrier_(FLAGS_trainer_count + 1),
-      inArgsCopied_(false) {
-  isPassGrad_ = false;
-  numThreads_ = FLAGS_trainer_count;
-  if (useGpu) {
-    //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu,
-    //! the hl_get_device_count will get an error result. It seems should return
-    //! 0 when hppl is not compiled as gpu version.
-    numDevices_ = hl_get_device_count();
-  } else {
-    numDevices_ = 0;
-  }
-  ParamInitCallback mainParamInitCb = [this](int paramId, Parameter* para) {
-    // only create buf for CPU parameters
-    // GPU parameters will be created in each thread
-    if (para->useGpu()) return;
-
-    if (para->isSparseRemoteUpdate()) {
-      para->enableType(PARAMETER_VALUE,
-                       FLAGS_loadsave_parameters_in_pserver
-                           ? Parameter::MAT_SPARSE_ROW_PREFETCH
-                           : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE);
-      para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW);
-    } else if (para->isGradSparseUpdate()) {
-      para->enableType(PARAMETER_VALUE);
-      para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_IDS);
-      SparseRowIdsCpuMatrix* mat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get());
-      mat->setNumOfThreads(FLAGS_trainer_count);
-    } else if (para->isValueShared()) {
-      para->enableType(PARAMETER_VALUE, Parameter::MAT_VALUE_SHARED);
-      if (!para->isStatic()) {
-        para->enableType(PARAMETER_GRADIENT);
-      }
-    } else {
-      para->enableType(PARAMETER_VALUE);
-      if (!para->isStatic()) {
-        para->enableType(PARAMETER_GRADIENT);
-      }
-    }
-  };
-
-  NeuralNetwork* nn = NeuralNetwork::create(config);
-  nn->init(config, mainParamInitCb);
-  gradientMachine_.reset(nn);
-  parameters_ = gradientMachine_->getParameters();
-
-  numLogicalDevices_ = 0;
-  if (useGpu_) {
-    numLogicalDevices_ = 1;
-
-    for (size_t pid = 0; pid < parameters_.size(); pid++) {
-      if (parameters_[pid]->getConfig().device() + 1 > numLogicalDevices_) {
-        numLogicalDevices_ = parameters_[pid]->getConfig().device() + 1;
-      }
-    }
-    LOG(INFO) << "numLogicalDevices=" << numLogicalDevices_
-              << " numThreads=" << numThreads_ << " numDevices=" << numDevices_;
-
-    if (numLogicalDevices_ * numThreads_ > numDevices_ &&
-        FLAGS_allow_only_one_model_on_one_gpu) {
-      LOG(FATAL) << "trainer_count * num_devices_in_model "
-                 << "(" << numThreads_ << "*" << numLogicalDevices_ << ")"
-                 << "=" << numThreads_ * numLogicalDevices_
-                 << " exceeds number of GPU devices(" << numDevices_ << ")";
-    }
-    numLogicalDevices_ = std::min(numLogicalDevices_, numDevices_);
-
-    /* Enables direct access to memory allocations on a peer device */
-    for (int i = 0; i < numThreads_; i++) {
-      for (int d = 0; d < numLogicalDevices_; ++d) {
-        enablePeerAccess(logicalDeviceId2RealDeviceId(d, i),
-                         logicalDeviceId2RealDeviceId(d, i + 1));
-        enablePeerAccess(logicalDeviceId2RealDeviceId(d, i),
-                         logicalDeviceId2RealDeviceId(d, i - 1));
-      }
-    }
-  }
-
-  for (int i = 0; i < numThreads_; ++i) {
-    threads_.emplace_back(new TrainerThread(config, i, this));
-  }
-
-  bufferSizes_.resize(numLogicalDevices_, 0);
-  paraMainThread_.reserve(parameters_.size());
-  int pid = 0;
-  for (auto& para : parameters_) {
-    if (para->isStatic() || !para->useGpu()) {
-      paraMainThread_.push_back(0);
-    } else {
-      int end = pid++ % numThreads_;
-      paraMainThread_.push_back(end);
-      int paraDeviceId = para->getDeviceId();
-      if (paraDeviceId == -1) paraDeviceId = 0;
-      paraDeviceId = paraDeviceId % numLogicalDevices_;
-      if (para->getSize() > bufferSizes_[paraDeviceId]) {
-        bufferSizes_[paraDeviceId] = para->getSize();
-        VLOG(1) << "bufferSize[" << paraDeviceId << "]" << para->getSize();
-      }
-    }
-  }
-
-  // TODO(xuwei06) Instead of using maximal buffer size, we may use a smaller
-  // fixed buffer size and use pipeline to dispatch parameter value and merge
-  // parameter gradient, which may be faster.
-
-  // combination of all trainers mainPara into GradientMachine parameters
-  hasNonstaticCpuParamters_ = false;
-  for (size_t pid = 0; pid < parameters_.size(); pid++) {
-    if (parameters_[pid]->useGpu()) {
-      parameters_[pid] = threads_[paraMainThread_[pid]]->getParameters()[pid];
-    } else if (!parameters_[pid]->isStatic()) {
-      hasNonstaticCpuParamters_ = true;
-    }
-  }
-
-  gradBufs_.resize(numThreads_);
-  for (int i = 0; i < numThreads_; ++i) {
-    gradBufs_[i].resize(numLogicalDevices_);
-    for (int d = 0; d < numLogicalDevices_; ++d) {
-      gradBufs_[i][d].sem.post();
-    }
-  }
-
-  outArgStream_ = HPPL_STREAM_1;
-
-  start();
-}
-
-void MultiGradientMachine::start() {
-  for (auto& thread : threads_) {
-    thread->start();
-  }
-}
-
-void MultiGradientMachine::finish() {
-  for (auto& thread : threads_) {
-    thread->stop();
-  }
-}
-
-std::vector<const std::vector<ParameterPtr>*>
-MultiGradientMachine::getSlaveParameters() {
-  std::vector<const std::vector<ParameterPtr>*> vec;
-  vec.reserve(threads_.size());
-  for (auto& thread : threads_) {
-    vec.push_back(&thread->getParameters());
-  }
-  return vec;
-}
-
-void MultiGradientMachine::notifyGradientTransfer(int paramId) {
-  gradQueue_.enqueue(paramId);
-}
-
-void MultiGradientMachine::allocGradBufs() {
-  if (numLogicalDevices_ == 0) return;
-  if (gradBufs_[0][0].bufs.size() >= mergeTypes_.size()) return;
-
-  for (int i = 0; i < numThreads_; i++) {
-    for (int d = 0; d < numLogicalDevices_; ++d) {
-      if (bufferSizes_[d] == 0) continue;
-      SetDevice device(logicalDeviceId2RealDeviceId(d, i));
-      for (size_t j = 0; j < mergeTypes_.size(); j++) {
-        gradBufs_[i][d].bufs.push_back(
-            Vector::create(bufferSizes_[d], /* useGpu= */ true));
-      }
-    }
-  }
-}
-
-void MultiGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
-  // Each gradient machine in threads needs to do prefetch on its own
-  // part of inArgs. So we need to first divide inArgs to each thread
-  inArgs_ = inArgs;
-  startTask(TASK_COPY_IN_ARGS);
-
-  for (auto& para : parameters_) {
-    if (para->isSparseRemoteUpdate()) {
-      auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-          para->getMat(PARAMETER_VALUE).get());
-      mat->clearIndices();
-    }
-  }
-
-  waitForCopyInArgs();
-
-  // Because SparsePrefetchRowCpuMatrix can only be changed by ONE thread
-  // at one time, we need to do prefetch sequentially
-  for (auto& thread : threads_) {
-    thread->prefetch();
-  }
-
-  for (auto& para : parameters_) {
-    if (para->isSparseRemoteUpdate()) {
-      auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-          para->getMat(PARAMETER_VALUE).get());
-      mat->setupIndices();
-      auto matGrad = dynamic_cast<SparseRowCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get());
-      matGrad->reserveStore();
-    }
-  }
-}
-
-void MultiGradientMachine::forward(const std::vector<Argument>& inArgs,
-                                   std::vector<Argument>* outArgs,
-                                   PassType passType) {
-  forwardImp(inArgs, outArgs, passType, TASK_FORWARD);
-}
-
-void MultiGradientMachine::forwardImp(const std::vector<Argument>& inArgs,
-                                      std::vector<Argument>* outArgs,
-                                      PassType passType,
-                                      TaskType taskType) {
-  updateThreadParameters();
-  passType_ = passType;
-
-  if (!inArgsCopied_) {
-    inArgs_ = inArgs;
-    inArgsCopied_ = false;
-  }
-
-  fillMergeTypes(passType, &mergeTypes_);
-  allocGradBufs();
-  startTask(taskType);
-
-  getOutArgs(outArgs, passType);
-}
-
-void MultiGradientMachine::backward(const UpdateCallback& callback) {
-  backwardCallback_ = callback;
-  startTask(TASK_BACKWARD);
-  backwardImp(callback);
-}
-
-void MultiGradientMachine::forwardBackward(const std::vector<Argument>& inArgs,
-                                           std::vector<Argument>* outArgs,
-                                           PassType passType,
-                                           const UpdateCallback& callback) {
-  backwardCallback_ = callback;
-  forwardImp(inArgs, outArgs, passType, TASK_FORWARD_BACKWARD);
-  backwardImp(callback);
-}
-
-Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) {
-  std::vector<Argument> args;
-  args.reserve(threads_.size());
-
-  for (auto& thread : threads_) {
-    args.push_back(thread->getGradientMachine()->getLayerOutput(layerName));
-  }
-  outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_);
-
-  return outLayerArgs_;
-}
-
-void MultiGradientMachine::backwardImp(const UpdateCallback& callback) {
-  for (size_t i = 0; i < parameters_.size(); i++) {
-    if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue;
-    REGISTER_TIMER("controller_dequeue");
-    gradQueue_.dequeue();
-  }
-  if (hasNonstaticCpuParamters()) {
-    waitAfterMerge();
-    if (backwardCallback_) {
-      for (auto& para : parameters_) {
-        if (!para->useGpu() && !para->isStatic()) {
-          backwardCallback_(para.get());
-        }
-      }
-    }
-  }
-}
-
-void MultiGradientMachine::updateThreadParameters() {
-  for (size_t pid = 0; pid < parameters_.size(); ++pid) {
-    if (!parameters_[pid]->useGpu()) continue;
-    if (!parameters_[pid]->isValueUpdated()) continue;
-    parameters_[pid]->clearValueUpdated();
-    for (int i = 0; i < (int)threads_.size(); i++) {
-      threads_[i]->incUpdateCounter();
-    }
-    // NotifyValueReady should happen after that all threads' incUpdateCounter()
-    // are called so that the counters are correct when notifyValueReady()
-    // is called.
-    threads_[paraMainThread_[pid]]->notifyValueReady(pid);
-  }
-}
-
-void MultiGradientMachine::onPassEnd() {
-  for (auto& thread : threads_) {
-    thread->onPassEnd();
-  }
-}
-
-Evaluator* MultiGradientMachine::makeEvaluator() const {
-  return threads_[0]->getGradientMachine()->makeEvaluator();
-}
-
-void MultiGradientMachine::eval(Evaluator* evaluator) const {
-  for (auto& thread : threads_) {
-    SetDevice device(thread->getDeviceId());
-    if (thread->hasInputData()) {
-      thread->getGradientMachine()->eval(evaluator);
-    }
-  }
-}
-
-void MultiGradientMachine::getOutArgs(std::vector<Argument>* outArgs,
-                                      PassType passType) {
-  for (auto& thread : threads_) {
-    REGISTER_TIMER("waitOutArgs");
-    thread->waitOutArgsReady();
-  }
-
-  outArgs_.resize(threads_[threads_.size() - 1]->getOutArgs().size());
-
-  REGISTER_TIMER("copyOutArgs");
-  for (size_t i = 0; i < outArgs_.size(); ++i) {
-    std::vector<Argument> args;
-    args.reserve(threads_.size());
-    for (auto& thread : threads_) {
-      // If the thread input is empty, then the output is empty.
-      auto tmp = thread->getOutArgs();
-      if (tmp.size() > 0) {
-        args.push_back(tmp[i]);
-      }
-    }
-    outArgs_[i].concat(args, useGpu_, outArgStream_, passType);
-  }
-
-  if (useGpu_) {
-    hl_stream_synchronize(outArgStream_);
-  }
-
-  *outArgs = outArgs_;
-}
-
-void MultiGradientMachine::setOutputGrad(const std::vector<Argument>& args) {
-  CHECK_EQ(args.size(), outArgs_.size());
-  for (size_t i = 0; i < args.size(); i++) {
-    outArgs_[i].grad = args[i].grad;
-  }
-}
-
-void MultiGradientMachine::startTask(TaskType taskType) {
-  taskType_ = taskType;
-  for (auto& thread : threads_) {
-    thread->notifyTaskReady();
-  }
-}
-
-TrainerThread::TrainerThread(const ModelConfig& config,
-                             int threadId,
-                             MultiGradientMachine* multiMachine)
-    : multiMachine_(multiMachine),
-      config_(config),
-      threadId_(threadId),
-      inArgsCopied_(false) {
-  int numThreads = multiMachine->getNumThreads();
-
-  auto& mainParas = multiMachine->getParameters();
-
-  using std::placeholders::_1;
-  using std::placeholders::_2;
-
-  partnerId_ = mod(threadId_ - 1, numThreads);
-
-  deviceId_ = !multiMachine_->useGpu()
-                  ? -1
-                  : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_);
-  SetDevice gpuDevice(deviceId_);
-
-  NeuralNetwork* nn = nullptr;
-  if (!multiMachine->useGpu() || !FLAGS_parallel_nn) {
-    nn = NeuralNetwork::create(config);
-  } else {
-    nn = new ParallelNeuralNetwork();
-    for (auto& paraConfig : *config_.mutable_parameters()) {
-      if (paraConfig.device() != -1) {
-        paraConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId(
-            paraConfig.device(), threadId_));
-      }
-    }
-    for (auto& layerConfig : *config_.mutable_layers()) {
-      if (layerConfig.device() != -1) {
-        layerConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId(
-            layerConfig.device(), threadId_));
-      }
-    }
-  }
-  // Only GPU do not share parameter values with main paramters.
-  ParamInitCallback slaveParamInitCb =
-      std::bind(parameterInitNN, _1, _2, &mainParas);
-  nn->init(config_, slaveParamInitCb);
-  gradientMachine_.reset(nn);
-  parameters_ = gradientMachine_->getParameters();
-  if (!FLAGS_parallel_nn) {
-    for (auto& para : parameters_) {
-      para->setDevice(deviceId_);
-    }
-  }
-
-  backwardCallback_ =
-      std::bind(&TrainerThread::backwardCallback, this, std::placeholders::_1);
-
-  gradStream_ = HPPL_STREAM_2;
-  valueStream_ = HPPL_STREAM_3;
-  stopping_ = true;
-  updateCounter_ = 0;
-  parameterUpdated_ = false;
-}
-
-TrainerThread::~TrainerThread() { stop(); }
-
-void TrainerThread::start() {
-  if (!stopping_) return;
-
-  stopping_ = false;
-
-  gradientMachine_->start();
-
-  computeThread_.reset(new std::thread([this]() { computeThread(); }));
-
-  if (multiMachine_->useGpu()) {
-    gradCollectThread_.reset(
-        new std::thread([this]() { gradCollectThread(); }));
-
-    valueDispatchThread_.reset(
-        new std::thread([this]() { valueDispatchThread(); }));
-
-    copyThread_.reset(new std::thread([this]() { copyGradToBufferThread(); }));
-  }
-}
-
-void TrainerThread::stop() {
-  if (stopping_) return;
-
-  stopping_ = true;
-
-  if (computeThread_) {
-    taskReadySem_.post();
-    computeThread_->join();
-  }
-  if (gradCollectThread_) {
-    gradQueue_.enqueue(0);
-    gradCollectThread_->join();
-  }
-  if (copyThread_) {
-    gradBufQueue_.enqueue(0);
-    copyThread_->join();
-  }
-  if (valueDispatchThread_) {
-    valueReadyQueue_.enqueue(0);
-    valueDispatchThread_->join();
-  }
-}
-
-void TrainerThread::computeThread() {
-  VLOG(1) << "gradComputeThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-
-  while (true) {
-    {
-      REGISTER_TIMER("taskSem_wait");
-      taskReadySem_.wait();
-    }
-
-    if (stopping_) break;
-
-    switch (multiMachine_->getTaskType()) {
-      case MultiGradientMachine::TASK_FORWARD_BACKWARD:
-        forward();
-        backward();
-        break;
-      case MultiGradientMachine::TASK_FORWARD:
-        forward();
-        break;
-      case MultiGradientMachine::TASK_BACKWARD:
-        backward();
-        break;
-      case MultiGradientMachine::TASK_COPY_IN_ARGS:
-        batchSize_ = copyInArgs();
-        inArgsCopied_ = true;
-        multiMachine_->waitForCopyInArgs();
-        break;
-    }
-  }
-}
-
-void TrainerThread::prefetch() {
-  SetDevice setDevice(deviceId_);
-  gradientMachine_->prefetch(inArgs_);
-}
-
-void TrainerThread::forward() {
-  if (!inArgsCopied_) {
-    REGISTER_TIMER("copyInArgs");
-    batchSize_ = copyInArgs();
-  } else {
-    inArgsCopied_ = false;
-  }
-
-  if (multiMachine_->getPassType() != PASS_TEST) {
-    REGISTER_TIMER("clearGradient");
-    // For main parameter, the user of MultiGpuSyncMachine is responsible
-    // for setting the gradient to zero
-    for (size_t i = 0; i < parameters_.size(); i++) {
-      if (parameters_[i]->useGpu()) {
-        if (multiMachine_->paraMainThread(i) != threadId_) {
-          SetDevice device(parameters_[i]->getDeviceId());
-          parameters_[i]->clearGradient();
-        }
-      } else {
-        parameters_[i]->clearGradient();
-      }
-    }
-  }
-
-  {
-    REGISTER_TIMER("wait_value");
-    valueReadyCond_.wait([this]() { return !parameterUpdated_; });
-  }
-
-  { fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); }
-
-  {
-    REGISTER_TIMER("thread_forward");
-    if (batchSize_ > 0) {
-      gradientMachine_->forward(
-          inArgs_, &outArgs_, multiMachine_->getPassType());
-    } else {
-      outArgs_.clear();
-    }
-  }
-  outArgsReadySem_.post();
-}
-
-void TrainerThread::backward() {
-  REGISTER_TIMER("thread_backward");
-  if (multiMachine_->isPassGrad()) {
-    copyOutputGrad();
-  }
-  if (batchSize_ > 0) {
-    gradientMachine_->backward(backwardCallback_);
-  } else {
-    for (size_t i = parameters_.size(); i > 0; i--) {
-      backwardCallback(parameters_[i - 1].get());
-    }
-  }
-  if (multiMachine_->hasNonstaticCpuParamters()) {
-    mergeCpuGradients();
-  }
-}
-
-void TrainerThread::backwardCallback(Parameter* para) {
-  // CPU parameters are merged in the end
-  if (!para->useGpu() || para->isStatic()) return;
-
-  int paramId = para->getID();
-  if (multiMachine_->getNumThreads() == 1) {
-    // no need to do merge if there is only one thread
-    doCallback(paramId);
-  } else if (threadId_ == mod(multiMachine_->paraMainThread(paramId) - 1,
-                              multiMachine_->getNumThreads())) {
-    notifyCopyGradToBuffer(paramId);
-  } else {
-    notifyGradientCollect(paramId);
-  }
-}
-
-void TrainerThread::copyGradToBufferThread() {
-  VLOG(1) << "copyGradToBufferThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-  auto& partnerThread = multiMachine_->getThread(partnerId_);
-  auto& gradBufs = multiMachine_->getGradBuf(partnerId_);
-
-  while (true) {
-    int pid = gradBufQueue_.dequeue();
-    if (stopping_) break;
-
-    int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
-        parameters_[pid]->getDeviceId(), threadId_);
-
-    auto& gradBuf = gradBufs[pdeviceId];
-
-    {
-      REGISTER_TIMER("waitBufferReady");
-      gradBuf.sem.wait();
-    }
-
-    {
-      REGISTER_TIMER("copyGradToBuffer");
-      SetDevice setDevice(parameters_[pid]->getDeviceId());
-      for (size_t i = 0; i < mergeTypes_.size(); ++i) {
-        gradBuf.bufs[i]->resize(
-            parameters_[pid]->getBuf(mergeTypes_[i])->getSize());
-        gradBuf.bufs[i]->copyFrom(*parameters_[pid]->getBuf(mergeTypes_[i]),
-                                  gradStream_);
-      }
-      hl_stream_synchronize(gradStream_);
-    }
-    partnerThread->notifyGradientCollect(pid);
-  }
-}
-
-void TrainerThread::gradCollectThread() {
-  VLOG(1) << "gradCollectThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-
-  std::vector<size_t> gradReadyCount(parameters_.size(), 0);
-
-  auto& gradBufs = multiMachine_->getGradBuf(threadId_);
-
-  while (true) {
-    int pid = gradQueue_.dequeue();
-    if (stopping_) break;
-
-    if (++gradReadyCount[pid] < 2) continue;
-    gradReadyCount[pid] = 0;
-    int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
-        parameters_[pid]->getDeviceId(), threadId_);
-
-    auto& gradBuf = gradBufs[pdeviceId];
-
-    {
-      REGISTER_TIMER("mergeGrad");
-      for (size_t i = 0; i < mergeTypes_.size(); ++i) {
-        ParameterType type = mergeTypes_[i];
-        const VectorPtr& localGrad = parameters_[pid]->getBuf(type);
-        SetDevice setDevice(parameters_[pid]->getDeviceId());
-        localGrad->add(*gradBuf.bufs[i]);
-      }
-    }
-
-    gradBuf.sem.post();
-
-    if (multiMachine_->paraMainThread(pid) == threadId_) {
-      doCallback(pid);
-    } else {
-      notifyCopyGradToBuffer(pid);
-    }
-  }
-}
-
-void TrainerThread::doCallback(int pid) {
-  REGISTER_TIMER("callback");
-  auto& gpuThreads = multiMachine_->getAllThreads();
-  if (multiMachine_->getBackwardCallback()) {
-    // The callback supplied by the user of MultiGradientMachine may handle
-    // the parameter update using the gradient.
-    multiMachine_->getBackwardCallback()(parameters_[pid].get());
-    if (parameters_[pid]->isValueUpdated()) {
-      parameters_[pid]->clearValueUpdated();
-      for (auto& thread : gpuThreads) {
-        thread->incUpdateCounter();
-      }
-      notifyValueReady(pid);
-    }
-  }
-  multiMachine_->notifyGradientTransfer(pid);
-}
-
-void TrainerThread::valueDispatchThread() {
-  VLOG(1) << "valueDispatchThread " << threadId_;
-
-  if (deviceId_ >= 0) {
-    hl_init(deviceId_);
-  }
-
-  auto& thread = multiMachine_->getThread(partnerId_);
-
-  while (true) {
-    int pid;
-    {
-      REGISTER_TIMER("value_dequeue");
-      pid = valueReadyQueue_.dequeue();
-    }
-    if (stopping_) break;
-
-    if (multiMachine_->paraMainThread(pid) == partnerId_) continue;
-
-    {
-      REGISTER_TIMER("copyValue");
-      SetDevice setDevice(parameters_[pid]->getDeviceId());
-      thread->getValueBuf(pid)->copyFrom(*getValueBuf(pid), valueStream_);
-      hl_stream_synchronize(valueStream_);
-    }
-
-    thread->notifyValueReady(pid);
-  }
-}
-
-void TrainerThread::notifyValueReady(int paramId) {
-  if (--updateCounter_ == 0) {
-    valueReadyCond_.notify_all([this] { parameterUpdated_ = false; });
-  }
-
-  notifyValueDispatch(paramId);
-}
-
-int TrainerThread::copyInArgs() {
-  const std::vector<Argument>& fullInArgs = multiMachine_->getInArgs();
-  int numThreads = multiMachine_->getAllThreads().size();
-  int32_t numSequences = fullInArgs[0].getNumSequences();
-  int32_t startSeq = numSequences * threadId_ / numThreads;
-  int32_t endSeq = numSequences * (threadId_ + 1) / numThreads;
-  int32_t copySize = endSeq - startSeq;
-
-  /**
-   * For the first copy, need to allocate space here
-   */
-  if (inArgs_.size() == 0) {
-    inArgs_.resize(fullInArgs.size());
-  }
-
-  if (copySize == 0) {
-    return 0;
-  }
-
-  for (size_t i = 0; i < fullInArgs.size(); i++) {
-    inArgs_[i].resizeAndCopyFrom(
-        fullInArgs[i],
-        startSeq,
-        copySize,
-        FLAGS_parallel_nn ? false : multiMachine_->useGpu());
-  }
-  return copySize;
-}
-
-void TrainerThread::mergeCpuGradients() {
-  CHECK_EQ(mergeTypes_.size(), 1UL);
-  CHECK_EQ(mergeTypes_[0], PARAMETER_GRADIENT);
-
-  {
-    REGISTER_TIMER("waitbeforeMerge");
-    multiMachine_->waitBeforeMerge();
-  }
-  std::vector<const std::vector<ParameterPtr>*> slaveParameters =
-      multiMachine_->getSlaveParameters();
-
-  CHECK(slaveParameters.size());
-  for (auto& para : multiMachine_->getNonStaticParameters()) {
-    if (para->useGpu()) continue;
-    if (para->isSparseRemoteUpdate()) {
-      REGISTER_TIMER("mergeRemoteGradSparse");
-      mergeGradSparseRemote(para.get(), slaveParameters);
-    } else if (para->isGradSparseUpdate()) {
-      REGISTER_TIMER("mergeGradSparse");
-      mergeGradSparse(para.get(), slaveParameters);
-    } else {
-      REGISTER_TIMER("mergeGradDense");
-      mergeGradDense(para.get(), slaveParameters);
-    }
-  }
-  {
-    REGISTER_TIMER("waitbeforeMerge");
-    multiMachine_->waitAfterMerge();
-  }
-}
-
-void TrainerThread::mergeGradSparse(
-    Parameter* para,
-    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
-  size_t pid = para->getID();
-  SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-      para->getMat(PARAMETER_GRADIENT).get());
-  std::vector<uint32_t>& ids = mainMat->getIds(threadId_);
-
-  for (auto slaveParams : slaveParameters) {
-    SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
-        (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get());
-    mat->addTo(*mainMat, ids, threadId_, multiMachine_->getNumThreads());
-    // we use a sample hash method(%) instead of range partition,
-    // because range partition has balance issue sometimes,
-    // when feature ids are not generated from hashcode.
-  }
-  uniqueIds(ids);
-}
-
-void TrainerThread::mergeGradSparseRemote(
-    Parameter* para,
-    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
-  size_t pid = para->getID();
-  SparseRowCpuMatrix* mainMat =
-      dynamic_cast<SparseRowCpuMatrix*>(para->getMat(PARAMETER_GRADIENT).get());
-
-  mainMat->checkIndices();
-  mainMat->zeroMemThread(threadId_, multiMachine_->getNumThreads());
-
-  for (auto slaveParams : slaveParameters) {
-    SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
-        (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get());
-    mat->addTo(*mainMat, threadId_, multiMachine_->getNumThreads());
-  }
-}
-
-void TrainerThread::mergeGradDense(
-    Parameter* para,
-    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
-  size_t pid = para->getID();
-  auto interval = calcSplitArrayInterval(para->getSize(),
-                                         (size_t)threadId_,
-                                         multiMachine_->getNumThreads(),
-                                         8LU /*for avx*/);
-  size_t startSeq = interval.first;
-  size_t copySize = interval.second - interval.first;
-
-  // setup sub bufs
-  CpuVector destGrad(0, nullptr);
-  destGrad.subVecFrom(*para->getBuf(PARAMETER_GRADIENT), startSeq, copySize);
-
-  // merge
-  CpuVector slaveGradSub(0, nullptr);
-  for (auto slaveParams : slaveParameters) {
-    slaveGradSub.subVecFrom(
-        *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT), startSeq, copySize);
-    destGrad.add(slaveGradSub);
-  }
-}
-
-void TrainerThread::copyOutputGrad() {
-  const std::vector<Argument>& outputGradArgs = multiMachine_->outArgs_;
-  int numThreads = multiMachine_->getAllThreads().size();
-  int32_t numSequences = outputGradArgs[0].getNumSequences();
-  int32_t startSeq = numSequences * threadId_ / numThreads;
-  int32_t endSeq = numSequences * (threadId_ + 1) / numThreads;
-  int32_t copySize = endSeq - startSeq;
-  outArgs_.resize(outputGradArgs.size());
-  for (size_t i = 0; i < outputGradArgs.size(); i++) {
-    outArgs_[i].resizeAndCopyFrom(outputGradArgs[i],
-                                  startSeq,
-                                  copySize,
-                                  multiMachine_->useGpu(),
-                                  HPPL_STREAM_DEFAULT);
-  }
-  if (multiMachine_->useGpu()) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-  gradientMachine_->setOutputGrad(outArgs_);
-}
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
deleted file mode 100644
index 83d2651f34b3698848427f29b1a90e606e57950e..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ /dev/null
@@ -1,478 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-
-#include "GradientMachine.h"
-
-#include "hl_gpu.h"
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/Queue.h"
-
-namespace paddle {
-
-class TrainerThread;
-
-typedef Queue<int> PidQueue;
-typedef std::unique_ptr<TrainerThread> TrainerThreadPtr;
-
-struct GradBuffer {
-  /// GradBuffer is used for gathering gradient for GPU parameters
-  int paramId;
-
-  /// sem is used to notify that the local gradient merge of the current thread
-  /// finished for the current thread.
-  Semaphore sem;
-
-  // bufs[mergeIndex]
-  std::vector<VectorPtr> bufs;
-};
-
-/**
- *  A MultiGradientMachine is a synchronous GradientMachine which devides
- *  one data batch into several smaller batches and assign each one small batch
- *  to one computint thread for computation. After each thread finishes
- *  computation, it merges result (including output Argument and gradient during
- *  backward()). It basically is the same as single thread gradient machine,
- *  except that it uses multi-thread to do the computation.
- *
- *  It handles GPU and Cpu parameters differently.  In GPU, one computing thread
- *  generally corresponds to one GPU device. Thus, each thread keeps a separate
- *  copy of the parameter in its own device's memory. In CPU, we only need to
- keep
- *  one copy of the parameters in the main memory. After, each computing thread
- *  computes its own parameter gradient, the update process needs to accumulate
- *  the parameter gradients from all the computing threads, and update the
- *  accumulated parameter gradient to the corresponding parameter value.
- *
- *  Each GPU parameter is assigned to a thread called its main thread. For each
- *  parameter, the accumulation of its gradients and the update of its value
- *  happens in its main thread. The main thread first gather the parameter
- *  gradients from all the computing thread. Then, it performs parameter update.
- *  After a gradient is updated by the main thread, it is scattered to all the
- *  computing thread so that the parameters in all the computing threads are
- *  synchronized. The scatter and gather process are implemented by ring-style
- *  communication. Assume we have N computing threads, its thread ids will be
- *  0, 1, ..., N-1. For each parameter, the id of the main thread is specified
- in
- *  paraMainThread_[pid], where pid is the id of the parameter. Each thread i
- only
- *  sends data to its partner thread (i - 1) % N. For example, for a parameter
- *  gradient that is computed in thread 4, and its main thread is 2. Its
- *  traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the
- gradient
- *  buffer is added to the local gradient, and the local gradient is then copied
- *  to the gradient buffer of the next thread. At last, its main thread 2 will
- *  get the accumulated parameter gradient. For the same parameter, after its
- *  value is updated, the value's traveling process would be 2, 1, 0, N-1, ...
- 3.
- *  At the end, all the computing threads would have the updated parameter
- value.
- *
- *  A computing thread (TrainerThread) uses 4 threads to do different jobs:
- *
- *  1. computeThread(): performing forward(), backward(), prefetch().
- *
- *  2. valueDispatchThread(): copying parameter values to partner thread.
- *
- *  3. copyGradToBufferThread(): copying parameter gradient to partner thread.
- *
- *  4. gradCollectThread(): merging the gradient from step 3 with local gradient
- *     and call the callback supplied by the user to update parameter value.
- *
- *  CPU parameter value has only one copy. And their gradients are merged at the
- *  end of backward().
- *
- *  * Handling of sparse update
- *  Currently, sparse update is only supported for CPU parameters.
-
- *  Sparse updates refers to gradient caculation where the gradient is sparse.
- For
- *  example, if the input argument to a 'fc' layer is sparse, the gradient of
- the
- *  weight matrix of this layer will be sparse. It is usually more efficient to
- *  treat the gradient explicitly as sparse vector during the parameter update.
-
- *  There are two types of sparse updates called local sparse update and remote
- *  sparse update.
-
- *  For both types of sparse updates, there is one copy of parameter value and
- *  gradient called main parameter value and gradient, and there is a copy of
- *  parameter value and gradient for each computing thread called slave
- parameter
- *  value and gradient. The slave parameter values are always shared with the
- *  corresponding main parameter value. The slave parameter grad is a sparse row
- *  matrix. The sparse pattern for slave parameter grads are different, because
- *  the small batches for each computing thread might have different sparsity
- *  pattern.
-
- *  1. Local sparse update
- *
- *     Main parameter value type is MAT_NORMAL. It is a dense matrix.
- *
- *     Main parameter grad type is MAT_SPARSE_ROW_IDS (SparseRowIdsCpuMatrix)
- *     It is also a dense matrix, but the updated values are specified by IDS.
- *
- *     Slave parameter value shares with main parameter value.
- *
- *     Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW
- *     (SparseAutoGrowRowCpuMatrix). It is a sparse row matrix.
- *
- *     During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
- *     gather all the non-zero gradient. And After backward(), they will be
- merged
- *     into main parameter grad (SparseRowIdsCpuMatrix), with indices indicating
- *     which rows have nonzero gradient.
- *
- *  2. Remote sparse update
- *
- *     Main parameter value type is MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE)
- *     (SparsePrefetchRowCpuMatrix). MAT_SPARSE_ROW_PREFETCH is a sparse matrix.
- *     MAT_SPARSE_ROW_PREFETCH_FULL_SIZE is a dense matrix. However, only the
- *     parameter values that are prefetched is up-to-date.
- *
- *     Main parameter grad type is MAT_SPARSE_ROW (SparseRowCpuMatrix).
- *     And it shares sparse pattern with value by sharing indexDictHandle_,
- which
- *     is an internal data structure used by SparseRowCpuMatrixto specify the
- *     sparsity pattern of Slave parameter value shares with main parameter
- value.
- *
- *     Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW
- *     (SparsePrefetchRowCpuMatrix). It is a sparse row matrix
- *
- *     During prefetch(), all the layers will indicates which rows of each
- *     parameter are needed. Then the framework will retrieve those rows from
- *     parameter server.
- *
- *     During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
- *     gather all the non-zero gradient. And After backward(), they will be
- merged
- *     into main parameter grad (SparseRowCpuMatrix). And the framework will
- send
- *     the merged gradient to parameter server.
- */
-class MultiGradientMachine : public GradientMachine {
-public:
-  enum TaskType {
-    TASK_FORWARD_BACKWARD = 0,
-    TASK_FORWARD = 1,
-    TASK_BACKWARD = 2,
-    TASK_COPY_IN_ARGS = 3,
-  };
-
-  explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);
-
-  virtual void start();
-
-  virtual void finish();
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback);
-
-  virtual Argument getLayerOutput(const std::string& layerName);
-
-  virtual void onPassEnd();
-
-  virtual Evaluator* makeEvaluator() const;
-
-  virtual void eval(Evaluator* evaluator) const;
-
-  bool useGpu() const { return useGpu_; }
-
-  /// @return whether to pass the gradients in outArgs_ to each threads.
-  bool isPassGrad() { return isPassGrad_; }
-
-  /// @brief set whether to pass the gradient in outArgs_ to each threads.
-  void setPassGrad(bool isPass) { isPassGrad_ = isPass; }
-
-  /// Set the gradients of the outputs.
-  /// The gradietns will be copied to each thread in the computing threads.
-  virtual void setOutputGrad(const std::vector<Argument>& args);
-
-protected:
-  friend class TrainerThread;
-
-  std::vector<TrainerThreadPtr>& getAllThreads() { return threads_; }
-  /// Calculate the real device id based on the logical device id and the
-  /// thread id.
-  int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
-    if (logicalId == -1) {
-      logicalId = 0;
-    }
-    return mod(logicalId + FLAGS_gpu_id + threadId * numLogicalDevices_,
-               numDevices_);
-  }
-
-  /// Calculate the logical device id based on the real device id and the
-  /// thread id.
-  int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const {
-    if (realId == -1) {
-      return 0;
-    } else {
-      return mod(realId - FLAGS_gpu_id - threadId * numLogicalDevices_,
-                 numDevices_);
-    }
-  }
-
-  std::vector<const std::vector<ParameterPtr>*> getSlaveParameters();
-
-  bool hasNonstaticCpuParamters() const { return hasNonstaticCpuParamters_; }
-
-  /// Called TrainerThread to wait before merging CPU parameter gradients.
-  void waitBeforeMerge() { trainerBarrier_.wait(); }
-
-  /// called by MultiGradientMachine and TrainerThread to wait after merging
-  /// CPU parameter graidents.
-  void waitAfterMerge() { allBarrier_.wait(); }
-
-  /// called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
-  /// finishing
-  void waitForCopyInArgs() { allBarrier_.wait(); }
-
-  TrainerThreadPtr& getThread(int threadId) { return threads_[threadId]; }
-
-  std::vector<GradBuffer>& getGradBuf(int threadId) {
-    return gradBufs_[threadId];
-  }
-
-  PassType getPassType() const { return passType_; }
-
-  /// Called by TrainerThread to notify MultiGradientMachine that the gradient
-  /// for paramId is ready
-  void notifyGradientTransfer(int paramId);
-
-  const std::vector<Argument>& getInArgs() { return inArgs_; }
-
-  TaskType getTaskType() const { return taskType_; }
-
-  const UpdateCallback& getBackwardCallback() const {
-    return backwardCallback_;
-  }
-
-  int getNumDevices() const { return numDevices_; }
-
-  int getNumLogicalDevices() const { return numLogicalDevices_; }
-
-  int getNumThreads() const { return numThreads_; }
-
-  int paraMainThread(int pid) const { return paraMainThread_[pid]; }
-
-protected:
-  virtual void forwardImp(const std::vector<Argument>& inArgs,
-                          std::vector<Argument>* outArgs,
-                          PassType passType,
-                          TaskType taskType);
-
-  virtual void backwardImp(const UpdateCallback& callback = NULL);
-
-  /// update all parameters
-  void updateThreadParameters();
-
-  void startTask(TaskType taskType);
-
-  void getOutArgs(std::vector<Argument>* outArgs, PassType passType);
-
-  void allocGradBufs();
-
-protected:
-  bool useGpu_;
-
-  bool hasNonstaticCpuParamters_;
-
-  /// store main parameter only
-  std::unique_ptr<GradientMachine> gradientMachine_;
-
-  std::vector<TrainerThreadPtr> threads_;
-  std::vector<int> paraMainThread_;
-  std::vector<std::vector<GradBuffer>> gradBufs_;  // [threadId][deviceId]
-  std::vector<size_t> bufferSizes_;
-
-  PassType passType_;
-  TaskType taskType_;
-  PidQueue gradQueue_;
-  std::vector<Argument> inArgs_;
-  std::vector<Argument> outArgs_;
-  hl_stream_t outArgStream_;
-
-  Argument outLayerArgs_;
-
-  /// ParameterType which needs to be merged from each GPU
-  std::vector<ParameterType> mergeTypes_;
-  int numDevices_;         /* number of gpu devices */
-  int numLogicalDevices_;  // number of GPU used by one NN
-  int numThreads_;         /* number of train threads */
-
-  UpdateCallback backwardCallback_;
-
-  /// barrrier for threads_
-  ThreadBarrier trainerBarrier_;
-
-  /// barrier for both MultiGradientMachine and threds_
-  ThreadBarrier allBarrier_;
-
-  /// indicate whether inArgs is copied before forward()
-  bool inArgsCopied_;
-
-  /// Whether to copy the gradient back from an external input.
-  bool isPassGrad_;
-};
-
-class TrainerThread {
-public:
-  TrainerThread(const ModelConfig& config,
-                int threadId,
-                MultiGradientMachine* multiMachine);
-
-  ~TrainerThread();
-
-  void start();
-
-  void onPassEnd() { gradientMachine_->onPassEnd(); }
-
-  void waitOutArgsReady() { outArgsReadySem_.wait(); }
-
-  void notifyTaskReady() { taskReadySem_.post(); }
-
-  int getDeviceId() const { return deviceId_; }
-
-  GradientMachine* getGradientMachine() { return gradientMachine_.get(); }
-
-  const std::vector<ParameterPtr>& getParameters() { return parameters_; }
-
-  void stop();
-
-  void notifyValueReady(int paramId);
-
-  const VectorPtr& getValueBuf(int paramId) {
-    return parameters_[paramId]->getBuf(PARAMETER_VALUE);
-  }
-
-  const std::vector<Argument>& getOutArgs() { return outArgs_; }
-
-  void incUpdateCounter(int n = 1) {
-    updateCounter_ += n;
-    parameterUpdated_ = true;
-  }
-
-  void notifyGradientCollect(int paramId) { gradQueue_.enqueue(paramId); }
-
-  void notifyCopyGradToBuffer(int paramId) { gradBufQueue_.enqueue(paramId); }
-
-  void notifyValueDispatch(int paramId) { valueReadyQueue_.enqueue(paramId); }
-
-  void prefetch();
-
-  /// copy the output gradient from the main GradientMachine.
-  void copyOutputGrad();
-
-  /// Whether the thread has input data.
-  bool hasInputData() { return batchSize_ != 0; }
-
-protected:
-  void mergeCpuGradients();
-
-  void mergeGradSparse(
-      Parameter* para,
-      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
-
-  void mergeGradSparseRemote(
-      Parameter* para,
-      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
-
-  void mergeGradDense(
-      Parameter* para,
-      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
-
-  void computeThread();
-  void valueDispatchThread();
-  void copyGradToBufferThread();
-  void gradCollectThread();
-
-  int copyInArgs();
-  void forward();
-  void backward();
-  void backwardCallback(Parameter* para);
-
-  /// call the actuall callback supplied by the caller of
-  /// GradientMachine::backward
-  void doCallback(int pid);
-
-protected:
-  MultiGradientMachine* multiMachine_;
-  ModelConfig config_;
-  /// whether the thread should stop
-  bool stopping_;
-  /// the threads form which to collect gradient
-  int partnerId_;
-  /// from 0 to threads-1
-  int threadId_;
-  int deviceId_;
-  std::unique_ptr<GradientMachine> gradientMachine_;
-  std::vector<ParameterPtr> parameters_;
-
-  /// ParameterType which needs to be merged from each GPU
-  std::vector<ParameterType> mergeTypes_;
-
-  /// compute thread
-  std::unique_ptr<std::thread> computeThread_;
-  std::vector<Argument> inArgs_;
-  std::vector<Argument> outArgs_;
-  Semaphore taskReadySem_;
-  Semaphore outArgsReadySem_;
-
-  /// copy thread
-  std::unique_ptr<std::thread> copyThread_;
-  /// queue of gradient needs to be copied to partner
-  PidQueue gradBufQueue_;
-  hl_stream_t gradStream_;
-
-  /// grad merge thread
-  std::unique_ptr<std::thread> gradCollectThread_;
-  /// queue of gradient needs to be merged with gradient coopied by
-  /// copyGradToBufferThread
-  PidQueue gradQueue_;
-  UpdateCallback backwardCallback_;
-
-  /// value dispatch thread
-  std::unique_ptr<std::thread> valueDispatchThread_;
-  /// queue of the parameter whose the vale are ready for copy
-  PidQueue valueReadyQueue_;
-
-  /// used to notify all the parameter values are ready
-  LockedCondition valueReadyCond_;
-
-  hl_stream_t valueStream_;
-  /// how many parameters are updated
-  std::atomic<int> updateCounter_;
-  bool parameterUpdated_;
-
-  /// indicate whether inArgs is copied before forward()
-  bool inArgsCopied_;
-  int batchSize_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp
deleted file mode 100644
index a1140402b8baaae20e20802ebf87462e301b60f9..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/MultiNetwork.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-#include "MultiNetwork.h"
-
-#include "NeuralNetwork.h"
-#include "ParallelNeuralNetwork.h"
-
-namespace paddle {
-
-void MultiNetwork::init(const ModelConfig& config,
-                        ParamInitCallback callback,
-                        const std::vector<ParameterType>& parameterTypes,
-                        bool useGpu) {
-  CHECK_GT(config.sub_models_size(), 1) << "sub_models_size should GT 1";
-  // check submodel[0] is root
-  CHECK_EQ("root", config.sub_models(0).name())
-      << "sub_models(0) should be root";
-  // ignore root
-  subNetworks_.resize(config.sub_models_size() - 1);
-  // base class
-  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
-  // sub networks
-  for (int i = 1; i < config.sub_models_size(); ++i) {
-    std::string subModelName = config.sub_models(i).name();
-    if (FLAGS_parallel_nn) {
-      subNetworks_[i - 1] = std::unique_ptr<ParallelNeuralNetwork>(
-          new ParallelNeuralNetwork(subModelName, this));
-    } else {
-      subNetworks_[i - 1] = std::unique_ptr<NeuralNetwork>(
-          NeuralNetwork::newNeuralNetwork(subModelName, this));
-    }
-    subNetworks_[i - 1]->init(config);
-  }
-}
-
-void MultiNetwork::prefetch(const std::vector<Argument>& inArgs) {
-  std::vector<std::vector<Argument>> argumentGroups;
-  Argument::splitByDataId(inArgs, &argumentGroups);
-  // check group size is equal to sub network size
-  CHECK_EQ(argumentGroups.size(), subNetworks_.size());
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) {
-      // check input args: if dataId is -1, then skip this sub network
-      continue;
-    }
-    subNetworks_[i]->prefetch(argumentGroups[i]);
-  }
-}
-
-void MultiNetwork::forward(const std::vector<Argument>& inArgs,
-                           std::vector<Argument>* outArgs,
-                           PassType passType) {
-  // split inArgs to several vectors
-  std::vector<std::vector<Argument>> argumentGroups;
-  Argument::splitByDataId(inArgs, &argumentGroups);
-
-  // check group size is equal to sub network size
-  CHECK_EQ(argumentGroups.size(), subNetworks_.size());
-  std::vector<Argument> tempOutArgs;
-  outArgs->clear();
-
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    tempOutArgs.clear();
-    if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) {
-      // check input args: if dataId is -1, then skip this sub network
-      continue;
-    }
-    subNetworks_[i]->forward(argumentGroups[i], &tempOutArgs, passType);
-    for (const auto& elem : tempOutArgs) {
-      outArgs->push_back(elem);
-      outArgs->back().dataId = i;
-    }
-  }
-}
-
-void MultiNetwork::backward(const UpdateCallback& callback) {
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    subNetworks_[i]->backward(callback);
-  }
-}
-
-void MultiNetwork::forwardBackward(const std::vector<Argument>& inArgs,
-                                   std::vector<Argument>* outArgs,
-                                   PassType passType,
-                                   const UpdateCallback& callback) {
-  forward(inArgs, outArgs, passType);
-  backward(callback);
-}
-
-void MultiNetwork::onPassEnd() {
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    subNetworks_[i]->onPassEnd();
-  }
-}
-
-void MultiNetwork::start() {
-  for (auto& subNetwork : subNetworks_) {
-    subNetwork->start();
-  }
-}
-
-void MultiNetwork::finish() {
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    subNetworks_[i]->finish();
-  }
-}
-
-class MultiCombinedEvaluator : public Evaluator {
-public:
-  MultiCombinedEvaluator() {}
-  void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
-    evaluators_.emplace_back(std::move(evaluator));
-  }
-  virtual void start() {
-    for (auto& evaluator : evaluators_) {
-      evaluator->start();
-    }
-  }
-
-  virtual void finish() {
-    for (auto& evaluator : evaluators_) {
-      evaluator->finish();
-    }
-  }
-
-  virtual void eval(const NeuralNetwork& nn) {
-    const MultiNetwork& multiNetwork = dynamic_cast<const MultiNetwork&>(nn);
-    CHECK_EQ(evaluators_.size(), multiNetwork.getSubNetworks().size());
-    int size = evaluators_.size();
-    for (int i = 0; i < size; i++) {
-      // one evaluator for one subNetwork
-      evaluators_[i]->eval(*multiNetwork.getSubNetworks()[i]);
-    }
-  }
-
-  virtual real evalImp(std::vector<Argument>& arguments) {
-    (void)arguments;
-    return -1;
-  }
-
-  virtual void printStats(std::ostream& os) const {
-    for (auto& evaluator : evaluators_) {
-      evaluator->printStats(os);
-      os << ' ';
-    }
-  }
-
-  virtual void distributeEval(ParameterClient2* client) {
-    for (auto& evaluator : evaluators_) {
-      evaluator->distributeEval(client);
-    }
-  }
-
-protected:
-  std::vector<std::unique_ptr<Evaluator>> evaluators_;
-};
-
-Evaluator* MultiNetwork::makeEvaluator() const {
-  MultiCombinedEvaluator* multiCombinedEvaluator = new MultiCombinedEvaluator();
-  for (size_t i = 0; i < subNetworks_.size(); i++) {
-    std::unique_ptr<Evaluator> evaluator(subNetworks_[i]->makeEvaluator());
-    multiCombinedEvaluator->addEvaluator(std::move(evaluator));
-  }
-  return multiCombinedEvaluator;
-}
-
-void MultiNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.h b/paddle/gserver/gradientmachines/MultiNetwork.h
deleted file mode 100644
index 186a9ad0a39cd7815aea6738e6c6bc4a0c944aa9..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/MultiNetwork.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "GradientMachine.h"
-#include "NeuralNetwork.h"
-
-#include "paddle/utils/Locks.h"
-
-namespace paddle {
-
-class MultiNetwork : public NeuralNetwork {
-public:
-  explicit MultiNetwork(std::string subModelName = "")
-      : NeuralNetwork(subModelName) {}
-
-  virtual void init(const ModelConfig& config,
-                    ParamInitCallback callback,
-                    const std::vector<ParameterType>& parameterTypes,
-                    bool useGpu);
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback);
-
-  virtual void onPassEnd();
-
-  virtual Evaluator* makeEvaluator() const;
-
-  virtual void eval(Evaluator* evaluator) const;
-
-  const std::vector<std::unique_ptr<NeuralNetwork>>& getSubNetworks() const {
-    return subNetworks_;
-  }
-
-  virtual void start();
-
-  virtual void finish();
-
-protected:
-  std::vector<std::unique_ptr<NeuralNetwork>> subNetworks_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
deleted file mode 100644
index a3c13df3dbad973505d8919bce8b95348527e273..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ /dev/null
@@ -1,548 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Util.h"
-
-#include "NeuralNetwork.h"
-#include "hl_gpu.h"
-#include "paddle/utils/CustomStackTrace.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/gserver/layers/MKLDNNLayer.h"
-#endif
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "MultiNetwork.h"
-#include "RecurrentGradientMachine.h"
-#include "paddle/gserver/layers/AgentLayer.h"
-#endif
-
-namespace paddle {
-void parameterInitNN(int paramId,
-                     Parameter* para,
-                     std::vector<ParameterPtr>* sharedParams) {
-  // Create parameters values.
-  if (!para->useGpu() && sharedParams) {
-    para->enableSharedType(PARAMETER_VALUE,
-                           (*sharedParams)[paramId]->getBuf(PARAMETER_VALUE),
-                           (*sharedParams)[paramId]->getMat(PARAMETER_VALUE));
-  } else {
-    if (para->isSparseRemoteUpdate()) {
-      para->enableType(PARAMETER_VALUE,
-                       FLAGS_loadsave_parameters_in_pserver
-                           ? Parameter::MAT_SPARSE_ROW_PREFETCH
-                           : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE);
-    } else {
-      para->enableType(PARAMETER_VALUE);
-    }
-  }
-  // Create parameter gradients.
-  if (para->isSparseRemoteUpdate() && !sharedParams) {
-    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW);
-  } else if (para->isGradSparseUpdate()) {
-    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_AUTO_GROW);
-  } else if (!para->isStatic()) {
-    para->enableType(PARAMETER_GRADIENT);
-  }
-}
-
-NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  if (config.type() == "recurrent_nn") {
-    return newNeuralNetwork("root");
-  } else if (config.type() == "multi_nn") {
-    return new MultiNetwork("root");
-  } else {
-    return newNeuralNetwork();
-  }
-#else
-  return new NeuralNetwork();
-#endif
-}
-
-std::map<std::string, bool> NeuralNetwork::dllInitMap;
-
-void NeuralNetwork::init(const ModelConfig& config,
-                         ParamInitCallback callback,
-                         const std::vector<ParameterType>& parameterTypes,
-                         bool useGpu) {
-  using std::placeholders::_1;
-  using std::placeholders::_2;
-  ParamInitCallback paramCallback = nullptr;
-  if (callback != nullptr) {
-    paramSelfInited_ = false;
-    paramCallback = callback;
-  } else {
-    paramSelfInited_ = true;
-    paramCallback = std::bind(parameterInitNN, _1, _2, nullptr);
-  }
-  config_ = config;
-
-  if (rootNetwork_ != nullptr) {
-    // direct use parameters_ and parameterMap_ from base network
-    CHECK_EQ((size_t)config.parameters_size(),
-             rootNetwork_->getParameters().size());
-    parameters_ = rootNetwork_->getParameters();
-    parameterMap_ = *(rootNetwork_->getParameterMap());
-  } else {
-    parameters_.reserve(config.parameters_size());
-    for (const auto& para_config : config.parameters()) {
-      auto parameter = std::make_shared<Parameter>(para_config,
-                                                   useGpu,
-                                                   /*initialize=*/false);
-      paramCallback(parameters_.size(), parameter.get());
-      if (!callback) {
-        for (ParameterType type :
-             (parameter->isStatic()
-                  ? std::vector<ParameterType>{PARAMETER_VALUE}
-                  : parameterTypes)) {
-          if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) {
-            parameter->enableType(type);
-          }
-        }
-      }
-      parameter->setID(parameters_.size());
-      parameters_.push_back(parameter);
-      CHECK(!parameterMap_.count(parameter->getName()));
-      parameterMap_[parameter->getName()] = parameter;
-    }
-  }
-
-  auto layerCreate = [&](const LayerConfig& layer_config) {
-    auto layer = Layer::create(layer_config);
-    CHECK(layer) << "Create layer failed. Layer name:" << layer->getName();
-    layers_.push_back(layer);
-    CHECK(!layerMap_.count(layer->getName()));
-    layerMap_[layer->getName()] = layer;
-  };
-
-  auto subModelConfig = std::find_if(config.sub_models().begin(),
-                                     config.sub_models().end(),
-                                     [=](const SubModelConfig& sub_model) {
-                                       return sub_model.name() == subModelName_;
-                                     });
-  bool useSubModel = (subModelConfig != config.sub_models().end());
-  CHECK_EQ(useSubModel, !subModelName_.empty());
-  if (useSubModel) {
-    layers_.reserve(subModelConfig->layer_names_size());
-    for (const auto& layer_name : subModelConfig->layer_names()) {
-      auto layer_config =
-          std::find_if(config.layers().begin(),
-                       config.layers().end(),
-                       [=](const LayerConfig& layer_config) {
-                         return layer_config.name() == layer_name;
-                       });
-      CHECK(layer_config != config.layers().end());
-      layerCreate(*layer_config);
-    }
-  } else {
-    layers_.reserve(config.layers_size());
-    for (const auto& layer_config : config.layers()) {
-      bool useLayer = true;
-      if (config.has_external_config()) {
-        useLayer = true;
-        for (const auto& name : config.external_config().layer_names()) {
-          if (layer_config.name() == name) {
-            useLayer = false;
-            break;
-          }
-        }
-      }
-      if (useLayer) {
-        layerCreate(layer_config);
-      }
-    }
-  }
-
-  for (const auto& layer : layers_) {
-    layer->init(layerMap_, parameterMap_);
-    layer->initSubNetwork(this /*root*/, config_, parameterTypes, useGpu);
-  }
-
-  for (const auto& layer_name :
-       (useSubModel ? subModelConfig->input_layer_names()
-                    : config.input_layer_names())) {
-    auto it = layerMap_.find(layer_name);
-    CHECK(it != layerMap_.end());
-    dataLayers_.push_back(std::dynamic_pointer_cast<DataLayer>(it->second));
-  }
-
-  for (const auto& layer_name :
-       (useSubModel ? subModelConfig->output_layer_names()
-                    : config.output_layer_names())) {
-    auto it = layerMap_.find(layer_name);
-    CHECK(it != layerMap_.end());
-    outputLayers_.push_back(it->second);
-  }
-
-  for (const auto& layer : layers_) {
-    const auto& name = layer->getName();
-    bool isMiddleLayer = true;
-
-    // if data layer
-    for (const auto& dataLayer : dataLayers_) {
-      if (name == dataLayer->getName()) {
-        isMiddleLayer = false;
-        break;
-      }
-    }
-
-    // if output layer
-    for (const auto& dataLayer : outputLayers_) {
-      if (name == dataLayer->getName()) {
-        isMiddleLayer = false;
-        break;
-      }
-    }
-
-    if (isMiddleLayer) {
-      middleLayers_.push_back(layer);
-    }
-  }
-}
-
-void NeuralNetwork::connect(LayerPtr agentLayer,
-                            LayerPtr realLayer,
-                            int height) {
-#ifndef PADDLE_MOBILE_INFERENCE
-  AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
-  CHECK_NOTNULL(agent);
-  agent->setRealLayer(realLayer, height);
-#endif
-}
-
-void NeuralNetwork::connect(std::string agentLayerName,
-                            NeuralNetwork* srcNN,
-                            std::string realLayerName) {
-  connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName));
-}
-
-void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
-  CHECK_EQ(inArgs.size(), dataLayers_.size());
-
-  if (paramSelfInited_) {
-    for (auto& para : parameters_) {
-      if (para->isSparseRemoteUpdate()) {
-        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-            para->getMat(PARAMETER_VALUE).get());
-        para->clearGradient();
-        if (mat) mat->clearIndices();
-      }
-    }
-  }
-
-  for (size_t i = 0; i != dataLayers_.size(); ++i) {
-    if (FLAGS_parallel_nn) {
-      const_cast<Argument&>(inArgs[i]).deviceId = -1;
-    }
-    dataLayers_[i]->setData(inArgs[i]);
-  }
-
-  for (auto& layer : layers_) {
-    layer->prefetch();
-  }
-
-  if (paramSelfInited_) {
-    for (auto& para : parameters_) {
-      if (para->isSparseRemoteUpdate()) {
-        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
-            para->getMat(PARAMETER_VALUE).get());
-        mat->setupIndices();
-        auto matGrad = dynamic_cast<SparseRowCpuMatrix*>(
-            para->getMat(PARAMETER_GRADIENT).get());
-        matGrad->reserveStore();
-      }
-    }
-  }
-}
-
-void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
-                            std::vector<Argument>* outArgs,
-                            PassType passType) {
-  CHECK_EQ(inArgs.size(), dataLayers_.size());
-  outArgs->resize(outputLayers_.size());
-  for (size_t i = 0; i != dataLayers_.size(); ++i) {
-    dataLayers_[i]->setData(inArgs[i]);
-  }
-
-  gLayerStackTrace.set_stage(true);
-
-  {
-    for (auto& layer : layers_) {
-      REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str());
-      gLayerStackTrace.push(layer->getName());
-      layer->forward(passType);
-      gLayerStackTrace.pop(layer->getName());
-    }
-  }
-
-  outArgs->clear();
-  outArgs->reserve(outputLayers_.size());
-  for (auto& layer : outputLayers_) {
-    outArgs->push_back(layer->getOutput());
-  }
-}
-
-void NeuralNetwork::resetState() {
-  for (auto& layer : layers_) {
-    layer->resetState();
-  }
-}
-
-void NeuralNetwork::setState(const MachineState& machineState) {
-  for (size_t i = 0; i < layers_.size(); i++) {
-    if (machineState[i] != nullptr) {
-      layers_[i]->setState(machineState[i]);
-    }
-  }
-}
-
-void NeuralNetwork::getState(MachineState& machineState) {
-  machineState.clear();
-  machineState.reserve(layers_.size());
-  for (auto& layer : layers_) {
-    LayerStatePtr p = layer->getState();
-    machineState.push_back(p);
-  }
-}
-
-void NeuralNetwork::backward(const UpdateCallback& callback) {
-  gLayerStackTrace.set_stage(false);
-  FOR_EACH_R(layer, layers_) {
-    REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str());
-    gLayerStackTrace.push((*layer)->getName());
-    if ((*layer)->needGradient()) {
-      (*layer)->backward(callback);
-    }
-    gLayerStackTrace.pop((*layer)->getName());
-  }
-}
-
-void NeuralNetwork::finish() {
-#ifdef PADDLE_WITH_MKLDNN
-  FOR_EACH_R(layer, layers_) {
-    MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
-    if (dnnLayer) {
-      dnnLayer->convertWeightsToPaddle();
-    }
-  }
-#endif
-}
-
-Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
-  return getLayer(layerName)->getOutput();
-}
-
-void NeuralNetwork::onPassEnd() {
-  for (auto& layer : layers_) {
-    layer->onPassEnd();
-  }
-}
-
-void NeuralNetwork::releaseOutput() {
-  for (auto& layer : middleLayers_) {
-    Argument& arg = layer->getOutput();
-    arg.value.reset();
-  }
-}
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-class CombinedEvaluator : public Evaluator {
-public:
-  void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
-    evaluators_.emplace_back(std::move(evaluator));
-  }
-  void start() override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->start();
-    }
-  }
-
-  void finish() override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->finish();
-    }
-  }
-
-  void eval(const NeuralNetwork& nn) override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->eval(nn);
-    }
-  }
-  real evalImp(std::vector<Argument>& arguments) override {
-    (void)arguments;
-    return -1;
-  }
-  void printStats(std::ostream& os) const override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->printStats(os);
-      os << ' ';
-    }
-  }
-
-  void distributeEval(ParameterClient2* client) override {
-    for (auto& evaluator : evaluators_) {
-      evaluator->distributeEval(client);
-    }
-  }
-
-protected:
-  std::vector<std::unique_ptr<Evaluator>> evaluators_;
-
-  // Evaluator interface
-public:
-  /**
-   * @brief getNames will return all inside evaluators' names.
-   * @param names [out]: return names.
-   */
-  void getNames(std::vector<std::string>* names) override {
-    for (auto& eval : evaluators_) {
-      eval->getNames(names);
-    }
-  }
-
-  /**
-   * @brief getValue could get all inside evaluators' value.
-   */
-  real getValue(const std::string& name, Error* err) const override {
-    return this->getMethodHelper<real>(
-        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
-          return eval->getValue(name, err);
-        });
-  }
-
-  /**
-   * @brief getType could get all inside evaluators' type.
-   */
-  std::string getType(const std::string& name, Error* err) const override {
-    return this->getMethodHelper<std::string>(
-        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
-          return eval->getType(name, err);
-        });
-  }
-
-private:
-  template <typename T>
-  T getMethodHelper(const std::string& name,
-                    Error* err,
-                    const std::function<T(const std::unique_ptr<Evaluator>&)>&
-                        callback) const {
-    for (auto& eval : evaluators_) {
-      std::vector<std::string> names;
-      eval->getNames(&names);
-      if (std::find(names.begin(), names.end(), name) != names.end()) {
-        return callback(eval);
-      }
-    }
-    *err = Error("No such key %s", name.c_str());
-    return T();
-  }
-};
-
-class SubnetEvaluator : public CombinedEvaluator {
-public:
-  SubnetEvaluator(const std::string& layerName,
-                  std::unique_ptr<Evaluator>&& evaluator)
-      : layerName_(layerName) {
-    addEvaluator(std::move(evaluator));
-  }
-  void eval(const NeuralNetwork& nn) override {
-    const LayerPtr& layer = nn.getLayer(layerName_);
-    CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
-                 << nn.getName();
-    bool accessed = false;
-    layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) {
-      subnet.eval(evaluators_[0].get());
-      accessed = true;
-    });
-    CHECK(accessed) << "There is no subnetwork for layer " << layerName_
-                    << " in submodel " << nn.getName();
-  }
-
-protected:
-  std::string layerName_;
-};
-
-Evaluator* NeuralNetwork::makeEvaluator() const {
-  CombinedEvaluator* combinedEvaluator = new CombinedEvaluator();
-  auto subModelConfig = std::find_if(config_.sub_models().begin(),
-                                     config_.sub_models().end(),
-                                     [=](const SubModelConfig& sub_model) {
-                                       return sub_model.name() == subModelName_;
-                                     });
-  bool useSubModel = (subModelConfig != config_.sub_models().end());
-  CHECK_EQ(useSubModel, !subModelName_.empty());
-  if (useSubModel) {
-    // create the evaluators that belong to CURRENT submodel
-    for (int i = 0; i < subModelConfig->evaluator_names_size(); ++i) {
-      // find evaluator by name
-      auto thisEvalConfig = std::find_if(
-          config_.evaluators().begin(),
-          config_.evaluators().end(),
-          [=](const EvaluatorConfig& ecfg) {
-            return ecfg.name() == subModelConfig->evaluator_names(i);
-          });
-      bool validConfig = (thisEvalConfig != config_.evaluators().end());
-      if (validConfig) {
-        std::unique_ptr<Evaluator> evaluator(
-            Evaluator::create(*thisEvalConfig));
-        combinedEvaluator->addEvaluator(std::move(evaluator));
-      }
-    }
-    for (auto& layer : layers_) {
-      layer->accessSubNetwork(
-          [layer, combinedEvaluator](NeuralNetwork& subnet) {
-            std::unique_ptr<Evaluator> subEvaluator(new SubnetEvaluator(
-                layer->getName(),
-                std::unique_ptr<Evaluator>(subnet.makeEvaluator())));
-            combinedEvaluator->addEvaluator(std::move(subEvaluator));
-          });
-    }
-  } else {
-    for (const EvaluatorConfig& evalConfig : config_.evaluators()) {
-      std::unique_ptr<Evaluator> evaluator(Evaluator::create(evalConfig));
-      combinedEvaluator->addEvaluator(std::move(evaluator));
-    }
-  }
-  return combinedEvaluator;
-}
-
-void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
-
-#endif
-
-void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
-  CHECK_GE(outputLayers_.size(), args.size());
-  for (size_t i = 0; i < args.size(); ++i) {
-    outputLayers_[i]->getOutput().grad = args[i].grad;
-  }
-}
-
-extern NeuralNetwork* newCustomNerualNetwork(const std::string& name,
-                                             NeuralNetwork* network)
-    __attribute__((weak));
-
-NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name,
-                                               NeuralNetwork* rootNetwork) {
-  if (newCustomNerualNetwork) {
-    return newCustomNerualNetwork(name, rootNetwork);
-  } else {
-    return new NeuralNetwork(name, rootNetwork);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
deleted file mode 100644
index 5b32f844f742c07c8bee6638cb46dc00285f49b0..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <map>
-#include <memory>
-
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/dataproviders/DataProvider.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-#include "paddle/gserver/layers/CostLayer.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/Layer.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/utils/ClassRegistrar.h"
-
-namespace paddle {
-/*
- * @brief  Init function for the parameters.
- * @param paramId: the id of the parameter to init.
- * @param para: the pointer to the parameter to init.
- * @param sharedParams: the pointer to an array of the parameter to be shared.
- *                      If it is null, no parameter sharing is used.
- *                      Only CPU paramters can be shared.
- * It handles CPU, CPU sparse, CPU sparse remote,
- * and GPU parameters differently. If the type
- * of a parameter is NORMAL. Basically nothing need to be done.
- * CPU value: NORMAL.
- * CPU param: NORMAL.
- *
- * CPU sparse value: NORMAL.
- * CPU sparse gradient: MAT_SPARSE_ROW_AUTO_GROW.
- *
- * CPU sparse remote value: MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE).
- * CPU sparse remote gradient: MAT_SPARSE_ROW (!sharedParams)
- *                             MAT_SPARSE_ROW_AUTO_GROW (sharedParams)
- *
- * GPU value: NORMAL
- * GPU param: NORMAL
- */
-void parameterInitNN(int paramId,
-                     Parameter* para,
-                     std::vector<ParameterPtr>* sharedParams);
-
-class NeuralNetwork : public GradientMachine {
-public:
-  virtual void init(const ModelConfig& config,
-                    ParamInitCallback callback = nullptr,
-                    const std::vector<ParameterType>& parameterTypes =
-                        std::vector<ParameterType>{PARAMETER_VALUE,
-                                                   PARAMETER_GRADIENT,
-                                                   PARAMETER_MOMENTUM},
-                    bool useGpu = FLAGS_use_gpu);
-
-  /**
-   * Connect two submodels and
-   * down-submodel's output become up-submodel's input.
-   * By default, connection is one by one,
-   * If the agent height is smaller than real layer, *height* has to be filled.
-   *
-   * @param realLayer  The down-submodel's output layer.
-   * @param agentLayer The up-submodel's input agent layer.
-   */
-  static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0);
-  void connect(std::string agentLayerName,
-               NeuralNetwork* srcNN,
-               std::string realLayerName);
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  virtual Argument getLayerOutput(const std::string& layerName);
-
-  const LayerPtr& getLayer(const std::string& layerName) const {
-    auto it = layerMap_.find(layerName);
-    CHECK(it != layerMap_.end()) << "Unknown layer " << layerName;
-    return it->second;
-  }
-
-  virtual void onPassEnd();
-
-#ifndef PADDLE_MOBILE_INFERENCE
-  virtual Evaluator* makeEvaluator() const;
-
-  virtual void eval(Evaluator* evaluator) const;
-#endif
-
-  virtual void resetState();
-  virtual void setOutputGrad(const std::vector<Argument>& args);
-
-  /// set machine state
-  virtual void setState(const MachineState& machineState);
-
-  /// get machine state
-  virtual void getState(MachineState& machineState);
-
-  static NeuralNetwork* create(const ModelConfig& config);
-
-  ParameterMap* getParameterMap() { return &parameterMap_; }
-
-  /**
-   * @brief Access each layer as a for each loop.
-   * @param callback invoke with each layer.
-   */
-  template <typename T>
-  void forEachLayer(T callback) {
-    for (auto& l : layers_) {
-      if (callback(l)) {
-        break;
-      }
-    }
-  }
-
-  static NeuralNetwork* newNeuralNetwork(const std::string& name = "",
-                                         NeuralNetwork* rootNetwork = nullptr);
-
-  const std::string& getName() const { return subModelName_; }
-
-  /// some finish work, like convert the weight format of MKLDNNLayers
-  void finish();
-
-  /**
-   * @brief   Release the middle layer's output memory.
-   *
-   * @note    This function is used for memory optimization in inference.
-   */
-  void releaseOutput();
-
-protected:
-  /**
-   * The constructor of NeuralNetwork.
-   * The sub networks can get parameters_ and parameterMap_
-   * from base NeuralNetwork.
-   *
-   * @param subModelName The name of sub-model.
-   * @param rootNetwork  It used in MultiNetwork.
-   */
-  NeuralNetwork(std::string subModelName = "",
-                NeuralNetwork* rootNetwork = nullptr)
-      : subModelName_(subModelName), rootNetwork_(rootNetwork) {}
-
-  std::string subModelName_;
-  ModelConfig config_;
-  std::vector<LayerPtr> layers_;
-  ParameterMap parameterMap_;
-  LayerMap layerMap_;
-
-  std::vector<DataLayerPtr> dataLayers_;
-  std::vector<LayerPtr> outputLayers_;
-  std::vector<LayerPtr> middleLayers_;
-
-  static std::map<std::string, bool> dllInitMap;
-
-  NeuralNetwork* rootNetwork_;
-
-  /// Whether parameter of this NN is initialized by its own
-  /// (i.e., not by callback supplied with the caller)
-  bool paramSelfInited_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp
deleted file mode 100644
index 85cfc59fbef7017f8dea7fdfecd18aa3e75a871c..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-#include "ParallelNeuralNetwork.h"
-
-#include <pthread.h>
-#include <sched.h>
-
-namespace paddle {
-
-void ParallelNeuralNetwork::init(
-    const ModelConfig& config,
-    ParamInitCallback callback,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
-
-  if (config.type() == "recurrent_nn") {
-    LOG(FATAL)
-        << "You can not add `--parallel_nn=true` on the command line, "
-        << "parallel_nn training mode does not support the recurrent_nn model.";
-  }
-
-  useGpu_ = useGpu;
-  numDevices_ = 0;
-  if (useGpu_) {
-    numDevices_ = hl_get_device_count();
-  }
-
-  for (auto& layer : layers_) {
-    int deviceId = layer->getDeviceId();
-    CHECK_LT(deviceId, numDevices_);
-    addComputeThread(deviceId);
-  }
-}
-
-void ParallelNeuralNetwork::addComputeThread(int deviceId) {
-  for (auto& thread : threads_) {
-    if (thread->getDeviceId() == deviceId) {
-      return;
-    }
-  }
-
-  threads_.emplace_back(new ParallelThread(
-      threads_.size(), deviceId, deviceId >= 0 ? useGpu_ : false));
-}
-
-void ParallelNeuralNetwork::waitAllThread() {
-  for (auto& thread : threads_) {
-    thread->jobEnqueue(NULL, TASK_END_LAYER);
-  }
-
-  for (size_t i = 0; i < threads_.size(); i++) {
-    threads_[i]->queue_.waitEmpty();
-  }
-}
-
-void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId,
-                                               LayerPtr layer,
-                                               TaskType task) {
-  for (auto& thread : threads_) {
-    if (thread->getDeviceId() == deviceId) {
-      thread->jobEnqueue(layer, task);
-      return;
-    }
-  }
-  LOG(FATAL) << "No specific device thread ";
-}
-
-void ParallelNeuralNetwork::forward(const std::vector<Argument>& inArgs,
-                                    std::vector<Argument>* outArgs,
-                                    PassType passType) {
-  for (auto& thread : threads_) {
-    thread->setForwardPassType(passType);
-  }
-  CHECK_EQ(inArgs.size(), dataLayers_.size());
-  outArgs->resize(outputLayers_.size());
-  for (size_t i = 0; i != dataLayers_.size(); ++i) {
-    const_cast<Argument&>(inArgs[i]).deviceId = -1;
-    dataLayers_[i]->setData(inArgs[i]);
-  }
-
-  for (auto& layer : layers_) {
-    dispatchByDeviceId(layer->getDeviceId(), layer, TASK_FORWARD);
-  }
-
-  {
-    REGISTER_TIMER("forwardTime");
-    waitAllThread();
-  }
-  outArgs->clear();
-  outArgs->reserve(outputLayers_.size());
-  for (auto& layer : outputLayers_) {
-    outArgs->push_back(layer->getOutput());
-  }
-}
-
-void ParallelNeuralNetwork::backward(const UpdateCallback& callback) {
-  for (auto& thread : threads_) {
-    thread->setBackwardCallback(callback);
-  }
-
-  FOR_EACH_R(layer, layers_) {
-    dispatchByDeviceId((*layer)->getDeviceId(), *layer, TASK_BACKWARD);
-  }
-  {
-    REGISTER_TIMER("backwardTime");
-    waitAllThread();
-  }
-}
-
-void ParallelNeuralNetwork::forwardBackward(const std::vector<Argument>& inArgs,
-                                            std::vector<Argument>* outArgs,
-                                            PassType passType,
-                                            const UpdateCallback& callback) {
-  forward(inArgs, outArgs, passType);
-  backward(callback);
-}
-
-void ParallelNeuralNetwork::start() {
-  for (auto& thread : threads_) {
-    thread->start();
-  }
-}
-
-ParallelThread::ParallelThread(int threadId, int deviceId, bool useGpu)
-    : threadId_(threadId), deviceId_(deviceId), useGpu_(useGpu) {}
-
-ParallelThread::~ParallelThread() { stop(); }
-
-void ParallelThread::stop() {
-  if (computeThread_) {
-    jobEnqueue(NULL, TASK_THREAD_FINISH);
-    computeThread_->join();
-    computeThread_.reset(nullptr);
-  }
-}
-
-void ParallelThread::computeThread() {
-  LOG(INFO) << "gradComputeThread " << threadId_;
-
-  if (useGpu_) {
-    hl_init(deviceId_);
-  }
-
-  while (true) {
-    struct Job job_work = queue_.dequeue();
-
-    if (job_work.task_ == TASK_END_LAYER) {
-      continue;
-    } else if (job_work.task_ == TASK_THREAD_FINISH) {
-      break;
-    }
-
-    if (TASK_FORWARD == job_work.task_) {
-      {
-        REGISTER_TIMER_INFO("waitInputValue",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->waitInputValue();
-      }
-      {
-        REGISTER_TIMER_INFO("threadForwardTimer",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->forward(passType_);
-      }
-      {
-        REGISTER_TIMER_INFO("copyOutputToOtherDevice",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->copyOutputToOtherDevice();
-      }
-    } else {
-      {
-        REGISTER_TIMER_INFO("waitAndMergeOutputGrad",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->waitAndMergeOutputGrad();
-      }
-      {
-        REGISTER_TIMER_INFO("threadBackwardTimer",
-                            job_work.layer_->getName().c_str());
-        job_work.layer_->backward(backwardCallback_);
-      }
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      job_work.layer_->markAllInputGrad();
-    }
-  }
-}
-
-void ParallelThread::start() {
-  computeThread_.reset(new std::thread([this]() { computeThread(); }));
-}
-
-void ParallelThread::jobEnqueue(LayerPtr layer, TaskType task) {
-  struct Job job_work;
-  job_work.layer_ = layer;
-  job_work.task_ = task;
-  queue_.enqueue(job_work);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
deleted file mode 100644
index e3b6812123141e8e0afb9368fb06f2b34f526800..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "NeuralNetwork.h"
-
-namespace paddle {
-
-class ParallelThread;
-
-enum TaskType {
-  TASK_FORWARD = 0,
-  TASK_BACKWARD = 1,
-  TASK_END_LAYER = 2,
-  TASK_THREAD_FINISH = 3,
-};
-
-/**
- * A ParallelNeuralNetwork is capable of calculating a neural network through
- * multiple threads in parallel.
- */
-class ParallelNeuralNetwork : public NeuralNetwork {
-public:
-  ParallelNeuralNetwork(std::string subModelName = "",
-                        NeuralNetwork *rootNetwork = nullptr)
-      : NeuralNetwork(subModelName, rootNetwork) {}
-
-  virtual void init(const ModelConfig &config,
-                    ParamInitCallback callback = nullptr,
-                    const std::vector<ParameterType> &parameterTypes =
-                        std::vector<ParameterType>{PARAMETER_VALUE,
-                                                   PARAMETER_GRADIENT,
-                                                   PARAMETER_MOMENTUM},
-                    bool useGpu = FLAGS_use_gpu);
-
-  virtual void forward(const std::vector<Argument> &inArgs,
-                       std::vector<Argument> *outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback &callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument> &inArgs,
-                       std::vector<Argument> *outArgs,
-                       PassType passType,
-                       const UpdateCallback &callback = NULL);
-
-  virtual void start();
-
-  void addComputeThread(int deviceId);
-
-  void dispatchByDeviceId(int deviceId, LayerPtr layer, TaskType task);
-
-  void waitAllThread();
-
-  // virtual void eval(Evaluator* evaluator);
-
-protected:
-  bool useGpu_;
-  /// number of gpu devices
-  int numDevices_;
-  std::vector<std::unique_ptr<ParallelThread>> threads_;
-};
-
-class ParallelThread {
-public:
-  ParallelThread(int threadId, int deviceId, bool useGpu);
-  ~ParallelThread();
-  void jobEnqueue(LayerPtr layer, TaskType task);
-  void start();
-  void stop();
-  int getDeviceId() const { return deviceId_; }
-
-  void setBackwardCallback(const UpdateCallback &callback) {
-    backwardCallback_ = callback;
-  }
-  void setForwardPassType(PassType passType) { passType_ = passType; }
-
-protected:
-  void computeThread();
-
-public:
-  struct Job {
-    LayerPtr layer_;
-    TaskType task_;
-  };
-  typedef Queue<Job> JobQueue;
-  JobQueue queue_;
-
-protected:
-  /// from 0 to threads-1
-  int threadId_;
-  /// the GPU device Id which the computeThread_ used
-  int deviceId_;
-  bool useGpu_;
-  std::unique_ptr<std::thread> computeThread_;
-  /// whether the thread should stop
-  bool stopping_;
-  UpdateCallback backwardCallback_;
-  PassType passType_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
deleted file mode 100644
index 2429b5d1a0a5ccf66db365b82c494c53d8e1fd4b..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ /dev/null
@@ -1,1501 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RecurrentGradientMachine.h"
-#include <dlfcn.h>
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <limits>
-#include "NeuralNetwork.h"
-#include "paddle/gserver/layers/AgentLayer.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
-
-static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob";
-static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob";
-static const char* DIY_FINISH_CALC_PROB_SYMBOL_NAME = "finish_calc_prob";
-
-namespace paddle {
-
-/**
- * Start Custom Calculate Probability callback type.
- *
- * @param nNode, nodes: the path will be explored. nNodes is array size.
- *                      nodes is array elements.
- *
- * @return: A custom handler id that will passed to another callback.
- */
-typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes);
-
-/**
- * Doing Custom Calculation of Probability callback type.
- *
- * @param handler: User custom handler. The return value from start calc prob.
- * @param nNode, nodes: Array. The current path.
- * @param curProb: The current log probability that neural network returns.
- *
- * @return: Log probability which user calculated, it will be updated to this
- *          path.
- * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!!
- */
-typedef real (*DiyCalcProbCallback)(
-    int handler, size_t nNodes, int* nodes, real curProb, bool atEos);
-
-/**
- * Finish Custom Calculation of Probability callback type.
- *
- * @param handler: User custom handler. The return value from start calc prob.
- */
-typedef void (*DiyStopCalcProbCallback)(int handler);
-
-static DiyCalcProbCallback gDiyProbMethod = nullptr;
-static DiyStartCalcProbCallback gDiyProbStart = nullptr;
-static DiyStopCalcProbCallback gDiyProbStop = nullptr;
-static void* gDiyProbHandle = nullptr;
-
-static void exit_diy_prob() { dlclose(gDiyProbHandle); }
-
-template <typename SymbolType>
-static inline SymbolType loadDiySymbol(const char* symbolName) {
-  void* sym = dlsym(gDiyProbHandle, symbolName);
-  CHECK(sym) << "Cannot load symbol " << symbolName << " from "
-             << FLAGS_diy_beam_search_prob_so;
-  return reinterpret_cast<SymbolType>(sym);
-}
-
-static InitFunction __init__diy_prob_method(
-    [] {
-      std::string soName = FLAGS_diy_beam_search_prob_so;
-      if (!soName.empty()) {
-        gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
-        CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
-        atexit(exit_diy_prob);
-        gDiyProbMethod =
-            loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
-        gDiyProbStart = loadDiySymbol<decltype(gDiyProbStart)>(
-            DIY_START_CALC_PROB_SYMBOL_NAME);
-        gDiyProbStop = loadDiySymbol<decltype(gDiyProbStop)>(
-            DIY_FINISH_CALC_PROB_SYMBOL_NAME);
-      }
-    },
-    std::numeric_limits<int>::max());
-
-class BeamSearchControlCallbacks {
-public:
-  RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback
-      beamSearchCandidateAdjust;
-  RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode;
-  RecurrentGradientMachine::DropCallback stopDetermineCandidates;
-
-  //! for gcc46 aggregate initialization is not very well, so we need to
-  //! explicit
-  BeamSearchControlCallbacks(
-      const RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback&
-          candidateAdjust,
-      const RecurrentGradientMachine::NormOrDropNodeCallback& norm,
-      const RecurrentGradientMachine::DropCallback& stop)
-      : beamSearchCandidateAdjust(candidateAdjust),
-        normOrDropNode(norm),
-        stopDetermineCandidates(stop) {}
-};
-
-class BeamSearchStatisticsCallbacks {
-public:
-  RecurrentGradientMachine::EachStepCallback onEachStepStarted;
-  RecurrentGradientMachine::EachStepCallback onEachStepStoped;
-
-  BeamSearchStatisticsCallbacks(
-      const RecurrentGradientMachine::EachStepCallback& start,
-      const RecurrentGradientMachine::EachStepCallback& stop)
-      : onEachStepStarted(start), onEachStepStoped(stop) {}
-};
-
-RecurrentGradientMachine::RecurrentGradientMachine(
-    const std::string& subModelName, NeuralNetwork* rootNetwork)
-    : NeuralNetwork(subModelName),
-      rootNetwork_(rootNetwork),
-      beamSearchCtrlCallbacks_(nullptr),
-      beamSearchStatistics_(nullptr) {
-  CHECK(!subModelName_.empty());
-}
-
-/**
- * bias layer, as input of memory frame 0 will give vector of zeros
- * if bias parameter is not set.
- *
- * boot bias layer create directly in recurrent gradient machine, because:
- *
- * 1. It is only one frame, so it should not be placed in layer group,
- *    which is one instance for every one frame.
- *
- * 2. It is no input layer, so it need resetHeight() before forward(),
- *    and resetHeight() must be called in recurrent gradient machine,
- *    so it's should not be placed in root network.
- */
-class BootBiasLayer : public Layer {
-protected:
-  std::unique_ptr<Weight> biases_;
-  IVectorPtr cpuIds_;
-
-public:
-  explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    if (!Layer::init(layerMap, parameterMap)) return false;
-
-    if (biasParameter_) {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-    }
-    return true;
-  }
-
-  void resetHeight(int height) {
-    if (config_.has_bos_id()) {  // used as a constant id layerConfig
-      IVector::resizeOrCreate(output_.ids, height, useGpu_);
-      output_.ids->reset((int)config_.bos_id());
-    } else {
-      resetOutput(height, getSize());
-    }
-  }
-
-  void forward(PassType passType) override {
-    if (biases_) {
-      MatrixPtr outV = getOutputValue();
-      outV->addBias(*(biases_->getW()), 1);
-      forwardActivation();
-    }
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    if (biases_ && biases_->getWGrad()) {
-      backwardActivation();
-      biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-      biases_->getParameterPtr()->incUpdate(callback);
-    }
-  }
-};
-
-void RecurrentGradientMachine::init(
-    const ModelConfig& config,
-    ParamInitCallback callback,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
-  useGpu_ = useGpu;
-
-  auto subModelConfig =
-      std::find_if(config.sub_models().begin(),
-                   config.sub_models().end(),
-                   [this](const SubModelConfig& sub_model) {
-                     return sub_model.name() == this->subModelName_;
-                   });
-  CHECK(subModelConfig != config.sub_models().end());
-  reversed_ = subModelConfig->reversed();
-  generating_ = subModelConfig->has_generator();
-
-  inFrameLines_.resize(subModelConfig->in_links_size());
-  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
-    inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name();
-    inFrameLines_[i].inLayer =
-        rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name());
-  }
-
-  outFrameLines_.resize(subModelConfig->out_links_size());
-  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
-    auto& linkPair = subModelConfig->out_links(i);
-    outFrameLines_[i].layerName = linkPair.layer_name();
-    outFrameLines_[i].agentLayer = rootNetwork_->getLayer(linkPair.link_name());
-  }
-
-  memoryFrameLines_.resize(subModelConfig->memories_size());
-  for (size_t i = 0; i < memoryFrameLines_.size(); ++i) {
-    auto& memoryConfig = subModelConfig->memories(i);
-    memoryFrameLines_[i].layerName = memoryConfig.layer_name();
-    memoryFrameLines_[i].linkName = memoryConfig.link_name();
-    auto agentConfig =
-        std::find_if(config.layers().begin(),
-                     config.layers().end(),
-                     [&memoryConfig](const LayerConfig& layerConfig) {
-                       return layerConfig.name() == memoryConfig.link_name();
-                     });
-    CHECK(agentConfig != config.layers().end());
-    if (memoryConfig.has_boot_layer_name()) {
-      memoryFrameLines_[i].rootLayer =
-          rootNetwork_->getLayer(memoryConfig.boot_layer_name());
-
-      LayerConfig scatterConfig = *agentConfig;
-      memoryFrameLines_[i].rootAgent.reset(
-          new ScatterAgentLayer(scatterConfig));
-      memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_);
-
-      memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent;
-    } else {
-      LayerConfig biasConfig = *agentConfig;
-      if (memoryConfig.has_boot_bias_parameter_name()) {
-        biasConfig.set_bias_parameter_name(
-            memoryConfig.boot_bias_parameter_name());
-        biasConfig.set_active_type(memoryConfig.boot_bias_active_type());
-      } else if (memoryConfig.has_boot_with_const_id()) {
-        biasConfig.set_bos_id(memoryConfig.boot_with_const_id());
-      }
-      memoryFrameLines_[i].biasLayer.reset(new BootBiasLayer(biasConfig));
-      memoryFrameLines_[i].biasLayer->init(LayerMap(), parameterMap_);
-
-      memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].biasLayer;
-    }
-
-    if (subModelConfig->has_generator()) {
-      memoryFrameLines_[i].scatterAgents.resize(2);
-      for (auto& agent : memoryFrameLines_[i].scatterAgents) {
-        agent.reset(new ScatterAgentLayer(*agentConfig));
-        agent->init(LayerMap(), parameterMap_);
-      }
-    }
-  }
-
-  if (subModelConfig->has_generator()) {
-    generator_.config = subModelConfig->generator();
-    eosFrameLine_.reset(new EosFrameLine);
-    maxSequenceLength_ = generator_.config.max_num_frames();
-  }
-
-  // get parameters actually used by this Layer Group
-  resizeOrCreateFrames(1);
-  for (auto& para : frames_[0]->getParameters()) {
-    if (para->getSharedCount() > 0) {
-      parameterIds_.push_back(para->getID());
-    }
-  }
-  for (auto& para : parameters_) {  // bias layer parameters
-    if (para->getSharedCount() > 0) {
-      parameterIds_.push_back(para->getID());
-    }
-  }
-}
-
-void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) {
-  if ((size_t)numFrames <= frames_.size()) {
-    return;
-  }
-
-  frames_.reserve(numFrames);
-  for (auto& inFrameLine : inFrameLines_) {
-    inFrameLine.agents.reserve(numFrames);
-  }
-  for (auto& outFrameLine : outFrameLines_) {
-    outFrameLine.frames.reserve(numFrames);
-  }
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.frames.reserve(numFrames);
-    memoryFrameLine.agents.reserve(numFrames);
-  }
-  if (eosFrameLine_) {
-    eosFrameLine_->layers.reserve(numFrames);
-  }
-
-  ParamInitCallback subParamInitCb = [this](int paramId, Parameter* para) {
-    para->enableSharedType(PARAMETER_VALUE,
-                           this->parameters_[paramId]->getBuf(PARAMETER_VALUE),
-                           this->parameters_[paramId]->getMat(PARAMETER_VALUE));
-    para->enableSharedType(
-        PARAMETER_GRADIENT,
-        this->parameters_[paramId]->getBuf(PARAMETER_GRADIENT),
-        this->parameters_[paramId]->getMat(PARAMETER_GRADIENT));
-  };
-
-  for (int i = frames_.size(); i < numFrames; ++i) {
-    std::unique_ptr<NeuralNetwork> frame(
-        NeuralNetwork::newNeuralNetwork(subModelName_));
-    frame->init(config_, subParamInitCb);
-
-    for (auto& inFrameLine : inFrameLines_) {
-      inFrameLine.agents.push_back(frame->getLayer(inFrameLine.linkName));
-    }
-
-    for (auto& outFrameLine : outFrameLines_) {
-      outFrameLine.frames.push_back(frame->getLayer(outFrameLine.layerName));
-    }
-    for (auto& memoryFrameLine : memoryFrameLines_) {
-      memoryFrameLine.frames.push_back(
-          frame->getLayer(memoryFrameLine.layerName));
-      memoryFrameLine.agents.push_back(
-          frame->getLayer(memoryFrameLine.linkName));
-    }
-    if (eosFrameLine_) {
-      eosFrameLine_->layers.push_back(
-          frame->getLayer(generator_.config.eos_layer_name()));
-    }
-
-    frames_.emplace_back(std::move(frame));
-  }
-}
-
-void RecurrentGradientMachine::resizeBootFrame(int numSequences) {
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (memoryFrameLine.biasLayer) {
-      auto biasLayer =
-          dynamic_cast<BootBiasLayer*>(memoryFrameLine.biasLayer.get());
-      CHECK_NOTNULL(biasLayer);
-      biasLayer->resetHeight(numSequences);
-    } else {  // check input root layer height
-      CHECK_EQ(numSequences,
-               memoryFrameLine.rootLayer->getOutput().getNumSequences());
-    }
-  }
-}
-
-void RecurrentGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
-  LOG(FATAL) << "should not use this function";
-}
-
-void RecurrentGradientMachine::checkInputConsistency(
-    int inlinkId, const std::vector<Argument::SeqInfo>& seqInfo) {
-  if (commonSeqInfo_.empty()) {
-    commonSeqInfo_.resize(seqInfo.size());
-    for (size_t i = 0; i < seqInfo.size(); ++i) {
-      commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength;
-      commonSeqInfo_[i].seqId = seqInfo[i].seqId;
-    }
-  } else {
-    CHECK_EQ(commonSeqInfo_.size(), seqInfo.size())
-        << " RecurrentGroup " << subModelName_ << " input " << inlinkId
-        << " has mismatched number of sequences";
-    for (size_t i = 0; i < seqInfo.size(); ++i) {
-      CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength)
-          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
-          << " has mismatched sequence length";
-      CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId)
-          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
-          << " has mismatched sequence length";
-    }
-  }
-}
-
-void RecurrentGradientMachine::calcNumSequencesAtEachStep() {
-  int numSequences = commonSeqInfo_.size();
-  numSeqs_.resize(maxSequenceLength_);
-  for (int i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) {
-      numSeqs_[j] = i + 1;
-    }
-  }
-}
-
-void RecurrentGradientMachine::reorganizeInput(PassType passType) {
-  info_.clear();
-  info_.resize(inFrameLines_.size());
-
-  commonSeqInfo_.clear();
-  seqInfos_.clear();
-  seqInfos_.resize(inFrameLines_.size());
-
-  for (size_t i = 0; i < inFrameLines_.size(); i++) {
-    const Argument& input = inFrameLines_[i].inLayer->getOutput();
-    if (!input.hasSeq()) {
-      continue;
-    }
-    input.getSeqInfo(&seqInfos_[i]);
-    checkInputConsistency(i, seqInfos_[i]);
-  }
-  CHECK(!commonSeqInfo_.empty())
-      << "At least one input needs to be sequence or subsequence";
-  maxSequenceLength_ = commonSeqInfo_[0].topLevelLength;
-
-  calcNumSequencesAtEachStep();
-
-  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
-    const Argument& input = inFrameLines_[i].inLayer->getOutput();
-    if (!input.hasSeq()) {
-      seqInfos_[i] = commonSeqInfo_;
-    }
-    createInFrameInfo(i, input, passType);
-  }
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-
-    // inFrameLine select rows in real layer one time
-    for (size_t i = 0; i < inFrameLines_.size(); i++) {
-      selectRowsOneTime(inFrameLines_[i].inLayer,
-                        info_[i].allIds,
-                        &(inFrameLines_[i].outArg),
-                        passType);
-    }
-  }
-}
-
-void RecurrentGradientMachine::reorganizeOutput(PassType passType) {
-  calcSequenceStartPositions();
-  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
-    Info info;
-    auto& outFrameLine = outFrameLines_[i];
-    ICpuGpuVectorPtr sequenceStartPositions;
-    ICpuGpuVectorPtr subSequenceStartPositions;
-    createOutFrameInfo(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-    auto gatherAgent =
-        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-    CHECK_NOTNULL(gatherAgent);
-    gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions,
-                                       subSequenceStartPositions,
-                                       info.allIds,
-                                       info.idIndex);
-  }
-}
-
-void RecurrentGradientMachine::connectFrames(PassType passType) {
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (memoryFrameLine.rootAgent) {
-      auto scatterAgent =
-          dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
-      createMemoryFrameInfo(&memoryFrameLine, passType);
-      scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer,
-                                          memoryFrameLine.outArg,
-                                          memoryFrameLine.allIds,
-                                          /* idIndex */ 0,
-                                          memoryFrameLine.allIds->getSize(),
-                                          /* handleBackward */ true);
-      if (memoryFrameLine.sequenceStartPositions) {
-        int size = memoryFrameLine.sequenceStartPositions->getSize();
-        scatterAgent->setSequenceStartPositions(
-            memoryFrameLine.sequenceStartPositions,
-            /* seqStartPosIndex */ 0,
-            size);
-      }
-    }
-  }
-
-  for (auto& outFrameLine : outFrameLines_) {
-    auto gatherAgent =
-        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-    gatherAgent->clearRealLayers();
-  }
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    // connect in_links
-    for (size_t j = 0; j < inFrameLines_.size(); ++j) {
-      Info& info = info_[j];
-      // idSize denotes the sum number of tokens in each length i
-      int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i];
-      int idSize = info.idIndex.empty() ? numSeqs_[i]
-                                        : info.idIndex[i + 1] - info.idIndex[i];
-      InFrameLine inFrameLine = inFrameLines_[j];
-      auto scatterAgent =
-          dynamic_cast<ScatterAgentLayer*>(inFrameLine.agents[i].get());
-      scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer,
-                                          inFrameLine.outArg,
-                                          info.allIds,
-                                          idIndex,
-                                          idSize,
-                                          i == 0);
-      if (info.sequenceStartPositions) {
-        // size: the length of subsequence
-        int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i];
-        scatterAgent->setSequenceStartPositions(
-            info.sequenceStartPositions, info.seqStartPosIndex[i], size);
-      }
-    }
-
-    // connect out_links
-    for (auto& outFrameLine : outFrameLines_) {
-      auto gatherAgent =
-          dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-      gatherAgent->addRealLayer(outFrameLine.frames[i]);
-    }
-    for (auto& memoryFrameLine : memoryFrameLines_) {
-      NeuralNetwork::connect(
-          memoryFrameLine.agents[i],
-          i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1],
-          numSeqs_[i] /*height of agent*/);
-    }
-  }
-}
-
-void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
-                                       std::vector<Argument>* outArgs,
-                                       PassType passType) {
-  /* inArgs and outArgs are not used.
-     The inputs are inFrameLines_[i].inLayer.
-     The outputs are outFramesLines_[i].agentLayer
-   */
-
-  if (generating_) {
-    generateSequence();
-    return;
-  }  // else forward..
-
-  reorganizeInput(passType);
-  int numSequences = commonSeqInfo_.size();
-
-  resizeOrCreateFrames(maxSequenceLength_);
-  resizeBootFrame(numSequences);
-
-  connectFrames(passType);
-
-  REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime");
-  // forward
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.bootLayer->forward(passType);
-  }
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    frames_[i]->forward(inArgs, &outArgs, passType);
-  }
-
-  reorganizeOutput(passType);
-}
-
-void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
-  if (generating_) {
-    return;
-  }
-  REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime");
-  AsyncGpuBlock asyncGpuBlock;
-  for (int i = maxSequenceLength_ - 1; i >= 0; --i) {
-    frames_[i]->backward(nullptr);
-  }
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.bootLayer->backward(nullptr);
-  }
-}
-
-void RecurrentGradientMachine::forwardBackward(
-    const std::vector<Argument>& inArgs,
-    std::vector<Argument>* outArgs,
-    PassType passType,
-    const UpdateCallback& callback) {
-  LOG(FATAL) << "should not use this function";
-}
-
-void RecurrentGradientMachine::eval(Evaluator* evaluator) const {
-  // call printers frame by frame
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin";
-    evaluator->eval(*(frames_[i].get()));
-    VLOG(2) << "Recurrent Layer Group eval frame " << i << " end";
-  }
-}
-
-void RecurrentGradientMachine::registerBeamSearchControlCallbacks(
-    const BeamSearchCandidatesAdjustCallback& adjustBeamSearch,
-    const NormOrDropNodeCallback& normOrDropNode,
-    const DropCallback& stopBeamSearch) {
-  this->removeBeamSearchControlCallbacks();
-  //! for gcc 46, aggregate initialization is not supported. TAT
-  this->beamSearchCtrlCallbacks_ = new BeamSearchControlCallbacks(
-      adjustBeamSearch, normOrDropNode, stopBeamSearch);
-}
-
-void RecurrentGradientMachine::removeBeamSearchControlCallbacks() {
-  if (this->beamSearchCtrlCallbacks_) {
-    delete this->beamSearchCtrlCallbacks_;
-    this->beamSearchCtrlCallbacks_ = nullptr;
-  }
-}
-
-void RecurrentGradientMachine::registerBeamSearchStatisticsCallbacks(
-    const EachStepCallback& onEachStepStarted,
-    const EachStepCallback& onEachStepStoped) {
-  this->removeBeamSearchStatisticsCallbacks();
-  this->beamSearchStatistics_ =
-      new BeamSearchStatisticsCallbacks(onEachStepStarted, onEachStepStoped);
-}
-
-void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
-  if (this->beamSearchStatistics_) {
-    delete this->beamSearchStatistics_;
-    this->beamSearchStatistics_ = nullptr;
-  }
-}
-
-namespace {
-void lenToStarts(std::vector<int>& starts) {
-  int pos = 0;
-  starts.back() = 0;
-  for (auto& start : starts) {
-    int tmp = start;
-    start = pos;
-    pos += tmp;
-  }
-  starts.back() = pos;
-}
-}  // namespace
-
-void RecurrentGradientMachine::calcSequenceStartPositions() {
-  std::vector<int> starts(commonSeqInfo_.size() + 1);
-  for (auto& seqInfo : commonSeqInfo_) {
-    starts[seqInfo.seqId] = seqInfo.topLevelLength;
-  }
-  lenToStarts(starts);
-  ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false);
-  std::copy(starts.begin(),
-            starts.end(),
-            sequenceStartPositions_->getMutableData(false));
-}
-
-void RecurrentGradientMachine::checkOutputConsistency(
-    OutFrameLine& outFrameLine) {
-  bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq();
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    CHECK_EQ(hasSeq, frame->getOutput().hasSeq());
-    int numSequences = frame->getOutput().getNumSequences();
-    CHECK_EQ(numSeqs_[i], numSequences);
-  }
-}
-
-void RecurrentGradientMachine::createOutFrameInfo(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  checkOutputConsistency(outFrameLine);
-
-  if (!outFrameLine.frames[0]->getOutput().hasSeq()) {
-    createOutFrameInfo_seq(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-  } else {
-    createOutFrameInfo_subseq(
-        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
-  }
-}
-
-void RecurrentGradientMachine::createOutFrameInfo_seq(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  std::vector<int> allIds;
-  info.idIndex.resize(1, 0);  // first idIndex = 0
-
-  const int* starts = sequenceStartPositions_->getData(false);
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    for (size_t j = 0; j < numSequences; ++j) {
-      int seqStart = starts[commonSeqInfo_[j].seqId];
-      int seqLength = commonSeqInfo_[j].topLevelLength;
-      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                 : (seqStart + i));
-    }
-    info.idIndex.push_back(allIds.size());
-  }
-  sequenceStartPositions = sequenceStartPositions_;
-  copyScattedId(allIds, &info.allIds, allIds.size());
-  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-void RecurrentGradientMachine::createOutFrameInfo_subseq(
-    OutFrameLine& outFrameLine,
-    Info& info,
-    ICpuGpuVectorPtr& sequenceStartPositions,
-    ICpuGpuVectorPtr& subSequenceStartPositions) {
-  size_t numSequences = commonSeqInfo_.size();
-  std::vector<int> allIds;
-  info.idIndex.resize(1, 0);  // first idIndex = 0
-
-  const int* starts = sequenceStartPositions_->getData(false);
-  std::vector<int> subStarts(starts[numSequences] + 1);
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    const int* seqStarts =
-        frame->getOutput().sequenceStartPositions->getData(false);
-    for (size_t j = 0; j < numSequences; ++j) {
-      subStarts[starts[commonSeqInfo_[j].seqId] + i] =
-          seqStarts[j + 1] - seqStarts[j];
-    }
-  }
-  lenToStarts(subStarts);
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    LayerPtr frame = outFrameLine.frames[i];
-    size_t numSequences = frame->getOutput().getNumSequences();
-    for (size_t j = 0; j < numSequences; ++j) {
-      int pos = starts[commonSeqInfo_[j].seqId] + i;
-      int subSeqStart = subStarts[pos];
-      int subSeqEnd = subStarts[pos + 1];
-      for (int k = subSeqStart; k < subSeqEnd; ++k) {
-        allIds.push_back(k);
-      }
-    }
-    info.idIndex.push_back(allIds.size());
-  }
-
-  ICpuGpuVector::resizeOrCreate(
-      subSequenceStartPositions, subStarts.size(), false);
-  int* cpuSubSequenceStartPositions =
-      subSequenceStartPositions->getMutableData(false);
-  std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions);
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  int* cpuSequenceStartPositions =
-      sequenceStartPositions->getMutableData(false);
-  for (size_t i = 0; i <= numSequences; ++i) {
-    cpuSequenceStartPositions[i] = subStarts[starts[i]];
-  }
-  copyScattedId(allIds, &info.allIds, allIds.size());
-  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-/* create scattered id infomation for all realLayer of inFrameLines one time.
- * If hasSubseq, will also create scattered sequenceStartPositions infomation
- * for all realLayer of inFrameLines one time.
- */
-void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
-                                                 const Argument& input,
-                                                 PassType passType) {
-  if (!input.hasSeq()) {
-    createInFrameInfo_nonseq(inlinkId, input, passType);
-  } else if (!input.hasSubseq()) {
-    createInFrameInfo_seq(inlinkId, input, passType);
-  } else {
-    createInFrameInfo_subseq(inlinkId, input, passType);
-  }
-}
-
-void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId,
-                                                        const Argument& input,
-                                                        PassType passType) {
-  std::vector<int> allIds;
-
-  auto& seqInfo = seqInfos_[inlinkId];
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.clear();
-  for (size_t i = 0; i < seqInfo.size(); ++i) {
-    allIds.push_back(seqInfo[i].seqId);
-  }
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-}
-
-void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId,
-                                                     const Argument& input,
-                                                     PassType passType) {
-  std::vector<int> allIds;
-  auto& seqInfo = seqInfos_[inlinkId];
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
-
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    for (int j = 0; j < numSeqs_[i]; ++j) {
-      int seqLength = seqInfo[j].topLevelLength;
-      int seqStart = seqInfo[j].seqStart;
-      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                 : (seqStart + i));
-    }
-    inlinkInfo->idIndex.push_back(allIds.size());
-  }
-
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-  CHECK_EQ(inlinkInfo->idIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-}
-void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId,
-                                                        const Argument& input,
-                                                        PassType passType) {
-  std::vector<int> allIds;
-
-  auto& seqInfo = seqInfos_[inlinkId];
-
-  Info* inlinkInfo = &info_[inlinkId];
-  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
-  std::vector<int> sequenceStartPositions;
-  const int* subSequenceStartPositions = nullptr;
-
-  subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
-  inlinkInfo->seqStartPosIndex.clear();
-  inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    sequenceStartPositions.push_back(0);  // first element = 0
-    for (int j = 0; j < numSeqs_[i]; ++j) {
-      int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
-      int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
-      for (int k = subSeqStart; k < subSeqEnd; ++k) {
-        allIds.push_back(k);
-      }
-      sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                       subSeqEnd - subSeqStart);
-    }
-    inlinkInfo->idIndex.push_back(allIds.size());
-    inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
-  }
-  // inFrameLine create sequenceStartPositions one time
-  CHECK_EQ(
-      sequenceStartPositions.size(),
-      static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
-  CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-  createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
-
-  // copy and check scatterId
-  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
-  CHECK_EQ(inlinkInfo->idIndex.size(),
-           static_cast<size_t>(maxSequenceLength_ + 1));
-}
-
-/* like createInFrameInfo, but for all realLayer of memoryFrameLines*/
-void RecurrentGradientMachine::createMemoryFrameInfo(
-    MemoryFrameLine* memoryFrameLine, PassType passType) {
-  const Argument& input = (*memoryFrameLine).rootLayer->getOutput();
-  size_t numSequences = input.getNumSequences();
-  std::vector<int> allIds;
-  bool seqFlag = input.hasSeq();
-  CHECK(!input.hasSubseq())
-      << "Subsequence boot layer for memory is not supported";
-
-  if (seqFlag) {  // for sequenceScatterAgentLayer
-    std::vector<int> sequenceStartPositions;
-    sequenceStartPositions.push_back(0);  // first element = 0
-    const int* starts = input.sequenceStartPositions->getData(false);
-    for (size_t i = 0; i < numSequences; ++i) {
-      // memory info adopt info of inlinks[0]
-      int seqId = seqInfos_[0][i].seqId;
-      for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) {
-        allIds.push_back(k);
-      }
-      sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                       starts[seqId + 1] - starts[seqId]);
-    }
-    createSeqPos(sequenceStartPositions,
-                 &(*memoryFrameLine).sequenceStartPositions);
-
-  } else {  // for scatterAgentLayer
-    for (size_t i = 0; i < numSequences; ++i) {
-      allIds.push_back(seqInfos_[0][i].seqId);
-    }
-  }
-  // copy and check scatterId
-  copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize());
-  // memoryFrameLine select rows in real layer one time
-  selectRowsOneTime((*memoryFrameLine).rootLayer,
-                    (*memoryFrameLine).allIds,
-                    &(*memoryFrameLine).outArg,
-                    passType);
-}
-
-void RecurrentGradientMachine::copyScattedId(std::vector<int>& srcIds,
-                                             IVectorPtr* dstIds,
-                                             int size) {
-  int idSize = srcIds.size();
-  CHECK_EQ(idSize, size);
-  IVector::resizeOrCreate(*dstIds, idSize, useGpu_);
-  (*dstIds)->copyFrom(srcIds.data(), idSize);
-  // check
-  std::sort(srcIds.begin(), srcIds.end());
-  for (int i = 0; i < idSize; ++i) {
-    CHECK_EQ(srcIds[i], i);
-  }
-}
-
-void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer,
-                                                 const IVectorPtr& allIds,
-                                                 Argument* arg,
-                                                 PassType passType) {
-  Argument& src = layer->getOutput();
-  if (src.value) {
-    const MatrixPtr& realV = src.value;
-    int height = realV->getHeight();
-    int width = realV->getWidth();
-    Matrix::resizeOrCreate(
-        arg->value, height, width, /* trans */ false, useGpu_);
-    arg->value->zeroMem();
-    arg->value->selectRows(*realV, *allIds);
-    if (passType != PASS_TEST) {
-      Matrix::resizeOrCreate(
-          arg->grad, height, width, /* trans */ false, useGpu_);
-      arg->grad->zeroMem();
-    }
-  }
-  if (src.ids) {
-    IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_);
-    arg->ids->selectFrom(*src.ids, *allIds);
-  }
-}
-
-void RecurrentGradientMachine::createSeqPos(
-    const std::vector<int>& sequenceStartPosition,
-    ICpuGpuVectorPtr* sequenceStartPositions) {
-  int size = sequenceStartPosition.size();
-  const int* data = sequenceStartPosition.data();
-  ICpuGpuVector::resizeOrCreate(*sequenceStartPositions, size, false);
-  (*sequenceStartPositions)->copyFrom(data, size, false);
-}
-
-size_t RecurrentGradientMachine::getGenBatchSize() {
-  size_t numSequences = 0;
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (!memoryFrameLine.rootLayer) continue;
-    Argument& bootArg = memoryFrameLine.rootLayer->getOutput();
-    size_t batchSize = bootArg.getNumSequences();
-    if (numSequences) {
-      CHECK_EQ(numSequences, batchSize);
-    } else {
-      numSequences = batchSize;
-    }
-  }
-  CHECK(numSequences)
-      << "Fail to get batch size in generation. "
-         "At least one of the Memory layer MUST have a layer that is NOT in "
-         "the layer group to boot it, and this boot layer is used to "
-         "decide batch_size in generation process.";
-  return numSequences;
-}
-
-void RecurrentGradientMachine::generateSequence() {
-  CHECK_NOTNULL(eosFrameLine_.get());
-  CHECK_GE(outFrameLines_.size(), 1UL);
-  size_t numSequences = getGenBatchSize();
-
-  resizeBootFrame(numSequences);
-  // We create only two sub-network in generation, one stores states of all
-  // layers in previous time step and the other storing the states at current
-  // time step.
-  resizeOrCreateFrames(2);
-
-  // outFrameLines_.size() > 1UL
-  dataArgsSize_ = outFrameLines_.size() - 1;
-  dataArgs_.resize(dataArgsSize_);
-  dataArgsFrame_.clear();
-  dataArgsFrame_.resize(dataArgsSize_);
-
-  // connect boot frame memory links
-  std::vector<int> ids(numSequences);
-  for (size_t i = 0; i < numSequences; ++i) {
-    ids[i] = i;
-  }
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    if (memoryFrameLine.rootAgent) {
-      auto scatterAgent =
-          dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
-      scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids);
-    }
-    NeuralNetwork::connect(
-        memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size());
-  }
-
-  // boot layer forward
-  AsyncGpuBlock asyncGpuBlock;
-
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    memoryFrameLine.bootLayer->forward(PASS_TEST);
-  }
-
-  // init outArg
-  size_t resultNum = generator_.config.num_results_per_sample();
-  size_t maxGenWordCount =
-      generator_.config.max_num_frames() * numSequences * resultNum;
-  IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false);
-  if (resultNum > 1) {
-    CHECK_LE(resultNum, static_cast<size_t>(generator_.config.beam_size()));
-    Matrix::resizeOrCreate(generator_.outArg.in,
-                           /* height */ numSequences,
-                           /* width */ resultNum,
-                           false,
-                           /* useGpu */ false);
-  }
-  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
-                                numSequences + 1,
-                                /* useGpu */ false);
-  if (getBeamSize() > 1) {
-    beamSearch(numSequences);
-  } else {
-    oneWaySearch(numSequences);
-  }
-  if (dataArgsSize_) createDataOutlink();
-
-  size_t size = generator_.ids.size();
-  generator_.outArg.ids->resize(size);
-  generator_.outArg.ids->copyFrom(generator_.ids.data(), size);
-
-  OutFrameLine& outFrameLine = outFrameLines_[0];
-  auto dataAgent = dynamic_cast<DataLayer*>(outFrameLine.agentLayer.get());
-  CHECK_NOTNULL(dataAgent);
-  dataAgent->setData(generator_.outArg);
-  dataAgent->prefetch();
-}
-
-void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
-  OutFrameLine& outFrameLine = outFrameLines_[0];
-
-  // finalPaths_[0] stores the generated results of the
-  // entire batch, so its size exactly equals to batchSize.
-  finalPaths_.clear();
-  finalPaths_.resize(1);
-  std::vector<Path>& finalPaths = finalPaths_[0];
-  finalPaths.resize(batchSize);
-
-  seqIds_.resize(batchSize);
-  std::vector<int> scatterIds;
-  for (size_t i = 0; i < batchSize; ++i) {
-    finalPaths[i].seqId = i;
-    seqIds_[i] = i;
-  }
-
-  // forward
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    if (i && scatterIds.empty()) break;
-    int machineCur = i % 2;
-    int machinePrev = (i - 1) % 2;
-    // connect memory links
-    if (i) {
-      seqIds_.clear();
-      for (size_t j = 0; j < batchSize; ++j) {
-        if (finalPaths[j].seqId != -1) seqIds_.push_back(j);
-      }
-
-      for (auto& memoryFrameLine : memoryFrameLines_) {
-        auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
-            memoryFrameLine.scatterAgents[machineCur].get());
-        scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                                   scatterIds);
-        scatterAgent->forward(PASS_TEST);
-        NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
-                               memoryFrameLine.scatterAgents[machineCur]);
-      }
-    }
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
-
-    const IVectorPtr& idVec = outFrameLine.frames[machineCur]->getOutput().ids;
-    for (size_t j = 0; j < seqIds_.size(); ++j) {
-      finalPaths[seqIds_[j]].ids.push_back(idVec->getElement(j));
-      finalPaths[seqIds_[j]].machineIdVec.push_back(j);
-    }
-
-    copyDataOutlinkFrame(machineCur);
-
-    // check eos
-    const IVectorPtr& eosVec =
-        eosFrameLine_->layers[machineCur]->getOutput().ids;
-    scatterIds.clear();
-    for (size_t j = 0; j < seqIds_.size(); ++j) {
-      if (eosVec->getElement(j) == 1U) {
-        // path.seqId = -1 indicates end of generation
-        // of an input sequence
-        finalPaths[seqIds_[j]].seqId = -1;
-      } else {
-        scatterIds.push_back(j);
-      }
-    }
-  }
-
-  batchMachineIdVec_.clear();
-  batchMachineStartPos_.clear();
-  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
-  starts[0] = 0;
-  generator_.ids.clear();
-  for (size_t i = 0; i < batchSize; ++i) {
-    generator_.ids.insert(generator_.ids.end(),
-                          finalPaths[i].ids.begin(),
-                          finalPaths[i].ids.end());
-    starts[i + 1] = generator_.ids.size();
-    batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                              finalPaths[i].machineIdVec.begin(),
-                              finalPaths[i].machineIdVec.end());
-  }
-}
-
-void RecurrentGradientMachine::connectPrevFrame(int stepId,
-                                                std::vector<Path>& paths) {
-  int machineCur = stepId % 2;
-  int machinePrev = (stepId - 1) % 2;
-  int beam = getBeamSize();
-  machineIds_.clear();
-  topIds_.clear();
-  seqIds_.clear();
-
-  for (size_t j = 0; j < paths.size(); ++j) {
-    machineIds_.push_back(paths[j].machineId);
-    topIds_.push_back(paths[j].machineId * beam + paths[j].topIndex);
-    seqIds_.push_back(paths[j].seqId);
-  }
-
-  for (auto& memoryFrameLine : memoryFrameLines_) {
-    bool isOutIds = (memoryFrameLine.layerName == outFrameLines_[0].layerName);
-    auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
-        memoryFrameLine.scatterAgents[machineCur].get());
-    scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                               isOutIds ? topIds_ : machineIds_);
-    scatterAgent->forward(PASS_TEST);
-    NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
-                           memoryFrameLine.scatterAgents[machineCur]);
-  }
-}
-
-void RecurrentGradientMachine::forwardFrame(int machineCur) {
-  // forward
-  const std::vector<Argument> inArgs;
-  std::vector<Argument> outArgs;
-  frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
-
-  copyDataOutlinkFrame(machineCur);
-
-  IVectorPtr& ids = outFrameLines_[0].frames[machineCur]->getOutput().ids;
-  MatrixPtr in = outFrameLines_[0].frames[machineCur]->getOutput().in;
-  IVectorPtr& eos = eosFrameLine_->layers[machineCur]->getOutput().ids;
-  if (useGpu_) {
-    IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */);
-    cpuId_->copyFrom(*ids);
-    Matrix::resizeOrCreate(cpuProb_,
-                           in->getHeight(),
-                           in->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    cpuProb_->copyFrom(*in);
-    IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */);
-    cpuEos_->copyFrom(*eos);
-  } else {
-    cpuId_ = ids;
-    cpuProb_ = in;
-    cpuEos_ = eos;
-  }
-}
-
-void RecurrentGradientMachine::singlePathExpand(Path& curPath,
-                                                size_t curPathId,
-                                                std::vector<Path>& newPaths,
-                                                size_t expandWidth) {
-  int calc_id =
-      gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0;
-
-  const int* idVec = cpuId_->getData();
-  const real* probMat = cpuProb_->getData();
-  const int* eosVec = cpuEos_->getData();
-
-  for (size_t k = 0; k < expandWidth; k++) {
-    int index = curPathId * expandWidth + k;
-    int id = idVec[index];
-    real prob = probMat[index];
-    /*
-     * Ordinarily, beam search greedily expands the most promising expandWidth
-     * paths that currently are ALWAYS returned by MaxIdLayer.
-     * In one condition, if user customizes the beam search procedure by
-     * restricting the expansion within a user defined subset,
-     * as a result, MaxIdLayer possibly COULD NOT return expandWidth
-     * vaild expansions, and it will use -1 to indicate the end of valid
-     * expansion candidates.
-     */
-    if (id == -1) break;
-
-    real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob;
-    Path newPath(
-        curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/);
-    if (this->beamSearchCtrlCallbacks_) {
-      if (beamSearchCtrlCallbacks_->stopDetermineCandidates(
-              newPath.seqId, newPath.ids, newPath.probHistory))
-        return;
-    }
-    // outFrameLines_.size() > 1UL
-    if (dataArgsSize_) {
-      newPath.machineIdVec = curPath.machineIdVec;
-      newPath.machineIdVec.push_back(curPathId);
-    }
-    bool atEos =
-        eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_;
-    // adjustNewPath
-    newPath.adjustProb(calc_id, atEos);
-    if (this->beamSearchCtrlCallbacks_) {
-      this->beamSearchCtrlCallbacks_->normOrDropNode(
-          newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb);
-    }
-    if (!newPath.isDropable()) {
-      atEos ? finalPaths_[curPath.seqId].push_back(newPath)
-            : newPaths.push_back(newPath);
-    }
-  }  // for expandWidth
-
-  if (gDiyProbStop) {
-    gDiyProbStop(calc_id);
-  }
-}
-
-void RecurrentGradientMachine::beamExpand(std::vector<Path>& paths,
-                                          std::vector<Path>& newPaths) {
-  size_t candidatePathCount = paths.size();
-  // idVec.size() could be larger than candidatePathCount * beam,
-  // so user can drop some node customly.
-  CHECK_EQ(cpuId_->getSize() % candidatePathCount, 0UL);
-  size_t expandWidth = cpuId_->getSize() / candidatePathCount;
-
-  // iterate over each sequence
-  size_t totalExpandCount = 0;
-  int prevSeqId = -1;
-  int curSeqId = 0;
-  for (size_t j = 0; j <= candidatePathCount; j++) {
-    // expansions of a single sequence are all processed
-    curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1);
-    if (prevSeqId != -1 && curSeqId != prevSeqId) {
-      totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount);
-    }
-    if (j == candidatePathCount) return;
-    singlePathExpand(paths[j], j, newPaths, expandWidth);
-
-    prevSeqId = paths[j].seqId;
-  }  // for paths
-}
-
-// Drop extra nodes to beam size.
-size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths,
-                                            size_t seqId,
-                                            size_t totalExpandCount) {
-  size_t minNewPathSize =
-      std::min(getBeamSize(), newPaths.size() - totalExpandCount);
-  if (!minNewPathSize) {
-    return 0;
-  }
-  std::nth_element(newPaths.begin() + totalExpandCount,
-                   newPaths.begin() + totalExpandCount + minNewPathSize,
-                   newPaths.end(),
-                   Path::greaterPath);
-  newPaths.resize(totalExpandCount + minNewPathSize);
-
-  real minPathLogProb =
-      std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
-          ->logProb;
-  real maxPathLogProb =
-      std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
-          ->logProb;
-
-  // Remove the already formed paths that are relatively short
-  finalPaths_[seqId].erase(
-      std::remove_if(finalPaths_[seqId].begin(),
-                     finalPaths_[seqId].end(),
-                     [&](Path& p) { return p.logProb < minPathLogProb; }),
-      finalPaths_[seqId].end());
-  for (auto p : finalPaths_[seqId]) {
-    if (minFinalPathLogProb_[seqId] > p.logProb) {
-      minFinalPathLogProb_[seqId] = p.logProb;
-    }
-  }
-
-  if (finalPaths_[seqId].size() >= getBeamSize() &&
-      minFinalPathLogProb_[seqId] >= maxPathLogProb) {
-    newPaths.resize(totalExpandCount);
-    return 0;
-  }
-  return minNewPathSize;
-}
-
-void RecurrentGradientMachine::fillGenOutputs() {
-  size_t numResults = generator_.config.num_results_per_sample();
-  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size());
-    std::partial_sort(finalPaths_[i].begin(),
-                      finalPaths_[i].begin() + minFinalPathsSize,
-                      finalPaths_[i].end(),
-                      Path::greaterPath);
-    finalPaths_[i].resize(minFinalPathsSize);
-  }
-
-  generator_.ids.clear();
-  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
-  starts[0] = 0;
-  if (numResults > 1) {
-    int idsProbSaveSize = 0;
-    for (auto inSeq : finalPaths_) {
-      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
-      idsProbSaveSize += inSeq.size();
-    }
-    Matrix::resizeOrCreate(
-        generator_.outArg.value, idsProbSaveSize, 1, false, false);
-    real* idsProb = generator_.outArg.value->getData();
-
-    real* probs = generator_.outArg.in->getData();
-    size_t curPos = 0;
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-        Path& path = finalPaths_[i][j];
-        size_t genLen = path.ids.size();
-        generator_.ids.push_back(genLen);  // sequence size
-        generator_.ids.insert(
-            generator_.ids.end(), path.ids.begin(), path.ids.end());
-        generator_.ids.push_back(-1);  // end of sequence
-
-        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
-        curPos += genLen;
-        idsProb[curPos++] = -1.0;
-        probs[i * numResults + j] = path.logProb;
-      }
-      starts[i + 1] = generator_.ids.size();
-    }
-  } else {
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      CHECK(!finalPaths_[i].empty());
-      Path& path = finalPaths_[i][0];
-      generator_.ids.insert(
-          generator_.ids.end(), path.ids.begin(), path.ids.end());
-      starts[i + 1] = starts[i] + path.ids.size();
-    }
-  }
-}
-
-void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
-  for (size_t i = 0; i < dataArgsSize_; i++) {
-    Argument outFrame;
-    outFrame.resizeAndCopyFrom(
-        outFrameLines_[i + 1].frames[machineCur]->getOutput(), useGpu_);
-    dataArgsFrame_[i].emplace_back(outFrame);
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
-    bool isSeq, std::vector<Argument>& outArgs) {
-  batchMachineIdVec_.clear();
-
-  size_t seqIdx = 0;
-  for (size_t i = 0; i < finalPaths_.size(); ++i) {
-    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
-      if (isSeq) {
-        for (size_t i = 0; i < machineIdVec.size(); ++i) {
-          size_t rowId = machineIdVec[i];
-          int* seqPos =
-              outArgs[i].sequenceStartPositions->getMutableData(false);
-          batchMachineIdVec_.push_back(seqPos[rowId]);
-        }
-      } else {
-        batchMachineIdVec_.insert(
-            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
-      }
-      seqIdx++;
-    }
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
-    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
-  size_t totalSeqNum = std::accumulate(
-      finalPaths_.begin(),
-      finalPaths_.end(),
-      0UL,
-      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
-  copySize.resize(totalSeqNum, 1);
-
-  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
-  if (isSeq) {
-    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
-    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
-             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
-    int* starts = inputSeqStartPos->getMutableData(false);
-    int seqId = 0;
-    for (size_t i = 0; i < finalPaths_.size(); ++i) {
-      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
-        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
-                                            : starts[j + 1] - starts[j];
-        batchMachineStartPos_[seqId + 1] =
-            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
-        seqId++;
-      }
-    }
-  } else {
-    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
-      batchMachineStartPos_[i + 1] =
-          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
-  }
-}
-
-void RecurrentGradientMachine::createDataOutlink() {
-  for (size_t i = 0; i < dataArgsSize_; i++) {
-    bool isSeq = dataArgsFrame_[i][0].hasSeq();
-    std::vector<int> copySize;
-    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
-    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
-
-    dataArgs_[i].concat(dataArgsFrame_[i],
-                        batchMachineIdVec_,
-                        batchMachineStartPos_,
-                        copySize,
-                        useGpu_,
-                        HPPL_STREAM_1,
-                        PASS_TEST);
-    auto dataAgent =
-        dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
-    CHECK_NOTNULL(dataAgent);
-    dataAgent->setData(dataArgs_[i]);
-  }
-}
-
-void RecurrentGradientMachine::beamSearch(size_t batchSize) {
-  finalPaths_.clear();
-  finalPaths_.resize(batchSize);
-  seqIds_.resize(batchSize);
-  minFinalPathLogProb_.clear();
-  minFinalPathLogProb_.resize(batchSize, 0);
-
-  std::vector<Path> paths;
-  std::vector<Path> newPaths;
-  for (size_t i = 0; i < batchSize; ++i) {
-    paths.push_back(Path(i));
-    if (this->beamSearchCtrlCallbacks_) {
-      paths.back().recordHistory();
-    }
-  }
-
-  // restart beam search
-  stopBeamSearch_ = false;
-  for (int i = 0; i < maxSequenceLength_; ++i) {
-    int machineCur = i % 2;
-    std::unique_ptr<
-        ScopedCallbacks<const RecurrentGradientMachine::EachStepCallback&, int>>
-        statisticsBlock;
-    if (this->beamSearchStatistics_) {
-      auto ptr =
-          new ScopedCallbacks<const RecurrentGradientMachine::EachStepCallback&,
-                              int>(beamSearchStatistics_->onEachStepStarted,
-                                   beamSearchStatistics_->onEachStepStoped,
-                                   i);
-      statisticsBlock.reset(ptr);
-    }
-    if (stopBeamSearch_) break;
-
-    if (i) connectPrevFrame(i, paths);
-
-    if (this->beamSearchCtrlCallbacks_) {
-      std::vector<std::vector<int>*> prefixes;
-      prefixes.resize(paths.size());
-      std::transform(
-          paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) {
-            return const_cast<std::vector<int>*>(&p.ids);
-          });
-      beamSearchCtrlCallbacks_->beamSearchCandidateAdjust(
-          prefixes, frames_[machineCur].get(), i);
-    }
-
-    forwardFrame(machineCur);
-    beamExpand(paths, newPaths);
-    if (newPaths.empty()) break;
-
-    paths = newPaths;
-    newPaths.clear();
-  }  // end for machineCur
-  fillGenOutputs();
-}
-
-void RecurrentGradientMachine::Path::adjustProb(int calc_id, bool atEos) {
-  if (gDiyProbMethod) {
-    logProb = gDiyProbMethod(calc_id, ids.size(), ids.data(), logProb, atEos);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
deleted file mode 100644
index 0032b72cdae44588af976f1ac542149545f551f1..0000000000000000000000000000000000000000
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ /dev/null
@@ -1,580 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include "GradientMachine.h"
-#include "NeuralNetwork.h"
-
-#include "paddle/utils/Locks.h"
-
-namespace paddle {
-
-/**
- * Private data class declares.
- * Used for user customized beam search.
- */
-class BeamSearchControlCallbacks;
-class BeamSearchStatisticsCallbacks;
-
-class RecurrentGradientMachine : public NeuralNetwork {
-public:
-  RecurrentGradientMachine(const std::string& subModelName,
-                           NeuralNetwork* rootNetwork);
-
-  // Disable copy and assign.
-  RecurrentGradientMachine(const RecurrentGradientMachine& other) = delete;
-  RecurrentGradientMachine& operator=(const RecurrentGradientMachine& other) =
-      delete;
-
-  virtual ~RecurrentGradientMachine() {
-    this->removeBeamSearchStatisticsCallbacks();
-    this->removeBeamSearchControlCallbacks();
-  }
-
-  virtual void init(const ModelConfig& config,
-                    ParamInitCallback callback,
-                    const std::vector<ParameterType>& parameterTypes,
-                    bool useGpu);
-
-  virtual void prefetch(const std::vector<Argument>& inArgs);
-
-  virtual void forward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType);
-
-  virtual void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardBackward(const std::vector<Argument>& inArgs,
-                       std::vector<Argument>* outArgs,
-                       PassType passType,
-                       const UpdateCallback& callback);
-
-  virtual void resetState() {}
-  virtual void eval(Evaluator* evaluator) const;
-
-  const std::vector<int>& getParameterIds() { return parameterIds_; }
-
-  /**
-   * @brief BeamSearchCandidatesAdjustCallback
-   *
-   * Adjust searching candidates to restrict beam search
-   * searching within a limited subset of all possibile paths.
-   *
-   * The first parameter is the prefixes of all formed paths in current
-   * beam search step, whose type is basically int[][].
-   *
-   * The second parameter is a pointer to the network used to generate sequence,
-   * user can use this pointer to tranverse each layer in the network to
-   * modify behaivors of a particular layer.
-   *
-   * The third parameter is an integer to indicate the iteration number of
-   * beam search, so that user can customize different operations in different
-   * beam search iterations.
-   */
-  typedef std::function<void(
-      const std::vector<std::vector<int>*>&, NeuralNetwork*, const int)>
-      BeamSearchCandidatesAdjustCallback;
-
-  /**
-   * @brief DropCallback
-   *
-   * Drop a whole prefix or one candidate in beam search or not.
-   *
-   * The first parameter is sequence index in a batch
-   *
-   * The second parameter is one path in beam search,
-   * which is made up of node indices.
-   *
-   * The third parameter is probabilites for each node in this path.
-   *
-   * Return true if this prefix or candidate is expected to be dropped.
-   */
-  typedef std::function<bool(
-      int seqId, const std::vector<int>&, const std::vector<real>&)>
-      DropCallback;
-
-  /**
-   * @brief NormOrDropNodeCallback
-   *
-   * Normalize a path's probabilities or just drop it by modifying path.logProb
-   *
-   * The first parameter is sequence index in a batch
-   *
-   * The second parameter is path.ids
-   *
-   * The third parameter is probabilites for each node in this path.
-   *
-   * The fourth parameter is the probability of the whole path.
-   */
-  typedef std::function<void(
-      int seqId, const std::vector<int>&, std::vector<real>&, real*)>
-      NormOrDropNodeCallback;
-
-  /**
-   * @brief Register beam search control callbacks. Used for prediction.
-   *
-   * @param queryBeamSearch: Give the sequences already formed, return the
-   * nodes expected to be expanded.
-   * Input: A pointer to an array holding pathes which have been expanded
-   * Return: A pointer to an array holding nodes wanted to be expanded.
-   *
-   * @param dropOneNode: Early drop a node in one beam search step.
-   * Given the path formed and probability history, decide whether a node
-   * should be dropped or not.
-   *
-   * @param stopBeamSearch: Early stop a path in one beam search step.
-   * Given the path and probability history, decide whether a path
-   * should be dropped or not.
-   */
-  void registerBeamSearchControlCallbacks(
-      const BeamSearchCandidatesAdjustCallback& adjustBeamSearch,
-      const NormOrDropNodeCallback& normOrDropNode,
-      const DropCallback& stopBeamSearch);
-
-  /**
-   * @brief Remove user costumized beam search callbacks,
-   *
-   * make sequence generation acts like normal beam search.
-   */
-  void removeBeamSearchControlCallbacks();
-
-  /**
-   * @brief EachStepCallback
-   *
-   * Invoke with beam search step.
-   */
-  typedef std::function<void(int)> EachStepCallback;
-
-  /**
-   * @brief register statistics methods for performance profile of beam search.
-   *
-   * @param onEachStepStarted: invoke once a beam search step starts.
-   * Its input is index of the beam search step.
-   *
-   * @param onEachStepStoped: invoke once a beam search step ends.
-   * Its input is index of the beam search step.
-   */
-  void registerBeamSearchStatisticsCallbacks(
-      const EachStepCallback& onEachStepStarted,
-      const EachStepCallback& onEachStepStoped);
-
-  /**
-   * @brief Remove beam search callbacks.
-   */
-  void removeBeamSearchStatisticsCallbacks();
-
-  /**
-   * @brief Stop beam search for current source.
-   *
-   * Will restart beam search in the next forward
-   */
-  void stopBeamSearch();
-
-  struct Path {
-    /**
-     * @brief ids, path of beam search.
-     */
-    std::vector<int> ids;
-
-    /**
-     * @brief idsProb, log probability of each generated word.
-     */
-    std::vector<real> idsProb;
-
-    /**
-     * @brief logProb, current probability of path.
-     */
-    real logProb;
-
-    int machineId;  // index of sample in frame
-    int topIndex;   // index of MaxIdLayer output in one sample
-    int seqId;      // index of sequence in batch generation
-    std::vector<int> machineIdVec;
-
-    /**
-     * @brief A record of each node's probality in a formed path in beam search.
-     *
-     * @note  It could be empty when history is not recorded. If the history is
-     *        wanted to be recorded, recordHistory() MUST be invoked first.
-     */
-    std::vector<real> probHistory;
-
-    /**
-     * @brief Path default ctor, first logProb is 0.
-     */
-    Path() {
-      logProb = 0;
-      seqId = 0;
-    }
-    explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; }
-
-    /**
-     * @brief Create a new path based on an old path and
-     * a new node with probability.
-     *
-     * @param old       old path
-     * @param newId     index of the new node
-     * @param logProb   probability of the new node.
-     * @param machineId sample index of a frame in RNN
-     * @param topIndex  index of MaxIdLayer output in one sample
-     */
-    Path(Path& old, int newId, real logProb, int machineId, int topIndex)
-        : ids(old.ids),
-          idsProb(old.idsProb),
-          logProb(old.logProb + logProb),
-          machineId(machineId),
-          topIndex(topIndex),
-          seqId(old.seqId) {
-      ids.push_back(newId);
-      idsProb.push_back(logProb);
-      if (!old.probHistory.empty()) {
-        this->probHistory = old.probHistory;
-        // probHistory store current prob, not sum
-        this->probHistory.push_back(logProb);
-      }
-    }
-
-    /**
-     * @brief operator <
-     *
-     * Path a < Path b means log probability of a is smaller than that of b
-     */
-    bool operator<(const Path& other) const {
-      return (logProb < other.logProb);
-    }
-
-    static bool greaterPath(const Path& a, const Path& b) { return (b < a); }
-
-    /**
-     * @brief Start recording history in this path.
-     */
-    void recordHistory() { this->probHistory.push_back(this->logProb); }
-
-    /**
-     * @brief Adjust probability for DIY beam search interface.
-     * In normal situation, it will do nothing.
-     *
-     * @param calc_id: the object id for DIY beam search interface.
-     * @param atEos: at end of sequence or not.
-     */
-    void adjustProb(int calc_id, bool atEos = false);
-
-    /**
-     * @brief isDropable indacating whether the current node will be
-     * dropped or not in beam search.
-     *
-     * @note: if logProb is -inf, current node will be dropped.
-     * @return true to drop the current node.
-     */
-    bool isDropable() const { return std::isinf(logProb) && logProb < 0; }
-  };
-
-  /**
-   * @brief access beam search results.
-   * @return beam search results.
-   */
-  const std::vector<std::vector<Path>>& getFinalPaths() const {
-    return this->finalPaths_;
-  }
-
-protected:
-  std::vector<Argument::SeqInfo> commonSeqInfo_;
-  ICpuGpuVectorPtr sequenceStartPositions_;
-  void calcSequenceStartPositions();
-  void checkInputConsistency(int inlinkId,
-                             const std::vector<Argument::SeqInfo>& seqInfo);
-  void reorganizeInput(PassType passType);
-  void reorganizeOutput(PassType passType);
-  void connectFrames(PassType passType);
-  void calcNumSequencesAtEachStep();
-
-  void resizeOrCreateFrames(int numFrames);
-  void resizeBootFrame(int numSequences);
-
-  void generateSequence();
-  void oneWaySearch(size_t batchSize);
-  void beamSearch(size_t batchSize);
-
-  struct InFrameLine {
-    std::string linkName;
-    LayerPtr inLayer;
-    std::vector<LayerPtr> agents;  // Scatter Agents to reform batch input
-    Argument outArg;               // scatter output argument
-  };
-  std::vector<InFrameLine> inFrameLines_;
-
-  struct OutFrameLine {
-    std::string layerName;
-    LayerPtr agentLayer;
-    std::vector<LayerPtr> frames;
-  };
-  std::vector<OutFrameLine> outFrameLines_;
-
-  struct MemoryFrameLine {
-    std::string layerName;
-    std::string linkName;
-    LayerPtr bootLayer;  // actually used biasLayer or rootAgent
-    LayerPtr biasLayer;
-    LayerPtr rootLayer;  // layer in root network to boot this memory
-    LayerPtr rootAgent;  // agent to link rootLayer
-    std::vector<LayerPtr> frames;
-    std::vector<LayerPtr> agents;
-    std::vector<LayerPtr> scatterAgents;  // scatter agent used by beam search
-    Argument outArg;                      // scatter output argument
-    // Different memoryFrameLine have different element as follows
-    IVectorPtr allIds;  // scattered id of realLayer
-    ICpuGpuVectorPtr
-        sequenceStartPositions;  // scattered sequenceStartPositions
-  };
-  std::vector<MemoryFrameLine> memoryFrameLines_;
-
-  // Each inFrameLines(inlinks) has its own info(elements) below,
-  // and all outFrameLines(outlinks) share the info with one inFrameLine,
-  // which is assigned by targetInfoInlinkId_.
-  struct Info {
-    // The original positions in the original batch
-    IVectorPtr allIds;  // scattered id of realLayer [batchSize]
-
-    // index of allIds for each step [maxSequenceLength_]
-    // idIndex[i] is the total length of the first i sequences
-    std::vector<int> idIndex;
-
-    ICpuGpuVectorPtr
-        sequenceStartPositions;         // scattered sequenceStartPositions
-    std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
-  };
-  std::vector<Info> info_;  // for input
-
-  // numSeqs_[i] is the number sequences which is longer than i (for sequence
-  // data) or has more than i subsequences (for subsequence data)
-  // Equivalently, numSeqs_[i] is the number of sequences at step i;
-  std::vector<int> numSeqs_;
-
-  std::vector<std::vector<Argument::SeqInfo>> seqInfos_;
-
-  void checkOutputConsistency(OutFrameLine& outFrameLine);
-
-  /* create scattered id infomation for all realLayer of inFrameLines one time.
-   *  If hasSubseq, will also create scattered sequenceStartPositions infomation
-   *  for all realLayer of inFrameLines one time.
-   */
-  void createInFrameInfo(int inlinks_id,
-                         const Argument& input,
-                         PassType passType);
-  void createInFrameInfo_nonseq(int inlinks_id,
-                                const Argument& input,
-                                PassType passType);
-  void createInFrameInfo_seq(int inlinks_id,
-                             const Argument& input,
-                             PassType passType);
-  void createInFrameInfo_subseq(int inlinks_id,
-                                const Argument& input,
-                                PassType passType);
-
-  void createOutFrameInfo(OutFrameLine& outFrameLine,
-                          Info& info,
-                          ICpuGpuVectorPtr& sequenceStartPositions,
-                          ICpuGpuVectorPtr& subSequenceStartPositions);
-  void createOutFrameInfo_seq(OutFrameLine& outFrameLine,
-                              Info& info,
-                              ICpuGpuVectorPtr& sequenceStartPositions,
-                              ICpuGpuVectorPtr& subSequenceStartPositions);
-  void createOutFrameInfo_subseq(OutFrameLine& outFrameLine,
-                                 Info& info,
-                                 ICpuGpuVectorPtr& sequenceStartPositions,
-                                 ICpuGpuVectorPtr& subSequenceStartPositions);
-
-  void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
-                             PassType passType);
-
-  void copyScattedId(std::vector<int>& srcIds, IVectorPtr* dstIds, int size);
-
-  void selectRowsOneTime(LayerPtr layer,
-                         const IVectorPtr& allIds,
-                         Argument* arg,
-                         PassType passType);
-
-  void createSeqPos(const std::vector<int>& sequenceStartPosition,
-                    ICpuGpuVectorPtr* sequenceStartPositions);
-
-  // for generator
-  struct EosFrameLine {
-    std::vector<LayerPtr> layers;
-  };
-  std::unique_ptr<EosFrameLine> eosFrameLine_;
-
-  struct Generator {
-    GeneratorConfig config;
-    std::vector<int> ids;       // store generated sequences
-    std::vector<real> idsProb;  // log probability of each generated word
-    Argument outArg;            // final output argument
-  };
-  bool generating_;
-  Generator generator_;
-
-  std::vector<std::unique_ptr<NeuralNetwork>> frames_;
-
-  NeuralNetwork* rootNetwork_;
-  bool reversed_;
-
-  int maxSequenceLength_;  // Max top-level length
-  bool useGpu_;
-  bool stopBeamSearch_;
-
-  std::vector<int>
-      parameterIds_;  // parameters actually used by this Layer Group
-
-  // store final argument of outFrameLines_
-  std::vector<Argument> dataArgs_;
-  // store each frame's output argument of outFrameLines_
-  std::vector<std::vector<Argument>> dataArgsFrame_;
-  size_t dataArgsSize_;  // size of dataArgs_ = size of dataArgsFrame_
-
-  IVectorPtr cpuId_;
-  MatrixPtr cpuProb_;
-  IVectorPtr cpuEos_;
-
-private:
-  /*
-   * @return beam size in beam search
-   */
-  size_t getBeamSize() { return generator_.config.beam_size(); }
-
-  /*
-   * @return number of sequence in a batch in generation
-   */
-  size_t getGenBatchSize();
-
-  /*
-   * @brief store output of the machineCur-th frame during generation, for
-   * creating the final outlink after the entire generation process is finished.
-   *
-   * In generation, if the layer group has more than 1 outlink, the first
-   * one is reserved to store the generated word indices, the others are data
-   * outlinks, that can be used like a common layer in the network.
-   *
-   * @param machineCur : index to access the layer group frame in
-   * currrent generation step.
-   */
-  void copyDataOutlinkFrame(size_t machineCur);
-
-  /*
-   * @brief In generation, if the layer group has more than 1 outlink, outlink
-   * except the first one is a data outlink. In RecurrentLayerGroup, each time
-   * step is a separate Network, outputs of a layer inside the
-   * RecurrentLayerGroup are stored in separate Arguments. If one layer is
-   * specified as an outlink of RecurrentLayerGroup. This function will
-   * collect outputs in each time step of each generated sequence which are
-   * dispersed in separate Arguments to form a new single Argument as output of
-   * RecurrentLayerGroup.
-   */
-  void createDataOutlink();
-
-  /*
-   * @brief decide to select how many rows from the Matrix stored the forward
-   * pass results from a start position.
-   *
-   * @param isSeq: a flag indicating whetehr the layer to be output of the
-   * RecurrentGradientMachine is a sequence or not
-   * @param outArgs: all of the the returned Arguments of the forward pass
-   * during the generation process.
-   * @param copySize: the returned result, number of rows to select from the
-   * Matrix stored the forward pass results from a start position.
-   */
-  void createDataOutlinkCopySizeInfo(bool isSeq,
-                                     std::vector<Argument>& outArgs,
-                                     std::vector<int>& copySize);
-
-  /*
-   * @brief decide index of the start row for each time step of a generated
-   * sequence in Matrix stored the entire beam search batch's forward pass
-   * results.
-   *
-   * @param isSeq: a flag indicating whether the layer to be output of the
-   * RecurrentGradientMachine is a sequence or not
-   * @param outArgs: all of the returned Arguments of the forward pass
-   * during the generation process.
-   */
-  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);
-
-  /*
-   * @brief used in beam search, connect previous frame to form recurrent link
-   * @param stepId : iteration number of generation process.
-   * It equals to the length of longest half-generated sequence.
-   * @param paths : half-generated paths that are going to be expanded
-   * in current beam search iteration.
-   */
-  void connectPrevFrame(int stepId, std::vector<Path>& paths);
-
-  /*
-   * @brief used in beam search, forward current recurrent frame
-   * @param machineCur : index to access the layer group frame in
-   * currrent generation step.
-   */
-  void forwardFrame(int machineCur);
-
-  /*
-   * @brief reduce all expanded paths to beam size.
-   *
-   * @param newPaths : newPaths[totalExpandCount : ] stores all expanded paths
-   * for the seqId-th sequence
-   * @param seqId : sequence index in a batch
-   * @param totalExpandCount : number of already shrinked paths in newPaths
-   * @return size of retained paths at the end of a beam search iteration
-   */
-  size_t beamShrink(std::vector<Path>& newPaths,
-                    size_t seqId,
-                    size_t totalExpandCount);
-
-  /*
-   * @brief expand a single path to expandWidth new paths
-   * with highest probability
-   * @param curPath : path to be expanded
-   * @param curPathId : index of curPath in member newPaths
-   * @param expandWidth : number of paths to be expanded
-   */
-  void singlePathExpand(Path& curPath,
-                        size_t curPathId,
-                        std::vector<Path>& newPaths,
-                        size_t expandWidth);
-
-  /*
-   * @brief A new beam search iteration. Each half-generated paths in previous
-   * beam search iteration are further expanded to beam_size new paths
-   * with highest probabilities, and then all the expanded paths are again
-   * reduced to beam_size paths according to their log probabilities.
-   * @param paths : half-generated paths in previous iteration.
-   * @param newPaths : paths expanded and then reduces in current iteration.
-   */
-  void beamExpand(std::vector<Path>& paths, std::vector<Path>& newPaths);
-
-  /*
-   * @brief fill sequence start positions and some other information that are
-   * uesed by the "text_printer" evaluator.
-   */
-  void fillGenOutputs();
-
-  std::vector<int> machineIds_;
-  std::vector<int> topIds_;
-  std::vector<int> seqIds_;
-  std::vector<int> batchMachineIdVec_;
-  std::vector<int> batchMachineStartPos_;
-  std::vector<std::vector<Path>> finalPaths_;
-  std::vector<real> minFinalPathLogProb_;
-  BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;
-  BeamSearchStatisticsCallbacks* beamSearchStatistics_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/AddtoLayer.cpp b/paddle/gserver/layers/AddtoLayer.cpp
deleted file mode 100644
index 75e17f52df64253232dc8fc042d0a1a8e7d98e26..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/AddtoLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AddtoLayer.h"
-
-#include "paddle/utils/Logging.h"
-
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(addto, AddtoLayer);
-
-bool AddtoLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  return true;
-}
-
-void AddtoLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-
-  reserveOutput(batchSize, size);
-
-  MatrixPtr outV = getOutputValue();
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    MatrixPtr input = getInputValue(i);
-    i == 0 ? outV->assign(*input) : outV->add(*input);
-  }
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */ { forwardActivation(); }
-}
-
-void AddtoLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    /* Calculate the input layers error */
-    MatrixPtr preGrad = getInputGrad(i);
-    if (NULL != preGrad) {
-      preGrad->add(*getOutputGrad());
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h
deleted file mode 100644
index 1d000630567cb1116ab0ff69e42380fc0eae6173..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/AddtoLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * This layer just simply add all input layers together, then activate
- * the sum inputs. Each input of this layer should be the same size,
- * which is also the output size of this layer.
- * \f[
- *   y=f(\sum_{i}x_i + b)
- * \f]
- * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is
- * activation function.
- *
- * The config file api is addto_layer.
- */
-class AddtoLayer : public Layer {
-protected:
-  std::unique_ptr<Weight> biases_;
-
-public:
-  explicit AddtoLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~AddtoLayer() {}
-
-  /**
-   * Intialization of AddtoLayer.
-   */
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * Forward propagation.
-   * @note There is no weight matrix for each input,
-   *       because it just a simple add operation.
-   */
-  void forward(PassType passType) override;
-
-  /**
-   * Backward propagation.
-   */
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
deleted file mode 100644
index e2f73f88f59278c6e6e6f0a1fe8457393d53f44a..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AgentLayer.h"
-
-#include "paddle/utils/Logging.h"
-
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(agent, AgentLayer);
-
-bool AgentLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  CHECK_EQ(config_.inputs_size(), 0);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setNeedGradient(true);
-  return true;
-}
-
-void AgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  Argument& realOutput = realLayer_->getOutput();
-  int realNumSequences = realOutput.getNumSequences();
-  CHECK_LE(numSamples_, realNumSequences);
-
-  // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    if (realOutput.hasSeq()) {
-      int numRows =
-          realOutput.sequenceStartPositions->getData(false)[numSamples_];
-      output_.subArgFrom(realOutput,
-                         /* offset */ 0,
-                         numRows,
-                         getSize(),
-                         useGpu_,
-                         /* trans */ false,
-                         /* seqFlag */ true,
-                         /* seqStart */ 0,
-                         /* seqSize */ numSamples_ + 1);
-    } else {
-      output_.subArgFrom(
-          realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
-    }
-  } else {
-    output_ = realOutput;
-  }
-}
-
-bool GatherAgentLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  CHECK_EQ(config_.inputs_size(), 0);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setNeedGradient(true);
-  return true;
-}
-
-void GatherAgentLayer::copyIdAndSequenceInfo(
-    ICpuGpuVectorPtr sequenceStartPositions,
-    ICpuGpuVectorPtr subSequenceStartPositions,
-    const IVectorPtr& ids,
-    const std::vector<int>& idIndex) {
-  output_.sequenceStartPositions = sequenceStartPositions;
-  output_.subSequenceStartPositions = subSequenceStartPositions;
-  allIds_ = ids;
-  idIndex_ = idIndex;
-}
-
-void GatherAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  forwardIds(passType);
-  forwardValue(passType);
-}
-
-void GatherAgentLayer::forwardValue(PassType passType) {
-  MatrixPtr valueReal = realLayers_[0]->getOutputValue();
-  if (!valueReal) return;
-
-  int height = allIds_->getSize();
-  int width = this->getSize();
-  resetOutput(height, width);
-  idsVec_.resize(idIndex_.size());
-
-  const MatrixPtr& outV = getOutputValue();
-
-  for (size_t i = 0; i < realLayers_.size(); ++i) {
-    const MatrixPtr& realV = realLayers_[i]->getOutputValue();
-    idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
-                                 /* size */ realV->getHeight(),
-                                 useGpu_);
-    realV->addToRows(*outV, *idsVec_[i]);
-  }
-}
-
-namespace {
-
-// dest[index[i]] <- src[i] for each i
-void copyElements(const IVector& srcVec,
-                  const IVector& indexVec,
-                  IVector& destVec) {
-  const int* src = srcVec.getData();
-  const int* index = indexVec.getData();
-  int* dest = destVec.getData();
-  int len = indexVec.getSize();
-  CHECK_EQ(srcVec.getSize(), indexVec.getSize());
-  for (int i = 0; i < len; ++i) {
-    dest[index[i]] = src[i];
-  }
-}
-}  // namespace
-
-void GatherAgentLayer::forwardIds(PassType passType) {
-  IVectorPtr realId = realLayers_[0]->getOutputLabel();
-  if (!realId) return;
-
-  IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_);
-  IVectorPtr outId = output_.ids;
-  idsVec_.resize(idIndex_.size());
-
-  for (size_t i = 0; i < realLayers_.size(); ++i) {
-    const IVectorPtr& realId = realLayers_[i]->getOutputLabel();
-    idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
-                                 /* size */ realId->getSize(),
-                                 useGpu_);
-    execViaCpu(&copyElements, *realId, *idsVec_[i], *outId);
-  }
-}
-
-void GatherAgentLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  const MatrixPtr& outputGrad = getOutputGrad();
-
-  for (size_t i = 0; i < realLayers_.size(); ++i) {
-    const MatrixPtr& realG = realLayers_[i]->getOutputGrad();
-    if (realG) {
-      realG->selectRows(*outputGrad, *idsVec_[i]);
-    }
-  }
-}
-
-bool ScatterAgentLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  CHECK_EQ(config_.inputs_size(), 0);
-  if (!Layer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  setNeedGradient(true);
-  return true;
-}
-
-void ScatterAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
-
-  int width = this->getSize();
-  if (selectionMode_) {
-    forwardWithSelection(passType);
-  } else {
-    if (realOutArg_.hasSeq()) {
-      output_.subArgFrom(realOutArg_,
-                         /* offset */ idIndex_,
-                         idSize_,
-                         width,
-                         useGpu_,
-                         /* trans */ false,
-                         /* seqFlag */ true,
-                         /* seqStart */ seqStartPosIndex_,
-                         /* seqSize */ numSequences_);
-    } else {
-      output_.subArgFrom(
-          realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
-    }
-  }
-}
-
-void ScatterAgentLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  CHECK(!selectionMode_);
-
-  const MatrixPtr& outputGrad = realOutArg_.grad;
-  const MatrixPtr& realGrad = realLayer_->getOutputGrad();
-  if (realGrad) {
-    // for agent in inFrameLines and memoryFrameLines,
-    // only first scatterAgentLayer should do addToRows in backward
-    if (handleBackward_) {
-      outputGrad->addToRows(*realGrad, *ids_);
-    }
-  }
-}
-
-REGISTER_LAYER(gather_agent, GatherAgentLayer);
-REGISTER_LAYER(scatter_agent, ScatterAgentLayer);
-
-void ScatterAgentLayer::forwardWithSelection(PassType passType) {
-  Layer::forward(passType);
-  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
-
-  const Argument& input = realLayer_->getOutput();
-  CHECK_EQ(realLayer_->getSize(), this->getSize());
-  int width = this->getSize();
-
-  AsyncGpuBlock asyncGpuBlock;
-  REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str());
-
-  if (!input.hasSeq()) {
-    if (realLayer_->getOutput().ids) {
-      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
-      output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
-    }
-    if (realLayer_->getOutput().value) {
-      int height = ids_->getSize();
-      resetOutput(height, width);
-
-      const MatrixPtr& outV = getOutputValue();
-      const MatrixPtr& realV = realLayer_->getOutputValue();
-      outV->selectRows(*realV, *ids_);
-    }
-  } else {
-    // Putting the generation logic here is really an ugly hack!
-    // used in generation
-    int height = 0;
-    size_t numSequences = ids_->getSize();
-    const int* starts = input.getCpuStartPositions();
-    size_t size = input.hasSubseq() ? input.getNumSubSequences()
-                                    : input.getNumSequences();
-    const int* cpuIds = cpuIds_->getData();
-
-    for (size_t i = 0; i < numSequences; ++i) {
-      size_t seqId = cpuIds[i];
-      CHECK_LT(seqId, size);
-      height += starts[seqId + 1] - starts[seqId];
-    }
-    reserveOutput(height, width);
-
-    const MatrixPtr& outputValue = getOutputValue();
-
-    CHECK_NE(input.sequenceStartPositions.get(),
-             output_.sequenceStartPositions.get());
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences + 1, false);
-    int* outStarts = output_.sequenceStartPositions->getMutableData(false);
-
-    ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
-    int* inStarts = inputStartPos_->getMutableData(false);
-
-    size_t offsetOut = 0;
-    for (size_t i = 0; i < numSequences; ++i) {
-      outStarts[i] = offsetOut;
-      size_t seqId = cpuIds[i];
-      int size = starts[seqId + 1] - starts[seqId];
-      for (int j = 0; j < size; j++) {
-        inStarts[offsetOut + j] = starts[seqId] + j;
-      }
-      offsetOut += size;
-    }
-    outStarts[numSequences] = offsetOut;
-
-    outputValue->copyByRowIndex(*input.value,
-                                *inputStartPos_->getVector(useGpu_));
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
deleted file mode 100644
index da0ac4530836205757399ac8eb64dd003740a53f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/AgentLayer.h
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * AgentLayer use as a virtual input of another layer in config,
- * before execute forward/backward, setRealLayer() should be
- * called to set one and only one real layer
- */
-class AgentLayer : public Layer {
-protected:
-  LayerPtr realLayer_;
-  int numSamples_;
-
-public:
-  explicit AgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~AgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  // if *numSamples* set,
-  // real layer output will only use first *numSamples* rows
-  void setRealLayer(LayerPtr layer, int numSamples = 0) {
-    realLayer_ = layer;
-    numSamples_ = numSamples;
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
-/**
- * Like AgentLayer, but it can gather many real layers. Each real
- * layer give a few rows of a sequence, after gather all real layers,
- * GatherAgentLayer collect a complete sequence.
- */
-class GatherAgentLayer : public Layer {
-protected:
-  std::vector<LayerPtr> realLayers_;
-  std::vector<IVectorPtr> idsVec_;
-  // we don't clear idsVec_ vector to aviod IVector alloc/free
-  IVectorPtr allIds_;
-  std::vector<int> idIndex_;
-
-public:
-  explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~GatherAgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  // call before addRealLayer
-  void clearRealLayers() { realLayers_.clear(); }
-
-  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
-                             ICpuGpuVectorPtr subSequenceStartPositions,
-                             const IVectorPtr& allIds,
-                             const std::vector<int>& idIndex);
-
-  // add one real layer, can call many times
-  void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  void forwardValue(PassType passType);
-  void forwardIds(PassType passType);
-};
-
-/**
- * Like AgentLayer, but only select a few rows in real layer.
- * [idIndex, idIndex + idSize) of *ids* in setRealLayerAndOutput()
- * are the selected row ids. It's used to scatter one layer's output
- * to many small submodels. ScatterAgentLayer can support ids real layer,
- * if it is, the agent will select a few ids in real layer.
- */
-class ScatterAgentLayer : public Layer {
-protected:
-  LayerPtr realLayer_;
-  IVectorPtr ids_;
-  IVectorPtr cpuIds_;
-  Argument realOutArg_;
-  int idIndex_;
-  int idSize_;
-  int seqStartPosIndex_;
-  int numSequences_;  // number of sequences in this scatterAgentLayer
-  bool handleBackward_;
-
-  // use to store expanded cpuStartPositions or subSequenceStartPositions
-  // of real layer.
-  ICpuGpuVectorPtr inputStartPos_;
-
-  // true for setRealLayer, false for setRealLayerAndOutput
-  bool selectionMode_;
-
-public:
-  explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~ScatterAgentLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * @brief set real layer in generation
-   *
-   * @param layer[input]    realLayer
-   * @param ids[input]      row id in real layer
-   * @param copyId[input]   whether to copy a cpu version of ids,
-   *                        false(default) in ScatterAgentLayer, and
-   *                        true in SequenceScatterAgentLayer.
-   */
-  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
-    realLayer_ = layer;
-    IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
-    ids_->copyFrom(ids.data(), ids.size());
-    if (useGpu_) {
-      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
-      cpuIds_->copyFrom(ids.data(), ids.size());
-    } else {
-      cpuIds_ = ids_;
-    }
-    selectionMode_ = true;
-  }
-
-  // set real layer and output, [idIndex, idIndex + idSize) of *ids*
-  // are selected row for realOutArg in realLayer
-  void setRealLayerAndOutput(LayerPtr layer,
-                             const Argument& outArg,
-                             const IVectorPtr& ids,
-                             int idIndex,
-                             int idSize,
-                             bool handleBackward) {
-    realLayer_ = layer;
-    realOutArg_ = outArg;
-    ids_ = ids;
-    idIndex_ = idIndex;
-    idSize_ = idSize;
-    handleBackward_ = handleBackward;
-    selectionMode_ = false;
-  }
-
-  void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
-                                 int seqStartPosIndex,
-                                 int numSequences) {
-    realOutArg_.sequenceStartPositions = sequenceStartPositions;
-    seqStartPosIndex_ = seqStartPosIndex;
-    numSequences_ = numSequences;
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  void forwardWithSelection(PassType passType);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp
deleted file mode 100644
index b3787b1448a272d2879b372d34406aacc6c0bbfb..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/AverageLayer.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AverageLayer.h"
-
-#include "paddle/utils/Logging.h"
-
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(average, AverageLayer);
-
-bool AverageLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  SequencePoolLayer::init(layerMap, parameterMap);
-
-  // average strategy
-  if (config_.average_strategy() == "average") {
-    mode_ = kAverage;
-  } else if (config_.average_strategy() == "sum") {
-    mode_ = kSum;
-  } else if (config_.average_strategy() == "squarerootn") {
-    mode_ = kAverageSquareRootN;
-  } else {
-    LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
-  }
-  return true;
-}
-
-void AverageLayer::forward(PassType passType) {
-  SequencePoolLayer::forward(passType);
-
-  MatrixPtr inputValue = getInputValue(0);
-  getOutputValue()->sequenceAvgForward(
-      *inputValue, *startPositions_->getVector(useGpu_), mode_);
-
-  /* add the bias-vector AFTER average operation */
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */ { forwardActivation(); }
-}
-
-void AverageLayer::backward(const UpdateCallback& callback) {
-  SequencePoolLayer::backward(callback);
-
-  if (getInputGrad(0)) {
-    getInputGrad(0)->sequenceAvgBackward(
-        *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
deleted file mode 100644
index 24602d2a9c3e08cf76f6f98b5f9e3f593118e6e1..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/AverageLayer.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SequencePoolLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer for "internal average" for sequence input.
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *    Output: output size is the number of input sequences (NOT input instances)
- *    output[i] = average_{for each instance in this sequence}{input[i]}
- *    If stride_ > 0:
- *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              window upon the input sequence, and the average pooling
- *              operation is then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *    Check input sequence must has sub-sequence
- *    Output: output size is the number of input sub-sequences
- *    output[i] = average_{for each instance in this sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-class AverageLayer : public SequencePoolLayer {
-public:
-  enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
-  explicit AverageLayer(const LayerConfig& config)
-      : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-protected:
-  int mode_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
deleted file mode 100644
index a3516f9423e62df0192485c4476357ac51dc27a4..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BatchNormBaseLayer.h"
-#include "BatchNormalizationLayer.h"
-#include "Layer.h"
-#include "paddle/utils/Stat.h"
-#ifdef PADDLE_WITH_CUDA
-#include "CudnnBatchNormLayer.h"
-#endif
-
-namespace paddle {
-
-bool BatchNormBaseLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  /* initialize the weightList */
-  // first is Input in configure
-  // other two is created in config_parser.py
-  CHECK_EQ(inputLayers_.size(), 3U);
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size()));
-  const ImageConfig& conf = config_.inputs(0).image_conf();
-  channels_ = conf.channels();
-  calFeatureMapSize();
-
-  if (config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-  movingAvgFraction_ = config_.moving_average_fraction();
-  epsilon_ = config_.epsilon();
-
-  weight_.reset(new Weight(1, channels_, parameters_[0]));
-  movingMean_.reset(new Weight(1, channels_, parameters_[1]));
-  movingVar_.reset(new Weight(1, channels_, parameters_[2]));
-
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, channels_, biasParameter_));
-  }
-
-  savedMean_ = Matrix::create(1, channels_, false, useGpu_);
-  savedInvVar_ = Matrix::create(1, channels_, false, useGpu_);
-  savedMean_->zeroMem();
-  savedInvVar_->zeroMem();
-
-  return true;
-}
-
-void BatchNormBaseLayer::calFeatureMapSize() {
-  const ImageConfig& conf = config_.inputs(0).image_conf();
-  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  imageD_ = inputLayers_[0]->getOutput().getFrameDepth();
-
-  if (0 == imageD_) imageD_ = conf.img_size_z();
-  if (imageH_ == 0 && imageW_ == 0) {
-    imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-    imageW_ = conf.img_size();
-  } else {
-    getOutput().setFrameHeight(imageH_);
-    getOutput().setFrameWidth(imageW_);
-    getOutput().setFrameDepth(imageD_);
-  }
-  imgPixels_ = imageH_ * imageW_ * imageD_;
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
deleted file mode 100644
index 69d642af4f12593e8db8a726310e6b1934c8e3be..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief Batch normalization layer use to normalizes the input to across the
- * batch.
- *
- * By default, calculating global mean and variance statistics via a running
- * average in the training peroid. Then the pre-calculated global mean and
- * variance are used for testing.
- *
- * Moving mean and variance are located in Parameter object when constructing
- * and the calculation will change them. Now we only save global mean and
- * variance of one thread in first node for GPU.
- * But the calculation in CPU is different, because parameters are shared by
- * multiple threads. Here using ShareCpuMatrix with lock to calculate. We
- * still save global mean and variance in first node in CPU when multi machine.
- *
- * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network
- *     Training by Reducing Internal Covariate Shift." arXiv preprint
- *     arXiv:1502.03167 (2015).
- */
-
-class BatchNormBaseLayer : public Layer {
-public:
-  explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~BatchNormBaseLayer() {}
-
-  /**
-   * @brief Create BatchNorm layer by norm_type, including batch_norm and
-   * cudnn_batch_norm. If do not set norm_type, it will automatically select
-   * cudnn_batch_norm for GPU and batch_norm for CPU.
-   */
-  static Layer* create(const LayerConfig& config);
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * @brief Calculate feature map size. Some input uses frameHeight and
-   * frameWidth to store feature size
-   */
-  void calFeatureMapSize();
-
-protected:
-  /// Batch normalization scale parameter, which is referred to as gamma in
-  /// in original paper.
-  std::unique_ptr<Weight> weight_;
-  /// Moving average of mean.
-  std::unique_ptr<Weight> movingMean_;
-  /// Moving average of variance.
-  std::unique_ptr<Weight> movingVar_;
-  /// Batch normalization bias parameter, which is referred to as beta in
-  /// in original paper.
-  std::unique_ptr<Weight> biases_;
-
-  /// Save intermediate results computed during the forward pass,
-  /// these can then be reused to speed up the backward pass.
-  MatrixPtr savedMean_;
-  MatrixPtr savedInvVar_;
-
-  /// Height or width of input image feature.
-  /// Both of them are 1 if the input is fully-connected layer.
-  int imageD_;
-  int imageH_;
-  int imageW_;
-  /// Height * Width.
-  int imgPixels_;
-  /// Feature dimension. If the input layer is conv layer, it is the channels
-  /// of feature map of the conv layer. If the input layer is fully-connected
-  /// layer, it is the dimension of fc layer.
-  int channels_;
-  // if useGlobalStats_ is true, will use the loaded mean and variance.
-  // otherwise, calculate mean and variance in this mini-batch.
-  bool useGlobalStats_;
-  // use to compute moving mean and variance.
-  real movingAvgFraction_;
-  // Epsilon is a small random noise used in batch normalization for stability.
-  real epsilon_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
deleted file mode 100644
index 59831dd9049d70198721989b4a515df39e015968..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ /dev/null
@@ -1,266 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Stat.h"
-#ifdef PADDLE_WITH_CUDA
-#include "hl_batch_transpose.h"
-#endif
-#include "BatchNormalizationLayer.h"
-
-namespace paddle {
-
-REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
-
-bool BatchNormalizationLayer::init(const LayerMap& layerMap,
-                                   const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false;
-
-  return true;
-}
-
-void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
-  int numSamples = mat->getHeight();
-  Matrix::resizeOrCreate(tmpMat_, numSamples, channels_, false, useGpu_);
-  savedMean_->zeroMem();
-  savedMean_->accumulateColSum(*mat);
-  savedMean_->mulScalar(1.0 / numSamples);  // E[x]
-
-  tmpMat_->assign(*mat);
-  tmpMat_->square2();
-  savedInvVar_->zeroMem();
-  savedInvVar_->accumulateColSum(*tmpMat_);
-  savedInvVar_->mulScalar(1.0 / numSamples);   // E[x^2]
-  savedInvVar_->addSquare(*savedMean_, -1.0);  // E[x^2] - E^2[x]
-
-  // Variance may be small negative value
-  // because of the subtraction operation.
-  // Here using clipping.
-  savedInvVar_->downClip(real(0.0));
-
-  calMovingMeanAndVar();
-
-  savedInvVar_->subScalar(-epsilon_);
-  savedInvVar_->sqrt2(*savedInvVar_);
-}
-
-void BatchNormalizationLayer::calMovingMeanAndVar() {
-  // calculating and saving moving mean and variance
-  auto& movingMean = movingMean_->getW();
-  auto& movingVar = movingVar_->getW();
-  // movingMean =  movingMean * movingAvgFraction_
-  //            + savedMean_ * (1 - movingAvgFraction_)
-  movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-  // movingVar =  movingVar * movingAvgFraction_
-  //           + savedInvVar_ * (1 - movingAvgFraction_)
-  movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
-}
-
-void BatchNormalizationLayer::setMeanAndStd() {
-  savedMean_->copyFrom(*(movingMean_->getW()));
-  savedInvVar_->copyFrom(*(movingVar_->getW()));
-  savedInvVar_->downClip(real(0.0));
-
-  savedInvVar_->subScalar(-epsilon_);
-  savedInvVar_->sqrt2(*savedInvVar_);
-}
-
-void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
-  CHECK_EQ(in->getWidth(), static_cast<size_t>(channels_ * imgPixels_));
-  CHECK_EQ(out->getWidth(), static_cast<size_t>(channels_));
-  CHECK(!in->isTransposed());
-  CHECK(!out->isTransposed());
-  if (imgPixels_ == 1) {
-    out->assign(*in);
-    return;
-  }
-  size_t batchSize = in->getHeight();
-  CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
-  if (useGpu_) {
-#ifndef PADDLE_WITH_CUDA
-    LOG(FATAL) << "paddle is compiled only for cpu";
-#else
-    batchTranspose(
-        in->getData(), out->getData(), imgPixels_, channels_, batchSize);
-#endif
-  } else {
-    for (size_t i = 0; i < batchSize; i++) {
-      const MatrixPtr inTmp =
-          Matrix::create(in->getData() + i * imgPixels_ * channels_,
-                         channels_,
-                         imgPixels_,
-                         false,
-                         useGpu_);
-      MatrixPtr outTmp =
-          Matrix::create(out->getData() + i * imgPixels_ * channels_,
-                         imgPixels_,
-                         channels_,
-                         false,
-                         useGpu_);
-      inTmp->transpose(outTmp, false);
-    }
-  }
-}
-
-void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
-  CHECK_EQ(in->getWidth(), static_cast<size_t>(channels_));
-  CHECK_EQ(out->getWidth(), static_cast<size_t>(channels_ * imgPixels_));
-  size_t batchSize = out->getHeight();
-  CHECK(!in->isTransposed());
-  CHECK(!out->isTransposed());
-  if (imgPixels_ == 1) {
-    out->assign(*in);
-    return;
-  }
-  CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
-  if (useGpu_) {
-#ifndef PADDLE_WITH_CUDA
-    LOG(FATAL) << "paddle is compiled only for cpu";
-#else
-    batchTranspose(
-        in->getData(), out->getData(), channels_, imgPixels_, batchSize);
-#endif
-  } else {
-    for (size_t i = 0; i < batchSize; i++) {
-      const MatrixPtr inTmp =
-          Matrix::create(in->getData() + i * channels_ * imgPixels_,
-                         imgPixels_,
-                         channels_,
-                         false,
-                         useGpu_);
-      MatrixPtr outTmp =
-          Matrix::create(out->getData() + i * imgPixels_ * channels_,
-                         channels_,
-                         imgPixels_,
-                         useGpu_);
-      inTmp->transpose(outTmp, false);
-    }
-  }
-}
-
-void BatchNormalizationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInputValue(0)->getHeight();
-  calFeatureMapSize();
-  resetOutput(batchSize, getInputValue(0)->getWidth());
-
-  // for testing in training peroid.
-  useGlobalStats_ = (passType == PASS_TEST);
-  if (passType == PASS_TEST && config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-
-  Matrix::resizeOrCreate(
-      expandedIn_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      normIn_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      expandedOut_, batchSize * imgPixels_, channels_, false, useGpu_);
-  expandMat(getInputValue(0), expandedIn_);
-
-  if (useGlobalStats_) {
-    if (firstTest_) {
-      setMeanAndStd();
-      firstTest_ = false;
-    }
-  } else {
-    calMeanAndStd(expandedIn_);
-    firstTest_ = true;
-  }
-
-  normIn_->assign(*expandedIn_);
-  normIn_->addBias(*savedMean_, -1);     // subtract mean.
-  normIn_->divRowVector(*savedInvVar_);  // divide std.
-
-  expandedOut_->assign(*normIn_);
-  expandedOut_->mulRowVector(*weight_->getW());  // multiple gamma.
-  if (biases_) {
-    expandedOut_->addBias(*(biases_->getW()), 1);  // add beta.
-  }
-  MatrixPtr out = getOutputValue();
-  shrinkMat(expandedOut_, out);
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void BatchNormalizationLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-  int batchSize = getInputValue(0)->getHeight();
-
-  Matrix::resizeOrCreate(meanGrad_, 1, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(stdGrad_, 1, channels_, false, useGpu_);
-
-  Matrix::resizeOrCreate(
-      expandedInGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      inGrad_, batchSize, imgPixels_ * channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      normInGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      expandedOutGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      tmpMat_, batchSize * imgPixels_, channels_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      tmpGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
-
-  expandMat(getOutputGrad(), expandedOutGrad_);
-
-  // compute derivatives.
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*expandedOutGrad_, 1);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-  if (weight_->getWGrad()) {
-    tmpMat_->dotMul(*expandedOutGrad_, *normIn_);
-    weight_->getWGrad()->collectBias(*tmpMat_, 1);
-  }
-
-  // compute input gradients.
-  normInGrad_->assign(*expandedOutGrad_);
-  normInGrad_->mulRowVector(*(weight_->getW()));  // multiple gamma.
-  // normInGrad * (x - \mu)/ \sqrt(\delta^2)
-  tmpMat_->dotMul(*normInGrad_, *normIn_);
-  stdGrad_->zeroMem();
-  stdGrad_->collectBias(*tmpMat_, -1.0 / (batchSize * imgPixels_));
-  tmpGrad_->assign(*normIn_);
-  tmpGrad_->mulRowVector(*stdGrad_);
-
-  meanGrad_->zeroMem();
-  meanGrad_->collectBias(*normInGrad_, -1.0 / (batchSize * imgPixels_));
-
-  expandedInGrad_->zeroMem();
-  expandedInGrad_->add(*normInGrad_, *tmpGrad_);
-  expandedInGrad_->addRowVector(*meanGrad_);
-  expandedInGrad_->divRowVector(*savedInvVar_);
-
-  shrinkMat(expandedInGrad_, inGrad_);
-  if (getInputGrad(0)) {
-    getInputGrad(0)->add(*getInputGrad(0), *inGrad_);
-  }
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
deleted file mode 100644
index 95add69215e3ea0b0225d0a245fe37905c33127b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "BatchNormBaseLayer.h"
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * @brief A Inheritance class of Batch normalization layer.
- * It supports both CPU and GPU.
- *
- * The config file api is batch_norm_layer.
- */
-
-class BatchNormalizationLayer : public BatchNormBaseLayer {
-public:
-  explicit BatchNormalizationLayer(const LayerConfig& config)
-      : BatchNormBaseLayer(config), firstTest_(true) {}
-
-  ~BatchNormalizationLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-protected:
-  /// Load pre-calculated mean and std.
-  void setMeanAndStd();
-
-  /// Calculate mean and std.
-  void calMeanAndStd(const MatrixPtr& mat);
-
-  /// Calculate moving mean and variance.
-  void calMovingMeanAndVar();
-
-  /// expand a Matrix from batch, channels* imagePixels to
-  /// batch * ImagePixels * channels.
-  void expandMat(const MatrixPtr& in, MatrixPtr& out);
-
-  /// Shrink a Matrix from  from batch * ImagePixels * channels
-  /// to batch, channels* imagePixels.
-  void shrinkMat(const MatrixPtr& in, MatrixPtr& out);
-
-  void onPassEnd() override { firstTest_ = true; }
-
-  MatrixPtr tmpMat_, tmpGrad_;
-  MatrixPtr expandedIn_, expandedOut_;
-  MatrixPtr expandedInGrad_, expandedOutGrad_, inGrad_;
-  MatrixPtr normIn_, normInGrad_, meanGrad_, stdGrad_;
-
-  /// Load mean and variance only once flag.
-  bool firstTest_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BilinearInterpLayer.cpp b/paddle/gserver/layers/BilinearInterpLayer.cpp
deleted file mode 100644
index 9775914596ce3253aada71fbe7197410414fede5..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BilinearInterpLayer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BilinearInterpLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(bilinear_interp, BilinearInterpLayer);
-
-size_t BilinearInterpLayer::getSize() {
-  inImgH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  inImgW_ = inputLayers_[0]->getOutput().getFrameWidth();
-
-  const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf();
-  if (inImgH_ == 0) {
-    inImgH_ = conf.image_conf().img_size_y();
-  }
-  if (inImgW_ == 0) {
-    inImgW_ = conf.image_conf().img_size();
-  }
-
-  outImgH_ = conf.out_size_y();
-  outImgW_ = conf.out_size_x();
-  numChannels_ = conf.image_conf().channels();
-
-  CHECK(outImgH_ > 0 && outImgW_ > 0);
-  CHECK(inImgH_ > 0 && inImgW_ > 0);
-  CHECK(numChannels_);
-
-  ratioH_ =
-      (outImgH_ > 1) ? static_cast<real>(inImgH_ - 1) / (outImgH_ - 1) : 0.f;
-  ratioW_ =
-      (outImgW_ > 1) ? static_cast<real>(inImgW_ - 1) / (outImgW_ - 1) : 0.f;
-
-  getOutput().setFrameHeight(outImgH_);
-  getOutput().setFrameWidth(outImgW_);
-  return outImgH_ * outImgW_ * numChannels_;
-}
-
-bool BilinearInterpLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(1, config_.inputs_size());
-
-  return true;
-}
-
-void BilinearInterpLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t batchSize = getInput(0).getBatchSize();
-  size_t size = getSize();
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, size);
-  }
-
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwBilinearInterpTimer", getName().c_str());
-    outV->bilinearForward(*inV,
-                          inImgH_,
-                          inImgW_,
-                          outImgH_,
-                          outImgW_,
-                          numChannels_,
-                          ratioH_,
-                          ratioW_);
-  }
-}
-
-void BilinearInterpLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr inputG = getInputGrad(0);
-  MatrixPtr outG = getOutputGrad();
-  {
-    REGISTER_TIMER_INFO("BwBilinearInterpTimer", getName().c_str());
-    if (inputG) {
-      inputG->bilinearBackward(*outG,
-                               outImgH_,
-                               outImgW_,
-                               inImgH_,
-                               inImgW_,
-                               numChannels_,
-                               ratioH_,
-                               ratioW_);
-    }
-  }
-}
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h
deleted file mode 100644
index acd320420f4bbfe313f3ae77577ffc6b5cbfbfdf..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BilinearInterpLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for bilinear interpolation which is
- *        used on conv layer output.
- *
- * @note  The config file api is bilinear_interp_layer.
- */
-class BilinearInterpLayer : public Layer {
-protected:
-  size_t outImgH_, outImgW_;
-  size_t inImgH_, inImgW_;
-  real ratioH_, ratioW_;
-  size_t numChannels_;
-
-public:
-  explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual ~BilinearInterpLayer() {}
-
-  size_t getSize();
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
deleted file mode 100644
index 793d24e884a6f76c2aa897b3d03f3adc3e201265..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BlockExpandLayer.h"
-
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-REGISTER_LAYER(blockexpand, BlockExpandLayer);
-
-bool BlockExpandLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(config_.inputs_size(), 1);
-  const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf();
-  blockH_ = blockConf.block_y();
-  blockW_ = blockConf.block_x();
-  strideH_ = blockConf.stride_y();
-  strideW_ = blockConf.stride_x();
-  paddingH_ = blockConf.padding_y();
-  paddingW_ = blockConf.padding_x();
-  channels_ = blockConf.channels();
-  imgSizeH_ = blockConf.img_size_y();
-  imgSizeW_ = blockConf.img_size_x();
-
-  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
-  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
-  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
-  createFunction(forward_,
-                 "BlockExpand",
-                 FuncConfig()
-                     .set("strides", strides)
-                     .set("paddings", paddings)
-                     .set("blocks", blocks));
-  createFunction(backward_,
-                 "BlockExpandGrad",
-                 FuncConfig()
-                     .set("strides", strides)
-                     .set("paddings", paddings)
-                     .set("blocks", blocks));
-
-  return true;
-}
-
-size_t BlockExpandLayer::getBlockNum() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf();
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = blockConf.img_size_y();
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = blockConf.img_size_x();
-  }
-  size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_;
-  outputH_ = (int)tmpH < 0 ? 1 : 1 + (tmpH + strideH_ - 1) / strideH_;
-  size_t tmpW = 2 * paddingW_ + imgSizeW_ - blockW_;
-  outputW_ = (int)tmpW < 0 ? 1 : 1 + (tmpW + strideW_ - 1) / strideW_;
-
-  return outputH_ * outputW_;
-}
-
-void BlockExpandLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  size_t blockNum = getBlockNum();
-  size_t blockSize = blockH_ * blockW_ * channels_;
-  resetOutput(blockNum * batchSize, blockSize);
-
-  // calculate output_.value
-  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inputShape_);
-  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
-  forward_[0]->calc(inputs, outputs);
-
-  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
-  Argument& out = getOutput();
-  ICpuGpuVector::resizeOrCreate(
-      out.sequenceStartPositions, batchSize + 1, false);
-  IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
-  int* start = out.sequenceStartPositions->getMutableData(false);
-  int* dims = out.cpuSequenceDims->getData();
-  for (size_t i = 0; i < batchSize; i++) {
-    start[i] = i * blockNum;
-    dims[2 * i] = outputH_;
-    dims[2 * i + 1] = outputW_;
-  }
-  start[batchSize] = batchSize * blockNum;
-}
-
-void BlockExpandLayer::backward(const UpdateCallback& callback) {
-  /* Calculate the input layers error */
-  if (getInputGrad(0)) {
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*getOutputGrad(), outputShape_);
-    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
-    backward_[0]->calc(inputs, outputs);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
deleted file mode 100644
index 1797b64036b5cb9f97477d5a44b2f58e2d6c0cd4..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Expand feature map to minibatch matrix.
- * - matrix width is: blockH_ * blockW_ * channels_
- * - matirx height is: outputH_ * outputW_
- *
- * \f[
- * outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) /
- *             strideH\_ \\
- * outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) /
- *             strideW\_
- * \f]
- *
- * The expand method is the same with ExpandConvLayer, but saved the transposed
- * value. After expanding, output_.sequenceStartPositions will store timeline.
- * The number of time steps are outputH_ * outputW_ and the dimension of each
- * time step is blockH_ * blockW_ * channels_. This layer can be used after
- * convolution neural network, and before recurrent neural network.
- *
- * The config file api is block_expand_layer.
- */
-class BlockExpandLayer : public Layer {
-protected:
-  /**
-   * @brief Calculate outputH_ and outputW_ and return block number which
-   * actually is time steps.
-   * @return time steps, outoutH_ * outputW_.
-   */
-  size_t getBlockNum();
-  size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
-  size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
-
-  TensorShape inputShape_;
-  TensorShape outputShape_;
-
-public:
-  explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~BlockExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h
deleted file mode 100644
index fba3cebac1a375008c58d21c458d9e0b98305ffa..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CRFDecodingLayer.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "CRFLayer.h"
-#include "LinearChainCRF.h"
-
-namespace paddle {
-
-/**
- * A layer for calculating the decoding sequence of sequential conditional
- * random field model.
- * The decoding sequence is stored in output_.ids
- * It also calculate error, output_.value[i] is 1 for incorrect decoding
- * or 0 for correct decoding)
- * See LinearChainCRF.h for the detail of the CRF formulation.
- */
-class CRFDecodingLayer : public CRFLayer {
-public:
-  explicit CRFDecodingLayer(const LayerConfig& config) : CRFLayer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-protected:
-  std::unique_ptr<LinearChainCRF> crf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
deleted file mode 100644
index cb5bd05568cc79c0093d6af0791cf0b3ce2dae47..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CRFLayer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "Layer.h"
-#include "LinearChainCRF.h"
-
-namespace paddle {
-
-/**
- * A layer for calculating the cost of sequential conditional random field
- * model.
- * See class LinearChainCRF for the detail of the CRF formulation.
- */
-class CRFLayer : public Layer {
-public:
-  explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-protected:
-  size_t numClasses_;
-  ParameterPtr parameter_;
-  std::vector<LinearChainCRF> crfs_;
-  LayerPtr weightLayer_;            // weight for each sequence
-  std::unique_ptr<Weight> weight_;  // parameters
-  real coeff_;                      // weight for the layer
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/gserver/layers/CTCLayer.h
deleted file mode 100644
index fcbc42565e9340903d05aca2d0ba2091ffe20be0..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CTCLayer.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "LinearChainCTC.h"
-
-namespace paddle {
-
-class CTCLayer : public Layer {
-public:
-  explicit CTCLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void forwardImp(const Argument& softmaxSeqs, const Argument& labelSeqs);
-  void backward(const UpdateCallback& callback) override;
-  void backwardImp(const UpdateCallback& callback,
-                   const Argument& softmaxSeqs,
-                   const Argument& labelSeqs);
-
-protected:
-  size_t numClasses_;
-  bool normByTimes_;
-  std::vector<LinearChainCTC> ctcs_;
-  std::vector<Argument> tmpCpuInput_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/gserver/layers/ClipLayer.cpp
deleted file mode 100644
index dbc3337499788af5a9b6f68a6016e94c2072d61b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ClipLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer for clipping the input value by the threshold.
- * \f[
- *   out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
- * \f]
- */
-
-class ClipLayer : public Layer {
-protected:
-  double min_;
-  double max_;
-
-public:
-  explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(clip, ClipLayer);
-
-bool ClipLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-  auto layerConf = config_.inputs(0).clip_conf();
-  min_ = layerConf.min();
-  max_ = layerConf.max();
-  CHECK_LT(min_, max_);
-  return true;
-}
-
-void ClipLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-  resetOutput(inV->getHeight(), inV->getWidth());
-  MatrixPtr outV = getOutputValue();
-  outV->copyFrom(*inV);
-  outV->clip(min_, max_);
-}
-
-void ClipLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  if (inG) {
-    MatrixPtr outV = getOutputValue();
-    MatrixPtr outG = getOutputGrad();
-    MatrixPtr tmpMtx;
-    Matrix::resizeOrCreate(
-        tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_);
-    tmpMtx->clipDerivative(*inV, min_, max_);
-    inG->addDotMul(*outG, *tmpMtx, 1, 1);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp
deleted file mode 100644
index f5ab29a509e45e72c71ba122c73aeba1b3b6a827..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConcatenateLayer.cpp
+++ /dev/null
@@ -1,208 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "Projection.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A concatenate layer has multiple input layers. It concatenates rows of
- * each input as one row for the output of this layer and apply activation.
- */
-class ConcatenateLayer : public Layer {
-public:
-  explicit ConcatenateLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ConcatenateLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(concat, ConcatenateLayer);
-
-bool ConcatenateLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  CHECK(!biasParameter_);
-
-  return true;
-}
-
-void ConcatenateLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-
-  const MatrixPtr& out = getOutputValue();
-  int offset = 0;
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr& in = getInputValue(i);
-    size_t inSize = in->getWidth();
-    out->assignAtOffset(*in, offset);
-    offset += inSize;
-  }
-  CHECK_EQ(size, offset);
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void ConcatenateLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  const MatrixPtr& out = getOutputGrad();
-  int offset = 0;
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr& in = getInputGrad(i);
-    size_t inSize = getInputValue(i)->getWidth();
-    if (in) {
-      in->addAtOffset(*out, offset);
-    }
-    offset += inSize;
-  }
-}
-
-/**
- * concat2 layer is like concat layer, but each input layer was
- * processed by a Projection.
- */
-class ConcatenateLayer2 : public Layer {
-public:
-  explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
-
-  ~ConcatenateLayer2() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-protected:
-  std::vector<std::unique_ptr<Projection>> projections_;
-  std::vector<Argument> projOutput_;
-  std::vector<std::pair<size_t, size_t>> projCol_;
-  bool sharedBias_;
-  std::unique_ptr<Weight> biases_;
-};
-
-REGISTER_LAYER(concat2, ConcatenateLayer2);
-
-bool ConcatenateLayer2::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  projections_.reserve(inputLayers_.size());
-  projCol_.reserve(inputLayers_.size());
-  projOutput_.resize(inputLayers_.size());
-
-  size_t startCol = 0;
-  size_t endCol = 0;
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    projections_.emplace_back(Projection::create(
-        config_.inputs(i).proj_conf(), parameters_[i], useGpu_));
-
-    endCol += projections_[i]->getOutputSize();
-    projCol_.push_back(std::make_pair(startCol, endCol));
-    startCol = endCol;
-  }
-  CHECK_EQ(getSize(), endCol);
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    sharedBias_ = config_.shared_biases();
-    size_t psize = config_.bias_size();
-    biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
-  }
-
-  return true;
-}
-
-void ConcatenateLayer2::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  resetOutput(batchSize, size);
-
-  for (size_t i = 0; i < projections_.size(); i++) {
-    size_t startCol = projCol_[i].first;
-    size_t endCol = projCol_[i].second;
-    projOutput_[i].value = output_.value->subColMatrix(startCol, endCol);
-    if (output_.grad) {
-      projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
-    }
-  }
-
-  {
-    AsyncGpuBlock block;
-    for (size_t i = 0; i != inputLayers_.size(); ++i) {
-      projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
-    }
-  }
-
-  /* add the bias-vector */
-  if (biases_) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
-    output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void ConcatenateLayer2::backward(const UpdateCallback& callback) {
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  AsyncGpuBlock block;
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->backward(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
deleted file mode 100644
index 10c3cef0da61af76a6b0a207e4b914276a2fa39b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ContextProjection.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(context, ContextProjection);
-
-ContextProjection::ContextProjection(const ProjectionConfig& config,
-                                     ParameterPtr parameter,
-                                     bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(config.has_context_start());
-  CHECK(config.has_context_length());
-  if (config.context_start() == 0 && config.context_length() == 1) {
-    config_.set_trainable_padding(false);
-  }
-  if (config_.trainable_padding()) {
-    CHECK(parameter);
-    beginPad_ = std::max(0, -config.context_start());
-    endPad_ = std::max(0, config.context_start() + config.context_length() - 1);
-    size_t totalPad = beginPad_ + endPad_;
-    size_t inputDim = parameter->getSize() / totalPad;
-    CHECK_EQ(config.input_size(), inputDim);
-    CHECK_EQ(inputDim * totalPad, parameter->getSize());
-    weight_.reset(new Weight(totalPad, inputDim, parameter));
-  }
-  // init forward_ and backward_ functions
-  init();
-}
-
-bool ContextProjection::init() {
-  size_t context_length = config_.context_length();
-  int context_start = config_.context_start();
-  bool is_padding = config_.trainable_padding();
-  size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
-
-  createFunction(forward_,
-                 "ContextProjectionForward",
-                 FuncConfig()
-                     .set("context_length", context_length)
-                     .set("context_start", context_start)
-                     .set("begin_pad", beginPad_));
-  createFunction(backward_,
-                 "ContextProjectionBackward",
-                 FuncConfig()
-                     .set("context_length", context_length)
-                     .set("context_start", context_start)
-                     .set("begin_pad", beginPad_)
-                     .set("is_padding", is_padding)
-                     .set("total_pad", total_pad));
-
-  return true;
-}
-
-void ContextProjection::resetState() {
-  CHECK_LE(config_.context_start() + config_.context_length(), 1)
-      << "state is not allowed for future context";
-  if (config_.context_start() >= 0) return;
-  Matrix::resizeOrCreate(state_,
-                         -config_.context_start(),
-                         config_.input_size(),
-                         false,  // trans
-                         useGpu_);
-  Matrix::resizeOrCreate(state2_,
-                         -config_.context_start(),
-                         config_.input_size(),
-                         false,  // trans
-                         useGpu_);
-  if (config_.trainable_padding()) {
-    state_->assign(*weight_->getW()->subMatrix(0, -config_.context_start()));
-  } else {
-    state_->zeroMem();
-  }
-}
-
-void ContextProjection::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 1)
-      << "one matrix is expected for ContextProjection state";
-  state_->copyFrom(*(state->value[0]));
-}
-
-LayerStatePtr ContextProjection::getState() {
-  if (state_ == nullptr) {
-    return nullptr;
-  }
-  LayerStatePtr res = std::make_shared<LayerState>();
-  res->value.push_back(state_->clone(0, 0, false));
-  res->value[0]->copyFrom(*state_);
-  return res;
-}
-
-void ContextProjection::forward() {
-  CHECK(in_->value && out_->value);
-  CHECK(in_->sequenceStartPositions);
-
-  size_t input_dim = in_->value->getWidth();
-  size_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, input_dim * config_.context_length());
-  // size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here";
-
-  REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
-  bool is_padding = config_.trainable_padding();
-  /// first use state_, otherwise use weight_(padding false === w nullptr)
-  auto w_ptr =
-      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
-  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*in_->value, *start_pos);
-  if (w_ptr) {
-    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
-                  *start_pos);
-  }
-  outputs.addArg(*out_->value, *start_pos, ADD_TO);
-  forward_[0]->calc(inputs, outputs);
-
-  if (state_ && config_.context_start() < 0) {
-    CHECK_EQ(1, in_->getNumSequences());
-    const int* starts = in_->sequenceStartPositions->getData(false);
-    int length = starts[1] - starts[0];
-    if (-config_.context_start() <= length) {
-      MatrixPtr sub = in_->value->subMatrix(starts[1] + config_.context_start(),
-                                            -config_.context_start());
-      state_->copyFrom(*sub);
-    } else {
-      int prevLength = -config_.context_start() - length;
-      state2_->subMatrix(0, prevLength)
-          ->copyFrom(*state_->subMatrix(length, prevLength));
-      state2_->subMatrix(prevLength, length)
-          ->copyFrom(*in_->value->subMatrix(starts[0], length));
-      std::swap(state_, state2_);
-    }
-  }
-}
-
-void ContextProjection::backward(const UpdateCallback& callback) {
-  CHECK(in_->value && out_->value && out_->grad);
-  size_t input_dim = in_->value->getWidth();
-  size_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, input_dim * config_.context_length());
-  size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(batch_size, out_->value->getHeight());
-  CHECK_EQ(static_cast<int>(backward_.size()), 1)
-      << "Only one backward function here";
-
-  REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
-  bool is_padding = config_.trainable_padding();
-  auto start_pos = in_->sequenceStartPositions;
-  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(
-      CpuMatrix(
-          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
-      *in_->sequenceStartPositions->getVector(useGpu_),
-      ADD_TO);
-  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                           w_ptr ? w_ptr->getHeight() : 0,
-                           input_dim),
-                 ADD_TO);
-  backward_[0]->calc(inputs, outputs);
-
-  if (config_.trainable_padding()) {
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h
deleted file mode 100644
index e30f98f58d2be9ac538f6385efe68990b705ac5f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ContextProjection.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * @brief Context projection concatenate features in adjacent time steps in
- * a sequence. The i-th row of the output is the concatenation of
- * context_length rows of the input. The context_length rows are the
- * consecutive rows from the i+shift_start row.
- *
- * For example, assumed input (x) has 4 words and the dimension of each word
- * representation is 2. If we use zero to pad instead of learned weight to pad,
- * and the context_lenth is 3, the output (y) is:
- *
- * @code
- *  x = [a1, a2;
- *       b1, b2;
- *       c1, c2;
- *       d1, d2]
- *  y = [0,  0,  a1, a2, b1, b2;
- *       a1, a2, b1, b2, c1, c2;
- *       b1, b2, c1, c2, d1, d2;
- *       c1, c2, d1, d2, 0,  0]
- * @endcode
- *
- * The config file api is context_projection.
- */
-class ContextProjection : public Projection {
-public:
-  /**
-   * Constructor. If context_start is zero and context_lenth is one, it will
-   * set trainable_padding false. trainable_padding is an optional arguments
-   * and if it is set, constructor will set learned weight, which is used to
-   * pad output.
-   */
-  ContextProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
-  virtual void resetState();
-
-  virtual void setState(LayerStatePtr state);
-
-  virtual LayerStatePtr getState();
-
-  virtual bool init();
-
-protected:
-  std::unique_ptr<Weight> weight_;
-  /// number of extra timesteps added at the beginning
-  size_t beginPad_;
-  /// number of extra timesteps added at the end
-  size_t endPad_;
-  /// state_ and state2_ are used in sequence generating and saved
-  /// previous inputs.
-  MatrixPtr state_;
-  MatrixPtr state2_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/Conv3DLayer.cpp b/paddle/gserver/layers/Conv3DLayer.cpp
deleted file mode 100644
index b38de86b1591f987a63478d019019f87c88cee20..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Conv3DLayer.cpp
+++ /dev/null
@@ -1,253 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Conv3DLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(conv3d, Conv3DLayer);
-
-bool Conv3DLayer::init(const LayerMap &layerMap,
-                       const ParameterMap &parameterMap) {
-  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
-  int index = 0;
-  for (auto &inputConfig : config_.inputs()) {
-    const ConvConfig &conf = inputConfig.conv_conf();
-    M_.push_back(numFilters_ / conf.groups());
-    K_.push_back(filterPixels_[index] * filterChannels_[index]);
-
-    // create a new weight
-    size_t height, width;
-    width = filterPixels_[index] * filterChannels_[index];
-    height = numFilters_;
-    CHECK_EQ(parameters_[index]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[index]);
-    weights_.emplace_back(w);
-    ++index;
-  }
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  return true;
-}
-
-size_t Conv3DLayer::getSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  outputH_.clear();
-  outputW_.clear();
-  outputD_.clear();
-  N_.clear();
-  size_t layerSize = 0;
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    outputW_.push_back(outputSize(
-        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
-    outputH_.push_back(outputSize(
-        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
-    outputD_.push_back(outputSize(
-        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
-
-    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
-    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
-    layerSize += N_[i] * numFilters_;
-  }
-  getOutput().setFrameHeight(outputH_[0]);
-  getOutput().setFrameWidth(outputW_[0]);
-  getOutput().setFrameDepth(outputD_[0]);
-  return layerSize;
-}
-
-void Conv3DLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  int outWidth = getSize();
-  resetOutput(batchSize, outWidth);
-
-  REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr &inMat = getInputValue(i);
-    const MatrixPtr &outMat = getOutputValue();
-    int M = M_[i];
-    int N = N_[i];
-    int K = K_[i];
-    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-    MatrixPtr wMat = weights_[i]->getW();
-    for (int n = 0; n < batchSize; ++n) {
-      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
-                       channels_[i],
-                       imgSizeD_[i],
-                       imgSizeH_[i],
-                       imgSizeW_[i],
-                       filterSizeZ_[i],
-                       filterSizeY_[i],
-                       filterSize_[i],
-                       strideZ_[i],
-                       strideY_[i],
-                       stride_[i],
-                       paddingZ_[i],
-                       paddingY_[i],
-                       padding_[i]);
-
-      real *outData = outMat->getData() + n * outMat->getStride();
-      MatrixPtr outMatSub =
-          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
-      for (int g = 0; g < groups_[i]; g++) {
-        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
-        MatrixPtr in = colBuf_->subMatrix(g * K, K);
-        MatrixPtr out = outMatSub->subMatrix(g * M, M);
-        out->mul(*wMatSub, *in, 1.0, 1.0);
-      }
-    }
-  }
-  if (nullptr != this->biasParameter_) {
-    this->addBias();
-  }
-  forwardActivation();
-}
-
-void Conv3DLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    bpropBiases();
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (weights_[i]->getWGrad()) {
-      bpropWeights(i);
-    }
-    if (getInputGrad(i)) {
-      bpropData(i);
-    }
-    weights_[i]->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void Conv3DLayer::bpropWeights(int i) {
-  int M = M_[i];
-  int N = N_[i];
-  int K = K_[i];
-  const MatrixPtr &inMat = getInputValue(i);
-  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-  MatrixPtr wGradMat = weights_[i]->getWGrad();
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  for (int n = 0; n < batchSize; ++n) {
-    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
-                     channels_[i],
-                     imgSizeD_[i],
-                     imgSizeH_[i],
-                     imgSizeW_[i],
-                     filterSizeZ_[i],
-                     filterSizeY_[i],
-                     filterSize_[i],
-                     strideZ_[i],
-                     strideY_[i],
-                     stride_[i],
-                     paddingZ_[i],
-                     paddingY_[i],
-                     padding_[i]);
-
-    real *outGradData =
-        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
-    MatrixPtr outGradSub =
-        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
-    for (int g = 0; g < groups_[i]; ++g) {
-      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
-      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
-      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
-      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
-    }
-  }
-}
-
-void Conv3DLayer::bpropData(int i) {
-  int M = M_[i];
-  int N = N_[i];
-  int K = K_[i];
-  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-  MatrixPtr wMat = weights_[i]->getW();
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  for (int n = 0; n < batchSize; ++n) {
-    real *outGradData =
-        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
-    real *preGradData =
-        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
-    MatrixPtr outGradSub =
-        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
-    for (int g = 0; g < groups_[i]; ++g) {
-      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
-      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
-      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
-      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
-    }
-    colBuf_->col2Vol(preGradData,
-                     channels_[i],
-                     imgSizeD_[i],
-                     imgSizeH_[i],
-                     imgSizeW_[i],
-                     filterSizeZ_[i],
-                     filterSizeY_[i],
-                     filterSize_[i],
-                     strideZ_[i],
-                     strideY_[i],
-                     stride_[i],
-                     paddingZ_[i],
-                     paddingY_[i],
-                     padding_[i],
-                     1.0,
-                     1.0);
-  }
-}
-
-void Conv3DLayer::bpropBiases() {
-  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
-                                    1,
-                                    biases_->getWGrad()->getElementCnt(),
-                                    false,
-                                    useGpu_);
-  MatrixPtr outGradMat = getOutputGrad();
-
-  if (this->sharedBiases_) {
-    biases->collectSharedBias(*outGradMat, 1.0f);
-  } else {
-    biases->collectBias(*outGradMat, 1.0f);
-  }
-}
-
-void Conv3DLayer::addBias() {
-  MatrixPtr outMat = getOutputValue();
-  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
-                                  1,
-                                  biases_->getW()->getElementCnt(),
-                                  false,
-                                  useGpu_);
-  if (this->sharedBiases_) {
-    outMat->addSharedBias(*(bias), 1.0f);
-  } else {
-    outMat->addBias(*(bias), 1.0f);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/Conv3DLayer.h b/paddle/gserver/layers/Conv3DLayer.h
deleted file mode 100644
index 5ab5ff3d4af07449484c441958c31c8fb06de894..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Conv3DLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of convolution layer.
- * This layer expands input and use matrix multiplication to
- * calculate convolution operation.
- */
-class Conv3DLayer : public ConvBaseLayer {
-public:
-  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-  ~Conv3DLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void addBias();
-  void backward(const UpdateCallback& callback);
-  void bpropBiases();
-  void bpropData(int i);
-  void bpropWeights(int i);
-  size_t getSize();
-
-protected:
-  // Figure out the dimensions for individual gemms.
-  IntV M_;  /// numFilters_ / filter_group_;
-  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
-  IntV K_;  /// outputD_ * outputH_ * outputW_
-  MatrixPtr colBuf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
deleted file mode 100644
index 56bf4f9fcb187f73409076b826b738f62d19516a..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/Logging.h"
-namespace paddle {
-
-bool ConvBaseLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv")
-                  ? false
-                  : true;
-
-  /* Initialize the convolutional layer parameter */
-  numFilters_ = config_.num_filters();
-  sharedBiases_ = config_.shared_biases();
-  for (auto& inputConfig : config_.inputs()) {
-    const ConvConfig& conf = inputConfig.conv_conf();
-    padding_.push_back(conf.padding());
-    stride_.push_back(conf.stride());
-    dilation_.push_back(conf.dilation());
-    filterSize_.push_back(conf.filter_size());
-    paddingY_.push_back(conf.padding_y());
-    strideY_.push_back(conf.stride_y());
-    dilationY_.push_back(conf.dilation_y());
-    filterSizeY_.push_back(conf.filter_size_y());
-    channels_.push_back(conf.channels());
-    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
-                                              : conf.img_size());
-    imgSizeW_.push_back(conf.img_size());
-    groups_.push_back(conf.groups());
-    filterChannels_.push_back(conf.filter_channels());
-    outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
-    outputW_.push_back(conf.output_x());
-
-    paddingZ_.push_back(conf.padding_z());
-    strideZ_.push_back(conf.stride_z());
-    filterSizeZ_.push_back(conf.filter_size_z());
-    imgSizeD_.push_back(conf.img_size_z());
-    outputD_.push_back(conf.output_z());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
-                            filterSizeZ_.back());
-  }
-
-  CHECK(inputLayers_.size() == parameters_.size());
-
-  // create new weights_ in derived class
-  // create new biases_ in derived class
-
-  // default caffe model
-  caffeMode_ = true;
-
-  return true;
-}
-
-size_t ConvBaseLayer::calOutputSize() {
-  auto clearAndReserve = [this](IntV* vec) {
-    vec->clear();
-    vec->reserve(this->inputLayers_.size());
-  };
-  clearAndReserve(&imgSizeH_);
-  clearAndReserve(&imgSizeW_);
-  clearAndReserve(&outputH_);
-  clearAndReserve(&outputW_);
-  size_t layerSize = 0;
-
-  auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
-    size_t filterSizeY;
-    size_t filterSize;
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
-      filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
-      inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
-      inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
-      const ConvConfig& conf = config_.inputs(i).conv_conf();
-      if (isDeconv_) {
-        if (inH[i] == 0)
-          inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
-        if (inW[i] == 0) inW[i] = conf.output_x();
-        outH.push_back(imageSize(
-            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(
-            imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
-      } else {
-        if (inH[i] == 0)
-          inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-        if (inW[i] == 0) inW[i] = conf.img_size();
-        outH.push_back(outputSize(
-            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(outputSize(
-            inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
-      }
-      CHECK_EQ(outH[i], outH[0]);
-      CHECK_EQ(outW[i], outW[0]);
-    }
-    getOutput().setFrameHeight(outH[0]);
-    getOutput().setFrameWidth(outW[0]);
-    layerSize = outH[0] * outW[0] * size_t(numFilters_);
-  };
-
-  setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
-
-  return layerSize;
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
deleted file mode 100644
index 93869fe68d15b1cf38296fa8e2f6197dc74f879f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/MathUtils.h"
-namespace paddle {
-
-/**
- * @brief A Base Convolution Layer, which convolves the input image
- * with learned filters and (optionally) adds biases.
- */
-
-class ConvBaseLayer : public Layer {
-protected:
-  typedef std::vector<int> IntV;
-
-  /// True if it's deconv layer, false if it's convolution layer
-  bool isDeconv_;
-
-  /// The number of filters.
-  int numFilters_;
-  /// The x dimension of the padding.
-  IntV padding_;
-  /// The y dimension of the padding.
-  IntV paddingY_;
-  /// The x dimension of the stride.
-  IntV stride_;
-  /// The y dimension of the stride.
-  IntV strideY_;
-  /// The x dimension of the dilation.
-  IntV dilation_;
-  /// The y dimension of the dilation.
-  IntV dilationY_;
-  /// The x dimension of a filter kernel.
-  IntV filterSize_;
-  /// The y dimension of a filter kernel.
-  IntV filterSizeY_;
-  /// The spatial dimensions of the convolution input.
-  IntV channels_;
-  /// The spatial dimensions of input feature map height.
-  IntV imgSizeH_;
-  /// The spatial dimensions of input feature map width.
-  IntV imgSizeW_;
-  /// filterPixels_ = filterSizeX_ * filterSizeY_.
-  IntV filterPixels_;
-  /// filterChannels_ = channels_/groups_.
-  IntV filterChannels_;
-  /// The spatial dimensions of output feature map height.
-  IntV outputH_;
-  /// The spatial dimensions of output feature map width.
-  IntV outputW_;
-
-  IntV outputD_;
-  IntV imgSizeD_;
-  IntV filterSizeZ_;
-  IntV strideZ_;
-  IntV paddingZ_;
-
-  /// Group size, refer to grouped convolution in
-  /// Alex Krizhevsky's paper: when group=2, the first half of the
-  /// filters are only connected to the first half of the input channels,
-  /// and the second half only connected to the second half.
-  IntV groups_;
-  /// Whether the bias is shared for feature in each channel.
-  bool sharedBiases_;
-
-  /// shape of weight: (numChannels * filterPixels_, numFilters)
-  WeightList weights_;
-  /// If shared_biases is false shape of bias: (numFilters_, 1)
-  /// If shared_biases is ture shape of bias:
-  /// (numFilters_ * outputX * outputY, 1)
-  std::unique_ptr<Weight> biases_;
-
-  /// True by default. The only difference is the calculation
-  /// of output size.
-  bool caffeMode_;
-
-public:
-  explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * imgSizeH_ and imgSizeW_ will be set according to the previous input layers
-   * in this function. Then it will calculate outputH_ and outputW_ and set them
-   * into output argument.
-   */
-  virtual size_t calOutputSize();
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseOperator.cpp b/paddle/gserver/layers/ConvBaseOperator.cpp
deleted file mode 100644
index 317e7d5c607683efa1e93aba9bc9ba472d37d60d..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvBaseOperator.cpp
+++ /dev/null
@@ -1,151 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseOperator.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvBaseOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu)
-    : Operator(config, useGpu) {
-  CHECK(useGpu);
-  CHECK_EQ(config_.input_indices_size(), 2L);
-
-  caffeMode_ = true;
-  getConvParams();
-  computeConvSizes();
-
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-  workSpace_ = nullptr;
-
-  isSelectAlgo_ = false;
-}
-
-void ConvBaseOperator::allocConvWorkSpace() {
-  hl_conv_workspace(imageDesc_,
-                    outputDesc_,
-                    filterDesc_,
-                    convDesc_,
-                    &fwdAlgo_,
-                    &fwdLimitBytes_,
-                    &bwdDataAlgo_,
-                    &bwdDataLimitBytes_,
-                    &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_,
-                    /*useDilation*/ false);
-
-  size_t maxWorkSpace = 0;
-  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-
-  if (maxWorkSpace > workSpaceInBytes_) {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-    }
-    // total amount of storage needed
-    workSpace_ = hl_malloc_device(maxWorkSpace);
-    workSpaceInBytes_ = maxWorkSpace;
-  }
-}
-
-void ConvBaseOperator::computeConvSizes() {
-  hl_create_filter_descriptor(
-      &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_);
-  hl_create_tensor_descriptor(&imageDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   imageDesc_,
-                                   filterDesc_,
-                                   paddingY_,
-                                   padding_,
-                                   strideY_,
-                                   stride_);
-}
-
-void ConvBaseOperator::reshapeImageDescriptors() {
-  hl_tensor_reshape(imageDesc_,
-                    1,
-                    channels_,
-                    imageH_,
-                    imageW_,
-                    channels_ * imageH_ * imageW_,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-  hl_tensor_reshape(outputDesc_,
-                    1,
-                    numFilters_,
-                    outputH_,
-                    outputW_,
-                    numFilters_ * outputH_ * outputW_,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
-  hl_reset_convolution_descriptor(convDesc_,
-                                  imageDesc_,
-                                  filterDesc_,
-                                  paddingY_,
-                                  padding_,
-                                  strideY_,
-                                  stride_);
-}
-
-void ConvBaseOperator::getConvParams() {
-  configNumFilters_ = config_.num_filters();
-  const ConvConfig &conf = config_.conv_conf();
-  padding_ = conf.padding();
-  stride_ = conf.stride();
-  filterSize_ = conf.filter_size();
-  paddingY_ = conf.padding_y();
-  strideY_ = conf.stride_y();
-  filterSizeY_ = conf.filter_size_y();
-  filterPixels_ = filterSize_ * filterSizeY_;
-  configChannels_ = conf.channels();
-  imgSize_ = conf.img_size();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  imgPixels_ = imgSize_ * imgSizeY_;
-  CHECK_EQ(conf.groups(), 1U);
-  filterChannels_ = conf.filter_channels();
-  outputX_ = conf.output_x();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  outputs_ = outputX_ * outputX_;
-
-  isDeconv_ = (config_.type() == "conv") ? false : true;
-  if (isDeconv_) {
-    channels_ = configNumFilters_;
-    numFilters_ = configChannels_;
-  } else {
-    channels_ = configChannels_;
-    numFilters_ = configNumFilters_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseOperator.h b/paddle/gserver/layers/ConvBaseOperator.h
deleted file mode 100644
index 27fb0362d3c9518a263eac54206e00974d08eb20..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvBaseOperator.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "Operator.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvBaseOperator : public Operator {
-public:
-  ConvBaseOperator(const OperatorConfig &config, bool useGpu);
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvBaseOperator() {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-      workSpaceInBytes_ = 0;
-    }
-
-    hl_destroy_tensor_descriptor(imageDesc_);
-    hl_destroy_tensor_descriptor(outputDesc_);
-    hl_destroy_filter_descriptor(filterDesc_);
-    hl_destroy_convolution_descriptor(convDesc_);
-  }
-
-protected:
-  /**
-   * Get convolution parameters from layer config and
-   * initialize member variables.
-   */
-  void getConvParams();
-
-  /**
-   * Allocate Gpu Memory for cudnn convolution algorithms.
-   */
-  void allocConvWorkSpace();
-
-  /**
-   * Create cudnn tensor descriptor for convolution operation.
-   */
-  void computeConvSizes();
-
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  void reshapeImageDescriptors();
-
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  virtual void reshape(int batchSize) = 0;
-
-  /**
-   * Check filter size is equal to the size calculated by parameters from
-   * layer config.
-   */
-  void checkFilterSize(const MatrixPtr &filter) {
-    CHECK_EQ(static_cast<int>(filter->getWidth()),
-             filterSize_ * filterSizeY_ * channels_ * numFilters_);
-  }
-
-  /// Most of member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  bool isDeconv_;
-  int imageH_, imageW_, outputH_, outputW_;
-  hl_tensor_descriptor imageDesc_;
-  hl_tensor_descriptor outputDesc_;
-  hl_filter_descriptor filterDesc_;
-  hl_convolution_descriptor convDesc_;
-  bool caffeMode_;
-  int inputOffset_, outputOffset_, weightOffset_;
-  int numFilters_, channels_;
-
-  /// from parsing config
-  int configNumFilters_, configChannels_;
-  int padding_, stride_, filterSize_, imgSize_, imgSizeY_;
-  int paddingY_, strideY_, filterSizeY_;
-  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
-
-  /// Following member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_;
-  size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_;
-  size_t workSpaceInBytes_;
-  void *workSpace_;
-  bool isSelectAlgo_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp
deleted file mode 100644
index 39f433b78fe7ce22cc7f93b87d96ed19c10fc2e9..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvBaseProjection.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
-
-ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
-                                       ParameterPtr parameter,
-                                       bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(useGpu);  // only support GPU
-  getConvParams();
-  initCudnn();
-
-  size_t height = filterH_ * filterW_ * channels_ / groups_;
-  size_t width = numFilters_;
-  weight_.reset(new Weight(height, width, parameter));
-  weightOffset_ = height * width / groups_;
-}
-
-void ConvBaseProjection::getConvParams() {
-  const ConvConfig &conf = config_.conv_conf();
-  paddingH_ = conf.padding_y();
-  paddingW_ = conf.padding();
-
-  strideH_ = conf.stride_y();
-  strideW_ = conf.stride();
-
-  dilationH_ = conf.dilation_y();
-  dilationW_ = conf.dilation();
-  CHECK_GT(dilationH_, 0);
-  CHECK_GT(dilationW_, 0);
-
-  filterH_ = conf.filter_size_y();
-  filterW_ = conf.filter_size();
-
-  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  configImgW_ = conf.img_size();
-
-  configOutH_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  configOutW_ = conf.output_x();
-
-  configChannels_ = conf.channels();
-  configNumFilters_ = config_.num_filters();
-
-  isDeconv_ = (config_.type() == "conv") ? false : true;
-
-  channels_ = (isDeconv_) ? configNumFilters_ : configChannels_;
-  numFilters_ = (isDeconv_) ? configChannels_ : configNumFilters_;
-
-  groups_ = conf.groups();
-  CHECK_EQ(channels_ % groups_, 0);
-  CHECK_EQ(numFilters_ % groups_, 0);
-}
-
-void ConvBaseProjection::initCudnn() {
-  hl_create_filter_descriptor(&filterDesc_,
-                              channels_ / groups_,
-                              numFilters_ / groups_,
-                              filterH_,
-                              filterW_);
-  hl_create_tensor_descriptor(&imageDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   imageDesc_,
-                                   filterDesc_,
-                                   paddingH_,
-                                   paddingW_,
-                                   strideH_,
-                                   strideW_,
-                                   dilationH_,
-                                   dilationW_);
-
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-}
-
-void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
-  // The stride between two consecutive samples in the output of ConvProjection
-  // may not be numFilters_ * outputH_ * outputW_ (conv) or
-  // channels_ * imageH_ * imageW_ (deconv)
-  // for example, in the case of layer ConcatenateLayer2 with two
-  // ConvProjection, the stride is the output_size of layer ConcatenateLayer2.
-  // So the calculation of nStride is different from CudnnConvLayer.
-  size_t nStrideImage, nStrideOutput;
-  if (isDeconv_) {
-    nStrideImage = out_->value->getStride();
-    nStrideOutput = numFilters_ * outputH_ * outputW_;
-  } else {
-    nStrideImage = channels_ * imageH_ * imageW_;
-    nStrideOutput = out_->value->getStride();
-  }
-
-  hl_tensor_reshape(imageDesc_,
-                    batchSize,
-                    channels_ / groups_,
-                    imageH_,
-                    imageW_,
-                    nStrideImage,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-
-  hl_tensor_reshape(outputDesc_,
-                    batchSize,
-                    numFilters_ / groups_,
-                    outputH_,
-                    outputW_,
-                    nStrideOutput,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
-
-  hl_reset_convolution_descriptor(convDesc_,
-                                  imageDesc_,
-                                  filterDesc_,
-                                  paddingH_,
-                                  paddingW_,
-                                  strideH_,
-                                  strideW_,
-                                  dilationH_,
-                                  dilationW_);
-}
-
-void ConvBaseProjection::reshape(int batchSize) {
-  size_t width = calOutputSize();
-  CHECK_EQ(width, out_->value->getWidth());
-  CHECK_EQ(calInputSize(), in_->value->getWidth());
-
-  reshapeTensorDesc(batchSize);
-  bool useDilation = false;
-  if (dilationH_ > 1 || dilationW_ > 1) {
-    useDilation = true;
-  }
-  hl_conv_workspace(imageDesc_,
-                    outputDesc_,
-                    filterDesc_,
-                    convDesc_,
-                    &fwdAlgo_,
-                    &fwdLimitBytes_,
-                    &bwdDataAlgo_,
-                    &bwdDataLimitBytes_,
-                    &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_,
-                    useDilation);
-
-  size_t maxWorkSpace = 0;
-  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-  workSpaceInBytes_ = maxWorkSpace;
-
-  VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
-          << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
-}
-
-void *ConvBaseProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandlePtr> &convMem = *convMem_;
-  if (convMem.empty()) {
-    int numDevices = hl_get_device_count();
-    convMem.resize(numDevices);
-  }
-
-  int devId = hl_get_device();
-  MemoryHandlePtr localMem = convMem[devId];
-  if (NULL == localMem || size > localMem->getAllocSize()) {
-    localMem = std::make_shared<GpuMemoryHandle>(size);
-  }
-  return localMem->getBuf();
-}
-
-ConvBaseProjection::~ConvBaseProjection() {
-  hl_destroy_tensor_descriptor(imageDesc_);
-  hl_destroy_tensor_descriptor(outputDesc_);
-  hl_destroy_filter_descriptor(filterDesc_);
-  hl_destroy_convolution_descriptor(convDesc_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
deleted file mode 100644
index ba76d236d901187093a2e372a61c5d29d661e8bb..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-#include "paddle/math/MathUtils.h"
-
-namespace paddle {
-
-/**
- * @brief Base class for ConvProjection and ConvTransProjection.
- */
-class ConvBaseProjection : public Projection {
-public:
-  /**
-   * Constructor.
-   */
-  ConvBaseProjection(const ProjectionConfig& config,
-                     ParameterPtr parameter,
-                     bool useGpu);
-
-  ~ConvBaseProjection();
-
-protected:
-  void getConvParams();
-  void initCudnn();
-
-  void reshapeTensorDesc(int batchSize);
-  void reshape(int batchSize);
-
-  virtual size_t calOutputSize() = 0;
-  virtual size_t calInputSize() = 0;
-
-  static void* getSpaceBytes(size_t size);
-
-  /// True if it's deconv projection layer, false if it's ConvProjection layer
-  bool isDeconv_;
-  /// imageH_ and imageW_ / outputH_ and outputW_
-  /// is calculated from the input layer.
-  int imageH_, imageW_;
-  int outputH_, outputW_;
-  /// configImgH_ and configImgW_ / configOutH_ and configOutW_
-  /// is obtained from config.
-  int configImgH_, configImgW_;
-  int configOutH_, configOutW_;
-  /// channels_ and numFilters_ are defined in terms of convolution semantics
-  int channels_, numFilters_;
-  /// configChannels and configNumFilters_ are obtained from config
-  /// For Conv they are the same as channels_ and numFilters
-  /// For ConvTrans they are opposite to channels_ and numFilters
-  int configChannels_, configNumFilters_;
-  int paddingH_, paddingW_;
-  int strideH_, strideW_;
-  int dilationH_, dilationW_;
-  int filterH_, filterW_;
-  /// One group offset of input data.
-  int inputOffset_;
-  /// One group offset of output data.
-  int outputOffset_;
-  /// One group offset of weight.
-  int weightOffset_;
-  int groups_;
-
-  /// Cudnn tensor descriptor for input.
-  hl_tensor_descriptor imageDesc_;
-  /// Cudnn tensor descriptor for output.
-  hl_tensor_descriptor outputDesc_;
-  /// Cudnn tensor descriptor for filter.
-  hl_filter_descriptor filterDesc_;
-  /// Cudnn tensor descriptor for a convolution operation.
-  hl_convolution_descriptor convDesc_;
-
-  /// Record the algorithm for forward convolution, which is obtained by cudnn
-  /// api to search the best suited algorithm.
-  int fwdAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// filter coefficients.
-  int bwdFilterAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// the output.
-  int bwdDataAlgo_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// forward convolution with the specified algo.
-  size_t fwdLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardFilter with the specified algo.
-  size_t bwdDataLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardData with the specified algo.
-  size_t bwdFilterLimitBytes_;
-  /// Size of total work space.
-  size_t workSpaceInBytes_;
-  bool bias_;
-
-  std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp
deleted file mode 100644
index 45498b92d32e0fa72adbe95a98e8d30c7f8929e2..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvOperator.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOperator.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-REGISTER_OPERATOR(conv, ConvOperator);
-
-void ConvOperator::reshape(int batchSize) {
-  imageH_ = ins_[0]->getFrameHeight();
-  imageW_ = ins_[0]->getFrameWidth();
-  if (imageH_ == 0) imageH_ = imgSizeY_;
-  if (imageW_ == 0) imageW_ = imgSize_;
-  outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
-  outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
-  /// Check that the outputSizes are consistent with config
-  CHECK_EQ(outputH_, outputY_);
-  CHECK_EQ(outputW_, outputX_);
-  out_->setFrameHeight(outputH_);
-  out_->setFrameWidth(outputW_);
-
-  reshapeImageDescriptors();
-
-  inputOffset_ = channels_ * imageH_ * imageW_;
-  outputOffset_ = numFilters_ * outputH_ * outputW_;
-  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
-
-  if (!isSelectAlgo_) {
-    allocConvWorkSpace();
-  }
-
-  isSelectAlgo_ = true;
-}
-
-void ConvOperator::forward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  reshape(batchSize);
-  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
-  checkFilterSize(ins_[1]->value);
-  Matrix::resizeOrCreate(out_->value,
-                         batchSize,
-                         outputH_ * outputW_ * numFilters_,
-                         false,
-                         useGpu_);
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-      real *outData = out_->value->getData() + outputOffset_ * batchId;
-      hl_convolution_forward(imageDesc_,
-                             inputData,
-                             outputDesc_,
-                             outData,
-                             filterDesc_,
-                             wgtData,
-                             convDesc_,
-                             workSpace_,
-                             workSpaceInBytes_,
-                             fwdAlgo_);
-    }
-  }
-}
-
-void ConvOperator::backward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
-      if (ins_[1]->grad) {
-        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_filter(imageDesc_,
-                                       inputData,
-                                       outputDesc_,
-                                       outGrad,
-                                       filterDesc_,
-                                       weightGrad,
-                                       convDesc_,
-                                       workSpace_,
-                                       workSpaceInBytes_,
-                                       bwdFilterAlgo_);
-      }
-
-      MatrixPtr preGrad = ins_[0]->grad;
-      if (NULL != preGrad) {
-        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
-        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_data(imageDesc_,
-                                     inputGrad,
-                                     outputDesc_,
-                                     outGrad,
-                                     filterDesc_,
-                                     wgtData,
-                                     convDesc_,
-                                     workSpace_,
-                                     workSpaceInBytes_,
-                                     bwdDataAlgo_);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvOperator.h b/paddle/gserver/layers/ConvOperator.h
deleted file mode 100644
index fbdb7bb1cd2b81bd72912dffdc9d059c520068a8..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvOperator.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "ConvBaseOperator.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvOperator : public ConvBaseOperator {
-public:
-  ConvOperator(const OperatorConfig &config, bool useGpu)
-      : ConvBaseOperator(config, useGpu) {}
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvOperator() {}
-  void forward() override;
-  void backward() override;
-  void reshape(int batchSize) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
deleted file mode 100644
index f382e6cab12a833ce555c948f41e1086093bd78e..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvProjection.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(conv, ConvProjection);
-
-size_t ConvProjection::calOutputSize() {
-  imageH_ = in_->getFrameHeight();
-  imageW_ = in_->getFrameWidth();
-  if (imageH_ == 0) imageH_ = configImgH_;
-  if (imageW_ == 0) imageW_ = configImgW_;
-  outputH_ = outputSize(imageH_,
-                        (filterH_ - 1) * dilationH_ + 1,
-                        paddingH_,
-                        strideH_,
-                        /* caffeMode */ true);
-  outputW_ = outputSize(imageW_,
-                        (filterW_ - 1) * dilationW_ + 1,
-                        paddingW_,
-                        strideW_,
-                        /* caffeMode */ true);
-
-  const_cast<Argument *>(out_)->setFrameHeight(outputH_);
-  const_cast<Argument *>(out_)->setFrameWidth(outputW_);
-
-  inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_;
-  outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_;
-  return outputH_ * outputW_ * configNumFilters_;
-}
-
-size_t ConvProjection::calInputSize() {
-  return static_cast<size_t>(configChannels_ * imageH_ * imageW_);
-}
-
-void ConvProjection::forward() {
-  int batchSize = in_->value->getHeight();
-  reshape(batchSize);
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
-
-    real *inputData = in_->value->getData() + g * inputOffset_;
-    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-    real *outData = out_->value->getData() + g * outputOffset_;
-    hl_convolution_forward(imageDesc_,
-                           inputData,
-                           outputDesc_,
-                           outData,
-                           filterDesc_,
-                           wgtData,
-                           convDesc_,
-                           workSpace,
-                           fwdLimitBytes_,
-                           fwdAlgo_);
-  }
-}
-
-void ConvProjection::backward(const UpdateCallback &callback) {
-  REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    real *outGrad = out_->grad->getData() + g * outputOffset_;
-    if (weight_->getWGrad()) {
-      real *inputData = in_->value->getData() + g * inputOffset_;
-      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
-      hl_convolution_backward_filter(imageDesc_,
-                                     inputData,
-                                     outputDesc_,
-                                     outGrad,
-                                     filterDesc_,
-                                     weightGrad,
-                                     convDesc_,
-                                     workSpace,
-                                     bwdFilterLimitBytes_,
-                                     bwdFilterAlgo_);
-    }
-
-    MatrixPtr preGrad = in_->grad;
-    if (NULL != preGrad) {
-      real *inputGrad = preGrad->getData() + g * inputOffset_;
-      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-      hl_convolution_backward_data(imageDesc_,
-                                   inputGrad,
-                                   outputDesc_,
-                                   outGrad,
-                                   filterDesc_,
-                                   wgtData,
-                                   convDesc_,
-                                   workSpace,
-                                   bwdDataLimitBytes_,
-                                   bwdDataAlgo_);
-    }
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h
deleted file mode 100644
index e8ecb99431a421d4b52228600909568b0808649a..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvProjection.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ConvBaseProjection.h"
-#include "paddle/math/MathUtils.h"
-
-namespace paddle {
-
-/**
- * @brief Convolution projection do the same calculation with CudnnConvLayer.
- */
-class ConvProjection : public ConvBaseProjection {
-public:
-  /**
-   * Constructor.
-   */
-  ConvProjection(const ProjectionConfig& config,
-                 ParameterPtr parameter,
-                 bool useGpu)
-      : ConvBaseProjection(config, parameter, useGpu) {}
-
-  ~ConvProjection() {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-  virtual size_t calOutputSize();
-  virtual size_t calInputSize();
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp
deleted file mode 100644
index fb877710196835e025466f37b5da27bcf80a3db4..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for circular convluation of two vectors,
- * which is used in NEURAL TURING MACHINE.
- * - Input: two vectors, the first is data (batchSize x dataDim)
- * the second is shift weights (batchSize x shiftDim)
- * - Output: a vector (batchSize x dataDim)
- * Assumed that:
- * - a[in]: contains M elements.
- * - b[in]: contains N elements (N should be odd).
- * - c[out]: contains M elements.
- *
- * \f[
- *     c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}
- * \f]
- *
- * In this formula:
- *  - a's index is computed modulo M.
- *  - b's index is comupted modulo N.
- *
- * The config file api is conv_shift_layer.
- */
-
-class ConvShiftLayer : public Layer {
-public:
-  explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ConvShiftLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(conv_shift, ConvShiftLayer);
-
-bool ConvShiftLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void ConvShiftLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dataDim = inV0->getWidth();
-
-  CHECK_EQ(batchSize, inV1->getHeight());
-  CHECK_EQ(dataDim, getSize());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str());
-  outV->circularConv(*inV0, *inV1);
-}
-
-void ConvShiftLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str());
-
-  if (inG0 && inG1) {
-    outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1);
-  } else {
-    CHECK(!inG0 || !inG1) << "Not supported";
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransOperator.cpp b/paddle/gserver/layers/ConvTransOperator.cpp
deleted file mode 100644
index ac41d6f9a4f86364930e27ee401406432e731b65..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvTransOperator.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvTransOperator.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvTransOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-REGISTER_OPERATOR(convt, ConvTransOperator);
-
-void ConvTransOperator::reshape(int batchSize) {
-  outputH_ = ins_[0]->getFrameHeight();
-  outputW_ = ins_[0]->getFrameWidth();
-  if (outputH_ == 0) outputH_ = outputY_;
-  if (outputW_ == 0) outputW_ = outputX_;
-  imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
-  imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_);
-  /// Check that the imageSizes are consistent with config
-  CHECK_EQ(imageH_, imgSizeY_);
-  CHECK_EQ(imageW_, imgSize_);
-  out_->setFrameHeight(imageH_);
-  out_->setFrameWidth(imageW_);
-
-  reshapeImageDescriptors();
-
-  inputOffset_ = numFilters_ * outputH_ * outputW_;
-  outputOffset_ = channels_ * imageH_ * imageW_;
-  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
-
-  if (!isSelectAlgo_) {
-    allocConvWorkSpace();
-  }
-
-  isSelectAlgo_ = true;
-}
-
-void ConvTransOperator::forward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  reshape(batchSize);
-  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
-  checkFilterSize(ins_[1]->value);
-  Matrix::resizeOrCreate(
-      out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_);
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-      real *outData = out_->value->getData() + outputOffset_ * batchId;
-      hl_convolution_backward_data(imageDesc_,
-                                   outData,
-                                   outputDesc_,
-                                   inputData,
-                                   filterDesc_,
-                                   wgtData,
-                                   convDesc_,
-                                   workSpace_,
-                                   workSpaceInBytes_,
-                                   bwdDataAlgo_);
-    }
-  }
-}
-
-void ConvTransOperator::backward() {
-  size_t batchSize = ins_[0]->value->getHeight();
-  {
-    AsyncGpuBlock block;
-    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
-      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
-      if (ins_[1]->grad) {
-        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
-        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_filter(imageDesc_,
-                                       outGrad,
-                                       outputDesc_,
-                                       inputData,
-                                       filterDesc_,
-                                       weightGrad,
-                                       convDesc_,
-                                       workSpace_,
-                                       workSpaceInBytes_,
-                                       bwdFilterAlgo_);
-      }
-
-      MatrixPtr preGrad = ins_[0]->grad;
-      if (NULL != preGrad) {
-        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
-        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-        hl_convolution_forward(imageDesc_,
-                               outGrad,
-                               outputDesc_,
-                               inputGrad,
-                               filterDesc_,
-                               wgtData,
-                               convDesc_,
-                               workSpace_,
-                               workSpaceInBytes_,
-                               fwdAlgo_);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransOperator.h b/paddle/gserver/layers/ConvTransOperator.h
deleted file mode 100644
index 1bf58f2bfb78ae7dee433455ece37d908b113045..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvTransOperator.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include "ConvBaseOperator.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief ConvTransOperator takes two inputs to perform the convolution.
- * The first input is the image, and the second input is the convolution kernel.
- * The height of data for two inputs are the same. Each data of the first input
- * is convolved with each data of the second input indepedently.
- *
- * The config file api is conv_operator.
- */
-
-class ConvTransOperator : public ConvBaseOperator {
-public:
-  ConvTransOperator(const OperatorConfig &config, bool useGpu)
-      : ConvBaseOperator(config, useGpu) {}
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvTransOperator() {}
-  void forward() override;
-  void backward() override;
-  void reshape(int batchSize) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp
deleted file mode 100644
index 242ce34a607057069a4d0a31e9b70d56279d37ab..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvTransProjection.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvTransProjection.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_PROJECTION(convt, ConvTransProjection);
-size_t ConvTransProjection::calOutputSize() {
-  outputH_ = in_->getFrameHeight();
-  outputW_ = in_->getFrameWidth();
-  if (outputH_ == 0) outputH_ = configOutH_;
-  if (outputW_ == 0) outputW_ = configOutW_;
-  imageH_ = imageSize(outputH_,
-                      (filterH_ - 1) * dilationH_ + 1,
-                      paddingH_,
-                      strideH_,
-                      /* caffeMode */ true);
-
-  imageW_ = imageSize(outputW_,
-                      (filterW_ - 1) * dilationW_ + 1,
-                      paddingW_,
-                      strideW_,
-                      /* caffeMode */ true);
-
-  const_cast<Argument *>(out_)->setFrameHeight(imageH_);
-  const_cast<Argument *>(out_)->setFrameWidth(imageW_);
-
-  inputOffset_ = (configChannels_ / groups_) * outputH_ * outputW_;
-  outputOffset_ = (configNumFilters_ / groups_) * imageH_ * imageW_;
-  return imageH_ * imageW_ * configNumFilters_;
-}
-
-size_t ConvTransProjection::calInputSize() {
-  return static_cast<size_t>(configChannels_ * outputH_ * outputW_);
-}
-
-void ConvTransProjection::forward() {
-  int batchSize = in_->value->getHeight();
-  reshape(batchSize);
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str());
-
-    real *inData = in_->value->getData() + g * inputOffset_;
-    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-    real *outData = out_->value->getData() + g * outputOffset_;
-    hl_convolution_backward_data(imageDesc_,
-                                 outData,
-                                 outputDesc_,
-                                 inData,
-                                 filterDesc_,
-                                 wgtData,
-                                 convDesc_,
-                                 workSpace,
-                                 bwdDataLimitBytes_,
-                                 bwdDataAlgo_);
-  }
-}
-
-void ConvTransProjection::backward(const UpdateCallback &callback) {
-  REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str());
-
-  void *workSpace = NULL;
-  if (workSpaceInBytes_ > 0) {
-    workSpace = getSpaceBytes(workSpaceInBytes_);
-  }
-
-  for (int g = 0; g < groups_; ++g) {
-    real *outGrad = out_->grad->getData() + g * outputOffset_;
-    if (weight_->getWGrad()) {
-      real *inData = in_->value->getData() + g * inputOffset_;
-      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
-      hl_convolution_backward_filter(imageDesc_,
-                                     outGrad,
-                                     outputDesc_,
-                                     inData,
-                                     filterDesc_,
-                                     weightGrad,
-                                     convDesc_,
-                                     workSpace,
-                                     bwdFilterLimitBytes_,
-                                     bwdFilterAlgo_);
-    }
-
-    MatrixPtr preGrad = in_->grad;
-    if (NULL != preGrad) {
-      real *inGrad = preGrad->getData() + g * inputOffset_;
-      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-      hl_convolution_forward(imageDesc_,
-                             outGrad,
-                             outputDesc_,
-                             inGrad,
-                             filterDesc_,
-                             wgtData,
-                             convDesc_,
-                             workSpace,
-                             fwdLimitBytes_,
-                             fwdAlgo_);
-    }
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransProjection.h b/paddle/gserver/layers/ConvTransProjection.h
deleted file mode 100644
index 269b2694c82ea076102633537d7c961139a19a43..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvTransProjection.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ConvBaseProjection.h"
-#include "paddle/math/MathUtils.h"
-
-namespace paddle {
-
-/**
- * @brief Convolution projection do the same calculation with CudnnConvLayer.
- */
-class ConvTransProjection : public ConvBaseProjection {
-public:
-  /**
-   * Constructor.
-   */
-  ConvTransProjection(const ProjectionConfig& config,
-                      ParameterPtr parameter,
-                      bool useGpu)
-      : ConvBaseProjection(config, parameter, useGpu) {}
-
-  ~ConvTransProjection() {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-  virtual size_t calOutputSize();
-  virtual size_t calInputSize();
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
deleted file mode 100644
index dce751940c1bf1695a034a3c551412dcb9b7b8b5..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for weighted sum of vectors,
- * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND
- * TRANSLATE
- * - Input: the the size of the first input is weightDim,
- *          and the size of the second input is weightdim * dataDim.
- * - Output: the sizeof the output is dataDim
- * \f[
- *   out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)),
- *               i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1)
- * \f]
- * Note that the above computation is for one sample. Multiple samples are
- * processed in one batch.
- *
- * The config file api is linear_comb_layer.
- */
-class ConvexCombinationLayer : public Layer {
-protected:
-  /// A matrix pointer pointing to second input.
-  MatrixPtr tmpMtx0;
-  /// A matrix pointer pointing to first input.
-  MatrixPtr tmpRow0;
-  /// A matrix pointer pointing to output.
-  MatrixPtr tmpRow1;
-
-public:
-  explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ConvexCombinationLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(convex_comb, ConvexCombinationLayer);
-
-bool ConvexCombinationLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(2U, inputLayers_.size());
-  size_t dataDim = getSize();
-  size_t weightDim = inputLayers_[0]->getSize();
-
-  CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize())
-      << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           weightDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow1 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ weightDim,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-
-  return true;
-}
-
-void ConvexCombinationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t weightDim = inV0->getWidth();
-  size_t dataDim = getSize();
-
-  CHECK_EQ(batchSize, inV1->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  REGISTER_TIMER_INFO("FwCvxCombTimer", getName().c_str());
-  for (size_t i = 0; i < batchSize; i++) {
-    tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
-    tmpRow0->setData(inV0->getData() + i * weightDim);
-    tmpRow1->setData(outV->getData() + i * dataDim);
-
-    tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0);
-  }
-}
-
-void ConvexCombinationLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t weightDim = inV0->getWidth();
-  size_t dataDim = getSize();
-
-  REGISTER_TIMER_INFO("BwCvxCombTimer", getName().c_str());
-
-  if (inG0) {
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpRow0->setData(inG0->getData() + i * weightDim);
-      tmpRow1->setData(outG->getData() + i * dataDim);
-      tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
-
-      tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1);
-    }
-  }
-
-  if (inG1) {
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpRow0->setData(inV0->getData() + i * weightDim);
-      tmpRow1->setData(outG->getData() + i * dataDim);
-      tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim);
-
-      tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp
deleted file mode 100644
index 4e44a5e8dfdad98bff0cd0f405b4227340a45728..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CosSimLayer.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CosSimLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(cos, CosSimLayer);
-
-bool CosSimLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2LU);
-
-  createFunction(forward_,
-                 "CosSimForward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-  createFunction(backward_,
-                 "CosSimBackward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-
-  return true;
-}
-
-void CosSimLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
-
-  {
-    REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str());
-    reserveOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  /* activation */ {
-    REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str());
-    MatrixPtr prevOut1 = getInputValue(0);
-    MatrixPtr prevOut2 = getInputValue(1);
-
-    CHECK(outV && prevOut1 && prevOut2);
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*prevOut1);
-    inputs.addArg(*prevOut2);
-    outputs.addArg(*outV, ASSIGN_TO);
-    forward_[0]->calc(inputs, outputs);
-  }
-}
-
-void CosSimLayer::backward(const UpdateCallback& callback) {
-  /* activation */ {
-    REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str());
-    CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed";
-
-    const auto outG = this->getOutputGrad();
-    const auto outV = this->getOutputValue();
-    const auto inV1 = this->getInputValue(0);
-    const auto inV2 = this->getInputValue(1);
-    auto inG1 = this->getInputGrad(0);
-    auto inG2 = this->getInputGrad(1);
-    CHECK(outG && outV && inV1 && inV2 && inG1 && inG2);
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*outG);
-    inputs.addArg(*outV);
-    inputs.addArg(*inV1);
-    inputs.addArg(*inV2);
-    outputs.addArg(*inG1, ADD_TO);
-    outputs.addArg(*inG2, ADD_TO);
-
-    backward_[0]->calc(inputs, outputs);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h
deleted file mode 100644
index 675cdb16b563faa7acf9e701096bd334ed661160..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CosSimLayer.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-/**
- * @brief A layer for calculating cosine similarity between two vector
- * \f[
- * f(x,y)=scale\frac{x_1y_1+x_2y_2+...+x_ny_n}{\sqrt{x_1^2+x_2^2+...
- * +x_n^2}\sqrt{y_1^2+y_2^2+...+y_n^2}}
- * \f]
- *
- * - Input1: A vector (batchSize * dataDim) *
- * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) *
- * - Output: A vector (batchSize * 1)
- *
- * The config file api is cos_sim.
- */
-class CosSimLayer : public Layer {
-public:
-  explicit CosSimLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~CosSimLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp
deleted file mode 100644
index 685b4e8ef376b76b3058eeba82d803d460e7105c..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CosSimVecMatLayer.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-/**
- * @brief A layer for computing cosine similarity between a vector
- * and each row of a matrix
- * out[i] = cos_scale * cos(in1, in2(i,:));
- * @note used in NEURAL TURING MACHINE
- *
- * Input1: a vector (batchSize * dataDim)
- *
- * Input2: a matrix in vector form (batchSize * (weightDim*dataDim))
- *
- * Output: a vector (batchSize * weightDim)
- */
-
-class CosSimVecMatLayer : public Layer {
-protected:
-  MatrixPtr tmpMtx0;
-  MatrixPtr tmpMtx1;
-  MatrixPtr tmpRow0;
-  MatrixPtr tmpRow1;
-  MatrixPtr tmpRow2;
-  MatrixPtr tmpRow3;
-
-public:
-  explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~CosSimVecMatLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(cos_vm, CosSimVecMatLayer);
-
-bool CosSimVecMatLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  size_t dataDim = inputLayers_[0]->getSize();
-  size_t numKeys = getSize();
-  size_t memoryDim = inputLayers_[1]->getSize();
-
-  CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow1 = Matrix::create(nullptr,
-                           /* height= */ 1,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow2 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           1,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpRow3 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           1,
-                           /* trans= */ false,
-                           useGpu_);
-
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-  tmpMtx1 = Matrix::create(nullptr,
-                           /* height= */ numKeys,
-                           dataDim,
-                           /* trans= */ false,
-                           useGpu_);
-
-  CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1);
-
-  createFunction(forward_,
-                 "CosSimForward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-  createFunction(backward_,
-                 "CosSimBackward",
-                 FuncConfig().set("scale", (real)config_.cos_scale()));
-
-  return true;
-}
-
-void CosSimVecMatLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t numKeys = getSize();
-
-  CHECK_EQ(batchSize, inV1->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, numKeys);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  CHECK(outV && inV0 && inV1);
-  REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str());
-  for (size_t i = 0; i < batchSize; i++) {
-    tmpRow0->setData(inV0->rowBuf(i));
-    tmpMtx0->setData(inV1->rowBuf(i));
-    tmpRow2->setData(outV->rowBuf(i));
-
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*tmpMtx0);
-    inputs.addArg(*tmpRow0);
-    outputs.addArg(*tmpRow2, ASSIGN_TO);
-    forward_[0]->calc(inputs, outputs);
-  }
-}
-
-void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
-  CHECK_EQ(backward_.size(), 1UL) << "Only one forward function needed";
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV0->getHeight();
-  CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG);
-  REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str());
-
-  for (size_t i = 0; i < batchSize; i++) {
-    tmpRow0->setData(inV0->rowBuf(i));
-    tmpRow1->setData(inG0->rowBuf(i));
-    tmpMtx0->setData(inV1->rowBuf(i));
-    tmpMtx1->setData(inG1->rowBuf(i));
-    tmpRow2->setData(outV->rowBuf(i));
-    tmpRow3->setData(outG->rowBuf(i));
-
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*tmpRow3);
-    inputs.addArg(*tmpRow2);
-    inputs.addArg(*tmpMtx0);
-    inputs.addArg(*tmpRow0);
-    outputs.addArg(*tmpMtx1, ADD_TO);
-    outputs.addArg(*tmpRow1, ADD_TO);
-
-    backward_[0]->calc(inputs, outputs);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
deleted file mode 100644
index 484f803a8387a16152c5911d7d5c72b0111283ae..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CostLayer.cpp
+++ /dev/null
@@ -1,748 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CostLayer.h"
-#include <algorithm>
-#include <cmath>
-#include <memory>
-#include "paddle/utils/Logging.h"
-
-#include "paddle/math/SparseMatrix.h"
-
-namespace paddle {
-
-bool CostLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  bool ret = Layer::init(layerMap, parameterMap);
-  coeff_ = config_.coeff();
-  if (!ret) return ret;
-  CHECK_GE(inputLayers_.size(), 2UL);
-  CHECK_LE(inputLayers_.size(), 3UL);
-  if (inputLayers_.size() == 3) {
-    weightLayer_ = inputLayers_[2];
-  }
-  return true;
-}
-
-void CostLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(*getOutputLayer())->getHeight();
-  int size = 1;
-  resetOutput(batchSize, size);
-
-  const MatrixPtr& output = getInputValue(*getOutputLayer());
-  Argument label = getInput(*getLabelLayer());
-
-  /* get the cost value for each sample*/
-  forwardImp(*output, label, *getOutputValue());
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    getOutputValue()->dotMul(*getOutputValue(), *weight);
-  }
-}
-
-void CostLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  const Argument& output = getInput(*getOutputLayer());
-  Argument label = getInput(*getLabelLayer());
-
-  bool support = true;
-  if (weightLayer_) {
-    support = output.grad->getAbsSum() == 0;
-  }
-
-  backwardImp(*output.value, label, *output.grad);
-
-  if (weightLayer_) {
-    CHECK(support) << "Weighted cost layer '" << getName()
-                   << "' must be the last layer "
-                      "connected to the output layer '"
-                   << getOutputLayer()->getName() << "'";
-    output.grad->rowScale(0, *output.grad, *getInputValue(*weightLayer_));
-  }
-  if (coeff_ != real(1.0f)) {
-    output.grad->add(coeff_, 0);
-  }
-}
-
-//
-// class MultiClassCrossEntropy
-//
-bool MultiClassCrossEntropy::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void MultiClassCrossEntropy::forwardImp(Matrix& output,
-                                        Argument& label,
-                                        Matrix& target) {
-  target.oneHotCrossEntropy(output, *label.ids);
-}
-
-void MultiClassCrossEntropy::backwardImp(Matrix& output,
-                                         Argument& label,
-                                         Matrix& outputG) {
-  outputG.oneHotCrossEntropyBp(output, *label.ids);
-}
-
-//
-// class MultiClassCrossEntropyWithSelfNorm
-//
-REGISTER_LAYER(multi_class_cross_entropy_with_selfnorm,
-               MultiClassCrossEntropyWithSelfNorm);
-
-bool MultiClassCrossEntropyWithSelfNorm::init(
-    const LayerMap& layerMap, const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
-                                                    Argument& label,
-                                                    Matrix& target) {
-  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
-  output.rowSum(*sftMaxSum_);
-  sftMaxSum_->log2();
-
-  target.oneHotCrossEntropy(output, *label.ids);
-  target.add(*sftMaxSum_);
-
-  sftMaxSum_->square2();
-  target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
-}
-
-void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
-                                                     Argument& label,
-                                                     Matrix& outputG) {
-  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
-  output.rowSum(*sftMaxSum_);
-
-  Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
-  sftMaxSum_->reciprocal2(*sumInv_);
-
-  outputG.oneHotCrossEntropyBp(output, *label.ids);
-  outputG.addColumnVector(*sumInv_);
-
-  sftMaxSum_->log2();
-  sumInv_->dotMul(*sumInv_, *sftMaxSum_);
-  sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());
-
-  outputG.addColumnVector(*sumInv_);
-}
-
-//
-// class SoftBinaryClassCrossEntropy
-//
-REGISTER_LAYER(soft_binary_class_cross_entropy, SoftBinaryClassCrossEntropy);
-
-bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap,
-                                       const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output,
-                                             Argument& label,
-                                             Matrix& target) {
-  Matrix::resizeOrCreate(
-      targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
-
-  targetPerDim_->softCrossEntropy(output, *label.value);
-  targetPerDim_->rowSum(target);
-}
-
-void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output,
-                                              Argument& label,
-                                              Matrix& outputG) {
-  outputG.softCrossEntropyBp(output, *label.value);
-}
-
-//
-// class SumOfSquaresCostLayer
-//
-
-REGISTER_LAYER(square_error, SumOfSquaresCostLayer);
-
-bool SumOfSquaresCostLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void SumOfSquaresCostLayer::forwardImp(Matrix& output,
-                                       Argument& label,
-                                       Matrix& target) {
-  target.sumOfSquares(output, *label.value);
-}
-
-void SumOfSquaresCostLayer::backwardImp(Matrix& output,
-                                        Argument& label,
-                                        Matrix& outputG) {
-  outputG.sumOfSquaresBp(output, *label.value);
-}
-
-//
-// class SmoothL1CostLayer
-//
-
-REGISTER_LAYER(smooth_l1, SmoothL1CostLayer);
-
-bool SmoothL1CostLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void SmoothL1CostLayer::forwardImp(Matrix& output,
-                                   Argument& label,
-                                   Matrix& target) {
-  MatrixPtr targetCpu, outputCpu, labelCpu;
-  if (useGpu_) {
-    targetCpu =
-        Matrix::create(target.getHeight(), target.getWidth(), false, false);
-    outputCpu =
-        Matrix::create(output.getHeight(), output.getWidth(), false, false);
-    labelCpu = Matrix::create(
-        label.value->getHeight(), label.value->getWidth(), false, false);
-    targetCpu->copyFrom(target);
-    outputCpu->copyFrom(output);
-    labelCpu->copyFrom(*label.value);
-    targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0);
-    target.copyFrom(*targetCpu);
-  } else {
-    target.smoothL1(output, *label.value, 1.0);
-  }
-}
-
-void SmoothL1CostLayer::backwardImp(Matrix& output,
-                                    Argument& label,
-                                    Matrix& outputG) {
-  MatrixPtr outputGCpu, outputCpu, labelCpu;
-  if (useGpu_) {
-    outputGCpu =
-        Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false);
-    outputCpu =
-        Matrix::create(output.getHeight(), output.getWidth(), false, false);
-    labelCpu = Matrix::create(
-        label.value->getHeight(), label.value->getWidth(), false, false);
-    outputGCpu->copyFrom(outputG);
-    outputCpu->copyFrom(output);
-    labelCpu->copyFrom(*label.value);
-    outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0);
-    outputG.copyFrom(*outputGCpu);
-  } else {
-    outputG.smoothL1Bp(output, *label.value, 1.0);
-  }
-}
-
-//
-// class RankingCost
-//
-bool RankingCost::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  posPairCount_ = 0;
-  negPairCount_ = 0;
-
-  bool ret = Layer::init(layerMap, parameterMap);
-  if (!ret) return ret;
-  CHECK_GE(inputLayers_.size(), 3UL);
-  CHECK_LE(inputLayers_.size(), 4UL);
-  if (inputLayers_.size() == 4) {
-    weightLayer_ = inputLayers_[3];
-  }
-  return true;
-}
-
-void RankingCost::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(*getOutputLayer(0))->getHeight();
-  int size = 1;
-  resizeOutput(batchSize, size);
-  Matrix::resizeOrCreate(margin_, batchSize, size, /* trans= */ false, useGpu_);
-  MatrixPtr label = getInputValue(*getLabelLayer());
-  if (!label) {
-    // input label is not in value, try ids
-    IVectorPtr idLabel = getInput(*getLabelLayer()).ids;
-    CHECK(idLabel) << "label layer has neither value nor ids";
-    CHECK_EQ((size_t)batchSize, idLabel->getSize());
-    Matrix::resizeOrCreate(
-        labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_);
-    labelBuf_->copyFrom(*idLabel);
-    label = labelBuf_;
-  }
-
-  MatrixPtr output[] = {getInputValue(*getOutputLayer(0)),
-                        getInputValue(*getOutputLayer(1))};
-  MatrixPtr target = this->getOutputValue();
-  margin_->sub(*output[0], *output[1]);
-
-  // for validation
-  size_t height = output[0]->getHeight();
-  target->biggerThan(*(output[0]), *(output[1]), *label);
-  double total = static_cast<double>(height);
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    target->dotMul(*target, *weight);
-    total = weight->getSum();
-  }
-  double pos = target->getSum();
-  posPairCount_ += pos;
-  negPairCount_ += (total - pos);
-
-  // forward
-  target->logisticRegressionLoss(*margin_, *label);
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    target->dotMul(*target, *weight);
-  }
-}
-
-void RankingCost::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr label = getInputValue(*getLabelLayer());
-  if (!label) {
-    // input label is not in value, but in ids
-    // use labelBuf_ (should already resized and copied during forward)
-    label = labelBuf_;
-  }
-
-  Matrix::resizeOrCreate(
-      marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_);
-  marginGrad_->zeroMem();
-  marginGrad_->logisticRegressionLossBp(*margin_, *label);
-  if (weightLayer_) {
-    const MatrixPtr& weight = getInputValue(*weightLayer_);
-    marginGrad_->dotMul(*marginGrad_, *weight);
-  }
-
-  getInputGrad(0)->add(*marginGrad_);
-  getInputGrad(1)->sub(*marginGrad_);
-}
-
-void RankingCost::onPassEnd() {
-  double ratio = posPairCount_ / ((negPairCount_ <= 0) ? 1.0 : negPairCount_);
-  LOG(INFO) << "calc pos/neg: " << ratio << " pos= " << posPairCount_
-            << " neg= " << negPairCount_;
-
-  posPairCount_ = 0;
-  negPairCount_ = 0;
-}
-
-//
-// class LambdaCost
-//
-REGISTER_LAYER(lambda_cost, LambdaCost);
-
-bool LambdaCost::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  truncationSize_ = config_.ndcg_num();
-  maxSortSize_ = config_.max_sort_size();
-  if (maxSortSize_ != -1) {
-    CHECK_GE(maxSortSize_, truncationSize_)
-        << "maxSortSize must be greater than or equal to NDCG size!";
-  }
-  LOG(INFO) << "LambdaRank v1.3, NDCG size = " << truncationSize_
-            << ", Max partial sort size = " << maxSortSize_;
-  CHECK(!useGpu_) << "LambdaRank supports CPU only!";
-  return Layer::init(layerMap, parameterMap);
-}
-
-void LambdaCost::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(*getOutputLayer())->getHeight();
-  resizeOutput(batchSize, 1);
-
-  MatrixPtr score = getInputValue(*getScoreLayer());
-  MatrixPtr output = getInputValue(*getOutputLayer());
-  MatrixPtr target = this->getOutputValue();
-
-  real* scoreData = score->getData();
-  real* outputData = output->getData();
-  real* targetData = target->getData();
-
-  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
-  const int* startPosData = startPos->getData(false);
-  size_t batchNum = startPos->getSize() - 1;
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    real NDCG = calcNDCG(
-        outputData + beginPos, scoreData + beginPos, endPos - beginPos);
-    for (int j = beginPos; j < endPos; ++j) {
-      targetData[j] = NDCG;
-    }
-  }
-}
-
-void LambdaCost::backward(const UpdateCallback& callback) {
-  (void)callback;
-  MatrixPtr score = getInputValue(*getScoreLayer());
-  MatrixPtr output = getInputValue(*getOutputLayer());
-  Matrix::resizeOrCreate(marginGrad_,
-                         score->getHeight(),
-                         1,
-                         /* trans= */ false,
-                         useGpu_);
-  marginGrad_->zeroMem();
-
-  real* gradData = marginGrad_->getData();
-  real* scoreData = score->getData();
-  real* outputData = output->getData();
-
-  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
-  const int* startPosData = startPos->getData(false);
-  size_t batchNum = startPos->getSize() - 1;
-
-  for (size_t i = 0; i < batchNum; ++i) {
-    int beginPos = startPosData[i];
-    int endPos = startPosData[i + 1];
-    calcGrad(outputData + beginPos,
-             scoreData + beginPos,
-             gradData + beginPos,
-             endPos - beginPos);
-  }
-
-  getInputGrad(0)->add(*marginGrad_);
-}
-
-void LambdaCost::calcGrad(const real* outputScore,
-                          const real* score,
-                          real* gradData,
-                          int size) {
-  CHECK_GE(size, truncationSize_)
-      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
-  int sortSize = maxSortSize_ == -1 ? size : std::min(maxSortSize_, size);
-
-  scorePair_.clear();
-  for (int i = 0; i < size; ++i) {
-    scorePair_.push_back(std::make_pair(score[i], i));
-  }
-  if (size <= sortSize) {
-    std::sort(scorePair_.begin(),
-              scorePair_.end(),
-              [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-                return a.first > b.first;
-              });
-  } else {
-    std::partial_sort(
-        scorePair_.begin(),
-        scorePair_.begin() + sortSize,
-        scorePair_.end(),
-        [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-          return a.first > b.first;
-        });
-  }
-
-  real maxDCG = 0;
-  for (int i = 0; i < truncationSize_; ++i) {
-    maxDCG += (std::pow(2, scorePair_[i].first) - 1) / std::log(i + 2);
-  }
-  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
-
-  for (int i = 0; i < sortSize; ++i) {
-    for (int j = i + 1; j < size; ++j) {
-      int index_i = scorePair_[i].second;
-      int index_j = scorePair_[j].second;
-      real score_i = score[index_i];
-      real score_j = score[index_j];
-      real dcgDif = 0;
-      if (j < sortSize) {
-        dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) *
-                 (1 / std::log(i + 2) - 1 / std::log(j + 2));
-      } else {
-        dcgDif =
-            (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2);
-      }
-
-      real lambda_ij =
-          -std::abs(dcgDif) /
-          (1 + std::exp(outputScore[index_i] - outputScore[index_j]));
-      gradData[index_i] += lambda_ij / maxDCG;
-      gradData[index_j] -= lambda_ij / maxDCG;
-    }
-  }
-}
-
-real LambdaCost::calcNDCG(const real* outputScore,
-                          const real* score,
-                          int size) {
-  CHECK_GE(size, truncationSize_)
-      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
-
-  outputScorePair_.clear();
-  for (int i = 0; i < size; ++i) {
-    outputScorePair_.push_back(std::make_pair(outputScore[i], i));
-  }
-  std::partial_sort(
-      outputScorePair_.begin(),
-      outputScorePair_.begin() + truncationSize_,
-      outputScorePair_.end(),
-      [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
-        return a.first > b.first;
-      });
-
-  real DCG = 0;
-  for (int i = 0; i < truncationSize_; ++i) {
-    DCG +=
-        (std::pow(2, score[outputScorePair_[i].second]) - 1) / std::log(i + 2);
-  }
-
-  scoreVec_.resize(size);
-  std::copy(score, score + size, scoreVec_.begin());
-  real maxDCG = 0;
-  std::partial_sort(scoreVec_.begin(),
-                    scoreVec_.begin() + truncationSize_,
-                    scoreVec_.end(),
-                    std::greater<real>());
-  for (int i = 0; i < truncationSize_; ++i) {
-    maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2);
-  }
-  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
-
-  return DCG / maxDCG;
-}
-
-//
-// class MultiBinaryLabelCrossEntropy
-//
-
-REGISTER_LAYER(multi_binary_label_cross_entropy, MultiBinaryLabelCrossEntropy);
-
-bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap,
-                                        const ParameterMap& parameterMap) {
-  return CostLayer::init(layerMap, parameterMap);
-}
-
-void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output,
-                                              Argument& label,
-                                              Matrix& target) {
-  MatrixPtr value = nullptr;
-  if (label.ids) {
-    CHECK(!label.value);
-    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
-  } else {
-    CHECK(label.value);
-    value = label.value;
-  }
-
-  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
-      dynamic_cast<GpuSparseMatrix*>(value.get())) {
-    target.multiBinaryLabelCrossEntropy(output, *value);
-  } else {
-    Matrix::resizeOrCreate(
-        targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
-
-    targetPerDim_->binaryLabelCrossEntropy(output, *value);
-    targetPerDim_->rowSum(target);
-  }
-}
-
-void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
-                                               Argument& label,
-                                               Matrix& outputG) {
-  MatrixPtr value = nullptr;
-  if (label.ids) {
-    CHECK(!value);
-    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
-  } else {
-    CHECK(label.value);
-    value = label.value;
-  }
-
-  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
-      dynamic_cast<GpuSparseMatrix*>(value.get())) {
-    outputG.multiBinaryLabelCrossEntropyBp(output, *value);
-  } else {
-    outputG.binaryLabelCrossEntropyBp(output, *value);
-  }
-}
-
-bool HuberCost::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  CostLayer::init(layerMap, parameterMap);
-  if (useGpu_) {
-    tmpCpuInput_.reserve(inputLayers_.size());
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_.push_back(Argument());
-    }
-  }
-  return true;
-}
-
-void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
-  if (useGpu_) {
-    for (size_t i = 0; i < inputLayers_.size(); i++) {
-      tmpCpuInput_[i].resizeAndCopyFrom(
-          getInput(i), false, HPPL_STREAM_DEFAULT);
-    }
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-}
-
-//
-// Huber loss for robust regression.
-//
-REGISTER_LAYER(huber_regression, HuberRegressionLoss);
-
-bool HuberRegressionLoss::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  HuberCost::init(layerMap, parameterMap);
-  delta_ = config_.delta();
-  return true;
-}
-
-void HuberRegressionLoss::forwardImp(Matrix& output,
-                                     Argument& label,
-                                     Matrix& target) {
-  HuberCost::forwardImp(output, label, target);
-  size_t numSamples = target.getHeight();
-  size_t dim = output.getWidth();
-  CHECK(label.value);
-  CHECK_EQ((*label.value).getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(dim, (*label.value).getWidth());
-  CHECK_EQ(target.getWidth(), (size_t)1);
-
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  real* lbl =
-      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
-  std::vector<real> cost(numSamples, 0);
-  for (size_t i = 0; i < numSamples; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      int index = i * dim + j;
-      real a = std::abs(lbl[index] - out[index]);
-      if (a <= delta_)
-        cost[i] += a * a / 2;
-      else
-        cost[i] += delta_ * (a - delta_ / 2);
-    }
-  }
-  target.copyFrom(cost.data(), numSamples);
-}
-
-void HuberRegressionLoss::backwardImp(Matrix& output,
-                                      Argument& label,
-                                      Matrix& outputG) {
-  size_t numSamples = output.getHeight();
-  size_t dim = output.getWidth();
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  real* lbl =
-      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
-  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    for (size_t j = 0; j < dim; ++j) {
-      int index = i * dim + j;
-      real a = lbl[index] - out[index];
-      if (std::abs(a) <= delta_)
-        grad[index] += -a;
-      else
-        grad[index] += a > 0 ? -delta_ : delta_;
-    }
-  }
-  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
-}
-
-//
-// Huber loss for robust 2-classes classification
-//
-REGISTER_LAYER(huber_classification, HuberTwoClassification);
-
-bool HuberTwoClassification::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  return HuberCost::init(layerMap, parameterMap);
-}
-
-void HuberTwoClassification::forwardImp(Matrix& output,
-                                        Argument& label,
-                                        Matrix& target) {
-  HuberCost::forwardImp(output, label, target);
-  size_t numSamples = target.getHeight();
-  CHECK(label.ids);
-  CHECK_EQ((*label.ids).getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), (size_t)1);
-  CHECK_EQ(target.getWidth(), (size_t)1);
-
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  std::vector<real> cost(numSamples, 0);
-  for (size_t i = 0; i < numSamples; ++i) {
-    int y = 2 * lbl[i] - 1;
-    real a = out[i] * y;
-    if (a < -1)
-      cost[i] = -4 * a;
-    else if (a < 1)
-      cost[i] = (1 - a) * (1 - a);
-  }
-  target.copyFrom(cost.data(), numSamples);
-}
-
-void HuberTwoClassification::backwardImp(Matrix& output,
-                                         Argument& label,
-                                         Matrix& outputG) {
-  size_t numSamples = output.getHeight();
-  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
-  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    int y = 2 * lbl[i] - 1;
-    real a = out[i] * y;
-    if (a < -1)
-      grad[i] += -4 * y;
-    else if (a < 1)
-      grad[i] += -2 * (1 - a) * y;
-  }
-  if (useGpu_) outputG.copyFrom(grad, numSamples);
-}
-/**
- * This cost layer compute the sum of its input as loss.
- * \f[
- * o(i) = \sum_{j=1}^D y_{ij}
- * \f]
- */
-class SumCostLayer : public Layer {
-public:
-  explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    if (!ret) return ret;
-    CHECK_EQ(inputLayers_.size(), 1UL);
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    const MatrixPtr& input = getInputValue(0);
-
-    /* malloc memory for the output_ if necessary */
-    int batchSize = input->getHeight();
-    int size = 1;
-    resizeOutput(batchSize, size);
-    output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0);
-  }
-
-  void backward(const UpdateCallback& callback = nullptr) override {
-    getInputGrad(0)->add((real)1);
-  }
-};
-
-REGISTER_LAYER(sum_cost, SumCostLayer);
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
deleted file mode 100644
index 306c067ed1c040555d2b03996cc0749faf0ea68c..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CostLayer.h
+++ /dev/null
@@ -1,374 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <vector>
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * Base class for a particular type of cost layer.
- * This type of cost should have one data layer, one label layer
- * and an optional weight layer as input.
- * The derived class should implemnt forwardImp() and backwardImp()
- * which calculate the cost for data and label. The weight is automatically
- * handled by the base class.
- */
-class CostLayer : public Layer {
-public:
-  explicit CostLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer() { return inputLayers_[0]; }
-
-  LayerPtr getLabelLayer() { return inputLayers_[1]; }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  virtual void forwardImp(Matrix& outputValue,
-                          Argument& label,
-                          Matrix& cost) = 0;
-
-  virtual void backwardImp(Matrix& outputValue,
-                           Argument& label,
-                           Matrix& outputGrad) = 0;
-
-protected:
-  LayerPtr weightLayer_;
-  real coeff_;
-};
-
-/**
- * The cross-entropy loss for multi-class classification task.
- * The loss function is:
- *
- * \f[
- * L = - \sum_{i}{t_{k} * log(P(y=k))}
- * \f]
- */
-class MultiClassCrossEntropy : public CostLayer {
-public:
-  explicit MultiClassCrossEntropy(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/**
- * The cross-entropy with self-normalization for multi-class classification.
- *
- * The loss function is:
- * \f[
- * L = \sum_{i}[-log(P(x_{i})) + alpha * log(Z(x_{i})^2)]
- * \f]
- *
- * The \f$Z(x)\f$ is the softmax normalizer.
- *
- * [1] Jacob Devlin, Rabih Zbib, Zhongqiang Huang, Thomas Lamar,
- *     Richard Schwartz, and John Makhoul. Fast and robust neural
- *     network joint models for statistical machine translation.
- *     In Proceedings of the ACL 2014 Conference.
- */
-class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
-public:
-  explicit MultiClassCrossEntropyWithSelfNorm(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-
-protected:
-  MatrixPtr sftMaxSum_;
-  MatrixPtr sumInv_;
-};
-
-/**
- * The cross-entropy for soft binary class.
- * \f[
- * L = \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
- * \f]
- */
-class SoftBinaryClassCrossEntropy : public CostLayer {
-public:
-  explicit SoftBinaryClassCrossEntropy(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-
-protected:
-  MatrixPtr targetPerDim_;
-};
-
-/**
- * This cost layer compute Euclidean (L2) loss for real-valued regression
- * tasks.
- * \f[
- * L = \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
- * \f]
- */
-class SumOfSquaresCostLayer : public CostLayer {
-public:
-  explicit SumOfSquaresCostLayer(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/**
- * This cost layer compute smooth L1 loss for real-valued regression
- * tasks.
- * \f[
- * L =
- *   0.5 * x^2    if / -1 < |x| < 1 /
- *   |x| - 0.5    / otherwise /
- * \f]
- *
- * x = output - label
- */
-class SmoothL1CostLayer : public CostLayer {
-public:
-  explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/**
- * A cost layer for learning to rank (LTR) task. This layer contains at leat
- * three inputs.
- * \f[
- *  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
- *  o_{i,j} =  o_i - o_j  \\
- *  \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
- * \f]
- *
- * [1]. Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
- *      Rank useing Gradient Descent.
- */
-class RankingCost : public Layer {
-public:
-  explicit RankingCost(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer(size_t i) { return inputLayers_[i]; }
-
-  LayerPtr getLabelLayer() { return inputLayers_[2]; }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  void onPassEnd() override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {
-    (void)output;
-    (void)label;
-    (void)cost;
-  }
-
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {
-    (void)outputValue;
-    (void)label;
-    (void)outputGrad;
-  }
-
-private:
-  double posPairCount_;
-  double negPairCount_;
-  MatrixPtr margin_;
-  MatrixPtr marginGrad_;
-  /// if input label is put in ids (not value), copy to this buffer.
-  MatrixPtr labelBuf_;
-  LayerPtr weightLayer_;
-};
-
-/**
- * LambdaRank os a method for learning arbitrary information retrieval
- * measures. It can be applied to any algorithm that learns through gradient
- * descent. LambdaRank is a listwise method, in that the cost depends on the
- * sorted order of the documents. LambdaRank gives the gradient of cost
- * function:
- *
- * \f[
- * \lambda_{ij} = \frac{1}{1 + e^{o_i - o_j}} \left| \Delta_{NDCG} \right|
- * \f]
- *
- * [1] Christopher J.C. Burges, Robert Ragno, Quoc Viet Le. Learning to Rank
- *     with Nonsmooth Cost Functions.
- */
-class LambdaCost : public Layer {
-public:
-  explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer() { return inputLayers_[0]; }
-
-  LayerPtr getScoreLayer() { return inputLayers_[1]; }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  real calcNDCG(const real* outputScore, const real* score, int size);
-  void calcGrad(const real* outputScore,
-                const real* score,
-                real* gradData,
-                int size);
-
-private:
-  MatrixPtr marginGrad_;
-  int truncationSize_;
-  int maxSortSize_;
-  std::vector<std::pair<real, int>> scorePair_;
-  std::vector<std::pair<real, int>> outputScorePair_;
-  std::vector<real> scoreVec_;
-};
-
-/**
- * Cross entropy for multi binary labels.
- * \f[
- * cost[i] = -sum(label[i][j]*log(output[i][j]) +
- *            (1-label[i][j])*log(1-output[i][j]))
- * \f]
- */
-class MultiBinaryLabelCrossEntropy : public CostLayer {
-protected:
-  MatrixPtr targetPerDim_;
-
-public:
-  explicit MultiBinaryLabelCrossEntropy(const LayerConfig& config)
-      : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-/*
- * A base layer for HuberRegressionLoss and HuberTwoClassification.
- */
-class HuberCost : public CostLayer {
-public:
-  std::vector<Argument> tmpCpuInput_;
-
-  explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override {}
-};
-
-/**
- * Huber loss for robust regression.
- *
- * Given output f(x), label y and delta, the loss is:
- * Loss = 0.5 * (1 - y * f)^2, if abs(y - f) <= delta \\
- * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
- */
-class HuberRegressionLoss : public HuberCost {
-public:
-  explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-
-protected:
-  real delta_;
-};
-
-/**
- * Huber loss for robust 2-classes classification.
- *
- * For label={0, 1}, let y=2*label-1. Given output f(x), the loss is:
- * Loss = 4 * y * f, if y* f < -1 \\
- * Loss = (1 - y * f)^2, if -1 < y * f < 1  \\
- * Loss = 0, otherwise
- */
-class HuberTwoClassification : public HuberCost {
-public:
-  explicit HuberTwoClassification(const LayerConfig& config)
-      : HuberCost(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
-
-  void backwardImp(Matrix& outputValue,
-                   Argument& label,
-                   Matrix& outputGrad) override;
-};
-
-typedef std::shared_ptr<CostLayer> CostLayerPtr;
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CropLayer.cpp b/paddle/gserver/layers/CropLayer.cpp
deleted file mode 100644
index bc97ca2f9e0cdc86f82baa0ce3fbafde2db0c10f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CropLayer.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CropLayer.h"
-#include "paddle/utils/Stat.h"
-namespace paddle {
-
-REGISTER_LAYER(crop, CropLayer);
-
-bool CropLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_LE(static_cast<int>(inputLayers_.size()), 2);
-  CHECK_GE(static_cast<int>(inputLayers_.size()), 1);
-  crop_axis_ = config_.axis();
-  for (int i = 0; i < config_.offset_size(); i++) {
-    crop_offsets_.push_back(config_.offset(i));
-  }
-
-  // 1. get input_0 shape
-  auto& input0_img_conf = config_.inputs(0).image_conf();
-  inDims_ = TensorShape({0,
-                         input0_img_conf.channels(),
-                         input0_img_conf.has_img_size_y()
-                             ? input0_img_conf.img_size_y()
-                             : input0_img_conf.img_size(),
-                         input0_img_conf.img_size()});
-  // 2. get target dims from config
-  if (config_.inputs_size() == 1) {
-    targetDims_ = TensorShape({config_.shape(0),
-                               config_.shape(1),
-                               config_.shape(2),
-                               config_.shape(3)});
-  } else {
-    // 2. get input_1 shape
-    auto& input1_img_conf = config_.inputs(1).image_conf();
-    targetDims_ = TensorShape({0,
-                               input1_img_conf.channels(),
-                               input1_img_conf.has_img_size_y()
-                                   ? input1_img_conf.img_size_y()
-                                   : input1_img_conf.img_size(),
-                               input1_img_conf.img_size()});
-  }
-
-  // 3. get final crop corner
-  int dimSize = 4;
-  crop_corner_ = {0, 0, 0, 0};
-  for (int i = 0; i < dimSize; i++) {
-    if (i >= crop_axis_) {
-      if (crop_offsets_.size() > 1) {
-        crop_corner_[i] = crop_offsets_[i - crop_axis_];
-      } else {
-        crop_corner_[i] = crop_offsets_[0];
-      }
-    }
-  }
-
-  outDims_ = TensorShape(4);
-
-  createFunction(
-      forward_, "Crop", FuncConfig().set("crop_corner", crop_corner_));
-  createFunction(
-      backward_, "CropGrad", FuncConfig().set("crop_corner", crop_corner_));
-
-  return true;
-}
-
-void CropLayer::setOutDims() {
-  MatrixPtr input = inputLayers_[1]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  // get target dims from input_1
-  if (config_.inputs_size() == 2) {
-    targetDims_.setDim(0, batchSize);
-    int ch = config_.inputs(0).image_conf().channels();
-    if (ch != 0) targetDims_.setDim(1, ch);
-    int h = inputLayers_[1]->getOutput().getFrameHeight();
-    if (h != 0) targetDims_.setDim(2, h);
-    int w = inputLayers_[1]->getOutput().getFrameWidth();
-    if (w != 0) targetDims_.setDim(3, w);
-  }
-  // get final crop shape from target dims and crop axis
-  std::vector<uint32_t> crop_shape;
-  int dimSize = 4;
-  for (int i = 0; i < dimSize; i++) {
-    if (i >= crop_axis_) {
-      crop_shape.push_back(targetDims_[i]);
-    } else {
-      crop_shape.push_back(inDims_[i]);
-    }
-  }
-
-  outDims_.reshape(
-      {crop_shape[0], crop_shape[1], crop_shape[2], crop_shape[3]});
-  output_.setFrameHeight(crop_shape[2]);
-  output_.setFrameWidth(crop_shape[3]);
-}
-
-void CropLayer::setInDims() {
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  inDims_.setDim(0, batchSize);
-  int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_.setDim(2, h);
-  int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_.setDim(3, w);
-}
-
-void CropLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  setInDims();
-  setOutDims();
-  int size = outDims_[1] * outDims_[2] * outDims_[3];
-  resetOutput(outDims_[0], size);
-  MatrixPtr outV = getOutputValue();
-  REGISTER_TIMER_INFO("CropForward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inDims_);
-  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
-  forward_[0]->calc(inputs, outputs);
-}
-
-void CropLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  REGISTER_TIMER_INFO("CropBackward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), outDims_);
-  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
-  backward_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CropLayer.h b/paddle/gserver/layers/CropLayer.h
deleted file mode 100644
index 1a85911ef75e992df587a60cfc9a727eafa4cc76..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CropLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  This layer crop input according to the specify conf.
- *         input_0: input to be cropped
- *         input_1: optional reference input
- *         axis: start dimension to be croped
- *         offset: offset of cropping  in each dimension
- *         shape: if reference input layer was not setted,
- *                  crop input as this shape conf
- */
-class CropLayer : public Layer {
-public:
-  explicit CropLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~CropLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-protected:
-  void setOutDims();
-  void setInDims();
-
-  int32_t crop_axis_;
-  std::vector<uint32_t> crop_offsets_;
-  std::vector<uint32_t> crop_corner_;
-  TensorShape inDims_;
-  TensorShape targetDims_;
-  TensorShape outDims_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
deleted file mode 100644
index 644450291ee8a308accf7a1fe096332cc8c241dc..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "NormLayer.h"
-#include "paddle/math/BaseMatrix.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data,
-                                                    size_t iter,
-                                                    size_t spatialDim) {
-  return Matrix::create(data->getData() + iter * channels_ * spatialDim,
-                        channels_,
-                        spatialDim,
-                        false,
-                        useGpu_);
-}
-
-MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
-                                                     size_t iter,
-                                                     size_t spatialDim) {
-  return Matrix::create(
-      data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
-}
-
-bool CrossChannelNormLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK(parameters_[0]);
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  channels_ = conf.channels();
-  scale_.reset(new Weight(channels_, 1, parameters_[0]));
-  return true;
-}
-
-void CrossChannelNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr inV = getInputValue(0);
-
-  size_t batchSize = inV->getHeight();
-  size_t dataDim = inV->getWidth();
-  CHECK_EQ(getSize(), dataDim);
-
-  reserveOutput(batchSize, dataDim);
-  MatrixPtr outV = getOutputValue();
-  size_t spatialDim = dataDim / channels_;
-
-  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
-  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
-  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
-
-  inV->square2(*dataBuffer_);
-  for (size_t i = 0; i < batchSize; i++) {
-    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
-    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
-    MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim);
-    MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
-
-    // compute norm.
-    spatialBuffer_->sumCols(*dataTmp, 1, 0);
-    // add eps to avoid overflow
-    spatialBuffer_->add(1e-6);
-    spatialBuffer_->sqrt2(*spatialBuffer_);
-    normTmp->copyFrom(*spatialBuffer_);
-    outVTmp->copyFrom(*inVTmp);
-    outVTmp->divRowVector(*spatialBuffer_);
-    // scale the layer.
-    outVTmp->mulColVector(*scale_->getW());
-  }
-}
-
-void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr outV = getOutputValue();
-
-  size_t batchSize = inG->getHeight();
-  size_t dataDim = inG->getWidth();
-  size_t spatialDim = dataDim / channels_;
-
-  MatrixPtr inGBuffer;
-  Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
-
-  dataBuffer_->dotMul(*outG, *outV);
-  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
-  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
-  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
-  scaleDiff_->zeroMem();
-  for (size_t i = 0; i < batchSize; i++) {
-    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
-    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
-    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
-    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
-    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
-
-    channelBuffer_->sumRows(*dataTmp, 1, 0);
-    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
-    // store a / scale[i] in scaleDiff_ temporary
-    scaleDiff_->add(*channelBuffer_, 1.);
-
-    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
-    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
-    // scale the grad
-    inGBuffer->copyFrom(*inVTmp);
-    inGBuffer->mulRowVector(*spatialBuffer_);
-    // divide by square of norm
-    spatialBuffer_->dotMul(*normTmp, *normTmp);
-    inGBuffer->divRowVector(*spatialBuffer_);
-    // subtract
-    inGBuffer->add(*outGTmp, -1, 1);
-    // divide by norm
-    inGBuffer->divRowVector(*normTmp);
-    // scale the diff
-    inGBuffer->mulColVector(*scale_->getW());
-
-    inGTmp->add(*inGBuffer);
-  }
-  // updata scale
-  if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
-  scale_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h
deleted file mode 100644
index b47a2933c255c264ba780b2d87c9fbe53cb5665d..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CrossEntropyOverBeam.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "CrossEntropyOverBeam.h"
-#include "Layer.h"
-
-namespace paddle {
-
-/* This struct stores the beams in all search steps for a single sequence. */
-struct BeamExpansion {
-  std::vector<MatrixPtr> scores;
-  std::vector<IVectorPtr> seqInfo;
-
-  std::vector<MatrixPtr> candidateIds;
-  std::vector<int> gold;
-
-  std::vector<MatrixPtr> scoreGrad;
-
-  size_t expansionCount;
-
-  explicit BeamExpansion(int n) {
-    expansionCount = n;
-    scores.resize(expansionCount);
-    seqInfo.resize(expansionCount);
-    candidateIds.resize(expansionCount);
-    scoreGrad.resize(expansionCount);
-
-    gold.resize(expansionCount);
-  }
-};
-typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
-
-class CostForOneSequence {
-public:
-  CostForOneSequence()
-      : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {}
-  void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
-    beams_ = bPtr;
-    beamSize_ = beamSize;
-
-    expandedPathScores_.clear();
-    expandedPathScores_.resize(beams_->expansionCount);
-
-    goldRowIds_.clear();
-    goldRowIds_.resize(beams_->expansionCount, 0);
-    goldColIds_.clear();
-    goldColIds_.resize(beams_->expansionCount, -1);
-  }
-  size_t getValidExpansionCount() { return validExpansionCount_; }
-
-  real forward();
-  void backward();
-
-private:
-  void calValidExpandStep();
-  void constructTotalExpansion();
-  size_t initLastExpansion();
-  real globallyNormalizedScore();
-
-  int getSeqStartPos(size_t beamId, size_t rowId) {
-    CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
-    int* starts = beams_->seqInfo[beamId]->getData();
-    return starts[rowId] - starts[0];
-  }
-
-  size_t beamSize_;
-  size_t validExpansionCount_;
-  bool goldAsExtraPath_;
-  std::vector<int> goldRowIds_;
-  std::vector<int> goldColIds_;
-
-  BeamExpansionPtr beams_;
-  std::vector<std::vector<int>> pathRowIdsInEachBeam_;
-  std::vector<int> parentIdsInBeam_;
-  size_t goldIdsInFinalExpansion_;
-
-  std::vector<MatrixPtr> expandedPathScores_;
-
-  MatrixPtr softmaxOut_;
-};
-
-class CrossEntropyOverBeam : public Layer {
-public:
-  explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-private:
-  void checkInputs();
-  void copyInputsToCpu();
-  void resizeOutput();
-  void copyGradToGpu(size_t copyCount);
-  void splitBatchBeams();
-
-  size_t beamExpanCount_;
-  size_t batchSize_;
-  size_t beamSize_;
-
-  /*
-   * the process of constructing beams is not friendly to GPU, currently, this
-   * layer only runs on CPU, if any of its inputs is on GPU memory, then copy
-   * it to CPU memory.
-   */
-  std::vector<MatrixPtr> candidateScores_;
-  std::vector<MatrixPtr> candidateScoreGrad_;
-  std::vector<MatrixPtr> candidateInBeam_;
-  std::vector<MatrixPtr> gradToInputs_;
-  std::vector<IVectorPtr> goldSequence_;
-  std::vector<std::vector<int>> beamSplitPos_;
-
-  /*
-   * split entire bath of beams into beam per sequnence and store the result
-   * into this member.
-   */
-  std::vector<BeamExpansion> beamPerSeq_;
-  /* beamCosts_ is used to propagate error in one sequence. */
-  std::vector<CostForOneSequence> beamCosts_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
deleted file mode 100644
index 9a29e6a55e95334def2b83dc4a794e07a7fd5154..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CudnnBatchNormLayer.h"
-#include "Layer.h"
-#include "paddle/cuda/include/hl_batch_norm.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
-
-bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false;
-  CHECK(useGpu_) << "CudnnBatchNorm only support GPU";
-
-  hl_create_tensor_descriptor(&ioDesc_);
-  hl_create_tensor_descriptor(&bnParamDesc_);
-  hl_tensor_reshape(bnParamDesc_, 1, channels_, 1, 1);
-
-  return true;
-}
-
-void CudnnBatchNormLayer::reshape(int batchSize) {
-  hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_);
-}
-
-void CudnnBatchNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInputValue(0)->getHeight();
-  calFeatureMapSize();
-  reshape(batchSize);
-  resetOutput(batchSize, getInputValue(0)->getWidth());
-
-  // for testing in training peroid.
-  useGlobalStats_ = (passType == PASS_TEST);
-  if (passType == PASS_TEST && config_.has_use_global_stats()) {
-    useGlobalStats_ = config_.use_global_stats();
-  }
-
-  real* input = getInputValue(0)->getData();
-  real* output = getOutputValue()->getData();
-  real* gamma = weight_->getW()->getData();
-  real* beta = biases_->getW()->getData();
-  real* movingMean = movingMean_->getW()->getData();
-  real* movingVar = movingVar_->getW()->getData();
-
-  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
-  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
-
-  if (!useGlobalStats_) {
-    REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
-    real* savedMean = savedMean_->getData();
-    real* savedInvVar = savedInvVar_->getData();
-    hl_batch_norm_forward_training(ioDesc_,
-                                   input,
-                                   ioDesc_,
-                                   output,
-                                   bnParamDesc_,
-                                   gamma,
-                                   beta,
-                                   1.0 - movingAvgFraction_,
-                                   movingMean,
-                                   movingVar,
-                                   eps_,
-                                   savedMean,
-                                   savedInvVar);
-  } else {
-    // used movingMean and movingVar in testing
-    if (batchSize <= 1024) {
-      hl_batch_norm_forward_inference(ioDesc_,
-                                      input,
-                                      ioDesc_,
-                                      output,
-                                      bnParamDesc_,
-                                      gamma,
-                                      beta,
-                                      movingMean,
-                                      movingVar,
-                                      eps_);
-    } else {
-      // There is a limitation in cudnn library.
-      // When the batch size is larger than 1024 in cuDNN v5.1,
-      // the cudnnBatchNormalizationForwardInference will fail.
-      hl_batch_norm_cuda_inference(input,
-                                   output,
-                                   gamma,
-                                   beta,
-                                   movingMean,
-                                   movingVar,
-                                   eps_,
-                                   batchSize,
-                                   channels_,
-                                   imageH_ * imageD_,
-                                   imageW_);
-    }
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  real* input = getInputValue(0)->getData();
-  real* outGrad = getOutputGrad()->getData();
-  real* inGrad = getInputGrad(0)->getData();
-  real* gamma = weight_->getW()->getData();
-  real* savedMean = savedMean_->getData();
-  real* savedInvVar = savedInvVar_->getData();
-
-  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
-  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
-
-  auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
-    Matrix::resizeOrCreate(m, h, w, false, true);
-    m->zeroMem();
-    *p = m->getData();
-  };
-
-  real* gammaGrad = nullptr;
-  real* betaGrad = nullptr;
-  if (weight_->getWGrad()) {
-    gammaGrad = weight_->getWGrad()->getData();
-  } else {
-    create(tmpWGrad_, 1, channels_, &gammaGrad);
-  }
-  if (biases_ && biases_->getWGrad()) {
-    betaGrad = biases_->getWGrad()->getData();
-  } else {
-    create(tmpBiasGrad_, 1, channels_, &betaGrad);
-  }
-
-  hl_batch_norm_backward(ioDesc_,
-                         input,
-                         ioDesc_,
-                         outGrad,
-                         ioDesc_,
-                         inGrad,
-                         bnParamDesc_,
-                         gamma,
-                         gammaGrad,
-                         betaGrad,
-                         eps_,
-                         savedMean,
-                         savedInvVar);
-
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    biases_->getParameterPtr()->incUpdate(callback);
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-CudnnBatchNormLayer::~CudnnBatchNormLayer() {
-  hl_destroy_tensor_descriptor(ioDesc_);
-  hl_destroy_tensor_descriptor(bnParamDesc_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
deleted file mode 100644
index aa279f73d66770384815cad4d9e2ee0b04a4a1ad..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cudnn.h>
-#include "BatchNormBaseLayer.h"
-#include "Layer.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief Cudnn Batch normalization layer use to cuDNN lib to implentment.
- * @note Cudnn version must >= v4.0, and better to use the latest version
- * (v5.1).
- *
- * The config file api is batch_norm_layer.
- */
-
-class CudnnBatchNormLayer : public BatchNormBaseLayer {
-public:
-  explicit CudnnBatchNormLayer(const LayerConfig& config)
-      : BatchNormBaseLayer(config) {}
-
-  ~CudnnBatchNormLayer();
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  /**
-   * reshape tensor of ioDesc_.
-   */
-  void reshape(int batchSize);
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-protected:
-  /// Epsilon value used in the batch normalization formula.
-  /// Same epsilon value should be used in forward and backward functions.
-  double eps_;
-
-  /// Input/output tensor descriptor desc
-  hl_tensor_descriptor ioDesc_;
-  /// Shared tensor descriptor desc for the 6 tenros:
-  /// bnScale, bnBias, running mean/var, save_mean/var
-  hl_tensor_descriptor bnParamDesc_;
-
-  /**
-   * @brief The gradient of weight and bias in cudnn api can not be empty.
-   * If set is_static for weight or bias, it will not allocate memory for them,
-   * and the gradient is NULL. In this case, will use two matrix.
-   */
-  MatrixPtr tmpWGrad_, tmpBiasGrad_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
deleted file mode 100644
index 6d0a40a60710603900a9b89980d38b2d7638ad60..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CudnnConvBaseLayer.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CudnnConvBaseLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer);
-REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer);
-
-bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
-                              const ParameterMap &parameterMap) {
-  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
-  CHECK(useGpu_) << "CudnnConvLayer only support gpu";
-
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  projections_.reserve(inputLayers_.size());
-  projConf_.reserve(inputLayers_.size());
-
-  numFilters_ = config_.num_filters();
-  CHECK(config_.shared_biases());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    ProjectionConfig *conf = new ProjectionConfig();
-    if (isDeconv_) {
-      conf->set_type("convt");
-    } else {
-      conf->set_type("conv");
-    }
-    conf->set_num_filters(numFilters_);
-    ConvConfig *convConf = conf->mutable_conv_conf();
-    *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf());
-    conf->set_input_size(getPrev(i)->getSize());
-    conf->set_output_size(getSize());
-    projConf_.emplace_back(conf);
-    projections_.emplace_back(
-        Projection::create(*projConf_[i], parameters_[i], useGpu_));
-
-    // create a new weight
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = (!isDeconv_) ? numFilters_ : channels_[i];
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }
-
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  if (biases_.get() && sharedBiases_) {
-    hl_create_tensor_descriptor(&biasDesc_);
-    hl_create_tensor_descriptor(&outputDesc_);
-    hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1);
-  }
-
-  return true;
-}
-
-void CudnnConvBaseLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  resetOutput(batchSize, calOutputSize());
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    projections_[i]->forward(&getInput(i), &getOutput(), passType);
-  }
-
-  if (biases_) {
-    REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
-    int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-    int outH = outputH_[0];
-    int outW = outputW_[0];
-
-    hl_tensor_reshape(outputDesc_,
-                      batchSize,
-                      numFilters_,
-                      outH,
-                      outW,
-                      numFilters_ * outH * outW,
-                      outH * outW,
-                      outW,
-                      1);
-    real *outData = getOutputValue()->getData();
-    real *biasData = biases_->getW()->getData();
-    hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData);
-  }
-
-  forwardActivation();
-}
-
-void CudnnConvBaseLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str());
-    real *biasGrad = biases_->getWGrad()->getData();
-    real *outGrad = getOutputGrad()->getData();
-    hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
-
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    projections_[i]->backward(callback);
-  }
-}
-
-CudnnConvBaseLayer::~CudnnConvBaseLayer() {
-  if (biases_) {
-    hl_destroy_tensor_descriptor(biasDesc_);
-    hl_destroy_tensor_descriptor(outputDesc_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.h b/paddle/gserver/layers/CudnnConvBaseLayer.h
deleted file mode 100644
index 698104e4fbd2556f426001687a581153f32773d8..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CudnnConvBaseLayer.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "Projection.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A 2-dimension conv layer implemented by cuDNN. It only
- *        supports GPU mode. We automatic select CudnnConvLayer for GPU
- *        mode and ExpandConvLayer for CPU mode if you set type of "conv".
- *        User also can specfiy type of "exconv" or "cudnn_conv" for
- *        particular type.
- *
- * The config file api is img_conv_layer.
- */
-class CudnnConvBaseLayer : public ConvBaseLayer {
-protected:
-  std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
-  std::vector<std::unique_ptr<Projection>> projections_;
-
-  hl_tensor_descriptor biasDesc_;
-  hl_tensor_descriptor outputDesc_;
-
-public:
-  explicit CudnnConvBaseLayer(const LayerConfig& config)
-      : ConvBaseLayer(config) {}
-
-  ~CudnnConvBaseLayer();
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp
deleted file mode 100644
index ac6d2168f43590a6acd70f6641ff729327894ea0..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CudnnPoolLayer.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CudnnPoolLayer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-bool CudnnPoolLayer::typeCheck(const std::string &poolType,
-                               hl_pooling_mode_t *mode) {
-  if (poolType == "cudnn-max-pool") {
-    if (mode) {
-      *mode = HL_POOLING_MAX;
-    }
-  } else if (poolType == "cudnn-avg-pool") {
-    if (mode) {
-      *mode = HL_POOLING_AVERAGE;
-    }
-  } else if (poolType == "cudnn-avg-incl-pad-pool") {
-    if (mode) {
-      *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
-    }
-  } else {
-    return false;
-  }
-
-  return true;
-}
-
-CudnnPoolLayer::CudnnPoolLayer(const LayerConfig &config) : PoolLayer(config) {
-  const std::string &pool_type = config.inputs(0).pool_conf().pool_type();
-  CHECK_EQ(CudnnPoolLayer::typeCheck(pool_type, &mode_), true);
-}
-
-bool CudnnPoolLayer::init(const LayerMap &layerMap,
-                          const ParameterMap &parameterMap) {
-  PoolLayer::init(layerMap, parameterMap);
-
-  CHECK(useGpu_) << "CudnnPoolLayer only support gpu";
-
-  hl_create_tensor_descriptor(&inputDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-
-  windowHeight = sizeY_;
-  windowWidth = sizeX_;
-  heightPadding = confPaddingY_;
-  widthPadding = confPadding_;
-  strideHeight = strideY_;
-  strideWidth = stride_;
-
-  hl_create_pooling_descriptor(&poolingDesc_,
-                               mode_,
-                               windowHeight,
-                               windowWidth,
-                               heightPadding,
-                               widthPadding,
-                               strideHeight,
-                               strideWidth);
-
-  return true;
-}
-
-void CudnnPoolLayer::reshape(int batchSize) {
-  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imageH_ == 0) {
-    imageH_ = imgSizeY_;
-  }
-  if (imageW_ == 0) {
-    imageW_ = imgSize_;
-  }
-  CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(),
-           channels_ * imageH_ * imageW_);
-  outputH_ = outputSize(imageH_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputW_ =
-      outputSize(imageW_, sizeX_, confPadding_, stride_, /* caffeMode */ false);
-  getOutput().setFrameHeight(outputH_);
-  getOutput().setFrameWidth(outputW_);
-
-  hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_);
-  hl_tensor_reshape(outputDesc_, batchSize, channels_, outputH_, outputW_);
-}
-
-void CudnnPoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  CHECK(inputLayers_[0]->getOutputValue()->useGpu());
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  reshape(batchSize);
-  resetOutput(batchSize, outputH_ * outputW_ * channels_);
-
-  real *inputData = getInputValue(0)->getData();
-  real *outData = getOutputValue()->getData();
-  hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, poolingDesc_);
-}
-
-void CudnnPoolLayer::backward(const UpdateCallback &callback) {
-  (void)callback;
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-
-  real *inputData = getInputValue(0)->getData();
-  real *inputGrad = getInputGrad(0)->getData();
-  real *outData = getOutputValue()->getData();
-  real *outGrad = getOutputGrad()->getData();
-  hl_pooling_backward(inputDesc_,
-                      inputData,
-                      inputGrad,
-                      outputDesc_,
-                      outData,
-                      outGrad,
-                      poolingDesc_);
-}
-
-CudnnPoolLayer::~CudnnPoolLayer() {
-  hl_destroy_tensor_descriptor(inputDesc_);
-  hl_destroy_tensor_descriptor(outputDesc_);
-  hl_destroy_pooling_descriptor(poolingDesc_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h
deleted file mode 100644
index 9eb4fc6138b0bce59660406705d15291eb38af9b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/CudnnPoolLayer.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "PoolLayer.h"
-
-namespace paddle {
-
-/**
- * @brief CudnnPoolLayer is subclass of PoolLayer, which is implemented by
- * cudnn api and only supports GPU.
- *
- * The config file api is img_pool_layer.
- */
-
-class CudnnPoolLayer : public PoolLayer {
-protected:
-  int windowHeight, windowWidth;
-  int heightPadding, widthPadding, strideHeight, strideWidth;
-  int imageH_, imageW_, outputH_, outputW_;
-  /// mode_ is poolint type, inlcuding "cudnn-max-pool", "cudnn-avg-pool"
-  /// "cudnn-avg-excl-pad-pool".
-  hl_pooling_mode_t mode_;
-  /// cudnn tensor descriptor for input.
-  hl_tensor_descriptor inputDesc_;
-  /// cudnn tensor descriptor for output.
-  hl_tensor_descriptor outputDesc_;
-  /// A description of a pooling operation.
-  hl_pooling_descriptor poolingDesc_;
-
-public:
-  static bool typeCheck(const std::string& poolType,
-                        hl_pooling_mode_t* mode = nullptr);
-  explicit CudnnPoolLayer(const LayerConfig& config);
-  ~CudnnPoolLayer();
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  /**
-   * Reshape input and output tensor descriptor.
-   * The batch size maybe change during training in last batch of each pass.
-   * So reshaping is needed.
-   */
-  void reshape(int batchSize);
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/gserver/layers/DataLayer.h
deleted file mode 100644
index 4b12afe0efe81843b58e459ca1e58b4f7f4a1664..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DataLayer.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "Layer.h"
-
-namespace paddle {
-/**
- * This layer just copy data to output, and has no backward propagation.
- *
- * The config file api is data_layer.
- */
-class DataLayer : public Layer {
-public:
-  explicit DataLayer(const LayerConfig& config) : Layer(config) {}
-
-  virtual void setData(const Argument& data) { data_ = data; }
-
-  /**
-   * Prefetch sparse matrix/ids only.
-   */
-  void prefetch() override { output_ = data_; }
-
-  /**
-   * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims,
-   * sequenceStartPositions, subSequenceStartPositions, strs) to output_.
-   */
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    copyDataToOutput(output_);
-    if (FLAGS_show_layer_stat) {
-      showOutputStats();
-    }
-  }
-
-  /**
-   * Data layer's backward propagation do nothing.
-   */
-  void backward(const UpdateCallback& callback) override { (void)callback; }
-
-  void copyOutputToOtherDevice() override {
-    for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-      copyDataToOutput(outputOtherDevice_[i]);
-    }
-  }
-
-private:
-  void copyDataToOutput(Argument& output);
-
-protected:
-  Argument data_;
-};
-
-typedef std::shared_ptr<DataLayer> DataLayerPtr;
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DataNormLayer.cpp b/paddle/gserver/layers/DataNormLayer.cpp
deleted file mode 100644
index 86da4d6f957e2ce0afc53d69f9d57c234f8f178f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DataNormLayer.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DataNormLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(data_norm, DataNormLayer);
-
-bool DataNormLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize the weight */
-  CHECK(!biasParameter_) << "DataNormLayer does not need bias";
-  CHECK(inputLayers_.size() == 1 && inputLayers_[0]->getType() == "data")
-      << "DataNormLayer accepts one and only one DataLayer as its input layer";
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK_EQ(inputLayers_[0]->getSize(), getSize());
-  CHECK_EQ(parameters_[0]->getSize(), 5 * getSize());
-  CHECK(parameters_[0]->isStatic())
-      << "The parameter of DataNormLayer must be static";
-
-  weight_ = std::unique_ptr<Weight>(new Weight(5, getSize(), parameters_[0]));
-  min_ = Matrix::create(
-      nullptr, /* height= */ 1, getSize(), /* trans= */ false, useGpu_);
-  rangeReciprocal_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-  mean_ = Matrix::create(nullptr,
-                         /* height= */ 1,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-  stdReciprocal_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  getSize(),
-                                  /* trans= */ false,
-                                  useGpu_);
-  decimalReciprocal_ = Matrix::create(nullptr,
-                                      /* height= */ 1,
-                                      getSize(),
-                                      /* trans= */ false,
-                                      useGpu_);
-
-  min_->setData(weight_->getW()->getData());
-  rangeReciprocal_->setData(weight_->getW()->getData() + getSize());
-  mean_->setData(weight_->getW()->getData() + 2 * getSize());
-  stdReciprocal_->setData(weight_->getW()->getData() + 3 * getSize());
-  decimalReciprocal_->setData(weight_->getW()->getData() + 4 * getSize());
-
-  /* normalization strategy */
-  if (config_.data_norm_strategy() == "z-score") {
-    mode_ = kZScore;
-  } else if (config_.data_norm_strategy() == "min-max") {
-    mode_ = kMinMax;
-  } else if (config_.data_norm_strategy() == "decimal-scaling") {
-    mode_ = kDecimalScaling;
-  } else {
-    LOG(FATAL) << "Unknown data normalization strategy: "
-               << config_.data_norm_strategy();
-  }
-
-  return true;
-}
-
-void DataNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-
-  const MatrixPtr inValue = getInputValue(0);
-  MatrixPtr outValue = getOutputValue();
-  outValue->copyFrom(*inValue);
-  switch (mode_) {
-    case kZScore: {
-      outValue->addBias(*mean_, -1.0);
-      outValue->colScale(0, *outValue, *stdReciprocal_);
-      break;
-    }
-    case kMinMax: {
-      outValue->addBias(*min_, -1.0);
-      outValue->colScale(0, *outValue, *rangeReciprocal_);
-      break;
-    }
-    case kDecimalScaling: {
-      outValue->colScale(0, *outValue, *decimalReciprocal_);
-      break;
-    }
-    default:
-      LOG(FATAL) << "should not reach here";
-  }
-}
-
-void DataNormLayer::backward(const UpdateCallback& callback) {
-  // The parameter for DataNormLayer is static, and does not need to be updated
-  (void)callback;
-
-  /* Calculate the input layers error */
-  const MatrixPtr& outGrad = getOutputGrad();
-  MatrixPtr inGrad = getInputGrad(0);
-  if (inGrad) {
-    switch (mode_) {
-      case kZScore: {
-        inGrad->addColScale(0, *outGrad, *stdReciprocal_);
-        break;
-      }
-      case kMinMax: {
-        inGrad->addColScale(0, *outGrad, *rangeReciprocal_);
-        break;
-      }
-      case kDecimalScaling: {
-        inGrad->addColScale(0, *outGrad, *decimalReciprocal_);
-        break;
-      }
-      default: { LOG(FATAL) << "should not reach here"; }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h
deleted file mode 100644
index 2a2a2a4aa76e8e315d9d66da1b738d6d615d10f2..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DataNormLayer.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for data normalization
- * - Input: One and only one input layer is accepted. The input layer must
- *        be DataLayer with dense data type.
- * - Output: The normalization of the input data
- *
- * Reference:
- *    LA Shalabi, Z Shaaban, B Kasasbeh. Data mining: A preprocessing engine
- *
- * Three data normalization methoeds are considered
- * - z-score: y = (x-mean)/std
- * - min-max: y = (x-min)/(max-min)
- * - decimal-scaling: y = x/10^j, where j is the smallest integer such that
- *max(|y|)<1
- */
-
-class DataNormLayer : public Layer {
-public:
-  enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 };
-
-  explicit DataNormLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~DataNormLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-protected:
-  int mode_;
-  std::unique_ptr<Weight> weight_;
-  MatrixPtr min_;
-  MatrixPtr rangeReciprocal_;  // 1/(max-min)
-  MatrixPtr mean_;
-  MatrixPtr stdReciprocal_;      // 1/std
-  MatrixPtr decimalReciprocal_;  // 1/10^j
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DeConv3DLayer.cpp b/paddle/gserver/layers/DeConv3DLayer.cpp
deleted file mode 100644
index db6d6e073c08c35c5a71b2b18ab0103d42ccd318..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DeConv3DLayer.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DeConv3DLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(deconv3d, DeConv3DLayer);
-
-bool DeConv3DLayer::init(const LayerMap &layerMap,
-                         const ParameterMap &parameterMap) {
-  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
-  // for Deconv, the dimension of Kernel is
-  // channel * output * depth * height * weigth
-  // Matrix storage format: (output * depth * height * weigth) x  channel
-  for (int index = 0; index < config_.inputs().size(); ++index) {
-    M_.push_back(filterChannels_[index]);
-    K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index]));
-
-    // create a new weight
-    size_t height, width;
-    height = filterPixels_[index] * numFilters_;
-    width = filterChannels_[index];
-    CHECK_EQ(parameters_[index]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[index]);
-    weights_.emplace_back(w);
-  }
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
-  return true;
-}
-
-size_t DeConv3DLayer::getSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  imgSizeW_.clear();
-  imgSizeH_.clear();
-  imgSizeD_.clear();
-  N_.clear();
-  NOut_.clear();
-  size_t layerSize = 0;
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    imgSizeW_.push_back(
-        imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true));
-    imgSizeH_.push_back(imageSize(
-        outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
-    imgSizeD_.push_back(imageSize(
-        outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
-    NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
-    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
-    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
-    layerSize += NOut_[i] * numFilters_;
-  }
-  getOutput().setFrameHeight(imgSizeH_[0]);
-  getOutput().setFrameWidth(imgSizeW_[0]);
-  getOutput().setFrameDepth(imgSizeD_[0]);
-  return layerSize;
-}
-
-void DeConv3DLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  int outWidth = getSize();
-  resetOutput(batchSize, outWidth);
-  const MatrixPtr outMat = getOutputValue();
-
-  REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const MatrixPtr &inMat = getInputValue(i);
-    int M = M_[i];
-    int N = N_[i];
-    int K = K_[i];
-    MatrixPtr wMat = weights_[i]->getW();
-    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-    for (int n = 0; n < batchSize; ++n) {
-      real *inData = inMat->getData() + n * inMat->getStride();
-      for (int g = 0; g < groups_[i]; ++g) {
-        MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
-        MatrixPtr wMatSub = wMat->subMatrix(g * K, K);
-        MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
-        colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0);
-        inData += M * N;
-      }
-      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
-                       numFilters_,
-                       imgSizeD_[i],
-                       imgSizeH_[i],
-                       imgSizeW_[i],
-                       filterSizeZ_[i],
-                       filterSizeY_[i],
-                       filterSize_[i],
-                       strideZ_[i],
-                       strideY_[i],
-                       stride_[i],
-                       paddingZ_[i],
-                       paddingY_[i],
-                       padding_[i],
-                       1.0,
-                       1.0);
-    }
-  }
-  if (nullptr != this->biasParameter_) {
-    this->addBias();
-  }
-  forwardActivation();
-}
-
-void DeConv3DLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-  int batchSize = getOutputGrad()->getHeight();
-  if (biases_ && biases_->getWGrad()) {
-    bpropBiases();
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-  REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    if (weights_[i]->getWGrad() || this->needGradient_) {
-      int M = M_[i];
-      int N = N_[i];
-      int K = K_[i];
-      Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
-      const MatrixPtr &inMat = getInputValue(i);
-      for (int n = 0; n < batchSize; ++n) {
-        colBuf_->vol2Col(
-            getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
-            numFilters_,
-            imgSizeD_[i],
-            imgSizeH_[i],
-            imgSizeW_[i],
-            filterSizeZ_[i],
-            filterSizeY_[i],
-            filterSize_[i],
-            strideZ_[i],
-            strideY_[i],
-            stride_[i],
-            paddingZ_[i],
-            paddingY_[i],
-            padding_[i]);
-        if (weights_[i]->getWGrad()) {
-          real *inData = inMat->getData() + n * inMat->getStride();
-          for (int g = 0; g < groups_[i]; ++g) {
-            MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
-            MatrixPtr wGradMatSub =
-                weights_[i]->getWGrad()->subMatrix(g * K, K);
-            MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
-            wGradMatSub->mul(
-                *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0);
-            inData += M * N;
-          }
-        }
-        if (getInputGrad(i)) {
-          real *preGrad =
-              getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
-          for (int g = 0; g < groups_[i]; ++g) {
-            MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K);
-            MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K);
-            MatrixPtr inGradMatSub =
-                Matrix::create(preGrad, M, N, false, useGpu_);
-            inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0);
-            preGrad += M * N;
-          }
-        }
-      }
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-void DeConv3DLayer::bpropWeights(int i) {}
-void DeConv3DLayer::bpropData(int i) {}
-
-void DeConv3DLayer::bpropBiases() {
-  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
-                                    1,
-                                    biases_->getWGrad()->getElementCnt(),
-                                    false,
-                                    useGpu_);
-  const MatrixPtr &outGradMat = getOutputGrad();
-
-  if (this->sharedBiases_) {
-    biases->collectSharedBias(*outGradMat, 1.0f);
-  } else {
-    biases->collectBias(*outGradMat, 1.0f);
-  }
-}
-
-void DeConv3DLayer::addBias() {
-  MatrixPtr outMat = getOutputValue();
-  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
-                                  1,
-                                  biases_->getW()->getElementCnt(),
-                                  false,
-                                  useGpu_);
-  if (this->sharedBiases_) {
-    outMat->addSharedBias(*(bias), 1.0f);
-  } else {
-    outMat->addBias(*(bias), 1.0f);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DeConv3DLayer.h b/paddle/gserver/layers/DeConv3DLayer.h
deleted file mode 100644
index 57d51cdec66930b9b79c0c0395da66922cd53ae4..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DeConv3DLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of deconvolution3D layer.
- * This layer expands input and use matrix multiplication to
- * calculate deconvolution3D operation.
- */
-class DeConv3DLayer : public ConvBaseLayer {
-public:
-  explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-  ~DeConv3DLayer() {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void addBias();
-  void backward(const UpdateCallback& callback);
-  void bpropBiases();
-  void bpropData(int i);
-  void bpropWeights(int i);
-  size_t getSize();
-
-protected:
-  // Figure out the dimensions for individual gemms.
-  IntV M_;  /// numFilters_ / filter_group_;
-  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
-  IntV K_;  /// outputD_ * outputH_ * outputW_
-  IntV NOut_;
-  MatrixPtr colBuf_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h
deleted file mode 100644
index 174a6e5d9acb476276b66627b4aabce2ae6c1037..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DetectionOutputLayer.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <vector>
-#include "DetectionUtil.h"
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * The detection output layer for a SSD detection task. This layer applies the
- * Non-maximum suppression to the all predicted bounding box and keeps the
- * Top-K bounding boxes.
- * - Input: This layer needs three input layers: The first input layer
- *          is the priorbox layer. The rest two input layers are convolution
- *          layers for generating bbox location offset and the classification
- *          confidence.
- * - Output: The predict bounding box locations.
- */
-
-class DetectionOutputLayer : public Layer {
-public:
-  explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr) {}
-
-protected:
-  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
-
-  inline LayerPtr getLocInputLayer(size_t index) {
-    return inputLayers_[1 + index];
-  }
-
-  inline LayerPtr getConfInputLayer(size_t index) {
-    return inputLayers_[1 + inputNum_ + index];
-  }
-
-private:
-  size_t numClasses_;  // number of classes
-  size_t inputNum_;    // number of input layers
-  real nmsThreshold_;
-  real confidenceThreshold_;
-  size_t nmsTopK_;
-  size_t keepTopK_;
-  size_t backgroundId_;
-
-  size_t locSizeSum_;
-  size_t confSizeSum_;
-
-  MatrixPtr locBuffer_;
-  MatrixPtr confBuffer_;
-  MatrixPtr locTmpBuffer_;
-  MatrixPtr confTmpBuffer_;
-  MatrixPtr priorCpuValue_;
-  MatrixPtr locCpuBuffer_;
-  MatrixPtr confCpuBuffer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionUtil.h b/paddle/gserver/layers/DetectionUtil.h
deleted file mode 100644
index d6502fcf8fb12a434632876c25ac3ca23b87e60e..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DetectionUtil.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <float.h>
-#include <algorithm>
-#include <vector>
-#include "paddle/math/Matrix.h"
-
-using std::vector;
-using std::pair;
-using std::map;
-
-namespace paddle {
-
-template <typename T>
-struct BBoxBase {
-  BBoxBase(T xMin, T yMin, T xMax, T yMax)
-      : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {}
-
-  BBoxBase() {}
-
-  T getWidth() const { return xMax - xMin; }
-
-  T getHeight() const { return yMax - yMin; }
-
-  T getCenterX() const { return (xMin + xMax) / 2; }
-
-  T getCenterY() const { return (yMin + yMax) / 2; }
-
-  T getArea() const { return getWidth() * getHeight(); }
-
-  // coordinate of bounding box
-  T xMin;
-  T yMin;
-  T xMax;
-  T yMax;
-  // whether difficult object (e.g. object with heavy occlusion is difficult)
-  bool isDifficult;
-};
-
-struct NormalizedBBox : BBoxBase<real> {
-  NormalizedBBox() : BBoxBase<real>() {}
-};
-
-enum PermMode { kNCHWToNHWC, kNHWCToNCHW };
-
-/**
- * @brief First permute input maxtrix then append to output matrix
- */
-size_t appendWithPermute(const Matrix& inMatrix,
-                         size_t height,
-                         size_t width,
-                         size_t outTotalSize,
-                         size_t outOffset,
-                         size_t batchSize,
-                         Matrix& outMatrix,
-                         PermMode permMode);
-
-/**
- * @brief First permute input maxtrix then decompose to output
- */
-size_t decomposeWithPermute(const Matrix& inMatrix,
-                            size_t height,
-                            size_t width,
-                            size_t totalSize,
-                            size_t offset,
-                            size_t batchSize,
-                            Matrix& outMatrix,
-                            PermMode permMode);
-
-/**
- * @brief Compute jaccard overlap between two bboxes.
- * @param bbox1 The first bbox
- * @param bbox2 The second bbox
- */
-real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2);
-
-/**
- * @brief Compute offset parameters between prior bbox and ground truth bbox
- * and variances of prior bbox are considered
- * @param priorBBox Input prior bbox
- * @param priorBBoxVar Variance parameters of prior bbox
- * @param gtBBox Groundtruth bbox
- * @param outVec Output vector
- */
-void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                       const vector<real>& priorBBoxVar,
-                       const NormalizedBBox& gtBBox,
-                       vector<real>& outVec);
-
-/**
- * @brief Decode prior bbox with offset parameters
- * and variances of prior bbox are considered
- * @param priorBBox Prior bbox to be decoded
- * @param priorBBoxVar Variance parameters of prior bbox
- * @param locPredData Offset parameters
- */
-NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
-                                 const vector<real>& priorBBoxVar,
-                                 const vector<real>& locPredData);
-
-/**
- * @brief Extract bboxes from prior matrix, the layout is
- * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
- * @param priorData Matrix of prior value
- * @param numBBoxes Number of bbox to be extracted
- * @param bboxVec Append to the vector
- */
-void getBBoxFromPriorData(const real* priorData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec);
-
-/**
- * @brief Extract labels, scores and bboxes from detection matrix, the layout is
- * imageId | label | score | xmin | ymin | xmax | ymax
- * @param detectData Matrix of detection value
- * @param numBBoxes Number of bbox to be extracted
- * @param labelVec Label of bbox
- * @param scoreVec Score of bbox
- * @param bboxVec Append to the vector
- */
-void getBBoxFromDetectData(const real* detectData,
-                           const size_t numBBoxes,
-                           vector<real>& labelVec,
-                           vector<real>& scoreVec,
-                           vector<NormalizedBBox>& bboxVec);
-
-/**
- * @brief Extract variances from prior matrix, the layout is
- * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
- * @param priorData Matrix of prior value
- * @param num Number to be extracted
- * @param varVec Append to the vector
- */
-void getBBoxVarFromPriorData(const real* priorData,
-                             const size_t num,
-                             vector<vector<real>>& varVec);
-
-/**
- * @brief Extract bboxes from label matrix, the layout is
- * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...
- * @param labelData Matrix of label value
- * @param numBBoxes Number to be extracted
- * @param bboxVec Append to the vector
- */
-void getBBoxFromLabelData(const real* labelData,
-                          const size_t numBBoxes,
-                          vector<NormalizedBBox>& bboxVec);
-
-/**
-* @brief Match prior bbox to groundtruth bbox, the strategy is:
-1. Find the most overlaped bbox pair (prior and groundtruth)
-2. For rest of prior bboxes find the most overlaped groundtruth bbox
-* @param priorBBoxes prior bbox
-* @param gtBBoxes groundtruth bbox
-* @param overlapThreshold Low boundary of overlap (judge whether matched)
-* @param matchIndices For each prior bbox, groundtruth bbox index if matched
-otherwise -1
-* @param matchOverlaps For each prior bbox, overap with all groundtruth bboxes
-*/
-void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
-               const vector<NormalizedBBox>& gtBBoxes,
-               real overlapThreshold,
-               vector<int>* matchIndices,
-               vector<real>* matchOverlaps);
-
-/**
-* @brief Generate positive bboxes and negative bboxes,
-|positive bboxes|/|negative bboxes| is negPosRatio
-* @param priorValue Prior value
-* @param numPriorBBoxes Number of prior bbox
-* @param gtValue Groundtruth value
-* @param gtStartPosPtr Since groundtruth value stored as sequence type,
-this parameter indicates start position of each record
-* @param seqNum Number of sequence
-* @param maxConfScore Classification score for prior bbox, used to mine
-negative examples
-* @param batchSize Image number
-* @param overlapThreshold Low boundary of overap
-* @param negOverlapThreshold Upper boundary of overap (judge negative example)
-* @param negPosRatio Control number of negative bboxes
-* @param matchIndicesVecPtr Save indices of matched prior bbox
-* @param negIndicesVecPtr Save indices of negative prior bbox
-*/
-pair<size_t, size_t> generateMatchIndices(
-    const Matrix& priorValue,
-    const size_t numPriorBBoxes,
-    const Matrix& gtValue,
-    const int* gtStartPosPtr,
-    const size_t seqNum,
-    const vector<vector<real>>& maxConfScore,
-    const size_t batchSize,
-    const real overlapThreshold,
-    const real negOverlapThreshold,
-    const size_t negPosRatio,
-    vector<vector<int>>* matchIndicesVecPtr,
-    vector<vector<int>>* negIndicesVecPtr);
-
-/**
- * @brief Get max confidence score for each prior bbox
- * @param confData Confidence scores, layout is
- * class1 score | class2 score | ... | classN score ...
- * @param batchSize Image number
- * @param numPriorBBoxes Prior bbox number
- * @param numClasses Classes number
- * @param backgroundId Background id
- * @param maxConfScoreVecPtr Ouput
- */
-void getMaxConfidenceScores(const real* confData,
-                            const size_t batchSize,
-                            const size_t numPriorBBoxes,
-                            const size_t numClasses,
-                            const size_t backgroundId,
-                            vector<vector<real>>* maxConfScoreVecPtr);
-
-template <typename T>
-bool sortScorePairDescend(const pair<real, T>& pair1,
-                          const pair<real, T>& pair2);
-
-template <>
-bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
-                          const pair<real, NormalizedBBox>& pair2);
-
-/**
- * @brief Do NMS for bboxes to remove duplicated bboxes
- * @param bboxes BBoxes to apply NMS
- * @param confScoreData Confidence scores
- * @param classIdx Class to do NMS
- * @param topK Number to keep
- * @param confThreshold Low boundary of confidence score
- * @param nmsThreshold Threshold of overlap
- * @param numPriorBBoxes Total number of prior bboxes
- * @param numClasses Total class number
- * @param indices Indices of high quality bboxes
- */
-void applyNMSFast(const vector<NormalizedBBox>& bboxes,
-                  const real* confScoreData,
-                  size_t classIdx,
-                  size_t topK,
-                  real confThreshold,
-                  real nmsThreshold,
-                  size_t numPriorBBoxes,
-                  size_t numClasses,
-                  vector<size_t>* indices);
-
-/**
- * @brief Get detection results which satify requirements
- * @param numPriorBBoxes Prior bbox number
- * @param numClasses Class number
- * @param backgroundId Background class
- * @param batchSize Image number
- * @param confThreshold Threshold of class confidence
- * @param nmsTopK Used in NMS operation to keep top k bbox
- * @param nmsThreshold Used in NMS, threshold of overlap
- * @param keepTopK How many bboxes keeped in an image
- * @param allDecodedBBoxes Decoded bboxes for all images
- * @param allDetectionIndices Save detection bbox indices
- */
-size_t getDetectionIndices(
-    const real* confData,
-    const size_t numPriorBBoxes,
-    const size_t numClasses,
-    const size_t backgroundId,
-    const size_t batchSize,
-    const real confThreshold,
-    const size_t nmsTopK,
-    const real nmsThreshold,
-    const size_t keepTopK,
-    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-    vector<map<size_t, vector<size_t>>>* allDetectionIndices);
-
-/**
- * @brief Get detection results
- * @param confData Confidence scores
- * @param numPriorBBoxes Prior bbox number
- * @param numClasses Class number
- * @param batchSize Image number
- * @param allIndices Indices of predicted bboxes
- * @param allDecodedBBoxes BBoxes decoded
- * @param out Output matrix
- * image number | label | confidence score | xMin | yMin | xMax | yMax
- */
-void getDetectionOutput(const real* confData,
-                        const size_t numKept,
-                        const size_t numPriorBBoxes,
-                        const size_t numClasses,
-                        const size_t batchSize,
-                        const vector<map<size_t, vector<size_t>>>& allIndices,
-                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
-                        Matrix& out);
-
-NormalizedBBox clipBBox(const NormalizedBBox& bbox);
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DotMulOperator.cpp b/paddle/gserver/layers/DotMulOperator.cpp
deleted file mode 100644
index 68db2929adee1336e52abfcb8e6495e589afa683..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DotMulOperator.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Operator.h"
-
-namespace paddle {
-
-/**
- * DotMulOperator takes two inputs, performs element-wise multiplication:
- * \f[
- *   out.row[i] += scale * (in1.row[i] .* in2.row[i])
- * \f]
- * where \f$.*\f$ means element-wise multiplication,
- * and scale is a config scalar, its default value is one.
- *
- * The config file api is dotmul_operator.
- */
-class DotMulOperator : public Operator {
-public:
-  DotMulOperator(const OperatorConfig& config, bool useGpu);
-  virtual void forward();
-  virtual void backward();
-};
-
-REGISTER_OPERATOR(dot_mul, DotMulOperator);
-
-DotMulOperator::DotMulOperator(const OperatorConfig& config, bool useGpu)
-    : Operator(config, useGpu) {
-  CHECK_EQ(config_.input_indices_size(), 2L);
-}
-
-void DotMulOperator::forward() {
-  out_->value->addDotMul(
-      *ins_[0]->value, *ins_[1]->value, 1, config_.dotmul_scale());
-}
-
-void DotMulOperator::backward() {
-  const MatrixPtr& inV0 = ins_[0]->value;
-  const MatrixPtr& inV1 = ins_[1]->value;
-  const MatrixPtr& inG0 = ins_[0]->grad;
-  const MatrixPtr& inG1 = ins_[1]->grad;
-
-  if (inG0) {
-    inG0->addDotMul(*out_->grad, *inV1, 1, config_.dotmul_scale());
-  }
-  if (inG1) {
-    inG1->addDotMul(*out_->grad, *inV0, 1, config_.dotmul_scale());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DotMulProjection.cpp b/paddle/gserver/layers/DotMulProjection.cpp
deleted file mode 100644
index 86453aae84142f9f534182d085f4a96a2c7a3e15..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DotMulProjection.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * DotMulProjection performs element-wise multiplication with weight:
- * \f[
- *   out.row[i] += in.row[i] .* weight
- * \f]
- * where \f$.*\f$ means element-wise multiplication.
- *
- * The config file api is dotmul_projection.
- */
-class DotMulProjection : public Projection {
-public:
-  DotMulProjection(const ProjectionConfig& config,
-                   const ParameterPtr& parameter,
-                   bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
-protected:
-  /// shared memory with parameter
-  std::unique_ptr<Weight> weight_;
-};
-
-REGISTER_PROJECTION(dot_mul, DotMulProjection);
-
-DotMulProjection::DotMulProjection(const ProjectionConfig& config,
-                                   const ParameterPtr& parameter,
-                                   bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  weight_.reset(new Weight(1LU, config.output_size(), parameter));
-}
-
-void DotMulProjection::forward() {
-  out_->value->addDotMulMMV(*in_->value, *(weight_->getW()));
-}
-
-void DotMulProjection::backward(const UpdateCallback& callback) {
-  /* Calculate the W-gradient for the current layer */
-  if (weight_->getWGrad()) {
-    weight_->getWGrad()->addDotMulVMM(*out_->grad, *in_->value);
-  }
-
-  /* Calculate the input layers error */
-  if (in_->grad) {
-    in_->grad->addDotMulMMV(*out_->grad, *(weight_->getW()));
-  }
-
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp
deleted file mode 100644
index 5148d93e27d199b0c373221cedd4f03d6d32c8ab..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/DotProdLayer.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for computing the dot product of two vectors.
- * Input1: vector (batchSize * dim)
- * Input2: vector (batchSize * dim)
- * Output: a matrix: (batchSize * 1)
- */
-
-class DotProdLayer : public Layer {
-public:
-  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~DotProdLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(dot_prod, DotProdLayer);
-
-bool DotProdLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-  CHECK_EQ(1UL, getSize())
-      << "The output dimensionality of this layer should be fixed to 1.";
-
-  return true;
-}
-
-void DotProdLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  CHECK_EQ(inV1->getHeight(), batchSize);
-  CHECK_EQ(inV0->getWidth(), inV1->getWidth());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, 1);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
-    outV->sumOfProducts(*inV0, *inV1, 1, 0);
-  }
-}
-
-void DotProdLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  {
-    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
-
-    if (inG0) {
-      inG0->addRowScale(0, *inV1, *outG);
-    }
-
-    if (inG1) {
-      inG1->addRowScale(0, *inV0, *outG);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp
deleted file mode 100644
index 470a5b8ea208ad0acb64e3067881e0d183e1dc39..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/EosIdCheckLayer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-/**
- * A layer for checking EOS for each sample:
- * - output_id = (input_id == conf.eos_id)
- *
- * The result is stored in output_.ids.
- * It is used by recurrent layer group.
- */
-class EosIdCheckLayer : public Layer {
-public:
-  explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    CHECK_EQ(1UL, inputLayers_.size());
-    return ret;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-
-    const Argument& input = getInput(0);
-    IVector::resizeOrCreate(output_.ids, input.ids->getSize(), useGpu_);
-    output_.ids->isEqualTo(*input.ids, config_.eos_id());
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(eos_id, EosIdCheckLayer);
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
deleted file mode 100644
index 3a8478658249bfb0886e904aec43e50fe3618f79..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ /dev/null
@@ -1,248 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ExpandConvLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-DEFINE_bool(use_nnpack,
-            false,
-            "Whether to use nnpack for convolution calculation.");
-
-namespace paddle {
-
-/*
- * The calculation of the exconvt(convolution transpose (deconv) operation)
- * is a swap of forward and backward of the calculation of exconv.
- * */
-REGISTER_LAYER(exconv, ExpandConvLayer);
-REGISTER_LAYER(exconvt, ExpandConvLayer);
-
-inline bool isDepthwiseConv(int channels, int groups) {
-  return channels == groups;
-}
-
-bool ExpandConvLayer::init(const LayerMap &layerMap,
-                           const ParameterMap &parameterMap) {
-  /* Initialize the basic convolutional parent class */
-  ConvBaseLayer::init(layerMap, parameterMap);
-
-  int index = 0;
-  for (auto &inputConfig : config_.inputs()) {
-    const ConvConfig &conf = inputConfig.conv_conf();
-    /* Consistent caffe mode for multiple input */
-    caffeMode_ = conf.caffe_mode();
-
-    // create a new weight
-    size_t height, width;
-    height = filterPixels_[index] * filterChannels_[index];
-    width = (!isDeconv_) ? numFilters_ : channels_[index];
-    CHECK_EQ(parameters_[index]->getSize(), width * height);
-    Weight *w = new Weight(height, width, parameters_[index]);
-    weights_.emplace_back(w);
-    index++;
-  }
-
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ = std::unique_ptr<Weight>(
-          new Weight(1, numFilters_, biasParameter_, 0));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_, 0));
-    }
-  }
-
-  getOutputSize();
-
-  size_t numInputs = config_.inputs_size();
-  inputShape_.resize(numInputs);
-  filterShape_.resize(numInputs);
-  outputShape_.resize(numInputs);
-
-  std::string convType;
-  std::string convGradInputType;
-  std::string convGradFilterType;
-
-  for (int i = 0; i < config_.inputs_size(); i++) {
-    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
-    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
-    std::vector<size_t> dilations = {(size_t)dilationY_[i],
-                                     (size_t)dilation_[i]};
-
-    bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
-
-    // Convolution Layer uses the GemmConv function by default.
-    convType = "GemmConv";
-    convGradInputType = "GemmConvGradInput";
-    convGradFilterType = "GemmConvGradFilter";
-
-    // If depth wise convolution and useGpu == true
-    if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
-      convType = "DepthwiseConv";
-      convGradInputType = "DepthwiseConvGradInput";
-      convGradFilterType = "DepthwiseConvGradFilter";
-    }
-
-    // If depth wise convolution and useGpu == false and ARM-NEON
-    if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-      if ((filterSize_[i] == filterSizeY_[i]) &&
-          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
-          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
-          !useDilation) {
-        convType = "NeonDepthwiseConv";
-      }
-#endif
-    }
-
-    if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
-      createFunction(forward_,
-                     "NNPACKConv",
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("groups", (size_t)groups_[i])
-                         .set("algo", std::string("auto")));
-    } else {
-      createFunction(forward_,
-                     !isDeconv_ ? convType : convGradInputType,
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("dilations", dilations)
-                         .set("groups", (size_t)groups_[i]));
-
-      createFunction(backward_,
-                     !isDeconv_ ? convGradInputType : convType,
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("dilations", dilations)
-                         .set("groups", (size_t)groups_[i]));
-
-      createFunction(backward_,
-                     convGradFilterType,
-                     FuncConfig()
-                         .set("paddings", paddings)
-                         .set("strides", strides)
-                         .set("dilations", dilations)
-                         .set("groups", (size_t)groups_[i]));
-    }
-  }
-  return true;
-}
-
-size_t ExpandConvLayer::getOutputSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  size_t layerSize = ConvBaseLayer::calOutputSize();
-  return layerSize;
-}
-
-// i is the index of input layers
-#define BACKWARD_INPUT(i, inputs, outputs) \
-  backward_[2 * i]->calc(inputs, outputs)
-#define BACKWARD_FILTER(i, inputs, outputs) \
-  backward_[2 * i + 1]->calc(inputs, outputs)
-
-void ExpandConvLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  resetOutput(batchSize, getOutputSize());
-
-  // Calculate the shape of the input, output, and filter.
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    inputShape_[i] = TensorShape({(size_t)batchSize,
-                                  (size_t)channels_[i],
-                                  (size_t)imgSizeH_[i],
-                                  (size_t)imgSizeW_[i]});
-    filterShape_[i] =
-        TensorShape({(size_t)groups_[i],
-                     !isDeconv_ ? (size_t)numFilters_ / groups_[i]
-                                : (size_t)channels_[i] / groups_[i],
-                     !isDeconv_ ? (size_t)channels_[i] / groups_[i]
-                                : (size_t)numFilters_ / groups_[i],
-                     (size_t)filterSizeY_[i],
-                     (size_t)filterSize_[i]});
-    outputShape_[i] = TensorShape({(size_t)batchSize,
-                                   (size_t)numFilters_,
-                                   (size_t)outputH_[i],
-                                   (size_t)outputW_[i]});
-  }
-
-  // Calculate the output value.
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    BufferArgs inputs;
-    BufferArgs outputs;
-    inputs.addArg(*getInputValue(i), inputShape_[i]);
-    inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
-    outputs.addArg(*getOutputValue(),
-                   outputShape_[i],
-                   !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO);
-
-    forward_[i]->calc(inputs, outputs);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get()) {
-    output_.value->addBias(*biases_->getW(), 1.0, sharedBiases_);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void ExpandConvLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  MatrixPtr outGrad = getOutputGrad();
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBiases_);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  // Calculate the input grad and filter grad.
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    if (getInputGrad(i)) {
-      BufferArgs inputs;
-      BufferArgs outputs;
-      inputs.addArg(*getOutputGrad(), outputShape_[i]);
-      inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
-      outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO);
-      BACKWARD_INPUT(i, inputs, outputs);
-    }
-
-    if (weights_[i]->getWGrad()) {
-      BufferArgs inputs;
-      BufferArgs outputs;
-      if (!isDeconv_) {
-        inputs.addArg(*getOutputGrad(), outputShape_[i]);
-        inputs.addArg(*getInputValue(i), inputShape_[i]);
-      } else {
-        inputs.addArg(*getInputValue(i), inputShape_[i]);
-        inputs.addArg(*getOutputGrad(), outputShape_[i]);
-      }
-      outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO);
-      BACKWARD_FILTER(i, inputs, outputs);
-
-      /* Increasing the number of gradient */
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
deleted file mode 100644
index be968155efd0b8f19503c996ccd329379c6b1104..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of convolution layer.
- * This layer expands input and use matrix multiplication to
- * calculate convolution operation.
- *
- * The config file api is img_conv_layer.
- */
-
-class ExpandConvLayer : public ConvBaseLayer {
-public:
-  explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
-
-  ~ExpandConvLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-  size_t getOutputSize();
-
-protected:
-  std::vector<TensorShape> inputShape_;
-  std::vector<TensorShape> filterShape_;
-  std::vector<TensorShape> outputShape_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandLayer.cpp b/paddle/gserver/layers/ExpandLayer.cpp
deleted file mode 100644
index 6b5776754017bca8f8c14170ecfb4faa4109e0b5..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ExpandLayer.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ExpandLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(expand, ExpandLayer);
-
-bool ExpandLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(inputLayers_.size(), 2UL);
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  // which sequence type of input[0]
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void ExpandLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  // Expand layer should have exactly 2 input, one for data, one for size
-  CHECK_EQ(2U, inputLayers_.size());
-
-  // using two input:
-  // * first one for data;
-  // * second one only for sequence info
-  const Argument& shapeInput = getInput(1);
-  const Argument& dataInput = getInput(0);
-  size_t outputBatchSize = shapeInput.getBatchSize();
-  auto startPositions = type_ ? shapeInput.subSequenceStartPositions
-                              : shapeInput.sequenceStartPositions;
-  size_t numSequences = startPositions->getSize() - 1;
-  const int* starts = startPositions->getData(false);
-
-  CHECK_EQ(starts[numSequences], shapeInput.getBatchSize());
-  if (type_) {
-    // when trans_type = seq, input[1] must hasSubseq
-    CHECK_EQ(shapeInput.hasSubseq(), 1UL);
-    CHECK_EQ(dataInput.getNumSequences(), shapeInput.getNumSequences());
-  } else {
-    CHECK_EQ(dataInput.getBatchSize(), shapeInput.getNumSequences());
-  }
-
-  // set output sequence info as shape sequence
-  output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
-  if (shapeInput.hasSubseq()) {
-    output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
-  }
-
-  // reserve output: Expand output to batchsize of sequence data.
-  reserveOutput(outputBatchSize, dataInput.value->getWidth());
-
-  MatrixPtr inputValue = getInputValue(0);
-  MatrixPtr outputValue = getOutputValue();
-
-  ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
-  int* expandStarts = expandStartsPos_->getMutableData(false);
-  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
-    for (int j = 0; j < sequenceLength; j++) {
-      expandStarts[starts[sequenceId] + j] = sequenceId;
-    }
-  }
-
-  outputValue->copyByRowIndex(*inputValue,
-                              *expandStartsPos_->getVector(useGpu_));
-
-  if (biases_.get() != NULL) {
-    outputValue->addBias(*(biases_->getW()), 1);
-  }
-}
-
-void ExpandLayer::backward(const UpdateCallback& callback) {
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  if (!getInputGrad(0)) return;
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
-                              : getInput(1).sequenceStartPositions;
-  size_t numSequences = cpuSeqStartPos->getSize() - 1;
-  const int* starts = cpuSeqStartPos->getData(false);
-
-  CHECK_EQ(inputGrad->getWidth(), outputGrad->getWidth());
-  CHECK_EQ(outputGrad->getHeight(), (size_t)starts[numSequences]);
-
-  AsyncGpuBlock asyncGpuBlock;
-
-  // sum to get the grad
-  real scale = 1;
-  for (size_t sequenceId = 0; sequenceId < numSequences; sequenceId++) {
-    // TODO(Dangqingqing) optimization for GPU
-    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
-    if (sequenceLength == 0) {
-      // empty sequence
-      continue;
-    }
-    MatrixPtr copyData = inputGrad->subMatrix(sequenceId, 1);
-    copyData->collectBias(
-        *outputGrad->subMatrix(starts[sequenceId], sequenceLength), scale);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
deleted file mode 100644
index 04bbfcbd04931fa11d11a9fcc74f0e4f19767f1b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ExpandLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer for "Expand Dense data or (sequence data where the length of each
- * sequence is one) to sequence data."
- *
- * It should have exactly 2 input, one for data, one for size:
- * - first one for data
- *   - If ExpandLevel = kNonSeq: dense data
- *   - If ExpandLevel = kSeq: sequence data where the length of each sequence is
- * one
- * - second one only for sequence info
- *   - should be sequence data with or without sub-sequence.
- *
- * And the output size is the batch size(not instances) of second input.
- *
- * The config file api is expand_layer.
- */
-
-class ExpandLayer : public Layer {
-protected:
-  std::unique_ptr<Weight> biases_;
-  /// if input[0] is dense data, ExpandLevel=kNonSeq;
-  /// if input[0] is sequence data, ExpandLevel=kSeq
-  enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
-  /// store the ExpandLevel
-  int type_;
-  /// expanded sequenceStartPositions or subSequenceStartPositions
-  /// of input[1]
-  ICpuGpuVectorPtr expandStartsPos_;
-
-public:
-  explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
deleted file mode 100644
index 1744faada2ebd9f2c88ba9a3952b6b2646729e3b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FactorizationMachineLayer.h"
-#include <algorithm>
-#include <vector>
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
-
-bool FactorizationMachineLayer::init(const LayerMap& layerMap,
-                                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  factorSize_ = config_.factor_size();
-
-  /* initialize the latentVectors_ */
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t inputSize = inputLayers_[0]->getSize();
-  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
-  latentVectors_ = std::unique_ptr<Weight>(
-      new Weight(inputSize, factorSize_, parameters_[0]));
-
-  return true;
-}
-
-void FactorizationMachineLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const MatrixPtr& inputV = getInputValue(0);
-
-  size_t batchSize = inputV->getHeight();
-  size_t outputSize = getSize();
-  size_t inputSize = inputLayers_[0]->getSize();
-  reserveOutput(batchSize, outputSize);
-
-  MatrixPtr outV = getOutputValue();
-
-  Matrix::resizeOrCreate(
-      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
-  Matrix::resizeOrCreate(
-      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
-  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
-
-  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
-  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
-  inputMulFactor_->square2(*tmpOut_);
-  outV->sumRows(*tmpOut_, 0.5, 0);
-
-  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
-    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
-                                       inputV->getHeight(),
-                                       inputV->getWidth(),
-                                       inputV->getElementCnt(),
-                                       inputV->getValueType());
-    inputSquare_->copyFrom(*inputV);
-    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
-  } else {
-    Matrix::resizeOrCreate(
-        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
-    inputV->square2(*inputSquare_);
-  }
-  latentVectors_->getW()->square2(*latentVectorsSquare_);
-  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
-  outV->sumRows(*tmpOut_, -0.5, 1.0);
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  const MatrixPtr& inputV = getInputValue(0);
-  const MatrixPtr& oGrad = getOutputGrad();
-
-  Matrix::resizeOrCreate(
-      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
-  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
-                                         latentVectors_->getW()->getHeight(),
-                                         1,
-                                         false,
-                                         useGpu_);
-
-  /* Calculate the gradients of the latentVectors_ matrix */
-  if (latentVectors_->getWGrad()) {
-    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
-      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
-                                         inputV->getHeight(),
-                                         inputV->getWidth(),
-                                         inputV->getElementCnt());
-
-      CpuSparseMatrix* sparseInputV =
-          dynamic_cast<CpuSparseMatrix*>(inputV.get());
-      CpuSparseMatrix* sparseInputSquare =
-          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
-      CpuSparseMatrix* sparseTmpInput =
-          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
-      sparseTmpInput->copyFrom(*sparseInputV);
-
-      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
-      latentVectors_->getWGrad()->mul(
-          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
-      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
-
-      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
-      negOnes_->zeroMem();
-      negOnes_->add(-1);
-      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
-    } else {
-      Matrix::resizeOrCreate(
-          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
-
-      tmpInput_->rowScale(0, *inputV, *oGrad);
-      latentVectors_->getWGrad()->mul(
-          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
-      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
-
-      tmpSum_->sumCols(*tmpInput_, -1, 0);
-    }
-
-    latentVectors_->getWGrad()->addRowScale(
-        0, *latentVectors_->getW(), *tmpSumTrans);
-
-    /* Increasing the number of gradient */
-    latentVectors_->getParameterPtr()->incUpdate(callback);
-  }
-
-  /* Calculate the input layers gradient */
-  MatrixPtr inGrad = getInputGrad(0);
-  if (inGrad != NULL) {
-    inGrad->mul(
-        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
-    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
-    inGrad->addColScale(0, *inputV, *tmpSum_);
-    inGrad->rowScale(0, *inGrad, *oGrad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
deleted file mode 100644
index 684da4e65a461d46204c348b3374b0e9e00eb389..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-/**
- * @brief The Factorization Machine models pairwise (order-2) feature
- * interactions as inner product of the learned latent vectors corresponding
- * to each input feature.
- *
- * The Factorization Machine can effectively capture feature interactions
- * especially when the input is sparse. While in principle FM can model higher
- * order feature interaction, in practice usually only order-2 feature
- * interactions are considered. The Factorization Machine Layer here only
- * computes the order-2 interations with the formula:
- *
- * \f[
- *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
- * \f]
- *
- * The detailed calculation for forward and backward can be found at this paper:
- *
- *     Factorization machines.
- *
- * The config file api is factorization_machine.
- */
-
-class FactorizationMachineLayer : public Layer {
-protected:
-  // The latent vectors, shape: (size, factorSize_)
-  // Each row of the latentVectors_ matrix is the latent vector
-  // corresponding to one input feature dimension
-  std::unique_ptr<Weight> latentVectors_;
-  // The hyperparameter that defines the dimensionality of the factorization
-  size_t factorSize_;
-
-private:
-  // Store the square values of the letent vectors matrix
-  MatrixPtr latentVectorsSquare_;
-  // Store the square values of input matrix
-  MatrixPtr inputSquare_;
-  // The result of input matrix * latent vector matrix that will be used in
-  // both forward and backward step
-  MatrixPtr inputMulFactor_;
-  // Store temporary calculation result
-  MatrixPtr tmpOut_;
-  MatrixPtr tmpSum_;
-  MatrixPtr tmpInput_;
-  // Negative identity matrix
-  MatrixPtr negOnes_;
-
-public:
-  explicit FactorizationMachineLayer(const LayerConfig& config)
-      : Layer(config) {}
-  ~FactorizationMachineLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
deleted file mode 100644
index 81b98da45bc4b9b8ef0723dd6ea2db809860e219..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for expanding a batch of images to feature maps.
- * Each data of the input is a 2 dimensional matrix. Each element of the matrix
- * is replicated num_filters times to create a feature map with num_filters
- * channels.
- * - Input: Input one should be dense image data.
- * - Output: expanded fature maps.
- * \f[
- *  y.row[i] = x.row[i \mod x.width], i = 0,1,..., (x.width * num\_filters - 1)
- * \f]
- * For example, num_filters = 4:
- * @code
- *   x = [a1,a2;
- *        b1,b2]
- *   y = [a1, a2, a1, a2, a1, a2, a1, a2;
- *        b1, b2, b1, b2, b1, b2, b1, b2;]
- * @endcode
- */
-
-class FeatureMapExpandLayer : public Layer {
-private:
-  int numFilters_;
-  bool asRowVector_;
-
-public:
-  explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~FeatureMapExpandLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer);
-
-bool FeatureMapExpandLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  numFilters_ = config_.num_filters();
-  asRowVector_ = config_.user_arg() != "as_col_vec";
-  return true;
-}
-
-void FeatureMapExpandLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr inputV = getInputValue(0);
-  size_t batchSize = getInput(0).getBatchSize();
-  int imgSize = inputV->getWidth();
-  resetOutput(batchSize, imgSize * numFilters_);
-
-  MatrixPtr outputV = getOutputValue();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    if (asRowVector_) {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outVTmp =
-            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
-                           numFilters_,
-                           imgSize,
-                           false,
-                           useGpu_);
-        MatrixPtr inVTmp = Matrix::create(
-            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-        outVTmp->addRowVector(*inVTmp);
-      }
-    } else {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outVTmp =
-            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
-                           imgSize,
-                           numFilters_,
-                           false,
-                           useGpu_);
-        MatrixPtr inVTmp = Matrix::create(
-            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
-        outVTmp->addColVector(*inVTmp);
-      }
-    }
-  }
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inGrad = getInputGrad(0);
-  if (NULL == inGrad) {
-    return;
-  }
-  MatrixPtr outGrad = getOutputGrad();
-  size_t batchSize = getInput(0).getBatchSize();
-  int imgSize = inGrad->getWidth();
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    if (asRowVector_) {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outGradTmp =
-            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
-                           numFilters_,
-                           imgSize,
-                           false,
-                           useGpu_);
-        MatrixPtr inGradTmp = Matrix::create(
-            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-        inGradTmp->collectBias(*outGradTmp, 1);
-      }
-    } else {
-      for (size_t i = 0; i < batchSize; i++) {
-        MatrixPtr outGradTmp =
-            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
-                           imgSize,
-                           numFilters_,
-                           false,
-                           useGpu_);
-        MatrixPtr inGradTmp = Matrix::create(
-            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
-        inGradTmp->sumRows(*outGradTmp, 1, 1);
-      }
-    }
-  }
-}
-
-}  // namespace paddle.
diff --git a/paddle/gserver/layers/FullMatrixProjection.h b/paddle/gserver/layers/FullMatrixProjection.h
deleted file mode 100644
index 7c4cd1a7066d427f54e1a280a956acb025e6dc16..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/FullMatrixProjection.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/utils/Stat.h"
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * FullMatrixProjection performs full matrix multiplication:
- * \f[
- *    out.row[i] += in.row[i] * weight
- * \f]
- *
- * The config file api is full_matrix_projection.
- */
-class FullMatrixProjection : public Projection {
-public:
-  FullMatrixProjection(const ProjectionConfig& config,
-                       const ParameterPtr& parameter,
-                       bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
-protected:
-  std::unique_ptr<Weight> weight_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/FullyConnectedLayer.cpp b/paddle/gserver/layers/FullyConnectedLayer.cpp
deleted file mode 100644
index 21ffa01d95a460b4b6edc2b02d63c19b32d0b070..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/FullyConnectedLayer.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FullyConnectedLayer.h"
-#include <algorithm>
-#include <vector>
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(fc, FullyConnectedLayer);
-
-bool FullyConnectedLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize the weightList */
-  CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    // Option the parameters
-    size_t height = inputLayers_[i]->getSize();
-    size_t width = getSize();
-
-    // create a new weight
-    if (parameters_[i]->isSparse()) {
-      CHECK_LE(parameters_[i]->getSize(), width * height);
-    } else {
-      CHECK_EQ(parameters_[i]->getSize(), width * height);
-    }
-    Weight* w = new Weight(height, width, parameters_[i]);
-
-    // append the new weight to the list
-    weights_.emplace_back(w);
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  return true;
-}
-
-void FullyConnectedLayer::prefetch() {
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    auto* sparseParam =
-        dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
-    if (sparseParam) {
-      MatrixPtr input = getInputValue(i);
-      sparseParam->addRows(input);
-    }
-  }
-}
-
-void FullyConnectedLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    auto input = getInput(i);
-    CHECK(input.value) << "The input of 'fc' layer must be matrix";
-    REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-    i == 0 ? outV->mul(*input.value, *weights_[i]->getW(), 1, 0)
-           : outV->mul(*input.value, *weights_[i]->getW(), 1, 1);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void FullyConnectedLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  bool syncFlag = hl_get_sync_flag();
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    /* Calculate the W-gradient for the current layer */
-    if (weights_[i]->getWGrad()) {
-      MatrixPtr input_T = getInputValue(i)->getTranspose();
-      MatrixPtr oGrad = getOutputGrad();
-      {
-        REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-        weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1);
-      }
-    }
-
-    // If callback does not change value, backprop error asynchronously so that
-    // we can do the callback concurrently.
-    hl_set_sync_flag(false);
-
-    /* Calculate the input layers error */
-    MatrixPtr preGrad = getInputGrad(i);
-    if (NULL != preGrad) {
-      MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
-      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-      preGrad->mul(*getOutputGrad(), *weights_T, 1, 1);
-    }
-
-    hl_set_sync_flag(syncFlag);
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h
deleted file mode 100644
index e66aeeb7334c9c871749196d77474a02ecf82b09..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/FullyConnectedLayer.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-/**
- * A layer has full connections to all neurons in the previous layer.
- * It computes an inner product with a set of learned weights, and
- * (optionally) adds biases.
- *
- * The config file api is fc_layer.
- */
-
-class FullyConnectedLayer : public Layer {
-protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
-public:
-  explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
-  ~FullyConnectedLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  void prefetch() override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp
deleted file mode 100644
index 9d38849fdf97e6099e39384dd7e6546de9180462..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/GatedRecurrentLayer.cpp
+++ /dev/null
@@ -1,414 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GatedRecurrentLayer.h"
-#include "Layer.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(gated_recurrent, GatedRecurrentLayer);
-
-bool GatedRecurrentLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-  CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
-  CHECK_EQ(getSize() * 3, biasParameter_->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
-  gateWeight_.reset(new Weight(getSize(), getSize() * 2, parameters_[0], 0));
-  stateWeight_.reset(new Weight(
-      getSize(), getSize(), parameters_[0], 2 * getSize() * getSize()));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, getSize() * 3, biasParameter_));
-  }
-
-  reversed_ = config_.reversed();
-  activationGate_.reset(ActivationFunction::create(config_.active_gate_type()));
-
-  GruCompute::init(config_);
-  useBatch_ = true;
-
-  return true;
-}
-
-void GatedRecurrentLayer::resetState() {
-  CHECK(!reversed_) << "state is not allowed for reversed gated "
-                       "recurrent layer";
-  Matrix::resizeOrCreate(
-      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
-  prevOutput_->zeroMem();
-
-  // TODO(hedaoyuan): support prev_batch_state
-  CHECK(!FLAGS_prev_batch_state) << "Not supported";
-
-  useBatch_ = false;
-}
-
-void GatedRecurrentLayer::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 1)
-      << "one matrix is expected for GatedRecurrentLayer state";
-  prevOutput_->copyFrom(*(state->value[0]));
-}
-
-LayerStatePtr GatedRecurrentLayer::getState() {
-  LayerStatePtr res = std::make_shared<LayerState>();
-  res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
-  res->value[0]->copyFrom(*prevOutput_);
-  return res;
-}
-
-void GatedRecurrentLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("GruFwTimer", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-  resetOutput(batchSize, getSize());
-  CHECK_EQ(getSize() * 3, input.value->getWidth());
-  const int* starts = input.sequenceStartPositions->getData(false);
-  // batchSize = length of total frames in a batch (NOT size of mini-batch)
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  Matrix::resizeOrCreate(gate_.value,
-                         /* height= */ batchSize,
-                         getSize() * 3,
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(resetOutput_.value,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-
-  if (useBatch_) {
-    forwardBatch(batchSize, numSequences, starts, input.value);
-  } else {
-    forwardSequence(batchSize, numSequences, starts, input.value);
-  }
-}
-
-void GatedRecurrentLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("GruBwTimer", getName().c_str());
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  size_t numSequences = input.getNumSequences();
-
-  Matrix::resizeOrCreate(gate_.grad,
-                         /* height= */ batchSize,
-                         getSize() * 3,
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(resetOutput_.grad,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-
-  if (useBatch_) {
-    backwardBatch(batchSize, input.grad);
-  } else {
-    backwardSequence(batchSize, numSequences, starts, input.grad);
-  }
-
-  if (bias_) {
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void GatedRecurrentLayer::forwardSequence(int batchSize,
-                                          size_t numSequences,
-                                          const int* starts,
-                                          MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("GruFwSequenceTime", getName().c_str());
-  gate_.value->assign(*inputValue);
-  if (bias_) {
-    gate_.value->addBias(*(bias_->getW()), 1);
-  }
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-  gruValue.prevOutValue = nullptr;
-
-  if (reversed_) {
-    gruValue.gateValue += (batchSize - 1) * getSize() * 3;
-    gruValue.resetOutputValue += (batchSize - 1) * getSize();
-    gruValue.outputValue += (batchSize - 1) * getSize();
-  }
-
-  auto nextFrame = [&gruValue](bool reversed, int frameSize) {
-    gruValue.prevOutValue = gruValue.outputValue;
-    if (!reversed) {
-      gruValue.gateValue += frameSize * 3;
-      gruValue.resetOutputValue += frameSize;
-      gruValue.outputValue += frameSize;
-    } else {
-      gruValue.gateValue -= frameSize * 3;
-      gruValue.resetOutputValue -= frameSize;
-      gruValue.outputValue -= frameSize;
-    }
-  };
-
-  if (!reversed_) {
-    if (prevOutput_) {
-      gruValue.prevOutValue = prevOutput_->getData();
-    }
-  }
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t n = 0; n < numSequences; ++n) {
-    int length;
-    if (!reversed_) {
-      length = starts[n + 1] - starts[n];
-    } else {
-      length = starts[numSequences - n] - starts[numSequences - n - 1];
-    }
-    for (int l = 0; l < length; ++l) {
-      if (useGpu_) {
-        GruCompute::forward<1>(gruValue, getSize());
-      } else {
-        GruCompute::forward<0>(gruValue, getSize());
-      }
-
-      nextFrame(reversed_, getSize());
-    }
-    if (!reversed_) {
-      if (!prevOutput_) gruValue.prevOutValue = nullptr;
-    } else {
-      gruValue.prevOutValue = nullptr;
-    }
-  }
-
-  if (!reversed_) {
-    if (prevOutput_) {
-      prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1));
-    }
-  }
-}
-
-void GatedRecurrentLayer::backwardSequence(int batchSize,
-                                           size_t numSequences,
-                                           const int* starts,
-                                           MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("GruBwSequenceTime", getName().c_str());
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-
-  hl_gru_grad gruGrad;
-  gruGrad.gateWeightGrad =
-      (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
-  gruGrad.stateWeightGrad =
-      (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData()
-                                : nullptr);
-  gruGrad.gateGrad = gate_.grad->getData();
-  gruGrad.resetOutputGrad = resetOutput_.grad->getData();
-  gruGrad.outputGrad = output_.grad->getData();
-
-  if (!reversed_) {
-    gruValue.gateValue += (batchSize - 1) * getSize() * 3;
-    gruValue.resetOutputValue += (batchSize - 1) * getSize();
-    gruValue.outputValue += (batchSize - 1) * getSize();
-    gruGrad.gateGrad += (batchSize - 1) * getSize() * 3;
-    gruGrad.resetOutputGrad += (batchSize - 1) * getSize();
-    gruGrad.outputGrad += (batchSize - 1) * getSize();
-    gruValue.prevOutValue = gruValue.outputValue - getSize();
-    gruGrad.prevOutGrad = gruGrad.outputGrad - getSize();
-  } else {
-    gruValue.prevOutValue = gruValue.outputValue + getSize();
-    gruGrad.prevOutGrad = gruGrad.outputGrad + getSize();
-  }
-
-  auto nextFrame = [&gruValue, &gruGrad](bool reversed, int frameSize) {
-    if (reversed) {
-      gruValue.gateValue += frameSize * 3;
-      gruValue.resetOutputValue += frameSize;
-      gruValue.outputValue += frameSize;
-      gruGrad.gateGrad += frameSize * 3;
-      gruGrad.resetOutputGrad += frameSize;
-      gruGrad.outputGrad += frameSize;
-      gruValue.prevOutValue = gruValue.outputValue + frameSize;
-      gruGrad.prevOutGrad = gruGrad.outputGrad + frameSize;
-    } else {
-      gruValue.gateValue -= frameSize * 3;
-      gruValue.resetOutputValue -= frameSize;
-      gruValue.outputValue -= frameSize;
-      gruGrad.gateGrad -= frameSize * 3;
-      gruGrad.resetOutputGrad -= frameSize;
-      gruGrad.outputGrad -= frameSize;
-      gruValue.prevOutValue = gruValue.outputValue - frameSize;
-      gruGrad.prevOutGrad = gruGrad.outputGrad - frameSize;
-    }
-  };
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    for (size_t n = 0; n < numSequences; ++n) {
-      int length;
-      if (reversed_) {
-        length = starts[n + 1] - starts[n];
-      } else {
-        length = starts[numSequences - n] - starts[numSequences - n - 1];
-      }
-      for (int l = 0; l < length; ++l) {
-        if (l == length - 1) {
-          gruValue.prevOutValue = nullptr;
-          gruGrad.prevOutGrad = nullptr;
-        }
-        if (useGpu_) {
-          GruCompute::backward<1>(gruValue, gruGrad, getSize());
-        } else {
-          GruCompute::backward<0>(gruValue, gruGrad, getSize());
-        }
-        nextFrame(reversed_, getSize());
-      }
-    }
-  }
-
-  if (inputGrad) {
-    inputGrad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*gate_.grad, 1);
-  }
-}
-
-void GatedRecurrentLayer::forwardBatch(int batchSize,
-                                       size_t numSequences,
-                                       const int* starts,
-                                       MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str());
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
-
-  batchValue_->resizeOrCreate(*output_.value);
-  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_) {
-    gate_.value->addBias(*(bias_->getW()), 1);
-  }
-
-  {
-    int numBatch = batchValue_->getNumBatch();
-    int curBatchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    for (int n = 0; n < numBatch; n++) {
-      MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
-      gruValue.outputValue = outputValueTmp->getData();
-      gruValue.gateValue =
-          (batchValue_->getBatchValue(*gate_.value, n))->getData();
-      gruValue.resetOutputValue =
-          (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();
-
-      curBatchSize = outputValueTmp->getHeight();
-      gruValue.prevOutValue =
-          (n == 0
-               ? nullptr
-               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());
-
-      {
-        if (useGpu_) {
-          GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
-        } else {
-          GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
-        }
-      }
-    }
-  }
-  { batchValue_->copyBackSeq(*output_.value); }
-}
-
-void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str());
-  hl_gru_value gruValue;
-  gruValue.gateWeight = (gateWeight_->getW())->getData();
-  gruValue.stateWeight = (stateWeight_->getW())->getData();
-
-  hl_gru_grad gruGrad;
-  gruGrad.gateWeightGrad =
-      (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
-  gruGrad.stateWeightGrad =
-      (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData()
-                                : nullptr);
-
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  { batchGrad_->copyFromSeq(*output_.grad); }
-
-  {
-    int numBatch = batchGrad_->getNumBatch();
-    int batchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      gruValue.gateValue =
-          (batchGrad_->getBatchValue(*gate_.value, n))->getData();
-      gruValue.resetOutputValue =
-          (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData();
-
-      MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n);
-      gruGrad.outputGrad = outputGradTmp->getData();
-      gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad, n))->getData();
-      gruGrad.resetOutputGrad =
-          (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData();
-
-      {
-        batchSize = outputGradTmp->getHeight();
-        gruValue.prevOutValue =
-            (n == 0
-                 ? nullptr
-                 : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
-        gruGrad.prevOutGrad =
-            (n == 0 ? nullptr
-                    : (batchGrad_->getBatchValue(n - 1, batchSize))->getData());
-
-        if (useGpu_) {
-          GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize);
-        } else {
-          GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize);
-        }
-      }
-    }
-  }
-
-  if (inputGrad) {
-    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h
deleted file mode 100644
index f0a3a823018f3943b0295c172b19d0fe9d0674b4..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/GatedRecurrentLayer.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "GruCompute.h"
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Please refer to "Junyoung Chung, Empirical Evaluation
- * of Gated Recurrent Neural Networks on Sequence Modeling".
- *
- * GatedRecurrentLayer takes 1 input layer with size * 3.
- * Input layer is diveded into 3 equal parts: (xz_t, xr_t, xi_t).
- * parameter and biasParameter is also diveded into 3 equal parts:
- *   - parameter consists of (U_z, U_r, U)
- *   - baisParameter consists of (bias_z, bias_r, bias_o)
- *
- * \f[
- * update \ gate: z_t = actGate(xz_t + U_z * h_{t-1} + bias_z) \\
- * reset \ gate: r_t = actGate(xr_t + U_r * h_{t-1} + bias_r) \\
- * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, h_{t-1}) + bias_o) \\
- * hidden \ activation: h_t = dot((1-z_t), h_{t-1}) + dot(z_t, {h}_t) \\
- * \f]
- *
- * @note
- * - dot denotes "element-wise multiplication".
- * - actNode is defined by config active_type
- * - actGate is defined by config actvie_gate_type
- *
- * The config file is grumemory.
- */
-
-class GatedRecurrentLayer : public Layer, public GruCompute {
-public:
-  explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
-protected:
-  void forwardSequence(int batchSize,
-                       size_t numSequences,
-                       const int* starts,
-                       MatrixPtr inputValue);
-  void backwardSequence(int batchSize,
-                        size_t numSequences,
-                        const int* starts,
-                        MatrixPtr inputGrad);
-
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int* starts,
-                    MatrixPtr inputValue);
-  void backwardBatch(int batchSize, MatrixPtr inputGrad);
-
-protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> gateWeight_;
-  std::unique_ptr<Weight> stateWeight_;
-  std::unique_ptr<Weight> bias_;
-
-  Argument gate_;
-  Argument resetOutput_;
-
-  bool reversed_;
-  bool useBatch_;
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-  std::unique_ptr<ActivationFunction> activationGate_;
-
-  MatrixPtr prevOutput_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/gserver/layers/GetOutputLayer.cpp
deleted file mode 100644
index f255681f3e678e51f069522f965fd2776680b595..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/GetOutputLayer.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-class GetOutputLayer : public Layer {
-public:
-  explicit GetOutputLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~GetOutputLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    if (!Layer::init(layerMap, parameterMap)) return false;
-    CHECK_EQ(1U, inputLayers_.size());
-    CHECK_NE(inputArgument_[0], "");
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    output_ = getPrev(0)->getOutput(inputArgument_[0]);
-  }
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
-REGISTER_LAYER(get_output, GetOutputLayer);
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp
deleted file mode 100644
index 48ddbc413e6c915be6e86704f96e919932ca2970..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/GruCompute.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GruCompute.h"
-#include "hl_recurrent_apply.cuh"
-#include "paddle/function/GruFunctor.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-void GruCompute::init(LayerConfig &config) {
-  activeNode_ = hlActiveType(config.active_type());
-  activeGate_ = hlActiveType(config.active_gate_type());
-}
-
-template <>
-void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) {
-  GruFunctor<DEVICE_TYPE_CPU, real>::compute(hppl::forward::gru_resetOutput(),
-                                             hppl::forward::gru_finalOutput(),
-                                             value,
-                                             frameSize,
-                                             batchSize,
-                                             activeNode_,
-                                             activeGate_);
-}
-
-template <>
-void GruCompute::backward<0>(hl_gru_value value,
-                             hl_gru_grad grad,
-                             int frameSize,
-                             int batchSize) {
-  GruGradFunctor<DEVICE_TYPE_CPU, real>::compute(
-      hppl::backward::gru_stateGrad(),
-      hppl::backward::gru_resetGrad(),
-      value,
-      grad,
-      frameSize,
-      batchSize,
-      activeNode_,
-      activeGate_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h
deleted file mode 100644
index fb6bc56422002b4d4080ccb8438767b27ceef064..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/GruCompute.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "hl_gpu.h"
-#include "paddle/utils/Common.h"
-
-namespace paddle {
-
-class GruCompute {
-public:
-  void init(LayerConfig &config);
-
-  template <bool useGpu>
-  void forward(hl_gru_value value, int frameSize, int batchSize = 1);
-
-  template <bool useGpu>
-  void backward(hl_gru_value value,
-                hl_gru_grad grad,
-                int frameSize,
-                int batchSize = 1);
-
-public:
-  hl_activation_mode_t activeNode_;
-  hl_activation_mode_t activeGate_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp
deleted file mode 100644
index 917c50250c1c04d6c8f113c8d42ef029e1028606..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/GruStepLayer.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "GruCompute.h"
-#include "Layer.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief GruStepLayer is like GatedRecurrentLayer, but used in recurrent
- * layer group. GruStepLayer takes 2 input layer.
- * - input[0] with size * 3 and diveded into 3 equal parts: (xz_t, xr_t, xi_t).
- * - input[1] with size: {prev_out}.
- *
- * parameter and biasParameter is also diveded into 3 equal parts:
- * - parameter consists of (U_z, U_r, U)
- * - baisParameter consists of (bias_z, bias_r, bias_o)
- *
- * \f[
- * update \ gate: z_t = actGate(xz_t + U_z * prev_out + bias_z) \\
- * reset \ gate: r_t = actGate(xr_t + U_r * prev_out + bias_r)  \\
- * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o)
- * \\
- * output: h_t = dot((1-z_t), prev_out) + dot(z_t, prev_out)
- * \f]
- *
- * @note
- *   - dot denotes "element-wise multiplication".
- *   - actNode is defined by config active_type
- *   - actGate is defined by config actvie_gate_type
- *
- * The config file api if gru_step_layer.
- */
-class GruStepLayer : public Layer, public GruCompute {
-protected:
-  Argument gate_;
-  Argument resetOutput_;
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
-public:
-  explicit GruStepLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~GruStepLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(gru_step, GruStepLayer);
-
-bool GruStepLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(2U, inputLayers_.size());
-
-  CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
-
-  if (biasParameter_.get() != NULL) {
-    CHECK_EQ(getSize() * 3, biasParameter_->getSize());
-    bias_.reset(new Weight(1, getSize() * 3, biasParameter_));
-  }
-
-  GruCompute::init(config_);
-  return true;
-}
-
-void GruStepLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("GruStepFwTime", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  const Argument& prevOutput = getInput(1);
-  CHECK_EQ(getSize() * 3, input.value->getWidth());
-  CHECK_EQ(getSize(), prevOutput.value->getWidth());
-
-  int batchSize = input.getBatchSize();
-  resetOutput(batchSize, getSize());
-  resetSpecifyOutput(gate_,
-                     batchSize,
-                     getSize() * 3,
-                     /* isValueClean */ false,
-                     /* isGradClean */ false);
-  resetSpecifyOutput(resetOutput_,
-                     batchSize,
-                     getSize(),
-                     /* isValueClean */ false,
-                     /* isGradClean */ false);
-  gate_.value->assign(*input.value);
-  if (bias_) {
-    gate_.value->addBias(*(bias_->getW()), 1);
-  }
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = weight_->getW()->getData();
-  gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2;
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-  gruValue.prevOutValue = prevOutput.value->getData();
-
-  if (useGpu_) {
-    GruCompute::forward<1>(gruValue, getSize(), batchSize);
-  } else {
-    GruCompute::forward<0>(gruValue, getSize(), batchSize);
-  }
-}
-
-void GruStepLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("GruStepBwTime", getName().c_str());
-
-  const Argument& input = getInput(0);
-  const Argument& prevOutput = getInput(1);
-  int batchSize = input.getBatchSize();
-
-  hl_gru_value gruValue;
-  gruValue.gateWeight = weight_->getW()->getData();
-  gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2;
-  gruValue.gateValue = gate_.value->getData();
-  gruValue.resetOutputValue = resetOutput_.value->getData();
-  gruValue.outputValue = output_.value->getData();
-  gruValue.prevOutValue = prevOutput.value->getData();
-
-  hl_gru_grad gruGrad;
-  gruGrad.gateWeightGrad =
-      (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr);
-  gruGrad.stateWeightGrad =
-      (weight_->getWGrad()
-           ? weight_->getWGrad()->getData() + getSize() * getSize() * 2
-           : nullptr);
-
-  gruGrad.gateGrad = gate_.grad->getData();
-  gruGrad.resetOutputGrad = resetOutput_.grad->getData();
-  gruGrad.outputGrad = output_.grad->getData();
-  if (prevOutput.grad) {
-    gruGrad.prevOutGrad = prevOutput.grad->getData();
-  } else {
-    gruGrad.prevOutGrad = nullptr;
-  }
-
-  if (useGpu_) {
-    GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize);
-  } else {
-    GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize);
-  }
-
-  if (input.grad) {
-    input.grad->add(*gate_.grad);
-  }
-
-  if (bias_ && bias_->getWGrad()) {
-    bias_->getWGrad()->collectBias(*gate_.grad, 1);
-  }
-
-  if (bias_) {
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
deleted file mode 100644
index 3e720f179ee66baa73f40b8f5f19bfb4090831c0..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "HierarchicalSigmoidLayer.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-REGISTER_LAYER(hsigmoid, HierarchicalSigmoidLayer);
-
-bool HierarchicalSigmoidLayer::init(const LayerMap& layerMap,
-                                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK(config_.has_num_classes()) << "num_classes must be specifed in config";
-  numClasses_ = config_.num_classes();
-  CHECK_GE(numClasses_, (size_t)2);
-  codeLength_ = findLastSet(numClasses_ - 1);
-
-  size_t height = numClasses_ - 1;
-
-  /* initialize the weightList */
-  // The last input layer is for label
-  CHECK(!parameters_.back());
-  for (size_t i = 0; i < inputLayers_.size() - 1; i++) {
-    size_t width = inputLayers_[i]->getSize();
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-
-    // append the new weight to the list
-    weights_.emplace_back(w);
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    CHECK_EQ(biasParameter_->getSize(), numClasses_ - 1);
-    biases_.reset(new Weight(1, numClasses_ - 1, biasParameter_));
-  }
-
-  return true;
-}
-
-void HierarchicalSigmoidLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-  Matrix::resizeOrCreate(preOutput_.value,
-                         batchSize,
-                         codeLength_,
-                         /* trans */ false,
-                         false);
-  Matrix::resizeOrCreate(preOutput_.grad,
-                         batchSize,
-                         codeLength_,
-                         /* trans */ false,
-                         false);
-  IVectorPtr label = getInput(*getLabelLayer()).ids;
-  preOutput_.value->zeroMem();
-
-  if (useGpu_) {
-    Matrix::resizeOrCreate(cpuOutput_,
-                           output_.value->getHeight(),
-                           output_.value->getWidth(),
-                           /* trans */ false,
-                           false);
-    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
-    cpuLabel_->copyFrom(*label);
-    cpuOutput_->copyFrom(*output_.value);
-  } else {
-    cpuOutput_ = output_.value;
-    cpuLabel_ = label;
-  }
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    if (useGpu_) {
-      Matrix::resizeOrCreate(cpuBias_,
-                             1,
-                             numClasses_ - 1,
-                             /* trans */ false,
-                             false);
-      cpuBias_->copyFrom(*biases_->getW());
-    } else {
-      cpuBias_ = biases_->getW();
-    }
-    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
-  }
-  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
-    MatrixPtr input = getInputValue(i);
-    if (useGpu_) {
-      Matrix::resizeOrCreate(cpuInput_,
-                             input->getHeight(),
-                             input->getWidth(),
-                             /* trans */ false,
-                             false);
-      Matrix::resizeOrCreate(cpuWeight_,
-                             weights_[i]->getW()->getHeight(),
-                             weights_[i]->getW()->getWidth(),
-                             /* trans */ false,
-                             false);
-      cpuInput_->copyFrom(*input);
-      cpuWeight_->copyFrom(*weights_[i]->getW());
-    } else {
-      cpuInput_ = input;
-      cpuWeight_ = weights_[i]->getW();
-    }
-    preOutput_.value->mulByBitCode(
-        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
-  }
-  // keep consistent with the clipping in the following softrelu
-  preOutput_.value->clip(-40.0, 40.0);
-  preOutput_.value->sumByBitCode(numClasses_,
-                                 *cpuLabel_,
-                                 *cpuOutput_,
-                                 -1);  // scaleSum
-  preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
-  preOutput_.value->rowSum(*sum);
-  cpuOutput_->add(*sum);
-  if (useGpu_) {
-    output_.value->copyFrom(*cpuOutput_);
-  } else {
-    output_.value = cpuOutput_;
-  }
-}
-
-void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
-  IVectorPtr label = getInput(*getLabelLayer()).ids;
-  if (useGpu_) {
-    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
-    cpuLabel_->copyFrom(*label);
-  } else {
-    cpuLabel_ = label;
-  }
-  preOutput_.grad->one();
-  preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
-
-  if (biases_ && biases_->getWGrad()) {
-    MatrixPtr biases_grad = biases_->getWGrad();
-    if (useGpu_) {
-      Matrix::resizeOrCreate(cpuBias_,
-                             1,
-                             numClasses_ - 1,
-                             /* trans */ false,
-                             false);
-      cpuBias_->copyFrom(*biases_grad);
-    } else {
-      cpuBias_ = biases_grad;
-    }
-    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
-    if (useGpu_) {
-      biases_grad->copyFrom(*cpuBias_);
-    } else {
-      biases_grad = cpuBias_;
-    }
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
-    /* Calculate the W-gradient for the current layer */
-    MatrixPtr input = getInputValue(i);
-    if (weights_[i]->getWGrad()) {
-      MatrixPtr weights_grad = weights_[i]->getWGrad();
-      if (useGpu_) {
-        Matrix::resizeOrCreate(cpuInput_,
-                               input->getHeight(),
-                               input->getWidth(),
-                               /* trans */ false,
-                               false);
-        Matrix::resizeOrCreate(cpuWeightGrad_,
-                               weights_grad->getHeight(),
-                               weights_grad->getWidth(),
-                               /* trans */ false,
-                               false);
-        cpuInput_->copyFrom(*input);
-        cpuWeightGrad_->copyFrom(*weights_grad);
-      } else {
-        cpuInput_ = input;
-        cpuWeightGrad_ = weights_grad;
-      }
-      preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
-      if (useGpu_) {
-        weights_grad->copyFrom(*cpuWeightGrad_);
-      } else {
-        weights_grad = cpuWeightGrad_;
-      }
-      /* Increasing the number of gradient */
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-
-    /* Calculate the input layers error */
-    MatrixPtr inputGrad = getInputGrad(i);
-    if (inputGrad) {
-      if (useGpu_) {
-        Matrix::resizeOrCreate(cpuInputGrad_,
-                               inputGrad->getHeight(),
-                               inputGrad->getWidth(),
-                               /* trans */ false,
-                               false);
-        Matrix::resizeOrCreate(cpuWeight_,
-                               weights_[i]->getW()->getHeight(),
-                               weights_[i]->getW()->getWidth(),
-                               /* trans */ false,
-                               false);
-        cpuInputGrad_->copyFrom(*inputGrad);
-        cpuWeight_->copyFrom(*weights_[i]->getW());
-      } else {
-        cpuInputGrad_ = inputGrad;
-        cpuWeight_ = weights_[i]->getW();
-      }
-      preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
-      if (useGpu_) {
-        inputGrad->copyFrom(*cpuInputGrad_);
-      } else {
-        inputGrad = cpuInputGrad_;
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
deleted file mode 100644
index 10e501f1807ef6ba03d326a1bcf257ede0ee850a..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * Organize the classes into a binary tree. At each node, a sigmoid function
- * is used to calculate the probability of belonging to the right branch.
- * This idea is from "F. Morin, Y. Bengio (AISTATS 05):
- * Hierarchical Probabilistic Neural Network Language Model."
- *
- * Here we uses a simple way of making the binary tree.
- * Assuming the number of classes C = 6,
- * The classes are organized as a binary tree in the following way:
- *
- * @code{.py}
- * *-*-*- 2
- * | | |- 3
- * | |
- * | |-*- 4
- * |   |- 5
- * |
- * |-*- 0
- *   |- 1
- * @endcode
- *
- * where * indicates an internal node, and each leaf node represents a class.
- * - Node 0 ... C-2 are internal nodes.
- * - Node C-1 ... 2C-2 are leaf nodes.
- * - Class c is represented by leaf node \f$c+C-1\f$.
- *
- * We assign an id for each node:
- * - the id of root be 0.
- * - the left child of a node i is 2*i+1.
- * - the right child of a node i is 2*i+2.
- *
- * It's easy to see that:
- * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
- * - the j-th level ancestor of node i is
- * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
- * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
- *
- * The config file api is hsigmod_layer.
- */
-class HierarchicalSigmoidLayer : public Layer {
-public:
-  explicit HierarchicalSigmoidLayer(const LayerConfig& config)
-      : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-protected:
-  /**
-   * The last of inputs is label layer.
-   */
-  LayerPtr getLabelLayer() { return inputLayers_.back(); }
-
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-  /// number of classes
-  size_t numClasses_;
-  /// codeLength_ = \f$1 + \left\lfloor log_{2}(numClasses-1)\right\rfloor\f$
-  int codeLength_;
-  /// temporary result of output_
-  Argument preOutput_;
-
-  /// The temporary variables in CPU memory.
-  MatrixPtr cpuWeight_;
-  MatrixPtr cpuWeightGrad_;
-  MatrixPtr cpuInput_;
-  MatrixPtr cpuInputGrad_;
-  MatrixPtr cpuBias_;
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/IdentityProjection.cpp b/paddle/gserver/layers/IdentityProjection.cpp
deleted file mode 100644
index 6c70f77acc0c890e11a4929ea013d7745d8bbed0..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/IdentityProjection.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * IdentityProjection performs addition:
- * \f[
- *   out.row[i] += in.row[i]
- * \f]
- *
- * The config file api is identity_projection.
- */
-class IdentityProjection : public Projection {
-public:
-  IdentityProjection(const ProjectionConfig& config,
-                     const ParameterPtr& parameter,
-                     bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-};
-
-REGISTER_PROJECTION(identity, IdentityProjection);
-
-/**
- * Constructed function.
- * @note IdentityProjection should not have any parameter.
- */
-IdentityProjection::IdentityProjection(const ProjectionConfig& config,
-                                       const ParameterPtr& parameter,
-                                       bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(!parameter) << "'identity' projection should not have any parameter";
-}
-
-void IdentityProjection::forward() { out_->value->add(*in_->value); }
-
-void IdentityProjection::backward(const UpdateCallback& callback) {
-  if (in_->grad) {
-    in_->grad->add(*out_->grad);
-  }
-}
-
-/**
- * IdentityOffsetProjection likes IdentityProjection, but layer size may be
- * smaller
- * than input size. It selects dimensions [offset, offset+layer_size) from input
- * to
- * perform addition:
- * \f[
- *   out.row[i] += in.row[i + \textrm{offset}]
- * \f]
- *
- * The config file api is identity_projection.
- */
-class IdentityOffsetProjection : public Projection {
-public:
-  IdentityOffsetProjection(const ProjectionConfig& config,
-                           const ParameterPtr& parameter,
-                           bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-};
-
-REGISTER_PROJECTION(identity_offset, IdentityOffsetProjection);
-
-/**
- * Constructed function.
- * @note IdentityOffsetProjection should not have any parameter.
- */
-IdentityOffsetProjection::IdentityOffsetProjection(
-    const ProjectionConfig& config, const ParameterPtr& parameter, bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(!parameter) << "'identity_offset' projection "
-                       "should not have any parameter";
-  CHECK_LE(config.output_size() + config.offset(), config.input_size());
-}
-
-void IdentityOffsetProjection::forward() {
-  out_->value->addAtOffset(*in_->value, config_.offset());
-}
-
-void IdentityOffsetProjection::backward(const UpdateCallback& callback) {
-  if (in_->grad) {
-    in_->grad->addAtOffset(*out_->grad, config_.offset());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp
deleted file mode 100644
index 0ac92024bc7eddf05ce023708537d0aa7bab6426..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/InterpolationLayer.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for linear interpolation with two inputs,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i]
- * \f]
- * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs,
- * \f$w\f$ is (batchSize x 1) weight vector,
- * and \f$y\f$ is (batchSize x dataDim) output.
- *
- * The config file api is interpolation_layer.
- */
-
-class InterpolationLayer : public Layer {
-protected:
-  /// weightLast = 1 - weight
-  MatrixPtr weightLast_;
-  MatrixPtr tmpMatrix;
-
-public:
-  explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~InterpolationLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(interpolation, InterpolationLayer);
-
-bool InterpolationLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(3U, inputLayers_.size());
-
-  return true;
-}
-
-void InterpolationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inV2 = getInputValue(2);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(dataDim, getSize());
-  CHECK_EQ(dataDim, inV2->getWidth());
-  CHECK_EQ(batchSize, inV1->getHeight());
-  CHECK_EQ(batchSize, inV2->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  Matrix::resizeOrCreate(weightLast_, batchSize, 1, false, useGpu_);
-  weightLast_->one();
-  weightLast_->sub(*weightV);
-
-  REGISTER_TIMER_INFO("FwInterpTimer", getName().c_str());
-  // outV = inV1 * weight + inV2 * weightLast
-  outV->addRowScale(0, *inV1, *weightV);
-  outV->addRowScale(0, *inV2, *weightLast_);
-}
-
-void InterpolationLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inV2 = getInputValue(2);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr inG2 = getInputGrad(2);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  REGISTER_TIMER_INFO("BwInterpTimer", getName().c_str());
-
-  if (inG0) {
-    Matrix::resizeOrCreate(tmpMatrix, batchSize, dataDim, false, useGpu_);
-
-    // inG0 += outG .* (inV1 - inV2)
-    tmpMatrix->sub(*inV1, *inV2);
-    inG0->rowDotMul(0, *outG, *tmpMatrix);
-  }
-
-  if (inG1) {
-    // inG1 += outG * weight
-    inG1->addRowScale(0, *outG, *weightV);
-  }
-
-  if (inG2) {
-    // inG2 += outG * weightLast
-    inG2->addRowScale(0, *outG, *weightLast_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
deleted file mode 100644
index 0ea960902efc10007896b3f4ce915dea79d0d12d..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-class KmaxSeqScoreLayer : public Layer {
-private:
-  MatrixPtr scores_;
-  size_t beamSize_;
-  void kmaxScorePerSeq(const real* score,
-                       real* sortedRes,
-                       const ICpuGpuVectorPtr seqStartPos);
-
-public:
-  explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer);
-
-bool KmaxSeqScoreLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  bool ret = Layer::init(layerMap, parameterMap);
-  CHECK_EQ(1U, inputLayers_.size());
-
-  beamSize_ = config_.beam_size();
-  CHECK_GE(beamSize_, 1U);
-
-  setNeedSequenceInfo(false);
-  setNeedGradient(false);
-  return ret;
-}
-
-void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores,
-                                        real* sortedIds,
-                                        const ICpuGpuVectorPtr seqStartPos) {
-  int* starts = seqStartPos->getMutableData(false);
-  std::vector<real> indices;
-  for (size_t i = 0; i < seqStartPos->getSize() - 1; ++i) {
-    int seqLen = starts[i + 1] - starts[i];
-    int k = std::min(static_cast<int>(beamSize_), seqLen);
-
-    indices.resize(seqLen, 0);
-    std::iota(begin(indices), end(indices), 0.);
-    std::vector<real> tmpScore(scores + starts[i], scores + starts[i + 1]);
-    std::partial_sort(
-        begin(indices),
-        begin(indices) + k,
-        end(indices),
-        [&](size_t a, size_t b) { return tmpScore[a] > tmpScore[b]; });
-    memcpy(sortedIds + (i * beamSize_), indices.data(), k * sizeof(real));
-  }
-}
-
-void KmaxSeqScoreLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  const MatrixPtr inputScore = getInputValue(0);
-
-  CHECK(input.hasSeq() || input.hasSubseq())
-      << "input of " << getName()
-      << " must be a sequence or a nested sequence.";
-  CHECK_EQ(input.value->getWidth(), 1UL)
-      << "input of " << getName() << " are scores over a sequence or "
-      << "a nested sequence, so its width must be 1.";
-
-  if (useGpu_) {
-    /*
-     * currently, this Layer only runs in CPU, if the other part of the model is
-     * runing on GPU, then copy the input to this layer from GPU to CPU.
-     */
-    Matrix::resizeOrCreate(scores_,
-                           inputScore->getHeight(),
-                           1,
-                           false /* trans */,
-                           false /* useGpu */);
-    scores_->copyFrom(*inputScore);
-  } else {
-    scores_ = inputScore;
-  }
-
-  /*
-   * TODO(caoying)
-   * In PaddePaddle, currently all matrices are real number types,
-   * but output of this layer which is some selected indices of the give
-   * sequence are actually filled with int types so that storing int types
-   * information in a real number matrix is dangerous, since real numbers will
-   * be convered to int types.
-   */
-  Matrix::resizeOrCreate(
-      output_.value,
-      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
-      beamSize_,
-      false,
-      false);
-  output_.value->one();
-  output_.value->mulScalar(-1.);
-
-  kmaxScorePerSeq(scores_->getData(),
-                  output_.value->getData(),
-                  input.hasSubseq() ? input.subSequenceStartPositions
-                                    : input.sequenceStartPositions);
-}
-
-void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/L2DistanceLayer.cpp b/paddle/gserver/layers/L2DistanceLayer.cpp
deleted file mode 100644
index c8cca3762cc3ecd6c04d7d2b804bc588c281bfb4..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/L2DistanceLayer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "L2DistanceLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(l2_distance, L2DistanceLayer);
-
-bool L2DistanceLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
-                                     << "only two inputs.";
-  CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
-                           << "is fixed to be 1.";
-
-  return true;
-}
-
-void L2DistanceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const auto inV1 = getInputValue(0);
-  const auto inV2 = getInputValue(1);
-
-  CHECK(inV1 && inV2);
-  CHECK_EQ(inV1->getHeight(), inV2->getHeight())
-      << "The height of two inputs of this layer must be the same.";
-  CHECK_EQ(inV1->getWidth(), inV2->getWidth())
-      << "The width of two inputs of this layer must be the same.";
-
-  int batchSize = inV1->getHeight();
-  int output_dim = getSize();
-  {
-    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
-    reserveOutput(batchSize, output_dim);
-    auto outV = getOutputValue();
-    CHECK(outV) << "The output matrix should not be null.";
-
-    Matrix::resizeOrCreate(
-        inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
-
-    inputSub_->assign(*inV1);
-    inputSub_->sub(*inV2);
-    outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
-    outV->sqrt2(*outV);
-  }
-}
-
-void L2DistanceLayer::backward(const UpdateCallback& callback) {
-  const auto outG = getOutputGrad();
-  const auto outV = getOutputValue();
-  CHECK(outG && outV);
-
-  auto inGrad1 = getInputGrad(0);
-  auto inGrad2 = getInputGrad(1);
-
-  {
-    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
-
-    if (inGrad1 || inGrad2) {
-      outV->scalarDiv(*outV, 1.);
-      outV->dotMul(*outG, *outV);
-    }
-
-    if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
-
-    if (inGrad2) {
-      inputSub_->mulScalar(-1.);
-      inGrad2->addRowScale(0, *inputSub_, *outV);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/L2DistanceLayer.h b/paddle/gserver/layers/L2DistanceLayer.h
deleted file mode 100644
index 97f35daf7860fb3b082ef03203327e09dca67371..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/L2DistanceLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief The layer calculates the l2 distance between two input vectors.
- * \f[
- * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)}
- * \f]
- *
- * - Input1: A vector (batchSize * dataDim)
- * - Input2: A vector (batchSize * dataDim)
- * - Output: A vector (batchSize * 1)
- *
- * The configuration api is: l2_distance_layer.
- */
-
-class L2DistanceLayer : public Layer {
-public:
-  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
-  ~L2DistanceLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-private:
-  // Store the result of subtracting Input2 from Input1 in forward computation,
-  // which will be reused in backward computation.
-  MatrixPtr inputSub_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
deleted file mode 100644
index 32e2f4c9dd06e0ef7314b24719235c0be297961f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Layer.cpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Util.h"
-
-#include "CostLayer.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/utils/Error.h"
-#include "paddle/utils/Logging.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-#include "ValidationLayer.h"
-#endif
-
-DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
-
-namespace paddle {
-
-Layer::Layer(const LayerConfig& config, bool useGpu)
-    : config_(config),
-      useGpu_(useGpu),
-      deviceId_(CPU_DEVICE),
-      needSequenceInfo_(true) {}
-
-bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
-  if (useGpu_ && FLAGS_parallel_nn) {
-    /* gpu environment is specified by device property */
-    deviceId_ = config_.device();
-    if (deviceId_ < 0) {
-      useGpu_ = false;
-    }
-  }
-
-  output_.deviceId = deviceId_;
-
-  for (auto& inputConfig : config_.inputs()) {
-    std::string inputName = inputConfig.input_layer_name();
-    LayerPtr inputLayer;
-    CHECK(mapGet(inputName, layerMap, &inputLayer))
-        << "Cannot find input layer " << inputName << " for layer "
-        << getName();
-    this->addPrev(inputLayer);
-
-    inputLayer->addOutputArgument(deviceId_);
-
-    if (inputConfig.has_input_parameter_name()) {
-      ParameterPtr parameter;
-      CHECK(
-          mapGet(inputConfig.input_parameter_name(), parameterMap, &parameter))
-          << "Cannot find input parameter "
-          << inputConfig.input_parameter_name() << " for layer " << getName();
-      parameter->incShared();
-      CHECK_EQ(parameter->getDeviceId(), getDeviceId());
-      parameters_.push_back(parameter);
-    } else {
-      parameters_.push_back(nullptr);
-    }
-
-    if (inputConfig.has_input_layer_argument()) {
-      inputArgument_.push_back(inputConfig.input_layer_argument());
-    } else {
-      inputArgument_.push_back("");
-    }
-  }
-
-  if (config_.has_bias_parameter_name()) {
-    CHECK(mapGet(config_.bias_parameter_name(), parameterMap, &biasParameter_))
-        << "Cannot find bias parameter " << config_.bias_parameter_name()
-        << " for layer " << getName();
-    biasParameter_->incShared();
-    CHECK_EQ(biasParameter_->getDeviceId(), getDeviceId());
-  }
-
-  /* specify the activation function according to the configuration */
-  std::string action_type = config_.active_type();
-  activation_.reset(ActivationFunction::create(action_type));
-  CHECK(activation_);
-
-  initNeedFlags();
-  markInBackward_.assign(inputLayers_.size(), false);
-
-  return true;
-}
-
-ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
-
-LayerPtr Layer::create(const LayerConfig& config) {
-  std::string type = config.type();
-
-#ifndef PADDLE_MOBILE_INFERENCE
-  // NOTE: As following types have illegal character '-',
-  // they can not use REGISTER_LAYER to registrar.
-  // Besides, to fit with old training models,
-  // they can not use '_' instead.
-  if (type == "multi-class-cross-entropy")
-    return LayerPtr(new MultiClassCrossEntropy(config));
-  else if (type == "rank-cost")
-    return LayerPtr(new RankingCost(config));
-  else if (type == "auc-validation")
-    return LayerPtr(new AucValidation(config));
-  else if (type == "pnpair-validation")
-    return LayerPtr(new PnpairValidation(config));
-#endif
-
-  return LayerPtr(registrar_.createByType(config.type(), config));
-}
-
-void Layer::resetSpecifyOutput(Argument& output,
-                               size_t height,
-                               size_t width,
-                               bool isValueClean,
-                               bool isGradClean) {
-  SetDevice device(output.deviceId);
-
-  Matrix::resizeOrCreate(
-      output.value, height, width, /* trans */ false, useGpu(output.deviceId));
-  if (isValueClean) {
-    output.value->zeroMem();
-  }
-
-  if (passType_ != PASS_TEST && needGradient()) {
-    Matrix::resizeOrCreate(
-        output.grad, height, width, /* trans */ false, useGpu(output.deviceId));
-    if (isGradClean) {
-      output.grad->zeroMem();
-    }
-  }
-}
-
-void Layer::resizeOutput(size_t height, size_t width) {
-  resetSpecifyOutput(output_, height, width, false, false);
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    resetSpecifyOutput(outputOtherDevice_[i], height, width, false, false);
-  }
-}
-
-void Layer::reserveOutput(size_t height, size_t width) {
-  resetSpecifyOutput(output_, height, width, false, true);
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    resetSpecifyOutput(outputOtherDevice_[i], height, width, false, true);
-  }
-}
-
-void Layer::resetOutput(size_t height, size_t width) {
-  resetSpecifyOutput(output_, height, width, true, true);
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    resetSpecifyOutput(outputOtherDevice_[i], height, width, true, true);
-  }
-}
-
-void Layer::addOutputArgument(int deviceId) {
-  if (deviceId == deviceId_) {
-    output_.countIncrement();
-    return;
-  } else {
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].deviceId == deviceId) {
-        outputOtherDevice_[i].countIncrement();
-        return;
-      }
-    }
-  }
-
-  Argument argu;
-  argu.deviceId = deviceId;
-  outputOtherDevice_.push_back(argu);
-  outputOtherDevice_.back().countIncrement();
-}
-
-void Layer::copyOutputToOtherDevice() {
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    SetDevice device(outputOtherDevice_[i].deviceId);
-    // If outputOtherDevice_[i].value is a CpuMatrix,
-    // the copyFrom is a synchronous interface.
-    // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent
-    // calculations are all on HPPL_STREAM_DEFAULT,
-    // copyFrom can be an asynchronous interface.
-    outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
-                                          HPPL_STREAM_DEFAULT);
-    outputOtherDevice_[i].sequenceStartPositions =
-        output_.sequenceStartPositions;
-    outputOtherDevice_[i].subSequenceStartPositions =
-        output_.subSequenceStartPositions;
-    outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
-
-    outputOtherDevice_[i].notifyValueReady();
-  }
-}
-
-void Layer::waitInputValue() {
-  for (size_t i = 0; i != inputLayers_.size(); i++) {
-    if (inputLayers_[i]->getDeviceId() != deviceId_) {
-      getInput(i).waitValueReady();
-    }
-  }
-}
-
-void Layer::waitAndMergeOutputGrad() {
-  if (!output_.grad || !outputOtherDevice_.size()) {
-    return;
-  }
-
-  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
-    outputOtherDevice_[i].waitGradReady();
-  }
-
-  /* merge output grad */
-  size_t i = 0;
-  if (!output_.getAllCount()) {
-    output_.grad->copyFrom(*outputOtherDevice_[0].grad, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-
-    i++;
-    if (outputOtherDevice_.size() == 1) return;
-  }
-
-  Matrix::resizeOrCreate(tmpGrad_,
-                         output_.grad->getHeight(),
-                         output_.grad->getWidth(),
-                         /* trans */ false,
-                         useGpu(output_.deviceId));
-
-  for (; i != outputOtherDevice_.size(); i++) {
-    tmpGrad_->copyFrom(*outputOtherDevice_[i].grad, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-    output_.grad->add(*tmpGrad_);
-  }
-}
-
-void Layer::markAllInputGrad() {
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (!markInBackward_[i]) {
-      inputLayers_[i]->getOutput(deviceId_).notifyGradReady();
-    }
-    markInBackward_[i] = false;
-  }
-}
-
-void Layer::markInputGrad(int inputIndex) {
-  inputLayers_[inputIndex]->getOutput(deviceId_).notifyGradReady();
-  markInBackward_[inputIndex] = true;
-}
-
-void Layer::zeroGrad() {
-  CHECK(output_.grad.get() != NULL);
-  output_.grad->zeroMem();
-}
-
-void Layer::initNeedFlags() {
-  auto initFlag = [this](
-      bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) {
-    flag = false;
-    if (biasParameter_ && biasParameter_->hasType(type)) {
-      flag = true;
-    }
-    if (!flag) {
-      for (auto& para : parameters_) {
-        if (para && para->hasType(type)) {
-          flag = true;
-          break;
-        }
-      }
-    }
-    if (!flag) {
-      for (auto& layer : inputLayers_) {
-        if ((layer.get()->*flagQueryFunc)()) {
-          flag = true;
-        }
-      }
-    }
-  };
-  initFlag(needGradient_, &Layer::needGradient, PARAMETER_GRADIENT);
-}
-
-void Layer::showOutputStats() {
-  MatrixPtr out = getOutputValue();
-  if (!out) return;
-  if (!out->getElementCnt()) {
-    LOG(INFO) << "The number of output of " << config_.name()
-              << " is 0, skip to show the statistics";
-    return;
-  }
-  MatrixPtr outSquare;
-  if (dynamic_cast<GpuSparseMatrix*>(out.get())) {
-    GpuSparseMatrix* tmp = dynamic_cast<GpuSparseMatrix*>(out.get());
-    outSquare = std::make_shared<CpuSparseMatrix>(tmp->getHeight(),
-                                                  tmp->getWidth(),
-                                                  tmp->getElementCnt(),
-                                                  tmp->getValueType(),
-                                                  tmp->getFormat());
-  } else {
-    outSquare = out->clone();
-  }
-  outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-
-  real mean = outSquare->getSum() / out->getElementCnt();
-  real min;
-  real max;
-  if (dynamic_cast<CpuSparseMatrix*>(outSquare.get())) {
-    auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
-    min = tmpMat->getMin();
-    max = tmpMat->getMax();
-    tmpMat->square2();
-    LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
-  } else {
-    min = outSquare->getMin();
-    max = outSquare->getMax();
-    outSquare->square2();
-  }
-  real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
-  std = std > 0 ? std : 0;
-  LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean
-            << ", "
-            << "std=" << std << ", "
-            << "min=" << min << ", "
-            << "max=" << max;
-}
-
-void Layer::forwardActivation() {
-  /* activation */
-  auto status = activation_->forward(output_);
-  status.check();
-
-  /* dropout */
-  if (config_.drop_rate() > 0) {
-    forwardDropOut();
-    CHECK_NE(activation_->getName(), "softmax")
-        << "Softmax activation cannot be used with Dropout";
-  }
-
-  if (FLAGS_show_layer_stat) {
-    showOutputStats();
-  }
-}
-
-void Layer::backwardActivation() {
-  /* Do error clipping */
-  if (config_.error_clipping_threshold() > 0.0f) {
-    if (FLAGS_log_error_clipping) {
-      VectorPtr outGradVec = Vector::create(
-          output_.grad->getData(), output_.grad->getElementCnt(), useGpu_);
-      real maxAbsGrad = outGradVec->getAbsMax();
-      if (maxAbsGrad > config_.error_clipping_threshold()) {
-        real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize();
-        LOG(INFO) << " layer=" << config_.name() << " need clipping,"
-                  << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad;
-      }
-    }
-    output_.grad->clip(-config_.error_clipping_threshold(),
-                       config_.error_clipping_threshold());
-  }
-
-  /* Do dropout for delta*/
-  if (config_.drop_rate() > 0 && passType_ != PASS_TEST) {
-    MatrixPtr oGrad = getOutputGrad();
-    oGrad->dotMul(*oGrad, *dropOutMask_);
-  }
-
-  auto status = activation_->backward(output_);
-  status.check();
-}
-
-void Layer::forwardDropOut() {
-  auto& outV = getOutputValue();
-
-  if (passType_ == PASS_TRAIN) {
-    // new dropOutMask_ if dropOutMask_ is null ptr
-    Matrix::resizeOrCreate(dropOutMask_,
-                           outV->getHeight(),
-                           outV->getWidth(),
-                           false,
-                           useGpu(deviceId_));
-    dropOutMask_->randomizeUniform();  // generate a uniform random matrix
-    dropOutMask_->biggerThanScalar(config_.drop_rate());  // random mask
-    outV->dotMul(*outV, *dropOutMask_);                   // dropout
-  } else if (passType_ == PASS_GC) {
-    // only initialize once
-    if (!dropOutMask_) {
-      dropOutMask_ = Matrix::create(
-          outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_));
-      // We use cpu matrix to generate mask so that the mask
-      // will be same for both gpu version and cpu version.
-      // This will help unittest to make sure they have same result.
-      MatrixPtr tmpMask = Matrix::create(outV->getHeight(), outV->getWidth());
-      tmpMask->randomizeUniform();  // generate a uniform random matrix
-      tmpMask->biggerThanScalar(config_.drop_rate());  // random mask
-      dropOutMask_->copyFrom(*tmpMask);
-    }
-    outV->dotMul(*outV, *dropOutMask_);
-  } else {  // passType == PASS_TEST
-    outV->mulScalar(1.0 - config_.drop_rate());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
deleted file mode 100644
index 8da342a00f72ee1196c4af24104ce92c6bbf9f5c..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Layer.h
+++ /dev/null
@@ -1,512 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include "ModelConfig.pb.h"
-#include "paddle/function/Function.h"
-#include "paddle/gserver/activations/ActivationFunction.h"
-#include "paddle/math/CpuSparseMatrix.h"
-#include "paddle/parameter/Argument.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/Weight.h"
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/utils/Util.h"
-
-/// Macro for registering a layer type.
-/// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
-#define REGISTER_LAYER(__type_name, __class_name) \
-  static InitFunction __reg_type_##__type_name(   \
-      []() { Layer::registrar_.registerClass<__class_name>(#__type_name); })
-
-#define REGISTER_LAYER_CREATE_FUNC(__type_name, createFunction) \
-  static InitFunction __reg_type_##__type_name(                 \
-      []() { Layer::registrar_.registerClass(#__type_name, createFunction); })
-
-namespace paddle {
-
-class Layer;
-typedef std::shared_ptr<Layer> LayerPtr;
-typedef std::map<std::string, LayerPtr> LayerMap;
-class NeuralNetwork;
-
-/// layer state, used for RNN and LSTM layers
-struct LayerState {
-  std::vector<MatrixPtr> value;
-};
-typedef std::shared_ptr<LayerState> LayerStatePtr;
-
-/// Paddle device ID, MKLDNN is -2, CPU is -1
-enum PADDLE_DEVICE_ID {
-  MKLDNN_DEVICE = -2,
-  CPU_DEVICE = -1,
-};
-
-/**
- * @brief Base class for layer.
- * Define necessary variables and functions for every layer.
- */
-class Layer {
-protected:
-  /// Layer config
-  LayerConfig config_;
-  /// whether to use GPU
-  bool useGpu_;
-  /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
-  int deviceId_;
-  /// Input layers
-  std::vector<LayerPtr> inputLayers_;
-  /// Argument of input layers
-  std::vector<std::string> inputArgument_;
-
-  /// Parameter for each input layer.
-  /// Parameters_[i] is nullptr if inputLayers_[i] does not need parameter.
-  std::vector<ParameterPtr> parameters_;
-
-  /// nullptr if bias is not needed.
-  ParameterPtr biasParameter_;
-
-  /// Output
-  Argument output_;
-  /// Several outputs stored on different devices, used in 'parallel_nn' case,
-  /// and record them by deviceId_.
-  /// Also used in 'use_mkldnn' case.
-  std::vector<Argument> outputOtherDevice_;
-  /// If there are several outputs, map them by each name.
-  /// MKLDNNLayer use it only to merge output grad
-  std::map<std::string, Argument*> outputMap_;
-  /// Used to merge grad on different devices.
-  MatrixPtr tmpGrad_;
-
-  std::unique_ptr<ActivationFunction> activation_;
-
-  /// Current passType, PASS_TRAIN or PASS_TEST
-  PassType passType_;
-
-  /// Random 0-1 matrix for dropOut
-  MatrixPtr dropOutMask_;
-
-  /// Whether the layer need to compute gradient
-  bool needGradient_;
-  /// Whether the layer need to compute re-sequence information
-  bool needSequenceInfo_;
-
-  /// Mark input grad in(true) or out(false) of backward function.
-  std::vector<bool> markInBackward_;
-
-  /// Layer forward function
-  std::vector<std::shared_ptr<FunctionBase>> forward_;
-  /// Layer backward function
-  std::vector<std::shared_ptr<FunctionBase>> backward_;
-
-public:
-  /**
-   * Wait until all input value ready.
-   * Called before Layer::forward() function.
-   */
-  virtual void waitInputValue();
-
-  /**
-   * Copy layer's output_ to other device.
-   * If output layer is in other device, called after Layer::forward() function.
-   */
-  virtual void copyOutputToOtherDevice();
-
-  /**
-   * Wait until all output grad ready and merge them to output_.grad.
-   * Called before Layer::backward() function.
-   */
-  virtual void waitAndMergeOutputGrad();
-
-  /**
-   * Notify previous layer the output grad ready.
-   * Called after Layer::backward() function.
-   */
-  virtual void markAllInputGrad();
-
-protected:
-  /**
-   * Create layer function. Function is called in forward or backward.
-   * \param function, Layer::forward_ or Layer::backward_
-   * \param name, function name
-   * \param config, initialization configuration for the function
-   */
-  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
-                      const std::string& name,
-                      const FuncConfig& config) {
-    if (useGpu_) {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
-    } else {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
-    }
-    auto& func = function.back();
-    func->init(config);
-  }
-
-  /**
-   * Notify specified layer the output grad ready.
-   * Called in the backward function.
-   * If do mark input grad in the backward function, you should to ensure
-   * that all input grad will be marked in the backward function.
-   */
-  void markInputGrad(int inputIndex);
-
-  /**
-   * Get the argument of input layer.
-   */
-  const Argument& getInput(size_t inputIndex) const {
-    return inputLayers_[inputIndex]->getOutput(deviceId_);
-  }
-
-  /**
-   * Get the argument of input layer.
-   */
-  const Argument& getInput(const Layer& inputLayer) const {
-    return inputLayer.getOutput(deviceId_);
-  }
-
-  /**
-   * Get the argument of input layer with deviceId.
-   */
-  const Argument& getInput(size_t inputIndex, int deviceId) const {
-    return inputLayers_[inputIndex]->getOutput(deviceId);
-  }
-
-  /**
-   * Get the forward-input value.
-   */
-  const MatrixPtr& getInputValue(int inputIndex) {
-    return inputLayers_[inputIndex]->getOutput(deviceId_).value;
-  }
-
-  /**
-   * Get the forward-input value.
-   */
-  const MatrixPtr& getInputValue(const Layer& inputLayer) {
-    return inputLayer.getOutput(deviceId_).value;
-  }
-
-  /**
-   * Get the forward-input value with deviceId.
-   */
-  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
-    return inputLayers_[inputIndex]->getOutput(deviceId).value;
-  }
-
-  /**
-   * Get the forward-input grad.
-   */
-  const MatrixPtr& getInputGrad(int inputIndex) {
-    return inputLayers_[inputIndex]->getOutput(deviceId_).grad;
-  }
-
-  /**
-   * Get the forward-input grad.
-   */
-  const MatrixPtr& getInputGrad(const Layer& inputLayer) {
-    return inputLayer.getOutput(deviceId_).grad;
-  }
-
-  /**
-   * Get the forward-input grad.
-   */
-  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
-    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
-  }
-
-  /**
-   * Get the forward-input label.
-   */
-  const IVectorPtr& getInputLabel(const Layer& inputLayer) {
-    return inputLayer.getOutput(deviceId_).ids;
-  }
-
-  /**
-   * Change the size of output (value, grad).
-   * Reset to value zero if isValueClean = true,
-   * Reset to grad zero if isGradClean = true.
-   */
-  void resetSpecifyOutput(Argument& output,
-                          size_t height,
-                          size_t width,
-                          bool isValueClean,
-                          bool isGradClean);
-
-  /**
-   * Add output argument to other devices.
-   */
-  void addOutputArgument(int deviceId);
-
-public:
-  explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu);
-  virtual ~Layer() {}
-
-  /// Register a Layer
-  static ClassRegistrar<Layer, LayerConfig> registrar_;
-
-  /**
-   * Get the flag whether layer need to compute gradient.
-   */
-  bool needGradient() const { return needGradient_; }
-
-  /**
-   * Set the flag whether layer need to compute gradient.
-   */
-  void setNeedGradient(bool need) { needGradient_ = need; }
-
-  /**
-   * Set the flag whether layer need to re-compute sequence information,
-   * which includes sequenceStartPositions or subSequenceStartPositions.
-   */
-  void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; }
-
-  /**
-   * Get layer's name.
-   */
-  const std::string& getName() const { return config_.name(); }
-
-  /**
-   * Get layer's type.
-   */
-  const std::string& getType() const { return config_.type(); }
-
-  /**
-   * Get layer's size.
-   */
-  size_t getSize() const { return config_.size(); }
-
-  /**
-   * Get layer's deviceId.
-   */
-  int getDeviceId() const { return deviceId_; }
-
-  /**
-   * Add the inputLayer.
-   */
-  void addPrev(LayerPtr l) { inputLayers_.push_back(l); }
-
-  /**
-   * Get the size of inputLayer[i].
-   */
-  const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; }
-
-  /**
-   * Get the forward-output value.
-   */
-  const MatrixPtr& getOutputValue() { return output_.value; }
-
-  /**
-   * Get the forward-output label.
-   */
-  const IVectorPtr& getOutputLabel() { return output_.ids; }
-
-  /**
-   * Get the backward-Loss value.
-   */
-  const MatrixPtr& getOutputGrad() { return output_.grad; }
-  /**
-   * If layer has multi-output, set output into outputMap_.
-   */
-  void setOutput(const std::string& name, Argument* output) {
-    outputMap_[name] = output;
-  }
-
-  /**
-   * Get the output map size, if layer has multi-output.
-   */
-  size_t getOutputMapSize() { return outputMap_.size(); }
-
-  /**
-   * Get the output based on layer's name.
-   */
-  Argument& getOutput(const std::string& str = "") {
-    if (str == "") {
-      return output_;
-    } else {
-      auto output = outputMap_.find(str);
-      if (output != outputMap_.end()) {
-        return *output->second;
-      } else {
-        LOG(FATAL) << "No specific output " << str;
-        return *((Argument*)nullptr);
-      }
-    }
-  }
-
-  /**
-   * Get the output based on deviceId.
-   */
-  const Argument& getOutput(int deviceId) const {
-    if (deviceId == getDeviceId()) {
-      return output_;
-    } else {
-      for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-        if (outputOtherDevice_[i].deviceId == deviceId) {
-          return outputOtherDevice_[i];
-        }
-      }
-
-      LOG(FATAL) << "No specific device output ";
-      return *((Argument*)nullptr);
-    }
-  }
-
-  /**
-   * Get layer's parameters.
-   */
-  const std::vector<ParameterPtr>& getParameters() { return parameters_; }
-
-  /**
-   * Get layer's bias-parameters.
-   */
-  const ParameterPtr& getBiasParameter() { return biasParameter_; }
-
-  /**
-   * Create pointer of layer.
-   */
-  static LayerPtr create(const LayerConfig& config);
-
-  /**
-   * Resize the output matrix size.
-   */
-  void resizeOutput(size_t height, size_t width);
-
-  /**
-   * Resize the output matrix size,
-   * and reset value to zero.
-   */
-  void reserveOutput(size_t height, size_t width);
-
-  /**
-   * Resize the output matrix size,
-   * and reset value and grad to zero.
-   */
-  void resetOutput(size_t height, size_t width);
-
-  /**
-   * Clear the gradient of output.
-   */
-  void zeroGrad();
-
-  /**
-   * Intialization.
-   * For example, adding input layers from layerMap and parameterMap.
-   */
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  /**
-   * Intialization for sub network if there has sub network.
-   * @param rootNetwork root network
-   * @param config model config
-   * @param parameterTypes parameter's type
-   * @param useGpu whether to use gpu or not
-   */
-  virtual void initSubNetwork(NeuralNetwork* rootNetwork,
-                              const ModelConfig& config,
-                              const std::vector<ParameterType>& parameterTypes,
-                              bool useGpu) {}
-
-  /**
-   * @brief Access SubNetwork Object.
-   *        If subnetwork exists, then invoke callback with subnetwrk.
-   * @param callback if sub-network is exist, the callback is invoked.
-   */
-  virtual void accessSubNetwork(
-      const std::function<void(NeuralNetwork&)>& callback) {}
-
-  /**
-   * If use sparse row matrix as parameter,
-   * prefetch feature ids in input label.
-   */
-  virtual void prefetch() {}
-
-  /**
-   * Forward propagation.
-   * All inherited implementation should call Layer::foward() function.
-   */
-  virtual void forward(PassType passType) {
-    passType_ = passType;
-    if (!inputLayers_.empty() && needSequenceInfo_) {
-      const Argument& input = getInput(0);
-      output_.sequenceStartPositions = input.sequenceStartPositions;
-      output_.subSequenceStartPositions = input.subSequenceStartPositions;
-      output_.cpuSequenceDims = input.cpuSequenceDims;
-    }
-  }
-
-  /**
-   * Reset the internal state variables.
-   * Allocate them if they have not been allocated.
-   * This function need to called before Layer::forward() for generating
-   * sequence.
-   *
-   * This is used for sequence generation. When generating sequence, the
-   * calculation at current timestamp depends on the state from previous
-   * timestamp. The model needs to keep the information about the previous
-   * timestamp in the state variables. Layers such as RecurrentLayer,
-   * LstmLayer and ContextLayer have state variables.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Get layer state.
-   * @return A copy of internal state.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
-  /**
-   * Show output state.
-   */
-  void showOutputStats();
-
-  /**
-   * Backward propagation.
-   * Should only be called after Layer::forward() function.
-   */
-  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
-
-  /**
-   * One pass is finished.
-   */
-  virtual void onPassEnd() {}
-
-protected:
-  /**
-   * Forward of activation function.
-   */
-  void forwardActivation();
-  /**
-   * Backward of activation function.
-   */
-  void backwardActivation();
-  /**
-   * Forward of dropOut.
-   */
-  void forwardDropOut();
-  /**
-   * Initilize the needGradient_ flag.
-   */
-  void initNeedFlags();
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h
deleted file mode 100644
index 1ea4c7e105703b76601499bf3944648cdc98ec99..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/LinearChainCRF.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-class LinearChainCRF {
-public:
-  /**
-   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
-   * The first numClasses values of para are for starting weights (\f$a\f$).
-   * The next numClasses values of para are for ending weights (\f$b\f$),
-   * The remaning values are for transition weights (\f$w\f$).
-   *
-   * The probability of a state sequence s of length \f$L\f$ is defined as:
-   * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
-   *                  + \sum_{l=1}^L x_{s_l}
-   *                  + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
-   * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
-   * all possible
-   * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
-   */
-  LinearChainCRF(int numClasses, real* para);
-
-  /**
-   * Calculate the negative log likelihood of s given x.
-   * The size of x must be length * numClasses. Each consecutive numClasses
-   * values are the features for one time step.
-   */
-  real forward(real* x, int* s, int length);
-
-  /**
-   * Calculate the gradient with respect to x, a, b, and w.
-   * backward() can only be called after a corresponding call to forward() with
-   * the same x, s and length.
-   * The gradient with respect to a, b, and w will not be calculated if
-   * needWGrad is false.
-   * @note Please call getWGrad() and getXGrad() to get the gradient with
-   * respect to (a, b, w) and x respectively.
-   */
-  void backward(real* x, int* s, int length, bool needWGrad);
-
-  /**
-   * Find the most probable sequence given x. The result will be stored in s.
-   */
-  void decode(real* x, int* s, int length);
-
-  /*
-   * Return the gradient with respect to (a, b, w). It can only be called after
-   * a corresponding call to backward().
-   */
-  MatrixPtr getWGrad() { return matWGrad_; }
-
-  /*
-   * Return the gradient with respect to x. It can only be called after a
-   * corresponding call to backward().
-   */
-  MatrixPtr getXGrad() { return matGrad_; }
-
-protected:
-  int numClasses_;
-  MatrixPtr a_;
-  MatrixPtr b_;
-  MatrixPtr w_;
-  MatrixPtr matWGrad_;
-  MatrixPtr da_;
-  MatrixPtr db_;
-  MatrixPtr dw_;
-  MatrixPtr ones_;
-
-  MatrixPtr expX_;
-  MatrixPtr matGrad_;
-  MatrixPtr alpha_;
-  MatrixPtr beta_;
-  MatrixPtr maxX_;
-  MatrixPtr expW_;
-
-  // track_(k,i) = j means that the best sequence at time k for class i comes
-  // from the sequence at time k-1 for class j
-  IVectorPtr track_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/LinearChainCTC.h b/paddle/gserver/layers/LinearChainCTC.h
deleted file mode 100644
index 0b774277dc8cf27f48c6905168cdea047365c99d..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/LinearChainCTC.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-class LinearChainCTC {
-public:
-  LinearChainCTC(int numClasses, bool normByTimes);
-
-  // Calculate the negative log probability as loss
-  real forward(real* softmaxSeq,
-               int softmaxSeqLen,
-               int* labelSeq,
-               int labelSeqLen);
-
-  // calculate the gradient
-  void backward(real* softmaxSeq,
-                real* softmaxSeqGrad,
-                int* labelSeq,
-                int labelSeqLen);
-
-protected:
-  int numClasses_, blank_, totalSegments_, totalTime_;
-  bool normByTimes_;
-  bool isInvalid_;
-
-  MatrixPtr logActs_, forwardVars_, backwardVars_, gradTerms_;
-
-  real logProb_;
-
-  void segmentRange(int& start, int& end, int time);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/LstmCompute.cpp b/paddle/gserver/layers/LstmCompute.cpp
deleted file mode 100644
index ea30f6d6b1b8586569407af6baac2c14034e709c..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/LstmCompute.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmCompute.h"
-#include "hl_recurrent_apply.cuh"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-void LstmCompute::init(LayerConfig &config) {
-  activeNode_ = hlActiveType(config.active_type());
-  activeGate_ = hlActiveType(config.active_gate_type());
-  activeState_ = hlActiveType(config.active_state_type());
-}
-
-template <>
-void LstmCompute::forwardOneSequence<0>(hl_lstm_value value, int frameSize) {
-  hl_cpu_lstm_forward(hppl::forward::lstm(),
-                      value,
-                      frameSize,
-                      activeNode_,
-                      activeGate_,
-                      activeState_);
-}
-
-template <>
-void LstmCompute::backwardOneSequence<0>(hl_lstm_value value,
-                                         hl_lstm_grad grad,
-                                         int frameSize) {
-  hl_cpu_lstm_backward(hppl::backward::lstm(),
-                       value,
-                       grad,
-                       frameSize,
-                       activeNode_,
-                       activeGate_,
-                       activeState_);
-}
-
-template <>
-void LstmCompute::forwardBatch<0>(hl_lstm_value value,
-                                  int frameSize,
-                                  int batchSize) {
-  for (int b = 0; b < batchSize; b++) {
-    forwardOneSequence<0>(value, frameSize);
-
-    value.gateValue += frameSize * 4;
-    value.stateValue += frameSize;
-    value.stateActiveValue += frameSize;
-    value.outputValue += frameSize;
-    if (value.prevStateValue) {
-      value.prevStateValue += frameSize;
-    }
-  }
-}
-
-template <>
-void LstmCompute::backwardBatch<0>(hl_lstm_value value,
-                                   hl_lstm_grad grad,
-                                   int frameSize,
-                                   int batchSize) {
-  for (int b = 0; b < batchSize; b++) {
-    backwardOneSequence<0>(value, grad, frameSize);
-
-    value.gateValue += frameSize * 4;
-    value.stateValue += frameSize;
-    value.stateActiveValue += frameSize;
-    value.outputValue += frameSize;
-    if (value.prevStateValue) {
-      value.prevStateValue += frameSize;
-    }
-
-    grad.gateGrad += frameSize * 4;
-    grad.stateGrad += frameSize;
-    grad.stateActiveGrad += frameSize;
-    grad.outputGrad += frameSize;
-    if (grad.prevStateGrad) {
-      grad.prevStateGrad += frameSize;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h
deleted file mode 100644
index b7d55eb1f984d102802cab87ba12ca9c69a2f4be..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/LstmCompute.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "hl_gpu.h"
-#include "paddle/utils/Common.h"
-
-namespace paddle {
-
-class LstmCompute {
-public:
-  void init(LayerConfig &config);
-
-  /**
-   * LstmLayer batch compute API (forwardBatch, backwardBatch).
-   * If use batch compute api, lstm value(and grad) need to be batch structure.
-   * Compute order:
-   *   forwardBatch:  for 0 <= id < numBatch
-   *   backwardBatch:  for numBatch > id >= 0
-   */
-  template <bool useGpu>
-  void forwardBatch(hl_lstm_value value, int frameSize, int batchSize);
-
-  template <bool useGpu>
-  void backwardBatch(hl_lstm_value value,
-                     hl_lstm_grad grad,
-                     int frameSize,
-                     int batchSize);
-
-  /**
-   * LstmLayer sequence compute API (forwardOneSequence, backwardOneSequence).
-   * Compute order(for each sequence):
-   *   forwardOneSequence:
-   *     if (!reversed) for 0 <= seqId < seqLength
-   *     if (reversed)  for seqLength > seqId >= 0
-   *   backwardOneSequence:
-   *     if (!reversed) for seqLength > seqId >= 0
-   *     if (reversed)  for 0 <= seqId < seqLength
-   */
-  template <bool useGpu>
-  void forwardOneSequence(hl_lstm_value value, int frameSize);
-  template <bool useGpu>
-  void backwardOneSequence(hl_lstm_value value,
-                           hl_lstm_grad grad,
-                           int frameSize);
-
-public:
-  hl_activation_mode_t activeNode_;
-  hl_activation_mode_t activeGate_;
-  hl_activation_mode_t activeState_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp
deleted file mode 100644
index f65ae6a3e69cb5f0a7e6073d17bfd0beae91cd5d..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/LstmLayer.cpp
+++ /dev/null
@@ -1,805 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmLayer.h"
-#include "paddle/math/BaseMatrix.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Stat.h"
-
-DECLARE_bool(prev_batch_state);
-
-namespace paddle {
-
-REGISTER_LAYER(lstmemory, LstmLayer);
-
-bool LstmLayer::init(const LayerMap &layerMap,
-                     const ParameterMap &parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-  CHECK_EQ(getSize() * getSize() * 4, parameters_[0]->getSize());
-  CHECK_EQ(getSize() * 7, biasParameter_->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 4, parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, getSize() * 7, biasParameter_));
-    if (bias_->getW()) {
-      localBias_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  getSize() * 4,
-                                  /* trans= */ false,
-                                  useGpu_);
-      checkIg_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-      checkFg_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-      checkOg_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-
-      localBias_->setData(bias_->getW()->getData());
-      checkIg_->setData(bias_->getW()->getData() + getSize() * 4);
-      checkFg_->setData(bias_->getW()->getData() + getSize() * 5);
-      checkOg_->setData(bias_->getW()->getData() + getSize() * 6);
-    }
-
-    if (bias_->getWGrad()) {
-      localBiasGrad_ = Matrix::create(nullptr,
-                                      /* height= */ 1,
-                                      getSize() * 4,
-                                      /* trans= */ false,
-                                      useGpu_);
-      checkIgGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-      checkFgGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-      checkOgGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    getSize(),
-                                    /* trans= */ false,
-                                    useGpu_);
-      localBiasGrad_->setData(bias_->getWGrad()->getData());
-      checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4);
-      checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5);
-      checkOgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 6);
-    }
-  } else {
-    LOG(FATAL) << "Bias should be here.";
-  }
-  reversed_ = config_.reversed();
-
-  // create IdentityActivation for using drop_rate
-  activation_.reset(ActivationFunction::create(""));
-
-  LstmCompute::init(config_);
-  useBatch_ = true;
-  useSeqParallel_ = false;
-  if (useGpu_ && (getSize() == 32 || getSize() == 64)) {
-    useSeqParallel_ = true;
-  }
-
-  return true;
-}
-
-void LstmLayer::resetState() {
-  CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer";
-  Matrix::resizeOrCreate(
-      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
-  Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_);
-  prevOutput_->resize(0, getSize());
-  prevState_->resize(0, getSize());
-  if (FLAGS_prev_batch_state) {
-    useBatch_ = true;
-  } else {
-    useBatch_ = false;
-  }
-}
-
-void LstmLayer::setState(LayerStatePtr state) {
-  CHECK(state->value.size() == 2) << "two matrices are expected for LSTM state";
-  prevOutput_->resize(state->value[0]->getHeight(),
-                      state->value[0]->getWidth());
-  prevState_->resize(state->value[1]->getHeight(), state->value[1]->getWidth());
-  prevOutput_->copyFrom(*(state->value[0]));
-  prevState_->copyFrom(*(state->value[1]));
-}
-
-LayerStatePtr LstmLayer::getState() {
-  LayerStatePtr res = std::make_shared<LayerState>();
-  if (prevOutput_->getHeight() && prevOutput_->getWidth()) {
-    res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
-    res->value[0]->copyFrom(*prevOutput_);
-    res->value.push_back(prevState_->clone(0, 0, useGpu_));
-    res->value[1]->copyFrom(*prevState_);
-  } else {
-    MatrixPtr output =
-        Matrix::create(1, getSize(), /* trans= */ false, useGpu_);
-    MatrixPtr state = Matrix::create(1, getSize(), /* trans= */ false, useGpu_);
-    output->resize(0, getSize());
-    state->resize(0, getSize());
-    res->value.push_back(output);
-    res->value.push_back(state);
-  }
-  return res;
-}
-
-void LstmLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("LstmFwTimer", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument &input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  resetOutput(batchSize, getSize());
-  CHECK_EQ(getSize() * 4, input.value->getWidth());
-  size_t numSequences = input.getNumSequences();
-  const int *starts = input.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  Matrix::resizeOrCreate(gate_.value,
-                         /* height= */ batchSize,
-                         getSize() * 4,
-                         /* trans= */ false,
-                         useGpu_);
-  if (prevOutput_) {
-    size_t prevNumSeq = useBatch_ ? numSequences : 1;
-    if (prevOutput_->getHeight() == 0) {
-      prevOutput_->resize(prevNumSeq, getSize());
-      prevState_->resize(prevNumSeq, getSize());
-      prevOutput_->zeroMem();
-      prevState_->zeroMem();
-    } else {
-      CHECK_EQ(prevOutput_->getHeight(), prevNumSeq)
-          << "the number of sequences must be the same";
-    }
-    Matrix::resizeOrCreate(totalState_,
-                           prevState_->getHeight() + batchSize,
-                           getSize(),
-                           /*trans*/ false,
-                           useGpu_);
-    state_.value = Matrix::create(nullptr,
-                                  /* height= */ batchSize,
-                                  getSize(),
-                                  /* trans= */ false,
-                                  useGpu_);
-    state_.value->setData(totalState_->getData() +
-                          prevState_->getHeight() * getSize());
-  } else {
-    Matrix::resizeOrCreate(state_.value,
-                           /* height= */ batchSize,
-                           getSize(),
-                           /* trans= */ false,
-                           useGpu_);
-  }
-  Matrix::resizeOrCreate(preOutput_.value,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-
-  if (!useBatch_) {
-    forwardSequence(batchSize, numSequences, starts, input.value);
-  } else {
-    if (!useSeqParallel_) {
-      forwardBatch(batchSize, numSequences, starts, input.value);
-    } else {
-      const int *starts = input.sequenceStartPositions->getData(useGpu_);
-      forwardSeqParallel(batchSize, numSequences, starts, input.value);
-    }
-  }
-  /*  activation */ { forwardActivation(); }
-}
-
-void LstmLayer::backward(const UpdateCallback &callback) {
-  REGISTER_TIMER_INFO("LstmBwTimer", getName().c_str());
-  /*  Do derivation */ { backwardActivation(); }
-
-  const Argument &input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-
-  Matrix::resizeOrCreate(gate_.grad,
-                         /* height= */ batchSize,
-                         getSize() * 4,
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(state_.grad,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-  Matrix::resizeOrCreate(preOutput_.grad,
-                         /* height= */ batchSize,
-                         getSize(),
-                         /* trans= */ false,
-                         useGpu_);
-  state_.grad->zero();
-
-  const int *starts = input.sequenceStartPositions->getData(false);
-  if (!useBatch_) {
-    backwardSequence(batchSize, numSequences, starts, input.grad);
-  } else {
-    if (!useSeqParallel_) {
-      backwardBatch(batchSize, numSequences, starts, input.grad);
-    } else {
-      const int *starts = input.sequenceStartPositions->getData(useGpu_);
-      backwardSeqParallel(batchSize, numSequences, starts, input.grad);
-    }
-  }
-
-  if (bias_) {
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void LstmLayer::forwardSequence(int batchSize,
-                                size_t numSequences,
-                                const int *starts,
-                                MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str());
-  gate_.value->assign(*inputValue);
-  if (bias_) {
-    gate_.value->addBias(*localBias_, 1);
-  }
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.stateActiveValue = preOutput_.value->getData();
-  lstmValue.outputValue = output_.value->getData();
-  lstmValue.prevStateValue = nullptr;
-  if (reversed_) {
-    lstmValue.gateValue += (batchSize - 1) * getSize() * 4;
-    lstmValue.stateValue += (batchSize - 1) * getSize();
-    lstmValue.stateActiveValue += (batchSize - 1) * getSize();
-    lstmValue.outputValue += (batchSize - 1) * getSize();
-  }
-
-  auto nextFrame = [&lstmValue](bool reversed, int frameSize) {
-    lstmValue.prevStateValue = lstmValue.stateValue;
-    if (!reversed) {
-      lstmValue.gateValue += frameSize * 4;
-      lstmValue.stateValue += frameSize;
-      lstmValue.stateActiveValue += frameSize;
-      lstmValue.outputValue += frameSize;
-    } else {
-      lstmValue.gateValue -= frameSize * 4;
-      lstmValue.stateValue -= frameSize;
-      lstmValue.stateActiveValue -= frameSize;
-      lstmValue.outputValue -= frameSize;
-    }
-  };
-
-  MatrixPtr frameGate = Matrix::create(nullptr,
-                                       /* height= */ 1,
-                                       getSize() * 4,
-                                       /* trans= */ false,
-                                       useGpu_);
-  MatrixPtr frameOutput = Matrix::create(nullptr,
-                                         /* height= */ 1,
-                                         getSize(),
-                                         /* trans= */ false,
-                                         useGpu_);
-
-  if (!reversed_) {
-    if (prevState_) {
-      lstmValue.prevStateValue = prevState_->getData();
-    }
-    if (prevOutput_) {
-      frameGate->setData(lstmValue.gateValue);
-      frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1);
-    }
-  }
-  AsyncGpuBlock asyncGpuBlock;
-  for (size_t n = 0; n < numSequences; ++n) {
-    int length;
-    if (!reversed_) {
-      length = starts[n + 1] - starts[n];
-    } else {
-      length = starts[numSequences - n] - starts[numSequences - n - 1];
-    }
-    for (int l = 0; l < length; ++l) {
-      if (useGpu_) {
-        LstmCompute::forwardOneSequence<1>(lstmValue, getSize());
-      } else {
-        LstmCompute::forwardOneSequence<0>(lstmValue, getSize());
-      }
-
-      if (l != length - 1) {
-        frameOutput->setData(lstmValue.outputValue);
-        nextFrame(reversed_, getSize());
-        frameGate->setData(lstmValue.gateValue);
-        frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
-      }
-    }
-    if (n != numSequences - 1) {
-      frameOutput->setData(lstmValue.outputValue);
-      nextFrame(reversed_, getSize());
-      frameGate->setData(lstmValue.gateValue);
-      if (!reversed_) {
-        if (!prevState_) lstmValue.prevStateValue = nullptr;
-        if (prevOutput_) {
-          frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
-        }
-      } else {
-        lstmValue.prevStateValue = nullptr;
-      }
-    }
-  }
-
-  if (!reversed_) {
-    if (prevState_) {
-      prevState_->assign(*state_.value->subMatrix(batchSize - 1, 1));
-    }
-    if (prevOutput_) {
-      prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1));
-    }
-  }
-}
-
-void LstmLayer::backwardSequence(int batchSize,
-                                 size_t numSequences,
-                                 const int *starts,
-                                 MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str());
-  MatrixPtr weightT = weight_->getW()->getTranspose();
-
-  hl_lstm_value lstmValue;
-  hl_lstm_grad lstmGrad;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.stateActiveValue = preOutput_.value->getData();
-  lstmValue.outputValue = nullptr;
-
-  if (bias_->getWGrad()) {
-    lstmGrad.checkIgGrad = checkIgGrad_->getData();
-    lstmGrad.checkFgGrad = checkFgGrad_->getData();
-    lstmGrad.checkOgGrad = checkOgGrad_->getData();
-  } else {
-    lstmGrad.checkIgGrad = nullptr;
-    lstmGrad.checkFgGrad = nullptr;
-    lstmGrad.checkOgGrad = nullptr;
-  }
-  lstmGrad.gateGrad = gate_.grad->getData();
-  lstmGrad.stateGrad = state_.grad->getData();
-  lstmGrad.stateActiveGrad = nullptr;
-  lstmGrad.outputGrad = output_.grad->getData();
-
-  if (!reversed_) {
-    lstmValue.gateValue += (batchSize - 1) * getSize() * 4;
-    lstmGrad.gateGrad += (batchSize - 1) * getSize() * 4;
-    lstmValue.stateValue += (batchSize - 1) * getSize();
-    lstmGrad.stateGrad += (batchSize - 1) * getSize();
-    lstmValue.stateActiveValue += (batchSize - 1) * getSize();
-    lstmGrad.outputGrad += (batchSize - 1) * getSize();
-    lstmValue.prevStateValue = lstmValue.stateValue - getSize();
-    lstmGrad.prevStateGrad = lstmGrad.stateGrad - getSize();
-  } else {
-    lstmValue.prevStateValue = lstmValue.stateValue + getSize();
-    lstmGrad.prevStateGrad = lstmGrad.stateGrad + getSize();
-  }
-
-  auto nextFrame = [&lstmValue, &lstmGrad](bool reversed, int frameSize) {
-    if (reversed) {
-      lstmValue.gateValue += frameSize * 4;
-      lstmGrad.gateGrad += frameSize * 4;
-      lstmValue.stateValue += frameSize;
-      lstmGrad.stateGrad += frameSize;
-      lstmValue.stateActiveValue += frameSize;
-      lstmGrad.outputGrad += frameSize;
-      lstmValue.prevStateValue = lstmValue.stateValue + frameSize;
-      lstmGrad.prevStateGrad = lstmGrad.stateGrad + frameSize;
-    } else {
-      lstmValue.gateValue -= frameSize * 4;
-      lstmGrad.gateGrad -= frameSize * 4;
-      lstmValue.stateValue -= frameSize;
-      lstmGrad.stateGrad -= frameSize;
-      lstmValue.stateActiveValue -= frameSize;
-      lstmGrad.outputGrad -= frameSize;
-      lstmValue.prevStateValue = lstmValue.stateValue - frameSize;
-      lstmGrad.prevStateGrad = lstmGrad.stateGrad - frameSize;
-    }
-  };
-
-  MatrixPtr frameGate = Matrix::create(nullptr,
-                                       /* height= */ 1,
-                                       getSize() * 4,
-                                       /* trans= */ false,
-                                       useGpu_);
-  MatrixPtr frameOutput = Matrix::create(nullptr,
-                                         /* height= */ 1,
-                                         getSize(),
-                                         /* trans= */ false,
-                                         useGpu_);
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    for (size_t n = 0; n < numSequences; ++n) {
-      int length;
-      int start;
-      if (reversed_) {
-        length = starts[n + 1] - starts[n];
-        start = starts[n];
-      } else {
-        length = starts[numSequences - n] - starts[numSequences - n - 1];
-        start = starts[numSequences - n - 1];
-      }
-      for (int l = 0; l < length; ++l) {
-        if (l == length - 1) {
-          lstmValue.prevStateValue = nullptr;
-          lstmGrad.prevStateGrad = nullptr;
-        }
-        if (useGpu_) {
-          LstmCompute::backwardOneSequence<1>(lstmValue, lstmGrad, getSize());
-        } else {
-          LstmCompute::backwardOneSequence<0>(lstmValue, lstmGrad, getSize());
-        }
-        if (l != length - 1) {
-          frameGate->setData(lstmGrad.gateGrad);
-          nextFrame(reversed_, getSize());
-          frameOutput->setData(lstmGrad.outputGrad);
-          frameOutput->mul(*frameGate, *weightT, 1, 1);
-        } else {
-          nextFrame(reversed_, getSize());
-        }
-      }
-
-      if (weight_->getWGrad()) {
-        if (!reversed_) {
-          weight_->getWGrad()->mul(
-              *output_.value->subMatrix(start, length - 1)->getTranspose(),
-              *gate_.grad->subMatrix(start + 1, length - 1),
-              1,
-              1);
-        } else {
-          weight_->getWGrad()->mul(
-              *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
-              *gate_.grad->subMatrix(start, length - 1),
-              1,
-              1);
-        }
-      }
-    }
-  }
-
-  if (inputGrad) {
-    inputGrad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, 1);
-  }
-}
-
-void LstmLayer::forwardBatch(int batchSize,
-                             size_t numSequences,
-                             const int *starts,
-                             MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str());
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-
-  if (!batchValue_) {
-    batchValue_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchValue_->resizeOrCreateBatch(
-      batchSize, numSequences, starts, reversed_, prevOutput_ ? true : false);
-
-  batchValue_->resizeOrCreate(*output_.value);
-  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_) {
-    gate_.value->addBias(*localBias_, 1);
-  }
-
-  {
-    int numBatch = batchValue_->getNumBatch();
-    int batchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    if (prevState_) {
-      lstmValue.prevStateValue = totalState_->getData();
-    } else {
-      lstmValue.prevStateValue = nullptr;
-    }
-    for (int n = 0; n < numBatch; n++) {
-      MatrixPtr outputValue = batchValue_->getBatchValue(n);
-      MatrixPtr gateValue = batchValue_->getBatchValue(*gate_.value, n);
-      batchSize = outputValue->getHeight();
-
-      if (n != 0) {
-        MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize);
-        gateValue->mul(*batch1, *weight_->getW(), 1, 1);
-      } else if (prevOutput_) {
-        Matrix::resizeOrCreate(prevBatchOutput2_,
-                               gateValue->getHeight(),
-                               getSize(),
-                               false,
-                               useGpu_);
-        batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_);
-        gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1);
-
-        batchValue_->prevOutput2Batch(*prevState_,
-                                      *totalState_->subMatrix(0, numSequences));
-      }
-
-      lstmValue.gateValue = gateValue->getData();
-      lstmValue.outputValue = outputValue->getData();
-      lstmValue.stateValue =
-          batchValue_->getBatchValue(*state_.value, n)->getData();
-      lstmValue.stateActiveValue =
-          batchValue_->getBatchValue(*preOutput_.value, n)->getData();
-      {
-        if (useGpu_) {
-          LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize);
-        } else {
-          LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize);
-        }
-      }
-      lstmValue.prevStateValue = lstmValue.stateValue;
-    }
-  }
-  {
-    REGISTER_TIMER_INFO("batchToSeq", getName().c_str());
-    batchValue_->copyBackSeq(*output_.value);
-  }
-  if (prevOutput_) {
-    getPrevBatchOutput(numSequences);
-    getPrevBatchState(numSequences);
-  }
-}
-
-void LstmLayer::getPrevBatchOutput(size_t numSequences) {
-  prevOutput_->resize(numSequences, getSize());
-  batchValue_->getSeqOutputFromBatch(*prevOutput_,
-                                     *batchValue_->getBatchValue());
-}
-
-void LstmLayer::getPrevBatchState(size_t numSequences) {
-  prevState_->resize(numSequences, getSize());
-  batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value);
-}
-
-void LstmLayer::backwardBatch(int batchSize,
-                              size_t numSequences,
-                              const int *starts,
-                              MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str());
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-
-  hl_lstm_grad lstmGrad;
-  lstmGrad.stateActiveGrad = preOutput_.grad->getData();
-
-  if (bias_->getWGrad()) {
-    lstmGrad.checkIgGrad = checkIgGrad_->getData();
-    lstmGrad.checkFgGrad = checkFgGrad_->getData();
-    lstmGrad.checkOgGrad = checkOgGrad_->getData();
-  } else {
-    lstmGrad.checkIgGrad = nullptr;
-    lstmGrad.checkFgGrad = nullptr;
-    lstmGrad.checkOgGrad = nullptr;
-  }
-
-  if (!batchGrad_) {
-    batchGrad_.reset(new SequenceToBatch(useGpu_));
-  }
-  batchGrad_->shareIndexWith(*batchValue_);
-
-  {
-    REGISTER_TIMER_INFO("seqToBatch", getName().c_str());
-    batchGrad_->copyFromSeq(*output_.grad);
-  }
-
-  {
-    MatrixPtr weightT = weight_->getW()->getTranspose();
-    int numBatch = batchGrad_->getNumBatch();
-    int batchSize = 0;
-    AsyncGpuBlock asyncGpuBlock;
-    for (int n = (int)numBatch - 1; n >= 0; n--) {
-      MatrixPtr outputGrad = batchGrad_->getBatchValue(n);
-      MatrixPtr gateGrad = batchGrad_->getBatchValue(*gate_.grad, n);
-
-      lstmValue.gateValue =
-          batchGrad_->getBatchValue(*gate_.value, n)->getData();
-      lstmValue.stateValue =
-          batchGrad_->getBatchValue(*state_.value, n)->getData();
-      lstmValue.stateActiveValue =
-          batchGrad_->getBatchValue(*preOutput_.value, n)->getData();
-      lstmGrad.stateGrad =
-          batchGrad_->getBatchValue(*state_.grad, n)->getData();
-      lstmGrad.gateGrad = gateGrad->getData();
-      lstmGrad.outputGrad = outputGrad->getData();
-      {
-        batchSize = outputGrad->getHeight();
-        if (n != 0) {
-          lstmValue.prevStateValue =
-              batchGrad_->getBatchValue(*state_.value, n - 1)->getData();
-          lstmGrad.prevStateGrad =
-              batchGrad_->getBatchValue(*state_.grad, n - 1)->getData();
-        } else {
-          if (prevState_) {
-            lstmValue.prevStateValue = totalState_->getData();
-            lstmGrad.prevStateGrad = nullptr;
-          } else {
-            lstmValue.prevStateValue = nullptr;
-            lstmGrad.prevStateGrad = nullptr;
-          }
-        }
-        if (useGpu_) {
-          LstmCompute::backwardBatch<1>(
-              lstmValue, lstmGrad, getSize(), batchSize);
-        } else {
-          LstmCompute::backwardBatch<0>(
-              lstmValue, lstmGrad, getSize(), batchSize);
-        }
-      }
-
-      if (n != 0) {
-        MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize);
-        tmp->mul(*gateGrad, *weightT, 1, 1);
-      }
-
-      if (n != 0 && weight_->getWGrad()) {
-        /* backward weight */
-        MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize);
-        weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1);
-      } else if (prevOutput_ && weight_->getWGrad()) {
-        weight_->getWGrad()->mul(
-            *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1);
-      }
-    }
-  }
-
-  if (inputGrad) {
-    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, /* scale */ 1);
-  }
-}
-
-void LstmLayer::forwardSeqParallel(int batchSize,
-                                   size_t numSequences,
-                                   const int *starts,
-                                   MatrixPtr inputValue) {
-  REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str());
-  gate_.value->assign(*inputValue);
-  if (bias_) {
-    gate_.value->addBias(*localBias_, /* scale */ 1);
-  }
-
-  real *gateValue = gate_.value->getData();
-  real *stateValue = state_.value->getData();
-  real *outputValue = output_.value->getData();
-  real *preOutputValue = preOutput_.value->getData();
-  real *checkIg = checkIg_->getData();
-  real *checkFg = checkFg_->getData();
-  real *checkOg = checkOg_->getData();
-  real *weight = weight_->getW()->getData();
-  hl_lstm_parallel_forward(gateValue,
-                           stateValue,
-                           preOutputValue,
-                           outputValue,
-                           checkIg,
-                           checkFg,
-                           checkOg,
-                           weight,
-                           starts,
-                           getSize(),
-                           numSequences,
-                           reversed_,
-                           activeNode_,
-                           activeGate_,
-                           activeState_);
-}
-
-void LstmLayer::backwardSeqParallel(int batchSize,
-                                    size_t numSequences,
-                                    const int *starts,
-                                    MatrixPtr inputGrad) {
-  REGISTER_TIMER_INFO("LstmBwSeqParallelTime", getName().c_str());
-  real *gateValue = gate_.value->getData();
-  real *gateGrad = gate_.grad->getData();
-  real *stateValue = state_.value->getData();
-  real *stateGrad = state_.grad->getData();
-  real *preOutputValue = preOutput_.value->getData();
-  real *preOutputGrad = preOutput_.grad->getData();
-  real *checkIg = checkIg_->getData();
-  real *checkFg = checkFg_->getData();
-  real *checkOg = checkOg_->getData();
-  real *outputGrad = output_.grad->getData();
-  real *weight = weight_->getW()->getData();
-
-  real *checkIgGrad;
-  real *checkFgGrad;
-  real *checkOgGrad;
-  if (bias_->getWGrad()) {
-    checkIgGrad = checkIgGrad_->getData();
-    checkFgGrad = checkFgGrad_->getData();
-    checkOgGrad = checkOgGrad_->getData();
-  } else {
-    checkIgGrad = nullptr;
-    checkFgGrad = nullptr;
-    checkOgGrad = nullptr;
-  }
-
-  hl_lstm_parallel_backward_data(gateValue,
-                                 gateGrad,
-                                 stateValue,
-                                 stateGrad,
-                                 preOutputValue,
-                                 preOutputGrad,
-                                 outputGrad,
-                                 checkIg,
-                                 checkIgGrad,
-                                 checkFg,
-                                 checkFgGrad,
-                                 checkOg,
-                                 checkOgGrad,
-                                 weight,
-                                 starts,
-                                 getSize(),
-                                 numSequences,
-                                 reversed_,
-                                 activeNode_,
-                                 activeGate_,
-                                 activeState_);
-
-  if (inputGrad) {
-    inputGrad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, 1);
-  }
-
-  real *outputValue = output_.value->getData();
-  if (weight_->getWGrad()) {
-    real *weightGrad = weight_->getWGrad()->getData();
-    hl_lstm_parallel_backward_weight(weightGrad,
-                                     outputValue,
-                                     gateGrad,
-                                     starts,
-                                     getSize(),
-                                     batchSize,
-                                     numSequences,
-                                     reversed_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
deleted file mode 100644
index 4568b13ade5555e3cff703ceda1bbce3007c409d..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/LstmLayer.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "LstmCompute.h"
-#include "SequenceToBatch.h"
-#include "paddle/math/BaseMatrix.h"
-#include "paddle/math/Matrix.h"
-namespace paddle {
-
-/**
- * @brief LstmLayer takes 1 input layer with size * 4.
- * Input layer is diveded into 4 equal parts:
- *   (input_s, input_ig, input_fg, input_og)
- *
- * For each sequence [start, end] it performs the following computation:
- * @code
- * output_{i} = actState(state_{i}) * actGate(outputGate_{i})
- * state_{i} = actInput(input_s_{i} + bias_s +
- *             output_{i-1} * recurrIW) * actGate(inputGate_{i}) +
- *             actGate(forgetGate_{i}) * state_{i-1}
- * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW +
- *             state_{i-1} * inputCheck
- * ouputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW +
- *             state_{i} * outputCheck
- * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW +
- *              state_{i-1} * forgetCheck
- * @endcode
- *
- * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
- * - baisParameter consists of
- *   (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
- *
- * - actInput is defined by config active_type.
- * - actState is defined by config active_state_type.
- * - actGate is defined by config actvie_gate_type.
- *
- * There are two ways to compute, namely one sequence by one sequence or
- * one batch by one batch. By default and no setting pre_batch_state true,
- * it will compute batch by batch.
- *
- * The formula in the paper is as follows:
- * \f[
- * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
- * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
- * \tilde{c_t} = tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) \\
- * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
- * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
- * h_t = o_t tanh(c_t)
- * \f]
- *
- * @note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
- * operations on the input sequence were NOT included in LstmLayer. So
- * users should use fc_layer or mixed_layer before lstm_later.
- *
- * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
- * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
- */
-
-class LstmLayer : public Layer, public LstmCompute {
-public:
-  explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
-
-  bool init(const LayerMap &layerMap,
-            const ParameterMap &parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback &callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
-protected:
-  /**
-   * @brief Compute lstm forward one sequence by one sequence.
-   * @param batchSize The batchSize is not equal to the batch_size in
-   * the config file. It is the total words number of all samples
-   * in this forward batch.
-   * @param numSequences The sample number. It is equal to the batch_size
-   * in the config file.
-   * @param starts Each start position of each samples.
-   * @param inputValue The input values.
-   */
-  void forwardSequence(int batchSize,
-                       size_t numSequences,
-                       const int *starts,
-                       MatrixPtr inputValue);
-  /**
-   * Compute lstm backward one sequence by one sequence.
-   */
-  void backwardSequence(int batchSize,
-                        size_t numSequences,
-                        const int *starts,
-                        MatrixPtr inputGrad);
-
-  /**
-   * Compute lstm forward one batch by one batch. The batch value is
-   * reorganized by SequenceToBatch class. The batch output value will
-   * be convert into sequence value after finishing forward. Here, one
-   * batch contains one word of each sample. If the length of each sample
-   * is not equality, the batch will not pads zero and contains less words.
-   * The total batch numbers are the max length of the sequence. The details
-   * can refer to SequenceToBatch class. On GPU mode, it will launch GPU
-   * kernel for loop.
-   *
-   * @code
-   * for (int i = 0; i < numBatch(max_sequence_length); ++i) {
-   *   compute one batch.
-   * }
-   * @endcode
-   */
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int *starts,
-                    MatrixPtr inputValue);
-  /**
-   * Compute lstm backward one batch by one batch.
-   */
-  void backwardBatch(int batchSize,
-                     size_t numSequences,
-                     const int *starts,
-                     MatrixPtr inputGrad);
-
-  /**
-   * This function only supports GPU. It not need to reorganize input into
-   * batch value. It will launch one kernel to parallelly compute forward
-   * propagation in sequence level.
-   */
-  void forwardSeqParallel(int batchSize,
-                          size_t numSequences,
-                          const int *starts,
-                          MatrixPtr inputValue);
-  /**
-   * Backward propagation corresponding to forwardSeqParallel.
-   */
-  void backwardSeqParallel(int batchSize,
-                           size_t numSequences,
-                           const int *starts,
-                           MatrixPtr inputGrad);
-  /**
-   * This function is used for sequence generation and get output after
-   * forwardBatch.
-   */
-  void getPrevBatchOutput(size_t numSequences);
-  /**
-   * This function is used for sequence generation and get state after
-   * forwardBatch.
-   */
-  void getPrevBatchState(size_t numSequences);
-
-protected:
-  /// Learned parameters, shape: (size, 4*size).
-  /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
-  std::unique_ptr<Weight> weight_;
-  /// Learned bias parameter, shape: (1, 7 * size).
-  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf},
-  /// W_{co}\f$.
-  std::unique_ptr<Weight> bias_;
-  /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$.
-  MatrixPtr localBias_;
-  /// The peephole connection for input gate.
-  MatrixPtr checkIg_;
-  /// The peephole connection for forget gate.
-  MatrixPtr checkFg_;
-  /// The peephole connection for output gate.
-  MatrixPtr checkOg_;
-  /// The gradient of real bias
-  MatrixPtr localBiasGrad_;
-  /// The gradient of peephole connection for input gates.
-  MatrixPtr checkIgGrad_;
-  /// The gradient of peephole connection for forget gates.
-  MatrixPtr checkFgGrad_;
-  /// The gradient of peephole connection for output gates.
-  MatrixPtr checkOgGrad_;
-
-  /// Stores the cell state of previous time step, namely \f$c_{t-1}\f$.
-  Argument state_;
-  /// Stores the hidden of previous time step, namely \f$h_{t-1}\f$.
-  Argument preOutput_;
-  /// Stores the value and gradient of four gates, namely
-  /// \f$i_t, f_t, o_t, c_t\f$.
-  Argument gate_;
-  /// Whether it is reversed lstm.
-  bool reversed_;
-  /// Whether to use batch method to compute.
-  bool useBatch_;
-  /// Whether to use sequence parallell method to compute.
-  bool useSeqParallel_;
-  /// batchValue_ is used in method of batch calculation. It stores the
-  /// batch value after reorganized input.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// The gradient of batchValue_.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-
-  /// Used in generation and stores the state of previous time step.
-  MatrixPtr prevState_;
-  /// Used in generation and stores the output of previous time step.
-  MatrixPtr prevOutput_;
-  MatrixPtr prevBatchOutput2_;
-  /// The total state.
-  MatrixPtr totalState_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp
deleted file mode 100644
index 8faaa1c4e138fe1ec04b1911449d05528bb5b8b5..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/LstmStepLayer.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "LstmCompute.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/*
- * LstmStepLayer used in recurrent layer group.
- */
-class LstmStepLayer : public Layer, public LstmCompute {
-protected:
-  Argument state_;
-  Argument gate_;
-  Argument stateActive_;
-  MatrixPtr checkIg_, checkFg_, checkOg_;
-  MatrixPtr checkIgGrad_, checkFgGrad_, checkOgGrad_;
-  std::unique_ptr<Weight> weight_;
-
-public:
-  explicit LstmStepLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~LstmStepLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(lstm_step, LstmStepLayer);
-
-bool LstmStepLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(2U, inputLayers_.size());
-
-  checkIg_ = Matrix::create(nullptr,
-                            /* height= */ 1,
-                            getSize(),
-                            /* trans= */ false,
-                            useGpu_);
-  checkFg_ = Matrix::create(nullptr,
-                            /* height= */ 1,
-                            getSize(),
-                            /* trans= */ false,
-                            useGpu_);
-  checkOg_ = Matrix::create(nullptr,
-                            /* height= */ 1,
-                            getSize(),
-                            /* trans= */ false,
-                            useGpu_);
-  checkIgGrad_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-  checkFgGrad_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-  checkOgGrad_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                getSize(),
-                                /* trans= */ false,
-                                useGpu_);
-
-  if (biasParameter_.get() != NULL) {
-    CHECK_EQ(getSize() * 3, biasParameter_->getSize());
-    weight_.reset(new Weight(1, getSize() * 3, biasParameter_));
-    if (weight_->getW()) {
-      real* data = weight_->getW()->getData();
-      checkIg_->setData(data);
-      checkFg_->setData(data + getSize());
-      checkOg_->setData(data + getSize() * 2);
-    }
-
-    if (weight_->getWGrad()) {
-      real* data = weight_->getWGrad()->getData();
-      checkIgGrad_->setData(data);
-      checkFgGrad_->setData(data + getSize());
-      checkOgGrad_->setData(data + getSize() * 2);
-    }
-  }
-
-  setOutput("state", &state_);
-  LstmCompute::init(config_);
-  return true;
-}
-
-void LstmStepLayer::forward(PassType passType) {
-  REGISTER_TIMER_INFO("LstmRecurrentFwTime", getName().c_str());
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  const Argument& prevState = getInput(1);
-  CHECK_EQ(getSize() * 4, input.value->getWidth());
-  CHECK_EQ(getSize(), prevState.value->getWidth());
-  int batchSize = input.getBatchSize();
-  reserveOutput(batchSize, getSize());
-  resetSpecifyOutput(state_,
-                     batchSize,
-                     getSize(),
-                     /*  isValueClean */ false,
-                     /* isGradClean */ true);
-  resetSpecifyOutput(gate_,
-                     batchSize,
-                     getSize() * 4,
-                     /* isValueClean */ false,
-                     /* isGradClean */ false);
-  resetSpecifyOutput(stateActive_,
-                     batchSize,
-                     getSize(),
-                     /*  isValueClean */ false,
-                     /* isGradClean */ false);
-  gate_.value->assign(*input.value);
-
-  hl_lstm_value lstmValue;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.prevStateValue = prevState.value->getData();
-  lstmValue.stateActiveValue = stateActive_.value->getData();
-  lstmValue.outputValue = output_.value->getData();
-
-  if (useGpu_) {
-    LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize);
-  } else {
-    LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize);
-  }
-}
-
-void LstmStepLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("LstmRecurrentBwTime", getName().c_str());
-  const Argument& input = getInput(0);
-  const Argument& prevState = getInput(1);
-  int batchSize = input.getBatchSize();
-
-  hl_lstm_value lstmValue;
-  hl_lstm_grad lstmGrad;
-  lstmValue.checkIg = checkIg_->getData();
-  lstmValue.checkFg = checkFg_->getData();
-  lstmValue.checkOg = checkOg_->getData();
-  lstmValue.gateValue = gate_.value->getData();
-  lstmValue.prevStateValue = prevState.value->getData();
-  lstmValue.stateValue = state_.value->getData();
-  lstmValue.stateActiveValue = stateActive_.value->getData();
-
-  lstmGrad.gateGrad = gate_.grad->getData();
-  if (prevState.grad) {
-    lstmGrad.prevStateGrad = prevState.grad->getData();
-  } else {
-    lstmGrad.prevStateGrad = nullptr;
-  }
-  lstmGrad.stateGrad = state_.grad->getData();
-  lstmGrad.stateActiveGrad = stateActive_.grad->getData();
-  lstmGrad.outputGrad = output_.grad->getData();
-  lstmGrad.checkIgGrad = checkIgGrad_->getData();
-  lstmGrad.checkFgGrad = checkFgGrad_->getData();
-  lstmGrad.checkOgGrad = checkOgGrad_->getData();
-
-  if (useGpu_) {
-    LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), batchSize);
-  } else {
-    LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), batchSize);
-  }
-
-  if (input.grad) {
-    input.grad->add(*gate_.grad);
-  }
-
-  if (weight_) {
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
deleted file mode 100644
index 7cfdb3ff25096ad06c09434cdee48b5f85d650af..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ /dev/null
@@ -1,769 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LstmLayer.h"
-#include "paddle/math/BaseMatrix.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-class CoordIterator {
-public:
-  std::vector<int> dims_;
-  std::vector<bool> directions_;
-  std::vector<int> curPos_;
-  bool end_;
-
-  void step(size_t d, bool reversed) {
-    if (directions_[d] ^ reversed) {
-      if (curPos_[d] == dims_[d] - 1) {
-        curPos_[d] = 0;
-        if (d) {
-          step(d - 1, reversed);
-        } else {
-          end_ = true;
-        }
-      } else {
-        curPos_[d]++;
-      }
-    } else {
-      if (curPos_[d] == 0) {
-        curPos_[d] = dims_[d] - 1;
-        if (d) {
-          step(d - 1, reversed);
-        } else {
-          end_ = true;
-        }
-      } else {
-        curPos_[d]--;
-      }
-    }
-  }
-
-public:
-  CoordIterator(std::vector<int> dim, std::vector<bool> directions)
-      : dims_(dim), directions_(directions), end_(false) {
-    CHECK_EQ(dims_.size(), directions_.size());
-    for (size_t i = 0; i < dims_.size(); i++) {
-      curPos_.push_back(-1);
-    }
-  }
-  CoordIterator& operator++() {
-    step(dims_.size() - 1, false);
-    return *this;
-  }
-
-  CoordIterator& operator--() {
-    step(dims_.size() - 1, true);
-    return *this;
-  }
-
-  std::vector<int>& curPos() { return curPos_; }
-
-  int offset() {
-    int offset = curPos_[0];
-    for (size_t i = 1; i < dims_.size(); i++) {
-      offset = offset * dims_[i] + curPos_[i];
-    }
-    return offset;
-  }
-
-  int offset(const std::vector<int>& pos) {
-    int offset = pos[0];
-    for (size_t i = 1; i < dims_.size(); i++) {
-      offset = offset * dims_[i] + pos[i];
-    }
-    return offset;
-  }
-
-  std::vector<int>& begin() {
-    for (size_t i = 0; i < dims_.size(); i++) {
-      curPos_[i] = directions_[i] ? 0 : dims_[i] - 1;
-    }
-    end_ = false;
-    return curPos_;
-  }
-
-  std::vector<int>& rbegin() {
-    for (size_t i = 0; i < dims_.size(); i++) {
-      curPos_[i] = directions_[i] ? dims_[i] - 1 : 0;
-    }
-    end_ = false;
-    return curPos_;
-  }
-
-  bool end() { return end_; }
-
-  bool getPrePos(const std::vector<int>& delays,
-                 int idx,
-                 std::vector<int>& prePos) {
-    bool isAvial = true;
-    prePos.clear();
-    prePos.reserve(directions_.size());
-    for (size_t i = 0; i < directions_.size(); i++) {
-      if (int(i) == idx) {
-        prePos.push_back(curPos_[i] + delays[i] * (directions_[i] ? 1 : -1));
-        if (prePos[i] < 0) {
-          prePos[i] = 0;
-          isAvial = false;
-        }
-        if (prePos[i] >= dims_[i]) {
-          prePos[i] = dims_[i] - 1;
-          isAvial = false;
-        }
-      } else {
-        prePos.push_back(curPos_[i]);
-      }
-    }
-    return isAvial;
-  }
-
-  bool getNextPos(const std::vector<int>& delays,
-                  int idx,
-                  std::vector<int>& nextPos) {
-    bool isAvial = true;
-    nextPos.clear();
-    nextPos.reserve(directions_.size());
-    for (size_t i = 0; i < directions_.size(); i++) {
-      if (int(i) == idx) {
-        nextPos.push_back(curPos_[i] - delays[i] * (directions_[i] ? 1 : -1));
-        if (nextPos[i] < 0) {
-          nextPos[i] = 0;
-          isAvial = false;
-        }
-        if (nextPos[i] >= dims_[i]) {
-          nextPos[i] = dims_[i] - 1;
-          isAvial = false;
-        }
-      } else {
-        nextPos.push_back(curPos_[i]);
-      }
-    }
-    return isAvial;
-  }
-};
-/*
- * MDLstmLayer takes 1 input layer with size * (3+numDims).
- * For each sequence [start, end] it performs the following computation:
- * out_i = actState(state_i) * actGate(outputGate_i)
- *
- * For example the image with 2 dims, we take the scanning order from left-top
- * to right-bottom, then the 2 previous states of the current pixels are the
- * ones located at left and top. And each of them has a independent forget gate.
- *
- * state_i = actInput(input_i) * actGate(inputGate_i) +
- *           \sum{j}(actGate(forgetGate_i_j) * state_prev_i_j)
- *
- * inputGate = input_i * inputW + \sum{j}(output_prev_i_j * recurrInputW_j) +
- *             \sum{j}(state_prev_i_j * inputCheck_j)
- *
- * ouputGate = input_i * outputW + \sum{j}(output_prev_i_j * recurrOutputW_j) +
- *             state_i * outputCheck
- *
- * forgetGate_j = input_i * forgetW_j + \sum{j}(output_prev_i_j *
- *                recurrForgetW_j) + \sum{j}(state_prev_i_j * forgetCheck_j)
- *
- * IG Layer: (Input, InputGate, ForgetGates, OutputGate) * OutputSize
- * */
-
-class MDLstmLayer : public LstmLayer {
-public:
-  explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-protected:
-  void forwardOneSequence(int start, CoordIterator& coordIter);
-  void backwardOneSequence(int start, CoordIterator& coordIter);
-  void forwardGate2OutputSequence(int start, CoordIterator& coordIter);
-  void backwardGate2OutputSequence(int start, CoordIterator& coordIter);
-
-protected:
-  std::vector<Argument> frameInputGate_;
-  std::vector<Argument> frameForgetGate_;
-  std::vector<Argument> frameOutputGate_;
-  std::vector<Argument> frameInputNode_;
-  std::vector<Argument> frameGate_;
-  std::vector<Argument> frameState_;
-  std::vector<Argument> framePreOutput_;
-  std::vector<Argument> frameOutput_;
-
-  // Activation
-  std::unique_ptr<ActivationFunction> activationGate_;
-  std::unique_ptr<ActivationFunction> activationState_;
-
-  int numDims_;
-  size_t numBlocks_;
-  std::vector<bool> directions_;
-  std::vector<int> delays_;
-  std::vector<std::vector<int>> dimsV_;
-};
-
-REGISTER_LAYER(mdlstmemory, MDLstmLayer);
-
-bool MDLstmLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-  CHECK_EQ(1U, parameters_.size());
-
-  numBlocks_ = getSize();
-  numDims_ = config_.directions_size();
-  CHECK_EQ(numBlocks_ * numBlocks_ * (3 + numDims_), parameters_[0]->getSize());
-
-  // inode(1), ig(1), fg(numDims_), og(1), peepIg(1), peepFg(numDims_),
-  // peepOg(1), then size of localBias_ is 3+numDims_
-  CHECK_EQ(numBlocks_ * (5 + 2 * numDims_), biasParameter_->getSize());
-  weight_.reset(
-      new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_));
-    localBias_ = Matrix::create(nullptr,
-                                /* height= */ 1,
-                                numBlocks_ * (3 + numDims_),
-                                /* trans= */ false,
-                                useGpu_);
-    checkIg_ = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    checkFg_ = Matrix::create(nullptr,
-                              /* height= */ numDims_,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    checkOg_ = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    localBiasGrad_ = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    numBlocks_ * (3 + numDims_),
-                                    /* trans= */ false,
-                                    useGpu_);
-    checkIgGrad_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  numBlocks_,
-                                  /* trans= */ false,
-                                  useGpu_);
-    checkFgGrad_ = Matrix::create(nullptr,
-                                  /* height= */ numDims_,
-                                  numBlocks_,
-                                  /* trans= */ false,
-                                  useGpu_);
-    checkOgGrad_ = Matrix::create(nullptr,
-                                  /* height= */ 1,
-                                  numBlocks_,
-                                  /* trans= */ false,
-                                  useGpu_);
-
-    localBias_->setData(bias_->getW()->getData());
-    checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_));
-    checkFg_->setData(bias_->getW()->getData() + numBlocks_ * (4 + numDims_));
-    checkOg_->setData(bias_->getW()->getData() +
-                      numBlocks_ * (4 + 2 * numDims_));
-
-    if (bias_->getWGrad()) {
-      localBiasGrad_->setData(bias_->getWGrad()->getData());
-      checkIgGrad_->setData(bias_->getWGrad()->getData() +
-                            numBlocks_ * (3 + numDims_));
-      checkFgGrad_->setData(bias_->getWGrad()->getData() +
-                            numBlocks_ * (4 + numDims_));
-      checkOgGrad_->setData(bias_->getWGrad()->getData() +
-                            numBlocks_ * (4 + 2 * numDims_));
-    }
-  } else {
-    LOG(FATAL) << "Bias should be here.";
-  }
-  for (int i = 0; i < numDims_; i++) {
-    directions_.push_back(config_.directions(i));
-  }
-  for (int i = 0; i < numDims_; i++) {
-    delays_.push_back(-1);
-  }
-  activationGate_.reset(ActivationFunction::create(config_.active_gate_type()));
-  activationState_.reset(
-      ActivationFunction::create(config_.active_state_type()));
-
-  return true;
-}
-
-void MDLstmLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  int numSequences = input.getNumSequences();
-  resetOutput(batchSize, numBlocks_);
-  CHECK_EQ(numBlocks_ * (3 + numDims_), input.value->getWidth());
-  const int* starts = input.sequenceStartPositions->getData(false);
-  CHECK_EQ(starts[numSequences], batchSize);
-
-  int* dimsData = input.cpuSequenceDims->getData();
-  CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_* numSequences);
-
-  for (int i = 0; i < numSequences; i++) {
-    std::vector<int> dims;
-    for (int j = 0; j < numDims_; j++) {
-      dims.push_back(dimsData[i * numDims_ + j]);
-    }
-    dimsV_.push_back(dims);
-  }
-
-  frameInputGate_.reserve(batchSize);
-  frameForgetGate_.reserve(batchSize);
-  frameOutputGate_.reserve(batchSize);
-  frameInputNode_.reserve(batchSize);
-  frameGate_.reserve(batchSize);
-  frameState_.reserve(batchSize);
-  framePreOutput_.reserve(batchSize);
-  frameOutput_.reserve(batchSize);
-
-  Matrix::resizeOrCreate(gate_.value,
-                         /* height= */ batchSize,
-                         numBlocks_ * (3 + numDims_),
-                         /* trans= */ false,
-                         useGpu_);
-
-  for (int i = frameGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_ * (3 + numDims_),
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_ * (3 + numDims_),
-                              /* trans= */ false,
-                              useGpu_);
-    frameGate_.push_back(arg);
-  }
-  for (int i = frameInputGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameInputGate_.push_back(arg);
-  }
-  for (int i = frameForgetGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ numDims_,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ numDims_,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameForgetGate_.push_back(arg);
-  }
-  for (int i = frameOutputGate_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameOutputGate_.push_back(arg);
-  }
-  for (int i = frameInputNode_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameInputNode_.push_back(arg);
-  }
-  for (int i = frameState_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(
-        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-    frameState_.push_back(arg);
-  }
-  for (int i = framePreOutput_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(
-        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-    framePreOutput_.push_back(arg);
-  }
-  for (int i = frameOutput_.size(); i < batchSize; i++) {
-    Argument arg;
-    arg.value = Matrix::create(nullptr,
-                               /* height= */ 1,
-                               numBlocks_,
-                               /* trans= */ false,
-                               useGpu_);
-    arg.grad = Matrix::create(nullptr,
-                              /* height= */ 1,
-                              numBlocks_,
-                              /* trans= */ false,
-                              useGpu_);
-    frameOutput_.push_back(arg);
-  }
-
-  for (int i = 0; i < batchSize; i++) {
-    frameOutput_[i].value->setData(output_.value->getData() + i * numBlocks_);
-    frameGate_[i].value->setData(gate_.value->getData() +
-                                 i * numBlocks_ * (3 + numDims_));
-    frameInputNode_[i].value->setData(gate_.value->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * 0);
-    frameInputGate_[i].value->setData(gate_.value->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * 1);
-    frameForgetGate_[i].value->setData(gate_.value->getData() +
-                                       i * numBlocks_ * (3 + numDims_) +
-                                       numBlocks_ * 2);
-    frameOutputGate_[i].value->setData(gate_.value->getData() +
-                                       i * numBlocks_ * (3 + numDims_) +
-                                       numBlocks_ * (2 + numDims_));
-  }
-
-  AsyncGpuBlock asyncGpuBlock;
-  gate_.value->assign(*input.value);
-
-  if (bias_) {
-    gate_.value->addBias(*localBias_, 1);
-  }
-
-  for (int i = 0; i < numSequences; i++) {
-    CoordIterator coordIter(dimsV_[i], directions_);
-    forwardOneSequence(starts[i], coordIter);
-  }
-}
-
-void MDLstmLayer::forwardGate2OutputSequence(int start,
-                                             CoordIterator& coordIter) {
-  int idxCurr = start + coordIter.offset();
-  std::vector<int> preOffsetV;
-  preOffsetV.reserve(numDims_);
-  for (int i = 0; i < numDims_; i++) {
-    std::vector<int> prePos;
-    if (coordIter.getPrePos(delays_, i, prePos)) {
-      preOffsetV[i] = coordIter.offset(prePos);
-    } else {
-      preOffsetV[i] = -1;
-    }
-  }
-
-  for (int i = 0; i < numDims_; i++) {
-    if (preOffsetV[i] >= 0) {
-      frameInputGate_[idxCurr].value->addDotMul(
-          *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0);
-
-      MatrixPtr fgGateOneDim = Matrix::create(
-          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      MatrixPtr checkFgOneDim =
-          Matrix::create(checkFg_->getData() + i * numBlocks_,
-                         1.0,
-                         numBlocks_,
-                         false,
-                         useGpu_);
-      fgGateOneDim->addDotMul(
-          *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0);
-    }
-  }
-  auto status = activationGate_->forward(frameInputGate_[idxCurr]);
-  status.check();
-  status = activationGate_->forward(frameForgetGate_[idxCurr]);
-  status.check();
-  status = activation_->forward(frameInputNode_[idxCurr]);
-  status.check();
-
-  frameState_[idxCurr].value->zeroMem();
-  for (int i = 0; i < numDims_; i++) {
-    if (preOffsetV[i] >= 0) {
-      MatrixPtr fgGateOneDim = Matrix::create(
-          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      frameState_[idxCurr].value->addDotMul(
-          *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0);
-    }
-  }
-  frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value,
-                                        *frameInputGate_[idxCurr].value,
-                                        1.0,
-                                        1.0);
-
-  frameOutputGate_[idxCurr].value->addDotMul(
-      *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0);
-  status = activationGate_->forward(frameOutputGate_[idxCurr]);
-  status.check();
-
-  framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value));
-  status = activationState_->forward(framePreOutput_[idxCurr]);
-  status.check();
-
-  frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value,
-                                      *frameOutputGate_[idxCurr].value);
-}
-
-void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) {
-  for (coordIter.begin(); !coordIter.end(); ++coordIter) {
-    int offset = coordIter.offset();
-    for (int i = 0; i < numDims_; i++) {
-      std::vector<int> prePos;
-      if (coordIter.getPrePos(delays_, i, prePos)) {
-        int preOffset = coordIter.offset(prePos);
-        frameGate_[start + offset].value->mul(
-            *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0);
-      }
-    }
-    forwardGate2OutputSequence(start, coordIter);
-  }
-}
-
-void MDLstmLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  size_t numSequences = input.getNumSequences();
-
-  Matrix::resizeOrCreate(gate_.grad,
-                         /* height= */ batchSize,
-                         numBlocks_ * (3 + numDims_),
-                         /* trans= */ false,
-                         useGpu_);
-
-  for (int i = 0; i < batchSize; i++) {
-    if (frameState_[i].grad == NULL)
-      frameState_[i].grad = Matrix::create(
-          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-  }
-  for (int i = 0; i < batchSize; i++) {
-    if (framePreOutput_[i].grad == NULL)
-      framePreOutput_[i].grad = Matrix::create(
-          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
-  }
-
-  for (int i = 0; i < batchSize; i++) {
-    frameOutput_[i].grad->setData(output_.grad->getData() + i * numBlocks_);
-    frameGate_[i].grad->setData(gate_.grad->getData() +
-                                i * numBlocks_ * (3 + numDims_));
-    frameInputNode_[i].grad->setData(gate_.grad->getData() +
-                                     i * numBlocks_ * (3 + numDims_) +
-                                     numBlocks_ * 0);
-    frameInputGate_[i].grad->setData(gate_.grad->getData() +
-                                     i * numBlocks_ * (3 + numDims_) +
-                                     numBlocks_ * 1);
-    frameForgetGate_[i].grad->setData(gate_.grad->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * 2);
-    frameOutputGate_[i].grad->setData(gate_.grad->getData() +
-                                      i * numBlocks_ * (3 + numDims_) +
-                                      numBlocks_ * (2 + numDims_));
-  }
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-
-    for (size_t i = 0; i < numSequences; i++) {
-      CoordIterator coordIter(dimsV_[i], directions_);
-      backwardOneSequence(starts[i], coordIter);
-    }
-  }
-
-  if (input.grad) {
-    input.grad->add(*gate_.grad);
-  }
-  if (bias_ && bias_->getWGrad()) {
-    localBiasGrad_->collectBias(*gate_.grad, 1);
-    bias_->getParameterPtr()->incUpdate(callback);
-  }
-
-  weight_->getParameterPtr()->incUpdate(callback);
-}
-
-void MDLstmLayer::backwardGate2OutputSequence(int start,
-                                              CoordIterator& coordIter) {
-  int idxCurr = start + coordIter.offset();
-  std::vector<int> preOffsetV;
-  std::vector<int> nextOffsetV;
-  preOffsetV.reserve(numDims_);
-  nextOffsetV.reserve(numDims_);
-  for (int i = 0; i < numDims_; i++) {
-    std::vector<int> prePos;
-    if (coordIter.getPrePos(delays_, i, prePos)) {
-      preOffsetV[i] = coordIter.offset(prePos);
-    } else {
-      preOffsetV[i] = -1;
-    }
-    std::vector<int> nextPos;
-    if (coordIter.getNextPos(delays_, i, nextPos)) {
-      nextOffsetV[i] = coordIter.offset(nextPos);
-    } else {
-      nextOffsetV[i] = -1;
-    }
-  }
-
-  framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
-                                        *frameOutputGate_[idxCurr].value);
-  activationState_->backward(framePreOutput_[idxCurr]).check();
-  frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad));
-
-  frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
-                                         *framePreOutput_[idxCurr].value);
-  activationGate_->backward(frameOutputGate_[idxCurr]).check();
-
-  frameState_[idxCurr].grad->addDotMul(
-      *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0);
-  for (int i = 0; i < numDims_; i++) {
-    if (nextOffsetV[i] >= 0) {
-      frameState_[idxCurr].grad->addDotMul(
-          *frameInputGate_[start + nextOffsetV[i]].grad, *checkIg_, 1.0, 1.0);
-
-      MatrixPtr fgGateOneDimGrad = Matrix::create(
-          frameForgetGate_[start + nextOffsetV[i]].grad->getData() +
-              i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      MatrixPtr fgGateOneDimVal = Matrix::create(
-          frameForgetGate_[start + nextOffsetV[i]].value->getData() +
-              i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      MatrixPtr checkFgOneDim = Matrix::create(
-          checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_);
-
-      frameState_[idxCurr].grad->addDotMul(
-          *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0);
-      frameState_[idxCurr].grad->addDotMul(
-          *frameState_[start + nextOffsetV[i]].grad,
-          *fgGateOneDimVal,
-          1.0,
-          1.0);
-    }
-  }
-
-  frameInputNode_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
-                                        *frameInputGate_[idxCurr].value);
-  frameInputGate_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
-                                        *frameInputNode_[idxCurr].value);
-
-  frameForgetGate_[idxCurr].grad->zeroMem();
-  for (int i = 0; i < numDims_; i++) {
-    if (preOffsetV[i] >= 0) {
-      MatrixPtr fgGateOneDimGrad = Matrix::create(
-          frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
-          1,
-          numBlocks_,
-          false,
-          useGpu_);
-      fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad,
-                                  *frameState_[start + preOffsetV[i]].value,
-                                  1.0,
-                                  1.0);
-    }
-  }
-
-  activationGate_->backward(frameInputGate_[idxCurr]).check();
-  activationGate_->backward(frameForgetGate_[idxCurr]).check();
-  activation_->backward(frameInputNode_[idxCurr]).check();
-
-  if (bias_->getWGrad()) {
-    for (int i = 0; i < numDims_; i++) {
-      if (preOffsetV[i] >= 0) {
-        checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad,
-                                *frameState_[start + preOffsetV[i]].value,
-                                1.0,
-                                1.0);
-
-        MatrixPtr fgGateOneDimGrad = Matrix::create(
-            frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
-            1,
-            numBlocks_,
-            false,
-            useGpu_);
-        MatrixPtr checkFgOneDimGrad =
-            Matrix::create(checkFgGrad_->getData() + i * numBlocks_,
-                           1,
-                           numBlocks_,
-                           false,
-                           useGpu_);
-        checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad,
-                                     *frameState_[start + preOffsetV[i]].value,
-                                     1.0,
-                                     1.0);
-      }
-    }
-    checkOgGrad_->addDotMul(
-        *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0);
-  }
-}
-
-void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) {
-  MatrixPtr weightT = weight_->getW()->getTranspose();
-  for (coordIter.rbegin(); !coordIter.end(); --coordIter) {
-    int offset = coordIter.offset();
-    backwardGate2OutputSequence(start, coordIter);
-    for (int i = 0; i < numDims_; i++) {
-      std::vector<int> prePos;
-      if (coordIter.getPrePos(delays_, i, prePos)) {
-        int preOffset = coordIter.offset(prePos);
-        frameOutput_[start + preOffset].grad->mul(
-            *frameGate_[start + offset].grad, *weightT, 1.0, 1.0);
-        if (weight_->getWGrad()) {
-          weight_->getWGrad()->mul(
-              *frameOutput_[start + preOffset].value->getTranspose(),
-              *frameGate_[start + offset].grad,
-              1.0,
-              1.0);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
deleted file mode 100644
index e40e2f2251a1b739958773b8e6dc95a70ed58c76..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-
-/**
- * @brief A subclass of MKLDNNLayer Addto layer.
- *
- * The config file api is mkldnn_addto
- */
-class MKLDNNAddtoLayer : public MKLDNNLayer {
-protected:
-  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
-  size_t layerSize_;
-
-  std::unique_ptr<Weight> biases_;
-
-  // buffers for adding bias
-  std::vector<MKLDNNMatrixPtr> vals_;
-  std::vector<MKLDNNMatrixPtr> grads_;
-  // primitives for adding bias
-  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
-  std::shared_ptr<mkldnn::primitive> bwdBias_;
-
-public:
-  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
-
-  ~MKLDNNAddtoLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-protected:
-  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
-                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
-                  std::vector<MKLDNNMatrixPtr>& inputs,
-                  MKLDNNMatrixPtr bias,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
-                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-
-  void prepareBias(MKLDNNMatrixPtr& bias,
-                   const MatrixPtr& biasMat,
-                   const MKLDNNMatrixPtr& out,
-                   std::vector<MKLDNNMatrixPtr>& outs);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h
deleted file mode 100644
index d84e2859407711c13c475a19e140e2f5f51e61c2..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNBase.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "mkldnn.hpp"
-
-namespace paddle {
-
-typedef enum {
-  MKLDNN_BASE = 1,   // basical info of MKLDNN
-  MKLDNN_TESTS = 1,  // gtest info of MKLDNN
-  MKLDNN_FMTS = 2,   // format info of MKLDNN
-  MKLDNN_SIZES = 3,  // size info of MKLDNN
-  MKLDNN_ALL = 4,    // show all info of MKLDNN
-} MKLDNN_LOG_LEVEL;
-
-/**
- * @brief MKLDNN CPU engine.
- *
- */
-class CPUEngine {
-public:
-  static CPUEngine& Instance() {
-    // Thread-safe in C++11.
-    static CPUEngine myInstance;
-    return myInstance;
-  }
-
-  // Disallow copy or move
-  CPUEngine(const CPUEngine&) = delete;             // Copy constructor
-  CPUEngine(CPUEngine&&) = delete;                  // Move constructor
-  CPUEngine& operator=(const CPUEngine&) = delete;  // Copy assignment
-  CPUEngine& operator=(CPUEngine&&) = delete;       // Move assignment
-
-  mkldnn::engine& getEngine() { return cpuEngine_; }
-
-protected:
-  CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {}
-  //    CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {}
-  ~CPUEngine() {}
-
-private:
-  mkldnn::engine cpuEngine_;
-};
-
-/**
- * @brief MKLDNN Stream.
- *
- */
-class MKLDNNStream {
-public:
-  MKLDNNStream() : ready_(false) { resetState(); }
-
-  virtual ~MKLDNNStream() {}
-
-  /**
-   * @brief Submit stream
-   * @param prims The primitives vector
-   * @param block Waiting for the stream to complete
-   */
-  void submit(std::vector<mkldnn::primitive>& prims, bool block = true) {
-    resetState();
-    stream_->submit(prims).wait(block);
-    ready_ = false;
-  }
-
-  /**
-   * @brief Reset the mkldnn stream
-   */
-  void resetState() {
-    if (ready_) {
-      return;
-    }
-    // TODO(TJ): change me when mkldnn have method to reset this state
-    // stream_.reset(new mkldnn::stream(mkldnn::stream::kind::lazy));
-    stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-    ready_ = true;
-  }
-
-private:
-  bool ready_;
-  std::shared_ptr<mkldnn::stream> stream_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
deleted file mode 100644
index 93e182206a1ab1f06087cb808bb266ddea1468c9..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::batch_normalization_forward bn_fwd;
-typedef mkldnn::batch_normalization_backward bn_bwd;
-
-/**
- * @brief A subclass of MKLDNNLayer BatchNorm layer.
- *
- * The config file api is mkldnn_batch_norm
- */
-class MKLDNNBatchNormLayer : public MKLDNNLayer {
-protected:
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
-
-  // Epsilon value used in the batch normalization formula.
-  real epsilon_;
-
-  // weight and bias in paddle
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> biases_;
-  // mkldnn use a large buffer store both scale and shift
-  // which are weight and bias in paddle corresponding.
-  MatrixPtr valueScaleShift_;
-  MatrixPtr gradScaleShift_;
-  // Moving average of mean.
-  std::unique_ptr<Weight> movingMean_;
-  // Moving average of variance.
-  std::unique_ptr<Weight> movingVar_;
-
-  // if useGlobalStats_ is true, will use the loaded mean and variance.
-  // otherwise, calculate mean and variance in every mini-batch.
-  bool useGlobalStats_;
-  // used in MKLDNN primitive desc
-  unsigned flags_;
-  // use to compute moving mean and variance.
-  real movingAvgFraction_;
-  // whether the weight has been init
-  bool hasInitedWgt_;
-
-  // local mean and variance
-  // when useGlobalStats_ they are loaded from moving mean and variance
-  // when do not useGlobalStats_ they are calculated from this mini-batch
-  MKLDNNMatrixPtr mean_;
-  MKLDNNMatrixPtr var_;
-
-public:
-  explicit MKLDNNBatchNormLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), useGlobalStats_(true), hasInitedWgt_(false) {}
-
-  ~MKLDNNBatchNormLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-  void convertWeightsFromPaddle() override;
-
-protected:
-  void initWeight();
-  /**
-   * cal moving mean and variance.
-   * moving = moving * AvgFraction + local * (1 - AvgFraction)
-   */
-  void calMovingMeanAndVar();
-
-  void resetFwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<bn_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr wgt,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<bn_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdPD(std::shared_ptr<bn_bwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr& in,
-                  MKLDNNMatrixPtr& wgt,
-                  MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<bn_bwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h
deleted file mode 100644
index f7abdabfb51df27f8db4e6d4d88c80546eeba248..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNConcatLayer.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-
-/**
- * @brief A subclass of MKLDNNLayer Concatenate layer.
- *
- * The config file api is mkldnn_concat
- */
-class MKLDNNConcatLayer : public MKLDNNLayer {
-protected:
-  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
-  // input channel numbers
-  std::vector<int> channels_;
-
-  // concat_dimension in MKLDNN
-  // if axis_ == 0, concat batchsize
-  // if axis_ == 1, concat channel (default)
-  int axis_;
-
-public:
-  explicit MKLDNNConcatLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), axis_(1) {}
-
-  ~MKLDNNConcatLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void printSizeInfo() override {
-    CHECK_EQ(channels_.size(), inputLayers_.size());
-    for (size_t i = 0; i < channels_.size(); ++i) {
-      VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
-                         << ": " << bs_ << ", " << channels_[i] << ", " << ih_
-                         << ", " << iw_;
-    }
-    VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
-                       << ", " << ow_;
-  }
-
-  size_t keepCondition() {
-    // reset when the total element size of all inputs changed
-    size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
-    for (size_t i = 1; i < inputLayers_.size(); ++i) {
-      totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
-    }
-    return totalSize;
-  }
-
-protected:
-  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
-                  std::vector<MKLDNNMatrixPtr>& inputs,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
deleted file mode 100644
index a442a0a01369f4ceb27ba4a1976df7f6e25b832f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ /dev/null
@@ -1,388 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNConvLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer);
-
-bool MKLDNNConvLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK(config_.shared_biases()) << "Only support shared biases yet";
-
-  oc_ = config_.num_filters();
-  const ConvConfig& conf = config_.inputs(0).conv_conf();
-  ic_ = conf.channels();
-  fw_ = conf.filter_size();
-  fh_ = conf.filter_size_y();
-  pw_ = conf.padding();
-  ph_ = conf.padding_y();
-  dw_ = conf.dilation();
-  dh_ = conf.dilation_y();
-  sw_ = conf.stride();
-  sh_ = conf.stride_y();
-  gp_ = conf.groups();
-  oh_ = conf.output_y();
-  ow_ = conf.output_x();
-  ih_ = conf.img_size_y();
-  iw_ = conf.img_size();
-  caffeMode_ = conf.caffe_mode();
-  CHECK(caffeMode_) << "Only support caffe mode yet";
-  CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet";
-  // check group setting
-  CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc";
-  CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic";
-
-  // create weight
-  size_t height = oc_ / gp_;
-  size_t width = ic_ * fh_ * fw_;
-  CHECK_EQ(parameters_[0]->getSize(), height * width);
-  weight_ =
-      std::unique_ptr<Weight>(new Weight(height, width, parameters_[0], 0));
-
-  // create biases
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
-  }
-  return true;
-}
-
-void MKLDNNConvLayer::convertWeightsFromPaddle() {
-  if (hasInitedWgt_) {
-    return;
-  }
-
-  CHECK(wgtVal_) << "should have been initialized";
-  // the paddle weight format is oihw or goihw
-  auto targetDim = wgtVal_->getDims();
-  auto srcFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
-  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
-  hasInitedWgt_ = true;
-}
-
-void MKLDNNConvLayer::convertWeightsToPaddle() {
-  CHECK(wgtVal_) << "should have been initialized";
-  auto targetDim = wgtVal_->getDims();
-  auto dstFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
-  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
-}
-
-void MKLDNNConvLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-
-  // cal output sizes
-  // oc can not be changed
-  int fh = (fh_ - 1) * dh_ + 1;
-  int fw = (fw_ - 1) * dw_ + 1;
-  oh = outputSize(ih, fh, ph_, sh_, caffeMode_);
-  ow = outputSize(iw, fw, pw_, sw_, caffeMode_);
-
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  resetFwdPD(fwdPD_);
-
-  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-}
-
-void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
-  std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
-
-  resetBwdWgtPD(bwdWgtPD);
-
-  resetBwdDataPD(bwdDataPD);
-
-  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
-
-  resetBwdPipeline(
-      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
-}
-
-void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
-  weight_->getParameterPtr()->incUpdate(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt,
-                                       memory::dims& bias,
-                                       memory::dims& stride,
-                                       memory::dims& dilation,
-                                       memory::dims& padL,
-                                       memory::dims& padR) {
-  wgt = (gp_ == 1) ? memory::dims{oc_, ic_, fh_, fw_}
-                   : memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_};
-  bias = memory::dims{oc_};
-  stride = memory::dims{sh_, sw_};
-  padL = memory::dims{ph_, pw_};
-  padR = getPaddingR();
-  // note: mkldnn dilation start from 0
-  dilation = memory::dims{dh_ - 1, dw_ - 1};
-}
-
-void MKLDNNConvLayer::resetFwdPD(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd) {
-  // dims for conv
-  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
-  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  algorithm algo = algorithm::convolution_direct;
-  padding_kind padKind = padding_kind::zero;
-  conv_fwd::desc fwdDesc =
-      biases_ && biases_->getW()
-          ? conv_fwd::desc(pk,
-                           algo,
-                           MKLDNNMatrix::createMemoryDesc(inDims),
-                           MKLDNNMatrix::createMemoryDesc(wgtDims),
-                           MKLDNNMatrix::createMemoryDesc(biasDims),
-                           MKLDNNMatrix::createMemoryDesc(outDims),
-                           strides,
-                           dilations,
-                           padL,
-                           padR,
-                           padKind)
-          : conv_fwd::desc(pk,
-                           algo,
-                           MKLDNNMatrix::createMemoryDesc(inDims),
-                           MKLDNNMatrix::createMemoryDesc(wgtDims),
-                           MKLDNNMatrix::createMemoryDesc(outDims),
-                           strides,
-                           dilations,
-                           padL,
-                           padR,
-                           padKind);
-  pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_));
-}
-
-void MKLDNNConvLayer::resetFwdBuffers(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(pd);
-  resetInValue(
-      in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
-
-  resetOutValue(out, pd->dst_primitive_desc());
-
-  resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
-
-  if (biases_ && biases_->getW()) {
-    resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNConvLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<conv_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  if (bias) {
-    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
-  } else {
-    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
-  }
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNConvLayer::resetBwdWgtPD(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& pd) {
-  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
-  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-
-  // create backward weight using input, output and weight value memory desc
-  CHECK(inVals_[0]) << "Should have internal input value";
-  CHECK(outVal_) << "Should have internal output value";
-  CHECK(wgtVal_) << "Should have weight value";
-  algorithm algo = algorithm::convolution_direct;
-  padding_kind padKind = padding_kind::zero;
-  auto bwdWgtDesc = biasVal_ != nullptr
-                        ? conv_bwdWgt::desc(algo,
-                                            inVals_[0]->getMemoryDesc(),
-                                            wgtVal_->getMemoryDesc(),
-                                            biasVal_->getMemoryDesc(),
-                                            outVal_->getMemoryDesc(),
-                                            strides,
-                                            padL,
-                                            padR,
-                                            padKind)
-                        : conv_bwdWgt::desc(algo,
-                                            inVals_[0]->getMemoryDesc(),
-                                            wgtVal_->getMemoryDesc(),
-                                            outVal_->getMemoryDesc(),
-                                            strides,
-                                            padL,
-                                            padR,
-                                            padKind);
-  pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(
-      outVal_,
-      pd->diff_dst_primitive_desc(),
-      "primitive desc of out value and grad should be equal");
-  CHECK_PRIMITIVE_DESC_EQ(
-      wgtVal_,
-      pd->diff_weights_primitive_desc(),
-      "primitive desc of weight value and grad should be equal");
-}
-
-void MKLDNNConvLayer::resetBwdDataPD(
-    std::shared_ptr<conv_bwdData::primitive_desc>& pd) {
-  pd = nullptr;
-  if (inputLayers_[0]->getOutput().grad == nullptr) {
-    return;
-  }
-
-  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
-  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-  CHECK(inVals_[0]) << "Should have internal input value";
-  CHECK(outVal_) << "Should have internal output value";
-  // create backward data using input and output value memory desc
-  // but using weight memory desc with any format
-  auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
-                                        inVals_[0]->getMemoryDesc(),
-                                        MKLDNNMatrix::createMemoryDesc(wgtDims),
-                                        outVal_->getMemoryDesc(),
-                                        strides,
-                                        padL,
-                                        padR,
-                                        padding_kind::zero);
-  pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(
-      inVals_[0],
-      pd->diff_src_primitive_desc(),
-      "primitive desc of in value and grad should be equal");
-  CHECK_PRIMITIVE_DESC_EQ(
-      outVal_,
-      pd->diff_dst_primitive_desc(),
-      "primitive desc of out value and grad should be equal");
-}
-
-void MKLDNNConvLayer::resetBwdBuffers(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(wgtPD);
-  resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
-
-  resetWithMatrix(
-      wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
-  CHECK_PRIMITIVE_DESC_EQ(
-      wgtVal_,
-      wgt->getPrimitiveDesc(),
-      "primitive desc of weight grad and value should be equal");
-
-  bias = nullptr;
-  if (biases_ && biases_->getWGrad()) {
-    resetWithMatrix(
-        bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
-    CHECK(bias);
-    CHECK_PRIMITIVE_DESC_EQ(
-        biasVal_,
-        bias->getPrimitiveDesc(),
-        "primitive desc of bias grad and value should be equal");
-  }
-
-  if (dataPD == nullptr) {
-    return;
-  }
-  resetInGrad(in, dataPD->diff_src_primitive_desc());
-  resetWgtValBwdData(dataPD, wgtValBwdData_);
-}
-
-void MKLDNNConvLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0]);
-  // add bwdWgt handle
-  if (bias) {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
-  } else {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
-  }
-  pipeline.push_back(*bwdWgt_);
-
-  if (dataPD == nullptr) {
-    return;
-  }
-  if (cvtWgtVal_) {
-    pipeline.push_back(*cvtWgtVal_);
-  }
-  // add bwdData handle
-  CHECK(wgtValBwdData_) << "Should have weight memory";
-  bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-void MKLDNNConvLayer::resetWgtValBwdData(
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& wgt) {
-  if (dataPD == nullptr) {
-    return;
-  }
-
-  // create new weight value for backward data, and create reorder if necessary
-  // since the primitive_desc would be different with wgtVal_
-  CHECK(wgtVal_) << "should have weight value";
-  if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
-    wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
-    cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
-    CHECK(cvtWgtVal_);
-  } else {
-    wgtValBwdData_ = wgtVal_;
-  }
-  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
-                    << wgtValBwdData_->getFormat();
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
deleted file mode 100644
index 29c8735fbb91e7418797874238eb87759420f181..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::convolution_forward conv_fwd;
-typedef mkldnn::convolution_backward_weights conv_bwdWgt;
-typedef mkldnn::convolution_backward_data conv_bwdData;
-
-/**
- * @brief A subclass of MKLDNNLayer conv layer.
- *
- * The config file api is mkldnn_conv
- */
-class MKLDNNConvLayer : public MKLDNNLayer {
-protected:
-  // padding height and width
-  int ph_, pw_;
-  // stride height and width
-  int sh_, sw_;
-  // dilation height and width
-  int dh_, dw_;
-  // filter(kenerl) height and width
-  int fh_, fw_;
-  // group number
-  int gp_;
-
-  // in resetBwdData, the format of wgtValBwdData_ is different with wgtVal_
-  MKLDNNMatrixPtr wgtValBwdData_;
-  // convert handle from wgtVal_ to wgtValBwdData_
-  std::shared_ptr<mkldnn::reorder> cvtWgtVal_;
-
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
-
-  // whether the weight has been init
-  bool hasInitedWgt_;
-
-  // true by default, which impact the calculation of output image size.
-  // details can refer to mathUtil.h
-  bool caffeMode_;
-
-  // weight and bias
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> biases_;
-
-public:
-  explicit MKLDNNConvLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false), caffeMode_(true) {}
-
-  ~MKLDNNConvLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-  void convertWeightsFromPaddle() override;
-
-  void convertWeightsToPaddle() override;
-
-  void printSizeInfo() override {
-    MKLDNNLayer::printSizeInfo();
-    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
-                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
-                       << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
-  }
-
-protected:
-  /**
-   * load the dims settings of this conv
-   */
-  void loadConvSettings(mkldnn::memory::dims& wgt,
-                        mkldnn::memory::dims& bias,
-                        mkldnn::memory::dims& stride,
-                        mkldnn::memory::dims& dilation,
-                        mkldnn::memory::dims& padL,
-                        mkldnn::memory::dims& padR);
-
-  void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
-  void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                       MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
-  void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
-  void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                       std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                       MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-
-  /**
-   * reset MKLDNNMatrix of weight value for backward data
-   * since the primitive_desc would be different with wgtVal_
-   */
-  void resetWgtValBwdData(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                          MKLDNNMatrixPtr& wgt);
-
-  /**
-   * get padding_r according to
-   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-   * test_convolution_forward_common.hpp
-   * @note: mkldnn dilation start from 0 while paddle start from 1
-   */
-  mkldnn::memory::dims getPaddingR() const {
-    mkldnn::memory::dims padR = {ph_, pw_};
-    for (int i = 0; i < 2; ++i) {
-      if ((ih_ - ((fh_ - 1) * dh_ + 1) + ph_ + padR[0]) / sh_ + 1 != oh_) {
-        ++padR[0];
-      }
-      if ((iw_ - ((fw_ - 1) * dw_ + 1) + pw_ + padR[1]) / sw_ + 1 != ow_) {
-        ++padR[1];
-      }
-    }
-    return padR;
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
deleted file mode 100644
index 0c7e6f16e24a65b552cebcbd2111926cefc211f4..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNFcLayer.h"
-#include "paddle/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer);
-
-bool MKLDNNFcLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet";
-
-  // output size, cat not be changed
-  oc_ = getSize();
-  oh_ = 1;
-  ow_ = 1;
-  ih_ = 1;
-  iw_ = 1;
-
-  // input size can not change in FC
-  iLayerSize_ = inputLayers_[0]->getSize();
-  CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_);
-
-  // create weight
-  weight_ =
-      std::unique_ptr<Weight>(new Weight(oc_, iLayerSize_, parameters_[0], 0));
-
-  // create biases
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
-  }
-  return true;
-}
-
-void MKLDNNFcLayer::convertWeightsFromPaddle() {
-  if (hasInitedWgt_) {
-    return;
-  }
-
-  CHECK(wgtVal_) << "should have been initialized";
-  auto targetDim = wgtVal_->getDims();
-  auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
-  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
-  hasInitedWgt_ = true;
-}
-
-void MKLDNNFcLayer::convertWeightsToPaddle() {
-  CHECK(wgtVal_) << "should have been initialized";
-  auto targetDim = wgtVal_->getDims();
-  auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
-  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
-}
-
-void MKLDNNFcLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-
-  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
-  ic = iLayerSize_ / (ih * iw);
-  CHECK_EQ(size_t(ic * ih * iw), iLayerSize_) << "not divisible";
-  CHECK_EQ(size_t(oc), getSize());
-
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc);
-}
-
-void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
-                             std::vector<MKLDNNMatrixPtr>& inputs,
-                             MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);
-
-  resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
-}
-
-void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
-                             std::vector<MKLDNNMatrixPtr>& inputs,
-                             MKLDNNMatrixPtr& out) {
-  std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
-  std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
-
-  resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);
-
-  resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);
-
-  resetBwdDataPD(bwdDataPD, inputs[0], out);
-
-  resetBwdPipeline(
-      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
-}
-
-void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
-  weight_->getParameterPtr()->incUpdate(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
-                                    MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-  CHECK(in);
-  in->downSpatial();
-
-  auto outPD =
-      MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
-  resetOutValue(out, outPD);
-
-  format wgtFmt = format::oihw;
-  if (in->getFormat() == format::nChw8c) {
-    wgtFmt = format::oIhw8i;
-  } else if (in->getFormat() == format::nChw16c) {
-    wgtFmt = format::oIhw16i;
-  }
-  auto wgtPD =
-      MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
-  resetWithMatrix(wgt, weight_->getW(), wgtPD);
-  wgt->downSpatial();
-
-  if (biases_ && biases_->getW()) {
-    auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
-    resetWithMatrix(bias, biases_->getW(), biasPD);
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
-                               MKLDNNMatrixPtr in,
-                               MKLDNNMatrixPtr wgt,
-                               MKLDNNMatrixPtr bias,
-                               MKLDNNMatrixPtr out) {
-  CHECK(in);
-  CHECK(wgt);
-  CHECK(out);
-  prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc = bias != nullptr ? fc_fwd::desc(pk,
-                                                        in->getMemoryDesc(),
-                                                        wgt->getMemoryDesc(),
-                                                        bias->getMemoryDesc(),
-                                                        out->getMemoryDesc())
-                                         : fc_fwd::desc(pk,
-                                                        in->getMemoryDesc(),
-                                                        wgt->getMemoryDesc(),
-                                                        out->getMemoryDesc());
-  pd.reset(new fc_fwd::primitive_desc(fwdDesc, engine_));
-}
-
-void MKLDNNFcLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<fc_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  if (bias) {
-    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
-  } else {
-    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
-  }
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
-                                    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-
-  CHECK(wgtVal_);
-  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
-
-  if (biasVal_) {
-    resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
-  } else {
-    bias = nullptr;
-  }
-}
-
-void MKLDNNFcLayer::resetBwdWgtPD(
-    std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0]);
-  fc_bwdWgt::desc bwdWgtDesc =
-      bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
-                             wgt->getMemoryDesc(),
-                             bias->getMemoryDesc(),
-                             out->getMemoryDesc())
-           : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
-                             wgt->getMemoryDesc(),
-                             out->getMemoryDesc());
-  pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNFcLayer::resetBwdDataPD(
-    std::shared_ptr<fc_bwdData::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  CHECK(wgtVal_);
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(
-      in->getMemoryDesc(), wgtVal_->getMemoryDesc(), out->getMemoryDesc());
-  pd.reset(new fc_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNFcLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
-    std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias,
-    MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0]);
-  if (bias) {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
-  } else {
-    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
-  }
-  pipeline.push_back(*bwdWgt_);
-
-  if (bwdDataPD == nullptr) {
-    return;
-  }
-  CHECK(wgtVal_) << "Should have weight memory";
-  bwdData_.reset(new fc_bwdData(*bwdDataPD, *out, *wgtVal_, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
deleted file mode 100644
index 0d41a4379d677f86f672852fec09b1241009597b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::inner_product_forward fc_fwd;
-typedef mkldnn::inner_product_backward_weights fc_bwdWgt;
-typedef mkldnn::inner_product_backward_data fc_bwdData;
-
-/**
- * @brief A subclass of MKLDNNLayer fc layer.
- *
- * The config file api is mkldnn_fc
- */
-class MKLDNNFcLayer : public MKLDNNLayer {
-protected:
-  // input layer size, can not be change after init
-  size_t iLayerSize_;  // == ic * ih * iw
-
-  // if has already init the weight
-  bool hasInitedWgt_;
-
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<fc_fwd::primitive_desc> fwdPD_;
-
-  // fc weight and bias
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> biases_;
-
-public:
-  explicit MKLDNNFcLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false) {}
-
-  ~MKLDNNFcLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void updateWeights(const UpdateCallback& callback) override;
-
-  void convertWeightsFromPaddle() override;
-
-  void convertWeightsToPaddle() override;
-
-protected:
-  void resetFwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr wgt,
-                  MKLDNNMatrixPtr bias,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<fc_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in,
-                       MKLDNNMatrixPtr& wgt,
-                       MKLDNNMatrixPtr& bias,
-                       MKLDNNMatrixPtr& out);
-  void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
-                     MKLDNNMatrixPtr& wgt,
-                     MKLDNNMatrixPtr& bias,
-                     MKLDNNMatrixPtr& out);
-  void resetBwdDataPD(std::shared_ptr<fc_bwdData::primitive_desc>& pd,
-                      MKLDNNMatrixPtr& in,
-                      MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
-                        std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
deleted file mode 100644
index 88513ab8bca3899775be7822083b51120a04d6e4..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNLRNLayer.h"
-#include "paddle/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer);
-
-bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  localSize_ = conf.size();
-  alpha_ = conf.scale();
-  beta_ = conf.pow();
-
-  ic_ = conf.channels();
-  oc_ = ic_;
-  iw_ = conf.img_size();
-  ow_ = conf.output_x();
-  ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  CHECK_EQ(iw_, ow_);
-  CHECK_EQ(ih_, oh_);
-  return true;
-}
-
-void MKLDNNLRNLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  reshapeInput(bs, ih, iw);
-  // ic_ and oc can not be changed
-  CHECK_EQ((size_t)ic,
-           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
-      << "Input channel can not be changed";
-  oh = ih;
-  ow = iw;
-  reshapeOutput(oh, ow);
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNLRNLayer::resetFwd(std::vector<primitive>& pipeline,
-                              std::vector<MKLDNNMatrixPtr>& inputs,
-                              MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], out);
-
-  resetFwdPD(fwdPD_, inputs[0], out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
-}
-
-void MKLDNNLRNLayer::resetBwd(std::vector<primitive>& pipeline,
-                              std::vector<MKLDNNMatrixPtr>& inputs,
-                              MKLDNNMatrixPtr& out) {
-  std::shared_ptr<lrn_bwd::primitive_desc> pd;
-
-  resetBwdBuffers(inputs[0], out);
-
-  resetBwdPD(pd, inputs[0], out);
-
-  resetBwdPipeline(pipeline, pd, inputs[0], out);
-}
-
-void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                     MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-  CHECK(in);
-  resetOutValue(out, in->getPrimitiveDesc());
-}
-
-void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-                                MKLDNNMatrixPtr in,
-                                MKLDNNMatrixPtr out) {
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  auto fwdDesc = lrn_fwd::desc(pk,
-                               algorithm::lrn_across_channels,
-                               in->getMemoryDesc(),
-                               localSize_,
-                               alpha_,
-                               beta_,
-                               1.0f);
-  pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_));
-  // prepare workspace if necessary
-  workspace_ =
-      passType_ != PASS_TEST
-          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
-          : nullptr;
-}
-
-void MKLDNNLRNLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  fwd_ = workspace_
-             ? std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *workspace_, *out))
-             : std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *out));
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                     MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-}
-
-void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  CHECK(out);
-  auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels,
-                               in->getMemoryDesc(),
-                               out->getMemoryDesc(),
-                               localSize_,
-                               alpha_,
-                               beta_,
-                               1.0f);
-  pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNLRNLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  if (pd == nullptr) {
-    return;
-  }
-  CHECK(inVals_[0]);
-  CHECK(workspace_);
-  bwdData_ = std::make_shared<lrn_bwd>(
-      lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h
deleted file mode 100644
index b503ee55947294d7c44d1760058f8c26bceed142..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNLRNLayer.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::lrn_forward lrn_fwd;
-typedef mkldnn::lrn_backward lrn_bwd;
-
-/**
- * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer.
- *
- * The config file api is mkldnn_lrn
- */
-class MKLDNNLRNLayer : public MKLDNNLayer {
-protected:
-  // save forward primitive_desc, which can be used in backward
-  std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
-  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-  // test_lrn_backward.cpp, lrn need workspace for backward
-  std::shared_ptr<mkldnn::memory> workspace_;
-
-  int localSize_;
-  float alpha_, beta_;  // scale and pow in paddle
-
-public:
-  explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
-
-  ~MKLDNNLRNLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-protected:
-  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<lrn_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr& in,
-                  MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<lrn_bwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
deleted file mode 100644
index 4a7eb74ce3a13ed38be3548d8ce34382c594205a..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ /dev/null
@@ -1,477 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "MKLDNNBase.h"
-#include "mkldnn.hpp"
-#include "paddle/math/MKLDNNMatrix.h"
-#include "paddle/utils/Stat.h"
-
-DECLARE_bool(use_mkldnn);
-
-namespace paddle {
-
-class MKLDNNLayer;
-typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
-
-/**
- * @brief Base class of MKLDNNlayer.
- *
- */
-class MKLDNNLayer : public Layer {
-protected:
-  // batch size
-  int bs_;
-  // their sizes are always from the first input layer
-  // input image channel, height and width
-  int ic_, ih_, iw_;
-  // output image channel, height and width
-  int oc_, oh_, ow_;
-
-  // the condition that forward need be reset
-  size_t condition_;
-  // backward also need reset after reset forward handle
-  bool needResetBwd_;
-
-  // is output only mkldnn
-  bool outputOnlyMKLDNN_;
-
-  // mkldnn engine, stream and primivtives
-  mkldnn::engine engine_;
-  std::shared_ptr<MKLDNNStream> stream_;
-  std::shared_ptr<mkldnn::primitive> fwd_;
-  std::shared_ptr<mkldnn::primitive> bwdWgt_;
-  std::shared_ptr<mkldnn::primitive> bwdData_;
-  std::vector<mkldnn::primitive> pipelineFwd_;
-  std::vector<mkldnn::primitive> pipelineBwd_;
-
-  /* Value and grad are seperated as internal and external buffers.
-   * Each MKLDNNLayer must init or reset internal buffer at least,
-   * and the external buffer format is always nchw of nc(when h==w==1),
-   * which is the same format as paddle.
-   * The output_.value and output_.grad always save the external data,
-   * when mixed with cpu device.
-   * When all layers are mkldnn layers, they could save internal data.
-   */
-  // below MKLDNNMatrix buffers are all internal buffers
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
-  MKLDNNMatrixPtr outVal_;
-  MKLDNNMatrixPtr outGrad_;
-  // below are external value and grad
-  std::vector<MKLDNNMatrixPtr> extInVals_;
-  std::vector<MKLDNNMatrixPtr> extInGrads_;
-  MKLDNNMatrixPtr extOutVal_;
-  MKLDNNMatrixPtr extOutGrad_;
-  // convert handle between external and internal buffers
-  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
-  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
-  // weight and bias are always internal buffers
-  MKLDNNMatrixPtr wgtVal_;
-  MKLDNNMatrixPtr wgtGrad_;
-  MKLDNNMatrixPtr biasVal_;
-  MKLDNNMatrixPtr biasGrad_;
-
-  // merge grad primitive
-  std::shared_ptr<mkldnn::primitive> mergeGrad_;
-  std::vector<mkldnn::primitive> pipelineMergeGrad_;
-  // tmp input argument to save input grad, only used to merge grad
-  Argument tmpInArg_;
-
-public:
-  explicit MKLDNNLayer(const LayerConfig& config)
-      : Layer(config),
-        ih_(0),
-        iw_(0),
-        condition_(0),
-        needResetBwd_(true),
-        outputOnlyMKLDNN_(false),
-        engine_(mkldnn::engine::cpu, 0),
-        stream_(nullptr),
-        fwd_(nullptr),
-        bwdWgt_(nullptr),
-        bwdData_(nullptr) {}
-
-  ~MKLDNNLayer() {}
-
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
-
-  /**
-   * reshape the input and output channels and image sizes
-   * and reset output buffer size
-   */
-  virtual void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
-
-  /**
-   * reset the mkldnn forward primitve and memories
-   * only would be called when input size changes
-   * weight and bias buffers should be coverd by child class itself
-   */
-  virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out) = 0;
-
-  /**
-   * reset the mkldnn backward primitve and memories
-   * only would be called when needed
-   * weight and bias buffers should be coverd by child class itself
-   */
-  virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                        std::vector<MKLDNNMatrixPtr>& inputs,
-                        MKLDNNMatrixPtr& out) = 0;
-
-  /**
-   * Update weights and biases if necessary.
-   */
-  virtual void updateWeights(const UpdateCallback& callback) {}
-
-  /**
-   * convert weight from paddle format to mkldnn format
-   * weight_ will be override
-   */
-  virtual void convertWeightsFromPaddle() {}
-
-  /**
-   * convert mkldnn weight to paddle format
-   * weight_ will be override
-   */
-  virtual void convertWeightsToPaddle() {}
-
-  /**
-   * add this interface as public for unit test
-   */
-  void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
-
-protected:
-  /**
-   * Some layers may have different condition to reset the forward.
-   * The function returns the condition that do not need reset forward.
-   */
-  inline virtual size_t keepCondition() {
-    // reset when the first input element size changed, not only the batchsize
-    return inputLayers_[0]->getOutputValue()->getElementCnt();
-  }
-
-  /**
-   * reshape the input image sizes and input batchsize
-   */
-  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
-
-  /**
-   * reshape output image sizes
-   */
-  void reshapeOutput(size_t height, size_t width);
-
-  /**
-   * reset MKLDNNMatrix from Matrix and internal primitive desc.
-   * reset nullptr if matrix or primitive desc is empty
-   */
-  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
-                       const MatrixPtr& mat,
-                       mkldnn::memory::primitive_desc pd);
-
-  /**
-   * reset input value from input MKLDNNMatrix and internal primitive desc.
-   * reset both internal and external buffer and create reorder if necessary.
-   * input channel may be different in concat.
-   */
-  void resetInValue(
-      MKLDNNMatrixPtr& in,
-      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
-      size_t idx = 0,
-      int inputChannel = 0);
-
-  /**
-   * reset output value from internal primitive desc.
-   * reset both internal and external buffer and create reorder if necessary.
-   */
-  void resetOutValue(MKLDNNMatrixPtr& out,
-                     mkldnn::memory::primitive_desc intPD);
-
-  /**
-   * reset input grad from internal primitive desc.
-   * reset both internal and external buffer and create reorder if necessary.
-   */
-  void resetInGrad(MKLDNNMatrixPtr& in,
-                   mkldnn::memory::primitive_desc intPD,
-                   size_t idx = 0);
-
-  /**
-   * reset output grad from internal primitive desc.
-   * merge grad if necessary.
-   * reset both internal and external buffer and create reorder if necessary.
-   * note: about merge grad, when this layer has several outputs,
-   *       it could not be mixed with cpu device,
-   *       since it can not get memory desc from cpu device.
-   */
-  void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
-
-  /**
-   * reset the merge grad primitive if necessary.
-   * note: do not support the grads mixed with cpu device,
-   *       since it can not get memory desc from cpu device.
-   */
-  void resetMergeGrad(MKLDNNMatrixPtr& out);
-
-protected:
-  /**
-   * Set deviceId of this layer.
-   */
-  void setDevice(int id) { deviceId_ = id; }
-
-  /**
-   * check the format is nchw or nc,
-   * which is supported by Paddle default memory layout
-   */
-  bool isPaddleFormat(mkldnn::memory::format fmt) {
-    if (fmt == mkldnn::memory::format::nchw ||
-        fmt == mkldnn::memory::format::nc) {
-      return true;
-    } else {
-      return false;
-    }
-  }
-
-  /**
-   * If input only has MKLDNN device.
-   * Otherwise, only support the previous layer using CPU device.
-   */
-  bool inputIsOnlyMKLDNN(int index = 0) {
-    int prevDevice = getPrev(index)->getDeviceId();
-    if (prevDevice == MKLDNN_DEVICE) {
-      return true;
-    } else {
-      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
-      return false;
-    }
-  }
-
-  /**
-   * If output only has MKLDNN device.
-   * Otherwise, other devices should only using CPU device.
-   */
-  bool outputIsOnlyMKLDNN() {
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
-          << "Only support other device is CPU yet";
-    }
-    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
-    return outputOnlyMKLDNN_;
-  }
-
-  /**
-   * print info about sizes
-   */
-  virtual void printSizeInfo() {
-    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
-                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
-                       << ", oh: " << oh_ << ", ow: " << ow_;
-  }
-
-  /**
-   * print the mkldnn memory format of value
-   */
-  virtual void printValueFormat() {
-    for (size_t i = 0; i < inVals_.size(); ++i) {
-      if (!inVals_[i]) {
-        continue;
-      }
-      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
-                        << ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
-                                                  : inVals_[i]->getFormat())
-                        << " >>> " << inVals_[i]->getFormat() << " >>>";
-    }
-    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
-                        << (extOutVal_ ? extOutVal_->getFormat()
-                                       : outVal_->getFormat());
-    }
-    if (wgtVal_) {
-      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
-    }
-    if (biasVal_) {
-      VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
-    }
-  }
-
-  /**
-   * print the mkldnn memory format of grad
-   */
-  virtual void printGradFormat() {
-    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
-                        << (extOutGrad_ ? extOutGrad_->getFormat()
-                                        : outGrad_->getFormat());
-    }
-    for (size_t i = 0; i < inGrads_.size(); ++i) {
-      if (!inGrads_[i]) {
-        continue;
-      }
-      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
-                        << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
-                                                   : inGrads_[i]->getFormat())
-                        << " <<< " << inGrads_[i]->getFormat() << " <<<";
-    }
-    if (wgtGrad_) {
-      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
-    }
-    if (biasGrad_) {
-      VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
-    }
-  }
-
-private:
-  /**
-   * clear all grad
-   */
-  void clearGrads() {
-    if (output_.grad) {
-      output_.grad->zeroMem();
-    }
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].grad) {
-        outputOtherDevice_[i].grad->zeroMem();
-      }
-    }
-  }
-
-  /**
-   * Set deviceId of the params used in this layer.
-   */
-  void setParamsDevice(int id, const ParameterMap& parameterMap) {
-    for (auto& inputConfig : config_.inputs()) {
-      if (inputConfig.has_input_parameter_name()) {
-        ParameterPtr parameter;
-        std::string name = inputConfig.input_parameter_name();
-        CHECK(mapGet(name, parameterMap, &parameter))
-            << "Cannot find input parameter " << name << " for layer "
-            << getName();
-        parameter->setDevice(id);
-      }
-    }
-    if (config_.has_bias_parameter_name()) {
-      ParameterPtr parameter;
-      std::string name = config_.bias_parameter_name();
-      CHECK(mapGet(name, parameterMap, &parameter))
-          << "Cannot find bias parameter " << name << " for layer "
-          << getName();
-      parameter->setDevice(id);
-    }
-  }
-
-  /**
-   * Set output map of prev layers.
-   */
-  void setOutputMap() {
-    outputMap_.clear();
-    for (size_t i = 0; i < inputLayers_.size(); ++i) {
-      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
-    }
-  }
-
-  /**
-   * if have cpu device, share value and grad data with output_
-   */
-  void shareCPUDevice() {
-    if (outputIsOnlyMKLDNN()) {
-      return;
-    }
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      outputOtherDevice_[i].value = output_.value;
-      outputOtherDevice_[i].grad = output_.grad;
-    }
-  }
-
-  /**
-   * Check the cpu device number of outputOtherDevice_.
-   * should have only one at most.
-   */
-  void checkCPUOutputsNumber(int max = 1) {
-    int cnt = 0;
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
-        ++cnt;
-      }
-    }
-    CHECK_LE(cnt, max) << "too much CPU devies";
-  }
-
-  /**
-   * copy SeqInfo from input layer to this output and other output devices.
-   * @note: do not use getInput(0) since it used this deviceId_,
-   *        use "inputLayers_[0]->getOutput()" instead.
-   */
-  void copySeqInfoToOutputs() {
-    if (inputLayers_.empty() || !needSequenceInfo_) {
-      return;
-    }
-    const Argument& input = inputLayers_[0]->getOutput();
-    output_.sequenceStartPositions = input.sequenceStartPositions;
-    output_.subSequenceStartPositions = input.subSequenceStartPositions;
-    output_.cpuSequenceDims = input.cpuSequenceDims;
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      outputOtherDevice_[i].sequenceStartPositions =
-          output_.sequenceStartPositions;
-      outputOtherDevice_[i].subSequenceStartPositions =
-          output_.subSequenceStartPositions;
-      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
-    }
-  }
-
-  void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
-    // MKLDNNLayer output value should be MKLDNNMatrix
-    // so external output value is necessary.
-    // Then external input value is not necessary,
-    // since input may be mkldnn internal buffer.
-    CHECK(extOutVal_) << "external output value is necessary";
-    output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
-    CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
-    for (size_t i = 0; i < cvtInVals_.size(); ++i) {
-      if (cvtInVals_[i]) {
-        pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
-      }
-    }
-    if (cvtOutVal_) {
-      pipeline.push_back(*cvtOutVal_);
-    }
-  }
-  void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
-    // external output grad is not necessary
-    // since output may be mkldnn internal buffer or merge them directly.
-    CHECK(outGrad_) << "internal output grad is necessary";
-    if (extOutGrad_) {
-      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
-          << "the external buffer should share the same data with output_.grad";
-    }
-    if (cvtOutGrad_) {
-      pipeline.insert(pipeline.begin(), *cvtOutGrad_);
-    }
-    for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
-      if (cvtInGrads_[i]) {
-        pipeline.push_back(*cvtInGrads_[i]);
-      }
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
deleted file mode 100644
index 3be848c7496aac616903cb09844c5eadd320e91c..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNPoolLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/Logging.h"
-
-using namespace mkldnn;  // NOLINT
-typedef memory::format format;
-
-namespace paddle {
-
-REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer);
-
-bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
-    return false;
-  }
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  ic_ = conf.channels();
-  ih_ = conf.img_size_y();
-  iw_ = conf.img_size();
-  oc_ = ic_;
-  oh_ = conf.output_y();
-  ow_ = conf.output_x();
-  fh_ = conf.size_y();
-  fw_ = conf.size_x();
-  ph_ = conf.padding_y();
-  pw_ = conf.padding();
-  sh_ = conf.stride_y();
-  sw_ = conf.stride();
-
-  const std::string& type = conf.pool_type();
-  if (type == "max-projection") {
-    poolAlgo_ = algorithm::pooling_max;
-  } else if (type == "avg-projection") {
-    // paddle only use exclude_padding
-    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
-  } else {
-    LOG(FATAL) << "unknow pooling type!";
-  }
-  return true;
-}
-
-void MKLDNNPoolLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
-  reshapeInput(bs, ih, iw);
-  // ic_ and oc can not be changed
-  CHECK_EQ((size_t)ic,
-           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
-      << "Input channel can not be changed";
-
-  // cal output sizes
-  // paddle used false caffeMode for pooling
-  oh = outputSize(ih, fh_, ph_, sh_, false);
-  ow = outputSize(iw, fw_, pw_, sw_, false);
-  reshapeOutput(oh, ow);
-
-  resizeOutput(bs, oc * oh * ow);
-}
-
-void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inputs[0], out);
-
-  resetFwdPD(fwdPD_, inputs[0], out);
-
-  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
-}
-
-void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
-                               std::vector<MKLDNNMatrixPtr>& inputs,
-                               MKLDNNMatrixPtr& out) {
-  std::shared_ptr<pool_bwd::primitive_desc> pd;
-
-  resetBwdBuffers(inputs[0], out);
-
-  resetBwdPD(pd, inputs[0], out);
-
-  resetBwdPipeline(pipeline, pd, inputs[0], out);
-}
-
-void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
-                                      MKLDNNMatrixPtr& out) {
-  resetInValue(in);
-
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  CHECK(in);
-  auto outPD =
-      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
-  resetOutValue(out, outPD);
-}
-
-void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                                 MKLDNNMatrixPtr in,
-                                 MKLDNNMatrixPtr out) {
-  memory::dims kernels = memory::dims{fh_, fw_};
-  memory::dims strides = memory::dims{sh_, sw_};
-  memory::dims padL = memory::dims{ph_, pw_};
-  memory::dims padR = getPaddingR();
-  padding_kind padKind = padding_kind::zero;
-  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
-                                        : prop_kind::forward_training;
-  auto fwdDesc = pool_fwd::desc(pk,
-                                poolAlgo_,
-                                in->getMemoryDesc(),
-                                out->getMemoryDesc(),
-                                strides,
-                                kernels,
-                                padL,
-                                padR,
-                                padKind);
-  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
-
-  // prepare workspace if necessary
-  workspace_ =
-      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
-          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
-          : nullptr;
-}
-
-void MKLDNNPoolLayer::resetFwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<pool_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  fwd_ = workspace_
-             ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
-             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
-  pipeline.push_back(*fwd_);
-}
-
-void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
-                                      MKLDNNMatrixPtr& out) {
-  CHECK(inVals_[0] && outVal_);
-  resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
-}
-
-void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                                 MKLDNNMatrixPtr& in,
-                                 MKLDNNMatrixPtr& out) {
-  pd = nullptr;
-  if (in == nullptr) {
-    return;
-  }
-  memory::dims kernels = memory::dims{fh_, fw_};
-  memory::dims strides = memory::dims{sh_, sw_};
-  memory::dims padL = memory::dims{ph_, pw_};
-  memory::dims padR = getPaddingR();
-  CHECK(out);
-  auto bwdDesc = pool_bwd::desc(poolAlgo_,
-                                in->getMemoryDesc(),
-                                out->getMemoryDesc(),
-                                strides,
-                                kernels,
-                                padL,
-                                padR,
-                                padding_kind::zero);
-  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-}
-
-void MKLDNNPoolLayer::resetBwdPipeline(
-    std::vector<primitive>& pipeline,
-    std::shared_ptr<pool_bwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {
-  if (pd == nullptr) {
-    return;
-  }
-
-  bwdData_ =
-      workspace_
-          ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
-          : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
-  pipeline.push_back(*bwdData_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
deleted file mode 100644
index 12821cda7308602dd2fe834f52c614e6112b7cea..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLDNNLayer.h"
-#include "mkldnn.hpp"
-
-namespace paddle {
-typedef mkldnn::pooling_forward pool_fwd;
-typedef mkldnn::pooling_backward pool_bwd;
-
-/**
- * @brief A subclass of MKLDNNLayer pool layer.
- *
- * The config file api is mkldnn_pool
- */
-class MKLDNNPoolLayer : public MKLDNNLayer {
-protected:
-  // padding height and width
-  int ph_, pw_;
-  // stride height and width
-  int sh_, sw_;
-  // filter(kenerl) height and width
-  int fh_, fw_;
-
-  // pooling_avg or pooling_max
-  mkldnn::algorithm poolAlgo_;
-
-  // save forward primitive_desc, which can be used backward
-  std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
-  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-  // test_pooling_forward.cpp, pool need workspace for backward
-  std::shared_ptr<mkldnn::memory> workspace_;
-
-public:
-  explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
-
-  ~MKLDNNPoolLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void reshape(
-      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
-
-  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                std::vector<MKLDNNMatrixPtr>& inputs,
-                MKLDNNMatrixPtr& out) override;
-
-  void printSizeInfo() override {
-    MKLDNNLayer::printSizeInfo();
-    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
-                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
-                       << ", sw: " << sw_;
-  }
-
-protected:
-  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr in,
-                  MKLDNNMatrixPtr out);
-  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<pool_fwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                  MKLDNNMatrixPtr& in,
-                  MKLDNNMatrixPtr& out);
-  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
-                        std::shared_ptr<pool_bwd::primitive_desc>& pd,
-                        MKLDNNMatrixPtr& in,
-                        MKLDNNMatrixPtr& out);
-
-  /**
-   * get padding_r according to
-   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
-   * test_pooling_forward.cpp
-   */
-  mkldnn::memory::dims getPaddingR() const {
-    mkldnn::memory::dims padR = {ph_, pw_};
-    for (int i = 0; i < 2; ++i) {
-      if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
-        ++padR[0];
-      }
-      if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
-        ++padR[1];
-      }
-    }
-    return padR;
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
deleted file mode 100644
index 37eb362d45215edc736984f8da784fe74bb43f2b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLPackedRecurrentLayer.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "MKLPackedWeight.h"
-#include "RecurrentLayer.h"
-
-DECLARE_bool(rnn_use_batch);
-
-namespace paddle {
-
-/**
- * @brief MKLPackedRecurrentLayer is almost the same with RecurrentLayer
- * but is optimized with MKL cblas packed gemm.
- * More details:
- * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md
- */
-
-class MKLPackedRecurrentLayer : public RecurrentLayer {
-public:
-  explicit MKLPackedRecurrentLayer(const LayerConfig& config)
-      : RecurrentLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-protected:
-  void forwardBatch(int batchSize,
-                    size_t numSequences,
-                    const int* starts) override;
-
-  void backwardBatch(int batchSize,
-                     size_t numSequences,
-                     const int* starts) override;
-
-protected:
-  /// packed_weight_ contains same data with
-  /// RecurrentLayer::weight_ but is packed
-  std::unique_ptr<MKLPackedWeight> packed_weight_;
-  /// packed_weightT_ is the transposition matrix of packed_weight_
-  std::unique_ptr<MKLPackedWeight> packed_weightT_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h
deleted file mode 100644
index 28b8a7db7cc3d2be12d6ce9291de1e415cf77bbc..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MKLPackedWeight.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/math/MathFunctions.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/Weight.h"
-
-namespace paddle {
-
-class MKLPackedWeight {
-protected:
-  /// The pointer of weight
-  real *weight_;
-  /// The pointer of cblas packed gemm to weight
-  real *packedWeight_;
-  size_t height_;
-  size_t width_;
-  bool transW_;
-
-public:
-  explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
-    packedWeight_ = nullptr;
-    weight_ = weight->getData();
-    height_ = weight->getHeight();
-    width_ = weight->getWidth();
-    transW_ = transW;
-  }
-
-  ~MKLPackedWeight() { free_(); }
-
-  void pack() { pack_(weight_); }
-
-  void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
-    cblas_sgemm_compute(CblasRowMajor,
-                        CblasNoTrans,
-                        CblasPacked,
-                        src->getHeight(),
-                        transW_ ? height_ : width_,
-                        transW_ ? width_ : height_,
-                        src->getData(),
-                        src->getWidth(),
-                        packedWeight_,
-                        width_,
-                        1.0,
-                        dst->getData(),
-                        dst->getWidth());
-  }
-
-protected:
-  void pack_(real *src) {
-    if (!packedWeight_) {
-      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
-    }
-    cblas_sgemm_pack(CblasRowMajor,
-                     CblasBMatrix,
-                     transW_ ? CblasTrans : CblasNoTrans,
-                     1,
-                     transW_ ? height_ : width_,
-                     transW_ ? width_ : height_,
-                     1.0,
-                     src,
-                     width_,
-                     packedWeight_);
-  }
-
-  void free_() {
-    if (packedWeight_) {
-      cblas_sgemm_free(packedWeight_);
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/gserver/layers/MaxIdLayer.cpp
deleted file mode 100644
index 84e375d7441ce3ccd8a5df94df22d85d104b5d96..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MaxIdLayer.cpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer for finding the id which has the maximal value for each sample.
- * The result is stored in output_.ids.
- *
- * The config file api is maxid_layer.
- */
-class MaxIdLayer : public Layer {
-private:
-  /// a predetermined number of best states at each level
-  size_t beamSize_;
-
-public:
-  explicit MaxIdLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    CHECK_EQ(1UL, inputLayers_.size());
-
-    beamSize_ = config_.has_beam_size() ? config_.beam_size() : FLAGS_beam_size;
-    CHECK_GE(beamSize_, 1LU);
-    return ret;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    const Argument& input = getInput(0);
-    size_t batchSize = input.getBatchSize();
-    IVector::resizeOrCreate(output_.ids, batchSize * beamSize_, useGpu_);
-    Matrix::resizeOrCreate(output_.in,
-                           batchSize,
-                           beamSize_,
-                           false,
-                           /* useGpu */ useGpu_);
-    output_.value = nullptr;
-    input.value->rowMax(*output_.ids, *output_.in);
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(maxid, MaxIdLayer);
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/gserver/layers/MaxLayer.cpp
deleted file mode 100644
index 7ee2e0dd946d6f332f6b8454f977601b0ee8d249..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MaxLayer.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MaxLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(max, MaxLayer);
-
-void MaxLayer::forward(PassType passType) {
-  SequencePoolLayer::forward(passType);
-
-  IVector::resizeOrCreate(
-      maxIndex_, newBatchSize_ * getSize(), useGpu(deviceId_));
-  maxIndex_->zeroMem();
-
-  MatrixPtr inputValue = getInputValue(0);
-  MatrixPtr outputValue = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
-    outputValue->maxSequenceForward(
-        *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
-  }
-
-  if (config_.output_max_index()) {
-    // copy maxIndex_ to output
-    outputValue->copyFrom(*maxIndex_);
-  } else {
-    /* add the bias-vector AFTER max operation */
-    if (biases_.get() != NULL) {
-      outputValue->addBias(*(biases_->getW()), 1);
-    }
-    /* activation */ { forwardActivation(); }
-  }
-}
-
-void MaxLayer::backward(const UpdateCallback& callback) {
-  CHECK(!config_.output_max_index())
-      << "backward is not available when output_max_index is set";
-  SequencePoolLayer::backward(callback);
-
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  if (inputGrad) {
-    REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
-    inputGrad->maxSequenceBackward(
-        *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
deleted file mode 100644
index 9dbc672652dc2670a775f02ecd3a9de9919c8ae0..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MaxLayer.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SequencePoolLayer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * A layer for "internal max" for sequence input.
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *    Output: output size is the number of input sequences (NOT input instances)
- *    output[i] = max_{for each instance in this sequence}{input[i]}
- *    If stride_ > 0:
- *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              window upon the input sequence, and the max pooling operation is
- *              then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *    Check input sequence must has sub-sequence
- *    Output: output size is the number of input sub-sequences
- *    output[i] = max_{for each instance in this sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-
-class MaxLayer : public SequencePoolLayer {
-protected:
-  // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
-  IVectorPtr maxIndex_;
-
-public:
-  explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    return SequencePoolLayer::init(layerMap, parameterMap);
-  }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h
deleted file mode 100644
index 1fb371836bacb9e02cc32eabfd21bf24165b0734..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * A layer to do max out on conv layer output.
- * Input: output of a conv layer.
- * Output: feature map size same as input.  Channel is (input channel) / groups.
- * So the num of channels should be able to devided by groups.
- *
- * The config file api is maxout_layer.
- */
-
-class MaxOutLayer : public Layer {
-protected:
-  size_t groups_;
-  size_t imgSizeH_, imgSizeW_;
-  /// outputChannels_ = channels_ / groups_
-  size_t channels_, outputChannels_;
-  /// feature length = imgSizeH_ * imgSizeW_
-  size_t featLen_;
-  IVectorPtr maxoutId_;
-
-public:
-  /// return imgSizeH_ * imgSizeW_ * outputChannels_;
-  size_t getSize();
-
-  explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
-  virtual ~MaxOutLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
deleted file mode 100644
index e594e22b5eaa6027fdf5bbd09ab93774d9a798be..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MaxPoolWithMaskLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  PoolLayer::init(layerMap, parameterMap);
-  setOutput("mask", &mask_);
-  return true;
-}
-
-size_t MaxPoolWithMaskLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-
-  outputY_ = outputSize(imgSizeY_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputX_ = outputSize(imgSize_,
-                        sizeX_,
-                        confPadding_,
-                        stride_,
-                        /* caffeMode */ false);
-
-  layerSize = outputX_ * outputY_ * channels_;
-  getOutput().setFrameHeight(outputY_);
-  getOutput().setFrameWidth(outputX_);
-
-  return layerSize;
-}
-
-void MaxPoolWithMaskLayer::forward(PassType passType) {
-  size_t size = getSize();
-  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
-  int batchSize = inputV->getHeight();
-  resetOutput(batchSize, size);
-
-  MatrixPtr outV = getOutputValue();
-  CHECK_EQ(size, outV->getWidth());
-
-  resetSpecifyOutput(mask_,
-                     batchSize,
-                     size,
-                     /* isValueClean */ false,
-                     /* isGradClean */ true);
-
-  MatrixPtr maskV = mask_.value;
-  outV->maxPoolForward(*inputV,
-                       imgSizeY_,
-                       imgSize_,
-                       channels_,
-                       sizeX_,
-                       sizeY_,
-                       strideY_,
-                       stride_,
-                       outputY_,
-                       outputX_,
-                       confPaddingY_,
-                       confPadding_,
-                       maskV);
-}
-
-void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-
-  MatrixPtr outGrad = getOutputGrad();
-  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
-
-  inputGrad->maxPoolBackward(*inputV,
-                             imgSizeY_,
-                             imgSize_,
-                             *outGrad,
-                             *outV,
-                             sizeX_,
-                             sizeY_,
-                             strideY_,
-                             stride_,
-                             outputY_,
-                             outputX_,
-                             1,
-                             1,
-                             confPaddingY_,
-                             confPadding_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
deleted file mode 100644
index 74cc8acf3515b10257ffb185061344fbcc94a337..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MaxPoolWithMaskLayer.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "PoolLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief Basic parent layer of different kinds of pooling
- */
-class MaxPoolWithMaskLayer : public PoolLayer {
-protected:
-  Argument mask_;
-
-public:
-  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
-      : PoolLayer(config) {}
-
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MixedLayer.cpp b/paddle/gserver/layers/MixedLayer.cpp
deleted file mode 100644
index 7dcb30b98d6e6b08929d5fecba0833c8b1989725..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MixedLayer.cpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MixedLayer.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(mixed, MixedLayer);
-
-bool MixedLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  if (!Layer::init(layerMap, parameterMap)) return false;
-
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  projections_.resize(inputLayers_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    if (config_.inputs(i).has_proj_conf()) {
-      projections_[i].reset(Projection::create(
-          config_.inputs(i).proj_conf(), parameters_[i], useGpu_));
-    } else {
-      CHECK(!parameters_[i]) << "should no parameters for operators";
-    }
-  }
-  for (auto& operator_conf : config_.operator_confs()) {
-    for (auto& input_index : operator_conf.input_indices()) {
-      CHECK(!config_.inputs(input_index).has_proj_conf());
-    }
-    operators_.emplace_back(Operator::create(operator_conf, useGpu_));
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    sharedBias_ = config_.shared_biases();
-    size_t psize = config_.bias_size();
-    biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
-  }
-
-  return true;
-}
-
-void MixedLayer::prefetch() {
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->prefetch(&getInput(i));
-    }
-  }
-}
-
-void MixedLayer::resetState() {
-  for (auto& proj : projections_) {
-    if (proj) {
-      proj->resetState();
-    }
-  }
-}
-
-void MixedLayer::setState(LayerStatePtr state) {
-  CHECK(projectionStateMatrixSize_.size() == projections_.size())
-      << "projection size mis-match";
-
-  int start = 0;
-  LayerStatePtr statePtr = std::make_shared<LayerState>();
-  for (int i = 0; i < (int)projectionStateMatrixSize_.size(); i++) {
-    if (projectionStateMatrixSize_[i] > 0) {
-      statePtr->value.clear();
-      for (int j = start; j < start + projectionStateMatrixSize_[i]; j++) {
-        statePtr->value.push_back(state->value[j]);
-      }
-      projections_[i]->setState(statePtr);
-      start += projectionStateMatrixSize_[i];
-    }
-  }
-  CHECK((int)state->value.size() == start) << "state matrix size mis-match";
-}
-
-// Return state which consists of all projections states
-LayerStatePtr MixedLayer::getState() {
-  bool init = projectionStateMatrixSize_.size() == 0;
-  LayerStatePtr res = std::make_shared<LayerState>();
-  for (int i = 0; i < (int)projections_.size(); i++) {
-    LayerStatePtr statePtr =
-        projections_[i] ? projections_[i]->getState() : nullptr;
-    int stateSize = statePtr == nullptr ? 0 : statePtr->value.size();
-    if (init) {
-      projectionStateMatrixSize_.push_back(stateSize);
-    } else {
-      CHECK(projectionStateMatrixSize_[i] == stateSize)
-          << "state matrix size mis-match";
-    }
-    if (statePtr != nullptr) {
-      for (auto& matrixPtr : statePtr->value) {
-        res->value.push_back(matrixPtr);
-      }
-    }
-  }
-  return res;
-}
-
-void MixedLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->forward(&getInput(i), &output_, passType);
-    }
-  }
-
-  std::vector<const Argument*> ins;
-  for (auto& op : operators_) {
-    ins.clear();
-    for (auto& input_index : op->getConfig().input_indices()) {
-      ins.push_back(&getInput(input_index));
-    }
-    op->forward(ins, &output_, passType);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
-    outV->addBias(*(biases_->getW()), 1, sharedBias_);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void MixedLayer::backward(const UpdateCallback& callback) {
-  /* Do activation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    if (projections_[i]) {
-      projections_[i]->backward(callback);
-    }
-  }
-
-  for (auto& op : operators_) {
-    op->backward();
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h
deleted file mode 100644
index a1a43c52e4f503178a66ad8aa6c12bec89566081..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MixedLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "Operator.h"
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * A mixed layer has multiple input layers.
- * Each input layer was processed by a Projection or Operator.
- * The results of all projections or Operators are summed together with bias
- * (if configured), and then go through an activation function and dropout
- * (if configured).
- *
- * The config file api is mixed_layer.
- */
-class MixedLayer : public Layer {
-public:
-  explicit MixedLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~MixedLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void prefetch() override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  void resetState() override;
-  /**
-   * setState() should be called after getState().
-   * Argument state consists of all projections states.
-   */
-  void setState(LayerStatePtr state) override;
-  /**
-   * Return state which consists of all projections states.
-   */
-  LayerStatePtr getState() override;
-
-protected:
-  std::vector<std::unique_ptr<Projection>> projections_;
-  std::vector<std::unique_ptr<Operator>> operators_;
-  /// the matrix size of projection state
-  std::vector<int> projectionStateMatrixSize_;
-  std::unique_ptr<Weight> biases_;
-  bool sharedBias_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h
deleted file mode 100644
index 9935da56446c1508549906becfd28548d5deecde..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MultiBoxLossLayer.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* copyright (c) 2016 paddlepaddle authors. all rights reserve.
-
-licensed under the apache license, version 2.0 (the "license");
-you may not use this file except in compliance with the license.
-you may obtain a copy of the license at
-
-    http://www.apache.org/licenses/license-2.0
-
-unless required by applicable law or agreed to in writing, software
-distributed under the license is distributed on an "as is" basis,
-without warranties or conditions of any kind, either express or implied.
-see the license for the specific language governing permissions and
-limitations under the license. */
-
-#pragma once
-
-#include <vector>
-#include "CostLayer.h"
-#include "DataLayer.h"
-#include "DetectionUtil.h"
-#include "Layer.h"
-
-using std::vector;
-using std::pair;
-
-namespace paddle {
-
-/**
- * The multibox loss layer for a SSD detection task.
- * The loss is composed by the location loss and the confidence loss.
- * The location loss is a smooth L1 loss and the confidence loss is
- * a softmax loss.
- * - Input: This layer needs four input layers: The first input layer
- *          is the priorbox layer and the second layer is a label layer.
- *          The rest two input layers are convolution layers for generating
- *          bbox location offset and the classification confidence.
- * - Output: The Single Shot Multibox Detection loss value.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-
-class MultiBoxLossLayer : public CostLayer {
-public:
-  explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr);
-
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
-
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
-
-protected:
-  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
-  inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
-  inline LayerPtr getLocInputLayer(size_t index) {
-    return inputLayers_[2 + index];
-  }
-  inline LayerPtr getConfInputLayer(size_t index) {
-    return inputLayers_[2 + inputNum_ + index];
-  }
-
-protected:
-  size_t numClasses_;
-  real overlapThreshold_;
-  real negPosRatio_;
-  real negOverlap_;
-  size_t inputNum_;
-  size_t backgroundId_;
-
-  real locLoss_;
-  real confLoss_;
-
-  size_t numPriors_;
-  size_t numMatches_;
-  size_t numNegs_;
-  size_t numConf_;
-  size_t locSizeSum_;
-  size_t confSizeSum_;
-
-  vector<vector<int>> allMatchIndices_;
-  vector<vector<int>> allNegIndices_;
-  MatrixPtr locGTData_;
-  IVectorPtr confGTData_;
-
-  MatrixPtr locBuffer_;
-  MatrixPtr confBuffer_;
-  MatrixPtr locDiff_;
-  MatrixPtr confProb_;
-
-  MatrixPtr labelCpuValue_;
-  MatrixPtr priorCpuValue_;
-  MatrixPtr locCpuBuffer_;
-  MatrixPtr confCpuBuffer_;
-  MatrixPtr locTmpBuffer_;
-  MatrixPtr confTmpBuffer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h
deleted file mode 100644
index 1f9e818ee5d21188e3bd39d1225912a1a2ae1598..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MultinomialSampler.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include <random>
-#include "paddle/utils/Common.h"
-
-namespace paddle {
-
-/**
- * @brief Given the probability of N objects, the sampler random select
- * one of the object.
- * @note: prob does not have to be unnormalized.
- *
- * The space requirement is O(N)=O(N * sizeof(Interval)).
- * The computational complexity of generate one sample is O(1).
- */
-class MultinomialSampler {
-public:
-  MultinomialSampler(const real* prob, int size);
-
-  //! protobuf always using double.
-  static MultinomialSampler* create(const double* prob, int size) {
-#ifdef PADDLE_TYPE_DOUBLE
-    return new MultinomialSampler(prob, size);
-#else
-    std::unique_ptr<real[]> tmp(new real[size]);
-    std::copy(prob, prob + size, tmp.get());
-    return new MultinomialSampler(tmp.get(), size);
-#endif
-  }
-
-  /**
-   * @brief Generate a random sample.
-   * @param g is a random number engine. See <random>.
-   * @return Random integer.
-   */
-  template <typename URNG>
-  int gen(URNG& g) {
-    return gen1([&g, this]() { return rand_(g); });
-  }
-
-protected:
-  /**
-   * @brief Generation
-   * @param[in] rand rand is a real random number distribution
-   * for the range [0, size).
-   * @return random int number or intervals_[random_int_number].otherId.
-   */
-  template <typename Rand>
-  int gen1(Rand rand) {
-    double r = rand();  // NOLINT
-    int i = (int)r;
-    r -= i;
-    return r < intervals_[i].thresh ? i : intervals_[i].otherId;
-  }
-
-  struct Interval {
-    int otherId;
-    real thresh;
-  };
-
-  /// The probability of each interval will be 1./size
-  std::vector<Interval> intervals_;
-  std::uniform_real_distribution<double> rand_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp
deleted file mode 100644
index 82857f8c3ef3e39ec451c1f26bac4996c12350a5..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- *@brief This layer multiplex multiple layers according to the index,
- * which is provided by the first input layer.
- * - Input[0]: the index of the layer to output of size batchSize.
- * - Input[1:N]; the candidate output data.
- * For each index i from 0 to batchSize -1, the output is the i-th row of the
- * (index[i] + 1)-th layer.
- *
- * For each i-th row of output:
- *
- * \f[
- *   y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1)
- * \f]
- * where, y is output. \f$x_{k}\f$ is the k-th input layer and
- * \f$k = x_{0}[i] + 1\f$.
- */
-
-class MultiplexLayer : public Layer {
-protected:
-  /**
-   * @brief A struct is used to save the copy information, includes input
-   * layer index and copy size.
-   */
-  struct CopyInfo {
-    CopyInfo(int inStartIdx, int inLength, int inCopyIdx)
-        : startIdx(inStartIdx), length(inLength), copyIdx(inCopyIdx) {}
-
-    /// The start row of input.
-    int startIdx;
-    /// Number of rows. If the layer index in Input[0] is not consecutive,
-    /// the length is one. Otherwise, the length is > 1 and copy multi rows
-    /// once.
-    int length;
-    /// The copied layer index, which needs to add 1.
-    int copyIdx;
-  };
-
-  /// A list of CopyInfo used to save copy information.
-  std::vector<CopyInfo> copySchedule_;
-
-  /// Temporary matrix pointer to point to input data.
-  MatrixPtr tmpSrc_;
-  /// Temporary matrix pointer to point to output data.
-  MatrixPtr tmpDest_;
-
-public:
-  explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~MultiplexLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-private:
-  /**
-   * @brief Calculate copy info for input layers.
-   */
-  void calculateCopySchedule(const IVectorPtr& copyIds, size_t numIns);
-};
-
-REGISTER_LAYER(multiplex, MultiplexLayer);
-
-void MultiplexLayer::calculateCopySchedule(const IVectorPtr& copyIds,
-                                           size_t numIns) {
-  copySchedule_.clear();
-  CopyInfo prevCopyInfo(0, 0, -1);
-  for (size_t i = 0; i < copyIds->getSize(); i++) {
-    int copyId = copyIds->getElement(i);
-    CHECK_GE(copyId, 0);
-    CHECK_LT(copyId, int(numIns));
-    // copy same input layer with prevous and will copy consecutive.
-    if (copyId == prevCopyInfo.copyIdx) {
-      ++prevCopyInfo.length;
-    } else {
-      if (prevCopyInfo.copyIdx != -1) {
-        copySchedule_.emplace_back(prevCopyInfo);
-      }
-      prevCopyInfo.startIdx = i;
-      prevCopyInfo.length = 1;
-      prevCopyInfo.copyIdx = copyId;
-    }
-  }
-  if (prevCopyInfo.copyIdx != -1) {
-    copySchedule_.emplace_back(prevCopyInfo);
-  }
-}
-
-bool MultiplexLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_GE(inputLayers_.size(), 2U);
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  return true;
-}
-
-void MultiplexLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  IVectorPtr copyIds = getInput(0).ids;
-  MatrixPtr inV1 = getInputValue(1);
-  CHECK_EQ(copyIds->getSize(), inV1->getHeight());
-  for (size_t i = 2; i < inputLayers_.size(); i++) {
-    CHECK_EQ(inV1->getHeight(), getInputValue(i)->getHeight());
-    CHECK_EQ(inV1->getWidth(), getInputValue(i)->getWidth());
-  }
-
-  calculateCopySchedule(copyIds, inputLayers_.size() - 1);
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(inV1->getHeight(), inV1->getWidth());
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwLMultplexingTimer", getName().c_str());
-    AsyncGpuBlock block;
-    for (const CopyInfo& info : copySchedule_) {
-      outV->subMatrix(info.startIdx, info.length, tmpDest_)
-          ->copyFrom(*getInputValue(info.copyIdx + 1)
-                          ->subMatrix(info.startIdx, info.length, tmpSrc_));
-    }
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void MultiplexLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  MatrixPtr outG = getOutputGrad();
-
-  {
-    REGISTER_TIMER_INFO("BwLMultiplexTimer", getName().c_str());
-    AsyncGpuBlock block;
-    for (const CopyInfo& info : copySchedule_) {
-      if (getInputGrad(info.copyIdx + 1)) {
-        getInputGrad(info.copyIdx + 1)
-            ->subMatrix(info.startIdx, info.length, tmpDest_)
-            ->add(*outG->subMatrix(info.startIdx, info.length, tmpSrc_));
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
deleted file mode 100644
index d3d7b1fd9ac3c366d11c3060848e89c24a16a70b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/NCELayer.cpp
+++ /dev/null
@@ -1,323 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-
-#include "Layer.h"
-#include "MultinomialSampler.h"
-#include "paddle/math/MathFunctions.h"
-
-namespace paddle {
-
-/**
- * Noise-contrastive estimation.
- * Implements the method in the following paper:
- * A fast and simple algorithm for training neural probabilistic language
- * models.
- *
- * The config file api is nce_layer.
- */
-class NCELayer : public Layer {
-  int numClasses_;
-  /// number of input layer besides labelLayer and weightLayer
-  int numInputs_;
-  LayerPtr labelLayer_;
-  /// weight layer, can be None
-  LayerPtr weightLayer_;
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-  std::unique_ptr<MultinomialSampler> sampler_;
-
-  std::uniform_int_distribution<int> rand_;
-
-  struct Sample {
-    int sampleId;
-    int labelId;
-    bool target;
-    real weight;
-  };
-  std::vector<Sample> samples_;
-  /// whether samples_ is prepared
-  bool prepared_;
-  Argument sampleOut_;
-
-  IVectorPtr labelIds_;
-
-public:
-  explicit NCELayer(const LayerConfig& config)
-      : Layer(config),
-        numClasses_(config.num_classes()),
-        rand_(0, config.num_classes() - 1),
-        prepared_(false) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    /* Initialize the basic parent class */
-    Layer::init(layerMap, parameterMap);
-
-    /* initialize the weightList */
-    size_t i;
-    for (i = 0; i < inputLayers_.size(); i++) {
-      if (!parameters_[i]) break;
-      size_t width = inputLayers_[i]->getSize();
-      // create a new weight
-      CHECK_EQ(parameters_[i]->getSize(), width * numClasses_);
-      Weight* w = new Weight(numClasses_, width, parameters_[i]);
-
-      // append the new weight to the list
-      weights_.emplace_back(w);
-    }
-
-    CHECK_EQ(1U, getSize());
-
-    numInputs_ = i;
-    CHECK_GE(numInputs_, 1)
-        << "Must have at least one input besides label and weight";
-    CHECK_LT(i, inputLayers_.size()) << "Missing label layer";
-    labelLayer_ = inputLayers_[i];
-    if (++i < inputLayers_.size()) {
-      weightLayer_ = inputLayers_[i];
-      ++i;
-    }
-    CHECK_EQ(i, inputLayers_.size());
-
-    /* initialize biases_ */
-    if (biasParameter_.get() != NULL) {
-      CHECK_EQ(biasParameter_->getSize(), (size_t)numClasses_);
-      biases_.reset(new Weight(1, numClasses_, biasParameter_));
-    }
-
-    if (config_.neg_sampling_dist_size()) {
-      CHECK_EQ(numClasses_, config_.neg_sampling_dist_size());
-      sampler_.reset(MultinomialSampler::create(
-          config_.neg_sampling_dist().data(), numClasses_));
-    }
-
-    return true;
-  }
-
-  void prepareSamples() {
-    CHECK(!useGpu_) << "GPU is not supported";
-
-    int batchSize = getInput(*labelLayer_).getBatchSize();
-    IVectorPtr label = getInput(*labelLayer_).ids;
-
-    CpuSparseMatrixPtr multiLabel = std::dynamic_pointer_cast<CpuSparseMatrix>(
-        getInput(*labelLayer_).value);
-
-    CHECK(label || multiLabel)
-        << "The label layer must have ids or NonValueSparseMatrix value";
-
-    auto& randEngine = ThreadLocalRandomEngine::get();
-
-    samples_.clear();
-    samples_.reserve(batchSize * (1 + config_.num_neg_samples()));
-
-    real* weight =
-        weightLayer_ ? getInputValue(*weightLayer_)->getData() : nullptr;
-
-    for (int i = 0; i < batchSize; ++i) {
-      real w = weight ? weight[i] : 1;
-      if (label) {
-        int* ids = label->getData();
-        samples_.push_back({i, ids[i], true, w});
-      } else {
-        const int* cols = multiLabel->getRowCols(i);
-        int n = multiLabel->getColNum(i);
-        for (int j = 0; j < n; ++j) {
-          samples_.push_back({i, cols[j], true, w});
-        }
-      }
-      for (int j = 0; j < config_.num_neg_samples(); ++j) {
-        int id = sampler_ ? sampler_->gen(randEngine) : rand_(randEngine);
-        samples_.push_back({i, id, false, w});
-      }
-    }
-    prepared_ = true;
-  }
-
-  void prefetch() override {
-    prepareSamples();
-    IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_);
-    int* ids = labelIds_->getData();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      ids[i] = samples_[i].labelId;
-    }
-
-    for (int i = 0; i < numInputs_; ++i) {
-      auto sparseParam =
-          dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
-      if (sparseParam) {
-        sparseParam->addRows(labelIds_);
-      }
-    }
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-
-    CHECK(!useGpu_) << "GPU is not supported";
-
-    if (!prepared_) {
-      if (passType == PASS_GC) {
-        ThreadLocalRandomEngine::get().seed(ThreadLocalRand::getDefaultSeed());
-      }
-      prepareSamples();
-    }
-    prepared_ = false;
-
-    /* malloc memory for the output_ if necessary */
-    int batchSize = getInputValue(0)->getHeight();
-    int size = getSize();
-    resetOutput(batchSize, size);
-
-    Matrix::resizeOrCreate(sampleOut_.value,
-                           1,
-                           samples_.size(),
-                           /* trans= */ false,
-                           useGpu_);
-
-    forwardBias();
-
-    for (int l = 0; l < numInputs_; ++l) {
-      forwardOneInput(l);
-    }
-
-    auto status = activation_->forward(sampleOut_);
-    status.check();
-
-    forwardCost();
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    Matrix::resizeOrCreate(sampleOut_.grad,
-                           1,
-                           samples_.size(),
-                           /* trans= */ false,
-                           useGpu_);
-
-    backwardCost();
-
-    auto status = activation_->backward(sampleOut_);
-    status.check();
-
-    if (biases_->getWGrad()) {
-      backwardBias(callback);
-    }
-
-    for (int l = 0; l < numInputs_; ++l) {
-      backwardOneInput(l, callback);
-    }
-  }
-
-  void forwardBias() {
-    if (!biases_) {
-      sampleOut_.value->zeroMem();
-    } else {
-      real* bias = biases_->getW()->getData();
-      real* sampleOut = sampleOut_.value->getData();
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        sampleOut[i] = bias[samples_[i].labelId];
-      }
-    }
-  }
-
-  void backwardBias(const UpdateCallback& callback) {
-    if (!biases_) return;
-    real* bias = biases_->getWGrad()->getData();
-    real* sampleOut = sampleOut_.grad->getData();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      bias[samples_[i].labelId] += sampleOut[i];
-    }
-    biases_->incUpdate(callback);
-  }
-
-  void forwardOneInput(int layerId) {
-    const MatrixPtr& inputMat = getInputValue(layerId);
-    const MatrixPtr& weightMat = weights_[layerId]->getW();
-
-    int dim = inputMat->getWidth();
-    real* sampleOut = sampleOut_.value->getData();
-
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      sampleOut[i] += dotProduct(dim,
-                                 inputMat->getRowBuf(samples_[i].sampleId),
-                                 weightMat->getRowBuf(samples_[i].labelId));
-    }
-  }
-
-  void backwardOneInput(int layerId, const UpdateCallback& callback) {
-    const MatrixPtr& inputMat = getInputValue(layerId);
-    const MatrixPtr& inputGradMat = getInputGrad(layerId);
-    const MatrixPtr& weightMat = weights_[layerId]->getW();
-    const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad();
-
-    int dim = inputMat->getWidth();
-    real* sampleGrad = sampleOut_.grad->getData();
-
-    if (weightGradMat) {
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        axpy(dim,
-             sampleGrad[i],
-             inputMat->getRowBuf(samples_[i].sampleId),
-             weightGradMat->getRowBuf(samples_[i].labelId));
-      }
-      weights_[layerId]->incUpdate(callback);
-    }
-
-    if (inputGradMat) {
-      for (size_t i = 0; i < samples_.size(); ++i) {
-        axpy(dim,
-             sampleGrad[i],
-             weightMat->getRowBuf(samples_[i].labelId),
-             inputGradMat->getRowBuf(samples_[i].sampleId));
-      }
-    }
-  }
-
-  void forwardCost() {
-    real* out = output_.value->getData();
-    real* sampleOut = sampleOut_.value->getData();
-    real b = 1. / numClasses_ * config_.num_neg_samples();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      real o = sampleOut[i];
-      if (sampler_) {
-        b = config_.num_neg_samples() *
-            config_.neg_sampling_dist(samples_[i].labelId);
-      }
-      real cost = samples_[i].target ? -log(o / (o + b)) : -log(b / (o + b));
-      out[samples_[i].sampleId] += samples_[i].weight * cost;
-    }
-  }
-
-  void backwardCost() {
-    real* sampleOut = sampleOut_.value->getData();
-    real* sampleGrad = sampleOut_.grad->getData();
-
-    real b = 1. / numClasses_ * config_.num_neg_samples();
-    for (size_t i = 0; i < samples_.size(); ++i) {
-      real o = sampleOut[i];
-      if (sampler_) {
-        b = config_.num_neg_samples() *
-            config_.neg_sampling_dist(samples_[i].labelId);
-      }
-      real w = samples_[i].weight;
-      sampleGrad[i] = samples_[i].target ? -w * b / (o * (o + b)) : w / (o + b);
-    }
-  }
-};
-
-REGISTER_LAYER(nce, NCELayer);
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp
deleted file mode 100644
index 4678f6fa9ab184870fc2651def18f47da9a0cc01..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/NormLayer.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NormLayer.h"
-#include "NormProjectionLayer.h"
-#include "paddle/utils/Logging.h"
-namespace paddle {
-
-REGISTER_LAYER_CREATE_FUNC(norm, &NormLayer::create);
-
-Layer* NormLayer::create(const LayerConfig& config) {
-  CHECK_EQ(config.inputs_size(), 1);
-  const std::string& norm = config.inputs(0).norm_conf().norm_type();
-  if (norm == "rnorm") {
-    return new ResponseNormLayer(config);
-  } else if (norm == "cmrnorm-projection") {
-    return new CMRProjectionNormLayer(config);
-  } else if (norm == "cross-channel-norm") {
-    return new CrossChannelNormLayer(config);
-  } else {
-    LOG(FATAL) << "Unknown norm type: " << norm;
-    return nullptr;
-  }
-}
-
-bool ResponseNormLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  NormLayer::init(layerMap, parameterMap);
-
-  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  channels_ = conf.channels();
-  size_ = conf.size();
-  scale_ = conf.scale();
-  pow_ = conf.pow();
-  outputX_ = conf.output_x();
-  imgSize_ = conf.img_size();
-  denoms_ = NULL;
-
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  return true;
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
deleted file mode 100644
index c89cbbfce9d9e35a6dd300864ee094ef8f9e283a..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/NormLayer.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "NormLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of normalization
- *
- * @note Normalize the input in local region
- */
-class NormLayer : public Layer {
-public:
-  explicit NormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    Layer::init(layerMap, parameterMap);
-    return true;
-  }
-
-  /**
-   * @brief create norm layer by norm_type
-   */
-  static Layer* create(const LayerConfig& config);
-};
-
-/**
- * @brief response normalization within feature maps
- * namely normalize in independent channel
- * When code refactoring, we delete the original implementation.
- * Need to implement in the futrue.
- */
-class ResponseNormLayer : public NormLayer {
-protected:
-  size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
-  real scale_, pow_;
-  MatrixPtr denoms_;
-
-public:
-  explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; }
-  void backward(const UpdateCallback& callback = nullptr) override {
-    LOG(FATAL) << "Not implemented";
-  }
-};
-
-/**
- * This layer applys normalization across the channels of each sample to a
- * conv layer's output, and scales the output by a group of trainable factors
- * whose dimensions equal to the number of channels.
- * - Input: One and only one input layer are accepted.
- * - Output: The normalized data of the input data.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-class CrossChannelNormLayer : public NormLayer {
-public:
-  explicit CrossChannelNormLayer(const LayerConfig& config)
-      : NormLayer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
-  MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
-  MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
-
-protected:
-  size_t channels_;
-  std::unique_ptr<Weight> scale_;
-  MatrixPtr scaleDiff_;
-  MatrixPtr normBuffer_;
-  MatrixPtr dataBuffer_;
-  MatrixPtr channelBuffer_;
-  MatrixPtr spatialBuffer_;
-  MatrixPtr sampleBuffer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
deleted file mode 100644
index 3013bbdbc791546897fca51e73a056f2c843e63f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NormProjectionLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-size_t CMRProjectionNormLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = imgSizeY_;
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = imgSize_;
-  }
-  outputH_ = imgSizeH_;
-  outputW_ = imgSizeW_;
-  layerSize = outputH_ * outputW_ * channels_;
-
-  getOutput().setFrameHeight(outputH_);
-  getOutput().setFrameWidth(outputW_);
-  return layerSize;
-}
-
-bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  ResponseNormLayer::init(layerMap, parameterMap);
-
-  /* the size of inputs for norm-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  createFunction(
-      forward_,
-      "CrossMapNormal",
-      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
-  createFunction(
-      backward_,
-      "CrossMapNormalGrad",
-      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
-
-  return true;
-}
-
-void CMRProjectionNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  /* malloc memory for the output_ if necessary */
-  /* note: one sample correspond to one row */
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  int size = getSize();
-  resetOutput(batchSize, size);
-
-  Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
-
-  shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-
-  // prepare forward arguments
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), shape_);
-  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
-  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
-
-  forward_[0]->calc(inputs, outputs);
-}
-
-void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-
-  // prepare backward arguments
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), shape_);
-  inputs.addArg(*getOutputValue(), shape_);
-  inputs.addArg(*getOutputGrad(), shape_);
-  inputs.addArg(*denoms_, shape_);
-  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
-
-  backward_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
deleted file mode 100644
index 898b5823a9011c4b66e045c54afba070dd5cf772..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "NormLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief response normalization across feature maps
- * namely normalize in number of size_ channels
- */
-class CMRProjectionNormLayer : public ResponseNormLayer {
-  size_t imgSizeH_, imgSizeW_;
-  size_t outputH_, outputW_;
-
-public:
-  explicit CMRProjectionNormLayer(const LayerConfig& config)
-      : ResponseNormLayer(config) {}
-
-  ~CMRProjectionNormLayer() {}
-
-  size_t getSize();
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-protected:
-  TensorShape shape_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/Operator.h b/paddle/gserver/layers/Operator.h
deleted file mode 100644
index a620926cccd3004d7bef57976047a190b4b566e2..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Operator.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ModelConfig.pb.h"
-#include "paddle/parameter/Parameter.h"
-
-#include "Layer.h"
-#include "paddle/parameter/Argument.h"
-
-namespace paddle {
-
-// Macro for registering a operator type
-// Example: REGISTER_OPERATOR(dot_mul, DotMulOperator);
-#define REGISTER_OPERATOR(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {               \
-    Operator::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-/**
- * Operator like Projection, but takes more than one Arguments as input.
- * @note: Operator can't have parameters.
- */
-class Operator {
-public:
-  static Operator* create(const OperatorConfig& config, bool useGpu);
-
-  Operator(const OperatorConfig& config, bool useGpu)
-      : config_(config), useGpu_(useGpu) {}
-
-  virtual ~Operator() {}
-
-  const OperatorConfig& getConfig() const { return config_; }
-
-  static ClassRegistrar<Operator, OperatorConfig, bool> registrar_;
-
-  /**
-   * Forward propagation. If backward() will be called, in and out must be kept
-   * valid until then.
-   * @param ins inputs of operator
-   * @param out output of operator
-   * @param passType PASS_TRAIN of PASS_TEST
-   */
-  void forward(std::vector<const Argument*> ins,
-               Argument* out,
-               PassType passType) {
-    ins_ = ins;
-    out_ = out;
-    passType_ = passType;
-    forward();
-  }
-
-  virtual void prefetch(const Argument* in) {}
-  virtual void forward() = 0;
-  virtual void backward() = 0;
-
-  /**
-   * See comment in Layer.h for the function with the same name.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Set layer state.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
-protected:
-  /// Config of operator
-  OperatorConfig config_;
-  bool useGpu_;
-
-  /// Store `ins` passed to forward()
-  std::vector<const Argument*> ins_;
-  /// Store `out` passed to forward()
-  Argument* out_;
-  /// Store `passType` passed to forward()
-  PassType passType_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
deleted file mode 100644
index 75f4abf93e5db11dc688f8f2e0b2a36bf70fbccc..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for computing the outer product of two vectors
- * @note used in NEURAL TURING MACHINE
- * Input1: vector (batchSize * dim1)
- * Input2: vector (batchSize * dim2)
- * Output: a matrix: (batchSize * (dim1*dim2))
- */
-
-class OuterProdLayer : public Layer {
-protected:
-  MatrixPtr tmpMtx0;
-  MatrixPtr tmpRow0;
-  MatrixPtr tmpRow1;
-
-public:
-  explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~OuterProdLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(out_prod, OuterProdLayer);
-
-bool OuterProdLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  size_t dim0 = inputLayers_[0]->getSize();
-  size_t dim1 = inputLayers_[1]->getSize();
-
-  CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch";
-
-  tmpRow0 = Matrix::create(
-      nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_);
-  tmpRow1 = Matrix::create(
-      nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_);
-  tmpMtx0 = Matrix::create(nullptr,
-                           /* height= */ dim0,
-                           dim1,
-                           /* trans= */ false,
-                           useGpu_);
-  return true;
-}
-
-void OuterProdLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dim0 = inV0->getWidth();
-  size_t dim1 = inV1->getWidth();
-
-  CHECK_EQ(dim0 * dim1, getSize());
-  CHECK_EQ(inV1->getHeight(), batchSize);
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dim0 * dim1);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("FwOutProdTimer", getName().c_str());
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpMtx0->setData(outV->getData() + i * dim0 * dim1);
-      tmpRow0->setData(inV0->getData() + i * dim0);
-      tmpRow1->setData(inV1->getData() + i * dim1);
-
-      tmpMtx0->mul(*tmpRow0->getTranspose(), *tmpRow1);
-    }
-  }
-}
-
-void OuterProdLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr outG = getOutputGrad();
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-
-  size_t batchSize = inV0->getHeight();
-  size_t dim0 = inV0->getWidth();
-  size_t dim1 = inV1->getWidth();
-
-  {
-    REGISTER_TIMER_INFO("BwOutProdTimer", getName().c_str());
-
-    if (inG0) {
-      for (size_t i = 0; i < batchSize; i++) {
-        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
-        tmpRow0->setData(inG0->getData() + i * dim0);
-        tmpRow1->setData(inV1->getData() + i * dim1);
-
-        tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1);
-      }
-    }
-
-    if (inG1) {
-      for (size_t i = 0; i < batchSize; i++) {
-        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
-        tmpRow0->setData(inV0->getData() + i * dim0);
-        tmpRow1->setData(inG1->getData() + i * dim1);
-
-        tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1);
-      }
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp
deleted file mode 100644
index b1910e108b5b2f7b55a2aa1527b96e6b8a16f348..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PadLayer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PadLayer.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(pad, PadLayer);
-
-bool PadLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  auto& pad_conf = config_.inputs(0).pad_conf();
-  auto& img_conf = pad_conf.image_conf();
-  CHECK_EQ(config_.inputs_size(), 1);
-  inDims_ = TensorShape(
-      {0,
-       img_conf.channels(),
-       img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(),
-       img_conf.img_size()});
-
-  CHECK_EQ(2, pad_conf.pad_c_size());
-  CHECK_EQ(2, pad_conf.pad_h_size());
-  CHECK_EQ(2, pad_conf.pad_w_size());
-  padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)};
-  padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)};
-  padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)};
-
-  outDims_ = TensorShape(4);
-  setOutDims(0);
-
-  createFunction(forward_,
-                 "Pad",
-                 FuncConfig()
-                     .set("channel", padc_)
-                     .set("height", padh_)
-                     .set("width", padw_));
-  createFunction(backward_,
-                 "PadGrad",
-                 FuncConfig()
-                     .set("channel", padc_)
-                     .set("height", padh_)
-                     .set("width", padw_));
-
-  return true;
-}
-
-void PadLayer::setOutDims(const size_t batchSize) {
-  outDims_.reshape({batchSize,
-                    inDims_[1] + padc_[0] + padc_[1],
-                    inDims_[2] + padh_[0] + padh_[1],
-                    inDims_[3] + padw_[0] + padw_[1]});
-}
-
-void PadLayer::setTensorDim(const size_t batchSize) {
-  CHECK_EQ(static_cast<int>(inputLayers_.size()), 1);
-  inDims_.setDim(0, batchSize);
-  int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_.setDim(2, h);
-  int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_.setDim(3, w);
-  setOutDims(batchSize);
-}
-
-void PadLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  setTensorDim(batchSize);
-  int size = outDims_[1] * outDims_[2] * outDims_[3];
-  resetOutput(batchSize, size);
-  MatrixPtr outV = getOutputValue();
-  REGISTER_TIMER_INFO("PadForward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inDims_);
-  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
-  forward_[0]->calc(inputs, outputs);
-}
-
-void PadLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  REGISTER_TIMER_INFO("PadBackward", getName().c_str());
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), outDims_);
-  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
-  backward_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
deleted file mode 100644
index 7e09d7f8a0d4dfd5300298ad0514b69781d87016..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PadLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  This layer pads zeros to inputs according to the specify dimension.
- *         The input and output is a 4D tensor. Padding zeros from the 2nd to
- *         the 4th dimenstion according padc_, padh_ and padw_.
- */
-class PadLayer : public Layer {
-public:
-  explicit PadLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~PadLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-protected:
-  void setOutDims(const size_t batchSize);
-  void setTensorDim(const size_t batchSize);
-
-  std::vector<uint32_t> padc_;
-  std::vector<uint32_t> padh_;
-  std::vector<uint32_t> padw_;
-  TensorShape inDims_;
-  TensorShape outDims_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ParameterReluLayer.cpp b/paddle/gserver/layers/ParameterReluLayer.cpp
deleted file mode 100644
index 12d04fc1c3ca169179beafc372a07a2e6d0a1773..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ParameterReluLayer.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterReluLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(prelu, ParameterReluLayer);
-
-bool ParameterReluLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  CHECK_EQ(inputLayers_.size(), parameters_.size());
-  partialSum_ = config_.partial_sum();
-  CHECK_GT(partialSum_, 0UL) << "partial_sum must be larger than zero.";
-  CHECK(!(inputLayers_[0]->getSize() % partialSum_))
-      << "Incorrect value for partialSum: " << partialSum_
-      << " must divide input size: " << inputLayers_[0]->getSize();
-  CHECK_EQ(getSize() / partialSum_, parameters_[0]->getSize());
-  weight_ = std::unique_ptr<Weight>(new Weight(
-      1UL, inputLayers_[0]->getSize() / partialSum_, parameters_[0]));
-  return true;
-}
-
-void ParameterReluLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInput(0).getBatchSize();
-  int size = getSize();
-  reserveOutput(batchSize, size);
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    outV->paramReluForward(*(getInput(0).value), *(weight_->getW()));
-  }
-}
-
-void ParameterReluLayer::backward(const UpdateCallback& callback) {
-  if (weight_->getWGrad()) {
-    weight_->getWGrad()->paramReluBackwardW(*getOutputGrad(),
-                                            *(getInputValue(0)));
-  }
-
-  MatrixPtr preGrad = getInputGrad(0);
-  preGrad->paramReluBackwardDiff(
-      *getOutputGrad(), *(getInputValue(0)), *(weight_->getW()));
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h
deleted file mode 100644
index 3725fa4a1199285b703590255af492ebffdaab2c..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ParameterReluLayer.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- *  @brief ParameterReluLayer active inputs with learnable parameter weight_.
- *  forward:
- *  \f[
- *      y = x > 0 ? x : w .* x
- *  \f]
- *  backward:
- *  \f[
- *      dx = x > 0 ? dy : w .* dy \\
- *      dw = x > 0 ? 0 : dy.*x
- *  \f]
- *  Here, x is the input, w is the weight, y is the output.
- *  dx, dw, dy is the gradient.
- */
-
-class ParameterReluLayer : public Layer {
-protected:
-  std::unique_ptr<Weight> weight_;
-
-  /**
-   *  @brief partialSum_ makes a group of inputs share same weights,
-   *  - partialSum_ = 1:
-   *       element wise activation: each element has a weight_,
-   *  - partialSum_ = number of elements in one channel,
-   *       channels wise parameter activation, elements in a channel
-   *       share same weight_,
-   *  - partialSum_ = number of outputs
-   *       all elements share same weight_,
-   */
-  size_t partialSum_;
-
-public:
-  explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ParameterReluLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/gserver/layers/Pool3DLayer.cpp
deleted file mode 100644
index 3ac9eb0d8198814c9f01fe101a60ab1f1f431062..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Pool3DLayer.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Pool3DLayer.h"
-#include "PoolProjectionLayer.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-REGISTER_LAYER(pool3d, Pool3DLayer);
-
-bool Pool3DLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  poolType_ = conf.pool_type();
-  channels_ = conf.channels();
-
-  sizeX_ = conf.size_x();
-  sizeY_ = conf.size_y();
-  sizeZ_ = conf.size_z();
-
-  strideW_ = conf.stride();
-  strideH_ = conf.stride_y();
-  strideD_ = conf.stride_z();
-
-  imgSizeW_ = conf.img_size();
-  imgSizeH_ = conf.img_size_y();
-  imgSizeD_ = conf.img_size_z();
-
-  paddingW_ = conf.padding();
-  paddingH_ = conf.padding_y();
-  paddingD_ = conf.padding_z();
-
-  outputW_ = conf.output_x();
-  outputH_ = conf.output_y();
-  outputD_ = conf.output_z();
-
-  return true;
-}
-
-size_t Pool3DLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-
-  size_t layerSize = 0;
-  outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false);
-  outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false);
-  outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false);
-
-  layerSize = outputD_ * outputH_ * outputW_ * channels_;
-  getOutput().setFrameHeight(outputH_);
-  getOutput().setFrameWidth(outputW_);
-  getOutput().setFrameDepth(outputD_);
-  return layerSize;
-}
-
-void Pool3DLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
-  size_t batchSize = inMat->getHeight();
-  size_t outWidth = getSize();
-  resetOutput(batchSize, outWidth);
-  Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_);
-  const MatrixPtr outMat = getOutputValue();
-
-  if (poolType_ == "avg") {
-    outMat->avgPool3DForward(*inMat,
-                             channels_,
-                             imgSizeD_,
-                             imgSizeH_,
-                             imgSizeW_,
-                             outputD_,
-                             outputH_,
-                             outputW_,
-                             sizeZ_,
-                             sizeY_,
-                             sizeX_,
-                             strideD_,
-                             strideH_,
-                             strideW_,
-                             paddingD_,
-                             paddingH_,
-                             paddingW_);
-  } else if (poolType_ == "max") {
-    outMat->maxPool3DForward(*inMat,
-                             *maxPoolIdx_,
-                             channels_,
-                             imgSizeD_,
-                             imgSizeH_,
-                             imgSizeW_,
-                             outputD_,
-                             outputH_,
-                             outputW_,
-                             sizeZ_,
-                             sizeY_,
-                             sizeX_,
-                             strideD_,
-                             strideH_,
-                             strideW_,
-                             paddingD_,
-                             paddingH_,
-                             paddingW_);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << poolType_;
-  }
-  forwardActivation();
-}
-
-void Pool3DLayer::backward(const UpdateCallback& callback) {
-  backwardActivation();
-
-  (void)callback;
-  if (NULL == getInputGrad(0)) return;
-  MatrixPtr inMat = inputLayers_[0]->getOutputValue();
-  MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
-  MatrixPtr outMat = getOutputValue();
-  MatrixPtr outGradMat = getOutputGrad();
-
-  if (poolType_ == "avg") {
-    inGradMat->avgPool3DBackward(*outGradMat,
-                                 imgSizeD_,
-                                 imgSizeH_,
-                                 imgSizeW_,
-                                 outputD_,
-                                 outputH_,
-                                 outputW_,
-                                 sizeZ_,
-                                 sizeY_,
-                                 sizeZ_,
-                                 strideD_,
-                                 strideH_,
-                                 strideW_,
-                                 paddingD_,
-                                 paddingH_,
-                                 paddingW_,
-                                 1.0,
-                                 1.0);
-  } else if (poolType_ == "max") {
-    inGradMat->maxPool3DBackward(*outGradMat,
-                                 *maxPoolIdx_,
-                                 imgSizeD_,
-                                 imgSizeH_,
-                                 imgSizeW_,
-                                 outputD_,
-                                 outputH_,
-                                 outputW_,
-                                 sizeZ_,
-                                 sizeY_,
-                                 sizeZ_,
-                                 strideD_,
-                                 strideH_,
-                                 strideW_,
-                                 paddingD_,
-                                 paddingH_,
-                                 paddingW_,
-                                 1.0,
-                                 1.0);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << poolType_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/Pool3DLayer.h b/paddle/gserver/layers/Pool3DLayer.h
deleted file mode 100644
index 59ee73f7cb9fb4287c12f3c7d0cacfc812484770..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Pool3DLayer.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of pooling
- * Pools the input within regions
- */
-class Pool3DLayer : public Layer {
-public:
-  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
-  ~Pool3DLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  size_t getSize();
-
-protected:
-  int channels_;
-  int sizeX_, sizeY_, sizeZ_;
-  int strideW_, strideH_, strideD_;
-  int paddingW_, paddingH_, paddingD_;
-  int imgSizeW_, imgSizeH_, imgSizeD_;
-  int outputW_, outputH_, outputD_;
-  std::string poolType_;
-  MatrixPtr maxPoolIdx_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
deleted file mode 100644
index ee589e6be51b1e66984f5a1d808b73aab962821d..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolLayer.h"
-#include "MaxPoolWithMaskLayer.h"
-#include "PoolProjectionLayer.h"
-#include "paddle/utils/Logging.h"
-#ifdef PADDLE_WITH_CUDA
-#include "CudnnPoolLayer.h"
-#endif
-namespace paddle {
-
-REGISTER_LAYER_CREATE_FUNC(pool, &PoolLayer::create);
-
-bool PoolLayer::init(const LayerMap& layerMap,
-                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for pool-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  const PoolConfig& conf = config_.inputs(0).pool_conf();
-  poolType_ = conf.pool_type();
-  channels_ = conf.channels();
-  sizeX_ = conf.size_x();
-  stride_ = conf.stride();
-  outputX_ = conf.output_x();
-  imgSize_ = conf.img_size();
-  confPadding_ = conf.padding();
-
-  sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
-  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-
-  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
-  return true;
-}
-
-Layer* PoolLayer::create(const LayerConfig& config) {
-  CHECK_EQ(config.inputs_size(), 1);
-  const std::string& pool = config.inputs(0).pool_conf().pool_type();
-  if (pool == "max-projection" || pool == "avg-projection") {
-    return new PoolProjectionLayer(config);
-#ifdef PADDLE_WITH_CUDA
-  } else if (CudnnPoolLayer::typeCheck(pool)) {
-    return new CudnnPoolLayer(config);
-#endif
-  } else if (pool == "max-pool-with-mask") {
-    return new MaxPoolWithMaskLayer(config);
-  } else {
-    LOG(FATAL) << "Unknown pool type: " << pool;
-    return nullptr;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
deleted file mode 100644
index 58d5fb0a095e8326f9b6f9cb2a97bb88022ceed8..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PoolLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief Basic parent layer of pooling
- * Pools the input within regions
- */
-class PoolLayer : public Layer {
-protected:
-  size_t channels_, sizeX_, stride_, outputX_, imgSize_;
-  int confPadding_;
-
-  size_t sizeY_;
-  size_t imgSizeY_;
-  size_t strideY_;
-  size_t outputY_;
-  int confPaddingY_;
-
-  std::string poolType_;
-
-  bool excludeMode_;
-
-public:
-  explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  /**
-   * @brief create pooling layer by pool_type
-   */
-  static Layer* create(const LayerConfig& config);
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
deleted file mode 100644
index c99287dbf0f4503c180b9b4e9e46abafa67bf64d..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PoolProjection.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-#include "paddle/math/MathUtils.h"
-
-namespace paddle {
-
-class PoolProjection : public Projection {
-protected:
-  size_t imgSizeY_, imgSize_;
-  size_t outputY_, outputX_;
-  size_t strideY_, stride_;
-  size_t sizeY_, sizeX_;
-  int confPaddingY_, confPadding_;
-  size_t channels_;
-  std::string poolType_;
-  bool excludeMode_;
-
-public:
-  PoolProjection(const ProjectionConfig& config,
-                 ParameterPtr parameter,
-                 bool useGpu);
-
-  static PoolProjection* create(const ProjectionConfig& config,
-                                ParameterPtr parameter,
-                                bool useGpu);
-
-  const std::string& getPoolType() const { return poolType_; }
-
-  size_t getSize();
-};
-
-class MaxPoolProjection : public PoolProjection {
-public:
-  MaxPoolProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu)
-      : PoolProjection(config, parameter, useGpu) {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback = nullptr);
-};
-
-class AvgPoolProjection : public PoolProjection {
-public:
-  AvgPoolProjection(const ProjectionConfig& config,
-                    ParameterPtr parameter,
-                    bool useGpu)
-      : PoolProjection(config, parameter, useGpu) {}
-
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback = nullptr);
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjectionLayer.cpp b/paddle/gserver/layers/PoolProjectionLayer.cpp
deleted file mode 100644
index 73d320e67ec09513f419ecdd45a57fc5c54df5ed..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PoolProjectionLayer.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "PoolProjectionLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-size_t PoolProjectionLayer::getSize() {
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t layerSize = 0;
-  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
-  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
-  if (imgSizeH_ == 0) {
-    imgSizeH_ = imgSizeY_;
-  }
-  if (imgSizeW_ == 0) {
-    imgSizeW_ = imgSize_;
-  }
-
-  outputH_ = outputSize(imgSizeH_,
-                        sizeY_,
-                        confPaddingY_,
-                        strideY_,
-                        /* caffeMode */ false);
-  outputW_ = outputSize(imgSizeW_,
-                        sizeX_,
-                        confPadding_,
-                        stride_,
-                        /* caffeMode */ false);
-
-  layerSize = outputH_ * outputW_ * channels_;
-
-  return layerSize;
-}
-
-void PoolProjectionLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const Argument& in = getInput(0);
-  int batchSize = in.value->getHeight();
-  int size = getSize();
-  resetOutput(batchSize, size);
-  poolProjection_->forward(&in, &output_, passType);
-}
-
-void PoolProjectionLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  if (NULL == getInputGrad(0)) {
-    return;
-  }
-  poolProjection_->backward(callback);
-}
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h
deleted file mode 100644
index 5a97a7769aaeebcfd4fe2c10d8ac0cc8892f68e3..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PoolProjectionLayer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "PoolLayer.h"
-#include "PoolProjection.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief Basic parent layer of different kinds of pooling
- */
-class PoolProjectionLayer : public PoolLayer {
-protected:
-  size_t imgSizeH_, imgSizeW_;
-  size_t outputH_, outputW_;
-  std::unique_ptr<PoolProjection> poolProjection_;
-  ProjectionConfig projectionConfig_;
-
-public:
-  explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) {
-    PoolConfig* conf = projectionConfig_.mutable_pool_conf();
-    *conf = config_.inputs(0).pool_conf();
-    poolProjection_.reset(
-        PoolProjection::create(projectionConfig_, nullptr, useGpu_));
-  }
-
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp
deleted file mode 100644
index 18f650fcdaded5ad7199510594b873fc18c3d7b5..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * This layer applys a power function to a vector element-wise,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y = x^w
- * \f]
- * where \f$x\f$ is a input vector, \f$w\f$ is scalar weight,
- * and output \f$y\f$ is a vector.
- *
- * The config file api is power_layer.
- */
-
-class PowerLayer : public Layer {
-protected:
-  MatrixPtr tmpMtx;
-
-public:
-  explicit PowerLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~PowerLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(power, PowerLayer);
-
-bool PowerLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void PowerLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(getSize(), dataDim);
-  CHECK_EQ(1U, inV0->getWidth());
-  CHECK_EQ(batchSize, inV0->getHeight());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-
-  {
-    REGISTER_TIMER_INFO("FwPowerTimer", getName().c_str());
-    outV->rowPow(0, *inV1, *inV0);
-  }
-}
-
-void PowerLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV0 = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  {
-    REGISTER_TIMER_INFO("BwPowerTimer", getName().c_str());
-    Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
-
-    if (inG0) {
-      tmpMtx->log2(*inV1);
-      tmpMtx->dotMul(*tmpMtx, *outV);
-
-      // inG0 += outG .* (log(inV1) * outV)
-      inG0->rowDotMul(0, *outG, *tmpMtx);
-    }
-
-    if (inG1) {
-      // tmp = (outV / inV1) * inV0
-      tmpMtx->dotDiv(*outV, *inV1);
-      tmpMtx->rowScale(0, *tmpMtx, *inV0);
-
-      inG1->addDotMul(*outG, *tmpMtx, 1, 1);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
deleted file mode 100644
index 5a527d598dd5e11ae0b74a32c9b9884e73ed45a8..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-class PrintLayer : public Layer {
-public:
-  explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    std::vector<std::string> vals;
-    for (size_t i = 0; i != inputLayers_.size(); ++i) {
-      std::ostringstream s;
-      getInput(i).printValueString(s, "");
-      vals.push_back(s.str());
-    }
-    size_t pos = 0;
-    size_t i = 0;
-    std::ostringstream s;
-    const std::string& format = config_.user_arg();
-    while (true) {
-      size_t pos1 = format.find("%s", pos);
-      if (pos1 == std::string::npos) break;
-      if (i >= vals.size()) {
-        break;
-      }
-      s << format.substr(pos, pos1 - pos) << vals[i];
-      pos = pos1 + 2;
-      ++i;
-    }
-    if (i != inputLayers_.size()) {
-      LOG(ERROR) << "Number of value in the format (" << format
-                 << ") is not same as the number of inputs ("
-                 << inputLayers_.size() << ") at " << getName();
-    }
-    s << format.substr(pos);
-
-    const std::string delimiter("\n");
-    std::string content = s.str();
-    std::string::size_type foundPos = 0;
-    std::string::size_type prevPos = 0;
-    while ((foundPos = content.find(delimiter, prevPos)) != std::string::npos) {
-      LOG(INFO) << content.substr(prevPos, foundPos - prevPos);
-      prevPos = foundPos + delimiter.size();
-    }
-    LOG(INFO) << content.substr(prevPos);
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(print, PrintLayer);
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
deleted file mode 100644
index af2cc05a954b3a6857c1015104a57339282840b8..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/PriorBox.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/BaseMatrix.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief A layer for generating priorbox locations and variances.
- * - Input: Two and only two input layer are accepted. The input layer must be
- *          be a data output layer and a convolution output layer.
- * - Output: The priorbox locations and variances of the input data.
- * Reference:
- *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
- *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
- */
-
-class PriorBoxLayer : public Layer {
-public:
-  explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override {}
-
-protected:
-  int numPriors_;
-  std::vector<int> minSize_;
-  std::vector<int> maxSize_;
-  std::vector<real> aspectRatio_;
-  std::vector<real> variance_;
-  MatrixPtr buffer_;
-};
-
-REGISTER_LAYER(priorbox, PriorBoxLayer);
-
-bool PriorBoxLayer::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  auto pbConf = config_.inputs(0).priorbox_conf();
-  std::vector<real> tmp;
-  aspectRatio_.push_back(1.);
-  std::copy(pbConf.min_size().begin(),
-            pbConf.min_size().end(),
-            std::back_inserter(minSize_));
-  std::copy(pbConf.max_size().begin(),
-            pbConf.max_size().end(),
-            std::back_inserter(maxSize_));
-  std::copy(pbConf.variance().begin(),
-            pbConf.variance().end(),
-            std::back_inserter(variance_));
-  std::copy(pbConf.aspect_ratio().begin(),
-            pbConf.aspect_ratio().end(),
-            std::back_inserter(tmp));
-
-  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
-
-  // flip aspect ratios
-  for (unsigned index = 0; index < tmp.size(); index++) {
-    real ar = tmp[index];
-    if (fabs(ar - 1.) < 1e-6) continue;
-    aspectRatio_.push_back(ar);
-    aspectRatio_.push_back(1. / ar);
-  }
-
-  numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size();
-
-  return true;
-}
-
-void PriorBoxLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  auto input = getInput(0);
-  int layerWidth = input.getFrameWidth();
-  int layerHeight = input.getFrameHeight();
-
-  auto image = getInput(1);
-  int imageWidth = image.getFrameWidth();
-  int imageHeight = image.getFrameHeight();
-
-  real stepW = static_cast<real>(imageWidth) / layerWidth;
-  real stepH = static_cast<real>(imageHeight) / layerHeight;
-  int dim = layerHeight * layerWidth * numPriors_ * 4;
-  reserveOutput(1, dim * 2);
-  // use a cpu buffer to compute
-  Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false);
-  auto* tmpPtr = buffer_->getData();
-
-  int idx = 0;
-  for (int h = 0; h < layerHeight; ++h) {
-    for (int w = 0; w < layerWidth; ++w) {
-      real centerX = (w + 0.5) * stepW;
-      real centerY = (h + 0.5) * stepH;
-      for (size_t s = 0; s < minSize_.size(); s++) {
-        real minSize = minSize_[s];
-        real boxWidth = minSize;
-        real boxHeight = minSize;
-
-        // priors with different aspect ratios
-        for (size_t r = 0; r < aspectRatio_.size(); r++) {
-          real ar = aspectRatio_[r];
-          boxWidth = minSize * sqrt(ar);
-          boxHeight = minSize / sqrt(ar);
-          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-          // set the variance.
-          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-        }
-
-        if (maxSize_.size() > 0) {
-          // square prior with size sqrt(minSize * maxSize)
-          real maxSize = maxSize_[s];
-          boxWidth = boxHeight = sqrt(minSize * maxSize);
-          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-          // set the variance.
-          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-        }
-      }
-    }
-  }
-
-  // clip the prior's coordidate such that it is within [0, 1]
-  for (int d = 0; d < dim * 2; ++d)
-    if ((d % 8) < 4)
-      tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.);
-  MatrixPtr outV = getOutputValue();
-  outV->copyFrom(buffer_->data_, dim * 2);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h
deleted file mode 100644
index 1f0b96c79ec7313cd9c5ff9139a455b3269b222b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/Projection.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "ModelConfig.pb.h"
-#include "paddle/parameter/Parameter.h"
-
-namespace paddle {
-
-// Macro for registering a projection type
-// Example: REGISTER_LAYER(fc, FullMatrixProjection);
-#define REGISTER_PROJECTION(__type_name, __class_name)                \
-  static InitFunction __reg_type_##__type_name([]() {                 \
-    Projection::registrar_.registerClass<__class_name>(#__type_name); \
-  })
-
-#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction)    \
-  static InitFunction __reg_type_##__type_name([]() {                   \
-    Projection::registrar_.registerClass(#__type_name, createFunction); \
-  })
-
-/**
- * A projection takes one Argument as input, calculate the result and add it
- * to output Argument.
- */
-class Projection {
-public:
-  static Projection* create(const ProjectionConfig& config,
-                            ParameterPtr parameter,
-                            bool useGpu);
-
-  Projection(const ProjectionConfig& config,
-             ParameterPtr parameter,
-             bool useGpu)
-      : config_(config), parameter_(parameter), useGpu_(useGpu) {}
-
-  virtual ~Projection() {}
-
-  const std::string& getName() const { return config_.name(); }
-
-  /// Register a projection
-  static ClassRegistrar<Projection, ProjectionConfig, ParameterPtr, bool>
-      registrar_;
-
-  /**
-   * Forward propagation. If backward() will be called, in and out must be kept
-   * valid until then.
-   * @param in input of projection
-   * @param out output of projection
-   * @param passType PASS_TRAIN of PASS_TEST
-   */
-  void forward(const Argument* in, const Argument* out, PassType passType) {
-    in_ = in;
-    out_ = out;
-    passType_ = passType;
-    forward();
-  }
-
-  virtual void prefetch(const Argument* in) {}
-  virtual void forward() = 0;
-  virtual void backward(const UpdateCallback& callback) = 0;
-
-  /**
-   * See comment in Layer.h for the function with the same name.
-   */
-  virtual void resetState() {}
-
-  /**
-   * Set layer state.
-   */
-  virtual void setState(LayerStatePtr state) {}
-
-  /**
-   * Get layer state. A copy of internal state is returned.
-   */
-  virtual LayerStatePtr getState() { return nullptr; }
-
-  /**
-   * init forward_ and backward_ functions
-   */
-  virtual bool init() { return true; }
-
-  /**
-   * Get output size of projection.
-   */
-  size_t getOutputSize() const { return config_.output_size(); }
-
-protected:
-  /**
-   * Create layer function. Function is called in forward or backward.
-   * \param function, Layer::forward_ or Layer::backward_
-   * \param name, function name
-   * \param config, initialization configuration for the function
-   */
-  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
-                      const std::string& name,
-                      const FuncConfig& config) {
-    if (useGpu_) {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
-    } else {
-      function.emplace_back(
-          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
-    }
-    auto& func = function.back();
-    func->init(config);
-  }
-
-protected:
-  /// Config of projection
-  ProjectionConfig config_;
-  /// Parameter of projection
-  ParameterPtr parameter_;
-  bool useGpu_;
-
-  /// Store `in` passed to forward()
-  const Argument* in_;
-  /// Store `out` passed to forward()
-  const Argument* out_;
-  /// Store `passType` passed to forward()
-  PassType passType_;
-  /// Layer forward function
-  std::vector<std::shared_ptr<FunctionBase>> forward_;
-  /// Layer backward function
-  std::vector<std::shared_ptr<FunctionBase>> backward_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
deleted file mode 100644
index b1735e9748dc3956aade010f33303b55d4f9f439..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ROIPoolLayer.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
- * feature map.
- * - Input: This layer needs two input layers: The first input layer is a
- *          convolution layer; The second input layer contains the ROI data
- *          which is the output of ProposalLayer in Faster R-CNN. layers for
- *          generating bbox location offset and the classification confidence.
- * - Output: The ROIs' feature map.
- * Reference:
- *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
- *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
- * Networks
- */
-
-class ROIPoolLayer : public Layer {
-protected:
-  size_t channels_;
-  size_t width_;
-  size_t height_;
-  size_t pooledWidth_;
-  size_t pooledHeight_;
-  real spatialScale_;
-
-  // Since there is no int matrix, use real maxtrix instead.
-  MatrixPtr maxIdxs_;
-
-public:
-  explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h
deleted file mode 100644
index 8fd4fe6b78ae6474f3cfcec605f25b72af8295bb..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/RecurrentLayer.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <gflags/gflags.h>
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief RecurrentLayer takes 1 input layer. The output size is the same with
- * input layer.
- * For each sequence [start, end] it performs the following computation:
- * \f[
- *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
- *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
- *
- * \f]
- * If reversed is true, the order is reversed:
- * \f[
- *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
- *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
- * \f]
- * There are two methods to calculate rnn. One way is to compute rnn one
- * sequence by one sequence. The other way is to reorganize the input
- * into batches, then compute rnn one batch by one batch. Users can select
- * them by rnn_use_batch flag.
- */
-
-class RecurrentLayer : public Layer {
-public:
-  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
-protected:
-  /**
-   * @brief If user do not set --rnn_use_batch=true, it will
-   * compute rnn forward one sequence by one sequence in default.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn forward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void forwardOneSequence(int start, int length);
-  /**
-   * @brief Compute rnn backward one sequence by onesequence.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn backward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void backwardOneSequence(int start, int length);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch. It will convert batch shape to sequence after finishing forward.
-   * The batch info can refer to SequenceToBatch class.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  virtual void forwardBatch(int batchSize,
-                            size_t numSequences,
-                            const int* starts);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  virtual void backwardBatch(int batchSize,
-                             size_t numSequences,
-                             const int* starts);
-
-protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
-  /// frameOutput_[i] is used to hold the i-th sample of output_
-  std::vector<Argument> frameOutput_;
-  MatrixPtr prevOutput_;
-  /// Whether compute rnn by reverse.
-  bool reversed_;
-  /// If compute batch by batch, batchValue_ will be used to save the
-  /// reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// If compute batch by batch, batchGrad_ will be used to save the
-  /// gradient with respect to reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp
deleted file mode 100644
index 27e8b5868e6d85cf004945d7cb086d6d57487f9f..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <functional>
-#include "paddle/gserver/layers/Layer.h"
-
-#include "paddle/gserver/gradientmachines/RecurrentGradientMachine.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * Recurrent layer group is a group of layers, which forward/backward one frame
- * after previous frame forward/backward through all layers in layer group.
- * It's automatically added by config_parser if some layers are defined
- * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd.
- */
-class RecurrentLayerGroup : public Layer {
-public:
-  explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {}
-
-  void initSubNetwork(NeuralNetwork* rootNetwork,
-                      const ModelConfig& config,
-                      const std::vector<ParameterType>& parameterTypes,
-                      bool useGpu) override;
-
-  void forward(PassType passType) override {
-    REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str());
-    const std::vector<Argument> inArgs;
-    std::vector<Argument> outArgs;
-    network_->forward(inArgs, &outArgs, passType);
-  }
-  void backward(const UpdateCallback& callback) override {
-    REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str());
-    network_->backward(nullptr);
-
-    for (auto& para : parameters_) {
-      para->incUpdate(callback);
-    }
-  }
-
-  /**
-   * @see Layer.accessSubNetwork
-   */
-  void accessSubNetwork(
-      const std::function<void(NeuralNetwork&)>& callback) override {
-    callback(*network_);
-  }
-
-private:
-  std::unique_ptr<RecurrentGradientMachine> network_;
-};
-
-REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup);
-
-void RecurrentLayerGroup::initSubNetwork(
-    NeuralNetwork* rootNetwork,
-    const ModelConfig& config,
-    const std::vector<ParameterType>& parameterTypes,
-    bool useGpu) {
-  setNeedGradient(true);
-
-  network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork));
-  ParamInitCallback cb = [this, rootNetwork](int paramId, Parameter* para) {
-    para->enableSharedType(
-        PARAMETER_VALUE,
-        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_VALUE),
-        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_VALUE));
-    para->enableSharedType(
-        PARAMETER_GRADIENT,
-        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_GRADIENT),
-        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_GRADIENT));
-  };
-  network_->init(config, cb, parameterTypes, useGpu);
-
-  for (auto paramId : network_->getParameterIds()) {
-    ParameterPtr parameter = rootNetwork->getParameters()[paramId];
-    parameter->incShared();
-    CHECK_EQ(parameter->getDeviceId(), getDeviceId());
-    parameters_.push_back(parameter);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp
deleted file mode 100644
index 831f4c3b7e103bc51d870cfa44616980adca08e8..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/BaseMatrix.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * @brief A layer for resizing a minibatch matrix h*w to h'*w'
- * @note
- * origin matrix height * width)
- * resize matrix: (height * width / size) * size
- */
-class ResizeLayer : public Layer {
-public:
-  explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-};
-
-REGISTER_LAYER(resize, ResizeLayer);
-
-bool ResizeLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  if (!Layer::init(layerMap, parameterMap)) return false;
-  CHECK_EQ(1U, inputLayers_.size());
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void ResizeLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  const Argument& input = getInput(0);
-  size_t height = input.value->getHeight();
-  size_t width = input.value->getWidth();
-  CHECK_EQ((height * width) % getSize(), 0UL);
-
-  reserveOutput(height * width / getSize(), getSize());
-  MatrixPtr tmp =
-      Matrix::create(output_.value->getData(), height, width, false, useGpu_);
-  tmp->assign(*input.value);
-}
-
-void ResizeLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
-  size_t height = input.value->getHeight();
-  size_t width = input.value->getWidth();
-
-  if (!input.grad) {
-    return;
-  }
-
-  MatrixPtr tmp = Matrix::create(input.grad->getData(),
-                                 height * width / getSize(),
-                                 getSize(),
-                                 false,
-                                 useGpu_);
-  tmp->add(*output_.grad);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h
deleted file mode 100644
index 3b619921ab741e1236a495e497e18e265bd6e110..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/RotateLayer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * A layer for rotating a multi-channel feature map (M x N x C) in the spatial
- * domain
- * The rotation is 90 degrees in clock-wise for each channel
- * \f[
- *   y(j,i,:) = x(M-i-1,j,:)
- * \f]
- * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output.
- *
- * The config file api is rotate_layer
- *
- */
-
-class RotateLayer : public Layer {
-public:
-  explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
-
-private:
-  int batchSize_;
-  int size_;
-  int height_;
-  int width_;
-  int channels_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/RowConvLayer.cpp b/paddle/gserver/layers/RowConvLayer.cpp
deleted file mode 100644
index 63b499e486fd24b5f816ee0e897b040ee5007581..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/RowConvLayer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RowConvLayer.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(row_conv, RowConvLayer);
-
-bool RowConvLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  contexLength_ = config_.inputs(0).row_conv_conf().context_length();
-
-  CHECK_EQ(inputLayers_.size(), 1UL);
-  weight_.reset(new Weight(contexLength_, getSize(), parameters_[0]));
-  createFunction(forward_, "RowConv", FuncConfig());
-  createFunction(backward_, "RowConvGrad", FuncConfig());
-
-  return true;
-}
-
-void RowConvLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  MatrixPtr input = getInputValue(0);
-  size_t height = input->getHeight();
-  size_t width = input->getWidth();
-  CHECK_EQ(width, getSize());
-  resetOutput(height, width);
-
-  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
-  MatrixPtr w = weight_->getW();
-  wDims_ = TensorShape({w->getHeight(), w->getWidth()});
-
-  MatrixPtr outV = getOutputValue();
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), *startPos);
-  inputs.addArg(*w, wDims_);
-  outputs.addArg(*getOutputValue(), *startPos, ADD_TO);
-
-  {
-    REGISTER_TIMER_INFO("RowConvForward", getName().c_str());
-    forward_[0]->calc(inputs, outputs);
-  }
-
-  /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
-    forwardActivation();
-  }
-}
-
-void RowConvLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
-
-  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
-
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), *startPos);
-  inputs.addArg(*getInputValue(0), *startPos);
-  inputs.addArg(*weight_->getW(), wDims_);
-
-  MatrixPtr inGrad = getInputGrad(0);
-  MatrixPtr wGrad = weight_->getWGrad();
-  size_t h = getInputValue(0)->getHeight();
-  size_t w = getInputValue(0)->getWidth();
-  outputs.addArg(
-      inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)),
-      *startPos,
-      ADD_TO);
-  outputs.addArg(
-      wGrad ? (*wGrad)
-            : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)),
-      wDims_,
-      ADD_TO);
-
-  {
-    REGISTER_TIMER_INFO("RowConvBackward", getName().c_str());
-    backward_[0]->calc(inputs, outputs);
-  }
-
-  {
-    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-    weight_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/RowConvLayer.h b/paddle/gserver/layers/RowConvLayer.h
deleted file mode 100644
index ba0af1de68a5f77d9ffefac6ef5193bb9d1b4f83..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/RowConvLayer.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief Row Convolution Layer.
- */
-class RowConvLayer : public Layer {
-public:
-  explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~RowConvLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-protected:
-  // Row convolution weight, context_lenght_ * fan_out.
-  // fan_out is the size of output feature.
-  std::unique_ptr<Weight> weight_;
-
-  // The step number to look ahead plus one equals contexLength_.
-  size_t contexLength_;
-  TensorShape wDims_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/gserver/layers/RowL2NormLayer.cpp
deleted file mode 100644
index 7ff0c9bae927cae2bc6a332bc0bde013e07edd0a..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/RowL2NormLayer.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer for L2 normalization in each row,
- * \f[
- *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
- * \f]
- * where the size of \f$in\f$ is (batchSize x dataDim),
- * and the size of \f$out\f$ is (batchSize x dataDim).
- */
-
-class RowL2NormLayer : public Layer {
-protected:
-  MatrixPtr inSquare_;
-  MatrixPtr l2NormReciprocal_;
-  MatrixPtr dotSum_;
-
-public:
-  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
-
-bool RowL2NormLayer::init(const LayerMap& layerMap,
-                          const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-
-  return true;
-}
-
-void RowL2NormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-
-  /* malloc memory for the output_ if necessary */
-  size_t batchSize = inV->getHeight();
-  size_t dataDim = getSize();
-  CHECK_EQ(dataDim, inV->getWidth());
-  resetOutput(batchSize, dataDim);
-  MatrixPtr outV = getOutputValue();
-
-  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
-  inV->square2(*inSquare_);
-  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
-  inSquare_->rowSum(*l2NormReciprocal_);
-  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
-  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
-  outV->rowScale(0, *inV, *l2NormReciprocal_);
-}
-
-void RowL2NormLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-  size_t batchSize = inV->getHeight();
-
-  // inG[ij] += outG[ij] / l2NormReciprocal
-  // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i],
-  // inV[i])
-  if (inG) {
-    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
-    dotSum_->zeroMem();
-    dotSum_->rowDotMul(0, *outG, *outV);
-    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
-    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
-    inSquare_->rowScale(0, *inV, *dotSum_);
-    inG->sub(*inSquare_);
-    inG->addRowScale(0, *outG, *l2NormReciprocal_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
deleted file mode 100644
index 2edd915d226edfd7e48df1a066d5a6f51f259511..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <random>
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for sampling id from multinomial distribution from the
- * input layer. Sampling one id for one sample. The result is stored in
- * output_.ids.
- *
- * The config file api is sampling_id_layer.
- */
-class SamplingIdLayer : public Layer {
-  /// Produces random floating-point values, uniformly distributed on [0, 1).
-  std::uniform_real_distribution<double> rand1_;
-  std::vector<Argument> tmpCpuInput_;
-
-public:
-  explicit SamplingIdLayer(const LayerConfig& config)
-      : Layer(config), rand1_(0, 1) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override {
-    bool ret = Layer::init(layerMap, parameterMap);
-    CHECK_EQ(1UL, inputLayers_.size());
-    if (useGpu_) {
-      tmpCpuInput_.reserve(inputLayers_.size());
-      for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_.push_back(Argument());
-      }
-    }
-    return ret;
-  }
-
-  void forward(PassType passType) override {
-    Layer::forward(passType);
-    if (useGpu_) {
-      for (size_t i = 0; i < inputLayers_.size(); i++) {
-        tmpCpuInput_[i].resizeAndCopyFrom(
-            getInput(i), false, HPPL_STREAM_DEFAULT);
-      }
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      forwardImp(tmpCpuInput_[0]);
-    } else {
-      forwardImp(getInput(0));
-    }
-  }
-
-  void forwardImp(const Argument& input) {
-    size_t batchSize = input.getBatchSize();
-    IVector::resizeOrCreate(output_.ids, batchSize, useGpu_);
-    real* buf = input.value->getData();
-    int dim = input.value->getWidth();
-    std::vector<int> ids(batchSize);
-    auto& reng = ThreadLocalRandomEngine::get();
-    for (size_t i = 0; i < batchSize; ++i) {
-      double r = rand1_(reng);
-      int id = dim - 1;
-      for (int j = 0; j < dim; ++j) {
-        if ((r -= buf[i * dim + j]) < 0) {
-          id = j;
-          break;
-        }
-      }
-      ids[i] = id;
-    }
-    output_.ids->copyFrom(ids.data(), batchSize);
-  }
-
-  void backward(const UpdateCallback& callback) override {}
-};
-
-REGISTER_LAYER(sampling_id, SamplingIdLayer);
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ScaleShiftLayer.cpp b/paddle/gserver/layers/ScaleShiftLayer.cpp
deleted file mode 100644
index 799d1fe51a65da10bef637894931627315daf0a2..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ScaleShiftLayer.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * A layer applies a linear transformation to each element in each row of
- * the input matrix. For each element, the layer first re-scale it and then
- * adds a bias to it.
- *
- * \f[
- *    y = wx + b
- * \f]
- *
- * Here, w is the scale and b is the bias. Both w and b are trainable scalars.
- *
- */
-
-class ScaleShiftLayer : public Layer {
-protected:
-  std::unique_ptr<Weight> scale_;
-  std::unique_ptr<Weight> offset_;
-
-public:
-  explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(scale_shift, ScaleShiftLayer);
-
-bool ScaleShiftLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(inputLayers_.size(), 1U);
-  scale_.reset(new Weight(1, 1, parameters_[0]));
-  if (biasParameter_.get() != NULL) {
-    offset_ = std::unique_ptr<Weight>(new Weight(1, 1, biasParameter_));
-  }
-  return true;
-}
-
-void ScaleShiftLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-  resetOutput(inV->getHeight(), inV->getWidth());
-  MatrixPtr outV = getOutputValue();
-  real scaleValue = scale_->getW()->getElement(0, 0);
-  outV->mulScalar(*inV, scaleValue);
-  if (offset_) {
-    real offsetValue = offset_->getW()->getElement(0, 0);
-    outV->add(offsetValue);
-  }
-}
-
-void ScaleShiftLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  /* Calculate the parameter gradient for the current layer */
-  if (scale_->getWGrad()) {
-    MatrixPtr rowSumMtx;
-    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
-    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
-    rowSumMtx->sumOfProducts(
-        /* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.);
-    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
-    scale_->getWGrad()->sumCols(
-        /* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.);
-    scale_->getParameterPtr()->incUpdate(callback);
-  }
-  if (offset_ && offset_->getWGrad()) {
-    MatrixPtr rowSumMtx;
-    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
-    rowSumMtx->sumRows(*outG, 1., 0.);
-    offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.);
-    offset_->getParameterPtr()->incUpdate(callback);
-  }
-
-  /* Calculate the input layers error */
-  if (inG) {
-    real scaleValue = scale_->getW()->getElement(0, 0);
-    inG->add(*outG, scaleValue);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
deleted file mode 100644
index 68a0ff735844679df1393473355f54ee616c09bd..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ScaleSubRegionLayer.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ScaleSubRegionLayer.h"
-#include "paddle/utils/Stat.h"
-namespace paddle {
-
-REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
-
-bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(static_cast<int>(inputLayers_.size()), 2);
-  auto& conf = config_.inputs(0).scale_sub_region_conf();
-  value_ = conf.value();
-
-  createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
-  createFunction(
-      backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
-
-  return true;
-}
-
-void ScaleSubRegionLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  auto in0 = getInput(0);
-  imgH_ = in0.getFrameHeight();
-  imgW_ = in0.getFrameWidth();
-  if (imgH_ == 0 || imgW_ == 0) {
-    auto& conf = config_.inputs(0).scale_sub_region_conf();
-    imgH_ = conf.image_conf().img_size_y();
-    imgW_ = conf.image_conf().img_size();
-  }
-  MatrixPtr imgV = in0.value;
-  size_t batchSize = imgV->getHeight();
-  size_t spatialSize = imgH_ * imgW_;
-  channelsNum_ = imgV->getWidth() / spatialSize;
-  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
-
-  resetOutput(batchSize, imgV->getWidth());
-  auto& out = getOutput();
-  out.setFrameHeight(imgH_);
-  out.setFrameWidth(imgW_);
-
-  MatrixPtr indicesV = getInputValue(1);
-  indicesShape_ = TensorShape({batchSize, 6});
-
-  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
-  BufferArgs inArgs;
-  BufferArgs outArgs;
-  inArgs.addArg(*imgV, shape_);
-  inArgs.addArg(*indicesV, indicesShape_);
-  outArgs.addArg(*out.value, shape_, ASSIGN_TO);
-  forward_[0]->calc(inArgs, outArgs);
-}
-
-void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
-  REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
-  BufferArgs inArgs;
-  BufferArgs outArgs;
-  inArgs.addArg(*getOutputGrad(), shape_);
-  inArgs.addArg(*getInputValue(1), indicesShape_);
-  outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
-  backward_[0]->calc(inArgs, outArgs);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
deleted file mode 100644
index 6e861be4858cfc21a42ef7293652d5cdf81be5f5..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ScaleSubRegionLayer.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  For each instance, this layer can be used to multiply a value to a
- *         specified sub continuous region. By providing start index and end
- *         index for C/H/W, you can specify the location and shape of the
- *         region.
- *
- *         input_0: Input value.
- *         input_1: Indices value to specify the location an shape of the
- *                  region.
- */
-class ScaleSubRegionLayer : public Layer {
-public:
-  explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ScaleSubRegionLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr);
-
-protected:
-  TensorShape shape_;
-  TensorShape indicesShape_;
-  size_t imgH_;
-  size_t imgW_;
-  size_t channelsNum_;
-  real value_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp
deleted file mode 100644
index 1d98a7373d172d40cddc9b4611cb00434f17e00b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ScalingLayer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for each row of a matrix, multiplying with a element of a vector,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   y.row[i] = w[i] * x.row[i]
- * \f]
- * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is
- * (batchSize x 1) weight vector, and \f$y\f$ is (batchSize x dataDim) output.
- *
- * The config file api is scaling_layer.
- */
-
-class ScalingLayer : public Layer {
-public:
-  explicit ScalingLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~ScalingLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(scaling, ScalingLayer);
-
-bool ScalingLayer::init(const LayerMap& layerMap,
-                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 2U);
-
-  return true;
-}
-
-void ScalingLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-
-  size_t batchSize = inV1->getHeight();
-  size_t dataDim = inV1->getWidth();
-
-  CHECK_EQ(dataDim, getSize());
-  CHECK_EQ(weightV->getWidth(), 1U);
-  CHECK_EQ(weightV->getHeight(), batchSize);
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwScalingTimer", getName().c_str());
-    // outV += inV1 * weight
-    outV->addRowScale(0, *inV1, *weightV);
-  }
-}
-
-void ScalingLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr weightV = getInputValue(0);
-  MatrixPtr inV1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outG = getOutputGrad();
-
-  {
-    REGISTER_TIMER_INFO("BwScalingTimer", getName().c_str());
-
-    if (inG0) {
-      // inG0 += outG .* inV1
-      inG0->rowDotMul(0, *outG, *inV1);
-    }
-
-    if (inG1) {
-      // inG1 += outG * weight;
-      inG1->addRowScale(0, *outG, *weightV);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ScalingProjection.cpp b/paddle/gserver/layers/ScalingProjection.cpp
deleted file mode 100644
index 99b5b68f543842d23f20b626fddd66b677ebe059..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ScalingProjection.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-namespace paddle {
-
-class ScalingProjection : public Projection {
-public:
-  ScalingProjection(const ProjectionConfig& config,
-                    const ParameterPtr& parameter,
-                    bool useGpu)
-      : Projection(config, parameter, useGpu) {
-    CHECK_EQ(parameter->getSize(), 1UL);
-    weight_.reset(new Weight(1, 1, parameter));
-  }
-
-  void forward() {
-    CHECK(in_->value);
-    out_->value->add(*in_->value, weight_->getW()->getElement(0, 0));
-  }
-
-  void backward(const UpdateCallback& callback) {
-    if (weight_->getWGrad()) {
-      auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_);
-      sum->sumOfProducts(*in_->value,
-                         *out_->grad,
-                         /* scaleSum= */ 1,
-                         /* scaleDest= */ 0);
-      weight_->getWGrad()->sumCols(*sum,
-                                   /* scaleSum= */ 1,
-                                   /* scaleDest= */ 1);
-      parameter_->incUpdate(callback);
-    }
-    if (in_->grad) {
-      in_->grad->add(*out_->grad, weight_->getW()->getElement(0, 0));
-    }
-  }
-
-protected:
-  std::unique_ptr<Weight> weight_;
-};
-
-REGISTER_PROJECTION(scaling, ScalingProjection);
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
deleted file mode 100644
index 43c98993f3f6f74c034c59176378c3ea97a9c19b..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SelectiveFullyConnectedLayer.h"
-#include <algorithm>
-#include <vector>
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(selective_fc, SelectiveFullyConnectedLayer);
-
-bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap,
-                                        const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  inputNum_ = inputLayers_.size();
-  if (config_.has_selected_colums()) {
-    inputNum_ -= 1;
-  }
-  for (size_t i = 0; i < inputNum_; i++) {
-    size_t height = inputLayers_[i]->getSize();
-    size_t width = getSize();
-    // NOTE weight is transpoed
-    weights_.emplace_back(new Weight(width, height, parameters_[i]));
-  }
-
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  fullOutput_ = false;
-
-  return true;
-}
-
-void SelectiveFullyConnectedLayer::prefetch() {}
-
-void SelectiveFullyConnectedLayer::reserveOutput(size_t height,
-                                                 size_t width,
-                                                 size_t nnz) {
-  bool flag = (passType_ == PASS_TEST &&
-               config_.selective_fc_pass_generation() && !fullOutput_);
-  SetDevice device(output_.deviceId);
-  if (flag) {
-    // output_.value is sparse matrix
-    if (dynamic_cast<CpuMatrix*>(output_.value.get()) ||
-        dynamic_cast<GpuMatrix*>(output_.value.get())) {
-      output_.value = nullptr;
-    }
-    Matrix::resizeOrCreateSparseMatrix(output_.value,
-                                       height,
-                                       width,
-                                       nnz,
-                                       FLOAT_VALUE,
-                                       SPARSE_CSR,
-                                       /*trans=*/false,
-                                       /*useGpu=*/useGpu_);
-    output_.value->copyFrom(*selCols_);
-    interOutput_ = output_.value;
-  } else {
-    if (fullOutput_) {
-      // output_.value is dense matrix
-      if (dynamic_cast<CpuSparseMatrix*>(output_.value.get()) ||
-          dynamic_cast<GpuSparseMatrix*>(output_.value.get())) {
-        output_.value = nullptr;
-      }
-      Matrix::resizeOrCreate(output_.value,
-                             height,
-                             width,
-                             /*trans=*/false,
-                             /*useGpu=*/useGpu_);
-      interOutput_ = output_.value;
-    } else {
-      // output_.value is dense matrix, but width = nnz /height
-      CHECK_EQ(nnz % height, 0U);
-      CHECK(nnz / height);
-      Matrix::resizeOrCreate(output_.value,
-                             height,
-                             nnz / height,
-                             /*trans=*/false,
-                             /*useGpu=*/useGpu_);
-      interOutput_ = Matrix::createSparseMatrix(output_.value->getData(),
-                                                selCols_->getRows(),
-                                                selCols_->getCols(),
-                                                height,
-                                                width,
-                                                nnz,
-                                                FLOAT_VALUE,
-                                                SPARSE_CSR,
-                                                /*trans=*/false,
-                                                /*useGpu=*/useGpu_);
-    }
-  }
-  interOutput_->zeroMem();
-
-  if (passType_ != PASS_TEST && needGradient()) {
-    CHECK_EQ(nnz % height, 0U) << "during training, each sample must have a "
-                                  "same number of selected columns.";
-    CHECK(nnz / height)
-        << "during training, "
-           "each sample must have at least one column selected.";
-    Matrix::resizeOrCreate(output_.grad,
-                           height,
-                           nnz / height,
-                           /*trans=*/false,
-                           /*useGpu=*/useGpu_);
-    output_.grad->zeroMem();
-  }
-}
-
-void SelectiveFullyConnectedLayer::forward(PassType passType) {
-  REGISTER_TIMER("selective_fc.forward");
-  Layer::forward(passType);
-
-  getSelectiveCols();
-  size_t height = getInput(0).getBatchSize();
-  size_t width = getSize();
-  size_t nnz = height * width;
-  if (!fullOutput_) {
-    CHECK(selCols_);
-    CHECK(height == selCols_->getHeight());
-    CHECK(width == selCols_->getWidth());
-    nnz = selCols_->getElementCnt();
-  }
-
-  // Layer::ResetOutput(), here we set outV/outG as SparseMatrix manually
-  // this outV should be used as input of MaxIdLayer and softmax activation
-  reserveOutput(height, width, nnz);
-
-  bool flag = true;
-  for (size_t i = 0; i < inputNum_; i++) {
-    MatrixPtr input = getInputValue(i);
-    MatrixPtr weight = weights_[i]->getW();
-    size_t hsize = input->getHeight();
-    size_t wsize = weight->getHeight();
-    real scaleT = i == 0 ? real(0) : real(1);
-
-    flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() &&
-           !fullOutput_;
-    if (flag) {
-      // if the indecies are highly sparse,
-      // manully compute the multiplication of
-      // the input vector and the selected rows.
-      REGISTER_TIMER("selective.plain");
-      interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
-    } else {
-      // if the indecies is not sparse enough,
-      // use full mul instead
-      REGISTER_TIMER("selective.mul");
-      if (fullOutput_) {
-        interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
-      } else {
-        Matrix::resizeOrCreate(mmat_,
-                               hsize,
-                               wsize,
-                               /*trans=*/false,
-                               /*useGpu=*/useGpu_);
-        mmat_->mul(*input, *weight->getTranspose());
-        interOutput_->add3(mmat_);
-      }
-    }
-  }
-
-  if (biases_) {
-    interOutput_->addBias(*(biases_->getW()), 1);
-  }
-
-  flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() &&
-          !fullOutput_);
-  if (flag) {
-    // during generation, output of this layer is a sparse csr matrix,
-    // which is probably the input of maxid layer
-    // if the model is trained with multi-class-cross-entroy-with-selfnorm,
-    // activiation of this layer should be exponential, not softmax.
-
-    Argument arg;
-    arg.value = Matrix::create(interOutput_->getData(),
-                               1,
-                               nnz,
-                               /*trans=*/false,
-                               /*useGpu=*/useGpu_);
-    //! TODO(yuyang18): Why we cannot invoke forwardActivation here?
-    activation_->forward(arg).check();
-  } else /* train and test in train, not generating */ {
-    // during training, this layer output value is *Matrix*, which is input of
-    // eg. multi-class-cross-entropy
-
-    // while training, every sample has a equal number of selected
-    // columns to be activated.
-    // note indices of multi-class-cross-entropy need to be remapped
-    // to this index.
-    // e.g. sample = [1,3,5] and 3 is gold, then label is 1
-
-    forwardActivation();
-  }
-}
-
-void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
-  backwardActivation();
-  MatrixPtr oGrad = getOutputGrad();
-  if (!fullOutput_) {
-    interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(),
-                                               interOutput_->getRows(),
-                                               interOutput_->getCols(),
-                                               interOutput_->getHeight(),
-                                               interOutput_->getWidth(),
-                                               interOutput_->getElementCnt(),
-                                               FLOAT_VALUE,
-                                               SPARSE_CSR,
-                                               /*trans=*/false,
-                                               /*useGpu=*/useGpu_);
-  } else {
-    interOutGrad_ = Matrix::create(oGrad->getData(),
-                                   oGrad->getHeight(),
-                                   oGrad->getWidth(),
-                                   /*trans=*/false,
-                                   /*useGpu=*/useGpu_);
-  }
-
-  if (biases_ && biases_->getWGrad()) {
-    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
-    biases_->getWGrad()->collectBias(*interOutGrad_, 1);
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  // backward is different from FullyConnectedLayer
-  // because the weight is transposed
-  for (size_t i = 0; i < inputNum_; i++) {
-    AsyncGpuBlock block;
-    MatrixPtr preGrad = getInputGrad(i);
-    if (preGrad) {
-      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
-    }
-
-    MatrixPtr wGrad = weights_[i]->getWGrad();
-    if (wGrad) {
-      REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-      MatrixPtr input = getInputValue(i);
-      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
-    }
-
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-void paddle::SelectiveFullyConnectedLayer::fillSelectiveData(
-    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates) {
-  if (candidates == nullptr) {
-    fillFullySelectiveData();
-    return;
-  }
-
-  size_t sampleNum = candidates->size();
-  size_t outputWidth = getSize();
-  size_t nnz =
-      std::accumulate(candidates->begin(),
-                      candidates->end(),
-                      0UL,
-                      [](size_t a, const std::pair<int*, size_t>& arr) {
-                        return a + arr.second;
-                      });
-
-  Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_,
-                                     sampleNum,
-                                     outputWidth,
-                                     nnz,
-                                     NO_VALUE,
-                                     SPARSE_CSR,
-                                     false,
-                                     false);
-  CHECK(this->cpuSelCols_ != nullptr);
-  CpuSparseMatrixPtr selCols =
-      std::dynamic_pointer_cast<CpuSparseMatrix>(cpuSelCols_);
-  int* rowOffsets = selCols->getRows();
-  int* colIndices = selCols->getCols();
-
-  rowOffsets[0] = 0;
-  int idx = 0;
-  for (size_t i = 0; i < sampleNum; ++i) {
-    if ((*candidates)[i].second > 0) {
-      rowOffsets[i + 1] = rowOffsets[i] + (*candidates)[i].second;
-      for (size_t j = 0; j < (*candidates)[i].second; ++j) {
-        colIndices[idx] = (*candidates)[i].first[j];
-        idx++;
-      }
-    } else {
-      rowOffsets[i + 1] = rowOffsets[i];
-    }
-  }
-
-  CHECK_EQ(static_cast<size_t>(rowOffsets[sampleNum]), nnz);
-  if (!useGpu_) {
-    this->selCols_ = this->cpuSelCols_;
-  } else {
-    Matrix::resizeOrCreateSparseMatrix(this->selCols_,
-                                       sampleNum,
-                                       outputWidth,
-                                       nnz,
-                                       NO_VALUE,
-                                       SPARSE_CSR,
-                                       false,
-                                       true);
-    this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1);
-    hl_stream_synchronize(HPPL_STREAM_1);
-  }
-
-  fullOutput_ = false;
-}
-
-void paddle::SelectiveFullyConnectedLayer::getSelectiveCols() {
-  if (config_.has_selected_colums()) {
-    this->selCols_ = inputLayers_[inputNum_]->getOutputValue();
-    fullOutput_ = false;
-  } else if (!config_.selective_fc_pass_generation() || selCols_ == nullptr) {
-    this->fillFullySelectiveData();
-  }  // else selCols_ is initialized by fillSelectiveData
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
deleted file mode 100644
index 81564074185a5d9fc80d4d3a64af998098ab5472..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief The SelectiveFullyConnectedLayer class
- *
- * SelectiveFullyConnectedLayer differs from FullyConnectedLayer by that it
- * requires an additional input to indicate several selected columns, and only
- * compute the multiplications between the input matrices and the selected
- * columns of the parameter matrices of this layer. If the selected columns is
- * not specified, SelectiveFullyConnected layer acts exactly like
- * FullyConnectedLayer.
- *
- * The config file api is selective_fc_layer.
- */
-class SelectiveFullyConnectedLayer : public Layer {
-protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
-private:
-  /**
-   * Get selected columns each forward.
-   */
-  void getSelectiveCols();
-
-  MatrixPtr mmat_;
-  /// cpuSelCols_ is a CpuSparseMatrix, used to save selected columns.
-  MatrixPtr cpuSelCols_;
-  /// CpuSparseMatrix or GpuSparseMatrix. In CPU mode, selCols_ points
-  /// to cpuSelCols_.
-  MatrixPtr selCols_;
-  size_t inputNum_;
-
-  /// interOutput_ shared same memory with output_.value.
-  MatrixPtr interOutput_;
-
-  /// if fullOutput_ is false, interOutGrad_ sparse matrix
-  MatrixPtr interOutGrad_;
-
-  /// if true, means output_.value is the same as Fc Layer
-  bool fullOutput_;
-
-public:
-  explicit SelectiveFullyConnectedLayer(const LayerConfig& config)
-      : Layer(config), selCols_(nullptr) {}
-
-  ~SelectiveFullyConnectedLayer() {}
-  void prefetch() override;
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  /**
-   * @brief Resize the output matrix size.
-   * And reset value to zero
-   */
-  void reserveOutput(size_t height, size_t width, size_t nnz);
-
-  /**
-   * @brief Fill candidates to select several activations as output.
-   * @param candidates specifies several selected columns of the parameter
-   * matrices of this layer.
-   * Multiplications only between the input matrices and the selected columns
-   * are computed.
-   * If the candidates is a nullptr, selective fc layer acts exactly like the
-   * fully connected layer.
-   * @note CURRENTLY, THIS METHOD IS ONLY USED FOR BEAM SEARCH
-   */
-  void fillSelectiveData(
-      const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates);
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-private:
-  /**
-   * @brief Make SelectiveFC act as FullyConnectedLayer
-   */
-  void fillFullySelectiveData() { fullOutput_ = true; }
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp
deleted file mode 100644
index cf573f3f33fcd70c6768b164f158cb1f545414fc..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SequenceConcatLayer.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for concatenating the first sequence with the second sequence
- * Input: two sequences each containing the same number of instances
- *        seq1 = [a1, a2, ..., an]
- *        seq2 = [b1, b2, ..., bn]
- * Output: a concatenated sequence of the two input sequences
- *        out = [a1, b1, a2, b2, ..., an, bn]
- */
-
-class SequenceConcatLayer : public Layer {
-protected:
-  std::unique_ptr<Weight> biases_;
-
-public:
-  explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~SequenceConcatLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(seqconcat, SequenceConcatLayer);
-
-bool SequenceConcatLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // sequene concatenation layer should have exactly 2 inputs
-  CHECK_EQ(2U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequenceConcatLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t dim = getSize();
-
-  const Argument& input1 = getInput(0);
-  size_t numSequences1 = input1.getNumSequences();
-  auto startPositions1 = input1.sequenceStartPositions->getVector(false);
-
-  const Argument& input2 = getInput(1);
-  size_t numSequences2 = input2.getNumSequences();
-  auto startPositions2 = input2.sequenceStartPositions->getVector(false);
-
-  CHECK_EQ(dim, input1.value->getWidth());
-  CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize());
-  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
-
-  CHECK_EQ(dim, input2.value->getWidth());
-  CHECK_EQ(startPositions2->getData()[numSequences2], input2.getBatchSize());
-  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
-
-  CHECK_EQ(numSequences1, numSequences2);
-
-  MatrixPtr inputValue1 = getInputValue(0);
-  MatrixPtr inputValue2 = getInputValue(1);
-
-  // reset output
-  reserveOutput(inputValue1->getHeight() + inputValue2->getHeight(), dim);
-
-  MatrixPtr outputValue = getOutputValue();
-
-  const int* starts1 = startPositions1->getData();
-  const int* starts2 = startPositions2->getData();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceConcatLayerForward", getName().c_str());
-
-    size_t offset = 0;
-    size_t leftNumIns = 0;
-    size_t rightNumIns = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      leftNumIns = starts1[seqId + 1] - starts1[seqId];
-      outputValue->subMatrix(offset, leftNumIns)
-          ->assign(*(inputValue1->subMatrix(starts1[seqId], leftNumIns)));
-      offset += leftNumIns;
-
-      rightNumIns = starts2[seqId + 1] - starts2[seqId];
-      outputValue->subMatrix(offset, rightNumIns)
-          ->assign(*(inputValue2->subMatrix(starts2[seqId], rightNumIns)));
-      offset += rightNumIns;
-    }
-
-    // modify the sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences1 + 1, false);
-
-    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
-
-    for (size_t seqId = 0; seqId < numSequences1 + 1; ++seqId) {
-      tgtBuf[seqId] = starts1[seqId] + starts2[seqId];
-    }
-  }
-
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void SequenceConcatLayer::backward(const UpdateCallback& callback) {
-  /* activation */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  MatrixPtr inputGrad1 = getInputGrad(0);
-  MatrixPtr inputGrad2 = getInputGrad(1);
-  MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
-  auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false);
-
-  size_t numSequences1 = startPositions1->getSize() - 1;
-  size_t numSequences2 = startPositions2->getSize() - 1;
-
-  CHECK_EQ(numSequences1, numSequences2);
-
-  const int* starts1 = startPositions1->getData();
-  const int* starts2 = startPositions2->getData();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceConcatLayerBackward", getName().c_str());
-
-    size_t offset = 0;
-    size_t leftNumIns = 0;
-    size_t rightNumIns = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      leftNumIns = starts1[seqId + 1] - starts1[seqId];
-      if (inputGrad1) {
-        inputGrad1->subMatrix(starts1[seqId], leftNumIns)
-            ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
-      }
-      offset += leftNumIns;
-
-      rightNumIns = starts2[seqId + 1] - starts2[seqId];
-      if (inputGrad2) {
-        inputGrad2->subMatrix(starts2[seqId], rightNumIns)
-            ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
-      }
-      offset += rightNumIns;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
deleted file mode 100644
index 6c4ae775c16ac76e237fb8f8ee5ec9ed8f11802e..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Logging.h"
-
-#include "SequencePoolLayer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for extracting the last instance of the input sequence.
- * Input: a sequence
- * If SequenceLevel = kNonseq:
- *   Output: a sequence containing only the last instance of the input sequence
- *   If stride_ > 0:
- *      Output: a shorten sequence. Stride is the step size by which we slide a
- *              window upon the input sequence, and getting last instance
- *              operation is then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *   Check input sequence must has sub-sequence
- *   Output: a sequence containing only the last instance of each sub-sequence
- *           of the input sequence
- *
- * The config file api is last_seq and first_seq.
- */
-
-class SequenceLastInstanceLayer : public SequencePoolLayer {
-protected:
-  MatrixPtr tmpSrc_;
-  MatrixPtr tmpDest_;
-  std::vector<int> instanceIds_;
-
-public:
-  explicit SequenceLastInstanceLayer(const LayerConfig& config)
-      : SequencePoolLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
-
-bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
-                                     const ParameterMap& parameterMap) {
-  SequencePoolLayer::init(layerMap, parameterMap);
-  reversed_ = config_.select_first();
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-
-  return true;
-}
-
-void SequenceLastInstanceLayer::forward(PassType passType) {
-  SequencePoolLayer::forward(passType);
-
-  auto starts = startPositions_->getData(false);
-  MatrixPtr inputValue = getInputValue(0);
-  MatrixPtr outputValue = getOutputValue();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
-
-    instanceIds_.clear();
-    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
-      int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
-      instanceIds_.push_back(insId);
-
-      outputValue->subMatrix(seqId, 1, tmpDest_)
-          ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
-    }
-  }
-
-  if (biases_.get() != NULL) {
-    outputValue->addBias(*(biases_->getW()), 1);
-  }
-
-  /*  activation, should set to 'linear' in most cases */
-  forwardActivation();
-}
-
-void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
-  SequencePoolLayer::backward(callback);
-
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  if (inputGrad) {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str());
-
-    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
-      inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
-          ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
deleted file mode 100644
index 650ab425d1fcca56d8862200f37dd5bb36a67240..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SequencePoolLayer.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-bool SequencePoolLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // seqlastins/max/average layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  stride_ = config_.seq_pool_stride();
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequencePoolLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-  CHECK(input.hasSeq() || input.hasSubseq())
-      << "Input should be a sequence or subsequence for layer " << getName();
-
-  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
-  size_t dim = getSize();
-  // check
-  CHECK_EQ(dim, input.value->getWidth());
-  startPositions_ =
-      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
-  auto starts = startPositions_->getVector(false);
-  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
-  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
-
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no sequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new sequenceStartPositions.
-   */
-  if (type_) {
-    CHECK(input.subSequenceStartPositions)
-        << "when trans_type = seq, input must hasSubseq";
-    output_.degradeSequence(input);
-  }
-  if (stride_ > 0) {
-    CHECK_EQ(input.hasSubseq(), 0UL)
-        << "sequence stride pooling is invalid for hasSubseq now";
-    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
-    newBatchSize_ = startPositions_->getSize() - 1;
-  }
-
-  resetOutput(newBatchSize_, dim);
-}
-
-void SequencePoolLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
deleted file mode 100644
index 254e4cc6b3aacf21565cb03e5bdb52a2beb9fea8..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
- *
- * Input: one or more sequences. Each sequence contains some instances.
- * If SequenceLevel = kNonSeq:
- *    Output: output size is the number of input sequences (NOT input instances)
- *    output[i] = seqlastin/average/max_{for each instance in this
- * sequence}{input[i]}
- *    If stride_ > 0:
- *        Check input sequence must not have sub-sequence
- *        Output: a shorten sequence. Stride is the step size by which we slide
- *                a window upon the input sequence, and the pooling operation
- *                is then applied to each interval independently.
- * If SequenceLevel = kSeq:
- *    Check input sequence must has sub-sequence
- *    Output: output size is the number of input sub-sequences
- *    output[i] = seqlastin/average/max_{for each instance in this
- * sub-sequence}{input[i]}
- *
- * The config file api is pooling_layer.
- */
-
-class SequencePoolLayer : public Layer {
-protected:
-  int type_;
-  std::unique_ptr<Weight> biases_;
-  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
-  size_t newBatchSize_;
-  ICpuGpuVectorPtr startPositions_;
-  int stride_;
-  // Whether the input sequence is reversed or not.
-  bool reversed_ = false;
-
-public:
-  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp
deleted file mode 100644
index fb96669917236b98809f1cda0d023600f1e76731..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- *  A layer for reshaping the sequence. Assume the input sequence has
- *  T instances, the dimension of each instance is M, and the input
- *  reshape_dim is N, then the output sequence has T*M/N instances,
- *  the dimension of each instance is N.
- *
- *  Note that T*M/N must be an integer.
- */
-
-class SequenceReshapeLayer : public Layer {
-protected:
-  std::unique_ptr<Weight> biases_;
-
-  MatrixPtr reshapedOutputGrad;
-
-public:
-  explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(seqreshape, SequenceReshapeLayer);
-
-bool SequenceReshapeLayer::init(const LayerMap& layerMap,
-                                const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(1U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequenceReshapeLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& input = getInput(0);
-
-  size_t inDim = input.value->getWidth();
-  size_t outDim = getSize();
-
-  size_t numSequences = input.getNumSequences();
-
-  // by default, we assume each instance as a sequence
-  IVectorPtr seqStarts;
-  IVector::resizeOrCreate(seqStarts, input.getBatchSize() + 1, false);
-  int* startsData = seqStarts->getData();
-  for (int i = 0; i < input.getBatchSize() + 1; i++) {
-    startsData[i] = i;
-  }
-  const int* starts = startsData;
-
-  // if there is sequence, then use start positions
-  if (input.sequenceStartPositions) {
-    auto startPositions = input.sequenceStartPositions->getVector(false);
-    starts = startPositions->getData();
-    CHECK_EQ(starts[numSequences], input.getBatchSize());
-    CHECK_EQ(numSequences, startPositions->getSize() - 1);
-  }
-
-  for (size_t seqID = 0; seqID < numSequences; seqID++) {
-    size_t inNumIns = starts[seqID + 1] - starts[seqID];
-    size_t outNumIns = inNumIns * inDim / outDim;
-    CHECK_EQ(outNumIns * outDim, inNumIns * inDim);
-  }
-
-  MatrixPtr inputValue = getInputValue(0);
-
-  // reset output
-  reserveOutput(inputValue->getHeight() * inDim / outDim, outDim);
-  MatrixPtr outputValue = getOutputValue();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SequenceReshapeLayerForward", getName().c_str());
-
-    outputValue->copyFrom(*inputValue);
-
-    // modify the sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences + 1, false);
-
-    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
-
-    for (size_t seqId = 0; seqId < numSequences + 1; ++seqId) {
-      tgtBuf[seqId] = starts[seqId] * inDim / outDim;
-    }
-  }
-
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void SequenceReshapeLayer::backward(const UpdateCallback& callback) {
-  /* activation */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  MatrixPtr inputGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  AsyncGpuBlock asyncGpuBlock;
-  REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str());
-
-  if (inputGrad) {
-    Matrix::resizeOrCreate(reshapedOutputGrad,
-                           inputGrad->getHeight(),
-                           inputGrad->getWidth(),
-                           false,
-                           useGpu_);
-    reshapedOutputGrad->copyFrom(*outputGrad);
-    inputGrad->add(*reshapedOutputGrad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceSliceLayer.cpp b/paddle/gserver/layers/SequenceSliceLayer.cpp
deleted file mode 100644
index 1b7c33477ea64c1cdb7c8e85d7a5302b299d7552..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SequenceSliceLayer.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-class SequenceSliceLayer : public Layer {
-public:
-  explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-private:
-  /*
-   * TODO(caoying)
-   * In PaddePaddle, currently all matrices are real number types,
-   * but the second and the (optional) third input which are some
-   * selected indices of the give sequence to trim the sequence, are actually
-   * filled with int types so that storing int types information in real number
-   * matrices is very dangerous, since real numbers will be convered to int
-   * types. If a user fills this matrix himself, invalid data may occor.
-   */
-
-  MatrixPtr startIdsOnCpu_;
-  MatrixPtr endIdsOnCpu_;
-
-  std::vector<int> selectedRows_;
-  IVectorPtr rowIndice_;
-  std::vector<std::vector<int>> inputSeqInfoVec_;
-  std::vector<int> outSubSeqStartPos_;
-  std::vector<int> outSeqStartPos_;
-
-  void checkInputs();
-  void copySliceIdsToCpu();
-  void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
-};
-
-REGISTER_LAYER(seq_slice, SequenceSliceLayer);
-
-bool SequenceSliceLayer::init(const LayerMap& layerMap,
-                              const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_GE(inputLayers_.size(), 2U);
-  CHECK_LE(inputLayers_.size(), 3U);
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SequenceSliceLayer::checkInputs() {
-  const Argument& inputSeq = getInput(0);
-  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
-                           << "must be a sequence.";
-  const MatrixPtr indices1 = getInputValue(1);
-  CHECK_EQ(
-      indices1->getHeight(),
-      static_cast<size_t>(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
-                                               : inputSeq.getNumSequences()))
-      << "Height of the second input should be equal to number of sequence "
-      << "in the first input.";
-  if (inputLayers_.size() == 3) {
-    const MatrixPtr indices2 = getInputValue(2);
-    CHECK_EQ(indices2->getHeight(), indices1->getHeight())
-        << "start indices and end indices should have the same height.";
-    CHECK_EQ(indices2->getWidth(), indices1->getWidth())
-        << "start indices and end indices should have the same Width.";
-  }
-}
-
-void SequenceSliceLayer::copySliceIdsToCpu() {
-  const MatrixPtr indices1 = getInputValue(1);
-  if (inputLayers_.size() == 2U) {
-    if (config_.select_first()) {
-      Matrix::resizeOrCreate(startIdsOnCpu_,
-                             indices1->getHeight(),
-                             indices1->getWidth(),
-                             false /* trans */,
-                             false /* useGpu */);
-      startIdsOnCpu_->copyFrom(*indices1);
-      endIdsOnCpu_ = nullptr;
-    } else {
-      Matrix::resizeOrCreate(endIdsOnCpu_,
-                             indices1->getHeight(),
-                             indices1->getWidth(),
-                             false /* trans */,
-                             false /* useGpu */);
-      endIdsOnCpu_->copyFrom(*indices1);
-      startIdsOnCpu_ = nullptr;
-    }
-  } else if (inputLayers_.size() == 3U) {
-    Matrix::resizeOrCreate(startIdsOnCpu_,
-                           indices1->getHeight(),
-                           indices1->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    startIdsOnCpu_->copyFrom(*indices1);
-
-    const MatrixPtr indices2 = getInputValue(2);
-    Matrix::resizeOrCreate(endIdsOnCpu_,
-                           indices2->getHeight(),
-                           indices2->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    endIdsOnCpu_->copyFrom(*indices2);
-  }
-}
-
-void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
-                                         const MatrixPtr ends) {
-  CHECK(starts || ends) << "At least one of the start or end indices "
-                        << "should be given.";
-
-  bool hasSubseq = getInput(0).hasSubseq();
-
-  outSeqStartPos_.resize(1, 0);
-  outSubSeqStartPos_.resize(1, 0);
-  selectedRows_.clear();
-
-  size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
-  size_t rowIdx = 0;
-  for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
-    for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
-      for (size_t k = 0; k < beamSize; ++k) {
-        if (starts && starts->getElement(rowIdx, k) == -1.) break;
-        if (ends && ends->getElement(rowIdx, k) == -1.) break;
-
-        int begPos = inputSeqInfoVec_[i][j];
-        if (starts) begPos += starts->getElement(rowIdx, k);
-
-        int endPos = inputSeqInfoVec_[i][j + 1] - 1;
-        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
-
-        int seqLen = endPos - begPos + 1;
-        CHECK_GT(seqLen, 0);
-        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
-        hasSubseq
-            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
-            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
-      }
-      rowIdx++;
-    }
-    if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back());
-  }
-
-  if (useGpu_) {
-    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
-    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
-  } else {
-    rowIndice_ =
-        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
-  }
-
-  // create the sequence information for the output.
-  ICpuGpuVector::resizeOrCreate(
-      output_.sequenceStartPositions, outSeqStartPos_.size(), false);
-  output_.sequenceStartPositions->copyFrom(
-      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
-
-  if (hasSubseq) {
-    ICpuGpuVector::resizeOrCreate(
-        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
-    output_.subSequenceStartPositions->copyFrom(
-        outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
-  }
-}
-
-void SequenceSliceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  checkInputs();
-
-  const Argument& inputSeq = getInput(0);
-  inputSeqInfoVec_.clear();
-  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
-                              inputSeq.subSequenceStartPositions,
-                              inputSeqInfoVec_);
-  if (!useGpu_) {
-    if (inputLayers_.size() == 2U) {
-      startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
-      endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
-    } else if (inputLayers_.size() == 3U) {
-      startIdsOnCpu_ = getInputValue(1);
-      endIdsOnCpu_ = getInputValue(2);
-    }
-  } else {
-    copySliceIdsToCpu();
-  }
-
-  /*
-   * calculate the selected row indices in a batch, and build the output
-   * sequence information.
-   */
-  calSelectedRows(startIdsOnCpu_, endIdsOnCpu_);
-
-  resetOutput(selectedRows_.size(), getSize());
-
-  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
-}
-
-void SequenceSliceLayer::backward(const UpdateCallback& callback) {
-  getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceToBatch.h b/paddle/gserver/layers/SequenceToBatch.h
deleted file mode 100644
index 8743a5ef10f61970d3d48b105b9da29bcd10ba83..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SequenceToBatch.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-
-/*
- * This class can used to modify the matrix structure of sequence matrix into
- * batch structure.
- * sequence matrix: [C1_s ... Cn_s | ...... | C1_t ... Cn_t]
- * batch matrix:    [C1_s ... C1_t | ...... | Cn_s ... Cn_t]
- * Cn_s is the state for sequence s at time n.
- *
- * Exampel:  sequence matrix = {{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}}
- *           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
- *           batch matrix = {{1, 0, 2}, {1, 0, 2}, {1, 0, 2}, {1, 0}, {1}}
- *           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
- *
- * Use:
- * Input: seqMatrix, seqStarts(Sequence Start Positions)
- * Output: batchMatrix
- * 1. SequenceToBatch seq2batch;
- * 2. seq2batch.resizeOrCreateBatch(seqStarts);     // calculate seq2BatchIdx
- * 3. seq2batch.copy(seqMatrix, batchMatrix, true); // copy seq to batch matrix
- *
- */
-class SequenceToBatch {
-public:
-  explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {}
-
-  /* resize and calculate the batchIndex_ */
-  void resizeOrCreateBatch(int batchSize,
-                           size_t numSequences,
-                           const int *seqStarts,
-                           bool reversed,
-                           bool prevBatchState = false);
-
-  /* sequence matrix and batch matrix copy:
-   * seq2batch: copy(seqValue, batchValue, true);
-   * batch2seq: copy(seqValue, batchValue, false);
-   */
-  void copy(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
-  /* sequence/batch matrix add to batch/sequence matrix */
-  void add(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
-  MatrixPtr getBatchValue(Matrix &batchValue, int batchId, int numRows = 0);
-
-  size_t getNumBatch() const { return numBatch_; }
-
-  /* resize or create a batch matrix(batchValue_) */
-  void resizeOrCreate(Matrix &seqValue);
-  /* copy seqValue to batchValue_ */
-  void copyFromSeq(Matrix &seqValue);
-  /* copy batchValue_ to seqValue */
-  void copyBackSeq(Matrix &seqValue);
-  MatrixPtr getBatchValue(int batchId, int numRows = 0);
-  MatrixPtr getBatchValue() { return batchValue_; }
-  /*tranfer preBatchOutput to batch struct*/
-  void prevOutput2Batch(Matrix &src, Matrix &dst);
-  /*get sequence output from batch struct*/
-  void getSeqOutputFromBatch(Matrix &sequence, Matrix &batch);
-
-  /* Copy the index from another seq2batch. */
-  void shareIndexWith(const SequenceToBatch &seq2batch) {
-    CHECK(useGpu_ == seq2batch.useGpu_);
-    batchStartPositions_ = seq2batch.batchStartPositions_;
-    seq2BatchIdx_ = seq2batch.seq2BatchIdx_;
-    cpuSeq2BatchIdx_ = seq2batch.cpuSeq2BatchIdx_;
-    numBatch_ = seq2batch.numBatch_;
-  }
-
-protected:
-  void sequence2BatchCopy(Matrix &batch,
-                          Matrix &sequence,
-                          IVector &seq2BatchIdx,
-                          bool seq2batch);
-  void sequence2BatchAdd(Matrix &batch,
-                         Matrix &sequence,
-                         IVector &seq2BatchIdx,
-                         bool seq2batch);
-
-  IVectorPtr batchStartPositions_;
-  IVectorPtr seq2BatchIdx_;
-  IVectorPtr cpuSeq2BatchIdx_;
-  IVectorPtr cpuSeqIdx_;
-  IVectorPtr cpuSeqEndIdxInBatch_;
-  IVectorPtr seqIdx_;
-  IVectorPtr seqEndIdxInBatch_;
-  size_t numBatch_;
-  bool useGpu_;
-  MatrixPtr batchValue_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/gserver/layers/SliceProjection.cpp
deleted file mode 100644
index 5627ad1eb3a49a73261bc2197cbd3735489509d2..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SliceProjection.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * SliceProjection can slice the input value into multiple parts,
- * and then select some of them to merge into a new output.
- *
- * First, calculate the slices that need to be merged into the output.
- * slices = input.slices().for_output()
- *
- * Second, merge each slice into the output.
- * for(auto slice: slices) {
- *   out.addAtOffset(slice, offset);
- * }
- *
- * Input slices as output: s0, s1, ...:
- *   -----------------------
- *   |///|   |//////|      |
- *   |/s0|   |//s1//|      |
- *   |///|   |//////|      |
- *   -----------------------
- * Output, merge s0, s1, ... into one output:
- *   ----------------
- *   |///|//////|   |
- *   |/s0|//s1//|...|
- *   |///|//////|   |
- *   ----------------
- *
- * The config file api is slice_projection.
- */
-class SliceProjection : public Projection {
-public:
-  SliceProjection(const ProjectionConfig& config,
-                  const ParameterPtr& parameter,
-                  bool useGpu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
-protected:
-  std::vector<std::pair<size_t, size_t>> slices_;
-};
-
-REGISTER_PROJECTION(slice, SliceProjection);
-
-/**
- * Constructed function.
- * @note SliceProjection should not have any parameter.
- */
-SliceProjection::SliceProjection(const ProjectionConfig& config,
-                                 const ParameterPtr& parameter,
-                                 bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(!parameter) << "'slice' projection should not have any parameter";
-
-  slices_.reserve(config.slices_size());
-  for (const auto& slice : config.slices()) {
-    slices_.push_back(std::make_pair(slice.start(), slice.end()));
-  }
-}
-
-void SliceProjection::forward() {
-  size_t offset = 0;
-  for (auto& slice : slices_) {
-    auto slice_out = in_->value->subColMatrix(slice.first, slice.second);
-    out_->value->addAtOffset(*slice_out, offset);
-    offset += slice_out->getWidth();
-  }
-}
-
-void SliceProjection::backward(const UpdateCallback& callback) {
-  if (in_->grad) {
-    size_t offset = 0;
-    for (auto& slice : slices_) {
-      auto slice_out = in_->grad->subColMatrix(slice.first, slice.second);
-      slice_out->addAtOffset(*out_->grad, offset);
-      offset += slice_out->getWidth();
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp
deleted file mode 100644
index c94a07e5da7442bba1ce7e9c09c4ffea3e5cd4ac..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief A layer for applying a slope and an intercept to the input
- * element-wise.
- * This layer is used in NEURAL TURING MACHINE.
- * @note There is no activation and weight in this layer.
- *
- * \f[
- *    y = ax + b
- * \f]
- *
- * Here, a is scale and b is offset, which are provided as attributes of the
- * layer.
- *
- * The config file api is slope_intercept_layer.
- */
-
-class SlopeInterceptLayer : public Layer {
-public:
-  explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(slope_intercept, SlopeInterceptLayer);
-
-bool SlopeInterceptLayer::init(const LayerMap& layerMap,
-                               const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-
-  return true;
-}
-
-void SlopeInterceptLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-
-  /* malloc memory for the output_ if necessary */
-  size_t batchSize = inV->getHeight();
-  size_t size = getSize();
-
-  CHECK_EQ(size, inV->getWidth());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    reserveOutput(batchSize, size);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str());
-    outV->mulScalar(*inV, config_.slope());
-    outV->add(config_.intercept());
-  }
-}
-
-void SlopeInterceptLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outG = getOutputGrad();
-
-  if (inG) {
-    REGISTER_TIMER_INFO("BwSlopeInterceptTimer", getName().c_str());
-    inG->add(*outG, config_.slope());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
deleted file mode 100644
index 6cb5fdf83e2b88ce4adb392807a1fdbac253c51c..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "PoolProjection.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-/**
- * @brief A layer for spatial pyramid pooling on the input image by taking
- * the max, average, etc. within regions, so that the result vector of
- * different sized images are of the same size.
- *
- * The config file api is spp_layer.
- */
-
-class SpatialPyramidPoolLayer : public Layer {
-protected:
-  size_t channels_;
-  size_t imgSizeW_;
-  size_t imgSizeH_;
-  size_t pyramidHeight_;
-  std::string poolType_;
-
-  std::vector<std::unique_ptr<PoolProjection>> poolProjections_;
-  std::vector<Argument> projOutput_;
-  std::vector<std::pair<size_t, size_t>> projCol_;
-
-public:
-  explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  ProjectionConfig getConfig(size_t sizeX_,
-                             size_t sizeY_,
-                             size_t channels,
-                             size_t pyamidLevel_,
-                             std::string& poolType_);
-  size_t getSize();
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
deleted file mode 100644
index db240ab0c96510263d90b291f6396ac51a73fbbd..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ /dev/null
@@ -1,187 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-class SubNestedSequenceLayer : public Layer {
-public:
-  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-private:
-  /*
-   * This functions generates the indices of rows in a batch according to the
-   * indices of selected sub-sequence in each sequence.
-   *
-   * Examples:
-   * selectedIndices:
-   *   [
-   *     [0, 1, -1],
-   *     [0, 1, 2],
-   *     [0, -1, -1],
-   *     [0, 2, 3],
-   *   ]
-   * inputSeqInfo:
-   *   [
-   *     [0,3,4],
-   *     [4,5,7,10,15],
-   *     [15,20],
-   *     [20,22,23,25,28]
-   *   ]
-   *
-   * ths output is saved to private member rowIndice_;
-   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
-   */
-
-  void calSelectedRows(const MatrixPtr selectedIndices,
-                       const std::vector<std::vector<int>>& inputSeqInfo);
-
-  /*
-   * TODO(caoying)
-   * In PaddePaddle, currently all matrices are real number types,
-   * but the second is some selected indices of the give sequence to trim
-   * the nested sequence, are actually filled with int types so that storing
-   * int types information in real number matrices is very dangerous, since
-   * real numbers will be convered to int types. If a user fills this matrix
-   * himself, invalid data may occor.
-   *
-   * if the second input of this layer is on GPU memory, copy it to CPU memory.
-   */
-  MatrixPtr selIdsCpu_;
-
-  /*
-   * reorganize sequenceStartPositions and subSequenceStartPositions
-   * into a 2d vector to facilitate the sequence selection process.
-   */
-  std::vector<std::vector<int>> inputSeqInfoVec_;
-
-  /* store the final selected row indices in a batch */
-  IVectorPtr rowIndice_;
-  /* rowIndice_ and selectedRows_ actually share a same memory. */
-  std::vector<int> selectedRows_;
-};
-
-REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
-
-bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
-                                  const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  CHECK_EQ(2U, inputLayers_.size());
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SubNestedSequenceLayer::calSelectedRows(
-    const MatrixPtr selectedIndices,
-    const std::vector<std::vector<int>>& inputSeqInfo) {
-  selectedRows_.clear();
-
-  std::vector<int> outSeqStartInfo(1, 0);
-  std::vector<int> outSubSeqStartInfo(1, 0);
-
-  size_t seqNum = selectedIndices->getHeight();
-  size_t beamSize = selectedIndices->getWidth();
-  for (size_t i = 0; i < seqNum; ++i) {
-    for (size_t j = 0; j < beamSize; ++j) {
-      if (selectedIndices->getElement(i, j) == -1.) break;
-      size_t selSubSeqIdx = selectedIndices->getElement(i, j);
-      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
-
-      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
-                         inputSeqInfoVec_[i][selSubSeqIdx];
-      for (size_t k = 0; k < subSeqLen; ++k)
-        selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k);
-      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen);
-    }
-    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
-  }
-
-  if (useGpu_) {
-    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
-    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
-  } else {
-    rowIndice_ =
-        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
-  }
-
-  // create the sequence information for the output.
-  ICpuGpuVector::resizeOrCreate(
-      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
-  output_.sequenceStartPositions->copyFrom(
-      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
-
-  ICpuGpuVector::resizeOrCreate(
-      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
-  output_.subSequenceStartPositions->copyFrom(
-      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
-}
-
-void SubNestedSequenceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  const Argument& inputSeq = getInput(0);
-  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
-                              << "must be a nested sequence.";
-  const MatrixPtr selectedIndices = getInputValue(1);
-  CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight());
-
-  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
-    /*
-     * Currently, the second input for this layer is generated by
-     * kmax_sequence_score_layer whose output is always stored on CPU,
-     * or a data_layer which canbe on GPU.
-     *
-     * If the second input is on GPU, copy it to CPU memory, because this
-     * input always uses very few memory, and operations related to it are
-     * all logic control, not computations.
-     */
-    Matrix::resizeOrCreate(selIdsCpu_,
-                           selectedIndices->getHeight(),
-                           selectedIndices->getWidth(),
-                           false /* trans */,
-                           false /* useGpu */);
-    selIdsCpu_->copyFrom(*selectedIndices);
-  } else {
-    selIdsCpu_ = selectedIndices;
-  }
-
-  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
-                              inputSeq.subSequenceStartPositions,
-                              inputSeqInfoVec_);
-  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
-
-  resetOutput(selectedRows_.size(), getSize());
-  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
-}
-
-void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inputSeqGrad = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-
-  if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
deleted file mode 100644
index 808627f09273950bb6f52a4a6e497bcb8ea170f7..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for taking the subsequence according to given offset and size
- * Input: original sequence, offset, size
- * Output: subsequence
- */
-
-class SubSequenceLayer : public Layer {
-protected:
-  std::unique_ptr<Weight> biases_;
-  MatrixPtr tmpSrc_;
-  MatrixPtr tmpDest_;
-
-public:
-  explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(subseq, SubSequenceLayer);
-
-bool SubSequenceLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  // sequene concatenation layer should have exactly 2 inputs
-  CHECK_EQ(3U, inputLayers_.size());
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  tmpSrc_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  tmpDest_ =
-      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-
-  setNeedSequenceInfo(false);
-  return true;
-}
-
-void SubSequenceLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  size_t dim = getSize();
-
-  const Argument& input = getInput(0);
-  size_t numSequences1 = input.getNumSequences();
-  auto startPositions1 = input.sequenceStartPositions->getVector(false);
-
-  const Argument& offsetSeq = getInput(1);
-  size_t numSequences2 = offsetSeq.getNumSequences();
-  auto startPositions2 = offsetSeq.sequenceStartPositions->getVector(false);
-
-  const Argument& sizeSeq = getInput(2);
-  size_t numSequences3 = sizeSeq.getNumSequences();
-  auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false);
-
-  CHECK_EQ(dim, input.value->getWidth());
-
-  CHECK_EQ(startPositions1->getData()[numSequences1], input.getBatchSize());
-  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
-
-  CHECK_EQ(startPositions2->getData()[numSequences2], offsetSeq.getBatchSize());
-  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
-
-  CHECK_EQ(startPositions3->getData()[numSequences3], sizeSeq.getBatchSize());
-  CHECK_EQ(numSequences3, startPositions3->getSize() - 1);
-
-  CHECK_EQ(numSequences1, numSequences2);
-  CHECK_EQ(numSequences2, numSequences3);
-
-  MatrixPtr inputValue = input.value;
-  IVectorPtr offsetValue;
-  IVectorPtr sizeValue;
-
-  if (useGpu_) {
-    // copy to cpu
-    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
-    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
-    offsetValue->copyFrom(*offsetSeq.ids);
-    sizeValue->copyFrom(*sizeSeq.ids);
-  } else {
-    offsetValue = offsetSeq.ids;
-    sizeValue = sizeSeq.ids;
-  }
-
-  CHECK_EQ(offsetValue->getSize(), numSequences1);
-  CHECK_EQ(sizeValue->getSize(), numSequences1);
-
-  int* offsets = offsetValue->getData();
-  int* sizes = sizeValue->getData();
-
-  // get total height of output
-  size_t height = 0;
-  for (size_t seqId = 0; seqId < numSequences1; seqId++) {
-    height += sizes[seqId];
-  }
-
-  // reset output
-  resetOutput(height, dim);
-
-  MatrixPtr outputValue = getOutputValue();
-
-  const int* starts1 = startPositions1->getData();
-
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SubSequenceLayerForward", getName().c_str());
-
-    size_t offsetIn = 0;
-    size_t offsetOut = 0;
-    size_t size = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      offsetIn = starts1[seqId] + offsets[seqId];
-      size = sizes[seqId];
-
-      outputValue->subMatrix(offsetOut, size, tmpDest_)
-          ->assign(*(inputValue->subMatrix(offsetIn, size, tmpSrc_)));
-
-      offsetOut += size;
-    }
-
-    // modify the sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(
-        output_.sequenceStartPositions, numSequences1 + 1, false);
-
-    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
-    int offset = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      tgtBuf[seqId] = offset;
-      offset += sizes[seqId];
-    }
-    tgtBuf[numSequences1] = offset;
-  }
-
-  if (biases_.get() != NULL) {
-    MatrixPtr outV = getOutputValue();
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void SubSequenceLayer::backward(const UpdateCallback& callback) {
-  /* activation */
-  backwardActivation();
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  MatrixPtr inputGrad1 = getInputGrad(0);
-  MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
-  size_t numSequences1 = startPositions1->getSize() - 1;
-  const int* starts1 = startPositions1->getData();
-
-  const Argument& offsetSeq = getInput(1);
-  const Argument& sizeSeq = getInput(2);
-  IVectorPtr offsetValue;
-  IVectorPtr sizeValue;
-
-  if (useGpu_) {
-    // copy to cpu
-    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
-    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
-    offsetValue->copyFrom(*offsetSeq.ids);
-    sizeValue->copyFrom(*sizeSeq.ids);
-  } else {
-    offsetValue = offsetSeq.ids;
-    sizeValue = sizeSeq.ids;
-  }
-
-  int* offsets = offsetValue->getData();
-  int* sizes = sizeValue->getData();
-  {
-    AsyncGpuBlock asyncGpuBlock;
-    REGISTER_TIMER_INFO("SubSequenceLayerBackward", getName().c_str());
-
-    int offsetIn = 0;
-    int offsetOut = 0;
-    int size = 0;
-    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
-      offsetIn = starts1[seqId] + offsets[seqId];
-      size = sizes[seqId];
-
-      inputGrad1->subMatrix(offsetIn, size, tmpDest_)
-          ->add(*(outputGrad->subMatrix(offsetOut, size, tmpSrc_)));
-      offsetOut += size;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp
deleted file mode 100644
index ffbe14925300ad1ffbd33f43a6c0afadddd231e6..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SumToOneNormLayer.cpp
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * A layer for sum-to-one normalization,
- * which is used in NEURAL TURING MACHINE.
- * \f[
- *   out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]}
- * \f]
- * where \f$in\f$ is a (batchSize x dataDim) input vector,
- * and \f$out\f$ is a (batchSize x dataDim) output vector.
- *
- * The config file api is sum_to_one_norm_layer.
- */
-
-class SumToOneNormLayer : public Layer {
-protected:
-  /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$
-  MatrixPtr reciprocalRowSum_;
-  /// dotSum = output_.grad \f$.*\f$ output_.value
-  MatrixPtr dotSum_;
-
-public:
-  explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-
-REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer);
-
-bool SumToOneNormLayer::init(const LayerMap& layerMap,
-                             const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-
-  CHECK_EQ(inputLayers_.size(), 1U);
-
-  return true;
-}
-
-void SumToOneNormLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr inV = getInputValue(0);
-
-  /* malloc memory for the output_ if necessary */
-  size_t batchSize = inV->getHeight();
-  size_t dataDim = getSize();
-
-  CHECK_EQ(dataDim, inV->getWidth());
-
-  {
-    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
-    resetOutput(batchSize, dataDim);
-  }
-
-  MatrixPtr outV = getOutputValue();
-  {
-    REGISTER_TIMER_INFO("FwSumToOneNormTimer", getName().c_str());
-
-    Matrix::resizeOrCreate(reciprocalRowSum_, batchSize, 1, false, useGpu_);
-    inV->rowSum(*reciprocalRowSum_);
-
-    // todo: matrix checks
-    CHECK_GT(reciprocalRowSum_->getMin(), 0.0);
-
-    reciprocalRowSum_->scalarDiv(*reciprocalRowSum_, 1.0);
-
-    // outV = inV * reciprocalRowSum
-    outV->rowScale(0, *inV, *reciprocalRowSum_);
-  }
-}
-
-void SumToOneNormLayer::backward(const UpdateCallback& callback) {
-  MatrixPtr inV = getInputValue(0);
-  MatrixPtr inG = getInputGrad(0);
-  MatrixPtr outV = getOutputValue();
-  MatrixPtr outG = getOutputGrad();
-
-  size_t batchSize = inV->getHeight();
-
-  if (inG) {
-    REGISTER_TIMER_INFO("BwSumToOneTimer", getName().c_str());
-
-    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
-
-    // dotSum = outG .* outV
-    dotSum_->zeroMem();
-    dotSum_->rowDotMul(0, *outG, *outV);
-
-    // inG += -1 * (dotSum / rowSum)
-    dotSum_->dotMul(*dotSum_, *reciprocalRowSum_);
-    inG->rowAdd(0, *inG, *dotSum_, -1.0);
-    // inG += outG * (1/rowSum)
-    inG->addRowScale(0, *outG, *reciprocalRowSum_);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SwitchOrderLayer.cpp b/paddle/gserver/layers/SwitchOrderLayer.cpp
deleted file mode 100644
index 704735de38bd373c0714de6bb4e139d1505c5451..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SwitchOrderLayer.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SwitchOrderLayer.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(switch_order, SwitchOrderLayer);
-
-bool SwitchOrderLayer::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  auto& img_conf = config_.inputs(0).image_conf();
-  size_t inD = img_conf.img_size_z();
-  size_t inH =
-      img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
-  size_t inW = img_conf.img_size();
-  size_t inC = img_conf.channels();
-  inH = inH * inD;
-  inDims_ = TensorShape({0, inC, inH, inW});
-  outDims_ = TensorShape(4);
-
-  auto& reshape_conf = config_.reshape_conf();
-  for (int i = 0; i < reshape_conf.height_axis_size(); i++) {
-    heightAxis_.push_back(reshape_conf.height_axis(i));
-  }
-  for (int i = 0; i < reshape_conf.width_axis_size(); i++) {
-    widthAxis_.push_back(reshape_conf.width_axis(i));
-  }
-  createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig());
-  createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig());
-  return true;
-}
-
-void SwitchOrderLayer::setOutDims() {
-  outDims_.setDim(0, inDims_[0]);
-  outDims_.setDim(1, inDims_[2]);
-  outDims_.setDim(2, inDims_[3]);
-  outDims_.setDim(3, inDims_[1]);
-  reshapeHeight_ = 1;
-  for (size_t i = 0; i < heightAxis_.size(); i++) {
-    reshapeHeight_ *= outDims_[heightAxis_[i]];
-  }
-  output_.setFrameHeight(reshapeHeight_);
-  reshapeWidth_ = 1;
-  for (size_t i = 0; i < widthAxis_.size(); i++) {
-    reshapeWidth_ *= outDims_[widthAxis_[i]];
-  }
-  output_.setFrameWidth(reshapeWidth_);
-}
-
-void SwitchOrderLayer::setInDims() {
-  MatrixPtr input = inputLayers_[0]->getOutputValue();
-  size_t batchSize = input->getHeight();
-  inDims_.setDim(0, batchSize);
-  int d = inputLayers_[0]->getOutput().getFrameDepth();
-  d = (d == 0 ? 1 : d);
-  int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_.setDim(2, h * d);
-  int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_.setDim(3, w);
-  int totalCount = input->getElementCnt();
-  int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]);
-  if (channels != 0) inDims_.setDim(1, channels);
-}
-
-void SwitchOrderLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  setInDims();
-  setOutDims();
-  resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]);
-  if (heightAxis_.size() > 0) {
-    resetOutput(reshapeHeight_, reshapeWidth_);
-  }
-
-  // switch NCHW to NHWC
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getInputValue(0), inDims_);
-  outputs.addArg(*getOutputValue(), outDims_);
-  nchw2nhwc_[0]->calc(inputs, outputs);
-  forwardActivation();
-}
-
-void SwitchOrderLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-  backwardActivation();
-
-  // switch NHWC to NCHW
-  BufferArgs inputs;
-  BufferArgs outputs;
-  inputs.addArg(*getOutputGrad(), outDims_);
-  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
-  nhwc2nchw_[0]->calc(inputs, outputs);
-}
-}  // namespace paddle
diff --git a/paddle/gserver/layers/SwitchOrderLayer.h b/paddle/gserver/layers/SwitchOrderLayer.h
deleted file mode 100644
index 882437f4434c2e61a5b08328d2f79c1e7f589204..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/SwitchOrderLayer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * \brief  This layer calculate softmax in image channel dimension.
- */
-class SwitchOrderLayer : public Layer {
-public:
-  explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}
-
-  ~SwitchOrderLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-  void setInDims();
-  void setOutDims();
-
-protected:
-  std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;
-  std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;
-  TensorShape inDims_;
-  TensorShape outDims_;
-  std::vector<int> heightAxis_;
-  std::vector<int> widthAxis_;
-  size_t reshapeHeight_;
-  size_t reshapeWidth_;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/TableProjection.h b/paddle/gserver/layers/TableProjection.h
deleted file mode 100644
index ffb05e68f068a7b9abb0db5cea6133e64300cb55..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/TableProjection.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Projection.h"
-
-namespace paddle {
-
-/**
- * Table projection takes index data input. It select rows from parameter
- * where row_id is in input_ids:
- * \f[
- *   out.row[i] += table.row[ids[i]]
- * \f]
- * where \f$out\f$ is out, \f$table\f$ is parameter, \f$ids\f$ is input_ids,
- * and \f$i\f$ is row_id.
- *
- * The config file api is table_projection.
- *
- * @note If \f$ids[i] = -1\f$, it will be ignored.
- */
-class TableProjection : public Projection {
-public:
-  TableProjection(const ProjectionConfig& config,
-                  const ParameterPtr& parameter,
-                  bool useGpu);
-  /**
-   * If use sparse row matrix as parameter, prefetch feature ids in input label.
-   */
-  virtual void prefetch(const Argument* in);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
-protected:
-  std::unique_ptr<Weight> table_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/TensorLayer.cpp b/paddle/gserver/layers/TensorLayer.cpp
deleted file mode 100644
index b2271c63ef76d85574cf7f71b18aef4239938b8e..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/TensorLayer.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TensorLayer.h"
-
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-REGISTER_LAYER(tensor, TensorLayer);
-
-bool TensorLayer::init(const LayerMap& layerMap,
-                       const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* initialize the weightList */
-  CHECK_EQ(inputLayers_.size(), 2LU);
-  CHECK(parameters_[0]);
-  CHECK(!parameters_[1]);
-
-  // Option the parameters
-  size_t height = inputLayers_[0]->getSize();
-  size_t width = inputLayers_[1]->getSize();
-  CHECK_EQ(width * height * getSize(), parameters_[0]->getSize());
-
-  for (size_t i = 0; i < getSize(); ++i) {
-    // create a new weight
-    Weight* w = new Weight(height, width, parameters_[0], i * width * height);
-
-    // append the new weight to the list
-    weights_.emplace_back(w);
-  }
-
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-
-  return true;
-}
-
-void TensorLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = getInputValue(0)->getHeight();
-  int size = getSize();
-
-  { resetOutput(batchSize, size); }
-
-  MatrixPtr outV = getOutputValue();
-  /* add the bias-vector */
-  if (biases_.get() != NULL) {
-    outV->addBias(*(biases_->getW()), 1);
-  }
-
-  /* e1 * W * trans(e2) */ {
-    MatrixPtr input1 = getInputValue(0);
-    MatrixPtr input2 = getInputValue(1);
-    MatrixPtr tmpMat = Matrix::create(input2->getHeight(),
-                                      input2->getWidth(),
-                                      /* trans= */ false,
-                                      input2->useGpu());
-    REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str());
-    for (size_t i = 0; i < getSize(); ++i) {
-      MatrixPtr weights = weights_[i]->getW();
-      tmpMat->mul(*input1, *weights, 1, 0);
-      outV->rowDotMul(i, *tmpMat, *input2);
-    }
-  }
-
-  /* activation */ { forwardActivation(); }
-}
-
-void TensorLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ { backwardActivation(); }
-
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  bool syncFlag = hl_get_sync_flag();
-
-  /* Calculate the W-gradient for the current layer */
-  MatrixPtr input1 = getInputValue(0);
-  MatrixPtr input2 = getInputValue(1);
-  MatrixPtr oGrad = getOutputGrad();
-  MatrixPtr tmpMat = Matrix::create(input1->getHeight(),
-                                    input1->getWidth(),
-                                    /* trans= */ false,
-                                    input1->useGpu());
-
-  /* trans(grad * e1) * e2 */ {
-    REGISTER_TIMER_INFO("TensorGradMulTimer", getName().c_str());
-    for (size_t i = 0; i < getSize(); ++i) {
-      if (weights_[i]->getWGrad()) {
-        tmpMat->rowScale(i, *input1, *oGrad);
-        MatrixPtr input1_T = tmpMat->getTranspose();
-        weights_[i]->getWGrad()->mul(*input1_T, *input2, 1, 1);
-      }
-    }
-  }
-
-  hl_set_sync_flag(false);
-
-  /* Calculate the input layers error */ {
-    MatrixPtr preGrad1 = getInputGrad(0);
-    MatrixPtr preGrad2 = getInputGrad(1);
-
-    REGISTER_TIMER_INFO("TensorBpMulTimer", getName().c_str());
-    for (size_t i = 0; i < getSize(); ++i) {
-      MatrixPtr weights = weights_[i]->getW();
-
-      if (NULL != preGrad1) { /* (grad * e2) * trans(W) */
-        tmpMat->rowScale(i, *input2, *oGrad);
-        MatrixPtr weights_T = weights->getTranspose();
-        preGrad1->mul(*tmpMat, *weights_T, 1, 1);
-      }
-      if (NULL != preGrad2) { /* (grad * e1) * W */
-        tmpMat->rowScale(i, *input1, *oGrad);
-        preGrad2->mul(*tmpMat, *weights, 1, 1);
-      }
-    }
-  }
-  hl_set_sync_flag(syncFlag);
-  parameters_[0]->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h
deleted file mode 100644
index 8a323aa15f6f3761c45b6ca7e3be8f15621a189e..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/TensorLayer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * @brief TensorLayer takes two input vectors.
- * \f[
- *     y_{i} = x_{1} * W_{i} * x_{2}^{\rm T}, i=0, 1, ...,K-1
- * \f]
- *
- * - \f$x_{1}\f$: the first input, size is M.
- * - \f$x_{2}\f$: the second input, size is N.
- * - y: output, size is K.
- * - \f$y_{i}\f$: i-th element of y.
- * - \f$W_{i}\f$: the i-th learned weight, dimensions: [M, N].
- * - \f$x_{2}^{\rm T}\f$: the transpose of \f$x_{2}\f$.
- *
- * The config file api is tensor_layer.
- */
-
-class TensorLayer : public Layer {
-protected:
-  WeightList weights_;
-  std::unique_ptr<Weight> biases_;
-
-public:
-  explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  Weight& getWeight(int idx) { return *weights_[idx]; }
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/TransLayer.cpp b/paddle/gserver/layers/TransLayer.cpp
deleted file mode 100644
index cf87ca53d1def32708400c507da673c3a6ec0a87..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/TransLayer.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TransLayer.h"
-#include "paddle/utils/Logging.h"
-namespace paddle {
-
-REGISTER_LAYER(trans, TransLayer);
-
-bool TransLayer::init(const LayerMap& layerMap,
-                      const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-
-  /* the size of inputs for trans-layer is 1 */
-  CHECK_EQ(config_.inputs_size(), 1);
-
-  return true;
-}
-
-void TransLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  MatrixPtr input = getInputValue(0);
-  int height = input->getHeight();
-  int width = input->getWidth();
-
-  resizeOutput(width, height);
-
-  MatrixPtr outV = getOutputValue();
-
-  /* outV's memory has been allocated, so memAlloc = false */
-  input->transpose(outV, false);
-  if (getInputGrad(0)) {
-    zeroGrad();
-  }
-}
-
-void TransLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-
-  MatrixPtr outputGrad = getOutputGrad();
-  if (outputGrad == NULL) {
-    return;
-  }
-  MatrixPtr preGrad = getInputGrad(0);
-  if (preGrad) {
-    MatrixPtr transGrad = Matrix::create(preGrad->getHeight(),
-                                         preGrad->getWidth(),
-                                         /* trans= */ false,
-                                         preGrad->useGpu());
-    outputGrad->transpose(transGrad, false);
-    preGrad->add(*transGrad);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h
deleted file mode 100644
index 03d094862459c80aee8899c0352ffce732db08af..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/TransLayer.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Layer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-/**
- * A layer for transposing a minibatch matrix.
- * \f[
-     y = x^\mathrm{T}
- * \f]
- * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
- *
- * The config file api is trans_layer.
- */
-class TransLayer : public Layer {
-public:
-  explicit TransLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override;
-};
-}  // namespace paddle
diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
deleted file mode 100644
index 755389f7074c252c0fad396e629c6ffedc74b531..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Projection.h"
-#include "paddle/utils/Stat.h"
-
-namespace paddle {
-
-/**
- * @brief TransposedFullMatrixProjection performs full matrix multiplication:
- * out.row[i] += in.row[i] * weight.transpose
- *
- * The config file api is trans_full_matrix_projection.
- */
-class TransposedFullMatrixProjection : public Projection {
-public:
-  TransposedFullMatrixProjection(const ProjectionConfig& config,
-                                 ParameterPtr parameter,
-                                 bool useGPu);
-  virtual void forward();
-  virtual void backward(const UpdateCallback& callback);
-
-protected:
-  std::unique_ptr<Weight> weight_;
-};
-
-REGISTER_PROJECTION(trans_fc, TransposedFullMatrixProjection);
-
-TransposedFullMatrixProjection::TransposedFullMatrixProjection(
-    const ProjectionConfig& config, ParameterPtr parameter, bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  weight_.reset(
-      new Weight(config.output_size(), config.input_size(), parameter));
-}
-
-void TransposedFullMatrixProjection::forward() {
-  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  out_->value->mul(*(in_->value), *(weight_->getW()->getTranspose()), 1, 1);
-}
-
-void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) {
-  bool syncFlag = hl_get_sync_flag();
-
-  /* Calculate the W-gradient for the current layer */
-  if (weight_->getWGrad()) {
-    REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
-    weight_->getWGrad()->mul(
-        *(out_->grad->getTranspose()), *(in_->value), 1, 1);
-  }
-
-  // If callback does not change value, backprop error asynchronously so that
-  // we can do the callback concurrently.
-  // This is still a little bit dangerous since theoretically for
-  // SyncMultiGpuMachine it is possible that the value copyback can still
-  // happen at the same time as the error backprop where the value is being
-  // used.
-  hl_set_sync_flag(false);
-
-  /* Calculate the input layers error */
-  if (in_->grad) {
-    REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
-    in_->grad->mul(*(out_->grad), *(weight_->getW()), 1, 1);
-  }
-
-  hl_set_sync_flag(syncFlag);
-  parameter_->incUpdate(callback);
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ValidationLayer.cpp b/paddle/gserver/layers/ValidationLayer.cpp
deleted file mode 100644
index b626825a7b45fdb09cd8f9e8cc6727e218ab2940..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ValidationLayer.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <algorithm>
-#include <fstream>
-#include <memory>
-
-#include "ValidationLayer.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-bool ValidationLayer::init(const LayerMap& layerMap,
-                           const ParameterMap& parameterMap) {
-  return Layer::init(layerMap, parameterMap);
-}
-
-void ValidationLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  MatrixPtr output = getInputValue(*getOutputLayer());
-  CHECK(output);
-  IVectorPtr label = getInputLabel(*getLabelLayer());
-  CHECK(label);
-  validationImp(output, label);
-}
-
-void ValidationLayer::backward(const UpdateCallback& callback) {
-  (void)callback;
-}
-
-bool AucValidation::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
-  bool ret = ValidationLayer::init(layerMap, parameterMap);
-  EvaluatorConfig config;
-  config.set_name(getName());
-  config.set_type("last-column-auc");
-  config.add_input_layers(inputLayers_[0]->getName());
-  config.add_input_layers(inputLayers_[1]->getName());
-  if (3 == inputLayers_.size()) {
-    config.add_input_layers(inputLayers_[2]->getName());
-  }
-  evaluator_.reset(Evaluator::create(config));
-  passBegin_ = false;
-  return ret;
-}
-
-void AucValidation::validationImp(MatrixPtr output, IVectorPtr label) {
-  if (!passBegin_) {
-    passBegin_ = true;
-    evaluator_->start();
-  }
-
-  bool supportWeight = (3 == inputLayers_.size()) ? true : false;
-  MatrixPtr weight = supportWeight ? getInputValue(*inputLayers_[2]) : nullptr;
-  if (dynamic_cast<GpuMatrix*>(output.get())) {
-    size_t height = output->getHeight();
-    size_t width = output->getWidth();
-    Matrix::resizeOrCreate(cpuOutput_,
-                           height,
-                           width,
-                           /* trans=*/false,
-                           /* useGpu=*/false);
-    cpuOutput_->copyFrom(*output);
-    IVector::resizeOrCreate(cpuLabel_, height, false);
-    cpuLabel_->copyFrom(*label);
-
-    if (supportWeight) {
-      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
-      cpuWeight_->copyFrom(*weight);
-    }
-
-    output = cpuOutput_;
-    label = cpuLabel_;
-    weight = cpuWeight_;
-  }
-
-  for (size_t i = 0; i < output->getHeight(); i++) {
-    float y1 = output->getData()[i * output->getWidth() + 1];
-    int* labels = label->getData();
-    predictArray_.push_back(PredictionResult(y1, labels[i]));
-  }
-  std::vector<Argument> arguments;
-  if (3 == inputLayers_.size()) {
-    arguments.resize(3);
-    arguments[2].value = weight;
-  } else {
-    arguments.resize(2);
-  }
-  arguments[0].value = output;
-  arguments[1].ids = label;
-  evaluator_->evalImp(arguments);
-}
-
-void AucValidation::onPassEnd() {
-  if (!FLAGS_predict_file.empty()) {
-    std::ofstream fs(FLAGS_predict_file);
-    CHECK(fs) << "Fail to open " << FLAGS_predict_file;
-    for (auto& res : predictArray_) {
-      fs << res.out << " " << res.label << std::endl;
-    }
-  }
-
-  evaluator_->finish();
-  LOG(INFO) << *evaluator_;
-  passBegin_ = false;
-  predictArray_.clear();
-}
-
-bool PnpairValidation::init(const LayerMap& layerMap,
-                            const ParameterMap& parameterMap) {
-  bool ret = ValidationLayer::init(layerMap, parameterMap);
-  if (!ret) return ret;
-  CHECK_GE(inputLayers_.size(), 3UL);
-  CHECK_LE(inputLayers_.size(), 4UL);
-  EvaluatorConfig config;
-  config.set_name(getName());
-  config.set_type("pnpair");
-  config.add_input_layers(inputLayers_[0]->getName());
-  config.add_input_layers(inputLayers_[1]->getName());
-  config.add_input_layers(inputLayers_[2]->getName());
-  if (4 == inputLayers_.size()) {
-    config.add_input_layers(inputLayers_[3]->getName());
-  }
-  evaluator_.reset(Evaluator::create(config));
-  passBegin_ = false;
-  return true;
-}
-
-void PnpairValidation::validationImp(MatrixPtr output, IVectorPtr label) {
-  if (!passBegin_) {
-    passBegin_ = true;
-    evaluator_->start();
-  }
-  MatrixPtr weight =
-      (4 == inputLayers_.size()) ? getInputValue(*inputLayers_[3]) : nullptr;
-  IVectorPtr info = getInputLabel(*getInfoLayer());
-  std::vector<Argument> arguments;
-  if (4 == inputLayers_.size()) {
-    arguments.resize(4);
-    arguments[3].value = weight;
-  } else {
-    arguments.resize(3);
-  }
-  arguments[0].value = output;
-  arguments[1].ids = label;
-  arguments[2].ids = info;
-  evaluator_->evalImp(arguments);
-}
-
-void PnpairValidation::onPassEnd() {
-  if (!FLAGS_predict_file.empty()) {
-    (dynamic_cast<PnpairEvaluator*>(evaluator_.get()))->printPredictResults();
-  }
-  evaluator_->finish();
-  LOG(INFO) << *evaluator_;
-  passBegin_ = false;
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h
deleted file mode 100644
index f412d685c0541537bd4318fec2dae06215c4afbe..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/ValidationLayer.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-
-#include "Layer.h"
-#include "paddle/gserver/evaluators/Evaluator.h"
-
-DECLARE_int32(trainer_id);
-
-namespace paddle {
-
-class ValidationLayer : public Layer {
-public:
-  explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  LayerPtr getOutputLayer() { return inputLayers_[0]; }
-
-  LayerPtr getLabelLayer() { return inputLayers_[1]; }
-
-  LayerPtr getInfoLayer() {
-    assert(inputLayers_.size() > 2);
-    return inputLayers_[2];
-  }
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback = nullptr) override;
-
-  virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0;
-
-  void onPassEnd() override = 0;
-};
-
-/*
- * AucValidation
- */
-class AucValidation : public ValidationLayer {
-public:
-  explicit AucValidation(const LayerConfig& config)
-      : ValidationLayer(config),
-        cpuOutput_(nullptr),
-        cpuLabel_(nullptr),
-        cpuWeight_(nullptr) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
-
-  void onPassEnd() override;
-
-  struct PredictionResult {
-    PredictionResult(real __out, int __label) : out(__out), label(__label) {}
-    real out;
-    int label;
-  };
-  std::vector<PredictionResult> predictArray_;
-
-private:
-  bool passBegin_;
-  std::unique_ptr<Evaluator> evaluator_;
-  MatrixPtr cpuOutput_;
-  IVectorPtr cpuLabel_;
-  MatrixPtr cpuWeight_;
-};
-
-/*
- * positive-negative pair rate Validation
- */
-class PnpairValidation : public ValidationLayer {
-public:
-  explicit PnpairValidation(const LayerConfig& config)
-      : ValidationLayer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
-
-  void onPassEnd() override;
-
-private:
-  bool passBegin_;
-  std::unique_ptr<Evaluator> evaluator_;
-};
-
-typedef std::shared_ptr<ValidationLayer> ValidationLayerPtr;
-}  // namespace paddle
diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h
deleted file mode 100644
index 6f6be359c0aa46a4f3775f8405e1aa51ca1ae147..0000000000000000000000000000000000000000
--- a/paddle/gserver/layers/WarpCTCLayer.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Layer.h"
-
-namespace paddle {
-
-/**
- * @brief A layer integrating the open-source warp-ctc library
- *        <https://github.com/baidu-research/warp-ctc> to compute connectionist
- *        temporal classification cost.
- *
- * The config file api is warp_ctc_layer.
- */
-class WarpCTCLayer : public Layer {
-public:
-  explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
-  ~WarpCTCLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-
-protected:
-  /**
-   * sequence matrix and batch matrix copy:
-   * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
-   * batch    (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
-   */
-  void seq2batchPadding(const MatrixPtr& seqValue,
-                        MatrixPtr& batchValue,
-                        const ICpuGpuVectorPtr& seqStartPositions);
-  void batch2seqPadding(const MatrixPtr& seqValue,
-                        MatrixPtr& batchValue,
-                        const ICpuGpuVectorPtr& seqStartPositions,
-                        bool normByTimes);
-
-protected:
-  size_t numClasses_;
-  size_t blank_;
-  size_t maxSequenceLength_;
-  bool normByTimes_;
-
-  MatrixPtr batchValue_;
-  MatrixPtr batchGrad_;
-  VectorPtr workspace_;
-
-  IVectorPtr cpuLabels_;
-  MatrixPtr cpuCosts_;
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
deleted file mode 100644
index b578a906c2027a1169a0098b93f8d0742920f99d..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/CMakeLists.txt
+++ /dev/null
@@ -1,97 +0,0 @@
-# gserver pacakge unittests
-add_simple_unittest(test_LinearChainCRF)
-add_simple_unittest(test_RecurrentLayer)
-
-if(NOT MOBILE_INFERENCE)
-  add_simple_unittest(test_MultinomialSampler)
-endif()
-
-function(gserver_test TARGET)
-  add_unittest_without_exec(${TARGET}
-      ${TARGET}.cpp
-      LayerGradUtil.cpp)
-  add_test(NAME ${TARGET}
-      COMMAND ${TARGET})
-endfunction()
-
-gserver_test(test_LayerGrad)
-gserver_test(test_CRFLayerGrad)
-gserver_test(test_CrossEntropyOverBeamGrad)
-gserver_test(test_SeqSliceLayerGrad)
-gserver_test(test_ActivationGrad)
-gserver_test(test_ConvTrans)
-gserver_test(test_PriorBox)
-gserver_test(test_DetectionOutput)
-gserver_test(test_ConvUnify)
-gserver_test(test_BatchNorm)
-gserver_test(test_KmaxSeqScore)
-gserver_test(test_Expand)
-gserver_test(test_MaxPoolingWithMaskOutput)
-
-set(PYTHON_PATH 
-   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
-   ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
-function(gserver_test_with_python TARGET)
-  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
-  add_test(NAME ${TARGET}
-    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-endfunction()
-
-gserver_test_with_python(test_PyDataProvider2)
-if(WITH_PYTHON)
-    gserver_test_with_python(test_PyDataProvider)
-endif()
-if(NOT MOBILE_INFERENCE)
-    gserver_test_with_python(test_CompareTwoNets)
-    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
-    gserver_test_with_python(test_RecurrentGradientMachine)
-endif()
-
-########## test_MKLDNN layers and activations ##########
-if(WITH_MKLDNN)
-    add_unittest_without_exec(test_MKLDNN
-        test_MKLDNN.cpp
-        MKLDNNTester.cpp
-        LayerGradUtil.cpp)
-    add_test(NAME test_MKLDNN
-        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-endif()
-
-############### test_WarpCTCLayer #######################
-if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
-    add_unittest_without_exec(test_WarpCTCLayer
-        test_WarpCTCLayer.cpp)
-    add_test(NAME test_WarpCTCLayer
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-endif()
-
-if(NOT MOBILE_INFERENCE)
-    ################## test_Evaluator #############
-    add_unittest(test_Evaluator
-        test_Evaluator.cpp)
-      
-    ########### test_NetworkCompare ###############
-    add_unittest_without_exec(test_NetworkCompare
-        test_NetworkCompare.cpp)
-    if(WITH_GPU)
-        set(use_gpu true)
-    else()
-        set(use_gpu false)
-    endif()
-    add_test(NAME test_NetworkCompare
-        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-
-    ############ test_CompareSparse ################
-    add_unittest_without_exec(test_CompareSparse
-        test_CompareSparse.cpp)
-    if(NOT ON_TRAVIS)
-      add_test(NAME test_CompareSparse
-        COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
-                ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-    endif()
-endif()
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
deleted file mode 100644
index 1999b2204b1728bd60b1e107dfe7b10718e752a5..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ /dev/null
@@ -1,329 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-
-#include "paddle/testing/TestUtil.h"
-using namespace std;  // NOLINT
-
-namespace paddle {
-enum InputType {
-  INPUT_DATA,         // dense vector
-  INPUT_LABEL,        // id
-  INPUT_DATA_TARGET,  // dense vector, but no gradient
-  INPUT_SEQUENCE_DATA,
-  INPUT_HASSUB_SEQUENCE_DATA,  // sequence has sub-sequence
-  INPUT_SEQUENCE_MDIM_DATA,
-  INPUT_SEQUENCE_LABEL,
-  INPUT_SPARSE_NON_VALUE_DATA,
-  INPUT_SPARSE_FLOAT_VALUE_DATA,
-  INPUT_DENSE_DIM_DATA,    // using sequence length to init dense data
-  INPUT_SELF_DEFINE_DATA,  // support customizing for input value
-};
-
-struct ParaSparse {
-  bool sparse;
-  string format;
-  // if equalNnzPerSample is set true,
-  // every row of the sparse matrix in a format of CSR has a same
-  // number of nnz values. Currently, this flag is only used for
-  // selective_fc layer
-  bool equalNnzPerSample;
-  ParaSparse(const string& formatIn = "") {  // NOLINT
-    if (formatIn == "") {
-      sparse = false;
-    } else {
-      sparse = true;
-    }
-    equalNnzPerSample = false;
-  }
-  ParaSparse(const string& formatIn, bool equalNnz) {
-    format = formatIn;
-    sparse = true;
-    equalNnzPerSample = equalNnz;
-  }
-};
-
-struct InputDef {
-  InputType inputType;
-  string name;
-  size_t dim;
-  size_t paraSize;
-  ParaSparse sparse;
-  bool isStatic;
-  std::vector<int> labelInitValue;
-  std::vector<int> labelSeqStartPositions;
-  std::vector<int> labelSubSeqStartPositions;
-  std::vector<int> ids;
-  MatrixPtr selfDefinedData;
-
-  InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
-    inputType = type;
-    name = nameIn;
-    dim = dimIn;
-    paraSize = sizeIn;
-    sparse = {""};
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           MatrixPtr selfDefinedData,
-           std::vector<int> selfDefinedSeqStartPos = {},
-           std::vector<int> selfDefinedSubSeqStartPos = {})
-      : labelSeqStartPositions(selfDefinedSeqStartPos),
-        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
-        selfDefinedData(selfDefinedData) {
-    inputType = type;
-    name = nameIn;
-    dim = 0;
-    sparse = {""};
-    paraSize = 0;
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           const std::vector<int>& ids,
-           const std::vector<int>& selfDefinedSeqStartPos = {},
-           const std::vector<int>& selfDefinedSubSeqStartPos = {})
-      : labelSeqStartPositions(selfDefinedSeqStartPos),
-        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
-        ids(ids) {
-    selfDefinedData = nullptr;
-    inputType = type;
-    name = nameIn;
-    dim = 0;
-    sparse = {""};
-    paraSize = 0;
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           size_t dimIn,
-           size_t sizeIn,
-           const std::vector<int>& labelInitValue,
-           const std::vector<int>& labelSeqStartPositions)
-      : labelInitValue(labelInitValue),
-        labelSeqStartPositions(labelSeqStartPositions) {
-    inputType = type;
-    name = nameIn;
-    dim = dimIn;
-    paraSize = sizeIn;
-    sparse = {""};
-    isStatic = false;
-  }
-
-  InputDef(InputType type,
-           string nameIn,
-           size_t dimIn,
-           size_t sizeIn,
-           ParaSparse sparseIn) {
-    inputType = type;
-    name = nameIn;
-    dim = dimIn;
-    paraSize = sizeIn;
-    sparse = sparseIn;
-  }
-};
-
-struct TestConfig {
-  LayerConfig layerConfig;
-  std::vector<InputDef> inputDefs;
-  size_t biasSize;
-  real paramInitialMean;
-  real paramInitialStd;
-  bool testAccumulate;
-  bool testState;
-  bool staticBias;
-  bool testBatchState;
-  TestConfig()
-      : biasSize(0),
-        paramInitialMean(0.0),
-        paramInitialStd(1.0),
-        testAccumulate(true),
-        testState(false),
-        staticBias(false),
-        testBatchState(false) {}
-};
-
-real getCostSum(ParameterPtr& parameter,
-                CpuVector& cpuPara,
-                LayerPtr& testLayer,
-                MatrixPtr weights = nullptr);
-
-real getDiffAndPrint(real newCost1,
-                     real newCost2,
-                     real callbackCount,
-                     char fill,
-                     string testLayerName,
-                     string name,
-                     real step,
-                     real delta);
-
-/**
- * @brief verify that sequentially running forward() one timestamp at one time
- *        has same result as running forward() with one whole sequence
- *
- * @param testLayer[in/out]    testLayer
- * @param dataLayers[in/out]   dataLayers
- * @param datas[in/out]        data of dataLayers
- */
-void testState(LayerPtr testLayer,
-               vector<DataLayerPtr>& dataLayers,
-               vector<Argument>& datas);
-
-/**
- * @brief verify that sequentially running forward() with short sequences one
- *        time has same result as running forward() with long sequences.
- *
- * @param testLayer[in/out]    testLayer
- * @param dataLayers[in/out]   dataLayers
- * @param datas[in/out]        data of dataLayers
- */
-void testBatchState(LayerPtr testLayer,
-                    vector<DataLayerPtr>& dataLayers,
-                    vector<Argument>& datas);
-
-/**
- * @brief Generate a perturbation so that it is roughly aligned with the
- *        gradient direction. This is to make sure that change along this
- *        direction will make cost increase (or decrease) in a meaningful
- *        way so that the finite difference can be used to approximate the
- *        directional dirivative well.
- *
- * @param oldGrad[in]  input gradient
- *        newGrad[out] output gradient
- *        dim          dimension of oldGrad/newGrad
- *
- * @return sum_i(oldGrad[i] * newGrad[i])
- */
-double genPerturbation(const real* oldGrad, real* newGrad, size_t dim);
-
-void initWeight(MatrixPtr& weights);
-
-void initBatchState(LayerPtr dataLayer,
-                    LayerPtr testLayer,
-                    LayerStatePtr state,
-                    bool useGpu);
-
-/**
- * @brief initialize the dataLayer by its inputType
- *
- * @param testConf[in]        test config
- *        dataLayers[out]     dataLayers
- *        datas[out]          initialized data of dataLayers
- *        layerMap[out]       layerMap
- */
-void initDataLayer(TestConfig testConf,
-                   std::vector<DataLayerPtr>* dataLayers,
-                   vector<Argument>* datas,
-                   LayerMap* layerMap,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu);
-
-/**
- * @brief initialize the parameter of testLayer
- *
- * @param testConf[in/out]    test config
- *        layerMap[out]       layerMap
- *        parameters[out]     parameters of testLayer
- *        testLayer[out]      testLayer
- */
-void initTestLayer(TestConfig testConf,
-                   LayerMap* layerMap,
-                   std::vector<ParameterPtr>* parameters,
-                   LayerPtr* testLayer);
-
-/**
- * @brief Test whether the layer's forward calculation is stable by adding
- *        perturbation to its parameters
- *
- * @param testConf[in]         test config
- *        weights[in]          weights of testLayer
- *        state[in]            state of testLayer
- *        cost[in]             input cost
- *        callbackCount[in]    number of done callback
- *        maxDiff[in/out]      max of all previous diff
- *        testLayer[in/out]    testLayer
- *        parameters[in/out]   parameters of testLayer
- */
-void testPerturbParameter(TestConfig testConf,
-                          const MatrixPtr weights,
-                          const LayerStatePtr state,
-                          real cost,
-                          real callbackCount,
-                          real* maxDiff,
-                          LayerPtr testLayer,
-                          std::vector<ParameterPtr>* parameters);
-
-/**
- * @brief Test whether the layer's forward calculation is stable by adding
- *        perturbation to its input layers
- *
- * @param testConf[in]         test config
- *        weights[in]          weights of testLayer
- *        state[in]            state of testLayer
- *        cost[in]             input cost
- *        callbackCount[in]    number of done callback
- *        maxDiff[in/out]      max of all previous diff
- *        testLayer[in/out]    testLayer
- *        dataLayers[in/out]   dataLayers
- */
-void testPerturbInput(TestConfig testConf,
-                      const MatrixPtr weights,
-                      const LayerStatePtr state,
-                      real cost,
-                      real callbackCount,
-                      real* maxDiff,
-                      LayerPtr testLayer,
-                      std::vector<DataLayerPtr> dataLayers);
-
-void testLayerGradKernel(TestConfig testConf,
-                         string testLayerName,
-                         size_t batchSize,
-                         bool trans,
-                         bool useGpu,
-                         bool useWeight = false,
-                         float epsilon = 0.02);
-
-void testLayerGrad(TestConfig testConf,
-                   string testLayerName,
-                   size_t batchSize,
-                   bool trans,
-                   bool useGpu,
-                   bool useWeight = false,
-                   float epsilon = 0.02);
-
-void testProjectionGrad(ProjectionConfig conf,
-                        InputType inputType,
-                        size_t parameterSize,
-                        size_t batchSize,
-                        bool useGpu,
-                        bool testState = false,
-                        int biasSize = 0,
-                        bool sharedBias = false);
-
-void testOperatorGrad(TestConfig& config,
-                      OperatorConfig& operatorConf,
-                      size_t batchSize,
-                      bool useGpu,
-                      bool testState = false);
-
-}  //  namespace paddle
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
deleted file mode 100644
index d2a9761a4e16832a0722d4375cc11adb42524a8c..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ /dev/null
@@ -1,580 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MKLDNNTester.h"
-#include "paddle/gserver/layers/MKLDNNBase.h"
-#include "paddle/gserver/layers/MKLDNNLayer.h"
-#include "paddle/trainer/Trainer.h"
-
-namespace paddle {
-
-// init data layer and test layer of both dnn and reference
-void MKLDNNTester::reset(const TestConfig& dnn,
-                         const TestConfig& ref,
-                         size_t batchSize) {
-  const bool trans = false;
-  const bool useGpu = false;
-
-  // clear
-  configs_.clear();
-  layerNames_.clear();
-  dataLayers_.clear();
-  datas_.clear();
-  layerMaps_.clear();
-  parameters_.clear();
-  testLayers_.clear();
-
-  // resize
-  configs_.resize(NUM);
-  layerNames_.resize(NUM);
-  dataLayers_.resize(NUM);
-  datas_.resize(NUM);
-  layerMaps_.resize(NUM);
-  parameters_.resize(NUM);
-  testLayers_.resize(NUM);
-
-  // reset configs and layer names
-  configs_[DNN] = dnn;
-  configs_[REF] = ref;
-  layerNames_[DNN] = "mkldnn";     // the first is mkldnn layer
-  layerNames_[REF] = "reference";  // second is reference layer
-
-  // reset others
-  for (size_t i = 0; i < NUM; ++i) {
-    configs_[i].layerConfig.set_name(layerNames_[i]);
-    initDataLayer(configs_[i],
-                  &(dataLayers_[i]),
-                  &(datas_[i]),
-                  &(layerMaps_[i]),
-                  layerNames_[i],
-                  batchSize,
-                  trans,
-                  useGpu);
-    initTestLayer(
-        configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
-  }
-  refLayer_ = testLayers_[REF];
-  dnnLayer_ = testLayers_[DNN];
-  EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
-  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  setInputImgSize();
-
-  // for comparison with Paddle reference results,
-  // need manually add cpu device output for test
-  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
-  if (dnnLayer) {
-    dnnLayer->addOutputArgument(CPU_DEVICE);
-  }
-}
-
-void MKLDNNTester::setInputImgSize() {
-  for (size_t n = 0; n < dataLayers_.size(); ++n) {
-    for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
-      // TODO(TJ): fix me when concat and elewise ready
-      dataLayers_[n][i]->getOutput().setFrameHeight(ih_);
-      dataLayers_[n][i]->getOutput().setFrameWidth(iw_);
-    }
-  }
-}
-
-// init randome parameters of ref, and copy to mkldnn
-void MKLDNNTester::randomWgtDatas() {
-  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  const bool isBN = refLayer_->getType() == "batch_norm";
-  for (size_t i = 0; i < parameters_[REF].size(); ++i) {
-    const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
-    const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    parameters_[REF][i]->randomize();
-    if (isBN && i == 2) {
-      // this param is moving average in batch norm, which must larger than 0
-      real offset = fabs(refValue->getMin()) + 1.0;
-      refValue->add(offset);
-    }
-    dnnValue->copyFrom(*refValue);
-
-    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
-    printVector(dnnValue);
-  }
-}
-
-// random botdata of ref layer and copy same to mkldnn
-void MKLDNNTester::randomBotDatas() {
-  CHECK_EQ(dataLayers_.size(), NUM);
-  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
-    dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
-    dataLayers_[DNN][i]->getOutputValue()->copyFrom(
-        *(dataLayers_[REF][i]->getOutputValue()));
-    VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i;
-    printMatrix(dataLayers_[REF][i]->getOutputValue());
-  }
-}
-
-void MKLDNNTester::randomTopDiffs() {
-  refLayer_->getOutputGrad()->randomizeUniform();
-  dnnLayer_->getOutput(CPU_DEVICE)
-      .grad->copyFrom(*(refLayer_->getOutputGrad()));
-  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
-  printMatrix(refLayer_->getOutputGrad());
-}
-
-void MKLDNNTester::checkForward() {
-  VLOG(MKLDNN_TESTS) << "Check Forward";
-  printTopDatas();
-  double delta =
-      compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
-  EXPECT_LE(fabs(delta), eps_);
-}
-
-void MKLDNNTester::checkBackwardData() {
-  VLOG(MKLDNN_TESTS) << "Check Backward Data";
-  const bool isBN = refLayer_->getType() == "batch_norm";
-  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
-    const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
-    const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
-    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
-    printMatrix(dnnDiff);
-    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
-    printMatrix(refDiff);
-
-    double delta = compareMatrix(refDiff, dnnDiff);
-    EXPECT_LE(fabs(delta), eps_);
-    if (isBN) {
-      // the other two inputs in batch norm are for moving mean and var
-      // do not have grad to compare
-      break;
-    }
-  }
-}
-
-void MKLDNNTester::checkBackwardWgts() {
-  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
-  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
-  saveWgt(parameters_[DNN], dnnWgts);
-
-  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
-  if (dnnLayer) {
-    dnnLayer->convertWeightsToPaddle();
-  }
-  for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
-    const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
-    const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value"
-                     << parameters_[DNN][i]->getName();
-    printVector(dnn);
-    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
-                     << parameters_[REF][i]->getName();
-    printVector(ref);
-
-    double delta = compareVector(ref, dnn);
-    EXPECT_LE(fabs(delta), eps_);
-  }
-
-  VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre";
-  restoreWgt(dnnWgts, parameters_[DNN]);
-}
-
-void MKLDNNTester::saveWgt(const vector<ParameterPtr>& from,
-                           vector<VectorPtr>& to) {
-  const bool useGpu = false;
-  to.resize(from.size());
-  for (size_t i = 0; i < to.size(); ++i) {
-    const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE);
-    to[i] = Vector::create(wgt->getSize(), useGpu);
-    to[i]->copyFrom(*wgt);
-  }
-}
-
-void MKLDNNTester::restoreWgt(const vector<VectorPtr>& from,
-                              vector<ParameterPtr>& to) {
-  CHECK_EQ(from.size(), to.size());
-  for (size_t i = 0; i < from.size(); ++i) {
-    const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE);
-    wgt->copyFrom(*from[i]);
-  }
-}
-
-// clear parameters grad
-void MKLDNNTester::clearWgtDiffs(size_t id) {
-  CHECK_LE(id, parameters_.size());
-  for (size_t n = 0; n < parameters_.size(); ++n) {
-    if (id == n || id == parameters_.size()) {
-      for (size_t i = 0; i < parameters_[n].size(); ++i) {
-        const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT);
-        if (grad) {
-          grad->zeroMem();
-        }
-      }
-    }
-  }
-}
-
-void MKLDNNTester::clearBotDiffs(size_t id) {
-  CHECK_LE(id, dataLayers_.size());
-  for (size_t n = 0; n < dataLayers_.size(); ++n) {
-    if (id == n || id == dataLayers_.size()) {
-      // clear inputs layers of this specific layer
-      for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
-        dataLayers_[n][i]->getOutputGrad()->zeroMem();
-      }
-    }
-  }
-}
-
-void MKLDNNTester::clearTopDatas(size_t id) {
-  CHECK_LE(id, testLayers_.size());
-  for (size_t i = 0; i < testLayers_.size(); ++i) {
-    if (id == i || id == testLayers_.size()) {
-      testLayers_[i]->getOutputValue()->zeroMem();
-    }
-  }
-}
-
-void MKLDNNTester::printTopDatas() {
-  if (!log_) {
-    return;
-  }
-
-  for (int n = 0; n < NUM; ++n) {
-    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
-                     << " Forward Result: OutputValue";
-    printMatrix(testLayers_[n]->getOutputValue());
-  }
-}
-
-void MKLDNNTester::printMatrix(const MatrixPtr& m) {
-  if (!log_) {
-    return;
-  }
-
-  std::ostringstream ostr;
-  m->print(ostr);
-  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
-}
-
-void MKLDNNTester::printVector(const VectorPtr& v) {
-  if (!log_) {
-    return;
-  }
-
-  std::ostringstream ostr;
-  v->print(ostr, v->getSize());
-  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
-}
-
-double MKLDNNTester::getDelta(const real* refer,
-                              const real* value,
-                              size_t len,
-                              const float failRate,
-                              const float thres) {
-  double delta = 0, sum = 0;
-  int failCnt = 0;
-  const double eps = 1e-5;
-  double maxRatio = 0;
-  for (size_t i = 0; i < len; ++i) {
-    double ref = fabs(refer[i]);
-    double val = fabs(value[i]);
-    double diff = fabs(refer[i] - value[i]);
-    delta += diff;
-    sum += ref;
-    if (ref < eps && val < eps) {  // both values are very small
-      continue;
-    }
-    double ratio = diff / ref;
-    if (ratio > thres) {
-      maxRatio = std::max(maxRatio, ratio);
-      failCnt++;
-    }
-  }
-  EXPECT_FALSE(std::isinf(sum));
-  EXPECT_FALSE(std::isnan(sum));
-  EXPECT_FALSE(std::isnan(delta));
-  VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len
-                   << ", delta: " << delta / sum << ", failCnt:" << failCnt;
-  double res = sum > eps ? delta / sum : eps;
-  return (failCnt / (float)len) > failRate ? maxRatio : res;
-}
-
-double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
-  CHECK_EQ(m1->getElementCnt(), m2->getElementCnt());
-  return getDelta(m1->getData(), m2->getData(), m1->getElementCnt());
-}
-
-double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
-  CHECK_EQ(v1->getSize(), v2->getSize());
-  return getDelta(v1->getData(), v2->getData(), v1->getSize());
-}
-
-void MKLDNNTester::runOnce() {
-  // test forward
-  randomBotDatas();
-  dnnLayer_->forward(passType_);
-  refLayer_->forward(passType_);
-  checkForward();
-
-  if (passType_ == PASS_TEST) {
-    return;
-  }
-
-  // test backward
-  // simple updater
-  UpdateCallback updateCallback = [](Parameter* para) {
-    auto& grad = para->getBuf(PARAMETER_GRADIENT);
-    auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-2;
-    value->add(*grad, lr);
-    grad->zeroMem();
-  };
-  randomTopDiffs();
-  dnnLayer_->backward(updateCallback);
-  refLayer_->backward(updateCallback);
-  checkBackwardData();
-  checkBackwardWgts();
-
-  // clear buffers
-  // ref code will addto the diff, dnn code will writeto it
-  // and clearTopDatas(REF) should be coverd by ref layers
-  clearBotDiffs(REF);
-  clearWgtDiffs(REF);
-  // it is necessary to clear bottom diffs when only activation is dnn type
-  if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) {
-    clearBotDiffs(DNN);
-  }
-}
-
-void MKLDNNTester::run(const TestConfig& dnn,
-                       const TestConfig& ref,
-                       size_t batchSize,
-                       size_t inputImgH,
-                       size_t inputImgW,
-                       PassType passType,
-                       bool printDetails,
-                       size_t iter,
-                       float epsilon) {
-  CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
-        dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
-      << "should be MKLDNN layer or MKLDNN activation";
-  if (dnn.layerConfig.type() == ref.layerConfig.type()) {
-    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
-                       << dnn.layerConfig.active_type() << " vs "
-                       << ref.layerConfig.active_type();
-  } else {
-    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
-                       << dnn.layerConfig.type() << " vs "
-                       << ref.layerConfig.type();
-  }
-
-  ih_ = inputImgH;
-  iw_ = inputImgW;
-  passType_ = passType;
-  log_ = printDetails;
-  iter_ = iter;
-  eps_ = epsilon;
-
-  // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
-  reset(dnn, ref, batchSize);
-  randomWgtDatas();
-  clearWgtDiffs();
-  clearBotDiffs();
-  for (size_t i = 0; i < iter_; ++i) {
-    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
-    runOnce();
-  }
-
-  if (parameters_[DNN].empty()) {
-    // has no paramters
-    return;
-  }
-
-  // After run some iterations, the mkldnn weight has been stored in dnnLayer
-  // and we can also get the mkldnn weight parameter header format.
-  // Weight parameter should always be index 0 (and bias index 1).
-  // TODO(TJ): should also consider mean and var format when batchnorm ready
-  int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat();
-  int refWgtFmt = parameters_[REF][0]->getHeaderFormat();
-  if (dnnWgtFmt == refWgtFmt) {
-    // weight format are equal, so no need check more
-    return;
-  }
-
-  // then save the weights and restart again
-  vector<VectorPtr> dnnWgts, refWgts;
-  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
-  saveWgt(parameters_[DNN], dnnWgts);
-  saveWgt(parameters_[REF], refWgts);
-
-  // restart again with dnn weight format
-  reset(dnn, ref, batchSize);
-  // TODO(TJ): should also considerate mean and var format when batchnorm ready
-  parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt);
-
-  // restore wgt
-  restoreWgt(dnnWgts, parameters_[DNN]);
-  restoreWgt(refWgts, parameters_[REF]);
-  clearWgtDiffs();
-  clearBotDiffs();
-
-  for (size_t i = 0; i < iter_; ++i) {
-    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
-    runOnce();
-  }
-}
-
-void MKLDNNTester::initArgument(DataIn& data,
-                                const std::string& configPath,
-                                const size_t iter) {
-  TrainerConfigHelper config(configPath);
-  size_t batchSize = config.getOptConfig().batch_size();
-  data.inArgs.resize(iter);
-  data.outGrads.resize(iter);
-  data.paraValues.clear();
-  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    for (size_t i = 0; i < iter; ++i) {
-      Argument arg;
-      arg.value = Matrix::create(batchSize, layerSize, false, false);
-      arg.grad = Matrix::create(batchSize, layerSize, false, false);
-      arg.value->randomizeUniform();
-      arg.value->add(-0.5);
-      arg.value->sigmoid(*arg.value);
-      arg.grad->zeroMem();
-      arg.ids = VectorT<int>::create(batchSize, false);
-      arg.ids->rand(layerSize);
-      generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
-      data.inArgs[i].push_back(arg);
-    }
-  }
-
-  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    for (size_t i = 0; i < iter; ++i) {
-      MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false);
-      grad->randomizeUniform();
-      data.outGrads[i].push_back(grad);
-    }
-  }
-
-  for (const auto& para_config : config.getModelConfig().parameters()) {
-    VectorPtr value = Vector::create(para_config.size(), false);
-    value->randnorm(0, 2);
-    data.paraValues.push_back(value);
-  }
-}
-
-void MKLDNNTester::getOutResult(const std::string& configPath,
-                                DataIn& in,
-                                DataOut& out,
-                                bool use_mkldnn,
-                                size_t iter) {
-  FLAGS_use_gpu = false;
-  FLAGS_use_mkldnn = use_mkldnn;
-  *ThreadLocalRand::getSeed() = 1;
-  srand(1);
-
-  Trainer trainer;
-  auto config = std::make_shared<TrainerConfigHelper>(configPath);
-  trainer.init(config, false);
-  auto gradientMachine = trainer.getGradientMachine();
-  std::vector<ParameterPtr> parameters = gradientMachine->getParameters();
-  for (size_t i = 0; i < in.paraValues.size(); i++) {
-    parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
-  }
-  UpdateCallback simpleUpdate = [](Parameter* para) {
-    auto& grad = para->getBuf(PARAMETER_GRADIENT);
-    auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-2;
-    value->add(*grad, lr);
-    grad->zeroMem();
-  };
-
-  vector<Argument> outArgs;
-  gradientMachine->start();
-  out.outValues.clear();
-  out.paraValues.clear();
-  for (size_t i = 0; i < iter; ++i) {
-    VLOG(MKLDNN_TESTS) << "runing iteration " << i;
-    gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
-    // save forward result
-    for (size_t k = 0; k < outArgs.size(); k++) {
-      const MatrixPtr& src = outArgs[k].value;
-      MatrixPtr dst =
-          Matrix::create(src->getHeight(), src->getWidth(), false, false);
-      if (typeid(*src) == typeid(MKLDNNMatrix)) {
-        MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast<MKLDNNMatrix>(src);
-        dnnSrc->copyTo(*dst);
-      } else {
-        dst->copyFrom(*src);
-      }
-      out.outValues.push_back(dst);
-    }
-
-    // random backward input
-    for (size_t k = 0; k < outArgs.size(); k++) {
-      outArgs[k].grad->copyFrom(*in.outGrads[i][k]);
-    }
-    gradientMachine->backward(simpleUpdate);
-  }
-  gradientMachine->finish();
-
-  // save param value
-  for (size_t i = 0; i < in.paraValues.size(); i++) {
-    VectorPtr val = Vector::create(
-        parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false);
-    val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
-    out.paraValues.push_back(val);
-  }
-}
-
-void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
-  CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
-  CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
-  for (size_t i = 0; i < ref.outValues.size(); i++) {
-    VLOG(MKLDNN_TESTS) << "compare value index: " << i;
-    EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
-  }
-  for (size_t i = 0; i < ref.paraValues.size(); i++) {
-    VLOG(MKLDNN_TESTS) << "compare param index: " << i;
-    EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
-  }
-}
-
-void MKLDNNTester::runNetTest(const std::string& configPath,
-                              size_t iter,
-                              float eps) {
-  DataIn in;
-  initArgument(in, configPath, iter);
-  DataOut outCpu, outDnn;
-  VLOG(MKLDNN_TESTS) << "runing cpu network";
-  getOutResult(configPath, in, outCpu, false, iter);
-  VLOG(MKLDNN_TESTS) << "runing mkldnn network";
-  getOutResult(configPath, in, outDnn, true, iter);
-
-  compareResult(outCpu, outDnn, eps);
-}
-
-}  //  namespace paddle
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
deleted file mode 100644
index c1faa6fd90e06d8c742e97c9ce51eeba3c24a550..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "LayerGradUtil.h"
-#include "paddle/gserver/layers/MKLDNNBase.h"
-#include "paddle/gserver/layers/MKLDNNLayer.h"
-
-namespace paddle {
-
-/**
- * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
- * refer to paddle original function
- */
-class MKLDNNTester {
-  enum {
-    DNN = 0,  // MKLDNN layer
-    REF = 1,  // Reference layer
-    NUM = 2,  // Number of total
-  };
-
-  struct DataIn {
-    std::vector<std::vector<Argument>> inArgs;
-    std::vector<std::vector<MatrixPtr>> outGrads;
-    std::vector<VectorPtr> paraValues;
-  };
-
-  struct DataOut {
-    std::vector<MatrixPtr> outValues;
-    std::vector<VectorPtr> paraValues;
-  };
-
-protected:
-  std::vector<TestConfig> configs_;
-  vector<string> layerNames_;
-  vector<vector<DataLayerPtr>> dataLayers_;
-  vector<vector<Argument>> datas_;
-  vector<LayerMap> layerMaps_;
-  vector<vector<ParameterPtr>> parameters_;
-  vector<LayerPtr> testLayers_;
-  LayerPtr refLayer_, dnnLayer_;
-
-  /// run some iterations, all the result should pass
-  size_t iter_;
-  /// whether to print out the details
-  bool log_;
-  /// epsilon
-  float eps_;
-  /// input image size, default 1
-  size_t ih_, iw_;
-  /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass)
-  PassType passType_;
-
-public:
-  explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
-    iter_ = iter;
-    eps_ = epsilon;
-    log_ = false;
-    passType_ = PASS_TRAIN;
-  }
-
-  ~MKLDNNTester() {}
-
-public:
-  void run(const TestConfig& dnn,
-           const TestConfig& ref,
-           size_t batchSize,
-           size_t inputImgH = 1,
-           size_t inputImgW = 1,
-           PassType passType = PASS_TRAIN,
-           bool printDetails = false,
-           size_t iter = 3,
-           float epsilon = 1e-4);
-  static void runNetTest(const std::string& configPath,
-                         size_t iter = 2,
-                         float eps = 1e-4);
-  static void initArgument(DataIn& data,
-                           const std::string& configPath,
-                           size_t iter = 2);
-  static void getOutResult(const std::string& configPath,
-                           DataIn& in,
-                           DataOut& out,
-                           bool use_mkldnn,
-                           size_t iter = 2);
-
-private:
-  void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
-  void setInputImgSize();
-  void runOnce();
-
-  void randomWgtDatas();
-  void randomBotDatas();
-  void randomTopDiffs();
-
-  void checkForward();
-  void checkBackwardData();
-  void checkBackwardWgts();
-
-  // clear specific layer, clear all when id equals NUM
-  void clearWgtDiffs(size_t id = NUM);
-  void clearBotDiffs(size_t id = NUM);
-  void clearTopDatas(size_t id = NUM);
-
-  void printTopDatas();
-  void printMatrix(const MatrixPtr& m);
-  void printVector(const VectorPtr& v);
-
-  void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
-  void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
-
-  static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
-  static double compareVector(const VectorPtr& v1, const VectorPtr& v2);
-  static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4);
-
-  /**
-   * Get delta percent
-   * if many(>failRate) wrong(abs(val-ref)/abs(ref) > thres) points
-   * return the max(diff/ref)
-   * else return sum(abs(diff)) / sum(abs(ref))
-   * The return value should be smaller than eps when passing.
-   */
-  static double getDelta(const real* refer,
-                         const real* value,
-                         size_t len,
-                         const float failRate = 1e-3,
-                         const float thres = 0.1);
-};
-
-}  //  namespace paddle
diff --git a/paddle/gserver/tests/Sequence/train.list b/paddle/gserver/tests/Sequence/train.list
deleted file mode 100644
index be27acb3a5411d8fe65797079a9a5977c1f0f90a..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/Sequence/train.list
+++ /dev/null
@@ -1 +0,0 @@
-gserver/tests/Sequence/tour_train_wdseg
diff --git a/paddle/gserver/tests/Sequence/train.list.nest b/paddle/gserver/tests/Sequence/train.list.nest
deleted file mode 100644
index 7683ebc68efbb07ce01d8faab14574109df99af9..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/Sequence/train.list.nest
+++ /dev/null
@@ -1 +0,0 @@
-gserver/tests/Sequence/tour_train_wdseg.nest
diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf
deleted file mode 100644
index 50f2d89d0271b2eaa460e57636eb09b6d6aeda18..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_layer_group.conf
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 256
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# (lstm_input + lstm) is equal to lstmemory 
-with mixed_layer(size=hidden_dim * 4) as lstm_input:
-    lstm_input += full_matrix_projection(input=emb)
-
-lstm = lstmemory_group(
-    input=lstm_input,
-    size=hidden_dim,
-    act=TanhActivation(),
-    gate_act=SigmoidActivation(),
-    state_act=TanhActivation())
-
-lstm_last = last_seq(input=lstm)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf
deleted file mode 100644
index f49a827f22edce056eaf9903e99b732cab7f3784..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_lstm.conf
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 256
-label_dim = 3
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(
-    input=data,
-    size=word_dim,
-    param_attr=ParamAttr(sparse_update=sparse_update))
-
-with mixed_layer(size=hidden_dim * 4) as lstm_input:
-    lstm_input += full_matrix_projection(input=emb)
-
-lstm = lstmemory(
-    input=lstm_input,
-    act=TanhActivation(),
-    gate_act=SigmoidActivation(),
-    state_act=TanhActivation())
-
-lstm_last = last_seq(input=lstm)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf
deleted file mode 100644
index 71ef53d08a2cea070806afb2c65ef15c4dd28f31..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_nest_layer_group.conf
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='gserver/tests/Sequence/train.list.nest',
-    test_list=None,
-    module='sequenceGen',
-    obj='process2',
-    args={"dict_file": dict_file})
-
-settings(batch_size=2)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 256
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb_group = embedding_layer(input=data, size=word_dim)
-
-
-# (lstm_input + lstm) is equal to lstmemory 
-def lstm_group(lstm_group_input):
-    with mixed_layer(size=hidden_dim * 4) as group_input:
-        group_input += full_matrix_projection(input=lstm_group_input)
-
-    lstm_output = lstmemory_group(
-        input=group_input,
-        name="lstm_group",
-        size=hidden_dim,
-        act=TanhActivation(),
-        gate_act=SigmoidActivation(),
-        state_act=TanhActivation())
-    return lstm_output
-
-
-lstm_nest_group = recurrent_group(
-    input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group")
-# hasSubseq ->(seqlastins) seq
-lstm_last = last_seq(
-    input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE)
-
-# seq ->(expand) hasSubseq
-lstm_expand = expand_layer(
-    input=lstm_last,
-    expand_as=emb_group,
-    expand_level=ExpandLevel.FROM_SEQUENCE)
-
-# hasSubseq ->(average) seq
-lstm_average = pooling_layer(
-    input=lstm_expand,
-    pooling_type=AvgPooling(),
-    agg_level=AggregateLevel.TO_SEQUENCE)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=lstm_average)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_nest_rnn.conf b/paddle/gserver/tests/sequence_nest_rnn.conf
deleted file mode 100644
index 2873a599669b4281a53cd71e8bb56f0d18c26b5a..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_nest_rnn.conf
+++ /dev/null
@@ -1,74 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_subseq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn.conf
-
-def outer_step(x):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-    def inner_step(y):
-        inner_mem = memory(name="inner_rnn_state",
-                           size=hidden_dim,
-                           boot_layer=outer_mem)
-        out = fc_layer(input=[y, inner_mem],
-                        size=hidden_dim,
-                        act=TanhActivation(),
-                        bias_attr=True,
-                        name="inner_rnn_state")
-        return out
-
-    inner_rnn_output = recurrent_group(
-        step=inner_step,
-        name="inner",
-        input=x)
-    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
-
-    # "return last" won't work, because recurrent_group only support the input 
-    # sequence type is same as return sequence type.
-    return inner_rnn_output
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=SubsequenceInput(emb))
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
deleted file mode 100644
index afdacfffd7aecfe2f4762f04a987126381bcea34..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ /dev/null
@@ -1,76 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_subseq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn.conf
-
-def outer_step(wid, x):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-    def inner_step(y, wid):
-        z = embedding_layer(input=wid, size=word_dim)
-        inner_mem = memory(name="inner_rnn_state",
-                           size=hidden_dim,
-                           boot_layer=outer_mem)
-        out = fc_layer(input=[y, z, inner_mem],
-                        size=hidden_dim,
-                        act=TanhActivation(),
-                        bias_attr=True,
-                        name="inner_rnn_state")
-        return out
-
-    inner_rnn_output = recurrent_group(
-        step=inner_step,
-        name="inner",
-        input=[x, wid])
-    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
-
-    # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it, and will report error: In hierachical RNN, all out
-    # links should be from sequences now.
-    return inner_rnn_output
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[SubsequenceInput(data), SubsequenceInput(emb)])
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
deleted file mode 100644
index 569d3c094b6f5517dad0f1e04f98de12aaef9633..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_unequalength_subseq')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 2
-
-speaker1 = data_layer(name="word1", size=dict_dim)
-speaker2 = data_layer(name="word2", size=dict_dim)
-
-emb1 = embedding_layer(input=speaker1, size=word_dim)
-emb2 = embedding_layer(input=speaker2, size=word_dim)
-
-
-# This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
-def outer_step(x1, x2):
-    index = [0]
-
-    def inner_step(ipt):
-        index[0] += 1
-        i = index[0]
-        outer_mem = memory(name="outer_rnn_state_%d" % i, size=hidden_dim)
-
-        def inner_step_impl(y):
-            inner_mem = memory(
-                name="inner_rnn_state_" + y.name,
-                size=hidden_dim,
-                boot_layer=outer_mem)
-            out = fc_layer(
-                input=[y, inner_mem],
-                size=hidden_dim,
-                act=TanhActivation(),
-                bias_attr=True,
-                name='inner_rnn_state_' + y.name)
-            return out
-
-        encoder = recurrent_group(
-            step=inner_step_impl, name='inner_%d' % i, input=ipt)
-        last = last_seq(name="outer_rnn_state_%d" % i, input=encoder)
-        return encoder, last
-
-    encoder1, sentence_last_state1 = inner_step(ipt=x1)
-    encoder2, sentence_last_state2 = inner_step(ipt=x2)
-
-    encoder1_expand = expand_layer(
-        input=sentence_last_state1, expand_as=encoder2)
-
-    return [encoder1_expand, encoder2]
-
-
-encoder1_rep, encoder2_rep = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
-    targetInlink=emb2)
-
-encoder1_last = last_seq(input=encoder1_rep)
-encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
-context = mixed_layer(
-    input=[
-        identity_projection(encoder1_expandlast),
-        identity_projection(encoder2_rep)
-    ],
-    size=hidden_dim)
-
-rep = last_seq(input=context)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(
-    classification_cost(
-        input=prob, label=data_layer(
-            name="label", size=label_dim)))
diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py
deleted file mode 100644
index b88c09084e1bc167a177b59566e9794ac4d616c7..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_recurrent.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 128
-label_dim = 3
-
-# This config is designed to be equivalent with sequence_recurrent_group.py
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(
-    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
-
-recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
-
-recurrent_last = last_seq(input=recurrent)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=recurrent_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py
deleted file mode 100644
index 0daf746700231d302550004b1c10729e36807b8b..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_recurrent_group.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
-dict_file = dict()
-for line_count, line in enumerate(open(dict_path, "r")):
-    dict_file[line.strip()] = line_count
-
-define_py_data_sources2(
-    train_list='gserver/tests/Sequence/train.list',
-    test_list=None,
-    module='sequenceGen',
-    obj='process',
-    args={"dict_file": dict_file})
-
-settings(batch_size=5)
-######################## network configure ################################
-dict_dim = len(open(dict_path, 'r').readlines())
-word_dim = 128
-hidden_dim = 128
-label_dim = 3
-
-# This config is designed to be equivalent with sequence_recurrent.py
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(
-    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
-
-
-def step(y):
-    mem = memory(name="rnn_state", size=hidden_dim)
-    with mixed_layer(
-            name="rnn_state",
-            size=hidden_dim,
-            bias_attr=False,
-            act=SoftmaxActivation()) as out:
-        out += identity_projection(input=y)
-        out += full_matrix_projection(
-            input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
-    return out
-
-
-recurrent = recurrent_group(name="rnn", step=step, input=emb)
-
-recurrent_last = last_seq(input=recurrent)
-
-with mixed_layer(
-        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
-    output += full_matrix_projection(input=recurrent_last)
-
-outputs(
-    classification_cost(
-        input=output, label=data_layer(
-            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_rnn.conf b/paddle/gserver/tests/sequence_rnn.conf
deleted file mode 100644
index 1084edfe708c3348d40b67e270f64d8cda3cee0f..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_rnn.conf
+++ /dev/null
@@ -1,57 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_seq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-def step(y):
-    mem = memory(name="rnn_state", size=hidden_dim)
-    out = fc_layer(input=[y, mem],
-                    size=hidden_dim,
-                    act=TanhActivation(),
-                    bias_attr=True,
-                    name="rnn_state")
-    return out
-
-out = recurrent_group(
-    name="rnn",
-    step=step,
-    input=emb)
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
deleted file mode 100644
index 41a581e0ccd59588d1bcce9345056bea9d80b73d..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py
+++ /dev/null
@@ -1,84 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_mixed')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 2
-hidden_dim = 2
-label_dim = 2
-
-data1 = data_layer(name="word1", size=dict_dim)
-data2 = data_layer(name="word2", size=dict_dim)
-label = data_layer(name="label", size=label_dim)
-
-encoding = embedding_layer(input=data2, size=word_dim)
-
-subseq = embedding_layer(input=data1, size=word_dim)
-seq = embedding_layer(input=data2, size=word_dim)
-nonseq = embedding_layer(input=label, size=word_dim)
-
-
-# This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_mixed_inputs.conf
-def outer_step(subseq, seq, nonseq, encoding):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-
-    def inner_step(subseq, seq, nonseq):
-        inner_mem = memory(
-            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
-
-        out = fc_layer(
-            input=[subseq, seq, nonseq, inner_mem],
-            size=hidden_dim,
-            act=TanhActivation(),
-            bias_attr=True,
-            name='inner_rnn_state')
-        return out
-
-    decoder = recurrent_group(
-        step=inner_step, name='inner', input=[subseq, seq, nonseq])
-    last = last_seq(name="outer_rnn_state", input=decoder)
-    context = simple_attention(
-        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
-    return context
-
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[
-        subseq, expand_layer(
-            seq, expand_as=subseq,
-            expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer(
-                nonseq,
-                expand_as=subseq,
-                expand_level=ExpandLevel.FROM_NO_SEQUENCE),
-        StaticInput(encoding)
-    ])
-
-rep = last_seq(input=out)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
deleted file mode 100644
index ae89d8e2bb6f672eaf697ae4d24895b89f76544f..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
+++ /dev/null
@@ -1,78 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_mixed')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 2
-hidden_dim = 2
-label_dim = 2
-
-data1 = data_layer(name="word1", size=dict_dim)
-data2 = data_layer(name="word2", size=dict_dim)
-label = data_layer(name="label", size=label_dim)
-
-encoding = embedding_layer(input=data2, size=word_dim)
-
-
-# This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_matched_inputs.conf
-def outer_step(subseq, seq, nonseq, encoding):
-    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
-
-    def inner_step(data1, data2, label):
-        inner_mem = memory(
-            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
-
-        subseq = embedding_layer(input=data1, size=word_dim)
-        seq = embedding_layer(input=data2, size=word_dim)
-        nonseq = embedding_layer(input=label, size=word_dim)
-
-        print_layer(input=[data1, seq, label, inner_mem])
-        out = fc_layer(
-            input=[subseq, seq, nonseq, inner_mem],
-            size=hidden_dim,
-            act=TanhActivation(),
-            bias_attr=True,
-            name='inner_rnn_state')
-        return out
-
-    decoder = recurrent_group(
-        step=inner_step, name='inner',
-        input=[subseq, StaticInput(seq), nonseq])
-    last = last_seq(name="outer_rnn_state", input=decoder)
-    context = simple_attention(
-        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
-    return context
-
-
-out = recurrent_group(
-    name="outer",
-    step=outer_step,
-    input=[data1, data2, StaticInput(label), StaticInput(encoding)])
-
-rep = last_seq(input=out)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf
deleted file mode 100644
index 9fae974f3079c49ad03d6ba34e30190f325414e8..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_rnn_multi_input.conf
+++ /dev/null
@@ -1,58 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
-                        test_list=None,
-                        module='rnn_data_provider',
-                        obj='process_seq')
-
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 3
-
-data = data_layer(name="word", size=dict_dim)
-
-emb = embedding_layer(input=data, size=word_dim)
-
-def step(y, wid):
-    z = embedding_layer(input=wid, size=word_dim)
-    mem = memory(name="rnn_state", size=hidden_dim)
-    out = fc_layer(input=[y, z, mem],
-                    size=hidden_dim,
-                    act=TanhActivation(),
-                    bias_attr=True,
-                    name="rnn_state")
-    return out
-
-out = recurrent_group(
-    name="rnn",
-    step=step,
-    input=[emb, data])
-
-rep = last_seq(input=out)
-prob = fc_layer(size=label_dim,
-                input=rep,
-                act=SoftmaxActivation(),
-                bias_attr=True)
-
-outputs(classification_cost(input=prob,
-                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
deleted file mode 100644
index 6473fb3f3eddc803282911a156c489e4ba39aded..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+++ /dev/null
@@ -1,76 +0,0 @@
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-######################## data source ################################
-define_py_data_sources2(
-    train_list='gserver/tests/Sequence/dummy.list',
-    test_list=None,
-    module='rnn_data_provider',
-    obj='process_unequalength_seq')
-
-settings(batch_size=2, learning_rate=0.01)
-######################## network configure ################################
-dict_dim = 10
-word_dim = 8
-hidden_dim = 8
-label_dim = 2
-
-speaker1 = data_layer(name="word1", size=dict_dim)
-speaker2 = data_layer(name="word2", size=dict_dim)
-
-emb1 = embedding_layer(input=speaker1, size=word_dim)
-emb2 = embedding_layer(input=speaker2, size=word_dim)
-
-# This hierachical RNN is designed to be equivalent to the RNN in
-# sequence_nest_rnn_multi_unequalength_inputs.conf
-
-
-def step(x1, x2):
-    def calrnn(y):
-        mem = memory(name='rnn_state_' + y.name, size=hidden_dim)
-        out = fc_layer(
-            input=[y, mem],
-            size=hidden_dim,
-            act=TanhActivation(),
-            bias_attr=True,
-            name='rnn_state_' + y.name)
-        return out
-
-    encoder1 = calrnn(x1)
-    encoder2 = calrnn(x2)
-    return [encoder1, encoder2]
-
-
-encoder1_rep, encoder2_rep = recurrent_group(
-    name="stepout", step=step, input=[emb1, emb2])
-
-encoder1_last = last_seq(input=encoder1_rep)
-encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
-context = mixed_layer(
-    input=[
-        identity_projection(encoder1_expandlast),
-        identity_projection(encoder2_rep)
-    ],
-    size=hidden_dim)
-
-rep = last_seq(input=context)
-prob = fc_layer(
-    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
-
-outputs(
-    classification_cost(
-        input=prob, label=data_layer(
-            name="label", size=label_dim)))
diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp
deleted file mode 100644
index b5e4af26dc123be3748adb4faed5fe1656ca44b3..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-void testActivation(const string& act) {
-  LOG(INFO) << "test activation: " << act;
-  size_t size = 10;
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type(act);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  act + "_activation",
-                  100,
-                  /* trans= */ false,
-                  useGpu,
-                  /* useWeight */ true);
-  }
-}
-
-TEST(Activation, activation) {
-  auto types = ActivationFunction::getAllRegisteredTypes();
-  std::set<string> excluded{"sequence_softmax"};
-  for (auto type : types) {
-    if (excluded.count(type)) continue;
-    testActivation(type);
-  }
-}
-
-void testSequenceSoftmaxAct(bool hasSubseq) {
-  LOG(INFO) << "test activation: sequence softmax";
-
-  const size_t size = 1;
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sequence_softmax");
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       1,
-       0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "sequence_softmax",
-                  100,
-                  /* trans= */ false,
-                  useGpu,
-                  /* useWeight */ true);
-  }
-}
-
-TEST(SequenceSoftmaxActivation, activation) {
-  for (auto hasSubseq : {false, true}) {
-    LOG(INFO) << "hasSubseq = " << hasSubseq;
-    testSequenceSoftmaxAct(hasSubseq);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
deleted file mode 100644
index a3ec66c75829c5ef0ae834656ee82e40be76c892..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/cuda/include/hl_batch_norm.h"
-#include "paddle/math/tests/TensorCheck.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-// Test that the batchNormLayer can be followed by a ConvLayer
-TEST(Layer, batchNorm) {
-  FLAGS_use_gpu = false;
-  TestConfig configBN;
-  const int CHANNELS = 6272;
-  const int IMG_SIZE = 1;
-  configBN.layerConfig.set_type("batch_norm");
-  configBN.layerConfig.set_name("bn");
-  configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
-  configBN.layerConfig.set_active_type("relu");
-  configBN.biasSize = CHANNELS;
-  configBN.inputDefs.push_back({INPUT_DATA,
-                                "layer_0",
-                                /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
-                                /* paraSize= */ CHANNELS});
-
-  configBN.inputDefs.push_back(
-      {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  configBN.inputDefs.back().isStatic = true;
-  configBN.inputDefs.push_back(
-      {INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  configBN.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = configBN.layerConfig.add_inputs();
-  configBN.layerConfig.add_inputs();
-  configBN.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-
-  // Setting up conv-layer config
-  TestConfig config;
-  config.biasSize = 64;
-  config.layerConfig.set_type("exconv");
-  config.layerConfig.set_num_filters(64);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800});
-  input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(5);
-  conv->set_filter_size_y(5);
-  conv->set_channels(128);
-  conv->set_padding(1);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(7);
-  conv->set_output_x(3);
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
-                              config.layerConfig.num_filters());
-  config.layerConfig.set_name("conv");
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(configBN,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                "batch_norm",
-                100,
-                false,
-                false);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr bnLayer;
-  initTestLayer(configBN, &layerMap, &parameters, &bnLayer);
-
-  std::vector<ParameterPtr> parameters2;
-  LayerPtr convLayer;
-  initTestLayer(config, &layerMap, &parameters2, &convLayer);
-
-  bnLayer->forward(PASS_GC);
-  convLayer->forward(PASS_GC);
-
-  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getHeight()), 100);
-  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
-}
-
-#ifdef PADDLE_WITH_CUDA
-void batchNormInference(int n, int c, int h, int w) {
-  MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
-  MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
-  MatrixPtr cudaOut = std::make_shared<GpuMatrix>(n, c * h * w);
-  MatrixPtr cudnnCheck = std::make_shared<CpuMatrix>(n, c * h * w);
-  MatrixPtr cudaCheck = std::make_shared<CpuMatrix>(n, c * h * w);
-  input->randomizeUniform();
-  cudnnOut->zeroMem();
-  cudaOut->zeroMem();
-
-  MatrixPtr scale = std::make_shared<GpuMatrix>(1, c);
-  scale->randomizeUniform();
-  MatrixPtr bias = std::make_shared<GpuMatrix>(1, c);
-  bias->randomizeUniform();
-
-  MatrixPtr movingMean = std::make_shared<GpuMatrix>(1, c);
-  movingMean->randomizeUniform();
-
-  MatrixPtr movingVar = std::make_shared<GpuMatrix>(1, c);
-  movingVar->randomizeUniform();
-  movingVar->clip(0.01, 50);
-
-  hl_tensor_descriptor ioDesc;
-  hl_tensor_descriptor bnDesc;
-  hl_create_tensor_descriptor(&ioDesc);
-  hl_create_tensor_descriptor(&bnDesc);
-  hl_tensor_reshape(ioDesc, n, c, h, w);
-  hl_tensor_reshape(bnDesc, 1, c, 1, 1);
-
-  double EPS = 1E-5;
-  hl_batch_norm_forward_inference(ioDesc,
-                                  input->getData(),
-                                  ioDesc,
-                                  cudnnOut->getData(),
-                                  bnDesc,
-                                  scale->getData(),
-                                  bias->getData(),
-                                  movingMean->getData(),
-                                  movingVar->getData(),
-                                  EPS);
-
-  hl_batch_norm_cuda_inference(input->getData(),
-                               cudaOut->getData(),
-                               scale->getData(),
-                               bias->getData(),
-                               movingMean->getData(),
-                               movingVar->getData(),
-                               EPS,
-                               n,
-                               c,
-                               h,
-                               w);
-
-  cudnnCheck->copyFrom(*cudnnOut);
-  cudaCheck->copyFrom(*cudaOut);
-  autotest::TensorCheckErr(*cudnnCheck, *cudaCheck);
-
-  hl_destroy_tensor_descriptor(ioDesc);
-  hl_destroy_tensor_descriptor(bnDesc);
-}
-
-TEST(BatchNorm, Inference) {
-  batchNormInference(33, 267, 1, 1);
-  batchNormInference(19, 105, 4, 4);
-}
-#endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp
deleted file mode 100644
index 9f3d2936569af8f1923a471f4d262e9a472649c0..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_CRFLayerGrad.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/LinearChainCRF.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
-  for (auto& v : seq) {
-    if (++v < numClasses) {
-      return true;
-    }
-    v = 0;
-  }
-  return false;
-}
-
-// log(exp(x) + exp(y))
-static inline real logSum(real x, real y) {
-  real maxValue = std::max(x, y);
-  if (std::isinf(maxValue)) {
-    return -std::numeric_limits<real>::infinity();
-  } else {
-    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
-  }
-}
-
-static inline std::vector<int> genRandLabels(int numClasses, int length) {
-  std::vector<int> labels(length);
-  for (int i = 0; i < length; ++i) {
-    labels[i] = rand() % numClasses;  // NOLINT
-  }
-  return labels;
-}
-
-TEST(CRFLayer, cost) {
-  const int numClasses = 4;
-  CpuVector para(numClasses * (numClasses + 2));
-  real* a = para.getData();
-  real* b = para.getData() + numClasses;
-  real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData());
-  for (int length : {1, 2, 3, 10}) {
-    for (int tries = 0; tries < 10; ++tries) {
-      CpuMatrix x(length, numClasses);
-      x.randomizeUniform();
-      para.randnorm(0, 2);
-
-      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
-
-      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
-
-      real logZ = -std::numeric_limits<real>::infinity();
-      real logNominator = -std::numeric_limits<real>::infinity();
-      std::vector<int> testResult(length, 0);
-      do {
-        real score = a[testResult.front()];
-        score += x.getElement(0, testResult.front());
-        for (int k = 1; k < length; ++k) {
-          score += x.getElement(k, testResult[k]) +
-                   w[numClasses * testResult[k - 1] + testResult[k]];
-        }
-        score += b[testResult.back()];
-        logZ = logSum(logZ, score);
-
-        if (goldenLabels == testResult) {
-          logNominator = score;
-        }
-      } while (getNextSequence(testResult, numClasses));
-
-      real trueCost = -logNominator + logZ;
-
-      real diff = fabs(trueCost - cost);
-      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
-      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff
-              << std::endl;
-      if (typeid(real) == typeid(double)) {  // NOLINT
-        EXPECT_LE(diff, 1e-10);
-      } else {
-        EXPECT_LE(diff, 5e-3);
-      }
-    }
-  }
-}
-
-inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }
-
-TestConfig initTestConfig(size_t numClasses, bool withWeight) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(numClasses);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
-                              "layer_0",
-                              numClasses,
-                              numClasses * (numClasses + 2)});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
-  config.layerConfig.add_inputs();
-
-  if (withWeight) {
-    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
-    config.layerConfig.add_inputs();
-  }
-
-  return config;
-}
-
-TEST(Layer, CRFLayer) {
-  size_t numClasses = 10;
-  for (int tries = 0; tries < 5; ++tries) {
-    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
-    for (int length : {1, 3, 100}) {
-      // Not support GPU now
-      testLayerGrad(config,
-                    "crf",
-                    length,
-                    /* trans= */ false,
-                    /* useGpu= */ false,
-                    /* useWeight= */ false,
-                    epsilon());
-    }
-  }
-}
-
-TEST(Layer, CRFLayerUseWeight) {
-  size_t numClasses = 10;
-  for (int tries = 0; tries < 5; ++tries) {
-    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
-    for (int length : {1, 3, 100}) {
-      // Not support GPU now
-      testLayerGrad(config,
-                    "crf",
-                    length,
-                    /* trans= */ false,
-                    /* useGpu= */ false,
-                    /* useWeight= */ false,
-                    epsilon());
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp
deleted file mode 100644
index 2fbc404125a9364ac44a990f8ec92962cf7d1298..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_CompareSparse.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/PythonUtil.h>
-
-#include "paddle/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-#include <paddle/pserver/ParameterServer2.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
-
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_int32(gpu_id);
-DECLARE_int32(seed);
-DECLARE_int32(num_passes);
-DECLARE_int32(saving_period);
-
-DECLARE_int32(num_gradient_servers);
-DECLARE_int32(port);
-DECLARE_bool(local);
-DECLARE_bool(use_old_updater);
-DECLARE_bool(parallel_nn);
-DECLARE_string(config_args);
-DEFINE_double(max_diff_ratio,
-              0.0f,
-              "max diff ratio allowed for parameters value");
-
-int gNumDevices = 0;
-
-std::vector<ParameterPtr> trainerOnePassTest(const string& configFile,
-                                             bool sparseUpdate,
-                                             int trainerCount = 1,
-                                             bool useGpu = false) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  FLAGS_config_args = sparseUpdate ? "sparse_update=1" : "sparse_update=0";
-
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate;
-  srand(FLAGS_seed);
-  *ThreadLocalRand::getSeed() = FLAGS_seed;
-  ThreadLocalRandomEngine::get().seed(FLAGS_seed);
-  if (useGpu) {
-    CHECK_LE(trainerCount, gNumDevices);
-  }
-
-  std::vector<std::shared_ptr<ParameterServer2>> pservers;
-  if (!FLAGS_local) {
-    int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-    pservers.resize(numPorts);
-
-    for (int i = 0; i < numPorts; ++i) {
-      pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
-      pservers[i]->init();
-      pservers[i]->start();
-    }
-  }
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig());
-  trainer.train();
-  return trainer.getGradientMachine()->getParameters();
-}
-
-std::vector<ParameterPtr>& getDenseParameters() {
-  static std::vector<ParameterPtr> denseParameters;
-  if (denseParameters.empty()) {
-    // use dense training as base
-    FLAGS_local = true;
-    denseParameters = trainerOnePassTest(configFile1, false);
-  }
-
-  return denseParameters;
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 double maxDiffRatio) {
-  double maxDiff = 0;
-  double maxValue = 0;
-  for (size_t i = 0; i < len; ++i) {
-    double diff = fabs(A[i] - B[i]);
-    maxValue = std::max<double>(maxValue, std::max(fabs(A[i]), fabs(B[i])));
-    maxDiff = std::max(maxDiff, diff);
-  }
-  EXPECT_LE(maxDiff / maxValue, maxDiffRatio);
-  LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue
-            << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n";
-}
-
-void compareValue(const vector<ParameterPtr>& parametersA,
-                  const vector<ParameterPtr>& parametersB,
-                  double maxDiffRatio = 0.0) {
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "para_A",
-                paraB.getData(),
-                "para_B",
-                paraA.getSize(),
-                maxDiffRatio);
-  }
-}
-
-TEST(compareSparse, cpu) {
-  FLAGS_local = 1;  // disable remote sparse update in parameter config
-  std::vector<ParameterPtr> parameters = trainerOnePassTest(configFile1, true);
-  compareValue(getDenseParameters(), parameters);
-}
-
-TEST(compareSparse, remote_cpu) {
-  FLAGS_local = 0;  // will enable remote sparse update
-  FLAGS_ports_num_for_sparse = 5;
-  std::vector<ParameterPtr> parameters = trainerOnePassTest(configFile1, true);
-  compareValue(getDenseParameters(), parameters);
-}
-
-TEST(compareSparse, cpu10_local_vs_remote) {
-  FLAGS_local = 1;  // disable remote sparse update in parameter config
-  std::vector<ParameterPtr> localParameters =
-      trainerOnePassTest(configFile1, true, 2);
-
-  FLAGS_local = 0;  // will enable remote sparse update
-  FLAGS_ports_num_for_sparse = 5;
-  std::vector<ParameterPtr> remoteParameters =
-      trainerOnePassTest(configFile1, true, 2);
-
-  compareValue(localParameters, remoteParameters);
-}
-
-TEST(compareSparse, multiGradientMachine) {
-  int numGpu;
-#ifdef PADDLE_TYPE_DOUBLE
-  double eps = 1e-8;
-#else
-  double eps = 1e-4;
-#endif
-  numGpu = hl_get_device_count();
-  for (bool local : {false, true}) {
-    FLAGS_local = local;
-    FLAGS_ports_num_for_sparse = 5;
-    for (bool useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-      if (useGpu) continue;
-#endif
-      FLAGS_parallel_nn = useGpu;
-      LOG(INFO) << " local=" << local << " useGpu=" << useGpu;
-      int trainerCount = useGpu ? numGpu : 2;
-      std::vector<ParameterPtr> parameters =
-          trainerOnePassTest(configFile1, true, trainerCount, useGpu);
-      compareValue(getDenseParameters(), parameters, eps);
-    }
-  }
-  FLAGS_parallel_nn = false;
-}
-
-TEST(compareSparse, NeuralNetwork) {
-#ifdef PADDLE_TYPE_DOUBLE
-  double eps = 1e-8;
-#else
-  double eps = 1e-4;
-#endif
-  for (bool local : {false, true}) {
-    FLAGS_local = local;
-    FLAGS_ports_num_for_sparse = 5;
-    for (bool useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-      if (useGpu) continue;
-#endif
-      FLAGS_parallel_nn = useGpu;
-      LOG(INFO) << " local=" << local << " useGpu=" << useGpu;
-      int trainerCount = 1;
-      std::vector<ParameterPtr> parameters =
-          trainerOnePassTest(configFile1, true, trainerCount, useGpu);
-      compareValue(getDenseParameters(), parameters, useGpu ? eps : 0);
-    }
-  }
-  FLAGS_parallel_nn = false;
-}
-
-int main(int argc, char** argv) {
-  // FIXME(tonyyang-svail):
-  //   Turn off this test due CI failure:
-  //   https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430
-  return 0;
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  gNumDevices = hl_get_device_count();
-  FLAGS_num_passes = 1;          // train one pass
-  FLAGS_saving_period = 100000;  // do not save parameter
-
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp
deleted file mode 100644
index 1c9b4002a34ca5a9b668be69bd0ad392eb763803..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_CompareTwoNets.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/trainer/Trainer.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-
-DECLARE_bool(local);
-DECLARE_bool(use_gpu);
-
-DECLARE_string(config);
-DECLARE_string(nics);
-
-DEFINE_bool(need_high_accuracy,
-            false,
-            "whether need to run in double accuracy");
-DEFINE_double(
-    max_diff_ratio,
-    0.0f,
-    "max diff ratio allowed for outputs and parameters (value/gradient)");
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_int32(seed);
-
-static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
-static const string& config_file_b =
-    "gserver/tests/sequence_recurrent_group.py";
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(ComData& data, const string configFile) {
-  FLAGS_config = configFile;
-
-  FLAGS_local = true;
-  FLAGS_use_gpu = false;
-
-  FLAGS_nics = "";
-
-  *ThreadLocalRand::getSeed() = FLAGS_seed;
-  srand(FLAGS_seed);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
-
-  data.parameters = trainer.getGradientMachine()->getParameters();
-
-  DataBatch dataBatch;
-  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
-
-  trainer.getDataProvider()->reset();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
-
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  vector<Argument>& inArgs = dataBatch.getStreams();
-
-  trainer.getGradientMachine()->start();
-  trainer.getGradientMachine()->forwardBackward(
-      inArgs, &data.outArgs, PASS_TRAIN);
-
-  trainer.getGradientMachine()->finish();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  real maxVal = 0;
-  for (size_t i = 0; i < len; ++i) {
-    maxVal = std::max(maxVal, std::max(A[i], B[i]));
-  }
-  real maxDiff = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    maxDiff = std::max(maxDiff, diff);
-    if (diff > maxVal * FLAGS_max_diff_ratio) {
-      nNum++;
-      VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << "    "
-              << desB << " : " << B[i] << " diff=" << diff;
-    }
-  }
-  EXPECT_EQ(0, nNum);
-  LOG(INFO) << "maxValue=" << maxVal << " maxDiff=" << maxDiff << "\n\n";
-}
-
-void compareGradient(ComData& comDataA, ComData& comDataB) {
-  vector<Argument> outArgsA = comDataA.outArgs;
-  vector<Argument> outArgsB = comDataB.outArgs;
-
-  for (size_t i = 0; i < outArgsA.size(); ++i) {
-    CpuMatrix matA(outArgsA[i].value->getHeight(),
-                   outArgsA[i].value->getWidth());
-    CpuMatrix matB(outArgsB[i].value->getHeight(),
-                   outArgsB[i].value->getWidth());
-
-    matA.copyFrom(*outArgsA[i].value);
-    matB.copyFrom(*outArgsB[i].value);
-
-    LOG(INFO) << "\n--------------------------------"
-              << " Check Network Output_" << i << ":"
-              << " -------------------------------------\n";
-    checkBuffer(matA.getData(),
-                "network A output",
-                matB.getData(),
-                "network B output",
-                matA.getElementCnt(),
-                matA.getWidth());
-  }
-
-  vector<ParameterPtr>& parametersA = comDataA.parameters;
-  vector<ParameterPtr>& parametersB = comDataB.parameters;
-
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "Network A",
-                paraB.getData(),
-                "Network B",
-                paraA.getSize());
-
-    CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
-    CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
-              << " ; size : " << gradA.getSize() << " -----------";
-    checkBuffer(gradA.getData(),
-                "Network A",
-                gradB.getData(),
-                "Network B",
-                gradA.getSize());
-  }
-}
-
-TEST(Trainer, create) {
-  ComData dataA;
-  calcGradient(dataA, config_file_a);
-  LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
-
-  ComData dataB;
-  calcGradient(dataB, config_file_b);
-  LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-int main(int argc, char** argv) {
-  FLAGS_thread_local_rand_use_global_seed = true;
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  if (FLAGS_need_high_accuracy) {
-    LOG(INFO) << "skip test due to it's need high accuracy";
-    return 0;
-  }
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 1e-5;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in low accuracy mode";
-  }
-#else
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 1e-10;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in high accuracy mode";
-  }
-#endif
-
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
deleted file mode 100644
index 2e394a74b7d53fc53727d817c06479d545ade65d..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-// Test that the convTrans forward is the same as conv backward
-TEST(Layer, convTransLayerFwd) {
-  // Setting up conv-trans layer
-  TestConfig configt;
-  configt.biasSize = 3;
-  configt.layerConfig.set_type("exconvt");
-  configt.layerConfig.set_num_filters(3);
-  configt.layerConfig.set_partial_sum(1);
-  configt.layerConfig.set_shared_biases(true);
-
-  configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(4);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(3 / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  configt.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                               configt.layerConfig.num_filters());
-  configt.layerConfig.set_name("convTrans");
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr convtLayer;
-  initTestLayer(configt, &layerMap, &parameters, &convtLayer);
-  convtLayer->getBiasParameter()->zeroMem();
-  convtLayer->forward(PASS_GC);
-
-  // Setting up conv-layer config
-  TestConfig config;
-  config.biasSize = 16;
-  config.layerConfig.set_type("exconv");
-  config.layerConfig.set_num_filters(16);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384});
-  input = config.layerConfig.add_inputs();
-  conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(4);
-  conv->set_channels(3);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
-                              config.layerConfig.num_filters());
-  config.layerConfig.set_name("conv");
-
-  // data layer initialize
-  std::vector<DataLayerPtr> dataLayers2;
-  LayerMap layerMap2;
-  vector<Argument> datas2;
-  initDataLayer(
-      config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false);
-  // test layer initialize
-  std::vector<ParameterPtr> parameters2;
-  LayerPtr convLayer;
-  initTestLayer(config, &layerMap2, &parameters2, &convLayer);
-
-  // Sync convLayer and convtLayer parameter
-  convLayer->getBiasParameter()->zeroMem();
-  convLayer->getParameters()[0]
-      ->getBuf(PARAMETER_VALUE)
-      ->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)));
-
-  // Set convLayer outputGrad as convTransLayer input value
-  convLayer->forward(PASS_GC);
-  convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue()));
-
-  vector<int> callbackFlags(parameters2.size(), 0);
-  auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; };
-  convLayer->backward(callback);
-
-  // Check that the convLayer backward is the same as convTransLayer forward
-  checkMatrixEqual(convtLayer->getOutputValue(),
-                   dataLayers2[0]->getOutputGrad());
-}
-
-// Do one forward pass of convTrans layer and check to see if its output
-// matches the given result
-void doOneConvtTest(size_t imgSize,
-                    size_t output_x,
-                    size_t stride,
-                    size_t padding,
-                    size_t filter_size,
-                    MatrixPtr& result) {
-  TestConfig configt;
-  configt.biasSize = 1;
-  configt.layerConfig.set_type("exconvt");
-  configt.layerConfig.set_num_filters(1);
-  configt.layerConfig.set_partial_sum(1);
-  configt.layerConfig.set_shared_biases(true);
-
-  configt.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size});
-  LayerInputConfig* input = configt.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(filter_size);
-  conv->set_filter_size_y(filter_size);
-  conv->set_channels(1);
-  conv->set_padding(padding);
-  conv->set_padding_y(padding);
-  conv->set_stride(stride);
-  conv->set_stride_y(stride);
-  conv->set_groups(1);
-  conv->set_filter_channels(1);
-  conv->set_img_size(imgSize);
-  conv->set_output_x(output_x);
-
-  configt.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                               configt.layerConfig.num_filters());
-  configt.layerConfig.set_name("convTrans");
-
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false);
-  dataLayers[0]->getOutputValue()->zeroMem();
-  dataLayers[0]->getOutputValue()->add(1.0);
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr convtLayer;
-  initTestLayer(configt, &layerMap, &parameters, &convtLayer);
-  convtLayer->getBiasParameter()->zeroMem();
-  convtLayer->getParameters()[0]->zeroMem();
-  convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0);
-  convtLayer->forward(PASS_GC);
-
-  checkMatrixEqual(convtLayer->getOutputValue(), result);
-}
-
-TEST(Layer, convTransLayerFwd2) {
-  MatrixPtr result;
-  result = Matrix::create(1, 5 * 5, false, false);
-  result->zeroMem();
-  result->add(1.0);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 1,
-                 /* stride */ 1,
-                 /* padding */ 0,
-                 /* filter_size */ 5,
-                 result);
-
-  real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
-                       4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
-  result->setData(resultData);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 2,
-                 /* stride */ 1,
-                 /* padding */ 0,
-                 /* filter_size */ 4,
-                 result);
-
-  real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
-                        4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
-  result->setData(resultData2);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 2,
-                 /* stride */ 2,
-                 /* padding */ 1,
-                 /* filter_size */ 5,
-                 result);
-
-  real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4,
-                        2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1};
-  result->setData(resultData3);
-  doOneConvtTest(/* imgSize */ 5,
-                 /* output_x */ 2,
-                 /* stride */ 2,
-                 /* padding */ 0,
-                 /* filter_size */ 3,
-                 result);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
deleted file mode 100644
index ba820d9a2acabf95ff816705e4df124bb95da077..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-// Do one forward pass of ConvLayer using either exconv or cudnn_conv
-MatrixPtr doOneConvTest(size_t imgSize,
-                        size_t output_x,
-                        size_t stride,
-                        size_t padding,
-                        size_t filter_size,
-                        size_t channel,
-                        size_t numfilters,
-                        size_t groups,
-                        MatrixPtr& inputData,
-                        real* param,
-                        bool useGpu,
-                        bool isDeconv = false) {
-  TestConfig config;
-  config.biasSize = numfilters;
-  string layerType;
-  if (useGpu) {
-    layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv";
-  } else {
-    layerType = (isDeconv) ? "exconvt" : "exconv";
-  }
-  config.layerConfig.set_type(layerType);
-  config.layerConfig.set_num_filters(numfilters);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  size_t weightSize = channel * filter_size * filter_size *
-                      config.layerConfig.num_filters() / groups;
-  if (isDeconv) {
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize});
-    config.layerConfig.set_size(imgSize * imgSize *
-                                config.layerConfig.num_filters());
-  } else {
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
-    config.layerConfig.set_size(output_x * output_x *
-                                config.layerConfig.num_filters());
-  }
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(filter_size);
-  conv->set_filter_size_y(filter_size);
-  conv->set_channels(channel);
-  conv->set_padding(padding);
-  conv->set_padding_y(padding);
-  conv->set_stride(stride);
-  conv->set_stride_y(stride);
-  conv->set_groups(groups);
-  conv->set_img_size(imgSize);
-  conv->set_output_x(output_x);
-
-  if (isDeconv) {
-    conv->set_filter_channels(numfilters / groups);
-  } else {
-    conv->set_filter_channels(channel / groups);
-  }
-
-  config.layerConfig.set_name("conv");
-
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-  initDataLayer(
-      config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu);
-  dataLayers[0]->getOutputValue()->zeroMem();
-  dataLayers[0]->getOutputValue()->copyFrom(*inputData);
-
-  // test layer initialize
-  std::vector<ParameterPtr> parameters;
-  LayerPtr convLayer;
-  initTestLayer(config, &layerMap, &parameters, &convLayer);
-  convLayer->getBiasParameter()->zeroMem();
-  convLayer->getParameters()[0]->zeroMem();
-  convLayer->getParameters()[0]
-      ->getBuf(PARAMETER_VALUE)
-      ->copyFrom(param, weightSize);
-  convLayer->forward(PASS_GC);
-
-  return convLayer->getOutputValue();
-}
-
-TEST(Layer, convParaUnified) {
-#ifdef PADDLE_WITH_CUDA
-  MatrixPtr input, resultCpu, resultGpu;
-
-  /// TEST1 for conv ///
-  input = Matrix::create(1, 4 * 4, false, false);
-  real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
-  real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
-
-  input->setData(inputData);
-
-  resultCpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ false);
-
-  resultGpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST1 for deconv ///
-  input = Matrix::create(1, 2 * 2, false, false);
-  real inputDataT[] = {1, 2, 3, 4};
-  input->setData(inputDataT);
-
-  resultCpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ false,
-                            /*isDeconv*/ true);
-
-  resultGpu = doOneConvTest(/* imgSize */ 4,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 3,
-                            /*channel*/ 1,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param,
-                            /*useGpu*/ true,
-                            /*isDeconv*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST2 for conv ///
-  input = Matrix::create(1, 3 * 3 * 2, false, false);
-  real inputData2[] = {
-      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
-  real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1};
-
-  input->setData(inputData2);
-
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ false);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST3 for conv ///
-  real param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
-
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ false);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST2 for deconv ///
-  input = Matrix::create(1, 2 * 2 * 2, false, false);
-  real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8};
-  input->setData(inputData2T);
-
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ false,
-                            /*isDeconv*/ true);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 1,
-                            input,
-                            param2,
-                            /*useGpu*/ true,
-                            /*isDeconv*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-
-  /// TEST3 for deconv ///
-  resultCpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ false,
-                            /*isDeconv*/ true);
-
-  resultGpu = doOneConvTest(/* imgSize */ 3,
-                            /* output_x */ 2,
-                            /* stride */ 1,
-                            /* padding */ 0,
-                            /* filter_size */ 2,
-                            /*channel*/ 2,
-                            /*numfilters*/ 2,
-                            /*groups*/ 2,
-                            input,
-                            param3,
-                            /*useGpu*/ true,
-                            /*isDeconv*/ true);
-  checkMatrixEqual(resultCpu, resultGpu);
-#endif
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
deleted file mode 100644
index 0041ed30939d1a6111a2db753da6172bb65e374b..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ /dev/null
@@ -1,352 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-#include <sstream>
-
-#include <gtest/gtest.h>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-const size_t MAX_SEQ_NUM = 23;
-const size_t MAX_SEQ_LEN = 50;
-const size_t MAX_BEAM_SIZE = 27;
-
-const size_t SEED = (size_t)(time(NULL));
-
-struct SingleBeamExpansion {
-  vector<int> seqStartPos;
-  vector<int> subSeqStartPos;
-  vector<real> candidateScores;
-
-  // TODO(caoying): store this into Argument.ids
-  vector<real> selectedIndices;
-
-  vector<int> groundTruth;
-  vector<size_t> inBeam;
-  vector<int> rowIdxInBeam;
-  vector<int> colIdxInBeam;
-
-  void resetGroundTruth(size_t n) {
-    groundTruth.clear();
-    groundTruth.resize(n, -1);
-
-    inBeam.clear();
-    inBeam.resize(n, 0);
-
-    rowIdxInBeam.clear();
-    rowIdxInBeam.resize(n, -1);
-
-    colIdxInBeam.clear();
-    colIdxInBeam.resize(n, -1);
-  }
-};
-
-inline float randFloat() {
-  return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
-}
-
-void genRand(real* numbers, size_t n) {
-  default_random_engine generator;
-  uniform_real_distribution<real> distribution(0.0, 1.0);
-  for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator);
-}
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-void genCandidateScores(bool hasSubseq,
-                        size_t beamSize,
-                        SingleBeamExpansion& prevBeam,
-                        SingleBeamExpansion& curBeam) {
-  vector<int>& seqStartPos = curBeam.seqStartPos;
-  seqStartPos.resize(1, 0);
-  vector<int>& subSeqStartPos = curBeam.subSeqStartPos;
-  subSeqStartPos.resize(1, 0);
-
-  srand(SEED);
-  if (prevBeam.selectedIndices.size()) {
-    if (prevBeam.subSeqStartPos.size() > 1) {
-      int seqIdx = 1;
-      // samples in previous beam are nested sequences.
-      for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) {
-        for (size_t j = 0; j < beamSize; ++j) {
-          if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break;
-          subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) +
-                                   subSeqStartPos.back());
-        }
-        if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) {
-          seqStartPos.push_back(subSeqStartPos.back());
-          seqIdx++;
-        }
-      }
-    } else {
-      for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) {
-        if (i && i % beamSize == 0) {
-          seqStartPos.push_back(subSeqStartPos.back());
-          if (i == prevBeam.selectedIndices.size()) break;
-        }
-        if (prevBeam.selectedIndices[i] == -1.) continue;
-        subSeqStartPos.push_back(subSeqStartPos.back() +
-                                 (1 + (rand() % MAX_SEQ_LEN)));
-      }
-    }
-  } else {
-    // the first beam expansion
-    int seqNum = 1 + (rand() % MAX_SEQ_NUM);
-    for (int i = 0; i < seqNum; ++i) {
-      if (hasSubseq) {
-        for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j)
-          subSeqStartPos.push_back(subSeqStartPos.back() +
-                                   (1 + (rand() % MAX_SEQ_LEN)));
-        seqStartPos.push_back(subSeqStartPos.back());
-      } else {
-        seqStartPos.push_back(seqStartPos.back() +
-                              (1 + (rand() % MAX_SEQ_LEN)));
-      }
-    }
-  }
-
-  size_t totalSeqNum = hasSubseq ? subSeqStartPos.back() : seqStartPos.back();
-  curBeam.candidateScores.resize(totalSeqNum, 0.);
-  genRand(curBeam.candidateScores.data(), totalSeqNum);
-}
-
-void genSelectedIndices(size_t beamSize,
-                        vector<int>& seqStartPos,
-                        vector<real>& selectedIndices) {
-  size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1);
-  selectedIndices.resize(selectedIdsCount, -1.);
-
-  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
-    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
-    int n = min(seqLen, static_cast<int>(beamSize));
-    vector<real> ids = randSampling(seqLen, n);
-    memcpy(selectedIndices.data() + i * beamSize,
-           ids.data(),
-           sizeof(real) * ids.size());
-  }
-}
-
-void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
-                    size_t beamSize) {
-  SingleBeamExpansion& beam = beamExpansions[1];
-  size_t seqNum = beam.seqStartPos.size() - 1;
-  for (size_t i = 2; i < beamExpansions.size(); ++i)
-    CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1);
-
-  srand(SEED);
-
-  // initialize the first beam.
-  beam.resetGroundTruth(seqNum);
-  for (size_t i = 0; i < seqNum; ++i) {
-    if (randFloat() > 0.5) {
-      /*
-       * force the randomly generated label falls in the beam by chance 0.5.
-       * otherwise, when sequence length is relatively long and beam size is
-       * relatively small, the gold sequences falls off the beam at in the
-       * first search.
-       */
-      real* begPos = beam.selectedIndices.data() + i * beamSize;
-      beam.colIdxInBeam[i] =
-          rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
-            return val != -1.;
-          });
-      beam.groundTruth[i] =
-          beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]];
-      beam.inBeam[i] = 1;
-    } else {
-      int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]);
-      beam.groundTruth[i] = label;
-
-      real* begPos = beam.selectedIndices.data() + i * beamSize;
-      real* endPos = begPos + beamSize;
-      real* lblPos = find(begPos, endPos, real(label));
-      if (lblPos != endPos) {
-        beam.inBeam[i] = 1;
-        beam.colIdxInBeam[i] = lblPos - begPos;
-      }
-    }
-    beam.rowIdxInBeam[i] = i;
-  }
-
-  // iterate over each beam expansions
-  for (size_t i = 2; i < beamExpansions.size(); ++i) {
-    SingleBeamExpansion& curBeam = beamExpansions[i];
-    SingleBeamExpansion& prevBeam = beamExpansions[i - 1];
-    curBeam.resetGroundTruth(seqNum);
-
-    // iterate over each sequence
-    for (size_t j = 0; j < seqNum; ++j) {
-      if (!prevBeam.inBeam[j]) continue;
-
-      // gold sequence falls in the beam in previous search.
-      real* begPos = prevBeam.selectedIndices.data();
-      int offset =
-          prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j];
-      curBeam.rowIdxInBeam[j] = count_if(
-          begPos, begPos + offset, [](const real& val) { return val != -1.; });
-
-      if (randFloat() > 0.5) {
-        // force the randomly generated label falls in the beam by chance 0.5.
-
-        real* start =
-            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
-        int n = rand() % count_if(start, start + beamSize, [](const real& val) {
-                  return val != -1.;
-                });
-        curBeam.colIdxInBeam[j] = n;
-        curBeam.groundTruth[j] = *(start + n);
-        curBeam.inBeam[j] = 1;
-      } else {
-        CHECK_LE((size_t)curBeam.rowIdxInBeam[j] + 1,
-                 curBeam.subSeqStartPos.size() - 1);
-        int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]];
-        int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1];
-        CHECK_GT(size_t(end), size_t(start));
-        int label = rand() % (end - start);
-
-        curBeam.groundTruth[j] = label;
-        real* findBeg =
-            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
-        real* lblPos =
-            find(findBeg, findBeg + beamSize, static_cast<real>(label));
-        if (lblPos != (findBeg + beamSize)) {
-          curBeam.inBeam[j] = 1;
-          curBeam.colIdxInBeam[j] = lblPos - findBeg;
-        }
-      }
-    }
-  }
-}
-
-void genOneBeam(size_t beamSize,
-                bool hasSubseq,
-                SingleBeamExpansion& prevBeam,
-                SingleBeamExpansion& curBeam) {
-  genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam);
-  genSelectedIndices(beamSize,
-                     hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos,
-                     curBeam.selectedIndices);
-}
-
-void genRandomBeamExpansion(size_t expansionCount,
-                            size_t beamSize,
-                            vector<SingleBeamExpansion>& beamExpansions) {
-  beamExpansions.clear();
-  beamExpansions.resize(expansionCount + 1);
-
-  // beamExpansions[0] is reserved.
-  for (size_t i = 1; i <= expansionCount; ++i)
-    genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]);
-  genGroundTruth(beamExpansions, beamSize);
-}
-
-void testCrossEntropyOverBeam(bool useGpu,
-                              size_t beamSize,
-                              vector<SingleBeamExpansion>& beams) {
-  TestConfig config;
-  config.layerConfig.set_type("cross_entropy_over_beam");
-
-  size_t seqNum = 0;
-  for (size_t i = 1; i < beams.size(); ++i) {
-    const SingleBeamExpansion& beam = beams[i];
-    // create scores for all the candidates
-    MatrixPtr candidateScorePtr =
-        Matrix::create(beam.candidateScores.size(), 1, false, false);
-    candidateScorePtr->copyFrom(beam.candidateScores.data(),
-                                beam.candidateScores.size());
-
-    ostringstream paramName;
-    paramName << "candidate_scores_" << i;
-
-    if (beam.subSeqStartPos.size() > 1) {
-      seqNum = beam.subSeqStartPos.size() - 1;
-      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                  paramName.str(),
-                                  candidateScorePtr,
-                                  beam.seqStartPos,
-                                  beam.subSeqStartPos});
-    } else {
-      seqNum = beam.seqStartPos.size() - 1;
-      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                  paramName.str(),
-                                  candidateScorePtr,
-                                  beam.seqStartPos});
-    }
-    config.layerConfig.add_inputs();
-
-    // create indices for the selected candidates
-    MatrixPtr selectedCandidates =
-        Matrix::create(seqNum, beamSize, false, false);
-    selectedCandidates->copyFrom(beam.selectedIndices.data(),
-                                 beam.selectedIndices.size());
-    paramName.clear();
-    paramName << "selected_candidates_" << i;
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates});
-    config.layerConfig.add_inputs();
-
-    // create the ground truth
-    paramName.clear();
-    paramName << "label_" << i;
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth});
-    config.layerConfig.add_inputs();
-  }
-
-  testLayerGrad(
-      config, "cross_entropy_over_beam", seqNum, false, useGpu, false);
-}
-
-TEST(Layer, CrossEntropyOverBeam) {
-  LOG(INFO) << "SEED = " << SEED;
-  const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
-  LOG(INFO) << "beamSize = " << beamSize;
-
-  // TODO(caoying): test with random beam expansions.
-  const size_t expansionCount = 3;
-  vector<SingleBeamExpansion> beams;
-  genRandomBeamExpansion(expansionCount, beamSize, beams);
-
-  for (bool useGpu : {false, true})
-    testCrossEntropyOverBeam(useGpu, beamSize, beams);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(SEED);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
deleted file mode 100644
index 4a8843f3affe7b1d4f3172be733aefc085c9e7a5..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/testing/TestUtil.h"
-#include "paddle/trainer/Trainer.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-enum InputType {
-  INPUT_DATA,         // dense vector
-  INPUT_LABEL,        // id
-  INPUT_DATA_TARGET,  // dense vector, but no gradient
-  INPUT_SEQUENCE_DATA,
-  INPUT_SEQUENCE_LABEL,
-  INPUT_SPARSE_NON_VALUE_DATA
-};
-
-struct InputDef {
-  InputType inputType;
-  string name;
-  size_t dim;
-};
-
-struct TestConfig {
-  EvaluatorConfig evaluatorConfig;
-  std::vector<InputDef> inputDefs;
-  bool testAccumulate;
-  TestConfig() : testAccumulate(true) {}
-};
-
-void testEvaluator(TestConfig testConf,
-                   string testEvaluatorName,
-                   size_t batchSize,
-                   bool useGpu) {
-#ifndef PADDLE_WITH_CUDA
-  if (useGpu) return;
-#endif
-  FLAGS_use_gpu = useGpu;
-  testConf.evaluatorConfig.set_name(testEvaluatorName);
-  LOG(INFO) << " evaluator_type=" << testConf.evaluatorConfig.type()
-            << " useGpu=" << useGpu;
-
-  std::vector<Argument> arguments;
-  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
-    Argument data;
-    size_t dim = testConf.inputDefs[i].dim;
-    switch (testConf.inputDefs[i].inputType) {
-      case INPUT_DATA:
-      case INPUT_SEQUENCE_DATA:
-      case INPUT_DATA_TARGET:
-        data.value = Matrix::create(batchSize, dim, false, useGpu);
-        data.value->randomizeUniform();
-
-        // make sure output > 0 && output < 1
-        data.value->add(-0.5);
-        data.value->sigmoid(*data.value);
-        break;
-      case INPUT_LABEL:
-      case INPUT_SEQUENCE_LABEL:
-        data.ids = VectorT<int>::create(batchSize, useGpu);
-        data.ids->rand(dim);  // now rand number can be 0 to inputDefs[i].dim.
-        break;
-      case INPUT_SPARSE_NON_VALUE_DATA:
-        data.value = makeRandomSparseMatrix(batchSize,
-                                            dim,
-                                            /* withValue= */ false,
-                                            useGpu);
-        break;
-      default:
-        LOG(FATAL) << " unknown inputType ";
-        return;
-    }
-
-    ICpuGpuVectorPtr sequenceStartPositions;
-    if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA ||
-        testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL) {
-      if (!sequenceStartPositions) {
-        generateSequenceStartPositions(batchSize, sequenceStartPositions);
-      }
-      data.sequenceStartPositions = sequenceStartPositions;
-    }
-
-    arguments.push_back(data);
-  }
-
-  Evaluator* testEvaluator = Evaluator::create(testConf.evaluatorConfig);
-  double totalScore = 0.0;
-  testEvaluator->start();
-  totalScore += testEvaluator->evalImp(arguments);
-  testEvaluator->updateSamplesNum(arguments);
-  testEvaluator->finish();
-  LOG(INFO) << *testEvaluator;
-
-  std::vector<std::string> names;
-  testEvaluator->getNames(&names);
-  paddle::Error err;
-  for (auto& name : names) {
-    auto value = testEvaluator->getValue(name, &err);
-    ASSERT_TRUE(err.isOK());
-    LOG(INFO) << name << " " << value;
-    auto tp = testEvaluator->getType(name, &err);
-    ASSERT_TRUE(err.isOK());
-    ASSERT_EQ(testConf.evaluatorConfig.type(), tp);
-  }
-
-  double totalScore2 = 0.0;
-  if (testConf.testAccumulate) {
-    testEvaluator->start();
-    totalScore2 += testEvaluator->evalImp(arguments);
-    testEvaluator->finish();
-    EXPECT_LE(fabs(totalScore - totalScore2), 1.0e-5);
-  }
-}
-
-void testEvaluatorAll(TestConfig testConf,
-                      string testEvaluatorName,
-                      size_t batchSize) {
-  testEvaluator(testConf, testEvaluatorName, batchSize, true);
-  testEvaluator(testConf, testEvaluatorName, batchSize, false);
-}
-
-TEST(Evaluator, detection_map) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("detection_map");
-  config.evaluatorConfig.set_overlap_threshold(0.5);
-  config.evaluatorConfig.set_background_id(0);
-  config.evaluatorConfig.set_ap_type("Integral");
-  config.evaluatorConfig.set_evaluate_difficult(0);
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 7});
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6});
-  config.evaluatorConfig.set_evaluate_difficult(false);
-  testEvaluatorAll(config, "detection_map", 100);
-
-  config.evaluatorConfig.set_evaluate_difficult(true);
-  testEvaluatorAll(config, "detection_map", 100);
-}
-
-TEST(Evaluator, classification_error) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("classification_error");
-  config.evaluatorConfig.set_top_k(5);
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 50});
-  config.inputDefs.push_back({INPUT_LABEL, "label", 50});
-  testEvaluatorAll(config, "classification_error", 100);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "classification_error_weight", 100);
-
-  // multi binary labels
-  config.inputDefs.clear();
-  config.inputDefs.push_back({INPUT_DATA, "output", 100});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 100});
-  // Not support GPU
-  testEvaluator(config, "classification_error_multi_binary_label", 50, false);
-
-  config.evaluatorConfig.set_classification_threshold(0.4);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  // Not support GPU
-  testEvaluator(
-      config, "classification_error_weight_multi_binary_label", 50, false);
-}
-
-TEST(Evaluator, sum) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("sum");
-
-  // sum of output
-  config.inputDefs.push_back({INPUT_DATA, "output", 10});
-  testEvaluatorAll(config, "sum_output", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "sum_output_weight", 200);
-
-  // sum of label
-  config.inputDefs.clear();
-  config.inputDefs.push_back({INPUT_LABEL, "label", 10});
-  testEvaluatorAll(config, "sum_label", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "sum_label_weight", 200);
-}
-
-TEST(Evaluator, last_column_sum) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("last-column-sum");
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 50});
-  testEvaluatorAll(config, "last-column-sum", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "last-column-sum_weight", 200);
-}
-
-TEST(Evaluator, last_column_auc) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("last-column-auc");
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 2});
-  config.inputDefs.push_back({INPUT_LABEL, "label", 2});
-  testEvaluatorAll(config, "last-column-auc", 500);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "last-column-auc_weight", 200);
-}
-
-TEST(Evaluator, precision_recall) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("precision_recall");
-
-  config.inputDefs.push_back({INPUT_DATA, "output", 10});
-  config.inputDefs.push_back({INPUT_LABEL, "label", 10});
-  testEvaluatorAll(config, "precision_recall", 200);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  testEvaluatorAll(config, "precision_recall_weight", 200);
-
-  LOG(INFO) << "positive_label = 5";
-  config.evaluatorConfig.set_positive_label(5);
-  testEvaluatorAll(config, "precision_recall_weight", 200);
-
-  // multi binary labels
-  config.inputDefs.clear();
-  config.evaluatorConfig.set_positive_label(-1);
-  config.inputDefs.push_back({INPUT_DATA, "output", 10});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 10});
-  // Not support GPU
-  testEvaluator(config, "precision_recall_multi_binary_label", 100, false);
-
-  LOG(INFO) << "classification_threshold = 0.4";
-  config.evaluatorConfig.set_classification_threshold(0.4);
-  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
-  // Not support GPU
-  testEvaluator(
-      config, "precision_recall_weight_multi_binary_label", 100, false);
-}
-
-TEST(Evaluator, ctc_error_evaluator) {
-  TestConfig config;
-  config.evaluatorConfig.set_type("ctc_edit_distance");
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "output", 32});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "label", 1});
-  testEvaluatorAll(config, "ctc_error_evaluator", 100);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp
deleted file mode 100644
index 168ffbdac8cd6fb0ee4fa62e3766905c30d1844b..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/utils/GlobalConstants.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-vector<int> randSampling(int range, int n) {
-  CHECK_GE(range, n);
-  vector<int> num(range);
-  iota(begin(num), end(num), 0);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  return num;
-}
-
-void genRandomSeqInfo(vector<int>& seqStartPosition,
-                      vector<int>& subSeqStartPosition) {
-  const int maxSeqNum = 100;
-  // generate random start position information
-  int seqNum = 1 + (rand() % maxSeqNum);
-  seqStartPosition.resize(seqNum + 1, 0);
-  subSeqStartPosition.resize(1, 0);
-
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqLen = 1 + (rand() % maxSeqNum);
-    for (int j = 0; j < subSeqLen; ++j)
-      subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen);
-    seqStartPosition[i + 1] = subSeqStartPosition.back();
-  }
-}
-
-void genRandomGroundTruth(real* values,
-                          vector<vector<int>>& groundTruth,
-                          vector<int>& startPos,
-                          size_t beamSize) {
-  groundTruth.resize(startPos.size() - 1, vector<int>(beamSize, -1));
-  for (size_t i = 0; i < startPos.size() - 1; ++i) {
-    int seqLen = startPos[i + 1] - startPos[i];
-    vector<int> pos =
-        randSampling(seqLen, min(static_cast<int>(beamSize), seqLen));
-    for (size_t j = 0; j < pos.size(); ++j) {
-      groundTruth[i][j] = pos[j];
-      values[startPos[i] + pos[j]] = 1.;
-    }
-  }
-}
-
-void checkLayerOut(vector<vector<int>> groundTruth,
-                   real* layerOut,
-                   size_t beamSize) {
-  for (size_t i = 0; i < groundTruth.size(); ++i) {
-    int begPos = i * beamSize;
-    vector<real> tmp(layerOut + begPos, layerOut + begPos + beamSize);
-    sort(begin(tmp), end(tmp));
-    sort(begin(groundTruth[i]), end(groundTruth[i]));
-    for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]);
-  }
-}
-
-TEST(Layer, kmaxSeqScoreLayer) {
-  const size_t maxBeamSize = 100;
-  size_t beamSize = 1 + (rand() % maxBeamSize);
-
-  vector<int> seqStartPosition;
-  vector<int> subSeqStartPosition;
-  genRandomSeqInfo(seqStartPosition, subSeqStartPosition);
-  MatrixPtr inValue =
-      Matrix::create(subSeqStartPosition.back(), 1, false, false);
-
-  std::vector<bool> mode = {false};
-#ifdef PADDLE_WITH_CUDA
-  mode.push_back(true);
-#endif
-
-  for (auto hasSubseq : {false, true}) {
-    vector<vector<int>> groundTruth;
-    inValue->randomizeUniform();
-    genRandomGroundTruth(inValue->getData(),
-                         groundTruth,
-                         hasSubseq ? subSeqStartPosition : seqStartPosition,
-                         beamSize);
-
-    for (auto useGpu : mode) {
-      TestConfig config;
-      config.layerConfig.set_type("kmax_seq_score");
-      config.layerConfig.set_beam_size(beamSize);
-
-      if (hasSubseq) {
-        config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                    "scores",
-                                    inValue,
-                                    seqStartPosition,
-                                    subSeqStartPosition});
-      } else {
-        config.inputDefs.push_back(
-            {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition});
-      }
-      config.layerConfig.add_inputs();
-
-      // data layer initialize
-      std::vector<DataLayerPtr> dataLayers;
-      LayerMap layerMap;
-      vector<Argument> datas;
-      initDataLayer(
-          config,
-          &dataLayers,
-          &datas,
-          &layerMap,
-          "kmax_seq_score",
-          100 /* actually this parameter is unused in self-defined input*/,
-          false,
-          useGpu);
-      // test layer initialize
-      std::vector<ParameterPtr> parameters;
-      LayerPtr kmaxSeqScoreLayer;
-      FLAGS_use_gpu = useGpu;
-      initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer);
-      kmaxSeqScoreLayer->forward(PASS_TRAIN);
-
-      const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue();
-      CHECK_EQ(outValue->getHeight(),
-               hasSubseq ? subSeqStartPosition.size() - 1
-                         : seqStartPosition.size() - 1);
-      CHECK_EQ(outValue->getWidth(), beamSize);
-      checkLayerOut(groundTruth, outValue->getData(), beamSize);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand((size_t)(time(NULL)));
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
deleted file mode 100644
index 1254d580505512dc8fd7e34a053a7538832d271f..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ /dev/null
@@ -1,2532 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-#include <cudnn.h>
-#endif
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/math/MathUtils.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(prev_batch_state);
-
-TEST(Operator, dot_mul) {
-  TestConfig config;
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  operatorConf.set_type("dot_mul");
-  operatorConf.set_dotmul_scale(-1);
-
-  testOperatorGrad(config, operatorConf, 100, false, false);
-}
-
-TEST(Projection, context) {
-  for (auto contextStart : {-5, -3, -1, 0, 3}) {
-    for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto batchSize : {1, 2, 5, 20}) {
-        for (auto trainablePadding : {false, true}) {
-          LOG(INFO) << " contextStart=" << contextStart
-                    << " contextLength=" << contextLength
-                    << " batchSize=" << batchSize
-                    << " trainablePadding=" << trainablePadding;
-          ProjectionConfig conf;
-          conf.set_type("context");
-          conf.set_input_size(10);
-          conf.set_context_start(contextStart);
-          conf.set_context_length(contextLength);
-          conf.set_trainable_padding(trainablePadding);
-          conf.set_output_size(conf.context_length() * conf.input_size());
-          int pad =
-              std::max(0, -conf.context_start()) +
-              std::max(0, conf.context_start() + conf.context_length() - 1);
-          for (auto useGpu : {false, true}) {
-            testProjectionGrad(
-                conf,
-                INPUT_SEQUENCE_DATA,
-                trainablePadding ? conf.input_size() * pad : 0,
-                batchSize,
-                useGpu,
-                contextStart + contextLength <= 1);  // = testState
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Projection, trans_fc) {
-  ProjectionConfig conf;
-  conf.set_type("trans_fc");
-  conf.set_input_size(50);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 1000,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, fc) {
-  ProjectionConfig conf;
-  conf.set_type("fc");
-  conf.set_input_size(10);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 200,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, dot_mul) {
-  ProjectionConfig conf;
-  conf.set_type("dot_mul");
-  conf.set_input_size(20);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 20,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, table) {
-  ProjectionConfig conf;
-  conf.set_type("table");
-  conf.set_input_size(10);
-  conf.set_output_size(20);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_LABEL,
-                       /* parameterSize */ 200,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, identity) {
-  ProjectionConfig conf;
-  conf.set_type("identity");
-  conf.set_input_size(10);
-  conf.set_output_size(10);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 0,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-TEST(Projection, slice) {
-  ProjectionConfig conf;
-  conf.set_type("slice");
-  conf.set_input_size(100);
-  SliceConfig& slice1 = *conf.add_slices();
-  slice1.set_start(10);
-  slice1.set_end(20);
-  SliceConfig& slice2 = *conf.add_slices();
-  slice2.set_start(50);
-  slice2.set_end(70);
-  conf.set_output_size(30);
-  for (auto useGpu : {false, true}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 0,
-                       /* batchSize */ 10,
-                       useGpu);
-  }
-}
-
-TEST(Projection, scaling) {
-  ProjectionConfig conf;
-  conf.set_type("scaling");
-  conf.set_input_size(10);
-  conf.set_output_size(10);
-  for (auto useGpu : {false}) {
-    testProjectionGrad(conf,
-                       INPUT_DATA,
-                       /* parameterSize */ 1,
-                       /* batchSize */ 100,
-                       useGpu);
-  }
-}
-
-void testProjectionConv(size_t groups, bool isDeconv) {
-  const int NUM_FILTERS = 18;
-  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 2;
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 16;
-
-#if CUDNN_VERSION >= 6000
-  const int DILATION = 2;
-#else
-  const int DILATION = 1;
-#endif
-
-  ProjectionConfig conf;
-  if (isDeconv) {
-    conf.set_type("convt");
-  } else {
-    conf.set_type("conv");
-  }
-  conf.set_num_filters(NUM_FILTERS);
-
-  ConvConfig* conv = conf.mutable_conv_conf();
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_channels(CHANNELS);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_dilation(DILATION);
-  conv->set_dilation_y(DILATION);
-  conv->set_groups(groups);
-  if (isDeconv) {
-    conv->set_filter_channels(NUM_FILTERS / conv->groups());
-  } else {
-    conv->set_filter_channels(conv->channels() / conv->groups());
-  }
-  conv->set_img_size(IMAGE_SIZE);
-  int output_x = outputSize(conv->img_size(),
-                            (conv->filter_size() - 1) * DILATION + 1,
-                            conv->padding(),
-                            conv->stride(),
-                            /* caffeMode */ true);
-  int output_y = outputSize(conv->img_size(),
-                            (conv->filter_size_y() - 1) * DILATION + 1,
-                            conv->padding_y(),
-                            conv->stride_y(),
-                            /* caffeMode */ true);
-  conv->set_output_x(output_x);
-  conv->set_output_y(output_y);
-  LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x
-            << "; output_y: " << output_y;
-  if (isDeconv) {
-    int deconv_image_x = imageSize(output_x,
-                                   (conv->filter_size() - 1) * DILATION + 1,
-                                   conv->padding(),
-                                   conv->stride(),
-                                   /* caffeMode */ true);
-    int deconv_image_y = imageSize(output_y,
-                                   (conv->filter_size_y() - 1) * DILATION + 1,
-                                   conv->padding_y(),
-                                   conv->stride_y(),
-                                   /* caffeMode */ true);
-
-    LOG(INFO) << " deconv_image_x: " << deconv_image_x
-              << "; deconv_image_y: " << deconv_image_y;
-    conf.set_input_size(output_x * output_y * CHANNELS);
-    conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS);
-  } else {
-    conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
-    conf.set_output_size(output_x * output_y * NUM_FILTERS);
-  }
-
-  testProjectionGrad(conf,
-                     INPUT_DATA,
-                     /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE *
-                         FILTER_SIZE_Y / groups,
-                     /* batchSize */ 100,
-                     true,
-                     false,
-                     NUM_FILTERS,
-                     true);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(Projection, conv) {
-  /// test ConvProjection
-  testProjectionConv(1, false);
-  testProjectionConv(3, false);
-  /// test ConvTransProjection
-  testProjectionConv(1, true);
-  testProjectionConv(3, true);
-}
-#endif
-
-TEST(Layer, BilinearInterpLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("bilinear_interp");
-  config.biasSize = 0;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
-  ImageConfig* image = bilinear->mutable_image_conf();
-  image->set_img_size(32);
-  image->set_img_size_y(32);
-  image->set_channels(4);
-
-  for (auto useGpu : {false, true}) {
-    for (auto outSize : {32, 64}) {
-      bilinear->set_out_size_x(outSize);
-      bilinear->set_out_size_y(outSize);
-      testLayerGrad(config, "bilinear_interp", 10, false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, concat) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("concat");
-  config.layerConfig.set_size(15);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "concat", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, AddtoLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("addto");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "addto", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, CTCLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("ctc");
-  config.layerConfig.set_norm_by_times(false);
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "ctc",
-                  100,
-                  /* trans */ false, /* useGpu */
-                  useGpu);
-  }
-}
-
-TEST(Layer, cosSimLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos");
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, CosSimVecMatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("cos_vm");
-  config.layerConfig.set_size(5);  // output size
-  config.layerConfig.set_cos_scale(2.0);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cos_vm", 100, false, useGpu);
-  }
-}
-
-void testDepthwiseConvLayer(const string& type, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 32;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(32);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(16);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_img_size_y(8);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
-}
-
-TEST(Layer, depthwiseConvLayer) {
-  //  'depthwise_conv' is a sepecial case of 'exconv' whose
-  //  groups size equals to the input channels size.
-  testDepthwiseConvLayer("exconv", /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testDepthwiseConvLayer("exconv", /* useGpu= */ true);
-#endif
-}
-
-void testConvLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 16;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(16);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  int dilation = 2;
-  if (type == "cudnn_conv") {
-#if CUDNN_VERSION >= 6000
-    dilation = 2;
-#else
-    dilation = 1;
-#endif
-  }
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(2);
-  conv->set_channels(3);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_dilation(dilation);
-  conv->set_dilation_y(dilation);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  conv->set_img_size(16);
-  conv->set_img_size_y(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                (conv->filter_size() - 1) * dilation + 1,
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                (conv->filter_size_y() - 1) * dilation + 1,
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /* caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "conv", 100, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, convLayer) {
-  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
-  testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void testConvTransLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  config.biasSize = 3;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(3);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_filter_size(2);
-  conv->set_filter_size_y(4);
-  conv->set_channels(16);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_filter_channels(3 / conv->groups());
-  conv->set_img_size(16);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /* caffeMode */ true));
-
-  config.layerConfig.set_size(conv->img_size() * conv->img_size() *
-                              config.layerConfig.num_filters());
-
-  testLayerGrad(config, "convTrans", 100, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, convTransLayer) {
-  for (auto useGpu : {false, true}) {
-    testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
-  }
-#ifdef PADDLE_WITH_CUDA
-  testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-TEST(Layer, blockExpandLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("blockexpand");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  BlockExpandConfig* blockExpand = input->mutable_block_expand_conf();
-  blockExpand->set_img_size_x(64);
-  blockExpand->set_img_size_y(32);
-  blockExpand->set_channels(3);
-  blockExpand->set_padding_x(0);
-  blockExpand->set_padding_y(0);
-  blockExpand->set_block_x(4);
-  blockExpand->set_block_y(32);
-  blockExpand->set_stride_x(2);
-  blockExpand->set_stride_y(2);
-  blockExpand->set_output_x(outputSize(blockExpand->img_size_x(),
-                                       blockExpand->block_x(),
-                                       blockExpand->padding_x(),
-                                       blockExpand->stride_x(),
-                                       /* caffeMode */ false));
-  blockExpand->set_output_y(outputSize(blockExpand->img_size_y(),
-                                       blockExpand->block_y(),
-                                       blockExpand->padding_y(),
-                                       blockExpand->stride_y(),
-                                       /* caffeMode */ false));
-  config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() *
-                              blockExpand->channels());
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "blockexpand", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, maxoutLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("maxout");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  MaxOutConfig* maxout = input->mutable_maxout_conf();
-  ImageConfig* image = maxout->mutable_image_conf();
-
-  image->set_img_size(32);
-  image->set_img_size_y(32);
-  image->set_channels(4);
-  maxout->set_groups(2);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "maxout", 10, false, useGpu);
-  }
-}
-
-void testFcLayer(string format, size_t nnz) {
-  TestConfig config;
-  config.biasSize = 1024;
-  config.layerConfig.set_type("fc");
-  config.layerConfig.set_size(1024);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_drop_rate(0.1);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
-  config.layerConfig.add_inputs();
-
-  LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
-            << config.inputDefs[0].sparse.format;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "fc",
-                  100,
-                  /* trans */ false,
-                  useGpu,
-                  /* weight */ true);
-  }
-}
-
-TEST(Layer, fcLayer) {
-  testFcLayer("", 1024 * 1024 * 2);
-  testFcLayer("csc", 1024 * 10);
-  testFcLayer("csr", 1024 * 10);
-}
-
-TEST(Layer, SelectiveFullyConnectedLayer) {
-  TestConfig config;
-  size_t nin = 16;
-  size_t nout = 256;
-  config.layerConfig.set_type("selective_fc");
-  config.layerConfig.set_size(nout);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_has_selected_colums(true);
-  config.layerConfig.set_selective_fc_pass_generation(false);
-  config.biasSize = nout;
-
-  config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)});
-  config.layerConfig.add_inputs();
-
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGup= */ false,
-                false);
-#ifdef PADDLE_WITH_CUDA
-  testLayerGrad(config,
-                "selective_fc",
-                100,
-                /* trans= */ false,
-                /* useGup= */ true,
-                false);
-#endif
-}
-
-TEST(Layer, DataNormLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("data_norm");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100});
-  config.inputDefs.back().isStatic = true;
-  config.layerConfig.add_inputs();
-
-  for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) {
-    config.layerConfig.set_data_norm_strategy(strategy);
-    // The parameters are static, so not support GPU now
-    testLayerGrad(config,
-                  "data_norm",
-                  200,
-                  /* trans */ false,
-                  /* useGpu */ false);
-  }
-}
-
-TEST(Layer, hsigmoidLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("hsigmoid");
-  config.layerConfig.set_num_classes(5);
-  config.layerConfig.set_size(1);
-  config.biasSize = config.layerConfig.num_classes() - 1;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "hsigmoid",
-                  100,
-                  /* trans */ false,
-                  /* useGpu */ useGpu);
-  }
-}
-
-TEST(Layer, multi_cross) {
-  TestConfig config;
-  config.layerConfig.set_type("multi-class-cross-entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(
-        config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, multi_binary_label_sparse_mat) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(layer, multi_binary_label_id) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_binary_label_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "multi_binary_label_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, multi_cross_with_selfnorm) {
-  TestConfig config;
-  config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
-  config.layerConfig.set_softmax_selfnorm_alpha(0.1);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "multi_class_cross_entropy_with_selfnorm",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, multi_cross_soft) {
-  TestConfig config;
-  config.layerConfig.set_type("soft_binary_class_cross_entropy");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "soft_binary_class_cross_entropy",
-                  100,
-                  /* trans */ false,
-                  useGpu);
-  }
-}
-
-TEST(Layer, square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, sparse_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, sparse_float_square_error) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
-  config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // "GpuSparseMatrix" as label is not supported
-  testLayerGrad(config,
-                "square_error",
-                100,
-                /* trans */ false,
-                /* useGpu */ false);
-}
-
-TEST(Layer, square_error_weighted) {
-  TestConfig config;
-  config.layerConfig.set_type("square_error");
-  config.biasSize = 0;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
-  }
-}
-
-TEST(Layer, huber_regression_loss) {
-  TestConfig config;
-  config.layerConfig.set_type("huber_regression");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto delta : {1, 3, 5}) {
-      config.layerConfig.set_delta(delta);
-      testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, huber_two_class) {
-  TestConfig config;
-  config.layerConfig.set_type("huber_classification");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
-  }
-}
-
-void testExpandLayer(string trans_type, bool hasSubseq) {
-  TestConfig config;
-  config.layerConfig.set_type("expand");
-
-  config.inputDefs.push_back(
-      {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_1",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-  LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "expand", 30, false, useGpu);
-  }
-}
-
-TEST(Layer, ExpandLayer) {
-  testExpandLayer("non-seq", false);  // non-seq expand to seq
-  testExpandLayer("non-seq", true);   // non-seq expand to hasSubseq
-  testExpandLayer("seq", true);       // seq expand to hasSubseq
-}
-
-void testDegradeLayer(bool hasSubseq,
-                      string layer_type,
-                      string trans_type,
-                      int stride) {
-  TestConfig config;
-  config.layerConfig.set_type(layer_type);
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_seq_pool_stride(stride);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back(
-      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
-       "layer_0",
-       10,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.set_trans_type(trans_type);
-
-  auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) {
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, layer_type, 100, false, useGpu);
-    }
-  };
-
-  if (layer_type == "average") {
-    for (auto strategy : {"average", "sum", "squarerootn"}) {
-      LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-                << " average_strategy=" << strategy
-                << " seq_pool_stride=" << stride;
-      config.layerConfig.set_average_strategy(strategy);
-      testDegradeLayerGrad(config, layer_type);
-    }
-  } else {
-    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-              << " seq_pool_stride=" << stride;
-    testDegradeLayerGrad(config, layer_type);
-  }
-}
-
-TEST(Layer, MaxLayer) {
-  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(false,
-                   "max",
-                   "non-seq",
-                   5);  // seq max to a shorten seq, stride window = 5
-  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
-}
-
-TEST(Layer, SequenceLastInstanceLayer) {
-  testDegradeLayer(false,
-                   "seqlastins",
-                   "non-seq",
-                   -1);  // seq seqlastins to non-seq
-  testDegradeLayer(false,
-                   "seqlastins",
-                   "non-seq",
-                   5);  // seq seqlastins to a shorten seq, stride window = 5
-  testDegradeLayer(true,
-                   "seqlastins",
-                   "non-seq",
-                   -1);  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(true,
-                   "seqlastins",
-                   "seq",
-                   -1);  // hasSubseq seqlastins to seq
-}
-
-TEST(Layer, AverageLayer) {
-  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
-  testDegradeLayer(false,
-                   "average",
-                   "non-seq",
-                   5);  // seq average to a shorten seq, stride window = 5
-  testDegradeLayer(true,
-                   "average",
-                   "non-seq",
-                   -1);                          // hasSubseq average to non-seq
-  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
-}
-
-TEST(Layer, SequenceConcatLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqconcat");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqconcat", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SequenceReshapeLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("seqreshape");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "seqreshape", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvShiftLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("conv_shift");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config, "conv_shift", 100, false, false);
-}
-
-TEST(Layer, PowerLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("power");
-  config.layerConfig.set_size(10);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "power", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ConvexCombinationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("convex_comb");
-  config.layerConfig.set_size(20);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "convex_comb", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, InterpolationLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("interpolation");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "interpolation", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, DotProdLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("dot_prod");
-  config.layerConfig.set_size(1);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "dot_prod", 10, false, useGpu);
-  }
-}
-
-TEST(Layer, OuterProdLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("out_prod");
-  config.layerConfig.set_size(100);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "out_prod", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, SlopeInterceptLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("slope_intercept");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_slope(1.0);
-  config.layerConfig.set_intercept(0.1);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "slope_intercept", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, ScalingLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("scaling");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scaling", 100, false, useGpu);
-  }
-}
-
-void testNormLayer(const string& normType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("norm");
-  config.layerConfig.set_active_type("relu");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type(normType);
-  norm->set_channels(16);
-  norm->set_size(5);
-  norm->set_scale(0.001);
-  norm->set_pow(0.75);
-  norm->set_blocked(0);
-  norm->set_img_size(14);
-  norm->set_img_size_y(7);
-  norm->set_output_x(norm->img_size());
-  norm->set_output_y(norm->img_size_y());
-  if (norm->norm_type() == "cmrnorm" ||
-      norm->norm_type() == "cmrnorm-projection") {
-    norm->set_scale(norm->scale() / norm->size());
-  } else {
-    norm->set_scale(norm->scale() / (norm->size() * norm->size()));
-  }
-
-  config.layerConfig.set_size(norm->output_x() * norm->output_y() *
-                              norm->channels());
-  config.biasSize = 0;
-
-  testLayerGrad(config, "norm", 100, trans, useGpu);
-}
-
-TEST(Layer, NormLayer) {
-  testNormLayer("cmrnorm-projection",
-                /* trans= */ false, /* useGpu= */
-                true);
-  testNormLayer("cmrnorm-projection",
-                /* trans= */ false, /* useGpu= */
-                false);
-}
-
-void setPoolConfig(TestConfig* config,
-                   PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(16);
-
-  int kw = 3, kh = 3;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(16);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-void testPoolLayer(const string& poolType,
-                   bool trans,
-                   bool useGpu,
-                   bool excludeMode = true) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(14);
-  pool->set_img_size_y(14);
-  pool->set_exclude_mode(excludeMode);
-  setPoolConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-
-#ifdef PADDLE_WITH_CUDA
-void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_size_y(4);
-  pool->set_stride_y(3);
-  pool->set_img_size(10);
-  pool->set_img_size_y(20);
-  setPoolConfig(&config, pool, poolType);
-  pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) /
-                         ((float)pool->stride_y()) +
-                     1.5);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool", 100, trans, useGpu);
-}
-#endif
-
-TEST(Layer, PoolLayer) {
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
-  testPoolLayer("avg-projection",
-                /* trans= */ false,
-                /* useGpu= */ false,
-                /* excludeMode= */ false);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
-  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
-
-#ifdef PADDLE_WITH_CUDA
-  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("avg-projection",
-                /* trans= */ false,
-                /* useGpu= */ true,
-                /* excludeMode= */ false);
-  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
-  testPoolLayer2("cudnn-avg-incl-pad-pool",
-                 /* trans= */ false,
-                 /* useGpu= */ true);
-  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void setPool3DConfig(TestConfig* config,
-                     PoolConfig* pool,
-                     const string& poolType) {
-  // filter size
-  const int NUM_FILTERS = 16;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-  const int CHANNELS = 16;
-
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool3d");
-  (*config).layerConfig.set_num_filters(NUM_FILTERS);
-
-  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
-  int pw = 0, ph = 0, pd = 0;
-  int sw = 2, sh = 2, sd = 2;
-
-  pool->set_pool_type(poolType);
-  pool->set_pool_type("avg");
-  pool->set_channels(CHANNELS);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_size_z(kd);
-  pool->set_padding(0);
-  pool->set_padding_y(0);
-  pool->set_padding_z(0);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-  pool->set_stride_z(sd);
-  pool->set_start(0);
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-  pool->set_output_z(od);
-}
-
-void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  const int IMAGE_SIZE = 9;
-  const int IMAGE_SIZE_Y = 9;
-  const int IMAGE_SIZE_Z = 9;
-
-  pool->set_img_size(IMAGE_SIZE);
-  pool->set_img_size_y(IMAGE_SIZE_Y);
-  pool->set_img_size_z(IMAGE_SIZE_Z);
-
-  setPool3DConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  testLayerGrad(config, "pool3d", 100, trans, useGpu);
-}
-
-TEST(Layer, Pool3DLayer) {
-  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
-  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
-  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void testSppLayer(const string& poolType,
-                  const int pyramidHeight,
-                  bool trans,
-                  bool useGpu) {
-  TestConfig config;
-  config.layerConfig.set_type("spp");
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  SppConfig* sppConfig = input->mutable_spp_conf();
-  sppConfig->set_pool_type(poolType);
-  sppConfig->set_pyramid_height(pyramidHeight);
-  ImageConfig* imageConfig = sppConfig->mutable_image_conf();
-  imageConfig->set_channels(16);
-  imageConfig->set_img_size(10);
-  imageConfig->set_img_size_y(20);
-  int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
-  config.layerConfig.set_size(outputSize * imageConfig->channels());
-  testLayerGrad(config, "spp", 100, trans, useGpu);
-}
-
-TEST(Layer, SpatialPyramidPoolLayer) {
-  for (auto useGpu : {false, true}) {
-    for (auto pyramidHeight : {1, 2, 3}) {
-      testSppLayer("avg-projection", pyramidHeight, false, useGpu);
-      testSppLayer("max-projection", pyramidHeight, false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, rankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, sumCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("sum_cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "sum_cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, weightedRankCostLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("rank-cost");
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, TensorLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("tensor");
-  config.layerConfig.set_size(10);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = config.layerConfig.size();
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "tensor", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, RecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.biasSize = 4;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(
-          config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
-    }
-  }
-}
-
-TEST(Layer, LstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("tanh");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 28;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(
-          config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
-    }
-  }
-  for (auto useGpu : {true}) {
-    config.testBatchState = true;
-    config.layerConfig.set_reversed(false);
-    testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, MDLstmLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("mdlstmemory");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 4 * 9;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_directions(true);
-  config.layerConfig.add_directions(true);
-
-  for (auto useGpu : {false, true}) {
-    for (int i = 0; i < 2; i++) {
-      for (int j = 0; j < 2; j++) {
-        config.layerConfig.set_directions(0, bool(i));
-        config.layerConfig.set_directions(1, bool(j));
-        testLayerGrad(config, "mdlstmemory", 100, false, useGpu);
-      }
-    }
-  }
-}
-
-TEST(Layer, ParameterReluLayer) {
-  auto testParameterReluLayer = [&](size_t inputSize, size_t channels) {
-    TestConfig config;
-    config.layerConfig.set_type("prelu");
-    config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_size(inputSize);
-    config.layerConfig.set_partial_sum(inputSize /
-                                       channels);  // size of feature map
-    for (auto useGpu : {false, true}) {
-      testLayerGrad(config, "prelu", 100, false, useGpu);
-    }
-  };
-
-  testParameterReluLayer(192, 1);
-  testParameterReluLayer(192, 3);
-  testParameterReluLayer(192, 192);
-}
-
-TEST(Layer, ResizeLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("resize");
-  config.layerConfig.set_size(64);
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "resize", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, RotateLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("rotate");
-  const int CHANNEL = 2;
-  const int HEIGHT = 8;
-  const int WIDTH = 4;
-  const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL;
-  config.layerConfig.set_size(INPUT_SIZE);
-  config.layerConfig.set_height(HEIGHT);
-  config.layerConfig.set_width(WIDTH);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "rotate", 100, false, useGpu);
-  }
-}
-
-TEST(Layer, NCELayer) {
-  TestConfig config;
-  size_t numClasses = 4;
-  config.layerConfig.set_type("nce");
-  config.layerConfig.set_size(1);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_num_classes(numClasses);
-  config.biasSize = numClasses;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses});
-  config.inputDefs.push_back(
-      {INPUT_LABEL, "label", /* dim= */ numClasses, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto withWeight : {false, true}) {
-    if (withWeight) {
-      config.inputDefs.push_back(
-          {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0});
-      config.layerConfig.add_inputs();
-    }
-
-    for (auto isIdLabel : {false, true}) {
-      config.inputDefs[1] = {
-          isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA,
-          "label",
-          /* dim= */ numClasses,
-          /* paraSize= */ 0};
-
-      for (auto withDist : {false, true}) {
-        config.layerConfig.clear_neg_sampling_dist();
-        if (withDist) {
-          double sum = 0;
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = rand();  // NOLINT use rand_r
-            config.layerConfig.add_neg_sampling_dist(p);
-            sum += p;
-          }
-          for (size_t i = 0; i < numClasses; ++i) {
-            real p = config.layerConfig.neg_sampling_dist(i) / sum;
-            config.layerConfig.set_neg_sampling_dist(i, p);
-          }
-        }
-        LOG(INFO) << "NCELayer "
-                  << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight
-                  << " withDist=" << withDist;
-        // Not support GPU now
-        testLayerGrad(config,
-                      "nce",
-                      100,
-                      /* trans= */ false,
-                      /* useGpu */ false);
-      }
-    }
-  }
-}
-
-TEST(Layer, GatedRecurrentLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gated_recurrent");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    for (auto reversed : {false, true}) {
-      config.layerConfig.set_reversed(reversed);
-      config.testState = !reversed;
-      testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu);
-    }
-  }
-}
-
-TEST(Layer, GruStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("gru_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, LstmStepLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("lstm_step");
-  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
-  config.layerConfig.set_active_state_type("sigmoid");
-  config.layerConfig.set_active_gate_type("sigmoid");
-  config.biasSize = 12;
-  config.testAccumulate = false;
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu);
-  }
-}
-
-void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  const int CHANNELS = 10;
-  const int IMG_SIZE = 16;
-  const int IMG_SIZE_Y = 8;
-  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA,
-                              "layer_0",
-                              /* dim= */ size,
-                              /* paraSize= */ CHANNELS});
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-  img_conf->set_img_size_y(IMG_SIZE_Y);
-
-  testLayerGrad(config,
-                "batch_norm",
-                64,
-                /* trans= */ trans,
-                useGpu,
-                /* useWeight */ true);
-}
-
-TEST(Layer, BatchNormalizationLayer) {
-  testBatchNormLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_CUDA
-  testBatchNormLayer("batch_norm", false, true);
-  if (hl_get_cudnn_lib_version() >= int(4000)) {
-    testBatchNormLayer("cudnn_batch_norm", false, true);
-  }
-#endif
-}
-
-void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
-  TestConfig config;
-  const int CHANNELS = 10;
-  const int IMG_SIZE = 16;
-  const int IMG_SIZE_Y = 8;
-  const int IMG_SIZE_Z = 8;
-  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-  config.biasSize = CHANNELS;
-  config.inputDefs.push_back({INPUT_DATA,
-                              "layer_0",
-                              /* dim= */ size,
-                              /* paraSize= */ CHANNELS});
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
-  config.inputDefs.back().isStatic = true;
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(CHANNELS);
-  img_conf->set_img_size(IMG_SIZE);
-  img_conf->set_img_size_y(IMG_SIZE_Y);
-  img_conf->set_img_size_z(IMG_SIZE_Z);
-
-  testLayerGrad(config,
-                "batch_norm",
-                64,
-                /* trans= */ trans,
-                useGpu,
-                /* useWeight */ true);
-}
-
-TEST(Layer, testBatchNorm3DLayer) {
-  testBatchNorm3DLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_CUDA
-  testBatchNorm3DLayer("batch_norm", false, true);
-  if (hl_get_cudnn_lib_version() >= int(4000)) {
-    testBatchNorm3DLayer("cudnn_batch_norm", false, true);
-  }
-#endif
-}
-
-void testConvOperator(bool isDeconv) {
-  TestConfig config;
-  const int NUM_FILTERS = 16;
-  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 3;
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 16;
-  const int IMAGE_SIZE_Y = 9;
-  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  if (isDeconv) {
-    operatorConf.set_type("convt");
-  } else {
-    operatorConf.set_type("conv");
-  }
-  ConvConfig* conv = operatorConf.mutable_conv_conf();
-  operatorConf.set_num_filters(NUM_FILTERS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_channels(CHANNELS);
-  conv->set_padding(0);
-  conv->set_padding_y(1);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_groups(1);
-  conv->set_img_size(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /*  caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /*  caffeMode */ true));
-
-  if (isDeconv) {
-    conv->set_filter_channels(NUM_FILTERS / conv->groups());
-    config.inputDefs.push_back({INPUT_DATA,
-                                "layer_0",
-                                conv->output_x() * conv->output_y() * CHANNELS,
-                                0});
-    config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS);
-  } else {
-    conv->set_filter_channels(conv->channels() / conv->groups());
-    config.inputDefs.push_back(
-        {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
-    config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                                NUM_FILTERS);
-  }
-
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_1",
-       FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS,
-       0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
-}
-
-TEST(Operator, conv) {
-  testConvOperator(/*isDeconv*/ true);
-  testConvOperator(/*isDeconv*/ false);
-}
-
-TEST(Layer, FeatureMapExpandLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("featmap_expand");
-  const int CHANNELS = 10;
-  const int INPUT_SIZE = 100;
-  config.layerConfig.set_size(INPUT_SIZE * CHANNELS);
-  config.layerConfig.set_num_filters(CHANNELS);
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
-                              "layer_0",
-                              /* dim= */ INPUT_SIZE,
-                              /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    for (auto asRowVec : {false, true}) {
-      config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec");
-      testLayerGrad(config,
-                    "featmap_expand",
-                    /*batch_size*/ 100,
-                    /* trans= */ false,
-                    useGpu,
-                    /* useWeight */ true);
-    }
-  }
-}
-
-TEST(Layer, MultiplexLayer) {
-  TestConfig config;
-  const int LAYER_SIZE = 100;
-  config.layerConfig.set_type("multiplex");
-  config.layerConfig.set_size(LAYER_SIZE);
-
-  config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, PadLayer) {
-  TestConfig config;
-  config.biasSize = 0;
-  config.layerConfig.set_type("pad");
-
-  int c = 4;
-  int h = 31;
-  int w = 36;
-  size_t size = c * h * w;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PadConfig* pad = input->mutable_pad_conf();
-  ImageConfig* image = pad->mutable_image_conf();
-
-  image->set_channels(c);
-  image->set_img_size(h);
-  image->set_img_size_y(w);
-  pad->add_pad_c(1);
-  pad->add_pad_c(2);
-  pad->add_pad_h(2);
-  pad->add_pad_h(3);
-  pad->add_pad_w(3);
-  pad->add_pad_w(5);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "pad", 10, false, useGpu);
-  }
-}
-
-TEST(Layer, CrossChannelNormLayer) {
-  TestConfig config;
-  config.paramInitialMean = 1.;
-  config.paramInitialStd = 0.;
-  config.layerConfig.set_type("norm");
-  config.layerConfig.set_size(100);
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type("cross-channel-norm");
-  norm->set_channels(10);
-  norm->set_size(100);
-  norm->set_scale(0);
-  norm->set_pow(0);
-  norm->set_blocked(0);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
-  }
-}
-
-TEST(Layer, smooth_l1) {
-  TestConfig config;
-  config.layerConfig.set_type("smooth_l1");
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
-  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, multibox_loss) {
-  TestConfig config;
-  config.layerConfig.set_type("multibox_loss");
-  config.biasSize = 0;
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
-  multiboxLoss->set_num_classes(21);
-  multiboxLoss->set_input_num(1);
-  multiboxLoss->set_overlap_threshold(0.5);
-  multiboxLoss->set_neg_pos_ratio(3);
-  multiboxLoss->set_neg_overlap(0.5);
-  multiboxLoss->set_background_id(0);
-  multiboxLoss->set_height(3);
-  multiboxLoss->set_width(3);
-
-  size_t gtNum = 1;
-  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
-  labelValue->randomizeUniform();
-  labelValue->add(-0.5);
-  labelValue->sigmoid(*labelValue);
-  real* labelData = labelValue->getData();
-  size_t labelWidth = labelValue->getWidth();
-  for (size_t i = 0; i < gtNum; ++i) {
-    *(labelData + i * labelWidth) = std::rand() % 20 + 1;
-    *(labelData + i * labelWidth + 1) = 0.400259;
-    *(labelData + i * labelWidth + 2) = 0.377857;
-    *(labelData + i * labelWidth + 3) = 0.525712;
-    *(labelData + i * labelWidth + 4) = 0.519368;
-  }
-  vector<int> seqStartPositions(gtNum + 1, 0);
-  for (size_t i = 1; i <= gtNum; ++i) {
-    seqStartPositions[i] = i;
-  }
-
-  // Ensure at lease one matched bbox
-  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
-  priorValue->randomizeUniform();
-  priorValue->add(-0.5);
-  priorValue->sigmoid(*priorValue);
-  real* priorData = priorValue->getData();
-  *(priorData) = 0.424811;
-  *(priorData + 1) = 0.397059;
-  *(priorData + 2) = 0.538905;
-  *(priorData + 3) = 0.447091;
-  *(priorData + 4) = 0.425720;
-  *(priorData + 5) = 0.515228;
-  *(priorData + 6) = 0.519452;
-  *(priorData + 7) = 0.591065;
-
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
-  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
-  config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
-  }
-}
-
-TEST(Layer, TransLayer) {
-  TestConfig config;
-  const int height = 128;
-  const int width = 256;
-  config.layerConfig.set_type("trans");
-  config.layerConfig.set_size(width);
-
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "trans", height, /* trans= */ false, useGpu);
-  }
-}
-
-TEST(Layer, RowConvLayer) {
-  const int context = 3;
-  const int size = 512;
-
-  TestConfig config;
-  config.layerConfig.set_type("row_conv");
-  config.layerConfig.set_size(size);
-  config.layerConfig.set_active_type("sigmoid");
-
-  config.inputDefs.push_back(
-      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  RowConvConfig* conv = input->mutable_row_conv_conf();
-  conv->set_context_length(context);
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, CropLayer) {
-  TestConfig config;
-  // config input_0
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ImageConfig* img = input->mutable_image_conf();
-  img->set_channels(4);
-  img->set_img_size(16);
-  config.layerConfig.set_axis(2);
-  config.layerConfig.add_offset(0);
-  config.layerConfig.add_offset(0);
-
-  // config input_1
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0});
-  input = config.layerConfig.add_inputs();
-  img = input->mutable_image_conf();
-  img->set_channels(2);
-  img->set_img_size(8);
-
-  // config crop layer
-  config.layerConfig.set_type("crop");
-  config.layerConfig.set_name("cropLayer");
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "crop", 100, false, useGpu, false);
-  }
-}
-
-TEST(Layer, roi_pool) {
-  TestConfig config;
-  config.layerConfig.set_type("roi_pool");
-  config.biasSize = 0;
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
-  roiPoolConf->set_pooled_width(7);
-  roiPoolConf->set_pooled_height(7);
-  roiPoolConf->set_spatial_scale(1. / 16);
-  roiPoolConf->set_width(14);
-  roiPoolConf->set_height(14);
-
-  const size_t roiNum = 10;
-  const size_t roiDim = 10;
-  const size_t batchSize = 5;
-  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
-  roiValue->zeroMem();
-  real* roiData = roiValue->getData();
-  for (size_t i = 0; i < roiNum; ++i) {
-    roiData[i * roiDim + 0] = std::rand() % batchSize;
-    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
-    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
-    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
-    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
-    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
-    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
-  }
-
-  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, SwitchOrderLayer) {
-  TestConfig config;
-  // config input_0
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ImageConfig* img = input->mutable_image_conf();
-  img->set_channels(4);
-  img->set_img_size(16);
-  img->set_img_size_y(16);
-
-  ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
-  reshape->add_height_axis(0);
-  reshape->add_height_axis(1);
-  reshape->add_height_axis(2);
-  reshape->add_width_axis(3);
-
-  // config softmax layer
-  config.layerConfig.set_type("switch_order");
-  config.layerConfig.set_name("switchOrderLayer");
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "switch_order", 100, false, useGpu, true);
-  }
-}
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-TEST(Layer, SubNestedSequenceLayer) {
-  // layer size is not crutial for this layer,
-  // so use a small layer size in unittest
-  const int layerSize = 4;
-
-  const int maxSeqNum = 50;
-  const int maxSeqLen = 50;
-  const int maxBeamSize = 32;
-
-  srand((size_t)(time(NULL)));
-  int beamSize = 1 + (rand() % maxBeamSize);
-
-  TestConfig config;
-  config.layerConfig.set_type("sub_nested_seq");
-  config.layerConfig.set_name("sub_nested_seq_layer");
-  config.layerConfig.set_size(layerSize);
-
-  int seqNum = 1 + (rand() % maxSeqNum);
-
-  // sequence information for the first input, it is a nested sequence
-  vector<int> seqStartPos(seqNum + 1, 0);
-  vector<int> subSeqStartPos(1, 0);
-
-  // selected indices
-  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
-  selectedIndices->one();
-  selectedIndices->mulScalar(-1.);
-  real* indicesData = selectedIndices->getData();
-
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqNum = 1 + (rand() % maxSeqNum);
-    for (int j = 0; j < subSeqNum; ++j) {
-      subSeqStartPos.push_back(subSeqStartPos.back() +
-                               (1 + (rand() % maxSeqLen)));
-    }
-    vector<real> selSeqs =
-        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
-    memcpy(indicesData + (i * beamSize),
-           selSeqs.data(),
-           selSeqs.size() * sizeof(real));
-    seqStartPos[i + 1] = subSeqStartPos.back();
-  }
-
-  MatrixPtr seqInputPtr =
-      Matrix::create(seqStartPos.back(), layerSize, false, false);
-  seqInputPtr->randomizeUniform();
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                              "nested_seq_input",
-                              seqInputPtr,
-                              seqStartPos,
-                              subSeqStartPos});
-  config.layerConfig.add_inputs();
-  config.inputDefs.push_back(
-      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "sub_nested_seq",
-                  /* batchSize */ seqNum,
-                  /* trans */ false,
-                  /* useGpu*/ useGpu,
-                  /* useWeight */ false);
-  }
-}
-
-TEST(Layer, ClipLayer) {
-  const size_t batchSize = 128;
-  const size_t size = 512;
-  TestConfig config;
-  config.layerConfig.set_type("clip");
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ClipConfig* layerConf = input->mutable_clip_conf();
-  double p1 = std::rand() / (double)RAND_MAX;
-  double p2 = std::rand() / (double)RAND_MAX;
-  layerConf->set_min(std::min(p1, p2));
-  layerConf->set_max(std::max(p1, p2));
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, RowL2NormLayer) {
-  const size_t batchSize = 128;
-  const size_t size = 512;
-  TestConfig config;
-  config.layerConfig.set_type("row_l2_norm");
-  config.layerConfig.set_size(size);
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  config.layerConfig.add_inputs();
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
-  }
-}
-
-void test3DConvLayer(const string& type, bool trans, bool useGpu) {
-  // filter size
-  const int NUM_FILTERS = 6;
-  // const int CHANNELS = 3;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-
-  // input image
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 9;
-  const int IMAGE_SIZE_Y = 9;
-  const int IMAGE_SIZE_Z = 9;
-
-  TestConfig config;
-  config.biasSize = NUM_FILTERS;
-  config.layerConfig.set_type(type);
-  config.layerConfig.set_num_filters(NUM_FILTERS);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  // Setting up conv3D-trans layer
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-
-  conv->set_channels(CHANNELS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_filter_size_z(FILTER_SIZE_Z);
-  conv->set_padding(0);
-  conv->set_padding_y(0);
-  conv->set_padding_z(0);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_stride_z(2);
-  conv->set_img_size(IMAGE_SIZE);
-  conv->set_img_size_y(IMAGE_SIZE_Y);
-  conv->set_img_size_z(IMAGE_SIZE_Z);
-  conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
-                                conv->padding(),
-                                conv->stride(),
-                                /*  caffeMode */ true));
-  conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
-                                conv->padding_y(),
-                                conv->stride_y(),
-                                /*  caffeMode */ true));
-  conv->set_output_z(outputSize(conv->img_size_z(),
-                                conv->filter_size_z(),
-                                conv->padding_z(),
-                                conv->stride_z(),
-                                /*  caffeMode */ true));
-
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              conv->output_z() * NUM_FILTERS);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
-       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
-           NUM_FILTERS});
-
-  testLayerGrad(config, "conv3D", 10, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, test3DConvLayer) {
-  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
-  // filter size
-  const int NUM_FILTERS = 6;
-  // const int CHANNELS = 3;
-  const int FILTER_SIZE = 3;
-  const int FILTER_SIZE_Y = 3;
-  const int FILTER_SIZE_Z = 3;
-
-  // input image
-  const int CHANNELS = 3;
-  const int IMAGE_SIZE = 4;
-  const int IMAGE_SIZE_Y = 6;
-  const int IMAGE_SIZE_Z = 6;
-
-  // Setting up conv-trans layer
-  TestConfig config;
-  config.biasSize = NUM_FILTERS;
-  config.layerConfig.set_type("deconv3d");
-  config.layerConfig.set_num_filters(NUM_FILTERS);
-  config.layerConfig.set_partial_sum(1);
-  config.layerConfig.set_shared_biases(true);
-
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-
-  conv->set_channels(CHANNELS);
-  conv->set_filter_size(FILTER_SIZE);
-  conv->set_filter_size_y(FILTER_SIZE_Y);
-  conv->set_filter_size_z(FILTER_SIZE_Z);
-  conv->set_padding(0);
-  conv->set_padding_y(0);
-  conv->set_padding_z(0);
-  conv->set_stride(2);
-  conv->set_stride_y(2);
-  conv->set_stride_z(2);
-  conv->set_output_x(IMAGE_SIZE);
-  conv->set_output_y(IMAGE_SIZE_Y);
-  conv->set_output_z(IMAGE_SIZE_Z);
-
-  conv->set_img_size(imageSize(conv->output_x(),
-                               conv->filter_size(),
-                               conv->padding(),
-                               conv->stride(),
-                               true));
-  conv->set_img_size_y(imageSize(conv->output_y(),
-                                 conv->filter_size_y(),
-                                 conv->padding_y(),
-                                 conv->stride_y(),
-                                 true));
-  conv->set_img_size_z(imageSize(conv->output_z(),
-                                 conv->filter_size_z(),
-                                 conv->padding_z(),
-                                 conv->stride_z(),
-                                 true));
-  config.layerConfig.set_size(conv->img_size() * conv->img_size_y() *
-                              conv->img_size_z() * NUM_FILTERS);
-  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  config.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
-       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
-           NUM_FILTERS});
-
-  testLayerGrad(config, "deconv3D", 10, trans, useGpu);
-  // Use small batch_size and useWeight=true to test biasGrad
-  testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02);
-}
-
-TEST(Layer, test3DDeConvLayer) {
-  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_CUDA
-  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
-#endif
-}
-
-TEST(Layer, ScaleShiftLayer) {
-  // FIXME: Disable ScaleShiftLayer because it is not stable.
-  // https://github.com/PaddlePaddle/Paddle/issues/7781
-  return;
-  //  const size_t batchSize = 16;
-  //  const size_t size = 32;
-  //  TestConfig config;
-  //  config.layerConfig.set_type("scale_shift");
-  //  config.layerConfig.set_size(size);
-  //  config.biasSize = 1;
-  //  config.inputDefs.push_back(
-  //      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
-  //  config.layerConfig.add_inputs();
-  //  for (auto useGpu : {false, true}) {
-  //    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
-  //  }
-}
-
-TEST(Layer, ScaleSubRegionLayer) {
-  const size_t batchSize = 64;
-  const size_t size = 4096;
-  TestConfig config;
-  config.layerConfig.set_type("scale_sub_region");
-  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
-  MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
-  auto* data = indicesV->getData();
-  for (size_t i = 0; i < batchSize; ++i) {
-    data[i * 2] = 2;
-    data[i * 2 + 1] = 4;
-    data[i * 2 + 2] = 16;
-    data[i * 2 + 3] = 32;
-    data[i * 2 + 4] = 16;
-    data[i * 2 + 5] = 32;
-  }
-  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  ScaleSubRegionConfig* scaleSubRegionConf =
-      input->mutable_scale_sub_region_conf();
-  ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
-  imgConf->set_img_size(32);
-  imgConf->set_img_size_y(32);
-  imgConf->set_channels(4);
-  scaleSubRegionConf->set_value(2.0);
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
-  }
-}
-
-TEST(Layer, L2DistanceLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("l2_distance");
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-
-  const size_t input_dim = 27;
-  const size_t batch_size = 11;
-
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
-  config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
-  }
-}
-
-void testFactorizationMachineLayer(InputType type, bool useGpu) {
-  const int FACTOR_SIZE = 10;
-  TestConfig config;
-  config.layerConfig.set_type("factorization_machine");
-  config.layerConfig.set_factor_size(FACTOR_SIZE);
-  config.layerConfig.set_size(1);
-  config.biasSize = 0;
-  config.inputDefs.push_back({type, "layer_0", 128, 1280});
-  config.layerConfig.add_inputs();
-  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
-}
-
-TEST(Layer, FactorizationMachineLayer) {
-  for (auto useGpu : {false, true}) {
-    testFactorizationMachineLayer(INPUT_DATA, useGpu);
-  }
-  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
deleted file mode 100644
index 423c31e27d7ca223f1cbff8f030b006d3889f0bb..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <vector>
-#include "paddle/gserver/layers/LinearChainCRF.h"
-#include "paddle/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static inline bool getNextSequence(vector<int>& seq, int numClasses) {
-  for (auto& v : seq) {
-    if (++v < numClasses) {
-      return true;
-    }
-    v = 0;
-  }
-  return false;
-}
-
-TEST(LinearChainCRF, decoding) {
-  const int numClasses = 4;
-  CpuVector para(numClasses * (numClasses + 2));
-  real* a = para.getData();
-  real* b = para.getData() + numClasses;
-  real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData());
-  for (int length : {1, 2, 3, 10}) {
-    for (int tries = 0; tries < 10; ++tries) {
-      CpuMatrix x(length, numClasses);
-      x.randomizeUniform();
-      para.randnorm(0, 2);
-      vector<int> decodingResult(length);
-      vector<int> bestResult(length);
-      vector<int> testResult(length, 0);
-      crf.decode(x.getData(), &decodingResult[0], length);
-      real bestScore = -std::numeric_limits<real>::max();
-      do {
-        real score = a[testResult.front()] + b[testResult.back()];
-        score += x.getElement(0, testResult.front());
-        for (int k = 1; k < length; ++k) {
-          score += x.getElement(k, testResult[k]) +
-                   w[numClasses * testResult[k - 1] + testResult[k]];
-        }
-        if (score > bestScore) {
-          bestScore = score;
-          bestResult = testResult;
-        }
-      } while (getNextSequence(testResult, numClasses));
-      for (int k = 0; k < length; ++k) {
-        EXPECT_EQ(decodingResult[k], bestResult[k]);
-      }
-    }
-  }
-}
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
deleted file mode 100644
index a34a3f6206171fb1e0563ab9ef8550bc890359ce..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ /dev/null
@@ -1,448 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
-#include <string>
-#include <vector>
-#include "MKLDNNTester.h"
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/activations/MKLDNNActivation.h"
-#include "paddle/math/MathUtils.h"
-
-using namespace paddle;  // NOLINT
-
-DECLARE_bool(thread_local_rand_use_global_seed);
-DECLARE_bool(use_gpu);
-DECLARE_bool(use_mkldnn);
-
-#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC)         \
-  MKLDNNTester tester;                                        \
-  for (auto bs : {DESC.bs, 1}) {                              \
-    tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw); \
-  }
-
-#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC) \
-  TestConfig ref = DNN_CONFIG;                            \
-  ref.layerConfig.set_type(REF_TYPE);                     \
-  RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC)
-
-struct testFcDesc {
-  int bs;
-  int ic;
-  int ih, iw;  // oh == ow == 1
-  int oc;
-};
-
-static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_fc");
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.set_size(pm.oc);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)});
-  cfg.layerConfig.add_inputs();
-}
-
-void testFcLayer(const testFcDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNFcConfig(dnnConfig, pm);
-  for (auto biasSize : {pm.oc, 0}) {
-    dnnConfig.biasSize = biasSize;
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm)
-  }
-}
-
-TEST(MKLDNNLayer, FcLayer) {
-  /* bs, ic, ih, iw, oc */
-  testFcLayer({2, 2, 1, 1, 3});
-  testFcLayer({3, 7, 1, 1, 19});
-  testFcLayer({8, 16, 13, 13, 32});
-  testFcLayer({4, 12, 13, 13, 18});
-  testFcLayer({2, 64, 16, 16, 32});
-  testFcLayer({15, 3, 16, 16, 6});
-}
-
-struct testConvDesc {
-  int bs, gp;
-  int ic, ih, iw;
-  int oc, oh, ow;
-  int fh, fw;
-  int ph, pw;
-  int sh, sw;
-  int dh, dw;
-};
-
-static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_conv");
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.set_num_filters(pm.oc);
-  cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
-  cfg.layerConfig.set_shared_biases(true);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       /* size of weight= */ size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)});
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  ConvConfig* conv = input->mutable_conv_conf();
-  conv->set_groups(pm.gp);
-  conv->set_img_size(pm.iw);
-  conv->set_img_size_y(pm.ih);
-  conv->set_output_x(pm.ow);
-  conv->set_output_y(pm.oh);
-  conv->set_filter_size(pm.fw);
-  conv->set_filter_size_y(pm.fh);
-  conv->set_channels(pm.ic);
-  conv->set_padding(pm.pw);
-  conv->set_padding_y(pm.ph);
-  conv->set_stride(pm.sw);
-  conv->set_stride_y(pm.sh);
-  conv->set_dilation(pm.dw);
-  conv->set_dilation_y(pm.dh);
-  conv->set_caffe_mode(true);
-  conv->set_filter_channels(conv->channels() / conv->groups());
-  CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels())
-      << "it is indivisible";
-
-  int fh = (pm.fh - 1) * pm.dh + 1;
-  int fw = (pm.fw - 1) * pm.dw + 1;
-  int ow = outputSize(pm.iw, fw, pm.pw, pm.sw, true);
-  int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true);
-  CHECK_EQ(ow, pm.ow) << "output size check failed";
-  CHECK_EQ(oh, pm.oh) << "output size check failed";
-}
-
-void testConvLayer(const testConvDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNConvConfig(dnnConfig, pm);
-  for (auto biasSize : {pm.oc, 0}) {
-    dnnConfig.biasSize = biasSize;
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm)
-  }
-}
-
-TEST(MKLDNNLayer, ConvLayer) {
-  /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */
-  testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1});
-  testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1});
-  testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1});
-  // with groups
-  testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
-  testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
-}
-
-struct testPoolDesc {
-  int bs, ic;  // input channel and output channel are the same
-  int ih, iw;
-  int oh, ow;
-  int fh, fw;
-  int ph, pw;
-  int sh, sw;
-};
-
-static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_pool");
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       0});
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-  pool->set_pool_type("avg-projection");
-  pool->set_channels(pm.ic);
-  pool->set_img_size(pm.iw);
-  pool->set_img_size_y(pm.ih);
-  pool->set_output_x(pm.ow);
-  pool->set_output_y(pm.oh);
-  pool->set_size_x(pm.fw);
-  pool->set_size_y(pm.fh);
-  pool->set_padding(pm.pw);
-  pool->set_padding_y(pm.ph);
-  pool->set_stride(pm.sw);
-  pool->set_stride_y(pm.sh);
-
-  int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
-  int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
-  CHECK_EQ(ow, pm.ow) << "output size check failed";
-  CHECK_EQ(oh, pm.oh) << "output size check failed";
-}
-
-void testPoolLayer(const testPoolDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNPoolConfig(dnnConfig, pm);
-  LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0);
-  PoolConfig* pool = input->mutable_pool_conf();
-  for (auto type : {"max-projection", "avg-projection"}) {
-    pool->set_pool_type(type);
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm)
-  }
-}
-
-TEST(MKLDNNLayer, PoolLayer) {
-  /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */
-  testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
-  testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
-  testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
-  testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
-  testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
-  testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
-  testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
-  testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
-}
-
-struct testBatchNormDesc {
-  int bs;
-  int ic;
-  int ih, iw;
-};
-
-static void getMKLDNNBatchNormConfig(TestConfig& cfg,
-                                     const testBatchNormDesc& pm) {
-  cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw);
-  cfg.layerConfig.set_type("mkldnn_batch_norm");
-  cfg.biasSize = pm.ic;
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
-       /* size of weight= */ size_t(pm.ic)});
-  cfg.inputDefs.push_back(
-      {INPUT_DATA, "layer_1_moving_mean", 1, size_t(pm.ic)});
-  cfg.inputDefs.back().isStatic = true;
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_2_moving_var", 1, size_t(pm.ic)});
-  cfg.inputDefs.back().isStatic = true;
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  cfg.layerConfig.set_active_type("relu");
-  cfg.layerConfig.add_inputs();
-  cfg.layerConfig.add_inputs();
-  ImageConfig* img_conf = input->mutable_image_conf();
-  img_conf->set_channels(pm.ic);
-  img_conf->set_img_size_y(pm.ih);
-  img_conf->set_img_size(pm.iw);
-}
-
-void testBatchNormLayer(const testBatchNormDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNBatchNormConfig(dnnConfig, pm);
-  TestConfig refConfig = dnnConfig;
-  refConfig.layerConfig.set_type("batch_norm");
-  // for PASS_TRAIN, use_global_stats always should be false, and batchsize != 1
-  VLOG(MKLDNN_TESTS) << "check train phase";
-  dnnConfig.layerConfig.set_use_global_stats(false);
-  refConfig.layerConfig.set_use_global_stats(false);
-  MKLDNNTester tester;
-  tester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN);
-  // for PASS_TEST, check use_global_stats true and false, and batchsize 1
-  VLOG(MKLDNN_TESTS) << "check test phase";
-  for (auto useGS : {false, true}) {
-    dnnConfig.layerConfig.set_use_global_stats(useGS);
-    refConfig.layerConfig.set_use_global_stats(useGS);
-    MKLDNNTester tester;
-    for (auto bs : {pm.bs, 1}) {
-      tester.run(dnnConfig, refConfig, bs, pm.ih, pm.iw, PASS_TEST);
-    }
-  }
-}
-
-TEST(MKLDNNLayer, BatchNormLayer) {
-  testBatchNormLayer({4, 10, 6, 6});
-  testBatchNormLayer({16, 32, 16, 16});
-  testBatchNormLayer({4, 16, 8, 10});
-}
-
-struct testLRNDesc {
-  int bs, ic, ih, iw;
-  float scale, pow;
-  int localSize;
-};
-
-void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) {
-  cfg.layerConfig.set_type("mkldnn_lrn");
-  cfg.layerConfig.set_active_type("relu");
-  size_t layerSize = pm.ic * pm.ih * pm.iw;
-  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
-  LayerInputConfig* input = cfg.layerConfig.add_inputs();
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_channels(pm.ic);
-  norm->set_size(pm.localSize);
-  norm->set_scale(pm.scale);
-  norm->set_pow(pm.pow);
-  norm->set_blocked(0);
-  norm->set_img_size(pm.iw);
-  norm->set_img_size_y(pm.ih);
-  norm->set_output_x(norm->img_size());
-  norm->set_output_y(norm->img_size_y());
-  cfg.layerConfig.set_size(layerSize);
-  cfg.biasSize = 0;
-}
-
-void testLRNLayer(const testLRNDesc& pm) {
-  TestConfig dnnConfig;
-  getMKLDNNLRNConfig(dnnConfig, pm);
-  // mkldnn_lrn <==> norm with cmrnorm-projection type
-  TestConfig refConfig = dnnConfig;
-  refConfig.layerConfig.set_type("norm");
-  LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0);
-  NormConfig* norm = input->mutable_norm_conf();
-  norm->set_norm_type("cmrnorm-projection");
-  norm->set_scale(norm->scale() / norm->size());
-  RUN_MKLDNN_TEST(dnnConfig, refConfig, pm)
-}
-
-TEST(MKLDNNLayer, LRNLayer) {
-  testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5});
-  testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5});
-  testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5});
-}
-
-struct testImageDesc {
-  int bs, ic, ih, iw;
-};
-
-static void getAddtoConfig(TestConfig& cfg,
-                           const testImageDesc& pm,
-                           const size_t nInputs = 1) {
-  cfg.biasSize = 0;
-  cfg.layerConfig.set_type("addto");
-  size_t layerSize = pm.ic * pm.ih * pm.iw;
-  cfg.layerConfig.set_size(layerSize);
-  cfg.layerConfig.set_active_type("relu");
-  for (size_t i = 0; i < nInputs; ++i) {
-    std::stringstream ss;
-    ss << "layer_" << i;
-    cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
-    LayerInputConfig* input = cfg.layerConfig.add_inputs();
-    ImageConfig* img_conf = input->mutable_image_conf();
-    img_conf->set_channels(pm.ic);
-    img_conf->set_img_size_y(pm.ih);
-    img_conf->set_img_size(pm.iw);
-  }
-}
-
-void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
-  CHECK_GE(nInputs, 1UL);
-  TestConfig dnnConfig;
-  getAddtoConfig(dnnConfig, pm, nInputs);
-  dnnConfig.layerConfig.set_type("mkldnn_addto");
-  for (auto withBias : {false, true}) {
-    dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
-    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
-  }
-}
-
-TEST(MKLDNNLayer, AddtoLayer) {
-  testAddtoLayer({16, 5, 14, 14}, 1);
-  testAddtoLayer({8, 10, 8, 8}, 2);
-  testAddtoLayer({4, 12, 1, 1}, 3);
-}
-
-static void getMKLDNNConcatConfig(TestConfig& cfg,
-                                  const std::vector<testImageDesc>& inputs) {
-  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
-  int oc = inputs[0].ic;
-  for (size_t i = 1; i < inputs.size(); ++i) {
-    CHECK_EQ(inputs[i].bs, inputs[0].bs);
-    CHECK_EQ(inputs[i].ih, inputs[0].ih);
-    CHECK_EQ(inputs[i].iw, inputs[0].iw);
-    oc += inputs[i].ic;
-  }
-  cfg.biasSize = 0;
-  cfg.layerConfig.set_type("mkldnn_concat");
-  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
-  cfg.layerConfig.set_active_type("relu");
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    std::stringstream ss;
-    ss << "layer_" << i;
-    cfg.inputDefs.push_back(
-        {INPUT_DATA,
-         ss.str(),
-         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
-         0});
-    LayerInputConfig* input = cfg.layerConfig.add_inputs();
-    ImageConfig* img_conf = input->mutable_image_conf();
-    img_conf->set_channels(inputs[i].ic);
-    img_conf->set_img_size_y(inputs[i].ih);
-    img_conf->set_img_size(inputs[i].iw);
-  }
-}
-
-void testConcatLayer(const std::vector<testImageDesc>& inputs) {
-  TestConfig dnnConfig;
-  getMKLDNNConcatConfig(dnnConfig, inputs);
-  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
-}
-
-TEST(MKLDNNLayer, ConcatLayer) {
-  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
-  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
-}
-
-void testActivation(std::string actType, const testImageDesc& pm) {
-  // TODO(TJ): remove me when paddle support elu activation
-  if (actType == "mkldnn_elu") {
-    return;
-  }
-  const std::string compareTypes[] = {actType, actType.erase(0, 7)};
-  TestConfig cfg;
-  getAddtoConfig(cfg, pm);
-  TestConfig ref = cfg;
-  cfg.layerConfig.set_active_type(compareTypes[0]);
-  ref.layerConfig.set_active_type(compareTypes[1]);
-  RUN_MKLDNN_TEST(cfg, ref, pm)
-}
-
-TEST(MKLDNNActivation, Activations) {
-  auto types = MKLDNNActivation::getAllRegisteredTypes();
-  for (auto type : types) {
-    /* bs, c, h, w*/
-    testActivation(type, {16, 64, 32, 32});
-    testActivation(type, {2, 8, 1, 1});
-  }
-}
-
-DECLARE_string(config_args);
-TEST(MKLDNNNet, net) {
-  std::vector<std::string> cases = {"simple", "branch"};
-  for (auto name : cases) {
-    std::string config = "./gserver/tests/mkldnn_" + name + "_net.conf";
-    for (auto channels : {2, 32}) {
-      std::ostringstream oss;
-      oss << "channels=" << channels;
-      FLAGS_config_args = oss.str();
-      MKLDNNTester::runNetTest(config);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  FLAGS_use_gpu = false;
-  FLAGS_use_mkldnn = true;
-  initMain(argc, argv);
-  initPython(argc, argv);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
deleted file mode 100644
index 5188d2abed899a210de66084109034ee381cd078..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <string>
-#include <vector>
-
-#include "LayerGradUtil.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;
-
-void setPoolConfig(TestConfig* config,
-                   PoolConfig* pool,
-                   const string& poolType) {
-  (*config).biasSize = 0;
-  (*config).layerConfig.set_type("pool");
-  (*config).layerConfig.set_num_filters(1);
-
-  int kw = 3, kh = 3;
-  int pw = 0, ph = 0;
-  int sw = 2, sh = 2;
-  pool->set_pool_type(poolType);
-  pool->set_channels(1);
-  pool->set_size_x(kw);
-  pool->set_size_y(kh);
-  pool->set_start(0);
-  pool->set_padding(pw);
-  pool->set_padding_y(ph);
-  pool->set_stride(sw);
-  pool->set_stride_y(sh);
-
-  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
-  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
-  pool->set_output_x(ow);
-  pool->set_output_y(oh);
-}
-
-void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
-                                       const string& poolType,
-                                       bool use_gpu,
-                                       MatrixPtr& maskMat) {
-  TestConfig config;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
-  LayerInputConfig* input = config.layerConfig.add_inputs();
-  PoolConfig* pool = input->mutable_pool_conf();
-
-  pool->set_img_size(5);
-  pool->set_img_size_y(5);
-  setPoolConfig(&config, pool, poolType);
-  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
-                              pool->channels());
-
-  config.layerConfig.set_name("MaxPoolWithMask");
-
-  std::vector<DataLayerPtr> dataLayers;
-  LayerMap layerMap;
-  vector<Argument> datas;
-
-  initDataLayer(config,
-                &dataLayers,
-                &datas,
-                &layerMap,
-                "MaxPoolWithMask",
-                1,
-                false,
-                use_gpu);
-
-  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
-
-  FLAGS_use_gpu = use_gpu;
-  std::vector<ParameterPtr> parameters;
-  LayerPtr maxPoolingWithMaskOutputLayer;
-  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
-  maxPoolingWithMaskOutputLayer->forward(PASS_GC);
-
-  checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
-                   maskMat);
-}
-
-TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
-  bool useGpu = false;
-  MatrixPtr inputMat;
-  MatrixPtr maskMat;
-  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
-                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
-                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
-  real maskData[] = {12, 4, 22, 24};
-
-  inputMat = Matrix::create(1, 25, false, useGpu);
-  maskMat = Matrix::create(1, 4, false, useGpu);
-  inputMat->setData(inputData);
-  maskMat->setData(maskData);
-  doOneMaxPoolingWithMaskOutputTest(
-      inputMat, "max-pool-with-mask", useGpu, maskMat);
-#ifdef PADDLE_WITH_CUDA
-  useGpu = true;
-  inputMat = Matrix::create(1, 25, false, useGpu);
-  maskMat = Matrix::create(1, 4, false, useGpu);
-  inputMat->copyFrom(inputData, 25);
-  maskMat->copyFrom(maskData, 4);
-  doOneMaxPoolingWithMaskOutputTest(
-      inputMat, "max-pool-with-mask", useGpu, maskMat);
-#endif
-}
diff --git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp
deleted file mode 100644
index 4a295ea9d51788f988fe79f8439cc7769f661d8e..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_MultinomialSampler.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <random>
-
-#include <gtest/gtest.h>
-#include <vector>
-
-#undef PADDLE_DISABLE_TIMER
-#include "paddle/utils/Stat.h"
-
-#include "paddle/gserver/layers/MultinomialSampler.h"
-#include "paddle/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-class MultinomialSamplerTester : public MultinomialSampler {
-public:
-  MultinomialSamplerTester(real* prob, int size)
-      : MultinomialSampler(prob, size) {}
-
-  template <typename Rand1>
-  int testGen(Rand1 rand1) {
-    return gen1(rand1);
-  }
-};
-
-TEST(MultinomialSampler, gen) {
-  int numGrids = 1024 * 1024;
-  int size = 1024 * 4;
-  default_random_engine reng;
-
-  for (size_t iter = 0; iter < 256; ++iter) {
-    uniform_int_distribution<int> rand(1, numGrids / size * 1.8);
-    vector<real> prob;
-    int sum = 0;
-    for (int i = 0; i < size; ++i) {
-      prob.push_back(rand(reng));
-      sum += prob.back();
-    }
-
-    CHECK_LE(sum, numGrids);
-    prob.back() += numGrids - sum;
-
-    vector<int> counts(size);
-    MultinomialSamplerTester sampler(&prob[0], size);
-    counts.assign(size, 0);
-    {
-      double s = (double)size / (double)numGrids;
-      REGISTER_TIMER("MultinomialSampler");
-      for (double i = 0; i < numGrids; ++i) {
-        int ret = sampler.testGen([i, s]() { return s * i; });
-        if (ret < 0 || ret >= size) {
-          EXPECT_GE(ret, 0);
-          EXPECT_LT(ret, size);
-          break;
-        }
-        ++counts[ret];
-      }
-    }
-    for (int i = 0; i < size; ++i) {
-      if (prob[i] != counts[i]) {
-        EXPECT_EQ(prob[i], counts[i]);
-        LOG(INFO) << iter;
-        break;
-      }
-    }
-  }
-}
-
-void benchmarkRandom() {
-  int n = 1024 * 1024;
-
-  int sum;
-  double sum1;
-
-  sum = 0;
-  unsigned int seed = 1;
-  {
-    REGISTER_TIMER("crand");
-    for (int i = 0; i < n; ++i) {
-      sum += rand_r(&seed) % 1000;
-    }
-  }
-  LOG(INFO) << "sum=" << sum;
-
-  default_random_engine reng;
-  uniform_int_distribution<int> rand(1, 1000);
-  sum = 0;
-  {
-    REGISTER_TIMER("stdrand");
-    for (int i = 0; i < n; ++i) {
-      sum += rand(reng);
-    }
-  }
-  LOG(INFO) << "sum=" << sum;
-
-  sum = 0;
-  {
-    REGISTER_TIMER("default_random_engine");
-    for (int i = 0; i < n; ++i) {
-      sum += reng();
-    }
-  }
-  LOG(INFO) << "sum=" << sum;
-
-  uniform_real_distribution<double> rand1(0, 1);
-  sum1 = 0;
-  {
-    REGISTER_TIMER("stdrand1");
-    for (int i = 0; i < n; ++i) {
-      sum1 += rand1(reng);
-    }
-  }
-  LOG(INFO) << "sum1=" << sum1;
-
-  sum1 = 0;
-  {
-    real a = 1.0f / (real)RAND_MAX;
-    REGISTER_TIMER("crand1");
-    for (int i = 0; i < n; ++i) {
-      sum1 += a * rand_r(&seed);
-    }
-  }
-  LOG(INFO) << "sum1=" << sum1;
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  benchmarkRandom();
-  int ret = RUN_ALL_TESTS();
-  globalStat.printSegTimerStatus();
-  return ret;
-}
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
deleted file mode 100644
index fda3f2f7934adde09303f443ca5e8de6a7d077cd..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#undef PADDLE_DISABLE_TIMER
-#include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/testing/TestUtil.h"
-#include "paddle/trainer/Trainer.h"
-#include "paddle/utils/Stat.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_double(checkgrad_eps);
-DEFINE_bool(use_label, true, "input label or sequence label");
-DEFINE_bool(static_para, false, "static parameter");
-
-struct DataIn {
-  std::vector<Argument> inArgs;
-  std::vector<MatrixPtr> outGrads;
-  std::vector<VectorPtr> paraValues;
-};
-
-struct DataOut {
-  std::vector<MatrixPtr> outValues;
-  std::vector<VectorPtr> paraGrads;
-};
-
-void initArgument(DataIn& data,
-                  const std::string& configPath,
-                  bool useGpu = FLAGS_use_gpu) {
-  TrainerConfigHelper config(configPath);
-  size_t batchSize = config.getOptConfig().batch_size();
-
-  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    Argument arg;
-    arg.value = Matrix::create(batchSize, layerSize, false, useGpu);
-    arg.grad = Matrix::create(batchSize, layerSize, false, useGpu);
-    arg.value->randomizeUniform();
-    arg.value->add(-0.5);
-    arg.value->sigmoid(*arg.value);
-    arg.grad->zeroMem();
-    if (FLAGS_use_label) {
-      arg.ids = VectorT<int>::create(batchSize, useGpu);
-      arg.ids->rand(layerSize);
-    }
-    generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
-    data.inArgs.push_back(arg);
-  }
-
-  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
-    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
-                                     config.getModelConfig().layers().end(),
-                                     [=](const LayerConfig& layer_config) {
-                                       return layer_config.name() == layer_name;
-                                     });
-    CHECK(layer_config != config.getModelConfig().layers().end());
-
-    size_t layerSize = layer_config->size();
-    MatrixPtr grad = Matrix::create(batchSize, layerSize, false, useGpu);
-    grad->randomizeUniform();
-    data.outGrads.push_back(grad);
-  }
-
-  for (const auto& para_config : config.getModelConfig().parameters()) {
-    VectorPtr value = Vector::create(para_config.size(), useGpu);
-    value->randnorm(0, 2);
-    data.paraValues.push_back(value);
-  }
-}
-
-void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) {
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  auto config = std::make_shared<TrainerConfigHelper>(configPath);
-  trainer.init(config, false);
-
-  std::vector<ParameterPtr> parameters;
-  vector<Argument> outArgs;
-
-  auto gradientMachine = trainer.getGradientMachine();
-  parameters = gradientMachine->getParameters();
-  if (FLAGS_static_para) {
-    for (size_t i = 0; i < parameters.size(); i++) {
-      parameters[i]->getBuf(PARAMETER_VALUE)->one();
-    }
-  } else {
-    for (size_t i = 0; i < in.paraValues.size(); i++) {
-      parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
-    }
-  }
-  gradientMachine->start();
-  gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
-  for (size_t i = 0; i < in.outGrads.size(); i++) {
-    // If the all the layers in the config have no parameters, also
-    // not set NeedGradient(), the outArgs[i] will be nullptr.
-    outArgs[i].grad->copyFrom(*in.outGrads[i]);
-  }
-  gradientMachine->backward();
-  for (size_t i = 0; i < in.outGrads.size(); i++) {
-    MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(),
-                                     outArgs[i].value->getWidth(),
-                                     false,
-                                     false);
-    value->copyFrom(*outArgs[i].value);
-    out.outValues.push_back(value);
-  }
-  for (size_t i = 0; i < in.paraValues.size(); i++) {
-    VectorPtr grad = Vector::create(
-        parameters[i]->getBuf(PARAMETER_GRADIENT)->getSize(), false);
-    grad->copyFrom(*parameters[i]->getBuf(PARAMETER_GRADIENT));
-    out.paraGrads.push_back(grad);
-  }
-
-  for (int i = 0; i < 20; i++) {
-    REGISTER_TIMER("forward");
-    gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
-  }
-  for (int i = 0; i < 20; i++) {
-    REGISTER_TIMER("backward");
-    gradientMachine->backward();
-  }
-
-  gradientMachine->finish();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    if (diff > 0.0f &&
-        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_checkgrad_eps) {
-      nNum++;
-      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
-                << "    " << desB << " : " << B[i];
-    }
-  }
-  EXPECT_EQ(0, nNum);
-}
-
-void compareGradient(DataOut& outA, DataOut& outB) {
-  LOG(INFO) << "------------------------------"
-            << " Check Network Output "
-            << "------------------------------";
-  for (size_t i = 0; i < outA.outValues.size(); ++i) {
-    LOG(INFO) << "OUTPUT VALUE: " << i;
-    checkBuffer(outA.outValues[i]->getData(),
-                "network A output",
-                outB.outValues[i]->getData(),
-                "network B output",
-                outA.outValues[i]->getElementCnt(),
-                outA.outValues[i]->getWidth());
-  }
-
-  if (!FLAGS_static_para) {
-    LOG(INFO) << "------------------------------"
-              << " Check Parameters "
-              << "------------------------------";
-    for (size_t i = 0; i < outA.paraGrads.size(); ++i) {
-      LOG(INFO) << "PARAMETER GRADIENT: " << i;
-      checkBuffer(outA.paraGrads[i]->getData(),
-                  "Network A",
-                  outB.paraGrads[i]->getData(),
-                  "Network B",
-                  outA.paraGrads[i]->getSize());
-    }
-  }
-}
-
-void compareNetwork(const std::string& config_file_a,
-                    const std::string& config_file_b) {
-  DataIn in;
-  initArgument(in, config_file_a);
-
-  DataOut dataA;
-  calcGradient(in, dataA, config_file_a);
-  LOG(INFO) << "forwardBackward of Network A is finished";
-  globalStat.printSegTimerStatus();
-  globalStat.reset();
-  LOG(INFO) << "\n\n";
-
-  DataOut dataB;
-  calcGradient(in, dataB, config_file_b);
-  LOG(INFO) << "forwardBackward of the Network B is finished";
-  globalStat.printSegTimerStatus();
-  globalStat.reset();
-  LOG(INFO) << "\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-TEST(Compare, concat_dotmul) {
-  std::string config_file_a = "./gserver/tests/concat_dotmul_a.conf";
-  std::string config_file_b = "./gserver/tests/concat_dotmul_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-TEST(Compare, concat_fullmatrix) {
-  std::string config_file_a = "./gserver/tests/concat_fullmatrix_a.conf";
-  std::string config_file_b = "./gserver/tests/concat_fullmatrix_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-TEST(Compare, concat_table) {
-  std::string config_file_a = "./gserver/tests/concat_table_a.conf";
-  std::string config_file_b = "./gserver/tests/concat_table_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-TEST(Compare, concat_slice) {
-  std::string config_file_a = "./gserver/tests/concat_slice_a.conf";
-  std::string config_file_b = "./gserver/tests/concat_slice_b.conf";
-  compareNetwork(config_file_a, config_file_b);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(Compare, img_pool) {
-  std::string config_file_a = "./gserver/tests/img_pool_a.conf";
-  std::string config_file_b = "./gserver/tests/img_pool_b.conf";
-  bool useGpu = FLAGS_use_gpu;
-  FLAGS_use_gpu = true;
-  compareNetwork(config_file_a, config_file_b);
-  FLAGS_use_gpu = useGpu;
-}
-
-TEST(Compare, img_conv) {
-  std::string config_file_a = "./gserver/tests/img_conv_a.conf";
-  std::string config_file_b = "./gserver/tests/img_conv_b.conf";
-  bool useGpu = FLAGS_use_gpu;
-  FLAGS_use_gpu = true;
-  compareNetwork(config_file_a, config_file_b);
-  FLAGS_use_gpu = useGpu;
-}
-
-// Test cudnn_conv and exconv give the same result
-TEST(Compare, img_conv2) {
-  std::string config_file_a = "./gserver/tests/img_conv_cudnn.py";
-  std::string config_file_b = "./gserver/tests/img_conv_exconv.py";
-  bool useGpu = FLAGS_use_gpu;
-  double eps = FLAGS_checkgrad_eps;
-  FLAGS_use_gpu = true;
-  // Sometimes, this unit test will fail with 1e-2
-  FLAGS_checkgrad_eps = 4e-2;
-  compareNetwork(config_file_a, config_file_b);
-  FLAGS_use_gpu = useGpu;
-  FLAGS_checkgrad_eps = eps;
-}
-#endif
-
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
-TEST(Compare, network) {
-  if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") {
-    compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  initPython(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
deleted file mode 100644
index a1dee9795077b835392469b5085e9728679a1664..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include <gtest/gtest.h>
-
-#include "paddle/gserver/dataproviders/PyDataProvider.h"
-#include "paddle/utils/Util.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace std;     // NOLINT
-using namespace paddle;  // NOLINT
-
-void simpleValueCheck(const vector<Argument>& argumentList, bool useGpu);
-void simpleSequenceCheck(const vector<Argument>& argumentList, int sample_num);
-
-TEST(PyDataProvider, py_fill_slots) {
-  DataConfig config;
-  config.set_type("py");
-  config.set_async_load_data(false);
-  config.set_load_data_module(std::string("pyDataProvider"));
-  config.set_load_data_object(std::string("SimpleDataProvider"));
-  config.clear_files();
-  std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
-  config.set_files(dataFile);
-#ifndef PADDLE_WITH_CUDA
-  bool useGpu = false;
-#else
-  bool useGpu = true;
-#endif
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  DataBatch dataBatch;
-  dataProvider->getNextBatchInternal(2, &dataBatch);
-  const std::vector<Argument>& argumentList = dataBatch.getStreams();
-  // Check size
-  EXPECT_EQ(argumentList.size(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getHeight(), 2UL);
-  EXPECT_EQ(argumentList[0].value->getElementCnt(), 6UL);
-  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
-  EXPECT_EQ(argumentList[1].value->getHeight(), 2UL);
-  EXPECT_EQ(argumentList[1].value->getElementCnt(), 4UL);
-  EXPECT_EQ(argumentList[2].ids->getSize(), 2UL);
-  // Check value
-  simpleValueCheck(argumentList, useGpu);
-  // Check sequenceStartPositions
-  simpleSequenceCheck(argumentList, 2);
-}
-
-TEST(PyDataProvider, py_fill_nest_slots) {
-  DataConfig config;
-  config.set_type("py");
-  config.set_async_load_data(false);
-  config.set_load_data_module(std::string("pyDataProvider"));
-  config.set_load_data_object(std::string("SimpleNestDataProvider"));
-  config.clear_files();
-  std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
-  config.set_files(dataFile);
-  EXPECT_EQ(config.IsInitialized(), true);
-#ifndef PADDLE_WITH_CUDA
-  bool useGpu = false;
-#else
-  bool useGpu = true;
-#endif
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  DataBatch dataBatch;
-  dataProvider->getNextBatchInternal(2, &dataBatch);
-  const std::vector<Argument>& argumentList = dataBatch.getStreams();
-  // Check size
-  EXPECT_EQ(argumentList.size(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
-  EXPECT_EQ(argumentList[0].value->getHeight(), 4UL);
-  EXPECT_EQ(argumentList[0].value->getElementCnt(), 12UL);
-  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
-  EXPECT_EQ(argumentList[1].value->getHeight(), 4UL);
-  EXPECT_EQ(argumentList[1].value->getElementCnt(), 8UL);
-  EXPECT_EQ(argumentList[2].ids->getSize(), 4UL);
-  // Check value
-  simpleValueCheck(argumentList, useGpu);
-  // Check sequenceStartPositions
-  simpleSequenceCheck(argumentList, 4);
-  // Check subSequenceStartPositions
-  EXPECT_EQ(argumentList[0].subSequenceStartPositions->getSize(), 4UL);
-  EXPECT_EQ(argumentList[1].subSequenceStartPositions->getSize(), 3UL);
-  EXPECT_EQ(argumentList[2].subSequenceStartPositions->getSize(), 4UL);
-  for (size_t i = 0; i < argumentList.size(); i++) {
-    EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(0), 0);
-    EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(1), 1);
-    if (i != 1) {
-      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 2);
-      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(3), 4);
-    } else {
-      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 4);
-    }
-  }
-}
-
-void simpleValueCheck(const vector<Argument>& argumentList, bool useGpu) {
-  // Dense
-  real* data;
-  if (useGpu) {
-    MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(),
-                                            argumentList[0].value->getWidth(),
-                                            0,
-                                            0);
-    cpuMatrixPtr->copyFrom(*argumentList[0].value);
-    data = cpuMatrixPtr->getData();
-  } else {
-    data = argumentList[0].value->getData();
-  }
-  for (size_t i = 0; i < argumentList[0].value->getElementCnt(); ++i) {
-    EXPECT_EQ(*(data + i), (float)(i % 3 + 1));
-  }
-  // Sparse without value
-  GpuSparseMatrixPtr matGpu;
-  CpuSparseMatrixPtr matCpu;
-  if (useGpu) {
-    matGpu = dynamic_pointer_cast<GpuSparseMatrix>(argumentList[1].value);
-    ASSERT_TRUE(matGpu != NULL);
-  } else {
-    data = argumentList[0].value->getData();
-    matCpu = dynamic_pointer_cast<CpuSparseMatrix>(argumentList[1].value);
-    ASSERT_TRUE(matCpu != NULL);
-  }
-  for (size_t i = 0; i < argumentList[1].value->getHeight(); ++i) {
-    size_t colNum = useGpu ? matGpu->getColNum(i) : matCpu->getColNum(i);
-    EXPECT_EQ(colNum, (size_t)2);
-    const int* buf = useGpu ? matGpu->getRowCols(i) : matCpu->getRowCols(i);
-    for (size_t j = 0; j < colNum; ++j) {
-      EXPECT_EQ((size_t)buf[j], (size_t)(j + 1));
-    }
-  }
-  // Index
-  for (size_t j = 0; j < argumentList[2].ids->getSize(); ++j) {
-    EXPECT_EQ((size_t)argumentList[2].ids->get(j), 0UL);
-  }
-}
-
-void simpleSequenceCheck(const vector<Argument>& argumentList, int sample_num) {
-  EXPECT_EQ(argumentList[0].sequenceStartPositions->getSize(), 3UL);
-  EXPECT_EQ(argumentList[1].sequenceStartPositions->getSize(), 2UL);
-  EXPECT_EQ(argumentList[2].sequenceStartPositions->getSize(), 3UL);
-  for (size_t i = 0; i < argumentList.size(); i++) {
-    EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(0), 0);
-    if (i != 1) {
-      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), 1);
-      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(2),
-                sample_num);
-    } else {
-      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1),
-                sample_num);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
deleted file mode 100644
index b39fb3534509ebde2702c02e35800fe3ed6016c3..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ /dev/null
@@ -1,409 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_NO_PYTHON
-#include <gtest/gtest.h>
-#include <fstream>
-#include "paddle/gserver/dataproviders/DataProvider.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Util.h"
-
-DEFINE_string(train_list, "unittest.list", "file list for unittest");
-
-namespace paddle {
-namespace unittest {
-namespace pydp2 {
-extern void setOnPoolFilledHook(const std::function<void(size_t)> &func);
-extern void clearOnPoolFilledHook();
-
-}  // namespace pydp2
-}  // namespace unittest
-}  // namespace paddle
-
-const paddle::real epsilon = 1e-5;
-
-static inline int64_t readDataBatch(paddle::DataBatch *batch,
-                                    const std::string &funcName,
-                                    int64_t batchSize = 65535) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object(funcName);
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  return provider->getNextBatchInternal(batchSize, batch);
-}
-
-TEST(PyDataProvider2, dense_no_seq) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_dense_no_seq");
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-
-  provider->setSkipShuffle();  // skip shuffle for unittest.
-
-  paddle::DataBatch batch;
-  for (size_t pass = 0; pass < 2; ++pass) {  // read 2 passes
-    provider->reset();
-    int64_t num = provider->getNextBatchInternal(100, &batch);
-    ASSERT_NE(num, 0);
-    ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1);
-    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
-    // Check batch data.
-    for (size_t i = 0; i < 100; ++i) {
-      for (size_t j = 0; j < 200; ++j) {
-        paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0);
-        ASSERT_NEAR(
-            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
-      }
-    }
-
-    num = provider->getNextBatchInternal(100, &batch);
-    ASSERT_NE(num, 0);
-    ASSERT_EQ(batch.getStreams().size(), (size_t)1);
-    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
-    // Check batch data.
-    for (size_t i = 0; i < 100; ++i) {
-      size_t ii = i + 100;
-      for (size_t j = 0; j < 200; ++j) {
-        paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0);
-        ASSERT_NEAR(
-            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
-      }
-    }
-    num = provider->getNextBatchInternal(100, &batch);
-    ASSERT_EQ(num, 0);
-  }
-}
-
-TEST(PyDataProvider2, index_no_seq) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_index_no_seq");
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-
-  provider->setSkipShuffle();  // skip shuffle for unittest.
-  paddle::DataBatch batch;
-  for (size_t pass = 0; pass < 2; ++pass) {
-    provider->reset();
-    int64_t num = provider->getNextBatchInternal(10000, &batch);
-    CHECK_EQ(num, 200);
-    for (int i = 0; i < 200; ++i) {
-      CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]);
-    }
-  }
-}
-
-TEST(PyDataProvider2, init_hook) {
-  paddle::PyObjectPtr pickle = paddle::py::import("pickle");
-  paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__")));
-  PyDict_SetItemString(globals.get(), "pickle", pickle.get());
-  paddle::PyObjectPtr locals(PyDict_New());
-  paddle::PyObjectPtr mdl(PyRun_String(
-      "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})",
-      Py_file_input,
-      globals.get(),
-      locals.get()));
-  CHECK_PY(mdl) << "Error!";
-  paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps"));
-  CHECK_PY(dps) << "Error!";
-
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_init_hook");
-  config.set_load_data_args(PyString_AsString(dps.get()));
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->setSkipShuffle();  // skip shuffle for unittest.
-  provider->reset();
-  paddle::DataBatch batch;
-  int64_t num = provider->getNextBatchInternal(100000, &batch);
-  ASSERT_EQ(num, 200);
-  auto &mat = batch.getStreams()[0].value;
-  ASSERT_EQ((size_t)mat->getWidth(), (size_t)20);
-  for (size_t i = 0; i < 200; ++i) {
-    for (size_t j = 0; j < 20; ++j) {
-      ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon);
-    }
-  }
-}
-
-TEST(PyDataProvider2, sparse_no_value_no_seq) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_sparse_non_value_no_seq");
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batch;
-  int64_t num = provider->getNextBatchInternal(10000, &batch);
-  CHECK_EQ(num, 200);
-  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
-      batch.getStreams()[0].value);
-  CHECK(csm != nullptr);
-  for (int i = 0; i < 200; ++i) {
-    CHECK_EQ(csm->getColNum(i), (size_t)10);
-    int *cols = csm->getRowCols(i);
-    for (int j = 0; j < 10; ++j) {
-      CHECK_EQ(cols[j], (i + 1) * (j + 1));
-    }
-  }
-}
-
-TEST(PyDataProvider2, sparse_value_no_seq) {
-  paddle::DataBatch batch;
-  CHECK_EQ(readDataBatch(&batch, "test_sparse_value_no_seq"), 200);
-  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
-      batch.getStreams()[0].value);
-  CHECK(csm != nullptr);
-  for (int i = 0; i < 200; ++i) {
-    CHECK_EQ(csm->getColNum(i), (size_t)10);
-    int *cols = csm->getRowCols(i);
-    real *dat = csm->getRowValues(i);
-    for (int j = 0; j < 10; ++j) {
-      EXPECT_EQ(cols[j], (i + 1) * (j + 1));
-      EXPECT_EQ(dat[j], real(j) / real(i + 1));
-    }
-  }
-}
-
-TEST(PyDataProvider2, index_seq) {
-  paddle::DataBatch batch;
-  CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200);
-  auto &arg = batch.getStreams()[0];
-  CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2);
-  size_t tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {  // CHECK DATA CORRECT
-    for (size_t j = 0; j < i + 1; ++j) {
-      ASSERT_EQ((size_t)arg.ids->getData()[tmp], j);
-      ++tmp;
-    }
-  }
-  ASSERT_EQ(arg.sequenceStartPositions->getSize(), (size_t)201);
-  tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {
-    tmp += i;
-    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i], tmp);
-  }
-  tmp += 200;
-  ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[200], tmp);
-}
-
-TEST(PyDataProvider2, index_sub_seq) {
-  paddle::DataBatch batch;
-  ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200);
-  auto &arg = batch.getStreams()[0];
-  size_t tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {
-    for (size_t j = 0; j < i + 1; ++j) {
-      for (size_t k = 0; k < j + 1; ++k) {
-        CHECK_EQ((size_t)arg.ids->getData()[tmp++], k);
-      }
-    }
-  }
-
-  CHECK_EQ(tmp, arg.ids->getSize());
-
-  ASSERT_EQ((size_t)arg.sequenceStartPositions->getSize(), (size_t)201);
-  ASSERT_EQ(arg.subSequenceStartPositions->getData(false)[0], 0);
-  ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0);
-  size_t idx = 1;
-  tmp = 0;
-  for (size_t i = 0; i < 200; ++i) {
-    for (size_t j = 0; j < i + 1; ++j) {
-      tmp += j + 1;
-      ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx],
-                (size_t)tmp);
-      ++idx;
-    }
-    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp);
-  }
-}
-
-TEST(PyDataProvider2, min_pool_size) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_min_pool_size");
-  config.set_load_data_args("");
-  size_t totalData = 1 << 14;
-  constexpr size_t batchSize = 100;
-  constexpr size_t minPoolSize = 1000;
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-
-  paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) {
-    if (totalData > batchSize) {
-      CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize));
-    }
-  });
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
-    if (realBatchSize) {
-      totalData -= realBatchSize;
-    } else {
-      break;
-    }
-  }
-  paddle::unittest::pydp2::clearOnPoolFilledHook();
-}
-
-TEST(PyDataProvider2, can_over_batch_size) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_can_over_batch_size");
-  config.set_load_data_args("");
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-  constexpr size_t batchSize = 100;
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
-    if (realBatchSize) {
-      CHECK_LE(static_cast<size_t>(realBatchSize), batchSize);
-    } else {
-      break;
-    }
-  }
-}
-
-TEST(PyDataProvider2, input_order) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_input_order");
-  config.set_load_data_args("");
-
-  paddle::ModelConfig modelConfig;
-  *modelConfig.add_input_layer_names() = "input1";
-  *modelConfig.add_input_layer_names() = "input2";
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, modelConfig, false));
-  provider->reset();
-  constexpr size_t batchSize = 100;
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
-    if (!realBatchSize) {
-      break;
-    }
-    ASSERT_EQ(batch.getStreams().size(), static_cast<size_t>(2));
-    for (int64_t i = 0; i < realBatchSize; ++i) {
-      ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0);
-      ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1);
-    }
-  }
-}
-
-TEST(PyDataProvider2, test_check) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_check");
-  config.set_load_data_args("");
-  paddle::DataBatch batch;
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-  while (true) {
-    int64_t realBatchSize = provider->getNextBatchInternal(100, &batch);
-    if (!realBatchSize) {
-      break;
-    } else {
-      auto &ivec = batch.getStream(0).ids;
-      for (size_t i = 0; i < ivec->getSize(); ++i) {
-        CHECK_LT(ivec->getData()[i], 10);
-      }
-    }
-  }
-}
-
-TEST(PyDataProvider2, multiThread) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_dense_no_seq");
-  config.set_async_load_data(true);
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-  provider->reset();
-  paddle::DataBatch batch;
-  provider->getNextBatch(100, &batch);
-  provider->reset();
-  provider.reset();
-}
-
-TEST(PyDataProvider2, minPoolSizeWithCache) {
-  paddle::DataConfig config;
-  config.set_type("py2");
-  config.set_files(FLAGS_train_list.c_str());
-  config.set_load_data_module("test_PyDataProvider2");
-  config.set_load_data_object("test_min_pool_size_with_cache");
-  config.set_async_load_data(true);
-
-  std::unique_ptr<paddle::DataProvider> provider(
-      paddle::DataProvider::create(config, false));
-
-  paddle::DataBatch batch;
-
-  for (int i = 0; i < 10; ++i) {
-    provider->reset();
-    int64_t sum = 0;
-    while (int64_t actualNum = provider->getNextBatch(100, &batch)) {
-      sum += actualNum;
-    }
-    ASSERT_EQ(1 << 20, sum);
-  }
-}
-
-int main(int argc, char **argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-
-  std::ofstream fout(FLAGS_train_list);
-  CHECK(fout.is_open());
-  fout << "stub file name" << std::endl;  // in unittest, filename is not used.
-  fout.close();
-
-  return RUN_ALL_TESTS();
-}
-
-#endif
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
deleted file mode 100644
index 72324fcf29cc60867005da25b35a8075fd590a89..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/gserver/gradientmachines/GradientMachine.h>
-#include <paddle/parameter/ParameterUpdateFunctions.h>
-#include <paddle/trainer/Trainer.h>
-#include <paddle/trainer/TrainerInternal.h>
-#include <paddle/utils/PythonUtil.h>
-#include <paddle/utils/Util.h>
-#include <paddle/utils/Version.h>
-
-DECLARE_int32(seed);
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-class TrainerForTest : public paddle::Trainer {
-public:
-  void startTrain() {
-    GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
-    gm.start();
-  }
-
-  void finishTrain() {
-    GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
-    gm.finish();
-  }
-
-  /**
-   * Get total dimension of all parameters.
-   *
-   * @return the total dimension of all parameters
-   */
-  size_t getTotalParameterSize() const {
-    auto p = const_cast<TrainerForTest*>(this);
-    auto& params = p->getGradientMachine()->getParameters();
-    return std::accumulate(
-        params.begin(), params.end(), 0UL, [](size_t a, const ParameterPtr& p) {
-          return a + p->getSize();
-        });
-  }
-};
-
-void CalCost(const string& conf,
-             const string& dir,
-             real* cost,
-             int num_passes) {
-  auto config = std::make_shared<TrainerConfigHelper>(conf);
-  TrainerForTest trainer;
-  trainer.init(config);
-  mkDir(dir.c_str());
-  config->setSaveDir(dir);
-  auto dataProvider = trainer.getDataProvider();
-  int32_t batchSize = config->getOptConfig().batch_size();
-  real learningRate = config->getOptConfig().learning_rate();
-  real momentum = 0;
-  real decayRate = 0;
-  int64_t dim = trainer.getTotalParameterSize();
-  CpuVector vecW(dim);
-  CpuVector vecGradient(dim);
-  CpuVector vecMomentum(dim);
-
-  // vecW needs to be assigned, otherwise the variable is an uncertain value.
-
-  *ThreadLocalRand::getSeed() = FLAGS_seed;
-  vecW.randnorm(0, 0.1);
-  vecMomentum.randnorm(0, 0.1);
-
-  trainer.startTrain();
-  for (int i = 0; i < num_passes; ++i) {
-    real totalCost = 0;
-    dataProvider->reset();
-    while (true) {
-      DataBatch dataBatch;
-      int num = dataProvider->getNextBatch(batchSize, &dataBatch);
-      if (num == 0) break;
-      totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient);
-      sgdUpdate(
-          learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum);
-    }
-    cost[i] = totalCost;
-  }
-  trainer.finishTrain();
-  rmDir(dir.c_str());
-}
-
-void test(const string& conf1, const string& conf2, double eps, bool useGpu) {
-  if (!paddle::version::isWithGpu() && useGpu) {
-    return;
-  }
-  FLAGS_use_gpu = useGpu;
-  int num_passes = 5;
-  real* cost1 = new real[num_passes];
-  const string dir1 = "gserver/tests/t1";
-  CalCost(conf1, dir1, cost1, num_passes);
-
-  real* cost2 = new real[num_passes];
-  const string dir2 = "gserver/tests/t2";
-  CalCost(conf2, dir2, cost2, num_passes);
-
-  for (int i = 0; i < num_passes; i++) {
-    LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i]
-              << ", cost2=" << cost2[i]
-              << ", diff=" << std::abs(cost1[i] - cost2[i]);
-    ASSERT_NEAR(cost1[i], cost2[i], eps);
-  }
-  delete[] cost1;
-  delete[] cost2;
-}
-
-TEST(RecurrentGradientMachine, HasSubSequence) {
-  for (bool useGpu : {false, true}) {
-    test("gserver/tests/sequence_layer_group.conf",
-         "gserver/tests/sequence_nest_layer_group.conf",
-         1e-5,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn) {
-  for (bool useGpu : {false, true}) {
-    test("gserver/tests/sequence_rnn.conf",
-         "gserver/tests/sequence_nest_rnn.conf",
-         1e-6,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn_multi_input) {
-  for (bool useGpu : {false, true}) {
-    test("gserver/tests/sequence_rnn_multi_input.conf",
-         "gserver/tests/sequence_nest_rnn_multi_input.conf",
-         1e-6,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
-  for (bool useGpu : {false, true}) {
-    test("gserver/tests/sequence_rnn_multi_unequalength_inputs.py",
-         "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py",
-         1e-6,
-         useGpu);
-  }
-}
-
-TEST(RecurrentGradientMachine, rnn_mixed_input) {
-  for (bool useGpu : {false, true}) {
-    test("gserver/tests/sequence_rnn_mixed_inputs.py",
-         "gserver/tests/sequence_rnn_matched_inputs.py",
-         1e-6,
-         useGpu);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-
-  if (paddle::version::isWithPyDataProvider()) {
-    if (!paddle::version::isWithGpu()) {
-      FLAGS_use_gpu = false;
-    }
-    initMain(argc, argv);
-    initPython(argc, argv);
-    return RUN_ALL_TESTS();
-  } else {
-    return 0;
-  }
-}
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
deleted file mode 100644
index e5ce922f15749cb18b93f64e0e08f437c5633065..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ /dev/null
@@ -1,571 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/Version.h>
-#include <vector>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/Layer.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-DECLARE_bool(use_gpu);
-DECLARE_bool(rnn_use_batch);
-DECLARE_int32(fixed_seq_length);
-
-void checkError(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void checkError(const CpuVector& vector1, const CpuVector& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int size = vector1.getSize();
-  const real* data1 = vector1.getData();
-  const real* data2 = vector2.getData();
-  int count = 0;
-  for (int i = 0; i < size; i++) {
-    if (fabs(data1[i] - data2[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-LayerPtr creatDataLayer(string name,
-                        size_t batchSize,
-                        int layerSize,
-                        bool useGpu) {
-  LayerConfig dataConfig;
-  dataConfig.set_name(name);
-  dataConfig.set_type("data");
-  dataConfig.set_size(layerSize);
-  LayerPtr layer = LayerPtr(new DataLayer(dataConfig));
-
-  Argument data;
-  data.value = Matrix::create(batchSize, layer->getSize(), false, useGpu);
-  data.grad = Matrix::create(batchSize, layer->getSize(), false, useGpu);
-  data.value->randomizeUniform();
-  data.value->add(-0.5);
-  data.value->sigmoid(*data.value);
-  data.grad->zeroMem();
-
-  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
-
-  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  dataLayer->setData(data);
-  dataLayer->forward(PASS_GC);
-
-  return layer;
-}
-
-ParameterPtr creatParameter(string name,
-                            int pid,
-                            size_t paraSize,
-                            bool useGpu) {
-  ParameterConfig paraConfig;
-  paraConfig.set_name(name);
-  paraConfig.set_size(paraSize);
-
-  ParameterPtr parameter =
-      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ false);
-  parameter->enableType(PARAMETER_VALUE);
-  parameter->enableType(PARAMETER_GRADIENT);
-  parameter->randomize();
-  parameter->setID(pid);
-
-  return parameter;
-}
-
-ParameterPtr creatParameterBias(string name,
-                                int pid,
-                                size_t paraSize,
-                                bool useGpu) {
-  ParameterConfig paraConfig;
-  paraConfig.set_name(name);
-  paraConfig.set_size(paraSize);
-  paraConfig.set_initial_std(1);
-
-  ParameterPtr parameter =
-      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ true);
-  parameter->randomize();
-  parameter->setID(pid);
-
-  return parameter;
-}
-
-LayerPtr initRecurrentLayer(LayerConfig layerConfig,
-                            size_t batchSize,
-                            int layerSize,
-                            bool useGpu) {
-  FLAGS_use_gpu = useGpu;
-  LayerMap layerMap;
-  ParameterMap parameterMap;
-  LayerPtr dataLayer = creatDataLayer("layer_0", batchSize, layerSize, useGpu);
-  layerMap[dataLayer->getName()] = dataLayer;
-
-  ParameterPtr para =
-      creatParameter("para_0", 0, layerSize * layerSize, useGpu);
-  parameterMap[para->getName()] = para;
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-  LayerPtr testLayer = Layer::create(layerConfig);
-  layerMap[testLayer->getName()] = testLayer;
-
-  testLayer->init(layerMap, parameterMap);
-  testLayer->setNeedGradient(true);
-
-  return testLayer;
-}
-
-void checkRecurrentLayer(LayerPtr testLayer) {
-  const VectorPtr& weightGrad =
-      (testLayer->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
-  const MatrixPtr& inputGrad = testLayer->getPrev(0)->getOutputGrad();
-  CpuVector seqPara(weightGrad->getSize());
-  CpuVector batPara(weightGrad->getSize());
-  CpuMatrix seqInputGrad(inputGrad->getHeight(), inputGrad->getWidth());
-  CpuMatrix batInputGrad(inputGrad->getHeight(), inputGrad->getWidth());
-
-  CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth());
-  outputGrad.randomizeUniform();
-
-  /* use sequence calculate */
-  FLAGS_rnn_use_batch = false;
-  weightGrad->zero();
-  inputGrad->zero();
-  testLayer->forward(PASS_GC);
-  testLayer->getOutputGrad()->copyFrom(outputGrad);
-  testLayer->backward();
-  seqPara.copyFrom(*weightGrad);
-  seqInputGrad.copyFrom(*inputGrad);
-
-  /* use batch calculate */
-  FLAGS_rnn_use_batch = true;
-  weightGrad->zero();
-  inputGrad->zero();
-  testLayer->forward(PASS_GC);
-  testLayer->getOutputGrad()->copyFrom(outputGrad);
-  testLayer->backward();
-  batPara.copyFrom(*weightGrad);
-  batInputGrad.copyFrom(*inputGrad);
-
-  /* check */
-  checkError(seqInputGrad, batInputGrad);
-  checkError(seqPara, batPara);
-}
-
-TEST(Layer, RecurrentLayer) {
-  LayerConfig layerConfig;
-  layerConfig.set_name("rnn");
-  layerConfig.set_type("recurrent");
-  layerConfig.set_active_type("tanh");
-  for (auto layerSize : {1, 10, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 20, 100, 128}) {
-      for (auto useGpu : {false, true}) {
-        for (auto reversed : {false, true}) {
-          LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize
-                    << " useGpu=" << useGpu << " reversed=" << reversed;
-          layerConfig.set_size(layerSize);
-          layerConfig.set_reversed(reversed);
-          LayerPtr testLayer =
-              initRecurrentLayer(layerConfig, batchSize, layerSize, useGpu);
-          checkRecurrentLayer(testLayer);
-        }
-      }
-    }
-  }
-}
-
-#define protected public
-#include "paddle/gserver/layers/GatedRecurrentLayer.h"
-#include "paddle/gserver/layers/LstmLayer.h"
-#include "paddle/gserver/layers/RecurrentLayer.h"
-template <class T>
-class TestRecurrentLayer {
-public:
-  LayerConfig config_;
-  bool useGpu_;
-  bool useBatch_;
-  LayerPtr testLayer_;
-  LayerPtr dataLayer_;
-  ParameterPtr para_;
-  ParameterPtr bias_;
-  LayerMap layerMap_;
-  ParameterMap parameterMap_;
-  TestRecurrentLayer(const LayerConfig& config,
-                     bool useGpu,
-                     bool useBatch = false)
-      : config_(config), useGpu_(useGpu), useBatch_(useBatch) {}
-  void init(size_t batchSize) {
-    FLAGS_use_gpu = useGpu_;
-    testLayer_ = Layer::create(config_);
-    if (typeid(T) == typeid(GatedRecurrentLayer)) {
-      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
-                                  batchSize,
-                                  config_.size() * 3,
-                                  useGpu_);
-      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
-                             0,
-                             config_.size() * config_.size() * 3,
-                             useGpu_);
-      bias_ = creatParameterBias(
-          config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_);
-    } else if (typeid(T) == typeid(LstmLayer)) {
-      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
-                                  batchSize,
-                                  config_.size() * 4,
-                                  useGpu_);
-      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
-                             0,
-                             config_.size() * config_.size() * 4,
-                             useGpu_);
-      bias_ = creatParameterBias(
-          config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_);
-    }
-    layerMap_[dataLayer_->getName()] = dataLayer_;
-    parameterMap_[para_->getName()] = para_;
-    parameterMap_[bias_->getName()] = bias_;
-
-    layerMap_[testLayer_->getName()] = testLayer_;
-    testLayer_->init(layerMap_, parameterMap_);
-    testLayer_->setNeedGradient(true);
-    (dynamic_cast<T*>(testLayer_.get()))->useBatch_ = useBatch_;
-  }
-  void forward() {
-    FLAGS_use_gpu = useGpu_;
-    testLayer_->forward(PASS_GC);
-  }
-  void backward() {
-    FLAGS_use_gpu = useGpu_;
-    testLayer_->backward(nullptr);
-  }
-};
-
-template <class T>
-void checkRecurrentLayer(LayerConfig layerConfig,
-                         size_t batchSize,
-                         bool cpuBatch,
-                         bool gpuBatch) {
-  TestRecurrentLayer<T> testCpu(layerConfig, false, cpuBatch);
-  TestRecurrentLayer<T> testGpu(layerConfig, true, gpuBatch);
-  testCpu.init(batchSize);
-  testGpu.init(batchSize);
-  auto checkError = [](
-      MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) {
-    CpuMatrix check(gpu->getHeight(), gpu->getWidth());
-    check.copyFrom(*gpu);
-    int height = cpu->getHeight();
-    int width = cpu->getWidth();
-    const real* data1 = cpu->getData();
-    const real* data2 = check.getData();
-    int count = 0;
-    for (int i = 0; i < height; i++) {
-      for (int j = 0; j < width; j++) {
-        if (fabs(data1[i * width + j] - data2[i * width + j]) / numSequences >
-            1e-4) {
-          count++;
-        }
-      }
-    }
-    EXPECT_EQ(count, 0) << "[" << str << "]"
-                        << "There are " << count << " different element.";
-  };
-  T* cpuLayer = dynamic_cast<T*>(testCpu.testLayer_.get());
-  T* gpuLayer = dynamic_cast<T*>(testGpu.testLayer_.get());
-
-  Argument& cpuInput = testCpu.dataLayer_->getOutput();
-  Argument& gpuInput = testGpu.dataLayer_->getOutput();
-  gpuInput.resizeAndCopyFrom(cpuInput, true);
-
-  const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
-  const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
-  gpuVec->copyFrom(*cpuVec);
-
-  const VectorPtr& cpuBiasVec = testCpu.bias_->getBuf(PARAMETER_VALUE);
-  const VectorPtr& gpuBiasVec = testGpu.bias_->getBuf(PARAMETER_VALUE);
-  gpuBiasVec->copyFrom(*cpuBiasVec);
-
-  /* check forward */
-  testCpu.forward();
-  testGpu.forward();
-
-  checkError(
-      cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue");
-
-  /* check backward */
-  cpuLayer->getOutputGrad()->randomizeUniform();
-  gpuLayer->getOutputGrad()->copyFrom(*cpuLayer->getOutputGrad());
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-
-  testCpu.backward();
-  testGpu.backward();
-
-  // check input grad
-  checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad");
-  // check weight grad
-  int numSequences = cpuInput.getNumSequences();
-  checkError(cpuLayer->weight_->getWGrad(),
-             gpuLayer->weight_->getWGrad(),
-             numSequences,
-             "weightGrad");
-  // check bias grad
-  checkError(cpuLayer->bias_->getWGrad(),
-             gpuLayer->bias_->getWGrad(),
-             numSequences,
-             "biasGrad");
-}
-
-TEST(Layer, GatedRecurrentLayer) {
-  LayerConfig layerConfig;
-  layerConfig.set_type("gated_recurrent");
-  layerConfig.set_active_type("sigmoid");
-  layerConfig.set_active_gate_type("sigmoid");
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-  layerConfig.set_bias_parameter_name("bias");
-
-  for (auto frameSize : {32, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 100, 500}) {
-      for (auto reversed : {false, true}) {
-        for (auto cpuBatch : {false, true}) {
-          for (auto gpuBatch : {false, true}) {
-            LOG(INFO) << " batchSize=" << batchSize
-                      << " frameSize=" << frameSize << " reversed=" << reversed
-                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
-            layerConfig.set_size(frameSize);
-            layerConfig.set_reversed(reversed);
-            checkRecurrentLayer<GatedRecurrentLayer>(
-                layerConfig, batchSize, cpuBatch, gpuBatch);
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Layer, LstmLayer) {
-  LayerConfig layerConfig;
-  layerConfig.set_type("lstmemory");
-  layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("tanh");
-  layerConfig.set_active_gate_type("sigmoid");
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-  layerConfig.set_bias_parameter_name("bias");
-
-  for (auto frameSize : {32, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 100, 500}) {
-      for (auto reversed : {false, true}) {
-        for (auto cpuBatch : {false, true}) {
-          for (auto gpuBatch : {false, true}) {
-            LOG(INFO) << " batchSize=" << batchSize
-                      << " frameSize=" << frameSize << " reversed=" << reversed
-                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
-            layerConfig.set_size(frameSize);
-            layerConfig.set_reversed(reversed);
-            checkRecurrentLayer<LstmLayer>(
-                layerConfig, batchSize, cpuBatch, gpuBatch);
-          }
-        }
-      }
-    }
-  }
-}
-
-#ifdef PADDLE_WITH_MKLML
-
-#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h"
-
-LayerPtr initMKLPackedLayer(LayerConfig layerConfig,
-                            bool reversed,
-                            int layerSize,
-                            LayerPtr dataLayer,
-                            ParameterPtr para,
-                            ParameterPtr bias = nullptr) {
-  LayerMap layerMap;
-  ParameterMap parameterMap;
-  layerMap[dataLayer->getName()] = dataLayer;
-  parameterMap[para->getName()] = para;
-  if (bias) {
-    parameterMap[bias->getName()] = bias;
-    layerConfig.set_bias_parameter_name("bias_0");
-  }
-
-  layerConfig.set_size(layerSize);
-  layerConfig.set_reversed(reversed);
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name("layer_0");
-  input.set_input_parameter_name("para_0");
-
-  LayerPtr testLayer = Layer::create(layerConfig);
-  layerMap[testLayer->getName()] = testLayer;
-
-  testLayer->init(layerMap, parameterMap);
-  testLayer->setNeedGradient(true);
-
-  return testLayer;
-}
-
-void checkMKLPackedLayer(LayerConfig layerConfig1,
-                         LayerConfig layerConfig2,
-                         bool reversed,
-                         int layerSize,
-                         int batchSize,
-                         bool useBatch1,
-                         bool useBatch2) {
-  LayerPtr dataLayer;
-  ParameterPtr para, bias;
-
-  if (layerConfig1.type() == "recurrent") {
-    dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false);
-    para = creatParameter("para_0", 0, layerSize * layerSize, false);
-    bias = nullptr;
-  } else if (layerConfig1.type() == "gated_recurrent") {
-    dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false);
-    para = creatParameter("para_0", 0, layerSize * layerSize * 3, false);
-    bias = creatParameterBias("bias_0", 1, layerSize * 3, false);
-  }
-
-  LayerPtr testLayer1 = initMKLPackedLayer(
-      layerConfig1, reversed, layerSize, dataLayer, para, bias);
-  LayerPtr testLayer2 = initMKLPackedLayer(
-      layerConfig2, reversed, layerSize, dataLayer, para, bias);
-
-  const VectorPtr& weightGrad =
-      (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
-  const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad();
-  CpuVector wgt_grad1(weightGrad->getSize());
-  CpuVector wgt_grad2(weightGrad->getSize());
-  CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth());
-  CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth());
-
-  for (int i = 0; i < 2; i++) {
-    FLAGS_rnn_use_batch = useBatch1;
-
-    testLayer1->forward(PASS_GC);
-
-    FLAGS_rnn_use_batch = useBatch2;
-    testLayer2->forward(PASS_GC);
-
-    testLayer1->getOutputGrad()->randomizeUniform();
-    testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad());
-
-    weightGrad->zero();
-    inputGrad->zero();
-    FLAGS_rnn_use_batch = useBatch1;
-    testLayer1->backward(nullptr);
-
-    wgt_grad1.copyFrom(*weightGrad);
-    input_grad1.copyFrom(*inputGrad);
-
-    weightGrad->zero();
-    inputGrad->zero();
-    FLAGS_rnn_use_batch = useBatch2;
-    testLayer2->backward(nullptr);
-
-    wgt_grad2.copyFrom(*weightGrad);
-    input_grad2.copyFrom(*inputGrad);
-
-    checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue());
-    checkError(wgt_grad1, wgt_grad2);
-    checkError(input_grad1, input_grad2);
-  }
-}
-
-TEST(MKLPackedLayer, RecurrentLayer) {
-  LayerConfig layerConfig1;
-  LayerConfig layerConfig2;
-
-  layerConfig1.set_name("paddle-rnn");
-  layerConfig1.set_type("recurrent");
-  layerConfig1.set_active_type("relu");
-
-  layerConfig2.set_name("mkl-packed-rnn");
-  layerConfig2.set_type("mkl_packed_recurrent");
-  layerConfig2.set_active_type("relu");
-
-  FLAGS_use_gpu = false;
-
-  for (auto layerSize : {32, 64, 128, 256, 512}) {
-    for (auto batchSize : {1, 5, 100, 500}) {
-      for (auto reversed : {true, false}) {
-        for (auto paddle_use_batch : {true, false}) {
-          for (auto MKLPacked_use_batch : {true, false}) {
-            LOG(INFO) << " layerSize=" << layerSize
-                      << " batchSize=" << batchSize << " reversed=" << reversed
-                      << " paddle_use_batch=" << paddle_use_batch
-                      << " MKLPacked_use_batch=" << MKLPacked_use_batch;
-
-            checkMKLPackedLayer(layerConfig1,
-                                layerConfig2,
-                                reversed,
-                                layerSize,
-                                batchSize,
-                                paddle_use_batch,
-                                MKLPacked_use_batch);
-          }
-        }
-      }
-    }
-  }
-}
-#endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  if (!version::isWithGpu()) {
-    testing::GTEST_FLAG(filter) = "-Layer.*";
-  }
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
deleted file mode 100644
index 583e3bc545a3b5eb158490a8ccc5ea7060c7c6ab..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ /dev/null
@@ -1,471 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <math.h>
-#include <paddle/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-#include <ctime>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/FullyConnectedLayer.h"
-#include "paddle/gserver/layers/Layer.h"
-#include "paddle/gserver/layers/SelectiveFullyConnectedLayer.h"
-#include "paddle/math/CpuSparseMatrix.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-DECLARE_int32(num_passes);
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_string(config_args);
-
-size_t fcLayerWidth = 1024;
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-int randint(int* data, size_t int_max, size_t size) {
-  srand((size_t)(time(NULL)));
-  if (int_max < size) {
-    return -1;
-  }
-  size_t count = 0;
-  std::map<int, int> tmp;
-  int this_int = 0;
-
-  while (count < size) {
-    this_int = std::rand() % int_max;  // NOLINT
-    if (tmp.find(this_int) == tmp.end()) {
-      tmp[this_int] = 0;
-      count += 1;
-    }
-  }
-
-  if (tmp.size() != size) {
-    return -1;
-  }
-  count = 0;
-  for (auto itr = tmp.begin(); itr != tmp.end(); ++itr) {
-    data[count] = itr->first;
-    count += 1;
-  }
-  return 0;
-}
-
-void calcOutput(ComData& comData,
-                const string configFile,
-                const string configArgs,
-                bool useGpu) {
-  FLAGS_config = configFile;
-  FLAGS_config_args = configArgs;
-  FLAGS_use_gpu = useGpu;
-  FLAGS_init_model_path = "gserver/tests/SelectiveFcTest/model";
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlags(), false);
-
-  comData.parameters = trainer.getGradientMachine()->getParameters();
-
-  auto dataProvider = trainer.getDataProvider();
-  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
-  DataBatch dataBatch;
-  dataProvider->setSkipShuffle();
-  dataProvider->reset();
-  dataProvider->getNextBatch(batchSize, &dataBatch);
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-
-  vector<Argument>& inArgs = dataBatch.getStreams();
-  trainer.getGradientMachine()->start(trainer.getConfig(), nullptr);
-  trainer.getGradientMachine()->forwardBackward(
-      inArgs, &comData.outArgs, PASS_TRAIN);
-  trainer.getGradientMachine()->finish();
-}
-
-void checkMatrix(real* A, real* B, size_t matSize) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  int diffNum = 0;
-  for (size_t i = 0; i < matSize; ++i) {
-    if (std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) ||
-        std::isnan(B[i])) {
-    } else if (fabs(A[i] - B[i]) > err) {
-      diffNum++;
-    }
-  }
-  EXPECT_EQ(0, diffNum);
-}
-
-void checkTranspose(real* matrix,
-                    real* transpose,
-                    size_t width,
-                    size_t matSize) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  size_t height = matSize / width;
-  int diffNum = 0;
-  size_t rowId = 0;
-  size_t colId = 0;
-  for (size_t i = 0; i < matSize; ++i) {
-    if (i % width == 0 && i) {
-      rowId++;
-    }
-    colId = i % width;
-    if (fabs(matrix[i] - transpose[colId * height + rowId]) > err) {
-      diffNum++;
-      LOG(INFO) << i << " diff : " << matrix[i] << "\t"
-                << transpose[colId * height + rowId];
-    }
-  }
-  EXPECT_EQ(0, diffNum);
-}
-
-void compareOutput(ComData& fcData, ComData& selFcData) {
-  vector<Argument> outArgsFc = fcData.outArgs;
-  vector<Argument> outArgsSelfc = selFcData.outArgs;
-
-  // check cost
-  LOG(INFO) << "Check cost";
-  CpuMatrix fcCost(outArgsFc[0].value->getHeight(),
-                   outArgsFc[0].value->getWidth());
-  CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(),
-                      outArgsSelfc[0].value->getWidth());
-  fcCost.copyFrom(*outArgsFc[0].value);
-  selfcCost.copyFrom(*outArgsSelfc[0].value);
-  checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt());
-
-  // check selective fc output and fc output
-  LOG(INFO) << "Compare output of SelectiveFullyConectedLayer "
-            << "with FullyConectedLayer";
-  CpuMatrix fcOut(outArgsFc[1].value->getHeight(),
-                  outArgsFc[1].value->getWidth());
-  CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(),
-                     outArgsSelfc[1].value->getWidth());
-
-  fcOut.copyFrom(*outArgsFc[1].value);
-  selfcOut.copyFrom(*outArgsSelfc[1].value);
-  checkMatrix(fcOut.getData(), selfcOut.getData(), fcOut.getElementCnt());
-
-  // check gradient math
-  vector<ParameterPtr>& fcParam = fcData.parameters;
-  vector<ParameterPtr>& selfcParam = selFcData.parameters;
-  for (size_t i = 0; i < fcParam.size(); ++i) {
-    ParameterPtr p1, p2;
-    p1 = fcParam[i];
-    p2 = selfcParam[i];
-
-    string paramName = p1->getName();
-    LOG(INFO) << "check parameter : " << paramName;
-
-    // check parameter value
-    CpuVector paraValue1(p1->getSize());
-    CpuVector paraValue2(p2->getSize());
-    paraValue1.copyFrom(*p1->getBuf(PARAMETER_VALUE));
-    paraValue2.copyFrom(*p2->getBuf(PARAMETER_VALUE));
-
-    // check gradient
-    CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT));
-    CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT));
-    if (paramName == "rand_fc_param.bias") {
-      checkMatrix(
-          paraValue1.getData(), paraValue2.getData(), paraValue1.getSize());
-      checkMatrix(
-          paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize());
-    } else {
-      checkTranspose(paraValue1.getData(),
-                     paraValue2.getData(),
-                     fcLayerWidth,
-                     paraValue1.getSize());
-      checkTranspose(paraGrad1.getData(),
-                     paraGrad2.getData(),
-                     fcLayerWidth,
-                     paraGrad1.getSize());
-    }
-  }
-}
-
-void compareSparseMulOutput(
-    real* fcOutput,
-    real* selOutput,
-    size_t nnz,
-    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& selCols) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  size_t nnzCount =
-      std::accumulate(selCols->begin(),
-                      selCols->end(),
-                      0UL,
-                      [](size_t a, const std::pair<int*, size_t>& arr) {
-                        return a + arr.second;
-                      });
-  EXPECT_EQ(nnz, nnzCount);
-
-  size_t sampleNum = selCols->size();
-  int diffNum = 0;
-  size_t count = 0;
-  for (size_t i = 0; i < sampleNum; ++i) {
-    for (size_t j = 0; j < (*selCols)[i].second; ++j) {
-      size_t selIdx = (*selCols)[i].first[j];
-      if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) {
-        diffNum++;
-        LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx]
-                  << "\t" << selOutput[count];
-      }
-      count++;
-    }
-  }
-  EXPECT_EQ(0, diffNum);
-}
-
-LayerPtr creatDataLayer(string name,
-                        size_t batchSize,
-                        size_t layerSize,
-                        std::vector<real>& values,
-                        bool useGpu) {
-  LayerConfig dataConfig;
-  dataConfig.set_name(name);
-  dataConfig.set_type("data");
-  dataConfig.set_size(layerSize);
-  LayerPtr layer = LayerPtr(new DataLayer(dataConfig));
-
-  Argument data;
-  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
-  data.value->copyFrom(values.data(), batchSize * layerSize);
-
-  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  dataLayer->setData(data);
-  dataLayer->forward(PASS_TEST);
-  return layer;
-}
-
-ParameterPtr creatParameter(
-    string name, int pid, size_t paraSize, string paramFile, bool useGpu) {
-  ParameterConfig paraConfig;
-  paraConfig.set_name(name);
-  paraConfig.set_size(paraSize);
-
-  ParameterPtr parameter =
-      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ false);
-  parameter->enableType(PARAMETER_VALUE);
-  parameter->randomize();
-  parameter->setID(pid);
-  parameter->load(paramFile);
-  return parameter;
-}
-
-LayerPtr initFcLayer(LayerPtr dataLayer,
-                     LayerConfig layerConfig,
-                     int dataLayerSize,
-                     int fcLayerSize,
-                     string paraName,
-                     string paraFile,
-                     bool useGpu) {
-  LayerMap layerMap;
-  ParameterMap parameterMap;
-
-  layerMap[dataLayer->getName()] = dataLayer;
-  ParameterPtr para = creatParameter(
-      paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu);
-  parameterMap[para->getName()] = para;
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
-  input.set_input_layer_name(dataLayer->getName());
-  input.set_input_parameter_name(paraName);
-
-  LayerPtr testLayer = Layer::create(layerConfig);
-  layerMap[testLayer->getName()] = testLayer;
-
-  testLayer->setNeedGradient(false);
-  testLayer->init(layerMap, parameterMap);
-  return testLayer;
-}
-
-#ifndef PADDLE_TYPE_DOUBLE
-// The parameter file used in fc.conf and selective_fc.conf is float
-TEST(Layer, SelectiveFcLayer_train_dense_mul) {
-  const string& fcConfig = "gserver/tests/SelectiveFcTest/conf/fc.conf";
-  const string& fcConfigArgs =
-      "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
-  const string& selFcConfig =
-      "gserver/tests/SelectiveFcTest/conf/selective_fc.conf";
-  const string& selConfigArgs =
-      "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
-
-  for (auto useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-    if (useGpu) {
-      break;
-    }
-#endif
-    LOG(INFO) << "FullyConnectedLayer forwardBackward()";
-    ComData fcData;
-    calcOutput(fcData, fcConfig, fcConfigArgs, useGpu);
-
-    LOG(INFO) << "SelectiveFullyConnectedLayer forwardBackward()";
-    ComData selFcData;
-    calcOutput(selFcData, selFcConfig, selConfigArgs, useGpu);
-    compareOutput(fcData, selFcData);
-  }
-}
-#endif  // PADDLE_TYPE_DOUBLE
-
-void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
-                                        bool useGpu) {
-  FLAGS_use_gpu = useGpu;
-  size_t batchSize = 100;
-  size_t dataLayerSize = 512;
-  std::vector<real> values(batchSize * dataLayerSize);
-  for (size_t j = 0; j < batchSize * dataLayerSize; ++j) {
-    values[j] = std::rand() / real(RAND_MAX);
-  }
-  LayerPtr dataLayer =
-      creatDataLayer("data", batchSize, dataLayerSize, values, useGpu);
-
-  const string& selfcParaFile =
-      "gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose";
-  const string& selfcParaName = "rand_fc_param.w.transpose";
-
-  std::shared_ptr<SelectiveFullyConnectedLayer> selfcLayer =
-      std::dynamic_pointer_cast<SelectiveFullyConnectedLayer>(
-          initFcLayer(dataLayer,
-                      config,
-                      dataLayerSize,
-                      fcLayerWidth,
-                      selfcParaName,
-                      selfcParaFile,
-                      useGpu));
-
-  // create selected columns
-  std::shared_ptr<std::vector<std::pair<int*, size_t>>> selCols(
-      new std::vector<std::pair<int*, size_t>>(batchSize));
-  size_t maxNNZ = 30;
-  srand((size_t)(time(NULL)));
-  int total = 0;
-  while (total == 0) {
-    for (size_t i = 0; i < batchSize; ++i) {
-      size_t num = std::rand() % maxNNZ;
-      int* data = new int[num];
-      randint(data, fcLayerWidth, num);
-      (*selCols)[i] = std::make_pair(data, num);
-      total += num;
-    }
-  }
-  selfcLayer->fillSelectiveData(selCols);
-  selfcLayer->forward(PASS_TEST);
-
-  MatrixPtr outMatSelfc = selfcLayer->getOutputValue();
-  CpuSparseMatrixPtr cpuOutMatSelfc(
-      new CpuSparseMatrix(outMatSelfc->getHeight(),
-                          outMatSelfc->getWidth(),
-                          outMatSelfc->getElementCnt()));
-  cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
-#ifdef PADDLE_WITH_CUDA
-  if (useGpu) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-#endif
-  real* outValueSelfc = cpuOutMatSelfc->getValue();
-  size_t nnz = cpuOutMatSelfc->getElementCnt();
-
-  const string& fcParaFile =
-      "gserver/tests/SelectiveFcTest/model/rand_fc_param.w";
-  const string& fcParaName = "rand_fc_param.w";
-  LayerConfig fcLayerConfig;
-  fcLayerConfig.set_name("fc_layer");
-  fcLayerConfig.set_type("fc");
-  fcLayerConfig.set_active_type("linear");
-  fcLayerConfig.set_size(fcLayerWidth);
-
-  LayerPtr fcLayer = initFcLayer(dataLayer,
-                                 fcLayerConfig,
-                                 dataLayerSize,
-                                 fcLayerWidth,
-                                 fcParaName,
-                                 fcParaFile,
-                                 useGpu);
-  fcLayer->forward(PASS_TEST);
-
-  MatrixPtr outMatFc = fcLayer->getOutputValue();
-  MatrixPtr cpuOutMatFc(
-      new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
-  cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
-#ifdef PADDLE_WITH_CUDA
-  if (useGpu) {
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  }
-#endif
-  real* outValueFc = cpuOutMatFc->getData();
-
-  compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols);
-  for (size_t i = 0; i < batchSize; ++i) {
-    delete[](*selCols)[i].first;
-  }
-}
-
-#ifndef PADDLE_TYPE_DOUBLE
-// The parameter file used in testSelectiveFcLayerTrainSparseMul is float
-TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
-  LayerConfig selLayerConfig;
-  selLayerConfig.set_name("sel_fc");
-  selLayerConfig.set_type("selective_fc");
-  selLayerConfig.set_active_type("linear");
-  selLayerConfig.set_has_selected_colums(false);
-  selLayerConfig.set_selective_fc_pass_generation(true);
-  selLayerConfig.set_size(fcLayerWidth);
-
-  testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
-#ifdef PADDLE_WITH_CUDA
-  testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
-#endif
-}
-#endif  // PADDLE_TYPE_DOUBLE
-
-// TODO(dangqingqing) test multi threads after support in matrix
-// TEST(Layer, SelectiveFcLayer_train_sparse_mul_parallel) {
-//   LayerConfig selLayerConfig;
-//   selLayerConfig.set_name("sel_fc");
-//   selLayerConfig.set_type("selective_fc");
-//   selLayerConfig.set_active_type("linear");
-//   selLayerConfig.set_has_selected_colums(false);
-//   selLayerConfig.set_selective_fc_pass_generation(true);
-//   selLayerConfig.set_selective_fc_parallel_plain_mul_thread_num(10);
-//   selLayerConfig.set_selective_fc_full_mul_ratio(1000);
-//   selLayerConfig.set_size(fcLayerWidth);
-//   SelectiveFcLayer_test(selLayerConfig, false);
-// }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
diff --git a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
deleted file mode 100644
index 406ca63b6ee030a0882e05294d8d355d84531385..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/DataLayer.h"
-
-#include "LayerGradUtil.h"
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(thread_local_rand_use_global_seed);
-
-const int MAX_SEQ_NUM = 17;
-const int MAX_SEQ_LEN = 23;
-const int MAX_BEAM_SIZE = 13;
-
-const size_t SEED = (size_t)(time(NULL));
-
-vector<real> randSampling(real range, int n) {
-  CHECK_GE(range, n);
-  vector<real> num(range);
-  iota(begin(num), end(num), 0.);
-  if (range == n) return num;
-
-  random_shuffle(begin(num), end(num));
-  num.resize(n);
-  sort(begin(num), end(num));
-  return num;
-}
-
-void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
-  seqStartPos.resize(1, 0);
-  subSeqStartPos.resize(1, 0);
-
-  srand(SEED);
-  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
-  for (int i = 0; i < seqNum; ++i) {
-    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
-    for (int j = 0; j < subSeqNum; ++j)
-      subSeqStartPos.push_back(subSeqStartPos.back() +
-                               (1 + (rand() % MAX_SEQ_LEN)));
-    seqStartPos.push_back(subSeqStartPos.back());
-  }
-}
-
-/*
-  generate start indices according to sequence start positions.
- */
-void genStarts(vector<int>& seqStartPos,
-               vector<vector<real>>& starts,
-               size_t beamSize) {
-  starts.clear();
-  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
-
-  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
-    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
-    vector<real> randStarts =
-        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
-    copy(begin(randStarts), end(randStarts), begin(starts[i]));
-  }
-}
-
-/*
-  generate end indices according to sequence start positions and start indices.
- */
-void genEnds(vector<int>& seqStartPos,
-             vector<vector<real>>& starts,
-             vector<vector<real>>& ends,
-             size_t beamSize) {
-  CHECK_EQ(seqStartPos.size() - 1, starts.size());
-  ends.clear();
-  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
-
-  for (size_t i = 0; i < starts.size(); ++i) {
-    for (size_t j = 0; j < starts[i].size(); ++j) {
-      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
-      CHECK_GE(seqLen - 1, starts[i][j]);
-      if (starts[i][j] == -1.) break;
-      if (starts[i][j] == (seqLen - 1)) {
-        ends[i][j] = starts[i][j];
-      } else {
-        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
-      }
-    }
-  }
-}
-
-void genTestData(vector<int>& seqStartPos,
-                 vector<int>& subSeqStartPos,
-                 vector<vector<real>>& starts,
-                 vector<vector<real>>& ends,
-                 bool hasSubseq) {
-  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
-  genSeqInfo(seqStartPos, subSeqStartPos);
-
-  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
-  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
-}
-
-template <typename T>
-void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
-  size_t totalSize{0};
-  for (auto const& items : inVec) totalSize += items.size();
-  outVec.reserve(totalSize);
-
-  for (auto& items : inVec)
-    move(items.begin(), items.end(), back_inserter(outVec));
-}
-
-void testSeqSliceLayer(bool hasSubseq,
-                       bool useGpu,
-                       vector<int>& seqStartPos,
-                       vector<int>& subSeqStartPos,
-                       vector<vector<real>>& starts,
-                       vector<vector<real>>& ends) {
-  // layer size is not crutial for this layer,
-  // so here use a small layer size in the unittest.
-  const size_t layerSize{4};
-  TestConfig config;
-  config.layerConfig.set_type("seq_slice");
-  config.layerConfig.set_size(layerSize);
-
-  // add the first input
-  MatrixPtr seqInputPtr =
-      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
-                     layerSize,
-                     false,
-                     false);
-  seqInputPtr->randomizeUniform();
-
-  if (hasSubseq) {
-    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
-                                "seq_input",
-                                seqInputPtr,
-                                seqStartPos,
-                                subSeqStartPos});
-  } else {
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
-  }
-  config.layerConfig.add_inputs();
-
-  // add start indices
-  if (starts.size()) {
-    vector<real> startsToVec;
-    flatten2dVector(starts, startsToVec);
-
-    MatrixPtr startMatrixPtr =
-        Matrix::create(starts.size(), starts[0].size(), false, false);
-    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
-
-    config.inputDefs.push_back(
-        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_select_first(true);
-  }
-
-  // add end indices
-  if (ends.size()) {
-    vector<real> endsToVec;
-    flatten2dVector(ends, endsToVec);
-
-    MatrixPtr endMatrixPtr =
-        Matrix::create(ends.size(), ends[0].size(), false, false);
-    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
-
-    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
-    config.layerConfig.add_inputs();
-    config.layerConfig.set_select_first(false);
-  }
-
-  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
-}
-
-TEST(Layer, SeqSliceLayer) {
-  vector<int> seqStartPos;
-  vector<int> subSeqStartPos;
-  vector<vector<real>> starts;
-  vector<vector<real>> ends;
-
-  std::vector<bool> mode = {false};
-#ifdef PADDLE_WITH_CUDA
-  mode.push_back(true);
-#endif
-  genSeqInfo(seqStartPos, subSeqStartPos);
-  for (bool hasSubseq : {true, false}) {
-    LOG(INFO) << "hasSubSeq : " << hasSubseq;
-    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
-    for (bool useGpu : mode) {
-      vector<vector<real>> tmp;
-      testSeqSliceLayer(
-          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
-      testSeqSliceLayer(
-          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
-      testSeqSliceLayer(
-          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  FLAGS_thread_local_rand_use_global_seed = true;
-  srand(1);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
deleted file mode 100644
index f2299d7da2a51e4015793ae531af002aed1f6b2f..0000000000000000000000000000000000000000
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/Version.h>
-#include "ModelConfig.pb.h"
-#include "paddle/gserver/layers/CTCLayer.h"
-#include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/Layer.h"
-#include "paddle/gserver/layers/WarpCTCLayer.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_bool(use_gpu);
-
-const real* getData(const Matrix& matrix) {
-  if (matrix.useGpu()) {
-    MatrixPtr cpuMatrix = Matrix::create(
-        matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false);
-    cpuMatrix->copyFrom(matrix);
-    return cpuMatrix->getData();
-  } else {
-    return matrix.getData();
-  }
-}
-
-int checkError(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
-  CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
-  CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-
-  const real* data1 = getData(matrix1);
-  const real* data2 = getData(matrix2);
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-  return count;
-}
-
-void initArgument(size_t batchSize,
-                  int layerSize,
-                  bool useGpu,
-                  Argument& data) {
-  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
-  data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
-  data.value->randomizeUniform();
-  data.value->add(-0.5);
-  data.grad->zeroMem();
-
-  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
-}
-
-LayerPtr createDataLayer(
-    string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) {
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("data");
-  layerConfig.set_size(layerSize);
-  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
-
-  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  dataLayer->setData(data);
-  dataLayer->forward(PASS_GC);
-
-  return layer;
-}
-
-LayerPtr createLabelLayer(string name,
-                          size_t batchSize,
-                          size_t numClasses,
-                          bool useGpu) {
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("data");
-  layerConfig.set_size(1);
-  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
-
-  Argument data;
-  data.ids = IVector::create(batchSize, useGpu);
-  data.ids->rand(numClasses - 1);
-
-  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
-
-  DataLayerPtr labelLayer = std::dynamic_pointer_cast<DataLayer>(layer);
-  labelLayer->setData(data);
-  labelLayer->forward(PASS_GC);
-
-  return layer;
-}
-
-LayerPtr createCTCLayer(string name,
-                        size_t numClasses,
-                        bool useGpu,
-                        bool normByTimes,
-                        LayerPtr dataLayer,
-                        LayerPtr labelLayer) {
-  LayerMap layerMap;
-  layerMap[dataLayer->getName()] = dataLayer;
-  layerMap[labelLayer->getName()] = labelLayer;
-
-  ParameterMap parameterMap;
-
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("ctc");
-  layerConfig.set_size(numClasses);
-  layerConfig.set_norm_by_times(normByTimes);
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
-  input0.set_input_layer_name(dataLayer->getName());
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
-  input1.set_input_layer_name(labelLayer->getName());
-
-  LayerPtr layer = LayerPtr(new CTCLayer(layerConfig));
-  layerMap[layer->getName()] = layer;
-  layer->init(layerMap, parameterMap);
-
-  ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
-
-  softmaxActivation->forward(dataLayer->getOutput()).check();
-  layer->forward(PASS_GC);
-
-  layer->backward();
-  softmaxActivation->backward(dataLayer->getOutput()).check();
-
-  return layer;
-}
-
-LayerPtr createWarpCTCLayer(string name,
-                            size_t numClasses,
-                            bool useGpu,
-                            bool normByTimes,
-                            LayerPtr dataLayer,
-                            LayerPtr labelLayer) {
-  LayerMap layerMap;
-  layerMap[dataLayer->getName()] = dataLayer;
-  layerMap[labelLayer->getName()] = labelLayer;
-
-  ParameterMap parameterMap;
-
-  LayerConfig layerConfig;
-  layerConfig.set_name(name);
-  layerConfig.set_type("warp_ctc");
-  layerConfig.set_size(numClasses);
-  layerConfig.set_blank(numClasses - 1);
-  layerConfig.set_norm_by_times(normByTimes);
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
-  input0.set_input_layer_name(dataLayer->getName());
-
-  layerConfig.add_inputs();
-  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
-  input1.set_input_layer_name(labelLayer->getName());
-
-  LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig));
-  layerMap[layer->getName()] = layer;
-  layer->init(layerMap, parameterMap);
-
-  layer->forward(PASS_GC);
-  layer->backward();
-
-  return layer;
-}
-
-TEST(Layer, WarpCTCLayer) {
-  for (auto layerSize : {10, 64}) {
-    for (auto batchSize : {1, 10, 32}) {
-      for (auto normByTimes : {false, true}) {
-        for (auto useGpu : {false, true}) {
-#ifndef PADDLE_WITH_CUDA
-          if (useGpu) continue;
-#endif
-          LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
-                    << " normByTimes = " << normByTimes << " useGpu=" << useGpu;
-
-          FLAGS_use_gpu = useGpu;
-
-          Argument data0;
-          initArgument(batchSize, layerSize, useGpu, data0);
-
-          Argument data1;
-          data1.resizeAndCopyFrom(data0);
-
-          LayerPtr dataLayer0 =
-              createDataLayer("data", batchSize, layerSize, useGpu, data0);
-          LayerPtr dataLayer1 =
-              createDataLayer("data", batchSize, layerSize, useGpu, data1);
-
-          LayerPtr labelLayer =
-              createLabelLayer("label", batchSize, layerSize, useGpu);
-
-          LayerPtr warpctcLayer = createWarpCTCLayer(
-              "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer);
-          LayerPtr ctcLayer = createCTCLayer(
-              "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer);
-
-          /// Check cost
-          LOG(INFO) << "Check cost: "
-                    << checkError(*(warpctcLayer->getOutput().value),
-                                  *(ctcLayer->getOutput().value))
-                    << " different elements.";
-
-          /// Check gradients
-          LOG(INFO) << "Check gradients: "
-                    << checkError(*(dataLayer0->getOutput().grad),
-                                  *(dataLayer1->getOutput().grad))
-                    << " different elements";
-        }
-      }
-    }
-  }
-}
diff --git a/paddle/legacy/api/Arguments.cpp b/paddle/legacy/api/Arguments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7bb5a6f75b9a8ab800fc74c6cc01c0b104ccdd5e
--- /dev/null
+++ b/paddle/legacy/api/Arguments.cpp
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+
+#include "paddle/legacy/parameter/Argument.h"
+
+size_t Arguments::getSlotNum() const { return m->outputs.size(); }
+
+Arguments* Arguments::createArguments(size_t slotNum) {
+  auto args = new Arguments();
+  args->m->outputs.resize(slotNum);
+  return args;
+}
+
+void Arguments::resize(size_t slotNum) { m->outputs.resize(slotNum); }
+
+Arguments::Arguments() : m(new ArgumentsPrivate()) {}
+
+Arguments::~Arguments() { delete m; }
+
+Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
+  auto p = (std::vector<paddle::Argument>*)(ptr);
+  auto args = new Arguments();
+  args->m->outputs = *p;
+  return args;
+}
+
+Arguments* Arguments::createByPaddleArgument(const void* ptr) {
+  auto p = (paddle::Argument*)(ptr);
+  auto args = new Arguments();
+  args->m->outputs.push_back(*p);
+  return args;
+}
+
+Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return Matrix::createByPaddleMatrixPtr(&a.value);
+}
+
+Matrix* Arguments::getSlotGrad(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return Matrix::createByPaddleMatrixPtr(&a.grad);
+}
+
+IVector* Arguments::getSlotIds(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return IVector::createByPaddleVectorPtr(&a.ids);
+}
+
+Matrix* Arguments::getSlotIn(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return Matrix::createByPaddleMatrixPtr(&a.in);
+}
+
+void Arguments::setSlotValue(size_t idx, Matrix* mat) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  a.value = m->cast<paddle::Matrix>(mat->getSharedPtr());
+}
+
+void Arguments::setSlotGrad(size_t idx, Matrix* mat) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  a.grad = m->cast<paddle::Matrix>(mat->getSharedPtr());
+}
+
+void Arguments::setSlotIn(size_t idx, Matrix* mat) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  a.in = m->cast<paddle::Matrix>(mat->getSharedPtr());
+}
+
+void Arguments::setSlotIds(size_t idx, IVector* vec) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
+  a.ids = v;
+}
+
+template <typename T1>
+static inline void doCopyFromSafely(std::shared_ptr<T1>& dest,
+                                    std::shared_ptr<T1>& src) {
+  if (src) {
+    if (dest) {
+      dest->copyFrom(*src);
+    } else {
+      dest = src;
+    }
+  }
+}
+
+IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const
+    throw(RangeError) {
+  auto& a = m->getArg(idx);
+  if (a.sequenceStartPositions) {
+    return IVector::createByPaddleVectorPtr(
+        &a.sequenceStartPositions->getMutableVector(false));
+  } else {
+    return nullptr;
+  }
+}
+
+IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const
+    throw(RangeError) {
+  auto& a = m->getArg(idx);
+  if (a.subSequenceStartPositions) {
+    return IVector::createByPaddleVectorPtr(
+        &a.subSequenceStartPositions->getMutableVector(false));
+  } else {
+    return nullptr;
+  }
+}
+
+void Arguments::setSlotSequenceStartPositions(size_t idx,
+                                              IVector* vec) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
+  a.sequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);
+}
+
+void Arguments::setSlotSubSequenceStartPositions(
+    size_t idx, IVector* vec) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
+  a.subSequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);
+}
+
+IVector* Arguments::getSlotSequenceDim(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return IVector::createByPaddleVectorPtr(&a.cpuSequenceDims);
+}
+
+void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
+}
+
+float Arguments::sum() const { return paddle::Argument::sum(m->outputs); }
+
+int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return a.getBatchSize();
+}
+
+void Arguments::setSlotFrameHeight(size_t idx, size_t h) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  a.setFrameHeight(h);
+}
+
+void Arguments::setSlotFrameWidth(size_t idx, size_t w) throw(RangeError) {
+  auto& a = m->getArg(idx);
+  a.setFrameWidth(w);
+}
+
+size_t Arguments::getSlotFrameHeight(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return a.getFrameHeight();
+}
+
+size_t Arguments::getSlotFrameWidth(size_t idx) const throw(RangeError) {
+  auto& a = m->getArg(idx);
+  return a.getFrameWidth();
+}
+
+void* Arguments::getInternalArgumentsPtr() const { return &m->outputs; }
diff --git a/paddle/legacy/api/CMakeLists.txt b/paddle/legacy/api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..06e1f5d5f0884efabfcdf917ca5c35d94ad5dce9
--- /dev/null
+++ b/paddle/legacy/api/CMakeLists.txt
@@ -0,0 +1,120 @@
+set(API_SOURCES
+    Arguments.cpp
+    ConfigParser.cpp
+    Evaluator.cpp
+    GradientMachine.cpp
+    Matrix.cpp
+    Parameter.cpp
+    ParameterOptimizer.cpp
+    ParameterUpdater.cpp
+    SequenceGenerator.cpp
+    Trainer.cpp
+    Util.cpp
+    Vector.cpp)
+set(API_HEADER
+    PaddleAPI.h
+    Internal.h)
+
+add_library(paddle_api STATIC ${API_SOURCES})
+add_dependencies(paddle_api paddle_proto paddle_trainer_lib)
+
+INCLUDE(${SWIG_USE_FILE})
+INCLUDE_DIRECTORIES(${PADDLE_SOURCE_DIR}/paddle)
+
+FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
+
+SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
+
+SET(SWIG_NEED_FLAGS
+    -ftls-model=global-dynamic
+    -Wno-parentheses-equality
+    -Wno-self-assign
+    -Wno-maybe-uninitialized
+    -Wno-missing-field-initializers)
+  FOREACH(flag ${SWIG_NEED_FLAGS})
+  safe_set_cxxflag(SWIG_CXX_FLAGS ${flag})
+ENDFOREACH()
+
+SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}")
+
+SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
+    paddle_parameter
+    paddle_function
+    paddle_math
+    paddle_utils
+    paddle_gserver
+    paddle_pserver
+    paddle_api
+    paddle_cuda
+    paddle_trainer_lib
+    paddle_network
+    paddle_proto
+    ${external_project_dependencies}
+    ${RDMA_LIBS}
+)
+
+IF(APPLE)
+    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
+ELSE(APPLE)
+    SET(START_GROUP "-Xlinker -start-group")
+    SET(END_GROUP "-Xlinker -end-group")
+    SET(ARCHIVE_START "-Wl,--whole-archive")
+    SET(ARCHIVE_END "-Wl,--no-whole-archive")
+ENDIF(APPLE)
+
+SWIG_ADD_MODULE(swig_paddle python Paddle.i)
+SWIG_LINK_LIBRARIES(swig_paddle
+    ${MACOS_LD_FLAGS}
+    ${START_GROUP}
+    ${ARCHIVE_START}
+    paddle_gserver
+    paddle_function
+    ${METRIC_LIBS}
+    ${ARCHIVE_END}
+    paddle_pserver
+    paddle_trainer_lib
+    paddle_network
+    paddle_parameter
+    paddle_optimizer
+    paddle_math
+    paddle_utils
+    paddle_proto
+    paddle_cuda
+    paddle_api
+    ${CMAKE_DL_LIBS}
+    ${EXTERNAL_LIBS}
+    ${CMAKE_THREAD_LIBS_INIT}
+    ${RDMA_LD_FLAGS}
+    ${START_END}
+)
+
+add_custom_command(OUTPUT ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_BINARY_DIR}/python/py_paddle
+    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/.timestamp
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
+    DEPENDS _swig_paddle
+)
+
+# TODO(yuyang18) : make wheel name calculated by cmake
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so)
+
+if(WITH_TESTING)
+    IF(NOT PY_PIP_FOUND)
+        SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
+        ExternalProject_Add(pip
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            GIT_REPOSITORY      https://github.com/pypa/pip.git
+            GIT_TAG             9.0.1
+            PREFIX              ${PIP_SOURCES_DIR}
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+            BUILD_IN_SOURCE     1
+            #DEPENDS python setuptools python_api_wheel
+        )
+    ENDIF()
+    add_subdirectory(test)
+endif()
diff --git a/paddle/legacy/api/ConfigParser.cpp b/paddle/legacy/api/ConfigParser.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..016d6da4e2e4ce888527fe9b61a163056d7729eb
--- /dev/null
+++ b/paddle/legacy/api/ConfigParser.cpp
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+#include "paddle/legacy/trainer/Trainer.h"
+
+struct ParameterConfigPrivate {
+  paddle::ParameterPtr parameter;
+  paddle::ParameterConfig config;
+
+  inline paddle::ParameterConfig* getConfigPtr() {
+    if (parameter != nullptr) {
+      auto& conf = parameter->getConfig();
+      return const_cast<paddle::ParameterConfig*>(&conf);
+    } else {
+      return &config;
+    }
+  }
+};
+
+TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {}
+
+TrainerConfig::~TrainerConfig() { delete m; }
+
+TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
+    const std::string& confPath) {
+  LOG(INFO) << "load trainer config from " << confPath;
+  auto conf = std::make_shared<paddle::TrainerConfigHelper>(confPath);
+  auto retv = new TrainerConfig();
+  retv->m->conf = conf;
+  return retv;
+}
+
+TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
+  auto retv = new TrainerConfig();
+  paddle::TrainerConfig trainerConfigProto;
+  auto conf = std::make_shared<paddle::TrainerConfigHelper>(trainerConfigProto);
+  CHECK(conf->getMutableConfig().ParseFromString(str));
+  retv->m->conf = conf;
+  return retv;
+}
+
+ModelConfig::ModelConfig() : m(new ModelConfigPrivate()) {}
+
+ModelConfig::~ModelConfig() { delete m; }
+
+ModelConfig* TrainerConfig::getModelConfig() const {
+  auto retv = new ModelConfig();
+  retv->m->conf = m->conf;
+  return retv;
+}
+
+ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
+
+ParameterConfig::~ParameterConfig() { delete m; }
+
+ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
+    void* ptr) {
+  auto& p = *(paddle::ParameterPtr*)(ptr);
+  if (p != nullptr) {
+    auto conf = new ParameterConfig();
+    conf->m->parameter = p;
+    return conf;
+  } else {
+    return nullptr;
+  }
+}
+
+ParameterConfig* ParameterConfig::createParameterConfigFromParameterPtr(
+    void* ptr) {
+  auto& p = *(paddle::Parameter*)(ptr);
+  auto conf = new ParameterConfig();
+  conf->m->config = p.getConfig();
+  return conf;
+}
+
+std::string ParameterConfig::toProtoString() const {
+  return m->getConfigPtr()->SerializeAsString();
+}
+
+void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }
+
+OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}
+
+OptimizationConfig::~OptimizationConfig() { delete m; }
+
+std::string OptimizationConfig::toProtoString() {
+  return m->getConfig().SerializeAsString();
+}
+
+OptimizationConfig* TrainerConfig::getOptimizationConfig() const {
+  auto opt_config = new OptimizationConfig();
+  opt_config->m->trainer_config = m->conf;
+  return opt_config;
+}
+
+OptimizationConfig* OptimizationConfig::createFromProtoString(
+    const std::string& str) {
+  auto conf = new OptimizationConfig();
+  conf->m->config.ParseFromString(str);
+  return conf;
+}
diff --git a/paddle/api/Evaluator.cpp b/paddle/legacy/api/Evaluator.cpp
similarity index 100%
rename from paddle/api/Evaluator.cpp
rename to paddle/legacy/api/Evaluator.cpp
diff --git a/paddle/legacy/api/GradientMachine.cpp b/paddle/legacy/api/GradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5ad2fe11a4c668a318f76492f57091f386183986
--- /dev/null
+++ b/paddle/legacy/api/GradientMachine.cpp
@@ -0,0 +1,196 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+
+#include "Internal.h"
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+
+std::vector<int> GradientMachine::defaultParamTypes = {
+    PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
+
+GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
+
+GradientMachine::~GradientMachine() { delete m; }
+
+GradientMachine* GradientMachine::createFromPaddleModelPtr(
+    const void* confPtr,
+    GradientMatchineCreateMode mode,
+    const std::vector<int>& types) {
+  auto& conf = *(const paddle::ModelConfig*)(confPtr);
+  std::vector<ParameterType> realTypes;
+  staticCastVector(&realTypes, types);
+  auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes);
+  auto machinePtr = std::shared_ptr<paddle::GradientMachine>(machineRawPtr);
+  if (machinePtr != nullptr) {
+    auto machine = new GradientMachine();
+    machine->m->machine = machinePtr;
+    return machine;
+  } else {
+    return nullptr;
+  }
+}
+
+GradientMachine* GradientMachine::createByConfigProtoStr(
+    const std::string& protoStr,
+    GradientMatchineCreateMode mode,
+    const std::vector<int>& types) {
+  paddle::ModelConfig conf;
+  conf.ParseFromString(protoStr);
+  if (conf.IsInitialized()) {
+    return GradientMachine::createFromPaddleModelPtr(&conf, mode, types);
+  } else {
+    return nullptr;
+  }
+}
+
+GradientMachine* GradientMachine::createByModelConfig(
+    ModelConfig* conf,
+    GradientMatchineCreateMode mode,
+    const std::vector<int>& types) {
+  auto confPtr = &conf->m->conf->getModelConfig();
+  return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
+}
+
+void GradientMachine::start() { m->machine->start(); }
+
+void GradientMachine::finish() { m->machine->finish(); }
+
+void GradientMachine::onPassEnd() { m->machine->onPassEnd(); }
+
+void GradientMachine::prefetch(const Arguments& inArgs) {
+  auto& in =
+      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
+  m->machine->prefetch(in);
+}
+
+void GradientMachine::forward(const Arguments& inArgs,
+                              Arguments* outArgs,
+                              PassType passType) {
+  auto& in =
+      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
+  auto& out = m->cast<std::vector<paddle::Argument>>(
+      outArgs->getInternalArgumentsPtr());
+  paddle::PassType pt = (paddle::PassType)(passType);
+  m->machine->forward(in, &out, pt);
+}
+
+UpdateCallback::~UpdateCallback() {}
+
+void UpdateCallback::apply(Parameter* p) {
+  // UNUSED(p);
+}
+
+class UpdateCallbackWrapper {
+ public:
+  explicit UpdateCallbackWrapper(const UpdateCallback& callback)
+      : callback(const_cast<UpdateCallback&>(callback)) {}
+
+  void operator()(paddle::Parameter* param) {
+    auto p = Parameter::createFromRawPtr(&param);
+    // @TODO Use Stack variable instead.
+    callback.apply(p);
+    delete p;
+  }
+
+ private:
+  UpdateCallback& callback;
+};
+
+void GradientMachine::backward(const UpdateCallback& callback) {
+  m->machine->backward(UpdateCallbackWrapper(callback));
+}
+
+void GradientMachine::forwardBackward(const Arguments& inArgs,
+                                      Arguments* outArgs,
+                                      PassType passType,
+                                      const UpdateCallback& callback) {
+  auto& in =
+      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
+  auto& out = m->cast<std::vector<paddle::Argument>>(
+      outArgs->getInternalArgumentsPtr());
+  paddle::PassType pt = (paddle::PassType)(passType);
+  m->machine->forwardBackward(in, &out, pt, UpdateCallbackWrapper(callback));
+}
+
+void GradientMachine::loadParameters(const std::string& path) {
+  m->machine->loadParameters(path);
+}
+
+size_t GradientMachine::getParameterSize() const {
+  return m->machine->getParameters().size();
+}
+
+Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
+  auto params = m->machine->getParameters();
+  if (i < params.size()) {
+    return Parameter::createFromSharedPtr(&m->machine->getParameters()[i]);
+  } else {
+    throw RangeError();
+  }
+}
+
+size_t GradientMachine::getNonStaticParameterSize() const {
+  return m->machine->getNonStaticParameters().size();
+}
+
+Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) {
+  auto params = m->machine->getNonStaticParameters();
+  if (i < params.size()) {
+    return Parameter::createFromSharedPtr(
+        &m->machine->getNonStaticParameters()[i]);
+  } else {
+    throw RangeError();
+  }
+}
+
+void GradientMachine::randParameters() { m->machine->randParameters(); }
+
+Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
+    throw(UnsupportError) {
+  auto nn = m->machine;
+  if (nn) {
+    auto arg = nn->getLayerOutput(layerName);
+    return Arguments::createByPaddleArgument(&arg);
+  } else {
+    throw UnsupportError();
+  }
+}
+
+SequenceGenerator* GradientMachine::asSequenceGenerator(
+    const std::vector<std::string>& dict,
+    size_t begin_id,
+    size_t end_id,
+    size_t max_length,
+    size_t beam_size) {
+  SequenceGenerator* r =
+      SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
+  r->setDict(dict);
+  r->setBos(begin_id);
+  r->setEos(end_id);
+  r->setMaxLength(max_length);
+  r->setBeamSize(beam_size);
+  return r;
+}
+
+Evaluator* GradientMachine::makeEvaluator() {
+  auto ev = new Evaluator();
+  ev->m->rawPtr = m->machine->makeEvaluator();
+  return ev;
+}
+
+void GradientMachine::eval(Evaluator* evaluator) {
+  m->machine->eval(evaluator->m->rawPtr);
+}
diff --git a/paddle/api/Internal.h b/paddle/legacy/api/Internal.h
similarity index 100%
rename from paddle/api/Internal.h
rename to paddle/legacy/api/Internal.h
diff --git a/paddle/legacy/api/Matrix.cpp b/paddle/legacy/api/Matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8862d0ea92c92a2608b49c6b1315badae9e9fd98
--- /dev/null
+++ b/paddle/legacy/api/Matrix.cpp
@@ -0,0 +1,317 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/math/Matrix.h"
+#include <cstring>
+#include <iostream>
+#include "PaddleAPI.h"
+#include "paddle/legacy/math/CpuSparseMatrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+
+struct MatrixPrivate {
+  std::shared_ptr<paddle::Matrix> mat;
+};
+
+Matrix::Matrix() : m(new MatrixPrivate()) {}
+
+Matrix* Matrix::createByPaddleMatrixPtr(void* sharedPtr) {
+  auto* mat = reinterpret_cast<paddle::MatrixPtr*>(sharedPtr);
+  if ((*mat) != nullptr) {
+    auto m = new Matrix();
+    m->m->mat = *mat;
+    return m;
+  } else {
+    return nullptr;
+  }
+}
+
+Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
+  auto m = new Matrix();
+  m->m->mat = paddle::Matrix::create(height, width, useGpu);
+  m->m->mat->zero();
+  return m;
+}
+
+Matrix* Matrix::createDense(const std::vector<float>& data,
+                            size_t height,
+                            size_t width,
+                            bool useGpu) {
+  auto m = new Matrix();
+  m->m->mat = paddle::Matrix::create(height, width, useGpu);
+  m->m->mat->copyFrom(data.data(), data.size());
+  return m;
+}
+
+Matrix* Matrix::createDenseFromNumpy(float* data,
+                                     int dim1,
+                                     int dim2,
+                                     bool copy,
+                                     bool useGpu) throw(UnsupportError) {
+  if (useGpu) {
+    /// Gpu mode only supports copy=True
+    if (!copy) {
+      throw UnsupportError("Gpu mode only supports copy=True");
+    }
+    return Matrix::createGpuDenseFromNumpy(data, dim1, dim2);
+  } else {
+    return Matrix::createCpuDenseFromNumpy(data, dim1, dim2, copy);
+  }
+}
+
+Matrix* Matrix::createCpuDenseFromNumpy(float* data,
+                                        int dim1,
+                                        int dim2,
+                                        bool copy) {
+  auto m = new Matrix();
+  if (copy) {
+    m->m->mat = paddle::Matrix::create(dim1, dim2);
+    m->m->mat->copyFrom(data, dim1 * dim2);
+  } else {
+    m->m->mat = paddle::Matrix::create(data, dim1, dim2, false);
+  }
+  return m;
+}
+
+Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
+  auto m = new Matrix();
+  m->m->mat = paddle::Matrix::create(dim1, dim2, false, true);
+  m->m->mat->copyFrom(data, dim1 * dim2);
+  return m;
+}
+
+Matrix* Matrix::createSparse(size_t height,
+                             size_t width,
+                             size_t nnz,
+                             bool isNonVal,
+                             bool isTrans,
+                             bool useGpu) {
+  auto m = new Matrix();
+  m->m->mat = paddle::Matrix::createSparseMatrix(
+      height,
+      width,
+      nnz,
+      isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+      isTrans,
+      useGpu);
+  return m;
+}
+
+Matrix::~Matrix() { delete m; }
+
+size_t Matrix::getHeight() const { return m->mat->getHeight(); }
+
+size_t Matrix::getWidth() const { return m->mat->getWidth(); }
+
+float Matrix::get(size_t x, size_t y) const throw(RangeError) {
+  if (x > this->getWidth() || y > this->getHeight()) {
+    RangeError e;
+    throw e;
+  }
+  return m->mat->getElement(x, y);
+}
+
+void Matrix::set(size_t x, size_t y, float val) throw(RangeError,
+                                                      UnsupportError) {
+  if (x > this->getWidth() || y > this->getHeight()) {
+    RangeError e;
+    throw e;
+  }
+  auto rawMat = m->mat.get();
+  if (auto cDenseMat = dynamic_cast<paddle::CpuMatrix*>(rawMat)) {
+    *(cDenseMat->getData() + x + y * cDenseMat->getWidth()) = val;
+  } else {
+    UnsupportError e;
+    throw e;
+  }
+}
+
+bool Matrix::isSparse() const {
+  auto raw_mat = m->mat.get();
+  return dynamic_cast<paddle::CpuSparseMatrix*>(raw_mat) != nullptr ||
+         dynamic_cast<paddle::GpuSparseMatrix*>(raw_mat) != nullptr;
+}
+
+SparseValueType Matrix::getSparseValueType() const throw(UnsupportError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr) {
+    return (SparseValueType)cpuSparseMat->getValueType();
+  } else {
+    auto gpuSparseMat =
+        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
+    if (gpuSparseMat != nullptr) {
+      return (SparseValueType)gpuSparseMat->getValueType();
+    } else {
+      UnsupportError e;
+      throw e;
+    }
+  }
+}
+
+SparseFormatType Matrix::getSparseFormat() const throw(UnsupportError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr) {
+    return (SparseFormatType)cpuSparseMat->getFormat();
+  } else {
+    auto gpuSparseMat =
+        std::dynamic_pointer_cast<paddle::GpuSparseMatrix>(m->mat);
+    if (gpuSparseMat != nullptr) {
+      return SPARSE_CSR;
+    } else {
+      UnsupportError e;
+      throw e;
+    }
+  }
+}
+
+IntArray Matrix::getSparseRowCols(size_t i) const
+    throw(UnsupportError, RangeError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr &&
+      cpuSparseMat->getFormat() == paddle::SPARSE_CSR) {
+    if (i < cpuSparseMat->getHeight()) {
+      // cpuSparseMat->print(std::cout);
+      size_t len = cpuSparseMat->getColNum(i);
+      return IntArray(cpuSparseMat->getRowCols(i), len);
+    } else {
+      RangeError e;
+      throw e;
+    }
+  } else {
+    UnsupportError e;
+    throw e;
+  }
+}
+
+IntWithFloatArray Matrix::getSparseRowColsVal(size_t i) const
+    throw(UnsupportError, RangeError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr &&
+      cpuSparseMat->getValueType() == paddle::FLOAT_VALUE) {
+    if (i < cpuSparseMat->getHeight()) {
+      return IntWithFloatArray(cpuSparseMat->getRowValues(i),
+                               cpuSparseMat->getRowCols(i),
+                               cpuSparseMat->getColNum(i));
+    } else {
+      RangeError e;
+      throw e;
+    }
+  } else {
+    UnsupportError e;
+    throw e;
+  }
+}
+
+FloatArray Matrix::getData() const {
+  auto rawMat = m->mat.get();
+  if (dynamic_cast<paddle::GpuMemoryHandle*>(rawMat->getMemoryHandle().get())) {
+    // is gpu. then copy data
+    float* data = rawMat->getData();
+    size_t len = rawMat->getElementCnt();
+    float* cpuData = new float[len];
+    hl_memcpy_device2host(cpuData, data, len * sizeof(float));
+    FloatArray ret_val(cpuData, len);
+    ret_val.needFree = true;
+    return ret_val;
+  } else {
+    FloatArray ret_val(rawMat->getData(), rawMat->getElementCnt());
+    return ret_val;
+  }
+}
+
+void Matrix::sparseCopyFrom(
+    const std::vector<int>& rows,
+    const std::vector<int>& cols,
+    const std::vector<float>& vals) throw(UnsupportError) {
+  auto cpuSparseMat =
+      std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
+  if (cpuSparseMat != nullptr) {
+    // LOG(INFO) <<"RowSize = "<<rows.size()
+    //  <<" ColSize = "<<cols.size()
+    //  <<" ValSize = "<<vals.size();
+    cpuSparseMat->copyFrom(const_cast<std::vector<int>&>(rows),
+                           const_cast<std::vector<int>&>(cols),
+                           const_cast<std::vector<float>&>(vals));
+  } else {
+    UnsupportError e;
+    throw e;
+  }
+}
+
+void* Matrix::getSharedPtr() const { return &m->mat; }
+
+void Matrix::toNumpyMatInplace(float** view_data,
+                               int* dim1,
+                               int* dim2) throw(UnsupportError) {
+  auto cpuMat = std::dynamic_pointer_cast<paddle::CpuMatrix>(m->mat);
+  if (cpuMat) {
+    *dim1 = cpuMat->getHeight();
+    *dim2 = cpuMat->getWidth();
+    *view_data = cpuMat->getData();
+  } else {
+    throw UnsupportError();
+  }
+}
+void Matrix::copyToNumpyMat(float** view_m_data,
+                            int* dim1,
+                            int* dim2) throw(UnsupportError) {
+  static_assert(sizeof(paddle::real) == sizeof(float),
+                "Currently PaddleAPI only support for single "
+                "precision version of paddle.");
+  if (this->isSparse()) {
+    throw UnsupportError();
+  } else {
+    *dim1 = m->mat->getHeight();
+    *dim2 = m->mat->getWidth();
+    *view_m_data = new float[(*dim1) * (*dim2)];
+    if (auto cpuMat = dynamic_cast<paddle::CpuMatrix*>(m->mat.get())) {
+      auto src = cpuMat->getData();
+      auto dest = *view_m_data;
+      std::memcpy(dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
+    } else if (auto gpuMat = dynamic_cast<paddle::GpuMatrix*>(m->mat.get())) {
+      auto src = gpuMat->getData();
+      auto dest = *view_m_data;
+      hl_memcpy_device2host(
+          dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
+    } else {
+      LOG(WARNING) << "Unexpected Situation";
+      throw UnsupportError();
+    }
+  }
+}
+
+void Matrix::copyFromNumpyMat(float* data,
+                              int dim1,
+                              int dim2) throw(UnsupportError, RangeError) {
+  if (isSparse()) {
+    throw UnsupportError();
+  } else {
+    if (this->getHeight() == (size_t)dim1 && this->getWidth() == (size_t)dim2) {
+      if (m->mat->getData() != data) {
+        m->mat->copyFrom(data, dim1 * dim2);
+      }
+    } else {
+      throw RangeError();
+    }
+  }
+}
+
+bool Matrix::isGpu() const {
+  auto rawPtr = m->mat.get();
+  return dynamic_cast<paddle::GpuMatrix*>(rawPtr) != nullptr ||
+         dynamic_cast<paddle::GpuSparseMatrix*>(rawPtr) != nullptr;
+}
diff --git a/paddle/legacy/api/Paddle.i b/paddle/legacy/api/Paddle.i
new file mode 100644
index 0000000000000000000000000000000000000000..7a1456a5c065821caa54fbf4a10f7ceda08780c0
--- /dev/null
+++ b/paddle/legacy/api/Paddle.i
@@ -0,0 +1,202 @@
+%module(directors="1") swig_paddle
+%include "std_string.i"
+%{
+#define SWIG_FILE_WITH_INIT
+#include "legacy/api/PaddleAPI.h"
+%}
+
+%include "exception.i"
+%typemap(throws) UnsupportError %{
+  SWIG_exception(SWIG_RuntimeError, $1.what());
+  SWIG_fail;
+%}
+
+%include "std_vector.i"
+%include "std_pair.i"
+#ifdef SWIGPYTHON
+%include "numpy.i"
+#endif
+
+%init %{
+#ifdef SWIGPYTHON
+import_array();
+#endif
+%}
+
+
+namespace std {
+%template(vector_int) vector<int>;
+%template(vector_uint) vector<unsigned int>;
+%template(vector_float) vector<float>;
+%template(vector_string) vector<string>;
+%template(vector_vec_star) vector<Vector*>;
+}
+#ifdef SWIGPYTHON 
+%typemap(in) (int argc, char** argv) { 
+    int i = 0; 
+    if (!PyList_Check($input)) { 
+        PyErr_SetString(PyExc_ValueError, "Expecting a list"); 
+        return NULL; 
+    } 
+    $1 = PyList_Size($input); 
+    $2 = (char **) malloc(($1+1)*sizeof(char *)); 
+    for (i = 0; i < $1; i++) { 
+        PyObject *s = PyList_GetItem($input,i); 
+        if (!PyString_Check(s)) { 
+            free($2); 
+            PyErr_SetString(PyExc_ValueError, "List items must be strings"); 
+            return NULL; 
+        } 
+        $2[i] = PyString_AsString(s); 
+    } 
+    $2[i] = 0; 
+} 
+%typemap(freearg) (int argc, char** argv) { 
+    if ($2) free($2); 
+} 
+
+%typemap(out) FloatArray {
+  $result = PyList_New($1.length);
+  for (size_t i=0; i<$1.length; ++i) {
+    PyList_SetItem($result, i, PyFloat_FromDouble($1.buf[i]));
+  }  
+  if($1.needFree) {
+    delete [] $1.buf;  
+  }
+}
+
+%typemap(out) IntArray {
+  $result = PyList_New($1.length);  
+  for (size_t i=0; i<$1.length; ++i) {
+    PyList_SetItem($result, i, PyInt_FromLong($1.buf[i]));  
+  }
+  if ($1.needFree) {
+    delete [] $1.buf;  
+  }
+}
+
+%typemap(out) IntWithFloatArray {
+  $result = PyList_New($1.length);
+  for (size_t i=0; i<$1.length; ++i) {
+    PyList_SetItem($result, i, PyTuple_Pack(2, 
+      PyInt_FromLong($1.idxBuf[i]),
+      PyFloat_FromDouble($1.valBuf[i])
+    ));
+  }
+  if ($1.needFree) {
+    delete [] $1.idxBuf;
+    delete [] $1.valBuf;
+  } 
+}
+
+
+%rename(__getitem__) IVector::get;
+%rename(__setitem__) IVector::set;
+%rename(__len__) IVector::getSize;
+%rename(__getitem__) Vector::get;
+%rename(__setitem__) Vector::set;
+%rename(__len__) Vector::getSize;
+%rename(__len__) Parameter::getSize;
+%rename(__call__) ParameterTraverseCallback::apply;
+%rename(__repr__) Evaluator::toString;
+
+%apply (float* INPLACE_ARRAY2, int DIM1, int DIM2) { 
+  (float* data, int dim1, int dim2) 
+}
+
+%apply (float** ARGOUTVIEW_ARRAY2, int* DIM1, int* DIM2) { 
+  (float** view_data, int* dim1, int* dim2) 
+}
+
+%apply (float** ARGOUTVIEWM_ARRAY2, int* DIM1, int* DIM2) {
+  (float** view_m_data, int* dim1, int* dim2)  
+}
+
+%apply (int** ARGOUTVIEWM_ARRAY1, int* DIM1) {
+  (int** view_m_data, int* dim1)  
+}
+
+%apply (int* INPLACE_ARRAY1, int DIM1) { 
+  (int* data, int dim) 
+}
+
+%apply (int** ARGOUTVIEW_ARRAY1, int* DIM1) {
+  (int** view_data, int* dim1)  
+}
+
+%apply (float* INPLACE_ARRAY1, int DIM1) {
+  (float* data, int dim)
+}
+
+%apply (float** ARGOUTVIEW_ARRAY1, int* DIM1) {
+  (float** view_data, int* dim1)
+}
+
+%apply (float** ARGOUTVIEWM_ARRAY1, int* DIM1) {
+  (float** view_m_data, int* dim1)
+}
+
+#endif
+// The below functions internally create object by "new", so it should use
+// use SWIG to handle gc. There are hints for SWIG to handle GC.
+%newobject Matrix::createZero;
+%newobject Matrix::createSparse;
+%newobject Matrix::createDense;
+%newobject Matrix::createDenseFromNumpy;
+%newobject Matrix::createCpuDenseFromNumpy;
+%newobject Matrix::createGpuDenseFromNumpy;
+%newobject Vector::createZero;
+%newobject Vector::create;
+%newobject Vector::createVectorFromNumpy;
+%newobject Vector::createCpuVectorFromNumpy;
+%newobject Vector::createGpuVectorFromNumpy;
+%newobject IVector::createZero;
+%newobject IVector::create;
+%newobject IVector::createVectorFromNumpy;
+%newobject IVector::createCpuVectorFromNumpy;
+%newobject IVector::createGpuVectorFromNumpy;
+%newobject Trainer::createByCommandLine;
+%newobject Trainer::getForwardOutput;
+%newobject Trainer::getLayerOutput;
+%newobject Arguments::getSlotValue;
+%newobject Arguments::getSlotIds;
+%newobject Arguments::getSlotIn;
+%newobject Arguments::getSlotSequenceStartPositions;
+%newobject Arguments::getSlotSequenceDim;
+%newobject Arguments::createArguments;
+%newobject GradientMachine::createByConfigProtoStr;
+%newobject GradientMachine::createByModelConfig;
+%newobject GradientMachine::asSequenceGenerator;
+%newobject GradientMachine::getParameter;
+%newobject GradientMachine::getLayerOutput;
+%newobject GradientMachine::makeEvaluator;
+%newobject TrainerConfig::createFromTrainerConfigFile;
+%newobject TrainerConfig::getModelConfig;
+%newobject TrainerConfig::getOptimizationConfig;
+%newobject Parameter::getBuf;
+%newobject Parameter::getConfig;
+%newobject ParameterOptimizer::create;
+%newobject ParameterOptimizer::needSpecialTraversal;
+%newobject ParameterUpdater::createLocalUpdater;
+%newobject ParameterUpdater::createRemoteUpdater;
+%newobject ParameterUpdater::createNewRemoteUpdater;
+
+%feature("director") UpdateCallback;
+%feature("autodoc", 1); // To generate method stub, for code hint in ide
+
+// Ignore many private class, and method cannot be handled by swig.
+%ignore MatrixPrivate;
+%ignore TrainerPrivate;
+%ignore IVector::operator[];
+%ignore ArgumentsPrivate;
+%ignore GradientMachinePrivate;
+%ignore TrainerConfigPrivate;
+%ignore ModelConfigPrivate;
+%ignore ParameterPrivate;
+%ignore SequenceGeneratorPrivate;
+%ignore VectorPrivate;
+%ignore ParameterConfigPrivate;
+%ignore OptimizationConfigPrivate;
+%ignore ParameterTraverseCallbackPrivate;
+%include "legacy/utils/GlobalConstants.h"
+%include "legacy/api/PaddleAPI.h"
diff --git a/paddle/legacy/api/PaddleAPI.h b/paddle/legacy/api/PaddleAPI.h
new file mode 100644
index 0000000000000000000000000000000000000000..475984a3d57ebc25d5d071c33b7e6562ac78c503
--- /dev/null
+++ b/paddle/legacy/api/PaddleAPI.h
@@ -0,0 +1,1054 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+
+/// Import PaddlePaddle's enumeration into global namespace.
+using namespace paddle::enumeration_wrapper;  // NOLINT
+
+/**
+ * @brief Initialize paddle.
+ *
+ * In python, this method should be invoked as
+ * @code
+ *  import sys
+ *  import paddle
+ *  paddle.initPaddle(sys.argv)
+ *  or you can change arguments as any list of str.
+ * @endcode
+ */
+void initPaddle(int argc, char** argv);
+
+/// Return FLAGS_use_gpu
+bool isUsingGpu();
+
+/// Set the Flags_use_gpu to the given parameter
+void setUseGpu(bool useGpu);
+
+/// Return true if this py_paddle is compiled in GPU Version
+bool isGpuVersion();
+
+/// Return FLAGS_trainer_count
+int getTrainerCount();
+
+/// The Error of IO Operation. Such as file not found, etc.
+class IOError {};
+
+/// Out of range error
+class RangeError {};
+
+/// Not support Error, such as access GPU memory directly, etc.
+class UnsupportError : public std::runtime_error {
+ public:
+  UnsupportError() : std::runtime_error(" ") {}
+  explicit UnsupportError(const std::string& message)
+      : std::runtime_error(message) {}
+};
+
+/// This type will map to python's list of float.
+struct FloatArray {
+  const float* buf;
+  const size_t length;
+  bool needFree;  // true if the buf is dynamic alloced.
+  FloatArray(const float* b, const size_t l);
+};
+
+/// This type will map to python's list of int
+struct IntArray {
+  const int* buf;
+  const size_t length;
+  bool needFree;
+  IntArray(const int* b, const size_t l, bool f = false);
+};
+
+/// This type will map to python's list of (int, float)
+struct IntWithFloatArray {
+  const float* valBuf;
+  const int* idxBuf;
+  const size_t length;
+  bool needFree;
+  IntWithFloatArray(const float* v, const int* i, size_t l, bool f = false);
+};
+
+enum SparseValueType { SPARSE_NON_VALUE = 0, SPARSE_VALUE = 1 };
+
+enum SparseFormatType { SPARSE_CSR = 0, SPARSE_CSC = 1 };
+
+/**
+ * In Python, -1UL is hard to write. So define a const value used by python
+ * side.
+ */
+const size_t NO_SPARSE_ID = -1UL;
+
+struct MatrixPrivate;
+class Matrix {
+  Matrix();  // User Cannot Create Matrix.
+  DISABLE_COPY(Matrix);
+  static Matrix* createByPaddleMatrixPtr(void* sharedPtr);
+
+ public:
+  virtual ~Matrix();
+
+  /**
+   * Create A Matrix with height,width, which is filled by zero.
+   */
+  static Matrix* createZero(size_t height,
+                            size_t width,
+                            bool useGpu = isUsingGpu());
+
+  /**
+   * Create Sparse Matrix.
+   *
+   * After create sparse, sparseCopyFrom can be used to fill matrix.
+   *
+   * @param nnz  Number of non zero values.
+   *
+   * @note the default sparse type is SPARSE_CSR.
+   */
+  static Matrix* createSparse(size_t height,
+                              size_t width,
+                              size_t nnz,
+                              bool isNonVal = true,
+                              bool trans = false,
+                              bool useGpu = isUsingGpu());
+
+  /**
+   * Create Dense Matrix.
+   *
+   * @param data  list of float should be passed in python.
+   * @note        the value will be copy into a new matrix.
+   */
+  static Matrix* createDense(const std::vector<float>& data,
+                             size_t height,
+                             size_t width,
+                             bool useGpu = isUsingGpu());
+
+  static Matrix* createDenseFromNumpy(
+      float* data,
+      int dim1,
+      int dim2,
+      bool copy = true,
+      bool useGpu = isUsingGpu()) throw(UnsupportError);
+
+  /**
+   *  Create Cpu Dense Matrix from numpy matrix, dtype=float32
+   *
+   *  @param data  a numpy matrix.
+   *  @param dim1  dimension of data.
+   *  @param dim2  dimension of data.
+   *  @param copy  true if copy into a new matrix, false will create
+   *               matrix inplace. copy = false should be used with extreme
+   *               care because Matrix will share the memory with the given
+   *               numpy array. If the numpy array object is no longer valid,
+   *               the memory space will not be usable.
+   */
+  static Matrix* createCpuDenseFromNumpy(float* data,
+                                         int dim1,
+                                         int dim2,
+                                         bool copy = true);
+
+  /// Create Gpu Dense Matrix from numpy matrix, dtype=float32
+  static Matrix* createGpuDenseFromNumpy(float* data, int dim1, int dim2);
+
+  /**
+   * Cast to numpy matrix.
+   *
+   * @note    This method take no parameter in python.
+   * @note    This method in python will return a numpy matrix, not void.
+   * @note    Only CpuDenseMatrix is supported.
+   *
+   * Example:
+   * @code
+   * import paddle
+   * m = paddle.Matrix.createZero(10,2)
+   * numpy_mat = m.toNumpyMat()
+   * @endcode
+   */
+  void toNumpyMatInplace(float** view_data,
+                         int* dim1,
+                         int* dim2) throw(UnsupportError);
+
+  /// Copy To numpy mat.
+  void copyToNumpyMat(float** view_m_data,
+                      int* dim1,
+                      int* dim2) throw(UnsupportError);
+
+  /// Copy From Numpy Mat
+  void copyFromNumpyMat(float* data, int dim1, int dim2) throw(UnsupportError,
+                                                               RangeError);
+
+  /// return true if this matrix is sparse.
+  bool isSparse() const;
+
+  SparseValueType getSparseValueType() const throw(UnsupportError);
+
+  SparseFormatType getSparseFormat() const throw(UnsupportError);
+
+  IntArray getSparseRowCols(size_t i) const throw(UnsupportError, RangeError);
+
+  IntWithFloatArray getSparseRowColsVal(size_t i) const
+      throw(UnsupportError, RangeError);
+
+  size_t getHeight() const;
+
+  size_t getWidth() const;
+
+  float get(size_t x, size_t y) const throw(RangeError);
+
+  void set(size_t x, size_t y, float val) throw(RangeError, UnsupportError);
+
+  /// return type is list of float
+  FloatArray getData() const;
+
+  /**
+   * Copy from rows, cols, values.
+   *
+   * if sparse_nonvalue, the values should be []
+   */
+  void sparseCopyFrom(const std::vector<int>& rows,
+                      const std::vector<int>& cols,
+                      const std::vector<float>& values =
+                          std::vector<float>()) throw(UnsupportError);
+
+  bool isGpu() const;
+
+ private:
+  void* getSharedPtr() const;
+
+  MatrixPrivate* m;
+  friend class Trainer;
+  friend class GradientMachine;
+  friend class Arguments;
+};
+
+struct VectorPrivate;
+class Vector {
+  DISABLE_COPY(Vector);
+  Vector();
+  static Vector* createByPaddleVectorPtr(void* ptr);
+
+  void* getSharedPtr();
+
+ public:
+  ~Vector();
+
+  /// Create Vector filled with zero.
+  static Vector* createZero(size_t sz, bool useGpu = isUsingGpu());
+
+  /**
+   * Create Vector from list of float.
+   *
+   * It will create a new vector, and copy data into it.
+   */
+  static Vector* create(const std::vector<float>& data,
+                        bool useGpu = isUsingGpu());
+
+  static Vector* createVectorFromNumpy(
+      float* data,
+      int dim,
+      bool copy = true,
+      bool useGpu = isUsingGpu()) throw(UnsupportError);
+  /**
+   * Create Cpu Vector from numpy array, which dtype=float32
+   *
+   * If copy is false, it will create vector inplace.
+   */
+  static Vector* createCpuVectorFromNumpy(float* data,
+                                          int dim,
+                                          bool copy = true);
+
+  /// Create Gpu Vector from numpy array, which dtype=float32
+  static Vector* createGpuVectorFromNumpy(float* data, int dim);
+
+  /**
+   * copy from another vector
+   * throw(RangeError) if size of src vector is different from size of this
+   * vector
+   */
+  void copyFrom(Vector* src) throw(RangeError);
+
+  /// Cast to numpy array inplace.
+  void toNumpyArrayInplace(float** view_data, int* dim1) throw(UnsupportError);
+
+  /// Copy to numpy array.
+  void copyToNumpyArray(float** view_m_data, int* dim1);
+
+  /// Copy from numpy array.
+  void copyFromNumpyArray(float* data, int dim);
+
+  /// __getitem__ in python
+  float get(const size_t idx) const throw(RangeError, UnsupportError);
+
+  /// __setitem__ in python
+  void set(const size_t idx, float val) throw(RangeError, UnsupportError);
+
+  /// Return is GPU vector or not.
+  bool isGpu() const;
+
+  /// Return a list of float, the memory is alloced and copied.
+  FloatArray getData() const;
+
+  /// __len__ in python
+  size_t getSize() const;
+
+ private:
+  VectorPrivate* m;
+
+ private:
+  friend class Parameter;
+  friend class ParameterOptimizer;
+  friend struct ParameterTraverseCallbackPrivate;
+};
+
+struct IVectorPrivate;
+class IVector {
+  IVector();
+  DISABLE_COPY(IVector);
+  static IVector* createByPaddleVectorPtr(void* ptr);
+
+ public:
+  /// Create IVector filled with zero
+  static IVector* createZero(size_t sz, bool useGpu = isUsingGpu());
+
+  /**
+   * Create IVector from list of int.
+   * It will create a new vector, and copy data into it.
+   */
+  static IVector* create(const std::vector<int>& data,
+                         bool useGpu = isUsingGpu());
+
+  static IVector* createVectorFromNumpy(
+      int* data,
+      int dim,
+      bool copy = true,
+      bool useGpu = isUsingGpu()) throw(UnsupportError);
+
+  /**
+   * Create Cpu IVector from numpy array, which dtype=int32
+   *
+   * If copy is false, it will create vector inplace
+   */
+  static IVector* createCpuVectorFromNumpy(int* data,
+                                           int dim,
+                                           bool copy = true);
+  /**
+   * Create Gpu IVector from numpy array, which dtype=int32
+   */
+  static IVector* createGpuVectorFromNumpy(int* data, int dim);
+
+  /// Cast to numpy array inplace.
+  void toNumpyArrayInplace(int** view_data, int* dim1) throw(UnsupportError);
+
+  /// Copy to numpy array.
+  void copyToNumpyArray(int** view_m_data, int* dim1);
+
+  /// Copy from numpy array.
+  void copyFromNumpyArray(int* data, int dim);
+
+  virtual ~IVector();
+
+  /// Return a list of int, the memory is alloced and copied.
+  IntArray getData() const;
+
+  /// This method will map to python [] method.
+  int& operator[](const size_t idx) throw(RangeError, UnsupportError);
+
+  const int& operator[](const size_t idx) const
+      throw(RangeError, UnsupportError);
+
+  inline int get(const size_t idx) const throw(RangeError, UnsupportError) {
+    return (*this)[idx];
+  }
+
+  inline void set(const size_t idx, int val) throw(RangeError, UnsupportError) {
+    (*this)[idx] = val;
+  }
+
+  /// Return true if it is gpu vector.
+  bool isGpu() const;
+
+  /// This method will map to python __len__();
+  size_t getSize() const;
+
+ private:
+  void* getSharedPtr() const;
+
+  friend class Arguments;
+  IVectorPrivate* m;
+};
+
+struct ArgumentsPrivate;
+
+/// The Arguments is actual a std::vector<paddle::Argument> in paddle.
+class Arguments {
+ private:
+  Arguments();  // Internal Create.
+  DISABLE_COPY(Arguments);
+
+ public:
+  /**
+   * Create a arguments with size.
+   * Note that it can be zero.
+   */
+  static Arguments* createArguments(size_t slotNum);
+
+  void resize(size_t slotNum);
+
+  virtual ~Arguments();
+
+  /**
+   * Return the slot number that aguments contains.
+   *
+   * It is actually the vector's size
+   */
+  size_t getSlotNum() const;
+
+  /**
+   * The get functions of Arguments
+   *
+   * the param idx is the slot id
+   */
+  Matrix* getSlotValue(size_t idx) const throw(RangeError);
+  Matrix* getSlotGrad(size_t idx) const throw(RangeError);
+  IVector* getSlotIds(size_t idx) const throw(RangeError);
+  Matrix* getSlotIn(size_t idx) const throw(RangeError);
+  IVector* getSlotSequenceStartPositions(size_t idx) const throw(RangeError);
+  IVector* getSlotSubSequenceStartPositions(size_t idx) const throw(RangeError);
+  IVector* getSlotSequenceDim(size_t idx) const throw(RangeError);
+  // End Of get functions of Arguments
+
+  int64_t getBatchSize(size_t idx = 0) const throw(RangeError);
+
+  /**
+   * The set functions of Arguments.
+   *
+   * The param idx is the slot id.
+   * The other param is the input Matrix or vector.
+   */
+  void setSlotValue(size_t idx, Matrix* mat) throw(RangeError);
+  void setSlotGrad(size_t idx, Matrix* mat) throw(RangeError);
+  void setSlotIn(size_t idx, Matrix* mat) throw(RangeError);
+  void setSlotIds(size_t idx, IVector* vec) throw(RangeError);
+  void setSlotSequenceStartPositions(size_t idx,
+                                     IVector* vec) throw(RangeError);
+  void setSlotSubSequenceStartPositions(size_t idx,
+                                        IVector* vec) throw(RangeError);
+  void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
+
+  /**
+   * Set the frame height of the idx-th Argument.
+   *
+   * @param ids The index of which Argument.
+   * @param h The height value.
+   */
+  void setSlotFrameHeight(size_t idx, size_t h) throw(RangeError);
+
+  /**
+   * Set the frame height of the idx-th Argument.
+   *
+   * @param ids The index of which Argument.
+   * @param h The height value.
+   */
+  void setSlotFrameWidth(size_t idx, size_t w) throw(RangeError);
+
+  size_t getSlotFrameHeight(size_t idx = 0) const throw(RangeError);
+  size_t getSlotFrameWidth(size_t idx = 0) const throw(RangeError);
+
+  float sum() const;
+
+ private:
+  static Arguments* createByPaddleArgumentVector(void* ptr);
+  static Arguments* createByPaddleArgument(const void* ptr);
+  void* getInternalArgumentsPtr() const;
+
+ private:
+  ArgumentsPrivate* m;
+  friend class Trainer;
+  friend class GradientMachine;
+  friend class SequenceGenerator;
+};
+
+enum GradientMatchineCreateMode {
+  CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal,
+  CREATE_MODE_SGD_SPARSE_CPU_TRAINING =
+      paddle::GradientMachine::kSgdSparseCpuTraining,
+  CREATE_MODE_TESTING = paddle::GradientMachine::kTesting
+};
+
+struct ParameterConfigPrivate;
+class ParameterConfig {
+  DISABLE_COPY(ParameterConfig);
+  ParameterConfig();
+
+  /**
+   * Internal methods
+   */
+  static ParameterConfig* createParameterConfigFromParameterSharedPtr(
+      void* ptr);
+  static ParameterConfig* createParameterConfigFromParameterPtr(void* ptr);
+  void* getRawPtr();
+
+ public:
+  ~ParameterConfig();
+
+  /**
+   * return proto buf string.
+   */
+  std::string toProtoString() const;
+
+ private:
+  ParameterConfigPrivate* m;
+
+ private:
+  friend class Parameter;
+  friend class ParameterOptimizer;
+  friend struct ParameterTraverseCallbackPrivate;
+};
+
+struct OptimizationConfigPrivate;
+class OptimizationConfig {
+  DISABLE_COPY(OptimizationConfig);
+  OptimizationConfig();
+
+ public:
+  static OptimizationConfig* createFromProtoString(const std::string& str);
+  ~OptimizationConfig();
+
+  /**
+   * return protobuf string.
+   */
+  std::string toProtoString();
+
+ private:
+  OptimizationConfigPrivate* m;
+
+  friend class TrainerConfig;
+  friend class ParameterOptimizer;
+  friend class ParameterUpdater;
+  friend class Trainer;
+};
+
+struct ParameterPrivate;
+class Parameter {
+ private:
+  Parameter();
+  DISABLE_COPY(Parameter);
+
+ public:
+  virtual ~Parameter();
+
+  /**
+   * get parameter name
+   */
+  std::string getName() const;
+
+  /**
+   * get buf in Parameter
+   */
+  Vector* getBuf(ParameterType type);
+
+  /**
+   * get id
+   */
+  size_t getID() const;
+
+  ParameterConfig* getConfig();
+  void setValueUpdated();
+
+  bool save(const std::string& filename) const;
+
+  bool load(const std::string& filename) const;
+
+  size_t getSize() const;
+
+ private:
+  static Parameter* createFromRawPtr(void* ptr);
+  static Parameter* createFromSharedPtr(void* ptr);
+
+ private:
+  ParameterPrivate* m;
+  friend class UpdateCallbackWrapper;
+  friend class GradientMachine;
+  friend class ParameterUpdater;
+};
+
+struct ModelConfigPrivate;
+/**
+ * You can only get model config from TrainerConfig.
+ *
+ * It is used by GradientMachine.
+ */
+class ModelConfig {
+ private:
+  ModelConfig();
+  DISABLE_COPY(ModelConfig);
+
+ public:
+  virtual ~ModelConfig();
+
+ private:
+  ModelConfigPrivate* m;
+  friend class TrainerConfig;
+  friend struct TrainerConfigPrivate;
+  friend class GradientMachine;
+};
+
+struct TrainerConfigPrivate;
+/**
+ * To get TrainerConfig from file.
+ *
+ * It is used by GradientMachine.
+ */
+class TrainerConfig {
+ private:
+  TrainerConfig();
+  DISABLE_COPY(TrainerConfig);
+
+ public:
+  virtual ~TrainerConfig();
+
+  static TrainerConfig* createFromTrainerConfigFile(
+      const std::string& configPath);
+  static TrainerConfig* createFromProtoString(const std::string& str);
+
+  ModelConfig* getModelConfig() const;
+
+  OptimizationConfig* getOptimizationConfig() const;
+
+ private:
+  TrainerConfigPrivate* m;
+  friend class Trainer;
+};
+
+/**
+ * The callback in backword.
+ *
+ * You can inherit this class in python.
+ *
+ * @code
+ * class UpdateCallbackInPython(paddle.UpdateCallback):
+ *   def __init__(self):
+ *     paddle.UpdateCallback.__init__(self)
+ *
+ *   def apply(self, param):
+ *     assert isinstance(param, paddle.Parameter)
+ * @endcode
+ */
+class UpdateCallback {
+ public:
+  virtual ~UpdateCallback();
+  virtual void apply(Parameter* p);
+};
+
+struct ParameterTraverseCallbackPrivate;
+class ParameterTraverseCallback {
+  DISABLE_COPY(ParameterTraverseCallback);
+  ParameterTraverseCallback();
+
+ public:
+  ~ParameterTraverseCallback();
+
+  void apply(const std::vector<Vector*>& vecs,
+             const ParameterConfig& config,
+             size_t sparseId);
+
+ private:
+  ParameterTraverseCallbackPrivate* m;
+  friend class ParameterOptimizer;
+};
+
+/**
+ * The ParameterOptimizer Wrapper Class.
+ *
+ * Basically same as common/ParameterOptimizer.h
+ */
+struct ParameterOptimizerPrivate;
+class ParameterOptimizer {
+  DISABLE_COPY(ParameterOptimizer);
+  ParameterOptimizer();
+
+ public:
+  static ParameterOptimizer* create(OptimizationConfig* config);
+
+  ~ParameterOptimizer();
+
+  void init(size_t numRows, const ParameterConfig* config);
+
+  void startPass();
+
+  void finishPass();
+
+  void startBatch(size_t numSamplesProcessed);
+
+  void finishBatch();
+
+  void update(const std::vector<Vector*>& vecs,
+              const ParameterConfig& conf,
+              size_t sparseId = NO_SPARSE_ID);
+
+  std::vector<int> getParameterTypes() const;
+
+  ParameterTraverseCallback* needSpecialTraversal(
+      const ParameterConfig& config) const;
+
+ private:
+  ParameterOptimizerPrivate* m;
+};
+
+class SequenceGenerator;
+class Evaluator;
+struct GradientMachinePrivate;
+class GradientMachine {
+ private:
+  GradientMachine();
+  DISABLE_COPY(GradientMachine);
+
+ public:
+  virtual ~GradientMachine();
+
+  /**
+   * Create By ProtoStr.
+   *
+   * The ProtoStr can be generate by python's protobuf code.
+   */
+  static GradientMachine* createByConfigProtoStr(
+      const std::string& protoStr,
+      GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
+      const std::vector<int>& parameterTypes = defaultParamTypes);
+
+  /**
+   * Create by ModelConfig object.
+   *
+   * To get ModelConfig, you can get TrainerConfig from config file, then get
+   * model config by TrainerConfig
+   */
+  static GradientMachine* createByModelConfig(
+      ModelConfig* conf,
+      GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
+      const std::vector<int>& parameterTypes = defaultParamTypes);
+
+  /**
+   * @brief finish
+   */
+  void finish();
+
+  void start();
+
+  /**
+   * Prefetch row ids of sparse parameter.
+   */
+  void prefetch(const Arguments& inArgs);
+
+  /**
+   * Do some thing when train pass ended.
+   */
+  void onPassEnd();
+
+  /**
+   * The forward stage of GradientMachine.
+   *
+   * @note  the outArgs could be zero length arguemnts.
+   * @note  THIS METHOD IS VERY USEFULL FOR PREDICT FROM TRAINED MODEL.
+   */
+  void forward(const Arguments& inArgs, Arguments* outArgs, PassType passType);
+
+  /**
+   * The backward stage of GradientMachine.
+   *
+   * @note  Currently the ParameterUpdater is not wrapped in SWIG, so backward
+   * cannot actually train a network. But you can write a update callback to
+   * change the parameter or implement a ParameterUpdater in python side.
+   */
+  void backward(const UpdateCallback& callback = UpdateCallback());
+
+  /**
+   * Combine forward/backward
+   */
+  void forwardBackward(const Arguments& inArgs,
+                       Arguments* outArgs,
+                       PassType passType,
+                       const UpdateCallback& callback = UpdateCallback());
+
+  void loadParameters(const std::string& path);
+
+  size_t getParameterSize() const;
+  Parameter* getParameter(size_t i) throw(RangeError);
+
+  size_t getNonStaticParameterSize() const;
+  Parameter* getNonStaticParameter(size_t i) throw(RangeError);
+
+  void randParameters();
+
+  Arguments* getLayerOutput(const std::string& layerName) const
+      throw(UnsupportError);
+
+  /**
+   * Create a sequence generator.
+   *
+   * @note  It just like a paddle_gen_sequence.
+   */
+  SequenceGenerator* asSequenceGenerator(
+      const std::vector<std::string>& dict = std::vector<std::string>(),
+      size_t begin_id = 0UL,
+      size_t end_id = 0UL,
+      size_t max_length = 100UL,
+      size_t beam_size = -1UL);
+
+  Evaluator* makeEvaluator();
+
+  void eval(Evaluator* evaluator);
+
+ private:
+  GradientMachinePrivate* m;
+
+  static GradientMachine* createFromPaddleModelPtr(
+      const void* confPtr,
+      GradientMatchineCreateMode mode,
+      const std::vector<int>& types);
+
+  // Not to use c++ 11 init-list, so we use static var as function default arg.
+  static std::vector<int> defaultParamTypes;
+  friend class Trainer;
+  friend class ParameterUpdater;
+};
+
+struct ParameterUpdaterPrivate;
+class ParameterUpdater {
+ private:
+  ParameterUpdater();
+
+ public:
+  static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
+  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
+                                               int passCount,
+                                               bool useSparseUpdater);
+  static ParameterUpdater* createNewRemoteUpdater(
+      OptimizationConfig* config,
+      const std::string pserverSpec,
+      const bool useEtcd) throw(UnsupportError);
+  ~ParameterUpdater();
+
+  /**
+   * @brief initialize Parameter Updater by GradientMachine.
+   * @param gm
+   */
+  void init(const GradientMachine& gm);
+
+  /**
+   * @brief begin of a training/testing of one pass.
+   */
+  void startPass();
+
+  /**
+   * @brief end of a traning/testing of one pass.
+   */
+  void finishPass();
+
+  /**
+   * @brief begin of a training/testing of one batch.
+   * @param data batch's size
+   * @return PassType, mostly will be training.
+   */
+  PassType startBatch(size_t batchSize);
+
+  /**
+   * @brief end of a traning/testing of one batch
+   * @param cost current batch cost.
+   */
+  void finishBatch(float cost);
+
+  /**
+   * @brief update a parameter (by local optimizer or by cluster pserver)
+   * @param param
+   */
+  void update(Parameter* param);
+
+  /**
+   * @breif only get required sparse rows by default.
+   * @param fullSize: get full matrix parameter if *fullSize* set
+   * @param apply: get PARAMETER_APPLY on pserver if *apply* set
+   */
+  void getParametersRemote(bool fullSize = false, bool apply = false);
+
+  /**
+   * @brief restore the average parameter.
+   * @note It is only used in AverageOptimizer. Restore will get the current
+   * PARAMETER_VALUE back.
+   */
+  void restore();
+
+  /**
+   * @brief apply. Store the average parameter.
+   * @note It is only used in AverageOptimizer. Apply will store the current
+   * PARAMETER_VALUE to buffer, calcaualte current Average Parameter, and save
+   * it to PARAMETER_VALUE.
+   */
+  void apply();
+
+  /**
+   * @brief catchUpWith The Regularization will be delayed in many situations(
+   * pserver, local sparse). Catch Up means catch the regularization up, apply
+   * regularization to all params.
+   */
+  void catchUpWith();
+
+ private:
+  ParameterUpdaterPrivate* m;
+};
+
+struct EvaluatorPrivate;
+class Evaluator {
+ private:
+  Evaluator();
+  DISABLE_COPY(Evaluator);
+
+ public:
+  ~Evaluator();
+
+  /**
+   * @brief begin an evaluate stage.
+   */
+  void start();
+
+  /**
+   * @brief end an evaluate stage.
+   */
+  void finish();
+
+  /**
+   * @brief toString will get a evaluate result.
+   *
+   * __repr__ method in python
+   */
+  std::string toString();
+
+  std::vector<std::string> getNames() const;
+
+  double getValue(const std::string name) const;
+
+ private:
+  EvaluatorPrivate* m;
+
+  friend class GradientMachine;
+};
+
+struct TrainerPrivate;
+class Trainer {
+ private:
+  TrainerPrivate* m;
+  Trainer();
+  Trainer(TrainerConfig* optConfig, GradientMachine* gm);
+  DISABLE_COPY(Trainer);
+
+ public:
+  virtual ~Trainer();
+
+  /// Create A Trainer By TrainerConfig. using paddle command line.
+  static Trainer* createByCommandLine() throw(IOError);
+
+  static Trainer* create(TrainerConfig* optConfig,
+                         GradientMachine* gm) throw(IOError);
+
+  /// Start training
+  void startTrain();
+
+  /// Finish training
+  void finishTrain();
+
+  /// Start a pass.
+  void startTrainPass();
+
+  /// Finish a pass
+  void finishTrainPass();
+
+  /**
+   * Train one batch,
+   *
+   * @return true if all batch finished.
+   */
+  bool trainOneBatch(size_t batchSize);
+
+  void trainOneDataBatch(size_t batchSize, const Arguments& args);
+
+  void startTestPeriod();
+  void testOneDataBatch(size_t batchSize, const Arguments& args);
+  void finishTestPeriod();
+
+  void forwardOneBatch(size_t batchSize);
+
+  Arguments* getForwardOutput();
+
+  Arguments* getLayerOutput(const std::string& layerName) const;
+};
+
+/// the N-Best results generated from one input sequence.
+class ISequenceResults {
+ public:
+  virtual ~ISequenceResults();
+
+  /// Number of result.
+  virtual size_t getSize() const = 0;
+
+  /**
+   * Get sentence from dictionary.
+   *
+   * @param id  the index of result.
+   * @param split  if true, the return sentence will be splited with ' ' by
+   *               each word. Default is false.
+   */
+  virtual std::string getSentence(size_t id, bool split = false) const
+      throw(RangeError) = 0;
+  virtual std::vector<int> getSequence(size_t id) const throw(RangeError) = 0;
+  virtual float getScore(size_t id) const throw(RangeError) = 0;
+};
+
+struct SequenceGeneratorPrivate;
+class SequenceGenerator {
+  DISABLE_COPY(SequenceGenerator);
+  SequenceGenerator();
+
+ public:
+  virtual ~SequenceGenerator();
+
+  /**
+   * Generate Sequence by input.
+   *
+   * @note  The inArgs is just one sequence of data.
+   * @note  The return will get a N-best generate result by inArgs.
+   *        Sort by score.
+   */
+  ISequenceResults* generateSequence(const Arguments& inArgs) const;
+
+  void setDict(const std::vector<std::string>& dict);
+  void setBos(size_t bos);
+  void setEos(size_t eos);
+  void setMaxLength(size_t maxlength);
+  void setBeamSize(size_t beamSize);
+
+ private:
+  static SequenceGenerator* createByGradientMachineSharedPtr(void* ptr);
+  friend class GradientMachine;
+
+ private:
+  SequenceGeneratorPrivate* m;
+};
diff --git a/paddle/legacy/api/PaddleAPIPrivate.h b/paddle/legacy/api/PaddleAPIPrivate.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ee192c31d597c4b4575e4a53a4aece09e642831
--- /dev/null
+++ b/paddle/legacy/api/PaddleAPIPrivate.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include "PaddleAPI.h"
+#include "paddle/legacy/gserver/evaluators/Evaluator.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
+#include "paddle/legacy/trainer/TrainerConfigHelper.h"
+
+struct GradientMachinePrivate {
+  std::shared_ptr<paddle::GradientMachine> machine;
+
+  template <typename T>
+  inline T& cast(void* ptr) {
+    return *(T*)(ptr);
+  }
+};
+
+struct OptimizationConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> trainer_config;
+  paddle::OptimizationConfig config;
+
+  const paddle::OptimizationConfig& getConfig() {
+    if (trainer_config != nullptr) {
+      return trainer_config->getOptConfig();
+    } else {
+      return config;
+    }
+  }
+};
+
+struct TrainerConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> conf;
+  TrainerConfigPrivate() {}
+};
+
+struct ModelConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> conf;
+};
+
+struct ArgumentsPrivate {
+  std::vector<paddle::Argument> outputs;
+
+  inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
+    if (idx < outputs.size()) {
+      return outputs[idx];
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+
+  template <typename T>
+  std::shared_ptr<T>& cast(void* rawPtr) const {
+    return *(std::shared_ptr<T>*)(rawPtr);
+  }
+};
+
+struct ParameterUpdaterPrivate {
+  std::unique_ptr<paddle::ParameterUpdater> updater;
+};
+
+struct ParameterPrivate {
+  std::shared_ptr<paddle::Parameter> sharedPtr;
+  paddle::Parameter* rawPtr;  // rawPtr only used in ParameterUpdater,
+                              // in other situation sharedPtr should
+                              // contains value.
+
+  ParameterPrivate() : sharedPtr(nullptr), rawPtr(nullptr) {}
+
+  paddle::Parameter* getPtr() {
+    if (sharedPtr) {
+      return sharedPtr.get();
+    } else {
+      return rawPtr;
+    }
+  }
+};
+
+struct EvaluatorPrivate {
+  paddle::Evaluator* rawPtr;
+
+  EvaluatorPrivate() : rawPtr(nullptr) {}
+  ~EvaluatorPrivate() { delete rawPtr; }
+};
diff --git a/paddle/legacy/api/Parameter.cpp b/paddle/legacy/api/Parameter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f05740eb750cccd8cfb6cbc826a04585ec06822e
--- /dev/null
+++ b/paddle/legacy/api/Parameter.cpp
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/parameter/Parameter.h"
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+
+Parameter::Parameter() : m(new ParameterPrivate()) {}
+
+Parameter::~Parameter() { delete m; }
+
+Parameter* Parameter::createFromRawPtr(void* ptr) {
+  auto p = new Parameter();
+  p->m->rawPtr = *static_cast<paddle::Parameter**>(ptr);
+  return p;
+}
+
+Parameter* Parameter::createFromSharedPtr(void* ptr) {
+  auto& p = *(paddle::ParameterPtr*)(ptr);
+  if (p == nullptr) {
+    return nullptr;
+  } else {
+    auto retParam = new Parameter();
+    retParam->m->sharedPtr = p;
+    return retParam;
+  }
+}
+
+std::string Parameter::getName() const { return m->getPtr()->getName(); }
+
+Vector* Parameter::getBuf(ParameterType type) {
+  auto buf = m->getPtr()->getBuf(type);
+  return Vector::createByPaddleVectorPtr(&buf);
+}
+
+ParameterConfig* Parameter::getConfig() {
+  if (m->sharedPtr) {
+    return ParameterConfig::createParameterConfigFromParameterSharedPtr(
+        &m->sharedPtr);
+  } else {
+    return ParameterConfig::createParameterConfigFromParameterPtr(m->rawPtr);
+  }
+}
+
+size_t Parameter::getID() const { return m->getPtr()->getID(); }
+
+void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
+
+bool Parameter::save(const std::string& filename) const {
+  return m->getPtr()->save(filename);
+}
+
+bool Parameter::load(const std::string& filename) const {
+  return m->getPtr()->load(filename);
+}
+
+size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
diff --git a/paddle/legacy/api/ParameterOptimizer.cpp b/paddle/legacy/api/ParameterOptimizer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..477d9dae44362f9073639093c3c4d1cf0ac12044
--- /dev/null
+++ b/paddle/legacy/api/ParameterOptimizer.cpp
@@ -0,0 +1,124 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/parameter/ParameterOptimizer.h"
+#include <algorithm>
+#include "Internal.h"
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+
+struct ParameterOptimizerPrivate {
+  std::unique_ptr<paddle::ParameterOptimizer> optimizer;
+};
+
+struct ParameterTraverseCallbackPrivate {
+  paddle::ParameterOptimizer::TraverseCallback callback;
+
+  ParameterTraverseCallbackPrivate() {}
+
+  ParameterTraverseCallbackPrivate(
+      const paddle::ParameterOptimizer::TraverseCallback& callback)
+      : callback(callback) {}
+
+  void apply(const std::vector<Vector*>& vecs,
+             const ParameterConfig& conf,
+             size_t sparseId) {
+    std::vector<paddle::VectorPtr> real_vecs;
+    real_vecs.resize(vecs.size());
+    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
+      if (v) {
+        return *(paddle::VectorPtr*)(v->getSharedPtr());
+      } else {
+        return paddle::VectorPtr();
+      }
+    });
+
+    paddle::ParameterConfig& real_conf =
+        *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
+                                        .getRawPtr());
+    callback(real_vecs.data(), real_conf, sparseId);
+  }
+};
+
+ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
+
+ParameterOptimizer::~ParameterOptimizer() { delete m; }
+
+ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
+  CHECK(config != nullptr);
+  auto retOptimizer = new ParameterOptimizer();
+  retOptimizer->m->optimizer.reset(
+      paddle::ParameterOptimizer::create(config->m->getConfig(), false));
+  return retOptimizer;
+}
+
+void ParameterOptimizer::init(size_t numRows, const ParameterConfig* config) {
+  auto& conf = *(paddle::ParameterConfig*)(const_cast<ParameterConfig*>(config)
+                                               ->getRawPtr());
+  m->optimizer->init(numRows, &conf);
+}
+
+void ParameterOptimizer::startPass() { m->optimizer->startPass(); }
+
+void ParameterOptimizer::finishPass() { m->optimizer->finishPass(); }
+
+void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
+  constexpr size_t high_1 = 1UL << (sizeof(size_t) * 8 - 1);
+  CHECK_EQ(numSamplesProcessed & high_1, 0UL);  // Safely cast.
+  m->optimizer->startBatch((int64_t)numSamplesProcessed);
+}
+
+void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
+
+void ParameterOptimizer::update(const std::vector<Vector*>& vecs,
+                                const ParameterConfig& conf,
+                                size_t sparseId) {
+  ParameterTraverseCallbackPrivate invoker(
+      [&](const paddle::VectorPtr _vecs[],
+          const paddle::ParameterConfig& config,
+          size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
+  invoker.apply(vecs, conf, sparseId);
+}
+
+std::vector<int> ParameterOptimizer::getParameterTypes() const {
+  std::vector<int> returnValue;
+  staticCastVector(&returnValue, m->optimizer->getParameterTypes());
+  return returnValue;
+}
+
+ParameterTraverseCallback::ParameterTraverseCallback()
+    : m(new ParameterTraverseCallbackPrivate()) {}
+
+ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }
+
+void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
+                                      const ParameterConfig& conf,
+                                      size_t sparseId) {
+  m->apply(vecs, conf, sparseId);
+}
+
+ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
+    const ParameterConfig& config) const {
+  auto& param_config =
+      *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(config)
+           .getRawPtr();
+  auto callback = m->optimizer->needSpecialTraversal(param_config);
+  if (callback) {
+    auto retCallback = new ParameterTraverseCallback();
+    retCallback->m->callback = callback;
+    return retCallback;
+  } else {
+    return nullptr;
+  }
+}
diff --git a/paddle/legacy/api/ParameterUpdater.cpp b/paddle/legacy/api/ParameterUpdater.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..44af3f4635f2bda07d0079faff0bbc1ec7ed3954
--- /dev/null
+++ b/paddle/legacy/api/ParameterUpdater.cpp
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+
+#include "PaddleAPIPrivate.h"
+#ifndef PADDLE_WITHOUT_GOLANG
+#include "paddle/legacy/trainer/NewRemoteParameterUpdater.h"
+#endif
+#include "paddle/legacy/trainer/RemoteParameterUpdater.h"
+#include "paddle/legacy/trainer/ThreadParameterUpdater.h"
+
+ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
+
+ParameterUpdater *ParameterUpdater::createLocalUpdater(
+    OptimizationConfig *config) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(
+      new paddle::SgdThreadUpdater(config->m->getConfig()));
+  return updater;
+}
+
+ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
+    OptimizationConfig *config,
+    const std::string pserverSpec,
+    const bool useEtcd) throw(UnsupportError) {
+#ifndef PADDLE_WITHOUT_GOLANG
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
+      config->m->getConfig(), pserverSpec, useEtcd));
+  return updater;
+#else
+  throw UnsupportError("not compiled with WITH_GOLANG");
+#endif
+}
+
+ParameterUpdater *ParameterUpdater::createRemoteUpdater(
+    OptimizationConfig *config, int passCount, bool useSparseUpdater) {
+  auto updater = new ParameterUpdater();
+  auto remoteUpdater = new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr);
+  if (useSparseUpdater) {
+    std::unique_ptr<paddle::ParameterUpdater> remoteUpdaterPtr(remoteUpdater);
+    auto sparseRemoteUpdater =
+        new paddle::SparseRemoteParameterUpdaterComposite(
+            config->m->getConfig(),
+            passCount,
+            false,
+            std::move(remoteUpdaterPtr));
+    updater->m->updater.reset(sparseRemoteUpdater);
+  } else {
+    updater->m->updater.reset(remoteUpdater);
+  }
+  return updater;
+}
+
+ParameterUpdater::~ParameterUpdater() { delete m; }
+
+void ParameterUpdater::init(const GradientMachine &gm) {
+  m->updater->init(gm.m->machine->getNonStaticParameters());
+}
+
+void ParameterUpdater::startPass() { m->updater->startPass(); }
+
+void ParameterUpdater::finishPass() { m->updater->finishPass(); }
+
+PassType ParameterUpdater::startBatch(size_t batchSize) {
+  return m->updater->startBatch((int64_t)batchSize);
+}
+
+void ParameterUpdater::finishBatch(float cost) {
+  m->updater->finishBatch(cost);
+}
+
+void ParameterUpdater::update(Parameter *param) {
+  auto paddleParam = param->m->getPtr();
+  m->updater->update(paddleParam);
+}
+
+void ParameterUpdater::getParametersRemote(bool fullSize, bool apply) {
+  m->updater->getParametersRemote(fullSize, apply);
+}
+
+void ParameterUpdater::restore() { m->updater->restore(); }
+
+void ParameterUpdater::apply() { m->updater->apply(); }
+
+void ParameterUpdater::catchUpWith() { m->updater->catchUpWith(); }
diff --git a/paddle/legacy/api/SequenceGenerator.cpp b/paddle/legacy/api/SequenceGenerator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a73228f6d4770d9be31defd7a5dc217fc5c21f2
--- /dev/null
+++ b/paddle/legacy/api/SequenceGenerator.cpp
@@ -0,0 +1,242 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <iterator>
+#include <sstream>
+#include <vector>
+#include "PaddleAPI.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/legacy/parameter/Argument.h"
+#include "paddle/legacy/utils/Flags.h"
+
+// used to represent partial sequence
+struct Path {
+  std::vector<int> ids;
+  float logProb;
+  paddle::MachineState machineState;
+
+  Path() { logProb = 0; }
+
+  Path(std::vector<int>& ids, float logProb, paddle::MachineState& machineState)
+      : ids(ids), logProb(logProb), machineState(machineState) {}
+
+  bool operator<(const Path& other) const { return (logProb > other.logProb); }
+};
+
+// Return top k (k == beam_size) optimal paths using beam search. The last
+// element of inArgs is the Argument of feedback. gradMachine has MaxIdLayer
+// as output and outArgs thus stores top k labels and their probabilities per
+// position
+static void findNBest(paddle::GradientMachine* gradMachine,
+                      std::vector<paddle::Argument>& inArgs,
+                      std::vector<Path>& finalPaths,
+                      size_t bos_id,
+                      size_t eos_id,
+                      size_t max_length) {
+  std::vector<Path> paths;
+  Path emptyPath;
+  paths.push_back(emptyPath);
+  finalPaths.clear();
+  gradMachine->resetState();
+  paddle::Argument feedback = inArgs.back();
+  feedback.ids->setElement(0, (int)(bos_id));
+  float minFinalPathLogProb = 0;
+  size_t beam = 0;
+  int id;
+  std::vector<paddle::Argument> outArgs;
+  while (true) {  // iterate over each generated word
+    std::vector<Path> newPaths;
+    paddle::MachineState machineState;
+    for (size_t j = 0; j < paths.size(); j++) {
+      Path& path = paths[j];
+      if (path.machineState.size() > 0) {
+        gradMachine->setState(path.machineState);
+        feedback.ids->setElement(0, path.ids.back());
+      }
+      gradMachine->forward(inArgs, &outArgs, paddle::PASS_TEST);
+      gradMachine->getState(machineState);
+      beam = outArgs[0].ids->getSize();
+      for (size_t k = 0; k < beam; k++) {
+        id = outArgs[0].ids->getElement(k);
+        float prob = outArgs[0].in->getElement(0, k);
+        std::vector<int> nids(path.ids);
+        nids.push_back(id);
+        float newLogProb = path.logProb + log(prob);
+        Path newPath(nids, newLogProb, machineState);
+        if (id == (int)eos_id || nids.size() >= max_length) {
+          finalPaths.push_back(newPath);
+          if (minFinalPathLogProb > newPath.logProb) {
+            minFinalPathLogProb = newPath.logProb;
+          }
+        } else {
+          newPaths.push_back(newPath);
+        }
+      }
+    }
+
+    if (newPaths.size() == 0) {
+      break;
+    }
+    std::nth_element(newPaths.begin(),
+                     newPaths.begin() + std::min(beam, newPaths.size()),
+                     newPaths.end());
+    if (newPaths.size() > beam) {
+      newPaths.resize(beam);
+    }
+    // pathA < pathB means pathA.logProb > pathB.logProb
+    float maxPathLogProb =
+        std::min_element(newPaths.begin(), newPaths.end())->logProb;
+    if (finalPaths.size() >= beam && minFinalPathLogProb >= maxPathLogProb) {
+      break;
+    }
+    paths = newPaths;
+  }  // end while
+
+  std::partial_sort(finalPaths.begin(),
+                    finalPaths.begin() + std::min(beam, finalPaths.size()),
+                    finalPaths.end());
+  if (finalPaths.size() > beam) {
+    finalPaths.resize(beam);
+  }
+}
+
+struct SequenceGeneratorPrivate {
+  std::shared_ptr<paddle::GradientMachine> machine;
+  std::shared_ptr<std::vector<std::string>> dict;
+  size_t beginPos;
+  size_t endPos;
+  size_t maxLength;
+
+  paddle::Argument feedback;
+
+  template <typename T>
+  inline T& cast(void* ptr) {
+    return *(T*)(ptr);
+  }
+
+  inline void findNBest(std::vector<paddle::Argument>& inArgs,
+                        std::vector<Path>& path) {
+    ::findNBest(machine.get(), inArgs, path, beginPos, endPos, maxLength);
+  }
+
+  SequenceGeneratorPrivate()
+      : dict(std::make_shared<std::vector<std::string>>()),
+        beginPos(0UL),
+        endPos(0UL),
+        maxLength(0UL),
+        feedback(__create_feedback__()) {}
+
+ private:
+  static paddle::Argument __create_feedback__() {
+    paddle::Argument feedback;
+    feedback.ids = paddle::IVector::create(/* size= */ 1, FLAGS_use_gpu);
+
+    feedback.sequenceStartPositions =
+        paddle::ICpuGpuVector::create(/* size= */ 2, /* useGpu= */ false);
+    feedback.sequenceStartPositions->getMutableData(false)[0] = 0;
+    feedback.sequenceStartPositions->getMutableData(false)[1] = 1;
+    return feedback;
+  }
+};
+
+SequenceGenerator::SequenceGenerator() : m(new SequenceGeneratorPrivate()) {}
+
+SequenceGenerator::~SequenceGenerator() { delete m; }
+
+class PathSequenceResults : public ISequenceResults {
+  // ISequenceResults interface
+ public:
+  PathSequenceResults(const std::shared_ptr<std::vector<Path>>& path,
+                      const std::shared_ptr<std::vector<std::string>>& dict)
+      : path_(path), dict_(dict) {}
+
+  size_t getSize() const { return path_->size(); }
+  std::string getSentence(size_t id, bool split) const throw(RangeError) {
+    if (id < getSize()) {
+      Path& p = (*path_)[id];
+      std::ostringstream sout;
+      std::transform(p.ids.begin(),
+                     p.ids.end(),
+                     std::ostream_iterator<std::string>(sout, split ? " " : ""),
+                     [&](int id) { return (*dict_)[id]; });
+      return sout.str();
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+  std::vector<int> getSequence(size_t id) const throw(RangeError) {
+    if (id < getSize()) {
+      Path& p = (*path_)[id];
+      return p.ids;
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+  float getScore(size_t id) const throw(RangeError) {
+    if (id < getSize()) {
+      Path& p = (*path_)[id];
+      return p.logProb;
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+
+ private:
+  std::shared_ptr<std::vector<Path>> path_;
+  std::shared_ptr<std::vector<std::string>> dict_;
+};
+
+ISequenceResults* SequenceGenerator::generateSequence(
+    const Arguments& inArgs) const {
+  auto& in_args =
+      m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
+  for (auto& arg : in_args) {
+    arg.sequenceStartPositions = m->feedback.sequenceStartPositions;
+  }
+  in_args.push_back(m->feedback);
+  auto path = std::make_shared<std::vector<Path>>();
+  m->findNBest(in_args, *path);
+  return new PathSequenceResults(path, m->dict);
+}
+
+SequenceGenerator* SequenceGenerator::createByGradientMachineSharedPtr(
+    void* ptr) {
+  SequenceGenerator* r = new SequenceGenerator();
+  r->m->machine = r->m->cast<std::shared_ptr<paddle::GradientMachine>>(ptr);
+  return r;
+}
+
+void SequenceGenerator::setDict(const std::vector<std::string>& dict) {
+  *m->dict = dict;
+}
+
+void SequenceGenerator::setBos(size_t bos) { m->beginPos = bos; }
+
+void SequenceGenerator::setEos(size_t eos) { m->endPos = eos; }
+
+void SequenceGenerator::setMaxLength(size_t maxLength) {
+  m->maxLength = maxLength;
+}
+
+void SequenceGenerator::setBeamSize(size_t beamSize) {
+  if (beamSize != -1UL) {
+    FLAGS_beam_size = beamSize;
+  }
+}
+
+ISequenceResults::~ISequenceResults() {}
diff --git a/paddle/legacy/api/Trainer.cpp b/paddle/legacy/api/Trainer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e7c607201b0b946a6d6b2f3da35356e2c4e5e15e
--- /dev/null
+++ b/paddle/legacy/api/Trainer.cpp
@@ -0,0 +1,175 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
+
+#include <stdlib.h>
+#include <atomic>
+#include <memory>
+
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/legacy/trainer/ParamUtil.h"
+#include "paddle/legacy/trainer/Trainer.h"
+#include "paddle/legacy/trainer/TrainerInternal.h"
+#include "paddle/legacy/utils/Flags.h"
+
+using paddle::real;
+
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);
+
+struct TrainerPrivate : public paddle::Trainer {
+  bool _trainOneBatch(size_t batchSize);
+  bool forwardOneBatch(size_t batchSize);
+  void forwardOneDataBatch(const std::vector<paddle::Argument>& inArgs);
+  void setBatchSize(size_t batchSize);
+  std::vector<paddle::Argument>& getForwardOutput();
+
+  void startTestPeriod();
+  void finishTestPeriod();
+  void testOneDataBatch(const paddle::DataBatch& dataBatch);
+  TrainerPrivate() : paddle::Trainer() {}
+};
+
+Trainer::Trainer() : m(new TrainerPrivate()) {
+  auto conf = paddle::TrainerConfigHelper::createFromFlags();
+  if (conf != nullptr) {
+    m->init(conf);
+  }
+}
+
+Trainer::~Trainer() { delete m; }
+
+Trainer* Trainer::createByCommandLine() throw(IOError) {
+  auto retv = new Trainer();
+  if (retv->m->getConfig().IsInitialized()) {
+    return retv;
+  } else {
+    throw IOError();
+  }
+}
+
+Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
+    : m(new TrainerPrivate()) {
+  m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
+}
+
+Trainer* Trainer::create(TrainerConfig* config,
+                         GradientMachine* gm) throw(IOError) {
+  auto retv = new Trainer(config, gm);
+  if (retv->m->getConfig().IsInitialized()) {
+    return retv;
+  } else {
+    retv->m->getConfig().CheckInitialized();
+    throw IOError();
+  }
+}
+
+void Trainer::startTrain() { m->startTrain(); }
+
+void Trainer::finishTrain() { m->finishTrain(); }
+
+void Trainer::startTrainPass() { m->startTrainPass(); }
+
+void Trainer::finishTrainPass() { m->finishTrainPass(); }
+
+void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) {
+  paddle::DataBatch dataBatch;
+  dataBatch.getStreams() = inArgs.m->outputs;
+  dataBatch.setSize(batchSize);
+  m->trainOneDataBatch(dataBatch);
+}
+
+bool Trainer::trainOneBatch(size_t batchSize) {
+  return m->_trainOneBatch(batchSize);
+}
+
+bool TrainerPrivate::_trainOneBatch(size_t batchSize) {
+  paddle::DataBatch dataBatch;
+  CHECK(dataProvider_) << "data_provider is not specified";
+  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+  if (num == 0) {
+    return false;
+  }
+  trainOneDataBatch(dataBatch);
+  return false;
+}
+
+void TrainerPrivate::startTestPeriod() {
+  if (!tester_) {
+    createTester();
+  }
+  tester_->startTestPeriod();
+}
+
+void Trainer::startTestPeriod() { m->startTestPeriod(); }
+
+void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) {
+  tester_->testOneDataBatch(dataBatch, &forwardOutput_);
+}
+
+void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
+  paddle::DataBatch dataBatch;
+  dataBatch.getStreams() = args.m->outputs;
+  dataBatch.setSize(batchSize);
+  m->testOneDataBatch(dataBatch);
+}
+
+void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
+void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
+
+Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
+  auto nn = this->m->getGradientMachine();
+  CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
+  auto arg = nn->getLayerOutput(layerName);
+  return Arguments::createByPaddleArgument(&arg);
+}
+
+void Trainer::forwardOneBatch(size_t batchSize) {
+  m->forwardOneBatch(batchSize);
+}
+
+bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
+  CHECK(dataProvider_) << "data_provider is not specified";
+  paddle::DataBatch dataBatch;
+  int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+  if (num == 0) {
+    return false;
+  }
+
+  forwardOneDataBatch(dataBatch.getStreams());
+  return true;
+}
+
+void TrainerPrivate::forwardOneDataBatch(
+    const std::vector<paddle::Argument>& inArgs) {
+  std::vector<paddle::Argument>& outArgs = forwardOutput_;
+
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    trainerInternal_.getGradientMachine()->prefetch(inArgs);
+    trainerInternal_.getParameterUpdater()->getParametersRemote();
+  }
+  trainerInternal_.getGradientMachine()->forward(
+      inArgs, &outArgs, paddle::PASS_TEST);
+}
+
+Arguments* Trainer::getForwardOutput() {
+  return Arguments::createByPaddleArgumentVector(&m->getForwardOutput());
+}
+
+std::vector<paddle::Argument>& TrainerPrivate::getForwardOutput() {
+  return forwardOutput_;
+}
diff --git a/paddle/legacy/api/Util.cpp b/paddle/legacy/api/Util.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b458c4d90ecc7333066f887dcbc93c4da5c43853
--- /dev/null
+++ b/paddle/legacy/api/Util.cpp
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+
+void initPaddle(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  paddle::initPython(argc, argv);
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+}
+
+FloatArray::FloatArray(const float* b, const size_t l)
+    : buf(b), length(l), needFree(false) {}
+
+IntArray::IntArray(const int* b, const size_t l, bool f)
+    : buf(b), length(l), needFree(f) {}
+
+IntWithFloatArray::IntWithFloatArray(const float* v,
+                                     const int* i,
+                                     size_t l,
+                                     bool f)
+    : valBuf(v), idxBuf(i), length(l), needFree(f) {}
+
+bool isUsingGpu() { return FLAGS_use_gpu; }
+
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
+
+bool isGpuVersion() {
+#ifndef PADDLE_WITH_CUDA
+  return false;
+#else
+  return true;
+#endif
+}
+
+int getTrainerCount() { return FLAGS_trainer_count; }
+
+static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
+              "The Parameter Type should be same in core/api and core/common");
diff --git a/paddle/legacy/api/Vector.cpp b/paddle/legacy/api/Vector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..73b6d3a15d6d0ddc80a17846604d9500d8f7e4e3
--- /dev/null
+++ b/paddle/legacy/api/Vector.cpp
@@ -0,0 +1,304 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PaddleAPI.h"
+
+#include "paddle/legacy/math/Vector.h"
+
+#include <cstring>
+
+struct IVectorPrivate {
+  paddle::IVectorPtr vec;
+};
+
+IVector::IVector() : m(new IVectorPrivate()) {}
+
+IVector* IVector::createZero(size_t sz, bool useGpu) {
+  auto v = new IVector();
+  v->m->vec = paddle::IVector::create(sz, useGpu);
+  v->m->vec->zeroMem();
+  return v;
+}
+
+IVector* IVector::create(const std::vector<int>& data, bool useGpu) {
+  auto v = new IVector();
+  v->m->vec = paddle::IVector::create(data.size(), useGpu);
+  v->m->vec->copyFrom(data.data(), data.size());
+  return v;
+}
+
+IVector* IVector::createVectorFromNumpy(int* data,
+                                        int dim,
+                                        bool copy,
+                                        bool useGpu) throw(UnsupportError) {
+  if (useGpu) {
+    /// if use gpu only copy=true is supported
+    if (!copy) {
+      throw UnsupportError("Gpu mode only supports copy=True");
+    }
+    return IVector::createGpuVectorFromNumpy(data, dim);
+  } else {
+    return IVector::createCpuVectorFromNumpy(data, dim, copy);
+  }
+}
+
+IVector* IVector::createCpuVectorFromNumpy(int* data, int dim, bool copy) {
+  auto v = new IVector();
+  if (copy) {
+    v->m->vec = paddle::IVector::create(dim, false);
+    v->m->vec->copyFrom(data, dim);
+  } else {
+    v->m->vec = paddle::IVector::create(data, dim, false);
+  }
+  return v;
+}
+
+IVector* IVector::createGpuVectorFromNumpy(int* data, int dim) {
+  auto v = new IVector();
+  v->m->vec = paddle::IVector::create(dim, true);
+  v->m->vec->copyFrom(data, dim);
+  return v;
+}
+
+bool IVector::isGpu() const {
+  return dynamic_cast<paddle::GpuIVector*>(m->vec.get()) != nullptr;
+}
+
+IntArray IVector::getData() const {
+  if (this->isGpu()) {
+    int* src = m->vec->getData();
+    size_t len = m->vec->getSize();
+    int* dest = new int[len];
+    hl_memcpy_device2host(dest, src, len * sizeof(int));
+    return IntArray(dest, len, true);
+  } else {
+    return IntArray(m->vec->getData(), m->vec->getSize());
+  }
+}
+
+int& IVector::operator[](const size_t idx) throw(RangeError, UnsupportError) {
+  if (this->isGpu()) {
+    UnsupportError e;
+    throw e;
+  } else {
+    if (idx >= m->vec->getSize()) {
+      RangeError e;
+      throw e;
+    }
+  }
+  return m->vec->getData()[idx];
+}
+
+const int& IVector::operator[](const size_t idx) const
+    throw(RangeError, UnsupportError) {
+  return (*const_cast<IVector*>(this))[idx];
+}
+
+IVector* IVector::createByPaddleVectorPtr(void* ptr) {
+  auto* p = (paddle::IVectorPtr*)ptr;
+  if ((*p) != nullptr) {
+    IVector* vec = new IVector();
+    vec->m->vec = *p;
+    return vec;
+  } else {
+    return nullptr;
+  }
+}
+
+IVector::~IVector() { delete m; }
+
+void* IVector::getSharedPtr() const { return &m->vec; }
+
+size_t IVector::getSize() const { return m->vec->getSize(); }
+
+void IVector::toNumpyArrayInplace(int** data, int* dim1) throw(UnsupportError) {
+  auto v = std::dynamic_pointer_cast<paddle::CpuIVector>(m->vec);
+  if (v) {
+    *data = v->getData();
+    *dim1 = v->getSize();
+  } else {
+    throw UnsupportError();
+  }
+}
+
+void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
+  *dim1 = m->vec->getSize();
+  *view_m_data = new int[*dim1];
+  if (auto cpuVec = dynamic_cast<paddle::CpuIVector*>(m->vec.get())) {
+    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
+  } else if (auto gpuVec = dynamic_cast<paddle::GpuIVector*>(m->vec.get())) {
+    hl_memcpy_device2host(
+        *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
+  } else {
+    LOG(INFO) << "Unexpected situation";
+  }
+}
+
+void IVector::copyFromNumpyArray(int* data, int dim) {
+  m->vec->resize(dim);
+  m->vec->copyFrom(data, dim);
+}
+
+struct VectorPrivate {
+  paddle::VectorPtr vec;
+
+  void safeAccessData(const size_t idx,
+                      const std::function<void(float&)>& func) const
+      throw(RangeError, UnsupportError) {
+    auto cpuVec = std::dynamic_pointer_cast<const paddle::CpuVector>(vec);
+    if (cpuVec != nullptr) {
+      if (idx < vec->getSize()) {
+        func(vec->getData()[idx]);
+      } else {
+        throw RangeError();
+      }
+    } else {
+      throw UnsupportError();
+    }
+  }
+};
+
+Vector::Vector() : m(new VectorPrivate()) {}
+
+Vector::~Vector() { delete m; }
+
+Vector* Vector::createZero(size_t sz, bool useGpu) {
+  auto retVec = new Vector();
+  retVec->m->vec = paddle::Vector::create(sz, useGpu);
+  retVec->m->vec->zero();
+  return retVec;
+}
+
+Vector* Vector::create(const std::vector<float>& data, bool useGpu) {
+  auto retVec = new Vector();
+  retVec->m->vec = paddle::Vector::create(data.size(), useGpu);
+  retVec->m->vec->copyFrom(data.data(), data.size());
+  return retVec;
+}
+
+Vector* Vector::createByPaddleVectorPtr(void* ptr) {
+  auto& v = *(paddle::VectorPtr*)(ptr);
+  if (v == nullptr) {
+    return nullptr;
+  } else {
+    auto retVec = new Vector();
+    retVec->m->vec = v;
+    return retVec;
+  }
+}
+
+Vector* Vector::createVectorFromNumpy(float* data,
+                                      int dim,
+                                      bool copy,
+                                      bool useGpu) throw(UnsupportError) {
+  if (useGpu) {
+    /// if use gpu only copy=True is supported
+    if (!copy) {
+      throw UnsupportError("Gpu mode only supports copy=True");
+    }
+    return Vector::createGpuVectorFromNumpy(data, dim);
+  } else {
+    return Vector::createCpuVectorFromNumpy(data, dim, copy);
+  }
+}
+
+Vector* Vector::createCpuVectorFromNumpy(float* data, int dim, bool copy) {
+  CHECK_GT(dim, 0);
+  auto retVec = new Vector();
+  if (copy) {
+    retVec->m->vec = paddle::Vector::create((size_t)dim, false);
+    retVec->m->vec->copyFrom(data, dim);
+  } else {
+    retVec->m->vec = paddle::Vector::create(data, (size_t)dim, false);
+  }
+  return retVec;
+}
+
+Vector* Vector::createGpuVectorFromNumpy(float* data, int dim) {
+  CHECK_GT(dim, 0);
+  auto retVec = new Vector();
+  retVec->m->vec = paddle::Vector::create((size_t)dim, true);
+  retVec->m->vec->copyFrom(data, (size_t)dim);
+  return retVec;
+}
+
+void Vector::toNumpyArrayInplace(float** view_data,
+                                 int* dim1) throw(UnsupportError) {
+  auto v = std::dynamic_pointer_cast<paddle::CpuVector>(m->vec);
+  if (v != nullptr) {
+    *view_data = v->getData();
+    *dim1 = (int)v->getSize();
+  } else {
+    throw UnsupportError();
+  }
+}
+
+void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
+  *dim1 = m->vec->getSize();
+  *view_m_data = new float[*dim1];
+  if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
+    std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
+  } else if (auto gpuVec = dynamic_cast<paddle::GpuVector*>(m->vec.get())) {
+    hl_memcpy_device2host(
+        *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
+  } else {
+    LOG(INFO) << "Unexpected situation";
+  }
+}
+
+void Vector::copyFromNumpyArray(float* data, int dim) {
+  m->vec->resize(dim);
+  m->vec->copyFrom(data, dim);
+}
+
+FloatArray Vector::getData() const {
+  if (this->isGpu()) {
+    float* src = m->vec->getData();
+    size_t len = m->vec->getSize();
+    float* dest = new float[len];
+    hl_memcpy_device2host(dest, src, len * sizeof(float));
+    FloatArray ret_val(dest, len);
+    ret_val.needFree = true;
+    return ret_val;
+  } else {
+    FloatArray ret_val(m->vec->getData(), m->vec->getSize());
+    return ret_val;
+  }
+}
+
+void Vector::copyFrom(Vector* src) throw(RangeError) {
+  if (src->m->vec->getSize() != m->vec->getSize()) {
+    throw RangeError();
+  }
+  m->vec->copyFrom(*src->m->vec);
+}
+
+bool Vector::isGpu() const {
+  return std::dynamic_pointer_cast<paddle::GpuVector>(m->vec) != nullptr;
+}
+
+float Vector::get(const size_t idx) const throw(RangeError, UnsupportError) {
+  float r;
+  m->safeAccessData(idx, [&](float& o) { r = o; });
+  return r;
+}
+
+void Vector::set(const size_t idx, float val) throw(RangeError,
+                                                    UnsupportError) {
+  m->safeAccessData(idx, [&](float& o) { o = val; });
+}
+
+size_t Vector::getSize() const { return m->vec->getSize(); }
+
+void* Vector::getSharedPtr() { return &m->vec; }
diff --git a/paddle/api/__init__.py b/paddle/legacy/api/__init__.py
similarity index 100%
rename from paddle/api/__init__.py
rename to paddle/legacy/api/__init__.py
diff --git a/paddle/api/numpy.i b/paddle/legacy/api/numpy.i
similarity index 100%
rename from paddle/api/numpy.i
rename to paddle/legacy/api/numpy.i
diff --git a/paddle/api/test/.gitignore b/paddle/legacy/api/test/.gitignore
similarity index 100%
rename from paddle/api/test/.gitignore
rename to paddle/legacy/api/test/.gitignore
diff --git a/paddle/legacy/api/test/CMakeLists.txt b/paddle/legacy/api/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..13cb79129cc2272d215cdb475fb146b37266699e
--- /dev/null
+++ b/paddle/legacy/api/test/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/testTrain.py
+    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/*.py ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_api_test ALL DEPENDS testTrain.py)
+
+py_test(testTrain SRCS testTrain.py)
+py_test(testMatrix SRCS testMatrix.py)
+py_test(testVector SRCS testVector.py)
+py_test(testTrainer SRCS testTrainer.py)
+py_test(testArguments SRCS testArguments.py)
+py_test(testGradientMachine SRCS testGradientMachine.py)
diff --git a/paddle/api/test/testArguments.py b/paddle/legacy/api/test/testArguments.py
similarity index 100%
rename from paddle/api/test/testArguments.py
rename to paddle/legacy/api/test/testArguments.py
diff --git a/paddle/api/test/testGradientMachine.py b/paddle/legacy/api/test/testGradientMachine.py
similarity index 100%
rename from paddle/api/test/testGradientMachine.py
rename to paddle/legacy/api/test/testGradientMachine.py
diff --git a/paddle/api/test/testMatrix.py b/paddle/legacy/api/test/testMatrix.py
similarity index 100%
rename from paddle/api/test/testMatrix.py
rename to paddle/legacy/api/test/testMatrix.py
diff --git a/paddle/api/test/testTrain.py b/paddle/legacy/api/test/testTrain.py
similarity index 100%
rename from paddle/api/test/testTrain.py
rename to paddle/legacy/api/test/testTrain.py
diff --git a/paddle/api/test/testTrainConfig.py b/paddle/legacy/api/test/testTrainConfig.py
similarity index 100%
rename from paddle/api/test/testTrainConfig.py
rename to paddle/legacy/api/test/testTrainConfig.py
diff --git a/paddle/api/test/testTrainer.py b/paddle/legacy/api/test/testTrainer.py
similarity index 100%
rename from paddle/api/test/testTrainer.py
rename to paddle/legacy/api/test/testTrainer.py
diff --git a/paddle/api/test/testVector.py b/paddle/legacy/api/test/testVector.py
similarity index 100%
rename from paddle/api/test/testVector.py
rename to paddle/legacy/api/test/testVector.py
diff --git a/paddle/api/test/util.py b/paddle/legacy/api/test/util.py
similarity index 100%
rename from paddle/api/test/util.py
rename to paddle/legacy/api/test/util.py
diff --git a/paddle/capi/Arguments.cpp b/paddle/legacy/capi/Arguments.cpp
similarity index 100%
rename from paddle/capi/Arguments.cpp
rename to paddle/legacy/capi/Arguments.cpp
diff --git a/paddle/legacy/capi/CMakeLists.txt b/paddle/legacy/capi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..957b1a3e6b07b058a76605992da387b43657146a
--- /dev/null
+++ b/paddle/legacy/capi/CMakeLists.txt
@@ -0,0 +1,118 @@
+if (WITH_DOUBLE)
+  set(PADDLE_FLOAT_TYPE double)
+else ()
+  set(PADDLE_FLOAT_TYPE float)
+endif()
+
+execute_process(
+  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_GIT_COMMIT
+  RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT PADDLE_GIT_COMMIT)
+  set(PADDLE_GIT_COMMIT "no commit information")
+endif()
+
+# config.h used for C-API. It will store Paddle building configuration as a
+# header. Make user just include PaddleCAPI.h then can get building
+# configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their
+# libraries.
+configure_file(config.h.in config.h @ONLY)
+
+# PaddleCAPI.h is the only header we exposed. It currently only used for model
+# inference.
+file(GLOB CAPI_HEADERS *.h)
+set(CAPI_PRIVATE_HEADER capi_private.h)
+list(REMOVE_ITEM CAPI_HEADERS ${CAPI_PRIVATE_HEADER})
+file(GLOB CAPI_SOURCES *.cpp)
+
+# building paddle_capi
+add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
+  ${CAPI_SOURCES})
+
+target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+
+add_dependencies(paddle_capi paddle_proto paddle_gserver)
+
+# TODO: paddle_capi_whole will be removed.
+set(PADDLE_CAPI_LAYERS_LIBS
+    paddle_function
+    paddle_gserver)
+if(MOBILE_INFERENCE)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto)
+else()
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto
+      paddle_pserver
+      paddle_network)
+endif()
+set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
+cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
+
+# Link the static library for inference
+cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
+cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
+
+# Link the shared library for inference
+if(NOT IOS)
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map")
+  add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
+  set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+  link_paddle_exe(paddle_capi_shared)
+endif()
+
+# install library & headers.
+install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
+install(FILES paddle_capi.map DESTINATION include/paddle)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
+if(ANDROID)
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared
+          ARCHIVE DESTINATION lib/${ANDROID_ABI}
+          LIBRARY DESTINATION lib/${ANDROID_ABI})
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+    OUTPUT_VARIABLE GIT_COMMITS_LIST
+    RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(${GIT_COMMITS_LIST_RESULT})
+    set(GIT_COMMITS_LIST "No commits.")
+  endif()
+  install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt
+          \"Compiler:\n\"
+          \"\\t${CMAKE_C_COMPILER}\\n\"
+          \"\\t${CMAKE_CXX_COMPILER}\\n\"
+          \"Compiler Flags:\\n\"
+          \"\\t${CMAKE_F_FLAGS}\\n\"
+          \"\\t${CMAKE_CXX_FLAGS}\\n\"
+          \"Android API: ${CMAKE_SYSTEM_VERSION}\\n\"
+          \"Lastest commit:\\n\"
+          \"\\t${GIT_COMMITS_LIST}\\n\"
+      )"
+  )
+else(ANDROID)
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib)
+  if(NOT IOS)
+    install(TARGETS paddle_capi_shared DESTINATION lib)
+  endif()
+endif(ANDROID)
+
+# this variable used for unittest
+set(PADDLE_CAPI_INC_PATH
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR})
+
+if (WITH_TESTING)
+  add_subdirectory(tests)
+endif()
diff --git a/paddle/legacy/capi/Main.cpp b/paddle/legacy/capi/Main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..17d8f00a88a9fd0818e6b90f8f6888b7d793a46e
--- /dev/null
+++ b/paddle/legacy/capi/Main.cpp
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fenv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include "capi_private.h"
+#include "main.h"
+#include "paddle/legacy/trainer/TrainerConfigHelper.h"
+#include "paddle/legacy/utils/Excepts.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+
+static void initPaddle(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  paddle::initPython(argc, argv);
+}
+
+extern "C" {
+paddle_error paddle_init(int argc, char** argv) {
+  static bool isInit = false;
+  if (isInit) return kPD_NO_ERROR;
+
+  std::vector<char*> realArgv;
+  realArgv.reserve(argc + 1);
+  realArgv.push_back(strdup(""));
+  for (int i = 0; i < argc; ++i) {
+    realArgv.push_back(argv[i]);
+  }
+  initPaddle(argc + 1, realArgv.data());
+  free(realArgv[0]);
+  isInit = true;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_init_thread() {
+  if (FLAGS_use_gpu) {
+    hl_init(FLAGS_gpu_id);
+  }
+  return kPD_NO_ERROR;
+}
+}
diff --git a/paddle/legacy/capi/Matrix.cpp b/paddle/legacy/capi/Matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..733d49cacfda17ad19b7bd7918be73c1fd14a64f
--- /dev/null
+++ b/paddle/legacy/capi/Matrix.cpp
@@ -0,0 +1,171 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi_private.h"
+#include "hl_cuda.h"
+#include "matrix.h"
+
+#define cast(v) paddle::capi::cast<paddle::capi::CMatrix>(v)
+extern "C" {
+paddle_matrix paddle_matrix_create(uint64_t height,
+                                   uint64_t width,
+                                   bool useGpu) {
+  auto ptr = new paddle::capi::CMatrix();
+  ptr->mat = paddle::Matrix::create(height, width, false, useGpu);
+  return ptr;
+}
+
+paddle_matrix paddle_matrix_create_none() {
+  return new paddle::capi::CMatrix();
+}
+
+paddle_error paddle_matrix_destroy(paddle_matrix mat) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  delete ptr;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_set_row(paddle_matrix mat,
+                                   uint64_t rowID,
+                                   paddle_real* rowArray) {
+  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
+  paddle::real* buf = ptr->mat->getRowBuf(rowID);
+  size_t width = ptr->mat->getWidth();
+#ifdef PADDLE_WITH_CUDA
+  hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
+#else
+  std::copy(rowArray, rowArray + width, buf);
+#endif
+  return kPD_NO_ERROR;
+}
+
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+                                            paddle_real* value) {
+  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* buf = ptr->mat->getRowBuf(0);
+  size_t width = ptr->mat->getWidth();
+  size_t height = ptr->mat->getHeight();
+  if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+    hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
+#else
+    return kPD_NOT_SUPPORTED;
+#endif
+  } else {
+    std::copy(value, value + width * height, buf);
+  }
+  return kPD_NO_ERROR;
+}
+
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+                                            paddle_real* result) {
+  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* buf = ptr->mat->getRowBuf(0);
+  size_t width = ptr->mat->getWidth();
+  size_t height = ptr->mat->getHeight();
+  if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+    hl_memcpy(result, buf, width * height * sizeof(paddle::real));
+#else
+    return kPD_NOT_SUPPORTED;
+#endif
+  } else {
+    std::copy(buf, buf + width * height, result);
+  }
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_get_row(paddle_matrix mat,
+                                   uint64_t rowID,
+                                   paddle_real** rawRowBuffer) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
+  *rawRowBuffer = ptr->mat->getRowBuf(rowID);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_get_shape(paddle_matrix mat,
+                                     uint64_t* height,
+                                     uint64_t* width) {
+  if (mat == nullptr || cast(mat)->mat == nullptr) return kPD_NULLPTR;
+  if (height != nullptr) {
+    *height = cast(mat)->mat->getHeight();
+  }
+  if (width != nullptr) {
+    *width = cast(mat)->mat->getWidth();
+  }
+  return kPD_NO_ERROR;
+}
+}
+
+paddle_matrix paddle_matrix_create_sparse(
+    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+#ifndef PADDLE_MOBILE_INFERENCE
+  auto ptr = new paddle::capi::CMatrix();
+  ptr->mat = paddle::Matrix::createSparseMatrix(
+      height,
+      width,
+      nnz,
+      isBinary ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+      paddle::SPARSE_CSR,
+      false,
+      useGpu);
+  return ptr;
+#else
+  return nullptr;
+#endif
+}
+
+paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
+                                            int* rowArray,
+                                            uint64_t rowSize,
+                                            int* colArray,
+                                            uint64_t colSize,
+                                            float* valueArray,
+                                            uint64_t valueSize) {
+#ifndef PADDLE_MOBILE_INFERENCE
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (rowArray == nullptr || colArray == nullptr ||
+      (valueSize != 0 && valueArray == nullptr) || ptr->mat == nullptr) {
+    return kPD_NULLPTR;
+  }
+  if (auto sparseMat = dynamic_cast<paddle::CpuSparseMatrix*>(ptr->mat.get())) {
+    std::vector<int> row(rowSize);
+    row.assign(rowArray, rowArray + rowSize);
+    std::vector<int> col(colSize);
+    col.assign(colArray, colArray + colSize);
+    std::vector<paddle_real> val(valueSize);
+    if (valueSize) {
+      val.assign(valueArray, valueArray + valueSize);
+    }
+    sparseMat->copyFrom(row, col, val);
+    return kPD_NO_ERROR;
+  } else {
+    return kPD_NOT_SUPPORTED;
+  }
+#else
+  return kPD_NOT_SUPPORTED;
+#endif
+}
diff --git a/paddle/capi/Vector.cpp b/paddle/legacy/capi/Vector.cpp
similarity index 100%
rename from paddle/capi/Vector.cpp
rename to paddle/legacy/capi/Vector.cpp
diff --git a/paddle/capi/arguments.h b/paddle/legacy/capi/arguments.h
similarity index 100%
rename from paddle/capi/arguments.h
rename to paddle/legacy/capi/arguments.h
diff --git a/paddle/capi/capi.h b/paddle/legacy/capi/capi.h
similarity index 100%
rename from paddle/capi/capi.h
rename to paddle/legacy/capi/capi.h
diff --git a/paddle/legacy/capi/capi_private.h b/paddle/legacy/capi/capi_private.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5f8c8c5c8bd506f9c8f49ee7d03f9b20460efdb
--- /dev/null
+++ b/paddle/legacy/capi/capi_private.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/parameter/Argument.h"
+#pragma once
+
+namespace paddle {
+namespace capi {
+
+enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE };
+
+#define STRUCT_HEADER CType type;
+
+struct CHeader {
+  STRUCT_HEADER
+};
+
+struct CIVector {
+  STRUCT_HEADER
+  IVectorPtr vec;
+
+  CIVector() : type(kIVECTOR) {}
+};
+
+struct CMatrix {
+  STRUCT_HEADER
+  MatrixPtr mat;
+
+  CMatrix() : type(kMATRIX) {}
+};
+
+struct CArguments {
+  STRUCT_HEADER
+  std::vector<paddle::Argument> args;
+
+  CArguments() : type(kARGUMENTS) {}
+
+  template <typename T>
+  paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) {
+    if (ID >= args.size()) return kPD_OUT_OF_RANGE;
+    switch (nestedLevel) {
+      case 0:
+        callback(args[ID].sequenceStartPositions);
+        break;
+      case 1:
+        callback(args[ID].subSequenceStartPositions);
+        break;
+      default:
+        return kPD_OUT_OF_RANGE;
+    }
+    return kPD_NO_ERROR;
+  }
+};
+
+struct CGradientMachine {
+  STRUCT_HEADER
+  paddle::GradientMachinePtr machine;
+
+  CGradientMachine() : type(kGRADIENT_MACHINE) {}
+};
+
+template <typename T>
+inline T* cast(void* ptr) {
+  return reinterpret_cast<T*>(ptr);
+}
+}  // namespace capi
+}  // namespace paddle
diff --git a/paddle/capi/config.h.in b/paddle/legacy/capi/config.h.in
similarity index 100%
rename from paddle/capi/config.h.in
rename to paddle/legacy/capi/config.h.in
diff --git a/paddle/capi/error.cpp b/paddle/legacy/capi/error.cpp
similarity index 100%
rename from paddle/capi/error.cpp
rename to paddle/legacy/capi/error.cpp
diff --git a/paddle/capi/error.h b/paddle/legacy/capi/error.h
similarity index 100%
rename from paddle/capi/error.h
rename to paddle/legacy/capi/error.h
diff --git a/paddle/capi/examples/.gitignore b/paddle/legacy/capi/examples/.gitignore
similarity index 100%
rename from paddle/capi/examples/.gitignore
rename to paddle/legacy/capi/examples/.gitignore
diff --git a/paddle/capi/examples/README.md b/paddle/legacy/capi/examples/README.md
similarity index 100%
rename from paddle/capi/examples/README.md
rename to paddle/legacy/capi/examples/README.md
diff --git a/paddle/capi/examples/model_inference/README.md b/paddle/legacy/capi/examples/model_inference/README.md
similarity index 100%
rename from paddle/capi/examples/model_inference/README.md
rename to paddle/legacy/capi/examples/model_inference/README.md
diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/legacy/capi/examples/model_inference/common/common.h
similarity index 100%
rename from paddle/capi/examples/model_inference/common/common.h
rename to paddle/legacy/capi/examples/model_inference/common/common.h
diff --git a/paddle/capi/examples/model_inference/dense/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt
similarity index 100%
rename from paddle/capi/examples/model_inference/dense/CMakeLists.txt
rename to paddle/legacy/capi/examples/model_inference/dense/CMakeLists.txt
diff --git a/paddle/capi/examples/model_inference/dense/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh
similarity index 100%
rename from paddle/capi/examples/model_inference/dense/convert_protobin.sh
rename to paddle/legacy/capi/examples/model_inference/dense/convert_protobin.sh
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/legacy/capi/examples/model_inference/dense/main.c
similarity index 100%
rename from paddle/capi/examples/model_inference/dense/main.c
rename to paddle/legacy/capi/examples/model_inference/dense/main.c
diff --git a/paddle/capi/examples/model_inference/dense/merge_v2_model.py b/paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py
similarity index 100%
rename from paddle/capi/examples/model_inference/dense/merge_v2_model.py
rename to paddle/legacy/capi/examples/model_inference/dense/merge_v2_model.py
diff --git a/paddle/capi/examples/model_inference/dense/mnist_v2.py b/paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py
similarity index 100%
rename from paddle/capi/examples/model_inference/dense/mnist_v2.py
rename to paddle/legacy/capi/examples/model_inference/dense/mnist_v2.py
diff --git a/paddle/capi/examples/model_inference/dense/trainer_config.py b/paddle/legacy/capi/examples/model_inference/dense/trainer_config.py
similarity index 100%
rename from paddle/capi/examples/model_inference/dense/trainer_config.py
rename to paddle/legacy/capi/examples/model_inference/dense/trainer_config.py
diff --git a/paddle/capi/examples/model_inference/multi_thread/.gitignore b/paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore
similarity index 100%
rename from paddle/capi/examples/model_inference/multi_thread/.gitignore
rename to paddle/legacy/capi/examples/model_inference/multi_thread/.gitignore
diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt
similarity index 100%
rename from paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
rename to paddle/legacy/capi/examples/model_inference/multi_thread/CMakeLists.txt
diff --git a/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
similarity index 100%
rename from paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh
rename to paddle/legacy/capi/examples/model_inference/multi_thread/convert_protobin.sh
diff --git a/paddle/capi/examples/model_inference/multi_thread/main.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main.c
similarity index 100%
rename from paddle/capi/examples/model_inference/multi_thread/main.c
rename to paddle/legacy/capi/examples/model_inference/multi_thread/main.c
diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c
similarity index 100%
rename from paddle/capi/examples/model_inference/multi_thread/main_gpu.c
rename to paddle/legacy/capi/examples/model_inference/multi_thread/main_gpu.c
diff --git a/paddle/capi/examples/model_inference/multi_thread/trainer_config.py b/paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py
similarity index 100%
rename from paddle/capi/examples/model_inference/multi_thread/trainer_config.py
rename to paddle/legacy/capi/examples/model_inference/multi_thread/trainer_config.py
diff --git a/paddle/capi/examples/model_inference/sequence/.gitignore b/paddle/legacy/capi/examples/model_inference/sequence/.gitignore
similarity index 100%
rename from paddle/capi/examples/model_inference/sequence/.gitignore
rename to paddle/legacy/capi/examples/model_inference/sequence/.gitignore
diff --git a/paddle/capi/examples/model_inference/sequence/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt
similarity index 100%
rename from paddle/capi/examples/model_inference/sequence/CMakeLists.txt
rename to paddle/legacy/capi/examples/model_inference/sequence/CMakeLists.txt
diff --git a/paddle/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
similarity index 100%
rename from paddle/capi/examples/model_inference/sequence/convert_protobin.sh
rename to paddle/legacy/capi/examples/model_inference/sequence/convert_protobin.sh
diff --git a/paddle/capi/examples/model_inference/sequence/main.c b/paddle/legacy/capi/examples/model_inference/sequence/main.c
similarity index 100%
rename from paddle/capi/examples/model_inference/sequence/main.c
rename to paddle/legacy/capi/examples/model_inference/sequence/main.c
diff --git a/paddle/capi/examples/model_inference/sequence/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py
similarity index 100%
rename from paddle/capi/examples/model_inference/sequence/trainer_config.py
rename to paddle/legacy/capi/examples/model_inference/sequence/trainer_config.py
diff --git a/paddle/capi/examples/model_inference/sparse_binary/.gitignore b/paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore
similarity index 100%
rename from paddle/capi/examples/model_inference/sparse_binary/.gitignore
rename to paddle/legacy/capi/examples/model_inference/sparse_binary/.gitignore
diff --git a/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt b/paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt
similarity index 100%
rename from paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt
rename to paddle/legacy/capi/examples/model_inference/sparse_binary/CMakeLists.txt
diff --git a/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
similarity index 100%
rename from paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh
rename to paddle/legacy/capi/examples/model_inference/sparse_binary/convert_protobin.sh
diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/legacy/capi/examples/model_inference/sparse_binary/main.c
similarity index 100%
rename from paddle/capi/examples/model_inference/sparse_binary/main.c
rename to paddle/legacy/capi/examples/model_inference/sparse_binary/main.c
diff --git a/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py b/paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py
similarity index 100%
rename from paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
rename to paddle/legacy/capi/examples/model_inference/sparse_binary/trainer_config.py
diff --git a/paddle/legacy/capi/gradient_machine.cpp b/paddle/legacy/capi/gradient_machine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c5ddd856b5d374ae90d6c8ef898be52aa2e4e89
--- /dev/null
+++ b/paddle/legacy/capi/gradient_machine.cpp
@@ -0,0 +1,180 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gradient_machine.h"
+#include "capi_private.h"
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+
+#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v)
+
+enum GradientMatchineCreateMode {
+  CREATE_MODE_NORMAL = 0,
+  CREATE_MODE_TESTING = 4
+};
+
+namespace paddle {
+
+class MyNeuralNetwork : public NeuralNetwork {
+ public:
+  MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
+      : NeuralNetwork(name, network) {}
+};
+
+NeuralNetwork* newCustomNerualNetwork(const std::string& name,
+                                      NeuralNetwork* network) {
+  return new MyNeuralNetwork(name, network);
+}
+}  // namespace paddle
+
+extern "C" {
+paddle_error paddle_gradient_machine_create_for_inference(
+    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) {
+  if (modelConfigProtobuf == nullptr) return kPD_NULLPTR;
+  paddle::ModelConfig config;
+  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
+      !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+
+  auto ptr = new paddle::capi::CGradientMachine();
+  ptr->machine.reset(paddle::GradientMachine::create(
+      config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  *machine = ptr;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_create_for_inference_with_parameters(
+    paddle_gradient_machine* machine, void* mergedModel, uint64_t size) {
+  if (mergedModel == nullptr) return kPD_NULLPTR;
+  std::istringstream is(std::string(static_cast<char*>(mergedModel), size));
+  int64_t modelConfigSize = 0;
+  is.read((char*)(&modelConfigSize), sizeof(modelConfigSize));
+  std::string modelConfigProtobuf;
+  modelConfigProtobuf.resize(modelConfigSize);
+  is.read(&modelConfigProtobuf[0], modelConfigSize);
+  paddle::TrainerConfig config;
+  paddle::ModelConfig modelConfig;
+  if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) {
+    if (!modelConfig.ParseFromString(modelConfigProtobuf) ||
+        !modelConfig.IsInitialized()) {
+      return kPD_PROTOBUF_ERROR;
+    }
+  } else {
+    modelConfig = config.model_config();
+  }
+  auto ptr = new paddle::capi::CGradientMachine();
+  ptr->machine.reset(paddle::GradientMachine::create(
+      modelConfig, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters();
+  for (auto& para : parameters) {
+    para->load(is);
+  }
+
+  *machine = ptr;
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
+  delete cast(machine);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_load_parameter_from_disk(
+    paddle_gradient_machine machine, const char* path) {
+  auto m = cast(machine);
+  if (m == nullptr || path == nullptr || m->machine == nullptr)
+    return kPD_NULLPTR;
+  m->machine->loadParameters(path);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine,
+                                             paddle_arguments inArgs,
+                                             paddle_arguments outArgs,
+                                             bool isTrain) {
+  auto m = cast(machine);
+  auto in = paddle::capi::cast<paddle::capi::CArguments>(inArgs);
+  auto out = paddle::capi::cast<paddle::capi::CArguments>(outArgs);
+  if (m == nullptr || in == nullptr || out == nullptr || m->machine == nullptr)
+    return kPD_NULLPTR;
+  m->machine->forward(
+      in->args, &out->args, isTrain ? paddle::PASS_TRAIN : paddle::PASS_TEST);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_create_shared_param(
+    paddle_gradient_machine origin,
+    void* modelConfigProtobuf,
+    int size,
+    paddle_gradient_machine* slave) {
+  auto o = cast(origin);
+  if (origin == nullptr || slave == nullptr || o->machine == nullptr) {
+    return kPD_NULLPTR;
+  }
+  paddle::ModelConfig config;
+  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
+      !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+
+  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
+      new paddle::capi::CGradientMachine());
+  auto nn = paddle::NeuralNetwork::create(config);
+  nn->init(config,
+           [&o](int paramId, paddle::Parameter* param) {
+             auto p = o->machine->getParameters()[paramId];
+             param->enableSharedType(paddle::PARAMETER_VALUE,
+                                     p->getBuf(paddle::PARAMETER_VALUE));
+           },
+           {paddle::PARAMETER_VALUE},
+           false);
+  ptr->machine.reset(nn);
+  *slave = ptr.release();
+  return kPD_NO_ERROR;
+}
+}
+
+paddle_error paddle_gradient_machine_randomize_param(
+    paddle_gradient_machine machine) {
+  auto m = cast(machine);
+  if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR;
+  m->machine->randParameters();
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_get_layer_output(
+    paddle_gradient_machine machine,
+    const char* layerName,
+    paddle_arguments args) {
+  auto m = cast(machine);
+  auto out = paddle::capi::cast<paddle::capi::CArguments>(args);
+  if (m == nullptr || layerName == nullptr || out == nullptr ||
+      m->machine == nullptr) {
+    return kPD_NULLPTR;
+  }
+
+  auto layerOutput = m->machine->getLayerOutput(layerName);
+  out->args.push_back(layerOutput);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_release_layer_output(
+    paddle_gradient_machine machine) {
+  auto m = cast(machine);
+  if (m == nullptr || m->machine == nullptr) {
+    return kPD_NULLPTR;
+  }
+  m->machine->releaseOutput();
+  return kPD_NO_ERROR;
+}
diff --git a/paddle/capi/gradient_machine.h b/paddle/legacy/capi/gradient_machine.h
similarity index 100%
rename from paddle/capi/gradient_machine.h
rename to paddle/legacy/capi/gradient_machine.h
diff --git a/paddle/capi/main.h b/paddle/legacy/capi/main.h
similarity index 100%
rename from paddle/capi/main.h
rename to paddle/legacy/capi/main.h
diff --git a/paddle/capi/matrix.h b/paddle/legacy/capi/matrix.h
similarity index 100%
rename from paddle/capi/matrix.h
rename to paddle/legacy/capi/matrix.h
diff --git a/paddle/capi/paddle_capi.map b/paddle/legacy/capi/paddle_capi.map
similarity index 100%
rename from paddle/capi/paddle_capi.map
rename to paddle/legacy/capi/paddle_capi.map
diff --git a/paddle/capi/tests/.gitignore b/paddle/legacy/capi/tests/.gitignore
similarity index 100%
rename from paddle/capi/tests/.gitignore
rename to paddle/legacy/capi/tests/.gitignore
diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/legacy/capi/tests/CMakeLists.txt
similarity index 100%
rename from paddle/capi/tests/CMakeLists.txt
rename to paddle/legacy/capi/tests/CMakeLists.txt
diff --git a/paddle/legacy/capi/tests/test_Arguments.cpp b/paddle/legacy/capi/tests/test_Arguments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6fb379719dc0f3230c0801752720703ad185216f
--- /dev/null
+++ b/paddle/legacy/capi/tests/test_Arguments.cpp
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <functional>
+#include "capi.h"
+#include "gtest/gtest.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+static std::vector<paddle_real> randomBuffer(size_t bufSize) {
+  auto& eng = paddle::ThreadLocalRandomEngine::get();
+  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
+  std::vector<paddle_real> retv;
+  retv.reserve(bufSize);
+  for (size_t i = 0; i < bufSize; ++i) {
+    retv.push_back(dist(eng));
+  }
+  return retv;
+}
+
+TEST(CAPIArguments, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle_arguments args = paddle_arguments_create_none();
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(args, &size));
+  ASSERT_EQ(0UL, size);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, value) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+
+  paddle_matrix mat = paddle_matrix_create(128, 64, false);
+  for (size_t i = 0; i < 128; ++i) {
+    std::vector<paddle_real> sampleBuf = randomBuffer(64);
+    paddle_matrix_set_row(mat, i, sampleBuf.data());
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(args, 0, mat));
+
+  paddle_matrix val = paddle_matrix_create_none();
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(args, 0, val));
+
+  for (size_t i = 0; i < 128; ++i) {
+    paddle_real* row1;
+    paddle_real* row2;
+
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, i, &row1));
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(val, i, &row2));
+    ASSERT_EQ(row1, row2);
+  }
+
+  paddle_ivector ivec = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, ids) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+
+  paddle_ivector ivec;
+  int array[3] = {1, 2, 3};
+  ivec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_ids(args, 0, ivec));
+
+  paddle_ivector val = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_ids(args, 0, val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+template <typename T1, typename T2>
+void testSequenceHelper(T1 setter, T2 getter) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+
+  paddle_ivector ivec;
+  int array[3] = {1, 2, 3};
+  ivec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, setter(args, 0, ivec));
+
+  paddle_ivector val = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, getter(args, 0, val));
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size));
+
+  int* rawBuf;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get(val, &rawBuf));
+  for (size_t i = 0; i < size; ++i) {
+    ASSERT_EQ(array[i], rawBuf[i]);
+  }
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, Sequence) {
+  auto testSequence = [](uint32_t nestedLevel) {
+    testSequenceHelper(std::bind(paddle_arguments_set_sequence_start_pos,
+                                 std::placeholders::_1,
+                                 std::placeholders::_2,
+                                 nestedLevel,
+                                 std::placeholders::_3),
+                       std::bind(paddle_arguments_get_sequence_start_pos,
+                                 std::placeholders::_1,
+                                 std::placeholders::_2,
+                                 nestedLevel,
+                                 std::placeholders::_3));
+  };
+  for (uint32_t i = 0; i < 2; ++i) {  // test seq and sub-seq.
+    testSequence(i);
+  }
+}
diff --git a/paddle/legacy/capi/tests/test_GradientMachine.cpp b/paddle/legacy/capi/tests/test_GradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5d1b7cb6ca4073c0a489366e415f8f74d3c19bec
--- /dev/null
+++ b/paddle/legacy/capi/tests/test_GradientMachine.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/legacy/trainer/TrainerConfigHelper.h>
+#include <stdlib.h>
+#include <string.h>
+#include <type_traits>
+#include "capi.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+static std::vector<paddle_real> randomBuffer(size_t bufSize) {
+  auto& eng = paddle::ThreadLocalRandomEngine::get();
+  std::uniform_real_distribution<paddle_real> dist(-1.0, 1.0);
+  std::vector<paddle_real> retv;
+  retv.reserve(bufSize);
+  for (size_t i = 0; i < bufSize; ++i) {
+    retv.push_back(dist(eng));
+  }
+  return retv;
+}
+
+TEST(GradientMachine, testPredict) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle::TrainerConfigHelper config("./test_predict_network.py");
+  std::string buffer;
+  ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer));
+  paddle_gradient_machine machine;
+
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_for_inference(
+                &machine, &buffer[0], (int)buffer.size()));
+  std::unique_ptr<paddle::GradientMachine> gm(
+      paddle::GradientMachine::create(config.getModelConfig()));
+  ASSERT_NE(nullptr, gm);
+  gm->randParameters();
+  gm->saveParameters("./");
+
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_load_parameter_from_disk(machine, "./"));
+
+  paddle_gradient_machine machineSlave;
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_shared_param(
+                machine, &buffer[0], (int)buffer.size(), &machineSlave));
+  std::swap(machineSlave, machine);
+  paddle_arguments outArgs = paddle_arguments_create_none();
+
+  paddle_arguments inArgs = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1));
+  paddle_matrix mat = paddle_matrix_create(1, 100, false);
+  static_assert(std::is_same<paddle_real, paddle::real>::value, "");
+
+  auto data = randomBuffer(100);
+  paddle_real* rowPtr;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real));
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat));
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_forward(machine, inArgs, outArgs, false));
+
+  uint64_t sz;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz));
+  ASSERT_EQ(1UL, sz);
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat));
+  std::vector<paddle::Argument> paddleInArgs;
+  std::vector<paddle::Argument> paddleOutArgs;
+  paddleInArgs.resize(1);
+  paddleInArgs[0].value =
+      paddle::Matrix::create(data.data(), 1, 100, false, false);
+
+  gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST);
+
+  auto matPaddle = paddleOutArgs[0].value;
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(matPaddle->getHeight(), height);
+  ASSERT_EQ(matPaddle->getWidth(), width);
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  for (size_t i = 0; i < width; ++i) {
+    ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5);
+  }
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs));
+  std::swap(machineSlave, machine);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine));
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  std::vector<char*> argvs;
+  argvs.push_back(strdup("--use_gpu=false"));
+  paddle_init((int)argvs.size(), argvs.data());
+  for (auto each : argvs) {
+    free(each);
+  }
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/legacy/capi/tests/test_Matrix.cpp
similarity index 100%
rename from paddle/capi/tests/test_Matrix.cpp
rename to paddle/legacy/capi/tests/test_Matrix.cpp
diff --git a/paddle/capi/tests/test_Vector.cpp b/paddle/legacy/capi/tests/test_Vector.cpp
similarity index 100%
rename from paddle/capi/tests/test_Vector.cpp
rename to paddle/legacy/capi/tests/test_Vector.cpp
diff --git a/paddle/capi/tests/test_predict_network.py b/paddle/legacy/capi/tests/test_predict_network.py
similarity index 100%
rename from paddle/capi/tests/test_predict_network.py
rename to paddle/legacy/capi/tests/test_predict_network.py
diff --git a/paddle/capi/vector.h b/paddle/legacy/capi/vector.h
similarity index 100%
rename from paddle/capi/vector.h
rename to paddle/legacy/capi/vector.h
diff --git a/paddle/legacy/cuda/CMakeLists.txt b/paddle/legacy/cuda/CMakeLists.txt
new file mode 100755
index 0000000000000000000000000000000000000000..9bbb8de78e09829d24faf42c360811084981578f
--- /dev/null
+++ b/paddle/legacy/cuda/CMakeLists.txt
@@ -0,0 +1,89 @@
+set(AVX_SOURCES
+    src/hl_math.cc
+    src/hl_avx_functions.cc
+)
+
+if(WITH_AVX)
+    set(CUDA_SOURCES
+        src/hl_time.cc
+        src/hl_cpu_functions.cc
+        ${AVX_SOURCES})
+else()
+    set(CUDA_SOURCES
+        src/hl_time.cc
+        src/hl_cpu_functions.cc)
+endif()
+
+set(CUDA_CXX_WITH_GPU_SOURCES
+    src/hl_cuda_cublas.cc
+    src/hl_cuda_cudnn.cc
+    src/hl_cuda_device.cc)
+
+if(WITH_GPU)
+    set(CUDA_CXX_SOURCES
+        src/hl_warpctc_wrap.cc
+        ${CUDA_CXX_WITH_GPU_SOURCES})
+
+    set_source_files_properties(${CUDA_CXX_SOURCES}
+                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
+else()
+    if (NOT MOBILE_INFERENCE)
+    set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
+    endif()
+endif()
+
+set(CUDA_CU_SOURCES
+    src/hl_perturbation_util.cu
+    src/hl_cuda_aggregate.cu
+    src/hl_cuda_matrix.cu
+    src/hl_cuda_sparse.cu
+    src/hl_cuda_cnn.cu
+    src/hl_cuda_lstm.cu
+    src/hl_top_k.cu
+    src/hl_batch_transpose.cu
+    src/hl_batch_norm.cu
+    src/hl_cuda_sequence.cu
+    src/hl_table_apply.cu)
+
+set(CUDA_HEADERS
+    include/hl_time.h
+    include/hl_warpctc_wrap.h
+    include/hl_sequence.h
+    include/hl_cuda_cublas.h
+    include/hl_batch_transpose.h
+    include/hl_avx_functions.h
+    include/hl_sparse.h
+    include/hl_functions.h
+    include/hl_cuda_cudnn.h
+    include/hl_activation_functions.h
+    include/hl_base.h
+    include/stub/hl_cuda_cudnn_stub.h
+    include/stub/hl_cuda_stub.h
+    include/stub/hl_cuda_cublas_stub.h
+    include/stub/hl_cnn_stub.h
+    include/stub/hl_lstm_stub.h
+    include/stub/hl_sequence_stub.h
+    include/stub/hl_aggregate_stub.h
+    include/stub/hl_sparse_stub.h
+    include/stub/hl_matrix_stub.h
+    include/hl_aggregate.h
+    include/hl_cuda.h
+    include/hl_lstm.h
+    include/hl_table_apply.h
+    include/hl_gpu.h
+    include/hl_top_k.h
+    include/hl_matrix.h
+    include/hl_cnn.h)
+
+if(WITH_GPU)
+    cuda_add_library(paddle_cuda
+        ${CUDA_SOURCES}
+        ${CUDA_CU_SOURCES}
+        ${CUDA_CXX_SOURCES})
+else()
+    add_library(paddle_cuda
+                ${CUDA_SOURCES}
+                ${CUDA_CXX_SOURCES})
+endif()
+
+add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
diff --git a/paddle/legacy/cuda/include/hl_activation_functions.h b/paddle/legacy/cuda/include/hl_activation_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..66a69db545b541409f895820ad621a2a9a684e20
--- /dev/null
+++ b/paddle/legacy/cuda/include/hl_activation_functions.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_ACTIVATION_FUNCTIONS_H_
+#define HL_ACTIVATION_FUNCTIONS_H_
+
+#include "hl_functions.h"
+
+/**
+ * Active functions: sigmoid, relu, tanh and linear.
+ */
+#define HPPL_ACTIVE_FUNCTION \
+  { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
+
+namespace hppl {
+
+/**
+ * Hppl supports sigmoid, relu, tanh, linear active functions
+ * for neural networks' forward and backward activation.
+ */
+template <class T>
+class Active {
+ public:
+  typedef T (*forward)(T);
+  typedef T (*backward)(T, T);
+};
+
+#ifdef __NVCC__
+namespace gpu {
+static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
+}  // namespace gpu
+#else
+namespace cpu {
+static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
+}  // namespace cpu
+
+#ifdef __AVX__
+namespace avx {
+static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
+static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
+}  // namespace avx
+#endif
+#endif
+
+}  // namespace hppl
+
+#endif  // HL_ACTIVATION_FUNCTIONS_H_
diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/legacy/cuda/include/hl_aggregate.h
similarity index 100%
rename from paddle/cuda/include/hl_aggregate.h
rename to paddle/legacy/cuda/include/hl_aggregate.h
diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/legacy/cuda/include/hl_avx_functions.h
similarity index 100%
rename from paddle/cuda/include/hl_avx_functions.h
rename to paddle/legacy/cuda/include/hl_avx_functions.h
diff --git a/paddle/legacy/cuda/include/hl_base.h b/paddle/legacy/cuda/include/hl_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..bfe812a4387be72c3e73d6b45852e3a90b1926eb
--- /dev/null
+++ b/paddle/legacy/cuda/include/hl_base.h
@@ -0,0 +1,250 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+
+#ifdef PADDLE_TYPE_DOUBLE
+#define HL_FLOAT_MAX 3.40282347e+38F
+#define HL_FLOAT_MIN 1.17549435e-38F
+using real = double;
+#else
+#define HL_FLOAT_MAX 1.7976931348623157e+308
+#define HL_FLOAT_MIN 2.2250738585072014e-308
+using real = float;
+#endif
+
+/**
+ * The maximum input value for exp, used to avoid overflow problem.
+ * currently only used for tanh function.
+ */
+#define EXP_MAX_INPUT 40.0
+
+/**
+ * @brief DIVUP(x, y) is similar to ceil(x / y).
+ * @note  For CUDA, DIVUP will be used to specify
+ *        the size of blockDim.
+ */
+#ifndef DIVUP
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
+#endif
+
+/**
+ * HPPL is an internal high performance parallel computing library
+ * for high-level neural network routines, which can support many
+ * heterogeneous compute architectures, such as GPU, FPGA, etc.
+ */
+
+/**
+ * @brief   HPPL CUDA Stream.
+ *
+ * @note    Each thread can use HPPL_STREAM_* after calling hl_init.
+ *          HPPL_STREAM_DEFAULT is HPPL default stream.
+ */
+typedef enum {
+  HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
+  HPPL_STREAM_1 = 1,
+  HPPL_STREAM_2 = 2,
+  HPPL_STREAM_3 = 3,
+  HPPL_STREAM_4 = 4,
+  HPPL_THREAD_STREAM_1 = 5,
+  HPPL_THREAD_STREAM_2 = 6,
+  HPPL_THREAD_STREAM_3 = 7,
+  HPPL_THREAD_STREAM_4 = 8,
+  HPPL_STREAM_END
+} hl_stream_t;
+
+/**
+ * @brief HPPL activation mode.
+ */
+typedef enum {
+  HL_ACTIVATION_SIGMOID = 0,
+  HL_ACTIVATION_RELU = 1,
+  HL_ACTIVATION_TANH = 2,
+  HL_ACTIVATION_LINEAR = 3,
+  HL_ACTIVATION_END
+} hl_activation_mode_t;
+
+/**
+ * @brief Transpose type.
+ */
+typedef enum {
+  HPPL_OP_N = 0, /* transpose */
+  HPPL_OP_T = 1, /* non transpose */
+  HPPL_OP_END
+} hl_trans_op_t;
+
+/**
+ * @brief Lstm value.
+ *
+ * @param  gateValue         input value.
+ * @param  prevStateValue    previous state value.
+ * @param  stateValue        state value.
+ * @param  stateActiveValue  state active value.
+ * @param  outputValue       output value.
+ */
+typedef struct {
+  real *gateValue;
+  real *prevStateValue;
+  real *stateValue;
+  real *stateActiveValue;
+  real *outputValue;
+  real *checkIg;
+  real *checkFg;
+  real *checkOg;
+} hl_lstm_value;
+
+/**
+ * @brief Lstm gradient.
+ *
+ * @param  gateGrad          input gradient.
+ * @param  prevStateGrad     previous state gradient.
+ * @param  stateGrad         state gradient.
+ * @param  stateActiveGrad   state active gradient.
+ * @param  outputGrad        output gradient.
+ */
+typedef struct {
+  real *gateGrad;
+  real *prevStateGrad;
+  real *stateGrad;
+  real *stateActiveGrad;
+  real *outputGrad;
+  real *checkIgGrad;
+  real *checkFgGrad;
+  real *checkOgGrad;
+} hl_lstm_grad;
+
+/**
+ * @brief Gru value.
+ *
+ * @param  gateWeight           gate weight (updateGate + resetGate).
+ * @param  stateWeight          frame state weight.
+ * @param  gateValue            gate value results.
+ * @param  resetOutputValue     resetOutput value.
+ * @param  outputValue          output value.
+ * @param  prevOutValue         previous output value.
+ *
+ */
+typedef struct {
+  real *gateWeight;
+  real *stateWeight;
+  real *gateValue;
+  real *resetOutputValue;
+  real *outputValue;
+  real *prevOutValue;
+} hl_gru_value;
+
+/**
+ * @brief Gru gradient.
+ *
+ * @param  gateWeightGrad       gate weight gradient.
+ * @param  stateWeightGrad      frame state weight gradient.
+ * @param  gateGrad             gate gradient results.
+ * @param  resetOutputGrad      resetOutput gradient.
+ * @param  outputGrad           output gradient.
+ * @param  prevOutGrad          previous output gradient.
+ */
+typedef struct {
+  real *gateWeightGrad;
+  real *stateWeightGrad;
+  real *gateGrad;
+  real *resetOutputGrad;
+  real *outputGrad;
+  real *prevOutGrad;
+} hl_gru_grad;
+
+/**
+ * @brief  Sparse matrix value type.
+ */
+typedef enum {
+  HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
+  HL_FLOAT_VALUE = 1,
+  HL_VALUE_END
+} hl_matrix_value_t;
+
+/**
+ * @brief  HPPL matrix format.
+ */
+typedef enum {
+  HL_SPARSE_CSR = 0,
+  HL_SPARSE_CSC = 1,
+  HL_SPARSE_END
+} hl_matrix_format_t;
+
+typedef struct _hl_matrix_s *hl_matrix_s;
+
+/**
+ * @brief   HPPL sparse matrix.
+ *
+ * @param  matrix     sparse matrix.
+ * @param  format     matrix format.
+ * @param  type       the type of matrix values.
+ * @param  rows       matrix rows.
+ * @param  cols       matrix columns.
+ * @param  nnz        nonzero values of sparse matrix.
+ */
+typedef struct {
+  hl_matrix_s matrix;
+  hl_matrix_format_t format;
+  hl_matrix_value_t type;
+  int rows;
+  int cols;
+  size_t nnz;
+} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
+
+#ifdef __NVCC__
+
+#include <cuda_runtime.h>
+#include "paddle/legacy/cuda/include/hl_cuda.h"
+#include "paddle/legacy/utils/Logging.h"
+
+extern __thread bool g_sync_flag;
+extern __thread cudaStream_t default_stream;
+#define STREAM_DEFAULT default_stream
+
+/**
+ * @brief   Check cuda kernel execution.
+ * @param   msg   error string
+ */
+#define CHECK_SYNC(msg)                                               \
+  if (true == g_sync_flag) {                                          \
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);                       \
+    cudaError_t err = (cudaError_t)hl_get_device_last_error();        \
+    CHECK_EQ(cudaSuccess, err)                                        \
+        << "[" << msg << "] "                                         \
+        << "CUDA error: " << hl_get_device_error_string((size_t)err); \
+  }
+
+// __shfl has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
+  return __shfl_down(val, delta);
+}
+
+template <typename T>
+__forceinline__ __device__ T
+__shfl_sync(unsigned, T val, int src_line, int width) {
+  return __shfl(val, src_line, width);
+}
+
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+#endif  // __NVCC__
diff --git a/paddle/cuda/include/hl_batch_norm.h b/paddle/legacy/cuda/include/hl_batch_norm.h
similarity index 100%
rename from paddle/cuda/include/hl_batch_norm.h
rename to paddle/legacy/cuda/include/hl_batch_norm.h
diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/legacy/cuda/include/hl_batch_transpose.h
similarity index 100%
rename from paddle/cuda/include/hl_batch_transpose.h
rename to paddle/legacy/cuda/include/hl_batch_transpose.h
diff --git a/paddle/legacy/cuda/include/hl_cnn.h b/paddle/legacy/cuda/include/hl_cnn.h
new file mode 100644
index 0000000000000000000000000000000000000000..b790fa39fe863bbb00f6cd36d4c63481b7634fe1
--- /dev/null
+++ b/paddle/legacy/cuda/include/hl_cnn.h
@@ -0,0 +1,417 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_CNN_H_
+#define HL_CNN_H_
+
+#include "hl_base.h"
+
+/**
+ * @brief   Maximum pool forward with Mask output.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   inputData   input data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[out]  tgtData     output data.
+ * @param[in]   tgtStride   stride between output data samples.
+ * @param[out]  maskData    the location indices of select max data.
+ */
+extern void hl_maxpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride,
+                               real* maskData = NULL);
+
+/**
+ * @brief   Maximum pool backward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   inputData   input data.
+ * @param[out]  outData     output data.
+ * @param[out]  outGrad     output grad data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   scaleA      scale.
+ * @param[in]   scaleB      scale.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[out]  targetGrad  output grad.
+ * @param[in]   outStride   stride between output data samples.
+ *
+ */
+extern void hl_maxpool_backward(const int frameCnt,
+                                const real* inputData,
+                                const real* outData,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                const int paddingH,
+                                const int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* targetGrad,
+                                const int outStride);
+
+/**
+ * @brief   Averge pool forward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   inputData   input data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[out]  tgtData     output data.
+ * @param[in]   tgtStride   stride between output data samples.
+ * @param[in]   excludeMode whether to consider paddings for size.
+ *
+ */
+extern void hl_avgpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride,
+                               bool excludeMode);
+
+/**
+ * @brief   Maximum pool backward.
+ *
+ * @param[in]   frameCnt    batch size of input image.
+ * @param[in]   outGrad     output grad data.
+ * @param[in]   channels    number of channel.
+ * @param[in]   height      image height.
+ * @param[in]   width       image width.
+ * @param[in]   pooledH     output image height.
+ * @param[in]   pooledW     output image width.
+ * @param[in]   sizeX       width of pooling window.
+ * @param[in]   sizeY       height of pooling window.
+ * @param[in]   strideH     pooling stride height.
+ * @param[in]   strideW     pooling stride width.
+ * @param[in]   paddingH    padding height.
+ * @param[in]   paddingW    padding width.
+ * @param[in]   scaleA      scale.
+ * @param[in]   scaleB      scale.
+ * @param[out]  backGrad    output grad.
+ * @param[in]   outStride   stride between output data samples.
+ * @param[in]   excludeMode whether to consider paddings for size.
+ *
+ */
+extern void hl_avgpool_backward(const int frameCnt,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                int paddingH,
+                                int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* backGrad,
+                                const int outStride,
+                                bool excludeMode);
+
+extern void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride);
+
+extern void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride);
+
+extern void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+
+extern void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  int paddingD,
+                                  int paddingH,
+                                  int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride);
+
+/**
+ * @brief   Bilinear interpolation forward.
+ *
+ * @param[in]   inData      input value.
+ * @param[in]   inImgH      input image height.
+ * @param[in]   inImgW      input image width.
+ * @param[in]   inputH      input batchSize.
+ * @param[in]   inputW      input image data dim.
+ * @param[out]  outData     output value.
+ * @param[in]   outImgH     output image height.
+ * @param[in]   outImgW     output image width.
+ * @param[in]   outputH     output batchSize.
+ * @param[in]   outputW     output image data dim.
+ * @param[in]   numChannels number of channels.
+ * @param[in]   ratioH      inImgH / outImgH.
+ * @param[in]   ratioW      inImgW / outImgW.
+ *
+ */
+extern void hl_bilinear_forward(const real* inData,
+                                const size_t inImgH,
+                                const size_t inImgW,
+                                const size_t inputH,
+                                const size_t inputW,
+                                real* outData,
+                                const size_t outImgH,
+                                const size_t outImgW,
+                                const size_t outputH,
+                                const size_t outputW,
+                                const size_t numChannels,
+                                const real ratioH,
+                                const real ratioW);
+
+/**
+ * @brief   Bilinear interpolation backward.
+ *
+ * @param[out]  inGrad      input gradient.
+ * @param[in]   inImgH      input image height.
+ * @param[in]   inImgW      input image width.
+ * @param[in]   inputH      input batchSize.
+ * @param[in]   inputW      input image data dim.
+ * @param[in]   outGrad     output gradient.
+ * @param[in]   outImgH     output image height.
+ * @param[in]   outImgW     output image width.
+ * @param[in]   outputH     output batchSize.
+ * @param[in]   outputW     output image data dim.
+ * @param[in]   numChannels number of channels.
+ * @param[in]   ratioH      inImgH / outImgH.
+ * @param[in]   ratioW      inImgW / outImgW.
+ *
+ */
+extern void hl_bilinear_backward(real* inGrad,
+                                 const size_t inImgH,
+                                 const size_t inImgW,
+                                 const size_t inputH,
+                                 const size_t inputW,
+                                 const real* outGrad,
+                                 const size_t outImgH,
+                                 const size_t outImgW,
+                                 const size_t outputH,
+                                 const size_t outputW,
+                                 const size_t numChannels,
+                                 const real ratioH,
+                                 const real ratioW);
+
+/**
+ * @brief   MaxOut forward.
+ *
+ * @param[in]   inData      input data.
+ * @param[out]  outData     output data.
+ * @param[out]  idData      output maxId.
+ * @param[in]   batchSize   batchSize.
+ * @param[in]   size        number of channels * image height * image width.
+ * @param[in]   featLen     feature length = image height * image width.
+ * @param[in]   groups      number of groups.
+ */
+extern void hl_maxout_forward(const real* inData,
+                              real* outData,
+                              int* idData,
+                              size_t batchSize,
+                              size_t size,
+                              size_t featLen,
+                              size_t groups);
+
+/**
+ * @brief   MaxOut backward.
+ *
+ * @param[out]  inGrad      input grad data.
+ * @param[in]   outGrad     output grad data.
+ * @param[in]   idData      output maxId.
+ * @param[in]   batchSize   batchSize.
+ * @param[in]   size        number of channels * image height * image width.
+ * @param[in]   featLen     feature length = image height * image width.
+ * @param[in]   groups      number of groups.
+ */
+extern void hl_maxout_backward(real* inGrad,
+                               const real* outGrad,
+                               const int* idData,
+                               size_t batchSize,
+                               size_t size,
+                               size_t featLen,
+                               size_t groups);
+
+/**
+ * @brief   Upsample forward.
+ * @param[in]   inputData   input data.
+ * @param[out]  maskData    the mask data from MaxPoolWithMaskLayer.
+ * @param[out]  batchSize   the batch size of the input.
+ * @param[in]   imgSizeH    image height.
+ * @param[in]   imgSizeW    image width.
+ * @param[in]   channels    the input channels.
+ * @param[in]   outputH     the output height.
+ * @param[in]   outputW     the output widht.
+ * @param[out]  outputData  output data.
+ */
+extern void hl_upsample_forward(real* inputData,
+                                real* maskData,
+                                size_t batchSize,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t outputH,
+                                size_t outputW,
+                                real* outputData);
+
+/**
+ * @brief   Upsample backward.
+ * @param[in]   outputGradData  the output grad data.
+ * @param[out]  maskData    the mask data from MaxPoolWithMaskLayer.
+ * @param[out]  batchSize       the batch size of the input.
+ * @param[in]   imgSizeH        image height.
+ * @param[in]   imgSizeW        image width.
+ * @param[in]   channels        the input channels.
+ * @param[in]   outputH         the output height.
+ * @param[in]   outputW         the output widht.
+ * @param[out]  inputGradData   the input grad data.
+ */
+extern void hl_upsample_backward(real* outputGradData,
+                                 real* maskData,
+                                 size_t batchSize,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real* inputGradData);
+
+#endif  // HL_CNN_H_
diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/legacy/cuda/include/hl_cpu_gru.cuh
similarity index 100%
rename from paddle/cuda/include/hl_cpu_gru.cuh
rename to paddle/legacy/cuda/include/hl_cpu_gru.cuh
diff --git a/paddle/cuda/include/hl_cpu_lstm.cuh b/paddle/legacy/cuda/include/hl_cpu_lstm.cuh
similarity index 100%
rename from paddle/cuda/include/hl_cpu_lstm.cuh
rename to paddle/legacy/cuda/include/hl_cpu_lstm.cuh
diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh
similarity index 100%
rename from paddle/cuda/include/hl_cpu_matrix_kernel.cuh
rename to paddle/legacy/cuda/include/hl_cpu_matrix_kernel.cuh
diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh b/paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh
similarity index 100%
rename from paddle/cuda/include/hl_cpu_matrix_kernel_detail.cuh
rename to paddle/legacy/cuda/include/hl_cpu_matrix_kernel_detail.cuh
diff --git a/paddle/cuda/include/hl_cpu_scalar.cuh b/paddle/legacy/cuda/include/hl_cpu_scalar.cuh
similarity index 100%
rename from paddle/cuda/include/hl_cpu_scalar.cuh
rename to paddle/legacy/cuda/include/hl_cpu_scalar.cuh
diff --git a/paddle/cuda/include/hl_cpu_simd_neon.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh
similarity index 100%
rename from paddle/cuda/include/hl_cpu_simd_neon.cuh
rename to paddle/legacy/cuda/include/hl_cpu_simd_neon.cuh
diff --git a/paddle/cuda/include/hl_cpu_simd_sse.cuh b/paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh
similarity index 100%
rename from paddle/cuda/include/hl_cpu_simd_sse.cuh
rename to paddle/legacy/cuda/include/hl_cpu_simd_sse.cuh
diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/legacy/cuda/include/hl_cuda.h
similarity index 100%
rename from paddle/cuda/include/hl_cuda.h
rename to paddle/legacy/cuda/include/hl_cuda.h
diff --git a/paddle/cuda/include/hl_cuda.ph b/paddle/legacy/cuda/include/hl_cuda.ph
similarity index 100%
rename from paddle/cuda/include/hl_cuda.ph
rename to paddle/legacy/cuda/include/hl_cuda.ph
diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/legacy/cuda/include/hl_cuda_cublas.h
similarity index 100%
rename from paddle/cuda/include/hl_cuda_cublas.h
rename to paddle/legacy/cuda/include/hl_cuda_cublas.h
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/legacy/cuda/include/hl_cuda_cudnn.h
similarity index 100%
rename from paddle/cuda/include/hl_cuda_cudnn.h
rename to paddle/legacy/cuda/include/hl_cuda_cudnn.h
diff --git a/paddle/cuda/include/hl_cuda_cudnn.ph b/paddle/legacy/cuda/include/hl_cuda_cudnn.ph
similarity index 100%
rename from paddle/cuda/include/hl_cuda_cudnn.ph
rename to paddle/legacy/cuda/include/hl_cuda_cudnn.ph
diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/legacy/cuda/include/hl_device_functions.cuh
similarity index 100%
rename from paddle/cuda/include/hl_device_functions.cuh
rename to paddle/legacy/cuda/include/hl_device_functions.cuh
diff --git a/paddle/cuda/include/hl_functions.h b/paddle/legacy/cuda/include/hl_functions.h
similarity index 100%
rename from paddle/cuda/include/hl_functions.h
rename to paddle/legacy/cuda/include/hl_functions.h
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/legacy/cuda/include/hl_gpu.h
similarity index 100%
rename from paddle/cuda/include/hl_gpu.h
rename to paddle/legacy/cuda/include/hl_gpu.h
diff --git a/paddle/cuda/include/hl_gpu_functions.cuh b/paddle/legacy/cuda/include/hl_gpu_functions.cuh
similarity index 100%
rename from paddle/cuda/include/hl_gpu_functions.cuh
rename to paddle/legacy/cuda/include/hl_gpu_functions.cuh
diff --git a/paddle/legacy/cuda/include/hl_gpu_gru.cuh b/paddle/legacy/cuda/include/hl_gpu_gru.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..8d299572c73e879a3a1e9fb60608c4f3abd1f685
--- /dev/null
+++ b/paddle/legacy/cuda/include/hl_gpu_gru.cuh
@@ -0,0 +1,393 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+#ifndef HL_GPU_GRU_CUH_
+#define HL_GPU_GRU_CUH_
+
+#ifdef __NVCC__
+
+#include "paddle/legacy/utils/Logging.h"
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template<class OpResetOutput, bool isBatch>
+__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
+                                        real *gateValue,
+                                        real *resetOutputValue,
+                                        real *prevOutputValue,
+                                        int frameSize,
+                                        int batchSize,
+                                        hl_activation_mode_t active_gate) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    resetOutputValue += batchIdx * frameSize;
+  }
+
+  real rPrevOut = 0;
+  real rValueResetOutput;
+  real rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  real rValueResetGate  = gateValue[frameIdx + frameSize * 1];
+
+  if (prevOutputValue) {
+    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    rPrevOut = prevOutputValue[frameIdx];
+  }
+
+  opResetOutput(rValueUpdateGate,
+                rValueResetGate,
+                rPrevOut,
+                rValueResetOutput,
+                hppl::gpu::forward[active_gate]);
+
+  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
+  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
+  resetOutputValue[frameIdx] = rValueResetOutput;
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template<class OpFinalOutput, bool isBatch>
+__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
+                                        real *gateValue,
+                                        real *prevOutputValue,
+                                        real *outputValue,
+                                        int frameSize,
+                                        int batchSize,
+                                        hl_activation_mode_t active_node) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    outputValue += batchIdx * frameSize;
+  }
+
+  real rOutput;
+  real rPrevOut = 0;
+  real rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  real rValueFrameState = gateValue[frameIdx + frameSize * 2];
+
+  if (prevOutputValue) {
+    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    rPrevOut = prevOutputValue[frameIdx];
+  }
+
+  opFinalOutput(rValueUpdateGate,
+                rValueFrameState,
+                rPrevOut,
+                rOutput,
+                hppl::gpu::forward[active_node]);
+
+  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
+  outputValue[frameIdx] = rOutput;
+}
+
+template<class OpResetOutput, class OpFinalOutput>
+void hl_gpu_gru_forward(OpResetOutput opResetOutput,
+                        OpFinalOutput opFinalOutput,
+                        hl_gru_value value,
+                        int frameSize,
+                        int batchSize,
+                        hl_activation_mode_t active_node,
+                        hl_activation_mode_t active_gate) {
+  dim3 threads;
+  dim3 grid;
+  if (batchSize == 1) {
+    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+    int frameBlocks = (frameSize + 1024 - 1) / 1024;
+    threads = dim3(framePerBlock, 1);
+    grid = dim3(frameBlocks, 1);
+  } else {
+    threads = dim3(32, 32);
+    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+  }
+
+  if (value.prevOutValue) {
+    hl_matrix_mul(value.prevOutValue, HPPL_OP_N,
+                  value.gateWeight, HPPL_OP_N,
+                  value.gateValue,
+                  batchSize, 2*frameSize, frameSize,
+                  /*alpha = */ 1, /*beta = */ 1,
+                  frameSize, 2* frameSize, 3*frameSize);
+  }
+
+  if (batchSize == 1) {
+    KeGruForwardResetOutput<OpResetOutput, /* isBatch= */false>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetOutput,
+        value.gateValue, value.resetOutputValue, value.prevOutValue,
+        frameSize, batchSize, active_gate);
+  } else {
+    KeGruForwardResetOutput<OpResetOutput, /* isBatch= */true>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetOutput,
+        value.gateValue, value.resetOutputValue, value.prevOutValue,
+        frameSize, batchSize, active_gate);
+  }
+
+  if (value.prevOutValue) {
+    hl_matrix_mul(value.resetOutputValue, HPPL_OP_N,
+                  value.stateWeight, HPPL_OP_N,
+                  value.gateValue + 2*frameSize,
+                  batchSize, frameSize, frameSize,
+                  /*alpha = */ 1, /*beta = */ 1,
+                  frameSize, frameSize, 3*frameSize);
+  }
+
+  if (batchSize == 1) {
+    KeGruForwardFinalOutput<OpFinalOutput, /* isBatch= */false>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(opFinalOutput,
+        value.gateValue, value.prevOutValue, value.outputValue,
+        frameSize, batchSize, active_node);
+  } else {
+    KeGruForwardFinalOutput<OpFinalOutput, /* isBatch= */true>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(opFinalOutput,
+        value.gateValue, value.prevOutValue, value.outputValue,
+        frameSize, batchSize, active_node);
+  }
+
+  CHECK_SYNC("hl_gpu_gru_forward failed");
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template<class OpStateGrad, bool isBatch>
+__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad,
+                                       real *gateValue,
+                                       real *gateGrad,
+                                       real *prevOutValue,
+                                       real *prevOutGrad,
+                                       real *outputGrad,
+                                       int frameSize,
+                                       int batchSize,
+                                       hl_activation_mode_t active_node) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    gateGrad  += batchIdx * 3 * frameSize;
+    outputGrad += batchIdx * frameSize;
+  }
+
+  real rUpdateGateGrad;
+  real rFrameStateGrad;
+  real rPrevOutValue = 0;
+  real rPrevOutGrad  = 0;
+  real rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  real rFrameStateValue = gateValue[frameIdx + frameSize * 2];
+  real rOutGrad  = outputGrad[frameIdx];
+
+  if (prevOutValue && prevOutGrad) {
+    if (isBatch) prevOutValue += batchIdx * frameSize;
+    rPrevOutValue = prevOutValue[frameIdx];
+
+    if (isBatch) prevOutGrad  += batchIdx * frameSize;
+    rPrevOutGrad  = prevOutGrad[frameIdx];
+  }
+
+  opStateGrad(rUpdateGateValue,
+              rUpdateGateGrad,
+              rFrameStateValue,
+              rFrameStateGrad,
+              rPrevOutValue,
+              rPrevOutGrad,
+              rOutGrad,
+              hppl::gpu::backward[active_node]);
+
+  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
+  if (prevOutGrad) {
+    prevOutGrad[frameIdx] = rPrevOutGrad;
+  }
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template<class OpResetGrad, bool isBatch>
+__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad,
+                                       real *gateValue,
+                                       real *gateGrad,
+                                       real *prevOutValue,
+                                       real *prevOutGrad,
+                                       real *resetOutputGrad,
+                                       int frameSize,
+                                       int batchSize,
+                                       hl_activation_mode_t active_gate) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    gateValue += batchIdx * 3 * frameSize;
+    gateGrad  += batchIdx * 3 * frameSize;
+    resetOutputGrad += batchIdx * frameSize;
+  }
+
+  real rResetGateGrad;
+  real rPrevOutValue = 0;
+  real rPrevOutGrad  = 0;
+  real rResetOutputGrad = 0;
+  real rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  real rUpdateGateGrad  = gateGrad[frameIdx + frameSize * 0];
+  real rResetGateValue  = gateValue[frameIdx + frameSize * 1];
+
+  if (prevOutValue && prevOutGrad) {
+    if (isBatch) prevOutValue += batchIdx * frameSize;
+    if (isBatch) prevOutGrad  += batchIdx * frameSize;
+    rPrevOutValue = prevOutValue[frameIdx];
+    rPrevOutGrad  = prevOutGrad[frameIdx];
+    rResetOutputGrad = resetOutputGrad[frameIdx];
+  }
+
+  opResetGrad(rUpdateGateValue,
+              rUpdateGateGrad,
+              rResetGateValue,
+              rResetGateGrad,
+              rPrevOutValue,
+              rPrevOutGrad,
+              rResetOutputGrad,
+              hppl::gpu::backward[active_gate]);
+
+  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
+  if (prevOutGrad) {
+    prevOutGrad[frameIdx] = rPrevOutGrad;
+  }
+}
+
+template<class OpStateGrad, class OpResetGrad>
+void hl_gpu_gru_backward(OpStateGrad opStateGrad,
+                         OpResetGrad opResetGrad,
+                         hl_gru_value value,
+                         hl_gru_grad  grad,
+                         int frameSize,
+                         int batchSize,
+                         hl_activation_mode_t active_node,
+                         hl_activation_mode_t active_gate) {
+  dim3 threads;
+  dim3 grid;
+  if (batchSize == 1) {
+    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+    int frameBlocks = (frameSize + 1024 - 1) / 1024;
+    threads = dim3(framePerBlock, 1);
+    grid = dim3(frameBlocks, 1);
+  } else {
+    threads = dim3(32, 32);
+    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+  }
+
+  if (batchSize == 1) {
+    KeGruBackwardStateGrad<OpStateGrad, /* isBatch= */false>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(opStateGrad,
+        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
+        grad.outputGrad, frameSize, batchSize, active_node);
+  } else {
+    KeGruBackwardStateGrad<OpStateGrad, /* isBatch= */true>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(opStateGrad,
+        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
+        grad.outputGrad, frameSize, batchSize, active_node);
+  }
+
+  if (value.prevOutValue && grad.prevOutGrad) {
+    hl_matrix_mul(grad.gateGrad + 2*frameSize, HPPL_OP_N,
+                  value.stateWeight, HPPL_OP_T,
+                  grad.resetOutputGrad,
+                  batchSize, frameSize, frameSize,
+                  /*alpha = */ 1, /*beta = */ 0,
+                  3*frameSize, frameSize, frameSize);
+    if (grad.stateWeightGrad) {
+      hl_matrix_mul(value.resetOutputValue, HPPL_OP_T,
+                    grad.gateGrad + 2*frameSize, HPPL_OP_N,
+                    grad.stateWeightGrad,
+                    frameSize, frameSize, batchSize,
+                    /*alpha = */ 1, /*beta = */ 1,
+                    frameSize, 3*frameSize, frameSize);
+    }
+  }
+
+  if (batchSize == 1) {
+    KeGruBackwardResetGrad<OpResetGrad, /* isBatch= */false>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetGrad,
+        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
+        grad.resetOutputGrad, frameSize, batchSize, active_gate);
+  } else {
+    KeGruBackwardResetGrad<OpResetGrad, /* isBatch= */true>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(opResetGrad,
+        value.gateValue, grad.gateGrad, value.prevOutValue, grad.prevOutGrad,
+        grad.resetOutputGrad, frameSize, batchSize, active_gate);
+  }
+
+  if (grad.prevOutGrad && value.prevOutValue) {
+    hl_matrix_mul(grad.gateGrad, HPPL_OP_N,
+                  value.gateWeight, HPPL_OP_T,
+                  grad.prevOutGrad,
+                  batchSize, frameSize, 2*frameSize,
+                  /*alpha = */ 1, /*beta = */ 1,
+                  3*frameSize, 2*frameSize, frameSize);
+    if (grad.gateWeightGrad) {
+      hl_matrix_mul(value.prevOutValue, HPPL_OP_T,
+                    grad.gateGrad, HPPL_OP_N,
+                    grad.gateWeightGrad,
+                    frameSize, 2*frameSize, batchSize,
+                    /*alpha = */ 1, /*beta = */ 1,
+                    frameSize, 3*frameSize, 2*frameSize);
+    }
+  }
+
+  CHECK_SYNC("hl_gpu_gru_backward failed");
+}
+
+#else
+
+template<class OpResetOutput, class OpFinalOutput>
+void hl_gpu_gru_forward(OpResetOutput opResetOutput,
+                        OpFinalOutput opFinalOutput,
+                        hl_gru_value value,
+                        int frameSize,
+                        int batchSize,
+                        hl_activation_mode_t active_node,
+                        hl_activation_mode_t active_gate) {}
+
+template<class OpStateGrad, class OpResetGrad>
+void hl_gpu_gru_backward(OpStateGrad opStateGrad,
+                         OpResetGrad opResetGrad,
+                         hl_gru_value value,
+                         hl_gru_grad  grad,
+                         int frameSize,
+                         int batchSize,
+                         hl_activation_mode_t active_node,
+                         hl_activation_mode_t active_gate) {}
+
+#endif
+
+#endif /* HL_GPU_GRU_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_gpu_lstm.cuh b/paddle/legacy/cuda/include/hl_gpu_lstm.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..aae011b838c0eca1197f55d236d759eab8ea993c
--- /dev/null
+++ b/paddle/legacy/cuda/include/hl_gpu_lstm.cuh
@@ -0,0 +1,300 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+#ifndef HL_GPU_LSTM_CUH_
+#define HL_GPU_LSTM_CUH_
+
+#ifdef __NVCC__
+
+#include "paddle/legacy/utils/Logging.h"
+#include "hl_device_functions.cuh"
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template<class Op, bool isBatch>
+__global__ void KeLstmForward(Op op,
+                              hl_lstm_value value,
+                              int frameSize,
+                              int batchSize,
+                              hl_activation_mode_t active_node,
+                              hl_activation_mode_t active_gate,
+                              hl_activation_mode_t active_state) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    value.gateValue += batchIdx * frameSize * 4;
+    value.outputValue += batchIdx * frameSize;
+    value.stateValue  += batchIdx * frameSize;
+    value.stateActiveValue += batchIdx * frameSize;
+  }
+
+  real rState;
+  real rPrevState = 0;
+  real rStateAtv;
+  real rOut;
+  real rValueIn;
+  real rValueIg;
+  real rValueFg;
+  real rValueOg;
+  real rCheckI = value.checkIg[frameIdx];
+  real rCheckF = value.checkFg[frameIdx];
+  real rCheckO = value.checkOg[frameIdx];
+
+  rValueIn = value.gateValue[frameIdx];
+  rValueIg = value.gateValue[frameIdx + frameSize];
+  rValueFg = value.gateValue[frameIdx + frameSize * 2];
+  rValueOg = value.gateValue[frameIdx + frameSize * 3];
+
+  if (value.prevStateValue) {
+    if (isBatch) value.prevStateValue += batchIdx * frameSize;
+    rPrevState = value.prevStateValue[frameIdx];
+  }
+
+  op(rValueIn,
+     rValueIg,
+     rValueFg,
+     rValueOg,
+     rPrevState,
+     rState,
+     rStateAtv,
+     rOut,
+     rCheckI,
+     rCheckF,
+     rCheckO,
+     hppl::gpu::forward[active_node],
+     hppl::gpu::forward[active_gate],
+     hppl::gpu::forward[active_state]);
+
+  value.gateValue[frameIdx] = rValueIn;
+  value.gateValue[frameIdx + frameSize] = rValueIg;
+  value.gateValue[frameIdx + frameSize * 2] = rValueFg;
+  value.gateValue[frameIdx + frameSize * 3] = rValueOg;
+
+  value.stateValue[frameIdx] = rState;
+  value.stateActiveValue[frameIdx] = rStateAtv;
+  value.outputValue[frameIdx] = rOut;
+}
+
+/*
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ */
+template<class Op, bool isBatch>
+__global__ void KeLstmBackward(Op op,
+                               hl_lstm_value value,
+                               hl_lstm_grad grad,
+                               int frameSize,
+                               int batchSize,
+                               hl_activation_mode_t active_node,
+                               hl_activation_mode_t active_gate,
+                               hl_activation_mode_t active_state) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    value.gateValue += batchIdx * frameSize * 4;
+    value.stateValue += batchIdx * frameSize;
+    value.stateActiveValue += batchIdx * frameSize;
+    grad.gateGrad += batchIdx * frameSize * 4;
+    grad.stateGrad += batchIdx * frameSize;
+    grad.outputGrad += batchIdx * frameSize;
+  }
+
+  real rValueIn;
+  real rValueIg;
+  real rValueFg;
+  real rValueOg;
+  real rGradIn;
+  real rGradIg;
+  real rGradFg;
+  real rGradOg;
+  real rPrevState = 0;
+  real rPrevStateGrad;
+  real rState;
+  real rStateGrad;
+  real rStateAtv;
+  real rOutputGrad;
+  real rCheckI = value.checkIg[frameIdx];
+  real rCheckF = value.checkFg[frameIdx];
+  real rCheckO = value.checkOg[frameIdx];
+  real rCheckIGrad;
+  real rCheckFGrad;
+  real rCheckOGrad;
+
+  rValueIn = value.gateValue[frameIdx];
+  rValueIg = value.gateValue[frameIdx + frameSize];
+  rValueFg = value.gateValue[frameIdx + frameSize * 2];
+  rValueOg = value.gateValue[frameIdx + frameSize * 3];
+  rState = value.stateValue[frameIdx];
+  rStateAtv = value.stateActiveValue[frameIdx];
+  rOutputGrad = grad.outputGrad[frameIdx];
+  rStateGrad = grad.stateGrad[frameIdx];
+
+  if (value.prevStateValue) {
+    if (isBatch) value.prevStateValue += batchIdx * frameSize;
+    rPrevState = value.prevStateValue[frameIdx];
+  }
+
+  op(rValueIn,
+     rValueIg,
+     rValueFg,
+     rValueOg,
+     rGradIn,
+     rGradIg,
+     rGradFg,
+     rGradOg,
+     rPrevState,
+     rPrevStateGrad,
+     rState,
+     rStateGrad,
+     rStateAtv,
+     rOutputGrad,
+     rCheckI,
+     rCheckF,
+     rCheckO,
+     rCheckIGrad,
+     rCheckFGrad,
+     rCheckOGrad,
+     hppl::gpu::backward[active_node],
+     hppl::gpu::backward[active_gate],
+     hppl::gpu::backward[active_state]);
+
+  grad.gateGrad[frameIdx] = rGradIn;
+  grad.gateGrad[frameIdx + frameSize    ] = rGradIg;
+  grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
+  grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
+  grad.stateGrad[frameIdx] = rStateGrad;
+  if (grad.prevStateGrad) {
+    if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
+    grad.prevStateGrad[frameIdx] = rPrevStateGrad;
+  }
+
+  if (isBatch) {
+    if (value.prevStateValue) {
+      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
+      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
+    }
+    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
+  } else {
+    if (value.prevStateValue) {
+      if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
+      if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
+    }
+    if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
+  }
+}
+
+template<class Op>
+void hl_gpu_lstm_forward(Op op,
+                         hl_lstm_value value,
+                         int frameSize,
+                         int batchSize,
+                         hl_activation_mode_t active_node,
+                         hl_activation_mode_t active_gate,
+                         hl_activation_mode_t active_state) {
+  dim3 threads;
+  dim3 grid;
+  if (batchSize == 1) {
+    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+    int frameBlocks = (frameSize + 1024 - 1) / 1024;
+    threads = dim3(framePerBlock, 1);
+    grid = dim3(frameBlocks, 1);
+  } else {
+    /* framePerBlock = 32 batchPerBlock = 32 */
+    threads = dim3(32, 32);
+    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+  }
+
+  if (batchSize == 1) {
+    KeLstmForward<Op, /* isBatch= */false>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value,
+      frameSize, batchSize, active_node, active_gate, active_state);
+  } else {
+    KeLstmForward<Op, /* isBatch= */true>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value,
+      frameSize, batchSize, active_node, active_gate, active_state);
+  }
+
+  CHECK_SYNC("hl_gpu_lstm_forward failed");
+}
+
+template<class Op>
+void hl_gpu_lstm_backward(Op op,
+                          hl_lstm_value value,
+                          hl_lstm_grad grad,
+                          int frameSize,
+                          int batchSize,
+                          hl_activation_mode_t active_node,
+                          hl_activation_mode_t active_gate,
+                          hl_activation_mode_t active_state) {
+  dim3 threads;
+  dim3 grid;
+  if (batchSize == 1) {
+    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+    int frameBlocks = (frameSize + 1024 - 1) / 1024;
+    threads = dim3(framePerBlock, 1);
+    grid = dim3(frameBlocks, 1);
+  } else {
+    /* framePerBlock = 32 batchPerBlock = 32 */
+    threads = dim3(32, 32);
+    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+  }
+
+  if (batchSize == 1) {
+    KeLstmBackward<Op, /* isBatch= */false>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value, grad,
+      frameSize, batchSize, active_node, active_gate, active_state);
+  } else {
+    KeLstmBackward<Op, /* isBatch= */true>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>(op, value, grad,
+      frameSize, batchSize, active_node, active_gate, active_state);
+  }
+
+  CHECK_SYNC("hl_gpu_lstm_backward failed");
+}
+
+#else
+
+template<class Op>
+void hl_gpu_lstm_forward(Op op,
+                         hl_lstm_value value,
+                         int frameSize,
+                         int batchSize,
+                         hl_activation_mode_t active_node,
+                         hl_activation_mode_t active_gate,
+                         hl_activation_mode_t active_state) {}
+
+template<class Op>
+void hl_gpu_lstm_backward(Op op,
+                          hl_lstm_value value,
+                          hl_lstm_grad grad,
+                          int frameSize,
+                          int batchSize,
+                          hl_activation_mode_t active_node,
+                          hl_activation_mode_t active_gate,
+                          hl_activation_mode_t active_state) {}
+
+#endif
+
+#endif /* HL_GPU_LSTM_CUH_ */
diff --git a/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh b/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6177d23657fba5b2800041a3dd7b5f76bf35aa1a
--- /dev/null
+++ b/paddle/legacy/cuda/include/hl_gpu_matrix_kernel.cuh
@@ -0,0 +1,629 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+
+#ifndef HL_GPU_MATRIX_KERNEL_CUH_
+#define HL_GPU_MATRIX_KERNEL_CUH_
+
+#include <algorithm>
+#include "paddle/legacy/utils/Logging.h"
+#include "hl_base.h"
+
+#ifdef __NVCC__
+/* gpu apply interface */
+
+template<class T, class Op>
+__global__ void KeEltWiseUnaryOp(T* A_d, const int border, Op op) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < border) {
+    op.gpuOperator(A_d[idx]);
+  }
+}
+
+template<class T, class Op>
+__global__ void KeEltWiseUnaryOp(T* A_d,
+                                 int dimM,
+                                 int dimN,
+                                 int lda,
+                                 Op op) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
+    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
+      op.gpuOperator(A_d[i * lda + j]);
+    }
+  }
+}
+
+template<class T, class Op>
+__global__ void KeEltWiseBinaryOp(T* A_d, T *B_d, const int border, Op op) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < border) {
+    op.gpuOperator(A_d[idx], B_d[idx]);
+  }
+}
+
+template<class T, class Op, bool BAsRowVector, bool BAsColVector>
+__global__ void KeEltWiseBinaryOp(T *A_d,
+                                  T *B_d,
+                                  int dimM,
+                                  int dimN,
+                                  int lda,
+                                  int ldb,
+                                  Op op) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
+    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
+      if (BAsRowVector == 0 && BAsColVector == 0) {
+        op.gpuOperator(A_d[i * lda + j], B_d[i * ldb + j]);
+      } else if (BAsRowVector == 1 && BAsColVector == 0) {
+        op.gpuOperator(A_d[i * lda + j], B_d[j]);
+      } else if (BAsRowVector == 0 && BAsColVector == 1) {
+        op.gpuOperator(A_d[i * lda + j], B_d[i * ldb]);
+      } else {
+        op.gpuOperator(A_d[i * lda + j], B_d[0]);
+      }
+    }
+  }
+}
+
+template<class T, class Op>
+__global__ void KeEltWiseTernaryOp(T* A_d,
+                                   T *B_d,
+                                   T *C_d,
+                                   const int border,
+                                   Op op) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < border) {
+    op.gpuOperator(A_d[idx], B_d[idx], C_d[idx]);
+  }
+}
+
+template<class T, class Op, bool CAsRowVector, bool CAsColVector>
+__global__ void KeEltWiseTernaryOp(T* A_d,
+                                   T* B_d,
+                                   T* C_d,
+                                   int dimM,
+                                   int dimN,
+                                   int lda,
+                                   int ldb,
+                                   int ldc,
+                                   Op op) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
+    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
+      if (CAsRowVector == 0 && CAsColVector == 0) {
+        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc + j]);
+      } else if (CAsRowVector == 1 && CAsColVector == 0) {
+        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[j]);
+      } else if (CAsRowVector == 0 && CAsColVector == 1) {
+        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[i*ldc]);
+      } else {
+        op.gpuOperator(A_d[i*lda + j], B_d[i*ldb + j], C_d[0]);
+      }
+    }
+  }
+}
+
+template<class T, class Op>
+__global__ void KeEltWiseQuaternaryOp(T* A_d,
+                                      T* B_d,
+                                      T* C_d,
+                                      T* D_d,
+                                      const int border,
+                                      Op op) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < border) {
+    op.gpuOperator(A_d[idx], B_d[idx], C_d[idx], D_d[idx]);
+  }
+}
+
+template<class T, class Op>
+__global__ void KeEltWiseQuaternaryOp(T* A_d,
+                                      T* B_d,
+                                      T* C_d,
+                                      T* D_d,
+                                      int dimM,
+                                      int dimN,
+                                      int lda,
+                                      int ldb,
+                                      int ldc,
+                                      int ldd,
+                                      Op op) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int i = rowIdx; i < dimM; i += gridDim.y * blockDim.y) {
+    for (int j = colIdx; j < dimN; j += gridDim.x * blockDim.x) {
+      op.gpuOperator(A_d[i*lda + j],
+        B_d[i*ldb + j], C_d[i*ldc + j], D_d[i*ldd + j]);
+    }
+  }
+}
+
+/**
+ * @brief   gpu element wise unary operator.
+ */
+template <class T, class Op>
+void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {
+  CHECK_NOTNULL(A_d);
+
+  if (dimM == 1 || dimN == lda) {
+    int size = dimM * dimN;
+    int blockSize = size <= 1024 ? size : 1024;
+    int gridSize = (size + 1024 - 1) / 1024;
+    KeEltWiseUnaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+      (A_d, size, op);
+  } else {
+    int blockSizeY = std::min(32, dimM);
+    int blockSizeX = (32 / blockSizeY) * 32;
+    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
+    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
+    dim3 threads(blockSizeX, blockSizeY);
+    dim3 grid(gridSizeX, gridSizeY);
+    KeEltWiseUnaryOp<T, Op><<<grid, threads, 0, STREAM_DEFAULT>>>
+      (A_d, dimM, dimN, lda, op);
+  }
+
+  CHECK_SYNC("hl_gpu_apply_unary_op failed");
+}
+
+/**
+ * @brief   gpu element wise binary operator.
+ */
+template <class T, class Op, bool BAsRowVector, bool BAsColVector>
+void hl_gpu_apply_binary_op(Op op,
+                            T* A_d,
+                            T* B_d,
+                            int dimM,
+                            int dimN,
+                            int lda,
+                            int ldb) {
+  CHECK_NOTNULL(A_d);
+
+  if ((BAsRowVector == 0 && BAsColVector == 0) &&
+      ((dimM == 1) || (dimN == lda && dimN == ldb))) {
+    int size = dimM * dimN;
+    int blockSize = size <= 1024 ? size : 1024;
+    int gridSize = (size + 1024 - 1) / 1024;
+    KeEltWiseBinaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+      (A_d, B_d, size, op);
+  } else {
+    int blockSizeY = std::min(32, dimM);
+    int blockSizeX = (32 / blockSizeY) * 32;
+    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
+    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
+    dim3 threads(blockSizeX, blockSizeY);
+    dim3 grid(gridSizeX, gridSizeY);
+    KeEltWiseBinaryOp<T, Op, BAsRowVector, BAsColVector>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>
+      (A_d, B_d, dimM, dimN, lda, ldb, op);
+  }
+
+  CHECK_SYNC("hl_gpu_apply_binary_op failed");
+}
+
+/**
+ * @brief   gpu element wise ternary operator.
+ */
+template <class T, class Op, bool CAsRowVector, bool CAsColVector>
+void hl_gpu_apply_ternary_op(Op op,
+                             T* A_d,
+                             T* B_d,
+                             T* C_d,
+                             int dimM,
+                             int dimN,
+                             int lda,
+                             int ldb,
+                             int ldc) {
+  CHECK_NOTNULL(A_d);
+
+  if ((CAsRowVector == 0 && CAsColVector == 0) &&
+      ((dimM == 1) || (dimN == lda && dimN == ldb && dimN == ldc))) {
+    int size = dimM * dimN;
+    int blockSize = size <= 1024 ? size : 1024;
+    int gridSize = (size + 1024 - 1) / 1024;
+    KeEltWiseTernaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+      (A_d, B_d, C_d, size, op);
+  } else {
+    int blockSizeY = std::min(32, dimM);
+    int blockSizeX = (32 / blockSizeY) * 32;
+    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
+    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
+    dim3 threads(blockSizeX, blockSizeY);
+    dim3 grid(gridSizeX, gridSizeY);
+    KeEltWiseTernaryOp<T, Op, CAsRowVector, CAsColVector>
+      <<<grid, threads, 0, STREAM_DEFAULT>>>
+      (A_d, B_d, C_d, dimM, dimN, lda, ldb, ldc, op);
+  }
+
+  CHECK_SYNC("hl_gpu_apply_ternary_op failed");
+}
+
+
+/**
+ * @brief   gpu element wise quaternary operator.
+ */
+template <class T, class Op>
+void hl_gpu_apply_quaternary_op(Op op,
+                                T* A_d,
+                                T* B_d,
+                                T* C_d,
+                                T* D_d,
+                                int dimM,
+                                int dimN,
+                                int lda,
+                                int ldb,
+                                int ldc,
+                                int ldd) {
+  CHECK_NOTNULL(A_d);
+
+  if ((dimM == 1) ||
+      (dimN == lda && dimN == ldb && dimN == ldc && dimN == ldd)) {
+    int size = dimM * dimN;
+    int blockSize = size <= 1024 ? size : 1024;
+    int gridSize = (size + 1024 - 1) / 1024;
+    KeEltWiseQuaternaryOp<T, Op><<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+      (A_d, B_d, C_d, D_d, size, op);
+  } else {
+    int blockSizeY = std::min(32, dimM);
+    int blockSizeX = (32 / blockSizeY) * 32;
+    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
+    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
+    dim3 threads(blockSizeX, blockSizeY);
+    dim3 grid(gridSizeX, gridSizeY);
+    KeEltWiseQuaternaryOp<T, Op><<<grid, threads, 0, STREAM_DEFAULT>>>
+      (A_d, B_d, C_d, D_d, dimM, dimN, lda, ldb, ldc, ldd, op);
+  }
+
+  CHECK_SYNC("hl_gpu_apply_quaternary_op failed");
+}
+
+#else
+
+template <class T, class Op>
+void hl_gpu_apply_unary_op(Op op, T* A_d, int dimM, int dimN, int lda) {}
+
+template <class T, class Op, bool BAsRowVector, bool BAsColVector>
+void hl_gpu_apply_binary_op(Op op,
+                            T* A_d,
+                            T* B_d,
+                            int dimM,
+                            int dimN,
+                            int lda,
+                            int ldb) {}
+
+template <class T, class Op, bool CAsRowVector, bool CAsColVector>
+void hl_gpu_apply_ternary_op(Op op,
+                             T* A_d,
+                             T* B_d,
+                             T* C_d,
+                             int dimM,
+                             int dimN,
+                             int lda,
+                             int ldb,
+                             int ldc) {}
+
+template <class T, class Op>
+void hl_gpu_apply_quaternary_op(Op op,
+                                T* A_d,
+                                T* B_d,
+                                T* C_d,
+                                T* D_d,
+                                int dimM,
+                                int dimN,
+                                int lda,
+                                int ldb,
+                                int ldc,
+                                int ldd) {}
+#endif
+
+#ifdef __NVCC__
+/**
+ * @brief   matrix row operator.
+ */
+
+template<class Agg, class Op>
+__device__ __inline__ real sumRow(Agg agg, Op op,
+                                  int idx, int blockSize,
+                                  int dimN, real *A) {
+  real tmp = agg.init();
+  int cnt = (dimN + blockSize -1) / blockSize;
+  for (int i = 0; i < cnt && idx < dimN; i++) {
+      tmp = agg(tmp, op(A[idx]));
+      idx += blockSize;
+  }
+  return tmp;
+}
+
+template<class Agg, class Op>
+__device__ __inline__ real sumRow(Agg agg, Op op,
+                                  int idx, int blockSize,
+                                  int dimN, real *A, real *B) {
+  real tmp = agg.init();
+  int cnt = (dimN + blockSize -1) / blockSize;
+  for (int i = 0; i < cnt && idx < dimN; i++) {
+    tmp = agg(tmp, op(A[idx], B[idx]));
+    idx += blockSize;
+  }
+  return tmp;
+}
+
+template<class Agg>
+__device__ __inline__ void aggRow(Agg agg, real *row, int size, int tid) {
+  for (int stride = size/2; stride > 0; stride = stride/2) {
+    if (tid < stride) {
+      row[tid] = agg(row[tid], row[tid + stride]);
+    }
+    __syncthreads();
+  }
+}
+
+template<class Agg, class Op, class Saver, int blockSize>
+__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv,
+                              int dimN,
+                              real *dst, int ld,
+                              real *A, int lda) {
+  __shared__ real row_s[blockSize];
+  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
+  int tid = threadIdx.x;
+
+  A += rowId*lda;
+  row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A);
+  __syncthreads();
+
+  aggRow(agg, row_s, blockSize, tid);
+  __syncthreads();
+
+  if (tid == 0) {
+    dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]);
+  }
+}
+
+template<class Agg, class Op, class Saver, int blockSize>
+__global__ void KeMatrixRowOp(Agg agg, Op op, Saver sv,
+                              int dimN,
+                              real *dst, int ld,
+                              real *A, int lda,
+                              real *B, int ldb) {
+  __shared__ real row_s[blockSize];
+  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
+  int tid = threadIdx.x;
+
+  A += rowId*lda;
+  B += rowId*ldb;
+  row_s[tid] = sumRow(agg, op, tid, blockSize, dimN, A, B);
+  __syncthreads();
+
+  aggRow(agg, row_s, blockSize, tid);
+  __syncthreads();
+
+  if (tid == 0) {
+    dst[rowId*ld] = sv(dst[rowId*ld], row_s[0]);
+  }
+}
+
+/**
+ * @brief   matrix column operator.
+ */
+template <class Agg, class Op>
+__device__ __inline__ real sumCol(Agg agg, Op op,
+                                  int index, int stride,
+                                  int dimM, real *A, int lda) {
+  real tmp = agg.init();
+  for (; index < dimM;) {
+    tmp = agg(tmp, op(A[index*lda]));
+    index += stride;
+  }
+  return tmp;
+}
+
+template <class Agg, class Op>
+__device__ __inline__ real sumCol(Agg agg, Op op,
+                                  int index, int stride, int dimM,
+                                  real *A, int lda, real *B, int ldb) {
+  real tmp = agg.init();
+  for (; index < dimM;) {
+    tmp = agg(tmp, op(A[index*lda], B[index*ldb]));
+    index += stride;
+  }
+  return tmp;
+}
+
+template <class Agg, class Op, class Saver>
+__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv,
+                                 int dimM, int dimN,
+                                 real *dst,
+                                 real *A, int lda) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (rowIdx < dimN) {
+    A += rowIdx;
+    real tmp = sumCol(agg, op, 0, 1, dimM, A, lda);
+    dst[rowIdx] = sv(dst[rowIdx], tmp);
+  }
+}
+
+template <class Agg, class Op, class Saver, int blockDimX, int blockDimY>
+__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv,
+                                   int dimM, int dimN,
+                                   real *dst,
+                                   real *A, int lda) {
+  __shared__ real col_s[blockDimX*blockDimY];
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (rowIdx < dimN) {
+    A += rowIdx;
+    real tmp = sumCol(agg, op, threadIdx.y, blockDimY, dimM, A, lda);
+    col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp;
+  }
+  __syncthreads();
+
+  if (rowIdx < dimN) {
+    if (threadIdx.y ==0) {
+      real tmp = agg.init();
+      for (int i=0; i < blockDimY; i++) {
+        tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]);
+      }
+      dst[rowIdx] = sv(dst[rowIdx], tmp);
+    }
+  }
+}
+
+template <class Agg, class Op, class Saver>
+__global__ void KeMatrixColumnOp(Agg agg, Op op, Saver sv,
+                                 int dimM, int dimN,
+                                 real *dst,
+                                 real *A, int lda,
+                                 real *B, int ldb) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (rowIdx < dimN) {
+    A += rowIdx;
+    B += rowIdx;
+    real tmp = sumCol(agg, op, 0, 1, dimM, A, lda, B, ldb);
+    dst[rowIdx] = sv(dst[rowIdx], tmp);
+  }
+}
+
+template <class Agg, class Op, class Saver, int blockDimX, int blockDimY>
+__global__ void KeMatrixColumnOp_S(Agg agg, Op op, Saver sv,
+                                   int dimM, int dimN,
+                                   real *dst,
+                                   real *A, int lda,
+                                   real *B, int ldb) {
+  __shared__ real col_s[blockDimX*blockDimY];
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (rowIdx < dimN) {
+    A += rowIdx;
+    B += rowIdx;
+    real tmp = sumCol(agg, op,
+        threadIdx.y, blockDimY, dimM, A, lda, B, ldb);
+    col_s[threadIdx.x + threadIdx.y*blockDimX] = tmp;
+  }
+  __syncthreads();
+
+  if (rowIdx < dimN) {
+    if (threadIdx.y ==0) {
+      real tmp = agg.init();
+      for (int i=0; i < blockDimY; i++) {
+        tmp = agg(tmp, col_s[threadIdx.x + i*blockDimX]);
+      }
+      dst[rowIdx] = sv(dst[rowIdx], tmp);
+    }
+  }
+}
+
+#endif
+
+template <class Agg, class Op, class Saver>
+void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
+                          int dimM, int dimN,
+                          real *dst, int ld,
+                          real *A, int lda) {
+#ifdef __NVCC__
+  CHECK_NOTNULL(dst);
+  CHECK_NOTNULL(A);
+
+  int blocksX = dimM;
+  int blocksY = 1;
+  dim3 threads(128, 1);
+  dim3 grid(blocksX, blocksY);
+  KeMatrixRowOp<Agg, Op, Saver, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (agg, op, sv, dimN, dst, ld, A, lda);
+
+  CHECK_SYNC("hl_matrix_row_op failed");
+#endif
+}
+
+template <class Agg, class Op, class Saver>
+void hl_gpu_matrix_row_op(Agg agg, Op op, Saver sv,
+                          int dimM, int dimN,
+                          real *dst, int ld,
+                          real *A, int lda,
+                          real *B, int ldb) {
+#ifdef __NVCC__
+  CHECK_NOTNULL(dst);
+  CHECK_NOTNULL(A);
+
+  int blocksX = dimM;
+  int blocksY = 1;
+  dim3 threads(128, 1);
+  dim3 grid(blocksX, blocksY);
+  KeMatrixRowOp<Agg, Op, Saver, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
+    (agg, op, sv, dimN, dst, ld, A, lda, B, ldb);
+
+  CHECK_SYNC("hl_matrix_row_op failed");
+#endif
+}
+
+template <class Agg, class Op, class Saver>
+void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda) {
+#ifdef __NVCC__
+  if (dimN >= 8192) {
+    int blocksX = (dimN + 128 -1) / 128;
+    int blocksY = 1;
+    dim3 threads(128, 1);
+    dim3 grid(blocksX, blocksY);
+    KeMatrixColumnOp<Agg, Op, Saver>
+        <<< grid, threads, 0, STREAM_DEFAULT >>>
+        (agg, op, sv, dimM, dimN, dst, A, lda);
+  } else {
+    int blocksX = (dimN + 32 -1) / 32;
+    int blocksY = 1;
+    dim3 threads(32, 32);
+    dim3 grid(blocksX, blocksY);
+    KeMatrixColumnOp_S<Agg, Op, Saver, 32, 32>
+        <<< grid, threads, 0, STREAM_DEFAULT>>>
+        (agg, op, sv, dimM, dimN, dst, A, lda);
+  }
+
+  CHECK_SYNC("hl_matrix_column_op failed");
+#endif
+}
+
+template <class Agg, class Op, class Saver>
+void hl_gpu_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda,
+                             real *B, int ldb) {
+#ifdef __NVCC__
+  if (dimN >= 8192) {
+    int blocksX = (dimN + 128 -1) / 128;
+    int blocksY = 1;
+    dim3 threads(128, 1);
+    dim3 grid(blocksX, blocksY);
+    KeMatrixColumnOp<Agg, Op, Saver>
+        <<< grid, threads, 0, STREAM_DEFAULT >>>
+        (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  } else {
+    int blocksX = (dimN + 32 -1) / 32;
+    int blocksY = 1;
+    dim3 threads(32, 32);
+    dim3 grid(blocksX, blocksY);
+    KeMatrixColumnOp_S<Agg, Op, Saver, 32, 32>
+        <<< grid, threads, 0, STREAM_DEFAULT>>>
+        (agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  }
+
+  CHECK_SYNC("hl_matrix_column_op failed");
+#endif
+}
+
+#endif /* HL_GPU_MATRIX_KERNEL_CUH_ */
diff --git a/paddle/cuda/include/hl_gru_ops.cuh b/paddle/legacy/cuda/include/hl_gru_ops.cuh
similarity index 100%
rename from paddle/cuda/include/hl_gru_ops.cuh
rename to paddle/legacy/cuda/include/hl_gru_ops.cuh
diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/legacy/cuda/include/hl_lstm.h
similarity index 100%
rename from paddle/cuda/include/hl_lstm.h
rename to paddle/legacy/cuda/include/hl_lstm.h
diff --git a/paddle/cuda/include/hl_lstm_ops.cuh b/paddle/legacy/cuda/include/hl_lstm_ops.cuh
similarity index 100%
rename from paddle/cuda/include/hl_lstm_ops.cuh
rename to paddle/legacy/cuda/include/hl_lstm_ops.cuh
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/legacy/cuda/include/hl_matrix.h
similarity index 100%
rename from paddle/cuda/include/hl_matrix.h
rename to paddle/legacy/cuda/include/hl_matrix.h
diff --git a/paddle/cuda/include/hl_matrix_apply.cuh b/paddle/legacy/cuda/include/hl_matrix_apply.cuh
similarity index 100%
rename from paddle/cuda/include/hl_matrix_apply.cuh
rename to paddle/legacy/cuda/include/hl_matrix_apply.cuh
diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/legacy/cuda/include/hl_matrix_base.cuh
similarity index 100%
rename from paddle/cuda/include/hl_matrix_base.cuh
rename to paddle/legacy/cuda/include/hl_matrix_base.cuh
diff --git a/paddle/cuda/include/hl_matrix_base_detail.cuh b/paddle/legacy/cuda/include/hl_matrix_base_detail.cuh
similarity index 100%
rename from paddle/cuda/include/hl_matrix_base_detail.cuh
rename to paddle/legacy/cuda/include/hl_matrix_base_detail.cuh
diff --git a/paddle/cuda/include/hl_matrix_ops.cuh b/paddle/legacy/cuda/include/hl_matrix_ops.cuh
similarity index 100%
rename from paddle/cuda/include/hl_matrix_ops.cuh
rename to paddle/legacy/cuda/include/hl_matrix_ops.cuh
diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/legacy/cuda/include/hl_matrix_type.cuh
similarity index 100%
rename from paddle/cuda/include/hl_matrix_type.cuh
rename to paddle/legacy/cuda/include/hl_matrix_type.cuh
diff --git a/paddle/cuda/include/hl_perturbation_util.cuh b/paddle/legacy/cuda/include/hl_perturbation_util.cuh
similarity index 100%
rename from paddle/cuda/include/hl_perturbation_util.cuh
rename to paddle/legacy/cuda/include/hl_perturbation_util.cuh
diff --git a/paddle/cuda/include/hl_recurrent_apply.cuh b/paddle/legacy/cuda/include/hl_recurrent_apply.cuh
similarity index 100%
rename from paddle/cuda/include/hl_recurrent_apply.cuh
rename to paddle/legacy/cuda/include/hl_recurrent_apply.cuh
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/legacy/cuda/include/hl_sequence.h
similarity index 100%
rename from paddle/cuda/include/hl_sequence.h
rename to paddle/legacy/cuda/include/hl_sequence.h
diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/legacy/cuda/include/hl_sparse.h
similarity index 100%
rename from paddle/cuda/include/hl_sparse.h
rename to paddle/legacy/cuda/include/hl_sparse.h
diff --git a/paddle/cuda/include/hl_sparse.ph b/paddle/legacy/cuda/include/hl_sparse.ph
similarity index 100%
rename from paddle/cuda/include/hl_sparse.ph
rename to paddle/legacy/cuda/include/hl_sparse.ph
diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/legacy/cuda/include/hl_table_apply.h
similarity index 100%
rename from paddle/cuda/include/hl_table_apply.h
rename to paddle/legacy/cuda/include/hl_table_apply.h
diff --git a/paddle/legacy/cuda/include/hl_tensor_ops.h b/paddle/legacy/cuda/include/hl_tensor_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc5e5da53d5c6ac2bae3b0067f46e39accd1b9d8
--- /dev/null
+++ b/paddle/legacy/cuda/include/hl_tensor_ops.h
@@ -0,0 +1,536 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_TENSOR_OPS_H_
+#define HL_TENSOR_OPS_H_
+
+#include <cmath>
+#include "hl_matrix_type.cuh"
+
+namespace hppl {
+namespace unary {
+
+template <class T>
+class add_scale {
+ private:
+  const T p;
+
+ public:
+  INLINE add_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a + p; }
+};
+
+template <class T>
+class sub_scale {
+ private:
+  const T p;
+
+ public:
+  INLINE sub_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a - p; }
+};
+
+template <class T>
+class mul_scale {
+ private:
+  const T p;
+
+ public:
+  INLINE mul_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a * p; }
+};
+
+template <class T>
+class div_scale {
+ private:
+  const T p;
+
+ public:
+  INLINE div_scale(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a / p; }
+};
+
+template <class T>
+class neg {
+ public:
+  INLINE T operator()(const T a) const { return -a; }
+};
+
+template <class T>
+class exp_op {
+ public:
+  INLINE T operator()(const T a) const { return std::exp(a); }
+};
+
+template <class T>
+class log_op {
+ public:
+  INLINE T operator()(const T a) const { return std::log(a); }
+};
+
+template <class T>
+class sqrt_op {
+ public:
+  INLINE T operator()(const T a) const { return std::sqrt(a); }
+};
+
+template <class T>
+class square {
+ public:
+  INLINE T operator()(const T a) const { return a * a; }
+};
+
+template <class T>
+class reciprocal {
+ public:
+  INLINE T operator()(const T a) const { return T(1) / a; }
+};
+
+template <class T>
+class abs {
+ public:
+  INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
+};
+
+template <class T>
+class sign {
+ public:
+  INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
+};
+
+template <class T>
+class min {
+ private:
+  const T p;
+
+ public:
+  INLINE min(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a > p ? p : a; }
+};
+
+template <class T>
+class max {
+ private:
+  const T p;
+
+ public:
+  INLINE max(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return a < p ? p : a; }
+};
+
+template <class T>
+class pow_op {
+ private:
+  const T p;
+
+ public:
+  INLINE pow_op(const T s) : p(s) {}
+  INLINE T operator()(const T a) const { return std::pow(a, p); }
+};
+
+template <class T>
+class constant {
+ private:
+  const T p;
+
+ public:
+  INLINE constant(const T s) : p(s) {}
+  INLINE T operator()(int i) const { return p; }
+  INLINE T operator()(int i, int j) const { return p; }
+};
+
+template <class T>
+class cmp_eq {
+ private:
+  const T p;
+
+ public:
+  INLINE cmp_eq(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a == p; }
+};
+
+template <class T>
+class cmp_ne {
+ private:
+  const T p;
+
+ public:
+  INLINE cmp_ne(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a != p; }
+};
+
+template <class T>
+class cmp_le {
+ private:
+  const T p;
+
+ public:
+  INLINE cmp_le(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a <= p; }
+};
+
+template <class T>
+class cmp_lt {
+ private:
+  const T p;
+
+ public:
+  INLINE cmp_lt(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a < p; }
+};
+
+template <class T>
+class cmp_ge {
+ private:
+  const T p;
+
+ public:
+  INLINE cmp_ge(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a >= p; }
+};
+
+template <class T>
+class cmp_gt {
+ private:
+  const T p;
+
+ public:
+  INLINE cmp_gt(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a > p; }
+};
+
+template <class T>
+class and_op {
+ private:
+  const T p;
+
+ public:
+  INLINE and_op(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a && p; }
+};
+
+template <class T>
+class or_op {
+ private:
+  const T p;
+
+ public:
+  INLINE or_op(const T s) : p(s) {}
+  INLINE bool operator()(const T a) const { return a || p; }
+};
+
+}  // namespace unary
+
+namespace binary {
+template <class T>
+class add {
+ public:
+  INLINE T operator()(const T a, const T b) const { return a + b; }
+};
+
+template <class T>
+class add_scale {
+ private:
+  const T p1;
+  const T p2;
+
+ public:
+  INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
+  INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
+};
+
+template <class T>
+class sub {
+ public:
+  INLINE T operator()(const T a, const T b) const { return a - b; }
+};
+
+template <class T>
+class mul {
+ public:
+  INLINE T operator()(const T a, const T b) const { return a * b; }
+};
+
+template <class T>
+class div {
+ public:
+  INLINE T operator()(const T a, const T b) const { return a / b; }
+};
+
+template <class T>
+class cmp_eq {
+ public:
+  INLINE bool operator()(const T a, const T b) const { return a == b; }
+};
+
+template <class T>
+class cmp_ne {
+ public:
+  INLINE bool operator()(const T a, const T b) const { return a != b; }
+};
+
+template <class T>
+class cmp_le {
+ public:
+  INLINE bool operator()(const T a, const T b) const { return a <= b; }
+};
+
+template <class T>
+class cmp_lt {
+ public:
+  INLINE bool operator()(const T a, const T b) const { return a < b; }
+};
+
+template <class T>
+class cmp_ge {
+ public:
+  INLINE bool operator()(const T a, const T b) const { return a >= b; }
+};
+
+template <class T>
+class cmp_gt {
+ public:
+  INLINE bool operator()(const T a, const T b) const { return a > b; }
+};
+
+template <class T>
+class and_op {
+ public:
+  INLINE bool operator()(const T a, const T b) const { return a && b; }
+};
+
+template <class T>
+class or_op {
+ public:
+  INLINE bool operator()(const T a, const T b) const { return a || b; }
+};
+
+template <class T>
+class min {
+ public:
+  INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
+};
+
+template <class T>
+class max {
+ public:
+  INLINE T operator()(const T a, const T b) const { return a < b ? b : a; }
+};
+
+#ifdef PADDLE_USE_SSE3
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<__m128> {
+ public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128> {
+ private:
+  const __m128 p1;
+  const __m128 p2;
+
+ public:
+  INLINE add_scale(const __m128 s1, const __m128 s2) : p1(s1), p2(s2) {}
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_add_ps(_mm_mul_ps(p1, a), _mm_mul_ps(p2, b));
+  }
+};
+
+template <>
+class sub<__m128> {
+ public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_sub_ps(a, b);
+  }
+};
+
+template <>
+class mul<__m128> {
+ public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_mul_ps(a, b);
+  }
+};
+
+template <>
+class div<__m128> {
+ public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_div_ps(a, b);
+  }
+};
+
+template <>
+class min<__m128> {
+ public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_min_ps(a, b);
+  }
+};
+
+template <>
+class max<__m128> {
+ public:
+  INLINE __m128 operator()(const __m128 a, const __m128 b) const {
+    return _mm_max_ps(a, b);
+  }
+};
+#else
+template <>
+class add<__m128d> {
+ public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(a, b);
+  }
+};
+
+template <>
+class add_scale<__m128d> {
+ private:
+  const __m128d p1;
+  const __m128d p2;
+
+ public:
+  INLINE add_scale(const __m128d s1, const __m128d s2) : p1(s1), p2(s2) {}
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_add_pd(_mm_mul_pd(p1, a), _mm_mul_pd(p2, b));
+  }
+};
+
+template <>
+class sub<__m128d> {
+ public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_sub_pd(a, b);
+  }
+};
+
+template <>
+class mul<__m128d> {
+ public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_mul_pd(a, b);
+  }
+};
+
+template <>
+class div<__m128d> {
+ public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_div_pd(a, b);
+  }
+};
+
+template <>
+class min<__m128d> {
+ public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_min_pd(a, b);
+  }
+};
+
+template <>
+class max<__m128d> {
+ public:
+  INLINE __m128d operator()(const __m128d a, const __m128d b) const {
+    return _mm_max_pd(a, b);
+  }
+};
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_SSE3
+
+#ifdef PADDLE_USE_NEON
+#ifndef PADDLE_TYPE_DOUBLE
+template <>
+class add<float32x4_t> {
+ public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vaddq_f32(a, b);
+  }
+};
+
+template <>
+class add_scale<float32x4_t> {
+ private:
+  const float32x4_t p1;
+  const float32x4_t p2;
+
+ public:
+  INLINE add_scale(const float32x4_t s1, const float32x4_t s2)
+      : p1(s1), p2(s2) {}
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vaddq_f32(vmulq_f32(p1, a), vmulq_f32(p2, b));
+  }
+};
+
+template <>
+class sub<float32x4_t> {
+ public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vsubq_f32(a, b);
+  }
+};
+
+template <>
+class mul<float32x4_t> {
+ public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+
+template <>
+class div<float32x4_t> {
+ public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    float32x4_t tmp = vrecpeq_f32(b);
+    return vmulq_f32(a, tmp);
+  }
+};
+
+template <>
+class min<float32x4_t> {
+ public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vminq_f32(a, b);
+  }
+};
+
+template <>
+class max<float32x4_t> {
+ public:
+  INLINE float32x4_t operator()(const float32x4_t a,
+                                const float32x4_t b) const {
+    return vmaxq_f32(a, b);
+  }
+};
+#else
+#error To be implemented
+#endif  // PADDLE_TYPE_DOUBLE
+#endif  // PADDLE_USE_NEON
+
+}  // namespace binary
+}  // namespace hppl
+
+#endif  // HL_TENSOR_OPS_H_
diff --git a/paddle/cuda/include/hl_thread.ph b/paddle/legacy/cuda/include/hl_thread.ph
similarity index 100%
rename from paddle/cuda/include/hl_thread.ph
rename to paddle/legacy/cuda/include/hl_thread.ph
diff --git a/paddle/cuda/include/hl_time.h b/paddle/legacy/cuda/include/hl_time.h
similarity index 100%
rename from paddle/cuda/include/hl_time.h
rename to paddle/legacy/cuda/include/hl_time.h
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/legacy/cuda/include/hl_top_k.h
similarity index 100%
rename from paddle/cuda/include/hl_top_k.h
rename to paddle/legacy/cuda/include/hl_top_k.h
diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/legacy/cuda/include/hl_warpctc_wrap.h
similarity index 100%
rename from paddle/cuda/include/hl_warpctc_wrap.h
rename to paddle/legacy/cuda/include/hl_warpctc_wrap.h
diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/legacy/cuda/include/stub/hl_aggregate_stub.h
similarity index 100%
rename from paddle/cuda/include/stub/hl_aggregate_stub.h
rename to paddle/legacy/cuda/include/stub/hl_aggregate_stub.h
diff --git a/paddle/legacy/cuda/include/stub/hl_cnn_stub.h b/paddle/legacy/cuda/include/stub/hl_cnn_stub.h
new file mode 100644
index 0000000000000000000000000000000000000000..997eed62e07827f375c7441554b397fdd0bd6a80
--- /dev/null
+++ b/paddle/legacy/cuda/include/stub/hl_cnn_stub.h
@@ -0,0 +1,247 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_CNN_STUB_H_
+#define HL_CNN_STUB_H_
+
+#include "hl_cnn.h"
+
+inline void hl_maxpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride,
+                               real* MaskData) {}
+
+inline void hl_maxpool_backward(const int frameCnt,
+                                const real* inputData,
+                                const real* outData,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                const int paddingH,
+                                const int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* targetGrad,
+                                const int outStride) {}
+
+inline void hl_avgpool_forward(const int frameCnt,
+                               const real* inputData,
+                               const int channels,
+                               const int height,
+                               const int width,
+                               const int pooledH,
+                               const int pooledW,
+                               const int sizeX,
+                               const int sizeY,
+                               const int strideH,
+                               const int strideW,
+                               const int paddingH,
+                               const int paddingW,
+                               real* tgtData,
+                               const int tgtStride,
+                               const bool excludeMode) {}
+
+inline void hl_avgpool_backward(const int frameCnt,
+                                const real* outGrad,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int pooledH,
+                                const int pooledW,
+                                const int sizeX,
+                                const int sizeY,
+                                const int strideH,
+                                const int strideW,
+                                int paddingH,
+                                int paddingW,
+                                real scaleA,
+                                real scaleB,
+                                real* backGrad,
+                                const int outStride,
+                                const bool excludeMode) {}
+
+inline void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride) {}
+
+inline void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride) {}
+
+inline void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+
+inline void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride) {}
+
+inline void hl_bilinear_forward(const real* inData,
+                                const size_t inImgH,
+                                const size_t inImgW,
+                                const size_t inputH,
+                                const size_t inputW,
+                                real* outData,
+                                const size_t outImgH,
+                                const size_t outImgW,
+                                const size_t outputH,
+                                const size_t outputW,
+                                const size_t numChannels,
+                                const real ratioH,
+                                const real ratioW) {}
+
+inline void hl_bilinear_backward(real* inGrad,
+                                 const size_t inImgH,
+                                 const size_t inImgW,
+                                 const size_t inputH,
+                                 const size_t inputW,
+                                 const real* outGrad,
+                                 const size_t outImgH,
+                                 const size_t outImgW,
+                                 const size_t outputH,
+                                 const size_t outputW,
+                                 const size_t numChannels,
+                                 const real ratioH,
+                                 const real ratioW) {}
+
+inline void hl_maxout_forward(const real* inData,
+                              real* outData,
+                              int* idData,
+                              size_t batchSize,
+                              size_t size,
+                              size_t featLen,
+                              size_t group) {}
+
+inline void hl_maxout_backward(real* inGrad,
+                               const real* outGrad,
+                               const int* idData,
+                               size_t batchSize,
+                               size_t size,
+                               size_t featLen,
+                               size_t group) {}
+
+inline void hl_upsample_forward(real* inputData,
+                                real* maskData,
+                                size_t batchSize,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t outputH,
+                                size_t outputW,
+                                real* outputData) {}
+
+inline void hl_upsample_backward(real* outputGradData,
+                                 real* maskData,
+                                 size_t batchSize,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 real* inputGradData) {}
+
+#endif  // HL_CNN_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h
similarity index 100%
rename from paddle/cuda/include/stub/hl_cuda_cublas_stub.h
rename to paddle/legacy/cuda/include/stub/hl_cuda_cublas_stub.h
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h
similarity index 100%
rename from paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
rename to paddle/legacy/cuda/include/stub/hl_cuda_cudnn_stub.h
diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/legacy/cuda/include/stub/hl_cuda_stub.h
similarity index 100%
rename from paddle/cuda/include/stub/hl_cuda_stub.h
rename to paddle/legacy/cuda/include/stub/hl_cuda_stub.h
diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/legacy/cuda/include/stub/hl_lstm_stub.h
similarity index 100%
rename from paddle/cuda/include/stub/hl_lstm_stub.h
rename to paddle/legacy/cuda/include/stub/hl_lstm_stub.h
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/legacy/cuda/include/stub/hl_matrix_stub.h
similarity index 100%
rename from paddle/cuda/include/stub/hl_matrix_stub.h
rename to paddle/legacy/cuda/include/stub/hl_matrix_stub.h
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/legacy/cuda/include/stub/hl_sequence_stub.h
similarity index 100%
rename from paddle/cuda/include/stub/hl_sequence_stub.h
rename to paddle/legacy/cuda/include/stub/hl_sequence_stub.h
diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/legacy/cuda/include/stub/hl_sparse_stub.h
similarity index 100%
rename from paddle/cuda/include/stub/hl_sparse_stub.h
rename to paddle/legacy/cuda/include/stub/hl_sparse_stub.h
diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/legacy/cuda/src/avx_mathfun.h
similarity index 100%
rename from paddle/cuda/src/avx_mathfun.h
rename to paddle/legacy/cuda/src/avx_mathfun.h
diff --git a/paddle/cuda/src/hl_avx_functions.cc b/paddle/legacy/cuda/src/hl_avx_functions.cc
similarity index 100%
rename from paddle/cuda/src/hl_avx_functions.cc
rename to paddle/legacy/cuda/src/hl_avx_functions.cc
diff --git a/paddle/cuda/src/hl_batch_norm.cu b/paddle/legacy/cuda/src/hl_batch_norm.cu
similarity index 100%
rename from paddle/cuda/src/hl_batch_norm.cu
rename to paddle/legacy/cuda/src/hl_batch_norm.cu
diff --git a/paddle/cuda/src/hl_batch_transpose.cu b/paddle/legacy/cuda/src/hl_batch_transpose.cu
similarity index 100%
rename from paddle/cuda/src/hl_batch_transpose.cu
rename to paddle/legacy/cuda/src/hl_batch_transpose.cu
diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/legacy/cuda/src/hl_cpu_functions.cc
similarity index 100%
rename from paddle/cuda/src/hl_cpu_functions.cc
rename to paddle/legacy/cuda/src/hl_cpu_functions.cc
diff --git a/paddle/legacy/cuda/src/hl_cuda_aggregate.cu b/paddle/legacy/cuda/src/hl_cuda_aggregate.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9831c5ecc340135c27b49d24715c63f8a8dfa8e9
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_cuda_aggregate.cu
@@ -0,0 +1,293 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_aggregate.h"
+#include "hl_base.h"
+#include "hl_cuda.h"
+#include "hl_cuda.ph"
+#include "hl_matrix_base.cuh"
+#include "hl_thread.ph"
+#include "paddle/legacy/utils/Logging.h"
+
+/**
+ * @brief   matrix row operator.
+ */
+template <class Agg, int blockSize>
+__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
+  __shared__ real sum_s[blockSize];
+  int cnt = (dimN + blockSize - 1) / blockSize;
+  int rowId = blockIdx.x + blockIdx.y * gridDim.x;
+  int index = rowId * dimN;
+  int tid = threadIdx.x;
+  int lmt = tid;
+
+  real tmp = agg.init();
+  for (int ii = 0; ii < cnt && lmt < dimN; ii++) {
+    tmp = agg(tmp, E[index + lmt]);
+    lmt += blockSize;
+  }
+  sum_s[tid] = tmp;
+  __syncthreads();
+
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
+    if (tid < stride) {
+      sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+
+  if (tid == 0) {
+    Sum[rowId] = sum_s[0];
+  }
+}
+
+template <class Agg>
+void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
+  int blocksX = dimM;
+  int blocksY = 1;
+  dim3 threads(128, 1);
+  dim3 grid(blocksX, blocksY);
+
+  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      agg, A_d, C_d, dimN);
+}
+
+void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
+  CHECK_SYNC("hl_matrix_row_sum failed");
+}
+
+void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
+  CHECK_SYNC("hl_matrix_row_max failed");
+}
+
+void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
+  CHECK_SYNC("hl_matrix_row_min failed");
+}
+
+/**
+ * @brief   matrix column operator.
+ */
+template <class Agg>
+__global__ void KeMatrixColumnOp(
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  real tmp = agg.init();
+  if (rowIdx < dimN) {
+    for (int index = 0; index < dimM; index++) {
+      tmp = agg(tmp, E[dimN * index + rowIdx]);
+    }
+    Sum[rowIdx] = tmp;
+  }
+}
+
+template <class Agg, int blockDimX, int blockDimY>
+__global__ void KeMatrixColumnOp_S(
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
+  __shared__ real _sum[blockDimX * blockDimY];
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int index = threadIdx.y;
+
+  real tmp = agg.init();
+  if (rowIdx < dimN) {
+    for (; index < dimM;) {
+      tmp = agg(tmp, E[dimN * index + rowIdx]);
+      index += blockDimY;
+    }
+  }
+  _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
+  __syncthreads();
+
+  if (rowIdx < dimN) {
+    if (threadIdx.y == 0) {
+      real tmp = agg.init();
+      for (int i = 0; i < blockDimY; i++) {
+        tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
+      }
+      Sum[rowIdx] = tmp;
+    }
+  }
+}
+
+template <class Agg>
+void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
+  if (dimN >= 8192) {
+    int blocksX = (dimN + 128 - 1) / 128;
+    int blocksY = 1;
+    dim3 threads(128, 1);
+    dim3 grid(blocksX, blocksY);
+    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
+  } else {
+    int blocksX = (dimN + 32 - 1) / 32;
+    int blocksY = 1;
+    dim3 threads(32, 32);
+    dim3 grid(blocksX, blocksY);
+    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
+  }
+
+  return;
+}
+
+void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
+
+  CHECK_SYNC("hl_matrix_column_sum failed");
+}
+
+void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
+
+  CHECK_SYNC("hl_matrix_column_max failed");
+}
+
+void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
+
+  CHECK_SYNC("hl_matrix_column_min failed");
+}
+
+template <int blockSize>
+__global__ void KeVectorSum(real *E, real *Sum, int dimM) {
+  __shared__ double sum_s[blockSize];
+  int tid = threadIdx.x;
+  int index = blockIdx.y * blockDim.x + threadIdx.x;
+
+  sum_s[tid] = 0.0f;
+  while (index < dimM) {
+    sum_s[tid] += E[index];
+    index += blockDim.x * gridDim.y;
+  }
+  __syncthreads();
+
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
+    if (tid < stride) {
+      sum_s[tid] += sum_s[tid + stride];
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+
+  if (tid == 0) {
+    Sum[blockIdx.y] = sum_s[0];
+  }
+}
+
+void hl_vector_sum(real *A_d, real *C_h, int dimM) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_h);
+
+  int blockSize = 128;
+  int gridSize = 128;
+  int blocksX = 1;
+  int blocksY = gridSize;
+  dim3 threads(blockSize, 1);
+  dim3 grid(blocksX, blocksY);
+
+  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
+  hl_event_t hl_event = &hl_event_st;
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
+
+  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);
+
+  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
+  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
+
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  cudaError_t err = (cudaError_t)hl_get_device_last_error();
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
+                             << hl_get_device_error_string((size_t)err);
+}
+
+template <int blockSize>
+__global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
+  __shared__ double sum_s[blockSize];
+  int tid = threadIdx.x;
+  int index = blockIdx.y * blockDim.x + threadIdx.x;
+
+  sum_s[tid] = 0.0f;
+  while (index < dimM) {
+    sum_s[tid] += abs(E[index]);
+    index += blockDim.x * gridDim.y;
+  }
+  __syncthreads();
+
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
+    if (tid < stride) {
+      sum_s[tid] += sum_s[tid + stride];
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+
+  if (tid == 0) {
+    Sum[blockIdx.y] = sum_s[0];
+  }
+}
+
+void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_h);
+
+  int blockSize = 128;
+  int gridSize = 128;
+  int blocksX = 1;
+  int blocksY = gridSize;
+  dim3 threads(blockSize, 1);
+  dim3 grid(blocksX, blocksY);
+
+  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
+  hl_event_t hl_event = &hl_event_st;
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
+
+  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);
+
+  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
+  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
+
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  cudaError_t err = (cudaError_t)hl_get_device_last_error();
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
+                             << hl_get_device_error_string((size_t)err);
+}
diff --git a/paddle/legacy/cuda/src/hl_cuda_cnn.cu b/paddle/legacy/cuda/src/hl_cuda_cnn.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bac743a293cc97b114281e510d06367a86536452
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_cuda_cnn.cu
@@ -0,0 +1,1106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <float.h>
+#include "hl_base.h"
+#include "hl_cnn.h"
+#include "hl_device_functions.cuh"
+
+__global__ void KeMaxPoolForward(const int nthreads,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int height,
+                                 const int width,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int ksizeW,
+                                 const int ksizeH,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int offsetH,
+                                 const int offsetW,
+                                 real* tgtData,
+                                 const int tgtStride,
+                                 real* maskData) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int c = (index / pooledW / pooledH) % channels;
+    int frameNum = index / pooledW / pooledH / channels;
+    int hstart = ph * strideH - offsetH;
+    int wstart = pw * strideW - offsetW;
+    int hend = min(hstart + ksizeH, height);
+    int wend = min(wstart + ksizeW, width);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    real maxval = -FLT_MAX;
+    int max_index = -1;
+    inputData += (frameNum * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        if (maxval < inputData[h * width + w]) {
+          max_index = h * width + w;
+          maxval = inputData[max_index];
+        }
+      }
+    }
+    int tgtIndex =
+        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = maxval;
+    if (maskData != NULL) {
+      maskData[tgtIndex] = max_index;
+    }
+  }
+}
+
+void hl_maxpool_forward(const int frameCnt,
+                        const real* inputData,
+                        const int channels,
+                        const int height,
+                        const int width,
+                        const int pooledH,
+                        const int pooledW,
+                        const int sizeX,
+                        const int sizeY,
+                        const int strideH,
+                        const int strideW,
+                        const int paddingH,
+                        const int paddingW,
+                        real* tgtData,
+                        const int tgtStride,
+                        real* maskData) {
+  int num_kernels = pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KeMaxPoolForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                         inputData,
+                                                         channels,
+                                                         height,
+                                                         width,
+                                                         pooledH,
+                                                         pooledW,
+                                                         sizeX,
+                                                         sizeY,
+                                                         strideH,
+                                                         strideW,
+                                                         paddingH,
+                                                         paddingW,
+                                                         tgtData,
+                                                         tgtStride,
+                                                         maskData);
+  CHECK_SYNC("hl_maxpool_forward failed");
+}
+
+__global__ void KeMaxPoolBackward(const int nthreads,
+                                  const real* inputData,
+                                  const real* outData,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int height,
+                                  const int width,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeX,
+                                  const int sizeY,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int padH,
+                                  const int padW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  const int outStride) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    // find out the local index
+    // find out the local offset
+    int offsetW = index % width + padW;
+    int offsetH = (index / width) % height + padH;
+    int offsetC = (index / width / height) % channels;
+
+    int frameNum = index / width / height / channels;
+    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
+    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
+    int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0;
+    int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
+    real gradient = 0;
+    real input = inputData[index];
+    outData += (frameNum * outStride + offsetC * pooledH * pooledW);
+    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        if (input == outData[ph * pooledW + pw]) {
+          gradient += outGrad[ph * pooledW + pw];
+        }
+      }
+    }
+    targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient;
+  }
+}
+
+void hl_maxpool_backward(const int frameCnt,
+                         const real* inputData,
+                         const real* outData,
+                         const real* outGrad,
+                         const int channels,
+                         const int height,
+                         const int width,
+                         const int pooledH,
+                         const int pooledW,
+                         const int sizeX,
+                         const int sizeY,
+                         const int strideH,
+                         const int strideW,
+                         const int paddingH,
+                         const int paddingW,
+                         real scaleA,
+                         real scaleB,
+                         real* targetGrad,
+                         const int outStride) {
+  int num_kernels = height * width * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+
+  KeMaxPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                         inputData,
+                                                         outData,
+                                                         outGrad,
+                                                         channels,
+                                                         height,
+                                                         width,
+                                                         pooledH,
+                                                         pooledW,
+                                                         sizeX,
+                                                         sizeY,
+                                                         strideH,
+                                                         strideW,
+                                                         paddingH,
+                                                         paddingW,
+                                                         scaleA,
+                                                         scaleB,
+                                                         targetGrad,
+                                                         outStride);
+  CHECK_SYNC("hl_maxpool_backward");
+}
+
+__global__ void KeAvgPoolForward(const int nthreads,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int height,
+                                 const int width,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeX,
+                                 const int sizeY,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int padH,
+                                 const int padW,
+                                 real* tgtData,
+                                 const int tgtStride,
+                                 const bool excludeMode) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int c = (index / pooledW / pooledH) % channels;
+    int frameNum = index / pooledW / pooledH / channels;
+
+    int hstart = ph * strideH - padH;
+    int wstart = pw * strideW - padW;
+    int hend = min(hstart + sizeY, height);
+    int wend = min(wstart + sizeX, width);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    int poolSize =
+        excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+
+    real aveval = 0;
+    inputData += (frameNum * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        aveval += inputData[h * width + w];
+      }
+    }
+    int tgtIndex =
+        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = aveval / poolSize;
+  }
+}
+
+void hl_avgpool_forward(const int frameCnt,
+                        const real* inputData,
+                        const int channels,
+                        const int height,
+                        const int width,
+                        const int pooledH,
+                        const int pooledW,
+                        const int sizeX,
+                        const int sizeY,
+                        const int strideH,
+                        const int strideW,
+                        const int paddingH,
+                        const int paddingW,
+                        real* tgtData,
+                        const int tgtStride,
+                        const bool excludeMode) {
+  int num_kernels = pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                        inputData,
+                                                        channels,
+                                                        height,
+                                                        width,
+                                                        pooledH,
+                                                        pooledW,
+                                                        sizeX,
+                                                        sizeY,
+                                                        strideH,
+                                                        strideW,
+                                                        paddingH,
+                                                        paddingW,
+                                                        tgtData,
+                                                        tgtStride,
+                                                        excludeMode);
+  CHECK_SYNC("hl_avgpool_forward failed");
+}
+
+__global__ void KeAvgPoolBackward(const int nthreads,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int height,
+                                  const int width,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeX,
+                                  const int sizeY,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int padH,
+                                  const int padW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* tgtGrad,
+                                  const int outStride,
+                                  const bool excludeMode) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    int offsetW = index % width + padW;
+    int offsetH = (index / width) % height + padH;
+    int offsetC = (index / width / height) % channels;
+    int frameNum = index / width / height / channels;
+
+    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
+    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
+    int phend = offsetH >= 0 ? min(offsetH / strideH + 1, pooledH) : 0;
+    int pwend = offsetW >= 0 ? min(offsetW / strideW + 1, pooledW) : 0;
+    real gradient = 0;
+    outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
+
+    for (int ph = phstart; ph < phend; ++ph) {
+      int hstart = ph * strideH - padH;
+      int hend = min(hstart + sizeY, height);
+      hstart = max(hstart, 0);
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        // figure out the pooling size
+        int wstart = pw * strideW - padW;
+        int wend = min(wstart + sizeX, width);
+        wstart = max(wstart, 0);
+        int poolSize =
+            excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+        gradient += outGrad[ph * pooledW + pw] / poolSize;
+      }
+    }
+    tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
+  }
+}
+
+void hl_avgpool_backward(const int frameCnt,
+                         const real* outGrad,
+                         const int channels,
+                         const int height,
+                         const int width,
+                         const int pooledH,
+                         const int pooledW,
+                         const int sizeX,
+                         const int sizeY,
+                         const int strideH,
+                         const int strideW,
+                         const int paddingH,
+                         const int paddingW,
+                         real scaleA,
+                         real scaleB,
+                         real* backGrad,
+                         const int outStride,
+                         const bool excludeMode) {
+  int num_kernels = height * width * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+
+  KeAvgPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                         outGrad,
+                                                         channels,
+                                                         height,
+                                                         width,
+                                                         pooledH,
+                                                         pooledW,
+                                                         sizeX,
+                                                         sizeY,
+                                                         strideH,
+                                                         strideW,
+                                                         paddingH,
+                                                         paddingW,
+                                                         scaleA,
+                                                         scaleB,
+                                                         backGrad,
+                                                         outStride,
+                                                         excludeMode);
+  CHECK_SYNC("hl_avgpool_backward failed");
+}
+
+__global__ void KeMaxPool3DForward(const int nthreads,
+                                   const real* inputData,
+                                   const int channels,
+                                   const int depth,
+                                   const int height,
+                                   const int width,
+                                   const int pooledD,
+                                   const int pooledH,
+                                   const int pooledW,
+                                   const int ksizeD,
+                                   const int ksizeH,
+                                   const int ksizeW,
+                                   const int strideD,
+                                   const int strideH,
+                                   const int strideW,
+                                   const int padD,
+                                   const int padH,
+                                   const int padW,
+                                   real* tgtData,
+                                   real* maxPoolIdxData,
+                                   const int tgtStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int pd = (index / pooledW / pooledH) % pooledD;
+    int c = (index / pooledW / pooledH / pooledD) % channels;
+    int frameNum = index / pooledW / pooledH / pooledD / channels;
+    int dstart = pd * strideD - padD;
+    int hstart = ph * strideH - padH;
+    int wstart = pw * strideW - padW;
+    int dend = min(dstart + ksizeD, depth);
+    int hend = min(hstart + ksizeH, height);
+    int wend = min(wstart + ksizeW, width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    real maxval = -FLT_MAX;
+    int maxIdx = -1;
+    inputData += (frameNum * channels + c) * depth * height * width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          if (maxval < inputData[(d * height + h) * width + w]) {
+            maxval = inputData[(d * height + h) * width + w];
+            maxIdx = (d * height + h) * width + w;
+          }
+        }
+      }
+    }
+    int tgtIndex =
+        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = maxval;
+    maxPoolIdxData[tgtIndex] = maxIdx;
+  }
+}
+
+void hl_maxpool3D_forward(const int frameCnt,
+                          const real* inputData,
+                          const int channels,
+                          const int depth,
+                          const int height,
+                          const int width,
+                          const int pooledD,
+                          const int pooledH,
+                          const int pooledW,
+                          const int sizeZ,
+                          const int sizeY,
+                          const int sizeX,
+                          const int strideD,
+                          const int strideH,
+                          const int strideW,
+                          const int padD,
+                          const int padH,
+                          const int padW,
+                          real* tgtData,
+                          real* maxPoolIdxData,
+                          const int tgtStride) {
+  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KeMaxPool3DForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           inputData,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           pooledD,
+                                                           pooledH,
+                                                           pooledW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           padD,
+                                                           padH,
+                                                           padW,
+                                                           tgtData,
+                                                           maxPoolIdxData,
+                                                           tgtStride);
+  CHECK_SYNC("hl_maxpool3D_forward failed");
+}
+
+__global__ void KeMaxPool3DBackward(const int nthreads,
+                                    const real* outGrad,
+                                    const int channels,
+                                    const int depth,
+                                    const int height,
+                                    const int width,
+                                    const int pooledD,
+                                    const int pooledH,
+                                    const int pooledW,
+                                    const int sizeZ,
+                                    const int sizeY,
+                                    const int sizeX,
+                                    const int strideD,
+                                    const int strideH,
+                                    const int strideW,
+                                    const int padD,
+                                    const int padH,
+                                    const int padW,
+                                    real scaleA,
+                                    real scaleB,
+                                    real* targetGrad,
+                                    real* maxPoolIdxData,
+                                    const int outStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % width;
+    int offsetH = (index / width) % height;
+    int offsetD = (index / width / height) % depth;
+    int offsetC = (index / width / height / depth) % channels;
+    int frameNum = index / width / height / depth / channels;
+
+    int pdstart =
+        (offsetD + padD < sizeZ) ? 0 : (offsetD + padD - sizeZ) / strideD + 1;
+    int phstart =
+        (offsetH + padH < sizeY) ? 0 : (offsetH + padH - sizeY) / strideH + 1;
+    int pwstart =
+        (offsetW + padW < sizeX) ? 0 : (offsetW + padW - sizeX) / strideW + 1;
+    int pdend = min((offsetD + padD) / strideD + 1, pooledD);
+    int phend = min((offsetH + padH) / strideH + 1, pooledH);
+    int pwend = min((offsetW + padW) / strideW + 1, pooledW);
+
+    real gradient = 0;
+    outGrad += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
+    maxPoolIdxData +=
+        ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (((offsetD * height + offsetH) * width + offsetW) ==
+              maxPoolIdxData[(pd * pooledH + ph) * pooledW + pw])
+            gradient += outGrad[(pd * pooledH + ph) * pooledW + pw];
+        }
+      }
+    }
+    targetGrad[index] = scaleA * gradient + scaleB * targetGrad[index];
+  }
+}
+
+void hl_maxpool3D_backward(const int frameCnt,
+                           const real* outGrad,
+                           const int channels,
+                           const int depth,
+                           const int height,
+                           const int width,
+                           const int outputD,
+                           const int outputH,
+                           const int outputW,
+                           const int sizeZ,
+                           const int sizeY,
+                           const int sizeX,
+                           const int strideD,
+                           const int strideH,
+                           const int strideW,
+                           const int paddingD,
+                           const int paddingH,
+                           const int paddingW,
+                           real scaleA,
+                           real scaleB,
+                           real* targetGrad,
+                           real* maxPoolIdxData,
+                           const int outStride) {
+  int num_kernels = depth * height * width * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+
+  KeMaxPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           outGrad,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           outputD,
+                                                           outputH,
+                                                           outputW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           paddingD,
+                                                           paddingH,
+                                                           paddingW,
+                                                           scaleA,
+                                                           scaleB,
+                                                           targetGrad,
+                                                           maxPoolIdxData,
+                                                           outStride);
+  CHECK_SYNC("hl_maxpool3D_backward");
+}
+
+__global__ void KeAvgPool3DForward(const int nthreads,
+                                   const real* inputData,
+                                   const int channels,
+                                   const int depth,
+                                   const int height,
+                                   const int width,
+                                   const int pooledD,
+                                   const int pooledH,
+                                   const int pooledW,
+                                   const int sizeZ,
+                                   const int sizeY,
+                                   const int sizeX,
+                                   const int strideD,
+                                   const int strideH,
+                                   const int strideW,
+                                   const int padD,
+                                   const int padH,
+                                   const int padW,
+                                   real* tgtData,
+                                   const int tgtStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int pd = (index / pooledW / pooledH) % pooledD;
+    int c = (index / pooledW / pooledH / pooledD) % channels;
+    int frameNum = index / pooledW / pooledH / pooledD / channels;
+    int dstart = pd * strideD - padD;
+    int hstart = ph * strideH - padH;
+    int wstart = pw * strideW - padW;
+    int dend = min(dstart + sizeZ, depth);
+    int hend = min(hstart + sizeY, height);
+    int wend = min(wstart + sizeX, width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+
+    real aveval = 0;
+    inputData += (frameNum * channels + c) * depth * height * width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          aveval += inputData[(d * height + h) * width + w];
+        }
+      }
+    }
+    int tgtIndex =
+        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = aveval / pool_size;
+  }
+}
+
+void hl_avgpool3D_forward(const int frameCnt,
+                          const real* inputData,
+                          const int channels,
+                          const int depth,
+                          const int height,
+                          const int width,
+                          const int pooledD,
+                          const int pooledH,
+                          const int pooledW,
+                          const int sizeZ,
+                          const int sizeY,
+                          const int sizeX,
+                          const int strideD,
+                          const int strideH,
+                          const int strideW,
+                          const int paddingD,
+                          const int paddingH,
+                          const int paddingW,
+                          real* tgtData,
+                          const int tgtStride) {
+  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  KeAvgPool3DForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          inputData,
+                                                          channels,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          pooledD,
+                                                          pooledH,
+                                                          pooledW,
+                                                          sizeZ,
+                                                          sizeY,
+                                                          sizeX,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          tgtData,
+                                                          tgtStride);
+  CHECK_SYNC("hl_avgpool3D_forward failed");
+}
+
+__global__ void KeAvgPool3DBackward(const int nthreads,
+                                    const real* outGrad,
+                                    const int channels,
+                                    const int depth,
+                                    const int height,
+                                    const int width,
+                                    const int pooledD,
+                                    const int pooledH,
+                                    const int pooledW,
+                                    const int sizeZ,
+                                    const int sizeY,
+                                    const int sizeX,
+                                    const int strideD,
+                                    const int strideH,
+                                    const int strideW,
+                                    const int padD,
+                                    const int padH,
+                                    const int padW,
+                                    real scaleA,
+                                    real scaleB,
+                                    real* tgtGrad,
+                                    const int outStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % width + padW;
+    int offsetH = (index / width) % height + padH;
+    int offsetD = (index / width / height) % depth + padD;
+    int offsetC = (index / width / height / depth) % channels;
+    int frameNum = index / width / height / depth / channels;
+
+    int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1;
+    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
+    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
+    int pdend = min(offsetD / strideD + 1, pooledD);
+    int phend = min(offsetH / strideH + 1, pooledH);
+    int pwend = min(offsetW / strideW + 1, pooledW);
+
+    real gradient = 0;
+    outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
+
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      int dstart = pd * strideD - padD;
+      int dend = min(dstart + sizeZ, depth);
+      dstart = max(dstart, 0);
+      for (int ph = phstart; ph < phend; ++ph) {
+        int hstart = ph * strideH - padH;
+        int hend = min(hstart + sizeY, height);
+        hstart = max(hstart, 0);
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int wstart = pw * strideW - padW;
+          int wend = min(wstart + sizeX, width);
+          wstart = max(wstart, 0);
+          int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
+        }
+      }
+    }
+    tgtGrad[index] = scaleA * gradient + scaleB * tgtGrad[index];
+  }
+}
+
+void hl_avgpool3D_backward(const int frameCnt,
+                           const real* outGrad,
+                           const int channels,
+                           const int depth,
+                           const int height,
+                           const int width,
+                           const int outputD,
+                           const int outputH,
+                           const int outputW,
+                           const int sizeZ,
+                           const int sizeY,
+                           const int sizeX,
+                           const int strideD,
+                           const int strideH,
+                           const int strideW,
+                           int paddingD,
+                           int paddingH,
+                           int paddingW,
+                           real scaleA,
+                           real scaleB,
+                           real* backGrad,
+                           const int outStride) {
+  int num_kernels = depth * height * width * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+
+  KeAvgPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           outGrad,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           outputD,
+                                                           outputH,
+                                                           outputW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           paddingD,
+                                                           paddingH,
+                                                           paddingW,
+                                                           scaleA,
+                                                           scaleB,
+                                                           backGrad,
+                                                           outStride);
+  CHECK_SYNC("hl_avgpool3D_backward failed");
+}
+
+__global__ void KeBilinearInterpFw(const real* in,
+                                   const size_t inImgH,
+                                   const size_t inImgW,
+                                   const size_t inputH,
+                                   const size_t inputW,
+                                   real* out,
+                                   const size_t outImgH,
+                                   const size_t outImgW,
+                                   const size_t outputH,
+                                   const size_t outputW,
+                                   const size_t numChannels,
+                                   const real ratioH,
+                                   const real ratioW) {
+  int nthreads = outputH * outputW;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < nthreads) {
+    int outIdH = tid / outputW;
+    int outIdW = tid % outputW;
+    int inImgSize = inputW / numChannels;
+    int outImgSize = outputW / numChannels;
+    int channelId = outIdW / outImgSize;
+
+    int outImgIdy = (outIdW % outImgSize) / outImgW;
+    int inImgIdy = ratioH * outImgIdy;
+    int hId = (inImgIdy < inImgH - 1) ? 1 : 0;
+    real h1lambda = ratioH * outImgIdy - inImgIdy;
+    real h2lambda = 1.f - h1lambda;
+
+    int outImgIdx = tid % outImgW;
+    int inImgIdx = ratioW * outImgIdx;
+    int wId = (inImgIdx < inImgW - 1) ? 1 : 0;
+    real w1lambda = ratioW * outImgIdx - inImgIdx;
+    real w2lambda = 1.f - w1lambda;
+
+    const real* inPos = &in[outIdH * inputW + channelId * inImgSize +
+                            inImgIdy * inImgW + inImgIdx];
+
+    // bilinear interpolation
+    out[outIdH * outputW + outIdW] =
+        h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) +
+        h1lambda * (w2lambda * inPos[hId * inImgW] +
+                    w1lambda * inPos[hId * inImgW + wId]);
+  }
+}
+
+void hl_bilinear_forward(const real* inData,
+                         const size_t inImgH,
+                         const size_t inImgW,
+                         const size_t inputH,
+                         const size_t inputW,
+                         real* outData,
+                         const size_t outImgH,
+                         const size_t outImgW,
+                         const size_t outputH,
+                         const size_t outputW,
+                         const size_t numChannels,
+                         const real ratioH,
+                         const real ratioW) {
+  int threadNum = outputH * outputW;
+  int blocks = (threadNum + 1024 - 1) / 1024;
+
+  KeBilinearInterpFw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inData,
+                                                          inImgH,
+                                                          inImgW,
+                                                          inputH,
+                                                          inputW,
+                                                          outData,
+                                                          outImgH,
+                                                          outImgW,
+                                                          outputH,
+                                                          outputW,
+                                                          numChannels,
+                                                          ratioH,
+                                                          ratioW);
+  CHECK_SYNC("hl_bilinear_forward failed");
+}
+
+__global__ void KeBilinearInterpBw(real* in,
+                                   const size_t inImgH,
+                                   const size_t inImgW,
+                                   const size_t inputH,
+                                   const size_t inputW,
+                                   const real* out,
+                                   const size_t outImgH,
+                                   const size_t outImgW,
+                                   const size_t outputH,
+                                   const size_t outputW,
+                                   const size_t numChannels,
+                                   const real ratioH,
+                                   const real ratioW) {
+  int nthreads = outputH * outputW;
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < nthreads) {
+    int outIdH = tid / outputW;
+    int outIdW = tid % outputW;
+    int inImgSize = inputW / numChannels;
+    int outImgSize = outputW / numChannels;
+    int channelId = outIdW / outImgSize;
+
+    int outImgIdy = (outIdW % outImgSize) / outImgW;
+    int inImgIdy = ratioH * outImgIdy;
+    int hId = (inImgIdy < inImgH - 1) ? 1 : 0;
+    real h1lambda = ratioH * outImgIdy - inImgIdy;
+    real h2lambda = 1.f - h1lambda;
+
+    int outImgIdx = tid % outImgW;
+    int inImgIdx = ratioW * outImgIdx;
+    int wId = (inImgIdx < inImgW - 1) ? 1 : 0;
+    real w1lambda = ratioW * outImgIdx - inImgIdx;
+    real w2lambda = 1.f - w1lambda;
+
+    real* inPos = &in[outIdH * inputW + channelId * inImgSize +
+                      inImgIdy * inImgW + inImgIdx];
+    const real* outPos = &out[outIdH * outputW + outIdW];
+    paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
+    paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
+    paddle::paddleAtomicAdd(&inPos[hId * inImgW],
+                            h1lambda * w2lambda * outPos[0]);
+    paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId],
+                            h1lambda * w1lambda * outPos[0]);
+  }
+}
+
+void hl_bilinear_backward(real* inGrad,
+                          const size_t inImgH,
+                          const size_t inImgW,
+                          const size_t inputH,
+                          const size_t inputW,
+                          const real* outGrad,
+                          const size_t outImgH,
+                          const size_t outImgW,
+                          const size_t outputH,
+                          const size_t outputW,
+                          const size_t numChannels,
+                          const real ratioH,
+                          const real ratioW) {
+  int threadNum = outputH * outputW;
+  int blocks = (threadNum + 1024 - 1) / 1024;
+
+  KeBilinearInterpBw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inGrad,
+                                                          inImgH,
+                                                          inImgW,
+                                                          inputH,
+                                                          inputW,
+                                                          outGrad,
+                                                          outImgH,
+                                                          outImgW,
+                                                          outputH,
+                                                          outputW,
+                                                          numChannels,
+                                                          ratioH,
+                                                          ratioW);
+  CHECK_SYNC("hl_bilinear_backward failed");
+}
+
+__global__ void maxoutFpCompute(size_t nthreads,
+                                const real* inData,
+                                real* outData,
+                                int* idData,
+                                size_t size,
+                                size_t featLen,
+                                size_t groups) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    size_t batch_idx = index / size;
+    size_t i = index % size;
+    size_t channel_idx = i / featLen;
+    size_t feat_idx = i % featLen;
+    size_t data_idx =
+        (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
+    real max = inData[data_idx];
+    int maxId = 0;
+    for (size_t g = 1; g < groups; ++g) {
+      real tmp = inData[data_idx + g * featLen];
+      if (tmp > max) {
+        max = tmp;
+        maxId = g;
+      }
+    }
+    outData[index] = max;
+    idData[index] = maxId;
+  }
+}
+
+void hl_maxout_forward(const real* inData,
+                       real* outData,
+                       int* idData,
+                       size_t batchSize,
+                       size_t size,
+                       size_t featLen,
+                       size_t groups) {
+  int num_kernels = size * batchSize;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  maxoutFpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
+      num_kernels, inData, outData, idData, size, featLen, groups);
+  CHECK_SYNC("hl_maxout_forward failed");
+}
+
+__global__ void maxoutBpCompute(size_t nthreads,
+                                real* inGrad,
+                                const real* outGrad,
+                                const int* idData,
+                                size_t size,
+                                size_t featLen,
+                                size_t groups) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    size_t batch_idx = index / size;
+    size_t i = index % size;
+    size_t channel_idx = i / featLen;
+    size_t feat_idx = i % featLen;
+    size_t newIndex = batch_idx * size;
+    size_t gradIdx =
+        (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
+    (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
+  }
+}
+
+void hl_maxout_backward(real* inGrad,
+                        const real* outGrad,
+                        const int* idData,
+                        size_t batchSize,
+                        size_t size,
+                        size_t featLen,
+                        size_t groups) {
+  int num_kernels = size * batchSize;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  maxoutBpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
+      num_kernels, inGrad, outGrad, idData, size, featLen, groups);
+  CHECK_SYNC("hl_maxout_backward failed");
+}
+
+__global__ void upsampleForwardCompute(real* input_data,
+                                       real* mask_data,
+                                       size_t nthreads,
+                                       size_t in_h,
+                                       size_t in_w,
+                                       size_t out_h,
+                                       size_t out_w,
+                                       real* output_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    int offset = index / (in_w * in_h) * out_h * out_w;
+    int upsample_idx = static_cast<int>(mask_data[index]);
+    output_data[offset + upsample_idx] = input_data[index];
+  }
+}
+
+__global__ void upsampleBackwardCompute(real* out_grad,
+                                        real* mask_data,
+                                        size_t nthreads,
+                                        size_t in_h,
+                                        size_t in_w,
+                                        size_t out_h,
+                                        size_t out_w,
+                                        real* input_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    int offset = index / (in_w * in_h) * out_h * out_w;
+    int upsample_idx = static_cast<int>(mask_data[index]);
+    input_grad[index] = out_grad[offset + upsample_idx];
+  }
+}
+
+void hl_upsample_forward(real* inputData,
+                         real* maskData,
+                         size_t batchSize,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t channels,
+                         size_t outputH,
+                         size_t outputW,
+                         real* outputData) {
+  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  upsampleForwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inputData,
+                                                              maskData,
+                                                              num_kernels,
+                                                              imgSizeH,
+                                                              imgSizeW,
+                                                              outputH,
+                                                              outputW,
+                                                              outputData);
+  CHECK_SYNC("hl_upsample_forward failed");
+}
+
+void hl_upsample_backward(real* outputGradData,
+                          real* maskData,
+                          size_t batchSize,
+                          size_t imgSizeH,
+                          size_t imgSizeW,
+                          size_t channels,
+                          size_t outputH,
+                          size_t outputW,
+                          real* inputGradData) {
+  int num_kernels = batchSize * imgSizeH * imgSizeW * channels;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  upsampleBackwardCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(outputGradData,
+                                                               maskData,
+                                                               num_kernels,
+                                                               imgSizeH,
+                                                               imgSizeW,
+                                                               outputH,
+                                                               outputW,
+                                                               inputGradData);
+  CHECK_SYNC("hl_upsample_backward failed");
+}
diff --git a/paddle/legacy/cuda/src/hl_cuda_cublas.cc b/paddle/legacy/cuda/src/hl_cuda_cublas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..283b8b6e9c8e7b843a8d28b940c6ef53b77ef655
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_cuda_cublas.cc
@@ -0,0 +1,400 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_cuda_cublas.h"
+#include <sys/time.h>
+#include "hl_cuda.h"
+#include "hl_thread.ph"
+#include "paddle/legacy/utils/DynamicLoader.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace dynload {
+
+std::once_flag cublas_dso_flag;
+void *cublas_dso_handle = nullptr;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load cublas routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+#ifdef PADDLE_USE_DSO
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    cublasStatus_t operator()(Args... args) {                                  \
+      typedef cublasStatus_t (*cublasFunc)(Args...);                           \
+      std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \
+      void *p_##__name = dlsym(cublas_dso_handle, #__name);                    \
+      return reinterpret_cast<cublasFunc>(p_##__name)(args...);                \
+    }                                                                          \
+  } __name;  // struct DynLoad__##__name
+#else
+#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)      \
+  struct DynLoad__##__name {                  \
+    template <typename... Args>               \
+    cublasStatus_t operator()(Args... args) { \
+      return __name(args...);                 \
+    }                                         \
+  } __name;  // struct DynLoad__##__name
+#endif
+
+#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
+
+// include all needed cublas functions in HPPL
+// clang-format off
+#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
+  __macro(cublasSgemv)                    \
+  __macro(cublasDgemv)                    \
+  __macro(cublasSgemm)                    \
+  __macro(cublasDgemm)                    \
+  __macro(cublasSgeam)                    \
+  __macro(cublasDgeam)                    \
+
+DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate)
+DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy)
+DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream)
+DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode)
+DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
+DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
+CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
+
+#undef DYNAMIC_LOAD_CUBLAS_WRAP
+#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP
+#undef CUBLAS_BLAS_ROUTINE_EACH
+
+} /* namespace dynload */
+
+// clang-format on
+#ifndef PADDLE_TYPE_DOUBLE
+#define CUBLAS_GEAM dynload::cublasSgeam
+#define CUBLAS_GEMV dynload::cublasSgemv
+#define CUBLAS_GEMM dynload::cublasSgemm
+#define CUBLAS_GETRF dynload::cublasSgetrfBatched
+#define CUBLAS_GETRI dynload::cublasSgetriBatched
+#else
+#define CUBLAS_GEAM dynload::cublasDgeam
+#define CUBLAS_GEMV dynload::cublasDgemv
+#define CUBLAS_GEMM dynload::cublasDgemm
+#define CUBLAS_GETRF dynload::cublasDgetrfBatched
+#define CUBLAS_GETRI dynload::cublasDgetriBatched
+#endif
+
+const char *hl_cublas_get_error_string(cublasStatus_t status) {
+  switch (status) {
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "[cublas status]: not initialized";
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "[cublas status]: allocate failed";
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "[cublas status]: invalid value";
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "[cublas status]: arch mismatch";
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "[cublas status]: mapping error";
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "[cublas status]: execution failed";
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "[cublas status]: internal error";
+    case CUBLAS_STATUS_SUCCESS:
+      return "[cublas status]: success";
+    default:
+      return "[cublas status]: unknown error";
+  }
+}
+
+/**
+ * Check build-in cublas function using glog and it also
+ * support << operator for more details error info.
+ */
+cublasStatus_t g_cublasStat;
+#define CHECK_CUBLAS(cublas_func)               \
+  g_cublasStat = cublas_func;                   \
+  CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \
+      << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " "
+
+void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) {
+  CHECK_CUBLAS(dynload::cublasCreate(cublas_handle))
+      << "[cublas init] Cublas create handle faild!";
+
+  CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream))
+      << "[cublas init] Cublas set stream faild!";
+}
+
+void hl_matrix_transpose(
+    real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {
+  real alpha = 1.0;
+  real beta = 0.0;
+
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle,
+                           CUBLAS_OP_T,
+                           CUBLAS_OP_N,
+                           dimM,
+                           dimN,
+                           &alpha,
+                           A_d,
+                           lda,
+                           &beta,
+                           nullptr,
+                           dimM,
+                           C_d,
+                           ldc));
+  CHECK_SYNC("hl_matrix_transpose failed");
+}
+
+void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {
+  hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM);
+}
+
+void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
+  /* Solve Ax = I */
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  /* Step 1: Compute the LU decomposition of matrix A */
+  real **inout_h = &A_d;
+  real **inout_d = (real **)hl_malloc_device(sizeof(real *));
+  hl_memcpy(inout_d, inout_h, sizeof(real *));
+
+  int *pivot_d = (int *)hl_malloc_device(dimN * sizeof(int));
+  int *info_d = (int *)t_resource.gpu_mem;
+
+  /* Note: cublasSgetrfBatched is used to calculate a number of
+     small-sized matrices. There may be a better way to reconstruct
+     the API for better performance.
+   */
+  CHECK_CUBLAS(
+      CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1));
+
+  int info_h;
+  hl_memcpy(&info_h, info_d, sizeof(int));
+  if (info_h != 0) {
+    LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
+  }
+
+  /* Step 2: Compute the inverse of the matrix given its LU decomposition */
+  real **out_h = &C_d;
+  real **out_d = (real **)hl_malloc_device(sizeof(real *));
+  hl_memcpy(out_d, out_h, sizeof(real *));
+
+  CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
+                            dimN,
+                            (const real **)inout_d,
+                            lda,
+                            pivot_d,
+                            out_d,
+                            ldc,
+                            info_d,
+                            1));
+
+  hl_memcpy(&info_h, info_d, sizeof(int));
+  if (info_h != 0) {
+    LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
+  }
+
+  hl_free_mem_device(inout_d);
+  hl_free_mem_device(pivot_d);
+  hl_free_mem_device(out_d);
+
+  CHECK_SYNC("hl_matrix_inverse failed");
+}
+
+void hl_matrix_mul(real *A_d,
+                   hl_trans_op_t transa,
+                   real *B_d,
+                   hl_trans_op_t transb,
+                   real *C_d,
+                   int dimM,
+                   int dimN,
+                   int dimK,
+                   real alpha,
+                   real beta,
+                   int lda,
+                   int ldb,
+                   int ldc) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+  CHECK_NOTNULL(C_d);
+
+  if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) {
+    int m = (transa == HPPL_OP_N) ? dimM : dimK;
+    int n = (transa == HPPL_OP_N) ? dimK : dimM;
+    hl_matrix_mul_vector(
+        A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc);
+    return;
+  }
+
+  if (dimM == 1 && dimN != 1 && dimK != 1 && transa == HPPL_OP_N) {
+    int m = (transb == HPPL_OP_N) ? dimK : dimN;
+    int n = (transb == HPPL_OP_N) ? dimN : dimK;
+    hl_trans_op_t trans = (transb == HPPL_OP_N) ? HPPL_OP_T : HPPL_OP_N;
+    hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1);
+    return;
+  }
+
+  cublasStatus_t stat;
+  if ((HPPL_OP_N == transa) && (HPPL_OP_N == transb)) {
+    stat = CUBLAS_GEMM(t_resource.handle,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_N,
+                       dimN,
+                       dimM,
+                       dimK,
+                       &alpha,
+                       B_d,
+                       ldb,
+                       A_d,
+                       lda,
+                       &beta,
+                       C_d,
+                       ldc);
+  } else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) {
+    stat = CUBLAS_GEMM(t_resource.handle,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_T,
+                       dimN,
+                       dimM,
+                       dimK,
+                       &alpha,
+                       B_d,
+                       ldb,
+                       A_d,
+                       lda,
+                       &beta,
+                       C_d,
+                       ldc);
+  } else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) {
+    stat = CUBLAS_GEMM(t_resource.handle,
+                       CUBLAS_OP_T,
+                       CUBLAS_OP_N,
+                       dimN,
+                       dimM,
+                       dimK,
+                       &alpha,
+                       B_d,
+                       ldb,
+                       A_d,
+                       lda,
+                       &beta,
+                       C_d,
+                       ldc);
+  } else {
+    LOG(FATAL) << "parameter transa error!";
+  }
+  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
+  CHECK_SYNC("hl_matrix_mul failed");
+}
+
+void hl_matrix_mul(real *A_d,
+                   hl_trans_op_t transa,
+                   real *B_d,
+                   hl_trans_op_t transb,
+                   real *C_d,
+                   int dimM,
+                   int dimN,
+                   int dimK,
+                   real alpha,
+                   real beta) {
+  int lda = (HPPL_OP_N == transa) ? dimK : dimM;
+  int ldb = (HPPL_OP_N == transb) ? dimN : dimK;
+  int ldc = dimN;
+
+  hl_matrix_mul(A_d,
+                transa,
+                B_d,
+                transb,
+                C_d,
+                dimM,
+                dimN,
+                dimK,
+                alpha,
+                beta,
+                lda,
+                ldb,
+                ldc);
+}
+
+void hl_matrix_mul_vector(real *A_d,
+                          hl_trans_op_t trans,
+                          real *B_d,
+                          real *C_d,
+                          int dimM,
+                          int dimN,
+                          real alpha,
+                          real beta,
+                          int lda,
+                          int incb,
+                          int incc) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+  CHECK_NOTNULL(C_d);
+
+  cublasStatus_t stat;
+  if (HPPL_OP_N == trans) {
+    stat = CUBLAS_GEMV(t_resource.handle,
+                       CUBLAS_OP_T,
+                       dimN,
+                       dimM,
+                       &alpha,
+                       A_d,
+                       lda,
+                       B_d,
+                       incb,
+                       &beta,
+                       C_d,
+                       incc);
+  } else if (HPPL_OP_T == trans) {
+    stat = CUBLAS_GEMV(t_resource.handle,
+                       CUBLAS_OP_N,
+                       dimN,
+                       dimM,
+                       &alpha,
+                       A_d,
+                       lda,
+                       B_d,
+                       incb,
+                       &beta,
+                       C_d,
+                       incc);
+  } else {
+    LOG(FATAL) << "parameter transa error!";
+  }
+
+  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
+  CHECK_SYNC("hl_matrix_mul_vector");
+}
+
+void hl_matrix_mul_vector(real *A_d,
+                          hl_trans_op_t trans,
+                          real *B_d,
+                          real *C_d,
+                          int dimM,
+                          int dimN,
+                          real alpha,
+                          real beta) {
+  hl_matrix_mul_vector(
+      A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1);
+}
diff --git a/paddle/legacy/cuda/src/hl_cuda_cudnn.cc b/paddle/legacy/cuda/src/hl_cuda_cudnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b0ac5aaac284cd939fc46be6a7320242312674ab
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_cuda_cudnn.cc
@@ -0,0 +1,1117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_cuda_cudnn.h"
+#include <cudnn.h>
+#include <gflags/gflags.h>
+#include "hl_cuda_cudnn.ph"
+#include "hl_thread.ph"
+#include "paddle/legacy/utils/DynamicLoader.h"
+#include "paddle/legacy/utils/Logging.h"
+
+DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+             4096,
+             "Specify cuDNN max workspace limit, in units MB, "
+             "4096MB=4GB by default.");
+
+namespace dynload {
+
+std::once_flag cudnn_dso_flag;
+void* cudnn_dso_handle = nullptr;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load cudbnn routine
+ * via operator overloading: operator ()
+ *
+ * note: default dynamic linked libs
+ **/
+
+#ifdef PADDLE_USE_DSO
+
+#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                                     \
+  struct DynLoad__##__name {                                                \
+    template <typename... Args>                                             \
+    auto operator()(Args... args) -> decltype(__name(args...)) {            \
+      using cudnn_func = decltype(__name(args...)) (*)(Args...);            \
+      std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \
+      void* p_##__name = dlsym(cudnn_dso_handle, #__name);                  \
+      return reinterpret_cast<cudnn_func>(p_##__name)(args...);             \
+    }                                                                       \
+  } __name; /* struct DynLoad__##__name */
+
+#else
+
+#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                          \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      return __name(args...);                                    \
+    }                                                            \
+  } __name; /* struct DynLoad__##__name */
+
+#endif
+
+/**
+ * include all needed cudnn functions in HPPL
+ * different cudnn version has different interfaces
+ **/
+// clang-format off
+#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
+  __macro(cudnnSetTensor4dDescriptor)                     \
+  __macro(cudnnSetTensor4dDescriptorEx)                   \
+  __macro(cudnnGetConvolutionNdForwardOutputDim)          \
+  __macro(cudnnGetConvolutionForwardAlgorithm)            \
+  __macro(cudnnCreateTensorDescriptor)                    \
+  __macro(cudnnDestroyTensorDescriptor)                   \
+  __macro(cudnnCreateFilterDescriptor)                    \
+  __macro(cudnnSetFilter4dDescriptor)                     \
+  __macro(cudnnSetPooling2dDescriptor)                    \
+  __macro(cudnnDestroyFilterDescriptor)                   \
+  __macro(cudnnCreateConvolutionDescriptor)               \
+  __macro(cudnnCreatePoolingDescriptor)                   \
+  __macro(cudnnDestroyPoolingDescriptor)                  \
+  __macro(cudnnSetConvolution2dDescriptor)                \
+  __macro(cudnnDestroyConvolutionDescriptor)              \
+  __macro(cudnnCreate)                                    \
+  __macro(cudnnDestroy)                                   \
+  __macro(cudnnSetStream)                                 \
+  __macro(cudnnActivationForward)                         \
+  __macro(cudnnConvolutionForward)                        \
+  __macro(cudnnConvolutionBackwardBias)                   \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
+  __macro(cudnnTransformTensor)                           \
+  __macro(cudnnPoolingForward)                            \
+  __macro(cudnnPoolingBackward)                           \
+  __macro(cudnnSoftmaxBackward)                           \
+  __macro(cudnnSoftmaxForward)                            \
+  __macro(cudnnGetVersion)                                \
+  __macro(cudnnGetErrorString)
+CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
+
+#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
+  __macro(cudnnAddTensor)                                 \
+  __macro(cudnnConvolutionBackwardData)                   \
+  __macro(cudnnConvolutionBackwardFilter)
+CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP)
+
+// APIs available after R3:
+#if CUDNN_VERSION >= 3000
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)              \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
+#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
+#endif
+
+
+// APIs available after R4:
+#if CUDNN_VERSION >= 4007
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)             \
+  __macro(cudnnBatchNormalizationForwardTraining)            \
+  __macro(cudnnBatchNormalizationForwardInference)           \
+  __macro(cudnnBatchNormalizationBackward)
+CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP)
+#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
+#endif
+
+// APIs in R5
+#if CUDNN_VERSION >= 5000
+#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)                    \
+  __macro(cudnnCreateActivationDescriptor)                    \
+  __macro(cudnnSetActivationDescriptor)                       \
+  __macro(cudnnGetActivationDescriptor)                       \
+  __macro(cudnnDestroyActivationDescriptor)
+CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
+#undef CUDNN_DNN_ROUTINE_EACH_R5
+#endif
+
+#undef CUDNN_DNN_ROUTINE_EACH
+// clang-format on
+} /* namespace dynload */
+
+/**
+ * Check build-in cudnn function using glog and it **does not**
+ * support << operator for more details error info.
+ */
+#define CHECK_CUDNN(cudnnFunc)                                         \
+  do {                                                                 \
+    cudnnStatus_t cudnnStat = cudnnFunc;                               \
+    CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat)                          \
+        << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \
+  } while (0)
+
+bool g_is_libcudnn_init = false;
+int g_cudnn_lib_version = 0;
+
+void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) {
+  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc));
+}
+
+void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) {
+  size_t cudnn_dso_ver = dynload::cudnnGetVersion();
+  size_t cudnn_dso_major = cudnn_dso_ver / 1000;
+  size_t cudnn_cuh_major = CUDNN_VERSION / 1000;
+
+  // Compare cudnn header version with that of cudnn.so.
+  CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) ||
+        (cudnn_cuh_major == cudnn_dso_major))
+      << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v"
+      << cudnn_cuh_major << " unmatched!\n"
+      << "PaddlePaddle Requirement: "
+      << "(header v[2-3] with libcudnn v[2-3]) Or "
+      << "(header v4 with libcudnn v4) Or "
+      << "(header v5 with libcudnn v5) Or"
+      << "(header v6 with libcudnn v6).";
+
+  CHECK(!(CUDNN_VERSION < 6000 && CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050))
+      << "cudnn v5 requires cuda version >= 7.5";
+
+  CHECK(!(CUDNN_VERSION >= 6000 && CUDA_VERSION < 8000))
+      << "cudnn v6 requires cuda version >= 8.0";
+
+  CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle));
+  CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream));
+
+  g_is_libcudnn_init = true;
+  g_cudnn_lib_version = cudnn_dso_ver;
+}
+
+int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; }
+
+void hl_conv_workspace(hl_tensor_descriptor input,
+                       hl_tensor_descriptor output,
+                       hl_filter_descriptor filter,
+                       hl_convolution_descriptor conv,
+                       int* convFwdAlgo,
+                       size_t* fwdLimitBytes,
+                       int* convBwdDataAlgo,
+                       size_t* bwdDataLimitBytes,
+                       int* convBwdFilterAlgo,
+                       size_t* bwdFilterLimitBytes,
+                       bool useDilation) {
+#if CUDNN_VERSION >= 4000
+
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(filter);
+  CHECK_NOTNULL(conv);
+
+  // Specify workspace limit directly
+  size_t memoryLimitBytes =
+      (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
+
+  // For dilation
+  int algo = 0;
+
+  // cudnn convolution forward configuration
+  cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward data configuration
+  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward filter configuration
+  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+  if (useDilation) {
+    convFwdAlgo = &algo;
+    convBwdDataAlgo = &algo;
+    convBwdFilterAlgo = &algo;
+  } else {
+    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+        t_resource.cudnn_handle,
+        fwd_src_desc,
+        fwd_filter_desc,
+        fwd_conv_desc,
+        fwd_dest_desc,
+        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_data_filter_desc,
+        bwd_data_diff_desc,
+        bwd_data_conv_desc,
+        bwd_data_grad_desc,
+        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_filter_src_desc,
+        bwd_filter_diff_desc,
+        bwd_filter_conv_desc,
+        bwd_filter_grad_desc,
+        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+  }
+
+  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
+      t_resource.cudnn_handle,
+      fwd_src_desc,
+      fwd_filter_desc,
+      fwd_conv_desc,
+      fwd_dest_desc,
+      static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
+      fwdLimitBytes));
+
+  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+      t_resource.cudnn_handle,
+      bwd_data_filter_desc,
+      bwd_data_diff_desc,
+      bwd_data_conv_desc,
+      bwd_data_grad_desc,
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
+      bwdDataLimitBytes));
+
+  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      t_resource.cudnn_handle,
+      bwd_filter_src_desc,
+      bwd_filter_diff_desc,
+      bwd_filter_conv_desc,
+      bwd_filter_grad_desc,
+      static_cast<cudnnConvolutionBwdFilterAlgo_t>(*convBwdFilterAlgo),
+      bwdFilterLimitBytes));
+
+#endif
+}
+
+void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
+                                 int batch_size,
+                                 int feature_maps,
+                                 int height,
+                                 int width) {
+  CHECK_NOTNULL(image_desc);
+
+  cudnn_tensor_descriptor hl_desc =
+      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+  CHECK_NOTNULL(hl_desc);
+
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+
+  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc,
+                                                  CUDNN_TENSOR_NCHW,
+                                                  data_type,
+                                                  batch_size,
+                                                  feature_maps,
+                                                  height,
+                                                  width));
+
+  hl_desc->format = CUDNN_TENSOR_NCHW;
+  hl_desc->data_type = data_type;
+  hl_desc->batch_size = batch_size;
+  hl_desc->feature_maps = feature_maps;
+  hl_desc->height = height;
+  hl_desc->width = width;
+
+  *image_desc = (hl_tensor_descriptor)hl_desc;
+}
+
+void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
+  CHECK_NOTNULL(image_desc);
+
+  cudnn_tensor_descriptor hl_desc =
+      (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
+  CHECK_NOTNULL(hl_desc);
+
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc));
+
+  hl_desc->data_type = data_type;
+
+  *image_desc = (hl_tensor_descriptor)hl_desc;
+}
+
+void hl_tensor_reshape(hl_tensor_descriptor image_desc,
+                       int batch_size,
+                       int feature_maps,
+                       int height,
+                       int width) {
+  const int stride_w = 1;
+  const int stride_h = width * stride_w;
+  const int stride_c = height * stride_h;
+  const int stride_n = feature_maps * stride_c;
+  return hl_tensor_reshape(image_desc,
+                           batch_size,
+                           feature_maps,
+                           height,
+                           width,
+                           stride_n,
+                           stride_c,
+                           stride_h,
+                           stride_w);
+}
+
+void hl_tensor_reshape(hl_tensor_descriptor image_desc,
+                       int batch_size,
+                       int feature_maps,
+                       int height,
+                       int width,
+                       int nStride,
+                       int cStride,
+                       int hStride,
+                       int wStride) {
+  CHECK_NOTNULL(image_desc);
+
+  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+  CHECK_NOTNULL(hl_desc->desc);
+
+  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc,
+                                                    hl_desc->data_type,
+                                                    batch_size,
+                                                    feature_maps,
+                                                    height,
+                                                    width,
+                                                    nStride,
+                                                    cStride,
+                                                    hStride,
+                                                    wStride));
+
+  hl_desc->batch_size = batch_size;
+  hl_desc->feature_maps = feature_maps;
+  hl_desc->height = height;
+  hl_desc->width = width;
+}
+
+void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) {
+  CHECK_NOTNULL(image_desc);
+
+  cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc;
+  CHECK_NOTNULL(hl_desc->desc);
+
+  CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc));
+
+  hl_desc->desc = NULL;
+
+  free(image_desc);
+}
+
+void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
+                                  hl_pooling_mode_t mode,
+                                  int height,
+                                  int width,
+                                  int height_padding,
+                                  int width_padding,
+                                  int stride_height,
+                                  int stride_width) {
+  cudnnPoolingMode_t cudnn_mode;
+  switch (mode) {
+    case HL_POOLING_MAX:
+      cudnn_mode = CUDNN_POOLING_MAX;
+      break;
+    case HL_POOLING_AVERAGE:
+      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
+      break;
+    case HL_POOLING_AVERAGE_INCLUDE_PADDING:
+      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+      break;
+    default:
+      LOG(FATAL) << "parameter mode error";
+  }
+
+  CHECK_NOTNULL(pooling_desc);
+
+  cudnn_pooling_descriptor hl_pooling_desc =
+      (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor));
+  CHECK_NOTNULL(hl_pooling_desc);
+
+  CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc));
+
+  CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc,
+                                                   cudnn_mode,
+#if CUDNN_VERSION >= 5000
+                                                   CUDNN_PROPAGATE_NAN,
+#endif
+                                                   height,
+                                                   width,
+                                                   height_padding,
+                                                   width_padding,
+                                                   stride_height,
+                                                   stride_width));
+
+  hl_pooling_desc->mode = cudnn_mode;
+  hl_pooling_desc->window_height = height;
+  hl_pooling_desc->window_width = width;
+  hl_pooling_desc->stride_height = stride_height;
+  hl_pooling_desc->stride_width = stride_width;
+
+  *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc;
+}
+
+void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) {
+  CHECK_NOTNULL(pooling_desc);
+
+  cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc;
+
+  CHECK_NOTNULL(hl_pooling->desc);
+  CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc));
+
+  hl_pooling->desc = NULL;
+
+  free(pooling_desc);
+}
+
+void hl_pooling_forward(hl_tensor_descriptor input,
+                        real* input_image,
+                        hl_tensor_descriptor output,
+                        real* output_image,
+                        hl_pooling_descriptor pooling) {
+  cudnnPoolingDescriptor_t pooling_desc;
+  cudnnTensorDescriptor_t input_desc;
+  cudnnTensorDescriptor_t output_desc;
+
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(pooling);
+  CHECK_NOTNULL(input_image);
+  CHECK_NOTNULL(output_image);
+
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  input_desc = ((cudnn_tensor_descriptor)input)->desc;
+  output_desc = ((cudnn_tensor_descriptor)output)->desc;
+  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+  CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle,
+                                           pooling_desc,
+                                           &alpha,
+                                           input_desc,
+                                           input_image,
+                                           &beta,
+                                           output_desc,
+                                           output_image));
+  CHECK_SYNC("hl_pooling_forward failed");
+}
+
+void hl_pooling_backward(hl_tensor_descriptor input,
+                         real* input_image,
+                         real* input_image_grad,
+                         hl_tensor_descriptor output,
+                         real* output_image,
+                         real* output_image_grad,
+                         hl_pooling_descriptor pooling) {
+  cudnnPoolingDescriptor_t pooling_desc;
+  cudnnTensorDescriptor_t input_desc;
+  cudnnTensorDescriptor_t output_desc;
+
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(pooling);
+  CHECK_NOTNULL(input_image);
+  CHECK_NOTNULL(input_image_grad);
+  CHECK_NOTNULL(output_image);
+  CHECK_NOTNULL(output_image_grad);
+
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  input_desc = ((cudnn_tensor_descriptor)input)->desc;
+  output_desc = ((cudnn_tensor_descriptor)output)->desc;
+  pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc;
+  CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle,
+                                            pooling_desc,
+                                            &alpha,
+                                            output_desc,
+                                            output_image,
+                                            output_desc,
+                                            output_image_grad,
+                                            input_desc,
+                                            input_image,
+                                            &beta,
+                                            input_desc,
+                                            input_image_grad));
+  CHECK_SYNC("hl_pooling_backward failed");
+}
+
+void hl_create_filter_descriptor(hl_filter_descriptor* filter,
+                                 int input_feature_maps,
+                                 int output_feature_maps,
+                                 int height,
+                                 int width) {
+  CHECK_NOTNULL(filter);
+
+  cudnn_filter_descriptor hl_filter =
+      (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor));
+  CHECK_NOTNULL(hl_filter);
+
+  CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
+
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc,
+                                                  data_type,
+#if CUDNN_VERSION >= 5000
+                                                  CUDNN_TENSOR_NCHW,
+#endif
+                                                  output_feature_maps,
+                                                  input_feature_maps,
+                                                  height,
+                                                  width));
+
+  hl_filter->data_type = data_type;
+  hl_filter->output_feature_maps = output_feature_maps;
+  hl_filter->input_feature_maps = input_feature_maps;
+  hl_filter->filter_height = height;
+  hl_filter->filter_width = width;
+
+  *filter = (hl_filter_descriptor)hl_filter;
+}
+
+void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {
+  CHECK_NOTNULL(filter);
+
+  cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter;
+  CHECK_NOTNULL(hl_filter->desc);
+
+  CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc));
+
+  hl_filter->desc = NULL;
+
+  free(filter);
+}
+
+void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
+                                      hl_tensor_descriptor image,
+                                      hl_filter_descriptor filter,
+                                      int padding_height,
+                                      int padding_width,
+                                      int stride_height,
+                                      int stride_width,
+                                      int dilation_h,
+                                      int dilation_w) {
+  CHECK_NOTNULL(conv);
+
+  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
+      sizeof(_cudnn_convolution_descriptor));
+
+  CHECK_NOTNULL(hl_conv);
+  CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc));
+
+  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+
+#if CUDNN_VERSION >= 6000
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       dilation_h,
+                                                       dilation_w,
+                                                       mode,
+                                                       data_type));
+#else
+  if (dilation_h > 1 || dilation_w > 1) {
+    LOG(FATAL)
+        << "Current cuDNN version does't support for dilation convolution. "
+        << "The dilation convolution requires cuDNN >= v6.0.";
+  }
+
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       dilation_h,
+                                                       dilation_w,
+                                                       mode));
+#endif
+
+  hl_conv->input_image = image;
+  hl_conv->filter = filter;
+  hl_conv->padding_height = padding_height;
+  hl_conv->padding_width = padding_width;
+  hl_conv->stride_height = stride_height;
+  hl_conv->stride_width = stride_width;
+  hl_conv->upscalex = 1;
+  hl_conv->upscaley = 1;
+  hl_conv->mode = mode;
+
+  *conv = (hl_convolution_descriptor)hl_conv;
+}
+
+void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
+                                     hl_tensor_descriptor image,
+                                     hl_filter_descriptor filter,
+                                     int padding_height,
+                                     int padding_width,
+                                     int stride_height,
+                                     int stride_width,
+                                     int dilation_h,
+                                     int dilation_w) {
+  CHECK_NOTNULL(conv);
+  CHECK_NOTNULL(image);
+  CHECK_NOTNULL(filter);
+
+  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION;
+
+#if CUDNN_VERSION >= 6000
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       dilation_h,
+                                                       dilation_w,
+                                                       mode,
+                                                       data_type));
+#else
+  CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc,
+                                                       padding_height,
+                                                       padding_width,
+                                                       stride_height,
+                                                       stride_width,
+                                                       dilation_h,
+                                                       dilation_w,
+                                                       mode));
+#endif
+
+  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+  hl_conv->input_image = image;
+  hl_conv->filter = filter;
+  hl_conv->padding_height = padding_height;
+  hl_conv->padding_width = padding_width;
+  hl_conv->stride_height = stride_height;
+  hl_conv->stride_width = stride_width;
+  hl_conv->upscalex = 1;
+  hl_conv->upscaley = 1;
+  hl_conv->mode = mode;
+}
+
+void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {
+  CHECK_NOTNULL(conv);
+
+  cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv;
+  CHECK_NOTNULL(hl_conv->desc);
+
+  CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc));
+  hl_conv->desc = NULL;
+
+  free(conv);
+}
+
+void hl_convolution_forward(hl_tensor_descriptor input,
+                            real* input_data,
+                            hl_tensor_descriptor output,
+                            real* output_data,
+                            hl_filter_descriptor filter,
+                            real* filter_data,
+                            hl_convolution_descriptor conv,
+                            void* gpuWorkSpace,
+                            size_t sizeInBytes,
+                            int convFwdAlgo) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(filter);
+  CHECK_NOTNULL(conv);
+  CHECK_NOTNULL(input_data);
+  CHECK_NOTNULL(output_data);
+  CHECK_NOTNULL(filter_data);
+  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  CHECK_CUDNN(dynload::cudnnConvolutionForward(
+      t_resource.cudnn_handle,
+      &alpha,
+      src_desc,
+      input_data,
+      filter_desc,
+      filter_data,
+      conv_desc,
+      static_cast<cudnnConvolutionFwdAlgo_t>(convFwdAlgo),
+      gpuWorkSpace,
+      sizeInBytes,
+      &beta,
+      dest_desc,
+      output_data));
+  CHECK_SYNC("hl_convolution_forward failed");
+}
+
+void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
+                                     real* bias_data,
+                                     hl_tensor_descriptor output,
+                                     real* output_data) {
+  CHECK_NOTNULL(bias);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(bias_data);
+  CHECK_NOTNULL(output_data);
+
+  cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+  real alpha = 1.0f;
+  real beta = 1.0f;
+
+  CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle,
+#if CUDNN_VERSION < 4000
+                                      CUDNN_ADD_SAME_C,
+#endif
+                                      &alpha,
+                                      bias_desc,
+                                      bias_data,
+                                      &beta,
+                                      output_desc,
+                                      output_data));
+  CHECK_SYNC("hl_convolution_forward_add_bias failed");
+}
+
+void hl_convolution_backward_bias(hl_tensor_descriptor bias,
+                                  real* bias_grad_data,
+                                  hl_tensor_descriptor output,
+                                  real* output_grad_data) {
+  CHECK_NOTNULL(bias);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(bias_grad_data);
+  CHECK_NOTNULL(output_grad_data);
+
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias);
+  CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle,
+                                                    &alpha,
+                                                    diff_desc,
+                                                    output_grad_data,
+                                                    &beta,
+                                                    bias_desc,
+                                                    bias_grad_data));
+  CHECK_SYNC("hl_convolution_backward_bias failed");
+}
+
+void hl_convolution_backward_filter(hl_tensor_descriptor input,
+                                    real* input_data,
+                                    hl_tensor_descriptor output,
+                                    real* output_grad_data,
+                                    hl_filter_descriptor filter,
+                                    real* filter_grad_data,
+                                    hl_convolution_descriptor conv,
+                                    void* gpuWorkSpace,
+                                    size_t sizeInBytes,
+                                    int convBwdFilterAlgo) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(filter);
+  CHECK_NOTNULL(conv);
+  CHECK_NOTNULL(input_data);
+  CHECK_NOTNULL(output_grad_data);
+  CHECK_NOTNULL(filter_grad_data);
+
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter);
+
+  CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter(
+      t_resource.cudnn_handle,
+      &alpha,
+      src_desc,
+      input_data,
+      diff_desc,
+      output_grad_data,
+      conv_desc,
+#if CUDNN_VERSION >= 4000
+      static_cast<cudnnConvolutionBwdFilterAlgo_t>(convBwdFilterAlgo),
+      gpuWorkSpace,
+      sizeInBytes,
+#endif
+      &beta,
+      grad_desc,
+      filter_grad_data));
+  CHECK_SYNC("hl_convolution_backward_filter failed");
+}
+
+void hl_convolution_backward_data(hl_tensor_descriptor input,
+                                  real* input_data_grad,
+                                  hl_tensor_descriptor output,
+                                  real* output_grad_data,
+                                  hl_filter_descriptor filter,
+                                  real* filter_data,
+                                  hl_convolution_descriptor conv,
+                                  void* gpuWorkSpace,
+                                  size_t sizeInBytes,
+                                  int convBwdDataAlgo) {
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+
+  CHECK_CUDNN(dynload::cudnnConvolutionBackwardData(
+      t_resource.cudnn_handle,
+      &alpha,
+      filter_desc,
+      filter_data,
+      diff_desc,
+      output_grad_data,
+      conv_desc,
+#if CUDNN_VERSION >= 4000
+      static_cast<cudnnConvolutionBwdDataAlgo_t>(convBwdDataAlgo),
+      gpuWorkSpace,
+      sizeInBytes,
+#endif
+      &beta,
+      grad_desc,
+      input_data_grad));
+  CHECK_SYNC("hl_convolution_backward_data failed");
+}
+
+void hl_softmax_forward(real* input, real* output, int height, int width) {
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+                                                  CUDNN_TENSOR_NCHW,
+                                                  data_type,
+                                                  height,
+                                                  width,
+                                                  1,
+                                                  1));
+
+  real alpha = 1.0f;
+  real beta = 0.0f;
+  CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle,
+                                           CUDNN_SOFTMAX_ACCURATE,
+                                           CUDNN_SOFTMAX_MODE_CHANNEL,
+                                           &alpha,
+                                           t_resource.cudnn_desc,
+                                           input,
+                                           &beta,
+                                           t_resource.cudnn_desc,
+                                           output));
+  CHECK_SYNC("hl_softmax_forward failed");
+}
+
+void hl_softmax_backward(real* output_value,
+                         real* output_grad,
+                         int height,
+                         int width) {
+#ifndef PADDLE_TYPE_DOUBLE
+  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
+#else
+  cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
+#endif
+  CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc,
+                                                  CUDNN_TENSOR_NCHW,
+                                                  data_type,
+                                                  height,
+                                                  width,
+                                                  1,
+                                                  1));
+
+  real alpha = 1.0f;
+  real beta = 0.0f;
+  CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle,
+                                            CUDNN_SOFTMAX_ACCURATE,
+                                            CUDNN_SOFTMAX_MODE_CHANNEL,
+                                            &alpha,
+                                            t_resource.cudnn_desc,
+                                            output_value,
+                                            t_resource.cudnn_desc,
+                                            output_grad,
+                                            &beta,
+                                            t_resource.cudnn_desc,
+                                            output_grad));
+  CHECK_SYNC("hl_softmax_backward failed");
+}
+
+void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
+                                    real* input,
+                                    hl_tensor_descriptor outputDesc,
+                                    real* output,
+                                    hl_tensor_descriptor bnParamDesc,
+                                    real* scale,
+                                    real* bias,
+                                    double factor,
+                                    real* runningMean,
+                                    real* runningInvVar,
+                                    double epsilon,
+                                    real* savedMean,
+                                    real* savedVar) {
+#if CUDNN_VERSION >= 4007
+  if ((NULL != runningMean && NULL == runningInvVar) ||
+      (NULL == runningMean && NULL != runningInvVar)) {
+    LOG(FATAL) << "runningMean and runningInvVar can be NULL "
+               << "but only at the same time.";
+  }
+  if ((NULL != savedMean && NULL == savedVar) ||
+      (NULL == savedMean && NULL != savedVar)) {
+    LOG(FATAL) << "savedMean and savedVar can be NULL "
+               << "but only at the same time.";
+  }
+
+  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
+  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
+  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
+  CHECK_CUDNN(
+      dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle,
+                                                      mode,
+                                                      &alpha,
+                                                      &beta,
+                                                      xDesc,
+                                                      input,
+                                                      yDesc,
+                                                      output,
+                                                      bnDesc,
+                                                      scale,
+                                                      bias,
+                                                      factor,
+                                                      runningMean,
+                                                      runningInvVar,
+                                                      epsilon,
+                                                      savedMean,
+                                                      savedVar));
+
+  CHECK_SYNC("hl_batch_norm_forward_training failed");
+#else
+  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
+             << "But cudnn lib version is " << g_cudnn_lib_version;
+#endif
+}
+
+void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
+                                     real* input,
+                                     hl_tensor_descriptor outputDesc,
+                                     real* output,
+                                     hl_tensor_descriptor bnParamDesc,
+                                     real* scale,
+                                     real* bias,
+                                     real* estimatedMean,
+                                     real* estimatedInvVar,
+                                     double epsilon) {
+#if CUDNN_VERSION >= 4007
+  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
+  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
+  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
+
+  CHECK_CUDNN(
+      dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
+                                                       mode,
+                                                       &alpha,
+                                                       &beta,
+                                                       xDesc,
+                                                       input,
+                                                       yDesc,
+                                                       output,
+                                                       bnDesc,
+                                                       scale,
+                                                       bias,
+                                                       estimatedMean,
+                                                       estimatedInvVar,
+                                                       epsilon));
+
+  CHECK_SYNC("hl_batch_norm_forward_inference failed");
+#else
+  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
+             << "But cudnn lib version is " << g_cudnn_lib_version;
+#endif
+}
+
+void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
+                            real* input,
+                            hl_tensor_descriptor outGradDesc,
+                            real* outGrad,
+                            hl_tensor_descriptor inGradDesc,
+                            real* inGrad,
+                            hl_tensor_descriptor dBnParamDesc,
+                            real* scale,
+                            real* scaleGrad,
+                            real* biasGrad,
+                            double epsilon,
+                            real* savedMean,
+                            real* savedInvVar) {
+#if CUDNN_VERSION >= 4007
+  if ((NULL != savedMean && NULL == savedInvVar) ||
+      (NULL == savedMean && NULL != savedInvVar)) {
+    LOG(FATAL) << "savedMean and savedVar can be NULL "
+               << "but only at the same time.";
+  }
+
+  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
+  cudnnTensorDescriptor_t dyDesc = GET_TENSOR_DESCRIPTOR(outGradDesc);
+  cudnnTensorDescriptor_t dxDesc = GET_TENSOR_DESCRIPTOR(inGradDesc);
+  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(dBnParamDesc);
+  real alpha = 1.0f;
+  real beta = 1.0f;
+  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
+  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
+                                                       mode,
+                                                       &alpha,
+                                                       &beta,
+                                                       &alpha,
+                                                       &beta,
+                                                       xDesc,
+                                                       input,
+                                                       dyDesc,
+                                                       outGrad,
+                                                       dxDesc,
+                                                       inGrad,
+                                                       bnDesc,
+                                                       scale,
+                                                       scaleGrad,
+                                                       biasGrad,
+                                                       epsilon,
+                                                       savedMean,
+                                                       savedInvVar));
+
+  CHECK_SYNC("hl_batch_norm_backward failed");
+#else
+  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
+             << "But cudnn lib version is " << g_cudnn_lib_version;
+#endif
+}
diff --git a/paddle/legacy/cuda/src/hl_cuda_device.cc b/paddle/legacy/cuda/src/hl_cuda_device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..501e3b0f3be02b9364f9182b2484d542f0f39889
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_cuda_device.cc
@@ -0,0 +1,677 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// clang-format off
+// Because clang-format 4.X and clang-format 3.8+ format
+// following lines in different. So disable clang-format.
+#include "hl_cuda.h"
+#include <cuda_profiler_api.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include "hl_cuda.ph"
+#include "hl_thread.ph"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/DynamicLoader.h"
+// clang-format on
+
+namespace dynload {
+
+std::once_flag curand_dso_flag;
+void *curand_dso_handle = nullptr;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load curand routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+#ifdef PADDLE_USE_DSO
+#define DYNAMIC_LOAD_CURAND_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    curandStatus_t operator()(Args... args) {                                  \
+      typedef curandStatus_t (*curandFunc)(Args...);                           \
+      std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
+      void *p_##__name = dlsym(curand_dso_handle, #__name);                    \
+      return reinterpret_cast<curandFunc>(p_##__name)(args...);                \
+    }                                                                          \
+  } __name; /* struct DynLoad__##__name */
+#else
+#define DYNAMIC_LOAD_CURAND_WRAP(__name)      \
+  struct DynLoad__##__name {                  \
+    template <typename... Args>               \
+    curandStatus_t operator()(Args... args) { \
+      return __name(args...);                 \
+    }                                         \
+  } __name; /* struct DynLoad__##__name */
+#endif
+
+/* include all needed curand functions in HPPL */
+// clang-format off
+#define CURAND_RAND_ROUTINE_EACH(__macro)    \
+  __macro(curandCreateGenerator)             \
+  __macro(curandSetStream)                   \
+  __macro(curandSetPseudoRandomGeneratorSeed)\
+  __macro(curandGenerateUniform)             \
+  __macro(curandGenerateUniformDouble)
+// clang-format on
+
+CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
+
+#undef CURAND_RAND_ROUTINE_EACH
+#undef DYNAMIC_LOAD_CURAND_WRAP
+
+} /* namespace dynload */
+
+/**
+ * @brief   global resource.
+ */
+int g_system_device_num = 0;                /* system device number */
+int device_num = 0;                         /* use    device number */
+hl_device_prop *g_device;                   /* device info table */
+__thread thread_device_resources *t_device; /* device resources table */
+int g_cuda_lib_version = 0;
+
+/* number of global stream */
+#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
+/* number of thread stream */
+#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
+/* sizeof of device memory */
+#define HPPL_GPU_MEMORY_SIZE (256 * 4)
+
+/**
+ * Check build-in cuda function using glog and it **does not**
+ * support << operator for more details error info.
+ */
+#define CHECK_CUDA(cudaFunc)                                         \
+  do {                                                               \
+    cudaError_t cudaStat = cudaFunc;                                 \
+    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
+                                    << cudaGetErrorString(cudaStat); \
+  } while (0)
+
+/**
+ * @brief   thread resource.
+ */
+__thread _hl_thread_resource t_resource = {{0},    /* stream */
+                                           0,      /* handle */
+                                           0,      /* gen */
+                                           0,      /* cudnn_handle */
+                                           0,      /* cudnn_desc */
+                                           NULL,   /* gen_mutex */
+                                           NULL,   /* gpu_mem */
+                                           NULL,   /* cpu_mem */
+                                           0,      /* event */
+                                           -1,     /* device */
+                                           0,      /* major */
+                                           false}; /* is_init */
+
+__thread cudaStream_t default_stream = 0;
+__thread bool g_sync_flag = true;
+bool hl_start_flag = false;
+
+inline pid_t gettid() {
+#if defined(__APPLE__) || defined(__OSX__)
+  // syscall is deprecated: first deprecated in macOS 10.12.
+  // syscall is unsupported;
+  // syscall pid_t tid = syscall(SYS_thread_selfid);
+  uint64_t tid;
+  pthread_threadid_np(NULL, &tid);
+#else
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
+  pid_t tid = syscall(__NR_gettid);
+#endif
+  CHECK_NE((int)tid, -1);
+  return tid;
+}
+
+void hl_init(int device) {
+  CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
+
+  /* thread has been initialized */
+  if (true == t_resource.is_init) {
+    hl_set_device(device);
+    return;
+  }
+
+  /* create thread devcie resources */
+  char *tmp;
+  thread_device_resources device_res;
+  tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
+                       device_num * sizeof(_thread_device_resources));
+  CHECK_NOTNULL(tmp);
+  t_device = (thread_device_resources *)tmp;
+  device_res = (thread_device_resources)(
+      (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
+  memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
+
+  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
+                                    sizeof(cudaStream_t));
+  CHECK_NOTNULL(tmp_stream);
+
+  int num = 0;
+  for (int dev = 0; dev < g_system_device_num; dev++) {
+    if (!g_device[dev]) {
+      continue;
+    }
+
+    t_device[dev] = &device_res[num];
+    t_device[dev]->stream =
+        (cudaStream_t *)(tmp_stream +
+                         num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
+
+    hl_create_thread_resources(dev, t_device[dev]);
+    num++;
+  }
+
+  hl_cudnn_desc_init(&t_resource.cudnn_desc);
+
+  /* thread initialization is complete */
+  t_resource.is_init = true;
+  /* set device */
+  t_resource.device = -1;
+  hl_set_device(device);
+}
+
+void hl_fini() {
+  if (false == t_resource.is_init) {
+    return;
+  }
+
+  /* hppl stream fini */
+  t_resource.device = -1;
+  for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
+    t_resource.stream[i] = 0;
+  }
+
+  char *tmp = (char *)t_device;
+  char *tmp_stream = NULL;
+  for (int dev = 0; dev < g_system_device_num; dev++) {
+    if (!t_device[dev]) {
+      continue;
+    }
+    if (!tmp_stream) {
+      tmp_stream = (char *)t_device[dev]->stream;
+    }
+    for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
+      CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
+    }
+
+    /* free device memory */
+    hl_free_mem_device(t_device[dev]->gpu_mem);
+    hl_free_mem_host(t_device[dev]->cpu_mem);
+    CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
+  }
+
+  free(tmp);
+  free(tmp_stream);
+  t_resource.is_init = false;
+}
+
+int hl_get_device_count() { return device_num; }
+
+void hl_set_device(int device) {
+  if (device == t_resource.device) {
+    return;
+  }
+
+  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
+      << "Device: " << device << " is not specified in startup.";
+
+  CHECK_CUDA(cudaSetDevice(device));
+
+  /* switch thread stream */
+  for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
+    t_resource.stream[i] = g_device[device]->device_resources->stream[i];
+  }
+
+  if (true == t_resource.is_init) {
+    for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) {
+      t_resource.stream[i] =
+          t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM];
+    }
+    t_resource.gpu_mem = t_device[device]->gpu_mem;
+    t_resource.cpu_mem = t_device[device]->cpu_mem;
+    t_resource.event = t_device[device]->mem_event;
+  }
+
+  t_resource.handle = g_device[device]->device_resources->handle;
+  t_resource.gen = g_device[device]->device_resources->gen;
+  t_resource.cudnn_handle = g_device[device]->device_resources->cudnn_handle;
+  t_resource.gen_mutex = g_device[device]->device_resources->gen_mutex;
+  t_resource.device = device;
+  t_resource.major = g_device[device]->major;
+  default_stream = t_resource.stream[0];
+}
+
+int hl_get_device() {
+  int device;
+  CHECK_CUDA(cudaGetDevice(&device));
+  return device;
+}
+
+void *hl_malloc_device(size_t size) {
+  void *dest_d;
+
+  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
+  CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
+
+  return dest_d;
+}
+
+void hl_free_mem_device(void *dest_d) {
+  CHECK_NOTNULL(dest_d);
+
+  cudaError_t err = cudaFree(dest_d);
+  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
+      << hl_get_device_error_string();
+}
+
+void *hl_malloc_host(size_t size) {
+  void *dest_h;
+
+  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
+  CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
+
+  return dest_h;
+}
+
+void hl_free_mem_host(void *dest_h) {
+  CHECK_NOTNULL(dest_h);
+
+  cudaError_t err = cudaFreeHost(dest_h);
+  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
+      << hl_get_device_error_string();
+}
+
+void hl_memcpy(void *dst, void *src, size_t size) {
+  if (0 == size) {
+    return;
+  }
+  CHECK_NOTNULL(dst);
+  CHECK_NOTNULL(src);
+  CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
+}
+
+void hl_memset_device(void *dest_d, int value, size_t size) {
+  CHECK_CUDA(cudaMemset(dest_d, value, size));
+}
+
+void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
+  if (0 == size) {
+    return;
+  }
+  CHECK_NOTNULL(src_h);
+  CHECK_NOTNULL(dest_d);
+  CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
+}
+
+void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
+  if (0 == size) {
+    return;
+  }
+  CHECK_NOTNULL(dest_h);
+  CHECK_NOTNULL(src_d);
+  CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
+}
+
+void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
+  if (0 == size) {
+    return;
+  }
+  CHECK_NOTNULL(dest_d);
+  CHECK_NOTNULL(src_d);
+  CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
+}
+
+void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
+  cudaStream_t cu_stream;
+
+  if (0 == size) {
+    return;
+  }
+  CHECK_NOTNULL(dst);
+  CHECK_NOTNULL(src);
+  CHECK_LT(stream, HPPL_STREAM_END);
+  cu_stream = t_resource.stream[stream];
+
+  CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
+}
+
+void hl_start() {
+  hl_specify_devices_start(NULL, 0);
+  /* set default device */
+  hl_set_device(0);
+}
+
+bool hl_device_can_access_peer(int device, int peerDevice) {
+  int canAccessPeer;
+  CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
+
+  if (canAccessPeer == 1) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void hl_device_enable_peer_access(int peerDevice) {
+  cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
+  if (cudaErrorPeerAccessAlreadyEnabled == err) {
+    cudaGetLastError();
+  } else {
+    CHECK_CUDA(err);
+  }
+}
+
+void hl_create_global_resources(hl_device_prop device_prop) {
+  struct cudaDeviceProp cu_prop;
+  int device = device_prop->device;
+  global_device_resources device_res = device_prop->device_resources;
+
+  CHECK_CUDA(cudaSetDevice(device));
+  /* device properties */
+  CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
+
+  device_prop->major = cu_prop.major;
+  device_prop->minor = cu_prop.minor;
+  strncpy(device_prop->device_name, cu_prop.name, 256);
+  device_prop->device_mem = cu_prop.totalGlobalMem;
+
+  /* create device stream */
+  for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
+  }
+
+  /* cublas init */
+  hl_cublas_init(&device_res->handle, device_res->stream[0]);
+
+  /* create curand gen */
+  CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen,
+                                          CURAND_RNG_PSEUDO_DEFAULT),
+           CURAND_STATUS_SUCCESS)
+      << "[Start failed] Curand init failed.";
+
+  CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]),
+           CURAND_STATUS_SUCCESS)
+      << "[Start failed] Curand set stream failed!";
+
+  /* create cudnn handle */
+  hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]);
+
+  int seed = gettid();
+  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen,
+                                                       seed + device),
+           CURAND_STATUS_SUCCESS);
+
+  device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
+  pthread_mutex_init(device_res->gen_mutex, NULL);
+
+  CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
+}
+
+int hl_get_cuda_version() { return g_cuda_lib_version; }
+
+void hl_create_thread_resources(int device,
+                                thread_device_resources device_res) {
+  CHECK_CUDA(cudaSetDevice(device));
+
+  /* create thread stream */
+  for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
+  }
+
+  /* allocation device memory */
+  device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE);
+
+  /* allocation host memory */
+  device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
+
+  CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
+}
+
+void hl_specify_devices_start(int *device, int number) {
+  if (hl_start_flag) return;
+
+  /* 1. get the number of devices */
+  CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
+  CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
+  if (device == NULL) {
+    number = g_system_device_num;
+  }
+
+  /* 2. check device & create device property table */
+  CHECK_LE(number, g_system_device_num)
+      << "[Start failed] System does not have enough device. "
+      << "Device number: " << g_system_device_num << "Input number: " << number;
+
+  char *tmp;
+  hl_device_prop device_prop;
+  tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) +
+                       number * sizeof(_hl_device_prop));
+  CHECK(tmp) << "[Start failed] System memory is not enough.";
+
+  g_device = (hl_device_prop *)tmp;
+  device_prop = (hl_device_prop)(
+      (char *)tmp + g_system_device_num * sizeof(hl_device_prop *));
+  memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *));
+  int num = 0;
+  for (int i = 0; i < number; i++) {
+    int dev;
+    if (device == NULL) {
+      dev = i;
+    } else {
+      dev = device[i];
+    }
+
+    CHECK_LT(dev, g_system_device_num)
+        << "[Start failed] The specified device number is "
+        << "out of range. Max device number: " << g_system_device_num - 1
+        << " Specified devcie number: " << dev;
+
+    if (g_device[dev]) {
+      /* Warning */
+      LOG(WARNING) << "[Warning] Repeat specify device: " << dev;
+      continue;
+    }
+
+    g_device[dev] = &device_prop[num];
+    g_device[dev]->device = dev;
+    num++;
+  }
+  device_num = num;
+
+  /* 3.  create global device resources */
+  char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources));
+  CHECK_NOTNULL(tmp_res);
+
+  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM *
+                                    sizeof(cudaStream_t));
+  CHECK_NOTNULL(tmp_stream);
+
+  num = 0;
+  for (int i = 0; i < g_system_device_num; i++) {
+    if (!g_device[i]) {
+      continue;
+    }
+
+    g_device[i]->device_resources = (global_device_resources)(
+        tmp_res + num * sizeof(_global_device_resources));
+    g_device[i]->device_resources->stream =
+        (cudaStream_t *)(tmp_stream +
+                         num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t));
+
+    hl_create_global_resources(g_device[i]);
+    num++;
+  }
+
+  /* hl_start() is ok */
+  hl_start_flag = true;
+  /* set default device */
+  if (device == NULL) {
+    hl_set_device(0);
+  } else {
+    hl_set_device(device[0]);
+  }
+}
+
+void hl_rand(real *dest_d, size_t num) {
+  pthread_mutex_lock(t_resource.gen_mutex);
+  CHECK_EQ(
+#ifndef PADDLE_TYPE_DOUBLE
+      dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
+#else
+      dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),
+#endif
+      CURAND_STATUS_SUCCESS);
+  pthread_mutex_unlock(t_resource.gen_mutex);
+  CHECK_SYNC("hl_rand failed");
+}
+
+void hl_srand(unsigned int seed) {
+  pthread_mutex_lock(t_resource.gen_mutex);
+  CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed),
+           CURAND_STATUS_SUCCESS);
+  pthread_mutex_unlock(t_resource.gen_mutex);
+}
+
+void hl_set_sync_flag(bool flag) { g_sync_flag = flag; }
+
+bool hl_get_sync_flag() { return g_sync_flag; }
+
+void hl_stream_synchronize(hl_stream_t stream) {
+  cudaStream_t cu_stream;
+
+  CHECK_LT(stream, HPPL_STREAM_END) << __func__
+                                    << ": the parameter stream is error.";
+
+  cu_stream = t_resource.stream[stream];
+  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
+}
+
+void hl_create_event(hl_event_t *event) {
+  CHECK_NOTNULL(event);
+
+  struct _hl_event_st *st_event =
+      (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
+
+  CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
+
+  *event = st_event;
+}
+
+float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
+  float time;
+  CHECK_NOTNULL(start);
+  CHECK_NOTNULL(end);
+
+  CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
+  return time;
+}
+
+void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
+  cudaStream_t cu_stream;
+
+  CHECK_NOTNULL(event);
+  CHECK_LT(stream, HPPL_STREAM_END) << __func__
+                                    << ": the parameter stream is error.";
+
+  cu_stream = t_resource.stream[stream];
+  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
+}
+
+void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
+  cudaStream_t cu_stream;
+
+  CHECK_NOTNULL(event);
+  CHECK_LT(stream, HPPL_STREAM_END) << __func__
+                                    << ": the parameter stream is error.";
+
+  cu_stream = t_resource.stream[stream];
+  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
+}
+
+void hl_destroy_event(hl_event_t event) {
+  CHECK_NOTNULL(event);
+  CHECK_CUDA(cudaEventDestroy(event->cu_event));
+
+  free(event);
+  event = NULL;
+}
+
+void hl_event_synchronize(hl_event_t event) {
+  CHECK_NOTNULL(event);
+  CHECK_CUDA(cudaEventSynchronize(event->cu_event));
+}
+
+void hl_get_device_name(char *name, int len, int device) {
+  CHECK_NOTNULL(name);
+  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
+      << "Device(" << device << ") is not specified in startup.";
+
+  strncpy(name, g_device[device]->device_name, len);
+}
+
+void hl_get_device_memory(size_t *mem_size, int device) {
+  CHECK_NOTNULL(mem_size);
+  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
+      << "Device(" << device << ") is not specified in startup.";
+
+  *mem_size = g_device[device]->device_mem;
+}
+
+void hl_get_device_compute_capability(int *major, int *minor, int device) {
+  CHECK_NOTNULL(major);
+  CHECK_NOTNULL(minor);
+  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
+      << "Device(" << device << ") is not specified in startup.";
+
+  *major = g_device[device]->major;
+  *minor = g_device[device]->minor;
+}
+
+int hl_get_device_last_error() { return (int)cudaGetLastError(); }
+
+const char *hl_get_device_error_string() {
+  cudaError_t err = cudaGetLastError();
+  return cudaGetErrorString(err);
+}
+
+const char *hl_get_device_error_string(size_t err) {
+  return cudaGetErrorString((cudaError_t)err);
+}
+
+void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
+void hl_set_device_flags_block() {
+  CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+}
+
+bool hl_cuda_event_is_ready(hl_event_t event) {
+  cudaError_t err = cudaEventQuery(event->cu_event);
+  CHECK(cudaSuccess == err || cudaErrorNotReady == err);
+
+  if (cudaErrorNotReady == err) {
+    return false;
+  }
+  return true;
+}
+
+void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
+
+void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
diff --git a/paddle/legacy/cuda/src/hl_cuda_lstm.cu b/paddle/legacy/cuda/src/hl_cuda_lstm.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9ac564fd2548cc782bee2380350f4ab888670ca3
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_cuda_lstm.cu
@@ -0,0 +1,876 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_activation_functions.h"
+#include "hl_base.h"
+#include "hl_cuda_cublas.h"
+#include "hl_device_functions.cuh"
+#include "paddle/legacy/utils/Logging.h"
+
+typedef hppl::Active<real>::forward t_forward;
+typedef hppl::Active<real>::backward t_backward;
+
+bool hl_lstm_sequence_parallel(int frameSize) {
+  if (frameSize == 32 || frameSize == 64) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+class frameValue {
+ public:
+  real *value_;
+  __device__ frameValue(real *value) : value_(value) {}
+  template <int reversed, int frameSize>
+  __device__ inline void init(int start, int length, int idx) {
+    if (reversed == 0) {
+      value_ += start * frameSize + idx;
+    } else {
+      value_ += (start + length - 1) * frameSize + idx;
+    }
+  }
+  __device__ inline real *getPtr() const { return value_; }
+  __device__ inline real getValue() { return *value_; }
+  __device__ inline void setValue(real value) { *value_ = value; }
+  template <int reversed, int frameSize>
+  __device__ inline void nextFrame() {
+    if (reversed == 0) {
+      value_ += frameSize;
+    } else {
+      value_ -= frameSize;
+    }
+  }
+};
+
+__device__ __forceinline__ void ptx_sync(const int id, const int barriers) {
+  asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory");
+}
+
+__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) {
+  asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory");
+}
+
+template <int valueSize, int frameSize>
+__device__ __forceinline__ real forward_sequence(real value,
+                                                 real *shValue,
+                                                 real *state,
+                                                 real *preOutput,
+                                                 real *output,
+                                                 real check,
+                                                 int index,
+                                                 t_forward activeNode,
+                                                 t_forward activeGate,
+                                                 t_forward activeState) {
+  real out;
+  real prevOut;
+  real state_r;
+  const int idx = index % frameSize;
+  const int idy = index / frameSize;
+  // assert(index < valueSize);
+
+  if (idy == 0) {
+    value = activeNode(value);
+    shValue[index] = value;
+  }
+  if (idy == 1 || idy == 2) {
+    state_r = state[idx];
+    value += state_r * check;
+    value = activeGate(value);
+    shValue[index] = value;
+  }
+  ptx_sync(1, valueSize);
+  if (idy == 3) {
+    state_r = state[idx];
+    state_r = state_r * shValue[idx + frameSize * 2];
+    state_r += shValue[idx] * shValue[idx + frameSize];
+    state[idx] = state_r;
+    ptx_arrive(2, frameSize * 2);
+    value += state_r * check;
+    value = activeGate(value);
+    shValue[index] = value;
+    ptx_sync(3, frameSize * 2);
+    prevOut = preOutput[idx];
+    out = prevOut * value;
+    output[idx] = out;
+  }
+  if (idy == 0) {
+    ptx_sync(2, frameSize * 2);
+    prevOut = state[idx];
+    prevOut = activeState(prevOut);
+    preOutput[idx] = prevOut;
+    ptx_arrive(3, frameSize * 2);
+  }
+  return value;
+}
+
+#define OUTPUT_BARRIER_ID 10
+#define OUTPUT_BARRIER_ID2 11
+template <int valueSize,
+          int frameSize,
+          int reversed,
+          int computeThreads,
+          int blockSize>
+__global__ void KeLstmForward(real *gateValue,
+                              real *state,
+                              real *output,
+                              real *preOutput,
+                              real *checkIg,
+                              real *checkFg,
+                              real *checkOg,
+                              real *weight,
+                              const int *starts,
+                              hl_activation_mode_t active_node,
+                              hl_activation_mode_t active_gate,
+                              hl_activation_mode_t active_state) {
+  __shared__ real shValue[valueSize];
+  __shared__ real shState[frameSize];
+  __shared__ real shPrevOutput[frameSize];
+  __shared__ real shOutput[frameSize];
+
+  const int index = threadIdx.x;
+  int start = starts[blockIdx.x];
+  int length = starts[blockIdx.x + 1] - start;
+
+  /* init */
+  real check;
+  real value;
+  frameValue frameGate(gateValue);
+  frameValue frameState(state);
+  frameValue frameOutput(output);
+  frameValue framePreOutput(preOutput);
+  if (index < valueSize) {
+    const int idx = index % frameSize;
+    const int idy = index / frameSize;
+    frameGate.init<reversed, valueSize>(start, length, index);
+    value = frameGate.getValue();
+    if (idy == 0) {
+      shState[idx] = 0.0;
+    } else if (idy == 1) {
+      check = checkIg[idx];
+    } else if (idy == 2) {
+      check = checkFg[idx];
+    } else if (idy == 3) {
+      check = checkOg[idx];
+    }
+
+    if (idy == 3) {
+      frameState.init<reversed, frameSize>(start, length, idx);
+      frameOutput.init<reversed, frameSize>(start, length, idx);
+      framePreOutput.init<reversed, frameSize>(start, length, idx);
+    }
+
+    ptx_sync(1, valueSize);
+  }
+
+  for (int i = 0; i < length; ++i) {
+    if (index < valueSize) {
+      if (valueSize == 128) {
+        if (i != 0) {
+          ptx_sync(OUTPUT_BARRIER_ID2, blockSize);
+          value += shValue[index];
+        }
+      }
+      value = forward_sequence<valueSize, frameSize>(
+          value,
+          shValue,
+          shState,
+          shPrevOutput,
+          shOutput,
+          check,
+          index,
+          hppl::gpu::forward[active_node],
+          hppl::gpu::forward[active_gate],
+          hppl::gpu::forward[active_state]);
+      const int idx = index % frameSize;
+      const int idy = index / frameSize;
+      if (valueSize == 128) {
+        if (idy == 3) {
+          ptx_arrive(OUTPUT_BARRIER_ID, frameSize + 128);
+        }
+      }
+      if (valueSize == 256) {
+        ptx_sync(OUTPUT_BARRIER_ID, valueSize);
+      }
+      frameGate.setValue(value);
+      if (idy == 3) {
+        frameState.setValue(shState[idx]);
+        frameOutput.setValue(shOutput[idx]);
+        framePreOutput.setValue(shPrevOutput[idx]);
+        frameState.nextFrame<reversed, frameSize>();
+        frameOutput.nextFrame<reversed, frameSize>();
+        framePreOutput.nextFrame<reversed, frameSize>();
+      }
+      if (i != length - 1) {
+        frameGate.nextFrame<reversed, valueSize>();
+        value = frameGate.getValue();
+      }
+    }
+    if (i != length - 1) {
+      if (valueSize == 128) {
+        if (valueSize <= index) {
+          real B_r[frameSize];
+          const int computeIdx = index - valueSize;
+          if (i == 0) {
+#pragma unroll
+            for (int n = 0; n < frameSize; n++) {
+              B_r[n] = weight[n * valueSize + computeIdx];
+            }
+          }
+          ptx_sync(OUTPUT_BARRIER_ID, frameSize + 128);
+          real A_r[frameSize];
+          for (int n = 0; n < frameSize; n++) {
+            A_r[n] = shOutput[n];
+          }
+          real sum = 0.0f;
+          for (int n = 0; n < frameSize; n++) {
+            sum += A_r[n] * B_r[n];
+          }
+          shValue[computeIdx] = sum;
+          ptx_arrive(OUTPUT_BARRIER_ID2, blockSize);
+        }
+      }
+      if (valueSize == 256) {
+        real B_r[frameSize];
+        if (i == 0) {
+#pragma unroll
+          for (int n = 0; n < frameSize; n++) {
+            B_r[n] = weight[n * valueSize + index];
+          }
+        }
+        real sum = 0.0f;
+        for (int n = 0; n < frameSize; n++) {
+          sum += shOutput[n] * B_r[n];
+        }
+        value += sum;
+      }
+    }
+  }
+}
+
+void hl_lstm_parallel_forward(real *gateValue,
+                              real *stateValue,
+                              real *preOutputValue,
+                              real *outputValue,
+                              real *checkIg,
+                              real *checkFg,
+                              real *checkOg,
+                              real *weight,
+                              const int *sequence,
+                              int frameSize,
+                              int numSequences,
+                              bool reversed,
+                              hl_activation_mode_t active_node,
+                              hl_activation_mode_t active_gate,
+                              hl_activation_mode_t active_state) {
+  CHECK(frameSize == 32 || frameSize == 64);
+  dim3 grid(numSequences, 1);
+  if (!reversed) {
+    if (frameSize == 32) {
+      KeLstmForward<128, 32, 0, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          stateValue,
+          outputValue,
+          preOutputValue,
+          checkIg,
+          checkFg,
+          checkOg,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    } else if (frameSize == 64) {
+      KeLstmForward<256, 64, 0, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          stateValue,
+          outputValue,
+          preOutputValue,
+          checkIg,
+          checkFg,
+          checkOg,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    }
+  } else {
+    if (frameSize == 32) {
+      KeLstmForward<128, 32, 1, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          stateValue,
+          outputValue,
+          preOutputValue,
+          checkIg,
+          checkFg,
+          checkOg,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    } else if (frameSize == 64) {
+      KeLstmForward<256, 64, 1, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          stateValue,
+          outputValue,
+          preOutputValue,
+          checkIg,
+          checkFg,
+          checkOg,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    }
+  }
+  CHECK_SYNC("hl_lstm_parallel_forward failed");
+}
+
+__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
+  const int warp_size = 32;
+  int addr = idx % warp_size;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, addr < warp_size);
+#pragma unroll
+  for (int k = 1; k < 32; k++) {
+    // rSrc[k] = __shfl_sync(rSrc[k], (threadIdx.x + k) % 32, 32);
+    addr = __shfl_sync(mask, addr, (idx + 1) % 32, 32);
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
+  }
+
+#pragma unroll
+  for (int tid = 0; tid < 31; tid++) {
+    real tmp = (idx > tid) ? a[0] : a[1];
+#pragma unroll
+    for (int k = 31; k > 0; k--) {
+      a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32];
+    }
+    a[1] = tmp;
+  }
+
+  addr = (32 - idx) % 32;
+  CREATE_SHFL_MASK(mask, idx % 32 < warp_size);
+#pragma unroll
+  for (int k = 0; k < 32; k++) {
+    a[k] = __shfl_sync(mask, a[k], addr, 32);
+    addr = __shfl_sync(mask, addr, (idx + 31) % 32, 32);
+  }
+}
+
+template <int valueSize, int frameSize>
+__device__ void backward_sequence(real rGateValue,
+                                  real rOutputGrad,
+                                  real rPreOutputValue,
+                                  real &rGateGrad,
+                                  real &rStateGrad,
+                                  real *shStateGrad,
+                                  real *shStateValue,
+                                  real *shGateValue,
+                                  real rCheck,
+                                  real &rGateValuePrev,
+                                  int index,
+                                  t_backward activeNode,
+                                  t_backward activeGate,
+                                  t_backward activeState) {
+  const int frameIdx = index % frameSize;
+  const int frameIdy = index / frameSize;
+  if (frameIdy == 3) {
+    real rPrevOutputGrad;
+    rPrevOutputGrad = rOutputGrad * rGateValue;
+    rStateGrad = activeState(rPrevOutputGrad, rPreOutputValue);
+    rGateGrad = rOutputGrad * rPreOutputValue;
+    rGateGrad = activeGate(rGateGrad, rGateValue);
+    rStateGrad += rGateGrad * rCheck;
+    shStateGrad[index] = rStateGrad;
+    ptx_arrive(3, valueSize);
+  } else if (frameIdy == 1) {
+    shGateValue[frameIdx + frameSize] = rGateValue;
+    rStateGrad = rGateGrad * rCheck;
+    shStateGrad[index] = rStateGrad;
+    ptx_sync(3, valueSize);
+    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
+    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
+    rGateGrad = rStateGrad * shGateValue[frameIdx];
+    rGateGrad = activeGate(rGateGrad, rGateValue);
+  } else if (frameIdy == 2) {
+    rStateGrad = rStateGrad * rGateValuePrev;
+    rStateGrad += rGateGrad * rCheck;
+    shStateGrad[index] = rStateGrad;
+    ptx_sync(3, valueSize);
+    rStateGrad += shStateGrad[frameIdx + frameSize];
+    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
+    rGateValuePrev = rGateValue;
+    rGateGrad = rStateGrad * shStateValue[frameIdx];
+    rGateGrad = activeGate(rGateGrad, rGateValue);
+  } else if (frameIdy == 0) {
+    shGateValue[frameIdx] = rGateValue;
+    ptx_sync(3, valueSize);
+    rStateGrad = shStateGrad[frameIdx + frameSize];
+    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
+    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
+    rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize];
+    rGateGrad = activeNode(rGateGrad, rGateValue);
+  }
+}
+
+template <int valueSize, int frameSize>
+__device__ void load_weight(real rWeight[], real *weight, const int index) {
+  if (valueSize == 128) {
+    weight += index;
+#pragma unroll
+    for (int n = 0; n < frameSize; n++) {
+      rWeight[n] = weight[n * valueSize];
+    }
+    transpose_32x32(rWeight, index % 32);
+  }
+  if (valueSize == 256) {
+    int id = (index / 32) % 2;
+    weight += index - id * 32 + id * 32 * valueSize;
+#pragma unroll
+    for (int n = 0; n < 32; n++) {
+      rWeight[n] = weight[n * valueSize];
+      rWeight[n + 32] = weight[n * valueSize + 32];
+    }
+    transpose_32x32(rWeight, index % 32);
+    transpose_32x32(&rWeight[32], index % 32);
+  }
+}
+
+template <int valueSize, int frameSize, int reversed>
+__global__ void KeLstmBackward(real *gateValue,
+                               real *gateGrad,
+                               real *stateValue,
+                               real *stateGrad, /* do not need save */
+                               real *preOutputValue,
+                               real *preOutputGrad, /* do not need save */
+                               real *checkIg,
+                               real *checkIgGrad,
+                               real *checkFg,
+                               real *checkFgGrad,
+                               real *checkOg,
+                               real *checkOgGrad,
+                               real *outputGrad,
+                               real *weightValue,
+                               const int *starts,
+                               hl_activation_mode_t active_node,
+                               hl_activation_mode_t active_gate,
+                               hl_activation_mode_t active_state) {
+  __shared__ real shGateValue[valueSize];
+  __shared__ real shStateGrad[valueSize];
+  __shared__ real shStateValue[frameSize];
+  __shared__ real shGateGrad[4][frameSize];
+  __shared__ real shOutputGrad[4][frameSize];
+  const int index = threadIdx.x;
+  int start = starts[blockIdx.x];
+  int length = starts[blockIdx.x + 1] - start;
+
+  const int frameIdx = index % frameSize;
+  const int frameIdy = index / frameSize;
+  real rCheck;
+  real rCheckGrad;
+  real rGateGrad;
+  real rStateGrad;
+  real rGateValuePrev;
+  real rPreOutputValue;
+  real rOutputGrad;
+  real rGateValue;
+  real rStateValue;
+
+  frameValue frameGateValue(gateValue);
+  frameValue frameGateGrad(gateGrad);
+  frameValue framePreOutputValue(preOutputValue);
+  frameValue frameStateValue(stateValue);
+  frameValue frameOutputGrad(outputGrad);
+  if (frameIdy == 0) {
+  } else if (frameIdy == 1) {
+    rCheck = checkIg[frameIdx];
+  } else if (frameIdy == 2) {
+    rCheck = checkFg[frameIdx];
+    rGateValuePrev = 0.0;
+    rStateGrad = 0.0;
+  } else if (frameIdy == 3) {
+    rCheck = checkOg[frameIdx];
+    framePreOutputValue.init<!reversed, frameSize>(start, length, frameIdx);
+    frameOutputGrad.init<!reversed, frameSize>(start, length, frameIdx);
+    rOutputGrad = frameOutputGrad.getValue();
+    rPreOutputValue = framePreOutputValue.getValue();
+    frameStateValue.init<!reversed, frameSize>(start, length, frameIdx);
+    rStateValue = frameStateValue.getValue();
+  }
+
+  frameGateValue.init<!reversed, valueSize>(start, length, index);
+  frameGateGrad.init<!reversed, valueSize>(start, length, index);
+  rGateValue = frameGateValue.getValue();
+  rGateGrad = 0.0;
+  rCheckGrad = 0.0;
+
+  real B_r[frameSize];
+  load_weight<valueSize, frameSize>(B_r, weightValue, index);
+
+  for (int i = 0; i < length; ++i) {
+    if (frameIdy == 3) {
+      if (i != length - 1) {
+        frameStateValue.nextFrame<!reversed, frameSize>();
+        shStateValue[frameIdx] = frameStateValue.getValue();
+      } else {
+        shStateValue[frameIdx] = 0.0;
+      }
+    }
+    backward_sequence<valueSize, frameSize>(rGateValue,
+                                            rOutputGrad,
+                                            rPreOutputValue,
+                                            rGateGrad,
+                                            rStateGrad,
+                                            shStateGrad,
+                                            shStateValue,
+                                            shGateValue,
+                                            rCheck,
+                                            rGateValuePrev,
+                                            index,
+                                            hppl::gpu::backward[active_node],
+                                            hppl::gpu::backward[active_gate],
+                                            hppl::gpu::backward[active_state]);
+    if (frameIdy == 3) {
+      rCheckGrad += rGateGrad * rStateValue;
+      rStateValue = shStateValue[frameIdx];
+    }
+
+    frameGateGrad.setValue(rGateGrad);
+    frameGateGrad.nextFrame<!reversed, valueSize>();
+
+    if (i != length - 1) {
+      if (frameIdy == 3) {
+        framePreOutputValue.nextFrame<!reversed, frameSize>();
+        rPreOutputValue = framePreOutputValue.getValue();
+        frameOutputGrad.nextFrame<!reversed, frameSize>();
+        rOutputGrad = frameOutputGrad.getValue();
+      } else if (frameIdy == 2) {
+        rCheckGrad += rGateGrad * shStateValue[frameIdx];
+      } else if (frameIdy == 1) {
+        rCheckGrad += rGateGrad * shStateValue[frameIdx];
+      }
+
+      frameGateValue.nextFrame<!reversed, valueSize>();
+      rGateValue = frameGateValue.getValue();
+      shGateGrad[frameIdy][frameIdx] = rGateGrad;
+      if (valueSize == 128) {
+        real sum = 0.0f;
+#pragma unroll
+        for (int n = 0; n < frameSize; n++) {
+          sum += shGateGrad[frameIdy][n] * B_r[n];
+        }
+        if (frameIdy == 3) {
+          rOutputGrad += sum;
+        } else {
+          shOutputGrad[frameIdy][frameIdx] = sum;
+        }
+      }
+      if (valueSize == 256) {
+        ptx_sync(5, valueSize);
+        real A_r[frameSize];
+        for (int n = 0; n < frameSize; n++) {
+          A_r[n] = shGateGrad[frameIdy][n];
+        }
+        real sum = 0.0f;
+        for (int n = 0; n < frameSize; n++) {
+          sum += A_r[n] * B_r[n];
+        }
+        if (frameIdy == 3) {
+          rOutputGrad += sum;
+        } else {
+          shOutputGrad[frameIdy][frameIdx] = sum;
+        }
+      }
+
+      if (frameIdy == 3) {
+        ptx_sync(6, valueSize);
+#pragma unroll
+        for (int i = 0; i < 3; i++) {
+          rOutputGrad += shOutputGrad[i][frameIdx];
+        }
+      } else {
+        ptx_arrive(6, valueSize);
+      }
+    }
+  }
+
+  /* TODO: Temporary save & merger in another kernel */
+  if (frameIdy == 1) {
+    if (checkIgGrad)
+      paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad);
+  } else if (frameIdy == 2) {
+    if (checkFgGrad)
+      paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad);
+  } else if (frameIdy == 3) {
+    if (checkOgGrad)
+      paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad);
+  }
+}
+
+void hl_lstm_parallel_backward_data(real *gateValue,
+                                    real *gateGrad,
+                                    real *stateValue,
+                                    real *stateGrad,
+                                    real *preOutputValue,
+                                    real *preOutputGrad,
+                                    real *outputGrad,
+                                    real *checkIg,
+                                    real *checkIgGrad,
+                                    real *checkFg,
+                                    real *checkFgGrad,
+                                    real *checkOg,
+                                    real *checkOgGrad,
+                                    real *weight,
+                                    const int *sequence,
+                                    int frameSize,
+                                    int numSequences,
+                                    bool reversed,
+                                    hl_activation_mode_t active_node,
+                                    hl_activation_mode_t active_gate,
+                                    hl_activation_mode_t active_state) {
+  CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
+        frameSize == 256);
+  dim3 grid(numSequences, 1);
+  if (!reversed) {
+    if (frameSize == 32) {
+      KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    } else if (frameSize == 64) {
+      KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    } else if (frameSize == 128) {
+      KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    } else if (frameSize == 256) {
+      KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    }
+  } else {
+    if (frameSize == 32) {
+      KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    } else if (frameSize == 64) {
+      KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    } else if (frameSize == 128) {
+      KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    } else if (frameSize == 256) {
+      KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
+    }
+  }
+  CHECK_SYNC("hl_lstm_parallel_backward_data");
+}
+
+template <int B_X, int B_Y>
+__global__ void KeSetGradZero(real *gateGrad,
+                              const int *starts,
+                              int valueSize,
+                              int numSequences,
+                              bool reversed) {
+  // const int tid = threadIdx.x;
+
+  const int frameIdx = blockIdx.x * B_X + threadIdx.x;
+  const int numSeqId = blockIdx.y * B_Y + threadIdx.y;
+
+  if (numSeqId >= numSequences || frameIdx >= valueSize) return;
+
+  if (!reversed) {
+    int seqId = starts[numSeqId];
+    gateGrad[seqId * valueSize + frameIdx] = 0.0;
+  } else {
+    int seqId = starts[numSeqId + 1] - 1;
+    gateGrad[seqId * valueSize + frameIdx] = 0.0;
+  }
+}
+
+void hl_lstm_parallel_backward_weight(real *weightGrad,
+                                      real *outputValue,
+                                      real *gateGrad,
+                                      const int *sequence,
+                                      int frameSize,
+                                      int batchSize,
+                                      int numSequences,
+                                      bool reversed) {
+  int valueSize = 4 * frameSize;
+  dim3 threads(32, 32);
+  dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32);
+  KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      gateGrad, sequence, valueSize, numSequences, reversed);
+
+  if (!reversed) {
+    hl_matrix_mul(outputValue,
+                  HPPL_OP_T,
+                  gateGrad + valueSize,
+                  HPPL_OP_N,
+                  weightGrad,
+                  frameSize,
+                  valueSize,
+                  batchSize - 1,
+                  1.0,
+                  1.0);
+  } else {
+    hl_matrix_mul(outputValue + frameSize,
+                  HPPL_OP_T,
+                  gateGrad,
+                  HPPL_OP_N,
+                  weightGrad,
+                  frameSize,
+                  valueSize,
+                  batchSize - 1,
+                  1.0,
+                  1.0);
+  }
+  CHECK_SYNC("hl_lstm_parallel_backward_weight");
+}
diff --git a/paddle/legacy/cuda/src/hl_cuda_matrix.cu b/paddle/legacy/cuda/src/hl_cuda_matrix.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6fe460026bbd404e15b43bd221551094a7abeda2
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_cuda_matrix.cu
@@ -0,0 +1,806 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "hl_device_functions.cuh"
+#include "hl_gpu_matrix_kernel.cuh"
+#include "hl_matrix.h"
+#include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
+#include "hl_sequence.h"
+#include "hl_sparse.ph"
+#include "paddle/legacy/utils/Logging.h"
+
+DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
+void hl_matrix_add(real* A_d,
+                   real* B_d,
+                   real* C_d,
+                   int dimM,
+                   int dimN,
+                   real alpha,
+                   real beta) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+  CHECK_NOTNULL(C_d);
+
+  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
+      ternary::_add<real>(alpha, beta),
+      A_d,
+      B_d,
+      C_d,
+      dimM,
+      dimN,
+      dimN,
+      dimN,
+      dimN);
+  CHECK_SYNC("hl_matrix_add failed");
+}
+
+#ifdef PADDLE_TYPE_DOUBLE
+#define THRESHOLD 128
+#else
+#define THRESHOLD 64
+#endif
+__device__ __forceinline__ void findMax(real* I,
+                                        real* dfMax_s,
+                                        int blockSize,
+                                        int base,
+                                        int curIdx,
+                                        int nextIdx,
+                                        int dimN,
+                                        real* max) {
+  dfMax_s[base] = -1.0e20;
+  while (curIdx < dimN) {
+    if (dfMax_s[base] < I[nextIdx]) {
+      dfMax_s[base] = I[nextIdx];
+    }
+    nextIdx += blockSize;
+    curIdx += blockSize;
+  }
+  __syncthreads();
+
+  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
+    __syncthreads();
+    if (base < stride) {
+      nextIdx = base + stride;
+      if (dfMax_s[base] < dfMax_s[nextIdx]) {
+        dfMax_s[base] = dfMax_s[nextIdx];
+      }
+    }
+  }
+
+  if (0 == base) {
+    max[0] = dfMax_s[0];
+  }
+  __syncthreads();
+}
+
+__device__ __forceinline__ void subMaxAndExp(real* I,
+                                             real* O,
+                                             int curIdx,
+                                             int nextIdx,
+                                             int blockSize,
+                                             int dimN,
+                                             real max) {
+  real val;
+  while (curIdx < dimN) {
+    val = I[nextIdx] - max;
+    if (val < -THRESHOLD) {
+      val = -THRESHOLD;
+    }
+    I[nextIdx] = val;
+#ifndef PADDLE_TYPE_DOUBLE
+    O[nextIdx] = __expf(val);
+#else
+    O[nextIdx] = exp(val);
+#endif
+    nextIdx += blockSize;
+    curIdx += blockSize;
+  }
+  __syncthreads();
+}
+
+__device__ __forceinline__ void valueSum(real* O,
+                                         real* dfMax_s,
+                                         int blockSize,
+                                         int base,
+                                         int curIdx,
+                                         int nextIdx,
+                                         int dimN) {
+  dfMax_s[base] = 0;
+  while (curIdx < dimN) {
+    dfMax_s[base] += O[nextIdx];
+    nextIdx += blockSize;
+    curIdx += blockSize;
+  }
+  __syncthreads();
+
+  for (int stride = blockSize >> 1; stride > 0; stride >>= 1) {
+    __syncthreads();
+    if (base < stride) {
+      nextIdx = base + stride;
+      dfMax_s[base] += dfMax_s[nextIdx];
+    }
+  }
+  __syncthreads();
+}
+
+__device__ __forceinline__ void divSum(
+    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
+  while (curIdx < dimN) {
+    O[nextIdx] /= sum;
+    nextIdx += blockSize;
+    curIdx += blockSize;
+  }
+}
+
+__device__ __forceinline__ void softmax(real* I,
+                                        real* O,
+                                        real* dfMax_s,
+                                        int blockSize,
+                                        int base,
+                                        int curIdx,
+                                        int nextIdx,
+                                        int dimN) {
+  __shared__ real max;
+
+  // find the max number
+  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
+
+  // sub max Value and do Exp operation
+  subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
+
+  // add dimN values into blockDim.x buffer
+  // sum is in dfMax_s[0]
+  valueSum(O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
+
+  // divided by sum
+  divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
+}
+
+template <int blockSize>
+__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
+  int base = threadIdx.x;
+  __shared__ real dfMax_s[blockSize];
+  int nextIdx = blockIdx.x * dimN + base;
+  int curIdx = base;
+
+  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
+}
+
+void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  dim3 block(512, 1);
+  dim3 grid(dimM, 1);
+  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
+  CHECK_SYNC("hl_matrix_softmax failed");
+}
+
+template <int blockSize>
+__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
+  int base = threadIdx.x;
+  int bid = blockIdx.x;
+  __shared__ real dfMax_s[blockSize];
+
+  int start = index[bid];
+  int dimN = index[bid + 1] - start;
+
+  int nextIdx = start + base;
+  int curIdx = base;
+
+  softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
+}
+
+void hl_sequence_softmax_forward(real* A_d,
+                                 real* C_d,
+                                 const int* index,
+                                 int numSequence) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  dim3 block(512, 1);
+  dim3 grid(numSequence, 1);
+  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
+  CHECK_SYNC("hl_sequence_softmax_forward failed");
+}
+
+__global__ void KeMatrixDerivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int index;
+
+  if (rowIdx < dimM && colIdx < dimN) {
+    index = rowIdx * dimN + colIdx;
+    grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
+  }
+}
+
+void hl_matrix_softmax_derivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
+  CHECK_NOTNULL(grad_d);
+  CHECK_NOTNULL(output_d);
+  CHECK_NOTNULL(sftmaxSum_d);
+
+  int blocksX = (dimM + 0) / 1;
+  int blocksY = (dimN + 1024 - 1) / 1024;
+  dim3 threads(1, 1024);
+  dim3 grid(blocksX, blocksY);
+
+  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, sftmaxSum_d, dimM, dimN);
+  CHECK_SYNC("hl_matrix_softmax_derivative failed");
+}
+
+__global__ void KeMatrixMultiBinaryCrossEntropy(
+    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < dimM) {
+    for (int i = 0; i < dimN; i++) {
+      entropy[index] -= log(1 - output[index * dimN + i]);
+    }
+    int* row_col = col + row[index];
+    int col_num = row[index + 1] - row[index];
+    for (int i = 0; i < col_num; i++) {
+      real o = output[index * dimN + row_col[i]];
+      entropy[index] -= log(o / (1 - o));
+    }
+  }
+}
+
+void hl_matrix_multi_binary_cross_entropy(real* output,
+                                          real* entropy,
+                                          hl_sparse_matrix_s csr_mat,
+                                          int dimM,
+                                          int dimN) {
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(entropy);
+  CHECK_NOTNULL(csr_mat);
+  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
+  int n_threads = 1024;
+  int blocks = (dimM + n_threads - 1) / n_threads;
+  dim3 threads(n_threads);
+  dim3 grid(blocks);
+  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
+  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
+  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
+}
+
+__global__ void KeMatrixMultiBinaryCrossEntropyBp(
+    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
+  int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (row_idx < dimM) {
+    for (int i = 0; i < dimN; i++) {
+      int index = row_idx * dimN + i;
+      grad[index] += 1.0 / (1 - output[index]);
+    }
+    int col_num = row[row_idx + 1] - row[row_idx];
+    int* row_col = col + row[row_idx];
+    for (int i = 0; i < col_num; i++) {
+      int index = row_idx * dimN + row_col[i];
+      grad[index] -= 1.0 / (output[index] * (1 - output[index]));
+    }
+  }
+}
+
+void hl_matrix_multi_binary_cross_entropy_bp(
+    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(grad);
+  CHECK_NOTNULL(csr_mat);
+  CHECK_EQ(csr_mat->format, HL_SPARSE_CSR);
+  int n_threads = 1024;
+  int blocks = (dimM + n_threads - 1) / n_threads;
+  dim3 threads(n_threads);
+  dim3 grid(blocks);
+  hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
+  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
+  CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
+}
+
+__global__ void KeMatrixCrossEntropy(
+    real* O, real* E, int* label, int dimM, int dimN) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int newBase;
+  if (index < dimM) {
+    newBase = label[index];
+    newBase = newBase % dimN;
+    E[index] = -log(O[index * dimN + newBase]);
+  }
+}
+
+void hl_matrix_cross_entropy(
+    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+
+  int blocks = (dimM + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, C_d, label_d, dimM, dimN);
+  CHECK_SYNC("hl_matrix_cross_entropy failed");
+}
+
+__global__ void KeMatrixCrossEntropyBp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int index;
+  if (rowIdx < dimM && colIdx < dimN) {
+    index = rowIdx * dimN + colIdx;
+    if (label_d[rowIdx] == colIdx) {
+      grad_d[index] -= 1.0f / output_d[index];
+    }
+  }
+}
+
+void hl_matrix_cross_entropy_bp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
+  CHECK_NOTNULL(grad_d);
+  CHECK_NOTNULL(output_d);
+  CHECK_NOTNULL(label_d);
+
+  int blocksX = (dimM + 0) / 1;
+  int blocksY = (dimN + 1024 - 1) / 1024;
+  dim3 threads(1, 1024);
+  dim3 grid(blocksX, blocksY);
+  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, label_d, dimM, dimN);
+  CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
+}
+
+void hl_matrix_zero_mem(real* data, int num) {
+  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
+}
+
+__global__ void KeParamReluForward(real* output,
+                                   real* input,
+                                   real* w,
+                                   int width,
+                                   int height,
+                                   int partial_sum) {
+  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+  int ty = blockIdx.y * blockDim.y + threadIdx.y;
+  if (tx < width && ty < height) {
+    int index = ty * width + tx;
+    output[index] =
+        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
+  }
+}
+
+void hl_param_relu_forward(real* output,
+                           real* input,
+                           real* w,
+                           int width,
+                           int height,
+                           int partial_sum) {
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(w);
+  dim3 threads(16, 16);
+  int blockX = (width + 16 - 1) / 16;
+  int blockY = (height + 16 - 1) / 16;
+  dim3 grid(blockX, blockY);
+  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, input, w, width, height, partial_sum);
+  CHECK_SYNC("hl_param_relu_forward failed");
+}
+
+template <int blockSize>
+__global__ void KeParamReluBackWardW(real* grad_w,
+                                     real* grad_o,
+                                     real* input,
+                                     int width,
+                                     int height,
+                                     int partial_sum) {
+  const int tid = threadIdx.x;
+  __shared__ real temp[blockSize];
+  grad_o += partial_sum * blockIdx.x;
+  input += partial_sum * blockIdx.x;
+  real tmp = 0.0;
+  for (int index = tid; index < partial_sum * height; index += blockSize) {
+    int row = index / partial_sum;
+    int offset = row * width + (index - row * partial_sum);
+    if (input[offset] < 0) {
+      tmp += grad_o[offset] * input[offset];
+    }
+  }
+  temp[tid] = tmp;
+  __syncthreads();
+  for (int s = blockSize / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      temp[tid] += temp[tid + s];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    grad_w[blockIdx.x] += temp[0];
+  }
+}
+
+void hl_param_relu_backward_w(real* grad_w,
+                              real* grad_o,
+                              real* input,
+                              int width,
+                              int height,
+                              int partial_sum) {
+  CHECK_NOTNULL(grad_w);
+  CHECK_NOTNULL(grad_o);
+  CHECK_NOTNULL(input);
+  const int blockSize = 1024;
+  int grid_num = width / partial_sum;
+  dim3 threads(blockSize, 1);
+  dim3 grid(grid_num, 1);
+  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_w, grad_o, input, width, height, partial_sum);
+  CHECK_SYNC("hl_param_relu_backward_w failed");
+}
+
+__global__ void KeParamReluBackwardDiff(real* grad_o,
+                                        real* input,
+                                        real* w,
+                                        real* diff,
+                                        int width,
+                                        int height,
+                                        int partial_sum) {
+  int tx = blockIdx.x * blockDim.x + threadIdx.x;
+  int ty = blockIdx.y * blockDim.y + threadIdx.y;
+  if (tx < width && ty < height) {
+    int index = ty * width + tx;
+    diff[index] += grad_o[index] * (input[index] > 0 ? 1 : w[tx / partial_sum]);
+  }
+}
+
+void hl_param_relu_backward_diff(real* grad_o,
+                                 real* data,
+                                 real* w,
+                                 real* diff,
+                                 int width,
+                                 int height,
+                                 int partial_sum) {
+  CHECK_NOTNULL(grad_o);
+  CHECK_NOTNULL(data);
+  CHECK_NOTNULL(w);
+  CHECK_NOTNULL(diff);
+  dim3 threads(16, 16);
+  int blockX = (width + 16 - 1) / 16;
+  int blockY = (height + 16 - 1) / 16;
+  dim3 grid(blockX, blockY);
+  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_o, data, w, diff, width, height, partial_sum);
+  CHECK_SYNC("hl_param_relu_backward_diff failed");
+}
+
+__global__ void KeMatrixAddSharedBias(
+    real* A, real* B, const int channel, const int M, const int N, real scale) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int dim = N / channel;
+  if (index < M * N) {
+    int i = index % N;
+    i = i / dim;
+    A[index] += scale * B[i];
+  }
+}
+
+void hl_matrix_add_shared_bias(real* A_d,
+                               real* B_d,
+                               const int channel,
+                               const int dimM,
+                               const int dimN,
+                               real scale) {
+  const int blocks = 512;
+  const int grids = DIVUP(dimM * dimN, blocks);
+  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      A_d, B_d, channel, dimM, dimN, scale);
+  CHECK_SYNC("hl_matrix_add_shared_bias failed");
+}
+
+template <int blockSize>
+__global__ void KeMatrixCollectSharedBias(real* B,
+                                          real* A,
+                                          const int channel,
+                                          const int M,
+                                          const int N,
+                                          const int dim,
+                                          const int limit,
+                                          real scale) {
+  if (dim < limit) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index < channel) {
+      real sum = 0.0;
+      for (int i = 0; i < M; ++i) {
+        for (int j = 0; j < dim; ++j) {
+          sum += A[i * N + index * dim + j];
+        }
+      }
+      B[index] += scale * sum;
+    }
+  } else {
+    const int tid = threadIdx.x;
+    const int bid = blockIdx.x;
+    __shared__ real smem[blockSize];
+    real sum = 0.0;
+    for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
+      int n = j * blockSize + tid;
+      int m = n / dim;
+      int w = n % dim;
+      smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
+      __syncthreads();
+      simpleReduce(smem, tid, blockSize);
+      sum += smem[0];
+    }
+    if (tid == 0) {
+      B[bid] += scale * sum;
+    }
+  }
+}
+
+void hl_matrix_collect_shared_bias(real* B_d,
+                                   real* A_d,
+                                   const int channel,
+                                   const int dimM,
+                                   const int dimN,
+                                   real scale) {
+  const int dim = dimN / channel;
+  const int blocks = 256;
+  const int limit = 64;
+  int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
+
+  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
+  CHECK_SYNC("hl_matrix_collect_shared_bias failed");
+}
+
+__global__ void keMatrixRotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < dimM * dimN) {
+    int i = idx / dimN;
+    int j = idx % dimN;
+    if (clockWise) {
+      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
+    } else {
+      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
+    }
+  }
+}
+
+void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
+  CHECK_NOTNULL(mat);
+  CHECK_NOTNULL(matRot);
+  const int threads = 512;
+  const int blocks = DIVUP(dimM * dimN, threads);
+  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
+      mat, matRot, dimM, dimN, clockWise);
+  CHECK_SYNC("hl_matrix_rotate failed");
+}
+
+__global__ void keMatrixVol2Col(int num_kernels,
+                                const real* dataSrc,
+                                real* dataDst,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int d_out = (index / width_col / height_col) % depth_col;
+    int channel_in = index / width_col / height_col / depth_col;
+    int channel_out = channel_in * filterD * filterH * filterW;
+    int w_in = w_out * strideW - paddingW;
+    int h_in = h_out * strideH - paddingH;
+    int d_in = d_out * strideD - paddingD;
+
+    dataDst +=
+        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
+        w_out;
+    dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+    for (int k = 0; k < filterD; ++k) {
+      for (int i = 0; i < filterH; ++i) {
+        for (int j = 0; j < filterW; ++j) {
+          int d = d_in + k;
+          int h = h_in + i;
+          int w = w_in + j;
+          *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
+                      w < width)
+                         ? dataSrc[(k * height + i) * width + j]
+                         : 0;
+          dataDst += depth_col * height_col * width_col;
+        }
+      }
+    }
+  }
+}
+
+void hl_matrix_vol2Col(const real* dataSrc,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real* dataDst) {
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth_col * height_col * width_col;
+
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+
+  keMatrixVol2Col<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          dataSrc,
+                                                          dataDst,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          depth_col,
+                                                          height_col,
+                                                          width_col);
+  CHECK_SYNC("hl_matrix_vol2Col failed");
+}
+
+__global__ void keMatrixCol2Vol(int num_kernels,
+                                real* dataDst,
+                                const real* dataSrc,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col,
+                                real alpha,
+                                real beta) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    real srcVal = 0;
+    real dstVal = dataDst[index];
+    int w = index % width + paddingW;
+    int h = (index / width) % height + paddingH;
+    int d = (index / width / height) % depth + paddingD;
+    int c = index / width / height / depth;
+    // compute the start and end of the output
+    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
+    int w_col_end = min(w / strideW + 1, width_col);
+    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
+    int h_col_end = min(h / strideH + 1, height_col);
+    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
+    int d_col_end = min(d / strideD + 1, depth_col);
+
+    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
+                  h * filterW + w) *
+                 depth_col * height_col * width_col;
+
+    int coeff_d_col =
+        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
+    int coeff_h_col =
+        (1 - strideH * filterW * depth_col * height_col) * width_col;
+    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
+
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
+                            w_col * coeff_w_col];
+        }
+      }
+    }
+    dataDst[index] = alpha * srcVal + beta * dstVal;
+  }
+}
+
+void hl_matrix_col2Vol(real* dataDst,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       const real* dataSrc,
+                       real alpha,
+                       real beta) {
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth * height * width;
+
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+
+  keMatrixCol2Vol<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          dataDst,
+                                                          dataSrc,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          depth_col,
+                                                          height_col,
+                                                          width_col,
+                                                          alpha,
+                                                          beta);
+
+  CHECK_SYNC("hl_matrix_col2Vol failed");
+}
+
+__global__ void keVectorCast2Int(int* out, real* vec, int size) {
+  for (int i = threadIdx.x; i < (size); i += blockDim.x) {
+    out[i] = int(vec[i]);
+  }
+}
+
+void hl_vector_cast2int(int* out, real* vec, int size) {
+  keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size);
+  CHECK_SYNC("hl_vector_cast2int failed");
+}
diff --git a/paddle/legacy/cuda/src/hl_cuda_sequence.cu b/paddle/legacy/cuda/src/hl_cuda_sequence.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1d772b5ce27615673d85231ec8fd3ab1d0aed523
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_cuda_sequence.cu
@@ -0,0 +1,408 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "hl_device_functions.cuh"
+#include "paddle/legacy/utils/Logging.h"
+
+__global__ void KeMaxSequenceForward(real* input,
+                                     const int* sequence,
+                                     real* output,
+                                     int* index,
+                                     int numSequences,
+                                     int dim) {
+  int dimIdx = threadIdx.x;
+  int sequenceId = blockIdx.x;
+  if (sequenceId >= numSequences) return;
+  int start = sequence[sequenceId];
+  int end = sequence[sequenceId + 1];
+
+  for (int i = dimIdx; i < dim; i += blockDim.x) {
+    real tmp = -HL_FLOAT_MAX;
+    int tmpId = -1;
+    for (int insId = start; insId < end; insId++) {
+      if (tmp < input[insId * dim + i]) {
+        tmp = input[insId * dim + i];
+        tmpId = insId;
+      }
+    }
+    output[sequenceId * dim + i] = tmp;
+    index[sequenceId * dim + i] = tmpId;
+  }
+}
+
+void hl_max_sequence_forward(real* input,
+                             const int* sequence,
+                             real* output,
+                             int* index,
+                             int numSequences,
+                             int dim) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(index);
+
+  dim3 threads(256, 1);
+  dim3 grid(numSequences, 1);
+  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      input, sequence, output, index, numSequences, dim);
+  CHECK_SYNC("hl_max_sequence_forward failed");
+}
+
+__global__ void KeMaxSequenceBackward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int colIdx = idx % dim;
+  if (idx < numSequences * dim) {
+    int insId = index[idx];
+    inputGrad[insId * dim + colIdx] += outputGrad[idx];
+  }
+}
+
+void hl_max_sequence_backward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
+  CHECK_NOTNULL(outputGrad);
+  CHECK_NOTNULL(index);
+  CHECK_NOTNULL(inputGrad);
+
+  unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
+  dim3 threads(128, 1);
+  dim3 grid(blocks, 1);
+  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      outputGrad, index, inputGrad, numSequences, dim);
+  CHECK_SYNC("hl_max_sequence_backward failed");
+}
+
+template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
+__global__ void KeMatrixAddRows(real* output,
+                                real* table,
+                                int* ids,
+                                int numSamples,
+                                int tableSize,
+                                int dim) {
+  int idx = threadIdx.x;
+  int idy = threadIdx.y;
+  int sampleId = blockIdx.x + idy * gridDimX;
+
+  while (sampleId < numSamples) {
+    int tableId = ids[sampleId];
+    if ((0 <= tableId) && (tableId < tableSize)) {
+      real* outputData = output + sampleId * dim;
+      real* tableData = table + tableId * dim;
+      for (int i = idx; i < dim; i += blockDimX) {
+        if (AddRow == 0) {
+          outputData[i] += tableData[i];
+        } else {
+          paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
+        }
+      }
+    }
+    sampleId += blockDimY * gridDimX;
+  }
+}
+
+template <int blockDimX,
+          int blockDimY,
+          int gridDimX,
+          bool seq2batch,
+          bool isAdd>
+__global__ void KeSequence2Batch(real* batch,
+                                 real* sequence,
+                                 const int* batchIndex,
+                                 int seqWidth,
+                                 int batchCount) {
+  int idx = threadIdx.x;
+  int idy = threadIdx.y;
+  int id = blockIdx.x + idy * gridDimX;
+  while (id < batchCount) {
+    int seqId = batchIndex[id];
+    real* batchData = batch + id * seqWidth;
+    real* seqData = sequence + seqId * seqWidth;
+    for (int i = idx; i < seqWidth; i += blockDimX) {
+      if (seq2batch) {
+        if (isAdd) {
+          batchData[i] += seqData[i];
+        } else {
+          batchData[i] = seqData[i];
+        }
+      } else {
+        if (isAdd) {
+          seqData[i] += batchData[i];
+        } else {
+          seqData[i] = batchData[i];
+        }
+      }
+    }
+    id += blockDimY * gridDimX;
+  }
+}
+
+void hl_sequence2batch_copy(real* batch,
+                            real* sequence,
+                            const int* batchIndex,
+                            int seqWidth,
+                            int batchCount,
+                            bool seq2batch) {
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(batch);
+  CHECK_NOTNULL(batchIndex);
+
+  dim3 threads(128, 8);
+  dim3 grid(8, 1);
+  if (seq2batch) {
+    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
+  } else {
+    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
+  }
+  CHECK_SYNC("hl_sequence2batch_copy failed");
+}
+
+void hl_sequence2batch_add(real* batch,
+                           real* sequence,
+                           int* batchIndex,
+                           int seqWidth,
+                           int batchCount,
+                           bool seq2batch) {
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(batch);
+  CHECK_NOTNULL(batchIndex);
+
+  dim3 threads(128, 8);
+  dim3 grid(8, 1);
+  if (seq2batch) {
+    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
+  } else {
+    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
+  }
+  CHECK_SYNC("hl_sequence2batch_add failed");
+}
+
+template <bool normByTimes, bool seq2batch>
+__global__ void KeSequence2BatchPadding(real* batch,
+                                        real* sequence,
+                                        const int* sequenceStartPositions,
+                                        const size_t sequenceWidth,
+                                        const size_t maxSequenceLength,
+                                        const size_t numSequences) {
+  int batchIdx = blockIdx.y;
+  int sequenceStart = sequenceStartPositions[batchIdx];
+  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
+
+  int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y;
+  int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth;
+  int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth;
+
+  real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
+
+  if (sequenceIdx < sequenceLength) {
+    if (seq2batch) {
+      /* sequence -> batch */
+      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
+        batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i];
+      }
+    } else {
+      /* batch -> sequence */
+      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
+        sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i];
+      }
+    }
+  } else if (sequenceIdx < maxSequenceLength) {
+    if (seq2batch) {
+      /* sequence -> batch */
+      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
+        batch[batchBaseIdx + i] = 0;
+      }
+    }
+  }
+}
+
+void hl_sequence2batch_copy_padding(real* batch,
+                                    real* sequence,
+                                    const int* sequenceStartPositions,
+                                    const size_t sequenceWidth,
+                                    const size_t maxSequenceLength,
+                                    const size_t numSequences,
+                                    bool normByTimes,
+                                    bool seq2batch) {
+  CHECK_NOTNULL(batch);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(sequenceStartPositions);
+
+  if (!normByTimes && numSequences == 1) {
+    size_t elementCount = maxSequenceLength * sequenceWidth;
+    if (seq2batch) {
+      /* sequence -> batch */
+      hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount);
+    } else {
+      /* batch -> sequence */
+      hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount);
+    }
+    return;
+  }
+
+  const int CUDA_BLOCK_SIZE = 512;
+
+  /* At least use 32 threads to copy sequenceWidth elements,
+     and at least 8 elements for each thread. */
+  int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5;
+  blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE;
+
+  int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
+  dim3 threads(blockDimX, blockDimY);
+
+  int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY;
+  int gridDimY = numSequences;
+  dim3 grid(gridDimX, gridDimY);
+
+  if (seq2batch) {
+    /* sequence -> batch */
+    if (normByTimes) {
+      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
+    } else {
+      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
+    }
+  } else {
+    /* batch -> sequence */
+    if (normByTimes) {
+      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
+    } else {
+      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
+    }
+  }
+
+  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
+}
+
+__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
+
+__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
+
+__global__ void KeSequenceAvgForward(real* dst,
+                                     real* src,
+                                     const int* starts,
+                                     int height,
+                                     int width,
+                                     const int mode) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int row = gid / width;
+  int col = gid % width;
+
+  if (gid < height * width) {
+    int start = starts[row];
+    int end = starts[row + 1];
+    int seqLength = end - start;
+    if (seqLength == 0) return;
+    real sum = 0.0;
+    for (int i = start; i < end; i++) {
+      sum += src[i * width + col];
+    }
+    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
+                                       : sum * my_rsqrt((real)seqLength));
+    dst[gid] += sum;
+  }
+}
+
+void hl_sequence_avg_forward(real* dst,
+                             real* src,
+                             const int* starts,
+                             int height,
+                             int width,
+                             const int mode) {
+  CHECK_NOTNULL(dst);
+  CHECK_NOTNULL(src);
+  CHECK_NOTNULL(starts);
+
+  int block = 512;
+  int grid = DIVUP(width * height, 512);
+
+  CHECK(mode == 0 || mode == 1 || mode == 2)
+      << "mode error in hl_sequence_avg_forward!";
+
+  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
+      dst, src, starts, height, width, mode);
+  CHECK_SYNC("hl_sequence_avg_forward failed");
+}
+
+__global__ void KeSequenceAvgBackward(real* dst,
+                                      real* src,
+                                      const int* starts,
+                                      int height,
+                                      int width,
+                                      const int mode) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int row = gid / width;
+  int col = gid % width;
+
+  if (gid < height * width) {
+    int start = starts[row];
+    int end = starts[row + 1];
+    int seqLength = end - start;
+    if (seqLength == 0) return;
+    real grad = src[gid];
+    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
+                                         : grad * my_rsqrt((real)seqLength));
+    for (int i = start; i < end; i++) {
+      dst[i * width + col] += grad;
+    }
+  }
+}
+
+void hl_sequence_avg_backward(real* dst,
+                              real* src,
+                              const int* starts,
+                              int height,
+                              int width,
+                              const int mode) {
+  CHECK_NOTNULL(dst);
+  CHECK_NOTNULL(src);
+  CHECK_NOTNULL(starts);
+
+  int block = 512;
+  int grid = DIVUP(width * height, 512);
+
+  CHECK(mode == 0 || mode == 1 || mode == 2)
+      << "mode error in hl_sequence_avg_backward!";
+
+  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
+      dst, src, starts, height, width, mode);
+  CHECK_SYNC("hl_sequence_avg_backward failed");
+}
diff --git a/paddle/legacy/cuda/src/hl_cuda_sparse.cu b/paddle/legacy/cuda/src/hl_cuda_sparse.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8065a6f9f6f2ac4cacf9a63b7b80dd00391824a0
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_cuda_sparse.cu
@@ -0,0 +1,1262 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_cuda.h"
+#include "hl_cuda_sparse.cuh"
+#include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
+#include "hl_sparse.h"
+#include "hl_sparse.ph"
+#include "paddle/legacy/utils/Logging.h"
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p);
+DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
+
+void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
+                         real *C_d,
+                         int dimM,
+                         int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+  CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN);
+  CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!";
+
+  if (A_d->nnz == 0) {
+    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
+    return;
+  }
+
+  /* nnz != 0 */
+  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
+  CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row &&
+        A_d2->csr_col)
+      << "parameter transa error!";
+
+  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
+  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
+  dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X);
+  dim3 grid(blocksX, blocksY);
+
+  if (A_d->type == HL_NO_VALUE) {
+    KeSMatrixCsr2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
+  } else if (A_d->type == HL_FLOAT_VALUE) {
+    KeSMatrixCsr2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
+  } else {
+  }
+  CHECK_SYNC("hl_matrix_csr2dense failed");
+}
+
+void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
+                         real *C_d,
+                         int dimM,
+                         int dimN) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(C_d);
+  CHECK(dimM > 0 && dimN > 0 && A_d->rows == dimM && A_d->cols == dimN);
+  CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!";
+
+  if (A_d->nnz == 0) {
+    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
+    return;
+  }
+
+  /* nnz != 0 */
+  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
+  CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row &&
+        A_d2->csc_col)
+      << "parameter transa error!";
+
+  int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
+  int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
+  dim3 threads(CU_CSR2DENSE_THREAD_X, CU_CSR2DENSE_THREAD_X);
+  dim3 grid(blocksX, blocksY);
+
+  if (A_d->type == HL_NO_VALUE) {
+    KeSMatrixCsc2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
+  } else if (A_d->type == HL_FLOAT_VALUE) {
+    KeSMatrixCsc2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
+  } else {
+  }
+  CHECK_SYNC("hl_matrix_csc2dense failed");
+}
+
+void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
+                             hl_matrix_format_t format,
+                             hl_matrix_value_t value_type,
+                             int dimM,
+                             int dimN,
+                             int nnz) {
+  CHECK_NOTNULL(A_d);
+  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
+      << "sparse matrix format error!";
+  CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE)
+      << "sparse matrix value type error!";
+  /* avoid malloc 0 bytes */
+  int nnz_s = (nnz == 0 ? 1 : nnz);
+
+  if (format == HL_SPARSE_CSR) {
+    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
+
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
+    CHECK_NOTNULL(tmp);
+
+    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
+    csr->sparsity = -1.0;
+
+    if (value_type == HL_NO_VALUE) {
+      csr->csr_val = NULL;
+      csr->nnz_s = nnz_s;
+      csr->row_s = dimM + 1;
+      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
+      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
+
+      *A_d = (hl_sparse_matrix_s)tmp;
+      (*A_d)->matrix = (hl_matrix_s)csr;
+    } else if (value_type == HL_FLOAT_VALUE) {
+      csr->nnz_s = nnz_s;
+      csr->row_s = dimM + 1;
+      csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
+      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
+      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
+
+      *A_d = (hl_sparse_matrix_s)tmp;
+      (*A_d)->matrix = (hl_matrix_s)csr;
+    }
+  } else if (format == HL_SPARSE_CSC) {
+    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
+
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
+    CHECK_NOTNULL(tmp);
+
+    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
+    csc->sparsity = -1.0f;
+
+    if (value_type == HL_NO_VALUE) {
+      csc->csc_val = NULL;
+      csc->nnz_s = nnz_s;
+      csc->col_s = dimN + 1;
+      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
+      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
+
+      *A_d = (hl_sparse_matrix_s)tmp;
+      (*A_d)->matrix = (hl_matrix_s)csc;
+    } else if (value_type == HL_FLOAT_VALUE) {
+      csc->nnz_s = nnz_s;
+      csc->col_s = dimN + 1;
+      csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
+      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
+      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
+
+      *A_d = (hl_sparse_matrix_s)tmp;
+      (*A_d)->matrix = (hl_matrix_s)csc;
+    }
+  }
+
+  (*A_d)->format = format;
+  (*A_d)->type = value_type;
+  (*A_d)->rows = dimM;
+  (*A_d)->cols = dimN;
+  (*A_d)->nnz = nnz;
+}
+
+void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
+  CHECK_NOTNULL(A_d);
+  CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC)
+      << "sparse matrix format error!";
+
+  if (A_d->matrix == NULL) {
+    free(A_d);
+    return;
+  }
+
+  if (A_d->format == HL_SPARSE_CSR) {
+    hl_csr_matrix csr = (hl_csr_matrix)A_d->matrix;
+    if (csr->csr_val != NULL) {
+      hl_free_mem_device(csr->csr_val);
+      csr->csr_val = NULL;
+    }
+
+    if (csr->csr_row != NULL) {
+      hl_free_mem_device(csr->csr_row);
+      csr->csr_row = NULL;
+    }
+
+    if (csr->csr_col != NULL) {
+      hl_free_mem_device(csr->csr_col);
+      csr->csr_col = NULL;
+    }
+
+    A_d->matrix = NULL;
+    free(A_d);
+  } else if (A_d->format == HL_SPARSE_CSC) {
+    hl_csc_matrix csc = (hl_csc_matrix)A_d->matrix;
+    if (csc->csc_val != NULL) {
+      hl_free_mem_device(csc->csc_val);
+      csc->csc_val = NULL;
+    }
+
+    if (csc->csc_row != NULL) {
+      hl_free_mem_device(csc->csc_row);
+      csc->csc_row = NULL;
+    }
+
+    if (csc->csc_col != NULL) {
+      hl_free_mem_device(csc->csc_col);
+      csc->csc_col = NULL;
+    }
+
+    A_d->matrix = NULL;
+    free(A_d);
+  }
+}
+
+void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
+                                void *dest_d,
+                                size_t size,
+                                hl_matrix_format_t format,
+                                hl_matrix_value_t value_type,
+                                int dimM,
+                                int dimN,
+                                int nnz) {
+  CHECK_NOTNULL(A_d);
+  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
+      << "sparse matrix format error!";
+
+  if (format == HL_SPARSE_CSR) {
+    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
+
+    size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int);
+    if (value_type != HL_NO_VALUE) {
+      size_ += nnz * sizeof(real);
+    }
+    CHECK_LE(size_, size) << "dest_d size(" << size
+                          << ") too small, should bigger than(" << size_
+                          << ")!";
+
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
+    CHECK_NOTNULL(tmp);
+
+    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
+
+    if (value_type == HL_NO_VALUE) {
+      csr->csr_val = NULL;
+      csr->csr_row = (int *)dest_d;
+      csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int));
+    } else {
+      csr->csr_val = (real *)dest_d;
+      csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real));
+      csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) +
+                             (dimM + 1) * sizeof(int));
+    }
+    csr->nnz_s = nnz;
+    csr->row_s = dimM + 1;
+    csr->sparsity = -1.0;
+    *A_d = (hl_sparse_matrix_s)tmp;
+    (*A_d)->matrix = (hl_matrix_s)csr;
+  } else if (format == HL_SPARSE_CSC) {
+    CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
+
+    size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int);
+    if (value_type != HL_NO_VALUE) {
+      size_ += nnz * sizeof(real);
+    }
+    CHECK_LE(size_, size) << "dest_d size(" << size
+                          << ") too small, should bigger than(" << size_
+                          << ")!";
+
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
+    CHECK_NOTNULL(tmp);
+
+    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
+    if (value_type == HL_NO_VALUE) {
+      csc->csc_val = NULL;
+      csc->csc_col = (int *)dest_d;
+      csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int));
+    } else {
+      csc->csc_val = (real *)dest_d;
+      csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real));
+      csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) +
+                             (dimN + 1) * sizeof(int));
+    }
+    csc->nnz_s = nnz;
+    csc->col_s = dimN + 1;
+    csc->sparsity = -1.0f;
+    *A_d = (hl_sparse_matrix_s)tmp;
+    (*A_d)->matrix = (hl_matrix_s)csc;
+  }
+
+  (*A_d)->format = format;
+  (*A_d)->type = value_type;
+  (*A_d)->rows = dimM;
+  (*A_d)->cols = dimN;
+  (*A_d)->nnz = nnz;
+}
+
+void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
+                                real *value_d,
+                                int *rows_d,
+                                int *cols_d,
+                                hl_matrix_format_t format,
+                                hl_matrix_value_t value_type,
+                                int dimM,
+                                int dimN,
+                                int nnz) {
+  CHECK_NOTNULL(A_d);
+  CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
+
+  CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
+      << "sparse matrix format error!";
+
+  if (format == HL_SPARSE_CSR) {
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
+    CHECK_NOTNULL(tmp);
+
+    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
+    csr->csr_row = rows_d;
+    csr->csr_col = cols_d;
+    csr->csr_val = value_d;
+    csr->nnz_s = nnz;
+    csr->row_s = dimM + 1;
+    csr->sparsity = -1.0;
+    *A_d = (hl_sparse_matrix_s)tmp;
+    (*A_d)->matrix = (hl_matrix_s)csr;
+  } else if (format == HL_SPARSE_CSC) {
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
+    CHECK_NOTNULL(tmp);
+
+    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
+    csc->csc_row = rows_d;
+    csc->csc_col = cols_d;
+    csc->csc_val = value_d;
+    csc->nnz_s = nnz;
+    csc->col_s = dimN + 1;
+    csc->sparsity = -1.0f;
+    *A_d = (hl_sparse_matrix_s)tmp;
+    (*A_d)->matrix = (hl_matrix_s)csc;
+  }
+
+  (*A_d)->format = format;
+  (*A_d)->type = value_type;
+  (*A_d)->rows = dimM;
+  (*A_d)->cols = dimN;
+  (*A_d)->nnz = nnz;
+}
+
+void hl_destruct_sparse_matrix(hl_sparse_matrix_s A_d) {
+  CHECK_NOTNULL(A_d);
+  free(A_d);
+}
+
+void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
+                          real *csr_val,
+                          int *csr_row,
+                          int *csr_col,
+                          hl_stream_t stream) {
+  CHECK_NOTNULL(csr_matrix);
+  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
+      << "csr_matrix is not csr format!";
+  CHECK_NOTNULL(csr_matrix->matrix);
+
+  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
+  CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz
+                                        << " is big than alloc size "
+                                        << csr->nnz_s;
+
+  CHECK_LE((csr_matrix->rows + 1), csr->row_s)
+      << "copy size " << (csr_matrix->rows + 1) << " is big than alloc size "
+      << csr->row_s;
+
+  CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE)
+      << "sparse matrix value type error!";
+
+  if (csr_matrix->type == HL_NO_VALUE) {
+    if (csr_row == NULL && csr_col == NULL) {
+      return;
+    } else if (csr_row != NULL && csr_col != NULL) {
+      hl_memcpy_async(
+          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
+
+      hl_memcpy_async(
+          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
+    } else {
+      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
+    }
+  } else if (csr_matrix->type == HL_FLOAT_VALUE) {
+    if (csr_val == NULL && csr_row == NULL && csr_col == NULL) {
+      return;
+    } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) {
+      hl_memcpy_async(
+          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
+    } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) {
+      hl_memcpy_async(
+          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
+      hl_memcpy_async(
+          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
+      hl_memcpy_async(
+          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
+    } else {
+      LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
+    }
+  }
+
+  csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) /
+                  ((float)csr_matrix->cols);
+}
+
+void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
+                          real *csc_val,
+                          int *csc_row,
+                          int *csc_col,
+                          hl_stream_t stream) {
+  CHECK_NOTNULL(csc_matrix);
+  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
+      << "csc_matrix is not csc format error!";
+
+  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
+  CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz
+                                        << " is big than alloc size "
+                                        << csc->nnz_s;
+
+  CHECK_LE((csc_matrix->cols + 1), csc->col_s)
+      << "copy size " << (csc_matrix->cols + 1) << " is big than alloc size "
+      << csc->col_s;
+
+  CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE)
+      << "sparse matrix value type error!";
+
+  if (csc_matrix->type == HL_NO_VALUE) {
+    if (csc_row == NULL && csc_col == NULL) {
+      return;
+    } else if (csc_row != NULL && csc_col != NULL) {
+      hl_memcpy_async(
+          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
+      hl_memcpy_async(
+          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
+    } else {
+      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
+    }
+  } else if (csc_matrix->type == HL_FLOAT_VALUE) {
+    if (csc_val == NULL && csc_row == NULL && csc_col == NULL) {
+      return;
+    } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) {
+      hl_memcpy_async(
+          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
+    } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) {
+      hl_memcpy_async(
+          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
+      hl_memcpy_async(
+          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
+      hl_memcpy_async(
+          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
+    } else {
+      LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
+    }
+  }
+
+  csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) /
+                  ((float)csc_matrix->cols);
+}
+
+void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
+                             hl_sparse_matrix_s src,
+                             hl_stream_t stream) {
+  CHECK(dst && src && dst->matrix && src->matrix)
+      << "parameter dst or src is null pointer!";
+  CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!";
+  CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE)
+      << "src sparse matrix is no value, dst sparse matrix has value!";
+
+  if (dst->format == HL_SPARSE_CSR) {
+    dst->rows = src->rows;
+    dst->cols = src->cols;
+    dst->nnz = src->nnz;
+    hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
+    hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream);
+  } else if (dst->format == HL_SPARSE_CSC) {
+    dst->rows = src->rows;
+    dst->cols = src->cols;
+    dst->nnz = src->nnz;
+    hl_csc_matrix csc = (hl_csc_matrix)src->matrix;
+    hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream);
+  } else {
+    LOG(FATAL) << "sparse matrix format error!";
+  }
+}
+
+/**
+ * Calculate beta * C, if beta is zero, C does not have to be a valid input.
+ */
+static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
+  if (beta == 0.0) {
+    hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
+  } else {
+    if (beta != 1.0) {
+      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
+    }
+  }
+
+  return;
+}
+
+void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
+                             hl_trans_op_t transa,
+                             real *B_d,
+                             hl_trans_op_t transb,
+                             real *C_d,
+                             int dimM,
+                             int dimN,
+                             int dimK,
+                             real alpha,
+                             real beta) {
+  CHECK_EQ(transb, HPPL_OP_N);
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+  CHECK_NOTNULL(C_d);
+  CHECK(dimM > 0 && dimN > 0 && dimK > 0);
+  CHECK_EQ(A_d->format, HL_SPARSE_CSR) << "matrix format error!";
+
+  if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
+      (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
+    LOG(FATAL) << "parameter error!";
+  }
+
+  if (A_d->nnz == 0) {
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    return;
+  }
+
+  /* nnz != 0 */
+  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
+  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
+      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
+    LOG(FATAL) << "parameter error!";
+  }
+
+  if (HPPL_OP_N == transa) {
+    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
+    int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y;
+    dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y);
+    dim3 grid(blocksX, blocksY);
+
+    /* sparsity pattern */
+    // A_d->sparsity;
+    if (A_d->type == HL_NO_VALUE) {
+      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csr_val,
+          A_d2->csr_col,
+          A_d2->csr_row,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    } else {
+      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csr_val,
+          A_d2->csr_col,
+          A_d2->csr_row,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    }
+  } else if (HPPL_OP_T == transa) {
+    _beta_mul_c(C_d, dimM, dimN, beta);
+
+    int blocksX =
+        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
+    int blocksY =
+        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
+    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
+    dim3 grid(blocksX, blocksY);
+    if (A_d->type == HL_NO_VALUE) {
+      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csr_val,
+          A_d2->csr_col,
+          A_d2->csr_row,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    } else {
+      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csr_val,
+          A_d2->csr_col,
+          A_d2->csr_row,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    }
+  } else {
+    LOG(FATAL) << "parameter transa error!";
+  }
+
+  CHECK_SYNC("hl_matrix_csr_mul_dense failed");
+}
+
+void hl_matrix_dense_mul_csc(real *A_d,
+                             hl_trans_op_t transa,
+                             hl_sparse_matrix_s B_d,
+                             hl_trans_op_t transb,
+                             real *C_d,
+                             int dimM,
+                             int dimN,
+                             int dimK,
+                             real alpha,
+                             real beta) {
+  CHECK_EQ(transa, HPPL_OP_N);
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+  CHECK_NOTNULL(C_d);
+
+  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
+      ((transb == HPPL_OP_N) && (B_d->rows != dimK || B_d->cols != dimN)) ||
+      ((transb == HPPL_OP_T) && (B_d->rows != dimN || B_d->cols != dimK))) {
+    LOG(FATAL) << "parameter dims error!";
+  }
+
+  CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!";
+
+  if (B_d->nnz == 0) {
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    return;
+  }
+
+  /* nnz != 0 */
+  hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix);
+  if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) ||
+      B_d2->csc_row == NULL || B_d2->csc_col == NULL) {
+    LOG(FATAL) << "parameter B is null!";
+  }
+
+  if (transb == HPPL_OP_N) {
+    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
+    int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST;
+    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
+    dim3 grid(blocksX, blocksY);
+
+    if (B_d->type == HL_NO_VALUE) {
+      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csc_val,
+          B_d2->csc_row,
+          B_d2->csc_col,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    } else {
+      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csc_val,
+          B_d2->csc_row,
+          B_d2->csc_col,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    }
+  } else if (transb == HPPL_OP_T) {
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
+    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
+    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
+    dim3 grid(blocksX, blocksY);
+    if (B_d->type == HL_NO_VALUE) {
+      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csc_val,
+          B_d2->csc_col,
+          B_d2->csc_row,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    } else {
+      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csc_val,
+          B_d2->csc_col,
+          B_d2->csc_row,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    }
+  } else {
+    LOG(FATAL) << "parameter transb error!";
+  }
+
+  CHECK_SYNC("hl_matrix_dense_mul_csc failed");
+}
+
+void hl_matrix_dense_mul_csr(real *A_d,
+                             hl_trans_op_t transa,
+                             hl_sparse_matrix_s B_d,
+                             hl_trans_op_t transb,
+                             real *C_d,
+                             int dimM,
+                             int dimN,
+                             int dimK,
+                             real alpha,
+                             real beta) {
+  CHECK_EQ(transa, HPPL_OP_N);
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+  CHECK_NOTNULL(C_d);
+
+  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
+      (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) ||
+      (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
+    LOG(FATAL) << "parameter dims error!";
+  }
+
+  CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!";
+
+  if (B_d->nnz == 0) {
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    return;
+  }
+
+  /* nnz != 0 */
+  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
+  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
+      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
+    LOG(FATAL) << "parameter transa error!";
+  }
+
+  if (transb == HPPL_OP_N) {
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
+    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
+    dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
+    dim3 grid(blocksX, blocksY);
+    if (B_d->type == HL_NO_VALUE) {
+      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csr_val,
+          B_d2->csr_row,
+          B_d2->csr_col,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    } else {
+      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csr_val,
+          B_d2->csr_row,
+          B_d2->csr_col,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    }
+  } else if (transb == HPPL_OP_T) {
+    int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
+    int blocksY = (dimN + CU_CSCMM_BLOCK_N_BEST - 1) / CU_CSCMM_BLOCK_N_BEST;
+    dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
+    dim3 grid(blocksX, blocksY);
+    if (B_d->type == HL_NO_VALUE) {
+      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csr_val,
+          B_d2->csr_col,
+          B_d2->csr_row,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    } else {
+      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csr_val,
+          B_d2->csr_col,
+          B_d2->csr_row,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    }
+  } else {
+    LOG(FATAL) << "parameter transb error!";
+  }
+
+  CHECK_SYNC("hl_matrix_dense_mul_csr failed");
+}
+
+void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
+                             hl_trans_op_t transa,
+                             real *B_d,
+                             hl_trans_op_t transb,
+                             real *C_d,
+                             int dimM,
+                             int dimN,
+                             int dimK,
+                             real alpha,
+                             real beta) {
+  CHECK_EQ(transb, HPPL_OP_N);
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+  CHECK_NOTNULL(C_d);
+  CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!";
+  CHECK_EQ(A_d->format, HL_SPARSE_CSC) << "matrix format error!";
+
+  if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
+      (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
+    LOG(FATAL) << "parameter error!";
+  }
+
+  if (A_d->nnz == 0) {
+    _beta_mul_c(C_d, dimM, dimN, beta);
+    return;
+  }
+
+  /* nnz != 0 */
+  hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
+  if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) ||
+      A_d2->csc_row == NULL || A_d2->csc_col == NULL) {
+    LOG(FATAL) << "parameter error!";
+  }
+
+  if (HPPL_OP_N == transa) {
+    _beta_mul_c(C_d, dimM, dimN, beta);
+
+    int blocksX =
+        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
+    int blocksY =
+        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
+    dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
+    dim3 grid(blocksX, blocksY);
+    if (A_d->type == HL_NO_VALUE) {
+      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csc_val,
+          A_d2->csc_row,
+          A_d2->csc_col,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    } else {
+      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csc_val,
+          A_d2->csc_row,
+          A_d2->csc_col,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    }
+  } else if (HPPL_OP_T == transa) {
+    int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
+    int blocksY = (dimM + CU_CSRMM_THREAD_Y - 1) / CU_CSRMM_THREAD_Y;
+    dim3 threads(CU_CSRMM_THREAD_X, CU_CSRMM_THREAD_Y);
+    dim3 grid(blocksX, blocksY);
+
+    /* sparsity pattern */
+    // A_d->sparsity;
+    if (A_d->type == HL_NO_VALUE) {
+      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csc_val,
+          A_d2->csc_row,
+          A_d2->csc_col,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    } else {
+      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csc_val,
+          A_d2->csc_row,
+          A_d2->csc_col,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+    }
+  } else {
+    LOG(FATAL) << "parameter transa error!";
+  }
+
+  CHECK_SYNC("hl_matrix_csc_mul_dense failed");
+}
+
+void hl_sparse_matrix_mul(real *A_d,
+                          hl_trans_op_t transa,
+                          real *B_d,
+                          hl_trans_op_t transb,
+                          hl_sparse_matrix_s C_d,
+                          int dimM,
+                          int dimN,
+                          int dimK,
+                          real alpha,
+                          real beta) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+  CHECK_NOTNULL(C_d);
+  CHECK(dimM > 0 && dimN > 0 && dimK > 0) << "parameter error!";
+  CHECK_NE(C_d->type, HL_NO_VALUE) << "C value type error!";
+
+  if (C_d->nnz == 0) return;
+
+  if (C_d->format == HL_SPARSE_CSC) {
+    hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix);
+    if (C_d2->csc_val == NULL || C_d2->csc_row == NULL ||
+        C_d2->csc_col == NULL) {
+      LOG(FATAL) << "parameter error!";
+    }
+
+    if (beta != 1.0) {
+      hl_gpu_apply_unary_op(
+          unary::mul_scalar<real>(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz);
+    }
+
+    int blocksX = dimN;
+    int blocksY = 1;
+    dim3 threads(CU_CSCMM_DMD2CSC_THREAD_X, 1);
+    dim3 grid(blocksX, blocksY);
+    bool transA = transa == HPPL_OP_T ? 1 : 0;
+    bool transB = transb == HPPL_OP_T ? 1 : 0;
+    KeSMatrixDenseMulDense2CSC<<<grid, threads, 0, STREAM_DEFAULT>>>(
+        C_d2->csc_val,
+        C_d2->csc_row,
+        C_d2->csc_col,
+        A_d,
+        B_d,
+        transA,
+        transB,
+        dimM,
+        dimN,
+        dimK,
+        alpha,
+        beta);
+    CHECK_SYNC("hl_sparse_matrix_mul failed");
+  } else {
+    hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix);
+    if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) ||
+        C_d2->csr_row == NULL || C_d2->csr_col == NULL) {
+      LOG(FATAL) << "parameter error!";
+    }
+
+    if (beta != 1.0) {
+      hl_gpu_apply_unary_op(
+          unary::mul_scalar<real>(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz);
+    }
+
+    bool transA = transa == HPPL_OP_T ? 1 : 0;
+    bool transB = transb == HPPL_OP_T ? 1 : 0;
+    if (!transB) {
+      int blocksX = dimM;
+      int blocksY = 1;
+      dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1);
+      dim3 grid(blocksX, blocksY);
+
+      KeSMatrixDenseMulDense2CSR<<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d2->csr_val,
+          C_d2->csr_row,
+          C_d2->csr_col,
+          A_d,
+          B_d,
+          transA,
+          transB,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+      CHECK_SYNC("hl_sparse_matrix_mul failed");
+    } else {
+      CHECK(!transA) << "Not supported A is trans and B is not trans!";
+
+      dim3 block(CU_BLOCK_SIZE, 1);
+      int avgNnzPerRow = C_d->nnz / dimM;
+      avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
+      int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
+      dim3 grid(gridx, dimM);
+      KeSMatrixDenseMulDenseTrans2CSR<<<grid, block, 0, STREAM_DEFAULT>>>(
+          C_d2->csr_val,
+          C_d2->csr_row,
+          C_d2->csr_col,
+          A_d,
+          B_d,
+          transA,
+          transB,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+      CHECK_SYNC("hl_sparse_matrix_mul failed");
+    }
+  }
+}
+
+void hl_memcpy_from_csc_matrix(real *csc_val,
+                               size_t val_size,
+                               int *csc_row,
+                               size_t row_size,
+                               int *csc_col,
+                               size_t col_size,
+                               hl_sparse_matrix_s csc_matrix,
+                               hl_stream_t stream) {
+  CHECK_NOTNULL(csc_matrix);
+  CHECK_NOTNULL(csc_row);
+  CHECK_NOTNULL(csc_col);
+
+  CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
+      << "csc_matrix is not csc format error!";
+
+  if (csc_matrix->nnz > row_size ||
+      csc_matrix->cols + 1 > static_cast<int>(col_size)) {
+    LOG(FATAL) << "size not match!";
+  }
+
+  hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
+  hl_memcpy_async((void *)csc_row,
+                  (void *)csc->csc_row,
+                  (csc_matrix->nnz) * sizeof(int),
+                  stream);
+  hl_memcpy_async((void *)csc_col,
+                  (void *)csc->csc_col,
+                  (csc_matrix->cols + 1) * sizeof(int),
+                  stream);
+  if (csc_matrix->type == HL_FLOAT_VALUE) {
+    if (csc_val != NULL) {
+      CHECK_LE(csc_matrix->nnz, val_size) << "size not match!";
+      hl_memcpy_async((void *)csc_val,
+                      (void *)csc->csc_val,
+                      (csc_matrix->nnz) * sizeof(real),
+                      stream);
+    } else {
+      LOG(FATAL) << "parameter csr_val is null pointer!";
+    }
+  }
+}
+
+void hl_memcpy_from_csr_matrix(real *csr_val,
+                               size_t val_size,
+                               int *csr_row,
+                               size_t row_size,
+                               int *csr_col,
+                               size_t col_size,
+                               hl_sparse_matrix_s csr_matrix,
+                               hl_stream_t stream) {
+  CHECK_NOTNULL(csr_matrix);
+  CHECK_NOTNULL(csr_row);
+  CHECK_NOTNULL(csr_col);
+  CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
+      << "csr_matrix is not csr format error!";
+
+  if (csr_matrix->nnz > col_size ||
+      csr_matrix->rows + 1 > static_cast<int>(row_size)) {
+    LOG(FATAL) << "size not match!";
+  }
+
+  hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
+  hl_memcpy_async((void *)csr_row,
+                  (void *)csr->csr_row,
+                  (csr_matrix->rows + 1) * sizeof(int),
+                  stream);
+  hl_memcpy_async((void *)csr_col,
+                  (void *)csr->csr_col,
+                  (csr_matrix->nnz) * sizeof(int),
+                  stream);
+  if (csr_matrix->type == HL_FLOAT_VALUE) {
+    if (csr_val != NULL) {
+      CHECK_LE(csr_matrix->nnz, val_size) << "size not match!";
+      hl_memcpy_async((void *)csr_val,
+                      (void *)csr->csr_val,
+                      (csr_matrix->nnz) * sizeof(real),
+                      stream);
+    } else {
+      LOG(FATAL) << "parameter csr_val is null pointer!";
+    }
+  }
+}
+
+void hl_sparse_matrix_column_sum(
+    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
+  if (B_d->format == HL_SPARSE_CSR) {
+    hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale);
+  } else {
+    LOG(FATAL) << "Not support CSC format error!";
+  }
+}
+
+void hl_matrix_csr_column_sum(
+    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+
+  if (dimM <= 0 || dimN <= 0 || (B_d->rows != dimM || B_d->cols != dimN)) {
+    LOG(FATAL) << "parameter dims error!";
+  }
+
+  hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
+  if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
+      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
+    LOG(FATAL) << "parameter B is null!";
+  }
+
+  if (B_d->nnz == 0) return;
+
+  int nnz = B_d->nnz;
+  int block = 512;
+  int grid = DIVUP(nnz, 512);
+  KeSMatrixCsrColumnSum<<<grid, block, 0, STREAM_DEFAULT>>>(
+      A_d, B_d2->csr_val, B_d2->csr_col, nnz);
+
+  CHECK_SYNC("hl_matrix_csr_column_sum failed");
+}
+
+void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
+  if (A_d->format == HL_SPARSE_CSR) {
+    hl_matrix_csr_add_bias(A_d, B_d, scale);
+  } else {
+    LOG(FATAL) << "Not support CSC format error!";
+  }
+}
+
+void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+
+  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
+  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
+      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
+    LOG(FATAL) << "parameter A_d is null!";
+  }
+
+  if (A_d->nnz == 0) return;
+
+  int nnz = A_d->nnz;
+  int block = 512;
+  int grid = DIVUP(nnz, 512);
+  KeSMatrixCsrAddBias<<<grid, block, 0, STREAM_DEFAULT>>>(
+      A_d2->csr_val, A_d2->csr_col, B_d, scale, nnz);
+
+  CHECK_SYNC("hl_sparse_matrix_add_bias failed");
+}
+
+void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
+                                real *B_d,
+                                int dimM,
+                                int dimN,
+                                real alpha,
+                                real beta) {
+  if (A_d->format == HL_SPARSE_CSR) {
+    hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta);
+  } else {
+    LOG(FATAL) << "Not support CSC format error!";
+  }
+}
+
+void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
+                             real *B_d,
+                             int dimM,
+                             int dimN,
+                             real alpha,
+                             real beta) {
+  CHECK_NOTNULL(A_d);
+  CHECK_NOTNULL(B_d);
+
+  if (dimM <= 0 || dimN <= 0 || A_d->rows != dimM || A_d->cols != dimN) {
+    LOG(FATAL) << "parameter dim error!";
+  }
+
+  hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
+  if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
+      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
+    LOG(FATAL) << "parameter A_d is null!";
+  }
+
+  if (A_d->nnz == 0) return;
+
+  int gridX = DIVUP((A_d->nnz / dimM), 512);
+  gridX = gridX > 0 ? gridX : 1;
+  dim3 block(512, 1);
+  dim3 grid(gridX, dimM);
+  KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>(A_d2->csr_val,
+                                                           A_d2->csr_row,
+                                                           A_d2->csr_col,
+                                                           B_d,
+                                                           alpha,
+                                                           beta,
+                                                           dimM,
+                                                           dimN);
+
+  CHECK_SYNC("hl_sparse_matrix_add_dense failed");
+}
+
+int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
+  __sparse_get_return__(sMat, row);
+}
+
+int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
+  __sparse_get_return__(sMat, col);
+}
+
+real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
+  __sparse_get_return__(sMat, val);
+}
diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/legacy/cuda/src/hl_cuda_sparse.cuh
similarity index 100%
rename from paddle/cuda/src/hl_cuda_sparse.cuh
rename to paddle/legacy/cuda/src/hl_cuda_sparse.cuh
diff --git a/paddle/cuda/src/hl_math.cc b/paddle/legacy/cuda/src/hl_math.cc
similarity index 100%
rename from paddle/cuda/src/hl_math.cc
rename to paddle/legacy/cuda/src/hl_math.cc
diff --git a/paddle/cuda/src/hl_perturbation_util.cu b/paddle/legacy/cuda/src/hl_perturbation_util.cu
similarity index 100%
rename from paddle/cuda/src/hl_perturbation_util.cu
rename to paddle/legacy/cuda/src/hl_perturbation_util.cu
diff --git a/paddle/legacy/cuda/src/hl_table_apply.cu b/paddle/legacy/cuda/src/hl_table_apply.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7411ae35d382833253e3ceabe36b3a1938138028
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_table_apply.cu
@@ -0,0 +1,124 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "hl_cuda.h"
+#include "hl_device_functions.cuh"
+#include "paddle/legacy/utils/Logging.h"
+
+template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
+__global__ void KeMatrixAddRows(real* output,
+                                int ldo,
+                                real* table,
+                                int ldt,
+                                int* ids,
+                                int numSamples,
+                                int tableSize,
+                                int dim) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * gridDimX;
+
+  while (idy < numSamples) {
+    int tableId = ids[idy];
+    if ((0 <= tableId) && (tableId < tableSize)) {
+      real* out = output + idy * ldo;
+      real* tab = table + tableId * ldt;
+      for (int i = idx; i < dim; i += blockDimX) {
+        if (AddRow) {
+          paddle::paddleAtomicAdd(&tab[i], out[i]);
+        } else {
+          out[i] += tab[i];
+        }
+      }
+    }
+    idy += blockDimY * gridDimX;
+  }
+}
+
+void hl_matrix_select_rows(real* output,
+                           int ldo,
+                           real* table,
+                           int ldt,
+                           int* ids,
+                           int numSamples,
+                           int tableSize,
+                           int dim) {
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(table);
+  CHECK_NOTNULL(ids);
+
+  dim3 threads(128, 8);
+  dim3 grid(8, 1);
+  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, ldo, table, ldt, ids, numSamples, tableSize, dim);
+
+  CHECK_SYNC("hl_matrix_select_rows failed");
+}
+
+void hl_matrix_add_to_rows(real* table,
+                           int ldt,
+                           real* input,
+                           int ldi,
+                           int* ids,
+                           int numSamples,
+                           int tableSize,
+                           int dim) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(table);
+  CHECK_NOTNULL(ids);
+
+  dim3 threads(128, 8);
+  dim3 grid(8, 1);
+  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      input, ldi, table, ldt, ids, numSamples, tableSize, dim);
+
+  CHECK_SYNC("hl_matrix_add_to_rows failed");
+}
+
+template <class T, int blockDimX, int gridDimX>
+__global__ void KeVectorSelect(
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
+  int idx = threadIdx.x + blockDimX * blockIdx.x;
+  while (idx < sizei) {
+    int index = ids[idx];
+    // check(index < sizes);
+    dst[idx] = src[index];
+    idx += blockDimX * gridDimX;
+  }
+}
+
+template <class T>
+void hl_vector_select_from(
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
+  CHECK_NOTNULL(dst);
+  CHECK_NOTNULL(src);
+  CHECK_NOTNULL(ids);
+  CHECK_EQ(sized, sizei);
+
+  dim3 threads(512, 1);
+  dim3 grid(8, 1);
+  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      dst, sized, src, sizes, ids, sizei);
+
+  CHECK_SYNC("hl_vector_select_from failed");
+}
+
+template void hl_vector_select_from(real* dst,
+                                    int sized,
+                                    const real* src,
+                                    int sizes,
+                                    const int* ids,
+                                    int sizei);
+template void hl_vector_select_from(
+    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
diff --git a/paddle/cuda/src/hl_time.cc b/paddle/legacy/cuda/src/hl_time.cc
similarity index 100%
rename from paddle/cuda/src/hl_time.cc
rename to paddle/legacy/cuda/src/hl_time.cc
diff --git a/paddle/legacy/cuda/src/hl_top_k.cu b/paddle/legacy/cuda/src/hl_top_k.cu
new file mode 100644
index 0000000000000000000000000000000000000000..041ac419f5addfa49148270b8a8b421eb8ada78c
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_top_k.cu
@@ -0,0 +1,481 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/cuda/include/hl_base.h"
+#include "paddle/legacy/cuda/include/hl_sparse.ph"
+#include "paddle/legacy/cuda/include/hl_top_k.h"
+#include "paddle/legacy/utils/Logging.h"
+
+// using namespace hppl;
+
+struct Pair {
+  __device__ __forceinline__ Pair() {}
+
+  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}
+
+  __device__ __forceinline__ void set(real value, int id) {
+    v_ = value;
+    id_ = id;
+  }
+
+  __device__ __forceinline__ void operator=(const Pair& in) {
+    v_ = in.v_;
+    id_ = in.id_;
+  }
+
+  __device__ __forceinline__ bool operator<(const real value) const {
+    return (v_ < value);
+  }
+
+  __device__ __forceinline__ bool operator<(const Pair& in) const {
+    return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
+  }
+
+  __device__ __forceinline__ bool operator>(const Pair& in) const {
+    return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
+  }
+
+  real v_;
+  int id_;
+};
+
+__device__ __forceinline__ void addTo(Pair topK[],
+                                      const Pair& p,
+                                      int beamSize) {
+  for (int k = beamSize - 2; k >= 0; k--) {
+    if (topK[k] < p) {
+      topK[k + 1] = topK[k];
+    } else {
+      topK[k + 1] = p;
+      return;
+    }
+  }
+  topK[0] = p;
+}
+
+template <int beamSize>
+__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
+  for (int k = beamSize - 2; k >= 0; k--) {
+    if (topK[k] < p) {
+      topK[k + 1] = topK[k];
+    } else {
+      topK[k + 1] = p;
+      return;
+    }
+  }
+  topK[0] = p;
+}
+
+template <int blockSize>
+__device__ __forceinline__ void getTopK(
+    Pair topK[], real* src, int idx, int dim, int beamSize) {
+  while (idx < dim) {
+    if (topK[beamSize - 1] < src[idx]) {
+      Pair tmp(src[idx], idx);
+      addTo(topK, tmp, beamSize);
+    }
+    idx += blockSize;
+  }
+}
+
+template <int blockSize>
+__device__ __forceinline__ void getTopK(
+    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
+  while (idx < dim) {
+    if (topK[beamSize - 1] < src[idx]) {
+      Pair tmp(src[idx], idx);
+      if (tmp < max) {
+        addTo(topK, tmp, beamSize);
+      }
+    }
+    idx += blockSize;
+  }
+}
+
+template <int blockSize>
+__device__ __forceinline__ void getTopK(
+    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
+  while (idx < dim) {
+    if (topK[beamSize - 1] < val[idx]) {
+      Pair tmp(val[idx], col[idx]);
+      addTo(topK, tmp, beamSize);
+    }
+    idx += blockSize;
+  }
+}
+
+template <int blockSize>
+__device__ __forceinline__ void getTopK(Pair topK[],
+                                        real* val,
+                                        int* col,
+                                        int idx,
+                                        int dim,
+                                        const Pair& max,
+                                        int beamSize) {
+  while (idx < dim) {
+    if (topK[beamSize - 1] < val[idx]) {
+      Pair tmp(val[idx], col[idx]);
+      if (tmp < max) {
+        addTo(topK, tmp, beamSize);
+      }
+    }
+    idx += blockSize;
+  }
+}
+
+template <int maxLength, int blockSize>
+__device__ __forceinline__ void threadGetTopK(Pair topK[],
+                                              int& beam,
+                                              int beamSize,
+                                              real* src,
+                                              bool& firstStep,
+                                              bool& isEmpty,
+                                              Pair& max,
+                                              int dim,
+                                              const int tid) {
+  if (beam > 0) {
+    int length = beam < beamSize ? beam : beamSize;
+    if (firstStep) {
+      firstStep = false;
+      getTopK<blockSize>(topK, src, tid, dim, length);
+    } else {
+      for (int k = 0; k < maxLength; k++) {
+        if (k < maxLength - beam) {
+          topK[k] = topK[k + beam];
+        } else {
+          topK[k].set(-HL_FLOAT_MAX, -1);
+        }
+      }
+      if (!isEmpty) {
+        getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
+      }
+    }
+
+    max = topK[maxLength - 1];
+    if (max.id_ == -1) isEmpty = true;
+    beam = 0;
+  }
+}
+
+template <int maxLength, int blockSize>
+__device__ __forceinline__ void threadGetTopK(Pair topK[],
+                                              int& beam,
+                                              int beamSize,
+                                              real* val,
+                                              int* col,
+                                              bool& firstStep,
+                                              bool& isEmpty,
+                                              Pair& max,
+                                              int dim,
+                                              const int tid) {
+  if (beam > 0) {
+    int length = beam < beamSize ? beam : beamSize;
+    if (firstStep) {
+      firstStep = false;
+      getTopK<blockSize>(topK, val, col, tid, dim, length);
+    } else {
+      for (int k = 0; k < maxLength; k++) {
+        if (k < maxLength - beam) {
+          topK[k] = topK[k + beam];
+        } else {
+          topK[k].set(-HL_FLOAT_MAX, -1);
+        }
+      }
+      if (!isEmpty) {
+        getTopK<blockSize>(
+            topK + maxLength - beam, val, col, tid, dim, max, length);
+      }
+    }
+
+    max = topK[maxLength - 1];
+    if (max.id_ == -1) isEmpty = true;
+    beam = 0;
+  }
+}
+
+template <int maxLength, int blockSize>
+__device__ __forceinline__ void blockReduce(Pair* shTopK,
+                                            int* maxId,
+                                            Pair topK[],
+                                            real** topVal,
+                                            int** topIds,
+                                            int& beam,
+                                            int& beamSize,
+                                            const int tid,
+                                            const int warp) {
+  while (true) {
+    __syncthreads();
+    if (tid < blockSize / 2) {
+      if (shTopK[tid] < shTopK[tid + blockSize / 2]) {
+        maxId[tid] = tid + blockSize / 2;
+      } else {
+        maxId[tid] = tid;
+      }
+    }
+    __syncthreads();
+    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
+      if (tid < stride) {
+        if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
+          maxId[tid] = maxId[tid + stride];
+        }
+      }
+      __syncthreads();
+    }
+    __syncthreads();
+
+    if (tid == 0) {
+      **topVal = shTopK[maxId[0]].v_;
+      **topIds = shTopK[maxId[0]].id_;
+      (*topVal)++;
+      (*topIds)++;
+    }
+    if (tid == maxId[0]) beam++;
+    if (--beamSize == 0) break;
+    __syncthreads();
+
+    // NOTE(zcd): temporary solution
+    unsigned mask = 0u;
+    CREATE_SHFL_MASK(mask, true);
+
+    if (tid == maxId[0]) {
+      if (beam < maxLength) {
+        shTopK[tid] = topK[beam];
+      }
+    }
+    if (maxId[0] / 32 == warp) {
+      if (__shfl_sync(mask, beam, (maxId[0]) % 32, 32) == maxLength) break;
+    }
+  }
+}
+
+/**
+ * Each block compute one sample.
+ * In a block:
+ * 1. every thread get top maxLength value;
+ * 2. merge to shTopK, block reduce and get max value;
+ * 3. go to the second setp, until one thread's topK value is null;
+ * 4. go to the first setp, until get the topK value.
+ */
+template <int maxLength, int blockSize>
+__global__ void KeMatrixTopK(real* topVal,
+                             int ldv,
+                             int* topIds,
+                             real* src,
+                             int lds,
+                             int dim,
+                             int beamSize) {
+  __shared__ Pair shTopK[blockSize];
+  __shared__ int maxId[blockSize / 2];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+  src += blockIdx.x * lds;
+  topVal += blockIdx.x * ldv;
+  topIds += blockIdx.x * beamSize;
+
+  Pair topK[maxLength];  // NOLINT
+  int beam = maxLength;
+  Pair max;
+  bool isEmpty = false;
+  bool firstStep = true;
+
+  for (int k = 0; k < maxLength; k++) {
+    topK[k].set(-HL_FLOAT_MAX, -1);
+  }
+  while (beamSize) {
+    threadGetTopK<maxLength, blockSize>(
+        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
+
+    shTopK[tid] = topK[0];
+    blockReduce<maxLength, blockSize>(
+        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+  }
+}
+
+template <int maxLength, int blockSize>
+__global__ void KeSMatrixTopK(real* topVal,
+                              int ldv,
+                              int* topIds,
+                              real* val,
+                              int* row,
+                              int* col,
+                              int beamSize) {
+  __shared__ Pair shTopK[blockSize];
+  __shared__ int maxId[blockSize / 2];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+  topVal += blockIdx.x * ldv;
+  topIds += blockIdx.x * beamSize;
+
+  Pair topK[maxLength];  // NOLINT
+  int beam = maxLength;
+  Pair max;
+  bool isEmpty = false;
+  bool firstStep = true;
+
+  int start = row[blockIdx.x];
+  int end = row[blockIdx.x + 1];
+  int dim = end - start;
+  val += start;
+  col += start;
+
+  if (beamSize > dim) {
+    // if the number of values to sort are less than the output size,
+    // use -1 to indicate the end of valid sorted values.
+    if (tid == 0) {
+      topIds[dim] = -1;
+    }
+
+    beamSize = dim;
+  }
+
+  for (int k = 0; k < maxLength; k++) {
+    topK[k].set(-HL_FLOAT_MAX, -1);
+  }
+  while (beamSize) {
+    threadGetTopK<maxLength, blockSize>(
+        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
+
+    shTopK[tid] = topK[0];
+    blockReduce<maxLength, blockSize>(
+        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+  }
+}
+
+void hl_matrix_top_k(real* topVal,
+                     int ldv,
+                     int* topIds,
+                     real* src,
+                     int lds,
+                     int dim,
+                     int beamSize,
+                     int numSamples) {
+  CHECK_NOTNULL(topVal);
+  CHECK_NOTNULL(topIds);
+  CHECK_NOTNULL(src);
+
+  if (beamSize > dim) beamSize = dim;
+
+  dim3 threads(256, 1);
+  dim3 grid(numSamples, 1);
+  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      topVal, ldv, topIds, src, lds, dim, beamSize);
+
+  CHECK_SYNC("hl_matrix_top_k failed");
+}
+
+void hl_sparse_matrix_top_k(real* topVal,
+                            int ldv,
+                            int* topIds,
+                            hl_sparse_matrix_s src,
+                            int beamSize,
+                            int numSamples) {
+  CHECK_NOTNULL(topVal);
+  CHECK_NOTNULL(topIds);
+  CHECK_NOTNULL(src);
+  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";
+
+  hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
+  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
+    LOG(FATAL) << "parameter src is null!";
+  }
+
+  dim3 threads(256, 1);
+  dim3 grid(numSamples, 1);
+  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
+
+  CHECK_SYNC("hl_sparse_matrix_top_k failed");
+}
+
+/**
+ * Each block compute one sample.
+ * In a block:
+ * 1. every thread get top maxLength value;
+ * 2. merge to shTopK, block reduce and get max value;
+ * 3. go to the second setp, until one thread's topK value is null;
+ * 4. go to the first setp, until get the topK value.
+ */
+template <int maxLength, int blockSize>
+__global__ void KeMatrixTopKClassificationError(real* topVal,
+                                                int ldv,
+                                                int* topIds,
+                                                real* src,
+                                                int lds,
+                                                int dim,
+                                                int beamSize,
+                                                int* label,
+                                                real* recResult) {
+  __shared__ Pair shTopK[blockSize];
+  __shared__ int maxId[blockSize / 2];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+  src += blockIdx.x * lds;
+  topVal += blockIdx.x * ldv;
+  topIds += blockIdx.x * beamSize;
+
+  Pair topK[maxLength];  // NOLINT
+  int beam = maxLength;
+  Pair max;
+  bool isEmpty = false;
+  bool firstStep = true;
+  int topkSize = beamSize;
+
+  for (int k = 0; k < maxLength; k++) {
+    topK[k].set(-HL_FLOAT_MAX, -1);
+  }
+
+  while (beamSize) {
+    threadGetTopK<maxLength, blockSize>(
+        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
+
+    shTopK[tid] = topK[0];
+    blockReduce<maxLength, blockSize>(
+        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+  }
+
+  __syncthreads();
+  if (tid == 0) {
+    for (int i = 0; i < topkSize; i++) {
+      if (*--topIds == label[blockIdx.x]) {
+        recResult[blockIdx.x] = 0;
+        break;
+      }
+      recResult[blockIdx.x] = 1.0f;
+    }
+  }
+}
+
+void hl_matrix_classification_error(real* topVal,
+                                    int ldv,
+                                    int* topIds,
+                                    real* src,
+                                    int lds,
+                                    int dim,
+                                    int topkSize,
+                                    int numSamples,
+                                    int* label,
+                                    real* recResult) {
+  CHECK_NOTNULL(topVal);
+  CHECK_NOTNULL(topIds);
+  CHECK_NOTNULL(src);
+
+  if (topkSize > dim) topkSize = dim;
+
+  dim3 threads(256, 1);
+  dim3 grid(numSamples, 1);
+  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
+
+  CHECK_SYNC("hl_matrix_top_k classification error failed");
+}
diff --git a/paddle/legacy/cuda/src/hl_warpctc_wrap.cc b/paddle/legacy/cuda/src/hl_warpctc_wrap.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31a8652f1f55387ae48cb516cd092442be784cbb
--- /dev/null
+++ b/paddle/legacy/cuda/src/hl_warpctc_wrap.cc
@@ -0,0 +1,151 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_warpctc_wrap.h"
+#include <mutex>
+#include "paddle/legacy/utils/DynamicLoader.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace dynload {
+
+std::once_flag warpctc_dso_flag;
+void* warpctc_dso_handle = nullptr;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load warpctc routine
+ * via operator overloading. When PADDLE_USE_DSO is
+ * false, you need to add the path of libwarp-ctc.so to
+ * the linked-libs of paddle or to LD_PRELOAD.
+ */
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
+  struct DynLoad__##__name {                                           \
+    template <typename... Args>                                        \
+    auto operator()(Args... args) -> decltype(__name(args...)) {       \
+      using warpctcFunc = decltype(__name(args...)) (*)(Args...);      \
+      std::call_once(                                                  \
+          warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \
+      void* p_##_name = dlsym(warpctc_dso_handle, #__name);            \
+      return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
+    }                                                                  \
+  } __name;  // struct DynLoad__##__name
+
+// include all needed warp-ctc functions
+DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
+DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString)
+DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss)
+DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
+
+#undef DYNAMIC_LOAD_WARPCTC_WRAP
+
+} /* namespace dynload */
+
+#define WARPCTC_GET_VERSION dynload::get_warpctc_version
+#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
+
+static int g_warpctcVersion = -1;
+#ifndef PADDLE_TYPE_DOUBLE
+#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
+#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
+#else
+hl_warpctc_status_t fatal(...) {
+  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion
+             << "] Error: not support double precision.";
+  // both of get_warpctc_version() and get_workspace_size() return an ctcStatus
+  // type value
+  return CTC_STATUS_EXECUTION_FAILED;
+}
+#define WARPCTC_COMPUTE_LOSS fatal
+#define WARPCTC_GET_WORKSPACE_SIZE fatal
+#endif
+
+/**
+ * Check build-in warp-ctc function using glog and it also
+ * support << operator for more details error info.
+ */
+#define CHECK_WARPCTC(warpctcStat)                \
+  CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
+      << "warp-ctc [version " << g_warpctcVersion \
+      << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " "
+
+void hl_warpctc_init(const size_t blank,
+                     bool useGpu,
+                     hl_warpctc_options_t* options) {
+  CHECK_NOTNULL(options);
+
+  g_warpctcVersion = WARPCTC_GET_VERSION();
+
+  if (useGpu) {
+#ifdef __NVCC__
+    options->loc = CTC_GPU;
+    options->stream = STREAM_DEFAULT;
+#else
+    LOG(FATAL) << "[warpctc init] GPU is not enabled.";
+#endif
+  } else {
+    options->loc = CTC_CPU;
+    options->num_threads = 1;
+  }
+
+  options->blank_label = blank;
+}
+
+void hl_warpctc_compute_loss(const real* batchInput,
+                             real* batchGrad,
+                             const int* cpuLabels,
+                             const int* cpuLabelLengths,
+                             const int* cpuInputLengths,
+                             const size_t numClasses,
+                             const size_t numSequences,
+                             real* cpuCosts,
+                             void* workspace,
+                             hl_warpctc_options_t* options) {
+  CHECK_NOTNULL(batchInput);
+  CHECK_NOTNULL(cpuLabels);
+  CHECK_NOTNULL(cpuLabelLengths);
+  CHECK_NOTNULL(cpuInputLengths);
+  CHECK_NOTNULL(cpuCosts);
+  CHECK_NOTNULL(workspace);
+  CHECK_NOTNULL(options);
+
+  CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput,
+                                     batchGrad,
+                                     cpuLabels,
+                                     cpuLabelLengths,
+                                     cpuInputLengths,
+                                     numClasses,
+                                     numSequences,
+                                     cpuCosts,
+                                     workspace,
+                                     *options));
+}
+
+void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
+                                   const int* cpuInputLengths,
+                                   const size_t numClasses,
+                                   const size_t numSequences,
+                                   hl_warpctc_options_t* options,
+                                   size_t* bytes) {
+  CHECK_NOTNULL(cpuLabelLengths);
+  CHECK_NOTNULL(cpuInputLengths);
+  CHECK_NOTNULL(options);
+  CHECK_NOTNULL(bytes);
+
+  CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths,
+                                           cpuInputLengths,
+                                           numClasses,
+                                           numSequences,
+                                           *options,
+                                           bytes));
+}
diff --git a/paddle/legacy/function/BlockExpandOp.cpp b/paddle/legacy/function/BlockExpandOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f01f89a7277acc5fe494b92a3e7ca3ca18498c97
--- /dev/null
+++ b/paddle/legacy/function/BlockExpandOp.cpp
@@ -0,0 +1,202 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include "Im2Col.h"
+
+namespace paddle {
+
+/*
+ * \brief Converts the image data of four dimensions(NCHW) into
+ *        a sequence data of three dimensions(NST) in the forward calculation,
+ *        which is reversed in the backward calculation.
+ *        Where N is batch size, S is the length of the sequence after each
+ *        image is expanded, T is the size of each time step in the sequence.
+ *
+ * Arguments in forward function:
+ * \param inputs[0]  Image data of NCHW format.
+ * \param outputs[0] Sequence data of NST format.
+ *
+ * Arguments in backward function:
+ * \param inputs[0]  Sequence data of NST format.
+ * \param outputs[0] Image data of NCHW format.
+ */
+class BlockExpandFunction : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    strides_ = config.get<std::vector<size_t>>("strides");
+    paddings_ = config.get<std::vector<size_t>>("paddings");
+    blocks_ = config.get<std::vector<size_t>>("blocks");
+
+    // number of inputs and outputs
+    numInputs_ = 1;
+    numOutputs_ = 1;
+  }
+
+  void checkShape(const TensorShape& image, const TensorShape& sequence) const {
+    // image shape should be 4-dimensional.
+    CHECK_EQ(image.ndims(), (size_t)4);
+    // sequence shape should be 3-dimensional.
+    CHECK_EQ(sequence.ndims(), (size_t)3);
+    // The batchSize of the image needs to be equal to
+    // the batchSize of the sequence.
+    CHECK_EQ(image[0], sequence[0]);
+  }
+
+  // Calculate the shape of colData based on the shape of the image
+  // and the shape of the sequence.
+  TensorShape getColShape(const TensorShape& image,
+                          const TensorShape& sequence) const {
+    size_t inputChannels = image[1];
+    size_t inputHeight = image[2];
+    size_t inputWidth = image[3];
+    size_t seqLength = sequence[1];
+    size_t stepSize = sequence[2];
+    size_t outputHeight =
+        1 +
+        (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH();
+    size_t outputWidth =
+        1 +
+        (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW();
+    CHECK_EQ(seqLength, outputHeight * outputWidth);
+    CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
+
+    // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+    return TensorShape({outputHeight,
+                        outputWidth,
+                        inputChannels,
+                        (size_t)blockH(),
+                        (size_t)blockW()});
+  }
+
+ protected:
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+  std::vector<size_t> blocks_;
+
+  inline int strideH() const { return strides_[0]; }
+
+  inline int strideW() const { return strides_[1]; }
+
+  inline int paddingH() const { return paddings_[0]; }
+
+  inline int paddingW() const { return paddings_[1]; }
+
+  inline int blockH() const { return blocks_[0]; }
+
+  inline int blockW() const { return blocks_[1]; }
+};
+
+template <DeviceType Device>
+class BlockExpandForward : public BlockExpandFunction {
+ public:
+  void init(const FuncConfig& config) override {
+    BlockExpandFunction::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& image = inputs[0].shape();
+    const TensorShape& sequence = outputs[0].shape();
+    checkShape(image, sequence);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    const TensorShape& image = inputs[0].shape();
+    const TensorShape& sequence = outputs[0].shape();
+
+    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+
+    real* imageData = inputs[0].data<real>();
+    real* seqData = outputs[0].data<real>();
+    Im2ColFunctor<kOCF, Device, real> im2col;
+    for (size_t i = 0; i < batchSize; i++) {
+      // The result of im2col is [outputHeight, outputWidth,
+      // inputChannels, filterHeight, filterWidth], and it is easy to
+      // reshape into [seqLength, stepSize], where seqLength is equal
+      // output_height * output_width, stepSize is equal
+      // input_channels * filter_height * filter_width
+      im2col(imageData,
+             imShape,
+             seqData,
+             colShape,
+             strideH(),
+             strideW(),
+             paddingH(),
+             paddingW());
+      imageData += imShape.getElements();
+      seqData += colShape.getElements();
+    }
+  }
+};
+
+template <DeviceType Device>
+class BlockExpandBackward : public BlockExpandFunction {
+ public:
+  void init(const FuncConfig& config) override {
+    BlockExpandFunction::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& image = outputs[0].shape();
+    const TensorShape& sequence = inputs[0].shape();
+    checkShape(image, sequence);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // Since the implementation of Col2ImFunctor is ADD_TO,
+    // this function only supports ADD_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& image = outputs[0].shape();
+    const TensorShape& sequence = inputs[0].shape();
+
+    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+
+    real* imageData = outputs[0].data<real>();
+    real* seqData = inputs[0].data<real>();
+    Col2ImFunctor<kOCF, Device, real> col2im;
+    for (size_t i = 0; i < batchSize; i++) {
+      col2im(imageData,
+             imShape,
+             seqData,
+             colShape,
+             strideH(),
+             strideW(),
+             paddingH(),
+             paddingW());
+      imageData += imShape.getElements();
+      seqData += colShape.getElements();
+    }
+  }
+};
+
+REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
+REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
+REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/legacy/function/BlockExpandOpTest.cpp
similarity index 100%
rename from paddle/function/BlockExpandOpTest.cpp
rename to paddle/legacy/function/BlockExpandOpTest.cpp
diff --git a/paddle/legacy/function/BufferArg.cpp b/paddle/legacy/function/BufferArg.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1f3d505c31bf8d50503032a4baae6230b9f7241d
--- /dev/null
+++ b/paddle/legacy/function/BufferArg.cpp
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+
+#include "BufferArg.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+
+namespace paddle {
+
+const SequenceArg& BufferArg::sequence() const {
+  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
+  return dynamic_cast<const SequenceArg&>(*this);
+}
+
+const SparseMatrixArg& BufferArg::sparse() const {
+  CHECK_EQ(bufferType_, TENSOR_SPARSE);
+  return dynamic_cast<const SparseMatrixArg&>(*this);
+}
+
+SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      nnz_(sparse.getElementCnt()),
+      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
+      type_(static_cast<SparseDataType>(sparse.getValueType())) {
+  bufferType_ = TENSOR_SPARSE;
+}
+
+SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      nnz_(sparse.getElementCnt()),
+      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
+      type_(static_cast<SparseDataType>(sparse.getValueType())) {
+  bufferType_ = TENSOR_SPARSE;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/BufferArg.h b/paddle/legacy/function/BufferArg.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f47ad556d29363d784fde718fdacdf0658ef010
--- /dev/null
+++ b/paddle/legacy/function/BufferArg.h
@@ -0,0 +1,364 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+#include "TensorShape.h"
+#include "TensorType.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+enum BufferType {
+  TENSOR_UNKNOWN = 0,
+  TENSOR_NORMAL = 1,
+  TENSOR_SEQUENCE_ID = 2,
+  TENSOR_SEQUENCE_DATA = 3,
+  TENSOR_SPARSE = 4
+};
+
+class BufferArg;
+class SequenceArg;
+class SparseMatrixArg;
+
+/**
+ * \brief BufferArg used as the argument type of Function.
+ *
+ * The arguments of the Paddle Function have four Buffer types.
+ * 1. BufferArg for a dense Buffer of any dimension.
+ * 2. SequenceIdArg for a Buffer of sequence start positions.
+ * 3. SequenceArg for a Buffer of sequence data.
+ * 4. SparseMatrixArg for a Buffer of sparse matrix.
+ *
+ * Buffer shape
+ * For most buffers, the first dimension `shape()[0]` represents
+ * the size of the mini-batch.
+ *
+ * Buffer argType
+ * There is an ArgType property for the BufferArg used as Function Output.
+ * Whether the result of the Function calculation is assigned to the
+ * output Buffer or added to the output Buffer is determined by the
+ * argType_ property of the output BufferArg.
+ */
+
+// ArgType is only used by output BufferArg.
+// For input argument, argType_ is ignored.
+// For output argument, need to set the argType_ of the BufferArg.
+enum ArgType {
+  UNSPECIFIED = 0,
+  ASSIGN_TO = 1,
+  ADD_TO = 2,
+};
+class BufferArg {
+ public:
+  void setArgType(ArgType argType) { argType_ = argType; }
+
+  ArgType getArgType() const { return argType_; }
+
+ public:
+  BufferArg(ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
+
+  BufferArg(void* buf,
+            ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
+
+  BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
+
+  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(2),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    shape_.setDim(0, matrix.getHeight());
+    shape_.setDim(1, matrix.getWidth());
+  }
+
+  BufferArg(const Matrix& matrix,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(shape),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
+  }
+
+  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(1),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    shape_.setDim(0, vector.getSize());
+  }
+
+  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(VALUE_TYPE_INT32),
+        shape_(1),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    shape_.setDim(0, vector.getSize());
+  }
+
+  template <DeviceType DType>
+  typename Tensor<real, DType>::Matrix matrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)2, shape_.ndims());
+    return typename Tensor<real, DType>::Matrix(
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
+  }
+
+  template <typename VType, DeviceType DType>
+  typename Tensor<VType, DType>::Vector vector() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<VType>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)1, shape_.ndims());
+    return typename Tensor<VType, DType>::Vector(
+        shape_[0], reinterpret_cast<VType*>(buf_));
+  }
+
+  virtual ~BufferArg() {}
+
+  template <typename T>
+  T* data() const {
+    return reinterpret_cast<T*>(buf_);
+  }
+
+  void* data() const { return buf_; }
+  ValueType valueType() const { return valueType_; }
+  BufferType bufferType() const { return bufferType_; }
+  const TensorShape& shape() const { return shape_; }
+  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
+  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
+  virtual size_t numElements() const { return shape_.getElements(); }
+
+  const SequenceArg& sequence() const;
+  const SparseMatrixArg& sparse() const;
+
+ protected:
+  void* buf_;
+  ValueType valueType_;
+  TensorShape shape_;
+  BufferType bufferType_{TENSOR_UNKNOWN};
+  ArgType argType_{UNSPECIFIED};
+  // TODO(tianbing), add deviceType_
+  // leading dimensions. The size is dims_.size()
+  // Dims lds_;
+};
+
+// sequence start positions in a mini-batch of sequences
+// shape_.ndims() == 1
+// valueType_ = int32
+// if a < b then value_.buf_[a] < value_.buf_[b]
+class SequenceIdArg : public BufferArg {
+ public:
+  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
+      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    CHECK_EQ(shape_.ndims(), 1UL);
+    CHECK_GE(shape_[0], 1UL);
+    numSeqs_ = shape_[0] - 1;
+  }
+
+  SequenceIdArg(void* buf,
+                const TensorShape& shape,
+                ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    CHECK_EQ(shape_.ndims(), 1UL);
+    numSeqs_ = shape_[0] - 1;
+  }
+
+  SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    numSeqs_ = shape_[0] - 1;
+  }
+
+  ~SequenceIdArg() {}
+
+  size_t numSeqs() const { return numSeqs_; }
+
+ private:
+  size_t numSeqs_;
+};
+
+// sequences data
+// For mini-batch calculate,
+// one batch can contain more than one sequence of data.
+// SequenceArg can be used to represent sequences that contain multiple
+// unequal lengths.
+class SequenceArg : public BufferArg {
+ public:
+  SequenceArg(ValueType valueType,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
+        startPositions_(TensorShape({shape[0]})) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+
+  SequenceArg(void* buf,
+              ValueType valueType,
+              const TensorShape& shape,
+              const SequenceIdArg& startPositions,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        startPositions_(startPositions) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+
+  SequenceArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(matrix, argType), startPositions_(vector) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+
+  ~SequenceArg() {}
+
+  void* getIdBuf() const { return startPositions_.data(); }
+  size_t numSeqs() const { return startPositions_.numSeqs(); }
+  SequenceIdArg& getSequenceId() { return startPositions_; }
+  const SequenceIdArg& getSequenceId() const { return startPositions_; }
+
+ private:
+  SequenceIdArg startPositions_;
+};
+
+// sparse matrix
+// valueType_ == float or double
+// shape_.ndims() == 2
+class SparseMatrixArg : public BufferArg {
+ public:
+  SparseMatrixArg(void* buf,
+                  ValueType valueType,
+                  const TensorShape& shape,
+                  const BufferArg& row,
+                  const BufferArg& col,
+                  size_t nnz,
+                  SparseFormat format,
+                  SparseValueType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        row_(row),
+        col_(col),
+        nnz_(nnz),
+        format_(static_cast<SparseDataFormat>(format)),
+        type_(static_cast<SparseDataType>(type)) {
+    bufferType_ = TENSOR_SPARSE;
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), 2UL);
+    CHECK_EQ(row_.shape().ndims(), 1UL);
+    CHECK_EQ(col_.shape().ndims(), 1UL);
+    if (format_ == T_SPARSE_CSR) {
+      CHECK_EQ(nnz, col.shape()[0]);
+    } else if (format_ == T_SPARSE_CSC) {
+      CHECK_EQ(nnz, row.shape()[0]);
+    }
+  }
+
+  SparseMatrixArg(ValueType valueType,
+                  const TensorShape& shape,
+                  size_t nnz,
+                  SparseFormat format,
+                  SparseValueType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
+        row_(BufferArg(nullptr, VALUE_TYPE_INT32)),
+        col_(BufferArg(nullptr, VALUE_TYPE_INT32)),
+        nnz_(nnz),
+        format_(static_cast<SparseDataFormat>(format)),
+        type_(static_cast<SparseDataType>(type)) {
+    bufferType_ = TENSOR_SPARSE;
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), 2UL);
+
+    /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr
+    row_ = (format_ == T_SPARSE_CSR
+                ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1})
+                : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}));
+    /// len of col_ :  width + 1 (CSC) or nnz (CSR), buf_ == nullptr
+    col_ = (format_ == T_SPARSE_CSR
+                ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})
+                : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1}));
+  }
+
+  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+
+  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+
+  template <DeviceType DType>
+  typename Tensor<real, DType>::SparseMatrix SparseMatrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ(2UL, shape_.ndims());
+    return typename Tensor<real, DType>::SparseMatrix(
+        reinterpret_cast<real*>(buf_),
+        reinterpret_cast<int*>(row_.data()),
+        reinterpret_cast<int*>(col_.data()),
+        shape_[0],
+        shape_[1],
+        nnz_,
+        static_cast<SparseValueType>(type_),
+        static_cast<SparseFormat>(format_),
+        false);
+  }
+
+  ~SparseMatrixArg() {}
+
+  void* getRowBuf() const { return row_.data(); }
+
+  void* getColBuf() const { return col_.data(); }
+
+  size_t nnz() const { return nnz_; }
+
+  size_t numElements() const override { return nnz_; }
+
+  SparseDataFormat dataFormat() const { return format_; }
+
+  SparseDataType dataType() const { return type_; }
+
+ private:
+  BufferArg row_;
+  BufferArg col_;
+  size_t nnz_;
+  SparseDataFormat format_;
+  SparseDataType type_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/BufferArgTest.cpp b/paddle/legacy/function/BufferArgTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ec153bea89f25414b0df3088ab0c366c92ecbe0
--- /dev/null
+++ b/paddle/legacy/function/BufferArgTest.cpp
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BufferArg.h"
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/MemoryHandle.h"
+
+namespace paddle {
+
+TEST(BufferTest, BufferArg) {
+  TensorShape shape({8, 10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_FLOAT));
+  BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+}
+
+TEST(BufferTest, SequenceIdArg) {
+  TensorShape shape({10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_INT32));
+  SequenceIdArg buffer(memory.getBuf(), shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+  EXPECT_EQ(buffer.numSeqs(), 9U);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/CMakeLists.txt b/paddle/legacy/function/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..29b4ac098e21ee315d5c9b2f2499521d1aa1c322
--- /dev/null
+++ b/paddle/legacy/function/CMakeLists.txt
@@ -0,0 +1,54 @@
+file(GLOB h_files . *Op.h)
+file(GLOB cpp_files . *Op.cpp)
+
+list(APPEND h_files Function.h)
+list(APPEND cpp_files Function.cpp)
+list(APPEND cpp_files BufferArg.cpp)
+list(APPEND cpp_files GemmFunctor.cpp)
+if(USE_EIGEN_FOR_BLAS)
+  list(APPEND cpp_files EigenGemm.cpp)
+endif(USE_EIGEN_FOR_BLAS)
+
+if(WITH_GPU)
+    file(GLOB cu_files . *OpGpu.cu)
+    cuda_compile(cu_objs ${cu_files})
+endif()
+
+if(USE_NNPACK)
+  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
+  if(WITH_TESTING)
+    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
+  endif()
+endif()
+
+list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
+
+add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
+add_dependencies(paddle_function ${external_project_dependencies})
+add_dependencies(paddle_function paddle_proto)
+
+if(WITH_TESTING)
+if(WITH_GPU)
+    # TODO:
+    # file(GLOB test_files . *OpTest.cpp)
+    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
+    add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(TensorShapeTest)
+    add_simple_unittest(TensorTypeTest)
+    add_simple_unittest(BufferArgTest)
+    add_simple_unittest(FunctionTest)
+    add_simple_unittest(ContextProjectionOpTest)
+    add_simple_unittest(PadOpTest)
+    add_simple_unittest(MulOpTest)
+    add_simple_unittest(CosSimOpTest)
+    add_simple_unittest(RowConvOpTest)
+    add_simple_unittest(BlockExpandOpTest)
+    add_simple_unittest(CropOpTest)
+    add_simple_unittest(SwitchOpTest)
+    add_simple_unittest(ScaleSubRegionOpTest)
+endif()
+
+add_simple_unittest(Im2ColTest)
+add_simple_unittest(GemmConvOpTest)
+add_simple_unittest(DepthwiseConvOpTest)
+endif()
diff --git a/paddle/legacy/function/ContextProjectionOp.cpp b/paddle/legacy/function/ContextProjectionOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05a3f915862b6657fc0a4300cbbea36721219e10
--- /dev/null
+++ b/paddle/legacy/function/ContextProjectionOp.cpp
@@ -0,0 +1,412 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ContextProjectionOp.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+/**
+ * Context Projection Forward with CPU Matrix Device.
+ *
+ */
+template <>
+void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
+                                               const CpuMatrix& input_mat,
+                                               const CpuMatrix& weight_mat,
+                                               const CpuIVector& seq_vec,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  const int* starts = seq_vec.getData();
+  const size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
+        if (weight_mat) {
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+        if (weight_mat) {
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat)
+                  .subMatrix(begin_pad + context_start + j - pad_size,
+                             pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;
+      MatrixPtr src =
+          const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin);
+      MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * input_mat.getWidth());
+    }
+  }
+}
+
+/**
+ * Paddle Function for Context Projection Forward.
+ * Calculate the output layer value sequence after context projection.
+ *
+ * What is Context Projection for a sequence?
+ * For example, assumed input (x) has 4 words and the dimension of each word
+ * representation is 2. If we use zero to pad instead of learned weight to pad,
+ * and the context_lenth is 3, the output (y) is:
+ *
+ * @code
+ *  x = [a1, a2;
+ *       b1, b2;
+ *       c1, c2;
+ *       d1, d2]
+ *  y = [0,  0,  a1, a2, b1, b2;
+ *       a1, a2, b1, b2, c1, c2;
+ *       b1, b2, c1, c2, d1, d2;
+ *       c1, c2, d1, d2, 0,  0]
+ * @endcode
+ *
+ * \param outputs[0].matrix   output layer value, n * (d * l)
+ * \param outputs[0].vector   start position sequence, n * 1
+ * \param inputs[0].matrix    input layer value, n * d
+ * \param inputs[0].vector    start position sequence, n * 1
+ * \param inputs[1].matrix    input layer weight, pad * d
+ */
+template <DeviceType Device>
+class ContextProjectionForwardFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK(1UL == inputs.size() || 2UL == inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(val_seqs.shape().ndims(), 2UL);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
+    /// input and output has the same batch_size
+    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
+    if (2UL == inputs.size()) {
+      CHECK_EQ(inputs[1].shape().ndims(), 2UL);
+      /// dim of input == dim of weight
+      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
+    }
+
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
+    auto out_mat = out_seq.matrix<Device>();
+    const auto in_mat = val_seqs.matrix<Device>();
+    const auto w_mat =
+        (2UL == inputs.size() && inputs[1].data())
+            ? inputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
+
+    ContextProjectionForward<Device>(out_mat,
+                                     in_mat,
+                                     w_mat,
+                                     seq_vec,
+                                     context_length_,
+                                     context_start_,
+                                     begin_pad_);
+  }
+
+ private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+};
+
+/**
+ * Context Projection Backward with CPU Matrix Device.
+ *
+ */
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
+                                                CpuMatrix& in_grad_mat,
+                                                CpuMatrix& w_grad_mat,
+                                                const CpuIVector& seq_vec,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+  size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
+                                 : w_grad_mat ? w_grad_mat.getWidth() : 0;
+  const int* starts = seq_vec.getData();
+  size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i], pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;
+      if (!in_grad_mat) continue;
+      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
+      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
+                          .subMatrix(dst_begin, dst_end - dst_begin);
+      src->addAtOffset(*dst, j * input_dim);
+    }
+  }
+}
+
+/**
+ * Context Projection Backward Function.
+ * Update the weight gradient and input layer gradient with backprop
+ *
+ * \param inputs[0].matrix          output layer grad, n * (d * l)
+ * \param inputs[0].vector          start position sequence, n * 1
+ * \param outputs[0].matrix         input layer grad, n * d
+ * \param outputs[0].vector         start position sequence, n * 1
+ * \param outputs[1]                weight grad, pad * d
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    is_padding_ = config.get<bool>("is_padding");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK(1UL == outputs.size() || 2UL == outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL);
+
+    /// input and output grad has the same batch_size
+    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
+    /// dim of output grad = dim of input grad * context_length
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
+
+    if (2UL == outputs.size()) {
+      CHECK_EQ(outputs[1].shape().ndims(), 2UL);
+      /// dim of input grad == dim of weight
+      CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
+      CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    }
+
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    auto in_grad_mat =
+        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                        : out_seq.matrix<Device>();
+    auto w_grad_mat =
+        (2UL == outputs.size() && outputs[1].data())
+            ? outputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+
+    ContextProjectionBackward<Device>(out_grad_mat,
+                                      in_grad_mat,
+                                      w_grad_mat,
+                                      seq_vec,
+                                      context_length_,
+                                      context_start_,
+                                      begin_pad_,
+                                      is_padding_,
+                                      total_pad_);
+  }
+
+ private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  bool is_padding_;
+  size_t total_pad_;
+};
+
+/**
+ * Context Projection Backward Data Function
+ * Update input layer grad
+ * input:  sequence of output layer grad
+ * output: sequence of input layer grad
+ *
+ * \param outputs[0].matrix              input layer grad, n * d
+ * \param outputs[0].vector              start position sequence, n * 1
+ * \param inputs[0].matrix               output layer grad, n * (d * l)
+ * \param inputs[0].vector               start positon sequence, n * 1
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardDataFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
+    /// output layer grad dim == input layer grad dim * context_length_
+    CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_);
+    /// input and output has the same batch_size
+    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    auto in_grad_mat = out_seq.matrix<Device>();
+
+    ContextProjectionBackwardData<Device>(
+        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
+  }
+
+ private:
+  size_t context_length_;
+  int context_start_;
+};
+
+/**
+ * Context Projection Backward Weight Function
+ * Update weight grad by backprop
+ * input:  sequence of output layer grad
+ * output: weight grad
+ *
+ * \param outputs[0]                   weight grad, pad * d
+ * \param inputs[0].matrix             output layer grad, n * (d * l)
+ * \param inputs[0].vecotr             start positon sequence, n * 1
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardWeightFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
+    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
+    CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
+    /// output layer grad dim == weight dim * context_length_
+    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    auto w_grad_mat = outputs[0].matrix<Device>();
+    ContextProjectionBackwardWeight<Device>(out_grad_mat,
+                                            w_grad_mat,
+                                            seq_vec,
+                                            context_length_,
+                                            context_start_,
+                                            total_pad_,
+                                            begin_pad_);
+  }
+
+ private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  size_t total_pad_;
+};
+
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    CPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    CPU,
+                    ContextProjectionBackwardFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    GPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    GPU,
+                    ContextProjectionBackwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
+                    GPU,
+                    ContextProjectionBackwardDataFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
+                    GPU,
+                    ContextProjectionBackwardWeightFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/legacy/function/ContextProjectionOp.h
similarity index 100%
rename from paddle/function/ContextProjectionOp.h
rename to paddle/legacy/function/ContextProjectionOp.h
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/legacy/function/ContextProjectionOpGpu.cu
similarity index 100%
rename from paddle/function/ContextProjectionOpGpu.cu
rename to paddle/legacy/function/ContextProjectionOpGpu.cu
diff --git a/paddle/legacy/function/ContextProjectionOpTest.cpp b/paddle/legacy/function/ContextProjectionOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3b0a34567fe17b466de6186e537243fe8166a77a
--- /dev/null
+++ b/paddle/legacy/function/ContextProjectionOpTest.cpp
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+void testMatrixProjectionForward(int context_start,
+                                 size_t context_length,
+                                 bool is_padding,
+                                 size_t batch_size,
+                                 size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  CpuGpuFuncCompare test(
+      "ContextProjectionForward",
+      FuncConfig()
+          .set("context_length", context_length)
+          .set("context_start", context_start)
+          .set("begin_pad", (size_t)std::max(0, -context_start)));
+
+  // prepare input arguments
+  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
+  test.addInputs(
+      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}));
+  if (is_padding) {  // weight
+    test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}));
+  }
+  test.addOutputs(
+      SequenceArg(VALUE_TYPE_FLOAT,
+                  TensorShape{batch_size, input_dim * context_length}),
+      ADD_TO);
+
+  // run Function
+  test.run();
+}
+
+void testMatrixProjectionBackward(int context_start,
+                                  size_t context_length,
+                                  bool is_padding,
+                                  size_t batch_size,
+                                  size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  CpuGpuFuncCompare test(
+      "ContextProjectionBackward",
+      FuncConfig()
+          .set("context_length", context_length)
+          .set("context_start", context_start)
+          .set("begin_pad", (size_t)std::max(0, -context_start))
+          .set("is_padding", is_padding)
+          .set("total_pad", pad));
+
+  // prepare input arguments
+  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
+  test.addInputs(SequenceArg(
+      VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length}));
+  test.addOutputs(
+      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}),
+      ADD_TO);
+  if (is_padding) {  // weight
+    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}),
+                    ADD_TO);
+  }
+
+  // run Function
+  test.run();
+}
+
+TEST(ContextProjection, Projection) {
+  for (auto context_start : {-5, -3, -1, 0, 3}) {
+    for (auto context_length : {1, 2, 5, 7}) {
+      for (auto trainable_padding : {false, true}) {
+        for (auto batch_size : {1, 2, 5, 20, 100}) {
+          for (auto input_dim : {15, 32, 63, 128, 200}) {
+            VLOG(3) << " context_start=" << context_start
+                    << " context_length=" << context_length
+                    << " trainable_padding=" << trainable_padding
+                    << " batch_size=" << batch_size
+                    << " input_dim=" << input_dim;
+            testMatrixProjectionForward(context_start,
+                                        context_length,
+                                        trainable_padding,
+                                        batch_size,
+                                        input_dim);
+            testMatrixProjectionBackward(context_start,
+                                         context_length,
+                                         trainable_padding,
+                                         batch_size,
+                                         input_dim);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/legacy/function/ConvOp.h b/paddle/legacy/function/ConvOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d8437bcfe60d1d81897f1c4be1cbfecb5b27fe0
--- /dev/null
+++ b/paddle/legacy/function/ConvOp.h
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/*
+ * \brief Based on the ConvFunctionBase class, the forward calculation,
+ *        backward input calculation and backward filter calculation
+ *        of convolution operations can be implemented.
+ *
+ * Arguments of forward and backward calculation:
+ *   1. Forward calculation of convolution.
+ *      inputs = {INPUT, FILTER}, outputs = {OUTPUT}
+ *      The first and second input arguments are input image and filter data.
+ *      The output argument is output image.
+ *
+ *   2. Backward input calculation of convolution.
+ *      inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD}
+ *      The first and second input arguments are output grad image
+ *      and filter data.
+ *      The output argument is input grad image.
+ *
+ *   3. Backward filter calculation of convolution.
+ *      inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD}
+ *      The first and second input arguments are output grad image
+ *      and input image.
+ *      The output argument is filter grad.
+ *
+ * Arguments format of input, filter and output:
+ *   1. Input image, output image, input image gradient, output image gradient
+ *      are all NCHW format. Where N is batch size, C is the number of channels,
+ *      H and W is the height and width of image or image gradient.
+ *
+ *   2. The format of the filter data is MCHW, where M is the number of output
+ *      image channels, C is the number of input image channels,
+ *      H and W is height and width of filter.
+ *
+ *      If `groups` is greater than 1, the filter's data format should be GMCHW,
+ *      where G is the `groups`, and G * M is the number of output image
+ *      channels, G * C is the number of input image channels,
+ *      H and W is height and width of filter.
+ */
+class ConvFunctionBase : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    strides_ = config.get<std::vector<size_t>>("strides");
+    paddings_ = config.get<std::vector<size_t>>("paddings");
+    dilations_ = config.get<std::vector<size_t>>("dilations");
+    groups_ = config.get<size_t>("groups");
+
+    // number of inputs and outputs
+    numInputs_ = 2;
+    numOutputs_ = 1;
+  }
+
+  // input can be INPUT and INPUT_GRAD
+  // filter can be FILTER and FILTER_GRAD
+  // output can be OUTPUT and OUTPUT_GRAD
+  void checkShape(const TensorShape& input,
+                  const TensorShape& filter,
+                  const TensorShape& output) {
+    // inputs and outputs arguments should be 4-dimensional.
+    CHECK_EQ(input.ndims(), (size_t)4);
+    CHECK_EQ(output.ndims(), (size_t)4);
+    // The batchSize of the input needs to be equal to
+    // the batchSize of the output.
+    CHECK_EQ(input[0], output[0]);
+
+    if (filter.ndims() == (size_t)4) {
+      // If the filter's dimension is 4, groups convolution is not supported.
+      CHECK_EQ(groups_, (size_t)1);
+      // The input and output channel dimensions are the second and first
+      // dimensions of the filter shape.
+      CHECK_EQ(input[1], filter[1]);
+      CHECK_EQ(output[1], filter[0]);
+    } else {
+      // filter argument should be 5-dimensional.
+      CHECK_EQ(filter.ndims(), (size_t)5);
+      // The first dimension of the filter is the size of the group
+      CHECK_EQ(filter[0], groups_);
+      // The input and output channel dimensions are the third and second
+      // dimensions of the filter shape.
+      CHECK_EQ(input[1], filter[2] * groups_);
+      CHECK_EQ(output[1], filter[1] * groups_);
+    }
+  }
+
+ protected:
+  size_t getFilterHeight(const TensorShape& filter) const {
+    return filter[filter.ndims() - 2];
+  }
+
+  size_t getFilterWidth(const TensorShape& filter) const {
+    return filter[filter.ndims() - 1];
+  }
+
+  // determine whether im2col needs to be performed
+  inline bool isNeedIm2col(const TensorShape& filter) const {
+    return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
+             strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
+             paddingW() == 0);
+  }
+
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+  std::vector<size_t> dilations_;
+
+  /// Group size, refer to grouped convolution in
+  /// Alex Krizhevsky's paper: when group=2, the first half of the
+  /// filters are only connected to the first half of the input channels,
+  /// and the second half only connected to the second half.
+  size_t groups_;
+
+  inline int strideH() const { return strides_[0]; }
+
+  inline int strideW() const { return strides_[1]; }
+
+  inline int paddingH() const { return paddings_[0]; }
+
+  inline int paddingW() const { return paddings_[1]; }
+
+  inline int dilationH() const { return dilations_[0]; }
+
+  inline int dilationW() const { return dilations_[1]; }
+
+  // A temporary memory in convolution calculation.
+  MemoryHandlePtr memory_;
+
+  template <DeviceType Device>
+  void resizeBuffer(size_t newSize) {
+    if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) {
+      if (Device == DEVICE_TYPE_CPU) {
+        memory_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
+      } else {
+        memory_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
+      }
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/function/ConvOpTest.h b/paddle/legacy/function/ConvOpTest.h
similarity index 100%
rename from paddle/function/ConvOpTest.h
rename to paddle/legacy/function/ConvOpTest.h
diff --git a/paddle/legacy/function/CosSimOp.cpp b/paddle/legacy/function/CosSimOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d04f4396caade803aa846fa81388f95a194845e6
--- /dev/null
+++ b/paddle/legacy/function/CosSimOp.cpp
@@ -0,0 +1,240 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CosSimOp.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+/**
+ * Cosine Similarity for CpuMatrix
+ *
+ * \param out_mat, output value, size: nSamples * 1.
+ * \param in1_mat, input value 1, size: nSamples * dim.
+ * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param scale, default 1.0
+ *
+ */
+template <>
+void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
+                                    const CpuMatrix& in1_mat,
+                                    const CpuMatrix& in2_mat,
+                                    real scale) {
+  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
+  size_t num_samples = out_mat.getHeight();
+  size_t dim = in1_mat.getWidth();
+  /// column vector [nSamples, 1]
+  real* out = out_mat.getData();
+  const real* x = in1_mat.getData();
+  const real* y = in2_mat.getData();
+
+  /// in2 might only have one row or full rows
+  CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples);
+  size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim;
+  for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
+    real square_sum_x = 0;
+    real square_sum_y = 0;
+    real xy = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += x[j] * x[j];
+      square_sum_y += y[j] * y[j];
+      xy += x[j] * y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0);
+    out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+  }
+}
+
+/**
+ * Cosine Similarity
+ * for each row i,
+ *   out[i] = scale * cos(input1[i], input2[i])
+ *      = scale * <input1[i], input2[i]>/sqrt(|input1[i]|^2 * |input2[i]|^2)
+ * when input2 only has one row, then for each row i,
+ *   out[i] = cos(input1[i], input2[0])
+ *
+ * \param inputs[0] input matrix 1, size: nSamples * dim.
+ * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param outputs[0] output matrix, size : nSamples * 1.
+ */
+
+template <DeviceType Device>
+class CosSimForwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(inputs.size(), 2UL);
+    CHECK_EQ(outputs.size(), 1UL);
+
+    CHECK_EQ(inputs[0].shape().ndims(), 2UL);
+    CHECK_EQ(inputs[1].shape().ndims(), 2UL);
+    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
+
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    CHECK_EQ(outputs[0].shape()[1], 1UL);
+
+    CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data());
+
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    auto out_mat = outputs[0].matrix<Device>();
+    const auto in1_mat = inputs[0].matrix<Device>();
+    const auto in2_mat = inputs[1].matrix<Device>();
+
+    CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
+  }
+
+ private:
+  real scale_;
+};
+
+/**
+ * Cosine Similarity Derivative for CpuMatrix
+ *
+ * \param in1_grad  forward input grad 1, size: nSamples * dim.
+ * \param in2_grad  forward input grad 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ *
+ * \param out_grad  backward loss output grad, size : nSamples * 1.
+ * \param out_val   forward output value, size: nSamples * 1.
+ * \param in1_val   forward input value 1, size: nSamples * dim.
+ * \param in2_val   forward input value 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param scale,    default 1.0
+ */
+template <>
+void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad,
+                                     const CpuMatrix& out_val,
+                                     const CpuMatrix& in1_val,
+                                     const CpuMatrix& in2_val,
+                                     CpuMatrix& in1_grad,
+                                     CpuMatrix& in2_grad,
+                                     real scale) {
+  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
+        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
+  CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required";
+
+  const real* grad = out_grad.getData();
+  const real* out = out_val.getData();
+  const real* prev_out_x = in1_val.getData();
+  const real* prev_out_y = in2_val.getData();
+  real* prev_grad_x = in1_grad.getData();
+  real* prev_grad_y = in2_grad.getData();
+
+  size_t num_samples = out_grad.getHeight();
+  size_t dim = in1_val.getWidth();
+  CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight());
+  CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples);
+  size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim;
+  for (size_t i = 0; i < num_samples; ++i,
+              prev_out_x += dim,
+              prev_out_y += inc,
+              prev_grad_x += dim,
+              prev_grad_y += inc) {
+    real square_sum_x = 0;
+    real square_sum_y = 0;
+    real xy = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += prev_out_x[j] * prev_out_x[j];
+      square_sum_y += prev_out_y[j] * prev_out_y[j];
+      xy += prev_out_x[j] * prev_out_y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0);
+    if (xy == 0) {
+      real reciprocal =
+          1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
+        prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
+      }
+    } else {
+      real reciprocal_xy = 1.0f / xy;
+      real reciprocal_square_sum_x = 1.0f / square_sum_x;
+      real reciprocal_square_sum_y = 1.0f / square_sum_y;
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] +=
+            out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
+                                prev_out_x[j] * reciprocal_square_sum_x);
+        prev_grad_y[j] +=
+            out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
+                                prev_out_y[j] * reciprocal_square_sum_y);
+      }
+    }
+  }
+}
+
+/**
+ * Cosine Similarity backward Derivative
+ *
+ * \param outputs[0] forward input grad 1, size: nSamples * dim.
+ * \param outputs[1] forward input grad 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ *
+ * \param inputs[0] backward loss output grad, size : nSamples * 1.
+ * \param inputs[1] forward output value, size: nSamples * 1.
+ * \param inputs[2] forward input value 1, size: nSamples * dim.
+ * \param inputs[3] forward input value 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ */
+template <DeviceType Device>
+class CosSimBackwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(inputs.size(), 4UL);
+    CHECK_EQ(outputs.size(), 2UL);
+    /// dim of out_grad and out_val == 1, column vector
+    CHECK_EQ(inputs[0].shape()[1], 1UL);
+    CHECK_EQ(inputs[1].shape()[1], 1UL);
+    /// nSamples of out_grad == out_val == in_val1 == in_grad1
+    CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
+    CHECK_EQ(inputs[0].shape()[0], inputs[0].shape()[0]);
+    CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
+    /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2
+    CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);
+
+    CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
+          inputs[3].data() && outputs[0].data() && outputs[1].data());
+
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+
+    const auto out_grad = inputs[0].matrix<Device>();
+    const auto out_val = inputs[1].matrix<Device>();
+    const auto in1_val = inputs[2].matrix<Device>();
+    const auto in2_val = inputs[3].matrix<Device>();
+    auto in1_grad = outputs[0].matrix<Device>();
+    auto in2_grad = outputs[1].matrix<Device>();
+
+    CosSimBackward<Device>(
+        out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
+  }
+
+ private:
+  real scale_;
+};
+
+REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/function/CosSimOp.h b/paddle/legacy/function/CosSimOp.h
similarity index 100%
rename from paddle/function/CosSimOp.h
rename to paddle/legacy/function/CosSimOp.h
diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/legacy/function/CosSimOpGpu.cu
similarity index 100%
rename from paddle/function/CosSimOpGpu.cu
rename to paddle/legacy/function/CosSimOpGpu.cu
diff --git a/paddle/legacy/function/CosSimOpTest.cpp b/paddle/legacy/function/CosSimOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..31bb43e1baa9a6d890d1b8fe2abf15a07a7094c6
--- /dev/null
+++ b/paddle/legacy/function/CosSimOpTest.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/legacy/math/Matrix.h"
+
+using namespace paddle;  // NOLINT
+
+void testCosSimForward(size_t height_x,
+                       size_t height_y,
+                       size_t width,
+                       real scale) {
+  CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale));
+  // prepare input arguments
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}),
+                  ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+void testCosSimBackward(size_t height_x,
+                        size_t height_y,
+                        size_t width,
+                        real scale) {
+  CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale));
+  // prepare input arguments
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}),
+                  ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}),
+                  ADD_TO);
+  // run Function
+  test.run();
+}
+
+TEST(Matrix, cosSim) {
+  for (auto height_x : {10, 100, 1000}) {
+    for (auto height_y : {1, height_x}) {
+      for (auto width : {10, 100, 1000}) {
+        for (auto scale : {1.0, 2.0}) {
+          testCosSimForward(height_x, height_y, width, scale);
+          testCosSimBackward(height_x, height_y, width, scale);
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/legacy/function/CropOp.cpp b/paddle/legacy/function/CropOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e22678822f06a323d1e6c17dce63d44d143484a3
--- /dev/null
+++ b/paddle/legacy/function/CropOp.cpp
@@ -0,0 +1,177 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CropOp.h"
+#include "paddle/legacy/function/TensorShape.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void Crop<DEVICE_TYPE_CPU>(real* outputs,
+                           const real* inputs,
+                           const TensorShape inShape,
+                           const TensorShape outShape,
+                           const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cCrop = crop_corner[1];
+  int hCrop = crop_corner[2];
+  int wCrop = crop_corner[3];
+
+  int num = inShape[0];
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < outC; c++) {
+      for (int h = 0; h < outH; h++) {
+        int outoff = ((n * outC + c) * outH + h) * outW;
+        int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop;
+        memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real));
+      }
+    }
+  }
+}
+
+template <>
+void CropGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+                               real* outGrad,
+                               const TensorShape inShape,
+                               const TensorShape outShape,
+                               const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cCrop = crop_corner[1];
+  int hCrop = crop_corner[2];
+  int wCrop = crop_corner[3];
+
+  int num = outShape[0];
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop;
+        int inoff = ((n * inC + c) * inH + h) * inW;
+        CpuVector inG = CpuVector(inW, const_cast<real*>(inGrad + inoff));
+        CpuVector outG = CpuVector(inW, outGrad + outoff);
+        outG += inG;
+      }
+    }
+  }
+}
+
+/**
+ * \brief Crop input according to the specify corner and shape.
+ *        The input and output is a 4D tensor. In CropFunc, we only
+ *        crop the 2nd to 4th dimension.
+ *
+ * Argument in this Function:
+ * \param pad_    A struct object contains the cropping corner and shape.
+ * \param inputs  A 4D tensor, only one input.
+ * \param outputs A 4D tensor, the output value after cropping.
+ *
+ * For example,
+ * Input(2,2,2,3) = [
+ *                    [ [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]] ],
+ *                    [ [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]] ]
+ *                  ] # the input shape is (2,2,2,3)
+ *
+ * pad_: if corner = (0,1,1) and crop_shape = (2,1,2)
+ * Output(2,2,1,2) = [
+ *                    [ [[4,5]],
+ *                      [[6,7]] ],
+ *                    [ [[8,7]],
+ *                      [[3,5]] ]
+ *                  ] # the input shape is (2,2,2,3)
+ */
+template <DeviceType Device>
+class CropFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    TensorShape inShape = inputs[0].shape();
+    TensorShape outShape = outputs[0].shape();
+
+    Crop<Device>(outputs[0].data<real>(),
+                 inputs[0].data<real>(),
+                 inShape,
+                 outShape,
+                 conf_);
+  }
+
+ private:
+  FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of cropping Function.
+ *
+ * Argument in this Function:
+ * \param crop_    The same meaning as it in CropFunc.
+ * \param inputs  The gradient with respect to the output value of CropFunc.
+ * \param outputs The gradient with respect to the input value of CropFunc.
+ */
+
+template <DeviceType Device>
+class CropGradFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    TensorShape outShape = outputs[0].shape();
+    TensorShape inShape = inputs[0].shape();
+
+    CropGrad<Device>(inputs[0].data<real>(),
+                     outputs[0].data<real>(),
+                     inShape,
+                     outShape,
+                     conf_);
+  }
+
+ private:
+  FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
+REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
+REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/CropOp.h b/paddle/legacy/function/CropOp.h
similarity index 100%
rename from paddle/function/CropOp.h
rename to paddle/legacy/function/CropOp.h
diff --git a/paddle/function/CropOpGpu.cu b/paddle/legacy/function/CropOpGpu.cu
similarity index 100%
rename from paddle/function/CropOpGpu.cu
rename to paddle/legacy/function/CropOpGpu.cu
diff --git a/paddle/function/CropOpTest.cpp b/paddle/legacy/function/CropOpTest.cpp
similarity index 100%
rename from paddle/function/CropOpTest.cpp
rename to paddle/legacy/function/CropOpTest.cpp
diff --git a/paddle/legacy/function/CrossMapNormalOp.cpp b/paddle/legacy/function/CrossMapNormalOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f28703af00fa4bd7bebd98839cb077798083b61f
--- /dev/null
+++ b/paddle/legacy/function/CrossMapNormalOp.cpp
@@ -0,0 +1,344 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CrossMapNormalOp.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void CrossMapNormal<DEVICE_TYPE_CPU>(real* outputs,
+                                     real* denoms,
+                                     const real* inputs,
+                                     size_t numSamples,
+                                     size_t channels,
+                                     size_t height,
+                                     size_t width,
+                                     size_t size,
+                                     real scale,
+                                     real pow) {
+  size_t oneImage = height * width;
+  size_t oneSample = channels * oneImage;
+
+  CpuVector outputsV(numSamples * oneSample, outputs);
+  CpuVector inputsV(numSamples * oneSample, const_cast<real*>(inputs));
+  CpuVector denomsV(numSamples * oneSample, denoms);
+
+  // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow)
+  // x represents inputs
+  // f(x) represents outputs
+  // denoms save the intermediate result for backward
+  denomsV = denomsV.constant(1.0);
+  const int start = -((int)size - 1) / 2;
+  const int end = (int)size + start;
+  for (size_t i = 0; i < numSamples; i++) {
+    real* oneDenom = denoms + i * oneSample;
+    real* oneInput = const_cast<real*>(inputs) + i * oneSample;
+    for (int c = 0; c < (int)channels; c++) {
+      CpuVector denom(oneImage, oneDenom + c * oneImage);
+      for (int s = start; s < end; s++) {
+        if (c + s >= 0 && c + s < (int)channels) {
+          CpuVector input(oneImage, oneInput + (c + s) * oneImage);
+          denom += input.square() * scale;
+        }
+      }
+    }
+  }
+
+  outputsV = inputsV * denomsV.pow(-pow);
+}
+
+template <>
+void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
+                                         const real* inputsValue,
+                                         const real* outputsValue,
+                                         const real* outputsGrad,
+                                         const real* denoms,
+                                         size_t numSamples,
+                                         size_t channels,
+                                         size_t height,
+                                         size_t width,
+                                         size_t size,
+                                         real scale,
+                                         real pow) {
+  size_t oneSample = channels * height * width;
+  std::function<CpuVector(real*, size_t)> oneImage = [=](real* data,
+                                                         size_t offset) {
+    return CpuVector(height * width, data + offset);
+  };
+
+  const int start = -((int)size) / 2;
+  const int end = (int)size + start;
+  const real ratio = -(real)2 * scale * pow;
+  for (size_t i = 0; i < numSamples; i++) {
+    size_t sOffset = i * oneSample;
+    real* oneInputGrad = inputsGrad + sOffset;
+    real* oneInputValue = const_cast<real*>(inputsValue) + sOffset;
+    real* oneDenom = const_cast<real*>(denoms) + sOffset;
+    real* oneOutputGrad = const_cast<real*>(outputsGrad) + sOffset;
+    real* oneOutputValue = const_cast<real*>(outputsValue) + sOffset;
+
+    for (int c = 0; c < (int)channels; c++) {
+      size_t cOffset = c * height * width;
+      CpuVector inputGrad = oneImage(oneInputGrad, cOffset);
+      CpuVector inputValue = oneImage(oneInputValue, cOffset);
+      CpuVector denom = oneImage(oneDenom, cOffset);
+      CpuVector outputGrad = oneImage(oneOutputGrad, cOffset);
+
+      inputGrad = inputGrad + denom.pow(-pow) * outputGrad;
+      for (int s = start; s < end; s++) {
+        if (c + s >= 0 && c + s < (int)channels) {
+          size_t offset = (c + s) * height * width;
+          CpuVector output = oneImage(oneOutputValue, offset);
+          CpuVector outputGrad = oneImage(oneOutputGrad, offset);
+          CpuVector denom = oneImage(oneDenom, offset);
+
+          inputGrad += ((outputGrad * output * ratio) / denom) * inputValue;
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief Normalization with across maps.
+ *
+ * This Function comes from the paper
+ * "ImageNet Classification with Deep Convolutional Neural Networks".
+ *
+ * The original formula is:
+ *
+ *                                Input(i, x, y)
+ * Output(i, x, y) = ----------------------------------------------
+ *                                 -- upper
+ *                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
+ *                                 -- j = lower
+ *
+ * upper is `min(C, c + N/2)`
+ * lower if `max(0, c - N/2)`
+ *
+ * Function implementation:
+ *
+ * inputs and outpus is NCHW format, while input.shape.ndims() is equal 4.
+ * And the meaning of each dimension(0-3) is respectively batch size,
+ * feature maps, rows and columns.
+ *
+ * Input and Output in the above formula is for each map(i) of one image, and
+ * Input(i, x, y), Output(i, x, y) represents an element in an image.
+ *
+ * C is the number of feature maps of one image, and N is a hyper-parameters
+ * is configured when Function is initialized. The sum in the denominator
+ * is the sum of the same position in the neighboring maps.
+ *
+ * In the implementation of Function, k is equal to 1,
+ * so Function has no argument for k.
+ *
+ * Function Arguments:
+ *
+ * \param size_      represent N
+ * \param scale_     represent alpha
+ * \param pow_       represent beta
+ * \param inputs[0]  represent Input
+ * \param outputs[0] represent Output
+ * \param outputs[1] represent The denominator in the formula(except beta)
+ *
+ * Note:
+ * Save output[1] is to simplify the backward calculation.
+ * TODO, if only consider the forward calculation, we can optimize to
+ * remove the output[1].
+ */
+template <DeviceType Device>
+class CrossMapNormalFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    size_ = config.get<size_t>("size");
+    scale_ = config.get<real>("scale");
+    pow_ = config.get<real>("pow");
+
+    // number of inputs and outputs
+    numInputs_ = 1;
+    numOutputs_ = 2;
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    check(inputs, outputs);
+    // ArgType check still on here,
+    // not sure whether it is better to put inside the check.
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    CrossMapNormal<Device>(outputs[0].data<real>(),
+                           outputs[1].data<real>(),
+                           inputs[0].data<real>(),
+                           batchSize,
+                           maps,
+                           rows,
+                           columns,
+                           size_,
+                           scale_,
+                           pow_);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+    CHECK(inputs[0].shape() == outputs[0].shape());
+    CHECK(inputs[0].shape() == outputs[1].shape());
+  }
+
+  // Only need the shape of the input, can calculate the
+  // floating-point operation.
+  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ((size_t)numInputs_, inputs.size());
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    // number of floating-point operations
+    // an approximate value
+    size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3);
+
+    return ops;
+  }
+
+ private:
+  size_t size_;
+  real scale_;
+  real pow_;
+};
+
+/**
+ * \brief Backward calculation for normalization with across maps.
+ *
+ * Function implementation:
+ *
+ * The implementation of this Function is derived from the
+ * CrossMapNormalFunc implementation.
+ *
+ * InputGrad = OutputGrad * denoms ^ (-beta)
+ *    -- upper
+ *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
+ *    -- lower
+ *
+ * The data of inputs/outputs format is the same as the forward interface
+ * and is NCHW.
+ *
+ * The upper and lower is the same as forward. The logic of the sum
+ * is also the same as forward.
+ *
+ * Function Arguments:
+ *
+ * \param size_      represent N
+ * \param scale_     represent alpha
+ * \param pow_       represent beta
+ * \param inputs[0]  represent InputValue, inputs[0] of CrossMapNormalFunc
+ * \param inputs[1]  represent OutputValue, outputs[0] of CrossMapNormalFunc
+ * \param inputs[2]  represent OutputGrad
+ * \param inputs[3]  represent denoms, outputs[1] of CrossMapNormalFunc
+ *                   This is the intermediate result that is
+ *                   preserved in the forward calculation.
+ * \param outputs[0] represent InputGrad
+ */
+template <DeviceType Device>
+class CrossMapNormalGradFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    size_ = config.get<size_t>("size");
+    scale_ = config.get<real>("scale");
+    pow_ = config.get<real>("pow");
+
+    // number of inputs and outputs
+    numInputs_ = 4;
+    numOutputs_ = 1;
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    check(inputs, outputs);
+    if (outputs[0].getArgType() != ADD_TO) {
+      // Currently, some algorithm implementations are ASSIGN_TO mode,
+      // if need to support the ADD_TO calculation, need to clear the output.
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
+
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
+                               inputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               inputs[2].data<real>(),
+                               inputs[3].data<real>(),
+                               batchSize,
+                               maps,
+                               rows,
+                               columns,
+                               size_,
+                               scale_,
+                               pow_);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+    CHECK(inputs[0].shape() == inputs[1].shape());
+    CHECK(inputs[0].shape() == inputs[2].shape());
+    CHECK(inputs[0].shape() == inputs[3].shape());
+    CHECK(inputs[0].shape() == outputs[0].shape());
+  }
+
+  // Only need the shape of one input, can calculate the
+  // floating-point operation.
+  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_LT((size_t)1, inputs.size());
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    // number of floating-point operations
+    // an approximate value
+    size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2);
+
+    return ops;
+  }
+
+ private:
+  size_t size_;
+  real scale_;
+  real pow_;
+};
+
+REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
+REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
+REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/CrossMapNormalOp.h b/paddle/legacy/function/CrossMapNormalOp.h
similarity index 100%
rename from paddle/function/CrossMapNormalOp.h
rename to paddle/legacy/function/CrossMapNormalOp.h
diff --git a/paddle/function/CrossMapNormalOpGpu.cu b/paddle/legacy/function/CrossMapNormalOpGpu.cu
similarity index 100%
rename from paddle/function/CrossMapNormalOpGpu.cu
rename to paddle/legacy/function/CrossMapNormalOpGpu.cu
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/legacy/function/CrossMapNormalOpTest.cpp
similarity index 100%
rename from paddle/function/CrossMapNormalOpTest.cpp
rename to paddle/legacy/function/CrossMapNormalOpTest.cpp
diff --git a/paddle/legacy/function/DepthwiseConvOp.cpp b/paddle/legacy/function/DepthwiseConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..958034e08e60c9a63d1c480bde7c84b760205ae4
--- /dev/null
+++ b/paddle/legacy/function/DepthwiseConvOp.cpp
@@ -0,0 +1,305 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DepthwiseConvOp.h"
+#include "ConvOp.h"
+
+namespace paddle {
+
+template <class T>
+class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
+ public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData) {
+    // TODO(zhaolong) : cpu implementation of depthwise convolution
+  }
+};
+
+template <class T>
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
+ public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad) {}
+  // TODO(zhaolong) : cpu implementation of depthwise convolution
+};
+
+template <class T>
+class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
+ public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad) {}
+  // TODO(zhaolong) : cpu implementation of depthwise convolution
+};
+
+/*
+ * \brief Forward calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+
+    DepthwiseConvFunctor<Device, real> depthwiseConv;
+    depthwiseConv(inputData,
+                  filterData,
+                  batchSize,
+                  outputChannels,
+                  outputHeight,
+                  outputWidth,
+                  inputChannels,
+                  inputHeight,
+                  inputWidth,
+                  filterMultiplier,
+                  filterHeight,
+                  filterWidth,
+                  strideH(),
+                  strideW(),
+                  paddingH(),
+                  paddingW(),
+                  outputData);
+  }
+};
+
+/*
+ * \brief Backward input calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvGradInputFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    check(inputs, outputs);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* outputGrad = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* inputGrad = outputs[0].data<real>();
+
+    DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
+    depthwiseConvGradInput(outputGrad,
+                           filterData,
+                           batchSize,
+                           outputChannels,
+                           outputHeight,
+                           outputWidth,
+                           inputChannels,
+                           inputHeight,
+                           inputWidth,
+                           filterMultiplier,
+                           filterHeight,
+                           filterWidth,
+                           strideH(),
+                           strideW(),
+                           paddingH(),
+                           paddingW(),
+                           inputGrad);
+  }
+};
+
+/*
+ * \brief Backward filter calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    check(inputs, outputs);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* outputGrad = inputs[0].data<real>();
+    real* inputData = inputs[1].data<real>();
+    real* filterGrad = outputs[0].data<real>();
+
+    int size = outputChannels * filterHeight * filterWidth * outputHeight *
+               outputWidth;
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+
+    DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
+
+    depthwiseConvGradFilter(outputGrad,
+                            inputData,
+                            batchSize,
+                            outputChannels,
+                            outputHeight,
+                            outputWidth,
+                            inputChannels,
+                            inputHeight,
+                            inputWidth,
+                            filterMultiplier,
+                            filterHeight,
+                            filterWidth,
+                            strideH(),
+                            strideW(),
+                            paddingH(),
+                            paddingW(),
+                            colData,
+                            filterGrad);
+  }
+};
+
+REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
+                    CPU,
+                    DepthwiseConvGradInputFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
+                    CPU,
+                    DepthwiseConvGradFilterFunction);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
+                    GPU,
+                    DepthwiseConvGradInputFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
+                    GPU,
+                    DepthwiseConvGradFilterFunction);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOp.h b/paddle/legacy/function/DepthwiseConvOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..7837edd1c071980592b1cf36ecb69a3b7c12cc5e
--- /dev/null
+++ b/paddle/legacy/function/DepthwiseConvOp.h
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "TensorType.h"
+
+namespace paddle {
+
+/**
+ *\brief   Depthwise convolution forward. The outputData
+ *         of depthwise convolution is same with ExpandConvLayer
+ *         when groups equals inputChannels in ExpandConvLayer.
+ *
+ * \param[in]   inputData         input data.
+ * \param[in]   filterData        the Paramters of the depthwise conv layer..
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of inputData.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData..
+ * \param[in]   filterMultiplier  equals to outputChannels/groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       widht of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  outputData        outputData.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvFunctor {
+ public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData);
+};
+
+/**
+ *\brief  Functor tot compute the depthwise convolution backprop w.r.t input.
+ *
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   filterData        the Paramters of the depthwise conv layer..
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterMultiplier  equals to outputChannels/groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       widht of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  inputGrad         the grad data of input.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvGradInputFunctor {
+ public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad);
+};
+
+/**
+ *\brief  Functor tot compute the depthwise convolution backprop w.r.t filter.
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   inputData         inputData.
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterMultiplier  equals to outputChannels/groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       widht of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[in]   colData           Auxiliary data when calculating filterGrad.
+ * \param[in]   multiplierData    Auxiliary data when calculating filterGrad.
+ * \param[out]  filterGrad        the grad data of filter.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvGradFilterFunctor {
+ public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad);
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/DepthwiseConvOpGpu.cu b/paddle/legacy/function/DepthwiseConvOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..17138cc56390d0fcfb15d4b77a56eda466bcfd3c
--- /dev/null
+++ b/paddle/legacy/function/DepthwiseConvOpGpu.cu
@@ -0,0 +1,376 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DepthwiseConvOp.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+
+namespace paddle {
+
+// CUDA kernel to compute the depthwise convolution forward pass
+template <class T>
+__global__ void ConvolutionDepthwiseForward(const int nthreads,
+                                            const T* const inputData,
+                                            const T* const filterData,
+                                            const int batchSize,
+                                            const int outputChannels,
+                                            const int outputHeight,
+                                            const int outputWidth,
+                                            const int inputChannels,
+                                            const int inputHeight,
+                                            const int inputWidth,
+                                            const int filterMultiplier,
+                                            const int filterHeight,
+                                            const int filterWidth,
+                                            const int strideH,
+                                            const int strideW,
+                                            const int paddingH,
+                                            const int paddingW,
+                                            T* const outputData) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if (index < nthreads) {
+    const int batch = index / outputChannels / outputHeight / outputWidth;
+    const int c_out = (index / outputHeight / outputWidth) % outputChannels;
+    const int h_out = (index / outputWidth) % outputHeight;
+    const int w_out = index % outputWidth;
+
+    const int c_in = c_out / filterMultiplier;
+    const T* weight = filterData + c_out * filterHeight * filterWidth;
+    T value = 0;
+    const int h_in_start = -paddingH + h_out * strideH;
+    const int w_in_start = -paddingW + w_out * strideW;
+    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
+    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
+    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
+        (w_in_end < inputWidth)) {
+      for (int kh = 0; kh < filterHeight; ++kh) {
+        for (int kw = 0; kw < filterWidth; ++kw) {
+          const int h_in = -paddingH + h_out * strideH + kh;
+          const int w_in = -paddingW + w_out * strideW + kw;
+          const int offset =
+              ((batch * inputChannels + c_in) * inputHeight + h_in) *
+                  inputWidth +
+              w_in;
+          value += (*weight) * inputData[offset];
+          ++weight;
+        }
+      }
+    } else {
+      for (int kh = 0; kh < filterHeight; ++kh) {
+        for (int kw = 0; kw < filterWidth; ++kw) {
+          const int h_in = -paddingH + h_out * strideH + kh;
+          const int w_in = -paddingW + w_out * strideW + kw;
+          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
+              (w_in < inputWidth)) {
+            const int offset =
+                ((batch * inputChannels + c_in) * inputHeight + h_in) *
+                    inputWidth +
+                w_in;
+            value += (*weight) * inputData[offset];
+          }
+          ++weight;
+        }
+      }
+    }
+    outputData[index] = value;
+  }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
+template <class T>
+__global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
+                                                  const T* const top_diff,
+                                                  const T* const weight_data,
+                                                  const int num,
+                                                  const int outputChannels,
+                                                  const int outputHeight,
+                                                  const int outputWidth,
+                                                  const int inputChannels,
+                                                  const int inputHeight,
+                                                  const int inputWidth,
+                                                  const int filterMultiplier,
+                                                  const int filterHeight,
+                                                  const int filterWidth,
+                                                  const int strideH,
+                                                  const int strideW,
+                                                  const int paddingH,
+                                                  const int paddingW,
+                                                  T* const bottom_diff) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int batch = index / inputChannels / inputHeight / inputWidth;
+    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
+    const int h_in = (index / inputWidth) % inputHeight;
+    const int w_in = index % inputWidth;
+
+    const int c_out_start = c_in * filterMultiplier;
+
+    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
+    h_out_start = 0 > h_out_start ? 0 : h_out_start;
+    int h_out_end = (h_in + paddingH) / strideH;
+    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
+    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
+    w_out_start = 0 > w_out_start ? 0 : w_out_start;
+    int w_out_end = (w_in + paddingW) / strideW;
+    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
+
+    T value = 0;
+
+    for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier;
+         c_out++) {
+      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
+        const int filter_h = h_in + paddingH - h_out * strideH;
+        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
+          const int filter_w = w_in + paddingW - w_out * strideW;
+          const int filter_offset = c_out * filterHeight * filterWidth +
+                                    filter_h * filterWidth + filter_w;
+          const int top_diff_offset =
+              ((batch * outputChannels + c_out) * outputHeight + h_out) *
+                  outputWidth +
+              w_out;
+          value += top_diff[top_diff_offset] * weight_data[filter_offset];
+        }
+      }
+    }
+    bottom_diff[index] += value;
+  }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
+template <class T>
+__global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
+                                                   const int nthreads,
+                                                   const T* const top_diff,
+                                                   const T* const inputData,
+                                                   const int num,
+                                                   const int outputChannels,
+                                                   const int outputHeight,
+                                                   const int outputWidth,
+                                                   const int inputChannels,
+                                                   const int inputHeight,
+                                                   const int inputWidth,
+                                                   const int filterMultiplier,
+                                                   const int filterHeight,
+                                                   const int filterWidth,
+                                                   const int strideH,
+                                                   const int strideW,
+                                                   const int paddingH,
+                                                   const int paddingW,
+                                                   T* const buffer_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int h_out = (index / outputWidth) % outputHeight;
+    const int w_out = index % outputWidth;
+    const int kh =
+        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
+    const int kw = (index / outputHeight / outputWidth) % filterWidth;
+    const int h_in = -paddingH + h_out * strideH + kh;
+    const int w_in = -paddingW + w_out * strideW + kw;
+    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
+        (w_in < inputWidth)) {
+      const int c_out =
+          index / (filterHeight * filterWidth * outputHeight * outputWidth);
+      const int c_in = c_out / filterMultiplier;
+      const int batch = num_i;
+      const int top_offset =
+          ((batch * outputChannels + c_out) * outputHeight + h_out) *
+              outputWidth +
+          w_out;
+      const int bottom_offset =
+          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
+          w_in;
+      buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
+    } else {
+      buffer_data[index] = 0;
+    }
+  }
+}
+
+template <class T>
+class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
+ public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData) {
+    int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
+
+    size_t blocks = (outputSize + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        outputSize,
+        inputData,
+        filterData,
+        batchSize,
+        outputChannels,
+        outputHeight,
+        outputWidth,
+        inputChannels,
+        inputHeight,
+        inputWidth,
+        filterMultiplier,
+        filterHeight,
+        filterWidth,
+        strideH,
+        strideW,
+        paddingH,
+        paddingW,
+        outputData);
+  }
+};
+
+template <class T>
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
+ public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad) {
+    int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
+
+    size_t blocks = (inputSize + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    ConvolutionDepthwiseInputBackward<T>
+        // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<grid, threads, 0, STREAM_DEFAULT>>>(inputSize,
+                                               outputGrad,
+                                               filterData,
+                                               batchSize,
+                                               outputChannels,
+                                               outputHeight,
+                                               outputWidth,
+                                               inputChannels,
+                                               inputHeight,
+                                               inputWidth,
+                                               filterMultiplier,
+                                               filterHeight,
+                                               filterWidth,
+                                               strideH,
+                                               strideW,
+                                               paddingH,
+                                               paddingW,
+                                               inputGrad);
+  }
+};
+
+template <class T>
+class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
+ public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad) {
+    int colDataSize = outputChannels * filterHeight * filterWidth *
+                      outputHeight * outputWidth;
+
+    size_t blocks = (colDataSize + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
+                                1,
+                                filterGrad,
+                                false,
+                                true);
+
+    for (int i = 0; i < batchSize; i++) {
+      ConvolutionDepthwiseFilterBackward<
+          T><<<grid, threads, 0, STREAM_DEFAULT>>>(i,
+                                                   colDataSize,
+                                                   outputGrad,
+                                                   inputData,
+                                                   batchSize,
+                                                   outputChannels,
+                                                   outputHeight,
+                                                   outputWidth,
+                                                   inputChannels,
+                                                   inputHeight,
+                                                   inputWidth,
+                                                   filterMultiplier,
+                                                   filterHeight,
+                                                   filterWidth,
+                                                   strideH,
+                                                   strideW,
+                                                   paddingH,
+                                                   paddingW,
+                                                   colData);
+      int K = outputHeight * outputWidth;
+      int M = colDataSize / K;
+
+      BaseMatrix colMatrix(M, K, colData, false, true);
+      filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
+    }
+  }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
+#else
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/legacy/function/DepthwiseConvOpTest.cpp
similarity index 100%
rename from paddle/function/DepthwiseConvOpTest.cpp
rename to paddle/legacy/function/DepthwiseConvOpTest.cpp
diff --git a/paddle/legacy/function/EigenGemm.cpp b/paddle/legacy/function/EigenGemm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5929c5c68ec818c2307580b06f76c63f04e0db5f
--- /dev/null
+++ b/paddle/legacy/function/EigenGemm.cpp
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include "paddle/legacy/function/EigenThreadDevice.h"
+
+namespace paddle {
+
+template <class T>
+struct EigenBlasGemm {
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
+                           Eigen::Aligned>
+      EigenMatrix;
+
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    Eigen::array<int, 2> sizeA;
+    if (transA) {
+      sizeA[0] = K;
+      sizeA[1] = M;
+      CHECK_EQ(M, lda);
+    } else {
+      sizeA[0] = M;
+      sizeA[1] = K;
+      CHECK_EQ(K, lda);
+    }
+    Eigen::array<int, 2> sizeB;
+    if (transB) {
+      sizeB[0] = N;
+      sizeB[1] = K;
+      CHECK_EQ(K, ldb);
+    } else {
+      sizeB[0] = K;
+      sizeB[1] = N;
+      CHECK_EQ(N, ldb);
+    }
+    Eigen::array<int, 2> sizeC = {{M, ldc}};
+    Eigen::array<int, 2> offsetC = {{0, 0}};
+    Eigen::array<int, 2> extentC = {{M, N}};
+
+    const EigenMatrix a(const_cast<T*>(A), sizeA);
+    const EigenMatrix b(const_cast<T*>(B), sizeB);
+    EigenMatrix c(C, sizeC);
+
+    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    dims[0].first = transA ? 0 : 1;
+    dims[0].second = transB ? 1 : 0;
+
+    auto* device = EigenDeviceWarpper::device();
+    if (N == ldc) {
+      if (alpha == T(1) && beta == T(0)) {
+        c.device(*device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.device(*device) += a.contract(b, dims);
+      } else {
+        c.device(*device) = alpha * a.contract(b, dims) + beta * c;
+      }
+    } else {
+      if (alpha == T(1) && beta == T(0)) {
+        c.slice(offsetC, extentC).device(*device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.slice(offsetC, extentC).device(*device) += a.contract(b, dims);
+      } else {
+        c.slice(offsetC, extentC).device(*device) =
+            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
+      }
+    }
+    EigenDeviceWarpper::free_device(device);
+  }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template struct EigenBlasGemm<double>;
+#else
+template struct EigenBlasGemm<float>;
+#endif
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/EigenThreadDevice.h b/paddle/legacy/function/EigenThreadDevice.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb92251c827a26d55ca021c4418182bae28dd6a5
--- /dev/null
+++ b/paddle/legacy/function/EigenThreadDevice.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License. */
+
+#pragma once
+
+#if defined(__OSX__) || defined(__APPLE__)
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+
+#if defined(__ANDROID__)
+int GetCpuCount() {
+  FILE* fp = fopen("/sys/devices/system/cpu/possible", "r");
+  if (!fp) {
+    return 1;
+  }
+  int rank0, rank1;
+  int num = fscanf(fp, "%d-%d", &rank0, &rank1);
+  fclose(fp);
+  if (num < 2) return 1;
+  return rank1 + 1;
+}
+#elif defined(__OSX__) || defined(__APPLE__)
+int GetCpuCount() {
+  int count = 0;
+  size_t len = sizeof(int);
+  sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
+  return count > 0 ? count : 1;
+}
+#else
+int GetCpuCount() { return 1; }
+#endif
+
+class EigenDeviceWarpper {
+ public:  // NOLINT
+#if EIGEN_USE_THREADS
+  static Eigen::ThreadPoolDevice* device() {
+    const int num_cpus = GetCpuCount();
+    const int num_threads = (num_cpus > 2) ? 2 : num_cpus;
+    static Eigen::ThreadPool tp(num_threads);
+    static Eigen::ThreadPoolDevice* device =
+        new Eigen::ThreadPoolDevice(&tp, num_threads);
+    return device;
+  }
+
+  static void free_device(Eigen::ThreadPoolDevice* device) {
+    // do nothing
+  }
+#else
+  static Eigen::DefaultDevice* device() {
+    Eigen::DefaultDevice* device = new Eigen::DefaultDevice;
+    return device;
+  }
+
+  static void free_device(Eigen::DefaultDevice* device) { delete device; }
+#endif
+};
+
+}  // namespace paddle
diff --git a/paddle/function/Function.cpp b/paddle/legacy/function/Function.cpp
similarity index 100%
rename from paddle/function/Function.cpp
rename to paddle/legacy/function/Function.cpp
diff --git a/paddle/legacy/function/Function.h b/paddle/legacy/function/Function.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc5ef7e6f20b63a120a577ded876820aafecff19
--- /dev/null
+++ b/paddle/legacy/function/Function.h
@@ -0,0 +1,214 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include "BufferArg.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Any.h"
+#include "paddle/legacy/utils/ClassRegistrar.h"
+#include "paddle/legacy/utils/Error.h"
+
+namespace paddle {
+
+/**
+ * Function Configuration.
+ * The argument type of Function::init.
+ */
+class FuncConfig {
+ public:
+  template <typename T>
+  T get(const std::string& key, Error* err = nullptr) const {
+    try {
+      return any_cast<T>(valueMap_.at(key));
+    } catch (std::exception& e) {  // could be cast or out of range exception.
+      if (err) {
+        *err = Error(e.what());
+      } else {
+        LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
+      }
+      return T();
+    }
+  }
+
+  template <typename T>
+  FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
+    auto it = valueMap_.find(key);
+    if (it != valueMap_.end()) {  // already contains key.
+      if (err) {
+        *err = Error("Key %s is already set in FuncConfig", key.c_str());
+      } else {
+        LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
+      }
+      return *this;
+    }
+    valueMap_[key] = any(v);
+    return *this;
+  }
+
+ protected:
+  mutable std::unordered_map<std::string, any> valueMap_;
+};
+
+/**
+ * Argument type for Function::calc().
+ * A BufferArgs contains a set of BufferArg,
+ * because Function can have multiple inputs and outputs.
+ *
+ * addArg() with Matix object used to adapt Layer Argument.
+ * Will create a BufferArg object in addArg(),
+ * and free in destructor of BufferArgs.
+ *
+ * addArg() with BufferArg object, just save BufferArg object address,
+ * and the caller needs to guarantee the validity of the BufferArg object
+ * in the BufferArgs life time.
+ */
+class BufferArgs {
+ public:
+  BufferArgs() {}
+
+  ~BufferArgs() {
+    for (auto arg : _args_) {
+      delete arg;
+    }
+  }
+
+  size_t size() const { return args_.size(); }
+
+  // add argument into BufferArgs
+  // Tensor can be Matrix, Vector, IVector.
+  // For inputs, do not need argType.
+  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
+  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  // Add arg into BufferArgs and reshape the arg.
+  //
+  // For example, arg represents an image buffer,
+  // but Matrix can only represent a two-dimensional Tensor.
+  // So need an extra argument to describe the shape of the image buffer.
+  void addArg(const Matrix& arg,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED);
+
+  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+
+  void addArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED);
+
+  // get argument
+  const BufferArg& operator[](size_t num) const {
+    CHECK_LT(num, args_.size());
+    return *args_[num];
+  }
+
+  void addArg(BufferArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
+
+ private:
+  std::vector<BufferArg*> args_;
+  // The BufferArg object is constructed and freed by BufferArgs.
+  std::vector<BufferArg*> _args_;
+};
+
+/**
+ * \brief Base class for Function.
+ * The basic Function implementation requires override init and calc interfaces.
+ *
+ * The caller needs to ensure the validity of the arguments
+ * during Function execution.
+ *
+ * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
+ * and ADD_TO.
+ * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
+ * result of Function assigned to the output BufferArg.
+ * If output.getArgType() == ADD_TO, this is add mode, and the calculation
+ * result of Function need added to the output BufferArg.
+ *
+ * For example:
+ * ASSIGN_TO: output = Function(inputs)
+ * ADD_TO: output += Function(inputs)
+ * If Function has more than one output, each output can have different modes.
+ */
+class FunctionBase {
+ public:
+  virtual ~FunctionBase() {}
+
+  virtual void init(const FuncConfig& config) {}
+
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  // This member function is used to check whether the BufferType and shape of
+  // the inputs and outputs arguments of the Function are correct.
+  // General calc function which will call this check to do arguments check.
+  // And before the calc called, the caller can also check their own arguments.
+  virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  // Calculate the number of floating-point operations of this Function.
+  // The inputs and outputs arguments do not need to contain the actual data,
+  // only the shape.
+  // And some Functions have the same input and output shapes,
+  // so you may not need to enter the complete number of arguments.
+  // But entering the full arguments is always correct for this interface.
+  virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) {
+    return 0;
+  }
+
+  int getNumInputs() const { return numInputs_; }
+
+  int getNumOutputs() const { return numOutputs_; }
+
+  static ClassRegistrar<FunctionBase> funcRegistrar_;
+
+ protected:
+  // numInputs_ and numOutputs_ represents the maximum
+  // input and output supported by Function.
+  // Some functions are optimized for input and output,
+  // so when comparing the number of arguments, for these functions
+  // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_
+  size_t numInputs_;
+  size_t numOutputs_;
+};
+
+#define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
+
+#define REGISTER_TYPED_FUNC(typeName, deviceName, className)   \
+  static InitFunction __reg_type_##typeName##deviceName([]() { \
+    FunctionBase::funcRegistrar_                               \
+        .registerClass<className<DEVICE_TYPE_##deviceName>>(   \
+            FUNC_NAME(typeName, deviceName));                  \
+  })
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/FunctionTest.cpp b/paddle/legacy/function/FunctionTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1a0993e3135bcad9eb8a431e079ed56a267174ea
--- /dev/null
+++ b/paddle/legacy/function/FunctionTest.cpp
@@ -0,0 +1,166 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/SparseMatrix.h"
+
+namespace paddle {
+
+template <DeviceType DType>
+void FunctionApi(typename Tensor<real, DType>::Matrix& output,
+                 const typename Tensor<real, DType>::Matrix& input);
+
+template <>
+void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 100U);
+  EXPECT_EQ(output.getWidth(), 200U);
+}
+
+template <>
+void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 10U);
+  EXPECT_EQ(output.getWidth(), 20U);
+}
+
+template <DeviceType DType>
+void Function(const BufferArgs& arguments) {
+  const auto input = arguments[0].matrix<DType>();
+  auto output = arguments[1].matrix<DType>();
+  FunctionApi<DType>(output, input);
+}
+
+TEST(Function, BufferArgs) {
+  CpuMatrix cpuInput = CpuMatrix(100, 200);
+  CpuMatrix cpuOutput = CpuMatrix(100, 200);
+  BufferArgs cpuArgments;
+  cpuArgments.addArg(cpuInput);
+  cpuArgments.addArg(cpuOutput);
+  Function<DEVICE_TYPE_CPU>(cpuArgments);
+
+  GpuMatrix gpuInput = GpuMatrix(10, 20);
+  GpuMatrix gpuOutput = GpuMatrix(10, 20);
+  BufferArgs gpuArgments;
+  gpuArgments.addArg(gpuInput);
+  gpuArgments.addArg(gpuOutput);
+  Function<DEVICE_TYPE_GPU>(gpuArgments);
+}
+
+/**
+ * Some tests case are used to check the consistency between the BufferArg type
+ * argument received by Function and the original type argument.
+ *
+ * Use Case:
+ *  TEST() {
+ *    Matrix matrix(...);
+ *    CheckBufferArg lambda = [=](const BufferArg& arg) {
+ *      // check matrix and arg are equivalent
+ *      EXPECT_EQ(matrix, arg);
+ *    }
+ *
+ *   BufferArgs argments{matrix...};
+ *   std::vector<CheckBufferArg> checkFunc{lambda...};
+ *   testBufferArgs(argments, checkFunc);
+ *  }
+ */
+typedef std::function<void(const BufferArg&)> CheckBufferArg;
+
+void testBufferArgs(const BufferArgs& inputs,
+                    const std::vector<CheckBufferArg>& check) {
+  EXPECT_EQ(inputs.size(), check.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    check[i](inputs[i]);
+  }
+}
+
+void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
+  EXPECT_EQ(inputs.size(), 1U);
+  check(inputs[0]);
+}
+
+TEST(Arguments, Matrix) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2U);
+    EXPECT_EQ(arg.shape()[0], 100U);
+    EXPECT_EQ(arg.shape()[1], 200U);
+    EXPECT_EQ(arg.data(), matrix->getData());
+
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, Vector) {
+  VectorPtr vector = Vector::create(100, false);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 1U);
+    EXPECT_EQ(arg.shape()[0], 100U);
+    EXPECT_EQ(arg.data(), vector->getData());
+
+    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*vector);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, CpuSparseMatrix) {
+  CpuSparseMatrix sparse(200, 300, 50);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2U);
+    EXPECT_EQ(arg.shape()[0], 200U);
+    EXPECT_EQ(arg.shape()[1], 300U);
+    EXPECT_EQ(arg.data(), sparse.getData());
+    // CHECK_EQ(arg.sparse().nnz(), 50);
+    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
+  };
+
+  BufferArgs argments;
+  argments.addArg(sparse);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, BufferArg) {
+  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 3U);
+    EXPECT_EQ(arg.shape()[0], 1U);
+    EXPECT_EQ(arg.shape()[1], 2U);
+    EXPECT_EQ(arg.shape()[2], 3U);
+  };
+
+  BufferArgs argments;
+  argments.addArg(arg);
+  testBufferArgs(argments, check);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/FunctionTest.h b/paddle/legacy/function/FunctionTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f01981a34bff0a7d9bb04d0a0012117ecf5f803
--- /dev/null
+++ b/paddle/legacy/function/FunctionTest.h
@@ -0,0 +1,410 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/legacy/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
+
+namespace paddle {
+
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+
+namespace test {
+template <DeviceType DType>
+struct Allocator;
+
+template <>
+struct Allocator<DEVICE_TYPE_CPU> {
+  using type = CpuMemoryHandle;
+};
+
+template <>
+struct Allocator<DEVICE_TYPE_GPU> {
+  using type = GpuMemoryHandle;
+};
+
+// Copy argument1 to argument2
+template <DeviceType DType1, DeviceType DType2>
+class CopyArgument {
+ public:
+  void operator()(const BufferArg& arg1, BufferArg& arg2) {
+    CHECK_EQ(arg1.valueType(), arg2.valueType());
+    CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
+
+    if (arg1.valueType() == VALUE_TYPE_INT32) {
+      IVectorPtr vector1 =
+          IVector::create((int*)arg1.data(),
+                          arg1.shape().getElements(),
+                          DType1 == DEVICE_TYPE_CPU ? false : true);
+      IVectorPtr vector2 =
+          IVector::create((int*)arg2.data(),
+                          arg2.shape().getElements(),
+                          DType2 == DEVICE_TYPE_CPU ? false : true);
+      vector2->copyFrom(*vector1);
+    } else {
+      VectorPtr vector1 =
+          Vector::create((real*)arg1.data(),
+                         arg1.shape().getElements(),
+                         DType1 == DEVICE_TYPE_CPU ? false : true);
+      VectorPtr vector2 =
+          Vector::create((real*)arg2.data(),
+                         arg2.shape().getElements(),
+                         DType2 == DEVICE_TYPE_CPU ? false : true);
+      vector2->copyFrom(*vector1);
+    }
+  }
+};
+}  // namespace test
+
+/**
+ * \brief A class for comparing two Functions of different implementations.
+ *        For example, can be used to compare the CPU and GPU implementation
+ *        of the function is consistent.
+ *
+ * Use case:
+ *  // Initializes a test object, the corresponding cpu and gpu Function
+ *  // are constructed according to FunctionName and FuncConfig.
+ *  CpuGpuFuncCompare test(FunctionName, FuncConfig);
+ *  // Prepare inputs and outputs arguments.
+ *  // Here the input and output can not contain real data,
+ *  // only contains the argument type and shape.
+ *  test.addInputs(input1);
+ *  test.addInputs(input2);
+ *  test.addOutputs(output1);
+ *  test.addOutputs(output2);
+ *  // Run.
+ *  // Will according to the type and shape of arguments(inputs_/outputs_),
+ *  // automatic initialization cpu and gpu function required arguments
+ *  // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_).
+ *  // Call the CPU and GPU Function calculation results.
+ *  // Compares CPU and GPU calculation results for consistency.
+ *  test.run();
+ */
+template <DeviceType DType1, DeviceType DType2>
+class Compare2Function {
+ public:
+  typedef typename test::Allocator<DType1>::type Allocator1;
+  typedef typename test::Allocator<DType2>::type Allocator2;
+  typedef typename Tensor<real, DType1>::Vector Vector1;
+  typedef typename Tensor<real, DType2>::Vector Vector2;
+  typedef typename Tensor<real, DType1>::SparseMatrix SparseMatrix1;
+  typedef typename Tensor<real, DType2>::SparseMatrix SparseMatrix2;
+
+  Compare2Function(const std::string& name1,
+                   const std::string& name2,
+                   const FuncConfig& config)
+      : function1_(FunctionBase::funcRegistrar_.createByType(name1)),
+        function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
+    function1_->init(config);
+    function2_->init(config);
+    initArgsCallback_ = nullptr;
+  }
+
+  ~Compare2Function() {}
+
+  // input need only contains shape, do not contains data.
+  void addInputs(const BufferArg& input) {
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
+
+    func1Inputs_.emplace_back(std::make_shared<BufferArg>(
+        func1Memory_.back()->getBuf(), input.valueType(), input.shape()));
+    func2Inputs_.emplace_back(std::make_shared<BufferArg>(
+        func2Memory_.back()->getBuf(), input.valueType(), input.shape()));
+  }
+
+  // assume one copy of sequence is shared by different SequenceArgs
+  void addSequence(const SequenceIdArg& input) {
+    CHECK_EQ(input.shape().ndims(), 1UL);
+    size_t batchSize = input.shape()[0];
+    size_t numSeqs = batchSize / 10 + 1;
+    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(sizeId));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(sizeId));
+    seq1_ = std::make_shared<SequenceIdArg>(func1Memory_.back()->getBuf(),
+                                            TensorShape{numSeqs + 1});
+    seq2_ = std::make_shared<SequenceIdArg>(func2Memory_.back()->getBuf(),
+                                            TensorShape{numSeqs + 1});
+    /// init sequence Id
+    initArg(*seq1_, batchSize);
+
+    copyArg_(*seq1_, *seq2_);
+  }
+
+  void addInputs(const SequenceArg& input) {
+    CHECK_EQ(input.shape().ndims(), 2UL);
+    size_t batchSize = input.shape()[0];
+    if (!seq1_ || !seq2_) {  // sequence not exist
+      addSequence(SequenceIdArg(TensorShape{batchSize}));
+    }
+
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
+
+    /// SequenceArg
+    func1Inputs_.emplace_back(
+        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
+                                      input.valueType(),
+                                      input.shape(),
+                                      *seq1_));
+    func2Inputs_.emplace_back(
+        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
+                                      input.valueType(),
+                                      input.shape(),
+                                      *seq2_));
+  }
+
+  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
+    initArgsCallback_ = callback;
+  }
+
+  // output need only contains shape, do not contains data.
+  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
+
+    func1Outputs_.emplace_back(
+        std::make_shared<BufferArg>(func1Memory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    argType));
+    func2Outputs_.emplace_back(
+        std::make_shared<BufferArg>(func2Memory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    argType));
+  }
+
+  /// add and init output sparse matrix
+  void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
+    sparse1_ = std::make_shared<SparseMatrix1>(
+        output.shape()[0],
+        output.shape()[1],
+        output.nnz(),
+        static_cast<SparseValueType>(output.dataType()),
+        static_cast<SparseFormat>(output.dataFormat()));
+
+    sparse2_ = std::make_shared<SparseMatrix2>(
+        output.shape()[0],
+        output.shape()[1],
+        output.nnz(),
+        static_cast<SparseValueType>(output.dataType()),
+        static_cast<SparseFormat>(output.dataFormat()));
+
+    /// init sparse matrix
+    hl_stream_t stream(HPPL_STREAM_1);
+    sparse1_->randomizeUniform();
+    sparse2_->copyFrom(*sparse1_, stream);
+    hl_stream_synchronize(stream);
+
+    func1Outputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*sparse1_, argType));
+    func2Outputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*sparse2_, argType));
+  }
+
+  void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
+    CHECK_EQ(output.shape().ndims(), 2UL);
+    size_t batchSize = output.shape()[0];
+
+    if (!seq1_ || !seq2_) {  // sequence not exist
+      addSequence(SequenceIdArg(TensorShape{batchSize}));
+    }
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
+
+    /// SequenceArg
+    func1Outputs_.emplace_back(
+        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
+                                      output.valueType(),
+                                      output.shape(),
+                                      *seq1_,
+                                      argType));
+    func2Outputs_.emplace_back(
+        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
+                                      output.valueType(),
+                                      output.shape(),
+                                      *seq2_,
+                                      argType));
+  }
+
+  void addInputs(const SparseMatrixArg& input) {
+    sparse1_ = std::make_shared<SparseMatrix1>(
+        input.shape()[0],
+        input.shape()[1],
+        input.nnz(),
+        static_cast<SparseValueType>(input.dataType()),
+        static_cast<SparseFormat>(input.dataFormat()));
+
+    sparse2_ = std::make_shared<SparseMatrix2>(
+        input.shape()[0],
+        input.shape()[1],
+        input.nnz(),
+        static_cast<SparseValueType>(input.dataType()),
+        static_cast<SparseFormat>(input.dataFormat()));
+
+    /// init sparse matrix
+    hl_stream_t stream(HPPL_STREAM_1);
+    sparse1_->randomizeUniform();
+    sparse2_->copyFrom(*sparse1_, stream);
+    hl_stream_synchronize(stream);
+
+    func1Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse1_));
+    func2Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse2_));
+  }
+
+  void run() {
+    // prepare cpu/gpu arguments
+    initInputs();
+
+    initOutputs();
+    // function calculate
+    auto callFunction = [](FunctionBase* function,
+                           std::vector<BufferArgPtr>& inputs,
+                           std::vector<BufferArgPtr>& outputs) {
+      BufferArgs inArgs;
+      BufferArgs outArgs;
+      for (auto arg : inputs) {
+        inArgs.addArg(*arg);
+      }
+      for (auto arg : outputs) {
+        outArgs.addArg(*arg);
+      }
+      function->calc(inArgs, outArgs);
+    };
+
+    callFunction(function1_.get(), func1Inputs_, func1Outputs_);
+    callFunction(function2_.get(), func2Inputs_, func2Outputs_);
+
+    // check outputs
+    compareOutputs();
+  }
+
+  std::shared_ptr<FunctionBase> getFunction1() const { return function1_; }
+
+  std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
+
+ protected:
+  // only init cpu argument, gpu argument copy from cpu argument.
+  void initArg(BufferArg& arg) {
+    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
+
+  void initArg(SequenceArg& arg) {
+    /// init only matrix
+    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
+
+  void initArg(SequenceIdArg& arg, size_t batchSize) {
+    size_t numSeqs = arg.numSeqs();
+    int* buf = reinterpret_cast<int*>(arg.data());
+    int pos = 0;
+    size_t maxLen = 2 * batchSize / numSeqs;
+    for (int i = 0; i < (int)numSeqs; ++i) {
+      int len = 1 + uniformRandom(std::min<int64_t>(
+                        maxLen, batchSize - pos - numSeqs + i));
+      buf[i] = pos;
+      pos += len;
+      VLOG(1) << " len=" << len;
+    }
+    buf[numSeqs] = batchSize;
+  }
+
+  void initInputs() {
+    for (size_t i = 0; i < func1Inputs_.size(); i++) {
+      if (func1Inputs_[i]->isSparseArg()) {
+        continue;  /// sparse matrix already init
+      }
+
+      if (func1Inputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*func1Inputs_[i]));
+      } else {
+        initArg(*func1Inputs_[i]);
+      }
+
+      if (initArgsCallback_ != nullptr) {
+        initArgsCallback_(*func1Inputs_[i], i);
+      }
+
+      copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
+    }
+  }
+
+  void initOutputs() {
+    for (size_t i = 0; i < func1Outputs_.size(); i++) {
+      if (func1Outputs_[i]->isSparseArg()) {
+        continue;  /// sparse matrix already init
+      }
+
+      if (func1Outputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*func1Outputs_[i]));
+      } else {
+        initArg(*func1Outputs_[i]);
+      }
+
+      copyArg_(*func1Outputs_[i], *func2Outputs_[i]);
+    }
+  }
+
+  void compareOutputs() {
+    for (size_t i = 0; i < func1Outputs_.size(); i++) {
+      // TODO, Need a BufferCheck used to compare the two buffers.
+      const auto cpu = func1Outputs_[i];
+      const auto gpu = func2Outputs_[i];
+      CHECK_EQ(cpu->numElements(), gpu->numElements());
+      Vector1 cpuVector(cpu->numElements(), (real*)cpu->data());
+      Vector2 gpuVector(gpu->numElements(), (real*)gpu->data());
+      autotest::TensorCheckErr(cpuVector, gpuVector);
+    }
+  }
+
+ protected:
+  std::shared_ptr<FunctionBase> function1_;
+  std::shared_ptr<FunctionBase> function2_;
+  std::vector<std::shared_ptr<Allocator1>> func1Memory_;
+  std::vector<std::shared_ptr<Allocator2>> func2Memory_;
+  std::vector<BufferArgPtr> func1Inputs_;
+  std::vector<BufferArgPtr> func1Outputs_;
+  std::vector<BufferArgPtr> func2Inputs_;
+  std::vector<BufferArgPtr> func2Outputs_;
+  std::shared_ptr<SparseMatrix1> sparse1_;
+  std::shared_ptr<SparseMatrix2> sparse2_;
+  std::shared_ptr<SequenceIdArg> seq1_;
+  std::shared_ptr<SequenceIdArg> seq2_;
+  test::CopyArgument<DType1, DType2> copyArg_;
+  std::function<void(BufferArg&, size_t)> initArgsCallback_;
+};
+
+class CpuGpuFuncCompare
+    : public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
+ public:
+  CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
+      : Compare2Function(name + "-CPU", name + "-GPU", config) {}
+
+  ~CpuGpuFuncCompare() {}
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/GemmConvOp.cpp b/paddle/legacy/function/GemmConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5a81315661dc2843a648315ca4a6b590f217a657
--- /dev/null
+++ b/paddle/legacy/function/GemmConvOp.cpp
@@ -0,0 +1,522 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvOp.h"
+#include "GemmFunctor.h"
+#include "Im2Col.h"
+#include "paddle/legacy/math/MemoryHandle.h"
+
+namespace paddle {
+
+/*
+ * \brief Forward calculation of convolution.
+ */
+template <DeviceType Device>
+class GemmConvFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid useing 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColFunctor<kCFO, Device, real> im2col;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          im2col(inputData + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
+        int M = outputChannels / groups_;
+        int N = outputHeight * outputWidth;
+        int K = inputChannels / groups_ * filterHeight * filterWidth;
+        BlasGemm<Device, real>::compute(false,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        K,
+                                        colData,
+                                        N,
+                                        beta,
+                                        outputData + g * outputOffset,
+                                        N);
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifdef PADDLE_MOBILE_INFERENCE
+
+/*
+ * \brief Forward calculation of convolution, optimized for mobile.
+ */
+template <DeviceType Device>
+class GemmConvMobileFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid useing 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    real* colData = NULL;
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+    TensorShape colShape;
+
+    // Max col matrix width 4096, Max col matrix size 4M.
+    size_t outputHeightSteps =
+        std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
+    size_t maxColWidth = outputHeightSteps * outputWidth;
+    size_t channelSteps =
+        std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
+                          (size_t)1),
+                 inputChannels / groups_);
+    size_t maxColHeight = channelSteps * filterHeight * filterWidth;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+
+      resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColMobileFunctor<real> im2col;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    int nStride = outputHeight * outputWidth;
+    int kStride = inputChannels / groups_ * filterHeight * filterWidth;
+    for (size_t i = 0; i < batchSize; i++) {
+      filterData = inputs[1].data<real>();
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          real beta_ = beta;
+          for (size_t ic = 0; ic < inputChannels / groups_;
+               ic += channelSteps) {
+            int channels = std::min(inputChannels / groups_ - ic, channelSteps);
+            for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
+              int height = std::min(outputHeight - oh, outputHeightSteps);
+
+              int M = outputChannels / groups_;
+              int N = height * outputWidth;
+              int K = channels * filterHeight * filterWidth;
+              // im2col
+              im2col(inputData,
+                     imShape,
+                     colData,
+                     colShape,
+                     strideH(),
+                     strideW(),
+                     paddingH(),
+                     paddingW(),
+                     dilationH(),
+                     dilationW(),
+                     channels,
+                     oh,
+                     height,
+                     N);
+
+              // gemm
+              BlasGemm<Device, real>::compute(
+                  false,
+                  false,
+                  M,
+                  N,
+                  K,
+                  1.0f,
+                  filterData + ic * filterHeight * filterWidth,
+                  kStride,
+                  colData,
+                  N,
+                  beta_,
+                  outputData + oh * outputWidth,
+                  nStride);
+            }
+            beta_ = 1.0;
+          }
+        } else {
+          int M = outputChannels / groups_;
+          int N = outputHeight * outputWidth;
+          int K = inputChannels / groups_ * filterHeight * filterWidth;
+          BlasGemm<Device, real>::compute(false,
+                                          false,
+                                          M,
+                                          N,
+                                          K,
+                                          1.0f,
+                                          filterData,
+                                          K,
+                                          inputData,
+                                          N,
+                                          beta,
+                                          outputData,
+                                          N);
+        }
+        inputData += inputOffset;
+        outputData += outputOffset;
+        filterData += filterOffset;
+      }
+    }
+
+    memory_.reset();
+  }
+};
+
+#endif
+
+/*
+ * \brief Backward input calculation of convolution.
+ */
+template <DeviceType Device>
+class GemmConvGradInputFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // Since the implementation of Col2ImFunctor is ADD_TO,
+    // this function only supports ADD_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* outputGrad = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* inputGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Col2ImFunctor<kCFO, Device, real> col2im;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        int K = outputChannels / groups_;
+        int N = outputHeight * outputWidth;
+        int M = inputChannels / groups_ * filterHeight * filterWidth;
+        real scale = 0.0f;
+        if (!needIm2col) {
+          colData = inputGrad + g * inputOffset;
+          scale = 1.0f;
+        }
+        BlasGemm<Device, real>::compute(true,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        M,
+                                        outputGrad + g * outputOffset,
+                                        N,
+                                        scale,
+                                        colData,
+                                        N);
+        if (needIm2col) {
+          col2im(inputGrad + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
+        }
+      }
+      inputGrad += inputChannels * inputHeight * inputWidth;
+      outputGrad += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+/*
+ * \brief Backward filter calculation of convolution.
+ */
+template <DeviceType Device>
+class GemmConvGradFilterFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* outputGrad = inputs[0].data<real>();
+    real* inputData = inputs[1].data<real>();
+    real* filterGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColFunctor<kCFO, Device, real> im2col;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          im2col(inputData + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
+        int M = outputChannels / groups_;
+        int K = outputHeight * outputWidth;
+        int N = inputChannels / groups_ * filterHeight * filterWidth;
+        BlasGemm<Device, real>::compute(false,
+                                        true,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        outputGrad + g * outputOffset,
+                                        K,
+                                        colData,
+                                        K,
+                                        i == 0 ? beta : 1.0f,
+                                        filterGrad + g * filterOffset,
+                                        N);
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputGrad += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifdef PADDLE_MOBILE_INFERENCE
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
+#else
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
+#endif
+REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
+REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/legacy/function/GemmConvOpTest.cpp
similarity index 100%
rename from paddle/function/GemmConvOpTest.cpp
rename to paddle/legacy/function/GemmConvOpTest.cpp
diff --git a/paddle/legacy/function/GemmFunctor.cpp b/paddle/legacy/function/GemmFunctor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..450293dfeea170e287cfc90226dabad25c76e537
--- /dev/null
+++ b/paddle/legacy/function/GemmFunctor.cpp
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GemmFunctor.h"
+#include "paddle/legacy/math/MathFunctions.h"
+
+namespace paddle {
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_CPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+#ifdef PADDLE_USE_EIGEN_FOR_BLAS
+    EigenBlasGemm<T>::compute(
+        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+#else
+    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
+            transB == false ? CblasNoTrans : CblasTrans,
+            M,
+            N,
+            K,
+            alpha,
+            A,
+            lda,
+            B,
+            ldb,
+            beta,
+            C,
+            ldc);
+#endif
+  }
+};
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_GPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    hl_matrix_mul((T*)A,
+                  transA == false ? HPPL_OP_N : HPPL_OP_T,
+                  (T*)B,
+                  transB == false ? HPPL_OP_N : HPPL_OP_T,
+                  C,
+                  M,
+                  N,
+                  K,
+                  alpha,
+                  beta,
+                  lda,
+                  ldb,
+                  ldc);
+  }
+};
+
+template struct BlasGemm<DEVICE_TYPE_CPU, real>;
+template struct BlasGemm<DEVICE_TYPE_GPU, real>;
+
+}  // namespace paddle
diff --git a/paddle/function/GemmFunctor.h b/paddle/legacy/function/GemmFunctor.h
similarity index 100%
rename from paddle/function/GemmFunctor.h
rename to paddle/legacy/function/GemmFunctor.h
diff --git a/paddle/function/GruFunctor.h b/paddle/legacy/function/GruFunctor.h
similarity index 100%
rename from paddle/function/GruFunctor.h
rename to paddle/legacy/function/GruFunctor.h
diff --git a/paddle/legacy/function/Im2Col.h b/paddle/legacy/function/Im2Col.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0ce6918a2a5324a396ade734945cf426b81ab56
--- /dev/null
+++ b/paddle/legacy/function/Im2Col.h
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "TensorShape.h"
+#include "TensorType.h"
+#include "neon/neon_util.h"
+
+namespace paddle {
+
+/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
+enum ColFormat { kCFO = 0, kOCF = 1 };
+
+/*
+ * \brief Converts the image data of three dimensions(CHW) into a colData of
+ *        five dimensions in the Im2ColFunctor calculation,
+ *        And in the Col2ImFunctor calculation, it is reversed.
+ *
+ * \param imData   Image data.
+ * \param imShape  The shape of imData,
+ *                 [inputChannels, inputHeight, inputWidth].
+ * \param colData  Column data.
+ * \param colShape The shape of colData.
+ *
+ * If the template argument Format is kCFO, the shape of colData is:
+ * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ * So, it is easy to reshape into a convolution matrix for convolution
+ * calculation based on matrix multiplication.
+ * The shape of convolution matrix is [height, width], where the height is equal
+ * inputChannels * filterHeight * filterWidth, and the width is equal
+ * outputHeight * outputWidth.
+ *
+ * Reshape:
+ *     shape of colData           shape of convolution matrix
+ *     [inputChannels,
+ *      filterHeight,
+ *      filterWidth,      ======>      [height, width]
+ *      outputHeight,
+ *      outputWidth]
+ *
+ * If the template argument Format is kOCF, the shape of colData is:
+ * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ * So, it is easy to reshape into a sequence matrix for rnn calculation.
+ * The shape of sequence matrix is [seqLength, stepSize], where the seqLength
+ * is equal outputHeight * outputWidth, and the stepSize is equal
+ * inputChannels * filterHeight * filterWidth.
+ *
+ * Reshape:
+ *     shape of colData             shape of sequence matrix
+ *     [outputHeight,
+ *      outputWidth,
+ *      inputChannels,    ======>    [seqLength, stepSize]
+ *      filterHeight,
+ *      filterWidth]
+ *
+ * \note The caller needs to ensure that imShape.inputChannels is equal to
+ *       colShape.inputChannels.
+ */
+template <ColFormat Format, DeviceType Device, class T>
+class Im2ColFunctor {
+ public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
+};
+
+template <ColFormat Format, DeviceType Device, class T>
+class Col2ImFunctor {
+ public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
+};
+
+template <class T>
+class Im2ColMobileFunctor {
+ public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth,
+                  int inputChannels,
+                  int colOffset,
+                  int colOutputHeight,
+                  int colWidth) {
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputWidth = colShape[4];
+
+    for (int ic = 0; ic < inputChannels; ic++) {
+      for (int oh = 0; oh < colOutputHeight; oh++) {
+        T* dstData = colData + oh * outputWidth;
+        for (int fh = 0; fh < filterHeight; fh++) {
+          for (int fw = 0; fw < filterWidth; fw++) {
+            int imRowIdx = (oh + colOffset) * strideHeight +
+                           fh * dilationHeight - paddingHeight;
+            if (imRowIdx < 0 || imRowIdx >= inputHeight) {
+              memset(dstData, 0, outputWidth * sizeof(T));
+            } else {
+              for (int ow = 0; ow < outputWidth; ow++) {
+                int imColIdx =
+                    ow * strideWidth + fw * dilationWidth - paddingWidth;
+                if (imColIdx < 0 || imColIdx >= inputWidth) {
+                  dstData[ow] = T(0);
+                } else {
+                  dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
+                }
+              }
+            }
+            dstData += colWidth;
+          }
+        }
+      }
+      colData += filterHeight * filterWidth * colWidth;
+      imData += inputHeight * inputWidth;
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/Im2ColOp.cpp b/paddle/legacy/function/Im2ColOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..55a3ff98db63ede96094a3d3fdeedf03b573294f
--- /dev/null
+++ b/paddle/legacy/function/Im2ColOp.cpp
@@ -0,0 +1,245 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+
+namespace paddle {
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+ public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
+          if ((imRowIdx - paddingHeight) < 0 ||
+              (imRowIdx - paddingHeight) >= inputHeight ||
+              (imColIdx - paddingWidth) < 0 ||
+              (imColIdx - paddingWidth) >= inputWidth) {
+            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
+          } else {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            colData[(c * outputHeight + h) * outputWidth + w] =
+                imData[imRowIdx * inputWidth + imColIdx];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+ public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
+          if ((imRowIdx - paddingHeight) >= 0 &&
+              (imRowIdx - paddingHeight) < inputHeight &&
+              (imColIdx - paddingWidth) >= 0 &&
+              (imColIdx - paddingWidth) < inputWidth) {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            imData[imRowIdx * inputWidth + imColIdx] +=
+                colData[(c * outputHeight + h) * outputWidth + w];
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+ public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
+                  imColOffset < 0 || imColOffset >= inputWidth) {
+                colData[colDataOffset] = float(0);
+              } else {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                colData[colDataOffset] = imData[imDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+ public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
+                  imColOffset >= 0 && imColOffset < inputWidth) {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                imData[imDataOffset] += colData[colDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/Im2ColOpGpu.cu b/paddle/legacy/function/Im2ColOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..96dd8f528eaa38f9d174ab7c2a5ea5eb96e2a060
--- /dev/null
+++ b/paddle/legacy/function/Im2ColOpGpu.cu
@@ -0,0 +1,464 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include "hl_device_functions.cuh"
+
+namespace paddle {
+
+template <class T>
+__global__ void im2col(const T* data_im,
+                       int numOuts,
+                       int height,
+                       int width,
+                       int blockH,
+                       int blockW,
+                       int strideH,
+                       int strideW,
+                       int paddingH,
+                       int paddingW,
+                       int dilationH,
+                       int dilationW,
+                       int height_col,
+                       int width_col,
+                       T* data_col) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < numOuts) {
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    int channel_in = index / height_col;
+    int channel_out = channel_in * blockH * blockW;
+    int h_in = h_out * strideH;
+    int w_in = w_out * strideW;
+
+    data_col += (channel_out * height_col + h_out) * width_col + w_out;
+    for (int i = 0; i < blockH; ++i) {
+      for (int j = 0; j < blockW; ++j) {
+        int rIdx = int(h_in + i * dilationH);
+        int cIdx = int(w_in + j * dilationW);
+        if ((rIdx - (int)paddingH) >= (int)height ||
+            (rIdx - (int)paddingH) < 0 ||
+            (cIdx - (int)paddingW) >= (int)width ||
+            (cIdx - (int)paddingW) < 0) {
+          *data_col = 0;
+        } else {
+          rIdx = rIdx + channel_in * height - paddingH;
+          cIdx = cIdx - paddingW;
+          *data_col = data_im[rIdx * width + cIdx];
+        }
+        data_col += height_col * width_col;
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+ public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
+    int numKernels = inputChannels * outputHeight * outputWidth;
+    int blocks = (numKernels + 1024 - 1) / 1024;
+    int blockX = 512;
+    int blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
+                                                    numKernels,
+                                                    inputHeight,
+                                                    inputWidth,
+                                                    filterHeight,
+                                                    filterWidth,
+                                                    strideHeight,
+                                                    strideWidth,
+                                                    paddingHeight,
+                                                    paddingWidth,
+                                                    dilationHeight,
+                                                    dilationWidth,
+                                                    outputHeight,
+                                                    outputWidth,
+                                                    colData);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template <class T>
+__global__ void col2im(size_t n,
+                       const T* data_col,
+                       size_t height,
+                       size_t width,
+                       size_t channels,
+                       size_t blockH,
+                       size_t blockW,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t paddingH,
+                       size_t paddingW,
+                       size_t dilationH,
+                       size_t dilationW,
+                       size_t height_col,
+                       size_t width_col,
+                       T* data_im) {
+  size_t index =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < n) {
+    T val = 0;
+    int w = int(index % width);
+    int h = int((index / width) % height);
+    int c = int(index / (width * height));
+    int filterH = (blockH - 1) * dilationH + 1;
+    int filterW = (blockW - 1) * dilationW + 1;
+
+    if ((w - (int)paddingW) >= 0 &&
+        (w - (int)paddingW) < (width - 2 * paddingW) &&
+        (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
+      // compute the start and end of the output
+      int w_col_start =
+          (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
+      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
+      int h_col_start =
+          (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
+      int h_col_end = min(int(h / strideH + 1), int(height_col));
+
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          // the col location: [c * width * height + h_out, w_out]
+          int h_k = (h - h_col * strideH);
+          int w_k = (w - w_col * strideW);
+          if (h_k % dilationH == 0 && w_k % dilationW == 0) {
+            h_k /= dilationH;
+            w_k /= dilationW;
+            int c_col =
+                (((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
+                    width_col +
+                w_col;
+            val += data_col[c_col];
+          }
+        }
+      }
+      h -= paddingH;
+      w -= paddingW;
+      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
+              h * (width - 2 * paddingW) + w] += val;
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+ public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
+    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
+                        (inputWidth + 2 * paddingWidth);
+
+    size_t blocks = (numKernels + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    // To avoid involving atomic operations, we will launch one kernel per
+    // bottom dimension, and then in the kernel add up the top dimensions.
+    col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        numKernels,
+        colData,
+        inputHeight + 2 * paddingHeight,
+        inputWidth + 2 * paddingWidth,
+        inputChannels,
+        filterHeight,
+        filterWidth,
+        strideHeight,
+        strideWidth,
+        paddingHeight,
+        paddingWidth,
+        dilationHeight,
+        dilationWidth,
+        outputHeight,
+        outputWidth,
+        imData);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+
+template <class T>
+__global__ void im2colOCF(const T* imData,
+                          T* colData,
+                          int inputChannels,
+                          int inputHeight,
+                          int inputWidth,
+                          int filterHeight,
+                          int filterWidth,
+                          int strideHeight,
+                          int strideWidth,
+                          int paddingHeight,
+                          int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
+                          int outputHeight,
+                          int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z; channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset =
+            idx * dilationHeight + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationWidth + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth +
+                       channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth +
+                        channelId * filterHeight * filterWidth +
+                        (shId * outputWidth + swId) *
+                            (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= inputHeight || heightOffset < 0 ||
+            widthOffset >= inputWidth || widthOffset < 0) {
+          colData[colOffset] = T(0);
+        } else {
+          colData[colOffset] = imData[imOffset];
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+ public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
+                                                       colData,
+                                                       inputChannels,
+                                                       inputHeight,
+                                                       inputWidth,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       strideHeight,
+                                                       strideWidth,
+                                                       paddingHeight,
+                                                       paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
+                                                       outputHeight,
+                                                       outputWidth);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template <class T>
+__global__ void col2imOCF(T* imData,
+                          const T* colData,
+                          int inputChannels,
+                          int inputHeight,
+                          int inputWidth,
+                          int filterHeight,
+                          int filterWidth,
+                          int strideHeight,
+                          int strideWidth,
+                          int paddingHeight,
+                          int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
+                          int outputHeight,
+                          int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z; channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset =
+            idx * dilationWidth + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationHeight + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth +
+                       channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth +
+                        channelId * filterHeight * filterWidth +
+                        (shId * outputWidth + swId) *
+                            (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= 0 && heightOffset < inputHeight &&
+            widthOffset >= 0 && widthOffset < inputWidth) {
+          paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+ public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
+                                                       colData,
+                                                       inputChannels,
+                                                       inputHeight,
+                                                       inputWidth,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       strideHeight,
+                                                       strideWidth,
+                                                       paddingHeight,
+                                                       paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
+                                                       outputHeight,
+                                                       outputWidth);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/Im2ColTest.cpp b/paddle/legacy/function/Im2ColTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2c5f06f38991497963cfbe1e12825f1bc39dffa6
--- /dev/null
+++ b/paddle/legacy/function/Im2ColTest.cpp
@@ -0,0 +1,223 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include <gtest/gtest.h>
+#include "Function.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/tests/TensorCheck.h"
+
+namespace paddle {
+
+template <DeviceType Device, class T>
+void TestIm2ColFunctor() {
+  for (size_t channels : {1, 5, 32}) {
+    for (size_t inputHeight : {5, 33, 100}) {
+      for (size_t inputWidth : {5, 32, 96}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                for (size_t dilation : {1, 3}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+                  TensorShape colShape2 = TensorShape({outputHeight,
+                                                       outputWidth,
+                                                       channels,
+                                                       filterHeight,
+                                                       filterWidth});
+
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(width, height, false, false);
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, Device, T> im2Col1;
+                  Im2ColFunctor<kOCF, Device, T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+
+                  // The transposition of the result of ColFormat == kCFO
+                  // is equal to the result of ColFormat == kOCF.
+                  MatrixPtr test;
+                  output2->transpose(test, true);
+                  autotest::TensorCheckErr(*output1, *test);
+
+                  Col2ImFunctor<kCFO, Device, T> col2Im1;
+                  Col2ImFunctor<kOCF, Device, T> col2Im2;
+
+                  col2Im1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  col2Im2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  autotest::TensorCheckErr(*input1, *input2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
+
+#ifdef PADDLE_WITH_CUDA
+
+TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
+
+#endif
+
+template <class T>
+void TestIm2ColMobileFunctor() {
+  for (size_t channels : {32}) {
+    for (size_t inputHeight : {33, 100}) {
+      for (size_t inputWidth : {32, 96}) {
+        for (size_t filterHeight : {5}) {
+          for (size_t filterWidth : {7}) {
+            for (size_t stride : {2}) {
+              for (size_t padding : {1}) {
+                for (size_t dilation : {1, 3}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(height, width, false, false);
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
+                  Im2ColMobileFunctor<T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation,
+                          channels,
+                          0,
+                          outputHeight,
+                          outputHeight * outputWidth);
+
+                  autotest::TensorCheckEqual(*output1, *output2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/MulOp.cpp b/paddle/legacy/function/MulOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..750978fc90201ccdc0a32f93fc01a2170d3f39d5
--- /dev/null
+++ b/paddle/legacy/function/MulOp.cpp
@@ -0,0 +1,347 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MulOp.h"
+#include "GemmFunctor.h"
+#include "paddle/legacy/math/SIMDFunctions.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace {
+inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
+  for (unsigned int i = 0; i < len; ++i) {
+    a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i];
+  }
+}
+
+inline void colVecAddTo(
+    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
+  for (unsigned int i = 0; i < len; ++i) {
+    a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c;
+  }
+}
+}  // namespace
+
+namespace paddle {
+/// sparse matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuSparseMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK_EQ(out.getValueType(), FLOAT_VALUE);
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  const real* A = a.getData();
+  const real* B = b.getData();
+  real* C = out.getValue();
+  int* rows = out.getRows();
+  int* cols = out.getCols();
+  size_t width = out.getWidth();
+  size_t height = out.getHeight();
+
+  /// SPARSE_CSC, {a any, b not trans}
+  if (out.getFormat() == SPARSE_CSC) {
+    /// b not trans and a any
+    CHECK(!bTrans);
+    size_t m = !aTrans ? a.getWidth() : a.getHeight();
+    for (size_t i = 0; i < width; i++) {
+      size_t start = out.getColStartIdx(i);
+      size_t end = out.getColStartIdx(i + 1);
+      for (size_t j = start; j < end; j++) {
+        real sum = 0;
+        size_t rowIdx = rows[j];
+        for (size_t k = 0; k < m; k++) {
+          sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
+                 B[k * width + i];
+        }
+        C[j] = scaleAB * sum + scaleT * C[j];
+      }
+    }
+    return;
+  }
+
+  /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans}
+  if (out.getFormat() == SPARSE_CSR) {
+    /// a and b can not both transpose
+    CHECK(!(aTrans && bTrans));
+    size_t m = a.getWidth();
+    for (size_t i = 0; i < height; i++) {
+      size_t start = out.getRowStartIdx(i);
+      size_t end = out.getRowStartIdx(i + 1);
+      for (size_t j = start; j < end; j++) {
+        real sum = 0;
+        size_t colIdx = cols[j];
+        for (size_t k = 0; k < m; k++) {
+          sum += (!aTrans ? A[i * m + k] : A[k * height + i]) *
+                 (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]);
+        }
+        C[j] = scaleAB * sum + scaleT * C[j];
+      }
+    }
+    return;
+  }
+}
+
+/// dense matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
+      aTrans,
+      bTrans,
+      out.getHeight(),
+      out.getWidth(),
+      !aTrans ? a.getWidth() : a.getHeight(),
+      scaleAB,
+      a.getData(),
+      a.getStride(),
+      b.getData(),
+      b.getStride(),
+      scaleT,
+      out.getData(),
+      out.getStride());
+}
+
+/// dense matrix (+)= sparse matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuSparseMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  const real* B = b.getData();
+  real* C = out.getData();
+  if (out.getWidth() % 32 == 0) {
+    CHECK_EQ((size_t)B % 32, 0UL);
+    CHECK_EQ((size_t)C % 32, 0UL);
+  }
+
+  int* cols = a.getCols();
+  real* values = a.getValue();
+  for (size_t i = 0; i < a.getHeight(); ++i) {
+    const int start = a.getRowStartIdx(i);
+    const int end = a.getRowStartIdx(i + 1);
+    for (int j = start; j < end; ++j) {
+      vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]),
+               !aTrans ? const_cast<CpuMatrix&>(b).getRow(cols[j])
+                       : const_cast<CpuMatrix&>(b).getRow(i),
+               (a.getValueType() == FLOAT_VALUE) ? values[j] : (real)1.0,
+               out.getWidth());
+    }
+  }
+}
+
+/// dense matrix (+)= dense matrix * sparse matrix
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuSparseMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  real* A = const_cast<real*>(a.getData());
+  real* B = const_cast<real*>(b.getValue());
+  real* C = out.getData();
+  int* rows = b.getRows();
+  int* cols = b.getCols();
+
+  /// SPARSE_CSC format
+  if (b.getFormat() == SPARSE_CSC) {
+    for (size_t j = 0; j < b.getWidth(); ++j) {
+      int start = b.getColStartIdx(j);
+      int end = b.getColStartIdx(j + 1);
+      for (int i = start; i < end; ++i) {
+        colVecAddTo(!bTrans ? C + j : C + rows[i],
+                    !bTrans ? A + rows[i] : A + j,
+                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
+                    out.getHeight(),
+                    out.getWidth(),
+                    a.getWidth());
+      }
+    }
+    return;
+  }
+
+  /// SPARSE_CSR format
+  if (b.getFormat() == SPARSE_CSR) {
+    for (size_t j = 0; j < b.getHeight(); ++j) {
+      int start = b.getRowStartIdx(j);
+      int end = b.getRowStartIdx(j + 1);
+      for (int i = start; i < end; ++i) {
+        colVecAddTo(!bTrans ? C + cols[i] : C + j,
+                    !bTrans ? A + j : A + cols[i],
+                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
+                    out.getHeight(),
+                    out.getWidth(),
+                    a.getWidth());
+      }
+    }
+    return;
+  }
+}
+
+/**
+ * mul operator
+ * out = scaleT * out + scaleAB * (A * B)
+ * here, scaleT in {0, 1}, scaleAB == 1,
+ * out = A * B, ASSIGN_TO
+ * out += A * B, ADD_TO
+ *
+ *
+ * \param outputs[0]      output matrix (out), M * N,
+ *                        could be either Sparse or Dense Matrix
+ *                        M is num of rows, N is num of columns
+ * \param inputs[0]       first input matrix (A),  M * K (if non-trans)
+ *                        could be either Sparse or Dense Matrix
+ *                        M is num of rows, K is num of columns
+ * \param inputs[1]       second input matrix (B), K * N (if non-trans)
+ *                        could be either Sparse or Dense Matrix
+ *                        K is num of rows, N is num of columns
+ *
+ * Support eight Mul operators, with both GPU and CPU devices
+ * For each device, four Mul operators are supported:
+ * 1. dense (out) = dense (A) * dense (B)
+ * 2. dense (out) = sparse (A) * dense (B)
+ *    sparse matrix only support SPARSE_CSR format
+ * 3. dense (out) = dense (A) * sparse (B)
+ *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
+ * 4. sparse (out) = dense (A) * dense (B)
+ *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
+ *
+ */
+template <DeviceType Device>
+class MulFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    aTrans_ = config.get<bool>("aTrans");
+    bTrans_ = config.get<bool>("bTrans");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK(!aTrans_ || !bTrans_)
+        << "Not support both a and b are transpose matrices";
+
+    CHECK_EQ((size_t)2, inputs.size());
+    CHECK_EQ((size_t)1, outputs.size());
+    CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+
+    size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1];
+    size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0];
+    size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1];
+    size_t bCol = !bTrans_ ? inputs[1].shape()[1] : inputs[1].shape()[0];
+    /// C = A * B, or C += A * B, for matrix format
+    CHECK_EQ(aCol, bRow);
+    CHECK_EQ(aRow, outputs[0].shape()[0]);
+    CHECK_EQ(bCol, outputs[0].shape()[1]);
+
+    /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO)
+    real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0;
+
+    /// support dense = not both sparse * sparse
+    /// or sparse = dense * dense
+    CHECK((!outputs[0].isSparseArg() &&
+           !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) ||
+          (outputs[0].isSparseArg() && !inputs[0].isSparseArg() &&
+           !inputs[1].isSparseArg()));
+
+    auto outMat = outputs[0].matrix<Device>();
+    /// dense matrix = dense matrix * dense matrix
+    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      MulOp<Device>(outMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// dense matrix = dense matrix * sparse matrix
+    if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      CHECK(!aTrans_) << "Not supported a transpose";
+      MulOp<Device>(outMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].sparse().SparseMatrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// dense matrix = sparse matrix * dense matrix
+    if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      CHECK(!bTrans_) << "Not supported b transpose";
+      CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR)
+          << "Only supported SPARSE_CSR format for sparse matrix a";
+      MulOp<Device>(outMat,
+                    inputs[0].sparse().SparseMatrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// sparse matrix = dense matrix * dense matrix
+    auto outSparseMat = outputs[0].sparse().SparseMatrix<Device>();
+    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        outputs[0].isSparseArg()) {
+      MulOp<Device>(outSparseMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+  }
+
+ private:
+  bool aTrans_;
+  bool bTrans_;
+};
+
+REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/legacy/function/MulOp.h b/paddle/legacy/function/MulOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab33bde17296cd2b17ac45c5a936cfd2727919a5
--- /dev/null
+++ b/paddle/legacy/function/MulOp.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+
+namespace paddle {
+/// CPU, dense matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, dense matrix (+)= sparse matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuSparseMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, dense matrix (+)= dense matrix * sparse matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuMatrix& a,
+           const CpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, sparse matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuSparseMatrix& out,
+           const CpuMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= sparse matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuSparseMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= dense matrix * sparse matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuMatrix& a,
+           const GpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, sparse matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuSparseMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/MulOpGpu.cu b/paddle/legacy/function/MulOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..217c983cb75dfcbc0e17f752a66847c5e92fcc91
--- /dev/null
+++ b/paddle/legacy/function/MulOpGpu.cu
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MulOp.h"
+#include "hl_base.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+
+namespace paddle {
+/// dense matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  hl_matrix_mul(const_cast<real*>(a.getData()),
+                !aTrans ? HPPL_OP_N : HPPL_OP_T,
+                const_cast<real*>(b.getData()),
+                !bTrans ? HPPL_OP_N : HPPL_OP_T,
+                const_cast<real*>(out.getData()),
+                out.getHeight(),
+                out.getWidth(),
+                !aTrans ? a.getWidth() : a.getHeight(),
+                scaleAB,
+                scaleT,
+                a.getStride(),
+                b.getStride(),
+                out.getStride());
+}
+
+/// dense matrix (+)= sparse matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuSparseMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(out.isContiguous());
+  CHECK(b.isContiguous());
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  hl_matrix_csr_mul_dense(a.sMatrix_.get(),
+                          aTrans ? HPPL_OP_T : HPPL_OP_N,
+                          const_cast<real*>(b.getData()),
+                          HPPL_OP_N,
+                          const_cast<real*>(out.getData()),
+                          out.getHeight(),
+                          out.getWidth(),
+                          b.getHeight(),
+                          scaleAB,
+                          scaleT);
+}
+
+/// dense matrix (+)= dense matrix * sparse matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuSparseMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(out.isContiguous());
+  CHECK(a.isContiguous());
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+
+  if (b.format_ == SPARSE_CSC) {
+    hl_matrix_dense_mul_csc(const_cast<real*>(a.getData()),
+                            HPPL_OP_N,
+                            b.sMatrix_.get(),
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
+                            const_cast<real*>(out.getData()),
+                            out.getHeight(),
+                            out.getWidth(),
+                            a.getWidth(),
+                            scaleAB,
+                            scaleT);
+  } else {
+    hl_matrix_dense_mul_csr(const_cast<real*>(a.getData()),
+                            HPPL_OP_N,
+                            b.sMatrix_.get(),
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
+                            const_cast<real*>(out.getData()),
+                            out.getHeight(),
+                            out.getWidth(),
+                            a.getWidth(),
+                            scaleAB,
+                            scaleT);
+  }
+}
+
+/// sparse matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  hl_sparse_matrix_mul(const_cast<real*>(a.getData()),
+                       aTrans ? HPPL_OP_T : HPPL_OP_N,
+                       const_cast<real*>(b.getData()),
+                       bTrans ? HPPL_OP_T : HPPL_OP_N,
+                       out.sMatrix_.get(),
+                       out.getHeight(),
+                       out.getWidth(),
+                       !bTrans ? b.getHeight() : b.getWidth(),
+                       scaleAB,
+                       scaleT);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/MulOpTest.cpp b/paddle/legacy/function/MulOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab08b6f8696ff4aefd2dbdda591b20730b46898c
--- /dev/null
+++ b/paddle/legacy/function/MulOpTest.cpp
@@ -0,0 +1,212 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/legacy/math/tests/test_matrixUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+/**
+ *  C += A * B, A, B, C dense matrix
+ *  dense = dense * dense
+ */
+void testFuncDDDMatrix(
+    bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) {
+  real scaleT = 1.0;
+  size_t heightA = (transa == false) ? dimM : dimK;
+  size_t widthA = (transa == false) ? dimK : dimM;
+  size_t heightB = (transb == false) ? dimK : dimN;
+  size_t widthB = (transb == false) ? dimN : dimK;
+  size_t heightC = dimM;
+  size_t widthC = dimN;
+  // init Test object
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
+  // prepare input arguments
+  /// matrix A : HA * WA
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
+  /// matrix B: HB * WB
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
+
+  /// output matrix C: HC * WC
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
+                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DDDMatrixMul) {
+  LOG(INFO) << "function test for dense = dense * dense matrix";
+  for (const auto transa : {false, true}) {
+    for (const auto transb : {false, true}) {
+      for (const auto dimM : {1, 10, 100}) {
+        for (const auto dimN : {1, 10}) {
+          for (const auto dimK : {8}) {
+            if (transa && transb) {
+              continue;
+            }
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " transa=" << transa << " transb=" << transb
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK;
+            testFuncDDDMatrix(transa, transb, dimM, dimN, dimK);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * C += A * B, B, C dense, A sparse
+ * dense = sparse * dense
+ */
+void testFuncDSparseDMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // init Test object
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
+  // prepare input arguments
+  /// sparse matrix A : M * K
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
+  /// matrix B: K * N
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
+
+  /// output matrix C: M * N
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MuLOp, DSparseDMul) {
+  LOG(INFO) << "function test for dense = sparse * dense matrix";
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          for (const auto FORMAT : {SPARSE_CSR}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK
+                    << " nnz=" << std::setw(5) << nnz
+                    << " format=" << std::setw(5) << FORMAT;
+            testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * C += A * B, A, C dense, B sparse
+ * dense = dense * sparse
+ */
+void testFuncDDSparseMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // init Test object
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
+  // prepare input arguments
+  /// matrix A : M * K
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
+
+  /// matrix B: K * N
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
+
+  /// output matrix C: M * N
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DDSparseMul) {
+  LOG(INFO) << "function test for dense = dense * sparse matrix";
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK
+                    << " nnz=" << std::setw(5) << nnz
+                    << " format=" << std::setw(5) << FORMAT;
+            testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * C += A * B, A sparse, B, C dense
+ * sparse = dense * dense
+ */
+void testFuncSparseDDMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // init Test object
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
+  // prepare input arguments
+  /// matrix A : M * K
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
+
+  /// matrix B: K * N
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
+
+  /// output sparse matrix C: M * N
+  test.addOutputs(
+      SparseMatrixArg(
+          VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
+      scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, SparseDDMul) {
+  LOG(INFO) << "function test for sparse = dense * dense matrix";
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK
+                    << " nnz=" << std::setw(5) << nnz
+                    << " format=" << std::setw(5) << FORMAT;
+            testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/legacy/function/NaiveConvOp.cpp b/paddle/legacy/function/NaiveConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..99c8b81acbbb16a91bc0faa1c7f2873fa94ab108
--- /dev/null
+++ b/paddle/legacy/function/NaiveConvOp.cpp
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvOp.h"
+
+namespace paddle {
+
+/*
+ * The three arguments are stored in memory in row major order.
+ * inputData  = [batchSize, inputChannels, inputHeight, inputWidth]
+ * filterData = [outputChannels, inputChannels, filterHeight, filterWidth]
+ * outputData = [batchSize, outputChannels, outputHeight, outputWidth]
+ */
+template <class T>
+class NaiveConvFunctor {
+ public:
+  void operator()(const T* inputData,
+                  size_t batchSize,
+                  size_t inputChannels,
+                  size_t inputHeight,
+                  size_t inputWidth,
+                  const T* filterData,
+                  size_t filterHeight,
+                  size_t filterWidth,
+                  T* outputData,
+                  size_t outputChannels,
+                  size_t outputHeight,
+                  size_t outputWidth,
+                  size_t paddingH,
+                  size_t paddingW,
+                  size_t strideH,
+                  size_t strideW) {
+    for (size_t batch = 0; batch < batchSize; batch++) {
+      for (size_t outC = 0; outC < outputChannels; outC++) {
+        for (size_t outH = 0; outH < outputHeight; outH++) {
+          for (size_t outW = 0; outW < outputWidth; outW++) {
+            const int inStartH = (outH * strideH) - paddingH;
+            const int inStartW = (outW * strideW) - paddingW;
+            T outValue = (T)0;
+            for (size_t inC = 0; inC < inputChannels; inC++) {
+              for (size_t fH = 0; fH < filterHeight; fH++) {
+                for (size_t fW = 0; fW < filterWidth; fW++) {
+                  T inValue;
+                  const int inH = inStartH + fH;
+                  const int inW = inStartW + fW;
+                  if ((inH >= 0 && inH < (int)inputHeight) &&
+                      (inW >= 0 && inW < (int)inputWidth)) {
+                    size_t offsetInput =
+                        batch * inputChannels * inputHeight * inputWidth +
+                        inC * inputHeight * inputWidth + inH * inputWidth + inW;
+                    inValue = inputData[offsetInput];
+                  } else {
+                    inValue = (T)0;
+                  }
+                  size_t offsetFilter =
+                      outC * inputChannels * filterHeight * filterWidth +
+                      inC * filterHeight * filterWidth + fH * filterWidth + fW;
+                  T filterValue = filterData[offsetFilter];
+                  outValue += (inValue * filterValue);
+                }
+              }
+            }
+
+            size_t offset =
+                batch * outputChannels * outputHeight * outputWidth +
+                outC * outputHeight * outputWidth + outH * outputWidth + outW;
+            outputData[offset] = outValue;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <DeviceType Device>
+class NaiveConvFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    check(inputs, outputs);
+
+    size_t batchSize = inputs[0].shape()[0];
+    size_t inputChannels = inputs[0].shape()[1];
+    size_t inputHeight = inputs[0].shape()[2];
+    size_t inputWidth = inputs[0].shape()[3];
+    size_t filterHeight = inputs[1].shape()[2];
+    size_t filterWidth = inputs[1].shape()[3];
+    size_t outputChannels = outputs[0].shape()[1];
+    size_t outputHeight = outputs[0].shape()[2];
+    size_t outputWidth = outputs[0].shape()[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    NaiveConvFunctor<real> conv;
+    conv(inputData,
+         batchSize,
+         inputChannels,
+         inputHeight,
+         inputWidth,
+         filterData,
+         filterHeight,
+         filterWidth,
+         outputData,
+         outputChannels,
+         outputHeight,
+         outputWidth,
+         paddingH(),
+         paddingW(),
+         strideH(),
+         strideW());
+  }
+};
+
+REGISTER_TYPED_FUNC(NaiveConv, CPU, NaiveConvFunction);
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/PadOp.cpp b/paddle/legacy/function/PadOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d011d28e6938fac6980bed88f774abdbf3532d4
--- /dev/null
+++ b/paddle/legacy/function/PadOp.cpp
@@ -0,0 +1,215 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadOp.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void Pad<DEVICE_TYPE_CPU>(real* outputs,
+                          const real* inputs,
+                          const int num,
+                          const int inC,
+                          const int inH,
+                          const int inW,
+                          const PadConf& pad) {
+  int cstart = pad.channel[0], cend = pad.channel[1];
+  int hstart = pad.height[0], hend = pad.height[1];
+  int wstart = pad.width[0], wend = pad.width[1];
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
+  for (int i = 0; i < num; i++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int inoff = ((i * inC + c) * inH + h) * inW;
+        int outoff =
+            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
+        memcpy(outputs + outoff, inputs + inoff, inW * sizeof(real));
+      }
+    }
+  }
+}
+
+template <>
+void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
+                              const real* outGrad,
+                              const int num,
+                              const int inC,
+                              const int inH,
+                              const int inW,
+                              const PadConf& pad) {
+  int cstart = pad.channel[0], cend = pad.channel[1];
+  int hstart = pad.height[0], hend = pad.height[1];
+  int wstart = pad.width[0], wend = pad.width[1];
+  int outC = inC + cstart + cend;
+  int outH = inH + hstart + hend;
+  int outW = inW + wstart + wend;
+  for (int i = 0; i < num; i++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int inoff = ((i * inC + c) * inH + h) * inW;
+        int outoff =
+            ((i * outC + c + cstart) * outH + h + hstart) * outW + wstart;
+        CpuVector inG = CpuVector(inW, inGrad + inoff);
+        CpuVector outG = CpuVector(inW, const_cast<real*>(outGrad + outoff));
+        inG += outG;
+      }
+    }
+  }
+}
+
+static inline PadConf castToPadConf(const FuncConfig& conf) {
+  return {conf.get<std::vector<uint32_t>>("channel"),
+          conf.get<std::vector<uint32_t>>("height"),
+          conf.get<std::vector<uint32_t>>("width")};
+}
+
+/**
+ * \brief Padding zeros to input according to the specify dimension.
+ *        The struct pad_ contains the padding size in each dimension.
+ *        The input and output is a 4D tensor. In PadFunc, we only
+ *        pad zeros to the 2nd to 4th dimension.
+ *
+ * Argument in this Function:
+ * \param pad_    A struct object contains the padding size in each dimension.
+ *                It has six integers. The channelStart and channelEnd indicate
+ *                how many zeros to add before and after the input in channel
+ *                dimension. And the heightStart and heightEnd indicate padding
+ *                in height dimension. The widthStart and widthEnd indicate the
+ *                padding in width dimension.
+ * \param inputs  A 4D tensor, only one input.
+ * \param outputs A 4D tensor, the output value after padding.
+ *
+ * For example,
+ * Input(2,2,2,3) = [
+ *                    [ [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]] ],
+ *                    [ [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]] ]
+ *                  ] # the shape is (1,2,2,3)
+ *
+ * pad_: if channelStart = channelEnd = 1, others are 0.
+ * Output(2,4,2,3) = [
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]],
+ *                      [[0,0,0], [0,0,0]] ],
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]],
+ *                      [[0,0,0], [0,0,0]] ]
+ *                   ] # the shape is (2,4,2,3)
+ *
+ * pad_: if widthStart = 1, widthEnd = 2, others are 0.
+ * Output(2,2,2,6) = [
+ *                     [ [[0,1,2,3,0,0], [0,3,4,5,0,0]],
+ *                       [[0,2,3,5,0,0], [0,1,6,7,0,0]] ],
+ *                     [ [[0,4,3,1,0,0], [0,1,8,7,0,0]],
+ *                       [[0,3,8,9,0,0], [0,2,3,5,0,0]] ],
+ *                   ] # the shape is (2,2,2,6)
+ *
+ * pad_: if heightStart = 1, heightEnd = 1, others are 0.
+ * Output(2,2,4,3) = [
+ *                     [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]],
+ *                       [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ],
+ *                     [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]],
+ *                       [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ],
+ *                   ] # the shape is (2,2,4,3)
+ */
+
+template <DeviceType Device>
+class PadFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    size_t num = inputs[0].shape()[0];
+    size_t inC = inputs[0].shape()[1];
+    size_t inH = inputs[0].shape()[2];
+    size_t inW = inputs[0].shape()[3];
+    typename Tensor<real, Device>::Vector vec(outputs[0].shape().getElements(),
+                                              outputs[0].data<real>());
+    vec.zero();
+
+    Pad<Device>(outputs[0].data<real>(),
+                inputs[0].data<real>(),
+                num,
+                inC,
+                inH,
+                inW,
+                pad_);
+  }
+
+ private:
+  PadConf pad_;
+};
+
+/**
+ * \brief The backward propagation of padding Function. Remove the elements
+ *        in the padding positions of forward.
+ *
+ * Argument in this Function:
+ * \param pad_    The same meaning as it in PadFunc.
+ * \param inputs  The gradient with respect to the output value of PadFunc.
+ * \param outputs The gradient with respect to the input value of PadFunc.
+ */
+
+template <DeviceType Device>
+class PadGradFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+
+    size_t num = outputs[0].shape()[0];
+    size_t inC = outputs[0].shape()[1];
+    size_t inH = outputs[0].shape()[2];
+    size_t inW = outputs[0].shape()[3];
+
+    if (outputs[0].getArgType() != ADD_TO) {
+      // for unit test
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
+
+    PadGrad<Device>(outputs[0].data<real>(),
+                    inputs[0].data<real>(),
+                    num,
+                    inC,
+                    inH,
+                    inW,
+                    pad_);
+  }
+
+ private:
+  PadConf pad_;
+};
+
+REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/PadOp.h b/paddle/legacy/function/PadOp.h
similarity index 100%
rename from paddle/function/PadOp.h
rename to paddle/legacy/function/PadOp.h
diff --git a/paddle/function/PadOpGpu.cu b/paddle/legacy/function/PadOpGpu.cu
similarity index 100%
rename from paddle/function/PadOpGpu.cu
rename to paddle/legacy/function/PadOpGpu.cu
diff --git a/paddle/function/PadOpTest.cpp b/paddle/legacy/function/PadOpTest.cpp
similarity index 100%
rename from paddle/function/PadOpTest.cpp
rename to paddle/legacy/function/PadOpTest.cpp
diff --git a/paddle/legacy/function/RowConvOp.cpp b/paddle/legacy/function/RowConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3be50e80d71fabdb3e7a22bfc061da09412c132d
--- /dev/null
+++ b/paddle/legacy/function/RowConvOp.cpp
@@ -0,0 +1,225 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RowConvOp.h"
+#include <iostream>
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void RowConv<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                              const CpuMatrix& in,
+                              const CpuMatrix& filter,
+                              const CpuIVector& seq) {
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  for (size_t i = 0; i < numSeq; ++i) {
+    size_t begin = starts[i];
+    size_t end = starts[i + 1];
+    for (size_t j = begin; j < end; ++j) {
+      MatrixPtr x;
+      MatrixPtr w;
+      if ((j + contextLength) < end) {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, contextLength);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, contextLength);
+      } else {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, end - j);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, end - j);
+      }
+      MatrixPtr y = out.subMatrix(j, 1);
+      y->addDotMulVMM(*x, *w);
+    }
+  }
+}
+
+template <>
+void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
+                                  const CpuMatrix& in,
+                                  const CpuMatrix& filter,
+                                  CpuMatrix& inG,
+                                  CpuMatrix& filterG,
+                                  const CpuIVector& seq) {
+  // gradient w.r.t filter
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  if (filterG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
+        MatrixPtr x =
+            (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
+        MatrixPtr dy =
+            (const_cast<CpuMatrix&>(outG)).subMatrix(begin, steps - j);
+        MatrixPtr dw = filterG.subMatrix(j, 1);
+        dw->addDotMulVMM(*dy, *x);
+      }
+    }
+  }
+
+  // gradient w.r.t input feature
+  if (inG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < steps; ++j) {
+        MatrixPtr dx = inG.subMatrix(begin + j, 1);
+        for (size_t t = 0; t < contextLength; ++t) {
+          if (int(j - t) >= 0) {
+            MatrixPtr dy =
+                (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
+            MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
+            dx->addDotMul(*dy, *w, 1.0, 1.0);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief The row convolution is called lookahead convolution. It is firstly
+ * introduced in deep-speech2 system. The bidirectional RNN that learns
+ * representation for a sequence by performing a forward and a backward pass
+ * through the entire sequence. However, unlike unidirectional RNNs,
+ * bidirectional RNNs are challenging to deploy in an online and low-latency
+ * setting. The lookahead convolution incorporates information from future
+ * subsequences in a computationally efficient manner to improve unidirectional
+ * recurrent neural networks.
+ *
+ * The connection of row convolution is different form the 1D sequence
+ * convolution. Assumed that, the future context-length is k, that is to say,
+ * it can get the output at timestep t by using the the input feature from t-th
+ * timestep to (t+k)-th timestep. Assumed that the hidden dim of input
+ * activations are d, the activations r_t for the new layer at time-step t are:
+ *
+ *
+ *            -- k + 1
+ *  r(t,i) =  >       W(i,j) * h(t+j-1, i),  for (1 <= i <= d)
+ *            -- j = 1
+ *
+ *
+ * The weight shape is: (k + 1) x d
+ * Function Arguments:
+ *
+ * \param inputs[0]  The input activations.
+ * \param inputs[0]  The filter (or weight) and shape is (k+1) x d.
+ * \param outputs[1] The output activations.
+ *
+ * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in
+ * English
+ *     and Mandarin. https://arxiv.org/abs/1512.02595
+ */
+
+template <DeviceType Device>
+class RowConvFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    // TODO(qingqing): support ASSIGN_TO.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto w = inputs[1];
+    CHECK(in.data() && out.data() && in.getSequenceId().data());
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == out.shape());
+    CHECK_EQ(w.shape()[1], in.shape()[1]);
+
+    auto outMat = out.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+
+    RowConv<Device>(outMat, inMat, wMat, seqId);
+  }
+};
+
+/**
+ * \brief The backward of row convolution function. This function calculated
+ * the gradient w.r.t filter and the gradient w.r.t input activations(or data).
+ *
+ * Argument in this Function:
+ *
+ * \param inputs[0]  The gradient w.r.t output activations.
+ * \param inputs[1]  The input activations.
+ * \param inputs[2]  The filter (or weight) and shape is (k+1) x d.
+ * \param outputs[0] The gradient w.r.t input activations.
+ * \param outputs[1] The gradient w.r.r filter.
+ *
+ * Abbreviation:
+ * w.r.t: with respect to.
+ */
+
+template <DeviceType Device>
+class RowConvGradFunc : public FunctionBase {
+  // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
+ public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(3UL, inputs.size());
+    CHECK_EQ(2UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
+          outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+
+    const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
+    const auto w = inputs[2];
+    auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto wGrad = outputs[1];
+
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == inGrad.shape());
+    CHECK(in.shape() == outGrad.shape());
+    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
+
+    const auto outGMat = outGrad.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    auto inGMat = inGrad.data()
+                      ? inGrad.matrix<Device>()
+                      : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    auto wGMat = wGrad.data()
+                     ? wGrad.matrix<Device>()
+                     : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+
+    RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
+  }
+};
+
+REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/RowConvOp.h b/paddle/legacy/function/RowConvOp.h
similarity index 100%
rename from paddle/function/RowConvOp.h
rename to paddle/legacy/function/RowConvOp.h
diff --git a/paddle/legacy/function/RowConvOpGpu.cu b/paddle/legacy/function/RowConvOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a6d2e4c7e38b12bcd448a85f9e74df226e6984af
--- /dev/null
+++ b/paddle/legacy/function/RowConvOpGpu.cu
@@ -0,0 +1,373 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/cuda/include/hl_base.h"
+#include "paddle/legacy/function/RowConvOp.h"
+
+namespace paddle {
+
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConv(real* y,
+                          const real* x,
+                          const real* w,
+                          const int* starts,
+                          const int height,
+                          const int width,
+                          const int numSeq,
+                          const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context; ++t) {
+        if ((start + j + t) < end) {
+          int xoff = off + t * width;
+          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+          sum += sw[t][tidx] * xVal;
+        }
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+__global__ void KeRowConv2(real* y,
+                           const real* x,
+                           const real* w,
+                           const int* starts,
+                           const int height,
+                           const int width,
+                           const int numSeq,
+                           const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      int off = (start + j) * width;
+      real sum = 0;
+      for (int t = 0; t < context && (start + j + t) < end; ++t) {
+        int xoff = off + t * width;
+        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wd * xd;
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+template <>
+void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,  // NOLINT
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+
+  real* y = out.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+
+  dim3 dimBlock(32, 32);
+  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+
+  if (contextLength <= 32) {
+    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+        y, x, w, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+        y, x, w, starts, height, width, numSeq, contextLength);
+  }
+  CHECK_SYNC("RowConv");
+}
+
+template <int BLOCK_H, int BLOCK_W, int CONTEXT>
+__global__ void KeRowConvBwWeight(real* dw,
+                                  const real* x,
+                                  const real* dy,
+                                  const int* starts,
+                                  const int height,
+                                  const int width,
+                                  const int numSeq,
+                                  const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sh_x[BLOCK_W][BLOCK_H];
+  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
+  __shared__ real sh_dw[CONTEXT][BLOCK_W];
+
+  if (tidy < context) {
+    sh_dw[tidy][tidx] = 0.0;
+  }
+  __syncthreads();
+
+  // NOTE(zcd): temporary solution
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, true);
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose
+      sh_x[tidx][tidy] =
+          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] =
+          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      if (tidy < (context - 1)) {
+        yoff = yoff - context + 1;
+        sh_dy[tidx][tidy] =
+            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+      }
+      __syncthreads();
+
+      for (int t = 0; t < context; t++) {
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
+        __syncthreads();
+        // warp size and blockDim.x is 32.
+
+        for (int offset = 16; offset > 0; offset /= 2)
+          val += __shfl_down_sync(mask, val, offset);
+
+        __syncthreads();
+        if (tidx == 0) {
+          sh_dw[t][tidy] += val;
+        }
+        __syncthreads();
+      }
+    }
+  }
+
+  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
+    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
+  }
+}
+
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwWeight2(real* dw,
+                                   const real* x,
+                                   const real* dy,
+                                   const int* starts,
+                                   const int height,
+                                   const int width,
+                                   const int numSeq,
+                                   const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sh_x[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+
+  // NOTE(zcd): temporary solution
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, true);
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+
+    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose
+      sh_x[tidx][tidy] =
+          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      __syncthreads();
+
+      for (int t = 0; t < context; t++) {
+        sh_dy[tidx][tidy] =
+            (xoff < width && (yoff - t) >= start && yoff - t < end)
+                ? dy[(yoff - t) * width + xoff]
+                : 0.0;
+        __syncthreads();
+
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
+        __syncthreads();
+        // warp size and blockDim.x is 32.
+        for (int offset = 16; offset > 0; offset /= 2)
+          val += __shfl_down_sync(mask, val, offset);
+
+        __syncthreads();
+
+        if (tidx == 0 && (gidx + tidy) < width) {
+          dw[t * width + gidx + tidy] += val;
+        }
+      }
+    }
+  }
+}
+
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwData(real* dx,
+                                const real* w,
+                                const real* dy,
+                                const int* starts,
+                                const int height,
+                                const int width,
+                                const int numSeq,
+                                const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        sum += sw[t][tidx] * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+__global__ void KeRowConvBwData2(real* dx,
+                                 const real* w,
+                                 const real* dy,
+                                 const int* starts,
+                                 const int height,
+                                 const int width,
+                                 const int numSeq,
+                                 const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wVal * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+template <>
+void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
+                                  const GpuMatrix& in,
+                                  const GpuMatrix& filter,
+                                  GpuMatrix& inG,      // NOLINT
+                                  GpuMatrix& filterG,  // NOLINT
+                                  const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+
+  const real* dy = outG.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+
+  if (filterG) {
+    dim3 dimBlock(32, 32);
+    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+    real* dw = filterG.getData();
+    if (contextLength <= 32) {
+      KeRowConvBwWeight<32, 32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+          dw, x, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+          dw, x, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+
+  if (inG) {
+    real* dx = inG.getData();
+    dim3 dimBlock2(32, 32);
+    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
+    if (contextLength <= 64) {
+      KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
+          dx, w, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
+          dx, w, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+
+  CHECK_SYNC("RowConvGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/legacy/function/RowConvOpTest.cpp
similarity index 100%
rename from paddle/function/RowConvOpTest.cpp
rename to paddle/legacy/function/RowConvOpTest.cpp
diff --git a/paddle/legacy/function/ScaleSubRegionOp.cpp b/paddle/legacy/function/ScaleSubRegionOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..03a422a740dca4499532cdb1bdfbf3d3ab272a9a
--- /dev/null
+++ b/paddle/legacy/function/ScaleSubRegionOp.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "paddle/legacy/function/TensorShape.h"
+
+namespace paddle {
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
+                                     const real* inputs,
+                                     const real* indices,
+                                     const TensorShape shape,
+                                     const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
+
+  for (int n = 0; n < number; ++n) {
+    // indices start from 1
+    int offset = n * 6;
+    for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
+      for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
+        for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          outputs[idx] *= value;
+        }
+      }
+    }
+  }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+                                         real* outGrad,
+                                         const real* indices,
+                                         const TensorShape shape,
+                                         const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  for (int n = 0; n < number; ++n) {
+    for (int c = 0; c < channel; ++c) {
+      for (int h = 0; h < height; ++h) {
+        for (int w = 0; w < width; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          int offset = n * 6;
+          if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+              h >= (indices[offset + 2] - 1) &&
+              h <= (indices[offset + 3] - 1) &&
+              w >= (indices[offset + 4] - 1) &&
+              w <= (indices[offset + 5] - 1)) {
+            outGrad[idx] += inGrad[idx] * value;
+          } else {
+            outGrad[idx] += inGrad[idx];
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief For each instance, ScaleSubRegion can be used to multiply a value to
+ *        a specified sub continuous region. By providing start index and end
+ *        index for C/H/W, you can specify the location and shape of the region.
+ *
+ * Argument in this Function:
+ * \param inputs    A 4-D tensor with shape [N, C, H, W], only one input.
+ * \param indices   A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs   A 4-D tensor with same shape as inputs, output value.
+ */
+template <DeviceType Device>
+class ScaleSubRegionFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegion<Device>(outputs[0].data<real>(),
+                           inputs[0].data<real>(),
+                           inputs[1].data<real>(),
+                           shape,
+                           conf_);
+  }
+
+ private:
+  FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of ScaleSubRegion Function.
+ *
+ * Argument in this Function:
+ * \param inputs  A 4-D tensor with shape [N, C, H, W], output gradient.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
+ */
+
+template <DeviceType Device>
+class ScaleSubRegionGradFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
+                               outputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               shape,
+                               conf_);
+  }
+
+ private:
+  FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/legacy/function/ScaleSubRegionOp.h
similarity index 100%
rename from paddle/function/ScaleSubRegionOp.h
rename to paddle/legacy/function/ScaleSubRegionOp.h
diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/legacy/function/ScaleSubRegionOpGpu.cu
similarity index 100%
rename from paddle/function/ScaleSubRegionOpGpu.cu
rename to paddle/legacy/function/ScaleSubRegionOpGpu.cu
diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/legacy/function/ScaleSubRegionOpTest.cpp
similarity index 100%
rename from paddle/function/ScaleSubRegionOpTest.cpp
rename to paddle/legacy/function/ScaleSubRegionOpTest.cpp
diff --git a/paddle/legacy/function/SwitchOp.cpp b/paddle/legacy/function/SwitchOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6accd18039180aa521c18193e576d22e11f5a97
--- /dev/null
+++ b/paddle/legacy/function/SwitchOp.cpp
@@ -0,0 +1,140 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SwitchOp.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inC,
+                                const int inH,
+                                const int inW,
+                                const int argType) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < inC; ++c) {
+      for (int h = 0; h < inH; ++h) {
+        for (int w = 0; w < inW; ++w) {
+          if (argType == ADD_TO) {
+            outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++);
+          } else {
+            outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <>
+void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inH,
+                                const int inW,
+                                const int inC,
+                                const int argType) {
+  for (int n = 0; n < num; ++n) {
+    for (int h = 0; h < inH; ++h) {
+      for (int w = 0; w < inW; ++w) {
+        for (int c = 0; c < inC; ++c) {
+          if (argType == ADD_TO) {
+            outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++);
+          } else {
+            outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief  Switch dimension order of image input.
+ *         The input and output is a 4D tensor. Switch order
+ *         'batch_size,channels, height, width' to
+ *         order 'batch_size, height, width, channels'.
+ *
+ * Argument in this Function:
+ * \param inputs  input data with order 'batch_size,channels, height, width'.
+ * \param outputs output data with order 'batch_size, height, width, channels'.
+ */
+template <DeviceType Device>
+class NCHW2NHWCFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+
+    size_t num = inputs[0].shape()[0];
+    size_t inC = inputs[0].shape()[1];
+    size_t inH = inputs[0].shape()[2];
+    size_t inW = inputs[0].shape()[3];
+    NCHW2NHWC<Device>(outputs[0].data<real>(),
+                      inputs[0].data<real>(),
+                      num,
+                      inC,
+                      inH,
+                      inW,
+                      outputs[0].getArgType());
+  }
+};
+
+/**
+ * \brief  Switch dimension order of image input.
+ *         The input and output is a 4D tensor. Switch order
+ *         'batch_size, height, width, channels' to
+ *         order 'batch_size, channels, height, width'.
+ *
+ * Argument in this Function:
+ * \param inputs  input data with order 'batch_size, height, width, channels'.
+ * \param outputs output data with order 'batch_size, channels, height, width'.
+ */
+template <DeviceType Device>
+class NHWC2NCHWFunc : public FunctionBase {
+ public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+
+    size_t num = inputs[0].shape()[0];
+    size_t inH = inputs[0].shape()[1];
+    size_t inW = inputs[0].shape()[2];
+    size_t inC = inputs[0].shape()[3];
+
+    NHWC2NCHW<Device>(outputs[0].data<real>(),
+                      inputs[0].data<real>(),
+                      num,
+                      inH,
+                      inW,
+                      inC,
+                      outputs[0].getArgType());
+  }
+};
+
+REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
+REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
+REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/SwitchOp.h b/paddle/legacy/function/SwitchOp.h
similarity index 100%
rename from paddle/function/SwitchOp.h
rename to paddle/legacy/function/SwitchOp.h
diff --git a/paddle/function/SwitchOpGpu.cu b/paddle/legacy/function/SwitchOpGpu.cu
similarity index 100%
rename from paddle/function/SwitchOpGpu.cu
rename to paddle/legacy/function/SwitchOpGpu.cu
diff --git a/paddle/function/SwitchOpTest.cpp b/paddle/legacy/function/SwitchOpTest.cpp
similarity index 100%
rename from paddle/function/SwitchOpTest.cpp
rename to paddle/legacy/function/SwitchOpTest.cpp
diff --git a/paddle/legacy/function/TensorShape.h b/paddle/legacy/function/TensorShape.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4d1eae3960c333a2a7dc6099ae7a68677fdcd5f
--- /dev/null
+++ b/paddle/legacy/function/TensorShape.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+namespace paddle {
+
+/**
+ * TensorShape used to represent shape of normal tensor.
+ */
+class TensorShape {
+ public:
+  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
+
+  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
+
+  TensorShape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    initDims(ndims_);
+    dims_.assign(dims);
+    numElements();
+  };
+
+  TensorShape(const TensorShape& t)
+      : ndims_(t.ndims_), nelements_(t.nelements_) {
+    initDims(ndims_);
+    dims_.assign(t.dims_.begin(), t.dims_.end());
+  };
+
+  // get the size of specified dimension
+  size_t operator[](size_t dim) const {
+    CHECK_GE(dim, (size_t)0);
+    CHECK_LT(dim, ndims_);
+    return dims_[dim];
+  }
+
+  // set the size of specified dimension
+  void setDim(size_t dim, size_t size) {
+    CHECK_GE(dim, (size_t)0);
+    CHECK_LT(dim, ndims_);
+    dims_[dim] = size;
+    numElements();
+  }
+
+  void reshape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    if (ndims_ > kMinDims) {
+      dims_.resize(ndims_);
+    }
+    dims_.assign(dims);
+    numElements();
+  }
+
+  // number of dimensions of the tensor
+  size_t ndims() const { return ndims_; }
+
+  size_t getElements() const { return nelements_; }
+
+  bool operator==(const TensorShape& t) const {
+    if (ndims() != t.ndims()) return false;
+    for (size_t i = 0; i < ndims(); i++) {
+      if (dims_[i] != t.dims_[i]) return false;
+    }
+
+    return true;
+  }
+
+  bool operator!=(const TensorShape& t) const { return !(*this == t); }
+
+ private:
+  // compute number of elements
+  void numElements() {
+    nelements_ = 1;
+    for (size_t n = 0; n < ndims_; n++) {
+      nelements_ *= dims_[n];
+    }
+  }
+
+  // init dims_
+  void initDims(size_t ndims) {
+    size_t count = ndims < kMinDims ? kMinDims : ndims;
+    dims_.assign(count, 1);
+  }
+
+  // number of dimensions
+  // ndims_ may be not equeal dims_.size()
+  size_t ndims_;
+  // number of elements
+  size_t nelements_;
+  std::vector<size_t> dims_;
+  static const size_t kMinDims = 4;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/legacy/function/TensorShapeTest.cpp
similarity index 100%
rename from paddle/function/TensorShapeTest.cpp
rename to paddle/legacy/function/TensorShapeTest.cpp
diff --git a/paddle/legacy/function/TensorType.h b/paddle/legacy/function/TensorType.h
new file mode 100644
index 0000000000000000000000000000000000000000..13994821be7ba7264f43d8550e6800cdc5b93875
--- /dev/null
+++ b/paddle/legacy/function/TensorType.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+enum ValueType {
+  VALUE_TYPE_INT32 = 0,
+  VALUE_TYPE_FLOAT = 1,
+  VALUE_TYPE_DOUBLE = 2,
+  VALUE_TYPE_BYTE = 3
+};
+
+enum DeviceType {
+  DEVICE_TYPE_UNSPECIFIED = 0,
+  DEVICE_TYPE_CPU = 1,
+  DEVICE_TYPE_GPU = 2
+};
+
+enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 };
+
+enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 };
+
+inline int sizeOfValuType(ValueType valueType) {
+  if (valueType == VALUE_TYPE_INT32) {
+    return 4;
+  } else if (valueType == VALUE_TYPE_FLOAT) {
+    return 4;
+  } else if (valueType == VALUE_TYPE_DOUBLE) {
+    return 8;
+  } else {
+    LOG(FATAL) << "Unknown type: " << valueType;
+    return 0;
+  }
+}
+
+template <typename T>
+struct DataType;
+
+template <>
+struct DataType<float> {
+  static const ValueType value = VALUE_TYPE_FLOAT;
+};
+
+template <>
+struct DataType<double> {
+  static const ValueType value = VALUE_TYPE_DOUBLE;
+};
+
+template <>
+struct DataType<int> {
+  static const ValueType value = VALUE_TYPE_INT32;
+};
+
+namespace detail {
+
+template <typename VType, DeviceType Device>
+struct MatrixT;
+
+template <>
+struct MatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuMatrix;
+};
+
+template <>
+struct MatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuMatrix;
+};
+
+template <>
+struct MatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct MatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+
+template <typename VType, DeviceType Device>
+struct SparseMatrixT;
+
+template <>
+struct SparseMatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct SparseMatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+
+template <typename VType, DeviceType Device>
+struct VectorT;
+
+template <>
+struct VectorT<real, DEVICE_TYPE_CPU> {
+  using type = CpuVector;
+};
+
+template <>
+struct VectorT<real, DEVICE_TYPE_GPU> {
+  using type = GpuVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
+
+}  // namespace detail
+
+template <typename VType, DeviceType DType>
+struct Tensor {
+  typedef typename detail::VectorT<VType, DType>::type Vector;
+  typedef typename detail::MatrixT<VType, DType>::type Matrix;
+  typedef typename detail::SparseMatrixT<VType, DType>::type SparseMatrix;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/legacy/function/TensorTypeTest.cpp
similarity index 100%
rename from paddle/function/TensorTypeTest.cpp
rename to paddle/legacy/function/TensorTypeTest.cpp
diff --git a/paddle/legacy/function/neon/NeonDepthwiseConv.cpp b/paddle/legacy/function/neon/NeonDepthwiseConv.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6179635a9fec4afecf53fabdc6a818588b54c808
--- /dev/null
+++ b/paddle/legacy/function/neon/NeonDepthwiseConv.cpp
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NeonDepthwiseConv.h"
+#include "paddle/legacy/function/ConvOp.h"
+
+namespace paddle {
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+template <DeviceType Device>
+class NeonDepthwiseConvFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    int batchSize = input[0];
+    int inputChannels = input[1];
+    int inputHeight = input[2];
+    int inputWidth = input[3];
+    int filterHeight = getFilterHeight(filter);
+    int filterWidth = getFilterWidth(filter);
+    int outputChannels = output[1];
+    int outputHeight = output[2];
+    int outputWidth = output[3];
+    int filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(static_cast<size_t>(inputChannels), groups_);
+
+    // only support strideH() == strideW() and filterHeight == filterWidth.
+    CHECK_EQ(strideH(), strideW());
+    CHECK_EQ(filterHeight, filterWidth);
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    // padding the input
+    float* inputPadding = inputData;
+    int padInputHeight = inputHeight + 2 * paddingH();
+    int padInputWidth = inputWidth + 2 * paddingW();
+    int newSize =
+        batchSize * (inputChannels + 1) * padInputHeight * padInputWidth;
+
+    resizeBuffer<Device>(newSize);
+    inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+    neon::Padding<float>::run(inputData,
+                              inputPadding,
+                              batchSize * inputChannels,
+                              inputHeight,
+                              inputWidth,
+                              padInputHeight,
+                              padInputWidth);
+
+    std::function<void(
+        const float*, const float*, int, int, int, int, int, int, float*)>
+        DepthWiseConv;
+
+    if (filterWidth == 3 && strideW() == 1) {
+      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
+    } else if (filterWidth == 3 && strideW() == 2) {
+      DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run;
+    } else if (filterWidth == 4 && strideW() == 1) {
+      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
+    } else if (filterWidth == 4 && strideW() == 2) {
+      DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run;
+    } else {
+      LOG(FATAL) << "Not supported";
+    }
+
+    for (int i = 0; i < batchSize; i++) {
+      DepthWiseConv(inputPadding,
+                    filterData,
+                    padInputHeight,
+                    padInputWidth,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    filterMultiplier,
+                    outputData);
+      inputPadding += inputChannels * padInputHeight * padInputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifndef PADDLE_TYPE_DOUBLE
+REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction);
+#endif
+
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/neon/NeonDepthwiseConv.h b/paddle/legacy/function/neon/NeonDepthwiseConv.h
similarity index 100%
rename from paddle/function/neon/NeonDepthwiseConv.h
rename to paddle/legacy/function/neon/NeonDepthwiseConv.h
diff --git a/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..feb77e1ff9f591d63dbf86a05313d65025f7c65d
--- /dev/null
+++ b/paddle/legacy/function/neon/NeonDepthwiseConvTranspose.cpp
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NeonDepthwiseConv.h"
+#include "paddle/legacy/function/ConvOp.h"
+
+namespace paddle {
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+template <DeviceType Device>
+class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    int batchSize = input[0];
+    int inputChannels = input[1];
+    int inputHeight = input[2];
+    int inputWidth = input[3];
+    int filterHeight = getFilterHeight(filter);
+    int filterWidth = getFilterWidth(filter);
+    int outputChannels = output[1];
+    int outputHeight = output[2];
+    int outputWidth = output[3];
+    int filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    // only support strideH() == strideW() and filterHeight == filterWidth.
+    CHECK_EQ(strideH(), strideW());
+    CHECK_EQ(paddingH(), paddingW());
+    CHECK_EQ(filterHeight, filterWidth);
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    // padding the input, input -> inputPadding
+    float* inputPadding = inputData;
+    int padInputHeight =
+        (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
+    int padInputWidth =
+        (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
+
+    if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
+      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
+      resizeBuffer<Device>(newSize);
+      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+      if (strideH() == 1) {
+        neon::Padding<float>::run(inputData,
+                                  inputPadding,
+                                  batchSize * inputChannels,
+                                  inputHeight,
+                                  inputWidth,
+                                  padInputHeight,
+                                  padInputWidth);
+      } else if (strideH() == 2) {
+        neon::StridePadding::run(inputData,
+                                 inputPadding,
+                                 batchSize * inputChannels,
+                                 inputHeight,
+                                 inputWidth,
+                                 padInputHeight,
+                                 padInputWidth);
+      } else {
+        LOG(FATAL) << "Not supported";
+      }
+    }
+
+    std::function<void(
+        const float*, const float*, int, int, int, int, int, int, float*)>
+        DepthWiseConv;
+
+    if (filterWidth == 3) {
+      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
+    } else if (filterWidth == 4) {
+      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
+    } else {
+      LOG(FATAL) << "Not supported";
+    }
+
+    for (int i = 0; i < batchSize; i++) {
+      DepthWiseConv(inputPadding,
+                    filterData,
+                    padInputHeight,
+                    padInputWidth,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    filterMultiplier,
+                    outputData);
+      inputPadding += inputChannels * padInputHeight * padInputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifndef PADDLE_TYPE_DOUBLE
+
+REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
+                    CPU,
+                    NeonDepthwiseConvTransposeFunction);
+
+#endif
+
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/neon/neon_util.h b/paddle/legacy/function/neon/neon_util.h
similarity index 100%
rename from paddle/function/neon/neon_util.h
rename to paddle/legacy/function/neon/neon_util.h
diff --git a/paddle/legacy/function/nnpack/NNPACKConvOp.cpp b/paddle/legacy/function/nnpack/NNPACKConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..81c832e7747f8e75d322891476e08dacc435f5d4
--- /dev/null
+++ b/paddle/legacy/function/nnpack/NNPACKConvOp.cpp
@@ -0,0 +1,247 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "nnpack.h"
+#include "paddle/legacy/function/ConvOp.h"
+
+DEFINE_bool(nnpack_allocate_outside,
+            true,
+            "Allocate and free workspace memory outside the NNPACK interface.");
+DEFINE_int32(nnpack_num_threads,
+             0,
+             "The number of nnpack threads"
+             "default: 0; 0 to disable threadpool.");
+
+namespace paddle {
+
+nnp_convolution_algorithm get_nnp_convolution_algorithm(
+    const std::string& algorithm) {
+  if (algorithm == "auto") {
+    return nnp_convolution_algorithm_auto;
+  } else if (algorithm == "ft8x8") {
+    return nnp_convolution_algorithm_ft8x8;
+  } else if (algorithm == "ft16x16") {
+    return nnp_convolution_algorithm_ft16x16;
+  } else if (algorithm == "wt8x8") {
+    return nnp_convolution_algorithm_wt8x8;
+  } else if (algorithm == "implicit-gemm") {
+    return nnp_convolution_algorithm_implicit_gemm;
+  } else if (algorithm == "direct") {
+    return nnp_convolution_algorithm_direct;
+  } else {
+    return nnp_convolution_algorithm_auto;
+  }
+}
+
+template <DeviceType Device>
+class NNPACKConvFunction : public ConvFunctionBase {
+ public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+    algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
+    transform_strategy_ = nnp_convolution_transform_strategy_compute;
+    nnp_status status = nnp_initialize();
+    CHECK_EQ(status, nnp_status_success);
+    workspaceBuffer_ = nullptr;
+    workspaceSize_ = 0;
+
+    create_nnpack_threadpool();
+  }
+
+  ~NNPACKConvFunction() {
+    if (workspaceBuffer_) {
+      free(workspaceBuffer_);
+    }
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    check(inputs, outputs);
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
+    nnp_padding padding = {.top = (size_t)paddingH(),
+                           .right = (size_t)paddingW(),
+                           .bottom = (size_t)paddingH(),
+                           .left = (size_t)paddingW()};
+    nnp_size kernelSize = {.width = filterWidth, .height = filterHeight};
+    nnp_size outputSubsampling = {.width = (size_t)strideW(),
+                                  .height = (size_t)strideH()};
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    void* bufferPtr = nullptr;
+    size_t* sizePtr = nullptr;
+    size_t needSize;
+    if (FLAGS_nnpack_allocate_outside) {
+      if (batchSize == 1) {
+        nnp_status status = nnp_convolution_inference(algorithm_,
+                                                      transform_strategy_,
+                                                      inputChannels,
+                                                      outputChannels,
+                                                      inputSize,
+                                                      padding,
+                                                      kernelSize,
+                                                      outputSubsampling,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr,
+                                                      &needSize,
+                                                      nnp_activation_identity,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      } else {
+        // only supports stride = 1
+        CHECK_EQ(strideH(), 1);
+        CHECK_EQ(strideW(), 1);
+        nnp_status status = nnp_convolution_output(algorithm_,
+                                                   batchSize,
+                                                   inputChannels,
+                                                   outputChannels,
+                                                   inputSize,
+                                                   padding,
+                                                   kernelSize,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr,
+                                                   &needSize,
+                                                   nnp_activation_identity,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      }
+
+      VLOG(3) << "workspace size is " << needSize;
+      if (needSize > workspaceSize_) {
+        workspaceSize_ = needSize;
+        if (workspaceBuffer_) {
+          free(workspaceBuffer_);
+        } else {
+          posix_memalign(&workspaceBuffer_, 64, needSize);
+        }
+      }
+
+      if (needSize) {
+        bufferPtr = workspaceBuffer_;
+        sizePtr = &needSize;
+      }
+    }
+
+    size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth;
+    size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    if (batchSize == 1) {
+      for (size_t g = 0; g < groups_; g++) {
+        nnp_status status =
+            nnp_convolution_inference(algorithm_,
+                                      transform_strategy_,
+                                      inputChannels / groups_,
+                                      outputChannels / groups_,
+                                      inputSize,
+                                      padding,
+                                      kernelSize,
+                                      outputSubsampling,
+                                      inputData + inputOffset * g,
+                                      filterData + filterOffset * g,
+                                      nullptr, /* bias */
+                                      outputData + outputOffset * g,
+                                      bufferPtr,
+                                      sizePtr,
+                                      nnp_activation_identity,
+                                      nullptr,
+                                      threadpool_, /* threadpool */
+                                      nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      }
+    } else {
+      // only supports stride = 1
+      CHECK_EQ(strideH(), 1);
+      CHECK_EQ(strideW(), 1);
+
+      // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1.
+      CHECK_EQ(groups_, static_cast<size_t>(1));
+      nnp_status status = nnp_convolution_output(algorithm_,
+                                                 batchSize,
+                                                 inputChannels,
+                                                 outputChannels,
+                                                 inputSize,
+                                                 padding,
+                                                 kernelSize,
+                                                 inputData,
+                                                 filterData,
+                                                 nullptr, /* bias */
+                                                 outputData,
+                                                 bufferPtr,
+                                                 sizePtr,
+                                                 nnp_activation_identity,
+                                                 nullptr,
+                                                 threadpool_, /* threadpool */
+                                                 nullptr);
+      CHECK_EQ(status, nnp_status_success);
+    }
+  }
+
+  static void create_nnpack_threadpool() {
+    if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {
+      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+      VLOG(3) << "Number of threads "
+              << pthreadpool_get_threads_count(threadpool_);
+    }
+  }
+
+ private:
+  nnp_convolution_algorithm algorithm_;
+  nnp_convolution_transform_strategy transform_strategy_;
+  void* workspaceBuffer_;
+  size_t workspaceSize_;
+  static pthreadpool_t threadpool_;
+};
+
+template <DeviceType Device>
+pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
+
+REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
+
+}  // namespace paddle
diff --git a/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp b/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a2db83f5a36310ca6f173d6e6501118b34060761
--- /dev/null
+++ b/paddle/legacy/function/nnpack/NNPACKConvOpTest.cpp
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/function/ConvOpTest.h"
+
+namespace paddle {
+
+TEST(NNPACK, Forward) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NNPACKConv-CPU", forward);
+}
+
+TEST(NNPACK, Depthwise) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NNPACKConv-CPU", forward);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/CMakeLists.txt b/paddle/legacy/gserver/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6dc877dd90ee2ae3d99406299a9244eb3e3d7b53
--- /dev/null
+++ b/paddle/legacy/gserver/CMakeLists.txt
@@ -0,0 +1,152 @@
+# Gserver package contains:
+#   * Layers
+#   * Activations
+#   * DataProviders
+#   * Evaluators
+#   * GradientMachines(NeuralNetwork)
+file(GLOB_RECURSE GSERVER_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
+file(GLOB_RECURSE GSERVER_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cpp")
+set(GSERVER_SOURCES
+    layers/LstmCompute.cu
+    layers/GruCompute.cu
+    ${GSERVER_SOURCES})
+
+macro(filter_test VAR_NAME)
+    set(tmp)
+    foreach(p IN LISTS ${VAR_NAME})
+        if(NOT ${p} MATCHES ".*tests/.*")
+             set(tmp ${p} ${tmp})
+        endif()
+    endforeach()
+    set(${VAR_NAME} ${tmp})
+endmacro()
+
+filter_test(GSERVER_HEADER)
+filter_test(GSERVER_SOURCES)
+
+if(NOT WITH_MKLDNN)
+    file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h")
+    file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp")
+    list(REMOVE_ITEM GSERVER_HEADER ${DNN_HEADER})
+    list(REMOVE_ITEM GSERVER_SOURCES ${DNN_SOURCES})
+    message(STATUS "Skip compiling with MKLDNNLayers and MKLDNNActivations")
+else()
+    message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
+endif()
+
+if(NOT WITH_MKLML)
+    file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h")
+    file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp")
+    list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER})
+    list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES})
+    message(STATUS "Skip compiling with MKLPackedLayers")
+else()
+    message(STATUS "Compile with MKLPackedLayers")
+endif()
+
+if(NOT WITH_GPU)
+    list(REMOVE_ITEM GSERVER_HEADER
+        layers/CudnnConvBaseLayer.h
+        layers/CudnnConvLayer.h
+        layers/CudnnConvTransLayer.h
+        layers/CudnnPoolLayer.h
+        layers/CudnnBatchNormLayer.h)
+
+    list(REMOVE_ITEM GSERVER_SOURCES
+        layers/CudnnConvBaseLayer.cpp
+        layers/CudnnConvLayer.cpp
+        layers/CudnnConvTransLayer.cpp
+        layers/CudnnPoolLayer.cpp
+        layers/CudnnBatchNormLayer.cpp)
+    compile_cu_as_cpp(layers/LstmCompute.cu)
+    compile_cu_as_cpp(layers/GruCompute.cu)
+endif()
+
+if(NOT WITH_PYTHON)
+    list(REMOVE_ITEM GSERVER_SOURCES
+            dataproviders/PyDataProvider.cpp)
+    
+    list(REMOVE_ITEM GSERVER_HEADER
+            dataproviders/PyDataProvider.h)
+endif()
+
+if(MOBILE_INFERENCE)
+    # Remove evaluators
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/ValidationLayer.cpp
+         evaluators/Evaluator.cpp
+         evaluators/DetectionMAPEvaluator.cpp
+         evaluators/CTCErrorEvaluator.cpp
+         evaluators/ChunkEvaluator.cpp)
+
+    # Remove dataproviders
+    list(REMOVE_ITEM GSERVER_SOURCES
+         dataproviders/DataProvider.cpp
+         dataproviders/MultiDataProvider.cpp
+         dataproviders/PyDataProvider2.cpp
+         dataproviders/PyDataProvider.cpp)
+
+    # Remove useless gradientmachines
+    list(REMOVE_ITEM GSERVER_SOURCES
+         gradientmachines/MultiNetwork.cpp
+         gradientmachines/RecurrentGradientMachine.cpp
+         gradientmachines/ParallelNeuralNetwork.cpp
+         gradientmachines/GradientMachineMode.cpp
+         gradientmachines/MultiGradientMachine.cpp)
+
+    # Remove layers that used in training
+    list(REMOVE_ITEM GSERVER_SOURCES
+    	 layers/RecurrentLayerGroup.cpp
+         layers/CostLayer.cpp
+         layers/MultiBoxLossLayer.cpp
+         layers/WarpCTCLayer.cpp
+         layers/CTCLayer.cpp
+         layers/LinearChainCTC.cpp
+         layers/PrintLayer.cpp)
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/OuterProdLayer.cpp
+         layers/SumToOneNormLayer.cpp
+         layers/ConvShiftLayer.cpp
+         layers/InterpolationLayer.cpp
+         layers/AgentLayer.cpp
+         layers/DotMulOperator.cpp
+         layers/GruStepLayer.cpp
+         layers/LstmStepLayer.cpp
+         layers/ConvexCombinationLayer.cpp
+         layers/Conv3DLayer.cpp
+         layers/DeConv3DLayer.cpp
+         layers/CropLayer.cpp
+         layers/CrossEntropyOverBeam.cpp
+         layers/DataNormLayer.cpp
+         layers/FeatureMapExpandLayer.cpp
+         layers/HierarchicalSigmoidLayer.cpp
+         layers/MultinomialSampler.cpp
+         layers/NCELayer.cpp
+         layers/KmaxSeqScoreLayer.cpp
+         layers/MDLstmLayer.cpp
+         layers/MultiplexLayer.cpp
+         layers/PadLayer.cpp
+         layers/Pool3DLayer.cpp
+         layers/ResizeLayer.cpp
+         layers/RotateLayer.cpp
+         layers/RowConvLayer.cpp
+         layers/RowL2NormLayer.cpp
+         layers/SamplingIdLayer.cpp
+         layers/ScaleShiftLayer.cpp
+         layers/SelectiveFullyConnectedLayer.cpp
+         layers/SpatialPyramidPoolLayer.cpp
+         layers/BilinearInterpLayer.cpp
+         layers/ClipLayer.cpp)
+endif()
+
+if(WITH_GPU)
+    cuda_add_library(paddle_gserver ${GSERVER_SOURCES})
+else()
+    add_library(paddle_gserver STATIC
+        ${GSERVER_SOURCES})
+endif()
+
+add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
+if(WITH_TESTING)
+    add_subdirectory(tests)
+endif()
diff --git a/paddle/legacy/gserver/activations/ActivationFunction.cpp b/paddle/legacy/gserver/activations/ActivationFunction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae07c7e6d7fd9fe28a00dd209ae834cd28a327f7
--- /dev/null
+++ b/paddle/legacy/gserver/activations/ActivationFunction.cpp
@@ -0,0 +1,509 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ActivationFunction.h"
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <thread>
+#include <type_traits>
+#include "paddle/legacy/parameter/Argument.h"
+#include "paddle/legacy/utils/ClassRegistrar.h"
+#include "paddle/legacy/utils/Logging.h"
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "MKLDNNActivation.h"
+#endif
+
+namespace paddle {
+
+static ClassRegistrar<ActivationFunction> gActivationRegistrar;
+/**
+ * @def ACTIVATION_CLASS_NAME
+ * @brief Macro for getting derived activation class name
+ * @note ACTIVATION_CLASS_NAME(softmax) softmax_;
+ * means softmaxActivation softmax_;
+ */
+#define ACTIVATION_CLASS_NAME(ACTIVATION_NAME) ACTIVATION_NAME##Activation
+/**
+ * @def BEGIN_DEFINE_ACTIVATION
+ * @brief Macro for defining a devried activation class
+ */
+#define BEGIN_DEFINE_ACTIVATION(ACTIVATION_NAME)                             \
+  class ACTIVATION_CLASS_NAME(ACTIVATION_NAME) : public ActivationFunction { \
+   private:                                                                  \
+    static const std::string name;                                           \
+                                                                             \
+   public:                                                                   \
+    const std::string& getName() const { return name; }
+/**
+ * @def END_DEFINE_ACTIVATION
+ * @brief Macro for registering a derived activation class
+ */
+#define END_DEFINE_ACTIVATION(ACTIVATION_NAME)                     \
+  }                                                                \
+  ;                                                                \
+  const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
+      #ACTIVATION_NAME;                                            \
+  static InitFunction __reg_activation__##ACTIVATION_NAME([] {     \
+    gActivationRegistrar                                           \
+        .registerClass<ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(    \
+            #ACTIVATION_NAME);                                     \
+  });
+
+/**
+ * @brief The IdentityActivation class
+ *
+ * Do nothing when forward/backward.
+ */
+class IdentityActivation : public ActivationFunction {
+ public:
+  static const std::string name;
+  Error __must_check forward(Argument& act) {
+    (void)act;
+    return Error();
+  }
+  Error __must_check backward(Argument& act) {
+    (void)act;
+    return Error();
+  }
+  const std::string& getName() const { return name; }
+};
+const std::string IdentityActivation::name = "";
+static InitFunction __reg_activation__identity([] {
+  gActivationRegistrar.registerClass<IdentityActivation>("");
+  gActivationRegistrar.registerClass<IdentityActivation>("linear");
+});
+
+/**
+ * @brief Sigmoid Activation
+ * \f[
+ * f(z) = \frac{1}{1+exp(-z)}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(sigmoid)
+Error __must_check forward(Argument& act) {
+  act.value->sigmoid(*act.value);
+  return Error();
+}
+Error __must_check backward(Argument& act) {
+  act.grad->sigmoidDerivative(*act.value);
+  return Error();
+}
+END_DEFINE_ACTIVATION(sigmoid)
+
+/**
+ * @brief Softmax Activation
+ * \f[
+ * P(y=j|x) = \frac{e^{x^Tw_j}}{\sum^K_{k=1}e^{x^Tw_k}}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softmax)
+private:
+MatrixPtr sftMaxSum_;
+MatrixPtr sftMaxDot_;
+
+public:
+Error __must_check forward(Argument& act) {
+  act.value->softmax(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  MatrixPtr outputV = act.value;
+  MatrixPtr outputG = act.grad;
+
+  if (outputG->useGpu()) {
+    outputG->softmaxBackward(*outputV);
+  } else {
+    SetDevice device(act.deviceId);
+    Matrix::resizeOrCreate(sftMaxDot_,
+                           outputG->getHeight(),
+                           outputG->getWidth(),
+                           /* trans */ false,
+                           useGpu(act.deviceId));
+    Matrix::resizeOrCreate(sftMaxSum_,
+                           outputG->getHeight(),
+                           1,
+                           /* trans */ false,
+                           useGpu(act.deviceId));
+
+    sftMaxDot_->dotMul(*outputG, *outputV);
+    sftMaxSum_->colMerge(*sftMaxDot_);
+
+    act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
+  }
+  return Error();
+}
+END_DEFINE_ACTIVATION(softmax)
+
+/**
+ * @brief Sequence_softmax Activation
+ * @note Softmax on all frames of one sequence.
+ * Width of frame must be one.
+ */
+BEGIN_DEFINE_ACTIVATION(sequence_softmax)
+private:
+ACTIVATION_CLASS_NAME(softmax) softmax_;
+Argument argument_;
+
+public:
+Error __must_check forward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
+
+  if (!argument_.value) {
+    argument_.value = Matrix::create(nullptr,
+                                     /* height= */ 1,
+                                     1,
+                                     /* trans= */ false,
+                                     useGpu(act.deviceId));
+    argument_.grad = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    1,
+                                    /* trans= */ false,
+                                    useGpu(act.deviceId));
+  }
+
+  auto starts =
+      act.hasSubseq()
+          ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId))
+          : act.sequenceStartPositions->getVector(useGpu(act.deviceId));
+  act.value->sequenceSoftmax(*act.value, *starts);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
+
+  size_t numSequences =
+      act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences();
+  const int* starts = act.getCpuStartPositions();
+
+  for (size_t i = 0; i < numSequences; ++i) {
+    // TODO(Dangqingqing) optimization for GPU
+    size_t offset = starts[i];
+    size_t size = starts[i + 1] - starts[i];
+    argument_.value->setData(act.value->getData() + offset, 1UL, size);
+    argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
+
+    Error err = softmax_.backward(argument_);
+    if (!err.isOK()) return err;
+  }
+  return Error();
+}
+END_DEFINE_ACTIVATION(sequence_softmax)
+
+/*
+ * @brief SoftSign Activation.
+ * \f[
+ * f(z) = \frac{z}{1 + |z|}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softsign)
+private:
+MatrixPtr denominator_;
+
+Error __must_check forward(Argument& act) {
+  size_t height = act.value->getHeight();
+  size_t width = act.value->getWidth();
+  Matrix::resizeOrCreate(
+      denominator_, height, width, false, useGpu(act.deviceId));
+  denominator_->assign(*act.value);
+  denominator_->abs2();
+  denominator_->add(1.);
+
+  act.value->dotDiv(*act.value, *denominator_);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  denominator_->square2();
+  denominator_->scalarDiv(*denominator_, 1.);
+  act.grad->dotMul(*act.grad, *denominator_);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softsign)
+
+/**
+ * @brief Relu Activation.
+ * forward. y = max(0, z)
+ *
+ * derivative of relu is:
+ *
+ *    1 if z > 0
+ *
+ *    0 otherwise.
+ */
+BEGIN_DEFINE_ACTIVATION(relu)
+Error __must_check forward(Argument& act) {
+  act.value->relu(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->reluDerivative(*act.value);
+  return Error();
+}
+END_DEFINE_ACTIVATION(relu)
+
+/**
+ * @brief BRelu Activation.
+ *
+ * forward. y = min(24, max(0, z))
+ *
+ * derivative of brelu is:
+ *
+ *    1 if 0 < z < 24
+ *
+ *    0 otherwise.
+ *
+ * TODO(yuyang18): Remove magic number 24 or make it configuable.
+ */
+BEGIN_DEFINE_ACTIVATION(brelu)
+Error __must_check forward(Argument& act) {
+  act.value->brelu(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->breluDerivative(*act.value);
+  return Error();
+}
+END_DEFINE_ACTIVATION(brelu)
+
+/**
+ * @brief Tanh Activation.
+ * \f[
+ * f(z) = tanh(z)=\frac{e^z-e^{-z}}{e^z+e^{-z}}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(tanh)
+Error __must_check forward(Argument& act) {
+  act.value->tanh(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->tanhDerivative(*act.value);
+  return Error();
+}
+END_DEFINE_ACTIVATION(tanh)
+
+/**
+ * @brief Scaled Tanh Activation
+ * \f[
+ * f(z) = 1.7159 * tanh(2/3*z)
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(stanh)
+private:
+real a, b;
+
+public:
+ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
+Error __must_check forward(Argument& act) {
+  act.value->scaledTanh(*act.value, a, b);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->scaledTanhDerivative(*act.value, a, b);
+  return Error();
+}
+END_DEFINE_ACTIVATION(stanh)
+
+/**
+ * @brief Soft Relu Activation.
+ * \f[
+ * f(z) = ln(1+e^z)
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softrelu)
+Error __must_check forward(Argument& act) {
+  act.value->softrelu(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->softreluDerivative(*act.value);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softrelu)
+
+/**
+ * @brief Abs Activation.
+ * Forward: f(z) = abs(z)
+ *
+ * Derivative:
+ *
+ *     1   if z>0
+ *
+ *    -1   if z<0
+ *
+ *     0   if z=0
+ */
+BEGIN_DEFINE_ACTIVATION(abs)
+Error __must_check forward(Argument& act) {
+  SetDevice device(act.deviceId);
+  Matrix::resizeOrCreate(act.in,
+                         act.value->getHeight(),
+                         act.value->getWidth(),
+                         /* trans */ false,
+                         useGpu(act.deviceId));
+
+  act.in->copyFrom(*act.value);
+  act.value->abs2(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->absDerivative(*act.in);
+  return Error();
+}
+END_DEFINE_ACTIVATION(abs)
+
+/**
+ * @brief Square Activation.
+ * \f[
+ * f(z) = z^2.
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(square)
+Error __must_check forward(Argument& act) {
+  SetDevice device(act.deviceId);
+  Matrix::resizeOrCreate(act.in,
+                         act.value->getHeight(),
+                         act.value->getWidth(),
+                         /* trans */ false,
+                         useGpu(act.deviceId));
+
+  act.in->copyFrom(*act.value);
+  act.value->square2(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->squareDerivative(*act.in);
+  return Error();
+}
+END_DEFINE_ACTIVATION(square)
+
+/**
+ * @brief Exponential Activation.
+ * \f[
+ * f(z) = e^z
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(exponential)
+Error __must_check forward(Argument& act) {
+  act.value->exp2(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->expDerivative(*act.value);
+  return Error();
+}
+END_DEFINE_ACTIVATION(exponential)
+
+/**
+ * @brief Reciprocal Activation.
+ * \f[
+ * f(z) = 1/z
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(reciprocal)
+Error __must_check forward(Argument& act) {
+  act.value->reciprocal2();
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->dotMulSquare(*act.value);
+  act.grad->neg();
+  return Error();
+}
+END_DEFINE_ACTIVATION(reciprocal)
+
+/**
+ * @brief Square Root Activation.
+ * \f[
+ * f(z) = sqrt(z)
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(sqrt)
+Error __must_check forward(Argument& act) {
+  act.value->sqrt2();
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->dotDiv(*act.grad, *act.value);
+  act.grad->mulScalar(0.5);
+  return Error();
+}
+END_DEFINE_ACTIVATION(sqrt)
+
+/**
+ * @brief Logarithm Activation.
+ * \f[
+ * f(z) = log(z)
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(log)
+Error __must_check forward(Argument& act) {
+  SetDevice device(act.deviceId);
+  Matrix::resizeOrCreate(act.in,
+                         act.value->getHeight(),
+                         act.value->getWidth(),
+                         /* trans */ false,
+                         useGpu(act.deviceId));
+
+  act.in->copyFrom(*act.value);
+  act.value->log2(*act.value);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  act.grad->dotDiv(*act.grad, *act.in);
+  return Error();
+}
+END_DEFINE_ACTIVATION(log)
+
+ActivationFunction* ActivationFunction::create(const std::string& type) {
+#ifdef PADDLE_WITH_MKLDNN
+  if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
+    return MKLDNNActivation::create(type);
+  }
+#endif
+
+  return gActivationRegistrar.createByType(type);
+}
+
+std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
+  std::vector<std::string> types;
+  gActivationRegistrar.forEachType(
+      [&](const std::string& type) { types.push_back(type); });
+  return types;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/activations/ActivationFunction.h b/paddle/legacy/gserver/activations/ActivationFunction.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bc5b0f529a6358fba8b6c9d1e1f6ee2358dbbf9
--- /dev/null
+++ b/paddle/legacy/gserver/activations/ActivationFunction.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/legacy/utils/Error.h"
+
+namespace paddle {
+
+struct Argument;
+/**
+ * @brief Activation function is a function that transforms a set of input
+ * signals into an output signals. The purpose of the activation function
+ * is to introduce non-liearilty into the network.
+ *
+ * @note Common activation function are provieded, including linear,
+ * sigmoid, softmax, sequence_max, relu, brelu, tanh, stanh,
+ * softrelu, abs, square, exponential.
+ *
+ */
+class ActivationFunction {
+ public:
+  static ActivationFunction* create(const std::string& type);
+  static std::vector<std::string> getAllRegisteredTypes();
+
+  ActivationFunction() {}
+
+  virtual ~ActivationFunction() {}
+
+  /**
+   * @brief Foward propagation
+   *
+   * act.value <- f(act.value),
+   * where f is the activation function.
+   * Suppose that before calling forward(), act.value is x and
+   * after forward() is called, act.value is y, then y = f(x).
+   *
+   * Usually, act is Layer::output_
+   */
+  virtual Error __must_check forward(Argument& act) = 0;
+
+  /**
+   * @brief Backward propagaion
+   *
+   * x and y are defined in the above comment for forward().
+   * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
+   * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
+   */
+  virtual Error __must_check backward(Argument& act) = 0;
+
+  virtual const std::string& getName() const = 0;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/activations/MKLDNNActivation.cpp b/paddle/legacy/gserver/activations/MKLDNNActivation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2eed7af70a8a3cc305a79bbe23177ea71d15d252
--- /dev/null
+++ b/paddle/legacy/gserver/activations/MKLDNNActivation.cpp
@@ -0,0 +1,249 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNActivation.h"
+#include "mkldnn.hpp"
+#include "paddle/legacy/utils/ClassRegistrar.h"
+
+namespace paddle {
+
+static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
+/**
+ * @def MKLDNN_ACTIVATION_CLASS_NAME
+ * @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_;
+ * means mkldnn_reluActivation relu_;
+ */
+#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
+
+/**
+ * @def BEGIN_MKLDNN_ACTIVATION
+ */
+#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS {
+/**
+ * @def END_MKLDNN_ACTIVATION
+ */
+#define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
+ private:                                                          \
+  static const std::string name;                                   \
+                                                                   \
+ public:                                                           \
+  const std::string& getName() const { return name; }              \
+  }                                                                \
+  ;                                                                \
+  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \
+      "mkldnn_" #ACT_TYPE;                                         \
+  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {     \
+    gMKLDNNActivationRegistrar                                     \
+        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(    \
+            "mkldnn_" #ACT_TYPE);                                  \
+  });
+
+/**
+ * @def DEFINE_MKLDNN_ACTIVATION
+ */
+#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)        \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)
+
+/**
+ * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
+ */
+#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
+    ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
+ private:                                                            \
+  static const float alpha;                                          \
+  static const float bwdAlpha;                                       \
+                                                                     \
+ public:                                                             \
+  float getAlpha() const { return alpha; }                           \
+  float getBwdAlpha() const { return bwdAlpha; }                     \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA;
+
+/**
+ * @brief MKLDNN Relu Activation.
+ * Actually mkldnn_relu is Leaky Relu.
+ *  f(x) = x                   (x >= 0)
+ *  f(x) = negative_slope * x  (x <  0)
+ * @note the negative_slope should be -0.f in forward
+ */
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f)
+
+/**
+ * @brief MKLDNN Tanh Activation.
+ */
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f)
+
+/**
+ * @brief MKLDNN ELU(Exponential Linear Unit) Activation.
+ *  f(x) = x                              (x >= 0)
+ *  f(x) = negative_slope * (exp(x) - 1)  (x <  0)
+ */
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f)
+
+mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const {
+  const std::map<std::string, mkldnn::algorithm> algoMap = {
+      {"relu", algorithm::eltwise_relu},
+      {"tanh", algorithm::eltwise_tanh},
+      {"elu", algorithm::eltwise_elu}};
+  type.erase(0, 7);  // remove mkldnn_
+  algorithm algo = (algorithm)0;
+  mapGet(type, algoMap, &algo);
+  return algo;
+}
+
+void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);
+  // note: alpha represents the NegativeSlope when used in relu.
+  float alpha = getAlpha();
+  float beta = getBeta();
+  algorithm algo = getAlgo(this->getName());
+  auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
+                                   algo,
+                                   val_->getMemoryDesc(),
+                                   alpha,
+                                   beta);
+  fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_));
+  // use inplace for forward but save input value before submit
+  inVal_ = val_;
+  copyInVal_ = nullptr;
+  if (act.grad && algo == algorithm::eltwise_tanh) {
+    // tanh need save src input for backward
+    inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
+    copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
+    CHECK(copyInVal_) << "should not be emptry";
+    pipelineFwd_.push_back(*copyInVal_);
+  }
+  fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
+  pipelineFwd_.push_back(*fwd_);
+  needResetBwd_ = true;
+}
+
+void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
+  if (!needResetBwd_) {
+    return;
+  }
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+  needResetBwd_ = false;
+  algorithm algo = getAlgo(this->getName());
+  float alpha = getBwdAlpha();
+  float beta = getBeta();
+  grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
+  auto eng = CPUEngine::Instance().getEngine();
+  auto bwdDesc = eltwise_bwd::desc(
+      algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
+  auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
+  CHECK(inVal_);
+  bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
+  pipelineBwd_.clear();
+  pipelineBwd_.push_back(*bwd_);
+}
+
+/**
+ * @brief MKLDNN Softmax Activation
+ */
+DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation)
+
+void MKLDNNSoftmaxActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);
+  int axis = 1;
+  auto fwdDesc = softmax_fwd::desc(
+      mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis);
+  auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_);
+  fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_));
+  pipelineFwd_.push_back(*fwd_);
+}
+
+Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) {
+  resetFwd(act);
+  stream_->submit(pipelineFwd_);
+  real* v = act.value->getData();
+  real threshold = exp(-64);
+#pragma omp parallel for
+  for (size_t i = 0; i < act.value->getElementCnt(); ++i) {
+    v[i] = v[i] < threshold ? threshold : v[i];
+  }
+  return Error();
+}
+
+Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
+  MatrixPtr outputV = act.value;
+  MatrixPtr outputG = act.grad;
+  Matrix::resizeOrCreate(sftMaxDot_,
+                         outputG->getHeight(),
+                         outputG->getWidth(),
+                         /* trans */ false,
+                         /* useGpu */ false);
+  Matrix::resizeOrCreate(sftMaxSum_,
+                         outputG->getHeight(),
+                         1,
+                         /* trans */ false,
+                         /* useGpu */ false);
+  sftMaxDot_->dotMul(*outputG, *outputV);
+  sftMaxSum_->colMerge(*sftMaxDot_);
+  act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
+  return Error();
+}
+
+ActivationFunction* MKLDNNActivation::create(const std::string& type) {
+  return gMKLDNNActivationRegistrar.createByType(type);
+}
+
+std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
+  std::vector<std::string> types;
+  gMKLDNNActivationRegistrar.forEachType(
+      [&](const std::string& type) { types.push_back(type); });
+  return types;
+}
+
+void MKLDNNActivation::resetFwd(Argument& act) {
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
+  cnt_ = act.value->getElementCnt();
+  pipelineFwd_.clear();
+  stream_.reset(new MKLDNNStream());
+  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+  val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
+  if (val_ == nullptr) {
+    int bs = act.getBatchSize();
+    int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
+    int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
+    int ic = cnt_ / bs / ih / iw;
+    CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
+    val_ = MKLDNNMatrix::create(
+        {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
+    CHECK(val_);
+    val_->downSpatial();
+  }
+}
+
+Error __must_check MKLDNNActivation::forward(Argument& act) {
+  resetFwd(act);
+  stream_->submit(pipelineFwd_);
+  return Error();
+}
+Error __must_check MKLDNNActivation::backward(Argument& act) {
+  resetBwd(act);
+  stream_->submit(pipelineBwd_);
+  return Error();
+}
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/activations/MKLDNNActivation.h b/paddle/legacy/gserver/activations/MKLDNNActivation.h
new file mode 100644
index 0000000000000000000000000000000000000000..59c447ad07398c0b6ca7d78766dd533963744d1b
--- /dev/null
+++ b/paddle/legacy/gserver/activations/MKLDNNActivation.h
@@ -0,0 +1,119 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "ActivationFunction.h"
+#include "mkldnn.hpp"
+#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
+#include "paddle/legacy/math/MKLDNNMatrix.h"
+#include "paddle/legacy/parameter/Argument.h"
+
+namespace paddle {
+
+/**
+ * @brief Base class of MKLDNN Activation.
+ * Common activation function are provieded,
+ * including mkldnn_relu, mkldnn_elu, mkldnn_tanh, mkldnn_softmax
+ */
+class MKLDNNActivation : public ActivationFunction {
+ protected:
+  // input value element count
+  size_t cnt_;
+  // should not merge the resetBwd into resetFwd,
+  // because the grad data would be changing before backward.
+  bool needResetBwd_;
+  // mkldnn matrix, primitive, stream and pipeline
+  MKLDNNMatrixPtr val_;
+  MKLDNNMatrixPtr grad_;
+  std::shared_ptr<mkldnn::engine> engine_;
+  std::shared_ptr<MKLDNNStream> stream_;
+  std::shared_ptr<mkldnn::primitive> fwd_;
+  std::shared_ptr<mkldnn::primitive> bwd_;
+  std::vector<mkldnn::primitive> pipelineFwd_;
+  std::vector<mkldnn::primitive> pipelineBwd_;
+
+ public:
+  MKLDNNActivation() : cnt_(0), needResetBwd_(true) {}
+  ~MKLDNNActivation() {}
+  static ActivationFunction* create(const std::string& type);
+  static std::vector<std::string> getAllRegisteredTypes();
+  virtual const std::string& getName() const = 0;
+  /**
+   * reset the forward primitives
+   */
+  virtual void resetFwd(Argument& act);
+  /**
+   * reset the backward primitives,
+   * can not merge this functions into resetFwd as the grad data
+   * would be changing before backward.
+   */
+  virtual void resetBwd(Argument& act) {}
+  virtual Error __must_check forward(Argument& act);
+  virtual Error __must_check backward(Argument& act);
+};
+
+/**
+ * @brief Base class of MKLDNN Eltwise Activation,
+ * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh.
+ */
+class MKLDNNEltwiseActivation : public MKLDNNActivation {
+  typedef mkldnn::eltwise_forward eltwise_fwd;
+  typedef mkldnn::eltwise_backward eltwise_bwd;
+  typedef mkldnn::algorithm algorithm;
+
+ protected:
+  // save the forward primitive desc, which can be used backward
+  std::shared_ptr<eltwise_fwd::primitive_desc> fwdPD_;
+  // eltwise_bwd need src input value
+  MKLDNNMatrixPtr inVal_;
+  // use for copy data
+  std::shared_ptr<mkldnn::reorder> copyInVal_;
+
+ public:
+  MKLDNNEltwiseActivation() {}
+  ~MKLDNNEltwiseActivation() {}
+  virtual const std::string& getName() const = 0;
+
+  // in common, the alpha of forward and backward should be equal.
+  // but for relu, to avoid negative value, they should be opposite
+  virtual float getAlpha() const = 0;
+  virtual float getBwdAlpha() const = 0;
+  virtual float getBeta() const { return 0.f; }
+  virtual algorithm getAlgo(std::string type) const;
+  void resetFwd(Argument& act) override;
+  void resetBwd(Argument& act) override;
+};
+
+/**
+ * @brief Base class of MKLDNN softmax Activation,
+ * only have mkldnn forward, use cpu implement for backward.
+ */
+class MKLDNNSoftmaxActivation : public MKLDNNActivation {
+  typedef mkldnn::softmax_forward softmax_fwd;
+
+ private:
+  // for backward
+  MatrixPtr sftMaxSum_;
+  MatrixPtr sftMaxDot_;
+
+ public:
+  MKLDNNSoftmaxActivation() {}
+  ~MKLDNNSoftmaxActivation() {}
+  virtual const std::string& getName() const = 0;
+  void resetFwd(Argument& act) override;
+  Error __must_check forward(Argument& act) override;
+  Error __must_check backward(Argument& act) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/DataProvider.cpp b/paddle/legacy/gserver/dataproviders/DataProvider.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b67af8a326bdfd211ee5720bf67828040b19e5c1
--- /dev/null
+++ b/paddle/legacy/gserver/dataproviders/DataProvider.cpp
@@ -0,0 +1,410 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DataProvider.h"
+
+#include <unistd.h>
+#include <algorithm>
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/StringUtil.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+void BufferBatch::swap(BufferBatch* bufBatch) {
+  DataBatch* batchData = bufBatch->getDataBatch();
+  hl_event_t hlEvent = bufBatch->getCuEvent();
+  hl_stream_t hlStream = bufBatch->getCuStream();
+  bufBatch->setDataBatch(batchData_);
+  bufBatch->setCuStream(hlStream_);
+  bufBatch->setCuEvent(hlEvent_);
+
+  batchData_ = batchData;
+  hlEvent_ = hlEvent;
+  hlStream_ = hlStream;
+}
+
+void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
+  if (batchData_ == NULL) {
+    batchData_ = new DataBatch();
+  }
+  std::vector<Argument>& destData = batchData_->getStreams();
+  int numStreams = srcBatch->getNumStreams();
+  destData.resize(numStreams);
+  batchData_->setSize(srcBatch->getSize());
+  if (useGpu) {
+    createCuEvent();
+  }
+
+  for (int i = 0; i < numStreams; i++) {
+    destData[i].resizeAndCopyFrom(srcBatch->getStream(i), useGpu, hlStream_);
+  }
+  if (useGpu) {
+    hl_stream_record_event(hlStream_, hlEvent_);
+  }
+}
+
+DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
+                           bool useGpu,
+                           int64_t batchSize) {
+  batchSize_ = batchSize;
+  dataPool_ = dataPool;
+  useGpu_ = useGpu;
+  dataQueue_ = new BufferBatchQueue();
+  bufferQueue_ = new BufferBatchQueue();
+
+  // insert a empty buffer
+  bufferQueue_->enqueue(new BufferBatch());
+  stopping_ = false;
+  pending_ = true;
+}
+
+DoubleBuffer::~DoubleBuffer() {
+  finishAsyncLoad();
+  while (dataQueue_->size()) {
+    BufferBatch* dataBtch = dataQueue_->dequeue();
+    delete dataBtch;
+    dataBtch = NULL;
+  }
+  while (bufferQueue_->size()) {
+    BufferBatch* bufBtch = bufferQueue_->dequeue();
+    delete bufBtch;
+    bufBtch = NULL;
+  }
+  delete dataQueue_;
+  dataQueue_ = NULL;
+  delete bufferQueue_;
+  bufferQueue_ = NULL;
+}
+
+void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) {
+  // get data
+  BufferBatch* batch = dataQueue_->dequeue();
+  batch->syncEvent();  // when use GPU, need synchronized with the cuEvent
+  *dataBatch = *(batch->getDataBatch());
+
+  // push anothor buffer
+  if (*usingBatch_ == nullptr) {
+    *usingBatch_ = std::make_shared<BufferBatch>();
+  }
+
+  // Mark the using-batch
+  batch->swap((*usingBatch_).get());
+  bufferQueue_->enqueue(batch);
+
+  if (0 == dataBatch->getSize()) {
+    setPending(true);
+  }
+}
+
+void DoubleBuffer::insertOneBatch(DataBatch* batch) {
+  while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) {  // time out
+    if (stopping_) return;
+  }
+  BufferBatch* bufBatch = bufferQueue_->dequeue();
+  // clone and copy the data from an Threadlocal Variable
+  bufBatch->clone(batch, useGpu_);
+  dataQueue_->enqueue(bufBatch);
+}
+
+void DoubleBuffer::asyncLoadBatch() {
+  int64_t actualSize = 0;
+  if (useGpu_) {
+    hl_set_device(FLAGS_gpu_id);
+  }
+  setPending(false);
+
+  while (true) {
+    taskReadySem_.wait();
+    if (stopping_) break;
+
+    while (batchSize_ == 0 && !stopping_) {
+      usleep(5);
+    }
+    if (stopping_) break;
+
+    do {
+      DataBatch newBatch;
+      {
+        REGISTER_TIMER("getNextBatchInternal");
+        actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch);
+      }
+      insertOneBatch(&newBatch);
+    } while (actualSize > 0 && !stopping_);
+  }
+}
+
+void DoubleBuffer::startAsyncLoad() {
+  if (asyncLoader_ == nullptr) {
+    asyncLoader_.reset(new std::thread([this]() { this->asyncLoadBatch(); }));
+  }
+  taskReadySem_.post();
+}
+
+ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
+    DataProvider::registrar_;
+
+DataProvider* DataProvider::create(const DataConfig& config,
+                                   const ModelConfig& modelConfig,
+                                   bool useGpu) {
+  return registrar_.createByType(config.type(), config, modelConfig, useGpu);
+}
+
+REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
+REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
+
+int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
+  int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
+                                    : getNextBatchInternal(size, batch);
+
+  if (!batchSize) return 0;
+
+  if (!config_.constant_slots_size()) return batchSize;
+
+  auto& constantSlots = *constantSlots_;
+  constantSlots.resize(config_.constant_slots_size());
+
+  for (int i = 0; i < config_.constant_slots_size(); ++i) {
+    MemoryHandlePtr handle =
+        constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
+    Matrix::resizeOrCreate(constantSlots[i],
+                           batchSize,
+                           1,         // = width
+                           false,     // = trans
+                           useGpu_);  // = useGpu
+    if (handle != constantSlots[i]->getMemoryHandle()) {
+      // memory buf was reallocated. We need to initialize the value
+      constantSlots[i]->assign(config_.constant_slots(i));
+    }
+    batch->appendData(constantSlots[i],
+                      batch->getStream(0).sequenceStartPositions);
+  }
+
+  return batchSize;
+}
+
+int64_t DataProvider::getNextBatchFromBuffer(int64_t size, DataBatch* batch) {
+  CHECK(doubleBuffer_ != nullptr);
+
+  if (doubleBuffer_->getBatchSize() != size) {
+    doubleBuffer_->setBatchSize(size);
+  }
+
+  doubleBuffer_->removeOneBatch(batch);
+  return batch->getSize();
+}
+
+void DataProvider::initAsyncLoader() {
+  if (doubleBuffer_ == nullptr) {
+    doubleBuffer_.reset(new DoubleBuffer(this, useGpu_));
+  }
+  useGpu_ = false;  // Avoid D2D copy, it will delay the computing performance
+}
+
+SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
+                                               bool useGpu,
+                                               bool withInfo)
+    : DataProvider(config, useGpu) {
+  /* initialize the size of a sample, and the buffer */
+  sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
+  bufferCapacity_ = config_.buffer_capacity();
+  withInfo_ = withInfo;
+  sampleNumInBuf_ = 0;
+  nextItemIndex_ = 0;
+
+  /* malloc buffer in cpu */
+  hInputDataBuf_ = std::make_shared<CpuMatrix>(bufferCapacity_, sampleDim_);
+  hInputLabelBuf_ = std::make_shared<CpuIVector>(bufferCapacity_);
+  hInputInfoBuf_ = std::make_shared<CpuIVector>(bufferCapacity_);
+}
+
+void SimpleDataProviderBase::shuffle() {
+  int i, t;
+  int len = sampleNumInBuf_;
+  std::vector<real> temp(sampleDim_);
+  real* data = hInputDataBuf_->getData();
+  int* label = hInputLabelBuf_->getData();
+  int* info = hInputInfoBuf_->getData();
+  int sampleSz = sizeof(real) * sampleDim_;
+  for (i = 0; i < len; i++) {
+    int randNum = rand();  // NOLINT TODO(yuyang18): Use rand_r instead?
+    t = randNum % (len - i) + i;
+    // swap
+    if (i != t) {
+      // swap data
+      memcpy(&temp[0], &data[i * sampleDim_], sampleSz);
+      memcpy(&data[i * sampleDim_], &data[t * sampleDim_], sampleSz);
+      memcpy(&data[t * sampleDim_], &temp[0], sampleSz);
+      std::swap(label[i], label[t]);
+      if (withInfo_) {
+        std::swap(info[i], info[t]);
+      }
+    }
+  }
+}
+
+int64_t SimpleDataProviderBase::getNextBatchInternal(int64_t size,
+                                                     DataBatch* batch) {
+  CHECK(batch != NULL);
+  batch->clear();
+
+  int64_t startIndex;
+  int64_t cpySize;
+
+  std::lock_guard<RWLock> guard(lock_);
+  if (sampleNumInBuf_ - nextItemIndex_ < size) {
+    int64_t n = fillBuffer();
+    VLOG(1) << "fillBuffer return " << n << " samples.\n";
+  }
+
+  startIndex = nextItemIndex_;
+  cpySize = std::min(size, sampleNumInBuf_ - nextItemIndex_);
+  nextItemIndex_ += cpySize;
+
+  if (cpySize > 0) {
+    real* data = hInputDataBuf_->getData() + startIndex * sampleDim_;
+    int* label = hInputLabelBuf_->getData() + startIndex;
+    int* info = hInputInfoBuf_->getData() + startIndex;
+
+    MatrixPtr& dataBatch = *dataBatch_;     // get the thread local object
+    IVectorPtr& labelBatch = *labelBatch_;  // get the thread local object
+    IVectorPtr& infoBatch = *infoBatch_;    // get the thread local object
+    if (!dataBatch) {
+      dataBatch = Matrix::create(cpySize, sampleDim_, false, useGpu_);
+      labelBatch = IVector::create(cpySize, useGpu_);
+      if (withInfo_) {
+        infoBatch = IVector::create(cpySize, 0);
+      }
+    } else {
+      dataBatch->resize(cpySize, sampleDim_);
+      labelBatch->resize(cpySize);
+      if (withInfo_) {
+        infoBatch->resize(cpySize);
+      }
+    }
+    dataBatch->copyFrom(data, cpySize * sampleDim_);
+    labelBatch->copyFrom(label, cpySize);
+    batch->appendData(dataBatch);
+    batch->appendLabel(labelBatch);
+    if (withInfo_) {
+      infoBatch->copyFrom(info, cpySize);
+      batch->appendLabel(infoBatch);
+    }
+  }
+
+  batch->setSize(cpySize);
+  return cpySize;
+}
+
+void SimpleDataProviderBase::reset() {
+  sampleNumInBuf_ = 0;
+  nextItemIndex_ = 0;
+  DataProvider::reset();
+}
+
+int64_t SimpleDataProviderBase::getSize() {
+  LOG(FATAL) << "Currently, not implemented";
+  return 0;
+}
+
+int64_t SimpleDataProviderBase::fillBuffer() {
+  int64_t n = sampleNumInBuf_ - nextItemIndex_;
+
+  /* flash the remaining data to the beginning of the buffer */
+  if (n > 0) {
+    hInputDataBuf_->copyFrom(
+        hInputDataBuf_->getData() + nextItemIndex_ * sampleDim_,
+        n * sampleDim_);
+    hInputLabelBuf_->copyFrom(hInputLabelBuf_->getData() + nextItemIndex_, n);
+    if (withInfo_) {
+      hInputInfoBuf_->copyFrom(hInputInfoBuf_->getData() + nextItemIndex_, n);
+    }
+  }
+
+  sampleNumInBuf_ =
+      n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
+                        hInputLabelBuf_->getData() + n,
+                        hInputInfoBuf_->getData() + n,
+                        bufferCapacity_ - n);
+
+  /* for stachastic gradient training */
+  if (!skipShuffle_) {
+    shuffle();
+  }
+
+  nextItemIndex_ = 0;
+
+  return sampleNumInBuf_;
+}
+
+SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
+    : SimpleDataProviderBase(config, useGpu, /* withInfo= */ false),
+      currentSampleIndex_(0) {
+  loadData(config_.files());
+}
+
+SimpleDataProvider::~SimpleDataProvider() {}
+
+int64_t SimpleDataProvider::fillBufferImp(real* data,
+                                          int* label,
+                                          int* info,
+                                          int64_t size) {
+  (void)info;
+  int64_t n = std::min<int64_t>(labels_.size() - currentSampleIndex_, size);
+  memcpy(data,
+         &data_[currentSampleIndex_ * sampleDim_],
+         n * sampleDim_ * sizeof(real));
+  memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
+  currentSampleIndex_ += n;
+
+  return n;
+}
+
+void SimpleDataProvider::reset() {
+  currentSampleIndex_ = 0;
+  SimpleDataProviderBase::reset();
+}
+
+void SimpleDataProvider::loadData(const std::string& fileName) {
+  std::ifstream is(fileName);
+  CHECK(is) << "Fail to open " << fileName;
+  std::string line;
+  while (is) {
+    if (!getline(is, line)) break;
+    LOG(INFO) << "load data file " << line;
+    loadDataFile(line);
+  }
+  LOG(INFO) << "read done, num of instance=" << labels_.size()
+            << " data size=" << data_.size();
+}
+
+void SimpleDataProvider::loadDataFile(const std::string& fileName) {
+  std::ifstream is(fileName);
+  std::string line;
+  std::vector<std::string> pieces;
+  while (is) {
+    if (!getline(is, line)) break;
+    str::split(line, ' ', &pieces);
+    CHECK_EQ((uint64_t)(sampleDim_ + 1), pieces.size())
+        << " Dimension mismatch, " << pieces.size() - 1 << " in " << fileName
+        << " " << sampleDim_ << " from config";
+    labels_.push_back(atoi(pieces[0].c_str()));
+    for (int i = 0; i < sampleDim_; ++i) {
+      data_.push_back(atof(pieces[i + 1].c_str()));
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/DataProvider.h b/paddle/legacy/gserver/dataproviders/DataProvider.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2e1c5fdd6d504b77873aaeeba3611dff6d8f738
--- /dev/null
+++ b/paddle/legacy/gserver/dataproviders/DataProvider.h
@@ -0,0 +1,480 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+#include "DataConfig.pb.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/parameter/Argument.h"
+#include "paddle/legacy/utils/ClassRegistrar.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Queue.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+/**
+ * @def REGISTER_DATA_PROVIDER
+ * @brief Macro for registering a data provider. The class type should contain
+ *        a consturctor with parameter (DataConfig, bool).
+ */
+#define REGISTER_DATA_PROVIDER(__type_name, __class_name)                \
+  static InitFunction __reg_type_##__type_name([]() {                    \
+    DataProvider::registrar_.registerClass(                              \
+        #__type_name,                                                    \
+        [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
+          DataProvider* dp = new __class_name(conf, useGpu);             \
+          return dp;                                                     \
+        });                                                              \
+  })
+
+/**
+ * @def REGISTER_DATA_PROVIDER_EX
+ * @brief Macro for registering a data provider, which contains a constructor
+ *        with parameter (DataConfig, ModelConfig, bool).
+ */
+#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name)            \
+  static InitFunction __reg_type_##__type_name([] {                     \
+    DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
+  })
+
+class DataBatch;
+class BufferBatch;
+typedef std::shared_ptr<DataBatch> DataBatchPtr;
+typedef std::shared_ptr<BufferBatch> BufferBatchPtr;
+/**
+ * @brief Data for batch training a neural network
+ */
+class DataBatch {
+ public:
+  DataBatch() : size_(0) { data_.clear(); }
+  /**
+   * @brief Get batch size
+   * @return batch size
+   */
+  int64_t getSize() const { return size_; }
+  /**
+   * @brief Get num of sequences of sequence data
+   * @return num of sequences
+   */
+  int64_t getNumSequences() const {
+    if (data_.empty()) return size_;
+    return data_[0].sequenceStartPositions
+               ? data_[0].sequenceStartPositions->getSize() - 1
+               : size_;
+  }
+  /**
+   * @brief Set batch size
+   * @param[in] size size
+   */
+  void setSize(int64_t size) { size_ = size; }
+  /**
+   * @brief Get size of argument vector
+   * @return size of argument vector
+   * @note For usual supervised learning, input data and label is needed,
+   * then there will be two argument.
+   */
+  int64_t getNumStreams() const { return data_.size(); }
+
+  /**
+   * @brief Get a argument with index i
+   * @param[in] i index in argument vector
+   * @return a argument with index i
+   */
+  const Argument& getStream(int i) const { return data_[i]; }
+  /**
+   * @brief Get all argument
+   * @return an argument vector
+   */
+  std::vector<Argument>& getStreams() { return data_; }
+  /**
+   * @brief Get all argument const
+   * @return an argument vector
+   */
+  std::vector<Argument> getStreams() const { return data_; }
+  /**
+   * @brief Clear DataBatch
+   */
+  void clear() {
+    data_.clear();
+    size_ = 0;
+  }
+
+  /**
+   * @brief Append data to DataBatch
+   * @param[in] data  matrix data
+   * @note The order in which each data stream is appended must match the order
+   * specified in stream_names of DataConfig. The stream_names can be obtained
+   * using DataProvider::getStreamNames().
+   */
+  void appendData(MatrixPtr data) {
+    Argument argu;
+    argu.value = data;
+    data_.push_back(argu);
+  }
+
+  /**
+   * @brief Append sequence data to DataBatch
+   * @param[in] data                      matrix data
+   * @param[in] sequenceStartPositions    sequence data
+   * @note The order in which each data stream is appended must match the order
+   * specified in stream_names of DataConfig. The stream_names can be obtained
+   * using DataProvider::getStreamNames().
+   */
+  void appendData(const MatrixPtr& data,
+                  const ICpuGpuVectorPtr& sequenceStartPositions) {
+    Argument argu;
+    argu.value = data;
+    argu.sequenceStartPositions = sequenceStartPositions;
+    data_.push_back(argu);
+  }
+  /**
+   * @brief Append label data
+   * @param[in]  label    label data
+   * @param[in]  value    matrix data, default null
+   */
+  void appendLabel(IVectorPtr label, MatrixPtr value = nullptr) {
+    Argument argu;
+    argu.ids = label;
+    argu.value = value;
+    data_.push_back(argu);
+  }
+
+  /*
+   * @brief Append argument
+   * @param[in]  argus   DataBatch.getStreams()
+   * @param[in]  size    DataBatch.getSize()
+   * @param[in]  dataId  sub dataprovider id (in MultiDataProvider)
+   */
+  void appendArguments(const std::vector<Argument>& argus,
+                       int size,
+                       int dataId) {
+    size_ += size;
+    for (const auto& argu : argus) {
+      data_.push_back(argu);
+      data_.back().dataId = dataId;
+    }
+  }
+
+ protected:
+  /**
+   * @brief batch size
+   */
+  int64_t size_;
+  /**
+   * @brief A batch data consist of a Argument vector,
+   * An argument corresponds to a type of input data.
+   */
+  std::vector<Argument> data_;
+};
+
+class BufferBatch {
+ public:
+  BufferBatch() {
+    hlStream_ = HPPL_STREAM_DEFAULT;
+    hlEvent_ = NULL;
+    batchData_ = NULL;
+  }
+  ~BufferBatch() {
+    if (hlEvent_) {
+      hl_destroy_event(hlEvent_);
+      hlEvent_ = NULL;
+    }
+    delete batchData_;
+    batchData_ = NULL;
+  }
+
+  void setDataBatch(DataBatch* batchData) { batchData_ = batchData; }
+  DataBatch* getDataBatch() { return batchData_; }
+
+  void setCuStream(hl_stream_t stream) { hlStream_ = stream; }
+  hl_stream_t getCuStream() const { return hlStream_; }
+
+  void setCuEvent(hl_event_t event) { hlEvent_ = event; }
+
+  hl_event_t getCuEvent() const { return hlEvent_; }
+
+  void createCuEvent() {
+    if (!hlEvent_) {
+      hlStream_ = HPPL_STREAM_1;
+      hl_create_event(&hlEvent_);
+    }
+  }
+
+  void syncEvent() {
+    if (hlEvent_) {
+      hl_stream_wait_event(hlStream_, hlEvent_);
+    }
+  }
+
+  void swap(BufferBatch* bufBatch);
+  void clone(DataBatch* srcBatch, bool useGpu);
+
+ protected:
+  DataBatch* batchData_;
+  hl_stream_t hlStream_;
+  hl_event_t hlEvent_;
+};
+
+class DataProvider;
+typedef std::shared_ptr<DataProvider> DataProviderPtr;
+
+typedef Queue<BufferBatch*> BufferBatchQueue;
+
+class DoubleBuffer {
+ public:
+  DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
+  virtual ~DoubleBuffer();
+  void removeOneBatch(DataBatch* dataBatch);
+
+  void setBatchSize(int64_t newBatchSize) { batchSize_ = newBatchSize; }
+
+  int64_t getBatchSize() { return batchSize_; }
+
+  void startAsyncLoad();
+  void finishAsyncLoad() {
+    stopping_ = true;
+    taskReadySem_.post();
+    if (asyncLoader_) {
+      asyncLoader_->join();
+    }
+  }
+
+  void setPending(bool pending) { pending_ = pending; }
+
+ protected:
+  virtual void asyncLoadBatch();
+  void insertOneBatch(DataBatch* batch);
+
+  DataProvider* dataPool_;
+  bool useGpu_;
+  int32_t batchSize_;
+  ThreadLocal<BufferBatchPtr> usingBatch_;
+  BufferBatchQueue* dataQueue_;
+  BufferBatchQueue* bufferQueue_;
+  std::unique_ptr<std::thread> asyncLoader_;
+  Semaphore taskReadySem_;
+  bool stopping_;
+  bool pending_;
+};
+
+/**
+ * @brief Base class for DataProvider, which supplies data for training
+ * @note It can supplies multiple streams of data.
+ * For typical supervised training, there are two streams:
+ * one is for input, one is for label.
+ */
+class DataProvider {
+ public:
+  static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_;
+  static DataProvider* create(const DataConfig& config,
+                              const ModelConfig& modelConfig,
+                              bool useGpu = FLAGS_use_gpu);
+
+  /**
+   * @brief create only used for unittest.
+   */
+  inline static DataProvider* create(const DataConfig& config,
+                                     bool useGpu = FLAGS_use_gpu) {
+    return create(config, ModelConfig(), useGpu);
+  }
+
+  DataProvider(const DataConfig& config, bool useGpu)
+      : config_(config),
+        skipShuffle_(false),
+        usageRatio_(config.usage_ratio()),
+        useGpu_(useGpu) {
+    if (config_.async_load_data()) {
+      initAsyncLoader();
+    }
+  }
+  virtual ~DataProvider() {}
+
+  const DataConfig& getConfig() const { return config_; }
+
+  void setSkipShuffle() { skipShuffle_ = true; }
+
+  /**
+   * @brief Get next batch of training samples
+   * @param[in]    size    size of training samples to get
+   * @param[out]   batch   a batch of training samples
+   * @return actual size of obtained training samples
+   */
+  int64_t getNextBatch(int64_t size, DataBatch* batch);
+
+  /**
+   * @brief Shuffle the data set
+   */
+  virtual void shuffle() = 0;
+
+  /**
+   * @brief reset all the value of index
+   * @note reset() must be called before any calls to getNextBatch()
+   * IMPORTANT: subclass reset() should always call the base class reset()
+   * at the end of the function
+   */
+  virtual void reset() {
+    if (doubleBuffer_ != nullptr) {
+      doubleBuffer_->startAsyncLoad();
+    }
+  }
+
+  /**
+   * @brief Get the size of training samples
+   * @return the number of training samples in the data set.
+   * @note return -1 to indicate unlimited number of samples.
+   */
+  virtual int64_t getSize() = 0;
+
+  /**
+   * @brief Get next batch training samples internally
+   * @param[in]    size      size of training samples to get
+   * @param[out]   batch     a batch of training samples
+   * @return actual size of obtained training samples
+   */
+  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0;
+
+ protected:
+  DataConfig config_;
+  bool skipShuffle_;
+  float usageRatio_;
+  bool useGpu_;
+  std::unique_ptr<DoubleBuffer> doubleBuffer_;
+  ThreadLocal<std::vector<MatrixPtr>> constantSlots_;
+  /**
+   * @@brief Get next batch training samples from buffer
+   * @param[in]    size      size of training samples to get
+   * @param[out]   batch     a batch of training samples
+   * @return actual size of obtained training samples
+   */
+  int64_t getNextBatchFromBuffer(int64_t size, DataBatch* batch);
+
+  void initAsyncLoader();
+};
+
+/**
+ * A data provider which does nothing. It only serves as providing
+ * necessary configurations such as stream_names
+ */
+class DummyDataProvider : public DataProvider {
+ public:
+  DummyDataProvider(const DataConfig& config, bool useGpu)
+      : DataProvider(config, useGpu) {}
+  virtual void shuffle() {}
+  virtual void reset() { DataProvider::reset(); }
+  virtual int64_t getSize() { return 0; }
+  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) {
+    (void)size;
+    (void)batch;
+    return 0;
+  }
+};
+
+/**
+ * Data provider for one input and one integer label.
+ */
+class SimpleDataProviderBase : public DataProvider {
+ protected:
+  /// sample feature dimension
+  int64_t sampleDim_;
+  /// the number of samples
+  int64_t bufferCapacity_;
+  int64_t sampleNumInBuf_;
+  /// next item to read in buffer
+  int64_t nextItemIndex_;
+  /// some user defined info for validation
+  bool withInfo_;
+
+  /// data buffer: bufferCapacity_ * nDataDim_
+  CpuMatrixPtr hInputDataBuf_;
+
+  /// label buffer:bufferCapacity_ * 1
+  CpuIVectorPtr hInputLabelBuf_;
+
+  /// info buffer:bufferCapacity_ * 1
+  CpuIVectorPtr hInputInfoBuf_;
+
+  ThreadLocal<MatrixPtr> dataBatch_;
+  ThreadLocal<IVectorPtr> labelBatch_;
+  ThreadLocal<IVectorPtr> infoBatch_;
+
+  RWLock lock_;
+
+ public:
+  SimpleDataProviderBase(const DataConfig& config, bool useGpu, bool withInfo);
+  ~SimpleDataProviderBase() {}
+
+  void shuffle();
+
+  virtual void reset();
+
+  virtual int64_t getSize();
+
+  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
+
+  /// return the number of samples in the buffer
+  int64_t fillBuffer();
+
+ protected:
+  /**
+   * @brief Fill at most size samples into data and label.
+   *
+   * Each input is stored in contiguous memory locations in data.
+   *
+   * data[n * sampleDim_] .. data[n * sampleDim_ + sampleDim_ - 1] is for
+   * the input of the n-th sample.
+   *
+   * label[n] is the label for the n-th sample.
+   */
+  virtual int64_t fillBufferImp(real* data,
+                                int* label,
+                                int* info,
+                                int64_t size) = 0;
+};
+
+class SimpleDataProvider : public SimpleDataProviderBase {
+ public:
+  SimpleDataProvider(const DataConfig& config, bool useGpu);
+  ~SimpleDataProvider();
+  virtual void reset();
+
+ protected:
+  void loadData(const std::string& fileName);
+  void loadDataFile(const std::string& fileName);
+  virtual int64_t fillBufferImp(real* data,
+                                int* label,
+                                int* info,
+                                int64_t size);
+
+ protected:
+  size_t currentSampleIndex_;
+  std::vector<int> labels_;
+  std::vector<real> data_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/DataProviderGroup.h b/paddle/legacy/gserver/dataproviders/DataProviderGroup.h
new file mode 100644
index 0000000000000000000000000000000000000000..91c94dc986c7aeb70df25511ce14a5f9c312a159
--- /dev/null
+++ b/paddle/legacy/gserver/dataproviders/DataProviderGroup.h
@@ -0,0 +1,153 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "DataProvider.h"
+
+namespace paddle {
+
+template <class T>
+class DataProviderGroup : public DataProvider {
+ protected:
+  typedef T ProviderType;
+  typedef std::shared_ptr<ProviderType> ProviderPtrType;
+  ProviderPtrType provider_;
+
+  std::vector<std::string> fileList_;
+  std::mutex lock_;
+  std::unique_ptr<MultiThreadWorker<ProviderType>> loader_;
+
+ public:
+  DataProviderGroup(const DataConfig& config, bool useGpu);
+  ~DataProviderGroup() {}
+
+  virtual void reset();
+  virtual void shuffle() {}
+  virtual int64_t getSize() { return -1; }
+  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
+
+ private:
+  void startLoader();
+  void stopLoader();
+  void forceStopLoader();
+  ProviderPtrType loadFile(const std::vector<std::string>& fileList);
+};
+
+template <class T>
+DataProviderGroup<T>::DataProviderGroup(const DataConfig& config, bool useGpu)
+    : DataProvider(config, useGpu) {
+  // load file list
+  loadFileList(config_.files(), fileList_);
+  CHECK_GT(fileList_.size(), 0LU);
+  LOG(INFO) << "load file list, numfiles=" << fileList_.size()
+            << ", max_num_of_data_providers_in_memory="
+            << (1 + config_.file_group_conf().queue_capacity() +
+                config_.file_group_conf().load_thread_num());
+}
+
+template <class T>
+void DataProviderGroup<T>::reset() {
+  forceStopLoader();
+  CHECK(!loader_);
+  provider_ = nullptr;
+
+  // shuffle file list
+  std::shuffle(
+      fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
+
+  startLoader();
+  DataProvider::reset();
+}
+
+template <class T>
+int64_t DataProviderGroup<T>::getNextBatchInternal(int64_t size,
+                                                   DataBatch* batch) {
+  std::lock_guard<std::mutex> guard(lock_);
+
+  if (!loader_) {
+    return 0;
+  }
+  if (provider_) {
+    int64_t ret = provider_->getNextBatchInternal(size, batch);
+    if (ret > 0) {
+      return ret;
+    }
+  }
+
+  // else get data from next data provider
+  if (loader_->testResult()) {
+    LOG(INFO) << "WAIT provider";
+  }
+  provider_ = loader_->waitResult();
+  if (!provider_) {
+    stopLoader();  // All the data providers have been returned
+    return 0;
+  }
+  int64_t ret = provider_->getNextBatchInternal(size, batch);
+  CHECK(ret > 0) << "new data provider does not contain any valid samples!";
+  return ret;
+}
+
+template <class T>
+void DataProviderGroup<T>::startLoader() {
+  loader_.reset(new MultiThreadWorker<ProviderType>(
+      config_.file_group_conf().load_thread_num(),
+      config_.file_group_conf().queue_capacity()));
+
+  int loadFileCount = config_.file_group_conf().load_file_count();
+  for (size_t startPos = 0; startPos < fileList_.size();
+       startPos += loadFileCount) {
+    size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
+    std::vector<std::string> fileVec(fileList_.begin() + startPos,
+                                     fileList_.begin() + endPos);
+    loader_->addJob([this, fileVec]() -> ProviderPtrType {
+      return this->loadFile(fileVec);
+    });
+  }
+  loader_->stopAddJob();
+}
+
+template <class T>
+void DataProviderGroup<T>::stopLoader() {
+  if (loader_) {
+    loader_->stop();
+    loader_ = nullptr;
+  }
+}
+
+template <class T>
+void DataProviderGroup<T>::forceStopLoader() {
+  if (loader_) {
+    loader_->forceStop();
+    loader_ = nullptr;
+  }
+}
+
+template <class T>
+std::shared_ptr<T> DataProviderGroup<T>::loadFile(
+    const std::vector<std::string>& fileList) {
+  // disable async_load_data in sub dataprovider
+  DataConfig subConfig = config_;
+  subConfig.set_async_load_data(false);
+
+  CHECK(!fileList.empty()) << "fileList is empty";
+  ProviderPtrType provider =
+      std::make_shared<ProviderType>(subConfig, useGpu_, false);
+  provider->loadData(fileList);
+  provider->reset();
+  return provider;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp b/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e5fc6d8a88fe2c03cc74b4a38e999d11d676dfdf
--- /dev/null
+++ b/paddle/legacy/gserver/dataproviders/MultiDataProvider.cpp
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MultiDataProvider.h"
+#include <algorithm>
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+using namespace std;
+
+MultiDataProvider::MultiDataProvider(const DataConfig& config,
+                                     const ModelConfig& modelConfig,
+                                     bool useGpu)
+    : DataProvider(config, useGpu) {
+  bool atLeastOneMainDataFlag = false;
+  totalDataRatio_ = 0;
+  LOG(INFO) << "MultiDataProvider: sub data provider size: "
+            << config.sub_data_configs_size();
+  LOG(INFO) << "MultiDataProvider: for_test: " << config.for_test();
+  isTestMode_ = config.for_test();
+  for (int i = 0; i < config.sub_data_configs_size(); i++) {
+    LOG(INFO) << "dataRatio of sub(" << i
+              << ") is: " << config.sub_data_configs(i).data_ratio();
+    totalDataRatio_ += config.sub_data_configs(i).data_ratio();
+    if (config.sub_data_configs(i).is_main_data()) {
+      LOG(INFO) << "main data is [" << i << "]";
+      atLeastOneMainDataFlag = true;
+    }
+  }
+  CHECK(atLeastOneMainDataFlag) << "all sub dataproviders in MultiData do not"
+                                << " have is_main_data flag";
+  LOG(INFO) << "totalDataRatio_=" << totalDataRatio_;
+  DataConfig subConfig;
+  int subDataProviderCount = config.sub_data_configs_size();
+  if (isTestMode()) {
+    LOG(INFO) << "construct MultiDataProvider in test mode";
+  } else {
+    LOG(INFO) << "construct MultiDataProvider in train mode";
+  }
+  subDataProviders_.resize(subDataProviderCount);
+  for (int i = 0; i < subDataProviderCount; i++) {
+    subConfig = config.sub_data_configs(i);
+    if (subConfig.async_load_data()) {
+      LOG(INFO) << "can not use async_load_data in sub dataprovider of "
+                   "MultiDataProvider";
+      subConfig.set_async_load_data(false);
+    }
+    subDataProviders_[i] = std::unique_ptr<DataProvider>(
+        DataProvider::create(subConfig, modelConfig, useGpu_));
+  }
+}
+
+void MultiDataProvider::reset() {
+  for (auto& elem : subDataProviders_) {
+    elem->reset();
+  }
+  DataProvider::reset();
+}
+
+void MultiDataProvider::shuffle() {
+  for (auto& elem : subDataProviders_) {
+    elem->shuffle();
+  }
+}
+
+int64_t MultiDataProvider::getNextBatchInternal(int64_t size,
+                                                DataBatch* batch) {
+  batch->clear();
+  for (size_t i = 0; i < subDataProviders_.size(); ++i) {
+    // calc size according to data ratio
+    int64_t subSize =
+        (int64_t)(1.0 * size * config_.sub_data_configs(i).data_ratio() /
+                  totalDataRatio_);
+    DataBatch subBatch;
+    int64_t realSize =
+        subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch);
+    if (realSize == 0) {
+      // current subDataProvider has no data
+      if (!isTestMode()) {
+        // in train mode
+        if (config_.sub_data_configs(i).is_main_data()) {
+          // is main data provider. then return 0
+          batch->clear();
+          return 0;
+        } else {
+          // not main data provider, reset current subDataProvider and try again
+          subDataProviders_[i]->reset();
+          subBatch.clear();
+          realSize =
+              subDataProviders_[i]->getNextBatchInternal(subSize, &subBatch);
+          CHECK_GT(realSize, 0);
+        }
+      } else {
+        // in test mode, make an empty argument
+        Argument emptyArgu;
+        std::vector<Argument> argus;
+        argus.push_back(emptyArgu);
+        batch->appendArguments(argus, 0, -1);
+        continue;
+      }
+    }
+    batch->appendArguments(subBatch.getStreams(), subBatch.getSize(), i);
+  }
+  return batch->getSize();
+}
+
+REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/MultiDataProvider.h b/paddle/legacy/gserver/dataproviders/MultiDataProvider.h
new file mode 100644
index 0000000000000000000000000000000000000000..baa1fc019002f86414c9c45734ad65cda916d457
--- /dev/null
+++ b/paddle/legacy/gserver/dataproviders/MultiDataProvider.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "DataProvider.h"
+
+namespace paddle {
+
+class MultiDataProvider : public DataProvider {
+ protected:
+  std::vector<std::unique_ptr<DataProvider>> subDataProviders_;
+
+ public:
+  MultiDataProvider(const DataConfig& config,
+                    const ModelConfig& modelConfig,
+                    bool useGpu);
+  ~MultiDataProvider() {}
+  virtual void reset();
+  virtual void shuffle();
+  virtual int64_t getSize() { return -1; }
+  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
+  bool isTestMode() const { return isTestMode_; }
+
+ private:
+  int totalDataRatio_;
+  bool isTestMode_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/ProtoReader.h b/paddle/legacy/gserver/dataproviders/ProtoReader.h
new file mode 100644
index 0000000000000000000000000000000000000000..08d045226e1ebb014bdd91ebf0e8f0353179b0c8
--- /dev/null
+++ b/paddle/legacy/gserver/dataproviders/ProtoReader.h
@@ -0,0 +1,177 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/io/gzip_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/message_lite.h>
+
+namespace paddle {
+
+/**
+ * ProtoReader/ProtoWriter are used to read/write a sequence of protobuf
+ * messages from/to i/ostream.
+ */
+class ProtoReader {
+ public:
+  explicit ProtoReader(std::istream* s, bool dataCompression = false) {
+    CHECK(s) << "istream pointer is nullptr";
+    istreamInput_.reset(new google::protobuf::io::IstreamInputStream(s));
+    if (dataCompression) {
+      gzipInput_.reset(
+          new google::protobuf::io::GzipInputStream(istreamInput_.get()));
+      codedInput_.reset(
+          new google::protobuf::io::CodedInputStream(gzipInput_.get()));
+    } else {
+      codedInput_.reset(
+          new google::protobuf::io::CodedInputStream(istreamInput_.get()));
+    }
+    dataCompression_ = dataCompression;
+    approximateReadedBytes_ = 0;
+    codedInput_->SetTotalBytesLimit(kDefaultTotalBytesLimit,
+                                    kDefaultTotalBytesLimit);
+  }
+
+  /**
+   * read one message
+   */
+  bool read(google::protobuf::MessageLite* msg) {
+    if (approximateReadedBytes_ >= kMaxLimitBytes) {
+      // Once bytes we read get close to 64MB(larger than 55MB),
+      // we re-intialize the codedInputStream object.
+      approximateReadedBytes_ = 0;
+
+      /**
+       * Explicitly destroys the object owned by unique_ptr at first and then
+       * construct an new object.
+       *
+       * 1.reset()
+       *
+       * 2.reset(new ...)   <-- such sequence is EXTREAMLY important!
+       *
+       * Reason: (!!!Read me before you modify the following 2 lines of
+       * codes!!!)
+       *
+       * Otherwise, reset() method will ask the CodedInputStream constructor
+       * to construct the new object at first forcing the IstreamInputStream
+       * object to move its underlying pointer to the next 8192 bytes.
+       *
+       * Then the old object will be destroied calling
+       * IstreamInputStream::BackUp() to move the underlying pointer back.
+       * This means that the InstreamInputStream object is referenced by
+       * 2 different CodedInputStream object at the same time which "confuses"
+       * the position of istreamInput_'s underlying pointer. Such fatal
+       * confusion will lead to undefined behaviour when 'codedInput_' is
+       * used to read new data.
+       *
+       */
+      codedInput_.reset();
+      if (dataCompression_) {
+        codedInput_.reset(
+            new google::protobuf::io::CodedInputStream(gzipInput_.get()));
+      } else {
+        codedInput_.reset(
+            new google::protobuf::io::CodedInputStream(istreamInput_.get()));
+      }
+      codedInput_->SetTotalBytesLimit(kDefaultTotalBytesLimit,
+                                      kDefaultTotalBytesLimit);
+    }
+
+    uint32_t size;
+    if (!codedInput_->ReadVarint32(&size)) {
+      return false;
+    }
+    google::protobuf::io::CodedInputStream::Limit limit =
+        codedInput_->PushLimit(size);
+    CHECK(msg->ParseFromCodedStream(codedInput_.get()));
+    codedInput_->PopLimit(limit);
+
+    /**
+     * size is varint in the data file, we don't know the length.
+     * We assume every size takes 4 bytes in the data file.
+     */
+    approximateReadedBytes_ += 4 + size;
+    return true;
+  }
+
+ protected:
+  std::unique_ptr<google::protobuf::io::ZeroCopyInputStream> istreamInput_;
+  std::unique_ptr<google::protobuf::io::GzipInputStream> gzipInput_;
+  std::unique_ptr<google::protobuf::io::CodedInputStream> codedInput_;
+  bool dataCompression_;
+
+  /**
+   * This is the maximum number of bytes that this CodedInputStream will read
+   * before refusing to continue.
+   */
+  static const int kDefaultTotalBytesLimit = 64 << 20;  // 64MB
+
+  /**
+   * If data readed by the reader is more than 55MB( << 64MB),
+   * we reset the CodedInputStream object.
+   * This can help avoid 64MB warning which will cause the ParseFromCodedStream
+   * to fail.
+   */
+  static const int kMaxLimitBytes = 55 << 20;
+
+  /**
+   * This variable dosen't store the exact bytes readed by CodedInputStream
+   * object since which is constructed. Instead, it store the approximate bytes
+   * because we can't tell how many bytes are readed by the object with the
+   * help of API.
+   *
+   * @note this code depends on protobuf 2.4.0. There is nothing like
+   * CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many
+   * bytes has the object readed so far. Therefore, we calculated bytes
+   * ourselves.
+   */
+  int approximateReadedBytes_;
+};
+
+class ProtoWriter {
+ public:
+  explicit ProtoWriter(std::ostream* s, bool dataCompression = false) {
+    CHECK(s) << "ostream pointer is nullptr";
+    ostreamOutput_.reset(new google::protobuf::io::OstreamOutputStream(s));
+    if (dataCompression) {
+      gzipOutput_.reset(
+          new google::protobuf::io::GzipOutputStream(ostreamOutput_.get()));
+      codedOutput_.reset(
+          new google::protobuf::io::CodedOutputStream(gzipOutput_.get()));
+    } else {
+      codedOutput_.reset(
+          new google::protobuf::io::CodedOutputStream(ostreamOutput_.get()));
+    }
+  }
+
+  /**
+   * write one message.
+   */
+  bool write(const google::protobuf::MessageLite& msg) {
+    codedOutput_->WriteVarint32(msg.ByteSize());
+    bool ret = msg.SerializeToCodedStream(codedOutput_.get());
+    return ret;
+  }
+
+ protected:
+  std::unique_ptr<google::protobuf::io::ZeroCopyOutputStream> ostreamOutput_;
+  std::unique_ptr<google::protobuf::io::GzipOutputStream> gzipOutput_;
+  std::unique_ptr<google::protobuf::io::CodedOutputStream> codedOutput_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0827bd39d4cc78ef5658d437b6502f2e60e90b4c
--- /dev/null
+++ b/paddle/legacy/gserver/dataproviders/PyDataProvider.cpp
@@ -0,0 +1,498 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PyDataProvider.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+#ifndef PADDLE_NO_PYTHON
+REGISTER_DATA_PROVIDER(py, PyDataProvider);
+#endif
+
+PyDataProvider::PyDataProvider(const DataConfig& config,
+                               bool useGpu,
+                               bool loadDataAll)
+    : DataProvider(config, useGpu), batchSize_(0) {
+  PyGuard guard;
+  pyModuleName_ = config_.load_data_module();
+  pyClassName_ = config_.load_data_object();
+  if (config_.load_data_args() != "") {
+    pyUserArgs_["load_data_args"] = config_.load_data_args();
+  }
+
+  if (loadDataAll) {
+    std::vector<std::string> fileList;
+    if (!config_.files().empty()) {
+      loadFileList(config_.files(), fileList);
+    }
+    loadData(fileList);
+  }
+}
+
+void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
+  VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_;
+  classInstance_ =
+      createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
+  CHECK(classInstance_) << "Create class instance failed.";
+  PyObjectPtr obj(PyObject_CallMethod(
+      classInstance_.get(), const_cast<char*>("getHeader"), NULL));
+  CHECK_PY(obj) << "Call function getHeader failed.";
+  std::string headerInfo =
+      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
+  parseHeaderData(headerInfo);
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+}
+
+void PyDataProvider::parseHeaderData(const std::string& headerData) {
+  char* pHeader = const_cast<char*>(headerData.c_str());
+  char* pHeaderEnd = pHeader + headerData.size();
+  slotNum_ = readT<unsigned int>(pHeader, pHeaderEnd);
+  unsigned int useSequenceFlag = readT<unsigned int>(pHeader, pHeaderEnd);
+  isIID_ = useSequenceFlag != 1;
+  slots_.clear();
+  slots_.reserve(slotNum_);
+  for (size_t i = 0; i < slotNum_; ++i) {
+    unsigned int slotType = readT<unsigned int>(pHeader, pHeaderEnd);
+    unsigned int slotDim = readT<unsigned int>(pHeader, pHeaderEnd);
+    slots_.emplace_back();
+    slots_.back().dim = slotDim;
+    slots_.back().type = static_cast<SlotDef_SlotType>(slotType);
+  }
+}
+
+void PyDataProvider::resetSlots() {
+  for (auto& slot : slots_) {
+    slot.indexData.clear();
+    slot.denseData.clear();
+    slot.sparseNonValueData.clear();
+    slot.sparseFloatValueData.clear();
+    slot.indices.clear();
+    slot.sequenceStartPositions.clear();
+    slot.sampleSequenceIdVec.clear();
+    slot.subSequenceStartPositions.clear();
+    slot.strData.clear();
+  }
+}
+
+void PyDataProvider::fillDenseSlot(ProtoSlot& slot,
+                                   char*& data,
+                                   const char* dataEnd) {
+  unsigned int dim = slot.dim;
+  slot.sampleNum = readT<unsigned int>(data, dataEnd);
+  slot.denseData.resize(slot.sampleNum * dim);
+#ifdef PADDLE_TYPE_DOUBLE
+  CHECK_LE(data + sizeof(real) * dim * slot.sampleNum, dataEnd)
+      << "std::copy data is out of range";
+  // PyDataProvider always provide data in float
+  float* dat = reinterpret_cast<float*>(data);
+  std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin());
+#else
+  memcpyWithCheck(slot.denseData.data(),
+                  data,
+                  sizeof(real) * dim * slot.sampleNum,
+                  dataEnd);
+#endif
+  // PyDataProvider always provide data in float
+  data += sizeof(float) * dim * slot.sampleNum;
+}
+
+void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot,
+                                            char*& data,
+                                            const char* dataEnd) {
+  slot.sampleNum = readT<unsigned int>(data, dataEnd);
+  unsigned int* indexPtr = (unsigned int*)data;
+  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
+      << "Vector assign value is out of range";
+  slot.indices.assign(indexPtr, indexPtr + slot.sampleNum);
+  data += sizeof(unsigned int) * slot.sampleNum;
+  unsigned int length = 0;
+  length = readT<unsigned int>(data, dataEnd);
+  slot.indices.push_back(length);
+  slot.sparseNonValueData.resize(length);
+  memcpyWithCheck(slot.sparseNonValueData.data(),
+                  data,
+                  sizeof(unsigned int) * length,
+                  dataEnd);
+  data += sizeof(unsigned int) * length;
+}
+
+void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot,
+                                         char*& data,
+                                         const char* dataEnd) {
+  slot.sampleNum = readT<unsigned int>(data, dataEnd);
+  unsigned int* indexPtr = (unsigned int*)data;
+  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
+      << "Vector assign value is out of range";
+  slot.indices.assign(indexPtr, indexPtr + slot.sampleNum);
+  data += sizeof(unsigned int) * slot.sampleNum;
+  unsigned int length = 0;
+  length = readT<unsigned int>(data, dataEnd);
+  unsigned int* colPtr = reinterpret_cast<unsigned int*>(data);
+  CHECK_LE(data + sizeof(unsigned int) * length, dataEnd)
+      << "Data is out of range";
+  data += sizeof(unsigned int) * length;
+  size_t colLen = readT<unsigned int>(data, dataEnd);
+  CHECK_EQ(colLen, length);
+  float* valuePtr = reinterpret_cast<float*>(data);
+  CHECK_LE(data + sizeof(real) * length, dataEnd) << "Data is out of range";
+  data += sizeof(real) * length;
+  slot.indices.push_back(length);
+  slot.sparseFloatValueData.resize(length);
+  for (unsigned int ii = 0; ii < length; ++ii) {
+    slot.sparseFloatValueData[ii].col = colPtr[ii];
+    slot.sparseFloatValueData[ii].value = valuePtr[ii];
+  }
+}
+
+void PyDataProvider::fillIndexSlot(ProtoSlot& slot,
+                                   char*& data,
+                                   const char* dataEnd) {
+  slot.sampleNum = readT<unsigned int>(data, dataEnd);
+  CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd)
+      << "Vector assign is out of range";
+  slot.indexData.assign(reinterpret_cast<int*>(data),
+                        reinterpret_cast<int*>(data) + slot.sampleNum);
+  data += sizeof(unsigned int) * slot.sampleNum;
+}
+
+void PyDataProvider::fillStringSlot(ProtoSlot& slot,
+                                    char*& data,
+                                    const char* dataEnd) {
+  slot.sampleNum = readT<unsigned int>(data, dataEnd);
+  for (unsigned int i = 0; i < slot.sampleNum; ++i) {
+    size_t len = readT<uint32_t>(data, dataEnd);
+    auto str_begin = data;
+    data += len;
+    CHECK_LE(data, dataEnd) << "Data is out of range";
+    slot.strData.emplace_back(str_begin, len);
+  }
+}
+
+void PyDataProvider::fillSlotsByStr(const std::string& samples) {
+  char* data = const_cast<char*>(samples.c_str());
+  char* dataEnd = data + samples.size();
+  batchSize_ = readT<unsigned int>(data, dataEnd);
+  if (0 == batchSize_) {
+    return;
+  }
+
+  for (size_t j = 0; j < slotNum_; ++j) {
+    auto& slot = slots_[j];
+    CHECK(SlotDef::INDEX >= slot.type || SlotDef::STRING == slot.type)
+        << " Slot type:" << slot.type << " is out of range.";
+    CHECK_GE(slot.type, SlotDef::VECTOR_DENSE) << " Slot type:" << slot.type
+                                               << " is out of range.";
+    switch (slot.type) {
+      case SlotDef::VECTOR_DENSE:
+        fillDenseSlot(slot, data, dataEnd);
+        break;
+      case SlotDef::VECTOR_SPARSE_NON_VALUE:
+        fillSparseNonValueSlot(slot, data, dataEnd);
+        break;
+      case SlotDef::VECTOR_SPARSE_VALUE:
+        fillSparseValueSlot(slot, data, dataEnd);
+        break;
+      case SlotDef::INDEX:
+        fillIndexSlot(slot, data, dataEnd);
+        break;
+      case SlotDef::VAR_MDIM_DENSE:
+        LOG(FATAL) << "Not implemented";
+        break;
+      case SlotDef::VAR_MDIM_INDEX:
+        LOG(FATAL) << "Not implemented";
+        break;
+      case SlotDef::STRING:
+        fillStringSlot(slot, data, dataEnd);
+        break;
+    }
+  }
+  // read sequenceStartPositions
+  for (size_t j = 0; j < slotNum_; ++j) {
+    auto& slot = slots_[j];
+    if (!iidData()) {
+      unsigned int sequenceNum = readT<unsigned int>(data, dataEnd);
+      slot.sequenceNum = sequenceNum;
+      for (size_t i = 0; i < sequenceNum; ++i) {
+        slot.sequenceStartPositions.push_back(
+            readT<unsigned int>(data, dataEnd));
+      }
+      for (size_t i = 0; i < sequenceNum; ++i) {
+        size_t begin = slot.sequenceStartPositions[i];
+        size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1]
+                                           : slot.sampleNum;
+        for (size_t ii = begin; ii < end; ++ii) {
+          slot.sampleSequenceIdVec.push_back(ii);
+        }
+      }
+    } else {
+      for (size_t i = 0; i < slot.sampleNum; ++i) {
+        slot.sampleSequenceIdVec.push_back(i);
+      }
+    }
+  }
+  // read subSequenceStartPositions, not all slots have this infomation.
+  for (size_t j = 0; j < slotNum_; ++j) {
+    auto& slot = slots_[j];
+    if (!iidData() && data != dataEnd) {
+      unsigned int subSequenceNum = readT<unsigned int>(data, dataEnd);
+      slot.subSequenceNum = subSequenceNum;
+      for (size_t i = 0; i < subSequenceNum; ++i) {
+        slot.subSequenceStartPositions.push_back(
+            readT<unsigned int>(data, dataEnd));
+      }
+    }
+  }
+}
+
+void PyDataProvider::reset() {
+  {  // Invoke PyDataProvider Reset
+    PyGuard guard;
+    PyObjectPtr obj(PyObject_CallMethod(
+        classInstance_.get(), const_cast<char*>("reset"), NULL));
+    CHECK_PY(obj) << "Call function reset failed.";
+  }
+
+  if (!skipShuffle_) {
+    // Invoke PyDataProvider Shuffle
+    shuffle();
+  }
+  DataProvider::reset();
+}
+
+void PyDataProvider::shuffle() {
+  // py shuffle
+  PyGuard guard;
+  PyObjectPtr obj(PyObject_CallMethod(
+      classInstance_.get(), const_cast<char*>("shuffle"), NULL));
+  CHECK_PY(obj) << "Call function shuffle failed.";
+}
+
+void PyDataProvider::handleDenseSlot(ProtoSlot& slot,
+                                     size_t slotIndex,
+                                     std::vector<Argument>& cpuArguments) {
+  unsigned int dim = slot.dim;
+  Matrix::resizeOrCreate(cpuArguments[slotIndex].value,
+                         slot.sampleNum,
+                         dim,
+                         false,   // trans = false
+                         false);  // useGpu = false
+  real* buf = cpuArguments[slotIndex].value->getData();
+  for (size_t i = 0; i < slot.sampleNum; ++i) {
+    memcpyWithCheck(buf + i * dim,
+                    slot.denseData.data() + slot.sampleSequenceIdVec[i] * dim,
+                    sizeof(real) * dim,
+                    slot.denseData.data() + slot.denseData.size());
+  }
+}
+
+void PyDataProvider::handleSparseNonValueSlot(
+    ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
+  unsigned int dim = slot.dim;
+  if (!(cpuArguments[slotIndex].value)) {
+    cpuArguments[slotIndex].value =
+        Matrix::createSparseMatrix(slot.sampleNum,
+                                   dim,
+                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+                                   NO_VALUE,
+                                   SPARSE_CSR,
+                                   false,
+                                   useGpu_);
+  }
+  auto mat = cpuArguments[slotIndex].value;
+  mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
+  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
+    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseNonValueData.data(),
+        HPPL_STREAM_1);
+  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
+    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseNonValueData.data());
+  } else {
+    LOG(FATAL) << "Not Supported";
+  }
+}
+
+void PyDataProvider::handleSparseValueSlot(
+    ProtoSlot& slot, size_t slotIndex, std::vector<Argument>& cpuArguments) {
+  unsigned int dim = slot.dim;
+  if (!(cpuArguments[slotIndex].value)) {
+    cpuArguments[slotIndex].value =
+        Matrix::createSparseMatrix(slot.sampleNum,
+                                   dim,
+                                   slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/,
+                                   FLOAT_VALUE,
+                                   SPARSE_CSR,
+                                   false,
+                                   useGpu_);
+  }
+  auto mat = cpuArguments[slotIndex].value;
+  mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
+  if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
+    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseFloatValueData.data(),
+        HPPL_STREAM_DEFAULT);
+  } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
+    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseFloatValueData.data());
+  } else {
+    LOG(FATAL) << "Not Supported";
+  }
+}
+
+void PyDataProvider::handleIndexSlot(ProtoSlot& slot,
+                                     size_t slotIndex,
+                                     std::vector<Argument>& cpuArguments) {
+  IVector::resizeOrCreate(cpuArguments[slotIndex].ids,
+                          slot.sampleNum,
+                          /*useGpu_*/ false);
+  int* buf = cpuArguments[slotIndex].ids->getData();
+  for (size_t i = 0; i < slot.sampleNum; ++i) {
+    buf[i] = slot.indexData[slot.sampleSequenceIdVec[i]];
+  }
+}
+
+void PyDataProvider::handleStringSlot(ProtoSlot& slot,
+                                      size_t slotIndex,
+                                      std::vector<Argument>& cpuArguments) {
+  if (cpuArguments[slotIndex].strs) {
+    cpuArguments[slotIndex].strs->resize(slot.sampleNum);
+  } else {
+    cpuArguments[slotIndex].strs =
+        std::make_shared<std::vector<std::string>>(slot.sampleNum);
+  }
+  for (size_t i = 0; i < slot.sampleNum; ++i) {
+    (*cpuArguments[slotIndex].strs)[i] =
+        slot.strData[slot.sampleSequenceIdVec[i]];
+  }
+}
+
+int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) {
+  PyGuard guard;
+  PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(),
+                                      const_cast<char*>("getNextBatch"),
+                                      const_cast<char*>("i"),
+                                      size));
+  CHECK_PY(obj) << "Call function getNextBatch failed.";
+  const std::string& samples =
+      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
+  resetSlots();
+  fillSlotsByStr(samples);
+  size = batchSize_;
+  if (size <= 0) return 0;
+
+  DataBatch& cpuBatch = *cpuBatch_;
+  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
+  cpuBatch.setSize(size);
+  cpuArguments.resize(slotNum_);
+
+  if (!iidData()) {
+    for (size_t j = 0; j < slotNum_; ++j) {
+      auto& slot = slots_[j];
+      ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions,
+                                    slot.sequenceNum + 1,
+                                    /* useGpu= */ false);
+      int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false);
+      std::copy(slot.sequenceStartPositions.begin(),
+                slot.sequenceStartPositions.end(),
+                buf);
+      buf[slot.sequenceStartPositions.size()] = slot.sampleNum;
+
+      if (slot.subSequenceStartPositions.size()) {
+        ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions,
+                                      slot.subSequenceNum + 1,
+                                      /*  useGpu= */ false);
+        int* buf =
+            cpuArguments[j].subSequenceStartPositions->getMutableData(false);
+        std::copy(slot.subSequenceStartPositions.begin(),
+                  slot.subSequenceStartPositions.end(),
+                  buf);
+        buf[slot.subSequenceNum] = slot.sampleNum;
+        // check subSequenceStartPositions and sequenceStartPositions
+        cpuArguments[j].checkSubset();
+      }
+    }
+  }
+
+  for (size_t slotIndex = 0; slotIndex < slotNum_; ++slotIndex) {
+    auto& slot = slots_[slotIndex];
+    SlotDef::SlotType slotType = slot.type;
+    switch (slotType) {
+      case SlotDef::VECTOR_DENSE:
+        handleDenseSlot(slot, slotIndex, cpuArguments);
+        break;
+      case SlotDef::VECTOR_SPARSE_NON_VALUE:
+        handleSparseNonValueSlot(slot, slotIndex, cpuArguments);
+        break;
+      case SlotDef::VECTOR_SPARSE_VALUE:
+        handleSparseValueSlot(slot, slotIndex, cpuArguments);
+        break;
+      case SlotDef::INDEX:
+        handleIndexSlot(slot, slotIndex, cpuArguments);
+        break;
+      case SlotDef::VAR_MDIM_DENSE:
+        LOG(FATAL) << "Not implemented";
+        break;
+      case SlotDef::VAR_MDIM_INDEX:
+        LOG(FATAL) << "Not implemented";
+        break;
+      case SlotDef::STRING:
+        handleStringSlot(slot, slotIndex, cpuArguments);
+        break;
+    }
+  }
+
+  if (useGpu_) {
+    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
+    DataBatch& gpuBatch = *gpuBatch_;
+    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
+    gpuArguments.resize(cpuArguments.size());
+    gpuBatch.setSize(size);
+    for (size_t i = 0; i < slotNum_; ++i) {
+      SlotDef::SlotType slotType = slots_[i].type;
+      if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
+          SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
+        gpuArguments[i] = cpuArguments[i];
+        gpuArguments[i].sequenceStartPositions =
+            cpuArguments[i].sequenceStartPositions;
+
+        if (slots_[i].subSequenceStartPositions.size()) {
+          gpuArguments[i].subSequenceStartPositions =
+              cpuArguments[i].subSequenceStartPositions;
+        }
+      } else {
+        gpuArguments[i].resizeAndCopyFrom(
+            cpuArguments[i], useGpu_, HPPL_STREAM_1);
+      }
+    }
+    hl_stream_synchronize(HPPL_STREAM_1);
+    *batch = gpuBatch;
+  } else {
+    *batch = cpuBatch;
+  }
+
+  return batch->getSize();
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider.h b/paddle/legacy/gserver/dataproviders/PyDataProvider.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b8bea04a1670c60d5a801ca950f59116ba50195
--- /dev/null
+++ b/paddle/legacy/gserver/dataproviders/PyDataProvider.h
@@ -0,0 +1,124 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <paddle/legacy/utils/PythonUtil.h>
+#include "DataFormat.pb.h"
+#include "DataProvider.h"
+
+#include <vector>
+
+namespace paddle {
+
+class PyDataProvider : public DataProvider {
+ public:
+  PyDataProvider(const DataConfig& config,
+                 bool useGpu,
+                 bool loadDataAll = true);
+
+  virtual void reset();
+
+  // Note this size includes the sequences which are skipped because they
+  // are longer than the batch size
+  virtual int64_t getSize() {
+    LOG(FATAL) << "Not implement yet";
+    return -1;
+  }
+  virtual void shuffle();
+
+  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
+
+ protected:
+  struct ProtoSlot;
+  // return false if each each sample is one sequence, i.e., independent
+  // of other samples.
+  inline bool iidData() const { return isIID_; }
+
+  void parseHeaderData(const std::string& headerData);
+  void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
+  void fillSparseNonValueSlot(ProtoSlot& slot,
+                              char*& data,
+                              const char* dataEnd);
+  void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
+  void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
+  void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd);
+  void fillSlotsByStr(const std::string& samples);
+  void handleDenseSlot(ProtoSlot& slot,
+                       size_t slotIndex,
+                       std::vector<Argument>& cpuArguments);
+  void handleSparseNonValueSlot(ProtoSlot& slot,
+                                size_t slotIndex,
+                                std::vector<Argument>& cpuArguments);
+  void handleSparseValueSlot(ProtoSlot& slot,
+                             size_t slotIndex,
+                             std::vector<Argument>& cpuArguments);
+  void handleIndexSlot(ProtoSlot& slot,
+                       size_t slotIndex,
+                       std::vector<Argument>& cpuArguments);
+  void handleStringSlot(ProtoSlot& slot,
+                        size_t slotIndex,
+                        std::vector<Argument>& cpuArguments);
+  void resetSlots();
+  void loadData(const std::vector<std::string>& fileList);
+
+ protected:
+  struct ProtoSlot {
+    SlotDef::SlotType type;
+    int dim;
+    unsigned int sampleNum;
+    unsigned int sequenceNum;
+    unsigned int subSequenceNum;
+    // Store the data of index type slot
+    std::vector<int> indexData;
+    // Store the data of dense type slot
+    std::vector<real> denseData;
+    // Store the data of sparseNonValue type slot
+    std::vector<sparse_non_value_t> sparseNonValueData;
+    // Store the data of sparseValue type slot
+    std::vector<sparse_float_value_t> sparseFloatValueData;
+    // Used to store the index of each sample in slot values
+    std::vector<int64_t> indices;
+    // The starting position of each sequence in samples
+    // The last element should be the number of samples
+    // If empty, each sample is one sequence.
+    std::vector<size_t> sequenceStartPositions;
+    // The index id of sequences in slot
+    std::vector<int64_t> sampleSequenceIdVec;
+    // The starting position of each subsequence in samples
+    // The last element should be the number of subsequence
+    // If empty, each sequence of sample has no subsequence.
+    std::vector<size_t> subSequenceStartPositions;
+    // Store the data of string type slot
+    std::vector<std::string> strData;
+  };
+  std::vector<ProtoSlot> slots_;
+
+  PyObjectPtr classInstance_;
+  unsigned int batchSize_;
+  unsigned int slotNum_;
+  // if use sequence, isIID_ equals false, otherwise it is true.
+  bool isIID_;
+  // The name of python module name
+  std::string pyModuleName_;
+  // The name of python class name
+  std::string pyClassName_;
+  // User args set in config
+  std::map<std::string, std::string> pyUserArgs_;
+
+  ThreadLocalD<DataBatch> cpuBatch_;
+  ThreadLocalD<DataBatch> gpuBatch_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp b/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8e931e40611e27caa43675c3567972384a4d9026
--- /dev/null
+++ b/paddle/legacy/gserver/dataproviders/PyDataProvider2.cpp
@@ -0,0 +1,1031 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_NO_PYTHON
+
+#include <Python.h>
+#include <numpy/numpyconfig.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <list>
+#include <unordered_set>
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <numpy/ndarrayobject.h>
+
+#include "DataProvider.h"
+
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+namespace unittest {
+
+static std::unique_ptr<std::function<void(size_t /*poolActualSize */)>>
+    OnPoolFilled;
+
+namespace pydp2 {
+
+void setOnPoolFilledHook(const std::function<void(size_t)>& callback) {
+  OnPoolFilled.reset(new std::function<void(size_t)>());
+  *OnPoolFilled = callback;
+}
+
+void clearOnPoolFilledHook() { OnPoolFilled.reset(); }
+
+}  // namespace pydp2
+}  // namespace unittest
+
+/**
+ * Slot type
+ */
+enum SlotType {
+  ST_DENSE = 0,
+  ST_NON_SPARSE_VALUE = 1,
+  ST_SPARSE_VALUE = 2,
+  ST_INDEX = 3
+};
+
+/**
+ * Sequence type
+ */
+enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ };
+
+/**
+ * Cache Type.
+ */
+enum CacheType {
+  NO_CACHE = 0,           // Each pass will load data from PyDataProvider2.
+  CACHE_PASS_IN_MEM = 1,  // First pass will load data from PyDataProvider2,
+                          // then cache all data in memory. Load data from
+                          // memory in rest passes.
+};
+
+struct SlotHeader {  // Slot Header will parse from python object's slots field.
+  size_t dim;
+  SlotType slotType;
+  SeqType seqType;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) {
+  os << "Dim = " << header.dim << " Type = " << header.slotType
+     << " SeqType = " << header.seqType;
+  return os;
+}
+
+/**
+ * FieldScanner Interface.
+ *
+ * It will read python object, and fill to argument's each slot.
+ * There are two steps, prepare and fill. Scanner will alloc memory during
+ * prepare step, fill data into argument during fill step.
+ */
+class IFieldScanner {
+ public:
+  DISABLE_COPY(IFieldScanner);
+  /**
+   * Ctor.
+   * @param headerPtr slot header that scanner belong to.
+   */
+  explicit IFieldScanner(SlotHeader* headerPtr) : headerPtr_(headerPtr) {}
+  virtual ~IFieldScanner() {}
+
+  /**
+   * Start prepare step.
+   */
+  virtual void startPrepare(Argument& argument) {}
+
+  /**
+   * Prepare step.
+   *
+   * @note the obj could be a timestep of sample or whole sample. It depends
+   * what scanner it is.
+   */
+  virtual void prepare(Argument& argument, PyObject* obj) {}
+
+  /**
+   * Finish Prepare step.
+   */
+  virtual void finishPrepare(Argument& argument) {}
+
+  /**
+   * Start fill step.
+   */
+  virtual void startFill(Argument& argument) {}
+
+  /**
+   * Fill step.
+   *
+   * @note the obj could be a timestep of sample or whole sample. It depends
+   * what scanner it is.
+   */
+  virtual void fill(Argument& argument, PyObject* obj) {}
+
+  /**
+   * Finish fill step.
+   */
+  virtual void finishFill(Argument& argument) {}
+
+  /**
+   * Factory method. Create a scanner by header. The final scanner may be
+   * combine many scanners.
+   *
+   * @note Fatal if header is not support.
+   */
+  static IFieldScanner* create(SlotHeader* header);
+
+ protected:
+  SlotHeader* headerPtr_;
+};
+
+/**
+ * Py Data Provider Cache Interface.
+ */
+class IPyDataProviderCache {
+ public:
+  virtual ~IPyDataProviderCache() {}
+
+  /**
+   * invoke when DataProvider::reset()
+   * @return true if read data from python.
+   */
+  virtual bool reset() = 0;
+
+  /**
+   * invoke when these data are used by DataProvider, and need to clear.
+   * @param [inout] data used data.
+   *
+   * @note The implemented class must clear these data array. Or if you want to
+   * delete the PyObjectPtr later, you should make sure the paddle process only
+   * have one active thread calling python code (use PyGuard otherwise).
+   */
+  virtual void drop(std::deque<PyObjectPtr>* data) = 0;
+
+  /**
+   * Return whole data in cache.
+   */
+  virtual std::deque<PyObjectPtr>* load() = 0;
+
+  /**
+   * Factory method. Convert CacheType to IPyDataProviderCache*
+   */
+  static IPyDataProviderCache* create(CacheType ct);
+};
+
+/**
+ * PyDataProvider2.
+ *
+ * For usage, please refer python module 'paddle.trainer.PyDataProvider2'
+ *
+ * Here, we start a thread to read data. It is totally asynchronous for reading
+ * data. And it support cache strategies.
+ */
+class PyDataProvider2 : public DataProvider {
+ public:
+  /**
+   * Ctor
+   */
+  PyDataProvider2(const DataConfig& config,
+                  const ModelConfig& modelConfig,
+                  bool useGpu)
+      : DataProvider(config, useGpu), callingContextCreated_(2) {
+    if (PyArray_API == NULL) import_array();
+    auto& args = config.load_data_args();
+    PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
+    if (!args.empty()) {
+      kwargs = callPythonFuncRetPyObj(
+          "paddle.trainer.PyDataProvider2", "deserialize_args", {args});
+    }
+
+    py::DictHelper kwargsDict(kwargs);
+    kwargsDict.setBool("is_train", !config.for_test());
+    std::vector<std::string> inputs;
+    inputs.reserve(modelConfig.input_layer_names().size());
+    std::copy(modelConfig.input_layer_names().begin(),
+              modelConfig.input_layer_names().end(),
+              std::back_inserter(inputs));
+    kwargsDict.setStringList("input_order", inputs);
+
+    // kwargs is keyword arguemts to create object.
+    this->createPyDataObj(config.load_data_module(),
+                          config.load_data_object(),
+                          config.files(),
+                          std::move(kwargs));
+    DBG << "Instance " << instance_.get() << " loaded.";
+    this->readPyFields(config.for_test());
+    DBG << "Py Field Done";
+  }
+
+  /**
+   * Dtor
+   * @note will stop loading thread when destructing
+   */
+  virtual ~PyDataProvider2() { resetImpl(false); }
+
+ private:
+  void createPyDataObj(const std::string& model,
+                       const std::string& className,
+                       const std::string& fileListName,
+                       PyObjectPtr&& kwargs  // NOLINT
+                       ) {
+    LOG(INFO) << "loading dataprovider " << model << "::" << className;
+
+    PyObjectPtr module = py::import(model);
+    PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
+    CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
+    PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str()));
+    CHECK_PY(cls) << "load class " << className.c_str() << "error";
+
+    // If there are multiple python instance share same module, the PyObjectPtr
+    // only for instance will make python reference-count error.
+    //
+    // So here, we increase reference count manually.
+    Py_XINCREF(module.get());
+    Py_XINCREF(moduleDict.get());
+    Py_XINCREF(cls.get());
+
+    PyObjectPtr fileListInPy = loadPyFileLists(fileListName);
+    PyDict_SetItemString(kwargs.get(), "file_list", fileListInPy.get());
+    {
+      PyGuard guard;
+      instance_.reset(PyObject_Call(cls.get(), zeroTuple_.get(), kwargs.get()));
+    }
+    CHECK_PY(instance_) << "Cannot Create instance";
+  }
+
+  void readPyFields(bool testing) {
+    py::ObjectHelper self(this->instance_);
+    bool ok;
+
+    this->skipShuffle_ =
+        !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/);
+    if (!ok) {
+      this->skipShuffle_ = testing;  // shuffle when is training, skip shuffle
+                                     // when is testing.
+    }
+    DBG << "Provider Skip Shuffle " << this->skipShuffle_;
+
+    this->poolSize_ = self.getIntAttr<size_t>("pool_size", &ok);
+    if (!ok) {
+      this->poolSize_ = -1UL;
+    }
+    this->minPoolSize_ = self.getIntAttr<size_t>("min_pool_size", &ok);
+    if (!ok) {
+      this->minPoolSize_ = -1UL;
+    }
+    this->minPoolSize_ = std::min(this->poolSize_, this->minPoolSize_);
+
+    this->canOverBatchSize_ = self.getBoolAttr("can_over_batch_size");
+
+    calcBatchSize_.reset(self.getAttr("calc_batch_size"));
+    if (this->calcBatchSize_ && !py::isCallable(this->calcBatchSize_)) {
+      this->calcBatchSize_.reset();
+    }
+
+    generator_.reset(self.getAttr("generator"));
+    CHECK(py::isCallable(generator_));
+
+    // Reading slots.
+    PyObjectPtr slotsPtr(self.getAttr("slots"));
+    py::SequenceHelper slots(slotsPtr);
+    headers_.reserve(slots.size());
+    for (size_t i = 0; i < slots.size(); ++i) {
+      headers_.emplace_back();
+      auto& header = headers_.back();
+      PyObject* hdPtr = slots[i];
+      CHECK(hdPtr != nullptr);
+      Py_XINCREF(hdPtr);
+      PyObjectPtr headerPtrWrap(hdPtr);
+      py::ObjectHelper hd(headerPtrWrap);
+      header.dim = hd.getIntAttrWithError<size_t>("dim");
+      header.seqType = (SeqType)hd.getIntAttrWithError<int>("seq_type");
+      header.slotType = (SlotType)hd.getIntAttrWithError<int>("type");
+    }
+
+    DBG << "Data header size " << headers_.size();
+    for (auto& header : headers_) {
+      DBG << header;
+    }
+    cache_.reset(IPyDataProviderCache::create(
+        (CacheType)self.getIntAttrWithError<int>("cache")));
+  }
+
+  PyObjectPtr loadPyFileLists(const std::string& fileListName) {
+    loadFileList(fileListName, fileLists_);
+    PyObject* lst = PyList_New(fileLists_.size());
+    for (size_t i = 0; i < fileLists_.size(); ++i) {
+      PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str()));
+    }
+    return PyObjectPtr(lst);
+  }
+
+  void loadThread() {
+    DBG << "Creating context";
+    for (auto& filename : fileLists_) {
+      PyGuard g;
+      py::CallableHelper generator(this->generator_);
+      generator.setArgsSize(2);
+      generator.getArgs().set(0, instance_);
+      generator.getArgs().set(1, PyString_FromString(filename.c_str()), true);
+      callingContexts_.emplace_back(generator());
+      CHECK_PY(callingContexts_.back()) << "Generator error.";
+      CHECK(PyIter_Check(callingContexts_.back()));
+    }
+    DBG << "Create context done";
+    callingContextCreated_.wait();
+
+    PositionRandom p(skipShuffle_);
+
+    while (!exit_ && !callingContexts_.empty()) {
+      PyObject* data = nullptr;
+
+      {  // Read data.
+        size_t cid = p(callingContexts_.size());
+        bool atEnd;
+        data = py::iterNext(callingContexts_[cid], &atEnd);
+        if (atEnd || data == nullptr) {
+          if (cid != 0) {
+            std::swap(callingContexts_[cid], callingContexts_[0]);
+            cid = 0;
+          }
+
+          PyObjectPtr front;
+          {
+            std::unique_lock<std::mutex> l(mtx_);
+            front = pop_get_front(callingContexts_);
+          }
+          {
+            PyGuard g;
+            front.reset();
+          }
+          this->pullCV_.notify_all();
+          continue;
+        }
+      }
+
+      size_t additionalBatchSize = 1;
+      if (calcBatchSize_) {
+        PyGuard guard;
+        py::CallableHelper calcBatchSize(this->calcBatchSize_);
+        calcBatchSize.setArgsSize(1);
+        calcBatchSize.getArgs().set(0, data);
+        PyObjectPtr bs(calcBatchSize());
+        CHECK_PY(bs);
+        bool ok;
+        additionalBatchSize = py::castInt<size_t>(bs.get(), &ok);
+        CHECK(ok) << "CalcBatchSize must return int or long";
+      }
+
+      if (this->loadThread_) {  // wait poolActualSize < poolSize;
+        std::unique_lock<std::mutex> l(mtx_);
+        pushCV_.wait(l, [this] { return this->poolActualSize_ < poolSize_; });
+      }
+
+      {
+        std::lock_guard<std::mutex> guard(mtx_);
+        poolActualSize_ += additionalBatchSize;
+        dataPool_.emplace_back(data);
+      }
+      pullCV_.notify_all();
+    }
+    DBG << "load thread end";
+  }
+
+  inline void resetImpl(bool startNewThread) {
+    DBG << "Reseting " << startNewThread;
+    exit_.store(true);
+    if (loadThread_) {  // is loading.
+      loadThread_->join();
+      loadThread_.reset();
+    }
+    {
+      PyGuard g;
+      callingContexts_.clear();
+      this->pullCV_.notify_one();
+    }
+
+    std::lock_guard<std::mutex> guard(mutexForReset_);
+    {
+      PyGuard g;
+      dataPool_.clear();
+    }
+    poolActualSize_ = 0;
+
+    if (startNewThread && cache_->reset()) {
+      DBG << "Start new thread.";
+      loadThread_.reset(new std::thread([this] {
+        exit_ = false;
+        loadThread();
+      }));
+      callingContextCreated_.wait();
+    }
+    DBG << "Reset done";
+    exit_ = false;
+  }
+
+ private:
+  std::unique_ptr<std::thread> loadThread_;
+  std::atomic<bool> exit_;
+  std::deque<PyObjectPtr> callingContexts_;
+  std::deque<PyObjectPtr> dataPool_;
+  size_t poolActualSize_;
+  std::condition_variable pushCV_;
+  std::condition_variable pullCV_;
+  std::mutex mtx_;
+
+  std::mutex mutexForReset_;
+
+  ThreadBarrier callingContextCreated_;
+  std::unique_ptr<IPyDataProviderCache> cache_;
+
+  PyObjectPtr instance_;
+  size_t poolSize_;
+  size_t minPoolSize_;
+  bool canOverBatchSize_;
+  PyObjectPtr calcBatchSize_;
+  PyObjectPtr generator_;
+  std::vector<std::string> fileLists_;
+  std::vector<SlotHeader> headers_;
+  static PyObjectPtr zeroTuple_;
+
+  class PositionRandom {
+   public:
+    inline explicit PositionRandom(bool skipRand)
+        : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {}
+
+    inline size_t operator()(size_t len) {
+      if (!skipRand_) {
+        if (!dist_ || dist_->b() != len - 1) {
+          dist_.reset(new std::uniform_int_distribution<size_t>(0, len - 1));
+        }
+        return (*dist_)(eng_);
+      } else {
+        return 0;
+      }
+    }
+
+   private:
+    std::default_random_engine& eng_;
+    std::unique_ptr<std::uniform_int_distribution<size_t>> dist_;
+    bool skipRand_;
+  };
+
+  // DataProvider interface
+ public:
+  /**
+   * Resetting the PyDataProvider. May start reading thread here.
+   */
+  virtual void reset() {
+    resetImpl(true);
+    DataProvider::reset();
+  }
+
+  /**
+   * Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random
+   * select data from datapool.
+   */
+  void shuffle() {}
+
+  /**
+   * Not limited size.
+   */
+  int64_t getSize() { return -1; }
+
+  /**
+   * Loading a batch of data.
+   */
+  int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) {
+    std::lock_guard<std::mutex> guard(mutexForReset_);
+    REGISTER_TIMER("PyDP2.getNextBatchInternal")
+    CHECK_GE(size_, 0);
+    size_t size = (size_t)size_;
+    if (loadThread_) {  // loading from thread should wait for data pool ready.
+                        // but, loading from cache, cache object should ensure
+                        // data pool ready.
+      std::unique_lock<std::mutex> l(mtx_);
+      pullCV_.wait(l, [this, &size] {
+        return this->poolActualSize_ >= std::max(size, this->minPoolSize_) ||
+               callingContexts_.empty();
+      });
+
+      if (unittest::OnPoolFilled) {
+        (*unittest::OnPoolFilled)(this->poolActualSize_);
+      }
+    }
+    std::deque<PyObjectPtr> data;
+    size_t bsize = 0;
+    std::deque<PyObjectPtr>* poolPtr = nullptr;
+
+    if (this->loadThread_) {  // loading from thread.
+      poolPtr = &this->dataPool_;
+    } else {  // loading from cache.
+      poolPtr = this->cache_->load();
+    }
+    if (exit_) {
+      // PyDataProvider is destructing.
+      return 0;
+    }
+    CHECK(poolPtr != nullptr);
+
+    std::deque<PyObjectPtr>& pool = *poolPtr;
+
+    while (bsize < size && !pool.empty()) {
+      {
+        // move data from pool to data
+        std::lock_guard<std::mutex> guard(mtx_);
+        if (skipShuffle_) {
+          size_t i = 0;
+          CHECK(pool[i] != nullptr);
+          data.emplace_back(std::move(pool[i]));
+          pool.pop_front();
+        } else {  // when shuffle, use swap to drop only last pool element.
+          size_t i = ThreadLocalRand::rand() % pool.size();
+          CHECK(pool[i] != nullptr);
+          if (i != 0) {
+            std::swap(pool[i], pool.front());
+          }
+          data.emplace_back(std::move(pool.front()));
+          pool.pop_front();
+        }
+
+        if (calcBatchSize_) {  // custom calc batch size.
+          PyGuard guard;
+          Py_INCREF(data.back().get());
+          py::CallableHelper calcBatchSize(calcBatchSize_);
+          calcBatchSize.setArgsSize(1);
+          calcBatchSize.getArgs().set(0, data.back());
+          PyObjectPtr customBatchSize(calcBatchSize());
+          bool ok;
+          size_t tmp = py::castInt<size_t>(customBatchSize.get(), &ok);
+          CHECK(ok) << "calc_batch_size must return int";
+
+          if (bsize + tmp > size && !canOverBatchSize_) {
+            // Put data back.
+            pool.push_front(std::move(data.back()));
+            data.pop_back();
+            break;
+          } else {
+            bsize += tmp;
+          }
+        } else {
+          bsize += 1;
+        }
+      }
+    }
+
+    if (this->loadThread_) {
+      {
+        std::lock_guard<std::mutex> g(mtx_);
+        poolActualSize_ -= bsize;
+      }
+      this->pushCV_.notify_all();
+    }
+
+    if (bsize == 0) {  // end of pass. In data pool, cannot get any data.
+      return 0;
+    }
+
+    DataBatch cpuBatch;
+    cpuBatch.setSize(bsize);
+    auto& inArgs = cpuBatch.getStreams();
+    inArgs.resize(headers_.size());
+    std::vector<std::unique_ptr<IFieldScanner>> scanners;
+    scanners.reserve(headers_.size());
+    for (auto& header : headers_) {
+      scanners.emplace_back(IFieldScanner::create(&header));
+    }
+    DBG << "Scanner created.";
+    for (size_t i = 0; i < headers_.size(); ++i) {
+      scanners[i]->startPrepare(inArgs[i]);
+    }
+    for (auto& d : data) {
+      py::SequenceHelper s(d);
+      for (size_t i = 0; i < headers_.size(); ++i) {
+        scanners[i]->prepare(inArgs[i], s[i]);
+      }
+    }
+    for (size_t i = 0; i < headers_.size(); ++i) {
+      scanners[i]->finishPrepare(inArgs[i]);
+    }
+    for (size_t i = 0; i < headers_.size(); ++i) {
+      scanners[i]->startFill(inArgs[i]);
+    }
+    for (auto& d : data) {
+      py::SequenceHelper s(d);
+      for (size_t i = 0; i < headers_.size(); ++i) {
+        scanners[i]->fill(inArgs[i], s[i]);
+      }
+    }
+
+    for (size_t i = 0; i < headers_.size(); ++i) {
+      scanners[i]->finishFill(inArgs[i]);
+    }
+
+    {
+      PyGuard g;
+      cache_->drop(&data);
+    }
+
+    DBG << "Reading CPU Batch Done.";
+
+    if (useGpu_) {
+      std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
+      DataBatch& gpuBatch = *batch;
+      std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
+      gpuArguments.resize(cpuArguments.size());
+      gpuBatch.setSize(bsize);
+      for (size_t i = 0; i < headers_.size(); ++i) {
+        gpuArguments[i].resizeAndCopyFrom(
+            cpuArguments[i], useGpu_, HPPL_STREAM_1);
+      }
+      hl_stream_synchronize(HPPL_STREAM_1);
+    } else {
+      *batch = cpuBatch;
+    }
+    return bsize;
+  }
+};
+
+PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0));
+
+REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2);
+
+/**
+ * Scanner for dense slot.
+ */
+class DenseScanner : public IFieldScanner {
+ public:
+  explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {}
+
+  /**
+   * Prepare.
+   * @param argument target argument
+   * @param obj each timestep of a sample.
+   */
+  virtual void prepare(Argument& argument, PyObject* obj) { ++height_; }
+
+  virtual void finishPrepare(Argument& argument) {
+    Matrix::resizeOrCreate(
+        argument.value, height_, headerPtr_->dim, false, false);
+    height_ = 0;
+  }
+
+  /**
+   * Fill argument from obj.
+   * @param argument
+   * @param obj
+   */
+  virtual void fill(Argument& argument, PyObject* obj) {
+    real* dat = argument.value->getData() + height_ * headerPtr_->dim;
+    if (PyArray_Check(obj)) {
+      auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
+      if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
+        real* data = (real*)PyArray_DATA((PyArrayObject*)obj);
+        auto sz = PyArray_SIZE((PyArrayObject*)obj);
+        std::copy(data, data + sz, dat);
+      } else {
+        LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array";
+      }
+    } else {
+      py::SequenceHelper s(obj);
+      // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+      for (size_t i = 0; i < headerPtr_->dim; ++i) {
+        dat[i] = (real)s.getDouble(i);
+      }
+    }
+    ++height_;
+  }
+
+ private:
+  size_t height_;
+};
+
+/**
+ * Scanner for index slot
+ */
+class IndexScanner : public IFieldScanner {
+ public:
+  explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {}
+
+  /**
+   * Prepare memory space.
+   *
+   * @note obj is a single timestep of sample
+   */
+  virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; }
+
+  virtual void finishPrepare(Argument& argument) {
+    IVector::resizeOrCreate(argument.ids, cnt_, false);
+    cnt_ = 0;
+  }
+
+  /**
+   * Fill one index to argument.
+   */
+  virtual void fill(Argument& argument, PyObject* obj) {
+    bool ok;
+    argument.ids->getData()[cnt_++] = py::castInt<int>(obj, &ok);
+    CHECK(ok) << "Cannot cast int " << py::repr(obj);
+  }
+
+ private:
+  size_t cnt_;
+};
+
+class SparseNonValueScanner : public IFieldScanner {
+ public:
+  explicit SparseNonValueScanner(SlotHeader* ptr)
+      : IFieldScanner(ptr), nnz_(0), height_(0) {}
+
+  /**
+   * Prepare memory space
+   * @note obj is a timestep of one sample.
+   */
+  virtual void prepare(Argument& argument, PyObject* obj) {
+    ++height_;
+    nnz_ += py::SequenceHelper(obj).size();
+  }
+
+  virtual void finishPrepare(Argument& argument) {
+    Matrix::resizeOrCreateSparseMatrix(
+        argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE);
+  }
+
+  virtual void startFill(Argument& argument) {
+    auto smat = (CpuSparseMatrix*)(argument.value.get());
+    smat->getRows()[0] = 0;
+    nnz_ = 0;
+    height_ = 1;
+  }
+
+  /**
+   * Fill one sparse vector to argument.
+   * @note obj is a timestep of one sample.
+   */
+  virtual void fill(Argument& argument, PyObject* obj) {
+    py::SequenceHelper s(obj);
+    auto sz = s.size();
+    auto smat = (CpuSparseMatrix*)(argument.value.get());
+    int* row = smat->getRows();
+    int* col = smat->getCols();
+    real* dat = smat->getData();
+    row[height_] = row[height_ - 1] + (int)sz;
+
+    for (decltype(sz) i = 0; i < sz; ++i) {
+      setData(col + nnz_, dat + nnz_, s[i]);
+      ++nnz_;
+    }
+    ++height_;
+  }
+
+ protected:
+  /**
+   * Set a single sparse index and value.
+   * @param [out] col sparse index
+   * @param [out] dat sparse value
+   * @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong.
+   *                 For sparse_value is a Tuple (int, float).
+   */
+  virtual void setData(int* col, real* dat, PyObject* obj) {
+    bool ok;
+    *col = py::castInt<int>(obj, &ok);
+    CHECK(ok);
+  }
+
+  size_t nnz_;
+  size_t height_;
+};
+
+class SparseValueScanner : public SparseNonValueScanner {
+ public:
+  explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {}
+
+  virtual void finishPrepare(Argument& argument) {
+    Matrix::resizeOrCreateSparseMatrix(
+        argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE);
+  }
+
+ protected:
+  virtual void setData(int* col, real* dat, PyObject* obj) {
+    py::SequenceHelper s(obj);
+    SparseNonValueScanner::setData(col, dat, s[0]);
+    *dat = (real)s.getDouble(1);
+  }
+};
+
+/**
+ * Sequence Scanner. Scanner for sequence or sub-sequence.
+ */
+class SequenceScanner : public IFieldScanner {
+ public:
+  /**
+   * Ctor
+   * @param innerScanner inner scanner for each timestep or sub-sequence.
+   * @param getSeqStartPos A callback, (Argument) => ICpuGpuVectorPtr.
+   *                       return a sequence start position or a sub-sequence
+   *                       start position.
+   */
+  SequenceScanner(
+      std::unique_ptr<IFieldScanner>&& innerScanner,
+      const std::function<ICpuGpuVectorPtr&(Argument&)>& getSeqStartPos)
+      : IFieldScanner(nullptr),
+        inner_(std::move(innerScanner)),
+        cnt_(0),
+        getSeqStartPos_(getSeqStartPos) {}
+
+  /**
+   * Start prepare. Invoke inner->startPrepare too.
+   */
+  virtual void startPrepare(Argument& argument) {
+    inner_->startPrepare(argument);
+  }
+
+  /**
+   * Prepare. obj is a list or tuple. it will invoke inner_->prepare for each
+   * element of sequence obj.
+   */
+  virtual void prepare(Argument& argument, PyObject* obj) {
+    py::SequenceHelper s(obj);
+    ++cnt_;
+    for (size_t i = 0; i < s.size(); ++i) {
+      inner_->prepare(argument, s[i]);
+    }
+  }
+
+  /**
+   * Finish prepare. invoke inner_->finishPrepare too.
+   */
+  virtual void finishPrepare(Argument& argument) {
+    ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false);
+    inner_->finishPrepare(argument);
+  }
+
+  /**
+   * Start fill. invoke inner->startFill too.
+   */
+  virtual void startFill(Argument& argument) {
+    getSeqStartPos_(argument)->getMutableData(false)[0] = 0;
+    cnt_ = 1;
+    inner_->startFill(argument);
+  }
+
+  /**
+   * Fill. Obj is a tuple or list. invoke inner->fill for each element of
+   * sequence obj. And set seqStartPos at same time. The seqStartPos will be
+   * calculated by getSeqStartPos callback passed in ctor.
+   */
+  virtual void fill(Argument& argument, PyObject* obj) {
+    getSeqStartPos_(argument)->getMutableData(false)[cnt_] =
+        getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] +
+        (int)getSize(obj);
+    py::SequenceHelper s(obj);
+    ++cnt_;
+    for (size_t i = 0; i < s.size(); ++i) {
+      inner_->fill(argument, s[i]);
+    }
+  }
+
+  /**
+   * Finish fill. will invoke inner->finishFill too.
+   */
+  virtual void finishFill(Argument& argument) { inner_->finishFill(argument); }
+
+ protected:
+  size_t getSize(PyObject* obj) {
+    py::SequenceHelper s(obj);
+    auto sc = dynamic_cast<SequenceScanner*>(inner_.get());
+    if (sc) {
+      size_t sum = 0;
+      for (size_t i = 0; i < s.size(); ++i) {
+        sum += sc->getSize(s[i]);
+      }
+      return sum;
+    } else {
+      return s.size();
+    }
+  }
+
+ private:
+  std::unique_ptr<IFieldScanner> inner_;
+  size_t cnt_;
+  std::function<ICpuGpuVectorPtr&(Argument&)> getSeqStartPos_;
+};
+
+IFieldScanner* IFieldScanner::create(SlotHeader* header) {
+  IFieldScanner* retv = nullptr;
+  switch (header->slotType) {
+    case ST_DENSE:
+      retv = new DenseScanner(header);
+      break;
+    case ST_INDEX:
+      retv = new IndexScanner(header);
+      break;
+    case ST_NON_SPARSE_VALUE:
+      retv = new SparseNonValueScanner(header);
+      break;
+    case ST_SPARSE_VALUE:
+      retv = new SparseValueScanner(header);
+      break;
+    default:
+      LOG(FATAL) << "Not implemented " << header->slotType;
+  }
+
+  switch (header->seqType) {
+    case SQT_NONE:
+      break;
+    case SQT_SUBSEQ:
+      retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
+                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
+                                   return arg.subSequenceStartPositions;
+                                 });
+    // fall through, not break;
+    case SQT_SEQ:
+      retv = new SequenceScanner(std::unique_ptr<IFieldScanner>(retv),
+                                 [](Argument& arg) -> ICpuGpuVectorPtr& {
+                                   return arg.sequenceStartPositions;
+                                 });
+      break;
+    default:
+      LOG(FATAL) << "Not implemented";
+  }
+
+  return retv;
+}
+
+/**
+ * No Cache Strategy. Will destruct old data immediately and load data from
+ * python every pass.
+ */
+class NoCacheStrategy : public IPyDataProviderCache {
+ public:
+  virtual bool reset() { return true; }
+
+  virtual void drop(std::deque<PyObjectPtr>* data) { data->clear(); }
+
+  virtual std::deque<PyObjectPtr>* load() { return nullptr; }
+};
+
+/**
+ * Cache One Pass In Memory strategy.
+ *
+ * In first pass, will load data from python and store them in memory.
+ * The rest passes, will load data from memory.
+ */
+class CacheOnePassInMemory : public IPyDataProviderCache {
+ public:
+  CacheOnePassInMemory()
+      : objPool_(new std::deque<PyObjectPtr>()),
+        droppedPool_(new std::deque<PyObjectPtr>()) {}
+
+  virtual bool reset() {
+    if (objPool_->empty() && droppedPool_->empty()) {
+      return true;
+    } else if (objPool_->empty()) {
+      std::swap(objPool_, droppedPool_);
+      return false;
+    } else {
+      LOG(FATAL) << "Unexpected branch";
+    }
+  }
+
+  virtual void drop(std::deque<PyObjectPtr>* data) {
+    size_t orgSize = droppedPool_->size();
+    droppedPool_->resize(orgSize + data->size());
+    for (size_t i = 0; i < data->size(); ++i) {
+      std::swap((*droppedPool_)[orgSize + i], (*data)[i]);
+    }
+    data->clear();
+  }
+
+  virtual std::deque<PyObjectPtr>* load() { return objPool_.get(); }
+
+ private:
+  std::unique_ptr<std::deque<PyObjectPtr>> objPool_;
+  std::unique_ptr<std::deque<PyObjectPtr>> droppedPool_;
+};
+
+IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) {
+  switch (ct) {
+    case NO_CACHE:
+      return new NoCacheStrategy();
+    case CACHE_PASS_IN_MEM:
+      return new CacheOnePassInMemory();
+    default:
+      LOG(FATAL) << "Not implemented";
+  }
+}
+}  // namespace paddle
+
+#endif
diff --git a/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c145adda5e04fb4a35df480fd3d0cf93ad453e0d
--- /dev/null
+++ b/paddle/legacy/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -0,0 +1,320 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Evaluator.h"
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/legacy/utils/StringUtil.h"
+
+namespace paddle {
+
+/**
+ * calculate sequence-to-sequence edit distance
+ */
+class CTCErrorEvaluator : public Evaluator {
+ private:
+  MatrixPtr outActivations_;
+  int numTimes_, numClasses_, numSequences_, blank_;
+  real deletions_, insertions_, substitutions_;
+  int seqClassficationError_;
+  mutable std::unordered_map<std::string, real> evalResults_;
+
+  std::vector<int> path2String(const std::vector<int>& path) {
+    std::vector<int> str;
+    str.clear();
+    int prevLabel = -1;
+    for (std::vector<int>::const_iterator label = path.begin();
+         label != path.end();
+         label++) {
+      if (*label != blank_ &&
+          (str.empty() || *label != str.back() || prevLabel == blank_)) {
+        str.push_back(*label);
+      }
+      prevLabel = *label;
+    }
+    return str;
+  }
+
+  std::vector<int> bestLabelSeq() {
+    std::vector<int> path;
+    path.clear();
+    real* acts = outActivations_->getData();
+    for (int i = 0; i < numTimes_; ++i) {
+      path.push_back(std::max_element(acts + i * numClasses_,
+                                      acts + (i + 1) * numClasses_) -
+                     (acts + i * numClasses_));
+    }
+    return path2String(path);
+  }
+
+  /* "sp, dp, ip" is the weighting parameter of "substitution, deletion,
+   * insertion"
+   * in edit-distance error */
+  real stringAlignment(std::vector<int>& gtStr,
+                       std::vector<int>& recogStr,
+                       bool backtrace = true,
+                       real sp = 1.0,
+                       real dp = 1.0,
+                       real ip = 1.0) {
+    std::vector<std::vector<int>> matrix;
+    int substitutions, deletions, insertions;
+    real distance;
+    int n = gtStr.size();
+    int m = recogStr.size();
+
+    if (n == 0) {
+      substitutions = 0;
+      deletions = 0;
+      insertions = m;
+      distance = m;
+    } else if (m == 0) {
+      substitutions = 0;
+      deletions = n;
+      insertions = 0;
+      distance = n;
+    } else {
+      substitutions = 0;
+      deletions = 0;
+      insertions = 0;
+      distance = 0;
+      // initialize the matrix
+      matrix.resize(n + 1);
+      for (int i = 0; i < n + 1; ++i) {
+        matrix[i].resize(m + 1);
+        for (int j = 0; j < m + 1; ++j) {
+          matrix[i][j] = 0;
+        }
+      }
+      for (int i = 0; i < n + 1; ++i) {
+        matrix[i][0] = i;
+      }
+      for (int j = 0; j < m + 1; ++j) {
+        matrix[0][j] = j;
+      }
+
+      // calculate the insertions, substitutions and deletions
+      for (int i = 1; i < n + 1; ++i) {
+        int s_i = gtStr[i - 1];
+        for (int j = 1; j < m + 1; ++j) {
+          int t_j = recogStr[j - 1];
+          int cost = (s_i == t_j) ? 0 : 1;
+          const int above = matrix[i - 1][j];
+          const int left = matrix[i][j - 1];
+          const int diag = matrix[i - 1][j - 1];
+          const int cell = std::min(above + 1, std::min(left + 1, diag + cost));
+          matrix[i][j] = cell;
+        }
+      }
+
+      if (backtrace) {
+        size_t i = n;
+        size_t j = m;
+        substitutions = 0;
+        deletions = 0;
+        insertions = 0;
+
+        while (i != 0 && j != 0) {
+          if (matrix[i][j] == matrix[i - 1][j - 1]) {
+            --i;
+            --j;
+          } else if (matrix[i][j] == matrix[i - 1][j - 1] + 1) {
+            ++substitutions;
+            --i;
+            --j;
+          } else if (matrix[i][j] == matrix[i - 1][j] + 1) {
+            ++deletions;
+            --i;
+          } else {
+            ++insertions;
+            --j;
+          }
+        }
+        while (i != 0) {
+          ++deletions;
+          --i;
+        }
+        while (j != 0) {
+          ++insertions;
+          --j;
+        }
+        int diff = substitutions + deletions + insertions;
+        if (diff != matrix[n][m]) {
+          LOG(ERROR) << "Found path with distance " << diff
+                     << " but Levenshtein distance is " << matrix[n][m];
+        }
+
+        distance = (sp * substitutions) + (dp * deletions) + (ip * insertions);
+      } else {
+        distance = (real)matrix[n][m];
+      }
+    }
+    real maxLen = std::max(m, n);
+    deletions_ += deletions / maxLen;
+    insertions_ += insertions / maxLen;
+    substitutions_ += substitutions / maxLen;
+
+    if (distance != 0) {
+      seqClassficationError_ += 1;
+    }
+
+    return distance / maxLen;
+  }
+
+  real editDistance(
+      real* output, int numTimes, int numClasses, int* labels, int labelsLen) {
+    numTimes_ = numTimes;
+    numClasses_ = numClasses;
+    blank_ = numClasses_ - 1;
+    outActivations_ = Matrix::create(output, numTimes, numClasses);
+    std::vector<int> recogStr, gtStr;
+    recogStr = bestLabelSeq();
+    for (int i = 0; i < labelsLen; ++i) {
+      gtStr.push_back(labels[i]);
+    }
+
+    return stringAlignment(gtStr, recogStr);
+  }
+
+  void storeLocalValues() const {
+    evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0;
+    evalResults_["deletion_error"] =
+        numSequences_ ? deletions_ / numSequences_ : 0;
+    evalResults_["insertion_error"] =
+        numSequences_ ? insertions_ / numSequences_ : 0;
+    evalResults_["substitution_error"] =
+        numSequences_ ? substitutions_ / numSequences_ : 0;
+    evalResults_["sequence_error"] =
+        (real)seqClassficationError_ / numSequences_;
+  }
+
+ public:
+  CTCErrorEvaluator()
+      : numTimes_(0),
+        numClasses_(0),
+        numSequences_(0),
+        blank_(0),
+        deletions_(0),
+        insertions_(0),
+        substitutions_(0),
+        seqClassficationError_(0) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    CHECK_EQ(arguments.size(), (size_t)2);
+    Argument output, label;
+    output.resizeAndCopyFrom(arguments[0], false, HPPL_STREAM_DEFAULT);
+    label.resizeAndCopyFrom(arguments[1], false, HPPL_STREAM_DEFAULT);
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+    CHECK(label.sequenceStartPositions);
+    CHECK(label.ids);
+    size_t numSequences = label.sequenceStartPositions->getSize() - 1;
+    const int* labelStarts = label.sequenceStartPositions->getData(false);
+    const int* outputStarts = output.sequenceStartPositions->getData(false);
+    real totalErr = 0;
+    for (size_t i = 0; i < numSequences; ++i) {
+      real err = 0;
+      err = editDistance(
+          output.value->getData() + output.value->getWidth() * outputStarts[i],
+          outputStarts[i + 1] - outputStarts[i],
+          output.value->getWidth(),
+          label.ids->getData() + labelStarts[i],
+          labelStarts[i + 1] - labelStarts[i]);
+
+      totalErr += err;
+    }
+
+    return totalErr;
+  }
+
+  virtual void eval(const NeuralNetwork& nn) {
+    Evaluator::eval(nn);
+    std::vector<Argument> arguments;
+    arguments.reserve(config_.input_layers_size());
+    for (const std::string& name : config_.input_layers()) {
+      arguments.push_back(nn.getLayer(name)->getOutput());
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
+    numSequences_ += arguments[1].getNumSequences();
+  }
+
+  virtual void start() {
+    Evaluator::start();
+    numSequences_ = 0;
+    blank_ = 0;
+    deletions_ = 0;
+    insertions_ = 0;
+    substitutions_ = 0;
+    seqClassficationError_ = 0;
+  }
+
+  virtual void printStats(std::ostream& os) const {
+    storeLocalValues();
+    os << config_.name() << " error = " << evalResults_["error"];
+    os << " deletions error = " << evalResults_["deletion_error"];
+    os << " insertions error = " << evalResults_["insertion_error"];
+    os << " substitution error = " << evalResults_["substitution_error"];
+    os << " sequence error = " << evalResults_["sequence_error"];
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    double buf[6] = {totalScore_,
+                     (double)deletions_,
+                     (double)insertions_,
+                     (double)substitutions_,
+                     (double)seqClassficationError_,
+                     (double)numSequences_};
+    client->reduce(buf, buf, 6, FLAGS_trainer_id, 0);
+    totalScore_ = buf[0];
+    deletions_ = (real)buf[1];
+    insertions_ = (real)buf[2];
+    substitutions_ = (real)buf[3];
+    seqClassficationError_ = (int)buf[4];
+    numSequences_ = (int)buf[5];
+  }
+
+  void getNames(std::vector<std::string>* names) {
+    storeLocalValues();
+    names->reserve(names->size() + evalResults_.size());
+    for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) {
+      names->push_back(config_.name() + "." + it->first);
+    }
+  }
+
+  real getValue(const std::string& name, Error* err) const {
+    storeLocalValues();
+
+    std::vector<std::string> buffers;
+    paddle::str::split(name, '.', &buffers);
+    auto it = evalResults_.find(buffers[buffers.size() - 1]);
+
+    if (it == evalResults_.end()) {
+      *err = Error("Evaluator does not have the key %s", name.c_str());
+      return 0.0f;
+    }
+
+    return it->second;
+  }
+
+  std::string getType(const std::string& name, Error* err) const {
+    this->getValue(name, err);
+    if (!err->isOK()) {
+      return "";
+    }
+    return "ctc_edit_distance";
+  }
+};
+
+REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp b/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ff3f2fa8cf06c13ef327aa7ae2511bfc0d028be
--- /dev/null
+++ b/paddle/legacy/gserver/evaluators/ChunkEvaluator.cpp
@@ -0,0 +1,296 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <set>
+#include <vector>
+
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/utils/StringUtil.h"
+
+#include "Evaluator.h"
+
+namespace paddle {
+
+/**
+ * Chunk evaluator is used to evaluate segment labelling accuracy for a
+ * sequence. It calculates the chunk detection F1 score.
+ *
+ * A chunk is correctly detected if its beginning, end and type are correct.
+ * Other chunk type is ignored.
+ * For each label in the label sequence, we have
+ *
+ * @code
+ * tagType = label % numTagType
+ * chunkType = label / numTagType
+ * otherChunkType = numChunkTypes
+ * @endcode
+ *
+ * The total number of different labels is numTagType*numChunkTypes+1
+ * We support 4 labelling scheme
+ * The tag type for each of the scheme is shown as follows:
+ *
+ * @code
+ *  Scheme Begin Inside End   Single
+ *   plain  0     -      -     -
+ *   IOB    0     1      -     -
+ *   IOE    -     0      1     -
+ *   IOBES  0     1      2     3
+ * @endcode
+ *
+ * 'plain' means the whole chunk must contain exactly the same chunk label.
+ */
+class ChunkEvaluator : public Evaluator {
+  int otherChunkType_;
+  int numChunkTypes_;  // number of chunk types besides other chunk type
+  int numTagTypes_;
+  int tagBegin_;
+  int tagInside_;
+  int tagEnd_;
+  int tagSingle_;
+
+  int64_t numLabelSegments_;
+  int64_t numOutputSegments_;
+  int64_t numCorrect_;
+
+  struct Segment {
+    int begin;
+    int end;
+    int type;
+    bool operator==(const Segment& y) const {
+      return begin == y.begin && end == y.end && type == y.type;
+    }
+  };
+
+  std::vector<Segment> labelSegments_;
+  std::vector<Segment> outputSegments_;
+  std::set<int> excludedChunkTypes_;
+  mutable std::unordered_map<std::string, real> values_;
+
+ public:
+  virtual void init(const EvaluatorConfig& config) {
+    Evaluator::init(config);
+    if (config.chunk_scheme() == "IOB") {
+      numTagTypes_ = 2;
+      tagBegin_ = 0;
+      tagInside_ = 1;
+      tagEnd_ = -1;
+      tagSingle_ = -1;
+    } else if (config.chunk_scheme() == "IOE") {
+      numTagTypes_ = 2;
+      tagBegin_ = -1;
+      tagInside_ = 0;
+      tagEnd_ = 1;
+      tagSingle_ = -1;
+    } else if (config.chunk_scheme() == "IOBES") {
+      numTagTypes_ = 4;
+      tagBegin_ = 0;
+      tagInside_ = 1;
+      tagEnd_ = 2;
+      tagSingle_ = 3;
+    } else if (config.chunk_scheme() == "plain") {
+      numTagTypes_ = 1;
+      tagBegin_ = -1;
+      tagInside_ = -1;
+      tagEnd_ = -1;
+      tagSingle_ = -1;
+    } else {
+      LOG(FATAL) << "Unknown chunk scheme: " << config.chunk_scheme();
+    }
+    CHECK(config.has_num_chunk_types()) << "Missing num_chunk_types in config";
+    otherChunkType_ = numChunkTypes_ = config.num_chunk_types();
+
+    // the chunks of types in excludedChunkTypes_ will not be counted
+    auto& tmp = config.excluded_chunk_types();
+    excludedChunkTypes_.insert(tmp.begin(), tmp.end());
+  }
+
+  virtual void start() {
+    Evaluator::start();
+    numLabelSegments_ = 0;
+    numOutputSegments_ = 0;
+    numCorrect_ = 0;
+  }
+
+  virtual void printStats(std::ostream& os) const {
+    storeLocalValues();
+    os << config_.name() << "=" << values_["F1-score"]
+       << " true_chunks=" << numLabelSegments_
+       << " result_chunks=" << numOutputSegments_
+       << " correct_chunks=" << numCorrect_;
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    int64_t buf[3] = {numLabelSegments_, numOutputSegments_, numCorrect_};
+    client->reduce(buf, buf, 3, FLAGS_trainer_id, 0);
+    numLabelSegments_ = buf[0];
+    numOutputSegments_ = buf[1];
+    numCorrect_ = buf[2];
+  }
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    CHECK_EQ(arguments.size(), (size_t)2);
+    IVectorPtr& output = arguments[0].ids;
+    IVectorPtr& label = arguments[1].ids;
+    CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
+    auto sequenceStartPositions =
+        arguments[1].sequenceStartPositions->getVector(false);
+    CHECK_EQ(output->getSize(), label->getSize());
+    CHECK(sequenceStartPositions);
+    size_t numSequences = sequenceStartPositions->getSize() - 1;
+    const int* starts = sequenceStartPositions->getData();
+    for (size_t i = 0; i < numSequences; ++i) {
+      eval1(output->getData() + starts[i],
+            label->getData() + starts[i],
+            starts[i + 1] - starts[i]);
+    }
+    return 0;
+  }
+
+  void eval1(int* output, int* label, int length) {
+    getSegments(output, length, outputSegments_);
+    getSegments(label, length, labelSegments_);
+    size_t i = 0, j = 0;
+    while (i < outputSegments_.size() && j < labelSegments_.size()) {
+      if (outputSegments_[i] == labelSegments_[j] &&
+          excludedChunkTypes_.count(outputSegments_[i].type) != 1) {
+        ++numCorrect_;
+      }
+      if (outputSegments_[i].end < labelSegments_[j].end) {
+        ++i;
+      } else if (outputSegments_[i].end > labelSegments_[j].end) {
+        ++j;
+      } else {
+        ++i;
+        ++j;
+      }
+    }
+    for (auto& segment : labelSegments_) {
+      if (excludedChunkTypes_.count(segment.type) != 1) ++numLabelSegments_;
+    }
+    for (auto& segment : outputSegments_) {
+      if (excludedChunkTypes_.count(segment.type) != 1) ++numOutputSegments_;
+    }
+  }
+
+  void getSegments(int* label, int length, std::vector<Segment>& segments) {
+    segments.clear();
+    segments.reserve(length);
+    int chunkStart = 0;
+    bool inChunk = false;
+    int tag = -1;
+    int type = otherChunkType_;
+    for (int i = 0; i < length; ++i) {
+      int prevTag = tag;
+      int prevType = type;
+      CHECK_LE(label[i], numChunkTypes_ * numTagTypes_);
+      tag = label[i] % numTagTypes_;
+      type = label[i] / numTagTypes_;
+      if (inChunk && isChunkEnd(prevTag, prevType, tag, type)) {
+        Segment segment{
+            chunkStart,  // begin
+            i - 1,       // end
+            prevType,
+        };
+        segments.push_back(segment);
+        inChunk = false;
+      }
+      if (isChunkBegin(prevTag, prevType, tag, type)) {
+        chunkStart = i;
+        inChunk = true;
+      }
+    }
+    if (inChunk) {
+      Segment segment{
+          chunkStart,  // begin
+          length - 1,  // end
+          type,
+      };
+      segments.push_back(segment);
+    }
+  }
+
+  // whether (prevTag, prevType) is the end of a chunk
+  bool isChunkEnd(int prevTag, int prevType, int tag, int type) {
+    if (prevType == otherChunkType_) return false;
+    if (type == otherChunkType_) return true;
+    if (type != prevType) return true;
+    if (prevTag == tagBegin_) return tag == tagBegin_ || tag == tagSingle_;
+    if (prevTag == tagInside_) return tag == tagBegin_ || tag == tagSingle_;
+    if (prevTag == tagEnd_) return true;
+    if (prevTag == tagSingle_) return true;
+    return false;
+  }
+
+  // whether (tag, type) is the beginning of a chunk
+  bool isChunkBegin(int prevTag, int prevType, int tag, int type) {
+    if (prevType == otherChunkType_) return type != otherChunkType_;
+    if (type == otherChunkType_) return false;
+    if (type != prevType) return true;
+    if (tag == tagBegin_) return true;
+    if (tag == tagInside_) return prevTag == tagEnd_ || prevTag == tagSingle_;
+    if (tag == tagEnd_) return prevTag == tagEnd_ || prevTag == tagSingle_;
+    if (tag == tagSingle_) return true;
+    return false;
+  }
+
+  // three metrics: precision, recall and F1-score
+  void getNames(std::vector<std::string>* names) {
+    storeLocalValues();
+    names->reserve(names->size() + values_.size());
+    for (auto it = values_.begin(); it != values_.end(); ++it) {
+      names->push_back(config_.name() + "." + it->first);
+    }
+  }
+
+  // get value by field name
+  real getValue(const std::string& name, Error* err) const {
+    storeLocalValues();
+    std::vector<std::string> buffers;
+    paddle::str::split(name, '.', &buffers);
+    auto it = values_.find(buffers.back());
+    if (it == values_.end()) {  // not found
+      *err = Error("No such key %s", name.c_str());
+      return 0.0f;
+    }
+
+    return it->second;
+  }
+
+  // get type of evaluator
+  std::string getType(const std::string& name, Error* err) const {
+    this->getValue(name, err);
+    if (!err->isOK()) {
+      return "";
+    }
+    return "chunk";
+  }
+
+ private:
+  void storeLocalValues() const {
+    CHECK_GE(numOutputSegments_, 0);
+    CHECK_GE(numLabelSegments_, 0);
+    double precision =
+        !numOutputSegments_ ? 0 : (double)numCorrect_ / numOutputSegments_;
+    double recall =
+        !numLabelSegments_ ? 0 : (double)numCorrect_ / numLabelSegments_;
+    values_["precision"] = precision;
+    values_["recall"] = recall;
+    values_["F1-score"] =
+        !numCorrect_ ? 0 : 2 * precision * recall / (precision + recall);
+  }
+};
+
+REGISTER_EVALUATOR(chunk, ChunkEvaluator);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..57657241f8c1517f674670d34cb984b85996bfc7
--- /dev/null
+++ b/paddle/legacy/gserver/evaluators/DetectionMAPEvaluator.cpp
@@ -0,0 +1,308 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Evaluator.h"
+#include "paddle/legacy/gserver/layers/DetectionUtil.h"
+
+using std::map;
+using std::vector;
+using std::pair;
+using std::make_pair;
+
+namespace paddle {
+
+/**
+ * @brief detection map Evaluator
+ *
+ * The config file api is detection_map_evaluator.
+ */
+class DetectionMAPEvaluator : public Evaluator {
+ public:
+  DetectionMAPEvaluator()
+      : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {}
+
+  virtual void start() {
+    Evaluator::start();
+    allTruePos_.clear();
+    allFalsePos_.clear();
+    numPos_.clear();
+  }
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    overlapThreshold_ = config_.overlap_threshold();
+    backgroundId_ = config_.background_id();
+    evaluateDifficult_ = config_.evaluate_difficult();
+    apType_ = config_.ap_type();
+
+    MatrixPtr detectTmpValue = arguments[0].value;
+    Matrix::resizeOrCreate(cpuOutput_,
+                           detectTmpValue->getHeight(),
+                           detectTmpValue->getWidth(),
+                           false,
+                           false);
+
+    MatrixPtr labelTmpValue = arguments[1].value;
+    Matrix::resizeOrCreate(cpuLabel_,
+                           labelTmpValue->getHeight(),
+                           labelTmpValue->getWidth(),
+                           false,
+                           false);
+
+    cpuOutput_->copyFrom(*detectTmpValue);
+    cpuLabel_->copyFrom(*labelTmpValue);
+
+    Argument label = arguments[1];
+    const int* labelIndex = label.sequenceStartPositions->getData(false);
+    size_t batchSize = label.getNumSequences();
+
+    vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes;
+    vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes;
+
+    for (size_t n = 0; n < batchSize; ++n) {
+      map<size_t, vector<NormalizedBBox>> bboxes;
+      for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) {
+        vector<NormalizedBBox> bbox;
+        getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox);
+        int c = cpuLabel_->getData()[i * 6];
+        bboxes[c].push_back(bbox[0]);
+      }
+      allGTBBoxes.push_back(bboxes);
+    }
+
+    size_t n = 0;
+    const real* cpuOutputData = cpuOutput_->getData();
+    for (size_t imgId = 0; imgId < batchSize; ++imgId) {
+      map<size_t, vector<pair<real, NormalizedBBox>>> bboxes;
+      size_t curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
+      while (curImgId == imgId && n < cpuOutput_->getHeight()) {
+        vector<real> label;
+        vector<real> score;
+        vector<NormalizedBBox> bbox;
+        getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox);
+        bboxes[label[0]].push_back(make_pair(score[0], bbox[0]));
+        ++n;
+        curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]);
+      }
+      allDetectBBoxes.push_back(bboxes);
+    }
+
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (map<size_t, vector<NormalizedBBox>>::iterator it =
+               allGTBBoxes[n].begin();
+           it != allGTBBoxes[n].end();
+           ++it) {
+        size_t count = 0;
+        if (evaluateDifficult_) {
+          count = it->second.size();
+        } else {
+          for (size_t i = 0; i < it->second.size(); ++i)
+            if (!(it->second[i].isDifficult)) ++count;
+        }
+        if (numPos_.find(it->first) == numPos_.end() && count != 0) {
+          numPos_[it->first] = count;
+        } else {
+          numPos_[it->first] += count;
+        }
+      }
+    }
+
+    // calcTFPos
+    calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes);
+
+    return 0;
+  }
+
+  virtual void printStats(std::ostream& os) const {
+    real mAP = calcMAP();
+    os << "Detection mAP=" << mAP;
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    LOG(FATAL) << "Distribute detection evaluation not implemented.";
+  }
+
+ protected:
+  void calcTFPos(const size_t batchSize,
+                 const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes,
+                 const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>&
+                     allDetectBBoxes) {
+    for (size_t n = 0; n < allDetectBBoxes.size(); ++n) {
+      if (allGTBBoxes[n].size() == 0) {
+        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
+                 it = allDetectBBoxes[n].begin();
+             it != allDetectBBoxes[n].end();
+             ++it) {
+          size_t label = it->first;
+          for (size_t i = 0; i < it->second.size(); ++i) {
+            allTruePos_[label].push_back(make_pair(it->second[i].first, 0));
+            allFalsePos_[label].push_back(make_pair(it->second[i].first, 1));
+          }
+        }
+      } else {
+        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
+                 it = allDetectBBoxes[n].begin();
+             it != allDetectBBoxes[n].end();
+             ++it) {
+          size_t label = it->first;
+          vector<pair<real, NormalizedBBox>> predBBoxes = it->second;
+          if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) {
+            for (size_t i = 0; i < predBBoxes.size(); ++i) {
+              allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
+              allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1));
+            }
+          } else {
+            vector<NormalizedBBox> gtBBoxes =
+                allGTBBoxes[n].find(label)->second;
+            vector<bool> visited(gtBBoxes.size(), false);
+            // Sort detections in descend order based on scores
+            std::sort(predBBoxes.begin(),
+                      predBBoxes.end(),
+                      sortScorePairDescend<NormalizedBBox>);
+            for (size_t i = 0; i < predBBoxes.size(); ++i) {
+              real maxOverlap = -1.0;
+              size_t maxIdx = 0;
+              for (size_t j = 0; j < gtBBoxes.size(); ++j) {
+                real overlap =
+                    jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]);
+                if (overlap > maxOverlap) {
+                  maxOverlap = overlap;
+                  maxIdx = j;
+                }
+              }
+              if (maxOverlap > overlapThreshold_) {
+                if (evaluateDifficult_ ||
+                    (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) {
+                  if (!visited[maxIdx]) {
+                    allTruePos_[label].push_back(
+                        make_pair(predBBoxes[i].first, 1));
+                    allFalsePos_[label].push_back(
+                        make_pair(predBBoxes[i].first, 0));
+                    visited[maxIdx] = true;
+                  } else {
+                    allTruePos_[label].push_back(
+                        make_pair(predBBoxes[i].first, 0));
+                    allFalsePos_[label].push_back(
+                        make_pair(predBBoxes[i].first, 1));
+                  }
+                }
+              } else {
+                allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
+                allFalsePos_[label].push_back(
+                    make_pair(predBBoxes[i].first, 1));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  real calcMAP() const {
+    real mAP = 0.0;
+    size_t count = 0;
+    for (map<size_t, size_t>::const_iterator it = numPos_.begin();
+         it != numPos_.end();
+         ++it) {
+      size_t label = it->first;
+      size_t labelNumPos = it->second;
+      if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end())
+        continue;
+      vector<pair<real, size_t>> labelTruePos = allTruePos_.find(label)->second;
+      vector<pair<real, size_t>> labelFalsePos =
+          allFalsePos_.find(label)->second;
+      // Compute average precision.
+      vector<size_t> tpCumSum;
+      getAccumulation(labelTruePos, &tpCumSum);
+      vector<size_t> fpCumSum;
+      getAccumulation(labelFalsePos, &fpCumSum);
+      std::vector<real> precision, recall;
+      size_t num = tpCumSum.size();
+      // Compute Precision.
+      for (size_t i = 0; i < num; ++i) {
+        CHECK_LE(tpCumSum[i], labelNumPos);
+        precision.push_back(static_cast<real>(tpCumSum[i]) /
+                            static_cast<real>(tpCumSum[i] + fpCumSum[i]));
+        recall.push_back(static_cast<real>(tpCumSum[i]) / labelNumPos);
+      }
+      // VOC2007 style
+      if (apType_ == "11point") {
+        vector<real> maxPrecisions(11, 0.0);
+        int startIdx = num - 1;
+        for (int j = 10; j >= 0; --j)
+          for (int i = startIdx; i >= 0; --i) {
+            if (recall[i] < j / 10.) {
+              startIdx = i;
+              if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j];
+              break;
+            } else {
+              if (maxPrecisions[j] < precision[i])
+                maxPrecisions[j] = precision[i];
+            }
+          }
+        for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11;
+        ++count;
+      } else if (apType_ == "Integral") {
+        // Nature integral
+        real averagePrecisions = 0.;
+        real prevRecall = 0.;
+        for (size_t i = 0; i < num; ++i) {
+          if (fabs(recall[i] - prevRecall) > 1e-6)
+            averagePrecisions += precision[i] * fabs(recall[i] - prevRecall);
+          prevRecall = recall[i];
+        }
+        mAP += averagePrecisions;
+        ++count;
+      } else {
+        LOG(FATAL) << "Unkown ap version: " << apType_;
+      }
+    }
+    if (count != 0) mAP /= count;
+    return mAP * 100;
+  }
+
+  void getAccumulation(vector<pair<real, size_t>> inPairs,
+                       vector<size_t>* accuVec) const {
+    std::stable_sort(
+        inPairs.begin(), inPairs.end(), sortScorePairDescend<size_t>);
+    accuVec->clear();
+    size_t sum = 0;
+    for (size_t i = 0; i < inPairs.size(); ++i) {
+      sum += inPairs[i].second;
+      accuVec->push_back(sum);
+    }
+  }
+
+  std::string getTypeImpl() const { return "detection_map"; }
+
+  real getValueImpl() const { return calcMAP(); }
+
+ private:
+  real overlapThreshold_;  // overlap threshold when determining whether matched
+  bool evaluateDifficult_;  // whether evaluate difficult ground truth
+  size_t backgroundId_;     // class index of background
+  std::string apType_;      // how to calculate mAP (Integral or 11point)
+
+  MatrixPtr cpuOutput_;
+  MatrixPtr cpuLabel_;
+
+  map<size_t, size_t> numPos_;  // counts of true objects each classification
+  map<size_t, vector<pair<real, size_t>>>
+      allTruePos_;  // true positive prediction
+  map<size_t, vector<pair<real, size_t>>>
+      allFalsePos_;  // false positive prediction
+};
+
+REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/Evaluator.cpp b/paddle/legacy/gserver/evaluators/Evaluator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a956f40d02e39ac57ca745988491c2b54741dca3
--- /dev/null
+++ b/paddle/legacy/gserver/evaluators/Evaluator.cpp
@@ -0,0 +1,1361 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/gserver/evaluators/Evaluator.h"
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/StringUtil.h"
+
+DECLARE_int32(trainer_id);
+
+namespace paddle {
+
+void Evaluator::eval(const NeuralNetwork& nn) {
+  std::vector<Argument> arguments;
+  arguments.reserve(config_.input_layers_size());
+  for (const std::string& name : config_.input_layers()) {
+    arguments.push_back(nn.getLayer(name)->getOutput());
+  }
+  SetDevice device(arguments[0].deviceId);
+  real score = evalImp(arguments);
+  totalScore_ += score;
+  updateSamplesNum(arguments);
+}
+/**
+ * @brief classification error Evaluator
+ *
+ * The config file api is classification_error_evaluator.
+ */
+class ClassificationErrorEvaluator : public Evaluator {
+ public:
+  /*
+  ClassificationErrorEvaluator() : totalScore2_(0) {}
+
+  virtual void start() {
+    Evaluator::start();
+    totalScore2_ = 0;
+    } */
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
+    if (3 == arguments.size()) {
+      numSamples_ += arguments[2].value->getSum();
+    } else {
+      numSamples_ += arguments[0].getBatchSize();
+    }
+  }
+
+  MatrixPtr calcError(std::vector<Argument>& arguments) {
+    CHECK_GE(arguments.size(), (size_t)2);
+    CHECK_LE(arguments.size(), (size_t)3);
+    MatrixPtr& output = arguments[0].value;
+    IVectorPtr& label = arguments[1].ids;
+    MatrixPtr& multiBinaryLabel = arguments[1].value;  // For multi binary label
+    bool supportWeight = (3 == arguments.size()) ? true : false;
+    MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
+    if (nullptr == output ||
+        (nullptr == label && nullptr == multiBinaryLabel) ||
+        (supportWeight && nullptr == weight)) {
+      return 0;
+    }
+
+    if (label != nullptr) {
+      CHECK_EQ(label->getSize(), output->getHeight());
+    } else {
+      CHECK_EQ(multiBinaryLabel->getHeight(), output->getHeight());
+      CHECK_EQ(multiBinaryLabel->getWidth(), output->getWidth());
+    }
+    if (supportWeight) {
+      CHECK_EQ(output->getHeight(), weight->getHeight());
+      CHECK_EQ((size_t)1, weight->getWidth());
+    }
+
+    const MatrixPtr errorMat = Matrix::create(output->getHeight(),
+                                              1,
+                                              /* trans= */ false,
+                                              useGpu(arguments[0].deviceId));
+
+    errorMat->zeroMem();
+
+    if (label != nullptr) {
+      errorMat->classificationError(*output, *label, config_.top_k());
+    } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
+               dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
+      errorMat->classificationErrorMulti(
+          *output, *multiBinaryLabel, config_.classification_threshold());
+    } else {
+      errorMat->binaryClassificationError(
+          0, *output, *multiBinaryLabel, config_.classification_threshold());
+    }
+
+    if (supportWeight) {
+      errorMat->dotMul(*errorMat, *weight);
+    }
+    return errorMat;
+  }
+
+  void printStats(std::ostream& os) const {
+    if (config_.top_k() == 1) {
+      os << config_.name() << "="
+         << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    } else {
+      os << " top_" << config_.top_k()
+         << "_error=" << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    }
+  }
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    MatrixPtr errorMat = calcError(arguments);
+    return errorMat->getSum();
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    mergeResultsOfAllClients(client);
+  }
+
+  // Evaluator interface
+ protected:
+  std::string getTypeImpl() const { return "classification_error"; }
+};
+
+/**
+ * @brief sequence classification error Evaluator
+ * @note sequence level classification error stats,
+ * if any frame in one sequence has error, the sequence is error
+ */
+class SequenceClassificationErrorEvaluator
+    : public ClassificationErrorEvaluator {
+ public:
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
+    numSamples_ += arguments[0].getNumSequences();
+  }
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    auto sequenceStartPositions =
+        arguments[0].sequenceStartPositions->getVector(false);
+    CHECK(sequenceStartPositions != nullptr);
+    const int* starts = sequenceStartPositions->getData();
+
+    MatrixPtr errorMat = calcError(arguments);
+
+    int errCounter = 0;
+    CpuVector errorVec(0, nullptr);
+    for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) {
+      errorVec.subVecFrom(
+          errorMat->getData(), starts[i], starts[i + 1] - starts[i]);
+      if (errorVec.getSum() > 0) {
+        errCounter += 1;
+      }
+    }
+
+    return static_cast<real>(errCounter);
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    mergeResultsOfAllClients(client);
+  }
+
+  // Evaluator interface
+ protected:
+  std::string getTypeImpl() const { return "seq_classification_error"; }
+};
+REGISTER_EVALUATOR(seq_classification_error,
+                   SequenceClassificationErrorEvaluator);
+/**
+ * @brief sum Evaluator
+ * Calculate the sum of output or label
+ *
+ * The config file api is sum_evaluator.
+ */
+class SumEvaluator : public Evaluator {
+ public:
+  SumEvaluator() : cpuLabel_(nullptr), cpuWeight_(nullptr) {}
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
+    if (2 == arguments.size()) {
+      numSamples_ += arguments[1].value->getSum();
+    } else {
+      numSamples_ += arguments[0].getBatchSize();
+    }
+  }
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    REGISTER_TIMER("SumEvaluator");
+    CHECK_GE(arguments.size(), (size_t)1);
+    CHECK_LE(arguments.size(), (size_t)2);
+    bool supportWeight = (2 == arguments.size()) ? true : false;
+    if (supportWeight) {
+      if (nullptr == arguments[1].value) {
+        return 0;
+      }
+      CHECK_EQ(arguments[1].value->getWidth(), (size_t)1);
+    }
+
+    // The sum of output
+    if (arguments[0].value) {
+      if (supportWeight) {
+        CHECK_EQ(arguments[0].value->getHeight(),
+                 arguments[1].value->getHeight());
+        MatrixPtr tmpMat = Matrix::create(arguments[0].value->getHeight(),
+                                          arguments[0].value->getWidth(),
+                                          /* trans= */ false,
+                                          arguments[0].value->useGpu());
+        tmpMat->copyFrom(*arguments[0].value);
+        tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
+        return tmpMat->getSum();
+      } else {
+        return arguments[0].value->getSum();
+      }
+      // The sum of label
+    } else if (arguments[0].ids) {
+      size_t insNum = arguments[0].ids->getSize();
+      IVectorPtr label = arguments[0].ids;
+      MatrixPtr weight = supportWeight ? arguments[1].value : nullptr;
+      if (dynamic_cast<GpuIVector*>(label.get())) {
+        IVector::resizeOrCreate(cpuLabel_, insNum, false);
+        cpuLabel_->copyFrom(*arguments[0].ids);
+
+        if (supportWeight) {
+          CHECK_EQ(insNum, arguments[1].value->getHeight());
+          Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
+          cpuWeight_->copyFrom(*arguments[1].value);
+        }
+
+        label = cpuLabel_;
+        weight = cpuWeight_;
+      }
+
+      if (supportWeight) {
+        real score = 0.0;
+        int* labelD = label->getData();
+        real* weightD = weight->getData();
+        for (size_t i = 0; i < insNum; ++i) {
+          score += (labelD[i] * weightD[i]);
+        }
+        return score;
+      } else {
+        return label->getSum();
+      }
+    } else {
+      return 0;
+    }
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    mergeResultsOfAllClients(client);
+  }
+
+ private:
+  IVectorPtr cpuLabel_;
+  MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+ protected:
+  std::string getTypeImpl() const { return "sum"; }
+};
+/**
+ * @brief column sum Evaluator
+ * @note column sum for the colIdx-th column *
+ * - colIdx = 0: the 0-th column.
+ * - colIdx > 0: the colIdx-th column.
+ * - colIdx < 0: the last colIdx-th column.
+ *
+ * The config file api is column_sum_evaluator.
+ *
+ */
+class ColumnSumEvaluator : public Evaluator {
+ public:
+  explicit ColumnSumEvaluator(int32_t colIdx)
+      : colIdx_(colIdx), colNum_(0), sum_(nullptr) {}
+
+  virtual void start() {
+    Evaluator::start();
+    if (nullptr != sum_) {
+      sum_->zeroMem();
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
+    if (2 == arguments.size()) {
+      numSamples_ += arguments[1].value->getSum();
+    } else {
+      numSamples_ += arguments[0].getBatchSize();
+    }
+  }
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    REGISTER_TIMER("ColumnSumEvaluator");
+    CHECK_GE(arguments.size(), (size_t)1);
+    CHECK_LE(arguments.size(), (size_t)2);
+    bool supportWeight = (2 == arguments.size()) ? true : false;
+    if (nullptr == arguments[0].value ||
+        (supportWeight && nullptr == arguments[1].value)) {
+      return 0;
+    }
+
+    size_t insNum = arguments[0].value->getHeight();
+    size_t colNum = arguments[0].value->getWidth();
+    if (nullptr == sum_) {
+      sum_ = Matrix::create((size_t)1, colNum, false, /* useGpu */ false);
+      colNum_ = colNum;
+      sum_->zeroMem();
+    } else {
+      CHECK_EQ(colNum, sum_->getWidth());
+    }
+
+    if (supportWeight) {
+      CHECK_EQ(insNum, arguments[1].value->getHeight());
+      CHECK_EQ((size_t)1, arguments[1].value->getWidth());
+      MatrixPtr tmpMat = Matrix::create(insNum, colNum);
+      if (arguments[0].value->useGpu()) {
+        tmpMat->copyFrom(*arguments[0].value);
+      }
+      if (!arguments[1].value->useGpu()) {
+        if (!arguments[0].value->useGpu()) {
+          tmpMat->rowScale(0, *arguments[0].value, *arguments[1].value);
+        } else {
+          tmpMat->rowScale(0, *tmpMat, *arguments[1].value);
+        }
+      } else {
+        MatrixPtr tmp2 = Matrix::create(insNum, 1);
+        tmp2->copyFrom(*arguments[1].value);
+        if (!arguments[0].value->useGpu()) {
+          tmpMat->rowScale(0, *arguments[0].value, *tmp2);
+        } else {
+          tmpMat->rowScale(0, *tmpMat, *tmp2);
+        }
+      }
+      sum_->accumulateColSum(*tmpMat);
+    } else {
+      if (!arguments[0].value->useGpu()) {
+        sum_->accumulateColSum(*arguments[0].value);
+      } else {
+        MatrixPtr tmpMat = Matrix::create(insNum, colNum);
+        tmpMat->copyFrom(*arguments[0].value);
+        sum_->accumulateColSum(*tmpMat);
+      }
+    }
+    return 0;
+  }
+
+  virtual void printStats(std::ostream& os) const {
+    CHECK(colIdx_ + (int32_t)colNum_ >= 0 && colIdx_ - (int32_t)colNum_ < 0)
+        << "column index [" << colIdx_ << "] out of range [-" << colNum_ << ", "
+        << colNum_ << ")";
+    size_t colIdx = 0;
+    if (colIdx_ >= 0) {
+      colIdx = colIdx_;
+    } else {
+      colIdx = colNum_ + colIdx_;
+    }
+    os << config_.name() << "="
+       << (numSamples_ ? sum_->getElement(0, colIdx) / numSamples_ : 0);
+  }
+
+  void distributeEval(ParameterClient2* client) {
+    client->reduce(
+        sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0);
+    client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0);
+  }
+
+ private:
+  int32_t colIdx_;
+  size_t colNum_;
+  MatrixPtr sum_; /* cpu matrix */
+
+  // Evaluator interface
+ protected:
+  std::string getTypeImpl() const {
+    if (colIdx_ == -1)
+      return "last-column-sum";
+    else
+      return "column-sum";
+  }
+};
+
+void AucEvaluator::start() {
+  Evaluator::start();
+  memset(statPos_, 0, sizeof(statPos_));
+  memset(statNeg_, 0, sizeof(statNeg_));
+}
+
+real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
+  REGISTER_TIMER("AucEvaluator");
+  CHECK_GE(arguments.size(), (size_t)2);
+  CHECK_LE(arguments.size(), (size_t)3);
+  MatrixPtr output = arguments[0].value;
+  IVectorPtr label = arguments[1].ids;
+  MatrixPtr labelval = arguments[1].value;
+  bool supportWeight = (3 == arguments.size()) ? true : false;
+  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
+
+  if (nullptr == output || (supportWeight && nullptr == weight)) {
+    return 0;
+  }
+  size_t insNum = output->getHeight();
+  size_t outputDim = output->getWidth();
+  // Copy label from value to a vector.
+  if (nullptr == label && nullptr != labelval) {
+    // label width is 1
+    CHECK_EQ(1U, labelval->getWidth());
+    VectorPtr vec =
+        Vector::create(labelval->getData(), insNum, output->useGpu());
+    label = vec->castToInt();
+  }
+
+  CHECK_EQ(insNum, label->getSize());
+  if (supportWeight) {
+    CHECK_EQ(insNum, weight->getHeight());
+    CHECK_EQ((size_t)1, weight->getWidth());
+  }
+
+  CHECK(colIdx_ + (int32_t)outputDim >= 0 && colIdx_ - (int32_t)outputDim < 0)
+      << "column index [" << colIdx_ << "] out of range [-" << outputDim << ", "
+      << outputDim << ")";
+  realColumnIdx_ = 0;
+  if (colIdx_ >= 0) {
+    realColumnIdx_ = colIdx_;
+  } else {
+    realColumnIdx_ = outputDim + colIdx_;
+  }
+
+  if (dynamic_cast<GpuMatrix*>(output.get())) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           insNum,
+                           outputDim,
+                           /* trans=*/false,
+                           /* useGpu=*/false);
+    cpuOutput_->copyFrom(*output);
+    IVector::resizeOrCreate(cpuLabel_, insNum, false);
+    cpuLabel_->copyFrom(*label);
+
+    if (supportWeight) {
+      Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
+      cpuWeight_->copyFrom(*weight);
+    }
+
+    output = cpuOutput_;
+    label = cpuLabel_;
+    weight = cpuWeight_;
+  }
+
+  real* outputD = output->getData();
+  int* labelD = label->getData();
+  real* weightD = supportWeight ? weight->getData() : nullptr;
+  size_t pos = realColumnIdx_;
+
+  for (size_t i = 0; i < insNum; ++i) {
+    real value = outputD[pos];
+    uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_);
+    CHECK(binIdx <= kBinNum_) << "bin index [" << binIdx
+                              << "] out of range, predict value[" << value
+                              << "]";
+    real w = supportWeight ? weightD[i] : 1.0;
+    if (labelD[i] == kNegativeLabel_) {
+      statNeg_[binIdx] += w;
+    } else {
+      statPos_[binIdx] += w;
+    }
+    pos += outputDim;
+  }
+  return 0;
+}
+
+void AucEvaluator::distributeEval(ParameterClient2* client) {
+  client->reduce(statPos_, statPos_, kBinNum_ + 1, FLAGS_trainer_id, 0);
+  client->reduce(statNeg_, statNeg_, kBinNum_ + 1, FLAGS_trainer_id, 0);
+}
+
+double AucEvaluator::calcAuc() const {
+  double totPos = 0.0;
+  double totNeg = 0.0;
+  double totPosPrev = 0.0;
+  double totNegPrev = 0.0;
+  double auc = 0.0;
+
+  int64_t idx = kBinNum_;
+  while (idx >= 0) {
+    totPosPrev = totPos;
+    totNegPrev = totNeg;
+    totPos += statPos_[idx];
+    totNeg += statNeg_[idx];
+    auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev);
+    --idx;
+  }
+
+  if (totPos > 0.0 && totNeg > 0.0) {
+    return auc / totPos / totNeg;
+  } else {
+    return 0.0;
+  }
+}
+
+real AucEvaluator::getValueImpl() const { return calcAuc(); }
+
+std::string AucEvaluator::getTypeImpl() const {
+  if (colIdx_ == -1) {
+    return "last-column-auc";
+  } else {
+    return "auc";
+  }
+}
+
+// class RankAucEvaluator
+REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
+
+void RankAucEvaluator::start() { Evaluator::start(); }
+void RankAucEvaluator::updateSamplesNum(
+    const std::vector<Argument>& arguments) {
+  numSamples_ += arguments[0].getNumSequences();
+}
+real RankAucEvaluator::evalImp(std::vector<Argument>& arguments) {
+  CHECK_GE(arguments.size(), 2U);
+  CHECK_LE(arguments.size(), 3U);
+  double batchAuc = 0.0;
+  output_ = arguments[0].value;
+  click_ = arguments[1].value;
+  size_t batchSize = output_->getHeight();
+  CHECK(!output_->useGpu()) << "RankAUC evaluator does not support GPU!";
+
+  if (arguments.size() == 3U) {
+    pv_ = arguments[2].value;
+  } else {
+    Matrix::resizeOrCreate(pv_, batchSize, 1, false, false);
+    std::fill(pv_->getData(), pv_->getData() + batchSize, 1.0);
+  }
+
+  real* outputData = output_->getData();
+  real* clickData = click_->getData();
+  real* pvData = pv_->getData();
+
+  auto startPos = arguments[0].sequenceStartPositions->getVector(false);
+  const int* startPosData = startPos->getData();
+  size_t batchNum = startPos->getSize() - 1;
+  for (size_t i = 0; i < batchNum; ++i) {
+    int beginPos = startPosData[i];
+    int endPos = startPosData[i + 1];
+    batchAuc += calcRankAuc(outputData + beginPos,
+                            clickData + beginPos,
+                            pvData + beginPos,
+                            endPos - beginPos);
+  }
+  return batchAuc;
+}
+
+double RankAucEvaluator::calcRankAuc(real* outputData,
+                                     real* clickData,
+                                     real* pvData,
+                                     size_t size) {
+  outputPair_.clear();
+  for (size_t i = 0; i < size; ++i) {
+    outputPair_.push_back(std::make_pair(outputData[i], i));
+  }
+  std::sort(outputPair_.begin(),
+            outputPair_.end(),
+            [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
+              return a.first > b.first;
+            });
+  double aucTmp = 0.0;
+  double clickSum = 0.0;
+  double oldClickSum = 0.0;
+  double noClick = 0.0;
+  double noClickSum = 0.0;
+
+  double lastScore = outputPair_[0].first + 1.0;
+  for (size_t i = 0; i < size; ++i) {
+    if (lastScore != outputPair_[i].first) {
+      aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
+      oldClickSum = clickSum;
+      noClick = 0.0;
+      lastScore = outputPair_[i].first;
+    }
+    size_t id = outputPair_[i].second;
+    noClick += pvData[id] - clickData[id];
+    noClickSum += noClick;
+    clickSum += clickData[id];
+  }
+  aucTmp += (clickSum + oldClickSum) * noClick / 2.0;
+  return (clickSum * noClickSum) == 0.0 ? 0.0
+                                        : aucTmp / (clickSum * noClickSum);
+}
+
+std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
+
+// class PrecisionRecallEvaluator
+REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
+
+void PrecisionRecallEvaluator::start() {
+  Evaluator::start();
+  statsInfo_.clear();
+  values_.clear();
+}
+
+real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
+  REGISTER_TIMER("PrecisionRecallEvaluator");
+  CHECK_GE(arguments.size(), (size_t)2);
+  CHECK_LE(arguments.size(), (size_t)3);
+  MatrixPtr output = arguments[0].value;
+  IVectorPtr label = arguments[1].ids;
+  MatrixPtr multiBinaryLabel = arguments[1].value;
+  bool supportWeight = (3 == arguments.size()) ? true : false;
+  MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
+  if (nullptr == output || (nullptr == label && nullptr == multiBinaryLabel) ||
+      (supportWeight && nullptr == weight)) {
+    return 0;
+  }
+
+  size_t insNum = output->getHeight();
+  size_t outputDim = output->getWidth();
+  if (label != nullptr) {
+    CHECK_EQ(insNum, label->getSize());
+  } else {
+    CHECK_EQ(insNum, multiBinaryLabel->getHeight());
+    CHECK_EQ(outputDim, multiBinaryLabel->getWidth());
+  }
+  if (supportWeight) {
+    CHECK_EQ(insNum, weight->getHeight());
+    CHECK_EQ((size_t)1, weight->getWidth());
+  }
+
+  if (statsInfo_.size() != outputDim) {
+    statsInfo_.clear();
+    statsInfo_.resize(outputDim);
+  }
+
+  isMultiBinaryLabel_ = (nullptr == label) ? true : false;
+  if (label != nullptr) {
+    if (dynamic_cast<GpuMatrix*>(output.get())) {
+      Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, false, false);
+      cpuOutput_->copyFrom(*output);
+      IVector::resizeOrCreate(cpuLabel_, insNum, false);
+      cpuLabel_->copyFrom(*label);
+      if (supportWeight) {
+        Matrix::resizeOrCreate(cpuWeight_, insNum, (size_t)1, false, false);
+        cpuWeight_->copyFrom(*weight);
+      }
+
+      output = cpuOutput_;
+      label = cpuLabel_;
+      weight = cpuWeight_;
+    }
+    calcStatsInfo(output, label, weight);
+  } else {
+    // Not support GPU for multi binary labels
+    CHECK(dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()));
+    calcStatsInfoMulti(output, multiBinaryLabel, weight);
+  }
+  return 0;
+}
+
+void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
+  PrintStatsInfo info;
+  bool containMacroMicroInfo = getStatsInfo(&info);
+  os << "positive_label=" << config_.positive_label()
+     << " precision=" << info.precision << " recall=" << info.recall
+     << " F1-score=" << info.f1;
+  if (containMacroMicroInfo) {
+    os << "macro-average-precision=" << info.macroAvgPrecision
+       << " macro-average-recall=" << info.macroAvgRecall
+       << " macro-average-F1-score=" << info.macroAvgF1Score;
+    if (!isMultiBinaryLabel_) {
+      // precision and recall are equal in this case
+      os << " micro-average-precision=" << info.microAvgPrecision;
+    } else {
+      os << " micro-average-precision=" << info.microAvgPrecision
+         << " micro-average-recall=" << info.microAvgRecall
+         << " micro-average-F1-score=" << info.microAvgF1Score;
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::calcStatsInfo(const MatrixPtr& output,
+                                             const IVectorPtr& label,
+                                             const MatrixPtr& weight) {
+  size_t insNum = output->getHeight();
+  size_t dim = output->getWidth();
+  real* outputD = output->getData();
+  int* labelD = label->getData();
+  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
+  for (size_t i = 0; i < insNum; ++i) {
+    CHECK_GE(labelD[i], 0);
+    CHECK_LT((size_t)labelD[i], dim);
+    size_t maxIdx = 0;
+    real maxValue = outputD[i * dim];
+    for (size_t j = 1; j < dim; ++j) {
+      size_t idx = i * dim + j;
+      if (maxValue < outputD[idx]) {
+        maxIdx = j;
+        maxValue = outputD[idx];
+      }
+    }
+
+    real w = (weightD != nullptr) ? weightD[i] : 1.0;
+    if (maxIdx == (size_t)labelD[i]) {
+      statsInfo_[maxIdx].TP += w;  // true positive for labelD[i]
+      // true negative for all labels except for labelD[i]
+      for (size_t j = 0; j < dim; ++j) {
+        statsInfo_[j].TN += w;
+      }
+      statsInfo_[maxIdx].TN -= w;
+    } else {
+      statsInfo_[labelD[i]].FN += w;  // false negative for labelD[i]
+      statsInfo_[maxIdx].FP += w;     // false positive for maxIdx
+      // true negatives for all labels except for maxIdx and labelD[i]
+      for (size_t j = 0; j < dim; ++j) {
+        statsInfo_[j].TN += w;
+      }
+      statsInfo_[maxIdx].TN -= w;
+      statsInfo_[labelD[i]].TN -= w;
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
+                                                  const MatrixPtr& label,
+                                                  const MatrixPtr& weight) {
+  size_t insNum = output->getHeight();
+  size_t dim = output->getWidth();
+  real* outputD = output->getData();
+  auto labelD = dynamic_cast<CpuSparseMatrix*>(label.get());
+  real* weightD = (weight != nullptr) ? weight->getData() : nullptr;
+  real threshold = config_.classification_threshold();
+  for (size_t i = 0; i < insNum; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      real w = (weightD != nullptr) ? weightD[i] : 1.0;
+      size_t idx = i * dim + j;
+      if (outputD[idx] < threshold) {
+        statsInfo_[j].TN += w;  // true negative
+      } else {
+        statsInfo_[j].FP += w;  // false positive
+      }
+    }
+
+    const int* cols = labelD->getRowCols(i);
+    for (size_t j = 0; j < labelD->getColNum(i); ++j) {
+      CHECK_LT(size_t(cols[j]), dim);
+      real w = (weightD != nullptr) ? weightD[i] : 1.0;
+      size_t idx = i * dim + cols[j];
+      if (outputD[idx] < threshold) {
+        statsInfo_[cols[j]].FN += w;  // false negative
+        statsInfo_[cols[j]].TN -= w;  // true negative
+      } else {
+        statsInfo_[cols[j]].TP += w;  // true positive
+        statsInfo_[cols[j]].FP -= w;  // false positive
+      }
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::storeLocalValues() const {
+  if (this->values_.size() == 0) {
+    PrintStatsInfo info;
+    bool containMacroMicroInfo = getStatsInfo(&info);
+    values_["precision"] = info.precision;
+    values_["recal"] = info.recall;
+    values_["F1-score"] = info.f1;
+    if (containMacroMicroInfo) {
+      values_["macro-average-precision"] = info.macroAvgPrecision;
+      values_["macro-average-recall"] = info.macroAvgRecall;
+      values_["macro-average-F1-score"] = info.macroAvgF1Score;
+      if (!isMultiBinaryLabel_) {
+        // precision and recall are equal in this case
+        values_["micro-average-precision"] = info.microAvgPrecision;
+      } else {
+        values_["micro-average-precision"] = info.microAvgPrecision;
+        values_["micro-average-recall"] = info.microAvgRecall;
+        values_["micro-average-F1-score"] = info.microAvgF1Score;
+      }
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
+  this->storeLocalValues();
+  names->reserve(this->values_.size());
+  for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
+    names->push_back(this->config_.name() + "." + it->first);
+  }
+}
+
+real PrecisionRecallEvaluator::getValue(const std::string& name,
+                                        Error* err) const {
+  this->storeLocalValues();
+  std::vector<std::string> buffers;
+  paddle::str::split(name, '.', &buffers);
+  auto it = this->values_.find(buffers[buffers.size() - 1]);
+  if (it == this->values_.end()) {  // not found
+    *err = Error("No such key %s", name.c_str());
+    return .0f;
+  }
+
+  return it->second;
+}
+
+std::string PrecisionRecallEvaluator::getType(const std::string& name,
+                                              Error* err) const {
+  this->getValue(name, err);
+  if (!err->isOK()) {
+    return "";
+  }
+  return "precision_recall";
+}
+
+void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
+  size_t size = 4 * statsInfo_.size();
+  double* buf = new double[size];
+  for (size_t i = 0; i < statsInfo_.size(); ++i) {
+    buf[4 * i + 0] = statsInfo_[i].TP;
+    buf[4 * i + 1] = statsInfo_[i].TN;
+    buf[4 * i + 2] = statsInfo_[i].FP;
+    buf[4 * i + 3] = statsInfo_[i].FN;
+  }
+  client->reduce(buf, buf, size, FLAGS_trainer_id, 0);
+  for (size_t i = 0; i < statsInfo_.size(); ++i) {
+    statsInfo_[i].TP = buf[4 * i + 0];
+    statsInfo_[i].TN = buf[4 * i + 1];
+    statsInfo_[i].FP = buf[4 * i + 2];
+    statsInfo_[i].FN = buf[4 * i + 3];
+  }
+  delete[] buf;
+}
+
+bool PrecisionRecallEvaluator::getStatsInfo(
+    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
+  int label = config_.positive_label();
+  if (label != -1) {
+    CHECK(label >= 0 && label < (int)statsInfo_.size())
+        << "positive_label [" << label << "] should be in range [0, "
+        << statsInfo_.size() << ")";
+    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
+    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
+    info->f1 = calcF1Score(info->precision, info->recall);
+    return false;
+  }
+
+  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
+  // macro average method: precision = (precision1+precision2)/2
+  double microTotalTP = 0;
+  double microTotalFP = 0;
+  double microTotalFN = 0;
+  info->macroAvgPrecision = 0;
+  info->macroAvgRecall = 0;
+  size_t numLabels = statsInfo_.size();
+  for (size_t i = 0; i < numLabels; ++i) {
+    microTotalTP += statsInfo_[i].TP;
+    microTotalFP += statsInfo_[i].FP;
+    microTotalFN += statsInfo_[i].FN;
+    info->macroAvgPrecision +=
+        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
+    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
+  }
+  info->macroAvgPrecision /= numLabels;
+  info->macroAvgRecall /= numLabels;
+  info->macroAvgF1Score =
+      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
+
+  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
+  info->microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
+  info->microAvgF1Score =
+      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
+  return true;
+}
+
+REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
+void PnpairEvaluator::start() {
+  Evaluator::start();
+  memset(pairArray_, 0, sizeof(pairArray_));
+  predictArray_.clear();
+}
+
+real PnpairEvaluator::evalImp(std::vector<Argument>& arguments) {
+  CHECK_GE(arguments.size(), 3UL);
+  CHECK_LE(arguments.size(), 4UL);
+  MatrixPtr output = arguments[0].value;
+  IVectorPtr label = arguments[1].ids;
+  IVectorPtr info = arguments[2].ids;
+  bool supportWeight = (4 == arguments.size()) ? true : false;
+  MatrixPtr weight = supportWeight ? arguments[3].value : nullptr;
+  if (nullptr == output || nullptr == label ||
+      (supportWeight && nullptr == weight)) {
+    return 0;
+  }
+  size_t height = output->getHeight();
+  size_t width = output->getWidth();
+  CHECK_EQ(height, label->getSize());
+  CHECK_EQ(height, info->getSize());
+  if (supportWeight) {
+    CHECK_EQ(height, weight->getHeight());
+    CHECK_EQ((size_t)1, weight->getWidth());
+  }
+
+  if (dynamic_cast<GpuMatrix*>(output.get())) {
+    Matrix::resizeOrCreate(cpuOutput_, height, width, false, false);
+    IVector::resizeOrCreate(cpuLabel_, height, false);
+    IVector::resizeOrCreate(cpuInfo_, height, false);
+    cpuOutput_->copyFrom(*output);
+    cpuLabel_->copyFrom(*label);
+    cpuInfo_->copyFrom(*info);
+
+    output = cpuOutput_;
+    label = cpuLabel_;
+    info = cpuInfo_;
+
+    if (supportWeight) {
+      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
+      cpuWeight_->copyFrom(*weight);
+      weight = cpuWeight_;
+    }
+  }
+
+  real* outputs = output->getData();
+  int* labels = label->getData();
+  int* infos = info->getData();
+  real* weights = supportWeight ? weight->getData() : nullptr;
+  for (size_t i = 0; i < output->getHeight(); i++) {
+    real y1 = outputs[i * width + (width - 1)];
+    real w = supportWeight ? weights[i] : 1.0;
+    predictArray_.push_back(PredictionResult(y1, labels[i], infos[i], w));
+  }
+  return 0;
+}
+
+void PnpairEvaluator::stat(size_t start,
+                           size_t end,
+                           PredictionResult* answers,
+                           double& pos,
+                           double& neg,
+                           double& spe) {
+  for (size_t i = start; i < end; i++) {
+    for (size_t j = i + 1; j < end; j++) {
+      CHECK_EQ(answers[i].queryid, answers[j].queryid);
+      // The pair weight is the mean of the two samples' weight
+      double weight = (answers[i].weight + answers[j].weight) / 2.0;
+      if (answers[i].label != answers[j].label) {
+        if ((answers[i].out > answers[j].out &&
+             answers[i].label > answers[j].label) ||
+            (answers[i].out < answers[j].out &&
+             answers[i].label < answers[j].label)) {
+          pos += weight;
+        } else if ((answers[i].out > answers[j].out &&
+                    answers[i].label < answers[j].label) ||
+                   (answers[i].out < answers[j].out &&
+                    answers[i].label > answers[j].label)) {
+          neg += weight;
+        } else {
+          spe += weight;
+        }
+      }
+    }
+  }
+}
+
+void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
+  std::sort(predictArray.begin(),
+            predictArray.end(),
+            [](const PredictionResult& x, const PredictionResult& y) {
+              return x.queryid < y.queryid;
+            });
+
+  double pos = 0;
+  double neg = 0;
+  double special = 0;
+  auto start = predictArray.begin();
+  while (start != predictArray.end()) {
+    auto end = std::find_if(
+        start + 1, predictArray.end(), [=](const PredictionResult& x) {
+          return x.queryid != start->queryid;
+        });
+    CHECK(end != start);
+    stat(start - predictArray.begin(),
+         end - predictArray.begin(),
+         predictArray.data(),
+         pos,
+         neg,
+         special);
+
+    start = end;
+  }
+
+  pairArray_[0] += pos;
+  pairArray_[1] += neg;
+
+  LOG(INFO) << " calc total pos pair: " << pos
+            << " calc total neg pair: " << neg
+            << " calc total special pair: " << special;
+}
+
+std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
+
+ClassRegistrar<Evaluator> Evaluator::registrar_;
+Evaluator* Evaluator::create(const EvaluatorConfig& config) {
+  Evaluator* evaluator = registrar_.createByType(config.type());
+  evaluator->init(config);
+  return evaluator;
+}
+
+REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
+REGISTER_EVALUATOR(sum, SumEvaluator);
+static InitFunction __reg_type_auc_sum__([]() {
+  Evaluator::registrar_.registerClass(
+      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
+  Evaluator::registrar_.registerClass("last-column-auc",
+                                      [] { return new AucEvaluator(-1); });
+});
+
+/**
+ * @brief print value of each layer.
+ *
+ * The config file api is value_printer_evaluator.
+ */
+class ValuePrinter : public NotGetableEvaluator {
+ public:
+  virtual void eval(const NeuralNetwork& nn) {
+    for (const std::string& name : config_.input_layers()) {
+      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
+                                                      "layer=" + name + " ");
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
+};
+REGISTER_EVALUATOR(value_printer, ValuePrinter);
+
+/**
+ * @brief print gradient of each layer.
+ *
+ * The config file api is gradient_printer_evaluator.
+ */
+class GradientPrinter : public NotGetableEvaluator {
+ public:
+  virtual void eval(const NeuralNetwork& nn) {
+    for (const std::string& name : config_.input_layers()) {
+      const Argument& argu = nn.getLayer(name)->getOutput();
+      if (argu.grad) {
+        std::ostringstream os;
+        argu.grad->print(os);
+        LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
+      }
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
+};
+REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
+/**
+ * @brief print row max id vctor of each layer
+ *
+ * The config file api is maxid_printer_evaluator.
+ */
+class MaxIdPrinter : public NotGetableEvaluator {
+ private:
+  IVectorPtr maxIds_;
+  MatrixPtr maxValues_;
+
+ public:
+  MaxIdPrinter() {}
+
+  virtual void eval(const NeuralNetwork& nn) {
+    for (const std::string& name : config_.input_layers()) {
+      const Argument& argu = nn.getLayer(name)->getOutput();
+      if (argu.value) {
+        size_t height = argu.value->getHeight();
+        size_t width = config_.num_results();
+        IVector::resizeOrCreate(maxIds_, height * width, false);
+        Matrix::resizeOrCreate(maxValues_, height, width, false);
+        argu.value->rowMax(*maxIds_, *maxValues_);
+        std::ostringstream os;
+        int* ids = maxIds_->getData();
+        real* values = maxValues_->getData();
+        for (size_t i = 0; i < height; ++i) {
+          for (size_t j = 0; j < width; ++j) {
+            size_t pos = i * width + j;
+            os << ids[pos] << " : " << values[pos] << ", ";
+          }
+          os << std::endl;
+        }
+        LOG(INFO) << "layer=" << name << " row max id vector:\n" << os.str();
+      }
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
+};
+REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
+/**
+ * @brief print sequence max frames of each layer
+ *
+ * The config file api is maxframe_printer_evaluator.
+ */
+class MaxFramePrinter : public NotGetableEvaluator {
+ private:
+  IVectorPtr maxIds_;
+  MatrixPtr maxValues_;
+  MatrixPtr value_;
+
+ public:
+  MaxFramePrinter() {
+    value_ =
+        Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, false);
+  }
+
+  virtual void eval(const NeuralNetwork& nn) {
+    for (const std::string& name : config_.input_layers()) {
+      const Argument& argu = nn.getLayer(name)->getOutput();
+
+      CHECK_EQ(argu.value->getWidth(), 1LU);
+      size_t numSequences = argu.getNumSequences();
+      const int* starts = argu.sequenceStartPositions->getData(false);
+
+      std::ostringstream os;
+      for (size_t i = 0; i < numSequences; ++i) {
+        size_t offset = starts[i];
+        size_t size = starts[i + 1] - starts[i];
+        value_->setData(argu.value->getData() + offset, 1LU, size);
+
+        size_t height = 1LU;
+        size_t width = std::min((size_t)config_.num_results(), size);
+        IVector::resizeOrCreate(maxIds_, height * width, false);
+        Matrix::resizeOrCreate(maxValues_, height, width, false);
+
+        value_->rowMax(*maxIds_, *maxValues_);
+
+        int* ids = maxIds_->getData();
+        real* values = maxValues_->getData();
+        for (size_t j = 0; j < width; ++j) {
+          os << ids[j] << " : " << values[j] << ", ";
+        }
+        os << "total " << size << " frames" << std::endl;
+      }
+      LOG(INFO) << "layer=" << name << " sequence max frames:\n" << os.str();
+    }
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
+};
+REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
+
+/**
+ * @brief print text according to index matrix and a dictionary.
+ *
+ * There can be multiple input to this layer:
+ * - If there is only one input, the input must be a matrix containing
+ *      the sequence of indices;
+ * - If there are more than one input, the first input should be ids,
+ *      and are interpreted as sample ids.
+ *
+ * The output format will be:
+ *
+ * - sequence without sub-sequence, and there is probability.
+ *
+ *     @code
+ *      id \t prob space_seperated_tokens_from_dictionary_according_to_seq
+ *     @endcode
+ *
+ * - sequence without sub-sequence, and there is not probability.
+ *
+ *     @code
+ *      id \t space_seperated_tokens_from_dictionary_according_to_seq
+ *     @endcode
+ *
+ * - sequence with sub-sequence, and there is not probability.
+ *
+ *     @code
+ *      id \t space_seperated_tokens_from_dictionary_according_to_sub_seq
+ *      \t \t space_seperated_tokens_from_dictionary_according_to_sub_seq
+ *      ...
+ *     @endcode
+ *
+ * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
+ * with maxid (when generating) as an input.
+ *
+ * The config file api is seqtext_printer_evaluator.
+ *
+ */
+class SequenceTextPrinter : public NotGetableEvaluator {
+ private:
+  /// dict_file, which contains a list of tokens
+  std::vector<std::string> dict_;
+  /// result_file, which is the output file
+  std::ofstream os_;
+  /// True/False, to indicate whether to use space to separate output tokens.
+  /// Default is True. No space is added if set to False.
+  bool delimited_;
+  /// store the cpu version of argument.ids
+  std::vector<IVectorPtr> cpuIds_;
+  /// store the probability associated with each sequence
+  std::vector<MatrixPtr> cpuIn_;
+
+ public:
+  SequenceTextPrinter() {}
+
+  virtual void init(const EvaluatorConfig& config) {
+    Evaluator::init(config);
+    if (!config.dict_file().empty()) {
+      loadFileList(config.dict_file(), dict_);
+    }
+
+    os_.open(config.result_file(), std::ofstream::trunc);
+    CHECK(os_.is_open()) << "Failed to open file " << config.result_file();
+    delimited_ = config.delimited();
+  }
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    CHECK_GE(arguments.size(), 1LU);
+    bool hasId = arguments.size() > 1;
+    size_t numSequences = arguments[0].getNumSequences();
+    if (hasId) {
+      CHECK_EQ(arguments[0].ids->getSize(), numSequences)
+          << "first input must be sample id.";
+    }
+    for (size_t i = hasId ? 1 : 0; i < arguments.size(); ++i) {
+      CHECK_EQ((size_t)arguments[i].getNumSequences(), numSequences);
+    }
+
+    auto resizeVector = [](IVectorPtr& dest, const IVectorPtr& src) {
+      if (src && src->useGpu()) {
+        IVector::resizeOrCreate(dest, src->getSize(), false);
+        dest->copyFrom(*src);
+      } else {
+        dest = src;
+      }
+    };
+
+    auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) {
+      if (src && src->useGpu()) {
+        Matrix::resizeOrCreate(
+            dest, src->getHeight(), src->getWidth(), false, false);
+        dest->copyFrom(*src);
+      } else {
+        dest = src;
+      }
+    };
+
+    cpuIds_.resize(arguments.size());
+    cpuIn_.resize(arguments.size());
+    for (size_t i = 0; i < arguments.size(); ++i) {
+      resizeVector(cpuIds_[i], arguments[i].ids);
+      resizeMatrix(cpuIn_[i], arguments[i].in);
+    }
+
+    int* sampleIds = nullptr;
+    if (hasId) {
+      sampleIds = cpuIds_[0]->getData();
+    }
+
+    for (size_t i = 0; i < numSequences; ++i) {
+      os_ << (hasId ? sampleIds[i] : i);
+      for (size_t j = hasId ? 1 : 0; j < arguments.size(); ++j) {
+        int* output = cpuIds_[j]->getData();
+        const int* starts = arguments[j].sequenceStartPositions->getData(false);
+
+        auto seqPrint = [&](int start, int end) {
+          os_ << "\t";
+          for (int k = start; k < end; k++) {
+            int id = output[k];
+            os_ << (delimited_ ? " " : "");
+            if (!dict_.empty()) {
+              CHECK_LT((size_t)id, dict_.size());
+              os_ << dict_[id];
+            } else {
+              os_ << id;
+            }
+          }
+        };
+
+        if (arguments[j].hasSubseq()) {
+          // print sequence with sub-sequence
+          const int* subStarts =
+              arguments[j].subSequenceStartPositions->getData(false);
+          int subSeqId_start = 0;
+          int subSeqId_end = 0;
+          for (size_t k = 0; k < (size_t)arguments[j].getNumSubSequences() + 1;
+               ++k) {
+            if (starts[i] == subStarts[k]) subSeqId_start = k;
+            if (starts[i + 1] == subStarts[k]) subSeqId_end = k;
+          }
+          for (int k = subSeqId_start; k < subSeqId_end; k++) {
+            seqPrint(subStarts[k], subStarts[k + 1]);
+            os_ << std::endl;
+          }
+
+        } else {
+          // print sequence without sub-sequence
+          if (arguments[j].in) {  // beam print
+            real* probs = cpuIn_[j]->rowBuf(i);
+            os_ << std::endl;
+            int start = starts[i];
+            int seqEnd = starts[i + 1];
+            for (size_t k = 0; k < arguments[j].in->getWidth(); ++k) {
+              if (start == seqEnd) {
+                break;
+              }
+              int end = start + output[start] + 2;
+              CHECK_LE(end, seqEnd);
+              CHECK_EQ(output[end - 1], -1);
+              os_ << k << "\t" << probs[k];
+              seqPrint(start + 1, end - 1);
+              os_ << std::endl;
+              start = end;
+            }
+          } else {
+            seqPrint(starts[i], starts[i + 1]);
+          }
+        }
+      }
+      os_ << std::endl;
+    }
+    return 0;
+  }
+};
+REGISTER_EVALUATOR(seq_text_printer, SequenceTextPrinter);
+/**
+ * @brief print classification error.
+ *
+ * The config file api is classification_error_printer_evaluator.
+ */
+class ClassificationErrorPrinter : public ClassificationErrorEvaluator {
+ public:
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {}
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    MatrixPtr errorMat = calcError(arguments);
+
+    std::ostringstream os;
+    errorMat->print(os);
+    LOG(INFO) << "Printer=" << config_.name() << " Classification Error:\n"
+              << os.str();
+
+    if (auto startPos = arguments[0].sequenceStartPositions) {
+      std::ostringstream os;
+      startPos->getVector(false)->print(os, startPos->getSize());
+      LOG(INFO) << "Printer=" << config_.name() << " sequence pos vector:\n"
+                << os.str();
+    }
+    return 0;
+  }
+};
+REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
+
+std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/evaluators/Evaluator.h b/paddle/legacy/gserver/evaluators/Evaluator.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3462819b1244e9f2d1a463cb44e7c550406c000
--- /dev/null
+++ b/paddle/legacy/gserver/evaluators/Evaluator.h
@@ -0,0 +1,510 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/parameter/Argument.h"
+#include "paddle/legacy/pserver/ParameterClient2.h"
+#include "paddle/legacy/utils/ClassRegistrar.h"
+#include "paddle/legacy/utils/Error.h"
+
+namespace paddle {
+
+class NeuralNetwork;
+/**
+ * @def REGISTER_EVALUATOR
+ * @brief Macro for registering evaluator class
+ */
+
+#define REGISTER_EVALUATOR(__type_name, __class_name)                \
+  static InitFunction __reg_type_##__type_name([]() {                \
+    Evaluator::registrar_.registerClass<__class_name>(#__type_name); \
+  })
+/**
+ * @brief Base class for Evaluator
+ * Evaluating the performance of a model is very important.
+ * It indicates how successful the scores(predictions) of a datasets
+ * has been by a trained model.
+ */
+class Evaluator {
+ public:
+  static Evaluator* create(const EvaluatorConfig& config);
+
+  Evaluator() : numSamples_(0), totalScore_(0) {}
+
+  virtual ~Evaluator() {}
+
+  virtual void init(const EvaluatorConfig& config) { config_ = config; }
+
+  /**
+   * @brief start to evaluate some data
+   */
+  virtual void start() {
+    numSamples_ = 0;
+    totalScore_ = 0;
+  }
+
+  /**
+   * @brief Process a batch of data.
+   */
+  virtual void eval(const NeuralNetwork& nn);
+
+  /**
+   * @brief Process a batch of data.
+   * @return the score for the batch if it make sense to sum the score across
+   * batches.
+   * @note Otherwise evaluator should return 0 and override finish() and
+   * printStats() to do the right calculation.
+   */
+  virtual real evalImp(std::vector<Argument>& arguments) = 0;
+
+  /**
+   * @brief Update the number of processed samples
+   */
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
+    numSamples_ += arguments[0].getBatchSize();
+  }
+
+  /// finish() should be called before distributeEval
+  virtual void distributeEval(ParameterClient2* client) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  void mergeResultsOfAllClients(ParameterClient2* client) {
+    double data[2] = {totalScore_, numSamples_};
+    client->reduce(data, data, 2, FLAGS_trainer_id, 0);
+    totalScore_ = data[0];
+    numSamples_ = data[1];
+  }
+
+  /**
+   * @brief finish the evaluation.
+   */
+  virtual void finish() {}
+
+  /**
+   * @brief print the statistics of evaluate result
+   * @note finish() should be called before printStats
+   */
+  virtual void printStats(std::ostream& os) const {
+    os << config_.name() << "="
+       << (numSamples_ ? totalScore_ / numSamples_ : 0);
+  }
+
+  friend std::ostream& operator<<(std::ostream& os,
+                                  const Evaluator& evaluator) {
+    evaluator.printStats(os);
+    return os;
+  }
+
+  friend std::ostream&& operator<<(std::ostream&& os,  // NOLINT
+                                   const Evaluator& evaluator) {
+    evaluator.printStats(os);
+    return std::move(os);
+  }
+
+  static ClassRegistrar<Evaluator> registrar_;
+
+  /**
+   * @brief getNames will return all field names of current evaluator.
+   *
+   * The format of name is `evaluator_name.evaluator_fields`. If the evaluator
+   * has multiple field, the name could be `evaluator_name.field1`. For example
+   * the PrecisionRecallEvaluator contains `precision`, `recall` fields. The get
+   * names will return `precision_recall_evaluator.precision`,
+   * `precision_recall_evaluator.recal`, etc.
+   *
+   * Also, if current Evaluator is a combined evaluator. getNames will return
+   * all names of all evaluators inside the combined evaluator.
+   *
+   * @param names [out]: the field names of current evaluator.
+   * @note Never clear the names parameter inside getNames.
+   */
+  virtual void getNames(std::vector<std::string>* names) {
+    names->push_back(config_.name());
+  }
+
+  /**
+   * @brief getValue will return the current evaluate value of one field.
+   *
+   * @param name: The field name of current evaluator.
+   * @param err [out]: The error state.
+   *
+   * @return The evaluate value(metric).
+   */
+  virtual real getValue(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return .0f;
+    }
+    return this->getValueImpl();
+  }
+
+  /**
+   * @brief getType will return the evaluator type by field name.
+   *
+   * Evaluate Type is the current type of evaluator in string. Such as 'auc',
+   * 'precision_recall'. In combined evaluator, different name may get different
+   * evaluate type because it could be evaluated by different evaluator inside.
+   *
+   * @param name: The field name of current Evaluator.
+   * @param err: The error state. nullptr means don't care.
+   * @return the evaluator type string.
+   */
+  virtual std::string getType(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return std::string();
+    }
+    return this->getTypeImpl();
+  }
+
+ protected:
+  /**
+   * @brief getValueImpl The simplest way to define getValue result. If this
+   * evaluator doesn't contain multiple fields, and do not throw any error, just
+   * implemented this method to get the evaluate result(metric).
+   * @return Evaluate result(metric).
+   */
+  virtual real getValueImpl() const {
+    return numSamples_ != .0 ? totalScore_ / numSamples_ : .0;
+  }
+
+  /**
+   * @brief getTypeImpl The simplest way to define getType result. If this
+   * evaluator doesn't combine many evaluators, the get type should only return
+   * itself type.
+   * @return Evaluator type.
+   */
+  virtual std::string getTypeImpl() const { return "base"; }
+
+ protected:
+  EvaluatorConfig config_;
+  double numSamples_;
+  double totalScore_;
+};
+
+/**
+ * @brief The NotGetableEvaluator class is the base class of evaluator that
+ * cannot get value in runtime. The most NotGetableEvaluator is Printer
+ * Evaluator, which is only used to debug network configuration.
+ */
+class NotGetableEvaluator : public Evaluator {
+  // Evaluator interface
+ public:
+  void getNames(std::vector<std::string>* names) {}
+
+  real getValue(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");
+    return .0f;
+  }
+
+  std::string getType(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");
+    return "";
+  }
+};
+
+class DummyEvaluator : public Evaluator {
+ public:
+  DummyEvaluator() {}
+  virtual void init(const EvaluatorConfig&) {}
+  virtual void start() {}
+  virtual void eval(const NeuralNetwork&) {}
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    (void)arguments;
+    return -1;
+  }
+  virtual void finish() {}
+  virtual void printStats(std::ostream&) const {}
+
+  // Evaluator interface
+ protected:
+  std::string getTypeImpl() const;
+};
+/**
+ * @brief evaluate AUC using colIdx-th column as prediction.
+ * The AUC(Area Under the Curve) is a common evaluation metric
+ * for binary classification problems. It computes the area under
+ * the receiver operating characteristic(ROC) curve.
+ *
+ * @note colIdx-th column
+ *
+ * - colIdx = 0: the 0-th column.
+ * - colIdx > 0: the colIdx-th column.
+ * - colIdx < 0: the last colIdx-th column.
+ *
+ * The config file api is auc_evaluator.
+ *
+ */
+class AucEvaluator : public Evaluator {
+ public:
+  AucEvaluator(int32_t colIdx)
+      : colIdx_(colIdx),
+        realColumnIdx_(0),
+        cpuOutput_(nullptr),
+        cpuLabel_(nullptr),
+        cpuWeight_(nullptr) {}
+
+  virtual void start();
+
+  virtual real evalImp(std::vector<Argument>& arguments);
+
+  virtual void printStats(std::ostream& os) const {
+    os << config_.name() << "=" << calcAuc();
+  }
+
+  virtual void distributeEval(ParameterClient2* client);
+
+ private:
+  static const uint32_t kBinNum_ = (1 << 24) - 1;
+  static const int kNegativeLabel_ = 0;
+  double statPos_[kBinNum_ + 1];
+  double statNeg_[kBinNum_ + 1];
+  int32_t colIdx_;
+  uint32_t realColumnIdx_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+  MatrixPtr cpuWeight_;
+
+  AucEvaluator() {}
+
+  inline static double trapezoidArea(double X1,
+                                     double X2,
+                                     double Y1,
+                                     double Y2) {
+    return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
+  }
+
+  double calcAuc() const;
+
+  // Evaluator interface
+ protected:
+  real getValueImpl() const;
+  std::string getTypeImpl() const;
+};
+
+/**
+ * @brief RankAucEvaluator calculates the AUC of each list (i.e., titles
+ * under the same query), and averages them. Each list should be organized
+ * as a sequence. The inputs of this evaluator is [output, click, pv]. If pv
+ * is not provided, it will be set to 1. The types of click and pv are
+ * dense value.
+ */
+class RankAucEvaluator : public Evaluator {
+ public:
+  // evaluate ranking AUC
+  virtual void start();
+
+  virtual void updateSamplesNum(const std::vector<Argument>& arguments);
+
+  virtual real evalImp(std::vector<Argument>& arguments);
+
+  virtual void distributeEval(ParameterClient2* client) {
+    mergeResultsOfAllClients(client);
+  }
+
+ private:
+  MatrixPtr output_;
+  MatrixPtr click_;
+  MatrixPtr pv_;
+  std::vector<std::pair<real, int>> outputPair_;
+
+  double calcRankAuc(real* outputData,
+                     real* clickData,
+                     real* pvData,
+                     size_t size);
+
+  // Evaluator interface
+ protected:
+  std::string getTypeImpl() const;
+};
+
+/**
+ * @brief precision, recall and f1 score Evaluator
+ * \f[
+ * precision = \frac{tp}{tp+tn} \\
+ * recall=\frac{tp}{tp+fn} \\
+ * f1=2*\frac{precsion*recall}{precision+recall}
+ * \f]
+ *
+ * The config file api is precision_recall_evaluator.
+ */
+class PrecisionRecallEvaluator : public Evaluator {
+ public:
+  // Evaluate precision, recall and F1 score
+  PrecisionRecallEvaluator()
+      : isMultiBinaryLabel_(false),
+        cpuOutput_(nullptr),
+        cpuLabel_(nullptr),
+        cpuWeight_(nullptr) {}
+
+  virtual void start();
+
+  virtual real evalImp(std::vector<Argument>& arguments);
+
+  virtual void printStats(std::ostream& os) const;
+
+  virtual void distributeEval(ParameterClient2* client);
+
+  void getNames(std::vector<std::string>* names);
+
+  real getValue(const std::string& name, Error* err) const;
+
+  std::string getType(const std::string& name, Error* err) const;
+
+  struct StatsInfo {
+    /// numbers of true positives
+    double TP;
+    /// numbers of true negatives
+    double TN;
+    /// numbers of false positives
+    double FP;
+    /// numbers of false negatives
+    double FN;
+
+    StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
+  };
+
+ private:
+  bool isMultiBinaryLabel_;
+  std::vector<StatsInfo> statsInfo_;
+
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+  MatrixPtr cpuWeight_;
+
+  struct PrintStatsInfo {
+    double precision;
+    double recall;
+    double f1;
+    double macroAvgPrecision;
+    double macroAvgRecall;
+    double macroAvgF1Score;
+    double microAvgPrecision;
+    double microAvgRecall;
+    double microAvgF1Score;
+  };
+
+  bool getStatsInfo(PrintStatsInfo* info) const;
+
+  void calcStatsInfo(const MatrixPtr& output,
+                     const IVectorPtr& label,
+                     const MatrixPtr& weight);
+
+  void calcStatsInfoMulti(const MatrixPtr& output,
+                          const MatrixPtr& label,
+                          const MatrixPtr& weight);
+
+  inline static double calcPrecision(double TP, double FP) {
+    if (TP > 0.0 || FP > 0.0) {
+      return TP / (TP + FP);
+    } else {
+      return 1.0;
+    }
+  }
+
+  inline static double calcRecall(double TP, double FN) {
+    if (TP > 0.0 || FN > 0.0) {
+      return TP / (TP + FN);
+    } else {
+      return 1.0;
+    }
+  }
+
+  inline static double calcF1Score(double precision, double recall) {
+    if (precision > 0.0 || recall > 0.0) {
+      return 2 * precision * recall / (precision + recall);
+    } else {
+      return 0;
+    }
+  }
+
+  mutable std::unordered_map<std::string, real> values_;
+
+  void storeLocalValues() const;
+};
+
+/*
+ * @brief positive-negative pair rate Evaluator
+ *
+ * The config file api is pnpair_evaluator.
+ */
+class PnpairEvaluator : public Evaluator {
+ public:
+  PnpairEvaluator()
+      : cpuOutput_(nullptr),
+        cpuLabel_(nullptr),
+        cpuInfo_(nullptr),
+        cpuWeight_(nullptr) {}
+
+  virtual void start();
+  virtual real evalImp(std::vector<Argument>& arguments);
+
+  struct PredictionResult {
+    PredictionResult(real __out, int __label, int __queryid, real __weight)
+        : out(__out), label(__label), queryid(__queryid), weight(__weight) {}
+    real out;
+    int label;
+    int queryid;
+    real weight;
+  };
+  std::vector<PredictionResult> predictArray_;
+  void printPredictResults() {
+    std::ofstream fs(FLAGS_predict_file);
+    CHECK(fs) << "Fail to open " << FLAGS_predict_file;
+    for (auto& res : predictArray_) {
+      fs << res.out << " " << res.label << " " << res.queryid << std::endl;
+    }
+  }
+
+  void stat(size_t start,
+            size_t end,
+            PredictionResult* answers,
+            double& pos,
+            double& neg,
+            double& spe);
+  void calc(std::vector<PredictionResult>& predictArray);
+
+  virtual void finish() { calc(predictArray_); }
+
+  virtual void printStats(std::ostream& os) const {
+    os << " pos/neg=" << this->getValueImpl();
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    client->reduce(pairArray_, pairArray_, kPairArrayNum_, FLAGS_trainer_id, 0);
+    LOG(INFO) << " distribute eval calc total pos pair: " << pairArray_[0]
+              << " calc total neg pair: " << pairArray_[1];
+  }
+
+ private:
+  static const uint32_t kPairArrayNum_ = 2;
+  double pairArray_[kPairArrayNum_];
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+  IVectorPtr cpuInfo_;
+  MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+ protected:
+  real getValueImpl() const {
+    return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
+  }
+  std::string getTypeImpl() const;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1c4034d8bba59dbae0a1059b96ac2b6f18c5971b
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/GradientMachine.cpp
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GradientMachine.h"
+
+#include <fstream>
+#include "paddle/legacy/utils/Logging.h"
+
+#include "NeuralNetwork.h"
+#include "hl_gpu.h"
+
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "GradientMachineMode.h"
+#include "MultiGradientMachine.h"
+#include "MultiNetwork.h"
+#include "ParallelNeuralNetwork.h"
+#endif
+
+namespace paddle {
+
+GradientMachine* GradientMachine::create(
+    const ModelConfig& config,
+    int mode,
+    const std::vector<ParameterType>& parameterTypes) {
+#ifndef PADDLE_MOBILE_INFERENCE
+  if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) {
+    return gm;
+  }
+  if (FLAGS_trainer_count > 1) {
+    return new MultiGradientMachine(config, FLAGS_use_gpu);
+  }
+#endif
+  if (FLAGS_trainer_count == 1) {  // single
+#ifndef PADDLE_MOBILE_INFERENCE
+    NeuralNetwork* nn;
+    if (config.type() == "multi_nn") {
+      /* multi submodel calculate, thread(s) will be initialized inside */
+      nn = new MultiNetwork("root");
+    } else if (FLAGS_parallel_nn) {
+      /* multi threads calculate */
+      nn = new ParallelNeuralNetwork();
+    } else {
+      /* single thread calculate */
+      nn = NeuralNetwork::create(config);
+    }
+#else
+    NeuralNetwork* nn = NeuralNetwork::create(config);
+#endif
+    ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) {
+      para->enableType(PARAMETER_VALUE);
+    };
+    nn->init(
+        config, mode == kTesting ? testParamInitCb : nullptr, parameterTypes);
+    return nn;
+  }
+  LOG(FATAL) << "Unknown model type: " << config.type();
+  return nullptr;
+}
+
+void GradientMachine::saveParameters(const std::string& dir) const {
+  LOG(INFO) << "Saving parameters to " << dir;
+
+  for (auto& para : parameters_) {
+    std::string filename = dir + "/" + para->getName();
+    if (para->isFullSize()) {
+      para->save(filename);
+    }
+  }
+}
+
+void GradientMachine::loadParameters(const std::string& dir) {
+  LOG(INFO) << "Loading parameters from " << dir;
+
+  for (auto& para : parameters_) {
+    std::string filename = dir + "/" + para->getName();
+    if (para->isFullSize()) {
+      para->load(filename);
+    }
+  }
+}
+
+void GradientMachine::randParameters() {
+  LOG(INFO) << "Initing parameters..";
+
+  for (auto& para : parameters_) {
+    if (para->isFullSize()) {
+      para->randomize();
+    }
+  }
+  LOG(INFO) << "Init parameters done.";
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachine.h b/paddle/legacy/gserver/gradientmachines/GradientMachine.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4f754a9f4dc3175f5000774c77a0e7334df7d85
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/GradientMachine.h
@@ -0,0 +1,250 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "ModelConfig.pb.h"
+#include "TrainerConfig.pb.h"
+#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
+#include "paddle/legacy/gserver/layers/Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
+#include "paddle/legacy/utils/Thread.h"
+
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "paddle/legacy/gserver/evaluators/Evaluator.h"
+#endif
+
+namespace paddle {
+/**
+ * @brief A gradient machine is capable of calculating some outputs given
+ *        some inputs and performing gradient calculation based on the
+ *        derivative from the outputs.
+ *
+ * A gradient machine can be either a full neural network or part of a neural
+ * network.
+ *
+ * Usage for training:
+ *
+ *  1. Prepare inArgs. Put your input data into inArgs[i].value.
+ *
+ *  2. Call forward(inArgs, &outArgs)
+ *
+ *  3. Calculate gradient with respect to outArgs[i]->value
+ *     and fill them into outArgs[i]->grad.
+ *     This step can be skipped if your the outputs are from cost layers.
+ *
+ *  4. Call backward(). After backward, gradient of each parameter is
+ *     accumulated to getParameters()[i]->getBuf(PARAMETER_GRADIENT)
+ *
+ *  5. Update parameter value getParameters()[i]->getBuf(PARAMETER_VALUE) using
+ *     gradients.
+ *
+ *  6. Clear gradients to zero.
+ *
+ * Usage for prediction:
+ *
+ *  1. Prepare inArgs. Put your input data into inArgs[i].value.
+ *
+ *  2. Call forward(inArgs, &outArgs)
+ *
+ *  3. Obtain the prediction result from outArgs[i]
+ */
+
+typedef std::vector<LayerStatePtr> MachineState;
+
+class GradientMachine;
+
+typedef std::shared_ptr<GradientMachine> GradientMachinePtr;
+
+class GradientMachine {
+ public:
+  enum CreateMode {
+    kNormal = 0,
+    kSgdSparseCpuTraining = 3,
+    kTesting = 4,
+    kCustom = 10
+  };
+
+  /**
+   * Create a gradient machine from ModelConfig
+   * Parameter will have parameterTypes
+   */
+  static GradientMachine* create(
+      const ModelConfig& config,
+      int mode = kNormal,
+      const std::vector<ParameterType>& parameterTypes =
+          std::vector<ParameterType>{
+              PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM});
+
+  virtual ~GradientMachine() {}
+
+  /**
+   * Prefetch row ids of sparse parameter.
+   */
+  virtual void prefetch(const std::vector<Argument>& inArgs) { (void)inArgs; }
+
+  /**
+   * @brief Forward propagation.
+   *
+   * Calculate outputs (outArgs) based the inputs (inArgs)
+   *
+   * @note: if passType==PASS_TEST, then backward() should not be called
+   */
+  virtual void forward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType) = 0;
+
+  /**
+   * @brief Backward propagation.
+   *
+   * Calculate the gradient of inArgs and parameter.
+   *
+   * This function should only be called after a corresponding forward() call.
+   * The caller is responsible for filling the correct grad for the outArgs
+   * obtained using forward().
+   *
+   * It may also change the grad field for the inArgs supplied at forward()
+   */
+  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
+
+  /**
+   * Combine forward() and backward(). For multithread training, this
+   * may be faster.
+   *
+   * @note: passType PASS_TEST is not allowed for forwardBackward().
+   */
+  virtual void forwardBackward(const std::vector<Argument>& inArgs,
+                               std::vector<Argument>* outArgs,
+                               PassType passType,
+                               const UpdateCallback& callback = nullptr) {
+    forward(inArgs, outArgs, passType);
+    backward(callback);
+  }
+
+  virtual Argument getLayerOutput(const std::string& layerName) = 0;
+
+  // see comment in Layer.h for the function with the same name
+  virtual void resetState() {}
+
+  // set machine state
+  virtual void setState(const MachineState& machineState) {}
+
+  // save machine state
+  virtual void getState(MachineState& machineState) {}
+
+  virtual void onPassEnd() = 0;
+
+#ifndef PADDLE_MOBILE_INFERENCE
+  /**
+   * Create an evaluator which can be used for eval()
+   */
+  virtual Evaluator* makeEvaluator() const = 0;
+
+  /**
+   * evaluate using the given evaluator
+   */
+  virtual void eval(Evaluator* evaluator) const = 0;
+#endif
+
+  std::vector<ParameterPtr>& getParameters() { return parameters_; }
+
+  std::vector<ParameterPtr>& getNonStaticParameters() {
+    if (nonStaticParameters_.empty()) {
+      for (auto para : parameters_) {
+        if (!para->isStatic()) {
+          nonStaticParameters_.push_back(para);
+        }
+      }
+    }
+    return nonStaticParameters_;
+  }
+
+  inline bool hasStaticParameters() {
+    return parameters_.size() != getNonStaticParameters().size();
+  }
+
+  /**
+   * @brief   Used before formal training, start work-threads and set
+   *          trainer Parameters;
+   *
+   * @note    This function will only been implemented and used in a
+   *          multithreaded environment.
+   */
+  virtual void start() {}
+
+  /**
+   * @brief   check  each work-thread whether is failed/error/finish,
+   *          if not, return ture, and yes return false.
+   *
+   * @note    This function will only been implemented and used in a
+   *          multithreaded environment.
+   */
+  virtual void finish() {}
+
+  /**
+   * @brief   set the training status a "finished" value, the sub_work_threads
+   *          will option the change, and then exit.
+   *
+   * @note    This function will only been implemented and used in a
+   *          multithreaded environment.
+   */
+  virtual bool trainIsOn() { return true; }
+
+  /**
+   * @brief   when all or some of the sub-workThreads are suspended to waiting
+   *          controller's instructions, and after some processing done in the
+   *          controller, it will call this function to wake up all the pending
+   *          thread.
+   *
+   * @note    This function will only been implemented and used in a
+   *          multithreaded environment.
+   */
+  virtual void restart() {}
+
+  /// Set the gradient of the output from outside.
+  virtual void setOutputGrad(const std::vector<Argument>& args) {
+    LOG(FATAL) << "Not implemented!";
+  }
+
+  void saveParameters(const std::string& dir) const;
+
+  void loadParameters(const std::string& dir);
+
+  void randParameters();
+
+  virtual void getStats(real& cost, int64_t& numProcessed) {
+    (void)cost;
+    (void)numProcessed;
+  }
+
+  /**
+   * @brief   Release the middle layer's output memory.
+   *
+   * @note    This function is used for memory optimization in inference.
+   */
+  virtual void releaseOutput() {}
+
+ protected:
+  virtual void onLoadParameter() {}
+
+  std::vector<ParameterPtr> parameters_;
+  std::vector<ParameterPtr> nonStaticParameters_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.cpp b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
similarity index 100%
rename from paddle/gserver/gradientmachines/GradientMachineMode.cpp
rename to paddle/legacy/gserver/gradientmachines/GradientMachineMode.cpp
diff --git a/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd944a35f8952e354f8e4f3eb5c67b136c5f080e
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/GradientMachineMode.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "GradientMachine.h"
+#include "unordered_map"
+
+namespace paddle {
+
+class IGradientMachineMode {
+ public:
+  virtual ~IGradientMachineMode() {}
+
+ public:  // interfaces
+          /**
+           * @brief create current mode's gradient machine by model config.
+           * @param config model config
+           */
+  virtual GradientMachine* create(const ModelConfig& config) = 0;
+
+  /**
+   * @brief shouldBeMe the current mode of GradientMachine should be this mode.
+   * @param algo training algorithm name.
+   * @param trainerCount trainer count.
+   * @param isLocal is local mode (without pserver)
+   * @param isGpu is using gpu.
+   * @return true if mode should be this mode.
+   */
+  virtual bool shouldBeMe(const std::string& algo,
+                          size_t trainerCount,
+                          bool isLocal,
+                          bool isGpu) const = 0;
+
+  /**
+   * @brief Is data must be in cpu even if using gpu mode.
+   * @param trainerCount trainer count
+   * @return true if data must be gpu.
+   */
+  virtual bool isDataMustInCpu(size_t trainerCount) const = 0;
+
+  /**
+   * @brief Need not to use mini-batch method, and should train all data in one
+   * batch in one pass.
+   */
+  virtual bool needTrainWholeDataInOneBatch() const = 0;
+
+ public:  // static methods.
+          /**
+           * @brief register a custom gradient machine mode.
+           * @note For user to register a custom gradient machine mode, id should >=
+           * kCustom.
+           * @param mode mode id.
+           * @param ptr mode description object.
+           */
+  static void regGradientMachineMode(
+      int32_t mode, std::unique_ptr<IGradientMachineMode>&& ptr) {
+    modes_.insert(std::make_pair(mode, std::move(ptr)));
+  }
+
+  /**
+   * @brief get custom mode from mode id.
+   * @param mode mode id
+   * @return mode description object.
+   */
+  static IGradientMachineMode* mode(int32_t mode) {
+    if (modes_.find(mode) != modes_.end()) {
+      return modes_[mode].get();
+    } else {
+      return nullptr;
+    }
+  }
+
+  /**
+   * @brief helper function to test trainWholeDataInOneBatch or not for mode
+   */
+  static bool trainWholeDataInOneBatch(int32_t mode) {
+    if (modes_.find(mode) != modes_.end()) {
+      return modes_[mode]->needTrainWholeDataInOneBatch();
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * @brief Try to get custom mode if we can.
+   * @param [out] mode the custom mode id.
+   * @param [in] algo algorithm name
+   * @param [in] trainerCount trainer count.
+   * @param [in] isLocal is local or not
+   * @param [in] isGpu using gpu or not.
+   * @return true if there is a custom mode fit these conditions.
+   */
+  static bool tryGetMode(int* mode,
+                         const std::string& algo,
+                         int32_t trainerCount,
+                         bool isLocal,
+                         bool isGpu) {
+    for (auto it = modes_.begin(); it != modes_.end(); ++it) {
+      if (it->second->shouldBeMe(algo, trainerCount, isLocal, isGpu)) {
+        *mode = it->first;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * @brief helper function for data must in cpu
+   */
+  static bool dataMustInCpu(int32_t mode, size_t trainerCount) {
+    if (modes_.find(mode) != modes_.end()) {
+      return modes_[mode]->isDataMustInCpu(trainerCount);
+    } else {
+      // provide data to cpu if using synchronized multi-gpu gradient machine.
+      return trainerCount > 1;
+    }
+  }
+
+  /**
+   * @brief try to create gradient machine by mode & config.
+   * @return nullptr if we cannot create a gradient machine by such mode.
+   */
+  static GradientMachine* tryCreateGradientMachine(int32_t mode,
+                                                   const ModelConfig& config) {
+    auto m = IGradientMachineMode::mode(mode);
+    if (m) {
+      return m->create(config);
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  static std::unordered_map<int32_t, std::unique_ptr<IGradientMachineMode>>
+      modes_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..637686e443ceb740d52d42524246ae48a85d52f0
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -0,0 +1,894 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MultiGradientMachine.h"
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include "paddle/legacy/utils/Stat.h"
+
+#include "NeuralNetwork.h"
+#include "ParallelNeuralNetwork.h"
+
+DEFINE_bool(allow_only_one_model_on_one_gpu,
+            true,
+            "If true, do not allow multiple models on one GPU device");
+
+namespace paddle {
+
+// get types of the parameters which need to be merged after backward()
+static void fillMergeTypes(PassType passType,
+                           std::vector<ParameterType>* mergeTypes) {
+  mergeTypes->clear();
+  if (passType != PASS_TEST) {
+    mergeTypes->push_back(PARAMETER_GRADIENT);
+  }
+}
+
+MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
+                                           bool useGpu)
+    : useGpu_(useGpu),
+      trainerBarrier_(FLAGS_trainer_count),
+      allBarrier_(FLAGS_trainer_count + 1),
+      inArgsCopied_(false) {
+  isPassGrad_ = false;
+  numThreads_ = FLAGS_trainer_count;
+  if (useGpu) {
+    //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu,
+    //! the hl_get_device_count will get an error result. It seems should return
+    //! 0 when hppl is not compiled as gpu version.
+    numDevices_ = hl_get_device_count();
+  } else {
+    numDevices_ = 0;
+  }
+  ParamInitCallback mainParamInitCb = [](int paramId, Parameter* para) {
+    // only create buf for CPU parameters
+    // GPU parameters will be created in each thread
+    if (para->useGpu()) return;
+
+    if (para->isSparseRemoteUpdate()) {
+      para->enableType(PARAMETER_VALUE,
+                       FLAGS_loadsave_parameters_in_pserver
+                           ? Parameter::MAT_SPARSE_ROW_PREFETCH
+                           : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE);
+      para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW);
+    } else if (para->isGradSparseUpdate()) {
+      para->enableType(PARAMETER_VALUE);
+      para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_IDS);
+      SparseRowIdsCpuMatrix* mat = dynamic_cast<SparseRowIdsCpuMatrix*>(
+          para->getMat(PARAMETER_GRADIENT).get());
+      mat->setNumOfThreads(FLAGS_trainer_count);
+    } else if (para->isValueShared()) {
+      para->enableType(PARAMETER_VALUE, Parameter::MAT_VALUE_SHARED);
+      if (!para->isStatic()) {
+        para->enableType(PARAMETER_GRADIENT);
+      }
+    } else {
+      para->enableType(PARAMETER_VALUE);
+      if (!para->isStatic()) {
+        para->enableType(PARAMETER_GRADIENT);
+      }
+    }
+  };
+
+  NeuralNetwork* nn = NeuralNetwork::create(config);
+  nn->init(config, mainParamInitCb);
+  gradientMachine_.reset(nn);
+  parameters_ = gradientMachine_->getParameters();
+
+  numLogicalDevices_ = 0;
+  if (useGpu_) {
+    numLogicalDevices_ = 1;
+
+    for (size_t pid = 0; pid < parameters_.size(); pid++) {
+      if (parameters_[pid]->getConfig().device() + 1 > numLogicalDevices_) {
+        numLogicalDevices_ = parameters_[pid]->getConfig().device() + 1;
+      }
+    }
+    LOG(INFO) << "numLogicalDevices=" << numLogicalDevices_
+              << " numThreads=" << numThreads_ << " numDevices=" << numDevices_;
+
+    if (numLogicalDevices_ * numThreads_ > numDevices_ &&
+        FLAGS_allow_only_one_model_on_one_gpu) {
+      LOG(FATAL) << "trainer_count * num_devices_in_model "
+                 << "(" << numThreads_ << "*" << numLogicalDevices_ << ")"
+                 << "=" << numThreads_ * numLogicalDevices_
+                 << " exceeds number of GPU devices(" << numDevices_ << ")";
+    }
+    numLogicalDevices_ = std::min(numLogicalDevices_, numDevices_);
+
+    /* Enables direct access to memory allocations on a peer device */
+    for (int i = 0; i < numThreads_; i++) {
+      for (int d = 0; d < numLogicalDevices_; ++d) {
+        enablePeerAccess(logicalDeviceId2RealDeviceId(d, i),
+                         logicalDeviceId2RealDeviceId(d, i + 1));
+        enablePeerAccess(logicalDeviceId2RealDeviceId(d, i),
+                         logicalDeviceId2RealDeviceId(d, i - 1));
+      }
+    }
+  }
+
+  for (int i = 0; i < numThreads_; ++i) {
+    threads_.emplace_back(new TrainerThread(config, i, this));
+  }
+
+  bufferSizes_.resize(numLogicalDevices_, 0);
+  paraMainThread_.reserve(parameters_.size());
+  int pid = 0;
+  for (auto& para : parameters_) {
+    if (para->isStatic() || !para->useGpu()) {
+      paraMainThread_.push_back(0);
+    } else {
+      int end = pid++ % numThreads_;
+      paraMainThread_.push_back(end);
+      int paraDeviceId = para->getDeviceId();
+      if (paraDeviceId == -1) paraDeviceId = 0;
+      paraDeviceId = paraDeviceId % numLogicalDevices_;
+      if (para->getSize() > bufferSizes_[paraDeviceId]) {
+        bufferSizes_[paraDeviceId] = para->getSize();
+        VLOG(1) << "bufferSize[" << paraDeviceId << "]" << para->getSize();
+      }
+    }
+  }
+
+  // TODO(xuwei06) Instead of using maximal buffer size, we may use a smaller
+  // fixed buffer size and use pipeline to dispatch parameter value and merge
+  // parameter gradient, which may be faster.
+
+  // combination of all trainers mainPara into GradientMachine parameters
+  hasNonstaticCpuParamters_ = false;
+  for (size_t pid = 0; pid < parameters_.size(); pid++) {
+    if (parameters_[pid]->useGpu()) {
+      parameters_[pid] = threads_[paraMainThread_[pid]]->getParameters()[pid];
+    } else if (!parameters_[pid]->isStatic()) {
+      hasNonstaticCpuParamters_ = true;
+    }
+  }
+
+  gradBufs_.resize(numThreads_);
+  for (int i = 0; i < numThreads_; ++i) {
+    gradBufs_[i].resize(numLogicalDevices_);
+    for (int d = 0; d < numLogicalDevices_; ++d) {
+      gradBufs_[i][d].sem.post();
+    }
+  }
+
+  outArgStream_ = HPPL_STREAM_1;
+
+  start();
+}
+
+void MultiGradientMachine::start() {
+  for (auto& thread : threads_) {
+    thread->start();
+  }
+}
+
+void MultiGradientMachine::finish() {
+  for (auto& thread : threads_) {
+    thread->stop();
+  }
+}
+
+std::vector<const std::vector<ParameterPtr>*>
+MultiGradientMachine::getSlaveParameters() {
+  std::vector<const std::vector<ParameterPtr>*> vec;
+  vec.reserve(threads_.size());
+  for (auto& thread : threads_) {
+    vec.push_back(&thread->getParameters());
+  }
+  return vec;
+}
+
+void MultiGradientMachine::notifyGradientTransfer(int paramId) {
+  gradQueue_.enqueue(paramId);
+}
+
+void MultiGradientMachine::allocGradBufs() {
+  if (numLogicalDevices_ == 0) return;
+  if (gradBufs_[0][0].bufs.size() >= mergeTypes_.size()) return;
+
+  for (int i = 0; i < numThreads_; i++) {
+    for (int d = 0; d < numLogicalDevices_; ++d) {
+      if (bufferSizes_[d] == 0) continue;
+      SetDevice device(logicalDeviceId2RealDeviceId(d, i));
+      for (size_t j = 0; j < mergeTypes_.size(); j++) {
+        gradBufs_[i][d].bufs.push_back(
+            Vector::create(bufferSizes_[d], /* useGpu= */ true));
+      }
+    }
+  }
+}
+
+void MultiGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
+  // Each gradient machine in threads needs to do prefetch on its own
+  // part of inArgs. So we need to first divide inArgs to each thread
+  inArgs_ = inArgs;
+  startTask(TASK_COPY_IN_ARGS);
+
+  for (auto& para : parameters_) {
+    if (para->isSparseRemoteUpdate()) {
+      auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
+          para->getMat(PARAMETER_VALUE).get());
+      mat->clearIndices();
+    }
+  }
+
+  waitForCopyInArgs();
+
+  // Because SparsePrefetchRowCpuMatrix can only be changed by ONE thread
+  // at one time, we need to do prefetch sequentially
+  for (auto& thread : threads_) {
+    thread->prefetch();
+  }
+
+  for (auto& para : parameters_) {
+    if (para->isSparseRemoteUpdate()) {
+      auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
+          para->getMat(PARAMETER_VALUE).get());
+      mat->setupIndices();
+      auto matGrad = dynamic_cast<SparseRowCpuMatrix*>(
+          para->getMat(PARAMETER_GRADIENT).get());
+      matGrad->reserveStore();
+    }
+  }
+}
+
+void MultiGradientMachine::forward(const std::vector<Argument>& inArgs,
+                                   std::vector<Argument>* outArgs,
+                                   PassType passType) {
+  forwardImp(inArgs, outArgs, passType, TASK_FORWARD);
+}
+
+void MultiGradientMachine::forwardImp(const std::vector<Argument>& inArgs,
+                                      std::vector<Argument>* outArgs,
+                                      PassType passType,
+                                      TaskType taskType) {
+  updateThreadParameters();
+  passType_ = passType;
+
+  if (!inArgsCopied_) {
+    inArgs_ = inArgs;
+    inArgsCopied_ = false;
+  }
+
+  fillMergeTypes(passType, &mergeTypes_);
+  allocGradBufs();
+  startTask(taskType);
+
+  getOutArgs(outArgs, passType);
+}
+
+void MultiGradientMachine::backward(const UpdateCallback& callback) {
+  backwardCallback_ = callback;
+  startTask(TASK_BACKWARD);
+  backwardImp(callback);
+}
+
+void MultiGradientMachine::forwardBackward(const std::vector<Argument>& inArgs,
+                                           std::vector<Argument>* outArgs,
+                                           PassType passType,
+                                           const UpdateCallback& callback) {
+  backwardCallback_ = callback;
+  forwardImp(inArgs, outArgs, passType, TASK_FORWARD_BACKWARD);
+  backwardImp(callback);
+}
+
+Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) {
+  std::vector<Argument> args;
+  args.reserve(threads_.size());
+
+  for (auto& thread : threads_) {
+    args.push_back(thread->getGradientMachine()->getLayerOutput(layerName));
+  }
+  outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_);
+
+  return outLayerArgs_;
+}
+
+void MultiGradientMachine::backwardImp(const UpdateCallback& callback) {
+  for (size_t i = 0; i < parameters_.size(); i++) {
+    if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue;
+    REGISTER_TIMER("controller_dequeue");
+    gradQueue_.dequeue();
+  }
+  if (hasNonstaticCpuParamters()) {
+    waitAfterMerge();
+    if (backwardCallback_) {
+      for (auto& para : parameters_) {
+        if (!para->useGpu() && !para->isStatic()) {
+          backwardCallback_(para.get());
+        }
+      }
+    }
+  }
+}
+
+void MultiGradientMachine::updateThreadParameters() {
+  for (size_t pid = 0; pid < parameters_.size(); ++pid) {
+    if (!parameters_[pid]->useGpu()) continue;
+    if (!parameters_[pid]->isValueUpdated()) continue;
+    parameters_[pid]->clearValueUpdated();
+    for (int i = 0; i < (int)threads_.size(); i++) {
+      threads_[i]->incUpdateCounter();
+    }
+    // NotifyValueReady should happen after that all threads' incUpdateCounter()
+    // are called so that the counters are correct when notifyValueReady()
+    // is called.
+    threads_[paraMainThread_[pid]]->notifyValueReady(pid);
+  }
+}
+
+void MultiGradientMachine::onPassEnd() {
+  for (auto& thread : threads_) {
+    thread->onPassEnd();
+  }
+}
+
+Evaluator* MultiGradientMachine::makeEvaluator() const {
+  return threads_[0]->getGradientMachine()->makeEvaluator();
+}
+
+void MultiGradientMachine::eval(Evaluator* evaluator) const {
+  for (auto& thread : threads_) {
+    SetDevice device(thread->getDeviceId());
+    if (thread->hasInputData()) {
+      thread->getGradientMachine()->eval(evaluator);
+    }
+  }
+}
+
+void MultiGradientMachine::getOutArgs(std::vector<Argument>* outArgs,
+                                      PassType passType) {
+  for (auto& thread : threads_) {
+    REGISTER_TIMER("waitOutArgs");
+    thread->waitOutArgsReady();
+  }
+
+  outArgs_.resize(threads_[threads_.size() - 1]->getOutArgs().size());
+
+  REGISTER_TIMER("copyOutArgs");
+  for (size_t i = 0; i < outArgs_.size(); ++i) {
+    std::vector<Argument> args;
+    args.reserve(threads_.size());
+    for (auto& thread : threads_) {
+      // If the thread input is empty, then the output is empty.
+      auto tmp = thread->getOutArgs();
+      if (tmp.size() > 0) {
+        args.push_back(tmp[i]);
+      }
+    }
+    outArgs_[i].concat(args, useGpu_, outArgStream_, passType);
+  }
+
+  if (useGpu_) {
+    hl_stream_synchronize(outArgStream_);
+  }
+
+  *outArgs = outArgs_;
+}
+
+void MultiGradientMachine::setOutputGrad(const std::vector<Argument>& args) {
+  CHECK_EQ(args.size(), outArgs_.size());
+  for (size_t i = 0; i < args.size(); i++) {
+    outArgs_[i].grad = args[i].grad;
+  }
+}
+
+void MultiGradientMachine::startTask(TaskType taskType) {
+  taskType_ = taskType;
+  for (auto& thread : threads_) {
+    thread->notifyTaskReady();
+  }
+}
+
+TrainerThread::TrainerThread(const ModelConfig& config,
+                             int threadId,
+                             MultiGradientMachine* multiMachine)
+    : multiMachine_(multiMachine),
+      config_(config),
+      threadId_(threadId),
+      inArgsCopied_(false) {
+  int numThreads = multiMachine->getNumThreads();
+
+  auto& mainParas = multiMachine->getParameters();
+
+  using std::placeholders::_1;
+  using std::placeholders::_2;
+
+  partnerId_ = mod(threadId_ - 1, numThreads);
+
+  deviceId_ = !multiMachine_->useGpu()
+                  ? -1
+                  : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_);
+  SetDevice gpuDevice(deviceId_);
+
+  NeuralNetwork* nn = nullptr;
+  if (!multiMachine->useGpu() || !FLAGS_parallel_nn) {
+    nn = NeuralNetwork::create(config);
+  } else {
+    nn = new ParallelNeuralNetwork();
+    for (auto& paraConfig : *config_.mutable_parameters()) {
+      if (paraConfig.device() != -1) {
+        paraConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId(
+            paraConfig.device(), threadId_));
+      }
+    }
+    for (auto& layerConfig : *config_.mutable_layers()) {
+      if (layerConfig.device() != -1) {
+        layerConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId(
+            layerConfig.device(), threadId_));
+      }
+    }
+  }
+  // Only GPU do not share parameter values with main paramters.
+  ParamInitCallback slaveParamInitCb =
+      std::bind(parameterInitNN, _1, _2, &mainParas);
+  nn->init(config_, slaveParamInitCb);
+  gradientMachine_.reset(nn);
+  parameters_ = gradientMachine_->getParameters();
+  if (!FLAGS_parallel_nn) {
+    for (auto& para : parameters_) {
+      para->setDevice(deviceId_);
+    }
+  }
+
+  backwardCallback_ =
+      std::bind(&TrainerThread::backwardCallback, this, std::placeholders::_1);
+
+  gradStream_ = HPPL_STREAM_2;
+  valueStream_ = HPPL_STREAM_3;
+  stopping_ = true;
+  updateCounter_ = 0;
+  parameterUpdated_ = false;
+}
+
+TrainerThread::~TrainerThread() { stop(); }
+
+void TrainerThread::start() {
+  if (!stopping_) return;
+
+  stopping_ = false;
+
+  gradientMachine_->start();
+
+  computeThread_.reset(new std::thread([this]() { computeThread(); }));
+
+  if (multiMachine_->useGpu()) {
+    gradCollectThread_.reset(
+        new std::thread([this]() { gradCollectThread(); }));
+
+    valueDispatchThread_.reset(
+        new std::thread([this]() { valueDispatchThread(); }));
+
+    copyThread_.reset(new std::thread([this]() { copyGradToBufferThread(); }));
+  }
+}
+
+void TrainerThread::stop() {
+  if (stopping_) return;
+
+  stopping_ = true;
+
+  if (computeThread_) {
+    taskReadySem_.post();
+    computeThread_->join();
+  }
+  if (gradCollectThread_) {
+    gradQueue_.enqueue(0);
+    gradCollectThread_->join();
+  }
+  if (copyThread_) {
+    gradBufQueue_.enqueue(0);
+    copyThread_->join();
+  }
+  if (valueDispatchThread_) {
+    valueReadyQueue_.enqueue(0);
+    valueDispatchThread_->join();
+  }
+}
+
+void TrainerThread::computeThread() {
+  VLOG(1) << "gradComputeThread " << threadId_;
+
+  if (deviceId_ >= 0) {
+    hl_init(deviceId_);
+  }
+
+  while (true) {
+    {
+      REGISTER_TIMER("taskSem_wait");
+      taskReadySem_.wait();
+    }
+
+    if (stopping_) break;
+
+    switch (multiMachine_->getTaskType()) {
+      case MultiGradientMachine::TASK_FORWARD_BACKWARD:
+        forward();
+        backward();
+        break;
+      case MultiGradientMachine::TASK_FORWARD:
+        forward();
+        break;
+      case MultiGradientMachine::TASK_BACKWARD:
+        backward();
+        break;
+      case MultiGradientMachine::TASK_COPY_IN_ARGS:
+        batchSize_ = copyInArgs();
+        inArgsCopied_ = true;
+        multiMachine_->waitForCopyInArgs();
+        break;
+    }
+  }
+}
+
+void TrainerThread::prefetch() {
+  SetDevice setDevice(deviceId_);
+  gradientMachine_->prefetch(inArgs_);
+}
+
+void TrainerThread::forward() {
+  if (!inArgsCopied_) {
+    REGISTER_TIMER("copyInArgs");
+    batchSize_ = copyInArgs();
+  } else {
+    inArgsCopied_ = false;
+  }
+
+  if (multiMachine_->getPassType() != PASS_TEST) {
+    REGISTER_TIMER("clearGradient");
+    // For main parameter, the user of MultiGpuSyncMachine is responsible
+    // for setting the gradient to zero
+    for (size_t i = 0; i < parameters_.size(); i++) {
+      if (parameters_[i]->useGpu()) {
+        if (multiMachine_->paraMainThread(i) != threadId_) {
+          SetDevice device(parameters_[i]->getDeviceId());
+          parameters_[i]->clearGradient();
+        }
+      } else {
+        parameters_[i]->clearGradient();
+      }
+    }
+  }
+
+  {
+    REGISTER_TIMER("wait_value");
+    valueReadyCond_.wait([this]() { return !parameterUpdated_; });
+  }
+
+  { fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); }
+
+  {
+    REGISTER_TIMER("thread_forward");
+    if (batchSize_ > 0) {
+      gradientMachine_->forward(
+          inArgs_, &outArgs_, multiMachine_->getPassType());
+    } else {
+      outArgs_.clear();
+    }
+  }
+  outArgsReadySem_.post();
+}
+
+void TrainerThread::backward() {
+  REGISTER_TIMER("thread_backward");
+  if (multiMachine_->isPassGrad()) {
+    copyOutputGrad();
+  }
+  if (batchSize_ > 0) {
+    gradientMachine_->backward(backwardCallback_);
+  } else {
+    for (size_t i = parameters_.size(); i > 0; i--) {
+      backwardCallback(parameters_[i - 1].get());
+    }
+  }
+  if (multiMachine_->hasNonstaticCpuParamters()) {
+    mergeCpuGradients();
+  }
+}
+
+void TrainerThread::backwardCallback(Parameter* para) {
+  // CPU parameters are merged in the end
+  if (!para->useGpu() || para->isStatic()) return;
+
+  int paramId = para->getID();
+  if (multiMachine_->getNumThreads() == 1) {
+    // no need to do merge if there is only one thread
+    doCallback(paramId);
+  } else if (threadId_ == mod(multiMachine_->paraMainThread(paramId) - 1,
+                              multiMachine_->getNumThreads())) {
+    notifyCopyGradToBuffer(paramId);
+  } else {
+    notifyGradientCollect(paramId);
+  }
+}
+
+void TrainerThread::copyGradToBufferThread() {
+  VLOG(1) << "copyGradToBufferThread " << threadId_;
+
+  if (deviceId_ >= 0) {
+    hl_init(deviceId_);
+  }
+  auto& partnerThread = multiMachine_->getThread(partnerId_);
+  auto& gradBufs = multiMachine_->getGradBuf(partnerId_);
+
+  while (true) {
+    int pid = gradBufQueue_.dequeue();
+    if (stopping_) break;
+
+    int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
+        parameters_[pid]->getDeviceId(), threadId_);
+
+    auto& gradBuf = gradBufs[pdeviceId];
+
+    {
+      REGISTER_TIMER("waitBufferReady");
+      gradBuf.sem.wait();
+    }
+
+    {
+      REGISTER_TIMER("copyGradToBuffer");
+      SetDevice setDevice(parameters_[pid]->getDeviceId());
+      for (size_t i = 0; i < mergeTypes_.size(); ++i) {
+        gradBuf.bufs[i]->resize(
+            parameters_[pid]->getBuf(mergeTypes_[i])->getSize());
+        gradBuf.bufs[i]->copyFrom(*parameters_[pid]->getBuf(mergeTypes_[i]),
+                                  gradStream_);
+      }
+      hl_stream_synchronize(gradStream_);
+    }
+    partnerThread->notifyGradientCollect(pid);
+  }
+}
+
+void TrainerThread::gradCollectThread() {
+  VLOG(1) << "gradCollectThread " << threadId_;
+
+  if (deviceId_ >= 0) {
+    hl_init(deviceId_);
+  }
+
+  std::vector<size_t> gradReadyCount(parameters_.size(), 0);
+
+  auto& gradBufs = multiMachine_->getGradBuf(threadId_);
+
+  while (true) {
+    int pid = gradQueue_.dequeue();
+    if (stopping_) break;
+
+    if (++gradReadyCount[pid] < 2) continue;
+    gradReadyCount[pid] = 0;
+    int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId(
+        parameters_[pid]->getDeviceId(), threadId_);
+
+    auto& gradBuf = gradBufs[pdeviceId];
+
+    {
+      REGISTER_TIMER("mergeGrad");
+      for (size_t i = 0; i < mergeTypes_.size(); ++i) {
+        ParameterType type = mergeTypes_[i];
+        const VectorPtr& localGrad = parameters_[pid]->getBuf(type);
+        SetDevice setDevice(parameters_[pid]->getDeviceId());
+        localGrad->add(*gradBuf.bufs[i]);
+      }
+    }
+
+    gradBuf.sem.post();
+
+    if (multiMachine_->paraMainThread(pid) == threadId_) {
+      doCallback(pid);
+    } else {
+      notifyCopyGradToBuffer(pid);
+    }
+  }
+}
+
+void TrainerThread::doCallback(int pid) {
+  REGISTER_TIMER("callback");
+  auto& gpuThreads = multiMachine_->getAllThreads();
+  if (multiMachine_->getBackwardCallback()) {
+    // The callback supplied by the user of MultiGradientMachine may handle
+    // the parameter update using the gradient.
+    multiMachine_->getBackwardCallback()(parameters_[pid].get());
+    if (parameters_[pid]->isValueUpdated()) {
+      parameters_[pid]->clearValueUpdated();
+      for (auto& thread : gpuThreads) {
+        thread->incUpdateCounter();
+      }
+      notifyValueReady(pid);
+    }
+  }
+  multiMachine_->notifyGradientTransfer(pid);
+}
+
+void TrainerThread::valueDispatchThread() {
+  VLOG(1) << "valueDispatchThread " << threadId_;
+
+  if (deviceId_ >= 0) {
+    hl_init(deviceId_);
+  }
+
+  auto& thread = multiMachine_->getThread(partnerId_);
+
+  while (true) {
+    int pid;
+    {
+      REGISTER_TIMER("value_dequeue");
+      pid = valueReadyQueue_.dequeue();
+    }
+    if (stopping_) break;
+
+    if (multiMachine_->paraMainThread(pid) == partnerId_) continue;
+
+    {
+      REGISTER_TIMER("copyValue");
+      SetDevice setDevice(parameters_[pid]->getDeviceId());
+      thread->getValueBuf(pid)->copyFrom(*getValueBuf(pid), valueStream_);
+      hl_stream_synchronize(valueStream_);
+    }
+
+    thread->notifyValueReady(pid);
+  }
+}
+
+void TrainerThread::notifyValueReady(int paramId) {
+  if (--updateCounter_ == 0) {
+    valueReadyCond_.notify_all([this] { parameterUpdated_ = false; });
+  }
+
+  notifyValueDispatch(paramId);
+}
+
+int TrainerThread::copyInArgs() {
+  const std::vector<Argument>& fullInArgs = multiMachine_->getInArgs();
+  int numThreads = multiMachine_->getAllThreads().size();
+  int32_t numSequences = fullInArgs[0].getNumSequences();
+  int32_t startSeq = numSequences * threadId_ / numThreads;
+  int32_t endSeq = numSequences * (threadId_ + 1) / numThreads;
+  int32_t copySize = endSeq - startSeq;
+
+  /**
+   * For the first copy, need to allocate space here
+   */
+  if (inArgs_.size() == 0) {
+    inArgs_.resize(fullInArgs.size());
+  }
+
+  if (copySize == 0) {
+    return 0;
+  }
+
+  for (size_t i = 0; i < fullInArgs.size(); i++) {
+    inArgs_[i].resizeAndCopyFrom(
+        fullInArgs[i],
+        startSeq,
+        copySize,
+        FLAGS_parallel_nn ? false : multiMachine_->useGpu());
+  }
+  return copySize;
+}
+
+void TrainerThread::mergeCpuGradients() {
+  CHECK_EQ(mergeTypes_.size(), 1UL);
+  CHECK_EQ(mergeTypes_[0], PARAMETER_GRADIENT);
+
+  {
+    REGISTER_TIMER("waitbeforeMerge");
+    multiMachine_->waitBeforeMerge();
+  }
+  std::vector<const std::vector<ParameterPtr>*> slaveParameters =
+      multiMachine_->getSlaveParameters();
+
+  CHECK(slaveParameters.size());
+  for (auto& para : multiMachine_->getNonStaticParameters()) {
+    if (para->useGpu()) continue;
+    if (para->isSparseRemoteUpdate()) {
+      REGISTER_TIMER("mergeRemoteGradSparse");
+      mergeGradSparseRemote(para.get(), slaveParameters);
+    } else if (para->isGradSparseUpdate()) {
+      REGISTER_TIMER("mergeGradSparse");
+      mergeGradSparse(para.get(), slaveParameters);
+    } else {
+      REGISTER_TIMER("mergeGradDense");
+      mergeGradDense(para.get(), slaveParameters);
+    }
+  }
+  {
+    REGISTER_TIMER("waitbeforeMerge");
+    multiMachine_->waitAfterMerge();
+  }
+}
+
+void TrainerThread::mergeGradSparse(
+    Parameter* para,
+    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
+  size_t pid = para->getID();
+  SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
+      para->getMat(PARAMETER_GRADIENT).get());
+  std::vector<uint32_t>& ids = mainMat->getIds(threadId_);
+
+  for (auto slaveParams : slaveParameters) {
+    SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
+        (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get());
+    mat->addTo(*mainMat, ids, threadId_, multiMachine_->getNumThreads());
+    // we use a sample hash method(%) instead of range partition,
+    // because range partition has balance issue sometimes,
+    // when feature ids are not generated from hashcode.
+  }
+  uniqueIds(ids);
+}
+
+void TrainerThread::mergeGradSparseRemote(
+    Parameter* para,
+    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
+  size_t pid = para->getID();
+  SparseRowCpuMatrix* mainMat =
+      dynamic_cast<SparseRowCpuMatrix*>(para->getMat(PARAMETER_GRADIENT).get());
+
+  mainMat->checkIndices();
+  mainMat->zeroMemThread(threadId_, multiMachine_->getNumThreads());
+
+  for (auto slaveParams : slaveParameters) {
+    SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
+        (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get());
+    mat->addTo(*mainMat, threadId_, multiMachine_->getNumThreads());
+  }
+}
+
+void TrainerThread::mergeGradDense(
+    Parameter* para,
+    std::vector<const std::vector<ParameterPtr>*>& slaveParameters) {
+  size_t pid = para->getID();
+  auto interval = calcSplitArrayInterval(para->getSize(),
+                                         (size_t)threadId_,
+                                         multiMachine_->getNumThreads(),
+                                         8LU /*for avx*/);
+  size_t startSeq = interval.first;
+  size_t copySize = interval.second - interval.first;
+
+  // setup sub bufs
+  CpuVector destGrad(0, nullptr);
+  destGrad.subVecFrom(*para->getBuf(PARAMETER_GRADIENT), startSeq, copySize);
+
+  // merge
+  CpuVector slaveGradSub(0, nullptr);
+  for (auto slaveParams : slaveParameters) {
+    slaveGradSub.subVecFrom(
+        *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT), startSeq, copySize);
+    destGrad.add(slaveGradSub);
+  }
+}
+
+void TrainerThread::copyOutputGrad() {
+  const std::vector<Argument>& outputGradArgs = multiMachine_->outArgs_;
+  int numThreads = multiMachine_->getAllThreads().size();
+  int32_t numSequences = outputGradArgs[0].getNumSequences();
+  int32_t startSeq = numSequences * threadId_ / numThreads;
+  int32_t endSeq = numSequences * (threadId_ + 1) / numThreads;
+  int32_t copySize = endSeq - startSeq;
+  outArgs_.resize(outputGradArgs.size());
+  for (size_t i = 0; i < outputGradArgs.size(); i++) {
+    outArgs_[i].resizeAndCopyFrom(outputGradArgs[i],
+                                  startSeq,
+                                  copySize,
+                                  multiMachine_->useGpu(),
+                                  HPPL_STREAM_DEFAULT);
+  }
+  if (multiMachine_->useGpu()) {
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  }
+  gradientMachine_->setOutputGrad(outArgs_);
+}
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
new file mode 100644
index 0000000000000000000000000000000000000000..674acd4124981face13b21aee02f031ea775ffec
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/MultiGradientMachine.h
@@ -0,0 +1,478 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <atomic>
+
+#include "GradientMachine.h"
+
+#include "hl_gpu.h"
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/Queue.h"
+
+namespace paddle {
+
+class TrainerThread;
+
+typedef Queue<int> PidQueue;
+typedef std::unique_ptr<TrainerThread> TrainerThreadPtr;
+
+struct GradBuffer {
+  /// GradBuffer is used for gathering gradient for GPU parameters
+  int paramId;
+
+  /// sem is used to notify that the local gradient merge of the current thread
+  /// finished for the current thread.
+  Semaphore sem;
+
+  // bufs[mergeIndex]
+  std::vector<VectorPtr> bufs;
+};
+
+/**
+ *  A MultiGradientMachine is a synchronous GradientMachine which devides
+ *  one data batch into several smaller batches and assign each one small batch
+ *  to one computint thread for computation. After each thread finishes
+ *  computation, it merges result (including output Argument and gradient during
+ *  backward()). It basically is the same as single thread gradient machine,
+ *  except that it uses multi-thread to do the computation.
+ *
+ *  It handles GPU and Cpu parameters differently.  In GPU, one computing thread
+ *  generally corresponds to one GPU device. Thus, each thread keeps a separate
+ *  copy of the parameter in its own device's memory. In CPU, we only need to
+ keep
+ *  one copy of the parameters in the main memory. After, each computing thread
+ *  computes its own parameter gradient, the update process needs to accumulate
+ *  the parameter gradients from all the computing threads, and update the
+ *  accumulated parameter gradient to the corresponding parameter value.
+ *
+ *  Each GPU parameter is assigned to a thread called its main thread. For each
+ *  parameter, the accumulation of its gradients and the update of its value
+ *  happens in its main thread. The main thread first gather the parameter
+ *  gradients from all the computing thread. Then, it performs parameter update.
+ *  After a gradient is updated by the main thread, it is scattered to all the
+ *  computing thread so that the parameters in all the computing threads are
+ *  synchronized. The scatter and gather process are implemented by ring-style
+ *  communication. Assume we have N computing threads, its thread ids will be
+ *  0, 1, ..., N-1. For each parameter, the id of the main thread is specified
+ in
+ *  paraMainThread_[pid], where pid is the id of the parameter. Each thread i
+ only
+ *  sends data to its partner thread (i - 1) % N. For example, for a parameter
+ *  gradient that is computed in thread 4, and its main thread is 2. Its
+ *  traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the
+ gradient
+ *  buffer is added to the local gradient, and the local gradient is then copied
+ *  to the gradient buffer of the next thread. At last, its main thread 2 will
+ *  get the accumulated parameter gradient. For the same parameter, after its
+ *  value is updated, the value's traveling process would be 2, 1, 0, N-1, ...
+ 3.
+ *  At the end, all the computing threads would have the updated parameter
+ value.
+ *
+ *  A computing thread (TrainerThread) uses 4 threads to do different jobs:
+ *
+ *  1. computeThread(): performing forward(), backward(), prefetch().
+ *
+ *  2. valueDispatchThread(): copying parameter values to partner thread.
+ *
+ *  3. copyGradToBufferThread(): copying parameter gradient to partner thread.
+ *
+ *  4. gradCollectThread(): merging the gradient from step 3 with local gradient
+ *     and call the callback supplied by the user to update parameter value.
+ *
+ *  CPU parameter value has only one copy. And their gradients are merged at the
+ *  end of backward().
+ *
+ *  * Handling of sparse update
+ *  Currently, sparse update is only supported for CPU parameters.
+
+ *  Sparse updates refers to gradient caculation where the gradient is sparse.
+ For
+ *  example, if the input argument to a 'fc' layer is sparse, the gradient of
+ the
+ *  weight matrix of this layer will be sparse. It is usually more efficient to
+ *  treat the gradient explicitly as sparse vector during the parameter update.
+
+ *  There are two types of sparse updates called local sparse update and remote
+ *  sparse update.
+
+ *  For both types of sparse updates, there is one copy of parameter value and
+ *  gradient called main parameter value and gradient, and there is a copy of
+ *  parameter value and gradient for each computing thread called slave
+ parameter
+ *  value and gradient. The slave parameter values are always shared with the
+ *  corresponding main parameter value. The slave parameter grad is a sparse row
+ *  matrix. The sparse pattern for slave parameter grads are different, because
+ *  the small batches for each computing thread might have different sparsity
+ *  pattern.
+
+ *  1. Local sparse update
+ *
+ *     Main parameter value type is MAT_NORMAL. It is a dense matrix.
+ *
+ *     Main parameter grad type is MAT_SPARSE_ROW_IDS (SparseRowIdsCpuMatrix)
+ *     It is also a dense matrix, but the updated values are specified by IDS.
+ *
+ *     Slave parameter value shares with main parameter value.
+ *
+ *     Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW
+ *     (SparseAutoGrowRowCpuMatrix). It is a sparse row matrix.
+ *
+ *     During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
+ *     gather all the non-zero gradient. And After backward(), they will be
+ merged
+ *     into main parameter grad (SparseRowIdsCpuMatrix), with indices indicating
+ *     which rows have nonzero gradient.
+ *
+ *  2. Remote sparse update
+ *
+ *     Main parameter value type is MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE)
+ *     (SparsePrefetchRowCpuMatrix). MAT_SPARSE_ROW_PREFETCH is a sparse matrix.
+ *     MAT_SPARSE_ROW_PREFETCH_FULL_SIZE is a dense matrix. However, only the
+ *     parameter values that are prefetched is up-to-date.
+ *
+ *     Main parameter grad type is MAT_SPARSE_ROW (SparseRowCpuMatrix).
+ *     And it shares sparse pattern with value by sharing indexDictHandle_,
+ which
+ *     is an internal data structure used by SparseRowCpuMatrixto specify the
+ *     sparsity pattern of Slave parameter value shares with main parameter
+ value.
+ *
+ *     Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW
+ *     (SparsePrefetchRowCpuMatrix). It is a sparse row matrix
+ *
+ *     During prefetch(), all the layers will indicates which rows of each
+ *     parameter are needed. Then the framework will retrieve those rows from
+ *     parameter server.
+ *
+ *     During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will
+ *     gather all the non-zero gradient. And After backward(), they will be
+ merged
+ *     into main parameter grad (SparseRowCpuMatrix). And the framework will
+ send
+ *     the merged gradient to parameter server.
+ */
+class MultiGradientMachine : public GradientMachine {
+ public:
+  enum TaskType {
+    TASK_FORWARD_BACKWARD = 0,
+    TASK_FORWARD = 1,
+    TASK_BACKWARD = 2,
+    TASK_COPY_IN_ARGS = 3,
+  };
+
+  explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);
+
+  virtual void start();
+
+  virtual void finish();
+
+  virtual void prefetch(const std::vector<Argument>& inArgs);
+
+  virtual void forward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType);
+
+  virtual void backward(const UpdateCallback& callback = nullptr);
+
+  void forwardBackward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType,
+                       const UpdateCallback& callback);
+
+  virtual Argument getLayerOutput(const std::string& layerName);
+
+  virtual void onPassEnd();
+
+  virtual Evaluator* makeEvaluator() const;
+
+  virtual void eval(Evaluator* evaluator) const;
+
+  bool useGpu() const { return useGpu_; }
+
+  /// @return whether to pass the gradients in outArgs_ to each threads.
+  bool isPassGrad() { return isPassGrad_; }
+
+  /// @brief set whether to pass the gradient in outArgs_ to each threads.
+  void setPassGrad(bool isPass) { isPassGrad_ = isPass; }
+
+  /// Set the gradients of the outputs.
+  /// The gradietns will be copied to each thread in the computing threads.
+  virtual void setOutputGrad(const std::vector<Argument>& args);
+
+ protected:
+  friend class TrainerThread;
+
+  std::vector<TrainerThreadPtr>& getAllThreads() { return threads_; }
+  /// Calculate the real device id based on the logical device id and the
+  /// thread id.
+  int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
+    if (logicalId == -1) {
+      logicalId = 0;
+    }
+    return mod(logicalId + FLAGS_gpu_id + threadId * numLogicalDevices_,
+               numDevices_);
+  }
+
+  /// Calculate the logical device id based on the real device id and the
+  /// thread id.
+  int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const {
+    if (realId == -1) {
+      return 0;
+    } else {
+      return mod(realId - FLAGS_gpu_id - threadId * numLogicalDevices_,
+                 numDevices_);
+    }
+  }
+
+  std::vector<const std::vector<ParameterPtr>*> getSlaveParameters();
+
+  bool hasNonstaticCpuParamters() const { return hasNonstaticCpuParamters_; }
+
+  /// Called TrainerThread to wait before merging CPU parameter gradients.
+  void waitBeforeMerge() { trainerBarrier_.wait(); }
+
+  /// called by MultiGradientMachine and TrainerThread to wait after merging
+  /// CPU parameter graidents.
+  void waitAfterMerge() { allBarrier_.wait(); }
+
+  /// called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
+  /// finishing
+  void waitForCopyInArgs() { allBarrier_.wait(); }
+
+  TrainerThreadPtr& getThread(int threadId) { return threads_[threadId]; }
+
+  std::vector<GradBuffer>& getGradBuf(int threadId) {
+    return gradBufs_[threadId];
+  }
+
+  PassType getPassType() const { return passType_; }
+
+  /// Called by TrainerThread to notify MultiGradientMachine that the gradient
+  /// for paramId is ready
+  void notifyGradientTransfer(int paramId);
+
+  const std::vector<Argument>& getInArgs() { return inArgs_; }
+
+  TaskType getTaskType() const { return taskType_; }
+
+  const UpdateCallback& getBackwardCallback() const {
+    return backwardCallback_;
+  }
+
+  int getNumDevices() const { return numDevices_; }
+
+  int getNumLogicalDevices() const { return numLogicalDevices_; }
+
+  int getNumThreads() const { return numThreads_; }
+
+  int paraMainThread(int pid) const { return paraMainThread_[pid]; }
+
+ protected:
+  virtual void forwardImp(const std::vector<Argument>& inArgs,
+                          std::vector<Argument>* outArgs,
+                          PassType passType,
+                          TaskType taskType);
+
+  virtual void backwardImp(const UpdateCallback& callback = NULL);
+
+  /// update all parameters
+  void updateThreadParameters();
+
+  void startTask(TaskType taskType);
+
+  void getOutArgs(std::vector<Argument>* outArgs, PassType passType);
+
+  void allocGradBufs();
+
+ protected:
+  bool useGpu_;
+
+  bool hasNonstaticCpuParamters_;
+
+  /// store main parameter only
+  std::unique_ptr<GradientMachine> gradientMachine_;
+
+  std::vector<TrainerThreadPtr> threads_;
+  std::vector<int> paraMainThread_;
+  std::vector<std::vector<GradBuffer>> gradBufs_;  // [threadId][deviceId]
+  std::vector<size_t> bufferSizes_;
+
+  PassType passType_;
+  TaskType taskType_;
+  PidQueue gradQueue_;
+  std::vector<Argument> inArgs_;
+  std::vector<Argument> outArgs_;
+  hl_stream_t outArgStream_;
+
+  Argument outLayerArgs_;
+
+  /// ParameterType which needs to be merged from each GPU
+  std::vector<ParameterType> mergeTypes_;
+  int numDevices_;         /* number of gpu devices */
+  int numLogicalDevices_;  // number of GPU used by one NN
+  int numThreads_;         /* number of train threads */
+
+  UpdateCallback backwardCallback_;
+
+  /// barrrier for threads_
+  ThreadBarrier trainerBarrier_;
+
+  /// barrier for both MultiGradientMachine and threds_
+  ThreadBarrier allBarrier_;
+
+  /// indicate whether inArgs is copied before forward()
+  bool inArgsCopied_;
+
+  /// Whether to copy the gradient back from an external input.
+  bool isPassGrad_;
+};
+
+class TrainerThread {
+ public:
+  TrainerThread(const ModelConfig& config,
+                int threadId,
+                MultiGradientMachine* multiMachine);
+
+  ~TrainerThread();
+
+  void start();
+
+  void onPassEnd() { gradientMachine_->onPassEnd(); }
+
+  void waitOutArgsReady() { outArgsReadySem_.wait(); }
+
+  void notifyTaskReady() { taskReadySem_.post(); }
+
+  int getDeviceId() const { return deviceId_; }
+
+  GradientMachine* getGradientMachine() { return gradientMachine_.get(); }
+
+  const std::vector<ParameterPtr>& getParameters() { return parameters_; }
+
+  void stop();
+
+  void notifyValueReady(int paramId);
+
+  const VectorPtr& getValueBuf(int paramId) {
+    return parameters_[paramId]->getBuf(PARAMETER_VALUE);
+  }
+
+  const std::vector<Argument>& getOutArgs() { return outArgs_; }
+
+  void incUpdateCounter(int n = 1) {
+    updateCounter_ += n;
+    parameterUpdated_ = true;
+  }
+
+  void notifyGradientCollect(int paramId) { gradQueue_.enqueue(paramId); }
+
+  void notifyCopyGradToBuffer(int paramId) { gradBufQueue_.enqueue(paramId); }
+
+  void notifyValueDispatch(int paramId) { valueReadyQueue_.enqueue(paramId); }
+
+  void prefetch();
+
+  /// copy the output gradient from the main GradientMachine.
+  void copyOutputGrad();
+
+  /// Whether the thread has input data.
+  bool hasInputData() { return batchSize_ != 0; }
+
+ protected:
+  void mergeCpuGradients();
+
+  void mergeGradSparse(
+      Parameter* para,
+      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
+
+  void mergeGradSparseRemote(
+      Parameter* para,
+      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
+
+  void mergeGradDense(
+      Parameter* para,
+      std::vector<const std::vector<ParameterPtr>*>& slaveParameters);
+
+  void computeThread();
+  void valueDispatchThread();
+  void copyGradToBufferThread();
+  void gradCollectThread();
+
+  int copyInArgs();
+  void forward();
+  void backward();
+  void backwardCallback(Parameter* para);
+
+  /// call the actuall callback supplied by the caller of
+  /// GradientMachine::backward
+  void doCallback(int pid);
+
+ protected:
+  MultiGradientMachine* multiMachine_;
+  ModelConfig config_;
+  /// whether the thread should stop
+  bool stopping_;
+  /// the threads form which to collect gradient
+  int partnerId_;
+  /// from 0 to threads-1
+  int threadId_;
+  int deviceId_;
+  std::unique_ptr<GradientMachine> gradientMachine_;
+  std::vector<ParameterPtr> parameters_;
+
+  /// ParameterType which needs to be merged from each GPU
+  std::vector<ParameterType> mergeTypes_;
+
+  /// compute thread
+  std::unique_ptr<std::thread> computeThread_;
+  std::vector<Argument> inArgs_;
+  std::vector<Argument> outArgs_;
+  Semaphore taskReadySem_;
+  Semaphore outArgsReadySem_;
+
+  /// copy thread
+  std::unique_ptr<std::thread> copyThread_;
+  /// queue of gradient needs to be copied to partner
+  PidQueue gradBufQueue_;
+  hl_stream_t gradStream_;
+
+  /// grad merge thread
+  std::unique_ptr<std::thread> gradCollectThread_;
+  /// queue of gradient needs to be merged with gradient coopied by
+  /// copyGradToBufferThread
+  PidQueue gradQueue_;
+  UpdateCallback backwardCallback_;
+
+  /// value dispatch thread
+  std::unique_ptr<std::thread> valueDispatchThread_;
+  /// queue of the parameter whose the vale are ready for copy
+  PidQueue valueReadyQueue_;
+
+  /// used to notify all the parameter values are ready
+  LockedCondition valueReadyCond_;
+
+  hl_stream_t valueStream_;
+  /// how many parameters are updated
+  std::atomic<int> updateCounter_;
+  bool parameterUpdated_;
+
+  /// indicate whether inArgs is copied before forward()
+  bool inArgsCopied_;
+  int batchSize_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp b/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1245c441036a601025192ab23a6d2899b688a9dc
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/MultiNetwork.cpp
@@ -0,0 +1,185 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include "MultiNetwork.h"
+
+#include "NeuralNetwork.h"
+#include "ParallelNeuralNetwork.h"
+
+namespace paddle {
+
+void MultiNetwork::init(const ModelConfig& config,
+                        ParamInitCallback callback,
+                        const std::vector<ParameterType>& parameterTypes,
+                        bool useGpu) {
+  CHECK_GT(config.sub_models_size(), 1) << "sub_models_size should GT 1";
+  // check submodel[0] is root
+  CHECK_EQ("root", config.sub_models(0).name())
+      << "sub_models(0) should be root";
+  // ignore root
+  subNetworks_.resize(config.sub_models_size() - 1);
+  // base class
+  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
+  // sub networks
+  for (int i = 1; i < config.sub_models_size(); ++i) {
+    std::string subModelName = config.sub_models(i).name();
+    if (FLAGS_parallel_nn) {
+      subNetworks_[i - 1] = std::unique_ptr<ParallelNeuralNetwork>(
+          new ParallelNeuralNetwork(subModelName, this));
+    } else {
+      subNetworks_[i - 1] = std::unique_ptr<NeuralNetwork>(
+          NeuralNetwork::newNeuralNetwork(subModelName, this));
+    }
+    subNetworks_[i - 1]->init(config);
+  }
+}
+
+void MultiNetwork::prefetch(const std::vector<Argument>& inArgs) {
+  std::vector<std::vector<Argument>> argumentGroups;
+  Argument::splitByDataId(inArgs, &argumentGroups);
+  // check group size is equal to sub network size
+  CHECK_EQ(argumentGroups.size(), subNetworks_.size());
+  for (size_t i = 0; i < subNetworks_.size(); i++) {
+    if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) {
+      // check input args: if dataId is -1, then skip this sub network
+      continue;
+    }
+    subNetworks_[i]->prefetch(argumentGroups[i]);
+  }
+}
+
+void MultiNetwork::forward(const std::vector<Argument>& inArgs,
+                           std::vector<Argument>* outArgs,
+                           PassType passType) {
+  // split inArgs to several vectors
+  std::vector<std::vector<Argument>> argumentGroups;
+  Argument::splitByDataId(inArgs, &argumentGroups);
+
+  // check group size is equal to sub network size
+  CHECK_EQ(argumentGroups.size(), subNetworks_.size());
+  std::vector<Argument> tempOutArgs;
+  outArgs->clear();
+
+  for (size_t i = 0; i < subNetworks_.size(); i++) {
+    tempOutArgs.clear();
+    if (argumentGroups[i].size() == 1 && argumentGroups[i][0].dataId == -1) {
+      // check input args: if dataId is -1, then skip this sub network
+      continue;
+    }
+    subNetworks_[i]->forward(argumentGroups[i], &tempOutArgs, passType);
+    for (const auto& elem : tempOutArgs) {
+      outArgs->push_back(elem);
+      outArgs->back().dataId = i;
+    }
+  }
+}
+
+void MultiNetwork::backward(const UpdateCallback& callback) {
+  for (size_t i = 0; i < subNetworks_.size(); i++) {
+    subNetworks_[i]->backward(callback);
+  }
+}
+
+void MultiNetwork::forwardBackward(const std::vector<Argument>& inArgs,
+                                   std::vector<Argument>* outArgs,
+                                   PassType passType,
+                                   const UpdateCallback& callback) {
+  forward(inArgs, outArgs, passType);
+  backward(callback);
+}
+
+void MultiNetwork::onPassEnd() {
+  for (size_t i = 0; i < subNetworks_.size(); i++) {
+    subNetworks_[i]->onPassEnd();
+  }
+}
+
+void MultiNetwork::start() {
+  for (auto& subNetwork : subNetworks_) {
+    subNetwork->start();
+  }
+}
+
+void MultiNetwork::finish() {
+  for (size_t i = 0; i < subNetworks_.size(); i++) {
+    subNetworks_[i]->finish();
+  }
+}
+
+class MultiCombinedEvaluator : public Evaluator {
+ public:
+  MultiCombinedEvaluator() {}
+  void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
+    evaluators_.emplace_back(std::move(evaluator));
+  }
+  virtual void start() {
+    for (auto& evaluator : evaluators_) {
+      evaluator->start();
+    }
+  }
+
+  virtual void finish() {
+    for (auto& evaluator : evaluators_) {
+      evaluator->finish();
+    }
+  }
+
+  virtual void eval(const NeuralNetwork& nn) {
+    const MultiNetwork& multiNetwork = dynamic_cast<const MultiNetwork&>(nn);
+    CHECK_EQ(evaluators_.size(), multiNetwork.getSubNetworks().size());
+    int size = evaluators_.size();
+    for (int i = 0; i < size; i++) {
+      // one evaluator for one subNetwork
+      evaluators_[i]->eval(*multiNetwork.getSubNetworks()[i]);
+    }
+  }
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    (void)arguments;
+    return -1;
+  }
+
+  virtual void printStats(std::ostream& os) const {
+    for (auto& evaluator : evaluators_) {
+      evaluator->printStats(os);
+      os << ' ';
+    }
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    for (auto& evaluator : evaluators_) {
+      evaluator->distributeEval(client);
+    }
+  }
+
+ protected:
+  std::vector<std::unique_ptr<Evaluator>> evaluators_;
+};
+
+Evaluator* MultiNetwork::makeEvaluator() const {
+  MultiCombinedEvaluator* multiCombinedEvaluator = new MultiCombinedEvaluator();
+  for (size_t i = 0; i < subNetworks_.size(); i++) {
+    std::unique_ptr<Evaluator> evaluator(subNetworks_[i]->makeEvaluator());
+    multiCombinedEvaluator->addEvaluator(std::move(evaluator));
+  }
+  return multiCombinedEvaluator;
+}
+
+void MultiNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/MultiNetwork.h b/paddle/legacy/gserver/gradientmachines/MultiNetwork.h
new file mode 100644
index 0000000000000000000000000000000000000000..afe15cb020ebe3bbe051800a72562c9543f3faa4
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/MultiNetwork.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "GradientMachine.h"
+#include "NeuralNetwork.h"
+
+#include "paddle/legacy/utils/Locks.h"
+
+namespace paddle {
+
+class MultiNetwork : public NeuralNetwork {
+ public:
+  explicit MultiNetwork(std::string subModelName = "")
+      : NeuralNetwork(subModelName) {}
+
+  virtual void init(const ModelConfig& config,
+                    ParamInitCallback callback,
+                    const std::vector<ParameterType>& parameterTypes,
+                    bool useGpu);
+
+  virtual void prefetch(const std::vector<Argument>& inArgs);
+
+  virtual void forward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType);
+
+  virtual void backward(const UpdateCallback& callback = nullptr);
+
+  void forwardBackward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType,
+                       const UpdateCallback& callback);
+
+  virtual void onPassEnd();
+
+  virtual Evaluator* makeEvaluator() const;
+
+  virtual void eval(Evaluator* evaluator) const;
+
+  const std::vector<std::unique_ptr<NeuralNetwork>>& getSubNetworks() const {
+    return subNetworks_;
+  }
+
+  virtual void start();
+
+  virtual void finish();
+
+ protected:
+  std::vector<std::unique_ptr<NeuralNetwork>> subNetworks_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f8048152ff317a1e445249fa7093158d2d4a5c5
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.cpp
@@ -0,0 +1,548 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Util.h"
+
+#include "NeuralNetwork.h"
+#include "hl_gpu.h"
+#include "paddle/legacy/utils/CustomStackTrace.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
+#endif
+
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "MultiNetwork.h"
+#include "RecurrentGradientMachine.h"
+#include "paddle/legacy/gserver/layers/AgentLayer.h"
+#endif
+
+namespace paddle {
+void parameterInitNN(int paramId,
+                     Parameter* para,
+                     std::vector<ParameterPtr>* sharedParams) {
+  // Create parameters values.
+  if (!para->useGpu() && sharedParams) {
+    para->enableSharedType(PARAMETER_VALUE,
+                           (*sharedParams)[paramId]->getBuf(PARAMETER_VALUE),
+                           (*sharedParams)[paramId]->getMat(PARAMETER_VALUE));
+  } else {
+    if (para->isSparseRemoteUpdate()) {
+      para->enableType(PARAMETER_VALUE,
+                       FLAGS_loadsave_parameters_in_pserver
+                           ? Parameter::MAT_SPARSE_ROW_PREFETCH
+                           : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE);
+    } else {
+      para->enableType(PARAMETER_VALUE);
+    }
+  }
+  // Create parameter gradients.
+  if (para->isSparseRemoteUpdate() && !sharedParams) {
+    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW);
+  } else if (para->isGradSparseUpdate()) {
+    para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_AUTO_GROW);
+  } else if (!para->isStatic()) {
+    para->enableType(PARAMETER_GRADIENT);
+  }
+}
+
+NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) {
+#ifndef PADDLE_MOBILE_INFERENCE
+  if (config.type() == "recurrent_nn") {
+    return newNeuralNetwork("root");
+  } else if (config.type() == "multi_nn") {
+    return new MultiNetwork("root");
+  } else {
+    return newNeuralNetwork();
+  }
+#else
+  return new NeuralNetwork();
+#endif
+}
+
+std::map<std::string, bool> NeuralNetwork::dllInitMap;
+
+void NeuralNetwork::init(const ModelConfig& config,
+                         ParamInitCallback callback,
+                         const std::vector<ParameterType>& parameterTypes,
+                         bool useGpu) {
+  using std::placeholders::_1;
+  using std::placeholders::_2;
+  ParamInitCallback paramCallback = nullptr;
+  if (callback != nullptr) {
+    paramSelfInited_ = false;
+    paramCallback = callback;
+  } else {
+    paramSelfInited_ = true;
+    paramCallback = std::bind(parameterInitNN, _1, _2, nullptr);
+  }
+  config_ = config;
+
+  if (rootNetwork_ != nullptr) {
+    // direct use parameters_ and parameterMap_ from base network
+    CHECK_EQ((size_t)config.parameters_size(),
+             rootNetwork_->getParameters().size());
+    parameters_ = rootNetwork_->getParameters();
+    parameterMap_ = *(rootNetwork_->getParameterMap());
+  } else {
+    parameters_.reserve(config.parameters_size());
+    for (const auto& para_config : config.parameters()) {
+      auto parameter = std::make_shared<Parameter>(para_config,
+                                                   useGpu,
+                                                   /*initialize=*/false);
+      paramCallback(parameters_.size(), parameter.get());
+      if (!callback) {
+        for (ParameterType type :
+             (parameter->isStatic()
+                  ? std::vector<ParameterType>{PARAMETER_VALUE}
+                  : parameterTypes)) {
+          if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) {
+            parameter->enableType(type);
+          }
+        }
+      }
+      parameter->setID(parameters_.size());
+      parameters_.push_back(parameter);
+      CHECK(!parameterMap_.count(parameter->getName()));
+      parameterMap_[parameter->getName()] = parameter;
+    }
+  }
+
+  auto layerCreate = [&](const LayerConfig& layer_config) {
+    auto layer = Layer::create(layer_config);
+    CHECK(layer) << "Create layer failed. Layer name:" << layer->getName();
+    layers_.push_back(layer);
+    CHECK(!layerMap_.count(layer->getName()));
+    layerMap_[layer->getName()] = layer;
+  };
+
+  auto subModelConfig = std::find_if(config.sub_models().begin(),
+                                     config.sub_models().end(),
+                                     [=](const SubModelConfig& sub_model) {
+                                       return sub_model.name() == subModelName_;
+                                     });
+  bool useSubModel = (subModelConfig != config.sub_models().end());
+  CHECK_EQ(useSubModel, !subModelName_.empty());
+  if (useSubModel) {
+    layers_.reserve(subModelConfig->layer_names_size());
+    for (const auto& layer_name : subModelConfig->layer_names()) {
+      auto layer_config =
+          std::find_if(config.layers().begin(),
+                       config.layers().end(),
+                       [=](const LayerConfig& layer_config) {
+                         return layer_config.name() == layer_name;
+                       });
+      CHECK(layer_config != config.layers().end());
+      layerCreate(*layer_config);
+    }
+  } else {
+    layers_.reserve(config.layers_size());
+    for (const auto& layer_config : config.layers()) {
+      bool useLayer = true;
+      if (config.has_external_config()) {
+        useLayer = true;
+        for (const auto& name : config.external_config().layer_names()) {
+          if (layer_config.name() == name) {
+            useLayer = false;
+            break;
+          }
+        }
+      }
+      if (useLayer) {
+        layerCreate(layer_config);
+      }
+    }
+  }
+
+  for (const auto& layer : layers_) {
+    layer->init(layerMap_, parameterMap_);
+    layer->initSubNetwork(this /*root*/, config_, parameterTypes, useGpu);
+  }
+
+  for (const auto& layer_name :
+       (useSubModel ? subModelConfig->input_layer_names()
+                    : config.input_layer_names())) {
+    auto it = layerMap_.find(layer_name);
+    CHECK(it != layerMap_.end());
+    dataLayers_.push_back(std::dynamic_pointer_cast<DataLayer>(it->second));
+  }
+
+  for (const auto& layer_name :
+       (useSubModel ? subModelConfig->output_layer_names()
+                    : config.output_layer_names())) {
+    auto it = layerMap_.find(layer_name);
+    CHECK(it != layerMap_.end());
+    outputLayers_.push_back(it->second);
+  }
+
+  for (const auto& layer : layers_) {
+    const auto& name = layer->getName();
+    bool isMiddleLayer = true;
+
+    // if data layer
+    for (const auto& dataLayer : dataLayers_) {
+      if (name == dataLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    // if output layer
+    for (const auto& dataLayer : outputLayers_) {
+      if (name == dataLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    if (isMiddleLayer) {
+      middleLayers_.push_back(layer);
+    }
+  }
+}
+
+void NeuralNetwork::connect(LayerPtr agentLayer,
+                            LayerPtr realLayer,
+                            int height) {
+#ifndef PADDLE_MOBILE_INFERENCE
+  AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
+  CHECK_NOTNULL(agent);
+  agent->setRealLayer(realLayer, height);
+#endif
+}
+
+void NeuralNetwork::connect(std::string agentLayerName,
+                            NeuralNetwork* srcNN,
+                            std::string realLayerName) {
+  connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName));
+}
+
+void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
+  CHECK_EQ(inArgs.size(), dataLayers_.size());
+
+  if (paramSelfInited_) {
+    for (auto& para : parameters_) {
+      if (para->isSparseRemoteUpdate()) {
+        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
+            para->getMat(PARAMETER_VALUE).get());
+        para->clearGradient();
+        if (mat) mat->clearIndices();
+      }
+    }
+  }
+
+  for (size_t i = 0; i != dataLayers_.size(); ++i) {
+    if (FLAGS_parallel_nn) {
+      const_cast<Argument&>(inArgs[i]).deviceId = -1;
+    }
+    dataLayers_[i]->setData(inArgs[i]);
+  }
+
+  for (auto& layer : layers_) {
+    layer->prefetch();
+  }
+
+  if (paramSelfInited_) {
+    for (auto& para : parameters_) {
+      if (para->isSparseRemoteUpdate()) {
+        auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
+            para->getMat(PARAMETER_VALUE).get());
+        mat->setupIndices();
+        auto matGrad = dynamic_cast<SparseRowCpuMatrix*>(
+            para->getMat(PARAMETER_GRADIENT).get());
+        matGrad->reserveStore();
+      }
+    }
+  }
+}
+
+void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
+                            std::vector<Argument>* outArgs,
+                            PassType passType) {
+  CHECK_EQ(inArgs.size(), dataLayers_.size());
+  outArgs->resize(outputLayers_.size());
+  for (size_t i = 0; i != dataLayers_.size(); ++i) {
+    dataLayers_[i]->setData(inArgs[i]);
+  }
+
+  gLayerStackTrace.set_stage(true);
+
+  {
+    for (auto& layer : layers_) {
+      REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str());
+      gLayerStackTrace.push(layer->getName());
+      layer->forward(passType);
+      gLayerStackTrace.pop(layer->getName());
+    }
+  }
+
+  outArgs->clear();
+  outArgs->reserve(outputLayers_.size());
+  for (auto& layer : outputLayers_) {
+    outArgs->push_back(layer->getOutput());
+  }
+}
+
+void NeuralNetwork::resetState() {
+  for (auto& layer : layers_) {
+    layer->resetState();
+  }
+}
+
+void NeuralNetwork::setState(const MachineState& machineState) {
+  for (size_t i = 0; i < layers_.size(); i++) {
+    if (machineState[i] != nullptr) {
+      layers_[i]->setState(machineState[i]);
+    }
+  }
+}
+
+void NeuralNetwork::getState(MachineState& machineState) {
+  machineState.clear();
+  machineState.reserve(layers_.size());
+  for (auto& layer : layers_) {
+    LayerStatePtr p = layer->getState();
+    machineState.push_back(p);
+  }
+}
+
+void NeuralNetwork::backward(const UpdateCallback& callback) {
+  gLayerStackTrace.set_stage(false);
+  FOR_EACH_R(layer, layers_) {
+    REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str());
+    gLayerStackTrace.push((*layer)->getName());
+    if ((*layer)->needGradient()) {
+      (*layer)->backward(callback);
+    }
+    gLayerStackTrace.pop((*layer)->getName());
+  }
+}
+
+void NeuralNetwork::finish() {
+#ifdef PADDLE_WITH_MKLDNN
+  FOR_EACH_R(layer, layers_) {
+    MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
+    if (dnnLayer) {
+      dnnLayer->convertWeightsToPaddle();
+    }
+  }
+#endif
+}
+
+Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
+  return getLayer(layerName)->getOutput();
+}
+
+void NeuralNetwork::onPassEnd() {
+  for (auto& layer : layers_) {
+    layer->onPassEnd();
+  }
+}
+
+void NeuralNetwork::releaseOutput() {
+  for (auto& layer : middleLayers_) {
+    Argument& arg = layer->getOutput();
+    arg.value.reset();
+  }
+}
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
+class CombinedEvaluator : public Evaluator {
+ public:
+  void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
+    evaluators_.emplace_back(std::move(evaluator));
+  }
+  void start() override {
+    for (auto& evaluator : evaluators_) {
+      evaluator->start();
+    }
+  }
+
+  void finish() override {
+    for (auto& evaluator : evaluators_) {
+      evaluator->finish();
+    }
+  }
+
+  void eval(const NeuralNetwork& nn) override {
+    for (auto& evaluator : evaluators_) {
+      evaluator->eval(nn);
+    }
+  }
+  real evalImp(std::vector<Argument>& arguments) override {
+    (void)arguments;
+    return -1;
+  }
+  void printStats(std::ostream& os) const override {
+    for (auto& evaluator : evaluators_) {
+      evaluator->printStats(os);
+      os << ' ';
+    }
+  }
+
+  void distributeEval(ParameterClient2* client) override {
+    for (auto& evaluator : evaluators_) {
+      evaluator->distributeEval(client);
+    }
+  }
+
+ protected:
+  std::vector<std::unique_ptr<Evaluator>> evaluators_;
+
+  // Evaluator interface
+ public:
+  /**
+   * @brief getNames will return all inside evaluators' names.
+   * @param names [out]: return names.
+   */
+  void getNames(std::vector<std::string>* names) override {
+    for (auto& eval : evaluators_) {
+      eval->getNames(names);
+    }
+  }
+
+  /**
+   * @brief getValue could get all inside evaluators' value.
+   */
+  real getValue(const std::string& name, Error* err) const override {
+    return this->getMethodHelper<real>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getValue(name, err);
+        });
+  }
+
+  /**
+   * @brief getType could get all inside evaluators' type.
+   */
+  std::string getType(const std::string& name, Error* err) const override {
+    return this->getMethodHelper<std::string>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getType(name, err);
+        });
+  }
+
+ private:
+  template <typename T>
+  T getMethodHelper(const std::string& name,
+                    Error* err,
+                    const std::function<T(const std::unique_ptr<Evaluator>&)>&
+                        callback) const {
+    for (auto& eval : evaluators_) {
+      std::vector<std::string> names;
+      eval->getNames(&names);
+      if (std::find(names.begin(), names.end(), name) != names.end()) {
+        return callback(eval);
+      }
+    }
+    *err = Error("No such key %s", name.c_str());
+    return T();
+  }
+};
+
+class SubnetEvaluator : public CombinedEvaluator {
+ public:
+  SubnetEvaluator(const std::string& layerName,
+                  std::unique_ptr<Evaluator>&& evaluator)
+      : layerName_(layerName) {
+    addEvaluator(std::move(evaluator));
+  }
+  void eval(const NeuralNetwork& nn) override {
+    const LayerPtr& layer = nn.getLayer(layerName_);
+    CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
+                 << nn.getName();
+    bool accessed = false;
+    layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) {
+      subnet.eval(evaluators_[0].get());
+      accessed = true;
+    });
+    CHECK(accessed) << "There is no subnetwork for layer " << layerName_
+                    << " in submodel " << nn.getName();
+  }
+
+ protected:
+  std::string layerName_;
+};
+
+Evaluator* NeuralNetwork::makeEvaluator() const {
+  CombinedEvaluator* combinedEvaluator = new CombinedEvaluator();
+  auto subModelConfig = std::find_if(config_.sub_models().begin(),
+                                     config_.sub_models().end(),
+                                     [=](const SubModelConfig& sub_model) {
+                                       return sub_model.name() == subModelName_;
+                                     });
+  bool useSubModel = (subModelConfig != config_.sub_models().end());
+  CHECK_EQ(useSubModel, !subModelName_.empty());
+  if (useSubModel) {
+    // create the evaluators that belong to CURRENT submodel
+    for (int i = 0; i < subModelConfig->evaluator_names_size(); ++i) {
+      // find evaluator by name
+      auto thisEvalConfig = std::find_if(
+          config_.evaluators().begin(),
+          config_.evaluators().end(),
+          [=](const EvaluatorConfig& ecfg) {
+            return ecfg.name() == subModelConfig->evaluator_names(i);
+          });
+      bool validConfig = (thisEvalConfig != config_.evaluators().end());
+      if (validConfig) {
+        std::unique_ptr<Evaluator> evaluator(
+            Evaluator::create(*thisEvalConfig));
+        combinedEvaluator->addEvaluator(std::move(evaluator));
+      }
+    }
+    for (auto& layer : layers_) {
+      layer->accessSubNetwork(
+          [layer, combinedEvaluator](NeuralNetwork& subnet) {
+            std::unique_ptr<Evaluator> subEvaluator(new SubnetEvaluator(
+                layer->getName(),
+                std::unique_ptr<Evaluator>(subnet.makeEvaluator())));
+            combinedEvaluator->addEvaluator(std::move(subEvaluator));
+          });
+    }
+  } else {
+    for (const EvaluatorConfig& evalConfig : config_.evaluators()) {
+      std::unique_ptr<Evaluator> evaluator(Evaluator::create(evalConfig));
+      combinedEvaluator->addEvaluator(std::move(evaluator));
+    }
+  }
+  return combinedEvaluator;
+}
+
+void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
+
+#endif
+
+void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
+  CHECK_GE(outputLayers_.size(), args.size());
+  for (size_t i = 0; i < args.size(); ++i) {
+    outputLayers_[i]->getOutput().grad = args[i].grad;
+  }
+}
+
+extern NeuralNetwork* newCustomNerualNetwork(const std::string& name,
+                                             NeuralNetwork* network)
+    __attribute__((weak));
+
+NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name,
+                                               NeuralNetwork* rootNetwork) {
+  if (newCustomNerualNetwork) {
+    return newCustomNerualNetwork(name, rootNetwork);
+  } else {
+    return new NeuralNetwork(name, rootNetwork);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h
new file mode 100644
index 0000000000000000000000000000000000000000..566157c8998a38aef4a3620a4dca7246c6e66391
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/NeuralNetwork.h
@@ -0,0 +1,179 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <map>
+#include <memory>
+
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/legacy/gserver/layers/CostLayer.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/gserver/layers/Layer.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/utils/ClassRegistrar.h"
+
+namespace paddle {
+/*
+ * @brief  Init function for the parameters.
+ * @param paramId: the id of the parameter to init.
+ * @param para: the pointer to the parameter to init.
+ * @param sharedParams: the pointer to an array of the parameter to be shared.
+ *                      If it is null, no parameter sharing is used.
+ *                      Only CPU paramters can be shared.
+ * It handles CPU, CPU sparse, CPU sparse remote,
+ * and GPU parameters differently. If the type
+ * of a parameter is NORMAL. Basically nothing need to be done.
+ * CPU value: NORMAL.
+ * CPU param: NORMAL.
+ *
+ * CPU sparse value: NORMAL.
+ * CPU sparse gradient: MAT_SPARSE_ROW_AUTO_GROW.
+ *
+ * CPU sparse remote value: MAT_SPARSE_ROW_PREFETCH(_FULL_SIZE).
+ * CPU sparse remote gradient: MAT_SPARSE_ROW (!sharedParams)
+ *                             MAT_SPARSE_ROW_AUTO_GROW (sharedParams)
+ *
+ * GPU value: NORMAL
+ * GPU param: NORMAL
+ */
+void parameterInitNN(int paramId,
+                     Parameter* para,
+                     std::vector<ParameterPtr>* sharedParams);
+
+class NeuralNetwork : public GradientMachine {
+ public:
+  virtual void init(const ModelConfig& config,
+                    ParamInitCallback callback = nullptr,
+                    const std::vector<ParameterType>& parameterTypes =
+                        std::vector<ParameterType>{PARAMETER_VALUE,
+                                                   PARAMETER_GRADIENT,
+                                                   PARAMETER_MOMENTUM},
+                    bool useGpu = FLAGS_use_gpu);
+
+  /**
+   * Connect two submodels and
+   * down-submodel's output become up-submodel's input.
+   * By default, connection is one by one,
+   * If the agent height is smaller than real layer, *height* has to be filled.
+   *
+   * @param realLayer  The down-submodel's output layer.
+   * @param agentLayer The up-submodel's input agent layer.
+   */
+  static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0);
+  void connect(std::string agentLayerName,
+               NeuralNetwork* srcNN,
+               std::string realLayerName);
+
+  virtual void prefetch(const std::vector<Argument>& inArgs);
+
+  virtual void forward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType);
+
+  virtual void backward(const UpdateCallback& callback = nullptr);
+
+  virtual Argument getLayerOutput(const std::string& layerName);
+
+  const LayerPtr& getLayer(const std::string& layerName) const {
+    auto it = layerMap_.find(layerName);
+    CHECK(it != layerMap_.end()) << "Unknown layer " << layerName;
+    return it->second;
+  }
+
+  virtual void onPassEnd();
+
+#ifndef PADDLE_MOBILE_INFERENCE
+  virtual Evaluator* makeEvaluator() const;
+
+  virtual void eval(Evaluator* evaluator) const;
+#endif
+
+  virtual void resetState();
+  virtual void setOutputGrad(const std::vector<Argument>& args);
+
+  /// set machine state
+  virtual void setState(const MachineState& machineState);
+
+  /// get machine state
+  virtual void getState(MachineState& machineState);
+
+  static NeuralNetwork* create(const ModelConfig& config);
+
+  ParameterMap* getParameterMap() { return &parameterMap_; }
+
+  /**
+   * @brief Access each layer as a for each loop.
+   * @param callback invoke with each layer.
+   */
+  template <typename T>
+  void forEachLayer(T callback) {
+    for (auto& l : layers_) {
+      if (callback(l)) {
+        break;
+      }
+    }
+  }
+
+  static NeuralNetwork* newNeuralNetwork(const std::string& name = "",
+                                         NeuralNetwork* rootNetwork = nullptr);
+
+  const std::string& getName() const { return subModelName_; }
+
+  /// some finish work, like convert the weight format of MKLDNNLayers
+  void finish();
+
+  /**
+   * @brief   Release the middle layer's output memory.
+   *
+   * @note    This function is used for memory optimization in inference.
+   */
+  void releaseOutput();
+
+ protected:
+  /**
+   * The constructor of NeuralNetwork.
+   * The sub networks can get parameters_ and parameterMap_
+   * from base NeuralNetwork.
+   *
+   * @param subModelName The name of sub-model.
+   * @param rootNetwork  It used in MultiNetwork.
+   */
+  NeuralNetwork(std::string subModelName = "",
+                NeuralNetwork* rootNetwork = nullptr)
+      : subModelName_(subModelName), rootNetwork_(rootNetwork) {}
+
+  std::string subModelName_;
+  ModelConfig config_;
+  std::vector<LayerPtr> layers_;
+  ParameterMap parameterMap_;
+  LayerMap layerMap_;
+
+  std::vector<DataLayerPtr> dataLayers_;
+  std::vector<LayerPtr> outputLayers_;
+  std::vector<LayerPtr> middleLayers_;
+
+  static std::map<std::string, bool> dllInitMap;
+
+  NeuralNetwork* rootNetwork_;
+
+  /// Whether parameter of this NN is initialized by its own
+  /// (i.e., not by callback supplied with the caller)
+  bool paramSelfInited_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..450514ca88a319b30ca3ebae669c78502087540a
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.cpp
@@ -0,0 +1,213 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include "ParallelNeuralNetwork.h"
+
+#include <pthread.h>
+#include <sched.h>
+
+namespace paddle {
+
+void ParallelNeuralNetwork::init(
+    const ModelConfig& config,
+    ParamInitCallback callback,
+    const std::vector<ParameterType>& parameterTypes,
+    bool useGpu) {
+  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
+
+  if (config.type() == "recurrent_nn") {
+    LOG(FATAL)
+        << "You can not add `--parallel_nn=true` on the command line, "
+        << "parallel_nn training mode does not support the recurrent_nn model.";
+  }
+
+  useGpu_ = useGpu;
+  numDevices_ = 0;
+  if (useGpu_) {
+    numDevices_ = hl_get_device_count();
+  }
+
+  for (auto& layer : layers_) {
+    int deviceId = layer->getDeviceId();
+    CHECK_LT(deviceId, numDevices_);
+    addComputeThread(deviceId);
+  }
+}
+
+void ParallelNeuralNetwork::addComputeThread(int deviceId) {
+  for (auto& thread : threads_) {
+    if (thread->getDeviceId() == deviceId) {
+      return;
+    }
+  }
+
+  threads_.emplace_back(new ParallelThread(
+      threads_.size(), deviceId, deviceId >= 0 ? useGpu_ : false));
+}
+
+void ParallelNeuralNetwork::waitAllThread() {
+  for (auto& thread : threads_) {
+    thread->jobEnqueue(NULL, TASK_END_LAYER);
+  }
+
+  for (size_t i = 0; i < threads_.size(); i++) {
+    threads_[i]->queue_.waitEmpty();
+  }
+}
+
+void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId,
+                                               LayerPtr layer,
+                                               TaskType task) {
+  for (auto& thread : threads_) {
+    if (thread->getDeviceId() == deviceId) {
+      thread->jobEnqueue(layer, task);
+      return;
+    }
+  }
+  LOG(FATAL) << "No specific device thread ";
+}
+
+void ParallelNeuralNetwork::forward(const std::vector<Argument>& inArgs,
+                                    std::vector<Argument>* outArgs,
+                                    PassType passType) {
+  for (auto& thread : threads_) {
+    thread->setForwardPassType(passType);
+  }
+  CHECK_EQ(inArgs.size(), dataLayers_.size());
+  outArgs->resize(outputLayers_.size());
+  for (size_t i = 0; i != dataLayers_.size(); ++i) {
+    const_cast<Argument&>(inArgs[i]).deviceId = -1;
+    dataLayers_[i]->setData(inArgs[i]);
+  }
+
+  for (auto& layer : layers_) {
+    dispatchByDeviceId(layer->getDeviceId(), layer, TASK_FORWARD);
+  }
+
+  {
+    REGISTER_TIMER("forwardTime");
+    waitAllThread();
+  }
+  outArgs->clear();
+  outArgs->reserve(outputLayers_.size());
+  for (auto& layer : outputLayers_) {
+    outArgs->push_back(layer->getOutput());
+  }
+}
+
+void ParallelNeuralNetwork::backward(const UpdateCallback& callback) {
+  for (auto& thread : threads_) {
+    thread->setBackwardCallback(callback);
+  }
+
+  FOR_EACH_R(layer, layers_) {
+    dispatchByDeviceId((*layer)->getDeviceId(), *layer, TASK_BACKWARD);
+  }
+  {
+    REGISTER_TIMER("backwardTime");
+    waitAllThread();
+  }
+}
+
+void ParallelNeuralNetwork::forwardBackward(const std::vector<Argument>& inArgs,
+                                            std::vector<Argument>* outArgs,
+                                            PassType passType,
+                                            const UpdateCallback& callback) {
+  forward(inArgs, outArgs, passType);
+  backward(callback);
+}
+
+void ParallelNeuralNetwork::start() {
+  for (auto& thread : threads_) {
+    thread->start();
+  }
+}
+
+ParallelThread::ParallelThread(int threadId, int deviceId, bool useGpu)
+    : threadId_(threadId), deviceId_(deviceId), useGpu_(useGpu) {}
+
+ParallelThread::~ParallelThread() { stop(); }
+
+void ParallelThread::stop() {
+  if (computeThread_) {
+    jobEnqueue(NULL, TASK_THREAD_FINISH);
+    computeThread_->join();
+    computeThread_.reset(nullptr);
+  }
+}
+
+void ParallelThread::computeThread() {
+  LOG(INFO) << "gradComputeThread " << threadId_;
+
+  if (useGpu_) {
+    hl_init(deviceId_);
+  }
+
+  while (true) {
+    struct Job job_work = queue_.dequeue();
+
+    if (job_work.task_ == TASK_END_LAYER) {
+      continue;
+    } else if (job_work.task_ == TASK_THREAD_FINISH) {
+      break;
+    }
+
+    if (TASK_FORWARD == job_work.task_) {
+      {
+        REGISTER_TIMER_INFO("waitInputValue",
+                            job_work.layer_->getName().c_str());
+        job_work.layer_->waitInputValue();
+      }
+      {
+        REGISTER_TIMER_INFO("threadForwardTimer",
+                            job_work.layer_->getName().c_str());
+        job_work.layer_->forward(passType_);
+      }
+      {
+        REGISTER_TIMER_INFO("copyOutputToOtherDevice",
+                            job_work.layer_->getName().c_str());
+        job_work.layer_->copyOutputToOtherDevice();
+      }
+    } else {
+      {
+        REGISTER_TIMER_INFO("waitAndMergeOutputGrad",
+                            job_work.layer_->getName().c_str());
+        job_work.layer_->waitAndMergeOutputGrad();
+      }
+      {
+        REGISTER_TIMER_INFO("threadBackwardTimer",
+                            job_work.layer_->getName().c_str());
+        job_work.layer_->backward(backwardCallback_);
+      }
+      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+      job_work.layer_->markAllInputGrad();
+    }
+  }
+}
+
+void ParallelThread::start() {
+  computeThread_.reset(new std::thread([this]() { computeThread(); }));
+}
+
+void ParallelThread::jobEnqueue(LayerPtr layer, TaskType task) {
+  struct Job job_work;
+  job_work.layer_ = layer;
+  job_work.task_ = task;
+  queue_.enqueue(job_work);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h
new file mode 100644
index 0000000000000000000000000000000000000000..c091459506ad477bed3f429a22071eccedd664bb
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/ParallelNeuralNetwork.h
@@ -0,0 +1,113 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "NeuralNetwork.h"
+
+namespace paddle {
+
+class ParallelThread;
+
+enum TaskType {
+  TASK_FORWARD = 0,
+  TASK_BACKWARD = 1,
+  TASK_END_LAYER = 2,
+  TASK_THREAD_FINISH = 3,
+};
+
+/**
+ * A ParallelNeuralNetwork is capable of calculating a neural network through
+ * multiple threads in parallel.
+ */
+class ParallelNeuralNetwork : public NeuralNetwork {
+ public:
+  ParallelNeuralNetwork(std::string subModelName = "",
+                        NeuralNetwork *rootNetwork = nullptr)
+      : NeuralNetwork(subModelName, rootNetwork) {}
+
+  virtual void init(const ModelConfig &config,
+                    ParamInitCallback callback = nullptr,
+                    const std::vector<ParameterType> &parameterTypes =
+                        std::vector<ParameterType>{PARAMETER_VALUE,
+                                                   PARAMETER_GRADIENT,
+                                                   PARAMETER_MOMENTUM},
+                    bool useGpu = FLAGS_use_gpu);
+
+  virtual void forward(const std::vector<Argument> &inArgs,
+                       std::vector<Argument> *outArgs,
+                       PassType passType);
+
+  virtual void backward(const UpdateCallback &callback = nullptr);
+
+  void forwardBackward(const std::vector<Argument> &inArgs,
+                       std::vector<Argument> *outArgs,
+                       PassType passType,
+                       const UpdateCallback &callback = NULL);
+
+  virtual void start();
+
+  void addComputeThread(int deviceId);
+
+  void dispatchByDeviceId(int deviceId, LayerPtr layer, TaskType task);
+
+  void waitAllThread();
+
+  // virtual void eval(Evaluator* evaluator);
+
+ protected:
+  bool useGpu_;
+  /// number of gpu devices
+  int numDevices_;
+  std::vector<std::unique_ptr<ParallelThread>> threads_;
+};
+
+class ParallelThread {
+ public:
+  ParallelThread(int threadId, int deviceId, bool useGpu);
+  ~ParallelThread();
+  void jobEnqueue(LayerPtr layer, TaskType task);
+  void start();
+  void stop();
+  int getDeviceId() const { return deviceId_; }
+
+  void setBackwardCallback(const UpdateCallback &callback) {
+    backwardCallback_ = callback;
+  }
+  void setForwardPassType(PassType passType) { passType_ = passType; }
+
+ protected:
+  void computeThread();
+
+ public:
+  struct Job {
+    LayerPtr layer_;
+    TaskType task_;
+  };
+  typedef Queue<Job> JobQueue;
+  JobQueue queue_;
+
+ protected:
+  /// from 0 to threads-1
+  int threadId_;
+  /// the GPU device Id which the computeThread_ used
+  int deviceId_;
+  bool useGpu_;
+  std::unique_ptr<std::thread> computeThread_;
+  /// whether the thread should stop
+  bool stopping_;
+  UpdateCallback backwardCallback_;
+  PassType passType_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e49f042404f80a21293545023efa3e68417c1edb
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -0,0 +1,1501 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RecurrentGradientMachine.h"
+#include <dlfcn.h>
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <limits>
+#include "NeuralNetwork.h"
+#include "paddle/legacy/gserver/layers/AgentLayer.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+
+DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
+
+static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob";
+static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob";
+static const char* DIY_FINISH_CALC_PROB_SYMBOL_NAME = "finish_calc_prob";
+
+namespace paddle {
+
+/**
+ * Start Custom Calculate Probability callback type.
+ *
+ * @param nNode, nodes: the path will be explored. nNodes is array size.
+ *                      nodes is array elements.
+ *
+ * @return: A custom handler id that will passed to another callback.
+ */
+typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes);
+
+/**
+ * Doing Custom Calculation of Probability callback type.
+ *
+ * @param handler: User custom handler. The return value from start calc prob.
+ * @param nNode, nodes: Array. The current path.
+ * @param curProb: The current log probability that neural network returns.
+ *
+ * @return: Log probability which user calculated, it will be updated to this
+ *          path.
+ * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!!
+ */
+typedef real (*DiyCalcProbCallback)(
+    int handler, size_t nNodes, int* nodes, real curProb, bool atEos);
+
+/**
+ * Finish Custom Calculation of Probability callback type.
+ *
+ * @param handler: User custom handler. The return value from start calc prob.
+ */
+typedef void (*DiyStopCalcProbCallback)(int handler);
+
+static DiyCalcProbCallback gDiyProbMethod = nullptr;
+static DiyStartCalcProbCallback gDiyProbStart = nullptr;
+static DiyStopCalcProbCallback gDiyProbStop = nullptr;
+static void* gDiyProbHandle = nullptr;
+
+static void exit_diy_prob() { dlclose(gDiyProbHandle); }
+
+template <typename SymbolType>
+static inline SymbolType loadDiySymbol(const char* symbolName) {
+  void* sym = dlsym(gDiyProbHandle, symbolName);
+  CHECK(sym) << "Cannot load symbol " << symbolName << " from "
+             << FLAGS_diy_beam_search_prob_so;
+  return reinterpret_cast<SymbolType>(sym);
+}
+
+static InitFunction __init__diy_prob_method(
+    [] {
+      std::string soName = FLAGS_diy_beam_search_prob_so;
+      if (!soName.empty()) {
+        gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
+        CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
+        atexit(exit_diy_prob);
+        gDiyProbMethod =
+            loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStart = loadDiySymbol<decltype(gDiyProbStart)>(
+            DIY_START_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStop = loadDiySymbol<decltype(gDiyProbStop)>(
+            DIY_FINISH_CALC_PROB_SYMBOL_NAME);
+      }
+    },
+    std::numeric_limits<int>::max());
+
+class BeamSearchControlCallbacks {
+ public:
+  RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback
+      beamSearchCandidateAdjust;
+  RecurrentGradientMachine::NormOrDropNodeCallback normOrDropNode;
+  RecurrentGradientMachine::DropCallback stopDetermineCandidates;
+
+  //! for gcc46 aggregate initialization is not very well, so we need to
+  //! explicit
+  BeamSearchControlCallbacks(
+      const RecurrentGradientMachine::BeamSearchCandidatesAdjustCallback&
+          candidateAdjust,
+      const RecurrentGradientMachine::NormOrDropNodeCallback& norm,
+      const RecurrentGradientMachine::DropCallback& stop)
+      : beamSearchCandidateAdjust(candidateAdjust),
+        normOrDropNode(norm),
+        stopDetermineCandidates(stop) {}
+};
+
+class BeamSearchStatisticsCallbacks {
+ public:
+  RecurrentGradientMachine::EachStepCallback onEachStepStarted;
+  RecurrentGradientMachine::EachStepCallback onEachStepStoped;
+
+  BeamSearchStatisticsCallbacks(
+      const RecurrentGradientMachine::EachStepCallback& start,
+      const RecurrentGradientMachine::EachStepCallback& stop)
+      : onEachStepStarted(start), onEachStepStoped(stop) {}
+};
+
+RecurrentGradientMachine::RecurrentGradientMachine(
+    const std::string& subModelName, NeuralNetwork* rootNetwork)
+    : NeuralNetwork(subModelName),
+      rootNetwork_(rootNetwork),
+      beamSearchCtrlCallbacks_(nullptr),
+      beamSearchStatistics_(nullptr) {
+  CHECK(!subModelName_.empty());
+}
+
+/**
+ * bias layer, as input of memory frame 0 will give vector of zeros
+ * if bias parameter is not set.
+ *
+ * boot bias layer create directly in recurrent gradient machine, because:
+ *
+ * 1. It is only one frame, so it should not be placed in layer group,
+ *    which is one instance for every one frame.
+ *
+ * 2. It is no input layer, so it need resetHeight() before forward(),
+ *    and resetHeight() must be called in recurrent gradient machine,
+ *    so it's should not be placed in root network.
+ */
+class BootBiasLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+  IVectorPtr cpuIds_;
+
+ public:
+  explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    if (!Layer::init(layerMap, parameterMap)) return false;
+
+    if (biasParameter_) {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+    }
+    return true;
+  }
+
+  void resetHeight(int height) {
+    if (config_.has_bos_id()) {  // used as a constant id layerConfig
+      IVector::resizeOrCreate(output_.ids, height, useGpu_);
+      output_.ids->reset((int)config_.bos_id());
+    } else {
+      resetOutput(height, getSize());
+    }
+  }
+
+  void forward(PassType passType) override {
+    if (biases_) {
+      MatrixPtr outV = getOutputValue();
+      outV->addBias(*(biases_->getW()), 1);
+      forwardActivation();
+    }
+  }
+
+  void backward(const UpdateCallback& callback) override {
+    if (biases_ && biases_->getWGrad()) {
+      backwardActivation();
+      biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+      biases_->getParameterPtr()->incUpdate(callback);
+    }
+  }
+};
+
+void RecurrentGradientMachine::init(
+    const ModelConfig& config,
+    ParamInitCallback callback,
+    const std::vector<ParameterType>& parameterTypes,
+    bool useGpu) {
+  NeuralNetwork::init(config, callback, parameterTypes, useGpu);
+  useGpu_ = useGpu;
+
+  auto subModelConfig =
+      std::find_if(config.sub_models().begin(),
+                   config.sub_models().end(),
+                   [this](const SubModelConfig& sub_model) {
+                     return sub_model.name() == this->subModelName_;
+                   });
+  CHECK(subModelConfig != config.sub_models().end());
+  reversed_ = subModelConfig->reversed();
+  generating_ = subModelConfig->has_generator();
+
+  inFrameLines_.resize(subModelConfig->in_links_size());
+  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
+    inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name();
+    inFrameLines_[i].inLayer =
+        rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name());
+  }
+
+  outFrameLines_.resize(subModelConfig->out_links_size());
+  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
+    auto& linkPair = subModelConfig->out_links(i);
+    outFrameLines_[i].layerName = linkPair.layer_name();
+    outFrameLines_[i].agentLayer = rootNetwork_->getLayer(linkPair.link_name());
+  }
+
+  memoryFrameLines_.resize(subModelConfig->memories_size());
+  for (size_t i = 0; i < memoryFrameLines_.size(); ++i) {
+    auto& memoryConfig = subModelConfig->memories(i);
+    memoryFrameLines_[i].layerName = memoryConfig.layer_name();
+    memoryFrameLines_[i].linkName = memoryConfig.link_name();
+    auto agentConfig =
+        std::find_if(config.layers().begin(),
+                     config.layers().end(),
+                     [&memoryConfig](const LayerConfig& layerConfig) {
+                       return layerConfig.name() == memoryConfig.link_name();
+                     });
+    CHECK(agentConfig != config.layers().end());
+    if (memoryConfig.has_boot_layer_name()) {
+      memoryFrameLines_[i].rootLayer =
+          rootNetwork_->getLayer(memoryConfig.boot_layer_name());
+
+      LayerConfig scatterConfig = *agentConfig;
+      memoryFrameLines_[i].rootAgent.reset(
+          new ScatterAgentLayer(scatterConfig));
+      memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_);
+
+      memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent;
+    } else {
+      LayerConfig biasConfig = *agentConfig;
+      if (memoryConfig.has_boot_bias_parameter_name()) {
+        biasConfig.set_bias_parameter_name(
+            memoryConfig.boot_bias_parameter_name());
+        biasConfig.set_active_type(memoryConfig.boot_bias_active_type());
+      } else if (memoryConfig.has_boot_with_const_id()) {
+        biasConfig.set_bos_id(memoryConfig.boot_with_const_id());
+      }
+      memoryFrameLines_[i].biasLayer.reset(new BootBiasLayer(biasConfig));
+      memoryFrameLines_[i].biasLayer->init(LayerMap(), parameterMap_);
+
+      memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].biasLayer;
+    }
+
+    if (subModelConfig->has_generator()) {
+      memoryFrameLines_[i].scatterAgents.resize(2);
+      for (auto& agent : memoryFrameLines_[i].scatterAgents) {
+        agent.reset(new ScatterAgentLayer(*agentConfig));
+        agent->init(LayerMap(), parameterMap_);
+      }
+    }
+  }
+
+  if (subModelConfig->has_generator()) {
+    generator_.config = subModelConfig->generator();
+    eosFrameLine_.reset(new EosFrameLine);
+    maxSequenceLength_ = generator_.config.max_num_frames();
+  }
+
+  // get parameters actually used by this Layer Group
+  resizeOrCreateFrames(1);
+  for (auto& para : frames_[0]->getParameters()) {
+    if (para->getSharedCount() > 0) {
+      parameterIds_.push_back(para->getID());
+    }
+  }
+  for (auto& para : parameters_) {  // bias layer parameters
+    if (para->getSharedCount() > 0) {
+      parameterIds_.push_back(para->getID());
+    }
+  }
+}
+
+void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) {
+  if ((size_t)numFrames <= frames_.size()) {
+    return;
+  }
+
+  frames_.reserve(numFrames);
+  for (auto& inFrameLine : inFrameLines_) {
+    inFrameLine.agents.reserve(numFrames);
+  }
+  for (auto& outFrameLine : outFrameLines_) {
+    outFrameLine.frames.reserve(numFrames);
+  }
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    memoryFrameLine.frames.reserve(numFrames);
+    memoryFrameLine.agents.reserve(numFrames);
+  }
+  if (eosFrameLine_) {
+    eosFrameLine_->layers.reserve(numFrames);
+  }
+
+  ParamInitCallback subParamInitCb = [this](int paramId, Parameter* para) {
+    para->enableSharedType(PARAMETER_VALUE,
+                           this->parameters_[paramId]->getBuf(PARAMETER_VALUE),
+                           this->parameters_[paramId]->getMat(PARAMETER_VALUE));
+    para->enableSharedType(
+        PARAMETER_GRADIENT,
+        this->parameters_[paramId]->getBuf(PARAMETER_GRADIENT),
+        this->parameters_[paramId]->getMat(PARAMETER_GRADIENT));
+  };
+
+  for (int i = frames_.size(); i < numFrames; ++i) {
+    std::unique_ptr<NeuralNetwork> frame(
+        NeuralNetwork::newNeuralNetwork(subModelName_));
+    frame->init(config_, subParamInitCb);
+
+    for (auto& inFrameLine : inFrameLines_) {
+      inFrameLine.agents.push_back(frame->getLayer(inFrameLine.linkName));
+    }
+
+    for (auto& outFrameLine : outFrameLines_) {
+      outFrameLine.frames.push_back(frame->getLayer(outFrameLine.layerName));
+    }
+    for (auto& memoryFrameLine : memoryFrameLines_) {
+      memoryFrameLine.frames.push_back(
+          frame->getLayer(memoryFrameLine.layerName));
+      memoryFrameLine.agents.push_back(
+          frame->getLayer(memoryFrameLine.linkName));
+    }
+    if (eosFrameLine_) {
+      eosFrameLine_->layers.push_back(
+          frame->getLayer(generator_.config.eos_layer_name()));
+    }
+
+    frames_.emplace_back(std::move(frame));
+  }
+}
+
+void RecurrentGradientMachine::resizeBootFrame(int numSequences) {
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    if (memoryFrameLine.biasLayer) {
+      auto biasLayer =
+          dynamic_cast<BootBiasLayer*>(memoryFrameLine.biasLayer.get());
+      CHECK_NOTNULL(biasLayer);
+      biasLayer->resetHeight(numSequences);
+    } else {  // check input root layer height
+      CHECK_EQ(numSequences,
+               memoryFrameLine.rootLayer->getOutput().getNumSequences());
+    }
+  }
+}
+
+void RecurrentGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
+  LOG(FATAL) << "should not use this function";
+}
+
+void RecurrentGradientMachine::checkInputConsistency(
+    int inlinkId, const std::vector<Argument::SeqInfo>& seqInfo) {
+  if (commonSeqInfo_.empty()) {
+    commonSeqInfo_.resize(seqInfo.size());
+    for (size_t i = 0; i < seqInfo.size(); ++i) {
+      commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength;
+      commonSeqInfo_[i].seqId = seqInfo[i].seqId;
+    }
+  } else {
+    CHECK_EQ(commonSeqInfo_.size(), seqInfo.size())
+        << " RecurrentGroup " << subModelName_ << " input " << inlinkId
+        << " has mismatched number of sequences";
+    for (size_t i = 0; i < seqInfo.size(); ++i) {
+      CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength)
+          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
+          << " has mismatched sequence length";
+      CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId)
+          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
+          << " has mismatched sequence length";
+    }
+  }
+}
+
+void RecurrentGradientMachine::calcNumSequencesAtEachStep() {
+  int numSequences = commonSeqInfo_.size();
+  numSeqs_.resize(maxSequenceLength_);
+  for (int i = 0; i < numSequences; ++i) {
+    for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) {
+      numSeqs_[j] = i + 1;
+    }
+  }
+}
+
+void RecurrentGradientMachine::reorganizeInput(PassType passType) {
+  info_.clear();
+  info_.resize(inFrameLines_.size());
+
+  commonSeqInfo_.clear();
+  seqInfos_.clear();
+  seqInfos_.resize(inFrameLines_.size());
+
+  for (size_t i = 0; i < inFrameLines_.size(); i++) {
+    const Argument& input = inFrameLines_[i].inLayer->getOutput();
+    if (!input.hasSeq()) {
+      continue;
+    }
+    input.getSeqInfo(&seqInfos_[i]);
+    checkInputConsistency(i, seqInfos_[i]);
+  }
+  CHECK(!commonSeqInfo_.empty())
+      << "At least one input needs to be sequence or subsequence";
+  maxSequenceLength_ = commonSeqInfo_[0].topLevelLength;
+
+  calcNumSequencesAtEachStep();
+
+  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
+    const Argument& input = inFrameLines_[i].inLayer->getOutput();
+    if (!input.hasSeq()) {
+      seqInfos_[i] = commonSeqInfo_;
+    }
+    createInFrameInfo(i, input, passType);
+  }
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+
+    // inFrameLine select rows in real layer one time
+    for (size_t i = 0; i < inFrameLines_.size(); i++) {
+      selectRowsOneTime(inFrameLines_[i].inLayer,
+                        info_[i].allIds,
+                        &(inFrameLines_[i].outArg),
+                        passType);
+    }
+  }
+}
+
+void RecurrentGradientMachine::reorganizeOutput(PassType passType) {
+  calcSequenceStartPositions();
+  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
+    Info info;
+    auto& outFrameLine = outFrameLines_[i];
+    ICpuGpuVectorPtr sequenceStartPositions;
+    ICpuGpuVectorPtr subSequenceStartPositions;
+    createOutFrameInfo(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+    auto gatherAgent =
+        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
+    CHECK_NOTNULL(gatherAgent);
+    gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions,
+                                       subSequenceStartPositions,
+                                       info.allIds,
+                                       info.idIndex);
+  }
+}
+
+void RecurrentGradientMachine::connectFrames(PassType passType) {
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    if (memoryFrameLine.rootAgent) {
+      auto scatterAgent =
+          dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
+      createMemoryFrameInfo(&memoryFrameLine, passType);
+      scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer,
+                                          memoryFrameLine.outArg,
+                                          memoryFrameLine.allIds,
+                                          /* idIndex */ 0,
+                                          memoryFrameLine.allIds->getSize(),
+                                          /* handleBackward */ true);
+      if (memoryFrameLine.sequenceStartPositions) {
+        int size = memoryFrameLine.sequenceStartPositions->getSize();
+        scatterAgent->setSequenceStartPositions(
+            memoryFrameLine.sequenceStartPositions,
+            /* seqStartPosIndex */ 0,
+            size);
+      }
+    }
+  }
+
+  for (auto& outFrameLine : outFrameLines_) {
+    auto gatherAgent =
+        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
+    gatherAgent->clearRealLayers();
+  }
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    // connect in_links
+    for (size_t j = 0; j < inFrameLines_.size(); ++j) {
+      Info& info = info_[j];
+      // idSize denotes the sum number of tokens in each length i
+      int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i];
+      int idSize = info.idIndex.empty() ? numSeqs_[i]
+                                        : info.idIndex[i + 1] - info.idIndex[i];
+      InFrameLine inFrameLine = inFrameLines_[j];
+      auto scatterAgent =
+          dynamic_cast<ScatterAgentLayer*>(inFrameLine.agents[i].get());
+      scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer,
+                                          inFrameLine.outArg,
+                                          info.allIds,
+                                          idIndex,
+                                          idSize,
+                                          i == 0);
+      if (info.sequenceStartPositions) {
+        // size: the length of subsequence
+        int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i];
+        scatterAgent->setSequenceStartPositions(
+            info.sequenceStartPositions, info.seqStartPosIndex[i], size);
+      }
+    }
+
+    // connect out_links
+    for (auto& outFrameLine : outFrameLines_) {
+      auto gatherAgent =
+          dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
+      gatherAgent->addRealLayer(outFrameLine.frames[i]);
+    }
+    for (auto& memoryFrameLine : memoryFrameLines_) {
+      NeuralNetwork::connect(
+          memoryFrameLine.agents[i],
+          i == 0 ? memoryFrameLine.bootLayer : memoryFrameLine.frames[i - 1],
+          numSeqs_[i] /*height of agent*/);
+    }
+  }
+}
+
+void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
+                                       std::vector<Argument>* outArgs,
+                                       PassType passType) {
+  /* inArgs and outArgs are not used.
+     The inputs are inFrameLines_[i].inLayer.
+     The outputs are outFramesLines_[i].agentLayer
+   */
+
+  if (generating_) {
+    generateSequence();
+    return;
+  }  // else forward..
+
+  reorganizeInput(passType);
+  int numSequences = commonSeqInfo_.size();
+
+  resizeOrCreateFrames(maxSequenceLength_);
+  resizeBootFrame(numSequences);
+
+  connectFrames(passType);
+
+  REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime");
+  // forward
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    memoryFrameLine.bootLayer->forward(passType);
+  }
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    const std::vector<Argument> inArgs;
+    std::vector<Argument> outArgs;
+    frames_[i]->forward(inArgs, &outArgs, passType);
+  }
+
+  reorganizeOutput(passType);
+}
+
+void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
+  if (generating_) {
+    return;
+  }
+  REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime");
+  AsyncGpuBlock asyncGpuBlock;
+  for (int i = maxSequenceLength_ - 1; i >= 0; --i) {
+    frames_[i]->backward(nullptr);
+  }
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    memoryFrameLine.bootLayer->backward(nullptr);
+  }
+}
+
+void RecurrentGradientMachine::forwardBackward(
+    const std::vector<Argument>& inArgs,
+    std::vector<Argument>* outArgs,
+    PassType passType,
+    const UpdateCallback& callback) {
+  LOG(FATAL) << "should not use this function";
+}
+
+void RecurrentGradientMachine::eval(Evaluator* evaluator) const {
+  // call printers frame by frame
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin";
+    evaluator->eval(*(frames_[i].get()));
+    VLOG(2) << "Recurrent Layer Group eval frame " << i << " end";
+  }
+}
+
+void RecurrentGradientMachine::registerBeamSearchControlCallbacks(
+    const BeamSearchCandidatesAdjustCallback& adjustBeamSearch,
+    const NormOrDropNodeCallback& normOrDropNode,
+    const DropCallback& stopBeamSearch) {
+  this->removeBeamSearchControlCallbacks();
+  //! for gcc 46, aggregate initialization is not supported. TAT
+  this->beamSearchCtrlCallbacks_ = new BeamSearchControlCallbacks(
+      adjustBeamSearch, normOrDropNode, stopBeamSearch);
+}
+
+void RecurrentGradientMachine::removeBeamSearchControlCallbacks() {
+  if (this->beamSearchCtrlCallbacks_) {
+    delete this->beamSearchCtrlCallbacks_;
+    this->beamSearchCtrlCallbacks_ = nullptr;
+  }
+}
+
+void RecurrentGradientMachine::registerBeamSearchStatisticsCallbacks(
+    const EachStepCallback& onEachStepStarted,
+    const EachStepCallback& onEachStepStoped) {
+  this->removeBeamSearchStatisticsCallbacks();
+  this->beamSearchStatistics_ =
+      new BeamSearchStatisticsCallbacks(onEachStepStarted, onEachStepStoped);
+}
+
+void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
+  if (this->beamSearchStatistics_) {
+    delete this->beamSearchStatistics_;
+    this->beamSearchStatistics_ = nullptr;
+  }
+}
+
+namespace {
+void lenToStarts(std::vector<int>& starts) {
+  int pos = 0;
+  starts.back() = 0;
+  for (auto& start : starts) {
+    int tmp = start;
+    start = pos;
+    pos += tmp;
+  }
+  starts.back() = pos;
+}
+}  // namespace
+
+void RecurrentGradientMachine::calcSequenceStartPositions() {
+  std::vector<int> starts(commonSeqInfo_.size() + 1);
+  for (auto& seqInfo : commonSeqInfo_) {
+    starts[seqInfo.seqId] = seqInfo.topLevelLength;
+  }
+  lenToStarts(starts);
+  ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false);
+  std::copy(starts.begin(),
+            starts.end(),
+            sequenceStartPositions_->getMutableData(false));
+}
+
+void RecurrentGradientMachine::checkOutputConsistency(
+    OutFrameLine& outFrameLine) {
+  bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq();
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    CHECK_EQ(hasSeq, frame->getOutput().hasSeq());
+    int numSequences = frame->getOutput().getNumSequences();
+    CHECK_EQ(numSeqs_[i], numSequences);
+  }
+}
+
+void RecurrentGradientMachine::createOutFrameInfo(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  checkOutputConsistency(outFrameLine);
+
+  if (!outFrameLine.frames[0]->getOutput().hasSeq()) {
+    createOutFrameInfo_seq(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+  } else {
+    createOutFrameInfo_subseq(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+  }
+}
+
+void RecurrentGradientMachine::createOutFrameInfo_seq(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  std::vector<int> allIds;
+  info.idIndex.resize(1, 0);  // first idIndex = 0
+
+  const int* starts = sequenceStartPositions_->getData(false);
+
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    size_t numSequences = frame->getOutput().getNumSequences();
+    for (size_t j = 0; j < numSequences; ++j) {
+      int seqStart = starts[commonSeqInfo_[j].seqId];
+      int seqLength = commonSeqInfo_[j].topLevelLength;
+      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
+                                 : (seqStart + i));
+    }
+    info.idIndex.push_back(allIds.size());
+  }
+  sequenceStartPositions = sequenceStartPositions_;
+  copyScattedId(allIds, &info.allIds, allIds.size());
+  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
+}
+
+void RecurrentGradientMachine::createOutFrameInfo_subseq(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  size_t numSequences = commonSeqInfo_.size();
+  std::vector<int> allIds;
+  info.idIndex.resize(1, 0);  // first idIndex = 0
+
+  const int* starts = sequenceStartPositions_->getData(false);
+  std::vector<int> subStarts(starts[numSequences] + 1);
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    size_t numSequences = frame->getOutput().getNumSequences();
+    const int* seqStarts =
+        frame->getOutput().sequenceStartPositions->getData(false);
+    for (size_t j = 0; j < numSequences; ++j) {
+      subStarts[starts[commonSeqInfo_[j].seqId] + i] =
+          seqStarts[j + 1] - seqStarts[j];
+    }
+  }
+  lenToStarts(subStarts);
+
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    size_t numSequences = frame->getOutput().getNumSequences();
+    for (size_t j = 0; j < numSequences; ++j) {
+      int pos = starts[commonSeqInfo_[j].seqId] + i;
+      int subSeqStart = subStarts[pos];
+      int subSeqEnd = subStarts[pos + 1];
+      for (int k = subSeqStart; k < subSeqEnd; ++k) {
+        allIds.push_back(k);
+      }
+    }
+    info.idIndex.push_back(allIds.size());
+  }
+
+  ICpuGpuVector::resizeOrCreate(
+      subSequenceStartPositions, subStarts.size(), false);
+  int* cpuSubSequenceStartPositions =
+      subSequenceStartPositions->getMutableData(false);
+  std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions);
+  ICpuGpuVector::resizeOrCreate(
+      sequenceStartPositions, numSequences + 1, false);
+  int* cpuSequenceStartPositions =
+      sequenceStartPositions->getMutableData(false);
+  for (size_t i = 0; i <= numSequences; ++i) {
+    cpuSequenceStartPositions[i] = subStarts[starts[i]];
+  }
+  copyScattedId(allIds, &info.allIds, allIds.size());
+  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
+}
+
+/* create scattered id infomation for all realLayer of inFrameLines one time.
+ * If hasSubseq, will also create scattered sequenceStartPositions infomation
+ * for all realLayer of inFrameLines one time.
+ */
+void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
+                                                 const Argument& input,
+                                                 PassType passType) {
+  if (!input.hasSeq()) {
+    createInFrameInfo_nonseq(inlinkId, input, passType);
+  } else if (!input.hasSubseq()) {
+    createInFrameInfo_seq(inlinkId, input, passType);
+  } else {
+    createInFrameInfo_subseq(inlinkId, input, passType);
+  }
+}
+
+void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId,
+                                                        const Argument& input,
+                                                        PassType passType) {
+  std::vector<int> allIds;
+
+  auto& seqInfo = seqInfos_[inlinkId];
+  Info* inlinkInfo = &info_[inlinkId];
+  inlinkInfo->idIndex.clear();
+  for (size_t i = 0; i < seqInfo.size(); ++i) {
+    allIds.push_back(seqInfo[i].seqId);
+  }
+  // copy and check scatterId
+  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
+}
+
+void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId,
+                                                     const Argument& input,
+                                                     PassType passType) {
+  std::vector<int> allIds;
+  auto& seqInfo = seqInfos_[inlinkId];
+  Info* inlinkInfo = &info_[inlinkId];
+  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
+
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    for (int j = 0; j < numSeqs_[i]; ++j) {
+      int seqLength = seqInfo[j].topLevelLength;
+      int seqStart = seqInfo[j].seqStart;
+      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
+                                 : (seqStart + i));
+    }
+    inlinkInfo->idIndex.push_back(allIds.size());
+  }
+
+  // copy and check scatterId
+  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
+  CHECK_EQ(inlinkInfo->idIndex.size(),
+           static_cast<size_t>(maxSequenceLength_ + 1));
+}
+void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId,
+                                                        const Argument& input,
+                                                        PassType passType) {
+  std::vector<int> allIds;
+
+  auto& seqInfo = seqInfos_[inlinkId];
+
+  Info* inlinkInfo = &info_[inlinkId];
+  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
+  std::vector<int> sequenceStartPositions;
+  const int* subSequenceStartPositions = nullptr;
+
+  subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
+  inlinkInfo->seqStartPosIndex.clear();
+  inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    sequenceStartPositions.push_back(0);  // first element = 0
+    for (int j = 0; j < numSeqs_[i]; ++j) {
+      int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
+      int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
+      for (int k = subSeqStart; k < subSeqEnd; ++k) {
+        allIds.push_back(k);
+      }
+      sequenceStartPositions.push_back(sequenceStartPositions.back() +
+                                       subSeqEnd - subSeqStart);
+    }
+    inlinkInfo->idIndex.push_back(allIds.size());
+    inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
+  }
+  // inFrameLine create sequenceStartPositions one time
+  CHECK_EQ(
+      sequenceStartPositions.size(),
+      static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
+  CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
+           static_cast<size_t>(maxSequenceLength_ + 1));
+  createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
+
+  // copy and check scatterId
+  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
+  CHECK_EQ(inlinkInfo->idIndex.size(),
+           static_cast<size_t>(maxSequenceLength_ + 1));
+}
+
+/* like createInFrameInfo, but for all realLayer of memoryFrameLines*/
+void RecurrentGradientMachine::createMemoryFrameInfo(
+    MemoryFrameLine* memoryFrameLine, PassType passType) {
+  const Argument& input = (*memoryFrameLine).rootLayer->getOutput();
+  size_t numSequences = input.getNumSequences();
+  std::vector<int> allIds;
+  bool seqFlag = input.hasSeq();
+  CHECK(!input.hasSubseq())
+      << "Subsequence boot layer for memory is not supported";
+
+  if (seqFlag) {  // for sequenceScatterAgentLayer
+    std::vector<int> sequenceStartPositions;
+    sequenceStartPositions.push_back(0);  // first element = 0
+    const int* starts = input.sequenceStartPositions->getData(false);
+    for (size_t i = 0; i < numSequences; ++i) {
+      // memory info adopt info of inlinks[0]
+      int seqId = seqInfos_[0][i].seqId;
+      for (int k = starts[seqId]; k < starts[seqId + 1]; ++k) {
+        allIds.push_back(k);
+      }
+      sequenceStartPositions.push_back(sequenceStartPositions.back() +
+                                       starts[seqId + 1] - starts[seqId]);
+    }
+    createSeqPos(sequenceStartPositions,
+                 &(*memoryFrameLine).sequenceStartPositions);
+
+  } else {  // for scatterAgentLayer
+    for (size_t i = 0; i < numSequences; ++i) {
+      allIds.push_back(seqInfos_[0][i].seqId);
+    }
+  }
+  // copy and check scatterId
+  copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize());
+  // memoryFrameLine select rows in real layer one time
+  selectRowsOneTime((*memoryFrameLine).rootLayer,
+                    (*memoryFrameLine).allIds,
+                    &(*memoryFrameLine).outArg,
+                    passType);
+}
+
+void RecurrentGradientMachine::copyScattedId(std::vector<int>& srcIds,
+                                             IVectorPtr* dstIds,
+                                             int size) {
+  int idSize = srcIds.size();
+  CHECK_EQ(idSize, size);
+  IVector::resizeOrCreate(*dstIds, idSize, useGpu_);
+  (*dstIds)->copyFrom(srcIds.data(), idSize);
+  // check
+  std::sort(srcIds.begin(), srcIds.end());
+  for (int i = 0; i < idSize; ++i) {
+    CHECK_EQ(srcIds[i], i);
+  }
+}
+
+void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer,
+                                                 const IVectorPtr& allIds,
+                                                 Argument* arg,
+                                                 PassType passType) {
+  Argument& src = layer->getOutput();
+  if (src.value) {
+    const MatrixPtr& realV = src.value;
+    int height = realV->getHeight();
+    int width = realV->getWidth();
+    Matrix::resizeOrCreate(
+        arg->value, height, width, /* trans */ false, useGpu_);
+    arg->value->zeroMem();
+    arg->value->selectRows(*realV, *allIds);
+    if (passType != PASS_TEST) {
+      Matrix::resizeOrCreate(
+          arg->grad, height, width, /* trans */ false, useGpu_);
+      arg->grad->zeroMem();
+    }
+  }
+  if (src.ids) {
+    IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_);
+    arg->ids->selectFrom(*src.ids, *allIds);
+  }
+}
+
+void RecurrentGradientMachine::createSeqPos(
+    const std::vector<int>& sequenceStartPosition,
+    ICpuGpuVectorPtr* sequenceStartPositions) {
+  int size = sequenceStartPosition.size();
+  const int* data = sequenceStartPosition.data();
+  ICpuGpuVector::resizeOrCreate(*sequenceStartPositions, size, false);
+  (*sequenceStartPositions)->copyFrom(data, size, false);
+}
+
+size_t RecurrentGradientMachine::getGenBatchSize() {
+  size_t numSequences = 0;
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    if (!memoryFrameLine.rootLayer) continue;
+    Argument& bootArg = memoryFrameLine.rootLayer->getOutput();
+    size_t batchSize = bootArg.getNumSequences();
+    if (numSequences) {
+      CHECK_EQ(numSequences, batchSize);
+    } else {
+      numSequences = batchSize;
+    }
+  }
+  CHECK(numSequences)
+      << "Fail to get batch size in generation. "
+         "At least one of the Memory layer MUST have a layer that is NOT in "
+         "the layer group to boot it, and this boot layer is used to "
+         "decide batch_size in generation process.";
+  return numSequences;
+}
+
+void RecurrentGradientMachine::generateSequence() {
+  CHECK_NOTNULL(eosFrameLine_.get());
+  CHECK_GE(outFrameLines_.size(), 1UL);
+  size_t numSequences = getGenBatchSize();
+
+  resizeBootFrame(numSequences);
+  // We create only two sub-network in generation, one stores states of all
+  // layers in previous time step and the other storing the states at current
+  // time step.
+  resizeOrCreateFrames(2);
+
+  // outFrameLines_.size() > 1UL
+  dataArgsSize_ = outFrameLines_.size() - 1;
+  dataArgs_.resize(dataArgsSize_);
+  dataArgsFrame_.clear();
+  dataArgsFrame_.resize(dataArgsSize_);
+
+  // connect boot frame memory links
+  std::vector<int> ids(numSequences);
+  for (size_t i = 0; i < numSequences; ++i) {
+    ids[i] = i;
+  }
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    if (memoryFrameLine.rootAgent) {
+      auto scatterAgent =
+          dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
+      scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids);
+    }
+    NeuralNetwork::connect(
+        memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size());
+  }
+
+  // boot layer forward
+  AsyncGpuBlock asyncGpuBlock;
+
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    memoryFrameLine.bootLayer->forward(PASS_TEST);
+  }
+
+  // init outArg
+  size_t resultNum = generator_.config.num_results_per_sample();
+  size_t maxGenWordCount =
+      generator_.config.max_num_frames() * numSequences * resultNum;
+  IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false);
+  if (resultNum > 1) {
+    CHECK_LE(resultNum, static_cast<size_t>(generator_.config.beam_size()));
+    Matrix::resizeOrCreate(generator_.outArg.in,
+                           /* height */ numSequences,
+                           /* width */ resultNum,
+                           false,
+                           /* useGpu */ false);
+  }
+  ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions,
+                                numSequences + 1,
+                                /* useGpu */ false);
+  if (getBeamSize() > 1) {
+    beamSearch(numSequences);
+  } else {
+    oneWaySearch(numSequences);
+  }
+  if (dataArgsSize_) createDataOutlink();
+
+  size_t size = generator_.ids.size();
+  generator_.outArg.ids->resize(size);
+  generator_.outArg.ids->copyFrom(generator_.ids.data(), size);
+
+  OutFrameLine& outFrameLine = outFrameLines_[0];
+  auto dataAgent = dynamic_cast<DataLayer*>(outFrameLine.agentLayer.get());
+  CHECK_NOTNULL(dataAgent);
+  dataAgent->setData(generator_.outArg);
+  dataAgent->prefetch();
+}
+
+void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
+  OutFrameLine& outFrameLine = outFrameLines_[0];
+
+  // finalPaths_[0] stores the generated results of the
+  // entire batch, so its size exactly equals to batchSize.
+  finalPaths_.clear();
+  finalPaths_.resize(1);
+  std::vector<Path>& finalPaths = finalPaths_[0];
+  finalPaths.resize(batchSize);
+
+  seqIds_.resize(batchSize);
+  std::vector<int> scatterIds;
+  for (size_t i = 0; i < batchSize; ++i) {
+    finalPaths[i].seqId = i;
+    seqIds_[i] = i;
+  }
+
+  // forward
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    if (i && scatterIds.empty()) break;
+    int machineCur = i % 2;
+    int machinePrev = (i - 1) % 2;
+    // connect memory links
+    if (i) {
+      seqIds_.clear();
+      for (size_t j = 0; j < batchSize; ++j) {
+        if (finalPaths[j].seqId != -1) seqIds_.push_back(j);
+      }
+
+      for (auto& memoryFrameLine : memoryFrameLines_) {
+        auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
+            memoryFrameLine.scatterAgents[machineCur].get());
+        scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
+                                   scatterIds);
+        scatterAgent->forward(PASS_TEST);
+        NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
+                               memoryFrameLine.scatterAgents[machineCur]);
+      }
+    }
+    const std::vector<Argument> inArgs;
+    std::vector<Argument> outArgs;
+    frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
+
+    const IVectorPtr& idVec = outFrameLine.frames[machineCur]->getOutput().ids;
+    for (size_t j = 0; j < seqIds_.size(); ++j) {
+      finalPaths[seqIds_[j]].ids.push_back(idVec->getElement(j));
+      finalPaths[seqIds_[j]].machineIdVec.push_back(j);
+    }
+
+    copyDataOutlinkFrame(machineCur);
+
+    // check eos
+    const IVectorPtr& eosVec =
+        eosFrameLine_->layers[machineCur]->getOutput().ids;
+    scatterIds.clear();
+    for (size_t j = 0; j < seqIds_.size(); ++j) {
+      if (eosVec->getElement(j) == 1U) {
+        // path.seqId = -1 indicates end of generation
+        // of an input sequence
+        finalPaths[seqIds_[j]].seqId = -1;
+      } else {
+        scatterIds.push_back(j);
+      }
+    }
+  }
+
+  batchMachineIdVec_.clear();
+  batchMachineStartPos_.clear();
+  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
+  starts[0] = 0;
+  generator_.ids.clear();
+  for (size_t i = 0; i < batchSize; ++i) {
+    generator_.ids.insert(generator_.ids.end(),
+                          finalPaths[i].ids.begin(),
+                          finalPaths[i].ids.end());
+    starts[i + 1] = generator_.ids.size();
+    batchMachineIdVec_.insert(batchMachineIdVec_.end(),
+                              finalPaths[i].machineIdVec.begin(),
+                              finalPaths[i].machineIdVec.end());
+  }
+}
+
+void RecurrentGradientMachine::connectPrevFrame(int stepId,
+                                                std::vector<Path>& paths) {
+  int machineCur = stepId % 2;
+  int machinePrev = (stepId - 1) % 2;
+  int beam = getBeamSize();
+  machineIds_.clear();
+  topIds_.clear();
+  seqIds_.clear();
+
+  for (size_t j = 0; j < paths.size(); ++j) {
+    machineIds_.push_back(paths[j].machineId);
+    topIds_.push_back(paths[j].machineId * beam + paths[j].topIndex);
+    seqIds_.push_back(paths[j].seqId);
+  }
+
+  for (auto& memoryFrameLine : memoryFrameLines_) {
+    bool isOutIds = (memoryFrameLine.layerName == outFrameLines_[0].layerName);
+    auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
+        memoryFrameLine.scatterAgents[machineCur].get());
+    scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
+                               isOutIds ? topIds_ : machineIds_);
+    scatterAgent->forward(PASS_TEST);
+    NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
+                           memoryFrameLine.scatterAgents[machineCur]);
+  }
+}
+
+void RecurrentGradientMachine::forwardFrame(int machineCur) {
+  // forward
+  const std::vector<Argument> inArgs;
+  std::vector<Argument> outArgs;
+  frames_[machineCur]->forward(inArgs, &outArgs, PASS_TEST);
+
+  copyDataOutlinkFrame(machineCur);
+
+  IVectorPtr& ids = outFrameLines_[0].frames[machineCur]->getOutput().ids;
+  MatrixPtr in = outFrameLines_[0].frames[machineCur]->getOutput().in;
+  IVectorPtr& eos = eosFrameLine_->layers[machineCur]->getOutput().ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */);
+    cpuId_->copyFrom(*ids);
+    Matrix::resizeOrCreate(cpuProb_,
+                           in->getHeight(),
+                           in->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    cpuProb_->copyFrom(*in);
+    IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */);
+    cpuEos_->copyFrom(*eos);
+  } else {
+    cpuId_ = ids;
+    cpuProb_ = in;
+    cpuEos_ = eos;
+  }
+}
+
+void RecurrentGradientMachine::singlePathExpand(Path& curPath,
+                                                size_t curPathId,
+                                                std::vector<Path>& newPaths,
+                                                size_t expandWidth) {
+  int calc_id =
+      gDiyProbStart ? gDiyProbStart(curPath.ids.size(), curPath.ids.data()) : 0;
+
+  const int* idVec = cpuId_->getData();
+  const real* probMat = cpuProb_->getData();
+  const int* eosVec = cpuEos_->getData();
+
+  for (size_t k = 0; k < expandWidth; k++) {
+    int index = curPathId * expandWidth + k;
+    int id = idVec[index];
+    real prob = probMat[index];
+    /*
+     * Ordinarily, beam search greedily expands the most promising expandWidth
+     * paths that currently are ALWAYS returned by MaxIdLayer.
+     * In one condition, if user customizes the beam search procedure by
+     * restricting the expansion within a user defined subset,
+     * as a result, MaxIdLayer possibly COULD NOT return expandWidth
+     * vaild expansions, and it will use -1 to indicate the end of valid
+     * expansion candidates.
+     */
+    if (id == -1) break;
+
+    real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob;
+    Path newPath(
+        curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/);
+    if (this->beamSearchCtrlCallbacks_) {
+      if (beamSearchCtrlCallbacks_->stopDetermineCandidates(
+              newPath.seqId, newPath.ids, newPath.probHistory))
+        return;
+    }
+    // outFrameLines_.size() > 1UL
+    if (dataArgsSize_) {
+      newPath.machineIdVec = curPath.machineIdVec;
+      newPath.machineIdVec.push_back(curPathId);
+    }
+    bool atEos =
+        eosVec[index] == 1U || newPath.ids.size() >= (size_t)maxSequenceLength_;
+    // adjustNewPath
+    newPath.adjustProb(calc_id, atEos);
+    if (this->beamSearchCtrlCallbacks_) {
+      this->beamSearchCtrlCallbacks_->normOrDropNode(
+          newPath.seqId, newPath.ids, newPath.probHistory, &newPath.logProb);
+    }
+    if (!newPath.isDropable()) {
+      atEos ? finalPaths_[curPath.seqId].push_back(newPath)
+            : newPaths.push_back(newPath);
+    }
+  }  // for expandWidth
+
+  if (gDiyProbStop) {
+    gDiyProbStop(calc_id);
+  }
+}
+
+void RecurrentGradientMachine::beamExpand(std::vector<Path>& paths,
+                                          std::vector<Path>& newPaths) {
+  size_t candidatePathCount = paths.size();
+  // idVec.size() could be larger than candidatePathCount * beam,
+  // so user can drop some node customly.
+  CHECK_EQ(cpuId_->getSize() % candidatePathCount, 0UL);
+  size_t expandWidth = cpuId_->getSize() / candidatePathCount;
+
+  // iterate over each sequence
+  size_t totalExpandCount = 0;
+  int prevSeqId = -1;
+  int curSeqId = 0;
+  for (size_t j = 0; j <= candidatePathCount; j++) {
+    // expansions of a single sequence are all processed
+    curSeqId = (j < candidatePathCount ? paths[j].seqId : curSeqId + 1);
+    if (prevSeqId != -1 && curSeqId != prevSeqId) {
+      totalExpandCount += beamShrink(newPaths, prevSeqId, totalExpandCount);
+    }
+    if (j == candidatePathCount) return;
+    singlePathExpand(paths[j], j, newPaths, expandWidth);
+
+    prevSeqId = paths[j].seqId;
+  }  // for paths
+}
+
+// Drop extra nodes to beam size.
+size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths,
+                                            size_t seqId,
+                                            size_t totalExpandCount) {
+  size_t minNewPathSize =
+      std::min(getBeamSize(), newPaths.size() - totalExpandCount);
+  if (!minNewPathSize) {
+    return 0;
+  }
+  std::nth_element(newPaths.begin() + totalExpandCount,
+                   newPaths.begin() + totalExpandCount + minNewPathSize,
+                   newPaths.end(),
+                   Path::greaterPath);
+  newPaths.resize(totalExpandCount + minNewPathSize);
+
+  real minPathLogProb =
+      std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
+          ->logProb;
+  real maxPathLogProb =
+      std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
+          ->logProb;
+
+  // Remove the already formed paths that are relatively short
+  finalPaths_[seqId].erase(
+      std::remove_if(finalPaths_[seqId].begin(),
+                     finalPaths_[seqId].end(),
+                     [&](Path& p) { return p.logProb < minPathLogProb; }),
+      finalPaths_[seqId].end());
+  for (auto p : finalPaths_[seqId]) {
+    if (minFinalPathLogProb_[seqId] > p.logProb) {
+      minFinalPathLogProb_[seqId] = p.logProb;
+    }
+  }
+
+  if (finalPaths_[seqId].size() >= getBeamSize() &&
+      minFinalPathLogProb_[seqId] >= maxPathLogProb) {
+    newPaths.resize(totalExpandCount);
+    return 0;
+  }
+  return minNewPathSize;
+}
+
+void RecurrentGradientMachine::fillGenOutputs() {
+  size_t numResults = generator_.config.num_results_per_sample();
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
+    size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size());
+    std::partial_sort(finalPaths_[i].begin(),
+                      finalPaths_[i].begin() + minFinalPathsSize,
+                      finalPaths_[i].end(),
+                      Path::greaterPath);
+    finalPaths_[i].resize(minFinalPathsSize);
+  }
+
+  generator_.ids.clear();
+  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
+  starts[0] = 0;
+  if (numResults > 1) {
+    int idsProbSaveSize = 0;
+    for (auto inSeq : finalPaths_) {
+      for (auto path : inSeq) idsProbSaveSize += path.ids.size();
+      idsProbSaveSize += inSeq.size();
+    }
+    Matrix::resizeOrCreate(
+        generator_.outArg.value, idsProbSaveSize, 1, false, false);
+    real* idsProb = generator_.outArg.value->getData();
+
+    real* probs = generator_.outArg.in->getData();
+    size_t curPos = 0;
+    for (size_t i = 0; i < finalPaths_.size(); ++i) {
+      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+        Path& path = finalPaths_[i][j];
+        size_t genLen = path.ids.size();
+        generator_.ids.push_back(genLen);  // sequence size
+        generator_.ids.insert(
+            generator_.ids.end(), path.ids.begin(), path.ids.end());
+        generator_.ids.push_back(-1);  // end of sequence
+
+        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
+        curPos += genLen;
+        idsProb[curPos++] = -1.0;
+        probs[i * numResults + j] = path.logProb;
+      }
+      starts[i + 1] = generator_.ids.size();
+    }
+  } else {
+    for (size_t i = 0; i < finalPaths_.size(); ++i) {
+      CHECK(!finalPaths_[i].empty());
+      Path& path = finalPaths_[i][0];
+      generator_.ids.insert(
+          generator_.ids.end(), path.ids.begin(), path.ids.end());
+      starts[i + 1] = starts[i] + path.ids.size();
+    }
+  }
+}
+
+void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
+  for (size_t i = 0; i < dataArgsSize_; i++) {
+    Argument outFrame;
+    outFrame.resizeAndCopyFrom(
+        outFrameLines_[i + 1].frames[machineCur]->getOutput(), useGpu_);
+    dataArgsFrame_[i].emplace_back(outFrame);
+  }
+}
+
+void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
+    bool isSeq, std::vector<Argument>& outArgs) {
+  batchMachineIdVec_.clear();
+
+  size_t seqIdx = 0;
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
+    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
+      if (isSeq) {
+        for (size_t i = 0; i < machineIdVec.size(); ++i) {
+          size_t rowId = machineIdVec[i];
+          int* seqPos =
+              outArgs[i].sequenceStartPositions->getMutableData(false);
+          batchMachineIdVec_.push_back(seqPos[rowId]);
+        }
+      } else {
+        batchMachineIdVec_.insert(
+            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
+      }
+      seqIdx++;
+    }
+  }
+}
+
+void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
+    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
+  size_t totalSeqNum = std::accumulate(
+      finalPaths_.begin(),
+      finalPaths_.end(),
+      0UL,
+      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
+  copySize.resize(totalSeqNum, 1);
+
+  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
+  if (isSeq) {
+    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
+    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
+             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
+    int* starts = inputSeqStartPos->getMutableData(false);
+    int seqId = 0;
+    for (size_t i = 0; i < finalPaths_.size(); ++i) {
+      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
+                                            : starts[j + 1] - starts[j];
+        batchMachineStartPos_[seqId + 1] =
+            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
+        seqId++;
+      }
+    }
+  } else {
+    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
+      batchMachineStartPos_[i + 1] =
+          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
+  }
+}
+
+void RecurrentGradientMachine::createDataOutlink() {
+  for (size_t i = 0; i < dataArgsSize_; i++) {
+    bool isSeq = dataArgsFrame_[i][0].hasSeq();
+    std::vector<int> copySize;
+    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
+    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
+
+    dataArgs_[i].concat(dataArgsFrame_[i],
+                        batchMachineIdVec_,
+                        batchMachineStartPos_,
+                        copySize,
+                        useGpu_,
+                        HPPL_STREAM_1,
+                        PASS_TEST);
+    auto dataAgent =
+        dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
+    CHECK_NOTNULL(dataAgent);
+    dataAgent->setData(dataArgs_[i]);
+  }
+}
+
+void RecurrentGradientMachine::beamSearch(size_t batchSize) {
+  finalPaths_.clear();
+  finalPaths_.resize(batchSize);
+  seqIds_.resize(batchSize);
+  minFinalPathLogProb_.clear();
+  minFinalPathLogProb_.resize(batchSize, 0);
+
+  std::vector<Path> paths;
+  std::vector<Path> newPaths;
+  for (size_t i = 0; i < batchSize; ++i) {
+    paths.push_back(Path(i));
+    if (this->beamSearchCtrlCallbacks_) {
+      paths.back().recordHistory();
+    }
+  }
+
+  // restart beam search
+  stopBeamSearch_ = false;
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    int machineCur = i % 2;
+    std::unique_ptr<
+        ScopedCallbacks<const RecurrentGradientMachine::EachStepCallback&, int>>
+        statisticsBlock;
+    if (this->beamSearchStatistics_) {
+      auto ptr =
+          new ScopedCallbacks<const RecurrentGradientMachine::EachStepCallback&,
+                              int>(beamSearchStatistics_->onEachStepStarted,
+                                   beamSearchStatistics_->onEachStepStoped,
+                                   i);
+      statisticsBlock.reset(ptr);
+    }
+    if (stopBeamSearch_) break;
+
+    if (i) connectPrevFrame(i, paths);
+
+    if (this->beamSearchCtrlCallbacks_) {
+      std::vector<std::vector<int>*> prefixes;
+      prefixes.resize(paths.size());
+      std::transform(
+          paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) {
+            return const_cast<std::vector<int>*>(&p.ids);
+          });
+      beamSearchCtrlCallbacks_->beamSearchCandidateAdjust(
+          prefixes, frames_[machineCur].get(), i);
+    }
+
+    forwardFrame(machineCur);
+    beamExpand(paths, newPaths);
+    if (newPaths.empty()) break;
+
+    paths = newPaths;
+    newPaths.clear();
+  }  // end for machineCur
+  fillGenOutputs();
+}
+
+void RecurrentGradientMachine::Path::adjustProb(int calc_id, bool atEos) {
+  if (gDiyProbMethod) {
+    logProb = gDiyProbMethod(calc_id, ids.size(), ids.data(), logProb, atEos);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a13d4f6f84eb5309a1b25f039357cb8af02c35e
--- /dev/null
+++ b/paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -0,0 +1,580 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include "GradientMachine.h"
+#include "NeuralNetwork.h"
+
+#include "paddle/legacy/utils/Locks.h"
+
+namespace paddle {
+
+/**
+ * Private data class declares.
+ * Used for user customized beam search.
+ */
+class BeamSearchControlCallbacks;
+class BeamSearchStatisticsCallbacks;
+
+class RecurrentGradientMachine : public NeuralNetwork {
+ public:
+  RecurrentGradientMachine(const std::string& subModelName,
+                           NeuralNetwork* rootNetwork);
+
+  // Disable copy and assign.
+  RecurrentGradientMachine(const RecurrentGradientMachine& other) = delete;
+  RecurrentGradientMachine& operator=(const RecurrentGradientMachine& other) =
+      delete;
+
+  virtual ~RecurrentGradientMachine() {
+    this->removeBeamSearchStatisticsCallbacks();
+    this->removeBeamSearchControlCallbacks();
+  }
+
+  virtual void init(const ModelConfig& config,
+                    ParamInitCallback callback,
+                    const std::vector<ParameterType>& parameterTypes,
+                    bool useGpu);
+
+  virtual void prefetch(const std::vector<Argument>& inArgs);
+
+  virtual void forward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType);
+
+  virtual void backward(const UpdateCallback& callback = nullptr);
+
+  void forwardBackward(const std::vector<Argument>& inArgs,
+                       std::vector<Argument>* outArgs,
+                       PassType passType,
+                       const UpdateCallback& callback);
+
+  virtual void resetState() {}
+  virtual void eval(Evaluator* evaluator) const;
+
+  const std::vector<int>& getParameterIds() { return parameterIds_; }
+
+  /**
+   * @brief BeamSearchCandidatesAdjustCallback
+   *
+   * Adjust searching candidates to restrict beam search
+   * searching within a limited subset of all possibile paths.
+   *
+   * The first parameter is the prefixes of all formed paths in current
+   * beam search step, whose type is basically int[][].
+   *
+   * The second parameter is a pointer to the network used to generate sequence,
+   * user can use this pointer to tranverse each layer in the network to
+   * modify behaivors of a particular layer.
+   *
+   * The third parameter is an integer to indicate the iteration number of
+   * beam search, so that user can customize different operations in different
+   * beam search iterations.
+   */
+  typedef std::function<void(
+      const std::vector<std::vector<int>*>&, NeuralNetwork*, const int)>
+      BeamSearchCandidatesAdjustCallback;
+
+  /**
+   * @brief DropCallback
+   *
+   * Drop a whole prefix or one candidate in beam search or not.
+   *
+   * The first parameter is sequence index in a batch
+   *
+   * The second parameter is one path in beam search,
+   * which is made up of node indices.
+   *
+   * The third parameter is probabilites for each node in this path.
+   *
+   * Return true if this prefix or candidate is expected to be dropped.
+   */
+  typedef std::function<bool(
+      int seqId, const std::vector<int>&, const std::vector<real>&)>
+      DropCallback;
+
+  /**
+   * @brief NormOrDropNodeCallback
+   *
+   * Normalize a path's probabilities or just drop it by modifying path.logProb
+   *
+   * The first parameter is sequence index in a batch
+   *
+   * The second parameter is path.ids
+   *
+   * The third parameter is probabilites for each node in this path.
+   *
+   * The fourth parameter is the probability of the whole path.
+   */
+  typedef std::function<void(
+      int seqId, const std::vector<int>&, std::vector<real>&, real*)>
+      NormOrDropNodeCallback;
+
+  /**
+   * @brief Register beam search control callbacks. Used for prediction.
+   *
+   * @param queryBeamSearch: Give the sequences already formed, return the
+   * nodes expected to be expanded.
+   * Input: A pointer to an array holding pathes which have been expanded
+   * Return: A pointer to an array holding nodes wanted to be expanded.
+   *
+   * @param dropOneNode: Early drop a node in one beam search step.
+   * Given the path formed and probability history, decide whether a node
+   * should be dropped or not.
+   *
+   * @param stopBeamSearch: Early stop a path in one beam search step.
+   * Given the path and probability history, decide whether a path
+   * should be dropped or not.
+   */
+  void registerBeamSearchControlCallbacks(
+      const BeamSearchCandidatesAdjustCallback& adjustBeamSearch,
+      const NormOrDropNodeCallback& normOrDropNode,
+      const DropCallback& stopBeamSearch);
+
+  /**
+   * @brief Remove user costumized beam search callbacks,
+   *
+   * make sequence generation acts like normal beam search.
+   */
+  void removeBeamSearchControlCallbacks();
+
+  /**
+   * @brief EachStepCallback
+   *
+   * Invoke with beam search step.
+   */
+  typedef std::function<void(int)> EachStepCallback;
+
+  /**
+   * @brief register statistics methods for performance profile of beam search.
+   *
+   * @param onEachStepStarted: invoke once a beam search step starts.
+   * Its input is index of the beam search step.
+   *
+   * @param onEachStepStoped: invoke once a beam search step ends.
+   * Its input is index of the beam search step.
+   */
+  void registerBeamSearchStatisticsCallbacks(
+      const EachStepCallback& onEachStepStarted,
+      const EachStepCallback& onEachStepStoped);
+
+  /**
+   * @brief Remove beam search callbacks.
+   */
+  void removeBeamSearchStatisticsCallbacks();
+
+  /**
+   * @brief Stop beam search for current source.
+   *
+   * Will restart beam search in the next forward
+   */
+  void stopBeamSearch();
+
+  struct Path {
+    /**
+     * @brief ids, path of beam search.
+     */
+    std::vector<int> ids;
+
+    /**
+     * @brief idsProb, log probability of each generated word.
+     */
+    std::vector<real> idsProb;
+
+    /**
+     * @brief logProb, current probability of path.
+     */
+    real logProb;
+
+    int machineId;  // index of sample in frame
+    int topIndex;   // index of MaxIdLayer output in one sample
+    int seqId;      // index of sequence in batch generation
+    std::vector<int> machineIdVec;
+
+    /**
+     * @brief A record of each node's probality in a formed path in beam search.
+     *
+     * @note  It could be empty when history is not recorded. If the history is
+     *        wanted to be recorded, recordHistory() MUST be invoked first.
+     */
+    std::vector<real> probHistory;
+
+    /**
+     * @brief Path default ctor, first logProb is 0.
+     */
+    Path() {
+      logProb = 0;
+      seqId = 0;
+    }
+    explicit Path(size_t seqId) : seqId(seqId) { logProb = 0; }
+
+    /**
+     * @brief Create a new path based on an old path and
+     * a new node with probability.
+     *
+     * @param old       old path
+     * @param newId     index of the new node
+     * @param logProb   probability of the new node.
+     * @param machineId sample index of a frame in RNN
+     * @param topIndex  index of MaxIdLayer output in one sample
+     */
+    Path(Path& old, int newId, real logProb, int machineId, int topIndex)
+        : ids(old.ids),
+          idsProb(old.idsProb),
+          logProb(old.logProb + logProb),
+          machineId(machineId),
+          topIndex(topIndex),
+          seqId(old.seqId) {
+      ids.push_back(newId);
+      idsProb.push_back(logProb);
+      if (!old.probHistory.empty()) {
+        this->probHistory = old.probHistory;
+        // probHistory store current prob, not sum
+        this->probHistory.push_back(logProb);
+      }
+    }
+
+    /**
+     * @brief operator <
+     *
+     * Path a < Path b means log probability of a is smaller than that of b
+     */
+    bool operator<(const Path& other) const {
+      return (logProb < other.logProb);
+    }
+
+    static bool greaterPath(const Path& a, const Path& b) { return (b < a); }
+
+    /**
+     * @brief Start recording history in this path.
+     */
+    void recordHistory() { this->probHistory.push_back(this->logProb); }
+
+    /**
+     * @brief Adjust probability for DIY beam search interface.
+     * In normal situation, it will do nothing.
+     *
+     * @param calc_id: the object id for DIY beam search interface.
+     * @param atEos: at end of sequence or not.
+     */
+    void adjustProb(int calc_id, bool atEos = false);
+
+    /**
+     * @brief isDropable indacating whether the current node will be
+     * dropped or not in beam search.
+     *
+     * @note: if logProb is -inf, current node will be dropped.
+     * @return true to drop the current node.
+     */
+    bool isDropable() const { return std::isinf(logProb) && logProb < 0; }
+  };
+
+  /**
+   * @brief access beam search results.
+   * @return beam search results.
+   */
+  const std::vector<std::vector<Path>>& getFinalPaths() const {
+    return this->finalPaths_;
+  }
+
+ protected:
+  std::vector<Argument::SeqInfo> commonSeqInfo_;
+  ICpuGpuVectorPtr sequenceStartPositions_;
+  void calcSequenceStartPositions();
+  void checkInputConsistency(int inlinkId,
+                             const std::vector<Argument::SeqInfo>& seqInfo);
+  void reorganizeInput(PassType passType);
+  void reorganizeOutput(PassType passType);
+  void connectFrames(PassType passType);
+  void calcNumSequencesAtEachStep();
+
+  void resizeOrCreateFrames(int numFrames);
+  void resizeBootFrame(int numSequences);
+
+  void generateSequence();
+  void oneWaySearch(size_t batchSize);
+  void beamSearch(size_t batchSize);
+
+  struct InFrameLine {
+    std::string linkName;
+    LayerPtr inLayer;
+    std::vector<LayerPtr> agents;  // Scatter Agents to reform batch input
+    Argument outArg;               // scatter output argument
+  };
+  std::vector<InFrameLine> inFrameLines_;
+
+  struct OutFrameLine {
+    std::string layerName;
+    LayerPtr agentLayer;
+    std::vector<LayerPtr> frames;
+  };
+  std::vector<OutFrameLine> outFrameLines_;
+
+  struct MemoryFrameLine {
+    std::string layerName;
+    std::string linkName;
+    LayerPtr bootLayer;  // actually used biasLayer or rootAgent
+    LayerPtr biasLayer;
+    LayerPtr rootLayer;  // layer in root network to boot this memory
+    LayerPtr rootAgent;  // agent to link rootLayer
+    std::vector<LayerPtr> frames;
+    std::vector<LayerPtr> agents;
+    std::vector<LayerPtr> scatterAgents;  // scatter agent used by beam search
+    Argument outArg;                      // scatter output argument
+    // Different memoryFrameLine have different element as follows
+    IVectorPtr allIds;  // scattered id of realLayer
+    ICpuGpuVectorPtr
+        sequenceStartPositions;  // scattered sequenceStartPositions
+  };
+  std::vector<MemoryFrameLine> memoryFrameLines_;
+
+  // Each inFrameLines(inlinks) has its own info(elements) below,
+  // and all outFrameLines(outlinks) share the info with one inFrameLine,
+  // which is assigned by targetInfoInlinkId_.
+  struct Info {
+    // The original positions in the original batch
+    IVectorPtr allIds;  // scattered id of realLayer [batchSize]
+
+    // index of allIds for each step [maxSequenceLength_]
+    // idIndex[i] is the total length of the first i sequences
+    std::vector<int> idIndex;
+
+    ICpuGpuVectorPtr
+        sequenceStartPositions;         // scattered sequenceStartPositions
+    std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
+  };
+  std::vector<Info> info_;  // for input
+
+  // numSeqs_[i] is the number sequences which is longer than i (for sequence
+  // data) or has more than i subsequences (for subsequence data)
+  // Equivalently, numSeqs_[i] is the number of sequences at step i;
+  std::vector<int> numSeqs_;
+
+  std::vector<std::vector<Argument::SeqInfo>> seqInfos_;
+
+  void checkOutputConsistency(OutFrameLine& outFrameLine);
+
+  /* create scattered id infomation for all realLayer of inFrameLines one time.
+   *  If hasSubseq, will also create scattered sequenceStartPositions infomation
+   *  for all realLayer of inFrameLines one time.
+   */
+  void createInFrameInfo(int inlinks_id,
+                         const Argument& input,
+                         PassType passType);
+  void createInFrameInfo_nonseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+  void createInFrameInfo_seq(int inlinks_id,
+                             const Argument& input,
+                             PassType passType);
+  void createInFrameInfo_subseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+
+  void createOutFrameInfo(OutFrameLine& outFrameLine,
+                          Info& info,
+                          ICpuGpuVectorPtr& sequenceStartPositions,
+                          ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_seq(OutFrameLine& outFrameLine,
+                              Info& info,
+                              ICpuGpuVectorPtr& sequenceStartPositions,
+                              ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_subseq(OutFrameLine& outFrameLine,
+                                 Info& info,
+                                 ICpuGpuVectorPtr& sequenceStartPositions,
+                                 ICpuGpuVectorPtr& subSequenceStartPositions);
+
+  void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
+                             PassType passType);
+
+  void copyScattedId(std::vector<int>& srcIds, IVectorPtr* dstIds, int size);
+
+  void selectRowsOneTime(LayerPtr layer,
+                         const IVectorPtr& allIds,
+                         Argument* arg,
+                         PassType passType);
+
+  void createSeqPos(const std::vector<int>& sequenceStartPosition,
+                    ICpuGpuVectorPtr* sequenceStartPositions);
+
+  // for generator
+  struct EosFrameLine {
+    std::vector<LayerPtr> layers;
+  };
+  std::unique_ptr<EosFrameLine> eosFrameLine_;
+
+  struct Generator {
+    GeneratorConfig config;
+    std::vector<int> ids;       // store generated sequences
+    std::vector<real> idsProb;  // log probability of each generated word
+    Argument outArg;            // final output argument
+  };
+  bool generating_;
+  Generator generator_;
+
+  std::vector<std::unique_ptr<NeuralNetwork>> frames_;
+
+  NeuralNetwork* rootNetwork_;
+  bool reversed_;
+
+  int maxSequenceLength_;  // Max top-level length
+  bool useGpu_;
+  bool stopBeamSearch_;
+
+  std::vector<int>
+      parameterIds_;  // parameters actually used by this Layer Group
+
+  // store final argument of outFrameLines_
+  std::vector<Argument> dataArgs_;
+  // store each frame's output argument of outFrameLines_
+  std::vector<std::vector<Argument>> dataArgsFrame_;
+  size_t dataArgsSize_;  // size of dataArgs_ = size of dataArgsFrame_
+
+  IVectorPtr cpuId_;
+  MatrixPtr cpuProb_;
+  IVectorPtr cpuEos_;
+
+ private:
+  /*
+   * @return beam size in beam search
+   */
+  size_t getBeamSize() { return generator_.config.beam_size(); }
+
+  /*
+   * @return number of sequence in a batch in generation
+   */
+  size_t getGenBatchSize();
+
+  /*
+   * @brief store output of the machineCur-th frame during generation, for
+   * creating the final outlink after the entire generation process is finished.
+   *
+   * In generation, if the layer group has more than 1 outlink, the first
+   * one is reserved to store the generated word indices, the others are data
+   * outlinks, that can be used like a common layer in the network.
+   *
+   * @param machineCur : index to access the layer group frame in
+   * currrent generation step.
+   */
+  void copyDataOutlinkFrame(size_t machineCur);
+
+  /*
+   * @brief In generation, if the layer group has more than 1 outlink, outlink
+   * except the first one is a data outlink. In RecurrentLayerGroup, each time
+   * step is a separate Network, outputs of a layer inside the
+   * RecurrentLayerGroup are stored in separate Arguments. If one layer is
+   * specified as an outlink of RecurrentLayerGroup. This function will
+   * collect outputs in each time step of each generated sequence which are
+   * dispersed in separate Arguments to form a new single Argument as output of
+   * RecurrentLayerGroup.
+   */
+  void createDataOutlink();
+
+  /*
+   * @brief decide to select how many rows from the Matrix stored the forward
+   * pass results from a start position.
+   *
+   * @param isSeq: a flag indicating whetehr the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the the returned Arguments of the forward pass
+   * during the generation process.
+   * @param copySize: the returned result, number of rows to select from the
+   * Matrix stored the forward pass results from a start position.
+   */
+  void createDataOutlinkCopySizeInfo(bool isSeq,
+                                     std::vector<Argument>& outArgs,
+                                     std::vector<int>& copySize);
+
+  /*
+   * @brief decide index of the start row for each time step of a generated
+   * sequence in Matrix stored the entire beam search batch's forward pass
+   * results.
+   *
+   * @param isSeq: a flag indicating whether the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the returned Arguments of the forward pass
+   * during the generation process.
+   */
+  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);
+
+  /*
+   * @brief used in beam search, connect previous frame to form recurrent link
+   * @param stepId : iteration number of generation process.
+   * It equals to the length of longest half-generated sequence.
+   * @param paths : half-generated paths that are going to be expanded
+   * in current beam search iteration.
+   */
+  void connectPrevFrame(int stepId, std::vector<Path>& paths);
+
+  /*
+   * @brief used in beam search, forward current recurrent frame
+   * @param machineCur : index to access the layer group frame in
+   * currrent generation step.
+   */
+  void forwardFrame(int machineCur);
+
+  /*
+   * @brief reduce all expanded paths to beam size.
+   *
+   * @param newPaths : newPaths[totalExpandCount : ] stores all expanded paths
+   * for the seqId-th sequence
+   * @param seqId : sequence index in a batch
+   * @param totalExpandCount : number of already shrinked paths in newPaths
+   * @return size of retained paths at the end of a beam search iteration
+   */
+  size_t beamShrink(std::vector<Path>& newPaths,
+                    size_t seqId,
+                    size_t totalExpandCount);
+
+  /*
+   * @brief expand a single path to expandWidth new paths
+   * with highest probability
+   * @param curPath : path to be expanded
+   * @param curPathId : index of curPath in member newPaths
+   * @param expandWidth : number of paths to be expanded
+   */
+  void singlePathExpand(Path& curPath,
+                        size_t curPathId,
+                        std::vector<Path>& newPaths,
+                        size_t expandWidth);
+
+  /*
+   * @brief A new beam search iteration. Each half-generated paths in previous
+   * beam search iteration are further expanded to beam_size new paths
+   * with highest probabilities, and then all the expanded paths are again
+   * reduced to beam_size paths according to their log probabilities.
+   * @param paths : half-generated paths in previous iteration.
+   * @param newPaths : paths expanded and then reduces in current iteration.
+   */
+  void beamExpand(std::vector<Path>& paths, std::vector<Path>& newPaths);
+
+  /*
+   * @brief fill sequence start positions and some other information that are
+   * uesed by the "text_printer" evaluator.
+   */
+  void fillGenOutputs();
+
+  std::vector<int> machineIds_;
+  std::vector<int> topIds_;
+  std::vector<int> seqIds_;
+  std::vector<int> batchMachineIdVec_;
+  std::vector<int> batchMachineStartPos_;
+  std::vector<std::vector<Path>> finalPaths_;
+  std::vector<real> minFinalPathLogProb_;
+  BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;
+  BeamSearchStatisticsCallbacks* beamSearchStatistics_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AddtoLayer.cpp b/paddle/legacy/gserver/layers/AddtoLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..39c5603d9389b433b77e2876f34b3061c62f68f0
--- /dev/null
+++ b/paddle/legacy/gserver/layers/AddtoLayer.cpp
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "AddtoLayer.h"
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(addto, AddtoLayer);
+
+bool AddtoLayer::init(const LayerMap& layerMap,
+                      const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+
+  return true;
+}
+
+void AddtoLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(0)->getHeight();
+  int size = getSize();
+
+  reserveOutput(batchSize, size);
+
+  MatrixPtr outV = getOutputValue();
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    MatrixPtr input = getInputValue(i);
+    i == 0 ? outV->assign(*input) : outV->add(*input);
+  }
+  /* add the bias-vector */
+  if (biases_.get() != NULL) {
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */ { forwardActivation(); }
+}
+
+void AddtoLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    /* Increasing the number of gradient */
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    /* Calculate the input layers error */
+    MatrixPtr preGrad = getInputGrad(i);
+    if (NULL != preGrad) {
+      preGrad->add(*getOutputGrad());
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AddtoLayer.h b/paddle/legacy/gserver/layers/AddtoLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad3cefe1a4d27953b2fef535e1b865175a2cadc2
--- /dev/null
+++ b/paddle/legacy/gserver/layers/AddtoLayer.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * This layer just simply add all input layers together, then activate
+ * the sum inputs. Each input of this layer should be the same size,
+ * which is also the output size of this layer.
+ * \f[
+ *   y=f(\sum_{i}x_i + b)
+ * \f]
+ * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is
+ * activation function.
+ *
+ * The config file api is addto_layer.
+ */
+class AddtoLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit AddtoLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~AddtoLayer() {}
+
+  /**
+   * Intialization of AddtoLayer.
+   */
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  /**
+   * Forward propagation.
+   * @note There is no weight matrix for each input,
+   *       because it just a simple add operation.
+   */
+  void forward(PassType passType) override;
+
+  /**
+   * Backward propagation.
+   */
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AgentLayer.cpp b/paddle/legacy/gserver/layers/AgentLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bae89b2fa34d156adae1305d78d6c1465ccdd0ae
--- /dev/null
+++ b/paddle/legacy/gserver/layers/AgentLayer.cpp
@@ -0,0 +1,281 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "AgentLayer.h"
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(agent, AgentLayer);
+
+bool AgentLayer::init(const LayerMap& layerMap,
+                      const ParameterMap& parameterMap) {
+  CHECK_EQ(config_.inputs_size(), 0);
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  setNeedGradient(true);
+  return true;
+}
+
+void AgentLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  Argument& realOutput = realLayer_->getOutput();
+  int realNumSequences = realOutput.getNumSequences();
+  CHECK_LE(numSamples_, realNumSequences);
+
+  // get Arguments from real layers
+  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
+    if (realOutput.hasSeq()) {
+      int numRows =
+          realOutput.sequenceStartPositions->getData(false)[numSamples_];
+      output_.subArgFrom(realOutput,
+                         /* offset */ 0,
+                         numRows,
+                         getSize(),
+                         useGpu_,
+                         /* trans */ false,
+                         /* seqFlag */ true,
+                         /* seqStart */ 0,
+                         /* seqSize */ numSamples_ + 1);
+    } else {
+      output_.subArgFrom(
+          realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
+    }
+  } else {
+    output_ = realOutput;
+  }
+}
+
+bool GatherAgentLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  CHECK_EQ(config_.inputs_size(), 0);
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  setNeedGradient(true);
+  return true;
+}
+
+void GatherAgentLayer::copyIdAndSequenceInfo(
+    ICpuGpuVectorPtr sequenceStartPositions,
+    ICpuGpuVectorPtr subSequenceStartPositions,
+    const IVectorPtr& ids,
+    const std::vector<int>& idIndex) {
+  output_.sequenceStartPositions = sequenceStartPositions;
+  output_.subSequenceStartPositions = subSequenceStartPositions;
+  allIds_ = ids;
+  idIndex_ = idIndex;
+}
+
+void GatherAgentLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  forwardIds(passType);
+  forwardValue(passType);
+}
+
+void GatherAgentLayer::forwardValue(PassType passType) {
+  MatrixPtr valueReal = realLayers_[0]->getOutputValue();
+  if (!valueReal) return;
+
+  int height = allIds_->getSize();
+  int width = this->getSize();
+  resetOutput(height, width);
+  idsVec_.resize(idIndex_.size());
+
+  const MatrixPtr& outV = getOutputValue();
+
+  for (size_t i = 0; i < realLayers_.size(); ++i) {
+    const MatrixPtr& realV = realLayers_[i]->getOutputValue();
+    idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
+                                 /* size */ realV->getHeight(),
+                                 useGpu_);
+    realV->addToRows(*outV, *idsVec_[i]);
+  }
+}
+
+namespace {
+
+// dest[index[i]] <- src[i] for each i
+void copyElements(const IVector& srcVec,
+                  const IVector& indexVec,
+                  IVector& destVec) {
+  const int* src = srcVec.getData();
+  const int* index = indexVec.getData();
+  int* dest = destVec.getData();
+  int len = indexVec.getSize();
+  CHECK_EQ(srcVec.getSize(), indexVec.getSize());
+  for (int i = 0; i < len; ++i) {
+    dest[index[i]] = src[i];
+  }
+}
+}  // namespace
+
+void GatherAgentLayer::forwardIds(PassType passType) {
+  IVectorPtr realId = realLayers_[0]->getOutputLabel();
+  if (!realId) return;
+
+  IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_);
+  IVectorPtr outId = output_.ids;
+  idsVec_.resize(idIndex_.size());
+
+  for (size_t i = 0; i < realLayers_.size(); ++i) {
+    const IVectorPtr& realId = realLayers_[i]->getOutputLabel();
+    idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
+                                 /* size */ realId->getSize(),
+                                 useGpu_);
+    execViaCpu(&copyElements, *realId, *idsVec_[i], *outId);
+  }
+}
+
+void GatherAgentLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  const MatrixPtr& outputGrad = getOutputGrad();
+
+  for (size_t i = 0; i < realLayers_.size(); ++i) {
+    const MatrixPtr& realG = realLayers_[i]->getOutputGrad();
+    if (realG) {
+      realG->selectRows(*outputGrad, *idsVec_[i]);
+    }
+  }
+}
+
+bool ScatterAgentLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  CHECK_EQ(config_.inputs_size(), 0);
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  setNeedGradient(true);
+  return true;
+}
+
+void ScatterAgentLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
+
+  int width = this->getSize();
+  if (selectionMode_) {
+    forwardWithSelection(passType);
+  } else {
+    if (realOutArg_.hasSeq()) {
+      output_.subArgFrom(realOutArg_,
+                         /* offset */ idIndex_,
+                         idSize_,
+                         width,
+                         useGpu_,
+                         /* trans */ false,
+                         /* seqFlag */ true,
+                         /* seqStart */ seqStartPosIndex_,
+                         /* seqSize */ numSequences_);
+    } else {
+      output_.subArgFrom(
+          realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
+    }
+  }
+}
+
+void ScatterAgentLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  CHECK(!selectionMode_);
+
+  const MatrixPtr& outputGrad = realOutArg_.grad;
+  const MatrixPtr& realGrad = realLayer_->getOutputGrad();
+  if (realGrad) {
+    // for agent in inFrameLines and memoryFrameLines,
+    // only first scatterAgentLayer should do addToRows in backward
+    if (handleBackward_) {
+      outputGrad->addToRows(*realGrad, *ids_);
+    }
+  }
+}
+
+REGISTER_LAYER(gather_agent, GatherAgentLayer);
+REGISTER_LAYER(scatter_agent, ScatterAgentLayer);
+
+void ScatterAgentLayer::forwardWithSelection(PassType passType) {
+  Layer::forward(passType);
+  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
+
+  const Argument& input = realLayer_->getOutput();
+  CHECK_EQ(realLayer_->getSize(), this->getSize());
+  int width = this->getSize();
+
+  AsyncGpuBlock asyncGpuBlock;
+  REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str());
+
+  if (!input.hasSeq()) {
+    if (realLayer_->getOutput().ids) {
+      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
+      output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
+    }
+    if (realLayer_->getOutput().value) {
+      int height = ids_->getSize();
+      resetOutput(height, width);
+
+      const MatrixPtr& outV = getOutputValue();
+      const MatrixPtr& realV = realLayer_->getOutputValue();
+      outV->selectRows(*realV, *ids_);
+    }
+  } else {
+    // Putting the generation logic here is really an ugly hack!
+    // used in generation
+    int height = 0;
+    size_t numSequences = ids_->getSize();
+    const int* starts = input.getCpuStartPositions();
+    size_t size = input.hasSubseq() ? input.getNumSubSequences()
+                                    : input.getNumSequences();
+    const int* cpuIds = cpuIds_->getData();
+
+    for (size_t i = 0; i < numSequences; ++i) {
+      size_t seqId = cpuIds[i];
+      CHECK_LT(seqId, size);
+      height += starts[seqId + 1] - starts[seqId];
+    }
+    reserveOutput(height, width);
+
+    const MatrixPtr& outputValue = getOutputValue();
+
+    CHECK_NE(input.sequenceStartPositions.get(),
+             output_.sequenceStartPositions.get());
+    ICpuGpuVector::resizeOrCreate(
+        output_.sequenceStartPositions, numSequences + 1, false);
+    int* outStarts = output_.sequenceStartPositions->getMutableData(false);
+
+    ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
+    int* inStarts = inputStartPos_->getMutableData(false);
+
+    size_t offsetOut = 0;
+    for (size_t i = 0; i < numSequences; ++i) {
+      outStarts[i] = offsetOut;
+      size_t seqId = cpuIds[i];
+      int size = starts[seqId + 1] - starts[seqId];
+      for (int j = 0; j < size; j++) {
+        inStarts[offsetOut + j] = starts[seqId] + j;
+      }
+      offsetOut += size;
+    }
+    outStarts[numSequences] = offsetOut;
+
+    outputValue->copyByRowIndex(*input.value,
+                                *inputStartPos_->getVector(useGpu_));
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AgentLayer.h b/paddle/legacy/gserver/layers/AgentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a05eac5e704466df02a74ce6e5364ab6f03f7446
--- /dev/null
+++ b/paddle/legacy/gserver/layers/AgentLayer.h
@@ -0,0 +1,177 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * AgentLayer use as a virtual input of another layer in config,
+ * before execute forward/backward, setRealLayer() should be
+ * called to set one and only one real layer
+ */
+class AgentLayer : public Layer {
+ protected:
+  LayerPtr realLayer_;
+  int numSamples_;
+
+ public:
+  explicit AgentLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~AgentLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  // if *numSamples* set,
+  // real layer output will only use first *numSamples* rows
+  void setRealLayer(LayerPtr layer, int numSamples = 0) {
+    realLayer_ = layer;
+    numSamples_ = numSamples;
+  }
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override {}
+};
+
+/**
+ * Like AgentLayer, but it can gather many real layers. Each real
+ * layer give a few rows of a sequence, after gather all real layers,
+ * GatherAgentLayer collect a complete sequence.
+ */
+class GatherAgentLayer : public Layer {
+ protected:
+  std::vector<LayerPtr> realLayers_;
+  std::vector<IVectorPtr> idsVec_;
+  // we don't clear idsVec_ vector to aviod IVector alloc/free
+  IVectorPtr allIds_;
+  std::vector<int> idIndex_;
+
+ public:
+  explicit GatherAgentLayer(const LayerConfig& config) : Layer(config) {}
+
+  virtual ~GatherAgentLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  // call before addRealLayer
+  void clearRealLayers() { realLayers_.clear(); }
+
+  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
+                             ICpuGpuVectorPtr subSequenceStartPositions,
+                             const IVectorPtr& allIds,
+                             const std::vector<int>& idIndex);
+
+  // add one real layer, can call many times
+  void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); }
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+  void forwardValue(PassType passType);
+  void forwardIds(PassType passType);
+};
+
+/**
+ * Like AgentLayer, but only select a few rows in real layer.
+ * [idIndex, idIndex + idSize) of *ids* in setRealLayerAndOutput()
+ * are the selected row ids. It's used to scatter one layer's output
+ * to many small submodels. ScatterAgentLayer can support ids real layer,
+ * if it is, the agent will select a few ids in real layer.
+ */
+class ScatterAgentLayer : public Layer {
+ protected:
+  LayerPtr realLayer_;
+  IVectorPtr ids_;
+  IVectorPtr cpuIds_;
+  Argument realOutArg_;
+  int idIndex_;
+  int idSize_;
+  int seqStartPosIndex_;
+  int numSequences_;  // number of sequences in this scatterAgentLayer
+  bool handleBackward_;
+
+  // use to store expanded cpuStartPositions or subSequenceStartPositions
+  // of real layer.
+  ICpuGpuVectorPtr inputStartPos_;
+
+  // true for setRealLayer, false for setRealLayerAndOutput
+  bool selectionMode_;
+
+ public:
+  explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
+
+  virtual ~ScatterAgentLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  /**
+   * @brief set real layer in generation
+   *
+   * @param layer[input]    realLayer
+   * @param ids[input]      row id in real layer
+   * @param copyId[input]   whether to copy a cpu version of ids,
+   *                        false(default) in ScatterAgentLayer, and
+   *                        true in SequenceScatterAgentLayer.
+   */
+  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
+    realLayer_ = layer;
+    IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
+    ids_->copyFrom(ids.data(), ids.size());
+    if (useGpu_) {
+      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
+      cpuIds_->copyFrom(ids.data(), ids.size());
+    } else {
+      cpuIds_ = ids_;
+    }
+    selectionMode_ = true;
+  }
+
+  // set real layer and output, [idIndex, idIndex + idSize) of *ids*
+  // are selected row for realOutArg in realLayer
+  void setRealLayerAndOutput(LayerPtr layer,
+                             const Argument& outArg,
+                             const IVectorPtr& ids,
+                             int idIndex,
+                             int idSize,
+                             bool handleBackward) {
+    realLayer_ = layer;
+    realOutArg_ = outArg;
+    ids_ = ids;
+    idIndex_ = idIndex;
+    idSize_ = idSize;
+    handleBackward_ = handleBackward;
+    selectionMode_ = false;
+  }
+
+  void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
+                                 int seqStartPosIndex,
+                                 int numSequences) {
+    realOutArg_.sequenceStartPositions = sequenceStartPositions;
+    seqStartPosIndex_ = seqStartPosIndex;
+    numSequences_ = numSequences;
+  }
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+  void forwardWithSelection(PassType passType);
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AverageLayer.cpp b/paddle/legacy/gserver/layers/AverageLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0539da793712527c72792603ae28a1d0aa903bcc
--- /dev/null
+++ b/paddle/legacy/gserver/layers/AverageLayer.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "AverageLayer.h"
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(average, AverageLayer);
+
+bool AverageLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  SequencePoolLayer::init(layerMap, parameterMap);
+
+  // average strategy
+  if (config_.average_strategy() == "average") {
+    mode_ = kAverage;
+  } else if (config_.average_strategy() == "sum") {
+    mode_ = kSum;
+  } else if (config_.average_strategy() == "squarerootn") {
+    mode_ = kAverageSquareRootN;
+  } else {
+    LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
+  }
+  return true;
+}
+
+void AverageLayer::forward(PassType passType) {
+  SequencePoolLayer::forward(passType);
+
+  MatrixPtr inputValue = getInputValue(0);
+  getOutputValue()->sequenceAvgForward(
+      *inputValue, *startPositions_->getVector(useGpu_), mode_);
+
+  /* add the bias-vector AFTER average operation */
+  if (biases_.get() != NULL) {
+    MatrixPtr outV = getOutputValue();
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */ { forwardActivation(); }
+}
+
+void AverageLayer::backward(const UpdateCallback& callback) {
+  SequencePoolLayer::backward(callback);
+
+  if (getInputGrad(0)) {
+    getInputGrad(0)->sequenceAvgBackward(
+        *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/AverageLayer.h b/paddle/legacy/gserver/layers/AverageLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0d457d35f4bce99860cf45e94525f323f45e286
--- /dev/null
+++ b/paddle/legacy/gserver/layers/AverageLayer.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "SequencePoolLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * A layer for "internal average" for sequence input.
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ *    Output: output size is the number of input sequences (NOT input instances)
+ *    output[i] = average_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shorten sequence. Stride is the step size by which we slide a
+ *              window upon the input sequence, and the average pooling
+ *              operation is then applied to each interval independently.
+ * If SequenceLevel = kSeq:
+ *    Check input sequence must has sub-sequence
+ *    Output: output size is the number of input sub-sequences
+ *    output[i] = average_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
+ */
+class AverageLayer : public SequencePoolLayer {
+ public:
+  enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
+  explicit AverageLayer(const LayerConfig& config)
+      : SequencePoolLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  int mode_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp b/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4dcbd8dc270d5e5329b33b366ac937894833085f
--- /dev/null
+++ b/paddle/legacy/gserver/layers/BatchNormBaseLayer.cpp
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BatchNormBaseLayer.h"
+#include "BatchNormalizationLayer.h"
+#include "Layer.h"
+#include "paddle/legacy/utils/Stat.h"
+#ifdef PADDLE_WITH_CUDA
+#include "CudnnBatchNormLayer.h"
+#endif
+
+namespace paddle {
+
+bool BatchNormBaseLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  if (!Layer::init(layerMap, parameterMap)) return false;
+
+  /* initialize the weightList */
+  // first is Input in configure
+  // other two is created in config_parser.py
+  CHECK_EQ(inputLayers_.size(), 3U);
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size()));
+  const ImageConfig& conf = config_.inputs(0).image_conf();
+  channels_ = conf.channels();
+  calFeatureMapSize();
+
+  if (config_.has_use_global_stats()) {
+    useGlobalStats_ = config_.use_global_stats();
+  }
+  movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
+
+  weight_.reset(new Weight(1, channels_, parameters_[0]));
+  movingMean_.reset(new Weight(1, channels_, parameters_[1]));
+  movingVar_.reset(new Weight(1, channels_, parameters_[2]));
+
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, channels_, biasParameter_));
+  }
+
+  savedMean_ = Matrix::create(1, channels_, false, useGpu_);
+  savedInvVar_ = Matrix::create(1, channels_, false, useGpu_);
+  savedMean_->zeroMem();
+  savedInvVar_->zeroMem();
+
+  return true;
+}
+
+void BatchNormBaseLayer::calFeatureMapSize() {
+  const ImageConfig& conf = config_.inputs(0).image_conf();
+  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  imageD_ = inputLayers_[0]->getOutput().getFrameDepth();
+
+  if (0 == imageD_) imageD_ = conf.img_size_z();
+  if (imageH_ == 0 && imageW_ == 0) {
+    imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+    imageW_ = conf.img_size();
+  } else {
+    getOutput().setFrameHeight(imageH_);
+    getOutput().setFrameWidth(imageW_);
+    getOutput().setFrameDepth(imageD_);
+  }
+  imgPixels_ = imageH_ * imageW_ * imageD_;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormBaseLayer.h b/paddle/legacy/gserver/layers/BatchNormBaseLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8dc1d7883767b4aabc8501531996036c2def9481
--- /dev/null
+++ b/paddle/legacy/gserver/layers/BatchNormBaseLayer.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief Batch normalization layer use to normalizes the input to across the
+ * batch.
+ *
+ * By default, calculating global mean and variance statistics via a running
+ * average in the training peroid. Then the pre-calculated global mean and
+ * variance are used for testing.
+ *
+ * Moving mean and variance are located in Parameter object when constructing
+ * and the calculation will change them. Now we only save global mean and
+ * variance of one thread in first node for GPU.
+ * But the calculation in CPU is different, because parameters are shared by
+ * multiple threads. Here using ShareCpuMatrix with lock to calculate. We
+ * still save global mean and variance in first node in CPU when multi machine.
+ *
+ * [1] S. Ioffe and C. Szegedy, "Batch Normalization: Accelerating Deep Network
+ *     Training by Reducing Internal Covariate Shift." arXiv preprint
+ *     arXiv:1502.03167 (2015).
+ */
+
+class BatchNormBaseLayer : public Layer {
+ public:
+  explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~BatchNormBaseLayer() {}
+
+  /**
+   * @brief Create BatchNorm layer by norm_type, including batch_norm and
+   * cudnn_batch_norm. If do not set norm_type, it will automatically select
+   * cudnn_batch_norm for GPU and batch_norm for CPU.
+   */
+  static Layer* create(const LayerConfig& config);
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  /**
+   * @brief Calculate feature map size. Some input uses frameHeight and
+   * frameWidth to store feature size
+   */
+  void calFeatureMapSize();
+
+ protected:
+  /// Batch normalization scale parameter, which is referred to as gamma in
+  /// in original paper.
+  std::unique_ptr<Weight> weight_;
+  /// Moving average of mean.
+  std::unique_ptr<Weight> movingMean_;
+  /// Moving average of variance.
+  std::unique_ptr<Weight> movingVar_;
+  /// Batch normalization bias parameter, which is referred to as beta in
+  /// in original paper.
+  std::unique_ptr<Weight> biases_;
+
+  /// Save intermediate results computed during the forward pass,
+  /// these can then be reused to speed up the backward pass.
+  MatrixPtr savedMean_;
+  MatrixPtr savedInvVar_;
+
+  /// Height or width of input image feature.
+  /// Both of them are 1 if the input is fully-connected layer.
+  int imageD_;
+  int imageH_;
+  int imageW_;
+  /// Height * Width.
+  int imgPixels_;
+  /// Feature dimension. If the input layer is conv layer, it is the channels
+  /// of feature map of the conv layer. If the input layer is fully-connected
+  /// layer, it is the dimension of fc layer.
+  int channels_;
+  // if useGlobalStats_ is true, will use the loaded mean and variance.
+  // otherwise, calculate mean and variance in this mini-batch.
+  bool useGlobalStats_;
+  // use to compute moving mean and variance.
+  real movingAvgFraction_;
+  // Epsilon is a small random noise used in batch normalization for stability.
+  real epsilon_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp b/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0297bd44c7b0485f34598f6926e5337da452460d
--- /dev/null
+++ b/paddle/legacy/gserver/layers/BatchNormalizationLayer.cpp
@@ -0,0 +1,266 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Stat.h"
+#ifdef PADDLE_WITH_CUDA
+#include "hl_batch_transpose.h"
+#endif
+#include "BatchNormalizationLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
+
+bool BatchNormalizationLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false;
+
+  return true;
+}
+
+void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
+  int numSamples = mat->getHeight();
+  Matrix::resizeOrCreate(tmpMat_, numSamples, channels_, false, useGpu_);
+  savedMean_->zeroMem();
+  savedMean_->accumulateColSum(*mat);
+  savedMean_->mulScalar(1.0 / numSamples);  // E[x]
+
+  tmpMat_->assign(*mat);
+  tmpMat_->square2();
+  savedInvVar_->zeroMem();
+  savedInvVar_->accumulateColSum(*tmpMat_);
+  savedInvVar_->mulScalar(1.0 / numSamples);   // E[x^2]
+  savedInvVar_->addSquare(*savedMean_, -1.0);  // E[x^2] - E^2[x]
+
+  // Variance may be small negative value
+  // because of the subtraction operation.
+  // Here using clipping.
+  savedInvVar_->downClip(real(0.0));
+
+  calMovingMeanAndVar();
+
+  savedInvVar_->subScalar(-epsilon_);
+  savedInvVar_->sqrt2(*savedInvVar_);
+}
+
+void BatchNormalizationLayer::calMovingMeanAndVar() {
+  // calculating and saving moving mean and variance
+  auto& movingMean = movingMean_->getW();
+  auto& movingVar = movingVar_->getW();
+  // movingMean =  movingMean * movingAvgFraction_
+  //            + savedMean_ * (1 - movingAvgFraction_)
+  movingMean->add(*savedMean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
+  // movingVar =  movingVar * movingAvgFraction_
+  //           + savedInvVar_ * (1 - movingAvgFraction_)
+  movingVar->add(*savedInvVar_, movingAvgFraction_, 1.0 - movingAvgFraction_);
+}
+
+void BatchNormalizationLayer::setMeanAndStd() {
+  savedMean_->copyFrom(*(movingMean_->getW()));
+  savedInvVar_->copyFrom(*(movingVar_->getW()));
+  savedInvVar_->downClip(real(0.0));
+
+  savedInvVar_->subScalar(-epsilon_);
+  savedInvVar_->sqrt2(*savedInvVar_);
+}
+
+void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
+  CHECK_EQ(in->getWidth(), static_cast<size_t>(channels_ * imgPixels_));
+  CHECK_EQ(out->getWidth(), static_cast<size_t>(channels_));
+  CHECK(!in->isTransposed());
+  CHECK(!out->isTransposed());
+  if (imgPixels_ == 1) {
+    out->assign(*in);
+    return;
+  }
+  size_t batchSize = in->getHeight();
+  CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
+  if (useGpu_) {
+#ifndef PADDLE_WITH_CUDA
+    LOG(FATAL) << "paddle is compiled only for cpu";
+#else
+    batchTranspose(
+        in->getData(), out->getData(), imgPixels_, channels_, batchSize);
+#endif
+  } else {
+    for (size_t i = 0; i < batchSize; i++) {
+      const MatrixPtr inTmp =
+          Matrix::create(in->getData() + i * imgPixels_ * channels_,
+                         channels_,
+                         imgPixels_,
+                         false,
+                         useGpu_);
+      MatrixPtr outTmp =
+          Matrix::create(out->getData() + i * imgPixels_ * channels_,
+                         imgPixels_,
+                         channels_,
+                         false,
+                         useGpu_);
+      inTmp->transpose(outTmp, false);
+    }
+  }
+}
+
+void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
+  CHECK_EQ(in->getWidth(), static_cast<size_t>(channels_));
+  CHECK_EQ(out->getWidth(), static_cast<size_t>(channels_ * imgPixels_));
+  size_t batchSize = out->getHeight();
+  CHECK(!in->isTransposed());
+  CHECK(!out->isTransposed());
+  if (imgPixels_ == 1) {
+    out->assign(*in);
+    return;
+  }
+  CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
+  if (useGpu_) {
+#ifndef PADDLE_WITH_CUDA
+    LOG(FATAL) << "paddle is compiled only for cpu";
+#else
+    batchTranspose(
+        in->getData(), out->getData(), channels_, imgPixels_, batchSize);
+#endif
+  } else {
+    for (size_t i = 0; i < batchSize; i++) {
+      const MatrixPtr inTmp =
+          Matrix::create(in->getData() + i * channels_ * imgPixels_,
+                         imgPixels_,
+                         channels_,
+                         false,
+                         useGpu_);
+      MatrixPtr outTmp =
+          Matrix::create(out->getData() + i * imgPixels_ * channels_,
+                         channels_,
+                         imgPixels_,
+                         useGpu_);
+      inTmp->transpose(outTmp, false);
+    }
+  }
+}
+
+void BatchNormalizationLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = getInputValue(0)->getHeight();
+  calFeatureMapSize();
+  resetOutput(batchSize, getInputValue(0)->getWidth());
+
+  // for testing in training peroid.
+  useGlobalStats_ = (passType == PASS_TEST);
+  if (passType == PASS_TEST && config_.has_use_global_stats()) {
+    useGlobalStats_ = config_.use_global_stats();
+  }
+
+  Matrix::resizeOrCreate(
+      expandedIn_, batchSize * imgPixels_, channels_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      normIn_, batchSize * imgPixels_, channels_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      expandedOut_, batchSize * imgPixels_, channels_, false, useGpu_);
+  expandMat(getInputValue(0), expandedIn_);
+
+  if (useGlobalStats_) {
+    if (firstTest_) {
+      setMeanAndStd();
+      firstTest_ = false;
+    }
+  } else {
+    calMeanAndStd(expandedIn_);
+    firstTest_ = true;
+  }
+
+  normIn_->assign(*expandedIn_);
+  normIn_->addBias(*savedMean_, -1);     // subtract mean.
+  normIn_->divRowVector(*savedInvVar_);  // divide std.
+
+  expandedOut_->assign(*normIn_);
+  expandedOut_->mulRowVector(*weight_->getW());  // multiple gamma.
+  if (biases_) {
+    expandedOut_->addBias(*(biases_->getW()), 1);  // add beta.
+  }
+  MatrixPtr out = getOutputValue();
+  shrinkMat(expandedOut_, out);
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void BatchNormalizationLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+  int batchSize = getInputValue(0)->getHeight();
+
+  Matrix::resizeOrCreate(meanGrad_, 1, channels_, false, useGpu_);
+  Matrix::resizeOrCreate(stdGrad_, 1, channels_, false, useGpu_);
+
+  Matrix::resizeOrCreate(
+      expandedInGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inGrad_, batchSize, imgPixels_ * channels_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      normInGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      expandedOutGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      tmpMat_, batchSize * imgPixels_, channels_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      tmpGrad_, batchSize * imgPixels_, channels_, false, useGpu_);
+
+  expandMat(getOutputGrad(), expandedOutGrad_);
+
+  // compute derivatives.
+  if (biases_ && biases_->getWGrad()) {
+    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
+    biases_->getWGrad()->collectBias(*expandedOutGrad_, 1);
+    /* Increasing the number of gradient */
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+  if (weight_->getWGrad()) {
+    tmpMat_->dotMul(*expandedOutGrad_, *normIn_);
+    weight_->getWGrad()->collectBias(*tmpMat_, 1);
+  }
+
+  // compute input gradients.
+  normInGrad_->assign(*expandedOutGrad_);
+  normInGrad_->mulRowVector(*(weight_->getW()));  // multiple gamma.
+  // normInGrad * (x - \mu)/ \sqrt(\delta^2)
+  tmpMat_->dotMul(*normInGrad_, *normIn_);
+  stdGrad_->zeroMem();
+  stdGrad_->collectBias(*tmpMat_, -1.0 / (batchSize * imgPixels_));
+  tmpGrad_->assign(*normIn_);
+  tmpGrad_->mulRowVector(*stdGrad_);
+
+  meanGrad_->zeroMem();
+  meanGrad_->collectBias(*normInGrad_, -1.0 / (batchSize * imgPixels_));
+
+  expandedInGrad_->zeroMem();
+  expandedInGrad_->add(*normInGrad_, *tmpGrad_);
+  expandedInGrad_->addRowVector(*meanGrad_);
+  expandedInGrad_->divRowVector(*savedInvVar_);
+
+  shrinkMat(expandedInGrad_, inGrad_);
+  if (getInputGrad(0)) {
+    getInputGrad(0)->add(*getInputGrad(0), *inGrad_);
+  }
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BatchNormalizationLayer.h b/paddle/legacy/gserver/layers/BatchNormalizationLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5e4e690b6017f32de0f4d7557065c02c03d689f
--- /dev/null
+++ b/paddle/legacy/gserver/layers/BatchNormalizationLayer.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "BatchNormBaseLayer.h"
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * @brief A Inheritance class of Batch normalization layer.
+ * It supports both CPU and GPU.
+ *
+ * The config file api is batch_norm_layer.
+ */
+
+class BatchNormalizationLayer : public BatchNormBaseLayer {
+ public:
+  explicit BatchNormalizationLayer(const LayerConfig& config)
+      : BatchNormBaseLayer(config), firstTest_(true) {}
+
+  ~BatchNormalizationLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  /// Load pre-calculated mean and std.
+  void setMeanAndStd();
+
+  /// Calculate mean and std.
+  void calMeanAndStd(const MatrixPtr& mat);
+
+  /// Calculate moving mean and variance.
+  void calMovingMeanAndVar();
+
+  /// expand a Matrix from batch, channels* imagePixels to
+  /// batch * ImagePixels * channels.
+  void expandMat(const MatrixPtr& in, MatrixPtr& out);
+
+  /// Shrink a Matrix from  from batch * ImagePixels * channels
+  /// to batch, channels* imagePixels.
+  void shrinkMat(const MatrixPtr& in, MatrixPtr& out);
+
+  void onPassEnd() override { firstTest_ = true; }
+
+  MatrixPtr tmpMat_, tmpGrad_;
+  MatrixPtr expandedIn_, expandedOut_;
+  MatrixPtr expandedInGrad_, expandedOutGrad_, inGrad_;
+  MatrixPtr normIn_, normInGrad_, meanGrad_, stdGrad_;
+
+  /// Load mean and variance only once flag.
+  bool firstTest_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp b/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a091f51bc20e219c3111fb07058b5adea5a3fc38
--- /dev/null
+++ b/paddle/legacy/gserver/layers/BilinearInterpLayer.cpp
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BilinearInterpLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(bilinear_interp, BilinearInterpLayer);
+
+size_t BilinearInterpLayer::getSize() {
+  inImgH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  inImgW_ = inputLayers_[0]->getOutput().getFrameWidth();
+
+  const BilinearInterpConfig& conf = config_.inputs(0).bilinear_interp_conf();
+  if (inImgH_ == 0) {
+    inImgH_ = conf.image_conf().img_size_y();
+  }
+  if (inImgW_ == 0) {
+    inImgW_ = conf.image_conf().img_size();
+  }
+
+  outImgH_ = conf.out_size_y();
+  outImgW_ = conf.out_size_x();
+  numChannels_ = conf.image_conf().channels();
+
+  CHECK(outImgH_ > 0 && outImgW_ > 0);
+  CHECK(inImgH_ > 0 && inImgW_ > 0);
+  CHECK(numChannels_);
+
+  ratioH_ =
+      (outImgH_ > 1) ? static_cast<real>(inImgH_ - 1) / (outImgH_ - 1) : 0.f;
+  ratioW_ =
+      (outImgW_ > 1) ? static_cast<real>(inImgW_ - 1) / (outImgW_ - 1) : 0.f;
+
+  getOutput().setFrameHeight(outImgH_);
+  getOutput().setFrameWidth(outImgW_);
+  return outImgH_ * outImgW_ * numChannels_;
+}
+
+bool BilinearInterpLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(1, config_.inputs_size());
+
+  return true;
+}
+
+void BilinearInterpLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  size_t batchSize = getInput(0).getBatchSize();
+  size_t size = getSize();
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    resetOutput(batchSize, size);
+  }
+
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwBilinearInterpTimer", getName().c_str());
+    outV->bilinearForward(*inV,
+                          inImgH_,
+                          inImgW_,
+                          outImgH_,
+                          outImgW_,
+                          numChannels_,
+                          ratioH_,
+                          ratioW_);
+  }
+}
+
+void BilinearInterpLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  MatrixPtr inputG = getInputGrad(0);
+  MatrixPtr outG = getOutputGrad();
+  {
+    REGISTER_TIMER_INFO("BwBilinearInterpTimer", getName().c_str());
+    if (inputG) {
+      inputG->bilinearBackward(*outG,
+                               outImgH_,
+                               outImgW_,
+                               inImgH_,
+                               inImgW_,
+                               numChannels_,
+                               ratioH_,
+                               ratioW_);
+    }
+  }
+}
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BilinearInterpLayer.h b/paddle/legacy/gserver/layers/BilinearInterpLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c585a5ed10d9c8f241b5a5ff3a671752fda6d432
--- /dev/null
+++ b/paddle/legacy/gserver/layers/BilinearInterpLayer.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for bilinear interpolation which is
+ *        used on conv layer output.
+ *
+ * @note  The config file api is bilinear_interp_layer.
+ */
+class BilinearInterpLayer : public Layer {
+ protected:
+  size_t outImgH_, outImgW_;
+  size_t inImgH_, inImgW_;
+  real ratioH_, ratioW_;
+  size_t numChannels_;
+
+ public:
+  explicit BilinearInterpLayer(const LayerConfig& config) : Layer(config) {}
+
+  virtual ~BilinearInterpLayer() {}
+
+  size_t getSize();
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BlockExpandLayer.cpp b/paddle/legacy/gserver/layers/BlockExpandLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..24b5af67d40958c940eb0864994e7e81464f6c70
--- /dev/null
+++ b/paddle/legacy/gserver/layers/BlockExpandLayer.cpp
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BlockExpandLayer.h"
+
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+REGISTER_LAYER(blockexpand, BlockExpandLayer);
+
+bool BlockExpandLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(config_.inputs_size(), 1);
+  const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf();
+  blockH_ = blockConf.block_y();
+  blockW_ = blockConf.block_x();
+  strideH_ = blockConf.stride_y();
+  strideW_ = blockConf.stride_x();
+  paddingH_ = blockConf.padding_y();
+  paddingW_ = blockConf.padding_x();
+  channels_ = blockConf.channels();
+  imgSizeH_ = blockConf.img_size_y();
+  imgSizeW_ = blockConf.img_size_x();
+
+  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
+  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
+  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
+  createFunction(forward_,
+                 "BlockExpand",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+  createFunction(backward_,
+                 "BlockExpandGrad",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+
+  return true;
+}
+
+size_t BlockExpandLayer::getBlockNum() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  const BlockExpandConfig& blockConf = config_.inputs(0).block_expand_conf();
+  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (imgSizeH_ == 0) {
+    imgSizeH_ = blockConf.img_size_y();
+  }
+  if (imgSizeW_ == 0) {
+    imgSizeW_ = blockConf.img_size_x();
+  }
+  size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_;
+  outputH_ = (int)tmpH < 0 ? 1 : 1 + (tmpH + strideH_ - 1) / strideH_;
+  size_t tmpW = 2 * paddingW_ + imgSizeW_ - blockW_;
+  outputW_ = (int)tmpW < 0 ? 1 : 1 + (tmpW + strideW_ - 1) / strideW_;
+
+  return outputH_ * outputW_;
+}
+
+void BlockExpandLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  size_t blockNum = getBlockNum();
+  size_t blockSize = blockH_ * blockW_ * channels_;
+  resetOutput(blockNum * batchSize, blockSize);
+
+  // calculate output_.value
+  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inputShape_);
+  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+
+  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
+  Argument& out = getOutput();
+  ICpuGpuVector::resizeOrCreate(
+      out.sequenceStartPositions, batchSize + 1, false);
+  IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
+  int* start = out.sequenceStartPositions->getMutableData(false);
+  int* dims = out.cpuSequenceDims->getData();
+  for (size_t i = 0; i < batchSize; i++) {
+    start[i] = i * blockNum;
+    dims[2 * i] = outputH_;
+    dims[2 * i + 1] = outputW_;
+  }
+  start[batchSize] = batchSize * blockNum;
+}
+
+void BlockExpandLayer::backward(const UpdateCallback& callback) {
+  /* Calculate the input layers error */
+  if (getInputGrad(0)) {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getOutputGrad(), outputShape_);
+    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
+    backward_[0]->calc(inputs, outputs);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/BlockExpandLayer.h b/paddle/legacy/gserver/layers/BlockExpandLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b90249bfb0958f0081e7c668cd3b38a53c39951
--- /dev/null
+++ b/paddle/legacy/gserver/layers/BlockExpandLayer.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Expand feature map to minibatch matrix.
+ * - matrix width is: blockH_ * blockW_ * channels_
+ * - matirx height is: outputH_ * outputW_
+ *
+ * \f[
+ * outputH\_ = 1 + (2 * paddingH\_ + imgSizeH\_ - blockH\_ + strideH\_ - 1) /
+ *             strideH\_ \\
+ * outputW\_ = 1 + (2 * paddingW\_ + imgSizeW\_ - blockW\_ + strideW\_ - 1) /
+ *             strideW\_
+ * \f]
+ *
+ * The expand method is the same with ExpandConvLayer, but saved the transposed
+ * value. After expanding, output_.sequenceStartPositions will store timeline.
+ * The number of time steps are outputH_ * outputW_ and the dimension of each
+ * time step is blockH_ * blockW_ * channels_. This layer can be used after
+ * convolution neural network, and before recurrent neural network.
+ *
+ * The config file api is block_expand_layer.
+ */
+class BlockExpandLayer : public Layer {
+ protected:
+  /**
+   * @brief Calculate outputH_ and outputW_ and return block number which
+   * actually is time steps.
+   * @return time steps, outoutH_ * outputW_.
+   */
+  size_t getBlockNum();
+  size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
+  size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
+
+  TensorShape inputShape_;
+  TensorShape outputShape_;
+
+ public:
+  explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~BlockExpandLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/CRFDecodingLayer.cpp
rename to paddle/legacy/gserver/layers/CRFDecodingLayer.cpp
diff --git a/paddle/legacy/gserver/layers/CRFDecodingLayer.h b/paddle/legacy/gserver/layers/CRFDecodingLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..018162e146fa93725fe84bdf2da9a6124f3cea6f
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CRFDecodingLayer.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "CRFLayer.h"
+#include "LinearChainCRF.h"
+
+namespace paddle {
+
+/**
+ * A layer for calculating the decoding sequence of sequential conditional
+ * random field model.
+ * The decoding sequence is stored in output_.ids
+ * It also calculate error, output_.value[i] is 1 for incorrect decoding
+ * or 0 for correct decoding)
+ * See LinearChainCRF.h for the detail of the CRF formulation.
+ */
+class CRFDecodingLayer : public CRFLayer {
+ public:
+  explicit CRFDecodingLayer(const LayerConfig& config) : CRFLayer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+ protected:
+  std::unique_ptr<LinearChainCRF> crf_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/legacy/gserver/layers/CRFLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/CRFLayer.cpp
rename to paddle/legacy/gserver/layers/CRFLayer.cpp
diff --git a/paddle/legacy/gserver/layers/CRFLayer.h b/paddle/legacy/gserver/layers/CRFLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..88c2ed343ad5743068c871fe351437270d85f223
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CRFLayer.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "Layer.h"
+#include "LinearChainCRF.h"
+
+namespace paddle {
+
+/**
+ * A layer for calculating the cost of sequential conditional random field
+ * model.
+ * See class LinearChainCRF for the detail of the CRF formulation.
+ */
+class CRFLayer : public Layer {
+ public:
+  explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+ protected:
+  size_t numClasses_;
+  ParameterPtr parameter_;
+  std::vector<LinearChainCRF> crfs_;
+  LayerPtr weightLayer_;            // weight for each sequence
+  std::unique_ptr<Weight> weight_;  // parameters
+  real coeff_;                      // weight for the layer
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/legacy/gserver/layers/CTCLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/CTCLayer.cpp
rename to paddle/legacy/gserver/layers/CTCLayer.cpp
diff --git a/paddle/legacy/gserver/layers/CTCLayer.h b/paddle/legacy/gserver/layers/CTCLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d70b1f4ceb03028865378d1d01b5706b35b10de
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CTCLayer.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "LinearChainCTC.h"
+
+namespace paddle {
+
+class CTCLayer : public Layer {
+ public:
+  explicit CTCLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void forwardImp(const Argument& softmaxSeqs, const Argument& labelSeqs);
+  void backward(const UpdateCallback& callback) override;
+  void backwardImp(const UpdateCallback& callback,
+                   const Argument& softmaxSeqs,
+                   const Argument& labelSeqs);
+
+ protected:
+  size_t numClasses_;
+  bool normByTimes_;
+  std::vector<LinearChainCTC> ctcs_;
+  std::vector<Argument> tmpCpuInput_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ClipLayer.cpp b/paddle/legacy/gserver/layers/ClipLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6aa3c8fe64f5a59e82f3271baed99fd17fd6653f
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ClipLayer.cpp
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer for clipping the input value by the threshold.
+ * \f[
+ *   out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
+ * \f]
+ */
+
+class ClipLayer : public Layer {
+ protected:
+  double min_;
+  double max_;
+
+ public:
+  explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(clip, ClipLayer);
+
+bool ClipLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+  auto layerConf = config_.inputs(0).clip_conf();
+  min_ = layerConf.min();
+  max_ = layerConf.max();
+  CHECK_LT(min_, max_);
+  return true;
+}
+
+void ClipLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+  resetOutput(inV->getHeight(), inV->getWidth());
+  MatrixPtr outV = getOutputValue();
+  outV->copyFrom(*inV);
+  outV->clip(min_, max_);
+}
+
+void ClipLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  if (inG) {
+    MatrixPtr outV = getOutputValue();
+    MatrixPtr outG = getOutputGrad();
+    MatrixPtr tmpMtx;
+    Matrix::resizeOrCreate(
+        tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_);
+    tmpMtx->clipDerivative(*inV, min_, max_);
+    inG->addDotMul(*outG, *tmpMtx, 1, 1);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConcatenateLayer.cpp b/paddle/legacy/gserver/layers/ConcatenateLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ce3f2ca950bf87e287163f1cfc8b15d815a68cf4
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConcatenateLayer.cpp
@@ -0,0 +1,208 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "Projection.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A concatenate layer has multiple input layers. It concatenates rows of
+ * each input as one row for the output of this layer and apply activation.
+ */
+class ConcatenateLayer : public Layer {
+ public:
+  explicit ConcatenateLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ConcatenateLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(concat, ConcatenateLayer);
+
+bool ConcatenateLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  if (!Layer::init(layerMap, parameterMap)) return false;
+
+  CHECK(!biasParameter_);
+
+  return true;
+}
+
+void ConcatenateLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = getInput(0).getBatchSize();
+  int size = getSize();
+  reserveOutput(batchSize, size);
+
+  const MatrixPtr& out = getOutputValue();
+  int offset = 0;
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    const MatrixPtr& in = getInputValue(i);
+    size_t inSize = in->getWidth();
+    out->assignAtOffset(*in, offset);
+    offset += inSize;
+  }
+  CHECK_EQ(size, offset);
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void ConcatenateLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  /* Do activation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  const MatrixPtr& out = getOutputGrad();
+  int offset = 0;
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    const MatrixPtr& in = getInputGrad(i);
+    size_t inSize = getInputValue(i)->getWidth();
+    if (in) {
+      in->addAtOffset(*out, offset);
+    }
+    offset += inSize;
+  }
+}
+
+/**
+ * concat2 layer is like concat layer, but each input layer was
+ * processed by a Projection.
+ */
+class ConcatenateLayer2 : public Layer {
+ public:
+  explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
+
+  ~ConcatenateLayer2() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  std::vector<std::unique_ptr<Projection>> projections_;
+  std::vector<Argument> projOutput_;
+  std::vector<std::pair<size_t, size_t>> projCol_;
+  bool sharedBias_;
+  std::unique_ptr<Weight> biases_;
+};
+
+REGISTER_LAYER(concat2, ConcatenateLayer2);
+
+bool ConcatenateLayer2::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  if (!Layer::init(layerMap, parameterMap)) return false;
+
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  projections_.reserve(inputLayers_.size());
+  projCol_.reserve(inputLayers_.size());
+  projOutput_.resize(inputLayers_.size());
+
+  size_t startCol = 0;
+  size_t endCol = 0;
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    projections_.emplace_back(Projection::create(
+        config_.inputs(i).proj_conf(), parameters_[i], useGpu_));
+
+    endCol += projections_[i]->getOutputSize();
+    projCol_.push_back(std::make_pair(startCol, endCol));
+    startCol = endCol;
+  }
+  CHECK_EQ(getSize(), endCol);
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    sharedBias_ = config_.shared_biases();
+    size_t psize = config_.bias_size();
+    biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
+  }
+
+  return true;
+}
+
+void ConcatenateLayer2::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = getInput(0).getBatchSize();
+  int size = getSize();
+  resetOutput(batchSize, size);
+
+  for (size_t i = 0; i < projections_.size(); i++) {
+    size_t startCol = projCol_[i].first;
+    size_t endCol = projCol_[i].second;
+    projOutput_[i].value = output_.value->subColMatrix(startCol, endCol);
+    if (output_.grad) {
+      projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
+    }
+  }
+
+  {
+    AsyncGpuBlock block;
+    for (size_t i = 0; i != inputLayers_.size(); ++i) {
+      projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
+    }
+  }
+
+  /* add the bias-vector */
+  if (biases_) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void ConcatenateLayer2::backward(const UpdateCallback& callback) {
+  /* Do activation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  AsyncGpuBlock block;
+  if (biases_ && biases_->getWGrad()) {
+    REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    if (projections_[i]) {
+      projections_[i]->backward(callback);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ContextProjection.cpp b/paddle/legacy/gserver/layers/ContextProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8bcf32663eb381a7d7700270efcaa08f9ff86356
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ContextProjection.cpp
@@ -0,0 +1,185 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ContextProjection.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_PROJECTION(context, ContextProjection);
+
+ContextProjection::ContextProjection(const ProjectionConfig& config,
+                                     ParameterPtr parameter,
+                                     bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(config.has_context_start());
+  CHECK(config.has_context_length());
+  if (config.context_start() == 0 && config.context_length() == 1) {
+    config_.set_trainable_padding(false);
+  }
+  if (config_.trainable_padding()) {
+    CHECK(parameter);
+    beginPad_ = std::max(0, -config.context_start());
+    endPad_ = std::max(0, config.context_start() + config.context_length() - 1);
+    size_t totalPad = beginPad_ + endPad_;
+    size_t inputDim = parameter->getSize() / totalPad;
+    CHECK_EQ(config.input_size(), inputDim);
+    CHECK_EQ(inputDim * totalPad, parameter->getSize());
+    weight_.reset(new Weight(totalPad, inputDim, parameter));
+  }
+  // init forward_ and backward_ functions
+  init();
+}
+
+bool ContextProjection::init() {
+  size_t context_length = config_.context_length();
+  int context_start = config_.context_start();
+  bool is_padding = config_.trainable_padding();
+  size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
+
+  createFunction(forward_,
+                 "ContextProjectionForward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_));
+  createFunction(backward_,
+                 "ContextProjectionBackward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_)
+                     .set("is_padding", is_padding)
+                     .set("total_pad", total_pad));
+
+  return true;
+}
+
+void ContextProjection::resetState() {
+  CHECK_LE(config_.context_start() + config_.context_length(), 1)
+      << "state is not allowed for future context";
+  if (config_.context_start() >= 0) return;
+  Matrix::resizeOrCreate(state_,
+                         -config_.context_start(),
+                         config_.input_size(),
+                         false,  // trans
+                         useGpu_);
+  Matrix::resizeOrCreate(state2_,
+                         -config_.context_start(),
+                         config_.input_size(),
+                         false,  // trans
+                         useGpu_);
+  if (config_.trainable_padding()) {
+    state_->assign(*weight_->getW()->subMatrix(0, -config_.context_start()));
+  } else {
+    state_->zeroMem();
+  }
+}
+
+void ContextProjection::setState(LayerStatePtr state) {
+  CHECK(state->value.size() == 1)
+      << "one matrix is expected for ContextProjection state";
+  state_->copyFrom(*(state->value[0]));
+}
+
+LayerStatePtr ContextProjection::getState() {
+  if (state_ == nullptr) {
+    return nullptr;
+  }
+  LayerStatePtr res = std::make_shared<LayerState>();
+  res->value.push_back(state_->clone(0, 0, false));
+  res->value[0]->copyFrom(*state_);
+  return res;
+}
+
+void ContextProjection::forward() {
+  CHECK(in_->value && out_->value);
+  CHECK(in_->sequenceStartPositions);
+
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  // size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here";
+
+  REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
+  bool is_padding = config_.trainable_padding();
+  /// first use state_, otherwise use weight_(padding false === w nullptr)
+  auto w_ptr =
+      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
+  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*in_->value, *start_pos);
+  if (w_ptr) {
+    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
+                  *start_pos);
+  }
+  outputs.addArg(*out_->value, *start_pos, ADD_TO);
+  forward_[0]->calc(inputs, outputs);
+
+  if (state_ && config_.context_start() < 0) {
+    CHECK_EQ(1, in_->getNumSequences());
+    const int* starts = in_->sequenceStartPositions->getData(false);
+    int length = starts[1] - starts[0];
+    if (-config_.context_start() <= length) {
+      MatrixPtr sub = in_->value->subMatrix(starts[1] + config_.context_start(),
+                                            -config_.context_start());
+      state_->copyFrom(*sub);
+    } else {
+      int prevLength = -config_.context_start() - length;
+      state2_->subMatrix(0, prevLength)
+          ->copyFrom(*state_->subMatrix(length, prevLength));
+      state2_->subMatrix(prevLength, length)
+          ->copyFrom(*in_->value->subMatrix(starts[0], length));
+      std::swap(state_, state2_);
+    }
+  }
+}
+
+void ContextProjection::backward(const UpdateCallback& callback) {
+  CHECK(in_->value && out_->value && out_->grad);
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(batch_size, out_->value->getHeight());
+  CHECK_EQ(static_cast<int>(backward_.size()), 1)
+      << "Only one backward function here";
+
+  REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
+  bool is_padding = config_.trainable_padding();
+  auto start_pos = in_->sequenceStartPositions;
+  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(
+      CpuMatrix(
+          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
+      *in_->sequenceStartPositions->getVector(useGpu_),
+      ADD_TO);
+  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                           w_ptr ? w_ptr->getHeight() : 0,
+                           input_dim),
+                 ADD_TO);
+  backward_[0]->calc(inputs, outputs);
+
+  if (config_.trainable_padding()) {
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ContextProjection.h b/paddle/legacy/gserver/layers/ContextProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c217145419048282a9a09ad899dc970e7c9704f
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ContextProjection.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * @brief Context projection concatenate features in adjacent time steps in
+ * a sequence. The i-th row of the output is the concatenation of
+ * context_length rows of the input. The context_length rows are the
+ * consecutive rows from the i+shift_start row.
+ *
+ * For example, assumed input (x) has 4 words and the dimension of each word
+ * representation is 2. If we use zero to pad instead of learned weight to pad,
+ * and the context_lenth is 3, the output (y) is:
+ *
+ * @code
+ *  x = [a1, a2;
+ *       b1, b2;
+ *       c1, c2;
+ *       d1, d2]
+ *  y = [0,  0,  a1, a2, b1, b2;
+ *       a1, a2, b1, b2, c1, c2;
+ *       b1, b2, c1, c2, d1, d2;
+ *       c1, c2, d1, d2, 0,  0]
+ * @endcode
+ *
+ * The config file api is context_projection.
+ */
+class ContextProjection : public Projection {
+ public:
+  /**
+   * Constructor. If context_start is zero and context_lenth is one, it will
+   * set trainable_padding false. trainable_padding is an optional arguments
+   * and if it is set, constructor will set learned weight, which is used to
+   * pad output.
+   */
+  ContextProjection(const ProjectionConfig& config,
+                    ParameterPtr parameter,
+                    bool useGpu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+
+  virtual void resetState();
+
+  virtual void setState(LayerStatePtr state);
+
+  virtual LayerStatePtr getState();
+
+  virtual bool init();
+
+ protected:
+  std::unique_ptr<Weight> weight_;
+  /// number of extra timesteps added at the beginning
+  size_t beginPad_;
+  /// number of extra timesteps added at the end
+  size_t endPad_;
+  /// state_ and state2_ are used in sequence generating and saved
+  /// previous inputs.
+  MatrixPtr state_;
+  MatrixPtr state2_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Conv3DLayer.cpp b/paddle/legacy/gserver/layers/Conv3DLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d072a74234b43e06c1194acc2ec2b3f961b4a97e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Conv3DLayer.cpp
@@ -0,0 +1,253 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Conv3DLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(conv3d, Conv3DLayer);
+
+bool Conv3DLayer::init(const LayerMap &layerMap,
+                       const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  int index = 0;
+  for (auto &inputConfig : config_.inputs()) {
+    const ConvConfig &conf = inputConfig.conv_conf();
+    M_.push_back(numFilters_ / conf.groups());
+    K_.push_back(filterPixels_[index] * filterChannels_[index]);
+
+    // create a new weight
+    size_t height, width;
+    width = filterPixels_[index] * filterChannels_[index];
+    height = numFilters_;
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    ++index;
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
+  return true;
+}
+
+size_t Conv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  outputH_.clear();
+  outputW_.clear();
+  outputD_.clear();
+  N_.clear();
+  size_t layerSize = 0;
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    outputW_.push_back(outputSize(
+        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    outputH_.push_back(outputSize(
+        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    outputD_.push_back(outputSize(
+        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+
+    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
+    layerSize += N_[i] * numFilters_;
+  }
+  getOutput().setFrameHeight(outputH_[0]);
+  getOutput().setFrameWidth(outputW_[0]);
+  getOutput().setFrameDepth(outputD_[0]);
+  return layerSize;
+}
+
+void Conv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+
+  REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    const MatrixPtr &inMat = getInputValue(i);
+    const MatrixPtr &outMat = getOutputValue();
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    MatrixPtr wMat = weights_[i]->getW();
+    for (int n = 0; n < batchSize; ++n) {
+      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                       channels_[i],
+                       imgSizeD_[i],
+                       imgSizeH_[i],
+                       imgSizeW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i]);
+
+      real *outData = outMat->getData() + n * outMat->getStride();
+      MatrixPtr outMatSub =
+          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
+      for (int g = 0; g < groups_[i]; g++) {
+        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+        MatrixPtr in = colBuf_->subMatrix(g * K, K);
+        MatrixPtr out = outMatSub->subMatrix(g * M, M);
+        out->mul(*wMatSub, *in, 1.0, 1.0);
+      }
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    this->addBias();
+  }
+  forwardActivation();
+}
+
+void Conv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    if (weights_[i]->getWGrad()) {
+      bpropWeights(i);
+    }
+    if (getInputGrad(i)) {
+      bpropData(i);
+    }
+    weights_[i]->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void Conv3DLayer::bpropWeights(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  const MatrixPtr &inMat = getInputValue(i);
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wGradMat = weights_[i]->getWGrad();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i]);
+
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
+      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
+    }
+  }
+}
+
+void Conv3DLayer::bpropData(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wMat = weights_[i]->getW();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    real *preGradData =
+        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
+      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
+    }
+    colBuf_->col2Vol(preGradData,
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i],
+                     1.0,
+                     1.0);
+  }
+}
+
+void Conv3DLayer::bpropBiases() {
+  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
+                                    1,
+                                    biases_->getWGrad()->getElementCnt(),
+                                    false,
+                                    useGpu_);
+  MatrixPtr outGradMat = getOutputGrad();
+
+  if (this->sharedBiases_) {
+    biases->collectSharedBias(*outGradMat, 1.0f);
+  } else {
+    biases->collectBias(*outGradMat, 1.0f);
+  }
+}
+
+void Conv3DLayer::addBias() {
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
+                                  1,
+                                  biases_->getW()->getElementCnt(),
+                                  false,
+                                  useGpu_);
+  if (this->sharedBiases_) {
+    outMat->addSharedBias(*(bias), 1.0f);
+  } else {
+    outMat->addBias(*(bias), 1.0f);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Conv3DLayer.h b/paddle/legacy/gserver/layers/Conv3DLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb42a2f36d31365b473d7f593fd27dc063c83c47
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Conv3DLayer.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of convolution layer.
+ * This layer expands input and use matrix multiplication to
+ * calculate convolution operation.
+ */
+class Conv3DLayer : public ConvBaseLayer {
+ public:
+  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~Conv3DLayer() {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+
+ protected:
+  // Figure out the dimensions for individual gemms.
+  IntV M_;  /// numFilters_ / filter_group_;
+  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
+  IntV K_;  /// outputD_ * outputH_ * outputW_
+  MatrixPtr colBuf_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.cpp b/paddle/legacy/gserver/layers/ConvBaseLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76120915e48661a9b14fb6b9bb99e9ec9dd71e4b
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvBaseLayer.cpp
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvBaseLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/utils/Logging.h"
+namespace paddle {
+
+bool ConvBaseLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv")
+                  ? false
+                  : true;
+
+  /* Initialize the convolutional layer parameter */
+  numFilters_ = config_.num_filters();
+  sharedBiases_ = config_.shared_biases();
+  for (auto& inputConfig : config_.inputs()) {
+    const ConvConfig& conf = inputConfig.conv_conf();
+    padding_.push_back(conf.padding());
+    stride_.push_back(conf.stride());
+    dilation_.push_back(conf.dilation());
+    filterSize_.push_back(conf.filter_size());
+    paddingY_.push_back(conf.padding_y());
+    strideY_.push_back(conf.stride_y());
+    dilationY_.push_back(conf.dilation_y());
+    filterSizeY_.push_back(conf.filter_size_y());
+    channels_.push_back(conf.channels());
+    imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
+                                              : conf.img_size());
+    imgSizeW_.push_back(conf.img_size());
+    groups_.push_back(conf.groups());
+    filterChannels_.push_back(conf.filter_channels());
+    outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
+    outputW_.push_back(conf.output_x());
+
+    paddingZ_.push_back(conf.padding_z());
+    strideZ_.push_back(conf.stride_z());
+    filterSizeZ_.push_back(conf.filter_size_z());
+    imgSizeD_.push_back(conf.img_size_z());
+    outputD_.push_back(conf.output_z());
+    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
+                            filterSizeZ_.back());
+  }
+
+  CHECK(inputLayers_.size() == parameters_.size());
+
+  // create new weights_ in derived class
+  // create new biases_ in derived class
+
+  // default caffe model
+  caffeMode_ = true;
+
+  return true;
+}
+
+size_t ConvBaseLayer::calOutputSize() {
+  auto clearAndReserve = [this](IntV* vec) {
+    vec->clear();
+    vec->reserve(this->inputLayers_.size());
+  };
+  clearAndReserve(&imgSizeH_);
+  clearAndReserve(&imgSizeW_);
+  clearAndReserve(&outputH_);
+  clearAndReserve(&outputW_);
+  size_t layerSize = 0;
+
+  auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
+    size_t filterSizeY;
+    size_t filterSize;
+    for (size_t i = 0; i < inputLayers_.size(); i++) {
+      filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
+      filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
+      inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
+      inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
+      const ConvConfig& conf = config_.inputs(i).conv_conf();
+      if (isDeconv_) {
+        if (inH[i] == 0)
+          inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
+        if (inW[i] == 0) inW[i] = conf.output_x();
+        outH.push_back(imageSize(
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
+        outW.push_back(
+            imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
+      } else {
+        if (inH[i] == 0)
+          inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+        if (inW[i] == 0) inW[i] = conf.img_size();
+        outH.push_back(outputSize(
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
+        outW.push_back(outputSize(
+            inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
+      }
+      CHECK_EQ(outH[i], outH[0]);
+      CHECK_EQ(outW[i], outW[0]);
+    }
+    getOutput().setFrameHeight(outH[0]);
+    getOutput().setFrameWidth(outW[0]);
+    layerSize = outH[0] * outW[0] * size_t(numFilters_);
+  };
+
+  setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
+
+  return layerSize;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseLayer.h b/paddle/legacy/gserver/layers/ConvBaseLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..01e90e999625f986b0f13d2b73a883297c097841
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvBaseLayer.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/MathUtils.h"
+namespace paddle {
+
+/**
+ * @brief A Base Convolution Layer, which convolves the input image
+ * with learned filters and (optionally) adds biases.
+ */
+
+class ConvBaseLayer : public Layer {
+ protected:
+  typedef std::vector<int> IntV;
+
+  /// True if it's deconv layer, false if it's convolution layer
+  bool isDeconv_;
+
+  /// The number of filters.
+  int numFilters_;
+  /// The x dimension of the padding.
+  IntV padding_;
+  /// The y dimension of the padding.
+  IntV paddingY_;
+  /// The x dimension of the stride.
+  IntV stride_;
+  /// The y dimension of the stride.
+  IntV strideY_;
+  /// The x dimension of the dilation.
+  IntV dilation_;
+  /// The y dimension of the dilation.
+  IntV dilationY_;
+  /// The x dimension of a filter kernel.
+  IntV filterSize_;
+  /// The y dimension of a filter kernel.
+  IntV filterSizeY_;
+  /// The spatial dimensions of the convolution input.
+  IntV channels_;
+  /// The spatial dimensions of input feature map height.
+  IntV imgSizeH_;
+  /// The spatial dimensions of input feature map width.
+  IntV imgSizeW_;
+  /// filterPixels_ = filterSizeX_ * filterSizeY_.
+  IntV filterPixels_;
+  /// filterChannels_ = channels_/groups_.
+  IntV filterChannels_;
+  /// The spatial dimensions of output feature map height.
+  IntV outputH_;
+  /// The spatial dimensions of output feature map width.
+  IntV outputW_;
+
+  IntV outputD_;
+  IntV imgSizeD_;
+  IntV filterSizeZ_;
+  IntV strideZ_;
+  IntV paddingZ_;
+
+  /// Group size, refer to grouped convolution in
+  /// Alex Krizhevsky's paper: when group=2, the first half of the
+  /// filters are only connected to the first half of the input channels,
+  /// and the second half only connected to the second half.
+  IntV groups_;
+  /// Whether the bias is shared for feature in each channel.
+  bool sharedBiases_;
+
+  /// shape of weight: (numChannels * filterPixels_, numFilters)
+  WeightList weights_;
+  /// If shared_biases is false shape of bias: (numFilters_, 1)
+  /// If shared_biases is ture shape of bias:
+  /// (numFilters_ * outputX * outputY, 1)
+  std::unique_ptr<Weight> biases_;
+
+  /// True by default. The only difference is the calculation
+  /// of output size.
+  bool caffeMode_;
+
+ public:
+  explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  /**
+   * imgSizeH_ and imgSizeW_ will be set according to the previous input layers
+   * in this function. Then it will calculate outputH_ and outputW_ and set them
+   * into output argument.
+   */
+  virtual size_t calOutputSize();
+
+  Weight& getWeight(int idx) { return *weights_[idx]; }
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.cpp b/paddle/legacy/gserver/layers/ConvBaseOperator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8e59b3bfe9d8a9e54e5c11906707d10ec346a4d
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvBaseOperator.cpp
@@ -0,0 +1,151 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvBaseOperator.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvBaseOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input indepedently.
+ *
+ * The config file api is conv_operator.
+ */
+
+ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu)
+    : Operator(config, useGpu) {
+  CHECK(useGpu);
+  CHECK_EQ(config_.input_indices_size(), 2L);
+
+  caffeMode_ = true;
+  getConvParams();
+  computeConvSizes();
+
+  // initialize all to default algorithms
+  fwdAlgo_ = 0;
+  bwdFilterAlgo_ = 0;
+  bwdDataAlgo_ = 0;
+  fwdLimitBytes_ = 0;
+  bwdDataLimitBytes_ = 0;
+  bwdFilterLimitBytes_ = 0;
+  workSpaceInBytes_ = 0;
+  workSpace_ = nullptr;
+
+  isSelectAlgo_ = false;
+}
+
+void ConvBaseOperator::allocConvWorkSpace() {
+  hl_conv_workspace(imageDesc_,
+                    outputDesc_,
+                    filterDesc_,
+                    convDesc_,
+                    &fwdAlgo_,
+                    &fwdLimitBytes_,
+                    &bwdDataAlgo_,
+                    &bwdDataLimitBytes_,
+                    &bwdFilterAlgo_,
+                    &bwdFilterLimitBytes_,
+                    /*useDilation*/ false);
+
+  size_t maxWorkSpace = 0;
+  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
+  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
+
+  if (maxWorkSpace > workSpaceInBytes_) {
+    if (workSpaceInBytes_ != 0) {
+      hl_free_mem_device(workSpace_);
+    }
+    // total amount of storage needed
+    workSpace_ = hl_malloc_device(maxWorkSpace);
+    workSpaceInBytes_ = maxWorkSpace;
+  }
+}
+
+void ConvBaseOperator::computeConvSizes() {
+  hl_create_filter_descriptor(
+      &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_);
+  hl_create_tensor_descriptor(&imageDesc_);
+  hl_create_tensor_descriptor(&outputDesc_);
+  hl_create_convolution_descriptor(&convDesc_,
+                                   imageDesc_,
+                                   filterDesc_,
+                                   paddingY_,
+                                   padding_,
+                                   strideY_,
+                                   stride_);
+}
+
+void ConvBaseOperator::reshapeImageDescriptors() {
+  hl_tensor_reshape(imageDesc_,
+                    1,
+                    channels_,
+                    imageH_,
+                    imageW_,
+                    channels_ * imageH_ * imageW_,
+                    imageH_ * imageW_,
+                    imageW_,
+                    1);
+  hl_tensor_reshape(outputDesc_,
+                    1,
+                    numFilters_,
+                    outputH_,
+                    outputW_,
+                    numFilters_ * outputH_ * outputW_,
+                    outputH_ * outputW_,
+                    outputW_,
+                    1);
+  hl_reset_convolution_descriptor(convDesc_,
+                                  imageDesc_,
+                                  filterDesc_,
+                                  paddingY_,
+                                  padding_,
+                                  strideY_,
+                                  stride_);
+}
+
+void ConvBaseOperator::getConvParams() {
+  configNumFilters_ = config_.num_filters();
+  const ConvConfig &conf = config_.conv_conf();
+  padding_ = conf.padding();
+  stride_ = conf.stride();
+  filterSize_ = conf.filter_size();
+  paddingY_ = conf.padding_y();
+  strideY_ = conf.stride_y();
+  filterSizeY_ = conf.filter_size_y();
+  filterPixels_ = filterSize_ * filterSizeY_;
+  configChannels_ = conf.channels();
+  imgSize_ = conf.img_size();
+  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  imgPixels_ = imgSize_ * imgSizeY_;
+  CHECK_EQ(conf.groups(), 1U);
+  filterChannels_ = conf.filter_channels();
+  outputX_ = conf.output_x();
+  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  outputs_ = outputX_ * outputX_;
+
+  isDeconv_ = (config_.type() == "conv") ? false : true;
+  if (isDeconv_) {
+    channels_ = configNumFilters_;
+    numFilters_ = configChannels_;
+  } else {
+    channels_ = configChannels_;
+    numFilters_ = configNumFilters_;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseOperator.h b/paddle/legacy/gserver/layers/ConvBaseOperator.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ac77f2d743abd6f01e8e3f1e2f4e730c0e6fb39
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvBaseOperator.h
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "Operator.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input indepedently.
+ *
+ * The config file api is conv_operator.
+ */
+
+class ConvBaseOperator : public Operator {
+ public:
+  ConvBaseOperator(const OperatorConfig &config, bool useGpu);
+  /**
+   * Free workspace in device and destroy cudnn tensor descriptor.
+   */
+  virtual ~ConvBaseOperator() {
+    if (workSpaceInBytes_ != 0) {
+      hl_free_mem_device(workSpace_);
+      workSpaceInBytes_ = 0;
+    }
+
+    hl_destroy_tensor_descriptor(imageDesc_);
+    hl_destroy_tensor_descriptor(outputDesc_);
+    hl_destroy_filter_descriptor(filterDesc_);
+    hl_destroy_convolution_descriptor(convDesc_);
+  }
+
+ protected:
+  /**
+   * Get convolution parameters from layer config and
+   * initialize member variables.
+   */
+  void getConvParams();
+
+  /**
+   * Allocate Gpu Memory for cudnn convolution algorithms.
+   */
+  void allocConvWorkSpace();
+
+  /**
+   * Create cudnn tensor descriptor for convolution operation.
+   */
+  void computeConvSizes();
+
+  /**
+   * Reshape cudnn tensor descriptor.
+   */
+  void reshapeImageDescriptors();
+
+  /**
+   * Reshape cudnn tensor descriptor.
+   */
+  virtual void reshape(int batchSize) = 0;
+
+  /**
+   * Check filter size is equal to the size calculated by parameters from
+   * layer config.
+   */
+  void checkFilterSize(const MatrixPtr &filter) {
+    CHECK_EQ(static_cast<int>(filter->getWidth()),
+             filterSize_ * filterSizeY_ * channels_ * numFilters_);
+  }
+
+  /// Most of member variables are same with CudnnConvLayer.
+  /// There is no explanation here.
+  bool isDeconv_;
+  int imageH_, imageW_, outputH_, outputW_;
+  hl_tensor_descriptor imageDesc_;
+  hl_tensor_descriptor outputDesc_;
+  hl_filter_descriptor filterDesc_;
+  hl_convolution_descriptor convDesc_;
+  bool caffeMode_;
+  int inputOffset_, outputOffset_, weightOffset_;
+  int numFilters_, channels_;
+
+  /// from parsing config
+  int configNumFilters_, configChannels_;
+  int padding_, stride_, filterSize_, imgSize_, imgSizeY_;
+  int paddingY_, strideY_, filterSizeY_;
+  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
+
+  /// Following member variables are same with CudnnConvLayer.
+  /// There is no explanation here.
+  int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_;
+  size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_;
+  size_t workSpaceInBytes_;
+  void *workSpace_;
+  bool isSelectAlgo_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseProjection.cpp b/paddle/legacy/gserver/layers/ConvBaseProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ff5d3412de1c2940cdd9dcf9397370153c24b0c6
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvBaseProjection.cpp
@@ -0,0 +1,199 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvBaseProjection.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
+
+ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
+                                       ParameterPtr parameter,
+                                       bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(useGpu);  // only support GPU
+  getConvParams();
+  initCudnn();
+
+  size_t height = filterH_ * filterW_ * channels_ / groups_;
+  size_t width = numFilters_;
+  weight_.reset(new Weight(height, width, parameter));
+  weightOffset_ = height * width / groups_;
+}
+
+void ConvBaseProjection::getConvParams() {
+  const ConvConfig &conf = config_.conv_conf();
+  paddingH_ = conf.padding_y();
+  paddingW_ = conf.padding();
+
+  strideH_ = conf.stride_y();
+  strideW_ = conf.stride();
+
+  dilationH_ = conf.dilation_y();
+  dilationW_ = conf.dilation();
+  CHECK_GT(dilationH_, 0);
+  CHECK_GT(dilationW_, 0);
+
+  filterH_ = conf.filter_size_y();
+  filterW_ = conf.filter_size();
+
+  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  configImgW_ = conf.img_size();
+
+  configOutH_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  configOutW_ = conf.output_x();
+
+  configChannels_ = conf.channels();
+  configNumFilters_ = config_.num_filters();
+
+  isDeconv_ = (config_.type() == "conv") ? false : true;
+
+  channels_ = (isDeconv_) ? configNumFilters_ : configChannels_;
+  numFilters_ = (isDeconv_) ? configChannels_ : configNumFilters_;
+
+  groups_ = conf.groups();
+  CHECK_EQ(channels_ % groups_, 0);
+  CHECK_EQ(numFilters_ % groups_, 0);
+}
+
+void ConvBaseProjection::initCudnn() {
+  hl_create_filter_descriptor(&filterDesc_,
+                              channels_ / groups_,
+                              numFilters_ / groups_,
+                              filterH_,
+                              filterW_);
+  hl_create_tensor_descriptor(&imageDesc_);
+  hl_create_tensor_descriptor(&outputDesc_);
+  hl_create_convolution_descriptor(&convDesc_,
+                                   imageDesc_,
+                                   filterDesc_,
+                                   paddingH_,
+                                   paddingW_,
+                                   strideH_,
+                                   strideW_,
+                                   dilationH_,
+                                   dilationW_);
+
+  // initialize all to default algorithms
+  fwdAlgo_ = 0;
+  bwdFilterAlgo_ = 0;
+  bwdDataAlgo_ = 0;
+  fwdLimitBytes_ = 0;
+  bwdDataLimitBytes_ = 0;
+  bwdFilterLimitBytes_ = 0;
+  workSpaceInBytes_ = 0;
+}
+
+void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
+  // The stride between two consecutive samples in the output of ConvProjection
+  // may not be numFilters_ * outputH_ * outputW_ (conv) or
+  // channels_ * imageH_ * imageW_ (deconv)
+  // for example, in the case of layer ConcatenateLayer2 with two
+  // ConvProjection, the stride is the output_size of layer ConcatenateLayer2.
+  // So the calculation of nStride is different from CudnnConvLayer.
+  size_t nStrideImage, nStrideOutput;
+  if (isDeconv_) {
+    nStrideImage = out_->value->getStride();
+    nStrideOutput = numFilters_ * outputH_ * outputW_;
+  } else {
+    nStrideImage = channels_ * imageH_ * imageW_;
+    nStrideOutput = out_->value->getStride();
+  }
+
+  hl_tensor_reshape(imageDesc_,
+                    batchSize,
+                    channels_ / groups_,
+                    imageH_,
+                    imageW_,
+                    nStrideImage,
+                    imageH_ * imageW_,
+                    imageW_,
+                    1);
+
+  hl_tensor_reshape(outputDesc_,
+                    batchSize,
+                    numFilters_ / groups_,
+                    outputH_,
+                    outputW_,
+                    nStrideOutput,
+                    outputH_ * outputW_,
+                    outputW_,
+                    1);
+
+  hl_reset_convolution_descriptor(convDesc_,
+                                  imageDesc_,
+                                  filterDesc_,
+                                  paddingH_,
+                                  paddingW_,
+                                  strideH_,
+                                  strideW_,
+                                  dilationH_,
+                                  dilationW_);
+}
+
+void ConvBaseProjection::reshape(int batchSize) {
+  size_t width = calOutputSize();
+  CHECK_EQ(width, out_->value->getWidth());
+  CHECK_EQ(calInputSize(), in_->value->getWidth());
+
+  reshapeTensorDesc(batchSize);
+  bool useDilation = false;
+  if (dilationH_ > 1 || dilationW_ > 1) {
+    useDilation = true;
+  }
+  hl_conv_workspace(imageDesc_,
+                    outputDesc_,
+                    filterDesc_,
+                    convDesc_,
+                    &fwdAlgo_,
+                    &fwdLimitBytes_,
+                    &bwdDataAlgo_,
+                    &bwdDataLimitBytes_,
+                    &bwdFilterAlgo_,
+                    &bwdFilterLimitBytes_,
+                    useDilation);
+
+  size_t maxWorkSpace = 0;
+  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
+  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
+  workSpaceInBytes_ = maxWorkSpace;
+
+  VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
+          << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
+}
+
+void *ConvBaseProjection::getSpaceBytes(size_t size) {
+  std::vector<MemoryHandlePtr> &convMem = *convMem_;
+  if (convMem.empty()) {
+    int numDevices = hl_get_device_count();
+    convMem.resize(numDevices);
+  }
+
+  int devId = hl_get_device();
+  MemoryHandlePtr localMem = convMem[devId];
+  if (NULL == localMem || size > localMem->getAllocSize()) {
+    localMem = std::make_shared<GpuMemoryHandle>(size);
+  }
+  return localMem->getBuf();
+}
+
+ConvBaseProjection::~ConvBaseProjection() {
+  hl_destroy_tensor_descriptor(imageDesc_);
+  hl_destroy_tensor_descriptor(outputDesc_);
+  hl_destroy_filter_descriptor(filterDesc_);
+  hl_destroy_convolution_descriptor(convDesc_);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvBaseProjection.h b/paddle/legacy/gserver/layers/ConvBaseProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcf5ce0f48daac396bab0ec7620303f6c1236fc2
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvBaseProjection.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Projection.h"
+#include "paddle/legacy/math/MathUtils.h"
+
+namespace paddle {
+
+/**
+ * @brief Base class for ConvProjection and ConvTransProjection.
+ */
+class ConvBaseProjection : public Projection {
+ public:
+  /**
+   * Constructor.
+   */
+  ConvBaseProjection(const ProjectionConfig& config,
+                     ParameterPtr parameter,
+                     bool useGpu);
+
+  ~ConvBaseProjection();
+
+ protected:
+  void getConvParams();
+  void initCudnn();
+
+  void reshapeTensorDesc(int batchSize);
+  void reshape(int batchSize);
+
+  virtual size_t calOutputSize() = 0;
+  virtual size_t calInputSize() = 0;
+
+  static void* getSpaceBytes(size_t size);
+
+  /// True if it's deconv projection layer, false if it's ConvProjection layer
+  bool isDeconv_;
+  /// imageH_ and imageW_ / outputH_ and outputW_
+  /// is calculated from the input layer.
+  int imageH_, imageW_;
+  int outputH_, outputW_;
+  /// configImgH_ and configImgW_ / configOutH_ and configOutW_
+  /// is obtained from config.
+  int configImgH_, configImgW_;
+  int configOutH_, configOutW_;
+  /// channels_ and numFilters_ are defined in terms of convolution semantics
+  int channels_, numFilters_;
+  /// configChannels and configNumFilters_ are obtained from config
+  /// For Conv they are the same as channels_ and numFilters
+  /// For ConvTrans they are opposite to channels_ and numFilters
+  int configChannels_, configNumFilters_;
+  int paddingH_, paddingW_;
+  int strideH_, strideW_;
+  int dilationH_, dilationW_;
+  int filterH_, filterW_;
+  /// One group offset of input data.
+  int inputOffset_;
+  /// One group offset of output data.
+  int outputOffset_;
+  /// One group offset of weight.
+  int weightOffset_;
+  int groups_;
+
+  /// Cudnn tensor descriptor for input.
+  hl_tensor_descriptor imageDesc_;
+  /// Cudnn tensor descriptor for output.
+  hl_tensor_descriptor outputDesc_;
+  /// Cudnn tensor descriptor for filter.
+  hl_filter_descriptor filterDesc_;
+  /// Cudnn tensor descriptor for a convolution operation.
+  hl_convolution_descriptor convDesc_;
+
+  /// Record the algorithm for forward convolution, which is obtained by cudnn
+  /// api to search the best suited algorithm.
+  int fwdAlgo_;
+  /// Record the algorithm for computing convolution gradient with respect to
+  /// filter coefficients.
+  int bwdFilterAlgo_;
+  /// Record the algorithm for computing convolution gradient with respect to
+  /// the output.
+  int bwdDataAlgo_;
+  /// Amount of GPU memory needed as workspace to be able to execute a
+  /// forward convolution with the specified algo.
+  size_t fwdLimitBytes_;
+  /// Amount of GPU memory needed as workspace to be able to execute a
+  /// backwardFilter with the specified algo.
+  size_t bwdDataLimitBytes_;
+  /// Amount of GPU memory needed as workspace to be able to execute a
+  /// backwardData with the specified algo.
+  size_t bwdFilterLimitBytes_;
+  /// Size of total work space.
+  size_t workSpaceInBytes_;
+  bool bias_;
+
+  std::unique_ptr<Weight> weight_;
+  static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvOperator.cpp b/paddle/legacy/gserver/layers/ConvOperator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5276b2c3920eee923f13a47d40b4498c6846f94b
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvOperator.cpp
@@ -0,0 +1,128 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvOperator.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input indepedently.
+ *
+ * The config file api is conv_operator.
+ */
+
+REGISTER_OPERATOR(conv, ConvOperator);
+
+void ConvOperator::reshape(int batchSize) {
+  imageH_ = ins_[0]->getFrameHeight();
+  imageW_ = ins_[0]->getFrameWidth();
+  if (imageH_ == 0) imageH_ = imgSizeY_;
+  if (imageW_ == 0) imageW_ = imgSize_;
+  outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
+  outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
+  /// Check that the outputSizes are consistent with config
+  CHECK_EQ(outputH_, outputY_);
+  CHECK_EQ(outputW_, outputX_);
+  out_->setFrameHeight(outputH_);
+  out_->setFrameWidth(outputW_);
+
+  reshapeImageDescriptors();
+
+  inputOffset_ = channels_ * imageH_ * imageW_;
+  outputOffset_ = numFilters_ * outputH_ * outputW_;
+  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
+
+  if (!isSelectAlgo_) {
+    allocConvWorkSpace();
+  }
+
+  isSelectAlgo_ = true;
+}
+
+void ConvOperator::forward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  reshape(batchSize);
+  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
+  checkFilterSize(ins_[1]->value);
+  Matrix::resizeOrCreate(out_->value,
+                         batchSize,
+                         outputH_ * outputW_ * numFilters_,
+                         false,
+                         useGpu_);
+  {
+    AsyncGpuBlock block;
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
+      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+      real *outData = out_->value->getData() + outputOffset_ * batchId;
+      hl_convolution_forward(imageDesc_,
+                             inputData,
+                             outputDesc_,
+                             outData,
+                             filterDesc_,
+                             wgtData,
+                             convDesc_,
+                             workSpace_,
+                             workSpaceInBytes_,
+                             fwdAlgo_);
+    }
+  }
+}
+
+void ConvOperator::backward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  {
+    AsyncGpuBlock block;
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
+      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
+      if (ins_[1]->grad) {
+        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
+        hl_convolution_backward_filter(imageDesc_,
+                                       inputData,
+                                       outputDesc_,
+                                       outGrad,
+                                       filterDesc_,
+                                       weightGrad,
+                                       convDesc_,
+                                       workSpace_,
+                                       workSpaceInBytes_,
+                                       bwdFilterAlgo_);
+      }
+
+      MatrixPtr preGrad = ins_[0]->grad;
+      if (NULL != preGrad) {
+        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
+        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+        hl_convolution_backward_data(imageDesc_,
+                                     inputGrad,
+                                     outputDesc_,
+                                     outGrad,
+                                     filterDesc_,
+                                     wgtData,
+                                     convDesc_,
+                                     workSpace_,
+                                     workSpaceInBytes_,
+                                     bwdDataAlgo_);
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvOperator.h b/paddle/legacy/gserver/layers/ConvOperator.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f31620111c8ff3818d83145e16012d22b067a12
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvOperator.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "ConvBaseOperator.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input indepedently.
+ *
+ * The config file api is conv_operator.
+ */
+
+class ConvOperator : public ConvBaseOperator {
+ public:
+  ConvOperator(const OperatorConfig &config, bool useGpu)
+      : ConvBaseOperator(config, useGpu) {}
+  /**
+   * Free workspace in device and destroy cudnn tensor descriptor.
+   */
+  virtual ~ConvOperator() {}
+  void forward() override;
+  void backward() override;
+  void reshape(int batchSize) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvProjection.cpp b/paddle/legacy/gserver/layers/ConvProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b40cdac2587d1fc0fec00801414560d2a27bd34a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvProjection.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvProjection.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_PROJECTION(conv, ConvProjection);
+
+size_t ConvProjection::calOutputSize() {
+  imageH_ = in_->getFrameHeight();
+  imageW_ = in_->getFrameWidth();
+  if (imageH_ == 0) imageH_ = configImgH_;
+  if (imageW_ == 0) imageW_ = configImgW_;
+  outputH_ = outputSize(imageH_,
+                        (filterH_ - 1) * dilationH_ + 1,
+                        paddingH_,
+                        strideH_,
+                        /* caffeMode */ true);
+  outputW_ = outputSize(imageW_,
+                        (filterW_ - 1) * dilationW_ + 1,
+                        paddingW_,
+                        strideW_,
+                        /* caffeMode */ true);
+
+  const_cast<Argument *>(out_)->setFrameHeight(outputH_);
+  const_cast<Argument *>(out_)->setFrameWidth(outputW_);
+
+  inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_;
+  outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_;
+  return outputH_ * outputW_ * configNumFilters_;
+}
+
+size_t ConvProjection::calInputSize() {
+  return static_cast<size_t>(configChannels_ * imageH_ * imageW_);
+}
+
+void ConvProjection::forward() {
+  int batchSize = in_->value->getHeight();
+  reshape(batchSize);
+
+  void *workSpace = NULL;
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+
+  for (int g = 0; g < groups_; ++g) {
+    REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
+
+    real *inputData = in_->value->getData() + g * inputOffset_;
+    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+    real *outData = out_->value->getData() + g * outputOffset_;
+    hl_convolution_forward(imageDesc_,
+                           inputData,
+                           outputDesc_,
+                           outData,
+                           filterDesc_,
+                           wgtData,
+                           convDesc_,
+                           workSpace,
+                           fwdLimitBytes_,
+                           fwdAlgo_);
+  }
+}
+
+void ConvProjection::backward(const UpdateCallback &callback) {
+  REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
+
+  void *workSpace = NULL;
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+
+  for (int g = 0; g < groups_; ++g) {
+    real *outGrad = out_->grad->getData() + g * outputOffset_;
+    if (weight_->getWGrad()) {
+      real *inputData = in_->value->getData() + g * inputOffset_;
+      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
+      hl_convolution_backward_filter(imageDesc_,
+                                     inputData,
+                                     outputDesc_,
+                                     outGrad,
+                                     filterDesc_,
+                                     weightGrad,
+                                     convDesc_,
+                                     workSpace,
+                                     bwdFilterLimitBytes_,
+                                     bwdFilterAlgo_);
+    }
+
+    MatrixPtr preGrad = in_->grad;
+    if (NULL != preGrad) {
+      real *inputGrad = preGrad->getData() + g * inputOffset_;
+      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+      hl_convolution_backward_data(imageDesc_,
+                                   inputGrad,
+                                   outputDesc_,
+                                   outGrad,
+                                   filterDesc_,
+                                   wgtData,
+                                   convDesc_,
+                                   workSpace,
+                                   bwdDataLimitBytes_,
+                                   bwdDataAlgo_);
+    }
+  }
+
+  weight_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvProjection.h b/paddle/legacy/gserver/layers/ConvProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..890a17e2f8d2d05001f825f374e8ab6420f7b3ea
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvProjection.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ConvBaseProjection.h"
+#include "paddle/legacy/math/MathUtils.h"
+
+namespace paddle {
+
+/**
+ * @brief Convolution projection do the same calculation with CudnnConvLayer.
+ */
+class ConvProjection : public ConvBaseProjection {
+ public:
+  /**
+   * Constructor.
+   */
+  ConvProjection(const ProjectionConfig& config,
+                 ParameterPtr parameter,
+                 bool useGpu)
+      : ConvBaseProjection(config, parameter, useGpu) {}
+
+  ~ConvProjection() {}
+
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+  virtual size_t calOutputSize();
+  virtual size_t calInputSize();
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvShiftLayer.cpp b/paddle/legacy/gserver/layers/ConvShiftLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b7ecbe556c59b32cc5833617717b40c730392506
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvShiftLayer.cpp
@@ -0,0 +1,108 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for circular convluation of two vectors,
+ * which is used in NEURAL TURING MACHINE.
+ * - Input: two vectors, the first is data (batchSize x dataDim)
+ * the second is shift weights (batchSize x shiftDim)
+ * - Output: a vector (batchSize x dataDim)
+ * Assumed that:
+ * - a[in]: contains M elements.
+ * - b[in]: contains N elements (N should be odd).
+ * - c[out]: contains M elements.
+ *
+ * \f[
+ *     c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}
+ * \f]
+ *
+ * In this formula:
+ *  - a's index is computed modulo M.
+ *  - b's index is comupted modulo N.
+ *
+ * The config file api is conv_shift_layer.
+ */
+
+class ConvShiftLayer : public Layer {
+ public:
+  explicit ConvShiftLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ConvShiftLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(conv_shift, ConvShiftLayer);
+
+bool ConvShiftLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+
+  return true;
+}
+
+void ConvShiftLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t dataDim = inV0->getWidth();
+
+  CHECK_EQ(batchSize, inV1->getHeight());
+  CHECK_EQ(dataDim, getSize());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    resetOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str());
+  outV->circularConv(*inV0, *inV1);
+}
+
+void ConvShiftLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+
+  REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str());
+
+  if (inG0 && inG1) {
+    outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1);
+  } else {
+    CHECK(!inG0 || !inG1) << "Not supported";
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.cpp b/paddle/legacy/gserver/layers/ConvTransOperator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f4ce2affb144152ed41a9d4be9fa87f800c83dbb
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvTransOperator.cpp
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvTransOperator.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvTransOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input indepedently.
+ *
+ * The config file api is conv_operator.
+ */
+
+REGISTER_OPERATOR(convt, ConvTransOperator);
+
+void ConvTransOperator::reshape(int batchSize) {
+  outputH_ = ins_[0]->getFrameHeight();
+  outputW_ = ins_[0]->getFrameWidth();
+  if (outputH_ == 0) outputH_ = outputY_;
+  if (outputW_ == 0) outputW_ = outputX_;
+  imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
+  imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_);
+  /// Check that the imageSizes are consistent with config
+  CHECK_EQ(imageH_, imgSizeY_);
+  CHECK_EQ(imageW_, imgSize_);
+  out_->setFrameHeight(imageH_);
+  out_->setFrameWidth(imageW_);
+
+  reshapeImageDescriptors();
+
+  inputOffset_ = numFilters_ * outputH_ * outputW_;
+  outputOffset_ = channels_ * imageH_ * imageW_;
+  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
+
+  if (!isSelectAlgo_) {
+    allocConvWorkSpace();
+  }
+
+  isSelectAlgo_ = true;
+}
+
+void ConvTransOperator::forward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  reshape(batchSize);
+  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
+  checkFilterSize(ins_[1]->value);
+  Matrix::resizeOrCreate(
+      out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_);
+  {
+    AsyncGpuBlock block;
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
+      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+      real *outData = out_->value->getData() + outputOffset_ * batchId;
+      hl_convolution_backward_data(imageDesc_,
+                                   outData,
+                                   outputDesc_,
+                                   inputData,
+                                   filterDesc_,
+                                   wgtData,
+                                   convDesc_,
+                                   workSpace_,
+                                   workSpaceInBytes_,
+                                   bwdDataAlgo_);
+    }
+  }
+}
+
+void ConvTransOperator::backward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  {
+    AsyncGpuBlock block;
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
+      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
+      if (ins_[1]->grad) {
+        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
+        hl_convolution_backward_filter(imageDesc_,
+                                       outGrad,
+                                       outputDesc_,
+                                       inputData,
+                                       filterDesc_,
+                                       weightGrad,
+                                       convDesc_,
+                                       workSpace_,
+                                       workSpaceInBytes_,
+                                       bwdFilterAlgo_);
+      }
+
+      MatrixPtr preGrad = ins_[0]->grad;
+      if (NULL != preGrad) {
+        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
+        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+        hl_convolution_forward(imageDesc_,
+                               outGrad,
+                               outputDesc_,
+                               inputGrad,
+                               filterDesc_,
+                               wgtData,
+                               convDesc_,
+                               workSpace_,
+                               workSpaceInBytes_,
+                               fwdAlgo_);
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransOperator.h b/paddle/legacy/gserver/layers/ConvTransOperator.h
new file mode 100644
index 0000000000000000000000000000000000000000..206335a01ff7509eaa5528002c6c9686f05c931b
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvTransOperator.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "ConvBaseOperator.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvTransOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input indepedently.
+ *
+ * The config file api is conv_operator.
+ */
+
+class ConvTransOperator : public ConvBaseOperator {
+ public:
+  ConvTransOperator(const OperatorConfig &config, bool useGpu)
+      : ConvBaseOperator(config, useGpu) {}
+  /**
+   * Free workspace in device and destroy cudnn tensor descriptor.
+   */
+  virtual ~ConvTransOperator() {}
+  void forward() override;
+  void backward() override;
+  void reshape(int batchSize) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransProjection.cpp b/paddle/legacy/gserver/layers/ConvTransProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..00e34c8f2dcd2ea9698779f8b4425561f979cfef
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvTransProjection.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvTransProjection.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_PROJECTION(convt, ConvTransProjection);
+size_t ConvTransProjection::calOutputSize() {
+  outputH_ = in_->getFrameHeight();
+  outputW_ = in_->getFrameWidth();
+  if (outputH_ == 0) outputH_ = configOutH_;
+  if (outputW_ == 0) outputW_ = configOutW_;
+  imageH_ = imageSize(outputH_,
+                      (filterH_ - 1) * dilationH_ + 1,
+                      paddingH_,
+                      strideH_,
+                      /* caffeMode */ true);
+
+  imageW_ = imageSize(outputW_,
+                      (filterW_ - 1) * dilationW_ + 1,
+                      paddingW_,
+                      strideW_,
+                      /* caffeMode */ true);
+
+  const_cast<Argument *>(out_)->setFrameHeight(imageH_);
+  const_cast<Argument *>(out_)->setFrameWidth(imageW_);
+
+  inputOffset_ = (configChannels_ / groups_) * outputH_ * outputW_;
+  outputOffset_ = (configNumFilters_ / groups_) * imageH_ * imageW_;
+  return imageH_ * imageW_ * configNumFilters_;
+}
+
+size_t ConvTransProjection::calInputSize() {
+  return static_cast<size_t>(configChannels_ * outputH_ * outputW_);
+}
+
+void ConvTransProjection::forward() {
+  int batchSize = in_->value->getHeight();
+  reshape(batchSize);
+
+  void *workSpace = NULL;
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+
+  for (int g = 0; g < groups_; ++g) {
+    REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str());
+
+    real *inData = in_->value->getData() + g * inputOffset_;
+    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+    real *outData = out_->value->getData() + g * outputOffset_;
+    hl_convolution_backward_data(imageDesc_,
+                                 outData,
+                                 outputDesc_,
+                                 inData,
+                                 filterDesc_,
+                                 wgtData,
+                                 convDesc_,
+                                 workSpace,
+                                 bwdDataLimitBytes_,
+                                 bwdDataAlgo_);
+  }
+}
+
+void ConvTransProjection::backward(const UpdateCallback &callback) {
+  REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str());
+
+  void *workSpace = NULL;
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+
+  for (int g = 0; g < groups_; ++g) {
+    real *outGrad = out_->grad->getData() + g * outputOffset_;
+    if (weight_->getWGrad()) {
+      real *inData = in_->value->getData() + g * inputOffset_;
+      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
+      hl_convolution_backward_filter(imageDesc_,
+                                     outGrad,
+                                     outputDesc_,
+                                     inData,
+                                     filterDesc_,
+                                     weightGrad,
+                                     convDesc_,
+                                     workSpace,
+                                     bwdFilterLimitBytes_,
+                                     bwdFilterAlgo_);
+    }
+
+    MatrixPtr preGrad = in_->grad;
+    if (NULL != preGrad) {
+      real *inGrad = preGrad->getData() + g * inputOffset_;
+      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+      hl_convolution_forward(imageDesc_,
+                             outGrad,
+                             outputDesc_,
+                             inGrad,
+                             filterDesc_,
+                             wgtData,
+                             convDesc_,
+                             workSpace,
+                             fwdLimitBytes_,
+                             fwdAlgo_);
+    }
+  }
+
+  weight_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvTransProjection.h b/paddle/legacy/gserver/layers/ConvTransProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b63dd47352b9f24810d9406b314fbfa15ae13c3
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvTransProjection.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ConvBaseProjection.h"
+#include "paddle/legacy/math/MathUtils.h"
+
+namespace paddle {
+
+/**
+ * @brief Convolution projection do the same calculation with CudnnConvLayer.
+ */
+class ConvTransProjection : public ConvBaseProjection {
+ public:
+  /**
+   * Constructor.
+   */
+  ConvTransProjection(const ProjectionConfig& config,
+                      ParameterPtr parameter,
+                      bool useGpu)
+      : ConvBaseProjection(config, parameter, useGpu) {}
+
+  ~ConvTransProjection() {}
+
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+  virtual size_t calOutputSize();
+  virtual size_t calInputSize();
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp b/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c38ab251f18728425d01479b82630550d29e9b61
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ConvexCombinationLayer.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for weighted sum of vectors,
+ * which is used in NEURAL MACHINE TRANSLATION BY JOINTLY LEARNING TO ALIGN AND
+ * TRANSLATE
+ * - Input: the the size of the first input is weightDim,
+ *          and the size of the second input is weightdim * dataDim.
+ * - Output: the sizeof the output is dataDim
+ * \f[
+ *   out(j) = \sum_{i}(in0(i) * in1(i,j + i * dataDim)),
+ *               i = 0,1,...,(weightDim-1); j = 0, 1,...,(dataDim-1)
+ * \f]
+ * Note that the above computation is for one sample. Multiple samples are
+ * processed in one batch.
+ *
+ * The config file api is linear_comb_layer.
+ */
+class ConvexCombinationLayer : public Layer {
+ protected:
+  /// A matrix pointer pointing to second input.
+  MatrixPtr tmpMtx0;
+  /// A matrix pointer pointing to first input.
+  MatrixPtr tmpRow0;
+  /// A matrix pointer pointing to output.
+  MatrixPtr tmpRow1;
+
+ public:
+  explicit ConvexCombinationLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ConvexCombinationLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(convex_comb, ConvexCombinationLayer);
+
+bool ConvexCombinationLayer::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(2U, inputLayers_.size());
+  size_t dataDim = getSize();
+  size_t weightDim = inputLayers_[0]->getSize();
+
+  CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize())
+      << "Dimension mismatch";
+
+  tmpRow0 = Matrix::create(nullptr,
+                           /* height= */ 1,
+                           weightDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpRow1 = Matrix::create(nullptr,
+                           /* height= */ 1,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpMtx0 = Matrix::create(nullptr,
+                           /* height= */ weightDim,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+
+  return true;
+}
+
+void ConvexCombinationLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t weightDim = inV0->getWidth();
+  size_t dataDim = getSize();
+
+  CHECK_EQ(batchSize, inV1->getHeight());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  REGISTER_TIMER_INFO("FwCvxCombTimer", getName().c_str());
+  for (size_t i = 0; i < batchSize; i++) {
+    tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
+    tmpRow0->setData(inV0->getData() + i * weightDim);
+    tmpRow1->setData(outV->getData() + i * dataDim);
+
+    tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 0);
+  }
+}
+
+void ConvexCombinationLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t weightDim = inV0->getWidth();
+  size_t dataDim = getSize();
+
+  REGISTER_TIMER_INFO("BwCvxCombTimer", getName().c_str());
+
+  if (inG0) {
+    for (size_t i = 0; i < batchSize; i++) {
+      tmpRow0->setData(inG0->getData() + i * weightDim);
+      tmpRow1->setData(outG->getData() + i * dataDim);
+      tmpMtx0->setData(inV1->getData() + i * weightDim * dataDim);
+
+      tmpRow0->mul(*tmpRow1, *(tmpMtx0->getTranspose()), 1, 1);
+    }
+  }
+
+  if (inG1) {
+    for (size_t i = 0; i < batchSize; i++) {
+      tmpRow0->setData(inV0->getData() + i * weightDim);
+      tmpRow1->setData(outG->getData() + i * dataDim);
+      tmpMtx0->setData(inG1->getData() + i * weightDim * dataDim);
+
+      tmpMtx0->mul(*(tmpRow0->getTranspose()), *tmpRow1, 1, 1);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimLayer.cpp b/paddle/legacy/gserver/layers/CosSimLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab8d7cc1f61823890676e8f647f784cfa9a0775e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CosSimLayer.cpp
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CosSimLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(cos, CosSimLayer);
+
+bool CosSimLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2LU);
+
+  createFunction(forward_,
+                 "CosSimForward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+  createFunction(backward_,
+                 "CosSimBackward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+
+  return true;
+}
+
+void CosSimLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(0)->getHeight();
+  int size = getSize();
+  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
+
+  {
+    REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str());
+    reserveOutput(batchSize, size);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  /* activation */ {
+    REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str());
+    MatrixPtr prevOut1 = getInputValue(0);
+    MatrixPtr prevOut2 = getInputValue(1);
+
+    CHECK(outV && prevOut1 && prevOut2);
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*prevOut1);
+    inputs.addArg(*prevOut2);
+    outputs.addArg(*outV, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
+  }
+}
+
+void CosSimLayer::backward(const UpdateCallback& callback) {
+  /* activation */ {
+    REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str());
+    CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed";
+
+    const auto outG = this->getOutputGrad();
+    const auto outV = this->getOutputValue();
+    const auto inV1 = this->getInputValue(0);
+    const auto inV2 = this->getInputValue(1);
+    auto inG1 = this->getInputGrad(0);
+    auto inG2 = this->getInputGrad(1);
+    CHECK(outG && outV && inV1 && inV2 && inG1 && inG2);
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*outG);
+    inputs.addArg(*outV);
+    inputs.addArg(*inV1);
+    inputs.addArg(*inV2);
+    outputs.addArg(*inG1, ADD_TO);
+    outputs.addArg(*inG2, ADD_TO);
+
+    backward_[0]->calc(inputs, outputs);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimLayer.h b/paddle/legacy/gserver/layers/CosSimLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b08e2c6a35369832732706d64f209f85a5292a6f
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CosSimLayer.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief A layer for calculating cosine similarity between two vector
+ * \f[
+ * f(x,y)=scale\frac{x_1y_1+x_2y_2+...+x_ny_n}{\sqrt{x_1^2+x_2^2+...
+ * +x_n^2}\sqrt{y_1^2+y_2^2+...+y_n^2}}
+ * \f]
+ *
+ * - Input1: A vector (batchSize * dataDim) *
+ * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) *
+ * - Output: A vector (batchSize * 1)
+ *
+ * The config file api is cos_sim.
+ */
+class CosSimLayer : public Layer {
+ public:
+  explicit CosSimLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~CosSimLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp b/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..03de0be815a1fb5eeb7ffab31b1721dc5951a469
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CosSimVecMatLayer.cpp
@@ -0,0 +1,182 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+/**
+ * @brief A layer for computing cosine similarity between a vector
+ * and each row of a matrix
+ * out[i] = cos_scale * cos(in1, in2(i,:));
+ * @note used in NEURAL TURING MACHINE
+ *
+ * Input1: a vector (batchSize * dataDim)
+ *
+ * Input2: a matrix in vector form (batchSize * (weightDim*dataDim))
+ *
+ * Output: a vector (batchSize * weightDim)
+ */
+
+class CosSimVecMatLayer : public Layer {
+ protected:
+  MatrixPtr tmpMtx0;
+  MatrixPtr tmpMtx1;
+  MatrixPtr tmpRow0;
+  MatrixPtr tmpRow1;
+  MatrixPtr tmpRow2;
+  MatrixPtr tmpRow3;
+
+ public:
+  explicit CosSimVecMatLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~CosSimVecMatLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(cos_vm, CosSimVecMatLayer);
+
+bool CosSimVecMatLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+
+  size_t dataDim = inputLayers_[0]->getSize();
+  size_t numKeys = getSize();
+  size_t memoryDim = inputLayers_[1]->getSize();
+
+  CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch";
+
+  tmpRow0 = Matrix::create(nullptr,
+                           /* height= */ 1,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpRow1 = Matrix::create(nullptr,
+                           /* height= */ 1,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpRow2 = Matrix::create(nullptr,
+                           /* height= */ numKeys,
+                           1,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpRow3 = Matrix::create(nullptr,
+                           /* height= */ numKeys,
+                           1,
+                           /* trans= */ false,
+                           useGpu_);
+
+  tmpMtx0 = Matrix::create(nullptr,
+                           /* height= */ numKeys,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+  tmpMtx1 = Matrix::create(nullptr,
+                           /* height= */ numKeys,
+                           dataDim,
+                           /* trans= */ false,
+                           useGpu_);
+
+  CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1);
+
+  createFunction(forward_,
+                 "CosSimForward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+  createFunction(backward_,
+                 "CosSimBackward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+
+  return true;
+}
+
+void CosSimVecMatLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t numKeys = getSize();
+
+  CHECK_EQ(batchSize, inV1->getHeight());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, numKeys);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  CHECK(outV && inV0 && inV1);
+  REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str());
+  for (size_t i = 0; i < batchSize; i++) {
+    tmpRow0->setData(inV0->rowBuf(i));
+    tmpMtx0->setData(inV1->rowBuf(i));
+    tmpRow2->setData(outV->rowBuf(i));
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpRow2, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
+  }
+}
+
+void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
+  CHECK_EQ(backward_.size(), 1UL) << "Only one forward function needed";
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+
+  size_t batchSize = inV0->getHeight();
+  CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG);
+  REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str());
+
+  for (size_t i = 0; i < batchSize; i++) {
+    tmpRow0->setData(inV0->rowBuf(i));
+    tmpRow1->setData(inG0->rowBuf(i));
+    tmpMtx0->setData(inV1->rowBuf(i));
+    tmpMtx1->setData(inG1->rowBuf(i));
+    tmpRow2->setData(outV->rowBuf(i));
+    tmpRow3->setData(outG->rowBuf(i));
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpRow3);
+    inputs.addArg(*tmpRow2);
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpMtx1, ADD_TO);
+    outputs.addArg(*tmpRow1, ADD_TO);
+
+    backward_[0]->calc(inputs, outputs);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CostLayer.cpp b/paddle/legacy/gserver/layers/CostLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..18b5b77bde9dee97cb6971624007307ff06411c7
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CostLayer.cpp
@@ -0,0 +1,748 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CostLayer.h"
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include "paddle/legacy/utils/Logging.h"
+
+#include "paddle/legacy/math/SparseMatrix.h"
+
+namespace paddle {
+
+bool CostLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  bool ret = Layer::init(layerMap, parameterMap);
+  coeff_ = config_.coeff();
+  if (!ret) return ret;
+  CHECK_GE(inputLayers_.size(), 2UL);
+  CHECK_LE(inputLayers_.size(), 3UL);
+  if (inputLayers_.size() == 3) {
+    weightLayer_ = inputLayers_[2];
+  }
+  return true;
+}
+
+void CostLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(*getOutputLayer())->getHeight();
+  int size = 1;
+  resetOutput(batchSize, size);
+
+  const MatrixPtr& output = getInputValue(*getOutputLayer());
+  Argument label = getInput(*getLabelLayer());
+
+  /* get the cost value for each sample*/
+  forwardImp(*output, label, *getOutputValue());
+  if (weightLayer_) {
+    const MatrixPtr& weight = getInputValue(*weightLayer_);
+    getOutputValue()->dotMul(*getOutputValue(), *weight);
+  }
+}
+
+void CostLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  const Argument& output = getInput(*getOutputLayer());
+  Argument label = getInput(*getLabelLayer());
+
+  bool support = true;
+  if (weightLayer_) {
+    support = output.grad->getAbsSum() == 0;
+  }
+
+  backwardImp(*output.value, label, *output.grad);
+
+  if (weightLayer_) {
+    CHECK(support) << "Weighted cost layer '" << getName()
+                   << "' must be the last layer "
+                      "connected to the output layer '"
+                   << getOutputLayer()->getName() << "'";
+    output.grad->rowScale(0, *output.grad, *getInputValue(*weightLayer_));
+  }
+  if (coeff_ != real(1.0f)) {
+    output.grad->add(coeff_, 0);
+  }
+}
+
+//
+// class MultiClassCrossEntropy
+//
+bool MultiClassCrossEntropy::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void MultiClassCrossEntropy::forwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& target) {
+  target.oneHotCrossEntropy(output, *label.ids);
+}
+
+void MultiClassCrossEntropy::backwardImp(Matrix& output,
+                                         Argument& label,
+                                         Matrix& outputG) {
+  outputG.oneHotCrossEntropyBp(output, *label.ids);
+}
+
+//
+// class MultiClassCrossEntropyWithSelfNorm
+//
+REGISTER_LAYER(multi_class_cross_entropy_with_selfnorm,
+               MultiClassCrossEntropyWithSelfNorm);
+
+bool MultiClassCrossEntropyWithSelfNorm::init(
+    const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
+                                                    Argument& label,
+                                                    Matrix& target) {
+  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
+  output.rowSum(*sftMaxSum_);
+  sftMaxSum_->log2();
+
+  target.oneHotCrossEntropy(output, *label.ids);
+  target.add(*sftMaxSum_);
+
+  sftMaxSum_->square2();
+  target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
+}
+
+void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
+                                                     Argument& label,
+                                                     Matrix& outputG) {
+  Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
+  output.rowSum(*sftMaxSum_);
+
+  Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
+  sftMaxSum_->reciprocal2(*sumInv_);
+
+  outputG.oneHotCrossEntropyBp(output, *label.ids);
+  outputG.addColumnVector(*sumInv_);
+
+  sftMaxSum_->log2();
+  sumInv_->dotMul(*sumInv_, *sftMaxSum_);
+  sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());
+
+  outputG.addColumnVector(*sumInv_);
+}
+
+//
+// class SoftBinaryClassCrossEntropy
+//
+REGISTER_LAYER(soft_binary_class_cross_entropy, SoftBinaryClassCrossEntropy);
+
+bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap,
+                                       const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output,
+                                             Argument& label,
+                                             Matrix& target) {
+  Matrix::resizeOrCreate(
+      targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
+
+  targetPerDim_->softCrossEntropy(output, *label.value);
+  targetPerDim_->rowSum(target);
+}
+
+void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output,
+                                              Argument& label,
+                                              Matrix& outputG) {
+  outputG.softCrossEntropyBp(output, *label.value);
+}
+
+//
+// class SumOfSquaresCostLayer
+//
+
+REGISTER_LAYER(square_error, SumOfSquaresCostLayer);
+
+bool SumOfSquaresCostLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void SumOfSquaresCostLayer::forwardImp(Matrix& output,
+                                       Argument& label,
+                                       Matrix& target) {
+  target.sumOfSquares(output, *label.value);
+}
+
+void SumOfSquaresCostLayer::backwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& outputG) {
+  outputG.sumOfSquaresBp(output, *label.value);
+}
+
+//
+// class SmoothL1CostLayer
+//
+
+REGISTER_LAYER(smooth_l1, SmoothL1CostLayer);
+
+bool SmoothL1CostLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void SmoothL1CostLayer::forwardImp(Matrix& output,
+                                   Argument& label,
+                                   Matrix& target) {
+  MatrixPtr targetCpu, outputCpu, labelCpu;
+  if (useGpu_) {
+    targetCpu =
+        Matrix::create(target.getHeight(), target.getWidth(), false, false);
+    outputCpu =
+        Matrix::create(output.getHeight(), output.getWidth(), false, false);
+    labelCpu = Matrix::create(
+        label.value->getHeight(), label.value->getWidth(), false, false);
+    targetCpu->copyFrom(target);
+    outputCpu->copyFrom(output);
+    labelCpu->copyFrom(*label.value);
+    targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0);
+    target.copyFrom(*targetCpu);
+  } else {
+    target.smoothL1(output, *label.value, 1.0);
+  }
+}
+
+void SmoothL1CostLayer::backwardImp(Matrix& output,
+                                    Argument& label,
+                                    Matrix& outputG) {
+  MatrixPtr outputGCpu, outputCpu, labelCpu;
+  if (useGpu_) {
+    outputGCpu =
+        Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false);
+    outputCpu =
+        Matrix::create(output.getHeight(), output.getWidth(), false, false);
+    labelCpu = Matrix::create(
+        label.value->getHeight(), label.value->getWidth(), false, false);
+    outputGCpu->copyFrom(outputG);
+    outputCpu->copyFrom(output);
+    labelCpu->copyFrom(*label.value);
+    outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0);
+    outputG.copyFrom(*outputGCpu);
+  } else {
+    outputG.smoothL1Bp(output, *label.value, 1.0);
+  }
+}
+
+//
+// class RankingCost
+//
+bool RankingCost::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  posPairCount_ = 0;
+  negPairCount_ = 0;
+
+  bool ret = Layer::init(layerMap, parameterMap);
+  if (!ret) return ret;
+  CHECK_GE(inputLayers_.size(), 3UL);
+  CHECK_LE(inputLayers_.size(), 4UL);
+  if (inputLayers_.size() == 4) {
+    weightLayer_ = inputLayers_[3];
+  }
+  return true;
+}
+
+void RankingCost::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(*getOutputLayer(0))->getHeight();
+  int size = 1;
+  resizeOutput(batchSize, size);
+  Matrix::resizeOrCreate(margin_, batchSize, size, /* trans= */ false, useGpu_);
+  MatrixPtr label = getInputValue(*getLabelLayer());
+  if (!label) {
+    // input label is not in value, try ids
+    IVectorPtr idLabel = getInput(*getLabelLayer()).ids;
+    CHECK(idLabel) << "label layer has neither value nor ids";
+    CHECK_EQ((size_t)batchSize, idLabel->getSize());
+    Matrix::resizeOrCreate(
+        labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_);
+    labelBuf_->copyFrom(*idLabel);
+    label = labelBuf_;
+  }
+
+  MatrixPtr output[] = {getInputValue(*getOutputLayer(0)),
+                        getInputValue(*getOutputLayer(1))};
+  MatrixPtr target = this->getOutputValue();
+  margin_->sub(*output[0], *output[1]);
+
+  // for validation
+  size_t height = output[0]->getHeight();
+  target->biggerThan(*(output[0]), *(output[1]), *label);
+  double total = static_cast<double>(height);
+  if (weightLayer_) {
+    const MatrixPtr& weight = getInputValue(*weightLayer_);
+    target->dotMul(*target, *weight);
+    total = weight->getSum();
+  }
+  double pos = target->getSum();
+  posPairCount_ += pos;
+  negPairCount_ += (total - pos);
+
+  // forward
+  target->logisticRegressionLoss(*margin_, *label);
+  if (weightLayer_) {
+    const MatrixPtr& weight = getInputValue(*weightLayer_);
+    target->dotMul(*target, *weight);
+  }
+}
+
+void RankingCost::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  MatrixPtr label = getInputValue(*getLabelLayer());
+  if (!label) {
+    // input label is not in value, but in ids
+    // use labelBuf_ (should already resized and copied during forward)
+    label = labelBuf_;
+  }
+
+  Matrix::resizeOrCreate(
+      marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_);
+  marginGrad_->zeroMem();
+  marginGrad_->logisticRegressionLossBp(*margin_, *label);
+  if (weightLayer_) {
+    const MatrixPtr& weight = getInputValue(*weightLayer_);
+    marginGrad_->dotMul(*marginGrad_, *weight);
+  }
+
+  getInputGrad(0)->add(*marginGrad_);
+  getInputGrad(1)->sub(*marginGrad_);
+}
+
+void RankingCost::onPassEnd() {
+  double ratio = posPairCount_ / ((negPairCount_ <= 0) ? 1.0 : negPairCount_);
+  LOG(INFO) << "calc pos/neg: " << ratio << " pos= " << posPairCount_
+            << " neg= " << negPairCount_;
+
+  posPairCount_ = 0;
+  negPairCount_ = 0;
+}
+
+//
+// class LambdaCost
+//
+REGISTER_LAYER(lambda_cost, LambdaCost);
+
+bool LambdaCost::init(const LayerMap& layerMap,
+                      const ParameterMap& parameterMap) {
+  truncationSize_ = config_.ndcg_num();
+  maxSortSize_ = config_.max_sort_size();
+  if (maxSortSize_ != -1) {
+    CHECK_GE(maxSortSize_, truncationSize_)
+        << "maxSortSize must be greater than or equal to NDCG size!";
+  }
+  LOG(INFO) << "LambdaRank v1.3, NDCG size = " << truncationSize_
+            << ", Max partial sort size = " << maxSortSize_;
+  CHECK(!useGpu_) << "LambdaRank supports CPU only!";
+  return Layer::init(layerMap, parameterMap);
+}
+
+void LambdaCost::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(*getOutputLayer())->getHeight();
+  resizeOutput(batchSize, 1);
+
+  MatrixPtr score = getInputValue(*getScoreLayer());
+  MatrixPtr output = getInputValue(*getOutputLayer());
+  MatrixPtr target = this->getOutputValue();
+
+  real* scoreData = score->getData();
+  real* outputData = output->getData();
+  real* targetData = target->getData();
+
+  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
+  const int* startPosData = startPos->getData(false);
+  size_t batchNum = startPos->getSize() - 1;
+  for (size_t i = 0; i < batchNum; ++i) {
+    int beginPos = startPosData[i];
+    int endPos = startPosData[i + 1];
+    real NDCG = calcNDCG(
+        outputData + beginPos, scoreData + beginPos, endPos - beginPos);
+    for (int j = beginPos; j < endPos; ++j) {
+      targetData[j] = NDCG;
+    }
+  }
+}
+
+void LambdaCost::backward(const UpdateCallback& callback) {
+  (void)callback;
+  MatrixPtr score = getInputValue(*getScoreLayer());
+  MatrixPtr output = getInputValue(*getOutputLayer());
+  Matrix::resizeOrCreate(marginGrad_,
+                         score->getHeight(),
+                         1,
+                         /* trans= */ false,
+                         useGpu_);
+  marginGrad_->zeroMem();
+
+  real* gradData = marginGrad_->getData();
+  real* scoreData = score->getData();
+  real* outputData = output->getData();
+
+  auto startPos = getInput(*getOutputLayer()).sequenceStartPositions;
+  const int* startPosData = startPos->getData(false);
+  size_t batchNum = startPos->getSize() - 1;
+
+  for (size_t i = 0; i < batchNum; ++i) {
+    int beginPos = startPosData[i];
+    int endPos = startPosData[i + 1];
+    calcGrad(outputData + beginPos,
+             scoreData + beginPos,
+             gradData + beginPos,
+             endPos - beginPos);
+  }
+
+  getInputGrad(0)->add(*marginGrad_);
+}
+
+void LambdaCost::calcGrad(const real* outputScore,
+                          const real* score,
+                          real* gradData,
+                          int size) {
+  CHECK_GE(size, truncationSize_)
+      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
+  int sortSize = maxSortSize_ == -1 ? size : std::min(maxSortSize_, size);
+
+  scorePair_.clear();
+  for (int i = 0; i < size; ++i) {
+    scorePair_.push_back(std::make_pair(score[i], i));
+  }
+  if (size <= sortSize) {
+    std::sort(scorePair_.begin(),
+              scorePair_.end(),
+              [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
+                return a.first > b.first;
+              });
+  } else {
+    std::partial_sort(
+        scorePair_.begin(),
+        scorePair_.begin() + sortSize,
+        scorePair_.end(),
+        [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
+          return a.first > b.first;
+        });
+  }
+
+  real maxDCG = 0;
+  for (int i = 0; i < truncationSize_; ++i) {
+    maxDCG += (std::pow(2, scorePair_[i].first) - 1) / std::log(i + 2);
+  }
+  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
+
+  for (int i = 0; i < sortSize; ++i) {
+    for (int j = i + 1; j < size; ++j) {
+      int index_i = scorePair_[i].second;
+      int index_j = scorePair_[j].second;
+      real score_i = score[index_i];
+      real score_j = score[index_j];
+      real dcgDif = 0;
+      if (j < sortSize) {
+        dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) *
+                 (1 / std::log(i + 2) - 1 / std::log(j + 2));
+      } else {
+        dcgDif =
+            (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2);
+      }
+
+      real lambda_ij =
+          -std::abs(dcgDif) /
+          (1 + std::exp(outputScore[index_i] - outputScore[index_j]));
+      gradData[index_i] += lambda_ij / maxDCG;
+      gradData[index_j] -= lambda_ij / maxDCG;
+    }
+  }
+}
+
+real LambdaCost::calcNDCG(const real* outputScore,
+                          const real* score,
+                          int size) {
+  CHECK_GE(size, truncationSize_)
+      << "Invalid: (Sample num in the same list) < (NDCG truncation num) !";
+
+  outputScorePair_.clear();
+  for (int i = 0; i < size; ++i) {
+    outputScorePair_.push_back(std::make_pair(outputScore[i], i));
+  }
+  std::partial_sort(
+      outputScorePair_.begin(),
+      outputScorePair_.begin() + truncationSize_,
+      outputScorePair_.end(),
+      [](const std::pair<real, int>& a, const std::pair<real, int>& b) {
+        return a.first > b.first;
+      });
+
+  real DCG = 0;
+  for (int i = 0; i < truncationSize_; ++i) {
+    DCG +=
+        (std::pow(2, score[outputScorePair_[i].second]) - 1) / std::log(i + 2);
+  }
+
+  scoreVec_.resize(size);
+  std::copy(score, score + size, scoreVec_.begin());
+  real maxDCG = 0;
+  std::partial_sort(scoreVec_.begin(),
+                    scoreVec_.begin() + truncationSize_,
+                    scoreVec_.end(),
+                    std::greater<real>());
+  for (int i = 0; i < truncationSize_; ++i) {
+    maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2);
+  }
+  CHECK_GT(maxDCG, 0) << "Invalid: max DCG = 0!";
+
+  return DCG / maxDCG;
+}
+
+//
+// class MultiBinaryLabelCrossEntropy
+//
+
+REGISTER_LAYER(multi_binary_label_cross_entropy, MultiBinaryLabelCrossEntropy);
+
+bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap,
+                                        const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output,
+                                              Argument& label,
+                                              Matrix& target) {
+  MatrixPtr value = nullptr;
+  if (label.ids) {
+    CHECK(!label.value);
+    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
+  } else {
+    CHECK(label.value);
+    value = label.value;
+  }
+
+  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
+      dynamic_cast<GpuSparseMatrix*>(value.get())) {
+    target.multiBinaryLabelCrossEntropy(output, *value);
+  } else {
+    Matrix::resizeOrCreate(
+        targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_);
+
+    targetPerDim_->binaryLabelCrossEntropy(output, *value);
+    targetPerDim_->rowSum(target);
+  }
+}
+
+void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
+                                               Argument& label,
+                                               Matrix& outputG) {
+  MatrixPtr value = nullptr;
+  if (label.ids) {
+    CHECK(!value);
+    value = label.ids->toOneHotSparseMatrix(output.getWidth(), useGpu_);
+  } else {
+    CHECK(label.value);
+    value = label.value;
+  }
+
+  if (dynamic_cast<CpuSparseMatrix*>(value.get()) ||
+      dynamic_cast<GpuSparseMatrix*>(value.get())) {
+    outputG.multiBinaryLabelCrossEntropyBp(output, *value);
+  } else {
+    outputG.binaryLabelCrossEntropyBp(output, *value);
+  }
+}
+
+bool HuberCost::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  CostLayer::init(layerMap, parameterMap);
+  if (useGpu_) {
+    tmpCpuInput_.reserve(inputLayers_.size());
+    for (size_t i = 0; i < inputLayers_.size(); i++) {
+      tmpCpuInput_.push_back(Argument());
+    }
+  }
+  return true;
+}
+
+void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
+  if (useGpu_) {
+    for (size_t i = 0; i < inputLayers_.size(); i++) {
+      tmpCpuInput_[i].resizeAndCopyFrom(
+          getInput(i), false, HPPL_STREAM_DEFAULT);
+    }
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  }
+}
+
+//
+// Huber loss for robust regression.
+//
+REGISTER_LAYER(huber_regression, HuberRegressionLoss);
+
+bool HuberRegressionLoss::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  HuberCost::init(layerMap, parameterMap);
+  delta_ = config_.delta();
+  return true;
+}
+
+void HuberRegressionLoss::forwardImp(Matrix& output,
+                                     Argument& label,
+                                     Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  size_t dim = output.getWidth();
+  CHECK(label.value);
+  CHECK_EQ((*label.value).getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(dim, (*label.value).getWidth());
+  CHECK_EQ(target.getWidth(), (size_t)1);
+
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  std::vector<real> cost(numSamples, 0);
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = std::abs(lbl[index] - out[index]);
+      if (a <= delta_)
+        cost[i] += a * a / 2;
+      else
+        cost[i] += delta_ * (a - delta_ / 2);
+    }
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+
+void HuberRegressionLoss::backwardImp(Matrix& output,
+                                      Argument& label,
+                                      Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  size_t dim = output.getWidth();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = lbl[index] - out[index];
+      if (std::abs(a) <= delta_)
+        grad[index] += -a;
+      else
+        grad[index] += a > 0 ? -delta_ : delta_;
+    }
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
+}
+
+//
+// Huber loss for robust 2-classes classification
+//
+REGISTER_LAYER(huber_classification, HuberTwoClassification);
+
+bool HuberTwoClassification::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  return HuberCost::init(layerMap, parameterMap);
+}
+
+void HuberTwoClassification::forwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  CHECK(label.ids);
+  CHECK_EQ((*label.ids).getSize(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(output.getWidth(), (size_t)1);
+  CHECK_EQ(target.getWidth(), (size_t)1);
+
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
+  std::vector<real> cost(numSamples, 0);
+  for (size_t i = 0; i < numSamples; ++i) {
+    int y = 2 * lbl[i] - 1;
+    real a = out[i] * y;
+    if (a < -1)
+      cost[i] = -4 * a;
+    else if (a < 1)
+      cost[i] = (1 - a) * (1 - a);
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+
+void HuberTwoClassification::backwardImp(Matrix& output,
+                                         Argument& label,
+                                         Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    int y = 2 * lbl[i] - 1;
+    real a = out[i] * y;
+    if (a < -1)
+      grad[i] += -4 * y;
+    else if (a < 1)
+      grad[i] += -2 * (1 - a) * y;
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples);
+}
+/**
+ * This cost layer compute the sum of its input as loss.
+ * \f[
+ * o(i) = \sum_{j=1}^D y_{ij}
+ * \f]
+ */
+class SumCostLayer : public Layer {
+ public:
+  explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    bool ret = Layer::init(layerMap, parameterMap);
+    if (!ret) return ret;
+    CHECK_EQ(inputLayers_.size(), 1UL);
+    return true;
+  }
+
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+    const MatrixPtr& input = getInputValue(0);
+
+    /* malloc memory for the output_ if necessary */
+    int batchSize = input->getHeight();
+    int size = 1;
+    resizeOutput(batchSize, size);
+    output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0);
+  }
+
+  void backward(const UpdateCallback& callback = nullptr) override {
+    getInputGrad(0)->add((real)1);
+  }
+};
+
+REGISTER_LAYER(sum_cost, SumCostLayer);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CostLayer.h b/paddle/legacy/gserver/layers/CostLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bfec0e2b169fac4f235fd13347be687c4f1a222
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CostLayer.h
@@ -0,0 +1,374 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * Base class for a particular type of cost layer.
+ * This type of cost should have one data layer, one label layer
+ * and an optional weight layer as input.
+ * The derived class should implemnt forwardImp() and backwardImp()
+ * which calculate the cost for data and label. The weight is automatically
+ * handled by the base class.
+ */
+class CostLayer : public Layer {
+ public:
+  explicit CostLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  LayerPtr getOutputLayer() { return inputLayers_[0]; }
+
+  LayerPtr getLabelLayer() { return inputLayers_[1]; }
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+  virtual void forwardImp(Matrix& outputValue,
+                          Argument& label,
+                          Matrix& cost) = 0;
+
+  virtual void backwardImp(Matrix& outputValue,
+                           Argument& label,
+                           Matrix& outputGrad) = 0;
+
+ protected:
+  LayerPtr weightLayer_;
+  real coeff_;
+};
+
+/**
+ * The cross-entropy loss for multi-class classification task.
+ * The loss function is:
+ *
+ * \f[
+ * L = - \sum_{i}{t_{k} * log(P(y=k))}
+ * \f]
+ */
+class MultiClassCrossEntropy : public CostLayer {
+ public:
+  explicit MultiClassCrossEntropy(const LayerConfig& config)
+      : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+};
+
+/**
+ * The cross-entropy with self-normalization for multi-class classification.
+ *
+ * The loss function is:
+ * \f[
+ * L = \sum_{i}[-log(P(x_{i})) + alpha * log(Z(x_{i})^2)]
+ * \f]
+ *
+ * The \f$Z(x)\f$ is the softmax normalizer.
+ *
+ * [1] Jacob Devlin, Rabih Zbib, Zhongqiang Huang, Thomas Lamar,
+ *     Richard Schwartz, and John Makhoul. Fast and robust neural
+ *     network joint models for statistical machine translation.
+ *     In Proceedings of the ACL 2014 Conference.
+ */
+class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
+ public:
+  explicit MultiClassCrossEntropyWithSelfNorm(const LayerConfig& config)
+      : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+
+ protected:
+  MatrixPtr sftMaxSum_;
+  MatrixPtr sumInv_;
+};
+
+/**
+ * The cross-entropy for soft binary class.
+ * \f[
+ * L = \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
+ * \f]
+ */
+class SoftBinaryClassCrossEntropy : public CostLayer {
+ public:
+  explicit SoftBinaryClassCrossEntropy(const LayerConfig& config)
+      : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+
+ protected:
+  MatrixPtr targetPerDim_;
+};
+
+/**
+ * This cost layer compute Euclidean (L2) loss for real-valued regression
+ * tasks.
+ * \f[
+ * L = \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
+ * \f]
+ */
+class SumOfSquaresCostLayer : public CostLayer {
+ public:
+  explicit SumOfSquaresCostLayer(const LayerConfig& config)
+      : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+};
+
+/**
+ * This cost layer compute smooth L1 loss for real-valued regression
+ * tasks.
+ * \f[
+ * L =
+ *   0.5 * x^2    if / -1 < |x| < 1 /
+ *   |x| - 0.5    / otherwise /
+ * \f]
+ *
+ * x = output - label
+ */
+class SmoothL1CostLayer : public CostLayer {
+ public:
+  explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+};
+
+/**
+ * A cost layer for learning to rank (LTR) task. This layer contains at leat
+ * three inputs.
+ * \f[
+ *  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
+ *  o_{i,j} =  o_i - o_j  \\
+ *  \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
+ * \f]
+ *
+ * [1]. Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
+ *      Rank useing Gradient Descent.
+ */
+class RankingCost : public Layer {
+ public:
+  explicit RankingCost(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  LayerPtr getOutputLayer(size_t i) { return inputLayers_[i]; }
+
+  LayerPtr getLabelLayer() { return inputLayers_[2]; }
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+  void onPassEnd() override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {
+    (void)output;
+    (void)label;
+    (void)cost;
+  }
+
+  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {
+    (void)outputValue;
+    (void)label;
+    (void)outputGrad;
+  }
+
+ private:
+  double posPairCount_;
+  double negPairCount_;
+  MatrixPtr margin_;
+  MatrixPtr marginGrad_;
+  /// if input label is put in ids (not value), copy to this buffer.
+  MatrixPtr labelBuf_;
+  LayerPtr weightLayer_;
+};
+
+/**
+ * LambdaRank os a method for learning arbitrary information retrieval
+ * measures. It can be applied to any algorithm that learns through gradient
+ * descent. LambdaRank is a listwise method, in that the cost depends on the
+ * sorted order of the documents. LambdaRank gives the gradient of cost
+ * function:
+ *
+ * \f[
+ * \lambda_{ij} = \frac{1}{1 + e^{o_i - o_j}} \left| \Delta_{NDCG} \right|
+ * \f]
+ *
+ * [1] Christopher J.C. Burges, Robert Ragno, Quoc Viet Le. Learning to Rank
+ *     with Nonsmooth Cost Functions.
+ */
+class LambdaCost : public Layer {
+ public:
+  explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  LayerPtr getOutputLayer() { return inputLayers_[0]; }
+
+  LayerPtr getScoreLayer() { return inputLayers_[1]; }
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+  real calcNDCG(const real* outputScore, const real* score, int size);
+  void calcGrad(const real* outputScore,
+                const real* score,
+                real* gradData,
+                int size);
+
+ private:
+  MatrixPtr marginGrad_;
+  int truncationSize_;
+  int maxSortSize_;
+  std::vector<std::pair<real, int>> scorePair_;
+  std::vector<std::pair<real, int>> outputScorePair_;
+  std::vector<real> scoreVec_;
+};
+
+/**
+ * Cross entropy for multi binary labels.
+ * \f[
+ * cost[i] = -sum(label[i][j]*log(output[i][j]) +
+ *            (1-label[i][j])*log(1-output[i][j]))
+ * \f]
+ */
+class MultiBinaryLabelCrossEntropy : public CostLayer {
+ protected:
+  MatrixPtr targetPerDim_;
+
+ public:
+  explicit MultiBinaryLabelCrossEntropy(const LayerConfig& config)
+      : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+};
+
+/*
+ * A base layer for HuberRegressionLoss and HuberTwoClassification.
+ */
+class HuberCost : public CostLayer {
+ public:
+  std::vector<Argument> tmpCpuInput_;
+
+  explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override {}
+};
+
+/**
+ * Huber loss for robust regression.
+ *
+ * Given output f(x), label y and delta, the loss is:
+ * Loss = 0.5 * (1 - y * f)^2, if abs(y - f) <= delta \\
+ * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
+ */
+class HuberRegressionLoss : public HuberCost {
+ public:
+  explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+
+ protected:
+  real delta_;
+};
+
+/**
+ * Huber loss for robust 2-classes classification.
+ *
+ * For label={0, 1}, let y=2*label-1. Given output f(x), the loss is:
+ * Loss = 4 * y * f, if y* f < -1 \\
+ * Loss = (1 - y * f)^2, if -1 < y * f < 1  \\
+ * Loss = 0, otherwise
+ */
+class HuberTwoClassification : public HuberCost {
+ public:
+  explicit HuberTwoClassification(const LayerConfig& config)
+      : HuberCost(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+};
+
+typedef std::shared_ptr<CostLayer> CostLayerPtr;
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CropLayer.cpp b/paddle/legacy/gserver/layers/CropLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d891375ecce0371503ba3034f0584f3b1e553a55
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CropLayer.cpp
@@ -0,0 +1,146 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CropLayer.h"
+#include "paddle/legacy/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(crop, CropLayer);
+
+bool CropLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_LE(static_cast<int>(inputLayers_.size()), 2);
+  CHECK_GE(static_cast<int>(inputLayers_.size()), 1);
+  crop_axis_ = config_.axis();
+  for (int i = 0; i < config_.offset_size(); i++) {
+    crop_offsets_.push_back(config_.offset(i));
+  }
+
+  // 1. get input_0 shape
+  auto& input0_img_conf = config_.inputs(0).image_conf();
+  inDims_ = TensorShape({0,
+                         input0_img_conf.channels(),
+                         input0_img_conf.has_img_size_y()
+                             ? input0_img_conf.img_size_y()
+                             : input0_img_conf.img_size(),
+                         input0_img_conf.img_size()});
+  // 2. get target dims from config
+  if (config_.inputs_size() == 1) {
+    targetDims_ = TensorShape({config_.shape(0),
+                               config_.shape(1),
+                               config_.shape(2),
+                               config_.shape(3)});
+  } else {
+    // 2. get input_1 shape
+    auto& input1_img_conf = config_.inputs(1).image_conf();
+    targetDims_ = TensorShape({0,
+                               input1_img_conf.channels(),
+                               input1_img_conf.has_img_size_y()
+                                   ? input1_img_conf.img_size_y()
+                                   : input1_img_conf.img_size(),
+                               input1_img_conf.img_size()});
+  }
+
+  // 3. get final crop corner
+  int dimSize = 4;
+  crop_corner_ = {0, 0, 0, 0};
+  for (int i = 0; i < dimSize; i++) {
+    if (i >= crop_axis_) {
+      if (crop_offsets_.size() > 1) {
+        crop_corner_[i] = crop_offsets_[i - crop_axis_];
+      } else {
+        crop_corner_[i] = crop_offsets_[0];
+      }
+    }
+  }
+
+  outDims_ = TensorShape(4);
+
+  createFunction(
+      forward_, "Crop", FuncConfig().set("crop_corner", crop_corner_));
+  createFunction(
+      backward_, "CropGrad", FuncConfig().set("crop_corner", crop_corner_));
+
+  return true;
+}
+
+void CropLayer::setOutDims() {
+  MatrixPtr input = inputLayers_[1]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  // get target dims from input_1
+  if (config_.inputs_size() == 2) {
+    targetDims_.setDim(0, batchSize);
+    int ch = config_.inputs(0).image_conf().channels();
+    if (ch != 0) targetDims_.setDim(1, ch);
+    int h = inputLayers_[1]->getOutput().getFrameHeight();
+    if (h != 0) targetDims_.setDim(2, h);
+    int w = inputLayers_[1]->getOutput().getFrameWidth();
+    if (w != 0) targetDims_.setDim(3, w);
+  }
+  // get final crop shape from target dims and crop axis
+  std::vector<uint32_t> crop_shape;
+  int dimSize = 4;
+  for (int i = 0; i < dimSize; i++) {
+    if (i >= crop_axis_) {
+      crop_shape.push_back(targetDims_[i]);
+    } else {
+      crop_shape.push_back(inDims_[i]);
+    }
+  }
+
+  outDims_.reshape(
+      {crop_shape[0], crop_shape[1], crop_shape[2], crop_shape[3]});
+  output_.setFrameHeight(crop_shape[2]);
+  output_.setFrameWidth(crop_shape[3]);
+}
+
+void CropLayer::setInDims() {
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  inDims_.setDim(0, batchSize);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h);
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);
+}
+
+void CropLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  setInDims();
+  setOutDims();
+  int size = outDims_[1] * outDims_[2] * outDims_[3];
+  resetOutput(outDims_[0], size);
+  MatrixPtr outV = getOutputValue();
+  REGISTER_TIMER_INFO("CropForward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+}
+
+void CropLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  REGISTER_TIMER_INFO("CropBackward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CropLayer.h b/paddle/legacy/gserver/layers/CropLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef88bc483d157406a0f5a7924c14c345ea0df8c4
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CropLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  This layer crop input according to the specify conf.
+ *         input_0: input to be cropped
+ *         input_1: optional reference input
+ *         axis: start dimension to be croped
+ *         offset: offset of cropping  in each dimension
+ *         shape: if reference input layer was not setted,
+ *                  crop input as this shape conf
+ */
+class CropLayer : public Layer {
+ public:
+  explicit CropLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~CropLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  void setOutDims();
+  void setInDims();
+
+  int32_t crop_axis_;
+  std::vector<uint32_t> crop_offsets_;
+  std::vector<uint32_t> crop_corner_;
+  TensorShape inDims_;
+  TensorShape targetDims_;
+  TensorShape outDims_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp b/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0fe100a96c01713f6c8d10d4eff428e7e743b002
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CrossChannelNormLayer.cpp
@@ -0,0 +1,137 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "NormLayer.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data,
+                                                    size_t iter,
+                                                    size_t spatialDim) {
+  return Matrix::create(data->getData() + iter * channels_ * spatialDim,
+                        channels_,
+                        spatialDim,
+                        false,
+                        useGpu_);
+}
+
+MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
+                                                     size_t iter,
+                                                     size_t spatialDim) {
+  return Matrix::create(
+      data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
+}
+
+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
+
+void CrossChannelNormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr inV = getInputValue(0);
+
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = inV->getWidth();
+  CHECK_EQ(getSize(), dataDim);
+
+  reserveOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+  size_t spatialDim = dataDim / channels_;
+
+  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
+  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
+  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
+
+  inV->square2(*dataBuffer_);
+  for (size_t i = 0; i < batchSize; i++) {
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim);
+    MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
+    // compute norm.
+    spatialBuffer_->sumCols(*dataTmp, 1, 0);
+    // add eps to avoid overflow
+    spatialBuffer_->add(1e-6);
+    spatialBuffer_->sqrt2(*spatialBuffer_);
+    normTmp->copyFrom(*spatialBuffer_);
+    outVTmp->copyFrom(*inVTmp);
+    outVTmp->divRowVector(*spatialBuffer_);
+    // scale the layer.
+    outVTmp->mulColVector(*scale_->getW());
+  }
+}
+
+void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr outV = getOutputValue();
+
+  size_t batchSize = inG->getHeight();
+  size_t dataDim = inG->getWidth();
+  size_t spatialDim = dataDim / channels_;
+
+  MatrixPtr inGBuffer;
+  Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
+
+  dataBuffer_->dotMul(*outG, *outV);
+  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
+  scaleDiff_->zeroMem();
+  for (size_t i = 0; i < batchSize; i++) {
+    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
+    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
+    channelBuffer_->sumRows(*dataTmp, 1, 0);
+    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
+    // store a / scale[i] in scaleDiff_ temporary
+    scaleDiff_->add(*channelBuffer_, 1.);
+
+    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
+    // scale the grad
+    inGBuffer->copyFrom(*inVTmp);
+    inGBuffer->mulRowVector(*spatialBuffer_);
+    // divide by square of norm
+    spatialBuffer_->dotMul(*normTmp, *normTmp);
+    inGBuffer->divRowVector(*spatialBuffer_);
+    // subtract
+    inGBuffer->add(*outGTmp, -1, 1);
+    // divide by norm
+    inGBuffer->divRowVector(*normTmp);
+    // scale the diff
+    inGBuffer->mulColVector(*scale_->getW());
+
+    inGTmp->add(*inGBuffer);
+  }
+  // updata scale
+  if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
+  scale_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
similarity index 100%
rename from paddle/gserver/layers/CrossEntropyOverBeam.cpp
rename to paddle/legacy/gserver/layers/CrossEntropyOverBeam.cpp
diff --git a/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8702b16165eee8d552c563082ffc708ce443deb
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CrossEntropyOverBeam.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "CrossEntropyOverBeam.h"
+#include "Layer.h"
+
+namespace paddle {
+
+/* This struct stores the beams in all search steps for a single sequence. */
+struct BeamExpansion {
+  std::vector<MatrixPtr> scores;
+  std::vector<IVectorPtr> seqInfo;
+
+  std::vector<MatrixPtr> candidateIds;
+  std::vector<int> gold;
+
+  std::vector<MatrixPtr> scoreGrad;
+
+  size_t expansionCount;
+
+  explicit BeamExpansion(int n) {
+    expansionCount = n;
+    scores.resize(expansionCount);
+    seqInfo.resize(expansionCount);
+    candidateIds.resize(expansionCount);
+    scoreGrad.resize(expansionCount);
+
+    gold.resize(expansionCount);
+  }
+};
+typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
+
+class CostForOneSequence {
+ public:
+  CostForOneSequence()
+      : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {}
+  void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
+    beams_ = bPtr;
+    beamSize_ = beamSize;
+
+    expandedPathScores_.clear();
+    expandedPathScores_.resize(beams_->expansionCount);
+
+    goldRowIds_.clear();
+    goldRowIds_.resize(beams_->expansionCount, 0);
+    goldColIds_.clear();
+    goldColIds_.resize(beams_->expansionCount, -1);
+  }
+  size_t getValidExpansionCount() { return validExpansionCount_; }
+
+  real forward();
+  void backward();
+
+ private:
+  void calValidExpandStep();
+  void constructTotalExpansion();
+  size_t initLastExpansion();
+  real globallyNormalizedScore();
+
+  int getSeqStartPos(size_t beamId, size_t rowId) {
+    CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
+    int* starts = beams_->seqInfo[beamId]->getData();
+    return starts[rowId] - starts[0];
+  }
+
+  size_t beamSize_;
+  size_t validExpansionCount_;
+  bool goldAsExtraPath_;
+  std::vector<int> goldRowIds_;
+  std::vector<int> goldColIds_;
+
+  BeamExpansionPtr beams_;
+  std::vector<std::vector<int>> pathRowIdsInEachBeam_;
+  std::vector<int> parentIdsInBeam_;
+  size_t goldIdsInFinalExpansion_;
+
+  std::vector<MatrixPtr> expandedPathScores_;
+
+  MatrixPtr softmaxOut_;
+};
+
+class CrossEntropyOverBeam : public Layer {
+ public:
+  explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+ private:
+  void checkInputs();
+  void copyInputsToCpu();
+  void resizeOutput();
+  void copyGradToGpu(size_t copyCount);
+  void splitBatchBeams();
+
+  size_t beamExpanCount_;
+  size_t batchSize_;
+  size_t beamSize_;
+
+  /*
+   * the process of constructing beams is not friendly to GPU, currently, this
+   * layer only runs on CPU, if any of its inputs is on GPU memory, then copy
+   * it to CPU memory.
+   */
+  std::vector<MatrixPtr> candidateScores_;
+  std::vector<MatrixPtr> candidateScoreGrad_;
+  std::vector<MatrixPtr> candidateInBeam_;
+  std::vector<MatrixPtr> gradToInputs_;
+  std::vector<IVectorPtr> goldSequence_;
+  std::vector<std::vector<int>> beamSplitPos_;
+
+  /*
+   * split entire bath of beams into beam per sequnence and store the result
+   * into this member.
+   */
+  std::vector<BeamExpansion> beamPerSeq_;
+  /* beamCosts_ is used to propagate error in one sequence. */
+  std::vector<CostForOneSequence> beamCosts_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..051155e0d2c1b4910c6627a902a4150cbfb15800
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.cpp
@@ -0,0 +1,180 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CudnnBatchNormLayer.h"
+#include "Layer.h"
+#include "paddle/legacy/cuda/include/hl_batch_norm.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
+
+bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  if (!BatchNormBaseLayer::init(layerMap, parameterMap)) return false;
+  CHECK(useGpu_) << "CudnnBatchNorm only support GPU";
+
+  hl_create_tensor_descriptor(&ioDesc_);
+  hl_create_tensor_descriptor(&bnParamDesc_);
+  hl_tensor_reshape(bnParamDesc_, 1, channels_, 1, 1);
+
+  return true;
+}
+
+void CudnnBatchNormLayer::reshape(int batchSize) {
+  hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_);
+}
+
+void CudnnBatchNormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = getInputValue(0)->getHeight();
+  calFeatureMapSize();
+  reshape(batchSize);
+  resetOutput(batchSize, getInputValue(0)->getWidth());
+
+  // for testing in training peroid.
+  useGlobalStats_ = (passType == PASS_TEST);
+  if (passType == PASS_TEST && config_.has_use_global_stats()) {
+    useGlobalStats_ = config_.use_global_stats();
+  }
+
+  real* input = getInputValue(0)->getData();
+  real* output = getOutputValue()->getData();
+  real* gamma = weight_->getW()->getData();
+  real* beta = biases_->getW()->getData();
+  real* movingMean = movingMean_->getW()->getData();
+  real* movingVar = movingVar_->getW()->getData();
+
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
+  if (!useGlobalStats_) {
+    REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
+    real* savedMean = savedMean_->getData();
+    real* savedInvVar = savedInvVar_->getData();
+    hl_batch_norm_forward_training(ioDesc_,
+                                   input,
+                                   ioDesc_,
+                                   output,
+                                   bnParamDesc_,
+                                   gamma,
+                                   beta,
+                                   1.0 - movingAvgFraction_,
+                                   movingMean,
+                                   movingVar,
+                                   eps_,
+                                   savedMean,
+                                   savedInvVar);
+  } else {
+    // used movingMean and movingVar in testing
+    if (batchSize <= 1024) {
+      hl_batch_norm_forward_inference(ioDesc_,
+                                      input,
+                                      ioDesc_,
+                                      output,
+                                      bnParamDesc_,
+                                      gamma,
+                                      beta,
+                                      movingMean,
+                                      movingVar,
+                                      eps_);
+    } else {
+      // There is a limitation in cudnn library.
+      // When the batch size is larger than 1024 in cuDNN v5.1,
+      // the cudnnBatchNormalizationForwardInference will fail.
+      hl_batch_norm_cuda_inference(input,
+                                   output,
+                                   gamma,
+                                   beta,
+                                   movingMean,
+                                   movingVar,
+                                   eps_,
+                                   batchSize,
+                                   channels_,
+                                   imageH_ * imageD_,
+                                   imageW_);
+    }
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  real* input = getInputValue(0)->getData();
+  real* outGrad = getOutputGrad()->getData();
+  real* inGrad = getInputGrad(0)->getData();
+  real* gamma = weight_->getW()->getData();
+  real* savedMean = savedMean_->getData();
+  real* savedInvVar = savedInvVar_->getData();
+
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
+  auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
+    Matrix::resizeOrCreate(m, h, w, false, true);
+    m->zeroMem();
+    *p = m->getData();
+  };
+
+  real* gammaGrad = nullptr;
+  real* betaGrad = nullptr;
+  if (weight_->getWGrad()) {
+    gammaGrad = weight_->getWGrad()->getData();
+  } else {
+    create(tmpWGrad_, 1, channels_, &gammaGrad);
+  }
+  if (biases_ && biases_->getWGrad()) {
+    betaGrad = biases_->getWGrad()->getData();
+  } else {
+    create(tmpBiasGrad_, 1, channels_, &betaGrad);
+  }
+
+  hl_batch_norm_backward(ioDesc_,
+                         input,
+                         ioDesc_,
+                         outGrad,
+                         ioDesc_,
+                         inGrad,
+                         bnParamDesc_,
+                         gamma,
+                         gammaGrad,
+                         betaGrad,
+                         eps_,
+                         savedMean,
+                         savedInvVar);
+
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    biases_->getParameterPtr()->incUpdate(callback);
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+CudnnBatchNormLayer::~CudnnBatchNormLayer() {
+  hl_destroy_tensor_descriptor(ioDesc_);
+  hl_destroy_tensor_descriptor(bnParamDesc_);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b33b983b31173ab941df5f2e66eac51aabc6315
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CudnnBatchNormLayer.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cudnn.h>
+#include "BatchNormBaseLayer.h"
+#include "Layer.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief Cudnn Batch normalization layer use to cuDNN lib to implentment.
+ * @note Cudnn version must >= v4.0, and better to use the latest version
+ * (v5.1).
+ *
+ * The config file api is batch_norm_layer.
+ */
+
+class CudnnBatchNormLayer : public BatchNormBaseLayer {
+ public:
+  explicit CudnnBatchNormLayer(const LayerConfig& config)
+      : BatchNormBaseLayer(config) {}
+
+  ~CudnnBatchNormLayer();
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  /**
+   * reshape tensor of ioDesc_.
+   */
+  void reshape(int batchSize);
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  /// Epsilon value used in the batch normalization formula.
+  /// Same epsilon value should be used in forward and backward functions.
+  double eps_;
+
+  /// Input/output tensor descriptor desc
+  hl_tensor_descriptor ioDesc_;
+  /// Shared tensor descriptor desc for the 6 tenros:
+  /// bnScale, bnBias, running mean/var, save_mean/var
+  hl_tensor_descriptor bnParamDesc_;
+
+  /**
+   * @brief The gradient of weight and bias in cudnn api can not be empty.
+   * If set is_static for weight or bias, it will not allocate memory for them,
+   * and the gradient is NULL. In this case, will use two matrix.
+   */
+  MatrixPtr tmpWGrad_, tmpBiasGrad_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9353cca9c83bd90a454b2be56dc08b8eadee0bf7
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.cpp
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CudnnConvBaseLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer);
+REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer);
+
+bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
+                              const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  CHECK(useGpu_) << "CudnnConvLayer only support gpu";
+
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  projections_.reserve(inputLayers_.size());
+  projConf_.reserve(inputLayers_.size());
+
+  numFilters_ = config_.num_filters();
+  CHECK(config_.shared_biases());
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    ProjectionConfig *conf = new ProjectionConfig();
+    if (isDeconv_) {
+      conf->set_type("convt");
+    } else {
+      conf->set_type("conv");
+    }
+    conf->set_num_filters(numFilters_);
+    ConvConfig *convConf = conf->mutable_conv_conf();
+    *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf());
+    conf->set_input_size(getPrev(i)->getSize());
+    conf->set_output_size(getSize());
+    projConf_.emplace_back(conf);
+    projections_.emplace_back(
+        Projection::create(*projConf_[i], parameters_[i], useGpu_));
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[i] * filterChannels_[i];
+    width = (!isDeconv_) ? numFilters_ : channels_[i];
+    CHECK_EQ(parameters_[i]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[i]);
+    weights_.emplace_back(w);
+  }
+
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
+  if (biases_.get() && sharedBiases_) {
+    hl_create_tensor_descriptor(&biasDesc_);
+    hl_create_tensor_descriptor(&outputDesc_);
+    hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1);
+  }
+
+  return true;
+}
+
+void CudnnConvBaseLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = getInput(0).getBatchSize();
+  resetOutput(batchSize, calOutputSize());
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    projections_[i]->forward(&getInput(i), &getOutput(), passType);
+  }
+
+  if (biases_) {
+    REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
+    int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+    int outH = outputH_[0];
+    int outW = outputW_[0];
+
+    hl_tensor_reshape(outputDesc_,
+                      batchSize,
+                      numFilters_,
+                      outH,
+                      outW,
+                      numFilters_ * outH * outW,
+                      outH * outW,
+                      outW,
+                      1);
+    real *outData = getOutputValue()->getData();
+    real *biasData = biases_->getW()->getData();
+    hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData);
+  }
+
+  forwardActivation();
+}
+
+void CudnnConvBaseLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str());
+    real *biasGrad = biases_->getWGrad()->getData();
+    real *outGrad = getOutputGrad()->getData();
+    hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
+
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    projections_[i]->backward(callback);
+  }
+}
+
+CudnnConvBaseLayer::~CudnnConvBaseLayer() {
+  if (biases_) {
+    hl_destroy_tensor_descriptor(biasDesc_);
+    hl_destroy_tensor_descriptor(outputDesc_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d050183eb7838bed803995985383e0ee4e9731a1
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CudnnConvBaseLayer.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "Projection.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A 2-dimension conv layer implemented by cuDNN. It only
+ *        supports GPU mode. We automatic select CudnnConvLayer for GPU
+ *        mode and ExpandConvLayer for CPU mode if you set type of "conv".
+ *        User also can specfiy type of "exconv" or "cudnn_conv" for
+ *        particular type.
+ *
+ * The config file api is img_conv_layer.
+ */
+class CudnnConvBaseLayer : public ConvBaseLayer {
+ protected:
+  std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
+  std::vector<std::unique_ptr<Projection>> projections_;
+
+  hl_tensor_descriptor biasDesc_;
+  hl_tensor_descriptor outputDesc_;
+
+ public:
+  explicit CudnnConvBaseLayer(const LayerConfig& config)
+      : ConvBaseLayer(config) {}
+
+  ~CudnnConvBaseLayer();
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp b/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c790dfd71efbee1a2a0afa69e6c336c4330737d0
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CudnnPoolLayer.cpp
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CudnnPoolLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+bool CudnnPoolLayer::typeCheck(const std::string &poolType,
+                               hl_pooling_mode_t *mode) {
+  if (poolType == "cudnn-max-pool") {
+    if (mode) {
+      *mode = HL_POOLING_MAX;
+    }
+  } else if (poolType == "cudnn-avg-pool") {
+    if (mode) {
+      *mode = HL_POOLING_AVERAGE;
+    }
+  } else if (poolType == "cudnn-avg-incl-pad-pool") {
+    if (mode) {
+      *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
+    }
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+CudnnPoolLayer::CudnnPoolLayer(const LayerConfig &config) : PoolLayer(config) {
+  const std::string &pool_type = config.inputs(0).pool_conf().pool_type();
+  CHECK_EQ(CudnnPoolLayer::typeCheck(pool_type, &mode_), true);
+}
+
+bool CudnnPoolLayer::init(const LayerMap &layerMap,
+                          const ParameterMap &parameterMap) {
+  PoolLayer::init(layerMap, parameterMap);
+
+  CHECK(useGpu_) << "CudnnPoolLayer only support gpu";
+
+  hl_create_tensor_descriptor(&inputDesc_);
+  hl_create_tensor_descriptor(&outputDesc_);
+
+  windowHeight = sizeY_;
+  windowWidth = sizeX_;
+  heightPadding = confPaddingY_;
+  widthPadding = confPadding_;
+  strideHeight = strideY_;
+  strideWidth = stride_;
+
+  hl_create_pooling_descriptor(&poolingDesc_,
+                               mode_,
+                               windowHeight,
+                               windowWidth,
+                               heightPadding,
+                               widthPadding,
+                               strideHeight,
+                               strideWidth);
+
+  return true;
+}
+
+void CudnnPoolLayer::reshape(int batchSize) {
+  imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (imageH_ == 0) {
+    imageH_ = imgSizeY_;
+  }
+  if (imageW_ == 0) {
+    imageW_ = imgSize_;
+  }
+  CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(),
+           channels_ * imageH_ * imageW_);
+  outputH_ = outputSize(imageH_,
+                        sizeY_,
+                        confPaddingY_,
+                        strideY_,
+                        /* caffeMode */ false);
+  outputW_ =
+      outputSize(imageW_, sizeX_, confPadding_, stride_, /* caffeMode */ false);
+  getOutput().setFrameHeight(outputH_);
+  getOutput().setFrameWidth(outputW_);
+
+  hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_);
+  hl_tensor_reshape(outputDesc_, batchSize, channels_, outputH_, outputW_);
+}
+
+void CudnnPoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  CHECK(inputLayers_[0]->getOutputValue()->useGpu());
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  reshape(batchSize);
+  resetOutput(batchSize, outputH_ * outputW_ * channels_);
+
+  real *inputData = getInputValue(0)->getData();
+  real *outData = getOutputValue()->getData();
+  hl_pooling_forward(inputDesc_, inputData, outputDesc_, outData, poolingDesc_);
+}
+
+void CudnnPoolLayer::backward(const UpdateCallback &callback) {
+  (void)callback;
+  if (NULL == getInputGrad(0)) {
+    return;
+  }
+
+  real *inputData = getInputValue(0)->getData();
+  real *inputGrad = getInputGrad(0)->getData();
+  real *outData = getOutputValue()->getData();
+  real *outGrad = getOutputGrad()->getData();
+  hl_pooling_backward(inputDesc_,
+                      inputData,
+                      inputGrad,
+                      outputDesc_,
+                      outData,
+                      outGrad,
+                      poolingDesc_);
+}
+
+CudnnPoolLayer::~CudnnPoolLayer() {
+  hl_destroy_tensor_descriptor(inputDesc_);
+  hl_destroy_tensor_descriptor(outputDesc_);
+  hl_destroy_pooling_descriptor(poolingDesc_);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/CudnnPoolLayer.h b/paddle/legacy/gserver/layers/CudnnPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc249354d10333211691b6844bffa3c8da8a79ee
--- /dev/null
+++ b/paddle/legacy/gserver/layers/CudnnPoolLayer.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "PoolLayer.h"
+
+namespace paddle {
+
+/**
+ * @brief CudnnPoolLayer is subclass of PoolLayer, which is implemented by
+ * cudnn api and only supports GPU.
+ *
+ * The config file api is img_pool_layer.
+ */
+
+class CudnnPoolLayer : public PoolLayer {
+ protected:
+  int windowHeight, windowWidth;
+  int heightPadding, widthPadding, strideHeight, strideWidth;
+  int imageH_, imageW_, outputH_, outputW_;
+  /// mode_ is poolint type, inlcuding "cudnn-max-pool", "cudnn-avg-pool"
+  /// "cudnn-avg-excl-pad-pool".
+  hl_pooling_mode_t mode_;
+  /// cudnn tensor descriptor for input.
+  hl_tensor_descriptor inputDesc_;
+  /// cudnn tensor descriptor for output.
+  hl_tensor_descriptor outputDesc_;
+  /// A description of a pooling operation.
+  hl_pooling_descriptor poolingDesc_;
+
+ public:
+  static bool typeCheck(const std::string& poolType,
+                        hl_pooling_mode_t* mode = nullptr);
+  explicit CudnnPoolLayer(const LayerConfig& config);
+  ~CudnnPoolLayer();
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  /**
+   * Reshape input and output tensor descriptor.
+   * The batch size maybe change during training in last batch of each pass.
+   * So reshaping is needed.
+   */
+  void reshape(int batchSize);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/legacy/gserver/layers/DataLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/DataLayer.cpp
rename to paddle/legacy/gserver/layers/DataLayer.cpp
diff --git a/paddle/legacy/gserver/layers/DataLayer.h b/paddle/legacy/gserver/layers/DataLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d02f5a4697b9067f7d34e4d0b2d34f8c63ffe020
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DataLayer.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "Layer.h"
+
+namespace paddle {
+/**
+ * This layer just copy data to output, and has no backward propagation.
+ *
+ * The config file api is data_layer.
+ */
+class DataLayer : public Layer {
+ public:
+  explicit DataLayer(const LayerConfig& config) : Layer(config) {}
+
+  virtual void setData(const Argument& data) { data_ = data; }
+
+  /**
+   * Prefetch sparse matrix/ids only.
+   */
+  void prefetch() override { output_ = data_; }
+
+  /**
+   * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims,
+   * sequenceStartPositions, subSequenceStartPositions, strs) to output_.
+   */
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+    copyDataToOutput(output_);
+    if (FLAGS_show_layer_stat) {
+      showOutputStats();
+    }
+  }
+
+  /**
+   * Data layer's backward propagation do nothing.
+   */
+  void backward(const UpdateCallback& callback) override { (void)callback; }
+
+  void copyOutputToOtherDevice() override {
+    for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
+      copyDataToOutput(outputOtherDevice_[i]);
+    }
+  }
+
+ private:
+  void copyDataToOutput(Argument& output);
+
+ protected:
+  Argument data_;
+};
+
+typedef std::shared_ptr<DataLayer> DataLayerPtr;
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataNormLayer.cpp b/paddle/legacy/gserver/layers/DataNormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6820dfa4d4dcf90b2318a190ad4cc082c26fc180
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DataNormLayer.cpp
@@ -0,0 +1,140 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DataNormLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(data_norm, DataNormLayer);
+
+bool DataNormLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  /* initialize the weight */
+  CHECK(!biasParameter_) << "DataNormLayer does not need bias";
+  CHECK(inputLayers_.size() == 1 && inputLayers_[0]->getType() == "data")
+      << "DataNormLayer accepts one and only one DataLayer as its input layer";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK_EQ(inputLayers_[0]->getSize(), getSize());
+  CHECK_EQ(parameters_[0]->getSize(), 5 * getSize());
+  CHECK(parameters_[0]->isStatic())
+      << "The parameter of DataNormLayer must be static";
+
+  weight_ = std::unique_ptr<Weight>(new Weight(5, getSize(), parameters_[0]));
+  min_ = Matrix::create(
+      nullptr, /* height= */ 1, getSize(), /* trans= */ false, useGpu_);
+  rangeReciprocal_ = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    getSize(),
+                                    /* trans= */ false,
+                                    useGpu_);
+  mean_ = Matrix::create(nullptr,
+                         /* height= */ 1,
+                         getSize(),
+                         /* trans= */ false,
+                         useGpu_);
+  stdReciprocal_ = Matrix::create(nullptr,
+                                  /* height= */ 1,
+                                  getSize(),
+                                  /* trans= */ false,
+                                  useGpu_);
+  decimalReciprocal_ = Matrix::create(nullptr,
+                                      /* height= */ 1,
+                                      getSize(),
+                                      /* trans= */ false,
+                                      useGpu_);
+
+  min_->setData(weight_->getW()->getData());
+  rangeReciprocal_->setData(weight_->getW()->getData() + getSize());
+  mean_->setData(weight_->getW()->getData() + 2 * getSize());
+  stdReciprocal_->setData(weight_->getW()->getData() + 3 * getSize());
+  decimalReciprocal_->setData(weight_->getW()->getData() + 4 * getSize());
+
+  /* normalization strategy */
+  if (config_.data_norm_strategy() == "z-score") {
+    mode_ = kZScore;
+  } else if (config_.data_norm_strategy() == "min-max") {
+    mode_ = kMinMax;
+  } else if (config_.data_norm_strategy() == "decimal-scaling") {
+    mode_ = kDecimalScaling;
+  } else {
+    LOG(FATAL) << "Unknown data normalization strategy: "
+               << config_.data_norm_strategy();
+  }
+
+  return true;
+}
+
+void DataNormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInput(0).getBatchSize();
+  int size = getSize();
+  reserveOutput(batchSize, size);
+
+  const MatrixPtr inValue = getInputValue(0);
+  MatrixPtr outValue = getOutputValue();
+  outValue->copyFrom(*inValue);
+  switch (mode_) {
+    case kZScore: {
+      outValue->addBias(*mean_, -1.0);
+      outValue->colScale(0, *outValue, *stdReciprocal_);
+      break;
+    }
+    case kMinMax: {
+      outValue->addBias(*min_, -1.0);
+      outValue->colScale(0, *outValue, *rangeReciprocal_);
+      break;
+    }
+    case kDecimalScaling: {
+      outValue->colScale(0, *outValue, *decimalReciprocal_);
+      break;
+    }
+    default:
+      LOG(FATAL) << "should not reach here";
+  }
+}
+
+void DataNormLayer::backward(const UpdateCallback& callback) {
+  // The parameter for DataNormLayer is static, and does not need to be updated
+  (void)callback;
+
+  /* Calculate the input layers error */
+  const MatrixPtr& outGrad = getOutputGrad();
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad) {
+    switch (mode_) {
+      case kZScore: {
+        inGrad->addColScale(0, *outGrad, *stdReciprocal_);
+        break;
+      }
+      case kMinMax: {
+        inGrad->addColScale(0, *outGrad, *rangeReciprocal_);
+        break;
+      }
+      case kDecimalScaling: {
+        inGrad->addColScale(0, *outGrad, *decimalReciprocal_);
+        break;
+      }
+      default: { LOG(FATAL) << "should not reach here"; }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DataNormLayer.h b/paddle/legacy/gserver/layers/DataNormLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bb8e928248355cb7ae78dc16e467b77a42e02fc
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DataNormLayer.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for data normalization
+ * - Input: One and only one input layer is accepted. The input layer must
+ *        be DataLayer with dense data type.
+ * - Output: The normalization of the input data
+ *
+ * Reference:
+ *    LA Shalabi, Z Shaaban, B Kasasbeh. Data mining: A preprocessing engine
+ *
+ * Three data normalization methoeds are considered
+ * - z-score: y = (x-mean)/std
+ * - min-max: y = (x-min)/(max-min)
+ * - decimal-scaling: y = x/10^j, where j is the smallest integer such that
+ *max(|y|)<1
+ */
+
+class DataNormLayer : public Layer {
+ public:
+  enum NormalizationStrategy { kZScore = 0, kMinMax = 1, kDecimalScaling = 2 };
+
+  explicit DataNormLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~DataNormLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  int mode_;
+  std::unique_ptr<Weight> weight_;
+  MatrixPtr min_;
+  MatrixPtr rangeReciprocal_;  // 1/(max-min)
+  MatrixPtr mean_;
+  MatrixPtr stdReciprocal_;      // 1/std
+  MatrixPtr decimalReciprocal_;  // 1/10^j
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DeConv3DLayer.cpp b/paddle/legacy/gserver/layers/DeConv3DLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2cd635564c4cd9f42d27cd58694cff381d1ce224
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DeConv3DLayer.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DeConv3DLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(deconv3d, DeConv3DLayer);
+
+bool DeConv3DLayer::init(const LayerMap &layerMap,
+                         const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  // for Deconv, the dimension of Kernel is
+  // channel * output * depth * height * weigth
+  // Matrix storage format: (output * depth * height * weigth) x  channel
+  for (int index = 0; index < config_.inputs().size(); ++index) {
+    M_.push_back(filterChannels_[index]);
+    K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index]));
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[index] * numFilters_;
+    width = filterChannels_[index];
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
+  return true;
+}
+
+size_t DeConv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  imgSizeW_.clear();
+  imgSizeH_.clear();
+  imgSizeD_.clear();
+  N_.clear();
+  NOut_.clear();
+  size_t layerSize = 0;
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    imgSizeW_.push_back(
+        imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    imgSizeH_.push_back(imageSize(
+        outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    imgSizeD_.push_back(imageSize(
+        outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+    NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
+    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
+    layerSize += NOut_[i] * numFilters_;
+  }
+  getOutput().setFrameHeight(imgSizeH_[0]);
+  getOutput().setFrameWidth(imgSizeW_[0]);
+  getOutput().setFrameDepth(imgSizeD_[0]);
+  return layerSize;
+}
+
+void DeConv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+  const MatrixPtr outMat = getOutputValue();
+
+  REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    const MatrixPtr &inMat = getInputValue(i);
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    MatrixPtr wMat = weights_[i]->getW();
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    for (int n = 0; n < batchSize; ++n) {
+      real *inData = inMat->getData() + n * inMat->getStride();
+      for (int g = 0; g < groups_[i]; ++g) {
+        MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
+        MatrixPtr wMatSub = wMat->subMatrix(g * K, K);
+        MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
+        colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0);
+        inData += M * N;
+      }
+      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
+                       numFilters_,
+                       imgSizeD_[i],
+                       imgSizeH_[i],
+                       imgSizeW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i],
+                       1.0,
+                       1.0);
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    this->addBias();
+  }
+  forwardActivation();
+}
+
+void DeConv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+  int batchSize = getOutputGrad()->getHeight();
+  if (biases_ && biases_->getWGrad()) {
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+  REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    if (weights_[i]->getWGrad() || this->needGradient_) {
+      int M = M_[i];
+      int N = N_[i];
+      int K = K_[i];
+      Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+      const MatrixPtr &inMat = getInputValue(i);
+      for (int n = 0; n < batchSize; ++n) {
+        colBuf_->vol2Col(
+            getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
+            numFilters_,
+            imgSizeD_[i],
+            imgSizeH_[i],
+            imgSizeW_[i],
+            filterSizeZ_[i],
+            filterSizeY_[i],
+            filterSize_[i],
+            strideZ_[i],
+            strideY_[i],
+            stride_[i],
+            paddingZ_[i],
+            paddingY_[i],
+            padding_[i]);
+        if (weights_[i]->getWGrad()) {
+          real *inData = inMat->getData() + n * inMat->getStride();
+          for (int g = 0; g < groups_[i]; ++g) {
+            MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
+            MatrixPtr wGradMatSub =
+                weights_[i]->getWGrad()->subMatrix(g * K, K);
+            MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
+            wGradMatSub->mul(
+                *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0);
+            inData += M * N;
+          }
+        }
+        if (getInputGrad(i)) {
+          real *preGrad =
+              getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+          for (int g = 0; g < groups_[i]; ++g) {
+            MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K);
+            MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K);
+            MatrixPtr inGradMatSub =
+                Matrix::create(preGrad, M, N, false, useGpu_);
+            inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0);
+            preGrad += M * N;
+          }
+        }
+      }
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+void DeConv3DLayer::bpropWeights(int i) {}
+void DeConv3DLayer::bpropData(int i) {}
+
+void DeConv3DLayer::bpropBiases() {
+  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
+                                    1,
+                                    biases_->getWGrad()->getElementCnt(),
+                                    false,
+                                    useGpu_);
+  const MatrixPtr &outGradMat = getOutputGrad();
+
+  if (this->sharedBiases_) {
+    biases->collectSharedBias(*outGradMat, 1.0f);
+  } else {
+    biases->collectBias(*outGradMat, 1.0f);
+  }
+}
+
+void DeConv3DLayer::addBias() {
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
+                                  1,
+                                  biases_->getW()->getElementCnt(),
+                                  false,
+                                  useGpu_);
+  if (this->sharedBiases_) {
+    outMat->addSharedBias(*(bias), 1.0f);
+  } else {
+    outMat->addBias(*(bias), 1.0f);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DeConv3DLayer.h b/paddle/legacy/gserver/layers/DeConv3DLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..9931bccb1284111e299206883847045edaae4ded
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DeConv3DLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of deconvolution3D layer.
+ * This layer expands input and use matrix multiplication to
+ * calculate deconvolution3D operation.
+ */
+class DeConv3DLayer : public ConvBaseLayer {
+ public:
+  explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~DeConv3DLayer() {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+
+ protected:
+  // Figure out the dimensions for individual gemms.
+  IntV M_;  /// numFilters_ / filter_group_;
+  IntV N_;  /// channels_ * filterSizeZ_ * filterSize_ * filterSizeY_
+  IntV K_;  /// outputD_ * outputH_ * outputW_
+  IntV NOut_;
+  MatrixPtr colBuf_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/DetectionOutputLayer.cpp
rename to paddle/legacy/gserver/layers/DetectionOutputLayer.cpp
diff --git a/paddle/legacy/gserver/layers/DetectionOutputLayer.h b/paddle/legacy/gserver/layers/DetectionOutputLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0270ed33141993665aeabdc53829600a4403643
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DetectionOutputLayer.h
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * The detection output layer for a SSD detection task. This layer applies the
+ * Non-maximum suppression to the all predicted bounding box and keeps the
+ * Top-K bounding boxes.
+ * - Input: This layer needs three input layers: The first input layer
+ *          is the priorbox layer. The rest two input layers are convolution
+ *          layers for generating bbox location offset and the classification
+ *          confidence.
+ * - Output: The predict bounding box locations.
+ */
+
+class DetectionOutputLayer : public Layer {
+ public:
+  explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr) {}
+
+ protected:
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[1 + index];
+  }
+
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[1 + inputNum_ + index];
+  }
+
+ private:
+  size_t numClasses_;  // number of classes
+  size_t inputNum_;    // number of input layers
+  real nmsThreshold_;
+  real confidenceThreshold_;
+  size_t nmsTopK_;
+  size_t keepTopK_;
+  size_t backgroundId_;
+
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/legacy/gserver/layers/DetectionUtil.cpp
similarity index 100%
rename from paddle/gserver/layers/DetectionUtil.cpp
rename to paddle/legacy/gserver/layers/DetectionUtil.cpp
diff --git a/paddle/legacy/gserver/layers/DetectionUtil.h b/paddle/legacy/gserver/layers/DetectionUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..c1e0bb809ad290613159f558e9b1860476b3b5f2
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DetectionUtil.h
@@ -0,0 +1,307 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <float.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/legacy/math/Matrix.h"
+
+using std::vector;
+using std::pair;
+using std::map;
+
+namespace paddle {
+
+template <typename T>
+struct BBoxBase {
+  BBoxBase(T xMin, T yMin, T xMax, T yMax)
+      : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {}
+
+  BBoxBase() {}
+
+  T getWidth() const { return xMax - xMin; }
+
+  T getHeight() const { return yMax - yMin; }
+
+  T getCenterX() const { return (xMin + xMax) / 2; }
+
+  T getCenterY() const { return (yMin + yMax) / 2; }
+
+  T getArea() const { return getWidth() * getHeight(); }
+
+  // coordinate of bounding box
+  T xMin;
+  T yMin;
+  T xMax;
+  T yMax;
+  // whether difficult object (e.g. object with heavy occlusion is difficult)
+  bool isDifficult;
+};
+
+struct NormalizedBBox : BBoxBase<real> {
+  NormalizedBBox() : BBoxBase<real>() {}
+};
+
+enum PermMode { kNCHWToNHWC, kNHWCToNCHW };
+
+/**
+ * @brief First permute input maxtrix then append to output matrix
+ */
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode);
+
+/**
+ * @brief First permute input maxtrix then decompose to output
+ */
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t totalSize,
+                            size_t offset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode);
+
+/**
+ * @brief Compute jaccard overlap between two bboxes.
+ * @param bbox1 The first bbox
+ * @param bbox2 The second bbox
+ */
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2);
+
+/**
+ * @brief Compute offset parameters between prior bbox and ground truth bbox
+ * and variances of prior bbox are considered
+ * @param priorBBox Input prior bbox
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param gtBBox Groundtruth bbox
+ * @param outVec Output vector
+ */
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec);
+
+/**
+ * @brief Decode prior bbox with offset parameters
+ * and variances of prior bbox are considered
+ * @param priorBBox Prior bbox to be decoded
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param locPredData Offset parameters
+ */
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData);
+
+/**
+ * @brief Extract bboxes from prior matrix, the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromPriorData(const real* priorData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract labels, scores and bboxes from detection matrix, the layout is
+ * imageId | label | score | xmin | ymin | xmax | ymax
+ * @param detectData Matrix of detection value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param labelVec Label of bbox
+ * @param scoreVec Score of bbox
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromDetectData(const real* detectData,
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract variances from prior matrix, the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param num Number to be extracted
+ * @param varVec Append to the vector
+ */
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec);
+
+/**
+ * @brief Extract bboxes from label matrix, the layout is
+ * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...
+ * @param labelData Matrix of label value
+ * @param numBBoxes Number to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+* @brief Match prior bbox to groundtruth bbox, the strategy is:
+1. Find the most overlaped bbox pair (prior and groundtruth)
+2. For rest of prior bboxes find the most overlaped groundtruth bbox
+* @param priorBBoxes prior bbox
+* @param gtBBoxes groundtruth bbox
+* @param overlapThreshold Low boundary of overlap (judge whether matched)
+* @param matchIndices For each prior bbox, groundtruth bbox index if matched
+otherwise -1
+* @param matchOverlaps For each prior bbox, overap with all groundtruth bboxes
+*/
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,
+               vector<int>* matchIndices,
+               vector<real>* matchOverlaps);
+
+/**
+* @brief Generate positive bboxes and negative bboxes,
+|positive bboxes|/|negative bboxes| is negPosRatio
+* @param priorValue Prior value
+* @param numPriorBBoxes Number of prior bbox
+* @param gtValue Groundtruth value
+* @param gtStartPosPtr Since groundtruth value stored as sequence type,
+this parameter indicates start position of each record
+* @param seqNum Number of sequence
+* @param maxConfScore Classification score for prior bbox, used to mine
+negative examples
+* @param batchSize Image number
+* @param overlapThreshold Low boundary of overap
+* @param negOverlapThreshold Upper boundary of overap (judge negative example)
+* @param negPosRatio Control number of negative bboxes
+* @param matchIndicesVecPtr Save indices of matched prior bbox
+* @param negIndicesVecPtr Save indices of negative prior bbox
+*/
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr);
+
+/**
+ * @brief Get max confidence score for each prior bbox
+ * @param confData Confidence scores, layout is
+ * class1 score | class2 score | ... | classN score ...
+ * @param batchSize Image number
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Classes number
+ * @param backgroundId Background id
+ * @param maxConfScoreVecPtr Ouput
+ */
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr);
+
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& pair1,
+                          const pair<real, T>& pair2);
+
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
+                          const pair<real, NormalizedBBox>& pair2);
+
+/**
+ * @brief Do NMS for bboxes to remove duplicated bboxes
+ * @param bboxes BBoxes to apply NMS
+ * @param confScoreData Confidence scores
+ * @param classIdx Class to do NMS
+ * @param topK Number to keep
+ * @param confThreshold Low boundary of confidence score
+ * @param nmsThreshold Threshold of overlap
+ * @param numPriorBBoxes Total number of prior bboxes
+ * @param numClasses Total class number
+ * @param indices Indices of high quality bboxes
+ */
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices);
+
+/**
+ * @brief Get detection results which satify requirements
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param backgroundId Background class
+ * @param batchSize Image number
+ * @param confThreshold Threshold of class confidence
+ * @param nmsTopK Used in NMS operation to keep top k bbox
+ * @param nmsThreshold Used in NMS, threshold of overlap
+ * @param keepTopK How many bboxes keeped in an image
+ * @param allDecodedBBoxes Decoded bboxes for all images
+ * @param allDetectionIndices Save detection bbox indices
+ */
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const real confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices);
+
+/**
+ * @brief Get detection results
+ * @param confData Confidence scores
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param batchSize Image number
+ * @param allIndices Indices of predicted bboxes
+ * @param allDecodedBBoxes BBoxes decoded
+ * @param out Output matrix
+ * image number | label | confidence score | xMin | yMin | xMax | yMax
+ */
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out);
+
+NormalizedBBox clipBBox(const NormalizedBBox& bbox);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DotMulOperator.cpp b/paddle/legacy/gserver/layers/DotMulOperator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..03d18d9b239e57dc41334462f2324ae2d0505a62
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DotMulOperator.cpp
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Operator.h"
+
+namespace paddle {
+
+/**
+ * DotMulOperator takes two inputs, performs element-wise multiplication:
+ * \f[
+ *   out.row[i] += scale * (in1.row[i] .* in2.row[i])
+ * \f]
+ * where \f$.*\f$ means element-wise multiplication,
+ * and scale is a config scalar, its default value is one.
+ *
+ * The config file api is dotmul_operator.
+ */
+class DotMulOperator : public Operator {
+ public:
+  DotMulOperator(const OperatorConfig& config, bool useGpu);
+  virtual void forward();
+  virtual void backward();
+};
+
+REGISTER_OPERATOR(dot_mul, DotMulOperator);
+
+DotMulOperator::DotMulOperator(const OperatorConfig& config, bool useGpu)
+    : Operator(config, useGpu) {
+  CHECK_EQ(config_.input_indices_size(), 2L);
+}
+
+void DotMulOperator::forward() {
+  out_->value->addDotMul(
+      *ins_[0]->value, *ins_[1]->value, 1, config_.dotmul_scale());
+}
+
+void DotMulOperator::backward() {
+  const MatrixPtr& inV0 = ins_[0]->value;
+  const MatrixPtr& inV1 = ins_[1]->value;
+  const MatrixPtr& inG0 = ins_[0]->grad;
+  const MatrixPtr& inG1 = ins_[1]->grad;
+
+  if (inG0) {
+    inG0->addDotMul(*out_->grad, *inV1, 1, config_.dotmul_scale());
+  }
+  if (inG1) {
+    inG1->addDotMul(*out_->grad, *inV0, 1, config_.dotmul_scale());
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DotMulProjection.cpp b/paddle/legacy/gserver/layers/DotMulProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d7780387670e83af24fa342be3d596b618b1f677
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DotMulProjection.cpp
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * DotMulProjection performs element-wise multiplication with weight:
+ * \f[
+ *   out.row[i] += in.row[i] .* weight
+ * \f]
+ * where \f$.*\f$ means element-wise multiplication.
+ *
+ * The config file api is dotmul_projection.
+ */
+class DotMulProjection : public Projection {
+ public:
+  DotMulProjection(const ProjectionConfig& config,
+                   const ParameterPtr& parameter,
+                   bool useGpu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+
+ protected:
+  /// shared memory with parameter
+  std::unique_ptr<Weight> weight_;
+};
+
+REGISTER_PROJECTION(dot_mul, DotMulProjection);
+
+DotMulProjection::DotMulProjection(const ProjectionConfig& config,
+                                   const ParameterPtr& parameter,
+                                   bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  weight_.reset(new Weight(1LU, config.output_size(), parameter));
+}
+
+void DotMulProjection::forward() {
+  out_->value->addDotMulMMV(*in_->value, *(weight_->getW()));
+}
+
+void DotMulProjection::backward(const UpdateCallback& callback) {
+  /* Calculate the W-gradient for the current layer */
+  if (weight_->getWGrad()) {
+    weight_->getWGrad()->addDotMulVMM(*out_->grad, *in_->value);
+  }
+
+  /* Calculate the input layers error */
+  if (in_->grad) {
+    in_->grad->addDotMulMMV(*out_->grad, *(weight_->getW()));
+  }
+
+  parameter_->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/DotProdLayer.cpp b/paddle/legacy/gserver/layers/DotProdLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..06060d93f76c18d893852a5f5c99c36fe5641b2e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/DotProdLayer.cpp
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for computing the dot product of two vectors.
+ * Input1: vector (batchSize * dim)
+ * Input2: vector (batchSize * dim)
+ * Output: a matrix: (batchSize * 1)
+ */
+
+class DotProdLayer : public Layer {
+ public:
+  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~DotProdLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(dot_prod, DotProdLayer);
+
+bool DotProdLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+  CHECK_EQ(1UL, getSize())
+      << "The output dimensionality of this layer should be fixed to 1.";
+
+  return true;
+}
+
+void DotProdLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  CHECK_EQ(inV1->getHeight(), batchSize);
+  CHECK_EQ(inV0->getWidth(), inV1->getWidth());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, 1);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
+    outV->sumOfProducts(*inV0, *inV1, 1, 0);
+  }
+}
+
+void DotProdLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+
+  {
+    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
+
+    if (inG0) {
+      inG0->addRowScale(0, *inV1, *outG);
+    }
+
+    if (inG1) {
+      inG1->addRowScale(0, *inV0, *outG);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp b/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..38671126c62ba36e22496dcbe1ff3c8d6dcea742
--- /dev/null
+++ b/paddle/legacy/gserver/layers/EosIdCheckLayer.cpp
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+/**
+ * A layer for checking EOS for each sample:
+ * - output_id = (input_id == conf.eos_id)
+ *
+ * The result is stored in output_.ids.
+ * It is used by recurrent layer group.
+ */
+class EosIdCheckLayer : public Layer {
+ public:
+  explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    bool ret = Layer::init(layerMap, parameterMap);
+    CHECK_EQ(1UL, inputLayers_.size());
+    return ret;
+  }
+
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+
+    const Argument& input = getInput(0);
+    IVector::resizeOrCreate(output_.ids, input.ids->getSize(), useGpu_);
+    output_.ids->isEqualTo(*input.ids, config_.eos_id());
+  }
+
+  void backward(const UpdateCallback& callback) override {}
+};
+
+REGISTER_LAYER(eos_id, EosIdCheckLayer);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandConvLayer.cpp b/paddle/legacy/gserver/layers/ExpandConvLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a53db380686cea2ad121c948c45a0fa1154381e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ExpandConvLayer.cpp
@@ -0,0 +1,248 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ExpandConvLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+DEFINE_bool(use_nnpack,
+            false,
+            "Whether to use nnpack for convolution calculation.");
+
+namespace paddle {
+
+/*
+ * The calculation of the exconvt(convolution transpose (deconv) operation)
+ * is a swap of forward and backward of the calculation of exconv.
+ * */
+REGISTER_LAYER(exconv, ExpandConvLayer);
+REGISTER_LAYER(exconvt, ExpandConvLayer);
+
+inline bool isDepthwiseConv(int channels, int groups) {
+  return channels == groups;
+}
+
+bool ExpandConvLayer::init(const LayerMap &layerMap,
+                           const ParameterMap &parameterMap) {
+  /* Initialize the basic convolutional parent class */
+  ConvBaseLayer::init(layerMap, parameterMap);
+
+  int index = 0;
+  for (auto &inputConfig : config_.inputs()) {
+    const ConvConfig &conf = inputConfig.conv_conf();
+    /* Consistent caffe mode for multiple input */
+    caffeMode_ = conf.caffe_mode();
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[index] * filterChannels_[index];
+    width = (!isDeconv_) ? numFilters_ : channels_[index];
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    index++;
+  }
+
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ = std::unique_ptr<Weight>(
+          new Weight(1, numFilters_, biasParameter_, 0));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_, 0));
+    }
+  }
+
+  getOutputSize();
+
+  size_t numInputs = config_.inputs_size();
+  inputShape_.resize(numInputs);
+  filterShape_.resize(numInputs);
+  outputShape_.resize(numInputs);
+
+  std::string convType;
+  std::string convGradInputType;
+  std::string convGradFilterType;
+
+  for (int i = 0; i < config_.inputs_size(); i++) {
+    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
+    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
+    std::vector<size_t> dilations = {(size_t)dilationY_[i],
+                                     (size_t)dilation_[i]};
+
+    bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
+
+    // Convolution Layer uses the GemmConv function by default.
+    convType = "GemmConv";
+    convGradInputType = "GemmConvGradInput";
+    convGradFilterType = "GemmConvGradFilter";
+
+    // If depth wise convolution and useGpu == true
+    if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
+      convType = "DepthwiseConv";
+      convGradInputType = "DepthwiseConvGradInput";
+      convGradFilterType = "DepthwiseConvGradFilter";
+    }
+
+    // If depth wise convolution and useGpu == false and ARM-NEON
+    if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+      if ((filterSize_[i] == filterSizeY_[i]) &&
+          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
+          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
+          !useDilation) {
+        convType = "NeonDepthwiseConv";
+      }
+#endif
+    }
+
+    if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
+      createFunction(forward_,
+                     "NNPACKConv",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i])
+                         .set("algo", std::string("auto")));
+    } else {
+      createFunction(forward_,
+                     !isDeconv_ ? convType : convGradInputType,
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("dilations", dilations)
+                         .set("groups", (size_t)groups_[i]));
+
+      createFunction(backward_,
+                     !isDeconv_ ? convGradInputType : convType,
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("dilations", dilations)
+                         .set("groups", (size_t)groups_[i]));
+
+      createFunction(backward_,
+                     convGradFilterType,
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("dilations", dilations)
+                         .set("groups", (size_t)groups_[i]));
+    }
+  }
+  return true;
+}
+
+size_t ExpandConvLayer::getOutputSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  size_t layerSize = ConvBaseLayer::calOutputSize();
+  return layerSize;
+}
+
+// i is the index of input layers
+#define BACKWARD_INPUT(i, inputs, outputs) \
+  backward_[2 * i]->calc(inputs, outputs)
+#define BACKWARD_FILTER(i, inputs, outputs) \
+  backward_[2 * i + 1]->calc(inputs, outputs)
+
+void ExpandConvLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  resetOutput(batchSize, getOutputSize());
+
+  // Calculate the shape of the input, output, and filter.
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    inputShape_[i] = TensorShape({(size_t)batchSize,
+                                  (size_t)channels_[i],
+                                  (size_t)imgSizeH_[i],
+                                  (size_t)imgSizeW_[i]});
+    filterShape_[i] =
+        TensorShape({(size_t)groups_[i],
+                     !isDeconv_ ? (size_t)numFilters_ / groups_[i]
+                                : (size_t)channels_[i] / groups_[i],
+                     !isDeconv_ ? (size_t)channels_[i] / groups_[i]
+                                : (size_t)numFilters_ / groups_[i],
+                     (size_t)filterSizeY_[i],
+                     (size_t)filterSize_[i]});
+    outputShape_[i] = TensorShape({(size_t)batchSize,
+                                   (size_t)numFilters_,
+                                   (size_t)outputH_[i],
+                                   (size_t)outputW_[i]});
+  }
+
+  // Calculate the output value.
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getInputValue(i), inputShape_[i]);
+    inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
+    outputs.addArg(*getOutputValue(),
+                   outputShape_[i],
+                   !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO);
+
+    forward_[i]->calc(inputs, outputs);
+  }
+
+  /* add the bias-vector */
+  if (biases_.get()) {
+    output_.value->addBias(*biases_->getW(), 1.0, sharedBiases_);
+  }
+
+  /* activation */
+  forwardActivation();
+}
+
+void ExpandConvLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+
+  MatrixPtr outGrad = getOutputGrad();
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBiases_);
+    /* Increasing the number of gradient */
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  // Calculate the input grad and filter grad.
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    if (getInputGrad(i)) {
+      BufferArgs inputs;
+      BufferArgs outputs;
+      inputs.addArg(*getOutputGrad(), outputShape_[i]);
+      inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
+      outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO);
+      BACKWARD_INPUT(i, inputs, outputs);
+    }
+
+    if (weights_[i]->getWGrad()) {
+      BufferArgs inputs;
+      BufferArgs outputs;
+      if (!isDeconv_) {
+        inputs.addArg(*getOutputGrad(), outputShape_[i]);
+        inputs.addArg(*getInputValue(i), inputShape_[i]);
+      } else {
+        inputs.addArg(*getInputValue(i), inputShape_[i]);
+        inputs.addArg(*getOutputGrad(), outputShape_[i]);
+      }
+      outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO);
+      BACKWARD_FILTER(i, inputs, outputs);
+
+      /* Increasing the number of gradient */
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandConvLayer.h b/paddle/legacy/gserver/layers/ExpandConvLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0eff3ab061949bd583e0deaf121912ed993be76
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ExpandConvLayer.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of convolution layer.
+ * This layer expands input and use matrix multiplication to
+ * calculate convolution operation.
+ *
+ * The config file api is img_conv_layer.
+ */
+
+class ExpandConvLayer : public ConvBaseLayer {
+ public:
+  explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+
+  ~ExpandConvLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+  size_t getOutputSize();
+
+ protected:
+  std::vector<TensorShape> inputShape_;
+  std::vector<TensorShape> filterShape_;
+  std::vector<TensorShape> outputShape_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandLayer.cpp b/paddle/legacy/gserver/layers/ExpandLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..074fbab8ef9d1453160058031be370e991459fa5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ExpandLayer.cpp
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ExpandLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(expand, ExpandLayer);
+
+bool ExpandLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 2UL);
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+  // which sequence type of input[0]
+  if (config_.trans_type() == "non-seq") {
+    type_ = kNonSeq;
+  } else if (config_.trans_type() == "seq") {
+    type_ = kSeq;
+  } else {
+    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
+  }
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void ExpandLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  // Expand layer should have exactly 2 input, one for data, one for size
+  CHECK_EQ(2U, inputLayers_.size());
+
+  // using two input:
+  // * first one for data;
+  // * second one only for sequence info
+  const Argument& shapeInput = getInput(1);
+  const Argument& dataInput = getInput(0);
+  size_t outputBatchSize = shapeInput.getBatchSize();
+  auto startPositions = type_ ? shapeInput.subSequenceStartPositions
+                              : shapeInput.sequenceStartPositions;
+  size_t numSequences = startPositions->getSize() - 1;
+  const int* starts = startPositions->getData(false);
+
+  CHECK_EQ(starts[numSequences], shapeInput.getBatchSize());
+  if (type_) {
+    // when trans_type = seq, input[1] must hasSubseq
+    CHECK_EQ(shapeInput.hasSubseq(), 1UL);
+    CHECK_EQ(dataInput.getNumSequences(), shapeInput.getNumSequences());
+  } else {
+    CHECK_EQ(dataInput.getBatchSize(), shapeInput.getNumSequences());
+  }
+
+  // set output sequence info as shape sequence
+  output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
+  if (shapeInput.hasSubseq()) {
+    output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
+  }
+
+  // reserve output: Expand output to batchsize of sequence data.
+  reserveOutput(outputBatchSize, dataInput.value->getWidth());
+
+  MatrixPtr inputValue = getInputValue(0);
+  MatrixPtr outputValue = getOutputValue();
+
+  ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
+  int* expandStarts = expandStartsPos_->getMutableData(false);
+  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
+    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
+    for (int j = 0; j < sequenceLength; j++) {
+      expandStarts[starts[sequenceId] + j] = sequenceId;
+    }
+  }
+
+  outputValue->copyByRowIndex(*inputValue,
+                              *expandStartsPos_->getVector(useGpu_));
+
+  if (biases_.get() != NULL) {
+    outputValue->addBias(*(biases_->getW()), 1);
+  }
+}
+
+void ExpandLayer::backward(const UpdateCallback& callback) {
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+    /* Increasing the number of gradient */
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  if (!getInputGrad(0)) return;
+  MatrixPtr inputGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
+                              : getInput(1).sequenceStartPositions;
+  size_t numSequences = cpuSeqStartPos->getSize() - 1;
+  const int* starts = cpuSeqStartPos->getData(false);
+
+  CHECK_EQ(inputGrad->getWidth(), outputGrad->getWidth());
+  CHECK_EQ(outputGrad->getHeight(), (size_t)starts[numSequences]);
+
+  AsyncGpuBlock asyncGpuBlock;
+
+  // sum to get the grad
+  real scale = 1;
+  for (size_t sequenceId = 0; sequenceId < numSequences; sequenceId++) {
+    // TODO(Dangqingqing) optimization for GPU
+    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
+    if (sequenceLength == 0) {
+      // empty sequence
+      continue;
+    }
+    MatrixPtr copyData = inputGrad->subMatrix(sequenceId, 1);
+    copyData->collectBias(
+        *outputGrad->subMatrix(starts[sequenceId], sequenceLength), scale);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ExpandLayer.h b/paddle/legacy/gserver/layers/ExpandLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..75a1ec75688cdbc61a117da7d4be47848c30425a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ExpandLayer.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * A layer for "Expand Dense data or (sequence data where the length of each
+ * sequence is one) to sequence data."
+ *
+ * It should have exactly 2 input, one for data, one for size:
+ * - first one for data
+ *   - If ExpandLevel = kNonSeq: dense data
+ *   - If ExpandLevel = kSeq: sequence data where the length of each sequence is
+ * one
+ * - second one only for sequence info
+ *   - should be sequence data with or without sub-sequence.
+ *
+ * And the output size is the batch size(not instances) of second input.
+ *
+ * The config file api is expand_layer.
+ */
+
+class ExpandLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+  /// if input[0] is dense data, ExpandLevel=kNonSeq;
+  /// if input[0] is sequence data, ExpandLevel=kSeq
+  enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
+  /// store the ExpandLevel
+  int type_;
+  /// expanded sequenceStartPositions or subSequenceStartPositions
+  /// of input[1]
+  ICpuGpuVectorPtr expandStartsPos_;
+
+ public:
+  explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ExpandLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp b/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6cf269fa3ffb3f4a2864aea4225d9401930e73b1
--- /dev/null
+++ b/paddle/legacy/gserver/layers/FactorizationMachineLayer.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  factorSize_ = config_.factor_size();
+
+  /* initialize the latentVectors_ */
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
+
+  return true;
+}
+
+void FactorizationMachineLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const MatrixPtr& inputV = getInputValue(0);
+
+  size_t batchSize = inputV->getHeight();
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
+
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
+
+  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
+  outV->sumRows(*tmpOut_, 0.5, 0);
+
+  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
+                                       inputV->getHeight(),
+                                       inputV->getWidth(),
+                                       inputV->getElementCnt(),
+                                       inputV->getValueType());
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
+  } else {
+    Matrix::resizeOrCreate(
+        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+    inputV->square2(*inputSquare_);
+  }
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
+  outV->sumRows(*tmpOut_, -0.5, 1.0);
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+
+  const MatrixPtr& inputV = getInputValue(0);
+  const MatrixPtr& oGrad = getOutputGrad();
+
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
+
+  /* Calculate the gradients of the latentVectors_ matrix */
+  if (latentVectors_->getWGrad()) {
+    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
+                                         inputV->getHeight(),
+                                         inputV->getWidth(),
+                                         inputV->getElementCnt());
+
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
+    } else {
+      Matrix::resizeOrCreate(
+          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+
+      tmpInput_->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
+
+      tmpSum_->sumCols(*tmpInput_, -1, 0);
+    }
+
+    latentVectors_->getWGrad()->addRowScale(
+        0, *latentVectors_->getW(), *tmpSumTrans);
+
+    /* Increasing the number of gradient */
+    latentVectors_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers gradient */
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad != NULL) {
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
+    inGrad->rowScale(0, *inGrad, *oGrad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FactorizationMachineLayer.h b/paddle/legacy/gserver/layers/FactorizationMachineLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc015ed727bbd8781bb50a22b8e745d8896837e1
--- /dev/null
+++ b/paddle/legacy/gserver/layers/FactorizationMachineLayer.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as inner product of the learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interaction, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interations with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculation for forward and backward can be found at this paper:
+ *
+ *     Factorization machines.
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+ protected:
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
+  std::unique_ptr<Weight> latentVectors_;
+  // The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+ private:
+  // Store the square values of the letent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Store temporary calculation result
+  MatrixPtr tmpOut_;
+  MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
+  // Negative identity matrix
+  MatrixPtr negOnes_;
+
+ public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a3fe1433e4b5fd7bd77f8d6bb73378243d391dd5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/FeatureMapExpandLayer.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for expanding a batch of images to feature maps.
+ * Each data of the input is a 2 dimensional matrix. Each element of the matrix
+ * is replicated num_filters times to create a feature map with num_filters
+ * channels.
+ * - Input: Input one should be dense image data.
+ * - Output: expanded fature maps.
+ * \f[
+ *  y.row[i] = x.row[i \mod x.width], i = 0,1,..., (x.width * num\_filters - 1)
+ * \f]
+ * For example, num_filters = 4:
+ * @code
+ *   x = [a1,a2;
+ *        b1,b2]
+ *   y = [a1, a2, a1, a2, a1, a2, a1, a2;
+ *        b1, b2, b1, b2, b1, b2, b1, b2;]
+ * @endcode
+ */
+
+class FeatureMapExpandLayer : public Layer {
+ private:
+  int numFilters_;
+  bool asRowVector_;
+
+ public:
+  explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~FeatureMapExpandLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer);
+
+bool FeatureMapExpandLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  numFilters_ = config_.num_filters();
+  asRowVector_ = config_.user_arg() != "as_col_vec";
+  return true;
+}
+
+void FeatureMapExpandLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr inputV = getInputValue(0);
+  size_t batchSize = getInput(0).getBatchSize();
+  int imgSize = inputV->getWidth();
+  resetOutput(batchSize, imgSize * numFilters_);
+
+  MatrixPtr outputV = getOutputValue();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        outVTmp->addRowVector(*inVTmp);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        outVTmp->addColVector(*inVTmp);
+      }
+    }
+  }
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inGrad = getInputGrad(0);
+  if (NULL == inGrad) {
+    return;
+  }
+  MatrixPtr outGrad = getOutputGrad();
+  size_t batchSize = getInput(0).getBatchSize();
+  int imgSize = inGrad->getWidth();
+  /* Do activation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        inGradTmp->collectBias(*outGradTmp, 1);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        inGradTmp->sumRows(*outGradTmp, 1, 1);
+      }
+    }
+  }
+}
+
+}  // namespace paddle.
diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/legacy/gserver/layers/FullMatrixProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/FullMatrixProjection.cpp
rename to paddle/legacy/gserver/layers/FullMatrixProjection.cpp
diff --git a/paddle/legacy/gserver/layers/FullMatrixProjection.h b/paddle/legacy/gserver/layers/FullMatrixProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..c33d02a3aeac8e83f613e61320cb6cd63baeae83
--- /dev/null
+++ b/paddle/legacy/gserver/layers/FullMatrixProjection.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/legacy/utils/Stat.h"
+
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * FullMatrixProjection performs full matrix multiplication:
+ * \f[
+ *    out.row[i] += in.row[i] * weight
+ * \f]
+ *
+ * The config file api is full_matrix_projection.
+ */
+class FullMatrixProjection : public Projection {
+ public:
+  FullMatrixProjection(const ProjectionConfig& config,
+                       const ParameterPtr& parameter,
+                       bool useGpu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+
+ protected:
+  std::unique_ptr<Weight> weight_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp b/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..07f4dfbe39c6b9bc233b3c75b4b5891a1ec9b2ec
--- /dev/null
+++ b/paddle/legacy/gserver/layers/FullyConnectedLayer.cpp
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FullyConnectedLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(fc, FullyConnectedLayer);
+
+bool FullyConnectedLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  /* initialize the weightList */
+  CHECK(inputLayers_.size() == parameters_.size());
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    // Option the parameters
+    size_t height = inputLayers_[i]->getSize();
+    size_t width = getSize();
+
+    // create a new weight
+    if (parameters_[i]->isSparse()) {
+      CHECK_LE(parameters_[i]->getSize(), width * height);
+    } else {
+      CHECK_EQ(parameters_[i]->getSize(), width * height);
+    }
+    Weight* w = new Weight(height, width, parameters_[i]);
+
+    // append the new weight to the list
+    weights_.emplace_back(w);
+  }
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+
+  return true;
+}
+
+void FullyConnectedLayer::prefetch() {
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    auto* sparseParam =
+        dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
+    if (sparseParam) {
+      MatrixPtr input = getInputValue(i);
+      sparseParam->addRows(input);
+    }
+  }
+}
+
+void FullyConnectedLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInput(0).getBatchSize();
+  int size = getSize();
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, size);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    auto input = getInput(i);
+    CHECK(input.value) << "The input of 'fc' layer must be matrix";
+    REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
+    i == 0 ? outV->mul(*input.value, *weights_[i]->getW(), 1, 0)
+           : outV->mul(*input.value, *weights_[i]->getW(), 1, 1);
+  }
+
+  /* add the bias-vector */
+  if (biases_.get() != NULL) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FullyConnectedLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  if (biases_ && biases_->getWGrad()) {
+    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    /* Increasing the number of gradient */
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  bool syncFlag = hl_get_sync_flag();
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    /* Calculate the W-gradient for the current layer */
+    if (weights_[i]->getWGrad()) {
+      MatrixPtr input_T = getInputValue(i)->getTranspose();
+      MatrixPtr oGrad = getOutputGrad();
+      {
+        REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
+        weights_[i]->getWGrad()->mul(*input_T, *oGrad, 1, 1);
+      }
+    }
+
+    // If callback does not change value, backprop error asynchronously so that
+    // we can do the callback concurrently.
+    hl_set_sync_flag(false);
+
+    /* Calculate the input layers error */
+    MatrixPtr preGrad = getInputGrad(i);
+    if (NULL != preGrad) {
+      MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
+      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
+      preGrad->mul(*getOutputGrad(), *weights_T, 1, 1);
+    }
+
+    hl_set_sync_flag(syncFlag);
+    {
+      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/FullyConnectedLayer.h b/paddle/legacy/gserver/layers/FullyConnectedLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e29cac0437a8ae735ffb71e5ee901edd79fa7f3
--- /dev/null
+++ b/paddle/legacy/gserver/layers/FullyConnectedLayer.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * A layer has full connections to all neurons in the previous layer.
+ * It computes an inner product with a set of learned weights, and
+ * (optionally) adds biases.
+ *
+ * The config file api is fc_layer.
+ */
+
+class FullyConnectedLayer : public Layer {
+ protected:
+  WeightList weights_;
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
+  ~FullyConnectedLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  Weight& getWeight(int idx) { return *weights_[idx]; }
+
+  void prefetch() override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bdcd445cb47de346a8ca496fdaecf7d1f841f51e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/GatedRecurrentLayer.cpp
@@ -0,0 +1,414 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GatedRecurrentLayer.h"
+#include "Layer.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(gated_recurrent, GatedRecurrentLayer);
+
+bool GatedRecurrentLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK_EQ(1U, inputLayers_.size());
+  CHECK_EQ(1U, parameters_.size());
+  CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
+  CHECK_EQ(getSize() * 3, biasParameter_->getSize());
+  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
+  gateWeight_.reset(new Weight(getSize(), getSize() * 2, parameters_[0], 0));
+  stateWeight_.reset(new Weight(
+      getSize(), getSize(), parameters_[0], 2 * getSize() * getSize()));
+  if (biasParameter_.get() != NULL) {
+    bias_.reset(new Weight(1, getSize() * 3, biasParameter_));
+  }
+
+  reversed_ = config_.reversed();
+  activationGate_.reset(ActivationFunction::create(config_.active_gate_type()));
+
+  GruCompute::init(config_);
+  useBatch_ = true;
+
+  return true;
+}
+
+void GatedRecurrentLayer::resetState() {
+  CHECK(!reversed_) << "state is not allowed for reversed gated "
+                       "recurrent layer";
+  Matrix::resizeOrCreate(
+      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
+  prevOutput_->zeroMem();
+
+  // TODO(hedaoyuan): support prev_batch_state
+  CHECK(!FLAGS_prev_batch_state) << "Not supported";
+
+  useBatch_ = false;
+}
+
+void GatedRecurrentLayer::setState(LayerStatePtr state) {
+  CHECK(state->value.size() == 1)
+      << "one matrix is expected for GatedRecurrentLayer state";
+  prevOutput_->copyFrom(*(state->value[0]));
+}
+
+LayerStatePtr GatedRecurrentLayer::getState() {
+  LayerStatePtr res = std::make_shared<LayerState>();
+  res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
+  res->value[0]->copyFrom(*prevOutput_);
+  return res;
+}
+
+void GatedRecurrentLayer::forward(PassType passType) {
+  REGISTER_TIMER_INFO("GruFwTimer", getName().c_str());
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  CHECK(input.sequenceStartPositions);
+  int batchSize = input.getBatchSize();
+  size_t numSequences = input.getNumSequences();
+  resetOutput(batchSize, getSize());
+  CHECK_EQ(getSize() * 3, input.value->getWidth());
+  const int* starts = input.sequenceStartPositions->getData(false);
+  // batchSize = length of total frames in a batch (NOT size of mini-batch)
+  CHECK_EQ(starts[numSequences], batchSize);
+
+  Matrix::resizeOrCreate(gate_.value,
+                         /* height= */ batchSize,
+                         getSize() * 3,
+                         /* trans= */ false,
+                         useGpu_);
+  Matrix::resizeOrCreate(resetOutput_.value,
+                         /* height= */ batchSize,
+                         getSize(),
+                         /* trans= */ false,
+                         useGpu_);
+
+  if (useBatch_) {
+    forwardBatch(batchSize, numSequences, starts, input.value);
+  } else {
+    forwardSequence(batchSize, numSequences, starts, input.value);
+  }
+}
+
+void GatedRecurrentLayer::backward(const UpdateCallback& callback) {
+  REGISTER_TIMER_INFO("GruBwTimer", getName().c_str());
+  const Argument& input = getInput(0);
+  CHECK(input.sequenceStartPositions);
+  int batchSize = input.getBatchSize();
+  const int* starts = input.sequenceStartPositions->getData(false);
+  size_t numSequences = input.getNumSequences();
+
+  Matrix::resizeOrCreate(gate_.grad,
+                         /* height= */ batchSize,
+                         getSize() * 3,
+                         /* trans= */ false,
+                         useGpu_);
+  Matrix::resizeOrCreate(resetOutput_.grad,
+                         /* height= */ batchSize,
+                         getSize(),
+                         /* trans= */ false,
+                         useGpu_);
+
+  if (useBatch_) {
+    backwardBatch(batchSize, input.grad);
+  } else {
+    backwardSequence(batchSize, numSequences, starts, input.grad);
+  }
+
+  if (bias_) {
+    bias_->getParameterPtr()->incUpdate(callback);
+  }
+
+  weight_->getParameterPtr()->incUpdate(callback);
+}
+
+void GatedRecurrentLayer::forwardSequence(int batchSize,
+                                          size_t numSequences,
+                                          const int* starts,
+                                          MatrixPtr inputValue) {
+  REGISTER_TIMER_INFO("GruFwSequenceTime", getName().c_str());
+  gate_.value->assign(*inputValue);
+  if (bias_) {
+    gate_.value->addBias(*(bias_->getW()), 1);
+  }
+
+  hl_gru_value gruValue;
+  gruValue.gateWeight = (gateWeight_->getW())->getData();
+  gruValue.stateWeight = (stateWeight_->getW())->getData();
+  gruValue.gateValue = gate_.value->getData();
+  gruValue.resetOutputValue = resetOutput_.value->getData();
+  gruValue.outputValue = output_.value->getData();
+  gruValue.prevOutValue = nullptr;
+
+  if (reversed_) {
+    gruValue.gateValue += (batchSize - 1) * getSize() * 3;
+    gruValue.resetOutputValue += (batchSize - 1) * getSize();
+    gruValue.outputValue += (batchSize - 1) * getSize();
+  }
+
+  auto nextFrame = [&gruValue](bool reversed, int frameSize) {
+    gruValue.prevOutValue = gruValue.outputValue;
+    if (!reversed) {
+      gruValue.gateValue += frameSize * 3;
+      gruValue.resetOutputValue += frameSize;
+      gruValue.outputValue += frameSize;
+    } else {
+      gruValue.gateValue -= frameSize * 3;
+      gruValue.resetOutputValue -= frameSize;
+      gruValue.outputValue -= frameSize;
+    }
+  };
+
+  if (!reversed_) {
+    if (prevOutput_) {
+      gruValue.prevOutValue = prevOutput_->getData();
+    }
+  }
+  AsyncGpuBlock asyncGpuBlock;
+  for (size_t n = 0; n < numSequences; ++n) {
+    int length;
+    if (!reversed_) {
+      length = starts[n + 1] - starts[n];
+    } else {
+      length = starts[numSequences - n] - starts[numSequences - n - 1];
+    }
+    for (int l = 0; l < length; ++l) {
+      if (useGpu_) {
+        GruCompute::forward<1>(gruValue, getSize());
+      } else {
+        GruCompute::forward<0>(gruValue, getSize());
+      }
+
+      nextFrame(reversed_, getSize());
+    }
+    if (!reversed_) {
+      if (!prevOutput_) gruValue.prevOutValue = nullptr;
+    } else {
+      gruValue.prevOutValue = nullptr;
+    }
+  }
+
+  if (!reversed_) {
+    if (prevOutput_) {
+      prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1));
+    }
+  }
+}
+
+void GatedRecurrentLayer::backwardSequence(int batchSize,
+                                           size_t numSequences,
+                                           const int* starts,
+                                           MatrixPtr inputGrad) {
+  REGISTER_TIMER_INFO("GruBwSequenceTime", getName().c_str());
+
+  hl_gru_value gruValue;
+  gruValue.gateWeight = (gateWeight_->getW())->getData();
+  gruValue.stateWeight = (stateWeight_->getW())->getData();
+  gruValue.gateValue = gate_.value->getData();
+  gruValue.resetOutputValue = resetOutput_.value->getData();
+  gruValue.outputValue = output_.value->getData();
+
+  hl_gru_grad gruGrad;
+  gruGrad.gateWeightGrad =
+      (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
+  gruGrad.stateWeightGrad =
+      (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData()
+                                : nullptr);
+  gruGrad.gateGrad = gate_.grad->getData();
+  gruGrad.resetOutputGrad = resetOutput_.grad->getData();
+  gruGrad.outputGrad = output_.grad->getData();
+
+  if (!reversed_) {
+    gruValue.gateValue += (batchSize - 1) * getSize() * 3;
+    gruValue.resetOutputValue += (batchSize - 1) * getSize();
+    gruValue.outputValue += (batchSize - 1) * getSize();
+    gruGrad.gateGrad += (batchSize - 1) * getSize() * 3;
+    gruGrad.resetOutputGrad += (batchSize - 1) * getSize();
+    gruGrad.outputGrad += (batchSize - 1) * getSize();
+    gruValue.prevOutValue = gruValue.outputValue - getSize();
+    gruGrad.prevOutGrad = gruGrad.outputGrad - getSize();
+  } else {
+    gruValue.prevOutValue = gruValue.outputValue + getSize();
+    gruGrad.prevOutGrad = gruGrad.outputGrad + getSize();
+  }
+
+  auto nextFrame = [&gruValue, &gruGrad](bool reversed, int frameSize) {
+    if (reversed) {
+      gruValue.gateValue += frameSize * 3;
+      gruValue.resetOutputValue += frameSize;
+      gruValue.outputValue += frameSize;
+      gruGrad.gateGrad += frameSize * 3;
+      gruGrad.resetOutputGrad += frameSize;
+      gruGrad.outputGrad += frameSize;
+      gruValue.prevOutValue = gruValue.outputValue + frameSize;
+      gruGrad.prevOutGrad = gruGrad.outputGrad + frameSize;
+    } else {
+      gruValue.gateValue -= frameSize * 3;
+      gruValue.resetOutputValue -= frameSize;
+      gruValue.outputValue -= frameSize;
+      gruGrad.gateGrad -= frameSize * 3;
+      gruGrad.resetOutputGrad -= frameSize;
+      gruGrad.outputGrad -= frameSize;
+      gruValue.prevOutValue = gruValue.outputValue - frameSize;
+      gruGrad.prevOutGrad = gruGrad.outputGrad - frameSize;
+    }
+  };
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    for (size_t n = 0; n < numSequences; ++n) {
+      int length;
+      if (reversed_) {
+        length = starts[n + 1] - starts[n];
+      } else {
+        length = starts[numSequences - n] - starts[numSequences - n - 1];
+      }
+      for (int l = 0; l < length; ++l) {
+        if (l == length - 1) {
+          gruValue.prevOutValue = nullptr;
+          gruGrad.prevOutGrad = nullptr;
+        }
+        if (useGpu_) {
+          GruCompute::backward<1>(gruValue, gruGrad, getSize());
+        } else {
+          GruCompute::backward<0>(gruValue, gruGrad, getSize());
+        }
+        nextFrame(reversed_, getSize());
+      }
+    }
+  }
+
+  if (inputGrad) {
+    inputGrad->add(*gate_.grad);
+  }
+  if (bias_ && bias_->getWGrad()) {
+    bias_->getWGrad()->collectBias(*gate_.grad, 1);
+  }
+}
+
+void GatedRecurrentLayer::forwardBatch(int batchSize,
+                                       size_t numSequences,
+                                       const int* starts,
+                                       MatrixPtr inputValue) {
+  REGISTER_TIMER_INFO("GruFwBatchTime", getName().c_str());
+  hl_gru_value gruValue;
+  gruValue.gateWeight = (gateWeight_->getW())->getData();
+  gruValue.stateWeight = (stateWeight_->getW())->getData();
+
+  if (!batchValue_) {
+    batchValue_.reset(new SequenceToBatch(useGpu_));
+  }
+  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
+
+  batchValue_->resizeOrCreate(*output_.value);
+  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
+  if (bias_) {
+    gate_.value->addBias(*(bias_->getW()), 1);
+  }
+
+  {
+    int numBatch = batchValue_->getNumBatch();
+    int curBatchSize = 0;
+    AsyncGpuBlock asyncGpuBlock;
+    for (int n = 0; n < numBatch; n++) {
+      MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
+      gruValue.outputValue = outputValueTmp->getData();
+      gruValue.gateValue =
+          (batchValue_->getBatchValue(*gate_.value, n))->getData();
+      gruValue.resetOutputValue =
+          (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();
+
+      curBatchSize = outputValueTmp->getHeight();
+      gruValue.prevOutValue =
+          (n == 0
+               ? nullptr
+               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());
+
+      {
+        if (useGpu_) {
+          GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
+        } else {
+          GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
+        }
+      }
+    }
+  }
+  { batchValue_->copyBackSeq(*output_.value); }
+}
+
+void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) {
+  REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str());
+  hl_gru_value gruValue;
+  gruValue.gateWeight = (gateWeight_->getW())->getData();
+  gruValue.stateWeight = (stateWeight_->getW())->getData();
+
+  hl_gru_grad gruGrad;
+  gruGrad.gateWeightGrad =
+      (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr);
+  gruGrad.stateWeightGrad =
+      (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData()
+                                : nullptr);
+
+  if (!batchGrad_) {
+    batchGrad_.reset(new SequenceToBatch(useGpu_));
+  }
+  batchGrad_->shareIndexWith(*batchValue_);
+
+  { batchGrad_->copyFromSeq(*output_.grad); }
+
+  {
+    int numBatch = batchGrad_->getNumBatch();
+    int batchSize = 0;
+    AsyncGpuBlock asyncGpuBlock;
+    for (int n = (int)numBatch - 1; n >= 0; n--) {
+      gruValue.gateValue =
+          (batchGrad_->getBatchValue(*gate_.value, n))->getData();
+      gruValue.resetOutputValue =
+          (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData();
+
+      MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n);
+      gruGrad.outputGrad = outputGradTmp->getData();
+      gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad, n))->getData();
+      gruGrad.resetOutputGrad =
+          (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData();
+
+      {
+        batchSize = outputGradTmp->getHeight();
+        gruValue.prevOutValue =
+            (n == 0
+                 ? nullptr
+                 : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
+        gruGrad.prevOutGrad =
+            (n == 0 ? nullptr
+                    : (batchGrad_->getBatchValue(n - 1, batchSize))->getData());
+
+        if (useGpu_) {
+          GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize);
+        } else {
+          GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize);
+        }
+      }
+    }
+  }
+
+  if (inputGrad) {
+    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false);
+  }
+  if (bias_ && bias_->getWGrad()) {
+    bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GatedRecurrentLayer.h b/paddle/legacy/gserver/layers/GatedRecurrentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bbf01ce200c9922f49508b0499aa9422745f474
--- /dev/null
+++ b/paddle/legacy/gserver/layers/GatedRecurrentLayer.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "GruCompute.h"
+#include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Please refer to "Junyoung Chung, Empirical Evaluation
+ * of Gated Recurrent Neural Networks on Sequence Modeling".
+ *
+ * GatedRecurrentLayer takes 1 input layer with size * 3.
+ * Input layer is diveded into 3 equal parts: (xz_t, xr_t, xi_t).
+ * parameter and biasParameter is also diveded into 3 equal parts:
+ *   - parameter consists of (U_z, U_r, U)
+ *   - baisParameter consists of (bias_z, bias_r, bias_o)
+ *
+ * \f[
+ * update \ gate: z_t = actGate(xz_t + U_z * h_{t-1} + bias_z) \\
+ * reset \ gate: r_t = actGate(xr_t + U_r * h_{t-1} + bias_r) \\
+ * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, h_{t-1}) + bias_o) \\
+ * hidden \ activation: h_t = dot((1-z_t), h_{t-1}) + dot(z_t, {h}_t) \\
+ * \f]
+ *
+ * @note
+ * - dot denotes "element-wise multiplication".
+ * - actNode is defined by config active_type
+ * - actGate is defined by config actvie_gate_type
+ *
+ * The config file is grumemory.
+ */
+
+class GatedRecurrentLayer : public Layer, public GruCompute {
+ public:
+  explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+  void resetState() override;
+
+  void setState(LayerStatePtr state) override;
+
+  LayerStatePtr getState() override;
+
+ protected:
+  void forwardSequence(int batchSize,
+                       size_t numSequences,
+                       const int* starts,
+                       MatrixPtr inputValue);
+  void backwardSequence(int batchSize,
+                        size_t numSequences,
+                        const int* starts,
+                        MatrixPtr inputGrad);
+
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int* starts,
+                    MatrixPtr inputValue);
+  void backwardBatch(int batchSize, MatrixPtr inputGrad);
+
+ protected:
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> gateWeight_;
+  std::unique_ptr<Weight> stateWeight_;
+  std::unique_ptr<Weight> bias_;
+
+  Argument gate_;
+  Argument resetOutput_;
+
+  bool reversed_;
+  bool useBatch_;
+  std::unique_ptr<SequenceToBatch> batchValue_;
+  std::unique_ptr<SequenceToBatch> batchGrad_;
+  std::unique_ptr<ActivationFunction> activationGate_;
+
+  MatrixPtr prevOutput_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GetOutputLayer.cpp b/paddle/legacy/gserver/layers/GetOutputLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7c1e3c407cca374c7aa238d07e2263c4a142b6a5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/GetOutputLayer.cpp
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+class GetOutputLayer : public Layer {
+ public:
+  explicit GetOutputLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~GetOutputLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    if (!Layer::init(layerMap, parameterMap)) return false;
+    CHECK_EQ(1U, inputLayers_.size());
+    CHECK_NE(inputArgument_[0], "");
+    return true;
+  }
+
+  void forward(PassType passType) override {
+    output_ = getPrev(0)->getOutput(inputArgument_[0]);
+  }
+  void backward(const UpdateCallback& callback = nullptr) override {}
+};
+
+REGISTER_LAYER(get_output, GetOutputLayer);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruCompute.cpp b/paddle/legacy/gserver/layers/GruCompute.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..adad6285b7d5acd8780444ffeab6627531683cb7
--- /dev/null
+++ b/paddle/legacy/gserver/layers/GruCompute.cpp
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GruCompute.h"
+#include "hl_recurrent_apply.cuh"
+#include "paddle/legacy/function/GruFunctor.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+void GruCompute::init(LayerConfig &config) {
+  activeNode_ = hlActiveType(config.active_type());
+  activeGate_ = hlActiveType(config.active_gate_type());
+}
+
+template <>
+void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) {
+  GruFunctor<DEVICE_TYPE_CPU, real>::compute(hppl::forward::gru_resetOutput(),
+                                             hppl::forward::gru_finalOutput(),
+                                             value,
+                                             frameSize,
+                                             batchSize,
+                                             activeNode_,
+                                             activeGate_);
+}
+
+template <>
+void GruCompute::backward<0>(hl_gru_value value,
+                             hl_gru_grad grad,
+                             int frameSize,
+                             int batchSize) {
+  GruGradFunctor<DEVICE_TYPE_CPU, real>::compute(
+      hppl::backward::gru_stateGrad(),
+      hppl::backward::gru_resetGrad(),
+      value,
+      grad,
+      frameSize,
+      batchSize,
+      activeNode_,
+      activeGate_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/GruCompute.cu b/paddle/legacy/gserver/layers/GruCompute.cu
similarity index 100%
rename from paddle/gserver/layers/GruCompute.cu
rename to paddle/legacy/gserver/layers/GruCompute.cu
diff --git a/paddle/legacy/gserver/layers/GruCompute.h b/paddle/legacy/gserver/layers/GruCompute.h
new file mode 100644
index 0000000000000000000000000000000000000000..6feea7aca81b8618071893581a4e16d8ad38101c
--- /dev/null
+++ b/paddle/legacy/gserver/layers/GruCompute.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ModelConfig.pb.h"
+#include "hl_gpu.h"
+#include "paddle/legacy/utils/Common.h"
+
+namespace paddle {
+
+class GruCompute {
+ public:
+  void init(LayerConfig &config);
+
+  template <bool useGpu>
+  void forward(hl_gru_value value, int frameSize, int batchSize = 1);
+
+  template <bool useGpu>
+  void backward(hl_gru_value value,
+                hl_gru_grad grad,
+                int frameSize,
+                int batchSize = 1);
+
+ public:
+  hl_activation_mode_t activeNode_;
+  hl_activation_mode_t activeGate_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/GruStepLayer.cpp b/paddle/legacy/gserver/layers/GruStepLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2480e42d68b87ee406efc2b220b9ad6bf5cacbd6
--- /dev/null
+++ b/paddle/legacy/gserver/layers/GruStepLayer.cpp
@@ -0,0 +1,177 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GruCompute.h"
+#include "Layer.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief GruStepLayer is like GatedRecurrentLayer, but used in recurrent
+ * layer group. GruStepLayer takes 2 input layer.
+ * - input[0] with size * 3 and diveded into 3 equal parts: (xz_t, xr_t, xi_t).
+ * - input[1] with size: {prev_out}.
+ *
+ * parameter and biasParameter is also diveded into 3 equal parts:
+ * - parameter consists of (U_z, U_r, U)
+ * - baisParameter consists of (bias_z, bias_r, bias_o)
+ *
+ * \f[
+ * update \ gate: z_t = actGate(xz_t + U_z * prev_out + bias_z) \\
+ * reset \ gate: r_t = actGate(xr_t + U_r * prev_out + bias_r)  \\
+ * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o)
+ * \\
+ * output: h_t = dot((1-z_t), prev_out) + dot(z_t, prev_out)
+ * \f]
+ *
+ * @note
+ *   - dot denotes "element-wise multiplication".
+ *   - actNode is defined by config active_type
+ *   - actGate is defined by config actvie_gate_type
+ *
+ * The config file api if gru_step_layer.
+ */
+class GruStepLayer : public Layer, public GruCompute {
+ protected:
+  Argument gate_;
+  Argument resetOutput_;
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> bias_;
+
+ public:
+  explicit GruStepLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~GruStepLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(gru_step, GruStepLayer);
+
+bool GruStepLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK_EQ(2U, inputLayers_.size());
+
+  CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
+  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
+
+  if (biasParameter_.get() != NULL) {
+    CHECK_EQ(getSize() * 3, biasParameter_->getSize());
+    bias_.reset(new Weight(1, getSize() * 3, biasParameter_));
+  }
+
+  GruCompute::init(config_);
+  return true;
+}
+
+void GruStepLayer::forward(PassType passType) {
+  REGISTER_TIMER_INFO("GruStepFwTime", getName().c_str());
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  const Argument& prevOutput = getInput(1);
+  CHECK_EQ(getSize() * 3, input.value->getWidth());
+  CHECK_EQ(getSize(), prevOutput.value->getWidth());
+
+  int batchSize = input.getBatchSize();
+  resetOutput(batchSize, getSize());
+  resetSpecifyOutput(gate_,
+                     batchSize,
+                     getSize() * 3,
+                     /* isValueClean */ false,
+                     /* isGradClean */ false);
+  resetSpecifyOutput(resetOutput_,
+                     batchSize,
+                     getSize(),
+                     /* isValueClean */ false,
+                     /* isGradClean */ false);
+  gate_.value->assign(*input.value);
+  if (bias_) {
+    gate_.value->addBias(*(bias_->getW()), 1);
+  }
+
+  hl_gru_value gruValue;
+  gruValue.gateWeight = weight_->getW()->getData();
+  gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2;
+  gruValue.gateValue = gate_.value->getData();
+  gruValue.resetOutputValue = resetOutput_.value->getData();
+  gruValue.outputValue = output_.value->getData();
+  gruValue.prevOutValue = prevOutput.value->getData();
+
+  if (useGpu_) {
+    GruCompute::forward<1>(gruValue, getSize(), batchSize);
+  } else {
+    GruCompute::forward<0>(gruValue, getSize(), batchSize);
+  }
+}
+
+void GruStepLayer::backward(const UpdateCallback& callback) {
+  REGISTER_TIMER_INFO("GruStepBwTime", getName().c_str());
+
+  const Argument& input = getInput(0);
+  const Argument& prevOutput = getInput(1);
+  int batchSize = input.getBatchSize();
+
+  hl_gru_value gruValue;
+  gruValue.gateWeight = weight_->getW()->getData();
+  gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2;
+  gruValue.gateValue = gate_.value->getData();
+  gruValue.resetOutputValue = resetOutput_.value->getData();
+  gruValue.outputValue = output_.value->getData();
+  gruValue.prevOutValue = prevOutput.value->getData();
+
+  hl_gru_grad gruGrad;
+  gruGrad.gateWeightGrad =
+      (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr);
+  gruGrad.stateWeightGrad =
+      (weight_->getWGrad()
+           ? weight_->getWGrad()->getData() + getSize() * getSize() * 2
+           : nullptr);
+
+  gruGrad.gateGrad = gate_.grad->getData();
+  gruGrad.resetOutputGrad = resetOutput_.grad->getData();
+  gruGrad.outputGrad = output_.grad->getData();
+  if (prevOutput.grad) {
+    gruGrad.prevOutGrad = prevOutput.grad->getData();
+  } else {
+    gruGrad.prevOutGrad = nullptr;
+  }
+
+  if (useGpu_) {
+    GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize);
+  } else {
+    GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize);
+  }
+
+  if (input.grad) {
+    input.grad->add(*gate_.grad);
+  }
+
+  if (bias_ && bias_->getWGrad()) {
+    bias_->getWGrad()->collectBias(*gate_.grad, 1);
+  }
+
+  if (bias_) {
+    bias_->getParameterPtr()->incUpdate(callback);
+  }
+  weight_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..34495994096a87640bdeef777feb5cd783cd4598
--- /dev/null
+++ b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -0,0 +1,240 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "HierarchicalSigmoidLayer.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+REGISTER_LAYER(hsigmoid, HierarchicalSigmoidLayer);
+
+bool HierarchicalSigmoidLayer::init(const LayerMap& layerMap,
+                                    const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK(config_.has_num_classes()) << "num_classes must be specifed in config";
+  numClasses_ = config_.num_classes();
+  CHECK_GE(numClasses_, (size_t)2);
+  codeLength_ = findLastSet(numClasses_ - 1);
+
+  size_t height = numClasses_ - 1;
+
+  /* initialize the weightList */
+  // The last input layer is for label
+  CHECK(!parameters_.back());
+  for (size_t i = 0; i < inputLayers_.size() - 1; i++) {
+    size_t width = inputLayers_[i]->getSize();
+    // create a new weight
+    CHECK_EQ(parameters_[i]->getSize(), width * height);
+    Weight* w = new Weight(height, width, parameters_[i]);
+
+    // append the new weight to the list
+    weights_.emplace_back(w);
+  }
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    CHECK_EQ(biasParameter_->getSize(), numClasses_ - 1);
+    biases_.reset(new Weight(1, numClasses_ - 1, biasParameter_));
+  }
+
+  return true;
+}
+
+void HierarchicalSigmoidLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(0)->getHeight();
+  int size = getSize();
+  reserveOutput(batchSize, size);
+  Matrix::resizeOrCreate(preOutput_.value,
+                         batchSize,
+                         codeLength_,
+                         /* trans */ false,
+                         false);
+  Matrix::resizeOrCreate(preOutput_.grad,
+                         batchSize,
+                         codeLength_,
+                         /* trans */ false,
+                         false);
+  IVectorPtr label = getInput(*getLabelLayer()).ids;
+  preOutput_.value->zeroMem();
+
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
+  /* add the bias-vector */
+  if (biases_.get() != NULL) {
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
+  }
+  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
+    MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                             weights_[i]->getW()->getHeight(),
+                             weights_[i]->getW()->getWidth(),
+                             /* trans */ false,
+                             false);
+      cpuInput_->copyFrom(*input);
+      cpuWeight_->copyFrom(*weights_[i]->getW());
+    } else {
+      cpuInput_ = input;
+      cpuWeight_ = weights_[i]->getW();
+    }
+    preOutput_.value->mulByBitCode(
+        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
+  }
+  // keep consistent with the clipping in the following softrelu
+  preOutput_.value->clip(-40.0, 40.0);
+  preOutput_.value->sumByBitCode(numClasses_,
+                                 *cpuLabel_,
+                                 *cpuOutput_,
+                                 -1);  // scaleSum
+  preOutput_.value->softrelu(*preOutput_.value);
+  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
+  preOutput_.value->rowSum(*sum);
+  cpuOutput_->add(*sum);
+  if (useGpu_) {
+    output_.value->copyFrom(*cpuOutput_);
+  } else {
+    output_.value = cpuOutput_;
+  }
+}
+
+void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
+  IVectorPtr label = getInput(*getLabelLayer()).ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+  } else {
+    cpuLabel_ = label;
+  }
+  preOutput_.grad->one();
+  preOutput_.grad->softreluDerivative(*preOutput_.value);
+  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
+
+  if (biases_ && biases_->getWGrad()) {
+    MatrixPtr biases_grad = biases_->getWGrad();
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_grad);
+    } else {
+      cpuBias_ = biases_grad;
+    }
+    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
+    if (useGpu_) {
+      biases_grad->copyFrom(*cpuBias_);
+    } else {
+      biases_grad = cpuBias_;
+    }
+    /* Increasing the number of gradient */
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
+    /* Calculate the W-gradient for the current layer */
+    MatrixPtr input = getInputValue(i);
+    if (weights_[i]->getWGrad()) {
+      MatrixPtr weights_grad = weights_[i]->getWGrad();
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInput_,
+                               input->getHeight(),
+                               input->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeightGrad_,
+                               weights_grad->getHeight(),
+                               weights_grad->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInput_->copyFrom(*input);
+        cpuWeightGrad_->copyFrom(*weights_grad);
+      } else {
+        cpuInput_ = input;
+        cpuWeightGrad_ = weights_grad;
+      }
+      preOutput_.grad->mulByBitCodeBackwardWeight(
+          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+      if (useGpu_) {
+        weights_grad->copyFrom(*cpuWeightGrad_);
+      } else {
+        weights_grad = cpuWeightGrad_;
+      }
+      /* Increasing the number of gradient */
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+
+    /* Calculate the input layers error */
+    MatrixPtr inputGrad = getInputGrad(i);
+    if (inputGrad) {
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInputGrad_,
+                               inputGrad->getHeight(),
+                               inputGrad->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeight_,
+                               weights_[i]->getW()->getHeight(),
+                               weights_[i]->getW()->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInputGrad_->copyFrom(*inputGrad);
+        cpuWeight_->copyFrom(*weights_[i]->getW());
+      } else {
+        cpuInputGrad_ = inputGrad;
+        cpuWeight_ = weights_[i]->getW();
+      }
+      preOutput_.grad->mulByBitCodeBackwardError(
+          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+      if (useGpu_) {
+        inputGrad->copyFrom(*cpuInputGrad_);
+      } else {
+        inputGrad = cpuInputGrad_;
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..73ef252fd5a5443fe065f3b7bd8c49951ae0b4bd
--- /dev/null
+++ b/paddle/legacy/gserver/layers/HierarchicalSigmoidLayer.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * Organize the classes into a binary tree. At each node, a sigmoid function
+ * is used to calculate the probability of belonging to the right branch.
+ * This idea is from "F. Morin, Y. Bengio (AISTATS 05):
+ * Hierarchical Probabilistic Neural Network Language Model."
+ *
+ * Here we uses a simple way of making the binary tree.
+ * Assuming the number of classes C = 6,
+ * The classes are organized as a binary tree in the following way:
+ *
+ * @code{.py}
+ * *-*-*- 2
+ * | | |- 3
+ * | |
+ * | |-*- 4
+ * |   |- 5
+ * |
+ * |-*- 0
+ *   |- 1
+ * @endcode
+ *
+ * where * indicates an internal node, and each leaf node represents a class.
+ * - Node 0 ... C-2 are internal nodes.
+ * - Node C-1 ... 2C-2 are leaf nodes.
+ * - Class c is represented by leaf node \f$c+C-1\f$.
+ *
+ * We assign an id for each node:
+ * - the id of root be 0.
+ * - the left child of a node i is 2*i+1.
+ * - the right child of a node i is 2*i+2.
+ *
+ * It's easy to see that:
+ * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
+ * - the j-th level ancestor of node i is
+ * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
+ * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
+ *
+ * The config file api is hsigmod_layer.
+ */
+class HierarchicalSigmoidLayer : public Layer {
+ public:
+  explicit HierarchicalSigmoidLayer(const LayerConfig& config)
+      : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+ protected:
+  /**
+   * The last of inputs is label layer.
+   */
+  LayerPtr getLabelLayer() { return inputLayers_.back(); }
+
+  WeightList weights_;
+  std::unique_ptr<Weight> biases_;
+  /// number of classes
+  size_t numClasses_;
+  /// codeLength_ = \f$1 + \left\lfloor log_{2}(numClasses-1)\right\rfloor\f$
+  int codeLength_;
+  /// temporary result of output_
+  Argument preOutput_;
+
+  /// The temporary variables in CPU memory.
+  MatrixPtr cpuWeight_;
+  MatrixPtr cpuWeightGrad_;
+  MatrixPtr cpuInput_;
+  MatrixPtr cpuInputGrad_;
+  MatrixPtr cpuBias_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/IdentityProjection.cpp b/paddle/legacy/gserver/layers/IdentityProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f707642e09b86721a88142ab8b745bb3492e820c
--- /dev/null
+++ b/paddle/legacy/gserver/layers/IdentityProjection.cpp
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * IdentityProjection performs addition:
+ * \f[
+ *   out.row[i] += in.row[i]
+ * \f]
+ *
+ * The config file api is identity_projection.
+ */
+class IdentityProjection : public Projection {
+ public:
+  IdentityProjection(const ProjectionConfig& config,
+                     const ParameterPtr& parameter,
+                     bool useGpu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+};
+
+REGISTER_PROJECTION(identity, IdentityProjection);
+
+/**
+ * Constructed function.
+ * @note IdentityProjection should not have any parameter.
+ */
+IdentityProjection::IdentityProjection(const ProjectionConfig& config,
+                                       const ParameterPtr& parameter,
+                                       bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(!parameter) << "'identity' projection should not have any parameter";
+}
+
+void IdentityProjection::forward() { out_->value->add(*in_->value); }
+
+void IdentityProjection::backward(const UpdateCallback& callback) {
+  if (in_->grad) {
+    in_->grad->add(*out_->grad);
+  }
+}
+
+/**
+ * IdentityOffsetProjection likes IdentityProjection, but layer size may be
+ * smaller
+ * than input size. It selects dimensions [offset, offset+layer_size) from input
+ * to
+ * perform addition:
+ * \f[
+ *   out.row[i] += in.row[i + \textrm{offset}]
+ * \f]
+ *
+ * The config file api is identity_projection.
+ */
+class IdentityOffsetProjection : public Projection {
+ public:
+  IdentityOffsetProjection(const ProjectionConfig& config,
+                           const ParameterPtr& parameter,
+                           bool useGpu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+};
+
+REGISTER_PROJECTION(identity_offset, IdentityOffsetProjection);
+
+/**
+ * Constructed function.
+ * @note IdentityOffsetProjection should not have any parameter.
+ */
+IdentityOffsetProjection::IdentityOffsetProjection(
+    const ProjectionConfig& config, const ParameterPtr& parameter, bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(!parameter) << "'identity_offset' projection "
+                       "should not have any parameter";
+  CHECK_LE(config.output_size() + config.offset(), config.input_size());
+}
+
+void IdentityOffsetProjection::forward() {
+  out_->value->addAtOffset(*in_->value, config_.offset());
+}
+
+void IdentityOffsetProjection::backward(const UpdateCallback& callback) {
+  if (in_->grad) {
+    in_->grad->addAtOffset(*out_->grad, config_.offset());
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/InterpolationLayer.cpp b/paddle/legacy/gserver/layers/InterpolationLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ed2294e8a397edfee6ad3c1f52235970d6ad48a9
--- /dev/null
+++ b/paddle/legacy/gserver/layers/InterpolationLayer.cpp
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for linear interpolation with two inputs,
+ * which is used in NEURAL TURING MACHINE.
+ * \f[
+ *   y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i]
+ * \f]
+ * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs,
+ * \f$w\f$ is (batchSize x 1) weight vector,
+ * and \f$y\f$ is (batchSize x dataDim) output.
+ *
+ * The config file api is interpolation_layer.
+ */
+
+class InterpolationLayer : public Layer {
+ protected:
+  /// weightLast = 1 - weight
+  MatrixPtr weightLast_;
+  MatrixPtr tmpMatrix;
+
+ public:
+  explicit InterpolationLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~InterpolationLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(interpolation, InterpolationLayer);
+
+bool InterpolationLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(3U, inputLayers_.size());
+
+  return true;
+}
+
+void InterpolationLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr weightV = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr inV2 = getInputValue(2);
+
+  size_t batchSize = inV1->getHeight();
+  size_t dataDim = inV1->getWidth();
+
+  CHECK_EQ(dataDim, getSize());
+  CHECK_EQ(dataDim, inV2->getWidth());
+  CHECK_EQ(batchSize, inV1->getHeight());
+  CHECK_EQ(batchSize, inV2->getHeight());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    resetOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(weightLast_, batchSize, 1, false, useGpu_);
+  weightLast_->one();
+  weightLast_->sub(*weightV);
+
+  REGISTER_TIMER_INFO("FwInterpTimer", getName().c_str());
+  // outV = inV1 * weight + inV2 * weightLast
+  outV->addRowScale(0, *inV1, *weightV);
+  outV->addRowScale(0, *inV2, *weightLast_);
+}
+
+void InterpolationLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr weightV = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr inV2 = getInputValue(2);
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+  MatrixPtr inG2 = getInputGrad(2);
+
+  size_t batchSize = inV1->getHeight();
+  size_t dataDim = inV1->getWidth();
+
+  REGISTER_TIMER_INFO("BwInterpTimer", getName().c_str());
+
+  if (inG0) {
+    Matrix::resizeOrCreate(tmpMatrix, batchSize, dataDim, false, useGpu_);
+
+    // inG0 += outG .* (inV1 - inV2)
+    tmpMatrix->sub(*inV1, *inV2);
+    inG0->rowDotMul(0, *outG, *tmpMatrix);
+  }
+
+  if (inG1) {
+    // inG1 += outG * weight
+    inG1->addRowScale(0, *outG, *weightV);
+  }
+
+  if (inG2) {
+    // inG2 += outG * weightLast
+    inG2->addRowScale(0, *outG, *weightLast_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7fd25954efeb9d9e672040f9909198f2ae3c0449
--- /dev/null
+++ b/paddle/legacy/gserver/layers/KmaxSeqScoreLayer.cpp
@@ -0,0 +1,126 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+class KmaxSeqScoreLayer : public Layer {
+ private:
+  MatrixPtr scores_;
+  size_t beamSize_;
+  void kmaxScorePerSeq(const real* score,
+                       real* sortedRes,
+                       const ICpuGpuVectorPtr seqStartPos);
+
+ public:
+  explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer);
+
+bool KmaxSeqScoreLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  bool ret = Layer::init(layerMap, parameterMap);
+  CHECK_EQ(1U, inputLayers_.size());
+
+  beamSize_ = config_.beam_size();
+  CHECK_GE(beamSize_, 1U);
+
+  setNeedSequenceInfo(false);
+  setNeedGradient(false);
+  return ret;
+}
+
+void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores,
+                                        real* sortedIds,
+                                        const ICpuGpuVectorPtr seqStartPos) {
+  int* starts = seqStartPos->getMutableData(false);
+  std::vector<real> indices;
+  for (size_t i = 0; i < seqStartPos->getSize() - 1; ++i) {
+    int seqLen = starts[i + 1] - starts[i];
+    int k = std::min(static_cast<int>(beamSize_), seqLen);
+
+    indices.resize(seqLen, 0);
+    std::iota(begin(indices), end(indices), 0.);
+    std::vector<real> tmpScore(scores + starts[i], scores + starts[i + 1]);
+    std::partial_sort(
+        begin(indices),
+        begin(indices) + k,
+        end(indices),
+        [&](size_t a, size_t b) { return tmpScore[a] > tmpScore[b]; });
+    memcpy(sortedIds + (i * beamSize_), indices.data(), k * sizeof(real));
+  }
+}
+
+void KmaxSeqScoreLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  const MatrixPtr inputScore = getInputValue(0);
+
+  CHECK(input.hasSeq() || input.hasSubseq())
+      << "input of " << getName()
+      << " must be a sequence or a nested sequence.";
+  CHECK_EQ(input.value->getWidth(), 1UL)
+      << "input of " << getName() << " are scores over a sequence or "
+      << "a nested sequence, so its width must be 1.";
+
+  if (useGpu_) {
+    /*
+     * currently, this Layer only runs in CPU, if the other part of the model is
+     * runing on GPU, then copy the input to this layer from GPU to CPU.
+     */
+    Matrix::resizeOrCreate(scores_,
+                           inputScore->getHeight(),
+                           1,
+                           false /* trans */,
+                           false /* useGpu */);
+    scores_->copyFrom(*inputScore);
+  } else {
+    scores_ = inputScore;
+  }
+
+  /*
+   * TODO(caoying)
+   * In PaddePaddle, currently all matrices are real number types,
+   * but output of this layer which is some selected indices of the give
+   * sequence are actually filled with int types so that storing int types
+   * information in a real number matrix is dangerous, since real numbers will
+   * be convered to int types.
+   */
+  Matrix::resizeOrCreate(
+      output_.value,
+      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
+      beamSize_,
+      false,
+      false);
+  output_.value->one();
+  output_.value->mulScalar(-1.);
+
+  kmaxScorePerSeq(scores_->getData(),
+                  output_.value->getData(),
+                  input.hasSubseq() ? input.subSequenceStartPositions
+                                    : input.sequenceStartPositions);
+}
+
+void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/L2DistanceLayer.cpp b/paddle/legacy/gserver/layers/L2DistanceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a3e627e57047b790b4f74089a352f06b55e48664
--- /dev/null
+++ b/paddle/legacy/gserver/layers/L2DistanceLayer.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "L2DistanceLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(l2_distance, L2DistanceLayer);
+
+bool L2DistanceLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
+                                     << "only two inputs.";
+  CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
+                           << "is fixed to be 1.";
+
+  return true;
+}
+
+void L2DistanceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const auto inV1 = getInputValue(0);
+  const auto inV2 = getInputValue(1);
+
+  CHECK(inV1 && inV2);
+  CHECK_EQ(inV1->getHeight(), inV2->getHeight())
+      << "The height of two inputs of this layer must be the same.";
+  CHECK_EQ(inV1->getWidth(), inV2->getWidth())
+      << "The width of two inputs of this layer must be the same.";
+
+  int batchSize = inV1->getHeight();
+  int output_dim = getSize();
+  {
+    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
+    reserveOutput(batchSize, output_dim);
+    auto outV = getOutputValue();
+    CHECK(outV) << "The output matrix should not be null.";
+
+    Matrix::resizeOrCreate(
+        inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
+
+    inputSub_->assign(*inV1);
+    inputSub_->sub(*inV2);
+    outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
+    outV->sqrt2(*outV);
+  }
+}
+
+void L2DistanceLayer::backward(const UpdateCallback& callback) {
+  const auto outG = getOutputGrad();
+  const auto outV = getOutputValue();
+  CHECK(outG && outV);
+
+  auto inGrad1 = getInputGrad(0);
+  auto inGrad2 = getInputGrad(1);
+
+  {
+    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
+
+    if (inGrad1 || inGrad2) {
+      outV->scalarDiv(*outV, 1.);
+      outV->dotMul(*outG, *outV);
+    }
+
+    if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
+
+    if (inGrad2) {
+      inputSub_->mulScalar(-1.);
+      inGrad2->addRowScale(0, *inputSub_, *outV);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/L2DistanceLayer.h b/paddle/legacy/gserver/layers/L2DistanceLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa8aabd9ca5702e3ebdccbe7bb4f98fa087dd238
--- /dev/null
+++ b/paddle/legacy/gserver/layers/L2DistanceLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief The layer calculates the l2 distance between two input vectors.
+ * \f[
+ * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)}
+ * \f]
+ *
+ * - Input1: A vector (batchSize * dataDim)
+ * - Input2: A vector (batchSize * dataDim)
+ * - Output: A vector (batchSize * 1)
+ *
+ * The configuration api is: l2_distance_layer.
+ */
+
+class L2DistanceLayer : public Layer {
+ public:
+  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
+  ~L2DistanceLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ private:
+  // Store the result of subtracting Input2 from Input1 in forward computation,
+  // which will be reused in backward computation.
+  MatrixPtr inputSub_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Layer.cpp b/paddle/legacy/gserver/layers/Layer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..890d33552dd31a8fd348a36d44fb0824ac9b32b5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Layer.cpp
@@ -0,0 +1,410 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Util.h"
+
+#include "CostLayer.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/legacy/utils/Error.h"
+#include "paddle/legacy/utils/Logging.h"
+
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "ValidationLayer.h"
+#endif
+
+DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
+
+namespace paddle {
+
+Layer::Layer(const LayerConfig& config, bool useGpu)
+    : config_(config),
+      useGpu_(useGpu),
+      deviceId_(CPU_DEVICE),
+      needSequenceInfo_(true) {}
+
+bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  if (useGpu_ && FLAGS_parallel_nn) {
+    /* gpu environment is specified by device property */
+    deviceId_ = config_.device();
+    if (deviceId_ < 0) {
+      useGpu_ = false;
+    }
+  }
+
+  output_.deviceId = deviceId_;
+
+  for (auto& inputConfig : config_.inputs()) {
+    std::string inputName = inputConfig.input_layer_name();
+    LayerPtr inputLayer;
+    CHECK(mapGet(inputName, layerMap, &inputLayer))
+        << "Cannot find input layer " << inputName << " for layer "
+        << getName();
+    this->addPrev(inputLayer);
+
+    inputLayer->addOutputArgument(deviceId_);
+
+    if (inputConfig.has_input_parameter_name()) {
+      ParameterPtr parameter;
+      CHECK(
+          mapGet(inputConfig.input_parameter_name(), parameterMap, &parameter))
+          << "Cannot find input parameter "
+          << inputConfig.input_parameter_name() << " for layer " << getName();
+      parameter->incShared();
+      CHECK_EQ(parameter->getDeviceId(), getDeviceId());
+      parameters_.push_back(parameter);
+    } else {
+      parameters_.push_back(nullptr);
+    }
+
+    if (inputConfig.has_input_layer_argument()) {
+      inputArgument_.push_back(inputConfig.input_layer_argument());
+    } else {
+      inputArgument_.push_back("");
+    }
+  }
+
+  if (config_.has_bias_parameter_name()) {
+    CHECK(mapGet(config_.bias_parameter_name(), parameterMap, &biasParameter_))
+        << "Cannot find bias parameter " << config_.bias_parameter_name()
+        << " for layer " << getName();
+    biasParameter_->incShared();
+    CHECK_EQ(biasParameter_->getDeviceId(), getDeviceId());
+  }
+
+  /* specify the activation function according to the configuration */
+  std::string action_type = config_.active_type();
+  activation_.reset(ActivationFunction::create(action_type));
+  CHECK(activation_);
+
+  initNeedFlags();
+  markInBackward_.assign(inputLayers_.size(), false);
+
+  return true;
+}
+
+ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
+
+LayerPtr Layer::create(const LayerConfig& config) {
+  std::string type = config.type();
+
+#ifndef PADDLE_MOBILE_INFERENCE
+  // NOTE: As following types have illegal character '-',
+  // they can not use REGISTER_LAYER to registrar.
+  // Besides, to fit with old training models,
+  // they can not use '_' instead.
+  if (type == "multi-class-cross-entropy")
+    return LayerPtr(new MultiClassCrossEntropy(config));
+  else if (type == "rank-cost")
+    return LayerPtr(new RankingCost(config));
+  else if (type == "auc-validation")
+    return LayerPtr(new AucValidation(config));
+  else if (type == "pnpair-validation")
+    return LayerPtr(new PnpairValidation(config));
+#endif
+
+  return LayerPtr(registrar_.createByType(config.type(), config));
+}
+
+void Layer::resetSpecifyOutput(Argument& output,
+                               size_t height,
+                               size_t width,
+                               bool isValueClean,
+                               bool isGradClean) {
+  SetDevice device(output.deviceId);
+
+  Matrix::resizeOrCreate(
+      output.value, height, width, /* trans */ false, useGpu(output.deviceId));
+  if (isValueClean) {
+    output.value->zeroMem();
+  }
+
+  if (passType_ != PASS_TEST && needGradient()) {
+    Matrix::resizeOrCreate(
+        output.grad, height, width, /* trans */ false, useGpu(output.deviceId));
+    if (isGradClean) {
+      output.grad->zeroMem();
+    }
+  }
+}
+
+void Layer::resizeOutput(size_t height, size_t width) {
+  resetSpecifyOutput(output_, height, width, false, false);
+
+  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
+    resetSpecifyOutput(outputOtherDevice_[i], height, width, false, false);
+  }
+}
+
+void Layer::reserveOutput(size_t height, size_t width) {
+  resetSpecifyOutput(output_, height, width, false, true);
+
+  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
+    resetSpecifyOutput(outputOtherDevice_[i], height, width, false, true);
+  }
+}
+
+void Layer::resetOutput(size_t height, size_t width) {
+  resetSpecifyOutput(output_, height, width, true, true);
+
+  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
+    resetSpecifyOutput(outputOtherDevice_[i], height, width, true, true);
+  }
+}
+
+void Layer::addOutputArgument(int deviceId) {
+  if (deviceId == deviceId_) {
+    output_.countIncrement();
+    return;
+  } else {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].deviceId == deviceId) {
+        outputOtherDevice_[i].countIncrement();
+        return;
+      }
+    }
+  }
+
+  Argument argu;
+  argu.deviceId = deviceId;
+  outputOtherDevice_.push_back(argu);
+  outputOtherDevice_.back().countIncrement();
+}
+
+void Layer::copyOutputToOtherDevice() {
+  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
+    SetDevice device(outputOtherDevice_[i].deviceId);
+    // If outputOtherDevice_[i].value is a CpuMatrix,
+    // the copyFrom is a synchronous interface.
+    // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent
+    // calculations are all on HPPL_STREAM_DEFAULT,
+    // copyFrom can be an asynchronous interface.
+    outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
+                                          HPPL_STREAM_DEFAULT);
+    outputOtherDevice_[i].sequenceStartPositions =
+        output_.sequenceStartPositions;
+    outputOtherDevice_[i].subSequenceStartPositions =
+        output_.subSequenceStartPositions;
+    outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+
+    outputOtherDevice_[i].notifyValueReady();
+  }
+}
+
+void Layer::waitInputValue() {
+  for (size_t i = 0; i != inputLayers_.size(); i++) {
+    if (inputLayers_[i]->getDeviceId() != deviceId_) {
+      getInput(i).waitValueReady();
+    }
+  }
+}
+
+void Layer::waitAndMergeOutputGrad() {
+  if (!output_.grad || !outputOtherDevice_.size()) {
+    return;
+  }
+
+  for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
+    outputOtherDevice_[i].waitGradReady();
+  }
+
+  /* merge output grad */
+  size_t i = 0;
+  if (!output_.getAllCount()) {
+    output_.grad->copyFrom(*outputOtherDevice_[0].grad, HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_1);
+
+    i++;
+    if (outputOtherDevice_.size() == 1) return;
+  }
+
+  Matrix::resizeOrCreate(tmpGrad_,
+                         output_.grad->getHeight(),
+                         output_.grad->getWidth(),
+                         /* trans */ false,
+                         useGpu(output_.deviceId));
+
+  for (; i != outputOtherDevice_.size(); i++) {
+    tmpGrad_->copyFrom(*outputOtherDevice_[i].grad, HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_1);
+    output_.grad->add(*tmpGrad_);
+  }
+}
+
+void Layer::markAllInputGrad() {
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    if (!markInBackward_[i]) {
+      inputLayers_[i]->getOutput(deviceId_).notifyGradReady();
+    }
+    markInBackward_[i] = false;
+  }
+}
+
+void Layer::markInputGrad(int inputIndex) {
+  inputLayers_[inputIndex]->getOutput(deviceId_).notifyGradReady();
+  markInBackward_[inputIndex] = true;
+}
+
+void Layer::zeroGrad() {
+  CHECK(output_.grad.get() != NULL);
+  output_.grad->zeroMem();
+}
+
+void Layer::initNeedFlags() {
+  auto initFlag = [this](
+      bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) {
+    flag = false;
+    if (biasParameter_ && biasParameter_->hasType(type)) {
+      flag = true;
+    }
+    if (!flag) {
+      for (auto& para : parameters_) {
+        if (para && para->hasType(type)) {
+          flag = true;
+          break;
+        }
+      }
+    }
+    if (!flag) {
+      for (auto& layer : inputLayers_) {
+        if ((layer.get()->*flagQueryFunc)()) {
+          flag = true;
+        }
+      }
+    }
+  };
+  initFlag(needGradient_, &Layer::needGradient, PARAMETER_GRADIENT);
+}
+
+void Layer::showOutputStats() {
+  MatrixPtr out = getOutputValue();
+  if (!out) return;
+  if (!out->getElementCnt()) {
+    LOG(INFO) << "The number of output of " << config_.name()
+              << " is 0, skip to show the statistics";
+    return;
+  }
+  MatrixPtr outSquare;
+  if (dynamic_cast<GpuSparseMatrix*>(out.get())) {
+    GpuSparseMatrix* tmp = dynamic_cast<GpuSparseMatrix*>(out.get());
+    outSquare = std::make_shared<CpuSparseMatrix>(tmp->getHeight(),
+                                                  tmp->getWidth(),
+                                                  tmp->getElementCnt(),
+                                                  tmp->getValueType(),
+                                                  tmp->getFormat());
+  } else {
+    outSquare = out->clone();
+  }
+  outSquare->copyFrom(*out, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+
+  real mean = outSquare->getSum() / out->getElementCnt();
+  real min;
+  real max;
+  if (dynamic_cast<CpuSparseMatrix*>(outSquare.get())) {
+    auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
+    min = tmpMat->getMin();
+    max = tmpMat->getMax();
+    tmpMat->square2();
+    LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
+  } else {
+    min = outSquare->getMin();
+    max = outSquare->getMax();
+    outSquare->square2();
+  }
+  real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
+  std = std > 0 ? std : 0;
+  LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean
+            << ", "
+            << "std=" << std << ", "
+            << "min=" << min << ", "
+            << "max=" << max;
+}
+
+void Layer::forwardActivation() {
+  /* activation */
+  auto status = activation_->forward(output_);
+  status.check();
+
+  /* dropout */
+  if (config_.drop_rate() > 0) {
+    forwardDropOut();
+    CHECK_NE(activation_->getName(), "softmax")
+        << "Softmax activation cannot be used with Dropout";
+  }
+
+  if (FLAGS_show_layer_stat) {
+    showOutputStats();
+  }
+}
+
+void Layer::backwardActivation() {
+  /* Do error clipping */
+  if (config_.error_clipping_threshold() > 0.0f) {
+    if (FLAGS_log_error_clipping) {
+      VectorPtr outGradVec = Vector::create(
+          output_.grad->getData(), output_.grad->getElementCnt(), useGpu_);
+      real maxAbsGrad = outGradVec->getAbsMax();
+      if (maxAbsGrad > config_.error_clipping_threshold()) {
+        real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize();
+        LOG(INFO) << " layer=" << config_.name() << " need clipping,"
+                  << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad;
+      }
+    }
+    output_.grad->clip(-config_.error_clipping_threshold(),
+                       config_.error_clipping_threshold());
+  }
+
+  /* Do dropout for delta*/
+  if (config_.drop_rate() > 0 && passType_ != PASS_TEST) {
+    MatrixPtr oGrad = getOutputGrad();
+    oGrad->dotMul(*oGrad, *dropOutMask_);
+  }
+
+  auto status = activation_->backward(output_);
+  status.check();
+}
+
+void Layer::forwardDropOut() {
+  auto& outV = getOutputValue();
+
+  if (passType_ == PASS_TRAIN) {
+    // new dropOutMask_ if dropOutMask_ is null ptr
+    Matrix::resizeOrCreate(dropOutMask_,
+                           outV->getHeight(),
+                           outV->getWidth(),
+                           false,
+                           useGpu(deviceId_));
+    dropOutMask_->randomizeUniform();  // generate a uniform random matrix
+    dropOutMask_->biggerThanScalar(config_.drop_rate());  // random mask
+    outV->dotMul(*outV, *dropOutMask_);                   // dropout
+  } else if (passType_ == PASS_GC) {
+    // only initialize once
+    if (!dropOutMask_) {
+      dropOutMask_ = Matrix::create(
+          outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_));
+      // We use cpu matrix to generate mask so that the mask
+      // will be same for both gpu version and cpu version.
+      // This will help unittest to make sure they have same result.
+      MatrixPtr tmpMask = Matrix::create(outV->getHeight(), outV->getWidth());
+      tmpMask->randomizeUniform();  // generate a uniform random matrix
+      tmpMask->biggerThanScalar(config_.drop_rate());  // random mask
+      dropOutMask_->copyFrom(*tmpMask);
+    }
+    outV->dotMul(*outV, *dropOutMask_);
+  } else {  // passType == PASS_TEST
+    outV->mulScalar(1.0 - config_.drop_rate());
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Layer.h b/paddle/legacy/gserver/layers/Layer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7ff76decea9a448acfcdef1c81a68b5a823cc56
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Layer.h
@@ -0,0 +1,512 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/function/Function.h"
+#include "paddle/legacy/gserver/activations/ActivationFunction.h"
+#include "paddle/legacy/math/CpuSparseMatrix.h"
+#include "paddle/legacy/parameter/Argument.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/parameter/Weight.h"
+#include "paddle/legacy/utils/ClassRegistrar.h"
+#include "paddle/legacy/utils/Util.h"
+
+/// Macro for registering a layer type.
+/// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
+#define REGISTER_LAYER(__type_name, __class_name) \
+  static InitFunction __reg_type_##__type_name(   \
+      []() { Layer::registrar_.registerClass<__class_name>(#__type_name); })
+
+#define REGISTER_LAYER_CREATE_FUNC(__type_name, createFunction) \
+  static InitFunction __reg_type_##__type_name(                 \
+      []() { Layer::registrar_.registerClass(#__type_name, createFunction); })
+
+namespace paddle {
+
+class Layer;
+typedef std::shared_ptr<Layer> LayerPtr;
+typedef std::map<std::string, LayerPtr> LayerMap;
+class NeuralNetwork;
+
+/// layer state, used for RNN and LSTM layers
+struct LayerState {
+  std::vector<MatrixPtr> value;
+};
+typedef std::shared_ptr<LayerState> LayerStatePtr;
+
+/// Paddle device ID, MKLDNN is -2, CPU is -1
+enum PADDLE_DEVICE_ID {
+  MKLDNN_DEVICE = -2,
+  CPU_DEVICE = -1,
+};
+
+/**
+ * @brief Base class for layer.
+ * Define necessary variables and functions for every layer.
+ */
+class Layer {
+ protected:
+  /// Layer config
+  LayerConfig config_;
+  /// whether to use GPU
+  bool useGpu_;
+  /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
+  int deviceId_;
+  /// Input layers
+  std::vector<LayerPtr> inputLayers_;
+  /// Argument of input layers
+  std::vector<std::string> inputArgument_;
+
+  /// Parameter for each input layer.
+  /// Parameters_[i] is nullptr if inputLayers_[i] does not need parameter.
+  std::vector<ParameterPtr> parameters_;
+
+  /// nullptr if bias is not needed.
+  ParameterPtr biasParameter_;
+
+  /// Output
+  Argument output_;
+  /// Several outputs stored on different devices, used in 'parallel_nn' case,
+  /// and record them by deviceId_.
+  /// Also used in 'use_mkldnn' case.
+  std::vector<Argument> outputOtherDevice_;
+  /// If there are several outputs, map them by each name.
+  /// MKLDNNLayer use it only to merge output grad
+  std::map<std::string, Argument*> outputMap_;
+  /// Used to merge grad on different devices.
+  MatrixPtr tmpGrad_;
+
+  std::unique_ptr<ActivationFunction> activation_;
+
+  /// Current passType, PASS_TRAIN or PASS_TEST
+  PassType passType_;
+
+  /// Random 0-1 matrix for dropOut
+  MatrixPtr dropOutMask_;
+
+  /// Whether the layer need to compute gradient
+  bool needGradient_;
+  /// Whether the layer need to compute re-sequence information
+  bool needSequenceInfo_;
+
+  /// Mark input grad in(true) or out(false) of backward function.
+  std::vector<bool> markInBackward_;
+
+  /// Layer forward function
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Layer backward function
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
+
+ public:
+  /**
+   * Wait until all input value ready.
+   * Called before Layer::forward() function.
+   */
+  virtual void waitInputValue();
+
+  /**
+   * Copy layer's output_ to other device.
+   * If output layer is in other device, called after Layer::forward() function.
+   */
+  virtual void copyOutputToOtherDevice();
+
+  /**
+   * Wait until all output grad ready and merge them to output_.grad.
+   * Called before Layer::backward() function.
+   */
+  virtual void waitAndMergeOutputGrad();
+
+  /**
+   * Notify previous layer the output grad ready.
+   * Called after Layer::backward() function.
+   */
+  virtual void markAllInputGrad();
+
+ protected:
+  /**
+   * Create layer function. Function is called in forward or backward.
+   * \param function, Layer::forward_ or Layer::backward_
+   * \param name, function name
+   * \param config, initialization configuration for the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
+
+  /**
+   * Notify specified layer the output grad ready.
+   * Called in the backward function.
+   * If do mark input grad in the backward function, you should to ensure
+   * that all input grad will be marked in the backward function.
+   */
+  void markInputGrad(int inputIndex);
+
+  /**
+   * Get the argument of input layer.
+   */
+  const Argument& getInput(size_t inputIndex) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId_);
+  }
+
+  /**
+   * Get the argument of input layer.
+   */
+  const Argument& getInput(const Layer& inputLayer) const {
+    return inputLayer.getOutput(deviceId_);
+  }
+
+  /**
+   * Get the argument of input layer with deviceId.
+   */
+  const Argument& getInput(size_t inputIndex, int deviceId) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId);
+  }
+
+  /**
+   * Get the forward-input value.
+   */
+  const MatrixPtr& getInputValue(int inputIndex) {
+    return inputLayers_[inputIndex]->getOutput(deviceId_).value;
+  }
+
+  /**
+   * Get the forward-input value.
+   */
+  const MatrixPtr& getInputValue(const Layer& inputLayer) {
+    return inputLayer.getOutput(deviceId_).value;
+  }
+
+  /**
+   * Get the forward-input value with deviceId.
+   */
+  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).value;
+  }
+
+  /**
+   * Get the forward-input grad.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex) {
+    return inputLayers_[inputIndex]->getOutput(deviceId_).grad;
+  }
+
+  /**
+   * Get the forward-input grad.
+   */
+  const MatrixPtr& getInputGrad(const Layer& inputLayer) {
+    return inputLayer.getOutput(deviceId_).grad;
+  }
+
+  /**
+   * Get the forward-input grad.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+  }
+
+  /**
+   * Get the forward-input label.
+   */
+  const IVectorPtr& getInputLabel(const Layer& inputLayer) {
+    return inputLayer.getOutput(deviceId_).ids;
+  }
+
+  /**
+   * Change the size of output (value, grad).
+   * Reset to value zero if isValueClean = true,
+   * Reset to grad zero if isGradClean = true.
+   */
+  void resetSpecifyOutput(Argument& output,
+                          size_t height,
+                          size_t width,
+                          bool isValueClean,
+                          bool isGradClean);
+
+  /**
+   * Add output argument to other devices.
+   */
+  void addOutputArgument(int deviceId);
+
+ public:
+  explicit Layer(const LayerConfig& config, bool useGpu = FLAGS_use_gpu);
+  virtual ~Layer() {}
+
+  /// Register a Layer
+  static ClassRegistrar<Layer, LayerConfig> registrar_;
+
+  /**
+   * Get the flag whether layer need to compute gradient.
+   */
+  bool needGradient() const { return needGradient_; }
+
+  /**
+   * Set the flag whether layer need to compute gradient.
+   */
+  void setNeedGradient(bool need) { needGradient_ = need; }
+
+  /**
+   * Set the flag whether layer need to re-compute sequence information,
+   * which includes sequenceStartPositions or subSequenceStartPositions.
+   */
+  void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; }
+
+  /**
+   * Get layer's name.
+   */
+  const std::string& getName() const { return config_.name(); }
+
+  /**
+   * Get layer's type.
+   */
+  const std::string& getType() const { return config_.type(); }
+
+  /**
+   * Get layer's size.
+   */
+  size_t getSize() const { return config_.size(); }
+
+  /**
+   * Get layer's deviceId.
+   */
+  int getDeviceId() const { return deviceId_; }
+
+  /**
+   * Add the inputLayer.
+   */
+  void addPrev(LayerPtr l) { inputLayers_.push_back(l); }
+
+  /**
+   * Get the size of inputLayer[i].
+   */
+  const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; }
+
+  /**
+   * Get the forward-output value.
+   */
+  const MatrixPtr& getOutputValue() { return output_.value; }
+
+  /**
+   * Get the forward-output label.
+   */
+  const IVectorPtr& getOutputLabel() { return output_.ids; }
+
+  /**
+   * Get the backward-Loss value.
+   */
+  const MatrixPtr& getOutputGrad() { return output_.grad; }
+  /**
+   * If layer has multi-output, set output into outputMap_.
+   */
+  void setOutput(const std::string& name, Argument* output) {
+    outputMap_[name] = output;
+  }
+
+  /**
+   * Get the output map size, if layer has multi-output.
+   */
+  size_t getOutputMapSize() { return outputMap_.size(); }
+
+  /**
+   * Get the output based on layer's name.
+   */
+  Argument& getOutput(const std::string& str = "") {
+    if (str == "") {
+      return output_;
+    } else {
+      auto output = outputMap_.find(str);
+      if (output != outputMap_.end()) {
+        return *output->second;
+      } else {
+        LOG(FATAL) << "No specific output " << str;
+        return *((Argument*)nullptr);
+      }
+    }
+  }
+
+  /**
+   * Get the output based on deviceId.
+   */
+  const Argument& getOutput(int deviceId) const {
+    if (deviceId == getDeviceId()) {
+      return output_;
+    } else {
+      for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+        if (outputOtherDevice_[i].deviceId == deviceId) {
+          return outputOtherDevice_[i];
+        }
+      }
+
+      LOG(FATAL) << "No specific device output ";
+      return *((Argument*)nullptr);
+    }
+  }
+
+  /**
+   * Get layer's parameters.
+   */
+  const std::vector<ParameterPtr>& getParameters() { return parameters_; }
+
+  /**
+   * Get layer's bias-parameters.
+   */
+  const ParameterPtr& getBiasParameter() { return biasParameter_; }
+
+  /**
+   * Create pointer of layer.
+   */
+  static LayerPtr create(const LayerConfig& config);
+
+  /**
+   * Resize the output matrix size.
+   */
+  void resizeOutput(size_t height, size_t width);
+
+  /**
+   * Resize the output matrix size,
+   * and reset value to zero.
+   */
+  void reserveOutput(size_t height, size_t width);
+
+  /**
+   * Resize the output matrix size,
+   * and reset value and grad to zero.
+   */
+  void resetOutput(size_t height, size_t width);
+
+  /**
+   * Clear the gradient of output.
+   */
+  void zeroGrad();
+
+  /**
+   * Intialization.
+   * For example, adding input layers from layerMap and parameterMap.
+   */
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  /**
+   * Intialization for sub network if there has sub network.
+   * @param rootNetwork root network
+   * @param config model config
+   * @param parameterTypes parameter's type
+   * @param useGpu whether to use gpu or not
+   */
+  virtual void initSubNetwork(NeuralNetwork* rootNetwork,
+                              const ModelConfig& config,
+                              const std::vector<ParameterType>& parameterTypes,
+                              bool useGpu) {}
+
+  /**
+   * @brief Access SubNetwork Object.
+   *        If subnetwork exists, then invoke callback with subnetwrk.
+   * @param callback if sub-network is exist, the callback is invoked.
+   */
+  virtual void accessSubNetwork(
+      const std::function<void(NeuralNetwork&)>& callback) {}
+
+  /**
+   * If use sparse row matrix as parameter,
+   * prefetch feature ids in input label.
+   */
+  virtual void prefetch() {}
+
+  /**
+   * Forward propagation.
+   * All inherited implementation should call Layer::foward() function.
+   */
+  virtual void forward(PassType passType) {
+    passType_ = passType;
+    if (!inputLayers_.empty() && needSequenceInfo_) {
+      const Argument& input = getInput(0);
+      output_.sequenceStartPositions = input.sequenceStartPositions;
+      output_.subSequenceStartPositions = input.subSequenceStartPositions;
+      output_.cpuSequenceDims = input.cpuSequenceDims;
+    }
+  }
+
+  /**
+   * Reset the internal state variables.
+   * Allocate them if they have not been allocated.
+   * This function need to called before Layer::forward() for generating
+   * sequence.
+   *
+   * This is used for sequence generation. When generating sequence, the
+   * calculation at current timestamp depends on the state from previous
+   * timestamp. The model needs to keep the information about the previous
+   * timestamp in the state variables. Layers such as RecurrentLayer,
+   * LstmLayer and ContextLayer have state variables.
+   */
+  virtual void resetState() {}
+
+  /**
+   * Set layer state.
+   */
+  virtual void setState(LayerStatePtr state) {}
+
+  /**
+   * Get layer state.
+   * @return A copy of internal state.
+   */
+  virtual LayerStatePtr getState() { return nullptr; }
+
+  /**
+   * Show output state.
+   */
+  void showOutputStats();
+
+  /**
+   * Backward propagation.
+   * Should only be called after Layer::forward() function.
+   */
+  virtual void backward(const UpdateCallback& callback = nullptr) = 0;
+
+  /**
+   * One pass is finished.
+   */
+  virtual void onPassEnd() {}
+
+ protected:
+  /**
+   * Forward of activation function.
+   */
+  void forwardActivation();
+  /**
+   * Backward of activation function.
+   */
+  void backwardActivation();
+  /**
+   * Forward of dropOut.
+   */
+  void forwardDropOut();
+  /**
+   * Initilize the needGradient_ flag.
+   */
+  void initNeedFlags();
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/legacy/gserver/layers/LinearChainCRF.cpp
similarity index 100%
rename from paddle/gserver/layers/LinearChainCRF.cpp
rename to paddle/legacy/gserver/layers/LinearChainCRF.cpp
diff --git a/paddle/legacy/gserver/layers/LinearChainCRF.h b/paddle/legacy/gserver/layers/LinearChainCRF.h
new file mode 100644
index 0000000000000000000000000000000000000000..65e23905435da24a1a7554c30e33d303b05aef69
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LinearChainCRF.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+class LinearChainCRF {
+ public:
+  /**
+   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
+   * The first numClasses values of para are for starting weights (\f$a\f$).
+   * The next numClasses values of para are for ending weights (\f$b\f$),
+   * The remaning values are for transition weights (\f$w\f$).
+   *
+   * The probability of a state sequence s of length \f$L\f$ is defined as:
+   * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
+   *                  + \sum_{l=1}^L x_{s_l}
+   *                  + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
+   * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over
+   * all possible
+   * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
+   */
+  LinearChainCRF(int numClasses, real* para);
+
+  /**
+   * Calculate the negative log likelihood of s given x.
+   * The size of x must be length * numClasses. Each consecutive numClasses
+   * values are the features for one time step.
+   */
+  real forward(real* x, int* s, int length);
+
+  /**
+   * Calculate the gradient with respect to x, a, b, and w.
+   * backward() can only be called after a corresponding call to forward() with
+   * the same x, s and length.
+   * The gradient with respect to a, b, and w will not be calculated if
+   * needWGrad is false.
+   * @note Please call getWGrad() and getXGrad() to get the gradient with
+   * respect to (a, b, w) and x respectively.
+   */
+  void backward(real* x, int* s, int length, bool needWGrad);
+
+  /**
+   * Find the most probable sequence given x. The result will be stored in s.
+   */
+  void decode(real* x, int* s, int length);
+
+  /*
+   * Return the gradient with respect to (a, b, w). It can only be called after
+   * a corresponding call to backward().
+   */
+  MatrixPtr getWGrad() { return matWGrad_; }
+
+  /*
+   * Return the gradient with respect to x. It can only be called after a
+   * corresponding call to backward().
+   */
+  MatrixPtr getXGrad() { return matGrad_; }
+
+ protected:
+  int numClasses_;
+  MatrixPtr a_;
+  MatrixPtr b_;
+  MatrixPtr w_;
+  MatrixPtr matWGrad_;
+  MatrixPtr da_;
+  MatrixPtr db_;
+  MatrixPtr dw_;
+  MatrixPtr ones_;
+
+  MatrixPtr expX_;
+  MatrixPtr matGrad_;
+  MatrixPtr alpha_;
+  MatrixPtr beta_;
+  MatrixPtr maxX_;
+  MatrixPtr expW_;
+
+  // track_(k,i) = j means that the best sequence at time k for class i comes
+  // from the sequence at time k-1 for class j
+  IVectorPtr track_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/LinearChainCTC.cpp b/paddle/legacy/gserver/layers/LinearChainCTC.cpp
similarity index 100%
rename from paddle/gserver/layers/LinearChainCTC.cpp
rename to paddle/legacy/gserver/layers/LinearChainCTC.cpp
diff --git a/paddle/legacy/gserver/layers/LinearChainCTC.h b/paddle/legacy/gserver/layers/LinearChainCTC.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6c4c7bfe0cdb1bbcafbf5b847ea592eef02794a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LinearChainCTC.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+class LinearChainCTC {
+ public:
+  LinearChainCTC(int numClasses, bool normByTimes);
+
+  // Calculate the negative log probability as loss
+  real forward(real* softmaxSeq,
+               int softmaxSeqLen,
+               int* labelSeq,
+               int labelSeqLen);
+
+  // calculate the gradient
+  void backward(real* softmaxSeq,
+                real* softmaxSeqGrad,
+                int* labelSeq,
+                int labelSeqLen);
+
+ protected:
+  int numClasses_, blank_, totalSegments_, totalTime_;
+  bool normByTimes_;
+  bool isInvalid_;
+
+  MatrixPtr logActs_, forwardVars_, backwardVars_, gradTerms_;
+
+  real logProb_;
+
+  void segmentRange(int& start, int& end, int time);
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmCompute.cpp b/paddle/legacy/gserver/layers/LstmCompute.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70f08e1d4efd2223e7ddec1b104e4ee63fc34de5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LstmCompute.cpp
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "LstmCompute.h"
+#include "hl_recurrent_apply.cuh"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+void LstmCompute::init(LayerConfig &config) {
+  activeNode_ = hlActiveType(config.active_type());
+  activeGate_ = hlActiveType(config.active_gate_type());
+  activeState_ = hlActiveType(config.active_state_type());
+}
+
+template <>
+void LstmCompute::forwardOneSequence<0>(hl_lstm_value value, int frameSize) {
+  hl_cpu_lstm_forward(hppl::forward::lstm(),
+                      value,
+                      frameSize,
+                      activeNode_,
+                      activeGate_,
+                      activeState_);
+}
+
+template <>
+void LstmCompute::backwardOneSequence<0>(hl_lstm_value value,
+                                         hl_lstm_grad grad,
+                                         int frameSize) {
+  hl_cpu_lstm_backward(hppl::backward::lstm(),
+                       value,
+                       grad,
+                       frameSize,
+                       activeNode_,
+                       activeGate_,
+                       activeState_);
+}
+
+template <>
+void LstmCompute::forwardBatch<0>(hl_lstm_value value,
+                                  int frameSize,
+                                  int batchSize) {
+  for (int b = 0; b < batchSize; b++) {
+    forwardOneSequence<0>(value, frameSize);
+
+    value.gateValue += frameSize * 4;
+    value.stateValue += frameSize;
+    value.stateActiveValue += frameSize;
+    value.outputValue += frameSize;
+    if (value.prevStateValue) {
+      value.prevStateValue += frameSize;
+    }
+  }
+}
+
+template <>
+void LstmCompute::backwardBatch<0>(hl_lstm_value value,
+                                   hl_lstm_grad grad,
+                                   int frameSize,
+                                   int batchSize) {
+  for (int b = 0; b < batchSize; b++) {
+    backwardOneSequence<0>(value, grad, frameSize);
+
+    value.gateValue += frameSize * 4;
+    value.stateValue += frameSize;
+    value.stateActiveValue += frameSize;
+    value.outputValue += frameSize;
+    if (value.prevStateValue) {
+      value.prevStateValue += frameSize;
+    }
+
+    grad.gateGrad += frameSize * 4;
+    grad.stateGrad += frameSize;
+    grad.stateActiveGrad += frameSize;
+    grad.outputGrad += frameSize;
+    if (grad.prevStateGrad) {
+      grad.prevStateGrad += frameSize;
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/LstmCompute.cu b/paddle/legacy/gserver/layers/LstmCompute.cu
similarity index 100%
rename from paddle/gserver/layers/LstmCompute.cu
rename to paddle/legacy/gserver/layers/LstmCompute.cu
diff --git a/paddle/legacy/gserver/layers/LstmCompute.h b/paddle/legacy/gserver/layers/LstmCompute.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac40c35ef1b0a11e61b5d1b11476ffe7daff6d5e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LstmCompute.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ModelConfig.pb.h"
+#include "hl_gpu.h"
+#include "paddle/legacy/utils/Common.h"
+
+namespace paddle {
+
+class LstmCompute {
+ public:
+  void init(LayerConfig &config);
+
+  /**
+   * LstmLayer batch compute API (forwardBatch, backwardBatch).
+   * If use batch compute api, lstm value(and grad) need to be batch structure.
+   * Compute order:
+   *   forwardBatch:  for 0 <= id < numBatch
+   *   backwardBatch:  for numBatch > id >= 0
+   */
+  template <bool useGpu>
+  void forwardBatch(hl_lstm_value value, int frameSize, int batchSize);
+
+  template <bool useGpu>
+  void backwardBatch(hl_lstm_value value,
+                     hl_lstm_grad grad,
+                     int frameSize,
+                     int batchSize);
+
+  /**
+   * LstmLayer sequence compute API (forwardOneSequence, backwardOneSequence).
+   * Compute order(for each sequence):
+   *   forwardOneSequence:
+   *     if (!reversed) for 0 <= seqId < seqLength
+   *     if (reversed)  for seqLength > seqId >= 0
+   *   backwardOneSequence:
+   *     if (!reversed) for seqLength > seqId >= 0
+   *     if (reversed)  for 0 <= seqId < seqLength
+   */
+  template <bool useGpu>
+  void forwardOneSequence(hl_lstm_value value, int frameSize);
+  template <bool useGpu>
+  void backwardOneSequence(hl_lstm_value value,
+                           hl_lstm_grad grad,
+                           int frameSize);
+
+ public:
+  hl_activation_mode_t activeNode_;
+  hl_activation_mode_t activeGate_;
+  hl_activation_mode_t activeState_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmLayer.cpp b/paddle/legacy/gserver/layers/LstmLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..43a55d8d490faf0049d47bbca6ae1947d13e6be8
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LstmLayer.cpp
@@ -0,0 +1,805 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "LstmLayer.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Stat.h"
+
+DECLARE_bool(prev_batch_state);
+
+namespace paddle {
+
+REGISTER_LAYER(lstmemory, LstmLayer);
+
+bool LstmLayer::init(const LayerMap &layerMap,
+                     const ParameterMap &parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK_EQ(1U, inputLayers_.size());
+  CHECK_EQ(1U, parameters_.size());
+  CHECK_EQ(getSize() * getSize() * 4, parameters_[0]->getSize());
+  CHECK_EQ(getSize() * 7, biasParameter_->getSize());
+  weight_.reset(new Weight(getSize(), getSize() * 4, parameters_[0]));
+  if (biasParameter_.get() != NULL) {
+    bias_.reset(new Weight(1, getSize() * 7, biasParameter_));
+    if (bias_->getW()) {
+      localBias_ = Matrix::create(nullptr,
+                                  /* height= */ 1,
+                                  getSize() * 4,
+                                  /* trans= */ false,
+                                  useGpu_);
+      checkIg_ = Matrix::create(nullptr,
+                                /* height= */ 1,
+                                getSize(),
+                                /* trans= */ false,
+                                useGpu_);
+      checkFg_ = Matrix::create(nullptr,
+                                /* height= */ 1,
+                                getSize(),
+                                /* trans= */ false,
+                                useGpu_);
+      checkOg_ = Matrix::create(nullptr,
+                                /* height= */ 1,
+                                getSize(),
+                                /* trans= */ false,
+                                useGpu_);
+
+      localBias_->setData(bias_->getW()->getData());
+      checkIg_->setData(bias_->getW()->getData() + getSize() * 4);
+      checkFg_->setData(bias_->getW()->getData() + getSize() * 5);
+      checkOg_->setData(bias_->getW()->getData() + getSize() * 6);
+    }
+
+    if (bias_->getWGrad()) {
+      localBiasGrad_ = Matrix::create(nullptr,
+                                      /* height= */ 1,
+                                      getSize() * 4,
+                                      /* trans= */ false,
+                                      useGpu_);
+      checkIgGrad_ = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    getSize(),
+                                    /* trans= */ false,
+                                    useGpu_);
+      checkFgGrad_ = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    getSize(),
+                                    /* trans= */ false,
+                                    useGpu_);
+      checkOgGrad_ = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    getSize(),
+                                    /* trans= */ false,
+                                    useGpu_);
+      localBiasGrad_->setData(bias_->getWGrad()->getData());
+      checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4);
+      checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5);
+      checkOgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 6);
+    }
+  } else {
+    LOG(FATAL) << "Bias should be here.";
+  }
+  reversed_ = config_.reversed();
+
+  // create IdentityActivation for using drop_rate
+  activation_.reset(ActivationFunction::create(""));
+
+  LstmCompute::init(config_);
+  useBatch_ = true;
+  useSeqParallel_ = false;
+  if (useGpu_ && (getSize() == 32 || getSize() == 64)) {
+    useSeqParallel_ = true;
+  }
+
+  return true;
+}
+
+void LstmLayer::resetState() {
+  CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer";
+  Matrix::resizeOrCreate(
+      prevOutput_, 1, getSize(), /* trans= */ false, useGpu_);
+  Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_);
+  prevOutput_->resize(0, getSize());
+  prevState_->resize(0, getSize());
+  if (FLAGS_prev_batch_state) {
+    useBatch_ = true;
+  } else {
+    useBatch_ = false;
+  }
+}
+
+void LstmLayer::setState(LayerStatePtr state) {
+  CHECK(state->value.size() == 2) << "two matrices are expected for LSTM state";
+  prevOutput_->resize(state->value[0]->getHeight(),
+                      state->value[0]->getWidth());
+  prevState_->resize(state->value[1]->getHeight(), state->value[1]->getWidth());
+  prevOutput_->copyFrom(*(state->value[0]));
+  prevState_->copyFrom(*(state->value[1]));
+}
+
+LayerStatePtr LstmLayer::getState() {
+  LayerStatePtr res = std::make_shared<LayerState>();
+  if (prevOutput_->getHeight() && prevOutput_->getWidth()) {
+    res->value.push_back(prevOutput_->clone(0, 0, useGpu_));
+    res->value[0]->copyFrom(*prevOutput_);
+    res->value.push_back(prevState_->clone(0, 0, useGpu_));
+    res->value[1]->copyFrom(*prevState_);
+  } else {
+    MatrixPtr output =
+        Matrix::create(1, getSize(), /* trans= */ false, useGpu_);
+    MatrixPtr state = Matrix::create(1, getSize(), /* trans= */ false, useGpu_);
+    output->resize(0, getSize());
+    state->resize(0, getSize());
+    res->value.push_back(output);
+    res->value.push_back(state);
+  }
+  return res;
+}
+
+void LstmLayer::forward(PassType passType) {
+  REGISTER_TIMER_INFO("LstmFwTimer", getName().c_str());
+  Layer::forward(passType);
+
+  const Argument &input = getInput(0);
+  CHECK(input.sequenceStartPositions);
+  int batchSize = input.getBatchSize();
+  resetOutput(batchSize, getSize());
+  CHECK_EQ(getSize() * 4, input.value->getWidth());
+  size_t numSequences = input.getNumSequences();
+  const int *starts = input.sequenceStartPositions->getData(false);
+  CHECK_EQ(starts[numSequences], batchSize);
+
+  Matrix::resizeOrCreate(gate_.value,
+                         /* height= */ batchSize,
+                         getSize() * 4,
+                         /* trans= */ false,
+                         useGpu_);
+  if (prevOutput_) {
+    size_t prevNumSeq = useBatch_ ? numSequences : 1;
+    if (prevOutput_->getHeight() == 0) {
+      prevOutput_->resize(prevNumSeq, getSize());
+      prevState_->resize(prevNumSeq, getSize());
+      prevOutput_->zeroMem();
+      prevState_->zeroMem();
+    } else {
+      CHECK_EQ(prevOutput_->getHeight(), prevNumSeq)
+          << "the number of sequences must be the same";
+    }
+    Matrix::resizeOrCreate(totalState_,
+                           prevState_->getHeight() + batchSize,
+                           getSize(),
+                           /*trans*/ false,
+                           useGpu_);
+    state_.value = Matrix::create(nullptr,
+                                  /* height= */ batchSize,
+                                  getSize(),
+                                  /* trans= */ false,
+                                  useGpu_);
+    state_.value->setData(totalState_->getData() +
+                          prevState_->getHeight() * getSize());
+  } else {
+    Matrix::resizeOrCreate(state_.value,
+                           /* height= */ batchSize,
+                           getSize(),
+                           /* trans= */ false,
+                           useGpu_);
+  }
+  Matrix::resizeOrCreate(preOutput_.value,
+                         /* height= */ batchSize,
+                         getSize(),
+                         /* trans= */ false,
+                         useGpu_);
+
+  if (!useBatch_) {
+    forwardSequence(batchSize, numSequences, starts, input.value);
+  } else {
+    if (!useSeqParallel_) {
+      forwardBatch(batchSize, numSequences, starts, input.value);
+    } else {
+      const int *starts = input.sequenceStartPositions->getData(useGpu_);
+      forwardSeqParallel(batchSize, numSequences, starts, input.value);
+    }
+  }
+  /*  activation */ { forwardActivation(); }
+}
+
+void LstmLayer::backward(const UpdateCallback &callback) {
+  REGISTER_TIMER_INFO("LstmBwTimer", getName().c_str());
+  /*  Do derivation */ { backwardActivation(); }
+
+  const Argument &input = getInput(0);
+  CHECK(input.sequenceStartPositions);
+  int batchSize = input.getBatchSize();
+  size_t numSequences = input.getNumSequences();
+
+  Matrix::resizeOrCreate(gate_.grad,
+                         /* height= */ batchSize,
+                         getSize() * 4,
+                         /* trans= */ false,
+                         useGpu_);
+  Matrix::resizeOrCreate(state_.grad,
+                         /* height= */ batchSize,
+                         getSize(),
+                         /* trans= */ false,
+                         useGpu_);
+  Matrix::resizeOrCreate(preOutput_.grad,
+                         /* height= */ batchSize,
+                         getSize(),
+                         /* trans= */ false,
+                         useGpu_);
+  state_.grad->zero();
+
+  const int *starts = input.sequenceStartPositions->getData(false);
+  if (!useBatch_) {
+    backwardSequence(batchSize, numSequences, starts, input.grad);
+  } else {
+    if (!useSeqParallel_) {
+      backwardBatch(batchSize, numSequences, starts, input.grad);
+    } else {
+      const int *starts = input.sequenceStartPositions->getData(useGpu_);
+      backwardSeqParallel(batchSize, numSequences, starts, input.grad);
+    }
+  }
+
+  if (bias_) {
+    bias_->getParameterPtr()->incUpdate(callback);
+  }
+  weight_->getParameterPtr()->incUpdate(callback);
+}
+
+void LstmLayer::forwardSequence(int batchSize,
+                                size_t numSequences,
+                                const int *starts,
+                                MatrixPtr inputValue) {
+  REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str());
+  gate_.value->assign(*inputValue);
+  if (bias_) {
+    gate_.value->addBias(*localBias_, 1);
+  }
+
+  hl_lstm_value lstmValue;
+  lstmValue.checkIg = checkIg_->getData();
+  lstmValue.checkFg = checkFg_->getData();
+  lstmValue.checkOg = checkOg_->getData();
+  lstmValue.gateValue = gate_.value->getData();
+  lstmValue.stateValue = state_.value->getData();
+  lstmValue.stateActiveValue = preOutput_.value->getData();
+  lstmValue.outputValue = output_.value->getData();
+  lstmValue.prevStateValue = nullptr;
+  if (reversed_) {
+    lstmValue.gateValue += (batchSize - 1) * getSize() * 4;
+    lstmValue.stateValue += (batchSize - 1) * getSize();
+    lstmValue.stateActiveValue += (batchSize - 1) * getSize();
+    lstmValue.outputValue += (batchSize - 1) * getSize();
+  }
+
+  auto nextFrame = [&lstmValue](bool reversed, int frameSize) {
+    lstmValue.prevStateValue = lstmValue.stateValue;
+    if (!reversed) {
+      lstmValue.gateValue += frameSize * 4;
+      lstmValue.stateValue += frameSize;
+      lstmValue.stateActiveValue += frameSize;
+      lstmValue.outputValue += frameSize;
+    } else {
+      lstmValue.gateValue -= frameSize * 4;
+      lstmValue.stateValue -= frameSize;
+      lstmValue.stateActiveValue -= frameSize;
+      lstmValue.outputValue -= frameSize;
+    }
+  };
+
+  MatrixPtr frameGate = Matrix::create(nullptr,
+                                       /* height= */ 1,
+                                       getSize() * 4,
+                                       /* trans= */ false,
+                                       useGpu_);
+  MatrixPtr frameOutput = Matrix::create(nullptr,
+                                         /* height= */ 1,
+                                         getSize(),
+                                         /* trans= */ false,
+                                         useGpu_);
+
+  if (!reversed_) {
+    if (prevState_) {
+      lstmValue.prevStateValue = prevState_->getData();
+    }
+    if (prevOutput_) {
+      frameGate->setData(lstmValue.gateValue);
+      frameGate->mul(*prevOutput_, *weight_->getW(), 1, 1);
+    }
+  }
+  AsyncGpuBlock asyncGpuBlock;
+  for (size_t n = 0; n < numSequences; ++n) {
+    int length;
+    if (!reversed_) {
+      length = starts[n + 1] - starts[n];
+    } else {
+      length = starts[numSequences - n] - starts[numSequences - n - 1];
+    }
+    for (int l = 0; l < length; ++l) {
+      if (useGpu_) {
+        LstmCompute::forwardOneSequence<1>(lstmValue, getSize());
+      } else {
+        LstmCompute::forwardOneSequence<0>(lstmValue, getSize());
+      }
+
+      if (l != length - 1) {
+        frameOutput->setData(lstmValue.outputValue);
+        nextFrame(reversed_, getSize());
+        frameGate->setData(lstmValue.gateValue);
+        frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
+      }
+    }
+    if (n != numSequences - 1) {
+      frameOutput->setData(lstmValue.outputValue);
+      nextFrame(reversed_, getSize());
+      frameGate->setData(lstmValue.gateValue);
+      if (!reversed_) {
+        if (!prevState_) lstmValue.prevStateValue = nullptr;
+        if (prevOutput_) {
+          frameGate->mul(*frameOutput, *weight_->getW(), 1, 1);
+        }
+      } else {
+        lstmValue.prevStateValue = nullptr;
+      }
+    }
+  }
+
+  if (!reversed_) {
+    if (prevState_) {
+      prevState_->assign(*state_.value->subMatrix(batchSize - 1, 1));
+    }
+    if (prevOutput_) {
+      prevOutput_->assign(*output_.value->subMatrix(batchSize - 1, 1));
+    }
+  }
+}
+
+void LstmLayer::backwardSequence(int batchSize,
+                                 size_t numSequences,
+                                 const int *starts,
+                                 MatrixPtr inputGrad) {
+  REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str());
+  MatrixPtr weightT = weight_->getW()->getTranspose();
+
+  hl_lstm_value lstmValue;
+  hl_lstm_grad lstmGrad;
+  lstmValue.checkIg = checkIg_->getData();
+  lstmValue.checkFg = checkFg_->getData();
+  lstmValue.checkOg = checkOg_->getData();
+  lstmValue.gateValue = gate_.value->getData();
+  lstmValue.stateValue = state_.value->getData();
+  lstmValue.stateActiveValue = preOutput_.value->getData();
+  lstmValue.outputValue = nullptr;
+
+  if (bias_->getWGrad()) {
+    lstmGrad.checkIgGrad = checkIgGrad_->getData();
+    lstmGrad.checkFgGrad = checkFgGrad_->getData();
+    lstmGrad.checkOgGrad = checkOgGrad_->getData();
+  } else {
+    lstmGrad.checkIgGrad = nullptr;
+    lstmGrad.checkFgGrad = nullptr;
+    lstmGrad.checkOgGrad = nullptr;
+  }
+  lstmGrad.gateGrad = gate_.grad->getData();
+  lstmGrad.stateGrad = state_.grad->getData();
+  lstmGrad.stateActiveGrad = nullptr;
+  lstmGrad.outputGrad = output_.grad->getData();
+
+  if (!reversed_) {
+    lstmValue.gateValue += (batchSize - 1) * getSize() * 4;
+    lstmGrad.gateGrad += (batchSize - 1) * getSize() * 4;
+    lstmValue.stateValue += (batchSize - 1) * getSize();
+    lstmGrad.stateGrad += (batchSize - 1) * getSize();
+    lstmValue.stateActiveValue += (batchSize - 1) * getSize();
+    lstmGrad.outputGrad += (batchSize - 1) * getSize();
+    lstmValue.prevStateValue = lstmValue.stateValue - getSize();
+    lstmGrad.prevStateGrad = lstmGrad.stateGrad - getSize();
+  } else {
+    lstmValue.prevStateValue = lstmValue.stateValue + getSize();
+    lstmGrad.prevStateGrad = lstmGrad.stateGrad + getSize();
+  }
+
+  auto nextFrame = [&lstmValue, &lstmGrad](bool reversed, int frameSize) {
+    if (reversed) {
+      lstmValue.gateValue += frameSize * 4;
+      lstmGrad.gateGrad += frameSize * 4;
+      lstmValue.stateValue += frameSize;
+      lstmGrad.stateGrad += frameSize;
+      lstmValue.stateActiveValue += frameSize;
+      lstmGrad.outputGrad += frameSize;
+      lstmValue.prevStateValue = lstmValue.stateValue + frameSize;
+      lstmGrad.prevStateGrad = lstmGrad.stateGrad + frameSize;
+    } else {
+      lstmValue.gateValue -= frameSize * 4;
+      lstmGrad.gateGrad -= frameSize * 4;
+      lstmValue.stateValue -= frameSize;
+      lstmGrad.stateGrad -= frameSize;
+      lstmValue.stateActiveValue -= frameSize;
+      lstmGrad.outputGrad -= frameSize;
+      lstmValue.prevStateValue = lstmValue.stateValue - frameSize;
+      lstmGrad.prevStateGrad = lstmGrad.stateGrad - frameSize;
+    }
+  };
+
+  MatrixPtr frameGate = Matrix::create(nullptr,
+                                       /* height= */ 1,
+                                       getSize() * 4,
+                                       /* trans= */ false,
+                                       useGpu_);
+  MatrixPtr frameOutput = Matrix::create(nullptr,
+                                         /* height= */ 1,
+                                         getSize(),
+                                         /* trans= */ false,
+                                         useGpu_);
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    for (size_t n = 0; n < numSequences; ++n) {
+      int length;
+      int start;
+      if (reversed_) {
+        length = starts[n + 1] - starts[n];
+        start = starts[n];
+      } else {
+        length = starts[numSequences - n] - starts[numSequences - n - 1];
+        start = starts[numSequences - n - 1];
+      }
+      for (int l = 0; l < length; ++l) {
+        if (l == length - 1) {
+          lstmValue.prevStateValue = nullptr;
+          lstmGrad.prevStateGrad = nullptr;
+        }
+        if (useGpu_) {
+          LstmCompute::backwardOneSequence<1>(lstmValue, lstmGrad, getSize());
+        } else {
+          LstmCompute::backwardOneSequence<0>(lstmValue, lstmGrad, getSize());
+        }
+        if (l != length - 1) {
+          frameGate->setData(lstmGrad.gateGrad);
+          nextFrame(reversed_, getSize());
+          frameOutput->setData(lstmGrad.outputGrad);
+          frameOutput->mul(*frameGate, *weightT, 1, 1);
+        } else {
+          nextFrame(reversed_, getSize());
+        }
+      }
+
+      if (weight_->getWGrad()) {
+        if (!reversed_) {
+          weight_->getWGrad()->mul(
+              *output_.value->subMatrix(start, length - 1)->getTranspose(),
+              *gate_.grad->subMatrix(start + 1, length - 1),
+              1,
+              1);
+        } else {
+          weight_->getWGrad()->mul(
+              *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
+              *gate_.grad->subMatrix(start, length - 1),
+              1,
+              1);
+        }
+      }
+    }
+  }
+
+  if (inputGrad) {
+    inputGrad->add(*gate_.grad);
+  }
+  if (bias_ && bias_->getWGrad()) {
+    localBiasGrad_->collectBias(*gate_.grad, 1);
+  }
+}
+
+void LstmLayer::forwardBatch(int batchSize,
+                             size_t numSequences,
+                             const int *starts,
+                             MatrixPtr inputValue) {
+  REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str());
+
+  hl_lstm_value lstmValue;
+  lstmValue.checkIg = checkIg_->getData();
+  lstmValue.checkFg = checkFg_->getData();
+  lstmValue.checkOg = checkOg_->getData();
+
+  if (!batchValue_) {
+    batchValue_.reset(new SequenceToBatch(useGpu_));
+  }
+  batchValue_->resizeOrCreateBatch(
+      batchSize, numSequences, starts, reversed_, prevOutput_ ? true : false);
+
+  batchValue_->resizeOrCreate(*output_.value);
+  batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
+  if (bias_) {
+    gate_.value->addBias(*localBias_, 1);
+  }
+
+  {
+    int numBatch = batchValue_->getNumBatch();
+    int batchSize = 0;
+    AsyncGpuBlock asyncGpuBlock;
+    if (prevState_) {
+      lstmValue.prevStateValue = totalState_->getData();
+    } else {
+      lstmValue.prevStateValue = nullptr;
+    }
+    for (int n = 0; n < numBatch; n++) {
+      MatrixPtr outputValue = batchValue_->getBatchValue(n);
+      MatrixPtr gateValue = batchValue_->getBatchValue(*gate_.value, n);
+      batchSize = outputValue->getHeight();
+
+      if (n != 0) {
+        MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize);
+        gateValue->mul(*batch1, *weight_->getW(), 1, 1);
+      } else if (prevOutput_) {
+        Matrix::resizeOrCreate(prevBatchOutput2_,
+                               gateValue->getHeight(),
+                               getSize(),
+                               false,
+                               useGpu_);
+        batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_);
+        gateValue->mul(*prevBatchOutput2_, *weight_->getW(), 1, 1);
+
+        batchValue_->prevOutput2Batch(*prevState_,
+                                      *totalState_->subMatrix(0, numSequences));
+      }
+
+      lstmValue.gateValue = gateValue->getData();
+      lstmValue.outputValue = outputValue->getData();
+      lstmValue.stateValue =
+          batchValue_->getBatchValue(*state_.value, n)->getData();
+      lstmValue.stateActiveValue =
+          batchValue_->getBatchValue(*preOutput_.value, n)->getData();
+      {
+        if (useGpu_) {
+          LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize);
+        } else {
+          LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize);
+        }
+      }
+      lstmValue.prevStateValue = lstmValue.stateValue;
+    }
+  }
+  {
+    REGISTER_TIMER_INFO("batchToSeq", getName().c_str());
+    batchValue_->copyBackSeq(*output_.value);
+  }
+  if (prevOutput_) {
+    getPrevBatchOutput(numSequences);
+    getPrevBatchState(numSequences);
+  }
+}
+
+void LstmLayer::getPrevBatchOutput(size_t numSequences) {
+  prevOutput_->resize(numSequences, getSize());
+  batchValue_->getSeqOutputFromBatch(*prevOutput_,
+                                     *batchValue_->getBatchValue());
+}
+
+void LstmLayer::getPrevBatchState(size_t numSequences) {
+  prevState_->resize(numSequences, getSize());
+  batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value);
+}
+
+void LstmLayer::backwardBatch(int batchSize,
+                              size_t numSequences,
+                              const int *starts,
+                              MatrixPtr inputGrad) {
+  REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str());
+
+  hl_lstm_value lstmValue;
+  lstmValue.checkIg = checkIg_->getData();
+  lstmValue.checkFg = checkFg_->getData();
+  lstmValue.checkOg = checkOg_->getData();
+
+  hl_lstm_grad lstmGrad;
+  lstmGrad.stateActiveGrad = preOutput_.grad->getData();
+
+  if (bias_->getWGrad()) {
+    lstmGrad.checkIgGrad = checkIgGrad_->getData();
+    lstmGrad.checkFgGrad = checkFgGrad_->getData();
+    lstmGrad.checkOgGrad = checkOgGrad_->getData();
+  } else {
+    lstmGrad.checkIgGrad = nullptr;
+    lstmGrad.checkFgGrad = nullptr;
+    lstmGrad.checkOgGrad = nullptr;
+  }
+
+  if (!batchGrad_) {
+    batchGrad_.reset(new SequenceToBatch(useGpu_));
+  }
+  batchGrad_->shareIndexWith(*batchValue_);
+
+  {
+    REGISTER_TIMER_INFO("seqToBatch", getName().c_str());
+    batchGrad_->copyFromSeq(*output_.grad);
+  }
+
+  {
+    MatrixPtr weightT = weight_->getW()->getTranspose();
+    int numBatch = batchGrad_->getNumBatch();
+    int batchSize = 0;
+    AsyncGpuBlock asyncGpuBlock;
+    for (int n = (int)numBatch - 1; n >= 0; n--) {
+      MatrixPtr outputGrad = batchGrad_->getBatchValue(n);
+      MatrixPtr gateGrad = batchGrad_->getBatchValue(*gate_.grad, n);
+
+      lstmValue.gateValue =
+          batchGrad_->getBatchValue(*gate_.value, n)->getData();
+      lstmValue.stateValue =
+          batchGrad_->getBatchValue(*state_.value, n)->getData();
+      lstmValue.stateActiveValue =
+          batchGrad_->getBatchValue(*preOutput_.value, n)->getData();
+      lstmGrad.stateGrad =
+          batchGrad_->getBatchValue(*state_.grad, n)->getData();
+      lstmGrad.gateGrad = gateGrad->getData();
+      lstmGrad.outputGrad = outputGrad->getData();
+      {
+        batchSize = outputGrad->getHeight();
+        if (n != 0) {
+          lstmValue.prevStateValue =
+              batchGrad_->getBatchValue(*state_.value, n - 1)->getData();
+          lstmGrad.prevStateGrad =
+              batchGrad_->getBatchValue(*state_.grad, n - 1)->getData();
+        } else {
+          if (prevState_) {
+            lstmValue.prevStateValue = totalState_->getData();
+            lstmGrad.prevStateGrad = nullptr;
+          } else {
+            lstmValue.prevStateValue = nullptr;
+            lstmGrad.prevStateGrad = nullptr;
+          }
+        }
+        if (useGpu_) {
+          LstmCompute::backwardBatch<1>(
+              lstmValue, lstmGrad, getSize(), batchSize);
+        } else {
+          LstmCompute::backwardBatch<0>(
+              lstmValue, lstmGrad, getSize(), batchSize);
+        }
+      }
+
+      if (n != 0) {
+        MatrixPtr tmp = batchGrad_->getBatchValue(n - 1, batchSize);
+        tmp->mul(*gateGrad, *weightT, 1, 1);
+      }
+
+      if (n != 0 && weight_->getWGrad()) {
+        /* backward weight */
+        MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize);
+        weight_->getWGrad()->mul(*outputValue->getTranspose(), *gateGrad, 1, 1);
+      } else if (prevOutput_ && weight_->getWGrad()) {
+        weight_->getWGrad()->mul(
+            *prevBatchOutput2_->getTranspose(), *gateGrad, 1, 1);
+      }
+    }
+  }
+
+  if (inputGrad) {
+    batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false);
+  }
+  if (bias_ && bias_->getWGrad()) {
+    localBiasGrad_->collectBias(*gate_.grad, /* scale */ 1);
+  }
+}
+
+void LstmLayer::forwardSeqParallel(int batchSize,
+                                   size_t numSequences,
+                                   const int *starts,
+                                   MatrixPtr inputValue) {
+  REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str());
+  gate_.value->assign(*inputValue);
+  if (bias_) {
+    gate_.value->addBias(*localBias_, /* scale */ 1);
+  }
+
+  real *gateValue = gate_.value->getData();
+  real *stateValue = state_.value->getData();
+  real *outputValue = output_.value->getData();
+  real *preOutputValue = preOutput_.value->getData();
+  real *checkIg = checkIg_->getData();
+  real *checkFg = checkFg_->getData();
+  real *checkOg = checkOg_->getData();
+  real *weight = weight_->getW()->getData();
+  hl_lstm_parallel_forward(gateValue,
+                           stateValue,
+                           preOutputValue,
+                           outputValue,
+                           checkIg,
+                           checkFg,
+                           checkOg,
+                           weight,
+                           starts,
+                           getSize(),
+                           numSequences,
+                           reversed_,
+                           activeNode_,
+                           activeGate_,
+                           activeState_);
+}
+
+void LstmLayer::backwardSeqParallel(int batchSize,
+                                    size_t numSequences,
+                                    const int *starts,
+                                    MatrixPtr inputGrad) {
+  REGISTER_TIMER_INFO("LstmBwSeqParallelTime", getName().c_str());
+  real *gateValue = gate_.value->getData();
+  real *gateGrad = gate_.grad->getData();
+  real *stateValue = state_.value->getData();
+  real *stateGrad = state_.grad->getData();
+  real *preOutputValue = preOutput_.value->getData();
+  real *preOutputGrad = preOutput_.grad->getData();
+  real *checkIg = checkIg_->getData();
+  real *checkFg = checkFg_->getData();
+  real *checkOg = checkOg_->getData();
+  real *outputGrad = output_.grad->getData();
+  real *weight = weight_->getW()->getData();
+
+  real *checkIgGrad;
+  real *checkFgGrad;
+  real *checkOgGrad;
+  if (bias_->getWGrad()) {
+    checkIgGrad = checkIgGrad_->getData();
+    checkFgGrad = checkFgGrad_->getData();
+    checkOgGrad = checkOgGrad_->getData();
+  } else {
+    checkIgGrad = nullptr;
+    checkFgGrad = nullptr;
+    checkOgGrad = nullptr;
+  }
+
+  hl_lstm_parallel_backward_data(gateValue,
+                                 gateGrad,
+                                 stateValue,
+                                 stateGrad,
+                                 preOutputValue,
+                                 preOutputGrad,
+                                 outputGrad,
+                                 checkIg,
+                                 checkIgGrad,
+                                 checkFg,
+                                 checkFgGrad,
+                                 checkOg,
+                                 checkOgGrad,
+                                 weight,
+                                 starts,
+                                 getSize(),
+                                 numSequences,
+                                 reversed_,
+                                 activeNode_,
+                                 activeGate_,
+                                 activeState_);
+
+  if (inputGrad) {
+    inputGrad->add(*gate_.grad);
+  }
+  if (bias_ && bias_->getWGrad()) {
+    localBiasGrad_->collectBias(*gate_.grad, 1);
+  }
+
+  real *outputValue = output_.value->getData();
+  if (weight_->getWGrad()) {
+    real *weightGrad = weight_->getWGrad()->getData();
+    hl_lstm_parallel_backward_weight(weightGrad,
+                                     outputValue,
+                                     gateGrad,
+                                     starts,
+                                     getSize(),
+                                     batchSize,
+                                     numSequences,
+                                     reversed_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmLayer.h b/paddle/legacy/gserver/layers/LstmLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c8b382f505d791fb1ef4265dcfe95046aa832fb
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LstmLayer.h
@@ -0,0 +1,221 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "LstmCompute.h"
+#include "SequenceToBatch.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+namespace paddle {
+
+/**
+ * @brief LstmLayer takes 1 input layer with size * 4.
+ * Input layer is diveded into 4 equal parts:
+ *   (input_s, input_ig, input_fg, input_og)
+ *
+ * For each sequence [start, end] it performs the following computation:
+ * @code
+ * output_{i} = actState(state_{i}) * actGate(outputGate_{i})
+ * state_{i} = actInput(input_s_{i} + bias_s +
+ *             output_{i-1} * recurrIW) * actGate(inputGate_{i}) +
+ *             actGate(forgetGate_{i}) * state_{i-1}
+ * inputGate = input_ig_{i} + bias_ig + output_{i-1} * recurrIGW +
+ *             state_{i-1} * inputCheck
+ * ouputGate = input_og_{i} + bias_og + output_{i-1} * recurrOGW +
+ *             state_{i} * outputCheck
+ * forgetGate = input_fg_{i} + bias_fg + output_{i-1} * recurrFGW +
+ *              state_{i-1} * forgetCheck
+ * @endcode
+ *
+ * - parameter[0] consists of (recurrIW, recurrIGW, recurrFGW, recurrOGW)
+ * - baisParameter consists of
+ *   (bias_s, bias_ig, bias_og, bias_fg, inputCheck, forgetCheck, outputCheck)
+ *
+ * - actInput is defined by config active_type.
+ * - actState is defined by config active_state_type.
+ * - actGate is defined by config actvie_gate_type.
+ *
+ * There are two ways to compute, namely one sequence by one sequence or
+ * one batch by one batch. By default and no setting pre_batch_state true,
+ * it will compute batch by batch.
+ *
+ * The formula in the paper is as follows:
+ * \f[
+ * i_t = \sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) \\
+ * f_t = \sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) \\
+ * \tilde{c_t} = tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) \\
+ * o_t = \sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) \\
+ * c_t = f_t * c_{t-1} + i_t * \tilde{c_t} \\
+ * h_t = o_t tanh(c_t)
+ * \f]
+ *
+ * @note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
+ * operations on the input sequence were NOT included in LstmLayer. So
+ * users should use fc_layer or mixed_layer before lstm_later.
+ *
+ * The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
+ * The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf}, W_{co}\f$.
+ */
+
+class LstmLayer : public Layer, public LstmCompute {
+ public:
+  explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
+
+  bool init(const LayerMap &layerMap,
+            const ParameterMap &parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback &callback) override;
+
+  void resetState() override;
+
+  void setState(LayerStatePtr state) override;
+
+  LayerStatePtr getState() override;
+
+ protected:
+  /**
+   * @brief Compute lstm forward one sequence by one sequence.
+   * @param batchSize The batchSize is not equal to the batch_size in
+   * the config file. It is the total words number of all samples
+   * in this forward batch.
+   * @param numSequences The sample number. It is equal to the batch_size
+   * in the config file.
+   * @param starts Each start position of each samples.
+   * @param inputValue The input values.
+   */
+  void forwardSequence(int batchSize,
+                       size_t numSequences,
+                       const int *starts,
+                       MatrixPtr inputValue);
+  /**
+   * Compute lstm backward one sequence by one sequence.
+   */
+  void backwardSequence(int batchSize,
+                        size_t numSequences,
+                        const int *starts,
+                        MatrixPtr inputGrad);
+
+  /**
+   * Compute lstm forward one batch by one batch. The batch value is
+   * reorganized by SequenceToBatch class. The batch output value will
+   * be convert into sequence value after finishing forward. Here, one
+   * batch contains one word of each sample. If the length of each sample
+   * is not equality, the batch will not pads zero and contains less words.
+   * The total batch numbers are the max length of the sequence. The details
+   * can refer to SequenceToBatch class. On GPU mode, it will launch GPU
+   * kernel for loop.
+   *
+   * @code
+   * for (int i = 0; i < numBatch(max_sequence_length); ++i) {
+   *   compute one batch.
+   * }
+   * @endcode
+   */
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int *starts,
+                    MatrixPtr inputValue);
+  /**
+   * Compute lstm backward one batch by one batch.
+   */
+  void backwardBatch(int batchSize,
+                     size_t numSequences,
+                     const int *starts,
+                     MatrixPtr inputGrad);
+
+  /**
+   * This function only supports GPU. It not need to reorganize input into
+   * batch value. It will launch one kernel to parallelly compute forward
+   * propagation in sequence level.
+   */
+  void forwardSeqParallel(int batchSize,
+                          size_t numSequences,
+                          const int *starts,
+                          MatrixPtr inputValue);
+  /**
+   * Backward propagation corresponding to forwardSeqParallel.
+   */
+  void backwardSeqParallel(int batchSize,
+                           size_t numSequences,
+                           const int *starts,
+                           MatrixPtr inputGrad);
+  /**
+   * This function is used for sequence generation and get output after
+   * forwardBatch.
+   */
+  void getPrevBatchOutput(size_t numSequences);
+  /**
+   * This function is used for sequence generation and get state after
+   * forwardBatch.
+   */
+  void getPrevBatchState(size_t numSequences);
+
+ protected:
+  /// Learned parameters, shape: (size, 4*size).
+  /// The weight ([size, 4*size]) contains \f$W_{hi}, W_{hf}, W_{hc}, W_{ho}\f$.
+  std::unique_ptr<Weight> weight_;
+  /// Learned bias parameter, shape: (1, 7 * size).
+  /// The bias contains \f$b_i, b_f, b_c, b_o\f$ and \f$W_{ci}, W_{cf},
+  /// W_{co}\f$.
+  std::unique_ptr<Weight> bias_;
+  /// The reeal bias, point to \f$b_i, b_f, b_c, b_o\f$.
+  MatrixPtr localBias_;
+  /// The peephole connection for input gate.
+  MatrixPtr checkIg_;
+  /// The peephole connection for forget gate.
+  MatrixPtr checkFg_;
+  /// The peephole connection for output gate.
+  MatrixPtr checkOg_;
+  /// The gradient of real bias
+  MatrixPtr localBiasGrad_;
+  /// The gradient of peephole connection for input gates.
+  MatrixPtr checkIgGrad_;
+  /// The gradient of peephole connection for forget gates.
+  MatrixPtr checkFgGrad_;
+  /// The gradient of peephole connection for output gates.
+  MatrixPtr checkOgGrad_;
+
+  /// Stores the cell state of previous time step, namely \f$c_{t-1}\f$.
+  Argument state_;
+  /// Stores the hidden of previous time step, namely \f$h_{t-1}\f$.
+  Argument preOutput_;
+  /// Stores the value and gradient of four gates, namely
+  /// \f$i_t, f_t, o_t, c_t\f$.
+  Argument gate_;
+  /// Whether it is reversed lstm.
+  bool reversed_;
+  /// Whether to use batch method to compute.
+  bool useBatch_;
+  /// Whether to use sequence parallell method to compute.
+  bool useSeqParallel_;
+  /// batchValue_ is used in method of batch calculation. It stores the
+  /// batch value after reorganized input.
+  std::unique_ptr<SequenceToBatch> batchValue_;
+  /// The gradient of batchValue_.
+  std::unique_ptr<SequenceToBatch> batchGrad_;
+
+  /// Used in generation and stores the state of previous time step.
+  MatrixPtr prevState_;
+  /// Used in generation and stores the output of previous time step.
+  MatrixPtr prevOutput_;
+  MatrixPtr prevBatchOutput2_;
+  /// The total state.
+  MatrixPtr totalState_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/LstmStepLayer.cpp b/paddle/legacy/gserver/layers/LstmStepLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f02f8ad62fe4d4cb4bb580923200b398c8483a99
--- /dev/null
+++ b/paddle/legacy/gserver/layers/LstmStepLayer.cpp
@@ -0,0 +1,194 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "LstmCompute.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/*
+ * LstmStepLayer used in recurrent layer group.
+ */
+class LstmStepLayer : public Layer, public LstmCompute {
+ protected:
+  Argument state_;
+  Argument gate_;
+  Argument stateActive_;
+  MatrixPtr checkIg_, checkFg_, checkOg_;
+  MatrixPtr checkIgGrad_, checkFgGrad_, checkOgGrad_;
+  std::unique_ptr<Weight> weight_;
+
+ public:
+  explicit LstmStepLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~LstmStepLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(lstm_step, LstmStepLayer);
+
+bool LstmStepLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK_EQ(2U, inputLayers_.size());
+
+  checkIg_ = Matrix::create(nullptr,
+                            /* height= */ 1,
+                            getSize(),
+                            /* trans= */ false,
+                            useGpu_);
+  checkFg_ = Matrix::create(nullptr,
+                            /* height= */ 1,
+                            getSize(),
+                            /* trans= */ false,
+                            useGpu_);
+  checkOg_ = Matrix::create(nullptr,
+                            /* height= */ 1,
+                            getSize(),
+                            /* trans= */ false,
+                            useGpu_);
+  checkIgGrad_ = Matrix::create(nullptr,
+                                /* height= */ 1,
+                                getSize(),
+                                /* trans= */ false,
+                                useGpu_);
+  checkFgGrad_ = Matrix::create(nullptr,
+                                /* height= */ 1,
+                                getSize(),
+                                /* trans= */ false,
+                                useGpu_);
+  checkOgGrad_ = Matrix::create(nullptr,
+                                /* height= */ 1,
+                                getSize(),
+                                /* trans= */ false,
+                                useGpu_);
+
+  if (biasParameter_.get() != NULL) {
+    CHECK_EQ(getSize() * 3, biasParameter_->getSize());
+    weight_.reset(new Weight(1, getSize() * 3, biasParameter_));
+    if (weight_->getW()) {
+      real* data = weight_->getW()->getData();
+      checkIg_->setData(data);
+      checkFg_->setData(data + getSize());
+      checkOg_->setData(data + getSize() * 2);
+    }
+
+    if (weight_->getWGrad()) {
+      real* data = weight_->getWGrad()->getData();
+      checkIgGrad_->setData(data);
+      checkFgGrad_->setData(data + getSize());
+      checkOgGrad_->setData(data + getSize() * 2);
+    }
+  }
+
+  setOutput("state", &state_);
+  LstmCompute::init(config_);
+  return true;
+}
+
+void LstmStepLayer::forward(PassType passType) {
+  REGISTER_TIMER_INFO("LstmRecurrentFwTime", getName().c_str());
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  const Argument& prevState = getInput(1);
+  CHECK_EQ(getSize() * 4, input.value->getWidth());
+  CHECK_EQ(getSize(), prevState.value->getWidth());
+  int batchSize = input.getBatchSize();
+  reserveOutput(batchSize, getSize());
+  resetSpecifyOutput(state_,
+                     batchSize,
+                     getSize(),
+                     /*  isValueClean */ false,
+                     /* isGradClean */ true);
+  resetSpecifyOutput(gate_,
+                     batchSize,
+                     getSize() * 4,
+                     /* isValueClean */ false,
+                     /* isGradClean */ false);
+  resetSpecifyOutput(stateActive_,
+                     batchSize,
+                     getSize(),
+                     /*  isValueClean */ false,
+                     /* isGradClean */ false);
+  gate_.value->assign(*input.value);
+
+  hl_lstm_value lstmValue;
+  lstmValue.checkIg = checkIg_->getData();
+  lstmValue.checkFg = checkFg_->getData();
+  lstmValue.checkOg = checkOg_->getData();
+  lstmValue.gateValue = gate_.value->getData();
+  lstmValue.stateValue = state_.value->getData();
+  lstmValue.prevStateValue = prevState.value->getData();
+  lstmValue.stateActiveValue = stateActive_.value->getData();
+  lstmValue.outputValue = output_.value->getData();
+
+  if (useGpu_) {
+    LstmCompute::forwardBatch<1>(lstmValue, getSize(), batchSize);
+  } else {
+    LstmCompute::forwardBatch<0>(lstmValue, getSize(), batchSize);
+  }
+}
+
+void LstmStepLayer::backward(const UpdateCallback& callback) {
+  REGISTER_TIMER_INFO("LstmRecurrentBwTime", getName().c_str());
+  const Argument& input = getInput(0);
+  const Argument& prevState = getInput(1);
+  int batchSize = input.getBatchSize();
+
+  hl_lstm_value lstmValue;
+  hl_lstm_grad lstmGrad;
+  lstmValue.checkIg = checkIg_->getData();
+  lstmValue.checkFg = checkFg_->getData();
+  lstmValue.checkOg = checkOg_->getData();
+  lstmValue.gateValue = gate_.value->getData();
+  lstmValue.prevStateValue = prevState.value->getData();
+  lstmValue.stateValue = state_.value->getData();
+  lstmValue.stateActiveValue = stateActive_.value->getData();
+
+  lstmGrad.gateGrad = gate_.grad->getData();
+  if (prevState.grad) {
+    lstmGrad.prevStateGrad = prevState.grad->getData();
+  } else {
+    lstmGrad.prevStateGrad = nullptr;
+  }
+  lstmGrad.stateGrad = state_.grad->getData();
+  lstmGrad.stateActiveGrad = stateActive_.grad->getData();
+  lstmGrad.outputGrad = output_.grad->getData();
+  lstmGrad.checkIgGrad = checkIgGrad_->getData();
+  lstmGrad.checkFgGrad = checkFgGrad_->getData();
+  lstmGrad.checkOgGrad = checkOgGrad_->getData();
+
+  if (useGpu_) {
+    LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), batchSize);
+  } else {
+    LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), batchSize);
+  }
+
+  if (input.grad) {
+    input.grad->add(*gate_.grad);
+  }
+
+  if (weight_) {
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MDLstmLayer.cpp b/paddle/legacy/gserver/layers/MDLstmLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4838183e8ccb213aa249fddf5102026198e98d3c
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MDLstmLayer.cpp
@@ -0,0 +1,769 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "LstmLayer.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+class CoordIterator {
+ public:
+  std::vector<int> dims_;
+  std::vector<bool> directions_;
+  std::vector<int> curPos_;
+  bool end_;
+
+  void step(size_t d, bool reversed) {
+    if (directions_[d] ^ reversed) {
+      if (curPos_[d] == dims_[d] - 1) {
+        curPos_[d] = 0;
+        if (d) {
+          step(d - 1, reversed);
+        } else {
+          end_ = true;
+        }
+      } else {
+        curPos_[d]++;
+      }
+    } else {
+      if (curPos_[d] == 0) {
+        curPos_[d] = dims_[d] - 1;
+        if (d) {
+          step(d - 1, reversed);
+        } else {
+          end_ = true;
+        }
+      } else {
+        curPos_[d]--;
+      }
+    }
+  }
+
+ public:
+  CoordIterator(std::vector<int> dim, std::vector<bool> directions)
+      : dims_(dim), directions_(directions), end_(false) {
+    CHECK_EQ(dims_.size(), directions_.size());
+    for (size_t i = 0; i < dims_.size(); i++) {
+      curPos_.push_back(-1);
+    }
+  }
+  CoordIterator& operator++() {
+    step(dims_.size() - 1, false);
+    return *this;
+  }
+
+  CoordIterator& operator--() {
+    step(dims_.size() - 1, true);
+    return *this;
+  }
+
+  std::vector<int>& curPos() { return curPos_; }
+
+  int offset() {
+    int offset = curPos_[0];
+    for (size_t i = 1; i < dims_.size(); i++) {
+      offset = offset * dims_[i] + curPos_[i];
+    }
+    return offset;
+  }
+
+  int offset(const std::vector<int>& pos) {
+    int offset = pos[0];
+    for (size_t i = 1; i < dims_.size(); i++) {
+      offset = offset * dims_[i] + pos[i];
+    }
+    return offset;
+  }
+
+  std::vector<int>& begin() {
+    for (size_t i = 0; i < dims_.size(); i++) {
+      curPos_[i] = directions_[i] ? 0 : dims_[i] - 1;
+    }
+    end_ = false;
+    return curPos_;
+  }
+
+  std::vector<int>& rbegin() {
+    for (size_t i = 0; i < dims_.size(); i++) {
+      curPos_[i] = directions_[i] ? dims_[i] - 1 : 0;
+    }
+    end_ = false;
+    return curPos_;
+  }
+
+  bool end() { return end_; }
+
+  bool getPrePos(const std::vector<int>& delays,
+                 int idx,
+                 std::vector<int>& prePos) {
+    bool isAvial = true;
+    prePos.clear();
+    prePos.reserve(directions_.size());
+    for (size_t i = 0; i < directions_.size(); i++) {
+      if (int(i) == idx) {
+        prePos.push_back(curPos_[i] + delays[i] * (directions_[i] ? 1 : -1));
+        if (prePos[i] < 0) {
+          prePos[i] = 0;
+          isAvial = false;
+        }
+        if (prePos[i] >= dims_[i]) {
+          prePos[i] = dims_[i] - 1;
+          isAvial = false;
+        }
+      } else {
+        prePos.push_back(curPos_[i]);
+      }
+    }
+    return isAvial;
+  }
+
+  bool getNextPos(const std::vector<int>& delays,
+                  int idx,
+                  std::vector<int>& nextPos) {
+    bool isAvial = true;
+    nextPos.clear();
+    nextPos.reserve(directions_.size());
+    for (size_t i = 0; i < directions_.size(); i++) {
+      if (int(i) == idx) {
+        nextPos.push_back(curPos_[i] - delays[i] * (directions_[i] ? 1 : -1));
+        if (nextPos[i] < 0) {
+          nextPos[i] = 0;
+          isAvial = false;
+        }
+        if (nextPos[i] >= dims_[i]) {
+          nextPos[i] = dims_[i] - 1;
+          isAvial = false;
+        }
+      } else {
+        nextPos.push_back(curPos_[i]);
+      }
+    }
+    return isAvial;
+  }
+};
+/*
+ * MDLstmLayer takes 1 input layer with size * (3+numDims).
+ * For each sequence [start, end] it performs the following computation:
+ * out_i = actState(state_i) * actGate(outputGate_i)
+ *
+ * For example the image with 2 dims, we take the scanning order from left-top
+ * to right-bottom, then the 2 previous states of the current pixels are the
+ * ones located at left and top. And each of them has a independent forget gate.
+ *
+ * state_i = actInput(input_i) * actGate(inputGate_i) +
+ *           \sum{j}(actGate(forgetGate_i_j) * state_prev_i_j)
+ *
+ * inputGate = input_i * inputW + \sum{j}(output_prev_i_j * recurrInputW_j) +
+ *             \sum{j}(state_prev_i_j * inputCheck_j)
+ *
+ * ouputGate = input_i * outputW + \sum{j}(output_prev_i_j * recurrOutputW_j) +
+ *             state_i * outputCheck
+ *
+ * forgetGate_j = input_i * forgetW_j + \sum{j}(output_prev_i_j *
+ *                recurrForgetW_j) + \sum{j}(state_prev_i_j * forgetCheck_j)
+ *
+ * IG Layer: (Input, InputGate, ForgetGates, OutputGate) * OutputSize
+ * */
+
+class MDLstmLayer : public LstmLayer {
+ public:
+  explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+ protected:
+  void forwardOneSequence(int start, CoordIterator& coordIter);
+  void backwardOneSequence(int start, CoordIterator& coordIter);
+  void forwardGate2OutputSequence(int start, CoordIterator& coordIter);
+  void backwardGate2OutputSequence(int start, CoordIterator& coordIter);
+
+ protected:
+  std::vector<Argument> frameInputGate_;
+  std::vector<Argument> frameForgetGate_;
+  std::vector<Argument> frameOutputGate_;
+  std::vector<Argument> frameInputNode_;
+  std::vector<Argument> frameGate_;
+  std::vector<Argument> frameState_;
+  std::vector<Argument> framePreOutput_;
+  std::vector<Argument> frameOutput_;
+
+  // Activation
+  std::unique_ptr<ActivationFunction> activationGate_;
+  std::unique_ptr<ActivationFunction> activationState_;
+
+  int numDims_;
+  size_t numBlocks_;
+  std::vector<bool> directions_;
+  std::vector<int> delays_;
+  std::vector<std::vector<int>> dimsV_;
+};
+
+REGISTER_LAYER(mdlstmemory, MDLstmLayer);
+
+bool MDLstmLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK_EQ(1U, inputLayers_.size());
+  CHECK_EQ(1U, parameters_.size());
+
+  numBlocks_ = getSize();
+  numDims_ = config_.directions_size();
+  CHECK_EQ(numBlocks_ * numBlocks_ * (3 + numDims_), parameters_[0]->getSize());
+
+  // inode(1), ig(1), fg(numDims_), og(1), peepIg(1), peepFg(numDims_),
+  // peepOg(1), then size of localBias_ is 3+numDims_
+  CHECK_EQ(numBlocks_ * (5 + 2 * numDims_), biasParameter_->getSize());
+  weight_.reset(
+      new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0]));
+  if (biasParameter_.get() != NULL) {
+    bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_));
+    localBias_ = Matrix::create(nullptr,
+                                /* height= */ 1,
+                                numBlocks_ * (3 + numDims_),
+                                /* trans= */ false,
+                                useGpu_);
+    checkIg_ = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    checkFg_ = Matrix::create(nullptr,
+                              /* height= */ numDims_,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    checkOg_ = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    localBiasGrad_ = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    numBlocks_ * (3 + numDims_),
+                                    /* trans= */ false,
+                                    useGpu_);
+    checkIgGrad_ = Matrix::create(nullptr,
+                                  /* height= */ 1,
+                                  numBlocks_,
+                                  /* trans= */ false,
+                                  useGpu_);
+    checkFgGrad_ = Matrix::create(nullptr,
+                                  /* height= */ numDims_,
+                                  numBlocks_,
+                                  /* trans= */ false,
+                                  useGpu_);
+    checkOgGrad_ = Matrix::create(nullptr,
+                                  /* height= */ 1,
+                                  numBlocks_,
+                                  /* trans= */ false,
+                                  useGpu_);
+
+    localBias_->setData(bias_->getW()->getData());
+    checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_));
+    checkFg_->setData(bias_->getW()->getData() + numBlocks_ * (4 + numDims_));
+    checkOg_->setData(bias_->getW()->getData() +
+                      numBlocks_ * (4 + 2 * numDims_));
+
+    if (bias_->getWGrad()) {
+      localBiasGrad_->setData(bias_->getWGrad()->getData());
+      checkIgGrad_->setData(bias_->getWGrad()->getData() +
+                            numBlocks_ * (3 + numDims_));
+      checkFgGrad_->setData(bias_->getWGrad()->getData() +
+                            numBlocks_ * (4 + numDims_));
+      checkOgGrad_->setData(bias_->getWGrad()->getData() +
+                            numBlocks_ * (4 + 2 * numDims_));
+    }
+  } else {
+    LOG(FATAL) << "Bias should be here.";
+  }
+  for (int i = 0; i < numDims_; i++) {
+    directions_.push_back(config_.directions(i));
+  }
+  for (int i = 0; i < numDims_; i++) {
+    delays_.push_back(-1);
+  }
+  activationGate_.reset(ActivationFunction::create(config_.active_gate_type()));
+  activationState_.reset(
+      ActivationFunction::create(config_.active_state_type()));
+
+  return true;
+}
+
+void MDLstmLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  CHECK(input.sequenceStartPositions);
+  int batchSize = input.getBatchSize();
+  int numSequences = input.getNumSequences();
+  resetOutput(batchSize, numBlocks_);
+  CHECK_EQ(numBlocks_ * (3 + numDims_), input.value->getWidth());
+  const int* starts = input.sequenceStartPositions->getData(false);
+  CHECK_EQ(starts[numSequences], batchSize);
+
+  int* dimsData = input.cpuSequenceDims->getData();
+  CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_* numSequences);
+
+  for (int i = 0; i < numSequences; i++) {
+    std::vector<int> dims;
+    for (int j = 0; j < numDims_; j++) {
+      dims.push_back(dimsData[i * numDims_ + j]);
+    }
+    dimsV_.push_back(dims);
+  }
+
+  frameInputGate_.reserve(batchSize);
+  frameForgetGate_.reserve(batchSize);
+  frameOutputGate_.reserve(batchSize);
+  frameInputNode_.reserve(batchSize);
+  frameGate_.reserve(batchSize);
+  frameState_.reserve(batchSize);
+  framePreOutput_.reserve(batchSize);
+  frameOutput_.reserve(batchSize);
+
+  Matrix::resizeOrCreate(gate_.value,
+                         /* height= */ batchSize,
+                         numBlocks_ * (3 + numDims_),
+                         /* trans= */ false,
+                         useGpu_);
+
+  for (int i = frameGate_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_ * (3 + numDims_),
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_ * (3 + numDims_),
+                              /* trans= */ false,
+                              useGpu_);
+    frameGate_.push_back(arg);
+  }
+  for (int i = frameInputGate_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameInputGate_.push_back(arg);
+  }
+  for (int i = frameForgetGate_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ numDims_,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ numDims_,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameForgetGate_.push_back(arg);
+  }
+  for (int i = frameOutputGate_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameOutputGate_.push_back(arg);
+  }
+  for (int i = frameInputNode_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameInputNode_.push_back(arg);
+  }
+  for (int i = frameState_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(
+        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
+    frameState_.push_back(arg);
+  }
+  for (int i = framePreOutput_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(
+        /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
+    framePreOutput_.push_back(arg);
+  }
+  for (int i = frameOutput_.size(); i < batchSize; i++) {
+    Argument arg;
+    arg.value = Matrix::create(nullptr,
+                               /* height= */ 1,
+                               numBlocks_,
+                               /* trans= */ false,
+                               useGpu_);
+    arg.grad = Matrix::create(nullptr,
+                              /* height= */ 1,
+                              numBlocks_,
+                              /* trans= */ false,
+                              useGpu_);
+    frameOutput_.push_back(arg);
+  }
+
+  for (int i = 0; i < batchSize; i++) {
+    frameOutput_[i].value->setData(output_.value->getData() + i * numBlocks_);
+    frameGate_[i].value->setData(gate_.value->getData() +
+                                 i * numBlocks_ * (3 + numDims_));
+    frameInputNode_[i].value->setData(gate_.value->getData() +
+                                      i * numBlocks_ * (3 + numDims_) +
+                                      numBlocks_ * 0);
+    frameInputGate_[i].value->setData(gate_.value->getData() +
+                                      i * numBlocks_ * (3 + numDims_) +
+                                      numBlocks_ * 1);
+    frameForgetGate_[i].value->setData(gate_.value->getData() +
+                                       i * numBlocks_ * (3 + numDims_) +
+                                       numBlocks_ * 2);
+    frameOutputGate_[i].value->setData(gate_.value->getData() +
+                                       i * numBlocks_ * (3 + numDims_) +
+                                       numBlocks_ * (2 + numDims_));
+  }
+
+  AsyncGpuBlock asyncGpuBlock;
+  gate_.value->assign(*input.value);
+
+  if (bias_) {
+    gate_.value->addBias(*localBias_, 1);
+  }
+
+  for (int i = 0; i < numSequences; i++) {
+    CoordIterator coordIter(dimsV_[i], directions_);
+    forwardOneSequence(starts[i], coordIter);
+  }
+}
+
+void MDLstmLayer::forwardGate2OutputSequence(int start,
+                                             CoordIterator& coordIter) {
+  int idxCurr = start + coordIter.offset();
+  std::vector<int> preOffsetV;
+  preOffsetV.reserve(numDims_);
+  for (int i = 0; i < numDims_; i++) {
+    std::vector<int> prePos;
+    if (coordIter.getPrePos(delays_, i, prePos)) {
+      preOffsetV[i] = coordIter.offset(prePos);
+    } else {
+      preOffsetV[i] = -1;
+    }
+  }
+
+  for (int i = 0; i < numDims_; i++) {
+    if (preOffsetV[i] >= 0) {
+      frameInputGate_[idxCurr].value->addDotMul(
+          *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0);
+
+      MatrixPtr fgGateOneDim = Matrix::create(
+          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      MatrixPtr checkFgOneDim =
+          Matrix::create(checkFg_->getData() + i * numBlocks_,
+                         1.0,
+                         numBlocks_,
+                         false,
+                         useGpu_);
+      fgGateOneDim->addDotMul(
+          *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0);
+    }
+  }
+  auto status = activationGate_->forward(frameInputGate_[idxCurr]);
+  status.check();
+  status = activationGate_->forward(frameForgetGate_[idxCurr]);
+  status.check();
+  status = activation_->forward(frameInputNode_[idxCurr]);
+  status.check();
+
+  frameState_[idxCurr].value->zeroMem();
+  for (int i = 0; i < numDims_; i++) {
+    if (preOffsetV[i] >= 0) {
+      MatrixPtr fgGateOneDim = Matrix::create(
+          frameForgetGate_[idxCurr].value->getData() + i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      frameState_[idxCurr].value->addDotMul(
+          *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0);
+    }
+  }
+  frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value,
+                                        *frameInputGate_[idxCurr].value,
+                                        1.0,
+                                        1.0);
+
+  frameOutputGate_[idxCurr].value->addDotMul(
+      *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0);
+  status = activationGate_->forward(frameOutputGate_[idxCurr]);
+  status.check();
+
+  framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value));
+  status = activationState_->forward(framePreOutput_[idxCurr]);
+  status.check();
+
+  frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value,
+                                      *frameOutputGate_[idxCurr].value);
+}
+
+void MDLstmLayer::forwardOneSequence(int start, CoordIterator& coordIter) {
+  for (coordIter.begin(); !coordIter.end(); ++coordIter) {
+    int offset = coordIter.offset();
+    for (int i = 0; i < numDims_; i++) {
+      std::vector<int> prePos;
+      if (coordIter.getPrePos(delays_, i, prePos)) {
+        int preOffset = coordIter.offset(prePos);
+        frameGate_[start + offset].value->mul(
+            *frameOutput_[start + preOffset].value, *weight_->getW(), 1.0, 1.0);
+      }
+    }
+    forwardGate2OutputSequence(start, coordIter);
+  }
+}
+
+void MDLstmLayer::backward(const UpdateCallback& callback) {
+  const Argument& input = getInput(0);
+  CHECK(input.sequenceStartPositions);
+  int batchSize = input.getBatchSize();
+  const int* starts = input.sequenceStartPositions->getData(false);
+  size_t numSequences = input.getNumSequences();
+
+  Matrix::resizeOrCreate(gate_.grad,
+                         /* height= */ batchSize,
+                         numBlocks_ * (3 + numDims_),
+                         /* trans= */ false,
+                         useGpu_);
+
+  for (int i = 0; i < batchSize; i++) {
+    if (frameState_[i].grad == NULL)
+      frameState_[i].grad = Matrix::create(
+          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
+  }
+  for (int i = 0; i < batchSize; i++) {
+    if (framePreOutput_[i].grad == NULL)
+      framePreOutput_[i].grad = Matrix::create(
+          /* height= */ 1, numBlocks_, /* trans= */ false, useGpu_);
+  }
+
+  for (int i = 0; i < batchSize; i++) {
+    frameOutput_[i].grad->setData(output_.grad->getData() + i * numBlocks_);
+    frameGate_[i].grad->setData(gate_.grad->getData() +
+                                i * numBlocks_ * (3 + numDims_));
+    frameInputNode_[i].grad->setData(gate_.grad->getData() +
+                                     i * numBlocks_ * (3 + numDims_) +
+                                     numBlocks_ * 0);
+    frameInputGate_[i].grad->setData(gate_.grad->getData() +
+                                     i * numBlocks_ * (3 + numDims_) +
+                                     numBlocks_ * 1);
+    frameForgetGate_[i].grad->setData(gate_.grad->getData() +
+                                      i * numBlocks_ * (3 + numDims_) +
+                                      numBlocks_ * 2);
+    frameOutputGate_[i].grad->setData(gate_.grad->getData() +
+                                      i * numBlocks_ * (3 + numDims_) +
+                                      numBlocks_ * (2 + numDims_));
+  }
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+
+    for (size_t i = 0; i < numSequences; i++) {
+      CoordIterator coordIter(dimsV_[i], directions_);
+      backwardOneSequence(starts[i], coordIter);
+    }
+  }
+
+  if (input.grad) {
+    input.grad->add(*gate_.grad);
+  }
+  if (bias_ && bias_->getWGrad()) {
+    localBiasGrad_->collectBias(*gate_.grad, 1);
+    bias_->getParameterPtr()->incUpdate(callback);
+  }
+
+  weight_->getParameterPtr()->incUpdate(callback);
+}
+
+void MDLstmLayer::backwardGate2OutputSequence(int start,
+                                              CoordIterator& coordIter) {
+  int idxCurr = start + coordIter.offset();
+  std::vector<int> preOffsetV;
+  std::vector<int> nextOffsetV;
+  preOffsetV.reserve(numDims_);
+  nextOffsetV.reserve(numDims_);
+  for (int i = 0; i < numDims_; i++) {
+    std::vector<int> prePos;
+    if (coordIter.getPrePos(delays_, i, prePos)) {
+      preOffsetV[i] = coordIter.offset(prePos);
+    } else {
+      preOffsetV[i] = -1;
+    }
+    std::vector<int> nextPos;
+    if (coordIter.getNextPos(delays_, i, nextPos)) {
+      nextOffsetV[i] = coordIter.offset(nextPos);
+    } else {
+      nextOffsetV[i] = -1;
+    }
+  }
+
+  framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
+                                        *frameOutputGate_[idxCurr].value);
+  activationState_->backward(framePreOutput_[idxCurr]).check();
+  frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad));
+
+  frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
+                                         *framePreOutput_[idxCurr].value);
+  activationGate_->backward(frameOutputGate_[idxCurr]).check();
+
+  frameState_[idxCurr].grad->addDotMul(
+      *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0);
+  for (int i = 0; i < numDims_; i++) {
+    if (nextOffsetV[i] >= 0) {
+      frameState_[idxCurr].grad->addDotMul(
+          *frameInputGate_[start + nextOffsetV[i]].grad, *checkIg_, 1.0, 1.0);
+
+      MatrixPtr fgGateOneDimGrad = Matrix::create(
+          frameForgetGate_[start + nextOffsetV[i]].grad->getData() +
+              i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      MatrixPtr fgGateOneDimVal = Matrix::create(
+          frameForgetGate_[start + nextOffsetV[i]].value->getData() +
+              i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      MatrixPtr checkFgOneDim = Matrix::create(
+          checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_);
+
+      frameState_[idxCurr].grad->addDotMul(
+          *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0);
+      frameState_[idxCurr].grad->addDotMul(
+          *frameState_[start + nextOffsetV[i]].grad,
+          *fgGateOneDimVal,
+          1.0,
+          1.0);
+    }
+  }
+
+  frameInputNode_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
+                                        *frameInputGate_[idxCurr].value);
+  frameInputGate_[idxCurr].grad->dotMul(*frameState_[idxCurr].grad,
+                                        *frameInputNode_[idxCurr].value);
+
+  frameForgetGate_[idxCurr].grad->zeroMem();
+  for (int i = 0; i < numDims_; i++) {
+    if (preOffsetV[i] >= 0) {
+      MatrixPtr fgGateOneDimGrad = Matrix::create(
+          frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
+          1,
+          numBlocks_,
+          false,
+          useGpu_);
+      fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad,
+                                  *frameState_[start + preOffsetV[i]].value,
+                                  1.0,
+                                  1.0);
+    }
+  }
+
+  activationGate_->backward(frameInputGate_[idxCurr]).check();
+  activationGate_->backward(frameForgetGate_[idxCurr]).check();
+  activation_->backward(frameInputNode_[idxCurr]).check();
+
+  if (bias_->getWGrad()) {
+    for (int i = 0; i < numDims_; i++) {
+      if (preOffsetV[i] >= 0) {
+        checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad,
+                                *frameState_[start + preOffsetV[i]].value,
+                                1.0,
+                                1.0);
+
+        MatrixPtr fgGateOneDimGrad = Matrix::create(
+            frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_,
+            1,
+            numBlocks_,
+            false,
+            useGpu_);
+        MatrixPtr checkFgOneDimGrad =
+            Matrix::create(checkFgGrad_->getData() + i * numBlocks_,
+                           1,
+                           numBlocks_,
+                           false,
+                           useGpu_);
+        checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad,
+                                     *frameState_[start + preOffsetV[i]].value,
+                                     1.0,
+                                     1.0);
+      }
+    }
+    checkOgGrad_->addDotMul(
+        *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0);
+  }
+}
+
+void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) {
+  MatrixPtr weightT = weight_->getW()->getTranspose();
+  for (coordIter.rbegin(); !coordIter.end(); --coordIter) {
+    int offset = coordIter.offset();
+    backwardGate2OutputSequence(start, coordIter);
+    for (int i = 0; i < numDims_; i++) {
+      std::vector<int> prePos;
+      if (coordIter.getPrePos(delays_, i, prePos)) {
+        int preOffset = coordIter.offset(prePos);
+        frameOutput_[start + preOffset].grad->mul(
+            *frameGate_[start + offset].grad, *weightT, 1.0, 1.0);
+        if (weight_->getWGrad()) {
+          weight_->getWGrad()->mul(
+              *frameOutput_[start + preOffset].value->getTranspose(),
+              *frameGate_[start + offset].grad,
+              1.0,
+              1.0);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNAddtoLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNAddtoLayer.cpp
diff --git a/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b385e804fdbc74c8612031cf415d06f15ce311a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNAddtoLayer.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer Addto layer.
+ *
+ * The config file api is mkldnn_addto
+ */
+class MKLDNNAddtoLayer : public MKLDNNLayer {
+ protected:
+  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
+  size_t layerSize_;
+
+  std::unique_ptr<Weight> biases_;
+
+  // buffers for adding bias
+  std::vector<MKLDNNMatrixPtr> vals_;
+  std::vector<MKLDNNMatrixPtr> grads_;
+  // primitives for adding bias
+  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
+  std::shared_ptr<mkldnn::primitive> bwdBias_;
+
+ public:
+  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNAddtoLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+ protected:
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr bias,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+
+  void prepareBias(MKLDNNMatrixPtr& bias,
+                   const MatrixPtr& biasMat,
+                   const MKLDNNMatrixPtr& out,
+                   std::vector<MKLDNNMatrixPtr>& outs);
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNBase.h b/paddle/legacy/gserver/layers/MKLDNNBase.h
new file mode 100644
index 0000000000000000000000000000000000000000..786ceaf86086d7c04331641693181809ac019597
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNBase.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+typedef enum {
+  MKLDNN_BASE = 1,   // basical info of MKLDNN
+  MKLDNN_TESTS = 1,  // gtest info of MKLDNN
+  MKLDNN_FMTS = 2,   // format info of MKLDNN
+  MKLDNN_SIZES = 3,  // size info of MKLDNN
+  MKLDNN_ALL = 4,    // show all info of MKLDNN
+} MKLDNN_LOG_LEVEL;
+
+/**
+ * @brief MKLDNN CPU engine.
+ *
+ */
+class CPUEngine {
+ public:
+  static CPUEngine& Instance() {
+    // Thread-safe in C++11.
+    static CPUEngine myInstance;
+    return myInstance;
+  }
+
+  // Disallow copy or move
+  CPUEngine(const CPUEngine&) = delete;             // Copy constructor
+  CPUEngine(CPUEngine&&) = delete;                  // Move constructor
+  CPUEngine& operator=(const CPUEngine&) = delete;  // Copy assignment
+  CPUEngine& operator=(CPUEngine&&) = delete;       // Move assignment
+
+  mkldnn::engine& getEngine() { return cpuEngine_; }
+
+ protected:
+  CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {}
+  //    CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {}
+  ~CPUEngine() {}
+
+ private:
+  mkldnn::engine cpuEngine_;
+};
+
+/**
+ * @brief MKLDNN Stream.
+ *
+ */
+class MKLDNNStream {
+ public:
+  MKLDNNStream() : ready_(false) { resetState(); }
+
+  virtual ~MKLDNNStream() {}
+
+  /**
+   * @brief Submit stream
+   * @param prims The primitives vector
+   * @param block Waiting for the stream to complete
+   */
+  void submit(std::vector<mkldnn::primitive>& prims, bool block = true) {
+    resetState();
+    stream_->submit(prims).wait(block);
+    ready_ = false;
+  }
+
+  /**
+   * @brief Reset the mkldnn stream
+   */
+  void resetState() {
+    if (ready_) {
+      return;
+    }
+    // TODO(TJ): change me when mkldnn have method to reset this state
+    // stream_.reset(new mkldnn::stream(mkldnn::stream::kind::lazy));
+    stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
+    ready_ = true;
+  }
+
+ private:
+  bool ready_;
+  std::shared_ptr<mkldnn::stream> stream_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.cpp
diff --git a/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..9aa20df98f30837e1b80b4269d05d85b7d99ba76
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNBatchNormLayer.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::batch_normalization_forward bn_fwd;
+typedef mkldnn::batch_normalization_backward bn_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer BatchNorm layer.
+ *
+ * The config file api is mkldnn_batch_norm
+ */
+class MKLDNNBatchNormLayer : public MKLDNNLayer {
+ protected:
+  // save forward primitive_desc, which can be used backward
+  std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
+
+  // Epsilon value used in the batch normalization formula.
+  real epsilon_;
+
+  // weight and bias in paddle
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+  // mkldnn use a large buffer store both scale and shift
+  // which are weight and bias in paddle corresponding.
+  MatrixPtr valueScaleShift_;
+  MatrixPtr gradScaleShift_;
+  // Moving average of mean.
+  std::unique_ptr<Weight> movingMean_;
+  // Moving average of variance.
+  std::unique_ptr<Weight> movingVar_;
+
+  // if useGlobalStats_ is true, will use the loaded mean and variance.
+  // otherwise, calculate mean and variance in every mini-batch.
+  bool useGlobalStats_;
+  // used in MKLDNN primitive desc
+  unsigned flags_;
+  // use to compute moving mean and variance.
+  real movingAvgFraction_;
+  // whether the weight has been init
+  bool hasInitedWgt_;
+
+  // local mean and variance
+  // when useGlobalStats_ they are loaded from moving mean and variance
+  // when do not useGlobalStats_ they are calculated from this mini-batch
+  MKLDNNMatrixPtr mean_;
+  MKLDNNMatrixPtr var_;
+
+ public:
+  explicit MKLDNNBatchNormLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), useGlobalStats_(true), hasInitedWgt_(false) {}
+
+  ~MKLDNNBatchNormLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+  void convertWeightsFromPaddle() override;
+
+ protected:
+  void initWeight();
+  /**
+   * cal moving mean and variance.
+   * moving = moving * AvgFraction + local * (1 - AvgFraction)
+   */
+  void calMovingMeanAndVar();
+
+  void resetFwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<bn_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr wgt,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<bn_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<bn_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& wgt,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<bn_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNConcatLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNConcatLayer.cpp
diff --git a/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7738df6c106c68f55b313f2d119e31c6e444cbf
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNConcatLayer.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer Concatenate layer.
+ *
+ * The config file api is mkldnn_concat
+ */
+class MKLDNNConcatLayer : public MKLDNNLayer {
+ protected:
+  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
+  // input channel numbers
+  std::vector<int> channels_;
+
+  // concat_dimension in MKLDNN
+  // if axis_ == 0, concat batchsize
+  // if axis_ == 1, concat channel (default)
+  int axis_;
+
+ public:
+  explicit MKLDNNConcatLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), axis_(1) {}
+
+  ~MKLDNNConcatLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void printSizeInfo() override {
+    CHECK_EQ(channels_.size(), inputLayers_.size());
+    for (size_t i = 0; i < channels_.size(); ++i) {
+      VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
+                         << ": " << bs_ << ", " << channels_[i] << ", " << ih_
+                         << ", " << iw_;
+    }
+    VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
+                       << ", " << ow_;
+  }
+
+  size_t keepCondition() {
+    // reset when the total element size of all inputs changed
+    size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
+    for (size_t i = 1; i < inputLayers_.size(); ++i) {
+      totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
+    }
+    return totalSize;
+  }
+
+ protected:
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b47bf14821fed4057227c80bb77e584649ab3145
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNConvLayer.cpp
@@ -0,0 +1,388 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNConvLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer);
+
+bool MKLDNNConvLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK(config_.shared_biases()) << "Only support shared biases yet";
+
+  oc_ = config_.num_filters();
+  const ConvConfig& conf = config_.inputs(0).conv_conf();
+  ic_ = conf.channels();
+  fw_ = conf.filter_size();
+  fh_ = conf.filter_size_y();
+  pw_ = conf.padding();
+  ph_ = conf.padding_y();
+  dw_ = conf.dilation();
+  dh_ = conf.dilation_y();
+  sw_ = conf.stride();
+  sh_ = conf.stride_y();
+  gp_ = conf.groups();
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  caffeMode_ = conf.caffe_mode();
+  CHECK(caffeMode_) << "Only support caffe mode yet";
+  CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet";
+  // check group setting
+  CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc";
+  CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic";
+
+  // create weight
+  size_t height = oc_ / gp_;
+  size_t width = ic_ * fh_ * fw_;
+  CHECK_EQ(parameters_[0]->getSize(), height * width);
+  weight_ =
+      std::unique_ptr<Weight>(new Weight(height, width, parameters_[0], 0));
+
+  // create biases
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
+  }
+  return true;
+}
+
+void MKLDNNConvLayer::convertWeightsFromPaddle() {
+  if (hasInitedWgt_) {
+    return;
+  }
+
+  CHECK(wgtVal_) << "should have been initialized";
+  // the paddle weight format is oihw or goihw
+  auto targetDim = wgtVal_->getDims();
+  auto srcFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
+  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
+  hasInitedWgt_ = true;
+}
+
+void MKLDNNConvLayer::convertWeightsToPaddle() {
+  CHECK(wgtVal_) << "should have been initialized";
+  auto targetDim = wgtVal_->getDims();
+  auto dstFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
+  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
+}
+
+void MKLDNNConvLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+
+  // cal output sizes
+  // oc can not be changed
+  int fh = (fh_ - 1) * dh_ + 1;
+  int fw = (fw_ - 1) * dw_ + 1;
+  oh = outputSize(ih, fh, ph_, sh_, caffeMode_);
+  ow = outputSize(iw, fw, pw_, sw_, caffeMode_);
+
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  resetFwdPD(fwdPD_);
+
+  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+}
+
+void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
+  std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
+
+  resetBwdWgtPD(bwdWgtPD);
+
+  resetBwdDataPD(bwdDataPD);
+
+  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
+
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
+}
+
+void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt,
+                                       memory::dims& bias,
+                                       memory::dims& stride,
+                                       memory::dims& dilation,
+                                       memory::dims& padL,
+                                       memory::dims& padR) {
+  wgt = (gp_ == 1) ? memory::dims{oc_, ic_, fh_, fw_}
+                   : memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_};
+  bias = memory::dims{oc_};
+  stride = memory::dims{sh_, sw_};
+  padL = memory::dims{ph_, pw_};
+  padR = getPaddingR();
+  // note: mkldnn dilation start from 0
+  dilation = memory::dims{dh_ - 1, dw_ - 1};
+}
+
+void MKLDNNConvLayer::resetFwdPD(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd) {
+  // dims for conv
+  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  algorithm algo = algorithm::convolution_direct;
+  padding_kind padKind = padding_kind::zero;
+  conv_fwd::desc fwdDesc =
+      biases_ && biases_->getW()
+          ? conv_fwd::desc(pk,
+                           algo,
+                           MKLDNNMatrix::createMemoryDesc(inDims),
+                           MKLDNNMatrix::createMemoryDesc(wgtDims),
+                           MKLDNNMatrix::createMemoryDesc(biasDims),
+                           MKLDNNMatrix::createMemoryDesc(outDims),
+                           strides,
+                           dilations,
+                           padL,
+                           padR,
+                           padKind)
+          : conv_fwd::desc(pk,
+                           algo,
+                           MKLDNNMatrix::createMemoryDesc(inDims),
+                           MKLDNNMatrix::createMemoryDesc(wgtDims),
+                           MKLDNNMatrix::createMemoryDesc(outDims),
+                           strides,
+                           dilations,
+                           padL,
+                           padR,
+                           padKind);
+  pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_));
+}
+
+void MKLDNNConvLayer::resetFwdBuffers(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(pd);
+  resetInValue(
+      in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
+
+  resetOutValue(out, pd->dst_primitive_desc());
+
+  resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
+
+  if (biases_ && biases_->getW()) {
+    resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
+  } else {
+    bias = nullptr;
+  }
+}
+
+void MKLDNNConvLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  if (bias) {
+    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
+  } else {
+    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
+  }
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNConvLayer::resetBwdWgtPD(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& pd) {
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+
+  // create backward weight using input, output and weight value memory desc
+  CHECK(inVals_[0]) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
+  CHECK(wgtVal_) << "Should have weight value";
+  algorithm algo = algorithm::convolution_direct;
+  padding_kind padKind = padding_kind::zero;
+  auto bwdWgtDesc = biasVal_ != nullptr
+                        ? conv_bwdWgt::desc(algo,
+                                            inVals_[0]->getMemoryDesc(),
+                                            wgtVal_->getMemoryDesc(),
+                                            biasVal_->getMemoryDesc(),
+                                            outVal_->getMemoryDesc(),
+                                            strides,
+                                            padL,
+                                            padR,
+                                            padKind)
+                        : conv_bwdWgt::desc(algo,
+                                            inVals_[0]->getMemoryDesc(),
+                                            wgtVal_->getMemoryDesc(),
+                                            outVal_->getMemoryDesc(),
+                                            strides,
+                                            padL,
+                                            padR,
+                                            padKind);
+  pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      pd->diff_weights_primitive_desc(),
+      "primitive desc of weight value and grad should be equal");
+}
+
+void MKLDNNConvLayer::resetBwdDataPD(
+    std::shared_ptr<conv_bwdData::primitive_desc>& pd) {
+  pd = nullptr;
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
+    return;
+  }
+
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+  CHECK(inVals_[0]) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
+  // create backward data using input and output value memory desc
+  // but using weight memory desc with any format
+  auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
+                                        inVals_[0]->getMemoryDesc(),
+                                        MKLDNNMatrix::createMemoryDesc(wgtDims),
+                                        outVal_->getMemoryDesc(),
+                                        strides,
+                                        padL,
+                                        padR,
+                                        padding_kind::zero);
+  pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
+  CHECK_PRIMITIVE_DESC_EQ(
+      inVals_[0],
+      pd->diff_src_primitive_desc(),
+      "primitive desc of in value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
+}
+
+void MKLDNNConvLayer::resetBwdBuffers(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(wgtPD);
+  resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
+
+  resetWithMatrix(
+      wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      wgt->getPrimitiveDesc(),
+      "primitive desc of weight grad and value should be equal");
+
+  bias = nullptr;
+  if (biases_ && biases_->getWGrad()) {
+    resetWithMatrix(
+        bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
+    CHECK(bias);
+    CHECK_PRIMITIVE_DESC_EQ(
+        biasVal_,
+        bias->getPrimitiveDesc(),
+        "primitive desc of bias grad and value should be equal");
+  }
+
+  if (dataPD == nullptr) {
+    return;
+  }
+  resetInGrad(in, dataPD->diff_src_primitive_desc());
+  resetWgtValBwdData(dataPD, wgtValBwdData_);
+}
+
+void MKLDNNConvLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0]);
+  // add bwdWgt handle
+  if (bias) {
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
+  } else {
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
+  }
+  pipeline.push_back(*bwdWgt_);
+
+  if (dataPD == nullptr) {
+    return;
+  }
+  if (cvtWgtVal_) {
+    pipeline.push_back(*cvtWgtVal_);
+  }
+  // add bwdData handle
+  CHECK(wgtValBwdData_) << "Should have weight memory";
+  bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+void MKLDNNConvLayer::resetWgtValBwdData(
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& wgt) {
+  if (dataPD == nullptr) {
+    return;
+  }
+
+  // create new weight value for backward data, and create reorder if necessary
+  // since the primitive_desc would be different with wgtVal_
+  CHECK(wgtVal_) << "should have weight value";
+  if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
+    wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
+    cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
+    CHECK(cvtWgtVal_);
+  } else {
+    wgtValBwdData_ = wgtVal_;
+  }
+  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
+                    << wgtValBwdData_->getFormat();
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNConvLayer.h b/paddle/legacy/gserver/layers/MKLDNNConvLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d399035ed3ae2f411587c1fcf1799bb71c8de63e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNConvLayer.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::convolution_forward conv_fwd;
+typedef mkldnn::convolution_backward_weights conv_bwdWgt;
+typedef mkldnn::convolution_backward_data conv_bwdData;
+
+/**
+ * @brief A subclass of MKLDNNLayer conv layer.
+ *
+ * The config file api is mkldnn_conv
+ */
+class MKLDNNConvLayer : public MKLDNNLayer {
+ protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // dilation height and width
+  int dh_, dw_;
+  // filter(kenerl) height and width
+  int fh_, fw_;
+  // group number
+  int gp_;
+
+  // in resetBwdData, the format of wgtValBwdData_ is different with wgtVal_
+  MKLDNNMatrixPtr wgtValBwdData_;
+  // convert handle from wgtVal_ to wgtValBwdData_
+  std::shared_ptr<mkldnn::reorder> cvtWgtVal_;
+
+  // save forward primitive_desc, which can be used backward
+  std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
+
+  // whether the weight has been init
+  bool hasInitedWgt_;
+
+  // true by default, which impact the calculation of output image size.
+  // details can refer to mathUtil.h
+  bool caffeMode_;
+
+  // weight and bias
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit MKLDNNConvLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), hasInitedWgt_(false), caffeMode_(true) {}
+
+  ~MKLDNNConvLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+  void convertWeightsFromPaddle() override;
+
+  void convertWeightsToPaddle() override;
+
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
+  }
+
+ protected:
+  /**
+   * load the dims settings of this conv
+   */
+  void loadConvSettings(mkldnn::memory::dims& wgt,
+                        mkldnn::memory::dims& bias,
+                        mkldnn::memory::dims& stride,
+                        mkldnn::memory::dims& dilation,
+                        mkldnn::memory::dims& padL,
+                        mkldnn::memory::dims& padR);
+
+  void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
+  void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                       MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
+  void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
+  void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                       std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                       MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * reset MKLDNNMatrix of weight value for backward data
+   * since the primitive_desc would be different with wgtVal_
+   */
+  void resetWgtValBwdData(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                          MKLDNNMatrixPtr& wgt);
+
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_convolution_forward_common.hpp
+   * @note: mkldnn dilation start from 0 while paddle start from 1
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
+    for (int i = 0; i < 2; ++i) {
+      if ((ih_ - ((fh_ - 1) * dh_ + 1) + ph_ + padR[0]) / sh_ + 1 != oh_) {
+        ++padR[0];
+      }
+      if ((iw_ - ((fw_ - 1) * dw_ + 1) + pw_ + padR[1]) / sw_ + 1 != ow_) {
+        ++padR[1];
+      }
+    }
+    return padR;
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f3747c7db84ef53fdcfa3741525a754fab63bca5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNFcLayer.cpp
@@ -0,0 +1,262 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNFcLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer);
+
+bool MKLDNNFcLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet";
+
+  // output size, cat not be changed
+  oc_ = getSize();
+  oh_ = 1;
+  ow_ = 1;
+  ih_ = 1;
+  iw_ = 1;
+
+  // input size can not change in FC
+  iLayerSize_ = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_);
+
+  // create weight
+  weight_ =
+      std::unique_ptr<Weight>(new Weight(oc_, iLayerSize_, parameters_[0], 0));
+
+  // create biases
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
+  }
+  return true;
+}
+
+void MKLDNNFcLayer::convertWeightsFromPaddle() {
+  if (hasInitedWgt_) {
+    return;
+  }
+
+  CHECK(wgtVal_) << "should have been initialized";
+  auto targetDim = wgtVal_->getDims();
+  auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
+  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
+  hasInitedWgt_ = true;
+}
+
+void MKLDNNFcLayer::convertWeightsToPaddle() {
+  CHECK(wgtVal_) << "should have been initialized";
+  auto targetDim = wgtVal_->getDims();
+  auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
+  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
+}
+
+void MKLDNNFcLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+
+  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
+  ic = iLayerSize_ / (ih * iw);
+  CHECK_EQ(size_t(ic * ih * iw), iLayerSize_) << "not divisible";
+  CHECK_EQ(size_t(oc), getSize());
+
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc);
+}
+
+void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
+                             MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);
+
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+}
+
+void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
+                             MKLDNNMatrixPtr& out) {
+  std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
+  std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
+
+  resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);
+
+  resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);
+
+  resetBwdDataPD(bwdDataPD, inputs[0], out);
+
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
+}
+
+void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  CHECK(in);
+  in->downSpatial();
+
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
+  resetOutValue(out, outPD);
+
+  format wgtFmt = format::oihw;
+  if (in->getFormat() == format::nChw8c) {
+    wgtFmt = format::oIhw8i;
+  } else if (in->getFormat() == format::nChw16c) {
+    wgtFmt = format::oIhw16i;
+  }
+  auto wgtPD =
+      MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  resetWithMatrix(wgt, weight_->getW(), wgtPD);
+  wgt->downSpatial();
+
+  if (biases_ && biases_->getW()) {
+    auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
+    resetWithMatrix(bias, biases_->getW(), biasPD);
+  } else {
+    bias = nullptr;
+  }
+}
+
+void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                               MKLDNNMatrixPtr in,
+                               MKLDNNMatrixPtr wgt,
+                               MKLDNNMatrixPtr bias,
+                               MKLDNNMatrixPtr out) {
+  CHECK(in);
+  CHECK(wgt);
+  CHECK(out);
+  prop_kind pk = prop_kind::forward;
+  fc_fwd::desc fwdDesc = bias != nullptr ? fc_fwd::desc(pk,
+                                                        in->getMemoryDesc(),
+                                                        wgt->getMemoryDesc(),
+                                                        bias->getMemoryDesc(),
+                                                        out->getMemoryDesc())
+                                         : fc_fwd::desc(pk,
+                                                        in->getMemoryDesc(),
+                                                        wgt->getMemoryDesc(),
+                                                        out->getMemoryDesc());
+  pd.reset(new fc_fwd::primitive_desc(fwdDesc, engine_));
+}
+
+void MKLDNNFcLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<fc_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  if (bias) {
+    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
+  } else {
+    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
+  }
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+
+  CHECK(wgtVal_);
+  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
+
+  if (biasVal_) {
+    resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
+  } else {
+    bias = nullptr;
+  }
+}
+
+void MKLDNNFcLayer::resetBwdWgtPD(
+    std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0]);
+  fc_bwdWgt::desc bwdWgtDesc =
+      bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             bias->getMemoryDesc(),
+                             out->getMemoryDesc())
+           : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             out->getMemoryDesc());
+  pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNFcLayer::resetBwdDataPD(
+    std::shared_ptr<fc_bwdData::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(wgtVal_);
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(
+      in->getMemoryDesc(), wgtVal_->getMemoryDesc(), out->getMemoryDesc());
+  pd.reset(new fc_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNFcLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
+    std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0]);
+  if (bias) {
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
+  } else {
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
+  }
+  pipeline.push_back(*bwdWgt_);
+
+  if (bwdDataPD == nullptr) {
+    return;
+  }
+  CHECK(wgtVal_) << "Should have weight memory";
+  bwdData_.reset(new fc_bwdData(*bwdDataPD, *out, *wgtVal_, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNFcLayer.h b/paddle/legacy/gserver/layers/MKLDNNFcLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a704066cc818a6b33bd0eed4612d62b674fa72ca
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNFcLayer.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::inner_product_forward fc_fwd;
+typedef mkldnn::inner_product_backward_weights fc_bwdWgt;
+typedef mkldnn::inner_product_backward_data fc_bwdData;
+
+/**
+ * @brief A subclass of MKLDNNLayer fc layer.
+ *
+ * The config file api is mkldnn_fc
+ */
+class MKLDNNFcLayer : public MKLDNNLayer {
+ protected:
+  // input layer size, can not be change after init
+  size_t iLayerSize_;  // == ic * ih * iw
+
+  // if has already init the weight
+  bool hasInitedWgt_;
+
+  // save forward primitive_desc, which can be used backward
+  std::shared_ptr<fc_fwd::primitive_desc> fwdPD_;
+
+  // fc weight and bias
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit MKLDNNFcLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), hasInitedWgt_(false) {}
+
+  ~MKLDNNFcLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+  void convertWeightsFromPaddle() override;
+
+  void convertWeightsToPaddle() override;
+
+ protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr wgt,
+                  MKLDNNMatrixPtr bias,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
+                     MKLDNNMatrixPtr& wgt,
+                     MKLDNNMatrixPtr& bias,
+                     MKLDNNMatrixPtr& out);
+  void resetBwdDataPD(std::shared_ptr<fc_bwdData::primitive_desc>& pd,
+                      MKLDNNMatrixPtr& in,
+                      MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
+                        std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..739482348f71bf144551cd1d881f1f1d7d69201f
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.cpp
@@ -0,0 +1,163 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNLRNLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer);
+
+bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  /* the size of inputs for norm-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  localSize_ = conf.size();
+  alpha_ = conf.scale();
+  beta_ = conf.pow();
+
+  ic_ = conf.channels();
+  oc_ = ic_;
+  iw_ = conf.img_size();
+  ow_ = conf.output_x();
+  ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  CHECK_EQ(iw_, ow_);
+  CHECK_EQ(ih_, oh_);
+  return true;
+}
+
+void MKLDNNLRNLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  reshapeInput(bs, ih, iw);
+  // ic_ and oc can not be changed
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
+      << "Input channel can not be changed";
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNLRNLayer::resetFwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs[0], out);
+
+  resetFwdPD(fwdPD_, inputs[0], out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetBwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  std::shared_ptr<lrn_bwd::primitive_desc> pd;
+
+  resetBwdBuffers(inputs[0], out);
+
+  resetBwdPD(pd, inputs[0], out);
+
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  CHECK(in);
+  resetOutValue(out, in->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr in,
+                                MKLDNNMatrixPtr out) {
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = lrn_fwd::desc(pk,
+                               algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_));
+  // prepare workspace if necessary
+  workspace_ =
+      passType_ != PASS_TEST
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
+
+void MKLDNNLRNLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  fwd_ = workspace_
+             ? std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *workspace_, *out))
+             : std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr& in,
+                                MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(out);
+  auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               out->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNLRNLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  if (pd == nullptr) {
+    return;
+  }
+  CHECK(inVals_[0]);
+  CHECK(workspace_);
+  bwdData_ = std::make_shared<lrn_bwd>(
+      lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..028438f2c93b2182318c53cd348351376d491e79
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNLRNLayer.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::lrn_forward lrn_fwd;
+typedef mkldnn::lrn_backward lrn_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer.
+ *
+ * The config file api is mkldnn_lrn
+ */
+class MKLDNNLRNLayer : public MKLDNNLayer {
+ protected:
+  // save forward primitive_desc, which can be used in backward
+  std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_lrn_backward.cpp, lrn need workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+
+  int localSize_;
+  float alpha_, beta_;  // scale and pow in paddle
+
+ public:
+  explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNLRNLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+ protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLDNNLayer.cpp
rename to paddle/legacy/gserver/layers/MKLDNNLayer.cpp
diff --git a/paddle/legacy/gserver/layers/MKLDNNLayer.h b/paddle/legacy/gserver/layers/MKLDNNLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..94dc8625f68985a16bd68a6e36a1ad607d77a7cb
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNLayer.h
@@ -0,0 +1,477 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "MKLDNNBase.h"
+#include "mkldnn.hpp"
+#include "paddle/legacy/math/MKLDNNMatrix.h"
+#include "paddle/legacy/utils/Stat.h"
+
+DECLARE_bool(use_mkldnn);
+
+namespace paddle {
+
+class MKLDNNLayer;
+typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
+
+/**
+ * @brief Base class of MKLDNNlayer.
+ *
+ */
+class MKLDNNLayer : public Layer {
+ protected:
+  // batch size
+  int bs_;
+  // their sizes are always from the first input layer
+  // input image channel, height and width
+  int ic_, ih_, iw_;
+  // output image channel, height and width
+  int oc_, oh_, ow_;
+
+  // the condition that forward need be reset
+  size_t condition_;
+  // backward also need reset after reset forward handle
+  bool needResetBwd_;
+
+  // is output only mkldnn
+  bool outputOnlyMKLDNN_;
+
+  // mkldnn engine, stream and primivtives
+  mkldnn::engine engine_;
+  std::shared_ptr<MKLDNNStream> stream_;
+  std::shared_ptr<mkldnn::primitive> fwd_;
+  std::shared_ptr<mkldnn::primitive> bwdWgt_;
+  std::shared_ptr<mkldnn::primitive> bwdData_;
+  std::vector<mkldnn::primitive> pipelineFwd_;
+  std::vector<mkldnn::primitive> pipelineBwd_;
+
+  /* Value and grad are seperated as internal and external buffers.
+   * Each MKLDNNLayer must init or reset internal buffer at least,
+   * and the external buffer format is always nchw of nc(when h==w==1),
+   * which is the same format as paddle.
+   * The output_.value and output_.grad always save the external data,
+   * when mixed with cpu device.
+   * When all layers are mkldnn layers, they could save internal data.
+   */
+  // below MKLDNNMatrix buffers are all internal buffers
+  std::vector<MKLDNNMatrixPtr> inVals_;
+  std::vector<MKLDNNMatrixPtr> inGrads_;
+  MKLDNNMatrixPtr outVal_;
+  MKLDNNMatrixPtr outGrad_;
+  // below are external value and grad
+  std::vector<MKLDNNMatrixPtr> extInVals_;
+  std::vector<MKLDNNMatrixPtr> extInGrads_;
+  MKLDNNMatrixPtr extOutVal_;
+  MKLDNNMatrixPtr extOutGrad_;
+  // convert handle between external and internal buffers
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // weight and bias are always internal buffers
+  MKLDNNMatrixPtr wgtVal_;
+  MKLDNNMatrixPtr wgtGrad_;
+  MKLDNNMatrixPtr biasVal_;
+  MKLDNNMatrixPtr biasGrad_;
+
+  // merge grad primitive
+  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  std::vector<mkldnn::primitive> pipelineMergeGrad_;
+  // tmp input argument to save input grad, only used to merge grad
+  Argument tmpInArg_;
+
+ public:
+  explicit MKLDNNLayer(const LayerConfig& config)
+      : Layer(config),
+        ih_(0),
+        iw_(0),
+        condition_(0),
+        needResetBwd_(true),
+        outputOnlyMKLDNN_(false),
+        engine_(mkldnn::engine::cpu, 0),
+        stream_(nullptr),
+        fwd_(nullptr),
+        bwdWgt_(nullptr),
+        bwdData_(nullptr) {}
+
+  ~MKLDNNLayer() {}
+
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  virtual void forward(PassType passType);
+  virtual void backward(const UpdateCallback& callback);
+
+  /**
+   * reshape the input and output channels and image sizes
+   * and reset output buffer size
+   */
+  virtual void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
+
+  /**
+   * reset the mkldnn forward primitve and memories
+   * only would be called when input size changes
+   * weight and bias buffers should be coverd by child class itself
+   */
+  virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out) = 0;
+
+  /**
+   * reset the mkldnn backward primitve and memories
+   * only would be called when needed
+   * weight and bias buffers should be coverd by child class itself
+   */
+  virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out) = 0;
+
+  /**
+   * Update weights and biases if necessary.
+   */
+  virtual void updateWeights(const UpdateCallback& callback) {}
+
+  /**
+   * convert weight from paddle format to mkldnn format
+   * weight_ will be override
+   */
+  virtual void convertWeightsFromPaddle() {}
+
+  /**
+   * convert mkldnn weight to paddle format
+   * weight_ will be override
+   */
+  virtual void convertWeightsToPaddle() {}
+
+  /**
+   * add this interface as public for unit test
+   */
+  void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
+
+ protected:
+  /**
+   * Some layers may have different condition to reset the forward.
+   * The function returns the condition that do not need reset forward.
+   */
+  inline virtual size_t keepCondition() {
+    // reset when the first input element size changed, not only the batchsize
+    return inputLayers_[0]->getOutputValue()->getElementCnt();
+  }
+
+  /**
+   * reshape the input image sizes and input batchsize
+   */
+  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
+
+  /**
+   * reshape output image sizes
+   */
+  void reshapeOutput(size_t height, size_t width);
+
+  /**
+   * reset MKLDNNMatrix from Matrix and internal primitive desc.
+   * reset nullptr if matrix or primitive desc is empty
+   */
+  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                       const MatrixPtr& mat,
+                       mkldnn::memory::primitive_desc pd);
+
+  /**
+   * reset input value from input MKLDNNMatrix and internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   * input channel may be different in concat.
+   */
+  void resetInValue(
+      MKLDNNMatrixPtr& in,
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
+      size_t idx = 0,
+      int inputChannel = 0);
+
+  /**
+   * reset output value from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetOutValue(MKLDNNMatrixPtr& out,
+                     mkldnn::memory::primitive_desc intPD);
+
+  /**
+   * reset input grad from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInGrad(MKLDNNMatrixPtr& in,
+                   mkldnn::memory::primitive_desc intPD,
+                   size_t idx = 0);
+
+  /**
+   * reset output grad from internal primitive desc.
+   * merge grad if necessary.
+   * reset both internal and external buffer and create reorder if necessary.
+   * note: about merge grad, when this layer has several outputs,
+   *       it could not be mixed with cpu device,
+   *       since it can not get memory desc from cpu device.
+   */
+  void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
+
+  /**
+   * reset the merge grad primitive if necessary.
+   * note: do not support the grads mixed with cpu device,
+   *       since it can not get memory desc from cpu device.
+   */
+  void resetMergeGrad(MKLDNNMatrixPtr& out);
+
+ protected:
+  /**
+   * Set deviceId of this layer.
+   */
+  void setDevice(int id) { deviceId_ = id; }
+
+  /**
+   * check the format is nchw or nc,
+   * which is supported by Paddle default memory layout
+   */
+  bool isPaddleFormat(mkldnn::memory::format fmt) {
+    if (fmt == mkldnn::memory::format::nchw ||
+        fmt == mkldnn::memory::format::nc) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * If input only has MKLDNN device.
+   * Otherwise, only support the previous layer using CPU device.
+   */
+  bool inputIsOnlyMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+      return false;
+    }
+  }
+
+  /**
+   * If output only has MKLDNN device.
+   * Otherwise, other devices should only using CPU device.
+   */
+  bool outputIsOnlyMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
+    return outputOnlyMKLDNN_;
+  }
+
+  /**
+   * print info about sizes
+   */
+  virtual void printSizeInfo() {
+    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
+                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
+                       << ", oh: " << oh_ << ", ow: " << ow_;
+  }
+
+  /**
+   * print the mkldnn memory format of value
+   */
+  virtual void printValueFormat() {
+    for (size_t i = 0; i < inVals_.size(); ++i) {
+      if (!inVals_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
+                                                  : inVals_[i]->getFormat())
+                        << " >>> " << inVals_[i]->getFormat() << " >>>";
+    }
+    if (outVal_) {
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
+                        << (extOutVal_ ? extOutVal_->getFormat()
+                                       : outVal_->getFormat());
+    }
+    if (wgtVal_) {
+      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
+    }
+    if (biasVal_) {
+      VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
+    }
+  }
+
+  /**
+   * print the mkldnn memory format of grad
+   */
+  virtual void printGradFormat() {
+    if (outGrad_) {
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
+                        << (extOutGrad_ ? extOutGrad_->getFormat()
+                                        : outGrad_->getFormat());
+    }
+    for (size_t i = 0; i < inGrads_.size(); ++i) {
+      if (!inGrads_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
+                                                   : inGrads_[i]->getFormat())
+                        << " <<< " << inGrads_[i]->getFormat() << " <<<";
+    }
+    if (wgtGrad_) {
+      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
+    }
+    if (biasGrad_) {
+      VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
+    }
+  }
+
+ private:
+  /**
+   * clear all grad
+   */
+  void clearGrads() {
+    if (output_.grad) {
+      output_.grad->zeroMem();
+    }
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].grad) {
+        outputOtherDevice_[i].grad->zeroMem();
+      }
+    }
+  }
+
+  /**
+   * Set deviceId of the params used in this layer.
+   */
+  void setParamsDevice(int id, const ParameterMap& parameterMap) {
+    for (auto& inputConfig : config_.inputs()) {
+      if (inputConfig.has_input_parameter_name()) {
+        ParameterPtr parameter;
+        std::string name = inputConfig.input_parameter_name();
+        CHECK(mapGet(name, parameterMap, &parameter))
+            << "Cannot find input parameter " << name << " for layer "
+            << getName();
+        parameter->setDevice(id);
+      }
+    }
+    if (config_.has_bias_parameter_name()) {
+      ParameterPtr parameter;
+      std::string name = config_.bias_parameter_name();
+      CHECK(mapGet(name, parameterMap, &parameter))
+          << "Cannot find bias parameter " << name << " for layer "
+          << getName();
+      parameter->setDevice(id);
+    }
+  }
+
+  /**
+   * Set output map of prev layers.
+   */
+  void setOutputMap() {
+    outputMap_.clear();
+    for (size_t i = 0; i < inputLayers_.size(); ++i) {
+      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
+    }
+  }
+
+  /**
+   * if have cpu device, share value and grad data with output_
+   */
+  void shareCPUDevice() {
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].value = output_.value;
+      outputOtherDevice_[i].grad = output_.grad;
+    }
+  }
+
+  /**
+   * Check the cpu device number of outputOtherDevice_.
+   * should have only one at most.
+   */
+  void checkCPUOutputsNumber(int max = 1) {
+    int cnt = 0;
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+        ++cnt;
+      }
+    }
+    CHECK_LE(cnt, max) << "too much CPU devies";
+  }
+
+  /**
+   * copy SeqInfo from input layer to this output and other output devices.
+   * @note: do not use getInput(0) since it used this deviceId_,
+   *        use "inputLayers_[0]->getOutput()" instead.
+   */
+  void copySeqInfoToOutputs() {
+    if (inputLayers_.empty() || !needSequenceInfo_) {
+      return;
+    }
+    const Argument& input = inputLayers_[0]->getOutput();
+    output_.sequenceStartPositions = input.sequenceStartPositions;
+    output_.subSequenceStartPositions = input.subSequenceStartPositions;
+    output_.cpuSequenceDims = input.cpuSequenceDims;
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].sequenceStartPositions =
+          output_.sequenceStartPositions;
+      outputOtherDevice_[i].subSequenceStartPositions =
+          output_.subSequenceStartPositions;
+      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+    }
+  }
+
+  void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // MKLDNNLayer output value should be MKLDNNMatrix
+    // so external output value is necessary.
+    // Then external input value is not necessary,
+    // since input may be mkldnn internal buffer.
+    CHECK(extOutVal_) << "external output value is necessary";
+    output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+    CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
+    for (size_t i = 0; i < cvtInVals_.size(); ++i) {
+      if (cvtInVals_[i]) {
+        pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
+      }
+    }
+    if (cvtOutVal_) {
+      pipeline.push_back(*cvtOutVal_);
+    }
+  }
+  void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // external output grad is not necessary
+    // since output may be mkldnn internal buffer or merge them directly.
+    CHECK(outGrad_) << "internal output grad is necessary";
+    if (extOutGrad_) {
+      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
+          << "the external buffer should share the same data with output_.grad";
+    }
+    if (cvtOutGrad_) {
+      pipeline.insert(pipeline.begin(), *cvtOutGrad_);
+    }
+    for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
+      if (cvtInGrads_[i]) {
+        pipeline.push_back(*cvtInGrads_[i]);
+      }
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83d980538d2b1b7351bf858ab391c14f6e7170bd
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.cpp
@@ -0,0 +1,195 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNPoolLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer);
+
+bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  ic_ = conf.channels();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  oc_ = ic_;
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  fh_ = conf.size_y();
+  fw_ = conf.size_x();
+  ph_ = conf.padding_y();
+  pw_ = conf.padding();
+  sh_ = conf.stride_y();
+  sw_ = conf.stride();
+
+  const std::string& type = conf.pool_type();
+  if (type == "max-projection") {
+    poolAlgo_ = algorithm::pooling_max;
+  } else if (type == "avg-projection") {
+    // paddle only use exclude_padding
+    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
+  } else {
+    LOG(FATAL) << "unknow pooling type!";
+  }
+  return true;
+}
+
+void MKLDNNPoolLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  // ic_ and oc can not be changed
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
+      << "Input channel can not be changed";
+
+  // cal output sizes
+  // paddle used false caffeMode for pooling
+  oh = outputSize(ih, fh_, ph_, sh_, false);
+  ow = outputSize(iw, fw_, pw_, sw_, false);
+  reshapeOutput(oh, ow);
+
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs[0], out);
+
+  resetFwdPD(fwdPD_, inputs[0], out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
+}
+
+void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  std::shared_ptr<pool_bwd::primitive_desc> pd;
+
+  resetBwdBuffers(inputs[0], out);
+
+  resetBwdPD(pd, inputs[0], out);
+
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
+}
+
+void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                      MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  CHECK(in);
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
+  resetOutValue(out, outPD);
+}
+
+void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                                 MKLDNNMatrixPtr in,
+                                 MKLDNNMatrixPtr out) {
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  padding_kind padKind = padding_kind::zero;
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = pool_fwd::desc(pk,
+                                poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padKind);
+  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
+
+  // prepare workspace if necessary
+  workspace_ =
+      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
+
+void MKLDNNPoolLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<pool_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  fwd_ = workspace_
+             ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
+             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                      MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+}
+
+void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                                 MKLDNNMatrixPtr& in,
+                                 MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  CHECK(out);
+  auto bwdDesc = pool_bwd::desc(poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padding_kind::zero);
+  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNPoolLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<pool_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  if (pd == nullptr) {
+    return;
+  }
+
+  bwdData_ =
+      workspace_
+          ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
+          : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1eb0ee4ad946f61e32b7d4f4fd376dda89d6acf7
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLDNNPoolLayer.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::pooling_forward pool_fwd;
+typedef mkldnn::pooling_backward pool_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer pool layer.
+ *
+ * The config file api is mkldnn_pool
+ */
+class MKLDNNPoolLayer : public MKLDNNLayer {
+ protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // filter(kenerl) height and width
+  int fh_, fw_;
+
+  // pooling_avg or pooling_max
+  mkldnn::algorithm poolAlgo_;
+
+  // save forward primitive_desc, which can be used backward
+  std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_pooling_forward.cpp, pool need workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+
+ public:
+  explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNPoolLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_;
+  }
+
+ protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_pooling_forward.cpp
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
+    for (int i = 0; i < 2; ++i) {
+      if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
+        ++padR[0];
+      }
+      if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
+        ++padR[1];
+      }
+    }
+    return padR;
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
rename to paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.cpp
diff --git a/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..441025a9c9d75786b17db84c74995a96b6a06ea8
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLPackedWeight.h"
+#include "RecurrentLayer.h"
+
+DECLARE_bool(rnn_use_batch);
+
+namespace paddle {
+
+/**
+ * @brief MKLPackedRecurrentLayer is almost the same with RecurrentLayer
+ * but is optimized with MKL cblas packed gemm.
+ * More details:
+ * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md
+ */
+
+class MKLPackedRecurrentLayer : public RecurrentLayer {
+ public:
+  explicit MKLPackedRecurrentLayer(const LayerConfig& config)
+      : RecurrentLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+ protected:
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int* starts) override;
+
+  void backwardBatch(int batchSize,
+                     size_t numSequences,
+                     const int* starts) override;
+
+ protected:
+  /// packed_weight_ contains same data with
+  /// RecurrentLayer::weight_ but is packed
+  std::unique_ptr<MKLPackedWeight> packed_weight_;
+  /// packed_weightT_ is the transposition matrix of packed_weight_
+  std::unique_ptr<MKLPackedWeight> packed_weightT_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MKLPackedWeight.h b/paddle/legacy/gserver/layers/MKLPackedWeight.h
new file mode 100644
index 0000000000000000000000000000000000000000..47f225bd03c3ccb594db952483d3b8397b61e1ec
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MKLPackedWeight.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/math/MathFunctions.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/parameter/Weight.h"
+
+namespace paddle {
+
+class MKLPackedWeight {
+ protected:
+  /// The pointer of weight
+  real *weight_;
+  /// The pointer of cblas packed gemm to weight
+  real *packedWeight_;
+  size_t height_;
+  size_t width_;
+  bool transW_;
+
+ public:
+  explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
+    packedWeight_ = nullptr;
+    weight_ = weight->getData();
+    height_ = weight->getHeight();
+    width_ = weight->getWidth();
+    transW_ = transW;
+  }
+
+  ~MKLPackedWeight() { free_(); }
+
+  void pack() { pack_(weight_); }
+
+  void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
+    cblas_sgemm_compute(CblasRowMajor,
+                        CblasNoTrans,
+                        CblasPacked,
+                        src->getHeight(),
+                        transW_ ? height_ : width_,
+                        transW_ ? width_ : height_,
+                        src->getData(),
+                        src->getWidth(),
+                        packedWeight_,
+                        width_,
+                        1.0,
+                        dst->getData(),
+                        dst->getWidth());
+  }
+
+ protected:
+  void pack_(real *src) {
+    if (!packedWeight_) {
+      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
+    }
+    cblas_sgemm_pack(CblasRowMajor,
+                     CblasBMatrix,
+                     transW_ ? CblasTrans : CblasNoTrans,
+                     1,
+                     transW_ ? height_ : width_,
+                     transW_ ? width_ : height_,
+                     1.0,
+                     src,
+                     width_,
+                     packedWeight_);
+  }
+
+  void free_() {
+    if (packedWeight_) {
+      cblas_sgemm_free(packedWeight_);
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxIdLayer.cpp b/paddle/legacy/gserver/layers/MaxIdLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..eecd4996e962857b09001a1bb36bc027cbaa4308
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MaxIdLayer.cpp
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer for finding the id which has the maximal value for each sample.
+ * The result is stored in output_.ids.
+ *
+ * The config file api is maxid_layer.
+ */
+class MaxIdLayer : public Layer {
+ private:
+  /// a predetermined number of best states at each level
+  size_t beamSize_;
+
+ public:
+  explicit MaxIdLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    bool ret = Layer::init(layerMap, parameterMap);
+    CHECK_EQ(1UL, inputLayers_.size());
+
+    beamSize_ = config_.has_beam_size() ? config_.beam_size() : FLAGS_beam_size;
+    CHECK_GE(beamSize_, 1LU);
+    return ret;
+  }
+
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+    const Argument& input = getInput(0);
+    size_t batchSize = input.getBatchSize();
+    IVector::resizeOrCreate(output_.ids, batchSize * beamSize_, useGpu_);
+    Matrix::resizeOrCreate(output_.in,
+                           batchSize,
+                           beamSize_,
+                           false,
+                           /* useGpu */ useGpu_);
+    output_.value = nullptr;
+    input.value->rowMax(*output_.ids, *output_.in);
+  }
+
+  void backward(const UpdateCallback& callback) override {}
+};
+
+REGISTER_LAYER(maxid, MaxIdLayer);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxLayer.cpp b/paddle/legacy/gserver/layers/MaxLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b51251b663cf818fbe662a96b7c0d55a615640d4
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MaxLayer.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MaxLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(max, MaxLayer);
+
+void MaxLayer::forward(PassType passType) {
+  SequencePoolLayer::forward(passType);
+
+  IVector::resizeOrCreate(
+      maxIndex_, newBatchSize_ * getSize(), useGpu(deviceId_));
+  maxIndex_->zeroMem();
+
+  MatrixPtr inputValue = getInputValue(0);
+  MatrixPtr outputValue = getOutputValue();
+
+  {
+    REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
+    outputValue->maxSequenceForward(
+        *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
+  }
+
+  if (config_.output_max_index()) {
+    // copy maxIndex_ to output
+    outputValue->copyFrom(*maxIndex_);
+  } else {
+    /* add the bias-vector AFTER max operation */
+    if (biases_.get() != NULL) {
+      outputValue->addBias(*(biases_->getW()), 1);
+    }
+    /* activation */ { forwardActivation(); }
+  }
+}
+
+void MaxLayer::backward(const UpdateCallback& callback) {
+  CHECK(!config_.output_max_index())
+      << "backward is not available when output_max_index is set";
+  SequencePoolLayer::backward(callback);
+
+  MatrixPtr inputGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+  if (inputGrad) {
+    REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
+    inputGrad->maxSequenceBackward(
+        *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxLayer.h b/paddle/legacy/gserver/layers/MaxLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..12d0128e39f2113d0e156813f9b3657cae145eed
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MaxLayer.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "SequencePoolLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * A layer for "internal max" for sequence input.
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ *    Output: output size is the number of input sequences (NOT input instances)
+ *    output[i] = max_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shorten sequence. Stride is the step size by which we slide a
+ *              window upon the input sequence, and the max pooling operation is
+ *              then applied to each interval independently.
+ * If SequenceLevel = kSeq:
+ *    Check input sequence must has sub-sequence
+ *    Output: output size is the number of input sub-sequences
+ *    output[i] = max_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
+ */
+
+class MaxLayer : public SequencePoolLayer {
+ protected:
+  // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
+  IVectorPtr maxIndex_;
+
+ public:
+  explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    return SequencePoolLayer::init(layerMap, parameterMap);
+  }
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/legacy/gserver/layers/MaxOutLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MaxOutLayer.cpp
rename to paddle/legacy/gserver/layers/MaxOutLayer.cpp
diff --git a/paddle/legacy/gserver/layers/MaxOutLayer.h b/paddle/legacy/gserver/layers/MaxOutLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..e56f34b8e02bf1dd48c6b5b6ea135cc1009c25b5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MaxOutLayer.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * A layer to do max out on conv layer output.
+ * Input: output of a conv layer.
+ * Output: feature map size same as input.  Channel is (input channel) / groups.
+ * So the num of channels should be able to devided by groups.
+ *
+ * The config file api is maxout_layer.
+ */
+
+class MaxOutLayer : public Layer {
+ protected:
+  size_t groups_;
+  size_t imgSizeH_, imgSizeW_;
+  /// outputChannels_ = channels_ / groups_
+  size_t channels_, outputChannels_;
+  /// feature length = imgSizeH_ * imgSizeW_
+  size_t featLen_;
+  IVectorPtr maxoutId_;
+
+ public:
+  /// return imgSizeH_ * imgSizeW_ * outputChannels_;
+  size_t getSize();
+
+  explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
+  virtual ~MaxOutLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a1cc59a719e43453a8919a5827369982ac355480
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.cpp
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MaxPoolWithMaskLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  PoolLayer::init(layerMap, parameterMap);
+  setOutput("mask", &mask_);
+  return true;
+}
+
+size_t MaxPoolWithMaskLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t layerSize = 0;
+
+  outputY_ = outputSize(imgSizeY_,
+                        sizeY_,
+                        confPaddingY_,
+                        strideY_,
+                        /* caffeMode */ false);
+  outputX_ = outputSize(imgSize_,
+                        sizeX_,
+                        confPadding_,
+                        stride_,
+                        /* caffeMode */ false);
+
+  layerSize = outputX_ * outputY_ * channels_;
+  getOutput().setFrameHeight(outputY_);
+  getOutput().setFrameWidth(outputX_);
+
+  return layerSize;
+}
+
+void MaxPoolWithMaskLayer::forward(PassType passType) {
+  size_t size = getSize();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  int batchSize = inputV->getHeight();
+  resetOutput(batchSize, size);
+
+  MatrixPtr outV = getOutputValue();
+  CHECK_EQ(size, outV->getWidth());
+
+  resetSpecifyOutput(mask_,
+                     batchSize,
+                     size,
+                     /* isValueClean */ false,
+                     /* isGradClean */ true);
+
+  MatrixPtr maskV = mask_.value;
+  outV->maxPoolForward(*inputV,
+                       imgSizeY_,
+                       imgSize_,
+                       channels_,
+                       sizeX_,
+                       sizeY_,
+                       strideY_,
+                       stride_,
+                       outputY_,
+                       outputX_,
+                       confPaddingY_,
+                       confPadding_,
+                       maskV);
+}
+
+void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  if (NULL == getInputGrad(0)) {
+    return;
+  }
+
+  MatrixPtr outGrad = getOutputGrad();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
+
+  inputGrad->maxPoolBackward(*inputV,
+                             imgSizeY_,
+                             imgSize_,
+                             *outGrad,
+                             *outV,
+                             sizeX_,
+                             sizeY_,
+                             strideY_,
+                             stride_,
+                             outputY_,
+                             outputX_,
+                             1,
+                             1,
+                             confPaddingY_,
+                             confPadding_);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcd5388abe3f8229dfa418e6917a8a73c93900a7
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MaxPoolWithMaskLayer.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "PoolLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief Basic parent layer of different kinds of pooling
+ */
+class MaxPoolWithMaskLayer : public PoolLayer {
+ protected:
+  Argument mask_;
+
+ public:
+  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
+      : PoolLayer(config) {}
+
+  size_t getSize();
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MixedLayer.cpp b/paddle/legacy/gserver/layers/MixedLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..63e658c09c2b3bae30c8b2890e4d67f72266dd4d
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MixedLayer.cpp
@@ -0,0 +1,176 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MixedLayer.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(mixed, MixedLayer);
+
+bool MixedLayer::init(const LayerMap& layerMap,
+                      const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  if (!Layer::init(layerMap, parameterMap)) return false;
+
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  projections_.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    if (config_.inputs(i).has_proj_conf()) {
+      projections_[i].reset(Projection::create(
+          config_.inputs(i).proj_conf(), parameters_[i], useGpu_));
+    } else {
+      CHECK(!parameters_[i]) << "should no parameters for operators";
+    }
+  }
+  for (auto& operator_conf : config_.operator_confs()) {
+    for (auto& input_index : operator_conf.input_indices()) {
+      CHECK(!config_.inputs(input_index).has_proj_conf());
+    }
+    operators_.emplace_back(Operator::create(operator_conf, useGpu_));
+  }
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    sharedBias_ = config_.shared_biases();
+    size_t psize = config_.bias_size();
+    biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
+  }
+
+  return true;
+}
+
+void MixedLayer::prefetch() {
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    if (projections_[i]) {
+      projections_[i]->prefetch(&getInput(i));
+    }
+  }
+}
+
+void MixedLayer::resetState() {
+  for (auto& proj : projections_) {
+    if (proj) {
+      proj->resetState();
+    }
+  }
+}
+
+void MixedLayer::setState(LayerStatePtr state) {
+  CHECK(projectionStateMatrixSize_.size() == projections_.size())
+      << "projection size mis-match";
+
+  int start = 0;
+  LayerStatePtr statePtr = std::make_shared<LayerState>();
+  for (int i = 0; i < (int)projectionStateMatrixSize_.size(); i++) {
+    if (projectionStateMatrixSize_[i] > 0) {
+      statePtr->value.clear();
+      for (int j = start; j < start + projectionStateMatrixSize_[i]; j++) {
+        statePtr->value.push_back(state->value[j]);
+      }
+      projections_[i]->setState(statePtr);
+      start += projectionStateMatrixSize_[i];
+    }
+  }
+  CHECK((int)state->value.size() == start) << "state matrix size mis-match";
+}
+
+// Return state which consists of all projections states
+LayerStatePtr MixedLayer::getState() {
+  bool init = projectionStateMatrixSize_.size() == 0;
+  LayerStatePtr res = std::make_shared<LayerState>();
+  for (int i = 0; i < (int)projections_.size(); i++) {
+    LayerStatePtr statePtr =
+        projections_[i] ? projections_[i]->getState() : nullptr;
+    int stateSize = statePtr == nullptr ? 0 : statePtr->value.size();
+    if (init) {
+      projectionStateMatrixSize_.push_back(stateSize);
+    } else {
+      CHECK(projectionStateMatrixSize_[i] == stateSize)
+          << "state matrix size mis-match";
+    }
+    if (statePtr != nullptr) {
+      for (auto& matrixPtr : statePtr->value) {
+        res->value.push_back(matrixPtr);
+      }
+    }
+  }
+  return res;
+}
+
+void MixedLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = getInput(0).getBatchSize();
+  int size = getSize();
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    resetOutput(batchSize, size);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    if (projections_[i]) {
+      projections_[i]->forward(&getInput(i), &output_, passType);
+    }
+  }
+
+  std::vector<const Argument*> ins;
+  for (auto& op : operators_) {
+    ins.clear();
+    for (auto& input_index : op->getConfig().input_indices()) {
+      ins.push_back(&getInput(input_index));
+    }
+    op->forward(ins, &output_, passType);
+  }
+
+  /* add the bias-vector */
+  if (biases_.get() != NULL) {
+    REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
+    outV->addBias(*(biases_->getW()), 1, sharedBias_);
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void MixedLayer::backward(const UpdateCallback& callback) {
+  /* Do activation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  if (biases_ && biases_->getWGrad()) {
+    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
+
+    /* Increasing the number of gradient */
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    if (projections_[i]) {
+      projections_[i]->backward(callback);
+    }
+  }
+
+  for (auto& op : operators_) {
+    op->backward();
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MixedLayer.h b/paddle/legacy/gserver/layers/MixedLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..43ee2bd81854f2dea837734f556c197613f6fdaf
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MixedLayer.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "Operator.h"
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * A mixed layer has multiple input layers.
+ * Each input layer was processed by a Projection or Operator.
+ * The results of all projections or Operators are summed together with bias
+ * (if configured), and then go through an activation function and dropout
+ * (if configured).
+ *
+ * The config file api is mixed_layer.
+ */
+class MixedLayer : public Layer {
+ public:
+  explicit MixedLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~MixedLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void prefetch() override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  void resetState() override;
+  /**
+   * setState() should be called after getState().
+   * Argument state consists of all projections states.
+   */
+  void setState(LayerStatePtr state) override;
+  /**
+   * Return state which consists of all projections states.
+   */
+  LayerStatePtr getState() override;
+
+ protected:
+  std::vector<std::unique_ptr<Projection>> projections_;
+  std::vector<std::unique_ptr<Operator>> operators_;
+  /// the matrix size of projection state
+  std::vector<int> projectionStateMatrixSize_;
+  std::unique_ptr<Weight> biases_;
+  bool sharedBias_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/MultiBoxLossLayer.cpp
rename to paddle/legacy/gserver/layers/MultiBoxLossLayer.cpp
diff --git a/paddle/legacy/gserver/layers/MultiBoxLossLayer.h b/paddle/legacy/gserver/layers/MultiBoxLossLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a358cded00bb01bfe5d02f9a6d8a24e4b2e51b74
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MultiBoxLossLayer.h
@@ -0,0 +1,103 @@
+/* copyright (c) 2016 paddlepaddle authors. all rights reserve.
+
+licensed under the apache license, version 2.0 (the "license");
+you may not use this file except in compliance with the license.
+you may obtain a copy of the license at
+
+    http://www.apache.org/licenses/license-2.0
+
+unless required by applicable law or agreed to in writing, software
+distributed under the license is distributed on an "as is" basis,
+without warranties or conditions of any kind, either express or implied.
+see the license for the specific language governing permissions and
+limitations under the license. */
+
+#pragma once
+
+#include <vector>
+#include "CostLayer.h"
+#include "DataLayer.h"
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+using std::vector;
+using std::pair;
+
+namespace paddle {
+
+/**
+ * The multibox loss layer for a SSD detection task.
+ * The loss is composed by the location loss and the confidence loss.
+ * The location loss is a smooth L1 loss and the confidence loss is
+ * a softmax loss.
+ * - Input: This layer needs four input layers: The first input layer
+ *          is the priorbox layer and the second layer is a label layer.
+ *          The rest two input layers are convolution layers for generating
+ *          bbox location offset and the classification confidence.
+ * - Output: The Single Shot Multibox Detection loss value.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+
+class MultiBoxLossLayer : public CostLayer {
+ public:
+  explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr);
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
+
+  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
+
+ protected:
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+  inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[2 + index];
+  }
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[2 + inputNum_ + index];
+  }
+
+ protected:
+  size_t numClasses_;
+  real overlapThreshold_;
+  real negPosRatio_;
+  real negOverlap_;
+  size_t inputNum_;
+  size_t backgroundId_;
+
+  real locLoss_;
+  real confLoss_;
+
+  size_t numPriors_;
+  size_t numMatches_;
+  size_t numNegs_;
+  size_t numConf_;
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+
+  vector<vector<int>> allMatchIndices_;
+  vector<vector<int>> allNegIndices_;
+  MatrixPtr locGTData_;
+  IVectorPtr confGTData_;
+
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locDiff_;
+  MatrixPtr confProb_;
+
+  MatrixPtr labelCpuValue_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MultinomialSampler.cpp b/paddle/legacy/gserver/layers/MultinomialSampler.cpp
similarity index 100%
rename from paddle/gserver/layers/MultinomialSampler.cpp
rename to paddle/legacy/gserver/layers/MultinomialSampler.cpp
diff --git a/paddle/legacy/gserver/layers/MultinomialSampler.h b/paddle/legacy/gserver/layers/MultinomialSampler.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed445352418f8504e52a6139492e3577a95eecb1
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MultinomialSampler.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <random>
+#include "paddle/legacy/utils/Common.h"
+
+namespace paddle {
+
+/**
+ * @brief Given the probability of N objects, the sampler random select
+ * one of the object.
+ * @note: prob does not have to be unnormalized.
+ *
+ * The space requirement is O(N)=O(N * sizeof(Interval)).
+ * The computational complexity of generate one sample is O(1).
+ */
+class MultinomialSampler {
+ public:
+  MultinomialSampler(const real* prob, int size);
+
+  //! protobuf always using double.
+  static MultinomialSampler* create(const double* prob, int size) {
+#ifdef PADDLE_TYPE_DOUBLE
+    return new MultinomialSampler(prob, size);
+#else
+    std::unique_ptr<real[]> tmp(new real[size]);
+    std::copy(prob, prob + size, tmp.get());
+    return new MultinomialSampler(tmp.get(), size);
+#endif
+  }
+
+  /**
+   * @brief Generate a random sample.
+   * @param g is a random number engine. See <random>.
+   * @return Random integer.
+   */
+  template <typename URNG>
+  int gen(URNG& g) {
+    return gen1([&g, this]() { return rand_(g); });
+  }
+
+ protected:
+  /**
+   * @brief Generation
+   * @param[in] rand rand is a real random number distribution
+   * for the range [0, size).
+   * @return random int number or intervals_[random_int_number].otherId.
+   */
+  template <typename Rand>
+  int gen1(Rand rand) {
+    double r = rand();  // NOLINT
+    int i = (int)r;
+    r -= i;
+    return r < intervals_[i].thresh ? i : intervals_[i].otherId;
+  }
+
+  struct Interval {
+    int otherId;
+    real thresh;
+  };
+
+  /// The probability of each interval will be 1./size
+  std::vector<Interval> intervals_;
+  std::uniform_real_distribution<double> rand_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/MultiplexLayer.cpp b/paddle/legacy/gserver/layers/MultiplexLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9ca2b2417596e7978ea6b84ec76bcb8a305a4f5d
--- /dev/null
+++ b/paddle/legacy/gserver/layers/MultiplexLayer.cpp
@@ -0,0 +1,180 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ *@brief This layer multiplex multiple layers according to the index,
+ * which is provided by the first input layer.
+ * - Input[0]: the index of the layer to output of size batchSize.
+ * - Input[1:N]; the candidate output data.
+ * For each index i from 0 to batchSize -1, the output is the i-th row of the
+ * (index[i] + 1)-th layer.
+ *
+ * For each i-th row of output:
+ *
+ * \f[
+ *   y[i][j] = x_{x_{0}[i] + 1}[i][j], j = 0,1, ... , (x_{1}.width - 1)
+ * \f]
+ * where, y is output. \f$x_{k}\f$ is the k-th input layer and
+ * \f$k = x_{0}[i] + 1\f$.
+ */
+
+class MultiplexLayer : public Layer {
+ protected:
+  /**
+   * @brief A struct is used to save the copy information, includes input
+   * layer index and copy size.
+   */
+  struct CopyInfo {
+    CopyInfo(int inStartIdx, int inLength, int inCopyIdx)
+        : startIdx(inStartIdx), length(inLength), copyIdx(inCopyIdx) {}
+
+    /// The start row of input.
+    int startIdx;
+    /// Number of rows. If the layer index in Input[0] is not consecutive,
+    /// the length is one. Otherwise, the length is > 1 and copy multi rows
+    /// once.
+    int length;
+    /// The copied layer index, which needs to add 1.
+    int copyIdx;
+  };
+
+  /// A list of CopyInfo used to save copy information.
+  std::vector<CopyInfo> copySchedule_;
+
+  /// Temporary matrix pointer to point to input data.
+  MatrixPtr tmpSrc_;
+  /// Temporary matrix pointer to point to output data.
+  MatrixPtr tmpDest_;
+
+ public:
+  explicit MultiplexLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~MultiplexLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ private:
+  /**
+   * @brief Calculate copy info for input layers.
+   */
+  void calculateCopySchedule(const IVectorPtr& copyIds, size_t numIns);
+};
+
+REGISTER_LAYER(multiplex, MultiplexLayer);
+
+void MultiplexLayer::calculateCopySchedule(const IVectorPtr& copyIds,
+                                           size_t numIns) {
+  copySchedule_.clear();
+  CopyInfo prevCopyInfo(0, 0, -1);
+  for (size_t i = 0; i < copyIds->getSize(); i++) {
+    int copyId = copyIds->getElement(i);
+    CHECK_GE(copyId, 0);
+    CHECK_LT(copyId, int(numIns));
+    // copy same input layer with prevous and will copy consecutive.
+    if (copyId == prevCopyInfo.copyIdx) {
+      ++prevCopyInfo.length;
+    } else {
+      if (prevCopyInfo.copyIdx != -1) {
+        copySchedule_.emplace_back(prevCopyInfo);
+      }
+      prevCopyInfo.startIdx = i;
+      prevCopyInfo.length = 1;
+      prevCopyInfo.copyIdx = copyId;
+    }
+  }
+  if (prevCopyInfo.copyIdx != -1) {
+    copySchedule_.emplace_back(prevCopyInfo);
+  }
+}
+
+bool MultiplexLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_GE(inputLayers_.size(), 2U);
+
+  tmpSrc_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+  tmpDest_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+  return true;
+}
+
+void MultiplexLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  IVectorPtr copyIds = getInput(0).ids;
+  MatrixPtr inV1 = getInputValue(1);
+  CHECK_EQ(copyIds->getSize(), inV1->getHeight());
+  for (size_t i = 2; i < inputLayers_.size(); i++) {
+    CHECK_EQ(inV1->getHeight(), getInputValue(i)->getHeight());
+    CHECK_EQ(inV1->getWidth(), getInputValue(i)->getWidth());
+  }
+
+  calculateCopySchedule(copyIds, inputLayers_.size() - 1);
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(inV1->getHeight(), inV1->getWidth());
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwLMultplexingTimer", getName().c_str());
+    AsyncGpuBlock block;
+    for (const CopyInfo& info : copySchedule_) {
+      outV->subMatrix(info.startIdx, info.length, tmpDest_)
+          ->copyFrom(*getInputValue(info.copyIdx + 1)
+                          ->subMatrix(info.startIdx, info.length, tmpSrc_));
+    }
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void MultiplexLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  MatrixPtr outG = getOutputGrad();
+
+  {
+    REGISTER_TIMER_INFO("BwLMultiplexTimer", getName().c_str());
+    AsyncGpuBlock block;
+    for (const CopyInfo& info : copySchedule_) {
+      if (getInputGrad(info.copyIdx + 1)) {
+        getInputGrad(info.copyIdx + 1)
+            ->subMatrix(info.startIdx, info.length, tmpDest_)
+            ->add(*outG->subMatrix(info.startIdx, info.length, tmpSrc_));
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NCELayer.cpp b/paddle/legacy/gserver/layers/NCELayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae4d6408168d1597760fe0094bc04f9cef657da4
--- /dev/null
+++ b/paddle/legacy/gserver/layers/NCELayer.cpp
@@ -0,0 +1,323 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <random>
+
+#include "Layer.h"
+#include "MultinomialSampler.h"
+#include "paddle/legacy/math/MathFunctions.h"
+
+namespace paddle {
+
+/**
+ * Noise-contrastive estimation.
+ * Implements the method in the following paper:
+ * A fast and simple algorithm for training neural probabilistic language
+ * models.
+ *
+ * The config file api is nce_layer.
+ */
+class NCELayer : public Layer {
+  int numClasses_;
+  /// number of input layer besides labelLayer and weightLayer
+  int numInputs_;
+  LayerPtr labelLayer_;
+  /// weight layer, can be None
+  LayerPtr weightLayer_;
+  WeightList weights_;
+  std::unique_ptr<Weight> biases_;
+  std::unique_ptr<MultinomialSampler> sampler_;
+
+  std::uniform_int_distribution<int> rand_;
+
+  struct Sample {
+    int sampleId;
+    int labelId;
+    bool target;
+    real weight;
+  };
+  std::vector<Sample> samples_;
+  /// whether samples_ is prepared
+  bool prepared_;
+  Argument sampleOut_;
+
+  IVectorPtr labelIds_;
+
+ public:
+  explicit NCELayer(const LayerConfig& config)
+      : Layer(config),
+        numClasses_(config.num_classes()),
+        rand_(0, config.num_classes() - 1),
+        prepared_(false) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    /* Initialize the basic parent class */
+    Layer::init(layerMap, parameterMap);
+
+    /* initialize the weightList */
+    size_t i;
+    for (i = 0; i < inputLayers_.size(); i++) {
+      if (!parameters_[i]) break;
+      size_t width = inputLayers_[i]->getSize();
+      // create a new weight
+      CHECK_EQ(parameters_[i]->getSize(), width * numClasses_);
+      Weight* w = new Weight(numClasses_, width, parameters_[i]);
+
+      // append the new weight to the list
+      weights_.emplace_back(w);
+    }
+
+    CHECK_EQ(1U, getSize());
+
+    numInputs_ = i;
+    CHECK_GE(numInputs_, 1)
+        << "Must have at least one input besides label and weight";
+    CHECK_LT(i, inputLayers_.size()) << "Missing label layer";
+    labelLayer_ = inputLayers_[i];
+    if (++i < inputLayers_.size()) {
+      weightLayer_ = inputLayers_[i];
+      ++i;
+    }
+    CHECK_EQ(i, inputLayers_.size());
+
+    /* initialize biases_ */
+    if (biasParameter_.get() != NULL) {
+      CHECK_EQ(biasParameter_->getSize(), (size_t)numClasses_);
+      biases_.reset(new Weight(1, numClasses_, biasParameter_));
+    }
+
+    if (config_.neg_sampling_dist_size()) {
+      CHECK_EQ(numClasses_, config_.neg_sampling_dist_size());
+      sampler_.reset(MultinomialSampler::create(
+          config_.neg_sampling_dist().data(), numClasses_));
+    }
+
+    return true;
+  }
+
+  void prepareSamples() {
+    CHECK(!useGpu_) << "GPU is not supported";
+
+    int batchSize = getInput(*labelLayer_).getBatchSize();
+    IVectorPtr label = getInput(*labelLayer_).ids;
+
+    CpuSparseMatrixPtr multiLabel = std::dynamic_pointer_cast<CpuSparseMatrix>(
+        getInput(*labelLayer_).value);
+
+    CHECK(label || multiLabel)
+        << "The label layer must have ids or NonValueSparseMatrix value";
+
+    auto& randEngine = ThreadLocalRandomEngine::get();
+
+    samples_.clear();
+    samples_.reserve(batchSize * (1 + config_.num_neg_samples()));
+
+    real* weight =
+        weightLayer_ ? getInputValue(*weightLayer_)->getData() : nullptr;
+
+    for (int i = 0; i < batchSize; ++i) {
+      real w = weight ? weight[i] : 1;
+      if (label) {
+        int* ids = label->getData();
+        samples_.push_back({i, ids[i], true, w});
+      } else {
+        const int* cols = multiLabel->getRowCols(i);
+        int n = multiLabel->getColNum(i);
+        for (int j = 0; j < n; ++j) {
+          samples_.push_back({i, cols[j], true, w});
+        }
+      }
+      for (int j = 0; j < config_.num_neg_samples(); ++j) {
+        int id = sampler_ ? sampler_->gen(randEngine) : rand_(randEngine);
+        samples_.push_back({i, id, false, w});
+      }
+    }
+    prepared_ = true;
+  }
+
+  void prefetch() override {
+    prepareSamples();
+    IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_);
+    int* ids = labelIds_->getData();
+    for (size_t i = 0; i < samples_.size(); ++i) {
+      ids[i] = samples_[i].labelId;
+    }
+
+    for (int i = 0; i < numInputs_; ++i) {
+      auto sparseParam =
+          dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
+      if (sparseParam) {
+        sparseParam->addRows(labelIds_);
+      }
+    }
+  }
+
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+
+    CHECK(!useGpu_) << "GPU is not supported";
+
+    if (!prepared_) {
+      if (passType == PASS_GC) {
+        ThreadLocalRandomEngine::get().seed(ThreadLocalRand::getDefaultSeed());
+      }
+      prepareSamples();
+    }
+    prepared_ = false;
+
+    /* malloc memory for the output_ if necessary */
+    int batchSize = getInputValue(0)->getHeight();
+    int size = getSize();
+    resetOutput(batchSize, size);
+
+    Matrix::resizeOrCreate(sampleOut_.value,
+                           1,
+                           samples_.size(),
+                           /* trans= */ false,
+                           useGpu_);
+
+    forwardBias();
+
+    for (int l = 0; l < numInputs_; ++l) {
+      forwardOneInput(l);
+    }
+
+    auto status = activation_->forward(sampleOut_);
+    status.check();
+
+    forwardCost();
+  }
+
+  void backward(const UpdateCallback& callback) override {
+    Matrix::resizeOrCreate(sampleOut_.grad,
+                           1,
+                           samples_.size(),
+                           /* trans= */ false,
+                           useGpu_);
+
+    backwardCost();
+
+    auto status = activation_->backward(sampleOut_);
+    status.check();
+
+    if (biases_->getWGrad()) {
+      backwardBias(callback);
+    }
+
+    for (int l = 0; l < numInputs_; ++l) {
+      backwardOneInput(l, callback);
+    }
+  }
+
+  void forwardBias() {
+    if (!biases_) {
+      sampleOut_.value->zeroMem();
+    } else {
+      real* bias = biases_->getW()->getData();
+      real* sampleOut = sampleOut_.value->getData();
+      for (size_t i = 0; i < samples_.size(); ++i) {
+        sampleOut[i] = bias[samples_[i].labelId];
+      }
+    }
+  }
+
+  void backwardBias(const UpdateCallback& callback) {
+    if (!biases_) return;
+    real* bias = biases_->getWGrad()->getData();
+    real* sampleOut = sampleOut_.grad->getData();
+    for (size_t i = 0; i < samples_.size(); ++i) {
+      bias[samples_[i].labelId] += sampleOut[i];
+    }
+    biases_->incUpdate(callback);
+  }
+
+  void forwardOneInput(int layerId) {
+    const MatrixPtr& inputMat = getInputValue(layerId);
+    const MatrixPtr& weightMat = weights_[layerId]->getW();
+
+    int dim = inputMat->getWidth();
+    real* sampleOut = sampleOut_.value->getData();
+
+    for (size_t i = 0; i < samples_.size(); ++i) {
+      sampleOut[i] += dotProduct(dim,
+                                 inputMat->getRowBuf(samples_[i].sampleId),
+                                 weightMat->getRowBuf(samples_[i].labelId));
+    }
+  }
+
+  void backwardOneInput(int layerId, const UpdateCallback& callback) {
+    const MatrixPtr& inputMat = getInputValue(layerId);
+    const MatrixPtr& inputGradMat = getInputGrad(layerId);
+    const MatrixPtr& weightMat = weights_[layerId]->getW();
+    const MatrixPtr& weightGradMat = weights_[layerId]->getWGrad();
+
+    int dim = inputMat->getWidth();
+    real* sampleGrad = sampleOut_.grad->getData();
+
+    if (weightGradMat) {
+      for (size_t i = 0; i < samples_.size(); ++i) {
+        axpy(dim,
+             sampleGrad[i],
+             inputMat->getRowBuf(samples_[i].sampleId),
+             weightGradMat->getRowBuf(samples_[i].labelId));
+      }
+      weights_[layerId]->incUpdate(callback);
+    }
+
+    if (inputGradMat) {
+      for (size_t i = 0; i < samples_.size(); ++i) {
+        axpy(dim,
+             sampleGrad[i],
+             weightMat->getRowBuf(samples_[i].labelId),
+             inputGradMat->getRowBuf(samples_[i].sampleId));
+      }
+    }
+  }
+
+  void forwardCost() {
+    real* out = output_.value->getData();
+    real* sampleOut = sampleOut_.value->getData();
+    real b = 1. / numClasses_ * config_.num_neg_samples();
+    for (size_t i = 0; i < samples_.size(); ++i) {
+      real o = sampleOut[i];
+      if (sampler_) {
+        b = config_.num_neg_samples() *
+            config_.neg_sampling_dist(samples_[i].labelId);
+      }
+      real cost = samples_[i].target ? -log(o / (o + b)) : -log(b / (o + b));
+      out[samples_[i].sampleId] += samples_[i].weight * cost;
+    }
+  }
+
+  void backwardCost() {
+    real* sampleOut = sampleOut_.value->getData();
+    real* sampleGrad = sampleOut_.grad->getData();
+
+    real b = 1. / numClasses_ * config_.num_neg_samples();
+    for (size_t i = 0; i < samples_.size(); ++i) {
+      real o = sampleOut[i];
+      if (sampler_) {
+        b = config_.num_neg_samples() *
+            config_.neg_sampling_dist(samples_[i].labelId);
+      }
+      real w = samples_[i].weight;
+      sampleGrad[i] = samples_[i].target ? -w * b / (o * (o + b)) : w / (o + b);
+    }
+  }
+};
+
+REGISTER_LAYER(nce, NCELayer);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormLayer.cpp b/paddle/legacy/gserver/layers/NormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..443e26dbc859b1c51c5fb93077178ac45bdeaff3
--- /dev/null
+++ b/paddle/legacy/gserver/layers/NormLayer.cpp
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NormLayer.h"
+#include "NormProjectionLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+namespace paddle {
+
+REGISTER_LAYER_CREATE_FUNC(norm, &NormLayer::create);
+
+Layer* NormLayer::create(const LayerConfig& config) {
+  CHECK_EQ(config.inputs_size(), 1);
+  const std::string& norm = config.inputs(0).norm_conf().norm_type();
+  if (norm == "rnorm") {
+    return new ResponseNormLayer(config);
+  } else if (norm == "cmrnorm-projection") {
+    return new CMRProjectionNormLayer(config);
+  } else if (norm == "cross-channel-norm") {
+    return new CrossChannelNormLayer(config);
+  } else {
+    LOG(FATAL) << "Unknown norm type: " << norm;
+    return nullptr;
+  }
+}
+
+bool ResponseNormLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  NormLayer::init(layerMap, parameterMap);
+
+  /* the size of inputs for norm-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  size_ = conf.size();
+  scale_ = conf.scale();
+  pow_ = conf.pow();
+  outputX_ = conf.output_x();
+  imgSize_ = conf.img_size();
+  denoms_ = NULL;
+
+  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  return true;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormLayer.h b/paddle/legacy/gserver/layers/NormLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ac00034d086a5952b30576268c72af326e3ebf9
--- /dev/null
+++ b/paddle/legacy/gserver/layers/NormLayer.h
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "NormLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Basic parent layer of normalization
+ *
+ * @note Normalize the input in local region
+ */
+class NormLayer : public Layer {
+ public:
+  explicit NormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    Layer::init(layerMap, parameterMap);
+    return true;
+  }
+
+  /**
+   * @brief create norm layer by norm_type
+   */
+  static Layer* create(const LayerConfig& config);
+};
+
+/**
+ * @brief response normalization within feature maps
+ * namely normalize in independent channel
+ * When code refactoring, we delete the original implementation.
+ * Need to implement in the futrue.
+ */
+class ResponseNormLayer : public NormLayer {
+ protected:
+  size_t channels_, size_, outputX_, imgSize_, outputY_, imgSizeY_;
+  real scale_, pow_;
+  MatrixPtr denoms_;
+
+ public:
+  explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; }
+  void backward(const UpdateCallback& callback = nullptr) override {
+    LOG(FATAL) << "Not implemented";
+  }
+};
+
+/**
+ * This layer applys normalization across the channels of each sample to a
+ * conv layer's output, and scales the output by a group of trainable factors
+ * whose dimensions equal to the number of channels.
+ * - Input: One and only one input layer are accepted.
+ * - Output: The normalized data of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+class CrossChannelNormLayer : public NormLayer {
+ public:
+  explicit CrossChannelNormLayer(const LayerConfig& config)
+      : NormLayer(config) {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback);
+  MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
+  MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
+
+ protected:
+  size_t channels_;
+  std::unique_ptr<Weight> scale_;
+  MatrixPtr scaleDiff_;
+  MatrixPtr normBuffer_;
+  MatrixPtr dataBuffer_;
+  MatrixPtr channelBuffer_;
+  MatrixPtr spatialBuffer_;
+  MatrixPtr sampleBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormProjectionLayer.cpp b/paddle/legacy/gserver/layers/NormProjectionLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72affaa1ce618a841f8040c84467a46b77531958
--- /dev/null
+++ b/paddle/legacy/gserver/layers/NormProjectionLayer.cpp
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NormProjectionLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+size_t CMRProjectionNormLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t layerSize = 0;
+  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (imgSizeH_ == 0) {
+    imgSizeH_ = imgSizeY_;
+  }
+  if (imgSizeW_ == 0) {
+    imgSizeW_ = imgSize_;
+  }
+  outputH_ = imgSizeH_;
+  outputW_ = imgSizeW_;
+  layerSize = outputH_ * outputW_ * channels_;
+
+  getOutput().setFrameHeight(outputH_);
+  getOutput().setFrameWidth(outputW_);
+  return layerSize;
+}
+
+bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  ResponseNormLayer::init(layerMap, parameterMap);
+
+  /* the size of inputs for norm-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+
+  createFunction(
+      forward_,
+      "CrossMapNormal",
+      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
+  createFunction(
+      backward_,
+      "CrossMapNormalGrad",
+      FuncConfig().set("size", size_).set("scale", scale_).set("pow", pow_));
+
+  return true;
+}
+
+void CMRProjectionNormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  /* malloc memory for the output_ if necessary */
+  /* note: one sample correspond to one row */
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  int size = getSize();
+  resetOutput(batchSize, size);
+
+  Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
+
+  shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+
+  // prepare forward arguments
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), shape_);
+  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
+  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
+
+  forward_[0]->calc(inputs, outputs);
+}
+
+void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  if (NULL == getInputGrad(0)) {
+    return;
+  }
+
+  // prepare backward arguments
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), shape_);
+  inputs.addArg(*getOutputValue(), shape_);
+  inputs.addArg(*getOutputGrad(), shape_);
+  inputs.addArg(*denoms_, shape_);
+  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
+
+  backward_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/NormProjectionLayer.h b/paddle/legacy/gserver/layers/NormProjectionLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..492d1fcb72343a54577a459aaa5de53596f43f42
--- /dev/null
+++ b/paddle/legacy/gserver/layers/NormProjectionLayer.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "NormLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief response normalization across feature maps
+ * namely normalize in number of size_ channels
+ */
+class CMRProjectionNormLayer : public ResponseNormLayer {
+  size_t imgSizeH_, imgSizeW_;
+  size_t outputH_, outputW_;
+
+ public:
+  explicit CMRProjectionNormLayer(const LayerConfig& config)
+      : ResponseNormLayer(config) {}
+
+  ~CMRProjectionNormLayer() {}
+
+  size_t getSize();
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  TensorShape shape_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/Operator.cpp b/paddle/legacy/gserver/layers/Operator.cpp
similarity index 100%
rename from paddle/gserver/layers/Operator.cpp
rename to paddle/legacy/gserver/layers/Operator.cpp
diff --git a/paddle/legacy/gserver/layers/Operator.h b/paddle/legacy/gserver/layers/Operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..20a248985eb6b3aba016b28bca4c0eea44baa868
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Operator.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/parameter/Parameter.h"
+
+#include "Layer.h"
+#include "paddle/legacy/parameter/Argument.h"
+
+namespace paddle {
+
+// Macro for registering a operator type
+// Example: REGISTER_OPERATOR(dot_mul, DotMulOperator);
+#define REGISTER_OPERATOR(__type_name, __class_name)                \
+  static InitFunction __reg_type_##__type_name([]() {               \
+    Operator::registrar_.registerClass<__class_name>(#__type_name); \
+  })
+
+/**
+ * Operator like Projection, but takes more than one Arguments as input.
+ * @note: Operator can't have parameters.
+ */
+class Operator {
+ public:
+  static Operator* create(const OperatorConfig& config, bool useGpu);
+
+  Operator(const OperatorConfig& config, bool useGpu)
+      : config_(config), useGpu_(useGpu) {}
+
+  virtual ~Operator() {}
+
+  const OperatorConfig& getConfig() const { return config_; }
+
+  static ClassRegistrar<Operator, OperatorConfig, bool> registrar_;
+
+  /**
+   * Forward propagation. If backward() will be called, in and out must be kept
+   * valid until then.
+   * @param ins inputs of operator
+   * @param out output of operator
+   * @param passType PASS_TRAIN of PASS_TEST
+   */
+  void forward(std::vector<const Argument*> ins,
+               Argument* out,
+               PassType passType) {
+    ins_ = ins;
+    out_ = out;
+    passType_ = passType;
+    forward();
+  }
+
+  virtual void prefetch(const Argument* in) {}
+  virtual void forward() = 0;
+  virtual void backward() = 0;
+
+  /**
+   * See comment in Layer.h for the function with the same name.
+   */
+  virtual void resetState() {}
+
+  /**
+   * Set layer state.
+   */
+  virtual void setState(LayerStatePtr state) {}
+
+  /**
+   * Set layer state.
+   */
+  virtual LayerStatePtr getState() { return nullptr; }
+
+ protected:
+  /// Config of operator
+  OperatorConfig config_;
+  bool useGpu_;
+
+  /// Store `ins` passed to forward()
+  std::vector<const Argument*> ins_;
+  /// Store `out` passed to forward()
+  Argument* out_;
+  /// Store `passType` passed to forward()
+  PassType passType_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/OuterProdLayer.cpp b/paddle/legacy/gserver/layers/OuterProdLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d0928be9d4d52532503987af8e29fdf5c7fb16a5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/OuterProdLayer.cpp
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for computing the outer product of two vectors
+ * @note used in NEURAL TURING MACHINE
+ * Input1: vector (batchSize * dim1)
+ * Input2: vector (batchSize * dim2)
+ * Output: a matrix: (batchSize * (dim1*dim2))
+ */
+
+class OuterProdLayer : public Layer {
+ protected:
+  MatrixPtr tmpMtx0;
+  MatrixPtr tmpRow0;
+  MatrixPtr tmpRow1;
+
+ public:
+  explicit OuterProdLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~OuterProdLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(out_prod, OuterProdLayer);
+
+bool OuterProdLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+
+  size_t dim0 = inputLayers_[0]->getSize();
+  size_t dim1 = inputLayers_[1]->getSize();
+
+  CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch";
+
+  tmpRow0 = Matrix::create(
+      nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_);
+  tmpRow1 = Matrix::create(
+      nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_);
+  tmpMtx0 = Matrix::create(nullptr,
+                           /* height= */ dim0,
+                           dim1,
+                           /* trans= */ false,
+                           useGpu_);
+  return true;
+}
+
+void OuterProdLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t dim0 = inV0->getWidth();
+  size_t dim1 = inV1->getWidth();
+
+  CHECK_EQ(dim0 * dim1, getSize());
+  CHECK_EQ(inV1->getHeight(), batchSize);
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, dim0 * dim1);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  {
+    REGISTER_TIMER_INFO("FwOutProdTimer", getName().c_str());
+    for (size_t i = 0; i < batchSize; i++) {
+      tmpMtx0->setData(outV->getData() + i * dim0 * dim1);
+      tmpRow0->setData(inV0->getData() + i * dim0);
+      tmpRow1->setData(inV1->getData() + i * dim1);
+
+      tmpMtx0->mul(*tmpRow0->getTranspose(), *tmpRow1);
+    }
+  }
+}
+
+void OuterProdLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+
+  size_t batchSize = inV0->getHeight();
+  size_t dim0 = inV0->getWidth();
+  size_t dim1 = inV1->getWidth();
+
+  {
+    REGISTER_TIMER_INFO("BwOutProdTimer", getName().c_str());
+
+    if (inG0) {
+      for (size_t i = 0; i < batchSize; i++) {
+        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
+        tmpRow0->setData(inG0->getData() + i * dim0);
+        tmpRow1->setData(inV1->getData() + i * dim1);
+
+        tmpRow0->mul(*tmpRow1, *tmpMtx0->getTranspose(), 1, 1);
+      }
+    }
+
+    if (inG1) {
+      for (size_t i = 0; i < batchSize; i++) {
+        tmpMtx0->setData(outG->getData() + i * dim0 * dim1);
+        tmpRow0->setData(inV0->getData() + i * dim0);
+        tmpRow1->setData(inG1->getData() + i * dim1);
+
+        tmpRow1->mul(*tmpRow0, *tmpMtx0, 1, 1);
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PadLayer.cpp b/paddle/legacy/gserver/layers/PadLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b92b3de2d839f240ec8cbe07ed7685295568809
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PadLayer.cpp
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadLayer.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pad, PadLayer);
+
+bool PadLayer::init(const LayerMap& layerMap,
+                    const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  auto& pad_conf = config_.inputs(0).pad_conf();
+  auto& img_conf = pad_conf.image_conf();
+  CHECK_EQ(config_.inputs_size(), 1);
+  inDims_ = TensorShape(
+      {0,
+       img_conf.channels(),
+       img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(),
+       img_conf.img_size()});
+
+  CHECK_EQ(2, pad_conf.pad_c_size());
+  CHECK_EQ(2, pad_conf.pad_h_size());
+  CHECK_EQ(2, pad_conf.pad_w_size());
+  padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)};
+  padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)};
+  padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)};
+
+  outDims_ = TensorShape(4);
+  setOutDims(0);
+
+  createFunction(forward_,
+                 "Pad",
+                 FuncConfig()
+                     .set("channel", padc_)
+                     .set("height", padh_)
+                     .set("width", padw_));
+  createFunction(backward_,
+                 "PadGrad",
+                 FuncConfig()
+                     .set("channel", padc_)
+                     .set("height", padh_)
+                     .set("width", padw_));
+
+  return true;
+}
+
+void PadLayer::setOutDims(const size_t batchSize) {
+  outDims_.reshape({batchSize,
+                    inDims_[1] + padc_[0] + padc_[1],
+                    inDims_[2] + padh_[0] + padh_[1],
+                    inDims_[3] + padw_[0] + padw_[1]});
+}
+
+void PadLayer::setTensorDim(const size_t batchSize) {
+  CHECK_EQ(static_cast<int>(inputLayers_.size()), 1);
+  inDims_.setDim(0, batchSize);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h);
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);
+  setOutDims(batchSize);
+}
+
+void PadLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  setTensorDim(batchSize);
+  int size = outDims_[1] * outDims_[2] * outDims_[3];
+  resetOutput(batchSize, size);
+  MatrixPtr outV = getOutputValue();
+  REGISTER_TIMER_INFO("PadForward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+}
+
+void PadLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  REGISTER_TIMER_INFO("PadBackward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PadLayer.h b/paddle/legacy/gserver/layers/PadLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..46b8a595978489c630b3ff2429ecb19d7c12521a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PadLayer.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  This layer pads zeros to inputs according to the specify dimension.
+ *         The input and output is a 4D tensor. Padding zeros from the 2nd to
+ *         the 4th dimenstion according padc_, padh_ and padw_.
+ */
+class PadLayer : public Layer {
+ public:
+  explicit PadLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~PadLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  void setOutDims(const size_t batchSize);
+  void setTensorDim(const size_t batchSize);
+
+  std::vector<uint32_t> padc_;
+  std::vector<uint32_t> padh_;
+  std::vector<uint32_t> padw_;
+  TensorShape inDims_;
+  TensorShape outDims_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ParameterReluLayer.cpp b/paddle/legacy/gserver/layers/ParameterReluLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..23715d1975d7a3606a9418d54bc69ae6f036a93a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ParameterReluLayer.cpp
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterReluLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(prelu, ParameterReluLayer);
+
+bool ParameterReluLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  partialSum_ = config_.partial_sum();
+  CHECK_GT(partialSum_, 0UL) << "partial_sum must be larger than zero.";
+  CHECK(!(inputLayers_[0]->getSize() % partialSum_))
+      << "Incorrect value for partialSum: " << partialSum_
+      << " must divide input size: " << inputLayers_[0]->getSize();
+  CHECK_EQ(getSize() / partialSum_, parameters_[0]->getSize());
+  weight_ = std::unique_ptr<Weight>(new Weight(
+      1UL, inputLayers_[0]->getSize() / partialSum_, parameters_[0]));
+  return true;
+}
+
+void ParameterReluLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInput(0).getBatchSize();
+  int size = getSize();
+  reserveOutput(batchSize, size);
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    outV->paramReluForward(*(getInput(0).value), *(weight_->getW()));
+  }
+}
+
+void ParameterReluLayer::backward(const UpdateCallback& callback) {
+  if (weight_->getWGrad()) {
+    weight_->getWGrad()->paramReluBackwardW(*getOutputGrad(),
+                                            *(getInputValue(0)));
+  }
+
+  MatrixPtr preGrad = getInputGrad(0);
+  preGrad->paramReluBackwardDiff(
+      *getOutputGrad(), *(getInputValue(0)), *(weight_->getW()));
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ParameterReluLayer.h b/paddle/legacy/gserver/layers/ParameterReluLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..3aac4b42f60531b5856ddef208b8356898e42859
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ParameterReluLayer.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ *  @brief ParameterReluLayer active inputs with learnable parameter weight_.
+ *  forward:
+ *  \f[
+ *      y = x > 0 ? x : w .* x
+ *  \f]
+ *  backward:
+ *  \f[
+ *      dx = x > 0 ? dy : w .* dy \\
+ *      dw = x > 0 ? 0 : dy.*x
+ *  \f]
+ *  Here, x is the input, w is the weight, y is the output.
+ *  dx, dw, dy is the gradient.
+ */
+
+class ParameterReluLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> weight_;
+
+  /**
+   *  @brief partialSum_ makes a group of inputs share same weights,
+   *  - partialSum_ = 1:
+   *       element wise activation: each element has a weight_,
+   *  - partialSum_ = number of elements in one channel,
+   *       channels wise parameter activation, elements in a channel
+   *       share same weight_,
+   *  - partialSum_ = number of outputs
+   *       all elements share same weight_,
+   */
+  size_t partialSum_;
+
+ public:
+  explicit ParameterReluLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ParameterReluLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Pool3DLayer.cpp b/paddle/legacy/gserver/layers/Pool3DLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ae3f55c27f2d7bd3ab47d834d5b6f274ff558310
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Pool3DLayer.cpp
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Pool3DLayer.h"
+#include "PoolProjectionLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pool3d, Pool3DLayer);
+
+bool Pool3DLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  poolType_ = conf.pool_type();
+  channels_ = conf.channels();
+
+  sizeX_ = conf.size_x();
+  sizeY_ = conf.size_y();
+  sizeZ_ = conf.size_z();
+
+  strideW_ = conf.stride();
+  strideH_ = conf.stride_y();
+  strideD_ = conf.stride_z();
+
+  imgSizeW_ = conf.img_size();
+  imgSizeH_ = conf.img_size_y();
+  imgSizeD_ = conf.img_size_z();
+
+  paddingW_ = conf.padding();
+  paddingH_ = conf.padding_y();
+  paddingD_ = conf.padding_z();
+
+  outputW_ = conf.output_x();
+  outputH_ = conf.output_y();
+  outputD_ = conf.output_z();
+
+  return true;
+}
+
+size_t Pool3DLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+
+  size_t layerSize = 0;
+  outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false);
+  outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false);
+  outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false);
+
+  layerSize = outputD_ * outputH_ * outputW_ * channels_;
+  getOutput().setFrameHeight(outputH_);
+  getOutput().setFrameWidth(outputW_);
+  getOutput().setFrameDepth(outputD_);
+  return layerSize;
+}
+
+void Pool3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  size_t batchSize = inMat->getHeight();
+  size_t outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+  Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_);
+  const MatrixPtr outMat = getOutputValue();
+
+  if (poolType_ == "avg") {
+    outMat->avgPool3DForward(*inMat,
+                             channels_,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else if (poolType_ == "max") {
+    outMat->maxPool3DForward(*inMat,
+                             *maxPoolIdx_,
+                             channels_,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+  forwardActivation();
+}
+
+void Pool3DLayer::backward(const UpdateCallback& callback) {
+  backwardActivation();
+
+  (void)callback;
+  if (NULL == getInputGrad(0)) return;
+  MatrixPtr inMat = inputLayers_[0]->getOutputValue();
+  MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr outGradMat = getOutputGrad();
+
+  if (poolType_ == "avg") {
+    inGradMat->avgPool3DBackward(*outGradMat,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeZ_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_,
+                                 1.0,
+                                 1.0);
+  } else if (poolType_ == "max") {
+    inGradMat->maxPool3DBackward(*outGradMat,
+                                 *maxPoolIdx_,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeZ_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_,
+                                 1.0,
+                                 1.0);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/Pool3DLayer.h b/paddle/legacy/gserver/layers/Pool3DLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..6851c44ab22a39bebe3592b8e5f6384a393947f2
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Pool3DLayer.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Basic parent layer of pooling
+ * Pools the input within regions
+ */
+class Pool3DLayer : public Layer {
+ public:
+  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
+  ~Pool3DLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+  size_t getSize();
+
+ protected:
+  int channels_;
+  int sizeX_, sizeY_, sizeZ_;
+  int strideW_, strideH_, strideD_;
+  int paddingW_, paddingH_, paddingD_;
+  int imgSizeW_, imgSizeH_, imgSizeD_;
+  int outputW_, outputH_, outputD_;
+  std::string poolType_;
+  MatrixPtr maxPoolIdx_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolLayer.cpp b/paddle/legacy/gserver/layers/PoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..df172d95757e0842328caa508042f3613bc72232
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PoolLayer.cpp
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PoolLayer.h"
+#include "MaxPoolWithMaskLayer.h"
+#include "PoolProjectionLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#ifdef PADDLE_WITH_CUDA
+#include "CudnnPoolLayer.h"
+#endif
+namespace paddle {
+
+REGISTER_LAYER_CREATE_FUNC(pool, &PoolLayer::create);
+
+bool PoolLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  poolType_ = conf.pool_type();
+  channels_ = conf.channels();
+  sizeX_ = conf.size_x();
+  stride_ = conf.stride();
+  outputX_ = conf.output_x();
+  imgSize_ = conf.img_size();
+  confPadding_ = conf.padding();
+
+  sizeY_ = conf.has_size_y() ? conf.size_y() : conf.size_x();
+  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
+  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
+  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
+  return true;
+}
+
+Layer* PoolLayer::create(const LayerConfig& config) {
+  CHECK_EQ(config.inputs_size(), 1);
+  const std::string& pool = config.inputs(0).pool_conf().pool_type();
+  if (pool == "max-projection" || pool == "avg-projection") {
+    return new PoolProjectionLayer(config);
+#ifdef PADDLE_WITH_CUDA
+  } else if (CudnnPoolLayer::typeCheck(pool)) {
+    return new CudnnPoolLayer(config);
+#endif
+  } else if (pool == "max-pool-with-mask") {
+    return new MaxPoolWithMaskLayer(config);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << pool;
+    return nullptr;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolLayer.h b/paddle/legacy/gserver/layers/PoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..0808dfae8497008f974730b65977c85e914a7a27
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PoolLayer.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief Basic parent layer of pooling
+ * Pools the input within regions
+ */
+class PoolLayer : public Layer {
+ protected:
+  size_t channels_, sizeX_, stride_, outputX_, imgSize_;
+  int confPadding_;
+
+  size_t sizeY_;
+  size_t imgSizeY_;
+  size_t strideY_;
+  size_t outputY_;
+  int confPaddingY_;
+
+  std::string poolType_;
+
+  bool excludeMode_;
+
+ public:
+  explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  /**
+   * @brief create pooling layer by pool_type
+   */
+  static Layer* create(const LayerConfig& config);
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/legacy/gserver/layers/PoolProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/PoolProjection.cpp
rename to paddle/legacy/gserver/layers/PoolProjection.cpp
diff --git a/paddle/legacy/gserver/layers/PoolProjection.h b/paddle/legacy/gserver/layers/PoolProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..d01b6a13f0a5fd2283f1f216ef419b9ccc7308f9
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PoolProjection.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Projection.h"
+#include "paddle/legacy/math/MathUtils.h"
+
+namespace paddle {
+
+class PoolProjection : public Projection {
+ protected:
+  size_t imgSizeY_, imgSize_;
+  size_t outputY_, outputX_;
+  size_t strideY_, stride_;
+  size_t sizeY_, sizeX_;
+  int confPaddingY_, confPadding_;
+  size_t channels_;
+  std::string poolType_;
+  bool excludeMode_;
+
+ public:
+  PoolProjection(const ProjectionConfig& config,
+                 ParameterPtr parameter,
+                 bool useGpu);
+
+  static PoolProjection* create(const ProjectionConfig& config,
+                                ParameterPtr parameter,
+                                bool useGpu);
+
+  const std::string& getPoolType() const { return poolType_; }
+
+  size_t getSize();
+};
+
+class MaxPoolProjection : public PoolProjection {
+ public:
+  MaxPoolProjection(const ProjectionConfig& config,
+                    ParameterPtr parameter,
+                    bool useGpu)
+      : PoolProjection(config, parameter, useGpu) {}
+
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback = nullptr);
+};
+
+class AvgPoolProjection : public PoolProjection {
+ public:
+  AvgPoolProjection(const ProjectionConfig& config,
+                    ParameterPtr parameter,
+                    bool useGpu)
+      : PoolProjection(config, parameter, useGpu) {}
+
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback = nullptr);
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp b/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e44b1d7ba1494e43db81f998c2818bbbf7779d6f
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PoolProjectionLayer.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PoolProjectionLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+size_t PoolProjectionLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t layerSize = 0;
+  imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight();
+  imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (imgSizeH_ == 0) {
+    imgSizeH_ = imgSizeY_;
+  }
+  if (imgSizeW_ == 0) {
+    imgSizeW_ = imgSize_;
+  }
+
+  outputH_ = outputSize(imgSizeH_,
+                        sizeY_,
+                        confPaddingY_,
+                        strideY_,
+                        /* caffeMode */ false);
+  outputW_ = outputSize(imgSizeW_,
+                        sizeX_,
+                        confPadding_,
+                        stride_,
+                        /* caffeMode */ false);
+
+  layerSize = outputH_ * outputW_ * channels_;
+
+  return layerSize;
+}
+
+void PoolProjectionLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const Argument& in = getInput(0);
+  int batchSize = in.value->getHeight();
+  int size = getSize();
+  resetOutput(batchSize, size);
+  poolProjection_->forward(&in, &output_, passType);
+}
+
+void PoolProjectionLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  if (NULL == getInputGrad(0)) {
+    return;
+  }
+  poolProjection_->backward(callback);
+}
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PoolProjectionLayer.h b/paddle/legacy/gserver/layers/PoolProjectionLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcd35bbba4dff612fba827cdf545de71127c560e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PoolProjectionLayer.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "PoolLayer.h"
+#include "PoolProjection.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief Basic parent layer of different kinds of pooling
+ */
+class PoolProjectionLayer : public PoolLayer {
+ protected:
+  size_t imgSizeH_, imgSizeW_;
+  size_t outputH_, outputW_;
+  std::unique_ptr<PoolProjection> poolProjection_;
+  ProjectionConfig projectionConfig_;
+
+ public:
+  explicit PoolProjectionLayer(const LayerConfig& config) : PoolLayer(config) {
+    PoolConfig* conf = projectionConfig_.mutable_pool_conf();
+    *conf = config_.inputs(0).pool_conf();
+    poolProjection_.reset(
+        PoolProjection::create(projectionConfig_, nullptr, useGpu_));
+  }
+
+  size_t getSize();
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PowerLayer.cpp b/paddle/legacy/gserver/layers/PowerLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5e94c64db6098dbc1ed13bdcbd573f95024713bc
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PowerLayer.cpp
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * This layer applys a power function to a vector element-wise,
+ * which is used in NEURAL TURING MACHINE.
+ * \f[
+ *   y = x^w
+ * \f]
+ * where \f$x\f$ is a input vector, \f$w\f$ is scalar weight,
+ * and output \f$y\f$ is a vector.
+ *
+ * The config file api is power_layer.
+ */
+
+class PowerLayer : public Layer {
+ protected:
+  MatrixPtr tmpMtx;
+
+ public:
+  explicit PowerLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~PowerLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(power, PowerLayer);
+
+bool PowerLayer::init(const LayerMap& layerMap,
+                      const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+
+  return true;
+}
+
+void PowerLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV1->getHeight();
+  size_t dataDim = inV1->getWidth();
+
+  CHECK_EQ(getSize(), dataDim);
+  CHECK_EQ(1U, inV0->getWidth());
+  CHECK_EQ(batchSize, inV0->getHeight());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+
+  {
+    REGISTER_TIMER_INFO("FwPowerTimer", getName().c_str());
+    outV->rowPow(0, *inV1, *inV0);
+  }
+}
+
+void PowerLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+
+  size_t batchSize = inV1->getHeight();
+  size_t dataDim = inV1->getWidth();
+
+  {
+    REGISTER_TIMER_INFO("BwPowerTimer", getName().c_str());
+    Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
+
+    if (inG0) {
+      tmpMtx->log2(*inV1);
+      tmpMtx->dotMul(*tmpMtx, *outV);
+
+      // inG0 += outG .* (log(inV1) * outV)
+      inG0->rowDotMul(0, *outG, *tmpMtx);
+    }
+
+    if (inG1) {
+      // tmp = (outV / inV1) * inV0
+      tmpMtx->dotDiv(*outV, *inV1);
+      tmpMtx->rowScale(0, *tmpMtx, *inV0);
+
+      inG1->addDotMul(*outG, *tmpMtx, 1, 1);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PrintLayer.cpp b/paddle/legacy/gserver/layers/PrintLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6fbcc447f92208439bddd14d421d62cab30d81f4
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PrintLayer.cpp
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+class PrintLayer : public Layer {
+ public:
+  explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
+
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+    std::vector<std::string> vals;
+    for (size_t i = 0; i != inputLayers_.size(); ++i) {
+      std::ostringstream s;
+      getInput(i).printValueString(s, "");
+      vals.push_back(s.str());
+    }
+    size_t pos = 0;
+    size_t i = 0;
+    std::ostringstream s;
+    const std::string& format = config_.user_arg();
+    while (true) {
+      size_t pos1 = format.find("%s", pos);
+      if (pos1 == std::string::npos) break;
+      if (i >= vals.size()) {
+        break;
+      }
+      s << format.substr(pos, pos1 - pos) << vals[i];
+      pos = pos1 + 2;
+      ++i;
+    }
+    if (i != inputLayers_.size()) {
+      LOG(ERROR) << "Number of value in the format (" << format
+                 << ") is not same as the number of inputs ("
+                 << inputLayers_.size() << ") at " << getName();
+    }
+    s << format.substr(pos);
+
+    const std::string delimiter("\n");
+    std::string content = s.str();
+    std::string::size_type foundPos = 0;
+    std::string::size_type prevPos = 0;
+    while ((foundPos = content.find(delimiter, prevPos)) != std::string::npos) {
+      LOG(INFO) << content.substr(prevPos, foundPos - prevPos);
+      prevPos = foundPos + delimiter.size();
+    }
+    LOG(INFO) << content.substr(prevPos);
+  }
+
+  void backward(const UpdateCallback& callback) override {}
+};
+
+REGISTER_LAYER(print, PrintLayer);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/PriorBox.cpp b/paddle/legacy/gserver/layers/PriorBox.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83aab6e36662855a5867463757bc5a92e6e83e07
--- /dev/null
+++ b/paddle/legacy/gserver/layers/PriorBox.cpp
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief A layer for generating priorbox locations and variances.
+ * - Input: Two and only two input layer are accepted. The input layer must be
+ *          be a data output layer and a convolution output layer.
+ * - Output: The priorbox locations and variances of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+
+class PriorBoxLayer : public Layer {
+ public:  // NOLINT
+  explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override {}
+
+ protected:  // NOLINT
+  int numPriors_;
+  std::vector<int> minSize_;
+  std::vector<int> maxSize_;
+  std::vector<real> aspectRatio_;
+  std::vector<real> variance_;
+  MatrixPtr buffer_;
+};
+
+REGISTER_LAYER(priorbox, PriorBoxLayer);
+
+bool PriorBoxLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  auto pbConf = config_.inputs(0).priorbox_conf();
+  std::vector<real> tmp;
+  aspectRatio_.push_back(1.);
+  std::copy(pbConf.min_size().begin(),
+            pbConf.min_size().end(),
+            std::back_inserter(minSize_));
+  std::copy(pbConf.max_size().begin(),
+            pbConf.max_size().end(),
+            std::back_inserter(maxSize_));
+  std::copy(pbConf.variance().begin(),
+            pbConf.variance().end(),
+            std::back_inserter(variance_));
+  std::copy(pbConf.aspect_ratio().begin(),
+            pbConf.aspect_ratio().end(),
+            std::back_inserter(tmp));
+
+  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
+
+  // flip aspect ratios
+  for (unsigned index = 0; index < tmp.size(); index++) {
+    real ar = tmp[index];
+    if (fabs(ar - 1.) < 1e-6) continue;
+    aspectRatio_.push_back(ar);
+    aspectRatio_.push_back(1. / ar);
+  }
+
+  numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size();
+
+  return true;
+}
+
+void PriorBoxLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto input = getInput(0);
+  int layerWidth = input.getFrameWidth();
+  int layerHeight = input.getFrameHeight();
+
+  auto image = getInput(1);
+  int imageWidth = image.getFrameWidth();
+  int imageHeight = image.getFrameHeight();
+
+  real stepW = static_cast<real>(imageWidth) / layerWidth;
+  real stepH = static_cast<real>(imageHeight) / layerHeight;
+  int dim = layerHeight * layerWidth * numPriors_ * 4;
+  reserveOutput(1, dim * 2);
+  // use a cpu buffer to compute
+  Matrix::resizeOrCreate(buffer_, 1, dim * 2, false, false);
+  auto* tmpPtr = buffer_->getData();
+
+  int idx = 0;
+  for (int h = 0; h < layerHeight; ++h) {
+    for (int w = 0; w < layerWidth; ++w) {
+      real centerX = (w + 0.5) * stepW;
+      real centerY = (h + 0.5) * stepH;
+      for (size_t s = 0; s < minSize_.size(); s++) {
+        real minSize = minSize_[s];
+        real boxWidth = minSize;
+        real boxHeight = minSize;
+
+        // first prior: aspect_ratio == 1.0, compatible to old logic
+        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+        // set the variance.
+        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+
+        if (maxSize_.size() > 0) {
+          // square prior with size sqrt(minSize * maxSize)
+          real maxSize = maxSize_[s];
+          boxWidth = boxHeight = sqrt(minSize * maxSize);
+          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+          // set the variance.
+          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+        }
+
+        // priors with different aspect ratios
+        for (size_t r = 0; r < aspectRatio_.size(); r++) {
+          real ar = aspectRatio_[r];
+          if (fabs(ar - 1.0) < 1e-6) {
+            continue;
+          }
+          boxWidth = minSize * sqrt(ar);
+          boxHeight = minSize / sqrt(ar);
+          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+          // set the variance.
+          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+        }
+      }
+    }
+  }
+
+  // clip the prior's coordidate such that it is within [0, 1]
+  for (int d = 0; d < dim * 2; ++d)
+    if ((d % 8) < 4)
+      tmpPtr[d] = std::min(std::max(tmpPtr[d], (real)0.), (real)1.);
+  MatrixPtr outV = getOutputValue();
+  outV->copyFrom(buffer_->data_, dim * 2);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/Projection.cpp b/paddle/legacy/gserver/layers/Projection.cpp
similarity index 100%
rename from paddle/gserver/layers/Projection.cpp
rename to paddle/legacy/gserver/layers/Projection.cpp
diff --git a/paddle/legacy/gserver/layers/Projection.h b/paddle/legacy/gserver/layers/Projection.h
new file mode 100644
index 0000000000000000000000000000000000000000..974f5a2cacd10a965adcb4accf6ca00c26044b64
--- /dev/null
+++ b/paddle/legacy/gserver/layers/Projection.h
@@ -0,0 +1,140 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/parameter/Parameter.h"
+
+namespace paddle {
+
+// Macro for registering a projection type
+// Example: REGISTER_LAYER(fc, FullMatrixProjection);
+#define REGISTER_PROJECTION(__type_name, __class_name)                \
+  static InitFunction __reg_type_##__type_name([]() {                 \
+    Projection::registrar_.registerClass<__class_name>(#__type_name); \
+  })
+
+#define REGISTER_PROJECTION_CREATE_FUNC(__type_name, createFunction)    \
+  static InitFunction __reg_type_##__type_name([]() {                   \
+    Projection::registrar_.registerClass(#__type_name, createFunction); \
+  })
+
+/**
+ * A projection takes one Argument as input, calculate the result and add it
+ * to output Argument.
+ */
+class Projection {
+ public:
+  static Projection* create(const ProjectionConfig& config,
+                            ParameterPtr parameter,
+                            bool useGpu);
+
+  Projection(const ProjectionConfig& config,
+             ParameterPtr parameter,
+             bool useGpu)
+      : config_(config), parameter_(parameter), useGpu_(useGpu) {}
+
+  virtual ~Projection() {}
+
+  const std::string& getName() const { return config_.name(); }
+
+  /// Register a projection
+  static ClassRegistrar<Projection, ProjectionConfig, ParameterPtr, bool>
+      registrar_;
+
+  /**
+   * Forward propagation. If backward() will be called, in and out must be kept
+   * valid until then.
+   * @param in input of projection
+   * @param out output of projection
+   * @param passType PASS_TRAIN of PASS_TEST
+   */
+  void forward(const Argument* in, const Argument* out, PassType passType) {
+    in_ = in;
+    out_ = out;
+    passType_ = passType;
+    forward();
+  }
+
+  virtual void prefetch(const Argument* in) {}
+  virtual void forward() = 0;
+  virtual void backward(const UpdateCallback& callback) = 0;
+
+  /**
+   * See comment in Layer.h for the function with the same name.
+   */
+  virtual void resetState() {}
+
+  /**
+   * Set layer state.
+   */
+  virtual void setState(LayerStatePtr state) {}
+
+  /**
+   * Get layer state. A copy of internal state is returned.
+   */
+  virtual LayerStatePtr getState() { return nullptr; }
+
+  /**
+   * init forward_ and backward_ functions
+   */
+  virtual bool init() { return true; }
+
+  /**
+   * Get output size of projection.
+   */
+  size_t getOutputSize() const { return config_.output_size(); }
+
+ protected:
+  /**
+   * Create layer function. Function is called in forward or backward.
+   * \param function, Layer::forward_ or Layer::backward_
+   * \param name, function name
+   * \param config, initialization configuration for the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
+
+ protected:
+  /// Config of projection
+  ProjectionConfig config_;
+  /// Parameter of projection
+  ParameterPtr parameter_;
+  bool useGpu_;
+
+  /// Store `in` passed to forward()
+  const Argument* in_;
+  /// Store `out` passed to forward()
+  const Argument* out_;
+  /// Store `passType` passed to forward()
+  PassType passType_;
+  /// Layer forward function
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Layer backward function
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/legacy/gserver/layers/ROIPoolLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/ROIPoolLayer.cpp
rename to paddle/legacy/gserver/layers/ROIPoolLayer.cpp
diff --git a/paddle/legacy/gserver/layers/ROIPoolLayer.h b/paddle/legacy/gserver/layers/ROIPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..801a9b3aebe6d718ea38b76246a6056891d0b1f6
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ROIPoolLayer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+ * feature map.
+ * - Input: This layer needs two input layers: The first input layer is a
+ *          convolution layer; The second input layer contains the ROI data
+ *          which is the output of ProposalLayer in Faster R-CNN. layers for
+ *          generating bbox location offset and the classification confidence.
+ * - Output: The ROIs' feature map.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ * Networks
+ */
+
+class ROIPoolLayer : public Layer {
+ protected:
+  size_t channels_;
+  size_t width_;
+  size_t height_;
+  size_t pooledWidth_;
+  size_t pooledHeight_;
+  real spatialScale_;
+
+  // Since there is no int matrix, use real maxtrix instead.
+  MatrixPtr maxIdxs_;
+
+ public:
+  explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/legacy/gserver/layers/RecurrentLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/RecurrentLayer.cpp
rename to paddle/legacy/gserver/layers/RecurrentLayer.cpp
diff --git a/paddle/legacy/gserver/layers/RecurrentLayer.h b/paddle/legacy/gserver/layers/RecurrentLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..287ea27a0984729fde5b35aa0807e9f2b29f993f
--- /dev/null
+++ b/paddle/legacy/gserver/layers/RecurrentLayer.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <gflags/gflags.h>
+#include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief RecurrentLayer takes 1 input layer. The output size is the same with
+ * input layer.
+ * For each sequence [start, end] it performs the following computation:
+ * \f[
+ *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
+ *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
+ *
+ * \f]
+ * If reversed is true, the order is reversed:
+ * \f[
+ *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
+ *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
+ * \f]
+ * There are two methods to calculate rnn. One way is to compute rnn one
+ * sequence by one sequence. The other way is to reorganize the input
+ * into batches, then compute rnn one batch by one batch. Users can select
+ * them by rnn_use_batch flag.
+ */
+
+class RecurrentLayer : public Layer {
+ public:
+  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+  void resetState() override;
+
+  void setState(LayerStatePtr state) override;
+
+  LayerStatePtr getState() override;
+
+ protected:
+  /**
+   * @brief If user do not set --rnn_use_batch=true, it will
+   * compute rnn forward one sequence by one sequence in default.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts Each start position of each samples.
+   */
+  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn forward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the words
+   * number of this sequence.
+   */
+  void forwardOneSequence(int start, int length);
+  /**
+   * @brief Compute rnn backward one sequence by onesequence.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts Each start position of each samples.
+   */
+  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn backward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the words
+   * number of this sequence.
+   */
+  void backwardOneSequence(int start, int length);
+
+  /**
+   * @brief Reorganize input into batches and compute rnn forward batch
+   * by batch. It will convert batch shape to sequence after finishing forward.
+   * The batch info can refer to SequenceToBatch class.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts Each start position of each samples.
+   */
+  virtual void forwardBatch(int batchSize,
+                            size_t numSequences,
+                            const int* starts);
+
+  /**
+   * @brief Reorganize input into batches and compute rnn forward batch
+   * by batch.
+   * @param batchSize Total words number of all samples in this batch.
+   * @param numSequences The sample number.
+   * @param starts Each start position of each samples.
+   */
+  virtual void backwardBatch(int batchSize,
+                             size_t numSequences,
+                             const int* starts);
+
+ protected:
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> bias_;
+
+  /// frameOutput_[i] is used to hold the i-th sample of output_
+  std::vector<Argument> frameOutput_;
+  MatrixPtr prevOutput_;
+  /// Whether compute rnn by reverse.
+  bool reversed_;
+  /// If compute batch by batch, batchValue_ will be used to save the
+  /// reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchValue_;
+  /// If compute batch by batch, batchGrad_ will be used to save the
+  /// gradient with respect to reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchGrad_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp b/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..39321245995fce2f2bd671593c028fd6038865de
--- /dev/null
+++ b/paddle/legacy/gserver/layers/RecurrentLayerGroup.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <functional>
+#include "paddle/legacy/gserver/layers/Layer.h"
+
+#include "paddle/legacy/gserver/gradientmachines/RecurrentGradientMachine.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * Recurrent layer group is a group of layers, which forward/backward one frame
+ * after previous frame forward/backward through all layers in layer group.
+ * It's automatically added by config_parser if some layers are defined
+ * between RecurrentLayerGroupBegin and RecurrentLayerGroupEnd.
+ */
+class RecurrentLayerGroup : public Layer {
+ public:
+  explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {}
+
+  void initSubNetwork(NeuralNetwork* rootNetwork,
+                      const ModelConfig& config,
+                      const std::vector<ParameterType>& parameterTypes,
+                      bool useGpu) override;
+
+  void forward(PassType passType) override {
+    REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str());
+    const std::vector<Argument> inArgs;
+    std::vector<Argument> outArgs;
+    network_->forward(inArgs, &outArgs, passType);
+  }
+  void backward(const UpdateCallback& callback) override {
+    REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str());
+    network_->backward(nullptr);
+
+    for (auto& para : parameters_) {
+      para->incUpdate(callback);
+    }
+  }
+
+  /**
+   * @see Layer.accessSubNetwork
+   */
+  void accessSubNetwork(
+      const std::function<void(NeuralNetwork&)>& callback) override {
+    callback(*network_);
+  }
+
+ private:
+  std::unique_ptr<RecurrentGradientMachine> network_;
+};
+
+REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup);
+
+void RecurrentLayerGroup::initSubNetwork(
+    NeuralNetwork* rootNetwork,
+    const ModelConfig& config,
+    const std::vector<ParameterType>& parameterTypes,
+    bool useGpu) {
+  setNeedGradient(true);
+
+  network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork));
+  ParamInitCallback cb = [rootNetwork](int paramId, Parameter* para) {
+    para->enableSharedType(
+        PARAMETER_VALUE,
+        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_VALUE),
+        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_VALUE));
+    para->enableSharedType(
+        PARAMETER_GRADIENT,
+        rootNetwork->getParameters()[paramId]->getBuf(PARAMETER_GRADIENT),
+        rootNetwork->getParameters()[paramId]->getMat(PARAMETER_GRADIENT));
+  };
+  network_->init(config, cb, parameterTypes, useGpu);
+
+  for (auto paramId : network_->getParameterIds()) {
+    ParameterPtr parameter = rootNetwork->getParameters()[paramId];
+    parameter->incShared();
+    CHECK_EQ(parameter->getDeviceId(), getDeviceId());
+    parameters_.push_back(parameter);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ResizeLayer.cpp b/paddle/legacy/gserver/layers/ResizeLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8f8aad820f7d6d2be0af74d607d763912c3c0f2a
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ResizeLayer.cpp
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief A layer for resizing a minibatch matrix h*w to h'*w'
+ * @note
+ * origin matrix height * width)
+ * resize matrix: (height * width / size) * size
+ */
+class ResizeLayer : public Layer {
+ public:
+  explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+};
+
+REGISTER_LAYER(resize, ResizeLayer);
+
+bool ResizeLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK_EQ(1U, inputLayers_.size());
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void ResizeLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const Argument& input = getInput(0);
+  size_t height = input.value->getHeight();
+  size_t width = input.value->getWidth();
+  CHECK_EQ((height * width) % getSize(), 0UL);
+
+  reserveOutput(height * width / getSize(), getSize());
+  MatrixPtr tmp =
+      Matrix::create(output_.value->getData(), height, width, false, useGpu_);
+  tmp->assign(*input.value);
+}
+
+void ResizeLayer::backward(const UpdateCallback& callback) {
+  const Argument& input = getInput(0);
+  size_t height = input.value->getHeight();
+  size_t width = input.value->getWidth();
+
+  if (!input.grad) {
+    return;
+  }
+
+  MatrixPtr tmp = Matrix::create(input.grad->getData(),
+                                 height * width / getSize(),
+                                 getSize(),
+                                 false,
+                                 useGpu_);
+  tmp->add(*output_.grad);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RotateLayer.cpp b/paddle/legacy/gserver/layers/RotateLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/RotateLayer.cpp
rename to paddle/legacy/gserver/layers/RotateLayer.cpp
diff --git a/paddle/legacy/gserver/layers/RotateLayer.h b/paddle/legacy/gserver/layers/RotateLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..498e24372b8ca17c21ebecbe6a8c8b40217ab259
--- /dev/null
+++ b/paddle/legacy/gserver/layers/RotateLayer.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A layer for rotating a multi-channel feature map (M x N x C) in the spatial
+ * domain
+ * The rotation is 90 degrees in clock-wise for each channel
+ * \f[
+ *   y(j,i,:) = x(M-i-1,j,:)
+ * \f]
+ * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output.
+ *
+ * The config file api is rotate_layer
+ *
+ */
+
+class RotateLayer : public Layer {
+ public:
+  explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback = nullptr);
+
+ private:
+  int batchSize_;
+  int size_;
+  int height_;
+  int width_;
+  int channels_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RowConvLayer.cpp b/paddle/legacy/gserver/layers/RowConvLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1961557dc2d2601091bb0e56fcd884d76d49bc0e
--- /dev/null
+++ b/paddle/legacy/gserver/layers/RowConvLayer.cpp
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RowConvLayer.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(row_conv, RowConvLayer);
+
+bool RowConvLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  contexLength_ = config_.inputs(0).row_conv_conf().context_length();
+
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  weight_.reset(new Weight(contexLength_, getSize(), parameters_[0]));
+  createFunction(forward_, "RowConv", FuncConfig());
+  createFunction(backward_, "RowConvGrad", FuncConfig());
+
+  return true;
+}
+
+void RowConvLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = getInputValue(0);
+  size_t height = input->getHeight();
+  size_t width = input->getWidth();
+  CHECK_EQ(width, getSize());
+  resetOutput(height, width);
+
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+  MatrixPtr w = weight_->getW();
+  wDims_ = TensorShape({w->getHeight(), w->getWidth()});
+
+  MatrixPtr outV = getOutputValue();
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), *startPos);
+  inputs.addArg(*w, wDims_);
+  outputs.addArg(*getOutputValue(), *startPos, ADD_TO);
+
+  {
+    REGISTER_TIMER_INFO("RowConvForward", getName().c_str());
+    forward_[0]->calc(inputs, outputs);
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void RowConvLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), *startPos);
+  inputs.addArg(*getInputValue(0), *startPos);
+  inputs.addArg(*weight_->getW(), wDims_);
+
+  MatrixPtr inGrad = getInputGrad(0);
+  MatrixPtr wGrad = weight_->getWGrad();
+  size_t h = getInputValue(0)->getHeight();
+  size_t w = getInputValue(0)->getWidth();
+  outputs.addArg(
+      inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)),
+      *startPos,
+      ADD_TO);
+  outputs.addArg(
+      wGrad ? (*wGrad)
+            : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)),
+      wDims_,
+      ADD_TO);
+
+  {
+    REGISTER_TIMER_INFO("RowConvBackward", getName().c_str());
+    backward_[0]->calc(inputs, outputs);
+  }
+
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RowConvLayer.h b/paddle/legacy/gserver/layers/RowConvLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b74df0b1af5caef1a1abd3d3c5b3ae3b67c429b
--- /dev/null
+++ b/paddle/legacy/gserver/layers/RowConvLayer.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief Row Convolution Layer.
+ */
+class RowConvLayer : public Layer {
+ public:
+  explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~RowConvLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ protected:
+  // Row convolution weight, context_lenght_ * fan_out.
+  // fan_out is the size of output feature.
+  std::unique_ptr<Weight> weight_;
+
+  // The step number to look ahead plus one equals contexLength_.
+  size_t contexLength_;
+  TensorShape wDims_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/RowL2NormLayer.cpp b/paddle/legacy/gserver/layers/RowL2NormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d5e6e10a0276adb74ec31c13d9e8acc77414a85b
--- /dev/null
+++ b/paddle/legacy/gserver/layers/RowL2NormLayer.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer for L2 normalization in each row,
+ * \f[
+ *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
+ * \f]
+ * where the size of \f$in\f$ is (batchSize x dataDim),
+ * and the size of \f$out\f$ is (batchSize x dataDim).
+ */
+
+class RowL2NormLayer : public Layer {
+ protected:
+  MatrixPtr inSquare_;
+  MatrixPtr l2NormReciprocal_;
+  MatrixPtr dotSum_;
+
+ public:
+  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
+
+bool RowL2NormLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void RowL2NormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = getSize();
+  CHECK_EQ(dataDim, inV->getWidth());
+  resetOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
+  inV->square2(*inSquare_);
+  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
+  inSquare_->rowSum(*l2NormReciprocal_);
+  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
+  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
+  outV->rowScale(0, *inV, *l2NormReciprocal_);
+}
+
+void RowL2NormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+  size_t batchSize = inV->getHeight();
+
+  // inG[ij] += outG[ij] / l2NormReciprocal
+  // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i],
+  // inV[i])
+  if (inG) {
+    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
+    dotSum_->zeroMem();
+    dotSum_->rowDotMul(0, *outG, *outV);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    inSquare_->rowScale(0, *inV, *dotSum_);
+    inG->sub(*inSquare_);
+    inG->addRowScale(0, *outG, *l2NormReciprocal_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SamplingIdLayer.cpp b/paddle/legacy/gserver/layers/SamplingIdLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dbce63588126c012e3b9713e8be749e0001ddec7
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SamplingIdLayer.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <random>
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for sampling id from multinomial distribution from the
+ * input layer. Sampling one id for one sample. The result is stored in
+ * output_.ids.
+ *
+ * The config file api is sampling_id_layer.
+ */
+class SamplingIdLayer : public Layer {
+  /// Produces random floating-point values, uniformly distributed on [0, 1).
+  std::uniform_real_distribution<double> rand1_;
+  std::vector<Argument> tmpCpuInput_;
+
+ public:
+  explicit SamplingIdLayer(const LayerConfig& config)
+      : Layer(config), rand1_(0, 1) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
+    bool ret = Layer::init(layerMap, parameterMap);
+    CHECK_EQ(1UL, inputLayers_.size());
+    if (useGpu_) {
+      tmpCpuInput_.reserve(inputLayers_.size());
+      for (size_t i = 0; i < inputLayers_.size(); i++) {
+        tmpCpuInput_.push_back(Argument());
+      }
+    }
+    return ret;
+  }
+
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+    if (useGpu_) {
+      for (size_t i = 0; i < inputLayers_.size(); i++) {
+        tmpCpuInput_[i].resizeAndCopyFrom(
+            getInput(i), false, HPPL_STREAM_DEFAULT);
+      }
+      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+      forwardImp(tmpCpuInput_[0]);
+    } else {
+      forwardImp(getInput(0));
+    }
+  }
+
+  void forwardImp(const Argument& input) {
+    size_t batchSize = input.getBatchSize();
+    IVector::resizeOrCreate(output_.ids, batchSize, useGpu_);
+    real* buf = input.value->getData();
+    int dim = input.value->getWidth();
+    std::vector<int> ids(batchSize);
+    auto& reng = ThreadLocalRandomEngine::get();
+    for (size_t i = 0; i < batchSize; ++i) {
+      double r = rand1_(reng);
+      int id = dim - 1;
+      for (int j = 0; j < dim; ++j) {
+        if ((r -= buf[i * dim + j]) < 0) {
+          id = j;
+          break;
+        }
+      }
+      ids[i] = id;
+    }
+    output_.ids->copyFrom(ids.data(), batchSize);
+  }
+
+  void backward(const UpdateCallback& callback) override {}
+};
+
+REGISTER_LAYER(sampling_id, SamplingIdLayer);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp b/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8af78a2e27d2b50572f8bdd6e98696f3d1967eb1
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ScaleShiftLayer.cpp
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer applies a linear transformation to each element in each row of
+ * the input matrix. For each element, the layer first re-scale it and then
+ * adds a bias to it.
+ *
+ * \f[
+ *    y = wx + b
+ * \f]
+ *
+ * Here, w is the scale and b is the bias. Both w and b are trainable scalars.
+ *
+ */
+
+class ScaleShiftLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> scale_;
+  std::unique_ptr<Weight> offset_;
+
+ public:
+  explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(scale_shift, ScaleShiftLayer);
+
+bool ScaleShiftLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 1U);
+  scale_.reset(new Weight(1, 1, parameters_[0]));
+  if (biasParameter_.get() != NULL) {
+    offset_ = std::unique_ptr<Weight>(new Weight(1, 1, biasParameter_));
+  }
+  return true;
+}
+
+void ScaleShiftLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+  resetOutput(inV->getHeight(), inV->getWidth());
+  MatrixPtr outV = getOutputValue();
+  real scaleValue = scale_->getW()->getElement(0, 0);
+  outV->mulScalar(*inV, scaleValue);
+  if (offset_) {
+    real offsetValue = offset_->getW()->getElement(0, 0);
+    outV->add(offsetValue);
+  }
+}
+
+void ScaleShiftLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+
+  /* Calculate the parameter gradient for the current layer */
+  if (scale_->getWGrad()) {
+    MatrixPtr rowSumMtx;
+    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
+    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
+    rowSumMtx->sumOfProducts(
+        /* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.);
+    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
+    scale_->getWGrad()->sumCols(
+        /* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.);
+    scale_->getParameterPtr()->incUpdate(callback);
+  }
+  if (offset_ && offset_->getWGrad()) {
+    MatrixPtr rowSumMtx;
+    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
+    rowSumMtx->sumRows(*outG, 1., 0.);
+    offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.);
+    offset_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers error */
+  if (inG) {
+    real scaleValue = scale_->getW()->getElement(0, 0);
+    inG->add(*outG, scaleValue);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..70d44d2a7ef25df64beb2c861692436d842dac02
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.cpp
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionLayer.h"
+#include "paddle/legacy/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
+
+bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(static_cast<int>(inputLayers_.size()), 2);
+  auto& conf = config_.inputs(0).scale_sub_region_conf();
+  value_ = conf.value();
+
+  createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
+  createFunction(
+      backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
+
+  return true;
+}
+
+void ScaleSubRegionLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto in0 = getInput(0);
+  imgH_ = in0.getFrameHeight();
+  imgW_ = in0.getFrameWidth();
+  if (imgH_ == 0 || imgW_ == 0) {
+    auto& conf = config_.inputs(0).scale_sub_region_conf();
+    imgH_ = conf.image_conf().img_size_y();
+    imgW_ = conf.image_conf().img_size();
+  }
+  MatrixPtr imgV = in0.value;
+  size_t batchSize = imgV->getHeight();
+  size_t spatialSize = imgH_ * imgW_;
+  channelsNum_ = imgV->getWidth() / spatialSize;
+  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
+
+  resetOutput(batchSize, imgV->getWidth());
+  auto& out = getOutput();
+  out.setFrameHeight(imgH_);
+  out.setFrameWidth(imgW_);
+
+  MatrixPtr indicesV = getInputValue(1);
+  indicesShape_ = TensorShape({batchSize, 6});
+
+  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
+  BufferArgs inArgs;
+  BufferArgs outArgs;
+  inArgs.addArg(*imgV, shape_);
+  inArgs.addArg(*indicesV, indicesShape_);
+  outArgs.addArg(*out.value, shape_, ASSIGN_TO);
+  forward_[0]->calc(inArgs, outArgs);
+}
+
+void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
+  REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
+  BufferArgs inArgs;
+  BufferArgs outArgs;
+  inArgs.addArg(*getOutputGrad(), shape_);
+  inArgs.addArg(*getInputValue(1), indicesShape_);
+  outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
+  backward_[0]->calc(inArgs, outArgs);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe431698bc6cd5e52e2c545756b40be8b307e644
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ScaleSubRegionLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  For each instance, this layer can be used to multiply a value to a
+ *         specified sub continuous region. By providing start index and end
+ *         index for C/H/W, you can specify the location and shape of the
+ *         region.
+ *
+ *         input_0: Input value.
+ *         input_1: Indices value to specify the location an shape of the
+ *                  region.
+ */
+class ScaleSubRegionLayer : public Layer {
+ public:
+  explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ScaleSubRegionLayer() {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr);
+
+ protected:
+  TensorShape shape_;
+  TensorShape indicesShape_;
+  size_t imgH_;
+  size_t imgW_;
+  size_t channelsNum_;
+  real value_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScalingLayer.cpp b/paddle/legacy/gserver/layers/ScalingLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a8286b6614c3cdfbd720d0719f939018f6ae9579
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ScalingLayer.cpp
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for each row of a matrix, multiplying with a element of a vector,
+ * which is used in NEURAL TURING MACHINE.
+ * \f[
+ *   y.row[i] = w[i] * x.row[i]
+ * \f]
+ * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is
+ * (batchSize x 1) weight vector, and \f$y\f$ is (batchSize x dataDim) output.
+ *
+ * The config file api is scaling_layer.
+ */
+
+class ScalingLayer : public Layer {
+ public:
+  explicit ScalingLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ScalingLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(scaling, ScalingLayer);
+
+bool ScalingLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+
+  return true;
+}
+
+void ScalingLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr weightV = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV1->getHeight();
+  size_t dataDim = inV1->getWidth();
+
+  CHECK_EQ(dataDim, getSize());
+  CHECK_EQ(weightV->getWidth(), 1U);
+  CHECK_EQ(weightV->getHeight(), batchSize);
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    resetOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwScalingTimer", getName().c_str());
+    // outV += inV1 * weight
+    outV->addRowScale(0, *inV1, *weightV);
+  }
+}
+
+void ScalingLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr weightV = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+  MatrixPtr outG = getOutputGrad();
+
+  {
+    REGISTER_TIMER_INFO("BwScalingTimer", getName().c_str());
+
+    if (inG0) {
+      // inG0 += outG .* inV1
+      inG0->rowDotMul(0, *outG, *inV1);
+    }
+
+    if (inG1) {
+      // inG1 += outG * weight;
+      inG1->addRowScale(0, *outG, *weightV);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ScalingProjection.cpp b/paddle/legacy/gserver/layers/ScalingProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d871cafc4d0194a61044d76a766236209c33d47
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ScalingProjection.cpp
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+
+namespace paddle {
+
+class ScalingProjection : public Projection {
+ public:
+  ScalingProjection(const ProjectionConfig& config,
+                    const ParameterPtr& parameter,
+                    bool useGpu)
+      : Projection(config, parameter, useGpu) {
+    CHECK_EQ(parameter->getSize(), 1UL);
+    weight_.reset(new Weight(1, 1, parameter));
+  }
+
+  void forward() {
+    CHECK(in_->value);
+    out_->value->add(*in_->value, weight_->getW()->getElement(0, 0));
+  }
+
+  void backward(const UpdateCallback& callback) {
+    if (weight_->getWGrad()) {
+      auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_);
+      sum->sumOfProducts(*in_->value,
+                         *out_->grad,
+                         /* scaleSum= */ 1,
+                         /* scaleDest= */ 0);
+      weight_->getWGrad()->sumCols(*sum,
+                                   /* scaleSum= */ 1,
+                                   /* scaleDest= */ 1);
+      parameter_->incUpdate(callback);
+    }
+    if (in_->grad) {
+      in_->grad->add(*out_->grad, weight_->getW()->getElement(0, 0));
+    }
+  }
+
+ protected:
+  std::unique_ptr<Weight> weight_;
+};
+
+REGISTER_PROJECTION(scaling, ScalingProjection);
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72fb06814884cc2bcca2c600105077d8cf1459c5
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.cpp
@@ -0,0 +1,336 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SelectiveFullyConnectedLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(selective_fc, SelectiveFullyConnectedLayer);
+
+bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap,
+                                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  inputNum_ = inputLayers_.size();
+  if (config_.has_selected_colums()) {
+    inputNum_ -= 1;
+  }
+  for (size_t i = 0; i < inputNum_; i++) {
+    size_t height = inputLayers_[i]->getSize();
+    size_t width = getSize();
+    // NOTE weight is transpoed
+    weights_.emplace_back(new Weight(width, height, parameters_[i]));
+  }
+
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+
+  fullOutput_ = false;
+
+  return true;
+}
+
+void SelectiveFullyConnectedLayer::prefetch() {}
+
+void SelectiveFullyConnectedLayer::reserveOutput(size_t height,
+                                                 size_t width,
+                                                 size_t nnz) {
+  bool flag = (passType_ == PASS_TEST &&
+               config_.selective_fc_pass_generation() && !fullOutput_);
+  SetDevice device(output_.deviceId);
+  if (flag) {
+    // output_.value is sparse matrix
+    if (dynamic_cast<CpuMatrix*>(output_.value.get()) ||
+        dynamic_cast<GpuMatrix*>(output_.value.get())) {
+      output_.value = nullptr;
+    }
+    Matrix::resizeOrCreateSparseMatrix(output_.value,
+                                       height,
+                                       width,
+                                       nnz,
+                                       FLOAT_VALUE,
+                                       SPARSE_CSR,
+                                       /*trans=*/false,
+                                       /*useGpu=*/useGpu_);
+    output_.value->copyFrom(*selCols_);
+    interOutput_ = output_.value;
+  } else {
+    if (fullOutput_) {
+      // output_.value is dense matrix
+      if (dynamic_cast<CpuSparseMatrix*>(output_.value.get()) ||
+          dynamic_cast<GpuSparseMatrix*>(output_.value.get())) {
+        output_.value = nullptr;
+      }
+      Matrix::resizeOrCreate(output_.value,
+                             height,
+                             width,
+                             /*trans=*/false,
+                             /*useGpu=*/useGpu_);
+      interOutput_ = output_.value;
+    } else {
+      // output_.value is dense matrix, but width = nnz /height
+      CHECK_EQ(nnz % height, 0U);
+      CHECK(nnz / height);
+      Matrix::resizeOrCreate(output_.value,
+                             height,
+                             nnz / height,
+                             /*trans=*/false,
+                             /*useGpu=*/useGpu_);
+      interOutput_ = Matrix::createSparseMatrix(output_.value->getData(),
+                                                selCols_->getRows(),
+                                                selCols_->getCols(),
+                                                height,
+                                                width,
+                                                nnz,
+                                                FLOAT_VALUE,
+                                                SPARSE_CSR,
+                                                /*trans=*/false,
+                                                /*useGpu=*/useGpu_);
+    }
+  }
+  interOutput_->zeroMem();
+
+  if (passType_ != PASS_TEST && needGradient()) {
+    CHECK_EQ(nnz % height, 0U) << "during training, each sample must have a "
+                                  "same number of selected columns.";
+    CHECK(nnz / height)
+        << "during training, "
+           "each sample must have at least one column selected.";
+    Matrix::resizeOrCreate(output_.grad,
+                           height,
+                           nnz / height,
+                           /*trans=*/false,
+                           /*useGpu=*/useGpu_);
+    output_.grad->zeroMem();
+  }
+}
+
+void SelectiveFullyConnectedLayer::forward(PassType passType) {
+  REGISTER_TIMER("selective_fc.forward");
+  Layer::forward(passType);
+
+  getSelectiveCols();
+  size_t height = getInput(0).getBatchSize();
+  size_t width = getSize();
+  size_t nnz = height * width;
+  if (!fullOutput_) {
+    CHECK(selCols_);
+    CHECK(height == selCols_->getHeight());
+    CHECK(width == selCols_->getWidth());
+    nnz = selCols_->getElementCnt();
+  }
+
+  // Layer::ResetOutput(), here we set outV/outG as SparseMatrix manually
+  // this outV should be used as input of MaxIdLayer and softmax activation
+  reserveOutput(height, width, nnz);
+
+  bool flag = true;
+  for (size_t i = 0; i < inputNum_; i++) {
+    MatrixPtr input = getInputValue(i);
+    MatrixPtr weight = weights_[i]->getW();
+    size_t hsize = input->getHeight();
+    size_t wsize = weight->getHeight();
+    real scaleT = i == 0 ? real(0) : real(1);
+
+    flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() &&
+           !fullOutput_;
+    if (flag) {
+      // if the indecies are highly sparse,
+      // manully compute the multiplication of
+      // the input vector and the selected rows.
+      REGISTER_TIMER("selective.plain");
+      interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
+    } else {
+      // if the indecies is not sparse enough,
+      // use full mul instead
+      REGISTER_TIMER("selective.mul");
+      if (fullOutput_) {
+        interOutput_->mul(*input, *weight->getTranspose(), 1, scaleT);
+      } else {
+        Matrix::resizeOrCreate(mmat_,
+                               hsize,
+                               wsize,
+                               /*trans=*/false,
+                               /*useGpu=*/useGpu_);
+        mmat_->mul(*input, *weight->getTranspose());
+        interOutput_->add3(mmat_);
+      }
+    }
+  }
+
+  if (biases_) {
+    interOutput_->addBias(*(biases_->getW()), 1);
+  }
+
+  flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() &&
+          !fullOutput_);
+  if (flag) {
+    // during generation, output of this layer is a sparse csr matrix,
+    // which is probably the input of maxid layer
+    // if the model is trained with multi-class-cross-entroy-with-selfnorm,
+    // activiation of this layer should be exponential, not softmax.
+
+    Argument arg;
+    arg.value = Matrix::create(interOutput_->getData(),
+                               1,
+                               nnz,
+                               /*trans=*/false,
+                               /*useGpu=*/useGpu_);
+    //! TODO(yuyang18): Why we cannot invoke forwardActivation here?
+    activation_->forward(arg).check();
+  } else /* train and test in train, not generating */ {
+    // during training, this layer output value is *Matrix*, which is input of
+    // eg. multi-class-cross-entropy
+
+    // while training, every sample has a equal number of selected
+    // columns to be activated.
+    // note indices of multi-class-cross-entropy need to be remapped
+    // to this index.
+    // e.g. sample = [1,3,5] and 3 is gold, then label is 1
+
+    forwardActivation();
+  }
+}
+
+void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) {
+  backwardActivation();
+  MatrixPtr oGrad = getOutputGrad();
+  if (!fullOutput_) {
+    interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(),
+                                               interOutput_->getRows(),
+                                               interOutput_->getCols(),
+                                               interOutput_->getHeight(),
+                                               interOutput_->getWidth(),
+                                               interOutput_->getElementCnt(),
+                                               FLOAT_VALUE,
+                                               SPARSE_CSR,
+                                               /*trans=*/false,
+                                               /*useGpu=*/useGpu_);
+  } else {
+    interOutGrad_ = Matrix::create(oGrad->getData(),
+                                   oGrad->getHeight(),
+                                   oGrad->getWidth(),
+                                   /*trans=*/false,
+                                   /*useGpu=*/useGpu_);
+  }
+
+  if (biases_ && biases_->getWGrad()) {
+    REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
+    biases_->getWGrad()->collectBias(*interOutGrad_, 1);
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  // backward is different from FullyConnectedLayer
+  // because the weight is transposed
+  for (size_t i = 0; i < inputNum_; i++) {
+    AsyncGpuBlock block;
+    MatrixPtr preGrad = getInputGrad(i);
+    if (preGrad) {
+      REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
+      preGrad->mul(*interOutGrad_, *weights_[i]->getW(), 1, 1);
+    }
+
+    MatrixPtr wGrad = weights_[i]->getWGrad();
+    if (wGrad) {
+      REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
+      MatrixPtr input = getInputValue(i);
+      wGrad->mul(*interOutGrad_->getTranspose(), *input, 1, 1);
+    }
+
+    {
+      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+
+void paddle::SelectiveFullyConnectedLayer::fillSelectiveData(
+    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates) {
+  if (candidates == nullptr) {
+    fillFullySelectiveData();
+    return;
+  }
+
+  size_t sampleNum = candidates->size();
+  size_t outputWidth = getSize();
+  size_t nnz =
+      std::accumulate(candidates->begin(),
+                      candidates->end(),
+                      0UL,
+                      [](size_t a, const std::pair<int*, size_t>& arr) {
+                        return a + arr.second;
+                      });
+
+  Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_,
+                                     sampleNum,
+                                     outputWidth,
+                                     nnz,
+                                     NO_VALUE,
+                                     SPARSE_CSR,
+                                     false,
+                                     false);
+  CHECK(this->cpuSelCols_ != nullptr);
+  CpuSparseMatrixPtr selCols =
+      std::dynamic_pointer_cast<CpuSparseMatrix>(cpuSelCols_);
+  int* rowOffsets = selCols->getRows();
+  int* colIndices = selCols->getCols();
+
+  rowOffsets[0] = 0;
+  int idx = 0;
+  for (size_t i = 0; i < sampleNum; ++i) {
+    if ((*candidates)[i].second > 0) {
+      rowOffsets[i + 1] = rowOffsets[i] + (*candidates)[i].second;
+      for (size_t j = 0; j < (*candidates)[i].second; ++j) {
+        colIndices[idx] = (*candidates)[i].first[j];
+        idx++;
+      }
+    } else {
+      rowOffsets[i + 1] = rowOffsets[i];
+    }
+  }
+
+  CHECK_EQ(static_cast<size_t>(rowOffsets[sampleNum]), nnz);
+  if (!useGpu_) {
+    this->selCols_ = this->cpuSelCols_;
+  } else {
+    Matrix::resizeOrCreateSparseMatrix(this->selCols_,
+                                       sampleNum,
+                                       outputWidth,
+                                       nnz,
+                                       NO_VALUE,
+                                       SPARSE_CSR,
+                                       false,
+                                       true);
+    this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1);
+    hl_stream_synchronize(HPPL_STREAM_1);
+  }
+
+  fullOutput_ = false;
+}
+
+void paddle::SelectiveFullyConnectedLayer::getSelectiveCols() {
+  if (config_.has_selected_colums()) {
+    this->selCols_ = inputLayers_[inputNum_]->getOutputValue();
+    fullOutput_ = false;
+  } else if (!config_.selective_fc_pass_generation() || selCols_ == nullptr) {
+    this->fillFullySelectiveData();
+  }  // else selCols_ is initialized by fillSelectiveData
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ba04d9b2ae208eda021a451e94856d9993dc126
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * @brief The SelectiveFullyConnectedLayer class
+ *
+ * SelectiveFullyConnectedLayer differs from FullyConnectedLayer by that it
+ * requires an additional input to indicate several selected columns, and only
+ * compute the multiplications between the input matrices and the selected
+ * columns of the parameter matrices of this layer. If the selected columns is
+ * not specified, SelectiveFullyConnected layer acts exactly like
+ * FullyConnectedLayer.
+ *
+ * The config file api is selective_fc_layer.
+ */
+class SelectiveFullyConnectedLayer : public Layer {
+ protected:
+  WeightList weights_;
+  std::unique_ptr<Weight> biases_;
+
+ private:
+  /**
+   * Get selected columns each forward.
+   */
+  void getSelectiveCols();
+
+  MatrixPtr mmat_;
+  /// cpuSelCols_ is a CpuSparseMatrix, used to save selected columns.
+  MatrixPtr cpuSelCols_;
+  /// CpuSparseMatrix or GpuSparseMatrix. In CPU mode, selCols_ points
+  /// to cpuSelCols_.
+  MatrixPtr selCols_;
+  size_t inputNum_;
+
+  /// interOutput_ shared same memory with output_.value.
+  MatrixPtr interOutput_;
+
+  /// if fullOutput_ is false, interOutGrad_ sparse matrix
+  MatrixPtr interOutGrad_;
+
+  /// if true, means output_.value is the same as Fc Layer
+  bool fullOutput_;
+
+ public:
+  explicit SelectiveFullyConnectedLayer(const LayerConfig& config)
+      : Layer(config), selCols_(nullptr) {}
+
+  ~SelectiveFullyConnectedLayer() {}
+  void prefetch() override;
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  Weight& getWeight(int idx) { return *weights_[idx]; }
+
+  /**
+   * @brief Resize the output matrix size.
+   * And reset value to zero
+   */
+  void reserveOutput(size_t height, size_t width, size_t nnz);
+
+  /**
+   * @brief Fill candidates to select several activations as output.
+   * @param candidates specifies several selected columns of the parameter
+   * matrices of this layer.
+   * Multiplications only between the input matrices and the selected columns
+   * are computed.
+   * If the candidates is a nullptr, selective fc layer acts exactly like the
+   * fully connected layer.
+   * @note CURRENTLY, THIS METHOD IS ONLY USED FOR BEAM SEARCH
+   */
+  void fillSelectiveData(
+      const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates);
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ private:
+  /**
+   * @brief Make SelectiveFC act as FullyConnectedLayer
+   */
+  void fillFullySelectiveData() { fullOutput_ = true; }
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp b/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b598e11acde533564f6eda49d78ea8df99a5056
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceConcatLayer.cpp
@@ -0,0 +1,189 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for concatenating the first sequence with the second sequence
+ * Input: two sequences each containing the same number of instances
+ *        seq1 = [a1, a2, ..., an]
+ *        seq2 = [b1, b2, ..., bn]
+ * Output: a concatenated sequence of the two input sequences
+ *        out = [a1, b1, a2, b2, ..., an, bn]
+ */
+
+class SequenceConcatLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit SequenceConcatLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~SequenceConcatLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(seqconcat, SequenceConcatLayer);
+
+bool SequenceConcatLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  // sequene concatenation layer should have exactly 2 inputs
+  CHECK_EQ(2U, inputLayers_.size());
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequenceConcatLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  size_t dim = getSize();
+
+  const Argument& input1 = getInput(0);
+  size_t numSequences1 = input1.getNumSequences();
+  auto startPositions1 = input1.sequenceStartPositions->getVector(false);
+
+  const Argument& input2 = getInput(1);
+  size_t numSequences2 = input2.getNumSequences();
+  auto startPositions2 = input2.sequenceStartPositions->getVector(false);
+
+  CHECK_EQ(dim, input1.value->getWidth());
+  CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize());
+  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
+
+  CHECK_EQ(dim, input2.value->getWidth());
+  CHECK_EQ(startPositions2->getData()[numSequences2], input2.getBatchSize());
+  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
+
+  CHECK_EQ(numSequences1, numSequences2);
+
+  MatrixPtr inputValue1 = getInputValue(0);
+  MatrixPtr inputValue2 = getInputValue(1);
+
+  // reset output
+  reserveOutput(inputValue1->getHeight() + inputValue2->getHeight(), dim);
+
+  MatrixPtr outputValue = getOutputValue();
+
+  const int* starts1 = startPositions1->getData();
+  const int* starts2 = startPositions2->getData();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceConcatLayerForward", getName().c_str());
+
+    size_t offset = 0;
+    size_t leftNumIns = 0;
+    size_t rightNumIns = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      leftNumIns = starts1[seqId + 1] - starts1[seqId];
+      outputValue->subMatrix(offset, leftNumIns)
+          ->assign(*(inputValue1->subMatrix(starts1[seqId], leftNumIns)));
+      offset += leftNumIns;
+
+      rightNumIns = starts2[seqId + 1] - starts2[seqId];
+      outputValue->subMatrix(offset, rightNumIns)
+          ->assign(*(inputValue2->subMatrix(starts2[seqId], rightNumIns)));
+      offset += rightNumIns;
+    }
+
+    // modify the sequenceStartPositions
+    ICpuGpuVector::resizeOrCreate(
+        output_.sequenceStartPositions, numSequences1 + 1, false);
+
+    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
+
+    for (size_t seqId = 0; seqId < numSequences1 + 1; ++seqId) {
+      tgtBuf[seqId] = starts1[seqId] + starts2[seqId];
+    }
+  }
+
+  if (biases_.get() != NULL) {
+    MatrixPtr outV = getOutputValue();
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */
+  forwardActivation();
+}
+
+void SequenceConcatLayer::backward(const UpdateCallback& callback) {
+  /* activation */
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    // Increasing the number of gradient
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  MatrixPtr inputGrad1 = getInputGrad(0);
+  MatrixPtr inputGrad2 = getInputGrad(1);
+  MatrixPtr outputGrad = getOutputGrad();
+  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
+  auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false);
+
+  size_t numSequences1 = startPositions1->getSize() - 1;
+  size_t numSequences2 = startPositions2->getSize() - 1;
+
+  CHECK_EQ(numSequences1, numSequences2);
+
+  const int* starts1 = startPositions1->getData();
+  const int* starts2 = startPositions2->getData();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceConcatLayerBackward", getName().c_str());
+
+    size_t offset = 0;
+    size_t leftNumIns = 0;
+    size_t rightNumIns = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      leftNumIns = starts1[seqId + 1] - starts1[seqId];
+      if (inputGrad1) {
+        inputGrad1->subMatrix(starts1[seqId], leftNumIns)
+            ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
+      }
+      offset += leftNumIns;
+
+      rightNumIns = starts2[seqId + 1] - starts2[seqId];
+      if (inputGrad2) {
+        inputGrad2->subMatrix(starts2[seqId], rightNumIns)
+            ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
+      }
+      offset += rightNumIns;
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8735d71ba372de894c9852229ed8c77537792ea0
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -0,0 +1,118 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include "SequencePoolLayer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for extracting the last instance of the input sequence.
+ * Input: a sequence
+ * If SequenceLevel = kNonseq:
+ *   Output: a sequence containing only the last instance of the input sequence
+ *   If stride_ > 0:
+ *      Output: a shorten sequence. Stride is the step size by which we slide a
+ *              window upon the input sequence, and getting last instance
+ *              operation is then applied to each interval independently.
+ * If SequenceLevel = kSeq:
+ *   Check input sequence must has sub-sequence
+ *   Output: a sequence containing only the last instance of each sub-sequence
+ *           of the input sequence
+ *
+ * The config file api is last_seq and first_seq.
+ */
+
+class SequenceLastInstanceLayer : public SequencePoolLayer {
+ protected:
+  MatrixPtr tmpSrc_;
+  MatrixPtr tmpDest_;
+  std::vector<int> instanceIds_;
+
+ public:
+  explicit SequenceLastInstanceLayer(const LayerConfig& config)
+      : SequencePoolLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
+
+bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  SequencePoolLayer::init(layerMap, parameterMap);
+  reversed_ = config_.select_first();
+
+  tmpSrc_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+  tmpDest_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+
+  return true;
+}
+
+void SequenceLastInstanceLayer::forward(PassType passType) {
+  SequencePoolLayer::forward(passType);
+
+  auto starts = startPositions_->getData(false);
+  MatrixPtr inputValue = getInputValue(0);
+  MatrixPtr outputValue = getOutputValue();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
+
+    instanceIds_.clear();
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
+      int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
+      instanceIds_.push_back(insId);
+
+      outputValue->subMatrix(seqId, 1, tmpDest_)
+          ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
+    }
+  }
+
+  if (biases_.get() != NULL) {
+    outputValue->addBias(*(biases_->getW()), 1);
+  }
+
+  /*  activation, should set to 'linear' in most cases */
+  forwardActivation();
+}
+
+void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
+  SequencePoolLayer::backward(callback);
+
+  MatrixPtr inputGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+
+  if (inputGrad) {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str());
+
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
+      inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
+          ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequencePoolLayer.cpp b/paddle/legacy/gserver/layers/SequencePoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..243b795db428ede1fbb39a5054485a198a14e00c
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequencePoolLayer.cpp
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SequencePoolLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+bool SequencePoolLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  // seqlastins/max/average layer should have exactly 1 input
+  CHECK_EQ(1U, inputLayers_.size());
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+  // transform to which sequence type
+  if (config_.trans_type() == "non-seq") {
+    type_ = kNonSeq;
+  } else if (config_.trans_type() == "seq") {
+    type_ = kSeq;
+  } else {
+    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
+  }
+  stride_ = config_.seq_pool_stride();
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequencePoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  CHECK(input.hasSeq() || input.hasSubseq())
+      << "Input should be a sequence or subsequence for layer " << getName();
+
+  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
+  size_t dim = getSize();
+  // check
+  CHECK_EQ(dim, input.value->getWidth());
+  startPositions_ =
+      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
+  auto starts = startPositions_->getVector(false);
+  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
+  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
+
+  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
+   * thus, in this case, output_ has no sequenceStartPositions.
+   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
+   * case, we should compute the new sequenceStartPositions.
+   */
+  if (type_) {
+    CHECK(input.subSequenceStartPositions)
+        << "when trans_type = seq, input must hasSubseq";
+    output_.degradeSequence(input);
+  }
+  if (stride_ > 0) {
+    CHECK_EQ(input.hasSubseq(), 0UL)
+        << "sequence stride pooling is invalid for hasSubseq now";
+    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
+    newBatchSize_ = startPositions_->getSize() - 1;
+  }
+
+  resetOutput(newBatchSize_, dim);
+}
+
+void SequencePoolLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    // Increasing the number of gradient
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequencePoolLayer.h b/paddle/legacy/gserver/layers/SequencePoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c019b313093f4ac717e0fc57a9aa798e2951580
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequencePoolLayer.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
+ *
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ *    Output: output size is the number of input sequences (NOT input instances)
+ *    output[i] = seqlastin/average/max_{for each instance in this
+ * sequence}{input[i]}
+ *    If stride_ > 0:
+ *        Check input sequence must not have sub-sequence
+ *        Output: a shorten sequence. Stride is the step size by which we slide
+ *                a window upon the input sequence, and the pooling operation
+ *                is then applied to each interval independently.
+ * If SequenceLevel = kSeq:
+ *    Check input sequence must has sub-sequence
+ *    Output: output size is the number of input sub-sequences
+ *    output[i] = seqlastin/average/max_{for each instance in this
+ * sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
+ */
+
+class SequencePoolLayer : public Layer {
+ protected:
+  int type_;
+  std::unique_ptr<Weight> biases_;
+  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
+  size_t newBatchSize_;
+  ICpuGpuVectorPtr startPositions_;
+  int stride_;
+  // Whether the input sequence is reversed or not.
+  bool reversed_ = false;
+
+ public:
+  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp b/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e3d40cab50af1d6eafe28331cdd481ee2b187a56
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceReshapeLayer.cpp
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ *  A layer for reshaping the sequence. Assume the input sequence has
+ *  T instances, the dimension of each instance is M, and the input
+ *  reshape_dim is N, then the output sequence has T*M/N instances,
+ *  the dimension of each instance is N.
+ *
+ *  Note that T*M/N must be an integer.
+ */
+
+class SequenceReshapeLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+
+  MatrixPtr reshapedOutputGrad;
+
+ public:
+  explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(seqreshape, SequenceReshapeLayer);
+
+bool SequenceReshapeLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(1U, inputLayers_.size());
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequenceReshapeLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+
+  size_t inDim = input.value->getWidth();
+  size_t outDim = getSize();
+
+  size_t numSequences = input.getNumSequences();
+
+  // by default, we assume each instance as a sequence
+  IVectorPtr seqStarts;
+  IVector::resizeOrCreate(seqStarts, input.getBatchSize() + 1, false);
+  int* startsData = seqStarts->getData();
+  for (int i = 0; i < input.getBatchSize() + 1; i++) {
+    startsData[i] = i;
+  }
+  const int* starts = startsData;
+
+  // if there is sequence, then use start positions
+  if (input.sequenceStartPositions) {
+    auto startPositions = input.sequenceStartPositions->getVector(false);
+    starts = startPositions->getData();
+    CHECK_EQ(starts[numSequences], input.getBatchSize());
+    CHECK_EQ(numSequences, startPositions->getSize() - 1);
+  }
+
+  for (size_t seqID = 0; seqID < numSequences; seqID++) {
+    size_t inNumIns = starts[seqID + 1] - starts[seqID];
+    size_t outNumIns = inNumIns * inDim / outDim;
+    CHECK_EQ(outNumIns * outDim, inNumIns * inDim);
+  }
+
+  MatrixPtr inputValue = getInputValue(0);
+
+  // reset output
+  reserveOutput(inputValue->getHeight() * inDim / outDim, outDim);
+  MatrixPtr outputValue = getOutputValue();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SequenceReshapeLayerForward", getName().c_str());
+
+    outputValue->copyFrom(*inputValue);
+
+    // modify the sequenceStartPositions
+    ICpuGpuVector::resizeOrCreate(
+        output_.sequenceStartPositions, numSequences + 1, false);
+
+    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
+
+    for (size_t seqId = 0; seqId < numSequences + 1; ++seqId) {
+      tgtBuf[seqId] = starts[seqId] * inDim / outDim;
+    }
+  }
+
+  if (biases_.get() != NULL) {
+    MatrixPtr outV = getOutputValue();
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */
+  forwardActivation();
+}
+
+void SequenceReshapeLayer::backward(const UpdateCallback& callback) {
+  /* activation */
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    // Increasing the number of gradient
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  MatrixPtr inputGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+
+  AsyncGpuBlock asyncGpuBlock;
+  REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str());
+
+  if (inputGrad) {
+    Matrix::resizeOrCreate(reshapedOutputGrad,
+                           inputGrad->getHeight(),
+                           inputGrad->getWidth(),
+                           false,
+                           useGpu_);
+    reshapedOutputGrad->copyFrom(*outputGrad);
+    inputGrad->add(*reshapedOutputGrad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp b/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3ed51c4ef2f6e91da94f302c14d1c0cc555886aa
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceSliceLayer.cpp
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+class SequenceSliceLayer : public Layer {
+ public:
+  explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ private:
+  /*
+   * TODO(caoying)
+   * In PaddePaddle, currently all matrices are real number types,
+   * but the second and the (optional) third input which are some
+   * selected indices of the give sequence to trim the sequence, are actually
+   * filled with int types so that storing int types information in real number
+   * matrices is very dangerous, since real numbers will be convered to int
+   * types. If a user fills this matrix himself, invalid data may occor.
+   */
+
+  MatrixPtr startIdsOnCpu_;
+  MatrixPtr endIdsOnCpu_;
+
+  std::vector<int> selectedRows_;
+  IVectorPtr rowIndice_;
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+  std::vector<int> outSubSeqStartPos_;
+  std::vector<int> outSeqStartPos_;
+
+  void checkInputs();
+  void copySliceIdsToCpu();
+  void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
+};
+
+REGISTER_LAYER(seq_slice, SequenceSliceLayer);
+
+bool SequenceSliceLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_GE(inputLayers_.size(), 2U);
+  CHECK_LE(inputLayers_.size(), 3U);
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequenceSliceLayer::checkInputs() {
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
+                           << "must be a sequence.";
+  const MatrixPtr indices1 = getInputValue(1);
+  CHECK_EQ(
+      indices1->getHeight(),
+      static_cast<size_t>(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
+                                               : inputSeq.getNumSequences()))
+      << "Height of the second input should be equal to number of sequence "
+      << "in the first input.";
+  if (inputLayers_.size() == 3) {
+    const MatrixPtr indices2 = getInputValue(2);
+    CHECK_EQ(indices2->getHeight(), indices1->getHeight())
+        << "start indices and end indices should have the same height.";
+    CHECK_EQ(indices2->getWidth(), indices1->getWidth())
+        << "start indices and end indices should have the same Width.";
+  }
+}
+
+void SequenceSliceLayer::copySliceIdsToCpu() {
+  const MatrixPtr indices1 = getInputValue(1);
+  if (inputLayers_.size() == 2U) {
+    if (config_.select_first()) {
+      Matrix::resizeOrCreate(startIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      startIdsOnCpu_->copyFrom(*indices1);
+      endIdsOnCpu_ = nullptr;
+    } else {
+      Matrix::resizeOrCreate(endIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      endIdsOnCpu_->copyFrom(*indices1);
+      startIdsOnCpu_ = nullptr;
+    }
+  } else if (inputLayers_.size() == 3U) {
+    Matrix::resizeOrCreate(startIdsOnCpu_,
+                           indices1->getHeight(),
+                           indices1->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    startIdsOnCpu_->copyFrom(*indices1);
+
+    const MatrixPtr indices2 = getInputValue(2);
+    Matrix::resizeOrCreate(endIdsOnCpu_,
+                           indices2->getHeight(),
+                           indices2->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    endIdsOnCpu_->copyFrom(*indices2);
+  }
+}
+
+void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
+                                         const MatrixPtr ends) {
+  CHECK(starts || ends) << "At least one of the start or end indices "
+                        << "should be given.";
+
+  bool hasSubseq = getInput(0).hasSubseq();
+
+  outSeqStartPos_.resize(1, 0);
+  outSubSeqStartPos_.resize(1, 0);
+  selectedRows_.clear();
+
+  size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
+  size_t rowIdx = 0;
+  for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
+    for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
+      for (size_t k = 0; k < beamSize; ++k) {
+        if (starts && starts->getElement(rowIdx, k) == -1.) break;
+        if (ends && ends->getElement(rowIdx, k) == -1.) break;
+
+        int begPos = inputSeqInfoVec_[i][j];
+        if (starts) begPos += starts->getElement(rowIdx, k);
+
+        int endPos = inputSeqInfoVec_[i][j + 1] - 1;
+        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
+
+        int seqLen = endPos - begPos + 1;
+        CHECK_GT(seqLen, 0);
+        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
+        hasSubseq
+            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
+            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
+      }
+      rowIdx++;
+    }
+    if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartPos_.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
+
+  if (hasSubseq) {
+    ICpuGpuVector::resizeOrCreate(
+        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
+    output_.subSequenceStartPositions->copyFrom(
+        outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
+  }
+}
+
+void SequenceSliceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  checkInputs();
+
+  const Argument& inputSeq = getInput(0);
+  inputSeqInfoVec_.clear();
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  if (!useGpu_) {
+    if (inputLayers_.size() == 2U) {
+      startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
+      endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
+    } else if (inputLayers_.size() == 3U) {
+      startIdsOnCpu_ = getInputValue(1);
+      endIdsOnCpu_ = getInputValue(2);
+    }
+  } else {
+    copySliceIdsToCpu();
+  }
+
+  /*
+   * calculate the selected row indices in a batch, and build the output
+   * sequence information.
+   */
+  calSelectedRows(startIdsOnCpu_, endIdsOnCpu_);
+
+  resetOutput(selectedRows_.size(), getSize());
+
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SequenceSliceLayer::backward(const UpdateCallback& callback) {
+  getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/legacy/gserver/layers/SequenceToBatch.cpp
similarity index 100%
rename from paddle/gserver/layers/SequenceToBatch.cpp
rename to paddle/legacy/gserver/layers/SequenceToBatch.cpp
diff --git a/paddle/legacy/gserver/layers/SequenceToBatch.h b/paddle/legacy/gserver/layers/SequenceToBatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ed517937d4a015b6b11de16412cac7599f5f8b9
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SequenceToBatch.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+
+/*
+ * This class can used to modify the matrix structure of sequence matrix into
+ * batch structure.
+ * sequence matrix: [C1_s ... Cn_s | ...... | C1_t ... Cn_t]
+ * batch matrix:    [C1_s ... C1_t | ...... | Cn_s ... Cn_t]
+ * Cn_s is the state for sequence s at time n.
+ *
+ * Exampel:  sequence matrix = {{0, 0, 0, 0}, {1, 1, 1, 1, 1}, {2, 2, 2}}
+ *           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
+ *           batch matrix = {{1, 0, 2}, {1, 0, 2}, {1, 0, 2}, {1, 0}, {1}}
+ *           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
+ *
+ * Use:
+ * Input: seqMatrix, seqStarts(Sequence Start Positions)
+ * Output: batchMatrix
+ * 1. SequenceToBatch seq2batch;
+ * 2. seq2batch.resizeOrCreateBatch(seqStarts);     // calculate seq2BatchIdx
+ * 3. seq2batch.copy(seqMatrix, batchMatrix, true); // copy seq to batch matrix
+ *
+ */
+class SequenceToBatch {
+ public:
+  explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {}
+
+  /* resize and calculate the batchIndex_ */
+  void resizeOrCreateBatch(int batchSize,
+                           size_t numSequences,
+                           const int *seqStarts,
+                           bool reversed,
+                           bool prevBatchState = false);
+
+  /* sequence matrix and batch matrix copy:
+   * seq2batch: copy(seqValue, batchValue, true);
+   * batch2seq: copy(seqValue, batchValue, false);
+   */
+  void copy(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
+  /* sequence/batch matrix add to batch/sequence matrix */
+  void add(Matrix &seqValue, Matrix &batchValue, bool seq2batch);
+  MatrixPtr getBatchValue(Matrix &batchValue, int batchId, int numRows = 0);
+
+  size_t getNumBatch() const { return numBatch_; }
+
+  /* resize or create a batch matrix(batchValue_) */
+  void resizeOrCreate(Matrix &seqValue);
+  /* copy seqValue to batchValue_ */
+  void copyFromSeq(Matrix &seqValue);
+  /* copy batchValue_ to seqValue */
+  void copyBackSeq(Matrix &seqValue);
+  MatrixPtr getBatchValue(int batchId, int numRows = 0);
+  MatrixPtr getBatchValue() { return batchValue_; }
+  /*tranfer preBatchOutput to batch struct*/
+  void prevOutput2Batch(Matrix &src, Matrix &dst);
+  /*get sequence output from batch struct*/
+  void getSeqOutputFromBatch(Matrix &sequence, Matrix &batch);
+
+  /* Copy the index from another seq2batch. */
+  void shareIndexWith(const SequenceToBatch &seq2batch) {
+    CHECK(useGpu_ == seq2batch.useGpu_);
+    batchStartPositions_ = seq2batch.batchStartPositions_;
+    seq2BatchIdx_ = seq2batch.seq2BatchIdx_;
+    cpuSeq2BatchIdx_ = seq2batch.cpuSeq2BatchIdx_;
+    numBatch_ = seq2batch.numBatch_;
+  }
+
+ protected:
+  void sequence2BatchCopy(Matrix &batch,
+                          Matrix &sequence,
+                          IVector &seq2BatchIdx,
+                          bool seq2batch);
+  void sequence2BatchAdd(Matrix &batch,
+                         Matrix &sequence,
+                         IVector &seq2BatchIdx,
+                         bool seq2batch);
+
+  IVectorPtr batchStartPositions_;
+  IVectorPtr seq2BatchIdx_;
+  IVectorPtr cpuSeq2BatchIdx_;
+  IVectorPtr cpuSeqIdx_;
+  IVectorPtr cpuSeqEndIdxInBatch_;
+  IVectorPtr seqIdx_;
+  IVectorPtr seqEndIdxInBatch_;
+  size_t numBatch_;
+  bool useGpu_;
+  MatrixPtr batchValue_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SliceProjection.cpp b/paddle/legacy/gserver/layers/SliceProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b474f2db759adfad337f9485a5a38588b6839c54
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SliceProjection.cpp
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * SliceProjection can slice the input value into multiple parts,
+ * and then select some of them to merge into a new output.
+ *
+ * First, calculate the slices that need to be merged into the output.
+ * slices = input.slices().for_output()
+ *
+ * Second, merge each slice into the output.
+ * for(auto slice: slices) {
+ *   out.addAtOffset(slice, offset);
+ * }
+ *
+ * Input slices as output: s0, s1, ...:
+ *   -----------------------
+ *   |///|   |//////|      |
+ *   |/s0|   |//s1//|      |
+ *   |///|   |//////|      |
+ *   -----------------------
+ * Output, merge s0, s1, ... into one output:
+ *   ----------------
+ *   |///|//////|   |
+ *   |/s0|//s1//|...|
+ *   |///|//////|   |
+ *   ----------------
+ *
+ * The config file api is slice_projection.
+ */
+class SliceProjection : public Projection {
+ public:
+  SliceProjection(const ProjectionConfig& config,
+                  const ParameterPtr& parameter,
+                  bool useGpu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+
+ protected:
+  std::vector<std::pair<size_t, size_t>> slices_;
+};
+
+REGISTER_PROJECTION(slice, SliceProjection);
+
+/**
+ * Constructed function.
+ * @note SliceProjection should not have any parameter.
+ */
+SliceProjection::SliceProjection(const ProjectionConfig& config,
+                                 const ParameterPtr& parameter,
+                                 bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(!parameter) << "'slice' projection should not have any parameter";
+
+  slices_.reserve(config.slices_size());
+  for (const auto& slice : config.slices()) {
+    slices_.push_back(std::make_pair(slice.start(), slice.end()));
+  }
+}
+
+void SliceProjection::forward() {
+  size_t offset = 0;
+  for (auto& slice : slices_) {
+    auto slice_out = in_->value->subColMatrix(slice.first, slice.second);
+    out_->value->addAtOffset(*slice_out, offset);
+    offset += slice_out->getWidth();
+  }
+}
+
+void SliceProjection::backward(const UpdateCallback& callback) {
+  if (in_->grad) {
+    size_t offset = 0;
+    for (auto& slice : slices_) {
+      auto slice_out = in_->grad->subColMatrix(slice.first, slice.second);
+      slice_out->addAtOffset(*out_->grad, offset);
+      offset += slice_out->getWidth();
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp b/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9168fd7dda6dcdcd9e272acbf6337f1c8468e6f0
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SlopeInterceptLayer.cpp
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for applying a slope and an intercept to the input
+ * element-wise.
+ * This layer is used in NEURAL TURING MACHINE.
+ * @note There is no activation and weight in this layer.
+ *
+ * \f[
+ *    y = ax + b
+ * \f]
+ *
+ * Here, a is scale and b is offset, which are provided as attributes of the
+ * layer.
+ *
+ * The config file api is slope_intercept_layer.
+ */
+
+class SlopeInterceptLayer : public Layer {
+ public:
+  explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(slope_intercept, SlopeInterceptLayer);
+
+bool SlopeInterceptLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void SlopeInterceptLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t size = getSize();
+
+  CHECK_EQ(size, inV->getWidth());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, size);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwSlopeInterceptTimer", getName().c_str());
+    outV->mulScalar(*inV, config_.slope());
+    outV->add(config_.intercept());
+  }
+}
+
+void SlopeInterceptLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outG = getOutputGrad();
+
+  if (inG) {
+    REGISTER_TIMER_INFO("BwSlopeInterceptTimer", getName().c_str());
+    inG->add(*outG, config_.slope());
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/SpatialPyramidPoolLayer.cpp
rename to paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.cpp
diff --git a/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d8ed9c87889a93664f09dbaf2a84bd00b1757ad
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SpatialPyramidPoolLayer.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "PoolProjection.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+/**
+ * @brief A layer for spatial pyramid pooling on the input image by taking
+ * the max, average, etc. within regions, so that the result vector of
+ * different sized images are of the same size.
+ *
+ * The config file api is spp_layer.
+ */
+
+class SpatialPyramidPoolLayer : public Layer {
+ protected:
+  size_t channels_;
+  size_t imgSizeW_;
+  size_t imgSizeH_;
+  size_t pyramidHeight_;
+  std::string poolType_;
+
+  std::vector<std::unique_ptr<PoolProjection>> poolProjections_;
+  std::vector<Argument> projOutput_;
+  std::vector<std::pair<size_t, size_t>> projCol_;
+
+ public:
+  explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  ProjectionConfig getConfig(size_t sizeX_,
+                             size_t sizeY_,
+                             size_t channels,
+                             size_t pyamidLevel_,
+                             std::string& poolType_);
+  size_t getSize();
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f363c2ac8dd22fc8b8e1d7fca27e5beb935d42de
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SubNestedSequenceLayer.cpp
@@ -0,0 +1,187 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+class SubNestedSequenceLayer : public Layer {
+ public:
+  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+ private:
+  /*
+   * This functions generates the indices of rows in a batch according to the
+   * indices of selected sub-sequence in each sequence.
+   *
+   * Examples:
+   * selectedIndices:
+   *   [
+   *     [0, 1, -1],
+   *     [0, 1, 2],
+   *     [0, -1, -1],
+   *     [0, 2, 3],
+   *   ]
+   * inputSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   *
+   * ths output is saved to private member rowIndice_;
+   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
+   */
+
+  void calSelectedRows(const MatrixPtr selectedIndices,
+                       const std::vector<std::vector<int>>& inputSeqInfo);
+
+  /*
+   * TODO(caoying)
+   * In PaddePaddle, currently all matrices are real number types,
+   * but the second is some selected indices of the give sequence to trim
+   * the nested sequence, are actually filled with int types so that storing
+   * int types information in real number matrices is very dangerous, since
+   * real numbers will be convered to int types. If a user fills this matrix
+   * himself, invalid data may occor.
+   *
+   * if the second input of this layer is on GPU memory, copy it to CPU memory.
+   */
+  MatrixPtr selIdsCpu_;
+
+  /*
+   * reorganize sequenceStartPositions and subSequenceStartPositions
+   * into a 2d vector to facilitate the sequence selection process.
+   */
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+
+  /* store the final selected row indices in a batch */
+  IVectorPtr rowIndice_;
+  /* rowIndice_ and selectedRows_ actually share a same memory. */
+  std::vector<int> selectedRows_;
+};
+
+REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
+
+bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(2U, inputLayers_.size());
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SubNestedSequenceLayer::calSelectedRows(
+    const MatrixPtr selectedIndices,
+    const std::vector<std::vector<int>>& inputSeqInfo) {
+  selectedRows_.clear();
+
+  std::vector<int> outSeqStartInfo(1, 0);
+  std::vector<int> outSubSeqStartInfo(1, 0);
+
+  size_t seqNum = selectedIndices->getHeight();
+  size_t beamSize = selectedIndices->getWidth();
+  for (size_t i = 0; i < seqNum; ++i) {
+    for (size_t j = 0; j < beamSize; ++j) {
+      if (selectedIndices->getElement(i, j) == -1.) break;
+      size_t selSubSeqIdx = selectedIndices->getElement(i, j);
+      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
+
+      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
+                         inputSeqInfoVec_[i][selSubSeqIdx];
+      for (size_t k = 0; k < subSeqLen; ++k)
+        selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k);
+      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen);
+    }
+    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
+
+  ICpuGpuVector::resizeOrCreate(
+      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
+  output_.subSequenceStartPositions->copyFrom(
+      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
+}
+
+void SubNestedSequenceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
+                              << "must be a nested sequence.";
+  const MatrixPtr selectedIndices = getInputValue(1);
+  CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight());
+
+  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
+    /*
+     * Currently, the second input for this layer is generated by
+     * kmax_sequence_score_layer whose output is always stored on CPU,
+     * or a data_layer which canbe on GPU.
+     *
+     * If the second input is on GPU, copy it to CPU memory, because this
+     * input always uses very few memory, and operations related to it are
+     * all logic control, not computations.
+     */
+    Matrix::resizeOrCreate(selIdsCpu_,
+                           selectedIndices->getHeight(),
+                           selectedIndices->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    selIdsCpu_->copyFrom(*selectedIndices);
+  } else {
+    selIdsCpu_ = selectedIndices;
+  }
+
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
+
+  resetOutput(selectedRows_.size(), getSize());
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inputSeqGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+
+  if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SubSequenceLayer.cpp b/paddle/legacy/gserver/layers/SubSequenceLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..36796f04739054bb19d4a3ce656e248898ba4b17
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SubSequenceLayer.cpp
@@ -0,0 +1,226 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for taking the subsequence according to given offset and size
+ * Input: original sequence, offset, size
+ * Output: subsequence
+ */
+
+class SubSequenceLayer : public Layer {
+ protected:
+  std::unique_ptr<Weight> biases_;
+  MatrixPtr tmpSrc_;
+  MatrixPtr tmpDest_;
+
+ public:
+  explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(subseq, SubSequenceLayer);
+
+bool SubSequenceLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  // sequene concatenation layer should have exactly 2 inputs
+  CHECK_EQ(3U, inputLayers_.size());
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+
+  tmpSrc_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+  tmpDest_ =
+      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SubSequenceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  size_t dim = getSize();
+
+  const Argument& input = getInput(0);
+  size_t numSequences1 = input.getNumSequences();
+  auto startPositions1 = input.sequenceStartPositions->getVector(false);
+
+  const Argument& offsetSeq = getInput(1);
+  size_t numSequences2 = offsetSeq.getNumSequences();
+  auto startPositions2 = offsetSeq.sequenceStartPositions->getVector(false);
+
+  const Argument& sizeSeq = getInput(2);
+  size_t numSequences3 = sizeSeq.getNumSequences();
+  auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false);
+
+  CHECK_EQ(dim, input.value->getWidth());
+
+  CHECK_EQ(startPositions1->getData()[numSequences1], input.getBatchSize());
+  CHECK_EQ(numSequences1, startPositions1->getSize() - 1);
+
+  CHECK_EQ(startPositions2->getData()[numSequences2], offsetSeq.getBatchSize());
+  CHECK_EQ(numSequences2, startPositions2->getSize() - 1);
+
+  CHECK_EQ(startPositions3->getData()[numSequences3], sizeSeq.getBatchSize());
+  CHECK_EQ(numSequences3, startPositions3->getSize() - 1);
+
+  CHECK_EQ(numSequences1, numSequences2);
+  CHECK_EQ(numSequences2, numSequences3);
+
+  MatrixPtr inputValue = input.value;
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
+
+  CHECK_EQ(offsetValue->getSize(), numSequences1);
+  CHECK_EQ(sizeValue->getSize(), numSequences1);
+
+  int* offsets = offsetValue->getData();
+  int* sizes = sizeValue->getData();
+
+  // get total height of output
+  size_t height = 0;
+  for (size_t seqId = 0; seqId < numSequences1; seqId++) {
+    height += sizes[seqId];
+  }
+
+  // reset output
+  resetOutput(height, dim);
+
+  MatrixPtr outputValue = getOutputValue();
+
+  const int* starts1 = startPositions1->getData();
+
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SubSequenceLayerForward", getName().c_str());
+
+    size_t offsetIn = 0;
+    size_t offsetOut = 0;
+    size_t size = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      offsetIn = starts1[seqId] + offsets[seqId];
+      size = sizes[seqId];
+
+      outputValue->subMatrix(offsetOut, size, tmpDest_)
+          ->assign(*(inputValue->subMatrix(offsetIn, size, tmpSrc_)));
+
+      offsetOut += size;
+    }
+
+    // modify the sequenceStartPositions
+    ICpuGpuVector::resizeOrCreate(
+        output_.sequenceStartPositions, numSequences1 + 1, false);
+
+    int* tgtBuf = output_.sequenceStartPositions->getMutableData(false);
+    int offset = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      tgtBuf[seqId] = offset;
+      offset += sizes[seqId];
+    }
+    tgtBuf[numSequences1] = offset;
+  }
+
+  if (biases_.get() != NULL) {
+    MatrixPtr outV = getOutputValue();
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* activation */
+  forwardActivation();
+}
+
+void SubSequenceLayer::backward(const UpdateCallback& callback) {
+  /* activation */
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    // Increasing the number of gradient
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  MatrixPtr inputGrad1 = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+  auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false);
+  size_t numSequences1 = startPositions1->getSize() - 1;
+  const int* starts1 = startPositions1->getData();
+
+  const Argument& offsetSeq = getInput(1);
+  const Argument& sizeSeq = getInput(2);
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
+
+  int* offsets = offsetValue->getData();
+  int* sizes = sizeValue->getData();
+  {
+    AsyncGpuBlock asyncGpuBlock;
+    REGISTER_TIMER_INFO("SubSequenceLayerBackward", getName().c_str());
+
+    int offsetIn = 0;
+    int offsetOut = 0;
+    int size = 0;
+    for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
+      offsetIn = starts1[seqId] + offsets[seqId];
+      size = sizes[seqId];
+
+      inputGrad1->subMatrix(offsetIn, size, tmpDest_)
+          ->add(*(outputGrad->subMatrix(offsetOut, size, tmpSrc_)));
+      offsetOut += size;
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp b/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..410f4dd7c90e67488bc3dda6dfad551032890d65
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SumToOneNormLayer.cpp
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * A layer for sum-to-one normalization,
+ * which is used in NEURAL TURING MACHINE.
+ * \f[
+ *   out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]}
+ * \f]
+ * where \f$in\f$ is a (batchSize x dataDim) input vector,
+ * and \f$out\f$ is a (batchSize x dataDim) output vector.
+ *
+ * The config file api is sum_to_one_norm_layer.
+ */
+
+class SumToOneNormLayer : public Layer {
+ protected:
+  /// reciprocalRowSum_ = \f$1 / \sum_{k=1}^N in[k]\f$
+  MatrixPtr reciprocalRowSum_;
+  /// dotSum = output_.grad \f$.*\f$ output_.value
+  MatrixPtr dotSum_;
+
+ public:
+  explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer);
+
+bool SumToOneNormLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void SumToOneNormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = getSize();
+
+  CHECK_EQ(dataDim, inV->getWidth());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    resetOutput(batchSize, dataDim);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwSumToOneNormTimer", getName().c_str());
+
+    Matrix::resizeOrCreate(reciprocalRowSum_, batchSize, 1, false, useGpu_);
+    inV->rowSum(*reciprocalRowSum_);
+
+    // todo: matrix checks
+    CHECK_GT(reciprocalRowSum_->getMin(), 0.0);
+
+    reciprocalRowSum_->scalarDiv(*reciprocalRowSum_, 1.0);
+
+    // outV = inV * reciprocalRowSum
+    outV->rowScale(0, *inV, *reciprocalRowSum_);
+  }
+}
+
+void SumToOneNormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+
+  size_t batchSize = inV->getHeight();
+
+  if (inG) {
+    REGISTER_TIMER_INFO("BwSumToOneTimer", getName().c_str());
+
+    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
+
+    // dotSum = outG .* outV
+    dotSum_->zeroMem();
+    dotSum_->rowDotMul(0, *outG, *outV);
+
+    // inG += -1 * (dotSum / rowSum)
+    dotSum_->dotMul(*dotSum_, *reciprocalRowSum_);
+    inG->rowAdd(0, *inG, *dotSum_, -1.0);
+    // inG += outG * (1/rowSum)
+    inG->addRowScale(0, *outG, *reciprocalRowSum_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp b/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..513f3df7bcaf854835ec0e500d47c23469d5aa46
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SwitchOrderLayer.cpp
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SwitchOrderLayer.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(switch_order, SwitchOrderLayer);
+
+bool SwitchOrderLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  auto& img_conf = config_.inputs(0).image_conf();
+  size_t inD = img_conf.img_size_z();
+  size_t inH =
+      img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
+  size_t inW = img_conf.img_size();
+  size_t inC = img_conf.channels();
+  inH = inH * inD;
+  inDims_ = TensorShape({0, inC, inH, inW});
+  outDims_ = TensorShape(4);
+
+  auto& reshape_conf = config_.reshape_conf();
+  for (int i = 0; i < reshape_conf.height_axis_size(); i++) {
+    heightAxis_.push_back(reshape_conf.height_axis(i));
+  }
+  for (int i = 0; i < reshape_conf.width_axis_size(); i++) {
+    widthAxis_.push_back(reshape_conf.width_axis(i));
+  }
+  createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig());
+  createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig());
+  return true;
+}
+
+void SwitchOrderLayer::setOutDims() {
+  outDims_.setDim(0, inDims_[0]);
+  outDims_.setDim(1, inDims_[2]);
+  outDims_.setDim(2, inDims_[3]);
+  outDims_.setDim(3, inDims_[1]);
+  reshapeHeight_ = 1;
+  for (size_t i = 0; i < heightAxis_.size(); i++) {
+    reshapeHeight_ *= outDims_[heightAxis_[i]];
+  }
+  output_.setFrameHeight(reshapeHeight_);
+  reshapeWidth_ = 1;
+  for (size_t i = 0; i < widthAxis_.size(); i++) {
+    reshapeWidth_ *= outDims_[widthAxis_[i]];
+  }
+  output_.setFrameWidth(reshapeWidth_);
+}
+
+void SwitchOrderLayer::setInDims() {
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  inDims_.setDim(0, batchSize);
+  int d = inputLayers_[0]->getOutput().getFrameDepth();
+  d = (d == 0 ? 1 : d);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h * d);
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);
+  int totalCount = input->getElementCnt();
+  int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]);
+  if (channels != 0) inDims_.setDim(1, channels);
+}
+
+void SwitchOrderLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  setInDims();
+  setOutDims();
+  resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]);
+  if (heightAxis_.size() > 0) {
+    resetOutput(reshapeHeight_, reshapeWidth_);
+  }
+
+  // switch NCHW to NHWC
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_);
+  nchw2nhwc_[0]->calc(inputs, outputs);
+  forwardActivation();
+}
+
+void SwitchOrderLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  backwardActivation();
+
+  // switch NHWC to NCHW
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  nhwc2nchw_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/SwitchOrderLayer.h b/paddle/legacy/gserver/layers/SwitchOrderLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a551a2bba698374841e73dc4dbad403034dd300
--- /dev/null
+++ b/paddle/legacy/gserver/layers/SwitchOrderLayer.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  This layer calculate softmax in image channel dimension.
+ */
+class SwitchOrderLayer : public Layer {
+ public:
+  explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~SwitchOrderLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  void setInDims();
+  void setOutDims();
+
+ protected:
+  std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;
+  std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;
+  TensorShape inDims_;
+  TensorShape outDims_;
+  std::vector<int> heightAxis_;
+  std::vector<int> widthAxis_;
+  size_t reshapeHeight_;
+  size_t reshapeWidth_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/TableProjection.cpp b/paddle/legacy/gserver/layers/TableProjection.cpp
similarity index 100%
rename from paddle/gserver/layers/TableProjection.cpp
rename to paddle/legacy/gserver/layers/TableProjection.cpp
diff --git a/paddle/legacy/gserver/layers/TableProjection.h b/paddle/legacy/gserver/layers/TableProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..60286149f4227fbc758dca7864c6d1f67782c7ae
--- /dev/null
+++ b/paddle/legacy/gserver/layers/TableProjection.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * Table projection takes index data input. It select rows from parameter
+ * where row_id is in input_ids:
+ * \f[
+ *   out.row[i] += table.row[ids[i]]
+ * \f]
+ * where \f$out\f$ is out, \f$table\f$ is parameter, \f$ids\f$ is input_ids,
+ * and \f$i\f$ is row_id.
+ *
+ * The config file api is table_projection.
+ *
+ * @note If \f$ids[i] = -1\f$, it will be ignored.
+ */
+class TableProjection : public Projection {
+ public:
+  TableProjection(const ProjectionConfig& config,
+                  const ParameterPtr& parameter,
+                  bool useGpu);
+  /**
+   * If use sparse row matrix as parameter, prefetch feature ids in input label.
+   */
+  virtual void prefetch(const Argument* in);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+
+ protected:
+  std::unique_ptr<Weight> table_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TensorLayer.cpp b/paddle/legacy/gserver/layers/TensorLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f874bce0f2bdf7ab4771e470e2e4535693ecf68
--- /dev/null
+++ b/paddle/legacy/gserver/layers/TensorLayer.cpp
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TensorLayer.h"
+
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(tensor, TensorLayer);
+
+bool TensorLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  /* initialize the weightList */
+  CHECK_EQ(inputLayers_.size(), 2LU);
+  CHECK(parameters_[0]);
+  CHECK(!parameters_[1]);
+
+  // Option the parameters
+  size_t height = inputLayers_[0]->getSize();
+  size_t width = inputLayers_[1]->getSize();
+  CHECK_EQ(width * height * getSize(), parameters_[0]->getSize());
+
+  for (size_t i = 0; i < getSize(); ++i) {
+    // create a new weight
+    Weight* w = new Weight(height, width, parameters_[0], i * width * height);
+
+    // append the new weight to the list
+    weights_.emplace_back(w);
+  }
+
+  /* initialize biases_ */
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+  }
+
+  return true;
+}
+
+void TensorLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  int batchSize = getInputValue(0)->getHeight();
+  int size = getSize();
+
+  { resetOutput(batchSize, size); }
+
+  MatrixPtr outV = getOutputValue();
+  /* add the bias-vector */
+  if (biases_.get() != NULL) {
+    outV->addBias(*(biases_->getW()), 1);
+  }
+
+  /* e1 * W * trans(e2) */ {
+    MatrixPtr input1 = getInputValue(0);
+    MatrixPtr input2 = getInputValue(1);
+    MatrixPtr tmpMat = Matrix::create(input2->getHeight(),
+                                      input2->getWidth(),
+                                      /* trans= */ false,
+                                      input2->useGpu());
+    REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str());
+    for (size_t i = 0; i < getSize(); ++i) {
+      MatrixPtr weights = weights_[i]->getW();
+      tmpMat->mul(*input1, *weights, 1, 0);
+      outV->rowDotMul(i, *tmpMat, *input2);
+    }
+  }
+
+  /* activation */ { forwardActivation(); }
+}
+
+void TensorLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+    /* Increasing the number of gradient */
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  bool syncFlag = hl_get_sync_flag();
+
+  /* Calculate the W-gradient for the current layer */
+  MatrixPtr input1 = getInputValue(0);
+  MatrixPtr input2 = getInputValue(1);
+  MatrixPtr oGrad = getOutputGrad();
+  MatrixPtr tmpMat = Matrix::create(input1->getHeight(),
+                                    input1->getWidth(),
+                                    /* trans= */ false,
+                                    input1->useGpu());
+
+  /* trans(grad * e1) * e2 */ {
+    REGISTER_TIMER_INFO("TensorGradMulTimer", getName().c_str());
+    for (size_t i = 0; i < getSize(); ++i) {
+      if (weights_[i]->getWGrad()) {
+        tmpMat->rowScale(i, *input1, *oGrad);
+        MatrixPtr input1_T = tmpMat->getTranspose();
+        weights_[i]->getWGrad()->mul(*input1_T, *input2, 1, 1);
+      }
+    }
+  }
+
+  hl_set_sync_flag(false);
+
+  /* Calculate the input layers error */ {
+    MatrixPtr preGrad1 = getInputGrad(0);
+    MatrixPtr preGrad2 = getInputGrad(1);
+
+    REGISTER_TIMER_INFO("TensorBpMulTimer", getName().c_str());
+    for (size_t i = 0; i < getSize(); ++i) {
+      MatrixPtr weights = weights_[i]->getW();
+
+      if (NULL != preGrad1) { /* (grad * e2) * trans(W) */
+        tmpMat->rowScale(i, *input2, *oGrad);
+        MatrixPtr weights_T = weights->getTranspose();
+        preGrad1->mul(*tmpMat, *weights_T, 1, 1);
+      }
+      if (NULL != preGrad2) { /* (grad * e1) * W */
+        tmpMat->rowScale(i, *input1, *oGrad);
+        preGrad2->mul(*tmpMat, *weights, 1, 1);
+      }
+    }
+  }
+  hl_set_sync_flag(syncFlag);
+  parameters_[0]->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TensorLayer.h b/paddle/legacy/gserver/layers/TensorLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc491a7c9f223cf0dff6d878c6ec27a858c7c7b7
--- /dev/null
+++ b/paddle/legacy/gserver/layers/TensorLayer.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * @brief TensorLayer takes two input vectors.
+ * \f[
+ *     y_{i} = x_{1} * W_{i} * x_{2}^{\rm T}, i=0, 1, ...,K-1
+ * \f]
+ *
+ * - \f$x_{1}\f$: the first input, size is M.
+ * - \f$x_{2}\f$: the second input, size is N.
+ * - y: output, size is K.
+ * - \f$y_{i}\f$: i-th element of y.
+ * - \f$W_{i}\f$: the i-th learned weight, dimensions: [M, N].
+ * - \f$x_{2}^{\rm T}\f$: the transpose of \f$x_{2}\f$.
+ *
+ * The config file api is tensor_layer.
+ */
+
+class TensorLayer : public Layer {
+ protected:
+  WeightList weights_;
+  std::unique_ptr<Weight> biases_;
+
+ public:
+  explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  Weight& getWeight(int idx) { return *weights_[idx]; }
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TransLayer.cpp b/paddle/legacy/gserver/layers/TransLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fd1d435ea5f53785c9c416146c642637adc786a8
--- /dev/null
+++ b/paddle/legacy/gserver/layers/TransLayer.cpp
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TransLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+namespace paddle {
+
+REGISTER_LAYER(trans, TransLayer);
+
+bool TransLayer::init(const LayerMap& layerMap,
+                      const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  /* the size of inputs for trans-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+
+  return true;
+}
+
+void TransLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  /* malloc memory for the output_ if necessary */
+  MatrixPtr input = getInputValue(0);
+  int height = input->getHeight();
+  int width = input->getWidth();
+
+  resizeOutput(width, height);
+
+  MatrixPtr outV = getOutputValue();
+
+  /* outV's memory has been allocated, so memAlloc = false */
+  input->transpose(outV, false);
+  if (getInputGrad(0)) {
+    zeroGrad();
+  }
+}
+
+void TransLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  MatrixPtr outputGrad = getOutputGrad();
+  if (outputGrad == NULL) {
+    return;
+  }
+  MatrixPtr preGrad = getInputGrad(0);
+  if (preGrad) {
+    MatrixPtr transGrad = Matrix::create(preGrad->getHeight(),
+                                         preGrad->getWidth(),
+                                         /* trans= */ false,
+                                         preGrad->useGpu());
+    outputGrad->transpose(transGrad, false);
+    preGrad->add(*transGrad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TransLayer.h b/paddle/legacy/gserver/layers/TransLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a6b13933f83f30a07ed63d722dbb612c64edae7
--- /dev/null
+++ b/paddle/legacy/gserver/layers/TransLayer.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A layer for transposing a minibatch matrix.
+ * \f[
+     y = x^\mathrm{T}
+ * \f]
+ * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
+ *
+ * The config file api is trans_layer.
+ */
+class TransLayer : public Layer {
+ public:
+  explicit TransLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c8533dc7d78ec4fd3629e29e6c1c3e73c6acdc17
--- /dev/null
+++ b/paddle/legacy/gserver/layers/TransposedFullMatrixProjection.cpp
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief TransposedFullMatrixProjection performs full matrix multiplication:
+ * out.row[i] += in.row[i] * weight.transpose
+ *
+ * The config file api is trans_full_matrix_projection.
+ */
+class TransposedFullMatrixProjection : public Projection {
+ public:
+  TransposedFullMatrixProjection(const ProjectionConfig& config,
+                                 ParameterPtr parameter,
+                                 bool useGPu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+
+ protected:
+  std::unique_ptr<Weight> weight_;
+};
+
+REGISTER_PROJECTION(trans_fc, TransposedFullMatrixProjection);
+
+TransposedFullMatrixProjection::TransposedFullMatrixProjection(
+    const ProjectionConfig& config, ParameterPtr parameter, bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  weight_.reset(
+      new Weight(config.output_size(), config.input_size(), parameter));
+}
+
+void TransposedFullMatrixProjection::forward() {
+  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
+  out_->value->mul(*(in_->value), *(weight_->getW()->getTranspose()), 1, 1);
+}
+
+void TransposedFullMatrixProjection::backward(const UpdateCallback& callback) {
+  bool syncFlag = hl_get_sync_flag();
+
+  /* Calculate the W-gradient for the current layer */
+  if (weight_->getWGrad()) {
+    REGISTER_TIMER_INFO("GradMulTimer", getName().c_str());
+    weight_->getWGrad()->mul(
+        *(out_->grad->getTranspose()), *(in_->value), 1, 1);
+  }
+
+  // If callback does not change value, backprop error asynchronously so that
+  // we can do the callback concurrently.
+  // This is still a little bit dangerous since theoretically for
+  // SyncMultiGpuMachine it is possible that the value copyback can still
+  // happen at the same time as the error backprop where the value is being
+  // used.
+  hl_set_sync_flag(false);
+
+  /* Calculate the input layers error */
+  if (in_->grad) {
+    REGISTER_TIMER_INFO("BpMulTimer", getName().c_str());
+    in_->grad->mul(*(out_->grad), *(weight_->getW()), 1, 1);
+  }
+
+  hl_set_sync_flag(syncFlag);
+  parameter_->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/UpsampleLayer.cpp b/paddle/legacy/gserver/layers/UpsampleLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3ff5332e6401acc3a28c9808fddd4812a7323544
--- /dev/null
+++ b/paddle/legacy/gserver/layers/UpsampleLayer.cpp
@@ -0,0 +1,108 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+    limitations under the License. */
+
+#include "UpsampleLayer.h"
+#include "iostream"
+
+namespace paddle {
+
+REGISTER_LAYER(upsample, UpsampleLayer);
+
+size_t UpsampleLayer::getOutputSize() {
+  if (upsampleSize_ == 0) {
+    upsampleSize_ = imgSize_ * scale_ - static_cast<int>(padOutX_);
+    upsampleSizeY_ = imgSizeY_ * scaleY_ - static_cast<int>(padOutY_);
+  }
+  return upsampleSize_ * upsampleSizeY_ * channels_;
+}
+
+bool UpsampleLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+  CHECK_EQ(config_.inputs_size(), 2);
+  const auto& conf = config_.inputs(0).upsample_conf();
+  const auto& img_conf = conf.image_conf();
+
+  imgSizeY_ =
+      img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
+  imgSize_ = img_conf.img_size();
+  channels_ = img_conf.channels();
+
+  CHECK((conf.has_upsample_size()) || (conf.has_scale()))
+      << "scale or upsample_size is required.";
+
+  if (conf.has_upsample_size()) {
+    upsampleSize_ = conf.upsample_size();
+    upsampleSizeY_ = upsampleSize_;
+    if (conf.has_upsample_size_y()) {
+      upsampleSizeY_ = conf.upsample_size_y();
+    }
+  } else {
+    if (!conf.has_scale_y()) {
+      scale_ = scaleY_ = conf.scale_y();
+      CHECK_GT(static_cast<int>(scale_), 1);
+    } else {
+      scale_ = conf.scale();
+      scaleY_ = conf.scale_y();
+    }
+    padOutX_ = conf.pad_out_x();
+    padOutY_ = conf.pad_out_y();
+    CHECK(!padOutX_ || scale_ == 2)
+        << "Output height padding compensation requires scale_ == 2";
+    CHECK(!padOutY_ || scaleY_ == 2)
+        << "Output width padding compensation requires scaleY_ == 2";
+    upsampleSize_ = upsampleSizeY_ = 0;
+  }
+  return true;
+}
+
+void UpsampleLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr input = getInputValue(0);
+  MatrixPtr mask = inputLayers_[1]->getOutput("mask").value;
+
+  size_t batchSize = input->getHeight();
+  size_t outSize = getOutputSize();
+
+  CHECK_EQ(input->getWidth(), mask->getWidth());
+  CHECK_EQ(mask->getHeight(), batchSize);
+  resetOutput(batchSize, outSize);
+
+  MatrixPtr output = getOutputValue();
+  output->upsampleForward(*input,
+                          *mask,
+                          imgSize_,
+                          imgSizeY_,
+                          channels_,
+                          upsampleSize_,
+                          upsampleSizeY_);
+}
+
+void UpsampleLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr mask = inputLayers_[1]->getOutput("mask").value;
+  MatrixPtr inputGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+  inputGrad->upsampleBackward(*outputGrad,
+                              *mask,
+                              imgSize_,
+                              imgSizeY_,
+                              channels_,
+                              upsampleSize_,
+                              upsampleSizeY_);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/UpsampleLayer.h b/paddle/legacy/gserver/layers/UpsampleLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2fe5938244c81ab25c66083cc1ad63ba15618aa1
--- /dev/null
+++ b/paddle/legacy/gserver/layers/UpsampleLayer.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * This layer transpose the pooling process.
+ * It takes two input, the first input is the input data, and
+ * the second is the mask data from the max-pool-with-mask layer.
+ *
+ */
+
+class UpsampleLayer : public Layer {
+ public:
+  explicit UpsampleLayer(const LayerConfig& config) : Layer(config) {}
+  ~UpsampleLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+  size_t getOutputSize();
+
+ protected:
+  size_t scale_, scaleY_;
+  size_t upsampleSize_, upsampleSizeY_;
+  size_t padOutX_, padOutY_;
+  size_t imgSize_, imgSizeY_;
+  size_t channels_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ValidationLayer.cpp b/paddle/legacy/gserver/layers/ValidationLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9956fd2ed41464eae096911620e160f5ecd89da3
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ValidationLayer.cpp
@@ -0,0 +1,171 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <fstream>
+#include <memory>
+
+#include "ValidationLayer.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+bool ValidationLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  return Layer::init(layerMap, parameterMap);
+}
+
+void ValidationLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr output = getInputValue(*getOutputLayer());
+  CHECK(output);
+  IVectorPtr label = getInputLabel(*getLabelLayer());
+  CHECK(label);
+  validationImp(output, label);
+}
+
+void ValidationLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+}
+
+bool AucValidation::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  bool ret = ValidationLayer::init(layerMap, parameterMap);
+  EvaluatorConfig config;
+  config.set_name(getName());
+  config.set_type("last-column-auc");
+  config.add_input_layers(inputLayers_[0]->getName());
+  config.add_input_layers(inputLayers_[1]->getName());
+  if (3 == inputLayers_.size()) {
+    config.add_input_layers(inputLayers_[2]->getName());
+  }
+  evaluator_.reset(Evaluator::create(config));
+  passBegin_ = false;
+  return ret;
+}
+
+void AucValidation::validationImp(MatrixPtr output, IVectorPtr label) {
+  if (!passBegin_) {
+    passBegin_ = true;
+    evaluator_->start();
+  }
+
+  bool supportWeight = (3 == inputLayers_.size()) ? true : false;
+  MatrixPtr weight = supportWeight ? getInputValue(*inputLayers_[2]) : nullptr;
+  if (dynamic_cast<GpuMatrix*>(output.get())) {
+    size_t height = output->getHeight();
+    size_t width = output->getWidth();
+    Matrix::resizeOrCreate(cpuOutput_,
+                           height,
+                           width,
+                           /* trans=*/false,
+                           /* useGpu=*/false);
+    cpuOutput_->copyFrom(*output);
+    IVector::resizeOrCreate(cpuLabel_, height, false);
+    cpuLabel_->copyFrom(*label);
+
+    if (supportWeight) {
+      Matrix::resizeOrCreate(cpuWeight_, height, (size_t)1, false, false);
+      cpuWeight_->copyFrom(*weight);
+    }
+
+    output = cpuOutput_;
+    label = cpuLabel_;
+    weight = cpuWeight_;
+  }
+
+  for (size_t i = 0; i < output->getHeight(); i++) {
+    float y1 = output->getData()[i * output->getWidth() + 1];
+    int* labels = label->getData();
+    predictArray_.push_back(PredictionResult(y1, labels[i]));
+  }
+  std::vector<Argument> arguments;
+  if (3 == inputLayers_.size()) {
+    arguments.resize(3);
+    arguments[2].value = weight;
+  } else {
+    arguments.resize(2);
+  }
+  arguments[0].value = output;
+  arguments[1].ids = label;
+  evaluator_->evalImp(arguments);
+}
+
+void AucValidation::onPassEnd() {
+  if (!FLAGS_predict_file.empty()) {
+    std::ofstream fs(FLAGS_predict_file);
+    CHECK(fs) << "Fail to open " << FLAGS_predict_file;
+    for (auto& res : predictArray_) {
+      fs << res.out << " " << res.label << std::endl;
+    }
+  }
+
+  evaluator_->finish();
+  LOG(INFO) << *evaluator_;
+  passBegin_ = false;
+  predictArray_.clear();
+}
+
+bool PnpairValidation::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  bool ret = ValidationLayer::init(layerMap, parameterMap);
+  if (!ret) return ret;
+  CHECK_GE(inputLayers_.size(), 3UL);
+  CHECK_LE(inputLayers_.size(), 4UL);
+  EvaluatorConfig config;
+  config.set_name(getName());
+  config.set_type("pnpair");
+  config.add_input_layers(inputLayers_[0]->getName());
+  config.add_input_layers(inputLayers_[1]->getName());
+  config.add_input_layers(inputLayers_[2]->getName());
+  if (4 == inputLayers_.size()) {
+    config.add_input_layers(inputLayers_[3]->getName());
+  }
+  evaluator_.reset(Evaluator::create(config));
+  passBegin_ = false;
+  return true;
+}
+
+void PnpairValidation::validationImp(MatrixPtr output, IVectorPtr label) {
+  if (!passBegin_) {
+    passBegin_ = true;
+    evaluator_->start();
+  }
+  MatrixPtr weight =
+      (4 == inputLayers_.size()) ? getInputValue(*inputLayers_[3]) : nullptr;
+  IVectorPtr info = getInputLabel(*getInfoLayer());
+  std::vector<Argument> arguments;
+  if (4 == inputLayers_.size()) {
+    arguments.resize(4);
+    arguments[3].value = weight;
+  } else {
+    arguments.resize(3);
+  }
+  arguments[0].value = output;
+  arguments[1].ids = label;
+  arguments[2].ids = info;
+  evaluator_->evalImp(arguments);
+}
+
+void PnpairValidation::onPassEnd() {
+  if (!FLAGS_predict_file.empty()) {
+    (dynamic_cast<PnpairEvaluator*>(evaluator_.get()))->printPredictResults();
+  }
+  evaluator_->finish();
+  LOG(INFO) << *evaluator_;
+  passBegin_ = false;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/gserver/layers/ValidationLayer.h b/paddle/legacy/gserver/layers/ValidationLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbc94e8ef570e2eec1d3737aca97bbf91c1392b2
--- /dev/null
+++ b/paddle/legacy/gserver/layers/ValidationLayer.h
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <memory>
+
+#include "Layer.h"
+#include "paddle/legacy/gserver/evaluators/Evaluator.h"
+
+DECLARE_int32(trainer_id);
+
+namespace paddle {
+
+class ValidationLayer : public Layer {
+ public:
+  explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  LayerPtr getOutputLayer() { return inputLayers_[0]; }
+
+  LayerPtr getLabelLayer() { return inputLayers_[1]; }
+
+  LayerPtr getInfoLayer() {
+    assert(inputLayers_.size() > 2);
+    return inputLayers_[2];
+  }
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+  virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0;
+
+  void onPassEnd() override = 0;
+};
+
+/*
+ * AucValidation
+ */
+class AucValidation : public ValidationLayer {
+ public:
+  explicit AucValidation(const LayerConfig& config)
+      : ValidationLayer(config),
+        cpuOutput_(nullptr),
+        cpuLabel_(nullptr),
+        cpuWeight_(nullptr) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
+
+  void onPassEnd() override;
+
+  struct PredictionResult {
+    PredictionResult(real __out, int __label) : out(__out), label(__label) {}
+    real out;
+    int label;
+  };
+  std::vector<PredictionResult> predictArray_;
+
+ private:
+  bool passBegin_;
+  std::unique_ptr<Evaluator> evaluator_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
+  MatrixPtr cpuWeight_;
+};
+
+/*
+ * positive-negative pair rate Validation
+ */
+class PnpairValidation : public ValidationLayer {
+ public:
+  explicit PnpairValidation(const LayerConfig& config)
+      : ValidationLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
+
+  void onPassEnd() override;
+
+ private:
+  bool passBegin_;
+  std::unique_ptr<Evaluator> evaluator_;
+};
+
+typedef std::shared_ptr<ValidationLayer> ValidationLayerPtr;
+}  // namespace paddle
diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/legacy/gserver/layers/WarpCTCLayer.cpp
similarity index 100%
rename from paddle/gserver/layers/WarpCTCLayer.cpp
rename to paddle/legacy/gserver/layers/WarpCTCLayer.cpp
diff --git a/paddle/legacy/gserver/layers/WarpCTCLayer.h b/paddle/legacy/gserver/layers/WarpCTCLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..3017ca794ecc14f5a3cbd0b302a4953a191a5065
--- /dev/null
+++ b/paddle/legacy/gserver/layers/WarpCTCLayer.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer integrating the open-source warp-ctc library
+ *        <https://github.com/baidu-research/warp-ctc> to compute connectionist
+ *        temporal classification cost.
+ *
+ * The config file api is warp_ctc_layer.
+ */
+class WarpCTCLayer : public Layer {
+ public:
+  explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
+  ~WarpCTCLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+ protected:
+  /**
+   * sequence matrix and batch matrix copy:
+   * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
+   * batch    (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
+   */
+  void seq2batchPadding(const MatrixPtr& seqValue,
+                        MatrixPtr& batchValue,
+                        const ICpuGpuVectorPtr& seqStartPositions);
+  void batch2seqPadding(const MatrixPtr& seqValue,
+                        MatrixPtr& batchValue,
+                        const ICpuGpuVectorPtr& seqStartPositions,
+                        bool normByTimes);
+
+ protected:
+  size_t numClasses_;
+  size_t blank_;
+  size_t maxSequenceLength_;
+  bool normByTimes_;
+
+  MatrixPtr batchValue_;
+  MatrixPtr batchGrad_;
+  VectorPtr workspace_;
+
+  IVectorPtr cpuLabels_;
+  MatrixPtr cpuCosts_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/.gitignore b/paddle/legacy/gserver/tests/.gitignore
similarity index 100%
rename from paddle/gserver/tests/.gitignore
rename to paddle/legacy/gserver/tests/.gitignore
diff --git a/paddle/legacy/gserver/tests/CMakeLists.txt b/paddle/legacy/gserver/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..93ddf5aa233017d4f5139a8add6c69ef3a4682b4
--- /dev/null
+++ b/paddle/legacy/gserver/tests/CMakeLists.txt
@@ -0,0 +1,103 @@
+# gserver pacakge unittests
+add_simple_unittest(test_LinearChainCRF)
+add_simple_unittest(test_RecurrentLayer)
+
+if(NOT MOBILE_INFERENCE)
+  add_simple_unittest(test_MultinomialSampler)
+endif()
+
+function(gserver_test TARGET)
+  add_unittest_without_exec(${TARGET}
+      ${TARGET}.cpp
+      LayerGradUtil.cpp)
+  add_test(NAME ${TARGET}
+      COMMAND ${TARGET})
+endfunction()
+
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/concat_dotmul_a.conf
+    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_gserver_conf ALL DEPENDS concat_dotmul_a.conf)
+
+gserver_test(test_LayerGrad)
+gserver_test(test_CRFLayerGrad)
+gserver_test(test_CrossEntropyOverBeamGrad)
+gserver_test(test_SeqSliceLayerGrad)
+gserver_test(test_ActivationGrad)
+gserver_test(test_ConvTrans)
+gserver_test(test_PriorBox)
+gserver_test(test_DetectionOutput)
+gserver_test(test_ConvUnify)
+gserver_test(test_BatchNorm)
+gserver_test(test_KmaxSeqScore)
+gserver_test(test_Expand)
+gserver_test(test_MaxPoolingWithMaskOutput)
+gserver_test(test_Upsample)
+
+set(PYTHON_PATH 
+   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
+   ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/legacy/gserver/tests)
+function(gserver_test_with_python TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
+endfunction()
+
+gserver_test_with_python(test_PyDataProvider2)
+if(WITH_PYTHON)
+    gserver_test_with_python(test_PyDataProvider)
+endif()
+if(NOT MOBILE_INFERENCE)
+    gserver_test_with_python(test_CompareTwoNets)
+    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
+    gserver_test_with_python(test_RecurrentGradientMachine)
+endif()
+
+########## test_MKLDNN layers and activations ##########
+if(WITH_MKLDNN)
+    add_unittest_without_exec(test_MKLDNN
+        test_MKLDNN.cpp
+        MKLDNNTester.cpp
+        LayerGradUtil.cpp)
+    add_test(NAME test_MKLDNN
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
+            WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
+endif()
+
+############### test_WarpCTCLayer #######################
+if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
+    add_unittest_without_exec(test_WarpCTCLayer
+        test_WarpCTCLayer.cpp)
+    add_test(NAME test_WarpCTCLayer
+        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
+        WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
+endif()
+
+if(NOT MOBILE_INFERENCE)
+    ################## test_Evaluator #############
+    add_unittest(test_Evaluator
+        test_Evaluator.cpp)
+      
+    ########### test_NetworkCompare ###############
+    add_unittest_without_exec(test_NetworkCompare
+        test_NetworkCompare.cpp)
+    if(WITH_GPU)
+        set(use_gpu true)
+    else()
+        set(use_gpu false)
+    endif()
+    add_test(NAME test_NetworkCompare
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
+        WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle)
+
+    ############ test_CompareSparse ################
+    add_unittest_without_exec(test_CompareSparse
+        test_CompareSparse.cpp)
+    if(NOT ON_TRAVIS)
+      add_test(NAME test_CompareSparse
+        COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 6
+                ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+        WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
+    endif()
+endif()
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/legacy/gserver/tests/LayerGradUtil.cpp
similarity index 100%
rename from paddle/gserver/tests/LayerGradUtil.cpp
rename to paddle/legacy/gserver/tests/LayerGradUtil.cpp
diff --git a/paddle/legacy/gserver/tests/LayerGradUtil.h b/paddle/legacy/gserver/tests/LayerGradUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..941989a1da49d215b9ed4af72e732d6a62fd225d
--- /dev/null
+++ b/paddle/legacy/gserver/tests/LayerGradUtil.h
@@ -0,0 +1,329 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+
+#include "paddle/testing/TestUtil.h"
+using namespace std;  // NOLINT
+
+namespace paddle {
+enum InputType {
+  INPUT_DATA,         // dense vector
+  INPUT_LABEL,        // id
+  INPUT_DATA_TARGET,  // dense vector, but no gradient
+  INPUT_SEQUENCE_DATA,
+  INPUT_HASSUB_SEQUENCE_DATA,  // sequence has sub-sequence
+  INPUT_SEQUENCE_MDIM_DATA,
+  INPUT_SEQUENCE_LABEL,
+  INPUT_SPARSE_NON_VALUE_DATA,
+  INPUT_SPARSE_FLOAT_VALUE_DATA,
+  INPUT_DENSE_DIM_DATA,    // using sequence length to init dense data
+  INPUT_SELF_DEFINE_DATA,  // support customizing for input value
+};
+
+struct ParaSparse {
+  bool sparse;
+  string format;
+  // if equalNnzPerSample is set true,
+  // every row of the sparse matrix in a format of CSR has a same
+  // number of nnz values. Currently, this flag is only used for
+  // selective_fc layer
+  bool equalNnzPerSample;
+  ParaSparse(const string& formatIn = "") {  // NOLINT
+    if (formatIn == "") {
+      sparse = false;
+    } else {
+      sparse = true;
+    }
+    equalNnzPerSample = false;
+  }
+  ParaSparse(const string& formatIn, bool equalNnz) {
+    format = formatIn;
+    sparse = true;
+    equalNnzPerSample = equalNnz;
+  }
+};
+
+struct InputDef {
+  InputType inputType;
+  string name;
+  size_t dim;
+  size_t paraSize;
+  ParaSparse sparse;
+  bool isStatic;
+  std::vector<int> labelInitValue;
+  std::vector<int> labelSeqStartPositions;
+  std::vector<int> labelSubSeqStartPositions;
+  std::vector<int> ids;
+  MatrixPtr selfDefinedData;
+
+  InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
+    inputType = type;
+    name = nameIn;
+    dim = dimIn;
+    paraSize = sizeIn;
+    sparse = {""};
+    isStatic = false;
+  }
+
+  InputDef(InputType type,
+           string nameIn,
+           MatrixPtr selfDefinedData,
+           std::vector<int> selfDefinedSeqStartPos = {},
+           std::vector<int> selfDefinedSubSeqStartPos = {})
+      : labelSeqStartPositions(selfDefinedSeqStartPos),
+        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
+        selfDefinedData(selfDefinedData) {
+    inputType = type;
+    name = nameIn;
+    dim = 0;
+    sparse = {""};
+    paraSize = 0;
+    isStatic = false;
+  }
+
+  InputDef(InputType type,
+           string nameIn,
+           const std::vector<int>& ids,
+           const std::vector<int>& selfDefinedSeqStartPos = {},
+           const std::vector<int>& selfDefinedSubSeqStartPos = {})
+      : labelSeqStartPositions(selfDefinedSeqStartPos),
+        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
+        ids(ids) {
+    selfDefinedData = nullptr;
+    inputType = type;
+    name = nameIn;
+    dim = 0;
+    sparse = {""};
+    paraSize = 0;
+    isStatic = false;
+  }
+
+  InputDef(InputType type,
+           string nameIn,
+           size_t dimIn,
+           size_t sizeIn,
+           const std::vector<int>& labelInitValue,
+           const std::vector<int>& labelSeqStartPositions)
+      : labelInitValue(labelInitValue),
+        labelSeqStartPositions(labelSeqStartPositions) {
+    inputType = type;
+    name = nameIn;
+    dim = dimIn;
+    paraSize = sizeIn;
+    sparse = {""};
+    isStatic = false;
+  }
+
+  InputDef(InputType type,
+           string nameIn,
+           size_t dimIn,
+           size_t sizeIn,
+           ParaSparse sparseIn) {
+    inputType = type;
+    name = nameIn;
+    dim = dimIn;
+    paraSize = sizeIn;
+    sparse = sparseIn;
+  }
+};
+
+struct TestConfig {
+  LayerConfig layerConfig;
+  std::vector<InputDef> inputDefs;
+  size_t biasSize;
+  real paramInitialMean;
+  real paramInitialStd;
+  bool testAccumulate;
+  bool testState;
+  bool staticBias;
+  bool testBatchState;
+  TestConfig()
+      : biasSize(0),
+        paramInitialMean(0.0),
+        paramInitialStd(1.0),
+        testAccumulate(true),
+        testState(false),
+        staticBias(false),
+        testBatchState(false) {}
+};
+
+real getCostSum(ParameterPtr& parameter,
+                CpuVector& cpuPara,
+                LayerPtr& testLayer,
+                MatrixPtr weights = nullptr);
+
+real getDiffAndPrint(real newCost1,
+                     real newCost2,
+                     real callbackCount,
+                     char fill,
+                     string testLayerName,
+                     string name,
+                     real step,
+                     real delta);
+
+/**
+ * @brief verify that sequentially running forward() one timestamp at one time
+ *        has same result as running forward() with one whole sequence
+ *
+ * @param testLayer[in/out]    testLayer
+ * @param dataLayers[in/out]   dataLayers
+ * @param datas[in/out]        data of dataLayers
+ */
+void testState(LayerPtr testLayer,
+               vector<DataLayerPtr>& dataLayers,
+               vector<Argument>& datas);
+
+/**
+ * @brief verify that sequentially running forward() with short sequences one
+ *        time has same result as running forward() with long sequences.
+ *
+ * @param testLayer[in/out]    testLayer
+ * @param dataLayers[in/out]   dataLayers
+ * @param datas[in/out]        data of dataLayers
+ */
+void testBatchState(LayerPtr testLayer,
+                    vector<DataLayerPtr>& dataLayers,
+                    vector<Argument>& datas);
+
+/**
+ * @brief Generate a perturbation so that it is roughly aligned with the
+ *        gradient direction. This is to make sure that change along this
+ *        direction will make cost increase (or decrease) in a meaningful
+ *        way so that the finite difference can be used to approximate the
+ *        directional dirivative well.
+ *
+ * @param oldGrad[in]  input gradient
+ *        newGrad[out] output gradient
+ *        dim          dimension of oldGrad/newGrad
+ *
+ * @return sum_i(oldGrad[i] * newGrad[i])
+ */
+double genPerturbation(const real* oldGrad, real* newGrad, size_t dim);
+
+void initWeight(MatrixPtr& weights);
+
+void initBatchState(LayerPtr dataLayer,
+                    LayerPtr testLayer,
+                    LayerStatePtr state,
+                    bool useGpu);
+
+/**
+ * @brief initialize the dataLayer by its inputType
+ *
+ * @param testConf[in]        test config
+ *        dataLayers[out]     dataLayers
+ *        datas[out]          initialized data of dataLayers
+ *        layerMap[out]       layerMap
+ */
+void initDataLayer(TestConfig testConf,
+                   std::vector<DataLayerPtr>* dataLayers,
+                   vector<Argument>* datas,
+                   LayerMap* layerMap,
+                   string testLayerName,
+                   size_t batchSize,
+                   bool trans,
+                   bool useGpu);
+
+/**
+ * @brief initialize the parameter of testLayer
+ *
+ * @param testConf[in/out]    test config
+ *        layerMap[out]       layerMap
+ *        parameters[out]     parameters of testLayer
+ *        testLayer[out]      testLayer
+ */
+void initTestLayer(TestConfig testConf,
+                   LayerMap* layerMap,
+                   std::vector<ParameterPtr>* parameters,
+                   LayerPtr* testLayer);
+
+/**
+ * @brief Test whether the layer's forward calculation is stable by adding
+ *        perturbation to its parameters
+ *
+ * @param testConf[in]         test config
+ *        weights[in]          weights of testLayer
+ *        state[in]            state of testLayer
+ *        cost[in]             input cost
+ *        callbackCount[in]    number of done callback
+ *        maxDiff[in/out]      max of all previous diff
+ *        testLayer[in/out]    testLayer
+ *        parameters[in/out]   parameters of testLayer
+ */
+void testPerturbParameter(TestConfig testConf,
+                          const MatrixPtr weights,
+                          const LayerStatePtr state,
+                          real cost,
+                          real callbackCount,
+                          real* maxDiff,
+                          LayerPtr testLayer,
+                          std::vector<ParameterPtr>* parameters);
+
+/**
+ * @brief Test whether the layer's forward calculation is stable by adding
+ *        perturbation to its input layers
+ *
+ * @param testConf[in]         test config
+ *        weights[in]          weights of testLayer
+ *        state[in]            state of testLayer
+ *        cost[in]             input cost
+ *        callbackCount[in]    number of done callback
+ *        maxDiff[in/out]      max of all previous diff
+ *        testLayer[in/out]    testLayer
+ *        dataLayers[in/out]   dataLayers
+ */
+void testPerturbInput(TestConfig testConf,
+                      const MatrixPtr weights,
+                      const LayerStatePtr state,
+                      real cost,
+                      real callbackCount,
+                      real* maxDiff,
+                      LayerPtr testLayer,
+                      std::vector<DataLayerPtr> dataLayers);
+
+void testLayerGradKernel(TestConfig testConf,
+                         string testLayerName,
+                         size_t batchSize,
+                         bool trans,
+                         bool useGpu,
+                         bool useWeight = false,
+                         float epsilon = 0.02);
+
+void testLayerGrad(TestConfig testConf,
+                   string testLayerName,
+                   size_t batchSize,
+                   bool trans,
+                   bool useGpu,
+                   bool useWeight = false,
+                   float epsilon = 0.02);
+
+void testProjectionGrad(ProjectionConfig conf,
+                        InputType inputType,
+                        size_t parameterSize,
+                        size_t batchSize,
+                        bool useGpu,
+                        bool testState = false,
+                        int biasSize = 0,
+                        bool sharedBias = false);
+
+void testOperatorGrad(TestConfig& config,
+                      OperatorConfig& operatorConf,
+                      size_t batchSize,
+                      bool useGpu,
+                      bool testState = false);
+
+}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.cpp b/paddle/legacy/gserver/tests/MKLDNNTester.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b550ba9c72d85830dbf12485a6a645a6b5360026
--- /dev/null
+++ b/paddle/legacy/gserver/tests/MKLDNNTester.cpp
@@ -0,0 +1,580 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNTester.h"
+#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
+#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
+#include "paddle/legacy/trainer/Trainer.h"
+
+namespace paddle {
+
+// init data layer and test layer of both dnn and reference
+void MKLDNNTester::reset(const TestConfig& dnn,
+                         const TestConfig& ref,
+                         size_t batchSize) {
+  const bool trans = false;
+  const bool useGpu = false;
+
+  // clear
+  configs_.clear();
+  layerNames_.clear();
+  dataLayers_.clear();
+  datas_.clear();
+  layerMaps_.clear();
+  parameters_.clear();
+  testLayers_.clear();
+
+  // resize
+  configs_.resize(NUM);
+  layerNames_.resize(NUM);
+  dataLayers_.resize(NUM);
+  datas_.resize(NUM);
+  layerMaps_.resize(NUM);
+  parameters_.resize(NUM);
+  testLayers_.resize(NUM);
+
+  // reset configs and layer names
+  configs_[DNN] = dnn;
+  configs_[REF] = ref;
+  layerNames_[DNN] = "mkldnn";     // the first is mkldnn layer
+  layerNames_[REF] = "reference";  // second is reference layer
+
+  // reset others
+  for (size_t i = 0; i < NUM; ++i) {
+    configs_[i].layerConfig.set_name(layerNames_[i]);
+    initDataLayer(configs_[i],
+                  &(dataLayers_[i]),
+                  &(datas_[i]),
+                  &(layerMaps_[i]),
+                  layerNames_[i],
+                  batchSize,
+                  trans,
+                  useGpu);
+    initTestLayer(
+        configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
+  }
+  refLayer_ = testLayers_[REF];
+  dnnLayer_ = testLayers_[DNN];
+  EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  setInputImgSize();
+
+  // for comparison with Paddle reference results,
+  // need manually add cpu device output for test
+  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  if (dnnLayer) {
+    dnnLayer->addOutputArgument(CPU_DEVICE);
+  }
+}
+
+void MKLDNNTester::setInputImgSize() {
+  for (size_t n = 0; n < dataLayers_.size(); ++n) {
+    for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
+      // TODO(TJ): fix me when concat and elewise ready
+      dataLayers_[n][i]->getOutput().setFrameHeight(ih_);
+      dataLayers_[n][i]->getOutput().setFrameWidth(iw_);
+    }
+  }
+}
+
+// init randome parameters of ref, and copy to mkldnn
+void MKLDNNTester::randomWgtDatas() {
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  const bool isBN = refLayer_->getType() == "batch_norm";
+  for (size_t i = 0; i < parameters_[REF].size(); ++i) {
+    const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    parameters_[REF][i]->randomize();
+    if (isBN && i == 2) {
+      // this param is moving average in batch norm, which must larger than 0
+      real offset = fabs(refValue->getMin()) + 1.0;
+      refValue->add(offset);
+    }
+    dnnValue->copyFrom(*refValue);
+
+    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
+    printVector(dnnValue);
+  }
+}
+
+// random botdata of ref layer and copy same to mkldnn
+void MKLDNNTester::randomBotDatas() {
+  CHECK_EQ(dataLayers_.size(), NUM);
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
+    dataLayers_[DNN][i]->getOutputValue()->copyFrom(
+        *(dataLayers_[REF][i]->getOutputValue()));
+    VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i;
+    printMatrix(dataLayers_[REF][i]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::randomTopDiffs() {
+  refLayer_->getOutputGrad()->randomizeUniform();
+  dnnLayer_->getOutput(CPU_DEVICE)
+      .grad->copyFrom(*(refLayer_->getOutputGrad()));
+  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
+  printMatrix(refLayer_->getOutputGrad());
+}
+
+void MKLDNNTester::checkForward() {
+  VLOG(MKLDNN_TESTS) << "Check Forward";
+  printTopDatas();
+  double delta =
+      compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
+  EXPECT_LE(fabs(delta), eps_);
+}
+
+void MKLDNNTester::checkBackwardData() {
+  VLOG(MKLDNN_TESTS) << "Check Backward Data";
+  const bool isBN = refLayer_->getType() == "batch_norm";
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
+    const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
+    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
+    printMatrix(dnnDiff);
+    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
+    printMatrix(refDiff);
+
+    double delta = compareMatrix(refDiff, dnnDiff);
+    EXPECT_LE(fabs(delta), eps_);
+    if (isBN) {
+      // the other two inputs in batch norm are for moving mean and var
+      // do not have grad to compare
+      break;
+    }
+  }
+}
+
+void MKLDNNTester::checkBackwardWgts() {
+  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
+  saveWgt(parameters_[DNN], dnnWgts);
+
+  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  if (dnnLayer) {
+    dnnLayer->convertWeightsToPaddle();
+  }
+  for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
+    const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value"
+                     << parameters_[DNN][i]->getName();
+    printVector(dnn);
+    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
+                     << parameters_[REF][i]->getName();
+    printVector(ref);
+
+    double delta = compareVector(ref, dnn);
+    EXPECT_LE(fabs(delta), eps_);
+  }
+
+  VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre";
+  restoreWgt(dnnWgts, parameters_[DNN]);
+}
+
+void MKLDNNTester::saveWgt(const vector<ParameterPtr>& from,
+                           vector<VectorPtr>& to) {
+  const bool useGpu = false;
+  to.resize(from.size());
+  for (size_t i = 0; i < to.size(); ++i) {
+    const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE);
+    to[i] = Vector::create(wgt->getSize(), useGpu);
+    to[i]->copyFrom(*wgt);
+  }
+}
+
+void MKLDNNTester::restoreWgt(const vector<VectorPtr>& from,
+                              vector<ParameterPtr>& to) {
+  CHECK_EQ(from.size(), to.size());
+  for (size_t i = 0; i < from.size(); ++i) {
+    const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE);
+    wgt->copyFrom(*from[i]);
+  }
+}
+
+// clear parameters grad
+void MKLDNNTester::clearWgtDiffs(size_t id) {
+  CHECK_LE(id, parameters_.size());
+  for (size_t n = 0; n < parameters_.size(); ++n) {
+    if (id == n || id == parameters_.size()) {
+      for (size_t i = 0; i < parameters_[n].size(); ++i) {
+        const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT);
+        if (grad) {
+          grad->zeroMem();
+        }
+      }
+    }
+  }
+}
+
+void MKLDNNTester::clearBotDiffs(size_t id) {
+  CHECK_LE(id, dataLayers_.size());
+  for (size_t n = 0; n < dataLayers_.size(); ++n) {
+    if (id == n || id == dataLayers_.size()) {
+      // clear inputs layers of this specific layer
+      for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
+        dataLayers_[n][i]->getOutputGrad()->zeroMem();
+      }
+    }
+  }
+}
+
+void MKLDNNTester::clearTopDatas(size_t id) {
+  CHECK_LE(id, testLayers_.size());
+  for (size_t i = 0; i < testLayers_.size(); ++i) {
+    if (id == i || id == testLayers_.size()) {
+      testLayers_[i]->getOutputValue()->zeroMem();
+    }
+  }
+}
+
+void MKLDNNTester::printTopDatas() {
+  if (!log_) {
+    return;
+  }
+
+  for (int n = 0; n < NUM; ++n) {
+    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
+                     << " Forward Result: OutputValue";
+    printMatrix(testLayers_[n]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::printMatrix(const MatrixPtr& m) {
+  if (!log_) {
+    return;
+  }
+
+  std::ostringstream ostr;
+  m->print(ostr);
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
+}
+
+void MKLDNNTester::printVector(const VectorPtr& v) {
+  if (!log_) {
+    return;
+  }
+
+  std::ostringstream ostr;
+  v->print(ostr, v->getSize());
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
+}
+
+double MKLDNNTester::getDelta(const real* refer,
+                              const real* value,
+                              size_t len,
+                              const float failRate,
+                              const float thres) {
+  double delta = 0, sum = 0;
+  int failCnt = 0;
+  const double eps = 1e-5;
+  double maxRatio = 0;
+  for (size_t i = 0; i < len; ++i) {
+    double ref = fabs(refer[i]);
+    double val = fabs(value[i]);
+    double diff = fabs(refer[i] - value[i]);
+    delta += diff;
+    sum += ref;
+    if (ref < eps && val < eps) {  // both values are very small
+      continue;
+    }
+    double ratio = diff / ref;
+    if (ratio > thres) {
+      maxRatio = std::max(maxRatio, ratio);
+      failCnt++;
+    }
+  }
+  EXPECT_FALSE(std::isinf(sum));
+  EXPECT_FALSE(std::isnan(sum));
+  EXPECT_FALSE(std::isnan(delta));
+  VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len
+                   << ", delta: " << delta / sum << ", failCnt:" << failCnt;
+  double res = sum > eps ? delta / sum : eps;
+  return (failCnt / (float)len) > failRate ? maxRatio : res;
+}
+
+double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
+  CHECK_EQ(m1->getElementCnt(), m2->getElementCnt());
+  return getDelta(m1->getData(), m2->getData(), m1->getElementCnt());
+}
+
+double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
+  CHECK_EQ(v1->getSize(), v2->getSize());
+  return getDelta(v1->getData(), v2->getData(), v1->getSize());
+}
+
+void MKLDNNTester::runOnce() {
+  // test forward
+  randomBotDatas();
+  dnnLayer_->forward(passType_);
+  refLayer_->forward(passType_);
+  checkForward();
+
+  if (passType_ == PASS_TEST) {
+    return;
+  }
+
+  // test backward
+  // simple updater
+  UpdateCallback updateCallback = [](Parameter* para) {
+    auto& grad = para->getBuf(PARAMETER_GRADIENT);
+    auto& value = para->getBuf(PARAMETER_VALUE);
+    real lr = 1e-2;
+    value->add(*grad, lr);
+    grad->zeroMem();
+  };
+  randomTopDiffs();
+  dnnLayer_->backward(updateCallback);
+  refLayer_->backward(updateCallback);
+  checkBackwardData();
+  checkBackwardWgts();
+
+  // clear buffers
+  // ref code will addto the diff, dnn code will writeto it
+  // and clearTopDatas(REF) should be coverd by ref layers
+  clearBotDiffs(REF);
+  clearWgtDiffs(REF);
+  // it is necessary to clear bottom diffs when only activation is dnn type
+  if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) {
+    clearBotDiffs(DNN);
+  }
+}
+
+void MKLDNNTester::run(const TestConfig& dnn,
+                       const TestConfig& ref,
+                       size_t batchSize,
+                       size_t inputImgH,
+                       size_t inputImgW,
+                       PassType passType,
+                       bool printDetails,
+                       size_t iter,
+                       float epsilon) {
+  CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
+        dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
+      << "should be MKLDNN layer or MKLDNN activation";
+  if (dnn.layerConfig.type() == ref.layerConfig.type()) {
+    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
+                       << dnn.layerConfig.active_type() << " vs "
+                       << ref.layerConfig.active_type();
+  } else {
+    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
+                       << dnn.layerConfig.type() << " vs "
+                       << ref.layerConfig.type();
+  }
+
+  ih_ = inputImgH;
+  iw_ = inputImgW;
+  passType_ = passType;
+  log_ = printDetails;
+  iter_ = iter;
+  eps_ = epsilon;
+
+  // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
+  reset(dnn, ref, batchSize);
+  randomWgtDatas();
+  clearWgtDiffs();
+  clearBotDiffs();
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+
+  if (parameters_[DNN].empty()) {
+    // has no paramters
+    return;
+  }
+
+  // After run some iterations, the mkldnn weight has been stored in dnnLayer
+  // and we can also get the mkldnn weight parameter header format.
+  // Weight parameter should always be index 0 (and bias index 1).
+  // TODO(TJ): should also consider mean and var format when batchnorm ready
+  int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat();
+  int refWgtFmt = parameters_[REF][0]->getHeaderFormat();
+  if (dnnWgtFmt == refWgtFmt) {
+    // weight format are equal, so no need check more
+    return;
+  }
+
+  // then save the weights and restart again
+  vector<VectorPtr> dnnWgts, refWgts;
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  saveWgt(parameters_[DNN], dnnWgts);
+  saveWgt(parameters_[REF], refWgts);
+
+  // restart again with dnn weight format
+  reset(dnn, ref, batchSize);
+  // TODO(TJ): should also considerate mean and var format when batchnorm ready
+  parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt);
+
+  // restore wgt
+  restoreWgt(dnnWgts, parameters_[DNN]);
+  restoreWgt(refWgts, parameters_[REF]);
+  clearWgtDiffs();
+  clearBotDiffs();
+
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+}
+
+void MKLDNNTester::initArgument(DataIn& data,
+                                const std::string& configPath,
+                                const size_t iter) {
+  TrainerConfigHelper config(configPath);
+  size_t batchSize = config.getOptConfig().batch_size();
+  data.inArgs.resize(iter);
+  data.outGrads.resize(iter);
+  data.paraValues.clear();
+  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      Argument arg;
+      arg.value = Matrix::create(batchSize, layerSize, false, false);
+      arg.grad = Matrix::create(batchSize, layerSize, false, false);
+      arg.value->randomizeUniform();
+      arg.value->add(-0.5);
+      arg.value->sigmoid(*arg.value);
+      arg.grad->zeroMem();
+      arg.ids = VectorT<int>::create(batchSize, false);
+      arg.ids->rand(layerSize);
+      generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
+      data.inArgs[i].push_back(arg);
+    }
+  }
+
+  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false);
+      grad->randomizeUniform();
+      data.outGrads[i].push_back(grad);
+    }
+  }
+
+  for (const auto& para_config : config.getModelConfig().parameters()) {
+    VectorPtr value = Vector::create(para_config.size(), false);
+    value->randnorm(0, 2);
+    data.paraValues.push_back(value);
+  }
+}
+
+void MKLDNNTester::getOutResult(const std::string& configPath,
+                                DataIn& in,
+                                DataOut& out,
+                                bool use_mkldnn,
+                                size_t iter) {
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = use_mkldnn;
+  *ThreadLocalRand::getSeed() = 1;
+  srand(1);
+
+  Trainer trainer;
+  auto config = std::make_shared<TrainerConfigHelper>(configPath);
+  trainer.init(config, false);
+  auto gradientMachine = trainer.getGradientMachine();
+  std::vector<ParameterPtr> parameters = gradientMachine->getParameters();
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
+  }
+  UpdateCallback simpleUpdate = [](Parameter* para) {
+    auto& grad = para->getBuf(PARAMETER_GRADIENT);
+    auto& value = para->getBuf(PARAMETER_VALUE);
+    real lr = 1e-2;
+    value->add(*grad, lr);
+    grad->zeroMem();
+  };
+
+  vector<Argument> outArgs;
+  gradientMachine->start();
+  out.outValues.clear();
+  out.paraValues.clear();
+  for (size_t i = 0; i < iter; ++i) {
+    VLOG(MKLDNN_TESTS) << "runing iteration " << i;
+    gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
+    // save forward result
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      const MatrixPtr& src = outArgs[k].value;
+      MatrixPtr dst =
+          Matrix::create(src->getHeight(), src->getWidth(), false, false);
+      if (typeid(*src) == typeid(MKLDNNMatrix)) {
+        MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast<MKLDNNMatrix>(src);
+        dnnSrc->copyTo(*dst);
+      } else {
+        dst->copyFrom(*src);
+      }
+      out.outValues.push_back(dst);
+    }
+
+    // random backward input
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      outArgs[k].grad->copyFrom(*in.outGrads[i][k]);
+    }
+    gradientMachine->backward(simpleUpdate);
+  }
+  gradientMachine->finish();
+
+  // save param value
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    VectorPtr val = Vector::create(
+        parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false);
+    val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
+    out.paraValues.push_back(val);
+  }
+}
+
+void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
+  CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
+  CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  for (size_t i = 0; i < ref.outValues.size(); i++) {
+    VLOG(MKLDNN_TESTS) << "compare value index: " << i;
+    EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
+  }
+  for (size_t i = 0; i < ref.paraValues.size(); i++) {
+    VLOG(MKLDNN_TESTS) << "compare param index: " << i;
+    EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
+  }
+}
+
+void MKLDNNTester::runNetTest(const std::string& configPath,
+                              size_t iter,
+                              float eps) {
+  DataIn in;
+  initArgument(in, configPath, iter);
+  DataOut outCpu, outDnn;
+  VLOG(MKLDNN_TESTS) << "runing cpu network";
+  getOutResult(configPath, in, outCpu, false, iter);
+  VLOG(MKLDNN_TESTS) << "runing mkldnn network";
+  getOutResult(configPath, in, outDnn, true, iter);
+
+  compareResult(outCpu, outDnn, eps);
+}
+
+}  //  namespace paddle
diff --git a/paddle/legacy/gserver/tests/MKLDNNTester.h b/paddle/legacy/gserver/tests/MKLDNNTester.h
new file mode 100644
index 0000000000000000000000000000000000000000..086846ce537857eb76ffca492246677eb7982a42
--- /dev/null
+++ b/paddle/legacy/gserver/tests/MKLDNNTester.h
@@ -0,0 +1,143 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "LayerGradUtil.h"
+#include "paddle/legacy/gserver/layers/MKLDNNBase.h"
+#include "paddle/legacy/gserver/layers/MKLDNNLayer.h"
+
+namespace paddle {
+
+/**
+ * @brief test the functionality of MKLDNNlayers and MKLDNNActivations
+ * refer to paddle original function
+ */
+class MKLDNNTester {
+  enum {
+    DNN = 0,  // MKLDNN layer
+    REF = 1,  // Reference layer
+    NUM = 2,  // Number of total
+  };
+
+  struct DataIn {
+    std::vector<std::vector<Argument>> inArgs;
+    std::vector<std::vector<MatrixPtr>> outGrads;
+    std::vector<VectorPtr> paraValues;
+  };
+
+  struct DataOut {
+    std::vector<MatrixPtr> outValues;
+    std::vector<VectorPtr> paraValues;
+  };
+
+ protected:
+  std::vector<TestConfig> configs_;
+  vector<string> layerNames_;
+  vector<vector<DataLayerPtr>> dataLayers_;
+  vector<vector<Argument>> datas_;
+  vector<LayerMap> layerMaps_;
+  vector<vector<ParameterPtr>> parameters_;
+  vector<LayerPtr> testLayers_;
+  LayerPtr refLayer_, dnnLayer_;
+
+  /// run some iterations, all the result should pass
+  size_t iter_;
+  /// whether to print out the details
+  bool log_;
+  /// epsilon
+  float eps_;
+  /// input image size, default 1
+  size_t ih_, iw_;
+  /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass)
+  PassType passType_;
+
+ public:
+  explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
+    iter_ = iter;
+    eps_ = epsilon;
+    log_ = false;
+    passType_ = PASS_TRAIN;
+  }
+
+  ~MKLDNNTester() {}
+
+ public:
+  void run(const TestConfig& dnn,
+           const TestConfig& ref,
+           size_t batchSize,
+           size_t inputImgH = 1,
+           size_t inputImgW = 1,
+           PassType passType = PASS_TRAIN,
+           bool printDetails = false,
+           size_t iter = 3,
+           float epsilon = 1e-4);
+  static void runNetTest(const std::string& configPath,
+                         size_t iter = 2,
+                         float eps = 1e-4);
+  static void initArgument(DataIn& data,
+                           const std::string& configPath,
+                           size_t iter = 2);
+  static void getOutResult(const std::string& configPath,
+                           DataIn& in,
+                           DataOut& out,
+                           bool use_mkldnn,
+                           size_t iter = 2);
+
+ private:
+  void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
+  void setInputImgSize();
+  void runOnce();
+
+  void randomWgtDatas();
+  void randomBotDatas();
+  void randomTopDiffs();
+
+  void checkForward();
+  void checkBackwardData();
+  void checkBackwardWgts();
+
+  // clear specific layer, clear all when id equals NUM
+  void clearWgtDiffs(size_t id = NUM);
+  void clearBotDiffs(size_t id = NUM);
+  void clearTopDatas(size_t id = NUM);
+
+  void printTopDatas();
+  void printMatrix(const MatrixPtr& m);
+  void printVector(const VectorPtr& v);
+
+  void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
+  void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
+
+  static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
+  static double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+  static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4);
+
+  /**
+   * Get delta percent
+   * if many(>failRate) wrong(abs(val-ref)/abs(ref) > thres) points
+   * return the max(diff/ref)
+   * else return sum(abs(diff)) / sum(abs(ref))
+   * The return value should be smaller than eps when passing.
+   */
+  static double getDelta(const real* refer,
+                         const real* value,
+                         size_t len,
+                         const float failRate = 1e-3,
+                         const float thres = 0.1);
+};
+
+}  //  namespace paddle
diff --git a/paddle/gserver/tests/Sequence/dummy.list b/paddle/legacy/gserver/tests/Sequence/dummy.list
similarity index 100%
rename from paddle/gserver/tests/Sequence/dummy.list
rename to paddle/legacy/gserver/tests/Sequence/dummy.list
diff --git a/paddle/gserver/tests/Sequence/tour_dict_phrase.dict b/paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict
similarity index 100%
rename from paddle/gserver/tests/Sequence/tour_dict_phrase.dict
rename to paddle/legacy/gserver/tests/Sequence/tour_dict_phrase.dict
diff --git a/paddle/gserver/tests/Sequence/tour_train_wdseg b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
similarity index 100%
rename from paddle/gserver/tests/Sequence/tour_train_wdseg
rename to paddle/legacy/gserver/tests/Sequence/tour_train_wdseg
diff --git a/paddle/gserver/tests/Sequence/tour_train_wdseg.nest b/paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
similarity index 100%
rename from paddle/gserver/tests/Sequence/tour_train_wdseg.nest
rename to paddle/legacy/gserver/tests/Sequence/tour_train_wdseg.nest
diff --git a/paddle/legacy/gserver/tests/Sequence/train.list b/paddle/legacy/gserver/tests/Sequence/train.list
new file mode 100644
index 0000000000000000000000000000000000000000..1109a2449252cb9bfcb10ece4cf9a96e655e5a25
--- /dev/null
+++ b/paddle/legacy/gserver/tests/Sequence/train.list
@@ -0,0 +1 @@
+legacy/gserver/tests/Sequence/tour_train_wdseg
diff --git a/paddle/legacy/gserver/tests/Sequence/train.list.nest b/paddle/legacy/gserver/tests/Sequence/train.list.nest
new file mode 100644
index 0000000000000000000000000000000000000000..a67df35024f456d517899f37272b0f74d822f03d
--- /dev/null
+++ b/paddle/legacy/gserver/tests/Sequence/train.list.nest
@@ -0,0 +1 @@
+legacy/gserver/tests/Sequence/tour_train_wdseg.nest
diff --git a/paddle/gserver/tests/__init__.py b/paddle/legacy/gserver/tests/__init__.py
similarity index 100%
rename from paddle/gserver/tests/__init__.py
rename to paddle/legacy/gserver/tests/__init__.py
diff --git a/paddle/gserver/tests/concat_dotmul_a.conf b/paddle/legacy/gserver/tests/concat_dotmul_a.conf
similarity index 100%
rename from paddle/gserver/tests/concat_dotmul_a.conf
rename to paddle/legacy/gserver/tests/concat_dotmul_a.conf
diff --git a/paddle/gserver/tests/concat_dotmul_b.conf b/paddle/legacy/gserver/tests/concat_dotmul_b.conf
similarity index 100%
rename from paddle/gserver/tests/concat_dotmul_b.conf
rename to paddle/legacy/gserver/tests/concat_dotmul_b.conf
diff --git a/paddle/gserver/tests/concat_fullmatrix_a.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_a.conf
similarity index 100%
rename from paddle/gserver/tests/concat_fullmatrix_a.conf
rename to paddle/legacy/gserver/tests/concat_fullmatrix_a.conf
diff --git a/paddle/gserver/tests/concat_fullmatrix_b.conf b/paddle/legacy/gserver/tests/concat_fullmatrix_b.conf
similarity index 100%
rename from paddle/gserver/tests/concat_fullmatrix_b.conf
rename to paddle/legacy/gserver/tests/concat_fullmatrix_b.conf
diff --git a/paddle/gserver/tests/concat_slice_a.conf b/paddle/legacy/gserver/tests/concat_slice_a.conf
similarity index 100%
rename from paddle/gserver/tests/concat_slice_a.conf
rename to paddle/legacy/gserver/tests/concat_slice_a.conf
diff --git a/paddle/gserver/tests/concat_slice_b.conf b/paddle/legacy/gserver/tests/concat_slice_b.conf
similarity index 100%
rename from paddle/gserver/tests/concat_slice_b.conf
rename to paddle/legacy/gserver/tests/concat_slice_b.conf
diff --git a/paddle/gserver/tests/concat_table_a.conf b/paddle/legacy/gserver/tests/concat_table_a.conf
similarity index 100%
rename from paddle/gserver/tests/concat_table_a.conf
rename to paddle/legacy/gserver/tests/concat_table_a.conf
diff --git a/paddle/gserver/tests/concat_table_b.conf b/paddle/legacy/gserver/tests/concat_table_b.conf
similarity index 100%
rename from paddle/gserver/tests/concat_table_b.conf
rename to paddle/legacy/gserver/tests/concat_table_b.conf
diff --git a/paddle/gserver/tests/img_conv_a.conf b/paddle/legacy/gserver/tests/img_conv_a.conf
similarity index 100%
rename from paddle/gserver/tests/img_conv_a.conf
rename to paddle/legacy/gserver/tests/img_conv_a.conf
diff --git a/paddle/gserver/tests/img_conv_b.conf b/paddle/legacy/gserver/tests/img_conv_b.conf
similarity index 100%
rename from paddle/gserver/tests/img_conv_b.conf
rename to paddle/legacy/gserver/tests/img_conv_b.conf
diff --git a/paddle/gserver/tests/img_conv_c.conf b/paddle/legacy/gserver/tests/img_conv_c.conf
similarity index 100%
rename from paddle/gserver/tests/img_conv_c.conf
rename to paddle/legacy/gserver/tests/img_conv_c.conf
diff --git a/paddle/gserver/tests/img_conv_cudnn.py b/paddle/legacy/gserver/tests/img_conv_cudnn.py
similarity index 100%
rename from paddle/gserver/tests/img_conv_cudnn.py
rename to paddle/legacy/gserver/tests/img_conv_cudnn.py
diff --git a/paddle/gserver/tests/img_conv_exconv.py b/paddle/legacy/gserver/tests/img_conv_exconv.py
similarity index 100%
rename from paddle/gserver/tests/img_conv_exconv.py
rename to paddle/legacy/gserver/tests/img_conv_exconv.py
diff --git a/paddle/gserver/tests/img_pool_a.conf b/paddle/legacy/gserver/tests/img_pool_a.conf
similarity index 100%
rename from paddle/gserver/tests/img_pool_a.conf
rename to paddle/legacy/gserver/tests/img_pool_a.conf
diff --git a/paddle/gserver/tests/img_pool_b.conf b/paddle/legacy/gserver/tests/img_pool_b.conf
similarity index 100%
rename from paddle/gserver/tests/img_pool_b.conf
rename to paddle/legacy/gserver/tests/img_pool_b.conf
diff --git a/paddle/gserver/tests/mkldnn_branch_net.conf b/paddle/legacy/gserver/tests/mkldnn_branch_net.conf
similarity index 100%
rename from paddle/gserver/tests/mkldnn_branch_net.conf
rename to paddle/legacy/gserver/tests/mkldnn_branch_net.conf
diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/legacy/gserver/tests/mkldnn_simple_net.conf
similarity index 100%
rename from paddle/gserver/tests/mkldnn_simple_net.conf
rename to paddle/legacy/gserver/tests/mkldnn_simple_net.conf
diff --git a/paddle/gserver/tests/pyDataProvider.py b/paddle/legacy/gserver/tests/pyDataProvider.py
similarity index 100%
rename from paddle/gserver/tests/pyDataProvider.py
rename to paddle/legacy/gserver/tests/pyDataProvider.py
diff --git a/paddle/gserver/tests/pyDataProvider/pyDataProviderList b/paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList
similarity index 100%
rename from paddle/gserver/tests/pyDataProvider/pyDataProviderList
rename to paddle/legacy/gserver/tests/pyDataProvider/pyDataProviderList
diff --git a/paddle/gserver/tests/pyDataProvider/trainer.conf b/paddle/legacy/gserver/tests/pyDataProvider/trainer.conf
similarity index 100%
rename from paddle/gserver/tests/pyDataProvider/trainer.conf
rename to paddle/legacy/gserver/tests/pyDataProvider/trainer.conf
diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/legacy/gserver/tests/rnn_data_provider.py
similarity index 100%
rename from paddle/gserver/tests/rnn_data_provider.py
rename to paddle/legacy/gserver/tests/rnn_data_provider.py
diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/legacy/gserver/tests/sequenceGen.py
similarity index 100%
rename from paddle/gserver/tests/sequenceGen.py
rename to paddle/legacy/gserver/tests/sequenceGen.py
diff --git a/paddle/legacy/gserver/tests/sequence_layer_group.conf b/paddle/legacy/gserver/tests/sequence_layer_group.conf
new file mode 100644
index 0000000000000000000000000000000000000000..ad1b61d5821fd20135e61bb95abdea16d27a6a9a
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_layer_group.conf
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 256
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+# (lstm_input + lstm) is equal to lstmemory 
+with mixed_layer(size=hidden_dim * 4) as lstm_input:
+    lstm_input += full_matrix_projection(input=emb)
+
+lstm = lstmemory_group(
+    input=lstm_input,
+    size=hidden_dim,
+    act=TanhActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=TanhActivation())
+
+lstm_last = last_seq(input=lstm)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_lstm.conf b/paddle/legacy/gserver/tests/sequence_lstm.conf
new file mode 100644
index 0000000000000000000000000000000000000000..6ab70e70713f31de31b5cd544cf132e7d0af0f2f
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_lstm.conf
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 256
+label_dim = 3
+sparse_update = get_config_arg("sparse_update", bool, False)
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data,
+    size=word_dim,
+    param_attr=ParamAttr(sparse_update=sparse_update))
+
+with mixed_layer(size=hidden_dim * 4) as lstm_input:
+    lstm_input += full_matrix_projection(input=emb)
+
+lstm = lstmemory(
+    input=lstm_input,
+    act=TanhActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=TanhActivation())
+
+lstm_last = last_seq(input=lstm)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf b/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
new file mode 100644
index 0000000000000000000000000000000000000000..75c36b118979760e034f81e3127a748651f53347
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/train.list.nest',
+    test_list=None,
+    module='sequenceGen',
+    obj='process2',
+    args={"dict_file": dict_file})
+
+settings(batch_size=2)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 256
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb_group = embedding_layer(input=data, size=word_dim)
+
+
+# (lstm_input + lstm) is equal to lstmemory 
+def lstm_group(lstm_group_input):
+    with mixed_layer(size=hidden_dim * 4) as group_input:
+        group_input += full_matrix_projection(input=lstm_group_input)
+
+    lstm_output = lstmemory_group(
+        input=group_input,
+        name="lstm_group",
+        size=hidden_dim,
+        act=TanhActivation(),
+        gate_act=SigmoidActivation(),
+        state_act=TanhActivation())
+    return lstm_output
+
+
+lstm_nest_group = recurrent_group(
+    input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group")
+# hasSubseq ->(seqlastins) seq
+lstm_last = last_seq(
+    input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE)
+
+# seq ->(expand) hasSubseq
+lstm_expand = expand_layer(
+    input=lstm_last,
+    expand_as=emb_group,
+    expand_level=ExpandLevel.FROM_SEQUENCE)
+
+# hasSubseq ->(average) seq
+lstm_average = pooling_layer(
+    input=lstm_expand,
+    pooling_type=AvgPooling(),
+    agg_level=AggregateLevel.TO_SEQUENCE)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_average)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn.conf
new file mode 100644
index 0000000000000000000000000000000000000000..bc3b22c2a946a62c7a9d3163d3863a090d63539c
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_nest_rnn.conf
@@ -0,0 +1,74 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_subseq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+# This hierachical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn.conf
+
+def outer_step(x):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+    def inner_step(y):
+        inner_mem = memory(name="inner_rnn_state",
+                           size=hidden_dim,
+                           boot_layer=outer_mem)
+        out = fc_layer(input=[y, inner_mem],
+                        size=hidden_dim,
+                        act=TanhActivation(),
+                        bias_attr=True,
+                        name="inner_rnn_state")
+        return out
+
+    inner_rnn_output = recurrent_group(
+        step=inner_step,
+        name="inner",
+        input=x)
+    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
+
+    # "return last" won't work, because recurrent_group only support the input 
+    # sequence type is same as return sequence type.
+    return inner_rnn_output
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=SubsequenceInput(emb))
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
new file mode 100644
index 0000000000000000000000000000000000000000..165ab229897d32ce2cae1d483b3ffd81392a355a
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_input.conf
@@ -0,0 +1,76 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_subseq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+# This hierachical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn.conf
+
+def outer_step(wid, x):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+    def inner_step(y, wid):
+        z = embedding_layer(input=wid, size=word_dim)
+        inner_mem = memory(name="inner_rnn_state",
+                           size=hidden_dim,
+                           boot_layer=outer_mem)
+        out = fc_layer(input=[y, z, inner_mem],
+                        size=hidden_dim,
+                        act=TanhActivation(),
+                        bias_attr=True,
+                        name="inner_rnn_state")
+        return out
+
+    inner_rnn_output = recurrent_group(
+        step=inner_step,
+        name="inner",
+        input=[x, wid])
+    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
+
+    # "return last" should also work. But currently RecurrentGradientMachine
+    # does not handle it, and will report error: In hierachical RNN, all out
+    # links should be from sequences now.
+    return inner_rnn_output
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[SubsequenceInput(data), SubsequenceInput(emb)])
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a48b7f25c454b492d20e807f09f6d788af44681
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
@@ -0,0 +1,96 @@
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_unequalength_subseq')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 2
+
+speaker1 = data_layer(name="word1", size=dict_dim)
+speaker2 = data_layer(name="word2", size=dict_dim)
+
+emb1 = embedding_layer(input=speaker1, size=word_dim)
+emb2 = embedding_layer(input=speaker2, size=word_dim)
+
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn_multi_unequalength_inputs.conf
+def outer_step(x1, x2):
+    index = [0]
+
+    def inner_step(ipt):
+        index[0] += 1
+        i = index[0]
+        outer_mem = memory(name="outer_rnn_state_%d" % i, size=hidden_dim)
+
+        def inner_step_impl(y):
+            inner_mem = memory(
+                name="inner_rnn_state_" + y.name,
+                size=hidden_dim,
+                boot_layer=outer_mem)
+            out = fc_layer(
+                input=[y, inner_mem],
+                size=hidden_dim,
+                act=TanhActivation(),
+                bias_attr=True,
+                name='inner_rnn_state_' + y.name)
+            return out
+
+        encoder = recurrent_group(
+            step=inner_step_impl, name='inner_%d' % i, input=ipt)
+        last = last_seq(name="outer_rnn_state_%d" % i, input=encoder)
+        return encoder, last
+
+    encoder1, sentence_last_state1 = inner_step(ipt=x1)
+    encoder2, sentence_last_state2 = inner_step(ipt=x2)
+
+    encoder1_expand = expand_layer(
+        input=sentence_last_state1, expand_as=encoder2)
+
+    return [encoder1_expand, encoder2]
+
+
+encoder1_rep, encoder2_rep = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
+    targetInlink=emb2)
+
+encoder1_last = last_seq(input=encoder1_rep)
+encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
+context = mixed_layer(
+    input=[
+        identity_projection(encoder1_expandlast),
+        identity_projection(encoder2_rep)
+    ],
+    size=hidden_dim)
+
+rep = last_seq(input=context)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(
+    classification_cost(
+        input=prob, label=data_layer(
+            name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_recurrent.py b/paddle/legacy/gserver/tests/sequence_recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2c6a7935c28838fb12fc6e44d99dd59636bf7dd
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_recurrent.py
@@ -0,0 +1,55 @@
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent with sequence_recurrent_group.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_recurrent_group.py b/paddle/legacy/gserver/tests/sequence_recurrent_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4638bd9075ff5cdd4a5ed1bc0e0d133f9a9ab86
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_recurrent_group.py
@@ -0,0 +1,68 @@
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'legacy/gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent with sequence_recurrent.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    with mixed_layer(
+            name="rnn_state",
+            size=hidden_dim,
+            bias_attr=False,
+            act=SoftmaxActivation()) as out:
+        out += identity_projection(input=y)
+        out += full_matrix_projection(
+            input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
+    return out
+
+
+recurrent = recurrent_group(name="rnn", step=step, input=emb)
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn.conf b/paddle/legacy/gserver/tests/sequence_rnn.conf
new file mode 100644
index 0000000000000000000000000000000000000000..3133595c9ce4c25683c06d326a5ebe9d2bf13077
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_rnn.conf
@@ -0,0 +1,57 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_seq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    out = fc_layer(input=[y, mem],
+                    size=hidden_dim,
+                    act=TanhActivation(),
+                    bias_attr=True,
+                    name="rnn_state")
+    return out
+
+out = recurrent_group(
+    name="rnn",
+    step=step,
+    input=emb)
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..921cef04dda0da396a79592b09d7a7e7177462d5
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_rnn_matched_inputs.py
@@ -0,0 +1,84 @@
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_mixed')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 2
+hidden_dim = 2
+label_dim = 2
+
+data1 = data_layer(name="word1", size=dict_dim)
+data2 = data_layer(name="word2", size=dict_dim)
+label = data_layer(name="label", size=label_dim)
+
+encoding = embedding_layer(input=data2, size=word_dim)
+
+subseq = embedding_layer(input=data1, size=word_dim)
+seq = embedding_layer(input=data2, size=word_dim)
+nonseq = embedding_layer(input=label, size=word_dim)
+
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn_mixed_inputs.conf
+def outer_step(subseq, seq, nonseq, encoding):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+
+    def inner_step(subseq, seq, nonseq):
+        inner_mem = memory(
+            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
+
+        out = fc_layer(
+            input=[subseq, seq, nonseq, inner_mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='inner_rnn_state')
+        return out
+
+    decoder = recurrent_group(
+        step=inner_step, name='inner', input=[subseq, seq, nonseq])
+    last = last_seq(name="outer_rnn_state", input=decoder)
+    context = simple_attention(
+        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
+    return context
+
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[
+        subseq, expand_layer(
+            seq, expand_as=subseq,
+            expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer(
+                nonseq,
+                expand_as=subseq,
+                expand_level=ExpandLevel.FROM_NO_SEQUENCE),
+        StaticInput(encoding)
+    ])
+
+rep = last_seq(input=out)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7bcaf6c4b21272e1c95d6de7e69e4558d52b9c6
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_rnn_mixed_inputs.py
@@ -0,0 +1,78 @@
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_mixed')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 2
+hidden_dim = 2
+label_dim = 2
+
+data1 = data_layer(name="word1", size=dict_dim)
+data2 = data_layer(name="word2", size=dict_dim)
+label = data_layer(name="label", size=label_dim)
+
+encoding = embedding_layer(input=data2, size=word_dim)
+
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn_matched_inputs.conf
+def outer_step(subseq, seq, nonseq, encoding):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+
+    def inner_step(data1, data2, label):
+        inner_mem = memory(
+            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
+
+        subseq = embedding_layer(input=data1, size=word_dim)
+        seq = embedding_layer(input=data2, size=word_dim)
+        nonseq = embedding_layer(input=label, size=word_dim)
+
+        print_layer(input=[data1, seq, label, inner_mem])
+        out = fc_layer(
+            input=[subseq, seq, nonseq, inner_mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='inner_rnn_state')
+        return out
+
+    decoder = recurrent_group(
+        step=inner_step, name='inner',
+        input=[subseq, StaticInput(seq), nonseq])
+    last = last_seq(name="outer_rnn_state", input=decoder)
+    context = simple_attention(
+        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
+    return context
+
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[data1, data2, StaticInput(label), StaticInput(encoding)])
+
+rep = last_seq(input=out)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf b/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf
new file mode 100644
index 0000000000000000000000000000000000000000..bf4be779a23e081cef33ce2b2734ad91cfa33c0d
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_rnn_multi_input.conf
@@ -0,0 +1,58 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='legacy/gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_seq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+def step(y, wid):
+    z = embedding_layer(input=wid, size=word_dim)
+    mem = memory(name="rnn_state", size=hidden_dim)
+    out = fc_layer(input=[y, z, mem],
+                    size=hidden_dim,
+                    act=TanhActivation(),
+                    bias_attr=True,
+                    name="rnn_state")
+    return out
+
+out = recurrent_group(
+    name="rnn",
+    step=step,
+    input=[emb, data])
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
new file mode 100644
index 0000000000000000000000000000000000000000..3612b49c2279874a378d4aaed81623f7d0d2ea2f
--- /dev/null
+++ b/paddle/legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
@@ -0,0 +1,76 @@
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='legacy/gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_unequalength_seq')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 2
+
+speaker1 = data_layer(name="word1", size=dict_dim)
+speaker2 = data_layer(name="word2", size=dict_dim)
+
+emb1 = embedding_layer(input=speaker1, size=word_dim)
+emb2 = embedding_layer(input=speaker2, size=word_dim)
+
+# This hierachical RNN is designed to be equivalent to the RNN in
+# sequence_nest_rnn_multi_unequalength_inputs.conf
+
+
+def step(x1, x2):
+    def calrnn(y):
+        mem = memory(name='rnn_state_' + y.name, size=hidden_dim)
+        out = fc_layer(
+            input=[y, mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='rnn_state_' + y.name)
+        return out
+
+    encoder1 = calrnn(x1)
+    encoder2 = calrnn(x2)
+    return [encoder1, encoder2]
+
+
+encoder1_rep, encoder2_rep = recurrent_group(
+    name="stepout", step=step, input=[emb1, emb2])
+
+encoder1_last = last_seq(input=encoder1_rep)
+encoder1_expandlast = expand_layer(input=encoder1_last, expand_as=encoder2_rep)
+context = mixed_layer(
+    input=[
+        identity_projection(encoder1_expandlast),
+        identity_projection(encoder2_rep)
+    ],
+    size=hidden_dim)
+
+rep = last_seq(input=context)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(
+    classification_cost(
+        input=prob, label=data_layer(
+            name="label", size=label_dim)))
diff --git a/paddle/legacy/gserver/tests/test_ActivationGrad.cpp b/paddle/legacy/gserver/tests/test_ActivationGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f468d229a889e02bf79baa29576c638acbd8eb08
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_ActivationGrad.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+void testActivation(const string& act) {
+  LOG(INFO) << "test activation: " << act;
+  size_t size = 10;
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("addto");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type(act);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  act + "_activation",
+                  100,
+                  /* trans= */ false,
+                  useGpu,
+                  /* useWeight */ true);
+  }
+}
+
+TEST(Activation, activation) {
+  auto types = ActivationFunction::getAllRegisteredTypes();
+  std::set<string> excluded{"sequence_softmax"};
+  for (auto type : types) {
+    if (excluded.count(type)) continue;
+    testActivation(type);
+  }
+}
+
+void testSequenceSoftmaxAct(bool hasSubseq) {
+  LOG(INFO) << "test activation: sequence softmax";
+
+  const size_t size = 1;
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("addto");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sequence_softmax");
+  config.inputDefs.push_back(
+      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+       "layer_0",
+       1,
+       0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sequence_softmax",
+                  100,
+                  /* trans= */ false,
+                  useGpu,
+                  /* useWeight */ true);
+  }
+}
+
+TEST(SequenceSoftmaxActivation, activation) {
+  for (auto hasSubseq : {false, true}) {
+    LOG(INFO) << "hasSubseq = " << hasSubseq;
+    testSequenceSoftmaxAct(hasSubseq);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_BatchNorm.cpp b/paddle/legacy/gserver/tests/test_BatchNorm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e21fa16074406645be88eeb454d743531f825041
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_BatchNorm.cpp
@@ -0,0 +1,195 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/legacy/cuda/include/hl_batch_norm.h"
+#include "paddle/legacy/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
+
+// Test that the batchNormLayer can be followed by a ConvLayer
+TEST(Layer, batchNorm) {
+  FLAGS_use_gpu = false;
+  TestConfig configBN;
+  const int CHANNELS = 6272;
+  const int IMG_SIZE = 1;
+  configBN.layerConfig.set_type("batch_norm");
+  configBN.layerConfig.set_name("bn");
+  configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
+  configBN.layerConfig.set_active_type("relu");
+  configBN.biasSize = CHANNELS;
+  configBN.inputDefs.push_back({INPUT_DATA,
+                                "layer_0",
+                                /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
+                                /* paraSize= */ CHANNELS});
+
+  configBN.inputDefs.push_back(
+      {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
+  configBN.inputDefs.back().isStatic = true;
+  configBN.inputDefs.push_back(
+      {INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
+  configBN.inputDefs.back().isStatic = true;
+
+  LayerInputConfig* input = configBN.layerConfig.add_inputs();
+  configBN.layerConfig.add_inputs();
+  configBN.layerConfig.add_inputs();
+
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(CHANNELS);
+  img_conf->set_img_size(IMG_SIZE);
+
+  // Setting up conv-layer config
+  TestConfig config;
+  config.biasSize = 64;
+  config.layerConfig.set_type("exconv");
+  config.layerConfig.set_num_filters(64);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800});
+  input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(5);
+  conv->set_filter_size_y(5);
+  conv->set_channels(128);
+  conv->set_padding(1);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(7);
+  conv->set_output_x(3);
+  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+                              config.layerConfig.num_filters());
+  config.layerConfig.set_name("conv");
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(configBN,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "batch_norm",
+                100,
+                false,
+                false);
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr bnLayer;
+  initTestLayer(configBN, &layerMap, &parameters, &bnLayer);
+
+  std::vector<ParameterPtr> parameters2;
+  LayerPtr convLayer;
+  initTestLayer(config, &layerMap, &parameters2, &convLayer);
+
+  bnLayer->forward(PASS_GC);
+  convLayer->forward(PASS_GC);
+
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getHeight()), 100);
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
+}
+
+#ifdef PADDLE_WITH_CUDA
+void batchNormInference(int n, int c, int h, int w) {
+  MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
+  MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
+  MatrixPtr cudaOut = std::make_shared<GpuMatrix>(n, c * h * w);
+  MatrixPtr cudnnCheck = std::make_shared<CpuMatrix>(n, c * h * w);
+  MatrixPtr cudaCheck = std::make_shared<CpuMatrix>(n, c * h * w);
+  input->randomizeUniform();
+  cudnnOut->zeroMem();
+  cudaOut->zeroMem();
+
+  MatrixPtr scale = std::make_shared<GpuMatrix>(1, c);
+  scale->randomizeUniform();
+  MatrixPtr bias = std::make_shared<GpuMatrix>(1, c);
+  bias->randomizeUniform();
+
+  MatrixPtr movingMean = std::make_shared<GpuMatrix>(1, c);
+  movingMean->randomizeUniform();
+
+  MatrixPtr movingVar = std::make_shared<GpuMatrix>(1, c);
+  movingVar->randomizeUniform();
+  movingVar->clip(0.01, 50);
+
+  hl_tensor_descriptor ioDesc;
+  hl_tensor_descriptor bnDesc;
+  hl_create_tensor_descriptor(&ioDesc);
+  hl_create_tensor_descriptor(&bnDesc);
+  hl_tensor_reshape(ioDesc, n, c, h, w);
+  hl_tensor_reshape(bnDesc, 1, c, 1, 1);
+
+  double EPS = 1E-5;
+  hl_batch_norm_forward_inference(ioDesc,
+                                  input->getData(),
+                                  ioDesc,
+                                  cudnnOut->getData(),
+                                  bnDesc,
+                                  scale->getData(),
+                                  bias->getData(),
+                                  movingMean->getData(),
+                                  movingVar->getData(),
+                                  EPS);
+
+  hl_batch_norm_cuda_inference(input->getData(),
+                               cudaOut->getData(),
+                               scale->getData(),
+                               bias->getData(),
+                               movingMean->getData(),
+                               movingVar->getData(),
+                               EPS,
+                               n,
+                               c,
+                               h,
+                               w);
+
+  cudnnCheck->copyFrom(*cudnnOut);
+  cudaCheck->copyFrom(*cudaOut);
+  autotest::TensorCheckErr(*cudnnCheck, *cudaCheck);
+
+  hl_destroy_tensor_descriptor(ioDesc);
+  hl_destroy_tensor_descriptor(bnDesc);
+}
+
+TEST(BatchNorm, Inference) {
+  batchNormInference(33, 267, 1, 1);
+  batchNormInference(19, 105, 4, 4);
+}
+#endif
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp b/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1dafd1de4d82f1d306626090c30cf9203fa24dd0
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_CRFLayerGrad.cpp
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/gserver/layers/LinearChainCRF.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
+  for (auto& v : seq) {
+    if (++v < numClasses) {
+      return true;
+    }
+    v = 0;
+  }
+  return false;
+}
+
+// log(exp(x) + exp(y))
+static inline real logSum(real x, real y) {
+  real maxValue = std::max(x, y);
+  if (std::isinf(maxValue)) {
+    return -std::numeric_limits<real>::infinity();
+  } else {
+    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
+  }
+}
+
+static inline std::vector<int> genRandLabels(int numClasses, int length) {
+  std::vector<int> labels(length);
+  for (int i = 0; i < length; ++i) {
+    labels[i] = rand() % numClasses;  // NOLINT
+  }
+  return labels;
+}
+
+TEST(CRFLayer, cost) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();
+  real* b = para.getData() + numClasses;
+  real* w = para.getData() + 2 * numClasses;
+  LinearChainCRF crf(4, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+
+      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
+
+      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
+
+      real logZ = -std::numeric_limits<real>::infinity();
+      real logNominator = -std::numeric_limits<real>::infinity();
+      std::vector<int> testResult(length, 0);
+      do {
+        real score = a[testResult.front()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        score += b[testResult.back()];
+        logZ = logSum(logZ, score);
+
+        if (goldenLabels == testResult) {
+          logNominator = score;
+        }
+      } while (getNextSequence(testResult, numClasses));
+
+      real trueCost = -logNominator + logZ;
+
+      real diff = fabs(trueCost - cost);
+      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
+      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff
+              << std::endl;
+      if (typeid(real) == typeid(double)) {  // NOLINT
+        EXPECT_LE(diff, 1e-10);
+      } else {
+        EXPECT_LE(diff, 5e-3);
+      }
+    }
+  }
+}
+
+inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }
+
+TestConfig initTestConfig(size_t numClasses, bool withWeight) {
+  TestConfig config;
+  config.layerConfig.set_type("crf");
+  config.layerConfig.set_size(numClasses);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
+                              "layer_0",
+                              numClasses,
+                              numClasses * (numClasses + 2)});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
+  config.layerConfig.add_inputs();
+
+  if (withWeight) {
+    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
+    config.layerConfig.add_inputs();
+  }
+
+  return config;
+}
+
+TEST(Layer, CRFLayer) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
+    for (int length : {1, 3, 100}) {
+      // Not support GPU now
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+TEST(Layer, CRFLayerUseWeight) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
+    for (int length : {1, 3, 100}) {
+      // Not support GPU now
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_CompareSparse.cpp b/paddle/legacy/gserver/tests/test_CompareSparse.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..11b633a5885180ae227f6e93330117b567d4a4ab
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_CompareSparse.cpp
@@ -0,0 +1,228 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/legacy/utils/PythonUtil.h>
+
+#include "paddle/legacy/trainer/Trainer.h"
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/pserver/ParameterServer2.h>
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static const string& configFile1 = "legacy/gserver/tests/sequence_lstm.conf";
+
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_int32(seed);
+DECLARE_int32(num_passes);
+DECLARE_int32(saving_period);
+
+DECLARE_int32(num_gradient_servers);
+DECLARE_int32(port);
+DECLARE_bool(local);
+DECLARE_bool(use_old_updater);
+DECLARE_bool(parallel_nn);
+DECLARE_string(config_args);
+DEFINE_double(max_diff_ratio,
+              0.0f,
+              "max diff ratio allowed for parameters value");
+
+int gNumDevices = 0;
+
+std::vector<ParameterPtr> trainerOnePassTest(const string& configFile,
+                                             bool sparseUpdate,
+                                             int trainerCount = 1,
+                                             bool useGpu = false) {
+  FLAGS_use_gpu = useGpu;
+  FLAGS_config = configFile;
+  FLAGS_trainer_count = trainerCount;
+  FLAGS_config_args = sparseUpdate ? "sparse_update=1" : "sparse_update=0";
+
+  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
+            << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate;
+  srand(FLAGS_seed);
+  *ThreadLocalRand::getSeed() = FLAGS_seed;
+  ThreadLocalRandomEngine::get().seed(FLAGS_seed);
+  if (useGpu) {
+    CHECK_LE(trainerCount, gNumDevices);
+  }
+
+  std::vector<std::shared_ptr<ParameterServer2>> pservers;
+  if (!FLAGS_local) {
+    int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
+    pservers.resize(numPorts);
+
+    for (int i = 0; i < numPorts; ++i) {
+      pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
+      pservers[i]->init();
+      pservers[i]->start();
+    }
+  }
+
+  Trainer trainer;
+  trainer.init(TrainerConfigHelper::createFromFlagConfig());
+  trainer.train();
+  return trainer.getGradientMachine()->getParameters();
+}
+
+std::vector<ParameterPtr>& getDenseParameters() {
+  static std::vector<ParameterPtr> denseParameters;
+  if (denseParameters.empty()) {
+    // use dense training as base
+    FLAGS_local = true;
+    denseParameters = trainerOnePassTest(configFile1, false);
+  }
+
+  return denseParameters;
+}
+
+void checkBuffer(real* A,
+                 const char* desA,
+                 real* B,
+                 const char* desB,
+                 size_t len,
+                 double maxDiffRatio) {
+  double maxDiff = 0;
+  double maxValue = 0;
+  for (size_t i = 0; i < len; ++i) {
+    double diff = fabs(A[i] - B[i]);
+    maxValue = std::max<double>(maxValue, std::max(fabs(A[i]), fabs(B[i])));
+    maxDiff = std::max(maxDiff, diff);
+  }
+  EXPECT_LE(maxDiff / maxValue, maxDiffRatio);
+  LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue
+            << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n";
+}
+
+void compareValue(const vector<ParameterPtr>& parametersA,
+                  const vector<ParameterPtr>& parametersB,
+                  double maxDiffRatio = 0.0) {
+  LOG(INFO) << "\n\n--------------------------------"
+            << " Check Gradient Machine Parameters:"
+            << " -------------------------------------\n";
+  for (size_t i = 0; i < parametersA.size(); ++i) {
+    ParameterPtr parameterA, parameterB;
+    parameterA = parametersA[i];
+    parameterB = parametersB[i];
+
+    CpuVector paraA(parameterA->getSize());
+    CpuVector paraB(parameterB->getSize());
+    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
+    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
+
+    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
+              << " ; size : " << paraA.getSize() << " ------------";
+    checkBuffer(paraA.getData(),
+                "para_A",
+                paraB.getData(),
+                "para_B",
+                paraA.getSize(),
+                maxDiffRatio);
+  }
+}
+
+TEST(compareSparse, cpu) {
+  FLAGS_local = 1;  // disable remote sparse update in parameter config
+  std::vector<ParameterPtr> parameters = trainerOnePassTest(configFile1, true);
+  compareValue(getDenseParameters(), parameters);
+}
+
+TEST(compareSparse, remote_cpu) {
+  FLAGS_local = 0;  // will enable remote sparse update
+  FLAGS_ports_num_for_sparse = 5;
+  std::vector<ParameterPtr> parameters = trainerOnePassTest(configFile1, true);
+  compareValue(getDenseParameters(), parameters);
+}
+
+TEST(compareSparse, cpu10_local_vs_remote) {
+  FLAGS_local = 1;  // disable remote sparse update in parameter config
+  std::vector<ParameterPtr> localParameters =
+      trainerOnePassTest(configFile1, true, 2);
+
+  FLAGS_local = 0;  // will enable remote sparse update
+  FLAGS_ports_num_for_sparse = 5;
+  std::vector<ParameterPtr> remoteParameters =
+      trainerOnePassTest(configFile1, true, 2);
+
+  compareValue(localParameters, remoteParameters);
+}
+
+TEST(compareSparse, multiGradientMachine) {
+  int numGpu;
+#ifdef PADDLE_TYPE_DOUBLE
+  double eps = 1e-8;
+#else
+  double eps = 1e-4;
+#endif
+  numGpu = hl_get_device_count();
+  for (bool local : {false, true}) {
+    FLAGS_local = local;
+    FLAGS_ports_num_for_sparse = 5;
+    for (bool useGpu : {false, true}) {
+#ifndef PADDLE_WITH_CUDA
+      if (useGpu) continue;
+#endif
+      FLAGS_parallel_nn = useGpu;
+      LOG(INFO) << " local=" << local << " useGpu=" << useGpu;
+      int trainerCount = useGpu ? numGpu : 2;
+      std::vector<ParameterPtr> parameters =
+          trainerOnePassTest(configFile1, true, trainerCount, useGpu);
+      compareValue(getDenseParameters(), parameters, eps);
+    }
+  }
+  FLAGS_parallel_nn = false;
+}
+
+TEST(compareSparse, NeuralNetwork) {
+#ifdef PADDLE_TYPE_DOUBLE
+  double eps = 1e-8;
+#else
+  double eps = 1e-4;
+#endif
+  for (bool local : {false, true}) {
+    FLAGS_local = local;
+    FLAGS_ports_num_for_sparse = 5;
+    for (bool useGpu : {false, true}) {
+#ifndef PADDLE_WITH_CUDA
+      if (useGpu) continue;
+#endif
+      FLAGS_parallel_nn = useGpu;
+      LOG(INFO) << " local=" << local << " useGpu=" << useGpu;
+      int trainerCount = 1;
+      std::vector<ParameterPtr> parameters =
+          trainerOnePassTest(configFile1, true, trainerCount, useGpu);
+      compareValue(getDenseParameters(), parameters, useGpu ? eps : 0);
+    }
+  }
+  FLAGS_parallel_nn = false;
+}
+
+int main(int argc, char** argv) {
+  // FIXME(tonyyang-svail):
+  //   Turn off this test due CI failure:
+  //   https://paddleci.ngrok.io/viewLog.html?buildId=27608&buildTypeId=Paddle_PrCi&tab=buildLog&_focus=10430
+  return 0;
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  initPython(argc, argv);
+
+  gNumDevices = hl_get_device_count();
+  FLAGS_num_passes = 1;          // train one pass
+  FLAGS_saving_period = 100000;  // do not save parameter
+
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp b/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e19c34abbd8a84660a9e79bcbf602437bfc92832
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_CompareTwoNets.cpp
@@ -0,0 +1,210 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/utils/PythonUtil.h>
+#include <algorithm>
+#include <cstdlib>
+
+#include "paddle/legacy/trainer/Trainer.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(gpu_id);
+
+DECLARE_bool(local);
+DECLARE_bool(use_gpu);
+
+DECLARE_string(config);
+DECLARE_string(nics);
+
+DEFINE_bool(need_high_accuracy,
+            false,
+            "whether need to run in double accuracy");
+DEFINE_double(
+    max_diff_ratio,
+    0.0f,
+    "max diff ratio allowed for outputs and parameters (value/gradient)");
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_int32(seed);
+
+static const string& config_file_a =
+    "legacy/gserver/tests/sequence_recurrent.py";
+static const string& config_file_b =
+    "legacy/gserver/tests/sequence_recurrent_group.py";
+
+struct ComData {
+  vector<Argument> outArgs;
+  vector<ParameterPtr> parameters;
+};
+
+void calcGradient(ComData& data, const string configFile) {
+  FLAGS_config = configFile;
+
+  FLAGS_local = true;
+  FLAGS_use_gpu = false;
+
+  FLAGS_nics = "";
+
+  *ThreadLocalRand::getSeed() = FLAGS_seed;
+  srand(FLAGS_seed);
+
+  Trainer trainer;
+  trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
+
+  data.parameters = trainer.getGradientMachine()->getParameters();
+
+  DataBatch dataBatch;
+  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
+
+  trainer.getDataProvider()->reset();
+  trainer.getDataProvider()->setSkipShuffle();
+  trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
+
+  CHECK(dataBatch.getSize()) << "No data from data provider";
+  vector<Argument>& inArgs = dataBatch.getStreams();
+
+  trainer.getGradientMachine()->start();
+  trainer.getGradientMachine()->forwardBackward(
+      inArgs, &data.outArgs, PASS_TRAIN);
+
+  trainer.getGradientMachine()->finish();
+}
+
+void checkBuffer(real* A,
+                 const char* desA,
+                 real* B,
+                 const char* desB,
+                 size_t len,
+                 size_t width = 1) {
+  int nNum = 0;
+  real maxVal = 0;
+  for (size_t i = 0; i < len; ++i) {
+    maxVal = std::max(maxVal, std::max(A[i], B[i]));
+  }
+  real maxDiff = 0;
+  for (size_t i = 0; i < len; ++i) {
+    real diff = fabs(A[i] - B[i]);
+    maxDiff = std::max(maxDiff, diff);
+    if (diff > maxVal * FLAGS_max_diff_ratio) {
+      nNum++;
+      VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << "    "
+              << desB << " : " << B[i] << " diff=" << diff;
+    }
+  }
+  EXPECT_EQ(0, nNum);
+  LOG(INFO) << "maxValue=" << maxVal << " maxDiff=" << maxDiff << "\n\n";
+}
+
+void compareGradient(ComData& comDataA, ComData& comDataB) {
+  vector<Argument> outArgsA = comDataA.outArgs;
+  vector<Argument> outArgsB = comDataB.outArgs;
+
+  for (size_t i = 0; i < outArgsA.size(); ++i) {
+    CpuMatrix matA(outArgsA[i].value->getHeight(),
+                   outArgsA[i].value->getWidth());
+    CpuMatrix matB(outArgsB[i].value->getHeight(),
+                   outArgsB[i].value->getWidth());
+
+    matA.copyFrom(*outArgsA[i].value);
+    matB.copyFrom(*outArgsB[i].value);
+
+    LOG(INFO) << "\n--------------------------------"
+              << " Check Network Output_" << i << ":"
+              << " -------------------------------------\n";
+    checkBuffer(matA.getData(),
+                "network A output",
+                matB.getData(),
+                "network B output",
+                matA.getElementCnt(),
+                matA.getWidth());
+  }
+
+  vector<ParameterPtr>& parametersA = comDataA.parameters;
+  vector<ParameterPtr>& parametersB = comDataB.parameters;
+
+  LOG(INFO) << "\n\n--------------------------------"
+            << " Check Gradient Machine Parameters:"
+            << " -------------------------------------\n";
+  for (size_t i = 0; i < parametersA.size(); ++i) {
+    ParameterPtr parameterA, parameterB;
+    parameterA = parametersA[i];
+    parameterB = parametersB[i];
+
+    CpuVector paraA(parameterA->getSize());
+    CpuVector paraB(parameterB->getSize());
+    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
+    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
+
+    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
+              << " ; size : " << paraA.getSize() << " ------------";
+    checkBuffer(paraA.getData(),
+                "Network A",
+                paraB.getData(),
+                "Network B",
+                paraA.getSize());
+
+    CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
+    CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
+
+    LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
+              << " ; size : " << gradA.getSize() << " -----------";
+    checkBuffer(gradA.getData(),
+                "Network A",
+                gradB.getData(),
+                "Network B",
+                gradA.getSize());
+  }
+}
+
+TEST(Trainer, create) {
+  ComData dataA;
+  calcGradient(dataA, config_file_a);
+  LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
+
+  ComData dataB;
+  calcGradient(dataB, config_file_b);
+  LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
+
+  compareGradient(dataA, dataB);
+}
+
+int main(int argc, char** argv) {
+  FLAGS_thread_local_rand_use_global_seed = true;
+  paddle::initMain(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  initPython(argc, argv);
+
+#ifndef PADDLE_TYPE_DOUBLE
+  if (FLAGS_need_high_accuracy) {
+    LOG(INFO) << "skip test due to it's need high accuracy";
+    return 0;
+  }
+  if (FLAGS_max_diff_ratio == 0.0f) {
+    FLAGS_max_diff_ratio = 1e-5;
+    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
+              << " in low accuracy mode";
+  }
+#else
+  if (FLAGS_max_diff_ratio == 0.0f) {
+    FLAGS_max_diff_ratio = 1e-10;
+    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
+              << " in high accuracy mode";
+  }
+#endif
+
+  int ret = RUN_ALL_TESTS();
+  return ret;
+}
diff --git a/paddle/legacy/gserver/tests/test_ConvTrans.cpp b/paddle/legacy/gserver/tests/test_ConvTrans.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ea0a3d379b010fcb6ccb91a28e653a53cfe66d8
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_ConvTrans.cpp
@@ -0,0 +1,244 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
+
+// Test that the convTrans forward is the same as conv backward
+TEST(Layer, convTransLayerFwd) {
+  // Setting up conv-trans layer
+  TestConfig configt;
+  configt.biasSize = 3;
+  configt.layerConfig.set_type("exconvt");
+  configt.layerConfig.set_num_filters(3);
+  configt.layerConfig.set_partial_sum(1);
+  configt.layerConfig.set_shared_biases(true);
+
+  configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
+  LayerInputConfig* input = configt.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(4);
+  conv->set_channels(16);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(1);
+  conv->set_filter_channels(3 / conv->groups());
+  conv->set_img_size(16);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /* caffeMode */ true));
+  configt.layerConfig.set_size(conv->img_size() * conv->img_size() *
+                               configt.layerConfig.num_filters());
+  configt.layerConfig.set_name("convTrans");
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(
+      configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false);
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr convtLayer;
+  initTestLayer(configt, &layerMap, &parameters, &convtLayer);
+  convtLayer->getBiasParameter()->zeroMem();
+  convtLayer->forward(PASS_GC);
+
+  // Setting up conv-layer config
+  TestConfig config;
+  config.biasSize = 16;
+  config.layerConfig.set_type("exconv");
+  config.layerConfig.set_num_filters(16);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384});
+  input = config.layerConfig.add_inputs();
+  conv = input->mutable_conv_conf();
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(4);
+  conv->set_channels(3);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(16);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /* caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+                              config.layerConfig.num_filters());
+  config.layerConfig.set_name("conv");
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers2;
+  LayerMap layerMap2;
+  vector<Argument> datas2;
+  initDataLayer(
+      config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false);
+  // test layer initialize
+  std::vector<ParameterPtr> parameters2;
+  LayerPtr convLayer;
+  initTestLayer(config, &layerMap2, &parameters2, &convLayer);
+
+  // Sync convLayer and convtLayer parameter
+  convLayer->getBiasParameter()->zeroMem();
+  convLayer->getParameters()[0]
+      ->getBuf(PARAMETER_VALUE)
+      ->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)));
+
+  // Set convLayer outputGrad as convTransLayer input value
+  convLayer->forward(PASS_GC);
+  convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue()));
+
+  vector<int> callbackFlags(parameters2.size(), 0);
+  auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; };
+  convLayer->backward(callback);
+
+  // Check that the convLayer backward is the same as convTransLayer forward
+  checkMatrixEqual(convtLayer->getOutputValue(),
+                   dataLayers2[0]->getOutputGrad());
+}
+
+// Do one forward pass of convTrans layer and check to see if its output
+// matches the given result
+void doOneConvtTest(size_t imgSize,
+                    size_t output_x,
+                    size_t stride,
+                    size_t padding,
+                    size_t filter_size,
+                    MatrixPtr& result) {
+  TestConfig configt;
+  configt.biasSize = 1;
+  configt.layerConfig.set_type("exconvt");
+  configt.layerConfig.set_num_filters(1);
+  configt.layerConfig.set_partial_sum(1);
+  configt.layerConfig.set_shared_biases(true);
+
+  configt.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size});
+  LayerInputConfig* input = configt.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(filter_size);
+  conv->set_filter_size_y(filter_size);
+  conv->set_channels(1);
+  conv->set_padding(padding);
+  conv->set_padding_y(padding);
+  conv->set_stride(stride);
+  conv->set_stride_y(stride);
+  conv->set_groups(1);
+  conv->set_filter_channels(1);
+  conv->set_img_size(imgSize);
+  conv->set_output_x(output_x);
+
+  configt.layerConfig.set_size(conv->img_size() * conv->img_size() *
+                               configt.layerConfig.num_filters());
+  configt.layerConfig.set_name("convTrans");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(
+      configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false);
+  dataLayers[0]->getOutputValue()->zeroMem();
+  dataLayers[0]->getOutputValue()->add(1.0);
+
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr convtLayer;
+  initTestLayer(configt, &layerMap, &parameters, &convtLayer);
+  convtLayer->getBiasParameter()->zeroMem();
+  convtLayer->getParameters()[0]->zeroMem();
+  convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0);
+  convtLayer->forward(PASS_GC);
+
+  checkMatrixEqual(convtLayer->getOutputValue(), result);
+}
+
+TEST(Layer, convTransLayerFwd2) {
+  MatrixPtr result;
+  result = Matrix::create(1, 5 * 5, false, false);
+  result->zeroMem();
+  result->add(1.0);
+  doOneConvtTest(/* imgSize */ 5,
+                 /* output_x */ 1,
+                 /* stride */ 1,
+                 /* padding */ 0,
+                 /* filter_size */ 5,
+                 result);
+
+  real resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
+                       4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
+  result->setData(resultData);
+  doOneConvtTest(/* imgSize */ 5,
+                 /* output_x */ 2,
+                 /* stride */ 1,
+                 /* padding */ 0,
+                 /* filter_size */ 4,
+                 result);
+
+  real resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4,
+                        4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1};
+  result->setData(resultData2);
+  doOneConvtTest(/* imgSize */ 5,
+                 /* output_x */ 2,
+                 /* stride */ 2,
+                 /* padding */ 1,
+                 /* filter_size */ 5,
+                 result);
+
+  real resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4,
+                        2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1};
+  result->setData(resultData3);
+  doOneConvtTest(/* imgSize */ 5,
+                 /* output_x */ 2,
+                 /* stride */ 2,
+                 /* padding */ 0,
+                 /* filter_size */ 3,
+                 result);
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_ConvUnify.cpp b/paddle/legacy/gserver/tests/test_ConvUnify.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4ca158352d9e4bf859b31b7c7410518bdc20ac6
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_ConvUnify.cpp
@@ -0,0 +1,315 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
+
+// Do one forward pass of ConvLayer using either exconv or cudnn_conv
+MatrixPtr doOneConvTest(size_t imgSize,
+                        size_t output_x,
+                        size_t stride,
+                        size_t padding,
+                        size_t filter_size,
+                        size_t channel,
+                        size_t numfilters,
+                        size_t groups,
+                        MatrixPtr& inputData,
+                        real* param,
+                        bool useGpu,
+                        bool isDeconv = false) {
+  TestConfig config;
+  config.biasSize = numfilters;
+  string layerType;
+  if (useGpu) {
+    layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv";
+  } else {
+    layerType = (isDeconv) ? "exconvt" : "exconv";
+  }
+  config.layerConfig.set_type(layerType);
+  config.layerConfig.set_num_filters(numfilters);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  size_t weightSize = channel * filter_size * filter_size *
+                      config.layerConfig.num_filters() / groups;
+  if (isDeconv) {
+    config.inputDefs.push_back(
+        {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize});
+    config.layerConfig.set_size(imgSize * imgSize *
+                                config.layerConfig.num_filters());
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
+    config.layerConfig.set_size(output_x * output_x *
+                                config.layerConfig.num_filters());
+  }
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(filter_size);
+  conv->set_filter_size_y(filter_size);
+  conv->set_channels(channel);
+  conv->set_padding(padding);
+  conv->set_padding_y(padding);
+  conv->set_stride(stride);
+  conv->set_stride_y(stride);
+  conv->set_groups(groups);
+  conv->set_img_size(imgSize);
+  conv->set_output_x(output_x);
+
+  if (isDeconv) {
+    conv->set_filter_channels(numfilters / groups);
+  } else {
+    conv->set_filter_channels(channel / groups);
+  }
+
+  config.layerConfig.set_name("conv");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(
+      config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu);
+  dataLayers[0]->getOutputValue()->zeroMem();
+  dataLayers[0]->getOutputValue()->copyFrom(*inputData);
+
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr convLayer;
+  initTestLayer(config, &layerMap, &parameters, &convLayer);
+  convLayer->getBiasParameter()->zeroMem();
+  convLayer->getParameters()[0]->zeroMem();
+  convLayer->getParameters()[0]
+      ->getBuf(PARAMETER_VALUE)
+      ->copyFrom(param, weightSize);
+  convLayer->forward(PASS_GC);
+
+  return convLayer->getOutputValue();
+}
+
+TEST(Layer, convParaUnified) {
+#ifdef PADDLE_WITH_CUDA
+  MatrixPtr input, resultCpu, resultGpu;
+
+  /// TEST1 for conv ///
+  input = Matrix::create(1, 4 * 4, false, false);
+  real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+
+  input->setData(inputData);
+
+  resultCpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            /*useGpu*/ false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            /*useGpu*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  /// TEST1 for deconv ///
+  input = Matrix::create(1, 2 * 2, false, false);
+  real inputDataT[] = {1, 2, 3, 4};
+  input->setData(inputDataT);
+
+  resultCpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            /*useGpu*/ false,
+                            /*isDeconv*/ true);
+
+  resultGpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            /*useGpu*/ true,
+                            /*isDeconv*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  /// TEST2 for conv ///
+  input = Matrix::create(1, 3 * 3 * 2, false, false);
+  real inputData2[] = {
+      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
+  real param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1};
+
+  input->setData(inputData2);
+
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            /*useGpu*/ false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            /*useGpu*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  /// TEST3 for conv ///
+  real param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
+
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            /*useGpu*/ false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            /*useGpu*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  /// TEST2 for deconv ///
+  input = Matrix::create(1, 2 * 2 * 2, false, false);
+  real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  input->setData(inputData2T);
+
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            /*useGpu*/ false,
+                            /*isDeconv*/ true);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            /*useGpu*/ true,
+                            /*isDeconv*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  /// TEST3 for deconv ///
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            /*useGpu*/ false,
+                            /*isDeconv*/ true);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            /*useGpu*/ true,
+                            /*isDeconv*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+#endif
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..34eb0dedffeba46c662a0e69ce9ba82f474a8358
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
@@ -0,0 +1,352 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <random>
+#include <sstream>
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const size_t MAX_SEQ_NUM = 23;
+const size_t MAX_SEQ_LEN = 50;
+const size_t MAX_BEAM_SIZE = 27;
+
+const size_t SEED = (size_t)(time(NULL));
+
+struct SingleBeamExpansion {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<real> candidateScores;
+
+  // TODO(caoying): store this into Argument.ids
+  vector<real> selectedIndices;
+
+  vector<int> groundTruth;
+  vector<size_t> inBeam;
+  vector<int> rowIdxInBeam;
+  vector<int> colIdxInBeam;
+
+  void resetGroundTruth(size_t n) {
+    groundTruth.clear();
+    groundTruth.resize(n, -1);
+
+    inBeam.clear();
+    inBeam.resize(n, 0);
+
+    rowIdxInBeam.clear();
+    rowIdxInBeam.resize(n, -1);
+
+    colIdxInBeam.clear();
+    colIdxInBeam.resize(n, -1);
+  }
+};
+
+inline float randFloat() {
+  return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+}
+
+void genRand(real* numbers, size_t n) {
+  default_random_engine generator;
+  uniform_real_distribution<real> distribution(0.0, 1.0);
+  for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator);
+}
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+void genCandidateScores(bool hasSubseq,
+                        size_t beamSize,
+                        SingleBeamExpansion& prevBeam,
+                        SingleBeamExpansion& curBeam) {
+  vector<int>& seqStartPos = curBeam.seqStartPos;
+  seqStartPos.resize(1, 0);
+  vector<int>& subSeqStartPos = curBeam.subSeqStartPos;
+  subSeqStartPos.resize(1, 0);
+
+  srand(SEED);
+  if (prevBeam.selectedIndices.size()) {
+    if (prevBeam.subSeqStartPos.size() > 1) {
+      int seqIdx = 1;
+      // samples in previous beam are nested sequences.
+      for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) {
+        for (size_t j = 0; j < beamSize; ++j) {
+          if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break;
+          subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) +
+                                   subSeqStartPos.back());
+        }
+        if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) {
+          seqStartPos.push_back(subSeqStartPos.back());
+          seqIdx++;
+        }
+      }
+    } else {
+      for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) {
+        if (i && i % beamSize == 0) {
+          seqStartPos.push_back(subSeqStartPos.back());
+          if (i == prevBeam.selectedIndices.size()) break;
+        }
+        if (prevBeam.selectedIndices[i] == -1.) continue;
+        subSeqStartPos.push_back(subSeqStartPos.back() +
+                                 (1 + (rand() % MAX_SEQ_LEN)));
+      }
+    }
+  } else {
+    // the first beam expansion
+    int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int i = 0; i < seqNum; ++i) {
+      if (hasSubseq) {
+        for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j)
+          subSeqStartPos.push_back(subSeqStartPos.back() +
+                                   (1 + (rand() % MAX_SEQ_LEN)));
+        seqStartPos.push_back(subSeqStartPos.back());
+      } else {
+        seqStartPos.push_back(seqStartPos.back() +
+                              (1 + (rand() % MAX_SEQ_LEN)));
+      }
+    }
+  }
+
+  size_t totalSeqNum = hasSubseq ? subSeqStartPos.back() : seqStartPos.back();
+  curBeam.candidateScores.resize(totalSeqNum, 0.);
+  genRand(curBeam.candidateScores.data(), totalSeqNum);
+}
+
+void genSelectedIndices(size_t beamSize,
+                        vector<int>& seqStartPos,
+                        vector<real>& selectedIndices) {
+  size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1);
+  selectedIndices.resize(selectedIdsCount, -1.);
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    int n = min(seqLen, static_cast<int>(beamSize));
+    vector<real> ids = randSampling(seqLen, n);
+    memcpy(selectedIndices.data() + i * beamSize,
+           ids.data(),
+           sizeof(real) * ids.size());
+  }
+}
+
+void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
+                    size_t beamSize) {
+  SingleBeamExpansion& beam = beamExpansions[1];
+  size_t seqNum = beam.seqStartPos.size() - 1;
+  for (size_t i = 2; i < beamExpansions.size(); ++i)
+    CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1);
+
+  srand(SEED);
+
+  // initialize the first beam.
+  beam.resetGroundTruth(seqNum);
+  for (size_t i = 0; i < seqNum; ++i) {
+    if (randFloat() > 0.5) {
+      /*
+       * force the randomly generated label falls in the beam by chance 0.5.
+       * otherwise, when sequence length is relatively long and beam size is
+       * relatively small, the gold sequences falls off the beam at in the
+       * first search.
+       */
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      beam.colIdxInBeam[i] =
+          rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
+            return val != -1.;
+          });
+      beam.groundTruth[i] =
+          beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]];
+      beam.inBeam[i] = 1;
+    } else {
+      int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]);
+      beam.groundTruth[i] = label;
+
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      real* endPos = begPos + beamSize;
+      real* lblPos = find(begPos, endPos, real(label));
+      if (lblPos != endPos) {
+        beam.inBeam[i] = 1;
+        beam.colIdxInBeam[i] = lblPos - begPos;
+      }
+    }
+    beam.rowIdxInBeam[i] = i;
+  }
+
+  // iterate over each beam expansions
+  for (size_t i = 2; i < beamExpansions.size(); ++i) {
+    SingleBeamExpansion& curBeam = beamExpansions[i];
+    SingleBeamExpansion& prevBeam = beamExpansions[i - 1];
+    curBeam.resetGroundTruth(seqNum);
+
+    // iterate over each sequence
+    for (size_t j = 0; j < seqNum; ++j) {
+      if (!prevBeam.inBeam[j]) continue;
+
+      // gold sequence falls in the beam in previous search.
+      real* begPos = prevBeam.selectedIndices.data();
+      int offset =
+          prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j];
+      curBeam.rowIdxInBeam[j] = count_if(
+          begPos, begPos + offset, [](const real& val) { return val != -1.; });
+
+      if (randFloat() > 0.5) {
+        // force the randomly generated label falls in the beam by chance 0.5.
+
+        real* start =
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
+        int n = rand() % count_if(start, start + beamSize, [](const real& val) {
+                  return val != -1.;
+                });
+        curBeam.colIdxInBeam[j] = n;
+        curBeam.groundTruth[j] = *(start + n);
+        curBeam.inBeam[j] = 1;
+      } else {
+        CHECK_LE((size_t)curBeam.rowIdxInBeam[j] + 1,
+                 curBeam.subSeqStartPos.size() - 1);
+        int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]];
+        int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1];
+        CHECK_GT(size_t(end), size_t(start));
+        int label = rand() % (end - start);
+
+        curBeam.groundTruth[j] = label;
+        real* findBeg =
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
+        real* lblPos =
+            find(findBeg, findBeg + beamSize, static_cast<real>(label));
+        if (lblPos != (findBeg + beamSize)) {
+          curBeam.inBeam[j] = 1;
+          curBeam.colIdxInBeam[j] = lblPos - findBeg;
+        }
+      }
+    }
+  }
+}
+
+void genOneBeam(size_t beamSize,
+                bool hasSubseq,
+                SingleBeamExpansion& prevBeam,
+                SingleBeamExpansion& curBeam) {
+  genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam);
+  genSelectedIndices(beamSize,
+                     hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos,
+                     curBeam.selectedIndices);
+}
+
+void genRandomBeamExpansion(size_t expansionCount,
+                            size_t beamSize,
+                            vector<SingleBeamExpansion>& beamExpansions) {
+  beamExpansions.clear();
+  beamExpansions.resize(expansionCount + 1);
+
+  // beamExpansions[0] is reserved.
+  for (size_t i = 1; i <= expansionCount; ++i)
+    genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]);
+  genGroundTruth(beamExpansions, beamSize);
+}
+
+void testCrossEntropyOverBeam(bool useGpu,
+                              size_t beamSize,
+                              vector<SingleBeamExpansion>& beams) {
+  TestConfig config;
+  config.layerConfig.set_type("cross_entropy_over_beam");
+
+  size_t seqNum = 0;
+  for (size_t i = 1; i < beams.size(); ++i) {
+    const SingleBeamExpansion& beam = beams[i];
+    // create scores for all the candidates
+    MatrixPtr candidateScorePtr =
+        Matrix::create(beam.candidateScores.size(), 1, false, false);
+    candidateScorePtr->copyFrom(beam.candidateScores.data(),
+                                beam.candidateScores.size());
+
+    ostringstream paramName;
+    paramName << "candidate_scores_" << i;
+
+    if (beam.subSeqStartPos.size() > 1) {
+      seqNum = beam.subSeqStartPos.size() - 1;
+      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                  paramName.str(),
+                                  candidateScorePtr,
+                                  beam.seqStartPos,
+                                  beam.subSeqStartPos});
+    } else {
+      seqNum = beam.seqStartPos.size() - 1;
+      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                  paramName.str(),
+                                  candidateScorePtr,
+                                  beam.seqStartPos});
+    }
+    config.layerConfig.add_inputs();
+
+    // create indices for the selected candidates
+    MatrixPtr selectedCandidates =
+        Matrix::create(seqNum, beamSize, false, false);
+    selectedCandidates->copyFrom(beam.selectedIndices.data(),
+                                 beam.selectedIndices.size());
+    paramName.clear();
+    paramName << "selected_candidates_" << i;
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates});
+    config.layerConfig.add_inputs();
+
+    // create the ground truth
+    paramName.clear();
+    paramName << "label_" << i;
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth});
+    config.layerConfig.add_inputs();
+  }
+
+  testLayerGrad(
+      config, "cross_entropy_over_beam", seqNum, false, useGpu, false);
+}
+
+TEST(Layer, CrossEntropyOverBeam) {
+  LOG(INFO) << "SEED = " << SEED;
+  const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
+  LOG(INFO) << "beamSize = " << beamSize;
+
+  // TODO(caoying): test with random beam expansions.
+  const size_t expansionCount = 3;
+  vector<SingleBeamExpansion> beams;
+  genRandomBeamExpansion(expansionCount, beamSize, beams);
+
+  for (bool useGpu : {false, true})
+    testCrossEntropyOverBeam(useGpu, beamSize, beams);
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(SEED);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/legacy/gserver/tests/test_DetectionOutput.cpp
similarity index 100%
rename from paddle/gserver/tests/test_DetectionOutput.cpp
rename to paddle/legacy/gserver/tests/test_DetectionOutput.cpp
diff --git a/paddle/legacy/gserver/tests/test_Evaluator.cpp b/paddle/legacy/gserver/tests/test_Evaluator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8aab50d23e56e449d86f22a315c45432253cdd07
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_Evaluator.cpp
@@ -0,0 +1,267 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/trainer/Trainer.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+enum InputType {
+  INPUT_DATA,         // dense vector
+  INPUT_LABEL,        // id
+  INPUT_DATA_TARGET,  // dense vector, but no gradient
+  INPUT_SEQUENCE_DATA,
+  INPUT_SEQUENCE_LABEL,
+  INPUT_SPARSE_NON_VALUE_DATA
+};
+
+struct InputDef {
+  InputType inputType;
+  string name;
+  size_t dim;
+};
+
+struct TestConfig {
+  EvaluatorConfig evaluatorConfig;
+  std::vector<InputDef> inputDefs;
+  bool testAccumulate;
+  TestConfig() : testAccumulate(true) {}
+};
+
+void testEvaluator(TestConfig testConf,
+                   string testEvaluatorName,
+                   size_t batchSize,
+                   bool useGpu) {
+#ifndef PADDLE_WITH_CUDA
+  if (useGpu) return;
+#endif
+  FLAGS_use_gpu = useGpu;
+  testConf.evaluatorConfig.set_name(testEvaluatorName);
+  LOG(INFO) << " evaluator_type=" << testConf.evaluatorConfig.type()
+            << " useGpu=" << useGpu;
+
+  std::vector<Argument> arguments;
+  for (size_t i = 0; i < testConf.inputDefs.size(); ++i) {
+    Argument data;
+    size_t dim = testConf.inputDefs[i].dim;
+    switch (testConf.inputDefs[i].inputType) {
+      case INPUT_DATA:
+      case INPUT_SEQUENCE_DATA:
+      case INPUT_DATA_TARGET:
+        data.value = Matrix::create(batchSize, dim, false, useGpu);
+        data.value->randomizeUniform();
+
+        // make sure output > 0 && output < 1
+        data.value->add(-0.5);
+        data.value->sigmoid(*data.value);
+        break;
+      case INPUT_LABEL:
+      case INPUT_SEQUENCE_LABEL:
+        data.ids = VectorT<int>::create(batchSize, useGpu);
+        data.ids->rand(dim);  // now rand number can be 0 to inputDefs[i].dim.
+        break;
+      case INPUT_SPARSE_NON_VALUE_DATA:
+        data.value = makeRandomSparseMatrix(batchSize,
+                                            dim,
+                                            /* withValue= */ false,
+                                            useGpu);
+        break;
+      default:
+        LOG(FATAL) << " unknown inputType ";
+        return;
+    }
+
+    ICpuGpuVectorPtr sequenceStartPositions;
+    if (testConf.inputDefs[i].inputType == INPUT_SEQUENCE_DATA ||
+        testConf.inputDefs[i].inputType == INPUT_SEQUENCE_LABEL) {
+      if (!sequenceStartPositions) {
+        generateSequenceStartPositions(batchSize, sequenceStartPositions);
+      }
+      data.sequenceStartPositions = sequenceStartPositions;
+    }
+
+    arguments.push_back(data);
+  }
+
+  Evaluator* testEvaluator = Evaluator::create(testConf.evaluatorConfig);
+  double totalScore = 0.0;
+  testEvaluator->start();
+  totalScore += testEvaluator->evalImp(arguments);
+  testEvaluator->updateSamplesNum(arguments);
+  testEvaluator->finish();
+  LOG(INFO) << *testEvaluator;
+
+  std::vector<std::string> names;
+  testEvaluator->getNames(&names);
+  paddle::Error err;
+  for (auto& name : names) {
+    auto value = testEvaluator->getValue(name, &err);
+    ASSERT_TRUE(err.isOK());
+    LOG(INFO) << name << " " << value;
+    auto tp = testEvaluator->getType(name, &err);
+    ASSERT_TRUE(err.isOK());
+    ASSERT_EQ(testConf.evaluatorConfig.type(), tp);
+  }
+
+  double totalScore2 = 0.0;
+  if (testConf.testAccumulate) {
+    testEvaluator->start();
+    totalScore2 += testEvaluator->evalImp(arguments);
+    testEvaluator->finish();
+    EXPECT_LE(fabs(totalScore - totalScore2), 1.0e-5);
+  }
+}
+
+void testEvaluatorAll(TestConfig testConf,
+                      string testEvaluatorName,
+                      size_t batchSize) {
+  testEvaluator(testConf, testEvaluatorName, batchSize, true);
+  testEvaluator(testConf, testEvaluatorName, batchSize, false);
+}
+
+TEST(Evaluator, detection_map) {
+  TestConfig config;
+  config.evaluatorConfig.set_type("detection_map");
+  config.evaluatorConfig.set_overlap_threshold(0.5);
+  config.evaluatorConfig.set_background_id(0);
+  config.evaluatorConfig.set_ap_type("Integral");
+  config.evaluatorConfig.set_evaluate_difficult(0);
+
+  config.inputDefs.push_back({INPUT_DATA, "output", 7});
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6});
+  config.evaluatorConfig.set_evaluate_difficult(false);
+  testEvaluatorAll(config, "detection_map", 100);
+
+  config.evaluatorConfig.set_evaluate_difficult(true);
+  testEvaluatorAll(config, "detection_map", 100);
+}
+
+TEST(Evaluator, classification_error) {
+  TestConfig config;
+  config.evaluatorConfig.set_type("classification_error");
+  config.evaluatorConfig.set_top_k(5);
+
+  config.inputDefs.push_back({INPUT_DATA, "output", 50});
+  config.inputDefs.push_back({INPUT_LABEL, "label", 50});
+  testEvaluatorAll(config, "classification_error", 100);
+  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
+  testEvaluatorAll(config, "classification_error_weight", 100);
+
+  // multi binary labels
+  config.inputDefs.clear();
+  config.inputDefs.push_back({INPUT_DATA, "output", 100});
+  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 100});
+  // Not support GPU
+  testEvaluator(config, "classification_error_multi_binary_label", 50, false);
+
+  config.evaluatorConfig.set_classification_threshold(0.4);
+  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
+  // Not support GPU
+  testEvaluator(
+      config, "classification_error_weight_multi_binary_label", 50, false);
+}
+
+TEST(Evaluator, sum) {
+  TestConfig config;
+  config.evaluatorConfig.set_type("sum");
+
+  // sum of output
+  config.inputDefs.push_back({INPUT_DATA, "output", 10});
+  testEvaluatorAll(config, "sum_output", 200);
+  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
+  testEvaluatorAll(config, "sum_output_weight", 200);
+
+  // sum of label
+  config.inputDefs.clear();
+  config.inputDefs.push_back({INPUT_LABEL, "label", 10});
+  testEvaluatorAll(config, "sum_label", 200);
+  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
+  testEvaluatorAll(config, "sum_label_weight", 200);
+}
+
+TEST(Evaluator, last_column_sum) {
+  TestConfig config;
+  config.evaluatorConfig.set_type("last-column-sum");
+
+  config.inputDefs.push_back({INPUT_DATA, "output", 50});
+  testEvaluatorAll(config, "last-column-sum", 200);
+  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
+  testEvaluatorAll(config, "last-column-sum_weight", 200);
+}
+
+TEST(Evaluator, last_column_auc) {
+  TestConfig config;
+  config.evaluatorConfig.set_type("last-column-auc");
+
+  config.inputDefs.push_back({INPUT_DATA, "output", 2});
+  config.inputDefs.push_back({INPUT_LABEL, "label", 2});
+  testEvaluatorAll(config, "last-column-auc", 500);
+  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
+  testEvaluatorAll(config, "last-column-auc_weight", 200);
+}
+
+TEST(Evaluator, precision_recall) {
+  TestConfig config;
+  config.evaluatorConfig.set_type("precision_recall");
+
+  config.inputDefs.push_back({INPUT_DATA, "output", 10});
+  config.inputDefs.push_back({INPUT_LABEL, "label", 10});
+  testEvaluatorAll(config, "precision_recall", 200);
+  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
+  testEvaluatorAll(config, "precision_recall_weight", 200);
+
+  LOG(INFO) << "positive_label = 5";
+  config.evaluatorConfig.set_positive_label(5);
+  testEvaluatorAll(config, "precision_recall_weight", 200);
+
+  // multi binary labels
+  config.inputDefs.clear();
+  config.evaluatorConfig.set_positive_label(-1);
+  config.inputDefs.push_back({INPUT_DATA, "output", 10});
+  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "label", 10});
+  // Not support GPU
+  testEvaluator(config, "precision_recall_multi_binary_label", 100, false);
+
+  LOG(INFO) << "classification_threshold = 0.4";
+  config.evaluatorConfig.set_classification_threshold(0.4);
+  config.inputDefs.push_back({INPUT_DATA, "weight", 1});
+  // Not support GPU
+  testEvaluator(
+      config, "precision_recall_weight_multi_binary_label", 100, false);
+}
+
+TEST(Evaluator, ctc_error_evaluator) {
+  TestConfig config;
+  config.evaluatorConfig.set_type("ctc_edit_distance");
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "output", 32});
+  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "label", 1});
+  testEvaluatorAll(config, "ctc_error_evaluator", 100);
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_Expand.cpp b/paddle/legacy/gserver/tests/test_Expand.cpp
similarity index 100%
rename from paddle/gserver/tests/test_Expand.cpp
rename to paddle/legacy/gserver/tests/test_Expand.cpp
diff --git a/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp b/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e15b4e5038cddda00acdd06b7748984b03094e6e
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_KmaxSeqScore.cpp
@@ -0,0 +1,164 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+vector<int> randSampling(int range, int n) {
+  CHECK_GE(range, n);
+  vector<int> num(range);
+  iota(begin(num), end(num), 0);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  return num;
+}
+
+void genRandomSeqInfo(vector<int>& seqStartPosition,
+                      vector<int>& subSeqStartPosition) {
+  const int maxSeqNum = 100;
+  // generate random start position information
+  int seqNum = 1 + (rand() % maxSeqNum);
+  seqStartPosition.resize(seqNum + 1, 0);
+  subSeqStartPosition.resize(1, 0);
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqLen = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqLen; ++j)
+      subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen);
+    seqStartPosition[i + 1] = subSeqStartPosition.back();
+  }
+}
+
+void genRandomGroundTruth(real* values,
+                          vector<vector<int>>& groundTruth,
+                          vector<int>& startPos,
+                          size_t beamSize) {
+  groundTruth.resize(startPos.size() - 1, vector<int>(beamSize, -1));
+  for (size_t i = 0; i < startPos.size() - 1; ++i) {
+    int seqLen = startPos[i + 1] - startPos[i];
+    vector<int> pos =
+        randSampling(seqLen, min(static_cast<int>(beamSize), seqLen));
+    for (size_t j = 0; j < pos.size(); ++j) {
+      groundTruth[i][j] = pos[j];
+      values[startPos[i] + pos[j]] = 1.;
+    }
+  }
+}
+
+void checkLayerOut(vector<vector<int>> groundTruth,
+                   real* layerOut,
+                   size_t beamSize) {
+  for (size_t i = 0; i < groundTruth.size(); ++i) {
+    int begPos = i * beamSize;
+    vector<real> tmp(layerOut + begPos, layerOut + begPos + beamSize);
+    sort(begin(tmp), end(tmp));
+    sort(begin(groundTruth[i]), end(groundTruth[i]));
+    for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]);
+  }
+}
+
+TEST(Layer, kmaxSeqScoreLayer) {
+  const size_t maxBeamSize = 100;
+  size_t beamSize = 1 + (rand() % maxBeamSize);
+
+  vector<int> seqStartPosition;
+  vector<int> subSeqStartPosition;
+  genRandomSeqInfo(seqStartPosition, subSeqStartPosition);
+  MatrixPtr inValue =
+      Matrix::create(subSeqStartPosition.back(), 1, false, false);
+
+  std::vector<bool> mode = {false};
+#ifdef PADDLE_WITH_CUDA
+  mode.push_back(true);
+#endif
+
+  for (auto hasSubseq : {false, true}) {
+    vector<vector<int>> groundTruth;
+    inValue->randomizeUniform();
+    genRandomGroundTruth(inValue->getData(),
+                         groundTruth,
+                         hasSubseq ? subSeqStartPosition : seqStartPosition,
+                         beamSize);
+
+    for (auto useGpu : mode) {
+      TestConfig config;
+      config.layerConfig.set_type("kmax_seq_score");
+      config.layerConfig.set_beam_size(beamSize);
+
+      if (hasSubseq) {
+        config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                    "scores",
+                                    inValue,
+                                    seqStartPosition,
+                                    subSeqStartPosition});
+      } else {
+        config.inputDefs.push_back(
+            {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition});
+      }
+      config.layerConfig.add_inputs();
+
+      // data layer initialize
+      std::vector<DataLayerPtr> dataLayers;
+      LayerMap layerMap;
+      vector<Argument> datas;
+      initDataLayer(
+          config,
+          &dataLayers,
+          &datas,
+          &layerMap,
+          "kmax_seq_score",
+          100 /* actually this parameter is unused in self-defined input*/,
+          false,
+          useGpu);
+      // test layer initialize
+      std::vector<ParameterPtr> parameters;
+      LayerPtr kmaxSeqScoreLayer;
+      FLAGS_use_gpu = useGpu;
+      initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer);
+      kmaxSeqScoreLayer->forward(PASS_TRAIN);
+
+      const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue();
+      CHECK_EQ(outValue->getHeight(),
+               hasSubseq ? subSeqStartPosition.size() - 1
+                         : seqStartPosition.size() - 1);
+      CHECK_EQ(outValue->getWidth(), beamSize);
+      checkLayerOut(groundTruth, outValue->getData(), beamSize);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand((size_t)(time(NULL)));
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_LayerGrad.cpp b/paddle/legacy/gserver/tests/test_LayerGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..979cf8ee673291d66f8704f2deda6c7160f4b228
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_LayerGrad.cpp
@@ -0,0 +1,2532 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+#include <cudnn.h>
+#endif
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/math/MathUtils.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
+
+TEST(Operator, dot_mul) {
+  TestConfig config;
+  config.layerConfig.set_size(10);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
+  operatorConf.set_type("dot_mul");
+  operatorConf.set_dotmul_scale(-1);
+
+  testOperatorGrad(config, operatorConf, 100, false, false);
+}
+
+TEST(Projection, context) {
+  for (auto contextStart : {-5, -3, -1, 0, 3}) {
+    for (auto contextLength : {1, 2, 5, 7}) {
+      for (auto batchSize : {1, 2, 5, 20}) {
+        for (auto trainablePadding : {false, true}) {
+          LOG(INFO) << " contextStart=" << contextStart
+                    << " contextLength=" << contextLength
+                    << " batchSize=" << batchSize
+                    << " trainablePadding=" << trainablePadding;
+          ProjectionConfig conf;
+          conf.set_type("context");
+          conf.set_input_size(10);
+          conf.set_context_start(contextStart);
+          conf.set_context_length(contextLength);
+          conf.set_trainable_padding(trainablePadding);
+          conf.set_output_size(conf.context_length() * conf.input_size());
+          int pad =
+              std::max(0, -conf.context_start()) +
+              std::max(0, conf.context_start() + conf.context_length() - 1);
+          for (auto useGpu : {false, true}) {
+            testProjectionGrad(
+                conf,
+                INPUT_SEQUENCE_DATA,
+                trainablePadding ? conf.input_size() * pad : 0,
+                batchSize,
+                useGpu,
+                contextStart + contextLength <= 1);  // = testState
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Projection, trans_fc) {
+  ProjectionConfig conf;
+  conf.set_type("trans_fc");
+  conf.set_input_size(50);
+  conf.set_output_size(20);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 1000,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, fc) {
+  ProjectionConfig conf;
+  conf.set_type("fc");
+  conf.set_input_size(10);
+  conf.set_output_size(20);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 200,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, dot_mul) {
+  ProjectionConfig conf;
+  conf.set_type("dot_mul");
+  conf.set_input_size(20);
+  conf.set_output_size(20);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 20,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, table) {
+  ProjectionConfig conf;
+  conf.set_type("table");
+  conf.set_input_size(10);
+  conf.set_output_size(20);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_LABEL,
+                       /* parameterSize */ 200,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, identity) {
+  ProjectionConfig conf;
+  conf.set_type("identity");
+  conf.set_input_size(10);
+  conf.set_output_size(10);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 0,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+TEST(Projection, slice) {
+  ProjectionConfig conf;
+  conf.set_type("slice");
+  conf.set_input_size(100);
+  SliceConfig& slice1 = *conf.add_slices();
+  slice1.set_start(10);
+  slice1.set_end(20);
+  SliceConfig& slice2 = *conf.add_slices();
+  slice2.set_start(50);
+  slice2.set_end(70);
+  conf.set_output_size(30);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 0,
+                       /* batchSize */ 10,
+                       useGpu);
+  }
+}
+
+TEST(Projection, scaling) {
+  ProjectionConfig conf;
+  conf.set_type("scaling");
+  conf.set_input_size(10);
+  conf.set_output_size(10);
+  for (auto useGpu : {false}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 1,
+                       /* batchSize */ 100,
+                       useGpu);
+  }
+}
+
+void testProjectionConv(size_t groups, bool isDeconv) {
+  const int NUM_FILTERS = 18;
+  const int FILTER_SIZE = 2;
+  const int FILTER_SIZE_Y = 2;
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 16;
+
+#if CUDNN_VERSION >= 6000
+  const int DILATION = 2;
+#else
+  const int DILATION = 1;
+#endif
+
+  ProjectionConfig conf;
+  if (isDeconv) {
+    conf.set_type("convt");
+  } else {
+    conf.set_type("conv");
+  }
+  conf.set_num_filters(NUM_FILTERS);
+
+  ConvConfig* conv = conf.mutable_conv_conf();
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_channels(CHANNELS);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_dilation(DILATION);
+  conv->set_dilation_y(DILATION);
+  conv->set_groups(groups);
+  if (isDeconv) {
+    conv->set_filter_channels(NUM_FILTERS / conv->groups());
+  } else {
+    conv->set_filter_channels(conv->channels() / conv->groups());
+  }
+  conv->set_img_size(IMAGE_SIZE);
+  int output_x = outputSize(conv->img_size(),
+                            (conv->filter_size() - 1) * DILATION + 1,
+                            conv->padding(),
+                            conv->stride(),
+                            /* caffeMode */ true);
+  int output_y = outputSize(conv->img_size(),
+                            (conv->filter_size_y() - 1) * DILATION + 1,
+                            conv->padding_y(),
+                            conv->stride_y(),
+                            /* caffeMode */ true);
+  conv->set_output_x(output_x);
+  conv->set_output_y(output_y);
+  LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x
+            << "; output_y: " << output_y;
+  if (isDeconv) {
+    int deconv_image_x = imageSize(output_x,
+                                   (conv->filter_size() - 1) * DILATION + 1,
+                                   conv->padding(),
+                                   conv->stride(),
+                                   /* caffeMode */ true);
+    int deconv_image_y = imageSize(output_y,
+                                   (conv->filter_size_y() - 1) * DILATION + 1,
+                                   conv->padding_y(),
+                                   conv->stride_y(),
+                                   /* caffeMode */ true);
+
+    LOG(INFO) << " deconv_image_x: " << deconv_image_x
+              << "; deconv_image_y: " << deconv_image_y;
+    conf.set_input_size(output_x * output_y * CHANNELS);
+    conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS);
+  } else {
+    conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
+    conf.set_output_size(output_x * output_y * NUM_FILTERS);
+  }
+
+  testProjectionGrad(conf,
+                     INPUT_DATA,
+                     /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE *
+                         FILTER_SIZE_Y / groups,
+                     /* batchSize */ 100,
+                     true,
+                     false,
+                     NUM_FILTERS,
+                     true);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(Projection, conv) {
+  /// test ConvProjection
+  testProjectionConv(1, false);
+  testProjectionConv(3, false);
+  /// test ConvTransProjection
+  testProjectionConv(1, true);
+  testProjectionConv(3, true);
+}
+#endif
+
+TEST(Layer, BilinearInterpLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("bilinear_interp");
+  config.biasSize = 0;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  BilinearInterpConfig* bilinear = input->mutable_bilinear_interp_conf();
+  ImageConfig* image = bilinear->mutable_image_conf();
+  image->set_img_size(32);
+  image->set_img_size_y(32);
+  image->set_channels(4);
+
+  for (auto useGpu : {false, true}) {
+    for (auto outSize : {32, 64}) {
+      bilinear->set_out_size_x(outSize);
+      bilinear->set_out_size_y(outSize);
+      testLayerGrad(config, "bilinear_interp", 10, false, useGpu);
+    }
+  }
+}
+
+TEST(Layer, concat) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("concat");
+  config.layerConfig.set_size(15);
+  config.layerConfig.set_active_type("sigmoid");
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "concat", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, AddtoLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("addto");
+  config.layerConfig.set_size(10);
+  config.layerConfig.set_active_type("sigmoid");
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "addto", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, CTCLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("ctc");
+  config.layerConfig.set_norm_by_times(false);
+  config.layerConfig.set_size(10);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "ctc",
+                  100,
+                  /* trans */ false, /* useGpu */
+                  useGpu);
+  }
+}
+
+TEST(Layer, cosSimLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("cos");
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 50, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "cos", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, CosSimVecMatLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("cos_vm");
+  config.layerConfig.set_size(5);  // output size
+  config.layerConfig.set_cos_scale(2.0);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "cos_vm", 100, false, useGpu);
+  }
+}
+
+void testDepthwiseConvLayer(const string& type, bool useGpu) {
+  TestConfig config;
+  config.biasSize = 32;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(32);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(3);
+  conv->set_channels(16);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(16);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(16);
+  conv->set_img_size_y(8);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /* caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /* caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              config.layerConfig.num_filters());
+
+  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
+}
+
+TEST(Layer, depthwiseConvLayer) {
+  //  'depthwise_conv' is a sepecial case of 'exconv' whose
+  //  groups size equals to the input channels size.
+  testDepthwiseConvLayer("exconv", /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  testDepthwiseConvLayer("exconv", /* useGpu= */ true);
+#endif
+}
+
+void testConvLayer(const string& type, bool trans, bool useGpu) {
+  TestConfig config;
+  config.biasSize = 16;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(16);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  int dilation = 2;
+  if (type == "cudnn_conv") {
+#if CUDNN_VERSION >= 6000
+    dilation = 2;
+#else
+    dilation = 1;
+#endif
+  }
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(2);
+  conv->set_channels(3);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_dilation(dilation);
+  conv->set_dilation_y(dilation);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(16);
+  conv->set_img_size_y(16);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                (conv->filter_size() - 1) * dilation + 1,
+                                conv->padding(),
+                                conv->stride(),
+                                /* caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                (conv->filter_size_y() - 1) * dilation + 1,
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /* caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              config.layerConfig.num_filters());
+
+  testLayerGrad(config, "conv", 100, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "conv", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, convLayer) {
+  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
+  testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void testConvTransLayer(const string& type, bool trans, bool useGpu) {
+  TestConfig config;
+  config.biasSize = 3;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(3);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(4);
+  conv->set_channels(16);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(1);
+  conv->set_filter_channels(3 / conv->groups());
+  conv->set_img_size(16);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /* caffeMode */ true));
+
+  config.layerConfig.set_size(conv->img_size() * conv->img_size() *
+                              config.layerConfig.num_filters());
+
+  testLayerGrad(config, "convTrans", 100, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "convTrans", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, convTransLayer) {
+  for (auto useGpu : {false, true}) {
+    testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
+  }
+#ifdef PADDLE_WITH_CUDA
+  testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+TEST(Layer, blockExpandLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("blockexpand");
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 6144, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  BlockExpandConfig* blockExpand = input->mutable_block_expand_conf();
+  blockExpand->set_img_size_x(64);
+  blockExpand->set_img_size_y(32);
+  blockExpand->set_channels(3);
+  blockExpand->set_padding_x(0);
+  blockExpand->set_padding_y(0);
+  blockExpand->set_block_x(4);
+  blockExpand->set_block_y(32);
+  blockExpand->set_stride_x(2);
+  blockExpand->set_stride_y(2);
+  blockExpand->set_output_x(outputSize(blockExpand->img_size_x(),
+                                       blockExpand->block_x(),
+                                       blockExpand->padding_x(),
+                                       blockExpand->stride_x(),
+                                       /* caffeMode */ false));
+  blockExpand->set_output_y(outputSize(blockExpand->img_size_y(),
+                                       blockExpand->block_y(),
+                                       blockExpand->padding_y(),
+                                       blockExpand->stride_y(),
+                                       /* caffeMode */ false));
+  config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() *
+                              blockExpand->channels());
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "blockexpand", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, maxoutLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("maxout");
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  MaxOutConfig* maxout = input->mutable_maxout_conf();
+  ImageConfig* image = maxout->mutable_image_conf();
+
+  image->set_img_size(32);
+  image->set_img_size_y(32);
+  image->set_channels(4);
+  maxout->set_groups(2);
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "maxout", 10, false, useGpu);
+  }
+}
+
+void testFcLayer(string format, size_t nnz) {
+  TestConfig config;
+  config.biasSize = 1024;
+  config.layerConfig.set_type("fc");
+  config.layerConfig.set_size(1024);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_drop_rate(0.1);
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
+  config.layerConfig.add_inputs();
+
+  LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
+            << config.inputDefs[0].sparse.format;
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "fc",
+                  100,
+                  /* trans */ false,
+                  useGpu,
+                  /* weight */ true);
+  }
+}
+
+TEST(Layer, fcLayer) {
+  testFcLayer("", 1024 * 1024 * 2);
+  testFcLayer("csc", 1024 * 10);
+  testFcLayer("csr", 1024 * 10);
+}
+
+TEST(Layer, SelectiveFullyConnectedLayer) {
+  TestConfig config;
+  size_t nin = 16;
+  size_t nout = 256;
+  config.layerConfig.set_type("selective_fc");
+  config.layerConfig.set_size(nout);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_has_selected_colums(true);
+  config.layerConfig.set_selective_fc_pass_generation(false);
+  config.biasSize = nout;
+
+  config.inputDefs.push_back({INPUT_DATA, "input0", nin, nin * nout});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)});
+  config.layerConfig.add_inputs();
+
+  testLayerGrad(config,
+                "selective_fc",
+                100,
+                /* trans= */ false,
+                /* useGup= */ false,
+                false);
+#ifdef PADDLE_WITH_CUDA
+  testLayerGrad(config,
+                "selective_fc",
+                100,
+                /* trans= */ false,
+                /* useGup= */ true,
+                false);
+#endif
+}
+
+TEST(Layer, DataNormLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("data_norm");
+  config.layerConfig.set_size(20);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 20, 100});
+  config.inputDefs.back().isStatic = true;
+  config.layerConfig.add_inputs();
+
+  for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) {
+    config.layerConfig.set_data_norm_strategy(strategy);
+    // The parameters are static, so not support GPU now
+    testLayerGrad(config,
+                  "data_norm",
+                  200,
+                  /* trans */ false,
+                  /* useGpu */ false);
+  }
+}
+
+TEST(Layer, hsigmoidLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("hsigmoid");
+  config.layerConfig.set_num_classes(5);
+  config.layerConfig.set_size(1);
+  config.biasSize = config.layerConfig.num_classes() - 1;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 200});
+  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 5, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "hsigmoid",
+                  100,
+                  /* trans */ false,
+                  /* useGpu */ useGpu);
+  }
+}
+
+TEST(Layer, multi_cross) {
+  TestConfig config;
+  config.layerConfig.set_type("multi-class-cross-entropy");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(
+        config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(Layer, multi_binary_label_sparse_mat) {
+  TestConfig config;
+  config.layerConfig.set_type("multi_binary_label_cross_entropy");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "multi_binary_label_cross_entropy",
+                  100,
+                  /* trans */ false,
+                  useGpu);
+  }
+}
+
+TEST(layer, multi_binary_label_id) {
+  TestConfig config;
+  config.layerConfig.set_type("multi_binary_label_cross_entropy");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "multi_binary_label_cross_entropy",
+                  100,
+                  /* trans */ false,
+                  useGpu);
+  }
+}
+
+TEST(Layer, multi_cross_with_selfnorm) {
+  TestConfig config;
+  config.layerConfig.set_type("multi_class_cross_entropy_with_selfnorm");
+  config.layerConfig.set_softmax_selfnorm_alpha(0.1);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  // Not support GPU now
+  testLayerGrad(config,
+                "multi_class_cross_entropy_with_selfnorm",
+                100,
+                /* trans */ false,
+                /* useGpu */ false);
+}
+
+TEST(Layer, multi_cross_soft) {
+  TestConfig config;
+  config.layerConfig.set_type("soft_binary_class_cross_entropy");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "soft_binary_class_cross_entropy",
+                  100,
+                  /* trans */ false,
+                  useGpu);
+  }
+}
+
+TEST(Layer, square_error) {
+  TestConfig config;
+  config.layerConfig.set_type("square_error");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(Layer, sparse_square_error) {
+  TestConfig config;
+  config.layerConfig.set_type("square_error");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_SPARSE_NON_VALUE_DATA, "layer_1", 50, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  // "GpuSparseMatrix" as label is not supported
+  testLayerGrad(config,
+                "square_error",
+                100,
+                /* trans */ false,
+                /* useGpu */ false);
+}
+
+TEST(Layer, sparse_float_square_error) {
+  TestConfig config;
+  config.layerConfig.set_type("square_error");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 50, 0});
+  config.inputDefs.push_back({INPUT_SPARSE_FLOAT_VALUE_DATA, "layer_1", 50, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  // "GpuSparseMatrix" as label is not supported
+  testLayerGrad(config,
+                "square_error",
+                100,
+                /* trans */ false,
+                /* useGpu */ false);
+}
+
+TEST(Layer, square_error_weighted) {
+  TestConfig config;
+  config.layerConfig.set_type("square_error");
+  config.biasSize = 0;
+  config.testAccumulate = false;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "square_error", 100, /* trans */ false, useGpu);
+  }
+}
+
+TEST(Layer, huber_regression_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("huber_regression");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto delta : {1, 3, 5}) {
+      config.layerConfig.set_delta(delta);
+      testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
+    }
+  }
+}
+
+TEST(Layer, huber_two_class) {
+  TestConfig config;
+  config.layerConfig.set_type("huber_classification");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.inputDefs.push_back({INPUT_LABEL, "layer_1", 2, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
+  }
+}
+
+void testExpandLayer(string trans_type, bool hasSubseq) {
+  TestConfig config;
+  config.layerConfig.set_type("expand");
+
+  config.inputDefs.push_back(
+      {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA,
+       "layer_0",
+       10,
+       0});
+  config.inputDefs.push_back(
+      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+       "layer_1",
+       10,
+       0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.set_trans_type(trans_type);
+  LOG(INFO) << " trans_type=" << trans_type << " hasSubseq=" << hasSubseq;
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "expand", 30, false, useGpu);
+  }
+}
+
+TEST(Layer, ExpandLayer) {
+  testExpandLayer("non-seq", false);  // non-seq expand to seq
+  testExpandLayer("non-seq", true);   // non-seq expand to hasSubseq
+  testExpandLayer("seq", true);       // seq expand to hasSubseq
+}
+
+void testDegradeLayer(bool hasSubseq,
+                      string layer_type,
+                      string trans_type,
+                      int stride) {
+  TestConfig config;
+  config.layerConfig.set_type(layer_type);
+  config.layerConfig.set_size(10);
+  config.layerConfig.set_seq_pool_stride(stride);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back(
+      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+       "layer_0",
+       10,
+       0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.set_trans_type(trans_type);
+
+  auto testDegradeLayerGrad = [](TestConfig& config, string layer_type) {
+    for (auto useGpu : {false, true}) {
+      testLayerGrad(config, layer_type, 100, false, useGpu);
+    }
+  };
+
+  if (layer_type == "average") {
+    for (auto strategy : {"average", "sum", "squarerootn"}) {
+      LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
+                << " average_strategy=" << strategy
+                << " seq_pool_stride=" << stride;
+      config.layerConfig.set_average_strategy(strategy);
+      testDegradeLayerGrad(config, layer_type);
+    }
+  } else {
+    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
+              << " seq_pool_stride=" << stride;
+    testDegradeLayerGrad(config, layer_type);
+  }
+}
+
+TEST(Layer, MaxLayer) {
+  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
+  testDegradeLayer(false,
+                   "max",
+                   "non-seq",
+                   5);  // seq max to a shorten seq, stride window = 5
+  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
+}
+
+TEST(Layer, SequenceLastInstanceLayer) {
+  testDegradeLayer(false,
+                   "seqlastins",
+                   "non-seq",
+                   -1);  // seq seqlastins to non-seq
+  testDegradeLayer(false,
+                   "seqlastins",
+                   "non-seq",
+                   5);  // seq seqlastins to a shorten seq, stride window = 5
+  testDegradeLayer(true,
+                   "seqlastins",
+                   "non-seq",
+                   -1);  // hasSubseq seqlastins to non-seq
+  testDegradeLayer(true,
+                   "seqlastins",
+                   "seq",
+                   -1);  // hasSubseq seqlastins to seq
+}
+
+TEST(Layer, AverageLayer) {
+  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(false,
+                   "average",
+                   "non-seq",
+                   5);  // seq average to a shorten seq, stride window = 5
+  testDegradeLayer(true,
+                   "average",
+                   "non-seq",
+                   -1);                          // hasSubseq average to non-seq
+  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
+}
+
+TEST(Layer, SequenceConcatLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("seqconcat");
+  config.layerConfig.set_size(10);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "seqconcat", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, SequenceReshapeLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("seqreshape");
+  config.layerConfig.set_size(10);
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 100, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "seqreshape", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, ConvShiftLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("conv_shift");
+  config.layerConfig.set_size(10);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 3, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  // Not support GPU now
+  testLayerGrad(config, "conv_shift", 100, false, false);
+}
+
+TEST(Layer, PowerLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("power");
+  config.layerConfig.set_size(10);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "power", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, ConvexCombinationLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("convex_comb");
+  config.layerConfig.set_size(20);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 100, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "convex_comb", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, InterpolationLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("interpolation");
+  config.layerConfig.set_size(10);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_2", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "interpolation", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, DotProdLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("dot_prod");
+  config.layerConfig.set_size(1);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "dot_prod", 10, false, useGpu);
+  }
+}
+
+TEST(Layer, OuterProdLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("out_prod");
+  config.layerConfig.set_size(100);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "out_prod", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, SlopeInterceptLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("slope_intercept");
+  config.layerConfig.set_size(10);
+  config.layerConfig.set_slope(1.0);
+  config.layerConfig.set_intercept(0.1);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "slope_intercept", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, ScalingLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("scaling");
+  config.layerConfig.set_size(10);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "scaling", 100, false, useGpu);
+  }
+}
+
+void testNormLayer(const string& normType, bool trans, bool useGpu) {
+  TestConfig config;
+  config.layerConfig.set_type("norm");
+  config.layerConfig.set_active_type("relu");
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1568, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type(normType);
+  norm->set_channels(16);
+  norm->set_size(5);
+  norm->set_scale(0.001);
+  norm->set_pow(0.75);
+  norm->set_blocked(0);
+  norm->set_img_size(14);
+  norm->set_img_size_y(7);
+  norm->set_output_x(norm->img_size());
+  norm->set_output_y(norm->img_size_y());
+  if (norm->norm_type() == "cmrnorm" ||
+      norm->norm_type() == "cmrnorm-projection") {
+    norm->set_scale(norm->scale() / norm->size());
+  } else {
+    norm->set_scale(norm->scale() / (norm->size() * norm->size()));
+  }
+
+  config.layerConfig.set_size(norm->output_x() * norm->output_y() *
+                              norm->channels());
+  config.biasSize = 0;
+
+  testLayerGrad(config, "norm", 100, trans, useGpu);
+}
+
+TEST(Layer, NormLayer) {
+  testNormLayer("cmrnorm-projection",
+                /* trans= */ false, /* useGpu= */
+                true);
+  testNormLayer("cmrnorm-projection",
+                /* trans= */ false, /* useGpu= */
+                false);
+}
+
+void setPoolConfig(TestConfig* config,
+                   PoolConfig* pool,
+                   const string& poolType) {
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool");
+  (*config).layerConfig.set_num_filters(16);
+
+  int kw = 3, kh = 3;
+  int pw = 0, ph = 0;
+  int sw = 2, sh = 2;
+  pool->set_pool_type(poolType);
+  pool->set_channels(16);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_start(0);
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+}
+
+void testPoolLayer(const string& poolType,
+                   bool trans,
+                   bool useGpu,
+                   bool excludeMode = true) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_img_size(14);
+  pool->set_img_size_y(14);
+  pool->set_exclude_mode(excludeMode);
+  setPoolConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  testLayerGrad(config, "pool", 100, trans, useGpu);
+}
+
+#ifdef PADDLE_WITH_CUDA
+void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_size_y(4);
+  pool->set_stride_y(3);
+  pool->set_img_size(10);
+  pool->set_img_size_y(20);
+  setPoolConfig(&config, pool, poolType);
+  pool->set_output_y((pool->img_size_y() - pool->start() - pool->size_y()) /
+                         ((float)pool->stride_y()) +
+                     1.5);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  testLayerGrad(config, "pool", 100, trans, useGpu);
+}
+#endif
+
+TEST(Layer, PoolLayer) {
+  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ false,
+                /* excludeMode= */ false);
+  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
+
+#ifdef PADDLE_WITH_CUDA
+  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ true,
+                /* excludeMode= */ false);
+  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2("cudnn-avg-incl-pad-pool",
+                 /* trans= */ false,
+                 /* useGpu= */ true);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void setPool3DConfig(TestConfig* config,
+                     PoolConfig* pool,
+                     const string& poolType) {
+  // filter size
+  const int NUM_FILTERS = 16;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+  const int CHANNELS = 16;
+
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool3d");
+  (*config).layerConfig.set_num_filters(NUM_FILTERS);
+
+  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
+  int pw = 0, ph = 0, pd = 0;
+  int sw = 2, sh = 2, sd = 2;
+
+  pool->set_pool_type(poolType);
+  pool->set_pool_type("avg");
+  pool->set_channels(CHANNELS);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_size_z(kd);
+  pool->set_padding(0);
+  pool->set_padding_y(0);
+  pool->set_padding_z(0);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+  pool->set_stride_z(sd);
+  pool->set_start(0);
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+  pool->set_output_z(od);
+}
+
+void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  pool->set_img_size(IMAGE_SIZE);
+  pool->set_img_size_y(IMAGE_SIZE_Y);
+  pool->set_img_size_z(IMAGE_SIZE_Z);
+
+  setPool3DConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  testLayerGrad(config, "pool3d", 100, trans, useGpu);
+}
+
+TEST(Layer, Pool3DLayer) {
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void testSppLayer(const string& poolType,
+                  const int pyramidHeight,
+                  bool trans,
+                  bool useGpu) {
+  TestConfig config;
+  config.layerConfig.set_type("spp");
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  SppConfig* sppConfig = input->mutable_spp_conf();
+  sppConfig->set_pool_type(poolType);
+  sppConfig->set_pyramid_height(pyramidHeight);
+  ImageConfig* imageConfig = sppConfig->mutable_image_conf();
+  imageConfig->set_channels(16);
+  imageConfig->set_img_size(10);
+  imageConfig->set_img_size_y(20);
+  int outputSize = (std::pow(4, sppConfig->pyramid_height()) - 1) / (4 - 1);
+  config.layerConfig.set_size(outputSize * imageConfig->channels());
+  testLayerGrad(config, "spp", 100, trans, useGpu);
+}
+
+TEST(Layer, SpatialPyramidPoolLayer) {
+  for (auto useGpu : {false, true}) {
+    for (auto pyramidHeight : {1, 2, 3}) {
+      testSppLayer("avg-projection", pyramidHeight, false, useGpu);
+      testSppLayer("max-projection", pyramidHeight, false, useGpu);
+    }
+  }
+}
+
+TEST(Layer, rankCostLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("rank-cost");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "rank-cost", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, sumCostLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("sum_cost");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "sum_cost", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, weightedRankCostLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("rank-cost");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_2", 1, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_3", 1, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "weighted-rank-cost", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, TensorLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("tensor");
+  config.layerConfig.set_size(10);
+  config.layerConfig.set_active_type("sigmoid");
+  config.biasSize = config.layerConfig.size();
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 5, 250});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 5, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "tensor", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, RecurrentLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("recurrent");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("tanh");
+  config.biasSize = 4;
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 4, /* paraSize= */ 16});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto reversed : {false, true}) {
+      config.layerConfig.set_reversed(reversed);
+      config.testState = !reversed;
+      testLayerGrad(
+          config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
+    }
+  }
+}
+
+TEST(Layer, LstmLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("lstmemory");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("tanh");
+  config.layerConfig.set_active_state_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 28;
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 64});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto reversed : {false, true}) {
+      config.layerConfig.set_reversed(reversed);
+      config.testState = !reversed;
+      testLayerGrad(
+          config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
+    }
+  }
+  for (auto useGpu : {true}) {
+    config.testBatchState = true;
+    config.layerConfig.set_reversed(false);
+    testLayerGrad(config, "lstmemory", 10, /* trans= */ false, useGpu);
+  }
+}
+
+TEST(Layer, MDLstmLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("mdlstmemory");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_state_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 4 * 9;
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_MDIM_DATA, "layer_0", 4 * 5, 4 * 4 * 5});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_directions(true);
+  config.layerConfig.add_directions(true);
+
+  for (auto useGpu : {false, true}) {
+    for (int i = 0; i < 2; i++) {
+      for (int j = 0; j < 2; j++) {
+        config.layerConfig.set_directions(0, bool(i));
+        config.layerConfig.set_directions(1, bool(j));
+        testLayerGrad(config, "mdlstmemory", 100, false, useGpu);
+      }
+    }
+  }
+}
+
+TEST(Layer, ParameterReluLayer) {
+  auto testParameterReluLayer = [&](size_t inputSize, size_t channels) {
+    TestConfig config;
+    config.layerConfig.set_type("prelu");
+    config.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, channels});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_size(inputSize);
+    config.layerConfig.set_partial_sum(inputSize /
+                                       channels);  // size of feature map
+    for (auto useGpu : {false, true}) {
+      testLayerGrad(config, "prelu", 100, false, useGpu);
+    }
+  };
+
+  testParameterReluLayer(192, 1);
+  testParameterReluLayer(192, 3);
+  testParameterReluLayer(192, 192);
+}
+
+TEST(Layer, ResizeLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("resize");
+  config.layerConfig.set_size(64);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 16, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "resize", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, RotateLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("rotate");
+  const int CHANNEL = 2;
+  const int HEIGHT = 8;
+  const int WIDTH = 4;
+  const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL;
+  config.layerConfig.set_size(INPUT_SIZE);
+  config.layerConfig.set_height(HEIGHT);
+  config.layerConfig.set_width(WIDTH);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "rotate", 100, false, useGpu);
+  }
+}
+
+TEST(Layer, NCELayer) {
+  TestConfig config;
+  size_t numClasses = 4;
+  config.layerConfig.set_type("nce");
+  config.layerConfig.set_size(1);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_num_classes(numClasses);
+  config.biasSize = numClasses;
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 16 * numClasses});
+  config.inputDefs.push_back(
+      {INPUT_LABEL, "label", /* dim= */ numClasses, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto withWeight : {false, true}) {
+    if (withWeight) {
+      config.inputDefs.push_back(
+          {INPUT_DATA_TARGET, "weight", /* dim= */ 1, /* paraSize= */ 0});
+      config.layerConfig.add_inputs();
+    }
+
+    for (auto isIdLabel : {false, true}) {
+      config.inputDefs[1] = {
+          isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA,
+          "label",
+          /* dim= */ numClasses,
+          /* paraSize= */ 0};
+
+      for (auto withDist : {false, true}) {
+        config.layerConfig.clear_neg_sampling_dist();
+        if (withDist) {
+          double sum = 0;
+          for (size_t i = 0; i < numClasses; ++i) {
+            real p = rand();  // NOLINT use rand_r
+            config.layerConfig.add_neg_sampling_dist(p);
+            sum += p;
+          }
+          for (size_t i = 0; i < numClasses; ++i) {
+            real p = config.layerConfig.neg_sampling_dist(i) / sum;
+            config.layerConfig.set_neg_sampling_dist(i, p);
+          }
+        }
+        LOG(INFO) << "NCELayer "
+                  << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight
+                  << " withDist=" << withDist;
+        // Not support GPU now
+        testLayerGrad(config,
+                      "nce",
+                      100,
+                      /* trans= */ false,
+                      /* useGpu */ false);
+      }
+    }
+  }
+}
+
+TEST(Layer, GatedRecurrentLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("gated_recurrent");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 12;
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto reversed : {false, true}) {
+      config.layerConfig.set_reversed(reversed);
+      config.testState = !reversed;
+      testLayerGrad(config, "gated_recurrent", 100, /* trans= */ false, useGpu);
+    }
+  }
+}
+
+TEST(Layer, GruStepLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("gru_step");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 12;
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ 12, /* paraSize= */ 48});
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "gruStep", 100, /* trans= */ false, useGpu);
+  }
+}
+
+TEST(Layer, LstmStepLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("lstm_step");
+  config.layerConfig.set_size(4);
+  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_state_type("sigmoid");
+  config.layerConfig.set_active_gate_type("sigmoid");
+  config.biasSize = 12;
+  config.testAccumulate = false;
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ 16, /* paraSize= */ 0});
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_1", /* dim= */ 4, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "lstmStep", 100, /* trans= */ false, useGpu);
+  }
+}
+
+void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
+  TestConfig config;
+  const int CHANNELS = 10;
+  const int IMG_SIZE = 16;
+  const int IMG_SIZE_Y = 8;
+  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+  config.biasSize = CHANNELS;
+  config.inputDefs.push_back({INPUT_DATA,
+                              "layer_0",
+                              /* dim= */ size,
+                              /* paraSize= */ CHANNELS});
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
+  config.inputDefs.back().isStatic = true;
+  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
+  config.inputDefs.back().isStatic = true;
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(CHANNELS);
+  img_conf->set_img_size(IMG_SIZE);
+  img_conf->set_img_size_y(IMG_SIZE_Y);
+
+  testLayerGrad(config,
+                "batch_norm",
+                64,
+                /* trans= */ trans,
+                useGpu,
+                /* useWeight */ true);
+}
+
+TEST(Layer, BatchNormalizationLayer) {
+  testBatchNormLayer("batch_norm", false, false);
+#ifdef PADDLE_WITH_CUDA
+  testBatchNormLayer("batch_norm", false, true);
+  if (hl_get_cudnn_lib_version() >= int(4000)) {
+    testBatchNormLayer("cudnn_batch_norm", false, true);
+  }
+#endif
+}
+
+void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
+  TestConfig config;
+  const int CHANNELS = 10;
+  const int IMG_SIZE = 16;
+  const int IMG_SIZE_Y = 8;
+  const int IMG_SIZE_Z = 8;
+  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+  config.biasSize = CHANNELS;
+  config.inputDefs.push_back({INPUT_DATA,
+                              "layer_0",
+                              /* dim= */ size,
+                              /* paraSize= */ CHANNELS});
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
+  config.inputDefs.back().isStatic = true;
+  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
+  config.inputDefs.back().isStatic = true;
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(CHANNELS);
+  img_conf->set_img_size(IMG_SIZE);
+  img_conf->set_img_size_y(IMG_SIZE_Y);
+  img_conf->set_img_size_z(IMG_SIZE_Z);
+
+  testLayerGrad(config,
+                "batch_norm",
+                64,
+                /* trans= */ trans,
+                useGpu,
+                /* useWeight */ true);
+}
+
+TEST(Layer, testBatchNorm3DLayer) {
+  testBatchNorm3DLayer("batch_norm", false, false);
+#ifdef PADDLE_WITH_CUDA
+  testBatchNorm3DLayer("batch_norm", false, true);
+  if (hl_get_cudnn_lib_version() >= int(4000)) {
+    testBatchNorm3DLayer("cudnn_batch_norm", false, true);
+  }
+#endif
+}
+
+void testConvOperator(bool isDeconv) {
+  TestConfig config;
+  const int NUM_FILTERS = 16;
+  const int FILTER_SIZE = 2;
+  const int FILTER_SIZE_Y = 3;
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 16;
+  const int IMAGE_SIZE_Y = 9;
+  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
+  if (isDeconv) {
+    operatorConf.set_type("convt");
+  } else {
+    operatorConf.set_type("conv");
+  }
+  ConvConfig* conv = operatorConf.mutable_conv_conf();
+  operatorConf.set_num_filters(NUM_FILTERS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_channels(CHANNELS);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(1);
+  conv->set_img_size(IMAGE_SIZE);
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /*  caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /*  caffeMode */ true));
+
+  if (isDeconv) {
+    conv->set_filter_channels(NUM_FILTERS / conv->groups());
+    config.inputDefs.push_back({INPUT_DATA,
+                                "layer_0",
+                                conv->output_x() * conv->output_y() * CHANNELS,
+                                0});
+    config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS);
+  } else {
+    conv->set_filter_channels(conv->channels() / conv->groups());
+    config.inputDefs.push_back(
+        {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
+    config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                                NUM_FILTERS);
+  }
+
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_1",
+       FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS,
+       0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
+}
+
+TEST(Operator, conv) {
+  testConvOperator(/*isDeconv*/ true);
+  testConvOperator(/*isDeconv*/ false);
+}
+
+TEST(Layer, FeatureMapExpandLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("featmap_expand");
+  const int CHANNELS = 10;
+  const int INPUT_SIZE = 100;
+  config.layerConfig.set_size(INPUT_SIZE * CHANNELS);
+  config.layerConfig.set_num_filters(CHANNELS);
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
+                              "layer_0",
+                              /* dim= */ INPUT_SIZE,
+                              /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    for (auto asRowVec : {false, true}) {
+      config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec");
+      testLayerGrad(config,
+                    "featmap_expand",
+                    /*batch_size*/ 100,
+                    /* trans= */ false,
+                    useGpu,
+                    /* useWeight */ true);
+    }
+  }
+}
+
+TEST(Layer, MultiplexLayer) {
+  TestConfig config;
+  const int LAYER_SIZE = 100;
+  config.layerConfig.set_type("multiplex");
+  config.layerConfig.set_size(LAYER_SIZE);
+
+  config.inputDefs.push_back({INPUT_LABEL, "layer_0", 2, 0});
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_1", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_2", /* dim= */ LAYER_SIZE, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "multiplex", 512, /* trans= */ false, useGpu);
+  }
+}
+
+TEST(Layer, PadLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("pad");
+
+  int c = 4;
+  int h = 31;
+  int w = 36;
+  size_t size = c * h * w;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PadConfig* pad = input->mutable_pad_conf();
+  ImageConfig* image = pad->mutable_image_conf();
+
+  image->set_channels(c);
+  image->set_img_size(h);
+  image->set_img_size_y(w);
+  pad->add_pad_c(1);
+  pad->add_pad_c(2);
+  pad->add_pad_h(2);
+  pad->add_pad_h(3);
+  pad->add_pad_w(3);
+  pad->add_pad_w(5);
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "pad", 10, false, useGpu);
+  }
+}
+
+TEST(Layer, CrossChannelNormLayer) {
+  TestConfig config;
+  config.paramInitialMean = 1.;
+  config.paramInitialStd = 0.;
+  config.layerConfig.set_type("norm");
+  config.layerConfig.set_size(100);
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cross-channel-norm");
+  norm->set_channels(10);
+  norm->set_size(100);
+  norm->set_scale(0);
+  norm->set_pow(0);
+  norm->set_blocked(0);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
+  }
+}
+
+TEST(Layer, smooth_l1) {
+  TestConfig config;
+  config.layerConfig.set_type("smooth_l1");
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, multibox_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("multibox_loss");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
+  multiboxLoss->set_num_classes(21);
+  multiboxLoss->set_input_num(1);
+  multiboxLoss->set_overlap_threshold(0.5);
+  multiboxLoss->set_neg_pos_ratio(3);
+  multiboxLoss->set_neg_overlap(0.5);
+  multiboxLoss->set_background_id(0);
+  multiboxLoss->set_height(3);
+  multiboxLoss->set_width(3);
+
+  size_t gtNum = 1;
+  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
+  labelValue->randomizeUniform();
+  labelValue->add(-0.5);
+  labelValue->sigmoid(*labelValue);
+  real* labelData = labelValue->getData();
+  size_t labelWidth = labelValue->getWidth();
+  for (size_t i = 0; i < gtNum; ++i) {
+    *(labelData + i * labelWidth) = std::rand() % 20 + 1;
+    *(labelData + i * labelWidth + 1) = 0.400259;
+    *(labelData + i * labelWidth + 2) = 0.377857;
+    *(labelData + i * labelWidth + 3) = 0.525712;
+    *(labelData + i * labelWidth + 4) = 0.519368;
+  }
+  vector<int> seqStartPositions(gtNum + 1, 0);
+  for (size_t i = 1; i <= gtNum; ++i) {
+    seqStartPositions[i] = i;
+  }
+
+  // Ensure at lease one matched bbox
+  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
+  priorValue->randomizeUniform();
+  priorValue->add(-0.5);
+  priorValue->sigmoid(*priorValue);
+  real* priorData = priorValue->getData();
+  *(priorData) = 0.424811;
+  *(priorData + 1) = 0.397059;
+  *(priorData + 2) = 0.538905;
+  *(priorData + 3) = 0.447091;
+  *(priorData + 4) = 0.425720;
+  *(priorData + 5) = 0.515228;
+  *(priorData + 6) = 0.519452;
+  *(priorData + 7) = 0.591065;
+
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
+  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
+  config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
+  }
+}
+
+TEST(Layer, TransLayer) {
+  TestConfig config;
+  const int height = 128;
+  const int width = 256;
+  config.layerConfig.set_type("trans");
+  config.layerConfig.set_size(width);
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "trans", height, /* trans= */ false, useGpu);
+  }
+}
+
+TEST(Layer, RowConvLayer) {
+  const int context = 3;
+  const int size = 512;
+
+  TestConfig config;
+  config.layerConfig.set_type("row_conv");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  RowConvConfig* conv = input->mutable_row_conv_conf();
+  conv->set_context_length(context);
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, CropLayer) {
+  TestConfig config;
+  // config input_0
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  config.layerConfig.set_axis(2);
+  config.layerConfig.add_offset(0);
+  config.layerConfig.add_offset(0);
+
+  // config input_1
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0});
+  input = config.layerConfig.add_inputs();
+  img = input->mutable_image_conf();
+  img->set_channels(2);
+  img->set_img_size(8);
+
+  // config crop layer
+  config.layerConfig.set_type("crop");
+  config.layerConfig.set_name("cropLayer");
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "crop", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, roi_pool) {
+  TestConfig config;
+  config.layerConfig.set_type("roi_pool");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
+  roiPoolConf->set_pooled_width(7);
+  roiPoolConf->set_pooled_height(7);
+  roiPoolConf->set_spatial_scale(1. / 16);
+  roiPoolConf->set_width(14);
+  roiPoolConf->set_height(14);
+
+  const size_t roiNum = 10;
+  const size_t roiDim = 10;
+  const size_t batchSize = 5;
+  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+  roiValue->zeroMem();
+  real* roiData = roiValue->getData();
+  for (size_t i = 0; i < roiNum; ++i) {
+    roiData[i * roiDim + 0] = std::rand() % batchSize;
+    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
+    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
+    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
+    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
+    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
+    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
+  }
+
+  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, SwitchOrderLayer) {
+  TestConfig config;
+  // config input_0
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  img->set_img_size_y(16);
+
+  ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
+  reshape->add_height_axis(0);
+  reshape->add_height_axis(1);
+  reshape->add_height_axis(2);
+  reshape->add_width_axis(3);
+
+  // config softmax layer
+  config.layerConfig.set_type("switch_order");
+  config.layerConfig.set_name("switchOrderLayer");
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "switch_order", 100, false, useGpu, true);
+  }
+}
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+TEST(Layer, SubNestedSequenceLayer) {
+  // layer size is not crutial for this layer,
+  // so use a small layer size in unittest
+  const int layerSize = 4;
+
+  const int maxSeqNum = 50;
+  const int maxSeqLen = 50;
+  const int maxBeamSize = 32;
+
+  srand((size_t)(time(NULL)));
+  int beamSize = 1 + (rand() % maxBeamSize);
+
+  TestConfig config;
+  config.layerConfig.set_type("sub_nested_seq");
+  config.layerConfig.set_name("sub_nested_seq_layer");
+  config.layerConfig.set_size(layerSize);
+
+  int seqNum = 1 + (rand() % maxSeqNum);
+
+  // sequence information for the first input, it is a nested sequence
+  vector<int> seqStartPos(seqNum + 1, 0);
+  vector<int> subSeqStartPos(1, 0);
+
+  // selected indices
+  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
+  selectedIndices->one();
+  selectedIndices->mulScalar(-1.);
+  real* indicesData = selectedIndices->getData();
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqNum; ++j) {
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % maxSeqLen)));
+    }
+    vector<real> selSeqs =
+        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
+    memcpy(indicesData + (i * beamSize),
+           selSeqs.data(),
+           selSeqs.size() * sizeof(real));
+    seqStartPos[i + 1] = subSeqStartPos.back();
+  }
+
+  MatrixPtr seqInputPtr =
+      Matrix::create(seqStartPos.back(), layerSize, false, false);
+  seqInputPtr->randomizeUniform();
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                              "nested_seq_input",
+                              seqInputPtr,
+                              seqStartPos,
+                              subSeqStartPos});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sub_nested_seq",
+                  /* batchSize */ seqNum,
+                  /* trans */ false,
+                  /* useGpu*/ useGpu,
+                  /* useWeight */ false);
+  }
+}
+
+TEST(Layer, ClipLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("clip");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ClipConfig* layerConf = input->mutable_clip_conf();
+  double p1 = std::rand() / (double)RAND_MAX;
+  double p2 = std::rand() / (double)RAND_MAX;
+  layerConf->set_min(std::min(p1, p2));
+  layerConf->set_max(std::max(p1, p2));
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, RowL2NormLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("row_l2_norm");
+  config.layerConfig.set_size(size);
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
+  }
+}
+
+void test3DConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  // const int CHANNELS = 3;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  // Setting up conv3D-trans layer
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+
+  conv->set_channels(CHANNELS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_filter_size_z(FILTER_SIZE_Z);
+  conv->set_padding(0);
+  conv->set_padding_y(0);
+  conv->set_padding_z(0);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_stride_z(2);
+  conv->set_img_size(IMAGE_SIZE);
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_img_size_z(IMAGE_SIZE_Z);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /*  caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /*  caffeMode */ true));
+  conv->set_output_z(outputSize(conv->img_size_z(),
+                                conv->filter_size_z(),
+                                conv->padding_z(),
+                                conv->stride_z(),
+                                /*  caffeMode */ true));
+
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              conv->output_z() * NUM_FILTERS);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
+       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
+           NUM_FILTERS});
+
+  testLayerGrad(config, "conv3D", 10, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, test3DConvLayer) {
+  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  // const int CHANNELS = 3;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 4;
+  const int IMAGE_SIZE_Y = 6;
+  const int IMAGE_SIZE_Z = 6;
+
+  // Setting up conv-trans layer
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type("deconv3d");
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+
+  conv->set_channels(CHANNELS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_filter_size_z(FILTER_SIZE_Z);
+  conv->set_padding(0);
+  conv->set_padding_y(0);
+  conv->set_padding_z(0);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_stride_z(2);
+  conv->set_output_x(IMAGE_SIZE);
+  conv->set_output_y(IMAGE_SIZE_Y);
+  conv->set_output_z(IMAGE_SIZE_Z);
+
+  conv->set_img_size(imageSize(conv->output_x(),
+                               conv->filter_size(),
+                               conv->padding(),
+                               conv->stride(),
+                               true));
+  conv->set_img_size_y(imageSize(conv->output_y(),
+                                 conv->filter_size_y(),
+                                 conv->padding_y(),
+                                 conv->stride_y(),
+                                 true));
+  conv->set_img_size_z(imageSize(conv->output_z(),
+                                 conv->filter_size_z(),
+                                 conv->padding_z(),
+                                 conv->stride_z(),
+                                 true));
+  config.layerConfig.set_size(conv->img_size() * conv->img_size_y() *
+                              conv->img_size_z() * NUM_FILTERS);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
+       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
+           NUM_FILTERS});
+
+  testLayerGrad(config, "deconv3D", 10, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, test3DDeConvLayer) {
+  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+TEST(Layer, ScaleShiftLayer) {
+  // FIXME: Disable ScaleShiftLayer because it is not stable.
+  // https://github.com/PaddlePaddle/Paddle/issues/7781
+  return;
+  //  const size_t batchSize = 16;
+  //  const size_t size = 32;
+  //  TestConfig config;
+  //  config.layerConfig.set_type("scale_shift");
+  //  config.layerConfig.set_size(size);
+  //  config.biasSize = 1;
+  //  config.inputDefs.push_back(
+  //      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
+  //  config.layerConfig.add_inputs();
+  //  for (auto useGpu : {false, true}) {
+  //    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
+  //  }
+}
+
+TEST(Layer, ScaleSubRegionLayer) {
+  const size_t batchSize = 64;
+  const size_t size = 4096;
+  TestConfig config;
+  config.layerConfig.set_type("scale_sub_region");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
+  auto* data = indicesV->getData();
+  for (size_t i = 0; i < batchSize; ++i) {
+    data[i * 2] = 2;
+    data[i * 2 + 1] = 4;
+    data[i * 2 + 2] = 16;
+    data[i * 2 + 3] = 32;
+    data[i * 2 + 4] = 16;
+    data[i * 2 + 5] = 32;
+  }
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ScaleSubRegionConfig* scaleSubRegionConf =
+      input->mutable_scale_sub_region_conf();
+  ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
+  imgConf->set_img_size(32);
+  imgConf->set_img_size_y(32);
+  imgConf->set_channels(4);
+  scaleSubRegionConf->set_value(2.0);
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, L2DistanceLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("l2_distance");
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+
+  const size_t input_dim = 27;
+  const size_t batch_size = 11;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
+  }
+}
+
+void testFactorizationMachineLayer(InputType type, bool useGpu) {
+  const int FACTOR_SIZE = 10;
+  TestConfig config;
+  config.layerConfig.set_type("factorization_machine");
+  config.layerConfig.set_factor_size(FACTOR_SIZE);
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+  config.inputDefs.push_back({type, "layer_0", 128, 1280});
+  config.layerConfig.add_inputs();
+  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
+}
+
+TEST(Layer, FactorizationMachineLayer) {
+  for (auto useGpu : {false, true}) {
+    testFactorizationMachineLayer(INPUT_DATA, useGpu);
+  }
+  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp b/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7082c1363a4cdadfd0e4a4497c20ae5c513bc7f1
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_LinearChainCRF.cpp
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <vector>
+#include "paddle/legacy/gserver/layers/LinearChainCRF.h"
+#include "paddle/legacy/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static inline bool getNextSequence(vector<int>& seq, int numClasses) {
+  for (auto& v : seq) {
+    if (++v < numClasses) {
+      return true;
+    }
+    v = 0;
+  }
+  return false;
+}
+
+TEST(LinearChainCRF, decoding) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();
+  real* b = para.getData() + numClasses;
+  real* w = para.getData() + 2 * numClasses;
+  LinearChainCRF crf(4, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+      vector<int> decodingResult(length);
+      vector<int> bestResult(length);
+      vector<int> testResult(length, 0);
+      crf.decode(x.getData(), &decodingResult[0], length);
+      real bestScore = -std::numeric_limits<real>::max();
+      do {
+        real score = a[testResult.front()] + b[testResult.back()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        if (score > bestScore) {
+          bestScore = score;
+          bestResult = testResult;
+        }
+      } while (getNextSequence(testResult, numClasses));
+      for (int k = 0; k < length; ++k) {
+        EXPECT_EQ(decodingResult[k], bestResult[k]);
+      }
+    }
+  }
+}
diff --git a/paddle/legacy/gserver/tests/test_MKLDNN.cpp b/paddle/legacy/gserver/tests/test_MKLDNN.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c79ccd1956c5c68e5c97c2a185230b8ea9c3dea0
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_MKLDNN.cpp
@@ -0,0 +1,448 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/utils/PythonUtil.h>
+#include <string>
+#include <vector>
+#include "MKLDNNTester.h"
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/activations/MKLDNNActivation.h"
+#include "paddle/legacy/math/MathUtils.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_bool(use_mkldnn);
+
+#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC)         \
+  MKLDNNTester tester;                                        \
+  for (auto bs : {DESC.bs, 1}) {                              \
+    tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw); \
+  }
+
+#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC) \
+  TestConfig ref = DNN_CONFIG;                            \
+  ref.layerConfig.set_type(REF_TYPE);                     \
+  RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC)
+
+struct testFcDesc {
+  int bs;
+  int ic;
+  int ih, iw;  // oh == ow == 1
+  int oc;
+};
+
+static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_fc");
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.set_size(pm.oc);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)});
+  cfg.layerConfig.add_inputs();
+}
+
+void testFcLayer(const testFcDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNFcConfig(dnnConfig, pm);
+  for (auto biasSize : {pm.oc, 0}) {
+    dnnConfig.biasSize = biasSize;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm)
+  }
+}
+
+TEST(MKLDNNLayer, FcLayer) {
+  /* bs, ic, ih, iw, oc */
+  testFcLayer({2, 2, 1, 1, 3});
+  testFcLayer({3, 7, 1, 1, 19});
+  testFcLayer({8, 16, 13, 13, 32});
+  testFcLayer({4, 12, 13, 13, 18});
+  testFcLayer({2, 64, 16, 16, 32});
+  testFcLayer({15, 3, 16, 16, 6});
+}
+
+struct testConvDesc {
+  int bs, gp;
+  int ic, ih, iw;
+  int oc, oh, ow;
+  int fh, fw;
+  int ph, pw;
+  int sh, sw;
+  int dh, dw;
+};
+
+static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_conv");
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.set_num_filters(pm.oc);
+  cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
+  cfg.layerConfig.set_shared_biases(true);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_groups(pm.gp);
+  conv->set_img_size(pm.iw);
+  conv->set_img_size_y(pm.ih);
+  conv->set_output_x(pm.ow);
+  conv->set_output_y(pm.oh);
+  conv->set_filter_size(pm.fw);
+  conv->set_filter_size_y(pm.fh);
+  conv->set_channels(pm.ic);
+  conv->set_padding(pm.pw);
+  conv->set_padding_y(pm.ph);
+  conv->set_stride(pm.sw);
+  conv->set_stride_y(pm.sh);
+  conv->set_dilation(pm.dw);
+  conv->set_dilation_y(pm.dh);
+  conv->set_caffe_mode(true);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels())
+      << "it is indivisible";
+
+  int fh = (pm.fh - 1) * pm.dh + 1;
+  int fw = (pm.fw - 1) * pm.dw + 1;
+  int ow = outputSize(pm.iw, fw, pm.pw, pm.sw, true);
+  int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true);
+  CHECK_EQ(ow, pm.ow) << "output size check failed";
+  CHECK_EQ(oh, pm.oh) << "output size check failed";
+}
+
+void testConvLayer(const testConvDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNConvConfig(dnnConfig, pm);
+  for (auto biasSize : {pm.oc, 0}) {
+    dnnConfig.biasSize = biasSize;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm)
+  }
+}
+
+TEST(MKLDNNLayer, ConvLayer) {
+  /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */
+  testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1});
+  testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1});
+  testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1});
+  // with groups
+  testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
+}
+
+struct testPoolDesc {
+  int bs, ic;  // input channel and output channel are the same
+  int ih, iw;
+  int oh, ow;
+  int fh, fw;
+  int ph, pw;
+  int sh, sw;
+};
+
+static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_pool");
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       0});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+  pool->set_pool_type("avg-projection");
+  pool->set_channels(pm.ic);
+  pool->set_img_size(pm.iw);
+  pool->set_img_size_y(pm.ih);
+  pool->set_output_x(pm.ow);
+  pool->set_output_y(pm.oh);
+  pool->set_size_x(pm.fw);
+  pool->set_size_y(pm.fh);
+  pool->set_padding(pm.pw);
+  pool->set_padding_y(pm.ph);
+  pool->set_stride(pm.sw);
+  pool->set_stride_y(pm.sh);
+
+  int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
+  int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
+  CHECK_EQ(ow, pm.ow) << "output size check failed";
+  CHECK_EQ(oh, pm.oh) << "output size check failed";
+}
+
+void testPoolLayer(const testPoolDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNPoolConfig(dnnConfig, pm);
+  LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0);
+  PoolConfig* pool = input->mutable_pool_conf();
+  for (auto type : {"max-projection", "avg-projection"}) {
+    pool->set_pool_type(type);
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm)
+  }
+}
+
+TEST(MKLDNNLayer, PoolLayer) {
+  /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */
+  testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
+  testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
+  testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
+  testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
+  testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
+}
+
+struct testBatchNormDesc {
+  int bs;
+  int ic;
+  int ih, iw;
+};
+
+static void getMKLDNNBatchNormConfig(TestConfig& cfg,
+                                     const testBatchNormDesc& pm) {
+  cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw);
+  cfg.layerConfig.set_type("mkldnn_batch_norm");
+  cfg.biasSize = pm.ic;
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.ic)});
+  cfg.inputDefs.push_back(
+      {INPUT_DATA, "layer_1_moving_mean", 1, size_t(pm.ic)});
+  cfg.inputDefs.back().isStatic = true;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_2_moving_var", 1, size_t(pm.ic)});
+  cfg.inputDefs.back().isStatic = true;
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(pm.ic);
+  img_conf->set_img_size_y(pm.ih);
+  img_conf->set_img_size(pm.iw);
+}
+
+void testBatchNormLayer(const testBatchNormDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNBatchNormConfig(dnnConfig, pm);
+  TestConfig refConfig = dnnConfig;
+  refConfig.layerConfig.set_type("batch_norm");
+  // for PASS_TRAIN, use_global_stats always should be false, and batchsize != 1
+  VLOG(MKLDNN_TESTS) << "check train phase";
+  dnnConfig.layerConfig.set_use_global_stats(false);
+  refConfig.layerConfig.set_use_global_stats(false);
+  MKLDNNTester tester;
+  tester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN);
+  // for PASS_TEST, check use_global_stats true and false, and batchsize 1
+  VLOG(MKLDNN_TESTS) << "check test phase";
+  for (auto useGS : {false, true}) {
+    dnnConfig.layerConfig.set_use_global_stats(useGS);
+    refConfig.layerConfig.set_use_global_stats(useGS);
+    MKLDNNTester tester;
+    for (auto bs : {pm.bs, 1}) {
+      tester.run(dnnConfig, refConfig, bs, pm.ih, pm.iw, PASS_TEST);
+    }
+  }
+}
+
+TEST(MKLDNNLayer, BatchNormLayer) {
+  testBatchNormLayer({4, 10, 6, 6});
+  testBatchNormLayer({16, 32, 16, 16});
+  testBatchNormLayer({4, 16, 8, 10});
+}
+
+struct testLRNDesc {
+  int bs, ic, ih, iw;
+  float scale, pow;
+  int localSize;
+};
+
+void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_lrn");
+  cfg.layerConfig.set_active_type("relu");
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_channels(pm.ic);
+  norm->set_size(pm.localSize);
+  norm->set_scale(pm.scale);
+  norm->set_pow(pm.pow);
+  norm->set_blocked(0);
+  norm->set_img_size(pm.iw);
+  norm->set_img_size_y(pm.ih);
+  norm->set_output_x(norm->img_size());
+  norm->set_output_y(norm->img_size_y());
+  cfg.layerConfig.set_size(layerSize);
+  cfg.biasSize = 0;
+}
+
+void testLRNLayer(const testLRNDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNLRNConfig(dnnConfig, pm);
+  // mkldnn_lrn <==> norm with cmrnorm-projection type
+  TestConfig refConfig = dnnConfig;
+  refConfig.layerConfig.set_type("norm");
+  LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0);
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cmrnorm-projection");
+  norm->set_scale(norm->scale() / norm->size());
+  RUN_MKLDNN_TEST(dnnConfig, refConfig, pm)
+}
+
+TEST(MKLDNNLayer, LRNLayer) {
+  testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5});
+  testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5});
+  testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5});
+}
+
+struct testImageDesc {
+  int bs, ic, ih, iw;
+};
+
+static void getAddtoConfig(TestConfig& cfg,
+                           const testImageDesc& pm,
+                           const size_t nInputs = 1) {
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("addto");
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
+  cfg.layerConfig.set_size(layerSize);
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < nInputs; ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(pm.ic);
+    img_conf->set_img_size_y(pm.ih);
+    img_conf->set_img_size(pm.iw);
+  }
+}
+
+void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
+  CHECK_GE(nInputs, 1UL);
+  TestConfig dnnConfig;
+  getAddtoConfig(dnnConfig, pm, nInputs);
+  dnnConfig.layerConfig.set_type("mkldnn_addto");
+  for (auto withBias : {false, true}) {
+    dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
+  }
+}
+
+TEST(MKLDNNLayer, AddtoLayer) {
+  testAddtoLayer({16, 5, 14, 14}, 1);
+  testAddtoLayer({8, 10, 8, 8}, 2);
+  testAddtoLayer({4, 12, 1, 1}, 3);
+}
+
+static void getMKLDNNConcatConfig(TestConfig& cfg,
+                                  const std::vector<testImageDesc>& inputs) {
+  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
+  int oc = inputs[0].ic;
+  for (size_t i = 1; i < inputs.size(); ++i) {
+    CHECK_EQ(inputs[i].bs, inputs[0].bs);
+    CHECK_EQ(inputs[i].ih, inputs[0].ih);
+    CHECK_EQ(inputs[i].iw, inputs[0].iw);
+    oc += inputs[i].ic;
+  }
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("mkldnn_concat");
+  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back(
+        {INPUT_DATA,
+         ss.str(),
+         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
+         0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(inputs[i].ic);
+    img_conf->set_img_size_y(inputs[i].ih);
+    img_conf->set_img_size(inputs[i].iw);
+  }
+}
+
+void testConcatLayer(const std::vector<testImageDesc>& inputs) {
+  TestConfig dnnConfig;
+  getMKLDNNConcatConfig(dnnConfig, inputs);
+  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
+}
+
+TEST(MKLDNNLayer, ConcatLayer) {
+  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
+  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
+}
+
+void testActivation(std::string actType, const testImageDesc& pm) {
+  // TODO(TJ): remove me when paddle support elu activation
+  if (actType == "mkldnn_elu") {
+    return;
+  }
+  const std::string compareTypes[] = {actType, actType.erase(0, 7)};
+  TestConfig cfg;
+  getAddtoConfig(cfg, pm);
+  TestConfig ref = cfg;
+  cfg.layerConfig.set_active_type(compareTypes[0]);
+  ref.layerConfig.set_active_type(compareTypes[1]);
+  RUN_MKLDNN_TEST(cfg, ref, pm)
+}
+
+TEST(MKLDNNActivation, Activations) {
+  auto types = MKLDNNActivation::getAllRegisteredTypes();
+  for (auto type : types) {
+    /* bs, c, h, w*/
+    testActivation(type, {16, 64, 32, 32});
+    testActivation(type, {2, 8, 1, 1});
+  }
+}
+
+DECLARE_string(config_args);
+TEST(MKLDNNNet, net) {
+  std::vector<std::string> cases = {"simple", "branch"};
+  for (auto name : cases) {
+    std::string config = "./legacy/gserver/tests/mkldnn_" + name + "_net.conf";
+    for (auto channels : {2, 32}) {
+      std::ostringstream oss;
+      oss << "channels=" << channels;
+      FLAGS_config_args = oss.str();
+      MKLDNNTester::runNetTest(config);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = true;
+  initMain(argc, argv);
+  initPython(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2bc261b4a87ce7f1f4ce1c936ee4151d75e17f3f
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;
+
+void setPoolConfig(TestConfig* config,
+                   PoolConfig* pool,
+                   const string& poolType) {
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool");
+  (*config).layerConfig.set_num_filters(1);
+
+  int kw = 3, kh = 3;
+  int pw = 0, ph = 0;
+  int sw = 2, sh = 2;
+  pool->set_pool_type(poolType);
+  pool->set_channels(1);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_start(0);
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+}
+
+void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
+                                       const string& poolType,
+                                       bool use_gpu,
+                                       MatrixPtr& maskMat) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_img_size(5);
+  pool->set_img_size_y(5);
+  setPoolConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  config.layerConfig.set_name("MaxPoolWithMask");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+
+  initDataLayer(config,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "MaxPoolWithMask",
+                1,
+                false,
+                use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
+
+  FLAGS_use_gpu = use_gpu;
+  std::vector<ParameterPtr> parameters;
+  LayerPtr maxPoolingWithMaskOutputLayer;
+  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
+  maxPoolingWithMaskOutputLayer->forward(PASS_GC);
+
+  checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
+                   maskMat);
+}
+
+TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
+  bool useGpu = false;
+  MatrixPtr inputMat;
+  MatrixPtr maskMat;
+  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
+                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
+                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
+  real maskData[] = {12, 4, 22, 24};
+
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->setData(inputData);
+  maskMat->setData(maskData);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#ifdef PADDLE_WITH_CUDA
+  useGpu = true;
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->copyFrom(inputData, 25);
+  maskMat->copyFrom(maskData, 4);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#endif
+}
diff --git a/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp b/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..25b1a1191d0100c8ee625d3f5f36d1513164b23b
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_MultinomialSampler.cpp
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <random>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+#undef PADDLE_DISABLE_TIMER
+#include "paddle/legacy/utils/Stat.h"
+
+#include "paddle/legacy/gserver/layers/MultinomialSampler.h"
+#include "paddle/legacy/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+class MultinomialSamplerTester : public MultinomialSampler {
+ public:
+  MultinomialSamplerTester(real* prob, int size)
+      : MultinomialSampler(prob, size) {}
+
+  template <typename Rand1>
+  int testGen(Rand1 rand1) {
+    return gen1(rand1);
+  }
+};
+
+TEST(MultinomialSampler, gen) {
+  int numGrids = 1024 * 1024;
+  int size = 1024 * 4;
+  default_random_engine reng;
+
+  for (size_t iter = 0; iter < 256; ++iter) {
+    uniform_int_distribution<int> rand(1, numGrids / size * 1.8);
+    vector<real> prob;
+    int sum = 0;
+    for (int i = 0; i < size; ++i) {
+      prob.push_back(rand(reng));
+      sum += prob.back();
+    }
+
+    CHECK_LE(sum, numGrids);
+    prob.back() += numGrids - sum;
+
+    vector<int> counts(size);
+    MultinomialSamplerTester sampler(&prob[0], size);
+    counts.assign(size, 0);
+    {
+      double s = (double)size / (double)numGrids;
+      REGISTER_TIMER("MultinomialSampler");
+      for (double i = 0; i < numGrids; ++i) {
+        int ret = sampler.testGen([i, s]() { return s * i; });
+        if (ret < 0 || ret >= size) {
+          EXPECT_GE(ret, 0);
+          EXPECT_LT(ret, size);
+          break;
+        }
+        ++counts[ret];
+      }
+    }
+    for (int i = 0; i < size; ++i) {
+      if (prob[i] != counts[i]) {
+        EXPECT_EQ(prob[i], counts[i]);
+        LOG(INFO) << iter;
+        break;
+      }
+    }
+  }
+}
+
+void benchmarkRandom() {
+  int n = 1024 * 1024;
+
+  int sum;
+  double sum1;
+
+  sum = 0;
+  unsigned int seed = 1;
+  {
+    REGISTER_TIMER("crand");
+    for (int i = 0; i < n; ++i) {
+      sum += rand_r(&seed) % 1000;
+    }
+  }
+  LOG(INFO) << "sum=" << sum;
+
+  default_random_engine reng;
+  uniform_int_distribution<int> rand(1, 1000);
+  sum = 0;
+  {
+    REGISTER_TIMER("stdrand");
+    for (int i = 0; i < n; ++i) {
+      sum += rand(reng);
+    }
+  }
+  LOG(INFO) << "sum=" << sum;
+
+  sum = 0;
+  {
+    REGISTER_TIMER("default_random_engine");
+    for (int i = 0; i < n; ++i) {
+      sum += reng();
+    }
+  }
+  LOG(INFO) << "sum=" << sum;
+
+  uniform_real_distribution<double> rand1(0, 1);
+  sum1 = 0;
+  {
+    REGISTER_TIMER("stdrand1");
+    for (int i = 0; i < n; ++i) {
+      sum1 += rand1(reng);
+    }
+  }
+  LOG(INFO) << "sum1=" << sum1;
+
+  sum1 = 0;
+  {
+    real a = 1.0f / (real)RAND_MAX;
+    REGISTER_TIMER("crand1");
+    for (int i = 0; i < n; ++i) {
+      sum1 += a * rand_r(&seed);
+    }
+  }
+  LOG(INFO) << "sum1=" << sum1;
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  benchmarkRandom();
+  int ret = RUN_ALL_TESTS();
+  globalStat.printSegTimerStatus();
+  return ret;
+}
diff --git a/paddle/legacy/gserver/tests/test_NetworkCompare.cpp b/paddle/legacy/gserver/tests/test_NetworkCompare.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c9f9f3e61be11fa33ab37e27065fdf275f86453a
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_NetworkCompare.cpp
@@ -0,0 +1,294 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#undef PADDLE_DISABLE_TIMER
+#include <gtest/gtest.h>
+#include <paddle/legacy/utils/PythonUtil.h>
+#include <algorithm>
+#include <cstdlib>
+
+#include "paddle/legacy/trainer/Trainer.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DEFINE_bool(use_label, true, "input label or sequence label");
+DEFINE_bool(static_para, false, "static parameter");
+
+struct DataIn {
+  std::vector<Argument> inArgs;
+  std::vector<MatrixPtr> outGrads;
+  std::vector<VectorPtr> paraValues;
+};
+
+struct DataOut {
+  std::vector<MatrixPtr> outValues;
+  std::vector<VectorPtr> paraGrads;
+};
+
+void initArgument(DataIn& data,
+                  const std::string& configPath,
+                  bool useGpu = FLAGS_use_gpu) {
+  TrainerConfigHelper config(configPath);
+  size_t batchSize = config.getOptConfig().batch_size();
+
+  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    Argument arg;
+    arg.value = Matrix::create(batchSize, layerSize, false, useGpu);
+    arg.grad = Matrix::create(batchSize, layerSize, false, useGpu);
+    arg.value->randomizeUniform();
+    arg.value->add(-0.5);
+    arg.value->sigmoid(*arg.value);
+    arg.grad->zeroMem();
+    if (FLAGS_use_label) {
+      arg.ids = VectorT<int>::create(batchSize, useGpu);
+      arg.ids->rand(layerSize);
+    }
+    generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
+    data.inArgs.push_back(arg);
+  }
+
+  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    MatrixPtr grad = Matrix::create(batchSize, layerSize, false, useGpu);
+    grad->randomizeUniform();
+    data.outGrads.push_back(grad);
+  }
+
+  for (const auto& para_config : config.getModelConfig().parameters()) {
+    VectorPtr value = Vector::create(para_config.size(), useGpu);
+    value->randnorm(0, 2);
+    data.paraValues.push_back(value);
+  }
+}
+
+void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) {
+  *ThreadLocalRand::getSeed() = 0;
+  srand(0);
+
+  Trainer trainer;
+  auto config = std::make_shared<TrainerConfigHelper>(configPath);
+  trainer.init(config, false);
+
+  std::vector<ParameterPtr> parameters;
+  vector<Argument> outArgs;
+
+  auto gradientMachine = trainer.getGradientMachine();
+  parameters = gradientMachine->getParameters();
+  if (FLAGS_static_para) {
+    for (size_t i = 0; i < parameters.size(); i++) {
+      parameters[i]->getBuf(PARAMETER_VALUE)->one();
+    }
+  } else {
+    for (size_t i = 0; i < in.paraValues.size(); i++) {
+      parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
+    }
+  }
+  gradientMachine->start();
+  gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
+  for (size_t i = 0; i < in.outGrads.size(); i++) {
+    // If the all the layers in the config have no parameters, also
+    // not set NeedGradient(), the outArgs[i] will be nullptr.
+    outArgs[i].grad->copyFrom(*in.outGrads[i]);
+  }
+  gradientMachine->backward();
+  for (size_t i = 0; i < in.outGrads.size(); i++) {
+    MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(),
+                                     outArgs[i].value->getWidth(),
+                                     false,
+                                     false);
+    value->copyFrom(*outArgs[i].value);
+    out.outValues.push_back(value);
+  }
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    VectorPtr grad = Vector::create(
+        parameters[i]->getBuf(PARAMETER_GRADIENT)->getSize(), false);
+    grad->copyFrom(*parameters[i]->getBuf(PARAMETER_GRADIENT));
+    out.paraGrads.push_back(grad);
+  }
+
+  for (int i = 0; i < 20; i++) {
+    REGISTER_TIMER("forward");
+    gradientMachine->forward(in.inArgs, &outArgs, PASS_TRAIN);
+  }
+  for (int i = 0; i < 20; i++) {
+    REGISTER_TIMER("backward");
+    gradientMachine->backward();
+  }
+
+  gradientMachine->finish();
+}
+
+void checkBuffer(real* A,
+                 const char* desA,
+                 real* B,
+                 const char* desB,
+                 size_t len,
+                 size_t width = 1) {
+  int nNum = 0;
+  for (size_t i = 0; i < len; ++i) {
+    real diff = fabs(A[i] - B[i]);
+    if (diff > 0.0f &&
+        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_checkgrad_eps) {
+      nNum++;
+      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
+                << "    " << desB << " : " << B[i];
+    }
+  }
+  EXPECT_EQ(0, nNum);
+}
+
+void compareGradient(DataOut& outA, DataOut& outB) {
+  LOG(INFO) << "------------------------------"
+            << " Check Network Output "
+            << "------------------------------";
+  for (size_t i = 0; i < outA.outValues.size(); ++i) {
+    LOG(INFO) << "OUTPUT VALUE: " << i;
+    checkBuffer(outA.outValues[i]->getData(),
+                "network A output",
+                outB.outValues[i]->getData(),
+                "network B output",
+                outA.outValues[i]->getElementCnt(),
+                outA.outValues[i]->getWidth());
+  }
+
+  if (!FLAGS_static_para) {
+    LOG(INFO) << "------------------------------"
+              << " Check Parameters "
+              << "------------------------------";
+    for (size_t i = 0; i < outA.paraGrads.size(); ++i) {
+      LOG(INFO) << "PARAMETER GRADIENT: " << i;
+      checkBuffer(outA.paraGrads[i]->getData(),
+                  "Network A",
+                  outB.paraGrads[i]->getData(),
+                  "Network B",
+                  outA.paraGrads[i]->getSize());
+    }
+  }
+}
+
+void compareNetwork(const std::string& config_file_a,
+                    const std::string& config_file_b) {
+  DataIn in;
+  initArgument(in, config_file_a);
+
+  DataOut dataA;
+  calcGradient(in, dataA, config_file_a);
+  LOG(INFO) << "forwardBackward of Network A is finished";
+  globalStat.printSegTimerStatus();
+  globalStat.reset();
+  LOG(INFO) << "\n\n";
+
+  DataOut dataB;
+  calcGradient(in, dataB, config_file_b);
+  LOG(INFO) << "forwardBackward of the Network B is finished";
+  globalStat.printSegTimerStatus();
+  globalStat.reset();
+  LOG(INFO) << "\n\n";
+
+  compareGradient(dataA, dataB);
+}
+
+TEST(Compare, concat_dotmul) {
+  std::string config_file_a = "./legacy/gserver/tests/concat_dotmul_a.conf";
+  std::string config_file_b = "./legacy/gserver/tests/concat_dotmul_b.conf";
+  compareNetwork(config_file_a, config_file_b);
+}
+
+TEST(Compare, concat_fullmatrix) {
+  std::string config_file_a = "./legacy/gserver/tests/concat_fullmatrix_a.conf";
+  std::string config_file_b = "./legacy/gserver/tests/concat_fullmatrix_b.conf";
+  compareNetwork(config_file_a, config_file_b);
+}
+
+TEST(Compare, concat_table) {
+  std::string config_file_a = "./legacy/gserver/tests/concat_table_a.conf";
+  std::string config_file_b = "./legacy/gserver/tests/concat_table_b.conf";
+  compareNetwork(config_file_a, config_file_b);
+}
+
+TEST(Compare, concat_slice) {
+  std::string config_file_a = "./legacy/gserver/tests/concat_slice_a.conf";
+  std::string config_file_b = "./legacy/gserver/tests/concat_slice_b.conf";
+  compareNetwork(config_file_a, config_file_b);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(Compare, img_pool) {
+  std::string config_file_a = "./legacy/gserver/tests/img_pool_a.conf";
+  std::string config_file_b = "./legacy/gserver/tests/img_pool_b.conf";
+  bool useGpu = FLAGS_use_gpu;
+  FLAGS_use_gpu = true;
+  compareNetwork(config_file_a, config_file_b);
+  FLAGS_use_gpu = useGpu;
+}
+
+TEST(Compare, img_conv) {
+  std::string config_file_a = "./legacy/gserver/tests/img_conv_a.conf";
+  std::string config_file_b = "./legacy/gserver/tests/img_conv_b.conf";
+  bool useGpu = FLAGS_use_gpu;
+  FLAGS_use_gpu = true;
+  compareNetwork(config_file_a, config_file_b);
+  FLAGS_use_gpu = useGpu;
+}
+
+// Test cudnn_conv and exconv give the same result
+TEST(Compare, img_conv2) {
+  std::string config_file_a = "./legacy/gserver/tests/img_conv_cudnn.py";
+  std::string config_file_b = "./legacy/gserver/tests/img_conv_exconv.py";
+  bool useGpu = FLAGS_use_gpu;
+  double eps = FLAGS_checkgrad_eps;
+  FLAGS_use_gpu = true;
+  // Sometimes, this unit test will fail with 1e-2
+  FLAGS_checkgrad_eps = 4e-2;
+  compareNetwork(config_file_a, config_file_b);
+  FLAGS_use_gpu = useGpu;
+  FLAGS_checkgrad_eps = eps;
+}
+#endif
+
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
+TEST(Compare, network) {
+  if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") {
+    compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  initPython(argc, argv);
+  int ret = RUN_ALL_TESTS();
+  return ret;
+}
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/legacy/gserver/tests/test_PriorBox.cpp
similarity index 100%
rename from paddle/gserver/tests/test_PriorBox.cpp
rename to paddle/legacy/gserver/tests/test_PriorBox.cpp
diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0209e6818a8340fe128146909b9e8ec610e310a3
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_PyDataProvider.cpp
@@ -0,0 +1,177 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include "paddle/legacy/gserver/dataproviders/PyDataProvider.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include "paddle/testing/TestUtil.h"
+
+using namespace std;     // NOLINT
+using namespace paddle;  // NOLINT
+
+void simpleValueCheck(const vector<Argument>& argumentList, bool useGpu);
+void simpleSequenceCheck(const vector<Argument>& argumentList, int sample_num);
+
+TEST(PyDataProvider, py_fill_slots) {
+  DataConfig config;
+  config.set_type("py");
+  config.set_async_load_data(false);
+  config.set_load_data_module(std::string("pyDataProvider"));
+  config.set_load_data_object(std::string("SimpleDataProvider"));
+  config.clear_files();
+  std::string dataFile =
+      "legacy/gserver/tests/pyDataProvider/pyDataProviderList";
+  config.set_files(dataFile);
+#ifndef PADDLE_WITH_CUDA
+  bool useGpu = false;
+#else
+  bool useGpu = true;
+#endif
+  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
+  DataBatch dataBatch;
+  dataProvider->getNextBatchInternal(2, &dataBatch);
+  const std::vector<Argument>& argumentList = dataBatch.getStreams();
+  // Check size
+  EXPECT_EQ(argumentList.size(), 3UL);
+  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
+  EXPECT_EQ(argumentList[0].value->getHeight(), 2UL);
+  EXPECT_EQ(argumentList[0].value->getElementCnt(), 6UL);
+  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
+  EXPECT_EQ(argumentList[1].value->getHeight(), 2UL);
+  EXPECT_EQ(argumentList[1].value->getElementCnt(), 4UL);
+  EXPECT_EQ(argumentList[2].ids->getSize(), 2UL);
+  // Check value
+  simpleValueCheck(argumentList, useGpu);
+  // Check sequenceStartPositions
+  simpleSequenceCheck(argumentList, 2);
+}
+
+TEST(PyDataProvider, py_fill_nest_slots) {
+  DataConfig config;
+  config.set_type("py");
+  config.set_async_load_data(false);
+  config.set_load_data_module(std::string("pyDataProvider"));
+  config.set_load_data_object(std::string("SimpleNestDataProvider"));
+  config.clear_files();
+  std::string dataFile =
+      "legacy/gserver/tests/pyDataProvider/pyDataProviderList";
+  config.set_files(dataFile);
+  EXPECT_EQ(config.IsInitialized(), true);
+#ifndef PADDLE_WITH_CUDA
+  bool useGpu = false;
+#else
+  bool useGpu = true;
+#endif
+  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
+  DataBatch dataBatch;
+  dataProvider->getNextBatchInternal(2, &dataBatch);
+  const std::vector<Argument>& argumentList = dataBatch.getStreams();
+  // Check size
+  EXPECT_EQ(argumentList.size(), 3UL);
+  EXPECT_EQ(argumentList[0].value->getWidth(), 3UL);
+  EXPECT_EQ(argumentList[0].value->getHeight(), 4UL);
+  EXPECT_EQ(argumentList[0].value->getElementCnt(), 12UL);
+  EXPECT_EQ(argumentList[1].value->getWidth(), 7UL);
+  EXPECT_EQ(argumentList[1].value->getHeight(), 4UL);
+  EXPECT_EQ(argumentList[1].value->getElementCnt(), 8UL);
+  EXPECT_EQ(argumentList[2].ids->getSize(), 4UL);
+  // Check value
+  simpleValueCheck(argumentList, useGpu);
+  // Check sequenceStartPositions
+  simpleSequenceCheck(argumentList, 4);
+  // Check subSequenceStartPositions
+  EXPECT_EQ(argumentList[0].subSequenceStartPositions->getSize(), 4UL);
+  EXPECT_EQ(argumentList[1].subSequenceStartPositions->getSize(), 3UL);
+  EXPECT_EQ(argumentList[2].subSequenceStartPositions->getSize(), 4UL);
+  for (size_t i = 0; i < argumentList.size(); i++) {
+    EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(0), 0);
+    EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(1), 1);
+    if (i != 1) {
+      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 2);
+      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(3), 4);
+    } else {
+      EXPECT_EQ(argumentList[i].subSequenceStartPositions->getElement(2), 4);
+    }
+  }
+}
+
+void simpleValueCheck(const vector<Argument>& argumentList, bool useGpu) {
+  // Dense
+  real* data;
+  if (useGpu) {
+    MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(),
+                                            argumentList[0].value->getWidth(),
+                                            0,
+                                            0);
+    cpuMatrixPtr->copyFrom(*argumentList[0].value);
+    data = cpuMatrixPtr->getData();
+  } else {
+    data = argumentList[0].value->getData();
+  }
+  for (size_t i = 0; i < argumentList[0].value->getElementCnt(); ++i) {
+    EXPECT_EQ(*(data + i), (float)(i % 3 + 1));
+  }
+  // Sparse without value
+  GpuSparseMatrixPtr matGpu;
+  CpuSparseMatrixPtr matCpu;
+  if (useGpu) {
+    matGpu = dynamic_pointer_cast<GpuSparseMatrix>(argumentList[1].value);
+    ASSERT_TRUE(matGpu != NULL);
+  } else {
+    data = argumentList[0].value->getData();
+    matCpu = dynamic_pointer_cast<CpuSparseMatrix>(argumentList[1].value);
+    ASSERT_TRUE(matCpu != NULL);
+  }
+  for (size_t i = 0; i < argumentList[1].value->getHeight(); ++i) {
+    size_t colNum = useGpu ? matGpu->getColNum(i) : matCpu->getColNum(i);
+    EXPECT_EQ(colNum, (size_t)2);
+    const int* buf = useGpu ? matGpu->getRowCols(i) : matCpu->getRowCols(i);
+    for (size_t j = 0; j < colNum; ++j) {
+      EXPECT_EQ((size_t)buf[j], (size_t)(j + 1));
+    }
+  }
+  // Index
+  for (size_t j = 0; j < argumentList[2].ids->getSize(); ++j) {
+    EXPECT_EQ((size_t)argumentList[2].ids->get(j), 0UL);
+  }
+}
+
+void simpleSequenceCheck(const vector<Argument>& argumentList, int sample_num) {
+  EXPECT_EQ(argumentList[0].sequenceStartPositions->getSize(), 3UL);
+  EXPECT_EQ(argumentList[1].sequenceStartPositions->getSize(), 2UL);
+  EXPECT_EQ(argumentList[2].sequenceStartPositions->getSize(), 3UL);
+  for (size_t i = 0; i < argumentList.size(); i++) {
+    EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(0), 0);
+    if (i != 1) {
+      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1), 1);
+      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(2),
+                sample_num);
+    } else {
+      EXPECT_EQ(argumentList[i].sequenceStartPositions->getElement(1),
+                sample_num);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  initPython(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp b/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..de313ba82cf2697c13d6eae17056240b6272ca1c
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_PyDataProvider2.cpp
@@ -0,0 +1,409 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_NO_PYTHON
+#include <gtest/gtest.h>
+#include <fstream>
+#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+#include "paddle/legacy/utils/Util.h"
+
+DEFINE_string(train_list, "unittest.list", "file list for unittest");
+
+namespace paddle {
+namespace unittest {
+namespace pydp2 {
+extern void setOnPoolFilledHook(const std::function<void(size_t)> &func);
+extern void clearOnPoolFilledHook();
+
+}  // namespace pydp2
+}  // namespace unittest
+}  // namespace paddle
+
+const paddle::real epsilon = 1e-5;
+
+static inline int64_t readDataBatch(paddle::DataBatch *batch,
+                                    const std::string &funcName,
+                                    int64_t batchSize = 65535) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object(funcName);
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->setSkipShuffle();
+  provider->reset();
+  return provider->getNextBatchInternal(batchSize, batch);
+}
+
+TEST(PyDataProvider2, dense_no_seq) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_dense_no_seq");
+
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+
+  provider->setSkipShuffle();  // skip shuffle for unittest.
+
+  paddle::DataBatch batch;
+  for (size_t pass = 0; pass < 2; ++pass) {  // read 2 passes
+    provider->reset();
+    int64_t num = provider->getNextBatchInternal(100, &batch);
+    ASSERT_NE(num, 0);
+    ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1);
+    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
+    // Check batch data.
+    for (size_t i = 0; i < 100; ++i) {
+      for (size_t j = 0; j < 200; ++j) {
+        paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0);
+        ASSERT_NEAR(
+            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
+      }
+    }
+
+    num = provider->getNextBatchInternal(100, &batch);
+    ASSERT_NE(num, 0);
+    ASSERT_EQ(batch.getStreams().size(), (size_t)1);
+    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
+    // Check batch data.
+    for (size_t i = 0; i < 100; ++i) {
+      size_t ii = i + 100;
+      for (size_t j = 0; j < 200; ++j) {
+        paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0);
+        ASSERT_NEAR(
+            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
+      }
+    }
+    num = provider->getNextBatchInternal(100, &batch);
+    ASSERT_EQ(num, 0);
+  }
+}
+
+TEST(PyDataProvider2, index_no_seq) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_index_no_seq");
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+
+  provider->setSkipShuffle();  // skip shuffle for unittest.
+  paddle::DataBatch batch;
+  for (size_t pass = 0; pass < 2; ++pass) {
+    provider->reset();
+    int64_t num = provider->getNextBatchInternal(10000, &batch);
+    CHECK_EQ(num, 200);
+    for (int i = 0; i < 200; ++i) {
+      CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]);
+    }
+  }
+}
+
+TEST(PyDataProvider2, init_hook) {
+  paddle::PyObjectPtr pickle = paddle::py::import("pickle");
+  paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__")));
+  PyDict_SetItemString(globals.get(), "pickle", pickle.get());
+  paddle::PyObjectPtr locals(PyDict_New());
+  paddle::PyObjectPtr mdl(PyRun_String(
+      "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})",
+      Py_file_input,
+      globals.get(),
+      locals.get()));
+  CHECK_PY(mdl) << "Error!";
+  paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps"));
+  CHECK_PY(dps) << "Error!";
+
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_init_hook");
+  config.set_load_data_args(PyString_AsString(dps.get()));
+
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->setSkipShuffle();  // skip shuffle for unittest.
+  provider->reset();
+  paddle::DataBatch batch;
+  int64_t num = provider->getNextBatchInternal(100000, &batch);
+  ASSERT_EQ(num, 200);
+  auto &mat = batch.getStreams()[0].value;
+  ASSERT_EQ((size_t)mat->getWidth(), (size_t)20);
+  for (size_t i = 0; i < 200; ++i) {
+    for (size_t j = 0; j < 20; ++j) {
+      ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon);
+    }
+  }
+}
+
+TEST(PyDataProvider2, sparse_no_value_no_seq) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_sparse_non_value_no_seq");
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->setSkipShuffle();
+  provider->reset();
+  paddle::DataBatch batch;
+  int64_t num = provider->getNextBatchInternal(10000, &batch);
+  CHECK_EQ(num, 200);
+  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
+      batch.getStreams()[0].value);
+  CHECK(csm != nullptr);
+  for (int i = 0; i < 200; ++i) {
+    CHECK_EQ(csm->getColNum(i), (size_t)10);
+    int *cols = csm->getRowCols(i);
+    for (int j = 0; j < 10; ++j) {
+      CHECK_EQ(cols[j], (i + 1) * (j + 1));
+    }
+  }
+}
+
+TEST(PyDataProvider2, sparse_value_no_seq) {
+  paddle::DataBatch batch;
+  CHECK_EQ(readDataBatch(&batch, "test_sparse_value_no_seq"), 200);
+  auto csm = std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(
+      batch.getStreams()[0].value);
+  CHECK(csm != nullptr);
+  for (int i = 0; i < 200; ++i) {
+    CHECK_EQ(csm->getColNum(i), (size_t)10);
+    int *cols = csm->getRowCols(i);
+    real *dat = csm->getRowValues(i);
+    for (int j = 0; j < 10; ++j) {
+      EXPECT_EQ(cols[j], (i + 1) * (j + 1));
+      EXPECT_EQ(dat[j], real(j) / real(i + 1));
+    }
+  }
+}
+
+TEST(PyDataProvider2, index_seq) {
+  paddle::DataBatch batch;
+  CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200);
+  auto &arg = batch.getStreams()[0];
+  CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2);
+  size_t tmp = 0;
+  for (size_t i = 0; i < 200; ++i) {  // CHECK DATA CORRECT
+    for (size_t j = 0; j < i + 1; ++j) {
+      ASSERT_EQ((size_t)arg.ids->getData()[tmp], j);
+      ++tmp;
+    }
+  }
+  ASSERT_EQ(arg.sequenceStartPositions->getSize(), (size_t)201);
+  tmp = 0;
+  for (size_t i = 0; i < 200; ++i) {
+    tmp += i;
+    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i], tmp);
+  }
+  tmp += 200;
+  ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[200], tmp);
+}
+
+TEST(PyDataProvider2, index_sub_seq) {
+  paddle::DataBatch batch;
+  ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200);
+  auto &arg = batch.getStreams()[0];
+  size_t tmp = 0;
+  for (size_t i = 0; i < 200; ++i) {
+    for (size_t j = 0; j < i + 1; ++j) {
+      for (size_t k = 0; k < j + 1; ++k) {
+        CHECK_EQ((size_t)arg.ids->getData()[tmp++], k);
+      }
+    }
+  }
+
+  CHECK_EQ(tmp, arg.ids->getSize());
+
+  ASSERT_EQ((size_t)arg.sequenceStartPositions->getSize(), (size_t)201);
+  ASSERT_EQ(arg.subSequenceStartPositions->getData(false)[0], 0);
+  ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0);
+  size_t idx = 1;
+  tmp = 0;
+  for (size_t i = 0; i < 200; ++i) {
+    for (size_t j = 0; j < i + 1; ++j) {
+      tmp += j + 1;
+      ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx],
+                (size_t)tmp);
+      ++idx;
+    }
+    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp);
+  }
+}
+
+TEST(PyDataProvider2, min_pool_size) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_min_pool_size");
+  config.set_load_data_args("");
+  size_t totalData = 1 << 14;
+  constexpr size_t batchSize = 100;
+  constexpr size_t minPoolSize = 1000;
+  paddle::DataBatch batch;
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->reset();
+
+  paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) {
+    if (totalData > batchSize) {
+      CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize));
+    }
+  });
+  while (true) {
+    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
+    if (realBatchSize) {
+      totalData -= realBatchSize;
+    } else {
+      break;
+    }
+  }
+  paddle::unittest::pydp2::clearOnPoolFilledHook();
+}
+
+TEST(PyDataProvider2, can_over_batch_size) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_can_over_batch_size");
+  config.set_load_data_args("");
+  paddle::DataBatch batch;
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->reset();
+  constexpr size_t batchSize = 100;
+  while (true) {
+    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
+    if (realBatchSize) {
+      CHECK_LE(static_cast<size_t>(realBatchSize), batchSize);
+    } else {
+      break;
+    }
+  }
+}
+
+TEST(PyDataProvider2, input_order) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_input_order");
+  config.set_load_data_args("");
+
+  paddle::ModelConfig modelConfig;
+  *modelConfig.add_input_layer_names() = "input1";
+  *modelConfig.add_input_layer_names() = "input2";
+  paddle::DataBatch batch;
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, modelConfig, false));
+  provider->reset();
+  constexpr size_t batchSize = 100;
+  while (true) {
+    int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
+    if (!realBatchSize) {
+      break;
+    }
+    ASSERT_EQ(batch.getStreams().size(), static_cast<size_t>(2));
+    for (int64_t i = 0; i < realBatchSize; ++i) {
+      ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0);
+      ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1);
+    }
+  }
+}
+
+TEST(PyDataProvider2, test_check) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_check");
+  config.set_load_data_args("");
+  paddle::DataBatch batch;
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->reset();
+  while (true) {
+    int64_t realBatchSize = provider->getNextBatchInternal(100, &batch);
+    if (!realBatchSize) {
+      break;
+    } else {
+      auto &ivec = batch.getStream(0).ids;
+      for (size_t i = 0; i < ivec->getSize(); ++i) {
+        CHECK_LT(ivec->getData()[i], 10);
+      }
+    }
+  }
+}
+
+TEST(PyDataProvider2, multiThread) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_dense_no_seq");
+  config.set_async_load_data(true);
+
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+  provider->reset();
+  paddle::DataBatch batch;
+  provider->getNextBatch(100, &batch);
+  provider->reset();
+  provider.reset();
+}
+
+TEST(PyDataProvider2, minPoolSizeWithCache) {
+  paddle::DataConfig config;
+  config.set_type("py2");
+  config.set_files(FLAGS_train_list.c_str());
+  config.set_load_data_module("test_PyDataProvider2");
+  config.set_load_data_object("test_min_pool_size_with_cache");
+  config.set_async_load_data(true);
+
+  std::unique_ptr<paddle::DataProvider> provider(
+      paddle::DataProvider::create(config, false));
+
+  paddle::DataBatch batch;
+
+  for (int i = 0; i < 10; ++i) {
+    provider->reset();
+    int64_t sum = 0;
+    while (int64_t actualNum = provider->getNextBatch(100, &batch)) {
+      sum += actualNum;
+    }
+    ASSERT_EQ(1 << 20, sum);
+  }
+}
+
+int main(int argc, char **argv) {
+  testing::InitGoogleTest(&argc, argv);
+  paddle::initMain(argc, argv);
+  paddle::initPython(argc, argv);
+
+  std::ofstream fout(FLAGS_train_list);
+  CHECK(fout.is_open());
+  fout << "stub file name" << std::endl;  // in unittest, filename is not used.
+  fout.close();
+
+  return RUN_ALL_TESTS();
+}
+
+#endif
diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/legacy/gserver/tests/test_PyDataProvider2.py
similarity index 100%
rename from paddle/gserver/tests/test_PyDataProvider2.py
rename to paddle/legacy/gserver/tests/test_PyDataProvider2.py
diff --git a/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..153c3e7f36a30a70d0c5870144a0091b1e5f7237
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -0,0 +1,180 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/legacy/parameter/ParameterUpdateFunctions.h>
+#include <paddle/legacy/trainer/Trainer.h>
+#include <paddle/legacy/trainer/TrainerInternal.h>
+#include <paddle/legacy/utils/PythonUtil.h>
+#include <paddle/legacy/utils/Util.h>
+#include <paddle/legacy/utils/Version.h>
+
+DECLARE_int32(seed);
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+class TrainerForTest : public paddle::Trainer {
+ public:
+  void startTrain() {
+    GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
+    gm.start();
+  }
+
+  void finishTrain() {
+    GradientMachine& gm = *this->trainerInternal_.getGradientMachine();
+    gm.finish();
+  }
+
+  /**
+   * Get total dimension of all parameters.
+   *
+   * @return the total dimension of all parameters
+   */
+  size_t getTotalParameterSize() const {
+    auto p = const_cast<TrainerForTest*>(this);
+    auto& params = p->getGradientMachine()->getParameters();
+    return std::accumulate(
+        params.begin(), params.end(), 0UL, [](size_t a, const ParameterPtr& p) {
+          return a + p->getSize();
+        });
+  }
+};
+
+void CalCost(const string& conf,
+             const string& dir,
+             real* cost,
+             int num_passes) {
+  auto config = std::make_shared<TrainerConfigHelper>(conf);
+  TrainerForTest trainer;
+  trainer.init(config);
+  mkDir(dir.c_str());
+  config->setSaveDir(dir);
+  auto dataProvider = trainer.getDataProvider();
+  int32_t batchSize = config->getOptConfig().batch_size();
+  real learningRate = config->getOptConfig().learning_rate();
+  real momentum = 0;
+  real decayRate = 0;
+  int64_t dim = trainer.getTotalParameterSize();
+  CpuVector vecW(dim);
+  CpuVector vecGradient(dim);
+  CpuVector vecMomentum(dim);
+
+  // vecW needs to be assigned, otherwise the variable is an uncertain value.
+
+  *ThreadLocalRand::getSeed() = FLAGS_seed;
+  vecW.randnorm(0, 0.1);
+  vecMomentum.randnorm(0, 0.1);
+
+  trainer.startTrain();
+  for (int i = 0; i < num_passes; ++i) {
+    real totalCost = 0;
+    dataProvider->reset();
+    while (true) {
+      DataBatch dataBatch;
+      int num = dataProvider->getNextBatch(batchSize, &dataBatch);
+      if (num == 0) break;
+      totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient);
+      sgdUpdate(
+          learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum);
+    }
+    cost[i] = totalCost;
+  }
+  trainer.finishTrain();
+  rmDir(dir.c_str());
+}
+
+void test(const string& conf1, const string& conf2, double eps, bool useGpu) {
+  if (!paddle::version::isWithGpu() && useGpu) {
+    return;
+  }
+  FLAGS_use_gpu = useGpu;
+  int num_passes = 5;
+  real* cost1 = new real[num_passes];
+  const string dir1 = "legacy/gserver/tests/t1";
+  CalCost(conf1, dir1, cost1, num_passes);
+
+  real* cost2 = new real[num_passes];
+  const string dir2 = "legacy/gserver/tests/t2";
+  CalCost(conf2, dir2, cost2, num_passes);
+
+  for (int i = 0; i < num_passes; i++) {
+    LOG(INFO) << "num_passes: " << i << ", cost1=" << cost1[i]
+              << ", cost2=" << cost2[i]
+              << ", diff=" << std::abs(cost1[i] - cost2[i]);
+    ASSERT_NEAR(cost1[i], cost2[i], eps);
+  }
+  delete[] cost1;
+  delete[] cost2;
+}
+
+TEST(RecurrentGradientMachine, HasSubSequence) {
+  for (bool useGpu : {false, true}) {
+    test("legacy/gserver/tests/sequence_layer_group.conf",
+         "legacy/gserver/tests/sequence_nest_layer_group.conf",
+         1e-5,
+         useGpu);
+  }
+}
+
+TEST(RecurrentGradientMachine, rnn) {
+  for (bool useGpu : {false, true}) {
+    test("legacy/gserver/tests/sequence_rnn.conf",
+         "legacy/gserver/tests/sequence_nest_rnn.conf",
+         1e-6,
+         useGpu);
+  }
+}
+
+TEST(RecurrentGradientMachine, rnn_multi_input) {
+  for (bool useGpu : {false, true}) {
+    test("legacy/gserver/tests/sequence_rnn_multi_input.conf",
+         "legacy/gserver/tests/sequence_nest_rnn_multi_input.conf",
+         1e-6,
+         useGpu);
+  }
+}
+
+TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
+  for (bool useGpu : {false, true}) {
+    test("legacy/gserver/tests/sequence_rnn_multi_unequalength_inputs.py",
+         "legacy/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py",
+         1e-6,
+         useGpu);
+  }
+}
+
+TEST(RecurrentGradientMachine, rnn_mixed_input) {
+  for (bool useGpu : {false, true}) {
+    test("legacy/gserver/tests/sequence_rnn_mixed_inputs.py",
+         "legacy/gserver/tests/sequence_rnn_matched_inputs.py",
+         1e-6,
+         useGpu);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+
+  if (paddle::version::isWithPyDataProvider()) {
+    if (!paddle::version::isWithGpu()) {
+      FLAGS_use_gpu = false;
+    }
+    initMain(argc, argv);
+    initPython(argc, argv);
+    return RUN_ALL_TESTS();
+  } else {
+    return 0;
+  }
+}
diff --git a/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp b/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..71198cb6a1d29433ed0e315378f5aee51b921766
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_RecurrentLayer.cpp
@@ -0,0 +1,571 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/utils/Version.h>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/gserver/layers/Layer.h"
+
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+DECLARE_bool(use_gpu);
+DECLARE_bool(rnn_use_batch);
+DECLARE_int32(fixed_seq_length);
+
+void checkError(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK(matrix1.getHeight() == matrix2.getHeight());
+  CHECK(matrix1.getWidth() == matrix2.getWidth());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = matrix1.getData();
+  const real* data2 = matrix2.getData();
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
+        count++;
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
+void checkError(const CpuVector& vector1, const CpuVector& vector2) {
+  CHECK(vector1.getSize() == vector2.getSize());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+
+  int size = vector1.getSize();
+  const real* data1 = vector1.getData();
+  const real* data2 = vector2.getData();
+  int count = 0;
+  for (int i = 0; i < size; i++) {
+    if (fabs(data1[i] - data2[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
+LayerPtr creatDataLayer(string name,
+                        size_t batchSize,
+                        int layerSize,
+                        bool useGpu) {
+  LayerConfig dataConfig;
+  dataConfig.set_name(name);
+  dataConfig.set_type("data");
+  dataConfig.set_size(layerSize);
+  LayerPtr layer = LayerPtr(new DataLayer(dataConfig));
+
+  Argument data;
+  data.value = Matrix::create(batchSize, layer->getSize(), false, useGpu);
+  data.grad = Matrix::create(batchSize, layer->getSize(), false, useGpu);
+  data.value->randomizeUniform();
+  data.value->add(-0.5);
+  data.value->sigmoid(*data.value);
+  data.grad->zeroMem();
+
+  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
+
+  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  dataLayer->setData(data);
+  dataLayer->forward(PASS_GC);
+
+  return layer;
+}
+
+ParameterPtr creatParameter(string name,
+                            int pid,
+                            size_t paraSize,
+                            bool useGpu) {
+  ParameterConfig paraConfig;
+  paraConfig.set_name(name);
+  paraConfig.set_size(paraSize);
+
+  ParameterPtr parameter =
+      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ false);
+  parameter->enableType(PARAMETER_VALUE);
+  parameter->enableType(PARAMETER_GRADIENT);
+  parameter->randomize();
+  parameter->setID(pid);
+
+  return parameter;
+}
+
+ParameterPtr creatParameterBias(string name,
+                                int pid,
+                                size_t paraSize,
+                                bool useGpu) {
+  ParameterConfig paraConfig;
+  paraConfig.set_name(name);
+  paraConfig.set_size(paraSize);
+  paraConfig.set_initial_std(1);
+
+  ParameterPtr parameter =
+      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ true);
+  parameter->randomize();
+  parameter->setID(pid);
+
+  return parameter;
+}
+
+LayerPtr initRecurrentLayer(LayerConfig layerConfig,
+                            size_t batchSize,
+                            int layerSize,
+                            bool useGpu) {
+  FLAGS_use_gpu = useGpu;
+  LayerMap layerMap;
+  ParameterMap parameterMap;
+  LayerPtr dataLayer = creatDataLayer("layer_0", batchSize, layerSize, useGpu);
+  layerMap[dataLayer->getName()] = dataLayer;
+
+  ParameterPtr para =
+      creatParameter("para_0", 0, layerSize * layerSize, useGpu);
+  parameterMap[para->getName()] = para;
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name("layer_0");
+  input.set_input_parameter_name("para_0");
+  LayerPtr testLayer = Layer::create(layerConfig);
+  layerMap[testLayer->getName()] = testLayer;
+
+  testLayer->init(layerMap, parameterMap);
+  testLayer->setNeedGradient(true);
+
+  return testLayer;
+}
+
+void checkRecurrentLayer(LayerPtr testLayer) {
+  const VectorPtr& weightGrad =
+      (testLayer->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
+  const MatrixPtr& inputGrad = testLayer->getPrev(0)->getOutputGrad();
+  CpuVector seqPara(weightGrad->getSize());
+  CpuVector batPara(weightGrad->getSize());
+  CpuMatrix seqInputGrad(inputGrad->getHeight(), inputGrad->getWidth());
+  CpuMatrix batInputGrad(inputGrad->getHeight(), inputGrad->getWidth());
+
+  CpuMatrix outputGrad(inputGrad->getHeight(), inputGrad->getWidth());
+  outputGrad.randomizeUniform();
+
+  /* use sequence calculate */
+  FLAGS_rnn_use_batch = false;
+  weightGrad->zero();
+  inputGrad->zero();
+  testLayer->forward(PASS_GC);
+  testLayer->getOutputGrad()->copyFrom(outputGrad);
+  testLayer->backward();
+  seqPara.copyFrom(*weightGrad);
+  seqInputGrad.copyFrom(*inputGrad);
+
+  /* use batch calculate */
+  FLAGS_rnn_use_batch = true;
+  weightGrad->zero();
+  inputGrad->zero();
+  testLayer->forward(PASS_GC);
+  testLayer->getOutputGrad()->copyFrom(outputGrad);
+  testLayer->backward();
+  batPara.copyFrom(*weightGrad);
+  batInputGrad.copyFrom(*inputGrad);
+
+  /* check */
+  checkError(seqInputGrad, batInputGrad);
+  checkError(seqPara, batPara);
+}
+
+TEST(Layer, RecurrentLayer) {
+  LayerConfig layerConfig;
+  layerConfig.set_name("rnn");
+  layerConfig.set_type("recurrent");
+  layerConfig.set_active_type("tanh");
+  for (auto layerSize : {1, 10, 64, 128, 256, 512}) {
+    for (auto batchSize : {1, 5, 20, 100, 128}) {
+      for (auto useGpu : {false, true}) {
+        for (auto reversed : {false, true}) {
+          LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize
+                    << " useGpu=" << useGpu << " reversed=" << reversed;
+          layerConfig.set_size(layerSize);
+          layerConfig.set_reversed(reversed);
+          LayerPtr testLayer =
+              initRecurrentLayer(layerConfig, batchSize, layerSize, useGpu);
+          checkRecurrentLayer(testLayer);
+        }
+      }
+    }
+  }
+}
+
+#define protected public
+#include "paddle/legacy/gserver/layers/GatedRecurrentLayer.h"
+#include "paddle/legacy/gserver/layers/LstmLayer.h"
+#include "paddle/legacy/gserver/layers/RecurrentLayer.h"
+template <class T>
+class TestRecurrentLayer {
+ public:
+  LayerConfig config_;
+  bool useGpu_;
+  bool useBatch_;
+  LayerPtr testLayer_;
+  LayerPtr dataLayer_;
+  ParameterPtr para_;
+  ParameterPtr bias_;
+  LayerMap layerMap_;
+  ParameterMap parameterMap_;
+  TestRecurrentLayer(const LayerConfig& config,
+                     bool useGpu,
+                     bool useBatch = false)
+      : config_(config), useGpu_(useGpu), useBatch_(useBatch) {}
+  void init(size_t batchSize) {
+    FLAGS_use_gpu = useGpu_;
+    testLayer_ = Layer::create(config_);
+    if (typeid(T) == typeid(GatedRecurrentLayer)) {
+      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
+                                  batchSize,
+                                  config_.size() * 3,
+                                  useGpu_);
+      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
+                             0,
+                             config_.size() * config_.size() * 3,
+                             useGpu_);
+      bias_ = creatParameterBias(
+          config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_);
+    } else if (typeid(T) == typeid(LstmLayer)) {
+      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
+                                  batchSize,
+                                  config_.size() * 4,
+                                  useGpu_);
+      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
+                             0,
+                             config_.size() * config_.size() * 4,
+                             useGpu_);
+      bias_ = creatParameterBias(
+          config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_);
+    }
+    layerMap_[dataLayer_->getName()] = dataLayer_;
+    parameterMap_[para_->getName()] = para_;
+    parameterMap_[bias_->getName()] = bias_;
+
+    layerMap_[testLayer_->getName()] = testLayer_;
+    testLayer_->init(layerMap_, parameterMap_);
+    testLayer_->setNeedGradient(true);
+    (dynamic_cast<T*>(testLayer_.get()))->useBatch_ = useBatch_;
+  }
+  void forward() {
+    FLAGS_use_gpu = useGpu_;
+    testLayer_->forward(PASS_GC);
+  }
+  void backward() {
+    FLAGS_use_gpu = useGpu_;
+    testLayer_->backward(nullptr);
+  }
+};
+
+template <class T>
+void checkRecurrentLayer(LayerConfig layerConfig,
+                         size_t batchSize,
+                         bool cpuBatch,
+                         bool gpuBatch) {
+  TestRecurrentLayer<T> testCpu(layerConfig, false, cpuBatch);
+  TestRecurrentLayer<T> testGpu(layerConfig, true, gpuBatch);
+  testCpu.init(batchSize);
+  testGpu.init(batchSize);
+  auto checkError = [](
+      MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) {
+    CpuMatrix check(gpu->getHeight(), gpu->getWidth());
+    check.copyFrom(*gpu);
+    int height = cpu->getHeight();
+    int width = cpu->getWidth();
+    const real* data1 = cpu->getData();
+    const real* data2 = check.getData();
+    int count = 0;
+    for (int i = 0; i < height; i++) {
+      for (int j = 0; j < width; j++) {
+        if (fabs(data1[i * width + j] - data2[i * width + j]) / numSequences >
+            1e-4) {
+          count++;
+        }
+      }
+    }
+    EXPECT_EQ(count, 0) << "[" << str << "]"
+                        << "There are " << count << " different element.";
+  };
+  T* cpuLayer = dynamic_cast<T*>(testCpu.testLayer_.get());
+  T* gpuLayer = dynamic_cast<T*>(testGpu.testLayer_.get());
+
+  Argument& cpuInput = testCpu.dataLayer_->getOutput();
+  Argument& gpuInput = testGpu.dataLayer_->getOutput();
+  gpuInput.resizeAndCopyFrom(cpuInput, true);
+
+  const VectorPtr& cpuVec = testCpu.para_->getBuf(PARAMETER_VALUE);
+  const VectorPtr& gpuVec = testGpu.para_->getBuf(PARAMETER_VALUE);
+  gpuVec->copyFrom(*cpuVec);
+
+  const VectorPtr& cpuBiasVec = testCpu.bias_->getBuf(PARAMETER_VALUE);
+  const VectorPtr& gpuBiasVec = testGpu.bias_->getBuf(PARAMETER_VALUE);
+  gpuBiasVec->copyFrom(*cpuBiasVec);
+
+  /* check forward */
+  testCpu.forward();
+  testGpu.forward();
+
+  checkError(
+      cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue");
+
+  /* check backward */
+  cpuLayer->getOutputGrad()->randomizeUniform();
+  gpuLayer->getOutputGrad()->copyFrom(*cpuLayer->getOutputGrad());
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+
+  testCpu.backward();
+  testGpu.backward();
+
+  // check input grad
+  checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad");
+  // check weight grad
+  int numSequences = cpuInput.getNumSequences();
+  checkError(cpuLayer->weight_->getWGrad(),
+             gpuLayer->weight_->getWGrad(),
+             numSequences,
+             "weightGrad");
+  // check bias grad
+  checkError(cpuLayer->bias_->getWGrad(),
+             gpuLayer->bias_->getWGrad(),
+             numSequences,
+             "biasGrad");
+}
+
+TEST(Layer, GatedRecurrentLayer) {
+  LayerConfig layerConfig;
+  layerConfig.set_type("gated_recurrent");
+  layerConfig.set_active_type("sigmoid");
+  layerConfig.set_active_gate_type("sigmoid");
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name("layer_0");
+  input.set_input_parameter_name("para_0");
+  layerConfig.set_bias_parameter_name("bias");
+
+  for (auto frameSize : {32, 64, 128, 256, 512}) {
+    for (auto batchSize : {1, 5, 100, 500}) {
+      for (auto reversed : {false, true}) {
+        for (auto cpuBatch : {false, true}) {
+          for (auto gpuBatch : {false, true}) {
+            LOG(INFO) << " batchSize=" << batchSize
+                      << " frameSize=" << frameSize << " reversed=" << reversed
+                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
+            layerConfig.set_size(frameSize);
+            layerConfig.set_reversed(reversed);
+            checkRecurrentLayer<GatedRecurrentLayer>(
+                layerConfig, batchSize, cpuBatch, gpuBatch);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Layer, LstmLayer) {
+  LayerConfig layerConfig;
+  layerConfig.set_type("lstmemory");
+  layerConfig.set_active_type("relu");
+  layerConfig.set_active_state_type("tanh");
+  layerConfig.set_active_gate_type("sigmoid");
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name("layer_0");
+  input.set_input_parameter_name("para_0");
+  layerConfig.set_bias_parameter_name("bias");
+
+  for (auto frameSize : {32, 64, 128, 256, 512}) {
+    for (auto batchSize : {1, 5, 100, 500}) {
+      for (auto reversed : {false, true}) {
+        for (auto cpuBatch : {false, true}) {
+          for (auto gpuBatch : {false, true}) {
+            LOG(INFO) << " batchSize=" << batchSize
+                      << " frameSize=" << frameSize << " reversed=" << reversed
+                      << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
+            layerConfig.set_size(frameSize);
+            layerConfig.set_reversed(reversed);
+            checkRecurrentLayer<LstmLayer>(
+                layerConfig, batchSize, cpuBatch, gpuBatch);
+          }
+        }
+      }
+    }
+  }
+}
+
+#ifdef PADDLE_WITH_MKLML
+
+#include "paddle/legacy/gserver/layers/MKLPackedRecurrentLayer.h"
+
+LayerPtr initMKLPackedLayer(LayerConfig layerConfig,
+                            bool reversed,
+                            int layerSize,
+                            LayerPtr dataLayer,
+                            ParameterPtr para,
+                            ParameterPtr bias = nullptr) {
+  LayerMap layerMap;
+  ParameterMap parameterMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  parameterMap[para->getName()] = para;
+  if (bias) {
+    parameterMap[bias->getName()] = bias;
+    layerConfig.set_bias_parameter_name("bias_0");
+  }
+
+  layerConfig.set_size(layerSize);
+  layerConfig.set_reversed(reversed);
+  layerConfig.add_inputs();
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name("layer_0");
+  input.set_input_parameter_name("para_0");
+
+  LayerPtr testLayer = Layer::create(layerConfig);
+  layerMap[testLayer->getName()] = testLayer;
+
+  testLayer->init(layerMap, parameterMap);
+  testLayer->setNeedGradient(true);
+
+  return testLayer;
+}
+
+void checkMKLPackedLayer(LayerConfig layerConfig1,
+                         LayerConfig layerConfig2,
+                         bool reversed,
+                         int layerSize,
+                         int batchSize,
+                         bool useBatch1,
+                         bool useBatch2) {
+  LayerPtr dataLayer;
+  ParameterPtr para, bias;
+
+  if (layerConfig1.type() == "recurrent") {
+    dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false);
+    para = creatParameter("para_0", 0, layerSize * layerSize, false);
+    bias = nullptr;
+  } else if (layerConfig1.type() == "gated_recurrent") {
+    dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false);
+    para = creatParameter("para_0", 0, layerSize * layerSize * 3, false);
+    bias = creatParameterBias("bias_0", 1, layerSize * 3, false);
+  }
+
+  LayerPtr testLayer1 = initMKLPackedLayer(
+      layerConfig1, reversed, layerSize, dataLayer, para, bias);
+  LayerPtr testLayer2 = initMKLPackedLayer(
+      layerConfig2, reversed, layerSize, dataLayer, para, bias);
+
+  const VectorPtr& weightGrad =
+      (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
+  const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad();
+  CpuVector wgt_grad1(weightGrad->getSize());
+  CpuVector wgt_grad2(weightGrad->getSize());
+  CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth());
+  CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth());
+
+  for (int i = 0; i < 2; i++) {
+    FLAGS_rnn_use_batch = useBatch1;
+
+    testLayer1->forward(PASS_GC);
+
+    FLAGS_rnn_use_batch = useBatch2;
+    testLayer2->forward(PASS_GC);
+
+    testLayer1->getOutputGrad()->randomizeUniform();
+    testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad());
+
+    weightGrad->zero();
+    inputGrad->zero();
+    FLAGS_rnn_use_batch = useBatch1;
+    testLayer1->backward(nullptr);
+
+    wgt_grad1.copyFrom(*weightGrad);
+    input_grad1.copyFrom(*inputGrad);
+
+    weightGrad->zero();
+    inputGrad->zero();
+    FLAGS_rnn_use_batch = useBatch2;
+    testLayer2->backward(nullptr);
+
+    wgt_grad2.copyFrom(*weightGrad);
+    input_grad2.copyFrom(*inputGrad);
+
+    checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue());
+    checkError(wgt_grad1, wgt_grad2);
+    checkError(input_grad1, input_grad2);
+  }
+}
+
+TEST(MKLPackedLayer, RecurrentLayer) {
+  LayerConfig layerConfig1;
+  LayerConfig layerConfig2;
+
+  layerConfig1.set_name("paddle-rnn");
+  layerConfig1.set_type("recurrent");
+  layerConfig1.set_active_type("relu");
+
+  layerConfig2.set_name("mkl-packed-rnn");
+  layerConfig2.set_type("mkl_packed_recurrent");
+  layerConfig2.set_active_type("relu");
+
+  FLAGS_use_gpu = false;
+
+  for (auto layerSize : {32, 64, 128, 256, 512}) {
+    for (auto batchSize : {1, 5, 100, 500}) {
+      for (auto reversed : {true, false}) {
+        for (auto paddle_use_batch : {true, false}) {
+          for (auto MKLPacked_use_batch : {true, false}) {
+            LOG(INFO) << " layerSize=" << layerSize
+                      << " batchSize=" << batchSize << " reversed=" << reversed
+                      << " paddle_use_batch=" << paddle_use_batch
+                      << " MKLPacked_use_batch=" << MKLPacked_use_batch;
+
+            checkMKLPackedLayer(layerConfig1,
+                                layerConfig2,
+                                reversed,
+                                layerSize,
+                                batchSize,
+                                paddle_use_batch,
+                                MKLPacked_use_batch);
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  if (!version::isWithGpu()) {
+    testing::GTEST_FLAG(filter) = "-Layer.*";
+  }
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1975d9196d61dbb80667b2ba86c09d56bc568064
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_SelectiveFCLayer.cpp
@@ -0,0 +1,471 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <math.h>
+#include <paddle/legacy/utils/PythonUtil.h>
+#include <algorithm>
+#include <cstdlib>
+#include <ctime>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/gserver/layers/FullyConnectedLayer.h"
+#include "paddle/legacy/gserver/layers/Layer.h"
+#include "paddle/legacy/gserver/layers/SelectiveFullyConnectedLayer.h"
+#include "paddle/legacy/math/CpuSparseMatrix.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(num_passes);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_string(config_args);
+
+size_t fcLayerWidth = 1024;
+
+struct ComData {
+  vector<Argument> outArgs;
+  vector<ParameterPtr> parameters;
+};
+
+int randint(int* data, size_t int_max, size_t size) {
+  srand((size_t)(time(NULL)));
+  if (int_max < size) {
+    return -1;
+  }
+  size_t count = 0;
+  std::map<int, int> tmp;
+  int this_int = 0;
+
+  while (count < size) {
+    this_int = std::rand() % int_max;  // NOLINT
+    if (tmp.find(this_int) == tmp.end()) {
+      tmp[this_int] = 0;
+      count += 1;
+    }
+  }
+
+  if (tmp.size() != size) {
+    return -1;
+  }
+  count = 0;
+  for (auto itr = tmp.begin(); itr != tmp.end(); ++itr) {
+    data[count] = itr->first;
+    count += 1;
+  }
+  return 0;
+}
+
+void calcOutput(ComData& comData,
+                const string configFile,
+                const string configArgs,
+                bool useGpu) {
+  FLAGS_config = configFile;
+  FLAGS_config_args = configArgs;
+  FLAGS_use_gpu = useGpu;
+  FLAGS_init_model_path = "legacy/gserver/tests/SelectiveFcTest/model";
+  *ThreadLocalRand::getSeed() = 0;
+  srand(0);
+
+  Trainer trainer;
+  trainer.init(TrainerConfigHelper::createFromFlags(), false);
+
+  comData.parameters = trainer.getGradientMachine()->getParameters();
+
+  auto dataProvider = trainer.getDataProvider();
+  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
+  DataBatch dataBatch;
+  dataProvider->setSkipShuffle();
+  dataProvider->reset();
+  dataProvider->getNextBatch(batchSize, &dataBatch);
+  CHECK(dataBatch.getSize()) << "No data from data provider";
+
+  vector<Argument>& inArgs = dataBatch.getStreams();
+  trainer.getGradientMachine()->start(trainer.getConfig(), nullptr);
+  trainer.getGradientMachine()->forwardBackward(
+      inArgs, &comData.outArgs, PASS_TRAIN);
+  trainer.getGradientMachine()->finish();
+}
+
+void checkMatrix(real* A, real* B, size_t matSize) {
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+  int diffNum = 0;
+  for (size_t i = 0; i < matSize; ++i) {
+    if (std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) ||
+        std::isnan(B[i])) {
+    } else if (fabs(A[i] - B[i]) > err) {
+      diffNum++;
+    }
+  }
+  EXPECT_EQ(0, diffNum);
+}
+
+void checkTranspose(real* matrix,
+                    real* transpose,
+                    size_t width,
+                    size_t matSize) {
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+  size_t height = matSize / width;
+  int diffNum = 0;
+  size_t rowId = 0;
+  size_t colId = 0;
+  for (size_t i = 0; i < matSize; ++i) {
+    if (i % width == 0 && i) {
+      rowId++;
+    }
+    colId = i % width;
+    if (fabs(matrix[i] - transpose[colId * height + rowId]) > err) {
+      diffNum++;
+      LOG(INFO) << i << " diff : " << matrix[i] << "\t"
+                << transpose[colId * height + rowId];
+    }
+  }
+  EXPECT_EQ(0, diffNum);
+}
+
+void compareOutput(ComData& fcData, ComData& selFcData) {
+  vector<Argument> outArgsFc = fcData.outArgs;
+  vector<Argument> outArgsSelfc = selFcData.outArgs;
+
+  // check cost
+  LOG(INFO) << "Check cost";
+  CpuMatrix fcCost(outArgsFc[0].value->getHeight(),
+                   outArgsFc[0].value->getWidth());
+  CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(),
+                      outArgsSelfc[0].value->getWidth());
+  fcCost.copyFrom(*outArgsFc[0].value);
+  selfcCost.copyFrom(*outArgsSelfc[0].value);
+  checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt());
+
+  // check selective fc output and fc output
+  LOG(INFO) << "Compare output of SelectiveFullyConectedLayer "
+            << "with FullyConectedLayer";
+  CpuMatrix fcOut(outArgsFc[1].value->getHeight(),
+                  outArgsFc[1].value->getWidth());
+  CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(),
+                     outArgsSelfc[1].value->getWidth());
+
+  fcOut.copyFrom(*outArgsFc[1].value);
+  selfcOut.copyFrom(*outArgsSelfc[1].value);
+  checkMatrix(fcOut.getData(), selfcOut.getData(), fcOut.getElementCnt());
+
+  // check gradient math
+  vector<ParameterPtr>& fcParam = fcData.parameters;
+  vector<ParameterPtr>& selfcParam = selFcData.parameters;
+  for (size_t i = 0; i < fcParam.size(); ++i) {
+    ParameterPtr p1, p2;
+    p1 = fcParam[i];
+    p2 = selfcParam[i];
+
+    string paramName = p1->getName();
+    LOG(INFO) << "check parameter : " << paramName;
+
+    // check parameter value
+    CpuVector paraValue1(p1->getSize());
+    CpuVector paraValue2(p2->getSize());
+    paraValue1.copyFrom(*p1->getBuf(PARAMETER_VALUE));
+    paraValue2.copyFrom(*p2->getBuf(PARAMETER_VALUE));
+
+    // check gradient
+    CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT));
+    CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT));
+    if (paramName == "rand_fc_param.bias") {
+      checkMatrix(
+          paraValue1.getData(), paraValue2.getData(), paraValue1.getSize());
+      checkMatrix(
+          paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize());
+    } else {
+      checkTranspose(paraValue1.getData(),
+                     paraValue2.getData(),
+                     fcLayerWidth,
+                     paraValue1.getSize());
+      checkTranspose(paraGrad1.getData(),
+                     paraGrad2.getData(),
+                     fcLayerWidth,
+                     paraGrad1.getSize());
+    }
+  }
+}
+
+void compareSparseMulOutput(
+    real* fcOutput,
+    real* selOutput,
+    size_t nnz,
+    const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& selCols) {
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+  size_t nnzCount =
+      std::accumulate(selCols->begin(),
+                      selCols->end(),
+                      0UL,
+                      [](size_t a, const std::pair<int*, size_t>& arr) {
+                        return a + arr.second;
+                      });
+  EXPECT_EQ(nnz, nnzCount);
+
+  size_t sampleNum = selCols->size();
+  int diffNum = 0;
+  size_t count = 0;
+  for (size_t i = 0; i < sampleNum; ++i) {
+    for (size_t j = 0; j < (*selCols)[i].second; ++j) {
+      size_t selIdx = (*selCols)[i].first[j];
+      if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) {
+        diffNum++;
+        LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx]
+                  << "\t" << selOutput[count];
+      }
+      count++;
+    }
+  }
+  EXPECT_EQ(0, diffNum);
+}
+
+LayerPtr creatDataLayer(string name,
+                        size_t batchSize,
+                        size_t layerSize,
+                        std::vector<real>& values,
+                        bool useGpu) {
+  LayerConfig dataConfig;
+  dataConfig.set_name(name);
+  dataConfig.set_type("data");
+  dataConfig.set_size(layerSize);
+  LayerPtr layer = LayerPtr(new DataLayer(dataConfig));
+
+  Argument data;
+  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
+  data.value->copyFrom(values.data(), batchSize * layerSize);
+
+  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  dataLayer->setData(data);
+  dataLayer->forward(PASS_TEST);
+  return layer;
+}
+
+ParameterPtr creatParameter(
+    string name, int pid, size_t paraSize, string paramFile, bool useGpu) {
+  ParameterConfig paraConfig;
+  paraConfig.set_name(name);
+  paraConfig.set_size(paraSize);
+
+  ParameterPtr parameter =
+      std::make_shared<Parameter>(paraConfig, useGpu, /*initialize */ false);
+  parameter->enableType(PARAMETER_VALUE);
+  parameter->randomize();
+  parameter->setID(pid);
+  parameter->load(paramFile);
+  return parameter;
+}
+
+LayerPtr initFcLayer(LayerPtr dataLayer,
+                     LayerConfig layerConfig,
+                     int dataLayerSize,
+                     int fcLayerSize,
+                     string paraName,
+                     string paraFile,
+                     bool useGpu) {
+  LayerMap layerMap;
+  ParameterMap parameterMap;
+
+  layerMap[dataLayer->getName()] = dataLayer;
+  ParameterPtr para = creatParameter(
+      paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu);
+  parameterMap[para->getName()] = para;
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name(dataLayer->getName());
+  input.set_input_parameter_name(paraName);
+
+  LayerPtr testLayer = Layer::create(layerConfig);
+  layerMap[testLayer->getName()] = testLayer;
+
+  testLayer->setNeedGradient(false);
+  testLayer->init(layerMap, parameterMap);
+  return testLayer;
+}
+
+#ifndef PADDLE_TYPE_DOUBLE
+// The parameter file used in fc.conf and selective_fc.conf is float
+TEST(Layer, SelectiveFcLayer_train_dense_mul) {
+  const string& fcConfig = "legacy/gserver/tests/SelectiveFcTest/conf/fc.conf";
+  const string& fcConfigArgs =
+      "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list";
+  const string& selFcConfig =
+      "legacy/gserver/tests/SelectiveFcTest/conf/selective_fc.conf";
+  const string& selConfigArgs =
+      "filelist=legacy/gserver/tests/SelectiveFcTest/dense_mul_list";
+
+  for (auto useGpu : {false, true}) {
+#ifndef PADDLE_WITH_CUDA
+    if (useGpu) {
+      break;
+    }
+#endif
+    LOG(INFO) << "FullyConnectedLayer forwardBackward()";
+    ComData fcData;
+    calcOutput(fcData, fcConfig, fcConfigArgs, useGpu);
+
+    LOG(INFO) << "SelectiveFullyConnectedLayer forwardBackward()";
+    ComData selFcData;
+    calcOutput(selFcData, selFcConfig, selConfigArgs, useGpu);
+    compareOutput(fcData, selFcData);
+  }
+}
+#endif  // PADDLE_TYPE_DOUBLE
+
+void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
+                                        bool useGpu) {
+  FLAGS_use_gpu = useGpu;
+  size_t batchSize = 100;
+  size_t dataLayerSize = 512;
+  std::vector<real> values(batchSize * dataLayerSize);
+  for (size_t j = 0; j < batchSize * dataLayerSize; ++j) {
+    values[j] = std::rand() / real(RAND_MAX);
+  }
+  LayerPtr dataLayer =
+      creatDataLayer("data", batchSize, dataLayerSize, values, useGpu);
+
+  const string& selfcParaFile =
+      "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose";
+  const string& selfcParaName = "rand_fc_param.w.transpose";
+
+  std::shared_ptr<SelectiveFullyConnectedLayer> selfcLayer =
+      std::dynamic_pointer_cast<SelectiveFullyConnectedLayer>(
+          initFcLayer(dataLayer,
+                      config,
+                      dataLayerSize,
+                      fcLayerWidth,
+                      selfcParaName,
+                      selfcParaFile,
+                      useGpu));
+
+  // create selected columns
+  std::shared_ptr<std::vector<std::pair<int*, size_t>>> selCols(
+      new std::vector<std::pair<int*, size_t>>(batchSize));
+  size_t maxNNZ = 30;
+  srand((size_t)(time(NULL)));
+  int total = 0;
+  while (total == 0) {
+    for (size_t i = 0; i < batchSize; ++i) {
+      size_t num = std::rand() % maxNNZ;
+      int* data = new int[num];
+      randint(data, fcLayerWidth, num);
+      (*selCols)[i] = std::make_pair(data, num);
+      total += num;
+    }
+  }
+  selfcLayer->fillSelectiveData(selCols);
+  selfcLayer->forward(PASS_TEST);
+
+  MatrixPtr outMatSelfc = selfcLayer->getOutputValue();
+  CpuSparseMatrixPtr cpuOutMatSelfc(
+      new CpuSparseMatrix(outMatSelfc->getHeight(),
+                          outMatSelfc->getWidth(),
+                          outMatSelfc->getElementCnt()));
+  cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
+#ifdef PADDLE_WITH_CUDA
+  if (useGpu) {
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  }
+#endif
+  real* outValueSelfc = cpuOutMatSelfc->getValue();
+  size_t nnz = cpuOutMatSelfc->getElementCnt();
+
+  const string& fcParaFile =
+      "legacy/gserver/tests/SelectiveFcTest/model/rand_fc_param.w";
+  const string& fcParaName = "rand_fc_param.w";
+  LayerConfig fcLayerConfig;
+  fcLayerConfig.set_name("fc_layer");
+  fcLayerConfig.set_type("fc");
+  fcLayerConfig.set_active_type("linear");
+  fcLayerConfig.set_size(fcLayerWidth);
+
+  LayerPtr fcLayer = initFcLayer(dataLayer,
+                                 fcLayerConfig,
+                                 dataLayerSize,
+                                 fcLayerWidth,
+                                 fcParaName,
+                                 fcParaFile,
+                                 useGpu);
+  fcLayer->forward(PASS_TEST);
+
+  MatrixPtr outMatFc = fcLayer->getOutputValue();
+  MatrixPtr cpuOutMatFc(
+      new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
+  cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
+#ifdef PADDLE_WITH_CUDA
+  if (useGpu) {
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  }
+#endif
+  real* outValueFc = cpuOutMatFc->getData();
+
+  compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols);
+  for (size_t i = 0; i < batchSize; ++i) {
+    delete[](*selCols)[i].first;
+  }
+}
+
+#ifndef PADDLE_TYPE_DOUBLE
+// The parameter file used in testSelectiveFcLayerTrainSparseMul is float
+TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
+  LayerConfig selLayerConfig;
+  selLayerConfig.set_name("sel_fc");
+  selLayerConfig.set_type("selective_fc");
+  selLayerConfig.set_active_type("linear");
+  selLayerConfig.set_has_selected_colums(false);
+  selLayerConfig.set_selective_fc_pass_generation(true);
+  selLayerConfig.set_size(fcLayerWidth);
+
+  testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
+#ifdef PADDLE_WITH_CUDA
+  testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
+#endif
+}
+#endif  // PADDLE_TYPE_DOUBLE
+
+// TODO(dangqingqing) test multi threads after support in matrix
+// TEST(Layer, SelectiveFcLayer_train_sparse_mul_parallel) {
+//   LayerConfig selLayerConfig;
+//   selLayerConfig.set_name("sel_fc");
+//   selLayerConfig.set_type("selective_fc");
+//   selLayerConfig.set_active_type("linear");
+//   selLayerConfig.set_has_selected_colums(false);
+//   selLayerConfig.set_selective_fc_pass_generation(true);
+//   selLayerConfig.set_selective_fc_parallel_plain_mul_thread_num(10);
+//   selLayerConfig.set_selective_fc_full_mul_ratio(1000);
+//   selLayerConfig.set_size(fcLayerWidth);
+//   SelectiveFcLayer_test(selLayerConfig, false);
+// }
+
+int main(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  initPython(argc, argv);
+  int ret = RUN_ALL_TESTS();
+  return ret;
+}
diff --git a/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..05acd714219fa5964b5b3595543682825ea67d84
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const int MAX_SEQ_NUM = 17;
+const int MAX_SEQ_LEN = 23;
+const int MAX_BEAM_SIZE = 13;
+
+const size_t SEED = (size_t)(time(NULL));
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
+  seqStartPos.resize(1, 0);
+  subSeqStartPos.resize(1, 0);
+
+  srand(SEED);
+  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int j = 0; j < subSeqNum; ++j)
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % MAX_SEQ_LEN)));
+    seqStartPos.push_back(subSeqStartPos.back());
+  }
+}
+
+/*
+  generate start indices according to sequence start positions.
+ */
+void genStarts(vector<int>& seqStartPos,
+               vector<vector<real>>& starts,
+               size_t beamSize) {
+  starts.clear();
+  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    vector<real> randStarts =
+        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
+    copy(begin(randStarts), end(randStarts), begin(starts[i]));
+  }
+}
+
+/*
+  generate end indices according to sequence start positions and start indices.
+ */
+void genEnds(vector<int>& seqStartPos,
+             vector<vector<real>>& starts,
+             vector<vector<real>>& ends,
+             size_t beamSize) {
+  CHECK_EQ(seqStartPos.size() - 1, starts.size());
+  ends.clear();
+  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < starts.size(); ++i) {
+    for (size_t j = 0; j < starts[i].size(); ++j) {
+      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+      CHECK_GE(seqLen - 1, starts[i][j]);
+      if (starts[i][j] == -1.) break;
+      if (starts[i][j] == (seqLen - 1)) {
+        ends[i][j] = starts[i][j];
+      } else {
+        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
+      }
+    }
+  }
+}
+
+void genTestData(vector<int>& seqStartPos,
+                 vector<int>& subSeqStartPos,
+                 vector<vector<real>>& starts,
+                 vector<vector<real>>& ends,
+                 bool hasSubseq) {
+  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
+  genSeqInfo(seqStartPos, subSeqStartPos);
+
+  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
+  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
+}
+
+template <typename T>
+void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
+  size_t totalSize{0};
+  for (auto const& items : inVec) totalSize += items.size();
+  outVec.reserve(totalSize);
+
+  for (auto& items : inVec)
+    move(items.begin(), items.end(), back_inserter(outVec));
+}
+
+void testSeqSliceLayer(bool hasSubseq,
+                       bool useGpu,
+                       vector<int>& seqStartPos,
+                       vector<int>& subSeqStartPos,
+                       vector<vector<real>>& starts,
+                       vector<vector<real>>& ends) {
+  // layer size is not crutial for this layer,
+  // so here use a small layer size in the unittest.
+  const size_t layerSize{4};
+  TestConfig config;
+  config.layerConfig.set_type("seq_slice");
+  config.layerConfig.set_size(layerSize);
+
+  // add the first input
+  MatrixPtr seqInputPtr =
+      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
+                     layerSize,
+                     false,
+                     false);
+  seqInputPtr->randomizeUniform();
+
+  if (hasSubseq) {
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                "seq_input",
+                                seqInputPtr,
+                                seqStartPos,
+                                subSeqStartPos});
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
+  }
+  config.layerConfig.add_inputs();
+
+  // add start indices
+  if (starts.size()) {
+    vector<real> startsToVec;
+    flatten2dVector(starts, startsToVec);
+
+    MatrixPtr startMatrixPtr =
+        Matrix::create(starts.size(), starts[0].size(), false, false);
+    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
+
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(true);
+  }
+
+  // add end indices
+  if (ends.size()) {
+    vector<real> endsToVec;
+    flatten2dVector(ends, endsToVec);
+
+    MatrixPtr endMatrixPtr =
+        Matrix::create(ends.size(), ends[0].size(), false, false);
+    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
+
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(false);
+  }
+
+  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
+}
+
+TEST(Layer, SeqSliceLayer) {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<vector<real>> starts;
+  vector<vector<real>> ends;
+
+  std::vector<bool> mode = {false};
+#ifdef PADDLE_WITH_CUDA
+  mode.push_back(true);
+#endif
+  genSeqInfo(seqStartPos, subSeqStartPos);
+  for (bool hasSubseq : {true, false}) {
+    LOG(INFO) << "hasSubSeq : " << hasSubseq;
+    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
+    for (bool useGpu : mode) {
+      vector<vector<real>> tmp;
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/gserver/tests/test_Upsample.cpp b/paddle/legacy/gserver/tests/test_Upsample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..940d46baf73f2d600cff6edc37c29a3a36bf5d90
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_Upsample.cpp
@@ -0,0 +1,153 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/testing/TestUtil.h"
+
+void setPoolConfig(paddle::TestConfig* config,
+                   paddle::PoolConfig* pool,
+                   const string& poolType) {
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool");
+  (*config).layerConfig.set_num_filters(1);
+
+  int kw = 2, kh = 2;
+  int pw = 0, ph = 0;
+  int sw = 2, sh = 2;
+  pool->set_pool_type(poolType);
+  pool->set_channels(2);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_start(0);
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+
+  int ow =
+      paddle::outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh =
+      paddle::outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+}
+
+paddle::LayerPtr doOneUpsampleTest(const paddle::MatrixPtr& inputMat,
+                                   const string& poolType,
+                                   bool use_gpu,
+                                   real* tempGradData) {
+  /* prepare maxPoolWithMaskLayer */
+  paddle::TestConfig config;
+  config.inputDefs.push_back({paddle::INPUT_DATA, "layer_0", 128, 0});
+  paddle::LayerInputConfig* input = config.layerConfig.add_inputs();
+  paddle::PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_img_size(8);
+  pool->set_img_size_y(8);
+  setPoolConfig(&config, pool, "max-pool-with-mask");
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  config.layerConfig.set_name("MaxPoolWithMask");
+
+  std::vector<paddle::DataLayerPtr> dataLayers;
+  paddle::LayerMap layerMap;
+  vector<paddle::Argument> datas;
+
+  initDataLayer(config,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "MaxPoolWithMask",
+                1,
+                false,
+                use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
+
+  FLAGS_use_gpu = use_gpu;
+  std::vector<paddle::ParameterPtr> parameters;
+  paddle::LayerPtr maxPoolingWithMaskOutputLayer;
+  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
+  maxPoolingWithMaskOutputLayer->forward(paddle::PASS_GC);
+
+  /* prepare the upsample layer */
+  paddle::LayerConfig upsampleLayerConfig;
+  upsampleLayerConfig.set_type("upsample");
+  paddle::LayerInputConfig* input1 = upsampleLayerConfig.add_inputs();
+  upsampleLayerConfig.add_inputs();
+
+  paddle::UpsampleConfig* upsampleConfig = input1->mutable_upsample_conf();
+  upsampleConfig->set_scale(2);
+  paddle::ImageConfig* imageConfig = upsampleConfig->mutable_image_conf();
+  imageConfig->set_channels(2);
+  imageConfig->set_img_size(4);
+  imageConfig->set_img_size_y(4);
+  upsampleLayerConfig.set_size(2 * 8 * 8);
+  upsampleLayerConfig.set_name("upsample");
+
+  for (size_t i = 0; i < 2; i++) {
+    paddle::LayerInputConfig& inputTemp =
+        *(upsampleLayerConfig.mutable_inputs(i));
+    inputTemp.set_input_layer_name("MaxPoolWithMask");
+  }
+
+  paddle::LayerPtr upsampleLayer;
+  paddle::ParameterMap parameterMap;
+  upsampleLayer = paddle::Layer::create(upsampleLayerConfig);
+  layerMap[upsampleLayerConfig.name()] = upsampleLayer;
+  upsampleLayer->init(layerMap, parameterMap);
+  upsampleLayer->setNeedGradient(true);
+  upsampleLayer->forward(paddle::PASS_GC);
+  upsampleLayer->getOutputGrad()->copyFrom(tempGradData, 128);
+  upsampleLayer->backward();
+
+  return upsampleLayer;
+}
+
+TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
+  bool useGpu = false;
+  paddle::MatrixPtr inputMat;
+  paddle::MatrixPtr inputGPUMat;
+  paddle::MatrixPtr tempGradMat;
+
+  inputMat = paddle::Matrix::create(1, 128, false, useGpu);
+  inputMat->randomizeUniform();
+
+  tempGradMat = paddle::Matrix::create(1, 128, false, useGpu);
+  tempGradMat->randomizeUniform();
+  real* tempGradData = tempGradMat->getData();
+
+  paddle::LayerPtr upsampleLayerCPU =
+      doOneUpsampleTest(inputMat, "max-pool-with-mask", useGpu, tempGradData);
+
+#ifdef PADDLE_WITH_CUDA
+  useGpu = true;
+  real* data = inputMat->getData();
+  inputGPUMat = paddle::Matrix::create(1, 128, false, useGpu);
+  inputGPUMat->copyFrom(data, 128);
+  paddle::LayerPtr upsampleLayerGPU = doOneUpsampleTest(
+      inputGPUMat, "max-pool-with-mask", useGpu, tempGradData);
+  paddle::checkMatrixEqual(upsampleLayerCPU->getOutput("").value,
+                           upsampleLayerGPU->getOutput("").value);
+
+  paddle::checkMatrixEqual(upsampleLayerCPU->getPrev(0)->getOutputGrad(),
+                           upsampleLayerGPU->getPrev(0)->getOutputGrad());
+#endif
+}
diff --git a/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp b/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b1697e1616484ec5389cdb5b59ba413a9615cf2e
--- /dev/null
+++ b/paddle/legacy/gserver/tests/test_WarpCTCLayer.cpp
@@ -0,0 +1,244 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/utils/Version.h>
+#include "ModelConfig.pb.h"
+#include "paddle/legacy/gserver/layers/CTCLayer.h"
+#include "paddle/legacy/gserver/layers/DataLayer.h"
+#include "paddle/legacy/gserver/layers/Layer.h"
+#include "paddle/legacy/gserver/layers/WarpCTCLayer.h"
+
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+
+const real* getData(const Matrix& matrix) {
+  if (matrix.useGpu()) {
+    MatrixPtr cpuMatrix = Matrix::create(
+        matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false);
+    cpuMatrix->copyFrom(matrix);
+    return cpuMatrix->getData();
+  } else {
+    return matrix.getData();
+  }
+}
+
+int checkError(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
+  CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
+  CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+
+  const real* data1 = getData(matrix1);
+  const real* data2 = getData(matrix2);
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      if (fabs(data1[i * width + j] - data2[i * width + j]) > err) {
+        count++;
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+  return count;
+}
+
+void initArgument(size_t batchSize,
+                  int layerSize,
+                  bool useGpu,
+                  Argument& data) {
+  data.value = Matrix::create(batchSize, layerSize, false, useGpu);
+  data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
+  data.value->randomizeUniform();
+  data.value->add(-0.5);
+  data.grad->zeroMem();
+
+  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
+}
+
+LayerPtr createDataLayer(
+    string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) {
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("data");
+  layerConfig.set_size(layerSize);
+  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
+
+  DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  dataLayer->setData(data);
+  dataLayer->forward(PASS_GC);
+
+  return layer;
+}
+
+LayerPtr createLabelLayer(string name,
+                          size_t batchSize,
+                          size_t numClasses,
+                          bool useGpu) {
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("data");
+  layerConfig.set_size(1);
+  LayerPtr layer = LayerPtr(new DataLayer(layerConfig));
+
+  Argument data;
+  data.ids = IVector::create(batchSize, useGpu);
+  data.ids->rand(numClasses - 1);
+
+  generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
+
+  DataLayerPtr labelLayer = std::dynamic_pointer_cast<DataLayer>(layer);
+  labelLayer->setData(data);
+  labelLayer->forward(PASS_GC);
+
+  return layer;
+}
+
+LayerPtr createCTCLayer(string name,
+                        size_t numClasses,
+                        bool useGpu,
+                        bool normByTimes,
+                        LayerPtr dataLayer,
+                        LayerPtr labelLayer) {
+  LayerMap layerMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  layerMap[labelLayer->getName()] = labelLayer;
+
+  ParameterMap parameterMap;
+
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("ctc");
+  layerConfig.set_size(numClasses);
+  layerConfig.set_norm_by_times(normByTimes);
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
+  input0.set_input_layer_name(dataLayer->getName());
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
+  input1.set_input_layer_name(labelLayer->getName());
+
+  LayerPtr layer = LayerPtr(new CTCLayer(layerConfig));
+  layerMap[layer->getName()] = layer;
+  layer->init(layerMap, parameterMap);
+
+  ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
+
+  softmaxActivation->forward(dataLayer->getOutput()).check();
+  layer->forward(PASS_GC);
+
+  layer->backward();
+  softmaxActivation->backward(dataLayer->getOutput()).check();
+
+  return layer;
+}
+
+LayerPtr createWarpCTCLayer(string name,
+                            size_t numClasses,
+                            bool useGpu,
+                            bool normByTimes,
+                            LayerPtr dataLayer,
+                            LayerPtr labelLayer) {
+  LayerMap layerMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  layerMap[labelLayer->getName()] = labelLayer;
+
+  ParameterMap parameterMap;
+
+  LayerConfig layerConfig;
+  layerConfig.set_name(name);
+  layerConfig.set_type("warp_ctc");
+  layerConfig.set_size(numClasses);
+  layerConfig.set_blank(numClasses - 1);
+  layerConfig.set_norm_by_times(normByTimes);
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
+  input0.set_input_layer_name(dataLayer->getName());
+
+  layerConfig.add_inputs();
+  LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
+  input1.set_input_layer_name(labelLayer->getName());
+
+  LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig));
+  layerMap[layer->getName()] = layer;
+  layer->init(layerMap, parameterMap);
+
+  layer->forward(PASS_GC);
+  layer->backward();
+
+  return layer;
+}
+
+TEST(Layer, WarpCTCLayer) {
+  for (auto layerSize : {10, 64}) {
+    for (auto batchSize : {1, 10, 32}) {
+      for (auto normByTimes : {false, true}) {
+        for (auto useGpu : {false, true}) {
+#ifndef PADDLE_WITH_CUDA
+          if (useGpu) continue;
+#endif
+          LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
+                    << " normByTimes = " << normByTimes << " useGpu=" << useGpu;
+
+          FLAGS_use_gpu = useGpu;
+
+          Argument data0;
+          initArgument(batchSize, layerSize, useGpu, data0);
+
+          Argument data1;
+          data1.resizeAndCopyFrom(data0);
+
+          LayerPtr dataLayer0 =
+              createDataLayer("data", batchSize, layerSize, useGpu, data0);
+          LayerPtr dataLayer1 =
+              createDataLayer("data", batchSize, layerSize, useGpu, data1);
+
+          LayerPtr labelLayer =
+              createLabelLayer("label", batchSize, layerSize, useGpu);
+
+          LayerPtr warpctcLayer = createWarpCTCLayer(
+              "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer);
+          LayerPtr ctcLayer = createCTCLayer(
+              "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer);
+
+          /// Check cost
+          LOG(INFO) << "Check cost: "
+                    << checkError(*(warpctcLayer->getOutput().value),
+                                  *(ctcLayer->getOutput().value))
+                    << " different elements.";
+
+          /// Check gradients
+          LOG(INFO) << "Check gradients: "
+                    << checkError(*(dataLayer0->getOutput().grad),
+                                  *(dataLayer1->getOutput().grad))
+                    << " different elements";
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/legacy/math/Allocator.h b/paddle/legacy/math/Allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffb5ec1cad4113c2035daad8c385bbe57a161079
--- /dev/null
+++ b/paddle/legacy/math/Allocator.h
@@ -0,0 +1,137 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdlib.h>
+#include <mutex>
+#include "hl_gpu.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+/**
+ * @brief Allocator base class.
+ *
+ * This is the base class of all Allocator class.
+ */
+class Allocator {
+ public:
+  virtual ~Allocator() {}
+  virtual void* alloc(size_t size) = 0;
+  virtual void free(void* ptr) = 0;
+  virtual std::string getName() = 0;
+};
+
+/**
+ * @brief CPU allocator implementation.
+ */
+class CpuAllocator : public Allocator {
+ public:
+  ~CpuAllocator() {}
+
+  /**
+   * @brief Aligned allocation on CPU.
+   * @param size Size to be allocated.
+   * @return Pointer to the allocated memory
+   */
+  virtual void* alloc(size_t size) {
+    void* ptr;
+#ifdef PADDLE_WITH_MKLDNN
+    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+    // memory alignment
+    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
+#else
+    CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+#endif
+    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
+    return ptr;
+  }
+
+  /**
+   * @brief Free the memory space.
+   * @param ptr  Pointer to be free.
+   */
+  virtual void free(void* ptr) {
+    if (ptr) {
+      ::free(ptr);
+    }
+  }
+
+  virtual std::string getName() { return "cpu_alloc"; }
+};
+
+/**
+ * @brief GPU allocator implementation.
+ */
+class GpuAllocator : public Allocator {
+ public:
+  ~GpuAllocator() {}
+
+  /**
+   * @brief Allocate GPU memory.
+   * @param size Size to be allocated.
+   * @return Pointer to the allocated memory
+   */
+  virtual void* alloc(size_t size) {
+    void* ptr = hl_malloc_device(size);
+    CHECK(ptr) << "Fail to allocate GPU memory " << size << " bytes";
+    return ptr;
+  }
+
+  /**
+   * @brief Free the GPU memory.
+   * @param ptr  Pointer to be free.
+   */
+  virtual void free(void* ptr) {
+    if (ptr) {
+      hl_free_mem_device(ptr);
+    }
+  }
+
+  virtual std::string getName() { return "gpu_alloc"; }
+};
+
+/**
+ * @brief CPU pinned memory allocator implementation.
+ */
+class CudaHostAllocator : public Allocator {
+ public:
+  ~CudaHostAllocator() {}
+
+  /**
+   * @brief Allocate pinned memory.
+   * @param size Size to be allocated.
+   * @return Pointer to the allocated memory
+   */
+  virtual void* alloc(size_t size) {
+    void* ptr = hl_malloc_host(size);
+    CHECK(ptr) << "Fail to allocate pinned memory " << size << " bytes";
+    return ptr;
+  }
+
+  /**
+   * @brief Free the pinned memory.
+   * @param ptr  Pointer to be free.
+   */
+  virtual void free(void* ptr) {
+    if (ptr) {
+      hl_free_mem_host(ptr);
+    }
+  }
+
+  virtual std::string getName() { return "cuda_host_alloc"; }
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/BaseMatrix.cu b/paddle/legacy/math/BaseMatrix.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7e7cdc57a9887152ecd9e0bbd9fe14fcba56799d
--- /dev/null
+++ b/paddle/legacy/math/BaseMatrix.cu
@@ -0,0 +1,1953 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/legacy/utils/Logging.h>
+#include <string.h>
+#include <cmath>
+#include "BaseMatrix.h"
+#include "MathFunctions.h"
+#include "NEONFunctions.h"
+#include "SIMDFunctions.h"
+#include "hl_matrix_apply.cuh"
+#include "hl_matrix_base.cuh"
+#include "hl_matrix_ops.cuh"
+
+namespace paddle {
+
+const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported.";
+
+template <class T>
+template <class Op>
+int BaseMatrixT<T>::applyUnary(Op op) {
+  MatrixOffset offset(0, 0);
+  applyUnary(op, height_, width_, offset);
+  return 0;
+}
+
+template <class T>
+template <class Op>
+int BaseMatrixT<T>::applyUnary(Op op,
+                               int numRows,
+                               int numCols,
+                               MatrixOffset& offset) {
+  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
+  int dimM = numRows;
+  int dimN = numCols;
+  int lda = stride_;
+
+  T* A = data_;
+  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
+
+  CHECK_LE(dimM + offset.aRow_, this->height_);
+  CHECK_LE(dimN + offset.aCol_, this->width_);
+  if (true == useGpu_) {
+    hl_gpu_apply_unary_op(op, A, dimM, dimN, lda);
+  } else {
+    hl_cpu_apply_unary_op(op, A, dimM, dimN, lda);
+  }
+  return 0;
+}
+
+template <class T>
+template <class Op>
+int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
+  CHECK(height_ == b.height_ && width_ == b.width_)
+      << "Matrix dimensions are not equal";
+
+  MatrixOffset offset(0, 0, 0, 0);
+  applyBinary(op, b, height_, width_, offset);
+  return 0;
+}
+
+template <class T>
+template <class Op>
+int BaseMatrixT<T>::applyBinary(
+    Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) {
+  applyBinary(op, b, numRows, numCols, offset, false_type(), false_type());
+  return 0;
+}
+
+template <class T>
+template <class Op, class bAsRowVector, class bAsColVector>
+int BaseMatrixT<T>::applyBinary(Op op,
+                                BaseMatrixT& b,
+                                int numRows,
+                                int numCols,
+                                MatrixOffset& offset,
+                                bAsRowVector,
+                                bAsColVector) {
+  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
+  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
+  CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch";
+
+  int dimM = numRows;
+  int dimN = numCols;
+  int lda = stride_;
+  int ldb = b.stride_;
+
+  T* A = data_;
+  T* B = b.data_;
+  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
+  CHECK_LE(dimM + offset.aRow_, this->height_);
+  CHECK_LE(dimN + offset.aCol_, this->width_);
+  if (!bAsRowVector::value && !bAsColVector::value) {
+    CHECK_LE(dimM + offset.bRow_, b.height_);
+    CHECK_LE(dimN + offset.bCol_, b.width_);
+  } else if (bAsRowVector::value && !bAsColVector::value) {
+    CHECK_LE(dimN + offset.bCol_, b.width_);
+  } else if (!bAsRowVector::value && bAsColVector::value) {
+    CHECK_LE(dimM + offset.bRow_, b.height_);
+  } else {
+  }
+  if (true == useGpu_) {
+    hl_gpu_apply_binary_op<T, Op, bAsRowVector::value, bAsColVector::value>(
+        op, A, B, dimM, dimN, lda, ldb);
+  } else {
+    hl_cpu_apply_binary_op<T, Op, bAsRowVector::value, bAsColVector::value>(
+        op, A, B, dimM, dimN, lda, ldb);
+  }
+
+  return 0;
+}
+
+template <class T>
+template <class Op>
+int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
+  CHECK_EQ(height_, b.height_);
+  CHECK_EQ(width_, b.width_);
+  CHECK_EQ(height_, c.height_);
+  CHECK_EQ(width_, c.width_);
+
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  applyTernary(op, b, c, height_, width_, offset);
+
+  return 0;
+}
+
+template <class T>
+template <class Op>
+int BaseMatrixT<T>::applyTernary(Op op,
+                                 BaseMatrixT& b,
+                                 BaseMatrixT& c,
+                                 int numRows,
+                                 int numCols,
+                                 MatrixOffset& offset) {
+  applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type());
+
+  return 0;
+}
+
+template <class T>
+template <class Op, class cAsRowVector, class cAsColVector>
+int BaseMatrixT<T>::applyTernary(Op op,
+                                 BaseMatrixT& b,
+                                 BaseMatrixT& c,
+                                 int numRows,
+                                 int numCols,
+                                 MatrixOffset& offset,
+                                 cAsRowVector,
+                                 cAsColVector) {
+  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
+  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
+  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
+  CHECK_EQ(useGpu_, b.useGpu_);
+  CHECK_EQ(useGpu_, c.useGpu_);
+
+  int dimM = numRows;
+  int dimN = numCols;
+  int lda = stride_;
+  int ldb = b.stride_;
+  int ldc = c.stride_;
+
+  T* A = data_;
+  T* B = b.data_;
+  T* C = c.data_;
+  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
+  CAL_MATRIX_START_ADDRESS(
+      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
+
+  CHECK_LE(dimM + offset.aRow_, this->height_);
+  CHECK_LE(dimN + offset.aCol_, this->width_);
+  CHECK_LE(dimM + offset.bRow_, b.height_);
+  CHECK_LE(dimN + offset.bCol_, b.width_);
+  if (!cAsRowVector::value && !cAsColVector::value) {
+    CHECK_LE(dimM + offset.cRow_, c.height_);
+    CHECK_LE(dimN + offset.cCol_, c.width_);
+  } else if (cAsRowVector::value && !cAsColVector::value) {
+    CHECK_LE(dimN + offset.cCol_, c.width_);
+  } else if (!cAsRowVector::value && cAsColVector::value) {
+    CHECK_LE(dimM + offset.cRow_, c.height_);
+  } else {
+  }
+
+  if (true == useGpu_) {
+    hl_gpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
+        op, A, B, C, dimM, dimN, lda, ldb, ldc);
+  } else {
+    hl_cpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
+        op, A, B, C, dimM, dimN, lda, ldb, ldc);
+  }
+
+  return 0;
+}
+
+template <class T>
+template <class Op>
+int BaseMatrixT<T>::applyQuaternary(Op op,
+                                    BaseMatrixT& b,
+                                    BaseMatrixT& c,
+                                    BaseMatrixT& d) {
+  CHECK_EQ(height_, b.height_);
+  CHECK_EQ(width_, b.width_);
+  CHECK_EQ(height_, c.height_);
+  CHECK_EQ(width_, c.width_);
+  CHECK_EQ(height_, d.height_);
+  CHECK_EQ(width_, d.width_);
+
+  MatrixOffset offset(0, 0, 0, 0, 0, 0, 0, 0);
+  applyQuaternary(op, b, c, d, height_, width_, offset);
+
+  return 0;
+}
+
+template <class T>
+template <class Op>
+int BaseMatrixT<T>::applyQuaternary(Op op,
+                                    BaseMatrixT& b,
+                                    BaseMatrixT& c,
+                                    BaseMatrixT& d,
+                                    int numRows,
+                                    int numCols,
+                                    MatrixOffset& offset) {
+  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
+  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
+  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
+  CHECK(!d.isSparse()) << SPARSE_SUPPORT_ERROR;
+  CHECK_EQ(useGpu_, b.useGpu_);
+  CHECK_EQ(useGpu_, c.useGpu_);
+  CHECK_EQ(useGpu_, d.useGpu_);
+
+  int dimM = numRows;
+  int dimN = numCols;
+  int lda = stride_;
+  int ldb = b.stride_;
+  int ldc = c.stride_;
+  int ldd = d.stride_;
+
+  T* A = data_;
+  T* B = b.data_;
+  T* C = c.data_;
+  T* D = d.data_;
+  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
+  CAL_MATRIX_START_ADDRESS(
+      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
+  CAL_MATRIX_START_ADDRESS(
+      D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_);
+
+  CHECK_LE(dimM + offset.aRow_, this->height_);
+  CHECK_LE(dimN + offset.aCol_, this->width_);
+  CHECK_LE(dimM + offset.bRow_, b.height_);
+  CHECK_LE(dimN + offset.bCol_, b.width_);
+  CHECK_LE(dimM + offset.cRow_, c.height_);
+  CHECK_LE(dimN + offset.cCol_, c.width_);
+  CHECK_LE(dimM + offset.dRow_, d.height_);
+  CHECK_LE(dimN + offset.dCol_, d.width_);
+  if (true == useGpu_) {
+    hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
+  } else {
+    hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
+  }
+
+  return 0;
+}
+
+template <class T>
+template <class Agg,
+          class Op,
+          class Saver,
+          class aAsRowVector,
+          class aAsColVector>
+int BaseMatrixT<T>::aggregate(Agg agg,
+                              Op op,
+                              Saver sv,
+                              BaseMatrixT& b,
+                              int numRows,
+                              int numCols,
+                              MatrixOffset& offset,
+                              aAsRowVector,
+                              aAsColVector) {
+  CHECK_EQ(useGpu_, b.useGpu_);
+
+  int ld = stride_;
+  int ldb = b.stride_;
+
+  T* dst = data_;
+  T* B = b.data_;
+  CAL_MATRIX_START_ADDRESS(
+      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
+
+  if (aAsRowVector::value && !aAsColVector::value) {
+    if (useGpu_) {
+      hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
+    } else {
+      hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
+    }
+  } else if (!aAsRowVector::value && aAsColVector::value) {
+    if (useGpu_) {
+      hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
+    } else {
+      hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
+    }
+  } else {
+    LOG(FATAL) << "not supported";
+  }
+
+  return 0;
+}
+
+template <class T>
+template <class Agg,
+          class Op,
+          class Saver,
+          class aAsRowVector,
+          class aAsColVector>
+int BaseMatrixT<T>::aggregate(Agg agg,
+                              Op op,
+                              Saver sv,
+                              BaseMatrixT& b,
+                              BaseMatrixT& c,
+                              int numRows,
+                              int numCols,
+                              MatrixOffset& offset,
+                              aAsRowVector,
+                              aAsColVector) {
+  CHECK_EQ(useGpu_, b.useGpu_);
+  CHECK_EQ(useGpu_, c.useGpu_);
+
+  int ld = stride_;
+  int ldb = b.stride_;
+  int ldc = c.stride_;
+
+  T* dst = data_;
+  T* B = b.data_;
+  T* C = c.data_;
+  CAL_MATRIX_START_ADDRESS(
+      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
+  CAL_MATRIX_START_ADDRESS(
+      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
+
+  if (aAsRowVector::value && !aAsColVector::value) {
+    if (useGpu_) {
+      hl_gpu_matrix_column_op(
+          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
+    } else {
+      hl_cpu_matrix_column_op(
+          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
+    }
+  } else if (!aAsRowVector::value && aAsColVector::value) {
+    if (useGpu_) {
+      hl_gpu_matrix_row_op(
+          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
+    } else {
+      hl_cpu_matrix_row_op(
+          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
+    }
+  } else {
+    LOG(FATAL) << "not supported";
+  }
+
+  return 0;
+}
+
+/**
+ * @brief   unary operator.
+ *
+ */
+
+DEFINE_MATRIX_UNARY_OP(Neg, a = -a);
+template <class T>
+void BaseMatrixT<T>::neg() {
+  applyUnary(unary::Neg<T>());
+}
+
+DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
+template <>
+void BaseMatrixT<real>::exp2() {
+  applyUnary(unary::Exp<real>());
+}
+
+DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
+template <>
+void BaseMatrixT<real>::log2() {
+  if (useGpu_) {
+    applyUnary(unary::Log<real>());
+  } else {
+    vLog(height_ * width_, data_, data_);
+  }
+}
+
+DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
+template <>
+void BaseMatrixT<real>::sqrt2() {
+  applyUnary(unary::Sqrt<real>());
+}
+
+DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
+template <class T>
+void BaseMatrixT<T>::square2() {
+  applyUnary(unary::Square<T>());
+}
+
+DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
+template <class T>
+void BaseMatrixT<T>::reciprocal2() {
+  applyUnary(unary::Reciprocal<T>());
+}
+
+DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
+template <class T>
+void BaseMatrixT<T>::abs2() {
+  applyUnary(unary::Abs<T>());
+}
+
+DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
+template <class T>
+void BaseMatrixT<T>::sign2() {
+  applyUnary(unary::Sign<T>());
+}
+
+DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
+template <class T>
+void BaseMatrixT<T>::zero() {
+  applyUnary(unary::Zero<T>());
+}
+
+template <class T>
+void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
+  int numRows = height_;
+  int numCols = numColumns;
+  MatrixOffset offset(columnOffset, 0);
+  applyUnary(unary::Zero<T>(), numRows, numCols, offset);
+}
+
+DEFINE_MATRIX_UNARY_OP(One, a = 1);
+template <class T>
+void BaseMatrixT<T>::one() {
+  applyUnary(unary::One<T>());
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
+template <>
+void BaseMatrixT<real>::pow2(real p) {
+  if (useGpu_) {
+    applyUnary(unary::Pow<real>(p));
+  } else {
+    vPow(height_ * width_, data_, p, data_);
+  }
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p);
+template <class T>
+void BaseMatrixT<T>::subScalar(T p) {
+  applyUnary(unary::SubScalar<T>(p));
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p);
+template <class T>
+void BaseMatrixT<T>::mulScalar(T p) {
+  applyUnary(unary::MulScalar<T>(p));
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p);
+template <class T>
+void BaseMatrixT<T>::divScalar(T p) {
+  applyUnary(unary::DivScalar<T>(p));
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p);
+template <class T>
+void BaseMatrixT<T>::assign(T p) {
+  applyUnary(unary::Assign<T>(p));
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p);
+template <class T>
+void BaseMatrixT<T>::add(T p) {
+  applyUnary(unary::Add<T>(p));
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2);
+template <class T>
+void BaseMatrixT<T>::add(T p1, T p2) {
+  applyUnary(unary::Add2<T>(p1, p2));
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip,
+                                 TWO_PARAMETER,
+                                 a = a < p1 ? p1 : (a > p2 ? p2 : a));
+template <class T>
+void BaseMatrixT<T>::clip(T p1, T p2) {
+  applyUnary(unary::Clip<T>(p1, p2));
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative,
+                                  TWO_PARAMETER,
+                                  a = b < p1 ? 0 : (b > p2 ? 0 : 1));
+template <class T>
+void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
+  applyBinary(binary::ClipDerivative<T>(p1, p2), b);
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar,
+                                 ONE_PARAMETER,
+                                 a = a > p ? 1.0f : 0.0f);
+template <class T>
+void BaseMatrixT<T>::biggerThanScalar(T p) {
+  applyUnary(unary::BiggerThanScalar<T>(p));
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p);
+template <class T>
+void BaseMatrixT<T>::downClip(T p) {
+  applyUnary(unary::DownClip<T>(p));
+}
+
+/**
+ * @brief   binary operator.
+ *
+ */
+
+DEFINE_MATRIX_BINARY_OP(Add, a += b);
+template <class T>
+void BaseMatrixT<T>::add(BaseMatrixT& b) {
+  applyBinary(binary::Add<T>(), b);
+}
+
+template <>
+void BaseMatrixT<real>::add(BaseMatrixT& b) {
+  if (useGpu_) {
+    applyBinary(binary::Add<real>(), b);
+  } else {  // cpu branch
+    CHECK_EQ(height_, b.height_);
+    CHECK_EQ(width_, b.width_);
+    vAdd(height_ * width_, data_, b.data_, data_);
+  }
+}
+
+template <class T>
+void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
+  if (columnOffset + b.width_ <= width_) {
+    int numRows = height_;
+    int numCols = b.width_;
+    MatrixOffset offset(columnOffset, 0, 0, 0);
+    applyBinary(binary::Add<T>(), b, numRows, numCols, offset);
+  } else if (columnOffset + width_ <= b.width_) {
+    int numRows = height_;
+    int numCols = width_;
+    MatrixOffset offset(0, 0, columnOffset, 0);
+    applyBinary(binary::Add<T>(), b, numRows, numCols, offset);
+  } else {
+    LOG(FATAL) << "Wrong argument "
+               << " a.width=" << width_ << " b.width=" << b.width_
+               << " columnOffset=" << columnOffset;
+  }
+}
+
+template <class T>
+void BaseMatrixT<T>::addP2P(BaseMatrixT& b) {
+  T* A = data_;
+  T* B = b.data_;
+  int dimM = height_;
+  int dimN = width_;
+
+  hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0>(
+      binary::Add<T>(), A, B, dimM, dimN, dimN, dimN);
+}
+
+template <class T>
+void BaseMatrixT<T>::addColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::Add<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              false_type(),
+              true_type() /* bAsColVector */);
+}
+
+template <class T>
+void BaseMatrixT<T>::addRowVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::Add<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              true_type() /* bAsRowVector */,
+              false_type());
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p);
+template <class T>
+void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
+  applyBinary(binary::Add1<T>(p), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
+template <>
+void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
+  if (useGpu_) {
+    applyBinary(binary::Pow<real>(p), b);
+  } else {
+    vPow(height_ * width_, b.data_, p, data_);
+  }
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b);
+template <class T>
+void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, T p2) {
+  applyBinary(binary::Add2<T>(p1, p2), b);
+}
+
+template <class T>
+void BaseMatrixT<T>::addBias(BaseMatrixT& b, T scale) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::Add1<T>(scale),
+              b,
+              numRows,
+              numCols,
+              offset,
+              true_type() /* bAsRowVector */,
+              false_type());
+}
+
+DEFINE_MATRIX_BINARY_OP(Sub, a -= b);
+template <class T>
+void BaseMatrixT<T>::sub(BaseMatrixT& b) {
+  applyBinary(binary::Sub<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p);
+template <class T>
+void BaseMatrixT<T>::sub(BaseMatrixT& b, T p) {
+  applyBinary(binary::Sub1<T>(p), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f);
+template <class T>
+void BaseMatrixT<T>::relu(BaseMatrixT& b) {
+  applyBinary(binary::Relu<T>(), b);
+}
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <>
+void BaseMatrixT<float>::relu(BaseMatrixT& b) {
+  neon::relu(data_, b.data_, height_ * width_);
+}
+#endif
+
+DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
+template <class T>
+void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
+  applyBinary(binary::ReluDerivative<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
+                        b = log(1.0 + exp((a > THRESHOLD)
+                                              ? THRESHOLD
+                                              : ((a < -THRESHOLD) ? (-THRESHOLD)
+                                                                  : a))));
+template <>
+void BaseMatrixT<real>::softrelu(BaseMatrixT& b) {
+  applyBinary(binary::Softrelu<real>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(
+    SoftreluDerivative, const T THRESHOLD = 40.0;
+    a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
+                                ? THRESHOLD
+                                : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
+template <>
+void BaseMatrixT<real>::softreluDerivative(BaseMatrixT& b) {
+  applyBinary(binary::SoftreluDerivative<real>(), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1;
+                                  b = b < p2 ? b : p2);
+template <class T>
+void BaseMatrixT<T>::brelu(BaseMatrixT& b) {
+  int p1 = 0, p2 = 24;  //! TODO(yuyang18): Make p1,p2 configuable.
+  applyBinary(binary::Brelu<T>(p1, p2), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative,
+                                  TWO_PARAMETER,
+                                  a *= (b > p1 && b < p2) ? 1.0 : 0.0);
+template <class T>
+void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
+  int p1 = 0, p2 = 24;
+  applyBinary(binary::BreluDerivative<T>(p1, p2), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
+template <class T>
+void BaseMatrixT<T>::square2(BaseMatrixT& b) {
+  applyBinary(binary::Square<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b);
+template <class T>
+void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
+  applyBinary(binary::SquareDerivative<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
+template <>
+void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
+  applyBinary(binary::Tanh<real>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b);
+template <class T>
+void BaseMatrixT<T>::tanhDerivative(BaseMatrixT& b) {
+  applyBinary(binary::TanhDerivative<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(
+    ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0));
+template <>
+void BaseMatrixT<real>::scaledTanh(BaseMatrixT& b, real p1, real p2) {
+  applyBinary(binary::ScaledTanh<real>(p1, p2), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative,
+                                  TWO_PARAMETER,
+                                  a *= p2 * (p1 - b * b));
+template <class T>
+void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
+  applyBinary(binary::ScaledTanhDerivative<T>(p1 * p1, p2 / p1), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
+template <class T>
+void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
+  applyBinary(binary::Reciprocal<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b);
+template <class T>
+void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
+  applyBinary(binary::ReciprocalDerivative<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
+template <class T>
+void BaseMatrixT<T>::abs2(BaseMatrixT& b) {
+  applyBinary(binary::Abs<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
+template <class T>
+void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) {
+  applyBinary(binary::AbsDerivative<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0;
+                        const T THRESHOLD_MAX = 13.0;
+                        T tmp = (a < THRESHOLD_MIN)
+                                    ? THRESHOLD_MIN
+                                    : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
+                        b = 1.0f / (1.0f + exp(-tmp)));
+template <>
+void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
+  if (useGpu_) {
+    applyBinary(binary::Sigmoid<real>(), b);
+  } else {  // cpu versioni
+    size_t numSamples = this->height_;
+    size_t dim = this->width_;
+    CHECK_EQ(b.height_, numSamples);
+    CHECK_EQ(b.width_, dim);
+    const real* in = this->data_;
+    real* out = b.data_;
+
+    // out = - in
+    const float THRESHOLD_MIN = -40.0;  // make sure sigmoid(x) > 0
+    const float THRESHOLD_MAX = 13.0;   // make sure sigmoid(x) < 1
+    for (size_t i = 0; i < numSamples * dim; ++i) {
+      real tmp = in[i];
+      tmp = (tmp < THRESHOLD_MIN)
+                ? THRESHOLD_MIN
+                : ((tmp > THRESHOLD_MAX) ? THRESHOLD_MAX : tmp);
+      out[i] = -tmp;
+    }
+
+    // out = exp(out)
+    vExp(numSamples * dim, out, out);
+
+    // out = 1 / (1 + out)
+    for (size_t i = 0; i < numSamples * dim; ++i) {
+      out[i] = 1 / (1 + out[i]);
+    }
+  }
+}
+
+DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b));
+template <class T>
+void BaseMatrixT<T>::sigmoidDerivative(BaseMatrixT& b) {
+  applyBinary(binary::SigmoidDerivative<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b);
+template <class T>
+void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
+  applyBinary(binary::ExpDerivative<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
+template <class T>
+void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
+  applyBinary(binary::Sign<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
+template <>
+void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
+  applyBinary(binary::Exp<real>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
+template <>
+void BaseMatrixT<real>::log2(BaseMatrixT& b) {
+  if (useGpu_) {
+    applyBinary(binary::Log<real>(), b);
+  } else {
+    vLog(height_ * width_, b.data_, data_);
+  }
+}
+
+DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
+template <>
+void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
+  applyBinary(binary::Sqrt<real>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b));
+template <>
+void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
+  if (useGpu_) {
+    applyBinary(binary::InvSqrt<real>(), b);
+  } else {  // cpu branch
+    CHECK_EQ(height_, b.height_);
+    CHECK_EQ(width_, b.width_);
+    vInvSqrt(height_ * width_, b.data_, data_);
+  }
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p));
+template <class T>
+void BaseMatrixT<T>::isEqualTo(BaseMatrixT& b, T value) {
+  applyBinary(binary::IsEqual<T>(value), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p);
+template <class T>
+void BaseMatrixT<T>::addScalar(BaseMatrixT& b, T p) {
+  applyBinary(binary::AddScalar<T>(p), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p);
+template <class T>
+void BaseMatrixT<T>::subScalar(BaseMatrixT& b, T p) {
+  applyBinary(binary::SubScalar<T>(p), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p);
+template <class T>
+void BaseMatrixT<T>::mulScalar(BaseMatrixT& b, T p) {
+  applyBinary(binary::MulScalar<T>(p), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p);
+template <class T>
+void BaseMatrixT<T>::divScalar(BaseMatrixT& b, T p) {
+  applyBinary(binary::DivScalar<T>(p), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b);
+template <class T>
+void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
+  applyBinary(binary::ScalarDiv<T>(p), b);
+}
+
+/**
+ * @brief   ternary operator.
+ *
+ */
+
+DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy,
+                         a = -c * log(b) - (1 - c) * log(1 - b));
+template <>
+void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::SoftCrossEntropy<real>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b)));
+template <class T>
+void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy,
+                         a = c > 0.5 ? -log(b) : -log(1.0 - b));
+template <>
+void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
+                                                BaseMatrixT& c) {
+  if (useGpu_) {
+    applyTernary(ternary::BinaryCrossEntropy<real>(), b, c);
+  } else {
+    CHECK_EQ(height_, b.height_);
+    CHECK_EQ(height_, c.height_);
+    CHECK_EQ(width_, b.width_);
+    CHECK_EQ(width_, c.width_);
+
+    size_t size = height_ * width_;
+    real* out = b.data_;
+    real* label = c.data_;
+    real* cost = data_;
+
+    for (size_t i = 0; i < size; ++i) {
+      cost[i] = label[i] > 0.5 ? out[i] : 1.0 - out[i];
+    }
+    vLog(size, cost, cost);
+    for (size_t i = 0; i < size; ++i) {
+      cost[i] *= -1.0;
+    }
+  }
+}
+
+DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp,
+                         a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b));
+template <class T>
+void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_OP(Add, a = b + c);
+template <class T>
+void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::Add<T>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c);
+template <class T>
+void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
+  applyTernary(ternary::Add1<T>(p1, p2), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c);
+template <class T>
+void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::Sub<T>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c);
+template <class T>
+void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
+  applyTernary(ternary::Sub1<T>(p1, p2), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c);
+template <class T>
+void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::Add2<T>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3,
+                                   THREE_PARAMETER,
+                                   a = p1 * a + p2 * b + p3 * c);
+template <class T>
+void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
+  applyTernary(ternary::Add3<T>(p1, p2, p3), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate,
+                                   THREE_PARAMETER,
+                                   c = p2 * c - p1 * (b + p3 * a);
+                                   a = a + c);
+template <class T>
+void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad
+                               BaseMatrixT& c,  // mom
+                               T p1,            // learningRate,
+                               T p2,            // momentum,
+                               T p3) {          // decayRate
+  applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c);
+}
+
+DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate,
+                                      THREE_PARAMETER,
+                                      c = p2 * c - p1 * d * (b + p3 * a);
+                                      a += c);
+template <class T>
+void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad,
+                               BaseMatrixT& c,  // mom,
+                               BaseMatrixT& d,  // lr,
+                               T p1,            // learningRate,
+                               T p2,            // momentum,
+                               T p3) {          // decayRate
+  applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b;
+                                  a = (a > lambda)
+                                          ? (a - lambda)
+                                          : (a < -lambda) ? (a + lambda) : 0);
+template <class T>
+void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) {
+  applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr);
+}
+
+template <>
+void BaseMatrixT<real>::applyL1(BaseMatrixT& lr,
+                                real learningRate,
+                                real decayRate) {
+  if (useGpu_) {
+    applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr);
+  } else {
+    simd::decayL1(this->data_,
+                  this->data_,
+                  lr.data_,
+                  learningRate * decayRate,
+                  height_ * width_);
+  }
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p;
+                                 a = (a > lambda)
+                                         ? (a - lambda)
+                                         : (a < -lambda) ? (a + lambda) : 0);
+template <class T>
+void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) {
+  applyUnary(unary::ApplyL1<T>(learningRate * decayRate));
+}
+
+template <>
+void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) {
+  if (useGpu_) {
+    applyUnary(unary::ApplyL1<real>(learningRate * decayRate));
+  } else {
+    simd::decayL1(
+        this->data_, this->data_, learningRate * decayRate, height_ * width_);
+  }
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2,
+                                  ONE_PARAMETER,
+                                  a *= (1.0f / (1.0f + p * b)));
+template <class T>
+void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
+  if (useGpu_) {
+    applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr);
+  } else {
+    size_t size = this->height_ * this->width_;
+    T decay = learningRate * decayRate;
+    for (size_t j = 0; j < size; ++j) {
+      this->data_[j] *= 1.0f / (1.0f + decay * lr.data_[j]);
+    }
+  }
+}
+
+template <class T>
+void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) {
+  BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate));
+}
+
+DEFINE_MATRIX_BINARY_OP(DotMul, a *= b);
+template <class T>
+void BaseMatrixT<T>::dotMul(BaseMatrixT& b) {
+  applyBinary(binary::DotMul<T>(), b);
+}
+
+DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c);
+template <class T>
+void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::DotMul<T>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c);
+template <class T>
+void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::DotDiv<T>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P,
+                                   TWO_PARAMETER,
+                                   a = (b + p1) / (c + p2));
+template <class T>
+void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
+  applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c);
+}
+
+DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c;
+                            a = (a > THRESHOLD)
+                                    ? THRESHOLD
+                                    : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
+                            a = log(1 + exp(a)) - a * d);
+template <>
+void BaseMatrixT<real>::rankLoss(BaseMatrixT& b,
+                                 BaseMatrixT& c,
+                                 BaseMatrixT& d) {
+  applyQuaternary(quaternary::RankLoss<real>(), b, c, d);
+}
+
+DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c;
+                            a = (a > THRESHOLD)
+                                    ? THRESHOLD
+                                    : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
+                            a = exp(a);
+                            a = (a / (1 + a) - d));
+template <>
+void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b,
+                                   BaseMatrixT& c,
+                                   BaseMatrixT& d) {
+  applyQuaternary(quaternary::RankLossBp<real>(), b, c, d);
+}
+
+/* this = log(1 + exp(b)) - c * b */
+DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0;
+                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
+                                                                 ? -THRESHOLD
+                                                                 : b;
+                         a = log(1 + exp(x)) - c * x);
+template <>
+void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::LogisticRegressionLoss<real>(), b, c);
+}
+
+/* this = exp(b)/(1+exp(b)) - c */
+DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0;
+                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
+                                                                 ? -THRESHOLD
+                                                                 : b;
+                         x = exp(x);
+                         a = x / (1 + x) - c);
+template <>
+void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b,
+                                                 BaseMatrixT& c) {
+  applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f);
+template <class T>
+void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::BiggerThan<T>(), b, c);
+}
+
+DEFINE_MATRIX_QUATERNARY_OP(
+    BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
+template <class T>
+void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
+                                BaseMatrixT& c,
+                                BaseMatrixT& d) {
+  applyQuaternary(quaternary::BiggerThan<T>(), b, c, d);
+}
+
+DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
+template <class T>
+void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::Max<T>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError,
+                                   ONE_PARAMETER,
+                                   c += ((a > p) == (b > p)) ? 0.0f : 1.0f);
+template <class T>
+void BaseMatrixT<T>::binaryClassificationError2(size_t destCol,
+                                                BaseMatrixT& b,
+                                                BaseMatrixT& c,
+                                                T p) {
+  CHECK(!useGpu_) << "do not support gpu";
+  MatrixOffset offset(0, 0, 0, 0, destCol, 0);
+  int numRows = b.height_;
+  int numCols = b.width_;
+  b.applyTernary(ternary::BinaryClassificationError<T>(p),
+                 c,
+                 *this,
+                 numRows,
+                 numCols,
+                 offset,
+                 false_type(),
+                 true_type() /*cAsColVector*/);
+}
+
+template <>
+void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
+                                                  BaseMatrixT& b,
+                                                  BaseMatrixT& c,
+                                                  real p) {
+  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
+  int numRows = b.height_;
+  int numCols = b.width_;
+  aggregate(aggregate::sum(),
+            base::binary::classificationError(p),
+            base::binary::add(),
+            b,
+            c,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
+            true_type() /*aAsColVector*/);
+}
+
+DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3,
+                                      THREE_PARAMETER,
+                                      a = p1 * b + p2 * c + p3 * d);
+template <class T>
+void BaseMatrixT<T>::add3(
+    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) {
+  applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d);
+}
+
+DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c);
+template <class T>
+void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::DotMulSquare<T>(), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c);
+template <class T>
+void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) {
+  applyTernary(ternary::DotSquareSquare<T>(), b, c);
+}
+
+DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b);
+template <class T>
+void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) {
+  applyBinary(binary::DotMulSquare<T>(), b);
+}
+
+DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b);
+template <class T>
+void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) {
+  applyBinary(binary::DotSquareMul<T>(), b);
+}
+
+DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum,
+                                      THREE_PARAMETER,
+                                      T tmp = p1 * b + p2 * c + p3 * d;
+                                      a += tmp * tmp);
+template <class T>
+void BaseMatrixT<T>::addSquareSum(
+    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) {
+  applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b);
+template <class T>
+void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) {
+  applyBinary(binary::AddSquare<T>(p), b);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare,
+                                  TWO_PARAMETER,
+                                  a = p1 * a + p2 * b * b);
+template <class T>
+void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) {
+  applyBinary(binary::DecayAddSquare<T>(p1, p2), b);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul,
+                                   TWO_PARAMETER,
+                                   a = p1 * a + p2 * b * b * c * c);
+template <class T>
+void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b,
+                                       BaseMatrixT& c,
+                                       T p1,
+                                       T p2) {
+  applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum,
+                                   THREE_PARAMETER,
+                                   a = 1 / (p1 * b + p2 * c + p3));
+template <class T>
+void BaseMatrixT<T>::reciprocalSum(
+    BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
+  applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2,
+                                  TWO_PARAMETER,
+                                  a = 1 / (p1 * b + p2));
+template <class T>
+void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
+  applyBinary(binary::Reciprocal2<T>(p1, p2), b);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum,
+                                   TWO_PARAMETER,
+                                   T tmp = p1 * b + p2 * c;
+                                   a *= tmp * tmp);
+template <class T>
+void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b,
+                                     BaseMatrixT& c,
+                                     T p1,
+                                     T p2) {
+  applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum,
+                                   TWO_PARAMETER,
+                                   T tmp = p1 * b + p2 * c;
+                                   a = tmp * tmp);
+template <class T>
+void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
+  applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum,
+                                   TWO_PARAMETER,
+                                   a *= p1 * b + p2 * c);
+template <class T>
+void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
+  applyTernary(ternary::DotMulSum<T>(p1, p2), b, c);
+}
+
+DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0);
+template <class T>
+void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) {
+  applyBinary(binary::CopyAndClear<T>(), b);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul,
+                                   TWO_PARAMETER,
+                                   a = p1 * a + p2 * b * c);
+template <class T>
+void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
+  applyTernary(ternary::AddDotMul<T>(p1, p2), b, c);
+}
+
+DEFINE_MATRIX_BINARY_OP(Assign, a = b;);
+template <class T>
+void BaseMatrixT<T>::assign(BaseMatrixT& b) {
+  if (useGpu_) {
+    applyBinary(binary::Assign<T>(), b);
+  } else {  // cpu version
+    CHECK_EQ(this->height_, b.height_);
+    CHECK_EQ(this->width_, b.width_);
+    memcpy(data_, b.data_, sizeof(T) * height_ * width_);
+  }
+}
+
+template <class T>
+void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
+  if (columnOffset + b.width_ <= width_) {
+    int numRows = height_;
+    int numCols = b.width_;
+    MatrixOffset offset(columnOffset, 0, 0, 0);
+    applyBinary(binary::Assign<T>(), b, numRows, numCols, offset);
+  } else if (columnOffset + width_ <= b.width_) {
+    int numRows = height_;
+    int numCols = width_;
+    MatrixOffset offset(0, 0, columnOffset, 0);
+    applyBinary(binary::Assign<T>(), b, numRows, numCols, offset);
+  } else {
+    LOG(FATAL) << "Wrong argument "
+               << " a.width=" << width_ << " b.width=" << b.width_
+               << " columnOffset=" << columnOffset;
+  }
+}
+
+DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
+template <class T>
+void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
+  applyBinary(binary::DeepSwap<T>(), b);
+}
+
+template <>
+void BaseMatrixT<real>::rowDotMul(size_t destCol,
+                                  BaseMatrixT& b,
+                                  BaseMatrixT& c) {
+  int numRows = b.height_;
+  int numCols = b.width_;
+  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
+  aggregate(aggregate::sum(),
+            base::binary::mul(),
+            base::binary::add(),
+            b,
+            c,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
+            true_type() /*aAsColVector*/);
+}
+
+template <class T>
+void BaseMatrixT<T>::rowDotMul2(size_t destCol,
+                                BaseMatrixT& b,
+                                BaseMatrixT& c) {
+  CHECK(!useGpu_) << "do not support gpu";
+
+  size_t height = this->height_;
+  CHECK_LT(destCol, this->width_);
+  CHECK_EQ(height, b.height_);
+  CHECK_EQ(height, c.height_);
+  CHECK_EQ(b.width_, c.width_);
+  size_t width = b.width_;
+  T* A = this->data_;
+  const T* B = b.data_;
+  const T* C = c.data_;
+  for (size_t i = 0; i < height;
+       ++i, A += this->width_, B += width, C += width) {
+    for (size_t j = 0; j < width; ++j) {
+      A[destCol] += B[j] * C[j];
+    }
+  }
+}
+
+template <>
+void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) {
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  int numRows = b.height_;
+  int numCols = b.width_;
+  aggregate(aggregate::sum(),
+            base::binary::mul(),
+            base::binary::add(),
+            b,
+            c,
+            numRows,
+            numCols,
+            offset,
+            true_type() /*aAsRowVector*/,
+            false_type());
+}
+
+template <class T>
+void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
+  CHECK(!useGpu_) << "do not support gpu";
+
+  CHECK_EQ(height_, 1LU);
+  CHECK_EQ(b.height_, c.height_);
+  CHECK_EQ(width_, b.width_);
+  CHECK_EQ(width_, c.width_);
+  size_t height = b.height_;
+  size_t width = b.width_;
+  T* A = this->data_;
+  const T* B = b.data_;
+  const T* C = c.data_;
+  for (size_t i = 0; i < height; ++i, B += width, C += width) {
+    for (size_t j = 0; j < width; ++j) {
+      A[j] += B[j] * C[j];
+    }
+  }
+}
+
+DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c);
+template <class T>
+void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) {
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyTernary(ternary::addDotMulMMV<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               true_type() /*cAsRowVector*/,
+               false_type());
+}
+
+template <class T>
+void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
+  CHECK(!useGpu_) << "do not support gpu";
+
+  CHECK_EQ(c.height_, 1LU);
+  CHECK_EQ(height_, b.height_);
+  CHECK_EQ(width_, b.width_);
+  CHECK_EQ(width_, c.width_);
+  size_t height = height_;
+  size_t width = width_;
+  T* A = this->data_;
+  const T* B = b.data_;
+  const T* C = c.data_;
+  for (size_t i = 0; i < height; ++i, A += width, B += width) {
+    for (size_t j = 0; j < width; ++j) {
+      A[j] += B[j] * C[j];
+    }
+  }
+}
+
+template <class T>
+void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
+  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyTernary(ternary::DotMul<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               false_type(),
+               true_type() /*cAsColVector*/);
+}
+
+template <class T>
+void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
+  CHECK(!useGpu_) << "do not support gpu";
+
+  size_t height = this->height_;
+  size_t width = this->width_;
+  CHECK_EQ(height, b.height_);
+  CHECK_EQ(width, b.width_);
+  CHECK_LT(cCol, c.width_);
+  CHECK_EQ(height, c.height_);
+  T* A = this->data_;
+  const T* B = b.data_;
+  const T* C = c.data_;
+  for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) {
+    for (size_t j = 0; j < width; ++j) {
+      A[j] = B[j] * C[cCol];
+    }
+  }
+}
+
+template <class T>
+void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
+  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
+  int numRows = height_;
+  int numCols = width_;
+  applyTernary(ternary::DotMul<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               true_type() /* cAsRowVector */,
+               false_type() /* cAsColVector */);
+}
+
+template <class T>
+void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
+  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
+  int numRows = height_;
+  int numCols = width_;
+  applyTernary(ternary::addDotMulMMV<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               true_type() /* cAsRowVector */,
+               false_type() /* cAsColVector */);
+}
+
+template <class T>
+void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
+  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyTernary(ternary::addDotMulMMV<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               false_type(),
+               true_type() /*cAsColVector*/);
+}
+
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c);
+template <class T>
+void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) {
+  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyTernary(ternary::RowAdd<T>(p),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               false_type(),
+               true_type() /*cAsColVector*/);
+}
+
+DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c));
+template <>
+void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
+  if (useGpu_) {
+    MatrixOffset offset(0, 0, 0, 0, cCol, 0);
+    int numRows = height_;
+    int numCols = width_;
+    applyTernary(ternary::RowPow<real>(),
+                 b,
+                 c,
+                 numRows,
+                 numCols,
+                 offset,
+                 false_type(),
+                 true_type() /*cAsColVector*/);
+  } else {
+    size_t height = this->height_;
+    size_t width = this->width_;
+    CHECK_EQ(height, b.height_);
+    CHECK_EQ(width, b.width_);
+    CHECK_LT(cCol, c.width_);
+    CHECK_EQ(height, c.height_);
+    real* A = this->data_;
+    const real* B = b.data_;
+    const real* C = c.data_;
+    for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) {
+      vPow(width, B, C[cCol], A);
+    }
+  }
+}
+
+template <class T>
+void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotMul<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              true_type() /* bAsRowVector */,
+              false_type());
+}
+
+DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b);
+template <class T>
+void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotDiv<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              true_type() /* bAsRowVector */,
+              false_type());
+}
+
+template <class T>
+void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotMul<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              false_type(),
+              true_type() /* bAsColVector */);
+}
+
+template <class T>
+void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotDiv<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              false_type(),
+              true_type() /* bAsColVector */);
+}
+
+template <>
+template <class Agg>
+int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
+  CHECK_EQ(height_, numRows);
+  CHECK_EQ(width_, 1UL);
+  aggregate(agg,
+            base::unary::identity(),
+            base::binary::second(),
+            b,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
+            true_type() /*aAsColVector*/);
+
+  return 0;
+}
+
+template <>
+template <class Agg, class Saver>
+int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
+  CHECK_EQ(height_, numRows);
+  CHECK_EQ(width_, 1UL);
+  aggregate(agg,
+            base::unary::identity(),
+            sv,
+            b,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
+            true_type() /*aAsColVector*/);
+
+  return 0;
+}
+
+template <>
+template <class Agg>
+int BaseMatrixT<real>::applyRow(Agg agg,
+                                real scaleDest,
+                                real scaleAgg,
+                                BaseMatrixT& b) {
+  if (scaleDest != 0) {
+    applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
+  } else {
+    applyRow(agg, base::binary::second(), b);
+    if (scaleAgg != 1) {
+      mulScalar(scaleAgg);
+    }
+  }
+  return 0;
+}
+
+template <>
+template <class Agg, class Op, class Saver>
+int BaseMatrixT<real>::applyRow(
+    Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) {
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
+  CHECK_EQ(height_, numRows);
+  CHECK_EQ(width_, 1UL);
+  CHECK_EQ(c.height_, numRows);
+  CHECK_EQ(c.width_, numCols);
+  aggregate(agg,
+            op,
+            sv,
+            b,
+            c,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
+            true_type() /*aAsColVector*/);
+  return 0;
+}
+
+template <>
+template <class Agg, class Op>
+int BaseMatrixT<real>::applyRow(Agg agg,
+                                Op op,
+                                real scaleDest,
+                                real scaleAgg,
+                                BaseMatrixT& b,
+                                BaseMatrixT& c) {
+  if (scaleDest != 0) {
+    applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
+  } else {
+    applyRow(agg, op, base::binary::second(), b, c);
+    if (scaleAgg != 1) {
+      mulScalar(scaleAgg);
+    }
+  }
+  return 0;
+}
+
+template <>
+template <class Agg>
+int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
+  CHECK_EQ(width_, numCols);
+  CHECK_EQ(height_, 1UL);
+  aggregate(agg,
+            base::unary::identity(),
+            base::binary::second(),
+            b,
+            numRows,
+            numCols,
+            offset,
+            true_type() /*aAsRowVector*/,
+            false_type());
+
+  return 0;
+}
+
+template <>
+template <class Agg, class Saver>
+int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0, 0, 0);
+  size_t numRows = b.height_;
+  size_t numCols = b.width_;
+  CHECK_EQ(width_, numCols);
+  CHECK_EQ(height_, 1UL);
+  aggregate(agg,
+            base::unary::identity(),
+            sv,
+            b,
+            numRows,
+            numCols,
+            offset,
+            true_type() /*aAsRowVector*/,
+            false_type());
+
+  return 0;
+}
+
+template <>
+template <class Agg>
+int BaseMatrixT<real>::applyCol(Agg agg,
+                                real scaleDest,
+                                real scaleAgg,
+                                BaseMatrixT& b) {
+  if (scaleDest != 0) {
+    applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
+  } else {
+    applyCol(agg, base::binary::second(), b);
+    if (scaleAgg != 1) {
+      mulScalar(scaleAgg);
+    }
+  }
+  return 0;
+}
+
+template <>
+void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
+  applyRow(aggregate::sum(), scaleDest, scaleSum, b);
+}
+
+template <>
+void BaseMatrixT<real>::maxRows(BaseMatrixT& b) {
+  applyRow(aggregate::max(), b);
+}
+
+template <>
+void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
+  applyRow(aggregate::min(), b);
+}
+
+template <>
+void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
+  applyCol(aggregate::max(), b);
+}
+
+template <>
+void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
+  applyCol(aggregate::min(), b);
+}
+
+template <>
+void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
+  applyCol(aggregate::sum(), scaleDest, scaleSum, b);
+}
+
+template <>
+void BaseMatrixT<real>::sumOfSquaredDiffs(BaseMatrixT& b,
+                                          BaseMatrixT& c,
+                                          real scaleSum,
+                                          real scaleDest) {
+  applyRow(
+      aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c);
+}
+
+template <>
+void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
+                                      BaseMatrixT& c,
+                                      real scaleSum,
+                                      real scaleDest) {
+  applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c);
+}
+
+template class BaseMatrixT<real>;
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
+template class BaseMatrixT<int>;
+
+#else
+
+template <>
+void BaseMatrixT<int>::zero() {
+  applyUnary(unary::Zero<int>());
+}
+
+template <>
+void BaseMatrixT<int>::assign(int p) {
+  applyUnary(unary::Assign<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
+  applyBinary(binary::IsEqual<int>(value), b);
+}
+
+template <>
+void BaseMatrixT<int>::neg() {
+  applyUnary(unary::Neg<int>());
+}
+
+template <>
+void BaseMatrixT<int>::abs2() {
+  applyUnary(unary::Abs<int>());
+}
+
+template <>
+void BaseMatrixT<int>::add(int p) {
+  applyUnary(unary::Add<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::add(int p1, int p2) {
+  applyUnary(unary::Add2<int>(p1, p2));
+}
+
+template <>
+void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
+  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
+}
+
+#endif
+}  // namespace paddle
diff --git a/paddle/legacy/math/BaseMatrix.h b/paddle/legacy/math/BaseMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..4627f847d356f07600edae8cadcb02302e19381c
--- /dev/null
+++ b/paddle/legacy/math/BaseMatrix.h
@@ -0,0 +1,1095 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <stdint.h>
+#include <cstddef>
+#include "TensorExpression.h"
+#include "paddle/legacy/utils/Common.h"
+
+namespace paddle {
+
+/*
+ * nvcc currently does not support C++11,
+ * so I realized false_type and true_type.
+ */
+template <class T, T v>
+struct bool_constant {
+  static const T value = v;
+};
+typedef bool_constant<bool, false> false_type;
+typedef bool_constant<bool, true> true_type;
+
+/**
+ * @brief   Calculate matrix element address.
+ *
+ * For instance, address of A[i][j] = i * ld + j.
+ *
+ */
+#define CAL_MATRIX_START_ADDRESS(address, height, width, ld, col, row) \
+  CHECK_LE(col, width);                                                \
+  CHECK_LE(row, height);                                               \
+  address += row * ld + col;
+
+class MatrixOffset {
+ public:
+  size_t aCol_;
+  size_t aRow_;
+  size_t bCol_;
+  size_t bRow_;
+  size_t cCol_;
+  size_t cRow_;
+  size_t dCol_;
+  size_t dRow_;
+  MatrixOffset(size_t aCol = 0,
+               size_t aRow = 0,
+               size_t bCol = 0,
+               size_t bRow = 0,
+               size_t cCol = 0,
+               size_t cRow = 0,
+               size_t dCol = 0,
+               size_t dRow = 0)
+      : aCol_(aCol),
+        aRow_(aRow),
+        bCol_(bCol),
+        bRow_(bRow),
+        cCol_(cCol),
+        cRow_(cRow),
+        dCol_(dCol),
+        dRow_(dRow) {}
+};
+
+template <class T>
+class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
+ public:
+  size_t height_, width_;
+  size_t stride_;
+  T* data_;
+  bool trans_;
+  bool useGpu_;
+
+ public:
+  virtual ~BaseMatrixT() {}
+  BaseMatrixT(size_t height, size_t width, T* data, bool trans, bool useGpu)
+      : height_(height),
+        width_(width),
+        stride_(width),
+        data_(data),
+        trans_(trans),
+        useGpu_(useGpu) {}
+
+  /**
+   * @note This constructor is for temporarily making a matrix with different
+   *       useGpu flag as the original matrix so that mixed gpu/cpu operations
+   *       can be performed successfully.
+   */
+  BaseMatrixT(BaseMatrixT& mat, bool useGpu)
+      : height_(mat.height_),
+        width_(mat.width_),
+        stride_(mat.stride_),
+        data_(mat.data_),
+        trans_(mat.trans_),
+        useGpu_(useGpu) {}
+
+  BaseMatrixT(size_t height,
+              size_t width,
+              size_t stride,
+              T* data,
+              bool trans,
+              bool use_gpu)
+      : height_(height),
+        width_(width),
+        stride_(stride),
+        data_(data),
+        trans_(trans),
+        useGpu_(use_gpu) {
+    /* CHECK_LE(width_, stride_); */
+  }
+
+  /// caller should make sure that the size of data is at least height*width
+  void setData(T* data) { data_ = data; }
+
+  /**
+   * unary operator: element wise op(a).
+   *
+   * @code
+   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
+   * @endcode
+   */
+  template <class Op>
+  int applyUnary(Op op);
+
+  /**
+   * unary operator: element wise op(a).
+   *
+   * @code
+   * for 0 <= i < numRows & for 0 <= j < numCols.
+   * While matrix start address is:
+   *  A = this->data_ + offset.aRow_*ld + offset.aCol_;
+   * @endcode
+   */
+  template <class Op>
+  int applyUnary(Op op, int numRows, int numCols, MatrixOffset& offset);
+
+  /**
+   * binary operator: element wise op(a, b).
+   *
+   * @code
+   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
+   * While this->height_ == b.height_ && this->width_ == b.width_.
+   * @endcode
+   */
+  template <class Op>
+  int applyBinary(Op op, BaseMatrixT& b);
+
+  /**
+   * binary operator: element wise op(a, b)
+   *
+   * @code
+   * for 0 <= i < numRows & for 0 <= j < numCols.
+   * While matrix start address is:
+   *   A = this->data_ + offset.aRow_*lda + offset.aCol_;
+   *   B = b->data_ + offset.bRow_*ldb + offset.bCol_;
+   *
+   * if (bAsRowVector == false_type && bAsColVector == false_type)
+   *   op(A[i * lda + j], B[i * ldb + j])
+   *
+   * if (bAsRowVector == true_type && bAsColVector == false_type)
+   *   op(A[i * lda + j], B[j])
+   *
+   * if (bAsRowVector == false_type && bAsColVector == true_type)
+   *   op(A[i * lda + j], B[i * ldb])
+   *
+   * if (bAsRowVector == true_type && bAsColVector == true_type)
+   *   op(A[i * lda + j], B[0])
+   * @endcode
+   */
+  template <class Op, class bAsRowVector, class bAsColVector>
+  int applyBinary(Op op,
+                  BaseMatrixT& b,
+                  int numRows,
+                  int numCols,
+                  MatrixOffset& offset,
+                  bAsRowVector,
+                  bAsColVector);
+
+  template <class Op>
+  int applyBinary(
+      Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset);
+
+  /**
+   * ternary operator: element wise op(a, b, c).
+   *
+   * @code
+   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
+   *
+   * While this->height_ == b.height_ && this->width_ == b.width_
+   *    && this->height_ == c.height_ && this->width_ == c.width_
+   * @endcode
+   */
+  template <class Op>
+  int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * ternary operator: element wise op(a, b, c).
+   *
+   * @code
+   *  for 0 <= i < numRows & for 0 <= j < numCols.
+   *  While matrix start address is:
+   *
+   *    A = this->data_ + offset.aRow_*lda + offset.aCol_;
+   *    B = b->data_ + offset.bRow_*ldb + offset.bCol_;
+   *    C = c->data_ + offset.cRow_*ldc + offset.cCol_;
+   *
+   *    if (cAsRowVector == false_type && cAsColVector == false_type)
+   *      op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j])
+   *
+   *    if (cAsRowVector == true_type && cAsColVector == false_type)
+   *      op(A[i*lda + j], B[i*ldb + j], C[j])
+   *
+   *    if (cAsRowVector == false_type && cAsColVector == true_type)
+   *      op(A[i*lda + j], B[i*ldb + j], C[i*ldc])
+   *
+   *    if (cAsRowVector == 1 && cAsColVector == 1)
+   *      op(A[i*lda + j], B[i*ldb + j], C[0])
+   * @endcode
+   */
+  template <class Op, class cAsRowVector, class cAsColVector>
+  int applyTernary(Op op,
+                   BaseMatrixT& b,
+                   BaseMatrixT& c,
+                   int numRows,
+                   int numCols,
+                   MatrixOffset& offset,
+                   cAsRowVector,
+                   cAsColVector);
+
+  template <class Op>
+  int applyTernary(Op op,
+                   BaseMatrixT& b,
+                   BaseMatrixT& c,
+                   int numRows,
+                   int numCols,
+                   MatrixOffset& offset);
+
+  /**
+   * quaternary operator: element wise op(a, b, c, d).
+   *
+   * @code
+   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
+   *
+   * While this->height_ == b.height_ && this->width_ == b.width_
+   *    && this->height_ == c.height_ && this->width_ == c.width_
+   *    && this->height_ == d.height_ && this->width_ == d.width_
+   * @endcode
+   */
+  template <class Op>
+  int applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
+
+  /**
+   * quaternary operator: element wise op(a, b, c, d).
+   *
+   * @code
+   * for 0 <= i < numRows & for 0 <= j < numCols.
+   * While matrix start address is:
+   *    A = this->data_ + offset.aRow_*lda + offset.aCol_;
+   *    B = b->data_ + offset.bRow_*ldb + offset.bCol_;
+   *    C = c->data_ + offset.cRow_*ldc + offset.cCol_;
+   *    D = d->data_ + offset.dRow_*ldd + offset.dCol_;
+   * @endcode
+   */
+  template <class Op>
+  int applyQuaternary(Op op,
+                      BaseMatrixT& b,
+                      BaseMatrixT& c,
+                      BaseMatrixT& d,
+                      int numRows,
+                      int numCols,
+                      MatrixOffset& offset);
+
+  /**
+   * a aggregate expression that apply each row(or column) of matrix b.
+   * op and sv is element wise operator.
+   *
+   * @code
+   * if (aAsRowVector == true_type && aAsColVector == false_type)
+   *  for each column j & 0 <= i < numRows, do:
+   *    dst = agg(op(b[i*ldb + j]))
+   *    a[j] = sv(a[j], dst)
+   *
+   * if (aAsRowVector == false_type && aAsColVector == true_type)
+   *  for each row i & 0 <= j < numCols, do:
+   *    dst = agg(op(b[i*ldb + j]))
+   *    a[i] = sv(a[i], dst)
+   * @endcode
+   */
+  template <class Agg,
+            class Op,
+            class Saver,
+            class aAsRowVector,
+            class aAsColVector>
+  int aggregate(Agg agg,
+                Op op,
+                Saver sv,
+                BaseMatrixT& b,
+                int numRows,
+                int numCols,
+                MatrixOffset& offset,
+                aAsRowVector,
+                aAsColVector);
+
+  /**
+   * a aggregate expression that apply each row(or column) of matrix b and c.
+   *
+   * op and sv is element wise operator.
+   *
+   * @code
+   * if (aAsRowVector == true_type && aAsColVector == false_type)
+   *   for each column j & 0 <= i < numRows, do:
+   *     dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
+   *     a[j] = sv(a[j], dst)
+   *
+   * if (aAsRowVector == false_type && aAsColVector == true_type)
+   *   for each row i & 0 <= j < numCols, do:
+   *     dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
+   *     a[i] = sv(a[i], dst)
+   * @endcode
+   */
+  template <class Agg,
+            class Op,
+            class Saver,
+            class aAsRowVector,
+            class aAsColVector>
+  int aggregate(Agg agg,
+                Op op,
+                Saver sv,
+                BaseMatrixT& b,
+                BaseMatrixT& c,
+                int numRows,
+                int numCols,
+                MatrixOffset& offset,
+                aAsRowVector,
+                aAsColVector);
+
+  /**
+   * a aggregate expression that apply each row of matrix b.
+   *
+   * @code
+   * for each row i & 0 <= j < b.width_, do:
+   *   this[i] = agg(b[i*ldb + j])
+   * @endcode
+   */
+  template <class Agg>
+  int applyRow(Agg agg, BaseMatrixT& b);
+
+  /**
+   * a aggregate expression that apply each row of matrix b.
+   *
+   * @code
+   * for each row i & 0 <= j < b.width_, do:
+   *   dst = agg(op(b[i*ldb + j], c[i*ldc + j])
+   *   this[i] = sv(this[i], dst)
+   * @endcode
+   */
+  template <class Agg, class Op, class Saver>
+  int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c);
+
+  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
+  template <class Agg, class Op>
+  int applyRow(Agg agg,
+               Op op,
+               real scaleDest,
+               real scaleAgg,
+               BaseMatrixT& b,
+               BaseMatrixT& c);
+
+  /**
+   * a aggregate expression that apply each row of matrix b.
+   *
+   * @code
+   * for each row i & 0 <= j < b.width_, do:
+   *   dst = agg(b[i*ldb + j])
+   *   this[i] = sv(this[i], dst)
+   * @endcode
+   */
+  template <class Agg, class Saver>
+  int applyRow(Agg agg, Saver sv, BaseMatrixT& b);
+
+  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
+  template <class Agg>
+  int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
+
+  /**
+   * a aggregate expression that apply each column of matrix b.
+   *
+   * @code
+   * for each column j & 0 <= i < b.height_, do:
+   *   this[j] = agg(b[i*ldb + j])
+   * @endcode
+   */
+  template <class Agg>
+  int applyCol(Agg agg, BaseMatrixT& b);
+
+  /**
+   * a aggregate expression that apply each column of matrix b.
+   *
+   * @code
+   * for each column j & 0 <= i < b.height_, do:
+   *   dst = agg(b[i*ldb + j])
+   *   this[j] = sv(this[j], dst)
+   * @endcode
+   */
+  template <class Agg, class Saver>
+  int applyCol(Agg agg, Saver sv, BaseMatrixT& b);
+
+  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
+  template <class Agg>
+  int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
+
+  bool useGpu() const { return useGpu_; }
+
+  const T* rowBuf(size_t row) const { return data_ + width_ * row; }
+
+  T* rowBuf(size_t row) { return data_ + width_ * row; }
+
+  /**
+   * @brief   unary operator.
+   *
+   */
+  void neg();
+  void exp2();
+  void pow2(T p);
+  void log2();
+  void sqrt2();
+  void square2();
+  void reciprocal2();
+  void abs2();
+  void sign2();
+  void zero();
+
+  /**
+   * @code
+   * this(row, col + columnOffset) = 0 for 0 <= col < numColumns
+   * @endcode
+   */
+  void zeroAtOffset(int64_t columnOffset, int64_t numColumns);
+  void one();
+  void subScalar(T p);
+  void mulScalar(T p);
+  void divScalar(T p);
+
+  /**
+   * @code
+   * this = p
+   * @endcode
+   */
+  void assign(T p);
+
+  /**
+   * @code
+   * swap(this, b)
+   * example: swap two Matrices
+   * MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+   * MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
+   * cpuA->deepSwap(*cpuB);
+   * @endcode
+   */
+  void deepSwap(BaseMatrixT& b);
+
+  /**
+   * @code
+   * this = this + p
+   * @endcode
+   */
+  void add(T p);
+
+  /**
+   * @code
+   * this = this*p1 + p2
+   * @endcode
+   */
+  void add(T p1, T p2);
+
+  /**
+   * this = this < low ? low : this
+   *
+   * this = this > high ? high : this
+   */
+  void clip(T p1, T p2);
+
+  /**
+   * this = b < low ? 0 : 1
+   *
+   * this = b > high ? 0 : 1
+   */
+  void clipDerivative(BaseMatrixT& b, T p1, T p2);
+
+  /**
+   * @code
+   * a = a > p ? 1.0f : 0.0f
+   * @endcode
+   */
+  void biggerThanScalar(T p);
+
+  /**
+   * @code
+   * a = a > p ? a : p
+   * @endcode
+   */
+  void downClip(T p);
+
+  /**
+   * @code
+   * this = b
+   * @endcode
+   */
+  void assign(BaseMatrixT& b);
+
+  /**
+   * @code
+   * If b.width + columOffset <= this.width
+   *  this(row, col + columnOffset) = b(row, col) for 0 <= col < b.width
+   *
+   * If this.width + columnOffset <= b.width
+   *  this(row, col) = b(row, col + columnOffset) for 0 <= col < this.width
+   *
+   * Otherwise, FATAL
+   * @endcode
+   */
+  void assignAtOffset(BaseMatrixT& b, int64_t columnOffset);
+
+  /// this = this + b
+  void add(BaseMatrixT& b);
+
+  /**
+   * @code
+   * If b.width + columOffset <= this.width
+   *  this(row, col + columnOffset) += b(row, col) for 0 <= col < b.width
+   *
+   * If this.width + columnOffset <= b.width
+   *  this(row, col) += b(row, col + columnOffset) for 0 <= col < this.width
+   *
+   * Otherwise, FATAL
+   * @endcode
+   */
+  void addAtOffset(BaseMatrixT& b, int64_t columnOffset);
+
+  void addColVector(BaseMatrixT& b);
+  void addRowVector(BaseMatrixT& b);
+  void addBias(BaseMatrixT& b, T scale);
+
+  void mulRowVector(BaseMatrixT& b);
+  void divRowVector(BaseMatrixT& b);
+
+  void mulColVector(BaseMatrixT& b);
+  void divColVector(BaseMatrixT& b);
+
+  void addP2P(BaseMatrixT& b);
+
+  /**
+   * @code
+   * this = this + b*p
+   * @endcode
+   */
+  void add(BaseMatrixT& b, T p);
+
+  /**
+   * @code
+   * this = p1*this + p2*b
+   * @endcode
+   */
+  void add(BaseMatrixT& b, T p1, T p2);
+
+  /**
+   * @code
+   * this = this - b
+   * @endcode
+   */
+  void sub(BaseMatrixT& b);
+
+  /**
+   * @code
+   * this = this - b*p
+   * @endcode
+   */
+  void sub(BaseMatrixT& b, T p);
+
+  /**
+   * @code
+   * b = max(0, this)
+   * @endcode
+   */
+  void relu(BaseMatrixT& b);
+  void reluDerivative(BaseMatrixT& b);
+
+  /**
+   * @code
+   * b = log(1.0 + exp(this))
+   * @endcode
+   */
+  void softrelu(BaseMatrixT& b);
+  void softreluDerivative(BaseMatrixT& b);
+
+  /**
+   * @code
+   * b = min(max(this, p1), p2)
+   * @endcode
+   */
+  void brelu(BaseMatrixT& b);
+  void breluDerivative(BaseMatrixT& b);
+
+  /**
+   * @code
+   * b = this * this
+   * @endcode
+   */
+  void square2(BaseMatrixT& b);
+  void squareDerivative(BaseMatrixT& b);
+
+  /**
+   * @code
+   * b = tanh(this)
+   * @endcode
+   */
+  void tanh(BaseMatrixT& b);
+  void tanhDerivative(BaseMatrixT& b);
+
+  /**
+   * @code
+   * b = p1 * tanh(p2 * this)
+   * @endcode
+   */
+  void scaledTanh(BaseMatrixT& b, T p1, T p2);
+  void scaledTanhDerivative(BaseMatrixT& b, T p1, T p2);
+
+  /**
+   * @code
+   * b = 1.0f / this
+   * @endcode
+   */
+  void reciprocal2(BaseMatrixT& b);
+  void reciprocalDerivative(BaseMatrixT& b);
+
+  /**
+   * @code
+   * b = this > 0.0f ? this : -this
+   * @endcode
+   */
+  void abs2(BaseMatrixT& b);
+  void absDerivative(BaseMatrixT& b);
+
+  /**
+   * @code
+   * b = 1.0f / (1.0f + exp(-this))
+   * @endcode
+   */
+  void sigmoid(BaseMatrixT& b);
+  void sigmoidDerivative(BaseMatrixT& b);
+
+  /**
+   * @code
+   * b = a
+   * @endcode
+   */
+  void expDerivative(BaseMatrixT& b);
+
+  void sign2(BaseMatrixT& b);
+
+  void exp2(BaseMatrixT& b);
+  void pow2(BaseMatrixT& b, T p);
+  void log2(BaseMatrixT& b);
+  void sqrt2(BaseMatrixT& b);
+  void addScalar(BaseMatrixT& b, T p);
+  void subScalar(BaseMatrixT& b, T p);
+  void mulScalar(BaseMatrixT& b, T p);
+  void divScalar(BaseMatrixT& b, T p);
+  void scalarDiv(BaseMatrixT& b, T p);
+
+  /**
+   * @code
+   * this = 1.0f / sqrt(b)
+   * @endcode
+   */
+  void invSqrt(BaseMatrixT& b);
+
+  /// this = (b == value)
+  void isEqualTo(BaseMatrixT& b, T value);
+
+  /**
+   * @brief   ternary operator.
+   */
+  void softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c);
+  void softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c);
+  void binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c);
+  void binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this = b + c
+   * @endcode
+   */
+  void add(BaseMatrixT& b, BaseMatrixT& c);
+  /**
+   * @code
+   * this = b*p1 + c*p2
+   * @endcode
+   */
+  void add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2);
+  /**
+   * @code
+   * this = b - c
+   * @endcode
+   */
+  void sub(BaseMatrixT& b, BaseMatrixT& c);
+  /**
+   * @code
+   * this = b*p1 - c*p2
+   * @endcode
+   */
+  void sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2);
+
+  /**
+   * @code
+   * this = this + b + c
+   * @endcode
+   */
+  void add2(BaseMatrixT& b, BaseMatrixT& c);
+  /**
+   * @code
+   * this = this*p1 + b*p2 + c*p3
+   * @endcode
+   */
+  void add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3);
+
+  /**
+   * @code
+   * this = a*p1 + b*p2 + c*p3
+   * @endcode
+   */
+  void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3);
+
+  /**
+   * @code
+   *   c = p2 * c - p1 *  (b + p3 * this)
+   *   this += mom
+   * @endcode
+   */
+  void sgdUpdate(BaseMatrixT& b,  //  grad
+                 BaseMatrixT& c,  //  mom
+                 T p1,            //  learningRate,
+                 T p2,            //  momentum,
+                 T p3);           //  decayRate
+
+  /**
+   * @code
+   *   c = p2 * c - p1 * d * (b + p3 * this)
+   *   this += mom
+   * @endcode
+   */
+  void sgdUpdate(BaseMatrixT& b,  // grad,
+                 BaseMatrixT& c,  // mom,
+                 BaseMatrixT& d,  // lr,
+                 T p1,            // learningRate,
+                 T p2,            // momentum,
+                 T p3);           // decayRate
+
+  /// apply L1/L2 to *this*
+  virtual void applyL1(T learningRate, T decayRate);
+  void applyL1(BaseMatrixT& lr, T learningRate, T decayRate);
+  void applyL2(T learningRate, T decayRate);
+  void applyL2(BaseMatrixT& lr, T learningRate, T decayRate);
+
+  /**
+   * @code
+   * this *= b
+   * @endcode
+   */
+  void dotMul(BaseMatrixT& b);
+
+  /**
+   * @code
+   * this = b * c
+   * @endcode
+   */
+  void dotMul(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this = b / c
+   * @endcode
+   */
+  void dotDiv(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this = (b + p1) / (c + p2)
+   * @endcode
+   */
+  void dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
+
+  /**
+   * @code
+   * this = log(1 + exp(b - c)) - d * (b - c)
+   * @endcode
+   */
+  void rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
+  void rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
+
+  /**
+   * @code
+   * this = log(1 + exp(b)) - c * b
+   * @endcode
+   */
+  void logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this += exp(b)/(1+exp(b)) - c
+   * @endcode
+   */
+  void logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this = b > c ? 1.0 : 0.0
+   * @endcode
+   */
+  void biggerThan(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this = ((b>c && d>0.5) || (b<c && d<0.5)) ? 1 : 0)
+   * @endcode
+   */
+  void biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
+
+  /**
+   * @code
+   * this = b>c ? b : c
+   * @endcode
+   */
+  void max2(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this[destCol] += (b>p1 == c>p1) ? 0 : 1)
+   * @endcode
+   */
+  void binaryClassificationError(size_t destCol,
+                                 BaseMatrixT& b,
+                                 BaseMatrixT& c,
+                                 T p);
+  void binaryClassificationError2(size_t destCol,
+                                  BaseMatrixT& b,
+                                  BaseMatrixT& c,
+                                  T p);
+
+  /**
+   * @code
+   * this = this * b * b
+   * @endcode
+   */
+  void dotMulSquare(BaseMatrixT& b);
+
+  /**
+   * @code
+   * this = this * this * b
+   * @endcode
+   */
+  void dotSquareMul(BaseMatrixT& b);
+
+  /**
+   * @code
+   * this = b * c * c
+   * @endcode
+   */
+  void dotMulSquare(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this = b * b * c * c
+   * @endcode
+   */
+  void dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this = this * (p1*b + p2*c)^2
+   * @endcode
+   */
+  void dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
+
+  /**
+   * @code
+   * this = (p1*b + p2*c)^2
+   * @endcode
+   */
+  void dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
+
+  /**
+   * @code
+   * this=  this * (p1*b + p2*c)
+   * @endcode
+   */
+  void dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
+
+  /**
+   * @code
+   * this += sqr(p1*b + p2*c + p3*d)
+   * @endcode
+   */
+  void addSquareSum(
+      BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3);
+
+  /**
+   * @code
+   * this += p * sqr(b)
+   * @endcode
+   */
+  void addSquare(BaseMatrixT& b, T p);
+
+  /**
+   * @code
+   * this = p1 * this + p2 * sqr(b)
+   * @endcode
+   */
+  void decayAddSquare(BaseMatrixT& b, T p1, T p2);
+
+  /**
+   * @code
+   * this = p1 * this + p2 * sqr(b * c)
+   * @endcode
+   */
+  void decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
+
+  /**
+   * @code
+   * this = 1 / (p1 * b + p2)
+   * @endcode
+   */
+  void reciprocal2(BaseMatrixT& b, T p1, T p2);
+
+  /**
+   * @code
+   * this = 1 / (p1 * b + p2 * c + p3)
+   * @endcode
+   */
+  void reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3);
+
+  /**
+   * @code
+   * b = this; this = 0
+   * @endcode
+   */
+  void copyAndClear(BaseMatrixT& b);
+
+  /**
+   * @code
+   * this_row[destCol] += dotprod(b_row, c_row)
+   * @endcode
+   */
+  void rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c);
+  void rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * this is vector (one row matrix)
+   *
+   * @code
+   *   for each row i, do:
+   *      this_row += dotmul(b_row_i, c_row_i)
+   * @endcode
+   */
+  void addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c);
+  void addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * c is vector (one row matrix)
+   *
+   * @code
+   * for each row i, do:
+   *    this_row_i += dotmul(b_row_i, c_row)
+   * @endcode
+   */
+  void addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c);
+  void addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this = p1 * this + p2 * b * c
+   * @endcode
+   */
+  void addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
+
+  /**
+   * @code
+   * this_row = b_row * c_row[cCol]
+   * @endcode
+   */
+  void rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
+  void rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this_col = b_col * c_col[cRow]
+   * @endcode
+   */
+  void colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this_col += b_col * c_col[cRow]
+   * @endcode
+   */
+  void addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c);
+
+  /**
+   * @code
+   * this_row += b_row * c_row[cCol]
+   * @endcode
+   */
+  void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
+
+  /// calculate the sum of each row of the matrix b.
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij}
+  void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest);
+
+  /// calculate the maximum value of each row of the matrix b.
+  void maxRows(BaseMatrixT& b);
+  /// calculate the minimum value of each row of the matrix b.
+  void minRows(BaseMatrixT& b);
+
+  /// calculate the maximum value of each column of the matrix b.
+  void maxCols(BaseMatrixT& b);
+  /// calculate the minimum value of each column of the matrix b.
+  void minCols(BaseMatrixT& b);
+
+  /// calculate the sum of each column of the matrix b.
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
+  void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest);
+
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2
+  void sumOfSquaredDiffs(BaseMatrixT& b,
+                         BaseMatrixT& c,
+                         T scaleSum,
+                         T scaleDest);
+
+  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
+  void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c, T scaleSum, T scaleDest);
+
+  /**
+   * @code
+   * this_row = b_row + p * ones * c_row[cCol]
+   * @endcode
+   */
+  void rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p);
+  /**
+   * @code
+   * this_row = pow(b_row, c_row[cCol])
+   * @endcode
+   */
+  void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
+
+  virtual bool isSparse() const { return false; }
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    if (useGpu_) {
+      TensorGpuApply<T>(*this, expr);
+    } else {
+      TensorCpuApply<T>(*this, expr);
+    }
+  }
+
+  template <typename ExpressionType>
+  void operator+=(const ExpressionType& expr) {
+    (*this) = (*this) + expr;
+  }
+  template <typename ExpressionType>
+  void operator-=(const ExpressionType& expr) {
+    (*this) = (*this) - expr;
+  }
+  template <typename ExpressionType>
+  void operator*=(const ExpressionType& expr) {
+    (*this) = (*this) * expr;
+  }
+  template <typename ExpressionType>
+  void operator/=(const ExpressionType& expr) {
+    (*this) = (*this) / expr;
+  }
+};
+
+typedef BaseMatrixT<real> BaseMatrix;
+typedef BaseMatrixT<int> IBaseMatrix;
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/CMakeLists.txt b/paddle/legacy/math/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9992ec71f45b592e0a73e1cc9c655e773fa18e86
--- /dev/null
+++ b/paddle/legacy/math/CMakeLists.txt
@@ -0,0 +1,57 @@
+# common package contains:
+#   * the utilities:
+#       * Thread Libs
+#       * Memory Manage libs
+#       * CommandLine Parser
+#       * Logging
+#       * Timer/Stats
+#   * the math libraries:
+#       * Matrix/Vector
+#   * the parameter optimizers.
+#   * the parameter updater functions.
+#
+# TODO(yuyang18): separate libs.
+#
+file(GLOB MATH_HEADERS . *.h)
+file(GLOB MATH_SOURCES . *.cpp)
+
+if(NOT WITH_MKLDNN)
+    set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
+    set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
+    list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
+    list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
+    message(STATUS "Skip compiling with MKLDNNMatrix")
+else()
+    message(STATUS "Compile with MKLDNNMatrix")
+endif()
+
+if(MOBILE_INFERENCE)
+    # Remove sparse
+    list(REMOVE_ITEM MATH_HEADERS
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
+    list(REMOVE_ITEM MATH_SOURCES
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
+endif()
+set(MATH_SOURCES
+    "${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu"
+    "${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu"
+    ${MATH_SOURCES})
+if(NOT WITH_GPU)
+    # then compile BaseMatrix.cu as c++ file
+    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/BaseMatrix.cu")
+    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/legacy/math/TrainingAlgorithmOp.cu")
+    add_library(paddle_math STATIC
+        ${MATH_SOURCES})
+else()
+    cuda_add_library(paddle_math ${MATH_SOURCES})
+endif()
+
+
+add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
+if(WITH_TESTING)
+    add_subdirectory(tests)
+endif()
diff --git a/paddle/legacy/math/CpuSparseMatrix.cpp b/paddle/legacy/math/CpuSparseMatrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..20c65a3a1d7099a73d8b3c490cd42e721e60823b
--- /dev/null
+++ b/paddle/legacy/math/CpuSparseMatrix.cpp
@@ -0,0 +1,787 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CpuSparseMatrix.h"
+#include "SparseMatrix.h"
+#include "float.h"
+#include "hl_gpu.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH;
+
+CpuSparseMatrix::CpuSparseMatrix(size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
+                                 bool trans)
+    : Matrix(NULL, height, width, trans, false) {
+  resize(height, width, nnz, valueType, format);
+}
+
+CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle,
+                                 size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
+                                 bool trans)
+    : Matrix(dataHandle, height, width, trans, false) {
+  resize(height, width, nnz, valueType, format);
+}
+
+CpuSparseMatrix::CpuSparseMatrix(real* data,
+                                 int* rows,
+                                 int* cols,
+                                 size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
+                                 bool trans)
+    : Matrix(NULL, height, width, trans, false) {
+  cols_ = cols;
+  rows_ = rows;
+  value_ = data;
+  height_ = height;
+  width_ = width;
+  elementCnt_ = nnz;
+  valueType_ = valueType;
+  format_ = format;
+}
+
+void CpuSparseMatrix::resize(size_t newHeight,
+                             size_t newWidth,
+                             size_t newNnz,
+                             SparseValueType valueType,
+                             SparseFormat format) {
+  CHECK_LE(newNnz, newHeight * newWidth);
+  size_t newSize = 0;
+  if (format == SPARSE_CSR) {
+    newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int);
+  } else {
+    newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int);
+  }
+
+  if (NO_VALUE != valueType) {
+    newSize += newNnz * sizeof(real);
+  }
+
+  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
+    memoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
+  }
+
+  height_ = newHeight;
+  width_ = newWidth;
+  elementCnt_ = newNnz;
+  valueType_ = valueType;
+  format_ = format;
+  sparseResize();
+}
+void CpuSparseMatrix::sparseResize() {
+  if (format_ == SPARSE_CSR) {
+    rows_ = reinterpret_cast<int*>(
+        reinterpret_cast<char*>(memoryHandle_->getBuf()));
+    cols_ = reinterpret_cast<int*>(
+        reinterpret_cast<char*>(memoryHandle_->getBuf()) +
+        (height_ + 1) * sizeof(int));
+    if (NO_VALUE != valueType_) {
+      value_ = reinterpret_cast<real*>(
+          reinterpret_cast<char*>(memoryHandle_->getBuf()) +
+          (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
+    } else {
+      value_ = NULL;
+    }
+  } else {
+    cols_ = reinterpret_cast<int*>(
+        reinterpret_cast<char*>(memoryHandle_->getBuf()));
+    rows_ = reinterpret_cast<int*>(
+        reinterpret_cast<char*>(memoryHandle_->getBuf()) +
+        (width_ + 1) * sizeof(int));
+    if (NO_VALUE != valueType_) {
+      value_ = reinterpret_cast<real*>(
+          reinterpret_cast<char*>(memoryHandle_->getBuf()) +
+          (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
+    } else {
+      value_ = NULL;
+    }
+  }
+}
+
+void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) {
+  resize(newHeight,
+         newWidth,
+         newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth),
+         valueType_,
+         format_);
+}
+
+MatrixPtr CpuSparseMatrix::getTranspose() {
+  if (!memoryHandle_ && !value_) {
+    MatrixPtr dest(new CpuSparseMatrix(
+        height_, width_, elementCnt_, valueType_, format_, true));
+    return dest;
+  } else if (memoryHandle_) {
+    MatrixPtr dest(new CpuSparseMatrix(
+        std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle_),
+        height_,
+        width_,
+        elementCnt_,
+        valueType_,
+        format_,
+        true));
+    return dest;
+  } else if (value_) {
+    MatrixPtr dest(new CpuSparseMatrix(value_,
+                                       rows_,
+                                       cols_,
+                                       height_,
+                                       width_,
+                                       elementCnt_,
+                                       valueType_,
+                                       format_,
+                                       true));
+    return dest;
+  } else {
+    return NULL;
+  }
+}
+
+SparseValueType CpuSparseMatrix::getValueType() { return valueType_; }
+
+void CpuSparseMatrix::mul(const Matrix& a,
+                          const Matrix& b,
+                          real scaleAB,
+                          real scaleT) {
+  CHECK(!isTransposed()) << "Not supported";
+  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
+  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
+
+  if (a_ptr && b_ptr) {
+    CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT);
+  } else {
+    LOG(FATAL) << "not supported";
+  }
+}
+
+void CpuSparseMatrix::add3(CpuMatrix* b) {
+  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+  CHECK(height_ == b->getHeight());
+  CHECK(width_ == b->getWidth());
+  real* A = getValue();
+  real* B = b->getData();
+  int* cols = getCols();
+  for (size_t i = 0; i < height_; i++) {
+    size_t start = getRowStartIdx(i);
+    size_t end = getRowStartIdx(i + 1);
+    for (size_t j = start; j < end; j++) {
+      A[j] = B[i * width_ + cols[j]];
+    }
+  }
+}
+
+void CpuSparseMatrix::add3(MatrixPtr b) {
+  if (dynamic_cast<CpuMatrix*>(b.get())) {
+    add3(dynamic_cast<CpuMatrix*>(b.get()));
+  } else {
+    LOG(FATAL) << "not supported";
+  }
+}
+
+void CpuSparseMatrix::addBias(Matrix& b, real scale) {
+  CHECK_EQ(b.getHeight(), (size_t)1);
+  CHECK_EQ(width_, b.getWidth());
+  real* A = getValue();
+  real* B = b.getData();
+  int* cols = getCols();
+  size_t nnz = getElementCnt();
+  for (size_t i = 0; i < nnz; i++) {
+    A[i] += scale * B[cols[i]];
+  }
+}
+
+template <class T>
+void printBuf(std::ostream& os, T* a, size_t len, const char* name) {
+  os << "\n: " << name << " [";
+  for (size_t i = 0; i < len; i++) {
+    os << a[i] << " ";
+  }
+  os << "]\n";
+}
+
+void CpuSparseMatrix::print(std::ostream& os) const {
+  size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1;
+  size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_;
+  printBuf(os, rows_, rowSize, "row");
+  printBuf(os, cols_, colSize, "col");
+  if (valueType_ == FLOAT_VALUE) {
+    printBuf(os, value_, elementCnt_, "value");
+  }
+  return;
+}
+
+void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
+  CHECK_LT(idx, height_);
+  if (format_ == SPARSE_CSC) {
+    LOG(FATAL) << "SPARSE_CSC not supported";
+    return;
+  }
+
+  const int* col = getRowCols(idx);
+  size_t num = getColNum(idx);
+  if (num > 0) {
+    if (valueType_ == FLOAT_VALUE) {
+      const real* data = getRowValues(idx);
+      os << col[0] << ":" << data[0];
+      for (size_t i = 1; i < num; ++i) {
+        os << " " << col[i] << ":" << data[i];
+      }
+    } else {
+      os << col[0];
+      for (size_t i = 1; i < num; ++i) {
+        os << " " << col[i];
+      }
+    }
+  }
+  os << ";";
+}
+
+void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
+  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
+  real* A = getValue();
+  real* B = b.getValue();
+  if (b.getValueType() == FLOAT_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = B[j] * c.getElement(i, cCol);
+      }
+    }
+  } else if (b.getValueType() == NO_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = c.getElement(i, cCol);
+      }
+    }
+  }
+}
+
+void CpuSparseMatrix::randomizeUniform() {
+  CHECK_LE(elementCnt_, height_ * width_);
+  if (valueType_ == FLOAT_VALUE) {
+    real* data = getValue();
+    for (size_t i = 0; i < elementCnt_; ++i) {
+      *data++ = rand() / static_cast<real>(RAND_MAX);  // NOLINT
+    }
+  }
+  if (format_ == SPARSE_CSR) {
+    sparseRand(rows_, cols_, elementCnt_, height_ + 1, width_, false);
+  } else {
+    sparseRand(cols_, rows_, elementCnt_, width_ + 1, height_, false);
+  }
+}
+
+void CpuSparseMatrix::copyFrom(std::vector<int>& rows,
+                               std::vector<int>& cols,
+                               std::vector<real>& values) {
+  size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size();
+  resize(height_, width_, size, valueType_, format_);
+  if (valueType_ == FLOAT_VALUE) {
+    memcpy(&value_[0], &values[0], sizeof(real) * values.size());
+  }
+  memcpy(&cols_[0], &cols[0], sizeof(int) * cols.size());
+  memcpy(&rows_[0], &rows[0], sizeof(int) * rows.size());
+}
+
+// Copy from a CpuMatrix, only supported in sparse_float_value_t
+// SparseMatrix.
+void CpuSparseMatrix::copyFrom(const CpuMatrix& src) {
+  CHECK_EQ(getHeight(), src.getHeight());
+  CHECK_EQ(getWidth(), src.getWidth());
+  CHECK(!src.trans_ && !trans_);
+  if (format_ == SPARSE_CSR) {
+    std::vector<int> rows(getHeight() + 1);
+    std::vector<int> cols;
+    std::vector<real> values;
+    rows[0] = 0;
+    for (size_t r = 0; r < getHeight(); ++r) {
+      for (size_t c = 0; c < getWidth(); ++c) {
+        real v = src.getElement(r, c);
+        if (fabs(v) > FLT_EPSILON) {
+          cols.push_back(c);
+          values.push_back(v);
+        }
+      }
+      rows[r + 1] = values.size();
+    }
+    copyFrom(rows, cols, values);
+  } else {
+    std::vector<int> cols(getWidth() + 1);
+    std::vector<int> rows;
+    std::vector<real> values;
+    cols[0] = 0;
+    for (size_t r = 0; r < getWidth(); ++r) {
+      for (size_t c = 0; c < getHeight(); ++c) {
+        real v = src.getElement(c, r);
+        if (fabs(v) > FLT_EPSILON) {
+          rows.push_back(c);
+          values.push_back(v);
+        }
+      }
+      cols[r + 1] = values.size();
+    }
+    copyFrom(rows, cols, values);
+  }
+}
+
+MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) {
+  if (height == 0 && width == 0) {
+    height = height_;
+    width = width_;
+  }
+  CHECK(width && height);
+  if (!useGpu) {
+    return std::make_shared<CpuSparseMatrix>(
+        height, width, 0, valueType_, format_);
+  } else {
+    return std::make_shared<GpuSparseMatrix>(
+        height, width, elementCnt_, valueType_, format_);
+  }
+}
+
+MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) {
+  CHECK_LE(startRow + numRows, height_);
+  CHECK_EQ(format_, SPARSE_CSR);
+  if (valueType_ == NO_VALUE) {
+    return std::make_shared<CpuSparseMatrix>(
+        nullptr,
+        rows_ + startRow,
+        cols_,
+        numRows,
+        width_,
+        rows_[startRow + numRows] - rows_[startRow],
+        valueType_,
+        format_,
+        trans_);
+  } else {
+    return std::make_shared<CpuSparseMatrix>(
+        value_,
+        rows_ + startRow,
+        cols_,
+        numRows,
+        width_,
+        rows_[startRow + numRows] - rows_[startRow],
+        valueType_,
+        format_,
+        trans_);
+  }
+}
+
+/* mem MUST be alloced outside (memAlloc=false) */
+void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
+  CHECK(!memAlloc);
+  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(matTrans.get());
+  if (format_ == SPARSE_CSR) {
+    /*statistic element number in each col*/
+    int* colCounters = mat->getRows() + 1;
+    memset(colCounters, 0, sizeof(int) * width_);
+    for (size_t i = 0; i < elementCnt_; ++i) {
+      int col = cols_[i];
+      colCounters[col]++;
+    }
+    /*fill mat rows */
+    mat->getRows()[0] = 0;
+    for (size_t i = 1; i < width_ + 1; i++) {
+      mat->getRows()[i] = mat->getRows()[i - 1] + mat->getRows()[i];
+    }
+    /*fill mat values and cols*/
+    std::vector<int> colNumVec(width_, 0);
+    if (valueType_ == FLOAT_VALUE) {
+      for (size_t i = 0; i < height_; i++) {
+        for (int j = rows_[i]; j < rows_[i + 1]; j++) {
+          int colIdx = cols_[j];
+          int index = mat->getRows()[colIdx] + colNumVec[colIdx];
+          mat->getCols()[index] = i;
+          mat->getValue()[index] = value_[j];
+          colNumVec[colIdx]++;
+        }
+      }
+    } else {
+      for (size_t i = 0; i < height_; i++) {
+        for (int j = rows_[i]; j < rows_[i + 1]; j++) {
+          int colIdx = cols_[j];
+          int index = mat->getRows()[colIdx] + colNumVec[colIdx];
+          mat->getCols()[index] = i;
+          colNumVec[colIdx]++;
+        }
+      }
+    }
+  } else {
+    /*statistic element number in each row*/
+    int* rowCounters = mat->getCols() + 1;
+    memset(rowCounters, 0, sizeof(int) * height_);
+    for (size_t i = 0; i < elementCnt_; ++i) {
+      int row = rows_[i];
+      rowCounters[row]++;
+    }
+
+    /*fill mat cols */
+    mat->getCols()[0] = 0;
+    for (size_t i = 1; i < height_ + 1; i++) {
+      mat->getCols()[i] = mat->getCols()[i - 1] + mat->getCols()[i];
+    }
+    /*fill mat values and rows*/
+    std::vector<int> rowNumVec(height_, 0);
+    if (valueType_ == FLOAT_VALUE) {
+      for (size_t i = 0; i < width_; i++) {
+        for (int j = cols_[i]; j < cols_[i + 1]; j++) {
+          int rowIdx = rows_[j];
+          int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx];
+          mat->getRows()[index] = i;
+          mat->getValue()[index] = value_[j];
+          rowNumVec[rowIdx]++;
+        }
+      }
+    } else {
+      for (size_t i = 0; i < width_; i++) {
+        for (int j = cols_[i]; j < cols_[i + 1]; j++) {
+          int rowIdx = rows_[j];
+          int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx];
+          mat->getRows()[index] = i;
+          rowNumVec[rowIdx]++;
+        }
+      }
+    }
+  }
+}
+
+void CpuSparseMatrix::setRow(size_t row,
+                             size_t colNum,
+                             const unsigned int* cols,
+                             const real* values) {
+  if (format_ == SPARSE_CSR) {
+    CHECK_LT(row, height_);
+    CHECK(NULL != cols);
+    if (0 == row) {
+      rows_[row] = 0;
+    }
+    rows_[row + 1] = rows_[row] + colNum;
+    for (size_t i = 0; i < colNum; ++i) {
+      cols_[rows_[row] + i] = cols[i];
+    }
+    if (valueType_ == NO_VALUE) {
+      CHECK(!values);
+    } else {
+      for (size_t i = 0; i < colNum; ++i) {
+        value_[rows_[row] + i] = values[i];
+      }
+    }
+  } else {
+    LOG(FATAL) << "not supported";
+  }
+}
+
+void CpuSparseMatrix::fillRowIndices(IVectorPtr& outVec) const {
+  if (format_ == SPARSE_CSR) {
+    auto nnz = getElementCnt();
+    IVector::resizeOrCreate(outVec, nnz, false);
+    auto out = outVec->getData();
+    int* rows = getRows();
+    for (size_t i = 0; i < height_; i++) {
+      for (int j = rows[i]; j < rows[i + 1]; j++) {
+        out[j] = i;
+      }
+    }
+  } else {
+    LOG(FATAL) << "SPARSE_CSC not supported";
+  }
+}
+
+ThreadLocal<std::vector<CpuSparseMatrixPtr>> CpuSparseMatrix::cpuLocalMats_;
+
+CpuSparseMatrixPtr CpuSparseMatrix::getTmpSparseMatrix(size_t height,
+                                                       size_t width) {
+  std::vector<CpuSparseMatrixPtr>* localMats = cpuLocalMats_.get();
+  auto it = localMats->begin();
+  while (it != localMats->end()) {
+    if (it->unique()) {
+      (*it)->resize(height, width, elementCnt_, valueType_, format_);
+      return *it;
+    }
+  }
+  localMats->emplace_back(std::make_shared<CpuSparseMatrix>(
+      height, width, elementCnt_, valueType_, format_, false));
+  return localMats->back();
+}
+
+void CpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
+  if (dynamic_cast<const GpuSparseMatrix*>(&src)) {
+    auto tmpSrc = dynamic_cast<const GpuSparseMatrix*>(&src);
+    copyFrom(*tmpSrc, stream);
+  } else if (dynamic_cast<const CpuSparseMatrix*>(&src)) {
+    auto tmpSrc = dynamic_cast<const CpuSparseMatrix*>(&src);
+    copyFrom(*tmpSrc);
+  } else if (dynamic_cast<const CpuMatrix*>(&src)) {
+    auto tmpSrc = dynamic_cast<const CpuMatrix*>(&src);
+    copyFrom(*tmpSrc);
+  } else {
+    LOG(FATAL) << "not implemented";
+  }
+}
+
+void CpuSparseMatrix::copyFrom(const Matrix& src) {
+  if (dynamic_cast<const CpuSparseMatrix*>(&src)) {
+    auto tmpSrc = dynamic_cast<const CpuSparseMatrix*>(&src);
+    copyFrom(*tmpSrc);
+  } else if (dynamic_cast<const CpuMatrix*>(&src)) {
+    auto tmpSrc = dynamic_cast<const CpuMatrix*>(&src);
+    copyFrom(*tmpSrc);
+  } else {
+    LOG(FATAL) << "not implemented";
+  }
+}
+
+void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) {
+  CHECK_EQ(height_, src.getHeight());
+  CHECK_EQ(width_, src.getWidth());
+  CHECK_EQ(size_t(elementCnt_), src.getElementCnt());
+  size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_;
+  if (format_ == SPARSE_CSC)
+    hl_memcpy_from_csc_matrix(value_,
+                              valSize,
+                              rows_,
+                              elementCnt_,
+                              cols_,
+                              width_ + 1,
+                              src.sMatrix_.get(),
+                              stream);
+  else
+    hl_memcpy_from_csr_matrix(value_,
+                              valSize,
+                              rows_,
+                              height_ + 1,
+                              cols_,
+                              elementCnt_,
+                              src.sMatrix_.get(),
+                              stream);
+}
+
+void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) {
+  CHECK_EQ(height_, src.getHeight());
+  CHECK_EQ(width_, src.getWidth());
+  CHECK_EQ(format_, src.getFormat());
+  int start = format_ == SPARSE_CSR ? src.getRows()[0] : src.getCols()[0];
+  if (format_ == SPARSE_CSR) {
+    size_t totalColNum = 0;
+    for (size_t i = 0; i < height_; ++i) {
+      totalColNum += src.getColNum(i);
+    }
+    resize(height_, width_, totalColNum, valueType_, format_);
+    rows_[0] = 0;
+    for (size_t i = 0; i < height_; ++i) {
+      rows_[i + 1] = rows_[i] + src.getColNum(i);
+    }
+    memcpy(cols_, src.getCols() + start, totalColNum * sizeof(int));
+  } else {
+    size_t totalColNum = 0;
+    for (size_t i = 0; i < width_; ++i) {
+      totalColNum += src.getRowNum(i);
+    }
+    resize(height_, width_, totalColNum, valueType_, format_);
+    cols_[0] = 0;
+    for (size_t i = 0; i < width_; ++i) {
+      cols_[i + 1] = cols_[i] + src.getRowNum(i);
+    }
+    memcpy(rows_, src.getRows() + start, totalColNum * sizeof(int));
+  }
+
+  // if have different value type, only copy rows and cols
+  if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) {
+    memcpy(value_, src.getValue() + start, elementCnt_ * sizeof(real));
+  }
+}
+
+void CpuSparseMatrix::copyRow(int offsets,
+                              size_t colNum,
+                              const sparse_non_value_t* row) {
+  for (size_t j = 0; j < colNum; j++) {
+    cols_[offsets + j] = row[j].col;
+  }
+}
+
+void CpuSparseMatrix::copyRow(int offsets,
+                              size_t colNum,
+                              const sparse_float_value_t* row) {
+  for (size_t j = 0; j < colNum; j++) {
+    cols_[offsets + j] = row[j].col;
+    value_[offsets + j] = row[j].value;
+  }
+}
+
+template <class T>
+void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data) {
+  size_t totalColNum = 0;
+  for (size_t i = 0; i < height_; ++i) {
+    int64_t id = ids[i];
+    totalColNum += indices[id + 1] - indices[id];
+  }
+  valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE;
+
+  resize(height_, width_, totalColNum, valueType_, format_);
+
+  rows_[0] = 0;
+  for (size_t i = 0; i < height_; ++i) {
+    int64_t id = ids[i];
+    T* row = data + indices[id];
+    size_t colNum = indices[id + 1] - indices[id];
+    rows_[i + 1] = rows_[i] + colNum;
+    copyRow(rows_[i], colNum, row);
+  }
+}
+
+template <class T>
+void CpuSparseMatrix::copyFrom(int64_t* indices, T* data) {
+  CHECK(format_ == SPARSE_CSR);
+  size_t totalColNum = indices[height_] - indices[0];
+  valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE;
+  resize(height_, width_, totalColNum, valueType_, format_);
+
+  rows_[0] = 0;
+  for (size_t i = 0; i < height_; ++i) {
+    T* row = data + indices[i];
+    size_t colNum = indices[i + 1] - indices[i];
+    rows_[i + 1] = rows_[i] + colNum;
+    copyRow(rows_[i], colNum, row);
+  }
+}
+
+void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
+  CHECK_EQ(height_, src.getHeight());
+  CHECK_LE(width_, src.getWidth());
+  CHECK_EQ(format_, src.getFormat());
+  CHECK_EQ(valueType_, src.getValueType());
+  if (format_ == SPARSE_CSR) {
+    int* srcCols = src.getCols();
+    size_t numLessWidth =
+        std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) {
+          return n < this->width_;
+        });
+    resize(height_, width_, numLessWidth, valueType_, format_);
+    rows_[0] = 0;
+    size_t index = 0;
+    for (size_t r = 0; r < height_; ++r) {
+      for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) {
+        if (srcCols[i] < static_cast<int>(width_)) {
+          cols_[index] = srcCols[i];
+          if (valueType_ == FLOAT_VALUE) {
+            value_[index] = src.getValue()[i];
+          }
+          ++index;
+        }
+      }
+      rows_[r + 1] = index;
+    }
+    CHECK_EQ(index, numLessWidth);
+  } else {
+    size_t numLessWidth = src.getCols()[width_] - src.getCols()[0];
+    resize(height_, width_, numLessWidth, valueType_, format_);
+    cols_[0] = 0;
+    size_t index = 0;
+    // note: c < width_, not src.getWidth();
+    for (size_t c = 0; c < width_; ++c) {
+      for (int i = src.getCols()[c]; i < src.getCols()[c + 1]; ++i) {
+        rows_[index] = src.getRows()[i];
+        if (valueType_ == FLOAT_VALUE) {
+          value_[index] = src.getValue()[i];
+        }
+        ++index;
+      }
+      cols_[c + 1] = index;
+    }
+    CHECK_EQ(index, numLessWidth);
+  }
+}
+
+void CpuSparseMatrix::zeroMem() {
+  CHECK(valueType_ == FLOAT_VALUE);
+  memset(value_, 0, elementCnt_ * sizeof(real));
+}
+
+template void CpuSparseMatrix::copyFrom(int64_t* ids,
+                                        int64_t* indices,
+                                        sparse_non_value_t* data);
+
+template void CpuSparseMatrix::copyFrom(int64_t* ids,
+                                        int64_t* indices,
+                                        sparse_float_value_t* data);
+
+template void CpuSparseMatrix::copyFrom(int64_t* indices,
+                                        sparse_non_value_t* data);
+
+template void CpuSparseMatrix::copyFrom(int64_t* indices,
+                                        sparse_float_value_t* data);
+
+void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
+  size_t numSamples = getHeight();
+  size_t beam = maxVal.getWidth();
+  CHECK_EQ(maxIds.getSize(), numSamples * beam);
+  CHECK_EQ(maxVal.getHeight(), numSamples);
+  maxVal.zeroMem();
+  int* outids = maxIds.getData();
+  real* outvalues = maxVal.getData();
+
+  typedef std::pair<real, size_t> valuepair;
+  std::vector<valuepair> vec;
+  for (size_t i = 0; i < numSamples; i++) {
+    vec.clear();
+
+    auto num = getColNum(i);
+    auto ids = getRowCols(i);
+    auto values = getRowValues(i);
+    for (size_t j = 0; j < num; j++) {
+      vec.push_back(std::make_pair(values[j], ids[j]));
+    }
+
+    size_t outsize = std::min(num, beam);
+    std::partial_sort(vec.begin(),
+                      vec.begin() + outsize,
+                      vec.end(),
+                      [](const valuepair& a, const valuepair& b) {
+                        return a.first > b.first;
+                      });
+    for (size_t j = 0; j < outsize; j++) {
+      outids[i * beam + j] = vec[j].second;
+      outvalues[i * beam + j] = vec[j].first;
+    }
+    if (outsize < beam) {
+      // if the number of values to sort are less than the output size,
+      // use -1 to indicate the end of valid sorted values.
+      outids[i * beam + outsize] = -1;
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/CpuSparseMatrix.h b/paddle/legacy/math/CpuSparseMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..172792c2950ce56281715cb7f3eb076da252d77e
--- /dev/null
+++ b/paddle/legacy/math/CpuSparseMatrix.h
@@ -0,0 +1,377 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
+#include <cstddef>
+#include "Matrix.h"
+
+namespace paddle {
+
+class CpuSparseMatrix : public Matrix {
+ public:
+  CpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format = SPARSE_CSR,
+                  bool trans = false);
+
+  CpuSparseMatrix(CpuMemHandlePtr memHandle,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans);
+
+  CpuSparseMatrix(real* data,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans);
+
+  ~CpuSparseMatrix() {}
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format);
+  void resize(size_t newHeight, size_t newWidth);
+
+  MatrixPtr getTranspose();
+
+  SparseValueType getValueType();
+
+  real* getRowValues(size_t i) const {
+    if (format_ == SPARSE_CSR) {
+      return value_ + rows_[i];
+    } else {
+      LOG(FATAL) << "SPARSE_CSC not supported";
+      return 0;
+    }
+  }
+
+  int* getRowCols(size_t i) const {
+    if (format_ == SPARSE_CSR) {
+      return cols_ + rows_[i];
+    } else {
+      LOG(FATAL) << "SPARSE_CSC not supported";
+      return 0;
+    }
+  }
+
+  /// fill row indices of each value in CSR matrix
+  void fillRowIndices(IVectorPtr& outVec) const;
+
+  size_t getColNum(size_t i) const {
+    if (format_ == SPARSE_CSR) {
+      return rows_[i + 1] - rows_[i];
+    } else {
+      LOG(FATAL) << "SPARSE_CSC not supported";
+      return 0;
+    }
+  }
+
+  real* getColumn(size_t i) const {
+    if (format_ == SPARSE_CSC) {
+      return value_ + cols_[i];
+    } else {
+      LOG(FATAL) << "SPARSE_CSR not supported";
+      return 0;
+    }
+  }
+
+  size_t getColStartIdx(size_t i) const {
+    if (format_ == SPARSE_CSC) {
+      return cols_[i];
+    } else {
+      LOG(FATAL) << "SPARSE_CSR not supported";
+      return 0;
+    }
+  }
+
+  size_t getRowStartIdx(size_t i) const {
+    if (format_ == SPARSE_CSR) {
+      return rows_[i];
+    } else {
+      LOG(FATAL) << "SPARSE_CSC not supported";
+      return 0;
+    }
+  }
+
+  size_t getRowNum(size_t i) const {
+    if (format_ == SPARSE_CSC) {
+      return cols_[i + 1] - cols_[i];
+    } else {
+      LOG(FATAL) << "SPARSE_CSR not supported";
+      return 0;
+    }
+  }
+
+  virtual real getSum() {
+    CHECK(isContiguous());
+    if (valueType_ == NO_VALUE) {
+      return elementCnt_;
+    }
+    double sum = 0;
+    for (size_t i = 0; i < elementCnt_; ++i) {
+      sum += value_[i];
+    }
+    return sum;
+  }
+
+  virtual void square2() {
+    CHECK(isContiguous());
+    if (valueType_ == NO_VALUE) {
+      return;
+    }
+    for (size_t i = 0; i < elementCnt_; ++i) {
+      value_[i] = value_[i] * value_[i];
+    }
+  }
+
+  /**
+   * only consider nonzero values.
+   * the actual min value should compare with 0.0.
+   */
+  virtual real getMin() {
+    CHECK(isContiguous());
+    if (valueType_ == NO_VALUE) {
+      return (elementCnt_ > 0 ? 1.0 : 0.0);
+    }
+    real min = value_[0];
+    for (size_t i = 1; i < elementCnt_; ++i) {
+      min = value_[i] < min ? value_[i] : min;
+    }
+    return min;
+  }
+
+  /**
+   * only consider nonzero values.
+   * the actual max value should compare with 0.0.
+   */
+  virtual real getMax() {
+    CHECK(isContiguous());
+    if (valueType_ == NO_VALUE) {
+      return (elementCnt_ > 0 ? 1.0 : 0.0);
+    }
+    real max = value_[0];
+    for (size_t i = 1; i < elementCnt_; ++i) {
+      max = value_[i] > max ? value_[i] : max;
+    }
+    return max;
+  }
+
+  void rowMax(IVector& maxIds, Matrix& maxVal);
+  int* getRows() const { return rows_; }
+  int* getCols() const { return cols_; }
+  real* getValue() const { return value_; }
+  SparseFormat getFormat() const { return format_; }
+  SparseValueType getValueType() const { return valueType_; }
+
+  /**
+   * @brief return value_ of sparse matrix
+   *
+   * Some times CpuSparseMatrix maybe Matrix,
+   * if getValue, must dynamic_cast to CpuSparseMatrix,
+   * getData is convenient to get value
+   */
+  real* getData() { return getValue(); }
+  const real* getData() const { return getValue(); }
+
+  /**
+   * @brief only set value_ of FLOAT_VALUE sparse matrix to zero
+   */
+  void zeroMem();
+
+  /// mem MUST be alloced outside (memAlloc=false)
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+
+  void mul(const Matrix& A, const Matrix& B, real alpha, real beta);
+
+  /**
+   * @brief sparseMatrix += denseMatrix
+   *
+   *  Named add3 just because add/add2 has been used in BaseMatrix.cu
+   *  and they are not virtual function.
+   *
+   *  Only add value of same (row, col) index in dense matrix
+   *  and do not use others values whoes postions are not in sparse matirx.
+   *
+   * @param[in]  b   dense matrix
+   */
+  void add3(CpuMatrix* b);
+  void add3(MatrixPtr b);
+
+  /**
+   * @brief sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)
+   *
+   * @param[in]  b      bias, dense matrix and height = 1
+   * @param[in]  scale  scale of b
+   */
+  void addBias(Matrix& b, real scale);
+
+  void print(std::ostream& os) const;
+
+  void printOneRow(std::ostream& os, size_t idx) const;
+
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values);
+
+  /**
+   * @brief this_row = b_row * c_row[cCol]
+   *
+   * @param[in]  cCol   the column of matrix c used to scale each row of b
+   * @param[in]  b      CpuSparseMatrix
+   * @param[in]  c      Matrix
+   */
+  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
+
+  void randomizeUniform();
+
+  void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
+
+  void copyFrom(const Matrix& src, hl_stream_t stream = HPPL_STREAM_DEFAULT);
+
+  void copyFrom(const Matrix& src);
+
+  /**
+   * Get a temporary matrix. This is threadsafe. It should be only used
+   * temporarily, i.e. do not store it or use it as return value.
+   *
+   * @note  Do NOT use large amount of tmp matrix.
+   */
+  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width);
+
+  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows);
+
+  void copyFrom(std::vector<int>& rows,
+                std::vector<int>& cols,
+                std::vector<real>& values);
+
+  void copyFrom(const CpuMatrix& src);
+
+  void copyFrom(const CpuSparseMatrix& src);
+
+  // trim the large size
+  void trimFrom(const CpuSparseMatrix& src);
+
+  void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
+
+  void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
+
+  template <class T>
+  void copyFrom(int64_t* ids, int64_t* indices, T* data);
+
+  template <class T>
+  void copyFrom(int64_t* indices, T* data);
+
+  void copyFrom(const real* data, size_t len) {
+    LOG(FATAL) << "not supported!";
+  }
+
+ private:
+  MatrixPtr clone(size_t height = 0, size_t width = 0, bool useGpu = false);
+
+ protected:
+  void sparseResize();
+  /*for csr , record row start position, for csc, record row index for every no
+   * zero value*/
+  int* rows_;
+  /*for csc , record col start position, for csr, record col index for every no
+   * zero value*/
+  int* cols_;
+  real* value_;               /*nonzero value*/
+  SparseFormat format_;       /* matrix format */
+  SparseValueType valueType_; /*with value or not  */
+  static const size_t DEFAULT_AVG_WIDTH = 20;
+
+  static ThreadLocal<std::vector<CpuSparseMatrixPtr>> cpuLocalMats_;
+
+  // BaseMatrixT interface
+ public:
+  bool isSparse() const { return true; }
+
+ private:
+  using Matrix::mul;
+  using Matrix::copyFrom;
+  using Matrix::rowMax;
+  using Matrix::print;
+  using Matrix::subMatrix;
+};
+}  // namespace paddle
+
+#else
+
+#include "Matrix.h"
+
+namespace paddle {
+
+class CpuSparseMatrix : public Matrix {
+ public:
+  CpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  CpuSparseMatrix(real* data,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  real* getValue() const { return nullptr; }
+  size_t getColStartIdx(size_t i) const { return 0; }
+  size_t getRowStartIdx(size_t i) const { return 0; }
+  size_t getColNum(size_t i) const { return 0; }
+  int* getRowCols(size_t i) const { return nullptr; }
+
+  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
+    return nullptr;
+  }
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
+};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/legacy/math/ExecViaCpu.h b/paddle/legacy/math/ExecViaCpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec2337545e9e3efdf31d3d786a096a67283715f2
--- /dev/null
+++ b/paddle/legacy/math/ExecViaCpu.h
@@ -0,0 +1,195 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ execViaCpu is used to do operations on GpuMatirx and/or GpuIVector through
+ cpu functions. It can automatically make a temporary CPU copy for the
+ gpu matrix/vector, and copy back after executing the CPU function.
+
+ Examples:
+ 1. For a function, functor or lambda:
+   r = execViaCpu(&f, mat, vec)
+
+ 2. For member function of CpuMatirx, execViaCpu2 should be used:
+   execViaCpu2(&CpuMatrix::selectElements, *this, table, ids)
+*/
+
+#pragma once
+
+namespace paddle {
+
+template <typename Arg>
+class CopyToCpu {
+ public:
+  explicit CopyToCpu(Arg& arg) : arg_(arg) {}
+  Arg& copiedArg() const { return arg_; }
+
+ private:
+  Arg& arg_;
+};
+
+template <>
+class CopyToCpu<Matrix> {
+ public:
+  explicit CopyToCpu(Matrix& arg) : arg_(arg) {
+    if (arg.useGpu()) {
+      CHECK(!arg.isTransposed()) << "Not supported";
+      copied_ = Matrix::create(arg.getHeight(),
+                               arg.getWidth(),
+                               /* trans= */ false,
+                               /* useGpu= */ false);
+      copied_->copyFrom(arg);
+    }
+  }
+  ~CopyToCpu() {
+    if (copied_) {
+      arg_.copyFrom(*copied_);
+    }
+  }
+  Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
+
+ private:
+  Matrix& arg_;
+  MatrixPtr copied_;
+};
+
+template <>
+class CopyToCpu<const Matrix> {
+ public:
+  explicit CopyToCpu(const Matrix& arg) : arg_(arg) {
+    if (arg.useGpu()) {
+      CHECK(!arg.isTransposed()) << "Not supported";
+      copied_ = Matrix::create(arg.getHeight(),
+                               arg.getWidth(),
+                               /* trans= */ false,
+                               /* useGpu= */ false);
+      copied_->copyFrom(arg);
+    }
+  }
+  const Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
+
+ private:
+  const Matrix& arg_;
+  MatrixPtr copied_;
+};
+
+template <>
+class CopyToCpu<IVector> {
+ public:
+  explicit CopyToCpu(IVector& arg) : arg_(arg) {
+    if (arg.useGpu()) {
+      copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
+      copied_->copyFrom(arg);
+    }
+  }
+  ~CopyToCpu() {
+    if (copied_) {
+      arg_.copyFrom(*copied_);
+    }
+  }
+  IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
+
+ private:
+  IVector& arg_;
+  IVectorPtr copied_;
+};
+
+template <>
+class CopyToCpu<const IVector> {
+ public:
+  explicit CopyToCpu(const IVector& arg) : arg_(arg) {
+    if (arg.useGpu()) {
+      copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
+      copied_->copyFrom(arg);
+    }
+  }
+  const IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
+
+ private:
+  const IVector& arg_;
+  IVectorPtr copied_;
+};
+
+namespace detail {
+
+template <bool isFunction, bool isFunctionPointer, bool isClass, typename F>
+class GpuFuncWrapperImp;
+
+template <typename F, typename R, typename... Args>
+class GpuFuncWrapperBase {
+ public:
+  typedef R ResultType;
+  R operator()(F&& f, Args... args) {
+    return f(CopyToCpu<typename std::remove_reference<Args>::type>(args)
+                 .copiedArg()...);
+  }
+};
+
+// function
+template <typename R, typename... Args>
+class GpuFuncWrapperImp<true, false, false, R(Args...)>
+    : public GpuFuncWrapperBase<R(Args...), R, Args...> {};
+
+// function pointer
+template <typename R, typename... Args>
+class GpuFuncWrapperImp<false, true, false, R (*)(Args...)>
+    : public GpuFuncWrapperBase<R (*)(Args...), R, Args...> {};
+
+template <typename F, typename Op>
+class GpuFuncWrapperImp2;
+
+template <typename F, typename C, typename R, typename... Args>
+class GpuFuncWrapperImp2<F, R (C::*)(Args...) const>
+    : public GpuFuncWrapperBase<F, R, Args...> {};
+
+template <typename F, typename C, typename R, typename... Args>
+class GpuFuncWrapperImp2<F, R (C::*)(Args...)>
+    : public GpuFuncWrapperBase<F, R, Args...> {};
+
+// functor or lambda
+template <typename F>
+class GpuFuncWrapperImp<false, false, true, F>
+    : public GpuFuncWrapperImp2<F, decltype(&F::operator())> {};
+
+template <typename F>
+class GpuFuncWrapper2
+    : public GpuFuncWrapperImp<
+          std::is_function<F>::value,
+          std::is_pointer<F>::value &&
+              std::is_function<typename std::remove_pointer<F>::type>::value,
+          std::is_class<F>::value,
+          F> {};
+
+template <typename F>
+class GpuFuncWrapper
+    : public GpuFuncWrapper2<typename std::remove_reference<F>::type> {};
+
+}  // namespace detail
+
+template <typename F, typename... Args>
+typename detail::GpuFuncWrapper<F>::ResultType execViaCpu(F&& f,
+                                                          Args&&... args) {
+  return detail::GpuFuncWrapper<F>()(std::move(f), args...);
+}
+
+// The second version is for F as member function of CpuMatrix
+template <typename R, typename... FArgs, typename... Args>
+R execViaCpu2(R (CpuMatrix::*f)(FArgs...), Args&&... args) {
+  auto lambda = [](R (CpuMatrix::*f)(FArgs...), Matrix& ths, FArgs... args) {
+    return (((CpuMatrix&)ths).*f)(args...);
+  };
+  return execViaCpu(lambda, f, args...);
+}
+
+}  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/legacy/math/MKLDNNMatrix.cpp
similarity index 100%
rename from paddle/math/MKLDNNMatrix.cpp
rename to paddle/legacy/math/MKLDNNMatrix.cpp
diff --git a/paddle/legacy/math/MKLDNNMatrix.h b/paddle/legacy/math/MKLDNNMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..5a0e5f85923dfd822dad4c63679acde63719f217
--- /dev/null
+++ b/paddle/legacy/math/MKLDNNMatrix.h
@@ -0,0 +1,256 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Matrix.h"
+#include "mkldnn.hpp"
+#include "paddle/legacy/parameter/Parameter.h"
+
+namespace paddle {
+
+class MKLDNNMatrix;
+typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
+
+#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...)                        \
+  CHECK(MAT) << " can not be empty.";                                \
+  CHECK(MAT->getPrimitiveDesc() == PD)                               \
+      << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \
+      << "" __VA_ARGS__;
+
+/**
+ * @brief MKLDNN Matrix.
+ *
+ */
+class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
+ public:
+  MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
+      : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
+        mkldnn::memory(pd, m->getData()),
+        m_(m) {}
+
+  ~MKLDNNMatrix() {}
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
+   */
+  static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
+                                MatrixPtr m = nullptr);
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory details info
+   */
+  static MKLDNNMatrixPtr create(
+      mkldnn::memory::dims dims,
+      mkldnn::memory::format fmt,
+      mkldnn::engine& eg,
+      MatrixPtr m = nullptr,
+      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
+
+  /**
+   * Create primitive descriptor.
+   * default with f32 dtype
+   */
+  static mkldnn::memory::primitive_desc createPrimitiveDesc(
+      const mkldnn::memory::dims dims,
+      const mkldnn::memory::format& fmt,
+      const mkldnn::engine& eg,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
+  }
+
+  /**
+   * Create Memory descriptor.
+   * default with any format and f32 dtype
+   */
+  static mkldnn::memory::desc createMemoryDesc(
+      const mkldnn::memory::dims dims,
+      const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    return mkldnn::memory::desc(dims, dtype, fmt);
+  }
+
+  /**
+   * Create reorder primitive.
+   * Create a mkldnn::reorder handle for converting src MKLDNNMatrix to dst.
+   * checkData: whether to check the data handle of src and dst.
+   *            if true, it will check the data and do not allow them equal;
+   *            otherwise, it will not check them, then the reorder created
+   *            may have inplace buffer.
+   *            Do not set false, if you can not guarantee the inplace logical
+   *            would work with your reorder.
+   */
+  static std::shared_ptr<mkldnn::reorder> createReorder(
+      const MKLDNNMatrixPtr& src,
+      const MKLDNNMatrixPtr& dst,
+      bool checkData = true);
+
+  void copyFrom(const Matrix& src) {
+    // TODO(TJ): reorder data if this format is not nchw or x
+    m_->copyFrom(src);
+  }
+
+  void copyTo(Matrix& dst) {
+    // TODO(TJ): reorder data if this format is not nchw or x
+    dst.copyFrom(*m_);
+  }
+
+ public:
+  /**
+   * Reorder this MKLDNNMatrix from other format.
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change this original dim or format info
+   */
+  void reorderDataFrom(const MKLDNNMatrixPtr& m,
+                       memory::format srcFmt,
+                       memory::dims targetDim);
+
+  /**
+   * Reorder this MKLDNNMatrix to other format.
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change the dst dim or format info
+   */
+  void reorderDataTo(const MKLDNNMatrixPtr& m,
+                     memory::format dstFmt,
+                     memory::dims targetDim);
+
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
+   */
+  void downSpatial();
+
+  /**
+   * set the memory data handle.
+   * Caution: This will not check the buffer size of the data,
+   *          it should be coverd by user.
+   */
+  void setData(real* data) {
+    set_data_handle(data);
+    CpuMatrix::setData(data);
+    m_.reset();
+  }
+
+  /**
+   * override the CpuMatrix::resize
+   */
+  void resize(size_t newHeight, size_t newWidth) override {
+    m_->resize(newHeight, newWidth);
+    if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
+      return;
+    }
+    CpuMatrix::setData(data_);
+    height_ = newHeight;
+    width_ = newWidth;
+    elementCnt_ = newHeight * newWidth;
+    stride_ = width_;
+    auto pd = mkldnn::memory::primitive_desc(
+        mkldnn::memory::desc({(int)newHeight, (int)newWidth},
+                             getDtype(),
+                             mkldnn::memory::format::nc),
+        getEngine());
+    resetMKLDNNMemory(pd, data_);
+  }
+
+  /**
+   * override Matrix::getData
+   * check data before return
+   */
+  real* getData() override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }
+
+  const real* getData() const override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }
+
+  /**
+   * Get primitive descriptor.
+   */
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }
+
+  /**
+   * Get memory descriptor.
+   */
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
+
+  /**
+   * Get dimensions.
+   */
+  mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMemoryDesc();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
+    mkldnn::memory::dims dst;
+    dst.resize(ndims);
+    for (int i = 0; i < ndims; ++i) {
+      dst[i] = src[i];
+    }
+    return dst;
+  }
+
+  /**
+   * Get format.
+   */
+  mkldnn::memory::format getFormat() {
+    return (mkldnn::memory::format)(getMemoryDesc().data.format);
+  }
+
+  /**
+   * Get memory data type.
+   */
+  mkldnn::memory::data_type getDtype() {
+    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
+  }
+
+  /**
+   * Get engine.
+   */
+  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
+
+ protected:
+  /**
+   * Do reorder once.
+   * Can support inplace.
+   */
+  void reorderOnce(void* srcData,
+                   void* dstData,
+                   memory::format srcFmt,
+                   memory::format dstFmt,
+                   memory::dims dm);
+  /**
+   * reset this MKLDNN Memory from primitve desc
+   */
+  void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
+    mkldnn_primitive_t result;
+    mkldnn::error::wrap_c_api(
+        mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+        "could not create a memory primitive");
+    reset(result);
+    set_data_handle(data);
+  }
+
+ private:
+  // save the CpuMatrixPtr in case the buffer released outside
+  CpuMatrixPtr m_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/MathFunctions.cpp b/paddle/legacy/math/MathFunctions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bbf34a32f36fa7988058f8d3bb7f91eaf2bc1ba0
--- /dev/null
+++ b/paddle/legacy/math/MathFunctions.cpp
@@ -0,0 +1,348 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/math/MathFunctions.h"
+#include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
+#include "paddle/legacy/utils/DynamicLoader.h"
+
+namespace dynload {
+
+std::once_flag lapack_dso_flag;
+void* lapack_dso_handle = nullptr;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load lapack routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+
+// The argument for stringizing operator is not macro-expanded first.
+// We have to use two levels of macro to do the expansion.
+// See https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html
+#define STR(x) #x
+
+// clang-format off
+#ifndef LAPACK_FOUND
+#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      using lapack_func = decltype(__name(args...)) (*)(Args...);              \
+      std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \
+      void* p_##__name = dlsym(lapack_dso_handle, STR(__name));                \
+      CHECK(p_##__name) << "Cannot find symbol " << STR(__name)                \
+                        << " in liblapack.so";                                 \
+      return reinterpret_cast<lapack_func>(p_##__name)(args...);               \
+    }                                                                          \
+  } __name;  // struct DynLoad__##__name
+#else
+#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      return __name(args...);                                                  \
+    }                                                                          \
+  } __name;  // struct DynLoad__##__name
+#endif
+
+#define  PADDLE_SGETRF  LAPACKE_sgetrf
+#define  PADDLE_DGETRF  LAPACKE_dgetrf
+#define  PADDLE_SGETRI  LAPACKE_sgetri
+#define  PADDLE_DGETRI  LAPACKE_dgetri
+
+#define LAPACK_ROUTINE_EACH(__macro)       \
+  __macro(PADDLE_SGETRF)                   \
+  __macro(PADDLE_DGETRF)                   \
+  __macro(PADDLE_SGETRI)                   \
+  __macro(PADDLE_DGETRI)
+// clang-format on
+
+LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
+
+}  // namespace dynload
+
+namespace paddle {
+
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
+template <>
+void gemm<float>(const CBLAS_TRANSPOSE transA,
+                 const CBLAS_TRANSPOSE transB,
+                 const int M,
+                 const int N,
+                 const int K,
+                 const float alpha,
+                 const float* A,
+                 const int lda,
+                 const float* B,
+                 const int ldb,
+                 const float beta,
+                 float* C,
+                 const int ldc) {
+  cblas_sgemm(CblasRowMajor,
+              transA,
+              transB,
+              M,
+              N,
+              K,
+              alpha,
+              A,
+              lda,
+              B,
+              ldb,
+              beta,
+              C,
+              ldc);
+}
+
+template <>
+void gemm<double>(const CBLAS_TRANSPOSE transA,
+                  const CBLAS_TRANSPOSE transB,
+                  const int M,
+                  const int N,
+                  const int K,
+                  const double alpha,
+                  const double* A,
+                  const int lda,
+                  const double* B,
+                  const int ldb,
+                  const double beta,
+                  double* C,
+                  const int ldc) {
+  cblas_dgemm(CblasRowMajor,
+              transA,
+              transB,
+              M,
+              N,
+              K,
+              alpha,
+              A,
+              lda,
+              B,
+              ldb,
+              beta,
+              C,
+              ldc);
+}
+#endif
+
+template <>
+int getrf<float>(const CBLAS_ORDER order,
+                 const int M,
+                 const int N,
+                 float* A,
+                 const int lda,
+                 int* ipiv) {
+  return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv);
+}
+
+template <>
+int getrf<double>(const CBLAS_ORDER order,
+                  const int M,
+                  const int N,
+                  double* A,
+                  const int lda,
+                  int* ipiv) {
+  return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv);
+}
+
+template <>
+int getri<float>(const CBLAS_ORDER order,
+                 const int N,
+                 float* A,
+                 const int lda,
+                 const int* ipiv) {
+  return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv);
+}
+
+template <>
+int getri<double>(const CBLAS_ORDER order,
+                  const int N,
+                  double* A,
+                  const int lda,
+                  const int* ipiv) {
+  return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
+}
+
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
+template <>
+void axpy<float>(const int n, const float alpha, const float* x, float* y) {
+  cblas_saxpy(n, alpha, x, 1, y, 1);
+}
+
+template <>
+void axpy<double>(const int n, const double alpha, const double* x, double* y) {
+  cblas_daxpy(n, alpha, x, 1, y, 1);
+}
+
+template <>
+float dotProduct<float>(const int n, const float* x, const float* y) {
+  return cblas_sdot(n, x, 1, y, 1);
+}
+
+template <>
+double dotProduct<double>(const int n, const double* x, const double* y) {
+  return cblas_ddot(n, x, 1, y, 1);
+}
+#endif
+
+#if defined(PADDLE_WITH_MKLML)
+
+template <>
+void vExp<float>(const int n, const float* a, float* r) {
+  vsExp(n, a, r);
+}
+
+template <>
+void vExp<double>(const int n, const double* a, double* r) {
+  vdExp(n, a, r);
+}
+
+template <>
+void vPow<float>(const int n, const float* a, const float b, float* r) {
+  vsPowx(n, a, b, r);
+}
+
+template <>
+void vPow<double>(const int n, const double* a, const double b, double* r) {
+  vdPowx(n, a, b, r);
+}
+
+template <>
+void vLog<float>(const int n, const float* a, float* r) {
+  vsLn(n, a, r);
+}
+
+template <>
+void vLog<double>(const int n, const double* a, double* r) {
+  vdLn(n, a, r);
+}
+
+template <>
+void vAdd<float>(const int n, const float* a, const float* b, float* r) {
+  vsAdd(n, a, b, r);
+}
+
+template <>
+void vAdd<double>(const int n, const double* a, const double* b, double* r) {
+  vdAdd(n, a, b, r);
+}
+
+template <>
+void vTanh<float>(const int n, const float* a, float* r) {
+  vsTanh(n, a, r);
+}
+
+template <>
+void vTanh<double>(const int n, const double* a, double* r) {
+  vdTanh(n, a, r);
+}
+
+template <>
+void vInvSqrt<float>(const int n, const float* a, float* r) {
+  vsInvSqrt(n, a, r);
+}
+
+template <>
+void vInvSqrt<double>(const int n, const double* a, double* r) {
+  vdInvSqrt(n, a, r);
+}
+
+template <>
+void vLog1p<float>(const int n, const float* a, float* r) {
+  vsLog1p(n, a, r);
+}
+
+template <>
+void vLog1p<double>(const int n, const double* a, double* r) {
+  vdLog1p(n, a, r);
+}
+#else
+
+DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
+template <class T>
+void vExp(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
+      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
+template <class T>
+void vLog(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
+      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
+template <class T>
+void vPow(const int n, const T* a, const T b, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
+      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
+template <class T>
+void vAdd(const int n, const T* a, const T* b, T* r) {
+  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
+                                                     const_cast<T*>(a),
+                                                     const_cast<T*>(b),
+                                                     r,
+                                                     1,
+                                                     n,
+                                                     n,
+                                                     n,
+                                                     n);
+}
+
+DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
+template <class T>
+void vInvSqrt(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vInvSqrt<T>, 0, 0>(
+      binary::vInvSqrt<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a));
+template <class T>
+void vLog1p(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vLog1p<T>, 0, 0>(
+      binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
+template <class T>
+void vTanh(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
+      binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+template void vExp(const int n, const float* a, float* r);
+template void vExp(const int n, const double* a, double* r);
+template void vLog(const int n, const float* a, float* r);
+template void vLog(const int n, const double* a, double* r);
+template void vPow(const int n, const float* a, const float b, float* r);
+template void vPow(const int n, const double* a, const double b, double* r);
+template void vAdd(const int n, const float* a, const float* b, float* r);
+template void vAdd(const int n, const double* a, const double* b, double* r);
+template void vInvSqrt(const int n, const double* a, double* r);
+template void vInvSqrt(const int n, const float* a, float* r);
+template void vLog1p(const int n, const float* a, float* r);
+template void vLog1p(const int n, const double* a, double* r);
+template void vTanh(const int n, const float* a, float* r);
+template void vTanh(const int n, const double* a, double* r);
+#endif
+}  // namespace paddle
diff --git a/paddle/legacy/math/MathFunctions.h b/paddle/legacy/math/MathFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..854e4baa3987f61353038c7b26acf43943c89636
--- /dev/null
+++ b/paddle/legacy/math/MathFunctions.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_MKLML
+#include <mkl_cblas.h>
+#include <mkl_lapacke.h>
+#include <mkl_vml_functions.h>
+#endif
+
+#ifdef PADDLE_USE_VECLIB
+extern "C" {
+#include <cblas.h>
+#include <clapack.h>
+}
+#endif
+
+#ifdef PADDLE_USE_OPENBLAS
+#include <cblas.h>
+#ifdef LAPACK_FOUND
+#include <lapacke.h>
+#endif
+#endif
+
+#ifndef LAPACK_FOUND
+extern "C" {
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
+#include <cblas.h>
+#else
+typedef enum CBLAS_ORDER {
+  CblasRowMajor = 101,
+  CblasColMajor = 102
+} CBLAS_ORDER;
+#endif
+int LAPACKE_sgetrf(
+    int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
+int LAPACKE_dgetrf(
+    int matrix_layout, int m, int n, double* a, int lda, int* ipiv);
+int LAPACKE_sgetri(
+    int matrix_layout, int n, float* a, int lda, const int* ipiv);
+int LAPACKE_dgetri(
+    int matrix_layout, int n, double* a, int lda, const int* ipiv);
+}
+#endif
+
+#include <cmath>
+
+namespace paddle {
+
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
+template <class T>
+void gemm(const CBLAS_TRANSPOSE transA,
+          const CBLAS_TRANSPOSE transB,
+          const int M,
+          const int N,
+          const int K,
+          const T alpha,
+          const T* A,
+          const int lda,
+          const T* B,
+          const int ldb,
+          const T beta,
+          T* C,
+          const int ldc);
+#endif
+
+template <class T>
+int getrf(const CBLAS_ORDER Order,
+          const int M,
+          const int N,
+          T* A,
+          const int lda,
+          int* ipiv);
+
+template <class T>
+int getri(
+    const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
+
+template <class T>
+void axpy(const int n, const T alpha, const T* x, T* y) {
+  /// y = y + alpha * x
+  for (int i = 0; i < n; i++) {
+    y[i] = y[i] + alpha * x[i];
+  }
+}
+
+template <class T>
+T dotProduct(const int n, const T* x, const T* y) {
+  T result = static_cast<T>(0);
+  for (int i = 0; i < n; i++) {
+    result += x[i] * y[i];
+  }
+  return result;
+}
+
+template <class T>
+void vExp(const int n, const T* a, T* r);
+
+template <class T>
+void vPow(const int n, const T* a, const T b, T* r);
+
+template <class T>
+void vLog(const int n, const T* a, T* r);
+
+template <class T>
+void vAdd(const int n, const T* a, const T* b, T* r);
+
+template <class T>
+void vInvSqrt(const int n, const T* a, T* r);
+
+template <class T>
+void vLog1p(const int n, const T* a, T* r);
+
+template <class T>
+void vTanh(const int n, const T* a, T* r);
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/MathUtils.cpp b/paddle/legacy/math/MathUtils.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..47ac9c187ca731c98c755501ff3633eabf095186
--- /dev/null
+++ b/paddle/legacy/math/MathUtils.cpp
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MathUtils.h"
+#include <algorithm>
+#include "Vector.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+/*if csc, major is cols and minor is rows, else
+ * major is rows and minor is cols, according to
+ * major value to initialize minor value"
+ */
+void sparseRand(
+    int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) {
+  CHECK(size_t(nnz) >= size_t(1));
+  int* cpuMajor;
+  int* cpuMinor;
+  CpuIVector cpuMinorVec(nnz);
+  CpuIVector cpuMajorVec(majorLen);
+  if (useGpu) {
+    cpuMajor = cpuMajorVec.getData();
+    cpuMinor = cpuMinorVec.getData();
+  } else {
+    cpuMajor = major;
+    cpuMinor = minor;
+  }
+
+  /*major value init*/
+  for (int i = 0; i < majorLen - 1; i++) {
+    cpuMajor[i] = 1.0 * i * nnz / (majorLen - 1);
+  }
+  cpuMajor[majorLen - 1] = nnz;
+
+  /*minor value init according to major value*/
+  std::vector<char> used(minorMax, 0);
+  for (int i = 0; i < majorLen - 1; i++) {
+    CHECK_LE(cpuMajor[i + 1] - cpuMajor[i], minorMax);
+    used.assign(minorMax, 0);
+    for (int j = cpuMajor[i]; j < cpuMajor[i + 1]; j++) {
+      int idx = ::rand() % minorMax;
+      while (used[idx]) {
+        idx = ::rand() % minorMax;
+      }
+      cpuMinor[j] = idx;
+      used[idx] = 1;
+    }
+    std::sort(cpuMinor + cpuMajor[i],
+              cpuMinor + cpuMajor[i + 1],
+              [](int a, int b) { return a < b; });
+  }
+  /*memcpy result to gpu*/
+  if (useGpu) {
+    hl_memcpy_host2device(major, cpuMajor, sizeof(int) * majorLen);
+    hl_memcpy_host2device(minor, cpuMinor, sizeof(int) * nnz);
+  }
+}
+
+int outputSize(
+    int imageSize, int filterSize, int padding, int stride, bool caffeMode) {
+  int outputSize;
+  if (!caffeMode) {
+    outputSize =
+        (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
+  } else {
+    outputSize = (imageSize - filterSize + 2 * padding) / stride + 1;
+  }
+  CHECK_GE(outputSize, 1);
+  return outputSize;
+}
+
+int imageSize(
+    int outputSize, int filterSize, int padding, int stride, bool caffeMode) {
+  int imageSize;
+  if (!caffeMode) {
+    imageSize =
+        (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1;
+  } else {
+    imageSize = (outputSize - 1) * stride + filterSize - 2 * padding;
+  }
+  CHECK_GE(imageSize, 1);
+  return imageSize;
+}
+
+}  // namespace paddle
diff --git a/paddle/math/MathUtils.h b/paddle/legacy/math/MathUtils.h
similarity index 100%
rename from paddle/math/MathUtils.h
rename to paddle/legacy/math/MathUtils.h
diff --git a/paddle/legacy/math/Matrix.cpp b/paddle/legacy/math/Matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e53f95006c36bfce5df8e57e9efc249f56098b70
--- /dev/null
+++ b/paddle/legacy/math/Matrix.cpp
@@ -0,0 +1,4787 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Matrix.h"
+#include "MathFunctions.h"
+#include "SparseMatrix.h"
+#include "SparseRowMatrix.h"
+
+#include <float.h>
+#include <algorithm>
+#include <cmath>
+
+#include <string.h>
+#include "hl_cnn.h"
+#include "hl_gpu.h"
+#include "hl_table_apply.h"
+#include "hl_top_k.h"
+#include "paddle/legacy/utils/Logging.h"
+
+#include "NEONFunctions.h"
+#include "paddle/legacy/function/GemmFunctor.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+#include "SIMDFunctions.h"
+
+namespace paddle {
+
+inline real _pow(real a, real beta) { return std::pow(a, beta); }
+
+inline real _square(real a) { return a * a; }
+
+inline real _safelog(real a) { return a > 0.0f ? std::log(a) : -40.0f; }
+
+Matrix::Matrix(MemoryHandlePtr memHandle,
+               size_t height,
+               size_t width,
+               bool trans,
+               bool use_gpu)
+    : BaseMatrix(
+          height,
+          width,
+          memHandle ? (reinterpret_cast<real*>(memHandle->getBuf())) : nullptr,
+          trans,
+          use_gpu) {
+  elementCnt_ = width * height;
+  memoryHandle_ = memHandle;
+}
+
+Matrix::Matrix(
+    real* data, size_t height, size_t width, bool trans, bool use_gpu)
+    : BaseMatrix(height, width, data, trans, use_gpu) {
+  elementCnt_ = width * height;
+}
+
+Matrix::Matrix(real* data,
+               size_t height,
+               size_t width,
+               size_t stride,
+               bool trans,
+               bool use_gpu)
+    : BaseMatrix(height, width, stride, data, trans, use_gpu) {
+  elementCnt_ = width * height;
+}
+
+MatrixPtr Matrix::createSparseMatrix(real* data,
+                                     int* row,
+                                     int* col,
+                                     size_t height,
+                                     size_t width,
+                                     size_t nnz, /* used to allocate space */
+                                     SparseValueType valueType, /*value type*/
+                                     SparseFormat format,
+                                     bool trans,
+                                     bool useGpu) {
+  if (useGpu) {
+    return std::make_shared<GpuSparseMatrix>(
+        data, row, col, height, width, nnz, valueType, format, trans);
+  } else {
+    return std::make_shared<CpuSparseMatrix>(
+        data, row, col, height, width, nnz, valueType, format, trans);
+  }
+}
+
+MatrixPtr Matrix::createSparseMatrix(size_t height,
+                                     size_t width,
+                                     size_t nnz, /* used to allocate space */
+                                     SparseValueType valueType, /*value type*/
+                                     SparseFormat format,
+                                     bool trans,
+                                     bool useGpu) {
+  if (useGpu) {
+    return std::make_shared<GpuSparseMatrix>(
+        height, width, nnz, valueType, format, trans);
+  } else {
+    return std::make_shared<CpuSparseMatrix>(
+        height, width, nnz, valueType, format, trans);
+  }
+}
+
+MatrixPtr Matrix::create(MemoryHandlePtr memHandle,
+                         size_t height,
+                         size_t width,
+                         bool trans) {
+  if (auto gpuHandle = std::dynamic_pointer_cast<GpuMemoryHandle>(memHandle)) {
+    return std::make_shared<GpuMatrix>(gpuHandle, height, width, trans);
+  } else if (auto cpuHandle =
+                 std::dynamic_pointer_cast<CpuMemoryHandle>(memHandle)) {
+    return std::make_shared<CpuMatrix>(cpuHandle, height, width, trans);
+  } else {
+    LOG(FATAL) << "Wrong";
+    return nullptr;
+  }
+}
+
+MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) {
+  if (useGpu) {
+    return std::make_shared<GpuMatrix>(height, width, trans);
+  } else {
+    return std::make_shared<CpuMatrix>(height, width, trans);
+  }
+}
+
+MatrixPtr Matrix::create(
+    real* data, size_t height, size_t width, bool trans, bool useGpu) {
+  if (useGpu) {
+    return std::make_shared<GpuMatrix>(data, height, width, trans);
+  } else {
+    return std::make_shared<CpuMatrix>(data, height, width, trans);
+  }
+}
+
+MatrixPtr Matrix::create(real* data,
+                         size_t height,
+                         size_t width,
+                         size_t stride,
+                         bool trans,
+                         bool useGpu) {
+  if (useGpu) {
+    return std::make_shared<GpuMatrix>(data, height, width, stride, trans);
+  } else {
+    return std::make_shared<CpuMatrix>(data, height, width, stride, trans);
+  }
+}
+
+MatrixPtr Matrix::createSparseMatrix(size_t height,
+                                     size_t width,
+                                     size_t nnz,
+                                     SparseValueType valueType,
+                                     bool trans,
+                                     bool useGpu) {
+  if (useGpu) {
+    return std::make_shared<GpuSparseMatrix>(
+        height, width, nnz, valueType, SPARSE_CSR, trans);
+  } else {
+    return std::make_shared<CpuSparseMatrix>(
+        height, width, nnz, valueType, SPARSE_CSR, trans);
+  }
+}
+
+void Matrix::resizeOrCreate(
+    MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) {
+  if (!matrix) {
+    matrix = Matrix::create(height, width, trans, useGpu);
+  } else {
+    CHECK_EQ(matrix->useGpu(), useGpu);
+    matrix->resize(height, width);
+  }
+}
+
+void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix,
+                                        size_t height,
+                                        size_t width,
+                                        size_t nnz,
+                                        SparseValueType valueType,
+                                        SparseFormat format,
+                                        bool trans,
+                                        bool useGpu) {
+  if (!matrix) {
+    matrix = Matrix::createSparseMatrix(
+        height, width, nnz, valueType, format, trans, useGpu);
+  } else {
+    CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
+          dynamic_cast<GpuSparseMatrix*>(matrix.get()));
+    CHECK_EQ(matrix->useGpu(), useGpu);
+    matrix->resize(height, width, nnz, valueType, format);
+  }
+}
+
+void Matrix::reshape(size_t height, size_t width) {
+  CHECK(isContiguous());
+  CHECK(height_ * width_ == height * width);
+  height_ = height;
+  width_ = width;
+  stride_ = width_;
+}
+
+MatrixPtr Matrix::subMatrix(size_t startRow,
+                            size_t endRow,
+                            size_t startCol,
+                            size_t endCol) {
+  CHECK_LE(startRow, endRow);
+  CHECK_LE(endRow, getHeight());
+  CHECK_LE(startCol, endCol);
+  CHECK_LE(endCol, getWidth());
+
+  return Matrix::create(getData() + startRow * getStride() + startCol,
+                        endRow - startRow,
+                        endCol - startCol,
+                        getStride(),
+                        trans_,
+                        useGpu_);
+}
+
+void Matrix::setDiag(real value) {
+  CHECK(data_ != NULL);
+  CHECK_EQ(height_, width_);
+
+  zeroMem();
+  BaseMatrix diag(height_, 1, stride_ + 1, data_, false, useGpu_);
+  diag.assign(value);
+}
+
+GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans)
+    : Matrix(std::make_shared<GpuMemoryHandle>(height * width * sizeof(real)),
+             height,
+             width,
+             trans,
+             true) {}
+
+GpuMatrix::~GpuMatrix() {}
+
+void GpuMatrix::zeroMem() {
+  CHECK(data_ != NULL);
+  zero();
+}
+
+void GpuMatrix::resetOne() {
+  CHECK(data_ != NULL);
+  one();
+}
+
+void GpuMatrix::resize(size_t newHeight, size_t newWidth) {
+  size_t newSize = newHeight * newWidth;
+  if (NULL == memoryHandle_.get() ||
+      newSize * sizeof(real) > memoryHandle_->getAllocSize()) {
+    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
+    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
+  }
+  height_ = newHeight;
+  width_ = newWidth;
+  elementCnt_ = newSize;
+  stride_ = width_;
+}
+
+real GpuMatrix::getElement(size_t x, size_t y) const {
+  real elem = 0;
+  hl_memcpy_device2host(&elem, &data_[x * stride_ + y], sizeof(real));
+  return elem;
+}
+
+real GpuMatrix::getSum() {
+  CHECK(isContiguous());
+  real sum = 0.0f;
+  hl_vector_sum(data_, &sum, height_ * width_);
+  return sum;
+}
+
+real GpuMatrix::getMin() {
+  CHECK(isContiguous());
+  auto vec = GpuVector(height_ * width_, data_);
+  return vec.getMin();
+}
+
+real GpuMatrix::getMax() {
+  CHECK(isContiguous());
+  auto vec = GpuVector(height_ * width_, data_);
+  return vec.getMax();
+}
+
+void GpuMatrix::accumulateColSum(Matrix& src) {
+  CHECK_EQ(getWidth(), src.getWidth());
+  CHECK_EQ(getHeight(), (size_t)1);
+  sumCols(src, 1.0, 1.0);
+}
+
+real GpuMatrix::getAbsSum() {
+  CHECK(isContiguous());
+  real sum = 0.0f;
+  hl_vector_abs_sum(data_, &sum, height_ * width_);
+  return sum;
+}
+
+void GpuMatrix::copyFrom(const Matrix& src) {
+  CHECK(isContiguous());
+  CHECK(src.isContiguous());
+  CHECK(elementCnt_ == src.getElementCnt());
+
+  if (typeid(src) == typeid(CpuMatrix)) {
+    hl_memcpy_host2device(
+        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
+  } else if (typeid(src) == typeid(GpuMatrix)) {
+    hl_memcpy_device2device(
+        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
+  } else {
+    LOG(FATAL) << "Wrong";
+  }
+}
+
+void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
+  CHECK(isContiguous());
+  CHECK(src.isContiguous());
+  CHECK(elementCnt_ == src.getElementCnt());
+  hl_memcpy_async(this->getData(),
+                  const_cast<real*>(src.getData()),
+                  sizeof(real) * elementCnt_,
+                  stream);
+}
+
+void GpuMatrix::copyFrom(const real* hostSrc, size_t size) {
+  CHECK(isContiguous());
+  CHECK(size <= elementCnt_);
+  hl_memcpy_host2device(data_, const_cast<real*>(hostSrc), sizeof(real) * size);
+}
+
+void GpuMatrix::copyFrom(const real* hostSrc, const int64_t* seq) {
+  LOG(FATAL) << "not implemented";
+}
+
+void GpuMatrix::copyFrom(const IVector& src) {
+  CHECK(isContiguous());
+  CpuMatrix matrix(src.getSize(), 1, false);
+  matrix.copyFrom(src);
+  copyFrom(matrix);
+}
+
+void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
+  size_t height = getHeight();
+  size_t width = getWidth();
+  CHECK_EQ(b.getWidth(), width);
+  real* dst = getData();
+  real* src = b.getData();
+  const int* index = rowIndex.getData();
+  hl_sequence2batch_copy(dst, src, index, width, height, true);
+}
+
+MatrixPtr GpuMatrix::clone(size_t height, size_t width, bool useGpu) {
+  CHECK(isContiguous());
+
+  if (height == 0 && width == 0) {
+    height = height_;
+    width = width_;
+  }
+
+  CHECK(width && height);
+
+  if (useGpu) {
+    return std::make_shared<GpuMatrix>(height, width);
+  } else {
+    return std::make_shared<CpuMatrix>(height, width);
+  }
+}
+
+MatrixPtr GpuMatrix::getTranspose() {
+  if (memoryHandle_.get() != NULL) {
+    MatrixPtr copy_T(
+        new GpuMatrix(std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle_),
+                      height_,
+                      width_,
+                      true));
+    return copy_T;
+  } else {
+    MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true));
+    return copy_T;
+  }
+}
+
+void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
+  if (memAlloc) {
+    matTrans = std::make_shared<GpuMatrix>(width_, height_);
+  } else {
+    CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
+  }
+  real* dataTrans = matTrans->getData();
+  real* data = getData();
+  int lda = getStride();
+  int ldc = matTrans->getStride();
+
+  hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
+}
+
+void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {
+    matRot = std::make_shared<GpuMatrix>(width_, height_);
+  } else {
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+
+  real* dataRot = matRot->getData();
+  real* data = getData();
+  hl_matrix_rotate(data, dataRot, height_, width_, clockWise);
+}
+
+MatrixPtr GpuMatrix::getInverse() {
+  MatrixPtr matInv;
+  inverse(matInv, true);
+  return matInv;
+}
+
+void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
+  CHECK_EQ(height_, width_);
+
+  if (memAlloc) {
+    matInv = std::make_shared<GpuMatrix>(height_, width_);
+  } else {
+    CHECK(matInv != NULL);
+  }
+
+  real* data = getData();
+  real* dataInv = matInv->getData();
+  int lda = getStride();
+  int ldc = matInv->getStride();
+
+  hl_matrix_inverse(data, dataInv, height_, lda, ldc);
+}
+
+void GpuMatrix::addBias(Matrix& b, real scale) {
+  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
+  BaseMatrix::addBias(b, scale);
+}
+
+void GpuMatrix::addSharedBias(Matrix& b, real scale) {
+  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
+  CHECK_LE(b.getWidth(), getWidth());
+  CHECK_EQ(getWidth() % b.getWidth(), 0UL);
+  hl_matrix_add_shared_bias(
+      getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale);
+}
+
+void GpuMatrix::collectBias(Matrix& a, real scale) {
+#ifdef PADDLE_WITH_CUDA
+  CHECK_EQ(getHeight(), (size_t)1);
+  CHECK_EQ(width_, a.getWidth());
+  GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
+  if (!sMatPtr) {
+    sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1);
+  } else {
+    real* data = getData();
+    hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
+    hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
+  }
+#endif
+}
+
+void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
+  CHECK_EQ(getHeight(), (size_t)1);
+  CHECK_EQ(a.getWidth() % getWidth(), 0UL);
+  hl_matrix_collect_shared_bias(
+      getData(), a.getData(), getWidth(), a.getHeight(), a.getWidth(), scale);
+}
+
+void GpuMatrix::sequenceAvgForward(Matrix& a,
+                                   const IVector& startsPos,
+                                   int mode) {
+  size_t height = getHeight();
+  size_t width = getWidth();
+  CHECK_EQ(height, startsPos.getSize() - 1);
+  CHECK_EQ(width, a.getWidth());
+  real* dst = getData();
+  real* src = a.getData();
+  const int* starts = startsPos.getData();
+
+  hl_sequence_avg_forward(dst, src, starts, height, width, mode);
+}
+
+void GpuMatrix::sequenceAvgBackward(Matrix& a,
+                                    const IVector& startsPos,
+                                    int mode) {
+  size_t height = a.getHeight();
+  size_t width = getWidth();
+  CHECK_EQ(height, startsPos.getSize() - 1);
+  CHECK_EQ(width, a.getWidth());
+  real* dst = getData();
+  real* src = a.getData();
+  const int* starts = startsPos.getData();
+
+  hl_sequence_avg_backward(dst, src, starts, height, width, mode);
+}
+
+/* this = scaleAB*(a*b) +  scaleT*this */
+void GpuMatrix::mul(const GpuMatrix& a,
+                    const GpuMatrix& b,
+                    real scaleAB,
+                    real scaleT) {
+  CHECK(!isTransposed()) << "Not supported";
+
+  if (!a.isTransposed() && !b.isTransposed()) {
+    CHECK_EQ(width_, b.width_);
+    CHECK_EQ(height_, a.height_);
+    CHECK_EQ(a.width_, b.height_);
+  } else if (a.isTransposed() && !b.isTransposed()) {
+    CHECK_EQ(width_, b.width_);
+    CHECK_EQ(height_, a.width_);
+    CHECK_EQ(a.height_, b.height_);
+  } else if (!a.isTransposed() && b.isTransposed()) {
+    CHECK_EQ(width_, b.height_);
+    CHECK_EQ(height_, a.height_);
+    CHECK_EQ(a.width_, b.width_);
+  } else {
+    LOG(FATAL) << "Is not supported";
+  }
+
+  real* A_d = a.data_;
+  real* B_d = b.data_;
+  real* C_d = data_;
+  int dimM = getHeight();
+  int dimN = getWidth();
+  int dimK = !a.isTransposed() ? a.width_ : a.height_;
+  int lda = a.getStride();
+  int ldb = b.getStride();
+  int ldc = getStride();
+  hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
+  hl_trans_op_t transb = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
+
+  hl_matrix_mul(A_d,
+                transa,
+                B_d,
+                transb,
+                C_d,
+                dimM,
+                dimN,
+                dimK,
+                scaleAB,
+                scaleT,
+                lda,
+                ldb,
+                ldc);
+}
+
+void GpuMatrix::mul(const GpuSparseMatrix& a,
+                    const GpuMatrix& b,
+                    real scaleAB,
+                    real scaleT) {
+#ifdef PADDLE_WITH_CUDA
+  CHECK(isContiguous());
+  CHECK(b.isContiguous());
+  CHECK(b.useGpu_ == true) << "Matrix type are not equal";
+  CHECK(!trans_ && !b.trans_) << "not supported";
+
+  if (!a.trans_) {
+    CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_)
+        << "Matrix dimensions are not equal";
+  } else {
+    CHECK(width_ == b.width_ && height_ == a.width_ && a.height_ == b.height_)
+        << "Matrix dimensions are not equal";
+  }
+  hl_trans_op_t transA = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
+  hl_sparse_matrix_s A_d = a.sMatrix_.get();
+  real* B_d = b.data_;
+  real* C_d = data_;
+  hl_matrix_csr_mul_dense(A_d,
+                          transA,
+                          B_d,
+                          HPPL_OP_N,
+                          C_d,
+                          height_,
+                          width_,
+                          b.height_,
+                          scaleAB,
+                          scaleT);
+#endif
+}
+
+void GpuMatrix::mul(const GpuMatrix& a,
+                    const GpuSparseMatrix& b,
+                    real scaleAB,
+                    real scaleT) {
+#ifdef PADDLE_WITH_CUDA
+  CHECK(isContiguous());
+  CHECK(a.isContiguous());
+  CHECK(a.useGpu_ == true) << "Matrix type are not equal";
+
+  hl_sparse_matrix_s B_d = b.sMatrix_.get();
+  real* A_d = a.data_;
+  real* C_d = data_;
+  hl_trans_op_t transB = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
+  if (!b.trans_) {
+    CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_)
+        << "Matrix dimensions are not equal";
+  } else {
+    CHECK(width_ == b.height_ && height_ == a.height_ && a.width_ == b.width_)
+        << "Matrix dimensions are not equal";
+  }
+  if (b.format_ == SPARSE_CSC) {
+    hl_matrix_dense_mul_csc(A_d,
+                            HPPL_OP_N,
+                            B_d,
+                            transB,
+                            C_d,
+                            height_,
+                            width_,
+                            a.width_,
+                            scaleAB,
+                            scaleT);
+  } else {
+    hl_matrix_dense_mul_csr(A_d,
+                            HPPL_OP_N,
+                            B_d,
+                            transB,
+                            C_d,
+                            height_,
+                            width_,
+                            a.width_,
+                            scaleAB,
+                            scaleT);
+  }
+#endif
+}
+
+/* this = a*b */
+void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); }
+
+void GpuMatrix::mul(const Matrix& a,
+                    const Matrix& b,
+                    real scaleAB,
+                    real scaleT) {
+  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
+  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
+  const auto a_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&a);
+  const auto b_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&b);
+
+  if (a_ptr && b_ptr) {
+    mul(*a_ptr, *b_ptr, scaleAB, scaleT);
+  } else if (a_ptr_s && b_ptr) {
+    mul(*a_ptr_s, *b_ptr, scaleAB, scaleT);
+  } else if (a_ptr && b_ptr_s) {
+    mul(*a_ptr, *b_ptr_s, scaleAB, scaleT);
+  } else {
+    LOG(FATAL) << "Not supported";
+  }
+}
+
+/* this = this* b */
+void GpuMatrix::rightMul(Matrix& b) { rightMul(b, 1.0, 0.0); }
+
+/* this = scaleAB*(this*b) +  scaleT*this */
+void GpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) {
+  CHECK(dynamic_cast<GpuMatrix*>(&b));
+  CHECK(!isTransposed()) << "Not supported";
+  CHECK(!b.isTransposed()) << "Not supported";
+  mul(*this, *dynamic_cast<GpuMatrix*>(&b), scaleAB, scaleT);
+}
+
+/* this = a*this */
+void GpuMatrix::leftMul(Matrix& a) { leftMul(a, 1.0, 0.0); }
+
+/* this = scaleAB*(a*this) +  scaleT*this */
+void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
+  CHECK(dynamic_cast<GpuMatrix*>(&a));
+  CHECK(!isTransposed()) << "Not supported";
+  CHECK(!a.isTransposed()) << "Not supported";
+  mul(*dynamic_cast<GpuMatrix*>(&a), *this, scaleAB, scaleT);
+}
+
+void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
+#ifdef PADDLE_WITH_CUDA
+  CHECK(dynamic_cast<GpuMatrix*>(&table));
+  CHECK(table.useGpu());
+  CHECK(ids.useGpu());
+  CHECK_EQ(getHeight(), ids.getSize());
+  CHECK_EQ(getWidth(), table.getWidth());
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  real* a = getData();
+  size_t tableSize = table.getHeight();
+  int* index = ids.getData();
+
+  hl_matrix_select_rows(a,
+                        stride_,
+                        table.getData(),
+                        table.stride_,
+                        index,
+                        numSamples,
+                        tableSize,
+                        dim);
+#endif
+}
+
+void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
+#ifdef PADDLE_WITH_CUDA
+  CHECK(dynamic_cast<GpuMatrix*>(&table));
+  CHECK(table.useGpu());
+  CHECK(ids.useGpu());
+  CHECK_EQ(getHeight(), ids.getSize());
+  CHECK_EQ(getWidth(), table.getWidth());
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  real* a = getData();
+  size_t tableSize = table.getHeight();
+  int* index = ids.getData();
+
+  hl_matrix_add_to_rows(table.getData(),
+                        table.stride_,
+                        a,
+                        stride_,
+                        index,
+                        numSamples,
+                        tableSize,
+                        dim);
+#endif
+}
+
+void GpuMatrix::colMerge(Matrix& src) {
+  CHECK(src.height_ == height_);
+  if (!trans_ && !src.trans_) {
+    sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0);
+  } else {
+    LOG(FATAL) << "Is not supported";
+  }
+}
+
+void GpuMatrix::rowSum(Matrix& sum) {
+  CHECK_EQ(sum.getHeight(), getHeight());
+  CHECK_EQ(sum.getWidth(), (size_t)1);
+
+  sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0);
+}
+
+void GpuMatrix::rowMax(Matrix& max) {
+  CHECK_EQ(max.getHeight(), getHeight());
+  CHECK_EQ(max.getWidth(), (size_t)1);
+
+  max.maxRows(*this);
+}
+
+void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
+#ifdef PADDLE_WITH_CUDA
+  CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
+  size_t numSamples = getHeight();
+  size_t beam = maxVal.getWidth();
+  CHECK_EQ(maxIds.getSize(), numSamples * beam);
+  CHECK_EQ(maxVal.getHeight(), numSamples);
+  CHECK_EQ(maxVal.getWidth(), beam);
+
+  hl_matrix_top_k(maxVal.getData(),
+                  maxVal.getStride(),
+                  maxIds.getData(),
+                  this->getData(),
+                  this->getStride(),
+                  this->getWidth(),
+                  beam,
+                  numSamples);
+#endif
+}
+
+void GpuMatrix::colMax(Matrix& max) {
+  CHECK_EQ(max.getWidth(), getWidth());
+  CHECK_EQ(max.getHeight(), (size_t)1);
+
+  max.maxCols(*this);
+}
+
+void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
+  LOG(FATAL) << "Is not supported";
+}
+
+void GpuMatrix::maxoutForward(Matrix& a,
+                              IVector& id,
+                              size_t channels,
+                              size_t groups) {
+  CHECK(dynamic_cast<GpuMatrix*>(&a));
+  CHECK(dynamic_cast<GpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+
+  size_t size = getWidth();
+  size_t batchSize = getHeight();
+  const real* input = a.getData();
+  real* output = getData();
+  int* idForGpu = id.getData();
+
+  hl_maxout_forward(
+      input, output, idForGpu, batchSize, size, size / channels, groups);
+}
+
+void GpuMatrix::maxoutBackward(Matrix& a,
+                               IVector& id,
+                               size_t channels,
+                               size_t groups) {
+  CHECK(dynamic_cast<GpuMatrix*>(&a));
+  CHECK(dynamic_cast<GpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+
+  size_t size = a.getWidth();
+  size_t batchSize = getHeight();
+  real* input = getData();
+  const real* output = a.getData();
+  const int* idForGpu = id.getData();
+
+  hl_maxout_backward(
+      input, output, idForGpu, batchSize, size, size / channels, groups);
+}
+
+/*calulate the error of classification */
+void GpuMatrix::classificationError(Matrix& output,
+                                    IVector& label,
+                                    size_t topkSize) {
+  auto gpuOutput = dynamic_cast<GpuMatrix*>(&output);
+  auto gpuLabel = dynamic_cast<GpuIVector*>(&label);
+  size_t numSamples = this->getHeight();
+  GpuMatrixPtr gpuTopVal = std::make_shared<GpuMatrix>(numSamples, topkSize);
+  GpuIVectorPtr gpuTopIds = std::make_shared<GpuIVector>(numSamples * topkSize);
+
+  CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer";
+  CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed";
+  CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal";
+  CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1)
+      << "Matrix dimensions are not equal";
+
+  size_t dim = gpuOutput->getWidth();
+  hl_matrix_classification_error(gpuTopVal->getData(),
+                                 gpuTopVal->getStride(),
+                                 gpuTopIds->getData(),
+                                 gpuOutput->getData(),
+                                 gpuOutput->getStride(),
+                                 dim,
+                                 topkSize,
+                                 numSamples,
+                                 gpuLabel->getData(),
+                                 this->getData());
+}
+
+/* copy -log(output[i * width + label]) to this->data[i] */
+void GpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) {
+  GpuMatrix* output_ptr = dynamic_cast<GpuMatrix*>(&output);
+  GpuIVector* label_ptr = dynamic_cast<GpuIVector*>(&label);
+
+  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
+
+  CHECK(height_ == label.getSize() && width_ == 1 && height_ == output.height_)
+      << "Matrix dimensions are not equal";
+
+  real* A_d = output_ptr->data_;
+  real* C_d = data_;
+  int* label_d = label_ptr->getData();
+
+  hl_matrix_cross_entropy(A_d, C_d, label_d, height_, output.width_);
+}
+
+/* calculate the error of outputV according to label */
+void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
+  GpuMatrix* output_ptr = dynamic_cast<GpuMatrix*>(&outputV);
+  GpuIVector* label_ptr = dynamic_cast<GpuIVector*>(&label);
+
+  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
+
+  CHECK(height_ == output_ptr->height_ && width_ == output_ptr->width_)
+      << "Matrix dimensions are not equal";
+
+  real* output_d = output_ptr->data_;
+  real* grad_d = data_;
+  int* label_d = label_ptr->getData();
+
+  hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_);
+}
+
+void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output,
+                                               IVector& label,
+                                               real alpha) {
+  LOG(FATAL) << "Not implemented";
+}
+
+void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
+                                                 IVector& label,
+                                                 real alpha) {
+  LOG(FATAL) << "Not implemented";
+}
+
+void GpuMatrix::softmax(Matrix& output) {
+  CHECK(output.useGpu()) << "Matrix type are not equal";
+
+  size_t height = getHeight();
+  size_t width = getWidth();
+  CHECK(height == output.getHeight() && width == output.getWidth())
+      << "Matrix dimensions are not equal";
+
+  real* inputData = getData();
+  real* outputData = output.getData();
+  hl_matrix_softmax(inputData, outputData, height, width);
+}
+
+void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) {
+  CHECK_EQ(getWidth(), 1UL);
+  CHECK_EQ(output.getWidth(), 1UL);
+  CHECK(isContiguous());
+
+  real* inputData = getData();
+  real* outputData = output.getData();
+  auto starts = index.getData();
+  int numSequences = index.getSize() - 1;
+  hl_sequence_softmax_forward(inputData, outputData, starts, numSequences);
+}
+
+void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
+  CHECK(output.useGpu_ == true && sftmaxSum.useGpu_ == true)
+      << "Matrix type are not equal";
+
+  CHECK(height_ == output.height_ && width_ == output.width_ &&
+        height_ == sftmaxSum.height_)
+      << "Matrix dimensions are not equal";
+
+  real* output_d = output.data_;
+  real* sftmaxSum_d = sftmaxSum.data_;
+  real* grad_d = data_;
+  hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_);
+}
+
+void GpuMatrix::softmaxBackward(Matrix& outputV) {
+  CHECK(outputV.useGpu()) << "Matrix type are not equal";
+
+  size_t height = getHeight();
+  size_t width = getWidth();
+  CHECK(height == outputV.getHeight() && width == outputV.getWidth())
+      << "Matrix dimensions are not equal";
+
+  real* output_grad = getData();
+  real* output_value = outputV.getData();
+  hl_softmax_backward(output_value, output_grad, height, width);
+}
+
+void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
+  CHECK_EQ(label.getHeight(), height_);
+  CHECK_EQ(output.getHeight(), height_);
+  CHECK_EQ(label.getWidth(), output.getWidth());
+  CHECK_EQ((size_t)1, width_);
+
+  auto labelptr = dynamic_cast<GpuSparseMatrix*>(&label);
+  if (labelptr) {
+    LOG(FATAL) << "not supported: GpuSparseMatrix as label";
+  }
+
+  BaseMatrix::sumOfSquaredDiffs(output,
+                                label,
+                                /* scaleSum= */ 1,
+                                /* scaleDest= */ 1);
+}
+
+void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) {
+  add2(outputV, label, 1, 2, -2);
+}
+
+void GpuMatrix::tanh(Matrix& output) { BaseMatrix::tanh(output); }
+
+void GpuMatrix::tanhDerivative(Matrix& output) {
+  BaseMatrix::tanhDerivative(output);
+}
+
+void GpuMatrix::softrelu(Matrix& output) { BaseMatrix::softrelu(output); }
+
+void GpuMatrix::softreluDerivative(Matrix& output) {
+  BaseMatrix::softreluDerivative(output);
+}
+
+void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
+  BaseMatrix::scaledTanh(output, p1, p2);
+}
+
+void GpuMatrix::randomizeUniform() {
+  CHECK(isContiguous());
+  real* data = data_;
+  size_t size = height_ * width_;
+
+  hl_rand(data, size);
+}
+
+void GpuMatrix::print(std::ostream& os) const {
+  CHECK(isContiguous());
+  CpuMatrix cpuMat(getHeight(), getWidth());
+  cpuMat.copyFrom(*this);
+  cpuMat.print(os);
+}
+
+void GpuMatrix::print(std::ostream& os, size_t height, size_t width) const {
+  CHECK(isContiguous());
+  CpuMatrix cpuMat(getHeight(), getWidth());
+  cpuMat.copyFrom(*this);
+  cpuMat.print(os, height, width);
+}
+
+void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
+  CHECK(isContiguous());
+  CHECK(height_ == refMat.getHeight());
+  CHECK(width_ == refMat.getWidth());
+  CpuMatrix cpuRef(height_, width_);
+  GpuMatrix gpuRef(height_, width_);
+  cpuRef.copyFrom(refMat);
+  gpuRef.copyFrom(*this);
+  size_t diffCnt = 0;
+  for (size_t i = 0; i < height_; ++i) {
+    for (size_t j = 0; j < width_; ++j) {
+      real a = gpuRef.getElement(i, j);
+      real b = cpuRef.getElement(i, j);
+      if (fabs(a - b) > 0.00001) {
+        ++diffCnt;
+        if (printDiff) {
+          os << "ref= " << a << "  check= " << b << std::endl;
+        }
+      }
+    }
+  }
+  LOG(INFO) << "the  diffCnt is " << diffCnt;
+}
+
+void GpuMatrix::upsampleForward(Matrix& input,
+                                Matrix& mask,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t outputH,
+                                size_t outputW) {
+  CHECK(input.useGpu_ == true) << "Matrix type are not equal";
+  CHECK(mask.useGpu_ == true) << "Matrix type are not equal";
+
+  real* inputData = input.getData();
+  real* maskData = mask.getData();
+  real* outData = data_;
+
+  size_t batch = input.getHeight();
+
+  CHECK(imgSizeH * imgSizeW * channels == input.getWidth());
+  CHECK(imgSizeH * imgSizeW * channels == mask.getWidth());
+  CHECK_EQ(batch, this->getHeight());
+  CHECK(width_ == outputH * outputW * channels);
+  hl_upsample_forward(inputData,
+                      maskData,
+                      batch,
+                      imgSizeH,
+                      imgSizeW,
+                      channels,
+                      outputH,
+                      outputW,
+                      outData);
+}
+
+void GpuMatrix::upsampleBackward(Matrix& outputGrad,
+                                 Matrix& mask,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t outputH,
+                                 size_t outputW) {
+  CHECK(outputGrad.useGpu_ == true) << "Matrix type are not equal";
+  CHECK(mask.useGpu_ == true) << "Matrix type are not equal";
+
+  real* outputGradData = outputGrad.getData();
+  real* maskData = mask.getData();
+  real* inputGradData = data_;
+  size_t batch = outputGrad.getHeight();
+
+  CHECK(imgSizeH * imgSizeW == this->getWidth() / channels);
+  CHECK_EQ(batch, this->getHeight());
+  CHECK_EQ(channels * outputH * outputW, outputGrad.getWidth());
+  hl_upsample_backward(outputGradData,
+                       maskData,
+                       batch,
+                       imgSizeH,
+                       imgSizeW,
+                       channels,
+                       outputH,
+                       outputW,
+                       inputGradData);
+}
+
+void GpuMatrix::maxPoolForward(Matrix& inputMat,
+                               size_t imgSizeH,
+                               size_t imgSizeW,
+                               size_t channels,
+                               size_t sizeX,
+                               size_t sizeY,
+                               size_t strideH,
+                               size_t strideW,
+                               size_t outputH,
+                               size_t outputW,
+                               size_t paddingH,
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
+  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
+
+  real* inputData = inputMat.getData();
+  real* maskData = NULL;
+  size_t frameNum = inputMat.getHeight();
+  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputH * outputW * channels);
+
+  if (maskMatP != NULL) {
+    CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
+    CHECK(outputH * outputW * channels == maskMatP->getWidth());
+    maskData = maskMatP->getData();
+  }
+
+  hl_maxpool_forward(frameNum,
+                     inputData,
+                     channels,
+                     imgSizeH,
+                     imgSizeW,
+                     outputH,
+                     outputW,
+                     sizeX,
+                     sizeY,
+                     strideH,
+                     strideW,
+                     paddingH,
+                     paddingW,
+                     data_,
+                     getStride(),
+                     maskData);
+}
+
+void GpuMatrix::maxPoolBackward(Matrix& inputMat,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                Matrix& outGrad,
+                                Matrix& outV,
+                                size_t sizeX,
+                                size_t sizeY,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t outputH,
+                                size_t outputW,
+                                real scaleTargets,
+                                real scaleOutput,
+                                size_t paddingH,
+                                size_t paddingW) {
+  CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true &&
+        outV.useGpu_ == true)
+      << "Matrix type are not equal";
+
+  real* inputData = inputMat.getData();
+  real* outData = outV.getData();
+  real* outDiff = outGrad.getData();
+  size_t frameNum = inputMat.getHeight();
+  size_t channels = outV.getWidth() / outputH / outputW;
+  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(outGrad.getHeight() == outV.getHeight() &&
+        outGrad.getWidth() == outV.getWidth());
+
+  hl_maxpool_backward(frameNum,
+                      inputData,
+                      outData,
+                      outDiff,
+                      channels,
+                      imgSizeH,
+                      imgSizeW,
+                      outputH,
+                      outputW,
+                      sizeX,
+                      sizeY,
+                      strideH,
+                      strideW,
+                      paddingH,
+                      paddingW,
+                      scaleTargets,
+                      scaleOutput,
+                      data_,
+                      outGrad.getStride());
+}
+
+void GpuMatrix::avgPoolForward(Matrix& inputMat,
+                               size_t imgSizeH,
+                               size_t imgSizeW,
+                               size_t channels,
+                               size_t sizeX,
+                               size_t sizeY,
+                               size_t strideH,
+                               size_t strideW,
+                               size_t outputH,
+                               size_t outputW,
+                               size_t paddingH,
+                               size_t paddingW,
+                               bool excludeMode) {
+  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
+
+  real* inputData = inputMat.getData();
+  size_t frameNum = inputMat.getHeight();
+  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputH * outputW * channels);
+
+  hl_avgpool_forward(frameNum,
+                     inputData,
+                     channels,
+                     imgSizeH,
+                     imgSizeW,
+                     outputH,
+                     outputW,
+                     sizeX,
+                     sizeY,
+                     strideH,
+                     strideW,
+                     paddingH,
+                     paddingW,
+                     data_,
+                     getStride(),
+                     excludeMode);
+}
+
+void GpuMatrix::avgPoolBackward(Matrix& outGrad,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t sizeX,
+                                size_t sizeY,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t outputH,
+                                size_t outputW,
+                                real scaleTargets,
+                                real scaleOutput,
+                                size_t paddingH,
+                                size_t paddingW,
+                                bool excludeMode) {
+  CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
+
+  real* outDiff = outGrad.getData();
+  size_t frameNum = outGrad.getHeight();
+  size_t channels = outGrad.getWidth() / outputH / outputW;
+  CHECK(imgSizeH * imgSizeW * channels == width_);
+  CHECK(height_ == outGrad.getHeight());
+  CHECK(outGrad.getWidth() == outputH * outputW * channels);
+
+  hl_avgpool_backward(frameNum,
+                      outDiff,
+                      channels,
+                      imgSizeH,
+                      imgSizeW,
+                      outputH,
+                      outputW,
+                      sizeX,
+                      sizeY,
+                      strideH,
+                      strideW,
+                      paddingH,
+                      paddingW,
+                      scaleTargets,
+                      scaleOutput,
+                      data_,
+                      outGrad.getStride(),
+                      excludeMode);
+}
+
+void GpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 Matrix& maxPoolIdx,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  CHECK(inputMat.useGpu_) << "Matrix type are not correct";
+
+  real* inputData = inputMat.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t num = inputMat.getHeight();
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputD * outputH * outputW * channels);
+
+  hl_maxpool3D_forward(num,
+                       inputData,
+                       channels,
+                       imgSizeD,
+                       imgSizeH,
+                       imgSizeW,
+                       outputD,
+                       outputH,
+                       outputW,
+                       sizeZ,
+                       sizeY,
+                       sizeX,
+                       strideD,
+                       strideH,
+                       strideW,
+                       paddingD,
+                       paddingH,
+                       paddingW,
+                       getData(),
+                       maxPoolIdxData,
+                       getStride());
+}
+
+void GpuMatrix::maxPool3DBackward(Matrix& outGrad,
+                                  Matrix& maxPoolIdx,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal";
+
+  real* outDiff = outGrad.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t frameNum = getHeight();
+  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth());
+  CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() &&
+        outGrad.getWidth() == maxPoolIdx.getWidth());
+
+  hl_maxpool3D_backward(frameNum,
+                        outDiff,
+                        channels,
+                        imgSizeD,
+                        imgSizeH,
+                        imgSizeW,
+                        outputD,
+                        outputH,
+                        outputW,
+                        sizeZ,
+                        sizeY,
+                        sizeX,
+                        strideD,
+                        strideH,
+                        strideW,
+                        paddingD,
+                        paddingH,
+                        paddingW,
+                        scaleTargets,
+                        scaleOutput,
+                        getData(),
+                        maxPoolIdxData,
+                        outGrad.getStride());
+}
+
+void GpuMatrix::avgPool3DForward(Matrix& inputMat,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  CHECK(inputMat.useGpu_) << "Matrix type are not equal";
+
+  real* inputData = inputMat.getData();
+  size_t frameNum = inputMat.getHeight();
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputD * outputH * outputW * channels);
+
+  hl_avgpool3D_forward(frameNum,
+                       inputData,
+                       channels,
+                       imgSizeD,
+                       imgSizeH,
+                       imgSizeW,
+                       outputD,
+                       outputH,
+                       outputW,
+                       sizeZ,
+                       sizeY,
+                       sizeX,
+                       strideD,
+                       strideH,
+                       strideW,
+                       paddingD,
+                       paddingH,
+                       paddingW,
+                       getData(),
+                       getStride());
+}
+
+void GpuMatrix::avgPool3DBackward(Matrix& outGrad,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  CHECK(outGrad.useGpu_) << "Matrix type are not equal";
+
+  real* outDiff = outGrad.getData();
+  size_t frameNum = outGrad.getHeight();
+  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_);
+  CHECK(height_ == outGrad.getHeight());
+  CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels);
+
+  hl_avgpool3D_backward(frameNum,
+                        outDiff,
+                        channels,
+                        imgSizeD,
+                        imgSizeH,
+                        imgSizeW,
+                        outputD,
+                        outputH,
+                        outputW,
+                        sizeZ,
+                        sizeY,
+                        sizeX,
+                        strideD,
+                        strideH,
+                        strideW,
+                        paddingD,
+                        paddingH,
+                        paddingW,
+                        scaleTargets,
+                        scaleOutput,
+                        getData(),
+                        outGrad.getStride());
+}
+
+void GpuMatrix::maxSequenceForward(Matrix& input,
+                                   const IVector& sequence,
+                                   IVector& index) {
+  CHECK(dynamic_cast<GpuMatrix*>(&input));
+  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
+  CHECK(dynamic_cast<GpuIVector*>(&index));
+
+  real* outData = getData();
+  real* inputData = input.getData();
+  const int* starts = sequence.getData();
+  int* maxIndex = index.getData();
+  size_t numSequences = getHeight();
+  size_t dim = getWidth();
+
+  CHECK_EQ(dim, input.getWidth());
+  CHECK_EQ(numSequences, sequence.getSize() - 1);
+  CHECK_EQ(numSequences * dim, index.getSize());
+
+  hl_max_sequence_forward(
+      inputData, starts, outData, maxIndex, numSequences, dim);
+}
+
+void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
+                                    const IVector& sequence,
+                                    IVector& index) {
+  CHECK(dynamic_cast<GpuMatrix*>(&outputGrad));
+  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
+  CHECK(dynamic_cast<GpuIVector*>(&index));
+
+  real* inputGrad = getData();
+  real* outGrad = outputGrad.getData();
+  int* maxIndex = index.getData();
+  size_t dim = getWidth();
+  size_t numSequences = sequence.getSize() - 1;
+
+  CHECK_EQ(dim, outputGrad.getWidth());
+  CHECK_EQ(numSequences, outputGrad.getHeight());
+  CHECK_EQ(numSequences * dim, index.getSize());
+
+  hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
+}
+
+void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
+  CHECK(data.useGpu_ == true && W.useGpu_ == true)
+      << "Matrix type are not equal";
+  real* input = data.getData();
+  real* w = W.getData();
+  size_t numElements = data.getWidth();
+  size_t numSamples = data.getHeight();
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
+  real* output = getData();
+  hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum);
+}
+
+void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
+  CHECK(oGrad.useGpu_ == true && data.useGpu_ == true)
+      << "Matrix type are not equal";
+  real* ograd = oGrad.getData();
+  real* input = data.getData();
+  real* wgrad = data_;
+  size_t numElements = data.getWidth();
+  size_t numSamples = data.getHeight();
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
+  hl_param_relu_backward_w(
+      wgrad, ograd, input, numElements, numSamples, partial_sum);
+}
+
+void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
+  real* diff = data_;
+  real* input = data.getData();
+  real* ograd = oGrad.getData();
+  real* w = W.getData();
+  size_t numElements = data.getWidth();
+  size_t numSamples = data.getHeight();
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
+  hl_param_relu_backward_diff(
+      ograd, input, w, diff, numElements, numSamples, partial_sum);
+}
+
+void GpuMatrix::addColumnVector(const Matrix& b) {
+  BaseMatrix::addColVector(const_cast<Matrix&>(b));
+}
+
+void GpuMatrix::bilinearForward(const Matrix& in,
+                                const size_t inImgH,
+                                const size_t inImgW,
+                                const size_t outImgH,
+                                const size_t outImgW,
+                                const size_t numChannels,
+                                const real ratioH,
+                                const real ratioW) {
+  CHECK(dynamic_cast<const GpuMatrix*>(&in));
+
+  const size_t outputW = getWidth();
+  const size_t outputH = getHeight();
+  const size_t inputW = in.getWidth();
+  const size_t inputH = in.getHeight();
+
+  real* outData = getData();
+  const real* inData = in.getData();
+
+  if (inImgH == outImgW && inImgW == outImgW) {
+    this->copyFrom(in);
+  } else {
+    hl_bilinear_forward(inData,
+                        inImgH,
+                        inImgW,
+                        inputH,
+                        inputW,
+                        outData,
+                        outImgH,
+                        outImgW,
+                        outputH,
+                        outputW,
+                        numChannels,
+                        ratioH,
+                        ratioW);
+  }
+}
+
+void GpuMatrix::bilinearBackward(const Matrix& out,
+                                 const size_t outImgH,
+                                 const size_t outImgW,
+                                 const size_t inImgH,
+                                 const size_t inImgW,
+                                 const size_t numChannels,
+                                 const real ratioH,
+                                 const real ratioW) {
+  CHECK(dynamic_cast<const GpuMatrix*>(&out));
+
+  const size_t inputW = getWidth();
+  const size_t inputH = getHeight();
+  const size_t outputW = out.getWidth();
+  const size_t outputH = out.getHeight();
+
+  real* inGrad = getData();
+  const real* outGrad = out.getData();
+
+  if (outImgH == inImgH && outImgW == inImgW) {
+    this->add(const_cast<Matrix&>(out));
+  } else {
+    hl_bilinear_backward(inGrad,
+                         inImgH,
+                         inImgW,
+                         inputH,
+                         inputW,
+                         outGrad,
+                         outImgH,
+                         outImgW,
+                         outputH,
+                         outputW,
+                         numChannels,
+                         ratioH,
+                         ratioW);
+  }
+}
+
+void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
+  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
+  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
+
+  CHECK(outputPtr && labelPtr) << "Invalid argument pointer";
+  CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported";
+  CHECK(height_ == outputPtr->height_ && width_ == 1 &&
+        outputPtr->width_ == labelPtr->getWidth() &&
+        outputPtr->height_ == labelPtr->getHeight())
+      << "Matrix dimensions are not equal";
+
+  real* output_d = outputPtr->data_;
+  real* entropy_d = data_;
+  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
+  hl_matrix_multi_binary_cross_entropy(
+      output_d, entropy_d, mat_d, height_, outputPtr->width_);
+#endif
+}
+
+void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
+  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
+  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
+
+  CHECK(outputPtr && labelPtr) << "Invalid argument pointer";
+  CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported";
+  CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ &&
+        outputPtr->width_ == labelPtr->getWidth() &&
+        outputPtr->height_ == labelPtr->getHeight())
+      << "Matrix dimensions are not equal";
+
+  real* output_d = outputPtr->data_;
+  real* grad_d = data_;
+  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
+  hl_matrix_multi_binary_cross_entropy_bp(
+      output_d, grad_d, mat_d, height_, width_);
+#endif
+}
+
+void GpuMatrix::vol2Col(real* dataSrc,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW) {
+  hl_matrix_vol2Col(dataSrc,
+                    channels,
+                    depth,
+                    height,
+                    width,
+                    filterD,
+                    filterH,
+                    filterW,
+                    strideD,
+                    strideH,
+                    strideW,
+                    paddingD,
+                    paddingH,
+                    paddingW,
+                    getData());
+}
+
+void GpuMatrix::col2Vol(real* dataDst,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW,
+                        real alpha,
+                        real beta) {
+  hl_matrix_col2Vol(dataDst,
+                    channels,
+                    depth,
+                    height,
+                    width,
+                    filterD,
+                    filterH,
+                    filterW,
+                    strideD,
+                    strideH,
+                    strideW,
+                    paddingD,
+                    paddingH,
+                    paddingW,
+                    getData(),
+                    alpha,
+                    beta);
+}
+
+/**
+ * CpuMatrix
+ */
+
+CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans)
+    : Matrix(std::make_shared<CpuMemoryHandle>(height * width * sizeof(real)),
+             height,
+             width,
+             trans,
+             false) {}
+
+CpuMatrix::~CpuMatrix() {}
+
+void CpuMatrix::zeroMem() {
+  CHECK(data_ != NULL);
+  if (isContiguous()) {
+    memset(data_, 0, height_ * width_ * sizeof(real));
+  } else {
+    BaseMatrix::zero();
+  }
+}
+void CpuMatrix::resetOne() {
+  CHECK(data_ != NULL);
+  BaseMatrix::one();
+}
+
+void CpuMatrix::copyFrom(const Matrix& src) {
+  CHECK(isContiguous());
+  if (typeid(src) == typeid(GpuMatrix)) {
+    CHECK(src.isContiguous());
+    CHECK(elementCnt_ == src.getElementCnt());
+    hl_memcpy_device2host(
+        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
+  } else if (typeid(src) == typeid(CpuMatrix) ||
+             typeid(src) == typeid(SharedCpuMatrix)) {
+    CHECK(src.isContiguous());
+    CHECK(elementCnt_ == src.getElementCnt());
+    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
+  } else if (typeid(src) == typeid(CpuSparseMatrix)) {
+    CHECK_GE(elementCnt_, src.getElementCnt());
+    copyFrom(dynamic_cast<CpuSparseMatrix&>(const_cast<Matrix&>(src)));
+  } else {
+    LOG(FATAL) << "Wrong";
+  }
+}
+
+void CpuMatrix::copyFrom(CpuSparseMatrix& src) {
+  CHECK(isContiguous());
+  CHECK(height_ == src.getHeight());
+  CHECK(width_ == src.getWidth());
+  memset(data_, 0, sizeof(real) * height_ * width_);
+  if (src.getValueType() == FLOAT_VALUE) {
+    if (src.getFormat() == SPARSE_CSC) {
+      int* rows = src.getRows();
+      real* vals = src.getValue();
+      for (size_t i = 0; i < width_; i++) {
+        for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1);
+             j++) {
+          data_[rows[j] * width_ + i] = vals[j];
+        }
+      }
+    } else {
+      int* cols = src.getCols();
+      real* vals = src.getValue();
+      for (size_t i = 0; i < height_; i++) {
+        for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1);
+             j++) {
+          data_[i * width_ + cols[j]] = vals[j];
+        }
+      }
+    }
+  } else {
+    if (src.getFormat() == SPARSE_CSC) {
+      int* rows = src.getRows();
+      for (size_t i = 0; i < width_; i++) {
+        for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1);
+             j++) {
+          data_[rows[j] * width_ + i] = 1.0;
+        }
+      }
+    } else {
+      int* cols = src.getCols();
+      for (size_t i = 0; i < height_; i++) {
+        for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1);
+             j++) {
+          data_[i * width_ + cols[j]] = 1.0;
+        }
+      }
+    }
+  }
+}
+
+void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
+  CHECK(isContiguous());
+  CHECK(src.isContiguous());
+  CHECK(elementCnt_ == src.getElementCnt());
+  if (typeid(src) == typeid(GpuMatrix)) {
+    hl_memcpy_async(this->getData(),
+                    const_cast<real*>(src.getData()),
+                    sizeof(real) * elementCnt_,
+                    stream);
+    // There is a need to add synchronization to ensure that the data is copied.
+    hl_stream_synchronize(stream);
+  } else if (typeid(src) == typeid(CpuMatrix)) {
+    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
+  } else {
+    LOG(FATAL) << "Wrong";
+  }
+}
+
+void CpuMatrix::copyFrom(const real* cpuSrc, size_t size) {
+  CHECK(isContiguous());
+  CHECK(size <= elementCnt_);
+  memcpy(data_, cpuSrc, sizeof(real) * size);
+}
+
+void CpuMatrix::copyFrom(const real* cpuSrc, const int64_t* seq) {
+  CHECK(isContiguous());
+  for (size_t i = 0; i < height_; i++) {
+    memcpy(data_ + i * width_, cpuSrc + seq[i] * width_, sizeof(real) * width_);
+  }
+}
+
+void CpuMatrix::copyFrom(const IVector& src) {
+  CHECK(isContiguous());
+  CHECK(elementCnt_ == src.getSize())
+      << "the src and dst should have same size.";
+  const int* cpuSrc = NULL;
+  IVectorPtr tmp;
+  if (src.useGpu()) {
+    CpuIVector tmp(src.getSize());
+    tmp.copyFrom(src);
+    cpuSrc = tmp.getData();
+  } else {
+    cpuSrc = src.getData();
+  }
+  for (size_t i = 0; i < elementCnt_; ++i) {
+    data_[i] = cpuSrc[i];
+  }
+}
+
+void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
+  size_t height = getHeight();
+  size_t width = getWidth();
+  CHECK_EQ(b.getWidth(), width);
+  const int* index = rowIndex.getData();
+  for (size_t i = 0; i < height; i++) {
+    CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
+    real* src = b.getData() + index[i] * width;
+    real* dst = getData() + i * width;
+    memcpy(dst, src, sizeof(real) * width);
+  }
+}
+
+MatrixPtr CpuMatrix::clone(size_t height, size_t width, bool useGpu) {
+  CHECK(isContiguous());
+
+  if (height == 0 && width == 0) {
+    height = height_;
+    width = width_;
+  }
+
+  CHECK(width && height);
+
+  if (useGpu) {
+    return std::make_shared<GpuMatrix>(height, width);
+  } else {
+    return std::make_shared<CpuMatrix>(height, width);
+  }
+}
+
+void CpuMatrix::resize(size_t newHeight, size_t newWidth) {
+  size_t newSize = newHeight * newWidth;
+  if (NULL == memoryHandle_.get() ||
+      newSize * sizeof(real) > memoryHandle_->getAllocSize()) {
+    memoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
+    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
+  }
+
+  height_ = newHeight;
+  width_ = newWidth;
+  elementCnt_ = newSize;
+  stride_ = width_;
+}
+
+real CpuMatrix::getElement(size_t x, size_t y) const {
+  return data_[x * stride_ + y];
+}
+
+real CpuMatrix::getSum() {
+  CHECK(isContiguous());
+  double sum = 0;
+  for (size_t i = 0; i < height_; ++i) {
+    for (size_t j = 0; j < width_; ++j) {
+      sum += data_[i * width_ + j];
+    }
+  }
+  return sum;
+}
+
+void CpuMatrix::accumulateColSum(Matrix& src) {
+  CHECK_EQ(getWidth(), src.getWidth());
+  CHECK_EQ(getHeight(), (size_t)1);
+
+  sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1);
+}
+
+real CpuMatrix::getAbsSum() {
+  CHECK(isContiguous());
+  double sum = 0;
+  for (size_t i = 0; i < height_; ++i) {
+    for (size_t j = 0; j < width_; ++j) {
+      sum += fabs(data_[i * width_ + j]);
+    }
+  }
+  return sum;
+}
+
+MatrixPtr CpuMatrix::getTranspose() {
+  if (memoryHandle_.get() != NULL) {
+    return std::make_shared<CpuMatrix>(
+        std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle_),
+        height_,
+        width_,
+        true);
+  } else {
+    MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true));
+    return copy_T;
+  }
+}
+
+void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
+  if (memAlloc) {
+    matTrans = std::make_shared<CpuMatrix>(width_, height_);
+  } else {
+    CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
+  }
+  real* dataTrans = matTrans->getData();
+  real* data = getData();
+  int lda = getStride();
+  int ldc = matTrans->getStride();
+
+  for (size_t i = 0; i < height_; i++) {
+    for (size_t j = 0; j < width_; j++) {
+      dataTrans[j * ldc + i] = data[i * lda + j];
+    }
+  }
+}
+
+void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {
+    matRot = std::make_shared<CpuMatrix>(width_, height_);
+  } else {
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+  real* dataRot = matRot->getData();
+  real* data = getData();
+
+  for (size_t i = 0; i < height_; i++) {
+    for (size_t j = 0; j < width_; j++) {
+      if (clockWise) {
+        dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j];
+      } else {
+        dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)];
+      }
+    }
+  }
+}
+
+MatrixPtr CpuMatrix::getInverse() {
+  MatrixPtr matInv;
+  inverse(matInv, true);
+  return matInv;
+}
+
+void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
+  CHECK_EQ(height_, width_);
+
+  if (memAlloc) {
+    matInv = std::make_shared<CpuMatrix>(height_, width_);
+  } else {
+    CHECK(matInv != NULL);
+  }
+
+  CHECK_EQ(height_, matInv->getHeight());
+  CHECK_EQ(width_, matInv->getWidth());
+  matInv->copyFrom(*this);
+
+  real* data = getData();
+  real* dataInv = matInv->getData();
+  int ldc = matInv->getStride();
+
+  if (height_ == 1) {
+    CHECK_NE(*data, 0);
+    *dataInv = 1.0 / (*data);
+    return;
+  }
+
+  /* Compute the LU decomposition of the matrix */
+  std::vector<int> ipiv(height_);
+  CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor);
+  int info = getrf<real>(order, height_, height_, dataInv, ldc, ipiv.data());
+  CHECK_EQ(info, 0);
+
+  /* Compute the inverse of the matrix given its LU decompsotion */
+  info = getri<real>(order, height_, dataInv, ldc, ipiv.data());
+  CHECK_EQ(info, 0);
+}
+
+void CpuMatrix::upsampleForward(Matrix& input,
+                                Matrix& mask,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t outputH,
+                                size_t outputW) {
+  real* inputData = input.getData();
+  real* maskData = mask.getData();
+  real* outData = data_;
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  size_t batch = input.getHeight();
+  CHECK(inLength == input.getWidth() / channels);
+  CHECK_EQ(batch, this->getHeight());
+  CHECK_EQ(channels * outLength, this->getWidth());
+
+  for (size_t k = 0; k < batch; k++) {
+    for (size_t c = 0; c < channels; c++) {
+      for (size_t i = 0; i < inLength; i++) {
+        size_t out_index = static_cast<int>(maskData[i]);
+        if (out_index >= outLength) {
+          LOG(FATAL) << "upsample index " << out_index << " out of range.";
+        }
+        outData[out_index] = inputData[i];
+      }
+      inputData += inLength;
+      maskData += inLength;
+      outData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::upsampleBackward(Matrix& outputGrad,
+                                 Matrix& mask,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t channels,
+                                 size_t outputH,
+                                 size_t outputW) {
+  real* outputGradData = outputGrad.getData();
+  real* maskData = mask.getData();
+  real* inputGradData = data_;
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  size_t batch = outputGrad.getHeight();
+  CHECK(inLength == this->getWidth() / channels);
+  CHECK_EQ(batch, this->getHeight());
+  CHECK_EQ(channels * outLength, outputGrad.getWidth());
+
+  for (size_t k = 0; k < batch; k++) {
+    for (size_t c = 0; c < channels; c++) {
+      for (size_t i = 0; i < inLength; i++) {
+        size_t out_index = static_cast<int>(maskData[i]);
+        if (out_index >= outLength) {
+          LOG(FATAL) << "upsample index " << out_index << " out of range.";
+        }
+        inputGradData[i] = outputGradData[out_index];
+      }
+      inputGradData += inLength;
+      maskData += inLength;
+      outputGradData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::maxPoolForward(Matrix& inputMat,
+                               size_t imgSizeH,
+                               size_t imgSizeW,
+                               size_t channels,
+                               size_t sizeX,
+                               size_t sizeY,
+                               size_t strideH,
+                               size_t strideW,
+                               size_t outputH,
+                               size_t outputW,
+                               size_t paddingH,
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
+  real* inputData = inputMat.getData();
+  real* outData = data_;
+  real* maskData = NULL;
+  size_t num = inputMat.getHeight();
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  CHECK(inLength == inputMat.getWidth() / channels);
+  CHECK_EQ(num, this->getHeight());
+  CHECK_EQ(channels * outLength, this->getWidth());
+  size_t outStride = getStride();
+
+  if (maskMatP != NULL) {
+    maskData = maskMatP->getData();
+    CHECK_EQ(channels * outLength, maskMatP->getWidth());
+  }
+
+  /* pool max one by one */
+  for (size_t n = 0; n < num; ++n) {  // frame by frame
+    if (!isContiguous()) {
+      outData = data_ + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {  // channel by channel
+      for (size_t ph = 0; ph < outputH; ++ph) {
+        int hstart = ph * strideH - paddingH;
+        int hend = hstart + sizeY;
+        hstart = hstart < 0 ? 0 : hstart;
+        hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
+        for (size_t pw = 0; pw < outputW; ++pw) {
+          int wstart = pw * strideW - paddingW;
+          int wend = wstart + sizeX;
+          wstart = wstart < 0 ? 0 : wstart;
+          wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
+
+          real maxval = -(real)FLT_MAX;
+          int max_index = -1;
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              if (maxval < inputData[h * imgSizeW + w]) {
+                maxval = inputData[h * imgSizeW + w];
+                max_index = h * imgSizeW + w;
+              }
+            }
+          }
+
+          outData[ph * outputW + pw] = maxval;
+          if (maskData != NULL) maskData[ph * outputW + pw] = max_index;
+        }
+      }
+      // compute offset
+      inputData += inLength;
+      outData += outLength;
+
+      if (maskData != NULL) maskData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::maxPoolBackward(Matrix& image,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                Matrix& outGrad,
+                                Matrix& outV,
+                                size_t sizeX,
+                                size_t sizeY,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t outputH,
+                                size_t outputW,
+                                real scaleTargets,
+                                real scaleOutput,
+                                size_t paddingH,
+                                size_t paddingW) {
+  size_t num = image.getHeight();
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  size_t channels = size_t(width_ / inLength);
+  CHECK(image.getWidth() == inLength * channels);
+  CHECK(image.getHeight() == height_ && image.getWidth() == width_);
+  CHECK(outV.getHeight() == outGrad.getHeight() &&
+        outV.getWidth() == outGrad.getWidth());
+
+  real* tgtGrad = data_;
+  real* inData = image.getData();
+  real* otData = outV.getData();
+  real* otGrad = outGrad.getData();
+
+  size_t outStride = outV.getStride();
+  real* origOutData = otData;
+  real* origOutGrad = otGrad;
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!outV.isContiguous()) {
+      otData = origOutData + n * outStride;
+      otGrad = origOutGrad + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t ph = 0; ph < outputH; ++ph) {
+        int hstart = ph * strideH - paddingH;
+        int hend = std::min(hstart + sizeY, imgSizeH);
+        hstart = std::max(hstart, 0);
+        for (size_t pw = 0; pw < outputW; ++pw) {
+          int wstart = pw * strideW - paddingW;
+          int wend = std::min(wstart + sizeX, imgSizeW);
+          wstart = std::max(wstart, 0);
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              tgtGrad[h * imgSizeW + w] =
+                  scaleTargets * tgtGrad[h * imgSizeW + w] +
+                  scaleOutput * otGrad[ph * outputW + pw] *
+                      (inData[h * imgSizeW + w] == otData[ph * outputW + pw]);
+            }
+          }
+        }
+      }
+      // offset
+      inData += inLength;
+      tgtGrad += inLength;
+      otData += outLength;
+      otGrad += outLength;
+    }
+  }
+}
+
+void CpuMatrix::avgPoolForward(Matrix& input,
+                               size_t imgSizeH,
+                               size_t imgSizeW,
+                               size_t channels,
+                               size_t sizeX,
+                               size_t sizeY,
+                               size_t strideH,
+                               size_t strideW,
+                               size_t outputH,
+                               size_t outputW,
+                               size_t paddingH,
+                               size_t paddingW,
+                               bool excludeMode) {
+  // The main loop
+  size_t num = input.getHeight();
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  CHECK(inLength * channels == input.getWidth());
+  CHECK(outLength * channels * num == height_ * width_);
+  real* tgtData = data_;
+  real* inData = input.getData();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!isContiguous()) {
+      tgtData = data_ + n * getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t ph = 0; ph < outputH; ++ph) {
+        int hstart = ph * strideH - paddingH;
+        int hend = std::min(hstart + sizeY, imgSizeH);
+        hstart = std::max(hstart, 0);
+        for (size_t pw = 0; pw < outputW; ++pw) {
+          int wstart = pw * strideW - paddingW;
+          int wend = std::min(wstart + sizeX, imgSizeW);
+          wstart = std::max(wstart, 0);
+          tgtData[ph * outputW + pw] = 0;  // clear
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
+            }
+          }
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+          CHECK(poolSize);
+          tgtData[ph * outputW + pw] /= poolSize;
+        }
+      }
+      // compute offset
+      inData += inLength;
+      tgtData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::avgPoolBackward(Matrix& input,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t sizeX,
+                                size_t sizeY,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t outputH,
+                                size_t outputW,
+                                real scaleTargets,
+                                real scaleOutput,
+                                size_t paddingH,
+                                size_t paddingW,
+                                bool excludeMode) {
+  size_t num = input.getHeight();
+  size_t channels = input.getWidth() / outputH / outputW;
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  CHECK(inLength * channels == getWidth());
+  real* inData = input.getData();
+  real* outData = getData();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!input.isContiguous()) {
+      inData = input.getData() + n * input.getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t ph = 0; ph < outputH; ++ph) {
+        int hstart = ph * strideH - paddingH;
+        int hend = std::min(hstart + sizeY, imgSizeH);
+        hstart = std::max(hstart, 0);
+        for (size_t pw = 0; pw < outputW; ++pw) {
+          int wstart = pw * strideW - paddingW;
+          int wend = std::min(wstart + sizeX, imgSizeW);
+          wstart = std::max(wstart, 0);
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+          CHECK(poolSize);
+
+          for (int h = hstart; h < hend; ++h) {
+            for (int w = wstart; w < wend; ++w) {
+              outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolSize;
+            }
+          }
+        }
+      }
+      // offset
+      outData += inLength;
+      inData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 Matrix& maxPoolIdx,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  real* inputData = inputMat.getData();
+  real* outData = getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t num = inputMat.getHeight();
+  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
+  size_t outLength = outputH * outputW * outputD;
+  CHECK(inLength == inputMat.getWidth() / channels);
+  CHECK_EQ(num, this->getHeight());
+  CHECK_EQ(channels * outLength, this->getWidth());
+  size_t outStride = getStride();
+
+  /* initialize the data_ */
+  for (size_t i = 0; i < height_; i++) {
+    for (size_t j = 0; j < width_; j++) {
+      outData[(i)*outStride + j] = -(real)FLT_MAX;
+      maxPoolIdxData[(i)*outStride + j] = -1;
+    }
+  }
+
+  /* pool max one by one */
+  for (size_t n = 0; n < num; ++n) {  // frame by frame
+    if (!isContiguous()) {
+      outData = getData() + n * outStride;
+      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {  // channel by channel
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        int dstart = pd * strideD - paddingD;
+        int dend = std::min(dstart + sizeZ, imgSizeD);
+        dstart = std::max(dstart, 0);
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          int hstart = ph * strideH - paddingH;
+          int hend = std::min(hstart + sizeY, imgSizeH);
+          hstart = std::max(hstart, 0);
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int wstart = pw * strideW - paddingW;
+            int wend = std::min(wstart + sizeX, imgSizeW);
+            wstart = std::max(wstart, 0);
+            int maxIdx = -1;
+            real maxOutData = outData[(pd * outputH + ph) * outputW + pw];
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  if (maxOutData <
+                      inputData[(d * imgSizeH + h) * imgSizeW + w]) {
+                    maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w];
+                    maxIdx = (d * imgSizeH + h) * imgSizeW + w;
+                  }
+                }
+              }
+            }
+            outData[(pd * outputH + ph) * outputW + pw] = maxOutData;
+            maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx;
+          }
+        }
+      }
+      // compute offset
+      inputData += inLength;
+      outData += outLength;
+      maxPoolIdxData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::maxPool3DBackward(Matrix& outGrad,
+                                  Matrix& maxPoolIdx,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  size_t num = getHeight();
+  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
+  size_t outLength = outputH * outputW * outputD;
+  size_t channels = size_t(width_ / inLength);
+  CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() &&
+        maxPoolIdx.getWidth() == outGrad.getWidth());
+
+  real* tgtGrad = getData();
+  real* otGrad = outGrad.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t outStride = outGrad.getStride();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!outGrad.isContiguous()) {
+      otGrad = outGrad.getData() + n * outStride;
+      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            const size_t index = (pd * outputH + ph) * outputW + pw;
+            const size_t tgtIdx = static_cast<size_t>(maxPoolIdxData[index]);
+            tgtGrad[tgtIdx] =
+                scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index];
+          }
+        }
+      }
+      // offset
+      tgtGrad += inLength;
+      otGrad += outLength;
+      maxPoolIdxData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::avgPool3DForward(Matrix& input,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  // The main loop
+  size_t num = input.getHeight();
+  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
+  size_t outLength = outputH * outputW * outputD;
+  CHECK(inLength * channels == input.getWidth());
+  CHECK(outLength * channels * num == height_ * width_);
+  real* tgtData = getData();
+  real* inData = input.getData();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!isContiguous()) {
+      tgtData = data_ + n * getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        int dstart = pd * strideD - paddingD;
+        int dend = std::min(dstart + sizeZ, imgSizeD);
+        dstart = std::max(dstart, 0);
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          int hstart = ph * strideH - paddingH;
+          int hend = std::min(hstart + sizeY, imgSizeH);
+          hstart = std::max(hstart, 0);
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int wstart = pw * strideW - paddingW;
+            int wend = std::min(wstart + sizeX, imgSizeW);
+            wstart = std::max(wstart, 0);
+
+            tgtData[(pd * outputH + ph) * outputW + pw] = 0;  // clear
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  tgtData[(pd * outputH + ph) * outputW + pw] +=
+                      inData[(d * imgSizeH + h) * imgSizeW + w];
+                }
+              }
+            }
+            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            CHECK(poolSize);
+            tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize;
+          }
+        }
+      }
+      // compute offset
+      inData += inLength;
+      tgtData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::avgPool3DBackward(Matrix& input,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  size_t num = input.getHeight();
+  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
+  size_t outLength = outputH * outputW * outputD;
+  size_t channels = input.getWidth() / outLength;
+  CHECK(inLength * channels == getWidth());
+  real* inData = input.getData();
+  real* outData = getData();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!input.isContiguous()) {
+      inData = input.getData() + n * input.getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        int dstart = pd * strideD - paddingD;
+        int dend = std::min(dstart + sizeZ, imgSizeD);
+        dstart = std::max(dstart, 0);
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          int hstart = ph * strideH - paddingH;
+          int hend = std::min(hstart + sizeY, imgSizeH);
+          hstart = std::max(hstart, 0);
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int wstart = pw * strideW - paddingW;
+            int wend = std::min(wstart + sizeX, imgSizeW);
+            wstart = std::max(wstart, 0);
+            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            CHECK(poolSize);
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  outData[(d * imgSizeH + h) * imgSizeW + w] +=
+                      inData[(pd * outputH + ph) * outputW + pw] / poolSize;
+                }
+              }
+            }
+          }
+        }
+      }
+      // offset
+      outData += inLength;
+      inData += outLength;
+    }
+  }
+}
+
+/**
+ * Input: one or more sequences. Each sequence contains some instances.
+ * Output: output size is the number of input sequences (NOT input instances).
+ * output[i] is set to max_{for each instance in this sequence}{input[i]}
+ */
+void CpuMatrix::maxSequenceForward(Matrix& input,
+                                   const IVector& sequence,
+                                   IVector& index) {
+  CHECK(dynamic_cast<CpuMatrix*>(&input));
+  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
+  CHECK(dynamic_cast<CpuIVector*>(&index));
+
+  real* outData = getData();
+  real* inputData = input.getData();
+  const int* starts = sequence.getData();
+  int* maxIndex = index.getData();
+  size_t numSequences = getHeight();
+  size_t dim = getWidth();
+
+  CHECK_EQ(dim, input.getWidth());
+  CHECK_EQ(numSequences, sequence.getSize() - 1);
+  CHECK_EQ(starts[numSequences], (int)input.getHeight());
+  CHECK_EQ(numSequences * dim, index.getSize());
+
+  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
+    // current sequence, loop for each input instance
+    // (1) first instance: do not need compare, copy value to outV directly
+    for (size_t k = 0; k < dim; ++k) {
+      outData[sequenceId * dim + k] = inputData[starts[sequenceId] * dim + k];
+      maxIndex[sequenceId * dim + k] = starts[sequenceId];
+    }
+    // (2) other instance in same sequence
+    for (int insId = starts[sequenceId] + 1; insId < starts[sequenceId + 1];
+         ++insId) {
+      // insId is the index on all instances
+      for (size_t k = 0; k < dim; ++k) {
+        // for each dim
+        if (inputData[insId * dim + k] > outData[sequenceId * dim + k]) {
+          // update max value and record index
+          outData[sequenceId * dim + k] = inputData[insId * dim + k];
+          maxIndex[sequenceId * dim + k] = insId;
+        }
+      }
+    }
+  }
+}
+
+void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
+                                    const IVector& sequence,
+                                    IVector& index) {
+  CHECK(dynamic_cast<CpuMatrix*>(&outputGrad));
+  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
+  CHECK(dynamic_cast<CpuIVector*>(&index));
+
+  real* inputGrad = getData();
+  real* outGrad = outputGrad.getData();
+  int* maxIndex = index.getData();
+  size_t dim = getWidth();
+  size_t numSequences = sequence.getSize() - 1;
+
+  CHECK_EQ(dim, outputGrad.getWidth());
+  CHECK_EQ(numSequences, outputGrad.getHeight());
+  CHECK_EQ(numSequences * dim, index.getSize());
+
+  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
+    // current sequence
+    for (size_t j = 0; j < dim; ++j) {
+      // each dim
+      int insId = maxIndex[sequenceId * dim + j];
+      inputGrad[insId * dim + j] += outGrad[sequenceId * dim + j];
+    }
+  }
+}
+
+inline void vecAddTo(real* a, const real* b, size_t len) {
+  for (unsigned int i = 0; i < len; ++i) {
+    a[i] += b[i];
+  }
+}
+
+inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
+  for (unsigned int i = 0; i < len; ++i) {
+    a[i] += scaleB * b[i];
+  }
+}
+
+inline void colVecAddTo(
+    real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) {
+  for (unsigned int i = 0; i < len; ++i) {
+    a[i * aWidth] += b[i * bWidth];
+  }
+}
+
+inline void colVecAddTo(
+    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
+  for (unsigned int i = 0; i < len; ++i) {
+    a[i * aWidth] += b[i * bWidth] * c;
+  }
+}
+
+void CpuMatrix::addBias(Matrix& b, real scale) {
+  CHECK(b.useGpu_ == false) << "Matrix type are not equal";
+
+  CHECK_EQ(b.getHeight(), (size_t)1);
+  CHECK_EQ(width_, b.getWidth());
+  real* aData = getData();
+  real* bData = b.getData();
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+
+  if (scale == 1 && getStride() % 32 == 0) {  // use libaddto
+    // @TODO(yuyang18) Make input addr can be unaligned.
+    // So merge this if and else
+    CHECK_EQ((size_t)aData % 32, 0UL);
+    CHECK_EQ((size_t)bData % 32, 0UL);
+    for (size_t i = 0; i < numSamples; i++) {
+      simd::addTo(aData + i * getStride(), bData, dim);
+    }
+  } else {
+    for (size_t i = 0; i < numSamples; i++) {
+      for (size_t j = 0; j < dim; j++) {
+        aData[i * getStride() + j] += scale * bData[j];
+      }
+    }
+  }
+}
+
+void CpuMatrix::addSharedBias(Matrix& b, real scale) {
+  CHECK_EQ(b.getHeight(), (size_t)1);
+  real* aData = getData();
+  real* bData = b.getData();
+  size_t numSamples = getHeight();
+  size_t channel = b.getWidth();
+  CHECK_EQ(getWidth() % channel, 0UL);
+  size_t dim = getWidth() / channel;
+
+  for (size_t i = 0; i < numSamples; i++) {
+    for (size_t c = 0; c < channel; c++) {
+      for (size_t j = 0; j < dim; j++) {
+        aData[i * getStride() + c * dim + j] += scale * bData[c];
+      }
+    }
+  }
+}
+
+void CpuMatrix::collectBias(Matrix& a, real scale) {
+  CHECK_EQ(getHeight(), (size_t)1);
+  CHECK_EQ(width_, a.getWidth());
+  CpuSparseMatrix* aptr = dynamic_cast<CpuSparseMatrix*>(&a);
+  if (!aptr) {
+    sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1);
+  } else {
+    size_t nnz = aptr->getElementCnt();
+    int* cols = aptr->getCols();
+    real* A = aptr->getValue();
+    real* B = getData();
+    for (size_t i = 0; i < nnz; i++) {
+      B[cols[i]] += scale * A[i];
+    }
+  }
+}
+
+void CpuMatrix::collectSharedBias(Matrix& a, real scale) {
+  CHECK_EQ(getHeight(), (size_t)1);
+  real* B = getData();
+  real* A = a.getData();
+  size_t numSamples = a.getHeight();
+  size_t channel = getWidth();
+  CHECK_EQ(a.getWidth() % channel, 0UL);
+  size_t dim = a.getWidth() / channel;
+  for (size_t i = 0; i < numSamples; i++) {
+    for (size_t c = 0; c < channel; c++) {
+      for (size_t j = 0; j < dim; j++) {
+        B[c] += scale * A[i * channel * dim + c * dim + j];
+      }
+    }
+  }
+}
+
+void CpuMatrix::sequenceAvgForward(Matrix& a,
+                                   const IVector& startsPos,
+                                   int mode) {
+  size_t height = getHeight();
+  size_t width = getWidth();
+  CHECK_EQ(height, startsPos.getSize() - 1);
+  CHECK_EQ(width, a.getWidth());
+  real* dst = getData();
+  real* src = a.getData();
+  const int* starts = startsPos.getData();
+  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
+  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
+  for (size_t i = 0; i < height; i++) {
+    int sequenceLength = starts[i + 1] - starts[i];
+    if (0 == sequenceLength) {
+      // empty sequence
+      continue;
+    }
+    outMtx->setData(dst + i * width);
+    dataMtx->setData(src + starts[i] * width, sequenceLength, width);
+    if (mode == 0) {
+      // plain average
+      outMtx->sumCols(*dataMtx,
+                      (real)1 / (real)sequenceLength,
+                      /* scaleDest= */ 1);
+    } else if (mode == 1) {
+      // sum instead of average
+      outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1);
+    } else if (mode == 2) {
+      // divide by square root of sequenceLength
+      outMtx->sumCols(*dataMtx,
+                      (real)1 / std::sqrt(sequenceLength),
+                      /* scaleDest= */ 1);
+    } else {
+      LOG(FATAL) << "should not reach here";
+    }
+  }
+}
+
+void CpuMatrix::sequenceAvgBackward(Matrix& a,
+                                    const IVector& startsPos,
+                                    int mode) {
+  size_t height = a.getHeight();
+  size_t width = getWidth();
+  CHECK_EQ(height, startsPos.getSize() - 1);
+  CHECK_EQ(width, a.getWidth());
+  real* dst = getData();
+  real* src = a.getData();
+  const int* starts = startsPos.getData();
+  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
+  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
+  for (size_t i = 0; i < height; ++i) {
+    int sequenceLength = starts[i + 1] - starts[i];
+    if (0 == sequenceLength) {
+      // empty sequence
+      continue;
+    }
+    outMtx->setData(dst + starts[i] * width, sequenceLength, width);
+    dataMtx->setData(src + i * width);
+    if (mode == 0) {
+      // plain average
+      outMtx->addBias(*dataMtx, 1.0f / sequenceLength);
+    } else if (mode == 1) {
+      // sum instead of average
+      outMtx->addBias(*dataMtx, 1.0f);
+    } else if (mode == 2) {
+      // divide by square root of sequenceLength
+      outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength));
+    } else {
+      LOG(FATAL) << "should not reach here";
+    }
+  }
+}
+
+/* this = scaleAB*(a*b) + scaleT*this*/
+void CpuMatrix::mul(const Matrix& a,
+                    const Matrix& b,
+                    real scaleAB,
+                    real scaleT) {
+  CHECK(!isTransposed()) << "Not supported";
+  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
+  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
+  const auto a_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&a);
+  const auto b_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&b);
+
+  if (a_ptr && b_ptr) {
+    mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT);
+  } else if (a_ptr_s && b_ptr) {
+    mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT);
+  } else if (a_ptr && b_ptr_s) {
+    mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT);
+  } else {
+    LOG(FATAL) << "Not supported";
+  }
+}
+
+void CpuMatrix::mul(CpuSparseMatrix* a,
+                    CpuMatrix* b,
+                    real scaleAB,
+                    real scaleT) {
+  if (dynamic_cast<CacheRowCpuMatrix*>(b)) {
+    return mul(a, dynamic_cast<CacheRowCpuMatrix*>(b), this, scaleAB, scaleT);
+  } else if (dynamic_cast<SparseRowCpuMatrix*>(b)) {
+    return mul(a, dynamic_cast<SparseRowCpuMatrix*>(b), this, scaleAB, scaleT);
+  } else {
+    return mul(a, b, this, scaleAB, scaleT);
+  }
+}
+
+void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
+  CHECK(!isTransposed()) << "Not supported";
+
+  size_t a_col, b_col, a_row, b_row;
+  bool a_trans, b_trans;
+  if (!a->isTransposed()) {
+    a_col = a->getWidth();
+    a_row = a->getHeight();
+    a_trans = false;
+  } else {
+    a_col = a->getHeight();
+    a_row = a->getWidth();
+    a_trans = true;
+  }
+  if (!b->isTransposed()) {
+    b_col = b->getWidth();
+    b_row = b->getHeight();
+    b_trans = false;
+  } else {
+    b_col = b->getHeight();
+    b_row = b->getWidth();
+    b_trans = true;
+  }
+
+  CHECK_EQ(a_col, b_row);
+  CHECK_EQ(a_row, getHeight());
+  CHECK_EQ(b_col, getWidth());
+
+  real* A = a->getData();
+  real* B = b->getData();
+  real* C = getData();
+
+  int M = getHeight();
+  int N = getWidth();
+  int K = a_col;
+  int lda = a->getStride();
+  int ldb = b->getStride();
+  int ldc = getStride();
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
+      a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
+}
+
+void CpuMatrix::mul(
+    CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) {
+  CHECK(!c->isTransposed()) << "Not supported";
+  CHECK_EQ(c->getValueType(), FLOAT_VALUE);
+
+  real* A = a->getData();
+  real* B = b->getData();
+  real* C = c->getValue();
+  int* rows = c->getRows();
+  int* cols = c->getCols();
+  size_t height = c->getHeight();
+  size_t width = c->getWidth();
+  if (scaleT == 0) {
+    c->zeroMem();
+  }
+
+  if (!a->isTransposed() && !b->isTransposed()) {
+    size_t m = a->getWidth();
+    CHECK_EQ(b->getHeight(), m);
+    CHECK_EQ(a->getHeight(), height);
+    CHECK_EQ(b->getWidth(), width);
+    if (c->getFormat() == SPARSE_CSC) {
+      for (size_t i = 0; i < width; i++) {
+        size_t start = c->getColStartIdx(i);
+        size_t end = c->getColStartIdx(i + 1);
+        for (size_t j = start; j < end; j++) {
+          real sum = 0;
+          size_t rowIdx = rows[j];
+          for (size_t k = 0; k < m; k++) {
+            sum += A[rowIdx * m + k] * B[k * width + i];
+          }
+          C[j] = scaleAB * sum + scaleT * C[j];
+        }
+      }
+    } else {
+      for (size_t i = 0; i < height; i++) {
+        size_t start = c->getRowStartIdx(i);
+        size_t end = c->getRowStartIdx(i + 1);
+        for (size_t j = start; j < end; j++) {
+          real sum = 0;
+          size_t colIdx = cols[j];
+          for (size_t k = 0; k < m; k++) {
+            sum += A[i * m + k] * B[k * width + colIdx];
+          }
+          C[j] = scaleAB * sum + scaleT * C[j];
+        }
+      }
+    }
+  } else if (a->isTransposed() && !b->isTransposed()) {
+    size_t m = a->getHeight();
+    CHECK_EQ(m, b->getHeight());
+    CHECK_EQ(b->getWidth(), width);
+    CHECK_EQ(a->getWidth(), height);
+
+    if (c->getFormat() == SPARSE_CSC) {
+      for (size_t i = 0; i < width; i++) {
+        size_t start = c->getColStartIdx(i);
+        size_t end = c->getColStartIdx(i + 1);
+        for (size_t j = start; j < end; j++) {
+          real sum = 0;
+          size_t rowIdx = rows[j];
+          for (size_t k = 0; k < m; k++) {
+            sum += A[k * height + rowIdx] * B[k * width + i];
+          }
+          C[j] = scaleAB * sum + scaleT * C[j];
+        }
+      }
+    } else {
+      for (size_t i = 0; i < height; i++) {
+        int start = c->getRowStartIdx(i);
+        int end = c->getRowStartIdx(i + 1);
+        for (int j = start; j < end; j++) {
+          real sum = 0;
+          size_t colIdx = cols[j];
+          for (size_t k = 0; k < m; k++) {
+            sum += A[k * height + i] * B[k * width + colIdx];
+          }
+          C[j] = scaleAB * sum + scaleT * C[j];
+        }
+      }
+    }
+  } else if (!a->isTransposed() && b->isTransposed()) {
+    size_t m = a->getWidth();
+    CHECK_EQ(b->getWidth(), m);
+    CHECK_EQ(a->getHeight(), height);
+    CHECK_EQ(b->getHeight(), width);
+    if (c->getFormat() == SPARSE_CSR) {
+      for (size_t i = 0; i < height; i++) {
+        size_t start = c->getRowStartIdx(i);
+        size_t end = c->getRowStartIdx(i + 1);
+        for (size_t j = start; j < end; j++) {
+          real sum = 0;
+          size_t colIdx = cols[j];
+          for (size_t k = 0; k < m; k++) {
+            sum += A[i * m + k] * B[colIdx * m + k];
+          }
+          C[j] = scaleAB * sum + scaleT * C[j];
+        }
+      }
+    } else {
+      LOG(FATAL) << "Not supported csc format "
+                    "when a is not trans and b is trans";
+    }
+  } else {
+    LOG(FATAL) << "Not supported";
+  }
+}
+
+void CpuMatrix::mul(CpuMatrix* a,
+                    CpuSparseMatrix* b,
+                    real scaleAB,
+                    real scaleT) {
+  CHECK(!trans_) << "Not supported";
+  CHECK(!a->isTransposed()) << "Not supported";
+  CHECK(scaleT == 0 || scaleT == 1);
+
+  // TODO(yuyang18): Maybe bug implementation here
+  CHECK_EQ(scaleAB, static_cast<real>(1.0));
+
+  real* A = a->getData();
+  real* B = b->getValue();
+  real* C = getData();
+  int* rows = b->getRows();
+  int* cols = b->getCols();
+
+  if (scaleT == 0) {
+    zeroMem();
+  }
+  if (b->getFormat() == SPARSE_CSC) {
+    if (!b->isTransposed()) {
+      size_t m = a->getWidth();
+      CHECK_EQ(b->getHeight(), m);
+      CHECK_EQ(a->getHeight(), height_);
+      CHECK_EQ(b->getWidth(), width_);
+
+      if (b->getValueType() == NO_VALUE) {
+        for (size_t j = 0; j < b->getWidth(); ++j) {
+          int start = b->getColStartIdx(j);
+          int end = b->getColStartIdx(j + 1);
+          for (int i = start; i < end; ++i) {
+            colVecAddTo(C + j, A + rows[i], height_, width_, a->getWidth());
+          }
+        }
+      } else if (b->getValueType() == FLOAT_VALUE) {
+        for (size_t j = 0; j < b->getWidth(); ++j) {
+          int start = b->getColStartIdx(j);
+          int end = b->getColStartIdx(j + 1);
+          for (int i = start; i < end; ++i) {
+            colVecAddTo(
+                C + j, A + rows[i], B[i], height_, width_, a->getWidth());
+          }
+        }
+      }
+    } else /*if (b->isTransposed())*/ {
+      size_t m = a->getWidth();
+      CHECK_EQ(b->getHeight(), width_);
+      CHECK_EQ(a->getHeight(), height_);
+      CHECK_EQ(b->getWidth(), m);
+      if (b->getValueType() == NO_VALUE) {
+        for (size_t i = 0; i < b->getWidth(); ++i) {
+          int start = b->getColStartIdx(i);
+          int end = b->getColStartIdx(i + 1);
+          for (int j = start; j < end; ++j) {
+            colVecAddTo(C + rows[j], A + i, height_, width_, a->getWidth());
+          }
+        }
+      } else if (b->getValueType() == FLOAT_VALUE) {
+        for (size_t i = 0; i < b->getWidth(); ++i) {
+          int start = b->getColStartIdx(i);
+          int end = b->getColStartIdx(i + 1);
+          for (int j = start; j < end; ++j) {
+            colVecAddTo(
+                C + rows[j], A + i, B[j], height_, width_, a->getWidth());
+          }
+        }
+      }
+    }
+  } else {
+    if (!b->isTransposed()) {
+      size_t m = a->getWidth();
+      CHECK_EQ(b->getHeight(), m);
+      CHECK_EQ(a->getHeight(), height_);
+      CHECK_EQ(b->getWidth(), width_);
+
+      if (b->getValueType() == NO_VALUE) {
+        for (size_t j = 0; j < b->getHeight(); ++j) {
+          int start = b->getRowStartIdx(j);
+          int end = b->getRowStartIdx(j + 1);
+          for (int i = start; i < end; ++i) {
+            colVecAddTo(C + cols[i], A + j, height_, width_, a->getWidth());
+          }
+        }
+      } else if (b->getValueType() == FLOAT_VALUE) {
+        for (size_t j = 0; j < b->getHeight(); ++j) {
+          int start = b->getRowStartIdx(j);
+          int end = b->getRowStartIdx(j + 1);
+          for (int i = start; i < end; ++i) {
+            colVecAddTo(
+                C + cols[i], A + j, B[i], height_, width_, a->getWidth());
+          }
+        }
+      }
+    } else /*if (b->isTransposed())*/ {
+      size_t m = a->getWidth();
+      CHECK_EQ(b->getHeight(), width_);
+      CHECK_EQ(a->getHeight(), height_);
+      CHECK_EQ(b->getWidth(), m);
+      if (b->getValueType() == NO_VALUE) {
+        for (size_t i = 0; i < b->getHeight(); ++i) {
+          int start = b->getRowStartIdx(i);
+          int end = b->getRowStartIdx(i + 1);
+          for (int j = start; j < end; ++j) {
+            colVecAddTo(C + i, A + cols[j], height_, width_, a->getWidth());
+          }
+        }
+      } else if (b->getValueType() == FLOAT_VALUE) {
+        for (size_t i = 0; i < b->getHeight(); ++i) {
+          int start = b->getRowStartIdx(i);
+          int end = b->getRowStartIdx(i + 1);
+          for (int j = start; j < end; ++j) {
+            colVecAddTo(
+                C + i, A + cols[j], B[j], height_, width_, a->getWidth());
+          }
+        }
+      }
+    }
+  }
+}
+
+void CpuMatrix::selectRows(Matrix& table, IVector& ids) {
+  if (dynamic_cast<CacheRowCpuMatrix*>(&table)) {
+    selectRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids);
+  } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) {
+    selectRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids);
+  } else {
+    CHECK(table.isContiguous());
+    selectRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids);
+  }
+}
+
+void CpuMatrix::selectElements(Matrix& table, IVector& ids) {
+  CHECK_EQ(table.getHeight(), ids.getSize());
+  CHECK_EQ(getHeight(), ids.getSize());
+  CHECK_EQ(getWidth(), 1U);
+  real* tableData = table.getData();
+  int* idsData = ids.getData();
+  for (size_t i = 0; i < table.getHeight(); i++) {
+    data_[i] += tableData[i * table.getWidth() + idsData[i]];
+  }
+}
+
+void CpuMatrix::addElements(Matrix& table, IVector& ids) {
+  CHECK_EQ(table.getHeight(), ids.getSize());
+  CHECK_EQ(getHeight(), ids.getSize());
+  CHECK_EQ(getWidth(), 1U);
+  real* tableData = table.getData();
+  int* idsData = ids.getData();
+  for (size_t i = 0; i < table.getHeight(); i++) {
+    tableData[i * table.getWidth() + idsData[i]] += data_[i];
+  }
+}
+
+// this.row[i] += table.row[ids[i]]
+template <typename TableMatType>
+void CpuMatrix::selectRowsImp(TableMatType& table, IVector& ids) {
+  CHECK(!table.useGpu());
+  CHECK(!ids.useGpu());
+  CHECK_EQ(getHeight(), ids.getSize());
+  CHECK_EQ(getWidth(), table.getWidth());
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  real* a = getData();
+  size_t tableSize = table.getHeight();
+  int* index = ids.getData();
+
+  for (size_t i = 0; i < numSamples; ++i) {
+    if (index[i] == -1) continue;
+    CHECK_LT(index[i], (int)tableSize);
+    CHECK_GE(index[i], 0);
+    vecAddTo(a + i * stride_, table.getRow(index[i]), dim);
+  }
+}
+
+void CpuMatrix::addToRows(Matrix& table, IVector& ids) {
+  if (dynamic_cast<CacheRowCpuMatrix*>(&table)) {
+    addToRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids);
+  } else if (dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table)) {
+    addToRowsImp(*dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table), ids);
+  } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) {
+    addToRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids);
+  } else {
+    CHECK(table.isContiguous());
+    addToRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids);
+  }
+}
+
+// table.row[ids[i]] += this.row[i]
+template <typename TableMatType>
+void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) {
+  CHECK(!table.useGpu());
+  CHECK(!ids.useGpu());
+  CHECK_EQ(getHeight(), ids.getSize());
+  CHECK_EQ(getWidth(), table.getWidth());
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  real* a = getData();
+  size_t tableSize = table.getHeight();
+  int* index = ids.getData();
+
+  for (size_t i = 0; i < numSamples; ++i) {
+    if (index[i] == -1) continue;
+    CHECK_LT(index[i], (int)tableSize);
+    CHECK_GE(index[i], 0);
+    vecAddTo(table.getRow(index[i]), a + i * stride_, dim);
+  }
+}
+
+static ThreadLocal<std::vector<const real*>> threadLocalColArray;
+
+template <typename MatBType, typename MatCType>
+void CpuMatrix::mul(
+    CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) {
+  CHECK(!c->isTransposed()) << "Not supported";
+  CHECK(!b->isTransposed()) << "Not supported";
+  // TODO(yuyang18): Maybe bug implementation here.
+  CHECK(scaleAB == 1) << "Not supported";
+  CHECK(scaleT == 0 || scaleT == 1) << "Not supported";
+  CHECK_EQ(a->getFormat(), SPARSE_CSR) << "Not supported";
+
+  real* B = b->getData();
+  real* C = c->getData();
+  size_t height = c->getHeight();
+  size_t width = c->getWidth();
+  int* cols = a->getCols();
+  real* values = a->getValue();
+
+  if (scaleT == 0) {
+    c->zeroMem();
+  }
+
+  if (!a->isTransposed()) {
+    size_t m = a->getWidth();
+    CHECK_EQ(b->getHeight(), m);
+    CHECK_EQ(a->getHeight(), height);
+    CHECK_EQ(b->getWidth(), width);
+
+    if (a->getValueType() == NO_VALUE) {
+      if (width % 32 == 0) {  // use libaddto
+        // @TODO(yuyang18) Make input addr can be unaligned.
+        // So merge this if and else
+        CHECK_EQ((size_t)B % 32, 0UL);
+        CHECK_EQ((size_t)C % 32, 0UL);
+        auto& colArray = *threadLocalColArray;
+        for (size_t i = 0; i < a->getHeight(); ++i) {
+          const int start = a->getRowStartIdx(i);
+          const int end = a->getRowStartIdx(i + 1);
+          size_t colNum = end - start;
+          colArray.resize(colNum);
+          for (int j = 0; j < end - start; ++j) {
+            colArray[j] = b->getRow(cols[j + start]);
+          }
+          simd::batchAddTo(c->getRow(i), &colArray[0], colNum, width);
+        }
+
+      } else {
+        for (size_t i = 0; i < a->getHeight(); ++i) {
+          const int start = a->getRowStartIdx(i);
+          const int end = a->getRowStartIdx(i + 1);
+          for (int j = start; j < end; ++j) {
+            vecAddTo(c->getRow(i), b->getRow(cols[j]), width);
+          }
+        }
+      }
+    } else if (a->getValueType() == FLOAT_VALUE) {
+      for (size_t i = 0; i < a->getHeight(); ++i) {
+        const int start = a->getRowStartIdx(i);
+        const int end = a->getRowStartIdx(i + 1);
+        for (int j = start; j < end; ++j) {
+          vecAddTo(c->getRow(i), b->getRow(cols[j]), values[j], width);
+        }
+      }
+    }
+  } else /*if (a->isTransposed())*/ {
+    size_t m = a->getHeight();
+    CHECK_EQ(b->getHeight(), m);
+    CHECK_EQ(a->getWidth(), height);
+    CHECK_EQ(b->getWidth(), width);
+    if (a->getValueType() == NO_VALUE) {
+      if (width % 32 == 0) {  // use libaddto
+        // @TODO(yuyang18) Make input addr can be unaligned.
+        // So merge this if and else
+        CHECK_EQ((size_t)B % 32, 0UL);
+        CHECK_EQ((size_t)C % 32, 0UL);
+        for (size_t i = 0; i < a->getHeight(); ++i) {
+          const int start = a->getRowStartIdx(i);
+          const int end = a->getRowStartIdx(i + 1);
+          for (int j = start; j < end; ++j) {
+            simd::addTo(c->getRow(cols[j]), b->getRow(i), width);
+          }
+        }
+
+      } else {
+        for (size_t i = 0; i < a->getHeight(); ++i) {
+          const int start = a->getRowStartIdx(i);
+          const int end = a->getRowStartIdx(i + 1);
+          for (int j = start; j < end; ++j) {
+            vecAddTo(c->getRow(cols[j]), b->getRow(i), width);
+          }
+        }
+      }
+    } else if (a->getValueType() == FLOAT_VALUE) {
+      for (size_t i = 0; i < a->getHeight(); ++i) {
+        const int start = a->getRowStartIdx(i);
+        const int end = a->getRowStartIdx(i + 1);
+        for (int j = start; j < end; ++j) {
+          vecAddTo(c->getRow(cols[j]), b->getRow(i), values[j], width);
+        }
+      }
+    }
+  }
+}
+
+// instantiation mul() called in SparseRowMatrix.cpp
+template void CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(
+    CpuSparseMatrix* a,
+    CpuMatrix* b,
+    SparseRowCpuMatrix* c,
+    real scaleAB,
+    real scaleT);
+template void CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(
+    CpuSparseMatrix* a,
+    CpuMatrix* b,
+    SparseAutoGrowRowCpuMatrix* c,
+    real scaleAB,
+    real scaleT);
+template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
+                                                           CpuMatrix* b,
+                                                           CacheRowCpuMatrix* c,
+                                                           real scaleAB,
+                                                           real scaleT);
+
+#ifndef PADDLE_MOBILE_INFERENCE
+void SharedCpuMatrix::mul(CpuSparseMatrix* a,
+                          CpuMatrix* b,
+                          real scaleAB,
+                          real scaleT) {
+  CHECK(!isTransposed()) << "Not supported";
+  CHECK(!b->isTransposed()) << "Not supported";
+  CHECK_EQ(scaleAB, 1) << "Not supported";
+  CHECK_EQ(scaleT, 1) << "Not supported";
+  CHECK_EQ(a->getFormat(), SPARSE_CSR) << "not supported";
+
+  real* B = b->getData();
+  real* C = getData();
+  size_t height = getHeight();
+  size_t width = getWidth();
+
+  // get real trans
+  MatrixPtr aTrans;
+  if (a->isTransposed()) {
+    aTrans = a->getTmpSparseMatrix(a->getWidth(), a->getHeight());
+    a->transpose(aTrans, false);
+  }
+  a = dynamic_cast<CpuSparseMatrix*>(aTrans.get());
+
+  size_t m = a->getWidth();
+  CHECK_EQ(b->getHeight(), m);
+  CHECK_EQ(a->getHeight(), height);
+  CHECK_EQ(b->getWidth(), width);
+
+  size_t blockSize = (height / blockNum_) + 1;
+  CpuMatrixPtr localBuf = *localBuf_;
+  if (!localBuf) {
+    localBuf = std::make_shared<CpuMatrix>(blockSize, width);
+  } else {
+    localBuf->resize(blockSize, width);
+  }
+  localBuf->zeroMem();
+  real* localC = localBuf->getData();
+  std::vector<int>& blockSeq = *blockSeq_;
+  if (blockSeq.size() == 0) {
+    for (int k = 0; k < blockNum_; ++k) {
+      blockSeq.push_back(k);
+    }
+    std::shuffle(
+        blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get());
+  }
+  std::vector<int>& localBufRows = *localBufRows_;
+  int* cols = a->getCols();
+  real* value = a->getValue();
+
+  for (int k = 0; k < blockNum_; ++k) {
+    int blockId = blockSeq[k];
+    size_t blockBegin = blockId * blockSize;
+    size_t blockEnd = (blockId + 1) * blockSize;
+    if (blockId == blockNum_ - 1) {
+      blockEnd = height;
+    }
+    if (a->getValueType() == NO_VALUE) {
+      for (size_t i = blockBegin; i < blockEnd; ++i) {
+        int start = a->getRowStartIdx(i);
+        int end = a->getRowStartIdx(i);
+        size_t colNum = a->getColNum(i);
+        if (colNum == 0) {
+          continue;
+        }  // skip empty row
+        localBufRows.push_back(i);
+        size_t bufPos = localBufRows.size() - 1;
+        for (int j = start; j < end; ++j) {
+          vecAddTo(localC + bufPos * width, B + cols[j] * width, width);
+        }
+      }
+    } else if (a->getValueType() == FLOAT_VALUE) {
+      for (size_t i = blockBegin; i < blockEnd; ++i) {
+        int start = a->getRowStartIdx(i);
+        int end = a->getRowStartIdx(i);
+        size_t colNum = a->getColNum(i);
+        if (colNum == 0) {
+          continue;
+        }  // skip empty row
+        localBufRows.push_back(i);
+        size_t bufPos = localBufRows.size() - 1;
+        for (int j = start; j < end; ++j) {
+          vecAddTo(
+              localC + bufPos * width, B + cols[j] * width, value[j], width);
+        }
+      }
+    }
+
+    {
+      std::lock_guard<std::mutex> guard(*blockLocks_[blockId]);
+      for (size_t i = 0; i < localBufRows.size(); ++i) {
+        vecAddTo(C + localBufRows[i] * width, localC + i * width, width);
+      }
+    }
+    memset(localC, 0, localBufRows.size() * width * sizeof(real));
+    localBufRows.clear();
+  }
+
+  VLOG(2) << " B[0]=" << B[0] << " B[1]=" << B[1] << " C[0]=" << C[0]
+          << " C[1]=" << C[1];
+}
+
+void SharedCpuMatrix::add(Matrix& b, real p1, real p2) {
+  CHECK_EQ(blockNum_, 1);
+  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
+  CpuMatrix::add(b, p1, p2);
+}
+
+void SharedCpuMatrix::add(real p1, real p2) {
+  CHECK_EQ(blockNum_, 1);
+  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
+  CpuMatrix::add(p1, p2);
+}
+
+void SharedCpuMatrix::initShared(int blockNum) {
+  CHECK_GT(height_ * width_, 1UL * 1024 * 1024)
+      << "should not share small matrix";
+  initBlock(blockNum);
+}
+
+void SharedCpuMatrix::initBlock(int blockNum) {
+  CHECK_LE(blockNum, 200) << "should not use large block number";
+  blockNum_ = blockNum;
+  blockLocks_.resize(blockNum);
+  for (auto& locker : blockLocks_) {
+    locker.reset(new std::mutex);
+  }
+}
+
+#endif
+/* Add a (column) vector b to matrix a, column by column */
+void CpuMatrix::addColumnVector(const Matrix& b) {
+  BaseMatrix::addColVector(const_cast<Matrix&>(b));
+}
+
+/* this = a*b */
+void CpuMatrix::mul(const Matrix& a, const Matrix& b) {
+  return mul(a, b, 1.0, 0.0);
+}
+
+/* this = scaleAB*(this*b) +  scaleT*this */
+void CpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) {
+  (void)b;
+  (void)scaleAB;
+  (void)scaleT;
+  LOG(FATAL) << "Not implemented";
+}
+
+/* this = this* b */
+void CpuMatrix::rightMul(Matrix& b) { return rightMul(b, 1.0, 0.0); }
+
+/* this = scaleAB*(a*this) +  scaleT*this */
+void CpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
+  (void)a;
+  (void)scaleAB;
+  (void)scaleT;
+  LOG(FATAL) << "Not implemented";
+}
+
+/* this = a*this) */
+void CpuMatrix::leftMul(Matrix& a) { return leftMul(a, 1.0, 0.0); }
+
+void CpuMatrix::colMerge(Matrix& src) { src.rowSum(*this); }
+
+void CpuMatrix::rowSum(Matrix& sum) {
+  CHECK_EQ(sum.getHeight(), getHeight());
+  CHECK_EQ(sum.getWidth(), (size_t)1);
+
+  sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0);
+}
+
+void CpuMatrix::rowMaxId(IVector& maxIds) {
+  CHECK(!maxIds.useGpu()) << "Matrix type are not equal";
+
+  size_t numSamples = getHeight();
+  CHECK_EQ(maxIds.getSize(), numSamples);
+
+  real* a = getData();
+  int* s = maxIds.getData();
+  size_t dim = getWidth();
+
+  for (size_t i = 0; i < numSamples; i++) {
+    real sm = a[i * dim];
+    int maxId = 0;
+    for (size_t j = 1; j < dim; j++) {
+      if (a[i * dim + j] > sm) {
+        maxId = j;
+        sm = a[i * dim + j];
+      }
+    }
+    s[i] = maxId;
+  }
+}
+
+void CpuMatrix::rowMax(Matrix& max) {
+  CHECK_EQ(max.getHeight(), getHeight());
+  CHECK_EQ(max.getWidth(), (size_t)1);
+  max.maxRows(*this);
+}
+
+/* Get the top k elements of each row of this matrix */
+void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
+  CHECK(isContiguous());
+  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
+  size_t numSamples = getHeight();
+  size_t beam = maxVal.getWidth();
+  CHECK_EQ(maxIds.getSize(), numSamples * beam);
+  CHECK_EQ(maxVal.getHeight(), numSamples);
+  CHECK_EQ(maxVal.getWidth(), beam);
+
+  real* a = getData();
+  int* s = maxIds.getData();
+  real* t = maxVal.getData();
+  size_t dim = getWidth();
+  for (size_t i = 0; i < numSamples; i++) {
+    std::vector<std::pair<real, size_t>> vec;
+    for (size_t j = 0; j < dim; j++) {
+      vec.push_back(std::pair<real, size_t>(a[i * dim + j], j));
+    }
+
+    std::partial_sort(
+        vec.begin(),
+        vec.begin() + beam,
+        vec.end(),
+        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
+          return l.first > r.first;
+        });
+    for (size_t j = 0; j < beam; j++) {
+      t[i * beam + j] = vec[j].first;
+      s[i * beam + j] = vec[j].second;
+    }
+  }
+}
+
+void CpuMatrix::colMax(Matrix& max) {
+  CHECK_EQ(max.getWidth(), getWidth());
+  CHECK_EQ(max.getHeight(), (size_t)1);
+  max.maxCols(*this);
+}
+
+void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
+  CHECK(isContiguous());
+  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
+  size_t numSamples = getWidth();
+  size_t beam = maxVal.getHeight();
+  CHECK_EQ(maxIds.getSize(), numSamples * beam);
+  CHECK_EQ(maxVal.getWidth(), numSamples);
+
+  real* a = getData();
+  int* s = maxIds.getData();
+  real* t = maxVal.getData();
+  size_t dim = getHeight();
+  for (size_t i = 0; i < numSamples; i++) {
+    std::vector<std::pair<real, size_t>> vec;
+    for (size_t j = 0; j < dim; j++) {
+      vec.push_back(std::pair<real, size_t>(a[i + j * numSamples], j));
+    }
+
+    std::partial_sort(
+        vec.begin(),
+        vec.begin() + beam,
+        vec.end(),
+        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
+          return l.first > r.first;
+        });
+    for (size_t j = 0; j < beam; j++) {
+      t[i + j * numSamples] = vec[j].first;
+      s[i + j * numSamples] = vec[j].second;
+    }
+  }
+}
+
+void CpuMatrix::maxoutForward(Matrix& a,
+                              IVector& id,
+                              size_t channels,
+                              size_t groups) {
+  CHECK(dynamic_cast<CpuMatrix*>(&a));
+  CHECK(dynamic_cast<CpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+
+  size_t size = getWidth();
+  size_t batchSize = getHeight();
+  size_t featLen = size / channels;
+  const real* input = a.getData();
+  int* idForCpu = id.getData();
+
+  MatrixPtr maxInMat, maxOutMat;
+  Matrix::resizeOrCreate(maxInMat, groups, size, false, false);
+  Matrix::resizeOrCreate(maxOutMat, 1, size, false, false);
+
+  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
+    size_t newIndex = batch_idx * size;
+    IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false);
+
+    for (size_t i = 0; i < channels; ++i) {
+      size_t newFeatLen = i * featLen;
+      for (size_t j = 0; j < groups; ++j) {
+        maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen)
+            ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen,
+                       featLen);
+      }
+    }
+    maxInMat->colMax(*tmpId, *maxOutMat);
+    this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat);
+  }
+}
+
+void CpuMatrix::maxoutBackward(Matrix& a,
+                               IVector& id,
+                               size_t channels,
+                               size_t groups) {
+  CHECK(dynamic_cast<CpuMatrix*>(&a));
+  CHECK(dynamic_cast<CpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+
+  size_t size = a.getWidth();
+  size_t batchSize = getHeight();
+  size_t featLen = size / channels;
+  size_t newFeatLen = groups * featLen;
+  real* inputG = getData();
+  const real* outG = a.getData();
+  int* idForCpu = id.getData();
+
+  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
+    size_t newIndex = batch_idx * size;
+    int* idData = idForCpu + newIndex;
+
+    for (size_t i = 0; i < size; ++i) {
+      int gradIdx =
+          idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen;
+      (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i];
+    }
+  }
+}
+
+void CpuMatrix::rowNormalizeL1(Matrix& out) {
+  CHECK(!out.useGpu());
+
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  CHECK_EQ(out.getHeight(), numSamples);
+  CHECK_EQ(out.getWidth(), dim);
+  real* a = getData();
+  real* b = out.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    real s = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      s += a[i * dim + j];
+    }
+    // Right now, we just bet that sum won't be zero. If this really happens,
+    // we will figure out what should be done then.
+    CHECK_GT(s, 0);
+    s = 1 / s;
+    for (size_t j = 0; j < dim; ++j) {
+      b[i * dim + j] = s * a[i * dim + j];
+    }
+  }
+}
+
+/* calulate classification error */
+void CpuMatrix::classificationError(Matrix& output,
+                                    IVector& label,
+                                    size_t topkSize) {
+  size_t numSamples = this->getHeight();
+  auto cpuOutput = dynamic_cast<CpuMatrix*>(&output);
+  auto cpuLabel = dynamic_cast<CpuIVector*>(&label);
+  IVectorPtr cpuTopIds = std::make_shared<CpuIVector>(numSamples * topkSize);
+  MatrixPtr cpuTopVal = std::make_shared<CpuMatrix>(numSamples, topkSize);
+
+  CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer";
+  CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed";
+  CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal";
+  CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1)
+      << "Matrix dimensions are not equal";
+
+  // top k matrix classification
+  cpuOutput->rowMax(*cpuTopIds, *cpuTopVal);
+
+  size_t dim = cpuOutput->getWidth();
+  real* result = this->getData();
+  int* ids = cpuTopIds->getData();
+  int* lbl = cpuLabel->getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    CHECK_GE(lbl[i], 0);
+    CHECK_LT((size_t)lbl[i], dim);
+
+    for (size_t j = 0; j < topkSize; ++j) {
+      if (ids[j + i * topkSize] == lbl[i]) {
+        result[i] = 0;
+        break;
+      }
+      result[i] = 1.0f;
+    }
+  }
+}
+
+/* copy -log(output[label]) to this->data[i] */
+void CpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) {
+  CHECK(dynamic_cast<CpuMatrix*>(&output));
+  CHECK(dynamic_cast<CpuIVector*>(&label));
+
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(label.getSize(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(getWidth(), (size_t)1);
+
+  real* out = output.getData();
+  real* cost = getData();
+  int* lbl = label.getData();
+  for (size_t i = 0; i < numSamples; ++i, out += dim) {
+    CHECK_GE(lbl[i], 0);
+    CHECK_LT((size_t)lbl[i], dim);
+    cost[i] = -std::log(out[lbl[i]]);
+  }
+}
+
+/* calculate the error of outputV according to label */
+void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) {
+  CHECK(dynamic_cast<CpuMatrix*>(&output));
+  CHECK(dynamic_cast<CpuIVector*>(&label));
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  CHECK_EQ(output.getWidth(), dim);
+  real* out = output.getData();
+  real* grad = getData();
+  int* lbl = label.getData();
+  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
+    grad[lbl[i]] -= 1 / out[lbl[i]];
+  }
+}
+
+/*
+    We implement the matrix functionality in CostLayer.cpp,
+    but we define the scalar function here for sanity check
+    deletion of the function does not affect anything neverthelss
+*/
+void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output,
+                                               IVector& label,
+                                               real alpha) {
+  CHECK(dynamic_cast<CpuMatrix*>(&output));
+  CHECK(dynamic_cast<CpuIVector*>(&label));
+
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(label.getSize(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(getWidth(), (size_t)1);
+
+  real* out = output.getData();
+  real* cost = getData();
+  int* lbl = label.getData();
+  for (size_t i = 0; i < numSamples; ++i, out += dim) {
+    CHECK_GE(lbl[i], 0);
+    CHECK_LT((size_t)lbl[i], dim);
+    real sum = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      sum += out[j];
+    }
+    sum = _safelog(sum);
+    cost[i] = -_safelog(out[lbl[i]]) + sum + alpha * _square(sum);
+  }
+}
+
+/*
+    We implement the matrix functionality in CostLayer.cpp,
+    but we define the scalar function here for sanity check
+    deletion of the function does not affect anything neverthelss
+*/
+void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output,
+                                                 IVector& label,
+                                                 real alpha) {
+  CHECK(dynamic_cast<CpuMatrix*>(&output));
+  CHECK(dynamic_cast<CpuIVector*>(&label));
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  CHECK_EQ(output.getWidth(), dim);
+  real* out = output.getData();
+  real* grad = getData();
+  int* lbl = label.getData();
+
+  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
+    grad[lbl[i]] -= 1 / out[lbl[i]];
+    real sum = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      sum += out[j];
+    }
+    for (size_t j = 0; j < dim; ++j) {
+      if (j == (size_t)lbl[i]) {
+        grad[j] += -1 / out[j];
+      }
+      grad[j] += 1 / sum + 2 * alpha * _safelog(sum) / sum;
+    }
+  }
+}
+
+#define FORWARD_LOOP()                      \
+  size_t numSamples = getHeight();          \
+  size_t dim = getWidth();                  \
+  CHECK_EQ(output.getHeight(), numSamples); \
+  CHECK_EQ(output.getWidth(), dim);         \
+  const real* in = getData();               \
+  real* out = output.getData();             \
+  for (size_t i = 0; i < numSamples; ++i, in += dim, out += dim)
+
+#define BACKWARD_LOOP()                     \
+  size_t numSamples = getHeight();          \
+  size_t dim = getWidth();                  \
+  CHECK_EQ(output.getHeight(), numSamples); \
+  CHECK_EQ(output.getWidth(), dim);         \
+  real* grad = getData();                   \
+  real* out = output.getData();             \
+  for (size_t i = 0; i < numSamples; ++i, grad += dim, out += dim)
+
+void CpuMatrix::softmax(Matrix& output) {
+  CHECK(!output.useGpu());
+
+  const float THRESHOLD = -64.0;
+
+  FORWARD_LOOP() {
+    real max = -1.0e20;
+    for (size_t j = 0; j < dim; ++j) {
+      if (in[j] > max) {
+        max = in[j];
+      }
+    }
+    for (size_t j = 0; j < dim; ++j) {
+      real a = in[j] - max;
+      if (a < THRESHOLD) {
+        a = THRESHOLD;
+      }
+      out[j] = a;
+    }
+    vExp(dim, out, out);
+
+    real sum = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      sum += out[j];
+    }
+    sum = 1 / sum;
+    for (size_t j = 0; j < dim; ++j) {
+      out[j] *= sum;
+    }
+  }
+}
+
+void CpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) {
+  CHECK_EQ(getWidth(), 1UL);
+  CHECK_EQ(output.getWidth(), 1UL);
+  CHECK(isContiguous());
+
+  MatrixPtr inTmp = Matrix::create(nullptr,
+                                   /* height= */ 1,
+                                   1,
+                                   /* trans= */ false,
+                                   false);
+  MatrixPtr outTmp = Matrix::create(nullptr,
+                                    /* height= */ 1,
+                                    1,
+                                    /* trans= */ false,
+                                    false);
+  size_t numSequences = index.getSize() - 1;
+  auto starts = index.getData();
+  for (size_t i = 0; i < numSequences; ++i) {
+    size_t offset = starts[i];
+    size_t size = starts[i + 1] - starts[i];
+    inTmp->setData(getData() + offset, 1UL, size);
+    outTmp->setData(output.getData() + offset, 1UL, size);
+    inTmp->softmax(*outTmp);
+  }
+}
+
+void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
+  CHECK(output.useGpu_ == false) << "Matrix type are not equal";
+  CHECK_EQ(getHeight(), sftmaxSum.getHeight());
+
+  real* sums = sftmaxSum.getData();
+
+  BACKWARD_LOOP() {
+    real sum = sums[i];
+    for (size_t j = 0; j < dim; ++j) {
+      grad[j] = out[j] * (grad[j] - sum);
+    }
+  }
+}
+
+void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
+  CHECK(output.useGpu_ == false && label.useGpu_ == false)
+      << "Matrix type are not equal";
+
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(label.getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(label.getWidth(), dim);
+  CHECK_EQ(getWidth(), (size_t)1);
+  real* out = output.getData();
+  real* cost = getData();
+
+  auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label);
+  if (labelptr) {
+    // it is a CpuSparseMatrix
+    if (labelptr->getFormat() == SPARSE_CSR) {
+      // treat label as a SparseMatrix
+      for (size_t i = 0; i < numSamples; ++i) {
+        for (size_t j = 0; j < dim; ++j) {
+          cost[i] += _square(out[i * dim + j]);
+        }
+      }
+      if (labelptr->getValueType() == NO_VALUE) {
+        int* cols = labelptr->getCols();
+        for (size_t i = 0; i < numSamples; ++i) {
+          for (size_t j = labelptr->getRowStartIdx(i);
+               j < labelptr->getRowStartIdx(i + 1);
+               ++j) {
+            cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]];
+            /*
+             * explanation of above line: original codes are follows:
+             * cost[i] -= _square(out[i * dim + feature.col]);
+             * cost[i] += _square(1.0 - out[i * dim + feature.col]);
+             */
+          }
+        }
+      } else if (labelptr->getValueType() == FLOAT_VALUE) {
+        int* cols = labelptr->getCols();
+        real* values = labelptr->getValue();
+        for (size_t i = 0; i < numSamples; ++i) {
+          real sum1 = 0;
+          real sum2 = 0;
+          for (size_t j = labelptr->getRowStartIdx(i);
+               j < labelptr->getRowStartIdx(i + 1);
+               ++j) {
+            sum1 += values[j] * values[j];
+            sum2 += values[j] * out[i * dim + cols[j]];
+            /*
+             * explanation of above line: original codes are follows:
+             * cost[i] -= _square(out[i * dim + feature.col]);
+             * cost[i] += _square(value.col - out[i * dim + feature.col]);
+             */
+          }
+          cost[i] += sum1 - 2.0 * sum2;
+        }
+      } else {
+        LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares";
+        return;
+      }
+      return;
+    } else {
+      LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares";
+      return;
+    }
+  }
+
+  BaseMatrix::sumOfSquaredDiffs(output,
+                                label,
+                                /* scaleSum= */ 1,
+                                /* scaleDest= */ 1);
+}
+
+/* calculate the error of outputV according to label */
+void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) {
+  CHECK(output.useGpu_ == false && label.useGpu_ == false)
+      << "Matrix type are not equal";
+
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  CHECK_EQ(output.getWidth(), dim);
+  CHECK_EQ(label.getWidth(), dim);
+
+  real* out = output.getData();
+  real* grad = getData();
+
+  auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label);
+  if (labelptr) {
+    // it is a CpuSparseMatrix
+    if (labelptr->getFormat() == SPARSE_CSR) {
+      // treat label as a SparseMatrix
+      for (size_t i = 0; i < numSamples; ++i) {
+        for (size_t j = 0; j < dim; ++j) {
+          grad[i * dim + j] += 2.0 * out[i * dim + j];
+        }
+      }
+      if (labelptr->getValueType() == NO_VALUE) {
+        int* cols = labelptr->getCols();
+        for (size_t i = 0; i < numSamples; ++i) {
+          for (size_t j = labelptr->getRowStartIdx(i);
+               j < labelptr->getRowStartIdx(i + 1);
+               ++j) {
+            grad[i * dim + cols[j]] -= 2.0;
+            /*
+             * explanation of above line: original codes are follows:
+             * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col];
+             * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col]
+             * - 1);
+             */
+          }
+        }
+      } else if (labelptr->getValueType() == FLOAT_VALUE) {
+        int* cols = labelptr->getCols();
+        real* values = labelptr->getValue();
+        for (size_t i = 0; i < numSamples; ++i) {
+          for (size_t j = labelptr->getRowStartIdx(i);
+               j < labelptr->getRowStartIdx(i + 1);
+               ++j) {
+            grad[i * dim + cols[j]] -= 2.0 * values[j];
+            /*
+             * explanation of above line: original codes are follows:
+             * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col];
+             * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col]
+             * - value.col);
+             */
+          }
+        }
+      } else {
+        LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares";
+        return;
+      }
+      return;
+    } else {
+      LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares";
+      return;
+    }
+  }
+
+  real* lbl = label.getData();
+  size_t ld = getStride();
+  size_t outLd = output.getStride();
+  size_t lblLd = label.getStride();
+  CHECK(lbl);
+  for (size_t i = 0; i < numSamples;
+       ++i, out += outLd, lbl += lblLd, grad += ld) {
+    for (size_t j = 0; j < dim; ++j) {
+      grad[j] += 2.0 * (out[j] - lbl[j]);  // positive gradient;
+    }
+  }
+}
+
+void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) {
+  CHECK(output.useGpu_ == false && label.useGpu_ == false)
+      << "Matrix type are not equal";
+
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(label.getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(label.getWidth(), dim);
+  CHECK_EQ(getWidth(), (size_t)1);
+
+  real* cost = getData();
+  real* out = output.getData();
+  real* lbl = label.getData();
+
+  for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
+    for (size_t j = 0; j < dim; ++j) {
+      real absVal = std::fabs(out[j] - lbl[j]);
+      cost[i] *= destScale;
+      if (absVal < 1.0)
+        cost[i] += 0.5 * absVal * absVal;
+      else
+        cost[i] += absVal - 0.5;
+    }
+  }
+}
+
+void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) {
+  CHECK(output.useGpu_ == false && label.useGpu_ == false)
+      << "Matrix type are not equal";
+
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(label.getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(label.getWidth(), dim);
+  CHECK_EQ(getWidth(), dim);
+
+  real* out = output.getData();
+  real* lbl = label.getData();
+  real* grad = getData();
+
+  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
+    for (size_t j = 0; j < dim; ++j) {
+      real val = out[j] - lbl[j];
+      grad[j] *= destScale;
+      if (std::fabs(val) < 1) {
+        grad[j] += val;
+      } else {
+        grad[j] += (real(0) < val) - (val < real(0));
+      }
+    }
+  }
+}
+
+void CpuMatrix::tanh(Matrix& output) {
+  CHECK(isContiguous());
+  CHECK(output.isContiguous());
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(output.getWidth(), dim);
+  vTanh(numSamples * dim, getData(), output.getData());
+}
+
+void CpuMatrix::tanhDerivative(Matrix& output) {
+  BaseMatrix::tanhDerivative(output);
+}
+
+void CpuMatrix::softrelu(Matrix& output) {
+  CHECK(isContiguous());
+  CHECK(output.isContiguous());
+  const real THRESHOLD = 40.0;
+  FORWARD_LOOP() {  // TODO(yuyang18): SIMD it?
+    for (size_t j = 0; j < dim; ++j) {
+      real x = in[j];
+      if (x > THRESHOLD) {
+        x = THRESHOLD;
+      } else if (x < -THRESHOLD) {
+        x = -THRESHOLD;
+      }
+      out[j] = x;
+    }
+  }
+  vExp(numSamples * dim, output.getData(), output.getData());
+  vLog1p(numSamples * dim, output.getData(), output.getData());
+}
+
+void CpuMatrix::softreluDerivative(Matrix& output) {
+  CHECK(isContiguous());
+  CHECK(output.isContiguous());
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  size_t size = numSamples * dim;
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(output.getWidth(), dim);
+  real* grad = getData();
+  MatrixPtr tmpMat = Matrix::create(numSamples, dim);
+  real* tmp = tmpMat->getData();
+
+  vExp(size, output.getData(), tmpMat->getData());
+
+  for (size_t i = 0; i < size; ++i) {
+    grad[i] *= (1.0 - 1.0 / tmp[i]);
+  }
+}
+
+void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
+  CHECK(isContiguous());
+  CHECK(output.isContiguous());
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(output.getWidth(), dim);
+
+  const real* in = getData();
+  real* out = output.getData();
+
+  // out = p2*in
+  for (size_t i = 0; i < numSamples * dim; ++i) {
+    out[i] = p2 * in[i];
+  }
+
+  vTanh(numSamples * dim, out, out);
+
+  // out = p1 * out
+  for (size_t i = 0; i < numSamples * dim; ++i) {
+    out[i] = p1 * out[i];
+  }
+}
+
+/* uniform randomization, minimize precision = 1e-5 */
+void CpuMatrix::randomizeUniform() {
+  CHECK(isContiguous());
+  real* data = getData();
+  unsigned int* randSeed = ThreadLocalRand::getSeed();
+  real recipRandMax = 1.0f / (real)RAND_MAX;
+  for (size_t i = 0; i < elementCnt_; ++i) {
+    *data++ = rand_r(randSeed) * recipRandMax;
+  }
+}
+
+void CpuMatrix::print(std::ostream& os) const {
+  CHECK(isContiguous());
+  for (size_t i = 0; i < height_; ++i) {
+    for (size_t j = 0; j < width_; ++j) {
+      os << data_[i * width_ + j] << " ";
+    }
+    os << std::endl;
+  }
+}
+
+void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
+  real* input = data.getData();
+  real* w = W.getData();
+  real* output = data_;
+  size_t numElements = data.getWidth();
+  size_t numSamples = data.getHeight();
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+
+  size_t partial_sum = numElements / paraSize;
+  if (paraSize == numElements) {
+    for (size_t n = 0; n < numSamples * numElements; ++n) {
+      output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements];
+    }
+    return;
+  }
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  for (size_t n = 0; n < numSamples; ++n) {
+    for (size_t i = 0; i < paraSize; i++) {
+      neon::prelu(
+          input + i * partial_sum, w[i], output + i * partial_sum, partial_sum);
+    }
+    input = input + numElements;
+    output = output + numElements;
+  }
+#else
+  for (size_t n = 0, k = 0; n < numSamples; ++n) {
+    for (size_t i = 0; i < numElements; ++i, ++k) {
+      output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
+    }
+  }
+#endif
+}
+
+void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
+  real* ograd = oGrad.getData();
+  real* input = data.getData();
+  real* wgrad = data_;
+  size_t numElements = data.getWidth();
+  size_t numSamples = data.getHeight();
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
+  for (size_t n = 0, k = 0; n < numSamples; ++n) {
+    for (size_t i = 0; i < numElements; ++i, ++k) {
+      wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]);
+    }
+  }
+}
+
+void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
+  real* diff = data_;
+  real* input = data.getData();
+  real* ograd = oGrad.getData();
+  real* w = W.getData();
+  size_t numElements = data.getWidth();
+  size_t numSamples = data.getHeight();
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
+  for (size_t n = 0, k = 0; n < numSamples; ++n) {
+    for (size_t i = 0; i < numElements; ++i, ++k) {
+      diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]);
+    }
+  }
+}
+
+void CpuMatrix::print(std::ostream& os, size_t height, size_t width) const {
+  CHECK(isContiguous());
+  size_t h = height_ < height ? height_ : height;
+  size_t w = width_ < width ? width_ : width;
+  os.setf(std::ostream::scientific);
+  os << "[";
+  for (size_t i = 0; i < h; ++i) {
+    for (size_t j = 0; j < w; ++j) {
+      os << data_[i * width_ + j] << " ";
+    }
+    if (i == h - 1) {
+      os << "]";
+    }
+    os << std::endl;
+  }
+}
+
+void CpuMatrix::printOneRow(std::ostream& os, size_t idx) const {
+  CHECK_LT(idx, height_);
+  size_t offset = idx * stride_;
+  os << data_[offset];
+  for (size_t i = 1; i < width_; ++i) {
+    os << " " << data_[offset + i];
+  }
+  os << ";";
+}
+
+void CpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
+  CHECK(isContiguous());
+  CHECK(height_ == refMat.getHeight());
+  CHECK(width_ == refMat.getWidth());
+  CpuMatrix cpuRef(height_, width_);
+  cpuRef.copyFrom(refMat);
+  size_t diffCnt = 0;
+  for (size_t i = 0; i < height_; ++i) {
+    for (size_t j = 0; j < width_; ++j) {
+      real a = getElement(i, j);
+      real b = cpuRef.getElement(i, j);
+      if (fabs(a - b) > 0.00001) {
+        ++diffCnt;
+        if (printDiff) {
+          os << "ref= " << a << "  check= " << b << std::endl;
+        }
+      }
+    }
+  }
+  LOG(INFO) << "the  diffCnt is " << diffCnt;
+}
+
+real CpuMatrix::getMin() {
+  size_t size = getHeight() * getWidth();
+  real* data = getData();
+  real res = data[0];
+  for (size_t i = 1; i < size; ++i) {
+    if (res > data[i]) {
+      res = data[i];
+    }
+  }
+  return res;
+}
+
+real CpuMatrix::getMax() {
+  size_t size = getHeight() * getWidth();
+  real* data = getData();
+  real res = data[0];
+  for (size_t i = 1; i < size; ++i) {
+    if (res < data[i]) {
+      res = data[i];
+    }
+  }
+  return res;
+}
+
+void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) {
+  size_t height = this->getHeight();
+  size_t width0 = this->getWidth();
+  size_t width1 = in1.getWidth();
+
+  CHECK_EQ(height, in0.getHeight());
+  CHECK_EQ(width0, in0.getWidth());
+  CHECK_EQ(height, in1.getHeight());
+
+  CHECK_EQ(width1 % 2, 1U);
+
+  real* outV = this->getData();
+  real* inV0 = in0.getData();
+  real* inV1 = in1.getData();
+
+  int leftCtxLen = (width1 - 1) / 2;
+  for (size_t x = 0; x < height;
+       ++x, outV += width0, inV0 += width0, inV1 += width1) {
+    for (size_t i = 0; i < width0; ++i) {  // each dimension of output
+      for (size_t j = 0; j < width1; ++j) {
+        // iterate over all dimentions of inV1
+        int index = i + j - leftCtxLen;
+        index = (index + width0) % width0;
+        outV[i] += inV0[index] * inV1[j];
+      }
+    }
+  }
+}
+
+void CpuMatrix::circularConvDerivative(
+    Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) {
+  size_t height = in0.getHeight();
+  size_t width0 = in0.getWidth();
+  size_t width1 = in1.getWidth();
+
+  CHECK_EQ(height, in1.getHeight());
+  CHECK_EQ(height, inG0.getHeight());
+  CHECK_EQ(width0, inG0.getWidth());
+  CHECK_EQ(height, inG1.getHeight());
+  CHECK_EQ(width1, inG1.getWidth());
+  CHECK_EQ(height, outG.getHeight());
+  CHECK_EQ(width0, outG.getWidth());
+
+  real* outGV = outG.getData();
+  real* inV0 = in0.getData();
+  real* inV1 = in1.getData();
+  real* inGV0 = inG0.getData();
+  real* inGV1 = inG1.getData();
+
+  int leftCtxLen = (width1 - 1) / 2;
+  for (size_t x = 0; x < height; ++x,
+              outGV += width0,
+              inV0 += width0,
+              inV1 += width1,
+              inGV0 += width0,
+              inGV1 += width1) {
+    for (size_t j = 0; j < width1; ++j) {  // iterate over width1
+      for (size_t i = 0; i < width0; ++i) {
+        // such over all dimensions of outG
+        int index = i + j - leftCtxLen;
+        index = (index + width0) % width0;
+        inGV0[index] += outGV[i] * inV1[j];
+        inGV1[j] += outGV[i] * inV0[index];
+      }
+    }
+  }
+}
+
+void CpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
+  CHECK(dynamic_cast<CpuMatrix*>(&output));
+  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
+  CHECK(labelPtr);
+
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(numSamples, output.getHeight());
+  CHECK_EQ(numSamples, labelPtr->getHeight());
+  CHECK_EQ(dim, labelPtr->getWidth());
+
+  real* out = output.getData();
+  real* cost = getData();
+  for (size_t i = 0; i < numSamples; ++i, out += dim) {
+    for (size_t j = 0; j < dim; ++j) {
+      CHECK(out[j] > 0 && out[j] < 1.0);
+      cost[i] -= std::log(1 - out[j]);
+    }
+
+    const int* cols = labelPtr->getRowCols(i);
+    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
+      CHECK_LT(size_t(cols[j]), dim);
+      cost[i] -= std::log(out[cols[j]] / (1 - out[cols[j]]));
+    }
+  }
+}
+
+void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
+  CHECK(dynamic_cast<CpuMatrix*>(&output));
+  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
+  CHECK(labelPtr);
+
+  size_t numSamples = getHeight();
+  size_t dim = getWidth();
+  CHECK_EQ(numSamples, output.getHeight());
+  CHECK_EQ(numSamples, labelPtr->getHeight());
+  CHECK_EQ(dim, output.getWidth());
+  CHECK_EQ(dim, labelPtr->getWidth());
+
+  real* out = output.getData();
+  real* grad = getData();
+  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
+    for (size_t j = 0; j < dim; ++j) {
+      CHECK(out[j] > 0 && out[j] < 1.0);
+      grad[j] += 1.0 / (1 - out[j]);
+    }
+
+    const int* cols = labelPtr->getRowCols(i);
+    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
+      CHECK_LT(size_t(cols[j]), dim);
+      grad[cols[j]] -= 1.0 / (out[cols[j]] * (1 - out[cols[j]]));
+    }
+  }
+}
+
+/* calculate the classification error for multi binary label */
+void CpuMatrix::classificationErrorMulti(Matrix& output,
+                                         Matrix& label,
+                                         real threshold) {
+  CHECK(dynamic_cast<CpuMatrix*>(&output));
+  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
+  CHECK(labelPtr);
+
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(numSamples, output.getHeight());
+  CHECK_EQ(numSamples, labelPtr->getHeight());
+  CHECK_EQ(dim, labelPtr->getWidth());
+
+  real* out = output.getData();
+  real* result = getData();
+  for (size_t i = 0; i < numSamples; ++i, out += dim) {
+    real sum = 0.0;
+    for (size_t j = 0; j < dim; ++j) {
+      if (out[j] >= threshold) {
+        sum += 1.0;
+      }
+    }
+
+    const int* cols = labelPtr->getRowCols(i);
+    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
+      CHECK_LT(size_t(cols[j]), dim);
+      if (out[cols[j]] < threshold) {
+        sum += 1.0;
+      } else {
+        sum -= 1.0;
+      }
+    }
+    result[i] = sum / dim;
+  }
+}
+
+void CpuMatrix::bilinearForward(const Matrix& in,
+                                const size_t inImgH,
+                                const size_t inImgW,
+                                const size_t outImgH,
+                                const size_t outImgW,
+                                const size_t numChannels,
+                                const real ratioH,
+                                const real ratioW) {
+  CHECK(dynamic_cast<const CpuMatrix*>(&in));
+
+  size_t outputW = getWidth();
+  size_t batchSize = getHeight();
+  size_t inputW = in.getWidth();
+  size_t inputH = in.getHeight();
+  size_t inPosOffset = inImgH * inImgW;
+  size_t outPosOffset = outImgH * outImgW;
+  (void)(inputH);
+
+  real* outData = getData();
+  const real* inData = in.getData();
+
+  if (inImgH == outImgH && inImgW == outImgW) {
+    this->copyFrom(in);
+  } else {
+    for (size_t k = 0; k < batchSize; ++k) {  // loop for batches
+      for (size_t i = 0; i < outImgH; ++i) {  // loop for images
+        size_t h = ratioH * i;
+        size_t hid = (h < inImgH - 1) ? 1 : 0;
+        real h1lambda = ratioH * i - h;
+        real h2lambda = 1 - h1lambda;
+
+        for (size_t j = 0; j < outImgW; ++j) {
+          size_t w = ratioW * j;
+          size_t wid = (w < inImgW - 1) ? 1 : 0;
+          real w1lambda = ratioW * j - w;
+          real w2lambda = 1 - w1lambda;
+          // calculate four position for bilinear interpolation
+          const real* inPos = &inData[k * inputW + h * inImgW + w];
+          real* outPos = &outData[k * outputW + i * outImgW + j];
+          for (size_t c = 0; c < numChannels; ++c) {  // loop for channels
+            // bilinear interpolation
+            outPos[0] =
+                h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) +
+                h1lambda * (w2lambda * inPos[hid * inImgW] +
+                            w1lambda * inPos[hid * inImgW + wid]);
+            inPos += inPosOffset;
+            outPos += outPosOffset;
+          }
+        }
+      }
+    }
+  }
+}
+
+void CpuMatrix::bilinearBackward(const Matrix& out,
+                                 const size_t outImgH,
+                                 const size_t outImgW,
+                                 const size_t inImgH,
+                                 const size_t inImgW,
+                                 const size_t numChannels,
+                                 const real ratioH,
+                                 const real ratioW) {
+  CHECK(dynamic_cast<const CpuMatrix*>(&out));
+
+  size_t inputW = getWidth();
+  size_t inputH = getHeight();
+  size_t outputW = out.getWidth();
+  size_t batchSize = out.getHeight();
+  size_t inPosOffset = inImgH * inImgW;
+  size_t outPosOffset = outImgH * outImgW;
+  (void)(inputH);
+
+  real* inGrad = getData();
+  const real* outGrad = out.getData();
+
+  if (inImgH == outImgH && inImgW == outImgW) {
+    this->add(const_cast<Matrix&>(out));
+  } else {
+    for (size_t k = 0; k < batchSize; ++k) {  // loop for batches
+      for (size_t i = 0; i < outImgH; ++i) {  // loop for images
+        size_t h = ratioH * i;
+        size_t hid = (h < inImgH - 1) ? 1 : 0;
+        real h1lambda = ratioH * i - h;
+        real h2lambda = 1 - h1lambda;
+        for (size_t j = 0; j < outImgW; ++j) {
+          size_t w = ratioW * j;
+          size_t wid = (w < inImgW - 1) ? 1 : 0;
+          real w1lambda = ratioW * j - w;
+          real w2lambda = 1 - w1lambda;
+
+          real* inPos = &inGrad[k * inputW + h * inImgW + w];
+          const real* outPos = &outGrad[k * outputW + i * outImgW + j];
+          for (size_t c = 0; c < numChannels; ++c) {  // loop for channels
+            inPos[0] += h2lambda * w2lambda * outPos[0];
+            inPos[wid] += h2lambda * w1lambda * outPos[0];
+            inPos[hid * inImgW] += h1lambda * w2lambda * outPos[0];
+            inPos[hid * inImgW + wid] += h1lambda * w1lambda * outPos[0];
+            inPos += inPosOffset;
+            outPos += outPosOffset;
+          }
+        }
+      }
+    }
+  }
+}
+
+void CpuMatrix::vol2Col(real* data,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW) {
+  real* outData = getData();
+  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
+  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
+  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
+
+  int channelsCol = channels * filterD * filterH * filterW;
+  for (int c = 0; c < channelsCol; ++c) {
+    int wOffset = c % filterW;
+    int hOffset = (c / filterW) % filterH;
+    int dOffset = (c / filterW / filterH) % filterD;
+    int cIn = c / filterW / filterH / filterD;
+    for (int d = 0; d < outDepth; ++d) {
+      for (int h = 0; h < outHeight; ++h) {
+        for (int w = 0; w < outWidth; ++w) {
+          int dPad = d * strideD - paddingD + dOffset;
+          int hPad = h * strideH - paddingH + hOffset;
+          int wPad = w * strideW - paddingW + wOffset;
+
+          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
+              dPad >= 0 && dPad < depth)
+            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] =
+                data[((cIn * depth + dPad) * height + hPad) * width + wPad];
+          else
+            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0;
+        }
+      }
+    }
+  }
+}
+
+void CpuMatrix::col2Vol(real* trg,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW,
+                        real alpha,
+                        real beta) {
+  real* src = getData();
+  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
+  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
+  int channelsCol = channels * filterD * filterH * filterW;
+  for (int c = 0; c < channelsCol; ++c) {
+    int wOffset = c % filterW;
+    int hOffset = (c / filterW) % filterH;
+    int dOffset = (c / filterW / filterH) % filterD;
+    int cIm = c / filterW / filterH / filterD;
+    for (int d = 0; d < outDepth; ++d) {
+      for (int h = 0; h < outHeight; ++h) {
+        for (int w = 0; w < outWidth; ++w) {
+          int dPad = d * strideD - paddingD + dOffset;
+          int hPad = h * strideH - paddingH + hOffset;
+          int wPad = w * strideW - paddingW + wOffset;
+          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
+              dPad >= 0 && dPad < depth)
+            trg[((cIm * depth + dPad) * height + hPad) * width + wPad] =
+                alpha *
+                    src[((c * outDepth + d) * outHeight + h) * outWidth + w] +
+                beta *
+                    trg[((cIm * depth + dPad) * height + hPad) * width + wPad];
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////
+//               functions executed via cpu                   //
+////////////////////////////////////////////////////////////////
+
+void GpuMatrix::selectElements(Matrix& table, IVector& ids) {
+  execViaCpu2(&CpuMatrix::selectElements, *this, table, ids);
+}
+}  // namespace paddle
diff --git a/paddle/legacy/math/Matrix.h b/paddle/legacy/math/Matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff4f4cfc2a41add1a06308556b38aba5bbdac884
--- /dev/null
+++ b/paddle/legacy/math/Matrix.h
@@ -0,0 +1,2189 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <thread>
+
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+#include <hl_gpu.h>
+
+#include "BaseMatrix.h"
+#include "MemoryHandle.h"
+#include "Vector.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+/// TODO(tianbing), move to paddle/legacy/function/TensorType.h
+enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
+
+/**
+ * @brief  matrix sparse_format .
+ *
+ * nnz represents nonzero number in sparse matrix.
+ *
+ * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element
+ * represents row start index in Matrix. length of col and value are nnz.
+ *
+ * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element
+ * represents col start index in Matrix. length of col and value are nnz.
+ *
+ * @code
+ * for example: [0, 1, 0, 2, 0;
+ *               1, 0, 0, 0, 0;
+ *               0, 0, 0, 2, 5];
+ * SPARSE_CSR row   [0, 2, 3, 5];
+ *            col   [1, 3, 0, 3, 4];
+ *            value [1, 2, 1, 2, 5]
+ * SPARSE_CSC col   [0, 1, 2, 2, 4, 5];
+ *            row   [1, 0, 0, 2, 2];
+ *            value [1, 1, 2, 2, 5]
+ * @endcode
+ */
+/// TODO(tianbing), move to paddle/legacy/function/TensorType.h
+enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
+
+class Matrix;
+class GpuMatrix;
+class CpuMatrix;
+class CpuSparseMatrix;
+class GpuSparseMatrix;
+typedef std::shared_ptr<Matrix> MatrixPtr;
+typedef std::shared_ptr<GpuMatrix> GpuMatrixPtr;
+typedef std::shared_ptr<CpuMatrix> CpuMatrixPtr;
+typedef std::shared_ptr<GpuSparseMatrix> GpuSparseMatrixPtr;
+typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr;
+
+/**
+ * Copy or assignemnt constructor will share the data as opposed to making a
+ * copy of the original data. To make a copy of the orinal data, use copyFrom()
+ * instead.
+ */
+class Matrix : public BaseMatrix {
+ protected:
+  Matrix(MemoryHandlePtr memHandle,
+         size_t height,
+         size_t width,
+         bool trans,
+         bool use_gpu);
+
+  Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu);
+
+  Matrix(real* data,
+         size_t height,
+         size_t width,
+         size_t stride,
+         bool trans,
+         bool use_gpu);
+
+  static ThreadLocal<MatrixPtr> tmpMat_;
+
+ public:
+  size_t elementCnt_;  // maximal number of elements which can be held in data_
+  MemoryHandlePtr memoryHandle_;
+
+ public:
+  virtual ~Matrix() {}
+
+  static MatrixPtr create(MemoryHandlePtr memHandle,
+                          size_t height,
+                          size_t width,
+                          bool trans = false);
+  static MatrixPtr create(size_t height,
+                          size_t width,
+                          bool trans = false,
+                          bool useGpu = false);
+  static MatrixPtr create(real* data,
+                          size_t height,
+                          size_t width,
+                          bool trans = false,
+                          bool useGpu = false);
+  static MatrixPtr create(real* data,
+                          size_t height,
+                          size_t width,
+                          size_t stride,
+                          bool trans = false,
+                          bool useGpu = false);
+
+  static MatrixPtr createSparseMatrix(size_t height,
+                                      size_t width,
+                                      size_t nnz,
+                                      SparseValueType valueType = FLOAT_VALUE,
+                                      bool trans = false,
+                                      bool useGpu = false);
+  static MatrixPtr createSparseMatrix(size_t height,
+                                      size_t width,
+                                      size_t nnz,
+                                      SparseValueType valueType = FLOAT_VALUE,
+                                      SparseFormat foramt = SPARSE_CSR,
+                                      bool trans = false,
+                                      bool useGpu = false);
+
+  static MatrixPtr createSparseMatrix(real* data,
+                                      int* row,
+                                      int* col,
+                                      size_t height,
+                                      size_t width,
+                                      size_t nnz, /* used to allocate space */
+                                      SparseValueType valueType, /*value type*/
+                                      SparseFormat format,
+                                      bool trans,
+                                      bool useGpu);
+
+  static void resizeOrCreateSparseMatrix(
+      MatrixPtr& matrix,
+      size_t height,
+      size_t width,
+      size_t nnz,
+      SparseValueType valueType = FLOAT_VALUE,
+      SparseFormat foramt = SPARSE_CSR,
+      bool trans = false,
+      bool useGpu = false);
+
+  static void resizeOrCreate(MatrixPtr& a,
+                             size_t height,
+                             size_t width,
+                             bool trans = false,
+                             bool useGpu = false);
+
+  /**
+   * @brief  set the data buffer used to hold the matrix data.
+   *
+   * caller should make sure that the size of data is at least
+   * sizeof(real)*height*width.
+   */
+  void setData(real* data) {
+    BaseMatrix::setData(data);
+    memoryHandle_.reset();
+  }
+
+  /// the data should be contiguous
+  void setData(real* data, size_t newHeight, size_t newWidth) {
+    setData(data);
+    height_ = newHeight;
+    width_ = newWidth;
+    elementCnt_ = newHeight * newWidth;
+    stride_ = width_;
+  }
+
+  size_t getWidth() const { return width_; }
+  size_t getHeight() const { return height_; }
+  size_t getStride() const { return stride_; }
+  size_t getElementCnt() const { return elementCnt_; }
+  virtual real* getData() { return data_; }
+  virtual const real* getData() const { return data_; }
+  bool isTransposed() const { return trans_; }
+  bool isContiguous() const { return stride_ == width_ || height_ == 1; }
+
+  // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix
+  // befor call the following functions.
+  // Declare these functions in the base class just easy to call them.
+  // And these declarations should be moved to base class of sparse matrix
+  // if refactor sparse matrix
+  virtual int* getRows() const {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;  //! suppress warning for no return value.
+  }
+
+  virtual int* getCols() const {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;  //! suppress warning for no return value.
+  }
+
+  virtual SparseFormat getFormat() const {
+    LOG(FATAL) << "Not implemented";
+    return SPARSE_CSR;  //! suppress warning for no return value.
+  }
+
+  virtual SparseValueType getValueType() const {
+    LOG(FATAL) << "Not implemented";
+    return NO_VALUE;  //! suppress warning for no return value.
+  }
+
+  /**
+   * @brief matrix elment-wise add
+   *
+   * Named add3 just because add/add2 has been used in BaseMatrix.cu
+   * and they are not virtual function.
+   */
+  virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; }
+
+  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
+
+  virtual void zeroMem() { LOG(FATAL) << "Not implemented"; }
+
+  virtual void resetOne() { LOG(FATAL) << "Not implemented"; }
+
+  void setDiag(real value);
+
+  virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; }
+
+  virtual void trimFrom(const CpuSparseMatrix& src) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  // For GpuMatrix this is an asynchronous copy interface
+  // For CpuMatrix this is an synchronous copy interface
+  virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  MatrixPtr subMatrix(size_t startRow,
+                      size_t endRow,
+                      size_t startCol,
+                      size_t endCol);
+
+  MatrixPtr subRowMatrix(size_t startRow, size_t endRow) {
+    return subMatrix(startRow, endRow, 0, getWidth());
+  }
+
+  MatrixPtr subColMatrix(size_t startCol, size_t endCol) {
+    return subMatrix(0, getHeight(), startCol, endCol);
+  }
+
+  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) {
+    CHECK_LE(startRow + numRows, getHeight());
+    return Matrix::create(getData() + startRow * getWidth(),
+                          numRows,
+                          getWidth(),
+                          trans_,
+                          useGpu_);
+  }
+  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) {
+    CHECK_LE(startRow + numRows, getHeight());
+    CHECK_EQ(useGpu_, dest->useGpu_);
+    dest->setData(this->rowBuf(startRow), numRows, getWidth());
+    return dest;
+  }
+
+  /**
+   * If this is GpuMatrix, src is assumed to be CPU memory
+   *
+   * If this is CpuMatrix, src is assumed to be CPU memory
+   */
+  virtual void copyFrom(const real* src, size_t size) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void copyFrom(const real* src, const int64_t* seq) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief convert a int vector to a real matrix.
+   *
+   * (1) source and dest are both in CPU.
+   *
+   * (2) sizes are exactly match.
+   */
+  virtual void copyFrom(const IVector& src) {
+    LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
+  }
+
+  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix,
+   *        NonValueSparseMatrix, etc.) as this.
+   *
+   * If height and width is zero, the new matrix will have the same size
+   * as this, otherwise the new matrix will have the specified size.
+   *
+   */
+  virtual MatrixPtr clone(size_t height = 0,
+                          size_t width = 0,
+                          bool useGpu = false) {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;
+  }
+
+  virtual real* getRowBuf(size_t row) {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;
+  }
+
+  virtual real getElement(size_t x, size_t y) const {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+
+  virtual real getSum() {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+
+  virtual void accumulateColSum(Matrix& src) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual real getAbsSum() {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+
+  /**
+   * @note Original data may not be preserved after resize().
+   */
+  virtual void resize(size_t newHeight, size_t newWidth) = 0;
+
+  /**
+   * @note This should only be used for sparse matrix.
+   */
+  virtual void resize(size_t newHeight,
+                      size_t newWidth,
+                      size_t newNnz, /* total item used to allocate space */
+                      SparseValueType valueType,
+                      SparseFormat format) = 0;
+
+  /**
+   * @brief This should only be used for sparse matrix.
+   *
+   * Currently must be called for each row in order.
+   * The matrix is not valid until setRow is called for the last row.
+   */
+  virtual void setRow(size_t row,
+                      size_t colNum,
+                      const unsigned int* cols,
+                      const real* values) = 0;
+
+  virtual MatrixPtr getTranspose() = 0;
+
+  /**
+   * @brief  hard transpose.
+   *
+   * allocate matTrans' memory outside, then set memAlloc as false;
+   * else set as true.
+   */
+  virtual void transpose(MatrixPtr& matTrans, bool memAlloc) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief  rotate 90 degrees in clock-wise if clockWise=true;
+   *         otherwise rotate in anti clock-wise
+   * clock-wise:
+   * \f[
+   *   y(j,i) = x(M-i-1,j)
+   * \f]
+   * anti clock-wise:
+   * \f[
+   *   y(j,i) = x(i, N-1-j)
+   * \f]
+   * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
+   *
+   * allocate matRot' memory outside, then set memAlloc as false;
+   * else set as true.
+   */
+  virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual MatrixPtr getInverse() {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;
+  }
+
+  /**
+   * @brief  inverse.
+   *
+   * if allocate matInv's memory outside, then set memAlloc as false;
+   * else set as true.
+   */
+  virtual void inverse(MatrixPtr& matInv, bool memAlloc) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+ public:
+  /// Only set all variables to 0 or NULL but not free them.
+  virtual void clear() {
+    height_ = 0;
+    width_ = 0;
+    data_ = NULL;
+  }
+
+  void reshape(size_t height, size_t width);
+
+  /// add b to each sample of this.
+  virtual void addBias(Matrix& b, real scale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void addSharedBias(Matrix& b, real scale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  void addBias(Matrix& b, real scale, bool sharedBias) {
+    if (!sharedBias) {
+      addBias(b, scale);
+    } else {
+      addSharedBias(b, scale);
+    }
+  }
+
+  /// add each sample from a to this.
+  virtual void collectBias(Matrix& a, real scale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void collectSharedBias(Matrix& a, real scale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  void collectBias(Matrix& a, real scale, bool sharedBias) {
+    if (!sharedBias) {
+      collectBias(a, scale);
+    } else {
+      collectSharedBias(a, scale);
+    }
+  }
+
+  virtual void sequenceAvgForward(Matrix& a,
+                                  const IVector& startsPos,
+                                  int mode) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void sequenceAvgBackward(Matrix& a,
+                                   const IVector& startsPos,
+                                   int mode) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * this = scaleAB*(a*b) + scaleT*this
+   * @endcode
+   */
+  virtual void mul(const Matrix& a,
+                   const Matrix& b,
+                   real scaleAB,
+                   real scaleT) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// Add a vector (column) b to matrix a, column by column.
+  virtual void addColumnVector(const Matrix& b) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   this(i, j) += vec(index(i, j), 0)
+   * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1
+   * @endcode
+   */
+  virtual void addByBitCode(size_t numClasses,
+                            const IVector& codes,
+                            const Matrix& vec) {
+    (void)numClasses;
+    (void)codes;
+    (void)vec;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   vec(index(i, j), 0) += this(i, j)
+   * where index is same as the index for addByBitCode
+   * @endcode
+   */
+  virtual void addByBitCodeBackward(size_t numClasses,
+                                    const IVector& codes,
+                                    Matrix& vec) {
+    (void)numClasses;
+    (void)codes;
+    (void)vec;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   this(i, j) += <mat.row(index(i, j)), input.row(i)>
+   * where index is same as the index for addByBitCode
+   * @endcode
+   */
+  virtual void mulByBitCode(size_t numClasses,
+                            const IVector& codes,
+                            const Matrix& mat,
+                            const Matrix& input) {
+    (void)numClasses;
+    (void)codes;
+    (void)mat;
+    (void)input;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   mat.row(index(i, j)) += this(i, j) * input.row(i)
+   * where index is same as the index for addByBitCode
+   * @endcode
+   */
+  virtual void mulByBitCodeBackwardWeight(size_t numClasses,
+                                          const IVector& codes,
+                                          Matrix& mat,
+                                          const Matrix& input) {
+    (void)numClasses;
+    (void)codes;
+    (void)mat;
+    (void)input;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   input.row(i) += this(i, j) * mat.row(index(i, j))
+   * where index is same as the index for addByBitCode
+   * @endcode
+   */
+  virtual void mulByBitCodeBackwardError(size_t numClasses,
+                                         const IVector& codes,
+                                         const Matrix& mat,
+                                         Matrix& input) {
+    (void)numClasses;
+    (void)codes;
+    (void)mat;
+    (void)input;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength
+   *   sum(i, 0) = scaleSum * \sum_j  bit(i, j) * this(i, j)
+   * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0
+   * @endcode
+   */
+  virtual void sumByBitCode(size_t numClasses,
+                            IVector& codes,
+                            Matrix& sum,
+                            real scaleSum) {
+    (void)numClasses;
+    (void)codes;
+    (void)sum;
+    (void)scaleSum;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength
+   *  this(i, j) -= bit(i, j)
+   * where bit(i, j) is same as that for sumByBitCode
+   * @endcode
+   */
+  virtual void subByBitCode(size_t numClasses_, IVector& codes) {
+    (void)numClasses_;
+    (void)codes;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * add the sum of each row of this to mat
+   */
+  virtual void rowSum(Matrix& sum) {
+    (void)sum;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * set the max of each row of this to mat
+   */
+  virtual void rowMax(Matrix& max) {
+    (void)max;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * set the max of each column of this to mat
+   */
+  virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
+
+  /**
+   * @brief Get the top k elements of each column of this matrix.
+   *
+   * The row ids and values of these elements are stored in
+   * maxIds and max respectively. where k is the size of maxIds.
+   * And note that the top k elements are not sorted.
+   */
+  virtual void colMax(IVector& maxIds, Matrix& maxVal) {
+    LOG(FATAL) << "not implemented";
+  }
+
+  virtual void maxoutForward(Matrix& a,
+                             IVector& id,
+                             size_t channels,
+                             size_t groups) {
+    LOG(FATAL) << "not implemented";
+  }
+
+  virtual void maxoutBackward(Matrix& a,
+                              IVector& id,
+                              size_t channels,
+                              size_t groups) {
+    LOG(FATAL) << "not implemented";
+  }
+
+  virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
+
+  /**
+   * @brief Get the top k elements of each row of this matrix.
+   *
+   * The column ids and values of these elements are stored in
+   * maxIds and max respectively. where k is the size of maxIds.
+   * And note that the top k elements are not sorted.
+   */
+  virtual void rowMax(IVector& maxIds, Matrix& max) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// normalize each row so that the sum of each row is 1.
+  virtual void rowNormalizeL1(Matrix& out) {
+    (void)out;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   *  this = a*b
+   * @endcode
+   */
+  virtual void mul(const Matrix& a, const Matrix& b) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * this = scaleAB*(this*b) +  scaleT*this
+   * @endcode
+   */
+  virtual void rightMul(Matrix& b, real scaleAB, real scaleT) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * this = this* b
+   * @endcode
+   */
+  virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; }
+
+  /**
+   * @code
+   * this = scaleAB*(a*this) +  scaleT*this
+   * @endcode
+   */
+  virtual void leftMul(Matrix& a, real scaleAB, real scaleT) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * this = a*this)
+   * @endcode
+   */
+  virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; }
+
+  /// merge the element for each col.
+  virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; }
+
+  /// copy -log(output[label]) to this->data[i].
+  virtual void oneHotCrossEntropy(Matrix& output, IVector& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// calculate the error of outputV according to label.
+  virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// copy -log(output[label]) to this->data[i].
+  virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output,
+                                              IVector& label,
+                                              real alpha) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// calculate the error of outputV according to label.
+  virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
+                                                IVector& label,
+                                                real alpha) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * \f[
+   *  a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j}
+   * \f]
+   *
+   * b contains M elements,
+   * c contains N elements (N is odd),
+   * b's index arithmetic is computed modulo M,
+   * c's index arithmetic is computed modulo N.
+   */
+  virtual void circularConv(Matrix& b, Matrix& c) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void circularConvDerivative(Matrix& output,
+                                      Matrix& prevOut1,
+                                      Matrix& prevOut2,
+                                      Matrix& prevGrad1,
+                                      Matrix& prevGrad2) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */
+  virtual void softmax(Matrix& output) {
+    (void)output;
+    LOG(FATAL) << "Not implemeted";
+  }
+  virtual void sequenceSoftmax(Matrix& output, const IVector& index) {
+    (void)output;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void softmaxBackward(Matrix& outputV) {
+    (void)outputV;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /*
+    sum_i = sum_j this_ij * output_ij
+    this_ij = output_ij* (this_ij - sum_i)
+  */
+  virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// calculate the sum of squares diff cost.
+  virtual void sumOfSquares(Matrix& output, Matrix& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// gradient of sumOfSquares.
+  virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void smoothL1(Matrix& output, Matrix& label, real destScale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; }
+
+  virtual void tanhDerivative(Matrix& output) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; }
+
+  virtual void softreluDerivative(Matrix& output) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void scaledTanh(Matrix& output, real p1, real p2) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// print out the values of elements to os
+  virtual void print(std::ostream& os) const {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * print a part of the matrix
+   * from the (top,left) value to the (height, width) value (not included)
+   */
+  virtual void print(std::ostream& os, size_t height, size_t width) const {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// print one row to os
+  virtual void printOneRow(std::ostream& os, size_t idx) const {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {}
+
+  virtual real getMin() {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+  virtual real getMax() {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+
+  virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; }
+
+  /**
+   * @brief  calulate the error of classification
+   *
+   * output[i] = 1 if row i is an error.
+   *
+   * output[i] = 0 if row i is correct.
+   *
+   */
+  virtual void classificationError(Matrix& output,
+                                   IVector& label,
+                                   size_t topkSize = 1) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void upsampleForward(Matrix& input,
+                               Matrix& mask,
+                               size_t imgSizeH,
+                               size_t imgSizeW,
+                               size_t channels,
+                               size_t outputH,
+                               size_t outputW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void upsampleBackward(Matrix& outputGrad,
+                                Matrix& mask,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t channels,
+                                size_t outputH,
+                                size_t outputW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * Pooling forward operation, pick out the largest element
+   * in the sizeX of value, if the maskMatP is not NULL, it will
+   * also caculate the location indices.
+   */
+  virtual void maxPoolForward(Matrix& inputMat,
+                              size_t imgSizeH,
+                              size_t imgSizeW,
+                              size_t channels,
+                              size_t sizeX,
+                              size_t sizeY,
+                              size_t strideH,
+                              size_t strideW,
+                              size_t outputH,
+                              size_t outputW,
+                              size_t paddingH,
+                              size_t paddingW,
+                              MatrixPtr maskMatP = NULL) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /// Pooling backward operation.
+  virtual void maxPoolBackward(Matrix& image,
+                               size_t imgSizeH,
+                               size_t imgSizeW,
+                               Matrix& outGrad,
+                               Matrix& outV,
+                               size_t sizeX,
+                               size_t sizeY,
+                               size_t strideH,
+                               size_t strideW,
+                               size_t outputH,
+                               size_t outputW,
+                               real scaleTargets,
+                               real scaleOutput,
+                               size_t paddingH,
+                               size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /// Pooling forward operation, caculate the average of sizeX elements.
+  virtual void avgPoolForward(Matrix& input,
+                              size_t imgSizeH,
+                              size_t imgSizeW,
+                              size_t channels,
+                              size_t sizeX,
+                              size_t sizeY,
+                              size_t strideH,
+                              size_t strideW,
+                              size_t outputH,
+                              size_t outputW,
+                              size_t paddingH,
+                              size_t paddingW,
+                              bool excludeMode = true) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void avgPoolBackward(Matrix& input,
+                               size_t imgSizeH,
+                               size_t imgSizeW,
+                               size_t sizeX,
+                               size_t sizeY,
+                               size_t strideH,
+                               size_t strideW,
+                               size_t outputH,
+                               size_t outputW,
+                               real scaleTargets,
+                               real scaleOutput,
+                               size_t paddingH,
+                               size_t paddingW,
+                               bool excludeMode = true) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * Pooling 3D forward operation, pick out the largest element
+   * in the sizeX of value
+   */
+  virtual void maxPool3DForward(Matrix& inputMat,
+                                Matrix& maxPoolIdx,
+                                size_t channels,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void maxPool3DBackward(Matrix& outGrad,
+                                 Matrix& maxPoolIdx,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW,
+                                 real scaleTargets,
+                                 real scaleOutput) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void avgPool3DForward(Matrix& input,
+                                size_t channels,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void avgPool3DBackward(Matrix& input,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW,
+                                 real scaleTargets,
+                                 real scaleOutput) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+ * Input: one or more sequences. Each sequence contains some instances.
+ *
+ * Output: output size is the number of input sequences (NOT input
+ * instances).
+ *
+ * output[i] is set to max_input[i].
+ */
+  virtual void maxSequenceForward(Matrix& input,
+                                  const IVector& sequence,
+                                  IVector& index) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void maxSequenceBackward(Matrix& outputGrad,
+                                   const IVector& sequence,
+                                   IVector& index) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * this.row[i] += table.row[ids[i]]
+   * if ids[i] == -1, it will be ignored
+   * @endcode
+   */
+  virtual void selectRows(Matrix& table, IVector& ids) {
+    (void)table;
+    (void)ids;
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * this[i] = table[i, id[i]]
+   * @endcode
+   */
+  virtual void selectElements(Matrix& table, IVector& ids) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * table.row[ids[i]] += this.row[i]
+   * if ids[i] == -1, it will be ignored
+   * @endcode
+   */
+  virtual void addToRows(Matrix& table, IVector& ids) {
+    (void)table;
+    (void)ids;
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * table[i, id[i]] += this[i]
+   * @endcode
+   */
+  virtual void addElements(Matrix& table, IVector& ids) {
+    LOG(FATAL) << "Not implemented";
+  }
+  /**
+   * @brief  cross entropy for multi binary labels
+   *
+   * @code
+   * this[i] = -sum(label[i][j]*log(output[i][j])
+   *           + (1-label[i][j])*log(1-output[i][j]))
+   * @endcode
+   */
+  virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief  The gradient of cross entropy for multi binary labels on output
+   *
+   * @code
+   * this[i][j] = -label[i][j]/output[i][j]
+   *              + (1-label[i][j])/(1-output[i][j])
+   * @endcode
+   */
+  virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief  Calculate the classification error for multi binary labels
+   *
+   * @code
+   * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0)
+   *            || (output[i][j] < threshold && label[i][j] == 1))
+   *            / output->getWidth()
+   * @endcode
+   */
+  virtual void classificationErrorMulti(Matrix& output,
+                                        Matrix& label,
+                                        real threshold) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void paramReluForward(Matrix& data, Matrix& W) {
+    LOG(FATAL) << "Not implemented";
+  }
+  virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) {
+    LOG(FATAL) << "Not implemented";
+  }
+  virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void vol2Col(real* data,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void col2Vol(real* trg,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real alpha,
+                       real beta) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void bilinearForward(const Matrix& in,
+                               const size_t inImgH,
+                               const size_t inImgW,
+                               const size_t outImgH,
+                               const size_t outImgW,
+                               const size_t numChannels,
+                               const real ratioH,
+                               const real ratioW) {
+    LOG(FATAL) << "Not implemented";
+  }
+  virtual void bilinearBackward(const Matrix& out,
+                                const size_t outImgH,
+                                const size_t outImgW,
+                                const size_t inImgH,
+                                const size_t inImgW,
+                                const size_t numChannels,
+                                const real ratioH,
+                                const real ratioW) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    if (useGpu_) {
+      TensorGpuApply<real>(*this, expr);
+    } else {
+      TensorCpuApply<real>(*this, expr);
+    }
+  }
+
+  bool isEmpty() const { return data_ == nullptr; }
+
+  explicit operator bool() const { return !isEmpty(); }
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
+  mat.print(os);
+  return os;
+}
+
+class GpuMatrix : public Matrix {
+ public:
+  GpuMatrix();
+
+  GpuMatrix(size_t height, size_t width, bool trans = false);
+  GpuMatrix(real* data, size_t height, size_t width, bool trans = false)
+      : Matrix(data, height, width, trans, true) {}
+  GpuMatrix(real* data,
+            size_t height,
+            size_t width,
+            size_t stride,
+            bool trans = false)
+      : Matrix(data, height, width, stride, trans, true) {}
+  GpuMatrix(GpuMemHandlePtr dataHandle,
+            size_t height,
+            size_t width,
+            bool trans = false)
+      : Matrix(dataHandle, height, width, trans, true) {}
+  ~GpuMatrix();
+
+  void zeroMem();
+  void resetOne();
+  void setDiag(real value);
+
+  void resize(size_t newHeight, size_t newWidth);
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {
+    LOG(FATAL) << "Only Support Sparse Matrix";
+  }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {
+    LOG(FATAL) << "Only Support Sparse Matrix";
+  }
+
+  /**
+   * Copy the data from cpu_memory buffer
+   */
+  void copyFrom(const real* hostSrc, size_t size);
+
+  void copyFrom(const real* hostSrc, const int64_t* seq);
+
+  void copyFrom(const Matrix& src, hl_stream_t stream);
+
+  void copyFrom(const Matrix& src);
+
+  void copyFrom(const IVector& src);
+
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
+
+  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
+
+  real getElement(size_t x, size_t y) const;
+
+  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
+  virtual real* getRowBuf(size_t row) { return getRow(row); }
+
+  real getSum();
+  void accumulateColSum(Matrix& src);
+  real getAbsSum();
+
+  real getMin();
+  real getMax();
+
+  MatrixPtr getTranspose();
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
+
+  MatrixPtr getInverse();
+  void inverse(MatrixPtr& matInv, bool memAlloc);
+
+  /// add b to each sample of this.
+  void addBias(Matrix& b, real scale);
+  void addSharedBias(Matrix& b, real scale);
+
+  /**
+   * @code
+   * add each sample from a to this.
+   * @endcode
+   */
+  void collectBias(Matrix& a, real scale);
+  void collectSharedBias(Matrix& a, real scale);
+
+  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
+  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
+
+  /**
+   * @code
+   * this.row[i] += table.row[ids[i]]
+   * @endcode
+   */
+  virtual void selectRows(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * this[i] = table[i, id[i]]
+   * @endcode
+   */
+  virtual void selectElements(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * table.row[ids[i]] += this.row[i]
+   * @endcode
+   */
+  virtual void addToRows(Matrix& table, IVector& ids);
+
+  void addColumnVector(const Matrix& b);
+
+  /**
+   * @code
+   * this = scaleAB*(a*b) + scaleT*this
+   * @endcode
+   */
+  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
+
+  /**
+   * @code
+   * this = a*b
+   * @endcode
+   */
+  void mul(const Matrix& a, const Matrix& b);
+
+  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
+
+  void mul(const GpuSparseMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT);
+
+  void mul(const GpuMatrix& a,
+           const GpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT);
+
+  /**
+   * @code
+   * this = scaleAB*(this*b) +  scaleT*this
+   * @endcode
+   */
+  void rightMul(Matrix& b, real scaleAB, real scaleT);
+
+  /**
+   * @code
+   * this = this* b
+   * @endcode
+   */
+  void rightMul(Matrix& b);
+
+  /**
+   * @code
+   * this = scaleAB*(a*this) +  scaleT*this
+   * @endcode
+   */
+  void leftMul(Matrix& a, real scaleAB, real scaleT);
+
+  /**
+   * @code
+   * this = a*this
+   * @endcode
+   */
+  void leftMul(Matrix& a);
+
+  void colMerge(Matrix& src);
+  void rowSum(Matrix& sum);
+  void rowMax(Matrix& max);
+  void rowMax(IVector& maxIds, Matrix& max);
+  void colMax(Matrix& max);
+  void colMax(IVector& maxIds, Matrix& max);
+  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
+
+  void oneHotCrossEntropy(Matrix& output, IVector& label);
+  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
+  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
+                                      IVector& label,
+                                      real alpha);
+  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
+                                        IVector& label,
+                                        real alpha);
+
+  void softmax(Matrix& output);
+  void sequenceSoftmax(Matrix& output, const IVector& index);
+  void softmaxBackward(Matrix& outputV);
+  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
+
+  /// calculate the sum of squares diff cost.
+  void sumOfSquares(Matrix& output, Matrix& label);
+
+  /// gradient of sumOfSquares.
+  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
+  void tanh(Matrix& output);
+  void tanhDerivative(Matrix& output);
+  void softrelu(Matrix& output);
+  void softreluDerivative(Matrix& output);
+  void scaledTanh(Matrix& output, real p1, real p2);
+
+  virtual void print(std::ostream& os) const;
+  virtual void print(std::ostream& os, size_t height, size_t width) const;
+
+  void paramReluForward(Matrix& data, Matrix& W);
+  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
+  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
+
+  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
+  void randomizeUniform();
+
+  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
+
+  void upsampleForward(Matrix& input,
+                       Matrix& mask,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       size_t channels,
+                       size_t outputH,
+                       size_t outputW);
+
+  void upsampleBackward(Matrix& outputGrad,
+                        Matrix& mask,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t channels,
+                        size_t outputH,
+                        size_t outputW);
+
+  void maxPoolForward(Matrix& inputMat,
+                      size_t imgSizeH,
+                      size_t imgSizeW,
+                      size_t channels,
+                      size_t sizeX,
+                      size_t sizeY,
+                      size_t strideH,
+                      size_t strideW,
+                      size_t outputH,
+                      size_t outputW,
+                      size_t paddingH,
+                      size_t paddingW,
+                      MatrixPtr maskMatP);
+
+  void maxPoolBackward(Matrix& image,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       Matrix& outGrad,
+                       Matrix& outV,
+                       size_t sizeX,
+                       size_t sizeY,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t outputH,
+                       size_t outputW,
+                       real scaleTargets,
+                       real scaleOutput,
+                       size_t paddingH,
+                       size_t paddingW);
+
+  void avgPoolForward(Matrix& input,
+                      size_t imgSizeH,
+                      size_t imgSizeW,
+                      size_t channels,
+                      size_t sizeX,
+                      size_t sizeY,
+                      size_t strideH,
+                      size_t strideW,
+                      size_t outputH,
+                      size_t outputW,
+                      size_t paddingH,
+                      size_t paddingW,
+                      bool excludeMode = true);
+
+  void avgPoolBackward(Matrix& input,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       size_t sizeX,
+                       size_t sizeY,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t outputH,
+                       size_t outputW,
+                       real scaleTargets,
+                       real scaleOutput,
+                       size_t paddingH,
+                       size_t paddingW,
+                       bool excludeMode = true);
+
+  void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void maxSequenceForward(Matrix& input,
+                          const IVector& sequence,
+                          IVector& index);
+
+  void maxSequenceBackward(Matrix& outputGrad,
+                           const IVector& sequence,
+                           IVector& index);
+
+  void bilinearForward(const Matrix& in,
+                       const size_t inImgH,
+                       const size_t inImgW,
+                       const size_t outImgH,
+                       const size_t outImgW,
+                       const size_t numChannels,
+                       const real ratioH,
+                       const real ratioW);
+
+  void bilinearBackward(const Matrix& out,
+                        const size_t outImgH,
+                        const size_t outImgW,
+                        const size_t inImgH,
+                        const size_t inImgW,
+                        const size_t numChannels,
+                        const real ratioH,
+                        const real ratioW);
+
+  void vol2Col(real* data,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW);
+
+  void col2Vol(real* trg,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW,
+               real alpha,
+               real beta);
+
+  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
+
+  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorGpuApply<real>(*this, expr);
+  }
+};
+
+class CpuMatrix : public Matrix {
+ private:
+  MatrixPtr sftmaxSum_;
+  MatrixPtr sftmaxDot_;
+
+ public:
+  CpuMatrix(size_t height, size_t width, bool trans = false);
+  CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
+      : Matrix(data, height, width, trans, false) {}
+  CpuMatrix(real* data,
+            size_t height,
+            size_t width,
+            size_t stride,
+            bool trans = false)
+      : Matrix(data, height, width, stride, trans, false) {}
+
+  CpuMatrix(CpuMemHandlePtr dataHandle,
+            size_t height,
+            size_t width,
+            bool trans = false)
+      : Matrix(dataHandle, height, width, trans, false) {}
+
+  ~CpuMatrix();
+
+  void zeroMem();
+  void resetOne();
+  void setDiag(real value);
+
+  void resize(size_t newHeight, size_t newWidth);
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {
+    LOG(FATAL) << "Only Support Sparse Matrix";
+  }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {
+    LOG(FATAL) << "Only Support Sparse Matrix";
+  }
+
+  real getElement(size_t x, size_t y) const;
+  real getSum();
+  void accumulateColSum(Matrix& src);
+  real getAbsSum();
+
+  MatrixPtr getTranspose();
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
+
+  MatrixPtr getInverse();
+  void inverse(MatrixPtr& matInv, bool memAlloc);
+
+  void copyFrom(const Matrix& src);
+
+  void copyFrom(const Matrix& src, hl_stream_t stream);
+
+  void copyFrom(const real* cpuSrc, size_t size);
+
+  void copyFrom(const real* cpuSrc, const int64_t* seq);
+
+  void copyFrom(const IVector& src);
+
+  void copyFrom(CpuSparseMatrix& src);
+
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
+
+  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
+
+  void upsampleForward(Matrix& input,
+                       Matrix& mask,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       size_t channels,
+                       size_t outputH,
+                       size_t outputW);
+
+  void upsampleBackward(Matrix& outputGrad,
+                        Matrix& mask,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t channels,
+                        size_t outputH,
+                        size_t outputW);
+
+  void maxPoolForward(Matrix& inputMat,
+                      size_t imgSizeH,
+                      size_t imgSizeW,
+                      size_t channels,
+                      size_t sizeX,
+                      size_t sizeY,
+                      size_t strideH,
+                      size_t strideW,
+                      size_t outputH,
+                      size_t outputW,
+                      size_t paddingH,
+                      size_t paddingW,
+                      MatrixPtr maskMatP);
+
+  void maxPoolBackward(Matrix& image,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       Matrix& outGrad,
+                       Matrix& outV,
+                       size_t sizeX,
+                       size_t sizeY,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t outputH,
+                       size_t outputW,
+                       real scaleTargets,
+                       real scaleOutput,
+                       size_t paddingH,
+                       size_t paddingW);
+
+  void avgPoolForward(Matrix& input,
+                      size_t imgSizeH,
+                      size_t imgSizeW,
+                      size_t channels,
+                      size_t sizeX,
+                      size_t sizeY,
+                      size_t strideH,
+                      size_t strideW,
+                      size_t outputH,
+                      size_t outputW,
+                      size_t paddingH,
+                      size_t paddingW,
+                      bool excludeMode = true);
+
+  void avgPoolBackward(Matrix& input,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       size_t sizeX,
+                       size_t sizeY,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t outputH,
+                       size_t outputW,
+                       real scaleTargets,
+                       real scaleOutput,
+                       size_t paddingH,
+                       size_t paddingW,
+                       bool excludeMode = true);
+
+  void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void maxSequenceForward(Matrix& input,
+                          const IVector& sequence,
+                          IVector& index);
+
+  void maxSequenceBackward(Matrix& outputGrad,
+                           const IVector& sequence,
+                           IVector& index);
+
+  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
+  virtual real* getRowBuf(size_t row) { return getRow(row); }
+
+ public:
+  /// add b to each sample of this.
+  void addBias(Matrix& b, real scale);
+  void addSharedBias(Matrix& b, real scale);
+
+  /// add each sample of a to this.
+  void collectBias(Matrix& a, real scale);
+  void collectSharedBias(Matrix& a, real scale);
+
+  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
+  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
+
+  /**
+   * @code
+   * this.row[i] += table.row[ids[i]]
+   * @endcode
+   */
+  virtual void selectRows(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * table.row[ids[i]] += this.row[i]
+   * @endcode
+   */
+  virtual void addToRows(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * this[i] = table[i, id[i]]
+   * @endcode
+   */
+  virtual void selectElements(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * table[i, id[i]] += this[i]
+   * @endcode
+   */
+  virtual void addElements(Matrix& table, IVector& ids);
+
+  /**
+   * use abstract getRow() to get row from table.
+   *
+   * Define table as template instead of virtual class for performance sake.
+   * internal used by above two virtual funcs.
+   */
+  template <typename TableMatType>
+  void selectRowsImp(TableMatType& table, IVector& ids);
+  template <typename TableMatType>
+  void addToRowsImp(TableMatType& table, IVector& ids);
+
+  void addColumnVector(const Matrix& b);
+
+  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
+  void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+
+  void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT);
+
+  static void mul(CpuMatrix* a,
+                  CpuMatrix* b,
+                  CpuSparseMatrix* c,
+                  real scaleAB,
+                  real scaleT);
+
+  /**
+   * c = a * b
+   *
+   * use abstract getRow() to get row from B,C.
+   * Define B,C as template instead of virtual class for performance sake.
+   */
+  template <typename MatBType, typename MatCType>
+  static void mul(
+      CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT);
+
+  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+
+  void mul(const Matrix& a, const Matrix& b);
+
+  void rightMul(Matrix& b, real scaleAB, real scaleT);
+  void rightMul(Matrix& b);
+
+  void leftMul(Matrix& a, real scaleAB, real scaleT);
+  void leftMul(Matrix& a);
+  void colMerge(Matrix& src);
+  void rowSum(Matrix& sum);
+  void rowMaxId(IVector& maxIds);
+  void rowMax(Matrix& max);
+  void rowMax(IVector& maxIds, Matrix& maxVal);
+  void colMax(Matrix& max);
+  void colMax(IVector& maxIds, Matrix& maxVal);
+  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void rowNormalizeL1(Matrix& out);
+
+  void oneHotCrossEntropy(Matrix& output, IVector& label);
+  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
+  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
+                                      IVector& label,
+                                      real alpha);
+  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
+                                        IVector& label,
+                                        real alpha);
+
+  void circularConv(Matrix& b, Matrix& c);
+  void circularConvDerivative(Matrix& output,
+                              Matrix& prevOut1,
+                              Matrix& prevOut2,
+                              Matrix& prevGrad1,
+                              Matrix& prevGrad2);
+
+  void softmax(Matrix& output);
+  void sequenceSoftmax(Matrix& output, const IVector& index);
+  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
+
+  /// calculate the sum of squares diff cost.
+  void sumOfSquares(Matrix& output, Matrix& label);
+
+  /// gradient of sumOfSquares.
+  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
+
+  void smoothL1(Matrix& output, Matrix& label, real destScale);
+  void smoothL1Bp(Matrix& output, Matrix& label, real destScale);
+
+  void tanh(Matrix& output);
+  void tanhDerivative(Matrix& output);
+
+  void softrelu(Matrix& output);
+  void softreluDerivative(Matrix& output);
+  void scaledTanh(Matrix& output, real p1, real p2);
+
+  void print(std::ostream& os) const;
+  void print(std::ostream& os, size_t height, size_t width) const;
+  void printOneRow(std::ostream& os, size_t idx) const;
+
+  void paramReluForward(Matrix& data, Matrix& W);
+  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
+  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
+
+  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
+
+  real getMin();
+  real getMax();
+
+  void randomizeUniform();
+
+  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
+
+  void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
+
+  void addByBitCodeBackward(size_t numClasses,
+                            const IVector& codes,
+                            Matrix& vec);
+
+  void mulByBitCode(size_t numClasses,
+                    const IVector& codes,
+                    const Matrix& mat,
+                    const Matrix& input);
+
+  void mulByBitCodeBackwardWeight(size_t numClasses,
+                                  const IVector& codes,
+                                  Matrix& mat,
+                                  const Matrix& input);
+
+  void mulByBitCodeBackwardError(size_t numClasses,
+                                 const IVector& codes,
+                                 const Matrix& mat,
+                                 Matrix& input);
+
+  void sumByBitCode(size_t numClasses,
+                    IVector& codes,
+                    Matrix& sum,
+                    real scaleSum);
+
+  void subByBitCode(size_t numClasses_, IVector& codes);
+
+  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
+  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
+  void classificationErrorMulti(Matrix& output, Matrix& label, real threshold);
+
+  void bilinearForward(const Matrix& in,
+                       const size_t inImgH,
+                       const size_t inImgW,
+                       const size_t outImgH,
+                       const size_t outImgW,
+                       const size_t numChannels,
+                       const real ratioH,
+                       const real ratioW);
+
+  void bilinearBackward(const Matrix& out,
+                        const size_t outImgH,
+                        const size_t outImgW,
+                        const size_t inImgH,
+                        const size_t inImgW,
+                        const size_t numChannels,
+                        const real ratioH,
+                        const real ratioW);
+
+  void vol2Col(real* data,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW);
+
+  void col2Vol(real* trg,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW,
+               real alpha,
+               real beta);
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorCpuApply<real>(*this, expr);
+  }
+};
+
+class SharedCpuMatrix : public CpuMatrix {
+ public:
+#ifndef PADDLE_MOBILE_INFERENCE
+  /* blockNum is number of partitions of the matrix  */
+  SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
+      : CpuMatrix(height, width, trans) {
+    initShared(blockNum);
+  }
+  SharedCpuMatrix(
+      int blockNum, real* data, size_t height, size_t width, bool trans = false)
+      : CpuMatrix(data, height, width, trans) {
+    initShared(blockNum);
+  }
+
+  SharedCpuMatrix(int blockNum,
+                  CpuMemHandlePtr dataHandle,
+                  size_t height,
+                  size_t width,
+                  bool trans = false)
+      : CpuMatrix(dataHandle, height, width, trans) {
+    initShared(blockNum);
+  }
+
+  SharedCpuMatrix(CpuMemHandlePtr dataHandle,
+                  size_t height,
+                  size_t width,
+                  bool trans = false)
+      : CpuMatrix(dataHandle, height, width, trans) {
+    initBlock(1);
+  }
+
+  ~SharedCpuMatrix() {}
+
+ public:
+  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+  virtual void add(Matrix& b, real p1, real p2);
+  virtual void add(real p1, real p2);
+
+ private:
+  using Matrix::mul;
+  void initShared(int blockNum);
+  void initBlock(int blockNum);
+
+  int blockNum_;
+  std::vector<std::unique_ptr<std::mutex>> blockLocks_;
+  ThreadLocal<CpuMatrixPtr> localBuf_;
+  ThreadLocal<std::vector<int>> localBufRows_;
+  ThreadLocal<std::vector<int>> blockSeq_;
+#endif
+};
+
+typedef struct { unsigned int col; } sparse_non_value_t;
+
+typedef struct {
+  unsigned int col;
+  float value;
+} sparse_float_value_t;
+
+}  // namespace paddle
+#include "ExecViaCpu.h"
diff --git a/paddle/legacy/math/MatrixBitCode.cpp b/paddle/legacy/math/MatrixBitCode.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f35f266a30506110eb6c656f7b631d12d8f6ae90
--- /dev/null
+++ b/paddle/legacy/math/MatrixBitCode.cpp
@@ -0,0 +1,291 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Matrix.h"
+#include "hl_gpu.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+namespace {
+
+struct SimpleCode {
+  SimpleCode(size_t code, size_t numClasses) : c_(code + numClasses) {}
+  inline size_t calcIndex(int bit) const { return (c_ >> (bit + 1)) - 1; }
+  inline bool calcBit(int bit) const { return c_ & (1 << bit); }
+  inline int getLength() const { return findLastSet(c_) - 1; }
+
+ private:
+  size_t c_;
+};
+
+struct SimpleCodeTable {
+  explicit SimpleCodeTable(size_t numClasses) : numClasses_(numClasses) {}
+  SimpleCode operator()(size_t code) const {
+    return SimpleCode(code, numClasses_);
+  }
+  size_t size() const { return numClasses_; }
+  int getMaxCodeLength() const { return findLastSet(numClasses_ - 1); }
+
+ private:
+  size_t numClasses_;
+  int maxCodeLength_;
+};
+
+}  // namespace
+
+/**
+ * CodeTable class should support 3 functions:
+ *
+ * size_t size()
+ *   return the number of codes
+ *
+ * int getMaxCodeLength()
+ *   return the maximal code length
+ *
+ * Code operator()(size_t i)
+ *   return the i-th code. Code class is descriebed below.
+ *
+ * Code class should support 3 functions:
+ *
+ * int getLength()
+ *   return the length of the code
+ *
+ * bool calcIndex(int bit)
+ *   bit ranges from 0 to getLength() - 1
+ *   return the index for the (1+bit) level parent
+ *
+ * bool calcBit(int bit)
+ *   return true if the bit level parent is the right child of (1+bit) level
+ *   parent
+ *
+ */
+
+/*
+   for i:
+     for j < codeLength:
+       op(tmat(i, j), vec(0, index(i, j)))
+*/
+template <class CodeTable, class Op, class TMat, class Mat>
+static void addByBitCodeT(
+    Op op, CodeTable codeTable, const IVector& codes, TMat& tmat, Mat& vec) {
+  CHECK(!vec.useGpu());
+
+  size_t numClasses = codeTable.size();
+  size_t maxCodeLength = codeTable.getMaxCodeLength();
+  size_t numSamples = tmat.getHeight();
+  size_t oWidth = tmat.getWidth();
+  CHECK_EQ(tmat.getWidth(), maxCodeLength);
+  CHECK_EQ(codes.getSize(), numSamples);
+  CHECK_EQ(vec.getHeight(), (size_t)1);
+  CHECK_EQ(vec.getWidth(), numClasses - 1);
+
+  auto data = tmat.getData();
+  auto v = vec.getData();
+  const int* c = codes.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    auto code = codeTable(c[i]);
+    int codeLength = code.getLength();
+    for (int j = 0; j < codeLength; ++j) {
+      size_t index = code.calcIndex(j);
+      op(data[i * oWidth + j], v[index]);
+    }
+  }
+}
+
+/* For j < codeLength:
+   this(i, j) += vec(0, index(i, j))
+*/
+void CpuMatrix::addByBitCode(size_t numClasses,
+                             const IVector& codes,
+                             const Matrix& vec) {
+  auto op = [](real& t, real v) { t += v; };
+  addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec);
+}
+
+/* For j < codeLength:
+   vec(0, index(i, j)) += this(i, j)
+*/
+void CpuMatrix::addByBitCodeBackward(size_t numClasses,
+                                     const IVector& codes,
+                                     Matrix& vec) {
+  auto op = [](real t, real& v) { v += t; };
+  addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec);
+}
+
+/*
+  for i:
+    for j < codeLength:
+      op(tmat(i, j), mat.row(index(i, j)), input.row(i))
+*/
+template <class Op,
+          class CodeTable,
+          class IVec,
+          class TMat,
+          class WMat,
+          class InMat>
+void mulByBitCodeT(Op op,
+                   CodeTable codeTable,
+                   IVec& codes,
+                   TMat& tmat,
+                   WMat& weight,
+                   InMat& input) {
+  CHECK(!tmat.useGpu() && !weight.useGpu() && !input.useGpu());
+
+  size_t numClasses = codeTable.size();
+  size_t maxCodeLength = codeTable.getMaxCodeLength();
+  size_t numSamples = tmat.getHeight();
+  size_t inputDim = input.getWidth();
+  size_t oWidth = tmat.getWidth();
+  CHECK_EQ(tmat.getWidth(), maxCodeLength);
+  CHECK_EQ(codes.getSize(), numSamples);
+  CHECK_EQ(input.getHeight(), numSamples);
+  CHECK_EQ(weight.getHeight(), numClasses - 1);
+  CHECK_EQ(weight.getWidth(), inputDim);
+
+  real* data = tmat.getData();
+  const int* c = codes.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    auto code = codeTable(c[i]);
+    int codeLength = code.getLength();
+    for (int j = 0; j < codeLength; ++j) {
+      size_t index = code.calcIndex(j);
+      op(data[i * oWidth + j], weight.rowBuf(index), input.rowBuf(i), inputDim);
+    }
+  }
+}
+
+/* For j < codeLength:
+   this(i, j) += <weight.row(index(i, j)), input.row(i)>
+*/
+void CpuMatrix::mulByBitCode(size_t numClasses,
+                             const IVector& codes,
+                             const Matrix& weight,
+                             const Matrix& input) {
+  auto op = [](
+      real& t, const real* weightRow, const real* inputRow, size_t inputDim) {
+    real sum = 0;
+    for (size_t k = 0; k < inputDim; ++k) {
+      sum += weightRow[k] * inputRow[k];
+    }
+    t += sum;
+  };
+
+  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
+}
+
+/* For index(i, j) >= 0:
+   weight.row(index(i, j)) += this(i, j) * input.row(i)
+*/
+void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses,
+                                           const IVector& codes,
+                                           Matrix& weight,
+                                           const Matrix& input) {
+  auto op = [](
+      const real t, real* weightRow, const real* inputRow, size_t inputDim) {
+    for (size_t k = 0; k < inputDim; ++k) {
+      weightRow[k] += t * inputRow[k];
+    }
+  };
+
+  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
+}
+
+/* For j < codeLength:
+   input.row(i) += this(i, j) * weight.row(index(i, j))
+*/
+void CpuMatrix::mulByBitCodeBackwardError(size_t numClasses,
+                                          const IVector& codes,
+                                          const Matrix& weight,
+                                          Matrix& input) {
+  auto op = [](
+      const real t, const real* weightRow, real* inputRow, size_t inputDim) {
+    for (size_t k = 0; k < inputDim; ++k) {
+      inputRow[k] += t * weightRow[k];
+    }
+  };
+
+  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
+}
+
+template <class CodeTable>
+void sumByBitCodeT(CodeTable codeTable,
+                   IVector& codes,
+                   const CpuMatrix& tmat,
+                   Matrix& sum,
+                   real scaleSum) {
+  size_t maxCodeLength = codeTable.getMaxCodeLength();
+  size_t numSamples = tmat.getHeight();
+  size_t oWidth = tmat.getWidth();
+  CHECK_EQ(tmat.getWidth(), maxCodeLength);
+  CHECK_EQ(codes.getSize(), numSamples);
+  CHECK_EQ(sum.getHeight(), numSamples);
+  CHECK_EQ(sum.getWidth(), (size_t)1);
+
+  const real* data = tmat.getData();
+  real* s = sum.getData();
+  int* c = codes.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    real sm = 0;
+    auto code = codeTable(c[i]);
+    int codeLength = code.getLength();
+    for (int j = 0; j < codeLength; ++j) {
+      if (code.calcBit(j)) {
+        sm += data[i * oWidth + j];
+      }
+    }
+    s[i] = scaleSum * sm;
+  }
+}
+
+/* For j < codeLength:
+   sum(i, 0) = \sum_j  bit(i, j) * this(i, j)
+*/
+void CpuMatrix::sumByBitCode(size_t numClasses,
+                             IVector& codes,
+                             Matrix& sum,
+                             real scaleSum) {
+  sumByBitCodeT(SimpleCodeTable(numClasses), codes, *this, sum, scaleSum);
+}
+
+template <class CodeTable>
+void subByBitCodeT(CodeTable codeTable, IVector& codes, CpuMatrix& tmat) {
+  size_t maxCodeLength = codeTable.getMaxCodeLength();
+  size_t numSamples = tmat.getHeight();
+  size_t oWidth = tmat.getWidth();
+  CHECK_EQ(tmat.getWidth(), maxCodeLength);
+  CHECK_EQ(codes.getSize(), numSamples);
+
+  real* data = tmat.getData();
+  int* c = codes.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    auto code = codeTable(c[i]);
+    int codeLength = code.getLength();
+    for (int j = 0; j < codeLength; ++j) {
+      if (code.calcBit(j)) {
+        data[i * oWidth + j] -= 1;
+      }
+    }
+  }
+}
+
+/* For j < codeLength
+   this(i, j) -= bit(i, j)
+*/
+void CpuMatrix::subByBitCode(size_t numClasses, IVector& codes) {
+  subByBitCodeT(SimpleCodeTable(numClasses), codes, *this);
+}
+
+}  // namespace paddle
diff --git a/paddle/math/MemoryHandle.cpp b/paddle/legacy/math/MemoryHandle.cpp
similarity index 100%
rename from paddle/math/MemoryHandle.cpp
rename to paddle/legacy/math/MemoryHandle.cpp
diff --git a/paddle/legacy/math/MemoryHandle.h b/paddle/legacy/math/MemoryHandle.h
new file mode 100644
index 0000000000000000000000000000000000000000..516e09dbed47ac6b039ccb094614c9588eeb3cd5
--- /dev/null
+++ b/paddle/legacy/math/MemoryHandle.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "PoolAllocator.h"
+
+namespace paddle {
+
+class MemoryHandle {
+ protected:
+  explicit MemoryHandle(size_t size);
+  virtual ~MemoryHandle() {}
+
+ public:
+  void* getBuf() const { return buf_; }
+  size_t getSize() const { return size_; }
+  size_t getAllocSize() const { return allocSize_; }
+
+ protected:
+  PoolAllocator* allocator_;
+  size_t size_;       // the requested size
+  size_t allocSize_;  // the allocated size
+  int deviceId_;      // the device id of memory if gpu memory
+  void* buf_;
+};
+
+/**
+ * Wrapper class for raw gpu memory handle.
+ *
+ * The raw handle will be released at destructor
+ */
+class GpuMemoryHandle : public MemoryHandle {
+ public:
+  explicit GpuMemoryHandle(size_t size);
+  virtual ~GpuMemoryHandle();
+};
+
+/**
+ * Wrapper class for raw cpu memory handle.
+ *
+ * The raw handle will be released at destructor
+ */
+class CpuMemoryHandle : public MemoryHandle {
+ public:
+  explicit CpuMemoryHandle(size_t size);
+  virtual ~CpuMemoryHandle();
+};
+
+typedef std::shared_ptr<MemoryHandle> MemoryHandlePtr;
+typedef std::shared_ptr<CpuMemoryHandle> CpuMemHandlePtr;
+typedef std::shared_ptr<GpuMemoryHandle> GpuMemHandlePtr;
+}  // namespace paddle
diff --git a/paddle/math/NEONFunctions.cpp b/paddle/legacy/math/NEONFunctions.cpp
similarity index 100%
rename from paddle/math/NEONFunctions.cpp
rename to paddle/legacy/math/NEONFunctions.cpp
diff --git a/paddle/math/NEONFunctions.h b/paddle/legacy/math/NEONFunctions.h
similarity index 100%
rename from paddle/math/NEONFunctions.h
rename to paddle/legacy/math/NEONFunctions.h
diff --git a/paddle/math/PoolAllocator.cpp b/paddle/legacy/math/PoolAllocator.cpp
similarity index 100%
rename from paddle/math/PoolAllocator.cpp
rename to paddle/legacy/math/PoolAllocator.cpp
diff --git a/paddle/legacy/math/PoolAllocator.h b/paddle/legacy/math/PoolAllocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..7239cf1c4494e207081e325a7e6067ba26a9c852
--- /dev/null
+++ b/paddle/legacy/math/PoolAllocator.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+#include "Allocator.h"
+
+namespace paddle {
+
+/**
+ * @brief Memory pool allocator implementation.
+ */
+class PoolAllocator {
+ public:
+  /**
+   * @brief constructor.
+   * @param allocator a Allocator object.
+   * @param sizeLimit The maximum size memory can be managed,
+   * if sizeLimit == 0, the pool allocator is a simple wrapper of allocator.
+   */
+  PoolAllocator(Allocator* allocator,
+                size_t sizeLimit = 0,
+                const std::string& name = "pool");
+
+  /**
+   * @brief destructor.
+   */
+  ~PoolAllocator();
+
+  void* alloc(size_t size);
+  void free(void* ptr, size_t size);
+  std::string getName() { return name_; }
+
+ private:
+  void freeAll();
+  void printAll();
+  std::unique_ptr<Allocator> allocator_;
+  std::mutex mutex_;
+  std::unordered_map<size_t, std::vector<void*>> pool_;
+  size_t sizeLimit_;
+  size_t poolMemorySize_;
+  std::string name_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/RowBuffer.h b/paddle/legacy/math/RowBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..9dfd5eff06a39494cea6a8ce0b1f5ead6490b148
--- /dev/null
+++ b/paddle/legacy/math/RowBuffer.h
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "MemoryHandle.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * @brief The RowBuffer class
+ * Represent the SparseRow Matrix Data.
+ *
+ * If not set memory handler, then the data could be auto growth.
+ */
+class RowBuffer {
+ public:
+  /**
+   * @brief RowBuffer create a auto-growth row buffer. The row length is width.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  explicit RowBuffer(size_t width) : width_(width) {}
+
+  /**
+   * @brief RowBuffer create a row buffer, which cannot be auto-growth.
+   * @param mem the pre-allocated memory.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
+      : preallocatedBuf_(mem), width_(width) {}
+
+  /**
+   * @brief resize resize the buffer with rowCount
+   * @param rowCnt number of row. matrix height.
+   */
+  inline void resize(int rowCnt) {
+    if (preallocatedBuf_) {
+      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
+    } else {
+      rowStore_.resize(rowCnt * width_);
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index.
+   * @param row the index of row.
+   * @return row buffer.
+   */
+  inline real* get(int row) const {
+    if (preallocatedBuf_) {
+      CHECK_LE((row)*width_ * sizeof(real), preallocatedBuf_->getSize());
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
+    } else {
+      CHECK_LE((row + 1) * width_, rowStore_.size());
+      return const_cast<real*>(rowStore_.data() + row * width_);
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index. If row index is larger than local
+   *        buffer, the size of local buffer will grow.
+   * @param row the index of row.
+   * @return row buffer.
+   */
+  inline real* getWithAutoGrowth(int row) {
+    if (preallocatedBuf_) {
+      return get(row);
+    } else {
+      if ((rowStore_.size() <= row * width_)) {
+        rowStore_.resize((row + 1) * width_);
+      }
+      return rowStore_.data() + row * width_;
+    }
+  }
+
+  /**
+   * @return raw data buffer.
+   */
+  inline real* data() {
+    if (preallocatedBuf_) {
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
+    } else {
+      return rowStore_.data();
+    }
+  }
+
+  /**
+   * @brief clear local buffer. It only affect auto-growth buffer.
+   */
+  inline void clear() {
+    // swap an empty vector to it to free the memory.
+    std::vector<real, AlignedAllocator<real, 32>> empty;
+    rowStore_.swap(empty);
+  }
+
+  /**
+   * @brief get current number of rows.
+   * @return number of rows.
+   */
+  inline size_t getRowCount() const {
+    if (preallocatedBuf_) {
+      return preallocatedBuf_->getSize() / sizeof(real) / width_;
+    } else {
+      return rowStore_.size() / width_;
+    }
+  }
+
+  /**
+   * @brief get is this buffer can automatically grow or not.
+   * @return ture if can automacitally grow.
+   */
+  inline bool isAutoGrowth() const { return !preallocatedBuf_; }
+
+  /**
+   * @brief return the width of matrix. a.k.a length of row.
+   * @return width of matrix
+   */
+  inline size_t getWidth() const { return width_; }
+
+ private:
+  //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
+  //! of std::vector here.
+  CpuMemHandlePtr preallocatedBuf_;
+  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  size_t width_;
+};
+}  // namespace paddle
diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/legacy/math/SIMDFunctions.cpp
similarity index 100%
rename from paddle/math/SIMDFunctions.cpp
rename to paddle/legacy/math/SIMDFunctions.cpp
diff --git a/paddle/math/SIMDFunctions.h b/paddle/legacy/math/SIMDFunctions.h
similarity index 100%
rename from paddle/math/SIMDFunctions.h
rename to paddle/legacy/math/SIMDFunctions.h
diff --git a/paddle/legacy/math/SparseMatrix.cpp b/paddle/legacy/math/SparseMatrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6f68252b0a74802946e899e6e13e1da681d76986
--- /dev/null
+++ b/paddle/legacy/math/SparseMatrix.cpp
@@ -0,0 +1,864 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SparseMatrix.h"
+#include <algorithm>
+#include <iostream>
+#include <vector>
+#include "hl_gpu.h"
+#include "hl_top_k.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+GpuSparseMatrix::GpuSparseMatrix(size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
+                                 bool trans)
+    : Matrix(NULL, height, width, trans, true) {
+  resize(height, width, nnz, valueType, format);
+}
+
+GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle,
+                                 hl_sparse_matrix_s_ptr sMatrix,
+                                 size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
+                                 bool trans,
+                                 MemoryHandlePtr sMemoryHandle)
+    : Matrix(dataHandle, height, width, trans, true) {
+  CHECK(dataHandle && sMatrix) << "Invalid argument pointer";
+
+  size_t size = 0;
+  if (format == SPARSE_CSR) {
+    size = (height + 1) * sizeof(int) + nnz * sizeof(int);
+  } else {
+    size = (width + 1) * sizeof(int) + nnz * sizeof(int);
+  }
+
+  if (NO_VALUE != valueType) {
+    size += nnz * sizeof(real);
+  }
+  CHECK_LE(size, dataHandle->getSize());
+
+  sMatrix_ = sMatrix;
+
+  if (sMemoryHandle == NULL) {
+    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(dataHandle->getSize());
+  } else {
+    CHECK_EQ(sMemoryHandle->getSize(), dataHandle->getSize());
+    sMemoryHandle_ = sMemoryHandle;
+  }
+
+  elementCnt_ = nnz;
+  valueType_ = valueType;
+  format_ = format;
+  if (format_ == SPARSE_CSR)
+    sparseResizeCSR();
+  else
+    sparseResizeCSC();
+}
+
+GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix,
+                                 size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
+                                 bool trans,
+                                 MemoryHandlePtr sMemoryHandle)
+    : Matrix(NULL, height, width, trans, true) {
+  CHECK(sMatrix) << "Invalid argument pointer";
+  sMatrix_ = sMatrix;
+  sMemoryHandle_ = sMemoryHandle;
+  elementCnt_ = nnz;
+  format_ = format;
+  valueType_ = valueType;
+}
+
+GpuSparseMatrix::GpuSparseMatrix(real* value,
+                                 int* rows,
+                                 int* cols,
+                                 size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
+                                 bool trans)
+    : Matrix(NULL, height, width, trans, true) {
+  size_t size = 0;
+  if (format == SPARSE_CSR) {
+    size = (height + 1) * sizeof(int) + nnz * sizeof(int);
+  } else {
+    size = (width + 1) * sizeof(int) + nnz * sizeof(int);
+  }
+
+  if (NO_VALUE != valueType) {
+    size += nnz * sizeof(real);
+  }
+  elementCnt_ = nnz;
+  valueType_ = valueType;
+  format_ = format;
+
+  sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(size);
+  if (format_ == SPARSE_CSR) {
+    rows_ = reinterpret_cast<int*>(
+        reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
+    cols_ = reinterpret_cast<int*>(
+        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+        (height_ + 1) * sizeof(int));
+    if (NO_VALUE != valueType_) {
+      value_ = reinterpret_cast<real*>(
+          reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+          (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
+    } else {
+      value_ = NULL;
+    }
+
+    if (sMatrix_ == NULL) {
+      /* construct hl_sparse_matrix_s */
+      hl_sparse_matrix_s tmp;
+      hl_construct_sparse_matrix(
+          &tmp,
+          value,
+          rows,
+          cols,
+          HL_SPARSE_CSR,
+          valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
+          height_,
+          width_,
+          elementCnt_);
+      hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
+      sMatrix_ = tmp2;
+    }
+
+  } else {
+    cols_ = reinterpret_cast<int*>(
+        reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
+    rows_ = reinterpret_cast<int*>(
+        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+        (width_ + 1) * sizeof(int));
+    if (NO_VALUE != valueType_) {
+      value_ = reinterpret_cast<real*>(
+          reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+          (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
+    } else {
+      value_ = NULL;
+    }
+
+    if (sMatrix_ == NULL) {
+      /* construct hl_sparse_matrix_s */
+      hl_sparse_matrix_s tmp;
+      hl_construct_sparse_matrix(
+          &tmp,
+          value,
+          rows,
+          cols,
+          HL_SPARSE_CSC,
+          valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
+          height_,
+          width_,
+          elementCnt_);
+      hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
+      sMatrix_ = tmp2;
+    }
+  }
+}
+
+void GpuSparseMatrix::sparseResizeCSR() {
+  rows_ =
+      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
+  cols_ =
+      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+                             (height_ + 1) * sizeof(int));
+  if (NO_VALUE != valueType_) {
+    value_ = reinterpret_cast<real*>(
+        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+        (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
+  } else {
+    value_ = NULL;
+  }
+
+  if (sMatrix_ == NULL) {
+    /* construct hl_sparse_matrix_s */
+    hl_sparse_matrix_s tmp;
+    hl_construct_sparse_matrix(
+        &tmp,
+        data_,
+        memoryHandle_->getSize(),
+        HL_SPARSE_CSR,
+        valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
+        height_,
+        width_,
+        elementCnt_);
+    hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
+    sMatrix_ = tmp2;
+  }
+}
+
+void GpuSparseMatrix::sparseResizeCSC() {
+  cols_ =
+      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
+  rows_ =
+      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+                             (width_ + 1) * sizeof(int));
+  if (NO_VALUE != valueType_) {
+    value_ = reinterpret_cast<real*>(
+        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+        (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
+  } else {
+    value_ = NULL;
+  }
+
+  if (sMatrix_ == NULL) {
+    /* construct hl_sparse_matrix_s */
+    hl_sparse_matrix_s tmp;
+    hl_construct_sparse_matrix(
+        &tmp,
+        memoryHandle_->getBuf(),
+        memoryHandle_->getSize(),
+        HL_SPARSE_CSC,
+        valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
+        height_,
+        width_,
+        elementCnt_);
+    hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
+    sMatrix_ = tmp2;
+  }
+}
+
+void GpuSparseMatrix::resize(size_t newHeight,
+                             size_t newWidth,
+                             size_t newNnz,
+                             SparseValueType valueType,
+                             SparseFormat format) {
+  if (format == SPARSE_CSR) {
+    resizeCSR(newHeight, newWidth, newNnz, valueType);
+  } else {
+    resizeCSC(newHeight, newWidth, newNnz, valueType);
+  }
+}
+
+void GpuSparseMatrix::resizeCSR(size_t newHeight,
+                                size_t newWidth,
+                                size_t newNnz,
+                                SparseValueType valueType) {
+  size_t newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int);
+  if (NO_VALUE != valueType) {
+    newSize += newNnz * sizeof(real);
+  }
+
+  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
+    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize);
+    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
+    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
+    end_ = reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+           sMemoryHandle_->getSize();
+    sMatrix_ = NULL;
+  } else if (valueType != valueType_) {
+    sMatrix_ = NULL;
+  } else {
+    /*
+     * newNnz > elementCnt_ is necessary for the following condition:
+     * Firstly, height_ is 9 elementCnt_ is 56
+     * Secondly, height_ is 11 elementCnt_ is 44
+     *   ==> height_ is bigger, sMatrix_ will resize, and total item is 44 now
+     * Then, height_ is 10 elementCnt_ is 52
+     *   ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail
+     */
+    if ((ssize_t)((newHeight + 1) * sizeof(int)) >
+            ((char*)cols_ - (char*)rows_) ||
+        newNnz > static_cast<size_t>(sMatrix_->nnz)) {
+      sMatrix_ = NULL;
+    } else if (NO_VALUE == valueType) {
+      if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)cols_)) {
+        sMatrix_ = NULL;
+      }
+    } else {
+      if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)cols_) ||
+          (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) {
+        sMatrix_ = NULL;
+      }
+    }
+  }
+
+  height_ = newHeight;
+  width_ = newWidth;
+  elementCnt_ = newNnz;
+  valueType_ = valueType;
+  format_ = SPARSE_CSR;
+
+  if (sMatrix_ == NULL) {
+    sparseResizeCSR();
+  }
+}
+
+void GpuSparseMatrix::resizeCSC(size_t newHeight,
+                                size_t newWidth,
+                                size_t newNnz,
+                                SparseValueType valueType) {
+  size_t newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int);
+  if (NO_VALUE != valueType) {
+    newSize += newNnz * sizeof(real);
+  }
+
+  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
+    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize);
+    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
+    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
+    end_ = reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
+           sMemoryHandle_->getSize();
+    sMatrix_ = NULL;
+  } else if (valueType != valueType_) {
+    sMatrix_ = NULL;
+  } else {
+    /*
+     * newNnz > elementCnt_ is necessary for the following condition:
+     * Firstly, height_ is 9 elementCnt_ is 56
+     * Secondly, height_ is 11 elementCnt_ is 44
+     *   ==> height_ is bigger, sMatrix_ will resize,
+     *       and total item is 44 now
+     * Then, height_ is 10 elementCnt_ is 52
+     *   ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail
+     */
+    if ((ssize_t)((newWidth + 1) * sizeof(int)) >
+            ((char*)rows_ - (char*)cols_) ||
+        newNnz > static_cast<size_t>(sMatrix_->nnz)) {
+      sMatrix_ = NULL;
+    } else if (NO_VALUE == valueType) {
+      if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)rows_)) {
+        sMatrix_ = NULL;
+      }
+    } else {
+      if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)rows_) ||
+          (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) {
+        sMatrix_ = NULL;
+      }
+    }
+  }
+
+  height_ = newHeight;
+  width_ = newWidth;
+  elementCnt_ = newNnz;
+  valueType_ = valueType;
+  format_ = SPARSE_CSC;
+
+  if (sMatrix_ == NULL) {
+    sparseResizeCSC();
+  }
+}
+
+void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth) {
+  resize(newHeight, newWidth, elementCnt_, valueType_, format_);
+}
+
+MatrixPtr GpuSparseMatrix::getTranspose() {
+  CHECK(memoryHandle_.get() || sMatrix_) << "not supported";
+  if (memoryHandle_.get()) {
+    MatrixPtr copy_T(new GpuSparseMatrix(
+        std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle_),
+        sMatrix_,
+        height_,
+        width_,
+        elementCnt_,
+        valueType_,
+        format_,
+        true,
+        sMemoryHandle_));
+    return copy_T;
+  } else {
+    MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_,
+                                         height_,
+                                         width_,
+                                         elementCnt_,
+                                         valueType_,
+                                         format_,
+                                         true,
+                                         sMemoryHandle_));
+    return copy_T;
+  }
+}
+
+void GpuSparseMatrix::copyRow(int offsets,
+                              size_t colNum,
+                              const sparse_non_value_t* row) {
+  memcpy(cols_ + offsets, row, sizeof(int) * colNum);
+}
+
+void GpuSparseMatrix::copyRow(int offsets,
+                              size_t colNum,
+                              const sparse_float_value_t* row) {
+  for (size_t j = 0; j < colNum; j++) {
+    cols_[offsets + j] = row[j].col;
+    value_[offsets + j] = row[j].value;
+  }
+}
+
+void GpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
+  if (auto mat = dynamic_cast<const CpuSparseMatrix*>(&src)) {
+    copyFrom(*(const_cast<CpuSparseMatrix*>(mat)), stream);
+  } else if (auto mat = dynamic_cast<const GpuSparseMatrix*>(&src)) {
+    copyFrom(*(const_cast<GpuSparseMatrix*>(mat)), stream);
+  } else {
+    LOG(FATAL) << "Not implemented";
+  }
+}
+
+void GpuSparseMatrix::copyFrom(const Matrix& src) {
+  copyFrom(src, HPPL_STREAM_1);
+  hl_stream_synchronize(HPPL_STREAM_1);
+}
+
+template <class T>
+void GpuSparseMatrix::copyFrom(int64_t* ids,
+                               int64_t* indices,
+                               T* data,
+                               hl_stream_t stream) {
+  CHECK_EQ(format_, SPARSE_CSR);
+  size_t nnz = 0;
+  for (size_t i = 0; i < height_; i++) {
+    int64_t id = ids[i];
+    nnz += indices[id + 1] - indices[id];
+  }
+
+  resize(height_,
+         width_,
+         nnz,
+         sizeof(T) == sizeof(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE,
+         format_);
+
+  rows_[0] = 0;
+  for (size_t i = 0; i < height_; i++) {
+    int64_t id = ids[i];
+    size_t colNum = indices[id + 1] - indices[id];
+    rows_[i + 1] = rows_[i] + colNum;
+
+    T* row = data + indices[id];
+    copyRow(rows_[i], colNum, row);
+  }
+
+  sMatrix_->format = HL_SPARSE_CSR;
+  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
+  sMatrix_->rows = height_;
+  sMatrix_->cols = width_;
+  sMatrix_->nnz = nnz;
+  hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, stream);
+}
+
+void GpuSparseMatrix::setRow(size_t row,
+                             size_t colNum,
+                             const unsigned int* cols,
+                             const real* values) {
+  CHECK_EQ(format_, SPARSE_CSR);
+  if (NO_VALUE == valueType_) {
+    CHECK_LT(row, height_);
+    CHECK(NULL != cols);
+    CHECK(NULL == values);
+  } else {
+    CHECK_LT(row, height_);
+    CHECK(NULL != cols);
+    CHECK(NULL != values);
+  }
+  if (0 == row) {
+    rows_[row] = 0;
+  }
+  rows_[row + 1] = rows_[row] + colNum;
+
+  memcpy(cols_ + rows_[row], cols, sizeof(*cols) * colNum);
+  if (FLOAT_VALUE == valueType_) {
+    memcpy(value_ + rows_[row], values, sizeof(*values) * colNum);
+  }
+
+  if (height_ - 1 == row) {
+    sMatrix_->format = HL_SPARSE_CSR;
+    sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
+    sMatrix_->rows = height_;
+    sMatrix_->cols = width_;
+    sMatrix_->nnz = elementCnt_;
+    hl_memcpy_csr_matrix(
+        sMatrix_.get(), value_, rows_, cols_, HPPL_STREAM_DEFAULT);
+  }
+}
+
+SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; }
+
+void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
+  CHECK_EQ(format_, SPARSE_CSC);
+  int nnz = sMatrix_->nnz;
+  if (memAlloc) {
+    matTrans = std::make_shared<GpuSparseMatrix>(
+        width_, height_, nnz, valueType_, format_, false);
+  } else {
+    CHECK(matTrans != nullptr);
+  }
+
+  CpuIVector rows(nnz);
+  CpuIVector cols(width_ + 1);
+  CpuIVector cols_full(nnz);
+  CpuVector value(nnz);
+  hl_stream_t stream = HPPL_STREAM_1;
+  hl_memcpy_from_csc_matrix(value.getData(),
+                            nnz,
+                            rows.getData(),
+                            nnz,
+                            cols.getData(),
+                            width_ + 1,
+                            sMatrix_.get(),
+                            stream);
+
+  hl_stream_synchronize(stream);
+
+  /*for every non zero number, get its column index*/
+  std::vector<Element> dataVec;
+  for (size_t i = 0; i < width_; i++) {
+    for (int j = cols.getData()[i]; j < cols.getData()[i + 1]; j++) {
+      cols_full.getData()[j] = i;
+    }
+  }
+
+  /*sort row index and column index by the ascending order*/
+  for (int i = 0; i < nnz; i++) {
+    dataVec.emplace_back(
+        rows.getData()[i], cols_full.getData()[i], value.getData()[i]);
+  }
+  std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) {
+    return a.row < b.row || (a.row == b.row && a.col < b.col);
+  });
+
+  /*get sorted data, row index, and col index, put them in the right place*/
+  cols.resize(height_ + 1);
+  rows.resize(nnz);
+  value.resize(nnz);
+
+  cols.getData()[0] = 0;
+  rows.getData()[0] = dataVec[0].col;
+  value.getData()[0] = dataVec[0].val;
+  for (int i = 1; i < nnz; i++) {
+    if (dataVec[i].row != dataVec[i - 1].row) {
+      for (int j = dataVec[i - 1].row + 1; j <= dataVec[i].row; j++) {
+        cols.getData()[j] = i;
+      }
+    }
+    rows.getData()[i] = dataVec[i].col;
+    value.getData()[i] = dataVec[i].val;
+  }
+  cols.getData()[height_] = nnz;
+
+  /*copy back from cpu*/
+  GpuSparseMatrixPtr dest =
+      std::dynamic_pointer_cast<GpuSparseMatrix>(matTrans);
+  hl_memcpy_csc_matrix((dest->sMatrix_).get(),
+                       value.getData(),
+                       rows.getData(),
+                       cols.getData(),
+                       stream);
+  hl_stream_synchronize(stream);
+}
+
+void GpuSparseMatrix::mul(const GpuMatrix& a,
+                          const GpuMatrix& b,
+                          real scaleAB,
+                          real scaleT) {
+  CHECK(a.useGpu_ && b.useGpu_) << "type not match";
+  CHECK(!trans_) << "trans not supported";
+  real* A_d = (real*)a.getData();
+  real* B_d = (real*)b.getData();
+  hl_sparse_matrix_s C_d = sMatrix_.get();
+  hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
+  hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
+
+  if (!a.trans_ && !b.trans_) {
+    CHECK(height_ == a.getHeight());
+    CHECK(width_ == b.getWidth());
+    CHECK(a.getWidth() == b.getHeight());
+  } else if (a.trans_ && !b.trans_) {
+    CHECK(height_ == a.getWidth());
+    CHECK(width_ == b.getWidth());
+    CHECK(a.getHeight() == b.getHeight());
+  } else if (!a.trans_ && b.trans_) {
+    CHECK(height_ == a.getHeight());
+    CHECK(width_ == b.getHeight());
+    CHECK(a.getWidth() == b.getWidth());
+  } else {
+    LOG(INFO) << "Not support";
+  }
+  int dimM = height_;
+  int dimN = width_;
+  int dimK = !b.trans_ ? b.getHeight() : b.getWidth();
+  hl_sparse_matrix_mul(
+      A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT);
+}
+
+void GpuSparseMatrix::mul(const Matrix& a,
+                          const Matrix& b,
+                          real scaleAB,
+                          real scaleT) {
+  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
+  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
+  if (a_ptr && b_ptr) {
+    mul(*a_ptr, *b_ptr, scaleAB, scaleT);
+  } else {
+    LOG(FATAL) << "not supported";
+  }
+}
+
+template <class T>
+void printBuf(std::ostream& os, T* a, size_t len, const char* name) {
+  os << "\n: " << name << " [";
+  for (size_t i = 0; i < len; i++) {
+    os << a[i] << " ";
+  }
+  os << "]\n";
+}
+
+void GpuSparseMatrix::print(std::ostream& os) const {
+  if (format_ == SPARSE_CSC) {
+    int nnz = sMatrix_->nnz;
+    IVectorPtr rows = IVector::create(nnz, false);
+    IVectorPtr cols = IVector::create(width_ + 1, false);
+    VectorPtr value = Vector::create(nnz, false);
+    hl_stream_t stream = HPPL_STREAM_DEFAULT;
+    hl_memcpy_from_csc_matrix(value->getData(),
+                              value->getSize(),
+                              rows->getData(),
+                              rows->getSize(),
+                              cols->getData(),
+                              cols->getSize(),
+                              sMatrix_.get(),
+                              stream);
+    hl_stream_synchronize(stream);
+
+    printBuf(os, cols->getData(), width_ + 1, "col idx");
+    printBuf(os, rows->getData(), elementCnt_, "row idx");
+    printBuf(os, value->getData(), elementCnt_, "value");
+  }
+}
+
+void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) {
+  trans_ = src.trans_;
+  size_t nnz = src.getElementCnt();
+
+  resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat());
+  // if have different value type, only copy rows and cols
+  SparseValueType vType =
+      valueType_ != src.getValueType() ? NO_VALUE : valueType_;
+
+  sMatrix_->format = HL_SPARSE_CSR;
+  sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
+  sMatrix_->rows = height_;
+  sMatrix_->cols = width_;
+  sMatrix_->nnz = nnz;
+
+  hl_memcpy_csr_matrix(sMatrix_.get(),
+                       vType == NO_VALUE ? NULL : src.getValue(),
+                       src.getRows(),
+                       src.getCols(),
+                       stream);
+
+  // restore type of sMatrix_
+  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
+}
+
+void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) {
+  trans_ = src.trans_;
+  size_t nnz = src.getElementCnt();
+
+  resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat());
+
+  // if have different value type, only copy rows and cols
+  SparseValueType vType =
+      valueType_ != src.getValueType() ? NO_VALUE : valueType_;
+
+  sMatrix_->format = HL_SPARSE_CSC;
+  sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
+  sMatrix_->rows = height_;
+  sMatrix_->cols = width_;
+  sMatrix_->nnz = nnz;
+
+  hl_memcpy_csc_matrix(sMatrix_.get(),
+                       vType == NO_VALUE ? NULL : src.getValue(),
+                       src.getRows(),
+                       src.getCols(),
+                       stream);
+
+  // restore type of sMatrix_
+  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
+}
+
+void GpuSparseMatrix::copyFrom(GpuSparseMatrix& src, hl_stream_t stream) {
+  CHECK(trans_ == src.trans_);
+  CHECK(format_ == src.getFormat());
+  resize(src.getHeight(),
+         src.getWidth(),
+         elementCnt_,
+         valueType_,
+         src.getFormat());
+
+  size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1;
+  size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_;
+
+  if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) {
+    hl_memcpy_async(
+        getValue(), src.getValue(), sizeof(real) * elementCnt_, stream);
+  }
+  CHECK(getRows());
+  CHECK(src.getRows());
+
+  hl_memcpy_async(getRows(), src.getRows(), sizeof(int) * rowSize, stream);
+  hl_memcpy_async(getCols(), src.getCols(), sizeof(int) * colSize, stream);
+}
+
+void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) {
+  if (format_ == SPARSE_CSR) {
+    copyFromCSR(src, stream);
+  } else {
+    copyFromCSC(src, stream);
+  }
+}
+
+void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) {
+  trans_ = src.trans_;
+  int* srcCols = src.getCols();
+  size_t nnz = std::count_if(srcCols,
+                             srcCols + src.getElementCnt(),
+                             [this](size_t n) { return n < this->width_; });
+  resize(height_, width_, nnz, valueType_, format_);
+
+  rows_[0] = 0;
+  size_t index = 0;
+  for (size_t r = 0; r < height_; ++r) {
+    for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) {
+      if (srcCols[i] < (int)width_) {
+        cols_[index] = srcCols[i];
+        if (valueType_ == FLOAT_VALUE) {
+          value_[index] = src.getValue()[i];
+        }
+        ++index;
+      }
+    }
+    rows_[r + 1] = index;
+  }
+  CHECK_EQ(index, nnz);
+
+  sMatrix_->format = HL_SPARSE_CSR;
+  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
+  sMatrix_->rows = height_;
+  sMatrix_->cols = width_;
+  sMatrix_->nnz = nnz;
+
+  hl_memcpy_csr_matrix(sMatrix_.get(),
+                       valueType_ == NO_VALUE ? NULL : value_,
+                       rows_,
+                       cols_,
+                       /*default stream = */ HPPL_STREAM_DEFAULT);
+}
+
+void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) {
+  trans_ = src.trans_;
+  size_t nnz = src.getCols()[width_] - src.getCols()[0];
+  resize(height_, width_, nnz, valueType_, format_);
+
+  cols_[0] = 0;
+  for (size_t i = 0; i < width_; i++) {
+    cols_[i + 1] = cols_[i] + (int)(src.getRowNum(i));
+  }
+  memcpy(rows_, src.getRows() + src.getCols()[0], sizeof(int) * nnz);
+  if (valueType_ == FLOAT_VALUE) {
+    memcpy(value_, src.getValue() + src.getCols()[0], sizeof(real) * nnz);
+  }
+
+  sMatrix_->format = HL_SPARSE_CSC;
+  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
+  sMatrix_->rows = height_;
+  sMatrix_->cols = width_;
+  sMatrix_->nnz = nnz;
+
+  hl_memcpy_csc_matrix(sMatrix_.get(),
+                       valueType_ == NO_VALUE ? NULL : value_,
+                       rows_,
+                       cols_,
+                       /*default stream = */ HPPL_STREAM_DEFAULT);
+}
+
+void GpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
+  if (format_ == SPARSE_CSR) {
+    trimFromCSR(src);
+  } else {
+    trimFromCSC(src);
+  }
+}
+
+void GpuSparseMatrix::addBias(Matrix& b, real scale) {
+  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
+  hl_sparse_matrix_s A_d = sMatrix_.get();
+  hl_sparse_matrix_add_bias(A_d, b.getData(), scale);
+}
+
+void GpuSparseMatrix::add3(GpuMatrix* b) {
+  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+  CHECK(height_ == b->getHeight());
+  CHECK(width_ == b->getWidth());
+  real* B_d = b->getData();
+  hl_sparse_matrix_s A_d = sMatrix_.get();
+  hl_sparse_matrix_add_dense(A_d, B_d, height_, width_, 1, 0);
+}
+
+void GpuSparseMatrix::add3(MatrixPtr b) {
+  if (dynamic_cast<GpuMatrix*>(b.get())) {
+    add3(dynamic_cast<GpuMatrix*>(b.get()));
+  } else {
+    LOG(FATAL) << "not supported";
+  }
+}
+
+void GpuSparseMatrix::zeroMem() {
+  CHECK(valueType_ == FLOAT_VALUE);
+  real* value = getValue();
+  if (value == NULL) {
+    LOG(FATAL) << "value is nullptr";
+  }
+  hl_matrix_zero_mem(value, elementCnt_);
+}
+
+void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
+#ifdef PADDLE_WITH_CUDA
+  CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
+  size_t numSamples = getHeight();
+  size_t beam = maxVal.getWidth();
+  CHECK_EQ(maxIds.getSize(), numSamples * beam);
+  CHECK_EQ(maxVal.getHeight(), numSamples);
+  CHECK_EQ(format_, SPARSE_CSR) << "Only support SPARSE_CSR";
+
+  hl_sparse_matrix_top_k(maxVal.getData(),
+                         maxVal.getStride(),
+                         maxIds.getData(),
+                         sMatrix_.get(),
+                         beam,
+                         numSamples);
+#endif
+}
+
+template void GpuSparseMatrix::copyFrom(int64_t* ids,
+                                        int64_t* indices,
+                                        sparse_non_value_t* data,
+                                        hl_stream_t stream);
+template void GpuSparseMatrix::copyFrom(int64_t* ids,
+                                        int64_t* indices,
+                                        sparse_float_value_t* data,
+                                        hl_stream_t stream);
+}  // namespace paddle
diff --git a/paddle/legacy/math/SparseMatrix.h b/paddle/legacy/math/SparseMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..9181fa29233677d8f4fac503905cc31eb66cb6c1
--- /dev/null
+++ b/paddle/legacy/math/SparseMatrix.h
@@ -0,0 +1,286 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
+#include <cstddef>
+#include "CpuSparseMatrix.h"
+#include "Matrix.h"
+
+namespace paddle {
+
+typedef std::shared_ptr<_hl_sparse_matrix_s> hl_sparse_matrix_s_ptr;
+
+class GpuSparseMatrix : public Matrix {
+ public:
+  MemoryHandlePtr sMemoryHandle_;
+  int* rows_;
+  int* cols_;
+  real* value_;
+  const char* end_; /* point to the end of sMemoryHandle_ */
+
+  hl_sparse_matrix_s_ptr sMatrix_;
+  SparseValueType valueType_;
+  SparseFormat format_;
+
+ public:
+  GpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format_ = SPARSE_CSR,
+                  bool trans = false);
+
+  GpuSparseMatrix(GpuMemHandlePtr dataHandle,
+                  hl_sparse_matrix_s_ptr sMatrix,
+                  size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format_ = SPARSE_CSR,
+                  bool trans = false,
+                  MemoryHandlePtr sMemoryHandle = NULL);
+
+  GpuSparseMatrix(real* value,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans);
+
+  GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans,
+                  MemoryHandlePtr sMemoryHandle);
+
+ protected:
+  struct Element {
+    int row;
+    int col;
+    real val;
+    Element(int rowIn, int colIn, real valIn)
+        : row(rowIn), col(colIn), val(valIn) {}
+  };
+
+ public:
+  ~GpuSparseMatrix() {}
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format);
+
+  void resize(size_t newHeight, size_t newWidth);
+
+  void sparseResizeCSR();
+
+  void sparseResizeCSC();
+
+  void resizeCSR(size_t newHeight,
+                 size_t newWidth,
+                 size_t newNnz,
+                 SparseValueType valueType);
+
+  void resizeCSC(size_t newHeight,
+                 size_t newWidth,
+                 size_t newNnz,
+                 SparseValueType valueType);
+
+  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
+  /// B = A , B.trans = !A.trans
+  MatrixPtr getTranspose();
+
+  /// B = A'
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+
+  void copyFrom(const Matrix& src);
+  void copyFrom(const Matrix& src, hl_stream_t stream);
+  void copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream);
+  void copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream);
+
+  void copyFrom(const IVector& src) { LOG(FATAL) << "not implemented"; }
+  void copyFrom(const IVector& src, hl_stream_t stream) {
+    LOG(FATAL) << "not implemented";
+  }
+
+  template <class T>
+  void copyFrom(int64_t* ids, int64_t* indices, T* data, hl_stream_t stream);
+
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values);
+  SparseValueType getValueType() const;
+  SparseFormat getFormat() const { return format_; }
+
+  const int* getRowCols(size_t x) const { return cols_ + rows_[x]; }
+  const real* getRowValues(size_t x) const { return value_ + rows_[x]; }
+  size_t getColNum(size_t x) const { return rows_[x + 1] - rows_[x]; }
+  void print(std::ostream& os) const;
+
+  /**
+   * @brief only set value_ of FLOAT_VALUE sparse matrix to zero
+   */
+  void zeroMem();
+
+  /**
+   * @brief sparseMatrix += denseMatrix
+   *
+   * Named add3 just because add/add2 has been used in BaseMatrix.cu
+   * and they are not virtual function.
+   *
+   * Only add value of same (row, col) index in dense matrix
+   * and do not use others values.
+   *
+   * @param[in]  b   dense matrix
+   */
+  void add3(GpuMatrix* b);
+  void add3(MatrixPtr b);
+
+  /**
+   * @brief sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)
+   *
+   * @param[in]  b      bias, dense matrix and height = 1
+   * @param[in]  scale  scale of b
+   */
+  void addBias(Matrix& b, real scale);
+
+  /**
+   * @brief return rows, which is gpu address
+   */
+  int* getRows() const {
+    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
+    return hl_sparse_matrix_get_rows(sMatrix_.get());
+  }
+
+  /**
+   * @brief return cols, which is gpu address
+   */
+  int* getCols() const {
+    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
+    return hl_sparse_matrix_get_cols(sMatrix_.get());
+  }
+
+  /**
+   * @brief return value, which is gpu address
+   */
+  real* getValue() const {
+    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
+    return hl_sparse_matrix_get_value(sMatrix_.get());
+  }
+
+  /**
+   * @brief return value_ of sparse matrix
+   *
+   * Some times CpuSparseMatrix maybe Matrix,
+   * if getValue, must dynamic_cast to CpuSparseMatrix,
+   * getData is convenient to get value
+   */
+  real* getData() { return getValue(); }
+  const real* getData() const { return getValue(); }
+
+  /**
+   * @brief  Get top k value of each row in sparse matrix.
+   *
+   * Store the value in maxVal and theirs index in maxIds.
+   * k = maxVal.width
+   *
+   * @param[out]  maxIds    index of top k
+   * @param[out]  maxVal    value of top k
+   */
+  void rowMax(IVector& maxIds, Matrix& maxVal);
+
+ protected:
+  void sparseResize();
+
+  void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
+  void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
+
+ public:
+  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
+
+  void copyFrom(CpuSparseMatrix& src, hl_stream_t stream);
+  void copyFrom(GpuSparseMatrix& src, hl_stream_t stream);
+
+  void trimFrom(const CpuSparseMatrix& src);
+  void trimFromCSR(const CpuSparseMatrix& src);
+  void trimFromCSC(const CpuSparseMatrix& src);
+
+  // BaseMatrixT interface
+ public:
+  bool isSparse() const { return true; }
+
+ private:
+  using Matrix::mul;
+  using Matrix::copyFrom;
+  using Matrix::rowMax;
+  using Matrix::print;
+  using Matrix::subMatrix;
+};
+
+}  // namespace paddle
+
+#else
+
+#include "CpuSparseMatrix.h"
+
+namespace paddle {
+
+class GpuSparseMatrix : public Matrix {
+ public:
+  GpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format_ = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  GpuSparseMatrix(real* value,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, true) {}
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
+};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/legacy/math/SparseRowMatrix.cpp b/paddle/legacy/math/SparseRowMatrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..39bcdf22984db766283a3b4fbf56f224f730c5f8
--- /dev/null
+++ b/paddle/legacy/math/SparseRowMatrix.cpp
@@ -0,0 +1,282 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SparseRowMatrix.h"
+#include "CpuSparseMatrix.h"
+
+#include <algorithm>
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include "SIMDFunctions.h"
+
+#include "paddle/legacy/utils/Thread.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;
+
+void SparseRowCpuMatrix::init(size_t height, size_t width) {
+  height_ = height;
+  if (!indexDictHandle_) {
+    indexDictHandle_.reset(new IndexDict);
+    indexDictHandle_->globalIndices.assign(height, kUnusedId_);
+  }
+  localIndices_ = &indexDictHandle_->localIndices;
+  globalIndices_ = indexDictHandle_->globalIndices.data();
+}
+
+void SparseRowCpuMatrix::mul(CpuSparseMatrix* a,
+                             CpuMatrix* b,
+                             real scaleAB,
+                             real scaleT) {
+  CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(a, b, this, scaleAB, scaleT);
+}
+
+void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) {
+  LOG(FATAL) << "This should not be called";
+}
+
+void SparseRowCpuMatrix::zeroMem() {
+  apply([](real* buf, size_t len) { memset(buf, 0, sizeof(real) * len); });
+  clearRows();
+}
+
+void SparseRowCpuMatrix::applyL1(real learningRate, real decayRate) {
+  apply([=](real* buf, size_t len) {
+    CpuVector value(0, nullptr);
+    value.subVecFrom(buf, 0, len);
+    value.applyL1(learningRate, decayRate);
+  });
+}
+
+void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value,
+                                   IVector& t0,
+                                   real learningRate,
+                                   int currentTime,
+                                   real decayRate,
+                                   bool useL1,
+                                   bool fini) {
+  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
+
+  // t0 and value are vectors
+  CHECK_EQ(t0.getSize(), this->height_);
+  CHECK_EQ(value.width_, this->height_ * this->width_);
+
+  if (decayRate == 0.0f) {
+    if (fini) {
+      return;
+    }
+
+    for (size_t i = 0; i < localIndices.size(); ++i) {
+      real* g = getLocalRow(i);
+      real* v = value.rowBuf(localIndices[i]);
+      for (size_t j = 0; j < this->width_; ++j) {
+        v[j] -= learningRate * g[j];
+      }
+    }
+    return;
+  }  // else
+
+  if (useL1) {  // L1 decay
+    if (fini) {
+      for (size_t i = 0; i < this->height_; ++i) {
+        real* v = value.rowBuf(i);
+        int* t = t0.getData() + i;
+        if (t[0] < currentTime) {
+          // W(t0) -> W(t+1)
+          int tDiff = currentTime - t[0];
+          real delta = tDiff * learningRate * decayRate;
+          simd::decayL1(v, v, delta, this->width_);
+        }
+      }
+      return;
+    }  // else
+
+    for (size_t i = 0; i < localIndices.size(); ++i) {
+      real* g = getLocalRow(i);
+      real* v = value.rowBuf(localIndices[i]);
+      int* t = t0.getData() + localIndices[i];
+      if (t[0] < currentTime) {
+        // W(t0) -> W(t)
+        int tDiff = currentTime - t[0];
+        real delta = tDiff * learningRate * decayRate;
+        simd::decayL1(v, v, delta, this->width_);
+      }
+
+      // W(t) -> W(t+1)
+      for (size_t j = 0; j < this->width_; ++j) {
+        v[j] -= learningRate * g[j];
+      }
+      simd::decayL1(v, v, learningRate * decayRate, this->width_);
+
+      // state update to t+1
+      t[0] = currentTime + 1;
+    }
+
+  } else {  // L2 decay
+    if (fini) {
+      for (size_t i = 0; i < this->height_; ++i) {
+        real* v = value.rowBuf(i);
+        int* t = t0.getData() + i;
+        if (t[0] < currentTime) {
+          // W(t0) -> W(t+1)
+          int tDiff = currentTime - t[0];
+          real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
+          for (size_t j = 0; j < this->width_; ++j) {
+            v[j] *= recip;
+          }
+        }
+      }
+      return;
+    }  // else
+
+    real recipDecay = 1.0f / (1.0f + learningRate * decayRate);
+
+    for (size_t i = 0; i < localIndices.size(); ++i) {
+      real* g = getLocalRow(i);
+      real* v = value.rowBuf(localIndices[i]);
+      int* t = t0.getData() + localIndices[i];
+      if (t[0] < currentTime) {
+        // W(t0) -> W(t)
+        int tDiff = currentTime - t[0];
+        real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
+        for (size_t j = 0; j < this->width_; ++j) {
+          v[j] *= recip;
+        }
+      }
+
+      // W(t) -> W(t+1)
+      for (size_t j = 0; j < this->width_; ++j) {
+        v[j] = recipDecay * (v[j] - learningRate * g[j]);
+      }
+
+      // state update to t+1
+      t[0] = currentTime + 1;
+    }
+  }
+}
+
+void SparseRowCpuMatrix::addTo(BaseMatrix& dest,
+                               std::vector<uint32_t>& ids,
+                               size_t tid,
+                               size_t numThreads) {
+  CHECK(!dest.useGpu_);
+  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);
+
+  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
+  for (size_t i = 0; i < localIndices.size(); ++i) {
+    uint32_t id = localIndices[i];
+    if (id % numThreads == tid) {
+      simd::addTo(dest.rowBuf(id), getLocalRow(i), this->width_);
+      ids.push_back(id);
+    }
+  }
+}
+
+void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest,
+                               size_t tid,
+                               size_t numThreads) {
+  CHECK(!dest.useGpu_);
+  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);
+
+  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
+  for (size_t i = 0; i < localIndices.size(); ++i) {
+    uint32_t id = localIndices[i];
+    if (id % numThreads == tid) {
+      dest.checkIndex(id);
+      simd::addTo(dest.getRow(id), getLocalRow(i), this->width_);
+    }
+  }
+}
+
+void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) {
+  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
+  for (size_t i = 0; i < localIndices.size(); ++i) {
+    uint32_t id = localIndices[i];
+    if (id % numThreads == tid) {
+      memset(this->getLocalRow(i), 0, this->width_ * sizeof(real));
+    }
+  }
+}
+
+void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a,
+                                     CpuMatrix* b,
+                                     real scaleAB,
+                                     real scaleT) {
+  CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(
+      a, b, this, scaleAB, scaleT);
+}
+
+void CacheRowCpuMatrix::mul(CpuSparseMatrix* a,
+                            CpuMatrix* b,
+                            real scaleAB,
+                            real scaleT) {
+  CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(a, b, this, scaleAB, scaleT);
+}
+
+void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
+  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
+  for (size_t i = 0; i < len; i++) {
+    CHECK_LT(*(ids + i), this->getHeight())
+        << "id:" << *(ids + i) << "Height:" << this->getHeight()
+        << "sparse id value exceeds the max input dimension, "
+        << "it could be caused invalid input data samples";
+  }
+  localIndices.insert(localIndices.end(), ids, ids + len);
+}
+
+void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
+  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());
+  CHECK(mat) << "only support sparse matrix";
+  addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),
+          mat->getElementCnt());
+}
+
+void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
+  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
+  size_t numSamples = ids->getSize();
+  int* index = ids->getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    if (index[i] == -1) continue;
+
+    unsigned int id = (unsigned int)index[i];
+    CHECK_LT(id, this->getHeight())
+        << "id:" << id << "Height:" << this->getHeight()
+        << "sparse id value exceeds the max input dimension, "
+        << "it could be caused invalid input data samples";
+    localIndices.push_back(id);
+  }
+}
+
+void SparsePrefetchRowCpuMatrix::setupIndices() {
+  auto& localIndices = indexDictHandle_->localIndices;
+  uniqueIds(localIndices);
+  // for each sparse row
+  for (size_t id = 0; id < localIndices.size(); ++id) {
+    globalIndices_[localIndices[id]] = id;  // sparse row -> local id
+  }
+  checkStoreSize();
+}
+
+void SparseRowCpuMatrix::checkIndices() {
+  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
+  for (size_t i = 0; i < localIndices.size(); ++i) {
+    CHECK_EQ(globalIndices_[localIndices[i]], i);
+  }
+  checkStoreSize();
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/SparseRowMatrix.h b/paddle/legacy/math/SparseRowMatrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..e206747a41c9f3a0f058bf3b0a94472bf4b2c349
--- /dev/null
+++ b/paddle/legacy/math/SparseRowMatrix.h
@@ -0,0 +1,341 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
+#include <gflags/gflags.h>
+#include <string.h>
+#include <algorithm>
+#include "Matrix.h"
+#include "RowBuffer.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * Sparse Row
+ */
+class SparseRowCpuMatrix : public CpuMatrix {
+ public:
+  struct IndexDict {
+    // In the following, global id means the row id in the original matrix.
+    // Local id means the row id in the local storage which only contains
+    // the sparse rows.
+    std::vector<unsigned int> localIndices;   // local id -> global id
+    std::vector<unsigned int> globalIndices;  // global id -> local id
+  };
+  typedef std::shared_ptr<IndexDict> IndexDictPtr;
+
+  /// heightStore is max number of rows of the sparse matrix.
+  SparseRowCpuMatrix(CpuMemHandlePtr dataHandle,
+                     size_t height,
+                     size_t width,
+                     IndexDictPtr indexDictHandle = nullptr,
+                     bool trans = false)
+      : CpuMatrix(nullptr, height, width, trans),
+        indexDictHandle_(indexDictHandle) {
+    init(height, width);
+    buf_.reset(new RowBuffer(dataHandle, width));
+  }
+
+  virtual ~SparseRowCpuMatrix() {}
+
+ public:
+  /**
+   *  Get the row buf
+   *
+   *  @param row row id in the original matrix
+   */
+  real* getRow(size_t row) {
+    CHECK_NE(globalIndices_[row], kUnusedId_);
+    return getLocalRow(globalIndices_[row]);
+  }
+
+  /**
+   *  Get the row buf
+   *
+   *  @param row row id in local storage
+   */
+  real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); }
+
+  /**
+   *  reserve the storage for rows according to current size of
+   * indexDictHandle.
+   *
+   *  This is only used when SparseRowCpuMatrix is constructed with
+   *  indexDictHandle.
+   */
+  void reserveStore() { buf_->resize(localIndices_->size()); }
+
+  // row is the row id in the original matrix
+  virtual real* getRowBuf(size_t row) { return getRow(row); }
+
+  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+
+  /**
+   * Fill data according to row indexs added, setup indices inside.
+   *
+   * *src* and *size* are data and size of normal dense CpuMatrix.
+   */
+  virtual void copyFrom(const real* src, size_t size);
+  virtual void zeroMem();
+
+  /**
+   * apply L1 to all sparse rows, should be apply after indices ready.
+   */
+  virtual void applyL1(real learningRate, real decayRate);
+
+  void clearIndices() { clearRows(); }
+  void zeroMemThread(size_t tid, size_t numThreads);
+
+  /**
+   *  value -= grad * learningRate,  this is gradient.
+   *
+   * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall.
+   *
+   * t0 is a int vector used by L1/L2 decay, size = height of parameter
+   * matrix,
+   * store the time that each weight row last updated.
+   *
+   * Time is batchId, currentTime is current batchId.
+   *
+   * While pass finished, caller should call this func one more time
+   *  with (fini=true) to let weight decay catch up current time.
+   */
+  void sgdUpdate(BaseMatrix& value,
+                 IVector& t0,
+                 real learningRate,
+                 int currentTime,
+                 real decayRate,
+                 bool useL1,
+                 bool fini = false);
+
+  /**
+   *  merge rows in *this* to *dest* for designated thread
+   *
+   *  values add to *dest* matrix
+   *
+   *  ids occured in *this* append to *ids*
+   *  filtered by  (id % numThreads == tid)
+   */
+  void addTo(BaseMatrix& dest,
+             std::vector<uint32_t>& ids,
+             size_t tid,
+             size_t numThreads);
+
+  /**
+   *  the second version addTo(), *dest* is a SparseRowCpuMatrix.
+   *
+   *  The dest's indices should be setup already, addTo() will
+   *  check src ids is exist in dest's indices.
+   */
+  void addTo(SparseRowCpuMatrix& dest, size_t tid, size_t numThreads);
+
+  const IndexDictPtr& getIndexDictHandle() const { return indexDictHandle_; }
+
+  /**
+   *  check all local and global indices consistency
+   */
+  void checkIndices();
+  /**
+   *  check whether row *i* exist in indices
+   */
+  void checkIndex(size_t i) {
+    size_t localId = globalIndices_[i];
+    CHECK_LT(localId, localIndices_->size());
+    CHECK_EQ((*localIndices_)[localId], i);
+  }
+
+  std::vector<unsigned int>& getLocalIndices() const {
+    return indexDictHandle_->localIndices;
+  }
+
+ protected:
+  template <typename Func>
+  void apply(Func f) {
+    f(buf_->data(), localIndices_->size() * width_);
+  }
+
+  void init(size_t height, size_t width);
+
+  /// clear row indices.
+  void clearRows() {
+    for (auto id : *localIndices_) {
+      globalIndices_[id] = kUnusedId_;
+    }
+    localIndices_->clear();
+    buf_->clear();
+  }
+
+  inline void checkStoreSize() {
+    if (buf_->isAutoGrowth()) {
+      if (buf_->getRowCount() > 0.5 * height_) {
+        LOG(WARNING) << "There are more than 0.5*height ("
+                     << localIndices_->size() << ") rows are used for sparse "
+                     << "update, which is not efficient. Considering not use "
+                     << "sparse_update.";
+      }
+    } else {
+      CHECK_LE(localIndices_->size(), buf_->getRowCount());
+    }
+  }
+
+  std::unique_ptr<RowBuffer> buf_;
+  IndexDictPtr indexDictHandle_;
+  std::vector<unsigned int>* localIndices_;  // =&indexDictHandle_->localIndices
+  unsigned int* globalIndices_;  // =indexDictHandle_->globalIndices.data();
+  static const unsigned int kUnusedId_;
+};
+
+class SyncThreadPool;
+
+/// For prefetching parameters from remote Parameter server
+class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
+ public:
+  SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle,
+                             size_t height,
+                             size_t width,
+                             IndexDictPtr indexDictHandle = nullptr,
+                             SyncThreadPool* pool = nullptr,
+                             bool trans = false)
+      : SparseRowCpuMatrix(dataHandle, height, width, indexDictHandle, trans),
+        pool_(pool) {}
+
+  /**
+   * Extract feature ids from *input*, to fill row indexs.
+   *
+   * *input* must be sparse matrix.
+   *
+   * Can call many times before setup.
+   */
+  void addRows(MatrixPtr input);
+  void addRows(IVectorPtr ids);
+
+  /**
+   * setup global indices of SparseRowMatrix after finish add rows.
+   */
+  void setupIndices();
+
+ protected:
+  void addRows(const unsigned int* ids, size_t len);
+  SyncThreadPool* pool_;
+};
+
+class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {
+ public:
+  SparseAutoGrowRowCpuMatrix(size_t height,
+                             size_t width,
+                             IndexDictPtr indexDictHandle = nullptr,
+                             bool trans = false)
+      : SparseRowCpuMatrix(nullptr, height, width, indexDictHandle, trans) {}
+
+  real* getRow(size_t row) {
+    auto id = globalIndices_[row];
+    if (id == kUnusedId_) {
+      id = globalIndices_[row] = localIndices_->size();
+      localIndices_->push_back(row);
+      checkStoreSize();
+    }
+    return getLocalRow(id);
+  }
+
+  virtual real* getRowBuf(size_t row) { return getRow(row); }
+
+  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+};
+
+class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {
+ public:
+  CacheRowCpuMatrix(size_t height,
+                    size_t width,
+                    IndexDictPtr indexDictHandle = nullptr,
+                    bool trans = false)
+      : SparseAutoGrowRowCpuMatrix(height, width, indexDictHandle, trans),
+        sourceData_(nullptr) {}
+
+  void setSourceData(CpuVectorPtr sourceVec) {
+    sourceDataVec_ = sourceVec;
+    sourceData_ = sourceVec->getData();
+  }
+
+  real* getRow(size_t row) {
+    auto id = globalIndices_[row];
+    if (id == kUnusedId_) {
+      id = globalIndices_[row] = localIndices_->size();
+      localIndices_->push_back(row);
+      checkStoreSize();
+      memcpy(
+          getLocalRow(id), sourceData_ + width_ * row, sizeof(float) * width_);
+    }
+    return getLocalRow(id);
+  }
+
+  virtual real* getRowBuf(size_t row) { return getRow(row); }
+
+  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+
+ public:
+  CpuVectorPtr sourceDataVec_;
+  real* sourceData_;
+};
+
+/**
+ * Sparse Row Ids Matrix.
+ *
+ * mostly same as CpuMatrix, but maintain sparse row ids occured,
+ * ids are hashed by worker thread id.
+ */
+class SparseRowIdsCpuMatrix : public CpuMatrix {
+ public:
+  SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle,
+                        size_t height,
+                        size_t width,
+                        bool trans = false)
+      : CpuMatrix(dataHandle, height, width, trans) {}
+
+  void setNumOfThreads(size_t numOfThreads) { idsArray_.resize(numOfThreads); }
+
+  std::vector<uint32_t>& getIds(size_t threadId) { return idsArray_[threadId]; }
+
+ private:
+  std::vector<std::vector<uint32_t>> idsArray_;
+};
+
+}  // namespace paddle
+
+#else
+namespace paddle {
+
+class SparseRowCpuMatrix : public CpuMatrix {
+ public:
+  void reserveStore() {}
+  void clearIndices() {}
+};
+
+class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
+ public:
+  void setupIndices() {}
+  void addRows(MatrixPtr input) {}
+  void addRows(IVectorPtr ids) {}
+};
+
+class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
+class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
+class SparseRowIdsCpuMatrix : public CpuMatrix {};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/legacy/math/Storage.cpp b/paddle/legacy/math/Storage.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..65d53aeaa926690c7fe9e6fcac7affdfb68fede9
--- /dev/null
+++ b/paddle/legacy/math/Storage.cpp
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Storage.h"
+#include "Allocator.h"
+#include "paddle/legacy/utils/StringUtil.h"
+#include "paddle/legacy/utils/Util.h"
+
+#ifndef PADDLE_MOBILE_INFERENCE
+DEFINE_int32(pool_limit_size,
+             536870912,
+             "maximum memory size managed by a memory pool, default is 512M");
+#else
+DEFINE_int32(pool_limit_size, 0, "default is 0");
+#endif
+
+namespace paddle {
+
+// Initialization StorageEngine singleton.
+// Other modules may rely on storage management,
+// so StorageEngine need to be initialized before other modules.
+static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); },
+                                          std::numeric_limits<int>::max());
+
+StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {}
+
+StorageEngine::~StorageEngine() {
+  delete cpuAllocator_;
+  for (auto it : gpuAllocator_) {
+    delete it;
+  }
+}
+
+StorageEngine* StorageEngine::singleton() {
+  static StorageEngine storage;
+  return &storage;
+}
+
+PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) {
+  {
+    // if gpuAllocator_ has been constructed
+    ReadLockGuard guard(lock_);
+    if (deviceId < static_cast<int>(gpuAllocator_.size()) &&
+        (gpuAllocator_[deviceId] != nullptr)) {
+      return gpuAllocator_[deviceId];
+    }
+  }
+
+  {
+    // Construct gpuAllocator_
+    std::lock_guard<RWLock> guard(lock_);
+    if (deviceId >= static_cast<int>(gpuAllocator_.size())) {
+      gpuAllocator_.resize(deviceId + 1);
+    }
+    if (gpuAllocator_[deviceId] == nullptr) {
+      std::string name =
+          "gpu" + str::to_string(deviceId) + std::string("_pool");
+      gpuAllocator_[deviceId] =
+          new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name);
+    }
+    return gpuAllocator_[deviceId];
+  }
+}
+
+PoolAllocator* StorageEngine::getCpuAllocator() {
+  {
+    // if cpuAllocator_ has been constructed
+    ReadLockGuard guard(lock_);
+    if (cpuAllocator_ != nullptr) {
+      return cpuAllocator_;
+    }
+  }
+
+  {
+    // Construct cpuAllocator_
+    std::lock_guard<RWLock> guard(lock_);
+    if (cpuAllocator_ == nullptr) {
+      if (FLAGS_use_gpu) {
+        cpuAllocator_ = new PoolAllocator(
+            new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool");
+      } else {
+        cpuAllocator_ = new PoolAllocator(
+            new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool");
+      }
+    }
+    return cpuAllocator_;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/Storage.h b/paddle/legacy/math/Storage.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd22dde2c85be5ba432cb3a259211c1900a17b6c
--- /dev/null
+++ b/paddle/legacy/math/Storage.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mutex>
+#include <vector>
+#include "PoolAllocator.h"
+#include "paddle/legacy/utils/Locks.h"
+
+namespace paddle {
+
+/**
+ * @brief Storage manager for multiple devices.
+ */
+class StorageEngine {
+ public:
+  /**
+   * @return Storage singleton
+   */
+  static StorageEngine* singleton();
+
+  /**
+   * @return return one gpu allocator by deviceId
+   */
+  PoolAllocator* getGpuAllocator(int deviceId);
+
+  /**
+   * @return return cpu allocator
+   */
+  PoolAllocator* getCpuAllocator();
+
+ protected:
+  StorageEngine();
+  ~StorageEngine();
+  RWLock lock_;
+  std::vector<PoolAllocator*> gpuAllocator_;
+  PoolAllocator* cpuAllocator_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/TensorApply.h b/paddle/legacy/math/TensorApply.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b642047bffa33b47dfb8ffc8e3fd2a9b7dbae3a
--- /dev/null
+++ b/paddle/legacy/math/TensorApply.h
@@ -0,0 +1,211 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+
+/**
+ * \brief The tensor evaluator classes.
+ */
+template <typename Derived, class T>
+class TensorApply {
+ public:
+  explicit INLINE TensorApply(const Derived& p)
+      : data_(p.data_),
+        stride_(p.stride_),
+        height_(p.height_),
+        width_(p.width_),
+        useGpu_(p.useGpu_) {}
+
+  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
+  INLINE T apply(int index) const { return data_[index]; }
+  INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
+  INLINE T& applyRef(int index) { return data_[index]; }
+
+  INLINE size_t getWidth() const { return width_; }
+  INLINE size_t getHeight() const { return height_; }
+  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
+  INLINE bool useGpu() const { return useGpu_; }
+
+  T* data_;
+  size_t stride_;
+  size_t height_;
+  size_t width_;
+  bool useGpu_;
+};
+
+/**
+ * \brief The tensor evaluator classes.
+ * evaluator for rvalues
+ */
+template <typename Derived, class T>
+class TensorApply<const Derived, T> {
+ public:
+  explicit INLINE TensorApply(const Derived& p)
+      : data_(p.data_),
+        stride_(p.stride_),
+        height_(p.height_),
+        width_(p.width_),
+        useGpu_(p.useGpu_) {}
+
+  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
+  INLINE T apply(int index) const { return data_[index]; }
+
+  INLINE size_t getWidth() const { return width_; }
+  INLINE size_t getHeight() const { return height_; }
+  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
+  INLINE bool useGpu() const { return useGpu_; }
+
+  const T* data_;
+  size_t stride_;
+  size_t height_;
+  size_t width_;
+  bool useGpu_;
+};
+
+template <typename Derived, class T>
+class TensorApply<const TensorExpression<Derived, T>, T> {
+ public:
+  explicit TensorApply(const TensorExpression<Derived, T>& expr)
+      : expr_(expr.derived()) {}
+
+  INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
+  INLINE T apply(int index) const { return expr_.apply(index); }
+
+  INLINE size_t getWidth() const { return expr_.getWidth(); }
+  INLINE size_t getHeight() const { return expr_.getHeight(); }
+  INLINE bool isContiguous() const { return expr_.isContiguous(); }
+  INLINE bool useGpu() const { return expr_.useGpu(); }
+
+  TensorApply<const Derived, T> expr_;
+};
+
+/**
+ * \brief The unary expression evaluator classes.
+ */
+template <class OP, typename ArgType, class T>
+class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
+ public:
+  explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
+      : op_(expr.op_), expr_(expr.expr_) {}
+
+  INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
+  INLINE T apply(int index) const { return op_(expr_.apply(index)); }
+
+  INLINE size_t getWidth() const { return expr_.getWidth(); }
+  INLINE size_t getHeight() const { return expr_.getHeight(); }
+  INLINE bool isContiguous() const { return expr_.isContiguous(); }
+  INLINE bool useGpu() const { return expr_.useGpu(); }
+
+  const OP op_;
+  TensorApply<ArgType, T> expr_;
+};
+
+/**
+ * \brief The binary expression evaluator classes.
+ */
+template <class OP, typename LhsType, typename RhsType, class T>
+class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
+ public:
+  explicit INLINE TensorApply(
+      const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
+      : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
+#ifndef __CUDA_ARCH__
+    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+#endif
+  }
+
+  INLINE T apply(int i, int j) const {
+    return op_(lhs_.apply(i, j), rhs_.apply(i, j));
+  }
+  INLINE T apply(int index) const {
+    return op_(lhs_.apply(index), rhs_.apply(index));
+  }
+
+  INLINE size_t getWidth() const { return lhs_.getWidth(); }
+  INLINE size_t getHeight() const { return rhs_.getHeight(); }
+  INLINE bool isContiguous() const {
+    return lhs_.isContiguous() && rhs_.isContiguous();
+  }
+  INLINE bool useGpu() const { return lhs_.useGpu(); }
+
+  const OP op_;
+  TensorApply<LhsType, T> lhs_;
+  TensorApply<RhsType, T> rhs_;
+};
+
+/**
+ * \brief The ternary expression evaluator classes.
+ */
+template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
+class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
+ public:
+  explicit INLINE TensorApply(
+      const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
+      : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
+#ifndef __CUDA_ARCH__
+    CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
+    CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
+    CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
+    CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
+    CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
+    CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
+#endif
+  }
+
+  INLINE T apply(int i, int j) const {
+    return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
+  }
+  INLINE T apply(int index) const {
+    return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
+  }
+
+  INLINE size_t getWidth() const { return expr1_.getWidth(); }
+  INLINE size_t getHeight() const { return expr1_.getHeight(); }
+  INLINE bool isContiguous() const {
+    return expr1_.isContiguous() && expr2_.isContiguous() &&
+           expr3_.isContiguous();
+  }
+  INLINE bool useGpu() const { return expr1_.useGpu(); }
+
+  TensorApply<ArgType1, T> expr1_;
+  TensorApply<ArgType2, T> expr2_;
+  TensorApply<ArgType3, T> expr3_;
+};
+
+/**
+ * \brief The const expression evaluator classes.
+ */
+template <class OP, typename ArgType, class T>
+class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
+ public:
+  explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
+      : op_(expr.op_), expr_(expr.expr_) {}
+
+  INLINE T apply(int i, int j) const { return op_(i, j); }
+  INLINE T apply(int index) const { return op_(index); }
+
+  INLINE size_t getWidth() const { return expr_.getWidth(); }
+  INLINE size_t getHeight() const { return expr_.getHeight(); }
+  INLINE bool isContiguous() const { return true; }
+  INLINE bool useGpu() const { return expr_.useGpu(); }
+
+  const OP op_;
+  TensorApply<ArgType, T> expr_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/TensorAssign.h b/paddle/legacy/math/TensorAssign.h
new file mode 100644
index 0000000000000000000000000000000000000000..efbfce6c4f88197f18285e3679698b8bbb1ed3b8
--- /dev/null
+++ b/paddle/legacy/math/TensorAssign.h
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+/**
+ * \brief Tensor Assign Expression(return by lazyAssign,
+ * and evaluated by AssignEvaluate)
+ */
+template <typename LhsType, typename RhsType, class T>
+class TensorAssignOp {
+ public:
+  explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
+      : lhs_(lhs), rhs_(rhs) {
+#ifndef __CUDA_ARCH__
+    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+#endif
+  }
+
+  INLINE void apply(const int i, const int j) {
+    lhs_.applyRef(i, j) = rhs_.apply(i, j);
+  }
+  INLINE void apply(const int index) {
+    lhs_.applyRef(index) = rhs_.apply(index);
+  }
+
+  INLINE size_t getWidth() const { return lhs_.getWidth(); }
+  INLINE size_t getHeight() const { return rhs_.getHeight(); }
+  INLINE bool isContiguous() const {
+    return lhs_.isContiguous() && rhs_.isContiguous();
+  }
+  INLINE bool useGpu() const { return lhs_.useGpu(); }
+
+ private:
+  TensorApply<LhsType, T> lhs_;
+  TensorApply<const RhsType, T> rhs_;
+};
+
+template <typename Assign, typename... AssignOp>
+void AssignCpuEvaluate(int height,
+                       int width,
+                       bool isContiguous,
+                       Assign&& assign,
+                       AssignOp&&... args) {
+  if (isContiguous) {
+    int size = height * width;
+    for (int index = 0; index < size; index++) {
+      assign.apply(index);
+      __attribute__((unused)) int dummy[] = {(((args)).apply(index), 0)...};
+    }
+  } else {
+    for (int i = 0; i < height; i++) {
+      for (int j = 0; j < width; j++) {
+        assign.apply(i, j);
+        __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
+      }
+    }
+  }
+}
+
+#ifdef __NVCC__
+template <typename Assign, typename... AssignOp>
+__global__ void AssignGpuEvaluate1(const int border,
+                                   Assign assign,
+                                   AssignOp... args) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < border) {
+    assign.apply(idx);
+    __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
+  }
+}
+
+template <typename Assign, typename... AssignOp>
+__global__ void AssignGpuEvaluate2(const int height,
+                                   const int width,
+                                   Assign assign,
+                                   AssignOp... args) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
+    for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
+      assign.apply(i, j);
+      __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
+    }
+  }
+}
+#endif
+
+/**
+ * \brief Evaluate one or more TensorAssignOp objects.
+ *
+ * \note At least one assignment expression is required
+ */
+template <typename Assign, typename... AssignOp>
+void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
+  const bool useGpu_ = assign.useGpu();
+  bool isContiguous_ = assign.isContiguous();
+  const size_t height = assign.getHeight();
+  const size_t width = assign.getWidth();
+
+  const int packSize = sizeof...(args);
+  const bool packUseGpu[] = {((args)).useGpu()...};
+  const bool packIsContiguous[] = {((args)).isContiguous()...};
+  const size_t packHeight[] = {((args)).getHeight()...};
+  const size_t packWidth[] = {((args)).getWidth()...};
+
+  for (int i = 0; i < packSize; i++) {
+    CHECK_EQ(useGpu_, packUseGpu[i]);
+    CHECK_EQ(height, packHeight[i]);
+    CHECK_EQ(width, packWidth[i]);
+    isContiguous_ = isContiguous_ && packIsContiguous[i];
+  }
+
+  if (useGpu_) {
+#ifdef __NVCC__
+    if (isContiguous_) {
+      int size = height * width;
+      int blockSize = size <= 1024 ? size : 1024;
+      int gridSize = (size + 1024 - 1) / 1024;
+      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+          size, assign, args...);
+    } else {
+      int blockSizeY = std::min(32, (int)height);
+      int blockSizeX = (32 / blockSizeY) * 32;
+      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
+      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
+      dim3 threads(blockSizeX, blockSizeY);
+      dim3 grid(gridSizeX, gridSizeY);
+      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
+          height, width, assign, args...);
+    }
+
+    CHECK_SYNC("AssignEvaluate failed");
+#endif
+  } else {
+    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/TensorEvaluate.h b/paddle/legacy/math/TensorEvaluate.h
new file mode 100644
index 0000000000000000000000000000000000000000..3029dd35fb05c893f99cde0689f816f4257f21c4
--- /dev/null
+++ b/paddle/legacy/math/TensorEvaluate.h
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include "hl_base.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+/**
+ * \brief The tensor cpu evaluate api.
+ */
+template <class T, typename LeftType, typename RightType>
+inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
+  TensorApply<LeftType, T> lhs_(lhs);
+  TensorApply<const RightType, T> rhs_(rhs);
+  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+
+  int height = lhs_.getHeight();
+  int width = lhs_.getWidth();
+  if (lhs_.isContiguous() && rhs_.isContiguous()) {
+    int size = height * width;
+    for (int index = 0; index < size; index++) {
+      lhs_.applyRef(index) = rhs_.apply(index);
+    }
+  } else {
+    for (int i = 0; i < height; i++) {
+      for (int j = 0; j < width; j++) {
+        lhs_.applyRef(i, j) = rhs_.apply(i, j);
+      }
+    }
+  }
+}
+
+#ifdef __NVCC__
+template <typename LeftType, typename RightType>
+__global__ void TensorElementWiseOp(LeftType lhs,
+                                    RightType rhs,
+                                    const int border) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < border) {
+    lhs.applyRef(idx) = rhs.apply(idx);
+  }
+}
+
+template <typename LeftType, typename RightType>
+__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
+  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
+    for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
+      lhs.applyRef(i, j) = rhs.apply(i, j);
+    }
+  }
+}
+
+/**
+ * \brief The tensor gpu evaluate api.
+ */
+template <class T, typename LeftType, typename RightType>
+inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
+  TensorApply<LeftType, T> lhs_(lhs);
+  TensorApply<const RightType, T> rhs_(rhs);
+  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
+  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
+  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
+
+  int dimM = lhs_.getHeight();
+  int dimN = lhs_.getWidth();
+
+  if (lhs_.isContiguous() && rhs_.isContiguous()) {
+    int size = dimM * dimN;
+    int blockSize = size <= 1024 ? size : 1024;
+    int gridSize = (size + 1024 - 1) / 1024;
+    TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+        lhs_, rhs_, size);
+  } else {
+    int blockSizeY = std::min(32, dimM);
+    int blockSizeX = (32 / blockSizeY) * 32;
+    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
+    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
+    dim3 threads(blockSizeX, blockSizeY);
+    dim3 grid(gridSizeX, gridSizeY);
+    TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
+  }
+
+  CHECK_SYNC("TensorGpuApply failed");
+}
+#else
+template <class T, typename LeftType, typename RightType>
+inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {
+  LOG(FATAL) << "Since it is gcc compiled, "
+                "this calculation does not support GPU implementation.";
+}
+#endif
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/TensorExpression.h b/paddle/legacy/math/TensorExpression.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c6cf07831487165445a3f59931c4ca9196375b9
--- /dev/null
+++ b/paddle/legacy/math/TensorExpression.h
@@ -0,0 +1,446 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <stdint.h>
+#include <cstddef>
+#include "hl_tensor_ops.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+template <class OP, typename ExprType, class T>
+class TensorConstant;
+template <class OP, typename ExprType, class T>
+class TensorUnaryOp;
+template <class OP, typename LhsType, typename RhsType, class T>
+class TensorBinaryOp;
+template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
+class TensorTernaryOp;
+
+template <typename LhsType, typename RhsType, class T>
+class TensorAssignOp;
+
+/**
+ * \brief Tensor base class.
+ *
+ * This is the base class of all Tensor and Expression class.
+ */
+template <typename Derived, class T>
+class TensorExpression {
+ public:
+  /**
+   * Element wise unary expression.
+   */
+  template <typename UnaryOp>
+  const TensorUnaryOp<UnaryOp, const Derived, T> unaryExpression(
+      const UnaryOp& op) const {
+    return TensorUnaryOp<UnaryOp, const Derived, T>(op, derived());
+  }
+
+  const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
+      T p) const {
+    return unaryExpression(hppl::unary::add_scale<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::sub_scale<T>, const Derived, T> operator-(
+      T p) const {
+    return unaryExpression(hppl::unary::sub_scale<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
+      T p) const {
+    return unaryExpression(hppl::unary::mul_scale<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::div_scale<T>, const Derived, T> operator/(
+      T p) const {
+    return unaryExpression(hppl::unary::div_scale<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::neg<T>, const Derived, T> operator-() const {
+    return unaryExpression(hppl::unary::neg<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::exp_op<T>, const Derived, T> exp() const {
+    return unaryExpression(hppl::unary::exp_op<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::log_op<T>, const Derived, T> log() const {
+    return unaryExpression(hppl::unary::log_op<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::sqrt_op<T>, const Derived, T> sqrt() const {
+    return unaryExpression(hppl::unary::sqrt_op<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::square<T>, const Derived, T> square() const {
+    return unaryExpression(hppl::unary::square<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::reciprocal<T>, const Derived, T> reciprocal()
+      const {
+    return unaryExpression(hppl::unary::reciprocal<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::abs<T>, const Derived, T> abs() const {
+    return unaryExpression(hppl::unary::abs<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::sign<T>, const Derived, T> sign() const {
+    return unaryExpression(hppl::unary::sign<T>());
+  }
+
+  const TensorUnaryOp<hppl::unary::pow_op<T>, const Derived, T> pow(T p) const {
+    return unaryExpression(hppl::unary::pow_op<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::min<T>, const Derived, T> min(T p) const {
+    return unaryExpression(hppl::unary::min<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::max<T>, const Derived, T> max(T p) const {
+    return unaryExpression(hppl::unary::max<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_eq<T>, const Derived, T> operator==(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_eq<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_ne<T>, const Derived, T> operator!=(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_ne<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_le<T>, const Derived, T> operator<=(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_le<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_lt<T>, const Derived, T> operator<(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_lt<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_ge<T>, const Derived, T> operator>=(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_ge<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::cmp_gt<T>, const Derived, T> operator>(
+      T p) const {
+    return unaryExpression(hppl::unary::cmp_gt<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::and_op<T>, const Derived, T> operator&&(
+      T p) const {
+    return unaryExpression(hppl::unary::and_op<T>(p));
+  }
+
+  const TensorUnaryOp<hppl::unary::or_op<T>, const Derived, T> operator||(
+      T p) const {
+    return unaryExpression(hppl::unary::or_op<T>(p));
+  }
+
+  /**
+   * Element wise binary expression.
+   */
+  template <typename BinaryOp, typename ExpressionType>
+  const TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>
+  binaryExpression(const BinaryOp& op, const ExpressionType& expr) const {
+    return TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>(
+        op, derived(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_eq<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator==(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_eq<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_ne<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator!=(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_ne<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_le<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator<=(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_le<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_lt<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator<(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_lt<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_ge<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator>=(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_ge<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::cmp_gt<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator>(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::cmp_gt<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::and_op<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator&&(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::and_op<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::or_op<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator||(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::or_op<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::add<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator+(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::add<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::sub<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator-(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::sub<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::mul<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator*(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::mul<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::div<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  operator/(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::div<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::min<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  min(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::min<T>(), expr);
+  }
+
+  template <typename ExpressionType>
+  const TensorBinaryOp<hppl::binary::max<T>,
+                       const Derived,
+                       const ExpressionType,
+                       T>
+  max(const ExpressionType& expr) const {
+    return binaryExpression(hppl::binary::max<T>(), expr);
+  }
+
+  /**
+   * Element wise ternary expression.
+   *
+   * ternary conditional operator(?: operator).
+   * The conditional expression returns one of two values depending on
+   * the result of derived expression.
+   * If derived expression evaluates to true, then expression1 is evaluated.
+   * If derived expression evaluates to false, then expression2 is evaluated.
+   */
+  template <typename ExprType1, typename ExprType2>
+  const TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>
+  condition(const ExprType1& expr1, const ExprType2& expr2) const {
+    return TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>(
+        derived(), expr1, expr2);
+  }
+
+  template <typename ExprType>
+  const TensorTernaryOp<
+      const Derived,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      const ExprType,
+      T>
+  condition(T p, const ExprType& expr) const {
+    return condition(constant(p), expr);
+  }
+
+  template <typename ExprType>
+  const TensorTernaryOp<
+      const Derived,
+      const ExprType,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      T>
+  condition(const ExprType& expr, T p) const {
+    return condition(expr, constant(p));
+  }
+
+  const TensorTernaryOp<
+      const Derived,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
+      T>
+  condition(T p1, T p2) const {
+    return condition(constant(p1), constant(p2));
+  }
+
+  /**
+   * return a TensorConstant. A TensorConstant object hold a constant value.
+   */
+  const TensorConstant<hppl::unary::constant<T>, const Derived, T> constant(
+      T p) const {
+    return TensorConstant<hppl::unary::constant<T>, const Derived, T>(
+        hppl::unary::constant<T>(p), derived());
+  }
+
+  /**
+   * return a TensorAssignOp, and use AssignEvaluate to evaluate one or more
+   * TensorAssignOp objects.
+   */
+  template <typename ExpressionType>
+  TensorAssignOp<Derived, ExpressionType, T> lazyAssign(
+      const ExpressionType& expr) const {
+    return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
+  }
+
+ protected:
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+};
+
+/**
+ * \brief Unary Operator Expression
+ */
+template <class OP, typename ExprType, class T>
+class TensorUnaryOp
+    : public TensorExpression<TensorUnaryOp<OP, ExprType, T>, T> {
+ public:
+  explicit TensorUnaryOp(const OP op, const ExprType& expr)
+      : op_(op), expr_(expr) {}
+
+  const OP op_;
+  const ExprType expr_;
+};
+
+/**
+ * \brief Binary Operator Expression
+ */
+template <class OP, typename LhsType, typename RhsType, class T>
+class TensorBinaryOp
+    : public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
+ public:
+  explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs)
+      : op_(op), lhs_(lhs), rhs_(rhs) {}
+
+  const OP op_;
+  const LhsType lhs_;
+  const RhsType rhs_;
+};
+
+/**
+ * \brief Ternary Operator Expression
+ */
+template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
+class TensorTernaryOp : public TensorExpression<
+                            TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
+                            T> {
+ public:
+  explicit TensorTernaryOp(const ExprType1& expr1,
+                           const ExprType2& expr2,
+                           const ExprType3& expr3)
+      : expr1_(expr1), expr2_(expr2), expr3_(expr3) {}
+
+  const ExprType1 expr1_;
+  const ExprType2 expr2_;
+  const ExprType3 expr3_;
+};
+
+/**
+ * \brief Constant Expression
+ */
+template <class OP, typename ExprType, class T>
+class TensorConstant
+    : public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
+ public:
+  explicit TensorConstant(const OP op, const ExprType& expr)
+      : op_(op), expr_(expr) {}
+
+  const OP op_;
+  const ExprType expr_;
+};
+
+/**
+ * \brief operator+ overload
+ * \return a unary operator expression
+ */
+template <typename Derived, class T>
+const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
+    T p, const TensorExpression<Derived, T>& expr) {
+  return expr + p;
+}
+
+/**
+ * \brief operator* overload
+ * \return a unary operator expression
+ */
+template <typename Derived, class T>
+const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
+    T p, const TensorExpression<Derived, T>& expr) {
+  return expr * p;
+}
+
+}  // namespace paddle
+
+#include "TensorApply.h"
+#include "TensorEvaluate.h"
diff --git a/paddle/legacy/math/TrainingAlgorithmOp.cu b/paddle/legacy/math/TrainingAlgorithmOp.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9e1eaa0f45ae94d12cf7763bbaff632fc473bcc8
--- /dev/null
+++ b/paddle/legacy/math/TrainingAlgorithmOp.cu
@@ -0,0 +1,356 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BaseMatrix.h"
+#include "TrainingAlgorithmOp.h"
+#include "paddle/legacy/utils/Logging.h"
+
+#if __cplusplus > 199711L
+
+#include "TensorAssign.h"
+
+namespace paddle {
+
+void sparseMomentumApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& momU,
+                         BaseMatrix& momV,
+                         real alpha,
+                         real beta,
+                         real gamma,
+                         real tau,
+                         real learningRate) {
+  auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
+  auto expr2 =
+      momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad);
+  auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU +
+                                ((real)1 / beta) * momV);
+
+  AssignEvaluate(expr1, expr2, expr3);
+}
+
+void adadeltaApply(BaseMatrix& value,
+                   BaseMatrix& grad,
+                   BaseMatrix& mom,
+                   BaseMatrix& accum,
+                   BaseMatrix& accum_update,
+                   BaseMatrix& lr,
+                   real rou,
+                   real epsilon,
+                   real learningRate,
+                   real momentum,
+                   real decayRate) {
+  auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
+  auto expr2 =
+      lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt());
+  auto expr3 = accum_update.lazyAssign(rou * accum_update +
+                                       ((real)1 - rou) * (grad * lr).square());
+  auto expr4 = mom.lazyAssign(mom * momentum -
+                              learningRate * lr * (grad + value * decayRate));
+  auto expr5 = value.lazyAssign(value + mom);
+
+  AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
+}
+
+void adagradApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& accum_buffer,
+                  BaseMatrix& accum,
+                  BaseMatrix& lr,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate) {
+  auto expr1 = accum.lazyAssign(accum + grad.square());
+  auto expr2 =
+      lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal());
+  auto expr3 = mom.lazyAssign(mom * momentum -
+                              learningRate * lr * (grad + value * decayRate));
+  auto expr4 = value.lazyAssign(value + mom);
+
+  AssignEvaluate(expr1, expr2, expr3, expr4);
+}
+
+void rmspropApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& g,
+                  BaseMatrix& f,
+                  BaseMatrix& lr,
+                  real accumulatedRou,
+                  real rou,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate,
+                  bool firstTime) {
+  auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
+  auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
+  auto expr4 = mom.lazyAssign(mom * momentum -
+                              learningRate * lr * (grad + value * decayRate));
+  auto expr5 = value.lazyAssign(value + mom);
+
+  if (firstTime) {
+    auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square());
+
+    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
+  } else {
+    auto expr1 =
+        g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square());
+
+    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
+  }
+}
+
+void decayedAdagradApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& mom,
+                         BaseMatrix& accum,
+                         BaseMatrix& lr,
+                         real accumulatedRou,
+                         real rou,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate,
+                         bool firstTime) {
+  auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
+  auto expr3 = mom.lazyAssign(mom * momentum -
+                              learningRate * lr * (grad + value * decayRate));
+  auto expr4 = value.lazyAssign(value + mom);
+
+  if (firstTime) {
+    auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square());
+
+    AssignEvaluate(expr1, expr2, expr3, expr4);
+  } else {
+    auto expr1 = accum.lazyAssign(accumulatedRou * accum +
+                                  ((real)1 - rou) * grad.square());
+
+    AssignEvaluate(expr1, expr2, expr3, expr4);
+  }
+}
+
+void adamApply(BaseMatrix& value,
+               BaseMatrix& grad,
+               BaseMatrix& mom,  // firse moment
+               BaseMatrix& v,    // second moment
+               real beta1,
+               real beta2,
+               real beta1_power,
+               real beta2_power,
+               real epsilon,
+               real learningRate) {
+  real alpha =
+      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+
+  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
+  auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
+  auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon));
+
+  AssignEvaluate(expr1, expr2, expr3);
+}
+
+void adamaxApply(BaseMatrix& value,
+                 BaseMatrix& grad,
+                 BaseMatrix& mom,  // firse moment
+                 BaseMatrix& u,    // weighted infinity norm
+                 real beta1,
+                 real beta2,
+                 int64_t step,
+                 real alpha) {
+  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
+  auto expr2 =
+      u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
+  auto expr3 = value.lazyAssign(
+      value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
+
+  AssignEvaluate(expr1, expr2, expr3);
+}
+
+}  // namespace paddle
+
+#else
+
+namespace paddle {
+
+void sparseMomentumApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& momU,
+                         BaseMatrix& momV,
+                         real alpha,
+                         real beta,
+                         real gamma,
+                         real tau,
+                         real learningRate) {
+  /**
+   * \alpha_t = \alpha_{t-1} / k
+   * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
+   * u_t = u_{t-1} - \alpha_t \gamma_t g_t
+   * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
+   * \tau_t = \tau_{t-1} + \beta_t / \alpha_t
+   */
+  momU -= (alpha * gamma * learningRate) * grad;
+  momV += (tau * alpha * gamma * learningRate) * grad;
+  value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV;
+}
+
+void adadeltaApply(BaseMatrix& value,
+                   BaseMatrix& grad,
+                   BaseMatrix& mom,
+                   BaseMatrix& accum,
+                   BaseMatrix& accum_update,
+                   BaseMatrix& lr,
+                   real rou,
+                   real epsilon,
+                   real learningRate,
+                   real momentum,
+                   real decayRate) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  accum = rou * accum + ((real)1 - rou) * grad.square();
+
+  // learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ))
+  lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt();
+
+  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
+  accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square();
+
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+
+void adagradApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& accum_buffer,
+                  BaseMatrix& accum,
+                  BaseMatrix& lr,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate) {
+  accum += grad.square();
+  lr = (accum_buffer + accum + epsilon).sqrt().reciprocal();
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+
+void rmspropApply(BaseMatrix& value,
+                  BaseMatrix& grad,
+                  BaseMatrix& mom,
+                  BaseMatrix& g,
+                  BaseMatrix& f,
+                  BaseMatrix& lr,
+                  real accumulatedRou,
+                  real rou,
+                  real epsilon,
+                  real learningRate,
+                  real momentum,
+                  real decayRate,
+                  bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  if (firstTime) {
+    g = accumulatedRou * g + grad.square();
+  } else {
+    g = accumulatedRou * g + ((real)1 - rou) * grad.square();
+  }
+
+  // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g
+  f = accumulatedRou * f + ((real)1 - rou) * grad;
+
+  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon )
+  // Basiclly if the sign of the gradient changes more often,
+  // the learning rate will be decreased.
+  lr = (g - f.square() + epsilon).sqrt().reciprocal();
+
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+
+void decayedAdagradApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& mom,
+                         BaseMatrix& accum,
+                         BaseMatrix& lr,
+                         real accumulatedRou,
+                         real rou,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate,
+                         bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  if (firstTime) {
+    accum = accumulatedRou * accum + grad.square();
+  } else {
+    accum = accumulatedRou * accum + ((real)1 - rou) * grad.square();
+  }
+
+  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
+  // Basiclly if the bigger the magnitude gradient is,
+  // the smaller the learning rate will be.
+  lr = (accum + epsilon).sqrt().reciprocal();
+
+  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
+  value += mom;
+}
+
+void adamApply(BaseMatrix& value,
+               BaseMatrix& grad,
+               BaseMatrix& mom,  // firse moment
+               BaseMatrix& v,    // second moment
+               real beta1,
+               real beta2,
+               real beta1_power,
+               real beta2_power,
+               real epsilon,
+               real learningRate) {
+  real alpha =
+      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  mom = beta1 * mom + ((real)1 - beta1) * grad;
+
+  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
+  v = beta2 * v + ((real)1 - beta2) * grad.square();
+
+  value -= (mom * alpha) / (v.sqrt() + epsilon);
+}
+
+void adamaxApply(BaseMatrix& value,
+                 BaseMatrix& grad,
+                 BaseMatrix& mom,  // firse moment
+                 BaseMatrix& u,    // weighted infinity norm
+                 real beta1,
+                 real beta2,
+                 int64_t step,
+                 real alpha) {
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  mom = beta1 * mom + ((real)1 - beta1) * grad;
+
+  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
+  u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs());
+
+  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
+  value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u);
+}
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/legacy/math/TrainingAlgorithmOp.h b/paddle/legacy/math/TrainingAlgorithmOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..921c2742cfe2576785768da40ab11c94234be966
--- /dev/null
+++ b/paddle/legacy/math/TrainingAlgorithmOp.h
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "BaseMatrix.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+/**
+ * \brief Sparse Momentum optimizer.
+ */
+extern void sparseMomentumApply(BaseMatrix& value,
+                                BaseMatrix& grad,
+                                BaseMatrix& momU,
+                                BaseMatrix& momV,
+                                real alpha,
+                                real beta,
+                                real gamma,
+                                real tau,
+                                real learningRate);
+
+/**
+ * \brief AdaDelta optimizer.
+ */
+extern void adadeltaApply(BaseMatrix& value,
+                          BaseMatrix& grad,
+                          BaseMatrix& sum,
+                          BaseMatrix& sum1,
+                          BaseMatrix& mom,
+                          BaseMatrix& lr,
+                          real rou,
+                          real epsilon,
+                          real learningRate,
+                          real momentum,
+                          real decayRate);
+
+/**
+ * \brief AdaGrad optimizer.
+ */
+extern void adagradApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& sum,
+                         BaseMatrix& sum1,
+                         BaseMatrix& mom,
+                         BaseMatrix& lr,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate);
+
+/**
+ * \brief RMSProp optimizer.
+ */
+extern void rmspropApply(BaseMatrix& value,
+                         BaseMatrix& grad,
+                         BaseMatrix& g,
+                         BaseMatrix& f,
+                         BaseMatrix& mom,
+                         BaseMatrix& lr,
+                         real accumulatedRou,
+                         real rou,
+                         real epsilon,
+                         real learningRate,
+                         real momentum,
+                         real decayRate,
+                         bool firstTime);
+
+/**
+ * \brief Decayed AdaGrad optimizer.
+ */
+extern void decayedAdagradApply(BaseMatrix& value,
+                                BaseMatrix& grad,
+                                BaseMatrix& mom,
+                                BaseMatrix& accum,
+                                BaseMatrix& lr,
+                                real accumulatedRou,
+                                real rou,
+                                real epsilon,
+                                real learningRate,
+                                real momentum,
+                                real decayRate,
+                                bool firstTime);
+
+/**
+ * \brief Adam optimizer.
+ */
+extern void adamApply(BaseMatrix& value,
+                      BaseMatrix& grad,
+                      BaseMatrix& mom,
+                      BaseMatrix& v,
+                      real beta1,
+                      real beta2,
+                      real beta1_power,
+                      real beta2_power,
+                      real epsilon,
+                      real learningRate);
+
+/**
+ * \brief AdaMax optimizer.
+ */
+extern void adamaxApply(BaseMatrix& value,
+                        BaseMatrix& grad,
+                        BaseMatrix& mom,  // firse moment
+                        BaseMatrix& u,    // weighted infinity norm
+                        real beta1,
+                        real beta2,
+                        int64_t step,
+                        real alpha);
+}  // namespace paddle
diff --git a/paddle/legacy/math/Vector.cpp b/paddle/legacy/math/Vector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..87f48bb1622f28f8cb53e5afc924f5cadb14c528
--- /dev/null
+++ b/paddle/legacy/math/Vector.cpp
@@ -0,0 +1,1091 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Vector.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include <memory>
+#include "Matrix.h"
+#include "hl_gpu.h"
+#include "hl_matrix.h"
+#include "hl_table_apply.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Thread.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+namespace paddle {
+
+template <class T>
+std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size, bool useGpu) {
+  if (useGpu) {
+    return std::make_shared<GpuVectorT<T>>(size);
+  } else {
+    return std::make_shared<CpuVectorT<T>>(size);
+  }
+}
+
+template <class T>
+std::shared_ptr<VectorT<T>> VectorT<T>::createParallelVector(
+    size_t size, bool useGpu, SyncThreadPool* pool) {
+  if (!useGpu && FLAGS_trainer_count > 1 && FLAGS_enable_parallel_vector &&
+      size >= (size_t)FLAGS_enable_parallel_vector) {
+    return std::make_shared<ParallelCpuVectorT<T>>(
+        size, pool ? pool : getGlobalSyncThreadPool());
+  } else {
+    return create(size, useGpu);
+  }
+}
+
+template <class T>
+std::shared_ptr<VectorT<T>> VectorT<T>::create(T* data,
+                                               size_t size,
+                                               bool useGpu) {
+  if (useGpu) {
+    return std::make_shared<GpuVectorT<T>>(size, data);
+  } else {
+    return std::make_shared<CpuVectorT<T>>(size, data);
+  }
+}
+
+template <class T>
+std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size,
+                                               MemoryHandlePtr memoryHandle,
+                                               size_t offset) {
+  if (auto cpuMemHandle =
+          std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle)) {
+    return std::make_shared<CpuVectorT<T>>(size, cpuMemHandle, offset);
+  } else if (auto gpuMemHandle =
+                 std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle)) {
+    return std::make_shared<GpuVectorT<T>>(size, gpuMemHandle, offset);
+  } else {
+    LOG(FATAL) << "Wrong";
+    return NULL;
+  }
+}
+
+template <>
+MatrixPtr VectorT<real>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
+  LOG(FATAL) << "Wrong for real vector";
+  return nullptr;
+}
+
+template <>
+MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
+  size_t height = getSize();
+  size_t width = idRange;
+  MatrixPtr mat = Matrix::createSparseMatrix(
+      height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu);
+
+  CpuIVector cpuIds(height);
+  cpuIds.copyFrom(*this);
+  int* idData = cpuIds.getData();
+
+  for (decltype(height) i = 0; i < height; i++) {
+    const unsigned int id = idData[i];
+    CHECK_LT(id, width);
+    mat->setRow(i, 1, &id, nullptr);
+  }
+  return mat;
+}
+
+template <>
+std::shared_ptr<VectorT<int>> VectorT<real>::castToInt() {
+  std::shared_ptr<VectorT<int>> ret = IVector::create(this->getSize(), useGpu_);
+  if (useGpu_) {
+    hl_vector_cast2int(ret->getData(), this->getData(), this->getSize());
+  } else {
+    for (size_t i = 0; i < getSize(); ++i) {
+      ret->getData()[i] = int(this->getData()[i]);
+    }
+  }
+  return ret;
+}
+
+template <class T>
+GpuVectorT<T>::GpuVectorT(size_t size)
+    : VectorT<T>(size,
+                 std::make_shared<GpuMemoryHandle>(sizeof(T) * size),
+                 0, /* offset = 0 */
+                 true /* useGpu = true */) {}
+
+template <class T>
+T GpuVectorT<T>::getElement(size_t i) const {
+  T elem = 0;
+  hl_memcpy_device2host(&elem, const_cast<T*>(&this->getData()[i]), sizeof(T));
+  return elem;
+}
+template <class T>
+void GpuVectorT<T>::setElement(size_t i, const T& value) {
+  hl_memcpy_host2device(&this->getData()[i], const_cast<T*>(&value), sizeof(T));
+}
+
+template <class T>
+T* GpuVectorT<T>::getPoint(const uint64_t beginPos) {
+  LOG(FATAL) << "Not implemented" << beginPos;
+  return NULL;
+}
+
+template <>
+int GpuVectorT<int>::getAbsSum() {
+  LOG(FATAL) << "Not implemented";
+  return 0;
+}
+
+template <>
+int GpuVectorT<int>::getSum() {
+  LOG(FATAL) << "Not implemented";
+  return 0;
+}
+
+template <>
+real GpuVectorT<real>::getAbsSum() {
+  real* A = this->getData();
+  real sum = 0;
+  hl_vector_abs_sum(A, &sum, this->getSize());
+  return sum;
+}
+
+template <>
+real GpuVectorT<real>::getSum() {
+  real* A = this->getData();
+  real sum = 0;
+  hl_vector_sum(A, &sum, this->getSize());
+  return sum;
+}
+
+template <>
+int GpuVectorT<int>::getMax() {
+  CpuIVector cpuIVec = CpuIVector(this->getSize());
+  copyTo(&cpuIVec);
+  return cpuIVec.getMax();
+}
+
+template <>
+int GpuVectorT<int>::getAbsMax() {
+  CpuIVector cpuIVec = CpuIVector(this->getSize());
+  copyTo(&cpuIVec);
+  return cpuIVec.getAbsMax();
+}
+
+template <class T>
+void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
+  BaseMatrixT<T>::isEqualTo((BaseMatrixT<T>&)b, value);
+}
+
+template <class T>
+void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
+#ifdef PADDLE_WITH_CUDA
+  hl_vector_select_from<T>(this->getData(),
+                           this->getSize(),
+                           src.getData(),
+                           src.getSize(),
+                           ids.getData(),
+                           ids.getSize());
+#endif
+}
+
+template <class Func>
+real gpuRowFunc(Func f, GpuVector& v) {
+  static ThreadLocal<std::unique_ptr<CpuVectorT<real>>> local;
+  if (!*local) {
+    (*local).reset(new CpuVector(1));
+  }
+  real* A = v.getData();
+  f(A, (*local)->getData(), 1, v.getSize());
+  return (*local)->getData()[0];
+}
+
+template <>
+real GpuVectorT<real>::getMax() {
+  return gpuRowFunc(hl_matrix_row_max, *this);
+}
+
+template <>
+real GpuVectorT<real>::getAbsMax() {
+  return std::max(gpuRowFunc(hl_matrix_row_max, *this),
+                  -gpuRowFunc(hl_matrix_row_min, *this));
+}
+
+template <>
+int GpuVectorT<int>::getMin() {
+  LOG(FATAL) << "Not implemented";
+  return 0;
+}
+
+template <>
+real GpuVectorT<real>::getMin() {
+  return gpuRowFunc(hl_matrix_row_min, *this);
+}
+
+template <class T>
+T GpuVectorT<T>::get(size_t pos) {
+  T val = (T)0;
+  hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), sizeof(T));
+  return val;
+}
+
+template <class T>
+void GpuVectorT<T>::histogram(std::ostream& os, int type) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <class T>
+void GpuVectorT<T>::zeroMem() {
+  BaseMatrixT<T>::zero();
+}
+
+template <class T>
+void GpuVectorT<T>::reset(const T& value) {
+  BaseMatrixT<T>::assign(value);
+}
+
+template <class T>
+void GpuVectorT<T>::fillSequence() {
+  LOG(FATAL) << "not implemented";
+}
+
+template <class T>
+void GpuVectorT<T>::copyFrom(const VectorT<T>& src) {
+  src.copyTo(this);
+}
+
+template <class T>
+void GpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
+  CHECK_EQ(src.getSize(), this->getSize());
+  hl_memcpy_async((void*)this->getData(),
+                  (void*)src.getData(),
+                  sizeof(T) * this->getSize(),
+                  stream);
+}
+
+template <class T>
+void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size) {
+  CHECK(gpuSrc != NULL);
+  CHECK_LE(size, this->size_);
+
+  hl_memcpy((void*)this->getData(), (void*)gpuSrc, sizeof(T) * size);
+}
+
+template <class T>
+void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size, hl_stream_t stream) {
+  CHECK(gpuSrc != NULL);
+  CHECK_LE(size, this->size_);
+
+  hl_memcpy_async(
+      (void*)this->getData(), (void*)gpuSrc, sizeof(T) * size, stream);
+}
+
+template <class T>
+void GpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
+  CHECK_EQ(this->getSize(), dest->getSize());
+
+  hl_memcpy_device2host((void*)dest->getData(),
+                        (void*)this->getData(),
+                        sizeof(T) * this->getSize());
+}
+
+template <class T>
+void GpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
+  CHECK_EQ(this->getSize(), dest->getSize());
+
+  hl_memcpy_device2device((void*)dest->getData(),
+                          (void*)this->getData(),
+                          sizeof(T) * this->getSize());
+}
+
+template <>
+void GpuVectorT<int>::rand() {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <>
+void GpuVectorT<int>::print(std::ostream& os, size_t num) const {
+  IVectorPtr dest = IVector::create(this->size_, false);
+  hl_memcpy_device2host((void*)dest->getData(),
+                        (void*)this->getData(),
+                        sizeof(int) * this->getSize());
+  dest->print(os, num);
+}
+
+template <>
+void GpuVectorT<real>::print(std::ostream& os, size_t num) const {
+  VectorPtr dest = Vector::create(this->size_, false);
+  hl_memcpy_device2host((void*)dest->getData(),
+                        (void*)this->getData(),
+                        sizeof(int) * this->getSize());
+  dest->print(os, num);
+}
+
+template <>
+void GpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <>
+void GpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <>
+void CpuVectorT<int>::rand() {
+  LOG(FATAL) << "Not implemented";
+}
+template <>
+void GpuVectorT<real>::rand(size_t classNum) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <>
+void CpuVectorT<real>::rand(size_t classNum) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <>
+void GpuVectorT<real>::rand() {
+  VectorPtr cPtr = Vector::create(this->size_, false);
+  cPtr->rand();
+
+  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(real));
+}
+
+template <>
+void GpuVectorT<int>::rand(size_t classNum) {
+  IVectorPtr cPtr = IVector::create(this->size_, false);
+  cPtr->rand(classNum);
+
+  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(int));
+}
+
+template <>
+void CpuVectorT<int>::rand(size_t classNum) {
+  size_t size = this->getSize();
+  int* data = this->getData();
+  for (size_t i = 0; i < size; i++) {
+    data[i] =
+        std::min(classNum - 1,
+                 size_t(::rand() * (1. / ((double)RAND_MAX + 1)) * classNum));
+  }
+}
+
+template <>
+void CpuVectorT<real>::rand() {
+  size_t size = this->getSize();
+  real* data = this->getData();
+  for (size_t i = 0; i < size; i++) {
+    data[i] = ::rand() * (1. / (double)RAND_MAX);
+    // data[ii] = ((temp > RAND_MAX/2)? 1 : -1) *
+    // sqrt( abs((temp-RAND_MAX/2))/(double(RAND_MAX))/2048 );
+  }
+}
+
+template <class T>
+void CpuVectorT<T>::randnorm(real, real) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <class T>
+void CpuVectorT<T>::uniform(real, real) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <class T>
+void GpuVectorT<T>::randnorm(real, real) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <class T>
+void GpuVectorT<T>::uniform(real, real) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <>
+void CpuVectorT<real>::randnorm(real mean, real std) {
+  size_t size = this->getSize();
+  real* data = this->getData();
+  unsigned int* seed = ThreadLocalRand::getSeed();
+  auto rand1 = [&]() { return (1. + ::rand_r(seed)) * (1. / (1. + RAND_MAX)); };
+  for (size_t i = 0; i < size - 1; i += 2) {
+    real r1 = rand1();
+    r1 = std::sqrt(-2 * std::log(r1));
+    real r2 = rand1();
+    data[i] = mean + std * r1 * cos(2 * M_PI * r2);
+    data[i + 1] = mean + std * r1 * sin(2 * M_PI * r2);
+  }
+  real r1 = rand1();
+  r1 = std::sqrt(-2 * std::log(r1));
+  real r2 = rand1();
+  data[size - 1] = mean + std * r1 * cos(2 * M_PI * r2);
+}
+
+template <>
+void CpuVectorT<real>::uniform(real left, real right) {
+  size_t size = this->getSize();
+  real* data = this->getData();
+  real range = right - left;
+  unsigned int* seed = ThreadLocalRand::getSeed();
+  auto rand1 = [&]() { return ::rand_r(seed) * (1. / (1. + RAND_MAX)); };
+  for (size_t i = 0; i < size; ++i) {
+    data[i] = rand1() * range + left;
+  }
+}
+
+template <>
+void GpuVectorT<real>::randnorm(real mean, real std) {
+  CpuVector cpuVec = CpuVector(this->getSize());
+  cpuVec.randnorm(mean, std);
+
+  hl_memcpy_host2device(
+      data_, cpuVec.getData(), this->getSize() * sizeof(real));
+}
+
+template <>
+void GpuVectorT<real>::uniform(real left, real right) {
+  CpuVector cpuVec = CpuVector(this->getSize());
+  cpuVec.uniform(left, right);
+
+  hl_memcpy_host2device(
+      data_, cpuVec.getData(), this->getSize() * sizeof(real));
+}
+
+template <class T>
+CpuVectorT<T>::CpuVectorT(size_t size)
+    : VectorT<T>(size,
+                 std::make_shared<CpuMemoryHandle>(sizeof(T) * size),
+                 0, /* offset = 0 */
+                 false /* useGpu = false */) {}
+
+template <class T>
+CpuVectorT<T>::CpuVectorT(const VectorT<T>& src)
+    : VectorT<T>(src.getSize(),
+                 src.getMemoryHandle(),
+                 0, /* offset = 0 */
+                 false /* useGpu = false */) {
+  if (typeid(*this->memoryHandle_.get()) != typeid(CpuMemoryHandle)) {
+    this->memoryHandle_ =
+        std::make_shared<CpuMemoryHandle>(sizeof(T) * this->getSize());
+    this->data_ = reinterpret_cast<T*>(this->memoryHandle_->getBuf());
+  }
+  src.copyTo(this);
+}
+
+template <class T>
+T CpuVectorT<T>::getAbsSum() {
+  const T* A = this->getData();
+  size_t size = this->getSize();
+  T sum = 0;
+  for (size_t i = 0; i < size; i++) {
+    sum += (A[i] > 0) ? A[i] : -A[i];
+  }
+  return sum;
+}
+
+// cannot use above version, due to precision issue of float
+template <>
+real CpuVectorT<real>::getAbsSum() {
+  const real* A = this->getData();
+  size_t size = this->getSize();
+  double sum = 0;
+  for (size_t i = 0; i < size; i++) {
+    sum += (A[i] > 0) ? A[i] : -A[i];
+  }
+  return sum;
+}
+
+template <class T>
+T CpuVectorT<T>::getSum() {
+  const T* A = this->getData();
+  size_t size = this->getSize();
+  T sum = 0;
+  for (size_t i = 0; i < size; i++) {
+    sum += A[i];
+  }
+  return sum;
+}
+
+template <>
+real CpuVectorT<real>::getSum() {
+  const real* A = this->getData();
+  size_t size = this->getSize();
+  double sum = 0;
+  for (size_t i = 0; i < size; i++) {
+    sum += A[i];
+  }
+  return sum;
+}
+
+template <class T>
+T CpuVectorT<T>::get(size_t pos) {
+  return this->getData()[pos];
+}
+
+template <class T>
+T CpuVectorT<T>::getMax() {
+  const T* A = this->getData();
+  size_t size = this->getSize();
+  T res = A[0];
+  for (size_t i = 1; i < size; i++) {
+    if (res < A[i]) res = A[i];
+  }
+  return res;
+}
+
+template <class T>
+T CpuVectorT<T>::getAbsMax() {
+  const T* A = this->getData();
+  size_t size = this->getSize();
+  T res = std::abs(A[0]);
+  for (size_t i = 1; i < size; i++) {
+    if (res < std::abs(A[i])) res = std::abs(A[i]);
+  }
+  return res;
+}
+
+template <class T>
+T CpuVectorT<T>::getMin() {
+  const T* A = this->getData();
+  size_t size = this->getSize();
+  T res = A[0];
+  for (size_t i = 1; i < size; i++) {
+    if (res > A[i]) res = A[i];
+  }
+  return res;
+}
+
+template <class T>
+void CpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
+  size_t size = this->getSize();
+  CHECK_EQ(b.getSize(), size);
+
+  const T* B = b.getData();
+  T* A = this->getData();
+  for (size_t i = 0; i < size; i++) {
+    A[i] = (B[i] == value);
+  }
+}
+
+template <class T>
+void CpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
+  size_t size = this->getSize();
+  CHECK_EQ(ids.getSize(), size);
+
+  const int* indices = ids.getData();
+  const T* B = src.getData();
+  T* A = this->getData();
+  for (size_t i = 0; i < size; i++) {
+    int index = indices[i];
+    CHECK_LT(index, (int)src.getSize());
+    A[i] = B[index];
+  }
+}
+
+static int getSignAndExponentOfFloat(float a) {
+  uint32_t* pa = reinterpret_cast<uint32_t*>(&a);
+  return *pa >> 23;
+}
+
+template <class T>
+void CpuVectorT<T>::histogram(std::ostream& os, int type) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <>
+void CpuVectorT<real>::histogram(std::ostream& os, int type) {
+  int counters[512];
+  memset(counters, 0, sizeof(counters));
+  int counterZero = 0;
+
+  const real* A = this->getData();
+  size_t size = this->getSize();
+  for (size_t i = 0; i < size; i++) {
+    if (A[i] == 0.0f) {
+      ++counterZero;
+    } else {
+      ++counters[getSignAndExponentOfFloat(A[i])];
+    }
+  }
+
+  int64_t sum = 0;
+  float sizeNonZero = size - counterZero;
+  os << "zero:" << counterZero;
+  for (int i = 0; i < 256; i++) {
+    int counter = counters[i];
+    if (counter) {
+      os << " 2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
+      sum += counter * (i - 127);
+    }
+  }
+  for (int i = 0; i < 256; i++) {
+    int counter = counters[i + 256];
+    if (counter) {
+      os << " -2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
+      sum += counter * (i - 127);
+    }
+  }
+  os << ", nonzero_exponent_avg=" << sum / sizeNonZero;
+}
+
+template <class T>
+void CpuVectorT<T>::zeroMem() {
+  memset(this->getData(), 0, sizeof(T) * this->getSize());
+}
+
+template <class T>
+void CpuVectorT<T>::reset(const T& value) {
+  T* A = this->getData();
+  size_t size = this->getSize();
+  for (size_t i = 0; i < size; i++) {
+    A[i] = value;
+  }
+}
+
+template <class T>
+void CpuVectorT<T>::fillSequence() {
+  T* A = this->getData();
+  size_t size = this->getSize();
+  for (size_t i = 0; i < size; i++) {
+    A[i] = i;
+  }
+}
+
+template <class T>
+void CpuVectorT<T>::copyFrom(const VectorT<T>& src) {
+  src.copyTo(this);
+}
+
+template <class T>
+void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
+  if (typeid(src) == typeid(GpuVectorT<T>)) {
+    hl_memcpy_async((void*)this->getData(),
+                    (void*)src.getData(),
+                    sizeof(T) * this->getSize(),
+                    stream);
+    // There is a need to add synchronization to ensure that the data is copied.
+    hl_stream_synchronize(stream);
+  } else {
+    src.copyTo(this);
+  }
+}
+
+template <class T>
+void CpuVectorT<T>::copyFrom(const T* hostSrc, size_t size) {
+  CHECK(hostSrc != NULL);
+  CHECK_LE(size, this->size_);
+  memcpy(this->data_, hostSrc, sizeof(T) * size);
+}
+
+template <class T>
+void CpuVectorT<T>::copyFrom(const T* hostSrc,
+                             size_t size,
+                             hl_stream_t stream) {
+  (void)stream;
+
+  CHECK(hostSrc != NULL);
+  CHECK_LE(size, this->size_);
+  memcpy(this->data_, hostSrc, sizeof(T) * size);
+}
+
+template <class T>
+void CpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
+  CHECK_EQ(this->getSize(), dest->getSize());
+  memcpy(dest->getData(), this->getData(), sizeof(T) * this->getSize());
+}
+
+template <class T>
+void CpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
+  CHECK_EQ(this->getSize(), dest->getSize());
+  hl_memcpy_host2device((void*)dest->getData(),
+                        (void*)this->getData(),
+                        sizeof(T) * this->getSize());
+}
+
+template <>
+void CpuVectorT<real>::print(std::ostream& os, size_t num) const {
+  size_t w = size_ < num ? size_ : num;
+  os << "[";
+  for (size_t i = 0; i < w; ++i) {
+    os << data_[i] << " ";
+  }
+  os << "]" << std::endl;
+}
+
+template <>
+void CpuVectorT<int>::print(std::ostream& os, size_t num) const {
+  size_t w = size_ < num ? size_ : num;
+  os << "[";
+  for (size_t i = 0; i < w; ++i) {
+    os << (int)data_[i] << " ";
+  }
+  os << "]" << std::endl;
+}
+
+template <>
+void CpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
+  CHECK_LT(idx, size_);
+  os << data_[idx] << ";";
+}
+
+template <>
+void CpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
+  CHECK_LT(idx, size_);
+  os << (int)data_[idx] << ";";
+}
+
+template <class T>
+void ParallelCpuVectorT<T>::parallelExec(ExecFunc func) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <>
+void ParallelCpuVectorT<real>::parallelExec(ExecFunc func) {
+  pool_->exec([this, func](int tid, size_t numThreads) {
+    auto interval = calcSplitArrayInterval(
+        this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
+    // setup sub bufs
+    CpuVector subVec(0, nullptr);
+    subVec.subVecFrom(*this, interval);
+    func(subVec);
+  });
+}
+
+template <class T>
+void ParallelCpuVectorT<T>::exec(SyncThreadPool::JobFunc func) {
+  LOG(FATAL) << "Not implemented";
+}
+
+template <>
+void ParallelCpuVectorT<real>::exec(SyncThreadPool::JobFunc func) {
+  pool_->exec(func);
+}
+
+template <class T>
+CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) {
+  if (!useGpu) {
+    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size);
+  } else {
+    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size);
+  }
+  setSync(useGpu);
+}
+
+template <class T>
+CpuGpuVectorT<T>::CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src)
+    : sync_(nullptr) {
+  bool useGpu = src->useGpu();
+  if (useGpu) {
+    gpuVectorT_ = src;
+  } else {
+    cpuVectorT_ = src;
+  }
+  setSync(useGpu);
+}
+
+template <class T>
+CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, T* data, bool useGpu)
+    : sync_(nullptr) {
+  if (!useGpu) {
+    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size, data);
+    setSync(DATA_AT_CPU);
+  } else {
+    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size, data);
+    setSync(DATA_AT_GPU);
+  }
+}
+
+template <class T>
+std::shared_ptr<CpuGpuVectorT<T>> CpuGpuVectorT<T>::create(size_t size,
+                                                           bool useGpu) {
+  return std::make_shared<CpuGpuVectorT<T>>(size, useGpu);
+}
+
+template <class T>
+void CpuGpuVectorT<T>::resize(size_t size, bool useGpu) {
+  if (useGpu) {
+    CHECK(gpuVectorT_) << "gpuVectorT_ is null";
+    // If memoryHandle_ is nullptr,
+    // the data may be owned by the caller when it was constructed.
+    // It should not resize for this case.
+    if (gpuVectorT_->getMemoryHandle()) {
+      gpuVectorT_->resize(size);
+    } else {
+      CHECK_EQ(gpuVectorT_->getSize(), size);
+    }
+  } else {
+    CHECK(cpuVectorT_) << "cpuVectorT_ is null";
+    // If memoryHandle_ is nullptr,
+    // the data may be owned by the caller when it was constructed.
+    // It should not resize for this case.
+    if (cpuVectorT_->getMemoryHandle()) {
+      cpuVectorT_->resize(size);
+    } else {
+      CHECK_EQ(cpuVectorT_->getSize(), size);
+    }
+  }
+  setSync(useGpu);
+}
+
+template <class T>
+void CpuGpuVectorT<T>::resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
+                                      size_t size,
+                                      bool useGpu) {
+  if (vec) {
+    vec->resize(size, useGpu);
+  } else {
+    vec = create(size, useGpu);
+  }
+}
+
+template <class T>
+void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
+  if (useGpu && (!gpuVectorT_)) {
+    gpuVectorT_ = VectorT<T>::create(size, true);
+  } else if ((!useGpu) && (!cpuVectorT_)) {
+    cpuVectorT_ = VectorT<T>::create(size, false);
+  } else {
+    CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
+    this->resize(size, useGpu);
+  }
+}
+
+template <class T>
+CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
+                                size_t offset,
+                                size_t size)
+    : sync_(nullptr) {
+  CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
+#ifdef PADDLE_WITH_CUDA
+  SyncedFlag* flag = src.getSync();
+  if (*flag == DATA_AT_CPU) {
+    src.copyToGpu();  // will set synchronous data between CPU and GPU
+  } else if (*flag == DATA_AT_GPU) {
+    src.copyToCpu();  // will set synchronous data between CPU and GPU
+  }
+#endif
+  auto cMemHandle = (src.getVector(false))->getMemoryHandle();
+  cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
+      size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
+#ifdef PADDLE_WITH_CUDA
+  auto gMemHandle = (src.getVector(true))->getMemoryHandle();
+  gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
+      size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
+  src.setSync(SYNCED);
+#endif
+  setSync(src.getSync());
+}
+
+template <class T>
+std::shared_ptr<const VectorT<T>> CpuGpuVectorT<T>::getVector(
+    bool useGpu) const {
+  auto* self = const_cast<CpuGpuVectorT<T>*>(this);
+  if (useGpu) {
+    self->copyToGpu();
+    return std::const_pointer_cast<const VectorT<T>>(gpuVectorT_);
+  } else {
+    self->copyToCpu();
+    return std::const_pointer_cast<const VectorT<T>>(cpuVectorT_);
+  }
+}
+
+template <class T>
+std::shared_ptr<VectorT<T>>& CpuGpuVectorT<T>::getMutableVector(bool useGpu) {
+  setSync(useGpu);
+  if (useGpu) {
+    copyToGpu();
+    return gpuVectorT_;
+  } else {
+    copyToCpu();
+    return cpuVectorT_;
+  }
+}
+
+template <class T>
+const T* CpuGpuVectorT<T>::getData(bool useGpu) const {
+  auto self = const_cast<CpuGpuVectorT<T>*>(this);
+  if (useGpu) {
+    self->copyToGpu();
+    return gpuVectorT_->getData();
+  } else {
+    self->copyToCpu();
+    return cpuVectorT_->getData();
+  }
+}
+
+// Operation will change data and need to reset sync_ & syncFlag_.
+#define MUTABLE_VECTOR_OP(OP, useGpu, args...) \
+  do {                                         \
+    if (useGpu) {                              \
+      copyToGpu();                             \
+      setSync(useGpu);                         \
+      return gpuVectorT_->OP(args);            \
+    } else {                                   \
+      copyToCpu();                             \
+      setSync(useGpu);                         \
+      return cpuVectorT_->OP(args);            \
+    }                                          \
+  } while (0)
+
+template <class T>
+T* CpuGpuVectorT<T>::getMutableData(bool useGpu) {
+  MUTABLE_VECTOR_OP(getData, useGpu);
+}
+
+template <class T>
+void CpuGpuVectorT<T>::zeroMem(bool useGpu) {
+  MUTABLE_VECTOR_OP(zeroMem, useGpu);
+}
+
+template <class T>
+void CpuGpuVectorT<T>::fillSequence(bool useGpu) {
+  MUTABLE_VECTOR_OP(fillSequence, useGpu);
+}
+
+template <class T>
+void CpuGpuVectorT<T>::setElement(size_t i, const T& value, bool useGpu) {
+  MUTABLE_VECTOR_OP(setElement, useGpu, i, value);
+}
+
+template <class T>
+T CpuGpuVectorT<T>::getElement(size_t i) const {
+  switch (*this->getSync()) {
+    case SYNCED:
+    case DATA_AT_CPU:
+      return cpuVectorT_->getElement(i);
+      break;
+    case DATA_AT_GPU:
+      return gpuVectorT_->getElement(i);
+      break;
+    default:
+      LOG(FATAL) << "Not support";
+      break;
+  }
+}
+
+template <class T>
+void CpuGpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
+  auto cVec = dynamic_cast<const CpuVectorT<T>*>(&src);
+  auto gVec = dynamic_cast<const GpuVectorT<T>*>(&src);
+  if (cVec) {
+    copyToCpu(cVec->getData(), cVec->getSize(), stream);
+  } else if (gVec) {
+    copyToGpu(gVec->getData(), gVec->getSize(), stream);
+  } else {
+    LOG(FATAL) << "Invalid type of src";
+  }
+}
+
+template <class T>
+void CpuGpuVectorT<T>::copyFrom(const T* data, size_t size, bool useGpu) {
+  if (useGpu) {
+    copyToGpu(data, size);
+  } else {
+    copyToCpu(data, size);
+  }
+}
+
+template <class T>
+void CpuGpuVectorT<T>::copyFrom(const T* data,
+                                size_t size,
+                                hl_stream_t stream,
+                                bool useGpu) {
+  if (useGpu) {
+    copyToGpu(data, size, stream);
+  } else {
+    copyToCpu(data, size, stream);
+  }
+}
+
+template <class T>
+void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src,
+                                size_t offset,
+                                size_t size,
+                                bool useGpu,
+                                hl_stream_t stream) {
+  if (useGpu) {
+    VectorT<T>::resizeOrCreate(gpuVectorT_, size, true);
+    gpuVectorT_->copyFrom(src.getData(true) + offset, size, stream);
+  } else {
+    VectorT<T>::resizeOrCreate(cpuVectorT_, size, false);
+    cpuVectorT_->copyFrom(src.getData(false) + offset, size, stream);
+  }
+  setSync(useGpu);
+}
+
+template <class T>
+void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream) {
+  switch (*src.getSync()) {
+    case DATA_AT_CPU:
+      copyFrom(*(src.getVector(false)), stream);
+      break;
+    case DATA_AT_GPU:
+      copyFrom(*(src.getVector(true)), stream);
+      break;
+    case SYNCED:
+      copyFrom(*(src.getVector(false)), stream);
+      copyFrom(*(src.getVector(true)), stream);
+      setSync(SYNCED);
+      break;
+    default:
+      LOG(FATAL) << "Not support";
+      break;
+  }
+}
+
+template <class T>
+void CpuGpuVectorT<T>::copyToCpu() {
+  switch (*this->getSync()) {
+    case DATA_AT_GPU:
+      CHECK(gpuVectorT_);
+      this->resizeOrCreate(gpuVectorT_->getSize(), false);
+      cpuVectorT_->copyFrom(*gpuVectorT_);
+      setSync(SYNCED);
+      break;
+    case DATA_AT_CPU:
+    case SYNCED:
+      CHECK(cpuVectorT_);
+      break;
+    default:
+      LOG(FATAL) << "Not support";
+      break;
+  }
+}
+
+template <class T>
+void CpuGpuVectorT<T>::copyToGpu() {
+  switch (*this->getSync()) {
+    case DATA_AT_CPU:
+      CHECK(cpuVectorT_);
+      this->resizeOrCreate(cpuVectorT_->getSize(), true);
+      gpuVectorT_->copyFrom(*cpuVectorT_);
+      setSync(SYNCED);
+      break;
+    case DATA_AT_GPU:
+    case SYNCED:
+      CHECK(gpuVectorT_);
+      break;
+    default:
+      LOG(FATAL) << "Not support";
+      break;
+  }
+}
+
+template class VectorT<real>;
+template class VectorT<int>;
+template class CpuVectorT<real>;
+template class CpuVectorT<int>;
+template class GpuVectorT<real>;
+template class GpuVectorT<int>;
+template class CpuGpuVectorT<real>;
+template class CpuGpuVectorT<int>;
+
+}  // namespace paddle
diff --git a/paddle/legacy/math/Vector.h b/paddle/legacy/math/Vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..63cb4651c52219807e11e778db9c42667759a055
--- /dev/null
+++ b/paddle/legacy/math/Vector.h
@@ -0,0 +1,726 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cmath>
+#include <memory>
+
+#include <hl_gpu.h>
+
+#include "BaseMatrix.h"
+#include "MemoryHandle.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/Thread.h"
+
+namespace paddle {
+
+template <class T>
+class GpuVectorT;
+template <class T>
+class CpuVectorT;
+
+template <class T>
+class BaseVector;
+
+class SyncThreadPool;
+
+class Matrix;
+
+template <class T>
+class BaseVector : public BaseMatrixT<T> {
+ public:
+  BaseVector(size_t size, T* data, bool useGpu)
+      : BaseMatrixT<T>(1, size, data, false, useGpu), size_(this->width_) {}
+
+  ~BaseVector() {}
+
+ protected:
+  size_t& size_;
+};
+
+/**
+ * Copy or assignemnt constructor will share the data as opposed to making a
+ * copy of the original data. To make a copy of the orinal data, use copyFrom()
+ * instead.
+ */
+template <class T>
+class VectorT : public BaseVector<T> {
+ protected:
+  VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu)
+      : BaseVector<T>(size,
+                      reinterpret_cast<T*>(memoryHandle->getBuf()) + offset,
+                      useGpu) {
+    memoryHandle_ = memoryHandle;
+  }
+
+  // data is still owned by the caller.
+  // data should be valid during the life of this vector.
+  // Caller is responsible for release the memory.
+  VectorT(size_t size, T* data, bool useGpu)
+      : BaseVector<T>(size, data, useGpu) {}
+
+ public:
+  virtual ~VectorT() {}
+
+  static std::shared_ptr<VectorT<T>> create(size_t size, bool useGpu);
+
+  static std::shared_ptr<VectorT<T>> create(T* data, size_t size, bool useGpu);
+
+  static std::shared_ptr<VectorT<T>> create(size_t size,
+                                            MemoryHandlePtr memoryHandle,
+                                            size_t offset = 0);
+
+  // owner can set SyncThreadPool,
+  // if not set, will use globalSyncThreadPool,
+  // which can be used in main thread only.
+  static std::shared_ptr<VectorT<T>> createParallelVector(
+      size_t size, bool useGpu, SyncThreadPool* pool = nullptr);
+
+  size_t getSize() const { return this->size_; }
+  const T* getData() const { return this->data_; }
+  T* getData() { return this->data_; }
+
+  virtual void zeroMem() = 0;
+  // set all elements to value
+  virtual void reset(const T& value) = 0;
+  // fill data by 0, 1, 2, ...
+  virtual void fillSequence() = 0;
+
+  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
+
+  /**
+   * resizing to a big vector will not preserve old values.
+   */
+  void resize(size_t newSize) {
+    if (!memoryHandle_ || newSize * sizeof(T) > memoryHandle_->getAllocSize()) {
+      memoryHandle_ = newMemory(newSize * sizeof(T));
+      this->data_ = reinterpret_cast<T*>(memoryHandle_->getBuf());
+    }
+    this->size_ = newSize;
+  }
+
+  static void resizeOrCreate(std::shared_ptr<VectorT<T>>& vec,
+                             size_t size,
+                             bool useGpu) {
+    if (vec) {
+      vec->resize(size);
+    } else {
+      vec = create(size, useGpu);
+    }
+  }
+
+  virtual MemoryHandlePtr newMemory(size_t size) = 0;
+
+  /**
+   * form sub vector from *src*, shallow copy
+   */
+  void subVecFrom(const VectorT<T>& src, size_t start, size_t size) {
+    CHECK_EQ(BaseVector<T>::useGpu_, src.useGpu_);
+    CHECK_LT(start, src.size_);
+    CHECK_LE(start + size, src.size_);
+
+    BaseVector<T>::size_ = size;
+    BaseVector<T>::data_ = const_cast<T*>(src.data_) + start;
+  }
+
+  std::shared_ptr<VectorT<T>> subVec(size_t start, size_t size) {
+    CHECK_LE(start + size, static_cast<size_t>(getSize()));
+    return VectorT<T>::create(getData() + start, size, BaseVector<T>::useGpu_);
+  }
+
+  /**
+   * form sub vector from *src*, shallow copy
+   */
+  void subVecFrom(const T* src, size_t start, size_t size) {
+    BaseVector<T>::size_ = size;
+    BaseVector<T>::data_ = const_cast<T*>(src) + start;
+  }
+
+  /**
+   * form sub vector from *src*, shallow copy
+   * in *interval* [interval.first, interval.second)
+   */
+  void subVecFrom(const VectorT<T>& src, std::pair<size_t, size_t> interval) {
+    subVecFrom(src, interval.first, interval.second - interval.first);
+  }
+
+  /**
+   * convert the vector to a sparse one_hot matrix of width idRange
+   * only applies to IVector
+   */
+  std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu);
+
+  /**
+   * @brief cast vector of "real" elements to "int" elements.
+   *
+   * @note: float -> int must be casted, or you'll get wrong data.
+   */
+  std::shared_ptr<VectorT<int>> castToInt();
+
+  /**
+   * This function will crash if the size of src and dest is different.
+   */
+  virtual void copyFrom(const VectorT<T>& src) = 0;
+
+  /**
+   * If GpuVector, this function is an asynchronous interface,
+   * will push the copy-task to the specifed-stream and return immediately.
+   *
+   * If CpuVector, this function is an synchronous interface,
+   * same as the copyFrom(const VectorT<T>& src).
+   */
+  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;
+
+  /**
+   * copy size elements from src
+   *
+   * If this is GpuVector, src can be cpu or gpu memory
+   *
+   * If this is CpuVector, src is assumed to be cpu memory
+   */
+  virtual void copyFrom(const T* src, size_t size) = 0;
+
+  /**
+   * copy size elements from src
+   *
+   * If this is GpuVector, src can be cpu or gpu memory
+   *
+   * If this is CpuVector, src is assumed to be cpu memory,
+   */
+  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream) = 0;
+
+  /**
+   * exec a func in single/multi thread
+   */
+  virtual void exec(SyncThreadPool::JobFunc func) { func(0, 1); }
+
+  /// Get the buffer point with beginPos
+  virtual T* getPoint(const uint64_t beginPos) = 0;
+
+  /// Get the value for the i'th element
+  virtual T getElement(size_t i) const = 0;
+  virtual void setElement(size_t i, const T& value) = 0;
+
+  //----------  math operations ----------------
+
+  // sum of the absolute value of each elements
+  virtual T getAbsSum() = 0;
+
+  virtual T getSum() = 0;
+  virtual T getMax() = 0;
+  virtual T getAbsMax() = 0;
+  virtual T getMin() = 0;
+
+  /// element-wise calc:  this = (b == value)
+  virtual void isEqualTo(const VectorT<T>& b, const T& value) = 0;
+
+  /// select elements indexed by *ids* from vector *src*
+  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids) = 0;
+
+  enum HistogramType {
+    HISTOGRAM_EXPONENT = 0,
+  };
+
+  /**
+   * @brief  print histogram of vector values
+   *
+   * @note   only exponent histogram supported currently
+   */
+  virtual void histogram(std::ostream& os, int type = HISTOGRAM_EXPONENT) = 0;
+
+  /// generate uniform random value for each element
+  virtual void rand() = 0;
+  /**
+   * generate uniform random value for each element,
+   * data range is from 0 to (classes - 1).
+   */
+  virtual void rand(size_t classes) = 0;
+
+  /**
+   * Debug use only. Very inefficient for GPU vector.
+   * get the value at pos.
+   */
+  virtual T get(size_t pos) = 0;
+
+  /**
+   * generate univariate Gaussian distributed random numbers
+   * with given mean and standardDeviation.
+   */
+  virtual void randnorm(real mean, real standardDeviation) = 0;
+
+  /**
+   * generate uniform distributed random numbers
+   * with given range.
+   */
+  virtual void uniform(real left, real right) = 0;
+
+  /// print the first "num" elements of the Vector
+  virtual void print(std::ostream& os, size_t num) const = 0;
+
+  /// print the "idx" element of the Vector
+  virtual void printOneElement(std::ostream& os, size_t idx) const = 0;
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    if (BaseVector<T>::useGpu_) {
+      TensorGpuApply<T>(*this, expr);
+    } else {
+      TensorCpuApply<T>(*this, expr);
+    }
+  }
+
+ protected:
+  friend class GpuVectorT<T>;
+  friend class CpuVectorT<T>;
+  virtual void copyTo(CpuVectorT<T>* dest) const = 0;
+  virtual void copyTo(GpuVectorT<T>* dest) const = 0;
+  MemoryHandlePtr memoryHandle_;
+};
+
+template <class T>
+std::ostream& operator<<(std::ostream& os, const VectorT<T>& vec) {
+  vec.print(os, vec.getSize());
+  return os;
+}
+
+template <class T>
+class GpuVectorT : public VectorT<T> {
+ public:
+  explicit GpuVectorT(size_t size);
+  GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset)
+      : VectorT<T>(size, memHandle, offset, true) {}
+
+  // data is still owned by the caller.
+  // data should be valid during the life of this vector.
+  // Caller is responsible for release the memory.
+  GpuVectorT(size_t size, T* data) : VectorT<T>(size, data, true) {}
+
+  virtual MemoryHandlePtr newMemory(size_t size) {
+    return std::make_shared<GpuMemoryHandle>(size);
+  }
+  virtual void zeroMem();
+  virtual void reset(const T& value);
+  virtual void fillSequence();
+
+  virtual void copyFrom(const T* src, size_t size);
+  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream);
+  virtual void copyFrom(const VectorT<T>& src);
+  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream);
+  virtual T getElement(size_t i) const;
+  virtual void setElement(size_t i, const T& value);
+  virtual T* getPoint(const uint64_t beginPos);
+
+  virtual T getAbsSum();
+  virtual T getSum();
+  virtual T getMax();
+  virtual T getAbsMax();
+  virtual T getMin();
+  virtual void isEqualTo(const VectorT<T>& b, const T& value);
+  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids);
+  virtual void histogram(std::ostream& os, int type);
+  virtual void rand();
+  virtual void rand(size_t classes);
+  virtual void randnorm(real mean, real standardDeviation);
+  virtual void uniform(real left, real right);
+  virtual T get(size_t pos);
+  virtual void print(std::ostream& os, size_t num) const;
+  virtual void printOneElement(std::ostream& os, size_t idx) const;
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorGpuApply<T>(*this, expr);
+  }
+
+ protected:
+  virtual void copyTo(CpuVectorT<T>* dest) const;
+  virtual void copyTo(GpuVectorT<T>* dest) const;
+};
+
+template <class T>
+class CpuVectorT : public VectorT<T> {
+ public:
+  explicit CpuVectorT(size_t size);
+  CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset)
+      : VectorT<T>(size, memoryHandle, offset, false) {}
+
+  // data is still owned by the caller.
+  // data should be valid during the life of this vector.
+  // Caller is responsible for release the memory.
+  CpuVectorT(size_t size, T* data) : VectorT<T>(size, data, false) {}
+
+  /**
+   * If src is a CpuVector, the new CpuVector will share the data with src
+   *
+   * If src is a GpuVector, the new CpuVector will copy data from src
+   */
+  explicit CpuVectorT(const VectorT<T>& src);
+
+  virtual MemoryHandlePtr newMemory(size_t size) {
+    return std::make_shared<CpuMemoryHandle>(size);
+  }
+
+  virtual void zeroMem();
+  virtual void reset(const T& value);
+  virtual void fillSequence();
+  virtual void copyFrom(const T* src, size_t size);
+  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream);
+  virtual void copyFrom(const VectorT<T>& src);
+  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream);
+  virtual void copyTo(CpuVectorT<T>* dest) const;
+  virtual void copyTo(GpuVectorT<T>* dest) const;
+
+  /// Get the buffer point with beginPos
+  virtual T* getPoint(const uint64_t beginPos) {
+    return this->getData() + beginPos;
+  }
+
+  virtual T getElement(size_t i) const { return this->getData()[i]; }
+  virtual void setElement(size_t i, const T& value) {
+    this->getData()[i] = value;
+  }
+
+  virtual T getAbsSum();
+  virtual T getSum();
+  virtual T getMax();
+  virtual T getAbsMax();
+  virtual T getMin();
+  virtual void isEqualTo(const VectorT<T>& b, const T& value);
+  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids);
+  virtual void histogram(std::ostream& os, int type);
+  virtual void rand();
+  virtual void rand(size_t classes);
+  virtual void randnorm(real mean, real standardDeviation);
+  virtual void uniform(real left, real right);
+  virtual T get(size_t pos);
+  virtual void print(std::ostream& os, size_t num) const;
+  virtual void printOneElement(std::ostream& os, size_t idx) const;
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorCpuApply<T>(*this, expr);
+  }
+};
+
+template <class T>
+class ParallelCpuVectorT : public CpuVectorT<T> {
+ public:
+  ParallelCpuVectorT(size_t size, SyncThreadPool* pool)
+      : CpuVectorT<T>(size), pool_(pool) {}
+
+  virtual void zeroMem() {
+    parallelExec([](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::zeroMem(); });
+  }
+  virtual void randnorm(real mean, real standardDeviation) {
+    parallelExec([=](CpuVectorT<T>& vec) {
+      vec.CpuVectorT<T>::randnorm(mean, standardDeviation);
+    });
+  }
+  virtual void uniform(real left, real right) {
+    parallelExec(
+        [=](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::uniform(left, right); });
+  }
+
+  virtual void exec(SyncThreadPool::JobFunc jobFunc);
+
+ private:
+  typedef std::function<void(CpuVectorT<T>& vec)> ExecFunc;
+  void parallelExec(ExecFunc func);
+  SyncThreadPool* pool_;
+};
+
+/**
+ * A class to do conversion between CpuVector and GpuVector automatically.
+ */
+template <class T>
+class CpuGpuVectorT {
+ public:
+  /**
+   * @brief An enum type of SyncedFlag using to
+   *        mark data memory is in CPU or GPU.
+   *
+   * DATA_AT_CPU: data is located in CPU.
+   *
+   * DATA_AT_GPU: data is located in GPU.
+   *
+   * SYNCED: data is located in CPU and GPU simultaneously.
+   */
+  enum SyncedFlag { DATA_AT_CPU = 0, DATA_AT_GPU = 1, SYNCED = 2 };
+
+  /**
+   * @brief A constructor, create cpuVectorT_ or gpuVectorT_.
+   *
+   * @param[in] size    data size.
+   * @param[in] useGpu  use gpu or not.
+   */
+  explicit CpuGpuVectorT(size_t size, bool useGpu);
+
+  /**
+   * @brief A constructor, create CpuGpuVectorT by VectorT.
+   *
+   * If src is CpuVector, cpuVectorT_ is shared data with src.
+   *
+   * If src is GpuVector, gpuVectorT_ is shared data with src.
+   */
+  explicit CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src);
+
+  /**
+   * @brief A constructor.
+   *
+   * If useGpu is true, data should be located in device and
+   * create gpuVectorT_ with data.
+   *
+   * If useGpu is false, data should be located in host and
+   * create cpuVectorT_ with data.
+   *
+   * @note Data is owned by the caller and should be valid during
+   *       the life of this vector.
+   *       Caller is responsible for release the memory.
+   */
+  CpuGpuVectorT(size_t size, T* data, bool useGpu);
+
+  CpuGpuVectorT(CpuGpuVectorT<T>& src, size_t offset, size_t size);
+
+  virtual ~CpuGpuVectorT() {}
+
+  static std::shared_ptr<CpuGpuVectorT<T>> create(size_t size, bool useGpu);
+
+  /**
+   * @brief resize vector.
+   *
+   * If useGpu is true, resize gpuVectorT_ and set syncFlag_ to DATA_AT_GPU,
+   *
+   * otherwise resize cpuVectorT_ and set syncFlag_ to DATA_AT_CPU.
+   */
+  void resize(size_t size, bool useGpu);
+
+  /**
+   * @brief resize or create CpuGpuVectorT.
+   */
+  static void resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
+                             size_t size,
+                             bool useGpu);
+
+  /**
+   * @brief return a const cpuVectorT_ or gpuVectorT_.
+   *
+   * If useGpu is true, return gpuVectorT_.
+   *
+   * If useGpu is false, return cpuVectorT_.
+   *
+   * @note Caller should not change the data.
+   *       If caller changes const attribute,
+   *       should set syncFlag_.
+   */
+  std::shared_ptr<const VectorT<T>> getVector(bool useGpu) const;
+
+  /**
+   * @brief return a const cpuVectorT_ or gpuVectorT_.
+   *
+   * @note: This interface will change syncFlag_, so if you will
+   *        not change the data, you should call getVector.
+   */
+  std::shared_ptr<VectorT<T>>& getMutableVector(bool useGpu);
+
+  /**
+   * @brief return const T* data.
+   *
+   * If useGpu is true, return device data.
+   *
+   * If useGpu is false, return host data.
+   */
+  const T* getData(bool useGpu) const;
+
+  // TODO(yuyang18): Make getData more c++ style.
+  //  inline T* getData(bool useGpu) {
+  //    return getMutableData(useGpu);
+  //  }
+
+  T* getMutableData(bool useGpu);
+
+  /**
+   * If useGpu is true, gpuVectorT_->Op().
+   *
+   * If useGpu is false, cpuVectorT_->Op().
+   *
+   * Op is zeroMem, fillSequence, ...
+   */
+  void zeroMem(bool useGpu);
+  void fillSequence(bool useGpu);
+  void setElement(size_t i, const T& value, bool useGpu);
+
+  /**
+   * @brief return i-th element.
+   */
+  T getElement(size_t i) const;
+
+  /**
+   * @brief return vector size.
+   */
+  size_t getSize() const {
+    size_t size = 0;
+    switch (*sync_) {
+      case SYNCED:
+      case DATA_AT_CPU:
+        size = cpuVectorT_->getSize();
+        break;
+      case DATA_AT_GPU:
+        size = gpuVectorT_->getSize();
+        break;
+      default:
+        LOG(FATAL) << "Not support";
+        break;
+    }
+    return size;
+  }
+
+  /// copy data to cpuVectorT_.
+  inline void copyToCpu(const T* data, size_t size) {
+    this->resizeOrCreate(size, false);
+    cpuVectorT_->copyFrom(data, size);
+    setSync(DATA_AT_CPU);
+  }
+  /// copy data to cpuVectorT_ using specifed-stream.
+  inline void copyToCpu(const T* data, size_t size, hl_stream_t stream) {
+    this->resizeOrCreate(size, false);
+    cpuVectorT_->copyFrom(data, size, stream);
+    setSync(DATA_AT_CPU);
+  }
+
+  /// copy data to gpuVectorT_.
+  inline void copyToGpu(const T* data, size_t size) {
+    this->resizeOrCreate(size, true);
+    gpuVectorT_->copyFrom(data, size);
+    setSync(DATA_AT_GPU);
+  }
+  /// copy data to gpuVectorT_ using specifed-stream.
+  inline void copyToGpu(const T* data, size_t size, hl_stream_t stream) {
+    this->resizeOrCreate(size, true);
+    gpuVectorT_->copyFrom(data, size, stream);
+    setSync(DATA_AT_GPU);
+  }
+
+  /**
+   * @brief copy from src using specifed-stream.
+   *
+   * If src is CpuVectorT, copy to cpuVectorT_.
+   *
+   * If src is GpuVectorT, copy to gpuVectorT_.
+   */
+  void copyFrom(const VectorT<T>& src, hl_stream_t stream);
+
+  /**
+   * @brief copy data.
+   *
+   * If useGpu is false, copy host data to cpuVectorT_.
+   *
+   * If useGpu is true, copy device data to gpuVectorT_.
+   *
+   * @note  data address should consistent with useGpu.
+   */
+  void copyFrom(const T* data, size_t size, bool useGpu);
+  void copyFrom(const T* data, size_t size, hl_stream_t stream, bool useGpu);
+
+  /**
+   * @brief copy from (src + offset) using specifed-stream.
+   */
+  void copyFrom(CpuGpuVectorT<T>& src,
+                size_t offset,
+                size_t size,
+                bool useGpu,
+                hl_stream_t stream);
+
+  /**
+   * @brief copy from src using specifed-stream.
+   */
+  void copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream);
+
+  /**
+   * @brief return sync_.
+   */
+  inline SyncedFlag* getSync() const { return sync_; }
+
+  /**
+   * @brief set sync_.
+   */
+  inline void setSync(SyncedFlag* sync) { sync_ = sync; }
+
+  inline void setSync(SyncedFlag syncFlag) {
+    if (sync_) {
+      *sync_ = syncFlag;
+    } else {
+      syncFlag_ = syncFlag;
+      sync_ = &syncFlag_;
+    }
+  }
+
+  inline void setSync(bool useGpu) {
+    SyncedFlag flag = useGpu ? DATA_AT_GPU : DATA_AT_CPU;
+    setSync(flag);
+  }
+
+ protected:
+  void resizeOrCreate(size_t size, bool useGpu);
+
+  /**
+   * @brief copy between cpuVectorT_ and gpuVectorT_.
+   *
+   * If syncFlag_ is DATA_AT_CPU and SYNCED, do nothing.
+   *
+   * If syncFlag_ is DATA_AT_GPU, copy gpuVectorT_ to cpuVectorT_
+   *   and set syncFlag_ to SYNCED.
+   */
+  void copyToCpu();
+
+  /**
+   * @brief copy between cpuVectorT_ and gpuVectorT_.
+   *
+   * If syncFlag_ is DATA_AT_GPU and SYNCED, do nothing.
+   *
+   * If syncFlag_ is DATA_AT_CPU, copy cpuVectorT_ to gpuVectorT_
+   *   and set syncFlag_ to SYNCED.
+   */
+  void copyToGpu();
+
+  /// host pointer.
+  std::shared_ptr<VectorT<T>> cpuVectorT_;
+  /// device pointer.
+  std::shared_ptr<VectorT<T>> gpuVectorT_;
+  /// specify current data address.
+  SyncedFlag syncFlag_;
+  SyncedFlag* sync_;
+};
+
+typedef VectorT<real> Vector;
+typedef CpuVectorT<real> CpuVector;
+typedef GpuVectorT<real> GpuVector;
+
+typedef VectorT<int> IVector;
+typedef CpuVectorT<int> CpuIVector;
+typedef GpuVectorT<int> GpuIVector;
+
+typedef std::shared_ptr<Vector> VectorPtr;
+typedef std::shared_ptr<CpuVector> CpuVectorPtr;
+typedef std::shared_ptr<GpuVector> GpuVectorPtr;
+
+typedef std::shared_ptr<IVector> IVectorPtr;
+typedef std::shared_ptr<CpuIVector> CpuIVectorPtr;
+typedef std::shared_ptr<GpuIVector> GpuIVectorPtr;
+
+typedef CpuGpuVectorT<real> CpuGpuVector;
+typedef CpuGpuVectorT<int> ICpuGpuVector;
+typedef std::shared_ptr<CpuGpuVector> CpuGpuVectorPtr;
+typedef std::shared_ptr<ICpuGpuVector> ICpuGpuVectorPtr;
+
+}  // namespace paddle
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/legacy/math/tests/CMakeLists.txt
similarity index 100%
rename from paddle/math/tests/CMakeLists.txt
rename to paddle/legacy/math/tests/CMakeLists.txt
diff --git a/paddle/legacy/math/tests/OriginalOptimizerApi.h b/paddle/legacy/math/tests/OriginalOptimizerApi.h
new file mode 100644
index 0000000000000000000000000000000000000000..f386e19958a21214151776e6d0ae7bb2a4530b6c
--- /dev/null
+++ b/paddle/legacy/math/tests/OriginalOptimizerApi.h
@@ -0,0 +1,201 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+
+using namespace paddle;  // NOLINT
+
+void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
+                                      real alpha,
+                                      real beta,
+                                      real gamma,
+                                      real tau,
+                                      real learningRate) {
+  vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
+                                   -alpha * gamma * learningRate);
+  vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
+                                   tau * alpha * gamma * learningRate);
+  vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
+                             tau / beta + 1.0 / alpha,
+                             *vecs[PARAMETER_MOMENTUM_VT],
+                             1.0 / beta);
+}
+
+void AdagradParameterOptimizer(const VectorPtr vecs[],
+                               real epsilon,
+                               real learningRate,
+                               real momentum,
+                               real decayRate) {
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
+                                                1.0f);
+  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
+                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
+  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
+                                real rou,
+                                real epsilon,
+                                real learningRate,
+                                real momentum,
+                                real decayRate) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
+
+  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
+  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
+                                        epsilon,
+                                        epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->sqrt2();
+
+  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
+      *vecs[PARAMETER_GRADIENT],
+      *vecs[PARAMETER_LEARNING_RATE],
+      rou,
+      1.0f - rou);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void RMSPropParameterOptimizer(const VectorPtr vecs[],
+                               real accumulatedRou,
+                               real rou,
+                               real epsilon,
+                               real learningRate,
+                               real momentum,
+                               real decayRate,
+                               bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
+
+  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
+  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
+
+  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
+  // Basiclly if the sign of the gradient changes more often,
+  // the learning rate will be decreased.
+  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
+                                           -1.0f);
+  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
+                                      real accumulatedRou,
+                                      real rou,
+                                      real epsilon,
+                                      real learningRate,
+                                      real momentum,
+                                      real decayRate,
+                                      bool firstTime) {
+  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
+  // For the first time update, make the sum be the current square
+  // so that the initial estimation of E(g_t^2) will not be too small.
+  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
+      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
+
+  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
+  // Basiclly if the bigger the magnitude gradient is,
+  // the smaller the learning rate will be.
+  vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
+  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
+  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
+
+  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                   *vecs[PARAMETER_MOMENTUM],
+                                   *vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate,
+                                   momentum,
+                                   decayRate);
+}
+
+void AdamParameterOptimizer(const VectorPtr vecs[],
+                            real beta1,
+                            real beta2,
+                            real beta1_power,
+                            real beta2_power,
+                            real epsilon,
+                            real learningRate) {
+  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  Vector* g = vecs[PARAMETER_GRADIENT].get();
+  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
+  Vector* theta = vecs[PARAMETER_VALUE].get();
+
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  m->add(*g, beta1, 1 - beta1);
+
+  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
+  g->square2();
+  v->add(*g, beta2, 1 - beta2);
+
+  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
+  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
+  g->sqrt2(*v);
+  g->dotDiv(*m, *g, 0., epsilon);
+  real alpha =
+      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+  theta->add(*theta, 1.0, *g, -alpha);
+}
+
+void AdamaxParameterOptimizer(
+    const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
+  Vector* m = vecs[PARAMETER_MOMENTUM].get();
+  Vector* g = vecs[PARAMETER_GRADIENT].get();
+  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
+  Vector* theta = vecs[PARAMETER_VALUE].get();
+
+  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
+  m->add(*g, beta1, 1 - beta1);
+
+  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
+  u->mulScalar(beta2);
+  g->abs2();
+  u->max2(*u, *g);
+
+  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
+  g->dotDiv(*m, *u);
+  real learningRate = alpha / (1 - std::pow(beta1, step));
+  theta->add(*theta, 1.0, *g, -learningRate);
+}
diff --git a/paddle/legacy/math/tests/PerfUtils.h b/paddle/legacy/math/tests/PerfUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..eaf4869e4c994e5ec739fe650d0228687d24853f
--- /dev/null
+++ b/paddle/legacy/math/tests/PerfUtils.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// Performance Check
+#ifdef PADDLE_DISABLE_TIMER
+
+#define EXPRESSION_PERFORMANCE(expression) expression;
+
+#else
+
+#include "paddle/legacy/utils/Stat.h"
+using namespace paddle;  // NOLINT
+
+#define EXPRESSION_PERFORMANCE(expression)                             \
+  do {                                                                 \
+    char expr[30];                                                     \
+    strncpy(expr, #expression, 30);                                    \
+    if (expr[29] != '\0') {                                            \
+      expr[27] = '.';                                                  \
+      expr[28] = '.';                                                  \
+      expr[29] = '\0';                                                 \
+    }                                                                  \
+    expression;                                                        \
+    for (int i = 0; i < 20; i++) {                                     \
+      REGISTER_TIMER(expr);                                            \
+      expression;                                                      \
+    }                                                                  \
+    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
+              << *globalStat.getStat(expr);                            \
+    globalStat.reset();                                                \
+  } while (0)
+
+#endif
diff --git a/paddle/legacy/math/tests/TensorCheck.h b/paddle/legacy/math/tests/TensorCheck.h
new file mode 100644
index 0000000000000000000000000000000000000000..41c8ece282e05f55d063e6ad0d8805629c847d34
--- /dev/null
+++ b/paddle/legacy/math/tests/TensorCheck.h
@@ -0,0 +1,216 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+/**
+ * This file provides a TensorCheck template function, which can be used to
+ * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on.
+ */
+
+#include <cmath>
+#include "paddle/legacy/math/Matrix.h"
+
+namespace autotest {
+
+using paddle::Matrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using paddle::VectorT;
+using paddle::CpuVectorT;
+using paddle::GpuVectorT;
+
+class AssertEqual {
+ public:
+  AssertEqual(real err = 0) : err_(err) {}
+
+  inline bool operator()(real a, real b) {
+    if (err_ == 0) {
+      if (a != b) {
+        return false;
+      }
+    } else {
+      if (std::fabs(a - b) > err_) {
+        if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) {
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+ private:
+  real err_;
+};
+
+template <typename Tensor>
+class CopyToCpu;
+
+template <>
+class CopyToCpu<CpuMatrix> {
+ public:
+  explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {}
+  const CpuMatrix& copiedArg() const { return arg_; }
+
+ private:
+  const CpuMatrix& arg_;
+};
+
+template <>
+class CopyToCpu<GpuMatrix> {
+ public:
+  explicit CopyToCpu(const GpuMatrix& arg)
+      : arg_(arg.getHeight(), arg.getWidth()) {
+    arg_.copyFrom(arg);
+  }
+  CpuMatrix& copiedArg() { return arg_; }
+
+ private:
+  CpuMatrix arg_;
+};
+
+template <>
+class CopyToCpu<Matrix> {
+ public:
+  explicit CopyToCpu(const Matrix& arg)
+      : arg_(arg.getHeight(), arg.getWidth()) {
+    arg_.copyFrom(arg);
+  }
+  CpuMatrix& copiedArg() { return arg_; }
+
+ private:
+  CpuMatrix arg_;
+};
+
+template <typename T>
+class CopyToCpu<CpuVectorT<T>> {
+ public:
+  explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
+  const CpuVectorT<T>& copiedArg() const { return arg_; }
+
+ private:
+  const CpuVectorT<T>& arg_;
+};
+
+template <typename T>
+class CopyToCpu<GpuVectorT<T>> {
+ public:
+  explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
+    arg_.copyFrom(arg);
+  }
+  CpuVectorT<T>& copiedArg() { return arg_; }
+
+ private:
+  CpuVectorT<T> arg_;
+};
+
+template <typename T>
+class CopyToCpu<VectorT<T>> {
+ public:
+  explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
+    arg_.copyFrom(arg);
+  }
+  CpuVectorT<T>& copiedArg() { return arg_; }
+
+ private:
+  CpuVectorT<T> arg_;
+};
+
+template <typename AssertEq>
+void TensorCheck(AssertEq compare,
+                 const CpuMatrix& matrix1,
+                 const CpuMatrix& matrix2) {
+  CHECK(matrix1.getHeight() == matrix2.getHeight());
+  CHECK(matrix1.getWidth() == matrix2.getWidth());
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = matrix1.getData();
+  const real* data2 = matrix2.getData();
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      real a = data1[i * width + j];
+      real b = data2[i * width + j];
+      if (!compare(a, b)) {
+        count++;
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
+template <typename AssertEq, class T>
+void TensorCheck(AssertEq compare,
+                 const CpuVectorT<T>& vector1,
+                 const CpuVectorT<T>& vector2) {
+  CHECK(vector1.getSize() == vector2.getSize());
+
+  const T* data1 = vector1.getData();
+  const T* data2 = vector2.getData();
+  size_t size = vector1.getSize();
+  int count = 0;
+  for (size_t i = 0; i < size; i++) {
+    real a = data1[i];
+    real b = data2[i];
+    if (!compare(a, b)) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+}
+
+template <typename AssertEq, typename Tensor1, typename Tensor2>
+void TensorCheck(AssertEq compare,
+                 const Tensor1& tensor1,
+                 const Tensor2& tensor2) {
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
+template <typename AssertEq>
+void TensorCheck(AssertEq compare, real args1, real args2) {
+  EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1
+                                         << ", args2 = " << args2;
+}
+
+template <typename AssertEq>
+void TensorCheck(AssertEq compare, size_t args1, size_t args2) {
+  EXPECT_EQ(args1, args2) << "[Test error] args1 = " << args1
+                          << ", args2 = " << args2;
+}
+
+template <typename Tensor1, typename Tensor2>
+void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) {
+  AssertEqual compare(0);
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
+template <typename Tensor1, typename Tensor2>
+void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) {
+#ifndef PADDLE_TYPE_DOUBLE
+  AssertEqual compare(1e-3);
+#else
+  AssertEqual compare(1e-10);
+#endif
+  TensorCheck(compare,
+              CopyToCpu<Tensor1>(tensor1).copiedArg(),
+              CopyToCpu<Tensor2>(tensor2).copiedArg());
+}
+
+}  // namespace autotest
diff --git a/paddle/legacy/math/tests/TestUtils.h b/paddle/legacy/math/tests/TestUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..60e76359da61ac32346b093d9a9ff69104bfc494
--- /dev/null
+++ b/paddle/legacy/math/tests/TestUtils.h
@@ -0,0 +1,294 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+/**
+ * This file provides a AutoCompare calss to simplify the comparison
+ * of CPU and GPU member functions.
+ *
+ * This takes two steps
+ * 1. Construct an AutoCompare object.
+ *    When constructing an AutoCompare object, you can set the err argument
+ * to specify the maximum error for CPU and GPU functions.
+ *
+ * 2. Use the template functions cmpWithArg or cmpWithoutArg.
+ * A. [cmpWithArg] Requires the caller construct the cpu arguments.
+ *
+ *  AutoCompare test;
+ *  Init Argument arg1,arg2...
+ *  test.cmpWithArg(function, arg1, arg2....)
+ *
+ * B. [cmpWithoutArg] The caller do not need construct arguments.
+ *    If matrix used in these functions arguments is the same size.
+ *    Such as the element wise function and the aggregate function
+ *    defined in the BaseMatrix.cpp.
+ *
+ *  AutoCompare test;
+ *  test.cmpWithoutArg<I...>(function, height, width)
+ */
+
+#include <gtest/gtest.h>
+#include "TensorCheck.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+
+namespace autotest {
+
+using paddle::BaseMatrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using paddle::CpuIVector;
+using paddle::GpuIVector;
+using paddle::CpuSparseMatrix;
+using paddle::GpuSparseMatrix;
+
+template <typename T1, typename T2>
+class ReplaceType {
+ public:
+  typedef T1 type;
+};
+
+template <>
+class ReplaceType<BaseMatrix, CpuMatrix> {
+ public:
+  typedef CpuMatrix type;
+};
+
+template <>
+class ReplaceType<BaseMatrix, GpuMatrix> {
+ public:
+  typedef GpuMatrix type;
+};
+
+template <>
+class ReplaceType<Matrix, CpuMatrix> {
+ public:
+  typedef CpuMatrix type;
+};
+
+template <>
+class ReplaceType<Matrix, GpuMatrix> {
+ public:
+  typedef GpuMatrix type;
+};
+
+// construct a argument
+template <typename T>
+T construct(int height, int width);
+
+template <>
+float construct(int height, int width) {
+  return 0.5;
+}
+
+template <>
+double construct(int height, int width) {
+  return 0.5;
+}
+
+template <>
+size_t construct(int height, int width) {
+  size_t offset = std::rand() % (height < width ? height : width);
+  return offset;
+}
+
+template <>
+CpuMatrix construct(int height, int width) {
+  CpuMatrix a(height, width);
+  return a;
+}
+
+template <>
+GpuMatrix construct(int height, int width) {
+  GpuMatrix a(height, width);
+  return a;
+}
+
+// init a argument
+template <typename T>
+void init(T& v) {
+  return;
+}
+
+template <>
+void init(CpuMatrix& v) {
+  v.randomizeUniform();
+}
+
+template <>
+void init(GpuMatrix& v) {
+  v.randomizeUniform();
+}
+
+// init a tuple which contains a set of arguments.
+template <std::size_t I = 0, typename... Args>
+inline typename std::enable_if<I == sizeof...(Args), void>::type initTuple(
+    std::tuple<Args...>& t) {}
+
+template <std::size_t I = 0, typename... Args>
+    inline typename std::enable_if <
+    I<sizeof...(Args), void>::type initTuple(std::tuple<Args...>& t) {
+  init(std::get<I>(t));
+  initTuple<I + 1>(t);
+}
+
+// copy a argument, copy src to dest
+template <typename T1, typename T2>
+void copy(T1& dest, T2& src) {
+  dest = src;
+}
+
+template <>
+void copy(GpuMatrix& dest, CpuMatrix& src) {
+  dest.copyFrom(src);
+}
+
+// copy a tuple, copy src to dest
+template <std::size_t I = 0, typename... Args1, typename... Args2>
+inline typename std::enable_if<I == sizeof...(Args1), void>::type copyTuple(
+    std::tuple<Args1...>& dest, std::tuple<Args2...>& src) {}
+
+template <std::size_t I = 0, typename... Args1, typename... Args2>
+    inline typename std::enable_if <
+    I<sizeof...(Args1), void>::type copyTuple(std::tuple<Args1...>& dest,
+                                              std::tuple<Args2...>& src) {
+  copy(std::get<I>(dest), std::get<I>(src));
+  copyTuple<I + 1>(dest, src);
+}
+
+// call member function
+template <typename C,
+          typename FC,
+          typename R,
+          typename... FArgs,
+          typename... Args>
+R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) {
+  return (obj.*f)(args...);
+}
+
+template <typename T>
+class ReturnType {
+ public:
+  typedef T type;
+};
+
+template <>
+class ReturnType<CpuMatrix> {
+ public:
+  typedef GpuMatrix type;
+};
+
+template <>
+class ReturnType<CpuIVector> {
+ public:
+  typedef GpuIVector type;
+};
+
+template <>
+class ReturnType<CpuSparseMatrix> {
+ public:
+  typedef GpuSparseMatrix type;
+};
+
+template <typename T>
+typename ReturnType<T>::type autoArgs(T& v) {
+  return v;
+}
+
+template <>
+GpuMatrix autoArgs(CpuMatrix& v) {
+  GpuMatrix a(v.getHeight(), v.getWidth());
+  a.copyFrom(v);
+  return a;
+}
+
+template <>
+GpuIVector autoArgs(CpuIVector& v) {
+  GpuIVector a(v.getSize());
+  a.copyFrom(v);
+  return a;
+}
+
+template <>
+GpuSparseMatrix autoArgs(CpuSparseMatrix& v) {
+  GpuSparseMatrix a(v.getHeight(),
+                    v.getWidth(),
+                    v.getElementCnt(),
+                    v.getValueType(),
+                    v.getFormat());
+  a.copyFrom(v, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return a;
+}
+
+class AutoCompare {
+ public:
+  /**
+   * err is the allowed calculation error.
+   * The smaller the value of err,
+   * the stricter the comparison is between CPU and GPU calculations.
+   */
+  AutoCompare(size_t height, size_t width, real err = 1e-3)
+      : cpu(height, width), gpu(height, width), compare(err) {
+    init(cpu);
+    copy(gpu, cpu);
+  }
+
+  template <typename C, typename R, typename... FArgs, typename... Args>
+  void cmpWithArg(R (C::*f)(FArgs...), Args&&... args) {
+    static_assert(sizeof...(FArgs) == sizeof...(Args),
+                  "size of parameter packs are not equal");
+    call(cpu, f, args...);
+    call(gpu, f, autoArgs(args)...);
+
+    TensorCheck(compare, cpu, gpu);
+  }
+
+  template <std::size_t... I, typename C, typename R, typename... Args>
+  void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) {
+    static_assert(sizeof...(I) == sizeof...(Args),
+                  "size of parameter packs are not equal");
+    (void)height;
+    (void)width;
+    auto tuple1 = std::make_tuple(
+        construct<typename ReplaceType<
+            typename std::decay<
+                typename std::tuple_element<I,
+                                            std::tuple<Args...>>::type>::type,
+            CpuMatrix>::type>(height, width)...);
+
+    auto tuple2 = std::make_tuple(
+        construct<typename ReplaceType<
+            typename std::decay<
+                typename std::tuple_element<I,
+                                            std::tuple<Args...>>::type>::type,
+            GpuMatrix>::type>(height, width)...);
+
+    initTuple(tuple1);
+    copyTuple(tuple2, tuple1);
+
+    call(cpu, f, std::get<I>(tuple1)...);
+    call(gpu, f, std::get<I>(tuple2)...);
+
+    TensorCheck(compare, cpu, gpu);
+  }
+
+ protected:
+  CpuMatrix cpu;
+  GpuMatrix gpu;
+  AssertEqual compare;
+};
+
+}  // namespace autotest
diff --git a/paddle/legacy/math/tests/test_Allocator.cpp b/paddle/legacy/math/tests/test_Allocator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..122be9082a8db33caf55661091caad115f575099
--- /dev/null
+++ b/paddle/legacy/math/tests/test_Allocator.cpp
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Util.h"
+#define private public
+#include "paddle/legacy/math/Allocator.h"
+#include "paddle/legacy/math/MemoryHandle.h"
+#include "paddle/legacy/math/PoolAllocator.h"
+
+using namespace paddle;  // NOLINT
+
+template <typename Allocator>
+void testPoolAllocator() {
+  PoolAllocator* pool =
+      new PoolAllocator(new Allocator(), /* sizeLimit */ 1024);
+
+  /* alloc from system memory */
+  void* ptr1 = pool->alloc(10);
+  void* ptr2 = pool->alloc(200);
+  void* ptr3 = pool->alloc(200);
+  pool->free(ptr1, 10);
+  pool->free(ptr2, 200);
+  pool->free(ptr3, 200);
+  pool->printAll();
+  EXPECT_EQ((size_t)2, pool->pool_.size());
+  EXPECT_EQ((size_t)1, pool->pool_[10].size());
+  EXPECT_EQ((size_t)2, pool->pool_[200].size());
+  EXPECT_EQ(ptr1, pool->pool_[10][0]);
+  EXPECT_EQ(ptr2, pool->pool_[200][0]);
+  EXPECT_EQ(ptr3, pool->pool_[200][1]);
+
+  /* alloc from pool */
+  void* ptr4 = pool->alloc(10);
+  void* ptr5 = pool->alloc(200);
+  pool->printAll();
+  EXPECT_EQ((size_t)0, pool->pool_[10].size());
+  EXPECT_EQ((size_t)1, pool->pool_[200].size());
+  EXPECT_EQ(ptr1, ptr4);
+  EXPECT_EQ(ptr3, ptr5);
+  pool->free(ptr4, 10);
+  pool->free(ptr5, 200);
+
+  /* alloc size > sizeLimit */
+  void* ptr6 = pool->alloc(1024);
+  pool->free(ptr6, 1024);
+  EXPECT_LE((size_t)1024, pool->poolMemorySize_);
+
+  void* ptr7 = pool->alloc(1);
+  EXPECT_EQ((size_t)0, pool->poolMemorySize_);
+  EXPECT_EQ((size_t)0, pool->pool_.size());
+  pool->free(ptr7, 1);
+
+  delete pool;
+}
+
+TEST(Allocator, Pool) {
+  testPoolAllocator<CpuAllocator>();
+#ifdef PADDLE_WITH_CUDA
+  testPoolAllocator<GpuAllocator>();
+#endif
+}
+
+TEST(MemoryHandle, Cpu) {
+  for (auto size : {10, 30, 50, 100, 200, 512, 1000, 1023, 1024, 1025, 8193}) {
+    CpuMemoryHandle handle(size);
+    EXPECT_LE(handle.getSize(), handle.getAllocSize());
+  }
+
+  void* ptr1;
+  void* ptr2;
+  {
+    CpuMemoryHandle handle(256);
+    ptr1 = handle.getBuf();
+  }
+  {
+    CpuMemoryHandle handle(256);
+    ptr2 = handle.getBuf();
+  }
+  EXPECT_EQ(ptr1, ptr2);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(MemoryHandle, Gpu) {
+  int numGpu = hl_get_device_count();
+
+  /* alloc from system memory */
+  void* ptr3[numGpu];
+  void* ptr4[numGpu];
+  for (int i = 0; i < numGpu; i++) {
+    SetDevice device(i);
+    GpuMemoryHandle handle1(30);
+    GpuMemoryHandle handle2(30);
+    GpuMemoryHandle handle3(4000);
+    GpuMemoryHandle handle4(500);
+    ptr3[i] = handle3.getBuf();
+    ptr4[i] = handle4.getBuf();
+  }
+
+  /* alloc from pool */
+  for (int i = 0; i < numGpu; i++) {
+    SetDevice device(i);
+    GpuMemoryHandle handle1(30);
+    GpuMemoryHandle handle3(4000);
+    GpuMemoryHandle handle4(500);
+    EXPECT_EQ(ptr3[i], handle3.getBuf());
+    EXPECT_EQ(ptr4[i], handle4.getBuf());
+  }
+}
+#endif
diff --git a/paddle/legacy/math/tests/test_BaseMatrix.cpp b/paddle/legacy/math/tests/test_BaseMatrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..488765c6ac203ad064146faaab7b8c423d53cf0b
--- /dev/null
+++ b/paddle/legacy/math/tests/test_BaseMatrix.cpp
@@ -0,0 +1,247 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+/**
+ * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
+ * implementation of CPU and GPU member function in
+ * BaseMatrix.cpp and Matrix.cpp.
+ */
+
+#include <gtest/gtest.h>
+#include "TestUtils.h"
+#include "paddle/legacy/math/BaseMatrix.h"
+
+using paddle::BaseMatrix;
+using paddle::Matrix;
+using autotest::AutoCompare;
+
+// Test all void (BaseMatrix::*)() function
+TEST(BaseMatrix, void) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)()) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg(f, height, width);
+      };
+
+      compare(&BaseMatrix::neg);
+      compare(&BaseMatrix::exp2);
+      compare(&BaseMatrix::log2);
+      compare(&BaseMatrix::sqrt2);
+      compare(&BaseMatrix::square2);
+      compare(&BaseMatrix::reciprocal2);
+      compare(&BaseMatrix::abs2);
+      compare(&BaseMatrix::sign2);
+      compare(&BaseMatrix::zero);
+      compare(&BaseMatrix::one);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(real) function
+TEST(BaseMatrix, real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0>(f, height, width);
+      };
+
+      compare(&BaseMatrix::pow2);
+      compare(&BaseMatrix::subScalar);
+      compare(&BaseMatrix::mulScalar);
+      compare(&BaseMatrix::divScalar);
+      compare(&BaseMatrix::assign);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::biggerThanScalar);
+      compare(&BaseMatrix::downClip);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(BaseMatrix&) function
+TEST(BaseMatrix, BaseMatrix) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0>(f, height, width);
+      };
+
+      compare(&BaseMatrix::assign);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::relu);
+      compare(&BaseMatrix::reluDerivative);
+      compare(&BaseMatrix::softrelu);
+      compare(&BaseMatrix::softreluDerivative);
+      compare(&BaseMatrix::brelu);
+      compare(&BaseMatrix::breluDerivative);
+      compare(&BaseMatrix::square2);
+      compare(&BaseMatrix::squareDerivative);
+      compare(&BaseMatrix::tanh);
+      compare(&BaseMatrix::tanhDerivative);
+      compare(&BaseMatrix::reciprocal2);
+      compare(&BaseMatrix::reciprocalDerivative);
+      compare(&BaseMatrix::abs2);
+      compare(&BaseMatrix::absDerivative);
+      compare(&BaseMatrix::sigmoid);
+      compare(&BaseMatrix::sigmoidDerivative);
+      compare(&BaseMatrix::expDerivative);
+      compare(&BaseMatrix::sign2);
+      compare(&BaseMatrix::exp2);
+      compare(&BaseMatrix::log2);
+      compare(&BaseMatrix::sqrt2);
+      compare(&BaseMatrix::dotMul);
+      compare(&BaseMatrix::dotMulSquare);
+      compare(&BaseMatrix::dotSquareMul);
+      compare(&BaseMatrix::addColVector);
+      compare(&BaseMatrix::addRowVector);
+      compare(&BaseMatrix::mulRowVector);
+      compare(&BaseMatrix::divRowVector);
+      compare(&BaseMatrix::mulColVector);
+      compare(&BaseMatrix::divColVector);
+      compare(&BaseMatrix::addP2P);
+      compare(&BaseMatrix::invSqrt);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(real, real) function
+TEST(BaseMatrix, real_real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(real, real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0, 1>(f, height, width);
+      };
+
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::clip);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(BaseMatrix&, real) function
+TEST(BaseMatrix, BaseMatrix_real) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0, 1>(f, height, width);
+      };
+
+      compare(&BaseMatrix::addBias);
+      compare(&BaseMatrix::add);
+      compare(&BaseMatrix::sub);
+      compare(&BaseMatrix::pow2);
+      compare(&BaseMatrix::addScalar);
+      compare(&BaseMatrix::subScalar);
+      compare(&BaseMatrix::mulScalar);
+      compare(&BaseMatrix::divScalar);
+      compare(&BaseMatrix::scalarDiv);
+      compare(&BaseMatrix::addSquare);
+      compare(&BaseMatrix::isEqualTo);
+    }
+  }
+}
+
+// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function
+TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      auto compare = [height,
+                      width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) {
+        AutoCompare test(height, width, 1e-5);
+        test.cmpWithoutArg<0, 1>(f, height, width);
+      };
+
+      compare(&BaseMatrix::softCrossEntropy);
+      compare(&BaseMatrix::softCrossEntropyBp);
+      compare(&BaseMatrix::binaryLabelCrossEntropy);
+      compare(&BaseMatrix::binaryLabelCrossEntropyBp);
+      compare(&BaseMatrix::sub);
+      compare(&BaseMatrix::add2);
+      compare(&BaseMatrix::dotMul);
+      compare(&BaseMatrix::dotDiv);
+      compare(&BaseMatrix::logisticRegressionLoss);
+      compare(&BaseMatrix::logisticRegressionLossBp);
+      compare(&BaseMatrix::biggerThan);
+      compare(&BaseMatrix::max2);
+      compare(&BaseMatrix::dotMulSquare);
+      compare(&BaseMatrix::dotSquareSquare);
+    }
+  }
+}
+
+void TestEelementWise(size_t height, size_t width) {
+  AutoCompare rowScale(height, width);
+  rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width);
+
+  AutoCompare rowDotMul(height, width);
+  rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width);
+
+  AutoCompare binaryClassificationError(height, width);
+  binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>(
+      &BaseMatrix::binaryClassificationError, height, width);
+
+  AutoCompare sumOfSquaresBp(height, width);
+  sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width);
+}
+
+void TestAggregateToRow(size_t height, size_t width) {
+  AutoCompare maxCols(1, width);
+  maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width);
+
+  AutoCompare minCols(1, width);
+  minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width);
+
+  AutoCompare addDotMulVMM(1, width);
+  addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width);
+
+  AutoCompare sumCols(1, width);
+  sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width);
+
+  AutoCompare collectBias(1, width);
+  collectBias.cmpWithoutArg<0, 1>(
+      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::collectBias),
+      height,
+      width);
+}
+
+void TestAggregateToCol(size_t height, size_t width) {
+  AutoCompare maxRows(height, 1);
+  maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width);
+
+  AutoCompare minRows(height, 1);
+  minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width);
+
+  AutoCompare sumRows(height, 1);
+  sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width);
+
+  AutoCompare sumOfSquares(height, 1);
+  sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width);
+}
+
+TEST(BaseMatrix, Other) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      TestEelementWise(height, width);
+      TestAggregateToRow(height, width);
+      TestAggregateToCol(height, width);
+    }
+  }
+}
+
+#endif
diff --git a/paddle/legacy/math/tests/test_CpuGpuVector.cpp b/paddle/legacy/math/tests/test_CpuGpuVector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..010fef534d1e19d2d7d134298eb97aa1b56e2270
--- /dev/null
+++ b/paddle/legacy/math/tests/test_CpuGpuVector.cpp
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/utils/Util.h"
+#include "test_matrixUtil.h"
+
+using namespace paddle;  // NOLINT
+
+TEST(CpuGpuVector, getData) {
+  size_t size = 500;
+  hl_stream_t stream(HPPL_STREAM_DEFAULT);
+  CpuVectorPtr cpuVec = std::make_shared<CpuVector>(size);
+  GpuVectorPtr gpuVec = std::make_shared<GpuVector>(size);
+  cpuVec->uniform(0.0, 10.0);
+  gpuVec->copyFrom(*cpuVec, stream);
+  hl_stream_synchronize(stream);
+
+  CpuGpuVectorPtr vec = std::make_shared<CpuGpuVector>(gpuVec);
+  auto a = vec->getData(false);
+  auto b = cpuVec->getData();
+  hl_stream_synchronize(stream);
+  checkDataEqual(a, b, size);
+}
+
+TEST(CpuGpuVector, subCreate) {
+  size_t size1 = 1024;
+  size_t offset = 100;
+  size_t size2 = 500;
+  hl_stream_t stream(HPPL_STREAM_DEFAULT);
+  CpuGpuVectorPtr v1 = std::make_shared<CpuGpuVector>(size1, /*useGpu*/ false);
+  auto vec = v1->getMutableVector(false);
+  vec->uniform(0.0, 10.0);
+  auto v2 = std::make_shared<CpuGpuVector>(*v1, offset, size2);
+  CHECK_EQ(*v1->getSync(), *v2->getSync());
+
+  // check subVec equal
+  checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2);
+
+  CpuVectorPtr v1Check = std::make_shared<CpuVector>(size1);
+  CpuVectorPtr v2Check = std::make_shared<CpuVector>(size2);
+  v1Check->copyFrom(*(v1->getVector(true)), stream);
+  v2Check->copyFrom(*(v2->getVector(true)), stream);
+  hl_stream_synchronize(stream);
+
+  checkDataEqual(v2->getData(false), v2Check->getData(), size2);
+  checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
+
+  CpuVectorPtr noise = std::make_shared<CpuVector>(size2);
+  noise->uniform(0.0, 1.0);
+  auto v = v2->getMutableVector(false);  // will change header
+  // add noise to subVec
+  v->add(*noise);
+
+  // check v1_cpu_data == v2_cpu_data
+  checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2);
+
+  v1Check->copyFrom(*(v1->getVector(true)), stream);
+  v2Check->copyFrom(*(v2->getVector(true)), stream);
+  hl_stream_synchronize(stream);
+
+  // check v1_gpu_data == v2_gpu_data
+  checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
+}
+
+#endif
diff --git a/paddle/legacy/math/tests/test_ExecViaCpu.cpp b/paddle/legacy/math/tests/test_ExecViaCpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b2ce0bc7ede133028fff8a855ff336ff83f55d82
--- /dev/null
+++ b/paddle/legacy/math/tests/test_ExecViaCpu.cpp
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/utils/PythonUtil.h>
+#include <paddle/legacy/utils/Util.h>
+#include <vector>
+#include "paddle/legacy/math/SparseMatrix.h"
+
+using namespace paddle;  // NOLINT
+
+const int height = 10;
+const int width = 16;
+
+real f(Matrix& mat1,
+       const Matrix& mat2,
+       IVector& vec1,
+       const IVector& vec2,
+       real scalar) {
+  CHECK(!mat1.useGpu());
+  CHECK(!mat2.useGpu());
+  CHECK(!vec1.useGpu());
+  CHECK(!vec2.useGpu());
+  mat1.copyFrom(mat2);
+  vec1.copyFrom(vec2);
+
+  return scalar;
+}
+
+class Functor {
+ public:
+  real operator()(Matrix& mat1,
+                  const Matrix& mat2,
+                  IVector& vec1,
+                  const IVector& vec2,
+                  real scalar) {
+    a_ = f(mat1, mat2, vec1, vec2, scalar);
+    return a_;
+  }
+
+ private:
+  real a_;
+};
+
+template <typename F>
+void testWrapper(F&& f) {
+  MatrixPtr cpumat1 = Matrix::create(height, width, false, /*useGpu=*/false);
+  MatrixPtr cpumat2 = Matrix::create(height, width, false, /*useGpu=*/false);
+
+  IVectorPtr cpuvec1 = IVector::create(height, /*useGpu=*/false);
+  IVectorPtr cpuvec2 = IVector::create(height, /*useGpu=*/false);
+
+  const real scalar = 1.23456;
+
+  MatrixPtr gpumat1 = Matrix::create(height, width, false, /*useGpu=*/true);
+  MatrixPtr gpumat2 = Matrix::create(height, width, false, /*useGpu=*/true);
+  IVectorPtr gpuvec1 = IVector::create(height, /*useGpu=*/true);
+  IVectorPtr gpuvec2 = IVector::create(height, /*useGpu=*/true);
+
+  cpumat2->randomizeUniform();
+  cpuvec2->rand(width);
+  gpumat2->copyFrom(*cpumat2);
+  gpuvec2->copyFrom(*cpuvec2);
+
+  real ret = execViaCpu(f, *gpumat1, *gpumat2, *gpuvec1, *gpuvec2, 1.23456);
+  EXPECT_EQ(ret, scalar);
+  cpumat1->copyFrom(*gpumat1);
+  cpuvec1->copyFrom(*gpuvec1);
+
+  for (int i = 0; i < height; ++i) {
+    EXPECT_EQ(cpuvec1->getElement(i), cpuvec2->getElement(i));
+    for (int j = 0; j < width; ++j) {
+      EXPECT_EQ(cpumat1->getElement(i, j), cpumat2->getElement(i, j));
+    }
+  }
+  gpumat1->resize(height, 1);
+  execViaCpu2(&CpuMatrix::selectElements, *gpumat1, *gpumat2, *gpuvec1);
+
+  cpumat1->resize(height, 1);
+  cpumat1->selectElements(*cpumat2, *cpuvec1);
+  for (int i = 0; i < height; ++i) {
+    EXPECT_EQ(cpumat1->getElement(i, 0), gpumat1->getElement(i, 0));
+  }
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(ExecViaCpu, test1) {
+  testWrapper(f);
+  testWrapper(&f);
+
+  auto lambda = [](Matrix& mat1,
+                   const Matrix& mat2,
+                   IVector& vec1,
+                   const IVector& vec2,
+                   real scalar) -> real {
+    return f(mat1, mat2, vec1, vec2, scalar);
+  };
+  LOG(INFO) << "lambda is_class=" << std::is_class<decltype(lambda)>::value
+            << " is_function=" << std::is_function<decltype(lambda)>::value;
+  testWrapper(lambda);
+
+  Functor functor;
+  testWrapper(functor);
+}
+#endif
diff --git a/paddle/legacy/math/tests/test_FPException.cpp b/paddle/legacy/math/tests/test_FPException.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa6aea71c8d959834ff11c04969e13bb36b630ff
--- /dev/null
+++ b/paddle/legacy/math/tests/test_FPException.cpp
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/**
+ * This test is about floating point calculation exception.
+ * Paddle catches FE_INVALID, FE DIVBYZERO and FE_OVERFLOW exceptions.
+ *
+ * Some exceptions occur in the middle of a set of formulas,
+ * that can be circumvented by some tricks.
+ * For example,
+ * calculate tanh
+ *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+ *
+ * If the result of (-2 * a) is too large,
+ * a FE_OVERFLOW exception occurs when calculating exp.
+ * But the result of tanh is no overflow problem,
+ * so we can add some tricks to prevent exp calculate an excessive value.
+ *
+ */
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Common.h"
+
+using namespace paddle;  // NOLINT
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+      LOG(FATAL) << "should not reach here";
+    }
+  }
+}
+
+template <typename Matrix>
+void testTanh(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->tanh(*B);
+}
+
+template <typename Matrix>
+void testSigmoid(real illegal) {
+  MatrixPtr A = std::make_shared<Matrix>(10, 10);
+  MatrixPtr B = std::make_shared<Matrix>(10, 10);
+  A->randomizeUniform();
+  B->randomizeUniform();
+
+  SetTensorValue(*A, illegal);
+
+  A->sigmoid(*B);
+}
+
+TEST(fp, overflow) {
+  for (auto illegal : {-90.0, 90.0}) {
+    LOG(INFO) << " illegal=" << illegal;
+    testTanh<CpuMatrix>(illegal);
+    testSigmoid<CpuMatrix>(illegal);
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/math/tests/test_GpuProfiler.cpp b/paddle/legacy/math/tests/test_GpuProfiler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ee27109f218ca56df8f42ca6395b22621f5fbc11
--- /dev/null
+++ b/paddle/legacy/math/tests/test_GpuProfiler.cpp
@@ -0,0 +1,165 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK(matrix1.getHeight() == matrix2.getHeight());
+  CHECK(matrix1.getWidth() == matrix2.getWidth());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = matrix1.getData();
+  const real* data2 = matrix2.getData();
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      real a = data1[i * width + j];
+      real b = data2[i * width + j];
+      if (fabs(a - b) > err) {
+        if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
+          count++;
+        }
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
+void testBilinearFwdBwd(int numSamples,
+                        int imgSizeH,
+                        int imgSizeW,
+                        int channels) {
+  int inWidth = imgSizeH * imgSizeW * channels;
+  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
+  real ratioH = 0.5;
+  real ratioW = 0.5;
+
+  // forward
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+
+  input->randomizeUniform();
+  inputGpu->copyFrom(*input);
+
+  {
+    // nvprof: GPU Proflier
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    target->bilinearForward(*input,
+                            imgSizeH,
+                            imgSizeW,
+                            2 * imgSizeH,
+                            2 * imgSizeW,
+                            channels,
+                            ratioH,
+                            ratioW);
+    targetGpu->bilinearForward(*inputGpu,
+                               imgSizeH,
+                               imgSizeW,
+                               2 * imgSizeH,
+                               2 * imgSizeW,
+                               channels,
+                               ratioH,
+                               ratioW);
+  }
+
+  // check
+  targetCheck->copyFrom(*targetGpu);
+  MatrixCheckErr(*target, *targetCheck);
+
+  // backward
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr targetCheckGrad =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->bilinearBackward(*targetGrad,
+                              2 * imgSizeH,
+                              2 * imgSizeW,
+                              imgSizeH,
+                              imgSizeW,
+                              channels,
+                              ratioH,
+                              ratioW);
+  inputGpuGrad->bilinearBackward(*targetGpuGrad,
+                                 2 * imgSizeH,
+                                 2 * imgSizeW,
+                                 imgSizeH,
+                                 imgSizeW,
+                                 channels,
+                                 ratioH,
+                                 ratioW);
+
+  // check
+  targetCheckGrad->copyFrom(*inputGpuGrad);
+  MatrixCheckErr(*inputGrad, *targetCheckGrad);
+}
+
+TEST(Profiler, testBilinearFwdBwd) {
+  auto numSamples = 10;
+  auto channels = 16;
+  auto imgSize = 64;
+  {
+    // nvprof: GPU Proflier
+    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
+    // Paddle built-in timer
+    REGISTER_TIMER_INFO(
+        "testBilinearFwdBwd",
+        "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+    testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
+  }
+  globalStat.printAllStatus();
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+
+  // nvprof: GPU Proflier
+  REGISTER_GPU_PROFILER(
+      "RecursiveProfilingTest",
+      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+
+  return RUN_ALL_TESTS();
+}
+
+#endif
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/legacy/math/tests/test_Matrix.cpp
similarity index 100%
rename from paddle/math/tests/test_Matrix.cpp
rename to paddle/legacy/math/tests/test_Matrix.cpp
diff --git a/paddle/legacy/math/tests/test_RowBuffer.cpp b/paddle/legacy/math/tests/test_RowBuffer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2ef8cd303d65f50cd18adb7f80fa18a665b67340
--- /dev/null
+++ b/paddle/legacy/math/tests/test_RowBuffer.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/RowBuffer.h"
+
+TEST(RowBuffer, testAutoGrow) {
+  paddle::RowBuffer buf(128);
+  ASSERT_EQ(128UL, buf.getWidth());
+  ASSERT_TRUE(buf.isAutoGrowth());
+  buf.resize(2);
+  ASSERT_EQ(2UL, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+
+  auto data = buf.getWithAutoGrowth(2);
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    data[i] = i;
+  }
+
+  ASSERT_EQ(3UL, buf.getRowCount());
+  for (size_t i = 0; i < buf.getRowCount() - 1; ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    ASSERT_NEAR(i, buf.get(2)[i], 1e-5);
+  }
+}
+
+TEST(RowBuffer, testWithMemBuf) {
+  paddle::CpuMemHandlePtr mem =
+      std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real));
+  paddle::RowBuffer buf(mem, 128);
+  ASSERT_TRUE(!buf.isAutoGrowth());
+  ASSERT_EQ(2UL, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5);
+    }
+  }
+
+  ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*");
+}
diff --git a/paddle/legacy/math/tests/test_SIMDFunctions.cpp b/paddle/legacy/math/tests/test_SIMDFunctions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6490f70e336dadcf6710c83ced2afddc13b7812
--- /dev/null
+++ b/paddle/legacy/math/tests/test_SIMDFunctions.cpp
@@ -0,0 +1,171 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/math/SIMDFunctions.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <random>
+
+#include <stdlib.h>
+#include <time.h>
+
+static constexpr size_t VECTOR_LEN = 3072;
+static constexpr size_t BATCH_SIZE = 64;
+static constexpr size_t ALIGN = 32;
+static_assert(VECTOR_LEN % ALIGN == 0, "VECTOR_LEN % ALIGN == 0");
+static_assert(BATCH_SIZE % ALIGN == 0, "BATCH_SIZE % ALIGN == 0");
+static constexpr float EPSILON = 1e-5;
+static std::mt19937 RandomEngine(time(0));
+
+inline static std::unique_ptr<float[]> NewVector(size_t len = VECTOR_LEN,
+                                                 size_t align = ALIGN) {
+  float* ptr;
+  CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0);
+  return std::unique_ptr<float[]>(ptr);
+}
+
+inline static std::unique_ptr<float[]> NewRandomVector(size_t len = VECTOR_LEN,
+                                                       size_t align = ALIGN) {
+  std::uniform_real_distribution<float> dist(-100.0f, 100.0f);
+  auto generator = std::bind(dist, RandomEngine);
+  auto retv = NewVector(len, align);
+  std::generate_n(retv.get(), len, generator);
+  return retv;
+}
+
+TEST(SIMDFunction, addTo) {
+  typedef std::function<void(float*, const float*, size_t)> AddToMethodType;
+
+  AddToMethodType naive = paddle::simd::naive::addTo<float>;
+  AddToMethodType simd = paddle::simd::addTo<float>;
+
+  auto A = NewRandomVector();
+  auto B = NewRandomVector();
+
+  auto ACopy = NewVector();
+  memcpy(ACopy.get(), A.get(), VECTOR_LEN * sizeof(float));
+
+  naive(A.get(), B.get(), VECTOR_LEN);
+  simd(ACopy.get(), B.get(), VECTOR_LEN);
+
+  for (size_t i = 0; i < VECTOR_LEN; ++i) {
+    ASSERT_NEAR(A[i], ACopy[i], EPSILON);
+  }
+}
+
+TEST(SIMDFunction, batchAddTo) {
+  auto A = NewRandomVector();
+  auto ACopy = NewVector();
+  memcpy(ACopy.get(), A.get(), sizeof(float) * VECTOR_LEN);
+
+  std::vector<std::unique_ptr<float[]>> B;
+  for (size_t i = 0; i < BATCH_SIZE; ++i) {
+    B.emplace_back(NewRandomVector());
+  }
+  std::unique_ptr<float* []> BRaw(new float*[BATCH_SIZE]);
+  for (size_t i = 0; i < BATCH_SIZE; ++i) {
+    BRaw[i] = B[i].get();
+  }
+
+  typedef std::function<void(float*, const float**, int, size_t)>
+      BatchAddToMethodType;
+
+  BatchAddToMethodType naive = paddle::simd::naive::batchAddTo<float>;
+  BatchAddToMethodType simd = paddle::simd::batchAddTo<float>;
+
+  naive(A.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN);
+  simd(ACopy.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN);
+
+  for (size_t i = 0; i < VECTOR_LEN; ++i) {
+    ASSERT_NEAR(A[i], ACopy[i], EPSILON);
+  }
+}
+
+TEST(SIMDFunction, colMax) {
+  auto A = NewRandomVector(VECTOR_LEN * BATCH_SIZE);
+  auto naiveResult = NewVector(BATCH_SIZE);
+  auto simdResult = NewVector(BATCH_SIZE);
+
+  typedef std::function<void(float*, const float*, int, int)> ColMaxMethodType;
+  ColMaxMethodType naive = paddle::simd::naive::colMax<float>;
+  ColMaxMethodType simd = paddle::simd::colMax<float>;
+
+  naive(naiveResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN);
+  simd(simdResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN);
+
+  for (size_t i = 0; i < BATCH_SIZE; ++i) {
+    ASSERT_NEAR(naiveResult[i], simdResult[i], EPSILON);
+  }
+}
+
+TEST(SIMDFunction, decayL1_WithLR) {
+  auto dest = NewRandomVector();
+  auto src = NewRandomVector();
+  auto lr = NewRandomVector();
+  auto lambda = 0.23f;
+
+  auto simd_dest = NewVector();
+  memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN);
+
+  typedef std::function<void(float*, float*, float*, float, size_t)>
+      DecayL1MethodType;
+
+  DecayL1MethodType naive = [](
+      float* d, float* s, float* lr, float l, size_t len) {
+    paddle::simd::naive::decayL1<float>(d, s, lr, l, len);
+  };
+
+  DecayL1MethodType simd = [](
+      float* d, float* s, float* lr, float l, size_t len) {
+    paddle::simd::decayL1<float>(d, s, lr, l, len);
+  };
+
+  naive(dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN);
+  simd(simd_dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN);
+
+  for (size_t i = 0; i < VECTOR_LEN; ++i) {
+    ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
+  }
+}
+
+TEST(SIMDFunction, decayL1_WithoutLR) {
+  auto dest = NewRandomVector();
+  auto src = NewRandomVector();
+  auto lambda = 0.23;
+
+  auto simd_dest = NewVector();
+  memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN);
+
+  typedef std::function<void(float*, float*, float, size_t)> DecayL1MethodType;
+
+  DecayL1MethodType naive = [](float* d, float* s, float l, size_t len) {
+    paddle::simd::naive::decayL1<float>(d, s, l, len);
+  };
+
+  DecayL1MethodType simd = [](float* d, float* s, float l, size_t len) {
+    paddle::simd::decayL1<float>(d, s, l, len);
+  };
+
+  naive(dest.get(), src.get(), lambda, VECTOR_LEN);
+  simd(simd_dest.get(), src.get(), lambda, VECTOR_LEN);
+
+  for (size_t i = 0; i < VECTOR_LEN; ++i) {
+    ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
+  }
+}
diff --git a/paddle/legacy/math/tests/test_SparseMatrix.cpp b/paddle/legacy/math/tests/test_SparseMatrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..30896a945ec6d111c35eea94d8008a62593d2893
--- /dev/null
+++ b/paddle/legacy/math/tests/test_SparseMatrix.cpp
@@ -0,0 +1,565 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/legacy/utils/PythonUtil.h>
+#include <vector>
+#include "test_matrixUtil.h"
+
+using namespace paddle;  // NOLINT
+
+TEST(Matrix, CopyCpuMatrixToSparseMatrix) {
+  const size_t HEIGHT = 20;
+  const size_t WIDTH = 10;
+  const size_t WIDTH_TEST = 15;
+  MatrixPtr testMatrix(
+      new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 5, FLOAT_VALUE, SPARSE_CSR));
+  MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH));
+  testCpuMatrix->randomizeUniform();
+  testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT);
+  MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST));
+  mulCpuMatrix->randomizeUniform();
+  MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST)),
+      ret2(new CpuMatrix(HEIGHT, WIDTH_TEST));
+  ret1->zeroMem();
+  ret2->zeroMem();
+  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
+  ret2->mul(*testCpuMatrix, *mulCpuMatrix, 1.0, 1.0);
+  checkMatrixEqual(ret1, ret2);
+}
+
+struct MatrixPara {
+  size_t height;
+  size_t width;
+  bool trans;
+  bool sparse;
+  size_t nnz;
+  SparseFormat format;
+};
+
+#ifdef PADDLE_WITH_CUDA
+void test_sparse_matrix_mul(MatrixPara paraA,
+                            MatrixPara paraB,
+                            MatrixPara paraC) {
+  // for cpu sparse matrix mul
+  MatrixPtr cpuMatrixA, cpuMatrixB, cpuMatrixC, gpuMatrixC_d2h;
+  // for gpu sparse matrix mul
+  MatrixPtr gpuMatrixA, gpuMatrixB, gpuMatrixC;
+  // for cpu dense matrix mul
+  MatrixPtr cpuDenseA, cpuDenseB, cpuDenseC;
+
+  if (paraA.sparse) {
+    cpuMatrixA = Matrix::createSparseMatrix(paraA.height,
+                                            paraA.width,
+                                            paraA.nnz,
+                                            FLOAT_VALUE,
+                                            paraA.format,
+                                            paraA.trans,
+                                            false);
+    gpuMatrixA = Matrix::createSparseMatrix(paraA.height,
+                                            paraA.width,
+                                            paraA.nnz,
+                                            FLOAT_VALUE,
+                                            paraA.format,
+                                            paraA.trans,
+                                            true);
+  } else {
+    cpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
+    gpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, true);
+  }
+  cpuDenseA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
+
+  if (paraB.sparse) {
+    cpuMatrixB = Matrix::createSparseMatrix(paraB.height,
+                                            paraB.width,
+                                            paraB.nnz,
+                                            FLOAT_VALUE,
+                                            paraB.format,
+                                            paraB.trans,
+                                            false);
+    gpuMatrixB = Matrix::createSparseMatrix(paraB.height,
+                                            paraB.width,
+                                            paraB.nnz,
+                                            FLOAT_VALUE,
+                                            paraB.format,
+                                            paraB.trans,
+                                            true);
+  } else {
+    cpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, false);
+    gpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, true);
+  }
+  cpuDenseB = Matrix::create(paraB.height, paraB.width, paraB.trans, false);
+
+  if (paraC.sparse) {
+    cpuMatrixC = Matrix::createSparseMatrix(paraC.height,
+                                            paraC.width,
+                                            paraC.nnz,
+                                            FLOAT_VALUE,
+                                            paraC.format,
+                                            paraC.trans,
+                                            false);
+    gpuMatrixC = Matrix::createSparseMatrix(paraC.height,
+                                            paraC.width,
+                                            paraC.nnz,
+                                            FLOAT_VALUE,
+                                            paraC.format,
+                                            paraC.trans,
+                                            true);
+    gpuMatrixC_d2h = Matrix::createSparseMatrix(paraC.height,
+                                                paraC.width,
+                                                paraC.nnz,
+                                                FLOAT_VALUE,
+                                                paraC.format,
+                                                paraC.trans,
+                                                false);
+  } else {
+    cpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
+    gpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, true);
+    gpuMatrixC_d2h =
+        Matrix::create(paraC.height, paraC.width, paraC.trans, false);
+  }
+  cpuDenseC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
+
+  /*matrix init*/
+  hl_stream_t stream(HPPL_STREAM_1);
+  cpuMatrixA->randomizeUniform();
+  cpuMatrixB->randomizeUniform();
+  cpuMatrixC->randomizeUniform();
+
+  gpuMatrixA->copyFrom(*cpuMatrixA, stream);
+  gpuMatrixB->copyFrom(*cpuMatrixB, stream);
+  gpuMatrixC->copyFrom(*cpuMatrixC, stream);
+
+  cpuDenseA->copyFrom(*cpuMatrixA);
+  cpuDenseB->copyFrom(*cpuMatrixB);
+  cpuDenseC->copyFrom(*cpuMatrixC);
+
+  hl_stream_synchronize(stream);
+
+  /*matrix mul*/
+  cpuMatrixC->mul(*cpuMatrixA, *cpuMatrixB, 1.0, 1.0);
+  gpuMatrixC->mul(*gpuMatrixA, *gpuMatrixB, 1.0, 1.0);
+  cpuDenseC->mul(*cpuDenseA, *cpuDenseB, 1.0, 1.0);
+
+  gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream);
+  hl_stream_synchronize(stream);
+
+  /*check result*/
+  if (paraC.sparse) {
+    checkSMatrixEqual(
+        std::dynamic_pointer_cast<CpuSparseMatrix>(cpuMatrixC),
+        std::dynamic_pointer_cast<CpuSparseMatrix>(gpuMatrixC_d2h));
+    checkSMatrixEqual2Dense(
+        std::dynamic_pointer_cast<CpuSparseMatrix>(cpuMatrixC),
+        std::dynamic_pointer_cast<CpuMatrix>(cpuDenseC));
+  } else {
+    checkMatrixEqual(cpuMatrixC, gpuMatrixC_d2h);
+    checkMatrixEqual(cpuMatrixC, cpuDenseC);
+  }
+}
+
+TEST(Matrix, SparseMatrixMul) {
+  const size_t DIM_M = 4;
+  const size_t DIM_N = 4;
+  const size_t DIM_K = 8;
+  const size_t NNZ = 5;
+  for (auto format : {SPARSE_CSC, SPARSE_CSR}) {
+    std::string str_format = format == SPARSE_CSC ? "CSC" : "CSR";
+    LOG(INFO) << "test dense mul " << str_format;
+    test_sparse_matrix_mul(
+        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
+        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format},
+        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format});
+
+    LOG(INFO) << "test dense mul " << str_format << "  trans";
+    test_sparse_matrix_mul(
+        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
+        {DIM_N, DIM_K, /*trans*/ true, /*sparse*/ true, NNZ, format},
+        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format});
+
+    LOG(INFO) << "test dense mul dense 2 " << str_format;
+    test_sparse_matrix_mul(
+        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
+        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format},
+        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format});
+
+    LOG(INFO) << "test denseT mul dense 2 " << str_format;
+    test_sparse_matrix_mul(
+        {DIM_K, DIM_M, /*trans*/ true, /*sparse*/ false, NNZ, format},
+        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format},
+        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format});
+  }
+}
+
+TEST(Matrix, CopySparseMatrixToGpuSparseMatrix) {
+  const size_t HEIGHT = 20;
+  const size_t WIDTH = 10;
+  const size_t WIDTH_TEST = 15;
+  MatrixPtr testMatrix(
+      new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 2, FLOAT_VALUE, SPARSE_CSR));
+  MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH));
+  testCpuMatrix->randomizeUniform();
+  testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT);
+
+  MatrixPtr testGpuMatrix = testMatrix->clone(HEIGHT, WIDTH, true);
+  hl_stream_t gpuStream(HPPL_STREAM_3);
+  testGpuMatrix->copyFrom(*testMatrix, gpuStream);
+  hl_stream_synchronize(gpuStream);
+
+  MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST));
+  mulCpuMatrix->randomizeUniform();
+  MatrixPtr mulGpuMatrix(new GpuMatrix(WIDTH, WIDTH_TEST));
+  mulGpuMatrix->copyFrom(*mulCpuMatrix);
+  MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST));
+  MatrixPtr ret2(new GpuMatrix(HEIGHT, WIDTH_TEST));
+  ret1->zeroMem();
+  ret2->zeroMem();
+  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
+  ret2->mul(*testGpuMatrix, *mulGpuMatrix, 1.0, 1.0);
+  checkMatrixEqual(ret1, ret2);
+}
+
+#endif
+
+TEST(Matrix, SparseMatrixTranspose) {
+  for (auto height : {10, 50, 100}) {
+    for (auto width : {10, 50, 100}) {
+      auto nnz = height * width;
+      for (auto valueType : {FLOAT_VALUE, NO_VALUE}) {
+        for (auto format : {SPARSE_CSR, SPARSE_CSC}) {
+          for (auto sparseRate : {0.1, 0.2, 0.5}) {
+            MatrixPtr matA = Matrix::createSparseMatrix(
+                height, width, size_t(nnz * sparseRate), valueType, format);
+            MatrixPtr matB(new CpuSparseMatrix(
+                width, height, size_t(nnz * sparseRate), valueType, format));
+            matA->randomizeUniform();
+            matA->transpose(matB, false);
+
+            /*dense matrix transpose*/
+            CpuMatrixPtr matC(new CpuMatrix(height, width));
+            matC->copyFrom(*matA);
+            MatrixPtr matD(new CpuMatrix(width, height));
+            matC->transpose(matD, false);
+
+            /*check result*/
+            checkSMatrixEqual2Dense(
+                std::dynamic_pointer_cast<CpuSparseMatrix>(matB),
+                std::dynamic_pointer_cast<CpuMatrix>(matD));
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Matrix, CpuSparseMatrixSubMatrix) {
+  const size_t HEIGHT = 10;
+  const size_t WIDTH = 10;
+  const size_t NNZ = HEIGHT * WIDTH;
+  for (auto valueType : {FLOAT_VALUE, NO_VALUE}) {
+    size_t startRow = 3;
+    size_t rowNum = 2;
+    real sparseRate = 0.1;
+    /*sparse matrix init and get subMatrix*/
+    CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
+        HEIGHT, WIDTH, size_t(NNZ * sparseRate), valueType, SPARSE_CSR);
+    matA->randomizeUniform();
+    CpuSparseMatrixPtr matB = std::dynamic_pointer_cast<CpuSparseMatrix>(
+        matA->subMatrix(startRow, rowNum));
+
+    int start = matA->getRows()[startRow];
+    int end = matA->getRows()[startRow + rowNum];
+
+    /*compare two matrix*/
+    ASSERT_EQ(matB->getElementCnt(), size_t(end - start));
+    if (valueType == FLOAT_VALUE) {
+      for (size_t i = 0; i < matB->getElementCnt(); i++) {
+        ASSERT_FLOAT_EQ(matB->getValue()[start + i],
+                        matA->getValue()[start + i]);
+      }
+    }
+
+    for (size_t i = 0; i < matB->getElementCnt(); i++) {
+      ASSERT_EQ(matB->getCols()[start + i], matA->getCols()[start + i]);
+    }
+    for (size_t i = 0; i < rowNum; i++) {
+      ASSERT_EQ(matB->getRows()[i], matA->getRows()[startRow + i]);
+    }
+  }
+}
+
+void sparseValid(
+    int* major, int* minor, size_t nnz, size_t majorLen, size_t minorLen) {
+  CHECK_EQ(nnz, size_t(major[majorLen - 1]));
+  CHECK_EQ(nnz, minorLen);
+  for (size_t i = 0; i < majorLen - 1; i++) {
+    EXPECT_LE(major[i], major[i + 1]);
+    for (int j = major[i]; j < major[i + 1] - 1; j++) {
+      EXPECT_LE(minor[j], minor[j + 1]);
+    }
+  }
+}
+
+TEST(Matrix, CpuSparseMatrixRandUniform) {
+  const size_t HEIGHT = 5;
+  const size_t WIDTH = 10;
+  const size_t NNZ = HEIGHT * WIDTH;
+  int* major = nullptr;
+  int* minor = nullptr;
+  size_t majorLen = 0;
+  size_t minorLen = 0;
+  size_t nnz = 0;
+  for (auto valueType : {NO_VALUE, FLOAT_VALUE}) {
+    for (auto format : {SPARSE_CSR, SPARSE_CSC}) {
+      CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
+          HEIGHT, WIDTH, size_t(NNZ * 0.1), valueType, format);
+      matA->randomizeUniform();
+      nnz = matA->getElementCnt();
+      if (format == SPARSE_CSR) {
+        majorLen = matA->getHeight() + 1;
+        minorLen = matA->getElementCnt();
+        major = matA->getRows();
+        minor = matA->getCols();
+      } else {
+        majorLen = matA->getWidth() + 1;
+        minorLen = matA->getElementCnt();
+        major = matA->getCols();
+        minor = matA->getRows();
+      }
+      sparseValid(major, minor, nnz, majorLen, minorLen);
+    }
+  }
+}
+
+TEST(Matrix, CpuSparseMatrixCopyFrom) {
+  size_t height = 10;
+  size_t width = 8;
+  int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 30, 32};
+  sparse_non_value_t data[32];
+  for (size_t i = 0; i < 32; i++) {
+    data[i].col = ::rand() % width;
+  }
+  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
+      height, width, 32, NO_VALUE, SPARSE_CSR, false);
+  mat->copyFrom(indices, data);
+
+  /*compare indices*/
+  size_t sum = 0;
+  CHECK_EQ(sum, size_t(mat->getRows()[0]));
+  for (size_t i = 1; i < height + 1; i++) {
+    sum += indices[i] - indices[i - 1];
+    CHECK_EQ(sum, size_t(mat->getRows()[i]));
+  }
+  CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0]));
+  for (size_t i = 0; i < mat->getElementCnt(); i++) {
+    CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col));
+  }
+}
+
+TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
+  size_t height = 10;
+  size_t width = 8;
+  int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32};
+  sparse_float_value_t data[32];
+  int value[32] = {
+      1,                       // row_0 : 1
+      5, 3, 1, 6,              // row_1 : 4
+      0, 1, 2, 3,              // row_3 : 4
+      4, 5, 6, 7,              // row_4 : 4
+      2, 3,                    // row_5 : 2
+      3, 5,                    // row_6 : 2
+      0, 1,                    // row_7 : 2
+      0, 1, 2, 3, 4, 5, 6, 7,  // row_8 : 8
+      2, 4, 7, 3, 1            // row_9 : 5
+  };
+  for (size_t i = 0; i < 32; i++) {
+    data[i].col = value[i];
+    data[i].value = float(value[i]);
+  }
+  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
+      height, width, 32, FLOAT_VALUE, SPARSE_CSR, false);
+  mat->copyFrom(indices, data);
+
+  /*compare indices*/
+  size_t sum = 0;
+  CHECK_EQ(sum, size_t(mat->getRows()[0]));
+  for (size_t i = 1; i < height + 1; i++) {
+    sum += indices[i] - indices[i - 1];
+    CHECK_EQ(sum, size_t(mat->getRows()[i]));
+  }
+  CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0]));
+  for (size_t i = 0; i < mat->getElementCnt(); i++) {
+    CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col));
+  }
+
+  size_t trimedWidth = 4;
+  int64_t trimedIndices[11] = {0, 1, 3, 3, 7, 7, 9, 10, 12, 16, 19};
+  sparse_float_value_t trimedData[19];
+  int trimedValue[19] = {
+      1,  // row_0 : 1
+      3,
+      1,  // row_1 : 2
+      0,
+      1,
+      2,
+      3,  // row_3 : 4
+      2,
+      3,  // row_5 : 2
+      3,  // row_6 : 1
+      0,
+      1,  // row_7 : 2
+      0,
+      1,
+      2,
+      3,  // row_8 : 4
+      2,
+      3,
+      1  // row_9 : 3
+  };
+  for (size_t i = 0; i < 19; i++) {
+    trimedData[i].col = trimedValue[i];
+    trimedData[i].value = float(trimedValue[i]);
+  }
+  CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
+      height, trimedWidth, 19, FLOAT_VALUE, SPARSE_CSR, false);
+  matA->copyFrom(trimedIndices, trimedData);
+
+  /*compare indices*/
+  sum = 0;
+  CHECK_EQ(sum, size_t(matA->getRows()[0]));
+  for (size_t i = 1; i < height + 1; i++) {
+    sum += trimedIndices[i] - trimedIndices[i - 1];
+    CHECK_EQ(sum, size_t(matA->getRows()[i]));
+  }
+  CHECK_EQ(matA->getElementCnt(),
+           size_t(trimedIndices[height] - trimedIndices[0]));
+  for (size_t i = 0; i < matA->getElementCnt(); i++) {
+    CHECK_EQ(size_t(matA->getCols()[i]), size_t(trimedData[i].col));
+  }
+
+  CpuSparseMatrixPtr matB = std::make_shared<CpuSparseMatrix>(
+      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, false);
+  matB->trimFrom(*mat);
+  checkSMatrixEqual2(matA, matB);
+
+#ifdef PADDLE_WITH_CUDA
+  GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
+      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
+  matC->trimFrom(*mat);
+
+  CpuSparseMatrixPtr matD =
+      std::make_shared<CpuSparseMatrix>(height,
+                                        trimedWidth,
+                                        matC->getElementCnt(),
+                                        FLOAT_VALUE,
+                                        SPARSE_CSR,
+                                        false);
+  matD->copyFrom(*matC, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  checkSMatrixEqual2(matA, matD);
+#endif
+}
+
+TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
+  size_t height = 8;
+  size_t width = 10;
+  int indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32};
+  int value[32] = {
+      1,                       // col_0 : 1
+      5, 3, 1, 6,              // col_1 : 4
+      0, 1, 2, 3,              // col_3 : 4
+      4, 5, 6, 7,              // col_4 : 4
+      2, 3,                    // col_5 : 2
+      3, 5,                    // col_6 : 2
+      0, 1,                    // col_7 : 2
+      0, 1, 2, 3, 4, 5, 6, 7,  // col_8 : 8
+      2, 4, 7, 3, 1            // col_9 : 5
+  };
+  std::vector<int> rows(value, value + 32);
+  std::vector<int> cols(indices, indices + 11);
+  std::vector<real> values(value, value + 32);
+  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
+      height, width, 32, FLOAT_VALUE, SPARSE_CSC, false);
+  mat->copyFrom(rows, cols, values);
+
+  /*compare indices*/
+  size_t sum = 0;
+  CHECK_EQ(sum, size_t(mat->getCols()[0]));
+  for (size_t i = 1; i < width + 1; i++) {
+    sum += indices[i] - indices[i - 1];
+    CHECK_EQ(sum, size_t(mat->getCols()[i]));
+  }
+  CHECK_EQ(mat->getElementCnt(), size_t(indices[width] - indices[0]));
+  for (size_t i = 0; i < mat->getElementCnt(); i++) {
+    CHECK_EQ(size_t(mat->getRows()[i]), size_t(value[i]));
+  }
+
+  size_t trimedWidth = 5;
+  int trimedIndices[6] = {0, 1, 5, 5, 9, 13};
+  int trimedValue[13] = {
+      1,  // col_0 : 1
+      5,
+      3,
+      1,
+      6,  // col_1 : 4
+      0,
+      1,
+      2,
+      3,  // col_3 : 4
+      4,
+      5,
+      6,
+      7  // col_4 : 4
+  };
+  std::vector<int> rowsA(trimedValue, trimedValue + 13);
+  std::vector<int> colsA(trimedIndices, trimedIndices + 6);
+  std::vector<real> valuesA(trimedValue, trimedValue + 13);
+  CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
+      height, trimedWidth, 13, FLOAT_VALUE, SPARSE_CSC, false);
+  matA->copyFrom(rowsA, colsA, valuesA);
+
+  /*compare indices*/
+  sum = 0;
+  CHECK_EQ(sum, size_t(matA->getCols()[0]));
+  for (size_t i = 1; i < trimedWidth + 1; i++) {
+    sum += trimedIndices[i] - trimedIndices[i - 1];
+    CHECK_EQ(sum, size_t(matA->getCols()[i]));
+  }
+  CHECK_EQ(matA->getElementCnt(),
+           size_t(trimedIndices[trimedWidth] - trimedIndices[0]));
+  for (size_t i = 0; i < matA->getElementCnt(); i++) {
+    CHECK_EQ(size_t(matA->getRows()[i]), size_t(rowsA[i]));
+  }
+
+  CpuSparseMatrixPtr matB = std::make_shared<CpuSparseMatrix>(
+      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, false);
+  matB->trimFrom(*mat);
+  checkSMatrixEqual2(matA, matB);
+
+#ifdef PADDLE_WITH_CUDA
+  GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
+      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
+  matC->trimFrom(*mat);
+
+  CpuSparseMatrixPtr matD =
+      std::make_shared<CpuSparseMatrix>(height,
+                                        trimedWidth,
+                                        matC->getElementCnt(),
+                                        FLOAT_VALUE,
+                                        SPARSE_CSC,
+                                        false);
+  matD->copyFrom(*matC, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  checkSMatrixEqual2(matA, matD);
+#endif
+}
diff --git a/paddle/legacy/math/tests/test_Tensor.cu b/paddle/legacy/math/tests/test_Tensor.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3ce056d66140059be8145f7f49bb80cbff4686eb
--- /dev/null
+++ b/paddle/legacy/math/tests/test_Tensor.cu
@@ -0,0 +1,1162 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "TensorCheck.h"
+#include "paddle/legacy/math/Matrix.h"
+
+using paddle::Matrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using paddle::CpuVector;
+using paddle::GpuVector;
+using paddle::CpuIVector;
+using paddle::GpuIVector;
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
+
+#define INIT_UNARY(A1, A2)  \
+  Tensor A1(height, width); \
+  Tensor A2(height, width); \
+  A1.randomizeUniform();    \
+  A2.copyFrom(A1)
+#define INIT_BINARY(A1, A2, B) \
+  INIT_UNARY(A1, A2);          \
+  Tensor B(height, width);     \
+  B.randomizeUniform()
+#define INIT_TERNARY(A1, A2, B, C) \
+  INIT_BINARY(A1, A2, B);          \
+  Tensor C(height, width);         \
+  C.randomizeUniform()
+#define INIT_QUATERNARY(A1, A2, B, C, D) \
+  INIT_TERNARY(A1, A2, B, C);            \
+  Tensor D(height, width);               \
+  D.randomizeUniform()
+
+template <typename Tensor>
+struct TestUnaryMatrix {
+  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
+
+  explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_UNARY(A1, A2);
+        testUnaryFunc(A1, A2);
+      }
+    }
+  }
+};
+
+template <typename Tensor>
+struct TestBinaryMatrix {
+  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
+
+  explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_BINARY(A1, A2, B);
+        testBinaryFunc(A1, A2, B);
+      }
+    }
+  }
+};
+
+template <typename Tensor>
+struct TestTernaryMatrix {
+  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
+      TernaryFunc;
+
+  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_TERNARY(A1, A2, B, C);
+        testTernaryFunc(A1, A2, B, C);
+      }
+    }
+  }
+};
+
+template <typename Tensor>
+struct TestQuaternaryMatrix {
+  typedef std::function<void(
+      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
+      QuaternaryFunc;
+
+  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
+    for (auto height : {1, 11, 73, 128, 200, 330}) {
+      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
+        LOG(INFO) << " height=" << height << " width=" << width;
+        INIT_QUATERNARY(A1, A2, B, C, D);
+        testQuaternaryFunc(A1, A2, B, C, D);
+      }
+    }
+  }
+};
+
+template <typename Tensor, class T>
+struct TestUnaryVectorT {
+  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
+
+  explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) {
+    for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) {
+      LOG(INFO) << " size=" << size;
+      Tensor A1(size);
+      Tensor A2(size);
+      if (typeid(T) == typeid(real)) {
+        A1.rand();
+      } else {
+        A1.rand(1000);
+      }
+      A2.copyFrom(A1);
+      testUnaryFunc(A1, A2);
+    }
+  }
+};
+
+void SetTensorValue(Matrix& matrix, real value) {
+  int height = matrix.getHeight();
+  int width = matrix.getWidth();
+  int stride = matrix.getStride();
+  real* data = matrix.getData();
+  for (int i = 0; i < height; i++) {
+    int j = rand() % width;  // NOLINT
+    if (typeid(matrix) == typeid(CpuMatrix)) {
+      data[i * stride + j] = value;
+    } else if (typeid(matrix) == typeid(GpuMatrix)) {
+      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
+    } else {
+    }
+  }
+}
+
+template <typename Tensor>
+void testTensorAddScalar(Tensor& A1, Tensor& A2) {
+  real p1 = 2.5;
+  real p2 = 3.0;
+  A1.add(p1);  // a += p
+  A2 += p1;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(p1, p2);  // a = a * p1 + p2
+  A2 = A2 * p1 + p2;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSubScalar(Tensor& A1, Tensor& A2) {
+  real p = 2.5;
+  A1.subScalar(p);  // a -= p
+  A2 -= p;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorMulScalar(Tensor& A1, Tensor& A2) {
+  real p = 2.5;
+  A1.mulScalar(p);  // a *= p
+  A2 *= p;
+  TensorCheckEqual(A1, A2);
+
+  real learningRate = 0.7f;
+  real decayRate = 1.2f;
+  A1.applyL2(learningRate, decayRate);
+  A2 = A2 * (1.0f / (1.0f + learningRate * decayRate));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorDivScalar(Tensor& A1, Tensor& A2) {
+  real p = 2.5;
+  A1.divScalar(p);  // a /= p
+  A2 /= p;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorNeg(Tensor& A1, Tensor& A2) {
+  A1.neg();  // a = -a
+  A2 = -A2;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAbs(Tensor& A1, Tensor& A2) {
+  A1.abs2();  // a = a > 0 ? a : -a
+  A2 = A2.abs();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSquare(Tensor& A1, Tensor& A2) {
+  A1.square2();  // a = a * a
+  A2 = A2.square();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorReciprocal(Tensor& A1, Tensor& A2) {
+  A1.reciprocal2();  // a = 1.0f / a
+  A2 = A2.reciprocal();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSign(Tensor& A1, Tensor& A2) {
+  A1.sign2();  // a = (a > 0) - (a < 0)
+  A2 = A2.sign();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAssign(Tensor& A1, Tensor& A2) {
+  A1.assign(1.5);  // a = p
+  A2 = A2.constant(1.5);
+  TensorCheckEqual(A1, A2);
+
+  A1.one();  // a = 1
+  A2 = A2.constant(1.0);
+  TensorCheckEqual(A1, A2);
+
+  A1.zero();  // a = 0
+  A2 = A2.constant(0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
+  testTensorAddScalar(A1, A2);
+  testTensorSubScalar(A1, A2);
+  testTensorMulScalar(A1, A2);
+  testTensorDivScalar(A1, A2);
+  testTensorNeg(A1, A2);
+  testTensorAbs(A1, A2);
+  testTensorSquare(A1, A2);
+  testTensorReciprocal(A1, A2);
+  testTensorSign(A1, A2);
+  testTensorAssign(A1, A2);
+}
+
+template <typename Tensor>
+void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
+  A1.add(2);  // a += p
+  A2 += 2;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(3, 2);  // a = a * p1 + p2
+  A2 = A2 * 3 + 2;
+  TensorCheckEqual(A1, A2);
+
+  testTensorNeg(A1, A2);
+  testTensorAbs(A1, A2);
+}
+
+TEST(Unary, BaseOp) {
+  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
+  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
+  TestUnaryVectorT<CpuIVector, int> testCpuIVector(
+      testUnaryBaseOpInt<CpuIVector>);
+
+#ifdef PADDLE_WITH_GPU
+  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
+  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
+  TestUnaryVectorT<GpuIVector, int> testGpuIVector(
+      testUnaryBaseOpInt<GpuIVector>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorExp(Tensor& A1, Tensor& A2) {
+  A1.exp2();  // a = exp(a)
+  A2 = A2.exp();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorLog(Tensor& A1, Tensor& A2) {
+  A1.log2();  // a = log(a)
+  A2 = A2.log();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSqrt(Tensor& A1, Tensor& A2) {
+  A1.sqrt2();  // a = sqrt(a)
+  A2 = A2.sqrt();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorPow(Tensor& A1, Tensor& A2) {
+  A1.pow2(3.2);  // a = pow(a, p)
+  A2 = A2.pow(3.2);
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testUnayrMathOp(Tensor& A1, Tensor& A2) {
+  testTensorExp(A1, A2);
+  testTensorLog(A1, A2);
+  testTensorSqrt(A1, A2);
+  testTensorPow(A1, A2);
+}
+
+TEST(Unary, MathOp) {
+  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorClip(Tensor& A1, Tensor& A2) {
+  real p1 = 0.003f;
+  real p2 = 0.877f;
+  A1.clip(p1, p2);  // a = a < p1 ? p1 : (a > p2 ? p2 : a)
+  // A2 = A2.min(0.877f).max(0.003f);
+  A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
+  real p = 0.5f;
+  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
+  A2 = (A2 > p).condition((real)1.0, (real)0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorapplyL1(Tensor& A1, Tensor& A2) {
+  /**
+   * T lambda = p;
+   * a = (a > lambda) ? (a - lambda)
+   *                  : (a < -lambda) ? (a + lambda) : 0
+   *
+   * p = learningRate * decayRate;
+   */
+  real learningRate = 0.7f;
+  real decayRate = 0.6f;
+  A1.applyL1(learningRate, decayRate);
+  A2 = (A2 > (learningRate * decayRate))
+           .condition(
+               (A2 - (learningRate * decayRate)),
+               (A2 < -(learningRate * decayRate))
+                   .condition((A2 + (learningRate * decayRate)), (real)0.0));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
+  testTensorClip(A1, A2);
+  testTensorBiggerThanScalar(A1, A2);
+
+  A1.randomizeUniform();
+  A1.subScalar(0.5f);
+  A2.copyFrom(A1);
+  testTensorapplyL1(A1, A2);
+}
+
+TEST(Unary, CompareOp) {
+  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p1 = 2.5;
+  real p2 = 3.2;
+  A1.add(B);  // a += b
+  A2 += B;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(B, p1);  // a += b * p
+  A2 += B * p1;
+  TensorCheckEqual(A1, A2);
+
+  A1.add(B, p1, p2);  // a = p1 * a + p2 * b
+  A2 = A2 * p1 + B * p2;
+  TensorCheckEqual(A1, A2);
+
+  A1.addScalar(B, p1);  // a = b + p
+  A2 = B + p1;
+  TensorCheckEqual(A1, A2);
+
+  A1.addSquare(B, p1);  // a += p * b * b
+  A2 += B.constant(p1) * B * B;
+  TensorCheckEqual(A1, A2);
+
+  A1.decayAddSquare(B, p1, p2);  // a = p1 * a + p2 * b * b
+  A2 = A2 * p1 + B.constant(p2) * B * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 2.5;
+  A1.sub(B);  // a -= b
+  A2 -= B;
+  TensorCheckEqual(A1, A2);
+
+  A1.sub(B, p);  // a -= b * p
+  A2 -= B * p;
+  TensorCheckEqual(A1, A2);
+
+  A1.subScalar(B, p);  // a = b - p
+  A2 = B - p;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 2.5;
+  A1.mulScalar(B, p);  // a = b * p
+  A2 = B * p;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotMulSquare(B);  // a *= b * b
+  A2 *= B * B;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotSquareMul(B);  // a = a * a * b
+  A2 = A2 * A2 * B;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotMul(B);  // a *= b
+  A2 *= B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 2.5;
+  A1.divScalar(B, p);  // a = b / p
+  A2 = B / p;
+  TensorCheckEqual(A1, A2);
+
+  A1.scalarDiv(B, p);  // a = p / b
+  A2 = B.constant(p) / B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.assign(B);  // a = b
+  A2 = B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.square2(A1);  // b = a * a
+  A2 = B.square();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.squareDerivative(B);  // a *= 2.0 * b
+  A2 = A2 * (real)2.0 * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.reciprocal2(A1);  // b = 1.0f / a
+  A2 = B.reciprocal();
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 0.58;
+  real p2 = 0.32;
+  A1.reciprocal2(B, p1, p2);  // a = 1 / (p1 * b + p2)
+  A2 = (B * p1 + p2).reciprocal();
+  TensorCheckEqual(A1, A2);
+
+  real learningRate = 0.7f;
+  real decayRate = 1.2f;
+  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
+  A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
+            .reciprocal();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.reciprocalDerivative(B);  // a *= -b * b
+  A2 *= (-B) * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
+  A2 = B.sign();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.abs2(A1);  // b = a > 0.0f ? a : -a
+  A2 = B.abs();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
+  testTensorAdd(A1, A2, B);
+  testTensorSub(A1, A2, B);
+  testTensorMul(A1, A2, B);
+  testTensorDiv(A1, A2, B);
+  testTensorSquare(A1, A2, B);
+  testTensorSquareDerivative(A1, A2, B);
+  testTensorReciprocal(A1, A2, B);
+  testTensorReciprocalDerivative(A1, A2, B);
+  testTensorAbs(A1, A2, B);
+  testTensorSign(A1, A2, B);
+  testTensorAssign(A1, A2, B);
+}
+
+TEST(Binary, BaseOp) {
+  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = exp(b)
+  A1.exp2(B);
+  A2 = B.exp();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.expDerivative(B);  // a *= b
+  A2 *= B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = log(b)
+  A1.log2(B);
+  A2 = B.log();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = sqrt(b)
+  A1.sqrt2(B);
+  A2 = B.sqrt();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
+  // a = 1.0f / sqrt(b)
+  A1.invSqrt(B);
+  A2 = B.sqrt().reciprocal();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.pow2(B, 2.5f);  // a = pow(b, p)
+  A2 = B.pow(2.5f);
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * const T THRESHOLD = 40.0;
+   * b = log(1.0 +
+   *         exp((a > THRESHOLD) ? THRESHOLD
+   *             : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))
+   */
+  B.softrelu(A1);
+
+  real THRESHOLD = 40.0;
+  A2 = (B.constant(1.0f) +
+        (B > THRESHOLD)
+            .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
+            .exp())
+           .log();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * const T THRESHOLD = 40.0;
+   * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
+   *                             ? THRESHOLD
+   *                             : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
+   */
+  A1.softreluDerivative(B);
+  real THRESHOLD = 40.0;
+  A2 = A2 *
+       (B.constant(1.0f) -
+        (B.constant(-1.0f) *
+         (B > THRESHOLD)
+             .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
+            .exp());
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+    const T THRESHOLD_MIN = -40.0;
+    const T THRESHOLD_MAX = 13.0;
+    T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN
+            : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
+    b = 1.0f / (1.0f + exp(-tmp)))
+   */
+  B.sigmoid(A1);
+
+  const real THRESHOLD_MIN = -40.0;
+  const real THRESHOLD_MAX = 13.0;
+  auto tmp = (B < THRESHOLD_MIN)
+                 .condition(THRESHOLD_MIN,
+                            (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
+  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
+  A2 *= B * (B.constant(1.0f) - B);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
+  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.tanhDerivative(B);  // a *= 1 - b * b
+  A2 *= B.constant(1.0f) - B * B;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p1 = 2.5;
+  real p2 = 3.1;
+  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
+  B.scaledTanh(A1, p1, p2);
+  A2 = B.constant(p1) *
+       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
+        (real)1.0);
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p1 = 2.5;
+  real p2 = 3.1;
+  // a *= (p2 / p1) * (p1 * p1 - b * b));
+  A1.scaledTanhDerivative(B, p1, p2);
+  A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
+  testTensorTanhDerivative(A1, A2, B);
+  testTensorScaledTanhDerivative(A1, A2, B);
+  testTensorSigmoidDerivative(A1, A2, B);
+  testTensorExpDerivative(A1, A2, B);
+  testTensorScaledTanh(A1, A2, B);
+  testTensorTanh(A1, A2, B);
+  testTensorExp(A1, A2, B);
+  testTensorLog(A1, A2, B);
+  testTensorSqrt(A1, A2, B);
+  testTensorInvSqrt(A1, A2, B);
+  testTensorPow(A1, A2, B);
+
+  testTensorSoftrelu(A1, A2, B);
+  testTensorSoftreluDerivative(A1, A2, B);
+  testTensorSigmoid(A1, A2, B);
+}
+
+TEST(Binary, MathOp) {
+  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
+  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
+  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
+  /*
+   * b = a > p1 ? a : p1
+   * b = b < p2 ? b : p2
+   * int p1 = 0, p2 = 24;
+   */
+  SetTensorValue(B, 32.0f);
+  B.brelu(A1);
+  auto tmp = (B > (real)0.0f).condition(B, (real)0.0f);
+  A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  SetTensorValue(B, 32.0f);
+  /*
+   * a *= (b > p1 && b < p2) ? 1.0 : 0.0
+   * int p1 = 0, p2 = 24;
+   */
+  A1.breluDerivative(B);
+  A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
+  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
+  A2 = (B > (real)0.0f)
+           .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
+  real p = 0.613;
+  SetTensorValue(B, p);
+  A1.isEqualTo(B, p);  // a = (b == p)
+  A2 = (B == p);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
+  /**
+   * T lambda = p * b;
+   * a = (a > lambda) ? (a - lambda)
+   *                  : (a < -lambda) ? (a + lambda) : 0
+   *
+   * p = learningRate * decayRate;
+   */
+  real learningRate = 0.7f;
+  real decayRate = 0.6f;
+  A1.applyL1(B, learningRate, decayRate);
+  auto lambda = B.constant(learningRate * decayRate) * B;
+  A2 = (A2 > lambda)
+           .condition((A2 - lambda),
+                      (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
+  B.subScalar(0.5f);
+  SetTensorValue(B, 0.0f);
+  testTensorReluDerivative(A1, A2, B);
+
+  A1.randomizeUniform();
+  A2.copyFrom(A1);
+  testTensorBreluDerivative(A1, A2, B);
+
+  testTensorAbsDerivative(A1, A2, B);
+  testTensorRelu(A1, A2, B);
+  testTensorBrelu(A1, A2, B);
+  testTensorIsEqualTo(A1, A2, B);
+}
+
+TEST(Binary, CompareOp) {
+  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.add(B, C);  // a = b + c
+  A2 = B + C;
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+  real p3 = 3.8;
+  A1.add(B, p1, C, p2);  // a = p1 * b + p2 * c
+  A2 = B * p1 + C * p2;
+  TensorCheckEqual(A1, A2);
+
+  A1.add2(B, C);  // a = a + b + c
+  A2 = A2 + B + C;
+  TensorCheckEqual(A1, A2);
+
+  A1.add2(B, C, p1, p2, p3);  // a = p1 * a + p2 * b + p3 * c
+  A2 = A2 * p1 + B * p2 + C * p3;
+  TensorCheckEqual(A1, A2);
+
+  A1.decayAddSquareMul(B, C, p1, p2);  // a = p1 * a + p2 * b * b * c * c
+  A2 = A2 * p1 + B.constant(p2) * B * B * C * C;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.sub(B, C);  // a = b - c
+  A2 = B - C;
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+  A1.sub(B, p1, C, p2);  // a = p1 * b - p2 * c
+  A2 = B * p1 - C * p2;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.dotMul(B, C);  // a = b * c
+  A2 = B * C;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotMulSquare(B, C);  // a = b * c * c
+  A2 = B * C * C;
+  TensorCheckEqual(A1, A2);
+
+  A1.dotSquareSquare(B, C);  // a = b * b * c * c
+  A2 = B * B * C * C;
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+
+  /*
+   * T tmp = p1 * b + p2 * c;
+   * a *= tmp * tmp
+   */
+  A1.dotMulSquareSum(B, C, p1, p2);
+  auto tmp = B * p1 + C * p2;
+  A2 *= tmp * tmp;
+  TensorCheckEqual(A1, A2);
+
+  /*
+   * T tmp = p1 * b + p2 * c;
+   * a = tmp * tmp
+   */
+  A1.dotSquareSum(B, C, p1, p2);
+  auto tmp2 = B * p1 + C * p2;
+  A2 = tmp2 * tmp2;
+  TensorCheckEqual(A1, A2);
+
+  // a *= p1 * b + p2 * c
+  A1.dotMulSum(B, C, p1, p2);
+  A2 *= B * p1 + C * p2;
+  TensorCheckEqual(A1, A2);
+
+  // a = p1 * a + p2 * b * c
+  A1.addDotMul(B, C, p1, p2);
+  A2 = A2 * p1 + B.constant(p2) * B * C;
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
+  A2 = (B == (real)0.0).condition((real)0.0, B / C);
+  TensorCheckEqual(A1, A2);
+
+  real p1 = 1.5;
+  real p2 = 2.5;
+  A1.dotDiv(B, C, p1, p2);  // a = (b + p1) / (c + p2)
+  A2 = (B + p1) / (C + p2);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  real p1 = 1.5;
+  real p2 = 2.5;
+  real p3 = 3.5;
+  A1.reciprocalSum(B, C, p1, p2, p3);  // a = 1 / (p1 * b + p2 * c + p3)
+  A2 = (B * p1 + C * p2 + p3).reciprocal();
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
+  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorSoftCrossEntropyBp(Tensor& A1,
+                                  Tensor& A2,
+                                  Tensor& B,
+                                  Tensor& C) {
+  A1.softCrossEntropyBp(B, C);  // a += (b - c) / (b * (1 - b))
+  A2 += (B - C) / (B * (B.constant(1.0f) - B));
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  testTensorAdd(A1, A2, B, C);
+  testTensorSub(A1, A2, B, C);
+  testTensorMul(A1, A2, B, C);
+  testTensorDiv(A1, A2, B, C);
+  testTensorReciprocal(A1, A2, B, C);
+  testTensorSoftCrossEntropyBp(A1, A2, B, C);
+
+  testTensorSoftCrossEntropy(A1, A2, B, C);
+}
+
+TEST(Ternary, BaseOp) {
+  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorBinaryLabelCrossEntropy(Tensor& A1,
+                                       Tensor& A2,
+                                       Tensor& B,
+                                       Tensor& C) {
+  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
+  A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log()));
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
+                                         Tensor& A2,
+                                         Tensor& B,
+                                         Tensor& C) {
+  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
+  A1.binaryLabelCrossEntropyBp(B, C);
+  A2 += (C > (real)0.5)
+            .condition((B.constant(-1.0f) / B),
+                       (B.constant(1.0f) - B).reciprocal());
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorLogisticRegressionLoss(Tensor& A1,
+                                      Tensor& A2,
+                                      Tensor& B,
+                                      Tensor& C) {
+  SetTensorValue(B, 50.0f);
+  SetTensorValue(B, -50.0f);
+  /**
+   * const T THRESHOLD = 40.0;
+   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
+   *                                        ? -THRESHOLD
+   *                                        : b;
+   * a = log(1 + exp(x)) - c * x
+   */
+  A1.logisticRegressionLoss(B, C);
+  real THRESHOLD = 40.0;
+  auto tmp =
+      (B > THRESHOLD)
+          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
+  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorLogisticRegressionLossBp(Tensor& A1,
+                                        Tensor& A2,
+                                        Tensor& B,
+                                        Tensor& C) {
+  SetTensorValue(B, 50.0f);
+  SetTensorValue(B, -50.0f);
+  /**
+   * const T THRESHOLD = 40.0;
+   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
+   *                                        ? -THRESHOLD
+   *                                        : b;
+   * x = exp(x); a = x / (1 + x) - c
+   */
+  A1.logisticRegressionLossBp(B, C);
+  real THRESHOLD = 40.0;
+  auto tmp =
+      (B > THRESHOLD)
+          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
+  auto tmp2 = tmp.exp();
+  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
+  A2 = (B > C).condition((real)1.0f, (real)0.0f);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  A1.max2(B, C);  // a = (b > c) ? b : c
+  A2 = (B > C).condition(B, C);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
+  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
+  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
+  testTensorBiggerThan(A1, A2, B, C);
+  testTensorMax(A1, A2, B, C);
+
+  testTensorLogisticRegressionLoss(A1, A2, B, C);
+  testTensorLogisticRegressionLossBp(A1, A2, B, C);
+}
+
+TEST(Ternary, CompareOp) {
+  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testQuaternaryAdd(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
+  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
+  // TensorCheckEqual(A1, A2);
+
+  /*
+   * T tmp = p1 * b + p2 * c + p3 * d;
+   * a += tmp * tmp
+   */
+  real p1 = 1.5f;
+  real p2 = 2.5f;
+  real p3 = 3.5f;
+  A1.addSquareSum(B, C, D, p1, p2, p3);
+  auto tmp = B * p1 + C * p2 + D * p3;
+  A2 += tmp * tmp;
+  TensorCheckEqual(A1, A2);
+}
+
+TEST(Quaternary, BaseOp) {
+  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
+#endif
+}
+
+template <typename Tensor>
+void testTensorBiggerThan(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
+  A1.biggerThan(B, C, D);
+  A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
+           .condition((real)1.0, (real)0.0);
+  TensorCheckEqual(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorRankLoss(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  /**
+   * const T THRESHOLD = 40.0; a = b - c;
+   * a = (a > THRESHOLD)
+   *         ? THRESHOLD
+   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
+   * a = log(1 + exp(a)) - a * d
+   */
+  A1.rankLoss(B, C, D);
+
+  real THRESHOLD = 40.0;
+  auto tmp = B - C;
+  auto tmp2 =
+      (tmp > THRESHOLD)
+          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
+  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;
+
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testTensorRankLossBp(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  /**
+   * const T THRESHOLD = 40.0; a = b - c;
+   * a = (a > THRESHOLD)
+   *         ? THRESHOLD
+   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
+   * a = exp(a); a = (a / (1 + a) - d)
+   */
+  A1.rankLossBp(B, C, D);
+  real THRESHOLD = 40.0;
+  auto tmp = B - C;
+  auto tmp2 =
+      (tmp > THRESHOLD)
+          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
+  auto tmp3 = tmp2.exp();
+  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
+
+  TensorCheckErr(A1, A2);
+}
+
+template <typename Tensor>
+void testQuaternaryCompareOp(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
+  testTensorBiggerThan(A1, A2, B, C, D);
+  testTensorRankLoss(A1, A2, B, C, D);
+  testTensorRankLossBp(A1, A2, B, C, D);
+}
+
+TEST(Quaternary, CompareOp) {
+  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
+
+#ifdef PADDLE_WITH_GPU
+  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
+#endif
+}
diff --git a/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp b/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..214ae8971ae953ce0266f03dc3bba8c6160f1cf6
--- /dev/null
+++ b/paddle/legacy/math/tests/test_TrainingAlgorithm.cpp
@@ -0,0 +1,461 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "OriginalOptimizerApi.h"
+#include "PerfUtils.h"
+#include "TensorCheck.h"
+#include "paddle/legacy/math/TrainingAlgorithmOp.h"
+#include "paddle/legacy/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+
+#ifndef PADDLE_TYPE_DOUBLE
+DEFINE_double(max_diff, 1e-5, "max diff allowed");
+#else
+DEFINE_double(max_diff, 1e-13, "max diff allowed");
+#endif
+
+class SetMaxDiff {
+ public:
+  explicit SetMaxDiff(double max_diff) {
+    max_diff_ = FLAGS_max_diff;
+    FLAGS_max_diff = max_diff;
+  }
+  ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
+
+ private:
+  double max_diff_;
+};
+
+#define COPY_VECTOR_TO_CPU(cpuVec, vector)               \
+  do {                                                   \
+    if (vector->useGpu()) {                              \
+      cpuVec = Vector::create(vector->getSize(), false); \
+      cpuVec->copyFrom(*vector);                         \
+    } else {                                             \
+      cpuVec = vector;                                   \
+    }                                                    \
+  } while (0)
+
+int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
+  CHECK(vector1.getSize() == vector2.getSize());
+
+  const real* data1 = vector1.getData();
+  const real* data2 = vector2.getData();
+  size_t size = vector1.getSize();
+  int count = 0;
+  for (size_t i = 0; i < size; i++) {
+    real a = data1[i];
+    real b = data2[i];
+    if (fabs(a - b) > FLAGS_max_diff) {
+      if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) {
+        count++;
+      }
+    }
+  }
+
+  return count;
+}
+
+int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
+  VectorPtr tmp1;
+  VectorPtr tmp2;
+  COPY_VECTOR_TO_CPU(tmp1, vector1);
+  COPY_VECTOR_TO_CPU(tmp2, vector2);
+  return VectorCheckErr(*tmp1, *tmp2);
+}
+
+#ifdef PADDLE_DISABLE_TIMER
+
+#define CHECK_VECTORPTR(vector1, vector2) \
+  EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
+
+#else
+
+#define CHECK_VECTORPTR(vector1, vector2)
+
+#endif
+
+typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
+
+void testCase(testMatrixFunc matrixFunc) {
+#ifdef PADDLE_WITH_CUDA
+  for (auto useGpu : {false, true}) {
+#else
+  for (auto useGpu : {false}) {
+#endif
+    for (auto size : {1,
+                      32,
+                      64,
+                      128,
+                      512,
+                      1024,
+                      4096,
+                      32768,
+                      65536,
+                      131072,
+                      262144,
+                      524288,
+                      1048576,
+                      2097152}) {
+      LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
+      matrixFunc(size, useGpu);
+    }
+  }
+}
+
+#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
+  vec1[type] = Vector::create(size, useGpu);        \
+  vec2[type] = Vector::create(size, useGpu);        \
+  vec1[type]->rand();                               \
+  vec2[type]->copyFrom(*vec1[type]);
+
+void testAdagrad(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+
+  EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
+      bufs1, epsilon, learningRate, momentum, decayRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(adagradApply(value,
+                                      grad,
+                                      mom,
+                                      accum_buffer,
+                                      accum,
+                                      lr,
+                                      epsilon,
+                                      learningRate,
+                                      momentum,
+                                      decayRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, Adagrad) { testCase(testAdagrad); }
+
+void testAdaDelta(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+
+  EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
+      bufs1, rou, epsilon, learningRate, momentum, decayRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(adadeltaApply(value,
+                                       grad,
+                                       mom,
+                                       accum,
+                                       accum_update,
+                                       lr,
+                                       rou,
+                                       epsilon,
+                                       learningRate,
+                                       momentum,
+                                       decayRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, AdaDelta) { testCase(testAdaDelta); }
+
+template <bool isFirstTime>
+void testRMSProp(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  /* make sure 'g - f.square()' greater than 0 */
+  bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
+  bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
+      *bufs1[PARAMETER_GRADIENT_SQURESUM]);
+
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  real accumulatedRou = rou;
+
+  EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1,
+                                                   accumulatedRou,
+                                                   rou,
+                                                   epsilon,
+                                                   learningRate,
+                                                   momentum,
+                                                   decayRate,
+                                                   isFirstTime));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(rmspropApply(value,
+                                      grad,
+                                      mom,
+                                      sum,
+                                      sum1,
+                                      lr,
+                                      accumulatedRou,
+                                      rou,
+                                      epsilon,
+                                      learningRate,
+                                      momentum,
+                                      decayRate,
+                                      isFirstTime));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, RMSProp) {
+  testCase(testRMSProp<true>);
+  testCase(testRMSProp<false>);
+}
+
+template <bool isFirstTime>
+void testDecayedAdagrad(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
+
+  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
+  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
+  real accumulatedRou = rou;
+
+  if (isFirstTime) {
+    bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
+    bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
+  }
+
+  EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1,
+                                                          accumulatedRou,
+                                                          rou,
+                                                          epsilon,
+                                                          learningRate,
+                                                          momentum,
+                                                          decayRate,
+                                                          isFirstTime));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
+
+  EXPRESSION_PERFORMANCE(decayedAdagradApply(value,
+                                             grad,
+                                             mom,
+                                             sum,
+                                             lr,
+                                             accumulatedRou,
+                                             rou,
+                                             epsilon,
+                                             learningRate,
+                                             momentum,
+                                             decayRate,
+                                             isFirstTime));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
+                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
+                  bufs2[PARAMETER_LEARNING_RATE]);
+}
+
+TEST(Training, DecayedAdagrad) {
+  testCase(testDecayedAdagrad<false>);
+  testCase(testDecayedAdagrad<true>);
+}
+
+void testAdam(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu);
+
+  real beta1 = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta2 = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta1_power = (real)rand() / (real)RAND_MAX;   // NOLINT
+  real beta2_power = (real)rand() / (real)RAND_MAX;   // NOLINT
+  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+
+  EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
+      bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM];
+
+  EXPRESSION_PERFORMANCE(adamApply(value,
+                                   grad,
+                                   mom,
+                                   v,
+                                   beta1,
+                                   beta2,
+                                   beta1_power,
+                                   beta2_power,
+                                   epsilon,
+                                   learningRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM],
+                  bufs2[PARAMETER_SECOND_MOMENTUM]);
+}
+
+TEST(Training, Adam) { testCase(testAdam); }
+
+void testAdamax(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);
+
+  real beta1 = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real beta2 = (real)rand() / (real)RAND_MAX;  // NOLINT
+  real alpha = (real)rand() / (real)RAND_MAX;  // NOLINT
+  int64_t step = 2;
+
+  EXPRESSION_PERFORMANCE(
+      AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
+  BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];
+
+  EXPRESSION_PERFORMANCE(
+      adamaxApply(value, grad, mom, u, beta1, beta2, step, alpha));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
+                  bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
+}
+
+TEST(Training, Adamax) {
+#ifndef PADDLE_TYPE_DOUBLE
+  SetMaxDiff diff(1e-4);
+#endif
+  testCase(testAdamax);
+}
+
+void testSparseMomentum(size_t size, bool useGpu) {
+  VectorPtr bufs1[NUM_PARAMETER_TYPES];
+  VectorPtr bufs2[NUM_PARAMETER_TYPES];
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
+  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);
+
+  real alpha = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real beta = (real)rand() / (real)RAND_MAX;          // NOLINT
+  real gamma = (real)rand() / (real)RAND_MAX;         // NOLINT
+  real tau = (real)rand() / (real)RAND_MAX;           // NOLINT
+  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
+
+  EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
+      bufs1, alpha, beta, gamma, tau, learningRate));
+
+  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
+  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
+  BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
+  BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];
+
+  EXPRESSION_PERFORMANCE(sparseMomentumApply(
+      value, grad, momU, momV, alpha, beta, gamma, tau, learningRate));
+
+  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]);
+  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]);
+}
+
+TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/legacy/math/tests/test_batchTranspose.cpp
similarity index 100%
rename from paddle/math/tests/test_batchTranspose.cpp
rename to paddle/legacy/math/tests/test_batchTranspose.cpp
diff --git a/paddle/legacy/math/tests/test_lazyAssign.cu b/paddle/legacy/math/tests/test_lazyAssign.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cf8c3d77199571dff314446a1e1b14e9b746e947
--- /dev/null
+++ b/paddle/legacy/math/tests/test_lazyAssign.cu
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "PerfUtils.h"
+#include "TensorCheck.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/TensorAssign.h"
+
+using paddle::BaseMatrix;
+using paddle::CpuMatrix;
+using paddle::GpuMatrix;
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
+
+typedef std::function<void(int height, int width)> testMatrixFunc;
+void testMatrixCase(testMatrixFunc matrixFunc) {
+  for (auto height : {1}) {
+    for (auto width : {1,
+                       32,
+                       64,
+                       128,
+                       512,
+                       1024,
+                       4096,
+                       32768,
+                       65536,
+                       131072,
+                       262144,
+                       524288,
+                       1048576,
+                       2097152,
+                       4194304,
+                       8388608}) {
+      matrixFunc(height, width);
+    }
+  }
+}
+
+template <typename Tensor>
+void testLazyAssign(int height, int width) {
+  Tensor A1(height, width);
+  Tensor A2(height, width);
+  Tensor B(height, width);
+  Tensor C(height, width);
+  Tensor D(height, width);
+  A1.randomizeUniform();
+  B.randomizeUniform();
+  C.randomizeUniform();
+  D.randomizeUniform();
+  A2.copyFrom(A1);
+
+  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
+
+  EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
+                         auto expr2 = A2.lazyAssign(A2 * D);
+                         AssignEvaluate(expr1, expr2););
+
+  TensorCheckErr(A1, A2);
+}
+
+TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
+
+#ifdef PADDLE_WITH_GPU
+TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
+#endif
+
+template <typename Tensor>
+void sgdUpdateTensor(
+    Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
+  C = C * p2 - D * (B + A * p3) * p1;
+  A += C;
+}
+
+void sgdUpdateLazyAssign(BaseMatrix& A,
+                         BaseMatrix& B,
+                         BaseMatrix& C,
+                         BaseMatrix& D,
+                         real p1,
+                         real p2,
+                         real p3) {
+  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
+  auto expr2 = A.lazyAssign(A + C);
+  AssignEvaluate(expr1, expr2);
+}
+
+template <typename Tensor>
+void testSgdUpdate(int height, int width) {
+  Tensor A1(height, width);
+  Tensor A2(height, width);
+  Tensor A3(height, width);
+  A1.randomizeUniform();
+  A2.copyFrom(A1);
+  A3.copyFrom(A1);
+
+  Tensor B(height, width);
+  B.randomizeUniform();
+
+  Tensor C1(height, width);
+  Tensor C2(height, width);
+  Tensor C3(height, width);
+  C1.randomizeUniform();
+  C2.copyFrom(C1);
+  C3.copyFrom(C1);
+
+  Tensor D(height, width);
+  D.randomizeUniform();
+
+  real p1 = 0.2;
+  real p2 = 0.3;
+  real p3 = 0.5;
+
+  /**
+   * c = p2 * c - p1 * (b + p3 * a);
+   * a = a + c;
+   */
+  // BaseMatrix API
+  EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););
+
+  // Tensor expression
+  EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
+
+  // lazyAssign
+  EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
+
+  TensorCheckErr(A1, A2);
+  TensorCheckErr(A1, A3);
+  TensorCheckErr(C1, C2);
+  TensorCheckErr(C1, C3);
+}
+
+TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
+
+#ifdef PADDLE_WITH_GPU
+TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
+#endif
diff --git a/paddle/legacy/math/tests/test_matrixCompare.cpp b/paddle/legacy/math/tests/test_matrixCompare.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a43adde46fc6526cc3ff5affec2ce1c7c3a44214
--- /dev/null
+++ b/paddle/legacy/math/tests/test_matrixCompare.cpp
@@ -0,0 +1,1698 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+/// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
+/// only cpu version.
+
+#include <gtest/gtest.h>
+#include "TensorCheck.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+#include "paddle/legacy/utils/DynamicLoader.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+using autotest::TensorCheckEqual;
+using autotest::TensorCheckErr;
+
+void testMatrixMaxSequence(int batchSize, int inputDim) {
+  // forward
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  int newBatchSize = cpuSequence->getSize() - 1;
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
+  cpuOutput->zero();
+  gpuOutput->zero();
+
+  IVectorPtr cpuIndex = nullptr;
+  IVectorPtr gpuIndex = nullptr;
+  IVector::resizeOrCreate(cpuIndex, newBatchSize * inputDim, false);
+  IVector::resizeOrCreate(gpuIndex, newBatchSize * inputDim, true);
+  cpuIndex->zeroMem();
+  gpuIndex->zeroMem();
+
+  cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex);
+  gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex);
+
+  TensorCheckEqual(*cpuOutput, *gpuOutput);
+  TensorCheckEqual(*cpuIndex, *gpuIndex);
+
+  // backward
+  MatrixPtr cpuOutputGrad = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
+  MatrixPtr gpuOutputGrad = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
+  cpuOutputGrad->randomizeUniform();
+  gpuOutputGrad->copyFrom(*cpuOutputGrad);
+
+  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInputGrad->randomizeUniform();
+  gpuInputGrad->copyFrom(*cpuInputGrad);
+
+  cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex);
+  gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex);
+
+  TensorCheckEqual(*cpuInputGrad, *gpuInputGrad);
+}
+
+TEST(Matrix, maxSequence) {
+  for (auto batchSize : {1, 3, 997}) {   // prime numbers close to 1, 4, 1024
+    for (auto inputDim : {1, 7, 131}) {  // prime numbers close to 1, 8, 128
+      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
+      testMatrixMaxSequence(batchSize, inputDim);
+    }
+  }
+}
+
+void testMatrixGetSum(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+#ifndef PADDLE_TYPE_DOUBLE
+  int x = log10(height * width);
+  real err = 1e-6 * pow(10, x);
+#else
+  real err = 1e-8;
+#endif
+
+  real cpuSum = cpuInput->getSum();
+  real gpuSum = gpuInput->getSum();
+
+  EXPECT_LE(fabs(cpuSum - gpuSum), err);
+}
+
+void testMatrixGetMinMax(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  real cpuMin = cpuInput->getMin();
+  real gpuMin = gpuInput->getMin();
+  real cpuMax = cpuInput->getMax();
+  real gpuMax = gpuInput->getMax();
+
+  EXPECT_EQ(cpuMin, gpuMin);
+  EXPECT_EQ(cpuMax, gpuMax);
+}
+
+void testMatrixZeroAtOffset(int height, int width) {
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr cpuTest = std::make_shared<CpuMatrix>(height, width);
+
+  cpuA->randomizeUniform();
+  gpuA->copyFrom(*cpuA);
+  cpuTest->copyFrom(*cpuA);
+
+  int columnOffset = rand() % width;  // NOLINT we just use rand() for test.
+  int numColumns = rand() % (width - columnOffset);  // NOLINT
+
+  if (numColumns == 0) return;
+
+  cpuA->zeroAtOffset(columnOffset, numColumns);
+  gpuA->zeroAtOffset(columnOffset, numColumns);
+
+  /* cpuTest */
+  real* a = cpuTest->getData() + columnOffset;
+  for (int64_t i = 0; i < height; ++i) {
+    for (int64_t j = 0; j < numColumns; ++j) {
+      a[i * width + j] = 0;
+    }
+  }
+
+  TensorCheckEqual(*cpuA, *gpuA);
+  TensorCheckEqual(*cpuA, *cpuTest);
+}
+
+void testMatrixDeepSwap(int height, int width) {
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuCopyA = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuCopyB = std::make_shared<CpuMatrix>(height, width);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuCopyA->copyFrom(*cpuA);
+  cpuCopyB->copyFrom(*cpuB);
+
+  // swap matrix cpuA and cpuB
+  cpuA->deepSwap(*cpuB);
+
+  TensorCheckEqual(*cpuA, *cpuCopyB);
+  TensorCheckEqual(*cpuB, *cpuCopyA);
+}
+
+void testMatrixTranspose(int height, int width) {
+  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr cpuT = std::make_shared<CpuMatrix>(width, height);
+  MatrixPtr gpuT = std::make_shared<GpuMatrix>(width, height);
+
+  cpu->randomizeUniform();
+  gpu->copyFrom(*cpu);
+  cpu->transpose(cpuT, false);
+  gpu->transpose(gpuT, true);
+
+  TensorCheckEqual(*cpuT, *gpuT);
+}
+
+void testMatrixRotate(int height, int width) {
+  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr cpuR = std::make_shared<CpuMatrix>(width, height);
+  MatrixPtr gpuR = std::make_shared<GpuMatrix>(width, height);
+
+  cpu->randomizeUniform();
+  gpu->copyFrom(*cpu);
+
+  cpu->rotate(cpuR, false, true);
+  gpu->rotate(gpuR, true, true);
+  TensorCheckEqual(*cpuR, *gpuR);
+
+  cpu->rotate(cpuR, true, false);
+  gpu->rotate(gpuR, false, false);
+  TensorCheckEqual(*cpuR, *gpuR);
+}
+
+void testMatrixInverse(int height) {
+  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, height);
+  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, height);
+  MatrixPtr cpuI = std::make_shared<CpuMatrix>(height, height);
+  MatrixPtr gpuI = std::make_shared<GpuMatrix>(height, height);
+
+  /* Make matrix well conditioned: cpu * cpuT + Identity */
+  cpu->randomizeUniform();
+  MatrixPtr cpuT = cpu->getTranspose();
+  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, height);
+  outputCheck->mul(*cpu, *cpuT);
+  cpu->setDiag(1.0);
+  cpu->add(*outputCheck);
+
+  gpu->copyFrom(*cpu);
+  cpu->inverse(cpuI, true);
+  gpu->inverse(gpuI, false);
+
+  TensorCheckErr(*cpuI, *gpuI);
+
+  outputCheck->mul(*cpu, *cpuI);
+  cpu->setDiag(1.0);
+  TensorCheckErr(*cpu, *outputCheck);
+}
+
+TEST(Matrix, unary) {
+  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
+    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
+      VLOG(3) << " height=" << height << " width=" << width;
+
+      testMatrixDeepSwap(height, width);
+      testMatrixZeroAtOffset(height, width);
+      testMatrixGetSum(height, width);
+      testMatrixTranspose(height, width);
+      testMatrixRotate(height, width);
+    }
+#ifdef LAPACK_FOUND
+    // inverse matrix
+    testMatrixInverse(height);
+#else
+    LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK"
+                 << "support so we cannot test matrix inverse. To test "
+                 << "matrix inverse, please install LAPACKE "
+                 << "and MKL/Openblas, and re-build PaddlePaddle.";
+#endif
+  }
+}
+
+void testMatrixSoftmax(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
+
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+  cpuOutput->zero();
+  gpuOutput->zero();
+  cpuInput->softmax(*cpuOutput);
+  gpuInput->softmax(*gpuOutput);
+
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+}
+
+void testSequenceSoftmax(int batchSize) {
+  // forward
+  int inputDim = 1;
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
+  gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
+
+  TensorCheckErr(*cpuInput, *gpuInput);
+}
+
+void testMatrixSoftmaxThreshold(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
+
+  cpuInput->randomizeUniform();
+  cpuInput->getData()[0] = 100.0;
+  gpuInput->copyFrom(*cpuInput);
+  cpuOutput->zero();
+  gpuOutput->zero();
+  cpuInput->softmax(*cpuOutput);
+  gpuInput->softmax(*gpuOutput);
+
+  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
+  outputCheck->copyFrom(*gpuOutput);
+  // check output zero
+  int cpuCount = 0;
+  int gpuCount = 0;
+  auto zeroNum = [](MatrixPtr out, int& count) {
+    for (size_t i = 0; i < out->getHeight(); i++) {
+      for (size_t j = 0; j < out->getWidth(); j++) {
+        if (out->getElement(i, j) == 0) count++;
+      }
+    }
+  };
+  zeroNum(cpuOutput, cpuCount);
+  zeroNum(outputCheck, gpuCount);
+  EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0";
+  EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0";
+}
+
+void testMatrixSoftmaxBp(int height, int width) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
+
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+  cpuOutput->randomizeUniform();
+  gpuOutput->copyFrom(*cpuOutput);
+  gpuOutput->softmaxBackward(*gpuInput);
+
+  MatrixPtr sftMaxSum = std::make_shared<CpuMatrix>(height, 1);
+  MatrixPtr sftMaxDot = std::make_shared<CpuMatrix>(height, width);
+  sftMaxDot->dotMul(*cpuOutput, *cpuInput);
+  sftMaxSum->colMerge(*sftMaxDot);
+  cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum);
+
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+}
+
+TEST(Matrix, softmax) {
+  for (auto height : {1, 3, 131}) {    // prime numbers close to 1, 4, 127
+    for (auto width : {1, 17, 251}) {  // prime numbers close to 1, 16, 256
+      VLOG(3) << " height=" << height << " width=" << width;
+
+      testMatrixSoftmax(height, width);
+      testMatrixSoftmaxBp(height, width);
+      testMatrixSoftmaxThreshold(height, width);
+    }
+    testSequenceSoftmax(height);
+  }
+}
+
+void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
+  MatrixPtr cpuTable = std::make_shared<CpuMatrix>(tableSize, inputDim);
+  MatrixPtr gpuTable = std::make_shared<GpuMatrix>(tableSize, inputDim);
+  cpuTable->randomizeUniform();
+  gpuTable->copyFrom(*cpuTable);
+
+  IVectorPtr cpuIds;
+  IVectorPtr gpuIds;
+  cpuIds = VectorT<int>::create(numSamples, false);
+  gpuIds = VectorT<int>::create(numSamples, true);
+  cpuIds->rand(tableSize);
+  gpuIds->copyFrom(*cpuIds);
+
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, inputDim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, inputDim);
+  cpuOutput->randomizeUniform();
+  gpuOutput->copyFrom(*cpuOutput);
+
+  cpuOutput->addToRows(*cpuTable, *cpuIds);
+  gpuOutput->addToRows(*gpuTable, *gpuIds);
+
+  TensorCheckErr(*cpuTable, *gpuTable);
+}
+
+TEST(Matrix, tableProjection) {
+  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
+    for (auto tableSize : {10, 100}) {
+      for (auto inputDim : {20, 50}) {
+        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
+                << " inputDim=" << inputDim;
+        testMatrixAddToRows(numSamples, tableSize, inputDim);
+      }
+    }
+  }
+}
+
+void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
+  int heightA = transa == false ? dimM : dimK;
+  int widthA = transa == false ? dimK : dimM;
+  int heightB = transb == false ? dimK : dimN;
+  int widthB = transb == false ? dimN : dimK;
+  int heightC = dimM;
+  int widthC = dimN;
+
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
+  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
+  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
+
+  real alpha = 1.5;
+  real beta = 2.0;
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuC->randomizeUniform();
+  gpuA->copyFrom(*cpuA);
+  gpuB->copyFrom(*cpuB);
+  gpuC->copyFrom(*cpuC);
+
+  cpuC->mul(*cpuA, *cpuB, alpha, beta);
+  gpuC->mul(*gpuA, *gpuB, alpha, beta);
+
+  TensorCheckErr(*cpuC, *gpuC);
+}
+
+void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
+  int heightA = transa == false ? dimM : dimK;
+  int widthA = transa == false ? dimK : dimM;
+  int heightB = transb == false ? dimK : dimN;
+  int widthB = transb == false ? dimN : dimK;
+  int heightC = dimM;
+  int widthC = dimN;
+
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
+  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
+  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
+
+  real alpha = 1.5;
+  real beta = 2.0;
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuC->randomizeUniform();
+  gpuA->copyFrom(*cpuA);
+  gpuB->copyFrom(*cpuB);
+  gpuC->copyFrom(*cpuC);
+
+  auto subSize = [](int& start, int& end, int dim) {
+    if (dim == 1) {
+      start = 0;
+      end = dim;
+    } else {
+      int subDim = rand() % (dim - 1) + 1;  // NOLINT
+      start = rand() % (dim - subDim);      // NOLINT
+      end = start + subDim;
+    }
+  };
+
+  auto subMatrix = [](MatrixPtr& sub,
+                      MatrixPtr matrix,
+                      size_t startRow,
+                      size_t endRow,
+                      size_t startCol,
+                      size_t endCol) {
+    if (!matrix->isTransposed()) {
+      sub = matrix->subMatrix(startRow, endRow, startCol, endCol);
+    } else {
+      sub = matrix->subMatrix(startCol, endCol, startRow, endRow);
+    }
+  };
+
+  int startM, endM;
+  int startN, endN;
+  int startK, endK;
+  subSize(startM, endM, dimM);
+  subSize(startN, endN, dimN);
+  subSize(startK, endK, dimK);
+
+  MatrixPtr subCpuA;
+  MatrixPtr subCpuB;
+  MatrixPtr subGpuA;
+  MatrixPtr subGpuB;
+  subMatrix(subCpuA, cpuA, startM, endM, startK, endK);
+  subMatrix(subGpuA, gpuA, startM, endM, startK, endK);
+  subMatrix(subCpuB, cpuB, startK, endK, startN, endN);
+  subMatrix(subGpuB, gpuB, startK, endK, startN, endN);
+  MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN);
+  MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN);
+
+  subCpuC->mul(*subCpuA, *subCpuB, alpha, beta);
+  subGpuC->mul(*subGpuA, *subGpuB, alpha, beta);
+
+  TensorCheckErr(*cpuC, *gpuC);
+}
+
+TEST(Matrix, mul) {
+  for (auto transa : {false, true}) {
+    for (auto transb : {false, true}) {
+      for (auto dimM : {1, 9, 53, 127, 345, 1023, 2135}) {
+        for (auto dimN : {1, 5, 37, 256, 1024}) {
+          for (auto dimK : {8, 45, 346, 784, 1025}) {
+            if (true == transa && true == transb) {
+              continue;
+            }
+            VLOG(3) << setiosflags(ios::left) << setfill(' ')
+                    << " transa=" << transa << " transb=" << transb
+                    << " dimM=" << setw(5) << dimM << " dimN=" << setw(5)
+                    << dimN << " dimK=" << setw(5) << dimK;
+
+            testMatrixMul(transa, transb, dimM, dimN, dimK);
+            testSubMatrixMul(transa, transb, dimM, dimN, dimK);
+          }
+        }
+      }
+    }
+  }
+}
+
+void testVectorRowFunc(int size) {
+  CpuVectorPtr cpu = std::make_shared<CpuVectorT<real>>(size);
+  GpuVectorPtr gpu = std::make_shared<GpuVectorT<real>>(size);
+
+  cpu->rand();
+  gpu->copyFrom(*cpu);
+
+  EXPECT_EQ(cpu->getMax(), gpu->getMax());
+  EXPECT_EQ(cpu->getMin(), gpu->getMin());
+  EXPECT_EQ(cpu->getAbsMax(), gpu->getAbsMax());
+}
+
+TEST(Vector, rowFunc) {
+  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
+    VLOG(3) << " size=" << size;
+    testVectorRowFunc(size);
+  }
+}
+
+template <class T>
+void testVectorReset(int size) {
+  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
+
+  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100));
+  cpu->reset(value);
+  gpu->reset(value);
+
+  TensorCheckEqual(*cpu, *gpu);
+}
+
+template <class T>
+void testVecortSelectFrom(int size) {
+  std::shared_ptr<CpuVectorT<T>> cpuDst = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpuDst = std::make_shared<GpuVectorT<T>>(size);
+  std::shared_ptr<CpuVectorT<T>> cpuSrc =
+      std::make_shared<CpuVectorT<T>>(size * 2);
+  std::shared_ptr<GpuVectorT<T>> gpuSrc =
+      std::make_shared<GpuVectorT<T>>(size * 2);
+  CpuIVectorPtr cpuIds = std::make_shared<CpuVectorT<int>>(size);
+  GpuIVectorPtr gpuIds = std::make_shared<GpuVectorT<int>>(size);
+
+  if (std::is_same<T, real>::value) {
+    cpuSrc->rand();
+  } else {
+    cpuSrc->rand(100000);
+  }
+  gpuSrc->copyFrom(*cpuSrc);
+  cpuIds->rand(size);
+  gpuIds->copyFrom(*cpuIds);
+
+  cpuDst->selectFrom(*cpuSrc, *cpuIds);
+  gpuDst->selectFrom(*gpuSrc, *gpuIds);
+
+  TensorCheckEqual(*cpuDst, *gpuDst);
+}
+
+template <class T>
+void testVecotrZeroMem(int size) {
+  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
+
+  cpu->zeroMem();
+  gpu->zeroMem();
+
+  TensorCheckEqual(*cpu, *gpu);
+}
+
+template <class T>
+void testVectorIsEqual(int size) {
+  std::shared_ptr<CpuVectorT<T>> cpuA = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<CpuVectorT<T>> cpuB = std::make_shared<CpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpuA = std::make_shared<GpuVectorT<T>>(size);
+  std::shared_ptr<GpuVectorT<T>> gpuB = std::make_shared<GpuVectorT<T>>(size);
+
+  if (std::is_same<T, real>::value) {
+    cpuB->rand();
+  } else {
+    cpuB->rand(100000);
+  }
+  gpuB->copyFrom(*cpuB);
+
+  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100));
+  cpuA->isEqualTo(*cpuB, value);
+  gpuA->isEqualTo(*gpuB, value);
+
+  TensorCheckEqual(*cpuA, *gpuA);
+}
+
+TEST(Vector, Equal) {
+  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
+    VLOG(3) << " size=" << size;
+    testVectorReset<int>(size);
+    testVectorReset<real>(size);
+    testVecortSelectFrom<int>(size);
+    testVecortSelectFrom<real>(size);
+    testVecotrZeroMem<int>(size);
+    testVecotrZeroMem<real>(size);
+    testVectorIsEqual<int>(size);
+    testVectorIsEqual<real>(size);
+  }
+}
+
+void testMatrixTopK(int samples, int dim, int beamSize) {
+  MatrixPtr cpuSrc = std::make_shared<CpuMatrix>(samples, dim);
+  MatrixPtr gpuSrc = std::make_shared<GpuMatrix>(samples, dim);
+  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
+  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
+  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
+  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
+
+  cpuSrc->randomizeUniform();
+  gpuSrc->copyFrom(*cpuSrc);
+
+  cpuSrc->rowMax(*cpuIds, *cpuVal);
+  gpuSrc->rowMax(*gpuIds, *gpuVal);
+
+  TensorCheckEqual(*cpuVal, *gpuVal);
+}
+
+TEST(Matrix, topK) {
+  for (auto samples : {1, 17, 131}) {  // prime numbers close to 1, 16, 127
+    for (auto dim : {1, 3, 997}) {     // prime numbers close to 1, 4, 1024
+      for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
+        if (beamSize > dim) continue;
+        VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
+                << " dim=" << dim;
+        testMatrixTopK(samples, dim, beamSize);
+      }
+    }
+  }
+}
+
+void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
+  int nnz = samples * dim * ratio;
+  if (nnz < 1) nnz = 1;  // Because sparseRand in MathUtil.cpp requires this.
+  MatrixPtr cpuSrc = std::make_shared<CpuSparseMatrix>(samples, dim, nnz);
+  MatrixPtr gpuSrc = std::make_shared<GpuSparseMatrix>(samples, dim, nnz);
+  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
+  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
+  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
+  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
+
+  cpuSrc->randomizeUniform();
+  gpuSrc->copyFrom(*cpuSrc);
+  cpuVal->zero();
+  cpuIds->zero();
+  gpuVal->zero();
+  gpuIds->zero();
+
+  cpuSrc->rowMax(*cpuIds, *cpuVal);
+  gpuSrc->rowMax(*gpuIds, *gpuVal);
+
+  TensorCheckEqual(*cpuVal, *gpuVal);
+
+  IVectorPtr outCheckIds = std::make_shared<CpuIVector>(samples * beamSize);
+  outCheckIds->copyFrom(*gpuIds);
+
+  const int* data1 = cpuIds->getData();
+  const int* data2 = outCheckIds->getData();
+  size_t size = cpuIds->getSize();
+  for (size_t i = 0; i < size; i++) {
+    if (data1[i] == -1 && data1[i] != data2[i]) {
+      EXPECT_EQ(data1[i], data2[i]);
+    }
+  }
+}
+
+TEST(SMatrix, topK) {
+  for (auto samples : {1, 3, 61}) {
+    for (auto dim : {1, 3, 61}) {
+      for (auto beamSize : {1, 3, 61}) {
+        for (auto ratio : {0.01, 0.001}) {
+          if (beamSize > dim) continue;
+          VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
+                  << " dim=" << dim << " ratio=" << ratio;
+          testSMatrixTopK(samples, dim, beamSize, ratio);
+        }
+      }
+    }
+  }
+}
+
+void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  int newBatchSize = cpuSequence->getSize() - 1;
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
+  cpuOutput->zero();
+  gpuOutput->zero();
+
+  cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode);
+  gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
+
+  TensorCheckErr(*cpuOutput, *gpuOutput);
+
+  MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInGrad->randomizeUniform();
+  gpuInGrad->copyFrom(*cpuInGrad);
+
+  cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
+  gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
+
+  TensorCheckErr(*cpuInGrad, *gpuInGrad);
+}
+
+TEST(Matrix, sequenceAvg) {
+  for (auto batchSize : {10, 128, 6000}) {
+    for (auto inputDim : {32, 100, 512}) {
+      for (auto mode : {0, 1, 2}) {
+        VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
+                << " mode=" << mode;
+        testMatrixSequenceAvg(batchSize, inputDim, mode);
+      }
+    }
+  }
+}
+
+void testParamReluBackwardDiff(int height,
+                               int width,
+                               int w_height,
+                               int w_width) {
+  MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
+  MatrixPtr input = CpuMatrix::create(height, width, false, false);
+  MatrixPtr diff = CpuMatrix::create(height, width, false, false);
+  MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
+
+  oGrad->randomizeUniform();
+  input->randomizeUniform();
+  w->randomizeUniform();
+  diff->randomizeUniform();
+  input->add(-0.5);
+
+  MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true);
+  MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
+  MatrixPtr diffGpu = CpuMatrix::create(height, width, false, true);
+  MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
+
+  oGradGpu->copyFrom(*oGrad);
+  inputGpu->copyFrom(*input);
+  wGpu->copyFrom(*w);
+  diffGpu->copyFrom(*diff);
+
+  diff->paramReluBackwardDiff(*oGrad, *input, *w);
+  diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu);
+
+  TensorCheckErr(*diff, *diffGpu);
+}
+
+TEST(Matrix, paramReluBackwardDiff) {
+  for (auto height : {10, 40, 100}) {
+    for (auto width : {10, 40, 100}) {
+      for (auto w_height : {1, 2}) {
+        for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
+          testParamReluBackwardDiff(height, width, w_height, w_width);
+        }
+      }
+    }
+  }
+}
+
+void testClassificationError(int numSamples, int dim, int topkSize) {
+  MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
+  MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
+  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
+  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
+  IVectorPtr cpuLabel = std::make_shared<CpuIVector>(numSamples);
+  IVectorPtr gpuLabel = std::make_shared<GpuIVector>(numSamples);
+
+  cpuOutput->randomizeUniform();
+  cpuLabel->rand(dim);
+  gpuOutput->copyFrom(*cpuOutput);
+  gpuLabel->copyFrom(*cpuLabel);
+
+  cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize);
+  gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize);
+
+  TensorCheckEqual(*cpuError, *gpuError);
+}
+
+TEST(Matrix, classificationError) {
+  for (auto numSamples : {1, 3, 31}) {
+    for (auto dim : {1, 3, 31}) {
+      for (auto topkSize : {1, 3, (int)rand() % dim + 1}) {
+        if (topkSize > dim) continue;
+        VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
+                << " dim= " << dim;
+        testClassificationError(numSamples, dim, topkSize);
+      }
+    }
+  }
+}
+
+void testMaxPoolFwdBwd(int numSamples,
+                       int channels,
+                       int imgSizeH,
+                       int imgSizeW,
+                       int ksizeH,
+                       int ksizeW,
+                       int strideH,
+                       int strideW,
+                       int padH,
+                       int padW) {
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = imgSizeH * imgSizeW * channels;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->maxPoolForward(*input,
+                         imgSizeH,
+                         imgSizeW,
+                         channels,
+                         ksizeW,
+                         ksizeH,
+                         strideH,
+                         strideW,
+                         outH,
+                         outW,
+                         padH,
+                         padW);
+  targetGpu->maxPoolForward(*inputGpu,
+                            imgSizeH,
+                            imgSizeW,
+                            channels,
+                            ksizeW,
+                            ksizeH,
+                            strideH,
+                            strideW,
+                            outH,
+                            outW,
+                            padH,
+                            padW);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  targetCheck->copyFrom(*targetGpu);
+  checkMatrixEqual(target, targetCheck);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->maxPoolBackward(*input,
+                             imgSizeH,
+                             imgSizeW,
+                             *targetGrad,
+                             *target,
+                             ksizeW,
+                             ksizeH,
+                             strideH,
+                             strideW,
+                             outH,
+                             outW,
+                             1.0,
+                             1.0,
+                             padH,
+                             padW);
+  inputGpuGrad->maxPoolBackward(*inputGpu,
+                                imgSizeH,
+                                imgSizeW,
+                                *targetGpuGrad,
+                                *targetGpu,
+                                ksizeW,
+                                ksizeH,
+                                strideH,
+                                strideW,
+                                outH,
+                                outW,
+                                1.0,
+                                1.0,
+                                padH,
+                                padW);
+  MatrixPtr targetBwdCheck =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+  targetBwdCheck->copyFrom(*inputGpuGrad);
+  checkMatrixEqual(inputGrad, targetBwdCheck);
+}
+
+void testAvgPoolFwdBwd(int numSamples,
+                       int channels,
+                       int imgSizeH,
+                       int imgSizeW,
+                       int ksizeH,
+                       int ksizeW,
+                       int strideH,
+                       int strideW,
+                       int padH,
+                       int padW) {
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = imgSizeH * imgSizeW * channels;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->avgPoolForward(*input,
+                         imgSizeH,
+                         imgSizeW,
+                         channels,
+                         ksizeW,
+                         ksizeH,
+                         strideH,
+                         strideW,
+                         outH,
+                         outW,
+                         padH,
+                         padW);
+  targetGpu->avgPoolForward(*inputGpu,
+                            imgSizeH,
+                            imgSizeW,
+                            channels,
+                            ksizeW,
+                            ksizeH,
+                            strideH,
+                            strideW,
+                            outH,
+                            outW,
+                            padH,
+                            padW);
+
+  TensorCheckErr(*target, *targetGpu);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->avgPoolBackward(*targetGrad,
+                             imgSizeH,
+                             imgSizeW,
+                             ksizeW,
+                             ksizeH,
+                             strideH,
+                             strideW,
+                             outH,
+                             outW,
+                             1.0,
+                             1.0,
+                             padH,
+                             padW);
+  inputGpuGrad->avgPoolBackward(*targetGpuGrad,
+                                imgSizeH,
+                                imgSizeW,
+                                ksizeW,
+                                ksizeH,
+                                strideH,
+                                strideW,
+                                outH,
+                                outW,
+                                1.0,
+                                1.0,
+                                padH,
+                                padW);
+
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
+}
+
+// TODO(yi): I noticed many such blindly combinatorial tests in this
+// file.  They are no help to locate defects at all.
+TEST(Matrix, PoolFwdBwd) {
+  for (auto numSamples : {1, 3}) {
+    for (auto channels : {1, 3}) {
+      for (auto imgSizeH : {13, 17}) {
+        for (auto imgSizeW : {17, 19}) {
+          for (auto sizeX : {2, 3}) {
+            for (auto sizeY : {2, 3}) {
+              for (auto sH : {1, 2}) {
+                for (auto sW : {1, 2}) {
+                  for (auto pH : {0, (sizeY - 1) / 2}) {
+                    for (auto pW : {0, (sizeX - 1) / 2}) {
+                      VLOG(3) << " numSamples=" << numSamples
+                              << " channels=" << channels
+                              << " imgSizeH=" << imgSizeH
+                              << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX
+                              << " sizeY=" << sizeY << " strideH=" << sH
+                              << " strideW=" << sW << " padingH=" << pH
+                              << " padingW=" << pW;
+                      testMaxPoolFwdBwd(numSamples,
+                                        channels,
+                                        imgSizeH,
+                                        imgSizeW,
+                                        sizeX,
+                                        sizeY,
+                                        sH,
+                                        sW,
+                                        pH,
+                                        pW);
+                      testAvgPoolFwdBwd(numSamples,
+                                        channels,
+                                        imgSizeH,
+                                        imgSizeW,
+                                        sizeX,
+                                        sizeY,
+                                        sH,
+                                        sW,
+                                        pH,
+                                        pW);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void testMaxOutFwdBwd(
+    int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) {
+  int inWidth = imgSizeH * imgSizeW * channels;
+  int outChannels = channels / groups;
+  int outWidth = imgSizeH * imgSizeW * outChannels;
+
+  // forward
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
+  IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
+
+  input->randomizeUniform();
+  inputGpu->copyFrom(*input);
+
+  target->maxoutForward(*input, *id, outChannels, groups);
+  targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
+
+  TensorCheckErr(*target, *targetGpu);
+  TensorCheckEqual(*id, *idGpu);
+
+  // backward
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
+  inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
+
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
+}
+
+TEST(Matrix, MaxOutFwdBwd) {
+  for (auto numSamples : {5, 10}) {
+    for (auto channels : {8, 16}) {
+      for (auto imgSizeH : {14, 28}) {
+        for (auto imgSizeW : {16, 30}) {
+          for (auto groups : {2, 4}) {
+            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
+                    << " groups=" << groups;
+            testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(CpuMatrix, copyFrom) {
+  const size_t height = 31;
+  const size_t width = 53;
+  CpuMatrix cpu(height, width);
+  GpuMatrix gpu(height, width);
+  CpuMatrix copy(height, width);
+
+  cpu.randomizeUniform();
+  gpu.copyFrom(cpu);
+  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
+
+  TensorCheckEqual(cpu, copy);
+}
+
+void testBatch2seqPadding(int batchSize, int inputDim) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  for (int i = 0; i < int(cpuSequence->getSize()); ++i) {
+    (cpuSequence->getData())[i] += 1;  // so no way that maxSeqLen is 0;
+  }
+
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  size_t numSeq = cpuSequence->getSize() - 1;
+  size_t maxSeqLen = *std::max_element(cpuSequence->getData(),
+                                       cpuSequence->getData() + numSeq);
+
+  printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen);
+  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+
+  // hl_sequence2batch_copy_padding(gBatch->getData(),
+  //                                gpuInput->getData(),
+  //                                cpuSequence->getData(),
+  //                                inputDim,
+  //                                maxSeqLen,
+  //                                numSeq,
+  //                                false,
+  //                                true);
+  // cCheck->copyFrom(*gBatch);
+
+  // int* seqStart = cpuSequence->getData();
+  // float* batchData = cBatch->getData();
+  // float* seqData = cpuInput->getData();
+  // for (size_t i = 0; i < maxSeqLen; i++) {
+  //   for (size_t j = 0; j < numSeq; j++) {
+  //     size_t sequenceStart = seqStart[j];
+  //     size_t sequenceLength = seqStart[j + 1] - seqStart[j];
+  //     if (i < sequenceLength) {
+  //       memcpy(batchData + (i * numSeq + j) * inputDim,
+  //              seqData + (sequenceStart + i) * inputDim,
+  //              inputDim * sizeof(real));
+  //     } else {
+  //       memset(batchData + (i * numSeq + j) * inputDim,
+  //              0,
+  //              inputDim * sizeof(real));
+  //     }
+  //   }
+  // }
+
+  // TensorCheckErr(*cBatch, *cCheck);
+}
+
+TEST(Matrix, warpCTC) {
+  for (auto batchSize : {1, 3, 17}) {
+    for (auto inputDim : {1, 3, 31}) {
+      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
+      testBatch2seqPadding(batchSize, inputDim);
+    }
+  }
+}
+
+void testMaxPool3DFwdBwd(int numSamples,
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = channels * imgSizeD * imgSizeH * imgSizeW;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->maxPool3DForward(*input,
+                           *maxIdx,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+  targetGpu->maxPool3DForward(*inputGpu,
+                              *maxIdxGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
+                              outD,
+                              outH,
+                              outW,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  targetCheck->copyFrom(*targetGpu);
+  checkMatrixEqual(target, targetCheck);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->maxPool3DBackward(*targetGrad,
+                               *maxIdx,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,
+                               1.0);
+  inputGpuGrad->maxPool3DBackward(*targetGpuGrad,
+                                  *maxIdxGpu,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,
+                                  1.0);
+  MatrixPtr targetBwdCheck =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+  targetBwdCheck->copyFrom(*inputGpuGrad);
+  checkMatrixEqual(inputGrad, targetBwdCheck);
+}
+
+void testAvgPool3DFwdBwd(int numSamples,
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = imgSizeD * imgSizeH * imgSizeW * channels;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->avgPool3DForward(*input,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+
+  targetGpu->avgPool3DForward(*inputGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
+                              outD,
+                              outH,
+                              outW,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+
+  TensorCheckErr(*target, *targetGpu);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->avgPool3DBackward(*targetGrad,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,
+                               1.0);
+
+  inputGpuGrad->avgPool3DBackward(*targetGpuGrad,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,
+                                  1.0);
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
+}
+
+// TODO(yi): I noticed many such blindly combinatorial tests in this
+// file.  They are no help to locate defects at all.
+TEST(Matrix, Pool3DFwdBwd) {
+  for (auto numSamples : {1, 3}) {
+    for (auto channels : {3}) {
+      for (auto imgSizeD : {9, 16}) {
+        for (auto imgSizeH : {9, 32}) {
+          for (auto imgSizeW : {9, 32}) {
+            for (auto sizeX : {3}) {
+              for (auto sizeY : {3}) {
+                for (auto sizeZ : {3}) {
+                  for (auto sD : {2}) {
+                    for (auto sH : {2}) {
+                      for (auto sW : {2}) {
+                        for (auto pD : {0, (sizeZ - 1) / 2}) {
+                          for (auto pH : {0, (sizeY - 1) / 2}) {
+                            for (auto pW : {0, (sizeX - 1) / 2}) {
+                              VLOG(3) << " numSamples=" << numSamples
+                                      << " channels=" << channels
+                                      << " imgSizeD=" << imgSizeD
+                                      << " imgSizeH=" << imgSizeH
+                                      << " imgSizeW=" << imgSizeW
+                                      << " sizeX=" << sizeX
+                                      << " sizeY=" << sizeY
+                                      << " sizeZ=" << sizeZ << " strideD=" << sD
+                                      << " strideH=" << sH << " strideW=" << sW
+                                      << " padingD=" << pD << " padingH=" << pH
+                                      << " padingW=" << pW;
+
+                              testMaxPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeX,
+                                                  sizeY,
+                                                  sizeZ,
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                              testAvgPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeX,
+                                                  sizeY,
+                                                  sizeZ,
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  //  for (auto numSamples : {1, 3}) {
+  //    for (auto channels : {1, 3}) {
+  //      for (auto imgSizeD : {9,16}) {
+  //      for (auto imgSizeH : {9, 32}) {
+  //        for (auto imgSizeW : {9, 32}) {
+  //          for (auto sizeX : {2, 3}) {
+  //            for (auto sizeY : {2, 3}) {
+  //            for (auto sizeZ : {2,3}){
+  //              for (auto sD : {1, 2}) {
+  //              for (auto sH : {1, 2}) {
+  //                for (auto sW : {1, 2}) {
+  //                  for (auto pD : {0, (sizeZ - 1) / 2}){
+  //                  for (auto pH : {0, (sizeY - 1) / 2}) {
+  //                    for (auto pW : {0, (sizeX - 1) / 2}) {
+  //                      VLOG(3) << " numSamples=" << numSamples
+  //                              << " channels=" << channels
+  //                              << " imgSizeD=" << imgSizeD
+  //                              << " imgSizeH=" << imgSizeH
+  //                              << " imgSizeW=" << imgSizeW
+  //                              << " sizeX=" << sizeX
+  //                              << " sizeY=" << sizeY
+  //                              << " sizeZ=" << sizeZ
+  //                              << " strideD=" << sD
+  //                              << " strideH=" << sH
+  //                              << " strideW=" << sW
+  //                              << " padingD=" << pD
+  //                              << " padingH=" << pH
+  //                              << " padingW=" << pW;
+  //
+  //                      testMaxPool3DFwdBwd(numSamples,
+  //                                        channels,
+  //                                        imgSizeD,
+  //                                        imgSizeH,
+  //                                        imgSizeW,
+  //                                        sizeX,
+  //                                        sizeY,
+  //                                        sizeZ,
+  //                                        sD,
+  //                                        sH,
+  //                                        sW,
+  //                                        pD,
+  //                                        pH,
+  //                                        pW);
+  //                      testAvgPool3DFwdBwd(numSamples,
+  //                                        channels,
+  //                                        imgSizeD,
+  //                                        imgSizeH,
+  //                                        imgSizeW,
+  //                                        sizeX,
+  //                                        sizeY,
+  //                                        sizeZ,
+  //                                        sD,
+  //                                        sH,
+  //                                        sW,
+  //                                        pD,
+  //                                        pH,
+  //                                        pW);
+  //                    }
+  //                  }
+  //                }
+  //              }
+  //            }
+  //            }
+  //          }
+  //        }
+  //      }
+  //      }
+  //    }
+  //    }
+  //  }
+  //  }
+}
+
+void testMatrixCol2Vol(int depth, int height, int width) {
+  int channel = 3;
+  int filterX = 3, filterY = 4, filterZ = 5;
+  int strideX = 2, strideY = 2, strideZ = 2;
+  int padX = 1, padY = 1, padZ = 1;
+
+  MatrixPtr cpuImage =
+      std::make_shared<CpuMatrix>(channel, depth * height * width);
+  MatrixPtr gpuImage =
+      std::make_shared<GpuMatrix>(channel, depth * height * width);
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+
+  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
+  int outH = outputSize(height, filterY, padY, strideY, true);
+  int outW = outputSize(width, filterX, padX, strideX, true);
+
+  int colBufHeight = channel * filterZ * filterY * filterX;
+  int colBufWidth = outD * outH * outW;
+  MatrixPtr cpuColBuf = std::make_shared<CpuMatrix>(colBufHeight, colBufWidth);
+  MatrixPtr gpuColBuf = std::make_shared<GpuMatrix>(colBufHeight, colBufWidth);
+  cpuColBuf->vol2Col(cpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX);
+  gpuColBuf->vol2Col(gpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX);
+  TensorCheckEqual(*cpuColBuf, *gpuColBuf);
+
+  cpuColBuf->randomizeUniform();
+  gpuColBuf->copyFrom(*cpuColBuf);
+  cpuColBuf->col2Vol(cpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX,
+                     1.0,
+                     1.0);
+  gpuColBuf->col2Vol(gpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX,
+                     1.0,
+                     1.0);
+  TensorCheckErr(*cpuImage, *gpuImage);
+}
+
+TEST(Matrix, col2Vol) {
+  for (auto depth : {9, 16, 64}) {
+    for (auto height : {9, 11, 128}) {
+      for (auto width : {9, 32, 128}) {
+        VLOG(3) << "depth=" << depth << " height=" << height
+                << " width=" << width;
+        testMatrixCol2Vol(depth, height, width);
+      }
+    }
+  }
+}
+
+#endif
diff --git a/paddle/legacy/math/tests/test_matrixUtil.h b/paddle/legacy/math/tests/test_matrixUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..58c93f746e7ef4e2f2f98d4f410c74909a723812
--- /dev/null
+++ b/paddle/legacy/math/tests/test_matrixUtil.h
@@ -0,0 +1,233 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <gtest/gtest.h>
+#include <paddle/legacy/utils/Util.h>
+#include "paddle/legacy/math/SparseMatrix.h"
+
+namespace paddle {
+
+void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
+  ASSERT_EQ(a->getWidth(), b->getWidth());
+  ASSERT_EQ(a->getHeight(), b->getHeight());
+  ASSERT_EQ(a->isTransposed(), b->isTransposed());
+  for (size_t r = 0; r < a->getHeight(); ++r) {
+    for (size_t c = 0; c < a->getWidth(); ++c) {
+      ASSERT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c));
+    }
+  }
+}
+
+void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) {
+  ASSERT_EQ(a.getWidth(), b.getWidth());
+  ASSERT_EQ(a.getHeight(), b.getHeight());
+  ASSERT_EQ(a.isTransposed(), b.isTransposed());
+  ASSERT_EQ(a.getFormat(), b.getFormat());
+  ASSERT_EQ(a.getElementCnt(), b.getElementCnt());
+  for (size_t r = 0; r < a.getElementCnt(); ++r) {
+    ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]);
+  }
+}
+
+void checkSMatrixEqual(const CpuSparseMatrixPtr& a,
+                       const CpuSparseMatrixPtr& b) {
+  ASSERT_EQ(a->getWidth(), b->getWidth());
+  ASSERT_EQ(a->getHeight(), b->getHeight());
+  ASSERT_EQ(a->isTransposed(), b->isTransposed());
+  ASSERT_EQ(a->getFormat(), b->getFormat());
+  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
+  for (size_t r = 0; r < a->getElementCnt(); ++r) {
+    ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
+  }
+}
+
+void checkSMatrixEqual2(const CpuSparseMatrixPtr& a,
+                        const CpuSparseMatrixPtr& b) {
+  ASSERT_EQ(a->getWidth(), b->getWidth());
+  ASSERT_EQ(a->getHeight(), b->getHeight());
+  ASSERT_EQ(a->isTransposed(), b->isTransposed());
+  ASSERT_EQ(a->getFormat(), b->getFormat());
+  ASSERT_EQ(a->getValueType(), b->getValueType());
+  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
+  if (a->getFormat() == SPARSE_CSR) {
+    for (size_t r = 0; r < a->getElementCnt(); ++r) {
+      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
+      if (a->getValueType() == FLOAT_VALUE) {
+        ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
+      }
+    }
+    for (size_t r = 0; r <= a->getHeight(); r++) {
+      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
+    }
+  } else {
+    for (size_t r = 0; r < a->getElementCnt(); ++r) {
+      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
+      if (a->getValueType() == FLOAT_VALUE) {
+        ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
+      }
+    }
+    for (size_t r = 0; r <= a->getWidth(); r++) {
+      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
+    }
+  }
+}
+
+void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) {
+  ASSERT_EQ(a.getWidth(), b.getWidth());
+  ASSERT_EQ(a.getHeight(), b.getHeight());
+  ASSERT_EQ(a.isTransposed(), b.isTransposed());
+
+  if (a.getFormat() == SPARSE_CSC) {
+    int* rows = a.getRows();
+    for (size_t i = 0; i < a.getWidth(); i++) {
+      for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) {
+        if (a.getValueType() == FLOAT_VALUE) {
+          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i));
+        } else {
+          ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i));
+        }
+      }
+    }
+  } else {
+    int* cols = a.getCols();
+    for (size_t i = 0; i < a.getHeight(); i++) {
+      for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) {
+        if (a.getValueType() == FLOAT_VALUE) {
+          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j]));
+        } else {
+          ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j]));
+        }
+      }
+    }
+  }
+}
+
+void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a,
+                             const CpuMatrixPtr& b) {
+  ASSERT_EQ(a->getWidth(), b->getWidth());
+  ASSERT_EQ(a->getHeight(), b->getHeight());
+  ASSERT_EQ(a->isTransposed(), b->isTransposed());
+
+  if (a->getFormat() == SPARSE_CSC) {
+    int* rows = a->getRows();
+    for (size_t i = 0; i < a->getWidth(); i++) {
+      for (size_t j = a->getColStartIdx(i); j < a->getColStartIdx(i + 1); j++) {
+        if (a->getValueType() == FLOAT_VALUE) {
+          ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(rows[j], i));
+        } else {
+          ASSERT_FLOAT_EQ(1.0, b->getElement(rows[j], i));
+        }
+      }
+    }
+  } else {
+    int* cols = a->getCols();
+    for (size_t i = 0; i < a->getHeight(); i++) {
+      for (size_t j = a->getRowStartIdx(i); j < a->getRowStartIdx(i + 1); j++) {
+        if (a->getValueType() == FLOAT_VALUE) {
+          ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(i, cols[j]));
+        } else {
+          ASSERT_FLOAT_EQ(1.0, b->getElement(i, cols[j]));
+        }
+      }
+    }
+  }
+}
+
+void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) {
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+  ASSERT_EQ(a->getWidth(), b->getWidth());
+  ASSERT_EQ(a->getHeight(), b->getHeight());
+  ASSERT_EQ(a->isTransposed(), b->isTransposed());
+  ASSERT_EQ(a->getFormat(), b->getFormat());
+  ASSERT_EQ(a->getValueType(), b->getValueType());
+  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
+  int count = 0;
+  if (a->getFormat() == SPARSE_CSR) {
+    for (size_t r = 0; r < a->getElementCnt(); ++r) {
+      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
+      if (a->getValueType() == FLOAT_VALUE) {
+        real aVal = a->getValue()[r];
+        real bVal = b->getValue()[r];
+        if (std::abs(aVal - bVal) > err) {
+          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
+            LOG(INFO) << "a=" << aVal << "\t"
+                      << "b=" << bVal;
+            count++;
+          }
+        }
+      }
+    }
+    for (size_t r = 0; r <= a->getHeight(); r++) {
+      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
+    }
+  } else {
+    for (size_t r = 0; r < a->getElementCnt(); ++r) {
+      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
+      if (a->getValueType() == FLOAT_VALUE) {
+        real aVal = a->getValue()[r];
+        real bVal = b->getValue()[r];
+        if (std::abs(aVal - bVal) > err) {
+          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
+            count++;
+          }
+        }
+      }
+    }
+    for (size_t r = 0; r <= a->getWidth(); r++) {
+      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
+void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) {
+  CHECK(matrix1.getHeight() == matrix2.getHeight());
+  CHECK(matrix1.getWidth() == matrix2.getWidth());
+#ifndef PADDLE_TYPE_DOUBLE
+  real err = 1e-3;
+#else
+  real err = 1e-10;
+#endif
+
+  int height = matrix1.getHeight();
+  int width = matrix1.getWidth();
+  const real* data1 = matrix1.getData();
+  const real* data2 = matrix2.getData();
+  int count = 0;
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      real a = data1[i * width + j];
+      real b = data2[i * width + j];
+      if (std::abs(a - b) > err) {
+        if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) {
+          count++;
+        }
+      }
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+}
+
+void checkDataEqual(const real* a, const real* b, size_t size) {
+  for (size_t i = 0; i < size; ++i) {
+    ASSERT_FLOAT_EQ(a[i], b[i]);
+  }
+}
+
+}  //  namespace paddle
diff --git a/paddle/legacy/math/tests/test_perturbation.cpp b/paddle/legacy/math/tests/test_perturbation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..969400666f12e4c6001f270be3ec144e7e4d0702
--- /dev/null
+++ b/paddle/legacy/math/tests/test_perturbation.cpp
@@ -0,0 +1,318 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+
+#include <cuda_runtime.h>
+#include <gtest/gtest.h>
+#include <cmath>
+#include <vector>
+#include "hl_cuda.h"
+#include "hl_perturbation_util.cuh"
+
+using namespace std;  // NOLINT
+
+#define _USE_MATH_DEFINES
+
+const int NUM_IMAGES = 2;
+const int SAMPLING_RATE = 2;
+const int IMG_SIZE = 41;
+const int TGT_SIZE = 21;
+const int CHANNELS = 3;
+
+class PerturbationTest : public testing::Test {
+ protected:
+  virtual void SetUp() { generateTestImages(gpuImages_); }
+
+  virtual void TearDown() {}
+
+  void allocateMem(real*& gpuAngle,
+                   real*& gpuScale,
+                   int*& gpuCenterR,
+                   int*& gpuCenterC) {
+    gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
+    gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
+    gpuCenterR =
+        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
+    gpuCenterC =
+        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
+  }
+
+  // Generate translation parameters for testing.
+  void generateTranslationParams(int*& gpuCenterR,
+                                 int*& gpuCenterC,
+                                 int imgSize) {
+    int cpuCenterR[NUM_IMAGES * SAMPLING_RATE];
+    int cpuCenterC[NUM_IMAGES * SAMPLING_RATE];
+    for (int i = 0; i < NUM_IMAGES * SAMPLING_RATE; ++i) {
+      cpuCenterR[i] = (imgSize - 1) / 2;
+      cpuCenterC[i] = (imgSize - 1) / 2 - 1;
+    }
+
+    gpuCenterR =
+        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
+    hl_memcpy_host2device(
+        gpuCenterR, cpuCenterR, sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
+
+    gpuCenterC =
+        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
+    hl_memcpy_host2device(
+        gpuCenterC, cpuCenterC, sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
+  }
+
+  // Generate rotation parameters for testing.
+  void generateRotationParams(real*& gpuAngle) {
+    real cpuAngle[NUM_IMAGES];
+    for (int i = 0; i < NUM_IMAGES; ++i) {
+      cpuAngle[i] = 90.0 * M_PI / 180.0;
+    }
+    gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
+    hl_memcpy_host2device(gpuAngle, cpuAngle, sizeof(real) * NUM_IMAGES);
+  }
+
+  void generateScaleParams(real*& gpuScale) {
+    real cpuScale[NUM_IMAGES];
+    for (int i = 0; i < NUM_IMAGES; ++i) {
+      cpuScale[i] = static_cast<real>(TGT_SIZE - 2) / TGT_SIZE;
+    }
+    gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
+    hl_memcpy_host2device(gpuScale, cpuScale, sizeof(real) * NUM_IMAGES);
+  }
+
+  // Generate the test images, only the center regions are set to 1.
+  // The other parts are set to 0.
+  void generateTestImages(real*& gpuImages) {
+    const int IMAGE_MEM_SIZE = NUM_IMAGES * IMG_SIZE * IMG_SIZE * CHANNELS;
+    real cpuImages[IMAGE_MEM_SIZE];
+    // Set the middle of each image to 1.
+    real* ptr = cpuImages;
+    for (int i = 0; i < NUM_IMAGES; ++i) {
+      for (int r = 0; r < IMG_SIZE; ++r) {
+        for (int c = 0; c < IMG_SIZE; ++c) {
+          for (int ch = 0; ch < CHANNELS; ++ch) {
+            if (r >= IMG_SIZE / 4 && r < IMG_SIZE - IMG_SIZE / 4 &&
+                c >= IMG_SIZE / 4 && c < IMG_SIZE - IMG_SIZE / 4) {
+              *ptr = 1.0;
+            } else {
+              *ptr = 0.0;
+            }
+            ++ptr;
+          }
+        }
+      }
+    }
+    gpuImages = (real*)hl_malloc_device(sizeof(real) * IMAGE_MEM_SIZE);
+    hl_memcpy_host2device(gpuImages, cpuImages, sizeof(real) * IMAGE_MEM_SIZE);
+  }
+
+  real* gpuImages_;
+};
+
+// Random perturbation. Only to make sure the code does not break.
+TEST_F(PerturbationTest, random_perturb) {
+  real *gpuAngle, *gpuScaleRatio;
+  int *gpuCenterR, *gpuCenterC;
+  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
+
+  real* targets = NULL;
+  const int TARGET_MEM_SIZE =
+      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
+  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
+  hl_conv_random_disturb(gpuImages_,
+                         IMG_SIZE,
+                         TGT_SIZE,
+                         CHANNELS,
+                         NUM_IMAGES,
+                         1.0,
+                         1.0,
+                         SAMPLING_RATE,
+                         gpuAngle,
+                         gpuScaleRatio,
+                         gpuCenterR,
+                         gpuCenterC,
+                         2,
+                         true,
+                         targets);
+  real cpuTargets[TARGET_MEM_SIZE];
+  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
+}
+
+TEST_F(PerturbationTest, identity_perturb) {
+  real *gpuAngle, *gpuScaleRatio;
+  int *gpuCenterR, *gpuCenterC;
+  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
+
+  real* targets = NULL;
+  const int TARGET_MEM_SIZE =
+      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
+  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
+  hl_conv_random_disturb(gpuImages_,
+                         IMG_SIZE,
+                         TGT_SIZE,
+                         CHANNELS,
+                         NUM_IMAGES,
+                         1.0,
+                         1.0,
+                         SAMPLING_RATE,
+                         gpuAngle,
+                         gpuScaleRatio,
+                         gpuCenterR,
+                         gpuCenterC,
+                         2,
+                         false,
+                         targets);
+  real cpuTargets[TARGET_MEM_SIZE];
+  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
+  for (int i = 0; i < TARGET_MEM_SIZE; ++i) {
+    EXPECT_FLOAT_EQ(1.0, cpuTargets[i]);
+  }
+}
+
+TEST_F(PerturbationTest, translation_test) {
+  real *gpuAngle, *gpuScaleRatio;
+  int *gpuCenterR, *gpuCenterC;
+  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
+  hl_generate_disturb_params(gpuAngle,
+                             gpuScaleRatio,
+                             gpuCenterR,
+                             gpuCenterC,
+                             NUM_IMAGES,
+                             IMG_SIZE,
+                             0.0,
+                             0.0,
+                             SAMPLING_RATE,
+                             false);
+  generateTranslationParams(gpuCenterR, gpuCenterC, IMG_SIZE);
+
+  real* targets = NULL;
+  const int TARGET_MEM_SIZE =
+      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
+  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
+  hl_conv_random_disturb_with_params(gpuImages_,
+                                     IMG_SIZE,
+                                     TGT_SIZE,
+                                     CHANNELS,
+                                     NUM_IMAGES,
+                                     SAMPLING_RATE,
+                                     gpuAngle,
+                                     gpuScaleRatio,
+                                     gpuCenterR,
+                                     gpuCenterC,
+                                     2,
+                                     targets);
+
+  real cpuTargets[TARGET_MEM_SIZE];
+  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
+  for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) {
+    for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) {
+      const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p;
+      if (p < TGT_SIZE * CHANNELS) {
+        EXPECT_FLOAT_EQ(0.0, cpuTargets[offset]);
+      } else {
+        EXPECT_FLOAT_EQ(1.0, cpuTargets[offset]);
+      }
+    }
+  }
+}
+
+TEST_F(PerturbationTest, rotation_test) {
+  real *gpuAngle, *gpuScaleRatio;
+  int *gpuCenterR, *gpuCenterC;
+  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
+  hl_generate_disturb_params(gpuAngle,
+                             gpuScaleRatio,
+                             gpuCenterR,
+                             gpuCenterC,
+                             NUM_IMAGES,
+                             IMG_SIZE,
+                             0.0,
+                             0.0,
+                             SAMPLING_RATE,
+                             false);
+  generateRotationParams(gpuAngle);
+
+  real* targets = NULL;
+  const int TARGET_MEM_SIZE =
+      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
+  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
+  hl_conv_random_disturb_with_params(gpuImages_,
+                                     IMG_SIZE,
+                                     TGT_SIZE,
+                                     CHANNELS,
+                                     NUM_IMAGES,
+                                     SAMPLING_RATE,
+                                     gpuAngle,
+                                     gpuScaleRatio,
+                                     gpuCenterR,
+                                     gpuCenterC,
+                                     2,
+                                     targets);
+
+  real cpuTargets[TARGET_MEM_SIZE];
+  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
+  for (int i = 0; i < TARGET_MEM_SIZE; ++i) {
+    EXPECT_FLOAT_EQ(1.0, cpuTargets[i]);
+  }
+}
+
+TEST_F(PerturbationTest, scale_test) {
+  real *gpuAngle, *gpuScaleRatio;
+  int *gpuCenterR, *gpuCenterC;
+  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
+  hl_generate_disturb_params(gpuAngle,
+                             gpuScaleRatio,
+                             gpuCenterR,
+                             gpuCenterC,
+                             NUM_IMAGES,
+                             IMG_SIZE,
+                             0.0,
+                             0.0,
+                             SAMPLING_RATE,
+                             false);
+  generateScaleParams(gpuScaleRatio);
+
+  real* targets = NULL;
+  const int TARGET_MEM_SIZE =
+      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
+  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
+  hl_conv_random_disturb_with_params(gpuImages_,
+                                     IMG_SIZE,
+                                     TGT_SIZE,
+                                     CHANNELS,
+                                     NUM_IMAGES,
+                                     SAMPLING_RATE,
+                                     gpuAngle,
+                                     gpuScaleRatio,
+                                     gpuCenterR,
+                                     gpuCenterC,
+                                     2,
+                                     targets);
+
+  real cpuTargets[TARGET_MEM_SIZE];
+  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
+  for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) {
+    for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) {
+      const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p;
+      int c = (p / CHANNELS) % TGT_SIZE;
+      int r = (p / CHANNELS) / TGT_SIZE;
+      if (r == 0 || r == TGT_SIZE - 1 || c == 0 || c == TGT_SIZE - 1) {
+        EXPECT_FLOAT_EQ(0.0, cpuTargets[offset]);
+      } else {
+        EXPECT_FLOAT_EQ(1.0, cpuTargets[offset]);
+      }
+    }
+  }
+}
+
+#endif
diff --git a/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp b/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..492aa0a689540dbb2c687326ff8a2919d89d2e6f
--- /dev/null
+++ b/paddle/legacy/math/tests/test_sparseMatrixCompare.cpp
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_CUDA
+/// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result,
+//  so disable when
+/// only cpu version.
+
+#include <gtest/gtest.h>
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/utils/Util.h"
+#include "test_matrixUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; }
+
+void testSpMatrixAddBias(int M, int N, real rate, real scale) {
+  int nnz = M * N * rate;
+
+  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
+
+  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_1);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  hl_stream_synchronize(stream);
+
+  cpuA->addBias(*cpuB, scale);
+  gpuA->addBias(*gpuB, scale);
+
+  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
+  outputCheck->copyFrom(*gpuA, stream);
+  hl_stream_synchronize(stream);
+  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
+                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
+}
+
+void testSpMatrixAddDense(int M, int N, real rate) {  // add3
+  int nnz = M * N * rate;
+
+  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(M, N);
+
+  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(M, N);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  hl_stream_synchronize(stream);
+
+  cpuA->add3(cpuB);
+  gpuA->add3(gpuB);
+
+  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
+  outputCheck->copyFrom(*gpuA, stream);
+  hl_stream_synchronize(stream);
+  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
+                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
+}
+
+void testSpMatrixMul(int M, int N, int K, real rate) {
+  int nnz = M * N * rate;
+
+  MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
+  MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
+
+  MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
+  MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+  cpuC->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  gpuC->copyFrom(*cpuC, stream);
+  hl_stream_synchronize(stream);
+
+  cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1);
+  gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1);
+
+  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
+  outputCheck->copyFrom(*gpuC, stream);
+  hl_stream_synchronize(stream);
+  checkSMatrixErr(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuC),
+                  std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
+}
+
+void testSpMatrixCollectBias(int M, int N, real rate) {
+  int nnz = M * N * rate;
+  LOG(INFO) << "nnz=" << nnz;
+
+  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
+  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
+
+  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
+  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
+
+  cpuA->randomizeUniform();
+  cpuB->randomizeUniform();
+
+  hl_stream_t stream(HPPL_STREAM_3);
+  gpuA->copyFrom(*cpuA, stream);
+  gpuB->copyFrom(*cpuB, stream);
+  hl_stream_synchronize(stream);
+
+  cpuB->collectBias(*cpuA, 1);
+  gpuB->collectBias(*gpuA, 1);
+
+  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, N);
+  outputCheck->copyFrom(*gpuB, stream);
+  hl_stream_synchronize(stream);
+  checkMatrixErr(*cpuB, *outputCheck);
+}
+
+TEST(SMatrix, sMatrixOp) {
+  for (auto height : {1, 11, 200}) {
+    for (auto width : {200, 2048, 20480}) {
+      VLOG(3) << " height=" << height << " width=" << width;
+      for (auto rate : {0.02, 0.1}) {
+        testSpMatrixAddDense(height, width, rate);
+        testSpMatrixAddBias(height, width, rate, 1.0);
+      }
+    }
+  }
+}
+
+TEST(SMatrix, sMatrixMul) {
+  for (auto M : {1, 40, 128, 200}) {
+    for (auto N : {100, 2000, 20480}) {
+      for (auto K : {100, 512, 1024}) {
+        VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
+        testSpMatrixMul(M, N, K, 0.05);
+      }
+    }
+  }
+}
+
+TEST(SMatrix, sMatrixCollectBias) {
+  for (auto height : {1, 128, 200}) {
+    for (auto width : {100, 2048, 20480}) {
+      VLOG(3) << " height=" << height << " width=" << width;
+      testSpMatrixCollectBias(height, width, 0.1);
+    }
+  }
+}
+
+#endif
diff --git a/paddle/legacy/optimizer/CMakeLists.txt b/paddle/legacy/optimizer/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7c80faa48ce960a3a7eb7d88eda4f2b09756410e
--- /dev/null
+++ b/paddle/legacy/optimizer/CMakeLists.txt
@@ -0,0 +1,16 @@
+set(OPITMIZER_SRCS
+    adadelta_optimizer.cc
+    adagrad_optimizer.cc
+    adam_optimizer.cc
+    optimizer.cc
+    parameter_optimizer.cc
+    sgd_optimizer.cc
+  )
+
+add_library(paddle_optimizer ${OPITMIZER_SRCS})
+target_link_libraries(paddle_optimizer paddle_proto glog)
+
+if (WITH_TESTING)
+    add_unittest(serialization_test serialization_test.cc)
+    add_unittest(parameter_optimizer_test parameter_optimizer_test.cc)
+endif()
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/legacy/optimizer/adadelta_optimizer.cc
similarity index 100%
rename from paddle/optimizer/adadelta_optimizer.cc
rename to paddle/legacy/optimizer/adadelta_optimizer.cc
diff --git a/paddle/legacy/optimizer/adadelta_optimizer.h b/paddle/legacy/optimizer/adadelta_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..5beb62295a83ba4826e9a6b9caf21de78d2e8ced
--- /dev/null
+++ b/paddle/legacy/optimizer/adadelta_optimizer.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class AdadeltaOptimizer : public ParameterOptimizer {
+ public:
+  AdadeltaOptimizer(
+      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
+      : ParameterOptimizer(parameter, lr),
+        accum_gradient_(new Tensor(parameter->size())),
+        accum_delta_(new Tensor(parameter->size())),
+        update_delta_(new Tensor(parameter->size())),
+        rho_(rho),
+        epsilon_(epsilon),
+        decay_(decay) {}
+
+  ~AdadeltaOptimizer() {
+    if (accum_gradient_) delete accum_gradient_;
+    if (accum_delta_) delete accum_delta_;
+    if (update_delta_) delete update_delta_;
+  }
+  void Update(const Tensor *gradient);
+  std::string SerializeState();
+  void DeserializeState(const std::string &state);
+
+ private:
+  Tensor *accum_gradient_;
+  Tensor *accum_delta_;
+  Tensor *update_delta_;
+  double rho_;
+  double epsilon_;
+  double decay_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/legacy/optimizer/adagrad_optimizer.cc
similarity index 100%
rename from paddle/optimizer/adagrad_optimizer.cc
rename to paddle/legacy/optimizer/adagrad_optimizer.cc
diff --git a/paddle/legacy/optimizer/adagrad_optimizer.h b/paddle/legacy/optimizer/adagrad_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6fc06739970984cf4bbd27d3e6e1e9066bc350f
--- /dev/null
+++ b/paddle/legacy/optimizer/adagrad_optimizer.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class AdagradOptimizer : public ParameterOptimizer {
+ public:
+  AdagradOptimizer(Tensor *parameter,
+                   LrPolicy *lr,
+                   double epsilon,
+                   double decay)
+      : ParameterOptimizer(parameter, lr),
+        accum_gradient_(new Tensor(parameter->size())),
+        epsilon_(epsilon),
+        decay_(decay) {}
+  ~AdagradOptimizer() {
+    if (accum_gradient_) delete accum_gradient_;
+  }
+  void Update(const Tensor *gradient);
+  std::string SerializeState();
+  void DeserializeState(const std::string &state);
+
+ private:
+  Tensor *accum_gradient_;
+  double epsilon_;
+  double decay_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/legacy/optimizer/adam_optimizer.cc
similarity index 100%
rename from paddle/optimizer/adam_optimizer.cc
rename to paddle/legacy/optimizer/adam_optimizer.cc
diff --git a/paddle/legacy/optimizer/adam_optimizer.h b/paddle/legacy/optimizer/adam_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fce10960068364b40592b26a6b439494d75cfa03
--- /dev/null
+++ b/paddle/legacy/optimizer/adam_optimizer.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class AdamOptimizer : public ParameterOptimizer {
+ public:
+  AdamOptimizer(Tensor *parameter,
+                LrPolicy *lr,
+                double beta_1,
+                double beta_2,
+                double epsilon,
+                double decay)
+      : ParameterOptimizer(parameter, lr),
+        momentums_(new Tensor(parameter->size())),
+        velocitys_(new Tensor(parameter->size())),
+        beta_1_(beta_1),
+        beta_2_(beta_2),
+        epsilon_(epsilon),
+        decay_(decay) {}
+  ~AdamOptimizer() {
+    if (momentums_) delete momentums_;
+    if (velocitys_) delete velocitys_;
+  }
+  void Update(const Tensor *gradient);
+  std::string SerializeState();
+  void DeserializeState(const std::string &state);
+
+ private:
+  Tensor *momentums_;
+  Tensor *velocitys_;
+  double beta_1_;
+  double beta_2_;
+  double epsilon_;
+  double decay_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/legacy/optimizer/lr_policy.h b/paddle/legacy/optimizer/lr_policy.h
new file mode 100644
index 0000000000000000000000000000000000000000..d639c9f22c8ad77267f68e2c3b35257211bf90df
--- /dev/null
+++ b/paddle/legacy/optimizer/lr_policy.h
@@ -0,0 +1,82 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <algorithm>
+#include "OptimizerConfig.pb.h"
+
+namespace paddle {
+namespace optimizer {
+
+class LrPolicy {
+ public:
+  virtual ~LrPolicy() {}
+  virtual double LearningRate(const uint64_t num_sample_passed) = 0;
+  virtual std::string SerializeState() = 0;
+  virtual void DeserializeState(const std::string &state) = 0;
+};
+
+// constant learning rate policy
+class ConstLr final : public LrPolicy {
+ public:
+  ConstLr(double lr) : learning_rate_(lr){};
+  double LearningRate(const uint64_t num_sample_passed) {
+    return learning_rate_;
+  }
+  std::string SerializeState() {
+    LrPolicyState state;
+    state.set_learning_rate(learning_rate_);
+    return state.SerializeAsString();
+  }
+  void DeserializeState(const std::string &str) {
+    LrPolicyState state;
+    state.ParseFromString(str);
+    learning_rate_ = state.learning_rate();
+  }
+
+ private:
+  double learning_rate_;
+};
+
+class LinearLr final : public LrPolicy {
+ public:
+  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
+      : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
+  double LearningRate(const uint64_t num_sample_passed) {
+    return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed,
+                    lr_decay_b_);
+  }
+  std::string SerializeState() {
+    LrPolicyState state;
+    state.set_learning_rate(learning_rate_);
+    state.set_lr_decay_a(lr_decay_a_);
+    state.set_lr_decay_b(lr_decay_b_);
+    return state.SerializeAsString();
+  }
+  void DeserializeState(const std::string &str) {
+    LrPolicyState state;
+    state.ParseFromString(str);
+    learning_rate_ = state.learning_rate();
+    lr_decay_a_ = state.lr_decay_a();
+    lr_decay_b_ = state.lr_decay_b();
+  }
+
+ private:
+  double learning_rate_;
+  double lr_decay_a_;
+  double lr_decay_b_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/optimizer.cc b/paddle/legacy/optimizer/optimizer.cc
similarity index 100%
rename from paddle/optimizer/optimizer.cc
rename to paddle/legacy/optimizer/optimizer.cc
diff --git a/paddle/optimizer/optimizer.h b/paddle/legacy/optimizer/optimizer.h
similarity index 100%
rename from paddle/optimizer/optimizer.h
rename to paddle/legacy/optimizer/optimizer.h
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/legacy/optimizer/parameter_optimizer.cc
similarity index 100%
rename from paddle/optimizer/parameter_optimizer.cc
rename to paddle/legacy/optimizer/parameter_optimizer.cc
diff --git a/paddle/legacy/optimizer/parameter_optimizer.h b/paddle/legacy/optimizer/parameter_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5abca82d55c12aed0f4fca0c4c1f21d20586155
--- /dev/null
+++ b/paddle/legacy/optimizer/parameter_optimizer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <functional>
+#include <string>
+#include "OptimizerConfig.pb.h"
+#include "lr_policy.h"
+#include "serialization.h"
+#include "tensor.h"
+
+namespace paddle {
+namespace optimizer {
+
+class ParameterOptimizer {
+ public:
+  /**
+   * @brief  update hook for algorithm need to traverse parameter more than
+   * once.
+   */
+  ParameterOptimizer(Tensor *parameter, LrPolicy *lr)
+      : parameter_(parameter), lr_policy_(lr), num_sample_passed_(0) {}
+  virtual ~ParameterOptimizer() {
+    delete parameter_;
+    delete lr_policy_;
+  }
+
+  static ParameterOptimizer *Create(const std::string &config_proto,
+                                    Tensor *parameter);
+  virtual void Update(const Tensor *gradient) = 0;
+  virtual float *get_weight(int *param_size) const;
+  virtual std::string SerializeState() = 0;
+  virtual void DeserializeState(const std::string &state) = 0;
+
+ protected:
+  Tensor *parameter_;
+  // learning rate policy
+  LrPolicy *lr_policy_;
+  uint64_t num_sample_passed_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/legacy/optimizer/parameter_optimizer_test.cc b/paddle/legacy/optimizer/parameter_optimizer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d9572999e9e0f10092eecbc1b41369a89629da7
--- /dev/null
+++ b/paddle/legacy/optimizer/parameter_optimizer_test.cc
@@ -0,0 +1,127 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "parameter_optimizer.h"
+#include <cmath>
+#include <map>
+#include <vector>
+#include "gtest/gtest.h"
+#include "lr_policy.h"
+
+paddle::optimizer::Tensor* FillTensor(size_t size) {
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
+  paddle::optimizer::Tensor& p = *param;
+  for (size_t i = 0; i < p.size(); ++i) {
+    p[i] = (float)rand() / (float)RAND_MAX;
+  }
+  return param;
+}
+
+paddle::optimizer::Tensor* FixedTensor(size_t size) {
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
+  paddle::optimizer::Tensor& p = *param;
+  for (size_t i = 0; i < p.size(); ++i) {
+    p[i] = i;
+  }
+  return param;
+}
+
+class OptimizerTest : public testing::Test {
+ public:
+  virtual ~OptimizerTest() {}
+  // init paddle::optimizer::Tensor shape
+  const size_t kSize = 5;
+
+  virtual void SetUp() {
+    CreateSGD();
+    CreateAdam();
+  }
+  virtual void TearDown() {}
+
+  void CreateSGD() {
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(paddle::OptimizerConfig::SGD);
+    config_.mutable_sgd()->set_momentum(0.0);
+    config_.mutable_sgd()->set_decay(0.0);
+    config_.mutable_sgd()->set_nesterov(false);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
+    config_.mutable_const_lr()->set_learning_rate(0.1);
+    std::string str = config_.SerializeAsString();
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
+    opts_.push_back(opt);
+  }
+
+  void CreateAdam() {
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(paddle::OptimizerConfig::Adam);
+    config_.mutable_adam()->set_beta_1(0.9);
+    config_.mutable_adam()->set_beta_2(0.1);
+    config_.mutable_adam()->set_epsilon(1e-3);
+    config_.mutable_adam()->set_decay(0.0);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
+    config_.mutable_const_lr()->set_learning_rate(0.1);
+    std::string str = config_.SerializeAsString();
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
+    opts_.push_back(opt);
+  }
+
+  void TestGetWeight() {
+    paddle::optimizer::Tensor* p = FixedTensor(kSize);
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      int s = 0;
+      float* newp = (float*)opts_[i]->get_weight(&s);
+      EXPECT_EQ(static_cast<size_t>(s), kSize);
+      for (size_t j = 0; j < kSize; ++j) {
+        EXPECT_EQ(newp[j], (*p)[j]);
+      }
+    }
+  }
+
+  void TestUpdate() {
+    paddle::optimizer::Tensor* g = FixedTensor(kSize);
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      opts_[i]->Update(g);
+    }
+  }
+
+  void TestCheckPoint() {
+    paddle::optimizer::Tensor* p = FixedTensor(kSize);
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      auto state = opts_[i]->SerializeState();
+      opts_[i]->DeserializeState(state);
+      auto state1 = opts_[i]->SerializeState();
+      opts_[i]->DeserializeState(state);
+      EXPECT_EQ(state, state1);
+
+      int s = 0;
+      float* newp = (float*)opts_[i]->get_weight(&s);
+      EXPECT_EQ(static_cast<size_t>(s), kSize);
+      for (size_t j = 0; j < kSize; ++j) {
+        EXPECT_EQ(newp[j], (*p)[j]);
+      }
+    }
+  }
+
+ private:
+  std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
+  paddle::OptimizerConfig config_;
+};
+
+TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
+
+TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
+
+TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
diff --git a/paddle/legacy/optimizer/serialization.h b/paddle/legacy/optimizer/serialization.h
new file mode 100644
index 0000000000000000000000000000000000000000..2067a8d8cff23bff975d23a4df4d0aa7df20b00f
--- /dev/null
+++ b/paddle/legacy/optimizer/serialization.h
@@ -0,0 +1,49 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include "OptimizerConfig.pb.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "tensor.h"
+
+namespace paddle {
+namespace optimizer {
+
+static void TensorToProto(const Tensor& tensor, TensorProto* proto) {
+  proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32);
+  std::stringstream os;
+  for (size_t i = 0; i < tensor.size(); ++i) {
+    os << tensor[i];
+    proto->add_content(os.str());
+    os.str(std::string());
+  }
+}
+
+static void ProtoToTensor(const TensorProto& proto, Tensor* tensor) {
+  std::stringstream sin;
+  for (auto i = 0; i < proto.content_size(); ++i) {
+    sin << proto.content(i);
+    sin >> (*tensor)[i];
+    sin.str(std::string());
+    sin.clear();
+  }
+}
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/serialization_test.cc b/paddle/legacy/optimizer/serialization_test.cc
similarity index 100%
rename from paddle/optimizer/serialization_test.cc
rename to paddle/legacy/optimizer/serialization_test.cc
diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/legacy/optimizer/sgd_optimizer.cc
similarity index 100%
rename from paddle/optimizer/sgd_optimizer.cc
rename to paddle/legacy/optimizer/sgd_optimizer.cc
diff --git a/paddle/legacy/optimizer/sgd_optimizer.h b/paddle/legacy/optimizer/sgd_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8957cde54abd6667143d2a8265d732c849294e3
--- /dev/null
+++ b/paddle/legacy/optimizer/sgd_optimizer.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class SGDOptimizer : public ParameterOptimizer {
+ public:
+  SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
+      : ParameterOptimizer(parameter, lr),
+        momentums_(nullptr),
+        momentum_(m),
+        decay_(d),
+        nesterov_(n) {
+    if (momentum_ != 0.0) {
+      size_t size = parameter->size();
+      momentums_ = new Tensor(size);
+    }
+  }
+  virtual ~SGDOptimizer() {
+    if (momentums_) delete momentums_;
+  }
+  void Update(const Tensor* gradient);
+  std::string SerializeState();
+  void DeserializeState(const std::string& state);
+
+ private:
+  Tensor* momentums_;
+  double momentum_;
+  double decay_;
+  bool nesterov_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/legacy/optimizer/tensor.h b/paddle/legacy/optimizer/tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e58577d4df7aabd8cd218dc13837461cc681ac6
--- /dev/null
+++ b/paddle/legacy/optimizer/tensor.h
@@ -0,0 +1,68 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+/**
+ * @brief tensor used by optimizer
+ */
+
+#include <string.h>
+#include <memory>
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+namespace optimizer {
+
+template <class T>
+class TensorT {
+ public:
+  TensorT(size_t size) : height_(1), width_(size) {
+    // new T[size]() initializes all element to zero value.
+    data_ptr_ = std::shared_ptr<T>(new T[size](), std::default_delete<T[]>());
+    data_ = data_ptr_.get();
+  }
+
+  TensorT(T* data, size_t size)
+      : height_(1), width_(size), data_ptr_(nullptr), data_(data) {}
+
+  TensorT(T* data, size_t h, size_t w)
+      : height_(h), width_(w), data_ptr_(nullptr), data_(data) {}
+
+  virtual ~TensorT() {}
+
+  T* get_buffer() { return this->data_; }
+
+  T& operator[](const size_t idx) {
+    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
+    return data_[idx];
+  }
+  T& operator[](const size_t idx) const {
+    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
+    return data_[idx];
+  }
+  // TODO: replace with tensorshape
+  size_t size() const { return this->width_ * this->height_; }
+
+ protected:
+  size_t height_;
+  size_t width_;
+  std::shared_ptr<T> data_ptr_;
+  T* data_;
+};
+
+// TODO(zhihong): design problem of dynamic datatype, need to fix it
+typedef TensorT<float> Tensor;
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/Argument.cpp b/paddle/legacy/parameter/Argument.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3f1d599e901110a1c9390d76c45f8b4b1f4cab2a
--- /dev/null
+++ b/paddle/legacy/parameter/Argument.cpp
@@ -0,0 +1,707 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Argument.h"
+#include "paddle/legacy/math/SparseMatrix.h"
+
+#include <algorithm>
+
+namespace paddle {
+static void resizeAndCopy(MatrixPtr& dest,
+                          const MatrixPtr& src,
+                          bool useGpu,
+                          hl_stream_t stream) {
+  if (src) {
+    if (!dest) {
+      dest = src->clone(0, 0, useGpu);
+    } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
+      dest->resize(src->getHeight(), src->getWidth());
+    }
+    dest->copyFrom(*src, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(IVectorPtr& dest,
+                          const IVectorPtr& src,
+                          bool useGpu,
+                          hl_stream_t stream) {
+  if (src) {
+    IVector::resizeOrCreate(dest, src->getSize(), useGpu);
+    dest->copyFrom(*src, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(ICpuGpuVectorPtr& dest,
+                          const ICpuGpuVectorPtr& src,
+                          bool useGpu,
+                          hl_stream_t stream) {
+  if (src) {
+    ICpuGpuVector::resizeOrCreate(dest, src->getSize(), useGpu);
+    dest->copyFrom(*src, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(MatrixPtr& dest,
+                          const MatrixPtr& src,
+                          int32_t startRow,
+                          int32_t copySize,
+                          bool useGpu,
+                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
+  if (src) {
+    CHECK_LE((size_t)startRow + copySize, src->getHeight());
+    int height = copySize;
+    int width = src->getWidth();
+    if (!dest) {
+      dest = src->clone(height, width, useGpu);
+    } else {
+      CHECK_EQ(dest->useGpu(), useGpu);
+      dest->resize(height, width);
+    }
+    MatrixPtr submat = src->subMatrix(startRow, copySize);
+    if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
+      // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
+      // First copy it to CPU, and then copy it to the GPU.
+      MatrixPtr tmp = src->clone(height, width, false);
+      tmp->copyFrom(*submat, stream);
+      dest->copyFrom(*tmp, stream);
+    } else {
+      dest->copyFrom(*submat, stream);
+    }
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(IVectorPtr& dest,
+                          const IVectorPtr& src,
+                          int32_t startPos,
+                          int32_t copySize,
+                          bool useGpu,
+                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
+  if (src) {
+    CHECK_LE((size_t)startPos + copySize, src->getSize());
+
+    int height = copySize;
+    IVector::resizeOrCreate(dest, height, useGpu);
+    dest->copyFrom(src->getData() + startPos, height, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(ICpuGpuVectorPtr& dest,
+                          const ICpuGpuVectorPtr& src,
+                          int32_t startPos,
+                          int32_t copySize,
+                          bool useGpu,
+                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
+  if (src) {
+    CHECK_LE((size_t)startPos + copySize, src->getSize());
+
+    ICpuGpuVector::resizeOrCreate(dest, copySize, useGpu);
+    dest->copyFrom(*src, startPos, copySize, useGpu, stream);
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(SVectorPtr& dest,
+                          const SVectorPtr& src,
+                          bool useGpu,
+                          hl_stream_t stream) {
+  if (src) {
+    size_t height = src->size();
+    if (!dest) {
+      dest = std::make_shared<std::vector<std::string>>(height);
+    } else {
+      dest->resize(height);
+    }
+    std::copy_n(src->begin(), height, dest->begin());
+  } else {
+    dest.reset();
+  }
+}
+
+static void resizeAndCopy(SVectorPtr& dest,
+                          const SVectorPtr& src,
+                          int32_t startPos,
+                          int32_t copySize,
+                          bool useGpu,
+                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
+  if (src) {
+    CHECK_LE((size_t)startPos + copySize, src->size());
+    size_t height = copySize;
+    if (!dest) {
+      dest = std::make_shared<std::vector<std::string>>(height);
+    } else {
+      dest->resize(height);
+    }
+    std::copy_n(src->begin() + startPos, height, dest->begin());
+  } else {
+    dest.reset();
+  }
+}
+
+void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
+  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+}
+
+void Argument::resizeAndCopyFrom(const Argument& src,
+                                 bool useGpu,
+                                 hl_stream_t stream) {
+  dataId = src.dataId;
+  resizeAndCopy(value, src.value, useGpu, stream);
+  resizeAndCopy(grad, src.grad, useGpu, stream);
+  resizeAndCopy(in, src.in, useGpu, stream);
+  resizeAndCopy(ids, src.ids, useGpu, stream);
+  resizeAndCopy(sequenceStartPositions,
+                src.sequenceStartPositions,
+                false /* useGpu */,
+                stream);
+  if (src.hasSubseq()) {
+    resizeAndCopy(subSequenceStartPositions,
+                  src.subSequenceStartPositions,
+                  false /* useGpu */,
+                  stream);
+  }
+  resizeAndCopy(strs, src.strs, useGpu, stream);
+  frameWidth = src.frameWidth;
+  frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
+}
+
+int32_t Argument::resizeAndCopyFrom(const Argument& src,
+                                    int32_t startSeq,
+                                    int32_t copySize,
+                                    bool useGpu) {
+  int32_t size =
+      resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT);
+  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+  return size;
+}
+
+int32_t Argument::resizeAndCopyFrom(const Argument& src,
+                                    int32_t startSeq,
+                                    int32_t copySize,
+                                    bool useGpu,
+                                    hl_stream_t stream) {
+  dataId = src.dataId;
+  frameWidth = src.frameWidth;
+  frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
+
+  if (!src.sequenceStartPositions) {
+    // non-sequence input, copy samples directly
+    int32_t startRow = startSeq;
+    resizeAndCopy(in, src.in, startRow, copySize, useGpu, stream);
+    resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream);
+    resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream);
+    resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream);
+    resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
+    return copySize;
+  } else {
+    // sequence input
+    const int* sequence = src.sequenceStartPositions->getData(false);
+    int32_t startRow = sequence[startSeq];           // sample start from here
+    int32_t endRow = sequence[startSeq + copySize];  // sample end
+    int32_t copyFeatureSize = endRow - startRow;     // num of samples
+    resizeAndCopy(in, src.in, startRow, copyFeatureSize, useGpu, stream);
+    resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream);
+    resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream);
+    resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream);
+    resizeAndCopy(sequenceStartPositions,
+                  src.sequenceStartPositions,
+                  startSeq,
+                  copySize + 1,
+                  false,
+                  stream);
+    // modify new sequenceStartPositions
+    int* destSequences = sequenceStartPositions->getMutableData(false);
+    for (int i = 0; i < copySize + 1; i++) {
+      destSequences[i] -= startRow;
+    }
+    CHECK_EQ(destSequences[0], 0);
+    CHECK_EQ(destSequences[copySize], copyFeatureSize);
+    if (src.hasSubseq()) {
+      // sequence has sub-sequence
+      int* subSequence = src.subSequenceStartPositions->getMutableData(false);
+      int32_t subStartSeq = 0;
+      int32_t subEndSeq = 0;
+      int numSubSequences = src.getNumSubSequences();
+      for (int i = 0; i < numSubSequences + 1; i++) {
+        if (subSequence[i] == startRow) {
+          subStartSeq = i;
+        } else if (subSequence[i] == endRow) {
+          subEndSeq = i;
+          break;
+        }
+      }
+      int32_t copySubSize = subEndSeq - subStartSeq;
+      resizeAndCopy(subSequenceStartPositions,
+                    src.subSequenceStartPositions,
+                    subStartSeq,
+                    copySubSize + 1,
+                    false,
+                    stream);
+      // modify new subSequenceStartPositions
+      int* destSubSequences = subSequenceStartPositions->getMutableData(false);
+      for (int i = 0; i < copySubSize + 1; i++) {
+        destSubSequences[i] -= startRow;
+      }
+      CHECK_EQ(destSubSequences[0], 0);
+      CHECK_EQ(destSubSequences[copySubSize], copyFeatureSize);
+    }
+    resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
+    return copyFeatureSize;
+  }
+}
+
+void Argument::concat(const std::vector<Argument>& args,
+                      const std::vector<int>& selectRows,
+                      const std::vector<int>& seqStartPos,
+                      const std::vector<int>& copySize,
+                      bool useGpu,
+                      hl_stream_t stream,
+                      PassType passType) {
+  CHECK(!subSequenceStartPositions)
+      << "undefined behavior for subsequence positions";
+
+  size_t batchSize = 0;
+  for (size_t i = 0; i < copySize.size(); ++i)
+    batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
+
+  auto copyArg = [batchSize, stream](MatrixPtr& dst,
+                                     MatrixPtr src,
+                                     int desStartRow,
+                                     int srcStartRow,
+                                     int size,
+                                     bool useGpu) {
+    if (!src) {
+      dst.reset();
+      return;
+    }
+    size_t width = src->getWidth();
+    if (!dst) {
+      dst = src->clone(batchSize, width, useGpu);
+    } else {
+      dst->resize(batchSize, width);
+    }
+
+    MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size);
+    tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream);
+  };
+
+  auto copyIds = [batchSize, stream](IVectorPtr& dst,
+                                     const IVectorPtr& src,
+                                     int desStartRow,
+                                     int srcStartRow,
+                                     int size,
+                                     bool useGpu) {
+    if (!src) {
+      dst.reset();
+      return;
+    }
+    IVector::resizeOrCreate(dst, batchSize, useGpu);
+    dst->subVec(desStartRow, size)
+        ->copyFrom(*src->subVec(srcStartRow, size), stream);
+  };
+
+  auto copyStrs = [batchSize](SVectorPtr& dst,
+                              const SVectorPtr& src,
+                              int desStartRow,
+                              int srcStartRow,
+                              int size,
+                              bool useGpu) {
+    if (!src) {
+      dst.reset();
+      return;
+    }
+    if (!dst) {
+      dst = std::make_shared<std::vector<std::string>>(batchSize);
+    } else {
+      dst->resize(batchSize);
+    }
+    std::copy(src->begin() + srcStartRow,
+              src->begin() + srcStartRow + size,
+              dst->begin() + desStartRow);
+  };
+
+  dataId = args[0].dataId;
+  CHECK_NE(seqStartPos.size(), 0UL);
+  int desStartRow = 0;
+  for (size_t i = 0; i < copySize.size(); ++i) {
+    int startPos = seqStartPos[i];
+    int endPos = seqStartPos[i + 1];
+    CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos));
+    for (int j = startPos; j < endPos; ++j) {
+      const Argument& arg = args[j - startPos];
+      CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have "
+                                   << "the same dataId.";
+      const int srcStartRow = selectRows[j];
+      copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu);
+      copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu);
+      if (passType != PASS_TEST) {
+        copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu);
+      }
+      copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu);
+      copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu);
+      desStartRow += copySize[i];
+    }
+  }
+  ICpuGpuVector::resizeOrCreate(
+      sequenceStartPositions, seqStartPos.size(), useGpu);
+  sequenceStartPositions->copyFrom(
+      seqStartPos.data(), seqStartPos.size(), useGpu);
+}
+
+void Argument::concat(const std::vector<Argument>& args,
+                      bool useGpu,
+                      hl_stream_t stream,
+                      PassType passType) {
+  int32_t batchSize = 0;
+  int64_t numSequences = 0;
+  int64_t numSubSequences = 0;
+  for (auto& arg : args) {
+    batchSize += arg.getBatchSize();
+    numSequences += arg.getNumSequences();
+    numSubSequences += arg.getNumSubSequences();
+  }
+
+  auto copyArg = [batchSize, stream](
+      MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) {
+    if (!src) {
+      dst.reset();
+      return;
+    }
+    size_t width = src->getWidth();
+    if (!dst) {
+      dst = src->clone(batchSize, width, useGpu);
+    } else {
+      dst->resize(batchSize, width);
+    }
+
+    MatrixPtr tmpMatrix = dst->subMatrix(startRow, src->getHeight());
+    tmpMatrix->copyFrom(*src, stream);
+  };
+
+  auto copyIds = [batchSize, stream](
+      IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) {
+    if (!src) {
+      dst.reset();
+      return;
+    }
+    IVector::resizeOrCreate(dst, batchSize, useGpu);
+    dst->subVec(startRow, src->getSize())->copyFrom(*src, stream);
+  };
+
+  auto copyStrs = [batchSize](
+      SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) {
+    if (!src) {
+      dst.reset();
+      return;
+    }
+    if (!dst) {
+      dst = std::make_shared<std::vector<std::string>>(batchSize);
+    } else {
+      dst->resize(batchSize);
+    }
+    std::copy(src->begin(), src->end(), dst->begin() + startRow);
+  };
+
+  auto copySequencePos = [](ICpuGpuVectorPtr& dstSeq,
+                            const ICpuGpuVectorPtr& srcSeq,
+                            int dstNumSequences,
+                            int srcNumSequences,
+                            int& startSequences,
+                            int startRow) {
+    if (srcSeq) {
+      ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false);
+      const int* src = srcSeq->getData(false);
+      int* dest = dstSeq->getMutableData(false);
+      for (int i = 0; i < srcNumSequences + 1; ++i) {
+        dest[i + startSequences] = src[i] + startRow;
+      }
+      startSequences += srcNumSequences;
+    } else {
+      dstSeq.reset();
+    }
+  };
+
+  int startRow = 0;
+  int startSequences = 0;
+  int startSubSequences = 0;
+  dataId = args[0].dataId;
+  for (auto& arg : args) {
+    CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have"
+                                 << " same dataId";
+    copyArg(in, arg.in, startRow, useGpu);
+    copyArg(value, arg.value, startRow, useGpu);
+    if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu);
+    copyIds(ids, arg.ids, startRow, useGpu);
+    copySequencePos(sequenceStartPositions,
+                    arg.sequenceStartPositions,
+                    numSequences,
+                    arg.getNumSequences(),
+                    startSequences,
+                    startRow);
+    copySequencePos(subSequenceStartPositions,
+                    arg.subSequenceStartPositions,
+                    numSubSequences,
+                    arg.getNumSubSequences(),
+                    startSubSequences,
+                    startRow);
+    copyStrs(strs, arg.strs, startRow, useGpu);
+    startRow += arg.getBatchSize();
+  }
+}
+
+void Argument::splitByDataId(const std::vector<Argument>& argus,
+                             std::vector<std::vector<Argument>>* arguGroups) {
+  arguGroups->clear();
+  int lastDataId = -1;
+  for (const auto& argu : argus) {
+    if (argu.dataId == -1) {
+      // is -1, then create a new group
+      arguGroups->emplace_back();
+      lastDataId = -1;
+    } else if (argu.dataId != lastDataId) {
+      // not -1, also not equal to last Argument, then create a new group
+      arguGroups->emplace_back();
+      lastDataId = argu.dataId;
+    } else {
+      // not -1, and equal to last Argument, do nothing
+    }
+    arguGroups->back().push_back(argu);
+  }
+}
+
+void Argument::getSeqInfo(std::vector<SeqInfo>* seqInfo) const {
+  const int* starts = sequenceStartPositions->getData(false);
+  const int* subStarts =
+      hasSubseq() ? subSequenceStartPositions->getData(false) : nullptr;
+  size_t numSequences = getNumSequences();
+  seqInfo->reserve(numSequences);
+  int subSeqEnd = 0;
+  for (size_t i = 0; i < numSequences; ++i) {
+    SeqInfo info;
+    info.seqStart = starts[i];
+    info.subLevelLength = starts[i + 1] - starts[i];
+    info.seqId = i;
+    if (hasSubseq()) {
+      info.subSeqStart = subSeqEnd;
+      while (subStarts[subSeqEnd] < starts[i + 1]) {
+        ++subSeqEnd;
+      }
+      info.topLevelLength = subSeqEnd - info.subSeqStart;
+    } else {
+      info.topLevelLength = info.subLevelLength;
+      info.subSeqStart = 0;  // not used
+    }
+    seqInfo->push_back(info);
+  }
+  std::sort(
+      seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) {
+        return a.topLevelLength > b.topLevelLength;
+      });
+}
+
+void Argument::checkSubset() const {
+  if (getNumSequences() > getNumSubSequences()) {
+    LOG(FATAL) << "numSubSequences is less than numSequences ("
+               << getNumSubSequences() << " vs. " << getNumSequences() << ")";
+  }
+  const int* start = sequenceStartPositions->getData(false);
+  const int* subStart = subSequenceStartPositions->getData(false);
+  int seqId = 0;
+  int subSeqId = 0;
+  while (seqId < getNumSequences() && subSeqId < getNumSubSequences()) {
+    if (start[seqId] > subStart[subSeqId]) {
+      ++subSeqId;
+    } else if (start[seqId] == subStart[subSeqId]) {
+      ++subSeqId;
+      ++seqId;
+    } else {
+      LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions";
+    }
+  }
+  if (seqId < getNumSequences()) {
+    LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions";
+  }
+}
+
+void Argument::degradeSequence(const Argument& input) {
+  CHECK_EQ(input.hasSubseq(), 1UL);
+  size_t numSequences = input.getNumSequences();
+  size_t numSubSequences = input.getNumSubSequences();
+  ICpuGpuVector::resizeOrCreate(
+      sequenceStartPositions, numSequences + 1, false);
+  int* tgtBuf = sequenceStartPositions->getMutableData(false);
+  const int* starts = input.sequenceStartPositions->getData(false);
+  const int* subStarts = input.subSequenceStartPositions->getData(false);
+  int seqId = 0;
+  for (size_t subSeqId = 0; subSeqId < numSubSequences; ++subSeqId) {
+    if (subStarts[subSeqId] == starts[seqId]) {
+      tgtBuf[seqId] = subSeqId;
+      seqId++;
+    }
+  }
+  tgtBuf[numSequences] = numSubSequences;
+}
+
+void Argument::poolSequenceWithStride(const Argument& input,
+                                      size_t stride,
+                                      ICpuGpuVectorPtr* stridePostions,
+                                      bool reversed) {
+  // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
+  // then sequenceStartPositions = [0, 2, 3, 4, 7].
+  // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30];
+  // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30]
+
+  CHECK(input.sequenceStartPositions);
+  CHECK_EQ(input.hasSubseq(), 0UL);
+  CHECK_GT(stride, 0UL) << "stride must larger than 0";
+  size_t numSequences = input.getNumSequences();
+  ICpuGpuVector::resizeOrCreate(
+      sequenceStartPositions, numSequences + 1, false);
+  const int* starts = input.sequenceStartPositions->getData(false);
+  int* tgtBuf = sequenceStartPositions->getMutableData(false);
+  // first index of target sequence and stride positions are both 0
+  tgtBuf[0] = 0;
+  std::vector<int> stridePos;
+  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
+    size_t seqLength = starts[seqId + 1] - starts[seqId];
+    stridePos.emplace_back(starts[seqId]);
+    if (seqLength == 0) {
+      // empty sequence
+      tgtBuf[seqId + 1] = tgtBuf[seqId];
+    } else {
+      int size = ceil((float)seqLength / stride);
+      tgtBuf[seqId + 1] = tgtBuf[seqId] + size;
+      for (int i = 0; i < size - 1; ++i) {
+        int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride
+                           : stridePos.back() + stride;
+        stridePos.emplace_back(cur);
+      }
+    }
+  }
+  stridePos.emplace_back(starts[numSequences]);
+  int size = stridePos.size();
+  CHECK_EQ(size - 1, tgtBuf[numSequences]);
+  ICpuGpuVector::resizeOrCreate(*stridePostions, size, false);
+  (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
+}
+
+void Argument::getValueString(
+    std::unordered_map<std::string, std::string>* out) const {
+  if (value) {
+    std::ostringstream os;
+    value->print(os);
+    out->insert({"value", os.str()});
+  }
+  if (ids) {
+    std::ostringstream os;
+    ids->print(os, ids->getSize());
+    out->insert({"ids", os.str()});
+  }
+  if (sequenceStartPositions) {
+    std::ostringstream os;
+    sequenceStartPositions->getVector(false)->print(
+        os, sequenceStartPositions->getSize());
+    out->insert({"sequence pos", os.str()});
+  }
+  if (subSequenceStartPositions) {
+    std::ostringstream os;
+    subSequenceStartPositions->getVector(false)->print(
+        os, subSequenceStartPositions->getSize());
+    out->insert({"sub-sequence pos", os.str()});
+  }
+}
+
+void Argument::printValueString(std::ostream& stream,
+                                const std::string& prefix) const {
+  std::unordered_map<std::string, std::string> out;
+  getValueString(&out);
+  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
+    auto it = out.find(field);
+    if (it != out.end()) {
+      stream << prefix << field << ":\n" << it->second;
+    }
+  }
+}
+
+void Argument::subArgFrom(const Argument& input,
+                          size_t offset,
+                          size_t height,
+                          size_t width,
+                          bool useGpu,
+                          bool trans,
+                          bool seqFlag,
+                          size_t seqStart,
+                          size_t seqSize) {
+  if (input.value) {
+    value = Matrix::create(
+        input.value->getData() + offset * width, height, width, trans, useGpu);
+  }
+  if (input.ids) {
+    ids = IVector::create(input.ids->getData() + offset, height, useGpu);
+  }
+  if (input.grad) {
+    grad = Matrix::create(
+        input.grad->getData() + offset * width, height, width, trans, useGpu);
+  }
+  if (seqFlag) {
+    sequenceStartPositions = std::make_shared<ICpuGpuVector>(
+        *(input.sequenceStartPositions), seqStart, seqSize);
+  }
+}
+
+void Argument::reorganizeSeqInfo(
+    const ICpuGpuVectorPtr seqStartPos,
+    const ICpuGpuVectorPtr subSeqStartPos,
+    std::vector<std::vector<int>>& reorganizedSeqInfo) {
+  CHECK(seqStartPos);
+  reorganizedSeqInfo.clear();
+
+  int seqNum = seqStartPos->getSize() - 1;
+  int* seqStarts = seqStartPos->getMutableData(false);
+
+  if (subSeqStartPos) {
+    int* subSeqStarts = subSeqStartPos->getMutableData(false);
+    reorganizedSeqInfo.resize(seqNum, std::vector<int>());
+    int seqIdx = 0;
+    for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
+      reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+      if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
+        seqIdx++;
+        if (seqIdx == seqNum) return;
+        reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+      }
+    }
+  } else {
+    reorganizedSeqInfo.resize(1, std::vector<int>(seqNum + 1, 0));
+    memcpy(reorganizedSeqInfo[0].data(),
+           seqStarts,
+           sizeof(int) * seqStartPos->getSize());
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/Argument.h b/paddle/legacy/parameter/Argument.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea8634896c18c7c3516c0d584aec4b475d626e61
--- /dev/null
+++ b/paddle/legacy/parameter/Argument.h
@@ -0,0 +1,349 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "hl_gpu.h"
+
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+typedef std::shared_ptr<std::vector<std::string>> SVectorPtr;
+
+struct Argument {
+  Argument()
+      : in(nullptr),
+        value(nullptr),
+        ids(nullptr),
+        grad(nullptr),
+        strs(nullptr),
+        frameHeight(0),
+        frameWidth(0),
+        frameDepth(0),
+        sequenceStartPositions(nullptr),
+        subSequenceStartPositions(nullptr),
+        cpuSequenceDims(nullptr),
+        deviceId(-1),
+        allCount(0),
+        valueCount(0),
+        gradCount(0),
+        dataId(0) {}
+  Argument(const Argument& argument) {
+    *this = argument;
+    valueCount = 0;
+    gradCount = 0;
+    dataId = argument.dataId;
+  }
+  ~Argument() {}
+
+  void operator=(const Argument& argument) {
+    in = argument.in;
+    value = argument.value;
+    ids = argument.ids;
+    grad = argument.grad;
+    strs = argument.strs;
+    sequenceStartPositions = argument.sequenceStartPositions;
+    subSequenceStartPositions = argument.subSequenceStartPositions;
+    cpuSequenceDims = argument.cpuSequenceDims;
+    deviceId = argument.deviceId;
+    allCount = argument.allCount;
+    frameHeight = argument.frameHeight;
+    frameWidth = argument.frameWidth;
+    frameDepth = argument.frameDepth;
+    dataId = argument.dataId;
+  }
+
+  MatrixPtr in;  // used if needed
+  MatrixPtr value;
+  IVectorPtr ids;  // a sequence of ids. Can be use for class id for costLayer
+  MatrixPtr grad;  // If empty, gradient is not needed.
+  SVectorPtr strs;
+
+  // A dataBatch includes batchSize frames, one frame maybe not only vector
+  size_t frameHeight;
+  size_t frameWidth;
+  size_t frameDepth;
+
+  // If NULL, each position is treated independently.
+  // Otherwise, its size should be #NumberOfSequences + 1.
+  // The first position is always 0 and
+  // the last position should be equal to batchSize.
+  ICpuGpuVectorPtr sequenceStartPositions;
+
+  // If NULL, each sequence has no subsequence.
+  // Otherwise, its size should be #NumberOfSubSequences + 1.
+  // The first position is always 0 and
+  // the last position should be equal to batchSize.
+  ICpuGpuVectorPtr subSequenceStartPositions;
+
+  // dimension of sequence, stored only in CPU
+  IVectorPtr cpuSequenceDims;
+
+  int deviceId;            // the GPU device id which the argument in
+  int allCount;            // the number of output layers using this argument
+  mutable int valueCount;  // waiting this member when layer do forward
+  mutable int gradCount;   // waiting this member when layer do backward
+  mutable LockedCondition valueReadyCond;
+  mutable LockedCondition gradReadyCond;
+
+  int dataId;  // dataProvider id
+
+  /* Increase the reference count of the argument. */
+  void countIncrement() { allCount++; }
+
+  int getAllCount() const { return allCount; }
+
+  void waitValueReady() const {
+    valueReadyCond.wait([this] { return (valueCount != 0); });
+
+    std::lock_guard<std::mutex> guard(*valueReadyCond.mutex());
+    valueCount--;
+  }
+
+  void notifyValueReady() const {
+    valueReadyCond.notify_all([this] { valueCount = allCount; });
+  }
+
+  void waitGradReady() const {
+    gradReadyCond.wait([this] { return (gradCount == allCount); });
+    gradCount = 0;
+  }
+
+  void notifyGradReady() const {
+    gradReadyCond.notify_all([this] { gradCount++; });
+  }
+
+  int64_t getBatchSize() const {
+    if (value) return value->getHeight();
+    if (ids) return ids->getSize();
+    if (grad) return grad->getHeight();
+    if (in) return in->getHeight();
+    if (strs) return strs->size();
+    return 0;
+  }
+  size_t getFrameHeight() const { return frameHeight; }
+  size_t getFrameWidth() const { return frameWidth; }
+  size_t getFrameDepth() const { return frameDepth; }
+  void setFrameHeight(size_t h) { frameHeight = h; }
+  void setFrameWidth(size_t w) { frameWidth = w; }
+  void setFrameDepth(size_t d) { frameDepth = d; }
+
+  int64_t getNumSequences() const {
+    return sequenceStartPositions ? sequenceStartPositions->getSize() - 1
+                                  : getBatchSize();
+  }
+
+  int64_t getNumSubSequences() const {
+    return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1
+                                     : getBatchSize();
+  }
+
+  bool hasSeq() const { return sequenceStartPositions != nullptr; }
+  bool hasSubseq() const { return subSequenceStartPositions != nullptr; }
+
+  const int* getCpuStartPositions() const {
+    return hasSubseq() ? subSequenceStartPositions->getData(false)
+                       : sequenceStartPositions->getData(false);
+  }
+
+  static inline real sum(const std::vector<Argument>& arguments) {
+    real cost = 0;
+    for (auto& arg : arguments) {
+      if (arg.value) {
+        SetDevice device(arg.deviceId);
+        cost += arg.value->getSum();
+      }
+    }
+    return cost;
+  }
+
+  /**
+   * @brief (value, ids, grad, sequenceStartPositions) of output are subset of
+   *        input. Note that, output share the same memory of input.
+   *
+   * @param input[in]       input
+   * @param offset[in]      offset in terms of rows
+   * @param height[in]      height of output.value
+   * @param width[in]       width of output.value
+   * @param useGpu[in]
+   * @param trans[in]       whether input.value is transform
+   * @param seqFlag[in]     whether input has sequenceStartPositions
+   * @param seqStart[in]    offset of input.sequenceStartPositions
+   * @param seqSize[in]     lenght of output.sequenceStartPositions
+   */
+  void subArgFrom(const Argument& input,
+                  size_t offset,
+                  size_t height,
+                  size_t width,
+                  bool useGpu,
+                  bool trans = false,
+                  bool seqFlag = false,
+                  size_t seqStart = 0,
+                  size_t seqSize = 0);
+  /*
+   * for sequence input:
+   *   startSeq: the sequence id of start
+   *   copySize: how many sequences need to copy
+   *   return value: how many samples are copied
+   * for non-sequence input:
+   *   startSeq: the sample id of start
+   *   copySize: how many samples need to copy
+   *   return value: how many samples are copied
+   * Note that when specifying the stream explicitly in this case,
+   * synchronize should also be called somewhere after this function
+   */
+  int32_t resizeAndCopyFrom(const Argument& src,
+                            int32_t startSeq,
+                            int32_t copySize,
+                            bool useGpu,
+                            hl_stream_t stream);
+
+  /*
+   * same with the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronize is automatically called
+   * inside it
+   */
+  int32_t resizeAndCopyFrom(const Argument& src,
+                            int32_t startSeq,
+                            int32_t copySize,
+                            bool useGpu = FLAGS_use_gpu);
+
+  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
+
+  /*
+   * same with the above function, except that the stream is
+   * HPPL_STREAM_DEFAULT and synchronize is automatically called
+   * inside it
+   */
+  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
+
+  /*
+    @brief Concatenate several arguments into one and put the result into it.
+    @param args : a vector of argument, each element of which is a frame in a
+    batch of sequences.
+    @param selectRows : select several row of args to concatenate
+    @param seqStartPos : sequence start positions in the final Argument
+    @param hl_stream_t : cuda stream
+    @param passTyoe : type of task, training or testing
+   */
+  void concat(const std::vector<Argument>& args,
+              const std::vector<int>& selectRows,
+              const std::vector<int>& seqStartPos,
+              const std::vector<int>& copySize,
+              bool useGpu,
+              hl_stream_t stream,
+              PassType passType);
+
+  /*
+    Concatenate several args into one and put the result into this.
+   */
+  void concat(const std::vector<Argument>& src,
+              bool useGpu = FLAGS_use_gpu,
+              hl_stream_t stream = HPPL_STREAM_DEFAULT,
+              PassType passType = PASS_TEST);
+
+  /*
+   * split vector<Argument> to several vectors according to dataId
+   */
+  static void splitByDataId(const std::vector<Argument>& argus,
+                            std::vector<std::vector<Argument>>* arguGroups);
+
+  struct SeqInfo {
+    // Equal to sequence length for sequence data
+    // Equal to number of subsequences for subsequence data
+    int topLevelLength;
+
+    int seqStart;
+    int seqId;
+
+    // Equal to topLevelLength for sequence data
+    // Equal to sum of the length of subsequences for subsequence data
+    int subLevelLength;
+
+    // Only used for subsequence data, start position of this sequence
+    // is subSequenceStartPositions, i.e.
+    // subSequenceStartPositions[subSeqStart] == seqStart
+    int subSeqStart;
+  };
+  /*
+    Get SeqInfo for each sequence of this argument
+    Elements in *seqInfo are sorted by topLevelLength in descending order
+  */
+  void getSeqInfo(std::vector<SeqInfo>* segInfo) const;
+
+  /*
+   Check Whether sequenceStartPositions is subset of
+   subSequenceStartPositions.
+   */
+  void checkSubset() const;
+
+  /*
+   sequence has sub-sequence degrades to a sequence.
+   */
+  void degradeSequence(const Argument& input);
+
+  /*
+   After pooling with stride n (n is smaller than sequence length),
+   a long sequence will be shorten.
+   This function is invalid for sequence having sub-sequence.
+   */
+  void poolSequenceWithStride(const Argument& input,
+                              size_t stride,
+                              ICpuGpuVectorPtr* stridePositions,
+                              bool reversed = false);
+  /**
+   * @brief getValueString will return the argument's output in string. There
+   * are several kinds of output. The keys of output dictionary are 'value',
+   * 'id', 'sequence pos', 'sub-sequence pos'.
+   * @param out [out]: the return values.
+   */
+  void getValueString(std::unordered_map<std::string, std::string>* out) const;
+
+  /**
+   * @brief printValueString will print the argument's output in order of
+   * 'value', 'id', 'sequence pos', 'sub-sequence pos'.
+   * @param stream: Output stream
+   * @param prefix: line prefix for printing.
+   */
+  void printValueString(std::ostream& stream,
+                        const std::string& prefix = "") const;
+
+  /**
+   * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and
+   * subSequenceStartPositions into a 2 dimensional arrary: reorganizedSeqInfo.
+   *
+   * @param seqStartPos: sequenceStartPositions of an Argument.
+   * @param subSeqStartPos: subSequenceStartPositions of an Argument.
+   * @param the reorganized sequence start position information.
+   *
+   * Examples:
+   * seqStartPos: [0, 4, 15, 20, 28]
+   * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28]
+   * reorganizedSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   */
+  static void reorganizeSeqInfo(
+      const ICpuGpuVectorPtr seqStartPos,
+      const ICpuGpuVectorPtr subSeqStartPos,
+      std::vector<std::vector<int>>& reorganizedSeqInfo);
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/AverageOptimizer.cpp b/paddle/legacy/parameter/AverageOptimizer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..82a7fed6c6451b8908851f2d039f17b9dc513818
--- /dev/null
+++ b/paddle/legacy/parameter/AverageOptimizer.cpp
@@ -0,0 +1,206 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "AverageOptimizer.h"
+
+namespace paddle {
+
+// factory method to create an instance of AverageOptimizer
+ParameterOptimizer* AverageOptimizer::create(
+    const OptimizationConfig& optConfig,
+    ParameterOptimizer* optimizer,
+    bool isParameterSparse,
+    bool useParameterApply) {
+  if (optConfig.average_window() <= 0) {
+    return optimizer;
+  }
+  // disable average for embeded local updater
+  if (!useParameterApply && optConfig.num_batches_per_send_parameter() > 1) {
+    return optimizer;
+  }
+  if (isParameterSparse) {
+    return new AverageSparseOptimizer(optConfig, optimizer, useParameterApply);
+  }
+  return new AverageOptimizer(optConfig, optimizer, useParameterApply);
+}
+
+AverageOptimizer::AverageOptimizer(const OptimizationConfig& optConfig,
+                                   ParameterOptimizer* optimizer,
+                                   bool useParameterApply)
+    : ParameterOptimizer(optConfig),
+      optimizer_(optimizer),
+      useApply_(useParameterApply),
+      numUpdates_(0),
+      prevNumUpdates_(0),
+      numAccumulates_(0),
+      oldNumAccumulates_(0),
+      minAverageWindow_(
+          std::min<int64_t>(10000L, optConfig_.max_average_window())),
+      maxAverageWindow_(optConfig_.max_average_window()) {
+  parameterTypes_ = optimizer_->getParameterTypes();
+  addParameterType(PARAMETER_SUM1);
+  addParameterType(PARAMETER_SUM2);
+  addParameterType(PARAMETER_SUM3);
+  if (useParameterApply) {
+    addParameterType(PARAMETER_APPLY);
+  }
+}
+
+void AverageOptimizer::startBatch(int64_t numSamplesProcessed) {
+  optimizer_->startBatch(numSamplesProcessed);
+  learningRate_ = optimizer_->getLearningRate();
+
+  ++numUpdates_;
+  ++numAccumulates_;
+}
+
+/*
+  After traversal, the averaged parameter can be obtained by
+  ((PARAMETER_SUM1 + PARAMETER_SUM2 + PARAMETER_SUM3)
+  / (numAccumulates_ + oldNumAccumulates_))
+*/
+ParameterOptimizer::TraverseCallback AverageOptimizer::needSpecialTraversal(
+    const ParameterConfig& config) const {
+  TraverseCallbackVec callbacks;
+
+  if (auto callback = optimizer_->needSpecialTraversal(config)) {
+    callbacks.emplace_back(callback);
+  }
+
+  if (numUpdates_ % kMaxNumAccumulates == 0) {
+    // Move the sum to a different buffer to avoid loss of precision
+    // due to too many sums.
+    callbacks.emplace_back([](const VectorPtr vecs[],
+                              const ParameterConfig& config,
+                              size_t sparseId) {
+      vecs[PARAMETER_SUM2]->add(*vecs[PARAMETER_SUM1]);
+      vecs[PARAMETER_SUM1]->zeroMem();
+    });
+  }
+
+  if (isAverageWindowTooLong()) {
+    // Now the average window is too long, discard the old sum.
+    if (auto callback = this->startCatchUpWith()) {
+      callbacks.emplace_back(callback);
+    }
+    callbacks.emplace_back([](const VectorPtr vecs[],
+                              const ParameterConfig& config,
+                              size_t sparseId) {
+      vecs[PARAMETER_SUM3]->add(*vecs[PARAMETER_SUM1], *vecs[PARAMETER_SUM2]);
+      vecs[PARAMETER_SUM1]->zeroMem();
+      vecs[PARAMETER_SUM2]->zeroMem();
+    });
+  }
+
+  return composeCallbacks(callbacks);
+}
+
+void AverageOptimizer::finishBatch() {
+  optimizer_->finishBatch();
+  if (isAverageWindowTooLong()) {
+    this->finishCatchUpWith();
+    oldNumAccumulates_ = numAccumulates_;
+    numAccumulates_ = 0;
+  }
+}
+
+ParameterOptimizer::TraverseCallback AverageOptimizer::apply() {
+  if (numAccumulates_ + oldNumAccumulates_ == 0) {
+    return nullptr;
+  }
+
+  real scale = 1. / (numAccumulates_ + oldNumAccumulates_);
+  if (useApply_) {
+    return [scale](const VectorPtr vecs[],
+                   const ParameterConfig& config,
+                   size_t sparseId) {
+      vecs[PARAMETER_APPLY]->add3(*vecs[PARAMETER_SUM1],
+                                  *vecs[PARAMETER_SUM2],
+                                  *vecs[PARAMETER_SUM3],
+                                  scale,
+                                  scale,
+                                  scale);
+    };
+  } else {
+    return [scale](const VectorPtr vecs[],
+                   const ParameterConfig& config,
+                   size_t sparseId) {
+      vecs[PARAMETER_GRADIENT]->copyFrom(*vecs[PARAMETER_VALUE]);
+      vecs[PARAMETER_VALUE]->add3(*vecs[PARAMETER_SUM1],
+                                  *vecs[PARAMETER_SUM2],
+                                  *vecs[PARAMETER_SUM3],
+                                  scale,
+                                  scale,
+                                  scale);
+    };
+  }
+}
+
+ParameterOptimizer::TraverseCallback AverageOptimizer::restore() {
+  if (numAccumulates_ + oldNumAccumulates_ == 0) {
+    return nullptr;
+  }
+  if (useApply_) {
+    return nullptr;
+  }
+
+  return [](
+      const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) {
+    vecs[PARAMETER_VALUE]->copyFrom(*vecs[PARAMETER_GRADIENT]);
+    vecs[PARAMETER_GRADIENT]->zeroMem();
+  };
+}
+
+void AverageSparseOptimizer::update(const VectorPtr vecs[],
+                                    const ParameterConfig& paraConfig,
+                                    size_t sparseId) const {
+  optimizer_->update(vecs, paraConfig, sparseId);
+
+  CHECK_LT(sparseId, t0Vec_.size());
+  int timediff = timer_ + 1 - t0Vec_[sparseId];
+  if (timediff > 0) {
+    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], timediff);
+    t0Vec_[sparseId] = timer_ + 1;
+  }
+}
+
+ParameterOptimizer::TraverseCallback AverageSparseOptimizer::startCatchUpWith()
+    const {
+  TraverseCallbackVec callbacks;
+
+  if (auto callback = optimizer_->startCatchUpWith()) {
+    callbacks.emplace_back(callback);
+  }
+
+  if (timer_ > 0) {
+    callbacks.emplace_back(
+        [this](const VectorPtr vecs[],
+               const ParameterConfig& config,
+               size_t sparseId) { this->catchUpWith(vecs, config, sparseId); });
+  }
+
+  return composeCallbacks(callbacks);
+}
+
+void AverageSparseOptimizer::catchUpWith(const VectorPtr vecs[],
+                                         const ParameterConfig& paraConfig,
+                                         size_t sparseId) const {
+  CHECK_LT(sparseId, t0Vec_.size());
+  int timediff = timer_ - t0Vec_[sparseId];
+  if (timediff > 0) {
+    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], timediff);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/AverageOptimizer.h b/paddle/legacy/parameter/AverageOptimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0fe2fd28e4be7df8ebc52fd9b9b5540f3d76949
--- /dev/null
+++ b/paddle/legacy/parameter/AverageOptimizer.h
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "FirstOrderOptimizer.h"
+
+namespace paddle {
+
+// After Optimization, parameter values are further averaged within
+// time range.
+class AverageOptimizer : public ParameterOptimizer {
+ public:
+  // if *useParameterApply* set, use PARAMETER_APPLY to store averaged parameter
+  // else use PARAMETER_VALUE, and value backup in PARAMETER_GRADIENT
+  AverageOptimizer(const OptimizationConfig& optConfig,
+                   ParameterOptimizer* optimizer,
+                   bool useParameterApply);
+
+  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
+                                    ParameterOptimizer* optimizer,
+                                    bool isParameterSparse = false,
+                                    bool useParameterApply = false);
+
+  virtual void init(size_t numRows, const ParameterConfig* config) {
+    optimizer_->init(numRows, config);
+  }
+
+  virtual void startPass() { optimizer_->startPass(); }
+  virtual void finishPass() {
+    optimizer_->finishPass();
+    updateAverageWindowLimit();
+  }
+
+  virtual void startBatch(int64_t numSamplesProcessed);
+  virtual void finishBatch();
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      size_t sparseId) const {
+    optimizer_->update(vecs, paraConfig, sparseId);
+    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], 1.0f);
+  }
+
+  virtual TraverseCallback needSpecialTraversal(
+      const ParameterConfig& config) const;
+
+  virtual TraverseCallback startCatchUpWith() const {
+    return optimizer_->startCatchUpWith();
+  }
+  virtual void finishCatchUpWith() { return optimizer_->finishCatchUpWith(); }
+
+  virtual TraverseCallback apply();
+  virtual TraverseCallback restore();
+
+  virtual void setNoDecay() { optimizer_->setNoDecay(); }
+
+ protected:
+  std::unique_ptr<ParameterOptimizer> optimizer_;
+  bool useApply_;
+
+  // should only be called from finishPass()
+  void updateAverageWindowLimit() {
+    if (!optConfig_.has_max_average_window()) {
+      // use the number of batches in the last pass as maxAverageWindow_
+      CHECK_GT(numUpdates_, prevNumUpdates_);
+      maxAverageWindow_ = numUpdates_ - prevNumUpdates_;
+      prevNumUpdates_ = numUpdates_;
+    }
+    minAverageWindow_ = std::min(minAverageWindow_, numUpdates_);
+  }
+
+  bool isAverageWindowTooLong() const {
+    return numAccumulates_ >= minAverageWindow_ &&
+           numAccumulates_ >=
+               std::min<int64_t>(maxAverageWindow_,
+                                 numUpdates_ * optConfig_.average_window());
+  }
+
+  static const int64_t kMaxNumAccumulates = 16384;
+  int64_t numUpdates_;
+  int64_t prevNumUpdates_;
+  int64_t numAccumulates_;
+  int64_t oldNumAccumulates_;
+  int64_t minAverageWindow_;
+  int64_t maxAverageWindow_;
+};
+
+// Average Optimizer with Sparse support.
+class AverageSparseOptimizer : public AverageOptimizer {
+ public:
+  AverageSparseOptimizer(const OptimizationConfig& optConfig,
+                         ParameterOptimizer* optimizer,
+                         bool useParameterApply)
+      : AverageOptimizer(optConfig, optimizer, useParameterApply) {}
+
+  virtual void init(size_t numRows, const ParameterConfig* config) {
+    AverageOptimizer::init(numRows, config);
+
+    t0Vec_.resize(numRows);
+
+    timer_ = 0;
+    t0Vec_.assign(t0Vec_.size(), 0);
+  }
+  virtual void finishBatch() {
+    AverageOptimizer::finishBatch();
+    timer_++;
+  }
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      size_t sparseId) const;
+  void catchUpWith(const VectorPtr vecs[],
+                   const ParameterConfig& paraConfig,
+                   size_t sparseId) const;
+  virtual TraverseCallback startCatchUpWith() const;
+  virtual void finishCatchUpWith() {
+    optimizer_->finishCatchUpWith();
+
+    timer_ = 0;
+    t0Vec_.assign(t0Vec_.size(), 0);
+  }
+
+ protected:
+  /**
+   *  counting batches, clear after catch up with
+   *  t(timer_) is current time,
+   *  t0(t0Vec_) are last occur time of i rows.
+   *  if one block is update by multi threads,
+   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
+   */
+  int timer_;
+  mutable std::vector<int32_t> t0Vec_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/CMakeLists.txt b/paddle/legacy/parameter/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..19ae07e077e2b8f55ce4050566c9cf6aaa0efa0a
--- /dev/null
+++ b/paddle/legacy/parameter/CMakeLists.txt
@@ -0,0 +1,11 @@
+# The utilities for paddle
+
+file(GLOB PARAMETERS_HEADERS . *.h)
+file(GLOB PARAMETERS_SOURCES . *.cpp)
+
+add_library(paddle_parameter STATIC
+        ${PARAMETERS_SOURCES})
+add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
+if(WITH_TESTING)
+    add_subdirectory(tests)
+endif()
diff --git a/paddle/legacy/parameter/FirstOrderOptimizer.cpp b/paddle/legacy/parameter/FirstOrderOptimizer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f82a115f7bb467737b53b9891d88d3c4f501faf
--- /dev/null
+++ b/paddle/legacy/parameter/FirstOrderOptimizer.cpp
@@ -0,0 +1,330 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FirstOrderOptimizer.h"
+#include "paddle/legacy/math/TrainingAlgorithmOp.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include <cmath>
+
+DEFINE_bool(log_clipping, false, "enable log clipping or not");
+
+namespace paddle {
+
+SparseMomentumParameterOptimizer::SparseMomentumParameterOptimizer(
+    const OptimizationConfig& optConfig)
+    : ParameterOptimizer(optConfig) {
+  addParameterType(PARAMETER_MOMENTUM);
+  addParameterType(PARAMETER_MOMENTUM_UT);
+  addParameterType(PARAMETER_MOMENTUM_VT);
+  alpha_ = 1;
+  beta_ = 1;
+  tau_ = -1;
+  threshold_ = 1e+06;
+}
+
+void SparseMomentumParameterOptimizer::init(size_t numRows,
+                                            const ParameterConfig* config) {
+  isParameterSparse_ = numRows != 0;
+  t0Vec_.resize(numRows);
+  t0Vec_.assign(t0Vec_.size(), 0);
+  timer_ = 0;
+  momentum_ = config->momentum();
+  decayRate_ = config->decay_rate();
+  gamma_ = config->learning_rate();
+}
+
+void SparseMomentumParameterOptimizer::startBatch(int64_t numSamplesProcessed) {
+  learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  if (isParameterSparse_) {
+    tau_ = tau_ + beta_ / alpha_;
+    alpha_ = alpha_ / momentum_;
+    beta_ = beta_ / (1 + decayRate_ * gamma_ * learningRate_);
+  }
+}
+
+void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[],
+                                              const ParameterConfig& paraConfig,
+                                              size_t sparseId) const {
+  if (sparseId != -1LU) {
+    CHECK_LT(sparseId, t0Vec_.size());
+    if (t0Vec_[sparseId] == 0) {
+      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
+      t0Vec_[sparseId] = 1;
+    }
+    vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
+                                     -alpha_ * gamma_ * learningRate_);
+    vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
+                                     tau_ * alpha_ * gamma_ * learningRate_);
+    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
+                               tau_ / beta_ + 1.0 / alpha_,
+                               *vecs[PARAMETER_MOMENTUM_VT],
+                               1.0 / beta_);
+
+  } else {
+    vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
+                                     *vecs[PARAMETER_MOMENTUM],
+                                     learningRate_ * paraConfig.learning_rate(),
+                                     paraConfig.momentum(),
+                                     applyDecay_ ? paraConfig.decay_rate() : 0);
+  }
+}
+
+ParameterOptimizer::TraverseCallback
+SparseMomentumParameterOptimizer::needSpecialTraversal(
+    const ParameterConfig& config) const {
+  if (alpha_ > threshold_ && isParameterSparse_) {
+    //  Restart to avoid large value multiplication
+    //  1. \alpha = 1, \beta = 1, \tau = 0
+    //  2. Note that \tau * u_t + v_t = \beta \theta_t, therefore:
+    //     u_t should be rescaled to u_t/alpha_
+    //     v_t should be reset to \theta_t
+    return [this](const VectorPtr vecs[],
+                  const ParameterConfig& config,
+                  size_t sparseId) {
+      vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_);
+      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
+    };
+  } else {
+    return nullptr;
+  }
+}
+
+void SparseMomentumParameterOptimizer::finishBatch() {
+  timer_++;
+  if (!isParameterSparse_) return;
+  if (alpha_ > threshold_) {
+    alpha_ = 1;
+    beta_ = 1;
+    tau_ = -1;
+  }
+}
+
+void AdagradParameterOptimizer::update(const VectorPtr vecs[],
+                                       const ParameterConfig& config,
+                                       size_t sparseId) const {
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
+
+  real epsilon = optConfig_.ada_epsilon();
+  real learningRate = learningRate_ * config.learning_rate();
+  real momentum = config.momentum();
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
+
+  adagradApply(value,
+               grad,
+               mom,
+               accum_buffer,
+               accum,
+               lr,
+               epsilon,
+               learningRate,
+               momentum,
+               decayRate);
+}
+
+ParameterOptimizer::TraverseCallback
+AdagradParameterOptimizer::needSpecialTraversal(
+    const ParameterConfig& config) const {
+  if (numUpdates_ % kMaxNumAccumulates == 0) {
+    // Move the sum to a different buffer to avoid loss of precision
+    // due to too many sums.
+    return [](const VectorPtr vecs[],
+              const ParameterConfig& config,
+              size_t sparseId) {
+      vecs[PARAMETER_GRADIENT_SQURESUM]->add(
+          *vecs[PARAMETER_GRADIENT_SQURESUM1]);
+      vecs[PARAMETER_GRADIENT_SQURESUM1]->zeroMem();
+    };
+  } else {
+    return nullptr;
+  }
+}
+
+void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
+                                        const ParameterConfig& config,
+                                        size_t sparseId) const {
+  CHECK(sparseId == -1LU) << "Sparse update is not supported";
+
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
+
+  real learningRate = learningRate_ * config.learning_rate();
+  real momentum = config.momentum();
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
+
+  adadeltaApply(value,
+                grad,
+                mom,
+                accum,
+                accum_update,
+                lr,
+                rou_,
+                epsilon_,
+                learningRate,
+                momentum,
+                decayRate);
+}
+
+void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
+                                       const ParameterConfig& config,
+                                       size_t sparseId) const {
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
+
+  real accumulatedRou = rou_;
+  bool firstTime = timer_ == 0;
+  if (sparseId != -1LU) {
+    CHECK_LT(sparseId, t0Vec_.size());
+    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
+    firstTime = t0Vec_[sparseId] == 0;
+    t0Vec_[sparseId] = timer_ + 1;
+  }
+
+  real epsilon = optConfig_.ada_epsilon();
+  real learningRate = learningRate_ * config.learning_rate();
+  real momentum = config.momentum();
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
+
+  rmspropApply(value,
+               grad,
+               mom,
+               sum,
+               sum1,
+               lr,
+               accumulatedRou,
+               rou_,
+               epsilon,
+               learningRate,
+               momentum,
+               decayRate,
+               firstTime);
+}
+
+void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
+                                              const ParameterConfig& config,
+                                              size_t sparseId) const {
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
+  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
+
+  real accumulatedRou = rou_;
+  bool firstTime = timer_ == 0;
+  if (sparseId != -1LU) {
+    CHECK_LT(sparseId, t0Vec_.size());
+    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
+    firstTime = t0Vec_[sparseId] == 0;
+    t0Vec_[sparseId] = timer_ + 1;
+  }
+
+  real epsilon = optConfig_.ada_epsilon();
+  real learningRate = learningRate_ * config.learning_rate();
+  real momentum = config.momentum();
+  real decayRate = applyDecay_ ? config.decay_rate() : 0;
+
+  decayedAdagradApply(value,
+                      grad,
+                      mom,
+                      sum,
+                      lr,
+                      accumulatedRou,
+                      rou_,
+                      epsilon,
+                      learningRate,
+                      momentum,
+                      decayRate,
+                      firstTime);
+}
+
+void AdamParameterOptimizer::update(const VectorPtr vecs[],
+                                    const ParameterConfig& config,
+                                    size_t sparseId) const {
+  CHECK(sparseId == -1UL) << "Sparse update is not supported";
+
+  real beta1_power = std::pow(beta1_, step_);
+  real beta2_power = std::pow(beta2_, step_);
+  real learningRate = config.learning_rate() * learningRate_;
+
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM];
+
+  adamApply(value,
+            grad,
+            mom,
+            v,
+            beta1_,
+            beta2_,
+            beta1_power,
+            beta2_power,
+            epsilon_,
+            learningRate);
+}
+
+void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
+                                      const ParameterConfig& config,
+                                      size_t sparseId) const {
+  CHECK(sparseId == -1UL) << "Sparse update is not supported";
+  real learningRate = config.learning_rate() * learningRate_;
+
+  BaseMatrix& value = *vecs[PARAMETER_VALUE];
+  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
+  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
+  BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM];
+
+  adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate);
+}
+
+void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
+                                           const ParameterConfig& config,
+                                           size_t sparseId) const {
+  real globalThreshold = optConfig_.gradient_clipping_threshold();
+  real localThreshold = config.gradient_clipping_threshold();
+
+  // Use local gradient clipping threshold if it's enabled,
+  // otherwise using the global one.
+  real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold;
+  std::string field = localThreshold > 0.0f ? "local" : "global";
+
+  real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
+  if (maxAbsGrad > threshold) {
+    if (FLAGS_log_clipping) {
+      real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
+                        vecs[PARAMETER_GRADIENT]->getSize();
+      LOG(INFO) << "parameter=" << config.name() << " need clipping by "
+                << field << " threshold=" << threshold
+                << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
+    }
+    vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
+  }
+  optimizer_->update(vecs, config, sparseId);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/FirstOrderOptimizer.h b/paddle/legacy/parameter/FirstOrderOptimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..86b9a591aff7a58aafa194c64cb09cd6636d0454
--- /dev/null
+++ b/paddle/legacy/parameter/FirstOrderOptimizer.h
@@ -0,0 +1,381 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ParameterOptimizer.h"
+#include "ParameterUpdateFunctions.h"
+#include "Regularizer.h"
+
+namespace paddle {
+
+// Plain SGD optimization.
+class SgdOptimizer : public ParameterOptimizer {
+ public:
+  explicit SgdOptimizer(const OptimizationConfig& optConfig)
+      : ParameterOptimizer(optConfig) {
+    addParameterType(PARAMETER_MOMENTUM);
+  }
+
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  }
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      size_t sparseId) const {
+    (void)sparseId;
+    real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
+                                  ? 1.0 - paraConfig.momentum()
+                                  : 1.0;
+#ifdef PADDLE_WITH_MKLDNN
+    sgdUpdate(learningRate_ * paraConfig.learning_rate() *
+                  (firstTime_ ? 1.0 : torch_learningRate),
+              paraConfig.momentum(),
+              applyDecay_ ? paraConfig.decay_rate() : 0,
+              vecs[PARAMETER_VALUE].get(),
+              vecs[PARAMETER_GRADIENT].get(),
+              vecs[PARAMETER_MOMENTUM].get());
+#else
+    vecs[PARAMETER_VALUE]->sgdUpdate(
+        *vecs[PARAMETER_GRADIENT],
+        *vecs[PARAMETER_MOMENTUM],
+        learningRate_ * paraConfig.learning_rate() *
+            (firstTime_ ? 1.0 : torch_learningRate),
+        paraConfig.momentum(),
+        applyDecay_ ? paraConfig.decay_rate() : 0);
+#endif
+  }
+  virtual void finishBatch() { firstTime_ = false; }
+};
+
+// SGD optimization with sparse support.
+class SparseMomentumParameterOptimizer : public ParameterOptimizer {
+  /* sparse momentum optimizer
+
+    update scheme:
+
+    \alpha_t = \alpha_{t-1} / k
+    \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
+    u_t = u_{t-1} - \alpha_t \gamma_t g_t
+    v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
+    \tau_t = \tau_{t-1} + \beta_t / \alpha_t
+
+    where:
+    k: momentum
+    lambda: decay rate
+    \gamma_t: learning rate at the t'th step
+  */
+
+ public:
+  explicit SparseMomentumParameterOptimizer(
+      const OptimizationConfig& optConfig);
+  virtual void init(size_t numRows, const ParameterConfig* config);
+  virtual void startBatch(int64_t numSamplesProcessed);
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      size_t sparseId) const;
+  virtual TraverseCallback needSpecialTraversal(
+      const ParameterConfig& config) const;
+  virtual void finishBatch();
+
+ private:
+  real alpha_;
+  real beta_;
+  real tau_;
+  real gamma_;
+  real threshold_;
+  real momentum_;
+  real decayRate_;
+
+ protected:
+  int64_t timer_;
+  mutable std::vector<int64_t> t0Vec_;
+  bool isParameterSparse_;
+};
+
+/*
+ * AdaGrad optimization.
+ * http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
+ */
+class AdagradParameterOptimizer : public ParameterOptimizer {
+ public:
+  explicit AdagradParameterOptimizer(const OptimizationConfig& optConfig)
+      : ParameterOptimizer(optConfig) {
+    addParameterType(PARAMETER_MOMENTUM);
+    addParameterType(PARAMETER_GRADIENT_SQURESUM);
+    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
+    addParameterType(PARAMETER_LEARNING_RATE);
+    numUpdates_ = 0;
+  }
+
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    (void)numSamplesProcessed;
+    ++numUpdates_;
+  }
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const;
+  virtual TraverseCallback needSpecialTraversal(
+      const ParameterConfig& config) const;
+
+ protected:
+  int64_t numUpdates_;
+  static const int64_t kMaxNumAccumulates = 16384;
+};
+
+/*
+ * AdaDelta Optimization.
+ * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
+ */
+class AdaDeltaParameterOptimizer : public ParameterOptimizer {
+ public:
+  explicit AdaDeltaParameterOptimizer(const OptimizationConfig& optConfig)
+      : ParameterOptimizer(optConfig) {
+    addParameterType(PARAMETER_MOMENTUM);
+    addParameterType(PARAMETER_GRADIENT_SQURESUM);
+    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
+    addParameterType(PARAMETER_LEARNING_RATE);
+    rou_ = optConfig.ada_rou();
+    epsilon_ = optConfig.ada_epsilon();
+  }
+
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  }
+
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const;
+
+ protected:
+  real rou_;
+  real epsilon_;
+};
+
+// RMSProp Parameter Optimization.
+class RMSPropParameterOptimizer : public ParameterOptimizer {
+ public:
+  explicit RMSPropParameterOptimizer(const OptimizationConfig& optConfig)
+      : ParameterOptimizer(optConfig) {
+    addParameterType(PARAMETER_MOMENTUM);
+    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
+    addParameterType(PARAMETER_GRADIENT_SQURESUM);
+    addParameterType(PARAMETER_LEARNING_RATE);
+    rou_ = optConfig.ada_rou();
+    epsilon_ = optConfig.ada_epsilon();
+  }
+
+  virtual void init(size_t numRows, const ParameterConfig* config) {
+    t0Vec_.resize(numRows);
+    t0Vec_.assign(t0Vec_.size(), 0);
+    timer_ = 0;
+  }
+
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  }
+  virtual void finishBatch() { timer_++; }
+
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const;
+
+ protected:
+  real rou_;
+  real epsilon_;
+
+  /**
+   *  counting batches, donot need catch up with
+   *  t(timer_) is current time,
+   *  t0(t0Vec_) are last occur time of i rows.
+   *  if one block is update by multi threads,
+   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
+   */
+  int64_t timer_;
+  mutable std::vector<int64_t> t0Vec_;
+};
+
+// Decayed AdaGrad Optimization.
+class DecayedAdagradParameterOptimizer : public ParameterOptimizer {
+ public:
+  explicit DecayedAdagradParameterOptimizer(const OptimizationConfig& optConfig)
+      : ParameterOptimizer(optConfig) {
+    addParameterType(PARAMETER_MOMENTUM);
+    addParameterType(PARAMETER_GRADIENT_SQURESUM);
+    addParameterType(PARAMETER_LEARNING_RATE);
+    rou_ = optConfig.ada_rou();
+    epsilon_ = optConfig.ada_epsilon();
+  }
+
+  virtual void init(size_t numRows, const ParameterConfig* config) {
+    t0Vec_.resize(numRows);
+    t0Vec_.assign(t0Vec_.size(), 0);
+    timer_ = 0;
+  }
+
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  }
+  virtual void finishBatch() { timer_++; }
+
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const;
+
+ protected:
+  real rou_;
+  real epsilon_;
+
+  /**
+   *  counting batches, donot need catch up with
+   *  t(timer_) is current time,
+   *  t0(t0Vec_) are last occur time of i rows.
+   *  if one block is update by multi threads,
+   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
+   */
+  int64_t timer_;
+  mutable std::vector<int64_t> t0Vec_;
+};
+
+/**
+ * Adam Optimizer.
+ * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 1
+ */
+class AdamParameterOptimizer : public ParameterOptimizer {
+ public:
+  explicit AdamParameterOptimizer(const OptimizationConfig& optConfig)
+      : ParameterOptimizer(optConfig),
+        beta1_(optConfig.adam_beta1()),
+        beta2_(optConfig.adam_beta2()),
+        epsilon_(optConfig.adam_epsilon()),
+        step_(1),
+        learningRate_(optConfig.learning_rate()) {
+    addParameterType(PARAMETER_MOMENTUM);
+    addParameterType(PARAMETER_SECOND_MOMENTUM);
+  }
+
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  }
+
+  virtual void finishBatch() { ++step_; }
+
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const;
+
+ protected:
+  real beta1_;
+  real beta2_;
+  real epsilon_;
+  int64_t step_;
+  real learningRate_;
+};
+
+/**
+ * AdaMax Optimizer.
+ * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 2
+ */
+class AdamaxParameterOptimizer : public ParameterOptimizer {
+ public:
+  explicit AdamaxParameterOptimizer(const OptimizationConfig& optConfig)
+      : ParameterOptimizer(optConfig),
+        beta1_(optConfig.adam_beta1()),
+        beta2_(optConfig.adam_beta2()),
+        step_(1),
+        learningRate_(optConfig.learning_rate()) {
+    addParameterType(PARAMETER_MOMENTUM);
+    addParameterType(PARAMETER_WEIGHTED_INFINITY_NORM);
+  }
+
+  virtual void finishBatch() { ++step_; }
+
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const;
+
+ protected:
+  real beta1_;
+  real beta2_;
+  int64_t step_;
+  real learningRate_;
+};
+
+// Used in pserver,
+// when PARAMETER_DELTA stores in PARAMETER_GRADIENT.
+class AddOptimizer : public ParameterOptimizer {
+ public:
+  explicit AddOptimizer(const OptimizationConfig& optConfig)
+      : ParameterOptimizer(optConfig) {}
+
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    // learningRate required by regularizer
+    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  }
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      size_t sparseId) const {
+    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_GRADIENT],
+                               optConfig_.delta_add_rate());
+  }
+};
+
+// A optimizer which does nothing.
+class DummyOptimizer : public ParameterOptimizer {
+ public:
+  explicit DummyOptimizer(const OptimizationConfig& optConfig)
+      : ParameterOptimizer(optConfig) {}
+
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      size_t sparseId) const {}
+};
+
+// Do gradient clipping before sgd update
+class OptimizerWithGradientClipping : public ParameterOptimizer {
+ public:
+  OptimizerWithGradientClipping(const OptimizationConfig& optConfig,
+                                ParameterOptimizer* optimizer)
+      : ParameterOptimizer(optConfig), optimizer_(optimizer) {
+    parameterTypes_ = optimizer_->getParameterTypes();
+  }
+
+  virtual void init(size_t numRows, const ParameterConfig* config) {
+    optimizer_->init(numRows, config);
+  }
+
+  virtual void startPass() { optimizer_->startPass(); }
+  virtual void finishPass() { optimizer_->finishPass(); }
+
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    optimizer_->startBatch(numSamplesProcessed);
+    learningRate_ = optimizer_->getLearningRate();
+  }
+  virtual void finishBatch() { optimizer_->finishBatch(); }
+
+  virtual TraverseCallback needSpecialTraversal(
+      const ParameterConfig& config) const {
+    return optimizer_->needSpecialTraversal(config);
+  }
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const;
+
+  virtual void setNoDecay() { optimizer_->setNoDecay(); }
+
+ protected:
+  std::unique_ptr<ParameterOptimizer> optimizer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/LearningRateScheduler.cpp b/paddle/legacy/parameter/LearningRateScheduler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..68c44a7ec49f64a1085609d906441c9ed4502888
--- /dev/null
+++ b/paddle/legacy/parameter/LearningRateScheduler.cpp
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "LearningRateScheduler.h"
+#include "paddle/legacy/utils/StringUtil.h"
+
+namespace paddle {
+
+ClassRegistrar<LearningRateScheduler, OptimizationConfig>
+    LearningRateScheduler::registrar_;
+
+LearningRateScheduler* LearningRateScheduler::create(
+    const OptimizationConfig& config) {
+  return registrar_.createByType(config.learning_rate_schedule(), config);
+}
+
+// LRS stands for LearningRateScheduler
+
+class BaseLRS : public LearningRateScheduler {
+ public:
+  explicit BaseLRS(const OptimizationConfig& config)
+      : learningRate_(config.learning_rate()),
+        a_(config.learning_rate_decay_a()),
+        b_(config.learning_rate_decay_b()) {}
+
+ protected:
+  real learningRate_;
+  real a_;
+  real b_;
+};
+
+class ConstLRS : public BaseLRS {
+ public:
+  explicit ConstLRS(const OptimizationConfig& config) : BaseLRS(config) {}
+  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
+    return learningRate_;
+  }
+};
+REGISTER_LEARNING_RATE_SCHEDULER(constant, ConstLRS);
+
+class PolyLRS : public BaseLRS {
+ public:
+  explicit PolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
+  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
+    return learningRate_ * pow(1.0 + a_ * numSamplesProcessed, -b_);
+  }
+};
+REGISTER_LEARNING_RATE_SCHEDULER(poly, PolyLRS);
+
+class CaffePolyLRS : public BaseLRS {
+ public:
+  explicit CaffePolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
+  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
+    if (numSamplesProcessed > a_) {
+      LOG_FIRST_N(WARNING, 1)
+          << "Using caffe_poly learning rate schedule, "
+          << "learning rate hits ZERO when "
+          << "numSamplesProcessed > config.learning_rate_decay_b(), "
+          << "training is over and you can stop it. "
+          << "See common/LearningRateScheduler.cpp for more info.";
+      return 0;
+    } else {
+      return learningRate_ * pow(1.0 - numSamplesProcessed / a_, b_);
+    }
+  }
+};
+REGISTER_LEARNING_RATE_SCHEDULER(caffe_poly, CaffePolyLRS);
+
+class ExpLRS : public BaseLRS {
+ public:
+  explicit ExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
+  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
+    double decayRatio = (double)numSamplesProcessed / b_;
+    return learningRate_ * pow(a_, decayRatio);
+  }
+};
+REGISTER_LEARNING_RATE_SCHEDULER(exp, ExpLRS);
+
+class DiscreteExpLRS : public BaseLRS {
+ public:
+  explicit DiscreteExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
+  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
+    int numDecays = floor(numSamplesProcessed / b_);
+    return learningRate_ * pow(a_, numDecays);
+  }
+};
+REGISTER_LEARNING_RATE_SCHEDULER(discexp, DiscreteExpLRS);
+
+class LinearLRS : public BaseLRS {
+ public:
+  explicit LinearLRS(const OptimizationConfig& config) : BaseLRS(config) {}
+  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
+    return std::max(learningRate_ - a_ * numSamplesProcessed, b_);
+  }
+};
+REGISTER_LEARNING_RATE_SCHEDULER(linear, LinearLRS);
+
+/*
+  specify learning rate through
+  learning_rate_args = 'seg0:rate0,seg1:rate1,...,segK:rateK'
+  if seg_{i-1} <= numSamples <= seg_i,
+  then learning_rate = learning_rate_base * rate_i
+*/
+class ManualLRS : public BaseLRS {
+ public:
+  explicit ManualLRS(const OptimizationConfig& config)
+      : BaseLRS(config), currentSegment_(0), lastNum_(0) {
+    std::vector<std::string> pieces;
+    str::split(config.learning_rate_args(), ',', &pieces);
+    rates_.reserve(pieces.size());
+    std::string s1, s2;
+
+    for (auto& piece : pieces) {
+      auto pos = piece.find(':');
+      CHECK(pos != std::string::npos) << "Wrong format for learning_rate_args: "
+                                      << config.learning_rate_args();
+      segments_.push_back(str::to<int64_t>(piece.substr(0, pos)));
+      rates_.push_back(str::to<real>(piece.substr(pos + 1)));
+    }
+  }
+
+  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
+    return calc(numSamplesProcessed);
+  }
+
+  real calc(int64_t num) {
+    // We assume that num never decreases.
+    CHECK_LE(lastNum_, num);
+    lastNum_ = num;
+    while (currentSegment_ < rates_.size()) {
+      if (num <= segments_[currentSegment_]) {
+        return learningRate_ * rates_[currentSegment_];
+      }
+      ++currentSegment_;
+      if (currentSegment_ < rates_.size()) {
+        LOG(INFO) << " learning_rate changes to "
+                  << learningRate_ * rates_[currentSegment_];
+      }
+    }
+    return learningRate_ * rates_.back();
+  }
+
+ protected:
+  std::vector<real> rates_;
+  std::vector<int64_t> segments_;
+  size_t currentSegment_;
+  int64_t lastNum_;
+};
+
+REGISTER_LEARNING_RATE_SCHEDULER(manual, ManualLRS);
+
+class PassManualLRS : public ManualLRS {
+ public:
+  explicit PassManualLRS(const OptimizationConfig& config)
+      : ManualLRS(config) {}
+  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
+    return calc(pass);
+  }
+};
+
+REGISTER_LEARNING_RATE_SCHEDULER(pass_manual, PassManualLRS);
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/LearningRateScheduler.h b/paddle/legacy/parameter/LearningRateScheduler.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc7e380a6af58577f4ba319d85522535b8f93a45
--- /dev/null
+++ b/paddle/legacy/parameter/LearningRateScheduler.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "TrainerConfig.pb.h"
+#include "paddle/legacy/utils/ClassRegistrar.h"
+
+namespace paddle {
+// NOLINTNEXTLINES_4
+#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \
+  static InitFunction __reg_type_##__type_name([]() {               \
+    LearningRateScheduler::registrar_.registerClass<__class_name>(  \
+        #__type_name);                                              \
+  })
+
+class LearningRateScheduler {
+ public:
+  static LearningRateScheduler* create(const OptimizationConfig& config);
+  virtual ~LearningRateScheduler() {}
+  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) = 0;
+
+  static ClassRegistrar<LearningRateScheduler, OptimizationConfig> registrar_;
+};
+
+}  // namespace paddle
diff --git a/paddle/parameter/OptimizerFunctions.cpp b/paddle/legacy/parameter/OptimizerFunctions.cpp
similarity index 100%
rename from paddle/parameter/OptimizerFunctions.cpp
rename to paddle/legacy/parameter/OptimizerFunctions.cpp
diff --git a/paddle/parameter/OptimizerFunctions.h b/paddle/legacy/parameter/OptimizerFunctions.h
similarity index 100%
rename from paddle/parameter/OptimizerFunctions.h
rename to paddle/legacy/parameter/OptimizerFunctions.h
diff --git a/paddle/parameter/OptimizerWithRegularizer.cpp b/paddle/legacy/parameter/OptimizerWithRegularizer.cpp
similarity index 100%
rename from paddle/parameter/OptimizerWithRegularizer.cpp
rename to paddle/legacy/parameter/OptimizerWithRegularizer.cpp
diff --git a/paddle/legacy/parameter/OptimizerWithRegularizer.h b/paddle/legacy/parameter/OptimizerWithRegularizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd29b3966324b2e206cfe56cc15678539d1e870e
--- /dev/null
+++ b/paddle/legacy/parameter/OptimizerWithRegularizer.h
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "FirstOrderOptimizer.h"
+
+namespace paddle {
+
+// add regularizer for objective function to do optimization
+class OptimizerWithRegularizer : public ParameterOptimizer {
+ public:
+  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
+                                    const ParameterConfig& paraConfig,
+                                    bool isParameterSparse,
+                                    bool inPserver);
+
+  OptimizerWithRegularizer(const OptimizationConfig& optConfig,
+                           ParameterOptimizer* optimizer,
+                           Regularizer* regularizer)
+      : ParameterOptimizer(optConfig),
+        optimizer_(optimizer),
+        regularizer_(regularizer) {
+    parameterTypes_ = optimizer_->getParameterTypes();
+  }
+
+  virtual void init(size_t numRows, const ParameterConfig* config) {
+    optimizer_->init(numRows, config);
+  }
+
+  virtual void startPass() {
+    optimizer_->startPass();
+    timer_ = 0;
+  }
+
+  virtual void finishPass() { optimizer_->finishPass(); }
+
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    optimizer_->startBatch(numSamplesProcessed);
+  }
+
+  virtual void finishBatch() {
+    optimizer_->finishBatch();
+    ++timer_;
+  }
+
+  virtual TraverseCallback needSpecialTraversal(
+      const ParameterConfig& config) const {
+    return optimizer_->needSpecialTraversal(config);
+  }
+
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const {
+    optimizer_->update(vecs, config, sparseId);
+    regularizer_->update(vecs, config, optimizer_->getLearningRate(), 0, 1);
+  }
+
+ protected:
+  std::unique_ptr<ParameterOptimizer> optimizer_;
+  Regularizer* regularizer_;
+
+  /**
+   *  counting batches, clear after catch up with
+   *  t(timer_) is current time,
+   *  t0(t0Vec_) are last occur time of i rows.
+   *  if one block is update by multi threads,
+   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
+   */
+  int timer_;
+};
+
+// Regularized Loss function for every num of batches
+class OptimizerWithRegularizerEveryNumBatches
+    : public OptimizerWithRegularizer {
+ public:
+  OptimizerWithRegularizerEveryNumBatches(const OptimizationConfig& optConfig,
+                                          ParameterOptimizer* optimizer,
+                                          Regularizer* regularizer)
+      : OptimizerWithRegularizer(optConfig, optimizer, regularizer) {}
+
+  virtual void startPass() {
+    OptimizerWithRegularizer::startPass();
+    baseTimer_ = 0;
+  }
+
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const {
+    optimizer_->update(vecs, config, sparseId);
+  }
+
+  virtual TraverseCallback needSpecialTraversal(
+      const ParameterConfig& config) const;
+  void doTraversal(const VectorPtr vecs[], const ParameterConfig& config) const;
+
+  void catchUpWith(const VectorPtr vecs[],
+                   const ParameterConfig& config,
+                   size_t sparseId) const;
+
+  virtual TraverseCallback startCatchUpWith() const;
+  virtual void finishCatchUpWith() { baseTimer_ = timer_; }
+
+ protected:
+  bool isRegularizationBatch(const ParameterConfig& config) const {
+    return ((timer_ + 1) % config.num_batches_regularization() == 0);
+  }
+
+  /**
+   *  recored the timer_ value while catchUpWith called.
+   */
+  int baseTimer_;
+};
+
+// Regularized Loss function with Sparse support
+class OptimizerWithRegularizerSparse : public OptimizerWithRegularizer {
+ public:
+  OptimizerWithRegularizerSparse(const OptimizationConfig& optConfig,
+                                 ParameterOptimizer* optimizer,
+                                 Regularizer* regularizer)
+      : OptimizerWithRegularizer(optConfig, optimizer, regularizer) {}
+
+  virtual void init(size_t numRows, const ParameterConfig* config);
+
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId) const;
+  void catchUpWith(const VectorPtr vecs[],
+                   const ParameterConfig& config,
+                   size_t sparseId) const;
+  virtual TraverseCallback startCatchUpWith() const;
+  virtual void finishCatchUpWith() {
+    timer_ = 0;
+    t0Vec_.assign(t0Vec_.size(), 0);
+  }
+
+ protected:
+  /**
+   *  t0Vec_ are last occur time of i rows
+   *  if one block is update by multi threads,
+   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
+   */
+  mutable std::vector<int32_t> t0Vec_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/Parameter.cpp b/paddle/legacy/parameter/Parameter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..666d808f0c13c5c828c51b2a36ee9d05f7f78c13
--- /dev/null
+++ b/paddle/legacy/parameter/Parameter.cpp
@@ -0,0 +1,425 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Parameter.h"
+#include <gflags/gflags.h>
+#include <fstream>
+#include "AverageOptimizer.h"
+#include "FirstOrderOptimizer.h"
+#include "OptimizerFunctions.h"
+#include "OptimizerWithRegularizer.h"
+#include "ParameterUpdateFunctions.h"
+#include "ThreadLocalBuffer.h"
+#include "hl_gpu.h"
+#include "paddle/legacy/math/CpuSparseMatrix.h"
+#include "paddle/legacy/math/MathUtils.h"
+#include "paddle/legacy/math/SparseRowMatrix.h"
+#include "paddle/legacy/utils/Logging.h"
+
+DEFINE_int32(enable_grad_share,
+             (100 * 1024 * 1024),
+             "threshold for enable gradient parameter share for batch "
+             "multi-cpu training");
+DEFINE_int32(
+    grad_share_block_num,
+    64,
+    "block number of gradient parameter share for batch multi-cpu training");
+
+namespace paddle {
+
+const std::string Parameter::kMissParameterFail = "fail";
+const std::string Parameter::kMissParameterRand = "rand";
+const std::string Parameter::kMissParameterZero = "zero";
+
+Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit)
+    : config_(config),
+      useGpu_(useGpu),
+      deviceId_(-1),
+      sharedCount_(0),
+      updateCounter_(0),
+      updated_(false),
+      headerFormat_(PARAM_FORMAT_ORIGINAL) {
+  setID(-1); /* capture uninitialized id */
+  if (useGpu_ && FLAGS_parallel_nn) {
+    /* gpu environment is specified by device property */
+    deviceId_ = config_.device();
+    if (deviceId_ < 0) {
+      useGpu_ = false;
+    }
+  }
+
+  if (doInit) {
+    initialize();
+  }
+
+  for (int i = 0; i < config.update_hooks_size(); ++i) {
+    this->updaterHooks_.push_back(IParameterUpdaterHook::create(config, i));
+  }
+}
+
+void Parameter::initialize() {
+  SetDevice device(deviceId_);
+
+  bufs_[PARAMETER_VALUE] =
+      Vector::createParallelVector(config_.size(), useGpu_);
+  bufs_[PARAMETER_VALUE]->zeroMem();
+
+  if (config_.is_sparse()) {
+    enableSparseParameter();
+  }
+
+  if (!isStatic()) {
+    bufs_[PARAMETER_GRADIENT] =
+        Vector::createParallelVector(config_.size(), useGpu_);
+    bufs_[PARAMETER_MOMENTUM] =
+        Vector::createParallelVector(config_.size(), useGpu_);
+
+    bufs_[PARAMETER_GRADIENT]->zeroMem();
+    bufs_[PARAMETER_MOMENTUM]->zeroMem();
+  }
+}
+
+void Parameter::randomize(const VectorPtr& value,
+                          const ParameterConfig& config) {
+  if (PARAMETER_INIT_UNIFORM == config.initial_strategy()) {
+    // initialize the parameter as uniform distribution
+    real initial_min = config.initial_mean() - config.initial_std();
+    real initial_max = config.initial_mean() + config.initial_std();
+    value->uniform(initial_min, initial_max);
+    VLOG(1) << config.name() << ": initial_min=" << initial_min
+            << ", initial_max=" << initial_max;
+  } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) {
+    /* Initialize the parameters randomly */
+    value->randnorm(config.initial_mean(), config.initial_std());
+    VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean()
+            << ", initial_std=" << config.initial_std();
+  } else {
+    LOG(FATAL) << "not supported initial_strategy: "
+               << config.initial_strategy();
+  }
+}
+
+void Parameter::randomize() {
+  if (!bufs_[PARAMETER_VALUE]) return;
+  SetDevice device(deviceId_);
+  Parameter::randomize(bufs_[PARAMETER_VALUE], config_);
+
+  if (config_.is_sparse()) {
+    if (format_ == SPARSE_CSC) {
+      sparseRand(intBufs_[PARAMETER_COLS]->getData(),
+                 intBufs_[PARAMETER_ROWS]->getData(),
+                 config_.size(),
+                 config_.dims(1) + 1,
+                 config_.dims(0),
+                 useGpu_);
+    } else {
+      sparseRand(intBufs_[PARAMETER_ROWS]->getData(),
+                 intBufs_[PARAMETER_COLS]->getData(),
+                 config_.size(),
+                 config_.dims(0) + 1,
+                 config_.dims(1),
+                 useGpu_);
+    }
+  }
+  setValueUpdated();
+}
+
+void Parameter::zeroMem() {
+  if (!bufs_[PARAMETER_VALUE]) return;
+  bufs_[PARAMETER_VALUE]->zeroMem();
+  setValueUpdated();
+  LOG(INFO) << getName() << " set to 0";
+}
+
+bool Parameter::isGradShared(size_t* blockNum) {
+  if (!useGpu_ && !isStatic() && FLAGS_enable_grad_share > 0 &&
+      !isGradSparseUpdate() &&
+      this->getSize() > (size_t)FLAGS_enable_grad_share) {
+    if (blockNum) {
+      *blockNum = (size_t)FLAGS_grad_share_block_num;
+    }
+    return true;
+  }
+  return false;
+}
+
+bool Parameter::isValueShared() {
+  return !useGpu_ && config_.is_shared() && FLAGS_trainer_count > 1;
+}
+
+bool Parameter::isGradSparseUpdate() const {
+  return !useGpu_ && !isStatic() &&
+         (config_.sparse_update() || config_.sparse_remote_update());
+}
+
+void Parameter::setMat(ParameterType pType, int matType) {
+  CHECK(!mats_[pType]);
+
+  if (config_.dims_size() == 0 && matType == MAT_NORMAL) {
+    return;
+  }
+
+  CHECK_EQ((size_t)config_.dims_size(), 2LU);
+  size_t height = config_.dims(0);
+  size_t width = config_.dims(1);
+  if (matType == MAT_NORMAL) {
+    if (!config_.is_sparse()) {
+      CHECK_EQ(height * width, bufs_[pType]->getSize());
+      mats_[pType] =
+          Matrix::create(bufs_[pType]->getMemoryHandle(), height, width);
+    } else {
+      size_t size = bufs_[pType]->getSize();
+      CHECK_GE(height * width, size);
+      if (format_ == SPARSE_CSR) {
+        CHECK_EQ(height + 1, intBufs_[PARAMETER_ROWS]->getSize());
+        CHECK_EQ(size, intBufs_[PARAMETER_COLS]->getSize());
+      } else {
+        CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize());
+        CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize());
+      }
+      mats_[pType] =
+          Matrix::createSparseMatrix(bufs_[pType]->getData(),
+                                     intBufs_[PARAMETER_ROWS]->getData(),
+                                     intBufs_[PARAMETER_COLS]->getData(),
+                                     height,
+                                     width,
+                                     bufs_[pType]->getSize(),
+                                     FLOAT_VALUE,
+                                     format_,
+                                     false,
+                                     useGpu_);
+    }
+  }
+#ifndef PADDLE_MOBILE_INFERENCE
+  // NOLINTNEXTLINE
+  else if (matType == MAT_NORMAL_SHARED) {
+    CHECK_EQ(height * width, bufs_[pType]->getSize());
+    size_t blockNum = 0;
+    CHECK(isGradShared(&blockNum));
+    mats_[pType] = std::make_shared<SharedCpuMatrix>(
+        blockNum,
+        std::dynamic_pointer_cast<CpuMemoryHandle>(
+            bufs_[pType]->getMemoryHandle()),
+        height,
+        width);
+  } else if (matType == MAT_VALUE_SHARED) {
+    CHECK_EQ(height * width, bufs_[pType]->getSize());
+    mats_[pType] = std::make_shared<SharedCpuMatrix>(
+        std::dynamic_pointer_cast<CpuMemoryHandle>(
+            bufs_[pType]->getMemoryHandle()),
+        height,
+        width);
+  } else if (matType == MAT_SPARSE_ROW_IDS) {
+    CHECK_EQ(height * width, bufs_[pType]->getSize());
+    mats_[pType] = std::make_shared<SparseRowIdsCpuMatrix>(
+        std::dynamic_pointer_cast<CpuMemoryHandle>(
+            bufs_[pType]->getMemoryHandle()),
+        height,
+        width);
+  } else if (matType == MAT_SPARSE_ROW) {
+    auto valueMat =
+        std::dynamic_pointer_cast<SparseRowCpuMatrix>(mats_[PARAMETER_VALUE]);
+    SparseRowCpuMatrix::IndexDictPtr indexDict(nullptr);
+    if (pType != PARAMETER_VALUE) {
+      CHECK(valueMat) << "The matrix for PARAMETER_VALUE must be set "
+                      << " and its type must be MAT_SPARSE_ROW,"
+                      << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW";
+      indexDict = valueMat->getIndexDictHandle();
+    }
+    auto mat =
+        std::make_shared<SparseRowCpuMatrix>(nullptr,
+                                             height,
+                                             width,
+                                             // grad share index with value
+                                             indexDict);
+    mats_[pType] = mat;
+  } else if (matType == MAT_CACHE_ROW) {
+    CHECK(isGradSparseUpdate());
+    auto mat = std::make_shared<CacheRowCpuMatrix>(height, width);
+    mats_[pType] = mat;
+  } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE ||
+             matType == MAT_SPARSE_ROW_PREFETCH) {
+    auto mat = std::make_shared<SparsePrefetchRowCpuMatrix>(
+        bufs_[pType] ? std::dynamic_pointer_cast<CpuMemoryHandle>(
+                           bufs_[pType]->getMemoryHandle())
+                     : nullptr,
+        height,
+        width,
+        nullptr,  // indexDictHandle
+        getGlobalSyncThreadPool());
+    mats_[pType] = mat;
+  } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
+    CHECK(isGradSparseUpdate());
+    mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
+  }
+#endif
+  // NOLINTNEXTLINE
+  else {
+    LOG(FATAL) << "Unsupported mat type" << matType;
+  }
+}
+
+void Parameter::incUpdate(const UpdateCallback& callback) {
+  // Static parameter is fixed, and does not need to be updated
+  if (isStatic()) {
+    return;
+  }
+
+  ++updateCounter_;
+  if (isUpdatable()) {
+    if (callback) callback(this);
+    clearUpdate();
+  }
+}
+
+bool Parameter::save(const std::string& filename) const {
+  std::ofstream fs(filename, std::ios_base::binary);
+  CHECK(fs) << "Fail to open " << filename;
+  return save(fs);
+}
+
+bool Parameter::save(std::ostream& s) const {
+  CpuVector vec(*bufs_[PARAMETER_VALUE].get());
+  Header header;
+  header.format = headerFormat_;
+  header.valueSize = sizeof(real);
+  header.size = getSize();
+
+  CHECK_EQ(header.size, vec.getSize());
+
+  CHECK(s.write(reinterpret_cast<char*>(&header), sizeof(header)))
+      << "Fail to write parameter " << getName();
+
+  CHECK(s.write(reinterpret_cast<char*>(vec.getData()),
+                header.size * sizeof(real)))
+      << "Fail to write parameter " << getName();
+  if (config_.is_sparse()) {
+    CpuIVector rows(*intBufs_[PARAMETER_ROWS].get());
+    CpuIVector cols(*intBufs_[PARAMETER_COLS].get());
+    CHECK(s.write(reinterpret_cast<char*>(rows.getData()),
+                  rows.getSize() * sizeof(int)))
+        << "Fail to write parameter " << getName();
+    CHECK(s.write(reinterpret_cast<char*>(cols.getData()),
+                  cols.getSize() * sizeof(int)))
+        << "Fail to write parameter " << getName();
+  }
+
+  return true;
+}
+
+/**
+ * Load parameter value from a file
+ */
+bool Parameter::load(const std::string& filename) {
+  std::ifstream fs(filename, std::ios_base::binary);
+  if (!fs) {
+    LOG(INFO) << "missing parameters [" << filename << "] while loading model.";
+    if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) {
+      LOG(FATAL) << getName() << " missing, not allowed.";
+      return false;
+    }
+    if (kMissParameterRand == FLAGS_load_missing_parameter_strategy) {
+      LOG(INFO) << getName() << " missing, set to random.";
+      randomize();
+      return true;
+    }
+    if (kMissParameterZero == FLAGS_load_missing_parameter_strategy) {
+      LOG(INFO) << getName() << " missing, set to zero.";
+      zeroMem();
+      return true;
+    }
+    LOG(FATAL) << "unsupported load_missing_parameter_strategy: "
+               << FLAGS_load_missing_parameter_strategy;
+    return false;
+  }
+  return load(fs);
+}
+
+bool Parameter::load(std::istream& s) {
+  CpuVector vec(*bufs_[PARAMETER_VALUE].get());
+  Header header;
+  CHECK(s.read(reinterpret_cast<char*>(&header), sizeof(header)))
+      << "Fail to read parameter " << getName();
+  CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: "
+                                                << header.format;
+  headerFormat_ = header.format;
+  CHECK_EQ(header.size, getSize())
+      << "The size (" << header.size << ") in the file does not match the size "
+      << "(" << getSize() << ") of the parameter: " << getName();
+  CHECK_EQ(header.valueSize, sizeof(real))
+      << "Unsupported valueSize " << header.valueSize << " at: " << getName();
+  CHECK(s.read(reinterpret_cast<char*>(vec.getData()),
+               header.size * sizeof(real)));
+
+  auto& tmp = *bufs_[PARAMETER_VALUE].get();
+  if (typeid(tmp) == typeid(GpuVector)) {
+    bufs_[PARAMETER_VALUE]->copyFrom(vec);
+  }
+
+  if (config_.is_sparse() && config_.need_compact()) {
+    // load from dense parameter with many zero
+    CHECK_EQ(config_.dims_size(), 2);
+    auto height = config_.dims(0);
+    auto width = config_.dims(1);
+    auto mat = Matrix::create(vec.getData(), height, width);
+    CpuSparseMatrix sparseMat(height,
+                              width,
+                              0,
+                              FLOAT_VALUE,
+                              format_,
+                              /*trans*/ false);
+    sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT);
+    auto nnz = sparseMat.getElementCnt();
+    size_t rowSize = (format_ == SPARSE_CSR) ? height + 1 : nnz;
+    size_t colSize = (format_ == SPARSE_CSR) ? nnz : width + 1;
+
+    intBufs_[PARAMETER_ROWS]->copyFrom(sparseMat.getRows(), rowSize);
+    intBufs_[PARAMETER_COLS]->copyFrom(sparseMat.getCols(), colSize);
+    bufs_[PARAMETER_VALUE]->resize(nnz);  // for setMat check
+    bufs_[PARAMETER_VALUE]->copyFrom(sparseMat.getValue(), nnz);
+    config_.set_size(nnz);
+    LOG(INFO) << "compact nnz=" << (1. * nnz / (height * width))
+              << " name=" << config_.name();
+  } else if (config_.is_sparse()) {
+    CpuIVector rows(*intBufs_[PARAMETER_ROWS].get());
+    CpuIVector cols(*intBufs_[PARAMETER_COLS].get());
+    size_t rowSize, colSize;
+    CHECK_EQ(config_.dims_size(), 2);
+    if (format_ == SPARSE_CSR) {
+      rowSize = config_.dims(0) + 1;
+      colSize = config_.size();
+    } else {
+      rowSize = config_.size();
+      colSize = config_.dims(1) + 1;
+    }
+    CHECK(
+        s.read(reinterpret_cast<char*>(rows.getData()), rowSize * sizeof(int)));
+    CHECK(
+        s.read(reinterpret_cast<char*>(cols.getData()), colSize * sizeof(int)));
+    auto& paramRows = *intBufs_[PARAMETER_ROWS].get();
+    if (typeid(paramRows) == typeid(GpuIVector)) {
+      intBufs_[PARAMETER_ROWS]->copyFrom(rows);
+    }
+    auto& paramCols = *intBufs_[PARAMETER_COLS].get();
+    if (typeid(paramCols) == typeid(GpuIVector)) {
+      intBufs_[PARAMETER_COLS]->copyFrom(cols);
+    }
+  }
+
+  setValueUpdated();
+
+  return true;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/Parameter.h b/paddle/legacy/parameter/Parameter.h
new file mode 100644
index 0000000000000000000000000000000000000000..43b567dad045ad786b1b3f2d3614072f58310527
--- /dev/null
+++ b/paddle/legacy/parameter/Parameter.h
@@ -0,0 +1,380 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "ParameterConfig.pb.h"
+#include "TrainerConfig.pb.h"
+
+#include "ParameterUpdaterHook.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+typedef enum {
+  /// The paddle original basic format
+  PARAM_FORMAT_ORIGINAL = 0,
+
+  /// See mkldnn_memory_format_t in
+  /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h
+  /// for a detailed description.
+  /// 2D weights tensor in the format (output channels, input channels).
+  PARAM_FORMAT_MKLDNN_OI,
+
+  /// The total format items numbers
+  PARAM_FORMAT_ITEMS,
+} PARAM_FORMAT;
+
+class SparsePrefetchRowCpuMatrix;
+
+class Parameter;
+typedef std::function<void(Parameter* param)> UpdateCallback;
+typedef std::function<void(int paramId, Parameter* param)> ParamInitCallback;
+
+class Parameter;
+typedef std::shared_ptr<Parameter> ParameterPtr;
+
+class Parameter {
+ public:
+  Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true);
+  const std::string& getName() const { return config_.name(); }
+
+  size_t getSize() const { return config_.size(); }
+
+  bool isFullSize() const {
+    if (bufs_[PARAMETER_VALUE]) {
+      return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
+    }
+    return false;
+  }
+
+  inline bool useGpu() const { return useGpu_; }
+
+  int getDeviceId() const { return deviceId_; }
+
+  void setDevice(int deviceId) { deviceId_ = deviceId; }
+
+  /// The id ranges from 0 to the_total_number_of_parameters - 1
+  size_t getID() const { return config_.para_id(); }
+
+  /// ID is a implict value created until neural network is built.
+  void setID(size_t id) { config_.set_para_id(id); }
+
+  bool isStatic() const { return config_.is_static(); }
+
+  enum MatType {
+    MAT_NORMAL,
+    /// both value and grad are shared
+    MAT_NORMAL_SHARED,
+
+    /// Now used in BatchNorm in CPU mode
+    MAT_VALUE_SHARED,
+
+    /// sparse matrix, which has full size parameter
+    MAT_SPARSE_ROW_IDS,
+    /// sparse matrix, parameter size scale by sparse rates.
+    MAT_SPARSE_ROW_AUTO_GROW,
+    MAT_CACHE_ROW,
+    MAT_SPARSE_ROW,
+
+    /// sparse matrix for prefetching parameter from pserver
+    MAT_SPARSE_ROW_PREFETCH,
+    /// same as above, but parameter has full size for saving parameter in local
+    MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
+  };
+
+  void enableSparseParameter() {
+    if (config_.is_sparse()) {
+      if (config_.format() == "csr") {
+        size_t height = config_.dims(0);
+        size_t nnz = config_.size();
+        enableIntType(PARAMETER_ROWS, height + 1);
+        enableIntType(PARAMETER_COLS, nnz);
+        format_ = SPARSE_CSR;
+      } else {
+        size_t width = config_.dims(1);
+        size_t nnz = config_.size();
+        enableIntType(PARAMETER_COLS, width + 1);
+        enableIntType(PARAMETER_ROWS, nnz);
+        format_ = SPARSE_CSC;
+      }
+    }
+  }
+
+  /// allocate buffer for the give type
+  void enableType(ParameterType type, MatType matType = MAT_NORMAL) {
+    if (bufs_[type] || mats_[type]) {
+      return;
+    }
+    SetDevice device(deviceId_);
+    if (config_.dims_size() == 2) {
+      if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED ||
+          matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE ||
+          matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) {
+        bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
+        bufs_[type]->zeroMem();
+      } else {
+        CHECK(isGradSparseUpdate());
+      }
+      if (config_.is_sparse() && type == PARAMETER_VALUE) {
+        enableSparseParameter();
+      }
+      setMat(type, matType);
+    } else {
+      bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
+      bufs_[type]->zeroMem();
+    }
+  }
+
+  void enableBufType(ParameterType type) {
+    if (bufs_[type]) return;
+    bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
+    bufs_[type]->zeroMem();
+  }
+
+  void enableIntType(ParameterType type, size_t intStoreSize = 0) {
+    if (!intBufs_[type]) {
+      SetDevice device(deviceId_);
+      size_t size = intStoreSize ? intStoreSize : config_.size();
+      intBufs_[type] = IVector::create(size, useGpu_);
+      intBufs_[type]->zeroMem();
+    }
+  }
+
+  void enableSharedType(ParameterType type,
+                        VectorPtr vec,
+                        MatrixPtr mat = nullptr) {
+    if (!bufs_[type] && !mats_[type]) {
+      bufs_[type] = vec;
+      mats_[type] = mat;
+    }
+  }
+
+  /// for batchGradientMachine: blockNum is number of partitions of the matrix.
+  bool isGradShared(size_t* blockNum = NULL);
+
+  bool isValueShared();
+
+  // for AsgdSparseGradientMachine & SgdSparseGradientMachine:
+  // and MultiGradientMachine
+  bool isGradSparseUpdate() const;
+
+  bool isSparseRemoteUpdate() const {
+    return config_.sparse_remote_update() && !useGpu();
+  }
+
+  const ParameterConfig& getConfig() const { return config_; }
+
+  ParameterConfig& getConfig() { return config_; }
+
+  bool hasType(ParameterType pType) const {
+    return bufs_[pType] || mats_[pType];
+  }
+
+  const VectorPtr& getBuf(ParameterType pType) const {
+    return this->bufs_[pType];
+  }
+
+  const VectorPtr* getBufs() const { return bufs_; }
+
+  const MatrixPtr& getMat(ParameterType pType) const { return mats_[pType]; }
+
+  void setValueUpdated() { updated_ = true; }
+
+  void clearValueUpdated() { updated_ = false; }
+
+  bool isValueUpdated() const { return updated_; }
+
+  /**
+   * Save parameter value to a file
+   */
+  bool save(const std::string& filename) const;
+
+  /**
+   * Save parameter to ostream
+   */
+  bool save(std::ostream& s) const;
+
+  /**
+   * Load parameter value from a file
+   */
+  bool load(const std::string& filename);
+
+  /**
+   * Load parameter from istream
+   */
+  bool load(std::istream& is);
+
+  void incShared() { sharedCount_++; }
+
+  /**
+   * After one of the parameter's gradient is merged
+   * You should call this function to do some additional processing,
+   */
+  void incUpdate(const UpdateCallback& callbacks = NULL);
+
+  void clearGradient() {
+    auto& mat = getMat(PARAMETER_GRADIENT);
+    if (mat) {
+      // zeroMem will also clear rows for SparseRowCpuMatrix
+      mat->zeroMem();
+    } else {
+      auto& gradBuf = getBuf(PARAMETER_GRADIENT);
+      if (gradBuf) gradBuf->zeroMem();
+    }
+  }
+
+  void initialize();
+
+  /**
+   * Initialize the value according to config_: initial_mean,
+   * initial_std and initial_strategy.
+   */
+  void randomize();
+  static void randomize(const VectorPtr& value, const ParameterConfig& config);
+
+  /// Initialize the value to 0
+  void zeroMem();
+
+  /// file header structure
+  struct Header {
+    int32_t format;      // = PARAM_FORMAT
+    uint32_t valueSize;  // = sizeof(real)
+    uint64_t size;       // = getSize()
+  };
+
+  /**
+   * @brief Is the header format supported.
+   */
+  static bool isHeaderFormatSupported(int32_t fmt) {
+    return fmt < PARAM_FORMAT_ITEMS;
+  }
+
+  /**
+   * @brief Get the format in header.
+   */
+  int getHeaderFormat() { return headerFormat_; }
+
+  /**
+   * @brief Set the format in header.
+   */
+  void setHeaderFormat(int32_t fmt) {
+    CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
+                                        << fmt;
+    headerFormat_ = fmt;
+  }
+
+  /**
+   * @brief  Parameter Update Hook.
+   *
+   * The parameter's update hook before ParameterUpdater::updateImpl
+   * It could modify gradient/momentum/etc here. Such as drop some gradient,
+   * etc.
+   */
+  void updateHook() {
+    for (auto& hook : updaterHooks_) {
+      hook->update(this);
+    }
+  }
+
+  /**
+   * @brief  Initialize all updater hook.
+   *
+   * This method should be invoked in ParameterUpdater::init() only.
+   */
+  void initHook() {
+    for (auto& hook : updaterHooks_) {
+      hook->init(this);
+    }
+  }
+
+ protected:
+  /**
+   * @brief create matrix to matType.
+   *
+   * used by gradient machine which needs specify matrix type,
+   * instead of creating in weights.cpp.
+   *
+   * @note  pType should be enabled already.
+   */
+  void setMat(ParameterType pType, int matType);
+
+  bool isUpdatable() { return (updateCounter_ == sharedCount_); }
+
+  void clearUpdate() { updateCounter_ = 0; }
+
+ protected:
+  ParameterConfig config_;
+
+  bool useGpu_;
+
+  int deviceId_;
+
+  /**
+   * @brief bufs_ stores parameter value and gradient.
+   *
+   * Layer should use bufs_[PARAMETER_VALUE] to form weight matrix for
+   * calculation and stores gradient to bufs_[PARAMETER_GRADIENT].
+   */
+  VectorPtr bufs_[NUM_PARAMETER_TYPES];
+
+  /**
+   * @brief Weight matrix for bufs_.
+   *
+   * It's helpfull when parameter shared by multi-layers.
+   * Caller should check, if mats exist, do not create it again.
+   */
+  MatrixPtr mats_[NUM_PARAMETER_TYPES];
+
+  /// Int vectors, used in some User defined parameter types
+  IVectorPtr intBufs_[NUM_PARAMETER_TYPES];
+
+  int sharedCount_;
+  int updateCounter_;
+
+  bool updated_;
+  SparseFormat format_;
+
+  /// The header format for saving or loading param
+  int32_t headerFormat_;
+
+  std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
+
+ public:
+  void setSharedCount(int cnt) { sharedCount_ = cnt; }
+  int getSharedCount() { return sharedCount_; }
+
+  bool isSparse() { return config_.is_sparse(); }
+  SparseFormat getFormat() { return format_; }
+
+  static const std::string kMissParameterFail;
+  static const std::string kMissParameterRand;
+  static const std::string kMissParameterZero;
+};
+
+typedef std::map<std::string, ParameterPtr> ParameterMap;
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterOptimizer.cpp b/paddle/legacy/parameter/ParameterOptimizer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b9dffa5afb4c99314869c7ed547ea9711d718b6e
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterOptimizer.cpp
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include <fstream>
+
+#include "AverageOptimizer.h"
+#include "FirstOrderOptimizer.h"
+#include "OptimizerFunctions.h"
+#include "OptimizerWithRegularizer.h"
+#include "ParameterOptimizer.h"
+#include "hl_gpu.h"
+
+namespace paddle {
+
+ParameterOptimizer* ParameterOptimizer::create(
+    const OptimizationConfig& optConfig, bool inPserver) {
+  if (inPserver && optConfig.num_batches_per_send_parameter() > 1) {
+    return new AddOptimizer(optConfig);
+  }
+  if (optConfig.learning_method() == "momentum") {
+    return new SgdOptimizer(optConfig);
+  }
+  if (optConfig.learning_method() == "torch_momentum") {
+    return new SgdOptimizer(optConfig);
+  }
+  if (optConfig.learning_method() == "adagrad") {
+    return new AdagradParameterOptimizer(optConfig);
+  }
+  if (optConfig.learning_method() == "adadelta") {
+    return new AdaDeltaParameterOptimizer(optConfig);
+  }
+  if (optConfig.learning_method() == "rmsprop") {
+    return new RMSPropParameterOptimizer(optConfig);
+  }
+  if (optConfig.learning_method() == "decayed_adagrad") {
+    return new DecayedAdagradParameterOptimizer(optConfig);
+  }
+  if (optConfig.learning_method() == "adam") {
+    return new AdamParameterOptimizer(optConfig);
+  }
+  if (optConfig.learning_method() == "adamax") {
+    return new AdamaxParameterOptimizer(optConfig);
+  }
+  if (optConfig.learning_method() == "sparse_momentum") {
+    return new SparseMomentumParameterOptimizer(optConfig);
+  }
+  return nullptr;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterOptimizer.h b/paddle/legacy/parameter/ParameterOptimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..019afa1358ae255fd096e84e5eb1d7b0b9d6859f
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterOptimizer.h
@@ -0,0 +1,211 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "LearningRateScheduler.h"
+#include "Parameter.h"
+
+namespace paddle {
+
+/**
+ * Some member functions are set to const for two reasons:
+ *
+ * 1. For sparse update thread safe: update(), traverse callback(const this)
+ *    may be called many times, each time one row, and these function
+ *    can be called parallelly by multi worker, to speed up large block.
+ *
+ * 2. For predicate functions, needSpecialTraversal(), startCatchUpWith()
+ *    may be called many times, should be no state change between calls.
+ */
+class ParameterOptimizer {
+ public:
+  typedef std::function<void(
+      const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId)>
+      TraverseCallback;
+
+ public:
+  explicit ParameterOptimizer(const OptimizationConfig& optConfig)
+      : applyDecay_(true),
+        optConfig_(optConfig),
+        parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT},
+        learningRate_(optConfig.learning_rate()),
+        learningRateScheduler_(LearningRateScheduler::create(optConfig)),
+        pass_(0),
+        firstTime_(true) {}
+
+  real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
+    return learningRateScheduler_->calcLearningRate(numSamplesProcessed, pass);
+  }
+
+  virtual ~ParameterOptimizer() {}
+
+  /**
+   * For sparse update, optimizer can maintain numRows of timer(t0).
+   * Some sparse optimizer depends on parameter config in functions
+   * such as startBatch(). Optimizer can get it here. But notice that,
+   * not all callers can pass config here, so the optimizer should check
+   * config passed in is not null ptr.
+   */
+  virtual void init(size_t numRows, const ParameterConfig* config) {}
+
+  virtual void startPass() {}
+  virtual void finishPass() { ++pass_; }
+
+  /// called by Trainer before forward() of a batch.
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    (void)numSamplesProcessed;
+  }
+
+  /**
+   * following hooks useful for sparse update,
+   * because the traversal in block costs.
+   * called by Trainer after update and before finishBatch
+   * e.g. Trainer call like this:
+   *
+   * @code
+   * startBatch();
+   * if (dense) {
+   *   update(blockVec);
+   * } else {//sparse
+   *   for (row : rows_in_block) {update(rowVec)}
+   * }
+   * auto callback = needSpecialTraversal();
+   * if (callback) {
+   *   // do traverse, maybe multi-thread
+   *   if (dense) {
+   *     callback();
+   *   } else {//sparse
+   *     for (row : all_rows_in_block) {callback();}
+   *   }
+   * }
+   * finishBatch();
+   * @endcode
+   *
+   * @return callback if need traverse,
+   *         else return nullptr.
+   *         It should be no state change.
+   */
+  virtual TraverseCallback needSpecialTraversal(
+      const ParameterConfig& config) const {
+    return nullptr;
+  }
+
+  /// called by Trainer after backward() of a batch
+  virtual void finishBatch() {}
+
+  /**
+   * between startBatch() and finishBatch(), update() will be called
+   * by the trainer multiple times, each time for updating one Parameter
+   * with its gradient in PARAMETER_GRADIENT. sparseId is row id,
+   * when sparseId set, update is sparse, each time one row.
+   */
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& config,
+                      size_t sparseId = -1LU) const = 0;
+
+  /**
+   * following hooks catch up with current time for sparse update,
+   * In the beginning, call startCatchUpWith() and check return.
+   * In the end, call finishCatchUpWith() to finish state.
+   * callback do the actual works, can call many times for sparse data.
+   * e.g. Trainer call like this:
+   *
+   * @code
+   * auto callback = startCatchUpWith();
+   * if (callback) {
+   *   // do catch up with, maybe multi-thread
+   *   if (dense) {
+   *     callback();
+   *   } else {//sparse
+   *     for (row : rows_in_block) {callback();}
+   *   }
+   *   // finish catch up with, main thread
+   *   finishCatchUpWith();
+   * }
+   * @endcode
+   *
+   * @return callback if need catch up with,
+   *         else return nullptr.
+   *         It should be no state change.
+   */
+  virtual TraverseCallback startCatchUpWith() const { return nullptr; }
+  virtual void finishCatchUpWith() {}
+
+  /**
+   * following two hooks used by averager,
+   * apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY).
+   *
+   * restore() will restore orginal value if it apply to PARAMETER_VALUE.
+   * Caller must ensure it's catched up with current time before apply.
+   *
+   * Use returned callback same way as callback returned by
+   * ParameterOptimizer::needSpecialTraversal()
+   */
+  virtual TraverseCallback apply() { return nullptr; }
+  virtual TraverseCallback restore() { return nullptr; }
+
+  /// return the parameter types used by this updater
+  const std::vector<ParameterType>& getParameterTypes() const {
+    return parameterTypes_;
+  }
+
+  void addParameterType(ParameterType type) {
+    for (auto t : parameterTypes_) {
+      if (t == type) return;
+    }
+    parameterTypes_.push_back(type);
+  }
+
+  real getLearningRate() const { return learningRate_; }
+
+  virtual void setNoDecay() { applyDecay_ = false; }
+
+  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
+                                    bool inPserver = false);
+
+ protected:
+  typedef std::vector<ParameterOptimizer::TraverseCallback> TraverseCallbackVec;
+
+  static TraverseCallback composeCallbacks(
+      const TraverseCallbackVec& callbacks) {
+    if (callbacks.size() > 1LU) {
+      return [callbacks](const VectorPtr vecs[],
+                         const ParameterConfig& config,
+                         size_t sparseId) {
+        for (auto callback : callbacks) {
+          callback(vecs, config, sparseId);
+        }
+      };
+    }
+    return (callbacks.size() == 1LU) ? callbacks[0] : nullptr;
+  }
+
+  bool applyDecay_;
+  const OptimizationConfig& optConfig_;
+  std::vector<ParameterType> parameterTypes_;
+
+  /**
+   * global learning rate, init value is opt_config.learning_rate,
+   * sparse regularizer get this value per batch, after StartBatch() called
+   * so, if lr change in StartBatch, please assign to learningRate_
+   */
+  real learningRate_;
+
+  std::unique_ptr<LearningRateScheduler> learningRateScheduler_;
+  int64_t pass_;  // current training pass (starting from 0)
+  bool firstTime_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdateFunctions.cpp b/paddle/legacy/parameter/ParameterUpdateFunctions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..72c9841acf6d3eb1d28d631e1599a1a403175013
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterUpdateFunctions.cpp
@@ -0,0 +1,300 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Logging.h"
+#ifdef __AVX__
+#include <x86intrin.h>
+#include <xmmintrin.h>
+#endif
+
+#include "ParameterUpdateFunctions.h"
+
+namespace paddle {
+
+void sgdUpdateCpu(real learningRate,
+                  real momentum,
+                  real decayRate,
+                  size_t size,
+                  real* value,
+                  const real* grad,
+                  real* momentumVec) {
+  decayRate *= learningRate;
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (size_t i = 0; i < size; ++i) {
+    momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
+                     decayRate * value[i];
+    value[i] += momentumVec[i];
+  }
+}
+
+void sgdUpdate(real learningRate,
+               real momentum,
+               real decayRate,
+               Vector* value,
+               Vector* grad,
+               Vector* momentumVec) {
+  size_t size = value->getSize();
+  real* val = value->getData();
+  real* grd = grad->getData();
+  real* mom = momentumVec->getData();
+  if (typeid(*value) == typeid(CpuVector)) {
+    sgdUpdateCpu(learningRate, momentum, decayRate, size, val, grd, mom);
+  } else if (typeid(*value) == typeid(GpuVector)) {
+    value->sgdUpdate(*grad, *momentumVec, learningRate, momentum, decayRate);
+  } else {
+    LOG(FATAL) << "Wrong";
+  }
+}
+
+void sgdUpdateAvx(float learningRate,
+                  float momentum,
+                  float decayRate,
+                  size_t size,
+                  float* value,
+                  const float* _grad,
+                  float* momentumVec) {
+#ifdef __AVX__
+  float* grad = const_cast<float*>(_grad);  // the gradient is not modified
+                                            // but when invoke simd functions
+                                            // need non-const pointer.
+  size_t gradientAlign = 0;
+  size_t gradientAlignHeader = (size_t)grad % sizeof(__m256);
+  CHECK_EQ(gradientAlignHeader, (size_t)momentumVec % sizeof(__m256))
+      << "Gradent buffer didn't align with momentum buffer";
+  CHECK_EQ(gradientAlignHeader, (size_t)value % sizeof(__m256))
+      << "Gradent buffer didn't align with value buffer";
+  if (0 != gradientAlignHeader) {
+    gradientAlignHeader = sizeof(__m256) - gradientAlignHeader;
+    gradientAlign = gradientAlignHeader / sizeof(real);
+
+    // handle the unalign buffer
+    for (size_t i = 0; i < gradientAlign; i++) {
+      momentumVec[i] = momentum * momentumVec[i] - (learningRate * grad[i]) -
+                       (decayRate * learningRate * value[i]);
+      value[i] += momentumVec[i];
+    }
+    grad += gradientAlign;
+    momentumVec += gradientAlign;
+    value += gradientAlign;
+  }
+
+  constexpr size_t kParallelNum = 8;
+  constexpr size_t nStepSize = (sizeof(__m256) / sizeof(real)) * kParallelNum;
+  size_t cntLoop = (size - gradientAlign) / nStepSize;
+  size_t cntRem = (size - gradientAlign) % nStepSize;
+  __m256 gradientTmp[kParallelNum];
+  __m256 valueTmp[kParallelNum];
+  __m256 lr, mom, dr;
+  std::function<void(void)> loopFun;
+
+  learningRate *= -1;
+  lr = _mm256_set_ps(learningRate,
+                     learningRate,
+                     learningRate,
+                     learningRate,
+                     learningRate,
+                     learningRate,
+                     learningRate,
+                     learningRate);
+
+  if (0 != momentum) {
+    mom = _mm256_set_ps(momentum,
+                        momentum,
+                        momentum,
+                        momentum,
+                        momentum,
+                        momentum,
+                        momentum,
+                        momentum);
+  }
+
+  decayRate *= learningRate;
+  if (0 != decayRate) {
+    dr = _mm256_set_ps(decayRate,
+                       decayRate,
+                       decayRate,
+                       decayRate,
+                       decayRate,
+                       decayRate,
+                       decayRate,
+                       decayRate);
+  }
+
+  auto gradMulFun = [&](void) {
+    gradientTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad), lr);
+    gradientTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 8), lr);
+    gradientTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 16), lr);
+    gradientTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 24), lr);
+    gradientTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 32), lr);
+    gradientTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 40), lr);
+    gradientTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 48), lr);
+    gradientTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 56), lr);
+  };
+
+  auto valueMulFun = [&](void) {
+    valueTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value), dr);
+    valueTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 8), dr);
+    valueTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 16), dr);
+    valueTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 24), dr);
+    valueTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 32), dr);
+    valueTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 40), dr);
+    valueTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 48), dr);
+    valueTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 56), dr);
+  };
+
+  auto momentumMulFun = [&](void) {
+    *reinterpret_cast<__m256*>(momentumVec) =
+        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec), mom);
+    *reinterpret_cast<__m256*>(momentumVec + 8) =
+        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 8), mom);
+    *reinterpret_cast<__m256*>(momentumVec + 16) =
+        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 16), mom);
+    *reinterpret_cast<__m256*>(momentumVec + 24) =
+        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 24), mom);
+    *reinterpret_cast<__m256*>(momentumVec + 32) =
+        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 32), mom);
+    *reinterpret_cast<__m256*>(momentumVec + 40) =
+        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 40), mom);
+    *reinterpret_cast<__m256*>(momentumVec + 48) =
+        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 48), mom);
+    *reinterpret_cast<__m256*>(momentumVec + 56) =
+        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 56), mom);
+  };
+
+  auto momentumAddGradFun = [&](void) {
+    *reinterpret_cast<__m256*>(momentumVec) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), gradientTmp[0]);
+    *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 8), gradientTmp[1]);
+    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 16), gradientTmp[2]);
+    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 24), gradientTmp[3]);
+    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 32), gradientTmp[4]);
+    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 40), gradientTmp[5]);
+    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 48), gradientTmp[6]);
+    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 56), gradientTmp[7]);
+  };
+
+  auto momentumZeroFun = [&](void) {
+    *reinterpret_cast<__m256*>(momentumVec) = gradientTmp[0];
+    *reinterpret_cast<__m256*>(momentumVec + 8) = gradientTmp[1];
+    *reinterpret_cast<__m256*>(momentumVec + 16) = gradientTmp[2];
+    *reinterpret_cast<__m256*>(momentumVec + 24) = gradientTmp[3];
+    *reinterpret_cast<__m256*>(momentumVec + 32) = gradientTmp[4];
+    *reinterpret_cast<__m256*>(momentumVec + 40) = gradientTmp[5];
+    *reinterpret_cast<__m256*>(momentumVec + 48) = gradientTmp[6];
+    *reinterpret_cast<__m256*>(momentumVec + 56) = gradientTmp[7];
+  };
+
+  auto momentumAddValueFun = [&](void) {
+    *reinterpret_cast<__m256*>(momentumVec) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), valueTmp[0]);
+    *reinterpret_cast<__m256*>(momentumVec + 8) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec + 8), valueTmp[1]);
+    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 16), valueTmp[2]);
+    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 24), valueTmp[3]);
+    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 32), valueTmp[4]);
+    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 40), valueTmp[5]);
+    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 48), valueTmp[6]);
+    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
+        *reinterpret_cast<__m256*>(momentumVec + 56), valueTmp[7]);
+  };
+
+  auto valueAddMomentumFun = [&](void) {
+    *reinterpret_cast<__m256*>(value) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(value),
+                      *reinterpret_cast<__m256*>(momentumVec));
+    *reinterpret_cast<__m256*>(value + 8) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 8),
+                      *reinterpret_cast<__m256*>(momentumVec + 8));
+    *reinterpret_cast<__m256*>(value + 16) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 16),
+                      *reinterpret_cast<__m256*>(momentumVec + 16));
+    *reinterpret_cast<__m256*>(value + 24) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 24),
+                      *reinterpret_cast<__m256*>(momentumVec + 24));
+    *reinterpret_cast<__m256*>(value + 32) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 32),
+                      *reinterpret_cast<__m256*>(momentumVec + 32));
+    *reinterpret_cast<__m256*>(value + 40) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 40),
+                      *reinterpret_cast<__m256*>(momentumVec + 40));
+    *reinterpret_cast<__m256*>(value + 48) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 48),
+                      *reinterpret_cast<__m256*>(momentumVec + 48));
+    *reinterpret_cast<__m256*>(value + 56) =
+        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 56),
+                      *reinterpret_cast<__m256*>(momentumVec + 56));
+  };
+
+  if (0 == decayRate && 0 == momentum) {
+    loopFun = [&](void) {
+      gradMulFun();
+      momentumZeroFun();
+      valueAddMomentumFun();
+    };
+  } else if (0 == decayRate && 0 != momentum) {
+    loopFun = [&](void) {
+      gradMulFun();
+      momentumMulFun();
+      momentumAddGradFun();
+      valueAddMomentumFun();
+    };
+  } else if (0 != decayRate && 0 == momentum) {
+    loopFun = [&](void) {
+      gradMulFun();
+      valueMulFun();
+      momentumZeroFun();
+      momentumAddValueFun();
+      valueAddMomentumFun();
+    };
+  } else if (0 != decayRate && 0 != momentum) {
+    loopFun = [&](void) {
+      gradMulFun();
+      valueMulFun();
+      momentumMulFun();
+      momentumAddGradFun();
+      momentumAddValueFun();
+      valueAddMomentumFun();
+    };
+  }
+
+  for (size_t i = 0; i < cntLoop; i++) {
+    loopFun();
+    grad += nStepSize;
+    momentumVec += nStepSize;
+    value += nStepSize;
+  }
+
+  for (size_t i = 0; i < cntRem; i++) {
+    momentumVec[i] = momentum * momentumVec[i] + (learningRate * grad[i]) +
+                     (decayRate * value[i]);
+    value[i] += momentumVec[i];
+  }
+#endif
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdateFunctions.h b/paddle/legacy/parameter/ParameterUpdateFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7cc1c4c47b6c8723520221cb0efc2afb53a900c
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterUpdateFunctions.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/utils/Common.h"
+
+namespace paddle {
+
+/**
+ * Performs the following operations.
+ *
+ * momentumVec = momentum * momentumVec
+ *               - learningRate * grad
+ *               - learningRate * decayRate * value
+ *
+ * value = value + momentumVec
+ * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary
+ * computation.
+ */
+void sgdUpdate(real learningRate,
+               real momentum,
+               real decayRate,
+               Vector* value,
+               Vector* grad,
+               Vector* momentumVec);
+
+void sgdUpdateCpu(real learningRate,
+                  real momentum,
+                  real decayRate,
+                  size_t size,
+                  real* value,
+                  const real* grad,
+                  real* momentumVec);
+
+void sgdUpdateAvx(float learningRate,
+                  float momentum,
+                  float decayRate,
+                  size_t size,
+                  float* value,
+                  const float* grad,
+                  float* momentumVec);
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterBase.cpp b/paddle/legacy/parameter/ParameterUpdaterBase.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d9d3fad63160b76d6de0932f39596a8643d0a8e
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterUpdaterBase.cpp
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterUpdaterBase.h"
+#include <fstream>
+#include "hl_gpu.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+void ParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
+  parameters_ = parameters;
+  for (ParameterType type : getParameterTypes()) {
+    for (auto& para : parameters) {
+      para->enableType(type);
+    }
+  }
+  for (size_t pid = 0; pid < parameters_.size(); ++pid) {
+    nonStaticParaIDMap_.insert(
+        std::pair<size_t, size_t>(parameters_[pid]->getID(), pid));
+  }
+
+  for (auto& para : parameters) {
+    if (!para->isStatic()) {
+      para->initHook();
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterBase.h b/paddle/legacy/parameter/ParameterUpdaterBase.h
new file mode 100644
index 0000000000000000000000000000000000000000..493512886cad3ea9b74026d6dfcc4fc90f6aadb9
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterUpdaterBase.h
@@ -0,0 +1,182 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Parameter.h"
+
+namespace paddle {
+
+class ParameterOptimizer;
+
+class ParameterUpdater {
+ public:
+  ParameterUpdater() : parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT} {}
+  virtual ~ParameterUpdater() {}
+
+  void addParameterType(ParameterType type) {
+    for (auto t : parameterTypes_) {
+      if (t == type) return;
+    }
+    parameterTypes_.push_back(type);
+  }
+
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+
+  // called by Trainer when starting a new pass
+  virtual void startPass() {}
+
+  // called by Trainer then finishing a pass, ruturn true if pass accepted
+  virtual bool finishPass() { return true; }
+
+  // called by Trainer before backward() of a batch
+  // Return the type of pass it needs. This pass type will be passed
+  // to GradientMachine::forward() by the caller.
+  virtual PassType startBatch(int64_t batchSize) {
+    (void)batchSize;
+    return PASS_TRAIN;
+  }
+
+  // called by Trainer after backward() of a batch
+  // cost: the cost for this batch
+  virtual void finishBatch(real cost) { (void)cost; }
+
+  // between startBatch() and finishBatch(), update() will be called
+  // by the trainer multiple times, each time for updating one Parameter
+  // with its gradient in PARAMETER_GRADIENT
+  void update(Parameter* para) {
+    SetDevice setDevice(para->getDeviceId());
+    para->updateHook();
+    this->updateImpl(para);
+  }
+
+  // only get required sparse rows by default,
+  // get full matrix parameter if *fullSize* set
+  // get PARAMETER_APPLY on pserver if *apply* set
+  virtual void getParametersRemote(bool fullSize = false, bool apply = false) {}
+
+  virtual void loadParametersRemote(const std::string& dirName) {}
+  virtual void saveParametersRemote(const std::string& dirName) {}
+  virtual void randParametersRemote() {}
+
+  // something like regularization may be delayed apply
+  // trainer should catch up with before parameter is saved or sended.
+  virtual void catchUpWith() {}
+
+  // following two hooks used by averager
+  // apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY).
+  // restore() will restore orginal value if it apply to PARAMETER_VALUE.
+  virtual void apply() {}
+  virtual void restore() {}
+
+  // return the parameter types used by this updater
+  const std::vector<ParameterType>& getParameterTypes() const {
+    return parameterTypes_;
+  }
+
+#ifndef PADDLE_DISABLE_TIMER
+  virtual void setForwardbackwardTime(uint64_t delta) {}
+#endif
+
+ protected:
+  virtual void updateImpl(Parameter* para) = 0;
+
+  std::vector<ParameterType> parameterTypes_;
+  std::vector<ParameterPtr> parameters_;
+  std::map<size_t, size_t> nonStaticParaIDMap_;
+};
+
+// Composite of ParameterUpdaters, each ParameterUpdater handle
+// part of all Parameters. It's useful when we need different
+// update strategy for different Parameter.
+class ParameterUpdaterComposite : public ParameterUpdater {
+ public:
+  ParameterUpdaterComposite() {}
+  virtual ~ParameterUpdaterComposite() {}
+
+  virtual void init(const std::vector<ParameterPtr>& parameters) = 0;
+
+  virtual void startPass() {
+    syncThreadPool_->execPlusOwner(
+        [&](int tid, size_t numThreads) { updaters_[tid]->startPass(); });
+  }
+
+  virtual bool finishPass() {
+    syncThreadPool_->execPlusOwner(
+        [&](int tid, size_t numThreads) { updaters_[tid]->finishPass(); });
+    return true;
+  }
+
+  virtual PassType startBatch(int64_t batchSize) {
+    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
+      updaters_[tid]->startBatch(batchSize);
+    });
+    return PASS_TRAIN;
+  }
+
+  virtual void finishBatch(real cost) {
+    syncThreadPool_->execPlusOwner(
+        [&](int tid, size_t numThreads) { updaters_[tid]->finishBatch(cost); });
+  }
+
+  virtual void getParametersRemote(bool fullSize, bool apply) {
+    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
+      updaters_[tid]->getParametersRemote(fullSize, apply);
+    });
+  }
+  virtual void loadParametersRemote(const std::string& dirName) {
+    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
+      updaters_[tid]->loadParametersRemote(dirName);
+    });
+  }
+  virtual void saveParametersRemote(const std::string& dirName) {
+    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
+      updaters_[tid]->saveParametersRemote(dirName);
+    });
+  }
+  virtual void randParametersRemote() {
+    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
+      updaters_[tid]->randParametersRemote();
+    });
+  }
+
+  virtual void catchUpWith() {
+    syncThreadPool_->execPlusOwner(
+        [&](int tid, size_t numThreads) { updaters_[tid]->catchUpWith(); });
+  }
+
+#ifndef PADDLE_DISABLE_TIMER
+  virtual void setForwardbackwardTime(uint64_t delta) {
+    for (auto& updater : updaters_) {
+      updater->setForwardbackwardTime(delta);
+    }
+  }
+#endif
+
+  virtual void apply() {
+    syncThreadPool_->execPlusOwner(
+        [&](int tid, size_t numThreads) { updaters_[tid]->apply(); });
+  }
+  virtual void restore() {
+    syncThreadPool_->execPlusOwner(
+        [&](int tid, size_t numThreads) { updaters_[tid]->restore(); });
+  }
+
+ protected:
+  virtual void updateImpl(Parameter* para) {}
+  std::vector<std::unique_ptr<ParameterUpdater>> updaters_;
+  std::unique_ptr<SyncThreadPool> syncThreadPool_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterHook.cpp b/paddle/legacy/parameter/ParameterUpdaterHook.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bfb9769fb67fc71b6f96f09d44b2c108745eafa3
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterUpdaterHook.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterUpdaterHook.h"
+
+#include <algorithm>
+#include <atomic>
+#include <fstream>
+#include <mutex>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * The static pruning hook
+ * Static means user specify a sparsity_ratio before training started, and the
+ * network will prune the parameters based on the sparsity_ratio. More details
+ * can be found https://arxiv.org/pdf/1506.02626.pdf.
+ */
+
+class StaticPruningHook : public IParameterUpdaterHook {
+ public:
+  explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig)
+      : initCount_(0) {
+    sparsityRatio_ = hookConfig.sparsity_ratio();
+  }
+
+  static bool sortPairAscend(const std::pair<real, size_t> &pair1,
+                             const std::pair<real, size_t> &pair2) {
+    return pair1.first > pair2.first;
+  }
+
+  void update(Parameter *para) {
+    updateThreadChecker_.check();
+    auto &vec = para->getBuf(PARAMETER_GRADIENT);
+    if (vec) {
+      vec->dotMul(*maskVec_);
+    }
+  }
+
+  void generateMask(Parameter *para) {
+    VectorPtr maskTemp = Vector::create(para->getSize(), false);
+    maskTemp->zeroMem();
+    real *maskTempData = maskTemp->getData();
+    size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_);
+
+    VectorPtr paraVec = para->getBuf(PARAMETER_VALUE);
+    VectorPtr paraCpuCopy = Vector::create(para->getSize(), false);
+
+    paraCpuCopy->copyFrom(*paraVec);
+    std::vector<std::pair<real, size_t>> param;
+
+    for (size_t i = 0; i < para->getSize(); i++)
+      param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i));
+
+    std::partial_sort(
+        param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend);
+    for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0;
+
+    // Currently just use a mask vector for hack.
+    if (para->useGpu()) {
+      maskVec_ = Vector::create(para->getSize(), para->useGpu());
+      maskVec_->copyFrom(*maskTemp);
+    } else {
+      maskVec_ = maskTemp;
+    }
+  }
+
+  void init(Parameter *para) {
+    generateMask(para);
+    size_t initCount = this->initCount_.fetch_add(1);
+    CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke "
+                                "in same ParamterUpdater";
+    VLOG(3) << "Initialize Parameter " << para;
+    SetDevice device(para->getDeviceId());
+
+    auto &paraVec = para->getBuf(PARAMETER_VALUE);
+    paraVec->dotMul(*maskVec_);
+  }
+
+ private:
+  SameThreadChecker updateThreadChecker_;
+  std::atomic<size_t> initCount_;
+  VectorPtr maskVec_;
+  real sparsityRatio_;
+};
+
+IParameterUpdaterHook::IParameterUpdaterHook() {}
+
+IParameterUpdaterHook::~IParameterUpdaterHook() {}
+
+/**
+ * A Hasher used by g_hooks.
+ *
+ * Use the independent hasher intendedly. There is a hasher in PServer for hash
+ * ParameterBlock. But not to use same hasher to reduce dependency.
+ *
+ * May be extracted to Util.h to unify the hasher.
+ */
+class StringIntPairHasher {
+ public:
+  size_t operator()(const std::pair<std::string, int> &k) const {
+    return intHasher_(strHasher_(k.first) + k.second);
+  }
+
+ private:
+  std::hash<std::string> strHasher_;
+  std::hash<int> intHasher_;
+};
+
+static WeakKVCache<std::pair<std::string, int>,
+                   IParameterUpdaterHook,
+                   StringIntPairHasher>
+    g_hookCache_;
+
+/**
+ * ParameterUpdaterHook actually factory method.
+ */
+static IParameterUpdaterHook *createImpl(
+    const ParameterUpdaterHookConfig &config) {
+  auto &type = config.type();
+  if (type == "pruning") {
+    return new StaticPruningHook(config);
+  }
+
+  LOG(FATAL) << "Unknown Hook type:  " << type;
+  return nullptr;
+}
+
+std::shared_ptr<IParameterUpdaterHook> IParameterUpdaterHook::create(
+    const ParameterConfig &paramConfig, int idx) {
+  std::pair<std::string, int> key = {paramConfig.name(), idx};
+  return g_hookCache_.get(
+      key, [&] { return createImpl(paramConfig.update_hooks(idx)); });
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/ParameterUpdaterHook.h b/paddle/legacy/parameter/ParameterUpdaterHook.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb96e4cf007572e9688c11719017a9d2771ecd51
--- /dev/null
+++ b/paddle/legacy/parameter/ParameterUpdaterHook.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <memory>
+
+#include "ParameterConfig.pb.h"
+
+namespace paddle {
+
+class Parameter;
+
+/**
+ * The parameter updater hook interface.
+ *
+ * The Parameter Updater hooks is a group of methods invoke before
+ * ParameterUpdater::updateImpl. It can modify gradient/momentum/etc before
+ * parameter optimization.
+ */
+class IParameterUpdaterHook {
+ public:
+  virtual ~IParameterUpdaterHook();
+
+  /**
+   * Create A ParameterUpdaterHook.
+   *
+   * The same parameter shared the same hooks. So it returns shared_ptr.
+   *
+   * @param param_config The parameter config.
+   * @param idx  The element index of param_config.updater_hooks() array.
+   */
+  static std::shared_ptr<IParameterUpdaterHook> create(
+      const ParameterConfig& paramConfig, int idx);
+
+  /**
+   * The update hook method. Invoke before ParameterUpdater::updateImpl
+   */
+  virtual void update(Parameter* para) = 0;
+
+  /**
+   * The init hook method. Invoke in ParameterUpdater::init
+   */
+  virtual void init(Parameter* para) = 0;
+
+ protected:
+  /**
+   * Ctor.
+   */
+  IParameterUpdaterHook();
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/Regularizer.cpp b/paddle/legacy/parameter/Regularizer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c1d5f4fa68403408bb44341e1e28f2ce3beb2e4c
--- /dev/null
+++ b/paddle/legacy/parameter/Regularizer.cpp
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Regularizer.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+Regularizer* Regularizer::get(const std::vector<ParameterType>& types,
+                              const ParameterConfig& paraConfig) {
+  bool useLearningRateVec =
+      std::find(types.begin(), types.end(), PARAMETER_LEARNING_RATE) !=
+      types.end();
+  if (paraConfig.decay_rate_l1() > 0.0f &&
+      paraConfig.decay_rate() > 0.0f) {  // use L1 and L2
+    if (useLearningRateVec) {
+      static L1L2LrRegularizer regularizer_;
+      return &regularizer_;
+    }
+    static L1L2Regularizer regularizer_;
+    return &regularizer_;
+  }
+  if (paraConfig.decay_rate_l1() > 0.0f) {  // use L1 only
+    if (useLearningRateVec) {
+      static L1LrRegularizer regularizer_;
+      return &regularizer_;
+    }
+    static L1Regularizer regularizer_;
+    return &regularizer_;
+  }
+  if (paraConfig.decay_rate() > 0.0f) {  // use L2 only
+    if (useLearningRateVec) {
+      static L2LrRegularizer regularizer_;
+      return &regularizer_;
+    }
+    static L2Regularizer regularizer_;
+    return &regularizer_;
+  }
+  return nullptr;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/Regularizer.h b/paddle/legacy/parameter/Regularizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa5384e23251b918cc914df36c16ad790a5c59c5
--- /dev/null
+++ b/paddle/legacy/parameter/Regularizer.h
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ParameterUpdaterBase.h"
+
+namespace paddle {
+
+// Regularizer function for parameter, e.g. L1/L2
+class Regularizer {
+ public:
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      real learningRate,  // learningrate from optimizer
+                      int t0,             // last occurence time
+                      int t) const = 0;   // current time
+  virtual ~Regularizer() {}
+
+  static Regularizer* get(const std::vector<ParameterType>& types,
+                          const ParameterConfig& paraConfig);
+};
+
+// L1 Regularizer, |w|_1
+class L1Regularizer : public Regularizer {
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      real learningRate,
+                      int t0,
+                      int t) const {
+    vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(),
+                                   paraConfig.decay_rate_l1() * (t - t0));
+  }
+};
+
+// L1 Lr Regularizer
+class L1LrRegularizer : public Regularizer {
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      real learningRate,
+                      int t0,
+                      int t) const {
+    vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate * paraConfig.learning_rate(),
+                                   paraConfig.decay_rate_l1() * (t - t0));
+  }
+};
+
+// L2 Regularizer, |w|_2^2
+class L2Regularizer : public Regularizer {
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      real learningRate,
+                      int t0,
+                      int t) const {
+    vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(),
+                                   paraConfig.decay_rate() * (t - t0));
+  }
+};
+
+// L2 Lr Regularizer
+class L2LrRegularizer : public Regularizer {
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      real learningRate,
+                      int t0,
+                      int t) const {
+    vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate * paraConfig.learning_rate(),
+                                   paraConfig.decay_rate() * (t - t0));
+  }
+};
+
+// L1 + L2 Regularizer, |w|_1 + |w|_2^2
+class L1L2Regularizer : public Regularizer {
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      real learningRate,
+                      int t0,
+                      int t) const {
+    vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(),
+                                   paraConfig.decay_rate_l1() * (t - t0));
+    vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(),
+                                   paraConfig.decay_rate() * (t - t0));
+  }
+};
+
+// L1 + L2 Lr Regularizer
+class L1L2LrRegularizer : public Regularizer {
+  virtual void update(const VectorPtr vecs[],
+                      const ParameterConfig& paraConfig,
+                      real learningRate,
+                      int t0,
+                      int t) const {
+    vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate * paraConfig.learning_rate(),
+                                   paraConfig.decay_rate_l1() * (t - t0));
+    vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE],
+                                   learningRate * paraConfig.learning_rate(),
+                                   paraConfig.decay_rate() * (t - t0));
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/parameter/ThreadLocalBuffer.cpp b/paddle/legacy/parameter/ThreadLocalBuffer.cpp
similarity index 100%
rename from paddle/parameter/ThreadLocalBuffer.cpp
rename to paddle/legacy/parameter/ThreadLocalBuffer.cpp
diff --git a/paddle/legacy/parameter/ThreadLocalBuffer.h b/paddle/legacy/parameter/ThreadLocalBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d360feeed6c98ee60e3bdae924434054080576b0
--- /dev/null
+++ b/paddle/legacy/parameter/ThreadLocalBuffer.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/legacy/math/Vector.h"
+
+namespace paddle {
+namespace parameter {
+extern VectorPtr* getThreadLocalBuffer();
+}  // namespace parameter
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/Weight.cpp b/paddle/legacy/parameter/Weight.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9d94050a5cd8c3570c286e8e82c2a1470c40e6db
--- /dev/null
+++ b/paddle/legacy/parameter/Weight.cpp
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Weight.h"
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+Weight::Weight(size_t height, size_t width, ParameterPtr param) {
+  VectorPtr vPtr = param->getBuf(PARAMETER_VALUE);
+  VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT);
+
+  // create a new weight
+  if (param->isSparse()) {
+    CHECK_LE(param->getSize(), width * height);
+  } else {
+    CHECK_EQ(param->getSize(), width * height);
+  }
+
+  // weight_
+  weight_ = param->getMat(PARAMETER_VALUE);
+  if (!weight_ && vPtr) {
+    weight_ = Matrix::create(vPtr->getMemoryHandle(), height, width);
+  }
+  if (weight_) {
+    CHECK_EQ(height, weight_->getHeight());
+    CHECK_EQ(width, weight_->getWidth());
+  }
+
+  // weightGrad
+  weightGrad_ = param->getMat(PARAMETER_GRADIENT);
+  if (!weightGrad_ && gPtr) {
+    weightGrad_ = Matrix::create(gPtr->getMemoryHandle(), height, width);
+  }
+  if (weightGrad_) {
+    CHECK_EQ(height, weightGrad_->getHeight());
+    CHECK_EQ(width, weightGrad_->getWidth());
+  }
+
+  parameter_ = param;
+}
+
+Weight::Weight(size_t height, size_t width, ParameterPtr param, size_t offset) {
+  VectorPtr vPtr = param->getBuf(PARAMETER_VALUE);
+  VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT);
+
+  // create a new weight
+  CHECK_LE(offset + width * height, param->getSize());
+
+  // weight_
+  if (vPtr) {
+    weight_ = Matrix::create(vPtr->getData() + offset,
+                             height,
+                             width,
+                             /* trans */ false,
+                             param->useGpu());
+  }
+
+  // weightGrad
+  if (gPtr) {
+    weightGrad_ = Matrix::create(gPtr->getData() + offset,
+                                 height,
+                                 width,
+                                 /* trans */ false,
+                                 param->useGpu());
+  }
+
+  parameter_ = param;
+}
+
+const ParameterPtr& Weight::getParameterPtr() { return parameter_; }
+void Weight::setParameterPtr(ParameterPtr param) { parameter_ = param; }
+}  // namespace paddle
diff --git a/paddle/legacy/parameter/Weight.h b/paddle/legacy/parameter/Weight.h
new file mode 100644
index 0000000000000000000000000000000000000000..241c8d829cd0c7b57964324d3378bdfcf09e6a70
--- /dev/null
+++ b/paddle/legacy/parameter/Weight.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <memory>
+#include <vector>
+
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/SparseRowMatrix.h"
+#include "paddle/legacy/parameter/Parameter.h"
+
+namespace paddle {
+
+class Weight {
+ private:
+  MatrixPtr weight_;
+  MatrixPtr weightGrad_;
+  ParameterPtr parameter_;
+
+ public:
+  Weight(size_t height, size_t width, ParameterPtr parameter);
+  Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset);
+
+  const MatrixPtr& getW() { return weight_; }
+  const MatrixPtr& getWGrad() { return weightGrad_; }
+  const ParameterPtr& getParameterPtr();
+
+  void incUpdate(const UpdateCallback& callback) {
+    getParameterPtr()->incUpdate(callback);
+  }
+
+  void setParameterPtr(ParameterPtr param);
+};
+
+typedef std::vector<std::unique_ptr<Weight>> WeightList;
+
+}  // namespace paddle
diff --git a/paddle/parameter/tests/CMakeLists.txt b/paddle/legacy/parameter/tests/CMakeLists.txt
similarity index 100%
rename from paddle/parameter/tests/CMakeLists.txt
rename to paddle/legacy/parameter/tests/CMakeLists.txt
diff --git a/paddle/legacy/parameter/tests/test_argument.cpp b/paddle/legacy/parameter/tests/test_argument.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0c632e0cd10342431dfcada680a18d8f9eabeb9c
--- /dev/null
+++ b/paddle/legacy/parameter/tests/test_argument.cpp
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/parameter/Argument.h>
+
+using namespace paddle;  // NOLINT
+
+TEST(Argument, poolSequenceWithStride) {
+  Argument input, output;
+  ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false);
+  int* inStart = input.sequenceStartPositions->getMutableData(false);
+  inStart[0] = 0;
+  inStart[1] = 9;
+  inStart[2] = 14;
+  inStart[3] = 17;
+  inStart[4] = 30;
+
+  int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30};
+  int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
+
+  for (auto reversed : {false, true}) {
+    ICpuGpuVectorPtr stridePositions;
+    output.poolSequenceWithStride(
+        input, 5 /* stride */, &stridePositions, reversed);
+
+    const int* outStart = output.sequenceStartPositions->getData(false);
+    CHECK_EQ(outStart[0], 0);
+    CHECK_EQ(outStart[1], 2);
+    CHECK_EQ(outStart[2], 3);
+    CHECK_EQ(outStart[3], 4);
+    CHECK_EQ(outStart[4], 7);
+
+    CHECK_EQ(stridePositions->getSize(), 8UL);
+    auto result = reversed ? strideResultReversed : strideResult;
+    for (int i = 0; i < 8; i++) {
+      CHECK_EQ(stridePositions->getData(false)[i], result[i]);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/parameter/tests/test_common.cpp b/paddle/legacy/parameter/tests/test_common.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8de9d6da983553c0b9e574ac27ae8fca14bea5b7
--- /dev/null
+++ b/paddle/legacy/parameter/tests/test_common.cpp
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/legacy/utils/Util.h>
+#include <stdlib.h>
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/parameter/ParameterUpdateFunctions.h>
+#include <paddle/legacy/utils/Flags.h>
+#include <paddle/legacy/utils/Stat.h>
+#include <paddle/legacy/utils/Thread.h>
+
+using namespace paddle;  // NOLINT
+
+class CommonTest : public ::testing::Test {
+ protected:
+  CommonTest() : testStat_("test") {}
+  virtual ~CommonTest() {}
+  virtual void SetUp() {
+    const size_t buffSize[] = {
+        100, 128, 500, 1024, 4096, 10240, 102400, 1000000};
+    sizeVec_.resize(8);
+    memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t));
+    valueUint_.resize(4);
+    valueUint_[0].first = 0.0;
+    valueUint_[0].second = 0.0;
+    valueUint_[1].first = 0.0;
+    valueUint_[1].second = 1.0;
+    valueUint_[2].first = 1.0;
+    valueUint_[2].second = 0.0;
+    valueUint_[3].first = 1.0;
+    valueUint_[3].second = 1.0;
+    learningRate_ = 1.0;
+  }
+
+  void test_sgdUpadate(real* gradientBuffer,
+                       real* valueBuffer,
+                       real* momentumBuffer,
+                       size_t size);
+
+  virtual void TreaDown() { LOG(INFO) << "All Test Finished."; }
+
+ protected:
+  std::vector<std::pair<real, real>> valueUint_;
+  std::vector<size_t> sizeVec_;
+  real learningRate_;
+  StatSet testStat_;
+};
+
+void CommonTest::test_sgdUpadate(real* gradientBuffer,
+                                 real* valueBuffer,
+                                 real* momentumBuffer,
+                                 size_t size) {
+// sgdUpdateAvx has no double version yet
+#if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE)
+  real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0;
+  real* gradTmp = new real[size];
+  real* valueTmp = new real[size];
+  real* momentumTmp = new real[size];
+  memcpy(gradTmp, gradientBuffer, size * sizeof(real));
+  memcpy(valueTmp, valueBuffer, size * sizeof(real));
+  memcpy(momentumTmp, momentumBuffer, size * sizeof(real));
+  for (auto& arg : valueUint_) {
+    {
+      {
+        struct timeval t;
+        REGISTER_TIMER("gettimeofday", 0, testStat_);
+        gettimeofday(&t, NULL);
+      }
+      REGISTER_TIMER("avxTimer", 0);
+      sgdUpdateAvx(learningRate_,
+                   arg.first,
+                   arg.second,
+                   size,
+                   valueBuffer,
+                   gradientBuffer,
+                   momentumBuffer);
+    }
+    for (size_t i = 0; i < size; i++) {
+      valueSum1 += valueBuffer[i];
+      momSum1 += momentumBuffer[i];
+      // std::cout << "["
+      //          << valueBuffer[i]
+      //          << "," << momentumBuffer[i]
+      //          << "," << gradientBuffer[i] << "],";
+    }
+    {
+      REGISTER_TIMER("cpuTimer", 0);
+      sgdUpdateCpu(learningRate_,
+                   arg.first,
+                   arg.second,
+                   size,
+                   valueTmp,
+                   gradTmp,
+                   momentumTmp);
+    }
+    for (size_t i = 0; i < size; i++) {
+      valueSum2 += valueTmp[i];
+      momSum2 += momentumTmp[i];
+      // std::cout << "["
+      //          << valueTmp[i]
+      //          << "," << momentumTmp[i]
+      //          << "," << gradTmp[i] << "],";
+    }
+
+    VLOG(3) << "valueSum1 = " << valueSum1 << " ; valueSum2 = " << valueSum2;
+    VLOG(3) << "momSum1 = " << momSum1 << " ; momSum2 = " << momSum2;
+    ASSERT_EQ(valueSum1, valueSum2);
+    ASSERT_EQ(momSum1, momSum2);
+  }
+  delete[] gradTmp;
+  delete[] valueTmp;
+  delete[] momentumTmp;
+#endif
+}
+
+TEST_F(CommonTest, sgdUpdate) {
+  const size_t alignHeader[] = {0, 2, 3, 5, 7, 8};
+  for (auto& size : sizeVec_) {
+    real *gradientBuffer, *valueBuffer, *momentumBuffer;
+    CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size),
+             0);
+    CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0);
+    CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size),
+             0);
+
+    for (size_t i = 0; i < size; i++) {
+      gradientBuffer[i] = 1.0;
+      valueBuffer[i] = 2.0;
+      momentumBuffer[i] = 3.0;
+    }
+    for (int i = 0; i < 6; i++) {
+      LOG(INFO) << "----------------------" << size << ":" << alignHeader[i]
+                << "-------------------------";
+      test_sgdUpadate(&gradientBuffer[alignHeader[i]],
+                      &valueBuffer[alignHeader[i]],
+                      &momentumBuffer[alignHeader[i]],
+                      size - alignHeader[i]);
+    }
+    free(gradientBuffer);
+    free(valueBuffer);
+    free(momentumBuffer);
+  }
+  globalStat.printAllStatus();
+  testStat_.printAllStatus();
+}
+
+TEST_F(CommonTest, syncThreadPool) {
+  SyncThreadPool pool(10);
+
+  std::vector<int> nums;
+  nums.resize(10);
+
+  pool.exec([&](int tid, size_t numThreads) { nums[tid] = tid; });
+  for (size_t i = 0; i < nums.size(); ++i) {
+    EXPECT_EQ((int)i, nums[i]);
+  }
+
+  pool.exec([&](int tid, size_t numThreads) { nums[tid] -= tid; });
+  for (size_t i = 0; i < nums.size(); ++i) {
+    EXPECT_EQ((int)0, nums[i]);
+  }
+}
diff --git a/paddle/legacy/pserver/BaseClient.cpp b/paddle/legacy/pserver/BaseClient.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..13bb8a1cc58580a8e0af31c23b420836c7422ad8
--- /dev/null
+++ b/paddle/legacy/pserver/BaseClient.cpp
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BaseClient.h"
+#include <gflags/gflags.h>
+#include <string.h>
+#include <vector>
+#include "paddle/legacy/utils/Stat.h"
+
+DECLARE_string(pservers);
+
+namespace paddle {
+
+BaseClient::BaseClient(bool separate, int numPorts)
+    : stopping_(false), numPorts_(numPorts), separateSendAndRecv_(separate) {
+  CHECK_GT(numPorts, 0);
+}
+
+BaseClient::~BaseClient() {}
+
+void BaseClient::recvData() { recvSyncBarrier_->wait(); }
+
+void BaseClient::synchronize(SyncObject syncObjectId) {
+  SynchronizeRequest request;
+  request.set_sync_object_id(syncObjectId);
+  std::vector<SynchronizeResponse> responses;
+  multiCall(__func__, request, &responses);
+}
+
+void BaseClient::startThreads() {
+  if (!separateSendAndRecv_) {
+    return;
+  }
+  recvSyncBarrier_.reset(new ThreadBarrier(threadNum_ + 1));
+
+  sendThreads_.resize(threadNum_);
+  recvThreads_.resize(threadNum_);
+  sendJobQueue_.resize(threadNum_);
+  recvJobQueue_.resize(threadNum_);
+
+  for (int i = 0; i < threadNum_; ++i) {
+    sendJobQueue_[i].reset(new SendQueue());
+    recvJobQueue_[i].reset(new SendQueue());
+
+    sendThreads_[i].reset(
+        new std::thread([this](int id) { this->send(id); }, i));
+
+    recvThreads_[i].reset(
+        new std::thread([this](int id) { this->recv(id); }, i));
+  }
+}
+
+void BaseClient::finishThreads() {
+  if (!separateSendAndRecv_) {
+    return;
+  }
+  stopping_ = true;
+  for (int i = 0; i < threadNum_; i++) {
+    sendJobQueue_[i]->enqueue(nullptr);
+  }
+  for (auto& thread : sendThreads_) {
+    thread->join();
+  }
+  for (auto& thread : recvThreads_) {
+    thread->join();
+  }
+  stopping_ = false;
+}
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/BaseClient.h b/paddle/legacy/pserver/BaseClient.h
new file mode 100644
index 0000000000000000000000000000000000000000..66e8f39cd60998122bb8958b12b23ee7142be94d
--- /dev/null
+++ b/paddle/legacy/pserver/BaseClient.h
@@ -0,0 +1,311 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ParameterService.pb.h"
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/pserver/ProtoServer.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/Queue.h"
+
+namespace paddle {
+
+/**
+ * it manages all connections to pservers.
+ * it exists two modes to manage connections to all pservers. Firstly, one
+ * connection owns two threads that separately manage to send and receive
+ * data. Secondly, each thread uses one connection for all activation in it.
+ * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/
+ * recvJobQueue_. the second solution use some shared thread pool to manage
+ * connections.
+ */
+class BaseClient {
+ protected:
+  typedef std::unique_ptr<std::thread> ThreadPtr;
+  typedef std::vector<std::vector<iovec>> InputIovs;
+  typedef std::vector<SendParameterRequest> SendRequest;
+  typedef std::vector<SendDataRequest> SendDataRequestVec;
+
+  // TODO(yanfei):
+  // refine data structure to unify parameter and features communication
+  struct SendJob {
+    /// store parameters related blocks data
+    InputIovs parallelInputIovs;
+    /// store protobuf request
+    SendRequest parallelRequests;
+    /// store data, such as features for metric learning
+    SendDataRequestVec parallelDataRequests;
+  };
+
+ public:
+  explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num);
+
+  virtual ~BaseClient();
+
+  typedef std::shared_ptr<SendJob> SendJobPtr;
+  typedef Queue<SendJobPtr> SendQueue;
+
+  /// send data to server, support only synchronize
+  template <class DataType>
+  void putData(int clientId,
+               SendDataType type,
+               DataType* datas,
+               size_t size,
+               DataUpdateMode mode) {
+    synchronize(SYNC_DATA);
+    sendData(clientId, type, mode, datas, size);
+    recvData();
+    synchronize(SYNC_DATA);
+  }
+
+  template <class DataType>
+  void putOwnData(int clientId,
+                  SendDataType type,
+                  DataType* datas,
+                  size_t size) {
+    putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN);
+  }
+
+  template <class DataType>
+  void getAllData(int clientId,
+                  SendDataType type,
+                  DataType* datas,
+                  size_t size) {
+    sendData(clientId,
+             type,
+             DATA_UPDATE_MODE_GET_ALL,
+             reinterpret_cast<DataType*>(NULL),
+             0);
+    recvData();
+    size_t dataOffset = 0;
+    for (auto& recvMem : recvDataMems_) {
+      CHECK_LE(dataOffset, size);
+      size_t memSize = std::min(recvMem.get()->getSize(),
+                                sizeof(DataType) * (size - dataOffset));
+      CHECK_EQ(memSize % sizeof(DataType), size_t(0));
+      memcpy(datas + dataOffset, recvMem.get()->getBuf(), memSize);
+      dataOffset += memSize / sizeof(DataType);
+    }
+    CHECK_EQ(dataOffset, size);
+  }
+
+  /**
+   * Reduces values on all clients.
+   * This reduce just support SUM.
+   * The results are saved in recvBuf of rootId client
+   */
+  template <class DataType>
+  void reduce(DataType* sendBuf,
+              DataType* recvBuf,
+              size_t size,
+              int clientId,
+              int rootId) {
+    putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size);
+    if (rootId == clientId) {
+      getAllData(clientId, DATA_REDUCE_SUM, recvBuf, size);
+    }
+  }
+
+  /**
+   * return trans data type according to the input type
+   */
+  virtual TransDataType getTransDtype(const std::type_info& info) {
+    TransDataType dataType;
+    if (typeid(int*) == info) {  // NOLINT
+      dataType = TRANS_INT32;
+    } else if (typeid(uint32_t*) == info) {  // NOLINT
+      dataType = TRANS_UINT32_T;
+    } else if (typeid(int64_t*) == info) {  // NOLINT
+      dataType = TRANS_INT64_T;
+    } else if (typeid(uint64_t*) == info) {  // NOLINT
+      dataType = TRANS_UINT64_T;
+    } else if (typeid(float*) == info) {  // NOLINT
+      dataType = TRANS_FLOAT;
+    } else if (typeid(double*) == info) {  // NOLINT
+      dataType = TRANS_DOUBLE;
+    } else {
+      LOG(FATAL) << "not supported";
+    }
+    return dataType;
+  }
+
+ protected:
+  /// for a > 0, b > 0:
+  /// return the smallest x s.t. b*x >= a
+  static int divup(int a, int b) { return (a + b - 1) / b; }
+
+  int calcClientId(int i, int serviceNum) {
+    return (i + FLAGS_trainer_id * numPorts_) % serviceNum;
+  }
+
+  /// start threads in sendThreads_ and recvThreads_
+  void startThreads();
+
+  /// finish threads in sendThreads_ and recvThreads_
+  void finishThreads();
+
+  template <class DataType>
+  void prepareData(int clientId,
+                   SendDataType type,
+                   DataUpdateMode updateMode,
+                   DataType* datas,
+                   size_t size,
+                   SendJob* sendJob) {
+    sendJob->parallelDataRequests.resize(serviceNum_);
+    sendJob->parallelInputIovs.resize(serviceNum_);
+    for (int i = 0; i < serviceNum_; ++i) {
+      auto& request = sendJob->parallelDataRequests[i];
+      request.set_update_mode(updateMode);
+      request.set_type(type);
+      request.set_client_id(clientId);
+      request.set_server_id(i);
+    }
+
+    /// split datas which need send to Server into serviceNum_ pieces
+    if (!datas) {
+      CHECK(!size) << "ownSize should be zero since datas is nullptr";
+    }
+    size_t baseSize = size / serviceNum_;
+    size_t dataOffset = 0;
+    for (int i = 0; i < serviceNum_; ++i) {
+      auto& request = sendJob->parallelDataRequests[i];
+      DataBlock* block = request.add_blocks();
+      size_t ownSize = size_t(i) < size % serviceNum_ ? baseSize + 1 : baseSize;
+      size_t realSize = datas ? std::max(ownSize, size_t(1)) : 0;
+      block->set_total_size(realSize * sizeof(DataType));
+      block->set_data_size(sizeof(DataType));
+      // TODO(yuyang18): The getTransDtype can be rewritten as template method
+      //                 to reduce runtime overhead.
+      block->set_data_type(getTransDtype(typeid(DataType*)));  // NOLINT
+      if (datas) {
+        sendJob->parallelInputIovs[i].push_back(
+            {datas + dataOffset, realSize * sizeof(DataType)});
+      }
+      dataOffset += ownSize;
+    }
+    CHECK_EQ(dataOffset, size);
+  }
+
+  /**
+   * @brief send data to all data servers
+   *
+   * @note  each trainer sends all its data to all data servers
+   *        it's for broadcast data synchronization, such as features
+   *        synchronization in metric learning.
+   */
+  template <class DataType>
+  void sendData(int clientId,
+                SendDataType type,
+                DataUpdateMode updateMode,
+                DataType* datas,
+                size_t size) {
+    SendJobPtr sendJob = std::make_shared<SendJob>();
+    prepareData(clientId, type, updateMode, datas, size, sendJob.get());
+    for (int i = 0; i < threadNum_; ++i) {
+      sendJobQueue_[i]->enqueue(sendJob);
+    }
+  }
+
+  /**
+   * @brief recv data from all data servers
+   *
+   * @note  synchronize all recv threads
+   */
+  void recvData();
+
+  /// send request, and recv responses
+  template <typename ProtoIn, typename ProtoOut>
+  void multiCall(const char* funcName,
+                 const ProtoIn& request,
+                 std::vector<ProtoOut>* responses) {
+    responses->resize(clients_.size());
+    size_t numClients = clients_.size();
+    for (size_t i = 0; i < numClients; ++i) {
+      clients_[i].send(funcName, request);
+    }
+    for (size_t i = 0; i < numClients; ++i) {
+      clients_[i].recv(&(*responses)[i]);
+    }
+  }
+
+  /**
+   * @brief synchronize all trainers and pservers
+   *
+   * @note  used to ensure that data of all trainers have been received
+   */
+  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
+
+  /**
+   * @brief use multithread to separately send data
+   *
+   * @note  each thread should read its own JobQueue to handle requests
+   *        each thread should calcClientId() to retrieve connections
+   *        managed by himself.
+   *        send and recv are implemented in child class.
+   */
+  virtual void send(int threadId) = 0;
+
+  /**
+   * @brief use multithread to separately receive data
+   *
+   * @note  almost same as send()
+   */
+  virtual void recv(int threadId) = 0;
+
+ protected:
+  bool stopping_;
+  /// nodes * ports that means the number of real pservers
+  int serviceNum_;
+  /**
+   * threads num for managing all services. Normally the
+   * number of pservers are relatively less than several
+   * hundreds so that using thread-based parallelization
+   * can benifit traffic performance and pserver's sgd
+   * optimization performance.
+   */
+  int threadNum_;
+  /// the connection manager at client end
+  std::vector<ProtoClient> clients_;
+  /// send threads for parallelization
+  std::vector<ThreadPtr> sendThreads_;
+  /// recv threads for parallelization
+  std::vector<ThreadPtr> recvThreads_;
+  std::unique_ptr<ThreadBarrier> recvSyncBarrier_;
+
+  // TODO(yanfei):
+  // current pserver's will return value until all parameters'
+  // optimization are finished so that recv are not overlapped
+  // in reality. More robust implimentation should be to pipeline
+  // all send/recv action based on parameter unit level, and
+  // it will benifits deep and larger model training in future,
+  // especially local node compution power surpasses inter-connection
+  // such as GPU cluster, even with BOX GPU cluster.
+  // queue for buffering send request
+  /**
+   * send/recv queue cooperates with each other to accomplish
+   * overlapping communication with forwardBackward action.
+   */
+  std::vector<std::unique_ptr<SendQueue>> sendJobQueue_;
+  /// queue for buffering recv request
+  std::vector<std::unique_ptr<SendQueue>> recvJobQueue_;
+  /// specific for dserver
+  SendJob sendJob_;
+  /// port num for each node
+  int numPorts_;
+  /// if set, overlapped optimization is disabled
+  bool separateSendAndRecv_;
+  std::vector<CpuMemHandlePtr> recvDataMems_;
+};
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/CMakeLists.txt b/paddle/legacy/pserver/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0ae9c6ef6afc6ec5a99a685b08883def0db51cf1
--- /dev/null
+++ b/paddle/legacy/pserver/CMakeLists.txt
@@ -0,0 +1,56 @@
+# parameter server package
+
+######################### paddle_network ####################
+set(NETWORK_SOURCES
+    LightNetwork.cpp
+    SocketChannel.cpp
+    ProtoServer.cpp)
+
+set(NETWORK_HEADERS
+    LightNetwork.h
+    SocketChannel.h
+    ProtoServer.h)
+
+add_library(paddle_network STATIC
+    ${NETWORK_SOURCES})
+
+add_dependencies(paddle_network paddle_proto ${external_project_dependencies})
+
+################### paddle_pserver ######################
+set(PSERVER_SOURCES
+    BaseClient.cpp
+    ParameterClient2.cpp
+    ParameterServer2.cpp
+    SparseParameterDistribution.cpp
+    ParameterServerController.cpp)
+
+set(PSERVER_HEADERS
+    BaseClient.h
+    ParameterClient2.h
+    ParameterServer2.h
+    SparseParameterDistribution.h
+    ParameterServerController.h)
+
+add_library(paddle_pserver STATIC
+    ${PSERVER_SOURCES})
+
+add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
+
+set(PSERVER_MAIN_SOURCES
+    ParameterServer2Main.cpp)
+
+if(WITH_TESTING)
+  add_subdirectory(test)
+endif()
+
+if(NOT MOBILE_INFERENCE)
+  add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
+  link_paddle_exe(paddle_pserver_main)
+
+  install(TARGETS paddle_pserver_main
+          RUNTIME DESTINATION opt/paddle/bin
+          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
+
+  set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+endif()
diff --git a/paddle/legacy/pserver/LightNetwork.cpp b/paddle/legacy/pserver/LightNetwork.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..469c95853ecdc02a6028417ca37b0020406eea09
--- /dev/null
+++ b/paddle/legacy/pserver/LightNetwork.cpp
@@ -0,0 +1,459 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fcntl.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <chrono>
+
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+#include <sstream>
+
+#include "LightNetwork.h"
+#include "RDMANetwork.h"
+#include "paddle/legacy/utils/StringUtil.h"
+#include "paddle/legacy/utils/Util.h"
+
+/// quick ack can reduce the latency of small message
+DEFINE_bool(small_messages,
+            false,
+            "if message size is small, recommend set it True to enable quick "
+            "ack and no delay");
+
+/// reasonable sock_send_buf_size can control the traffic injected into switch
+/// network. Injecting too many data into traffic could cause packets loss which
+/// cause long latency and degrade the efficiency of communication.
+DEFINE_int32(sock_send_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock send buff size, can reduce network congestion if "
+             "set carefully");
+
+/// reasonable size can hold bursted packets and reduce packets loss
+DEFINE_int32(sock_recv_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock recv buff size");
+
+/// reasonable sock_listen_queue_size can control maximum pending connections.
+DEFINE_int32(sock_listen_queue_size,
+             1024,
+             "listen queue size when pserver listen a TCP port");
+
+namespace paddle {
+
+/**
+ * @brief get ip address from interface name
+ *
+ * @param[in] device device interface name
+ */
+std::string getIpAddr(std::string &device) {
+  int sock;
+  struct sockaddr_in sin;
+  struct ifreq ifr;
+
+  sock = socket(AF_INET, SOCK_DGRAM, 0);
+  CHECK(sock >= 0) << "Create socket error.";
+
+  strncpy(ifr.ifr_name, device.c_str(), IFNAMSIZ);
+  ifr.ifr_name[IFNAMSIZ - 1] = 0;
+
+  CHECK_GE(ioctl(sock, SIOCGIFADDR, &ifr), 0);
+  memcpy(&sin, &ifr.ifr_addr, sizeof(sin));
+  close(sock);
+  return std::string(inet_ntoa(sin.sin_addr));
+}
+
+/**
+ * @brief set sock option
+ *
+ * @param[in] sockfd sock file descriptor
+ *
+ * @note adjust some default sock option for better performance
+ */
+void setOption(int sockfd) {
+#if !defined(__APPLE__) && !defined(__OSX__)
+  int sendSize = FLAGS_sock_send_buf_size;
+  int recvSize = FLAGS_sock_recv_buf_size;
+  CHECK_GE(
+      setsockopt(sockfd, SOL_SOCKET, SO_RCVBUF, &recvSize, sizeof(recvSize)),
+      0);
+  CHECK_GE(
+      setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)),
+      0);
+#endif
+
+  if (FLAGS_small_messages) {
+    int optval = 1;
+    CHECK_GE(
+        setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)),
+        0);
+#ifdef TCP_QUICKACK
+    optval = 1;
+    CHECK_GE(
+        setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)),
+        0);
+#endif
+  }
+  int reuse = 1;
+  CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)),
+           0);
+}
+
+/**
+ * @brief class constructor for SocketServer
+ * @param[in] addr sock bind address
+ * @param[in] port sock bind port
+ * @param[in] rdmaCpu rdma sock bind cpu core
+ *
+ * @note start one socket server which hosts parameter server process.
+ *       rdmaCpu is passed to rdma deamon for better performance, and
+ *       start tcp socket instead of rdma socket if rdmaCpu is equal
+ *       to -1. Each trainer process starts one connection to one socket
+ *       server, and use --ports_num to build more connections to harness
+ *       fat communication channel if necessary.
+ *       each connection is controlled by single thread with blocking
+ *       read and write.
+ */
+SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
+    : port_(port), addr_(addr), stopping_(false) {
+  if (rdmaCpu == -1) {
+    tcpRdma_ = F_TCP;
+    socket_ = 0;
+    maxPendingConnections_ = FLAGS_sock_listen_queue_size;
+  } else {
+    tcpRdma_ = F_RDMA;
+    rdmaCpu_ = rdmaCpu;
+    rdmaSocket_ = 0;
+
+    std::stringstream ss;
+    ss << port;
+    rdmaUri_ = "rdma://" + addr + ":" + ss.str();
+  }
+
+  /// trigger to initialize RDMA lib
+  CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
+}
+
+SocketServer::~SocketServer() {
+  stopping_ = true;
+  /// trigger accept thread to stop
+  {
+    SocketClient trigger(addr_.empty() ? "127.0.0.1" : addr_, port_, tcpRdma_);
+  }
+  this->join();
+}
+
+/**
+ * @brief start one tcp server which hosts parameter server
+ *
+ * @note do tcp socket bind and listen. it will spawn one thread
+ *       for each connection
+ */
+void SocketServer::tcpServer() {
+  int newsockfd;
+  socklen_t clilen;
+  struct sockaddr_in serv_addr, cli_addr;
+  struct hostent *server;
+
+  /// First call to socket() function
+  socket_ = socket(AF_INET, SOCK_STREAM, 0);
+  CHECK(socket_ >= 0) << "ERROR opening socket";
+
+  /// Initialize socket structure
+  bzero((char *)&serv_addr, sizeof(serv_addr));
+  serv_addr.sin_family = AF_INET;
+  serv_addr.sin_port = htons(port_);
+  if (!addr_.empty()) {
+    server = gethostbyname(addr_.c_str());
+    CHECK(server) << "ERROR, no such host: " << addr_;
+    bcopy((char *)server->h_addr,
+          (char *)&serv_addr.sin_addr.s_addr,
+          server->h_length);
+  } else {
+    serv_addr.sin_addr.s_addr = INADDR_ANY;
+  }
+
+  setOption(socket_);
+
+  /// Now bind the host address using bind() call.
+  CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
+      << "ERROR on binding " << addr_;
+
+  /// Now start listening for the clients, here process will
+  /// go in sleep mode and will wait for the incoming connection
+  listen(socket_, maxPendingConnections_);
+  clilen = sizeof(cli_addr);
+
+  while (true) {
+    /// Accept actual connection from the client
+    newsockfd = accept(socket_, (struct sockaddr *)&cli_addr, &clilen);
+    if (stopping_) {
+      break;
+    }
+    CHECK(newsockfd >= 0) << "ERROR on accept";
+    constexpr int kPeerNameLen = 128;
+    char peerName[kPeerNameLen];
+    CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen));
+
+    SocketWorker *worker =
+        new SocketWorker(createChannel(newsockfd, std::string(peerName)), this);
+    worker->start();
+    worker->detach();
+  }
+  close(socket_);
+  LOG(INFO) << "pserver accept thread finish, addr=" << addr_
+            << " port=" << port_;
+}
+
+/**
+ * @brief start one rdma server which hosts parameter server
+ *
+ * @note do rdma bind and listen, which calling self-defined socket
+ *       like rdma library. it will spawn one thread for each connection
+ */
+void SocketServer::rdmaServer() {
+  struct sxi_sock *newsock;
+
+  /// First call to socket() function
+  rdmaSocket_ = rdma::ssocket(rdmaCpu_);
+  CHECK(rdmaSocket_) << "ERROR opening RDMA socket";
+
+  CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
+      << "ERROR bind RDMA socket";
+
+  /// Now start listening for the clients, here process will
+  /// go in sleep mode and will wait for the incoming connection
+  CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
+
+  while (true) {
+    /// Accept actual connection from the client
+    newsock = rdma::accept(rdmaSocket_);
+    if (stopping_) {
+      break;
+    }
+    CHECK(newsock) << "ERROR on accept";
+
+    constexpr int kPeerNameLen = 128;
+    char peerName[kPeerNameLen];
+
+    struct sockaddr_in *saddr = rdma::getSourceAddress(newsock);
+    CHECK(inet_ntop(AF_INET, &saddr->sin_addr, peerName, kPeerNameLen));
+
+    SocketWorker *worker =
+        new SocketWorker(createChannel(newsock, std::string(peerName)), this);
+    worker->start();
+    worker->detach();
+  }
+  rdma::close(rdmaSocket_);
+  LOG(INFO) << "pserver accept thread finish, rdma uri=" << rdmaUri_;
+}
+
+/**
+ * @brief start a socket server
+ *
+ * @note framework for starting socket server
+ */
+void SocketServer::run() {
+  if (tcpRdma_ == F_TCP) {
+    LOG(INFO) << "tcp server start ";
+    tcpServer();
+  } else if (tcpRdma_ == F_RDMA) {
+    LOG(INFO) << "rdma server start ";
+    rdmaServer();
+  }
+}
+
+/**
+ * @brief class constructor for rdma client deamons
+ *
+ * @note  automatically start several client deamons for better performance
+ */
+std::unique_ptr<RdmaClientDaemons> RdmaClientDaemons::daemons_ = nullptr;
+std::once_flag RdmaClientDaemons::initDataFlag_;
+
+RdmaClientDaemons::RdmaClientDaemons() {
+  if (FLAGS_rdma_tcp == "rdma") {
+    rdma::init();
+
+    struct sxi_socket *socket;
+    onlineCpus_ = rdma::numCpus();
+    for (auto i = 0; i < onlineCpus_; i++) {
+      socket = rdma::csocket(i);
+      CHECK(socket) << "ERROR open client socket daemon";
+
+      rdmaClientSocket_.push_back(socket);
+    }
+    LOG(INFO) << "RDMA client daemons started, onlineCpus_:" << onlineCpus_;
+    /// round robin scheduler for new connection
+    curCpu_ = 0;
+    /// wait daemons to start completely.
+    sleep(2);
+  }
+}
+
+RdmaClientDaemons::~RdmaClientDaemons() {
+  if (FLAGS_rdma_tcp == "rdma") {
+    for (auto i = 0; i < onlineCpus_; i++) {
+      rdma::close(rdmaClientSocket_[i]);
+    }
+    LOG(INFO) << "RDMA client daemons is destoryed, onlineCpus_ "
+              << onlineCpus_;
+  }
+}
+
+/**
+ * @brief worker thread main context
+ *
+ * @note  each connection from client(trainer) is controlled by single worker
+ *        thread, which is for handling all parameter server requests
+ */
+void SocketWorker::run() {
+  LOG(INFO) << "worker started, peer = " << channel_->getPeerName();
+
+  std::vector<iovec> inputIovs;
+
+  while (true) {
+    std::unique_ptr<MsgReader> msgReader = channel_->readMessage();
+    if (!msgReader) {
+      break;
+    }
+
+    auto callback = [this](const std::vector<iovec> &outputIovs) {
+      channel_->writeMessage(outputIovs);
+    };
+
+    server_->handleRequest(std::move(msgReader), callback);
+  }
+
+  LOG(INFO) << "worker begin to finish, peer = " << channel_->getPeerName();
+  delete this;
+}
+
+/**
+ * @brief start one tcp connection to tcp server
+ * @param[in] serverAddr  tcp server ip
+ * @param[in] serverPort  tcp server port
+ *
+ * @note each object contains one channel which accept byte stream
+ */
+void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
+  struct sockaddr_in serv_addr;
+  struct hostent *server;
+
+  int errRet;  // temp for gethostbyname_r
+
+  /// Create a socket point
+  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
+  CHECK(sockfd >= 0) << "ERROR opening socket";
+
+#if defined(__OSX__) || defined(__APPLE__)
+  server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
+  CHECK_NE(HOST_NOT_FOUND, errRet) << "ERROR, no such host: " << serverAddr
+                                   << " ret = " << errRet;
+  CHECK(server) << "getipnodebyname error!";
+#else
+  struct hostent hostinfo;
+  char buf[1024];  // temp for gethostbyname_r
+  CHECK_EQ(
+      0,
+      gethostbyname_r(
+          serverAddr.c_str(), &hostinfo, buf, sizeof(buf), &server, &errRet))
+      << "ERROR, no such host: " << serverAddr << " ret = " << errRet;
+  CHECK(server) << "gethostbyname_r error!";
+#endif
+
+  bzero((char *)&serv_addr, sizeof(serv_addr));
+  serv_addr.sin_family = AF_INET;
+  bcopy((char *)server->h_addr,
+        (char *)&serv_addr.sin_addr.s_addr,
+        server->h_length);
+  serv_addr.sin_port = htons(serverPort);
+
+  setOption(sockfd);
+
+  /// Now connect to the server
+  int retry_count = 0;
+  do {
+    if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) {
+      break;
+    }
+
+    if (errno == ECONNREFUSED) {
+      LOG(WARNING) << "connection refused by pserver, try again!";
+      if (retry_count++ >= 7) {
+        LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
+      }
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+    } else {
+      CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
+                        << serverPort << "errorno: " << errno;
+    }
+  } while (errno == ECONNREFUSED);
+
+  channel_.reset(new SocketChannel(sockfd, serverAddr));
+  tcpRdma_ = F_TCP;
+}
+
+/**
+ * @brief start one RDMA connection to rdma server
+ * @param[in] serverAddr  rdma server ip
+ * @param[in] serverPort  rdma server port
+ *
+ * @note  each object contains one channel which accept byte stream
+ *        for rdma, low level sock also provide byte stream api.
+ */
+void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) {
+  struct sxi_sock *sock;
+
+  std::stringstream ss;
+  ss << serverPort;
+
+  std::string rdmaUri = "rdma://" + serverAddr + ":" + ss.str();
+
+  RdmaClientDaemons *daemons = RdmaClientDaemons::daemons_->get();
+  socketDaemon_ = daemons->selectDaemon();
+
+  /// connect to server with socket daemon
+  sock = rdma::connect(socketDaemon_, rdmaUri.c_str());
+  CHECK(sock) << "ERROR connect to server" << rdmaUri;
+
+  std::vector<std::string> seg;
+  str::split(rdmaUri, '/', &seg);
+  std::string server = seg.at(seg.size() - 1);
+  channel_.reset(new SocketChannel(sock, server));
+  tcpRdma_ = F_RDMA;
+}
+
+/**
+ * @brief class constructor
+ * @param[in] serverAddr pserver ip address
+ * @param[in] serverPort pserver port
+ * @param[in] ChannelType F_TCP or F_RDMA
+ *
+ * @note  responsible for building one connection to specified pserver port
+ */
+SocketClient::SocketClient(const std::string &serverAddr,
+                           int serverPort,
+                           enum ChannelType channelType) {
+  if (channelType == F_RDMA)
+    RdmaClient(serverAddr, serverPort);
+  else
+    TcpClient(serverAddr, serverPort);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/LightNetwork.h b/paddle/legacy/pserver/LightNetwork.h
new file mode 100644
index 0000000000000000000000000000000000000000..380f86832f5894fdf29588dde9a77068c624e066
--- /dev/null
+++ b/paddle/legacy/pserver/LightNetwork.h
@@ -0,0 +1,185 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "SocketChannel.h"
+
+#include <atomic>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "paddle/legacy/utils/Thread.h"
+
+struct sxi_socket;
+
+namespace paddle {
+
+class SocketWorker;
+
+/**
+ * @brief class for holding all parameters processing for current port
+ *
+ * @note  each parameter server inherits from one socket server, each
+ *        server contains serveral woker threads which are to parallelize
+ *        the processing of computation, but share some common datas stored
+ *        in child class of socketserver.
+ */
+class SocketServer : public Thread {
+  // rdmaCpu controls the cpu affinity of RDMA server daemon,
+  // which could benifit performance. rdmaCpu = -1 means TCP
+  // is used instead of RDMA transport.
+ public:
+  SocketServer(const std::string& addr, int port, int rdmaCpu);
+  ~SocketServer();
+
+  virtual void run();
+
+  typedef std::function<void(const std::vector<iovec>& outputIovs)>
+      ResponseCallback;
+
+ protected:
+  //
+  // The derived class needs to implement this function
+  // to handle the request received by SocketWorker
+  // The request is encapsulated by MsgReader, which contains
+  // a set of blocks.
+  virtual void handleRequest(std::unique_ptr<MsgReader> msgReader,
+                             ResponseCallback callback) = 0;
+
+  std::unique_ptr<SocketChannel> createChannel(int sock,
+                                               const std::string& peerName) {
+    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
+  }
+  std::unique_ptr<SocketChannel> createChannel(struct sxi_sock* sock,
+                                               const std::string& peerName) {
+    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
+  }
+
+  friend class SocketWorker;
+
+ private:
+  void rdmaServer();
+  void tcpServer();
+
+  void detach() {}  // detach accept thread is forbidden
+
+ protected:
+  enum ChannelType tcpRdma_;
+  // for rdma
+  int rdmaCpu_;
+  std::string rdmaUri_;
+  sxi_socket* rdmaSocket_;
+  // for tcp
+  int port_;
+  std::string addr_;
+  int socket_;
+  int maxPendingConnections_;
+  bool stopping_;
+};
+
+/**
+ * @brief class for holding one connection from one trainer
+ *
+ * @note  all parameter processing will run in the context of this worker
+ */
+class SocketWorker : public Thread {
+ public:
+  SocketWorker(std::unique_ptr<SocketChannel>&& channel, SocketServer* server)
+      : channel_(std::move(channel)), server_(server) {}
+
+  virtual ~SocketWorker() {}
+
+  virtual void run();
+
+ protected:
+  std::unique_ptr<SocketChannel> channel_;
+  SocketServer* server_;
+  enum ChannelType tcpRdma_;
+};
+
+/**
+ * @brief class for providing rdma client deamon thread
+ *
+ * @note  the deamons are required by sock like rdam library. Here
+ *        use singleton model for daemons. Each deamon hosts in
+ *        single cpu core for better load balance performance
+ */
+class RdmaClientDaemons {
+ private:
+  RdmaClientDaemons();
+
+  static std::unique_ptr<RdmaClientDaemons> daemons_;
+
+ public:
+  static RdmaClientDaemons* get() {
+    std::call_once(RdmaClientDaemons::initDataFlag_,
+                   &RdmaClientDaemons::getInstance);
+
+    return daemons_.get();
+  }
+
+  struct sxi_socket* selectDaemon() {
+    int cpu = curCpu_;
+    curCpu_ = (curCpu_ + 1) % onlineCpus_;
+
+    LOG(INFO) << "select daemon " << cpu << "onlineCpus_ " << onlineCpus_;
+    return rdmaClientSocket_[cpu];
+  }
+
+  ~RdmaClientDaemons();
+
+ public:
+  friend class SocketClient;
+
+ private:
+  static std::once_flag initDataFlag_;
+  static void getInstance() {
+    if (!daemons_.get()) daemons_.reset(new RdmaClientDaemons());
+  }
+
+  std::vector<struct sxi_socket*> rdmaClientSocket_;
+  std::atomic<int> curCpu_;
+  int onlineCpus_;
+};
+
+/**
+ * @brief management for client connection which are from trainers
+ *
+ * @note  it contains one channel descriptor which used to write and
+ *        read data
+ */
+class SocketClient {
+ public:
+  SocketClient(const std::string& serverAddr,
+               int serverPort,
+               enum ChannelType channelType);
+
+  SocketChannel* getChannel() { return channel_.get(); }
+
+ protected:
+  std::unique_ptr<SocketChannel> channel_;
+  struct sxi_socket* socketDaemon_;
+  enum ChannelType tcpRdma_;
+
+ private:
+  void RdmaClient(const std::string& serverAddr, int serverPort);
+  void TcpClient(const std::string& serverAddr, int serverPort);
+};
+
+std::string getIpAddr(std::string& device);
+void setOption(int sockfd);
+
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterClient2.cpp b/paddle/legacy/pserver/ParameterClient2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4c544ddc28517f50e7deb23d4fa7a82b34d42677
--- /dev/null
+++ b/paddle/legacy/pserver/ParameterClient2.cpp
@@ -0,0 +1,781 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+
+#include "ParameterClient2.h"
+#include "paddle/legacy/math/SparseRowMatrix.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/StringUtil.h"
+
+DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
+DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
+
+namespace paddle {
+
+template <typename T1, typename T2>
+void copyToRepeatedField(google::protobuf::RepeatedField<T1>* dest,
+                         const T2* src,
+                         size_t size) {
+  dest->Clear();
+  dest->Reserve(size);
+  for (size_t i = 0; i < size; ++i) {
+    dest->AddAlreadyReserved(src[i]);
+  }
+}
+
+ParameterClient2::ParameterClient2(bool separate, int port, int numPorts)
+    : BaseClient(separate, numPorts), port_(port) {
+#ifndef PADDLE_DISABLE_TIMER
+  forwardbackwordTime_ = 0;
+#endif
+}
+
+int ParameterClient2::calcParameterBlockSize(
+    const std::vector<ParameterPtr>& parameters, size_t serviceNum) {
+  size_t totalSize = 0;
+  for (auto& para : parameters) {
+    totalSize += para->getSize();
+  }
+  size_t perServerSize = totalSize / serviceNum;
+
+  int sizeBits = 64 - __builtin_clzl(perServerSize);
+
+  /// 2^10 is min block size
+  /// 2^7 will be max number of blocks in one pserver
+  int blockSizeBits = std::max((sizeBits - 7), 10);
+  return 1 << blockSizeBits;
+}
+
+void ParameterClient2::initThreads() {
+  threadNum_ = serviceNum_;
+  if (FLAGS_parallel_thread_num > 1) {
+    LOG(INFO) << "parallel_thread_num dosent need to set";
+  }
+  syncThreadPool_.reset(new SyncThreadPool(threadNum_));
+  startThreads();
+}
+
+bool ParameterClient2::init(const std::vector<ParameterPtr>& parameters) {
+  destroy();
+
+  std::vector<std::string> hosts;
+  str::split(FLAGS_pservers, ',', &hosts);
+  serviceNum_ = hosts.size() * numPorts_;
+  uint64_t denseBlockSize = calcParameterBlockSize(parameters, serviceNum_);
+
+  /// setup prefetch matrix if exists
+  for (auto& para : parameters) {
+    /// set block size for each parameter
+    para->getConfig().set_parameter_block_size(
+        para->getConfig().sparse_remote_update() ? para->getConfig().dims(1)
+                                                 : denseBlockSize);
+  }
+
+  for (auto& para : parameters) {
+    CHECK_NE(-1UL, para->getID()) << "id in parameter is not initialized";
+    parameterMap_[para->getID()] = para;
+  }
+
+  allSegments_.reserve(parameters.size());
+
+  for (auto& para : parameters) {
+    ParameterSegments segments;
+    segments.name = para->getName();
+    segments.id = para->getID();
+    allSegments_.push_back(segments);
+    if (para->getConfig().sparse_remote_update()) {
+      CHECK_EQ(para->getConfig().parameter_block_size(),
+               para->getConfig().dims(1))
+          << "For sparse remote update parameter,"
+          << " block size is the width of each row.";
+    }
+  }
+
+  /// init clients
+  clients_.reserve(serviceNum_);
+  recvDataMems_.resize(serviceNum_);
+
+  for (size_t i = 0; i < hosts.size(); ++i) {
+    for (int j = 0; j < numPorts_; ++j) {
+      LOG(INFO) << "pserver " << i * numPorts_ + j << " " << hosts[i] << ":"
+                << port_ + j;
+      if (FLAGS_rdma_tcp == "rdma") {
+        clients_.emplace_back(hosts[i], port_ + j, F_RDMA);
+      } else {
+        clients_.emplace_back(hosts[i], port_ + j, F_TCP);
+      }
+    }
+  }
+
+  sparseDistribution_.reset(new SparseParameterDistribution(serviceNum_));
+
+  sleep(2);
+
+  initThreads();
+
+  return true;
+}
+
+ParameterClient2::~ParameterClient2() { destroy(); }
+
+void ParameterClient2::destroy() {
+  if (clients_.empty()) {
+    /// this means not initialized.
+    return;
+  }
+  finishThreads();
+
+  parameterMap_.clear();
+  allSegments_.clear();
+  clients_.clear();
+}
+
+void ParameterClient2::sendParallel(int tid,
+                                    size_t numThreads,
+                                    ParameterType recvParameterType) {
+  int numMyClients = divup(serviceNum_ - tid, numThreads);
+
+  for (int j = 0; j < numMyClients; ++j) {
+    REGISTER_TIMER("client_sendAndRecv_send");
+    int i = numThreads * j + tid;
+    /// Try to make different clients to send data to different pservers
+    /// at the same time so that they will not flood data to the same
+    /// pserver.
+    i = calcClientId(i, serviceNum_);
+    clients_[i].send("sendParameter",
+                     sendJob_.parallelRequests[i],
+                     sendJob_.parallelInputIovs[i]);
+
+    /// clear large structure
+    sendJob_.parallelRequests[i].Clear();
+    sendJob_.parallelInputIovs[i].clear();
+  }
+
+  std::vector<void*> bufs;
+  SendParameterResponse response;
+  for (int j = 0; j < numMyClients; ++j) {
+    REGISTER_TIMER("client_sendAndRecv_recv");
+    int i = numThreads * j + tid;
+    i = calcClientId(i, serviceNum_);
+    auto msgReader = clients_[i].recv(&response);
+    CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
+    bufs.clear();
+    bufs.reserve(response.blocks_size());
+    for (auto& block : response.blocks()) {
+      auto it = parameterMap_.find(block.para_id());
+      CHECK(it != parameterMap_.end());
+      Parameter* parameter = it->second.get();
+      real* buf = nullptr;
+      if (parameter->getBuf(recvParameterType)) {
+        buf = parameter->getBuf(recvParameterType)->getPoint(block.begin_pos());
+      } else {
+        auto recvMat = dynamic_cast<SparseRowCpuMatrix*>(
+            parameter->getMat(recvParameterType).get());
+        CHECK(recvMat);
+        size_t width = parameter->getConfig().dims(1);
+        // TODO(wuyi): need add lock here? may also cause resize.
+        buf = recvMat->getLocalRow(block.begin_pos() / width);
+      }
+      /// sparse_id is not useful while receiving data since sparse data
+      /// storage is continuous, do commit recieved data as that of dense.
+      bufs.push_back(buf);
+    }
+    msgReader->readBlocks(bufs);
+  }
+}
+
+void ParameterClient2::prepareSendData(
+    ParameterUpdateMode updateMode,
+    ParameterType parameterType,
+    const std::vector<ParameterSegments>& parameterSegments,
+    int64_t numSamples,
+    real cost,
+    bool sendBackParameter,
+    ParameterType sendBackParameterType,
+    BatchStatus batchStatus,
+    SendJob* sendJob) {
+  sendJob->parallelRequests.resize(serviceNum_);
+  sendJob->parallelInputIovs.resize(serviceNum_);
+
+  for (auto& request : sendJob->parallelRequests) {
+#ifndef PADDLE_DISABLE_TIMER
+    if (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT) {
+      request.set_forwardbackward_time(forwardbackwordTime_);
+    }
+#endif
+    request.set_trainer_id(trainerId_);
+    request.set_update_mode(updateMode);
+    request.set_send_back_parameter(sendBackParameter);
+    request.set_send_back_parameter_type(sendBackParameterType);
+    request.set_num_samples(numSamples);
+    request.set_cost(cost);
+    request.set_batch_status(batchStatus);
+    CHECK_EQ(request.blocks_size(), 0);
+    VLOG(10) << "request: trainer_id: " << request.trainer_id()
+             << " update_mode" << request.update_mode()
+             << " send_back_parameter: " << request.send_back_parameter()
+             << " send_back_parameter_type: "
+             << request.send_back_parameter_type()
+             << " num_samples: " << request.num_samples()
+             << " cost: " << request.cost()
+             << " batch_status: " << request.batch_status();
+  }
+  for (const auto& segments : parameterSegments) {
+    const auto it = parameterMap_.find(segments.id);
+    CHECK(it != parameterMap_.end());
+    Parameter* parameter = it->second.get();
+    CHECK(parameter != nullptr) << "parameter is nullptr";
+    int64_t nameHash = std::hash<std::string>()(segments.name);
+    bool sendingPara = !(updateMode == PSERVER_UPDATE_MODE_GET_PARAM ||
+                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE ||
+                         updateMode == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
+    bool sparseUpdate = parameter->getConfig().sparse_remote_update() &&
+                        (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT ||
+                         updateMode == PSERVER_UPDATE_MODE_ASYNC_SGD ||
+                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE);
+
+    const auto blockSize = parameter->getConfig().parameter_block_size();
+    CHECK_GE(blockSize, 1LU) << "blockSize should > 0 " << blockSize;
+    const auto paraSize = parameter->getSize();
+    if (sparseUpdate) {
+      auto prefetchMat = std::dynamic_pointer_cast<SparsePrefetchRowCpuMatrix>(
+          parameter->getMat(PARAMETER_VALUE));
+      CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr";
+      auto sendMat = dynamic_cast<SparseRowCpuMatrix*>(
+          parameter->getMat(parameterType).get());
+      CHECK(sendMat != nullptr) << "sendMat is nullptr";
+
+      syncThreadPool_->exec([&](int tid, size_t numThreads) {
+        std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
+        const auto& localIndices = prefetchMat->getLocalIndices();
+        /// num of sparse rows
+        size_t nLocalBlocks = localIndices.size();
+        uint64_t beginDim = 0;
+        uint64_t endDim = 0;
+
+        // HACK(typhoonzero): let it resize first
+        prefetchMat->getLocalRow(nLocalBlocks);
+        sendMat->getLocalRow(nLocalBlocks);
+
+        for (size_t row = 0; row < nLocalBlocks; ++row) {
+          int64_t blockId = localIndices[row];  // local row -> sparse row
+          int serverId = std::abs((blockId + nameHash) % serviceNum_);
+          if (serverId % numThreads != (size_t)tid) {
+            continue;
+          }
+
+          beginDim = blockId * blockSize;
+          endDim = std::min<int64_t>(beginDim + blockSize, paraSize);
+
+          auto& request = sendJob->parallelRequests[serverId];
+          ParameterBlock* block = request.add_blocks();
+          block->set_para_id(segments.id);
+          /// global sparse row id
+          block->set_block_id(blockId);
+          /// local row offset
+          block->set_begin_pos(row * blockSize);
+          /// block len
+          block->set_block_size(endDim - beginDim);
+          if (sendingPara) {
+            sendJob->parallelInputIovs[serverId].push_back(
+                {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});
+            /// detect sparse parameter distribution
+            sparseDistribution_->probeDistribution(serverId,
+                                                   sizeof(real) * blockSize);
+          }
+        }
+      });
+
+    } else {  /// parameter set for dense and sparse
+      real* buf =
+          sendingPara ? parameter->getBuf(parameterType)->getPoint(0) : nullptr;
+      uint64_t endDim = 0;
+      for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) {
+        endDim = std::min<int64_t>(beginDim + blockSize, paraSize);
+        int64_t blockId = beginDim / blockSize;
+        int serverId = std::abs((blockId + nameHash) % serviceNum_);
+
+        auto& request = sendJob->parallelRequests[serverId];
+        ParameterBlock* block = request.add_blocks();
+        block->set_para_id(segments.id);
+        block->set_block_id(blockId);
+        block->set_begin_pos(beginDim);
+        block->set_block_size(endDim - beginDim);
+        if (buf) {
+          sendJob->parallelInputIovs[serverId].push_back(
+              {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))});
+        }
+      }
+    }
+  }  // parameterSegments
+
+  sparseDistribution_->checkAndResetDistribution();
+}
+
+void ParameterClient2::sendAndReceiveParameter(
+    ParameterUpdateMode updateMode,
+    ParameterType parameterType,
+    const std::vector<ParameterSegments>& parameterSegments,
+    int64_t numSamples,
+    real cost,
+    bool sendBackParameter,
+    ParameterType sendBackParameterType,
+    ParameterType recvParameterType) {
+  prepareSendData(updateMode,
+                  parameterType,
+                  parameterSegments,
+                  numSamples,
+                  cost,
+                  sendBackParameter,
+                  sendBackParameterType,
+                  /*batchStatus = */ BATCH_START_AND_FINISH,
+                  &sendJob_);
+
+  syncThreadPool_->exec([&](int tid, size_t numThreads) {
+    this->sendParallel(tid, numThreads, recvParameterType);
+  });
+}
+
+void ParameterClient2::sendParameter(
+    ParameterUpdateMode updateMode,
+    ParameterType parameterType,
+    const std::vector<ParameterSegments>& parameterSegments,
+    int64_t numSamples,
+    real cost,
+    bool sendBackParameter,
+    BatchStatus batchStatus) {
+  SendJobPtr sendJob = std::make_shared<SendJob>();
+  prepareSendData(updateMode,
+                  parameterType,
+                  parameterSegments,
+                  numSamples,
+                  cost,
+                  sendBackParameter,
+                  PARAMETER_VALUE,
+                  batchStatus,
+                  sendJob.get());
+
+  for (int i = 0; i < threadNum_; i++) {
+    sendJobQueue_[i]->enqueue(sendJob);
+  }
+}
+
+void ParameterClient2::recvParameter() { recvSyncBarrier_->wait(); }
+
+void ParameterClient2::send(int threadId) {
+  int index = threadId;
+  LOG(INFO) << "send thread " << threadId << " started";
+  int numMyClients = divup(serviceNum_ - index, threadNum_);
+  while (true) {
+    SendJobPtr recvJob = sendJobQueue_[index]->dequeue();
+    if (stopping_) {
+      recvJobQueue_[index]->enqueue(recvJob);
+      break;
+    }
+    for (int j = 0; j < numMyClients; ++j) {
+      REGISTER_TIMER("client_send");
+      int i = threadNum_ * j + index;
+      /// Try to make different clients to send data to different pservers
+      /// at the same time so that they will not flood data to the same
+      /// pserver.
+      i = calcClientId(i, serviceNum_);
+      if (recvJob->parallelRequests.size()) {
+        clients_[i].send("sendParameter",
+                         recvJob->parallelRequests[i],
+                         recvJob->parallelInputIovs[i]);
+      } else {
+        clients_[i].send("sendData",
+                         recvJob->parallelDataRequests[i],
+                         recvJob->parallelInputIovs[i]);
+      }
+    }
+    recvJobQueue_[index]->enqueue(recvJob);
+  }
+}
+
+void ParameterClient2::recv(int threadId) {
+  LOG(INFO) << "recv thread " << threadId << " started";
+  int index = threadId;
+  int numMyClients = divup(serviceNum_ - index, threadNum_);
+  while (true) {
+    std::vector<void*> bufs;
+    SendParameterResponse response;
+    SendDataResponse dataResponse;
+    SendJobPtr recvJob = recvJobQueue_[index]->dequeue();
+    if (stopping_) break;
+    for (int j = 0; j < numMyClients; ++j) {
+      REGISTER_TIMER("client_recv");
+      int i = threadNum_ * j + index;
+      i = calcClientId(i, serviceNum_);
+      if (recvJob->parallelRequests.size()) {
+        auto msgReader = clients_[i].recv(&response);
+        CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
+        bufs.clear();
+        bufs.reserve(response.blocks_size());
+        for (auto& block : response.blocks()) {
+          auto it = parameterMap_.find(block.para_id());
+          CHECK(it != parameterMap_.end());
+          Parameter* parameter = it->second.get();
+          real* buf =
+              parameter->getBuf(PARAMETER_VALUE)->getPoint(block.begin_pos());
+          CHECK_EQ(msgReader->getBlockLength(bufs.size()),
+                   sizeof(real) * (block.block_size()));
+          bufs.push_back(buf);
+        }
+        msgReader->readBlocks(bufs);
+      } else {
+        auto msgReader = clients_[i].recv(&dataResponse);
+        CHECK_EQ(msgReader->getNumBlocks(), (size_t)dataResponse.blocks_size());
+        size_t totalLen = msgReader->getTotalLength();
+        if (0 == totalLen) {
+          continue;
+        }
+        auto& recvMem = recvDataMems_[dataResponse.server_id()];
+        CHECK_EQ(dataResponse.blocks_size(), 1)
+            << "Only one block currently support now!";
+        auto& block = dataResponse.blocks(0);
+        CHECK_EQ(totalLen % sizeof(block.data_size()), 0U);
+        recvMem = std::make_shared<CpuMemoryHandle>(totalLen);
+        msgReader->readNextBlock(recvMem.get()->getBuf());
+      }
+    }
+    recvSyncBarrier_->wait();
+  }
+}
+
+void ParameterClient2::waitPassStart() {
+  WaitPassStartRequest request;
+  std::vector<WaitPassStartResponse> responses;
+  multiCall(__func__, request, &responses);
+}
+
+void ParameterClient2::waitPassFinish() {
+  WaitPassFinishRequest request;
+  std::vector<WaitPassFinishResponse> responses;
+  multiCall(__func__, request, &responses);
+}
+
+void ParameterClient2::synchronize(SyncObject syncObjectId) {
+  SynchronizeRequest request;
+  request.set_sync_object_id(syncObjectId);
+  std::vector<SynchronizeResponse> responses;
+  multiCall(__func__, request, &responses);
+}
+
+void ParameterClient2::asyncFinishPass(SyncObject syncObjectId) {
+  SynchronizeRequest request;
+  request.set_sync_object_id(syncObjectId);
+  request.set_trainer_id(trainerId_);
+  std::vector<SynchronizeResponse> responses;
+  multiCall(__func__, request, &responses);
+}
+
+void ParameterClient2::setConfig(const OptimizationConfig& optConfig,
+                                 const std::string& saveDir,
+                                 bool isSparseServer) {
+  SetConfigRequest request;
+  std::vector<SetConfigResponse> responses;
+
+  for (auto& nameAndPara : parameterMap_) {
+    *request.add_param_configs() = nameAndPara.second->getConfig();
+  }
+
+  *request.mutable_opt_config() = optConfig;
+  request.set_save_dir(saveDir);
+  request.set_is_sparse_server(isSparseServer);
+
+  std::vector<SetConfigRequest> requests;
+  requests.resize(clients_.size());
+  for (size_t i = 0; i < requests.size(); ++i) {
+    requests[i].CopyFrom(request);
+    requests[i].set_server_id(i);
+  }
+
+  responses.resize(clients_.size());
+  size_t numClients = clients_.size();
+  for (size_t i = 0; i < numClients; ++i) {
+    clients_[i].send(__func__, requests[i]);
+  }
+  for (size_t i = 0; i < numClients; ++i) {
+    clients_[i].recv(&responses[i]);
+  }
+}
+
+bool ParameterClient2::inStatus(PServerStatus status) {
+  GetStatusRequest request;
+  std::vector<GetStatusResponse> responses;
+
+  bool ok = true;
+  multiCall("getStatus", request, &responses);
+  for (auto& response : responses) {
+    if (response.status() != status) {
+      ok = false;
+    }
+  }
+
+  return ok;
+}
+
+void ParameterClient2::setStatus(PServerStatus status) {
+  SetStatusRequest request;
+  request.set_status(status);
+  std::vector<SetStatusResponse> responses;
+  multiCall(__func__, request, &responses);
+}
+
+void ParameterClient2::waitForStatus(PServerStatus status) {
+  while (!inStatus(status)) {
+    sleep(1);
+  }
+}
+
+template <typename Proto>
+static void validateResponses(const std::vector<Proto>& responses) {
+  for (auto& response : responses) {
+    CHECK(response.return_message().empty())
+        << "client" << &response - &responses[0]
+        << " error:" << response.return_message();
+  }
+}
+
+PServerVector ParameterClient2::createVector() {
+  CreateVectorRequest request;
+  std::vector<CreateVectorResponse> responses;
+  int64_t handle = -1;
+
+  multiCall(__func__, request, &responses);
+  validateResponses(responses);
+
+  for (auto& response : responses) {
+    if (handle == -1) {
+      handle = response.handle();
+    } else {
+      CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client"
+                                          << &response - &responses[0] << " "
+                                          << handle << " " << response.handle();
+    }
+  }
+  return PServerVector{handle};
+}
+
+void ParameterClient2::releaseVector(PServerVector handle) {
+  ReleaseVectorRequest request;
+  std::vector<ReleaseVectorResponse> responses;
+
+  request.set_handle(handle.handle);
+  multiCall(__func__, request, &responses);
+  validateResponses(responses);
+}
+
+PServerMatrix ParameterClient2::createMatrix(int32_t numCols) {
+  CreateMatrixRequest request;
+  std::vector<CreateMatrixResponse> responses;
+  int64_t handle = -1;
+
+  request.set_num_cols(numCols);
+  multiCall(__func__, request, &responses);
+  validateResponses(responses);
+
+  for (auto& response : responses) {
+    if (handle == -1) {
+      handle = response.handle();
+    } else {
+      CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client"
+                                          << &response - &responses[0] << " "
+                                          << handle << " " << response.handle();
+    }
+  }
+  return PServerMatrix{handle};
+}
+
+void ParameterClient2::releaseMatrix(PServerMatrix handle) {
+  ReleaseMatrixRequest request;
+  std::vector<ReleaseMatrixResponse> responses;
+
+  request.set_handle(handle.handle);
+  multiCall(__func__, request, &responses);
+  validateResponses(responses);
+}
+
+void PreparedOperations::addOperationHelper(Operation* op, CpuVectorPtr vec) {
+  ProtoVector& pvec = *op->add_vectors();
+  size_t dim = vec->getSize();
+  pvec.set_dim(dim);
+  copyToRepeatedField(pvec.mutable_values(), vec->getData(), vec->getSize());
+}
+
+void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) {
+  ProtoMatrix& pmat = *op->add_matrices();
+  pmat.set_num_cols(mat->getWidth());
+  pmat.set_num_rows(mat->getHeight());
+  copyToRepeatedField(
+      pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows());
+}
+
+static inline real addTwo(real a, double b) { return a + b; }
+
+void ParameterClient2::doOperation(PreparedOperations& ops,
+                                   bool waitForGradient,
+                                   bool sendBackGradient,
+                                   bool releasePass) {
+  std::vector<DoOperationResponse> responses;
+  ops.request_.set_wait_for_gradient(waitForGradient);
+  ops.request_.set_send_back_parameter(sendBackGradient);
+  ops.request_.set_release_pass(releasePass);
+  multiCall(__func__, ops.request_, &responses);
+  validateResponses(responses);
+  size_t numPassFinishServers = 0;
+
+  size_t numOps = ops.request_.operations_size();
+  for (auto& response : responses) {
+    numPassFinishServers += response.pass_finish();
+    CHECK_EQ(numOps, (size_t)response.results_size());
+    for (size_t opId = 0; opId < numOps; ++opId) {
+      const OperationResult& result = response.results(opId);
+      std::vector<real*>& resultScalars = ops.localResults_[opId].resultScalars;
+      std::vector<CpuVectorPtr>& resultVectors =
+          ops.localResults_[opId].resultVectors;
+      std::vector<CpuMatrixPtr>& resultMatrices =
+          ops.localResults_[opId].resultMatrices;
+
+      if (&response == &responses[0]) {
+        /// Initialize results to zero
+
+        resultScalars.resize(result.scalars_size());
+        for (auto p : resultScalars) {
+          if (!p) continue;
+          *p = 0;
+        }
+        size_t numVectors = result.vectors_size();
+        resultVectors.resize(numVectors);
+        for (size_t i = 0; i < numVectors; ++i) {
+          if (!resultVectors[i]) continue;
+          resultVectors[i]->resize(result.vectors(i).dim());
+          resultVectors[i]->zeroMem();
+        }
+        size_t numMatrices = result.matrices_size();
+        resultMatrices.resize(numMatrices);
+        for (size_t i = 0; i < numMatrices; ++i) {
+          if (!resultMatrices[i]) continue;
+          resultMatrices[i]->resize(result.matrices(i).num_rows(),
+                                    result.matrices(i).num_cols());
+          resultMatrices[i]->zeroMem();
+        }
+      }
+
+      // aggregate results from each pserver to results
+
+      CHECK_EQ(resultScalars.size(), (size_t)result.scalars_size());
+      for (ssize_t i = 0; i < result.scalars_size(); ++i) {
+        real* rscalar = resultScalars[i];
+        if (!rscalar) continue;
+        *rscalar += result.scalars(i);
+      }
+
+      CHECK_EQ(resultVectors.size(), (size_t)result.vectors_size());
+      for (auto& vec : result.vectors()) {
+        int i = &vec - &result.vectors(0);
+        CpuVectorPtr rvec = resultVectors[i];
+        if (!rvec) continue;
+        CHECK_EQ(rvec->getSize(), (size_t)vec.dim());
+        std::transform(rvec->getData(),
+                       rvec->getData() + rvec->getSize(),
+                       vec.values().data(),
+                       rvec->getData(),
+                       addTwo);
+      }
+
+      CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size());
+      for (auto& mat : result.matrices()) {
+        int i = &mat - &result.matrices(0);
+        CpuMatrixPtr rmat = resultMatrices[i];
+        if (!rmat) continue;
+        CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows());
+        CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols());
+
+        std::transform(rmat->getData(),
+                       rmat->getData() + rmat->getElementCnt(),
+                       mat.values().data(),
+                       rmat->getData(),
+                       addTwo);
+      }
+    }
+  }
+  passFinish_ = numPassFinishServers == clients_.size();
+}
+
+real ParameterClient2::vectorDotProduct(PServerVector u, PServerVector v) {
+  real result = 0.0;
+  PreparedOperations ops;
+  ops.addOperation(PSERVER_OP_utv, u, v)(&result);
+  doOperation(ops, false, false);
+  return result;
+}
+
+void ParameterClient2::vectorScale(PServerVector u, real a) {
+  PreparedOperations ops;
+  ops.addOperation(PSERVER_OP_au, u, a);
+  doOperation(ops, false, false);
+}
+
+void ParameterClient2::vectorCopy(PServerVector src, PServerVector dst) {
+  PreparedOperations ops;
+  ops.addOperation(PSERVER_OP_COPY, src, dst);
+  doOperation(ops, false, false);
+}
+
+void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) {
+  PreparedOperations ops;
+  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)1);
+  doOperation(ops, false, false);
+}
+
+void ParameterClient2::vectorAddMultInto(PServerVector u,
+                                         PServerVector v,
+                                         PServerVector w,
+                                         real a) {
+  PreparedOperations ops;
+  ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0);
+  doOperation(ops, false, false);
+}
+
+void ParameterClient2::vectorScaleInto(PServerVector u,
+                                       PServerVector v,
+                                       real a) {
+  PreparedOperations ops;
+  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0);
+  doOperation(ops, false, false);
+}
+
+void ParameterClient2::loadValueVector(const std::string& dirName) {
+  LoadValueRequest request;
+  request.set_dir_name(dirName);
+  std::vector<LoadValueResponse> responses;
+
+  multiCall(__func__, request, &responses);
+  validateResponses(responses);
+}
+
+void ParameterClient2::saveValueVector(const std::string& dirName) {
+  SaveValueRequest request;
+  request.set_dir_name(dirName);
+  std::vector<SaveValueResponse> responses;
+
+  multiCall(__func__, request, &responses);
+  validateResponses(responses);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterClient2.h b/paddle/legacy/pserver/ParameterClient2.h
new file mode 100644
index 0000000000000000000000000000000000000000..9320e19c4df6c5439266f89e5599b9496f145172
--- /dev/null
+++ b/paddle/legacy/pserver/ParameterClient2.h
@@ -0,0 +1,602 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <atomic>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/pserver/BaseClient.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/Queue.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include "ParameterService.pb.h"
+
+#include "ProtoServer.h"
+#include "SparseParameterDistribution.h"
+
+DECLARE_int32(parallel_thread_num);
+
+namespace paddle {
+
+struct PServerMatrix {
+  int64_t handle;
+};
+
+struct PServerVector {
+  int64_t handle;
+};
+
+/**
+ * @brief A class to help to prepare server-side operations.
+ */
+class PreparedOperations {
+ protected:
+  class ResultsAdder;
+  struct LocalOperationResult;
+
+ public:
+  /**
+   * Offers an easy way to prepare operations that will be performed on
+   * server-side.
+   *
+   * Usage:
+   * @code
+   *   addOperation(optype, arguments...)(results...)
+   * @endcode
+   *
+   * Examples:
+   * 1. set pserver vector to 1:
+   * @code
+   *   PServerVector u = parameterClient.createVector();
+   *   addOperation(PSERVER_OP_RESET, u, (real)1);
+   * @endcode
+   *
+   * 2. Compute inner product of to pserver vectors.
+   * @code
+   *   PServerVector u = parameterClient.createVector();
+   *   PServerVector v = parameterClient.createVector();
+   *   real result;
+   *   addOperation(PSERVER_OP_utv, u, v)(&result)
+   * @endcode
+   *
+   * @param[in] operation The operation that pserver will perform.
+   * @param[in] args Argument list of the operation
+   * @return A ResultsAdder object initialized with the last element of
+   *         localResults_.
+   */
+  template <typename... Args>
+  ResultsAdder addOperation(MatrixVectorOperation operation, Args... args) {
+    Operation* op = request_.add_operations();
+    op->set_operation(operation);
+    localResults_.emplace_back();
+    addOperationHelper(op, args...);
+    return ResultsAdder(&localResults_.back());
+  }
+
+ protected:
+  void addOperationHelper(Operation* op) {}
+
+  /**
+   * @brief Helper function to add an new operation that takes a PServerVector
+   *        as an operand.
+   */
+  void addOperationHelper(Operation* op, PServerVector arg) {
+    op->add_pvectors(arg.handle);
+  }
+
+  /**
+   * @brief Helper function to add an new operation that takes a PServerMatrix
+   *        as an operand.
+   */
+  void addOperationHelper(Operation* op, PServerMatrix arg) {
+    op->add_pmatrices(arg.handle);
+  }
+
+  /**
+   * @brief Helper function to add an new operation that takes a real valued
+   *        scalar as an operand.
+   */
+  void addOperationHelper(Operation* op, real arg) { op->add_scalars(arg); }
+
+  /**
+   * @brief Helper function to add an new operation that takes a CpuVectorPtr
+   *        as an operand.
+   * @note The array of CpuVectors that arg points to will be copied to
+   *       op's vectors field.
+   */
+  void addOperationHelper(Operation* op, CpuVectorPtr arg);
+
+  /**
+   * @brief Helper function to add an new operation that takes a CpuMatrixPtr
+   *        as an operand.
+   * @note The array of CpuMatrixs that arg points to will be copied to
+   *       op's matrices field.
+   */
+  void addOperationHelper(Operation* op, CpuMatrixPtr arg);
+
+  /**
+   * @brief Helper function to add an new operation and prepare the operands.
+   *
+   * @tparam Arg An operand of the operation.
+   * @tparam Args A list of rest operands of the operation.
+   * @param op Pointer to an Operation object.
+   */
+  template <typename Arg, typename... Args>
+  void addOperationHelper(Operation* op, Arg arg, Args... args) {
+    addOperationHelper(op, arg);
+    addOperationHelper(op, args...);
+  }
+
+  /**
+   * @brief ResultsAdder offers easy ways to quickly store operation results.
+   */
+  class ResultsAdder {
+   public:
+    explicit ResultsAdder(LocalOperationResult* localResult)
+        : localResult_(localResult) {}
+    template <typename... Args>
+    void operator()(Args... args) {
+      addResult(args...);
+    }
+    void addResult() {}
+    void addResult(real* arg) { localResult_->resultScalars.push_back(arg); }
+    void AddResult(CpuVectorPtr arg) {
+      localResult_->resultVectors.push_back(arg);
+    }
+    void AddResult(CpuMatrixPtr arg) {
+      localResult_->resultMatrices.push_back(arg);
+    }
+    template <typename Arg, typename... Args>
+    void addResult(Arg arg, Args... args) {
+      addResult(arg);
+      addResult(args...);
+    }
+
+   protected:
+    LocalOperationResult* localResult_;
+  };
+
+ protected:
+  DoOperationRequest request_;
+  std::vector<iovec> inputIovs_;
+  struct LocalOperationResult {
+    std::vector<real*> resultScalars;
+    std::vector<CpuVectorPtr> resultVectors;
+    std::vector<CpuMatrixPtr> resultMatrices;
+  };
+  std::vector<LocalOperationResult> localResults_;
+  friend class ParameterClient2;
+};
+
+struct ParameterSegments {
+  std::string name;  // name of the parameter
+  size_t id;         // id of the parameter
+};
+
+/**
+ * The client interface for parameter server. ParameterClient2 supports 2 modes
+ * for managing connections to parameter servers, in the 1st mode one connection
+ * is shared by 2 threads that are separately responsible for sending and
+ * recieving activities, in the 2nd mode one connection is owned by only one
+ * thread, and all the sending and recieving activities run in that single
+ * thread.
+ * TODO(yanfei):
+ * Additional core idea to further optimizate pserver performance is
+ * to do sync-sgd based parameter level instead of pserver level.
+ * full-parallelization based parameter level for sync-sgd also can
+ * sense forwardbackward computation layer-by-layer for more deeper layer
+ * model.
+ * Firstly, pserver can do full-parallelization on all computation based
+ * parameter level instead of waiting for all gradients are finished and
+ * start to send back parameters value immediately if parameter is ready
+ * instead of waiting for all parameters value are ready
+ * Secondly, parameter client can write back parameters to GPU instead of
+ * waiting until all parameters are received to CPU host end.
+ */
+class ParameterClient2 : public BaseClient {
+ public:
+  /** Constructor.
+   * @param separate True if sending and recieving activities are separated
+   *                 into 2 threads, otherwise false.
+   * @param port Port number that parameter client runs on.
+   * @param numPorts Number of ports parameter clients occupies,
+   *                 numPorts * pserver number is the total number of
+   *                 connections the parameter client maintains.
+   */
+  ParameterClient2(bool separate = false,
+                   int port = FLAGS_port,
+                   int numPorts = FLAGS_ports_num);
+
+  ~ParameterClient2();
+
+  static int calcParameterBlockSize(const std::vector<ParameterPtr>& parameters,
+                                    size_t serviceNum);
+
+ public:
+  bool init(const std::vector<ParameterPtr>& parameters);
+
+  /// service functions
+
+  /**
+   * @brief Sends the segments in parameter to parameter servers, then receives
+   *        the response from the servers.
+   * @param[in] updateMode Indicates how parameters should be updated on the
+   *            server side.
+   * @param[in] parameterType Type of parameter that will be sent.
+   * @param[in] segments Segments in the parameter that will be sent.
+   * @param[in] numSamples Number of samples this update is based on.
+   * @param[in] cost Cost of the batch, will be used to calculate global object
+   *            value.
+   * @param[in] sendBackParameter True if the updated parameters should be sent
+   *            back, otherwise false.
+   * @param[in] sendBackParameterType Send back parameter type on pserver,
+   *            PARAMETER_VALUE by default
+   * @param[in] recvParameterType pserver[sendBackParameterType] will be copy to
+   *            client[recvParameterType]
+   * @note Only parameterType will be sent.
+   */
+  void sendAndReceiveParameter(ParameterUpdateMode updateMode,
+                               ParameterType parameterType,
+                               const std::vector<ParameterSegments>& segments,
+                               int64_t numSamples,
+                               real cost,
+                               bool sendBackParameter,
+                               ParameterType sendBackParameterType,
+                               ParameterType recvParameterType);
+
+  /**
+   * @brief Sends all parameters to parameter servers, and receives the response
+   *        from the servers.
+   */
+  void sendAndReceiveParameter(
+      ParameterUpdateMode updateMode,
+      ParameterType parameterType,
+      int64_t numSamples,
+      real cost,
+      bool sendBackParameter,
+      ParameterType sendBackParameterType = PARAMETER_VALUE,
+      ParameterType recvParameterType = PARAMETER_VALUE) {
+    sendAndReceiveParameter(updateMode,
+                            parameterType,
+                            allSegments_,
+                            numSamples,
+                            cost,
+                            sendBackParameter,
+                            sendBackParameterType,
+                            recvParameterType);
+  }
+
+  /**
+   * @brief Sends the segments in parameter to parameter servers. Each
+   *        sendParameter() must be paired with a recvParameter() in the future.
+   *        Only parameterType will be sent.
+   *
+   * @param[in] updateMode Indicates how parameters should be updated on the
+   *            server side.
+   * @param[in] parameterType Type of parameter that will be sent.
+   * @param[in] segments Segments in the parameter that will be sent.
+   * @param[in] numSamples Number of samples this update is based on.
+   * @param[in] cost Cost of the batch, will be used to calculate global object
+   *            value.
+   * @param[in] sendBackParameter True if the updated parameters should be sent
+   *            back, otherwise false.
+   * @param[in] batchStatus Status of the batch.
+   * @note This function is non-blocking. This means that parameter should
+   *       not change between this call and recvParameter()
+   */
+  void sendParameter(ParameterUpdateMode updateMode,
+                     ParameterType parameterType,
+                     const std::vector<ParameterSegments>& segments,
+                     int64_t numSamples,
+                     real cost,
+                     bool sendBackParameter,
+                     BatchStatus batchStatus);
+
+  void recvParameter();
+
+  /**
+   * Sends all parameters to parameter servers, recvParameter() have to be
+   * invoked
+   * afterwards.
+   *
+   * @note This function is non-blocking. This means that if parameter should
+   *       not changes between this call and recvParameter()
+   */
+  void sendParameter(ParameterUpdateMode updateMode,
+                     ParameterType parameterType,
+                     int64_t numSamples,
+                     real cost,
+                     bool sendBackParameter,
+                     BatchStatus batchStatus) {
+    sendParameter(updateMode,
+                  parameterType,
+                  allSegments_,
+                  numSamples,
+                  cost,
+                  sendBackParameter,
+                  batchStatus);
+  }
+
+  /// Get all parameters from parameter servers
+  void getParameter(ParameterType recvParameterType = PARAMETER_VALUE,
+                    ParameterType sendBackParameterType = PARAMETER_VALUE) {
+    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
+                            PARAMETER_VALUE,
+                            0,     // numSamples = 0
+                            0,     // cost = 0
+                            true,  // sendBackParameter = true
+                            sendBackParameterType,
+                            recvParameterType);
+  }
+
+  /// Get parameters by sparse row ids from parameter servers
+  void getParameterSparse(
+      ParameterType recvParameterType = PARAMETER_VALUE,
+      ParameterType sendBackParameterType = PARAMETER_VALUE) {
+    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM_SPARSE,
+                            PARAMETER_VALUE,
+                            0,     // numSamples = 0
+                            0,     // cost = 0
+                            true,  // sendBackParameter = true
+                            sendBackParameterType,
+                            recvParameterType);
+  }
+
+  /// Set all parameters on parameter servers using the local parameters
+  void setParameter() {
+    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
+                            PARAMETER_VALUE,
+                            0,       // numSamples = 0
+                            0,       // cost = 0
+                            false);  // sendBackParameter = false
+  }
+  /**
+   * Set all parameters on parameter servers, values will be zero
+   * means do not sending local parameters
+   */
+  void setParameterZero() {
+    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO,
+                            PARAMETER_VALUE,
+                            0,       // numSamples = 0
+                            0,       // cost = 0
+                            false);  // sendBackParameter = false
+  }
+
+  /**
+   * @brief Wait until all gradient servers start one pass.
+   *
+   * @note This is now only used by the gradient servers for "sgd"
+   *       algorithm. Calling this function means that the calling gradient
+   *       server is ready to start a new pass.
+   */
+  void waitPassStart();
+
+  /**
+   * @brief Wait until all gradient servers finish one pass.
+   *
+   * @note This is now only used by the gradient servers for "sgd" algorithm.
+   *       Calling this function means that the calling gradient server
+   *       finishes one pass.
+   */
+  void waitPassFinish();
+
+  /// Wait until all gradient servers call this function.
+  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
+
+  /// Called when async-sgd finish pass.
+  void asyncFinishPass(SyncObject syncObjectId = SYNC_DEFAULT);
+
+  void asyncStartPass(SyncObject syncObjectId = SYNC_DEFAULT) {
+    return synchronize(syncObjectId);
+  }
+
+  /**
+   * @brief Execute the prepared operations on pservers, fetch the results and
+   *        aggregate results from different pservers.
+   * @param[in] ops Prepared operations that will be executed on pservers.
+   * @param[in] waitForGradient If true, wait for gradient to be ready before
+   *            starting the operations.
+   * @param[in] sendBackParameter If true, send back the parameter to clients
+   *            after the operations are finished.
+   * @param[in] If true, and if all clients call waitPassFinish, signal all
+   *            clients finish the pass.
+   */
+  void doOperation(PreparedOperations& ops,
+                   bool waitForGradient,
+                   bool sendBackParameter,
+                   bool releasePass = true);
+
+  /**
+   * Set the configuration of pserver, including parameter config and
+   * optimization config
+   */
+  void setConfig(const OptimizationConfig& optConfig,
+                 const std::string& saveDir = "",
+                 bool isSparseServer = false);
+
+  /// Return true if all pservers are in the given status
+  bool inStatus(PServerStatus status);
+  bool isPassFinish() { return passFinish_; }
+
+  /// Set pserver status
+  void setStatus(PServerStatus status);
+
+  /**
+   * @brief Wait until all pservers are at status
+   * @note This function is not suitable for frequent use,
+   *       because it sleeps 1 second each time when condition is satisfied.
+   */
+  void waitForStatus(PServerStatus status);
+
+  /// Create a column vector. The size is the dimension of parameter.
+  PServerVector createVector();
+
+  /// Release the PServerVector given handle.
+  void releaseVector(PServerVector handle);
+
+  /**
+   * Create a column major matrix. The number of rows is the dimension of
+   * parameter. The number of columns is specifed by numCols.
+   */
+  PServerMatrix createMatrix(int32_t numCols);
+
+  /// Release the PServerMatrix given handle.
+  void releaseMatrix(PServerMatrix handle);
+
+  // Some basic algebra functions
+  /// Calculate the dot product of u and v
+  real vectorDotProduct(PServerVector u, PServerVector v);
+
+  /// Scale u by a
+  void vectorScale(PServerVector u, real a);
+
+  /// Copy from src to dest
+  void vectorCopy(PServerVector src, PServerVector dst);
+
+  /// u += v * a
+  void vectorAddMult(PServerVector u, PServerVector v, real a);
+
+  /// u = v + w * a
+  void vectorAddMultInto(PServerVector u,
+                         PServerVector v,
+                         PServerVector w,
+                         real a);
+  /// u = v * a
+  void vectorScaleInto(PServerVector u, PServerVector v, real a);
+
+  /// Return pserver parameter value.
+  PServerVector getPServerParameterValue() {
+    PServerVector vec;
+    vec.handle = PARAMETER_VALUE;
+    return vec;
+  }
+
+  /// Return pserver parameter gradient.
+  PServerVector getPServerParameterGradient() {
+    PServerVector vec;
+    vec.handle = PARAMETER_GRADIENT;
+    return vec;
+  }
+
+  /**
+   * Tell pservers to load value vector from file.
+   *
+   * @param[in] dirName The directory that contains the value vector file.
+   */
+  void loadValueVector(const std::string& dirName);
+
+  /// Tell pservers to save value vector to file.
+  void saveValueVector(const std::string& dirName);
+
+  void setTrainerId(int trainerId) { trainerId_ = trainerId; }
+
+#ifndef PADDLE_DISABLE_TIMER
+  void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; }
+#endif
+
+ protected:
+  template <typename ProtoIn, typename ProtoOut>
+  void multiCall(const char* funcName,
+                 const ProtoIn& request,
+                 std::vector<ProtoOut>* responses) {
+    responses->resize(clients_.size());
+    size_t numClients = clients_.size();
+    for (size_t i = 0; i < numClients; ++i) {
+      clients_[i].send(funcName, request);
+    }
+    for (size_t i = 0; i < numClients; ++i) {
+      clients_[i].recv(&(*responses)[i]);
+    }
+  }
+
+ private:
+  void destroy();
+
+  /**
+   * @brief management function for parallelizing send/recv all connections
+   *        to all pservers. it is called under one SyncThreadPool. it
+   *        supports to use N thread to control M connections. the receiving
+   *        actions can be started until all sending action to all connections
+   *        owned by current thread are finished. Different connections
+   * controlled
+   *        by different threads can transfer data asynchronously.
+   */
+  void sendParallel(int tid,
+                    size_t numThreads,
+                    ParameterType recvParameterType);
+  /// sending thread routine for asynchronously send data
+  void send(int threadId);
+  /// receiving thread routing for asynchronously receive data
+  void recv(int threadId);
+
+  /**
+   * @brief main routine to build data for pserver
+   *
+   * @note  it can prepare different kinds of parameter type data. it can
+   *        be regarded as layer for bridging real parameters data and
+   *        protobuf data for communication.
+   *        TODO(yanfei):
+   *        can abstract additional layer to encode and decode data to/from
+   *        protobuf data.
+   */
+  void prepareSendData(
+      ParameterUpdateMode updateMode,
+      ParameterType parameterType,  // client send type
+      const std::vector<ParameterSegments>& parameterSegments,
+      int64_t numSamples,
+      real cost,
+      bool sendBackParameter,
+      ParameterType sendBackParameterType,  // send back type in pserver
+      BatchStatus batchStatus,
+      SendJob* sendJob);
+
+  /// start necessary threads for threadPool
+  void initThreads();
+
+ protected:
+  /// start port number of pserver
+  /// it deduce all ports for dense and sparse with some rules
+  int port_;
+  /// identify the trainer id using this client
+  int trainerId_;
+
+#ifndef PADDLE_DISABLE_TIMER
+  uint64_t forwardbackwordTime_;
+#endif
+  std::mutex sparseAutoGrowthMutex_;
+
+  /// map id to parameter used for decoding protobuf data
+  std::unordered_map<size_t, ParameterPtr> parameterMap_;
+  /// segments for all parameters that needed to sync
+  std::vector<ParameterSegments> allSegments_;
+
+  /// module for sensing sparse parameters distribution on all pservers
+  std::unique_ptr<SparseParameterDistribution> sparseDistribution_;
+
+  /// thread pool for parallelizing all connections to pservers
+  std::unique_ptr<SyncThreadPool> syncThreadPool_;
+
+  bool passFinish_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServer2.cpp b/paddle/legacy/pserver/ParameterServer2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8533a322d92d292ee613d44795cf60462082a11b
--- /dev/null
+++ b/paddle/legacy/pserver/ParameterServer2.cpp
@@ -0,0 +1,1401 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterServer2.h"
+
+#include <algorithm>
+#include <fstream>
+
+#include "paddle/legacy/math/SIMDFunctions.h"
+#include "paddle/legacy/parameter/AverageOptimizer.h"
+#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
+#include "paddle/legacy/parameter/OptimizerFunctions.h"
+#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
+#include "paddle/legacy/parameter/ParameterOptimizer.h"
+#include "paddle/legacy/parameter/ParameterUpdateFunctions.h"
+#include "paddle/legacy/parameter/Regularizer.h"
+#include "paddle/legacy/parameter/ThreadLocalBuffer.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/StringUtil.h"
+
+DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
+DEFINE_double(async_lagged_ratio_min,
+              1.0,
+              "control config_.async_lagged_grad_discard_ratio() min value");
+DEFINE_double(
+    async_lagged_ratio_default,
+    1.5,
+    "if async_lagged_grad_discard_ratio is not set in trainer_config.conf"
+    "use it as defalut value");
+
+namespace paddle {
+
+const std::string ParameterServer2::kRetMsgInvalidMatrixHandle =
+    "Invalid matrix handle";
+const std::string ParameterServer2::kRetMsgInvalidVectorHandle =
+    "Invalid vector handle";
+const std::string ParameterServer2::kRetMsgUnknownOperation =
+    "Unknown operation";
+
+ParameterServer2::ParameterServer2(const std::string& addr,
+                                   int port,
+                                   int rdmaCpu)
+    : ProtoServer(addr, port, rdmaCpu),
+      dataSize_(0),
+      size_(0),
+      gradientReadyBarrier_(FLAGS_num_gradient_servers + 1),
+      parameterReadyBarrier_(FLAGS_num_gradient_servers + 1),
+      passBarrier_(FLAGS_num_gradient_servers + 1),
+      numPassFinishClients_(0),
+      allClientPassFinish_(false),
+      serverId_(-1),
+      batchId_(-1) {
+  /**
+   * register function for remote client calling, these functions
+   * will be mapped to a data structure for quick looking up. each
+   * request from trainer can contains one function name to indicate
+   * remote action. this architecture looks like rpc style for pserver.
+   */
+  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter);
+  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, setStatus);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, getStatus);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, doOperation);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, createVector);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseVector);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, createMatrix);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseMatrix);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassStart);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassFinish);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, synchronize);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, asyncFinishPass);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, loadValueVector);
+  REGISTER_SERVICE_FUNCTION(ParameterServer2, saveValueVector);
+
+  /// thread pool for parallelizing some computations
+  if (FLAGS_pserver_num_threads > 1) {
+    syncThreadPool_.reset(new SyncThreadPool(FLAGS_pserver_num_threads, false));
+  }
+}
+
+bool ParameterServer2::init() {
+  vectors_.resize(NUM_PARAMETER_TYPES);
+  configMap_.clear();
+
+  numSamplesProcessed_ = 0;
+  cost_ = 0;
+  char* mpienv = getenv("OMPI_COMM_WORLD_SIZE");
+  if (mpienv != NULL) {
+    mpiSize_ = atoi(mpienv);
+  } else {
+    mpiSize_ = 1;
+  }
+  status_ = PSERVER_STATUS_NOT_SET;
+  dataMems_.resize(FLAGS_num_gradient_servers);
+  synchronizeBarriers_.resize(SyncObject_ARRAYSIZE);
+  for (auto& barrier : synchronizeBarriers_) {
+    barrier.reset(new ThreadBarrier(FLAGS_num_gradient_servers));
+  }
+
+  // initialization for dicarding lagging gradient
+  asyncUpdateSteps_ = 0;
+  asyncTrainerSteps_.resize(FLAGS_num_gradient_servers);
+  asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
+  asyncLaggedGradientsNum_ = 0;
+  asyncUpdateStat_.resize(static_cast<int>(FLAGS_num_gradient_servers *
+                                           FLAGS_async_lagged_ratio_default));
+  asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
+  asyncTrainerDiscardStat_.resize(FLAGS_num_gradient_servers);
+  asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
+  asyncTrainerCommitStat_.resize(FLAGS_num_gradient_servers);
+  asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
+
+  return true;
+}
+
+void ParameterServer2::getStatus(const GetStatusRequest& request,
+                                 ProtoResponseCallback callback) {
+  (void)request;
+  GetStatusResponse response;
+  response.set_status(status_);
+  callback(response);
+}
+
+void ParameterServer2::setStatus(const SetStatusRequest& request,
+                                 ProtoResponseCallback callback) {
+  status_ = request.status();
+  SetStatusResponse response;
+  callback(response);
+}
+
+void ParameterServer2::setConfig(const SetConfigRequest& request,
+                                 ProtoResponseCallback callback) {
+  {
+    std::lock_guard<RWLock> guard(parameterMutex_);
+
+    serverId_ = request.server_id();
+    isSparseServer_ = request.is_sparse_server();
+
+    if (!request.save_dir().empty()) {
+      mkDir(request.save_dir().c_str());
+    }
+
+    for (const auto& config : request.param_configs()) {
+      CHECK(!configMap_.count(config.para_id()))
+          << "Duplicated parameter name: " << config.name();
+      configMap_[config.para_id()] = config;
+      CHECK_EQ(config.sparse_remote_update(), isSparseServer_);
+    }
+
+    config_ = request.opt_config();
+    if (config_.algorithm() == TrainAlgorithm::AsyncSGD) {
+      auto asyncLaggedRatio = config_.async_lagged_grad_discard_ratio();
+      if (asyncLaggedRatio <= FLAGS_async_lagged_ratio_min) {
+        LOG(INFO) << "WARNING: async_lagged_grad_discard_ratio is too small"
+                  << "reset to default, async_lagged_grad_discard_ratio = "
+                  << FLAGS_async_lagged_ratio_default;
+        asyncLaggedRatio = FLAGS_async_lagged_ratio_default;
+      }
+      asyncLaggedThreshold_ =
+          static_cast<int64_t>(FLAGS_num_gradient_servers * asyncLaggedRatio);
+      LOG(INFO) << "discard lagged async gradient ratio: " << asyncLaggedRatio
+                << " asyncLaggedhreshold: " << asyncLaggedThreshold_;
+    }
+    if (isSparseServer_ && config_.num_batches_per_send_parameter() > 1) {
+      /// sparse server must NOT use local update mode
+      config_.set_num_batches_per_send_parameter(1);
+    }
+
+    if (config_.num_batches_per_send_parameter() > 1 &&
+        config_.center_parameter_update_method() == "average") {
+      /// scaling L1/L2 decay rate as large as L1/L2 apply in trainer
+      /// if parameter regularization in pserver
+      for (auto& pair : configMap_) {
+        ParameterConfig& config = pair.second;
+        if (config_.num_batches_per_send_parameter() ==
+            config.num_batches_regularization()) {
+          real scale =
+              config_.delta_add_rate() * config.num_batches_regularization();
+          if (config_.algorithm() == "sgd") {
+            scale *= FLAGS_num_gradient_servers;
+          }
+          config.set_decay_rate(config.decay_rate() * scale);
+          if (config.decay_rate() > 0.1f) {
+            LOG(FATAL) << "L2 decay=" << config.decay_rate()
+                       << " for parameter:" << config.name()
+                       << " is too large after scale in pserver!";
+          }
+          config.set_decay_rate_l1(config.decay_rate_l1() * scale);
+          if (config.decay_rate_l1() > 0.1f) {
+            LOG(FATAL) << "L1 decay=" << config.decay_rate_l1()
+                       << " for parameter:" << config.name()
+                       << " is too large after scale in pserver!";
+          }
+
+          LOG(INFO) << "parameter:" << config.name()
+                    << " decay apply in pserver,"
+                    << " L1 decay=" << config.decay_rate_l1()
+                    << " L2 decay=" << config.decay_rate();
+        }
+      }
+    }
+  }
+
+  SetConfigResponse response;
+  callback(response);
+}
+
+real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
+  real sum = 0;
+  for (const auto buffer : buffers) {
+    for (size_t i = 0; i < buffer.size; ++i) {
+      sum += buffer.base[i];
+    }
+  }
+  return sum;
+}
+
+void ParameterServer2::mergeSegments(BlockSegments* segments) {
+  if (segments->empty()) {
+    return;
+  }
+  std::sort(segments->begin(), segments->end());
+  auto curr = segments->begin();
+  for (auto it = segments->begin(); it != segments->end(); ++it) {
+    if (it->first <= curr->second) {
+      curr->second = std::max(curr->second, it->second);
+    } else {
+      ++curr;
+      *curr = *it;
+    }
+  }
+  ++curr;
+  segments->erase(curr, segments->end());
+}
+
+void ParameterServer2::setParameter(const SendParameterRequest& request,
+                                    std::vector<Buffer>& inputBuffers,
+                                    SendParameterResponse* response,
+                                    std::vector<Buffer>* outputBuffers) {
+  (void)response;
+  (void)outputBuffers;
+  LOG(INFO) << "pserver: setParameter";
+  std::lock_guard<RWLock> guard(parameterMutex_);
+
+  int64_t numBlocks = blockIdMap_.size();
+  CHECK_EQ(blockIdMap_.size(), blockOffsetMap_.size());
+  /// total bytes for all the added blocks
+  int64_t totalSize = size_;
+  std::vector<int64_t> offsets;
+  offsets.reserve(request.blocks_size());
+  std::vector<int64_t> blockIds;
+  blockIds.reserve(request.blocks_size());
+  int bufferIndex = 0;
+
+  if (!request.blocks().size()) {
+    LOG(WARNING)
+        << "--ports_num or --ports_num_for_sparse might be too large, "
+        << "or total dense parameter size or sparse parameters size "
+        << "might be too small, this psever doesn't store any parameter.";
+    return;
+  }
+
+  for (const auto& block : request.blocks()) {
+    /// block size for parameter(e.g. 128 for sparse row, 1K for dense)
+    uint64_t blockSize = getParameterConfig(block).parameter_block_size();
+    BlockKey key(block.para_id(), block.block_id());
+    if (inputBuffers.size()) {  // if !=PSERVER_UPDATE_MODE_SET_PARAM_ZERO
+      Buffer buffer = inputBuffers[bufferIndex];
+      ++bufferIndex;
+      CHECK_EQ(buffer.size, block.block_size())
+          << "data size is too big:"
+          << " block_size=" << block.block_size()
+          << " data_size=" << buffer.size;
+    }
+
+    /// add a new block
+    if (blockIdMap_.count(key) == 0) {
+      blockOffsetMap_[key] = totalSize;
+      blockIdMap_[key] = numBlocks;
+      ++numBlocks;
+      totalSize += blockSize;
+    }
+    offsets.push_back(blockOffsetMap_[key]);
+    blockIds.push_back(blockIdMap_[key]);
+  }
+
+  size_ = totalSize;
+  LOG(INFO) << "pserver: new cpuvector: size=" << size_;
+  if (!vectors_[PARAMETER_VALUE]) {
+    /// vectors_
+    const auto types = sgdOptimizerGetTypes(config_, true /*inPserver*/);
+    for (const auto type : types) {
+      vectors_[type].reset(new CpuVector(size_));
+      vectors_[type]->zeroMem();
+    }
+
+    blockInfos_.resize(numBlocks);
+    for (auto& info : blockInfos_) {
+      info.lock.reset(new std::mutex());
+    }
+  } else {
+    CHECK_EQ((size_t)size_, vectors_[PARAMETER_VALUE]->getSize())
+        << "Currently adding new blocks is not supported. "
+        << "All blocks must be added in one setParameter call";
+  }
+
+  VectorPtr buf = vectors_[PARAMETER_VALUE];
+  usedSegments_.reserve(offsets.size());
+  /// if offsets is empty, means parameter_block_size is too big or too many
+  /// nodes.
+  if (offsets.empty()) {
+    LOG(WARNING) << "in setParameter: offsets is empty";
+  }
+  for (size_t i = 0; i < offsets.size(); ++i) {
+    size_t blockId = blockIds[i];
+    BlockInfo& info = blockInfos_[blockId];
+    const ParameterConfig& config = getParameterConfig(request.blocks(i));
+    info.config = &config;
+    info.offset = offsets[i];
+    info.optimizer.reset(sgdOptimizerCreate(
+        config_, config, config.sparse_remote_update(), true /*inPserver*/));
+    if (config.sparse_remote_update()) {
+      size_t width = config.dims(1);
+      CHECK_EQ(config.parameter_block_size(), width)
+          << "block size: " << config.parameter_block_size()
+          << "width : " << width;
+    }
+    info.optimizer->init(1, info.config);
+    usedSegments_.push_back(std::make_pair(
+        offsets[i], offsets[i] + request.blocks(i).block_size()));
+  }
+  mergeSegments(&usedSegments_);
+
+  if (request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM) {
+    /// copy param from trainer
+    for (size_t i = 0; i < offsets.size(); ++i) {
+      Buffer buffer = inputBuffers[i];
+      real* start = buf->getPoint(offsets[i]);
+      CHECK_LE(offsets[i] + buffer.size, buf->getSize());
+      memcpy(start, buffer.base, sizeof(real) * buffer.size);
+    }
+  } else {
+    CHECK(request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
+    /// nothing to do, value vector zero mem already
+  }
+}
+
+void ParameterServer2::addGradient(const SendParameterRequest& request,
+                                   std::vector<Buffer>& inputBuffers,
+                                   SendParameterResponse* response,
+                                   std::vector<Buffer>* outputBuffers) {
+  VLOG(1) << "pserver: addGradient";
+
+  {
+    ReadLockGuard guard(parameterMutex_);
+    int bufferIndex = 0;
+    for (const auto& block : request.blocks()) {
+      int64_t offset = getBlockOffset(block);
+      CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
+                          << " id=" << block.para_id()
+                          << " block id=" << block.block_id();
+
+      int64_t blockId = getBlockId(block);
+      CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: "
+                           << " id=" << block.para_id()
+                           << " block id=" << block.block_id();
+
+      Buffer buffer = inputBuffers[bufferIndex];
+      ++bufferIndex;
+
+      const real* gradientBuffer = buffer.base;
+      real* gradientSumBuffer = vectors_[PARAMETER_GRADIENT]->getPoint(offset);
+
+      size_t size = buffer.size;
+
+      BlockInfo& info = blockInfos_[blockId];
+      const ParameterConfig& config = getParameterConfig(blockId);
+      if (config.sparse_remote_update()) {
+        CHECK_EQ(size, config.parameter_block_size());
+      } else {  // dense
+        CHECK_LE(size, config.parameter_block_size());
+      }
+      std::lock_guard<std::mutex> guard(*info.lock);
+      simd::addTo(gradientSumBuffer, gradientBuffer, size);
+    }
+  }
+  if (request.batch_status() == BATCH_FINISH ||
+      request.batch_status() == BATCH_START_AND_FINISH) {
+    numSamplesProcessed_ += request.num_samples();
+    cost_ += request.cost();
+    VLOG(1) << "num samples: " << numSamplesProcessed_
+            << ", new cost:" << cost_;
+
+    /// notify doOperation gradient ready
+    gradientReadyBarrier_.wait();
+
+    /// wait doOperation finish
+    parameterReadyBarrier_.wait();
+    VLOG(1) << "start send back";
+  }
+}
+
+bool ParameterServer2::asyncGrdientCommitCheckAndStat(
+    const SendParameterRequest& request) {
+  const auto trainerId = request.trainer_id();
+  int64_t trainerSteps = asyncTrainerSteps_[trainerId];
+  CHECK_GE(asyncUpdateSteps_, trainerSteps)
+      << " async update steps overflows "
+      << " trainer id: " << trainerId
+      << " async update steps in pserver: " << asyncUpdateSteps_
+      << " async update steps in request: " << trainerSteps;
+
+  asyncUpdateSteps_++;
+  bool commitGradient = true;
+
+  int64_t delta = asyncUpdateSteps_ - trainerSteps;
+  if (delta >= asyncLaggedThreshold_) {
+    VLOG(1) << "discard Async Update: "
+            << " trainer id: " << trainerId
+            << " pserver steps: " << asyncUpdateSteps_
+            << " request steps: " << trainerSteps;
+    asyncLaggedGradientsNum_++;
+    commitGradient = false;
+  }
+  /// stat on lagged steps, to get total discard distribution
+  if (static_cast<size_t>(delta) < asyncUpdateStat_.size()) {
+    asyncUpdateStat_[delta]++;
+  } else {
+    asyncUpdateStat_[asyncUpdateStat_.size() - 1]++;
+  }
+  /// stat on trainerId and discard, to get trainer condition
+  if (commitGradient) {
+    asyncTrainerCommitStat_[trainerId]++;
+  } else {
+    asyncTrainerDiscardStat_[trainerId]++;
+  }
+
+  return commitGradient;
+}
+
+static ThreadLocal<std::vector<bool>> localBlockBitset_;
+
+void ParameterServer2::asyncSGD(const SendParameterRequest& request,
+                                std::vector<Buffer>& inputBuffers,
+                                SendParameterResponse* response,
+                                std::vector<Buffer>* outputBuffers) {
+  int64_t numBlocks = blockIdMap_.size();
+  auto& localBlockBitset = *localBlockBitset_;
+
+  if (isSparseServer_) {
+    if (localBlockBitset.empty()) {
+      localBlockBitset.resize(numBlocks);
+    }
+    localBlockBitset.assign(numBlocks, false);
+  }
+
+  ReadLockGuard guard(parameterMutex_);
+
+  if (request.send_back_parameter()) {
+    outputBuffers->reserve(request.blocks_size());
+  }
+
+  bool commitGradient = asyncGrdientCommitCheckAndStat(request);
+
+  VectorPtr* vecs = parameter::getThreadLocalBuffer();
+  size_t bufferIndex = 0;
+  for (const auto& block : request.blocks()) {
+    int64_t offset = getBlockOffset(block);
+    CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
+                        << " id=" << block.para_id()
+                        << " block id=" << block.block_id();
+    int64_t blockId = getBlockId(block);
+    CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: "
+                         << " id=" << block.para_id()
+                         << " block id=" << block.block_id();
+    Buffer buffer = inputBuffers[bufferIndex];
+    ++bufferIndex;
+
+    size_t size = buffer.size;
+
+    BlockInfo& info = blockInfos_[blockId];
+    const ParameterConfig& config = getParameterConfig(blockId);
+
+    std::lock_guard<std::mutex> guard(*info.lock);
+    /// gradients are too obsolete, will be discarded
+    if (commitGradient) {
+      info.optimizer->startBatch(numSamplesProcessed_);
+
+      for (const auto type : info.optimizer->getParameterTypes()) {
+        vecs[type]->subVecFrom(*vectors_[type], offset, size);
+      }
+      vecs[PARAMETER_GRADIENT]->subVecFrom(buffer.base, 0, size);
+      info.optimizer->update(vecs, config, isSparseServer_ ? 0 : -1);
+
+      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
+        blockTraverse(info, config, offset, size, vecs, callback);
+      }
+      info.optimizer->finishBatch();
+    }
+
+    if (commitGradient && isSparseServer_) {
+      localBlockBitset[blockId] = true;
+    }
+
+    if (!isSparseServer_ && request.send_back_parameter()) {  // dense
+      int type = request.send_back_parameter_type();
+      sendBackParameter(block, type, response, &buffer, outputBuffers);
+    }
+  }  /// foreach block
+
+  asyncTrainerSteps_[request.trainer_id()] = asyncUpdateSteps_;
+
+  if (commitGradient && isSparseServer_) {
+    /// find blocks that trainer do not request update
+    for (int64_t blockId = 0; blockId < numBlocks; ++blockId) {
+      if (localBlockBitset[blockId]) {
+        continue;
+      }
+
+      BlockInfo& info = blockInfos_[blockId];
+      const ParameterConfig& config = *info.config;
+      size_t size = config.parameter_block_size();
+
+      std::lock_guard<std::mutex> guard(*info.lock);
+      info.optimizer->startBatch(numSamplesProcessed_);
+      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
+        blockTraverse(info, config, info.offset, size, vecs, callback);
+      }
+      info.optimizer->finishBatch();
+    }
+  }
+
+  if (commitGradient && (request.batch_status() == BATCH_FINISH ||
+                         request.batch_status() == BATCH_START_AND_FINISH)) {
+    numSamplesProcessed_ += request.num_samples();
+  }
+
+  /// show some performance log if needed
+  if (request.trainer_id() == 0) {
+    /// batchId_ is approximately equal to "real batchId_"
+    batchId_++;
+  }
+}
+
+void ParameterServer2::getParameter(const SendParameterRequest& request,
+                                    std::vector<Buffer>& inputBuffers,
+                                    SendParameterResponse* response,
+                                    std::vector<Buffer>* outputBuffers) {
+  (void)inputBuffers;
+  LOG(INFO) << "pserver: getParameter";
+  ReadLockGuard guard(parameterMutex_);
+  for (const auto& block : request.blocks()) {
+    int type = request.send_back_parameter_type();
+    sendBackParameter(block, type, response, outputBuffers);
+  }
+}
+
+void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
+                                          std::vector<Buffer>& inputBuffers,
+                                          SendParameterResponse* response,
+                                          std::vector<Buffer>* outputBuffers) {
+  (void)inputBuffers;
+  auto& buffer = *readWriteBuffer_;
+  size_t numReals = 0;
+  for (const auto& block : request.blocks()) {
+    numReals += getParameterConfig(block).dims(1);
+  }
+  buffer.resize(numReals);
+
+  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
+
+  ReadLockGuard guard(parameterMutex_);
+  size_t offset = 0;
+  for (const auto& block : request.blocks()) {
+    size_t width = getParameterConfig(block).dims(1);
+    Buffer buf = {buffer.data() + offset, width};
+    int type = request.send_back_parameter_type();
+    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
+    offset += width;
+  }
+}
+
+void ParameterServer2::sendBackParameter(const ParameterBlock& block,
+                                         int parameterType,
+                                         SendParameterResponse* response,
+                                         std::vector<Buffer>* outputBuffers) {
+  ParameterBlock* returnBlock = response->add_blocks();
+  returnBlock->set_para_id(block.para_id());
+  returnBlock->set_block_id(block.block_id());
+  returnBlock->set_begin_pos(block.begin_pos());
+  returnBlock->set_block_size(block.block_size());
+
+  int64_t offset = getBlockOffset(block);
+  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
+                      << " id=" << block.para_id()
+                      << " block id=" << block.block_id();
+
+  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
+  outputBuffers->push_back({valueBuffer, (size_t)block.block_size()});
+}
+
+void ParameterServer2::sendBackParameter(const ParameterBlock& block,
+                                         int parameterType,
+                                         SendParameterResponse* response,
+                                         Buffer* buffer,
+                                         std::vector<Buffer>* outputBuffers) {
+  ParameterBlock* returnBlock = response->add_blocks();
+  returnBlock->set_para_id(block.para_id());
+  returnBlock->set_block_id(block.block_id());
+  returnBlock->set_begin_pos(block.begin_pos());
+  returnBlock->set_block_size(block.block_size());
+
+  int64_t offset = getBlockOffset(block);
+  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
+                      << " id=" << block.para_id()
+                      << " block id=" << block.block_id();
+
+  size_t size = buffer->size;
+  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
+  /// copy to second buffer to avoid to be polluted by other request
+  memcpy(buffer->base, valueBuffer, sizeof(real) * size);
+  outputBuffers->push_back({buffer->base, size});
+}
+
+void ParameterServer2::sendBackParameterSparse(
+    const ParameterBlock& block,
+    int parameterType,
+    SendParameterResponse* response,
+    Buffer* buffer,
+    size_t width,
+    std::vector<Buffer>* outputBuffers) {
+  ParameterBlock* returnBlock = response->add_blocks();
+  returnBlock->set_para_id(block.para_id());
+  returnBlock->set_block_id(block.block_id());
+  returnBlock->set_begin_pos(block.begin_pos());
+  returnBlock->set_block_size(block.block_size());
+  int64_t offset = getBlockOffset(block);
+  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
+                      << " id=" << block.para_id()
+                      << " block id=" << block.block_id();
+
+  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
+  CHECK_EQ(buffer->size, width);
+  memcpy(buffer->base, valueBuffer, width * sizeof(real));
+  outputBuffers->push_back(*buffer);
+}
+
+void ParameterServer2::readAllBlocks(
+    MsgReader* msgReader, std::vector<ParameterServer2::Buffer>* buffers) {
+  auto& buffer = *readWriteBuffer_;
+  size_t numBlocks = msgReader->getNumBlocks();
+  buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real),
+                              numBlocks);
+  std::vector<void*> bufs(numBlocks);
+  buffers->clear();
+  buffers->reserve(numBlocks);
+  buffer.resetAlignAlloc();
+  for (size_t i = 0; i < numBlocks; ++i) {
+    size_t len = msgReader->getBlockLength(i);
+    CHECK_EQ(len % sizeof(real), (size_t)0);
+    size_t size = len / sizeof(real);
+    bufs[i] = buffer.nextBlock(size);
+    buffers->push_back({(real*)bufs[i], size});
+  }
+  msgReader->readBlocks(bufs);
+}
+
+void ParameterServer2::sendParameter(const SendParameterRequest& request,
+                                     std::unique_ptr<MsgReader> msgReader,
+                                     ProtoResponseCallbackEx callback) {
+  SendParameterResponse response;
+  std::vector<Buffer> inputBuffers;
+  std::vector<Buffer> outputBuffers;
+  readAllBlocks(msgReader.get(), &inputBuffers);
+  msgReader.reset();
+
+  switch (request.update_mode()) {
+    case PSERVER_UPDATE_MODE_SET_PARAM:
+    case PSERVER_UPDATE_MODE_SET_PARAM_ZERO:
+      setParameter(request, inputBuffers, &response, &outputBuffers);
+      break;
+    case PSERVER_UPDATE_MODE_GET_PARAM:
+      getParameter(request, inputBuffers, &response, &outputBuffers);
+      break;
+    case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE:
+      getParameterSparse(request, inputBuffers, &response, &outputBuffers);
+      break;
+    case PSERVER_UPDATE_MODE_ASYNC_SGD:
+      asyncSGD(request, inputBuffers, &response, &outputBuffers);
+      break;
+    case PSERVER_UPDATE_MODE_ADD_GRADIENT:
+      addGradient(request, inputBuffers, &response, &outputBuffers);
+      break;
+    case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER:
+      break;
+  }
+  switch (request.update_mode()) {
+    case PSERVER_UPDATE_MODE_ADD_GRADIENT:
+      (*requestVec_).push_back(request);
+      (*callbackVec_).push_back(callback);
+      if (request.batch_status() == BATCH_FINISH ||
+          request.batch_status() == BATCH_START_AND_FINISH) {
+        for (size_t i = 0; i < (*requestVec_).size(); i++) {
+          ReadLockGuard guard(parameterMutex_);
+          SendParameterRequest& request = (*requestVec_)[i];
+          SendParameterResponse responseTemp;
+
+          std::vector<iovec> outputIovs;
+          if (request.send_back_parameter()) {
+            CHECK(!isSparseServer_);
+            std::vector<Buffer> outputBuffersTemp;
+            for (const auto& block : request.blocks()) {
+              int type = request.send_back_parameter_type();
+              sendBackParameter(block, type, &responseTemp, &outputBuffersTemp);
+            }
+            outputIovs.reserve(outputBuffersTemp.size());
+            for (auto buffer : outputBuffersTemp) {
+              outputIovs.push_back({buffer.base, buffer.size * sizeof(real)});
+            }
+          }
+
+          ProtoResponseCallbackEx& callbackTemp = (*callbackVec_)[i];
+          callbackTemp(responseTemp, outputIovs);
+        }
+        (*requestVec_).clear();
+        (*callbackVec_).clear();
+      }
+      break;
+    case PSERVER_UPDATE_MODE_SET_PARAM:
+    case PSERVER_UPDATE_MODE_SET_PARAM_ZERO:
+    case PSERVER_UPDATE_MODE_GET_PARAM:
+    case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE:
+    case PSERVER_UPDATE_MODE_ASYNC_SGD:
+    case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER:
+      std::vector<iovec> outputIovs;
+      outputIovs.reserve(outputBuffers.size());
+      for (auto buffer : outputBuffers) {
+        outputIovs.push_back({buffer.base, buffer.size * sizeof(real)});
+      }
+      callback(response, outputIovs);
+      break;
+  }
+}
+
+template <typename Dtype>
+void ParameterServer2::reduceAndSendData(const SendDataRequest& request,
+                                         std::unique_ptr<MsgReader>& msgReader,
+                                         ProtoResponseCallbackEx& callback) {
+  SendDataResponse response;
+  response.set_type(request.type());
+  response.set_server_id(serverId_);
+
+  auto sendData = reinterpret_cast<Dtype*>(dataMems_[0].get()->getBuf());
+  size_t rawMemSize = dataMems_[0].get()->getSize();
+  CHECK_EQ(rawMemSize % sizeof(Dtype), 0U);
+  size_t dataMemSize = rawMemSize / sizeof(Dtype);
+  for (size_t i = 1; i < dataMems_.size(); ++i) {
+    CHECK_EQ(dataMems_[i].get()->getSize(), rawMemSize);
+    auto data = reinterpret_cast<Dtype*>(dataMems_[i].get()->getBuf());
+    for (size_t j = 0; j < dataMemSize; ++j) {
+      sendData[j] += data[j];
+    }
+  }
+  std::vector<iovec> outputIovs;
+  auto block = response.add_blocks();
+  outputIovs.push_back({sendData, rawMemSize});
+  block->set_total_size(rawMemSize);
+  block->set_data_size(sizeof(Dtype));
+  callback(response, outputIovs);
+}
+
+void ParameterServer2::templateReduceSum(const SendDataRequest& request,
+                                         std::unique_ptr<MsgReader>& msgReader,
+                                         ProtoResponseCallbackEx& callback) {
+  const auto& block = request.blocks(0);
+  switch (block.data_type()) {
+    case TRANS_FLOAT:
+      reduceAndSendData<float>(request, msgReader, callback);
+      break;
+    case TRANS_DOUBLE:
+      reduceAndSendData<double>(request, msgReader, callback);
+      break;
+    case TRANS_INT32:
+      reduceAndSendData<int>(request, msgReader, callback);
+      break;
+    case TRANS_UINT32_T:
+      reduceAndSendData<uint32_t>(request, msgReader, callback);
+      break;
+    case TRANS_INT64_T:
+      reduceAndSendData<int64_t>(request, msgReader, callback);
+      break;
+    case TRANS_UINT64_T:
+      reduceAndSendData<uint64_t>(request, msgReader, callback);
+      break;
+    default:
+      LOG(FATAL) << "not supported";
+      break;
+  }
+}
+
+void ParameterServer2::sendData(const SendDataRequest& request,
+                                std::unique_ptr<MsgReader> msgReader,
+                                ProtoResponseCallbackEx callback) {
+  SendDataResponse response;
+  response.set_type(request.type());
+  response.set_server_id(serverId_);
+
+  switch (request.update_mode()) {
+    case DATA_UPDATE_MODE_SET_OWN: {
+      CHECK_EQ(msgReader->getNumBlocks(), (size_t)(request.blocks_size()));
+      size_t totalLen = msgReader->getTotalLength();
+      if (totalLen > 0) {
+        CHECK_EQ(msgReader->getNumBlocks(), 1U)
+            << "Only one block currently support now!";
+        const auto& block = request.blocks(0);
+        if (0 == dataSize_) {
+          dataSize_ = block.data_size();
+        } else {
+          CHECK_EQ(dataSize_, block.data_size());
+        }
+        int64_t serverId = request.server_id();
+        if (serverId_ < 0) {
+          serverId_ = serverId;
+        } else {
+          CHECK_EQ(serverId_, serverId);
+        }
+        int64_t clientId = request.client_id();
+        dataMems_[clientId] = std::make_shared<CpuMemoryHandle>(totalLen);
+        CHECK_EQ(totalLen % sizeof(block.data_size()), 0U);
+        msgReader->readNextBlock(dataMems_[clientId].get()->getBuf());
+      }
+      msgReader.reset();
+      std::vector<iovec> outputIovs;
+      callback(response, outputIovs);
+      break;
+    }
+    case DATA_UPDATE_MODE_GET_ALL: {
+      /// Currently only support DATA_REDUCE_SUM
+      /// And their Operations are just add
+      CHECK(DATA_REDUCE_SUM == request.type());
+      templateReduceSum(request, msgReader, callback);
+      break;
+    }
+    default: { LOG(FATAL) << "not supported"; }
+  }
+}
+
+void ParameterServer2::clearUnusedSegments(CpuVector* vec) {
+  real* data = vec->getData();
+  if (usedSegments_.empty()) {
+    return;
+  }
+  memset(data, 0, sizeof(real) * usedSegments_[0].first);
+  memset(data + usedSegments_.back().second,
+         0,
+         sizeof(real) * (size_ - usedSegments_.back().second));
+  size_t n = size_ - usedSegments_.back().second;
+
+  for (size_t i = 1; i < usedSegments_.size(); ++i) {
+    memset(
+        data + usedSegments_[i - 1].second,
+        0,
+        sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second));
+    n += usedSegments_[i].first - usedSegments_[i - 1].second;
+  }
+}
+
+void ParameterServer2::parallelExecForEachBlock(ExecFunc func) {
+  SyncThreadPool::execHelper(
+      syncThreadPool_.get(), [&](int tid, size_t numThreads) {
+        int64_t numBlocks = blockIdMap_.size();
+        VectorPtr* vecs = parameter::getThreadLocalBuffer();
+        for (int64_t blockId = tid; blockId < numBlocks;
+             blockId += numThreads) {
+          func(blockId, vecs);
+        }
+      });
+}
+
+void ParameterServer2::blockTraverse(
+    BlockInfo& info,
+    const ParameterConfig& config,
+    int64_t offset,
+    size_t size,
+    const VectorPtr vecs[],
+    const ParameterOptimizer::TraverseCallback& callback) {
+  /// setup sub bufs
+  for (const auto type : info.optimizer->getParameterTypes()) {
+    vecs[type]->subVecFrom(*vectors_[type], offset, size);
+  }
+  callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU);
+}
+
+void ParameterServer2::op_SGD(const Operation& operation,
+                              OperationResult* result) {
+  (void)operation;
+  (void)result;
+
+  if (allClientPassFinish_) {
+    /// when all clients signal pass finished, the update
+    /// is empty.
+    return;
+  }
+
+  {
+    parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
+      BlockInfo& info = blockInfos_[blockId];
+      const ParameterConfig& config = getParameterConfig(blockId);
+      int64_t offset = info.offset;
+      size_t size = config.parameter_block_size();
+
+      info.optimizer->startBatch(numSamplesProcessed_);
+
+      for (const auto type : info.optimizer->getParameterTypes()) {
+        vecs[type]->subVecFrom(*vectors_[type], offset, size);
+      }
+      info.optimizer->update(
+          vecs, config, config.sparse_remote_update() ? 0 : -1LU);
+      vecs[PARAMETER_GRADIENT]->zeroMem();
+
+      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
+        blockTraverse(info, config, offset, size, vecs, callback);
+      }
+      info.optimizer->finishBatch();
+    });
+  }
+
+  batchId_++;
+}
+
+void ParameterServer2::op_start_pass(const Operation& operation,
+                                     OperationResult* result) {
+  (void)operation;
+  (void)result;
+
+  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
+    BlockInfo& info = blockInfos_[blockId];
+    info.optimizer->startPass();
+  });
+}
+
+void ParameterServer2::op_finish_pass(const Operation& operation,
+                                      OperationResult* result) {
+  (void)operation;
+  (void)result;
+
+  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
+    BlockInfo& info = blockInfos_[blockId];
+    const ParameterConfig& config = getParameterConfig(blockId);
+    size_t size = config.parameter_block_size();
+
+    /// catch up with
+    if (auto callback = info.optimizer->startCatchUpWith()) {
+      blockTraverse(info, config, info.offset, size, vecs, callback);
+      info.optimizer->finishCatchUpWith();
+    }
+
+    /// finish pass
+    info.optimizer->finishPass();
+  });
+  batchId_ = 0;
+}
+
+void ParameterServer2::op_apply(const Operation& operation,
+                                OperationResult* result) {
+  (void)operation;
+  (void)result;
+
+  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
+    BlockInfo& info = blockInfos_[blockId];
+    const ParameterConfig& config = getParameterConfig(blockId);
+    int64_t offset = info.offset;
+    size_t size = config.parameter_block_size();
+
+    // catch up with
+    if (auto callback = info.optimizer->startCatchUpWith()) {
+      blockTraverse(info, config, offset, size, vecs, callback);
+      info.optimizer->finishCatchUpWith();
+    }
+
+    // apply to PARAMETER_APPLY
+    if (auto callback = info.optimizer->apply()) {
+      blockTraverse(info, config, offset, size, vecs, callback);
+    }
+  });
+}
+
+void ParameterServer2::op_randomize(const Operation& operation,
+                                    OperationResult* result) {
+  LOG(INFO) << "ParameterServer2::op_randomize: serverId=" << serverId_;
+
+  CpuVector& valueVec = *vectors_[PARAMETER_VALUE];
+
+  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
+    BlockInfo& info = blockInfos_[blockId];
+    const ParameterConfig& config = getParameterConfig(blockId);
+    size_t size = config.parameter_block_size();
+
+    vecs[PARAMETER_VALUE]->subVecFrom(valueVec, info.offset, size);
+    Parameter::randomize(vecs[PARAMETER_VALUE], config);
+  });
+}
+
+void ParameterServer2::loadValueVector(const LoadValueRequest& request,
+                                       ProtoResponseCallback callback) {
+  LoadValueResponse response;
+  LOG(INFO) << "ParameterServer2::loadValueVector: serverId=" << serverId_;
+
+  constexpr int kBufLen = 100;
+  char buf[kBufLen];
+  snprintf(buf, kBufLen, "/pserver.%04d", static_cast<int>(serverId_));
+  std::string filename = request.dir_name() + buf;
+
+  std::ifstream fs(filename, std::ios_base::binary);
+  CHECK(fs) << "Fail to open " << filename;
+
+  CpuVector& vec = *vectors_[PARAMETER_VALUE];
+  Parameter::Header header;
+  CHECK(fs.read(reinterpret_cast<char*>(&header), sizeof(header)))
+      << "Fail to read parameters in pserver";
+  CHECK(Parameter::isHeaderFormatSupported(header.format))
+      << "Incorrect format version: " << header.format;
+  CHECK_EQ(header.size, (size_t)size_)
+      << "The size (" << header.size << ") in the file does not match the size "
+      << "(" << size_ << ") of the pserver: " << serverId_;
+  CHECK_EQ(header.valueSize, sizeof(real)) << "Unsupported valueSize "
+                                           << header.valueSize;
+  CHECK(fs.read(reinterpret_cast<char*>(vec.getData()),
+                header.size * sizeof(real)));
+
+  callback(response);
+}
+
+void ParameterServer2::saveValueVector(const SaveValueRequest& request,
+                                       ProtoResponseCallback callback) {
+  SaveValueResponse response;
+  LOG(INFO) << "ParameterServer2::SaveValueVector: serverId=" << serverId_;
+
+  mkDir(request.dir_name().c_str());
+
+  constexpr int kBufLen = 100;
+  char buf[kBufLen];
+  snprintf(buf, kBufLen, "/pserver.%04d", static_cast<int>(serverId_));
+  std::string filename = request.dir_name() + buf;
+
+  std::ofstream fs(filename, std::ios_base::binary);
+  CHECK(fs) << "Fail to open " << filename;
+
+  CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY]
+                                             : *vectors_[PARAMETER_VALUE];
+  Parameter::Header header;
+  // TODO(TJ): save param headerFormat_
+  header.format = PARAM_FORMAT_ORIGINAL;
+  header.valueSize = sizeof(real);
+  header.size = size_;
+
+  CHECK_EQ(header.size, vec.getSize());
+
+  CHECK(fs.write(reinterpret_cast<char*>(&header), sizeof(header)))
+      << "Fail to write parameter in pserver: " << serverId_;
+
+  CHECK(fs.write(reinterpret_cast<char*>(vec.getData()),
+                 header.size * sizeof(real)))
+      << "Fail to write parameter in pserver: " << serverId_;
+
+  callback(response);
+}
+
+void ParameterServer2::op_RESET(const Operation& operation,
+                                OperationResult* result) {
+  (void)result;
+  CpuVector* u = vectors_[operation.pvectors(0)].get();
+  u->reset(operation.scalars(0));
+  clearUnusedSegments(u);
+}
+
+void ParameterServer2::op_utv(const Operation& operation,
+                              OperationResult* result) {
+  real* u = vectors_[operation.pvectors(0)]->getData();
+  real* v = vectors_[operation.pvectors(1)]->getData();
+  int64_t size = size_;
+  double sum = 0;
+  for (int64_t i = 0; i < size; ++i) {
+    sum += (double)u[i] * (double)v[i];
+  }
+  result->add_scalars(sum);
+}
+
+void ParameterServer2::op_au_bv(const Operation& operation,
+                                OperationResult* result) {
+  (void)result;
+  real* u = vectors_[operation.pvectors(0)]->getData();
+  real* v = vectors_[operation.pvectors(1)]->getData();
+  int64_t size = size_;
+  real a = operation.scalars(0);
+  real b = operation.scalars(1);
+  for (int64_t i = 0; i < size; ++i) {
+    v[i] = a * u[i] + b * v[i];
+  }
+}
+
+void ParameterServer2::op_COPY(const Operation& operation,
+                               OperationResult* result) {
+  (void)result;
+  real* u = vectors_[operation.pvectors(0)]->getData();
+  real* v = vectors_[operation.pvectors(1)]->getData();
+  int64_t size = size_;
+  for (int64_t i = 0; i < size; ++i) {
+    v[i] = u[i];
+  }
+}
+
+void ParameterServer2::op_au(const Operation& operation,
+                             OperationResult* result) {
+  (void)result;
+  real* u = vectors_[operation.pvectors(0)]->getData();
+  int64_t size = size_;
+  real a = operation.scalars(0);
+  for (int64_t i = 0; i < size; ++i) {
+    u[i] *= a;
+  }
+}
+
+void ParameterServer2::op_au_bv_cw(const Operation& operation,
+                                   OperationResult* result) {
+  (void)result;
+  real* u = vectors_[operation.pvectors(0)]->getData();
+  real* v = vectors_[operation.pvectors(1)]->getData();
+  real* w = vectors_[operation.pvectors(2)]->getData();
+  int64_t size = size_;
+  real a = operation.scalars(0);
+  real b = operation.scalars(1);
+  real c = operation.scalars(2);
+  for (int64_t i = 0; i < size; ++i) {
+    w[i] = a * u[i] + b * v[i] + c * w[i];
+  }
+}
+
+void ParameterServer2::op_make_steepest_desc_dir(const Operation& operation,
+                                                 OperationResult* result) {
+  (void)result;
+  real* dir = vectors_[operation.pvectors(0)]->getData();
+  real* grad = vectors_[operation.pvectors(1)]->getData();
+  real* x = vectors_[operation.pvectors(2)]->getData();
+  int64_t size = size_;
+  real l1weight = operation.scalars(0);
+  for (int64_t i = 0; i < size; ++i) {
+    if (x[i] < 0) {
+      dir[i] = -grad[i] + l1weight;
+    } else if (x[i] > 0) {
+      dir[i] = -grad[i] - l1weight;
+    } else {
+      if (grad[i] < -l1weight) {
+        dir[i] = -grad[i] - l1weight;
+      } else if (grad[i] > l1weight) {
+        dir[i] = -grad[i] + l1weight;
+      } else {
+        dir[i] = 0;
+      }
+    }
+  }
+}
+
+void ParameterServer2::op_fix_dir_signs(const Operation& operation,
+                                        OperationResult* result) {
+  (void)result;
+  real* dir = vectors_[operation.pvectors(0)]->getData();
+  real* steepestDescDir = vectors_[operation.pvectors(1)]->getData();
+  int64_t size = size_;
+  for (int64_t i = 0; i < size; ++i) {
+    if (dir[i] * steepestDescDir[i] <= 0) {
+      dir[i] = 0;
+    }
+  }
+}
+
+void ParameterServer2::op_fix_omega_signs(const Operation& operation,
+                                          OperationResult* result) {
+  (void)result;
+  real* x = vectors_[operation.pvectors(0)]->getData();
+  real* newx = vectors_[operation.pvectors(1)]->getData();
+  int64_t size = size_;
+  for (int64_t i = 0; i < size; ++i) {
+    if (x[i] * newx[i] < 0) {
+      newx[i] = 0;
+    }
+  }
+}
+
+void ParameterServer2::op_dir_deriv(const Operation& operation,
+                                    OperationResult* result) {
+  real* dir = vectors_[operation.pvectors(0)]->getData();
+  real* grad = vectors_[operation.pvectors(1)]->getData();
+  real* x = vectors_[operation.pvectors(2)]->getData();
+  int64_t size = size_;
+  real l1weight = operation.scalars(0);
+  double sum = 0;
+  for (int64_t i = 0; i < size; ++i) {
+    if (dir[i] != 0) {
+      if (x[i] < 0) {
+        sum += dir[i] * (grad[i] - l1weight);
+      } else if (x[i] > 0) {
+        sum += dir[i] * (grad[i] + l1weight);
+      } else if (dir[i] < 0) {
+        sum += dir[i] * (grad[i] - l1weight);
+      } else if (dir[i] > 0) {
+        sum += dir[i] * (grad[i] + l1weight);
+      }
+    }
+  }
+  result->add_scalars(sum);
+}
+
+void ParameterServer2::op_cost(const Operation& operation,
+                               OperationResult* result) {
+  real* x = vectors_[operation.pvectors(0)]->getData();
+  real* newgrad = vectors_[operation.pvectors(1)]->getData();
+  int64_t size = size_;
+  real l1weight = operation.scalars(0);
+  real l2weight = operation.scalars(1);
+  double cost_real = cost_ / mpiSize_;
+  double sum_weight_l1 = 0;
+  double sum_weight_l2 = 0;
+  for (int64_t i = 0; i < size; ++i) {
+    sum_weight_l1 += std::abs(x[i]);
+    sum_weight_l2 += x[i] * x[i];
+    newgrad[i] += 2.0 * l2weight * x[i];
+  }
+  cost_real += l1weight * sum_weight_l1 + l2weight * sum_weight_l2;
+  result->add_scalars(cost_real);
+}
+
+ParameterServer2::OperatorFunction ParameterServer2::opFuncs[] = {
+    nullptr,                         // PSERVER_OP_utu = 0;
+    &ParameterServer2::op_utv,       // PSERVER_OP_utv = 1;
+    &ParameterServer2::op_au,        // PSERVER_OP_au = 2;
+    &ParameterServer2::op_au_bv,     // PSERVER_OP_au_bv = 3;
+    nullptr,                         // PSERVER_OP_aAx_bu = 4;
+    &ParameterServer2::op_SGD,       // PSERVER_OP_SGD = 5;
+    &ParameterServer2::op_RESET,     // PSERVER_OP_RESET = 6;
+    &ParameterServer2::op_COPY,      // PSERVER_OP_COPY = 7;
+    &ParameterServer2::op_au_bv_cw,  // PSERVER_OP_au_bv_cw = 8;
+    &ParameterServer2::op_make_steepest_desc_dir,
+    /// PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9;
+    &ParameterServer2::op_fix_dir_signs,    // PSERVER_OP_FIX_SIGNS = 10;
+    &ParameterServer2::op_dir_deriv,        // PSERVER_OP_DIR_DERIV = 11;
+    &ParameterServer2::op_fix_omega_signs,  // PSERVER_OP_FIX_OMEGA_SIGNS = 12;
+    &ParameterServer2::op_cost,             // PSERVER_OP_COST = 13
+    &ParameterServer2::op_start_pass,       // PSERVER_OP_START_PASS = 14
+    &ParameterServer2::op_finish_pass,      // PSERVER_OP_FINISH_PASS = 15
+    &ParameterServer2::op_randomize,        // PSERVER_OP_RANDOMIZE = 16
+    &ParameterServer2::op_apply,            // PSERVER_OP_APPLY = 17
+};
+
+void ParameterServer2::doOperation(const DoOperationRequest& request,
+                                   ProtoResponseCallback callback) {
+  if (request.wait_for_gradient()) {
+    /// wait gradient update
+    gradientReadyBarrier_.wait();
+    allClientPassFinish_ = numPassFinishClients_ == FLAGS_num_gradient_servers;
+  }
+
+  DoOperationResponse response;
+  response.set_pass_finish(allClientPassFinish_);
+
+  for (const auto& op : request.operations()) {
+    OperationResult* opResult = response.add_results();
+    if (op.operation() >= ARRAYSIZE(opFuncs)) {
+      LOG(ERROR) << "Unknown operation " << op.operation();
+      response.set_return_message(kRetMsgUnknownOperation);
+    }
+    OperatorFunction opFunc = opFuncs[op.operation()];
+    if (!opFunc) {
+      LOG(ERROR) << "Operation not implemented: " << op.operation();
+      response.set_return_message(kRetMsgUnknownOperation);
+    }
+    (this->*opFunc)(op, opResult);
+  }
+
+  if (request.send_back_parameter()) {
+    /// clean current cost
+    cost_ = 0;
+
+    if (allClientPassFinish_ && request.release_pass()) {
+      /// This signals that all clients finish one pass, so waitPassFinish()
+      /// will stop waiting.
+      numPassFinishClients_ = 0;
+    }
+
+    /// notify addGradient() to send back parameter
+    parameterReadyBarrier_.wait();
+  }
+  callback(response);
+}
+
+void ParameterServer2::waitPassStart(const WaitPassStartRequest& request,
+                                     ProtoResponseCallback callback) {
+  passBarrier_.wait();
+  callback(WaitPassStartResponse());
+}
+
+void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request,
+                                      ProtoResponseCallback callback) {
+  numPassFinishClients_ += 1;
+
+  while (numPassFinishClients_ != 0) {
+    /// notify doOperation gradient ready
+    gradientReadyBarrier_.wait();
+    /// wait doOperation finish
+    parameterReadyBarrier_.wait();
+  }
+
+  callback(WaitPassFinishResponse());
+}
+
+void ParameterServer2::synchronize(const SynchronizeRequest& request,
+                                   ProtoResponseCallback callback) {
+  synchronizeBarriers_[request.sync_object_id()]->wait();
+  dataSize_ = 0;
+  callback(SynchronizeResponse());
+}
+
+void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
+                                       ProtoResponseCallback callback) {
+  synchronizeBarriers_[request.sync_object_id()]->wait();
+  callback(SynchronizeResponse());
+
+  if (request.trainer_id() == 0) {
+    batchId_ = 0;
+  }
+}
+
+void ParameterServer2::createVector(const CreateVectorRequest& request,
+                                    ProtoResponseCallback callback) {
+  (void)request;
+  CreateVectorResponse response;
+  LOG(INFO) << "ParameterServer2::createVector: size=" << size_;
+  CpuVectorPtr vec = std::make_shared<CpuVector>(size_);
+  int64_t handle = -1;
+  {
+    std::lock_guard<RWLock> guard(parameterMutex_);
+    handle = vectors_.size();
+    vectors_.push_back(vec);
+  }
+  response.set_handle(handle);
+  callback(response);
+}
+
+void ParameterServer2::releaseVector(const ReleaseVectorRequest& request,
+                                     ProtoResponseCallback callback) {
+  ReleaseVectorResponse response;
+  CpuVectorPtr vec;
+  {
+    std::lock_guard<RWLock> guard(parameterMutex_);
+    vec.swap(vectors_[request.handle()]);
+  }
+  callback(response);
+}
+
+void ParameterServer2::createMatrix(const CreateMatrixRequest& request,
+                                    ProtoResponseCallback callback) {
+  CreateMatrixResponse response;
+  /// We need to create column major matrix of size_ * num_cols
+  /// Matrix is row majoar. Need to tranpose when use it.
+  CpuMatrixPtr mat = std::make_shared<CpuMatrix>(request.num_cols(), size_);
+  int64_t handle = -1;
+  {
+    std::lock_guard<RWLock> guard(parameterMutex_);
+    handle = matrices_.size();
+    matrices_.push_back(mat);
+  }
+  response.set_handle(handle);
+  callback(response);
+}
+
+void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
+                                     ProtoResponseCallback callback) {
+  ReleaseMatrixResponse response;
+  CpuMatrixPtr mat;
+  {
+    std::lock_guard<RWLock> guard(parameterMutex_);
+    mat.swap(matrices_[request.handle()]);
+  }
+  callback(response);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/ParameterServer2.h b/paddle/legacy/pserver/ParameterServer2.h
new file mode 100644
index 0000000000000000000000000000000000000000..069e730ea4ea4b253518d70142f0f242145cd326
--- /dev/null
+++ b/paddle/legacy/pserver/ParameterServer2.h
@@ -0,0 +1,696 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <atomic>
+#include <limits>
+#include <mutex>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "paddle/legacy/math/Matrix.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/parameter/ParameterOptimizer.h"
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/ThreadLocal.h"
+
+#include "ParameterService.pb.h"
+
+#include "ProtoServer.h"
+
+DECLARE_int32(port);
+
+namespace paddle {
+
+// @TODO(yanfei):
+// if armed with high density computation resource per node, pserver could also
+// utilize GPU to reduce overhead. if this mechanism is used, it could pipeline
+// network receiving and GPU computation to reduce the network overhead even
+// further. the pipeline could help to accelerate BIG model training.
+// @TODO:(yanfei)
+// for cpu and less/low gpu machine, the time exhausted by forward and backward
+// could be larger than optimization at pserver. However, if armed with lots of
+// gpus per node and if the model size is so large enough that limited cpu
+// computation causes big optmization latency, the GPU may be required by
+// pserver.
+
+/**
+ * Client interface for the parameter server
+ *
+ * it implements several rpc API for remote parameter client usage.
+ * for sync-sgd, client needs one controller thread to build connections
+ * to all pservers, these controller connections do barriers
+ * synchronization with these connections used for transfering data.
+ * each data connection uses block based fine grained synchronization
+ * to gain better scalability. Merging gradients from different trainers
+ * are concurrently executed with block units, so that some network
+ * overhead will be hidden in merging gradient.
+ * for async-sgd, the difference is that pserver will do optimization
+ * immediately if the gradients are ready, so that pserver needs to
+ * prepare separate buffer to store value for sending back to trainer
+ * to prevent from being polluted.
+ */
+class ParameterServer2 : public ProtoServer {
+ protected:
+  /// parameter_ mutex.
+  RWLock parameterMutex_;
+
+  typedef std::pair<size_t, int64_t> BlockKey;
+  struct BlockKeyHash {
+    size_t operator()(const BlockKey& key) const {
+      return std::hash<size_t>()(key.first) + key.second;
+    }
+  };
+
+  // TODO(yanfei):
+  // if index data structure is based on parameters instead of blocks, the
+  // lookup performance could be better. In addition, the block memory
+  // access almost exhibits good locality, so index data structure and
+  // block data structure can be refined further, especially if gpu is used
+  // for pserver.
+  /**
+   * all parameters are stored in CpuVector with a blockMap_ data structure
+   * to index block data required by requests.
+   */
+  typedef std::unordered_map<BlockKey, int64_t, BlockKeyHash> BlockMap;
+  /// <(para, block), global offset(byte) in all parameters>
+  BlockMap blockOffsetMap_;
+  /// <(para, block), global idx [0, nBlocksInAllParameters]>
+  BlockMap blockIdMap_;
+
+  std::vector<CpuVectorPtr> vectors_;
+  std::vector<CpuMatrixPtr> matrices_;
+  std::vector<CpuMemHandlePtr> dataMems_;
+
+  // TODO(yanfei):
+  // if storing sparse_remote_update() flag in request instead of
+  // reading configMap_, and storing config within new block wise
+  // overview data structure, the config mapping, block mapping
+  // can be unified in single clean data structure. Use para_id
+  // to index parameters, use offset to index block within parameter
+  // and keep two index into single one.
+  /**
+   * mapping between parameter and config
+   * different parameter allows different config, such as decay_rate.
+   * for each request, it need to read config for adding gradient
+   * and optmization.
+   */
+  std::unordered_map<size_t, ParameterConfig> configMap_;
+
+  /**
+   * to parallelize the multi-thread and multi-connnection
+   * computation at pserver, it use block unit to reduce
+   * the contention for computation, even further use block
+   * level optimizater control for each block for some special
+   * reason annotated below.
+   */
+  struct BlockInfo {
+    const ParameterConfig* config;
+    std::unique_ptr<std::mutex> lock;
+    /// global offset for all parameters
+    uint64_t offset;
+    /**
+     *
+     * Async sgd in pserver is very different from sync sgd.
+     * Each trainer follows startBatch, update*, finishBatch as in
+     * sync sgd, but all these actions are almost executed by
+     * multi-core and multi-thread simutaneously, so that async
+     * sgd optimization is based on block level in reality, then
+     * per block optimization is necessary indeed. In addition,
+     * per block optimization is also perfered for performance
+     * with multithreads.
+     */
+    std::unique_ptr<ParameterOptimizer> optimizer;
+  };
+  std::vector<BlockInfo> blockInfos_;
+
+  typedef std::vector<std::pair<int64_t, int64_t>> BlockSegments;
+  /// Because some blocks might not be fully used. We keep a
+  /// record of which segments are used.
+  BlockSegments usedSegments_;
+
+  /// record pserver status, all status defined in ParameterService.pb
+  PServerStatus status_;
+  /// record all samples processed which could be used by optimizater
+  std::atomic<int64_t> numSamplesProcessed_;
+  double cost_;
+  int mpiSize_;
+  int dataSize_;
+  /// configuration for current parameter optimizer
+  OptimizationConfig config_;
+
+  /**
+   * The ReadWriteBuffer is based on std::vector, but aligned for avx/sse
+   * compute. And add some helper method to allocate memory aligned blocks.
+   *
+   * @param T          type of element.
+   * @param AlignBytes the memory aligned bytes for allocated blocks.
+   */
+  template <typename T, size_t AlignBytes>
+  class ReadWriteBuffer
+      : public std::vector<T, AlignedAllocator<T, AlignBytes>> {
+   public:
+    static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0,
+                  "Type T must be able to aligned.");
+
+    /**
+     * @brief IsTLargerThanAlign compiled time calculated constant for is type
+     * T larger than alignments.
+     */
+    constexpr static bool IsTLargerThanAlign = sizeof(T) >= AlignBytes;
+
+    static_assert(std::is_pod<T>::value, "T must be POD type.");
+
+    /**
+     * @brief if AlignBytes > sizeof(T), then will calcuate how many elements
+     * can be stored in AlignBytes.
+     */
+    constexpr static size_t AlignElementCount = AlignBytes / sizeof(T);
+
+    static_assert(AlignElementCount ==
+                          (AlignElementCount & -AlignElementCount) ||
+                      AlignBytes > sizeof(T),
+                  "AlignElementCount should be exp of 2");
+
+    /**
+     * @brief Resize Buffer, with block count that will be allocated. Each block
+     * will be memory aligned in AlignBytes.
+     * @param size The element count in all blocks.
+     * @param alignBlockCount The block count that will be allocated.
+     */
+    void resizeWithAlignHints(size_t size, size_t alignBlockCount = 1) {
+      if (IsTLargerThanAlign) {  //! So, each elements is memory aligned.
+        this->resize(size);
+      } else {
+        //! at most, we need such elements in buffer to make sure each block is
+        //! aligned.
+        this->resize(size + alignBlockCount * (AlignElementCount - 1));
+      }
+    }
+
+    /**
+     * @brief reset aligned allocate blocks.
+     */
+    void resetAlignAlloc() { this->curOffset_ = 0; }
+
+    /**
+     * @brief get next aligned block address.
+     * @param blockSize is the element count in each block.
+     * @return Aligned block address.
+     */
+    T* nextBlock(size_t blockSize) {
+      T* r = &this->operator[](curOffset_);
+      curOffset_ += blockSize;
+
+      if (!IsTLargerThanAlign) {
+        curOffset_ =
+            (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1);
+      }
+      return r;
+    }
+
+   private:
+    size_t curOffset_;
+  };
+
+  /// to buffer the data from network for further processing to
+  /// reduce redundant memory allocation.
+  ThreadLocal<ReadWriteBuffer<real, ALIGN_HINT>> readWriteBuffer_;
+
+  /// size of the parameter
+  int64_t size_;
+
+  /// for synchronized training, check details in addGradient()
+  /// and doOperation()
+  ThreadBarrier gradientReadyBarrier_;
+  ThreadBarrier parameterReadyBarrier_;
+  ThreadBarrier passBarrier_;
+  ThreadLocal<std::vector<SendParameterRequest>> requestVec_;
+  ThreadLocal<std::vector<ProtoResponseCallbackEx>> callbackVec_;
+
+  std::atomic<int> numPassFinishClients_;
+  bool allClientPassFinish_;
+
+  std::vector<std::unique_ptr<ThreadBarrier>> synchronizeBarriers_;
+  std::atomic<int> serverId_;
+
+  /**
+   *
+   * for lagged async gradient gradient commit control in Async Sgd.
+   * discard lagged gradients from too slow nodes, whose gradients
+   * exhibits bad quality.
+   * Algorithm:
+   * pserver:
+   * 1. initial asyncUpdaterSteps = 0, asyncTrainerSteps_[N] = 0.
+   * syncUpdaterSteps means
+   *    the version of parameter value.
+   * 2. when pull arrives, record asyncUpdateSteps_ into
+   * syncTrainerSteps_[trainer_id]
+   * 3. when push arrives, compare asyncUpdateSteps_ with
+   * syncTrainerSteps_[trainer_id]
+   *    if delta > threshold, discard current gradient, else commit
+   *    gradient.
+   * 4. reset asyncUpdaterSteps_ and asyncTrainerSteps_[N] when pass
+   * finished
+   * Note:
+   * it can not discard all lag-gradient strictly in some special
+   * condition. part of gradients could be discarded if
+   * ConcurrentRemoteParameterUpdater is sed.
+   * this algorithm is implemented in asynSGD()
+   */
+  int64_t asyncLaggedThreshold_;
+  std::atomic<int64_t> asyncUpdateSteps_;
+  std::vector<int64_t> asyncTrainerSteps_;
+  size_t asyncLaggedGradientsNum_;
+  /// stat all async update
+  std::vector<size_t> asyncUpdateStat_;
+  /// stat per trainer_id
+  std::vector<size_t> asyncTrainerDiscardStat_;
+  /// stat per trainer_id
+  std::vector<size_t> asyncTrainerCommitStat_;
+
+  /// only used by controller and other control cmd from trainer number 0
+  std::unique_ptr<SyncThreadPool> syncThreadPool_;
+
+  /// pserver for sparse remote update parameters
+  bool isSparseServer_;
+
+  /// barrier performance tuning sync-sgd required
+  std::atomic<int64_t> batchId_;
+
+ public:
+  struct Buffer {
+    real* base;
+    size_t size;
+  };
+
+ protected:
+  /// async gradient commit control
+  bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
+
+ public:
+  /// disable default parameter for overloading
+  /// @rdmaCpu:the id of cpu core hosting RDMA server(0-N)
+  /// -1 means using TCP transport instead of RDMA
+  ParameterServer2(const std::string& addr, int port, int rdmaCpu = -1);
+
+  ~ParameterServer2() {}
+
+  static const std::string kRetMsgInvalidMatrixHandle;
+  static const std::string kRetMsgInvalidVectorHandle;
+  static const std::string kRetMsgUnknownOperation;
+
+  /// service functions
+  template <typename Dtype>
+  void reduceAndSendData(const SendDataRequest& request,
+                         std::unique_ptr<MsgReader>& msgReader,
+                         ProtoResponseCallbackEx& callback);
+
+  void templateReduceSum(const SendDataRequest& request,
+                         std::unique_ptr<MsgReader>& msgReader,
+                         ProtoResponseCallbackEx& callback);
+
+  /**
+   * @brief framework for sending parameters
+   *
+   * @note  different parameter data type can be sent to pserver.
+   *        in most case, the api is used to send gradients from
+   *        trainer to pserver.
+   *        it also can be used to retrieve parameters from pserver
+   */
+  void sendParameter(const SendParameterRequest& request,
+                     std::unique_ptr<MsgReader> msgReader,
+                     ProtoResponseCallbackEx callback);
+
+  void sendData(const SendDataRequest& request,
+                std::unique_ptr<MsgReader> msgReader,
+                ProtoResponseCallbackEx callback);
+
+  /**
+   * @brief send config to pserver
+   *
+   * @note  it can help pserver to understand the configuration for
+   * optimization,
+   *        logging control, duplicated initialization, etc.
+   */
+  void setConfig(const SetConfigRequest& request,
+                 ProtoResponseCallback callback);
+
+  /**
+   * @brief get status for pserver
+   *
+   * @note  used to check if parameters are ready at pserver
+   */
+  void getStatus(const GetStatusRequest& request,
+                 ProtoResponseCallback callback);
+
+  /**
+   * @brief set status for pserver
+   *
+   * @note  used to check if parameters are ready at pserver, since parameters
+   *        at pserver are initialized by trainer
+   */
+  void setStatus(const SetStatusRequest& request,
+                 ProtoResponseCallback callback);
+
+  /**
+   * @brief framework for doing some operation at pserver end
+   *
+   * @note  if sync-sgd is used, controller will calling op_SGD action
+   *        for gradient optimization.
+   *        check avaiable operations in opFuncs[]
+   */
+  void doOperation(const DoOperationRequest& request,
+                   ProtoResponseCallback callback);
+
+  /// Create a column vector. The size is the dimension of parameter
+  void createVector(const CreateVectorRequest& request,
+                    ProtoResponseCallback callback);
+
+  void releaseVector(const ReleaseVectorRequest& request,
+                     ProtoResponseCallback callback);
+
+  /// Create a column major matrix. The number of rows is the dimension of
+  /// parameter. The number of columns is specifed by num_cols.
+  void createMatrix(const CreateMatrixRequest& request,
+                    ProtoResponseCallback callback);
+
+  void releaseMatrix(const ReleaseMatrixRequest& request,
+                     ProtoResponseCallback callback);
+  /**
+   * @brief stateful control for indicationg sync pass start
+   *
+   * @note  it is valuable for logging and state control,
+   *        especially for sync-sgd control
+   */
+  void waitPassStart(const WaitPassStartRequest& request,
+                     ProtoResponseCallback callback);
+
+  /**
+   * @brief stateful control for indicationg sync pass end
+   *
+   * @note  it is valuable for logging and state control,
+   *        especially for sync-sgd control
+   */
+  void waitPassFinish(const WaitPassFinishRequest& request,
+                      ProtoResponseCallback callback);
+
+  /**
+   * @brief synchronize all distributed trainers
+   *
+   * @note  it's general api for synchronizing trainer and pserver
+   */
+  void synchronize(const SynchronizeRequest& request,
+                   ProtoResponseCallback callback);
+
+  /**
+   * @brief stateful control for indicating async pass is finished
+   *
+   * @note  it is valuable for logging control, state reset, etc.
+   */
+  void asyncFinishPass(const SynchronizeRequest& request,
+                       ProtoResponseCallback callback);
+
+  void loadValueVector(const LoadValueRequest& request,
+                       ProtoResponseCallback callback);
+
+  void saveValueVector(const SaveValueRequest& request,
+                       ProtoResponseCallback callback);
+
+ public:
+  /**
+   * @brief initialize parameter server
+   */
+  bool init();
+
+  /**
+   * @brief set parameters at pserver
+   *
+   * @note  do parameter initialization if neccessy.
+   */
+  void setParameter(const SendParameterRequest& request,
+                    std::vector<Buffer>& inputBuffers,
+                    SendParameterResponse* response,
+                    std::vector<Buffer>* outputBuffers);
+
+  /**
+   * @brief receive gradients and do optimization for async-sgd
+   *
+   * @note  this api asynchronizately receives all data from all
+   *        trainers, and immediately do optimization and return
+   *        optimizated value for trainer.
+   *        this above routine are block based atomic updating,
+   *        which means different block could based different stale
+   *        gradient.
+   *        it will discard some lagged gradients by default for
+   *        better convergence.
+   */
+  void asyncSGD(const SendParameterRequest& request,
+                std::vector<Buffer>& inputBuffers,
+                SendParameterResponse* response,
+                std::vector<Buffer>* outputBuffers);
+
+  /**
+   * @brief merge gradients from all trainer
+   *
+   * @note  this api use block based parallelization as fine grained
+   *        parallelization which benifits lock contention and latency
+   *        hidden for communication, also can harness multi-core
+   *        efficiently.
+   *        it also implements the synchronization for sync-sgd
+   */
+  void addGradient(const SendParameterRequest& request,
+                   std::vector<Buffer>& inputBuffers,
+                   SendParameterResponse* response,
+                   std::vector<Buffer>* outputBuffers);
+
+  /**
+   * @brief get dense parameters from pserver
+   *
+   * @note  for some specified condition, trainer will get parameters from
+   *        pservers.
+   *        e.g.
+   *        if all parameters are stored at perver end for big model training
+   *        trainer can use it to retrieve all parameters if necessary.
+   */
+  void getParameter(const SendParameterRequest& request,
+                    std::vector<Buffer>& inputBuffers,
+                    SendParameterResponse* response,
+                    std::vector<Buffer>* outputBuffers);
+
+  /**
+   * @brief get sparse value from parameter server
+   *
+   * @note  with sparse enabled, pservers own all latest value
+   *        while trainer only retrieve value that only are needed.
+   *        e.g.
+   *        trainer will do prefetch action to retrieve necessary latest
+   *        value from pserver for sparse calculation.
+   */
+  void getParameterSparse(const SendParameterRequest& request,
+                          std::vector<Buffer>& inputBuffers,
+                          SendParameterResponse* response,
+                          std::vector<Buffer>* outputBuffers);
+
+ protected:
+  void mergeSegments(BlockSegments* segments);
+
+  /// set the unused segments to zero
+  void clearUnusedSegments(CpuVector* vec);
+
+  // TODO(yanfei):
+  // if read data and do optimization interleavely block by block,
+  // the performance could be better for gaining less network congestion.
+  /// read all data from connection and store it in static pre-allocated buffer
+  void readAllBlocks(MsgReader* msgReader,
+                     std::vector<ParameterServer2::Buffer>* buffers);
+
+  const ParameterConfig& getParameterConfig(const ParameterBlock& block) {
+    CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:"
+                                    << block.para_id();
+    const auto it = configMap_.find(block.para_id());
+    CHECK(it != configMap_.end()) << "can not find parameter id: "
+                                  << block.para_id();
+    return it->second;
+  }
+
+  /// it implictly check blockOffsetMap_ while retrieving blockId
+  const ParameterConfig& getParameterConfig(int64_t blockId) const {
+    CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size())
+        << "block idx out of range, id: " << blockId
+        << " info size: " << blockInfos_.size();
+    return *(blockInfos_[blockId].config);
+  }
+
+  template <class Response>
+  bool isValidVectorHandle(int64_t handle, Response* response) {
+    if (handle < 0 || (size_t)handle >= vectors_.size()) {
+      LOG(ERROR) << "Invalid vector handle " << handle;
+      response->set_return_message(kRetMsgInvalidVectorHandle);
+      return false;
+    }
+    return true;
+  }
+
+  template <class Response>
+  bool isValidMatrixHandle(int64_t handle, Response* response) {
+    if (handle < 0 || (size_t)handle >= matrices_.size()) {
+      LOG(ERROR) << "Invalid matrix handle " << handle;
+      response->set_return_message(kRetMsgInvalidMatrixHandle);
+      return false;
+    }
+    return true;
+  }
+
+  /**
+   * @brief get block offset
+   *
+   * @note  block.begin_dim is added to the block offset.
+   *        return -1 if block cannot be found
+   */
+  int64_t getBlockOffset(const ParameterBlock& block) const {
+    BlockKey key(block.para_id(), block.block_id());
+    auto it = blockOffsetMap_.find(key);
+    if (it == blockOffsetMap_.end()) {
+      return -1;
+    }
+    return it->second;
+  }
+
+  /// return -1 if block cannot be found
+  int64_t getBlockId(const ParameterBlock& block) const {
+    BlockKey key(block.para_id(), block.block_id());
+    auto it = blockIdMap_.find(key);
+    if (it == blockIdMap_.end()) {
+      return -1;
+    }
+    return it->second;
+  }
+
+  /**
+   * @brief prepare data for sending back
+   *
+   * @note  modify reponse and outputBuffers for sending parameter
+   *        back to client. The buffer for socket sending uses
+   *        vectors_[parameterType] directly
+   *        for dense with sync-sgd
+   */
+  void sendBackParameter(const ParameterBlock& block,
+                         int parameterType,
+                         SendParameterResponse* response,
+                         std::vector<Buffer>* outputBuffers);
+
+  /**
+   * @brief prepare data for sending back
+   *
+   * @note  modify response and outputBuffers for sending parameter
+   *        back to client. The buffer for socket sending uses buffer->base
+   *        The parameter values are copied from vectors_[parameterType]
+   *        to buffer->base.
+   *        for dense with async-sgd
+   */
+  void sendBackParameter(const ParameterBlock& block,
+                         int parameterType,
+                         SendParameterResponse* response,
+                         Buffer* buffer,
+                         std::vector<Buffer>* outputBuffers);
+  /**
+   * @brief prepare data for sending back
+   *
+   * @note  specified for sparse
+   */
+  void sendBackParameterSparse(const ParameterBlock& block,
+                               int parameterType,
+                               SendParameterResponse* response,
+                               Buffer* buffer,
+                               size_t width,
+                               std::vector<Buffer>* outputBuffers);
+
+  /**
+   * framework routine for block parallelization
+   * e.g.
+   * for optimization on all blocks at pserver end, this routine can facilitize
+   * the parallelize of do optimization on all blocks with multithreads.
+   */
+  typedef std::function<void(int64_t blockId, const VectorPtr vecs[])> ExecFunc;
+  void parallelExecForEachBlock(ExecFunc func);
+  void blockTraverse(BlockInfo& info,
+                     const ParameterConfig& config,
+                     int64_t offset,
+                     size_t size,
+                     const VectorPtr vecs[],
+                     const ParameterOptimizer::TraverseCallback& callback);
+
+ public:
+  typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation,
+                                                     OperationResult* result);
+
+  /**
+   * doOperation will call following operations indirectly
+   * e.g.
+   * for sync-sgd control, the controller in remote updater will send op_SGD
+   * command to pserver, then send sendParameter request to pserver immediately.
+   * the two function at pserver end will do cooperation to achieve the sync-sgd
+   * gradient merge and optimization.
+   * the most following operations are specified for owlqn, all operations are
+   * under the context of doOperation function
+   */
+  static OperatorFunction opFuncs[];
+
+  void op_SGD(const Operation& operation, OperationResult* result);
+
+  void op_RESET(const Operation& operation, OperationResult* result);
+
+  void op_utv(const Operation& operation, OperationResult* result);
+
+  void op_au_bv(const Operation& operation, OperationResult* result);
+
+  void op_COPY(const Operation& operation, OperationResult* result);
+
+  void op_au(const Operation& operation, OperationResult* result);
+
+  void op_au_bv_cw(const Operation& operation, OperationResult* result);
+
+  void op_make_steepest_desc_dir(const Operation& operation,
+                                 OperationResult* result);
+
+  void op_fix_dir_signs(const Operation& operation, OperationResult* result);
+
+  void op_dir_deriv(const Operation& operation, OperationResult* result);
+
+  void op_fix_omega_signs(const Operation& operation, OperationResult* result);
+
+  void op_cost(const Operation& operation, OperationResult* result);
+
+  void op_start_pass(const Operation& operation, OperationResult* result);
+  void op_finish_pass(const Operation& operation, OperationResult* result);
+
+  void op_apply(const Operation& operation, OperationResult* result);
+
+  void op_randomize(const Operation& operation, OperationResult* result);
+
+  void op_load(const Operation& operation, OperationResult* result);
+  void op_save(const Operation& operation, OperationResult* result);
+};
+
+}  // namespace paddle
diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/legacy/pserver/ParameterServer2Main.cpp
similarity index 100%
rename from paddle/pserver/ParameterServer2Main.cpp
rename to paddle/legacy/pserver/ParameterServer2Main.cpp
diff --git a/paddle/pserver/ParameterServerController.cpp b/paddle/legacy/pserver/ParameterServerController.cpp
similarity index 100%
rename from paddle/pserver/ParameterServerController.cpp
rename to paddle/legacy/pserver/ParameterServerController.cpp
diff --git a/paddle/legacy/pserver/ParameterServerController.h b/paddle/legacy/pserver/ParameterServerController.h
new file mode 100644
index 0000000000000000000000000000000000000000..b90d0cbceaa879b8cb281867b5326ff50c1e311a
--- /dev/null
+++ b/paddle/legacy/pserver/ParameterServerController.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ParameterServer2.h"
+#include "ParameterServerConfig.pb.h"
+#include "RDMANetwork.h"
+#include "paddle/legacy/utils/StringUtil.h"
+
+namespace paddle {
+
+/**
+ * @brief ParameterServerController is used for create, init and manage multi
+ * parameter server instances. The num of the instances is decided by port
+ * num(the ports number for parameter send) and network devices configured
+ * by gflags or proto.
+ */
+class ParameterServerController final {
+ public:
+  DISABLE_COPY(ParameterServerController);
+
+  /**
+   * @brief Ctor, Create a ParameterServerController from ParameterServerConfig.
+   */
+  explicit ParameterServerController(const ParameterServerConfig& config);
+
+  /**
+   * @brief Dtor.
+   */
+  ~ParameterServerController();
+
+  /**
+   * @brief create ParameterServerController from gflags, this is used for
+   * compatibility with the old usage of configuration by gflags.
+   */
+  static ParameterServerController* createFromGflags();
+
+  /**
+   * @brief create ParameterServerController with ParameterServerConfig, remove
+   * gflags from ParameterServer. Init all ParameterServer2 instances according
+   * to
+   * the config.
+   */
+  static ParameterServerController* create(const ParameterServerConfig& config);
+
+  /**
+   * @brief start all ParameterServer2 instances in this
+   * ParameterServerController.
+   */
+  void start();
+
+  /**
+   * @brief join and wait for all ParameterServer2 instances thread in this
+   * ParameterServerController.
+   */
+  void wait();
+
+ private:
+  std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
+};
+
+}  // namespace paddle
diff --git a/paddle/pserver/ProtoServer.cpp b/paddle/legacy/pserver/ProtoServer.cpp
similarity index 100%
rename from paddle/pserver/ProtoServer.cpp
rename to paddle/legacy/pserver/ProtoServer.cpp
diff --git a/paddle/legacy/pserver/ProtoServer.h b/paddle/legacy/pserver/ProtoServer.h
new file mode 100644
index 0000000000000000000000000000000000000000..2943867de5885ab1af1aa0f69e93a931092b28e3
--- /dev/null
+++ b/paddle/legacy/pserver/ProtoServer.h
@@ -0,0 +1,267 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "LightNetwork.h"
+
+#include <map>
+
+#include <google/protobuf/message_lite.h>
+
+namespace paddle {
+
+/**
+ *
+ * It implements the rpc framework, which launchs one thread for each
+ * connection. Here define one parameter server as single TCP server
+ * binding on single port. All connections share single tcp ProtoServer
+ * object, each connection handles all requests from specified trainer
+ * within single worker thread.
+ * to accelerate bandwidth efficiency and harness multicore for pserver
+ * optimization to reduce pserver latency, you could launch more port
+ * for single NIC hardward with --port=N(N>1) for small cluster job.
+ */
+class ProtoServer : public SocketServer {
+ public:
+  /// rdmaCpu controls the cpu affinity of RDMA server daemon,
+  /// which could benifit performance. rdmaCpu = -1 means TCP
+  /// is used instead of RDMA transport.
+  ProtoServer(const std::string& addr, int port, int rdmaCpu = -1)
+      : SocketServer(addr, port, rdmaCpu) {}
+
+  typedef std::function<void(const google::protobuf::MessageLite& protoOut,
+                             const std::vector<iovec>& outputIovs)>
+      ProtoResponseCallbackEx;
+
+  typedef std::function<void(const google::protobuf::MessageLite& protoOut)>
+      ProtoResponseCallback;
+
+  /**
+   * Register a service function for this server
+   * void(const ProtoIn& request,
+   *      ProtoResponseCallback callback)
+   * The service function process the request and call the callback
+   * after it finishes the request.
+
+   * Use macro REGISTER_SERVICE_FUNCTION as a helper
+   * to simplify the use.
+   */
+  template <class ProtoIn>
+  void registerServiceFunction(
+      const std::string& funcName,
+      std::function<void(const ProtoIn& request,
+                         ProtoResponseCallback callback)> func);
+
+  /**
+   * Register a service function for this server
+   * The signature of the service function is
+   * void(const ProtoIn&,
+   *      std::unique_ptr<MsgReader> msgReader,
+   *      ProtoResponseCallbackEx callback)
+   * The service function process the request and call the callback
+   * after it finishes the request.
+   * The extended service function can take extra input blocks from
+   * the communication channel by reading msgReader. It can also
+   * send extra blocks to the communication channel by providing
+   * outputIovs as the argument for the callback function.
+
+   * Use macro REGISTER_SERVICE_FUNCTION_EX as a helper
+   * to simplify the use.
+   */
+  template <class ProtoIn>
+  void registerServiceFunctionEx(
+      const std::string& funcName,
+      std::function<void(const ProtoIn&,
+                         std::unique_ptr<MsgReader> msgReader,
+                         ProtoResponseCallbackEx callback)> func);
+
+ protected:
+  /**
+   * @brief handle rpc request
+   * @param[in] msgReader  Message reader for reading data from connection
+   * @param[in] callback   equal to channel->writeMessage
+   *
+   * @note  it lookups rpc function mapping table to find function pointer,
+   *        then call this function with further reading data from connection
+   */
+  virtual void handleRequest(std::unique_ptr<MsgReader> msgReader,
+                             ResponseCallback callback);
+
+  typedef std::function<void(std::unique_ptr<MsgReader> msgReader,
+                             ResponseCallback callback)>
+      ServiceFunction;
+
+  /**
+   * @brief register one RPC function in function mapping
+   * @param[in] funcName  function name string
+   * @param[in] func      rpc function wrapped with reading and writing data
+   */
+  void registerServiceFunctionImp(const std::string& funcName,
+                                  ServiceFunction func);
+
+ protected:
+  /// Tuning bare network overhead: the beginning of receiving request
+  ThreadLocal<struct timeval> handleRequestBegin_;
+
+  /// mapping to find rpc function while handling request
+  std::map<std::string, ServiceFunction> nameToFuncMap_;
+};
+
+class ProtoClient : public SocketClient {
+ public:
+  ProtoClient(const std::string& serverAddr,
+              int serverPort,
+              enum ChannelType channelType = F_TCP)
+      : SocketClient(serverAddr, serverPort, channelType) {}
+
+  /**
+   * @brief Make a request to the server.
+   * @param[in] funcName  request rpc function name string
+   * @param[in] proto     protobuf data for sending to pserver
+   * @param[in] iov       additional iov data for sending to pserver
+   *
+   * @note  iov provides additional blocks which need to be written to the
+   *        communication channel
+   */
+  void send(const char* funcName,
+            const google::protobuf::MessageLite& proto,
+            const std::vector<iovec>& iov = std::vector<iovec>());
+
+  /**
+   * @brief receive the response from the server.
+   * @param[in] proto     proto binary buffer
+   *
+   * @note  this must be paired with a corresponding send() call. The
+   *        returned MsgReader allows the caller to receive additional
+   *        blocks from the communication channel.
+   */
+  std::unique_ptr<MsgReader> recv(google::protobuf::MessageLite* proto);
+
+  /// combines send() and recv()
+  std::unique_ptr<MsgReader> sendAndRecv(
+      const char* funcName,
+      const google::protobuf::MessageLite& protoIn,
+      google::protobuf::MessageLite* protoOut) {
+    send(funcName, protoIn);
+    return recv(protoOut);
+  }
+
+  /// combines send() and recv()
+  std::unique_ptr<MsgReader> sendAndRecv(
+      const char* funcName,
+      const google::protobuf::MessageLite& protoIn,
+      const std::vector<iovec>& iov,
+      google::protobuf::MessageLite* protoOut) {
+    send(funcName, protoIn, iov);
+    return recv(protoOut);
+  }
+};
+
+template <class>
+struct service_arg_type;
+/// helper class for obtaining the argument type of a service function
+template <class R, class C, class Arg1, class Arg2>
+struct service_arg_type<R (C::*)(const Arg1&, Arg2)> {
+  typedef Arg1 _1;
+};
+
+template <class R, class C, class Arg1, class Arg2>
+struct service_arg_type<R (C::*)(  // NOLINT
+    const Arg1&,
+    std::unique_ptr<MsgReader>,
+    Arg2)> {
+  typedef Arg1 _1;
+};
+
+/// register a service function to the ProtoServer
+/// This should only be used within a member function of className
+#define REGISTER_SERVICE_FUNCTION(className, funcName)       \
+  registerServiceFunction<                                   \
+      service_arg_type<decltype(&className::funcName)>::_1>( \
+      #funcName,                                             \
+      std::bind(&className::funcName,                        \
+                this,                                        \
+                std::placeholders::_1,                       \
+                std::placeholders::_2))
+
+/// register a service function to the ProtoServer
+/// This should only be used within a member function of className
+#define REGISTER_SERVICE_FUNCTION_EX(className, funcName)    \
+  registerServiceFunctionEx<                                 \
+      service_arg_type<decltype(&className::funcName)>::_1>( \
+      #funcName,                                             \
+      std::bind(&className::funcName,                        \
+                this,                                        \
+                std::placeholders::_1,                       \
+                std::placeholders::_2,                       \
+                std::placeholders::_3))
+
+/// create wrapper function for parameter server high level function and
+/// register the wrapper function into function mapping.
+template <class ProtoIn>
+void ProtoServer::registerServiceFunctionEx(
+    const std::string& funcName,
+    std::function<void(const ProtoIn&,
+                       std::unique_ptr<MsgReader> msgReader,
+                       ProtoResponseCallbackEx callback)> func) {
+  auto f = [func](std::unique_ptr<MsgReader> msgReader,
+                  ResponseCallback callback) {
+    ProtoIn request;
+    std::string str(msgReader->getNextBlockLength(), 0);
+    msgReader->readNextBlock(&str[0]);
+    CHECK(request.ParseFromString(str));
+    auto pcob = [callback](const google::protobuf::MessageLite& response,
+                           const std::vector<iovec>& outputIovs) {
+      std::string out;
+      CHECK(response.SerializeToString(&out));
+      std::vector<iovec> iovs;
+      iovs.push_back({&out[0], out.size()});
+      iovs.insert(iovs.end(), outputIovs.begin(), outputIovs.end());
+      callback(iovs);
+    };
+
+    func(request, std::move(msgReader), pcob);
+  };
+
+  registerServiceFunctionImp(funcName, f);
+}
+
+template <class ProtoIn>
+void ProtoServer::registerServiceFunction(
+    const std::string& funcName,
+    std::function<void(const ProtoIn&, ProtoResponseCallback callback)> func) {
+  auto f = [func](std::unique_ptr<MsgReader> msgReader,
+                  ResponseCallback callback) {
+    ProtoIn request;
+    std::string str(msgReader->getNextBlockLength(), 0);
+    msgReader->readNextBlock(&str[0]);
+    CHECK(request.ParseFromString(str));
+    msgReader.reset();
+
+    auto pcob = [callback](const google::protobuf::MessageLite& response) {
+      std::string out;
+      CHECK(response.SerializeToString(&out));
+      std::vector<iovec> iovs;
+      iovs.push_back({&out[0], out.size()});
+      callback(iovs);
+    };
+
+    func(request, pcob);
+  };
+
+  registerServiceFunctionImp(funcName, f);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/RDMANetwork.h b/paddle/legacy/pserver/RDMANetwork.h
new file mode 100644
index 0000000000000000000000000000000000000000..c87056f72c56647c827cdbd7bdd6a992b4bb1cf6
--- /dev/null
+++ b/paddle/legacy/pserver/RDMANetwork.h
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef PADDLE_DISABLE_RDMA
+#include "sxi_sock.h"
+#else
+#define PROMPT_ERR() LOG(FATAL) << "Paddle is not compiled with rdma"
+#endif
+#include "paddle/legacy/utils/Logging.h"
+
+#include <netinet/in.h>
+struct sxi_sock;
+struct sxi_socket;
+
+#ifndef MAX_VEC_SIZE
+// define default MAX_VEC_SIZE
+#define MAX_VEC_SIZE (1UL << 16)
+#endif
+
+namespace paddle {
+/// Namespace rdma is adaptors for sxi_sock.h. Make paddle not depend on it
+/// when disable rdma support
+namespace rdma {
+inline int numCpus() {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_num_configured_cpus();
+#else
+  return 0;
+#endif
+}
+
+inline sxi_socket* ssocket(int cpuId) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_ssocket(cpuId);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline int listen(sxi_socket* s) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_listen(s);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline int bind(sxi_socket* s, const char* str) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_bind(s, str);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline sxi_sock* accept(sxi_socket* s) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_accept(s);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline sockaddr_in* getSourceAddress(sxi_sock* sock) {
+#ifndef PADDLE_DISABLE_RDMA
+  return reinterpret_cast<sockaddr_in*>(&sock->sa);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline int close(sxi_socket* sock) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_socket_close(sock);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline int close(sxi_sock* sock) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_sock_close(sock);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline void init() {
+#ifndef PADDLE_DISABLE_RDMA
+  sxi_module_init();
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline sxi_socket* csocket(int cpuId) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_csocket(cpuId);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline ssize_t read(sxi_sock* channel, void* data, size_t len) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_read(channel, data, len);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline ssize_t write(sxi_sock* channel, void* data, size_t len) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_write(channel, data, len);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline ssize_t readv(sxi_sock* channel, iovec* iov, int count) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_readv(channel, iov, count);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline ssize_t writev(sxi_sock* channel, iovec* iov, int count) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_writev(channel, iov, count);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+inline sxi_sock* connect(sxi_socket* socket, const char* url) {
+#ifndef PADDLE_DISABLE_RDMA
+  return sxi_connect(socket, url);
+#else
+  PROMPT_ERR();
+#endif
+}
+
+}  //  namespace rdma
+}  //  namespace paddle
diff --git a/paddle/legacy/pserver/SocketChannel.cpp b/paddle/legacy/pserver/SocketChannel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..79c763c62ba845067c7729eafb5b218fc7b91482
--- /dev/null
+++ b/paddle/legacy/pserver/SocketChannel.cpp
@@ -0,0 +1,235 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SocketChannel.h"
+
+#include <netdb.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "RDMANetwork.h"
+
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * UIO_MAXIOV is documented in writev(2), but <sys/uio.h> only
+ * declares it on osx/ios if defined(KERNEL)
+ */
+#ifndef UIO_MAXIOV
+#define UIO_MAXIOV 512
+#endif
+
+SocketChannel::~SocketChannel() {
+  if (tcpRdma_ == F_TCP)
+    close(tcpSocket_);
+  else
+    rdma::close(rdmaSocket_);
+  LOG(INFO) << "destory connection in socket channel, peer = " << peerName_;
+}
+
+size_t SocketChannel::read(void* buf, size_t size) {
+  size_t total = 0;
+  while (total < size) {
+    ssize_t len;
+    if (tcpRdma_ == F_TCP)
+      len = ::read(tcpSocket_, (char*)buf + total, size - total);
+    else
+      len = rdma::read(rdmaSocket_, (char*)buf + total, size - total);
+
+    CHECK(len >= 0) << " peer=" << peerName_;
+    if (len <= 0) {
+      return total;
+    }
+    total += len;
+  }
+  return total;
+}
+
+size_t SocketChannel::write(const void* buf, size_t size) {
+  size_t total = 0;
+  while (total < size) {
+    ssize_t len;
+    if (tcpRdma_ == F_TCP)
+      len = ::write(tcpSocket_, (const char*)buf + total, size - total);
+    else
+      len = rdma::write(rdmaSocket_, (char*)buf + total, size - total);
+
+    CHECK(len >= 0) << " peer=" << peerName_;
+    if (len <= 0) {
+      return total;
+    }
+    total += len;
+  }
+  return total;
+}
+
+template <class IOFunc, class SocketType>
+static size_t readwritev(IOFunc iofunc,
+                         SocketType socket,
+                         iovec* iovs,
+                         int iovcnt,
+                         int maxiovs,
+                         const std::string& peerName) {
+  int curIov = 0;
+  size_t total = 0;
+
+  for (int i = 0; i < iovcnt; ++i) {
+    total += iovs[i].iov_len;
+  }
+
+  size_t size = 0;
+  size_t curIovSizeDone = 0;
+
+  while (size < total) {
+    ssize_t len =
+        iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs));
+    CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
+                   << " iovCnt=" << iovcnt
+                   << " iovs[curIov].base=" << iovs[curIov].iov_base
+                   << " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
+    size += len;
+
+    /// restore iovs[curIov] to the original value
+    iovs[curIov].iov_base =
+        (void*)((char*)iovs[curIov].iov_base - curIovSizeDone);
+    iovs[curIov].iov_len += curIovSizeDone;
+
+    len += curIovSizeDone;
+
+    while (curIov < iovcnt) {
+      if ((size_t)len < iovs[curIov].iov_len) break;
+      len -= iovs[curIov].iov_len;
+      ++curIov;
+    }
+    if (curIov < iovcnt) {
+      curIovSizeDone = len;
+      iovs[curIov].iov_base = (void*)((char*)iovs[curIov].iov_base + len);
+      iovs[curIov].iov_len -= len;
+    }
+  }
+  return size;
+}
+
+/// rdma::readv and rdma::writev can take advantage of RDMA blocking offload
+/// transfering
+size_t SocketChannel::writev(const std::vector<struct iovec>& iovs) {
+  if (tcpRdma_ == F_TCP)
+    return readwritev(::writev,
+                      tcpSocket_,
+                      const_cast<iovec*>(&iovs[0]),
+                      iovs.size(),
+                      UIO_MAXIOV,
+                      peerName_);
+  else
+    return readwritev(rdma::writev,
+                      rdmaSocket_,
+                      const_cast<iovec*>(&iovs[0]),
+                      iovs.size(),
+                      MAX_VEC_SIZE,
+                      peerName_);
+}
+
+size_t SocketChannel::readv(std::vector<struct iovec>* iovs) {
+  if (tcpRdma_ == F_TCP)
+    return readwritev(::readv,
+                      tcpSocket_,
+                      const_cast<iovec*>(&(*iovs)[0]),
+                      iovs->size(),
+                      UIO_MAXIOV,
+                      peerName_);
+  else
+    return readwritev(rdma::readv,
+                      rdmaSocket_,
+                      const_cast<iovec*>(&(*iovs)[0]),
+                      iovs->size(),
+                      MAX_VEC_SIZE,
+                      peerName_);
+}
+
+void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
+  MessageHeader header;
+  header.numIovs = userIovs.size();
+
+  std::vector<size_t> iovLengths;
+  iovLengths.reserve(userIovs.size());
+  for (auto& iov : userIovs) {
+    iovLengths.push_back(iov.iov_len);
+  }
+
+  std::vector<iovec> iovs;
+  iovs.reserve(userIovs.size() + 2);
+  iovs.push_back({&header, sizeof(header)});
+  iovs.push_back({&iovLengths[0],
+                  static_cast<size_t>(sizeof(iovLengths[0]) * header.numIovs)});
+  iovs.insert(iovs.end(), userIovs.begin(), userIovs.end());
+
+  header.totalLength = 0;
+  for (auto& iov : iovs) {
+    header.totalLength += iov.iov_len;
+  }
+
+  CHECK(writev(iovs) == (size_t)header.totalLength);
+}
+
+std::unique_ptr<MsgReader> SocketChannel::readMessage() {
+  MessageHeader header;
+
+  size_t len = read(&header, sizeof(header));
+  if (len == 0) {
+    return nullptr;
+  }
+
+  CHECK(len == sizeof(header));
+
+  std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs));
+
+  CHECK_EQ(msgReader->getTotalLength() + sizeof(header) +
+               msgReader->getNumBlocks() * sizeof(size_t),
+           (size_t)header.totalLength)
+      << " totalLength=" << msgReader->getTotalLength()
+      << " numBlocks=" << msgReader->getNumBlocks();
+  return msgReader;
+}
+
+MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks)
+    : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) {
+  size_t size = numBlocks * sizeof(blockLengths_[0]);
+  CHECK(channel_->read(&blockLengths_[0], size) == size);
+}
+
+void MsgReader::readBlocks(const std::vector<void*>& bufs) {
+  CHECK_LE(currentBlockIndex_ + bufs.size(), blockLengths_.size());
+  std::vector<iovec> iovs;
+  iovs.reserve(bufs.size());
+  size_t totalLength = 0;
+  for (void* buf : bufs) {
+    iovs.push_back({buf, getNextBlockLength()});
+    totalLength += getNextBlockLength();
+    ++currentBlockIndex_;
+  }
+
+  CHECK(channel_->readv(&iovs) == totalLength);
+}
+
+void MsgReader::readNextBlock(void* buf) {
+  CHECK_LT(currentBlockIndex_, blockLengths_.size());
+  CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
+  ++currentBlockIndex_;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/SocketChannel.h b/paddle/legacy/pserver/SocketChannel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7b3cd42f0aa32c3a74e14f87dbfe64d25473254
--- /dev/null
+++ b/paddle/legacy/pserver/SocketChannel.h
@@ -0,0 +1,153 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/utils/Util.h"
+
+#include <sys/uio.h>
+
+#include <memory>
+#include <vector>
+
+struct sxi_sock;
+
+namespace paddle {
+
+class SocketChannel;
+enum ChannelType {
+  F_TCP = 1,
+  F_RDMA = 2,
+};
+
+/// reading a set of blocks of data from SocketChannel.
+class MsgReader {
+ public:
+  MsgReader(SocketChannel* channel, size_t numIovs);
+  ~MsgReader() {
+    /// ensure all data blocks have been processed
+    CHECK_EQ(currentBlockIndex_, blockLengths_.size());
+  }
+  /**
+   * @brief number of remaining parts
+   */
+  size_t getNumBlocks() const {
+    return blockLengths_.size() - currentBlockIndex_;
+  }
+
+  /**
+   * @brief lenght of next block
+   */
+  size_t getNextBlockLength() const { return getBlockLength(0); }
+
+  /**
+   * @brief get the total length of all the remaining blocks
+   */
+  size_t getTotalLength() const {
+    size_t total = 0;
+    for (size_t i = currentBlockIndex_; i < blockLengths_.size(); ++i) {
+      total += blockLengths_[i];
+    }
+    return total;
+  }
+
+  /**
+   * @brief Get the length for block currentBlockIndex + i
+   */
+  size_t getBlockLength(size_t i) const {
+    return blockLengths_[currentBlockIndex_ + i];
+  }
+
+  /**
+   * @brief  read blocks data and store it to buf
+   */
+  void readBlocks(const std::vector<void*>& bufs);
+  void readNextBlock(void* buf);
+
+ protected:
+  SocketChannel* channel_;
+  std::vector<size_t> blockLengths_;
+  size_t currentBlockIndex_;
+};
+
+/// APIs for reading and writing byte stream data or naive iov data
+/// from the APIs both RDMA and TCP exhibits byte stream style
+class SocketChannel {
+ public:
+  SocketChannel(int socket, const std::string& peerName)
+      : tcpSocket_(socket), peerName_(peerName) {
+    tcpRdma_ = F_TCP;
+  }
+  SocketChannel(struct sxi_sock* socket, const std::string& peerName)
+      : rdmaSocket_(socket), peerName_(peerName) {
+    tcpRdma_ = F_RDMA;
+  }
+
+  ~SocketChannel();
+
+  const std::string& getPeerName() const { return peerName_; }
+
+  /**
+   * @brief read size bytes.
+   *
+   * @note  keep reading until getting size bytes or sock is closed
+   *        is closed
+   */
+  size_t read(void* buf, size_t size);
+
+  /**
+   * @brief write size bytes.
+   *
+   * @note  keep writing until writing size bytes or sock is closed
+   */
+  size_t write(const void* buf, size_t size);
+
+  /**
+   * @brief write a set of buffers.
+   *
+   * @note  keep writing until all buffers are written or sock is closed
+   */
+  size_t writev(const std::vector<struct iovec>& iov);
+
+  /**
+   * @brief read a set of buffers.
+   *
+   * @note  keep reading until all buffers are full or sock is closed.
+   */
+  size_t readv(std::vector<struct iovec>* iov);
+
+  /**
+   * @brief write a set of buffers.
+   *
+   * @note  keep writing until all buffers are passed or sock is closed
+   */
+  void writeMessage(const std::vector<struct iovec>& iov);
+
+  /// return null to indicate socket is closed
+  std::unique_ptr<MsgReader> readMessage();
+
+ protected:
+  struct MessageHeader {
+    int64_t totalLength;  /// include the header
+    int64_t numIovs;
+    int64_t iovLengths[0];
+  };
+
+  int tcpSocket_;
+  struct sxi_sock* rdmaSocket_;
+  const std::string peerName_;
+  enum ChannelType tcpRdma_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/SparseParameterDistribution.cpp b/paddle/legacy/pserver/SparseParameterDistribution.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3f17b228f0e5fd33b7e7db2afe1fb9421acc69c5
--- /dev/null
+++ b/paddle/legacy/pserver/SparseParameterDistribution.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include "paddle/legacy/utils/Flags.h"
+
+#include "SparseParameterDistribution.h"
+
+DEFINE_bool(check_sparse_distribution_in_pserver,
+            false,
+            "check whether sparse parameter exhibts balanced distribution at "
+            "all pservers");
+DEFINE_bool(show_check_sparse_distribution_log,
+            false,
+            "show logs details for sparse parameter distribution in pserver");
+DEFINE_int32(check_sparse_distribution_batches,
+             100,
+             "run sparse parameter distribution check for N batches");
+DEFINE_double(
+    check_sparse_distribution_ratio,
+    0.6,
+    "if parameters dispatched to different pservers exhibit unbalanced "
+    " distribution for check_sparse_distribution_ratio * "
+    " check_sparse_distribution_batches times, crash program");
+DEFINE_double(check_sparse_distribution_unbalance_degree,
+              2.0,
+              "the ratio of maximum data size and minimun data size for "
+              "different pserver");
+
+namespace paddle {
+
+SparseParameterDistribution::SparseParameterDistribution(size_t serviceNum) {
+  totBytes_ = 0;
+  data_.resize(serviceNum);
+
+  batchPassed_ = 0;
+  unbalanceCnt_ = 0;
+}
+
+void SparseParameterDistribution::probeDistribution(int serverId,
+                                                    size_t dataSize) {
+  if (!FLAGS_check_sparse_distribution_in_pserver ||
+      batchPassed_ > FLAGS_check_sparse_distribution_batches) {
+    return;
+  }
+
+  CHECK_LT((size_t)serverId, data_.size())
+      << "invalid sparse parameter distribution probe";
+
+  data_[serverId] += dataSize;
+  totBytes_ += dataSize;
+}
+
+void SparseParameterDistribution::checkAndResetDistribution() {
+  if (!FLAGS_check_sparse_distribution_in_pserver ||
+      batchPassed_ >= FLAGS_check_sparse_distribution_batches) {
+    return;
+  }
+
+  /// at runtime, prepareSendData is called by many contexts,
+  /// so need to check if data is avaiable.
+  if (!totBytes_) {
+    return;
+  }
+
+  /// check if distribution is balanced
+  auto avgSize = totBytes_ / data_.size();
+  auto unbalanceDegree = FLAGS_check_sparse_distribution_unbalance_degree;
+  for (auto& dataSize : data_) {
+    if (dataSize > unbalanceDegree * avgSize ||
+        dataSize * unbalanceDegree < avgSize) {
+      unbalanceCnt_++;
+      break;
+    }
+  }
+
+  auto printData = [&]() {
+    std::stringstream ss;
+    for (auto& dataSize : data_) {
+      ss << dataSize * 0.001 << "KB ";
+    }
+    ss << std::endl;
+    LOG(INFO) << ss.str();
+  };
+
+  /// show all sparse data size for different pserver
+  if (FLAGS_show_check_sparse_distribution_log) {
+    LOG(INFO) << "sparse distribution:";
+    printData();
+  }
+
+  totBytes_ = 0;
+  batchPassed_++;
+
+  if (batchPassed_ == FLAGS_check_sparse_distribution_batches) {
+    LOG(INFO) << "show last parameter distribution sample:";
+    printData();
+    LOG(INFO) << "total unbalanced batches: " << unbalanceCnt_
+              << " in passed batches: " << batchPassed_;
+    CHECK_LE((float)unbalanceCnt_ / (float)batchPassed_,
+             FLAGS_check_sparse_distribution_ratio)
+        << "unbalanced sparse parameter distribution for different pserver. "
+        << "it could be caused by unbalanced sparse ids distribution, try "
+        << "to shuffle dimensions in input samples";
+  }
+
+  std::fill(data_.begin(), data_.end(), 0);
+}
+}  // namespace paddle
diff --git a/paddle/legacy/pserver/SparseParameterDistribution.h b/paddle/legacy/pserver/SparseParameterDistribution.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee78029958f675d07ec0aba2d0c1ea92d664e8fd
--- /dev/null
+++ b/paddle/legacy/pserver/SparseParameterDistribution.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <unistd.h>
+
+#include <atomic>
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+/*
+ * if sparse_remote_updater is used, different ParameterServer could
+ * be assigned with unbalanced gradients. the parameter value from
+ * ParameterServer also be not balanced. the distribution of different
+ * dimensions of sparse ids determines the unbalanced degree of data
+ * distributed among all ParameterServers. Even distribution will
+ * benifits cluster efficiency.
+ * do check the unbalanced degree of gradients at runtime, crash program
+ * if unbalanced distribution exhibts by default.
+ */
+class SparseParameterDistribution {
+ public:
+  /// serviceNum means the number of ParameterServers
+  explicit SparseParameterDistribution(size_t serviceNum);
+  ~SparseParameterDistribution() {}
+  /// collect data
+  void probeDistribution(int serverId, size_t data);
+  void checkAndResetDistribution();
+
+ private:
+  std::vector<size_t> data_;
+  std::atomic<size_t> totBytes_;
+
+  /// after some batches, stop to check
+  int batchPassed_;
+
+  /// stat on unbalanced distribution found
+  int unbalanceCnt_;
+};
+}  // namespace paddle
diff --git a/paddle/pserver/test/.gitignore b/paddle/legacy/pserver/test/.gitignore
similarity index 100%
rename from paddle/pserver/test/.gitignore
rename to paddle/legacy/pserver/test/.gitignore
diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/legacy/pserver/test/CMakeLists.txt
similarity index 100%
rename from paddle/pserver/test/CMakeLists.txt
rename to paddle/legacy/pserver/test/CMakeLists.txt
diff --git a/paddle/legacy/pserver/test/SocketTest.cpp b/paddle/legacy/pserver/test/SocketTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3a781fcbf655b554e79fc753f3409d12f10f6646
--- /dev/null
+++ b/paddle/legacy/pserver/test/SocketTest.cpp
@@ -0,0 +1,256 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Util.h"
+
+#include <netdb.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <thread>
+
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/utils/Logging.h"
+
+struct MessageHeader {
+  int64_t dataLength;
+};
+
+class Thread {
+ public:
+  void start();
+  virtual void run() = 0;
+  virtual ~Thread() {}
+
+ protected:
+  std::unique_ptr<std::thread> thread_;
+};
+
+void Thread::start() {
+  thread_.reset(new std::thread([this]() { this->run(); }));
+}
+
+class SocketChannel {
+ public:
+  explicit SocketChannel(int socket) : socket_(socket) {}
+  int getSocketFd() const { return socket_; }
+  uint64_t readAll(void* buf, size_t size);
+  uint64_t writeAll(const void* buf, size_t size);
+
+ protected:
+  int socket_;
+};
+
+uint64_t SocketChannel::readAll(void* buf, size_t size) {
+  uint64_t total = 0;
+  while (total < size) {
+    int64_t len = read(socket_, (char*)buf + total, size - total);
+    if (len <= 0) {
+      return total;
+    }
+    total += len;
+  }
+  return total;
+}
+
+uint64_t SocketChannel::writeAll(const void* buf, size_t size) {
+  uint64_t total = 0;
+  while (total < size) {
+    int64_t len = write(socket_, (const char*)buf + total, size - total);
+    if (len <= 0) {
+      return total;
+    }
+    total += len;
+  }
+  return total;
+}
+
+class SocketWorker : public Thread {
+ public:
+  explicit SocketWorker(int socket) : channel_(socket) {}
+  virtual void run();
+
+  // read n bytes.
+  int64_t readAll(char* buf, size_t n);
+
+  // write n bytes
+
+ protected:
+  SocketChannel channel_;
+  std::string buffer_;
+};
+
+class SocketServer : public Thread {
+ public:
+  explicit SocketServer(int port)
+      : port_(port), socket_(0), maxPendingConnections_(100) {}
+
+  virtual void run();
+
+ protected:
+  int port_;
+  int socket_;
+  int maxPendingConnections_;
+};
+
+void SocketServer::run() {
+  int newsockfd;
+  socklen_t clilen;
+  struct sockaddr_in serv_addr, cli_addr;
+
+  /* First call to socket() function */
+  socket_ = socket(AF_INET, SOCK_STREAM, 0);
+  CHECK(socket_ >= 0) << "ERROR opening socket";
+
+  /* Initialize socket structure */
+  bzero((char*)&serv_addr, sizeof(serv_addr));
+  serv_addr.sin_family = AF_INET;
+  serv_addr.sin_addr.s_addr = INADDR_ANY;
+  serv_addr.sin_port = htons(port_);
+
+  /* Now bind the host address using bind() call.*/
+  CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
+      << "ERROR on binding";
+
+  /* Now start listening for the clients, here process will
+   * go in sleep mode and will wait for the incoming connection
+   */
+  listen(socket_, maxPendingConnections_);
+  clilen = sizeof(cli_addr);
+
+  while (true) {
+    /* Accept actual connection from the client */
+    newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen);
+    CHECK(newsockfd >= 0) << "ERROR on accept";
+
+    SocketWorker* worker = new SocketWorker(newsockfd);
+    worker->start();
+  }
+}
+
+void SocketWorker::run() {
+  MessageHeader header;
+
+  while (true) {
+    int64_t n = channel_.readAll(&header, sizeof(header));
+    CHECK(n == sizeof(header)) << "ERROR reading from socket";
+
+    buffer_.resize(header.dataLength);
+    n = channel_.readAll(&buffer_[0], header.dataLength);
+    CHECK(n == header.dataLength) << "ERROR reading from socket";
+
+    /* Write a response to the client */
+    n = channel_.writeAll(&header, sizeof(header));
+    CHECK(n == sizeof(header)) << "ERROR reading from socket";
+    n = channel_.writeAll(buffer_.data(), buffer_.size());
+    CHECK(n == header.dataLength) << "ERROR writing to socket";
+  }
+}
+
+class SocketClient {
+ public:
+  SocketClient(const std::string& serverAddr, int serverPort);
+  SocketChannel* getChannel() const { return channel_.get(); }
+
+ protected:
+  std::unique_ptr<SocketChannel> channel_;
+};
+
+SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
+  struct sockaddr_in serv_addr;
+  struct hostent* server;
+
+  // char buffer[256];
+
+  /* Create a socket point */
+  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
+  CHECK(sockfd >= 0) << "ERROR opening socket";
+  server = gethostbyname(serverAddr.c_str());
+  CHECK(server) << "ERROR, no such host: " << serverAddr;
+
+  bzero((char*)&serv_addr, sizeof(serv_addr));
+  serv_addr.sin_family = AF_INET;
+  bcopy((char*)server->h_addr,
+        (char*)&serv_addr.sin_addr.s_addr,
+        server->h_length);
+  serv_addr.sin_port = htons(serverPort);
+
+  /* Now connect to the server */
+  CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
+      << "ERROR connecting";
+
+  channel_.reset(new SocketChannel(sockfd));
+}
+
+DEFINE_string(server_addr, "127.0.0.1", "Server address");
+DEFINE_int64(dim, 10000000, "Data size");
+DEFINE_int32(loop_time, 100000, "test loop time");
+
+using namespace paddle;  // NOLINT
+
+int main(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  SocketServer server(FLAGS_port);
+  server.start();
+  sleep(1);
+
+  SocketClient client(FLAGS_server_addr, FLAGS_port);
+
+  SocketChannel* channel = client.getChannel();
+
+  MessageHeader header;
+
+  uint64_t dataSize = FLAGS_dim * sizeof(real);
+
+#ifdef PADDLE_WITH_CUDA
+  GpuVector gpuParam(FLAGS_dim);
+  GpuVector gpuGrad(FLAGS_dim);
+#else
+  CpuVector gpuParam(FLAGS_dim);
+  CpuVector gpuGrad(FLAGS_dim);
+#endif
+  CpuVector cpuParam(FLAGS_dim);
+  CpuVector cpuGrad(FLAGS_dim);
+
+  gpuParam.rand();
+  gpuGrad.rand();
+  cpuParam.rand();
+  cpuGrad.rand();
+
+  for (int i = 0; i < FLAGS_loop_time; ++i) {
+    cpuGrad.copyFrom(gpuGrad);
+
+    header.dataLength = dataSize;
+    CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
+        << "Client write header error";
+
+    CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
+        << "Client write data error";
+
+    /* Now read server response */
+    CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
+        << "Client read header error";
+
+    CHECK_EQ((uint64_t)header.dataLength, dataSize);
+    CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
+        << "Client read data error";
+
+    gpuParam.copyFrom(cpuParam);
+
+    LOG_EVERY_N(INFO, 100) << "i=" << i;
+  }
+  exit(0);
+}
diff --git a/paddle/legacy/pserver/test/test_ParameterServer2.cpp b/paddle/legacy/pserver/test/test_ParameterServer2.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..542e80e046972be38d403bc3223f7e7fcd15e3f0
--- /dev/null
+++ b/paddle/legacy/pserver/test/test_ParameterServer2.cpp
@@ -0,0 +1,624 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/pserver/ParameterClient2.h>
+#include <paddle/legacy/pserver/ParameterServer2.h>
+#include <paddle/legacy/utils/Flags.h>
+#include <paddle/legacy/utils/Util.h>
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(num_gradient_servers);
+DEFINE_string(server_addr, "127.0.0.1", "assign server address");
+DEFINE_int32(server_cpu, 0, "assign server cpu");
+
+class ParameterServer2Tester : public ParameterServer2 {
+ public:
+  ParameterServer2Tester(std::string serverAddr,
+                         int port,
+                         int rdmaCpu = -1,
+                         bool sepSendAndRecv = false)
+      : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {}
+  virtual ~ParameterServer2Tester() {}
+  void setup() {
+    CHECK(ParameterServer2::init());
+
+    parameters_.clear();
+    clientConfigs_.clear();
+
+    clientConfigs_.resize(2);
+    {
+      ParameterConfig& config = clientConfigs_[0];
+      config.set_name("para0");
+      config.set_para_id(0);
+      config.set_size(10000);
+      config.set_device(-1);
+      config.set_learning_rate(1.0);
+      config.set_momentum(0.9);
+    }
+
+    {
+      ParameterConfig& config = clientConfigs_[1];
+      config.set_name("para1");
+      config.set_para_id(1);
+      config.set_size(5000);
+      config.set_device(-1);
+      config.set_learning_rate(0.5);
+      config.set_momentum(0.4);
+    }
+
+    for (auto& config : clientConfigs_) {
+      parameters_.emplace_back(new Parameter(config, /* useGpu= */ false));
+    }
+
+    size_t id = 0;
+    for (auto& para : parameters_) {
+      para->setID(id++);
+    }
+
+    CHECK(client_.init(parameters_));
+    OptimizationConfig optConfig;
+    optConfig.set_algorithm("async_sgd");
+    optConfig.set_batch_size(100);
+    optConfig.set_learning_rate(0.1);
+    client_.setConfig(optConfig);
+    client_.setParameter();
+  }
+
+  void setConfigTest();
+  void setStatusTest();
+  void sendParameterTest();
+  void sendDataTest(SendDataType type, size_t size);
+  void operationTest();
+  void mergeBlockSegmentTest();
+  void checkSegments(const BlockSegments& expected, const BlockSegments& segs);
+  void waitPassFinishTest();
+  void synchronizeTest();
+
+ protected:
+  ParameterClient2 client_;
+  vector<ParameterConfig> clientConfigs_;
+  vector<ParameterPtr> parameters_;
+};
+
+std::unique_ptr<ParameterServer2Tester> g_server;
+
+void ParameterServer2Tester::setConfigTest() {
+  setup();
+
+  for (auto& config : clientConfigs_) {
+    auto it = configMap_.find(config.para_id());
+    EXPECT_TRUE(it != configMap_.end());
+    auto& serverConfig = it->second;
+    EXPECT_EQ(config.name(), serverConfig.name());
+    EXPECT_EQ(config.size(), serverConfig.size());
+    EXPECT_EQ(config.learning_rate(), serverConfig.learning_rate());
+    EXPECT_EQ(config.momentum(), serverConfig.momentum());
+  }
+}
+
+void ParameterServer2Tester::setStatusTest() {
+  setup();
+  EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_NOT_SET));
+  client_.setStatus(PSERVER_STATUS_PARAMETER_READY);
+  EXPECT_EQ(PSERVER_STATUS_PARAMETER_READY, status_);
+  EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_PARAMETER_READY));
+}
+
+real sumVector(const CpuVector& vec) {
+  const real* data = vec.getData();
+  size_t dim = vec.getSize();
+  real sum = 0;
+  for (size_t i = 0; i < dim; ++i) {
+    sum += data[i];
+  }
+  return sum;
+}
+
+void ParameterServer2Tester::sendParameterTest() {
+  setup();
+
+  client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
+                                  PARAMETER_VALUE,
+                                  0,       // numSamples = 0
+                                  0,       // cost = 0
+                                  false);  // sendBackParameter = false
+
+  vector<ParameterPtr> parameterCopies;
+
+  for (auto& parameter : parameters_) {
+    parameterCopies.emplace_back(
+        new Parameter(parameter->getConfig(), /* useGpu= */ false));
+    parameterCopies.back()
+        ->getBuf(PARAMETER_VALUE)
+        ->copyFrom(*parameter->getBuf(PARAMETER_VALUE));
+  }
+
+  client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
+                                  PARAMETER_VALUE,
+                                  0,      // numSamples = 0
+                                  0,      // cost = 0
+                                  true);  // sendBackParameter = true
+
+  for (size_t i = 0; i != parameters_.size(); ++i) {
+    real* v1 = parameters_[i]->getBuf(PARAMETER_VALUE)->getData();
+    real* v2 = parameterCopies[i]->getBuf(PARAMETER_VALUE)->getData();
+    EXPECT_EQ(parameters_[i]->getSize(), parameterCopies[i]->getSize());
+    size_t size = parameters_[i]->getSize();
+    real sum1 = 0, sum2 = 0;
+    for (size_t j = 0; j < size; ++j) {
+      sum1 += v1[j];
+      sum2 += v2[j];
+    }
+    EXPECT_EQ(sum1, sum2);
+  }
+}
+
+void ParameterServer2Tester::sendDataTest(SendDataType type, size_t size) {
+  ParameterClient2 client1(true);
+  client1.init(parameters_);
+  ParameterClient2 client2(true);
+  client2.init(parameters_);
+  ParameterClient2 client3(true);
+  client3.init(parameters_);
+
+  ThreadWorker worker1;
+  ThreadWorker worker2;
+  ThreadWorker worker3;
+
+  double* testData1 = new double[size];
+  double* testData2 = new double[size];
+  double* testData3 = new double[size];
+  double* getDataExpect = new double[size];
+  double* getDataReal = new double[size];
+  for (size_t i = 0; i < size; ++i) {
+    testData1[i] = rand();  // NOLINT TODO(yuyang18): Use rand_r instead.
+    testData2[i] = rand();  // NOLINT
+    testData3[i] = rand();  // NOLINT
+    getDataExpect[i] = testData1[i] + testData2[i] + testData3[i];
+  }
+
+  auto put1 = [&]() {
+    LOG(INFO) << "putOwnData1 start";
+    client1.putOwnData(0, type, testData1, size);
+    LOG(INFO) << "putOwnData1 finish";
+  };
+
+  auto get1 = [&]() {
+    LOG(INFO) << "sendData1 get all start";
+    client1.getAllData(0, type, getDataReal, size);
+    for (size_t i = 0; i < size; ++i) {
+      CHECK_EQ(getDataReal[i], getDataExpect[i]);
+    }
+    LOG(INFO) << "sendData1 get all finish";
+  };
+
+  auto put2 = [&]() {
+    LOG(INFO) << "putOwnData2 start";
+    client2.putOwnData(1, type, testData2, size);
+    LOG(INFO) << "putOwnData2 finish";
+  };
+
+  auto put3 = [&]() {
+    LOG(INFO) << "putOwnData3 start";
+    client3.putOwnData(2, type, testData3, size);
+    LOG(INFO) << "putOwnData3 finish";
+  };
+
+  worker1.addJob(put1);
+  worker1.addJob(get1);
+  worker2.addJob(put2);
+  worker3.addJob(put3);
+
+  worker1.addJob(put1);
+  worker2.addJob(put2);
+  worker3.addJob(put3);
+  worker1.addJob(get1);
+
+  worker1.wait();
+  worker2.wait();
+  worker3.wait();
+  free(testData1);
+  free(testData2);
+  free(testData3);
+  free(getDataExpect);
+  free(getDataReal);
+}
+
+void ParameterServer2Tester::operationTest() {
+  PServerVector v1, v2;
+  v1 = client_.createVector();
+  EXPECT_EQ(NUM_PARAMETER_TYPES, v1.handle);
+
+  v2 = client_.createVector();
+  EXPECT_EQ(NUM_PARAMETER_TYPES + 1, v2.handle);
+
+  PreparedOperations ops;
+  ops.addOperation(PSERVER_OP_RESET, v1, (real)1);
+  ops.addOperation(PSERVER_OP_RESET, v2, (real)2);
+
+  real res1, res2, res3;
+  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res1);
+
+  ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1);
+  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res2);
+
+  ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1);
+  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res3);
+  client_.doOperation(ops, false, false);
+
+  EXPECT_EQ(30000, res1);
+  EXPECT_EQ(15000, res2);
+  EXPECT_EQ(0, res3);
+
+  PServerMatrix m1, m2;
+  m1 = client_.createMatrix(4);
+  EXPECT_EQ(0, m1.handle);
+  m2 = client_.createMatrix(8);
+  EXPECT_EQ(1, m2.handle);
+
+  // TODO(yuyang18): add tests for other operations OP_COPY, OP_au
+
+  client_.releaseVector(v1);
+  client_.releaseVector(v2);
+  client_.releaseMatrix(m1);
+  client_.releaseMatrix(m2);
+}
+
+void ParameterServer2Tester::checkSegments(const BlockSegments& expected,
+                                           const BlockSegments& segs) {
+  EXPECT_EQ(expected.size(), segs.size());
+  if (expected.size() != segs.size()) {
+    return;
+  }
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(expected[i], segs[i]);
+  }
+}
+
+void ParameterServer2Tester::mergeBlockSegmentTest() {
+  {
+    BlockSegments segs{{10, 20}, {30, 45}, {50, 70}};
+    mergeSegments(&segs);
+    checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs);
+  }
+  {
+    BlockSegments segs{{30, 45}, {50, 70}, {10, 20}};
+    mergeSegments(&segs);
+    checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs);
+  }
+  {
+    BlockSegments segs{{30, 45}, {50, 70}, {10, 30}};
+    mergeSegments(&segs);
+    checkSegments({{10, 45}, {50, 70}}, segs);
+  }
+  {
+    BlockSegments segs{{30, 45}, {10, 70}, {10, 30}};
+    mergeSegments(&segs);
+    checkSegments({{10, 70}}, segs);
+  }
+  {
+    BlockSegments segs{{30, 45}, {50, 70}, {10, 35}};
+    mergeSegments(&segs);
+    checkSegments({{10, 45}, {50, 70}}, segs);
+  }
+  {
+    BlockSegments segs{{30, 45}, {50, 70}, {10, 60}};
+    mergeSegments(&segs);
+    checkSegments({{10, 70}}, segs);
+  }
+  {
+    BlockSegments segs{{30, 45}, {50, 70}, {30, 47}};
+    mergeSegments(&segs);
+    checkSegments({{30, 47}, {50, 70}}, segs);
+  }
+}
+
+void ParameterServer2Tester::waitPassFinishTest() {
+  ParameterClient2 client1;
+  ParameterClient2 client2;
+  ParameterClient2 client3;
+
+  ThreadWorker worker1;
+  ThreadWorker worker2;
+  ThreadWorker worker3;
+
+  auto init1 = [&]() {
+    LOG(INFO) << "init1 start";
+    client1.init(parameters_);
+    LOG(INFO) << "init1 finish";
+  };
+
+  auto init2 = [&]() {
+    LOG(INFO) << "init2 start";
+    client2.init(parameters_);
+    LOG(INFO) << "init2 finish";
+  };
+
+  auto init3 = [&]() {
+    LOG(INFO) << "init3 start";
+    client3.init(parameters_);
+    LOG(INFO) << "init3 finish";
+  };
+
+  auto update1 = [&]() {
+    LOG(INFO) << "update1 start";
+    client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
+                                    PARAMETER_VALUE,
+                                    0,      // numSamples = 0
+                                    0,      // cost = 0
+                                    true);  // sendBackParameter = false
+    LOG(INFO) << "update1 finish";
+  };
+
+  auto wait1 = [&]() {
+    LOG(INFO) << "wait1 start";
+    client1.waitPassFinish();
+    LOG(INFO) << "wait1 finish";
+  };
+
+  auto update2 = [&]() {
+    LOG(INFO) << "update2 start";
+    client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
+                                    PARAMETER_VALUE,
+                                    0,      // numSamples = 0
+                                    0,      // cost = 0
+                                    true);  // sendBackParameter = false
+    LOG(INFO) << "update2 finish";
+  };
+
+  auto wait2 = [&]() {
+    LOG(INFO) << "wait2 start";
+    client2.waitPassFinish();
+    LOG(INFO) << "wait2 finish";
+  };
+
+  auto op3 = [&]() {
+    LOG(INFO) << "op3 start";
+    PreparedOperations ops;
+    ops.addOperation(PSERVER_OP_SGD);
+    client3.doOperation(ops,
+                        /* waitForGradient= */ true,
+                        /* sendBackarameter= */ true);
+    LOG(INFO) << "op3 finish";
+  };
+
+  worker1.addJob(init1);
+  worker2.addJob(init2);
+  worker3.addJob(init3);
+
+  worker1.addJob(update1);
+  worker2.addJob(update2);
+  worker3.addJob(op3);
+
+  worker3.addJob(op3);
+  worker3.addJob(op3);
+  worker2.addJob(update2);
+  worker2.addJob(update2);
+  worker1.addJob(wait1);
+
+  worker2.addJob(wait2);
+  worker3.addJob(op3);
+
+  worker1.wait();
+  worker2.wait();
+  worker3.wait();
+
+  LOG(INFO) << "Pass 1 finished";
+
+  worker1.addJob(update1);
+  worker2.addJob(update2);
+  worker3.addJob(op3);
+
+  worker1.wait();
+  worker2.wait();
+  worker3.wait();
+
+  worker3.addJob(op3);
+  worker3.addJob(op3);
+  worker1.addJob(update1);
+  worker1.addJob(wait1);
+  worker2.addJob(wait2);
+
+  worker1.wait();
+  worker2.wait();
+  worker3.wait();
+
+  LOG(INFO) << "Pass 2 finished";
+}
+
+void ParameterServer2Tester::synchronizeTest() {
+  ParameterClient2 client1;
+  ParameterClient2 client2;
+
+  ThreadWorker worker1;
+  ThreadWorker worker2;
+
+  FLAGS_log_period_server = 2;
+
+  auto init1 = [&]() {
+    LOG(INFO) << "init1 start";
+    client1.init(parameters_);
+    client1.setTrainerId(0);
+    LOG(INFO) << "init1 finish";
+  };
+
+  auto init2 = [&]() {
+    LOG(INFO) << "init2 start";
+    client2.init(parameters_);
+    client2.setTrainerId(1);
+    LOG(INFO) << "init2 finish";
+  };
+
+  auto update1 = [&]() {
+    LOG(INFO) << "update1 start";
+    client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD,
+                                    PARAMETER_VALUE,
+                                    0,      // numSamples = 0
+                                    0,      // cost = 0
+                                    true);  // sendBackParameter = false
+    LOG(INFO) << "update1 finish";
+  };
+
+  auto wait1 = [&]() {
+    LOG(INFO) << "wait1 start";
+    client1.asyncFinishPass();
+    LOG(INFO) << "wait1 finish";
+  };
+
+  auto update2 = [&]() {
+    LOG(INFO) << "update2 start";
+    client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD,
+                                    PARAMETER_VALUE,
+                                    0,      // numSamples = 0
+                                    0,      // cost = 0
+                                    true);  // sendBackParameter = false
+    LOG(INFO) << "update2 finish";
+  };
+
+  auto wait2 = [&]() {
+    LOG(INFO) << "wait2 start";
+    client2.asyncFinishPass();
+    LOG(INFO) << "wait2 finish";
+  };
+
+  worker1.addJob(init1);
+  worker2.addJob(init2);
+  // call wait to reset some stats at pserver
+  worker1.addJob(wait1);
+  worker2.addJob(wait2);
+
+  worker1.addJob(update1);
+  worker2.addJob(update2);
+
+  worker2.addJob(update2);
+  worker2.addJob(update2);
+  worker1.addJob(wait1);
+
+  worker2.addJob(wait2);
+
+  worker1.wait();
+  worker2.wait();
+  LOG(INFO) << "Pass 1 finished";
+
+  worker1.addJob(update1);
+  worker2.addJob(update2);
+
+  worker1.wait();
+  worker2.wait();
+
+  worker1.addJob(update1);
+  worker2.addJob(update2);
+  worker1.addJob(update1);
+  worker1.addJob(update1);
+  worker1.addJob(update1);
+  worker1.addJob(update1);
+  worker1.addJob(update1);
+  worker1.addJob(update1);
+  worker1.addJob(wait1);
+  worker2.addJob(wait2);
+
+  worker1.wait();
+  worker2.wait();
+  LOG(INFO) << "Pass 2 finished";
+}
+
+TEST(ParameterServer2, sendParameter) { g_server->sendParameterTest(); }
+
+TEST(ParameterServer2, setConfig) { g_server->setConfigTest(); }
+
+TEST(ParameterServer2, setStatus) { g_server->setStatusTest(); }
+
+TEST(ParameterServer2, operation) { g_server->operationTest(); }
+
+TEST(ParameterServer2, mergeBlockSegment) { g_server->mergeBlockSegmentTest(); }
+
+TEST(ParameterServer2, waitPassFinish) { g_server->waitPassFinishTest(); }
+
+TEST(ParameterServer2, synchronize) { g_server->synchronizeTest(); }
+
+TEST(ParameterServer2, sendData) {
+  // Set gserver and pserver all 3, so that the test is sufficient.
+  int oldFlagsPortsNUm = FLAGS_ports_num;
+  int oldFlagsNumGradientServers = FLAGS_num_gradient_servers;
+  int oldFlagsPort = FLAGS_port;
+  FLAGS_ports_num = 3;
+  FLAGS_num_gradient_servers = 3;
+  FLAGS_port = FLAGS_port + 1;
+  std::unique_ptr<ParameterServer2Tester> g_server1;
+  std::unique_ptr<ParameterServer2Tester> g_server2;
+  std::unique_ptr<ParameterServer2Tester> g_server3;
+  if (FLAGS_rdma_tcp == "rdma") {
+    g_server1.reset(new ParameterServer2Tester(
+        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
+    g_server1->start();
+    g_server2.reset(new ParameterServer2Tester(
+        FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1));
+    g_server2->start();
+    g_server3.reset(new ParameterServer2Tester(
+        FLAGS_server_addr, FLAGS_port + 2, FLAGS_server_cpu + 2));
+    g_server3->start();
+  } else {  // tcp
+    g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port));
+    g_server1->start();
+    g_server2.reset(
+        new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 1));
+    g_server2->start();
+    g_server3.reset(
+        new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 2));
+    g_server3->start();
+  }
+
+  g_server2->init();
+  g_server3->init();
+  sleep(2);
+  g_server1->setup();
+  g_server1->sendDataTest(DATA_REDUCE_SUM, 1 << 24);
+  sleep(2);
+  g_server1->sendDataTest(DATA_REDUCE_SUM, 2);
+  sleep(2);
+  g_server1.reset();
+  g_server2.reset();
+  g_server3.reset();
+
+  FLAGS_ports_num = oldFlagsPortsNUm;
+  FLAGS_num_gradient_servers = oldFlagsNumGradientServers;
+  FLAGS_port = oldFlagsPort;
+}
+
+int main(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+
+  FLAGS_num_gradient_servers = 2;
+
+  if (FLAGS_rdma_tcp == "rdma") {
+    g_server.reset(new ParameterServer2Tester(
+        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
+  } else {
+    g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port));
+  }
+
+  g_server->start();
+
+  sleep(2);
+
+  int ret = RUN_ALL_TESTS();
+
+  g_server.reset();
+
+  exit(ret);
+}
diff --git a/paddle/legacy/pserver/test/test_ProtoServer.cpp b/paddle/legacy/pserver/test/test_ProtoServer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f7ab2e8af45f97a6537d41ca1afe51a4d3270b80
--- /dev/null
+++ b/paddle/legacy/pserver/test/test_ProtoServer.cpp
@@ -0,0 +1,169 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+#include "ParameterService.pb.h"
+#include "paddle/legacy/math/Vector.h"
+#include "paddle/legacy/pserver/ProtoServer.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+
+DEFINE_string(server_addr, "127.0.0.1", "Server address");
+DEFINE_int64(dim, 50000000, "Data size");
+DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
+DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
+
+using namespace paddle;  // NOLINT
+
+class MyServer : public ProtoServer {
+ public:
+  explicit MyServer(int port, int rdmaCpu = -1)
+      : ProtoServer(FLAGS_server_addr, port, rdmaCpu),
+        status_(PSERVER_STATUS_NOT_SET) {
+    REGISTER_SERVICE_FUNCTION(MyServer, getStatus);
+    REGISTER_SERVICE_FUNCTION(MyServer, setStatus);
+    REGISTER_SERVICE_FUNCTION_EX(MyServer, getStatusEx);
+  }
+  void getStatus(const GetStatusRequest& request,
+                 ProtoResponseCallback callback) {
+    (void)request;
+    GetStatusResponse response;
+    response.set_status(status_);
+    callback(response);
+  }
+
+  void getStatusEx(const GetStatusRequest& request,
+                   std::unique_ptr<MsgReader> msgReader,
+                   ProtoResponseCallbackEx callback) {
+    (void)request;
+    GetStatusResponse response;
+    response.set_status(status_);
+    buffer_.resize(msgReader->getNextBlockLength());
+    msgReader->readNextBlock(&buffer_[0]);
+    callback(response, {{&buffer_[0], buffer_.size()}});
+  }
+
+  void setStatus(const SetStatusRequest& request,
+                 ProtoResponseCallback callback) {
+    SetStatusResponse response;
+    status_ = request.status();
+    callback(response);
+  }
+
+ protected:
+  PServerStatus status_;
+  std::string buffer_;
+};
+
+TEST(ProtoServer, regular) {
+  ProtoClient* client;
+  if (FLAGS_rdma_tcp == "rdma")
+    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
+  else
+    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP);
+  {
+    GetStatusRequest request;
+    GetStatusResponse response;
+    auto msgReader = client->sendAndRecv("getStatus", request, &response);
+    EXPECT_EQ(response.status(), PSERVER_STATUS_NOT_SET);
+    EXPECT_EQ(msgReader->getNumBlocks(), (size_t)0);
+  }
+
+  {
+    SetStatusRequest request;
+    SetStatusResponse response;
+    request.set_status(PSERVER_STATUS_PARAMETER_READY);
+    client->sendAndRecv("setStatus", request, &response);
+  }
+
+  {
+    GetStatusRequest request;
+    GetStatusResponse response;
+    client->sendAndRecv("getStatus", request, &response);
+    EXPECT_EQ(response.status(), PSERVER_STATUS_PARAMETER_READY);
+  }
+
+  delete client;
+}
+
+TEST(ProtoServer, extended) {
+#ifdef PADDLE_WITH_CUDA
+  ProtoClient* client;
+  if (FLAGS_rdma_tcp == "rdma")
+    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
+  else
+    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP);
+  int64_t dataSize = FLAGS_dim * sizeof(real);
+
+  GpuVector gpuParam(FLAGS_dim);
+  GpuVector gpuGrad(FLAGS_dim);
+  CpuVector cpuParam(FLAGS_dim);
+  CpuVector cpuGrad(FLAGS_dim);
+
+  gpuParam.rand();
+  gpuGrad.rand();
+  cpuParam.rand();
+  cpuGrad.rand();
+
+  for (int k = 0; k < 4; ++k) {
+    for (int i = 0; i < 10; ++i) {
+      cpuGrad.copyFrom(gpuGrad);
+      if (FLAGS_test_proto_server) {
+        GetStatusRequest request;
+        GetStatusResponse response;
+        {
+          REGISTER_TIMER("sendAndRecv");
+          auto msgReader =
+              client->sendAndRecv("getStatusEx",
+                                  request,
+                                  {{cpuGrad.getData(), (size_t)dataSize}},
+                                  &response);
+
+          EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1);
+          EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize);
+          msgReader->readNextBlock(cpuParam.getData());
+        }
+        if (!FLAGS_benchmark) {
+          real* v1 = cpuGrad.getData();
+          real* v2 = cpuParam.getData();
+          real sum1 = 0, sum2 = 0;
+          for (int j = 0; j < FLAGS_dim; ++j) {
+            sum1 += v1[j];
+            sum2 += v2[j];
+          }
+          EXPECT_EQ(sum1, sum2);
+        }
+      }
+      gpuParam.copyFrom(cpuParam);
+
+      LOG_EVERY_N(INFO, 10) << "i=" << i;
+    }
+    globalStat.printAllStatus();
+    globalStat.reset();
+  }
+
+  delete client;
+#endif
+}
+
+int main(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1);
+  server.start();
+  usleep(10000);
+
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/pserver/test/test_ProtoServer.sh b/paddle/legacy/pserver/test/test_ProtoServer.sh
new file mode 100755
index 0000000000000000000000000000000000000000..1439350847308cc5590329b0fe2a6d2c77d04409
--- /dev/null
+++ b/paddle/legacy/pserver/test/test_ProtoServer.sh
@@ -0,0 +1,33 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+for ((port=12340;port<=12360;port++))
+do
+    port_used_num=`netstat -a |grep $port|wc -l`
+    if [ $port_used_num -eq 0 ]
+    then
+        echo $port;
+        legacy/pserver/test/test_ProtoServer --port=$port
+        if [ $? -eq 0 ]
+           then
+               exit 0
+           else
+               echo "test_ProtoServer run wrong"
+       	       exit 1
+        fi
+fi
+done
+echo "test_ProtoServer port not found"
+exit 1
diff --git a/paddle/legacy/trainer/CMakeLists.txt b/paddle/legacy/trainer/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6192de4388c8c3f5165fb88b443d372748f7a17e
--- /dev/null
+++ b/paddle/legacy/trainer/CMakeLists.txt
@@ -0,0 +1,73 @@
+# paddle trainer package
+
+set(TRAINER_SOURCES
+        ParameterUpdater.cpp
+        ParamUtil.cpp
+        RemoteParameterUpdater.cpp
+        NewRemoteParameterUpdater.cpp
+        Tester.cpp
+        Trainer.cpp
+        TrainerInternal.cpp
+        TrainerBenchmark.cpp
+        ThreadParameterUpdater.cpp
+        TrainerInternalConfig.cpp
+        TrainerConfigHelper.cpp)
+
+set(TRAINER_HEADERS
+        ParameterUpdater.h
+        ParamUtil.h
+        RemoteParameterUpdater.h
+        NewRemoteParameterUpdater.h
+        Tester.h
+        TesterConfig.h
+        Trainer.h
+        TrainerInternal.h
+        TrainerInternalConfig.h
+        ThreadParameterUpdater.h
+        TrainerConfigHelper.h)
+
+if(NOT WITH_GOLANG)
+  list(REMOVE_ITEM TRAINER_SOURCES
+          NewRemoteParameterUpdater.cpp)
+  list(REMOVE_ITEM TRAINER_HEADERS
+          NewRemoteParameterUpdater.h)
+endif()
+
+add_library(paddle_trainer_lib STATIC
+    ${TRAINER_SOURCES})
+
+add_dependencies(paddle_trainer_lib
+    paddle_proto
+    ${external_project_dependencies})
+
+macro(add_paddle_exe TARGET_NAME)
+  add_executable(${TARGET_NAME} ${ARGN})
+  link_paddle_exe(${TARGET_NAME})
+endmacro()
+
+if(WITH_TESTING)
+  add_subdirectory(tests)
+endif()
+
+if(NOT MOBILE_INFERENCE)
+  add_paddle_exe(paddle_trainer TrainerMain.cpp)
+  add_paddle_exe(paddle_merge_model MergeModel.cpp)
+
+  install(TARGETS paddle_trainer paddle_merge_model
+          RUNTIME DESTINATION opt/paddle/bin
+          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
+
+  set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+  set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+endif()
+
+if(APPLE)
+  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
+endif()
+
+if(WITH_GOLANG)
+  add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
+  target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
+  target_link_libraries(paddle_trainer paddle_pserver_cclient)
+endif(WITH_GOLANG)
diff --git a/paddle/legacy/trainer/MergeModel.cpp b/paddle/legacy/trainer/MergeModel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8a3601f192224a43687191527374149d99285ae0
--- /dev/null
+++ b/paddle/legacy/trainer/MergeModel.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+
+#include "ParamUtil.h"
+#include "Trainer.h"
+#include "paddle/legacy/pserver/ParameterServer2.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+
+DEFINE_string(model_dir, "", "Directory for separated model files");
+DEFINE_string(config_file, "", "Config file for the model");
+DEFINE_string(model_file, "", "File for merged model file");
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  initPython(argc, argv);
+
+  if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() ||
+      FLAGS_model_file.empty()) {
+    LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 "
+                 "--config_file=config.py --model_file=out.paddle";
+    return 0;
+  }
+
+  string confFile = FLAGS_config_file;
+#ifndef PADDLE_WITH_CUDA
+  FLAGS_use_gpu = false;
+#endif
+  auto config = std::make_shared<TrainerConfigHelper>(confFile);
+  unique_ptr<GradientMachine> gradientMachine(GradientMachine::create(*config));
+  gradientMachine->loadParameters(FLAGS_model_dir);
+
+  ofstream os(FLAGS_model_file);
+
+  string buf;
+  config->getConfig().SerializeToString(&buf);
+  int64_t size = buf.size();
+  os.write((char*)&size, sizeof(size));
+  CHECK(os) << "Fail to write to " << FLAGS_model_file;
+  os.write(buf.data(), buf.size());
+  vector<ParameterPtr>& parameters = gradientMachine->getParameters();
+  for (auto& para : parameters) {
+    para->save(os);
+    CHECK(os) << "Fail to write to " << FLAGS_model_file;
+  }
+  os.close();
+
+  return 0;
+}
diff --git a/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp b/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cdd832acd16e5c259a7f6463aac537e4e6537c97
--- /dev/null
+++ b/paddle/legacy/trainer/NewRemoteParameterUpdater.cpp
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NewRemoteParameterUpdater.h"
+#include "Trainer.h"
+#include "paddle/legacy/utils/Stat.h"
+
+DECLARE_int32(trainer_id);
+DECLARE_string(save_dir);
+
+namespace paddle {
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config, const std::string pserverSpec)
+    : trainerConfig_(config),
+      parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec) {}
+
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config,
+    const std::string pserverSpec,
+    const bool useEtcd)
+    : trainerConfig_(config),
+      parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec),
+      useEtcd_(useEtcd) {}
+
+void NewRemoteParameterUpdater::init(
+    const std::vector<ParameterPtr> &parameters) {
+  ParameterUpdater::init(parameters);
+
+  // create parameter server client.
+  if (useEtcd_) {
+    parameterClient_ =
+        paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str());
+  } else {
+    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
+                                                 FLAGS_trainer_id == 0);
+  }
+
+  // init new parameter and gradient.
+  newParameters_ = initNewParameter(PARAMETER_VALUE);
+  newGradients_ = initNewParameter(PARAMETER_GRADIENT);
+
+  // init parameter, one trainer will get the opportunity to int parameter and
+  // send them to parameter server. Others will get the initialized parameter
+  // from parameter server
+  if (paddle_begin_init_params(parameterClient_)) {
+    LOG(INFO) << "paddle_begin_init_params start";
+    // NOTE: convert V1 OptimizatioinConfig proto to V2 OptimizerConfig.
+    // This makes golang pserver compatible with handy V1 demos.
+    // TODO(wuyi): Refine or remove these ugly converting lines
+    OptimizerConfig optimizerConfigV2;
+    if (trainerConfig_.learning_method() == "momentum") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+    } else if (trainerConfig_.learning_method() == "adagrad") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
+      optimizerConfigV2.mutable_adagrad()->set_epsilon(
+          trainerConfig_.ada_epsilon());
+    } else if (trainerConfig_.learning_method() == "adadelta") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
+      optimizerConfigV2.mutable_adadelta()->set_epsilon(
+          trainerConfig_.ada_epsilon());
+      optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou());
+    } else if (trainerConfig_.learning_method() == "adam") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam);
+      optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1());
+      optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2());
+      optimizerConfigV2.mutable_adam()->set_epsilon(
+          trainerConfig_.adam_epsilon());
+    } else {
+      LOG(ERROR) << "got unsupported v1 optimizer config: "
+                 << trainerConfig_.learning_method();
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+    }
+
+    if (trainerConfig_.learning_rate_schedule() == "constant") {
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+    } else if (trainerConfig_.learning_rate_schedule() == "linear") {
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear);
+      optimizerConfigV2.mutable_linear_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a(
+          trainerConfig_.learning_rate_decay_a());
+      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b(
+          trainerConfig_.learning_rate_decay_b());
+    } else {
+      LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: "
+                 << trainerConfig_.learning_rate_schedule() << ", set to const";
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+    }
+
+    // overwrite optimizerConfigV2 for per-parameter(layer) configs
+    for (int i = 0; i < parameterSize(); ++i) {
+      // FIXME(typhoonzero): paramConfig always have default values,
+      // how to check if it's default?
+      // TODO(typhoonzero): log output: optimizerConfigV2.DebugString();
+      LOG(INFO) << "trainerConfig_: " << trainerConfig_.DebugString();
+      // send param and config to pserver
+      std::string bytes = optimizerConfigV2.SerializeAsString();
+      const char *array = bytes.data();
+      int size = (int)bytes.size();
+      paddle_init_param(
+          parameterClient_, *newParameters_[i], (void *)array, size);
+    }
+    paddle_finish_init_params(parameterClient_);
+    LOG(INFO) << "paddle_begin_init_params done";
+  } else {
+    paddle_get_params(parameterClient_, newParameters_, parameterSize());
+  }
+
+  LOG(INFO) << "NewRemoteParameterUpdater initialized";
+}
+
+void NewRemoteParameterUpdater::updateImpl(Parameter *para) {}
+
+void NewRemoteParameterUpdater::finishBatch(real cost) {
+  // send gradient to parameter server.
+  paddle_send_grads(parameterClient_, newGradients_, parameterSize());
+  // get the updated parameter from parameterClient.
+  paddle_get_params(parameterClient_, newParameters_, parameterSize());
+
+  // clear gradient after update parameter.
+  for (auto &para : parameters_) {
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+  }
+}
+
+void NewRemoteParameterUpdater::startPass() {}
+
+bool NewRemoteParameterUpdater::finishPass() { return true; }
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/NewRemoteParameterUpdater.h b/paddle/legacy/trainer/NewRemoteParameterUpdater.h
new file mode 100644
index 0000000000000000000000000000000000000000..707e9ceb9b6a22d265f9bf7b02af7f3002930fd4
--- /dev/null
+++ b/paddle/legacy/trainer/NewRemoteParameterUpdater.h
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <thread>
+#include "OptimizerConfig.pb.h"
+#include "ParameterUpdater.h"
+#include "libpaddle_pserver_cclient.h"
+#include "paddle/legacy/pserver/ParameterClient2.h"
+#include "paddle/legacy/utils/Queue.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * New remote parameter updater for dense parameters that use cclient of go.
+ */
+class NewRemoteParameterUpdater : public ParameterUpdater {
+ public:
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec);
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec,
+                            const bool useEtcd);
+  ~NewRemoteParameterUpdater() {
+    releaseNewParameter(newParameters_);
+    releaseNewParameter(newGradients_);
+    if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_);
+  }
+
+  /**
+   * initialize the internal parameter client and itself.
+   */
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+  /**
+   * @brief start batch
+   *
+   * @note  one batch training exhibits stateful feature to help
+   *        to do performance tuning, sgd optimization if necessary.
+   */
+  virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; }
+
+  /**
+   * send parameters to pservers and get returned parameters
+   * from all pservers if necessary.
+   */
+  virtual void finishBatch(real cost);
+  virtual void startPass();
+  virtual bool finishPass();
+
+ protected:
+  /**
+   * work need to do after finishBatch
+   */
+  virtual void updateImpl(Parameter* para);
+
+ private:
+  int parameterSize() { return (int)parameters_.size(); }
+
+  /**
+   * init parameter of go paddle pserver cclient.
+   * @param new_params
+   * @param type
+   */
+  paddle_parameter** initNewParameter(ParameterType type) {
+    paddle_parameter** new_params =
+        (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize());
+    for (int i = 0; i < parameterSize(); ++i) {
+      new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+      memset(new_params[i], 0, sizeof(paddle_parameter));
+    }
+
+    for (int i = 0; i < parameterSize(); ++i) {
+      ParameterPtr param = parameters_[i];
+      new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+      new_params[i]->name = (char*)param->getName().c_str();
+      new_params[i]->content =
+          (unsigned char*)(param->getBuf(type).get()->getData());
+      new_params[i]->content_len =
+          (int)param->getBuf(type).get()->getSize() * sizeof(real);
+    }
+    return new_params;
+  }
+
+  void releaseNewParameter(paddle_parameter** newParams) {
+    if (newParams != nullptr) {
+      for (int i = 0; i < parameterSize(); ++i) {
+        free(newParams[i]);
+      }
+      free(newParams);
+    }
+  }
+
+ protected:
+  const OptimizationConfig& trainerConfig_;
+  /// internal parameter client object for exchanging data with pserver
+  paddle_pserver_client parameterClient_;
+  /// the parameters for new pserver client
+  paddle_parameter** newParameters_;
+  /// the gradinets for new pserver client
+  paddle_parameter** newGradients_;
+  /// the specification of parameter server "host1:port,host1:port"
+  std::string pserverSpec_;
+  /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr
+  bool useEtcd_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/ParamUtil.cpp b/paddle/legacy/trainer/ParamUtil.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b5aba32dee1d07015ae3fce1cc76242b8ae80fe5
--- /dev/null
+++ b/paddle/legacy/trainer/ParamUtil.cpp
@@ -0,0 +1,163 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParamUtil.h"
+
+#include <fenv.h>
+#include <stdio.h>
+
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+
+#include <google/protobuf/text_format.h>
+#include <paddle/legacy/utils/Version.h>
+
+#include "paddle/legacy/utils/GlobalConstants.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include "TesterConfig.h"
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/legacy/gserver/layers/ValidationLayer.h"
+
+namespace paddle {
+
+ParameterUtil::ParameterUtil(
+    const std::shared_ptr<TrainerConfigHelper> &config,
+    std::unique_ptr<ParameterUtilConfig> &&intconfig,
+    const GradientMachinePtr &gradientMachine,
+    const std::shared_ptr<ParameterUpdater> &parameterUpdater) {
+  config_ = config;
+  intConfig_ = std::move(intconfig);
+  gserver_ = gradientMachine;
+  pUpdater_ = parameterUpdater;
+}
+
+bool ParameterUtil::loadParameters(int passId, bool local, bool remote) {
+  constexpr int kBufLen = 100;
+  char buf[kBufLen];
+  snprintf(buf, kBufLen, "pass-%05d", passId);
+  std::string doneFile = path::join(config_->getSaveDir(), buf, "done");
+  if (!fileExist(doneFile.c_str())) return false;
+  loadParametersWithPath(path::join(config_->getSaveDir(), buf), local, remote);
+  return true;
+}
+
+void ParameterUtil::loadParametersWithPath(const std::string &dir,
+                                           bool local,
+                                           bool remote) {
+  if (local) {
+    gserver_->loadParameters(dir);
+  }
+  if (remote && pUpdater_) {
+    pUpdater_->loadParametersRemote(dir);
+  }
+}
+
+void ParameterUtil::saveParametersOnePass(int passId, int passInnerId) {
+  pUpdater_->apply();
+  saveParameters(passId, passInnerId);
+  if (intConfig_->save_only_one_ && passId >= intConfig_->saving_period_) {
+    deleteParameters(passId - intConfig_->saving_period_);
+  }
+  pUpdater_->restore();
+}
+
+void ParameterUtil::saveParameters(int passId, int passInnerId) {
+  constexpr int kBufLen = 100;
+  char buf[kBufLen];
+  if (passInnerId > 0) {
+    snprintf(buf, kBufLen, "pass-%05d-%03d", passId, passInnerId);
+  } else {
+    snprintf(buf, kBufLen, "pass-%05d", passId);
+  }
+
+  std::string basePath = config_->getSaveDir();
+  if (basePath.find('/') == std::string::npos) {
+    basePath = "./" + basePath;
+  }
+  mkDirRecursively(basePath.c_str());
+
+  std::string saveDir = path::join(basePath, buf);
+  mkDir(saveDir.c_str());
+  if (!intConfig_->load_save_param_pserver_) {
+    pUpdater_->getParametersRemote(true /*full parameter*/,
+                                   true /*after apply*/);
+  }
+
+  gserver_->saveParameters(saveDir);
+  if (intConfig_->load_save_param_pserver_) {
+    pUpdater_->saveParametersRemote(saveDir);
+  }
+  std::string doneFile = path::join(saveDir, "done");
+  touchFile(doneFile.c_str());
+  std::ofstream out(doneFile);
+  version::printVersion(out);
+  out.close();
+  VLOG(1) << "save dir " << saveDir;
+  saveConfigWithPath(saveDir);
+}
+
+void ParameterUtil::deleteParameters(int passId, int passInnerId) {
+  constexpr int kBufLen = 100;
+  char buf[kBufLen];
+  const std::string &saveDir = config_->getSaveDir();
+  if (passInnerId > 0) {
+    snprintf(buf,
+             kBufLen,
+             "%s/pass-%05d-%03d",
+             saveDir.c_str(),
+             passId,
+             passInnerId);
+  } else {
+    snprintf(buf, kBufLen, "%s/pass-%05d", saveDir.c_str(), passId);
+  }
+  mkDir(saveDir.c_str());
+  LOG(INFO) << "delete dir " << buf;
+  rmDir(buf);
+}
+
+void ParameterUtil::saveConfigWithPath(const std::string &path) {
+  std::string src;
+  // save config in some path
+  if (!intConfig_->config_.empty()) {
+    src = intConfig_->config_;
+  } else {
+    bool ok;
+    src = config_->getConfigName(&ok);
+    if (!ok) {
+      return;
+    }
+  }
+  copyFileToPath(src, path);
+
+  // save other import config file name to path.txt
+  std::string ss = path::join(path, "path.txt");
+  std::ofstream os(ss);
+  std::string fileName = path::basename(src);
+  CHECK(os.write(fileName.c_str(), fileName.length()))
+      << "Fail to write config file name " << ss;
+  VLOG(1) << "fileName " << fileName;
+  os.close();
+
+  // copy other import config files
+  for (int i = 0; i < config_->getConfig().config_files_size(); ++i) {
+    copyFileToPath(config_->getConfig().config_files(i), path);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/ParamUtil.h b/paddle/legacy/trainer/ParamUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..07786967762a7b9267d190de5275f0f94bbd21ef
--- /dev/null
+++ b/paddle/legacy/trainer/ParamUtil.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/utils/Util.h"
+
+#include <stdio.h>
+
+#include "hl_gpu.h"
+#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+
+#include <stdlib.h>
+#include <fstream>
+#include "ParameterUpdater.h"
+#include "TrainerConfig.pb.h"
+#include "TrainerConfigHelper.h"
+
+namespace paddle {
+
+/**
+ * Configuration for parameter utils.
+ */
+struct ParameterUtilConfig {
+  DISABLE_COPY(ParameterUtilConfig);
+
+  ParameterUtilConfig(bool save_only_one,
+                      int saving_period,
+                      bool load_save_parameters_in_pserver,
+                      std::string config)
+      : save_only_one_(save_only_one),
+        saving_period_(saving_period),
+        load_save_param_pserver_(load_save_parameters_in_pserver),
+        config_(config) {}
+
+  bool save_only_one_;
+  int saving_period_;
+  bool load_save_param_pserver_;
+  std::string config_;
+};
+
+/**
+ * ParameterUtil
+ * Utility class for loading and saving parameters
+ */
+class ParameterUtil {
+ public:
+  /**
+   * Ctor.
+   *
+   * @param config
+   * @param intconfig
+   * @param gradientMachine
+   * @param parameterUpdater
+   * @return
+   */
+  ParameterUtil(const std::shared_ptr<TrainerConfigHelper> &config,
+                std::unique_ptr<ParameterUtilConfig> &&intconfig,
+                const GradientMachinePtr &gradientMachine,
+                const std::shared_ptr<ParameterUpdater> &parameterUpdater);
+
+  /// Load parameter from the saved parameter file as pass passId
+  /// if loadsave_parameters_in_pserver is set, some parameters MUST
+  /// load in pserver, which is "remote".
+  /// loadParameters can choose to load local/remote parameter, or both.
+  bool loadParameters(int passId, bool local = true, bool remote = false);
+
+  /// load parameters given path info
+  void loadParametersWithPath(const std::string &dir,
+                              bool local = true,
+                              bool remote = false);
+
+  /// Save parameter to dist for pass passId
+  /// passInnerId means saving times in one pass, some users want to
+  /// save parameters when have processed some batches in one pass
+  /// passInnerId = 0 means do not need to save in one inner pass
+  void saveParameters(int passId, int passInnerId = 0);
+
+  /// save parameters for one pass, when passInnerId > 0 means saving
+  /// the passInnerId times in one pass
+  void saveParametersOnePass(int passId, int passInnerId = 0);
+
+  /// delete parameter from disk via passId
+  void deleteParameters(int passId, int passInnerId = 0);
+
+  /// save config given path info
+  void saveConfigWithPath(const std::string &path);
+
+  /**
+   * Try to load parameter from config.
+   * @return true if can load from trainer config.
+   */
+  inline bool tryLoadParametersFromConfig() {
+    auto &c = config_->getConfig();
+    if (!c.init_model_path().empty()) {
+      loadParametersWithPath(c.init_model_path());
+      return true;
+    } else if (c.start_pass() > 0) {
+      CHECK(loadParameters(c.start_pass() - 1));
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+ private:
+  std::shared_ptr<TrainerConfigHelper> config_;
+  std::unique_ptr<ParameterUtilConfig> intConfig_;
+  GradientMachinePtr gserver_;
+  std::shared_ptr<ParameterUpdater> pUpdater_;
+};
+
+}  //  namespace paddle
diff --git a/paddle/legacy/trainer/ParameterUpdater.cpp b/paddle/legacy/trainer/ParameterUpdater.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..549fb0332da78053a261928b5558beb1ffbc79c5
--- /dev/null
+++ b/paddle/legacy/trainer/ParameterUpdater.cpp
@@ -0,0 +1,152 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterUpdater.h"
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include "paddle/legacy/utils/Thread.h"
+
+namespace paddle {
+
+static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1;
+static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2;
+
+SgdUpdaterWithCpuAverager::SgdUpdaterWithCpuAverager(
+    const OptimizationConfig& optConfig)
+    : SgdLocalUpdater(optConfig, false /*with averager*/) {
+  CHECK(FLAGS_use_gpu && optConfig.do_average_in_cpu());
+  averager_.reset(AverageOptimizer::create(optConfig,
+                                           new DummyOptimizer(optConfig),
+                                           false /*sparse*/,
+                                           true /*apply*/));
+  updateWorker_.addJob([]() { hl_set_device(FLAGS_gpu_id); });
+}
+
+void SgdUpdaterWithCpuAverager::init(
+    const std::vector<ParameterPtr>& parameters) {
+  SgdLocalUpdater::init(parameters);
+  averager_->init(parameters_.size(), nullptr);
+  copyEvents_.resize(parameters_.size());
+  for (auto& parameter : parameters) {
+    SetDevice device(parameter->getDeviceId());
+    cpuParameters_.emplace_back(new Parameter(parameter->getConfig(),
+                                              /* useGpu= */ false,
+                                              /* doInit= */ false));
+    if (parameter->useGpu()) {
+      cpuParameters_.back()->enableType(PARAMETER_APPLY);
+    } else {
+      cpuParameters_.back()->enableSharedType(
+          PARAMETER_APPLY, parameter->getBuf(PARAMETER_VALUE));
+    }
+    for (ParameterType type : averager_->getParameterTypes()) {
+      cpuParameters_.back()->enableType(type);
+    }
+
+    hl_create_event(&copyEvents_[nonStaticParaIDMap_[parameter->getID()]]);
+  }
+}
+
+SgdUpdaterWithCpuAverager::~SgdUpdaterWithCpuAverager() {
+  for (auto& event : copyEvents_) {
+    hl_destroy_event(event);
+  }
+}
+
+void SgdUpdaterWithCpuAverager::updateImpl(Parameter* para) {
+  SgdLocalUpdater::updateImpl(para);
+
+  if (para->useGpu()) {
+    size_t pid = nonStaticParaIDMap_[para->getID()];
+    Parameter* cpuPara = cpuParameters_[pid].get();
+    cpuPara->getBuf(PARAMETER_VALUE)
+        ->copyFrom(*para->getBuf(PARAMETER_VALUE), kDeviceToHostStream);
+    hl_stream_record_event(kDeviceToHostStream, copyEvents_[pid]);
+  }
+
+  updateWorker_.addJob(
+      std::bind(&SgdUpdaterWithCpuAverager::updateFunc, this, para));
+}
+
+void SgdUpdaterWithCpuAverager::updateFunc(Parameter* para) {
+  SetDevice setDevice(para->getDeviceId());
+  size_t pid = nonStaticParaIDMap_[para->getID()];
+  Parameter* cpuPara = cpuParameters_[pid].get();
+  if (para->useGpu()) {
+    hl_event_synchronize(copyEvents_[pid]);
+  }
+  averager_->update(cpuPara->getBufs(), cpuPara->getConfig(), -1LU);
+}
+
+void SgdUpdaterWithCpuAverager::finishBatch(real cost) {
+  SgdLocalUpdater::finishBatch(cost);
+
+  updateWorker_.wait();
+  for (auto para : cpuParameters_) {
+    if (auto callback = averager_->needSpecialTraversal(para->getConfig())) {
+      callback(para->getBufs(), para->getConfig(), -1LU);
+    }
+  }
+  averager_->finishBatch();
+}
+
+void SgdUpdaterWithCpuAverager::apply() {
+  // backup gpu value
+  for (auto& para : parameters_) {
+    SetDevice setDevice(para->getDeviceId());
+    para->getBuf(PARAMETER_GRADIENT)
+        ->copyFrom(*para->getBuf(PARAMETER_VALUE), kHostToDeviceStream);
+  }
+
+  // apply on cpu parameter
+  if (auto callback = averager_->apply()) {
+    for (auto para : cpuParameters_) {
+      callback(para->getBufs(), para->getConfig(), -1LU);
+    }
+  }
+
+  // copy to gpu value
+  for (auto& para : parameters_) {
+    SetDevice setDevice(para->getDeviceId());
+    size_t pid = nonStaticParaIDMap_[para->getID()];
+    Parameter* cpuPara = cpuParameters_[pid].get();
+    if (parameters_[pid]->useGpu()) {
+      para->getBuf(PARAMETER_VALUE)
+          ->copyFrom(*cpuPara->getBuf(PARAMETER_APPLY), kHostToDeviceStream);
+    }
+  }
+  hl_stream_synchronize(kHostToDeviceStream);
+  for (auto& para : parameters_) {
+    para->setValueUpdated();
+  }
+}
+
+void SgdUpdaterWithCpuAverager::restore() {
+  // restore on cpu parameter
+  if (auto callback = averager_->restore()) {
+    for (auto para : cpuParameters_) {
+      callback(para->getBufs(), para->getConfig(), -1LU);
+    }
+  }
+
+  // restore gpu value
+  for (auto& para : parameters_) {
+    SetDevice device(para->getDeviceId());
+    para->getBuf(PARAMETER_VALUE)->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+    para->setValueUpdated();
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/ParameterUpdater.h b/paddle/legacy/trainer/ParameterUpdater.h
new file mode 100644
index 0000000000000000000000000000000000000000..acddc3702d78fdb198973f70a8642c5192af992b
--- /dev/null
+++ b/paddle/legacy/trainer/ParameterUpdater.h
@@ -0,0 +1,265 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/utils/Thread.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include "paddle/legacy/parameter/AverageOptimizer.h"
+#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
+#include "paddle/legacy/parameter/OptimizerFunctions.h"
+#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/parameter/ParameterUpdaterBase.h"
+
+#include "TrainerConfig.pb.h"
+#include "paddle/legacy/gserver/layers/Layer.h"
+
+#include <memory>
+#include <vector>
+
+namespace paddle {
+
+/**
+ * @brief Parameter Updater for SGD, and local(not cluster) run.
+ */
+class SgdLocalUpdater : public ParameterUpdater {
+ public:
+  /**
+   * @brief Ctor. Initialize optimizer locally by optConfig.
+   * @param optConfig optimization config.
+   * @param withAverager with average optimizer or not, default is true.
+   */
+  explicit SgdLocalUpdater(const OptimizationConfig& optConfig,
+                           bool withAverager = true)
+      : numSamplesProcessed_(0) {
+    auto baseOptimizer = ParameterOptimizer::create(optConfig);
+    optimizer_.reset(withAverager
+                         ? AverageOptimizer::create(optConfig, baseOptimizer)
+                         : baseOptimizer);
+    CHECK(optimizer_) << "fail to create optimizer: "
+                      << optConfig.learning_method();
+    auto types = optimizer_->getParameterTypes();
+    for (auto type : types) {
+      addParameterType(type);
+    }
+  }
+
+  /**
+   * @brief Initialize parameters and optimizer_.
+   *        For example,
+   *           If optimizer need hassien vector, then parameter's hassien will
+   *           be initialized.
+   * @param parameters The parameter need to be initialized.
+   */
+  virtual void init(const std::vector<ParameterPtr>& parameters) {
+    ParameterUpdater::init(parameters);
+    optimizer_->init(parameters_.size(), nullptr);
+    // check no L1 decay in parameter configs
+    CHECK(std::find_if(parameters.begin(),
+                       parameters.end(),
+                       [](const ParameterPtr& para) {
+                         return para->getConfig().decay_rate_l1() > 0.0f;
+                       }) == parameters.end())
+        << "SgdLocalUpdater cannot support L1 decay in parameter";
+  }
+
+  /**
+   * @brief Start a batch with current mini-batch size
+   * @param current mini-batch size.
+   * @return Always PASS_TRAIN.
+   */
+  virtual PassType startBatch(int64_t batchSize) {
+    numSamplesProcessed_ += batchSize;
+    optimizer_->startBatch(numSamplesProcessed_);
+    return PASS_TRAIN;
+  }
+
+  /**
+   * @brief finish a mini-batch.
+   */
+  virtual void finishBatch(real cost) { optimizer_->finishBatch(); }
+
+  /**
+   * @brief start a pass.
+   */
+  virtual void startPass() { optimizer_->startPass(); }
+
+  /**
+   * @brief finish a pass.
+   * @param cost sum cost during one pass.
+   * @return true if accept (used for owlqn).
+   */
+  virtual bool finishPass() {
+    optimizer_->finishPass();
+    return ParameterUpdater::finishPass();
+  }
+
+  /**
+   * @brief apply model average.
+   */
+  virtual void apply() {
+    if (auto callback = optimizer_->apply()) {
+      for (auto para : parameters_) {
+        SetDevice device(para->getDeviceId());
+        callback(para->getBufs(), para->getConfig(), -1UL);
+      }
+    }
+  }
+
+  /**
+   * @brief restore parameter value before model average
+   */
+  virtual void restore() {
+    if (auto callback = optimizer_->restore()) {
+      for (auto para : parameters_) {
+        SetDevice device(para->getDeviceId());
+        callback(para->getBufs(), para->getConfig(), -1UL);
+      }
+    }
+  }
+
+ protected:
+  /**
+   * @brief update method. Update value from gradient.
+   * @param para parameter that will be updated.
+   */
+  virtual void updateImpl(Parameter* para) {
+    optimizer_->update(para->getBufs(), para->getConfig());
+    if (auto callback = optimizer_->needSpecialTraversal(para->getConfig())) {
+      callback(para->getBufs(), para->getConfig(), -1UL);
+    }
+
+    para->setValueUpdated();
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+  }
+
+  std::unique_ptr<ParameterOptimizer> optimizer_;
+
+  /**
+   * @brief total number of samples processed.
+   */
+  int64_t numSamplesProcessed_;
+};
+
+/**
+ * @brief SgdCpuUpdater is used only in recursive neural network
+ * @deprecated
+ */
+class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated {
+ public:
+  explicit SgdCpuUpdater(const OptimizationConfig& optConfig)
+      : SgdLocalUpdater(optConfig),
+        Deprecated(
+            "SgdCpuUpdater is used only in recursive neural network, "
+            "and recursive neural network is deprecated in paddle. "
+            "Use it all by your own.") {}
+
+  /**
+   * @brief update all parameter on finish batch.
+   * @param cost
+   */
+  virtual void finishBatch(real cost) {
+    for (auto para : parameters_) {
+      SgdLocalUpdater::update(para.get());
+    }
+    optimizer_->finishBatch();
+  }
+
+ protected:
+  /**
+   * @brief do nothing.
+   * @param para
+   */
+  virtual void updateImpl(Parameter* para) {}
+};
+
+/**
+ * @brief Sgd Local Updater With average in cpu.
+ *
+ * It will do model average in cpu to reduce gpu memory comsuption.
+ */
+class SgdUpdaterWithCpuAverager : public SgdLocalUpdater {
+ public:
+  /**
+   * @brief Ctor.
+   *
+   * SgdUpdaterWithCpuAverager will do everything as a
+   * SgdLocalUpdater, then copy parameter from GPU to CPU, and do model
+   * average in cpu.
+   */
+  explicit SgdUpdaterWithCpuAverager(const OptimizationConfig& optConfig);
+  ~SgdUpdaterWithCpuAverager();
+
+  /**
+   * @brief init. Initialize cpu parameters, model average optimizer.
+   * @param parameters
+   */
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+
+  virtual PassType startBatch(int64_t batchSize) {
+    averager_->startBatch(-1UL);
+    return SgdLocalUpdater::startBatch(batchSize);
+  }
+  virtual void finishBatch(real cost);
+
+  virtual void startPass() {
+    averager_->startPass();
+    SgdLocalUpdater::startPass();
+  }
+  virtual bool finishPass() {
+    averager_->finishPass();
+    return SgdLocalUpdater::finishPass();
+  }
+
+  /// apply the averaged parameter to PARAMETER_VALUE
+  /// use PARAETER_GRADIENT for backing up PARAMETER_VALUE
+  virtual void apply();
+
+  /**
+   * @brief Restore parameter before apply().
+   */
+  virtual void restore();
+
+ protected:
+  virtual void updateImpl(Parameter* para);
+
+  void updateFunc(Parameter* para);
+
+ protected:
+  std::unique_ptr<ParameterOptimizer> averager_;
+
+  /**
+   * @brief The thread worker which do model average.
+   *
+   * For each parameter, GPU->CPU parameter is async, and do model average in
+   * another thread. Because the training process don't need model average while
+   * training, and model average only used in evaluation stage and saving stage.
+   * So the model average is totally async.
+   */
+  ThreadWorker updateWorker_;
+
+  /**
+   * @brief The parameter mirror in cpu.
+   */
+  std::vector<ParameterPtr> cpuParameters_;
+
+  /**
+   * @brief GPU -> CPU copy event. Model average will wait after copy done.
+   */
+  std::vector<hl_event_t> copyEvents_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/RemoteParameterUpdater.cpp b/paddle/legacy/trainer/RemoteParameterUpdater.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5de1cc7827aa8f219de60fe9da67fbb0595eb1d5
--- /dev/null
+++ b/paddle/legacy/trainer/RemoteParameterUpdater.cpp
@@ -0,0 +1,843 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RemoteParameterUpdater.h"
+#include "Trainer.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+#include "paddle/legacy/utils/Stat.h"
+
+DECLARE_int32(trainer_id);
+DECLARE_string(save_dir);
+
+namespace paddle {
+
+static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1;
+static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2;
+static const int kFinishBatchPid = -1;
+
+const std::string RemoteParameterUpdater::kAverage = "average";
+const std::string RemoteParameterUpdater::kElasticAverage = "elastic_average";
+
+RemoteParameterUpdater::RemoteParameterUpdater(
+    const OptimizationConfig& config,
+    int expectedPassCount,
+    std::unique_ptr<ParameterUpdater>&& localUpdater)
+    : config_(config),
+      localUpdater_(std::move(localUpdater)),
+      numBatches_(0),
+      passCount_(0),
+      expectedPassCount_(expectedPassCount),
+      separateSendAndRecv_(false),
+      isFirstPass_(true),
+      useApplyInPserver_(false) {
+  addParameterType(PARAMETER_MOMENTUM);
+}
+
+void RemoteParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
+  ParameterUpdater::init(parameters);
+
+  if (localUpdater_) {
+    localUpdater_->init(parameters);
+
+    for (auto& parameter : parameters) {
+      parameter->enableType(PARAMETER_DELTA);
+    }
+
+    CHECK(config_.center_parameter_update_method() == kAverage ||
+          config_.center_parameter_update_method() == kElasticAverage)
+        << "unknown center_parameter_update_method";
+
+    // modify delta_add_rate
+    CHECK_GT(FLAGS_num_gradient_servers, 1)
+        << "FLAGS_num_gradient_servers should be set in trainer args.";
+    real delta_add_rate = config_.delta_add_rate() / FLAGS_num_gradient_servers;
+    config_.set_delta_add_rate(delta_add_rate);
+    LOG(INFO) << "center parameter in pserver,"
+              << " modify delta_add_rate=" << delta_add_rate;
+  }
+
+  if (!FLAGS_use_gpu) {
+    cpuParameters_ = parameters;
+  } else {
+    for (auto& parameter : parameters) {
+      cpuParameters_.emplace_back(new Parameter(parameter->getConfig(),
+                                                /* useGpu= */ false));
+      cpuParameters_.back()->setID(parameter->getID());
+      if (localUpdater_) {
+        cpuParameters_.back()->enableType(PARAMETER_DELTA);
+      }
+    }
+  }
+
+  parameterClient_.reset(new ParameterClient2(separateSendAndRecv_));
+  parameterClient_->init(cpuParameters_);
+  parameterClient_->setTrainerId(FLAGS_trainer_id);
+
+  if (FLAGS_trainer_id == 0) {
+    parameterClient_->setConfig(config_);
+    copyParametersFromDevice(PARAMETER_VALUE);
+    parameterClient_->setParameter();
+    parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY);
+  } else {
+    parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY);
+    parameterClient_->getParameter();
+    copyParametersToDevice(PARAMETER_VALUE);
+  }
+  if (FLAGS_trainer_id == 0 &&
+      (config_.algorithm() != TrainAlgorithm::AsyncSGD)) {
+    startController();
+    useApplyInPserver_ = useApplyInPserver(config_);
+  }
+}
+
+void RemoteParameterUpdater::startController() {
+  controllerThread_.reset(new std::thread([this]() { this->controller(); }));
+}
+
+void RemoteParameterUpdater::controller() {
+  ParameterClient2 client(false);
+  client.init(cpuParameters_);
+  while (true) {
+    /*start pass*/ {
+      client.waitPassStart();
+
+      PreparedOperations ops;
+      ops.addOperation(PSERVER_OP_START_PASS);
+      client.doOperation(ops,
+                         /* waitForGradient= */ false,
+                         /* sendBackarameter= */ false,
+                         /* releasePass= */ false);
+    }
+
+    while (true) {
+      PreparedOperations ops;
+      ops.addOperation(PSERVER_OP_SGD);
+      client.doOperation(ops,
+                         /* waitForGradient= */ true,
+                         /* sendBackarameter= */ true,
+                         /* releasePass= */ false);
+      if (client.isPassFinish()) {
+        break;
+      }
+    }
+
+    /*finish pass*/ {
+      PreparedOperations ops;
+      ops.addOperation(PSERVER_OP_FINISH_PASS);
+      client.doOperation(ops,
+                         /* waitForGradient= */ true,
+                         /* sendBackarameter= */ true,
+                         /* releasePass= */ true);
+    }
+
+    passCount_++;
+    if (passCount_ == expectedPassCount_) {
+      break;
+    }
+  }
+}
+
+void RemoteParameterUpdater::copyParametersToDevice(
+    ParameterType parameterType) {
+  if (!FLAGS_use_gpu) {
+    return;
+  }
+  int numParameters = cpuParameters_.size();
+  for (int i = 0; i < numParameters; ++i) {
+    parameters_[i]
+        ->getBuf(parameterType)
+        ->copyFrom(*cpuParameters_[i]->getBuf(parameterType));
+    if (parameterType == PARAMETER_VALUE) {
+      parameters_[i]->setValueUpdated();
+    }
+  }
+}
+
+void RemoteParameterUpdater::copyParametersFromDevice(
+    ParameterType parameterType) {
+  if (!FLAGS_use_gpu) {
+    return;
+  }
+  int numParameters = cpuParameters_.size();
+  for (int i = 0; i < numParameters; ++i) {
+    cpuParameters_[i]
+        ->getBuf(parameterType)
+        ->copyFrom(*parameters_[i]->getBuf(parameterType));
+  }
+}
+
+void RemoteParameterUpdater::updateImpl(Parameter* para) {
+  REGISTER_TIMER("update");
+  if (localUpdater_) {
+    localUpdater_->update(para);
+  }
+}
+
+void RemoteParameterUpdater::finishBatch(real cost) {
+  if (localUpdater_) {
+    localUpdater_->finishBatch(cost);
+  }
+
+  const std::string& algorithm = config_.algorithm();
+  ParameterUpdateMode mode;
+  if (algorithm == TrainAlgorithm::AsyncSGD) {
+    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
+  } else if (algorithm == TrainAlgorithm::SGD) {
+    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
+  } else {
+    LOG(FATAL) << "Unknown algorithm: " << algorithm;
+  }
+
+  ParameterType sendType;
+  bool sendBackParameter = true;
+  if (localUpdater_) {
+    ++numBatches_;
+    if (numBatches_ % config_.num_batches_per_send_parameter() != 0) {
+      return;
+    }
+
+    if (config_.center_parameter_update_method() == kElasticAverage) {
+      parameterClient_->getParameter(PARAMETER_DELTA);
+      copyParametersToDevice(PARAMETER_DELTA);
+      sendBackParameter = false;  // no need send back after send
+
+      // calc delta
+      for (auto& para : parameters_) {
+        // DELTA = LOCAL_VALUE - CENTER_VALUE/*store in DELTA*/
+        para->getBuf(PARAMETER_DELTA)
+            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
+
+        // when delta send to pserver, pserver will do:
+        // CENTER_VALUE += alpha * (LOCAL_VALUE - CENTER_VALUE)
+      }
+    } else {
+      // calc delta
+      for (auto& para : parameters_) {
+        // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/
+        para->getBuf(PARAMETER_DELTA)
+            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
+      }
+    }
+
+    sendType = PARAMETER_DELTA;
+
+  } else {
+    // In this case, we perform SGD on pserver.
+    sendType = PARAMETER_GRADIENT;
+  }
+
+  copyParametersFromDevice(sendType);
+
+  {
+    REGISTER_TIMER("sendAndRecv_dense");
+    parameterClient_->sendAndReceiveParameter(mode,
+                                              sendType,
+                                              batchSize_,
+                                              0,  // cost = 0
+                                              sendBackParameter);
+  }
+
+  if (sendBackParameter) {
+    copyParametersToDevice(PARAMETER_VALUE);
+  }
+
+  if (localUpdater_) {
+    if (config_.center_parameter_update_method() == kElasticAverage) {
+      for (auto& para : parameters_) {
+        SetDevice device(para->getDeviceId());
+        // LOCAL_VALUE += -alpha * (LOCAL_VALUE - CENTER_VALUE)
+        para->getBuf(PARAMETER_VALUE)
+            ->add(*para->getBuf(PARAMETER_DELTA), -config_.delta_add_rate());
+      }
+
+    } else {  // average
+      // copy value to delta
+      for (auto& para : parameters_) {
+        SetDevice device(para->getDeviceId());
+        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
+      }
+    }
+  } else {
+    for (auto& para : parameters_) {
+      SetDevice device(para->getDeviceId());
+      para->getBuf(sendType)->zeroMem();
+    }
+  }
+}
+
+void RemoteParameterUpdater::startPass() {
+  if (config_.algorithm() == TrainAlgorithm::SGD) {
+    parameterClient_->waitPassStart();
+  } else {
+    // sync could benifits reducing lagged trainer for async-sgd
+    // even if sync could not remove all lagged trainer for the
+    // sake of file loading, buffer etc.
+    parameterClient_->asyncStartPass();
+  }
+
+  if (localUpdater_) {
+    localUpdater_->startPass();
+    numBatches_ = 0;
+
+    if (config_.center_parameter_update_method() == kElasticAverage) {
+      if (!isFirstPass_) {
+        // restore local value from delta
+        for (auto& para : parameters_) {
+          SetDevice device(para->getDeviceId());
+          para->getBuf(PARAMETER_VALUE)
+              ->copyFrom(*para->getBuf(PARAMETER_DELTA));
+        }
+      }
+    } else {  // average
+      // copy value to delta
+      for (auto& para : parameters_) {
+        SetDevice device(para->getDeviceId());
+        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
+      }
+    }
+  }
+}
+
+bool RemoteParameterUpdater::finishPass() {
+  if (localUpdater_) {
+    localUpdater_->finishPass();
+  }
+
+  if (config_.algorithm() == TrainAlgorithm::SGD) {
+    parameterClient_->waitPassFinish();
+  } else {
+    parameterClient_->asyncFinishPass();
+  }
+  if (localUpdater_) {
+    if (config_.center_parameter_update_method() == kElasticAverage) {
+      // backup local value to delta as we will get
+      // the remote parameter for saving/testing
+      for (auto& para : parameters_) {
+        SetDevice device(para->getDeviceId());
+        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
+      }
+    }
+  }
+  parameterClient_->getParameter();
+  copyParametersToDevice(PARAMETER_VALUE);
+
+  isFirstPass_ = false;
+  return true;
+}
+
+void RemoteParameterUpdater::apply() {
+  if (useApplyInPserver_) {
+    PreparedOperations ops;
+    ops.addOperation(PSERVER_OP_APPLY);
+    parameterClient_->doOperation(ops,
+                                  /* waitForGradient= */ false,
+                                  /* sendBackarameter= */ false);
+    parameterClient_->getParameter(
+        /* recvParameterType= */ PARAMETER_VALUE,
+        /* sendBackParameterType= */ PARAMETER_APPLY);
+    copyParametersToDevice(PARAMETER_VALUE);
+  }
+}
+
+void RemoteParameterUpdater::restore() {
+  if (useApplyInPserver_) {
+    parameterClient_->getParameter();
+    copyParametersToDevice(PARAMETER_VALUE);
+  }
+}
+
+ConcurrentRemoteParameterUpdater::ConcurrentRemoteParameterUpdater(
+    OptimizationConfig config,
+    int passCount,
+    std::unique_ptr<ParameterUpdater>&& localUpdater)
+    : RemoteParameterUpdater(config, passCount, std::move(localUpdater)) {
+  sendThread_.reset(new std::thread([this]() { this->send(); }));
+  recvThread_.reset(new std::thread([this]() { this->recv(); }));
+
+  stopping_ = false;
+  oneBatchFinished_ = false;
+  separateSendAndRecv_ = true;
+}
+
+ConcurrentRemoteParameterUpdater::~ConcurrentRemoteParameterUpdater() {
+  stopping_ = true;
+  sendQueue_.enqueue(0);
+  sendThread_->join();
+  recvQueue_.enqueue(0);
+  recvThread_->join();
+}
+
+void ConcurrentRemoteParameterUpdater::finishBatch(real cost) {
+  if (localUpdater_) {
+    localUpdater_->finishBatch(cost);
+
+    if (!needToUpdateRemotely()) {
+      ++numBatches_;
+      return;
+    }
+  }
+
+  sendQueue_.enqueue(kFinishBatchPid);
+
+  finishBatchCond_.wait([this]() { return oneBatchFinished_; });
+  oneBatchFinished_ = false;
+  {
+    REGISTER_TIMER("sync_hostToDeviceStream");
+    for (auto& para : parameters_) {
+      SetDevice device(para->getDeviceId());
+      hl_stream_synchronize(kHostToDeviceStream);
+    }
+  }
+
+  if (localUpdater_) {
+    ++numBatches_;
+  }
+}
+
+// Use para=NULL to signal the end of one batch
+void ConcurrentRemoteParameterUpdater::send(Parameter* para) {
+  const std::string& algorithm = config_.algorithm();
+  ParameterUpdateMode mode;
+  if (algorithm == TrainAlgorithm::AsyncSGD) {
+    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
+  } else if (algorithm == TrainAlgorithm::SGD) {
+    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
+  } else {
+    LOG(FATAL) << "Unknown algorithm: " << algorithm;
+  }
+  ParameterType sendType;
+  if (localUpdater_) {
+    sendType = PARAMETER_DELTA;
+  } else {
+    // In this case, we perform SGD on pserver.
+    sendType = PARAMETER_GRADIENT;
+  }
+  std::vector<ParameterSegments> paraSegment;
+  if (para == NULL) {
+    parameterClient_->sendParameter(
+        mode,
+        sendType,
+        paraSegment,
+        batchSize_,
+        0,              // cost=0
+        true,           // sendBackParameter = true
+        batchStatus_);  // batchStatus_ = BATCH_FINISH
+
+  } else {
+    ParameterSegments paraSegTemp;
+    paraSegment.reserve(1);
+    paraSegTemp.name = para->getName();
+    paraSegTemp.id = para->getID();
+    paraSegment.push_back(paraSegTemp);
+    {
+      SetDevice device(para->getDeviceId());
+      REGISTER_TIMER("copySingleParaFromDevice");
+      copySingleParaFromDevice(para, sendType);
+      hl_stream_synchronize(kDeviceToHostStream);
+    }
+    parameterClient_->sendParameter(mode,
+                                    sendType,
+                                    paraSegment,
+                                    batchSize_,
+                                    0,     // cost=0
+                                    true,  // sendBackParameter = true
+                                    batchStatus_);
+    if (batchStatus_ == BATCH_START) batchStatus_ = BATCH_ON;
+  }
+}
+void ConcurrentRemoteParameterUpdater::recv(Parameter* para) {
+  parameterClient_->recvParameter();
+  if (para != NULL) {
+    REGISTER_TIMER("copySingleParaToDevice");
+    SetDevice device(para->getDeviceId());
+    copySingleParaToDevice(para, PARAMETER_VALUE);
+
+    if (localUpdater_) {
+      para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
+    } else {
+      // if cpu, parameter should not changes until recvParameter().
+      // if gpu, zero mem when send finish
+      if (!FLAGS_use_gpu) {
+        para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+      }
+    }
+  }
+}
+
+void ConcurrentRemoteParameterUpdater::recv() {
+  if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id);
+  StatPtr stat = getStat("recv");
+  FOR_TIMING(Timer timer);
+  while (true) {
+    int pid;
+    {
+      REGISTER_TIMER("recv_dequeue");
+      pid = recvQueue_.dequeue();
+    }
+    if (pid == kFinishBatchPid) {
+      Parameter* para = NULL;
+      FOR_TIMING(timer.start());
+      recv(para);
+      FOR_TIMING(timer.stop());
+      FOR_TIMING(stat->addSample(timer.get()));
+      FOR_TIMING(timer.reset());
+      finishBatchCond_.notify_all([this] { oneBatchFinished_ = true; });
+    } else {
+      if (stopping_) break;
+      Parameter* para = parameters_[pid].get();
+      FOR_TIMING(timer.start());
+      recv(para);
+      FOR_TIMING(timer.stop());
+      oneBatchFinished_ = false;
+    }
+  }
+}
+
+void ConcurrentRemoteParameterUpdater::send() {
+  if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id);
+  StatPtr stat = getStat("send");
+  FOR_TIMING(Timer timer);
+  while (true) {
+    int pid;
+    {
+      REGISTER_TIMER("send_dequeue");
+      pid = sendQueue_.dequeue();
+    }
+    if (pid == kFinishBatchPid) {
+      batchStatus_ = BATCH_FINISH;
+      if (!localUpdater_) {
+        // if cpu, parameter should not changes until recvParameter().
+        // if gpu, zeroMem() at the end of batch so that it won't
+        // interfere with computation.
+        if (FLAGS_use_gpu) {
+          REGISTER_TIMER("para_zeroMem");
+          for (auto& para : parameters_) {
+            SetDevice device(para->getDeviceId());
+            para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+          }
+        }
+      }
+      Parameter* para = NULL;
+      FOR_TIMING(timer.start());
+      send(para);
+      FOR_TIMING(timer.stop());
+      FOR_TIMING(stat->addSample(timer.get()));
+      FOR_TIMING(timer.reset());
+      recvQueue_.enqueue(pid);
+    } else {
+      if (stopping_) break;
+      Parameter* para = parameters_[pid].get();
+      if (localUpdater_) {
+        // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/
+        para->getBuf(PARAMETER_DELTA)
+            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
+      }
+      FOR_TIMING(timer.start());
+      send(para);
+      FOR_TIMING(timer.stop());
+      recvQueue_.enqueue(nonStaticParaIDMap_[para->getID()]);
+    }
+  }
+}
+
+void ConcurrentRemoteParameterUpdater::updateImpl(Parameter* para) {
+  REGISTER_TIMER("update");
+  if (localUpdater_) {
+    localUpdater_->update(para);
+    if (!needToUpdateRemotely()) {
+      return;
+    }
+  }
+  sendQueue_.enqueue(nonStaticParaIDMap_[para->getID()]);
+}
+
+void ConcurrentRemoteParameterUpdater::copySingleParaToDevice(
+    Parameter* para, ParameterType parameterType) {
+  if (!FLAGS_use_gpu) {
+    return;
+  }
+  int i = nonStaticParaIDMap_[para->getID()];
+  para->getBuf(parameterType)
+      ->copyFrom(*cpuParameters_[i]->getBuf(parameterType),
+                 kHostToDeviceStream);
+  if (parameterType == PARAMETER_VALUE) {
+    para->setValueUpdated();
+  }
+}
+
+void ConcurrentRemoteParameterUpdater::copySingleParaFromDevice(
+    Parameter* para, ParameterType parameterType) {
+  if (!FLAGS_use_gpu) {
+    return;
+  }
+  int i = nonStaticParaIDMap_[para->getID()];
+  cpuParameters_[i]
+      ->getBuf(parameterType)
+      ->copyFrom(*para->getBuf(parameterType), kDeviceToHostStream);
+}
+
+SparseRemoteParameterUpdater::SparseRemoteParameterUpdater(
+    const OptimizationConfig& config, int expectedPassCount, bool testing)
+    : config_(config),
+      passCount_(0),
+      expectedPassCount_(expectedPassCount),
+      testing_(testing),
+      useApplyInPserver_(false) {}
+
+void SparseRemoteParameterUpdater::init(
+    const std::vector<ParameterPtr>& parameters) {
+  ParameterUpdater::init(parameters);
+
+  parameterClient_.reset(new ParameterClient2(
+      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse));
+  parameterClient_->init(parameters_);
+  parameterClient_->setTrainerId(FLAGS_trainer_id);
+
+  if (FLAGS_trainer_id == 0) {
+    parameterClient_->setConfig(
+        config_, FLAGS_save_dir, true /*is_sparse_server*/);
+    if (parameters[0]->isFullSize()) {
+      parameterClient_->setParameter();
+    } else {  // init in pserver
+      parameterClient_->setParameterZero();
+    }
+  }
+  if (FLAGS_trainer_id == 0 && !testing_ &&
+      config_.algorithm() == TrainAlgorithm::SGD) {
+    startController();
+    useApplyInPserver_ = useApplyInPserver(config_);
+  }
+}
+
+void SparseRemoteParameterUpdater::startController() {
+  controllerThread_.reset(new std::thread([this]() { this->controller(); }));
+}
+
+void SparseRemoteParameterUpdater::controller() {
+  ParameterClient2 client(
+      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse);
+  client.init(parameters_);
+
+  while (true) {
+    /*start pass*/ {
+      client.waitPassStart();
+
+      PreparedOperations ops;
+      ops.addOperation(PSERVER_OP_START_PASS);
+      client.doOperation(ops,
+                         /* waitForGradient= */ false,
+                         /* sendBackarameter= */ false,
+                         /* releasePass= */ false);
+    }
+
+    while (true) {
+      PreparedOperations ops;
+      ops.addOperation(PSERVER_OP_SGD);
+      client.doOperation(ops,
+                         /* waitForGradient= */ true,
+                         /* sendBackarameter= */ true,
+                         /* releasePass= */ false);
+      if (client.isPassFinish()) {
+        break;
+      }
+    }
+
+    /*finish pass*/ {
+      PreparedOperations ops;
+      ops.addOperation(PSERVER_OP_FINISH_PASS);
+      client.doOperation(ops,
+                         /* waitForGradient= */ true,
+                         /* sendBackarameter= */ true,
+                         /* releasePass= */ true);
+    }
+
+    passCount_++;
+    if (passCount_ == expectedPassCount_) {
+      break;
+    }
+  }
+}
+
+PassType SparseRemoteParameterUpdater::startBatch(int64_t batchSize) {
+  batchSize_ = batchSize;
+  return PASS_TRAIN;
+}
+
+void SparseRemoteParameterUpdater::finishBatch(real cost) {
+  const std::string& algorithm = config_.algorithm();
+  ParameterUpdateMode mode;
+  if (algorithm == TrainAlgorithm::AsyncSGD) {
+    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
+  } else if (algorithm == TrainAlgorithm::SGD) {
+    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
+  } else {
+    LOG(FATAL) << "Unknown algorithm: " << algorithm;
+  }
+
+  ParameterType sendType = PARAMETER_GRADIENT;
+
+  REGISTER_TIMER("sendSparseParam");
+  parameterClient_->sendAndReceiveParameter(mode,
+                                            sendType,
+                                            batchSize_,
+                                            0,       // cost = 0
+                                            false);  // sendBackParameter
+
+  // grad zero move to sgd grad machine, before merge grad sparse remote
+}
+
+void SparseRemoteParameterUpdater::startPass() {
+  if (config_.algorithm() == TrainAlgorithm::SGD) {
+    parameterClient_->waitPassStart();
+  } else {
+    if (FLAGS_trainer_id == 0) {
+      PreparedOperations ops;
+      ops.addOperation(PSERVER_OP_START_PASS);
+      parameterClient_->doOperation(ops,
+                                    /* waitForGradient= */ false,
+                                    /* sendBackarameter= */ false);
+    }
+    parameterClient_->asyncStartPass();
+  }
+}
+
+bool SparseRemoteParameterUpdater::finishPass() {
+  if (config_.algorithm() == TrainAlgorithm::SGD) {
+    parameterClient_->waitPassFinish();
+  } else {
+    if (FLAGS_trainer_id == 0) {
+      PreparedOperations ops;
+      ops.addOperation(PSERVER_OP_FINISH_PASS);
+      parameterClient_->doOperation(ops,
+                                    /* waitForGradient= */ false,
+                                    /* sendBackarameter= */ false);
+    }
+    parameterClient_->asyncFinishPass();
+  }
+
+  return true;
+}
+
+// Trainer will call getParametersRemote at batch start or before save,
+// so we do not get values in apply() and restore().
+void SparseRemoteParameterUpdater::apply() {
+  if (useApplyInPserver_) {
+    PreparedOperations ops;
+    ops.addOperation(PSERVER_OP_APPLY);
+    parameterClient_->doOperation(ops,
+                                  /* waitForGradient= */ false,
+                                  /* sendBackarameter= */ false);
+  }
+}
+
+void SparseRemoteParameterUpdater::restore() {}
+
+void SparseRemoteParameterUpdater::getParametersRemote(bool fullSize,
+                                                       bool apply) {
+  ParameterType sendBackParameterType =
+      (useApplyInPserver_ && apply) ? PARAMETER_APPLY : PARAMETER_VALUE;
+  std::function<void()> getParams;
+  std::function<void(Parameter&, real)> applyL1;
+  if (fullSize) {
+    getParams = [&] {
+      parameterClient_->getParameter(
+          /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
+    };
+    applyL1 = [](Parameter& para, real decayRate) {
+      para.getBuf(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
+    };
+  } else {
+    getParams = [&] {
+      parameterClient_->getParameterSparse(
+          /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
+    };
+    applyL1 = [](Parameter& para, real decayRate) {
+      para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
+    };
+  }
+  {
+    REGISTER_TIMER("getParamDenseAndSparse");
+    getParams();
+    if (config_.shrink_parameter_value() > 0) {
+      for (auto& para : parameters_) {
+        if (para->getConfig().decay_rate_l1() > 0) {
+          applyL1(*para, config_.shrink_parameter_value());
+        }
+      }
+    }
+  }
+}
+
+void SparseRemoteParameterUpdater::randParametersRemote() {
+  CHECK_EQ(FLAGS_trainer_id, 0);
+
+  PreparedOperations ops;
+  ops.addOperation(PSERVER_OP_RANDOMIZE);
+  parameterClient_->doOperation(ops,
+                                /* waitForGradient= */ false,
+                                /* sendBackarameter= */ false);
+}
+
+void SparseRemoteParameterUpdater::loadParametersRemote(
+    const std::string& dirName) {
+  if (FLAGS_trainer_id == 0) {
+    parameterClient_->loadValueVector(dirName);
+  }
+
+  if (testing_) {
+    // we do not use synchronize() here,
+    // because test mode may run only one tester
+    if (FLAGS_trainer_id == 0) {
+      parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY);
+    } else {
+      parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY);
+    }
+  }
+}
+
+void SparseRemoteParameterUpdater::saveParametersRemote(
+    const std::string& dirName) {
+  if (FLAGS_trainer_id == 0) {
+    parameterClient_->saveValueVector(dirName);
+  }
+}
+
+void SparseRemoteParameterUpdaterComposite::init(
+    const std::vector<ParameterPtr>& parameters) {
+  parameters_ = parameters;
+
+  std::vector<ParameterPtr> parametersArray[NUMBER_UPDATERS];
+
+  for (auto& para : parameters_) {
+    if (para->isSparseRemoteUpdate()) {
+      parametersArray[UPDATER_SPARSE_REMOTE].push_back(para);
+    } else {
+      parametersArray[UPDATER_NORMAL].push_back(para);
+    }
+  }
+  CHECK(!parametersArray[UPDATER_SPARSE_REMOTE].empty());
+  CHECK(!parametersArray[UPDATER_NORMAL].empty());
+
+  syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
+    updaters_[tid]->init(parametersArray[tid]);
+  });
+
+  parameterTypes_ = updaters_[UPDATER_NORMAL]->getParameterTypes();
+}
+
+std::vector<std::function<ParameterUpdater*(
+    const std::string&, const OptimizationConfig&, bool, size_t)>>
+    ParameterUpdaterCreators::constructors_;
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/RemoteParameterUpdater.h b/paddle/legacy/trainer/RemoteParameterUpdater.h
new file mode 100644
index 0000000000000000000000000000000000000000..68468532981a49ef32f5f0da1170815d657d86c1
--- /dev/null
+++ b/paddle/legacy/trainer/RemoteParameterUpdater.h
@@ -0,0 +1,416 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <thread>
+#include "ParameterUpdater.h"
+#include "paddle/legacy/pserver/ParameterClient2.h"
+#include "paddle/legacy/utils/Queue.h"
+#include "paddle/legacy/utils/Util.h"
+
+namespace paddle {
+
+// TODO(yanfei):
+// I think that the biggest feature of rdma is packet lossless control
+// feature instead of high bandwiths, zero copy and gpu-direct rdma in
+// theroy.
+// But zero-copy and gpu-direct rdma features can help to reduce latency
+// caused by os system.
+// So, for some specified cluster, such as high density gpu cluster,
+// gpu-direct and zero copy could help to improve cluster communication
+// performance.
+//
+
+/**
+ * Normal remote parameter updater for dense parameters.
+ *
+ * It first packs all parameters for all pservers using ParameterClient
+ * module, then wait for merged parameters data from all pservers.
+ * The synchronization pattern specified by sync-sgd or async-sgd is
+ * achieved by all pservers with the help of the controller within this
+ * remote parameter updater.
+ * This module indeedly bridges the gradient machines and parameter servers.
+ * It helps to transfer the parameters from acceleration device to cpu end
+ * for network. It contains additional parameters copy buffers for
+ * acceleration devices at cpu end, such as gpu, otherwise it will
+ * directly use original parameters data to update pservers.
+ *
+ * This remote parameter updater does not use pipeline mechanism to hide
+ * copy latency from gpu to cpu buffer. In addition the overlapped between
+ * backward and communication is not supported.
+ */
+class RemoteParameterUpdater : public ParameterUpdater {
+ public:
+  RemoteParameterUpdater(
+      const OptimizationConfig& config,
+      int expectedPassCount,
+      std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
+  ~RemoteParameterUpdater() {
+    if (controllerThread_) {
+      controllerThread_->join();
+    }
+  }
+
+  /**
+   * initialize the internal parameter client and itself.
+   */
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+  /**
+   * @brief start batch
+   *
+   * @note  one batch training exhibits stateful feature to help
+   *        to do performance tuning, sgd optimization if necessary.
+   */
+  virtual PassType startBatch(int64_t batchSize) {
+    if (localUpdater_) {
+      localUpdater_->startBatch(batchSize);
+    }
+    batchSize_ = batchSize;
+    batchStatus_ = BATCH_START;
+    return PASS_TRAIN;
+  }
+
+  /**
+   * send parameters to pservers and get returned parameters
+   * from all pservers if necessary. it will implictly
+   * cooperate with controller thread for sync-sgd.
+   */
+  virtual void finishBatch(real cost);
+  virtual void startPass();
+  virtual bool finishPass();
+
+#ifndef PADDLE_DISABLE_TIMER
+  virtual void setForwardbackwardTime(uint64_t delta) {
+    parameterClient_->setForwardbackwardTime(delta);
+  }
+#endif
+
+  virtual void apply();
+  virtual void restore();
+
+ protected:
+  /**
+   * control all pservers with all trainers for sync-sgd
+   */
+  virtual void controller();
+
+  /**
+   * work need to do after finishBatch
+   */
+  virtual void updateImpl(Parameter* para);
+
+  void startController();
+
+  /**
+   * @brief copy parameters from cpu host to device, such as gpu.
+   *
+   * @note  return if all data are transfered.
+   */
+  void copyParametersToDevice(ParameterType parameterType);
+
+  /**
+   * @brief copy parameters from device to cpu host
+   *
+   * @note  return if all data are transfered
+   */
+  void copyParametersFromDevice(ParameterType parameterType);
+
+ protected:
+  /// Optimization config used to guide initialization and finishBatch
+  OptimizationConfig config_;
+  /// internal parameter client object for exchanging data with pserver
+  std::unique_ptr<ParameterClient2> parameterClient_;
+  /// internal shadow buffer at cpu host end, use original parameters_
+  /// if no acceleration devices are used.
+  std::vector<ParameterPtr> cpuParameters_;
+  /// local updater for aggregating multi-batches local delta
+  std::unique_ptr<ParameterUpdater> localUpdater_;
+  /// the size of mini-batch
+  int64_t batchSize_;
+  /// batches passed
+  int64_t numBatches_;
+  /// for stateful control
+  BatchStatus batchStatus_;
+  /// controller thread for sync-sgd
+  std::unique_ptr<std::thread> controllerThread_;
+  /// passed already finished
+  int64_t passCount_;
+  /// expected passes to finished
+  int64_t expectedPassCount_;
+  /// use normal synchronization communication if True
+  bool separateSendAndRecv_;
+  /// true if it's first pass
+  bool isFirstPass_;
+  bool useApplyInPserver_;
+
+  static const std::string kAverage;
+  static const std::string kElasticAverage;
+};
+
+// TODO(yanfei):
+// do parameters level synchronization Optimization at pserver end with
+// ConcurrentRemoteParameterUpdater to get more parallelization, at last
+// to really hide pserver latency in backward computation.
+//
+/**
+ * This updater add additional optimization for overlapping synchronization
+ * from pservers with backward computation.
+ *
+ * Parameter can be sent to pservers when related backward stage is finished.
+ * This concurrent udpater does data copy from acceleration device to host
+ * memory aynchronously. In addition internal parameter client reads data in
+ * host memory and send them to all pservers in next stage. So this class
+ * help to pipeline device-to-host copy and host-to-network to hide network
+ * latency in backward stage.
+ * It contains separate send and recv thread for pipeline usage.
+ */
+class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater {
+ public:
+  ConcurrentRemoteParameterUpdater(
+      OptimizationConfig config,
+      int expectedPassCount,
+      std::unique_ptr<ParameterUpdater>&& localUpdater);
+  ~ConcurrentRemoteParameterUpdater();
+
+  /**
+   * @brief send paraemeters to all pservers
+   *
+   * @note  it just signal the end signal to internal parameter client
+   *        to finished the aynchronous send action. In addition it also
+   *        do synchronization for all asynchronous host-to-device copy.
+   */
+  virtual void finishBatch(real cost);
+
+ protected:
+  virtual void updateImpl(Parameter* para);
+  /// internal thread called in send thread
+  void send(Parameter* para);  // para == NULL indicate end of a minibatch
+  /// internal function called in recv thread
+  void recv(Parameter* para);
+  /**
+   * @brief send thread for relaying data from gradient to parameter client
+   *
+   * @note  just pipe data to internal parameter client for pipeline
+   */
+  void send();
+  /**
+   * @brief recv thread for relaying data from internal parameter client to
+   *        host memory
+   *
+   * @note  it contains the asynchronous data copy form host to device
+   */
+  void recv();
+  /// copy specified parameter from host to device
+  void copySingleParaToDevice(Parameter* para, ParameterType parameterType);
+  /// copy specified parameter from device to host
+  void copySingleParaFromDevice(Parameter* para, ParameterType parameterType);
+  bool needToUpdateRemotely() {
+    return (numBatches_ + 1) % config_.num_batches_per_send_parameter() == 0;
+  }
+
+ private:
+  /// send thread used for overlapping
+  std::unique_ptr<std::thread> sendThread_;
+  /// recv thread used for overlapping
+  std::unique_ptr<std::thread> recvThread_;
+  /// buffer queue for overlapping
+  Queue<int> sendQueue_;
+  /// buffer queue for overlapping
+  Queue<int> recvQueue_;
+  /// flags indicating to stop
+  bool stopping_;
+  /// conditional variable for threads synchronization between the
+  /// thread calling finishBatch and internal recv thread
+  LockedCondition finishBatchCond_;
+  bool oneBatchFinished_;
+};
+
+// TODO(yanfei):
+// merge sparse updater with dense updater, and could help to reduce
+// the synchronization between sparse and dense udpater. it could also
+// reduce the threads for managing all connections.
+/**
+ * This class is specified for updating sparse parameters.
+ *
+ * It allows part of parameter to be exchanged with all pservers.
+ * If sparse input assigned, part gradients of first hidden layer
+ * could remained zero which can not need to be exchanged within
+ * all pservers. This is the key optimization point for this updater
+ *
+ * For updating sparse parameters, all latest parameters are stored
+ * in pservers instead of keeping full copy at train end, so need to
+ * prefetch parameters weight value which can be changed in next-batch
+ * before doing next forwardbackward. Also, with above fact that the
+ * parameters can be stored in pserver instead of trainer, we can
+ * fetch specified parmeters if necessary, and can support huge
+ * parameters which is larger enough than  the RAM size in single
+ * node.
+ *
+ * Internally, this updater will direct internal parameter client
+ * to encapsulate sparse specified message for all pservers.
+ */
+class SparseRemoteParameterUpdater : public ParameterUpdater {
+ public:
+  SparseRemoteParameterUpdater(const OptimizationConfig& config,
+                               int expectedPassCount,
+                               bool testing);
+  ~SparseRemoteParameterUpdater() {
+    if (controllerThread_) {
+      controllerThread_->join();
+    }
+  }
+
+  /// initialization
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+
+  /// stateful batch control
+  virtual PassType startBatch(int64_t batchSize);
+  /// send all sparse related parameters to all pservers
+  virtual void finishBatch(real cost);
+  virtual void startPass();
+  virtual bool finishPass();
+
+  virtual void apply();
+  virtual void restore();
+
+  /// load parameters from pservers
+  virtual void loadParametersRemote(const std::string& dirName);
+  /// save parameters to pservers
+  virtual void saveParametersRemote(const std::string& dirName);
+  /**
+   * @brief get latest sparse parameters value from all pservers
+   *
+   * @note  call it before next mini-batch
+   */
+  virtual void getParametersRemote(bool fullSize, bool apply);
+  virtual void randParametersRemote();
+#ifndef PADDLE_DISABLE_TIMER
+  virtual void setForwardbackwardTime(uint64_t delta) {
+    parameterClient_->setForwardbackwardTime(delta);
+  }
+#endif
+
+ protected:
+  /// update implimentation, not implemented
+  virtual void updateImpl(Parameter* para) {}
+
+  /// internal controller routine for controller thread
+  virtual void controller();
+
+  /// start controller thread
+  void startController();
+
+ protected:
+  /// optimization config
+  OptimizationConfig config_;
+  /// internal parameter client
+  std::unique_ptr<ParameterClient2> parameterClient_;
+  int64_t batchSize_;
+  std::unique_ptr<std::thread> controllerThread_;
+  int64_t passCount_;
+  int64_t expectedPassCount_;
+  bool testing_;
+  bool useApplyInPserver_;
+};
+
+/**
+ * Class for supporting normal updater and sparse updater
+ *
+ * Not all parts of one model are sparse, so it exists dense updater
+ * for normal layers while sparse updater is for sparse layers.
+ *
+ * it directly call internal dense and sparse udpater individually.
+ */
+class SparseRemoteParameterUpdaterComposite : public ParameterUpdaterComposite {
+ public:
+  enum {
+    UPDATER_SPARSE_REMOTE = 0,  // execute in sync thread pool(tid:0)
+    UPDATER_NORMAL = 1,         // execute in Owner thread(tid:1)
+    NUMBER_UPDATERS = 2,
+  };
+  /**
+   * @brief create one dense updater and one sparse updater
+   *
+   * @note  use syncThreadPool to synchronize these two updaters
+   */
+  SparseRemoteParameterUpdaterComposite(
+      const OptimizationConfig& config,
+      int expectedPassCount,
+      bool testing,
+      std::unique_ptr<ParameterUpdater>&& normalUpdater) {
+    updaters_.resize(NUMBER_UPDATERS);
+    updaters_[UPDATER_SPARSE_REMOTE].reset(
+        new SparseRemoteParameterUpdater(config, expectedPassCount, testing));
+    updaters_[UPDATER_NORMAL] = std::move(normalUpdater);
+
+    syncThreadPool_.reset(new SyncThreadPool(NUMBER_UPDATERS - 1));
+  }
+
+  /// initialization of dense and sparse updaters
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+};
+
+class ParameterUpdaterCreators {
+ public:
+  /**
+   * @brief add a creator to create custom ParameterUpdater while training.
+   *        The creator is a function with type (alogrithm, optConfig, isLocal,
+   *        numPasses) -> ParameterUpdater*. Trainer will use this
+   *        ParameterUpdater if creator can create a no nullptr
+   *        ParameterUpdater. Return nullptr will use trainer's default
+   *        updaters.
+   *
+   * @param creator method which can create ParameterUpdater.
+   */
+  static void addCreator(
+      const std::function<ParameterUpdater*(
+          const std::string&,         // algo
+          const OptimizationConfig&,  // optConfig
+          bool,                       // isLocal
+          size_t                      // numPasses
+          )>& creator) {  // NOLINT  explicit move closing ) in this line
+                          // for readability
+    constructors_.push_back(creator);
+  }
+
+  /**
+   * @brief Try to create an updater by given algo, optConfig, isLocal,
+   *        numPasses. Return nullptr if cannot create anyone.
+   * @param algo algorithm string.
+   * @param optConfig optimization config.
+   * @param isLocal is in local mode or not.
+   * @param numPasses total passes that trainer will train.
+   * @return nullptr if fail, not nullptr if we can create an updater.
+   */
+  static ParameterUpdater* tryCreateUpdater(const std::string& algo,
+                                            const OptimizationConfig& optConfig,
+                                            bool isLocal,
+                                            size_t numPasses) {
+    for (auto& c : constructors_) {
+      if (auto updater = c(algo, optConfig, isLocal, numPasses)) {
+        return updater;
+      }
+    }
+    return nullptr;
+  }
+
+ private:
+  static std::vector<std::function<ParameterUpdater*(
+      const std::string&, const OptimizationConfig&, bool, size_t)>>
+      constructors_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/Tester.cpp b/paddle/legacy/trainer/Tester.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d977ca9657a7688c101ed060935c644e4876e6d1
--- /dev/null
+++ b/paddle/legacy/trainer/Tester.cpp
@@ -0,0 +1,380 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Tester.h"
+
+#include <fenv.h>
+#include <stdio.h>
+
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+
+#include <google/protobuf/text_format.h>
+
+#include "paddle/legacy/utils/GlobalConstants.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include "TesterConfig.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h"
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/legacy/gserver/layers/ValidationLayer.h"
+
+namespace paddle {
+
+Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
+               std::unique_ptr<TesterConfig>&& intconfig,
+               const GradientMachinePtr& gradientMachine,
+               const std::shared_ptr<ParameterUpdater>& parameterUpdater,
+               std::shared_ptr<DataProvider> testDataProvider)
+    : config_(config),
+      intconfig_(std::move(intconfig)),
+      gradientMachine_(gradientMachine),
+      parameterUpdater_(parameterUpdater),
+      testDataProvider_(testDataProvider) {
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    LOG(FATAL) << "It's prohibited to set sparse_remote_update "
+               << "when doing train and test jobs in the same "
+               << "process. You could run paddle --job=test in "
+               << "a separate process.";
+  }
+  testEvaluator_.reset(gradientMachine_->makeEvaluator());
+  if (intconfig_->distributeTest) {
+    testParameterClient_.reset(new ParameterClient2(true));
+  }
+
+  if (testParameterClient_) {
+    testParameterClient_->init(gradientMachine_->getParameters());
+  }
+
+  std::unique_ptr<ParameterUtilConfig> paramConfig(
+      new ParameterUtilConfig(intconfig_->saveOnlyOne,
+                              intconfig_->savingPeriod,
+                              intconfig_->loadsaveParametersInPserver,
+                              intconfig_->config));
+
+  paramUtil_.reset(new ParameterUtil(
+      config_, std::move(paramConfig), gradientMachine_, parameterUpdater_));
+}
+
+void Tester::startTestPeriod() {
+  if (testDataProvider_) {
+    testDataProvider_->reset();
+  }
+  testEvaluator_->start();
+  testContext_.cost = 0;
+  testContext_.numSamples = 0;
+
+  parameterUpdater_->apply();
+  if (intconfig_->prevBatchState) {
+    gradientMachine_->getState(*intconfig_->trainState);
+    gradientMachine_->setState(*intconfig_->testState);
+  }
+}
+
+void Tester::testOneDataBatch(const DataBatch& dataBatch,
+                              std::vector<Argument>* outArgs) {
+  testContext_.cost +=
+      forwardOneBatch(dataBatch, testEvaluator_.get(), outArgs);
+  testContext_.numSamples += dataBatch.getSize();
+}
+
+void Tester::testOnePeriod() {
+  DataBatch dataBatch;
+  int64_t batchSize = config_->getOptConfig().batch_size();
+  std::vector<Argument> outArgs;
+  startTestPeriod();
+  while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) {
+    testOneDataBatch(dataBatch, &outArgs);
+  }
+  finishTestPeriod();
+}
+
+void Tester::finishTestPeriod() {
+  if (intconfig_->prevBatchState) {
+    gradientMachine_->resetState();
+  }
+  testEvaluator_->finish();
+  CHECK_GT(testContext_.numSamples, 0)
+      << "There is no samples in your test batch. Possibly "
+         "wrong implementation of DataProvidor.reset()";
+  LOG(INFO) << " Test samples=" << testContext_.numSamples
+            << " cost=" << testContext_.cost / testContext_.numSamples
+            << " Eval: " << *testEvaluator_;
+  parameterUpdater_->restore();
+  if (intconfig_->prevBatchState) {
+    gradientMachine_->getState(*intconfig_->testState);
+    gradientMachine_->setState(*intconfig_->trainState);
+  }
+}
+
+int64_t Tester::testOneBatchById(int64_t batchId) {
+  DataBatch dataBatch;
+  int32_t batchSize = config_->getOptConfig().batch_size();
+
+  testDataProvider_->getNextBatch(batchSize, &dataBatch);
+
+  int64_t actualBatchSize = dataBatch.getSize();
+  if (actualBatchSize == 0) {
+    return 0;
+  }
+
+  std::vector<Argument> outArgs;
+
+  stats_ += std::pair<int64_t, real>{
+      actualBatchSize,
+      forwardOneBatch(dataBatch, testEvaluator_.get(), &outArgs)};
+
+  if (((batchId + 1) % intconfig_->logPeriod) == 0) {
+    LOG(INFO) << " Batch=" << batchId + 1 << " " << stats_.getStats(false);
+  }
+
+  return actualBatchSize;
+}
+
+real Tester::forwardOneBatch(const DataBatch& dataBatch,
+                             Evaluator* evaluator,
+                             std::vector<Argument>* pOutArgs) {
+  auto& outArgs = *pOutArgs;
+  const std::vector<Argument>& inArgs = dataBatch.getStreams();
+  if (intconfig_->loadsaveParametersInPserver) {
+    REGISTER_TIMER("prefetch");
+    gradientMachine_->prefetch(inArgs);
+    parameterUpdater_->getParametersRemote(false /*full parameter*/,
+                                           true /*after apply*/);
+  }
+
+  gradientMachine_->forward(inArgs, &outArgs, PASS_TEST);
+
+  // write features if set this flag and outArgs is not empty
+  std::string featFile = intconfig_->featFile;
+  if (!featFile.empty() && outArgs.empty()) {
+    size_t numOutputs = outArgs.size();
+    std::vector<MatrixPtr> featMatrices;
+    featMatrices.resize(numOutputs);
+    for (size_t i = 0; i < numOutputs; ++i) {
+      featMatrices[i] = Matrix::create(outArgs[i].value->getHeight(),
+                                       outArgs[i].value->getWidth(),
+                                       false,
+                                       false);  // CPU data buffer
+      featMatrices[i]->copyFrom(*(outArgs[i].value), HPPL_STREAM_DEFAULT);
+    }
+    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+    FILE* fp = fopen(featFile.c_str(), "ab+");
+    CHECK(!ferror(fp)) << "Fail to open " << featFile;
+
+    size_t sampleNum = featMatrices[0]->getHeight();
+    for (size_t i = 0; i < sampleNum; ++i) {
+      for (size_t j = 0; j < numOutputs; ++j) {
+        size_t dim = featMatrices[j]->getWidth();
+        fwrite(featMatrices[j]->getData() + i * dim, sizeof(real), dim, fp);
+      }
+    }
+    fclose(fp);
+  }
+  if (evaluator) {
+    gradientMachine_->eval(evaluator);
+  }
+
+  // Save the output layers if predict_output_dir is not empty
+  std::string predictOutputDir = intconfig_->predictOutputDir;
+  if (!predictOutputDir.empty() && !outArgs.empty()) {
+    CHECK(intconfig_->testing) << "Only valid in test mode";
+    if (!os_.is_open()) {
+      // TODO(yuyang18): Refactor these lines.
+      constexpr int kBufLen = 100;
+      char buf[kBufLen];
+      snprintf(buf, kBufLen, "rank-%05d", intconfig_->trainerId);
+      mkDir(predictOutputDir.c_str());
+      std::string filename = path::join(predictOutputDir, buf);
+      os_.open(filename, std::ofstream::trunc);
+      CHECK(os_.is_open()) << "Failed to open file " << filename;
+    }
+    printOutput(outArgs, os_);
+    return 0.0;  // In this case, there is no meaning to calculate cost
+  }
+
+  return Argument::sum(outArgs);
+}
+
+void Tester::testOnePassBatch(int passId) {
+  stats_.reset();
+  const std::vector<Argument> inArgs;
+  gradientMachine_->forward(inArgs, nullptr, PASS_TEST);
+  int64_t num;
+  real cost;
+  gradientMachine_->getStats(cost, num);
+  stats_ += std::pair<int64_t, real>{num, cost};
+  gradientMachine_->onPassEnd();
+
+  LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false);
+}
+
+void Tester::testOnePass(int passId) {
+  stats_.reset();
+  int64_t batchId = 0;
+  int num = 0;
+  if (intconfig_->prevBatchState) {
+    gradientMachine_->resetState();
+  }
+
+  testEvaluator_->start();
+
+  do {
+    num = testOneBatchById(batchId);
+    ++batchId;
+  } while (num > 0);
+
+  gradientMachine_->onPassEnd();
+  testEvaluator_->finish();
+
+  LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false)
+            << " Eval: " << *testEvaluator_;
+
+  if (intconfig_->distributeTest) {
+    testEvaluator_->distributeEval(testParameterClient_.get());
+    if (0 == intconfig_->trainerId) {
+      LOG(INFO) << "distribute eval: " << *testEvaluator_;
+    }
+  }
+}
+
+void Tester::test() {
+  CHECK(testDataProvider_) << "TestData is not specified";
+  testDataProvider_->setSkipShuffle();
+  testDataProvider_->reset();
+  gradientMachine_->start();
+
+  // For evaluation
+  std::vector<std::string> modelList;
+  std::string modelListFromConfig = intconfig_->modelList;
+  std::string initModelPath = intconfig_->initModelPath;
+  if (!modelListFromConfig.empty()) {
+    loadFileList(modelListFromConfig, modelList);
+    intconfig_->testPass = 0;
+    intconfig_->numPasses = modelList.size();
+    intconfig_->savingPeriod = 1;
+    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
+  } else if (!initModelPath.empty()) {
+    modelList.push_back(initModelPath);
+    intconfig_->testPass = 0;
+    intconfig_->numPasses = 1;
+    intconfig_->savingPeriod = 1;
+    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
+  }
+
+  for (int i = intconfig_->testPass; i < intconfig_->numPasses; ++i) {
+    int passId = i;
+    if (passId % intconfig_->savingPeriod == 0) {
+      if (intconfig_->testWait) {
+        while (paramUtil_->loadParameters(
+                   passId, true /*local*/, true /*remote*/) == false) {
+          LOG(INFO) << "Waiting for parameters of pass " << passId;
+          sleep(60);  // sleep 60s
+        }
+      } else {
+        if (modelList.size() == 0) {
+          CHECK_EQ(paramUtil_->loadParameters(
+                       passId, true /*local*/, true /*remote*/),
+                   true);
+        } else {
+          paramUtil_->loadParametersWithPath(
+              modelList[i], true /*local*/, true /*remote*/);
+        }
+      }
+      if (IGradientMachineMode::trainWholeDataInOneBatch(intconfig_->mode)) {
+        testOnePassBatch(passId);
+      } else {
+        testOnePass(passId);
+      }
+      if (passId + intconfig_->savingPeriod < intconfig_->numPasses) {
+        // if there is at least 1 more pass to test, then call reset,
+        // otherwise not.
+        testDataProvider_->reset();
+      }
+    }
+  }
+
+  gradientMachine_->finish();
+}
+
+void Tester::printOutput(const std::vector<Argument>& outArgs,
+                         std::ostream& os) {
+  size_t numOutputs = outArgs.size();
+  size_t numIns = outArgs[0].getBatchSize();
+  if (cpuMat_.size() != numOutputs || cpuVec_.size() != numOutputs) {
+    cpuMat_.resize(numOutputs, nullptr);
+    cpuVec_.resize(numOutputs, nullptr);
+  }
+
+  for (size_t i = 0; i < numOutputs; ++i) {
+    if (outArgs[i].value != nullptr) {
+      if (outArgs[i].value->useGpu()) {
+        if (dynamic_cast<GpuMatrix*>(outArgs[i].value.get())) {
+          size_t dim = outArgs[i].value->getWidth();
+          Matrix::resizeOrCreate(cpuMat_[i], numIns, dim, false, false);
+          cpuMat_[i]->copyFrom(*outArgs[i].value);
+        } else if (dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get())) {
+          auto sparseMat =
+              dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get());
+          cpuMat_[i] = Matrix::createSparseMatrix(sparseMat->getHeight(),
+                                                  sparseMat->getWidth(),
+                                                  sparseMat->getElementCnt(),
+                                                  sparseMat->getValueType(),
+                                                  sparseMat->format_,
+                                                  false,  /* trans */
+                                                  false); /* useGpu */
+          hl_stream_t stream = HPPL_STREAM_DEFAULT;
+          cpuMat_[i]->copyFrom(*sparseMat, stream);
+        } else {
+          LOG(WARNING) << "Not supported gpu matrix type";
+        }
+      }
+    } else if (outArgs[i].ids != nullptr) {
+      if (outArgs[i].ids->useGpu()) {
+        IVector::resizeOrCreate(cpuVec_[i], outArgs[i].ids->getSize(), false);
+        cpuVec_[i]->copyFrom(*outArgs[i].ids);
+      }
+    } else if (outArgs[i].strs != nullptr) {
+      continue;
+    } else {
+      LOG(WARNING) << "outArgs[" << i << "] has no data to print";
+    }
+  }
+
+  for (size_t i = 0; i < numIns; ++i) {
+    for (size_t j = 0; j < numOutputs; ++j) {
+      if (outArgs[j].value != nullptr) {
+        if (outArgs[j].value->useGpu()) {
+          cpuMat_[j]->printOneRow(os, i);
+        } else {
+          outArgs[j].value->printOneRow(os, i);
+        }
+      } else if (outArgs[j].ids != nullptr) {
+        if (outArgs[j].ids->useGpu()) {
+          cpuVec_[j]->printOneElement(os, i);
+        } else {
+          outArgs[j].ids->printOneElement(os, i);
+        }
+      } else if (outArgs[j].strs != nullptr) {
+        os << (*outArgs[j].strs)[i] << ";";
+      }
+    }
+    os << std::endl;
+  }
+}
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/Tester.h b/paddle/legacy/trainer/Tester.h
new file mode 100644
index 0000000000000000000000000000000000000000..a298602d1d0894af90c098818908862a553cb3e7
--- /dev/null
+++ b/paddle/legacy/trainer/Tester.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/utils/Util.h"
+
+#include <stdio.h>
+
+#include "hl_gpu.h"
+#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+
+#include "TrainerConfig.pb.h"
+
+#include <stdlib.h>
+#include <fstream>
+#include "ParamUtil.h"
+#include "ParameterUpdater.h"
+#include "TesterConfig.h"
+#include "TrainerInternalConfig.h"
+
+namespace paddle {
+
+/**
+ * Neural Network test logics code.
+ * It is a private class for Trainer.
+ */
+class Tester {
+ public:
+  /**
+   * Ctor
+   * @param config Trainer Config.
+   * @param intconfig Tester Config.
+   * @param gradientMachine Gradient machine(neuralnetwork) that will be tested.
+   * @param parameterUpdater Parameter Updater. Not for updating parameter, just
+   *                         for getting parameter from parameter-server.
+   * @param testDataProvider Test data provider.
+   */
+  Tester(const std::shared_ptr<TrainerConfigHelper>& config,
+         std::unique_ptr<TesterConfig>&& intconfig,
+         const GradientMachinePtr& gradientMachine,
+         const std::shared_ptr<ParameterUpdater>& parameterUpdater,
+         std::shared_ptr<DataProvider> testDataProvider);
+
+  /**
+   * test one period.
+   *
+   * One period means 2 things.
+   *   if test_period !=0 and not test_all_data_in_one_period, then
+   *      will test test_period * batch_size data.
+   *   else
+   *      will test whole test data.
+   *
+   * It is convenience to test small set of data when test data set is large and
+   * is training at same time.
+   */
+  void testOnePeriod();
+  void startTestPeriod();
+  void finishTestPeriod();
+  void testOneDataBatch(const DataBatch& dataBatch,
+                        std::vector<Argument>* outArgs);
+
+  /**
+   * Test for given data batch.
+   * @param dataBatch Data batch.
+   * @param evaluator Evaluator
+   * @return cost
+   */
+  real forwardOneBatch(const DataBatch& dataBatch,
+                       Evaluator* evaluator,
+                       std::vector<Argument>* outArgs);
+
+  /**
+   * performance the full pass of test given test data provider
+   */
+  void test();
+
+ protected:
+  std::shared_ptr<ParameterClient2> testParameterClient_;
+  std::shared_ptr<TrainerConfigHelper> config_;
+  std::unique_ptr<TesterConfig> intconfig_;
+  GradientMachinePtr gradientMachine_;
+  std::shared_ptr<ParameterUpdater> parameterUpdater_;
+  std::unique_ptr<Evaluator> testEvaluator_;
+  std::unique_ptr<ParameterUtil> paramUtil_;
+  DataProviderPtr testDataProvider_;
+  TrainerStats stats_;
+
+  // Used for saving the values of output layers
+  std::ofstream os_;
+  std::vector<MatrixPtr> cpuMat_;
+  std::vector<IVectorPtr> cpuVec_;
+  struct {
+    int64_t numSamples;
+    real cost;
+  } testContext_;
+
+ private:
+  /**
+   * Test one batch by batchId. It is only used for testOnePass.
+   *
+   * Durning testOnePass, each log_period will print cost statistics.
+   *
+   * @param batchId current batch id (from 0)
+   * @return num of tested samples. Zero if end of pass.
+   */
+  int64_t testOneBatchById(int64_t batchId);
+
+  /**
+   * Test whole pass in one batch.
+   *
+   *
+   * @param passId current pass id (from 0)
+   */
+  void testOnePassBatch(int passId);
+
+  /**
+   * test for one pass in several mini-batches.
+   *
+   * Used for sgd method.
+   *
+   * @param passId current pass id (from 0)
+   */
+  void testOnePass(int passId);
+
+  /**
+   * print the outArgs to a stream
+   *
+   * used for save feature file
+   *
+   * @param [in] outArgs output arguments for network.
+   * @param [in,out] os output stream.
+   */
+  void printOutput(const std::vector<Argument>& outArgs, std::ostream& os);
+};
+
+}  //  namespace paddle
diff --git a/paddle/legacy/trainer/TesterConfig.h b/paddle/legacy/trainer/TesterConfig.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c78f7cda347d5808d11e8af98672ef56898d643
--- /dev/null
+++ b/paddle/legacy/trainer/TesterConfig.h
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/utils/Util.h"
+
+#include <stdio.h>
+
+#include "hl_gpu.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+
+#include "TrainerConfig.pb.h"
+
+#include <stdlib.h>
+#include <fstream>
+#include "ParameterUpdater.h"
+
+namespace paddle {
+
+/**
+ * TesterConfig
+ * general configs for training
+ */
+struct TesterConfig {
+  /**
+   * indicate test period
+   */
+  int testPeriod;
+
+  /**
+   * indicate whether to save previous batch state
+   */
+  bool prevBatchState;
+
+  /**
+   * log period
+   */
+  int logPeriod;
+
+  /**
+   * loadsave parameters in pserver
+   */
+  bool loadsaveParametersInPserver;
+
+  /**
+   * feat file
+   */
+  std::string featFile;
+
+  /**
+   * predict output dir
+   */
+  std::string predictOutputDir;
+
+  /**
+   * trianer id
+   */
+  int trainerId;
+
+  /**
+   * distribute test
+   */
+  bool distributeTest;
+
+  /**
+   * training state
+   */
+  MachineState* trainState;
+
+  /**
+   * test state
+   */
+  MachineState* testState;
+
+  /**
+   * model list
+   */
+  std::string modelList;
+
+  /**
+   * test passes
+   */
+  int testPass;
+
+  /**
+   * num passes
+   */
+  int numPasses;
+
+  /**
+   * saving period
+   */
+  int savingPeriod;
+
+  /**
+   * test wait
+   */
+  int testWait;
+
+  /**
+   * init model path
+   */
+  std::string initModelPath;
+
+  /**
+   * save only one
+   */
+  bool saveOnlyOne;
+
+  /**
+   * testing mode
+   */
+  bool testing;
+
+  /**
+   * mode
+   */
+  int mode;
+
+  /**
+   * config loc
+   */
+  std::string config;
+};
+
+}  //  namespace paddle
diff --git a/paddle/legacy/trainer/ThreadParameterUpdater.cpp b/paddle/legacy/trainer/ThreadParameterUpdater.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0601bdf24e3150f5d182e2addde3a91609a967e4
--- /dev/null
+++ b/paddle/legacy/trainer/ThreadParameterUpdater.cpp
@@ -0,0 +1,309 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ThreadParameterUpdater.h"
+
+#include "paddle/legacy/utils/Logging.h"
+
+#include "paddle/legacy/math/SparseRowMatrix.h"
+#include "paddle/legacy/parameter/ThreadLocalBuffer.h"
+#include "paddle/legacy/utils/Thread.h"
+
+DECLARE_int32(trainer_count);
+
+namespace paddle {
+
+SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig)
+    : config_(optConfig), numSamplesProcessed_(0) {
+  // fill types
+  auto types = sgdOptimizerGetTypes(optConfig, false /*inPserver*/);
+  for (auto type : types) {
+    addParameterType(type);
+  }
+}
+
+void SgdThreadUpdater::init(const std::vector<ParameterPtr>& parameters) {
+  ParameterUpdater::init(parameters);
+
+  // calc max parameter id
+  size_t maxId = 0;
+  for (auto& para : parameters_) {
+    maxId = std::max(maxId, para->getID());
+  }
+
+  optimizers_.resize(maxId + 1);
+  for (auto& para : parameters_) {
+    int pid = para->getID();
+    optimizers_[pid].reset(sgdOptimizerCreate(config_,
+                                              para->getConfig(),
+                                              para->isGradSparseUpdate(),
+                                              false /*inPserver*/));
+    size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0;
+    optimizers_[pid]->init(numRows, &para->getConfig());
+    if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) {
+      // For trainer_count=1, the gradient machine is NeuralNetwork, which does
+      // not create parameter buf for PARAMETER_GRADIENT for sparse update in
+      // Parameter::enableType(). But gradient parameter buf is still used
+      // in SgdThreadUpdater. We need to explicitly create it.
+      //
+      // The AverageOptimizer::restore/apply method will use PARAMETER_GRADIENT
+      // as a temp buffer.
+      para->enableBufType(PARAMETER_GRADIENT);
+    }
+  }
+}
+
+void SgdThreadUpdater::startPass() {
+  for (auto& para : parameters_) {
+    int pid = para->getID();
+    optimizers_[pid]->startPass();
+  }
+}
+
+bool SgdThreadUpdater::finishPass() {
+  catchUpWith();
+
+  for (auto& para : parameters_) {
+    int pid = para->getID();
+    optimizers_[pid]->finishPass();
+  }
+  return true;
+}
+
+void SgdThreadUpdater::updateImpl(Parameter* para) {
+  if (!para->useGpu()) return;
+  SetDevice setDevice(para->getDeviceId());
+  ParameterOptimizer* optimizer = optimizers_[para->getID()].get();
+  optimizer->update(para->getBufs(), para->getConfig());
+  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
+    callback(para->getBufs(), para->getConfig(), -1LU);
+  }
+
+  para->setValueUpdated();
+  para->clearGradient();
+}
+
+void SgdThreadUpdater::threadTraverse(
+    const ParameterOptimizer::TraverseCallback& callback,
+    int tid,
+    size_t numThreads,
+    Parameter* para) {
+  VectorPtr* vecs = parameter::getThreadLocalBuffer();
+  if (para->isGradSparseUpdate()) {
+    size_t height = para->getConfig().dims(0);
+    size_t width = para->getConfig().dims(1);
+    for (size_t i = tid; i < height; i += numThreads) {
+      // setup sub bufs
+      for (auto type : parameterTypes_) {
+        vecs[type]->subVecFrom(*para->getBuf(type), i * width, width);
+      }
+      callback(vecs, para->getConfig(), i);
+    }
+  } else {  // dense
+    // setup sub bufs
+    auto interval = calcSplitArrayInterval(
+        para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
+    for (auto type : parameterTypes_) {
+      vecs[type]->subVecFrom(*para->getBuf(type), interval);
+    }
+
+    callback(vecs, para->getConfig(), -1LU);
+  }
+}
+
+void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) {
+  bool hasCpuPara = false;
+  bool hasGpuPara = false;
+  for (auto& para : parameters_) {
+    if (para->useGpu()) {
+      hasGpuPara = true;
+    } else {
+      hasCpuPara = true;
+    }
+  }
+
+  auto cpuTraverse = [&](int tid, size_t numThreads) {
+    for (auto& para : parameters_) {
+      if (auto callback = getTraverseCallback(para.get())) {
+        threadTraverse(callback, tid, numThreads, para.get());
+      }
+    }
+  };
+  auto gpuTraverse = [&](int tid, size_t numThreads) {
+    for (auto& para : parameters_) {
+      if (para->useGpu()) {
+        if (auto callback = getTraverseCallback(para.get())) {
+          SetDevice setDevice(para->getDeviceId());
+          callback(para->getBufs(), para->getConfig(), -1LU);
+        }
+      }
+    }
+  };
+
+  if (hasCpuPara && hasGpuPara) {
+    getGlobalSyncThreadPool()->exec(cpuTraverse, gpuTraverse);
+  } else if (hasCpuPara) {
+    getGlobalSyncThreadPool()->exec(cpuTraverse);
+  } else if (hasGpuPara) {
+    gpuTraverse(0, 0);
+  }
+}
+
+void SgdThreadUpdater::catchUpWith() {
+  traverse([this](Parameter* para) {
+    return optimizers_[para->getID()]->startCatchUpWith();
+  });
+
+  for (auto& para : parameters_) {
+    int pid = para->getID();
+    optimizers_[pid]->finishCatchUpWith();
+  }
+}
+
+void SgdThreadUpdater::apply() {
+  catchUpWith();
+
+  traverse(
+      [this](Parameter* para) { return optimizers_[para->getID()]->apply(); });
+}
+
+void SgdThreadUpdater::restore() {
+  traverse([this](Parameter* para) {
+    return optimizers_[para->getID()]->restore();
+  });
+}
+
+PassType SgdThreadUpdater::startBatch(int64_t batchSize) {
+  numSamplesProcessed_ += batchSize;
+  for (auto& para : parameters_) {
+    int pid = para->getID();
+    optimizers_[pid]->startBatch(numSamplesProcessed_);
+  }
+  return PASS_TRAIN;
+}
+
+void SgdThreadUpdater::finishBatch(real cost) {
+  getGlobalSyncThreadPool()->exec([&](int tid, size_t numThreads) {
+    for (auto& para : parameters_) {
+      if (para->isGradSparseUpdate()) {
+        threadUpdateSparse(tid, numThreads, para.get());
+      } else if (!para->useGpu()) {
+        threadUpdateDense(tid, numThreads, para.get());
+      }
+    }
+  });
+
+  for (auto& para : parameters_) {
+    int pid = para->getID();
+    optimizers_[pid]->finishBatch();
+  }
+}
+
+void SgdThreadUpdater::threadUpdateSparse(int tid,
+                                          size_t numThreads,
+                                          Parameter* para) {
+  int pid = para->getID();
+  ParameterOptimizer* optimizer = optimizers_[pid].get();
+  VectorPtr* vecs = parameter::getThreadLocalBuffer();
+
+  size_t height = para->getConfig().dims(0);
+  size_t width = para->getConfig().dims(1);
+
+  if (dynamic_cast<SparseRowIdsCpuMatrix*>(
+          para->getMat(PARAMETER_GRADIENT).get())) {
+    // From MultiGradientMachine
+    SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
+        para->getMat(PARAMETER_GRADIENT).get());
+    std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);
+
+    for (auto id : sparseIds) {
+      // setup sub bufs
+      for (auto type : parameterTypes_) {
+        vecs[type]->subVecFrom(*para->getBuf(type), id * width, width);
+      }
+      optimizer->update(vecs, para->getConfig(), id);
+      vecs[PARAMETER_GRADIENT]->zeroMem();
+    }
+    sparseIds.clear();
+  } else if (dynamic_cast<SparseRowCpuMatrix*>(
+                 para->getMat(PARAMETER_GRADIENT).get())) {
+    // From NeuralNetwork
+    SparseRowCpuMatrix* mainMat = dynamic_cast<SparseRowCpuMatrix*>(
+        para->getMat(PARAMETER_GRADIENT).get());
+
+    std::vector<unsigned int>& localIndices =
+        mainMat->getIndexDictHandle()->localIndices;
+
+    auto interval =
+        calcSplitArrayInterval(localIndices.size(), tid, numThreads);
+    for (size_t i = interval.first; i < interval.second; ++i) {
+      auto id = localIndices[i];
+      real* row = mainMat->getLocalRow(i);
+      // setup sub bufs
+      for (auto type : parameterTypes_) {
+        if (type == PARAMETER_GRADIENT) {
+          vecs[type]->subVecFrom(row, 0, width);
+        } else {
+          vecs[type]->subVecFrom(*para->getBuf(type), id * width, width);
+        }
+      }
+      optimizer->update(vecs, para->getConfig(), id);
+      vecs[PARAMETER_GRADIENT]->zeroMem();
+    }
+    // For numThreads > 1, MultiGradientMachine is used, which goes
+    // to the above branch.
+    CHECK_EQ(numThreads, 1UL);
+    mainMat->clearIndices();
+  } else {
+    auto& m = *para->getMat(PARAMETER_GRADIENT).get();
+    LOG(FATAL) << "Internal error: " << para->getName() << " "
+               << typeid(m).name();
+  }
+
+  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
+    for (size_t i = tid; i < height; i += numThreads) {
+      // setup sub bufs
+      for (auto type : parameterTypes_) {
+        vecs[type]->subVecFrom(*para->getBuf(type), i * width, width);
+      }
+      callback(vecs, para->getConfig(), i);
+    }
+  }
+}
+
+void SgdThreadUpdater::threadUpdateDense(int tid,
+                                         size_t numThreads,
+                                         Parameter* para) {
+  int pid = para->getID();
+  ParameterOptimizer* optimizer = optimizers_[pid].get();
+  VectorPtr* vecs = parameter::getThreadLocalBuffer();
+
+  auto interval = calcSplitArrayInterval(
+      para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
+
+  // setup sub bufs
+  for (auto type : parameterTypes_) {
+    vecs[type]->subVecFrom(*para->getBuf(type), interval);
+  }
+
+  // update
+  optimizer->update(vecs, para->getConfig());
+  vecs[PARAMETER_GRADIENT]->zeroMem();
+
+  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
+    callback(vecs, para->getConfig(), -1LU);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/ThreadParameterUpdater.h b/paddle/legacy/trainer/ThreadParameterUpdater.h
new file mode 100644
index 0000000000000000000000000000000000000000..172287d4eb56828c83e6670226b4c1f179fac6d8
--- /dev/null
+++ b/paddle/legacy/trainer/ThreadParameterUpdater.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/parameter/AverageOptimizer.h"
+#include "paddle/legacy/parameter/FirstOrderOptimizer.h"
+#include "paddle/legacy/parameter/OptimizerFunctions.h"
+#include "paddle/legacy/parameter/OptimizerWithRegularizer.h"
+#include "paddle/legacy/parameter/Parameter.h"
+#include "paddle/legacy/parameter/Regularizer.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include <memory>
+#include <vector>
+
+namespace paddle {
+
+/**
+ * \brief A parameter updater that uses multiple threads to update parameters.
+   This parameter updater handles GPU and CPU updates differently,
+   because at the current moment, the merging on CPU is happening on the
+   main thread, and the its parameter size can be much larger than the one GPU.
+   Thus, for GPU, the parameter updates happens in updateImpl() function, which
+   is called by gradient machines as a callback function supplied to backward()
+   and forwardBackward().
+   For CPU, the parameter updates happens in separate threads maintained by this
+   class.
+ */
+class SgdThreadUpdater : public ParameterUpdater {
+ public:
+  explicit SgdThreadUpdater(const OptimizationConfig& optConfig);
+  virtual ~SgdThreadUpdater() {}
+
+  // Use the startPass() function of the base optimizer.
+  virtual void startPass();
+
+  // Use the finishPass() function of the base optimizer.
+  virtual bool finishPass();
+
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+  virtual PassType startBatch(int64_t batchSize);
+  // Call finishBatch for each optimizer.
+  virtual void finishBatch(real cost);
+  virtual void catchUpWith();
+  virtual void apply();
+  virtual void restore();
+
+ protected:
+  // This is the function that will be eventualy called by the GradientMachine.
+  // used only for GPU update.
+  virtual void updateImpl(Parameter* para);
+  OptimizationConfig config_;
+  int64_t numSamplesProcessed_;
+
+  // One optimizers for each parameter.
+  std::vector<std::unique_ptr<ParameterOptimizer>> optimizers_;
+
+  // The update function for CPU sparse parameters.
+  void threadUpdateSparse(int tid, size_t numThreads, Parameter* para);
+
+  // The update function for CPU dense parameters.
+  void threadUpdateDense(int tid, size_t numThreads, Parameter* para);
+  // The update function for after update operations, such as averager.
+  void threadTraverse(const ParameterOptimizer::TraverseCallback& callback,
+                      int tid,
+                      size_t numThreads,
+                      Parameter* para);
+  typedef std::function<const ParameterOptimizer::TraverseCallback(Parameter*)>
+      GetTraverseCallback;
+  void traverse(GetTraverseCallback getTraverseCallback);
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/Trainer.cpp b/paddle/legacy/trainer/Trainer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2db754793cf19e0c29455f61ada5f1d15b3204af
--- /dev/null
+++ b/paddle/legacy/trainer/Trainer.cpp
@@ -0,0 +1,653 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Trainer.h"
+
+#include <stdio.h>
+
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+
+#include <google/protobuf/text_format.h>
+
+#include "paddle/legacy/utils/Common.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include "RemoteParameterUpdater.h"
+#include "TesterConfig.h"
+#include "ThreadParameterUpdater.h"
+#include "TrainerConfigHelper.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachineMode.h"
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/legacy/gserver/layers/ValidationLayer.h"
+
+DEFINE_string(config, "", "Trainer config file");
+
+DEFINE_int32(test_period,
+             0,
+             "if equal 0, do test on all test data at the end of "
+             "each pass. While if equal non-zero, do test on all test "
+             "data every test_period batches");
+DEFINE_bool(test_all_data_in_one_period,
+            false,
+            "This option was deprecated, since we will always do "
+            "test on all test set ");
+
+DEFINE_bool(local, true, "Train in local mode or not");
+
+DEFINE_int32(average_test_period,
+             0,
+             "Do test on average parameter every so"
+             " many batches. MUST be devided by FLAGS_log_period."
+             " Default 0 means do not test average parameter");
+
+DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
+DEFINE_int64(saving_period_by_batches,
+             0,
+             "Save parameters every so many batches in one pass");
+DEFINE_string(save_dir, "", "Directory for saving model parameter");
+DEFINE_int32(start_pass,
+             0,
+             "Start training from this pass. "
+             "Will load parameter from the previous pass");
+DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test");
+DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
+DEFINE_bool(with_cost, true, "enable cost layer or not");
+DEFINE_bool(distribute_test, false, "test in distribute mode");
+
+DEFINE_int32(num_passes, 100, "train for so many passes");
+
+DEFINE_string(config_args,
+              "",
+              "arguments passed to config file."
+              "Format: key1=value1,key2=value2");
+
+DEFINE_bool(save_only_one,
+            false,
+            "Save only parameters in last pass, remove previous.");
+
+DEFINE_string(feat_file, "", "File name of extracted feature.");
+DEFINE_string(predict_output_dir,
+              "",
+              "Directory that saves the predicted results of output layers");
+DEFINE_string(model_list, "", "File that saves the model list when evaluation");
+
+namespace paddle {
+
+void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
+                   bool testing,
+                   const std::shared_ptr<GradientMachine>& gradientMachine,
+                   const std::shared_ptr<DataProvider>& dataProvider,
+                   const std::shared_ptr<DataProvider>& testDataProvider) {
+  this->stats_ = std::make_shared<TrainerStats>();
+
+  config_ = config;
+
+  config_->updateConfigFromFlags();
+
+  testing_ = testing;
+
+  // in testing, mode_ may GradientMachine::kTesting or
+  // GradientMachine::kSgdSparseCpuTraining
+
+  if (FLAGS_local) {
+    CHECK(!FLAGS_loadsave_parameters_in_pserver)
+        << "local and loadsave_parameters_in_pserver can not both true";
+    if (config_->getOptConfig().use_sparse_remote_updater()) {
+      config_->disableRemoteSparseUpdaterForEachParams();
+      LOG(INFO) << "ignore sparse_remote_update=true due to  --local=true";
+    }
+  }
+  if (FLAGS_loadsave_parameters_in_pserver) {
+    CHECK(config_->getOptConfig().use_sparse_remote_updater())
+        << "no parameter to load from pserver, please check network config";
+  }
+  if (testing && !FLAGS_loadsave_parameters_in_pserver) {
+    if (config_->getOptConfig().use_sparse_remote_updater()) {
+      config_->disableRemoteSparseUpdater();
+      LOG(INFO) << "because parameter is loaded local,"
+                << "tester ignore sparse_remote_update flag";
+    }
+  }
+
+  CHECK(TrainAlgorithm::isValid(config_->getOptConfig().algorithm()))
+      << "invalid algorithm configuration: "
+      << config_->getOptConfig().algorithm();
+
+  bool useSparseUpdater = false;
+  for (auto& paraConfig : config_->getModelConfig().parameters()) {
+    if (paraConfig.sparse_update() || paraConfig.sparse_remote_update()) {
+      useSparseUpdater = true;
+    }
+  }
+
+  if (FLAGS_use_mkldnn) {
+    CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer";
+  }
+
+  if (testing) {
+    LOG(INFO) << "trainer: in testing mode";
+    if (config_->getOptConfig().use_sparse_remote_updater() ||
+        FLAGS_trainer_count > 1) {
+      mode_ = GradientMachine::kSgdSparseCpuTraining;
+      LOG(INFO) << "trainer mode: SgdSparseCpuTraining";
+    } else {
+      mode_ = GradientMachine::kTesting;
+      LOG(INFO) << "trainer mode: Testing";
+    }
+  } else if (IGradientMachineMode::tryGetMode(
+                 (int*)&mode_,
+                 config_->getOptConfig().algorithm(),
+                 FLAGS_trainer_count,
+                 FLAGS_local,
+                 FLAGS_use_gpu)) {
+    LOG(INFO) << "Custom trainer mode.";
+  } else if ((config_->getOptConfig().algorithm() == TrainAlgorithm::SGD ||
+              config_->getOptConfig().algorithm() ==
+                  TrainAlgorithm::AsyncSGD) &&
+             useSparseUpdater) {
+    mode_ = GradientMachine::kSgdSparseCpuTraining;
+    LOG(INFO) << "trainer mode: SgdSparseCpuTraining";
+  } else {
+    mode_ = GradientMachine::kNormal;
+    LOG(INFO) << "trainer mode: Normal";
+  }
+
+  // initialize trainer internal
+  trainerInternal_.init(config_,
+                        gradientMachine,
+                        TrainerInternalConfig::createFromMode(mode_),
+                        stats_,
+                        testing);
+  std::unique_ptr<ParameterUtilConfig> paramConfig(
+      new ParameterUtilConfig(FLAGS_save_only_one,
+                              FLAGS_saving_period,
+                              FLAGS_loadsave_parameters_in_pserver,
+                              FLAGS_config));
+
+  paramUtil_.reset(
+      new paddle::ParameterUtil(config_,
+                                std::move(paramConfig),
+                                trainerInternal_.getGradientMachine(),
+                                trainerInternal_.getParameterUpdater()));
+
+  bool gpuData =
+      FLAGS_use_gpu && (!FLAGS_parallel_nn) &&
+      (!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count));
+
+  dataProvider_ = dataProvider;
+  if (!dataProvider_ && config_->hasDataConfig() && !testing_) {
+    dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData));
+  }
+  if (!testDataProvider_) {
+    // No evaluator_ if there is testDataProvider but no dataProvider.
+    evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator());
+    currentEvaluator_.reset(
+        trainerInternal_.getGradientMachine()->makeEvaluator());
+    if (FLAGS_average_test_period > 0 && FLAGS_trainer_id == 0 &&
+        config_->getOptConfig().average_window() > 0) {
+      CHECK_EQ(FLAGS_average_test_period % FLAGS_log_period, 0)
+          << "FLAGS_average_test_period must be divided by FALGS_log_period";
+      averageEvaluator_.reset(
+          trainerInternal_.getGradientMachine()->makeEvaluator());
+    }
+  }
+
+  testDataProvider_ = testDataProvider;
+  if (!testDataProvider_ && config_->hasTestDataConfig()) {
+    testDataProvider_.reset(
+        DataProvider::create(config_->getTestDataConfig(), *config_, gpuData));
+  }
+  if (testDataProvider_) {
+    createTester();
+  }
+
+  if (!testing &&
+      (trainerInternal_.getGradientMachine()->hasStaticParameters())) {
+    CHECK(!FLAGS_loadsave_parameters_in_pserver)
+        << "is_static and loadsave_parameters_in_pserver can not both true";
+  }
+  if (testing) {
+    // will load per pass for tester
+  } else if (paramUtil_->tryLoadParametersFromConfig()) {
+    // load from config already.
+  } else {
+    trainerInternal_.getGradientMachine()->randParameters();
+  }
+
+  // Only non static parameters need to be updated
+  std::vector<ParameterPtr>& parameters =
+      trainerInternal_.getGradientMachine()->getNonStaticParameters();
+  if (trainerInternal_.getParameterUpdater()) {
+    trainerInternal_.getParameterUpdater()->init(parameters);
+
+    if (FLAGS_loadsave_parameters_in_pserver && FLAGS_trainer_id == 0) {
+      if (testing) {
+        // will load per pass for tester
+      } else if (!config_->getConfig().init_model_path().empty() &&
+                 (FLAGS_local || FLAGS_trainer_id == 0)) {
+        paramUtil_->loadParametersWithPath(
+            config_->getConfig().init_model_path(),
+            false /*local*/,
+            true /*remote*/);
+      } else if (config_->getConfig().start_pass() > 0 &&
+                 (FLAGS_local || FLAGS_trainer_id == 0)) {
+        CHECK(paramUtil_->loadParameters(config_->getConfig().start_pass() - 1,
+                                         false /*local*/,
+                                         true /*remote*/));
+      } else {
+        trainerInternal_.getParameterUpdater()->randParametersRemote();
+      }
+    }
+  }
+
+  // set current evaluator and evalutor
+  trainerInternal_.setCurrentEvaluator(currentEvaluator_.get());
+  trainerInternal_.setEvaluator(evaluator_.get());
+}
+
+void Trainer::train(size_t numPasses) {
+  startTrain();
+  for (size_t i = 0; i < numPasses; ++i) {
+    if (IGradientMachineMode::trainWholeDataInOneBatch(mode_)) {
+      trainOnePassBatch(config_->getConfig().start_pass() + i);
+    } else {
+      trainOnePass();
+    }
+    if (i < numPasses - 1) {
+      dataProvider_->reset();
+    }
+  }
+
+  finishTrain();
+}
+
+static double genPerturbation(real* d, real* grad, size_t dim) {
+  auto& reng = ThreadLocalRandomEngine::get();
+  std::uniform_real_distribution<double> dist(-1, 1);
+  double gradNorm = 0, dNorm = 0;
+  for (size_t i = 0; i < dim; ++i) {
+    d[i] = dist(reng);
+    dNorm += d[i] * d[i];
+    gradNorm += grad[i] * grad[i];
+  }
+  if (gradNorm > 0) {
+    real s = 0.5 * sqrt(gradNorm / dNorm);
+    for (size_t i = 0; i < dim; ++i) {
+      d[i] = s * d[i] + grad[i];
+    }
+  }
+  double delta = 0;
+  for (size_t i = 0; i < dim; ++i) {
+    delta += grad[i] * d[i];
+  }
+  return delta;
+}
+
+real Trainer::checkGradient() {
+  trainerInternal_.getGradientMachine()->start();
+  std::vector<ParameterPtr>& parameters =
+      trainerInternal_.getGradientMachine()->getNonStaticParameters();
+  DataBatch dataBatch;
+  int32_t batchSize = config_->getOptConfig().batch_size();
+
+  dataProvider_->getNextBatch(batchSize, &dataBatch);
+
+  CHECK(dataBatch.getSize()) << "No data from data provider";
+  std::vector<Argument>& inArgs = dataBatch.getStreams();
+  std::vector<Argument> outArgs;
+
+  trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
+  real cost = Argument::sum(outArgs);
+  LOG(INFO) << "original cost=" << cost;
+  trainerInternal_.getGradientMachine()->backward();
+
+  real maxDiff = 0;
+  char fill = ' ';
+  for (auto& parameter : parameters) {
+    CpuVector oldPara(parameter->getSize());
+    CpuVector newPara(parameter->getSize());
+    oldPara.copyFrom(*parameter->getBuf(PARAMETER_VALUE));
+    real* newp = newPara.getData();
+    real* oldp = oldPara.getData();
+    CpuVector cpuGrad(*parameter->getBuf(PARAMETER_GRADIENT));
+    real* grad = cpuGrad.getData();
+    size_t dim = parameter->getSize();
+    std::vector<real> d(dim);
+
+    double delta = genPerturbation(d.data(), grad, dim);
+
+    // use a step such that delta / cost is FLAGS_checkgrad_eps
+    real step =
+        (delta != 0) ? cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps;
+    delta *= step;
+    for (size_t i = 0; i < dim; ++i) {
+      newp[i] = oldp[i] + step * d[i];
+    }
+
+    parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
+    parameter->setValueUpdated();
+    trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
+    real newCost1 = Argument::sum(outArgs);
+
+    for (size_t i = 0; i < dim; ++i) {
+      newp[i] = oldp[i] - step * d[i];
+    }
+
+    parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
+    parameter->setValueUpdated();
+    trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
+    real newCost2 = Argument::sum(outArgs);
+
+    real trueDelta = 0.5 * (newCost1 - newCost2);
+    real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
+    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(fill)
+              << std::setw(20) << parameter->getName()
+              << "step=" << std::setw(15) << step << "cost1=" << std::setw(10)
+              << newCost1 << "cost2=" << std::setw(10) << newCost2
+              << "true_delta=" << std::setw(15) << trueDelta
+              << "analytic_delta=" << std::setw(15) << delta << "diff=" << diff
+              << (std::abs(diff) > 0.01 ? " ***" : "");
+
+    maxDiff = std::max(maxDiff, std::abs(diff));
+
+    // restore parameter
+    parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara);
+    parameter->setValueUpdated();
+
+    fill = (fill == ' ') ? '.' : ' ';
+  }
+  return maxDiff;
+}
+
+void Trainer::startTrain() {
+  trainPassContext_.passId = config_->getConfig().start_pass();
+  srand(config_->getConfig().start_pass() + 1);
+  if (dataProvider_) {
+    dataProvider_->reset();
+  }
+
+  trainerInternal_.getGradientMachine()->start();
+}
+
+void Trainer::finishTrain() { trainerInternal_.getGradientMachine()->finish(); }
+
+void Trainer::startTrainPass() {
+  stats_->reset();
+  trainPassContext_.batchId = 0;
+  trainPassContext_.avgTestCost = 0;
+  trainPassContext_.numAvgTests = 0;
+  trainPassContext_.passInnerId = 1;
+
+  trainerInternal_.getParameterUpdater()->startPass();
+  evaluator_->start();
+  if (FLAGS_prev_batch_state) {
+    trainerInternal_.getGradientMachine()->resetState();
+    trainerInternal_.getGradientMachine()->getState(testState_);
+  }
+}
+
+void Trainer::trainOneDataBatch(DataBatch& dataBatch) {
+  int num = dataBatch.getSize();
+  if (averageEvaluator_) {
+    int64_t mod = trainPassContext_.batchId % FLAGS_average_test_period;
+    if (mod >= FLAGS_average_test_period - FLAGS_log_period) {
+      if (mod == FLAGS_average_test_period - FLAGS_log_period) {
+        averageEvaluator_->start();
+      }
+      trainerInternal_.getParameterUpdater()->apply();
+      if (FLAGS_prev_batch_state) {
+        trainerInternal_.getGradientMachine()->getState(trainState_);
+      }
+      trainPassContext_.avgTestCost += tester_->forwardOneBatch(
+          dataBatch, averageEvaluator_.get(), &forwardOutput_);
+      if (FLAGS_prev_batch_state) {
+        trainerInternal_.getGradientMachine()->setState(trainState_);
+      }
+      trainPassContext_.numAvgTests += num;
+      trainerInternal_.getParameterUpdater()->restore();
+    }
+  }
+  {
+    REGISTER_TIMER("TrainBatch");
+    trainerInternal_.trainOneBatch(
+        trainPassContext_.batchId, dataBatch, &forwardOutput_);
+  }
+
+  if (averageEvaluator_ &&
+      trainPassContext_.batchId % FLAGS_average_test_period ==
+          FLAGS_average_test_period - 1) {
+    averageEvaluator_->finish();
+    LOG(INFO) << " Averaged parameter:"
+              << " cost="
+              << trainPassContext_.avgTestCost / trainPassContext_.numAvgTests
+              << " Eval: " << *averageEvaluator_;
+    trainPassContext_.numAvgTests = 0;
+    trainPassContext_.avgTestCost = 0;
+  }
+
+  ++trainPassContext_.batchId;
+
+  if (trainPassContext_.batchId % FLAGS_log_period == 0) {
+    FOR_TIMING(globalStat.setThreadInfo(true));
+    FOR_TIMING(globalStat.printAllStatus());
+    FOR_TIMING(globalStat.reset());
+  }
+
+  if (testDataProvider_ && FLAGS_test_period > 0 &&
+      trainPassContext_.batchId % FLAGS_test_period == 0) {
+    tester_->testOnePeriod();
+  }
+
+  if (FLAGS_saving_period_by_batches > 0 &&
+      trainPassContext_.batchId >
+          FLAGS_saving_period_by_batches * trainPassContext_.passInnerId &&
+      0 == FLAGS_trainer_id) {
+    trainerInternal_.getParameterUpdater()->catchUpWith();
+    if (testDataProvider_) {
+      tester_->testOnePeriod();
+    }
+    paramUtil_->saveParametersOnePass(trainPassContext_.passId,
+                                      trainPassContext_.passInnerId);
+    ++trainPassContext_.passInnerId;
+  }
+}
+
+void Trainer::finishTrainPass() {
+  if (trainPassContext_.batchId == 0) {
+    // This means no more data from DataProvider
+    return;
+  }
+
+  trainerInternal_.finishTrainPass(trainPassContext_.passId,
+                                   trainPassContext_.batchId);
+
+  FOR_TIMING(globalStat.setThreadInfo(true));
+  FOR_TIMING(globalStat.printAllStatus());
+  FOR_TIMING(globalStat.reset());
+
+  if (testDataProvider_) {
+    tester_->testOnePeriod();
+  }
+
+  if (trainPassContext_.passId % FLAGS_saving_period == 0 &&
+      FLAGS_trainer_id == 0) {
+    paramUtil_->saveParametersOnePass(trainPassContext_.passId);
+  }
+  ++trainPassContext_.passId;
+}
+
+void Trainer::trainOnePass() {
+  startTrainPass();
+  size_t batchSize = config_->getOptConfig().batch_size();
+  while (true) {
+    DataBatch dataBatch;
+
+    int num = 0;
+    {
+      REGISTER_TIMER("getTrainBatch");
+      num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+    }
+    if (num == 0) break;
+    CHECK_EQ(num, dataBatch.getSize());
+    trainOneDataBatch(dataBatch);
+  }
+
+  finishTrainPass();
+}
+
+void Trainer::trainOnePassBatch(int passId) {
+  this->stats_->reset();
+
+  trainerInternal_.getParameterUpdater()->startPass();
+  const std::vector<Argument> inArgs;
+  {
+    REGISTER_TIMER("onePass");
+    trainerInternal_.getGradientMachine()->forwardBackward(
+        inArgs, nullptr, PASS_TRAIN, nullptr);
+  }
+
+  real cost = .0;
+  int64_t num = 0;
+  trainerInternal_.getGradientMachine()->getStats(cost, num);
+  *stats_ += {num, cost};
+
+  trainerInternal_.getGradientMachine()->onPassEnd();
+
+  bool accepted = trainerInternal_.getParameterUpdater()->finishPass();
+
+  globalStat.setThreadInfo(true);
+  globalStat.printAllStatus();
+  globalStat.reset();
+
+  LOG(INFO) << " Pass=" << passId
+            << " AcceptedPass=" << (accepted ? acceptedPassId_ : -1)
+            << stats_->getStats(false /*withCurrentCost*/);
+
+  if (accepted) {
+    if (acceptedPassId_ % FLAGS_saving_period == 0 && FLAGS_trainer_id == 0) {
+      paramUtil_->saveParameters(acceptedPassId_);
+    }
+    acceptedPassId_++;
+    if (FLAGS_save_only_one && acceptedPassId_ >= FLAGS_saving_period) {
+      paramUtil_->deleteParameters(acceptedPassId_ - FLAGS_saving_period);
+    }
+  }
+}
+
+real Trainer::calcGradient(const DataBatch& dataBatch,
+                           const Vector& value,
+                           Vector& gradient) {
+  CHECK_EQ(value.getSize(), gradient.getSize());
+  std::vector<ParameterPtr>& parameters =
+      trainerInternal_.getGradientMachine()->getParameters();
+
+  clearGradient();
+
+  size_t offset = 0;
+  size_t valueSize = value.getSize();
+
+  for (auto& para : parameters) {
+    CHECK_LE(offset + para->getSize(), valueSize);
+    VectorPtr val =
+        Vector::create(para->getSize(), value.getMemoryHandle(), offset);
+    para->getBuf(PARAMETER_VALUE)->copyFrom(*val);
+    para->setValueUpdated();
+    offset += para->getSize();
+  }
+
+  CHECK_EQ(offset, valueSize);
+
+  std::vector<Argument> inArgs = dataBatch.getStreams();
+  std::vector<Argument> outArgs;
+
+  trainerInternal_.getGradientMachine()->forwardBackward(
+      inArgs, &outArgs, PASS_TRAIN);
+  real cost = Argument::sum(outArgs);
+
+  offset = 0;
+  for (auto& para : parameters) {
+    VectorPtr grad =
+        Vector::create(para->getSize(), gradient.getMemoryHandle(), offset);
+    if (para->getBuf(PARAMETER_GRADIENT)) {
+      grad->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
+    }
+    offset += para->getSize();
+  }
+
+  return cost;
+}
+
+void Trainer::clearGradient() {
+  std::vector<ParameterPtr>& parameters =
+      trainerInternal_.getGradientMachine()->getNonStaticParameters();
+  for (auto& parameter : parameters) {
+    parameter->clearGradient();
+  }
+}
+
+int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); }
+
+void Trainer::createTester() {
+  tester_.reset(new paddle::Tester(config_,
+                                   createTesterConfig(),
+                                   trainerInternal_.getGradientMachine(),
+                                   trainerInternal_.getParameterUpdater(),
+                                   testDataProvider_));
+}
+
+void Trainer::test() { tester_->test(); }
+
+std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
+  TesterConfig* conf = new TesterConfig;
+  if (FLAGS_test_period) {
+    LOG(WARNING) << "The meaning of --test_period is changed: "
+                 << "if equal 0, do test on all test data at the end of "
+                 << "each pass. While if equal non-zero, do test on all test "
+                 << "data every test_period batches ";
+  }
+  if (FLAGS_test_all_data_in_one_period) {
+    LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since "
+                 << "we will always do test on all test set ";
+  }
+  conf->testPeriod = FLAGS_test_period;
+  conf->prevBatchState = FLAGS_prev_batch_state;
+  conf->logPeriod = FLAGS_log_period;
+  conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver;
+  conf->featFile = FLAGS_feat_file;
+  conf->predictOutputDir = FLAGS_predict_output_dir;
+  conf->trainerId = FLAGS_trainer_id;
+  conf->distributeTest = FLAGS_distribute_test;
+  conf->config = FLAGS_config;
+  conf->modelList = FLAGS_model_list;
+  conf->testPass = FLAGS_test_pass;
+  conf->numPasses = FLAGS_num_passes;
+  conf->savingPeriod = FLAGS_saving_period;
+  conf->testWait = FLAGS_test_wait;
+  conf->initModelPath = FLAGS_init_model_path;
+  conf->saveOnlyOne = FLAGS_save_only_one;
+  conf->testing = testing_;
+  conf->mode = mode_;
+  conf->trainState = &trainState_;
+  conf->testState = &testState_;
+  return std::unique_ptr<TesterConfig>(conf);
+}
+
+ParameterUtil* Trainer::getParameterUtilPtr() { return paramUtil_.get(); }
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/Trainer.h b/paddle/legacy/trainer/Trainer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b467f9af0cf12a39dd3d119c59e6cafcb05474b4
--- /dev/null
+++ b/paddle/legacy/trainer/Trainer.h
@@ -0,0 +1,204 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/utils/Util.h"
+
+#include <stdio.h>
+
+#include "hl_gpu.h"
+#include "paddle/legacy/gserver/dataproviders/DataProvider.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+
+#include <stdlib.h>
+#include <fstream>
+#include "ParamUtil.h"
+#include "ParameterUpdater.h"
+#include "Tester.h"
+#include "TrainerConfigHelper.h"
+#include "TrainerInternal.h"
+
+DECLARE_int32(num_passes);
+
+namespace paddle {
+
+/**
+ * Trainer Class
+ *
+ * Trainer combines GradientMachine, ParameterUpdater, DataProvider together to
+ * train/test a NeuralNetwork.
+ */
+class Trainer {
+ public:
+  /**
+   * Ctor.
+   * @return
+   */
+  Trainer() : acceptedPassId_(0) {}
+
+  virtual ~Trainer() {}
+
+  /**
+   * initialize a new trainer using config
+   *
+   * @param config TrainerConfig.
+   * @param testing true if only for testing
+   * @param gradientMachine GradientMachine that will be trained.
+   *                        nullptr if create from config.
+   * @param dataProvider Train Data Provider. null if create from config.
+   * @param testDataProvider Test Data Provider. null if create from config.
+   */
+  virtual void init(
+      const std::shared_ptr<TrainerConfigHelper>& config,
+      bool testing = false,
+      const std::shared_ptr<GradientMachine>& gradientMachine = nullptr,
+      const std::shared_ptr<DataProvider>& dataProvider = nullptr,
+      const std::shared_ptr<DataProvider>& testDataProvider = nullptr);
+
+  /**
+   * Train until num_passes reached.
+   * One pass means neural network train through all training data.
+   *
+   * @param numPasses the number of traning pass.
+   * @note Durning neural network training, the num passes may set a very large
+   * value, and kill training process when result is good enough.
+   */
+  void train(size_t numPasses = (size_t)FLAGS_num_passes);
+
+  /**
+   * compare the gradient from bp with finite difference
+   * @return  the maximal difference
+   */
+  real checkGradient();
+
+  void startTrain();
+  void finishTrain();
+  void startTrainPass();
+  void finishTrainPass();
+  void trainOneDataBatch(DataBatch& dataBatch);
+  void time();
+
+  /**
+   * given a dataBatch and the current parameter value
+   * calculate its gradient and return the cost.
+   *
+   * TODO(yuyang18): I think this method is deprecated and buggy. Should it be
+   * removed?
+   */
+  real calcGradient(const DataBatch& dataBatch,
+                    const Vector& value,
+                    Vector& gradient);
+
+  /**
+   * Get Trainer Config.
+   */
+  const TrainerConfig& getConfig() const { return config_->getConfig(); }
+
+  /**
+   * Get Train Data Provider
+   */
+  const DataProviderPtr& getDataProvider() { return dataProvider_; }
+
+  /**
+   * Get Gradient Machine.
+   */
+  const GradientMachinePtr& getGradientMachine() {
+    return trainerInternal_.getGradientMachine();
+  }
+
+  /**
+   * Get batch size in optimization config.
+   * @note This method didn't return the actual batch size. Just batch size
+   * set in the optimization config. The actual batch size in one trainer may
+   * less than batch size in config due to there are not enough data.
+   */
+  int getBatchSize();
+
+  /**
+   * Do test job
+   */
+  void test();
+
+  /**
+   * Get parameter util ptr
+   *
+   * TODO(yuyang18): Make it return a smart pointer.
+   */
+  ParameterUtil* getParameterUtilPtr();
+
+ protected:
+  /**
+   * Train one pass of data.
+   *
+   * SGD Method.
+   */
+  void trainOnePass();
+
+  /**
+   * Train one pass in one batch.
+   *
+   */
+  void trainOnePassBatch(int passId);
+
+  /**
+   * set parameter gradient to zero
+   */
+  void clearGradient();
+
+  void createTester();
+
+ private:
+  std::unique_ptr<TesterConfig> createTesterConfig();
+
+ protected:
+  std::shared_ptr<TrainerConfigHelper> config_;
+  std::shared_ptr<TrainerStats> stats_;
+
+  DataProviderPtr dataProvider_;
+  DataProviderPtr testDataProvider_;
+  MachineState trainState_;
+  MachineState testState_;
+
+  struct TrainPassContext {
+    int64_t batchId;
+    real avgTestCost;
+    int64_t numAvgTests;
+    int passId;
+    int passInnerId;
+  };
+  std::vector<paddle::Argument> forwardOutput_;
+
+  TrainPassContext trainPassContext_;
+
+  std::unique_ptr<Evaluator> evaluator_;
+  std::unique_ptr<Evaluator> currentEvaluator_;
+  std::unique_ptr<Evaluator> averageEvaluator_;
+  // training mode
+  // used to decide which GradientMachine and ParameterUpdater to create
+  GradientMachine::CreateMode mode_;
+  int testing_;
+  int acceptedPassId_;
+
+  // trainer tester
+  std::unique_ptr<Tester> tester_;
+
+  // parameter util
+  std::unique_ptr<ParameterUtil> paramUtil_;
+
+  // trainer Internal
+  TrainerInternal trainerInternal_;
+};
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerBenchmark.cpp b/paddle/legacy/trainer/TrainerBenchmark.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7f5bd2335481c417b466ac4ca9ca54798524045f
--- /dev/null
+++ b/paddle/legacy/trainer/TrainerBenchmark.cpp
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#undef PADDLE_DISABLE_TIMER
+
+#include "Trainer.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+
+DECLARE_int32(test_period);
+
+DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
+
+namespace paddle {
+
+void Trainer::time() {
+  startTrain();
+
+  trainerInternal_.getParameterUpdater()->startPass();
+  evaluator_->start();
+
+  DataBatch dataBatch;
+  int32_t batchSize = config_->getOptConfig().batch_size();
+  int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+  CHECK_EQ(num, batchSize) << "The sample number is less than batch size "
+                           << num << " != " << batchSize;
+
+  CHECK(dataBatch.getSize()) << "No data from data provider";
+
+  std::vector<paddle::Argument> outputs;
+  // burning time
+  LOG(INFO) << "Burning time...";
+  for (int n = 0; n < 10; ++n) {
+    trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
+  }
+  LOG(INFO) << "Burning time end.";
+
+  for (int n = 0; n < FLAGS_test_period; n++) {
+    if (FLAGS_feed_data) {
+      REGISTER_TIMER("GetData");
+      num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+    }
+
+    if (num != batchSize) {
+      break;
+    }
+
+    {
+      REGISTER_TIMER("FwdBwd");
+      trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
+    }
+  }
+  globalStat.setThreadInfo(true);
+  globalStat.printSegTimerStatus();
+  globalStat.reset();
+
+  finishTrain();
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerConfigHelper.cpp b/paddle/legacy/trainer/TrainerConfigHelper.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d31ba8d71d52ac51191affc612a79b6734dee74
--- /dev/null
+++ b/paddle/legacy/trainer/TrainerConfigHelper.cpp
@@ -0,0 +1,199 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TrainerConfigHelper.h"
+#include "ParamUtil.h"
+#include "TrainerConfig.pb.h"
+#include "paddle/legacy/utils/Flags.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);
+DECLARE_string(save_dir);
+DECLARE_int32(trainer_id);
+DECLARE_bool(local);
+DECLARE_bool(with_cost);
+DECLARE_bool(with_gpu);
+DECLARE_bool(parallel_nn);
+DECLARE_string(config_args);
+DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkl_packed);
+
+const char *kConfigParserModuleName = "paddle.trainer.config_parser";
+const char *kConfigParserFuncName = "parse_config_and_serialize";
+
+namespace paddle {
+
+struct TrainerConfigHelperPrivate {
+  TrainerConfig conf;
+};
+
+TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
+    : m(new TrainerConfigHelperPrivate()) {
+  std::ostringstream configArgs;
+  configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local
+             << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
+             << ",parallel_nn=" << FLAGS_parallel_nn
+             << ",use_mkldnn=" << FLAGS_use_mkldnn
+             << ",use_mkl_packed=" << FLAGS_use_mkl_packed
+             << ",cudnn_version=" << hl_get_cudnn_lib_version();
+  if (!FLAGS_config_args.empty()) {
+    configArgs << "," << FLAGS_config_args;
+  }
+
+  VLOG(3) << "Parsing trainer config " << configFilePath;
+  std::string configProtoStr =
+      callPythonFunc(kConfigParserModuleName,
+                     kConfigParserFuncName,
+                     {configFilePath, configArgs.str()});
+  CHECK(m->conf.ParseFromString(configProtoStr));
+}
+
+TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config)
+    : m(new TrainerConfigHelperPrivate()) {
+  m->conf = config;
+}
+
+TrainerConfigHelper::~TrainerConfigHelper() { delete m; }
+
+const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; }
+
+TrainerConfig &TrainerConfigHelper::getMutableConfig() { return m->conf; }
+
+const OptimizationConfig &TrainerConfigHelper::getOptConfig() const {
+  return m->conf.opt_config();
+}
+
+const ModelConfig &TrainerConfigHelper::getModelConfig() const {
+  return m->conf.model_config();
+}
+
+const DataConfig *TrainerConfigHelper::getDataConfigPtr() const {
+  if (m->conf.has_data_config()) {
+    return &m->conf.data_config();
+  } else {
+    return nullptr;
+  }
+}
+
+const DataConfig &TrainerConfigHelper::getTestDataConfig() const {
+  CHECK(m->conf.has_test_data_config());
+  return m->conf.test_data_config();
+}
+
+bool TrainerConfigHelper::hasDataConfig() const {
+  return m->conf.has_data_config();
+}
+
+bool TrainerConfigHelper::hasTestDataConfig() const {
+  return m->conf.has_test_data_config();
+}
+
+void TrainerConfigHelper::updateConfigFromFlags() {
+  if (!FLAGS_save_dir.empty()) {
+    m->conf.set_save_dir(FLAGS_save_dir);
+  }
+  if (!FLAGS_init_model_path.empty()) {
+    m->conf.set_init_model_path(FLAGS_init_model_path);
+  }
+  if (FLAGS_start_pass != 0) {
+    m->conf.set_start_pass(FLAGS_start_pass);
+  }
+}
+
+void TrainerConfigHelper::disableRemoteSparseUpdater() {
+  m->conf.mutable_opt_config()->set_use_sparse_remote_updater(false);
+}
+
+void TrainerConfigHelper::disableRemoteSparseUpdaterForEachParams() {
+  this->disableRemoteSparseUpdater();
+  for (int i = 0; i < m->conf.model_config().parameters_size(); ++i) {
+    m->conf.mutable_model_config()
+        ->mutable_parameters(i)
+        ->set_sparse_remote_update(false);
+  }
+}
+
+OptimizationConfig &TrainerConfigHelper::getOptConfig() {
+  return *m->conf.mutable_opt_config();
+}
+
+void TrainerConfigHelper::setSaveDir(const std::string &saveDir) {
+  m->conf.set_save_dir(saveDir);
+}
+
+const std::string &TrainerConfigHelper::getSaveDir() const {
+  return m->conf.save_dir();
+}
+
+std::string TrainerConfigHelper::getConfigNameFromPath(
+    const std::string &modelPath) {
+  std::ifstream s(path::join(modelPath, "path.txt"));
+  CHECK(s.is_open()) << " fail to open path.txt";
+  std::string ss;
+  getline(s, ss);
+  VLOG(3) << "fileName " << path::join(modelPath, ss);
+  s.close();
+  return path::join(modelPath, ss);
+}
+
+std::string TrainerConfigHelper::getConfigNameFromPassId(
+    int passId, const std::string &modelPath) {
+  constexpr int kBufLen = 100;
+  char buf[kBufLen];
+  snprintf(buf, kBufLen, "pass-%05d", passId);
+  return TrainerConfigHelper::getConfigNameFromPath(path::join(modelPath, buf));
+}
+
+std::string TrainerConfigHelper::getConfigName(bool *ok) const {
+  std::string retv = "";
+
+  if (!m->conf.config_file().empty()) {
+    retv = m->conf.config_file();
+  } else if (!m->conf.init_model_path().empty()) {
+    retv = getConfigNameFromPath(m->conf.init_model_path());
+  } else if (m->conf.start_pass() >= 1) {
+    retv = getConfigNameFromPassId(m->conf.start_pass(), m->conf.save_dir());
+  }
+
+  if (ok) {
+    *ok = !retv.empty();
+  }
+
+  return retv;
+}
+
+std::shared_ptr<TrainerConfigHelper> TrainerConfigHelper::createFromFlags() {
+  std::string configPath;
+  if (!FLAGS_config.empty()) {
+    configPath = FLAGS_config;
+  } else if (!FLAGS_init_model_path.empty()) {
+    configPath = getConfigNameFromPath(FLAGS_init_model_path);
+  } else if (FLAGS_start_pass >= 1) {
+    configPath =
+        getConfigNameFromPassId(FLAGS_start_pass - 1, FLAGS_init_model_path);
+  } else {
+    return nullptr;
+  }
+  return std::make_shared<TrainerConfigHelper>(configPath);
+}
+
+std::shared_ptr<TrainerConfigHelper>
+TrainerConfigHelper::createFromFlagConfig() {
+  CHECK(!FLAGS_config.empty());
+  return std::make_shared<TrainerConfigHelper>(FLAGS_config);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerConfigHelper.h b/paddle/legacy/trainer/TrainerConfigHelper.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e428bea2c4b44bf98772ccca8f8b10d315efbbd
--- /dev/null
+++ b/paddle/legacy/trainer/TrainerConfigHelper.h
@@ -0,0 +1,205 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <paddle/legacy/utils/Logging.h>
+#include <paddle/legacy/utils/Util.h>
+#include <memory>
+
+namespace paddle {
+
+class TrainerConfig;
+class OptimizationConfig;
+struct TrainerConfigHelperPrivate;
+class ModelConfig;
+class DataConfig;
+
+/**
+ * @brief TrainerConfig Helper. A class wrap protobuf's TrainerConfig Object,
+ * simplize the usage for TrainerConfig.
+ *
+ * The all operation to TrainerConfig object should use this object. It remove
+ * many copy & paste code in trainer.
+ *
+ * @TODO(yuyang18): Make cmake check compiler support keyword 'final' or not.
+ * Define a macro to unify 'final' keyword
+ */
+class TrainerConfigHelper /*final*/ {
+ public:
+  DISABLE_COPY(TrainerConfigHelper);
+
+  /**
+   * @brief Ctor, Create a TrainerConfig from config file
+   * @param configFilePath Config file path.
+   */
+  explicit TrainerConfigHelper(const std::string& configFilePath);
+  explicit TrainerConfigHelper(const TrainerConfig& config);
+
+  /**
+   * Dtor
+   * @warning this class is a final class. Should not be inherited.
+   */
+  ~TrainerConfigHelper();
+
+  /**
+   * @brief Get Trainer Config itself.
+   */
+  const TrainerConfig& getConfig() const;
+
+  TrainerConfig& getMutableConfig();
+
+  /**
+   * @brief Get Optimizer Config.
+   */
+  const OptimizationConfig& getOptConfig() const;
+
+  /**
+   * @brief Get Model Config.
+   */
+  const ModelConfig& getModelConfig() const;
+
+  /**
+   * @brief Get Train Data Config Pointer.
+   * @return nullptr if there is no train data. Else will return pointer
+   */
+  const DataConfig* getDataConfigPtr() const;
+
+  /**
+   * @brief Get Tain Data Config.
+   * @warning Core when there is no train data.
+   */
+  const DataConfig& getDataConfig() const {
+    CHECK(this->hasDataConfig());
+    auto conf = this->getDataConfigPtr();
+    return *conf;
+  }
+
+  /**
+   * @brief Get test data config
+   * @warning Core when there is no test data.
+   */
+  const DataConfig& getTestDataConfig() const;
+
+  /**
+   * @brief Has train data config or not.
+   * @return true if has train data.
+   */
+  bool hasDataConfig() const;
+
+  /**
+   * @brief Has test data config or not.
+   * @return true if has test data.
+   */
+  bool hasTestDataConfig() const;
+
+  /**
+   * @brief Update trainer config from command line flags.
+   *        Override config's (save_dir, init_model_path, start_pass) if command
+   *        flags is existed.
+   */
+  void updateConfigFromFlags();
+
+  /**
+   * @brief Disable optimization's sparse remote update.
+   */
+  void disableRemoteSparseUpdater();
+
+  /**
+   * @brief Disable optimization and each parameter's sparse remote update.
+   */
+  void disableRemoteSparseUpdaterForEachParams();
+
+  /**
+   * @brief implicit conversion.
+   */
+  inline operator const TrainerConfig&() const { return this->getConfig(); }
+
+  /**
+   * @brief implicit conversion.
+   */
+  inline operator const OptimizationConfig&() const {
+    return this->getOptConfig();
+  }
+
+  /**
+   * @brief implicit conversion.
+   */
+  inline operator const DataConfig&() const { return this->getDataConfig(); }
+
+  /**
+   * @brief implicit conversion.
+   */
+  inline operator const ModelConfig&() const { return this->getModelConfig(); }
+
+  /**
+   * @brief Get mutable optimization config.
+   */
+  OptimizationConfig& getOptConfig();
+
+  /**
+   * @brief set model save directory.
+   * @param saveDir Directory path.
+   */
+  void setSaveDir(const std::string& saveDir);
+
+  /**
+   * @brief get model save directory.
+   * @return save directory path.
+   */
+  const std::string& getSaveDir() const;
+
+  /**
+   * @brief Get config file name from model path.
+   *
+   * Paddle save model to a directory, and write a file 'path.txt' which save
+   * config filename.
+   *
+   * @param modelPath model saved directory.
+   * @return config file name.
+   */
+  static std::string getConfigNameFromPath(const std::string& modelPath);
+
+  /**
+   * @brief Get config file name from this config instance.
+   * @param[out] ok true if no error.
+   * @return config file name.
+   */
+  std::string getConfigName(bool* ok = nullptr) const;
+
+  /**
+   * @brief Try to create TrainerConfigHelper from all command line flags.
+   *        Try to load from --config, --init_model_path, --start_pass one by
+   *        one. Return nullptr if cannot load TrainerConfigHelper from all
+   *        these place.
+   * @return nullptr if cannot load, otherwise return a TrainerConfigHelper.
+   */
+  static std::shared_ptr<TrainerConfigHelper> createFromFlags();
+
+  /**
+   * @brief Try to create TrainerConfigHelper only from '--config' flag.
+   * @return nullptr if cannot load, otherwise return a TrainerConfigHelper.
+   */
+  static std::shared_ptr<TrainerConfigHelper> createFromFlagConfig();
+
+ private:
+  static std::string getConfigNameFromPassId(int passId,
+                                             const std::string& modelPath);
+
+  TrainerConfigHelperPrivate* m;
+};
+
+typedef std::shared_ptr<TrainerConfigHelper> TrainerConfigHelperPtr;
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternal.cpp b/paddle/legacy/trainer/TrainerInternal.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ee3dea6340167ab16d2bfefe3d757b10f5d90bb5
--- /dev/null
+++ b/paddle/legacy/trainer/TrainerInternal.cpp
@@ -0,0 +1,303 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TrainerInternal.h"
+
+#include <fenv.h>
+#include <stdio.h>
+
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+
+#include <google/protobuf/text_format.h>
+
+#include "paddle/legacy/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/legacy/gserver/layers/ValidationLayer.h"
+#include "paddle/legacy/utils/GlobalConstants.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+#include "paddle/legacy/utils/Stat.h"
+#include "paddle/legacy/utils/Util.h"
+
+#include "RemoteParameterUpdater.h"
+#include "ThreadParameterUpdater.h"
+
+namespace paddle {
+
+void TrainerInternal::init(const std::shared_ptr<TrainerConfigHelper>& config,
+                           const GradientMachinePtr& gradientMachine,
+                           std::unique_ptr<TrainerInternalConfig>&& intconfig,
+                           const std::shared_ptr<TrainerStats>& stats,
+                           bool testing) {
+  config_ = config;
+  intconfig_ = std::move(intconfig);
+  stats_ = stats;
+
+  //! in training will use parameter updater definitly.
+  //! But only use parameter in testing mode when some parameter in pserver.
+  if (!testing || (config_->getOptConfig().use_sparse_remote_updater() &&
+                   intconfig_->loadsave_parameters_in_pserver)) {
+    createParameterUpdater(testing);
+  }
+
+  gradientMachine_ = gradientMachine;
+  if (!gradientMachine) {
+    CHECK(config_->getConfig().has_model_config())
+        << "Missing model_config in trainer_config";
+    gradientMachine_.reset(
+        GradientMachine::create(config_->getConfig().model_config(),
+                                intconfig_->mode,
+                                parameterUpdater_->getParameterTypes()));
+  }
+}
+
+void TrainerInternal::trainOneBatch(int64_t batchId,
+                                    const DataBatch& dataBatch,
+                                    std::vector<Argument>* outArgs) {
+  // true means updating parameter whenever gradient is ready during backward()
+  bool doPipelineUpdate =
+      (intconfig_->mode != GradientMachine::kSgdSparseCpuTraining) &&
+      (intconfig_->local || intconfig_->use_gpu ||
+       intconfig_->trainer_count <= 1);
+
+  int64_t actualBatchSize = dataBatch.getSize();
+  if (actualBatchSize == 0) {
+    return;
+  }
+
+  bool showStats = intconfig_->show_param_stats_period > 0 &&
+                   (batchId + 1) % intconfig_->show_param_stats_period == 0 &&
+                   intconfig_->trainer_id == 0;
+
+  std::vector<ParaStat> paraStats;
+  if (showStats) {
+    paraStats.resize(gradientMachine_->getParameters().size());
+  }
+
+  const std::vector<Argument>& inArgs = dataBatch.getStreams();
+
+  PassType passType = parameterUpdater_->startBatch(actualBatchSize);
+
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    REGISTER_TIMER("prefetch");
+    gradientMachine_->prefetch(inArgs);
+    parameterUpdater_->getParametersRemote();
+  }
+
+  UpdateCallback updateCallback = [this, showStats, &paraStats](
+      Parameter* para) {
+    if (showStats) {
+      //! @TODO(yuyang18) Show stats is actually a ParameterHook, refactor
+      // it
+      //! to ParameterHook.
+      auto& grad = para->getBuf(PARAMETER_GRADIENT);
+      SetDevice device(para->getDeviceId());
+      paraStats[para->getID()].avgAbsGrad = grad->getAbsSum() / para->getSize();
+      paraStats[para->getID()].maxAbsGrad = grad->getAbsMax();
+    }
+    parameterUpdater_->update(para);
+  };
+
+  {
+#ifndef PADDLE_DISABLE_TIMER
+    Timer timer;
+    timer.start();
+#endif
+    REGISTER_TIMER("forwardBackward");
+    forwardBackwardBatch(
+        inArgs, *outArgs, passType, updateCallback, doPipelineUpdate);
+#ifndef PADDLE_DISABLE_TIMER
+    timer.stop();
+    parameterUpdater_->setForwardbackwardTime(timer.get());
+#endif
+  }
+
+  if (!doPipelineUpdate) {
+    auto& parameters = gradientMachine_->getNonStaticParameters();
+    for (auto& para : parameters) {
+      updateCallback(para.get());
+    }
+  }
+
+  real cost = 0;
+  {
+    REGISTER_TIMER("sumCost");
+    cost = Argument::sum(*outArgs);
+  }
+
+  if (batchId % intconfig_->log_period == 0) {
+    currentEvaluator_->start();
+    stats_->resetCurrentStat();
+  }
+  {
+    REGISTER_TIMER("eval");
+    gradientMachine_->eval(currentEvaluator_);
+    gradientMachine_->eval(evaluator_);
+  }
+
+  *stats_ += {actualBatchSize, cost};
+  {
+    REGISTER_TIMER("finishBatch");
+    parameterUpdater_->finishBatch(cost);
+  }
+
+  if (showStats) {
+    showParameterStats(paraStats);
+  }
+  if ((batchId + 1) % intconfig_->log_period == 0) {
+    currentEvaluator_->finish();
+
+    if (intconfig_->dot_period > 0) {
+      std::cerr << std::endl;
+    }
+    LOG(INFO) << " Batch=" << batchId + 1 << " " << *stats_
+              << " Eval: " << *evaluator_
+              << " CurrentEval: " << *currentEvaluator_;
+  } else if (intconfig_->dot_period > 0 &&
+             (batchId + 1) % intconfig_->dot_period == 0) {
+    std::cerr << ".";
+  }
+}
+
+/**
+ * finish train pass
+ */
+void TrainerInternal::finishTrainPass(int passId, int batchId) {
+  gradientMachine_->onPassEnd();
+  parameterUpdater_->finishPass();
+  evaluator_->finish();
+  LOG(INFO) << " Pass=" << passId << " Batch=" << batchId << " "
+            << stats_->getStats(false /*without current cost*/)
+            << " Eval: " << *evaluator_;
+}
+
+void TrainerInternal::showParameterStats(
+    const std::vector<ParaStat>& paraStats) {
+  std::vector<ParameterPtr>& parameters = gradientMachine_->getParameters();
+  for (auto& parameter : parameters) {
+    SetDevice device(parameter->getDeviceId());
+    real sum = parameter->getBuf(PARAMETER_VALUE)->getAbsSum();
+    const auto& lr = parameter->getBuf(PARAMETER_LEARNING_RATE);
+    std::ostringstream osLrHistogram;
+    if (lr) {
+      if (VLOG_IS_ON(2)) {
+        osLrHistogram << " lr_histogram: ";
+        lr->histogram(osLrHistogram);
+      } else {
+        osLrHistogram << " max_lr=" << std::setw(11) << lr->getMax()
+                      << " min_lr=" << std::setw(11) << lr->getMin()
+                      << " avg_lr=" << std::setw(11)
+                      << lr->getSum() / parameter->getSize();
+      }
+    }
+    int pid = parameter->getID();
+    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
+              << std::setw(20) << parameter->getName()
+              << " avg_abs_val=" << std::setw(11) << sum / parameter->getSize()
+              << " max_val=" << std::setw(11)
+              << parameter->getBuf(PARAMETER_VALUE)->getAbsMax()
+              << " avg_abs_grad=" << std::setw(11) << paraStats[pid].avgAbsGrad
+              << " max_grad=" << std::setw(11) << paraStats[pid].maxAbsGrad
+              << osLrHistogram.str();
+  }
+}
+
+void TrainerInternal::createParameterUpdater(bool testing) {
+  const std::string& alg = config_->getOptConfig().algorithm();
+  parameterUpdater_.reset(ParameterUpdaterCreators::tryCreateUpdater(
+      alg, config_->getOptConfig(), intconfig_->local, intconfig_->num_passes));
+  if (parameterUpdater_) {
+    return;
+  }
+
+  if (!intconfig_->local) {
+    if (testing && config_->getOptConfig().use_sparse_remote_updater()) {
+      std::unique_ptr<ParameterUpdater> localUpdater;
+      localUpdater.reset(
+          new SgdLocalUpdater(config_->getOptConfig()));  // do nothing
+      parameterUpdater_.reset(
+          new SparseRemoteParameterUpdaterComposite(config_->getOptConfig(),
+                                                    intconfig_->num_passes,
+                                                    testing,
+                                                    std::move(localUpdater)));
+    } else {
+      if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode &&
+          !intconfig_->use_old_updater) {
+        intconfig_->use_old_updater = true;
+        LOG(INFO) << "Sgd sparse training can not work with"
+                  << " ConcurrentRemoteParameterUpdater,"
+                  << " automatically reset --use_old_updater=true";
+      }
+
+      std::unique_ptr<ParameterUpdater> localUpdater;
+      if (config_->getOptConfig().num_batches_per_send_parameter() > 1) {
+        CHECK(alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD)
+            << "Unsupported algorithm in remote-local mode: " << alg;
+        if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) {
+          localUpdater.reset(new SgdThreadUpdater(*config_));
+        } else {
+          localUpdater.reset(new SgdLocalUpdater(*config_));
+        }
+      }
+
+      localUpdater.reset(
+          intconfig_->use_old_updater
+              ? new RemoteParameterUpdater(
+                    *config_, intconfig_->num_passes, std::move(localUpdater))
+              : new ConcurrentRemoteParameterUpdater(
+                    *config_, intconfig_->num_passes, std::move(localUpdater)));
+
+      if (config_->getOptConfig().use_sparse_remote_updater()) {
+        localUpdater.reset(
+            new SparseRemoteParameterUpdaterComposite(*config_,
+                                                      intconfig_->num_passes,
+                                                      testing,
+                                                      std::move(localUpdater)));
+      }
+
+      this->parameterUpdater_ = std::move(localUpdater);
+    }
+  } else {
+    CHECK_EQ(config_->getOptConfig().num_batches_per_send_parameter(), 1)
+        << "num_batches_per_send_parameter should be one in local mode!";
+
+    if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) {
+      parameterUpdater_.reset(new SgdThreadUpdater(*config_));
+    } else if (alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD) {
+      if (config_->getModelConfig().type() == "recursive_nn") {
+        parameterUpdater_.reset(new SgdCpuUpdater(*config_));
+      } else if (intconfig_->use_gpu &&
+                 config_->getOptConfig().do_average_in_cpu() &&
+                 config_->getOptConfig().average_window() > 0) {
+        parameterUpdater_.reset(new SgdUpdaterWithCpuAverager(*config_));
+      } else {
+        parameterUpdater_.reset(new SgdLocalUpdater(*config_));
+      }
+    } else {
+      LOG(FATAL) << "Unsupported algorithm in local mode: " << alg;
+    }
+  }
+}
+
+void TrainerInternal::forwardBackwardBatch(const std::vector<Argument>& inArgs,
+                                           std::vector<Argument>& outArgs,
+                                           PassType& passType,
+                                           UpdateCallback updateCallback,
+                                           bool doPipelineUpdate) {
+  gradientMachine_->forwardBackward(
+      inArgs, &outArgs, passType, doPipelineUpdate ? updateCallback : nullptr);
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/trainer/TrainerInternal.h b/paddle/legacy/trainer/TrainerInternal.h
new file mode 100644
index 0000000000000000000000000000000000000000..93919a68fca2930cdf106f45d112e2a459fe695a
--- /dev/null
+++ b/paddle/legacy/trainer/TrainerInternal.h
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/utils/Util.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+
+#include "ParameterUpdater.h"
+#include "TrainerConfig.pb.h"
+#include "TrainerConfigHelper.h"
+#include "TrainerInternalConfig.h"
+#include "hl_gpu.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+
+namespace paddle {
+
+/**
+ * TrainerInteral
+ * the core training class for driving training logic
+ */
+class TrainerInternal {
+ public:
+  struct ParaStat {
+    real maxAbsGrad;
+    real avgAbsGrad;
+    ParaStat() : maxAbsGrad(.0), avgAbsGrad(.0) {}
+  };
+
+  TrainerInternal() {}
+
+  /**
+   * Intializes trainer internal class
+   * @param config network config
+   * @param machine gradient machine
+   * @param intconfig training config
+   * @param stats training stats
+   * @param testing if it is in testing phase
+   */
+  void init(const std::shared_ptr<TrainerConfigHelper>& config,
+            const GradientMachinePtr& machine,
+            std::unique_ptr<TrainerInternalConfig>&& intconfig,
+            const std::shared_ptr<TrainerStats>& stats,
+            bool testing);
+
+  virtual ~TrainerInternal() {}
+
+  /**
+   * CreateParameterUpdater
+   * @param testing if it is in testing phase
+   */
+  void createParameterUpdater(bool testing);
+
+  /**
+   * FinishTrainPass
+   * @param passId current pass id
+   * @param batchId current batch id, starts from 0
+   */
+  void finishTrainPass(int passId, int batchId);
+
+  /**
+   * trainOneBatch
+   * @param batchId current batch id
+   * @param dataBatch data for the batch
+   */
+  void trainOneBatch(int64_t batchId,
+                     const DataBatch& dataBatch,
+                     std::vector<Argument>* outArgs);
+
+  /**
+   * showParameterStats
+   * @param paraStats training stats
+   */
+  void showParameterStats(const std::vector<ParaStat>& paraStats);
+
+  /**
+   * getGradientMachine
+   */
+  inline const GradientMachinePtr& getGradientMachine() const {
+    return gradientMachine_;
+  }
+
+  /**
+   * getParameterUpdater
+   */
+  inline const std::shared_ptr<ParameterUpdater>& getParameterUpdater() {
+    return parameterUpdater_;
+  }
+
+  /**
+   * setCurrentEvaluator
+   * @param eval evaluator to set
+   */
+  inline void setCurrentEvaluator(Evaluator* eval) { currentEvaluator_ = eval; }
+
+  /**
+   * setEvaluator
+   * @param eval evaluator to set
+   */
+  inline void setEvaluator(Evaluator* eval) { evaluator_ = eval; }
+
+  /**
+   * forwardBackwardBatch
+   * @param inArgs input argument for data batch
+   * @param outArgs output argument from neural network
+   * @param updateCallback layerwise parameter gradient statistics
+   * @param doPipelineUpdate whether to do pipeline update
+   */
+  virtual void forwardBackwardBatch(const std::vector<Argument>& inArgs,
+                                    std::vector<Argument>& outArgs,
+                                    PassType& passType,
+                                    UpdateCallback updateCallback,
+                                    bool doPipelineUpdate);
+
+ protected:
+  std::shared_ptr<ParameterUpdater> parameterUpdater_;
+  GradientMachinePtr gradientMachine_;
+  std::shared_ptr<TrainerConfigHelper> config_;
+  std::unique_ptr<TrainerInternalConfig> intconfig_;
+  std::shared_ptr<TrainerStats> stats_;
+  Evaluator* currentEvaluator_;
+  Evaluator* evaluator_;
+};
+
+}  // namespace paddle
diff --git a/paddle/trainer/TrainerInternalConfig.cpp b/paddle/legacy/trainer/TrainerInternalConfig.cpp
similarity index 100%
rename from paddle/trainer/TrainerInternalConfig.cpp
rename to paddle/legacy/trainer/TrainerInternalConfig.cpp
diff --git a/paddle/legacy/trainer/TrainerInternalConfig.h b/paddle/legacy/trainer/TrainerInternalConfig.h
new file mode 100644
index 0000000000000000000000000000000000000000..b91b53932381a8698b331a2989b5f16829c06a25
--- /dev/null
+++ b/paddle/legacy/trainer/TrainerInternalConfig.h
@@ -0,0 +1,233 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/legacy/utils/Util.h"
+
+#include <stdio.h>
+
+#include "hl_gpu.h"
+#include "paddle/legacy/gserver/gradientmachines/GradientMachine.h"
+
+#include "TrainerConfig.pb.h"
+
+#include <stdlib.h>
+#include <fstream>
+#include <sstream>
+#include "ParameterUpdater.h"
+
+namespace paddle {
+/**
+ * @brief TrainerStats object will statistics sample processed and total cost.
+ *
+ * There are two stats in it, the 'AvgCost' and 'CurrentAvgCost'. 'AvgCost'
+ * means cost through one pass(all mini-batches). 'CurrentAvgCost' means cost
+ * through one mini-batch.
+ */
+class TrainerStats {
+ public:
+  /**
+   * @brief reset all stats.
+   *
+   * often used before pass start.
+   */
+  inline void reset() {
+    numProcessed_ = 0;
+    totalCost_ = .0;
+    this->resetCurrentStat();
+  }
+
+  /**
+   * @brief reset current stat.
+   *
+   * 'current' means the most recent --log_period mini-batches
+   */
+  inline void resetCurrentStat() {
+    currentCost_ = .0;
+    currentSamples_ = 0;
+  }
+
+  /**
+   * @brief add cost to stat.
+   * @param numProcessed current mini-batch size
+   * @param cost current mini-batch cost
+   */
+  inline void addCost(int64_t numProcessed, real cost) {
+    this->numProcessed_ += numProcessed;
+    this->totalCost_ += cost;
+    this->currentSamples_ += numProcessed;
+    this->currentCost_ += cost;
+  }
+
+  /**
+   * @brief get average cost through on pass(all processed mini-batches)
+   * @return pass average cost
+   */
+  inline real getAvgCost() const {
+    CHECK_NE(this->numProcessed_, 0);
+    return this->totalCost_ / this->numProcessed_;
+  }
+
+  /**
+   * @brief get current mini-batch's average cost.
+   * @return mini-batch average cost
+   */
+  inline real getCurrentAvgCost() const {
+    CHECK_NE(this->currentSamples_, 0);
+    return this->currentCost_ / this->currentSamples_;
+  }
+
+  /**
+   * @brief get all processed samples' number
+   * @return all processed samples' number
+   */
+  inline int64_t getNumProcessed() const { return this->numProcessed_; }
+
+  /**
+   * @brief same function as addCost. But it is simple to invoke.
+   * For example:
+   *
+   * @code{.cpp}
+   * TrainerStats stat;
+   * cost = neuralNetwork.forward(batchSize);
+   * stat += {batchSize, cost};
+   * @endcode
+   *
+   * @param p a pair of parameter, first is numProcessed, second is cost.
+   * @return *this
+   */
+  inline TrainerStats& operator+=(const std::pair<int64_t, real>& p) {
+    this->addCost(p.first, p.second);
+    return *this;
+  }
+
+  /**
+   * @brief TrainerStats Constructor.
+   *
+   * reset stat when constructed.
+   */
+  inline TrainerStats() { this->reset(); }
+
+  /**
+   * @brief show stats to ostream.
+   *
+   * If there is no need to print current cost, set withCurrentCost to False.
+   *
+   * @param os output stream.
+   * @param withCurrentCost print current cost or not.
+   */
+  void showStats(std::ostream& os, bool withCurrentCost = true) const {
+    os << "samples=" << this->getNumProcessed()
+       << " AvgCost=" << this->getAvgCost();
+    if (withCurrentCost) {
+      os << " CurrentCost=" << this->getCurrentAvgCost();
+    }
+  }
+
+  /**
+   * @brief get stats to std::string
+   * @param withCurrentCost return current cost or not
+   * @return stats string
+   */
+  std::string getStats(bool withCurrentCost = true) const {
+    std::ostringstream os;
+    this->showStats(os, withCurrentCost);
+    return os.str();
+  }
+
+ private:
+  int64_t numProcessed_;
+  real totalCost_;
+  real currentCost_;
+  int64_t currentSamples_;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const TrainerStats& stats) {
+  stats.showStats(os);
+  return os;
+}
+
+/**
+ * TrainerInternalConfig
+ * general configs for training
+ */
+struct TrainerInternalConfig {
+  /**
+   * @brief Create TrainerInternalConfig from GradientMachine::CreateMode and
+   * command line arguments.
+   * @param mode
+   * @return
+   */
+  static std::unique_ptr<TrainerInternalConfig> createFromMode(
+      GradientMachine::CreateMode mode);
+
+  /**
+   * indicate whether the training is local
+   * if local, no parameter server is used
+   */
+  bool local;
+
+  /**
+   * indicate whether training uses GPU
+   */
+  bool use_gpu;
+
+  /**
+   * indicate number of trainer
+   */
+  int trainer_count;
+
+  /**
+   * how frequently to show param stats
+   */
+  int show_param_stats_period;
+
+  /**
+   * current trainer id
+   */
+  int trainer_id;
+
+  /**
+   * frequency to dump log
+   */
+  int log_period;
+
+  /**
+   * dot period
+   */
+  int dot_period;
+
+  /**
+   * num passes for training
+   */
+  int num_passes;
+
+  /**
+   * use old updater
+   */
+  bool use_old_updater;
+
+  /**
+   * whether to load and save parameter in pserver
+   */
+  bool loadsave_parameters_in_pserver;
+
+  /**
+   * training mode
+   */
+  GradientMachine::CreateMode mode;
+};
+
+}  //  namespace paddle
diff --git a/paddle/legacy/trainer/TrainerMain.cpp b/paddle/legacy/trainer/TrainerMain.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..911aeba1928f7208aecb92910dac981f00fc6db5
--- /dev/null
+++ b/paddle/legacy/trainer/TrainerMain.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fenv.h>
+#include "paddle/legacy/pserver/ParameterServerController.h"
+#include "paddle/legacy/utils/PythonUtil.h"
+
+#include "ParamUtil.h"
+#include "Trainer.h"
+
+DEFINE_bool(start_pserver, false, "Whether to start pserver");
+DECLARE_int32(gpu_id);
+DEFINE_string(job, "train", "one of (train, test, checkgrad)");
+DECLARE_int32(start_pass);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_string(rdma_tcp);
+
+using namespace paddle;  // NOLINT
+
+int main(int argc, char** argv) {
+  // write logs instantly (never buffer log messages)
+  FLAGS_logbuflevel = -1;
+
+  initMain(argc, argv);
+  initPython(argc, argv);
+
+  std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
+  if (FLAGS_start_pserver) {
+    parameterServerPtr.reset(
+        paddle::ParameterServerController::createFromGflags());
+    parameterServerPtr->start();
+  }
+  Trainer trainer;
+  auto config = TrainerConfigHelper::createFromFlags();
+  CHECK(config != nullptr) << "no valid config";
+
+  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
+  trainer.init(config, FLAGS_job == "test");
+
+  if (FLAGS_job == "train") {
+    trainer.train();
+  } else if (FLAGS_job == "checkgrad") {
+    trainer.checkGradient();
+  } else if (FLAGS_job == "test") {
+    trainer.test();
+  } else if (FLAGS_job == "time") {
+    trainer.time();
+  } else {
+    LOG(FATAL) << "Unknown job type: " << FLAGS_job;
+  }
+
+  return 0;
+}
diff --git a/paddle/trainer/tests/.gitignore b/paddle/legacy/trainer/tests/.gitignore
similarity index 100%
rename from paddle/trainer/tests/.gitignore
rename to paddle/legacy/trainer/tests/.gitignore
diff --git a/paddle/legacy/trainer/tests/CMakeLists.txt b/paddle/legacy/trainer/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..08548bea4c4a7fc4fa99d9305208abd4ee442572
--- /dev/null
+++ b/paddle/legacy/trainer/tests/CMakeLists.txt
@@ -0,0 +1,37 @@
+add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/sample_trainer_config.conf
+    COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/* ${CMAKE_CURRENT_BINARY_DIR}
+)
+add_custom_target(copy_trainer_conf ALL DEPENDS sample_trainer_config.conf)
+
+set(PYTHON_PATH 
+   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
+   ${PADDLE_BINARY_DIR}/python/:${PADDLE_BINARY_DIR}/paddle/legacy/trainer/tests)
+function(trainer_test TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
+endfunction()
+
+trainer_test(test_Compare)
+trainer_test(test_PyDataProviderWrapper)
+trainer_test(test_recurrent_machine_generation)
+trainer_test(test_Trainer)
+
+############### test_TrainerOnePass ##########################
+if(WITH_PYTHON)
+  # only run test_TrainerOnePass when PYTHON is enabled, because train one pass
+  # is using PyDataProvider2.
+  add_unittest_without_exec(test_TrainerOnePass
+      test_TrainerOnePass.cpp)
+  add_test(NAME test_TrainerOnePass
+    COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port 
+          ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
+      WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
+endif()
+
+#################### test_config_parser #########################
+add_test(NAME test_config_parser
+  COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} 
+        ${PADDLE_SOURCE_DIR}/paddle/legacy/trainer/tests/config_parser_test.py
+    WORKING_DIRECTORY ${PADDLE_BINARY_DIR}/paddle/)
diff --git a/paddle/trainer/tests/__init__.py b/paddle/legacy/trainer/tests/__init__.py
similarity index 100%
rename from paddle/trainer/tests/__init__.py
rename to paddle/legacy/trainer/tests/__init__.py
diff --git a/paddle/legacy/trainer/tests/config_parser_test.py b/paddle/legacy/trainer/tests/config_parser_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d3d82cbdafcf85d42247e810fe7caa685a86e4d
--- /dev/null
+++ b/paddle/legacy/trainer/tests/config_parser_test.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.config_parser import parse_config_and_serialize
+
+if __name__ == '__main__':
+    parse_config_and_serialize('legacy/trainer/tests/test_config.conf', '')
+    parse_config_and_serialize(
+        'legacy/trainer/tests/sample_trainer_config.conf',
+        'extension_module_name=paddle.trainer.config_parser_extension')
+    parse_config_and_serialize(
+        'legacy/gserver/tests/pyDataProvider/trainer.conf', '')
diff --git a/paddle/trainer/tests/fake_file_list.list b/paddle/legacy/trainer/tests/fake_file_list.list
similarity index 100%
rename from paddle/trainer/tests/fake_file_list.list
rename to paddle/legacy/trainer/tests/fake_file_list.list
diff --git a/paddle/legacy/trainer/tests/picojson.h b/paddle/legacy/trainer/tests/picojson.h
new file mode 100644
index 0000000000000000000000000000000000000000..75349537b1c7f10d23bae788e8414a753c7ccab0
--- /dev/null
+++ b/paddle/legacy/trainer/tests/picojson.h
@@ -0,0 +1,1103 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * Copyright 2009-2010 Cybozu Labs, Inc.
+ * Copyright 2011-2014 Kazuho Oku
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef picojson_h
+#define picojson_h
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+// for isnan/isinf
+#if __cplusplus >= 201103L
+#include <cmath>
+#else
+extern "C" {
+#ifdef _MSC_VER
+#include <float.h>
+#elif defined(__INTEL_COMPILER)
+#include <mathimf.h>
+#else
+#include <math.h>
+#endif
+}
+#endif
+
+// experimental support for int64_t (see README.mkdn for detail)
+#ifdef PICOJSON_USE_INT64
+#define __STDC_FORMAT_MACROS
+#include <errno.h>
+#include <inttypes.h>
+#endif
+
+// to disable the use of localeconv(3), set PICOJSON_USE_LOCALE to 0
+#ifndef PICOJSON_USE_LOCALE
+#define PICOJSON_USE_LOCALE 1
+#endif
+#if PICOJSON_USE_LOCALE
+extern "C" {
+#include <locale.h>
+}
+#endif
+
+#ifndef PICOJSON_ASSERT
+#define PICOJSON_ASSERT(e)                  \
+  do {                                      \
+    if (!(e)) throw std::runtime_error(#e); \
+  } while (0)
+#endif
+
+#ifdef _MSC_VER
+#define SNPRINTF _snprintf_s
+#pragma warning(push)
+#pragma warning(disable : 4244)  // conversion from int to char
+#pragma warning(disable : 4127)  // conditional expression is constant
+#pragma warning(disable : 4702)  // unreachable code
+#else
+#define SNPRINTF snprintf
+#endif
+
+namespace picojson {
+
+enum {
+  null_type,
+  boolean_type,
+  number_type,
+  string_type,
+  array_type,
+  object_type
+#ifdef PICOJSON_USE_INT64
+  ,
+  int64_type
+#endif
+};
+
+enum { INDENT_WIDTH = 2 };
+
+struct null {};
+
+class value {
+ public:
+  typedef std::vector<value> array;
+  typedef std::map<std::string, value> object;
+  union _storage {
+    bool boolean_;
+    double number_;
+#ifdef PICOJSON_USE_INT64
+    int64_t int64_;
+#endif
+    std::string* string_;
+    array* array_;
+    object* object_;
+  };
+
+ protected:
+  int type_;
+  _storage u_;
+
+ public:
+  value();
+  value(int type, bool);
+  explicit value(bool b);
+#ifdef PICOJSON_USE_INT64
+  explicit value(int64_t i);
+#endif
+  explicit value(double n);
+  explicit value(const std::string& s);
+  explicit value(const array& a);
+  explicit value(const object& o);
+  explicit value(const char* s);
+  value(const char* s, size_t len);
+  ~value();
+  value(const value& x);
+  value& operator=(const value& x);
+  void swap(value& x);
+  template <typename T>
+  bool is() const;
+  template <typename T>
+  const T& get() const;
+  template <typename T>
+  T& get();
+  bool evaluate_as_boolean() const;
+  const value& get(size_t idx) const;
+  const value& get(const std::string& key) const;
+  value& get(size_t idx);
+  value& get(const std::string& key);
+
+  bool contains(size_t idx) const;
+  bool contains(const std::string& key) const;
+  std::string to_str() const;
+  template <typename Iter>
+  void serialize(Iter os, bool prettify = false) const;
+  std::string serialize(bool prettify = false) const;
+
+ private:
+  template <typename T>
+  value(const T*);  // intentionally defined to block implicit conversion of
+                    // pointer to bool
+  template <typename Iter>
+  static void _indent(Iter os, int indent);
+  template <typename Iter>
+  void _serialize(Iter os, int indent) const;
+  std::string _serialize(int indent) const;
+};
+
+typedef value::array array;
+typedef value::object object;
+
+inline value::value() : type_(null_type) {}
+
+inline value::value(int type, bool) : type_(type) {
+  switch (type) {
+#define INIT(p, v) \
+  case p##type:    \
+    u_.p = v;      \
+    break
+    INIT(boolean_, false);
+    INIT(number_, 0.0);
+#ifdef PICOJSON_USE_INT64
+    INIT(int64_, 0);
+#endif
+    INIT(string_, new std::string());
+    INIT(array_, new array());
+    INIT(object_, new object());
+#undef INIT
+    default:
+      break;
+  }
+}
+
+inline value::value(bool b) : type_(boolean_type) { u_.boolean_ = b; }
+
+#ifdef PICOJSON_USE_INT64
+inline value::value(int64_t i) : type_(int64_type) { u_.int64_ = i; }
+#endif
+
+inline value::value(double n) : type_(number_type) {
+  if (
+#ifdef _MSC_VER
+      !_finite(n)
+#elif __cplusplus >= 201103L || !(defined(isnan) && defined(isinf))
+      std::isnan(n) || std::isinf(n)
+#else
+      isnan(n) || isinf(n)
+#endif
+          ) {
+    throw std::overflow_error("");
+  }
+  u_.number_ = n;
+}
+
+inline value::value(const std::string& s) : type_(string_type) {
+  u_.string_ = new std::string(s);
+}
+
+inline value::value(const array& a) : type_(array_type) {
+  u_.array_ = new array(a);
+}
+
+inline value::value(const object& o) : type_(object_type) {
+  u_.object_ = new object(o);
+}
+
+inline value::value(const char* s) : type_(string_type) {
+  u_.string_ = new std::string(s);
+}
+
+inline value::value(const char* s, size_t len) : type_(string_type) {
+  u_.string_ = new std::string(s, len);
+}
+
+inline value::~value() {
+  switch (type_) {
+#define DEINIT(p) \
+  case p##type:   \
+    delete u_.p;  \
+    break
+    DEINIT(string_);
+    DEINIT(array_);
+    DEINIT(object_);
+#undef DEINIT
+    default:
+      break;
+  }
+}
+
+inline value::value(const value& x) : type_(x.type_) {
+  switch (type_) {
+#define INIT(p, v) \
+  case p##type:    \
+    u_.p = v;      \
+    break
+    INIT(string_, new std::string(*x.u_.string_));
+    INIT(array_, new array(*x.u_.array_));
+    INIT(object_, new object(*x.u_.object_));
+#undef INIT
+    default:
+      u_ = x.u_;
+      break;
+  }
+}
+
+inline value& value::operator=(const value& x) {
+  if (this != &x) {
+    value t(x);
+    swap(t);
+  }
+  return *this;
+}
+
+inline void value::swap(value& x) {
+  std::swap(type_, x.type_);
+  std::swap(u_, x.u_);
+}
+
+#define IS(ctype, jtype)                 \
+  template <>                            \
+  inline bool value::is<ctype>() const { \
+    return type_ == jtype##_type;        \
+  }
+IS(null, null)
+IS(bool, boolean)
+#ifdef PICOJSON_USE_INT64
+IS(int64_t, int64)
+#endif
+IS(std::string, string)
+IS(array, array)
+IS(object, object)
+#undef IS
+template <>
+inline bool value::is<double>() const {
+  return type_ == number_type
+#ifdef PICOJSON_USE_INT64
+         || type_ == int64_type
+#endif
+      ;
+}
+
+#define GET(ctype, var)                                                    \
+  template <>                                                              \
+  inline const ctype& value::get<ctype>() const {                          \
+    PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && \
+                    is<ctype>());                                          \
+    return var;                                                            \
+  }                                                                        \
+  template <>                                                              \
+  inline ctype& value::get<ctype>() {                                      \
+    PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && \
+                    is<ctype>());                                          \
+    return var;                                                            \
+  }
+GET(bool, u_.boolean_)
+GET(std::string, *u_.string_)
+GET(array, *u_.array_)
+GET(object, *u_.object_)
+#ifdef PICOJSON_USE_INT64
+GET(double,
+    (type_ == int64_type && (const_cast<value*>(this)->type_ = number_type,
+                             const_cast<value*>(this)->u_.number_ = u_.int64_),
+     u_.number_))
+GET(int64_t, u_.int64_)
+#else
+GET(double, u_.number_)
+#endif
+#undef GET
+
+inline bool value::evaluate_as_boolean() const {
+  switch (type_) {
+    case null_type:
+      return false;
+    case boolean_type:
+      return u_.boolean_;
+    case number_type:
+      return u_.number_ != 0;
+#ifdef PICOJSON_USE_INT64
+    case int64_type:
+      return u_.int64_ != 0;
+#endif
+    case string_type:
+      return !u_.string_->empty();
+    default:
+      return true;
+  }
+}
+
+inline const value& value::get(size_t idx) const {
+  static value s_null;
+  PICOJSON_ASSERT(is<array>());
+  return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
+}
+
+inline value& value::get(size_t idx) {
+  static value s_null;
+  PICOJSON_ASSERT(is<array>());
+  return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
+}
+
+inline const value& value::get(const std::string& key) const {
+  static value s_null;
+  PICOJSON_ASSERT(is<object>());
+  object::const_iterator i = u_.object_->find(key);
+  return i != u_.object_->end() ? i->second : s_null;
+}
+
+inline value& value::get(const std::string& key) {
+  static value s_null;
+  PICOJSON_ASSERT(is<object>());
+  object::iterator i = u_.object_->find(key);
+  return i != u_.object_->end() ? i->second : s_null;
+}
+
+inline bool value::contains(size_t idx) const {
+  PICOJSON_ASSERT(is<array>());
+  return idx < u_.array_->size();
+}
+
+inline bool value::contains(const std::string& key) const {
+  PICOJSON_ASSERT(is<object>());
+  object::const_iterator i = u_.object_->find(key);
+  return i != u_.object_->end();
+}
+
+inline std::string value::to_str() const {
+  switch (type_) {
+    case null_type:
+      return "null";
+    case boolean_type:
+      return u_.boolean_ ? "true" : "false";
+#ifdef PICOJSON_USE_INT64
+    case int64_type: {
+      char buf[sizeof("-9223372036854775808")];
+      SNPRINTF(buf, sizeof(buf), "%" PRId64, u_.int64_);
+      return buf;
+    }
+#endif
+    case number_type: {
+      char buf[256];
+      double tmp;
+      SNPRINTF(buf,
+               sizeof(buf),
+               fabs(u_.number_) < (1ULL << 53) && modf(u_.number_, &tmp) == 0
+                   ? "%.f"
+                   : "%.17g",
+               u_.number_);
+#if PICOJSON_USE_LOCALE
+      char* decimal_point = localeconv()->decimal_point;
+      if (strcmp(decimal_point, ".") != 0) {
+        size_t decimal_point_len = strlen(decimal_point);
+        for (char* p = buf; *p != '\0'; ++p) {
+          if (strncmp(p, decimal_point, decimal_point_len) == 0) {
+            return std::string(buf, p) + "." + (p + decimal_point_len);
+          }
+        }
+      }
+#endif
+      return buf;
+    }
+    case string_type:
+      return *u_.string_;
+    case array_type:
+      return "array";
+    case object_type:
+      return "object";
+    default:
+      PICOJSON_ASSERT(0);
+#ifdef _MSC_VER
+      __assume(0);
+#endif
+  }
+  return std::string();
+}
+
+template <typename Iter>
+void copy(const std::string& s, Iter oi) {
+  std::copy(s.begin(), s.end(), oi);
+}
+
+template <typename Iter>
+void serialize_str(const std::string& s, Iter oi) {
+  *oi++ = '"';
+  for (std::string::const_iterator i = s.begin(); i != s.end(); ++i) {
+    switch (*i) {
+#define MAP(val, sym) \
+  case val:           \
+    copy(sym, oi);    \
+    break
+      MAP('"', "\\\"");
+      MAP('\\', "\\\\");
+      MAP('/', "\\/");
+      MAP('\b', "\\b");
+      MAP('\f', "\\f");
+      MAP('\n', "\\n");
+      MAP('\r', "\\r");
+      MAP('\t', "\\t");
+#undef MAP
+      default:
+        if (static_cast<unsigned char>(*i) < 0x20 || *i == 0x7f) {
+          char buf[7];
+          SNPRINTF(buf, sizeof(buf), "\\u%04x", *i & 0xff);
+          copy(buf, buf + 6, oi);
+        } else {
+          *oi++ = *i;
+        }
+        break;
+    }
+  }
+  *oi++ = '"';
+}
+
+template <typename Iter>
+void value::serialize(Iter oi, bool prettify) const {
+  return _serialize(oi, prettify ? 0 : -1);
+}
+
+inline std::string value::serialize(bool prettify) const {
+  return _serialize(prettify ? 0 : -1);
+}
+
+template <typename Iter>
+void value::_indent(Iter oi, int indent) {
+  *oi++ = '\n';
+  for (int i = 0; i < indent * INDENT_WIDTH; ++i) {
+    *oi++ = ' ';
+  }
+}
+
+template <typename Iter>
+void value::_serialize(Iter oi, int indent) const {
+  switch (type_) {
+    case string_type:
+      serialize_str(*u_.string_, oi);
+      break;
+    case array_type: {
+      *oi++ = '[';
+      if (indent != -1) {
+        ++indent;
+      }
+      for (array::const_iterator i = u_.array_->begin(); i != u_.array_->end();
+           ++i) {
+        if (i != u_.array_->begin()) {
+          *oi++ = ',';
+        }
+        if (indent != -1) {
+          _indent(oi, indent);
+        }
+        i->_serialize(oi, indent);
+      }
+      if (indent != -1) {
+        --indent;
+        if (!u_.array_->empty()) {
+          _indent(oi, indent);
+        }
+      }
+      *oi++ = ']';
+      break;
+    }
+    case object_type: {
+      *oi++ = '{';
+      if (indent != -1) {
+        ++indent;
+      }
+      for (object::const_iterator i = u_.object_->begin();
+           i != u_.object_->end();
+           ++i) {
+        if (i != u_.object_->begin()) {
+          *oi++ = ',';
+        }
+        if (indent != -1) {
+          _indent(oi, indent);
+        }
+        serialize_str(i->first, oi);
+        *oi++ = ':';
+        if (indent != -1) {
+          *oi++ = ' ';
+        }
+        i->second._serialize(oi, indent);
+      }
+      if (indent != -1) {
+        --indent;
+        if (!u_.object_->empty()) {
+          _indent(oi, indent);
+        }
+      }
+      *oi++ = '}';
+      break;
+    }
+    default:
+      copy(to_str(), oi);
+      break;
+  }
+  if (indent == 0) {
+    *oi++ = '\n';
+  }
+}
+
+inline std::string value::_serialize(int indent) const {
+  std::string s;
+  _serialize(std::back_inserter(s), indent);
+  return s;
+}
+
+template <typename Iter>
+class input {
+ protected:
+  Iter cur_, end_;
+  int last_ch_;
+  bool ungot_;
+  int line_;
+
+ public:
+  input(const Iter& first, const Iter& last)
+      : cur_(first), end_(last), last_ch_(-1), ungot_(false), line_(1) {}
+  int getc() {
+    if (ungot_) {
+      ungot_ = false;
+      return last_ch_;
+    }
+    if (cur_ == end_) {
+      last_ch_ = -1;
+      return -1;
+    }
+    if (last_ch_ == '\n') {
+      line_++;
+    }
+    last_ch_ = *cur_ & 0xff;
+    ++cur_;
+    return last_ch_;
+  }
+  void ungetc() {
+    if (last_ch_ != -1) {
+      PICOJSON_ASSERT(!ungot_);
+      ungot_ = true;
+    }
+  }
+  Iter cur() const { return cur_; }
+  int line() const { return line_; }
+  void skip_ws() {
+    while (1) {
+      int ch = getc();
+      if (!(ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')) {
+        ungetc();
+        break;
+      }
+    }
+  }
+  bool expect(int expect) {
+    skip_ws();
+    if (getc() != expect) {
+      ungetc();
+      return false;
+    }
+    return true;
+  }
+  bool match(const std::string& pattern) {
+    for (std::string::const_iterator pi(pattern.begin()); pi != pattern.end();
+         ++pi) {
+      if (getc() != *pi) {
+        ungetc();
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+template <typename Iter>
+inline int _parse_quadhex(input<Iter>& in) {
+  int uni_ch = 0, hex;
+  for (int i = 0; i < 4; i++) {
+    if ((hex = in.getc()) == -1) {
+      return -1;
+    }
+    if ('0' <= hex && hex <= '9') {
+      hex -= '0';
+    } else if ('A' <= hex && hex <= 'F') {
+      hex -= 'A' - 0xa;
+    } else if ('a' <= hex && hex <= 'f') {
+      hex -= 'a' - 0xa;
+    } else {
+      in.ungetc();
+      return -1;
+    }
+    uni_ch = uni_ch * 16 + hex;
+  }
+  return uni_ch;
+}
+
+template <typename String, typename Iter>
+inline bool _parse_codepoint(String& out, input<Iter>& in) {
+  int uni_ch;
+  if ((uni_ch = _parse_quadhex(in)) == -1) {
+    return false;
+  }
+  if (0xd800 <= uni_ch && uni_ch <= 0xdfff) {
+    if (0xdc00 <= uni_ch) {
+      // a second 16-bit of a surrogate pair appeared
+      return false;
+    }
+    // first 16-bit of surrogate pair, get the next one
+    if (in.getc() != '\\' || in.getc() != 'u') {
+      in.ungetc();
+      return false;
+    }
+    int second = _parse_quadhex(in);
+    if (!(0xdc00 <= second && second <= 0xdfff)) {
+      return false;
+    }
+    uni_ch = ((uni_ch - 0xd800) << 10) | ((second - 0xdc00) & 0x3ff);
+    uni_ch += 0x10000;
+  }
+  if (uni_ch < 0x80) {
+    out.push_back(uni_ch);
+  } else {
+    if (uni_ch < 0x800) {
+      out.push_back(0xc0 | (uni_ch >> 6));
+    } else {
+      if (uni_ch < 0x10000) {
+        out.push_back(0xe0 | (uni_ch >> 12));
+      } else {
+        out.push_back(0xf0 | (uni_ch >> 18));
+        out.push_back(0x80 | ((uni_ch >> 12) & 0x3f));
+      }
+      out.push_back(0x80 | ((uni_ch >> 6) & 0x3f));
+    }
+    out.push_back(0x80 | (uni_ch & 0x3f));
+  }
+  return true;
+}
+
+template <typename String, typename Iter>
+inline bool _parse_string(String& out, input<Iter>& in) {
+  while (1) {
+    int ch = in.getc();
+    if (ch < ' ') {
+      in.ungetc();
+      return false;
+    } else if (ch == '"') {
+      return true;
+    } else if (ch == '\\') {
+      if ((ch = in.getc()) == -1) {
+        return false;
+      }
+      switch (ch) {
+#define MAP(sym, val)   \
+  case sym:             \
+    out.push_back(val); \
+    break
+        MAP('"', '\"');
+        MAP('\\', '\\');
+        MAP('/', '/');
+        MAP('b', '\b');
+        MAP('f', '\f');
+        MAP('n', '\n');
+        MAP('r', '\r');
+        MAP('t', '\t');
+#undef MAP
+        case 'u':
+          if (!_parse_codepoint(out, in)) {
+            return false;
+          }
+          break;
+        default:
+          return false;
+      }
+    } else {
+      out.push_back(ch);
+    }
+  }
+  return false;
+}
+
+template <typename Context, typename Iter>
+inline bool _parse_array(Context& ctx, input<Iter>& in) {
+  if (!ctx.parse_array_start()) {
+    return false;
+  }
+  size_t idx = 0;
+  if (in.expect(']')) {
+    return ctx.parse_array_stop(idx);
+  }
+  do {
+    if (!ctx.parse_array_item(in, idx)) {
+      return false;
+    }
+    idx++;
+  } while (in.expect(','));
+  return in.expect(']') && ctx.parse_array_stop(idx);
+}
+
+template <typename Context, typename Iter>
+inline bool _parse_object(Context& ctx, input<Iter>& in) {
+  if (!ctx.parse_object_start()) {
+    return false;
+  }
+  if (in.expect('}')) {
+    return true;
+  }
+  do {
+    std::string key;
+    if (!in.expect('"') || !_parse_string(key, in) || !in.expect(':')) {
+      return false;
+    }
+    if (!ctx.parse_object_item(in, key)) {
+      return false;
+    }
+  } while (in.expect(','));
+  return in.expect('}');
+}
+
+template <typename Iter>
+inline std::string _parse_number(input<Iter>& in) {
+  std::string num_str;
+  while (1) {
+    int ch = in.getc();
+    if (('0' <= ch && ch <= '9') || ch == '+' || ch == '-' || ch == 'e' ||
+        ch == 'E') {
+      num_str.push_back(ch);
+    } else if (ch == '.') {
+#if PICOJSON_USE_LOCALE
+      num_str += localeconv()->decimal_point;
+#else
+      num_str.push_back('.');
+#endif
+    } else {
+      in.ungetc();
+      break;
+    }
+  }
+  return num_str;
+}
+
+template <typename Context, typename Iter>
+inline bool _parse(Context& ctx, input<Iter>& in) {
+  in.skip_ws();
+  int ch = in.getc();
+  switch (ch) {
+#define IS(ch, text, op)        \
+  case ch:                      \
+    if (in.match(text) && op) { \
+      return true;              \
+    } else {                    \
+      return false;             \
+    }
+    IS('n', "ull", ctx.set_null());
+    IS('f', "alse", ctx.set_bool(false));
+    IS('t', "rue", ctx.set_bool(true));
+#undef IS
+    case '"':
+      return ctx.parse_string(in);
+    case '[':
+      return _parse_array(ctx, in);
+    case '{':
+      return _parse_object(ctx, in);
+    default:
+      if (('0' <= ch && ch <= '9') || ch == '-') {
+        double f;
+        char* endp;
+        in.ungetc();
+        std::string num_str = _parse_number(in);
+        if (num_str.empty()) {
+          return false;
+        }
+#ifdef PICOJSON_USE_INT64
+        {
+          errno = 0;
+          intmax_t ival = strtoimax(num_str.c_str(), &endp, 10);
+          if (errno == 0 && std::numeric_limits<int64_t>::min() <= ival &&
+              ival <= std::numeric_limits<int64_t>::max() &&
+              endp == num_str.c_str() + num_str.size()) {
+            ctx.set_int64(ival);
+            return true;
+          }
+        }
+#endif
+        f = strtod(num_str.c_str(), &endp);
+        if (endp == num_str.c_str() + num_str.size()) {
+          ctx.set_number(f);
+          return true;
+        }
+        return false;
+      }
+      break;
+  }
+  in.ungetc();
+  return false;
+}
+
+class deny_parse_context {
+ public:
+  bool set_null() { return false; }
+  bool set_bool(bool) { return false; }
+#ifdef PICOJSON_USE_INT64
+  bool set_int64(int64_t) { return false; }
+#endif
+  bool set_number(double) { return false; }
+  template <typename Iter>
+  bool parse_string(input<Iter>&) {
+    return false;
+  }
+  bool parse_array_start() { return false; }
+  template <typename Iter>
+  bool parse_array_item(input<Iter>&, size_t) {
+    return false;
+  }
+  bool parse_array_stop(size_t) { return false; }
+  bool parse_object_start() { return false; }
+  template <typename Iter>
+  bool parse_object_item(input<Iter>&, const std::string&) {
+    return false;
+  }
+};
+
+class default_parse_context {
+ protected:
+  value* out_;
+
+ public:
+  default_parse_context(value* out) : out_(out) {}
+  bool set_null() {
+    *out_ = value();
+    return true;
+  }
+  bool set_bool(bool b) {
+    *out_ = value(b);
+    return true;
+  }
+#ifdef PICOJSON_USE_INT64
+  bool set_int64(int64_t i) {
+    *out_ = value(i);
+    return true;
+  }
+#endif
+  bool set_number(double f) {
+    *out_ = value(f);
+    return true;
+  }
+  template <typename Iter>
+  bool parse_string(input<Iter>& in) {
+    *out_ = value(string_type, false);
+    return _parse_string(out_->get<std::string>(), in);
+  }
+  bool parse_array_start() {
+    *out_ = value(array_type, false);
+    return true;
+  }
+  template <typename Iter>
+  bool parse_array_item(input<Iter>& in, size_t) {
+    array& a = out_->get<array>();
+    a.push_back(value());
+    default_parse_context ctx(&a.back());
+    return _parse(ctx, in);
+  }
+  bool parse_array_stop(size_t) { return true; }
+  bool parse_object_start() {
+    *out_ = value(object_type, false);
+    return true;
+  }
+  template <typename Iter>
+  bool parse_object_item(input<Iter>& in, const std::string& key) {
+    object& o = out_->get<object>();
+    default_parse_context ctx(&o[key]);
+    return _parse(ctx, in);
+  }
+
+ private:
+  default_parse_context(const default_parse_context&);
+  default_parse_context& operator=(const default_parse_context&);
+};
+
+class null_parse_context {
+ public:
+  struct dummy_str {
+    void push_back(int) {}
+  };
+
+ public:
+  null_parse_context() {}
+  bool set_null() { return true; }
+  bool set_bool(bool) { return true; }
+#ifdef PICOJSON_USE_INT64
+  bool set_int64(int64_t) { return true; }
+#endif
+  bool set_number(double) { return true; }
+  template <typename Iter>
+  bool parse_string(input<Iter>& in) {
+    dummy_str s;
+    return _parse_string(s, in);
+  }
+  bool parse_array_start() { return true; }
+  template <typename Iter>
+  bool parse_array_item(input<Iter>& in, size_t) {
+    return _parse(*this, in);
+  }
+  bool parse_array_stop(size_t) { return true; }
+  bool parse_object_start() { return true; }
+  template <typename Iter>
+  bool parse_object_item(input<Iter>& in, const std::string&) {
+    return _parse(*this, in);
+  }
+
+ private:
+  null_parse_context(const null_parse_context&);
+  null_parse_context& operator=(const null_parse_context&);
+};
+
+// obsolete, use the version below
+template <typename Iter>
+inline std::string parse(value& out, Iter& pos, const Iter& last) {
+  std::string err;
+  pos = parse(out, pos, last, &err);
+  return err;
+}
+
+template <typename Context, typename Iter>
+inline Iter _parse(Context& ctx,
+                   const Iter& first,
+                   const Iter& last,
+                   std::string* err) {
+  input<Iter> in(first, last);
+  if (!_parse(ctx, in) && err != NULL) {
+    char buf[64];
+    SNPRINTF(buf, sizeof(buf), "syntax error at line %d near: ", in.line());
+    *err = buf;
+    while (1) {
+      int ch = in.getc();
+      if (ch == -1 || ch == '\n') {
+        break;
+      } else if (ch >= ' ') {
+        err->push_back(ch);
+      }
+    }
+  }
+  return in.cur();
+}
+
+template <typename Iter>
+inline Iter parse(value& out,
+                  const Iter& first,
+                  const Iter& last,
+                  std::string* err) {
+  default_parse_context ctx(&out);
+  return _parse(ctx, first, last, err);
+}
+
+inline std::string parse(value& out, const std::string& s) {
+  std::string err;
+  parse(out, s.begin(), s.end(), &err);
+  return err;
+}
+
+inline std::string parse(value& out, std::istream& is) {
+  std::string err;
+  parse(out,
+        std::istreambuf_iterator<char>(is.rdbuf()),
+        std::istreambuf_iterator<char>(),
+        &err);
+  return err;
+}
+
+template <typename T>
+struct last_error_t {
+  static std::string s;
+};
+template <typename T>
+std::string last_error_t<T>::s;
+
+inline void set_last_error(const std::string& s) { last_error_t<bool>::s = s; }
+
+inline const std::string& get_last_error() { return last_error_t<bool>::s; }
+
+inline bool operator==(const value& x, const value& y) {
+  if (x.is<null>()) return y.is<null>();
+#define PICOJSON_CMP(type) \
+  if (x.is<type>()) return y.is<type>() && x.get<type>() == y.get<type>()
+  PICOJSON_CMP(bool);
+  PICOJSON_CMP(double);
+  PICOJSON_CMP(std::string);
+  PICOJSON_CMP(array);
+  PICOJSON_CMP(object);
+#undef PICOJSON_CMP
+  PICOJSON_ASSERT(0);
+#ifdef _MSC_VER
+  __assume(0);
+#endif
+  return false;
+}
+
+inline bool operator!=(const value& x, const value& y) { return !(x == y); }
+}  // namespace picojson
+
+namespace std {
+template <>
+inline void swap(picojson::value& x, picojson::value& y) {
+  x.swap(y);
+}
+}  // namespace std
+
+inline std::istream& operator>>(std::istream& is, picojson::value& x) {
+  picojson::set_last_error(std::string());
+  std::string err = picojson::parse(x, is);
+  if (!err.empty()) {
+    picojson::set_last_error(err);
+    is.setstate(std::ios::failbit);
+  }
+  return is;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const picojson::value& x) {
+  x.serialize(std::ostream_iterator<char>(os));
+  return os;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
similarity index 100%
rename from paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
rename to paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
diff --git a/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
new file mode 100644
index 0000000000000000000000000000000000000000..11c1b1b38b9edacc4953fdf526906d28bcc2d720
--- /dev/null
+++ b/paddle/legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
@@ -0,0 +1 @@
+legacy/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.beam b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam
similarity index 100%
rename from paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.beam
rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.beam
diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
similarity index 100%
rename from paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nest
diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam
similarity index 100%
rename from paddle/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam
rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/r1.test.nobeam
diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/t1/transtable b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable
similarity index 100%
rename from paddle/trainer/tests/rnn_gen_test_model_dir/t1/transtable
rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/transtable
diff --git a/paddle/trainer/tests/rnn_gen_test_model_dir/t1/wordvec b/paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec
similarity index 100%
rename from paddle/trainer/tests/rnn_gen_test_model_dir/t1/wordvec
rename to paddle/legacy/trainer/tests/rnn_gen_test_model_dir/t1/wordvec
diff --git a/paddle/trainer/tests/sample_data.txt b/paddle/legacy/trainer/tests/sample_data.txt
similarity index 100%
rename from paddle/trainer/tests/sample_data.txt
rename to paddle/legacy/trainer/tests/sample_data.txt
diff --git a/paddle/legacy/trainer/tests/sample_filelist.txt b/paddle/legacy/trainer/tests/sample_filelist.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8573f9e1795edd37cfa0d21f0effc08a80d38e29
--- /dev/null
+++ b/paddle/legacy/trainer/tests/sample_filelist.txt
@@ -0,0 +1 @@
+legacy/trainer/tests/sample_data.txt
diff --git a/paddle/legacy/trainer/tests/sample_trainer_config.conf b/paddle/legacy/trainer/tests/sample_trainer_config.conf
new file mode 100644
index 0000000000000000000000000000000000000000..5800b3625661efac80b84b19c2a5cedc34718488
--- /dev/null
+++ b/paddle/legacy/trainer/tests/sample_trainer_config.conf
@@ -0,0 +1,87 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+TrainData(SimpleData(
+            files = "legacy/trainer/tests/sample_filelist.txt",
+            feat_dim = 3,
+            context_len = 0,
+            buffer_capacity = 1000000))
+
+TestData(SimpleData(
+           files = "legacy/trainer/tests/sample_filelist.txt",
+           feat_dim = 3,
+           context_len = 0,
+           buffer_capacity = 1000000))
+
+settings(batch_size = 100)
+
+data = data_layer(name='input', size=3)
+
+fc1 = fc_layer(input=data, size=5,
+               bias_attr=False,
+               act=SigmoidActivation())
+
+fc2 = fc_layer(input=data, size=9,
+               bias_attr=False,
+               act=LinearActivation())
+
+fc3 = fc_layer(input=data, size=3,
+               bias_attr=False,
+               act=TanhActivation())
+
+fc4 = fc_layer(input=data, size=5,
+               bias_attr=False,
+               act=LinearActivation(),
+               param_attr=ParamAttr(name='sharew'))
+
+fc5 = fc_layer(input=data, size=5,
+               bias_attr=False,
+               act=BReluActivation())
+
+fc6 = fc_layer(input=data, size=5,
+               bias_attr=False,
+               act=SoftReluActivation())
+
+fc7 = fc_layer(input=data, size=3,
+               bias_attr=False,
+               act=SquareActivation())
+
+fc8 = fc_layer(input=data, size=5,
+               bias_attr=True,
+               act=SquareActivation())
+
+with mixed_layer(size=3, act=SoftmaxActivation()) as layer9:
+    layer9 += full_matrix_projection(input=fc1)
+    layer9 += full_matrix_projection(input=fc2)
+    layer9 += full_matrix_projection(input=fc3)
+    layer9 += trans_full_matrix_projection(input=fc4,
+                                           param_attr=ParamAttr(name='sharew'))
+    layer9 += full_matrix_projection(input=fc5)
+    layer9 += full_matrix_projection(input=fc6)
+    layer9 += full_matrix_projection(input=fc7)
+    layer9 += full_matrix_projection(input=fc8)
+
+if get_config_arg('with_cost', bool, True):
+    # This is for training the neural network.
+    # We need to have another data layer for label
+    # and a layer for calculating cost
+    lbl = data_layer(name='label', size=1)
+    outputs(classification_cost(input=layer9, label=lbl))
+else:    
+    # This is for prediction where we don't have label
+    # and don't need to calculate cost
+    outputs(layer9)
diff --git a/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf b/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf
new file mode 100644
index 0000000000000000000000000000000000000000..155c40b31f30c40e1ddeb65500f55162beb9a0ee
--- /dev/null
+++ b/paddle/legacy/trainer/tests/sample_trainer_config_hsigmoid.conf
@@ -0,0 +1,53 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+TrainData(SimpleData(
+    files = "legacy/trainer/tests/sample_filelist.txt",
+    feat_dim = 3,
+    context_len = 0,
+    buffer_capacity = 1000000,
+))
+
+settings(batch_size = 100)
+
+data = data_layer(name='input', size=3)
+
+fc1 = fc_layer(input=data, size=12,
+               bias_attr=False,
+               act=SigmoidActivation())
+
+fc2 = fc_layer(input=data, size=19,
+               bias_attr=False,
+               act=LinearActivation())
+
+fc3 = fc_layer(input=data, size=5,
+               bias_attr=False,
+               act=TanhActivation())
+
+fc4 = fc_layer(input=data, size=5,
+               bias_attr=False,
+               act=LinearActivation())
+
+# This is for training the neural network.
+# We need to have another data layer for label
+# and a layer for calculating cost
+lbl = data_layer(name='label', size=1)
+
+outputs(hsigmoid(input=[fc1, fc2, fc3, fc4],
+                 label=lbl,
+                 num_classes=3))
diff --git a/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf b/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf
new file mode 100644
index 0000000000000000000000000000000000000000..49cdde7fa2c55e6536a49633f959af6a888ec463
--- /dev/null
+++ b/paddle/legacy/trainer/tests/sample_trainer_config_parallel.conf
@@ -0,0 +1,86 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+TrainData(SimpleData(
+            files = "legacy/trainer/tests/sample_filelist.txt",
+            feat_dim = 3,
+            context_len = 0,
+            buffer_capacity = 1000000))
+
+TestData(SimpleData(
+           files = "legacy/trainer/tests/sample_filelist.txt",
+           feat_dim = 3,
+           context_len = 0,
+           buffer_capacity = 1000000))
+
+settings(batch_size = 100)
+
+# Output layer, label layer, cost layer, preferably set to the same environment.
+output_device = 0
+
+# Input Layer does not need to specify the device number.
+data = data_layer(name='input', size=3)
+
+# Calculate in the CPU.
+fc1 = fc_layer(input=data, size=5,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=-1),
+               act=SigmoidActivation())
+
+# Calculate in the GPU 0.
+fc2 = fc_layer(input=fc1, size=10,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=0),
+               act=SigmoidActivation())
+
+# Calculate in the GPU 1.
+fc3 = fc_layer(input=fc1, size=10,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=1),
+               act=SigmoidActivation())
+
+# Calculate in the GPU 0.
+fc4 = fc_layer(input=[fc2,fc3], size=10,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=0),
+               act=SigmoidActivation())
+
+# Calculate in the GPU 1.
+fc5 = fc_layer(input=[fc2,fc3], size=10,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=1),
+               act=SigmoidActivation())
+
+output = fc_layer(input=[fc4,fc5], size=10,
+                  bias_attr=True,
+                  layer_attr=ExtraAttr(device=output_device),
+                  act=SoftmaxActivation())
+
+if get_config_arg('with_cost', bool, True):
+    # This is for training the neural network.
+    # We need to have another data layer for label
+    # and a layer for calculating cost
+    lbl = data_layer(name='label', size=1,
+                    layer_attr=ExtraAttr(device=output_device))
+                    
+    outputs(classification_cost(input=output, 
+                                label=lbl,
+                                layer_attr=ExtraAttr(device=output_device)))
+else:
+    # This is for prediction where we don't have label
+    # and don't need to calculate cost
+    outputs(output)
diff --git a/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf
new file mode 100644
index 0000000000000000000000000000000000000000..51ef905a5a182464f69a1629e51bf8180eadb3fb
--- /dev/null
+++ b/paddle/legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf
@@ -0,0 +1,73 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=15, learning_rate=0)
+
+num_words = 5
+beam_flag = get_config_arg('beam_search', bool, False)
+
+sent_id = data_layer(name="sent_id", size=1)
+
+# This layer has no actual use, but only to decide batch_size in generation.
+# When generating, at least one Memory in RecurrentLayer MUST have a boot layer.
+dummy_data = data_layer(name="dummy_data_input", size=2)
+
+def outer_step(dummy_data):
+
+    gen_inputs = [StaticInput(input=dummy_data, size=2, is_seq=True),
+                  GeneratedInput(size=num_words,
+                                 embedding_name="wordvec",
+                                 embedding_size=num_words)]
+
+    def inner_step(dummy_memory, predict_word):
+
+        # simplified RNN for testing
+        with mixed_layer(size=num_words) as layer:
+            layer += full_matrix_projection(input=predict_word,
+                                            param_attr=ParamAttr(name="transtable"))
+
+        with mixed_layer(size=num_words, act=ExpActivation()) as out:
+            out += trans_full_matrix_projection(input=layer,
+                                                param_attr=ParamAttr(name="wordvec"))
+
+        return out
+
+    beam_gen = beam_search(name="rnn_gen",
+                           step=inner_step,
+                           input=gen_inputs,
+                           bos_id=0,
+                           eos_id=num_words-1,
+                           beam_size=2 if beam_flag else 1,
+                           num_results_per_sample=1,
+                           max_length=10)
+    return beam_gen
+
+beam_gen_concat = recurrent_group(name="rnn_gen_concat",
+                                  step=outer_step,
+                                  input=[SubsequenceInput(dummy_data)])
+
+seqtext_printer_evaluator(input=beam_gen_concat,
+                          id_input=sent_id,
+                          dict_file="./legacy/trainer/tests/test_gen_dict.txt",
+                          result_file="./legacy/trainer/tests/dump_text.test")
+#outputs(beam_gen_concat)
+# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory
+# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs
+# as follows. Note that "__beam_search_predict__" is the default output name of beam_search.
+Inputs("sent_id","dummy_data_input")
+Outputs("__beam_search_predict__")
diff --git a/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf
new file mode 100644
index 0000000000000000000000000000000000000000..35c7f0fcd91f9b534a4f535387af720659d7f9b8
--- /dev/null
+++ b/paddle/legacy/trainer/tests/sample_trainer_rnn_gen.conf
@@ -0,0 +1,66 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=15, learning_rate=0)
+
+num_words = 5
+beam_flag = get_config_arg('beam_search', bool, False)
+
+sent_id = data_layer(name="sent_id", size=1)
+
+# This layer has no actual use, but only to decide batch_size in generation.
+# When generating, at least one Memory in RecurrentLayer MUST have a boot layer.
+dummy_data = data_layer(name="dummy_data_input", size=2)
+
+gen_inputs = [StaticInput(input=dummy_data, size=2),
+              GeneratedInput(size=num_words,
+                             embedding_name="wordvec",
+                             embedding_size=num_words)]
+
+def step(dummy_memory, predict_word):
+
+    # simplified RNN for testing
+    with mixed_layer(size=num_words) as layer:
+        layer += full_matrix_projection(input=predict_word,
+                                        param_attr=ParamAttr(name="transtable"))
+
+    with mixed_layer(size=num_words, act=ExpActivation()) as out:
+        out += trans_full_matrix_projection(input=layer,
+                                            param_attr=ParamAttr(name="wordvec"))
+
+    return out
+
+beam_gen = beam_search(name="rnn_gen",
+                       step=step,
+                       input=gen_inputs,
+                       bos_id=0,
+                       eos_id=num_words-1,
+                       beam_size=2 if beam_flag else 1,
+                       num_results_per_sample=2 if beam_flag else 1,
+                       max_length=10)
+
+seqtext_printer_evaluator(input=beam_gen,
+                          id_input=sent_id,
+                          dict_file="./legacy/trainer/tests/test_gen_dict.txt",
+                          result_file="./legacy/trainer/tests/dump_text.test")
+#outputs(beam_gen)
+# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory
+# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs
+# as follows. Note that "__beam_search_predict__" is the default output name of beam_search.
+Inputs("sent_id","dummy_data_input")
+Outputs("__beam_search_predict__")
diff --git a/paddle/legacy/trainer/tests/simple_sparse_neural_network.py b/paddle/legacy/trainer/tests/simple_sparse_neural_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..9419f4d903b1de205a6c549c7dcd9bb85ed7396b
--- /dev/null
+++ b/paddle/legacy/trainer/tests/simple_sparse_neural_network.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4)
+
+file_list = 'legacy/trainer/tests/fake_file_list.list'
+
+define_py_data_sources2(
+    train_list=file_list,
+    test_list=file_list,
+    module="simple_sparse_neural_network_dp",
+    obj="process")
+
+embedding = embedding_layer(
+    input=data_layer(
+        name="word_ids", size=8191),
+    size=128,
+    param_attr=ParamAttr(sparse_update=True))
+prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation())
+
+outputs(
+    classification_cost(
+        input=prediction, label=data_layer(
+            name='label', size=10)))
diff --git a/paddle/trainer/tests/simple_sparse_neural_network_dp.py b/paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py
similarity index 100%
rename from paddle/trainer/tests/simple_sparse_neural_network_dp.py
rename to paddle/legacy/trainer/tests/simple_sparse_neural_network_dp.py
diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/legacy/trainer/tests/testPyDataWrapper.py
similarity index 100%
rename from paddle/trainer/tests/testPyDataWrapper.py
rename to paddle/legacy/trainer/tests/testPyDataWrapper.py
diff --git a/paddle/legacy/trainer/tests/test_Compare.cpp b/paddle/legacy/trainer/tests/test_Compare.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e37e546be8513b1cc7438810a01641859a4bad18
--- /dev/null
+++ b/paddle/legacy/trainer/tests/test_Compare.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/legacy/utils/PythonUtil.h>
+
+#include "paddle/legacy/trainer/Trainer.h"
+
+#include <gtest/gtest.h>
+#include <cstdlib>
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static const string& configFile =
+    "legacy/trainer/tests/sample_trainer_config.conf";
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_string(config_args);
+
+struct comData {
+  vector<Argument> outArgs;
+  vector<ParameterPtr> parameters;
+};
+
+void calcGradient(bool useGpu, comData& Data) {
+  FLAGS_use_gpu = useGpu;
+  FLAGS_config = configFile;
+
+  *ThreadLocalRand::getSeed() = 0;
+  srand(0);
+  Trainer trainer;
+  trainer.init(TrainerConfigHelper::createFromFlagConfig());
+
+  Data.parameters = trainer.getGradientMachine()->getParameters();
+  DataBatch dataBatch;
+  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
+  trainer.getDataProvider()->setSkipShuffle();
+  trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
+  CHECK(dataBatch.getSize()) << "No data from data provider";
+  vector<Argument>& inArgs = dataBatch.getStreams();
+  trainer.getGradientMachine()->start();
+  for (int i = 0; i < 2; ++i) {
+    trainer.getGradientMachine()->forwardBackward(
+        inArgs, &Data.outArgs, PASS_TRAIN);
+  }
+  trainer.getGradientMachine()->finish();
+}
+
+void compareGradient(comData& comDataCpu, comData& comDataGpu);
+
+TEST(Trainer, create) {
+  int devCount = 0;
+  devCount = hl_get_device_count();
+  FLAGS_config_args = "drop_rate=0";
+
+  comData comDataCpu;
+  calcGradient(false, comDataCpu);
+  LOG(INFO) << "Cpu is completed";
+
+  {
+    LOG(INFO) << "Test GPU";
+    comData comData;
+    calcGradient(true, comData);
+    compareGradient(comDataCpu, comData);
+    LOG(INFO) << "Gpu is completed";
+  }
+
+  {
+    LOG(INFO) << "Test test multi gpu";
+    comData comData;
+    FLAGS_trainer_count = devCount;
+    calcGradient(true, comData);
+    compareGradient(comDataCpu, comData);
+    LOG(INFO) << "Gpu4 is completed";
+  }
+
+  {
+    LOG(INFO) << "Test use_sparse_update=true";
+    comData comData;
+    calcGradient(false, comData);
+    compareGradient(comDataCpu, comData);
+    LOG(INFO) << "Cpu4 is completed";
+  }
+}
+
+double checkBuffer(real* A, real* B, size_t len) {
+#ifdef PADDLE_TYPE_DOUBLE
+  double precision = 1e-7;
+#else
+  double precision = 2e-3;
+#endif
+  int nNum = 0;
+  double maxE = 0;
+  for (size_t i = 0; i < len; ++i) {
+    double e = fabs(A[i] - B[i]);
+    maxE = std::max(e, maxE);
+    nNum += e > precision * fabs(A[i]);
+  }
+  EXPECT_EQ(0, nNum);
+  return maxE;
+}
+
+void compareGradient(comData& comDataCpu, comData& comDataGpu) {
+  /*compare outArgs*/
+  vector<Argument> outArgs1 = comDataCpu.outArgs;
+  vector<Argument> outArgs2 = comDataGpu.outArgs;
+  CpuMatrix out1(outArgs1[0].value->getHeight(), outArgs1[0].value->getWidth());
+  CpuMatrix out2(outArgs2[0].value->getHeight(), outArgs2[0].value->getWidth());
+  out1.copyFrom(*outArgs1[0].value);
+  out2.copyFrom(*outArgs2[0].value);
+  checkBuffer(out1.getData(), out2.getData(), out1.getElementCnt());
+
+  /*compare parameters*/
+  vector<ParameterPtr>& parameters1 = comDataCpu.parameters;
+  vector<ParameterPtr>& parameters2 = comDataGpu.parameters;
+  for (size_t i = 0; i < parameters1.size(); ++i) {
+    ParameterPtr parameter1, parameter2;
+    parameter1 = parameters1[i];
+    parameter2 = parameters2[i];
+    /*compare parameters value*/
+    CpuVector para1(parameter1->getSize());
+    CpuVector para2(parameter2->getSize());
+    para1.copyFrom(*parameter1->getBuf(PARAMETER_VALUE));
+    para2.copyFrom(*parameter2->getBuf(PARAMETER_VALUE));
+    checkBuffer(para1.getData(), para2.getData(), para1.getSize());
+
+    /*compare parameters grad*/
+    CpuVector cpuGrad1(*parameter1->getBuf(PARAMETER_GRADIENT));
+    CpuVector cpuGrad2(*parameter2->getBuf(PARAMETER_GRADIENT));
+    double e =
+        checkBuffer(cpuGrad1.getData(), cpuGrad2.getData(), cpuGrad1.getSize());
+    LOG(INFO) << parameter1->getName() << " max error=" << e;
+  }
+}
+
+int main(int argc, char** argv) {
+#ifndef PADDLE_WITH_CUDA
+  exit(0);
+#endif
+  paddle::initMain(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  initPython(argc, argv);
+  int ret = RUN_ALL_TESTS();
+  exit(ret);
+}
diff --git a/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..847adcfabada18e11203d3f18fb6dc355c670afb
--- /dev/null
+++ b/paddle/legacy/trainer/tests/test_PyDataProviderWrapper.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_NO_PYTHON
+#include <DataConfig.pb.h>
+#include <gtest/gtest.h>
+#include <paddle/legacy/gserver/dataproviders/DataProvider.h>
+#include <paddle/legacy/math/Matrix.h>
+#include <paddle/legacy/parameter/Argument.h>
+#include <paddle/legacy/utils/PythonUtil.h>
+#include <fstream>
+#include <typeinfo>
+#include <unordered_map>
+#include <unordered_set>
+#include "picojson.h"
+
+void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
+const std::string kDir = "./legacy/trainer/tests/pydata_provider_wrapper_dir/";
+
+TEST(PyDataProviderWrapper, SequenceData) {
+  paddle::DataConfig conf;
+  conf.set_type("py");
+  conf.set_load_data_module("testPyDataWrapper");
+  conf.set_load_data_object("processSeqAndGenerateData");
+  conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json");
+  conf.clear_files();
+  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
+  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
+  provider->setSkipShuffle();
+  provider->reset();
+  paddle::DataBatch batchFromPy;
+  provider->getNextBatch(100, &batchFromPy);
+
+  picojson::value val;
+  std::fstream fin;
+  fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in);
+  EXPECT_TRUE(fin.is_open());
+  if (fin.is_open()) {
+    std::string err = picojson::parse(val, fin);
+    EXPECT_TRUE(err.empty());
+    EXPECT_TRUE(val.is<picojson::array>());
+    picojson::array& arr = val.get<picojson::array>();
+    std::vector<paddle::Argument>& arguments = batchFromPy.getStreams();
+    // CHECK Value
+    checkValue(arguments, arr);
+    // CHECK sequenceStartPositions
+    for (size_t i = 0; i < arr.size(); i++) {
+      int row_id = arr[i].get<picojson::array>().size();
+      EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]);
+      EXPECT_EQ((int)row_id,
+                arguments[i].sequenceStartPositions->getData(false)[1]);
+    }
+    fin.close();
+  }
+}
+
+TEST(PyDataProviderWrapper, HasSubSequenceData) {
+  paddle::DataConfig conf;
+  conf.set_type("py");
+  conf.set_load_data_module("testPyDataWrapper");
+  conf.set_load_data_object("processSubSeqAndGenerateData");
+  conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json");
+  conf.clear_files();
+  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
+  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
+  provider->setSkipShuffle();
+  provider->reset();
+  paddle::DataBatch batchFromPy;
+  provider->getNextBatch(1, &batchFromPy);
+
+  picojson::value val;
+  std::fstream fin;
+  fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in);
+  EXPECT_TRUE(fin.is_open());
+  if (fin.is_open()) {
+    std::string err = picojson::parse(val, fin);
+    EXPECT_TRUE(err.empty());
+    EXPECT_TRUE(val.is<picojson::array>());
+    picojson::array& arr = val.get<picojson::array>();
+    std::vector<paddle::Argument>& arguments = batchFromPy.getStreams();
+    // CHECK Value
+    checkValue(arguments, arr);
+    // CHECK sequenceStartPositions and subSequenceStartPositions
+    for (size_t i = 0; i < arr.size(); i++) {
+      int row_id = arr[i].get<picojson::array>().size();
+      EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]);
+      EXPECT_EQ((int)row_id,
+                arguments[i].sequenceStartPositions->getData(false)[1]);
+      EXPECT_EQ(0, arguments[i].subSequenceStartPositions->getData(false)[0]);
+      EXPECT_EQ((int)row_id,
+                arguments[i].subSequenceStartPositions->getData(false)[1]);
+    }
+    fin.close();
+  }
+}
+
+int main(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+  paddle::initPython(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
+void checkValue(std::vector<paddle::Argument>& arguments,
+                picojson::array& arr) {
+  // CHECK SLOT 0, Sparse Value.
+  paddle::Argument& sparse_values_seq = arguments[0];
+  paddle::MatrixPtr& sparse_values_seq_rawmatrix = sparse_values_seq.value;
+  EXPECT_TRUE(sparse_values_seq_rawmatrix != nullptr);
+  paddle::CpuSparseMatrix* sparse_val_seq_sparse_mat =
+      dynamic_cast<paddle::CpuSparseMatrix*>(sparse_values_seq_rawmatrix.get());
+  EXPECT_TRUE(sparse_val_seq_sparse_mat != nullptr);
+  EXPECT_EQ(arr.size(), arguments.size());
+  EXPECT_TRUE(arr[0].is<picojson::array>());
+  size_t row_id = 0;
+  for (picojson::value& sparse_val_seq : arr[0].get<picojson::array>()) {
+    std::unordered_map<int, real> cols;
+    for (picojson::value& kv : sparse_val_seq.get<picojson::array>()) {
+      EXPECT_TRUE(kv.get(0).is<double>());
+      EXPECT_TRUE(kv.get(1).is<double>());
+      int col = (int)(kv.get(0).get<double>());
+      real val = (real)(kv.get(1).get<double>());
+      cols.insert({col, val});
+    }
+    size_t colNum = sparse_val_seq_sparse_mat->getColNum(row_id);
+    EXPECT_EQ(cols.size(), colNum);
+    int* rowIds = sparse_val_seq_sparse_mat->getRowCols(row_id);
+    real* rowBuf = sparse_val_seq_sparse_mat->getRowValues(row_id);
+    for (size_t i = 0; i < colNum; ++i) {
+      int id = rowIds[i];
+      auto it = cols.find(id);
+      EXPECT_NE(cols.end(), it);
+      real expect = it->second;
+      EXPECT_NEAR(expect, *rowBuf, 1e-5);
+      ++rowBuf;
+    }
+    ++row_id;
+  }
+
+  // CHECK SLOT 1, Dense Value.
+  paddle::Argument& dense_arg = arguments[1];
+  paddle::MatrixPtr& dense_mat = dense_arg.value;
+  EXPECT_NE(nullptr, dense_mat);
+  EXPECT_TRUE(arr[1].is<picojson::array>());
+  row_id = 0;
+  for (picojson::value& dense_seq : arr[1].get<picojson::array>()) {
+    EXPECT_TRUE(dense_seq.is<picojson::array>());
+    picojson::array& row = dense_seq.get<picojson::array>();
+    EXPECT_EQ(row.size(), dense_mat->getWidth());
+    real* rowBuf = dense_mat->getRowBuf(row_id++);
+
+    for (picojson::value& val : row) {
+      EXPECT_TRUE(val.is<double>());
+      real expect = val.get<double>();
+      EXPECT_NEAR(expect, *rowBuf++, 1e-5);
+    }
+  }
+
+  // CHECK SLOT 2, Sparse Non Value.
+  paddle::Argument& sparse_non_val_arg = arguments[2];
+  paddle::MatrixPtr& sparse_non_val_rawm = sparse_non_val_arg.value;
+  EXPECT_NE(nullptr, sparse_non_val_rawm);
+  paddle::CpuSparseMatrix* sparse_non_val_m =
+      dynamic_cast<paddle::CpuSparseMatrix*>(sparse_non_val_rawm.get());
+  EXPECT_NE(nullptr, sparse_non_val_m);
+  row_id = 0;
+  for (picojson::value& row : arr[2].get<picojson::array>()) {
+    EXPECT_TRUE(row.is<picojson::array>());
+    std::unordered_set<int> ids;
+    for (picojson::value& id : row.get<picojson::array>()) {
+      EXPECT_TRUE(id.is<double>());
+      ids.insert((int)(id.get<double>()));
+    }
+    size_t colNum = sparse_non_val_m->getColNum(row_id);
+    EXPECT_EQ(ids.size(), colNum);
+    for (size_t i = 0; i < colNum; ++i) {
+      int col = sparse_non_val_m->getRowCols(row_id)[i];
+      EXPECT_TRUE(ids.find(col) != ids.end());
+    }
+    ++row_id;
+  }
+
+  // CHECK SLOT 3, Index.
+  paddle::Argument& index_arg = arguments[3];
+  paddle::IVectorPtr indices = index_arg.ids;
+  EXPECT_NE(nullptr, indices);
+  int* idPtr = indices->getData();
+  for (picojson::value& id : arr[3].get<picojson::array>()) {
+    EXPECT_TRUE(id.is<double>());
+    int _id = (int)(id.get<double>());
+    EXPECT_EQ(_id, *idPtr++);
+  }
+
+  // CHECK SLOT 4, String.
+  paddle::Argument& strArg = arguments[4];
+  std::vector<std::string>* strPtr = strArg.strs.get();
+  EXPECT_NE(nullptr, strPtr);
+  size_t vecIndex = 0;
+  for (picojson::value& str : arr[4].get<picojson::array>()) {
+    EXPECT_TRUE(str.is<std::string>());
+    std::string _str = str.get<std::string>();
+    EXPECT_EQ(_str, (*strPtr)[vecIndex++]);
+  }
+}
+
+#else
+int main() { return 0; }
+
+#endif
diff --git a/paddle/legacy/trainer/tests/test_Trainer.cpp b/paddle/legacy/trainer/tests/test_Trainer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..14ad0a265281a8df20a70b0da2873ea451338ddb
--- /dev/null
+++ b/paddle/legacy/trainer/tests/test_Trainer.cpp
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/legacy/utils/PythonUtil.h>
+#include <paddle/legacy/utils/Version.h>
+#include "paddle/legacy/trainer/Trainer.h"
+
+#include <gtest/gtest.h>
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static const string& configFile1 =
+    "legacy/trainer/tests/sample_trainer_config.conf";
+static const string& configFile2 =
+    "legacy/trainer/tests/sample_trainer_config_hsigmoid.conf";
+static const string& configFile4 =
+    "legacy/trainer/tests/sample_trainer_config_parallel.conf";
+
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_bool(allow_only_one_model_on_one_gpu);
+
+void checkGradientTest(const string& configFile,
+                       bool useGpu,
+                       bool parallel,
+                       int trainerCount = 1) {
+  FLAGS_use_gpu = useGpu;
+  FLAGS_parallel_nn = parallel;
+  FLAGS_config = configFile;
+  FLAGS_trainer_count = trainerCount;
+  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
+            << " configFile=" << configFile;
+
+  Trainer trainer;
+  trainer.init(TrainerConfigHelper::createFromFlagConfig());
+  EXPECT_LE(fabs(trainer.checkGradient()), 0.02);
+}
+
+TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
+
+#ifdef PADDLE_WITH_CUDA
+TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
+
+TEST(checkGradient, multiGpu) {
+  int numGpu;
+  numGpu = hl_get_device_count();
+  for (auto count : {2, 4}) {
+    if (count <= numGpu) {
+      checkGradientTest(configFile1, true, false, count);
+    }
+  }
+}
+
+TEST(checkGradient, parallel) {
+  if (hl_get_device_count() >= 2) {
+    checkGradientTest(configFile4, true, true);
+  }
+}
+
+TEST(checkGradient, multiParallel) {
+  FLAGS_allow_only_one_model_on_one_gpu = false;
+  checkGradientTest(configFile4, true, true, 2);
+  FLAGS_allow_only_one_model_on_one_gpu = true;
+}
+
+#endif
+
+TEST(checkGradient, multi) {
+  int numGpu;
+  if (version::isWithGpu()) {
+    numGpu = hl_get_device_count();
+  } else {
+    numGpu = 0;
+  }
+  for (bool useGpu : {false, true}) {
+    for (auto count : {2, 4}) {
+      if (useGpu && count > numGpu) continue;
+      checkGradientTest(configFile1, useGpu, false, count);
+    }
+  }
+}
+
+TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
+
+TEST(checkGradient, non_parallel) {
+  checkGradientTest(configFile4, false, false);
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  initPython(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp b/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e5c5ea723f3fd80316ee826fe9c6566e7049b7b
--- /dev/null
+++ b/paddle/legacy/trainer/tests/test_TrainerOnePass.cpp
@@ -0,0 +1,318 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/legacy/utils/GlobalConstants.h>
+#include <paddle/legacy/utils/PythonUtil.h>
+#include "paddle/legacy/trainer/Trainer.h"
+#include "paddle/legacy/trainer/TrainerInternal.h"
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/pserver/ParameterServer2.h>
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static const string& configFile1 =
+    "legacy/trainer/tests/sample_trainer_config.conf";
+static const string& configFile2 =
+    "legacy/trainer/tests/sample_trainer_config_parallel.conf";
+
+static const string& configFileSimpleSparse =
+    "legacy/trainer/tests/simple_sparse_neural_network.py";
+
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_int32(seed);
+DECLARE_int32(num_passes);
+DECLARE_int32(saving_period);
+
+class TrainerForTest : public paddle::Trainer {
+ public:
+  inline const std::shared_ptr<ParameterUpdater>& getParameterUpdaterForTest() {
+    return this->trainerInternal_.getParameterUpdater();
+  }
+};
+
+int gNumDevices = 0;
+
+void trainerOnePassTest(const string& configFile,
+                        bool useGpu,
+                        bool parallel,
+                        int trainerCount = 1,
+                        double averageWindow = 0.0f,
+                        bool doAverageInCpu = false) {
+  FLAGS_use_gpu = useGpu;
+  FLAGS_parallel_nn = parallel;
+  FLAGS_config = configFile;
+  FLAGS_trainer_count = trainerCount;
+  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
+            << " configFile=" << configFile;
+  srand(FLAGS_seed);
+
+  if (useGpu) {
+    if (gNumDevices < trainerCount) {
+      return;
+    }
+  }
+
+  Trainer trainer;
+  auto config = TrainerConfigHelper::createFromFlagConfig();
+  if (averageWindow > 0) {
+    config->getOptConfig().set_average_window(averageWindow);
+    config->getOptConfig().set_do_average_in_cpu(doAverageInCpu);
+  }
+  trainer.init(config);
+  trainer.train();
+}
+
+// 1. test trainer (cpu, gpu).
+TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
+
+#ifdef PADDLE_WITH_CUDA
+TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
+
+TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
+
+TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); }
+
+TEST(trainerOnePass, parallel) {
+  if (hl_get_device_count() >= 2) {
+    trainerOnePassTest(configFile2, true, true);
+  }
+}
+#endif
+
+// 2. test average_window.
+#ifdef PADDLE_WITH_CUDA
+TEST(average_window, gpu) {
+  trainerOnePassTest(configFile1, true, false, 4, 0.01);
+}
+
+TEST(average_window, gpu2) {
+  FLAGS_num_passes = 20;
+  trainerOnePassTest(configFile1, true, false, 2, 0.01);
+  FLAGS_num_passes = 1;
+}
+
+TEST(average_window, gpu4) {
+  FLAGS_num_passes = 20;
+  trainerOnePassTest(configFile1, true, false, 4, 0.01);
+  FLAGS_num_passes = 1;
+}
+
+TEST(average_window_cpu, gpu2) {
+  FLAGS_num_passes = 20;
+  trainerOnePassTest(configFile1, true, false, 2, 0.01, true);
+  FLAGS_num_passes = 1;
+}
+
+TEST(average_window_cpu, gpu4) {
+  FLAGS_num_passes = 20;
+  trainerOnePassTest(configFile1, true, false, 4, 0.01, true);
+  FLAGS_num_passes = 1;
+}
+#endif
+
+// 3. test trainer + pserver.
+DECLARE_int32(num_gradient_servers);
+DECLARE_int32(port);
+DECLARE_bool(local);
+DECLARE_bool(use_old_updater);
+
+double checkRemoteParameterUpdater(TrainerForTest& trainer) {
+  auto gradientMachine = trainer.getGradientMachine();
+  auto parameterUpdater = trainer.getParameterUpdaterForTest();
+  auto dataProvider = trainer.getDataProvider();
+  auto& parameters = gradientMachine->getParameters();
+  const TrainerConfig& config = trainer.getConfig();
+  const string& alg = config.opt_config().algorithm();
+
+  vector<ParameterPtr> parameterCheck;
+  for (auto& parameter : parameters) {
+    parameterCheck.emplace_back(
+        new Parameter(parameter->getConfig(), /* useGpu= */ false));
+    parameterCheck.back()
+        ->getBuf(PARAMETER_VALUE)
+        ->copyFrom(*parameter->getBuf(PARAMETER_VALUE));
+    parameterCheck.back()
+        ->getBuf(PARAMETER_GRADIENT)
+        ->copyFrom(*parameter->getBuf(PARAMETER_GRADIENT));
+  }
+
+  std::unique_ptr<ParameterUpdater> parameterUpdaterCheck;
+  if (alg == TrainAlgorithm::SGD) {
+    parameterUpdaterCheck.reset(new SgdLocalUpdater(config.opt_config()));
+  } else {
+    LOG(INFO) << "unsupported algorithm in remote parameter check: " << alg;
+    return -1.0;
+  }
+  parameterUpdaterCheck->init(parameterCheck);
+
+  // gradientMachine->start(config, *dataProvider);
+  DataBatch dataBatch;
+  int32_t batchSize = config.opt_config().batch_size();
+  dataProvider->getNextBatch(batchSize, &dataBatch);
+  CHECK(dataBatch.getSize()) << "No data from data provider";
+  int64_t actualBatchSize = dataBatch.getSize();
+  const vector<Argument>& inArgs = dataBatch.getStreams();
+  vector<Argument> outArgs;
+
+  UpdateCallback updateCallback = [parameterUpdater,
+                                   parameterCheck](Parameter* para) {
+    parameterCheck[para->getID()]
+        ->getBuf(PARAMETER_GRADIENT)
+        ->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
+    parameterUpdater->update(para);
+  };
+
+  parameterUpdater->startPass();
+  parameterUpdaterCheck->startPass();
+
+  for (int i = 0; i < config.opt_config().num_batches_per_get_parameter() * 2;
+       ++i) {
+    PassType passType = parameterUpdater->startBatch(actualBatchSize);
+    gradientMachine->forwardBackward(
+        inArgs, &outArgs, passType, updateCallback);
+    parameterUpdater->finishBatch(0);
+
+    parameterUpdaterCheck->startBatch(actualBatchSize);
+    for (auto& para : parameterCheck) {
+      parameterUpdaterCheck->update(para.get());
+    }
+    parameterUpdaterCheck->finishBatch(0);
+  }
+
+  double sum = 0.0f;
+  for (size_t i = 0; i != parameters.size(); ++i) {
+    real *v1, *v2;
+    CpuVector trainerPara(parameters[i]->getSize());
+    trainerPara.copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
+    if (!FLAGS_use_gpu) {
+      v1 = parameters[i]->getBuf(PARAMETER_VALUE)->getData();
+    } else {
+      v1 = trainerPara.getData();
+    }
+    v2 = parameterCheck[i]->getBuf(PARAMETER_VALUE)->getData();
+
+    size_t size = parameters[i]->getSize();
+    double diff = 0;
+    for (size_t j = 0; j < size; ++j) {
+      diff += fabs(v1[j] - v2[j]);
+    }
+    sum += diff;
+    LOG(INFO) << setiosflags(ios::left) << setfill(' ') << setw(20)
+              << parameters[i]->getName() << "diff=" << setw(15) << diff;
+  }
+
+  parameterUpdater->finishPass();
+  parameterUpdaterCheck->finishPass();
+  gradientMachine->finish();
+  return sum;
+}
+
+void checkRemoteParameterUpdaterTest(const string& configFile,
+                                     bool useGpu,
+                                     bool parallel,
+                                     int trainerCount = 1,
+                                     bool useOldUpdater = false,
+                                     int num_batches_per_get_parameter = 1) {
+  FLAGS_use_gpu = useGpu;
+  FLAGS_parallel_nn = parallel;
+  FLAGS_config = configFile;
+  FLAGS_trainer_count = trainerCount;
+  FLAGS_use_old_updater = useOldUpdater;
+  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
+            << " configFile=" << configFile;
+  srand(FLAGS_seed);
+
+  if (useGpu) {
+    if (gNumDevices < trainerCount) {
+      return;
+    }
+  }
+
+  FLAGS_local = 0;
+  std::shared_ptr<ParameterServer2> pserver;
+  pserver.reset(new ParameterServer2(std::string(), FLAGS_port));
+  pserver->init();
+  pserver->start();
+
+  TrainerForTest trainer;
+  auto config = TrainerConfigHelper::createFromFlagConfig();
+  config->getOptConfig().set_num_batches_per_get_parameter(
+      num_batches_per_get_parameter);
+  trainer.init(config);
+  EXPECT_EQ(checkRemoteParameterUpdater(trainer), 0);
+
+  FLAGS_local = 1;
+}
+
+TEST(checkRemoteUpdater, cpuTrainer) {
+  checkRemoteParameterUpdaterTest(configFile1, false, false);
+}
+
+TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
+  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(checkRemoteUpdater, gpuTrainer) {
+  checkRemoteParameterUpdaterTest(configFile1, true, false);
+}
+
+TEST(checkRemoteUpdater, gpu2Trainer) {
+  checkRemoteParameterUpdaterTest(configFile1, true, false, 2);
+}
+
+TEST(checkRemoteUpdater, gpu4Trainer) {
+  checkRemoteParameterUpdaterTest(configFile1, true, false, 4);
+}
+
+TEST(checkRemoteUpdater, gpuTrainerOldUpdater) {
+  checkRemoteParameterUpdaterTest(configFile1, true, false, 1, true);
+}
+
+TEST(checkRemoteUpdater, gpu2TrainerOldUpdater) {
+  checkRemoteParameterUpdaterTest(configFile1, true, false, 2, true);
+}
+
+TEST(checkRemoteUpdater, gpu4TrainerOldUpdater) {
+  checkRemoteParameterUpdaterTest(configFile1, true, false, 4, true);
+}
+
+#endif
+
+TEST(checkRemoteUpdater, cpuDeltaTrainer) {
+  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, false, 10);
+}
+
+TEST(checkRemoteUpdater, cpuDeltaTrainerOldUpdater) {
+  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true, 10);
+}
+
+TEST(SgdThreadUpdater, simpleSparseNN) {
+  trainerOnePassTest(configFileSimpleSparse, false, false, 1, 0.5, true);
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  initPython(argc, argv);
+  gNumDevices = hl_get_device_count();
+
+  FLAGS_num_passes = 1;          // train one pass
+  FLAGS_saving_period = 100000;  // do not save parameteres
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/legacy/trainer/tests/test_config.conf b/paddle/legacy/trainer/tests/test_config.conf
new file mode 100644
index 0000000000000000000000000000000000000000..bce687ad83686d465987d72defd37db2b50953a1
--- /dev/null
+++ b/paddle/legacy/trainer/tests/test_config.conf
@@ -0,0 +1,77 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+TrainData(SimpleData(
+    files = "legacy/trainer/tests/sample_filelist.txt",
+    feat_dim = 3,
+    context_len = 0,
+    buffer_capacity = 1000000,
+    async_load_data = False))
+
+settings(batch_size = 100)
+
+data = data_layer(name='input', size=3)
+
+wt = data_layer(name='weight', size=1)
+
+fc1 = fc_layer(input=data, size=5,
+               bias_attr=True,
+               act=SigmoidActivation())
+
+fc2 = fc_layer(input=data, size=12,
+               bias_attr=True,
+               param_attr=ParamAttr(name='sharew'),
+               act=LinearActivation())
+
+fc3 = fc_layer(input=data, size=3,
+               bias_attr=True,
+               act=TanhActivation())
+
+fc4 = fc_layer(input=data, size=5,
+               bias_attr=True,
+               layer_attr=ExtraAttr(drop_rate=0.5),
+               act=SquareActivation())
+
+pool = img_pool_layer(input=fc2,
+                      pool_size=2,
+                      pool_size_y=3,
+                      num_channels=1,
+                      padding=1,
+                      padding_y=2,
+                      stride=2,
+                      stride_y=3,
+                      pool_type=CudnnAvgPooling())
+
+concat = concat_layer(input=[fc3, fc4])
+
+with mixed_layer(size=3, act=SoftmaxActivation()) as output:
+    output += full_matrix_projection(input=fc1)
+    output += trans_full_matrix_projection(input=fc2,
+                                           param_attr=ParamAttr(name='sharew'))
+    output += full_matrix_projection(input=concat)
+    output += identity_projection(input=fc3)
+
+lbl = data_layer(name='label', size=1)
+
+cost = classification_cost(input=output, label=lbl, weight=wt,
+                           layer_attr=ExtraAttr(device=-1))
+
+nce = nce_layer(input=fc2, label=lbl, weight=wt,
+                num_classes=3, 
+                neg_distribution=[0.1, 0.3, 0.6])
+                
+outputs(cost, nce)
diff --git a/paddle/trainer/tests/test_gen_dict.txt b/paddle/legacy/trainer/tests/test_gen_dict.txt
similarity index 100%
rename from paddle/trainer/tests/test_gen_dict.txt
rename to paddle/legacy/trainer/tests/test_gen_dict.txt
diff --git a/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..47b4e82cd32917fcf32dbb5ffabb47330dab93d9
--- /dev/null
+++ b/paddle/legacy/trainer/tests/test_recurrent_machine_generation.cpp
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+
+#include <paddle/legacy/trainer/Trainer.h>
+#include <paddle/legacy/utils/PythonUtil.h>
+
+#include <gtest/gtest.h>
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+static const string& CONFIG_FILE =
+    "legacy/trainer/tests/sample_trainer_rnn_gen.conf";
+static const string& NEST_CONFIG_FILE =
+    "legacy/trainer/tests/sample_trainer_nest_rnn_gen.conf";
+static const string& OUTPUT_DIR = "legacy/trainer/tests/dump_text.test";
+static string modelDir =
+    "legacy/trainer/tests/rnn_gen_test_model_dir/t1";       // NOLINT
+static string expectFile =                                  // NOLINT
+    "legacy/trainer/tests/rnn_gen_test_model_dir/r1.test";  // NOLINT
+
+DECLARE_string(config_args);
+
+vector<float> readRetFile(const string& fname) {
+  ifstream inFile(fname);
+  float ret;
+  vector<float> nums;
+  while (inFile >> ret) {
+    nums.push_back(ret);
+  }
+  return nums;
+}
+
+void checkOutput(const string& expRetFile) {
+  vector<float> rets = readRetFile(OUTPUT_DIR);
+  vector<float> expRets = readRetFile(expRetFile);
+  EXPECT_EQ(rets.size(), expRets.size());
+  for (size_t i = 0; i < rets.size(); i++) {
+    EXPECT_FLOAT_EQ(rets[i], expRets[i]);
+  }
+}
+
+void prepareInArgs(vector<Argument>& inArgs,
+                   const size_t batchSize,
+                   bool useGpu,
+                   bool hasSubseq) {
+  inArgs.clear();
+  // sentence id
+  Argument sentId;
+  sentId.value = nullptr;
+  if (hasSubseq) {
+    // as there is only one sequence, there is only one label.
+    IVector::resizeOrCreate(sentId.ids, 1, useGpu);
+    sentId.ids->setElement(0, 0);
+  } else {
+    // as there is batchSize word, there is batchSize label.
+    IVector::resizeOrCreate(sentId.ids, batchSize, useGpu);
+    for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i);
+  }
+  inArgs.emplace_back(sentId);
+
+  // a dummy layer to decide batch size
+  Argument dummyInput;
+  dummyInput.value = Matrix::create(batchSize, 2, false, useGpu);
+  dummyInput.value->randomizeUniform();
+  if (hasSubseq) {
+    // generate one sequence with batchSize subsequence,
+    // and each subsequence has only one word.
+    dummyInput.sequenceStartPositions = ICpuGpuVector::create(2, false);
+    int* buf = dummyInput.sequenceStartPositions->getMutableData(false);
+    dummyInput.subSequenceStartPositions =
+        ICpuGpuVector::create(batchSize + 1, false);
+    int* subBuf = dummyInput.subSequenceStartPositions->getMutableData(false);
+    buf[0] = 0;
+    buf[1] = batchSize;
+    for (size_t i = 0; i < batchSize + 1; i++) subBuf[i] = i;
+  }
+  inArgs.emplace_back(dummyInput);
+}
+
+void testGeneration(const string& configFile,
+                    bool useGpu,
+                    bool hasSubseq,
+                    const string& expRetFile) {
+  FLAGS_use_gpu = useGpu;
+  auto config = std::make_shared<TrainerConfigHelper>(configFile);
+  unique_ptr<GradientMachine> gradientMachine(GradientMachine::create(*config));
+  gradientMachine->loadParameters(modelDir);
+  vector<Argument> inArgs(2);
+
+  const size_t batchSize = 15;
+  prepareInArgs(inArgs, batchSize, useGpu, hasSubseq);
+  vector<Argument> outArgs;
+  unique_ptr<Evaluator> testEvaluator(gradientMachine->makeEvaluator());
+  testEvaluator->start();
+  gradientMachine->forward(inArgs, &outArgs, PASS_TEST);
+  gradientMachine->eval(testEvaluator.get());
+  testEvaluator->finish();
+  checkOutput(expRetFile);
+}
+
+#ifndef PADDLE_TYPE_DOUBLE
+
+TEST(RecurrentGradientMachine, test_generation) {
+#ifndef PADDLE_WITH_CUDA
+  const auto useGpuConfs = {false};
+#else
+  const auto useGpuConfs = {true, false};
+#endif
+  auto testGen = [&](const string& configFile,
+                     bool hasSubseq,
+                     const string& expRetFile,
+                     bool beam_search) {
+    FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0";
+    for (auto useGpu : useGpuConfs) {
+      LOG(INFO) << configFile << " useGpu=" << useGpu
+                << " beam_search=" << beam_search;
+      testGeneration(configFile, useGpu, hasSubseq, expRetFile);
+    }
+  };
+  testGen(CONFIG_FILE, false, expectFile + ".nobeam", false);  // no beam search
+  testGen(CONFIG_FILE, false, expectFile + ".beam", true);     // beam search
+  // In hierarchical RNN, beam search and one way search are only in inner-RNN,
+  // outer-RNN will concat the generated inner-results (first for beam search)
+  // from inner-RNN. Thus, they have the same outer-results.
+  testGen(NEST_CONFIG_FILE,
+          true,
+          expectFile + ".nest",
+          false);  // no beam search
+  testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true);  // beam search
+}
+#endif
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  initPython(argc, argv);
+  CHECK(argc == 1 || argc == 3);
+  if (argc == 3) {
+    modelDir = argv[1];
+    expectFile = argv[2];
+  }
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/utils/.gitignore b/paddle/legacy/utils/.gitignore
similarity index 100%
rename from paddle/utils/.gitignore
rename to paddle/legacy/utils/.gitignore
diff --git a/paddle/utils/Any.h b/paddle/legacy/utils/Any.h
similarity index 100%
rename from paddle/utils/Any.h
rename to paddle/legacy/utils/Any.h
diff --git a/paddle/legacy/utils/CMakeLists.txt b/paddle/legacy/utils/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b42b2bae968a10c581c594054f853347eb5d5445
--- /dev/null
+++ b/paddle/legacy/utils/CMakeLists.txt
@@ -0,0 +1,20 @@
+# The utilities for paddle
+file(GLOB UTIL_HEADERS . *.h)
+file(GLOB UTIL_SOURCES . *.cpp)
+create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py
+  ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
+set(UTIL_RES ${CMAKE_CURRENT_BINARY_DIR}/enable_virtualenv.c)
+
+if(APPLE)
+    file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
+else()
+    file(GLOB UTIL_ARCH_SOURCES . arch/linux/*.cpp)
+endif()
+add_library(paddle_utils STATIC
+        ${UTIL_SOURCES}
+        ${UTIL_ARCH_SOURCES}
+        ${UTIL_RES})
+add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
+if(WITH_TESTING)
+    add_subdirectory(tests)
+endif()
diff --git a/paddle/legacy/utils/ClassRegistrar.h b/paddle/legacy/utils/ClassRegistrar.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f40a0b25e92c7adcfe3f8c4be96016be801da3b
--- /dev/null
+++ b/paddle/legacy/utils/ClassRegistrar.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "Util.h"
+
+namespace paddle {
+
+/**
+ * This class is used to keep a set of class types. It can register a
+ * class by a type name and create an instance of a class by type.
+ * Example:
+ *   // Declare the registrar
+ *   ClassRegistrar<Layer, LayerConfig> registar_;
+ *
+ *   // Register a class using its constructor
+ *   registrar_.registerClass<ConvLayer>("conv");
+ *
+ *   // Register a class using a creation function
+ *   registrar_.registerClass("pool", [](LayerConfig& config){
+ *     return PoolLayer::create(config);
+ *   });
+ *
+ *   // create a class instance by type name
+ *   Layer* layer = registrar_.createByType("conv", config);
+ */
+template <class BaseClass, typename... CreateArgs>
+class ClassRegistrar {
+ public:
+  typedef std::function<BaseClass*(CreateArgs...)> ClassCreator;
+
+  // Register a class using a creation function.
+  // The creation function's arguments are CreateArgs
+  void registerClass(const std::string& type, ClassCreator creator) {
+    CHECK(creatorMap_.count(type) == 0) << "Duplicated class type: " << type;
+    creatorMap_[type] = creator;
+  }
+
+  // Register a class using its constructor
+  // The constructor's arguments are CreateArgs
+  template <class ClassType>
+  void registerClass(const std::string& type) {
+    registerClass(type,
+                  [](CreateArgs... args) { return new ClassType(args...); });
+  }
+
+  // Create a class instance of type @type using args
+  BaseClass* createByType(const std::string& type, CreateArgs... args) {
+    ClassCreator creator;
+    CHECK(mapGet(type, creatorMap_, &creator)) << "Unknown class type: "
+                                               << type;
+    return creator(args...);
+  }
+
+  template <typename T>
+  inline void forEachType(T callback) {
+    for (auto it = creatorMap_.begin(); it != creatorMap_.end(); ++it) {
+      callback(it->first);
+    }
+  }
+
+ protected:
+  std::map<std::string, ClassCreator> creatorMap_;
+};
+
+}  // namespace paddle
diff --git a/paddle/utils/Common.h b/paddle/legacy/utils/Common.h
similarity index 100%
rename from paddle/utils/Common.h
rename to paddle/legacy/utils/Common.h
diff --git a/paddle/legacy/utils/CpuId.cpp b/paddle/legacy/utils/CpuId.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..66e7c6606f070aef4fd954b8f4ada994b2f4fb96
--- /dev/null
+++ b/paddle/legacy/utils/CpuId.cpp
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/CpuId.h"
+#include "paddle/legacy/utils/Util.h"
+
+#ifdef _WIN32
+
+#include <intrin.h>
+
+/// for MSVC
+#define CPUID(info, x) __cpuidex(info, x, 0)
+
+#else
+
+#if !defined(__arm__) && !defined(__aarch64__)
+#include <cpuid.h>
+/// for GCC/Clang
+#define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3])
+#endif
+
+#endif
+
+namespace paddle {
+
+SIMDFlags::SIMDFlags() {
+#if defined(__arm__) || defined(__aarch64__)
+  simd_flags_ = SIMD_NEON;
+#else
+  unsigned int cpuInfo[4];
+  // CPUID: https://en.wikipedia.org/wiki/CPUID
+  // clang-format off
+  CPUID(cpuInfo, 0x00000001);
+  simd_flags_ |= cpuInfo[3] & (1 << 25) ? SIMD_SSE   : SIMD_NONE;
+  simd_flags_ |= cpuInfo[3] & (1 << 26) ? SIMD_SSE2  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 <<  0) ? SIMD_SSE3  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 <<  9) ? SIMD_SSSE3 : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 19) ? SIMD_SSE41 : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 20) ? SIMD_SSE42 : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 12) ? SIMD_FMA3  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 28) ? SIMD_AVX   : SIMD_NONE;
+
+  CPUID(cpuInfo, 0x00000007);
+  simd_flags_ |= cpuInfo[1] & (1 <<  5) ? SIMD_AVX2  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[1] & (1 << 16) ? SIMD_AVX512: SIMD_NONE;
+
+  CPUID(cpuInfo, 0x80000001);
+  simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4  : SIMD_NONE;
+  // clang-fotmat on
+#endif
+}
+
+SIMDFlags const* SIMDFlags::instance() {
+  static SIMDFlags instance;
+  return &instance;
+}
+
+}  // namespace paddle
diff --git a/paddle/legacy/utils/CpuId.h b/paddle/legacy/utils/CpuId.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed58211d13ac1e0f80d6728950f0b88dc0ae625f
--- /dev/null
+++ b/paddle/legacy/utils/CpuId.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Common.h"
+#include "Error.h"
+
+namespace paddle {
+
+// clang-format off
+enum simd_t {
+  SIMD_NONE   = 0,          ///< None
+  SIMD_SSE    = 1 << 0,     ///< SSE
+  SIMD_SSE2   = 1 << 1,     ///< SSE 2
+  SIMD_SSE3   = 1 << 2,     ///< SSE 3
+  SIMD_SSSE3  = 1 << 3,     ///< SSSE 3
+  SIMD_SSE41  = 1 << 4,     ///< SSE 4.1
+  SIMD_SSE42  = 1 << 5,     ///< SSE 4.2
+  SIMD_FMA3   = 1 << 6,     ///< FMA 3
+  SIMD_FMA4   = 1 << 7,     ///< FMA 4
+  SIMD_AVX    = 1 << 8,     ///< AVX
+  SIMD_AVX2   = 1 << 9,     ///< AVX 2
+  SIMD_AVX512 = 1 << 10,    ///< AVX 512
+  SIMD_NEON   = 1 << 11,    ///  NEON
+};
+// clang-format on
+
+class SIMDFlags final {
+ public:
+  DISABLE_COPY(SIMDFlags);
+
+  SIMDFlags();
+
+  static SIMDFlags const* instance();
+
+  inline bool check(int flags) const {
+    return !((simd_flags_ & flags) ^ flags);
+  }
+
+ private:
+  int simd_flags_ = SIMD_NONE;
+};
+
+/**
+ * @brief   Check SIMD flags at runtime.
+ *
+ * For example.
+ * @code{.cpp}
+ *
+ * if (HAS_SIMD(SIMD_AVX2 | SIMD_FMA4)) {
+ *      avx2_fm4_stub();
+ * } else if (HAS_SIMD(SIMD_AVX)) {
+ *      avx_stub();
+ * }
+ *
+ * @endcode
+ */
+#define HAS_SIMD(__flags) SIMDFlags::instance()->check(__flags)
+
+/**
+ * @brief   Check SIMD flags at runtime.
+ *
+ * 1. Check all SIMD flags at runtime:
+ *
+ * @code{.cpp}
+ * if (HAS_AVX && HAS_AVX2) {
+ *      avx2_stub();
+ * }
+ * @endcod
+ *
+ * 2. Check one SIMD flag at runtime:
+ *
+ * @code{.cpp}
+ * if (HAS_SSE41 || HAS_SSE42) {
+ *      sse4_stub();
+ * }
+ * @endcode
+ */
+// clang-format off
+#define HAS_SSE     HAS_SIMD(SIMD_SSE)
+#define HAS_SSE2    HAS_SIMD(SIMD_SSE2)
+#define HAS_SSE3    HAS_SIMD(SIMD_SSE3)
+#define HAS_SSSE3   HAS_SIMD(SIMD_SSSE3)
+#define HAS_SSE41   HAS_SIMD(SIMD_SSE41)
+#define HAS_SSE42   HAS_SIMD(SIMD_SSE42)
+#define HAS_FMA3    HAS_SIMD(SIMD_FMA3)
+#define HAS_FMA4    HAS_SIMD(SIMD_FMA4)
+#define HAS_AVX     HAS_SIMD(SIMD_AVX)
+#define HAS_AVX2    HAS_SIMD(SIMD_AVX2)
+#define HAS_AVX512  HAS_SIMD(SIMD_AVX512)
+#define HAS_NEON    HAS_SIMD(SIMD_NEON)
+// clang-format on
+
+/**
+ * Invoke checkCPUFeature() before Paddle initialization to
+ * check target machine whether support compiled instructions.
+ * If not, simply throw out an error.
+ */
+inline Error __must_check checkCPUFeature() {
+  Error err;
+#ifndef __AVX__
+  if (HAS_AVX) {
+    LOG(WARNING) << "PaddlePaddle wasn't compiled to use avx instructions, "
+                 << "but these are available on your machine and could "
+                 << "speed up CPU computations via CMAKE .. -DWITH_AVX=ON";
+  }
+#else
+  if (!HAS_AVX) {
+    err = Error(
+        "PaddlePaddle was compiled to use avx instructions, "
+        "but these aren't available on your machine, please "
+        "disable it via CMAKE .. -DWITH_AVX=OFF");
+  }
+#endif  // __AVX__
+#ifdef __SSE3__
+  if (!HAS_SSE3) {
+    err = Error(
+        "PaddlePaddle was compiled to use sse3 instructions, "
+        "which is the minimum requirement of PaddlePaddle. "
+        "But these aren't available on your current machine.");
+  }
+#endif  // __SSE3__
+
+  return err;
+}
+
+}  // namespace paddle
diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/legacy/utils/CustomStackTrace.cpp
similarity index 100%
rename from paddle/utils/CustomStackTrace.cpp
rename to paddle/legacy/utils/CustomStackTrace.cpp
diff --git a/paddle/legacy/utils/CustomStackTrace.h b/paddle/legacy/utils/CustomStackTrace.h
new file mode 100644
index 0000000000000000000000000000000000000000..b60077ea2d946366910780eeb773635972211e04
--- /dev/null
+++ b/paddle/legacy/utils/CustomStackTrace.h
@@ -0,0 +1,193 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <stack>
+#include <thread>
+#include <unordered_map>
+
+#include "ThreadLocal.h"
+
+namespace paddle {
+
+/**
+ * A ThreadLocal stack for tracing train/test process.
+ * (More details of ThreadLocal can be find
+ * in the comments of ThreadLocal class.)
+ *
+ * For example.
+ * @code{.cpp}
+ *
+ * paddle::CustomStackTrace<std::string> stack;
+ * for (auto& layer : layers){
+ *   stack.push(layer->getName());
+ *   layer->forward();
+ * }
+ *
+ * stack.pop("");  // mark under pop stage.
+ *
+ * for (auto it = layers.rbegin(); it != layers.rend(); ++it){
+ *   auto& layer = *it;
+ *   layer->backward(passType);
+ *   stack.pop(layer->getName());
+ * }
+ *
+ * @endcode
+ */
+template <typename T>
+class CustomStackTrace {
+ public:
+  /**
+   * @brief Pop out an item from the top of the stack if item == top.
+   *        Else, just set status to popping.
+   */
+  void pop(const T& item) {
+    auto& s = this->stack();
+    if (item == s.top()) {
+      s.pop();
+    }
+  }
+
+  /**
+   * @brief Indicate whether we are at forward or backward stage of computation
+   */
+  void set_stage(bool isForward) { pushing() = isForward; }
+
+  /**
+   * @brief clear current thread stack.
+   */
+  void clear() {
+    auto& s = stack();
+    while (!s.empty()) {
+      s.pop();
+    }
+  }
+
+  /**
+   * @brief return true if all thread's stack is empty.
+   * @return true if empty
+   */
+  bool empty() const {
+    std::lock_guard<std::mutex> g(this->mtx_);
+    for (auto p : this->stackBuffers_) {
+      std::stack<T>& s = *p.second;
+      if (!s.empty()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  /**
+   * @brief DumpCallback Type. It will be invoked many times by dump method.
+   *
+   * The first parameter is stack thread id.
+   * The second parameter is the last action of stack is push or not.
+   * The third parameter is the item in stack.
+   */
+  typedef std::function<void(const std::thread::id& /*threadId*/,
+                             bool* /*isPushing*/,
+                             const T& /*item*/)>
+      DumpCallback;
+
+  /**
+   * Dump all thread stack, and all stack will be cleared.
+   */
+  void dump(const DumpCallback& callback, bool onlyCurrentThread = false) {
+    std::lock_guard<std::mutex> g(this->mtx_);
+    for (auto p : this->stackBuffers_) {
+      std::thread::id tid = p.first;
+      if (onlyCurrentThread && tid != std::this_thread::get_id()) {
+        continue;
+      }
+      std::stack<T>& s = *p.second;
+      bool* isPush = nullptr;
+      auto it = this->pushingBuffers_.find(tid);
+      if (it != this->pushingBuffers_.end()) {
+        isPush = it->second;
+      }
+
+      while (!s.empty()) {
+        callback(tid, isPush, s.top());
+        s.pop();
+      }
+    }
+  }
+
+  /**
+   * @brief Push item to current thread stack.
+   */
+  void push(const T& item) {
+    pushing() = true;
+    auto& p = this->stack();
+    p.push(item);
+  }
+
+ private:
+  /**
+   * Get thread local attribute, and save them into a map (threadId => TYPE*)
+   *
+   * @tparam TYPE thread local attribute type.
+   * @param threadLocal Thread Local object.
+   * @param buffers a map from threadId to TYPE*
+   */
+  template <typename TYPE>
+  inline TYPE& getThreadLocal(
+      ThreadLocal<TYPE>& threadLocal,
+      std::unordered_map<std::thread::id, TYPE*>& buffers) {
+    TYPE* retv = threadLocal.get(false);
+    if (retv) {
+      return *retv;
+    } else {
+      std::lock_guard<std::mutex> guard(this->mtx_);
+      retv = threadLocal.get();
+      auto id = std::this_thread::get_id();
+      buffers.insert({id, retv});
+      return *retv;
+    }
+  }
+
+  /**
+   * @brief Get thread local stack reference.
+   */
+  std::stack<T>& stack() {
+    return this->getThreadLocal(this->logStack_, this->stackBuffers_);
+  }
+
+  /**
+   * @brief Get thread local pushing flag.
+   */
+  bool& pushing() {
+    return this->getThreadLocal(this->isPushing_, this->pushingBuffers_);
+  }
+
+ private:
+  mutable std::mutex mtx_;
+
+  std::unordered_map<std::thread::id, std::stack<T>*> stackBuffers_;
+  std::unordered_map<std::thread::id, bool*> pushingBuffers_;
+  ThreadLocal<bool> isPushing_;
+  ThreadLocal<std::stack<T>> logStack_;
+};
+
+extern CustomStackTrace<std::string> gLayerStackTrace;
+
+/**
+ * @brief Install a failure handler to print layer stack when error.
+ */
+extern void installLayerStackTracer();
+
+}  // namespace paddle
diff --git a/paddle/legacy/utils/DynamicLoader.cpp b/paddle/legacy/utils/DynamicLoader.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9ac4a56c6e300d299467630b39a32567af72cf40
--- /dev/null
+++ b/paddle/legacy/utils/DynamicLoader.cpp
@@ -0,0 +1,170 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DynamicLoader.h"
+#include <gflags/gflags.h>
+#include "Logging.h"
+
+DEFINE_string(cudnn_dir,
+              "",
+              "Specify path for loading libcudnn.so. For instance, "
+              "/usr/local/cudnn/lib. If empty [default], dlopen "
+              "will search cudnn from LD_LIBRARY_PATH");
+
+DEFINE_string(cuda_dir,
+              "",
+              "Specify path for loading cuda library, such as libcublas, "
+              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+              "dlopen will search cuda from LD_LIBRARY_PATH");
+
+DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+
+DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
+
+DEFINE_string(tensorrt_dir, "", "Specify path for loading libnvinfer.so.");
+
+static inline std::string join(const std::string& part1,
+                               const std::string& part2) {
+  // directory separator
+  const char sep = '/';
+  if (!part2.empty() && part2.front() == sep) {
+    return part2;
+  }
+  std::string ret;
+  ret.reserve(part1.size() + part2.size() + 1);
+  ret = part1;
+  if (!ret.empty() && ret.back() != sep) {
+    ret += sep;
+  }
+  ret += part2;
+  return ret;
+}
+
+static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
+                                               void** dso_handle,
+                                               int dynload_flags) {
+  VLOG(3) << "Try to find library: " << dso_path
+          << " from default system path.";
+  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+
+// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
+// bring System Integrity Projection (SIP), if dso_handle
+// is null, search from default package path in Mac OS.
+#if defined(__APPLE__) || defined(__OSX__)
+  if (nullptr == *dso_handle) {
+    dso_path = join("/usr/local/cuda/lib/", dso_path);
+    *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+    if (nullptr == *dso_handle) {
+      if (dso_path == "libcudnn.dylib") {
+        LOG(FATAL)
+            << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"  // NOLINT
+            << "For instance, sudo tar -xzf "
+               "cudnn-7.5-osx-x64-v5.0-ga.tgz -C "  // NOLINT
+            << "/usr/local \n sudo chmod a+r "
+               "/usr/local/cuda/include/cudnn.h "  // NOLINT
+            << "/usr/local/cuda/lib/libcudnn*";
+      }
+    }
+  }
+#endif
+}
+
+static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
+                                              const std::string& dso_name,
+                                              void** dso_handle) {
+  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+  *dso_handle = nullptr;
+
+  std::string dlPath = dso_name;
+  if (search_root.empty()) {
+    GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+  } else {
+    // search xxx.so from custom path
+    dlPath = join(search_root, dso_name);
+    *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+    // if not found, search from default path
+    if (nullptr == *dso_handle) {
+      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
+                   << dlerror() << ")";
+      dlPath = dso_name;
+      GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+    }
+  }
+
+  CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
+                                << " (" << dlerror() << ") \n"
+                                << "Please specify its path correctly using "
+                                   "following ways: \n"
+
+                                << "Method. set environment variable "
+                                   "LD_LIBRARY_PATH on Linux or "
+                                << "DYLD_LIBRARY_PATH on Mac OS. \n"
+                                << "For instance, issue command: export "
+                                   "LD_LIBRARY_PATH=... \n"
+
+                                << "Note: After Mac OS 10.11, using the "
+                                   "DYLD_LIBRARY_PATH is impossible "
+                                << "unless System Integrity Protection (SIP) "
+                                   "is disabled.";
+}
+
+void GetCublasDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+#endif
+}
+
+void GetCudnnDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
+#endif
+}
+
+void GetCurandDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+#endif
+}
+
+void GetWarpCTCDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
+#endif
+}
+
+void GetLapackDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
+#endif
+}
+
+void GetTensorRtDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(
+      FLAGS_tensorrt_dir, "libnvinfer.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so", dso_handle);
+#endif
+}
diff --git a/paddle/legacy/utils/DynamicLoader.h b/paddle/legacy/utils/DynamicLoader.h
new file mode 100644
index 0000000000000000000000000000000000000000..02f519de4b3988fb6aca323aaa1751ee2c4bd738
--- /dev/null
+++ b/paddle/legacy/utils/DynamicLoader.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <dlfcn.h>
+#include <memory>
+#include <mutex>
+#include <string>
+
+/**
+ * @brief    load the DSO of CUBLAS
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetCublasDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of CUDNN
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetCudnnDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of CURAND
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetCurandDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of warp-ctc
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetWarpCTCDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of lapack
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetLapackDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of tensorrt
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetTensorRtDsoHandle(void** dso_handle);
diff --git a/paddle/legacy/utils/Error.h b/paddle/legacy/utils/Error.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fc8482e3a1bef869d4df147bbd3cab6e62ccf49
--- /dev/null
+++ b/paddle/legacy/utils/Error.h
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <memory>
+#include <string>
+
+/**
+ * __must_check macro. It make the function's return value must be used,
+ * otherwise it will raise a compile warning. And also Paddle treat all compile
+ * warnings as errors.
+ */
+#ifdef __GNUC__
+#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400
+#define __must_check __attribute__((warn_unused_result))
+#else
+#define __must_check
+#endif
+#else
+#define __must_check
+#endif
+
+namespace paddle {
+
+/**
+ * Error is Paddle error code. It only contain a std::string as error message.
+ *
+ *
+ * There are two styles to return error in Paddle.
+ *
+ * 1. Return Error
+ *    When method return a status, the return must use `__must_check` attribute.
+ *    Example as below.
+ * @code{cpp}
+ * Error __must_check foo();
+ *
+ * Error __must_check bar() {
+ *   // do something.
+ *   Error err = foo();  // invoke other method return status.
+ *   if (err) return err;
+ *   // do something else.
+ *   return Error();
+ * }
+ * @endcode{cpp}
+ *
+ * 2. Return by parameter.
+ *    It is another way to return an error, by using a pointer parameter.
+ *    Example as below.
+ *
+ * @code{cpp}
+ * Error bar();
+ *
+ * int foo(Error* error) {
+ *   // Do something.
+ *   Error err = bar();
+ *   if (err) {
+ *     *error = s;
+ *     return 0;
+ *   }
+ *   // Do something else.
+ *   if (someInternalErrorHappend) {
+ *     *error = Error("Some dimension is too large, %d", dimension);
+ *     return 0;
+ *   }
+ *   // End of method.
+ *   return someValue;
+ * }
+ *
+ * Error foobar() {
+ *   Error err;
+ *   // do something.
+ *   foo(&err);
+ *   if (err) return err;
+ * }
+ * @endcode{cpp}
+ *
+ *
+ * Currently there is a helper method 'check' in status, because Paddle always
+ * use log(FATAL) or CHECK to make program exit before. When we clean all
+ * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
+ */
+class Error {
+ public:
+  /**
+   * Construct a no-error value.
+   */
+  Error() {}
+
+  /**
+   * @brief Create an Error use printf syntax.
+   */
+  explicit Error(const char* fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    constexpr size_t kBufferSize = 1024;
+    char buffer[kBufferSize];
+    vsnprintf(buffer, kBufferSize, fmt, ap);
+    this->msg_.reset(new std::string(buffer));
+    va_end(ap);
+  }
+
+  /**
+   * @brief msg will return the error message. If no error, return nullptr.
+   */
+  const char* msg() const {
+    if (msg_) {
+      return msg_->c_str();
+    } else {
+      return nullptr;
+    }
+  }
+
+  /**
+   * @brief check this status by glog.
+   * @note It is a temp method used during cleaning Paddle code. It will be
+   *       removed later.
+   */
+  void check() const { CHECK(this->isOK()) << msg(); }
+
+  /**
+   * @brief isOK return True if there is no error.
+   * @return True if no error.
+   */
+  bool isOK() const { return msg_ == nullptr; }
+
+ private:
+  std::shared_ptr<std::string> msg_;
+};
+
+}  // namespace paddle
diff --git a/paddle/utils/Excepts.h b/paddle/legacy/utils/Excepts.h
similarity index 100%
rename from paddle/utils/Excepts.h
rename to paddle/legacy/utils/Excepts.h
diff --git a/paddle/utils/Flags.cpp b/paddle/legacy/utils/Flags.cpp
similarity index 100%
rename from paddle/utils/Flags.cpp
rename to paddle/legacy/utils/Flags.cpp
diff --git a/paddle/utils/Flags.h b/paddle/legacy/utils/Flags.h
similarity index 100%
rename from paddle/utils/Flags.h
rename to paddle/legacy/utils/Flags.h
diff --git a/paddle/utils/GlobalConstants.cpp b/paddle/legacy/utils/GlobalConstants.cpp
similarity index 100%
rename from paddle/utils/GlobalConstants.cpp
rename to paddle/legacy/utils/GlobalConstants.cpp
diff --git a/paddle/legacy/utils/GlobalConstants.h b/paddle/legacy/utils/GlobalConstants.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f45e82268435e4c22d1879e909b0c90838d6693
--- /dev/null
+++ b/paddle/legacy/utils/GlobalConstants.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+
+namespace paddle {
+
+namespace enumeration_wrapper {
+enum PassType {
+  PASS_TRAIN,   // Train pass
+  PASS_TEST,    // Test pass
+  PASS_GC,      // Gradient Check pass
+  PASS_METRIC,  // pass for generate template output with no drop rate.
+};
+
+enum ParameterType {
+  PARAMETER_VALUE = 0,
+  PARAMETER_GRADIENT,
+  PARAMETER_MOMENTUM,
+
+  // Used by ParameterAverager
+  PARAMETER_SUM1,
+  PARAMETER_SUM2,
+  PARAMETER_SUM3,
+
+  //   also used by AdagradParameterUpdater/AdadeltaParameterUpdater
+  PARAMETER_LEARNING_RATE,
+
+  // Used by Sparse SGD update
+  PARAMETER_UPDATE_TIME,
+
+  // Used by async_sgd
+  // Change of the parameter since last remote update
+  PARAMETER_DELTA,
+
+  // Used by BatchRemoteParameterUpdater
+  PARAMETER_GRADIENT_SUM,
+
+  // Used by AdagradParameterUpdater/AdadeltaParameterUpdater
+  PARAMETER_GRADIENT_SQURESUM,
+  PARAMETER_GRADIENT_SQURESUM1,
+
+  // Used by SparseConnected layer
+  PARAMETER_ROWS,
+  PARAMETER_COLS,
+
+  // Used by Adam Optimizer.
+  PARAMETER_SECOND_MOMENTUM,
+
+  // Used By AdaMax Optimizer.
+  PARAMETER_WEIGHTED_INFINITY_NORM,
+
+  // Used by remote parameter average
+  PARAMETER_APPLY,
+
+  // Used by sparse momentum
+  PARAMETER_MOMENTUM_UT,
+  PARAMETER_MOMENTUM_VT,
+
+  NUM_PARAMETER_TYPES,
+};
+
+}  // namespace enumeration_wrapper
+
+//! explicit import enum into paddle namespace.
+using namespace enumeration_wrapper;  // NOLINT
+
+class TrainAlgorithm {
+ public:
+  static const std::string SGD;
+  static const std::string AsyncSGD;
+  static const std::string OWLQN;
+
+  static inline bool isValid(const std::string& algo) {
+    return algo == SGD || algo == AsyncSGD || algo == OWLQN;
+  }
+};
+
+#ifdef __AVX__
+const int ALIGN_HINT = 32;
+#else
+const int ALIGN_HINT = 16;
+#endif
+
+}  // namespace paddle
diff --git a/paddle/legacy/utils/Locks.h b/paddle/legacy/utils/Locks.h
new file mode 100644
index 0000000000000000000000000000000000000000..65f983685f5e178345a6a875a79a6573ce1ccca1
--- /dev/null
+++ b/paddle/legacy/utils/Locks.h
@@ -0,0 +1,242 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <pthread.h>
+#include <sys/time.h>
+#include <condition_variable>
+#include <mutex>
+
+#include "Common.h"
+
+namespace paddle {
+
+/**
+ * A simple read-write lock.
+ * The RWlock allows a number of readers or at most one writer
+ * at any point in time.
+ * The RWlock disable copy.
+ *
+ * Lock:
+ *
+ * Use lock() to lock on write mode, no other thread can get it
+ * until unlock.
+ *
+ * Use lock_shared() to lock on read mode, other thread can get
+ * it by using the same method lock_shared().
+ *
+ * Unlock:
+ *
+ * Use unlock() to unlock the lock.
+ */
+class RWLock {
+ public:
+  RWLock() { pthread_rwlock_init(&rwlock_, NULL); }
+  ~RWLock() { pthread_rwlock_destroy(&rwlock_); }
+  RWLock(const RWLock&) = delete;
+  RWLock& operator=(const RWLock&) = delete;
+
+  /**
+   * @brief lock on write mode.
+   * @note the method will block the thread, if failed to get the lock.
+   */
+  // std::mutex interface
+  void lock() { pthread_rwlock_wrlock(&rwlock_); }
+  /**
+   * @brief lock on read mode.
+   * @note if another thread is writing, it can't get the lock,
+   * and will block the thread.
+   */
+  void lock_shared() { pthread_rwlock_rdlock(&rwlock_); }
+  void unlock() { pthread_rwlock_unlock(&rwlock_); }
+
+ protected:
+  pthread_rwlock_t rwlock_;
+};
+
+/**
+ * The ReadLockGuard is a read mode RWLock
+ * using RAII management mechanism.
+ */
+class ReadLockGuard {
+ public:
+  /**
+   * @brief Construct Function. Lock on rwlock in read mode.
+   */
+  explicit ReadLockGuard(RWLock& rwlock) : rwlock_(&rwlock) {
+    rwlock_->lock_shared();
+  }
+
+  /**
+   * @brief Destruct Function.
+   * @note This method just unlock the read mode rwlock,
+   * won't destroy the lock.
+   */
+  ~ReadLockGuard() { rwlock_->unlock(); }
+
+ protected:
+  RWLock* rwlock_;
+};
+
+/**
+ * A simple wrapper for spin lock.
+ * The lock() method of SpinLock is busy-waiting
+ * which means it will keep trying to lock until lock on successfully.
+ * The SpinLock disable copy.
+ */
+class SpinLockPrivate;
+class SpinLock {
+ public:
+  DISABLE_COPY(SpinLock);
+  SpinLock();
+  ~SpinLock();
+
+  // std::mutext interface
+  void lock();
+  void unlock();
+
+ private:
+  SpinLockPrivate* m;
+};
+
+/**
+ * A simple wapper of semaphore which can only be shared in the same process.
+ */
+class SemaphorePrivate;
+class Semaphore {
+ public:
+  //! Disable copy & assign
+  Semaphore(const Semaphore& other) = delete;
+  Semaphore& operator=(const Semaphore&& other) = delete;
+
+  //! Enable move.
+  Semaphore(Semaphore&& other) : m(std::move(other.m)) {}
+
+ public:
+  /**
+   * @brief Construct Function.
+   * @param[in] initValue the initial value of the
+   * semaphore, default 0.
+   */
+  explicit Semaphore(int initValue = 0);
+
+  ~Semaphore();
+
+  /**
+   * @brief The same as wait(), except if the decrement can not
+   * be performed until ts return false install of blocking.
+   * @param[in] ts an absolute timeout in seconds and nanoseconds
+   * since the Epoch 1970-01-01 00:00:00 +0000(UTC).
+   * @return ture if the decrement proceeds before ts,
+   * else return false.
+   */
+  bool timeWait(struct timespec* ts);
+
+  /**
+   * @brief decrement the semaphore. If the semaphore's value is 0, then call
+   * blocks.
+   */
+  void wait();
+
+  /**
+   * @brief increment the semaphore. If the semaphore's value
+   * greater than 0, wake up a thread blocked in wait().
+   */
+  void post();
+
+ private:
+  SemaphorePrivate* m;
+};
+
+/**
+ * A simple wrapper of thread barrier.
+ * The ThreadBarrier disable copy.
+ */
+class ThreadBarrierPrivate;
+class ThreadBarrier {
+ public:
+  DISABLE_COPY(ThreadBarrier);
+
+  /**
+   * @brief Construct Function. Initialize the barrier should
+   * wait for count threads in wait().
+   */
+  explicit ThreadBarrier(int count);
+  ~ThreadBarrier();
+
+  /**
+   * @brief .
+   * If there were count - 1 threads waiting before,
+   * then wake up all the count - 1 threads and continue run together.
+   * Else block the thread until waked by other thread .
+   */
+  void wait();
+
+ private:
+  ThreadBarrierPrivate* m;
+};
+
+/**
+ * A wrapper for condition variable with mutex.
+ */
+class LockedCondition : public std::condition_variable {
+ public:
+  /**
+   * @brief execute op and notify one thread which was blocked.
+   * @param[in] op a thread can do something in op before notify.
+   */
+  template <class Op>
+  void notify_one(Op op) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    op();
+    std::condition_variable::notify_one();
+  }
+
+  /**
+   * @brief execute op and notify all the threads which were blocked.
+   * @param[in] op a thread can do something in op before notify.
+   */
+  template <class Op>
+  void notify_all(Op op) {
+    std::lock_guard<std::mutex> guard(mutex_);
+    op();
+    std::condition_variable::notify_all();
+  }
+
+  /**
+   * @brief wait until pred return ture.
+   * @tparam Predicate c++ concepts, describes a function object
+   * that takes a single iterator argument
+   * that is dereferenced and used to
+   * return a value testable as a bool.
+   * @note pred shall not apply any non-constant function
+   * through the dereferenced iterator.
+   */
+  template <class Predicate>
+  void wait(Predicate pred) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    std::condition_variable::wait(lock, pred);
+  }
+
+  /**
+   * @brief get mutex.
+   */
+  std::mutex* mutex() { return &mutex_; }
+
+ protected:
+  std::mutex mutex_;
+};
+
+}  // namespace paddle
diff --git a/paddle/utils/Logging.cpp b/paddle/legacy/utils/Logging.cpp
similarity index 100%
rename from paddle/utils/Logging.cpp
rename to paddle/legacy/utils/Logging.cpp
diff --git a/paddle/utils/Logging.h b/paddle/legacy/utils/Logging.h
similarity index 100%
rename from paddle/utils/Logging.h
rename to paddle/legacy/utils/Logging.h
diff --git a/paddle/utils/PythonUtil.cpp b/paddle/legacy/utils/PythonUtil.cpp
similarity index 100%
rename from paddle/utils/PythonUtil.cpp
rename to paddle/legacy/utils/PythonUtil.cpp
diff --git a/paddle/legacy/utils/PythonUtil.h b/paddle/legacy/utils/PythonUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0c8612c378fbe12cdf24e51a5b6546740b2d4c8
--- /dev/null
+++ b/paddle/legacy/utils/PythonUtil.h
@@ -0,0 +1,353 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+// clang-format off
+#include "paddle/legacy/utils/Util.h"
+
+#ifndef PADDLE_NO_PYTHON
+// must include the following two blocks, otherwise,
+// gcc compiler may produce warning
+#ifdef __APPLE__
+#define _POSIX_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#define _XOPEN_SOURCE 700
+#endif
+
+#ifdef _POSIX_C_SOURCE
+#define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+#ifdef _XOPEN_SOURCE
+#define __TEMP_XOPEN_SOURCE _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif
+#include <Python.h>
+#include <frameobject.h>
+#endif
+
+#include <stdarg.h>
+#include <map>
+#include <mutex>
+// clang-format on
+
+namespace paddle {
+
+std::string callPythonFunc(const std::string& moduleName,
+                           const std::string& funcName,
+                           const std::vector<std::string>& args);
+
+#ifndef PADDLE_NO_PYTHON
+
+/**
+ * Global lock guard of python C-api invokes.
+ * NOTE: the lock of this guard is reentrant or recursive.
+ */
+class PyGuard {
+ public:
+  PyGuard();
+  PyGuard(const PyGuard& other) = delete;
+  PyGuard& operator=(const PyGuard& other) = delete;
+
+ private:
+  std::lock_guard<std::recursive_mutex> guard_;
+};
+
+struct PyObjectDeleter {
+  void operator()(PyObject* obj) {
+    if (obj) {
+      Py_DECREF(obj);
+    }
+  }
+};
+
+typedef std::unique_ptr<PyObject, PyObjectDeleter> PyObjectPtr;
+
+PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName,
+                                   const std::string& funcName,
+                                   const std::vector<std::string>& args);
+
+PyObjectPtr createPythonClass(const std::string& moduleName,
+                              const std::string& className,
+                              const std::vector<std::string>& args,
+                              const std::map<std::string, std::string>& kwargs);
+
+#define CHECK_PY(x) CHECK((x) != nullptr) << ::paddle::py::getPyCallStack()
+
+namespace py {
+PyObjectPtr import(const std::string& moduleName);
+
+/**
+ * Cast a PyLong or PyInt to int type T.
+ * @tparam T return type.
+ * @param [in] obj PyLong or PyInt object.
+ * @param [out] ok status for casting. False if error occured. nullptr if user
+ *                 don't care is ok or not.
+ * @return The value of python object, or 0 if not ok.
+ */
+template <typename T>
+T castInt(PyObject* obj, bool* ok = nullptr) {
+  if (PyLong_Check(obj)) {
+    if (ok) *ok = true;
+    return (T)PyLong_AsUnsignedLong(obj);
+  } else if (PyInt_Check(obj)) {
+    if (ok) *ok = true;
+    return (T)PyInt_AsLong(obj);
+  } else {
+    if (ok) *ok = false;
+    return (T)0;
+  }
+}
+
+/**
+ * Invoke repr of python object.
+ *
+ * Just like toString method in java.
+ */
+char* repr(PyObject* obj);
+
+/**
+ * Invoke repr of python object.
+ */
+inline char* repr(const PyObjectPtr& obj) { return repr(obj.get()); }
+
+/**
+ * Get Python Error Stack String.
+ */
+std::string getPyCallStack();
+
+/**
+ * Object Helper for PyObjectPtr.
+ *
+ * Implements getAttr method for object.
+ */
+class ObjectHelper {
+ public:
+  explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {}
+
+  /**
+   * get attribute
+   */
+  inline PyObject* getAttr(const std::string& field) const {
+    auto obj = PyObject_GetAttrString(obj_.get(), field.c_str());
+    CHECK_PY(obj) << "Cannot get attribute on python object " << obj_.get();
+    return obj;
+  }
+
+  /**
+   * Get Int attribute
+   * @param [in] field  attribute name.
+   * @param [out] ok true if this attribute is int.
+   * @tparam T int type.
+   * @return int value.
+   */
+  template <typename T>
+  T getIntAttr(const std::string& field, bool* ok = nullptr) const {
+    PyObjectPtr tmp(getAttr(field));
+    return castInt<T>(tmp.get(), ok);
+  }
+
+  /**
+   * Get int attribute. Log(Fatal) when not ok
+   * @param field attribute name.
+   * @return int value.
+   */
+  template <typename T>
+  T getIntAttrWithError(const std::string& field) const {
+    bool ok;
+    T tmp = getIntAttr<T>(field, &ok);
+    CHECK(ok) << "Cannot get integer attribute on object " << obj_.get();
+    return tmp;
+  }
+
+  /**
+   * Get bool attribute.
+   * @param field
+   * @param [out] isBoolType return true if attribute is bool type. If the
+   *                         attribute is not bool type, then an implicit
+   *                         conversion will happens, and will return the
+   *                         conversion result.
+   *
+   *                         Such as, if the attribute is 1, then the return
+   *                         value of function will be true, but the isBoolType
+   *                         will return false.
+   * @return
+   */
+  bool getBoolAttr(const std::string& field, bool* isBoolType = nullptr) const {
+    PyObjectPtr tmp(getAttr(field));
+    if (isBoolType) {
+      *isBoolType = PyBool_Check(tmp.get());
+    }
+    return PyObject_IsTrue(tmp.get());
+  }
+
+ private:
+  const PyObjectPtr& obj_;
+};
+
+/**
+ * Python Sequence Helper
+ *
+ * The python sequence means list or tuple.
+ */
+class SequenceHelper {
+ public:
+  explicit SequenceHelper(const PyObjectPtr& seq) : seq_(seq.get()) {
+    CHECK(PySequence_Check(seq_));
+  }
+
+  explicit SequenceHelper(PyObject* seq) : seq_(seq) {
+    CHECK(PySequence_Check(seq_));
+  }
+
+  inline size_t size() const { return (size_t)PySequence_Size(seq_); }
+
+  inline PyObject* operator[](size_t i) const {
+    return PySequence_Fast_GET_ITEM(seq_, i);
+  }
+
+  inline double getDouble(size_t i) const {
+    auto* ptr = (*this)[i];
+    return PyFloat_AsDouble(ptr);
+  }
+
+  /**
+   * Set a sequence item o[i] = obj;
+   * @param i index
+   * @param obj setted item.
+   * @param steal if steal = true, sequence will move object in iteself,
+   *              just like std::move. Otherwise, it will increase reference
+   *              count. Default is false.
+   */
+  inline void set(size_t i, const PyObjectPtr& obj, bool steal = false) {
+    this->set(i, obj.get(), steal);
+  }
+
+  /**
+   * Set a sequence item o[i] = obj;
+   */
+  inline void set(size_t i, PyObject* obj, bool steal = false) {
+    if (!steal) {
+      Py_XINCREF(obj);
+    }
+    if (PyTuple_Check(seq_)) {
+      CHECK_NE(PyTuple_SetItem(seq_, i, obj), -1) << getPyCallStack();
+    } else {
+      CHECK_NE(PySequence_SetItem(seq_, i, obj), -1) << getPyCallStack();
+    }
+  }
+
+ private:
+  PyObject* seq_;
+};
+
+class DictHelper {
+ public:
+  explicit DictHelper(PyObject* d) : dict_(d) {}
+
+  explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {}
+
+  void set(const std::string& key, PyObject* item) {
+    PyDict_SetItemString(dict_, key.c_str(), item);
+  }
+
+  void setBool(const std::string& key, bool b) {
+    this->set(key, PyBool_FromLong(b));
+  }
+
+  void setStringList(const std::string& key,
+                     const std::vector<std::string>& items) {
+    auto* list = PyList_New(items.size());
+    for (size_t i = 0; i < items.size(); ++i) {
+      PyList_SetItem(list, i, PyString_FromString(items[i].c_str()));
+    }
+    this->set(key, list);
+  }
+
+ private:
+  inline void checkDict() { CHECK(PyDict_Check(this->dict_)); }
+
+  PyObject* dict_;
+};
+
+inline static bool isCallable(const PyObjectPtr& obj) {
+  return PyCallable_Check(obj.get());
+}
+
+/**
+ * Wrap a callable object.
+ */
+class CallableHelper {
+ public:
+  explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) {
+    CHECK(py::isCallable(obj_));
+  }
+
+  ~CallableHelper() {}
+
+  /**
+   * reset args, and create new tuple.
+   * @param sz args size.
+   */
+  void setArgsSize(size_t sz) { args.reset(PyTuple_New(sz)); }
+
+  /**
+   * Get args sequence. User can set/get by SequenceHelper.
+   */
+  SequenceHelper getArgs() { return SequenceHelper(args); }
+
+  /**
+   * Call python method, return an object.
+   */
+  PyObject* operator()() {
+    PyGuard guard;
+    return PyObject_Call(obj_.get(), args.get(), kwargs.get());
+  }
+
+ private:
+  const PyObjectPtr& obj_;
+  PyObjectPtr args;
+  PyObjectPtr kwargs;
+};
+
+inline static PyObject* iterNext(const PyObjectPtr& context, bool* atEnd) {
+  PyGuard g;
+  PyObject* data = PyIter_Next(context.get());
+  if (data == nullptr) {
+    if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+      PyErr_Clear();
+      *atEnd = true;
+      return nullptr;
+    } else if (PyErr_Occurred()) {
+      CHECK_PY(data) << "Calling iterator next error";
+      return nullptr;
+    } else {
+      *atEnd = false;
+      return data;  // just return none in iterator.
+    }
+  } else {
+    *atEnd = false;
+    return data;
+  }
+}
+}  // namespace py
+
+#endif
+
+/**
+ * Initialize python.
+ */
+void initPython(int argc, char** argv);
+
+}  // namespace paddle
diff --git a/paddle/legacy/utils/Queue.h b/paddle/legacy/utils/Queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..189e1a14f7b2d133408a50418d96431164248f0e
--- /dev/null
+++ b/paddle/legacy/utils/Queue.h
@@ -0,0 +1,255 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+#include "Locks.h"
+
+namespace paddle {
+
+/**
+ * A thread-safe queue that automatically grows but never shrinks.
+ * Dequeue a empty queue will block current thread. Enqueue an element
+ * will wake up another thread that blocked by dequeue method.
+ *
+ * For example.
+ * @code{.cpp}
+ *
+ * paddle::Queue<int> q;
+ * END_OF_JOB=-1
+ * void thread1() {
+ *   while (true) {
+ *     auto job = q.dequeue();
+ *     if (job == END_OF_JOB) {
+ *       break;
+ *     }
+ *     processJob(job);
+ *   }
+ * }
+ *
+ * void thread2() {
+ *   while (true) {
+ *      auto job = getJob();
+ *      q.enqueue(job);
+ *      if (job == END_OF_JOB) {
+ *        break;
+ *      }
+ *   }
+ * }
+ *
+ * @endcode
+ */
+template <class T>
+class Queue {
+ public:
+  /**
+   * @brief Construct Function. Default capacity of Queue is zero.
+   */
+  Queue() : numElements_(0) {}
+
+  ~Queue() {}
+
+  /**
+   * @brief enqueue an element into Queue.
+   * @param[in] el The enqueue element.
+   * @note This method is thread-safe, and will wake up another blocked thread.
+   */
+  void enqueue(const T& el) {
+    std::unique_lock<std::mutex> lock(queueLock_);
+    elements_.emplace_back(el);
+    numElements_++;
+
+    queueCV_.notify_all();
+  }
+
+  /**
+   * @brief enqueue an element into Queue.
+   * @param[in] el The enqueue element. rvalue reference .
+   * @note This method is thread-safe, and will wake up another blocked thread.
+   */
+  void enqueue(T&& el) {
+    std::unique_lock<std::mutex> lock(queueLock_);
+    elements_.emplace_back(std::move(el));
+    numElements_++;
+
+    queueCV_.notify_all();
+  }
+
+  /**
+   * Dequeue from a queue and return a element.
+   * @note this method will be blocked until not empty.
+   */
+  T dequeue() {
+    std::unique_lock<std::mutex> lock(queueLock_);
+    queueCV_.wait(lock, [this]() { return numElements_ != 0; });
+    T el;
+
+    using std::swap;
+    // Becuase of the previous statement, the right swap() can be found
+    // via argument-dependent lookup (ADL).
+    swap(elements_.front(), el);
+
+    elements_.pop_front();
+    numElements_--;
+    if (numElements_ == 0) {
+      queueCV_.notify_all();
+    }
+    return el;
+  }
+
+  /**
+   * Return size of queue.
+   *
+   * @note This method is not thread safe. Obviously this number
+   * can change by the time you actually look at it.
+   */
+  inline int size() const { return numElements_; }
+
+  /**
+   * @brief is empty or not.
+   * @return true if empty.
+   * @note This method is not thread safe.
+   */
+  inline bool empty() const { return numElements_ == 0; }
+
+  /**
+   * @brief wait util queue is empty
+   */
+  void waitEmpty() {
+    std::unique_lock<std::mutex> lock(queueLock_);
+    queueCV_.wait(lock, [this]() { return numElements_ == 0; });
+  }
+
+  /**
+   * @brief wait queue is not empty at most for some seconds.
+   * @param seconds wait time limit.
+   * @return true if queue is not empty. false if timeout.
+   */
+  bool waitNotEmptyFor(int seconds) {
+    std::unique_lock<std::mutex> lock(queueLock_);
+    return queueCV_.wait_for(lock, std::chrono::seconds(seconds), [this] {
+      return numElements_ != 0;
+    });
+  }
+
+ private:
+  std::deque<T> elements_;
+  int numElements_;
+  std::mutex queueLock_;
+  std::condition_variable queueCV_;
+};
+
+/*
+ * A thread-safe circular queue that
+ * automatically blocking calling thread if capacity reached.
+ *
+ * For example.
+ * @code{.cpp}
+ *
+ * paddle::BlockingQueue<int> q(capacity);
+ * END_OF_JOB=-1
+ * void thread1() {
+ *   while (true) {
+ *     auto job = q.dequeue();
+ *     if (job == END_OF_JOB) {
+ *       break;
+ *     }
+ *     processJob(job);
+ *   }
+ * }
+ *
+ * void thread2() {
+ *   while (true) {
+ *      auto job = getJob();
+ *      q.enqueue(job); //Block until q.size() < capacity .
+ *      if (job == END_OF_JOB) {
+ *        break;
+ *      }
+ *   }
+ * }
+ */
+template <typename T>
+class BlockingQueue {
+ public:
+  /**
+   * @brief Construct Function.
+   * @param[in] capacity the max numer of elements the queue can have.
+   */
+  explicit BlockingQueue(size_t capacity) : capacity_(capacity) {}
+
+  /**
+   * @brief enqueue an element into Queue.
+   * @param[in] x The enqueue element, pass by reference .
+   * @note This method is thread-safe, and will wake up another thread
+   * who was blocked because of the queue is empty.
+   * @note If it's size() >= capacity before enqueue,
+   * this method will block and wait until size() < capacity.
+   */
+  void enqueue(const T& x) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    notFull_.wait(lock, [&] { return queue_.size() < capacity_; });
+    queue_.push_back(x);
+    notEmpty_.notify_one();
+  }
+
+  /**
+   * Dequeue from a queue and return a element.
+   * @note this method will be blocked until not empty.
+   * @note this method will wake up another thread who was blocked because
+   * of the queue is full.
+   */
+  T dequeue() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    notEmpty_.wait(lock, [&] { return !queue_.empty(); });
+
+    T front(queue_.front());
+    queue_.pop_front();
+    notFull_.notify_one();
+    return front;
+  }
+
+  /**
+   * Return size of queue.
+   *
+   * @note This method is thread safe.
+   * The size of the queue won't change until the method return.
+   */
+  size_t size() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    return queue_.size();
+  }
+
+  /**
+   * @brief is empty or not.
+   * @return true if empty.
+   * @note This method is thread safe.
+   */
+  size_t empty() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    return queue_.empty();
+  }
+
+ private:
+  std::mutex mutex_;
+  std::condition_variable notEmpty_;
+  std::condition_variable notFull_;
+  std::deque<T> queue_;
+  size_t capacity_;
+};
+
+}  // namespace paddle
diff --git a/paddle/utils/Stat.cpp b/paddle/legacy/utils/Stat.cpp
similarity index 100%
rename from paddle/utils/Stat.cpp
rename to paddle/legacy/utils/Stat.cpp
diff --git a/paddle/legacy/utils/Stat.h b/paddle/legacy/utils/Stat.h
new file mode 100644
index 0000000000000000000000000000000000000000..100e9eba909466fcca57f755405ab63b638a8ebd
--- /dev/null
+++ b/paddle/legacy/utils/Stat.h
@@ -0,0 +1,302 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <sys/time.h>
+#include <iostream>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+
+#include "Locks.h"
+#include "Logging.h"
+#include "ThreadLocal.h"
+#include "hl_gpu.h"
+
+namespace paddle {
+
+class Stat;
+
+class StatInfo {
+ public:
+  explicit StatInfo(Stat* stat = nullptr) : stat_(stat) {
+    total_ = 0;
+    max_ = 0;
+    count_ = 0;
+    min_ = UINT64_MAX;
+  }
+
+  void reset() {
+    total_ = 0;
+    count_ = 0;
+    max_ = 0;
+    min_ = UINT64_MAX;
+  }
+
+  ~StatInfo();
+
+  Stat* stat_;
+  uint64_t total_;
+  uint64_t max_;
+  uint64_t count_;
+  uint64_t min_;
+};
+
+class Stat;
+typedef std::shared_ptr<Stat> StatPtr;
+
+class StatSet {
+ public:
+  explicit StatSet(const std::string& name) : name_(name) {}
+  ~StatSet() {}
+
+  // print to LOG(INFO)
+  void printSegTimerStatus();
+  void printAllStatus();
+
+  StatPtr getStat(const std::string& name) {
+    {
+      ReadLockGuard guard(lock_);
+      auto it = statSet_.find(name);
+      if (it != statSet_.end()) {
+        return it->second;
+      }
+    }
+    StatPtr stat = std::make_shared<Stat>(name);
+    std::lock_guard<RWLock> guard(lock_);
+    auto ret = statSet_.insert(std::make_pair(name, stat));
+    return ret.first->second;
+  }
+
+  // true for showing stats for each thread
+  // false for showing stats aggragated over threads
+  void setThreadInfo(const std::string& name, bool flag);
+
+  // true for showing stats for each thread
+  // false for showing stats aggragated over threads
+  void setThreadInfo(bool flag) {
+    for (auto& iter : statSet_) {
+      setThreadInfo(iter.first, flag);
+    }
+  }
+
+  // reset the counters for all stats
+  // clearRawData means also clearing raw tuning data, because at pserver end,
+  // barrier rawData(timeVector_) is stateful, clearing it will cause rubbish
+  // data, while rawData should be cleared at the new pass (so complicated
+  // pserver code logic, -_- ).
+  void reset(bool clearRawData = true);
+
+ private:
+  std::unordered_map<std::string, StatPtr> statSet_;
+  const std::string name_;
+  RWLock lock_;
+};
+
+extern StatSet globalStat;
+
+/*@brief : a simple stat*/
+class Stat {
+ public:
+  explicit Stat(const std::string& statName)
+      : destructStat_(nullptr), name_(statName), openThreadInfo_(false) {}
+  ~Stat() {}
+
+  typedef std::list<std::pair<StatInfo*, pid_t>> ThreadLocalBuf;
+
+  const std::string& getName() const { return name_; }
+
+  void addSample(uint64_t value);
+
+  // clear all stats
+  void reset();
+
+  friend std::ostream& operator<<(std::ostream& outPut, const Stat& stat);
+
+  /*  Set operator << whether to print thread info.
+   *  If openThreadInfo_ == true, then print, else print merge thread info.
+   */
+  void setThreadInfo(bool flag) { openThreadInfo_ = flag; }
+
+  bool getThreadInfo() const { return openThreadInfo_; }
+
+  friend class StatInfo;
+
+ private:
+  void mergeThreadStat(StatInfo& allThreadStat);
+
+  std::mutex lock_;
+  ThreadLocalBuf threadLocalBuf_;
+  StatInfo destructStat_;
+  ThreadLocal<StatInfo> statInfo_;
+  const std::string name_;
+  bool openThreadInfo_;
+};
+
+extern StatSet globalStat;
+
+inline StatPtr getStat(const std::string& name) {
+  return globalStat.getStat(name);
+}
+
+inline uint64_t nowInMicroSec() {
+  timeval tvTime;
+  (void)gettimeofday(&tvTime, NULL);
+  return tvTime.tv_sec * 1000000LU + tvTime.tv_usec;
+}
+
+/**
+ * A simple help class to measure time interval
+ */
+class Timer {
+ public:
+  explicit Timer(bool autoStart = true) : total_(0), startStamp_(0) {
+    if (autoStart) {
+      start();
+    }
+  }
+  void start() { startStamp_ = nowInMicroSec(); }
+  void setStartStamp(uint64_t startStamp) { startStamp_ = startStamp; }
+  uint64_t stop() {
+    total_ += nowInMicroSec() - startStamp_;
+    return total_;
+  }
+
+  uint64_t get() const { return total_; }
+
+  void reset() { total_ = 0; }
+
+ protected:
+  uint64_t total_;
+  uint64_t startStamp_;
+};
+
+class TimerOnce {
+ public:
+  TimerOnce(Stat* stat,
+            const char* info = "",
+            uint64_t threshold = -1,
+            bool autoStart = true,
+            uint64_t startStamp = 0)
+      : stat_(stat), info_(info), timer_(autoStart), threshold_(threshold) {
+    if (!autoStart) {
+      timer_.setStartStamp(startStamp);
+    }
+  }
+  ~TimerOnce() {
+    uint64_t span = timer_.stop();
+    if (span >= threshold_) {
+      LOG(INFO) << "Stat: [" << stat_->getName() << "] " << info_
+                << " [Span:" << span / 1000 << "ms" << span % 1000 << "us"
+                << "] ";
+    }
+    stat_->addSample(span);
+  }
+
+ private:
+  Stat* stat_;
+  const char* info_;
+  Timer timer_;
+  uint64_t threshold_;
+};
+
+inline uint64_t registerTimerArg1(uint64_t threshold = -1,
+                                  StatSet& statSet = globalStat) {
+  return threshold;
+}
+
+inline StatSet& registerTimerArg2(uint64_t threshold = -1,
+                                  StatSet& statSet = globalStat) {
+  return statSet;
+}
+
+#ifdef PADDLE_DISABLE_TIMER
+
+#define REGISTER_TIMER(statName, ...)
+#define REGISTER_TIMER_SET(statName, start, ...)
+#define REGISTER_TIMER_DYNAMIC(statName, ...)
+#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)
+#define REGISTER_TIMER_INFO(statName, info)
+#define FOR_TIMING(statement)
+
+#else
+
+#define FOR_TIMING(statement) statement
+
+// The default arguments are shown in the following line:
+// REGISTER_TIMER(statName, threshold = -1, statSet = globalStat)
+// TODO(yuyang18,wangyanfei01): if UNIQUE_NAME is needed
+#define REGISTER_TIMER(statName, ...)                             \
+  static ::paddle::StatPtr __stat =                               \
+      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
+  ::paddle::TimerOnce __timerOnce(                                \
+      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
+
+#define REGISTER_TIMER_SET(statName, start, ...)                            \
+  static ::paddle::StatPtr __stat =                                         \
+      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
+  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
+                                  "",                                       \
+                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
+                                  false,                                    \
+                                  start);
+
+// dynmaic timer, support to discriminate runtime entity, used in pserver
+#define REGISTER_TIMER_DYNAMIC(statName, ...)                     \
+  ::paddle::StatPtr __stat =                                      \
+      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
+  ::paddle::TimerOnce __timerOnce(                                \
+      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
+
+#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)                    \
+  ::paddle::StatPtr __stat =                                                \
+      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
+  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
+                                  "",                                       \
+                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
+                                  false,                                    \
+                                  start);
+
+#define REGISTER_TIMER_INFO(statName, info)                                 \
+  static ::paddle::StatPtr __stat = ::paddle::globalStat.getStat(statName); \
+  ::paddle::TimerOnce __timerOnce(                                          \
+      __stat.get(), info, 10 * 1000000LU /*threshold*/);
+
+#endif  // DISABLE_TIMER
+
+class GpuProfiler final {
+ public:
+  GpuProfiler(std::string statName, std::string info);
+  ~GpuProfiler();
+
+ private:
+  std::lock_guard<std::recursive_mutex> guard_;
+};
+
+#ifdef PADDLE_DISABLE_PROFILER
+
+#define REGISTER_GPU_PROFILER(statName, ...)
+
+#else
+
+#define REGISTER_GPU_PROFILER(statName, ...) \
+  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
+
+#endif  // DISABLE_PROFILER
+
+}  // namespace paddle
diff --git a/paddle/utils/StringUtil.cpp b/paddle/legacy/utils/StringUtil.cpp
similarity index 100%
rename from paddle/utils/StringUtil.cpp
rename to paddle/legacy/utils/StringUtil.cpp
diff --git a/paddle/utils/StringUtil.h b/paddle/legacy/utils/StringUtil.h
similarity index 100%
rename from paddle/utils/StringUtil.h
rename to paddle/legacy/utils/StringUtil.h
diff --git a/paddle/legacy/utils/Thread.h b/paddle/legacy/utils/Thread.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ee6eba1a68202282537788160a77f7689a2ffdb
--- /dev/null
+++ b/paddle/legacy/utils/Thread.h
@@ -0,0 +1,615 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <thread>
+#include "Logging.h"
+#include "Util.h"
+
+#include "Queue.h"
+#include "ThreadLocal.h"
+
+#include <future>
+
+namespace paddle {
+
+/**
+ * A simple wrapper for std::thread
+ */
+
+class Thread {
+ public:
+  /**
+   * @brief Construct Function. Default thread pointer is null.
+   */
+  Thread() { thread_ = nullptr; }
+
+  virtual ~Thread() {}
+
+  /**
+   * @brief Creat a new thread and call *run()* function.
+   */
+  void start() {
+    thread_.reset(new std::thread([this]() { this->run(); }));
+  }
+
+  /**
+   * @brief Detach the thread.
+   * It don't need to be waited until it finish.
+   */
+  void detach() { thread_->detach(); }
+
+  /**
+   * @brief Join the thread.
+   * It should be waited until it finish.
+   */
+  void join() { thread_->join(); }
+
+  /**
+   * @brief Define what to be done on this thread through override this
+   * function.
+   */
+  virtual void run() = 0;
+
+ protected:
+  std::unique_ptr<std::thread> thread_;
+};
+
+/**
+ * ThreadWorker maintains a job queue. It executes the jobs in the job queue
+ * sequentianlly in a separate thread.
+ *
+ * Use addJob() to add a new job to the job queue.
+ */
+class ThreadWorker : protected Thread {
+ public:
+  typedef std::function<void()> JobFunc;
+
+  /**
+   * @brief Construct Function. Default size of job queue is 0 and not stopping.
+   */
+  ThreadWorker() : stopping_(false), empty_(true) { start(); }
+
+  /**
+   * @brief Destruct Function.
+   * If it's running, wait until all job finish and then stop it.
+   */
+  ~ThreadWorker() {
+    if (!stopping_) {
+      wait();
+      stop();
+    }
+  }
+
+  /**
+   * @brief Finish current running job and quit the thread.
+   */
+  void stop() {
+    stopping_ = true;
+    jobs_.enqueue([]() {});
+    join();
+  }
+
+  /**
+   * @brief Add a new job to the job queue.
+   */
+  void addJob(JobFunc func) {
+    empty_ = false;
+    jobs_.enqueue(func);
+  }
+
+  /**
+   * @brief Wait until all jobs was done (the job queue was empty).
+   */
+  void wait() {
+    finishCV_.wait([this] { return empty_; });
+  }
+
+ protected:
+  /**
+   * @brief Execute jobs in the job queue sequentianlly,
+   * @note If finish all the jobs in the job queue,
+   * notifies all the waiting threads the job queue was empty.
+   */
+  virtual void run() {
+    while (true) {
+      JobFunc func = jobs_.dequeue();
+      if (stopping_) break;
+      func();
+      if (jobs_.empty()) {
+        finishCV_.notify_all([this] { empty_ = true; });
+      }
+    }
+  }
+
+  Queue<JobFunc> jobs_;
+  bool stopping_;
+  LockedCondition finishCV_;
+  bool empty_;
+};
+
+/**
+ * SyncThreadPool maintains a pool of threads.
+ * It executes the job use all workers in the pool.
+ *
+ * Use exec() to run a new job, job complete when exec returned.
+ * Only one job can exec simultaneously.
+ *
+ * Each worker has an tid whose range is [0, getNumThreads()).
+ * JobFunc can use tid to divide input data.
+ */
+class SyncThreadPool {
+ public:
+  typedef std::function<void(int tid, size_t numThreads)> JobFunc;
+
+  /**
+   * @brief Construct Function. No thread will be created.
+   */
+  SyncThreadPool() : jobStartBarrier_(0), jobFinishBarrier_(0) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief Construct Fucntion. Create numWorkers of threads in the pool.
+   * @param[in] numWorkers Number of the workers in the pool.
+   * @param[in] checkOwner Default true. If checkOwner is true,
+   * this sync thread pool should be used by it's owner thread.
+   */
+  explicit SyncThreadPool(size_t numWorkers, bool checkOwner = true)
+      : stopping_(false),
+        jobStartBarrier_(numWorkers + 1),
+        jobFinishBarrier_(numWorkers + 1),
+        jobFunc_(nullptr),
+        checkOwner_(checkOwner) {
+    ownerThreadId_ = getTID();
+    workers_.resize(numWorkers);
+    start();
+  }
+
+  ~SyncThreadPool() {
+    if (!stopping_) {
+      stop();
+    }
+  }
+
+  /**
+   * @brief Return num of threads in the pool.
+   */
+  size_t getNumThreads() { return workers_.size(); }
+
+  /**
+   * @brief Execute a job using all the theads in the pool.
+   * @param[in] jobFunc The function to be executed.
+   * @param[in] ownerFunc Owner thread can do something in owerFunc when job
+   * executing.
+   * @note For the ownerFunc, tid=getNumThreads().
+   */
+  void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) {
+    if (checkOwner_) {
+      CHECK_EQ(ownerThreadId_, getTID())
+          << "this sync thread pool should be used in one thread";
+    }
+
+    CHECK(jobFunc_ == nullptr);
+    jobFunc_ = jobFunc;
+    jobStartBarrier_.wait();  // notify worker thread start job
+
+    if (ownerFunc) {
+      ownerFunc(workers_.size(), workers_.size());
+    }
+
+    jobFinishBarrier_.wait();  // wait all worker thread complete
+    jobFunc_ = nullptr;
+  }
+
+  /**
+   * @brief Execute a job using all the threads in the pool.
+   * And the owner thread will do the same job.
+   * @param jobFunc The job to be executed.
+   * @note  Assume that JobFunc will execute numThread + 1 times,
+   * with tid ranging [0,numThread]. The thread whose tid is numThread
+   * is the owner thread.
+   */
+  void execPlusOwner(JobFunc jobFunc) { exec(jobFunc, jobFunc); }
+
+  /**
+   * @brief Execute a job if has pool, else use caller thread as a worker.
+   * @param[in] pool The pool to execute the job.
+   * @param[in] jobFunc The job to be excuted.
+   */
+  static void execHelper(SyncThreadPool* pool, JobFunc jobFunc) {
+    if (pool) {
+      pool->exec(jobFunc);
+    } else {
+      jobFunc(0, 1);
+    }
+  }
+
+ protected:
+  /**
+   * @brief Start all the workers in the pool, call their run() function.
+   */
+  void start() {
+    for (size_t i = 0; i < workers_.size(); ++i) {
+      workers_[i].reset(
+          new std::thread([this](int tid) { this->run(tid); }, i));
+    }
+  }
+
+  /**
+   * @brief Stop all the workers in the pool.
+   */
+  void stop() {
+    stopping_ = true;
+    // notify worker thread to stop
+    jobStartBarrier_.wait();
+
+    // stop workers
+    for (auto& thread : workers_) {
+      if (thread) {
+        thread->join();
+        thread.reset(nullptr);
+      }
+    }
+  }
+
+  /**
+   * @brief Execute the jobFunc_ using the worker thread tid, if not stopping.
+   */
+  void run(int tid) {
+    VLOG(1) << "SyncThreadPool worker thread " << tid;
+    // init seed deterministic, but differs from global srand()
+    ThreadLocalRand::initThreadSeed(tid + workers_.size());
+
+    while (true) {
+      jobStartBarrier_.wait();  // wait job
+
+      if (stopping_) {
+        break;
+      }
+
+      jobFunc_(tid, workers_.size());
+
+      jobFinishBarrier_.wait();  // notify job complete
+    }
+  }
+
+ protected:
+  pid_t ownerThreadId_;
+  bool stopping_;
+  ThreadBarrier jobStartBarrier_;
+  ThreadBarrier jobFinishBarrier_;
+
+  JobFunc jobFunc_;
+  bool checkOwner_;
+  std::vector<std::unique_ptr<std::thread>> workers_;
+};
+
+/**
+ * MultiThreadWorker maintains a job queue and a result queue.
+ * It executes the jobs in the job queue and puts the results into the
+ * result queue sequentially in multi separate threads.
+ *
+ * Add jobs:
+ *
+ *    Use addJob() to add a new job to the job queue
+ *        (the user added jobs should not return nullptr).
+ *
+ *    Use stopAddJob() to stop adding new jobs to the job queue
+ *        (addJob() can not be called after stopAddJob()).
+ *
+ * Normal stop:
+ *
+ *    Use waitResult() to get the results until nullptr is returned.
+ *    Use stop() to exit normally
+ *        (stopAddJob() should be called first).
+ *
+ * Force stop:
+ *
+ *    Use forceStop() to exit forcibly even though there are remaining jobs in
+ * the
+ * job queue.
+ */
+template <class T>
+class MultiThreadWorker {
+ public:
+  typedef T ResultType;
+  typedef std::shared_ptr<ResultType> ResultPtrType;
+  typedef std::function<ResultPtrType()> JobFunc;
+  /**
+   * @brief Construct Function. Initialize the multithread worker.
+   * @param[in] workerNum Number of the workers.
+   * @param[in] queueCapacity Capapcity of the result queue.
+   */
+  MultiThreadWorker(size_t workerNum, size_t queueCapacity)
+      : stopping_(false),
+        jobAdding_(true),
+        nullResultNum_(0),
+        results_(queueCapacity) {
+    workers_.resize(workerNum);
+    for (auto& worker : workers_) {
+      worker.reset(new std::thread([this]() { this->run(); }));
+    }
+  }
+
+  /**
+   * @brief Destruct Function. Force stop the workers
+   * even though there are remaining jobs in the job queue.
+   */
+  virtual ~MultiThreadWorker() { forceStop(); }
+
+  /**
+   * @brief Stop all the workers normally.
+   * @note stopAddJob() should be called before it.
+   */
+  void stop() {
+    CHECK(!jobAdding_) << "stopAddJob() should be called before stop()";
+    for (auto& worker : workers_) {
+      if (worker) {
+        worker->join();
+        worker = nullptr;
+      }
+    }
+    stopping_ = true;
+  }
+
+  /**
+   * @brief Stop all the workers forcibly.
+   * @note This function will call stopAddJob() first
+   * and empty the result queue.
+   */
+  void forceStop() {
+    if (!stopping_) {
+      stopping_ = true;
+      stopAddJob();
+      while (nullptr != waitResult()) {
+      }
+      stop();
+    }
+  }
+
+  /**
+   * @brief Add a job to the job queue.
+   * @note Job can not be added after calling stopAddJob().
+   */
+  void addJob(JobFunc func) {
+    CHECK(jobAdding_) << "addJob() can not be called after stopAddJob()";
+    jobs_.enqueue(func);
+  }
+
+  /**
+   * @brief Stop adding new jobs to the job queue.
+   * @note This fuction enqueue a return nullptr function to the job queue.
+   */
+  void stopAddJob() {
+    for (size_t i = 0; i < workers_.size(); ++i) {
+      jobs_.enqueue([]() { return nullptr; });
+    }
+    jobAdding_ = false;
+  }
+
+  /**
+   * @brief Dequeue the first result in the result queue and return it.
+   * @note If the result queue is empty, wait until it's not empty
+   * or return nullptr if all the results have been returned.
+   */
+  ResultPtrType waitResult() {
+    while (true) {
+      ResultPtrType result = results_.dequeue();
+      if (result) {
+        return result;
+      }
+
+      ++nullResultNum_;
+      if (nullResultNum_ == workers_.size()) {
+        return nullptr;
+      }
+    }
+  }
+
+  /**
+   * @brief The result queue is empty or not.
+   * @return true if empty.
+   */
+  bool testResult() { return results_.empty(); }
+
+ protected:
+  /**
+   * @brief Do the jobs in the job queue sequentianlly
+   * and enqueue the result into the result queue.
+   * @note A nullptr will be enqueued into the resulte queue, when a worker
+   * finished.
+   */
+  virtual void run() {
+    while (true) {
+      JobFunc func = jobs_.dequeue();
+      ResultPtrType result = func();
+      if (result == nullptr || stopping_) {
+        // When a worker finished, a nullptr would be enqueued into results_
+        results_.enqueue(nullptr);
+        break;
+      }
+      results_.enqueue(result);
+    }
+  }
+
+  bool stopping_;
+  bool jobAdding_;
+  size_t nullResultNum_;
+  Queue<JobFunc> jobs_;
+  BlockingQueue<ResultPtrType> results_;
+  std::vector<std::unique_ptr<std::thread>> workers_;
+};
+
+/**
+ * AsyncThreadPool maintains a job queue and threads pool.
+ * It executes the jobs from queue asynchronously.
+ *
+ * Add jobs:
+ *
+ *    Use addJob() to add a new job to the job queue and get a std::future
+ *    result. The caller's thread continues running. Call std::future::get()
+ *    when the result's value is needed, and the caller's thread may be
+ *    blocked until thread-pool finished the job.
+ *
+ *    Use addBatchJobs() to add a batch of jobs.
+ *    Unlike addJob()'s asynchronization, addBatchJobs will block caller's
+ *    thread until all jobs in the batch are finished.
+ *
+ * Stop:
+ *    Use stop() to stop the thread pool. Job can be added once stopped.
+ *
+ * Process-wide Singleton:
+ *    Use AsyncThreadPool::ProcessChannel(N) first to create N threads.
+ *    Then call AsyncThreadPool::ProcessChannel() to get the process-wide global
+ *    thread pool.
+ */
+class AsyncThreadPool {
+ public:
+  typedef std::function<void()> JobFunc;
+
+  AsyncThreadPool() { LOG(FATAL) << "Not implemented"; }
+
+  /**
+   * @brief Construct Function. Install all the workers.
+   * @param[in] threadNum Number of the threads, must greater than 1.
+   */
+  explicit AsyncThreadPool(size_t threadNum) {
+    CHECK_GT(threadNum, 1U);
+    stopping_ = false;
+    workers_.resize(threadNum);
+    for (auto& worker : workers_) {
+      worker.reset(new std::thread([this]() { this->run(); }));
+    }
+  }
+
+  ~AsyncThreadPool() {
+    if (!stopping_) {
+      stop();
+    }
+  }
+
+  /**
+   * @brief Stop all the workers normally.
+   */
+  void stop() {
+    stopping_ = true;
+    for (size_t i = 0; i < workers_.size(); i++) {
+      jobs_.enqueue([] {});
+    }
+    for (auto& worker : workers_) {
+      worker->join();
+    }
+  }
+
+  /**
+   * @brief A process-wide singleton. Used as a global thread pool
+   *    It should be initialized by calling
+   *    AsyncThreadPool::ProcessChannel(N) first to create N threads,
+   *    then call AsyncThreadPool::ProcessChannel() will get the thread pool.
+   */
+  static AsyncThreadPool& ProcessChannel(size_t initThreadNum = 0) {
+    static std::shared_ptr<AsyncThreadPool> channel(
+        new AsyncThreadPool(initThreadNum));
+    return *channel;
+  }
+
+  /**
+   * @brief Add a job to queue and return a std::future.
+   * @note The job will be executed
+   * asynchronously.
+   * Call std::future::get() when the execturation result is needed;
+   */
+  template <class F, class... Args>
+  auto addJob(F&& f, Args&&... args)
+      -> std::future<typename std::result_of<F(Args...)>::type> {
+    CHECK(!stopping_) << "AsyncThreadPool is closed";
+    typedef typename std::result_of<F(Args...)>::type T;
+
+    auto task = std::make_shared<std::packaged_task<T()>>(
+        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+    auto res = task->get_future();
+    jobs_.enqueue([task] { (*task)(); });
+    return res;
+  }
+
+  /**
+   * @brief Add a batch of jobs to the queue. The main thread will be blocked
+   * until these jobs are finished.
+   * The results will be stored in  `results` according to `jobs` order.
+   *
+   * @tparam F should have a return value.
+   *
+   * @param[in] jobs a vector of executable objection.
+   * @param[in] results a vector to store the results.
+   *
+   * @note *results* may need to be carefully cleared before *addBatchJobs()*.
+   */
+  template <class F>
+  void addBatchJobs(const std::vector<F>& jobs,
+                    std::vector<typename std::result_of<F()>::type>& results) {
+    typedef typename std::result_of<F()>::type T;
+    static_assert(!std::is_same<T, void>::value,
+                  "should pass a non-void function as job");
+
+    std::vector<std::future<T>> resFuts;
+    for (const auto& job : jobs) {
+      resFuts.emplace_back(addJob(job));
+    }
+    for (auto& fut : resFuts) {
+      results.emplace_back(fut.get());
+    }
+  }
+
+  /**
+   * @brief Add a batch of jobs reguardless of its result.
+   * @tparam F don't need to have a return value.
+   * @param[in] jobs a vector of executable objection.
+   */
+  template <class F>
+  void addBatchJobs(const std::vector<F>& jobs) {
+    CHECK(!stopping_) << "AsyncThreadPool is closed";
+    std::vector<std::future<bool>> tmpRes;
+
+    for (const auto& job : jobs) {
+      tmpRes.emplace_back(addJob([&job] {
+        job();
+        return true;
+      }));
+    }
+
+    for (auto& res : tmpRes) {
+      res.get();
+    }
+  }
+
+ protected:
+  /**
+   * @brief Execute the jobs in the job queue.
+   */
+  void run() {
+    while (true) {
+      JobFunc func = jobs_.dequeue();
+      func();
+      if (stopping_) break;
+    }
+  }
+
+ private:
+  std::vector<std::unique_ptr<std::thread>> workers_;
+  Queue<JobFunc> jobs_;
+  bool stopping_;
+};  // class AsyncThreadPool
+
+}  // namespace paddle
diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/legacy/utils/ThreadLocal.cpp
similarity index 100%
rename from paddle/utils/ThreadLocal.cpp
rename to paddle/legacy/utils/ThreadLocal.cpp
diff --git a/paddle/legacy/utils/ThreadLocal.h b/paddle/legacy/utils/ThreadLocal.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5b07506d36875ead65887ea2e221e762be0d621
--- /dev/null
+++ b/paddle/legacy/utils/ThreadLocal.h
@@ -0,0 +1,229 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <pthread.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <map>
+#include <mutex>
+#include <random>
+#include "Logging.h"
+#include "Util.h"
+
+namespace paddle {
+
+/**
+ * Thread local storage for object.
+ * Example:
+ *
+ * Declarartion:
+ * ThreadLocal<vector<int>> vec_;
+ *
+ * Use in thread:
+ * vector<int>& vec = *vec; // obtain the thread specific object
+ * vec.resize(100);
+ *
+ * Note that this ThreadLocal will desconstruct all internal data when thread
+ * exits
+ * This class is suitable for cases when frequently creating and deleting
+ * threads.
+ *
+ * Consider implementing a new ThreadLocal if one needs to frequently create
+ * both instances and threads.
+ *
+ * see also ThreadLocalD
+ */
+template <class T>
+class ThreadLocal {
+ public:
+  ThreadLocal() {
+    CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
+  }
+  ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
+
+  /**
+   * @brief get thread local object.
+   * @param if createLocal is true and thread local object is never created,
+   * return a new object. Otherwise, return nullptr.
+   */
+  T* get(bool createLocal = true) {
+    T* p = (T*)pthread_getspecific(threadSpecificKey_);
+    if (!p && createLocal) {
+      p = new T();
+      int ret = pthread_setspecific(threadSpecificKey_, p);
+      CHECK_EQ(ret, 0);
+    }
+    return p;
+  }
+
+  /**
+   * @brief set (overwrite) thread local object. If there is a thread local
+   * object before, the previous object will be destructed before.
+   *
+   */
+  void set(T* p) {
+    if (T* q = get(false)) {
+      dataDestructor(q);
+    }
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
+  }
+
+  /**
+   * return reference.
+   */
+  T& operator*() { return *get(); }
+
+  /**
+   * Implicit conversion to T*
+   */
+  operator T*() { return get(); }
+
+ private:
+  static void dataDestructor(void* p) { delete (T*)p; }
+
+  pthread_key_t threadSpecificKey_;
+};
+
+/**
+ * Almost the same as ThreadLocal, but note that this ThreadLocalD will
+ * destruct all internal data when ThreadLocalD instance destructs.
+ *
+ * This class is suitable for cases when frequently creating and deleting
+ * objects.
+ *
+ * see also ThreadLocal
+ *
+ * @note The type T must implemented default constructor.
+ */
+template <class T>
+class ThreadLocalD {
+ public:
+  ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
+  ~ThreadLocalD() {
+    pthread_key_delete(threadSpecificKey_);
+    for (auto t : threadMap_) {
+      dataDestructor(t.second);
+    }
+  }
+
+  /**
+   * @brief Get thread local object. If not exists, create new one.
+   */
+  T* get() {
+    T* p = (T*)pthread_getspecific(threadSpecificKey_);
+    if (!p) {
+      p = new T();
+      CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
+      updateMap(p);
+    }
+    return p;
+  }
+
+  /**
+   * @brief Set thread local object. If there is an object create before, the
+   * old object will be destructed.
+   */
+  void set(T* p) {
+    if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
+      dataDestructor(q);
+    }
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
+    updateMap(p);
+  }
+
+  /**
+   * @brief Get reference of the thread local object.
+   */
+  T& operator*() { return *get(); }
+
+ private:
+  static void dataDestructor(void* p) { delete (T*)p; }
+
+  void updateMap(T* p) {
+    pid_t tid = getTID();
+    CHECK_NE(tid, -1);
+    std::lock_guard<std::mutex> guard(mutex_);
+    auto ret = threadMap_.insert(std::make_pair(tid, p));
+    if (!ret.second) {
+      ret.first->second = p;
+    }
+  }
+
+  pthread_key_t threadSpecificKey_;
+  std::mutex mutex_;
+  std::map<pid_t, T*> threadMap_;
+};
+
+/**
+ * @brief Thread-safe C-style random API.
+ */
+class ThreadLocalRand {
+ public:
+  /**
+   * initSeed just like srand,
+   * called by main thread,
+   * init defaultSeed for all thread
+   */
+  static void initSeed(unsigned int seed) { defaultSeed_ = seed; }
+
+  /**
+   * initThreadSeed called by each thread,
+   * init seed to defaultSeed + *tid*
+   * It should be called after main initSeed and before using rand()
+   * It's optional, getSeed will init seed if it's not initialized.
+   */
+  static void initThreadSeed(int tid) {
+    seed_.set(new unsigned int(defaultSeed_ + tid));
+  }
+
+  /// thread get seed, then can call rand_r many times.
+  /// Caller thread can modify the seed value if it's necessary.
+  ///
+  /// if flag thread_local_rand_use_global_seed set,
+  /// the seed will be set to defaultSeed in thread's first call.
+  static unsigned int* getSeed();
+
+  /// like ::rand
+  static int rand() { return rand_r(getSeed()); }
+
+  /**
+   * Get defaultSeed for all thread.
+   */
+  static int getDefaultSeed() { return defaultSeed_; }
+
+ protected:
+  static unsigned int defaultSeed_;
+  static ThreadLocal<unsigned int> seed_;
+};
+
+/**
+ * @brief Thread-safe C++ style random engine.
+ */
+class ThreadLocalRandomEngine {
+ public:
+  /**
+   * get random_engine for each thread.
+   *
+   * Engine's seed will be initialized by ThreadLocalRand.
+   */
+  static std::default_random_engine& get();
+
+ protected:
+  static ThreadLocal<std::default_random_engine> engine_;
+};
+
+}  // namespace paddle
diff --git a/paddle/utils/Util.cpp b/paddle/legacy/utils/Util.cpp
similarity index 100%
rename from paddle/utils/Util.cpp
rename to paddle/legacy/utils/Util.cpp
diff --git a/paddle/legacy/utils/Util.h b/paddle/legacy/utils/Util.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6f05e30d308b8b94935897e947350934a5971ee
--- /dev/null
+++ b/paddle/legacy/utils/Util.h
@@ -0,0 +1,570 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <sys/syscall.h>  // for syscall()
+#include <sys/types.h>
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "Common.h"
+#include "Logging.h"
+#include "TrainerConfig.pb.h"
+
+#include "Flags.h"
+#include "hl_gpu.h"
+
+#if defined(__ANDROID__) && (__ANDROID_API__ < 21)
+inline int rand_r(unsigned int* seedp) {
+  (void)seedp;
+  return rand();
+}
+#endif
+
+/**
+ * Loop over the elements in a container
+ * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,
+ *                 or make it a inline method?
+ * Example:
+ * FOR_EACH(it, array) {
+ *  sum += *it;
+ * }
+ */
+#define FOR_EACH(iterator_name, container)                              \
+  for (auto iterator_name = (container).begin(), e = (container).end(); \
+       iterator_name != e;                                              \
+       ++iterator_name)
+
+/**
+ * Loop over the elements in a container in reverse order
+ * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,
+ *                 or make it a inline method?
+ * Example:
+ * FOR_EACH_R(it, array) {
+ *  sum += *it;
+ * }
+ */
+#define FOR_EACH_R(iterator_name, container)                              \
+  for (auto iterator_name = (container).rbegin(), e = (container).rend(); \
+       iterator_name != e;                                                \
+       ++iterator_name)
+
+namespace paddle {
+
+// return the thread id used by glog
+pid_t getTID();
+
+/**
+ * return the 1-based index of the highest bit set
+ *
+ * for x > 0:
+ * \f[
+ *    findLastSet(x) = 1 + \floor*{\log_{2}x}
+ * \f]
+ */
+inline constexpr size_t findLastSet(size_t x) {
+  return std::is_same<size_t, unsigned int>::value
+             ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
+             : (std::is_same<size_t, unsigned long>::value  // NOLINT
+                    ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
+                    : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
+}
+
+/**
+ * calculate the non-negative remainder of a/b
+ * @param[in] a
+ * @param[in] b, should be positive
+ * @return the non-negative remainder of a / b
+ */
+inline int mod(int a, int b) {
+  int r = a % b;
+  return r >= 0 ? r : r + b;
+}
+
+/**
+ * find the value given a key k from container c.
+ * If the key can be found, the value is stored in *value
+ * return true if the key can be found. false otherwise.
+ */
+template <class K, class V, class C>
+bool mapGet(const K& k, const C& c, V* value) {
+  auto it = c.find(k);
+  if (it != c.end()) {
+    *value = it->second;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+template <class Container, class T>
+static bool contains(const Container& container, const T& val) {
+  return std::find(container.begin(), container.end(), val) != container.end();
+}
+
+/**
+ * pop and get the front element of a container
+ */
+template <typename Container>
+typename Container::value_type pop_get_front(Container& c) {
+  typename Container::value_type v;
+  swap(v, c.front());
+  c.pop_front();
+  return v;
+}
+
+#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a)))
+
+/**
+ * Initialize some creators or initFunctions for layers and data
+ * providers.
+ * Client codes should call this function before they refer any other
+ * codes that use the layer class and data provider class.
+ *
+ * Codes inside 'core' directory can call initMain which calls
+ * runInitFunctions directly, while codes outside core can simply
+ * call runInitFunctions if they don't need the commandline flags
+ * designed for PADDLE main procedure.
+ */
+void runInitFunctions();
+
+/**
+ * Initialize logging and parse commandline
+ */
+void initMain(int argc, char** argv);
+
+// read the whole file into a string
+std::string readFile(const std::string& fileName);
+
+// copy file to path
+void copyFileToPath(const std::string& file, const std::string& path);
+
+// test file exist or not
+bool fileExist(const char* filename);
+// touch file if not exist
+void touchFile(const char* filename);
+// make dir if not exist
+void mkDir(const char* filename);
+void mkDirRecursively(const char* filename);
+
+void rmDir(const char* folderName);
+
+// load a file list file into a vector(fileList)
+void loadFileList(const std::string& fileListFileName,
+                  std::vector<std::string>& fileList);
+
+/**
+ * Register a function, the function will be called in initMain(). Functions
+ * with higher priority will be called first. The execution order of functions
+ * with same priority is not defined.
+ */
+void registerInitFunction(std::function<void()> func, int priority = 0);
+class InitFunction {
+ public:
+  explicit InitFunction(std::function<void()> func, int priority = 0) {
+    registerInitFunction(func, priority);
+  }
+};
+
+/**
+ * Class SetDevice provides a mechanism for set device enviroment.
+ * When a SetDevice object is created, it attempts to change device enviroment.
+ * When the SetDevice object is destructed, it will restore device environment.
+ */
+class SetDevice {
+ public:
+  explicit SetDevice(int deviceId) {
+    isSet_ = deviceId >= 0;
+    devId_ = 0;
+    if (isSet_) {
+      devId_ = hl_get_device();
+      hl_set_device(deviceId);
+    }
+  }
+  ~SetDevice() {
+    if (isSet_) {
+      hl_set_device(devId_);
+    }
+  }
+
+ protected:
+  bool isSet_;
+  int devId_;
+};
+
+/**
+ * Enables direct access to memory allocations on a peer device(d2).
+ * input:
+ * *d1* is device can direct access device d2.
+ * *d2* is peer device to enable direct access to by the d1 device.
+ */
+inline void enablePeerAccess(int d1, int d2) {
+#ifdef PADDLE_WITH_CUDA
+  if (hl_device_can_access_peer(d1, d2)) {
+    SetDevice dev(d1);
+    hl_device_enable_peer_access(d2);
+  }
+#else
+  LOG(FATAL) << "Paddle should be compiled in GPU mode to use this method.";
+#endif
+}
+
+/**
+ * Change the gpu computation mode to asynchronized mode for the rest of the
+ * compilation block. This is useful if the computation consists of multiple
+ * small steps. Async mode can overlap the cuda-kernel launch overhead with the
+ * actual computation.
+ * Example:
+ * {
+ *    AsycnGpuBlock asyncBlock;
+ *    do_some_gpu_computation
+ * }
+ */
+class AsyncGpuBlock {
+ public:
+  AsyncGpuBlock() : syncFlag_(hl_get_sync_flag()) { hl_set_sync_flag(false); }
+  ~AsyncGpuBlock() {
+    if (syncFlag_) {
+      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
+      hl_set_sync_flag(syncFlag_);
+    }
+  }
+
+ private:
+  bool syncFlag_;
+};
+
+inline bool useGpu(int deviceId) {
+  return FLAGS_parallel_nn ? (deviceId >= 0 ? true : false) : FLAGS_use_gpu;
+}
+
+/*
+ * hppl activation mode
+ */
+hl_activation_mode_t hlActiveType(const std::string& type);
+
+/**
+ * Return value: memory usage ratio (from 0-1)
+ */
+double getMemoryUsage();
+
+/**
+ * split array by index.
+ * used by sync multi thread task,
+ * each thread call calcSplitArrayInterval with thread id,
+ * get a interval as return.
+ * input:
+ * *totalSize* is array size,
+ * *tId* is thread id, *tSize* is total worker thread num
+ * output:
+ * start and end index as a std::pair
+ */
+inline std::pair<size_t, size_t> calcSplitArrayInterval(size_t totalSize,
+                                                        size_t tId,
+                                                        size_t tSize) {
+  size_t start = totalSize * tId / tSize;
+  size_t end = totalSize * (tId + 1) / tSize;
+  return std::make_pair(start, end);
+}
+
+/**
+ * same as above, but split at boundary of block.
+ */
+inline std::pair<size_t, size_t> calcSplitArrayInterval(size_t totalSize,
+                                                        size_t tId,
+                                                        size_t tSize,
+                                                        size_t blockSize) {
+  size_t numBlocks = totalSize / blockSize;
+  if (numBlocks * blockSize < totalSize) {
+    numBlocks++;
+  }
+
+  auto interval = calcSplitArrayInterval(numBlocks, tId, tSize);
+  size_t start = std::min(interval.first * blockSize, totalSize);
+  size_t end = std::min(interval.second * blockSize, totalSize);
+
+  return std::make_pair(start, end);
+}
+
+// Calculate the number of pservers/dservers based
+// on the host list and port_num.
+size_t calculateServiceNum(const std::string& pservers, int ports_num);
+
+/**
+ * sort and unique ids vector.
+ */
+inline void uniqueIds(std::vector<uint32_t>& ids) {
+  std::sort(ids.begin(), ids.end());
+  auto endpos = std::unique(ids.begin(), ids.end());
+  ids.erase(endpos, ids.end());
+}
+
+/**
+ * Read Type value
+ */
+template <typename T>
+T readT(char*& p, const char* pEnd) {
+  int minus = pEnd - p - sizeof(T);
+  CHECK_LE(0, minus) << "readT: Out of range.";
+  T v = *reinterpret_cast<T*>(p);
+  p += sizeof(T);
+  return v;
+}
+
+void memcpyWithCheck(void* dest,
+                     const void* src,
+                     size_t num,
+                     const void* srcEnd);
+
+/**
+ * A global sync thread pool, has #FLAGS_trainer_count of threads.
+ * can be used in main thread.
+ */
+class SyncThreadPool;
+SyncThreadPool* getGlobalSyncThreadPool();
+
+namespace path {
+
+// directory separator
+const char sep = '/';
+
+// Return the base name of pathname path.
+std::string basename(const std::string& path);
+
+// Return the directory name of path. If the path does not contains any
+// directory, it returns an empty string.
+std::string dirname(const std::string& path);
+
+/*
+  Join two path components intelligently.
+  The return value is the concatenation of part1 and part2 with exactly one
+  directory separator (path.sep) following each non-empty part except the last,
+  meaning that the result will only end in a separator if the last part is
+  empty.
+  If a component is an absolute path, all previous components are thrown away
+  and joining continues from the absolute path component.
+*/
+std::string join(const std::string& part1, const std::string& part2);
+
+template <typename... Args>
+std::string join(const std::string& part1,
+                 const std::string& part2,
+                 Args... args) {
+  return join(join(part1, part2), args...);
+}
+
+}  // namespace path
+
+/**
+ * A Checker for each invoke of method in same thread.
+ */
+class SameThreadChecker {
+ public:
+  SameThreadChecker() {}
+
+  /**
+   * Disable copy
+   */
+  SameThreadChecker(const SameThreadChecker& other) = delete;
+  SameThreadChecker& operator=(const SameThreadChecker& other) = delete;
+
+  /**
+   * Each invoke of check method should be in same thread, otherwise, it will
+   * failed and core dump.
+   */
+  void check() {
+    std::thread::id curThreadId = std::this_thread::get_id();
+    std::call_once(onceFlag_, [&] { invokeThreadId_ = curThreadId; });
+    CHECK_EQ(invokeThreadId_, curThreadId)
+        << "This method should invoke in "
+           "same thread, but first invoked in "
+        << invokeThreadId_ << " current invoked in " << curThreadId;
+  }
+
+ private:
+  std::once_flag onceFlag_;
+  std::thread::id invokeThreadId_;
+};
+
+/**
+ * Key-Value Cache Helper.
+ *
+ * It store a object instance global. User can invoke get method by key and a
+ * object creator callback. If there is a instance stored in cache, then it will
+ * return a shared_ptr of it, otherwise, it will invoke creator callback, create
+ * a new instance store global, and return it.
+ *
+ * The cache instance will release when nobody hold a reference to it.
+ *
+ * The KType is the key type.
+ * The VType is the value type.
+ * The Hash is the key hasher object.
+ */
+template <typename KType, typename VType, typename Hash>
+class WeakKVCache {
+ public:
+  WeakKVCache() {}
+
+  std::shared_ptr<VType> get(const KType& key,
+                             const std::function<VType*()>& creator) {
+    std::lock_guard<std::mutex> guard(this->lock_);
+    auto it = this->storage_.find(key);
+    if (it != this->storage_.end()) {
+      auto& val = it->second;
+      auto retVal = val.lock();
+      if (retVal != nullptr) {
+        return retVal;
+      }  // else fall trough. Because it is WeakPtr Cache.
+    }
+    auto rawPtr = creator();
+    CHECK(rawPtr != nullptr);
+    std::shared_ptr<VType> retVal(rawPtr);
+    this->storage_[key] = retVal;
+    return retVal;
+  }
+
+ private:
+  std::mutex lock_;
+  std::unordered_map<KType, std::weak_ptr<VType>, Hash> storage_;
+};
+
+/**
+ * @brief The ScopedCallbacks class is a callback invoker when object is
+ *        created and destroyed.
+ */
+template <typename CallbackType, typename... Args>
+class ScopedCallbacks {
+ public:
+  ScopedCallbacks(CallbackType enter, CallbackType exit, Args&... args)
+      : exit_(std::bind(exit, args...)) {
+    enter(args...);
+  }
+
+  ScopedCallbacks(const ScopedCallbacks& other) = delete;
+  ScopedCallbacks& operator=(const ScopedCallbacks& other) = delete;
+
+  ~ScopedCallbacks() { exit_(); }
+
+ private:
+  std::function<void()> exit_;
+};
+
+/**
+ * std compatible allocator with memory alignment.
+ * @tparam T type of allocator elements.
+ * @tparam Alignment the alignment in bytes.
+ */
+template <typename T, size_t Alignment>
+class AlignedAllocator {
+ public:
+  /// std campatible typedefs.
+  typedef T* pointer;
+  typedef const T* const_pointer;
+  typedef T& reference;
+  typedef const T& const_reference;
+  typedef T value_type;
+  typedef size_t size_type;
+  typedef ptrdiff_t difference_type;
+
+  T* address(T& r) const { return &r; }
+
+  const T* address(const T& r) const { return &r; }
+
+  size_t max_size() const {
+    return std::numeric_limits<size_t>::max() / sizeof(T);
+  }
+
+  template <typename U>
+  struct rebind {
+    typedef AlignedAllocator<U, Alignment> other;
+  };
+
+  bool operator==(const AlignedAllocator& other) const { return true; }
+
+  bool operator!=(const AlignedAllocator& other) const {
+    return !(*this == &other);
+  }
+
+  void construct(const T* p, const T& t) const {
+    void* pv = const_cast<T*>(p);
+    new (pv) T(t);
+  }
+
+  void deallocate(const T* p, const size_type n) const {
+    (void)(n);  // UNUSED n
+    free(const_cast<T*>(p));
+  }
+
+  void destroy(const T* p) const { p->~T(); }
+
+  AlignedAllocator() {}
+  ~AlignedAllocator() {}
+
+  AlignedAllocator(const AlignedAllocator&) {}
+  template <typename U>
+  AlignedAllocator(const AlignedAllocator<U, Alignment>&) {}
+
+  /**
+   * @brief allocate n elements of type T, the first address is aligned by
+   *        Alignment bytes.
+   * @param n element count.
+   * @return begin address of allocated buffer
+   * @throw std::length_error for n * sizeof(T) is overflowed.
+   * @throw std::bad_alloc
+   */
+  T* allocate(const size_type n) const {
+    if (n == 0) {
+      return nullptr;
+    }
+    if (n > max_size()) {
+      throw std::length_error("AlignAllocator<T>::allocate() - Int Overflow.");
+    }
+    void* r = nullptr;
+    CHECK_EQ(posix_memalign(&r, Alignment * 8, sizeof(T) * n), 0);
+    if (r == nullptr) {
+      throw std::bad_alloc();
+    } else {
+      return static_cast<T*>(r);
+    }
+  }
+
+  template <typename U>
+  T* allocate(const std::size_t n, const U* /* const hint */) const {
+    return this->allocate(n);
+  }
+
+ private:
+  AlignedAllocator& operator=(const AlignedAllocator&);  // disable
+};
+
+class Deprecated {
+ public:
+  explicit Deprecated(const std::string& msg = "") {
+    if (msg.empty()) {
+      LOG(WARNING) << "This class is deprecated, please do not use this class.";
+    } else {
+      LOG(WARNING) << msg;
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/utils/Version.cpp b/paddle/legacy/utils/Version.cpp
similarity index 100%
rename from paddle/utils/Version.cpp
rename to paddle/legacy/utils/Version.cpp
diff --git a/paddle/utils/Version.h b/paddle/legacy/utils/Version.h
similarity index 100%
rename from paddle/utils/Version.h
rename to paddle/legacy/utils/Version.h
diff --git a/paddle/legacy/utils/arch/linux/Locks.cpp b/paddle/legacy/utils/arch/linux/Locks.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..32d351e3328afd79007aea7a51e59cbfc941eeeb
--- /dev/null
+++ b/paddle/legacy/utils/arch/linux/Locks.cpp
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Locks.h"
+#include <semaphore.h>
+#include <unistd.h>
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+class SemaphorePrivate {
+ public:
+  sem_t sem;
+};
+
+Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
+  sem_init(&m->sem, 0, initValue);
+}
+
+Semaphore::~Semaphore() {
+  sem_destroy(&m->sem);
+  delete m;
+}
+
+bool Semaphore::timeWait(struct timespec* ts) {
+  return (0 == sem_timedwait(&m->sem, ts));
+}
+
+void Semaphore::wait() { sem_wait(&m->sem); }
+
+void Semaphore::post() { sem_post(&m->sem); }
+
+/// SpinLockPrivate
+
+#ifdef PADDLE_USE_PTHREAD_SPINLOCK
+
+class SpinLockPrivate {
+ public:
+  inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
+  inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
+
+  inline void lock() { pthread_spin_lock(&lock_); }
+  inline void unlock() { pthread_spin_unlock(&lock_); }
+
+  pthread_spinlock_t lock_;
+  char padding_[64 - sizeof(pthread_spinlock_t)];
+};
+
+#else
+// clang-format off
+#include <cstddef>
+#include <atomic>
+// clang-format on
+
+class SpinLockPrivate {
+ public:
+  inline void lock() {
+    while (lock_.test_and_set(std::memory_order_acquire)) {
+    }
+  }
+  inline void unlock() { lock_.clear(std::memory_order_release); }
+
+  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
+  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
+};
+
+#endif
+
+SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
+SpinLock::~SpinLock() { delete m; }
+void SpinLock::lock() { m->lock(); }
+void SpinLock::unlock() { m->unlock(); }
+
+/// ThreadBarrierPrivate
+
+#ifdef PADDLE_USE_PTHREAD_BARRIER
+
+class ThreadBarrierPrivate {
+ public:
+  pthread_barrier_t barrier_;
+
+  inline explicit ThreadBarrierPrivate(int count) {
+    pthread_barrier_init(&barrier_, nullptr, count);
+  }
+
+  inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); }
+
+  inline void wait() { pthread_barrier_wait(&barrier_); }
+};
+
+#else
+
+class ThreadBarrierPrivate {
+ public:
+  pthread_mutex_t mutex_;
+  pthread_cond_t cond_;
+  int count_;
+  int tripCount_;
+
+  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
+    CHECK_NE(cnt, 0);
+    CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
+    CHECK_GE(pthread_cond_init(&cond_, 0), 0);
+  }
+
+  inline ~ThreadBarrierPrivate() {
+    pthread_cond_destroy(&cond_);
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  /**
+   * @brief wait
+   * @return true if the last wait
+   */
+  inline bool wait() {
+    pthread_mutex_lock(&mutex_);
+    ++count_;
+    if (count_ >= tripCount_) {
+      count_ = 0;
+      pthread_cond_broadcast(&cond_);
+      pthread_mutex_unlock(&mutex_);
+      return true;
+    } else {
+      pthread_cond_wait(&cond_, &mutex_);
+      pthread_mutex_unlock(&mutex_);
+      return false;
+    }
+  }
+};
+
+#endif
+
+/// ThreadBarrier
+
+ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
+ThreadBarrier::~ThreadBarrier() { delete m; }
+void ThreadBarrier::wait() { m->wait(); }
+
+}  // namespace paddle
diff --git a/paddle/legacy/utils/arch/osx/Excepts.cpp b/paddle/legacy/utils/arch/osx/Excepts.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2b7d6dca8454417fd78f6da7f906785d24a6219b
--- /dev/null
+++ b/paddle/legacy/utils/arch/osx/Excepts.cpp
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Excepts.h"
+
+#if defined(__APPLE__) || defined(__OSX__)
+#if defined(__arm__) || defined(__arm64__)
+// TODO(liuyiqun): implement the arm version
+int fegetexcept(void) { return -1; }
+int feenableexcept(unsigned int excepts) { return -1; }
+int fedisableexcept(unsigned int excepts) { return -1; }
+#else
+int fegetexcept(void) {
+  static fenv_t fenv;
+  return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
+}
+
+int feenableexcept(unsigned int excepts) {
+  static fenv_t fenv;
+  unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
+
+  if (fegetenv(&fenv)) return -1;
+  old_excepts = fenv.__control & FE_ALL_EXCEPT;
+
+  // unmask
+  fenv.__control &= ~new_excepts;
+  fenv.__mxcsr &= ~(new_excepts << 7);
+
+  return (fesetenv(&fenv) ? -1 : old_excepts);
+}
+
+int fedisableexcept(unsigned int excepts) {
+  static fenv_t fenv;
+  unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
+
+  if (fegetenv(&fenv)) return -1;
+  old_excepts = fenv.__control & FE_ALL_EXCEPT;
+
+  // mask
+  fenv.__control |= new_excepts;
+  fenv.__mxcsr |= new_excepts << 7;
+
+  return (fesetenv(&fenv) ? -1 : old_excepts);
+}
+#endif
+#endif
diff --git a/paddle/legacy/utils/arch/osx/Locks.cpp b/paddle/legacy/utils/arch/osx/Locks.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b68c48f0c31aa928a634e0369295ec084b9ccd8e
--- /dev/null
+++ b/paddle/legacy/utils/arch/osx/Locks.cpp
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Locks.h"
+#include <dispatch/dispatch.h>
+#include <libkern/OSAtomic.h>
+#include <atomic>
+#include "paddle/legacy/utils/Logging.h"
+
+namespace paddle {
+
+class SemaphorePrivate {
+ public:
+  ~SemaphorePrivate() { dispatch_release(sem); }
+
+  dispatch_semaphore_t sem;
+};
+
+Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
+  m->sem = dispatch_semaphore_create(initValue);
+}
+
+Semaphore::~Semaphore() { delete m; }
+
+bool Semaphore::timeWait(timespec *ts) {
+  dispatch_time_t tm = dispatch_walltime(ts, 0);
+  return (0 == dispatch_semaphore_wait(m->sem, tm));
+}
+
+void Semaphore::wait() {
+  dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER);
+}
+
+void Semaphore::post() { dispatch_semaphore_signal(m->sem); }
+
+class SpinLockPrivate {
+ public:
+  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
+  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
+};
+
+SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
+SpinLock::~SpinLock() { delete m; }
+
+void SpinLock::lock() {
+  while (m->lock_.test_and_set(std::memory_order_acquire)) {
+  }
+}
+
+void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); }
+
+class ThreadBarrierPrivate {
+ public:
+  pthread_mutex_t mutex_;
+  pthread_cond_t cond_;
+  int count_;
+  int tripCount_;
+
+  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
+    CHECK_NE(cnt, 0);
+    CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
+    CHECK_GE(pthread_cond_init(&cond_, 0), 0);
+  }
+
+  inline ~ThreadBarrierPrivate() {
+    pthread_cond_destroy(&cond_);
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  /**
+   * @brief wait
+   * @return true if the last wait
+   */
+  inline bool wait() {
+    pthread_mutex_lock(&mutex_);
+    ++count_;
+    if (count_ >= tripCount_) {
+      count_ = 0;
+      pthread_cond_broadcast(&cond_);
+      pthread_mutex_unlock(&mutex_);
+      return true;
+    } else {
+      pthread_cond_wait(&cond_, &mutex_);
+      pthread_mutex_unlock(&mutex_);
+      return false;
+    }
+  }
+};
+
+ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
+ThreadBarrier::~ThreadBarrier() { delete m; }
+void ThreadBarrier::wait() { m->wait(); }
+
+}  // namespace paddle
diff --git a/paddle/utils/enable_virtualenv.py b/paddle/legacy/utils/enable_virtualenv.py
similarity index 100%
rename from paddle/utils/enable_virtualenv.py
rename to paddle/legacy/utils/enable_virtualenv.py
diff --git a/paddle/legacy/utils/tests/CMakeLists.txt b/paddle/legacy/utils/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4af01db5c84cb497b756027cbb6ad06c081a8899
--- /dev/null
+++ b/paddle/legacy/utils/tests/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_simple_unittest(test_Thread)
+add_simple_unittest(test_StringUtils)
+add_simple_unittest(test_CustomStackTrace)
+add_simple_unittest(test_ThreadBarrier)
+add_simple_unittest(test_SpinLock)
+add_simple_unittest(test_SIMDFlags)
+add_simple_unittest(test_Error)
+
+add_executable(
+    test_CustomStackTracePrint
+    test_CustomStackTracePrint.cpp
+)
+link_paddle_exe(test_CustomStackTracePrint)
+if(NOT APPLE)
+    add_test(NAME test_CustomStackTracePrint
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+endif()
diff --git a/paddle/legacy/utils/tests/test_CustomStackTrace.cpp b/paddle/legacy/utils/tests/test_CustomStackTrace.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2a418e3ae2277fc5dc6856d131dafa9daf0bad47
--- /dev/null
+++ b/paddle/legacy/utils/tests/test_CustomStackTrace.cpp
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gflags/gflags.h>  // NOLINT
+#include <gtest/gtest.h>    // NOLINT
+
+#include "paddle/legacy/utils/CustomStackTrace.h"
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/StringUtil.h"
+#include "paddle/legacy/utils/Util.h"
+
+DEFINE_int32(test_thread_num, 10, "testing thread number");
+
+void testNormalImpl(
+    const std::function<void(paddle::CustomStackTrace<std::string>&,
+                             size_t,
+                             size_t,
+                             paddle::ThreadBarrier&,
+                             paddle::ThreadBarrier&)>& callback) {
+  paddle::CustomStackTrace<std::string> tracer;
+  paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1);
+  paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1);
+  constexpr size_t countDown = 10;
+  constexpr size_t layerSize = 1000;
+  std::vector<std::unique_ptr<std::thread>> threads;
+  threads.reserve(FLAGS_test_thread_num);
+
+  for (int32_t i = 0; i < FLAGS_test_thread_num; ++i) {
+    threads.emplace_back(
+        new std::thread([&tracer, &startBarrier, &doneBarrier, &callback] {
+          callback(tracer, countDown, layerSize, startBarrier, doneBarrier);
+        }));
+  }
+  size_t cntDown = countDown;
+  while (cntDown-- > 0) {
+    startBarrier.wait();
+    sleep(1);
+    doneBarrier.wait();
+    ASSERT_TRUE(tracer.empty());
+  }
+
+  for (auto& thread : threads) {
+    thread->join();
+  }
+}
+
+TEST(CustomStackTrace, normalTrain) {
+  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
+                    size_t countDown,
+                    size_t layerSize,
+                    paddle::ThreadBarrier& start,
+                    paddle::ThreadBarrier& finish) {
+    while (countDown-- > 0) {
+      start.wait();
+      for (size_t i = 0; i < layerSize; ++i) {
+        tracer.push("layer_" + paddle::str::to_string(i));
+      }
+      for (size_t i = 0; i < layerSize; ++i) {
+        tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
+      }
+      finish.wait();
+    }
+  });
+}
+
+TEST(CustomStackTrace, normalTest) {
+  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
+                    size_t countDown,
+                    size_t layerSize,
+                    paddle::ThreadBarrier& start,
+                    paddle::ThreadBarrier& finish) {
+    while (countDown-- > 0) {
+      start.wait();
+      for (size_t i = 0; i < layerSize; ++i) {
+        tracer.push("layer_" + paddle::str::to_string(i));
+      }
+      tracer.clear();  // in forward test, tracer will clear after forward.
+      finish.wait();
+    }
+  });
+}
diff --git a/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp b/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..78886a3ed9f237a39079bbf604a376f98bd86b59
--- /dev/null
+++ b/paddle/legacy/utils/tests/test_CustomStackTracePrint.cpp
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/CustomStackTrace.h"
+#include "paddle/legacy/utils/StringUtil.h"
+#include "paddle/legacy/utils/Util.h"
+
+int main(int argc, char** argv) {
+  paddle::initMain(argc, argv);
+
+  for (size_t i = 0; i < 1000; ++i) {
+    paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i));
+    if (i == 998) {
+      throw "Unhandle exception";
+    }
+  }
+
+  return 0;
+}
diff --git a/paddle/utils/tests/test_CustomStackTracePrint.sh b/paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
similarity index 100%
rename from paddle/utils/tests/test_CustomStackTracePrint.sh
rename to paddle/legacy/utils/tests/test_CustomStackTracePrint.sh
diff --git a/paddle/legacy/utils/tests/test_Error.cpp b/paddle/legacy/utils/tests/test_Error.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..250c4d58a64a0d284a15418e40264f1857d30050
--- /dev/null
+++ b/paddle/legacy/utils/tests/test_Error.cpp
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/Error.h"
+
+#include <gtest/gtest.h>
+
+TEST(Error, testAll) {
+  paddle::Error error;
+  ASSERT_TRUE(error.isOK());
+  error = paddle::Error("I'm the error");
+  ASSERT_FALSE(error.isOK());
+  ASSERT_STREQ("I'm the error", error.msg());
+
+  error = paddle::Error("error2");
+  ASSERT_FALSE(error.isOK());
+  ASSERT_STREQ("error2", error.msg());
+
+  int i = 3;
+  auto error3 = paddle::Error("error%d", i);
+  ASSERT_FALSE(error3.isOK());
+  ASSERT_STREQ("error3", error3.msg());
+}
diff --git a/paddle/legacy/utils/tests/test_SIMDFlags.cpp b/paddle/legacy/utils/tests/test_SIMDFlags.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6362210acdaf26a26a2548ddaf8ed455b9c76618
--- /dev/null
+++ b/paddle/legacy/utils/tests/test_SIMDFlags.cpp
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+
+#include "paddle/legacy/utils/CpuId.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Util.h"
+
+using namespace paddle;  // NOLINT
+
+TEST(SIMDFlags, gccTest) {
+#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
+    !defined(__arm__) && !defined(__aarch64__)
+  // clang-format off
+  CHECK(!__builtin_cpu_supports("sse")    != HAS_SSE);
+  CHECK(!__builtin_cpu_supports("sse2")   != HAS_SSE2);
+  CHECK(!__builtin_cpu_supports("sse3")   != HAS_SSE3);
+  CHECK(!__builtin_cpu_supports("ssse3")  != HAS_SSSE3);
+  CHECK(!__builtin_cpu_supports("sse4.1") != HAS_SSE41);
+  CHECK(!__builtin_cpu_supports("sse4.2") != HAS_SSE42);
+  CHECK(!__builtin_cpu_supports("avx")    != HAS_AVX);
+  CHECK(!__builtin_cpu_supports("avx2")   != HAS_AVX2);
+// clang-format on
+#endif
+}
+
+TEST(SIMDFlags, normalPrint) {
+  LOG(INFO) << "Has SSE:     " << std::boolalpha << HAS_SSE;
+  LOG(INFO) << "Has SSE2:    " << std::boolalpha << HAS_SSE2;
+  LOG(INFO) << "Has SSE3:    " << std::boolalpha << HAS_SSE3;
+  LOG(INFO) << "Has SSSE3:   " << std::boolalpha << HAS_SSSE3;
+  LOG(INFO) << "Has SSE4:    " << std::boolalpha << HAS_SSE41 || HAS_SSE42;
+  LOG(INFO) << "Has FMA3:    " << std::boolalpha << HAS_FMA3;
+  LOG(INFO) << "Has FMA4:    " << std::boolalpha << HAS_FMA4;
+  LOG(INFO) << "Has AVX:     " << std::boolalpha << HAS_AVX;
+  LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
+  LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
+  LOG(INFO) << "Has NEON:    " << std::boolalpha << HAS_NEON;
+}
diff --git a/paddle/legacy/utils/tests/test_SpinLock.cpp b/paddle/legacy/utils/tests/test_SpinLock.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4cd7836d6af251b48925de95c2811361313d7b41
--- /dev/null
+++ b/paddle/legacy/utils/tests/test_SpinLock.cpp
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Util.h"
+
+DEFINE_int32(test_thread_num, 100, "testing thread number");
+
+void testNormalImpl(
+    size_t thread_num,
+    const std::function<void(size_t, size_t&, paddle::SpinLock&)>& callback) {
+  paddle::SpinLock mutex;
+  std::vector<std::thread> threads;
+  threads.reserve(thread_num);
+
+  size_t count = 0;
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads.emplace_back([&thread_num, &count, &mutex, &callback] {
+      callback(thread_num, count, mutex);
+    });
+  }
+  for (auto& thread : threads) {
+    thread.join();
+  }
+  // Check whether all threads reach this point or not
+  CHECK_EQ(count, thread_num);
+}
+
+TEST(ThreadSpinLock, normalTest) {
+  for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) {
+    testNormalImpl(
+        thread_num,
+        [](size_t thread_num, size_t& count, paddle::SpinLock& mutex) {
+          std::lock_guard<paddle::SpinLock> lock(mutex);
+          ++count;
+        });
+  }
+}
diff --git a/paddle/legacy/utils/tests/test_StringUtils.cpp b/paddle/legacy/utils/tests/test_StringUtils.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..61d2815f097af7125bfefdc4909509564300d6aa
--- /dev/null
+++ b/paddle/legacy/utils/tests/test_StringUtils.cpp
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/legacy/utils/StringUtil.h"
+
+#include <gtest/gtest.h>
+
+TEST(StringUtil, to) {
+  ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
+}
diff --git a/paddle/legacy/utils/tests/test_Thread.cpp b/paddle/legacy/utils/tests/test_Thread.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5e07da3236862c5f585671d9bb8e3fbbd1c5b5fc
--- /dev/null
+++ b/paddle/legacy/utils/tests/test_Thread.cpp
@@ -0,0 +1,81 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/legacy/utils/Thread.h>
+#include <atomic>
+
+using paddle::AsyncThreadPool;  // NOLINT
+
+TEST(AsyncThreadPool, addJob) {
+  AsyncThreadPool pool(8);
+  auto a = pool.addJob([] { return 1; });
+  auto b = pool.addJob([] { return true; });
+  auto c = pool.addJob([] { return false; });
+
+  ASSERT_EQ(a.get(), 1);
+  ASSERT_TRUE(b.get());
+  ASSERT_FALSE(c.get());
+}
+
+TEST(AsyncThreadPool, addBatchJob) {
+  AsyncThreadPool pool(8);
+  std::atomic<int> counter{0};
+
+  std::vector<AsyncThreadPool::JobFunc> jobs;
+
+  for (int i = 0; i < 10000; i++) {
+    jobs.emplace_back([&] { counter++; });
+  }
+
+  pool.addBatchJobs(jobs);
+
+  ASSERT_EQ(counter, 10000);
+}
+
+TEST(AsyncThreadPool, multiThreadAddBatchJob) {
+  AsyncThreadPool levelOnePool(200);
+  AsyncThreadPool levelTwoPool(200);
+
+  std::shared_ptr<std::mutex> mut = std::make_shared<std::mutex>();
+  int counter = 0;
+  const int numMonitors = 300;
+  const int numSlaves = 300;
+  std::vector<AsyncThreadPool::JobFunc> moniterJobs(numMonitors, [&] {
+    std::vector<AsyncThreadPool::JobFunc> slaveJobs(numSlaves, [mut, &counter] {
+      std::lock_guard<std::mutex> lk(*mut);
+      counter++;
+    });
+    levelTwoPool.addBatchJobs(slaveJobs);
+  });
+  levelOnePool.addBatchJobs(moniterJobs);
+  ASSERT_EQ(counter, numMonitors * numSlaves);
+}
+
+TEST(AsyncThreadPool, addBatchJobWithResults) {
+  AsyncThreadPool pool(100);
+
+  std::vector<std::function<int()>> jobs;
+  const int numJobs = 100;
+  for (int i = 0; i < numJobs; i++) {
+    jobs.emplace_back([i] { return i; });
+  }
+
+  std::vector<int> res;
+  pool.addBatchJobs(jobs, res);
+
+  for (int i = 0; i < numJobs; i++) {
+    ASSERT_EQ(res[i], i);
+  }
+}
diff --git a/paddle/legacy/utils/tests/test_ThreadBarrier.cpp b/paddle/legacy/utils/tests/test_ThreadBarrier.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9c8851ae2112320c89aa3e7ed6e850d00cc14006
--- /dev/null
+++ b/paddle/legacy/utils/tests/test_ThreadBarrier.cpp
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <set>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+#include "paddle/legacy/utils/Locks.h"
+#include "paddle/legacy/utils/Logging.h"
+#include "paddle/legacy/utils/Util.h"
+
+DEFINE_int32(test_thread_num, 100, "testing thread number");
+
+void testNormalImpl(
+    size_t thread_num,
+    const std::function<void(size_t,
+                             std::mutex&,
+                             std::set<std::thread::id>&,
+                             paddle::ThreadBarrier&)>& callback) {
+  std::mutex mutex;
+  std::set<std::thread::id> tids;
+  paddle::ThreadBarrier barrier(thread_num);
+
+  std::vector<std::thread> threads;
+  threads.reserve(thread_num);
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads.emplace_back([&thread_num, &mutex, &tids, &barrier, &callback] {
+      callback(thread_num, mutex, tids, barrier);
+    });
+  }
+
+  for (auto& thread : threads) {
+    thread.join();
+  }
+}
+
+TEST(ThreadBarrier, normalTest) {
+  for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) {
+    testNormalImpl(thread_num,
+                   [](size_t thread_num,
+                      std::mutex& mutex,
+                      std::set<std::thread::id>& tids,
+                      paddle::ThreadBarrier& barrier) {
+                     {
+                       std::lock_guard<std::mutex> guard(mutex);
+                       tids.insert(std::this_thread::get_id());
+                     }
+                     barrier.wait();
+                     // Check whether all threads reach this point or not
+                     CHECK_EQ(tids.size(), thread_num);
+                   });
+  }
+}
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
deleted file mode 100644
index ae60f6fe5fa142bdffeafc31b5816b8fcc94ad5c..0000000000000000000000000000000000000000
--- a/paddle/math/Allocator.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdlib.h>
-#include <mutex>
-#include "hl_gpu.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * @brief Allocator base class.
- *
- * This is the base class of all Allocator class.
- */
-class Allocator {
-public:
-  virtual ~Allocator() {}
-  virtual void* alloc(size_t size) = 0;
-  virtual void free(void* ptr) = 0;
-  virtual std::string getName() = 0;
-};
-
-/**
- * @brief CPU allocator implementation.
- */
-class CpuAllocator : public Allocator {
-public:
-  ~CpuAllocator() {}
-
-  /**
-   * @brief Aligned allocation on CPU.
-   * @param size Size to be allocated.
-   * @return Pointer to the allocated memory
-   */
-  virtual void* alloc(size_t size) {
-    void* ptr;
-#ifdef PADDLE_WITH_MKLDNN
-    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
-    // memory alignment
-    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
-#else
-    CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
-#endif
-    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
-    return ptr;
-  }
-
-  /**
-   * @brief Free the memory space.
-   * @param ptr  Pointer to be free.
-   */
-  virtual void free(void* ptr) {
-    if (ptr) {
-      ::free(ptr);
-    }
-  }
-
-  virtual std::string getName() { return "cpu_alloc"; }
-};
-
-/**
- * @brief GPU allocator implementation.
- */
-class GpuAllocator : public Allocator {
-public:
-  ~GpuAllocator() {}
-
-  /**
-   * @brief Allocate GPU memory.
-   * @param size Size to be allocated.
-   * @return Pointer to the allocated memory
-   */
-  virtual void* alloc(size_t size) {
-    void* ptr = hl_malloc_device(size);
-    CHECK(ptr) << "Fail to allocate GPU memory " << size << " bytes";
-    return ptr;
-  }
-
-  /**
-   * @brief Free the GPU memory.
-   * @param ptr  Pointer to be free.
-   */
-  virtual void free(void* ptr) {
-    if (ptr) {
-      hl_free_mem_device(ptr);
-    }
-  }
-
-  virtual std::string getName() { return "gpu_alloc"; }
-};
-
-/**
- * @brief CPU pinned memory allocator implementation.
- */
-class CudaHostAllocator : public Allocator {
-public:
-  ~CudaHostAllocator() {}
-
-  /**
-   * @brief Allocate pinned memory.
-   * @param size Size to be allocated.
-   * @return Pointer to the allocated memory
-   */
-  virtual void* alloc(size_t size) {
-    void* ptr = hl_malloc_host(size);
-    CHECK(ptr) << "Fail to allocate pinned memory " << size << " bytes";
-    return ptr;
-  }
-
-  /**
-   * @brief Free the pinned memory.
-   * @param ptr  Pointer to be free.
-   */
-  virtual void free(void* ptr) {
-    if (ptr) {
-      hl_free_mem_host(ptr);
-    }
-  }
-
-  virtual std::string getName() { return "cuda_host_alloc"; }
-};
-
-}  // namespace paddle
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
deleted file mode 100644
index 7b57419e5a510ba50aff0b47681d1294607e31f9..0000000000000000000000000000000000000000
--- a/paddle/math/BaseMatrix.cu
+++ /dev/null
@@ -1,1953 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/Logging.h>
-#include <string.h>
-#include <cmath>
-#include "BaseMatrix.h"
-#include "MathFunctions.h"
-#include "NEONFunctions.h"
-#include "SIMDFunctions.h"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_base.cuh"
-#include "hl_matrix_ops.cuh"
-
-namespace paddle {
-
-const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported.";
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyUnary(Op op) {
-  MatrixOffset offset(0, 0);
-  applyUnary(op, height_, width_, offset);
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyUnary(Op op,
-                               int numRows,
-                               int numCols,
-                               MatrixOffset& offset) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-
-  T* A = data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  if (true == useGpu_) {
-    hl_gpu_apply_unary_op(op, A, dimM, dimN, lda);
-  } else {
-    hl_cpu_apply_unary_op(op, A, dimM, dimN, lda);
-  }
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
-  CHECK(height_ == b.height_ && width_ == b.width_)
-      << "Matrix dimensions are not equal";
-
-  MatrixOffset offset(0, 0, 0, 0);
-  applyBinary(op, b, height_, width_, offset);
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyBinary(
-    Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) {
-  applyBinary(op, b, numRows, numCols, offset, false_type(), false_type());
-  return 0;
-}
-
-template <class T>
-template <class Op, class bAsRowVector, class bAsColVector>
-int BaseMatrixT<T>::applyBinary(Op op,
-                                BaseMatrixT& b,
-                                int numRows,
-                                int numCols,
-                                MatrixOffset& offset,
-                                bAsRowVector,
-                                bAsColVector) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch";
-
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-  int ldb = b.stride_;
-
-  T* A = data_;
-  T* B = b.data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  if (!bAsRowVector::value && !bAsColVector::value) {
-    CHECK_LE(dimM + offset.bRow_, b.height_);
-    CHECK_LE(dimN + offset.bCol_, b.width_);
-  } else if (bAsRowVector::value && !bAsColVector::value) {
-    CHECK_LE(dimN + offset.bCol_, b.width_);
-  } else if (!bAsRowVector::value && bAsColVector::value) {
-    CHECK_LE(dimM + offset.bRow_, b.height_);
-  } else {
-  }
-  if (true == useGpu_) {
-    hl_gpu_apply_binary_op<T, Op, bAsRowVector::value, bAsColVector::value>(
-        op, A, B, dimM, dimN, lda, ldb);
-  } else {
-    hl_cpu_apply_binary_op<T, Op, bAsRowVector::value, bAsColVector::value>(
-        op, A, B, dimM, dimN, lda, ldb);
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK_EQ(height_, b.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(height_, c.height_);
-  CHECK_EQ(width_, c.width_);
-
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  applyTernary(op, b, c, height_, width_, offset);
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyTernary(Op op,
-                                 BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 int numRows,
-                                 int numCols,
-                                 MatrixOffset& offset) {
-  applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type());
-
-  return 0;
-}
-
-template <class T>
-template <class Op, class cAsRowVector, class cAsColVector>
-int BaseMatrixT<T>::applyTernary(Op op,
-                                 BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 int numRows,
-                                 int numCols,
-                                 MatrixOffset& offset,
-                                 cAsRowVector,
-                                 cAsColVector) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK_EQ(useGpu_, b.useGpu_);
-  CHECK_EQ(useGpu_, c.useGpu_);
-
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-  int ldb = b.stride_;
-  int ldc = c.stride_;
-
-  T* A = data_;
-  T* B = b.data_;
-  T* C = c.data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(
-      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
-
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  CHECK_LE(dimM + offset.bRow_, b.height_);
-  CHECK_LE(dimN + offset.bCol_, b.width_);
-  if (!cAsRowVector::value && !cAsColVector::value) {
-    CHECK_LE(dimM + offset.cRow_, c.height_);
-    CHECK_LE(dimN + offset.cCol_, c.width_);
-  } else if (cAsRowVector::value && !cAsColVector::value) {
-    CHECK_LE(dimN + offset.cCol_, c.width_);
-  } else if (!cAsRowVector::value && cAsColVector::value) {
-    CHECK_LE(dimM + offset.cRow_, c.height_);
-  } else {
-  }
-
-  if (true == useGpu_) {
-    hl_gpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
-        op, A, B, C, dimM, dimN, lda, ldb, ldc);
-  } else {
-    hl_cpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
-        op, A, B, C, dimM, dimN, lda, ldb, ldc);
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyQuaternary(Op op,
-                                    BaseMatrixT& b,
-                                    BaseMatrixT& c,
-                                    BaseMatrixT& d) {
-  CHECK_EQ(height_, b.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(height_, c.height_);
-  CHECK_EQ(width_, c.width_);
-  CHECK_EQ(height_, d.height_);
-  CHECK_EQ(width_, d.width_);
-
-  MatrixOffset offset(0, 0, 0, 0, 0, 0, 0, 0);
-  applyQuaternary(op, b, c, d, height_, width_, offset);
-
-  return 0;
-}
-
-template <class T>
-template <class Op>
-int BaseMatrixT<T>::applyQuaternary(Op op,
-                                    BaseMatrixT& b,
-                                    BaseMatrixT& c,
-                                    BaseMatrixT& d,
-                                    int numRows,
-                                    int numCols,
-                                    MatrixOffset& offset) {
-  CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK(!d.isSparse()) << SPARSE_SUPPORT_ERROR;
-  CHECK_EQ(useGpu_, b.useGpu_);
-  CHECK_EQ(useGpu_, c.useGpu_);
-  CHECK_EQ(useGpu_, d.useGpu_);
-
-  int dimM = numRows;
-  int dimN = numCols;
-  int lda = stride_;
-  int ldb = b.stride_;
-  int ldc = c.stride_;
-  int ldd = d.stride_;
-
-  T* A = data_;
-  T* B = b.data_;
-  T* C = c.data_;
-  T* D = d.data_;
-  CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(
-      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
-  CAL_MATRIX_START_ADDRESS(
-      D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_);
-
-  CHECK_LE(dimM + offset.aRow_, this->height_);
-  CHECK_LE(dimN + offset.aCol_, this->width_);
-  CHECK_LE(dimM + offset.bRow_, b.height_);
-  CHECK_LE(dimN + offset.bCol_, b.width_);
-  CHECK_LE(dimM + offset.cRow_, c.height_);
-  CHECK_LE(dimN + offset.cCol_, c.width_);
-  CHECK_LE(dimM + offset.dRow_, d.height_);
-  CHECK_LE(dimN + offset.dCol_, d.width_);
-  if (true == useGpu_) {
-    hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
-  } else {
-    hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Agg,
-          class Op,
-          class Saver,
-          class aAsRowVector,
-          class aAsColVector>
-int BaseMatrixT<T>::aggregate(Agg agg,
-                              Op op,
-                              Saver sv,
-                              BaseMatrixT& b,
-                              int numRows,
-                              int numCols,
-                              MatrixOffset& offset,
-                              aAsRowVector,
-                              aAsColVector) {
-  CHECK_EQ(useGpu_, b.useGpu_);
-
-  int ld = stride_;
-  int ldb = b.stride_;
-
-  T* dst = data_;
-  T* B = b.data_;
-  CAL_MATRIX_START_ADDRESS(
-      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-
-  if (aAsRowVector::value && !aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
-    } else {
-      hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B, ldb);
-    }
-  } else if (!aAsRowVector::value && aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
-    } else {
-      hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B, ldb);
-    }
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-
-  return 0;
-}
-
-template <class T>
-template <class Agg,
-          class Op,
-          class Saver,
-          class aAsRowVector,
-          class aAsColVector>
-int BaseMatrixT<T>::aggregate(Agg agg,
-                              Op op,
-                              Saver sv,
-                              BaseMatrixT& b,
-                              BaseMatrixT& c,
-                              int numRows,
-                              int numCols,
-                              MatrixOffset& offset,
-                              aAsRowVector,
-                              aAsColVector) {
-  CHECK_EQ(useGpu_, b.useGpu_);
-  CHECK_EQ(useGpu_, c.useGpu_);
-
-  int ld = stride_;
-  int ldb = b.stride_;
-  int ldc = c.stride_;
-
-  T* dst = data_;
-  T* B = b.data_;
-  T* C = c.data_;
-  CAL_MATRIX_START_ADDRESS(
-      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(
-      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(
-      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
-
-  if (aAsRowVector::value && !aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_column_op(
-          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
-    } else {
-      hl_cpu_matrix_column_op(
-          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
-    }
-  } else if (!aAsRowVector::value && aAsColVector::value) {
-    if (useGpu_) {
-      hl_gpu_matrix_row_op(
-          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
-    } else {
-      hl_cpu_matrix_row_op(
-          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
-    }
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-
-  return 0;
-}
-
-/**
- * @brief   unary operator.
- *
- */
-
-DEFINE_MATRIX_UNARY_OP(Neg, a = -a);
-template <class T>
-void BaseMatrixT<T>::neg() {
-  applyUnary(unary::Neg<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
-template <>
-void BaseMatrixT<real>::exp2() {
-  applyUnary(unary::Exp<real>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
-template <>
-void BaseMatrixT<real>::log2() {
-  if (useGpu_) {
-    applyUnary(unary::Log<real>());
-  } else {
-    vLog(height_ * width_, data_, data_);
-  }
-}
-
-DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
-template <>
-void BaseMatrixT<real>::sqrt2() {
-  applyUnary(unary::Sqrt<real>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
-template <class T>
-void BaseMatrixT<T>::square2() {
-  applyUnary(unary::Square<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
-template <class T>
-void BaseMatrixT<T>::reciprocal2() {
-  applyUnary(unary::Reciprocal<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
-template <class T>
-void BaseMatrixT<T>::abs2() {
-  applyUnary(unary::Abs<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
-template <class T>
-void BaseMatrixT<T>::sign2() {
-  applyUnary(unary::Sign<T>());
-}
-
-DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-template <class T>
-void BaseMatrixT<T>::zero() {
-  applyUnary(unary::Zero<T>());
-}
-
-template <class T>
-void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
-  int numRows = height_;
-  int numCols = numColumns;
-  MatrixOffset offset(columnOffset, 0);
-  applyUnary(unary::Zero<T>(), numRows, numCols, offset);
-}
-
-DEFINE_MATRIX_UNARY_OP(One, a = 1);
-template <class T>
-void BaseMatrixT<T>::one() {
-  applyUnary(unary::One<T>());
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
-template <>
-void BaseMatrixT<real>::pow2(real p) {
-  if (useGpu_) {
-    applyUnary(unary::Pow<real>(p));
-  } else {
-    vPow(height_ * width_, data_, p, data_);
-  }
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p);
-template <class T>
-void BaseMatrixT<T>::subScalar(T p) {
-  applyUnary(unary::SubScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p);
-template <class T>
-void BaseMatrixT<T>::mulScalar(T p) {
-  applyUnary(unary::MulScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p);
-template <class T>
-void BaseMatrixT<T>::divScalar(T p) {
-  applyUnary(unary::DivScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p);
-template <class T>
-void BaseMatrixT<T>::assign(T p) {
-  applyUnary(unary::Assign<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p);
-template <class T>
-void BaseMatrixT<T>::add(T p) {
-  applyUnary(unary::Add<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2);
-template <class T>
-void BaseMatrixT<T>::add(T p1, T p2) {
-  applyUnary(unary::Add2<T>(p1, p2));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip,
-                                 TWO_PARAMETER,
-                                 a = a < p1 ? p1 : (a > p2 ? p2 : a));
-template <class T>
-void BaseMatrixT<T>::clip(T p1, T p2) {
-  applyUnary(unary::Clip<T>(p1, p2));
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative,
-                                  TWO_PARAMETER,
-                                  a = b < p1 ? 0 : (b > p2 ? 0 : 1));
-template <class T>
-void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::ClipDerivative<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar,
-                                 ONE_PARAMETER,
-                                 a = a > p ? 1.0f : 0.0f);
-template <class T>
-void BaseMatrixT<T>::biggerThanScalar(T p) {
-  applyUnary(unary::BiggerThanScalar<T>(p));
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p);
-template <class T>
-void BaseMatrixT<T>::downClip(T p) {
-  applyUnary(unary::DownClip<T>(p));
-}
-
-/**
- * @brief   binary operator.
- *
- */
-
-DEFINE_MATRIX_BINARY_OP(Add, a += b);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b) {
-  applyBinary(binary::Add<T>(), b);
-}
-
-template <>
-void BaseMatrixT<real>::add(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Add<real>(), b);
-  } else {  // cpu branch
-    CHECK_EQ(height_, b.height_);
-    CHECK_EQ(width_, b.width_);
-    vAdd(height_ * width_, data_, b.data_, data_);
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
-  if (columnOffset + b.width_ <= width_) {
-    int numRows = height_;
-    int numCols = b.width_;
-    MatrixOffset offset(columnOffset, 0, 0, 0);
-    applyBinary(binary::Add<T>(), b, numRows, numCols, offset);
-  } else if (columnOffset + width_ <= b.width_) {
-    int numRows = height_;
-    int numCols = width_;
-    MatrixOffset offset(0, 0, columnOffset, 0);
-    applyBinary(binary::Add<T>(), b, numRows, numCols, offset);
-  } else {
-    LOG(FATAL) << "Wrong argument "
-               << " a.width=" << width_ << " b.width=" << b.width_
-               << " columnOffset=" << columnOffset;
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::addP2P(BaseMatrixT& b) {
-  T* A = data_;
-  T* B = b.data_;
-  int dimM = height_;
-  int dimN = width_;
-
-  hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0>(
-      binary::Add<T>(), A, B, dimM, dimN, dimN, dimN);
-}
-
-template <class T>
-void BaseMatrixT<T>::addColVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::Add<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              false_type(),
-              true_type() /* bAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::addRowVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::Add<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
-  applyBinary(binary::Add1<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
-template <>
-void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
-  if (useGpu_) {
-    applyBinary(binary::Pow<real>(p), b);
-  } else {
-    vPow(height_ * width_, b.data_, p, data_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::Add2<T>(p1, p2), b);
-}
-
-template <class T>
-void BaseMatrixT<T>::addBias(BaseMatrixT& b, T scale) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::Add1<T>(scale),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-DEFINE_MATRIX_BINARY_OP(Sub, a -= b);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b) {
-  applyBinary(binary::Sub<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b, T p) {
-  applyBinary(binary::Sub1<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f);
-template <class T>
-void BaseMatrixT<T>::relu(BaseMatrixT& b) {
-  applyBinary(binary::Relu<T>(), b);
-}
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-template <>
-void BaseMatrixT<float>::relu(BaseMatrixT& b) {
-  neon::relu(data_, b.data_, height_ * width_);
-}
-#endif
-
-DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
-template <class T>
-void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
-  applyBinary(binary::ReluDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
-                        b = log(1.0 + exp((a > THRESHOLD)
-                                              ? THRESHOLD
-                                              : ((a < -THRESHOLD) ? (-THRESHOLD)
-                                                                  : a))));
-template <>
-void BaseMatrixT<real>::softrelu(BaseMatrixT& b) {
-  applyBinary(binary::Softrelu<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(
-    SoftreluDerivative, const T THRESHOLD = 40.0;
-    a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
-                                ? THRESHOLD
-                                : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
-template <>
-void BaseMatrixT<real>::softreluDerivative(BaseMatrixT& b) {
-  applyBinary(binary::SoftreluDerivative<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1;
-                                  b = b < p2 ? b : p2);
-template <class T>
-void BaseMatrixT<T>::brelu(BaseMatrixT& b) {
-  int p1 = 0, p2 = 24;  //! TODO(yuyang18): Make p1,p2 configuable.
-  applyBinary(binary::Brelu<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative,
-                                  TWO_PARAMETER,
-                                  a *= (b > p1 && b < p2) ? 1.0 : 0.0);
-template <class T>
-void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
-  int p1 = 0, p2 = 24;
-  applyBinary(binary::BreluDerivative<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
-template <class T>
-void BaseMatrixT<T>::square2(BaseMatrixT& b) {
-  applyBinary(binary::Square<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b);
-template <class T>
-void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
-  applyBinary(binary::SquareDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a;
-                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
-template <>
-void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
-  applyBinary(binary::Tanh<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b);
-template <class T>
-void BaseMatrixT<T>::tanhDerivative(BaseMatrixT& b) {
-  applyBinary(binary::TanhDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(
-    ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0));
-template <>
-void BaseMatrixT<real>::scaledTanh(BaseMatrixT& b, real p1, real p2) {
-  applyBinary(binary::ScaledTanh<real>(p1, p2), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative,
-                                  TWO_PARAMETER,
-                                  a *= p2 * (p1 - b * b));
-template <class T>
-void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::ScaledTanhDerivative<T>(p1 * p1, p2 / p1), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
-template <class T>
-void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
-  applyBinary(binary::Reciprocal<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b);
-template <class T>
-void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
-  applyBinary(binary::ReciprocalDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
-template <class T>
-void BaseMatrixT<T>::abs2(BaseMatrixT& b) {
-  applyBinary(binary::Abs<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
-template <class T>
-void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) {
-  applyBinary(binary::AbsDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0;
-                        const T THRESHOLD_MAX = 13.0;
-                        T tmp = (a < THRESHOLD_MIN)
-                                    ? THRESHOLD_MIN
-                                    : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
-                        b = 1.0f / (1.0f + exp(-tmp)));
-template <>
-void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Sigmoid<real>(), b);
-  } else {  // cpu versioni
-    size_t numSamples = this->height_;
-    size_t dim = this->width_;
-    CHECK_EQ(b.height_, numSamples);
-    CHECK_EQ(b.width_, dim);
-    const real* in = this->data_;
-    real* out = b.data_;
-
-    // out = - in
-    const float THRESHOLD_MIN = -40.0;  // make sure sigmoid(x) > 0
-    const float THRESHOLD_MAX = 13.0;   // make sure sigmoid(x) < 1
-    for (size_t i = 0; i < numSamples * dim; ++i) {
-      real tmp = in[i];
-      tmp = (tmp < THRESHOLD_MIN)
-                ? THRESHOLD_MIN
-                : ((tmp > THRESHOLD_MAX) ? THRESHOLD_MAX : tmp);
-      out[i] = -tmp;
-    }
-
-    // out = exp(out)
-    vExp(numSamples * dim, out, out);
-
-    // out = 1 / (1 + out)
-    for (size_t i = 0; i < numSamples * dim; ++i) {
-      out[i] = 1 / (1 + out[i]);
-    }
-  }
-}
-
-DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b));
-template <class T>
-void BaseMatrixT<T>::sigmoidDerivative(BaseMatrixT& b) {
-  applyBinary(binary::SigmoidDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b);
-template <class T>
-void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
-  applyBinary(binary::ExpDerivative<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
-template <class T>
-void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
-  applyBinary(binary::Sign<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
-template <>
-void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
-  applyBinary(binary::Exp<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
-template <>
-void BaseMatrixT<real>::log2(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Log<real>(), b);
-  } else {
-    vLog(height_ * width_, b.data_, data_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
-template <>
-void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
-  applyBinary(binary::Sqrt<real>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b));
-template <>
-void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::InvSqrt<real>(), b);
-  } else {  // cpu branch
-    CHECK_EQ(height_, b.height_);
-    CHECK_EQ(width_, b.width_);
-    vInvSqrt(height_ * width_, b.data_, data_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p));
-template <class T>
-void BaseMatrixT<T>::isEqualTo(BaseMatrixT& b, T value) {
-  applyBinary(binary::IsEqual<T>(value), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p);
-template <class T>
-void BaseMatrixT<T>::addScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::AddScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p);
-template <class T>
-void BaseMatrixT<T>::subScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::SubScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p);
-template <class T>
-void BaseMatrixT<T>::mulScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::MulScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p);
-template <class T>
-void BaseMatrixT<T>::divScalar(BaseMatrixT& b, T p) {
-  applyBinary(binary::DivScalar<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b);
-template <class T>
-void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
-  applyBinary(binary::ScalarDiv<T>(p), b);
-}
-
-/**
- * @brief   ternary operator.
- *
- */
-
-DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy,
-                         a = -c * log(b) - (1 - c) * log(1 - b));
-template <>
-void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::SoftCrossEntropy<real>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b)));
-template <class T>
-void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy,
-                         a = c > 0.5 ? -log(b) : -log(1.0 - b));
-template <>
-void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
-                                                BaseMatrixT& c) {
-  if (useGpu_) {
-    applyTernary(ternary::BinaryCrossEntropy<real>(), b, c);
-  } else {
-    CHECK_EQ(height_, b.height_);
-    CHECK_EQ(height_, c.height_);
-    CHECK_EQ(width_, b.width_);
-    CHECK_EQ(width_, c.width_);
-
-    size_t size = height_ * width_;
-    real* out = b.data_;
-    real* label = c.data_;
-    real* cost = data_;
-
-    for (size_t i = 0; i < size; ++i) {
-      cost[i] = label[i] > 0.5 ? out[i] : 1.0 - out[i];
-    }
-    vLog(size, cost, cost);
-    for (size_t i = 0; i < size; ++i) {
-      cost[i] *= -1.0;
-    }
-  }
-}
-
-DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp,
-                         a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b));
-template <class T>
-void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Add, a = b + c);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Add<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c);
-template <class T>
-void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
-  applyTernary(ternary::Add1<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Sub<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c);
-template <class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
-  applyTernary(ternary::Sub1<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c);
-template <class T>
-void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Add2<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3,
-                                   THREE_PARAMETER,
-                                   a = p1 * a + p2 * b + p3 * c);
-template <class T>
-void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
-  applyTernary(ternary::Add3<T>(p1, p2, p3), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate,
-                                   THREE_PARAMETER,
-                                   c = p2 * c - p1 * (b + p3 * a);
-                                   a = a + c);
-template <class T>
-void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad
-                               BaseMatrixT& c,  // mom
-                               T p1,            // learningRate,
-                               T p2,            // momentum,
-                               T p3) {          // decayRate
-  applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c);
-}
-
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate,
-                                      THREE_PARAMETER,
-                                      c = p2 * c - p1 * d * (b + p3 * a);
-                                      a += c);
-template <class T>
-void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad,
-                               BaseMatrixT& c,  // mom,
-                               BaseMatrixT& d,  // lr,
-                               T p1,            // learningRate,
-                               T p2,            // momentum,
-                               T p3) {          // decayRate
-  applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b;
-                                  a = (a > lambda)
-                                          ? (a - lambda)
-                                          : (a < -lambda) ? (a + lambda) : 0);
-template <class T>
-void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) {
-  applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr);
-}
-
-template <>
-void BaseMatrixT<real>::applyL1(BaseMatrixT& lr,
-                                real learningRate,
-                                real decayRate) {
-  if (useGpu_) {
-    applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr);
-  } else {
-    simd::decayL1(this->data_,
-                  this->data_,
-                  lr.data_,
-                  learningRate * decayRate,
-                  height_ * width_);
-  }
-}
-
-DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p;
-                                 a = (a > lambda)
-                                         ? (a - lambda)
-                                         : (a < -lambda) ? (a + lambda) : 0);
-template <class T>
-void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) {
-  applyUnary(unary::ApplyL1<T>(learningRate * decayRate));
-}
-
-template <>
-void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) {
-  if (useGpu_) {
-    applyUnary(unary::ApplyL1<real>(learningRate * decayRate));
-  } else {
-    simd::decayL1(
-        this->data_, this->data_, learningRate * decayRate, height_ * width_);
-  }
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2,
-                                  ONE_PARAMETER,
-                                  a *= (1.0f / (1.0f + p * b)));
-template <class T>
-void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
-  if (useGpu_) {
-    applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr);
-  } else {
-    size_t size = this->height_ * this->width_;
-    T decay = learningRate * decayRate;
-    for (size_t j = 0; j < size; ++j) {
-      this->data_[j] *= 1.0f / (1.0f + decay * lr.data_[j]);
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) {
-  BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate));
-}
-
-DEFINE_MATRIX_BINARY_OP(DotMul, a *= b);
-template <class T>
-void BaseMatrixT<T>::dotMul(BaseMatrixT& b) {
-  applyBinary(binary::DotMul<T>(), b);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c);
-template <class T>
-void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotMul<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c);
-template <class T>
-void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotDiv<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P,
-                                   TWO_PARAMETER,
-                                   a = (b + p1) / (c + p2));
-template <class T>
-void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c;
-                            a = (a > THRESHOLD)
-                                    ? THRESHOLD
-                                    : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-                            a = log(1 + exp(a)) - a * d);
-template <>
-void BaseMatrixT<real>::rankLoss(BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 BaseMatrixT& d) {
-  applyQuaternary(quaternary::RankLoss<real>(), b, c, d);
-}
-
-DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c;
-                            a = (a > THRESHOLD)
-                                    ? THRESHOLD
-                                    : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-                            a = exp(a);
-                            a = (a / (1 + a) - d));
-template <>
-void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b,
-                                   BaseMatrixT& c,
-                                   BaseMatrixT& d) {
-  applyQuaternary(quaternary::RankLossBp<real>(), b, c, d);
-}
-
-/* this = log(1 + exp(b)) - c * b */
-DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0;
-                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-                                                                 ? -THRESHOLD
-                                                                 : b;
-                         a = log(1 + exp(x)) - c * x);
-template <>
-void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::LogisticRegressionLoss<real>(), b, c);
-}
-
-/* this = exp(b)/(1+exp(b)) - c */
-DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0;
-                         T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-                                                                 ? -THRESHOLD
-                                                                 : b;
-                         x = exp(x);
-                         a = x / (1 + x) - c);
-template <>
-void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b,
-                                                 BaseMatrixT& c) {
-  applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f);
-template <class T>
-void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::BiggerThan<T>(), b, c);
-}
-
-DEFINE_MATRIX_QUATERNARY_OP(
-    BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
-template <class T>
-void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
-                                BaseMatrixT& c,
-                                BaseMatrixT& d) {
-  applyQuaternary(quaternary::BiggerThan<T>(), b, c, d);
-}
-
-DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
-template <class T>
-void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::Max<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError,
-                                   ONE_PARAMETER,
-                                   c += ((a > p) == (b > p)) ? 0.0f : 1.0f);
-template <class T>
-void BaseMatrixT<T>::binaryClassificationError2(size_t destCol,
-                                                BaseMatrixT& b,
-                                                BaseMatrixT& c,
-                                                T p) {
-  CHECK(!useGpu_) << "do not support gpu";
-  MatrixOffset offset(0, 0, 0, 0, destCol, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
-  b.applyTernary(ternary::BinaryClassificationError<T>(p),
-                 c,
-                 *this,
-                 numRows,
-                 numCols,
-                 offset,
-                 false_type(),
-                 true_type() /*cAsColVector*/);
-}
-
-template <>
-void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
-                                                  BaseMatrixT& b,
-                                                  BaseMatrixT& c,
-                                                  real p) {
-  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
-  aggregate(aggregate::sum(),
-            base::binary::classificationError(p),
-            base::binary::add(),
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-}
-
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3,
-                                      THREE_PARAMETER,
-                                      a = p1 * b + p2 * c + p3 * d);
-template <class T>
-void BaseMatrixT<T>::add3(
-    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) {
-  applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c);
-template <class T>
-void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotMulSquare<T>(), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c);
-template <class T>
-void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) {
-  applyTernary(ternary::DotSquareSquare<T>(), b, c);
-}
-
-DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b);
-template <class T>
-void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) {
-  applyBinary(binary::DotMulSquare<T>(), b);
-}
-
-DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b);
-template <class T>
-void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) {
-  applyBinary(binary::DotSquareMul<T>(), b);
-}
-
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum,
-                                      THREE_PARAMETER,
-                                      T tmp = p1 * b + p2 * c + p3 * d;
-                                      a += tmp * tmp);
-template <class T>
-void BaseMatrixT<T>::addSquareSum(
-    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) {
-  applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b);
-template <class T>
-void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) {
-  applyBinary(binary::AddSquare<T>(p), b);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare,
-                                  TWO_PARAMETER,
-                                  a = p1 * a + p2 * b * b);
-template <class T>
-void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::DecayAddSquare<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul,
-                                   TWO_PARAMETER,
-                                   a = p1 * a + p2 * b * b * c * c);
-template <class T>
-void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b,
-                                       BaseMatrixT& c,
-                                       T p1,
-                                       T p2) {
-  applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum,
-                                   THREE_PARAMETER,
-                                   a = 1 / (p1 * b + p2 * c + p3));
-template <class T>
-void BaseMatrixT<T>::reciprocalSum(
-    BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
-  applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2,
-                                  TWO_PARAMETER,
-                                  a = 1 / (p1 * b + p2));
-template <class T>
-void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
-  applyBinary(binary::Reciprocal2<T>(p1, p2), b);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum,
-                                   TWO_PARAMETER,
-                                   T tmp = p1 * b + p2 * c;
-                                   a *= tmp * tmp);
-template <class T>
-void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b,
-                                     BaseMatrixT& c,
-                                     T p1,
-                                     T p2) {
-  applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum,
-                                   TWO_PARAMETER,
-                                   T tmp = p1 * b + p2 * c;
-                                   a = tmp * tmp);
-template <class T>
-void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum,
-                                   TWO_PARAMETER,
-                                   a *= p1 * b + p2 * c);
-template <class T>
-void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::DotMulSum<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0);
-template <class T>
-void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) {
-  applyBinary(binary::CopyAndClear<T>(), b);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul,
-                                   TWO_PARAMETER,
-                                   a = p1 * a + p2 * b * c);
-template <class T>
-void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
-  applyTernary(ternary::AddDotMul<T>(p1, p2), b, c);
-}
-
-DEFINE_MATRIX_BINARY_OP(Assign, a = b;);
-template <class T>
-void BaseMatrixT<T>::assign(BaseMatrixT& b) {
-  if (useGpu_) {
-    applyBinary(binary::Assign<T>(), b);
-  } else {  // cpu version
-    CHECK_EQ(this->height_, b.height_);
-    CHECK_EQ(this->width_, b.width_);
-    memcpy(data_, b.data_, sizeof(T) * height_ * width_);
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
-  if (columnOffset + b.width_ <= width_) {
-    int numRows = height_;
-    int numCols = b.width_;
-    MatrixOffset offset(columnOffset, 0, 0, 0);
-    applyBinary(binary::Assign<T>(), b, numRows, numCols, offset);
-  } else if (columnOffset + width_ <= b.width_) {
-    int numRows = height_;
-    int numCols = width_;
-    MatrixOffset offset(0, 0, columnOffset, 0);
-    applyBinary(binary::Assign<T>(), b, numRows, numCols, offset);
-  } else {
-    LOG(FATAL) << "Wrong argument "
-               << " a.width=" << width_ << " b.width=" << b.width_
-               << " columnOffset=" << columnOffset;
-  }
-}
-
-DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
-template <class T>
-void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
-  applyBinary(binary::DeepSwap<T>(), b);
-}
-
-template <>
-void BaseMatrixT<real>::rowDotMul(size_t destCol,
-                                  BaseMatrixT& b,
-                                  BaseMatrixT& c) {
-  int numRows = b.height_;
-  int numCols = b.width_;
-  MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
-  aggregate(aggregate::sum(),
-            base::binary::mul(),
-            base::binary::add(),
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-}
-
-template <class T>
-void BaseMatrixT<T>::rowDotMul2(size_t destCol,
-                                BaseMatrixT& b,
-                                BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  size_t height = this->height_;
-  CHECK_LT(destCol, this->width_);
-  CHECK_EQ(height, b.height_);
-  CHECK_EQ(height, c.height_);
-  CHECK_EQ(b.width_, c.width_);
-  size_t width = b.width_;
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height;
-       ++i, A += this->width_, B += width, C += width) {
-    for (size_t j = 0; j < width; ++j) {
-      A[destCol] += B[j] * C[j];
-    }
-  }
-}
-
-template <>
-void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = b.height_;
-  int numCols = b.width_;
-  aggregate(aggregate::sum(),
-            base::binary::mul(),
-            base::binary::add(),
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            true_type() /*aAsRowVector*/,
-            false_type());
-}
-
-template <class T>
-void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  CHECK_EQ(height_, 1LU);
-  CHECK_EQ(b.height_, c.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(width_, c.width_);
-  size_t height = b.height_;
-  size_t width = b.width_;
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height; ++i, B += width, C += width) {
-    for (size_t j = 0; j < width; ++j) {
-      A[j] += B[j] * C[j];
-    }
-  }
-}
-
-DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c);
-template <class T>
-void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               true_type() /*cAsRowVector*/,
-               false_type());
-}
-
-template <class T>
-void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  CHECK_EQ(c.height_, 1LU);
-  CHECK_EQ(height_, b.height_);
-  CHECK_EQ(width_, b.width_);
-  CHECK_EQ(width_, c.width_);
-  size_t height = height_;
-  size_t width = width_;
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height; ++i, A += width, B += width) {
-    for (size_t j = 0; j < width; ++j) {
-      A[j] += B[j] * C[j];
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::DotMul<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               false_type(),
-               true_type() /*cAsColVector*/);
-}
-
-template <class T>
-void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  CHECK(!useGpu_) << "do not support gpu";
-
-  size_t height = this->height_;
-  size_t width = this->width_;
-  CHECK_EQ(height, b.height_);
-  CHECK_EQ(width, b.width_);
-  CHECK_LT(cCol, c.width_);
-  CHECK_EQ(height, c.height_);
-  T* A = this->data_;
-  const T* B = b.data_;
-  const T* C = c.data_;
-  for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) {
-    for (size_t j = 0; j < width; ++j) {
-      A[j] = B[j] * C[cCol];
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::DotMul<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               true_type() /* cAsRowVector */,
-               false_type() /* cAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, cRow);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               true_type() /* cAsRowVector */,
-               false_type() /* cAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               false_type(),
-               true_type() /*cAsColVector*/);
-}
-
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c);
-template <class T>
-void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) {
-  MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyTernary(ternary::RowAdd<T>(p),
-               b,
-               c,
-               numRows,
-               numCols,
-               offset,
-               false_type(),
-               true_type() /*cAsColVector*/);
-}
-
-DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c));
-template <>
-void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
-  if (useGpu_) {
-    MatrixOffset offset(0, 0, 0, 0, cCol, 0);
-    int numRows = height_;
-    int numCols = width_;
-    applyTernary(ternary::RowPow<real>(),
-                 b,
-                 c,
-                 numRows,
-                 numCols,
-                 offset,
-                 false_type(),
-                 true_type() /*cAsColVector*/);
-  } else {
-    size_t height = this->height_;
-    size_t width = this->width_;
-    CHECK_EQ(height, b.height_);
-    CHECK_EQ(width, b.width_);
-    CHECK_LT(cCol, c.width_);
-    CHECK_EQ(height, c.height_);
-    real* A = this->data_;
-    const real* B = b.data_;
-    const real* C = c.data_;
-    for (size_t i = 0; i < height; ++i, A += width, B += width, C += c.width_) {
-      vPow(width, B, C[cCol], A);
-    }
-  }
-}
-
-template <class T>
-void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotMul<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b);
-template <class T>
-void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotDiv<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              true_type() /* bAsRowVector */,
-              false_type());
-}
-
-template <class T>
-void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotMul<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              false_type(),
-              true_type() /* bAsColVector */);
-}
-
-template <class T>
-void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0);
-  int numRows = height_;
-  int numCols = width_;
-  applyBinary(binary::DotDiv<T>(),
-              b,
-              numRows,
-              numCols,
-              offset,
-              false_type(),
-              true_type() /* bAsColVector */);
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(height_, numRows);
-  CHECK_EQ(width_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            base::binary::second(),
-            b,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-
-  return 0;
-}
-
-template <>
-template <class Agg, class Saver>
-int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(height_, numRows);
-  CHECK_EQ(width_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            sv,
-            b,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-
-  return 0;
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyRow(Agg agg,
-                                real scaleDest,
-                                real scaleAgg,
-                                BaseMatrixT& b) {
-  if (scaleDest != 0) {
-    applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
-  } else {
-    applyRow(agg, base::binary::second(), b);
-    if (scaleAgg != 1) {
-      mulScalar(scaleAgg);
-    }
-  }
-  return 0;
-}
-
-template <>
-template <class Agg, class Op, class Saver>
-int BaseMatrixT<real>::applyRow(
-    Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(height_, numRows);
-  CHECK_EQ(width_, 1UL);
-  CHECK_EQ(c.height_, numRows);
-  CHECK_EQ(c.width_, numCols);
-  aggregate(agg,
-            op,
-            sv,
-            b,
-            c,
-            numRows,
-            numCols,
-            offset,
-            false_type(),
-            true_type() /*aAsColVector*/);
-  return 0;
-}
-
-template <>
-template <class Agg, class Op>
-int BaseMatrixT<real>::applyRow(Agg agg,
-                                Op op,
-                                real scaleDest,
-                                real scaleAgg,
-                                BaseMatrixT& b,
-                                BaseMatrixT& c) {
-  if (scaleDest != 0) {
-    applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
-  } else {
-    applyRow(agg, op, base::binary::second(), b, c);
-    if (scaleAgg != 1) {
-      mulScalar(scaleAgg);
-    }
-  }
-  return 0;
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(width_, numCols);
-  CHECK_EQ(height_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            base::binary::second(),
-            b,
-            numRows,
-            numCols,
-            offset,
-            true_type() /*aAsRowVector*/,
-            false_type());
-
-  return 0;
-}
-
-template <>
-template <class Agg, class Saver>
-int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
-  MatrixOffset offset(0, 0, 0, 0, 0, 0);
-  size_t numRows = b.height_;
-  size_t numCols = b.width_;
-  CHECK_EQ(width_, numCols);
-  CHECK_EQ(height_, 1UL);
-  aggregate(agg,
-            base::unary::identity(),
-            sv,
-            b,
-            numRows,
-            numCols,
-            offset,
-            true_type() /*aAsRowVector*/,
-            false_type());
-
-  return 0;
-}
-
-template <>
-template <class Agg>
-int BaseMatrixT<real>::applyCol(Agg agg,
-                                real scaleDest,
-                                real scaleAgg,
-                                BaseMatrixT& b) {
-  if (scaleDest != 0) {
-    applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
-  } else {
-    applyCol(agg, base::binary::second(), b);
-    if (scaleAgg != 1) {
-      mulScalar(scaleAgg);
-    }
-  }
-  return 0;
-}
-
-template <>
-void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
-  applyRow(aggregate::sum(), scaleDest, scaleSum, b);
-}
-
-template <>
-void BaseMatrixT<real>::maxRows(BaseMatrixT& b) {
-  applyRow(aggregate::max(), b);
-}
-
-template <>
-void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
-  applyRow(aggregate::min(), b);
-}
-
-template <>
-void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
-  applyCol(aggregate::max(), b);
-}
-
-template <>
-void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
-  applyCol(aggregate::min(), b);
-}
-
-template <>
-void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
-  applyCol(aggregate::sum(), scaleDest, scaleSum, b);
-}
-
-template <>
-void BaseMatrixT<real>::sumOfSquaredDiffs(BaseMatrixT& b,
-                                          BaseMatrixT& c,
-                                          real scaleSum,
-                                          real scaleDest) {
-  applyRow(
-      aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c);
-}
-
-template <>
-void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
-                                      BaseMatrixT& c,
-                                      real scaleSum,
-                                      real scaleDest) {
-  applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c);
-}
-
-template class BaseMatrixT<real>;
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-template class BaseMatrixT<int>;
-
-#else
-
-template <>
-void BaseMatrixT<int>::zero() {
-  applyUnary(unary::Zero<int>());
-}
-
-template <>
-void BaseMatrixT<int>::assign(int p) {
-  applyUnary(unary::Assign<int>(p));
-}
-
-template <>
-void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
-  applyBinary(binary::IsEqual<int>(value), b);
-}
-
-template <>
-void BaseMatrixT<int>::neg() {
-  applyUnary(unary::Neg<int>());
-}
-
-template <>
-void BaseMatrixT<int>::abs2() {
-  applyUnary(unary::Abs<int>());
-}
-
-template <>
-void BaseMatrixT<int>::add(int p) {
-  applyUnary(unary::Add<int>(p));
-}
-
-template <>
-void BaseMatrixT<int>::add(int p1, int p2) {
-  applyUnary(unary::Add2<int>(p1, p2));
-}
-
-template <>
-void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
-  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
-}
-
-#endif
-}  // namespace paddle
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
deleted file mode 100644
index 00ce5a19491048f3339d608ac37669816a9ad3f5..0000000000000000000000000000000000000000
--- a/paddle/math/BaseMatrix.h
+++ /dev/null
@@ -1,1095 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stdint.h>
-#include <cstddef>
-#include "TensorExpression.h"
-#include "paddle/utils/Common.h"
-
-namespace paddle {
-
-/*
- * nvcc currently does not support C++11,
- * so I realized false_type and true_type.
- */
-template <class T, T v>
-struct bool_constant {
-  static const T value = v;
-};
-typedef bool_constant<bool, false> false_type;
-typedef bool_constant<bool, true> true_type;
-
-/**
- * @brief   Calculate matrix element address.
- *
- * For instance, address of A[i][j] = i * ld + j.
- *
- */
-#define CAL_MATRIX_START_ADDRESS(address, height, width, ld, col, row) \
-  CHECK_LE(col, width);                                                \
-  CHECK_LE(row, height);                                               \
-  address += row * ld + col;
-
-class MatrixOffset {
-public:
-  size_t aCol_;
-  size_t aRow_;
-  size_t bCol_;
-  size_t bRow_;
-  size_t cCol_;
-  size_t cRow_;
-  size_t dCol_;
-  size_t dRow_;
-  MatrixOffset(size_t aCol = 0,
-               size_t aRow = 0,
-               size_t bCol = 0,
-               size_t bRow = 0,
-               size_t cCol = 0,
-               size_t cRow = 0,
-               size_t dCol = 0,
-               size_t dRow = 0)
-      : aCol_(aCol),
-        aRow_(aRow),
-        bCol_(bCol),
-        bRow_(bRow),
-        cCol_(cCol),
-        cRow_(cRow),
-        dCol_(dCol),
-        dRow_(dRow) {}
-};
-
-template <class T>
-class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
-public:
-  size_t height_, width_;
-  size_t stride_;
-  T* data_;
-  bool trans_;
-  bool useGpu_;
-
-public:
-  virtual ~BaseMatrixT() {}
-  BaseMatrixT(size_t height, size_t width, T* data, bool trans, bool useGpu)
-      : height_(height),
-        width_(width),
-        stride_(width),
-        data_(data),
-        trans_(trans),
-        useGpu_(useGpu) {}
-
-  /**
-   * @note This constructor is for temporarily making a matrix with different
-   *       useGpu flag as the original matrix so that mixed gpu/cpu operations
-   *       can be performed successfully.
-   */
-  BaseMatrixT(BaseMatrixT& mat, bool useGpu)
-      : height_(mat.height_),
-        width_(mat.width_),
-        stride_(mat.stride_),
-        data_(mat.data_),
-        trans_(mat.trans_),
-        useGpu_(useGpu) {}
-
-  BaseMatrixT(size_t height,
-              size_t width,
-              size_t stride,
-              T* data,
-              bool trans,
-              bool use_gpu)
-      : height_(height),
-        width_(width),
-        stride_(stride),
-        data_(data),
-        trans_(trans),
-        useGpu_(use_gpu) {
-    /* CHECK_LE(width_, stride_); */
-  }
-
-  /// caller should make sure that the size of data is at least height*width
-  void setData(T* data) { data_ = data; }
-
-  /**
-   * unary operator: element wise op(a).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   * @endcode
-   */
-  template <class Op>
-  int applyUnary(Op op);
-
-  /**
-   * unary operator: element wise op(a).
-   *
-   * @code
-   * for 0 <= i < numRows & for 0 <= j < numCols.
-   * While matrix start address is:
-   *  A = this->data_ + offset.aRow_*ld + offset.aCol_;
-   * @endcode
-   */
-  template <class Op>
-  int applyUnary(Op op, int numRows, int numCols, MatrixOffset& offset);
-
-  /**
-   * binary operator: element wise op(a, b).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   * While this->height_ == b.height_ && this->width_ == b.width_.
-   * @endcode
-   */
-  template <class Op>
-  int applyBinary(Op op, BaseMatrixT& b);
-
-  /**
-   * binary operator: element wise op(a, b)
-   *
-   * @code
-   * for 0 <= i < numRows & for 0 <= j < numCols.
-   * While matrix start address is:
-   *   A = this->data_ + offset.aRow_*lda + offset.aCol_;
-   *   B = b->data_ + offset.bRow_*ldb + offset.bCol_;
-   *
-   * if (bAsRowVector == false_type && bAsColVector == false_type)
-   *   op(A[i * lda + j], B[i * ldb + j])
-   *
-   * if (bAsRowVector == true_type && bAsColVector == false_type)
-   *   op(A[i * lda + j], B[j])
-   *
-   * if (bAsRowVector == false_type && bAsColVector == true_type)
-   *   op(A[i * lda + j], B[i * ldb])
-   *
-   * if (bAsRowVector == true_type && bAsColVector == true_type)
-   *   op(A[i * lda + j], B[0])
-   * @endcode
-   */
-  template <class Op, class bAsRowVector, class bAsColVector>
-  int applyBinary(Op op,
-                  BaseMatrixT& b,
-                  int numRows,
-                  int numCols,
-                  MatrixOffset& offset,
-                  bAsRowVector,
-                  bAsColVector);
-
-  template <class Op>
-  int applyBinary(
-      Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset);
-
-  /**
-   * ternary operator: element wise op(a, b, c).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   *
-   * While this->height_ == b.height_ && this->width_ == b.width_
-   *    && this->height_ == c.height_ && this->width_ == c.width_
-   * @endcode
-   */
-  template <class Op>
-  int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * ternary operator: element wise op(a, b, c).
-   *
-   * @code
-   *  for 0 <= i < numRows & for 0 <= j < numCols.
-   *  While matrix start address is:
-   *
-   *    A = this->data_ + offset.aRow_*lda + offset.aCol_;
-   *    B = b->data_ + offset.bRow_*ldb + offset.bCol_;
-   *    C = c->data_ + offset.cRow_*ldc + offset.cCol_;
-   *
-   *    if (cAsRowVector == false_type && cAsColVector == false_type)
-   *      op(A[i*lda + j], B[i*ldb + j], C[i*ldc + j])
-   *
-   *    if (cAsRowVector == true_type && cAsColVector == false_type)
-   *      op(A[i*lda + j], B[i*ldb + j], C[j])
-   *
-   *    if (cAsRowVector == false_type && cAsColVector == true_type)
-   *      op(A[i*lda + j], B[i*ldb + j], C[i*ldc])
-   *
-   *    if (cAsRowVector == 1 && cAsColVector == 1)
-   *      op(A[i*lda + j], B[i*ldb + j], C[0])
-   * @endcode
-   */
-  template <class Op, class cAsRowVector, class cAsColVector>
-  int applyTernary(Op op,
-                   BaseMatrixT& b,
-                   BaseMatrixT& c,
-                   int numRows,
-                   int numCols,
-                   MatrixOffset& offset,
-                   cAsRowVector,
-                   cAsColVector);
-
-  template <class Op>
-  int applyTernary(Op op,
-                   BaseMatrixT& b,
-                   BaseMatrixT& c,
-                   int numRows,
-                   int numCols,
-                   MatrixOffset& offset);
-
-  /**
-   * quaternary operator: element wise op(a, b, c, d).
-   *
-   * @code
-   * for 0 <= i < this->height_ & for 0 <= j < this->width_.
-   *
-   * While this->height_ == b.height_ && this->width_ == b.width_
-   *    && this->height_ == c.height_ && this->width_ == c.width_
-   *    && this->height_ == d.height_ && this->width_ == d.width_
-   * @endcode
-   */
-  template <class Op>
-  int applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-
-  /**
-   * quaternary operator: element wise op(a, b, c, d).
-   *
-   * @code
-   * for 0 <= i < numRows & for 0 <= j < numCols.
-   * While matrix start address is:
-   *    A = this->data_ + offset.aRow_*lda + offset.aCol_;
-   *    B = b->data_ + offset.bRow_*ldb + offset.bCol_;
-   *    C = c->data_ + offset.cRow_*ldc + offset.cCol_;
-   *    D = d->data_ + offset.dRow_*ldd + offset.dCol_;
-   * @endcode
-   */
-  template <class Op>
-  int applyQuaternary(Op op,
-                      BaseMatrixT& b,
-                      BaseMatrixT& c,
-                      BaseMatrixT& d,
-                      int numRows,
-                      int numCols,
-                      MatrixOffset& offset);
-
-  /**
-   * a aggregate expression that apply each row(or column) of matrix b.
-   * op and sv is element wise operator.
-   *
-   * @code
-   * if (aAsRowVector == true_type && aAsColVector == false_type)
-   *  for each column j & 0 <= i < numRows, do:
-   *    dst = agg(op(b[i*ldb + j]))
-   *    a[j] = sv(a[j], dst)
-   *
-   * if (aAsRowVector == false_type && aAsColVector == true_type)
-   *  for each row i & 0 <= j < numCols, do:
-   *    dst = agg(op(b[i*ldb + j]))
-   *    a[i] = sv(a[i], dst)
-   * @endcode
-   */
-  template <class Agg,
-            class Op,
-            class Saver,
-            class aAsRowVector,
-            class aAsColVector>
-  int aggregate(Agg agg,
-                Op op,
-                Saver sv,
-                BaseMatrixT& b,
-                int numRows,
-                int numCols,
-                MatrixOffset& offset,
-                aAsRowVector,
-                aAsColVector);
-
-  /**
-   * a aggregate expression that apply each row(or column) of matrix b and c.
-   *
-   * op and sv is element wise operator.
-   *
-   * @code
-   * if (aAsRowVector == true_type && aAsColVector == false_type)
-   *   for each column j & 0 <= i < numRows, do:
-   *     dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
-   *     a[j] = sv(a[j], dst)
-   *
-   * if (aAsRowVector == false_type && aAsColVector == true_type)
-   *   for each row i & 0 <= j < numCols, do:
-   *     dst = agg(op(b[i*ldb + j], c[i*ldc + j]))
-   *     a[i] = sv(a[i], dst)
-   * @endcode
-   */
-  template <class Agg,
-            class Op,
-            class Saver,
-            class aAsRowVector,
-            class aAsColVector>
-  int aggregate(Agg agg,
-                Op op,
-                Saver sv,
-                BaseMatrixT& b,
-                BaseMatrixT& c,
-                int numRows,
-                int numCols,
-                MatrixOffset& offset,
-                aAsRowVector,
-                aAsColVector);
-
-  /**
-   * a aggregate expression that apply each row of matrix b.
-   *
-   * @code
-   * for each row i & 0 <= j < b.width_, do:
-   *   this[i] = agg(b[i*ldb + j])
-   * @endcode
-   */
-  template <class Agg>
-  int applyRow(Agg agg, BaseMatrixT& b);
-
-  /**
-   * a aggregate expression that apply each row of matrix b.
-   *
-   * @code
-   * for each row i & 0 <= j < b.width_, do:
-   *   dst = agg(op(b[i*ldb + j], c[i*ldc + j])
-   *   this[i] = sv(this[i], dst)
-   * @endcode
-   */
-  template <class Agg, class Op, class Saver>
-  int applyRow(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c);
-
-  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
-  template <class Agg, class Op>
-  int applyRow(Agg agg,
-               Op op,
-               real scaleDest,
-               real scaleAgg,
-               BaseMatrixT& b,
-               BaseMatrixT& c);
-
-  /**
-   * a aggregate expression that apply each row of matrix b.
-   *
-   * @code
-   * for each row i & 0 <= j < b.width_, do:
-   *   dst = agg(b[i*ldb + j])
-   *   this[i] = sv(this[i], dst)
-   * @endcode
-   */
-  template <class Agg, class Saver>
-  int applyRow(Agg agg, Saver sv, BaseMatrixT& b);
-
-  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
-  template <class Agg>
-  int applyRow(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
-
-  /**
-   * a aggregate expression that apply each column of matrix b.
-   *
-   * @code
-   * for each column j & 0 <= i < b.height_, do:
-   *   this[j] = agg(b[i*ldb + j])
-   * @endcode
-   */
-  template <class Agg>
-  int applyCol(Agg agg, BaseMatrixT& b);
-
-  /**
-   * a aggregate expression that apply each column of matrix b.
-   *
-   * @code
-   * for each column j & 0 <= i < b.height_, do:
-   *   dst = agg(b[i*ldb + j])
-   *   this[j] = sv(this[j], dst)
-   * @endcode
-   */
-  template <class Agg, class Saver>
-  int applyCol(Agg agg, Saver sv, BaseMatrixT& b);
-
-  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
-  template <class Agg>
-  int applyCol(Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b);
-
-  bool useGpu() const { return useGpu_; }
-
-  const T* rowBuf(size_t row) const { return data_ + width_ * row; }
-
-  T* rowBuf(size_t row) { return data_ + width_ * row; }
-
-  /**
-   * @brief   unary operator.
-   *
-   */
-  void neg();
-  void exp2();
-  void pow2(T p);
-  void log2();
-  void sqrt2();
-  void square2();
-  void reciprocal2();
-  void abs2();
-  void sign2();
-  void zero();
-
-  /**
-   * @code
-   * this(row, col + columnOffset) = 0 for 0 <= col < numColumns
-   * @endcode
-   */
-  void zeroAtOffset(int64_t columnOffset, int64_t numColumns);
-  void one();
-  void subScalar(T p);
-  void mulScalar(T p);
-  void divScalar(T p);
-
-  /**
-   * @code
-   * this = p
-   * @endcode
-   */
-  void assign(T p);
-
-  /**
-   * @code
-   * swap(this, b)
-   * example: swap two Matrices
-   * MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-   * MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-   * cpuA->deepSwap(*cpuB);
-   * @endcode
-   */
-  void deepSwap(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this + p
-   * @endcode
-   */
-  void add(T p);
-
-  /**
-   * @code
-   * this = this*p1 + p2
-   * @endcode
-   */
-  void add(T p1, T p2);
-
-  /**
-   * this = this < low ? low : this
-   *
-   * this = this > high ? high : this
-   */
-  void clip(T p1, T p2);
-
-  /**
-   * this = b < low ? 0 : 1
-   *
-   * this = b > high ? 0 : 1
-   */
-  void clipDerivative(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * a = a > p ? 1.0f : 0.0f
-   * @endcode
-   */
-  void biggerThanScalar(T p);
-
-  /**
-   * @code
-   * a = a > p ? a : p
-   * @endcode
-   */
-  void downClip(T p);
-
-  /**
-   * @code
-   * this = b
-   * @endcode
-   */
-  void assign(BaseMatrixT& b);
-
-  /**
-   * @code
-   * If b.width + columOffset <= this.width
-   *  this(row, col + columnOffset) = b(row, col) for 0 <= col < b.width
-   *
-   * If this.width + columnOffset <= b.width
-   *  this(row, col) = b(row, col + columnOffset) for 0 <= col < this.width
-   *
-   * Otherwise, FATAL
-   * @endcode
-   */
-  void assignAtOffset(BaseMatrixT& b, int64_t columnOffset);
-
-  /// this = this + b
-  void add(BaseMatrixT& b);
-
-  /**
-   * @code
-   * If b.width + columOffset <= this.width
-   *  this(row, col + columnOffset) += b(row, col) for 0 <= col < b.width
-   *
-   * If this.width + columnOffset <= b.width
-   *  this(row, col) += b(row, col + columnOffset) for 0 <= col < this.width
-   *
-   * Otherwise, FATAL
-   * @endcode
-   */
-  void addAtOffset(BaseMatrixT& b, int64_t columnOffset);
-
-  void addColVector(BaseMatrixT& b);
-  void addRowVector(BaseMatrixT& b);
-  void addBias(BaseMatrixT& b, T scale);
-
-  void mulRowVector(BaseMatrixT& b);
-  void divRowVector(BaseMatrixT& b);
-
-  void mulColVector(BaseMatrixT& b);
-  void divColVector(BaseMatrixT& b);
-
-  void addP2P(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this + b*p
-   * @endcode
-   */
-  void add(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * this = p1*this + p2*b
-   * @endcode
-   */
-  void add(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * this = this - b
-   * @endcode
-   */
-  void sub(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this - b*p
-   * @endcode
-   */
-  void sub(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * b = max(0, this)
-   * @endcode
-   */
-  void relu(BaseMatrixT& b);
-  void reluDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = log(1.0 + exp(this))
-   * @endcode
-   */
-  void softrelu(BaseMatrixT& b);
-  void softreluDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = min(max(this, p1), p2)
-   * @endcode
-   */
-  void brelu(BaseMatrixT& b);
-  void breluDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = this * this
-   * @endcode
-   */
-  void square2(BaseMatrixT& b);
-  void squareDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = tanh(this)
-   * @endcode
-   */
-  void tanh(BaseMatrixT& b);
-  void tanhDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = p1 * tanh(p2 * this)
-   * @endcode
-   */
-  void scaledTanh(BaseMatrixT& b, T p1, T p2);
-  void scaledTanhDerivative(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * b = 1.0f / this
-   * @endcode
-   */
-  void reciprocal2(BaseMatrixT& b);
-  void reciprocalDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = this > 0.0f ? this : -this
-   * @endcode
-   */
-  void abs2(BaseMatrixT& b);
-  void absDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = 1.0f / (1.0f + exp(-this))
-   * @endcode
-   */
-  void sigmoid(BaseMatrixT& b);
-  void sigmoidDerivative(BaseMatrixT& b);
-
-  /**
-   * @code
-   * b = a
-   * @endcode
-   */
-  void expDerivative(BaseMatrixT& b);
-
-  void sign2(BaseMatrixT& b);
-
-  void exp2(BaseMatrixT& b);
-  void pow2(BaseMatrixT& b, T p);
-  void log2(BaseMatrixT& b);
-  void sqrt2(BaseMatrixT& b);
-  void addScalar(BaseMatrixT& b, T p);
-  void subScalar(BaseMatrixT& b, T p);
-  void mulScalar(BaseMatrixT& b, T p);
-  void divScalar(BaseMatrixT& b, T p);
-  void scalarDiv(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * this = 1.0f / sqrt(b)
-   * @endcode
-   */
-  void invSqrt(BaseMatrixT& b);
-
-  /// this = (b == value)
-  void isEqualTo(BaseMatrixT& b, T value);
-
-  /**
-   * @brief   ternary operator.
-   */
-  void softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c);
-  void softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c);
-  void binaryLabelCrossEntropy(BaseMatrixT& b, BaseMatrixT& c);
-  void binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b + c
-   * @endcode
-   */
-  void add(BaseMatrixT& b, BaseMatrixT& c);
-  /**
-   * @code
-   * this = b*p1 + c*p2
-   * @endcode
-   */
-  void add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2);
-  /**
-   * @code
-   * this = b - c
-   * @endcode
-   */
-  void sub(BaseMatrixT& b, BaseMatrixT& c);
-  /**
-   * @code
-   * this = b*p1 - c*p2
-   * @endcode
-   */
-  void sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2);
-
-  /**
-   * @code
-   * this = this + b + c
-   * @endcode
-   */
-  void add2(BaseMatrixT& b, BaseMatrixT& c);
-  /**
-   * @code
-   * this = this*p1 + b*p2 + c*p3
-   * @endcode
-   */
-  void add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3);
-
-  /**
-   * @code
-   * this = a*p1 + b*p2 + c*p3
-   * @endcode
-   */
-  void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3);
-
-  /**
-   * @code
-   *   c = p2 * c - p1 *  (b + p3 * this)
-   *   this += mom
-   * @endcode
-   */
-  void sgdUpdate(BaseMatrixT& b,  //  grad
-                 BaseMatrixT& c,  //  mom
-                 T p1,            //  learningRate,
-                 T p2,            //  momentum,
-                 T p3);           //  decayRate
-
-  /**
-   * @code
-   *   c = p2 * c - p1 * d * (b + p3 * this)
-   *   this += mom
-   * @endcode
-   */
-  void sgdUpdate(BaseMatrixT& b,  // grad,
-                 BaseMatrixT& c,  // mom,
-                 BaseMatrixT& d,  // lr,
-                 T p1,            // learningRate,
-                 T p2,            // momentum,
-                 T p3);           // decayRate
-
-  /// apply L1/L2 to *this*
-  virtual void applyL1(T learningRate, T decayRate);
-  void applyL1(BaseMatrixT& lr, T learningRate, T decayRate);
-  void applyL2(T learningRate, T decayRate);
-  void applyL2(BaseMatrixT& lr, T learningRate, T decayRate);
-
-  /**
-   * @code
-   * this *= b
-   * @endcode
-   */
-  void dotMul(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = b * c
-   * @endcode
-   */
-  void dotMul(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b / c
-   * @endcode
-   */
-  void dotDiv(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = (b + p1) / (c + p2)
-   * @endcode
-   */
-  void dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this = log(1 + exp(b - c)) - d * (b - c)
-   * @endcode
-   */
-  void rankLoss(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-  void rankLossBp(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-
-  /**
-   * @code
-   * this = log(1 + exp(b)) - c * b
-   * @endcode
-   */
-  void logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this += exp(b)/(1+exp(b)) - c
-   * @endcode
-   */
-  void logisticRegressionLossBp(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b > c ? 1.0 : 0.0
-   * @endcode
-   */
-  void biggerThan(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = ((b>c && d>0.5) || (b<c && d<0.5)) ? 1 : 0)
-   * @endcode
-   */
-  void biggerThan(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d);
-
-  /**
-   * @code
-   * this = b>c ? b : c
-   * @endcode
-   */
-  void max2(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this[destCol] += (b>p1 == c>p1) ? 0 : 1)
-   * @endcode
-   */
-  void binaryClassificationError(size_t destCol,
-                                 BaseMatrixT& b,
-                                 BaseMatrixT& c,
-                                 T p);
-  void binaryClassificationError2(size_t destCol,
-                                  BaseMatrixT& b,
-                                  BaseMatrixT& c,
-                                  T p);
-
-  /**
-   * @code
-   * this = this * b * b
-   * @endcode
-   */
-  void dotMulSquare(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = this * this * b
-   * @endcode
-   */
-  void dotSquareMul(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this = b * c * c
-   * @endcode
-   */
-  void dotMulSquare(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = b * b * c * c
-   * @endcode
-   */
-  void dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = this * (p1*b + p2*c)^2
-   * @endcode
-   */
-  void dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this = (p1*b + p2*c)^2
-   * @endcode
-   */
-  void dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this=  this * (p1*b + p2*c)
-   * @endcode
-   */
-  void dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this += sqr(p1*b + p2*c + p3*d)
-   * @endcode
-   */
-  void addSquareSum(
-      BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3);
-
-  /**
-   * @code
-   * this += p * sqr(b)
-   * @endcode
-   */
-  void addSquare(BaseMatrixT& b, T p);
-
-  /**
-   * @code
-   * this = p1 * this + p2 * sqr(b)
-   * @endcode
-   */
-  void decayAddSquare(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * this = p1 * this + p2 * sqr(b * c)
-   * @endcode
-   */
-  void decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this = 1 / (p1 * b + p2)
-   * @endcode
-   */
-  void reciprocal2(BaseMatrixT& b, T p1, T p2);
-
-  /**
-   * @code
-   * this = 1 / (p1 * b + p2 * c + p3)
-   * @endcode
-   */
-  void reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3);
-
-  /**
-   * @code
-   * b = this; this = 0
-   * @endcode
-   */
-  void copyAndClear(BaseMatrixT& b);
-
-  /**
-   * @code
-   * this_row[destCol] += dotprod(b_row, c_row)
-   * @endcode
-   */
-  void rowDotMul(size_t destCol, BaseMatrixT& b, BaseMatrixT& c);
-  void rowDotMul2(size_t destCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * this is vector (one row matrix)
-   *
-   * @code
-   *   for each row i, do:
-   *      this_row += dotmul(b_row_i, c_row_i)
-   * @endcode
-   */
-  void addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c);
-  void addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * c is vector (one row matrix)
-   *
-   * @code
-   * for each row i, do:
-   *    this_row_i += dotmul(b_row_i, c_row)
-   * @endcode
-   */
-  void addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c);
-  void addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this = p1 * this + p2 * b * c
-   * @endcode
-   */
-  void addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2);
-
-  /**
-   * @code
-   * this_row = b_row * c_row[cCol]
-   * @endcode
-   */
-  void rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-  void rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this_col = b_col * c_col[cRow]
-   * @endcode
-   */
-  void colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this_col += b_col * c_col[cRow]
-   * @endcode
-   */
-  void addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c);
-
-  /**
-   * @code
-   * this_row += b_row * c_row[cCol]
-   * @endcode
-   */
-  void addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  /// calculate the sum of each row of the matrix b.
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij}
-  void sumRows(BaseMatrixT& b, T scaleSum, T scaleDest);
-
-  /// calculate the maximum value of each row of the matrix b.
-  void maxRows(BaseMatrixT& b);
-  /// calculate the minimum value of each row of the matrix b.
-  void minRows(BaseMatrixT& b);
-
-  /// calculate the maximum value of each column of the matrix b.
-  void maxCols(BaseMatrixT& b);
-  /// calculate the minimum value of each column of the matrix b.
-  void minCols(BaseMatrixT& b);
-
-  /// calculate the sum of each column of the matrix b.
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
-  void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest);
-
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2
-  void sumOfSquaredDiffs(BaseMatrixT& b,
-                         BaseMatrixT& c,
-                         T scaleSum,
-                         T scaleDest);
-
-  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
-  void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c, T scaleSum, T scaleDest);
-
-  /**
-   * @code
-   * this_row = b_row + p * ones * c_row[cCol]
-   * @endcode
-   */
-  void rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p);
-  /**
-   * @code
-   * this_row = pow(b_row, c_row[cCol])
-   * @endcode
-   */
-  void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-
-  virtual bool isSparse() const { return false; }
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (useGpu_) {
-      TensorGpuApply<T>(*this, expr);
-    } else {
-      TensorCpuApply<T>(*this, expr);
-    }
-  }
-
-  template <typename ExpressionType>
-  void operator+=(const ExpressionType& expr) {
-    (*this) = (*this) + expr;
-  }
-  template <typename ExpressionType>
-  void operator-=(const ExpressionType& expr) {
-    (*this) = (*this) - expr;
-  }
-  template <typename ExpressionType>
-  void operator*=(const ExpressionType& expr) {
-    (*this) = (*this) * expr;
-  }
-  template <typename ExpressionType>
-  void operator/=(const ExpressionType& expr) {
-    (*this) = (*this) / expr;
-  }
-};
-
-typedef BaseMatrixT<real> BaseMatrix;
-typedef BaseMatrixT<int> IBaseMatrix;
-
-}  // namespace paddle
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
deleted file mode 100644
index 922fb5172273da24f9c48786961a6d850b1ed7c5..0000000000000000000000000000000000000000
--- a/paddle/math/CMakeLists.txt
+++ /dev/null
@@ -1,61 +0,0 @@
-# common package contains:
-#   * the utilities:
-#       * Thread Libs
-#       * Memory Manage libs
-#       * CommandLine Parser
-#       * Logging
-#       * Timer/Stats
-#   * the math libraries:
-#       * Matrix/Vector
-#   * the parameter optimizers.
-#   * the parameter updater functions.
-#
-# TODO(yuyang18): separate libs.
-#
-file(GLOB MATH_HEADERS . *.h)
-file(GLOB MATH_SOURCES . *.cpp)
-
-if(NOT WITH_MKLDNN)
-    set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
-    set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
-    list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
-    list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
-    message(STATUS "Skip compiling with MKLDNNMatrix")
-else()
-    message(STATUS "Compile with MKLDNNMatrix")
-endif()
-
-if(MOBILE_INFERENCE)
-    # Remove sparse
-    list(REMOVE_ITEM MATH_HEADERS
-         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
-    list(REMOVE_ITEM MATH_SOURCES
-         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
-         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
-endif()
-set(MATH_SOURCES
-    "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
-    "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
-    ${MATH_SOURCES})
-if(NOT WITH_GPU)
-    # then compile BaseMatrix.cu as c++ file
-    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu")
-    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu")
-    add_library(paddle_math STATIC
-        ${MATH_SOURCES})
-else()
-    cuda_add_library(paddle_math ${MATH_SOURCES})
-endif()
-
-
-
-add_style_check_target(paddle_math ${MATH_SOURCES})
-add_style_check_target(paddle_math ${MATH_HEADERS})
-
-add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
deleted file mode 100644
index 023450ffb794086399d7131ba5faa4dbefeaaf7d..0000000000000000000000000000000000000000
--- a/paddle/math/CpuSparseMatrix.cpp
+++ /dev/null
@@ -1,787 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "CpuSparseMatrix.h"
-#include "SparseMatrix.h"
-#include "float.h"
-#include "hl_gpu.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH;
-
-CpuSparseMatrix::CpuSparseMatrix(size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, false) {
-  resize(height, width, nnz, valueType, format);
-}
-
-CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(dataHandle, height, width, trans, false) {
-  resize(height, width, nnz, valueType, format);
-}
-
-CpuSparseMatrix::CpuSparseMatrix(real* data,
-                                 int* rows,
-                                 int* cols,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, false) {
-  cols_ = cols;
-  rows_ = rows;
-  value_ = data;
-  height_ = height;
-  width_ = width;
-  elementCnt_ = nnz;
-  valueType_ = valueType;
-  format_ = format;
-}
-
-void CpuSparseMatrix::resize(size_t newHeight,
-                             size_t newWidth,
-                             size_t newNnz,
-                             SparseValueType valueType,
-                             SparseFormat format) {
-  CHECK_LE(newNnz, newHeight * newWidth);
-  size_t newSize = 0;
-  if (format == SPARSE_CSR) {
-    newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int);
-  } else {
-    newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int);
-  }
-
-  if (NO_VALUE != valueType) {
-    newSize += newNnz * sizeof(real);
-  }
-
-  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
-    memoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newNnz;
-  valueType_ = valueType;
-  format_ = format;
-  sparseResize();
-}
-void CpuSparseMatrix::sparseResize() {
-  if (format_ == SPARSE_CSR) {
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()));
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-        (height_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-          (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-  } else {
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()));
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-        (width_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(memoryHandle_->getBuf()) +
-          (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-  }
-}
-
-void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) {
-  resize(newHeight,
-         newWidth,
-         newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth),
-         valueType_,
-         format_);
-}
-
-MatrixPtr CpuSparseMatrix::getTranspose() {
-  if (!memoryHandle_ && !value_) {
-    MatrixPtr dest(new CpuSparseMatrix(
-        height_, width_, elementCnt_, valueType_, format_, true));
-    return dest;
-  } else if (memoryHandle_) {
-    MatrixPtr dest(new CpuSparseMatrix(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle_),
-        height_,
-        width_,
-        elementCnt_,
-        valueType_,
-        format_,
-        true));
-    return dest;
-  } else if (value_) {
-    MatrixPtr dest(new CpuSparseMatrix(value_,
-                                       rows_,
-                                       cols_,
-                                       height_,
-                                       width_,
-                                       elementCnt_,
-                                       valueType_,
-                                       format_,
-                                       true));
-    return dest;
-  } else {
-    return NULL;
-  }
-}
-
-SparseValueType CpuSparseMatrix::getValueType() { return valueType_; }
-
-void CpuSparseMatrix::mul(const Matrix& a,
-                          const Matrix& b,
-                          real scaleAB,
-                          real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
-
-  if (a_ptr && b_ptr) {
-    CpuMatrix::mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, this, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CpuSparseMatrix::add3(CpuMatrix* b) {
-  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK(height_ == b->getHeight());
-  CHECK(width_ == b->getWidth());
-  real* A = getValue();
-  real* B = b->getData();
-  int* cols = getCols();
-  for (size_t i = 0; i < height_; i++) {
-    size_t start = getRowStartIdx(i);
-    size_t end = getRowStartIdx(i + 1);
-    for (size_t j = start; j < end; j++) {
-      A[j] = B[i * width_ + cols[j]];
-    }
-  }
-}
-
-void CpuSparseMatrix::add3(MatrixPtr b) {
-  if (dynamic_cast<CpuMatrix*>(b.get())) {
-    add3(dynamic_cast<CpuMatrix*>(b.get()));
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CpuSparseMatrix::addBias(Matrix& b, real scale) {
-  CHECK_EQ(b.getHeight(), (size_t)1);
-  CHECK_EQ(width_, b.getWidth());
-  real* A = getValue();
-  real* B = b.getData();
-  int* cols = getCols();
-  size_t nnz = getElementCnt();
-  for (size_t i = 0; i < nnz; i++) {
-    A[i] += scale * B[cols[i]];
-  }
-}
-
-template <class T>
-void printBuf(std::ostream& os, T* a, size_t len, const char* name) {
-  os << "\n: " << name << " [";
-  for (size_t i = 0; i < len; i++) {
-    os << a[i] << " ";
-  }
-  os << "]\n";
-}
-
-void CpuSparseMatrix::print(std::ostream& os) const {
-  size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1;
-  size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_;
-  printBuf(os, rows_, rowSize, "row");
-  printBuf(os, cols_, colSize, "col");
-  if (valueType_ == FLOAT_VALUE) {
-    printBuf(os, value_, elementCnt_, "value");
-  }
-  return;
-}
-
-void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, height_);
-  if (format_ == SPARSE_CSC) {
-    LOG(FATAL) << "SPARSE_CSC not supported";
-    return;
-  }
-
-  const int* col = getRowCols(idx);
-  size_t num = getColNum(idx);
-  if (num > 0) {
-    if (valueType_ == FLOAT_VALUE) {
-      const real* data = getRowValues(idx);
-      os << col[0] << ":" << data[0];
-      for (size_t i = 1; i < num; ++i) {
-        os << " " << col[i] << ":" << data[i];
-      }
-    } else {
-      os << col[0];
-      for (size_t i = 1; i < num; ++i) {
-        os << " " << col[i];
-      }
-    }
-  }
-  os << ";";
-}
-
-void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
-  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK_EQ(height_, b.getHeight());
-  CHECK_EQ(width_, b.getWidth());
-  real* A = getValue();
-  real* B = b.getValue();
-  if (b.getValueType() == FLOAT_VALUE) {
-    for (size_t i = 0; i < height_; i++) {
-      size_t start = getRowStartIdx(i);
-      size_t end = getRowStartIdx(i + 1);
-      CHECK_EQ(start, b.getRowStartIdx(i));
-      CHECK_EQ(end, b.getRowStartIdx(i + 1));
-      for (size_t j = start; j < end; j++) {
-        A[j] = B[j] * c.getElement(i, cCol);
-      }
-    }
-  } else if (b.getValueType() == NO_VALUE) {
-    for (size_t i = 0; i < height_; i++) {
-      size_t start = getRowStartIdx(i);
-      size_t end = getRowStartIdx(i + 1);
-      CHECK_EQ(start, b.getRowStartIdx(i));
-      CHECK_EQ(end, b.getRowStartIdx(i + 1));
-      for (size_t j = start; j < end; j++) {
-        A[j] = c.getElement(i, cCol);
-      }
-    }
-  }
-}
-
-void CpuSparseMatrix::randomizeUniform() {
-  CHECK_LE(elementCnt_, height_ * width_);
-  if (valueType_ == FLOAT_VALUE) {
-    real* data = getValue();
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      *data++ = rand() / static_cast<real>(RAND_MAX);  // NOLINT
-    }
-  }
-  if (format_ == SPARSE_CSR) {
-    sparseRand(rows_, cols_, elementCnt_, height_ + 1, width_, false);
-  } else {
-    sparseRand(cols_, rows_, elementCnt_, width_ + 1, height_, false);
-  }
-}
-
-void CpuSparseMatrix::copyFrom(std::vector<int>& rows,
-                               std::vector<int>& cols,
-                               std::vector<real>& values) {
-  size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size();
-  resize(height_, width_, size, valueType_, format_);
-  if (valueType_ == FLOAT_VALUE) {
-    memcpy(&value_[0], &values[0], sizeof(real) * values.size());
-  }
-  memcpy(&cols_[0], &cols[0], sizeof(int) * cols.size());
-  memcpy(&rows_[0], &rows[0], sizeof(int) * rows.size());
-}
-
-// Copy from a CpuMatrix, only supported in sparse_float_value_t
-// SparseMatrix.
-void CpuSparseMatrix::copyFrom(const CpuMatrix& src) {
-  CHECK_EQ(getHeight(), src.getHeight());
-  CHECK_EQ(getWidth(), src.getWidth());
-  CHECK(!src.trans_ && !trans_);
-  if (format_ == SPARSE_CSR) {
-    std::vector<int> rows(getHeight() + 1);
-    std::vector<int> cols;
-    std::vector<real> values;
-    rows[0] = 0;
-    for (size_t r = 0; r < getHeight(); ++r) {
-      for (size_t c = 0; c < getWidth(); ++c) {
-        real v = src.getElement(r, c);
-        if (fabs(v) > FLT_EPSILON) {
-          cols.push_back(c);
-          values.push_back(v);
-        }
-      }
-      rows[r + 1] = values.size();
-    }
-    copyFrom(rows, cols, values);
-  } else {
-    std::vector<int> cols(getWidth() + 1);
-    std::vector<int> rows;
-    std::vector<real> values;
-    cols[0] = 0;
-    for (size_t r = 0; r < getWidth(); ++r) {
-      for (size_t c = 0; c < getHeight(); ++c) {
-        real v = src.getElement(c, r);
-        if (fabs(v) > FLT_EPSILON) {
-          rows.push_back(c);
-          values.push_back(v);
-        }
-      }
-      cols[r + 1] = values.size();
-    }
-    copyFrom(rows, cols, values);
-  }
-}
-
-MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) {
-  if (height == 0 && width == 0) {
-    height = height_;
-    width = width_;
-  }
-  CHECK(width && height);
-  if (!useGpu) {
-    return std::make_shared<CpuSparseMatrix>(
-        height, width, 0, valueType_, format_);
-  } else {
-    return std::make_shared<GpuSparseMatrix>(
-        height, width, elementCnt_, valueType_, format_);
-  }
-}
-
-MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) {
-  CHECK_LE(startRow + numRows, height_);
-  CHECK_EQ(format_, SPARSE_CSR);
-  if (valueType_ == NO_VALUE) {
-    return std::make_shared<CpuSparseMatrix>(
-        nullptr,
-        rows_ + startRow,
-        cols_,
-        numRows,
-        width_,
-        rows_[startRow + numRows] - rows_[startRow],
-        valueType_,
-        format_,
-        trans_);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        value_,
-        rows_ + startRow,
-        cols_,
-        numRows,
-        width_,
-        rows_[startRow + numRows] - rows_[startRow],
-        valueType_,
-        format_,
-        trans_);
-  }
-}
-
-/* mem MUST be alloced outside (memAlloc=false) */
-void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  CHECK(!memAlloc);
-  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(matTrans.get());
-  if (format_ == SPARSE_CSR) {
-    /*statistic element number in each col*/
-    int* colCounters = mat->getRows() + 1;
-    memset(colCounters, 0, sizeof(int) * width_);
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      int col = cols_[i];
-      colCounters[col]++;
-    }
-    /*fill mat rows */
-    mat->getRows()[0] = 0;
-    for (size_t i = 1; i < width_ + 1; i++) {
-      mat->getRows()[i] = mat->getRows()[i - 1] + mat->getRows()[i];
-    }
-    /*fill mat values and cols*/
-    std::vector<int> colNumVec(width_, 0);
-    if (valueType_ == FLOAT_VALUE) {
-      for (size_t i = 0; i < height_; i++) {
-        for (int j = rows_[i]; j < rows_[i + 1]; j++) {
-          int colIdx = cols_[j];
-          int index = mat->getRows()[colIdx] + colNumVec[colIdx];
-          mat->getCols()[index] = i;
-          mat->getValue()[index] = value_[j];
-          colNumVec[colIdx]++;
-        }
-      }
-    } else {
-      for (size_t i = 0; i < height_; i++) {
-        for (int j = rows_[i]; j < rows_[i + 1]; j++) {
-          int colIdx = cols_[j];
-          int index = mat->getRows()[colIdx] + colNumVec[colIdx];
-          mat->getCols()[index] = i;
-          colNumVec[colIdx]++;
-        }
-      }
-    }
-  } else {
-    /*statistic element number in each row*/
-    int* rowCounters = mat->getCols() + 1;
-    memset(rowCounters, 0, sizeof(int) * height_);
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      int row = rows_[i];
-      rowCounters[row]++;
-    }
-
-    /*fill mat cols */
-    mat->getCols()[0] = 0;
-    for (size_t i = 1; i < height_ + 1; i++) {
-      mat->getCols()[i] = mat->getCols()[i - 1] + mat->getCols()[i];
-    }
-    /*fill mat values and rows*/
-    std::vector<int> rowNumVec(height_, 0);
-    if (valueType_ == FLOAT_VALUE) {
-      for (size_t i = 0; i < width_; i++) {
-        for (int j = cols_[i]; j < cols_[i + 1]; j++) {
-          int rowIdx = rows_[j];
-          int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx];
-          mat->getRows()[index] = i;
-          mat->getValue()[index] = value_[j];
-          rowNumVec[rowIdx]++;
-        }
-      }
-    } else {
-      for (size_t i = 0; i < width_; i++) {
-        for (int j = cols_[i]; j < cols_[i + 1]; j++) {
-          int rowIdx = rows_[j];
-          int index = mat->getCols()[rowIdx] + rowNumVec[rowIdx];
-          mat->getRows()[index] = i;
-          rowNumVec[rowIdx]++;
-        }
-      }
-    }
-  }
-}
-
-void CpuSparseMatrix::setRow(size_t row,
-                             size_t colNum,
-                             const unsigned int* cols,
-                             const real* values) {
-  if (format_ == SPARSE_CSR) {
-    CHECK_LT(row, height_);
-    CHECK(NULL != cols);
-    if (0 == row) {
-      rows_[row] = 0;
-    }
-    rows_[row + 1] = rows_[row] + colNum;
-    for (size_t i = 0; i < colNum; ++i) {
-      cols_[rows_[row] + i] = cols[i];
-    }
-    if (valueType_ == NO_VALUE) {
-      CHECK(!values);
-    } else {
-      for (size_t i = 0; i < colNum; ++i) {
-        value_[rows_[row] + i] = values[i];
-      }
-    }
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void CpuSparseMatrix::fillRowIndices(IVectorPtr& outVec) const {
-  if (format_ == SPARSE_CSR) {
-    auto nnz = getElementCnt();
-    IVector::resizeOrCreate(outVec, nnz, false);
-    auto out = outVec->getData();
-    int* rows = getRows();
-    for (size_t i = 0; i < height_; i++) {
-      for (int j = rows[i]; j < rows[i + 1]; j++) {
-        out[j] = i;
-      }
-    }
-  } else {
-    LOG(FATAL) << "SPARSE_CSC not supported";
-  }
-}
-
-ThreadLocal<std::vector<CpuSparseMatrixPtr>> CpuSparseMatrix::cpuLocalMats_;
-
-CpuSparseMatrixPtr CpuSparseMatrix::getTmpSparseMatrix(size_t height,
-                                                       size_t width) {
-  std::vector<CpuSparseMatrixPtr>* localMats = cpuLocalMats_.get();
-  auto it = localMats->begin();
-  while (it != localMats->end()) {
-    if (it->unique()) {
-      (*it)->resize(height, width, elementCnt_, valueType_, format_);
-      return *it;
-    }
-  }
-  localMats->emplace_back(std::make_shared<CpuSparseMatrix>(
-      height, width, elementCnt_, valueType_, format_, false));
-  return localMats->back();
-}
-
-void CpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  if (dynamic_cast<const GpuSparseMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const GpuSparseMatrix*>(&src);
-    copyFrom(*tmpSrc, stream);
-  } else if (dynamic_cast<const CpuSparseMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuSparseMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else if (dynamic_cast<const CpuMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else {
-    LOG(FATAL) << "not implemented";
-  }
-}
-
-void CpuSparseMatrix::copyFrom(const Matrix& src) {
-  if (dynamic_cast<const CpuSparseMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuSparseMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else if (dynamic_cast<const CpuMatrix*>(&src)) {
-    auto tmpSrc = dynamic_cast<const CpuMatrix*>(&src);
-    copyFrom(*tmpSrc);
-  } else {
-    LOG(FATAL) << "not implemented";
-  }
-}
-
-void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) {
-  CHECK_EQ(height_, src.getHeight());
-  CHECK_EQ(width_, src.getWidth());
-  CHECK_EQ(size_t(elementCnt_), src.getElementCnt());
-  size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_;
-  if (format_ == SPARSE_CSC)
-    hl_memcpy_from_csc_matrix(value_,
-                              valSize,
-                              rows_,
-                              elementCnt_,
-                              cols_,
-                              width_ + 1,
-                              src.sMatrix_.get(),
-                              stream);
-  else
-    hl_memcpy_from_csr_matrix(value_,
-                              valSize,
-                              rows_,
-                              height_ + 1,
-                              cols_,
-                              elementCnt_,
-                              src.sMatrix_.get(),
-                              stream);
-}
-
-void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) {
-  CHECK_EQ(height_, src.getHeight());
-  CHECK_EQ(width_, src.getWidth());
-  CHECK_EQ(format_, src.getFormat());
-  int start = format_ == SPARSE_CSR ? src.getRows()[0] : src.getCols()[0];
-  if (format_ == SPARSE_CSR) {
-    size_t totalColNum = 0;
-    for (size_t i = 0; i < height_; ++i) {
-      totalColNum += src.getColNum(i);
-    }
-    resize(height_, width_, totalColNum, valueType_, format_);
-    rows_[0] = 0;
-    for (size_t i = 0; i < height_; ++i) {
-      rows_[i + 1] = rows_[i] + src.getColNum(i);
-    }
-    memcpy(cols_, src.getCols() + start, totalColNum * sizeof(int));
-  } else {
-    size_t totalColNum = 0;
-    for (size_t i = 0; i < width_; ++i) {
-      totalColNum += src.getRowNum(i);
-    }
-    resize(height_, width_, totalColNum, valueType_, format_);
-    cols_[0] = 0;
-    for (size_t i = 0; i < width_; ++i) {
-      cols_[i + 1] = cols_[i] + src.getRowNum(i);
-    }
-    memcpy(rows_, src.getRows() + start, totalColNum * sizeof(int));
-  }
-
-  // if have different value type, only copy rows and cols
-  if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) {
-    memcpy(value_, src.getValue() + start, elementCnt_ * sizeof(real));
-  }
-}
-
-void CpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_non_value_t* row) {
-  for (size_t j = 0; j < colNum; j++) {
-    cols_[offsets + j] = row[j].col;
-  }
-}
-
-void CpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_float_value_t* row) {
-  for (size_t j = 0; j < colNum; j++) {
-    cols_[offsets + j] = row[j].col;
-    value_[offsets + j] = row[j].value;
-  }
-}
-
-template <class T>
-void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data) {
-  size_t totalColNum = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    int64_t id = ids[i];
-    totalColNum += indices[id + 1] - indices[id];
-  }
-  valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE;
-
-  resize(height_, width_, totalColNum, valueType_, format_);
-
-  rows_[0] = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    int64_t id = ids[i];
-    T* row = data + indices[id];
-    size_t colNum = indices[id + 1] - indices[id];
-    rows_[i + 1] = rows_[i] + colNum;
-    copyRow(rows_[i], colNum, row);
-  }
-}
-
-template <class T>
-void CpuSparseMatrix::copyFrom(int64_t* indices, T* data) {
-  CHECK(format_ == SPARSE_CSR);
-  size_t totalColNum = indices[height_] - indices[0];
-  valueType_ = typeid(T) == typeid(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE;
-  resize(height_, width_, totalColNum, valueType_, format_);
-
-  rows_[0] = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    T* row = data + indices[i];
-    size_t colNum = indices[i + 1] - indices[i];
-    rows_[i + 1] = rows_[i] + colNum;
-    copyRow(rows_[i], colNum, row);
-  }
-}
-
-void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
-  CHECK_EQ(height_, src.getHeight());
-  CHECK_LE(width_, src.getWidth());
-  CHECK_EQ(format_, src.getFormat());
-  CHECK_EQ(valueType_, src.getValueType());
-  if (format_ == SPARSE_CSR) {
-    int* srcCols = src.getCols();
-    size_t numLessWidth =
-        std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) {
-          return n < this->width_;
-        });
-    resize(height_, width_, numLessWidth, valueType_, format_);
-    rows_[0] = 0;
-    size_t index = 0;
-    for (size_t r = 0; r < height_; ++r) {
-      for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) {
-        if (srcCols[i] < static_cast<int>(width_)) {
-          cols_[index] = srcCols[i];
-          if (valueType_ == FLOAT_VALUE) {
-            value_[index] = src.getValue()[i];
-          }
-          ++index;
-        }
-      }
-      rows_[r + 1] = index;
-    }
-    CHECK_EQ(index, numLessWidth);
-  } else {
-    size_t numLessWidth = src.getCols()[width_] - src.getCols()[0];
-    resize(height_, width_, numLessWidth, valueType_, format_);
-    cols_[0] = 0;
-    size_t index = 0;
-    // note: c < width_, not src.getWidth();
-    for (size_t c = 0; c < width_; ++c) {
-      for (int i = src.getCols()[c]; i < src.getCols()[c + 1]; ++i) {
-        rows_[index] = src.getRows()[i];
-        if (valueType_ == FLOAT_VALUE) {
-          value_[index] = src.getValue()[i];
-        }
-        ++index;
-      }
-      cols_[c + 1] = index;
-    }
-    CHECK_EQ(index, numLessWidth);
-  }
-}
-
-void CpuSparseMatrix::zeroMem() {
-  CHECK(valueType_ == FLOAT_VALUE);
-  memset(value_, 0, elementCnt_ * sizeof(real));
-}
-
-template void CpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_non_value_t* data);
-
-template void CpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_float_value_t* data);
-
-template void CpuSparseMatrix::copyFrom(int64_t* indices,
-                                        sparse_non_value_t* data);
-
-template void CpuSparseMatrix::copyFrom(int64_t* indices,
-                                        sparse_float_value_t* data);
-
-void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  maxVal.zeroMem();
-  int* outids = maxIds.getData();
-  real* outvalues = maxVal.getData();
-
-  typedef std::pair<real, size_t> valuepair;
-  std::vector<valuepair> vec;
-  for (size_t i = 0; i < numSamples; i++) {
-    vec.clear();
-
-    auto num = getColNum(i);
-    auto ids = getRowCols(i);
-    auto values = getRowValues(i);
-    for (size_t j = 0; j < num; j++) {
-      vec.push_back(std::make_pair(values[j], ids[j]));
-    }
-
-    size_t outsize = std::min(num, beam);
-    std::partial_sort(vec.begin(),
-                      vec.begin() + outsize,
-                      vec.end(),
-                      [](const valuepair& a, const valuepair& b) {
-                        return a.first > b.first;
-                      });
-    for (size_t j = 0; j < outsize; j++) {
-      outids[i * beam + j] = vec[j].second;
-      outvalues[i * beam + j] = vec[j].first;
-    }
-    if (outsize < beam) {
-      // if the number of values to sort are less than the output size,
-      // use -1 to indicate the end of valid sorted values.
-      outids[i * beam + outsize] = -1;
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
deleted file mode 100644
index 22b6b71688bd555cf8bf8a29088ad01b092d67cf..0000000000000000000000000000000000000000
--- a/paddle/math/CpuSparseMatrix.h
+++ /dev/null
@@ -1,377 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-#include <cstddef>
-#include "Matrix.h"
-
-namespace paddle {
-
-class CpuSparseMatrix : public Matrix {
-public:
-  CpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format = SPARSE_CSR,
-                  bool trans = false);
-
-  CpuSparseMatrix(CpuMemHandlePtr memHandle,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans);
-
-  CpuSparseMatrix(real* data,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans);
-
-  ~CpuSparseMatrix() {}
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format);
-  void resize(size_t newHeight, size_t newWidth);
-
-  MatrixPtr getTranspose();
-
-  SparseValueType getValueType();
-
-  real* getRowValues(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return value_ + rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  int* getRowCols(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return cols_ + rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  /// fill row indices of each value in CSR matrix
-  void fillRowIndices(IVectorPtr& outVec) const;
-
-  size_t getColNum(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return rows_[i + 1] - rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  real* getColumn(size_t i) const {
-    if (format_ == SPARSE_CSC) {
-      return value_ + cols_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSR not supported";
-      return 0;
-    }
-  }
-
-  size_t getColStartIdx(size_t i) const {
-    if (format_ == SPARSE_CSC) {
-      return cols_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSR not supported";
-      return 0;
-    }
-  }
-
-  size_t getRowStartIdx(size_t i) const {
-    if (format_ == SPARSE_CSR) {
-      return rows_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSC not supported";
-      return 0;
-    }
-  }
-
-  size_t getRowNum(size_t i) const {
-    if (format_ == SPARSE_CSC) {
-      return cols_[i + 1] - cols_[i];
-    } else {
-      LOG(FATAL) << "SPARSE_CSR not supported";
-      return 0;
-    }
-  }
-
-  virtual real getSum() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return elementCnt_;
-    }
-    double sum = 0;
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      sum += value_[i];
-    }
-    return sum;
-  }
-
-  virtual void square2() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return;
-    }
-    for (size_t i = 0; i < elementCnt_; ++i) {
-      value_[i] = value_[i] * value_[i];
-    }
-  }
-
-  /**
-   * only consider nonzero values.
-   * the actual min value should compare with 0.0.
-   */
-  virtual real getMin() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return (elementCnt_ > 0 ? 1.0 : 0.0);
-    }
-    real min = value_[0];
-    for (size_t i = 1; i < elementCnt_; ++i) {
-      min = value_[i] < min ? value_[i] : min;
-    }
-    return min;
-  }
-
-  /**
-   * only consider nonzero values.
-   * the actual max value should compare with 0.0.
-   */
-  virtual real getMax() {
-    CHECK(isContiguous());
-    if (valueType_ == NO_VALUE) {
-      return (elementCnt_ > 0 ? 1.0 : 0.0);
-    }
-    real max = value_[0];
-    for (size_t i = 1; i < elementCnt_; ++i) {
-      max = value_[i] > max ? value_[i] : max;
-    }
-    return max;
-  }
-
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-  int* getRows() const { return rows_; }
-  int* getCols() const { return cols_; }
-  real* getValue() const { return value_; }
-  SparseFormat getFormat() const { return format_; }
-  SparseValueType getValueType() const { return valueType_; }
-
-  /**
-   * @brief return value_ of sparse matrix
-   *
-   * Some times CpuSparseMatrix maybe Matrix,
-   * if getValue, must dynamic_cast to CpuSparseMatrix,
-   * getData is convenient to get value
-   */
-  real* getData() { return getValue(); }
-  const real* getData() const { return getValue(); }
-
-  /**
-   * @brief only set value_ of FLOAT_VALUE sparse matrix to zero
-   */
-  void zeroMem();
-
-  /// mem MUST be alloced outside (memAlloc=false)
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-
-  void mul(const Matrix& A, const Matrix& B, real alpha, real beta);
-
-  /**
-   * @brief sparseMatrix += denseMatrix
-   *
-   *  Named add3 just because add/add2 has been used in BaseMatrix.cu
-   *  and they are not virtual function.
-   *
-   *  Only add value of same (row, col) index in dense matrix
-   *  and do not use others values whoes postions are not in sparse matirx.
-   *
-   * @param[in]  b   dense matrix
-   */
-  void add3(CpuMatrix* b);
-  void add3(MatrixPtr b);
-
-  /**
-   * @brief sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)
-   *
-   * @param[in]  b      bias, dense matrix and height = 1
-   * @param[in]  scale  scale of b
-   */
-  void addBias(Matrix& b, real scale);
-
-  void print(std::ostream& os) const;
-
-  void printOneRow(std::ostream& os, size_t idx) const;
-
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values);
-
-  /**
-   * @brief this_row = b_row * c_row[cCol]
-   *
-   * @param[in]  cCol   the column of matrix c used to scale each row of b
-   * @param[in]  b      CpuSparseMatrix
-   * @param[in]  c      Matrix
-   */
-  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
-
-  void randomizeUniform();
-
-  void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream = HPPL_STREAM_DEFAULT);
-
-  void copyFrom(const Matrix& src);
-
-  /**
-   * Get a temporary matrix. This is threadsafe. It should be only used
-   * temporarily, i.e. do not store it or use it as return value.
-   *
-   * @note  Do NOT use large amount of tmp matrix.
-   */
-  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width);
-
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows);
-
-  void copyFrom(std::vector<int>& rows,
-                std::vector<int>& cols,
-                std::vector<real>& values);
-
-  void copyFrom(const CpuMatrix& src);
-
-  void copyFrom(const CpuSparseMatrix& src);
-
-  // trim the large size
-  void trimFrom(const CpuSparseMatrix& src);
-
-  void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
-
-  void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
-
-  template <class T>
-  void copyFrom(int64_t* ids, int64_t* indices, T* data);
-
-  template <class T>
-  void copyFrom(int64_t* indices, T* data);
-
-  void copyFrom(const real* data, size_t len) {
-    LOG(FATAL) << "not supported!";
-  }
-
-private:
-  MatrixPtr clone(size_t height = 0, size_t width = 0, bool useGpu = false);
-
-protected:
-  void sparseResize();
-  /*for csr , record row start position, for csc, record row index for every no
-   * zero value*/
-  int* rows_;
-  /*for csc , record col start position, for csr, record col index for every no
-   * zero value*/
-  int* cols_;
-  real* value_;               /*nonzero value*/
-  SparseFormat format_;       /* matrix format */
-  SparseValueType valueType_; /*with value or not  */
-  static const size_t DEFAULT_AVG_WIDTH = 20;
-
-  static ThreadLocal<std::vector<CpuSparseMatrixPtr>> cpuLocalMats_;
-
-  // BaseMatrixT interface
-public:
-  bool isSparse() const { return true; }
-
-private:
-  using Matrix::mul;
-  using Matrix::copyFrom;
-  using Matrix::rowMax;
-  using Matrix::print;
-  using Matrix::subMatrix;
-};
-}  // namespace paddle
-
-#else
-
-#include "Matrix.h"
-
-namespace paddle {
-
-class CpuSparseMatrix : public Matrix {
-public:
-  CpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format = SPARSE_CSR,
-                  bool trans = false)
-      : Matrix(NULL, height, width, trans, false) {}
-
-  CpuSparseMatrix(real* data,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans)
-      : Matrix(NULL, height, width, trans, false) {}
-
-  real* getValue() const { return nullptr; }
-  size_t getColStartIdx(size_t i) const { return 0; }
-  size_t getRowStartIdx(size_t i) const { return 0; }
-  size_t getColNum(size_t i) const { return 0; }
-  int* getRowCols(size_t i) const { return nullptr; }
-
-  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
-    return nullptr;
-  }
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {}
-  void resize(size_t newHeight, size_t newWidth) {}
-  MatrixPtr getTranspose() { return nullptr; }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {}
-};
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/math/ExecViaCpu.h b/paddle/math/ExecViaCpu.h
deleted file mode 100644
index 9b2a3c2b8accd384aac896e86ef8315a744633e1..0000000000000000000000000000000000000000
--- a/paddle/math/ExecViaCpu.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- execViaCpu is used to do operations on GpuMatirx and/or GpuIVector through
- cpu functions. It can automatically make a temporary CPU copy for the
- gpu matrix/vector, and copy back after executing the CPU function.
-
- Examples:
- 1. For a function, functor or lambda:
-   r = execViaCpu(&f, mat, vec)
-
- 2. For member function of CpuMatirx, execViaCpu2 should be used:
-   execViaCpu2(&CpuMatrix::selectElements, *this, table, ids)
-*/
-
-#pragma once
-
-namespace paddle {
-
-template <typename Arg>
-class CopyToCpu {
-public:
-  explicit CopyToCpu(Arg& arg) : arg_(arg) {}
-  Arg& copiedArg() const { return arg_; }
-
-private:
-  Arg& arg_;
-};
-
-template <>
-class CopyToCpu<Matrix> {
-public:
-  explicit CopyToCpu(Matrix& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      CHECK(!arg.isTransposed()) << "Not supported";
-      copied_ = Matrix::create(arg.getHeight(),
-                               arg.getWidth(),
-                               /* trans= */ false,
-                               /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  ~CopyToCpu() {
-    if (copied_) {
-      arg_.copyFrom(*copied_);
-    }
-  }
-  Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
-private:
-  Matrix& arg_;
-  MatrixPtr copied_;
-};
-
-template <>
-class CopyToCpu<const Matrix> {
-public:
-  explicit CopyToCpu(const Matrix& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      CHECK(!arg.isTransposed()) << "Not supported";
-      copied_ = Matrix::create(arg.getHeight(),
-                               arg.getWidth(),
-                               /* trans= */ false,
-                               /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  const Matrix& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
-private:
-  const Matrix& arg_;
-  MatrixPtr copied_;
-};
-
-template <>
-class CopyToCpu<IVector> {
-public:
-  explicit CopyToCpu(IVector& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  ~CopyToCpu() {
-    if (copied_) {
-      arg_.copyFrom(*copied_);
-    }
-  }
-  IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
-private:
-  IVector& arg_;
-  IVectorPtr copied_;
-};
-
-template <>
-class CopyToCpu<const IVector> {
-public:
-  explicit CopyToCpu(const IVector& arg) : arg_(arg) {
-    if (arg.useGpu()) {
-      copied_ = IVector::create(arg.getSize(), /* useGpu= */ false);
-      copied_->copyFrom(arg);
-    }
-  }
-  const IVector& copiedArg() const { return copied_ ? *copied_ : arg_; }
-
-private:
-  const IVector& arg_;
-  IVectorPtr copied_;
-};
-
-namespace detail {
-
-template <bool isFunction, bool isFunctionPointer, bool isClass, typename F>
-class GpuFuncWrapperImp;
-
-template <typename F, typename R, typename... Args>
-class GpuFuncWrapperBase {
-public:
-  typedef R ResultType;
-  R operator()(F&& f, Args... args) {
-    return f(CopyToCpu<typename std::remove_reference<Args>::type>(args)
-                 .copiedArg()...);
-  }
-};
-
-// function
-template <typename R, typename... Args>
-class GpuFuncWrapperImp<true, false, false, R(Args...)>
-    : public GpuFuncWrapperBase<R(Args...), R, Args...> {};
-
-// function pointer
-template <typename R, typename... Args>
-class GpuFuncWrapperImp<false, true, false, R (*)(Args...)>
-    : public GpuFuncWrapperBase<R (*)(Args...), R, Args...> {};
-
-template <typename F, typename Op>
-class GpuFuncWrapperImp2;
-
-template <typename F, typename C, typename R, typename... Args>
-class GpuFuncWrapperImp2<F, R (C::*)(Args...) const>
-    : public GpuFuncWrapperBase<F, R, Args...> {};
-
-template <typename F, typename C, typename R, typename... Args>
-class GpuFuncWrapperImp2<F, R (C::*)(Args...)>
-    : public GpuFuncWrapperBase<F, R, Args...> {};
-
-// functor or lambda
-template <typename F>
-class GpuFuncWrapperImp<false, false, true, F>
-    : public GpuFuncWrapperImp2<F, decltype(&F::operator())> {};
-
-template <typename F>
-class GpuFuncWrapper2
-    : public GpuFuncWrapperImp<
-          std::is_function<F>::value,
-          std::is_pointer<F>::value &&
-              std::is_function<typename std::remove_pointer<F>::type>::value,
-          std::is_class<F>::value,
-          F> {};
-
-template <typename F>
-class GpuFuncWrapper
-    : public GpuFuncWrapper2<typename std::remove_reference<F>::type> {};
-
-}  // namespace detail
-
-template <typename F, typename... Args>
-typename detail::GpuFuncWrapper<F>::ResultType execViaCpu(F&& f,
-                                                          Args&&... args) {
-  return detail::GpuFuncWrapper<F>()(std::move(f), args...);
-}
-
-// The second version is for F as member function of CpuMatrix
-template <typename R, typename... FArgs, typename... Args>
-R execViaCpu2(R (CpuMatrix::*f)(FArgs...), Args&&... args) {
-  auto lambda = [](R (CpuMatrix::*f)(FArgs...), Matrix& ths, FArgs... args) {
-    return (((CpuMatrix&)ths).*f)(args...);
-  };
-  return execViaCpu(lambda, f, args...);
-}
-
-}  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
deleted file mode 100644
index e1fb81679adf4658a58ceee73c8d5da6c0b61050..0000000000000000000000000000000000000000
--- a/paddle/math/MKLDNNMatrix.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "Matrix.h"
-#include "mkldnn.hpp"
-#include "paddle/parameter/Parameter.h"
-
-namespace paddle {
-
-class MKLDNNMatrix;
-typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
-
-#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...)                        \
-  CHECK(MAT) << " can not be empty.";                                \
-  CHECK(MAT->getPrimitiveDesc() == PD)                               \
-      << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \
-      << "" __VA_ARGS__;
-
-/**
- * @brief MKLDNN Matrix.
- *
- */
-class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
-public:
-  MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
-      : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
-        mkldnn::memory(pd, m->getData()),
-        m_(m) {}
-
-  ~MKLDNNMatrix() {}
-
-  /**
-   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
-   */
-  static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
-                                MatrixPtr m = nullptr);
-
-  /**
-   * Create MKLDNNMatrix from a MatrixPtr and memory details info
-   */
-  static MKLDNNMatrixPtr create(
-      mkldnn::memory::dims dims,
-      mkldnn::memory::format fmt,
-      mkldnn::engine& eg,
-      MatrixPtr m = nullptr,
-      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
-
-  /**
-   * Create primitive descriptor.
-   * default with f32 dtype
-   */
-  static mkldnn::memory::primitive_desc createPrimitiveDesc(
-      const mkldnn::memory::dims dims,
-      const mkldnn::memory::format& fmt,
-      const mkldnn::engine& eg,
-      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
-    return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
-  }
-
-  /**
-   * Create Memory descriptor.
-   * default with any format and f32 dtype
-   */
-  static mkldnn::memory::desc createMemoryDesc(
-      const mkldnn::memory::dims dims,
-      const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
-      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
-    return mkldnn::memory::desc(dims, dtype, fmt);
-  }
-
-  /**
-   * Create reorder primitive.
-   * Create a mkldnn::reorder handle for converting src MKLDNNMatrix to dst.
-   * checkData: whether to check the data handle of src and dst.
-   *            if true, it will check the data and do not allow them equal;
-   *            otherwise, it will not check them, then the reorder created
-   *            may have inplace buffer.
-   *            Do not set false, if you can not guarantee the inplace logical
-   *            would work with your reorder.
-   */
-  static std::shared_ptr<mkldnn::reorder> createReorder(
-      const MKLDNNMatrixPtr& src,
-      const MKLDNNMatrixPtr& dst,
-      bool checkData = true);
-
-  void copyFrom(const Matrix& src) {
-    // TODO(TJ): reorder data if this format is not nchw or x
-    m_->copyFrom(src);
-  }
-
-  void copyTo(Matrix& dst) {
-    // TODO(TJ): reorder data if this format is not nchw or x
-    dst.copyFrom(*m_);
-  }
-
-public:
-  /**
-   * Reorder this MKLDNNMatrix from other format.
-   * Support inplace reorder.
-   * @note: this function would only reorder the data layout.
-   *        will NOT change this original dim or format info
-   */
-  void reorderDataFrom(const MKLDNNMatrixPtr& m,
-                       memory::format srcFmt,
-                       memory::dims targetDim);
-
-  /**
-   * Reorder this MKLDNNMatrix to other format.
-   * Support inplace reorder.
-   * @note: this function would only reorder the data layout.
-   *        will NOT change the dst dim or format info
-   */
-  void reorderDataTo(const MKLDNNMatrixPtr& m,
-                     memory::format dstFmt,
-                     memory::dims targetDim);
-
-  /**
-   * Dimensionality reduction.
-   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
-   */
-  void downSpatial();
-
-  /**
-   * set the memory data handle.
-   * Caution: This will not check the buffer size of the data,
-   *          it should be coverd by user.
-   */
-  void setData(real* data) {
-    set_data_handle(data);
-    CpuMatrix::setData(data);
-    m_.reset();
-  }
-
-  /**
-   * override the CpuMatrix::resize
-   */
-  void resize(size_t newHeight, size_t newWidth) override {
-    m_->resize(newHeight, newWidth);
-    if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
-      return;
-    }
-    CpuMatrix::setData(data_);
-    height_ = newHeight;
-    width_ = newWidth;
-    elementCnt_ = newHeight * newWidth;
-    stride_ = width_;
-    auto pd = mkldnn::memory::primitive_desc(
-        mkldnn::memory::desc({(int)newHeight, (int)newWidth},
-                             getDtype(),
-                             mkldnn::memory::format::nc),
-        getEngine());
-    resetMKLDNNMemory(pd, data_);
-  }
-
-  /**
-   * override Matrix::getData
-   * check data before return
-   */
-  real* getData() override {
-    CHECK_EQ((void*)data_, get_data_handle());
-    return data_;
-  }
-
-  const real* getData() const override {
-    CHECK_EQ((void*)data_, get_data_handle());
-    return data_;
-  }
-
-  /**
-   * Get primitive descriptor.
-   */
-  mkldnn::memory::primitive_desc getPrimitiveDesc() {
-    return this->get_primitive_desc();
-  }
-
-  /**
-   * Get memory descriptor.
-   */
-  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
-
-  /**
-   * Get dimensions.
-   */
-  mkldnn::memory::dims getDims() {
-    mkldnn::memory::desc md = getMemoryDesc();
-    const int* src = md.data.dims;
-    int ndims = md.data.ndims;
-    mkldnn::memory::dims dst;
-    dst.resize(ndims);
-    for (int i = 0; i < ndims; ++i) {
-      dst[i] = src[i];
-    }
-    return dst;
-  }
-
-  /**
-   * Get format.
-   */
-  mkldnn::memory::format getFormat() {
-    return (mkldnn::memory::format)(getMemoryDesc().data.format);
-  }
-
-  /**
-   * Get memory data type.
-   */
-  mkldnn::memory::data_type getDtype() {
-    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
-  }
-
-  /**
-   * Get engine.
-   */
-  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
-
-protected:
-  /**
-   * Do reorder once.
-   * Can support inplace.
-   */
-  void reorderOnce(void* srcData,
-                   void* dstData,
-                   memory::format srcFmt,
-                   memory::format dstFmt,
-                   memory::dims dm);
-  /**
-   * reset this MKLDNN Memory from primitve desc
-   */
-  void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
-    mkldnn_primitive_t result;
-    mkldnn::error::wrap_c_api(
-        mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
-        "could not create a memory primitive");
-    reset(result);
-    set_data_handle(data);
-  }
-
-private:
-  // save the CpuMatrixPtr in case the buffer released outside
-  CpuMatrixPtr m_;
-};
-
-}  // namespace paddle
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
deleted file mode 100644
index b2ff4bc3232a8e5d5d7b49bf49c62fe756d303f4..0000000000000000000000000000000000000000
--- a/paddle/math/MathFunctions.cpp
+++ /dev/null
@@ -1,328 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MathFunctions.h"
-#include "hl_matrix_apply.cuh"
-#include "hl_matrix_ops.cuh"
-#include "paddle/utils/DynamicLoader.h"
-
-namespace dynload {
-
-std::once_flag lapack_dso_flag;
-void* lapack_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load lapack routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-
-// The argument for stringizing operator is not macro-expanded first.
-// We have to use two levels of macro to do the expansion.
-// See https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html
-#define STR(x) #x
-
-// clang-format off
-#ifndef LAPACK_FOUND
-#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      using lapack_func = decltype(__name(args...)) (*)(Args...);              \
-      std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \
-      void* p_##__name = dlsym(lapack_dso_handle, STR(__name));                \
-      CHECK(p_##__name) << "Cannot find symbol " << STR(__name)                \
-                        << " in liblapack.so";                                 \
-      return reinterpret_cast<lapack_func>(p_##__name)(args...);               \
-    }                                                                          \
-  } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      return __name(args...);                                                  \
-    }                                                                          \
-  } __name;  // struct DynLoad__##__name
-#endif
-
-#ifdef PADDLE_USE_ATLAS
-  #define  PADDLE_SGETRF  clapack_sgetrf
-  #define  PADDLE_DGETRF  clapack_dgetrf
-  #define  PADDLE_SGETRI  clapack_sgetri
-  #define  PADDLE_DGETRI  clapack_dgetri
-#else
-  #define  PADDLE_SGETRF  LAPACKE_sgetrf
-  #define  PADDLE_DGETRF  LAPACKE_dgetrf
-  #define  PADDLE_SGETRI  LAPACKE_sgetri
-  #define  PADDLE_DGETRI  LAPACKE_dgetri
-#endif
-
-#define LAPACK_ROUTINE_EACH(__macro)       \
-  __macro(PADDLE_SGETRF)                   \
-  __macro(PADDLE_DGETRF)                   \
-  __macro(PADDLE_SGETRI)                   \
-  __macro(PADDLE_DGETRI)
-// clang-format on
-
-LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
-
-}  // namespace dynload
-
-namespace paddle {
-
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-template <>
-void gemm<float>(const CBLAS_TRANSPOSE transA,
-                 const CBLAS_TRANSPOSE transB,
-                 const int M,
-                 const int N,
-                 const int K,
-                 const float alpha,
-                 const float* A,
-                 const int lda,
-                 const float* B,
-                 const int ldb,
-                 const float beta,
-                 float* C,
-                 const int ldc) {
-  cblas_sgemm(CblasRowMajor,
-              transA,
-              transB,
-              M,
-              N,
-              K,
-              alpha,
-              A,
-              lda,
-              B,
-              ldb,
-              beta,
-              C,
-              ldc);
-}
-
-template <>
-void gemm<double>(const CBLAS_TRANSPOSE transA,
-                  const CBLAS_TRANSPOSE transB,
-                  const int M,
-                  const int N,
-                  const int K,
-                  const double alpha,
-                  const double* A,
-                  const int lda,
-                  const double* B,
-                  const int ldb,
-                  const double beta,
-                  double* C,
-                  const int ldc) {
-  cblas_dgemm(CblasRowMajor,
-              transA,
-              transB,
-              M,
-              N,
-              K,
-              alpha,
-              A,
-              lda,
-              B,
-              ldb,
-              beta,
-              C,
-              ldc);
-}
-#endif
-
-template <>
-int getrf<float>(const CBLAS_ORDER order,
-                 const int M,
-                 const int N,
-                 float* A,
-                 const int lda,
-                 int* ipiv) {
-  return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv);
-}
-
-template <>
-int getrf<double>(const CBLAS_ORDER order,
-                  const int M,
-                  const int N,
-                  double* A,
-                  const int lda,
-                  int* ipiv) {
-  return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv);
-}
-
-template <>
-int getri<float>(const CBLAS_ORDER order,
-                 const int N,
-                 float* A,
-                 const int lda,
-                 const int* ipiv) {
-  return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv);
-}
-
-template <>
-int getri<double>(const CBLAS_ORDER order,
-                  const int N,
-                  double* A,
-                  const int lda,
-                  const int* ipiv) {
-  return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
-}
-
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-template <>
-void axpy<float>(const int n, const float alpha, const float* x, float* y) {
-  cblas_saxpy(n, alpha, x, 1, y, 1);
-}
-
-template <>
-void axpy<double>(const int n, const double alpha, const double* x, double* y) {
-  cblas_daxpy(n, alpha, x, 1, y, 1);
-}
-
-template <>
-float dotProduct<float>(const int n, const float* x, const float* y) {
-  return cblas_sdot(n, x, 1, y, 1);
-}
-
-template <>
-double dotProduct<double>(const int n, const double* x, const double* y) {
-  return cblas_ddot(n, x, 1, y, 1);
-}
-#endif
-
-#if defined(PADDLE_WITH_MKLML)
-
-template <>
-void vExp<float>(const int n, const float* a, float* r) {
-  vsExp(n, a, r);
-}
-
-template <>
-void vExp<double>(const int n, const double* a, double* r) {
-  vdExp(n, a, r);
-}
-
-template <>
-void vPow<float>(const int n, const float* a, const float b, float* r) {
-  vsPowx(n, a, b, r);
-}
-
-template <>
-void vPow<double>(const int n, const double* a, const double b, double* r) {
-  vdPowx(n, a, b, r);
-}
-
-template <>
-void vLog<float>(const int n, const float* a, float* r) {
-  vsLn(n, a, r);
-}
-
-template <>
-void vLog<double>(const int n, const double* a, double* r) {
-  vdLn(n, a, r);
-}
-
-template <>
-void vAdd<float>(const int n, const float* a, const float* b, float* r) {
-  vsAdd(n, a, b, r);
-}
-
-template <>
-void vAdd<double>(const int n, const double* a, const double* b, double* r) {
-  vdAdd(n, a, b, r);
-}
-#else
-
-DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
-template <class T>
-void vExp(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
-      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
-template <class T>
-void vLog(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
-      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
-template <class T>
-void vPow(const int n, const T* a, const T b, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
-      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
-template <class T>
-void vAdd(const int n, const T* a, const T* b, T* r) {
-  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
-                                                     const_cast<T*>(a),
-                                                     const_cast<T*>(b),
-                                                     r,
-                                                     1,
-                                                     n,
-                                                     n,
-                                                     n,
-                                                     n);
-}
-
-template void vExp(const int n, const float* a, float* r);
-template void vExp(const int n, const double* a, double* r);
-template void vLog(const int n, const float* a, float* r);
-template void vLog(const int n, const double* a, double* r);
-template void vPow(const int n, const float* a, const float b, float* r);
-template void vPow(const int n, const double* a, const double b, double* r);
-template void vAdd(const int n, const float* a, const float* b, float* r);
-template void vAdd(const int n, const double* a, const double* b, double* r);
-
-#endif
-
-DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
-template <class T>
-void vInvSqrt(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vInvSqrt<T>, 0, 0>(
-      binary::vInvSqrt<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a));
-template <class T>
-void vLog1p(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog1p<T>, 0, 0>(
-      binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a;
-                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
-template <class T>
-void vTanh(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
-      binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-template void vInvSqrt(const int n, const double* a, double* r);
-template void vInvSqrt(const int n, const float* a, float* r);
-template void vLog1p(const int n, const float* a, float* r);
-template void vLog1p(const int n, const double* a, double* r);
-template void vTanh(const int n, const float* a, float* r);
-template void vTanh(const int n, const double* a, double* r);
-
-}  // namespace paddle
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
deleted file mode 100644
index f4cf6bd6c2c06f95cda098af389b37b7ff2983eb..0000000000000000000000000000000000000000
--- a/paddle/math/MathFunctions.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef MATHFUNCTIONS_H_
-#define MATHFUNCTIONS_H_
-
-#ifdef PADDLE_WITH_MKLML
-#include <mkl_cblas.h>
-#include <mkl_lapacke.h>
-#include <mkl_vml_functions.h>
-#endif
-
-#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB)
-extern "C" {
-#include <cblas.h>
-#include <clapack.h>
-}
-#endif
-
-#ifdef PADDLE_USE_OPENBLAS
-#include <cblas.h>
-#include <lapacke.h>
-#endif
-
-#ifndef LAPACK_FOUND
-extern "C" {
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-#include <cblas.h>
-#else
-typedef enum CBLAS_ORDER {
-  CblasRowMajor = 101,
-  CblasColMajor = 102
-} CBLAS_ORDER;
-#endif
-int LAPACKE_sgetrf(
-    int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
-int LAPACKE_dgetrf(
-    int matrix_layout, int m, int n, double* a, int lda, int* ipiv);
-int LAPACKE_sgetri(
-    int matrix_layout, int n, float* a, int lda, const int* ipiv);
-int LAPACKE_dgetri(
-    int matrix_layout, int n, double* a, int lda, const int* ipiv);
-}
-#endif
-
-#include <cmath>
-
-namespace paddle {
-
-#ifndef PADDLE_USE_EIGEN_FOR_BLAS
-template <class T>
-void gemm(const CBLAS_TRANSPOSE transA,
-          const CBLAS_TRANSPOSE transB,
-          const int M,
-          const int N,
-          const int K,
-          const T alpha,
-          const T* A,
-          const int lda,
-          const T* B,
-          const int ldb,
-          const T beta,
-          T* C,
-          const int ldc);
-#endif
-
-template <class T>
-int getrf(const CBLAS_ORDER Order,
-          const int M,
-          const int N,
-          T* A,
-          const int lda,
-          int* ipiv);
-
-template <class T>
-int getri(
-    const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
-
-template <class T>
-void axpy(const int n, const T alpha, const T* x, T* y) {
-  /// y = y + alpha * x
-  for (int i = 0; i < n; i++) {
-    y[i] = y[i] + alpha * x[i];
-  }
-}
-
-template <class T>
-T dotProduct(const int n, const T* x, const T* y) {
-  T result = static_cast<T>(0);
-  for (int i = 0; i < n; i++) {
-    result += x[i] * y[i];
-  }
-  return result;
-}
-
-template <class T>
-void vExp(const int n, const T* a, T* r);
-
-template <class T>
-void vPow(const int n, const T* a, const T b, T* r);
-
-template <class T>
-void vLog(const int n, const T* a, T* r);
-
-template <class T>
-void vAdd(const int n, const T* a, const T* b, T* r);
-
-template <class T>
-void vInvSqrt(const int n, const T* a, T* r);
-
-template <class T>
-void vLog1p(const int n, const T* a, T* r);
-
-template <class T>
-void vTanh(const int n, const T* a, T* r);
-
-}  // namespace paddle
-
-#endif  // MATHFUNCTIONS_H_
diff --git a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp
deleted file mode 100644
index b2afdbcd51a3cf5d3e6f3e2bb14902bf78fe68c8..0000000000000000000000000000000000000000
--- a/paddle/math/MathUtils.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "MathUtils.h"
-#include <algorithm>
-#include "Vector.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-/*if csc, major is cols and minor is rows, else
- * major is rows and minor is cols, according to
- * major value to initialize minor value"
- */
-void sparseRand(
-    int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) {
-  CHECK(size_t(nnz) >= size_t(1));
-  int* cpuMajor;
-  int* cpuMinor;
-  CpuIVector cpuMinorVec(nnz);
-  CpuIVector cpuMajorVec(majorLen);
-  if (useGpu) {
-    cpuMajor = cpuMajorVec.getData();
-    cpuMinor = cpuMinorVec.getData();
-  } else {
-    cpuMajor = major;
-    cpuMinor = minor;
-  }
-
-  /*major value init*/
-  for (int i = 0; i < majorLen - 1; i++) {
-    cpuMajor[i] = 1.0 * i * nnz / (majorLen - 1);
-  }
-  cpuMajor[majorLen - 1] = nnz;
-
-  /*minor value init according to major value*/
-  std::vector<char> used(minorMax, 0);
-  for (int i = 0; i < majorLen - 1; i++) {
-    CHECK_LE(cpuMajor[i + 1] - cpuMajor[i], minorMax);
-    used.assign(minorMax, 0);
-    for (int j = cpuMajor[i]; j < cpuMajor[i + 1]; j++) {
-      int idx = ::rand() % minorMax;
-      while (used[idx]) {
-        idx = ::rand() % minorMax;
-      }
-      cpuMinor[j] = idx;
-      used[idx] = 1;
-    }
-    std::sort(cpuMinor + cpuMajor[i],
-              cpuMinor + cpuMajor[i + 1],
-              [](int a, int b) { return a < b; });
-  }
-  /*memcpy result to gpu*/
-  if (useGpu) {
-    hl_memcpy_host2device(major, cpuMajor, sizeof(int) * majorLen);
-    hl_memcpy_host2device(minor, cpuMinor, sizeof(int) * nnz);
-  }
-}
-
-int outputSize(
-    int imageSize, int filterSize, int padding, int stride, bool caffeMode) {
-  int outputSize;
-  if (!caffeMode) {
-    outputSize =
-        (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
-  } else {
-    outputSize = (imageSize - filterSize + 2 * padding) / stride + 1;
-  }
-  CHECK_GE(outputSize, 1);
-  return outputSize;
-}
-
-int imageSize(
-    int outputSize, int filterSize, int padding, int stride, bool caffeMode) {
-  int imageSize;
-  if (!caffeMode) {
-    imageSize =
-        (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1;
-  } else {
-    imageSize = (outputSize - 1) * stride + filterSize - 2 * padding;
-  }
-  CHECK_GE(imageSize, 1);
-  return imageSize;
-}
-
-}  // namespace paddle
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
deleted file mode 100644
index 35359d4b5a8fb9715317257538a6e2e38fc16b60..0000000000000000000000000000000000000000
--- a/paddle/math/Matrix.cpp
+++ /dev/null
@@ -1,4667 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Matrix.h"
-#include "MathFunctions.h"
-#include "SparseMatrix.h"
-#include "SparseRowMatrix.h"
-
-#include <float.h>
-#include <algorithm>
-#include <cmath>
-
-#include <string.h>
-#include "hl_cnn.h"
-#include "hl_gpu.h"
-#include "hl_table_apply.h"
-#include "hl_top_k.h"
-#include "paddle/utils/Logging.h"
-
-#include "NEONFunctions.h"
-#include "paddle/function/GemmFunctor.h"
-#include "paddle/utils/ThreadLocal.h"
-
-#include "SIMDFunctions.h"
-
-namespace paddle {
-
-inline real _pow(real a, real beta) { return std::pow(a, beta); }
-
-inline real _square(real a) { return a * a; }
-
-inline real _safelog(real a) { return a > 0.0f ? std::log(a) : -40.0f; }
-
-Matrix::Matrix(MemoryHandlePtr memHandle,
-               size_t height,
-               size_t width,
-               bool trans,
-               bool use_gpu)
-    : BaseMatrix(
-          height,
-          width,
-          memHandle ? (reinterpret_cast<real*>(memHandle->getBuf())) : nullptr,
-          trans,
-          use_gpu) {
-  elementCnt_ = width * height;
-  memoryHandle_ = memHandle;
-}
-
-Matrix::Matrix(
-    real* data, size_t height, size_t width, bool trans, bool use_gpu)
-    : BaseMatrix(height, width, data, trans, use_gpu) {
-  elementCnt_ = width * height;
-}
-
-Matrix::Matrix(real* data,
-               size_t height,
-               size_t width,
-               size_t stride,
-               bool trans,
-               bool use_gpu)
-    : BaseMatrix(height, width, stride, data, trans, use_gpu) {
-  elementCnt_ = width * height;
-}
-
-MatrixPtr Matrix::createSparseMatrix(real* data,
-                                     int* row,
-                                     int* col,
-                                     size_t height,
-                                     size_t width,
-                                     size_t nnz, /* used to allocate space */
-                                     SparseValueType valueType, /*value type*/
-                                     SparseFormat format,
-                                     bool trans,
-                                     bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuSparseMatrix>(
-        data, row, col, height, width, nnz, valueType, format, trans);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        data, row, col, height, width, nnz, valueType, format, trans);
-  }
-}
-
-MatrixPtr Matrix::createSparseMatrix(size_t height,
-                                     size_t width,
-                                     size_t nnz, /* used to allocate space */
-                                     SparseValueType valueType, /*value type*/
-                                     SparseFormat format,
-                                     bool trans,
-                                     bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuSparseMatrix>(
-        height, width, nnz, valueType, format, trans);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        height, width, nnz, valueType, format, trans);
-  }
-}
-
-MatrixPtr Matrix::create(MemoryHandlePtr memHandle,
-                         size_t height,
-                         size_t width,
-                         bool trans) {
-  if (auto gpuHandle = std::dynamic_pointer_cast<GpuMemoryHandle>(memHandle)) {
-    return std::make_shared<GpuMatrix>(gpuHandle, height, width, trans);
-  } else if (auto cpuHandle =
-                 std::dynamic_pointer_cast<CpuMemoryHandle>(memHandle)) {
-    return std::make_shared<CpuMatrix>(cpuHandle, height, width, trans);
-  } else {
-    LOG(FATAL) << "Wrong";
-    return nullptr;
-  }
-}
-
-MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(height, width, trans);
-  } else {
-    return std::make_shared<CpuMatrix>(height, width, trans);
-  }
-}
-
-MatrixPtr Matrix::create(
-    real* data, size_t height, size_t width, bool trans, bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(data, height, width, trans);
-  } else {
-    return std::make_shared<CpuMatrix>(data, height, width, trans);
-  }
-}
-
-MatrixPtr Matrix::create(real* data,
-                         size_t height,
-                         size_t width,
-                         size_t stride,
-                         bool trans,
-                         bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(data, height, width, stride, trans);
-  } else {
-    return std::make_shared<CpuMatrix>(data, height, width, stride, trans);
-  }
-}
-
-MatrixPtr Matrix::createSparseMatrix(size_t height,
-                                     size_t width,
-                                     size_t nnz,
-                                     SparseValueType valueType,
-                                     bool trans,
-                                     bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuSparseMatrix>(
-        height, width, nnz, valueType, SPARSE_CSR, trans);
-  } else {
-    return std::make_shared<CpuSparseMatrix>(
-        height, width, nnz, valueType, SPARSE_CSR, trans);
-  }
-}
-
-void Matrix::resizeOrCreate(
-    MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) {
-  if (!matrix) {
-    matrix = Matrix::create(height, width, trans, useGpu);
-  } else {
-    CHECK_EQ(matrix->useGpu(), useGpu);
-    matrix->resize(height, width);
-  }
-}
-
-void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix,
-                                        size_t height,
-                                        size_t width,
-                                        size_t nnz,
-                                        SparseValueType valueType,
-                                        SparseFormat format,
-                                        bool trans,
-                                        bool useGpu) {
-  if (!matrix) {
-    matrix = Matrix::createSparseMatrix(
-        height, width, nnz, valueType, format, trans, useGpu);
-  } else {
-    CHECK(dynamic_cast<CpuSparseMatrix*>(matrix.get()) ||
-          dynamic_cast<GpuSparseMatrix*>(matrix.get()));
-    CHECK_EQ(matrix->useGpu(), useGpu);
-    matrix->resize(height, width, nnz, valueType, format);
-  }
-}
-
-void Matrix::reshape(size_t height, size_t width) {
-  CHECK(isContiguous());
-  CHECK(height_ * width_ == height * width);
-  height_ = height;
-  width_ = width;
-  stride_ = width_;
-}
-
-MatrixPtr Matrix::subMatrix(size_t startRow,
-                            size_t endRow,
-                            size_t startCol,
-                            size_t endCol) {
-  CHECK_LE(startRow, endRow);
-  CHECK_LE(endRow, getHeight());
-  CHECK_LE(startCol, endCol);
-  CHECK_LE(endCol, getWidth());
-
-  return Matrix::create(getData() + startRow * getStride() + startCol,
-                        endRow - startRow,
-                        endCol - startCol,
-                        getStride(),
-                        trans_,
-                        useGpu_);
-}
-
-void Matrix::setDiag(real value) {
-  CHECK(data_ != NULL);
-  CHECK_EQ(height_, width_);
-
-  zeroMem();
-  BaseMatrix diag(height_, 1, stride_ + 1, data_, false, useGpu_);
-  diag.assign(value);
-}
-
-GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans)
-    : Matrix(std::make_shared<GpuMemoryHandle>(height * width * sizeof(real)),
-             height,
-             width,
-             trans,
-             true) {}
-
-GpuMatrix::~GpuMatrix() {}
-
-void GpuMatrix::zeroMem() {
-  CHECK(data_ != NULL);
-  zero();
-}
-
-void GpuMatrix::resetOne() {
-  CHECK(data_ != NULL);
-  one();
-}
-
-void GpuMatrix::resize(size_t newHeight, size_t newWidth) {
-  size_t newSize = newHeight * newWidth;
-  if (NULL == memoryHandle_.get() ||
-      newSize * sizeof(real) > memoryHandle_->getAllocSize()) {
-    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-  }
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newSize;
-  stride_ = width_;
-}
-
-real GpuMatrix::getElement(size_t x, size_t y) const {
-  real elem = 0;
-  hl_memcpy_device2host(&elem, &data_[x * stride_ + y], sizeof(real));
-  return elem;
-}
-
-real GpuMatrix::getSum() {
-  CHECK(isContiguous());
-  real sum = 0.0f;
-  hl_vector_sum(data_, &sum, height_ * width_);
-  return sum;
-}
-
-real GpuMatrix::getMin() {
-  CHECK(isContiguous());
-  auto vec = GpuVector(height_ * width_, data_);
-  return vec.getMin();
-}
-
-real GpuMatrix::getMax() {
-  CHECK(isContiguous());
-  auto vec = GpuVector(height_ * width_, data_);
-  return vec.getMax();
-}
-
-void GpuMatrix::accumulateColSum(Matrix& src) {
-  CHECK_EQ(getWidth(), src.getWidth());
-  CHECK_EQ(getHeight(), (size_t)1);
-  sumCols(src, 1.0, 1.0);
-}
-
-real GpuMatrix::getAbsSum() {
-  CHECK(isContiguous());
-  real sum = 0.0f;
-  hl_vector_abs_sum(data_, &sum, height_ * width_);
-  return sum;
-}
-
-void GpuMatrix::copyFrom(const Matrix& src) {
-  CHECK(isContiguous());
-  CHECK(src.isContiguous());
-  CHECK(elementCnt_ == src.getElementCnt());
-
-  if (typeid(src) == typeid(CpuMatrix)) {
-    hl_memcpy_host2device(
-        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
-  } else if (typeid(src) == typeid(GpuMatrix)) {
-    hl_memcpy_device2device(
-        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  CHECK(isContiguous());
-  CHECK(src.isContiguous());
-  CHECK(elementCnt_ == src.getElementCnt());
-  hl_memcpy_async(this->getData(),
-                  const_cast<real*>(src.getData()),
-                  sizeof(real) * elementCnt_,
-                  stream);
-}
-
-void GpuMatrix::copyFrom(const real* hostSrc, size_t size) {
-  CHECK(isContiguous());
-  CHECK(size <= elementCnt_);
-  hl_memcpy_host2device(data_, const_cast<real*>(hostSrc), sizeof(real) * size);
-}
-
-void GpuMatrix::copyFrom(const real* hostSrc, const int64_t* seq) {
-  LOG(FATAL) << "not implemented";
-}
-
-void GpuMatrix::copyFrom(const IVector& src) {
-  CHECK(isContiguous());
-  CpuMatrix matrix(src.getSize(), 1, false);
-  matrix.copyFrom(src);
-  copyFrom(matrix);
-}
-
-void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(b.getWidth(), width);
-  real* dst = getData();
-  real* src = b.getData();
-  const int* index = rowIndex.getData();
-  hl_sequence2batch_copy(dst, src, index, width, height, true);
-}
-
-MatrixPtr GpuMatrix::clone(size_t height, size_t width, bool useGpu) {
-  CHECK(isContiguous());
-
-  if (height == 0 && width == 0) {
-    height = height_;
-    width = width_;
-  }
-
-  CHECK(width && height);
-
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(height, width);
-  } else {
-    return std::make_shared<CpuMatrix>(height, width);
-  }
-}
-
-MatrixPtr GpuMatrix::getTranspose() {
-  if (memoryHandle_.get() != NULL) {
-    MatrixPtr copy_T(
-        new GpuMatrix(std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle_),
-                      height_,
-                      width_,
-                      true));
-    return copy_T;
-  } else {
-    MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true));
-    return copy_T;
-  }
-}
-
-void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  if (memAlloc) {
-    matTrans = std::make_shared<GpuMatrix>(width_, height_);
-  } else {
-    CHECK(matTrans != NULL);
-    CHECK_EQ(matTrans->getHeight(), width_);
-    CHECK_EQ(matTrans->getWidth(), height_);
-  }
-  real* dataTrans = matTrans->getData();
-  real* data = getData();
-  int lda = getStride();
-  int ldc = matTrans->getStride();
-
-  hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
-}
-
-void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
-  if (memAlloc) {
-    matRot = std::make_shared<GpuMatrix>(width_, height_);
-  } else {
-    CHECK(matRot != NULL);
-    CHECK_EQ(matRot->getHeight(), width_);
-    CHECK_EQ(matRot->getWidth(), height_);
-  }
-
-  real* dataRot = matRot->getData();
-  real* data = getData();
-  hl_matrix_rotate(data, dataRot, height_, width_, clockWise);
-}
-
-MatrixPtr GpuMatrix::getInverse() {
-  MatrixPtr matInv;
-  inverse(matInv, true);
-  return matInv;
-}
-
-void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
-  CHECK_EQ(height_, width_);
-
-  if (memAlloc) {
-    matInv = std::make_shared<GpuMatrix>(height_, width_);
-  } else {
-    CHECK(matInv != NULL);
-  }
-
-  real* data = getData();
-  real* dataInv = matInv->getData();
-  int lda = getStride();
-  int ldc = matInv->getStride();
-
-  hl_matrix_inverse(data, dataInv, height_, lda, ldc);
-}
-
-void GpuMatrix::addBias(Matrix& b, real scale) {
-  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
-  BaseMatrix::addBias(b, scale);
-}
-
-void GpuMatrix::addSharedBias(Matrix& b, real scale) {
-  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
-  CHECK_LE(b.getWidth(), getWidth());
-  CHECK_EQ(getWidth() % b.getWidth(), 0UL);
-  hl_matrix_add_shared_bias(
-      getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale);
-}
-
-void GpuMatrix::collectBias(Matrix& a, real scale) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK_EQ(getHeight(), (size_t)1);
-  CHECK_EQ(width_, a.getWidth());
-  GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
-  if (!sMatPtr) {
-    sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1);
-  } else {
-    real* data = getData();
-    hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
-    hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
-  }
-#endif
-}
-
-void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
-  CHECK_EQ(getHeight(), (size_t)1);
-  CHECK_EQ(a.getWidth() % getWidth(), 0UL);
-  hl_matrix_collect_shared_bias(
-      getData(), a.getData(), getWidth(), a.getHeight(), a.getWidth(), scale);
-}
-
-void GpuMatrix::sequenceAvgForward(Matrix& a,
-                                   const IVector& startsPos,
-                                   int mode) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-
-  hl_sequence_avg_forward(dst, src, starts, height, width, mode);
-}
-
-void GpuMatrix::sequenceAvgBackward(Matrix& a,
-                                    const IVector& startsPos,
-                                    int mode) {
-  size_t height = a.getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-
-  hl_sequence_avg_backward(dst, src, starts, height, width, mode);
-}
-
-/* this = scaleAB*(a*b) +  scaleT*this */
-void GpuMatrix::mul(const GpuMatrix& a,
-                    const GpuMatrix& b,
-                    real scaleAB,
-                    real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-
-  if (!a.isTransposed() && !b.isTransposed()) {
-    CHECK_EQ(width_, b.width_);
-    CHECK_EQ(height_, a.height_);
-    CHECK_EQ(a.width_, b.height_);
-  } else if (a.isTransposed() && !b.isTransposed()) {
-    CHECK_EQ(width_, b.width_);
-    CHECK_EQ(height_, a.width_);
-    CHECK_EQ(a.height_, b.height_);
-  } else if (!a.isTransposed() && b.isTransposed()) {
-    CHECK_EQ(width_, b.height_);
-    CHECK_EQ(height_, a.height_);
-    CHECK_EQ(a.width_, b.width_);
-  } else {
-    LOG(FATAL) << "Is not supported";
-  }
-
-  real* A_d = a.data_;
-  real* B_d = b.data_;
-  real* C_d = data_;
-  int dimM = getHeight();
-  int dimN = getWidth();
-  int dimK = !a.isTransposed() ? a.width_ : a.height_;
-  int lda = a.getStride();
-  int ldb = b.getStride();
-  int ldc = getStride();
-  hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
-  hl_trans_op_t transb = !b.isTransposed() ? HPPL_OP_N : HPPL_OP_T;
-
-  hl_matrix_mul(A_d,
-                transa,
-                B_d,
-                transb,
-                C_d,
-                dimM,
-                dimN,
-                dimK,
-                scaleAB,
-                scaleT,
-                lda,
-                ldb,
-                ldc);
-}
-
-void GpuMatrix::mul(const GpuSparseMatrix& a,
-                    const GpuMatrix& b,
-                    real scaleAB,
-                    real scaleT) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(isContiguous());
-  CHECK(b.isContiguous());
-  CHECK(b.useGpu_ == true) << "Matrix type are not equal";
-  CHECK(!trans_ && !b.trans_) << "not supported";
-
-  if (!a.trans_) {
-    CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_)
-        << "Matrix dimensions are not equal";
-  } else {
-    CHECK(width_ == b.width_ && height_ == a.width_ && a.height_ == b.height_)
-        << "Matrix dimensions are not equal";
-  }
-  hl_trans_op_t transA = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
-  hl_sparse_matrix_s A_d = a.sMatrix_.get();
-  real* B_d = b.data_;
-  real* C_d = data_;
-  hl_matrix_csr_mul_dense(A_d,
-                          transA,
-                          B_d,
-                          HPPL_OP_N,
-                          C_d,
-                          height_,
-                          width_,
-                          b.height_,
-                          scaleAB,
-                          scaleT);
-#endif
-}
-
-void GpuMatrix::mul(const GpuMatrix& a,
-                    const GpuSparseMatrix& b,
-                    real scaleAB,
-                    real scaleT) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(isContiguous());
-  CHECK(a.isContiguous());
-  CHECK(a.useGpu_ == true) << "Matrix type are not equal";
-
-  hl_sparse_matrix_s B_d = b.sMatrix_.get();
-  real* A_d = a.data_;
-  real* C_d = data_;
-  hl_trans_op_t transB = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
-  if (!b.trans_) {
-    CHECK(width_ == b.width_ && height_ == a.height_ && a.width_ == b.height_)
-        << "Matrix dimensions are not equal";
-  } else {
-    CHECK(width_ == b.height_ && height_ == a.height_ && a.width_ == b.width_)
-        << "Matrix dimensions are not equal";
-  }
-  if (b.format_ == SPARSE_CSC) {
-    hl_matrix_dense_mul_csc(A_d,
-                            HPPL_OP_N,
-                            B_d,
-                            transB,
-                            C_d,
-                            height_,
-                            width_,
-                            a.width_,
-                            scaleAB,
-                            scaleT);
-  } else {
-    hl_matrix_dense_mul_csr(A_d,
-                            HPPL_OP_N,
-                            B_d,
-                            transB,
-                            C_d,
-                            height_,
-                            width_,
-                            a.width_,
-                            scaleAB,
-                            scaleT);
-  }
-#endif
-}
-
-/* this = a*b */
-void GpuMatrix::mul(const Matrix& a, const Matrix& b) { mul(a, b, 1.0, 0.0); }
-
-void GpuMatrix::mul(const Matrix& a,
-                    const Matrix& b,
-                    real scaleAB,
-                    real scaleT) {
-  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
-  const auto a_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&a);
-  const auto b_ptr_s = dynamic_cast<const GpuSparseMatrix*>(&b);
-
-  if (a_ptr && b_ptr) {
-    mul(*a_ptr, *b_ptr, scaleAB, scaleT);
-  } else if (a_ptr_s && b_ptr) {
-    mul(*a_ptr_s, *b_ptr, scaleAB, scaleT);
-  } else if (a_ptr && b_ptr_s) {
-    mul(*a_ptr, *b_ptr_s, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "Not supported";
-  }
-}
-
-/* this = this* b */
-void GpuMatrix::rightMul(Matrix& b) { rightMul(b, 1.0, 0.0); }
-
-/* this = scaleAB*(this*b) +  scaleT*this */
-void GpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) {
-  CHECK(dynamic_cast<GpuMatrix*>(&b));
-  CHECK(!isTransposed()) << "Not supported";
-  CHECK(!b.isTransposed()) << "Not supported";
-  mul(*this, *dynamic_cast<GpuMatrix*>(&b), scaleAB, scaleT);
-}
-
-/* this = a*this */
-void GpuMatrix::leftMul(Matrix& a) { leftMul(a, 1.0, 0.0); }
-
-/* this = scaleAB*(a*this) +  scaleT*this */
-void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
-  CHECK(dynamic_cast<GpuMatrix*>(&a));
-  CHECK(!isTransposed()) << "Not supported";
-  CHECK(!a.isTransposed()) << "Not supported";
-  mul(*dynamic_cast<GpuMatrix*>(&a), *this, scaleAB, scaleT);
-}
-
-void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(dynamic_cast<GpuMatrix*>(&table));
-  CHECK(table.useGpu());
-  CHECK(ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  hl_matrix_select_rows(a,
-                        stride_,
-                        table.getData(),
-                        table.stride_,
-                        index,
-                        numSamples,
-                        tableSize,
-                        dim);
-#endif
-}
-
-void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(dynamic_cast<GpuMatrix*>(&table));
-  CHECK(table.useGpu());
-  CHECK(ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  hl_matrix_add_to_rows(table.getData(),
-                        table.stride_,
-                        a,
-                        stride_,
-                        index,
-                        numSamples,
-                        tableSize,
-                        dim);
-#endif
-}
-
-void GpuMatrix::colMerge(Matrix& src) {
-  CHECK(src.height_ == height_);
-  if (!trans_ && !src.trans_) {
-    sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0);
-  } else {
-    LOG(FATAL) << "Is not supported";
-  }
-}
-
-void GpuMatrix::rowSum(Matrix& sum) {
-  CHECK_EQ(sum.getHeight(), getHeight());
-  CHECK_EQ(sum.getWidth(), (size_t)1);
-
-  sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0);
-}
-
-void GpuMatrix::rowMax(Matrix& max) {
-  CHECK_EQ(max.getHeight(), getHeight());
-  CHECK_EQ(max.getWidth(), (size_t)1);
-
-  max.maxRows(*this);
-}
-
-void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  CHECK_EQ(maxVal.getWidth(), beam);
-
-  hl_matrix_top_k(maxVal.getData(),
-                  maxVal.getStride(),
-                  maxIds.getData(),
-                  this->getData(),
-                  this->getStride(),
-                  this->getWidth(),
-                  beam,
-                  numSamples);
-#endif
-}
-
-void GpuMatrix::colMax(Matrix& max) {
-  CHECK_EQ(max.getWidth(), getWidth());
-  CHECK_EQ(max.getHeight(), (size_t)1);
-
-  max.maxCols(*this);
-}
-
-void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
-  LOG(FATAL) << "Is not supported";
-}
-
-void GpuMatrix::maxoutForward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-  CHECK(dynamic_cast<GpuMatrix*>(&a));
-  CHECK(dynamic_cast<GpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = getWidth();
-  size_t batchSize = getHeight();
-  const real* input = a.getData();
-  real* output = getData();
-  int* idForGpu = id.getData();
-
-  hl_maxout_forward(
-      input, output, idForGpu, batchSize, size, size / channels, groups);
-}
-
-void GpuMatrix::maxoutBackward(Matrix& a,
-                               IVector& id,
-                               size_t channels,
-                               size_t groups) {
-  CHECK(dynamic_cast<GpuMatrix*>(&a));
-  CHECK(dynamic_cast<GpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = a.getWidth();
-  size_t batchSize = getHeight();
-  real* input = getData();
-  const real* output = a.getData();
-  const int* idForGpu = id.getData();
-
-  hl_maxout_backward(
-      input, output, idForGpu, batchSize, size, size / channels, groups);
-}
-
-/*calulate the error of classification */
-void GpuMatrix::classificationError(Matrix& output,
-                                    IVector& label,
-                                    size_t topkSize) {
-  auto gpuOutput = dynamic_cast<GpuMatrix*>(&output);
-  auto gpuLabel = dynamic_cast<GpuIVector*>(&label);
-  size_t numSamples = this->getHeight();
-  GpuMatrixPtr gpuTopVal = std::make_shared<GpuMatrix>(numSamples, topkSize);
-  GpuIVectorPtr gpuTopIds = std::make_shared<GpuIVector>(numSamples * topkSize);
-
-  CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer";
-  CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed";
-  CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal";
-  CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1)
-      << "Matrix dimensions are not equal";
-
-  size_t dim = gpuOutput->getWidth();
-  hl_matrix_classification_error(gpuTopVal->getData(),
-                                 gpuTopVal->getStride(),
-                                 gpuTopIds->getData(),
-                                 gpuOutput->getData(),
-                                 gpuOutput->getStride(),
-                                 dim,
-                                 topkSize,
-                                 numSamples,
-                                 gpuLabel->getData(),
-                                 this->getData());
-}
-
-/* copy -log(output[i * width + label]) to this->data[i] */
-void GpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) {
-  GpuMatrix* output_ptr = dynamic_cast<GpuMatrix*>(&output);
-  GpuIVector* label_ptr = dynamic_cast<GpuIVector*>(&label);
-
-  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
-
-  CHECK(height_ == label.getSize() && width_ == 1 && height_ == output.height_)
-      << "Matrix dimensions are not equal";
-
-  real* A_d = output_ptr->data_;
-  real* C_d = data_;
-  int* label_d = label_ptr->getData();
-
-  hl_matrix_cross_entropy(A_d, C_d, label_d, height_, output.width_);
-}
-
-/* calculate the error of outputV according to label */
-void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
-  GpuMatrix* output_ptr = dynamic_cast<GpuMatrix*>(&outputV);
-  GpuIVector* label_ptr = dynamic_cast<GpuIVector*>(&label);
-
-  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
-
-  CHECK(height_ == output_ptr->height_ && width_ == output_ptr->width_)
-      << "Matrix dimensions are not equal";
-
-  real* output_d = output_ptr->data_;
-  real* grad_d = data_;
-  int* label_d = label_ptr->getData();
-
-  hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_);
-}
-
-void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                               IVector& label,
-                                               real alpha) {
-  LOG(FATAL) << "Not implemented";
-}
-
-void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                                 IVector& label,
-                                                 real alpha) {
-  LOG(FATAL) << "Not implemented";
-}
-
-void GpuMatrix::softmax(Matrix& output) {
-  CHECK(output.useGpu()) << "Matrix type are not equal";
-
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK(height == output.getHeight() && width == output.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* inputData = getData();
-  real* outputData = output.getData();
-  hl_matrix_softmax(inputData, outputData, height, width);
-}
-
-void GpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) {
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-  CHECK(isContiguous());
-
-  real* inputData = getData();
-  real* outputData = output.getData();
-  auto starts = index.getData();
-  int numSequences = index.getSize() - 1;
-  hl_sequence_softmax_forward(inputData, outputData, starts, numSequences);
-}
-
-void GpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-  CHECK(output.useGpu_ == true && sftmaxSum.useGpu_ == true)
-      << "Matrix type are not equal";
-
-  CHECK(height_ == output.height_ && width_ == output.width_ &&
-        height_ == sftmaxSum.height_)
-      << "Matrix dimensions are not equal";
-
-  real* output_d = output.data_;
-  real* sftmaxSum_d = sftmaxSum.data_;
-  real* grad_d = data_;
-  hl_matrix_softmax_derivative(grad_d, output_d, sftmaxSum_d, height_, width_);
-}
-
-void GpuMatrix::softmaxBackward(Matrix& outputV) {
-  CHECK(outputV.useGpu()) << "Matrix type are not equal";
-
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK(height == outputV.getHeight() && width == outputV.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* output_grad = getData();
-  real* output_value = outputV.getData();
-  hl_softmax_backward(output_value, output_grad, height, width);
-}
-
-void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
-  CHECK_EQ(label.getHeight(), height_);
-  CHECK_EQ(output.getHeight(), height_);
-  CHECK_EQ(label.getWidth(), output.getWidth());
-  CHECK_EQ((size_t)1, width_);
-
-  auto labelptr = dynamic_cast<GpuSparseMatrix*>(&label);
-  if (labelptr) {
-    LOG(FATAL) << "not supported: GpuSparseMatrix as label";
-  }
-
-  BaseMatrix::sumOfSquaredDiffs(output,
-                                label,
-                                /* scaleSum= */ 1,
-                                /* scaleDest= */ 1);
-}
-
-void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) {
-  add2(outputV, label, 1, 2, -2);
-}
-
-void GpuMatrix::tanh(Matrix& output) { BaseMatrix::tanh(output); }
-
-void GpuMatrix::tanhDerivative(Matrix& output) {
-  BaseMatrix::tanhDerivative(output);
-}
-
-void GpuMatrix::softrelu(Matrix& output) { BaseMatrix::softrelu(output); }
-
-void GpuMatrix::softreluDerivative(Matrix& output) {
-  BaseMatrix::softreluDerivative(output);
-}
-
-void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
-  BaseMatrix::scaledTanh(output, p1, p2);
-}
-
-void GpuMatrix::randomizeUniform() {
-  CHECK(isContiguous());
-  real* data = data_;
-  size_t size = height_ * width_;
-
-  hl_rand(data, size);
-}
-
-void GpuMatrix::print(std::ostream& os) const {
-  CHECK(isContiguous());
-  CpuMatrix cpuMat(getHeight(), getWidth());
-  cpuMat.copyFrom(*this);
-  cpuMat.print(os);
-}
-
-void GpuMatrix::print(std::ostream& os, size_t height, size_t width) const {
-  CHECK(isContiguous());
-  CpuMatrix cpuMat(getHeight(), getWidth());
-  cpuMat.copyFrom(*this);
-  cpuMat.print(os, height, width);
-}
-
-void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
-  CHECK(isContiguous());
-  CHECK(height_ == refMat.getHeight());
-  CHECK(width_ == refMat.getWidth());
-  CpuMatrix cpuRef(height_, width_);
-  GpuMatrix gpuRef(height_, width_);
-  cpuRef.copyFrom(refMat);
-  gpuRef.copyFrom(*this);
-  size_t diffCnt = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      real a = gpuRef.getElement(i, j);
-      real b = cpuRef.getElement(i, j);
-      if (fabs(a - b) > 0.00001) {
-        ++diffCnt;
-        if (printDiff) {
-          os << "ref= " << a << "  check= " << b << std::endl;
-        }
-      }
-    }
-  }
-  LOG(INFO) << "the  diffCnt is " << diffCnt;
-}
-
-void GpuMatrix::maxPoolForward(Matrix& inputMat,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               MatrixPtr maskMatP) {
-  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  real* maskData = NULL;
-  size_t frameNum = inputMat.getHeight();
-  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputH * outputW * channels);
-
-  if (maskMatP != NULL) {
-    CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
-    CHECK(outputH * outputW * channels == maskMatP->getWidth());
-    maskData = maskMatP->getData();
-  }
-
-  hl_maxpool_forward(frameNum,
-                     inputData,
-                     channels,
-                     imgSizeH,
-                     imgSizeW,
-                     outputH,
-                     outputW,
-                     sizeX,
-                     sizeY,
-                     strideH,
-                     strideW,
-                     paddingH,
-                     paddingW,
-                     data_,
-                     getStride(),
-                     maskData);
-}
-
-void GpuMatrix::maxPoolBackward(Matrix& inputMat,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                Matrix& outGrad,
-                                Matrix& outV,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW) {
-  CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true &&
-        outV.useGpu_ == true)
-      << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  real* outData = outV.getData();
-  real* outDiff = outGrad.getData();
-  size_t frameNum = inputMat.getHeight();
-  size_t channels = outV.getWidth() / outputH / outputW;
-  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(outGrad.getHeight() == outV.getHeight() &&
-        outGrad.getWidth() == outV.getWidth());
-
-  hl_maxpool_backward(frameNum,
-                      inputData,
-                      outData,
-                      outDiff,
-                      channels,
-                      imgSizeH,
-                      imgSizeW,
-                      outputH,
-                      outputW,
-                      sizeX,
-                      sizeY,
-                      strideH,
-                      strideW,
-                      paddingH,
-                      paddingW,
-                      scaleTargets,
-                      scaleOutput,
-                      data_,
-                      outGrad.getStride());
-}
-
-void GpuMatrix::avgPoolForward(Matrix& inputMat,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               bool excludeMode) {
-  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  size_t frameNum = inputMat.getHeight();
-  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputH * outputW * channels);
-
-  hl_avgpool_forward(frameNum,
-                     inputData,
-                     channels,
-                     imgSizeH,
-                     imgSizeW,
-                     outputH,
-                     outputW,
-                     sizeX,
-                     sizeY,
-                     strideH,
-                     strideW,
-                     paddingH,
-                     paddingW,
-                     data_,
-                     getStride(),
-                     excludeMode);
-}
-
-void GpuMatrix::avgPoolBackward(Matrix& outGrad,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW,
-                                bool excludeMode) {
-  CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
-
-  real* outDiff = outGrad.getData();
-  size_t frameNum = outGrad.getHeight();
-  size_t channels = outGrad.getWidth() / outputH / outputW;
-  CHECK(imgSizeH * imgSizeW * channels == width_);
-  CHECK(height_ == outGrad.getHeight());
-  CHECK(outGrad.getWidth() == outputH * outputW * channels);
-
-  hl_avgpool_backward(frameNum,
-                      outDiff,
-                      channels,
-                      imgSizeH,
-                      imgSizeW,
-                      outputH,
-                      outputW,
-                      sizeX,
-                      sizeY,
-                      strideH,
-                      strideW,
-                      paddingH,
-                      paddingW,
-                      scaleTargets,
-                      scaleOutput,
-                      data_,
-                      outGrad.getStride(),
-                      excludeMode);
-}
-
-void GpuMatrix::maxPool3DForward(Matrix& inputMat,
-                                 Matrix& maxPoolIdx,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  CHECK(inputMat.useGpu_) << "Matrix type are not correct";
-
-  real* inputData = inputMat.getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t num = inputMat.getHeight();
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputD * outputH * outputW * channels);
-
-  hl_maxpool3D_forward(num,
-                       inputData,
-                       channels,
-                       imgSizeD,
-                       imgSizeH,
-                       imgSizeW,
-                       outputD,
-                       outputH,
-                       outputW,
-                       sizeZ,
-                       sizeY,
-                       sizeX,
-                       strideD,
-                       strideH,
-                       strideW,
-                       paddingD,
-                       paddingH,
-                       paddingW,
-                       getData(),
-                       maxPoolIdxData,
-                       getStride());
-}
-
-void GpuMatrix::maxPool3DBackward(Matrix& outGrad,
-                                  Matrix& maxPoolIdx,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal";
-
-  real* outDiff = outGrad.getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t frameNum = getHeight();
-  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth());
-  CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() &&
-        outGrad.getWidth() == maxPoolIdx.getWidth());
-
-  hl_maxpool3D_backward(frameNum,
-                        outDiff,
-                        channels,
-                        imgSizeD,
-                        imgSizeH,
-                        imgSizeW,
-                        outputD,
-                        outputH,
-                        outputW,
-                        sizeZ,
-                        sizeY,
-                        sizeX,
-                        strideD,
-                        strideH,
-                        strideW,
-                        paddingD,
-                        paddingH,
-                        paddingW,
-                        scaleTargets,
-                        scaleOutput,
-                        getData(),
-                        maxPoolIdxData,
-                        outGrad.getStride());
-}
-
-void GpuMatrix::avgPool3DForward(Matrix& inputMat,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  CHECK(inputMat.useGpu_) << "Matrix type are not equal";
-
-  real* inputData = inputMat.getData();
-  size_t frameNum = inputMat.getHeight();
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
-  CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == outputD * outputH * outputW * channels);
-
-  hl_avgpool3D_forward(frameNum,
-                       inputData,
-                       channels,
-                       imgSizeD,
-                       imgSizeH,
-                       imgSizeW,
-                       outputD,
-                       outputH,
-                       outputW,
-                       sizeZ,
-                       sizeY,
-                       sizeX,
-                       strideD,
-                       strideH,
-                       strideW,
-                       paddingD,
-                       paddingH,
-                       paddingW,
-                       getData(),
-                       getStride());
-}
-
-void GpuMatrix::avgPool3DBackward(Matrix& outGrad,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  CHECK(outGrad.useGpu_) << "Matrix type are not equal";
-
-  real* outDiff = outGrad.getData();
-  size_t frameNum = outGrad.getHeight();
-  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
-  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_);
-  CHECK(height_ == outGrad.getHeight());
-  CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels);
-
-  hl_avgpool3D_backward(frameNum,
-                        outDiff,
-                        channels,
-                        imgSizeD,
-                        imgSizeH,
-                        imgSizeW,
-                        outputD,
-                        outputH,
-                        outputW,
-                        sizeZ,
-                        sizeY,
-                        sizeX,
-                        strideD,
-                        strideH,
-                        strideW,
-                        paddingD,
-                        paddingH,
-                        paddingW,
-                        scaleTargets,
-                        scaleOutput,
-                        getData(),
-                        outGrad.getStride());
-}
-
-void GpuMatrix::maxSequenceForward(Matrix& input,
-                                   const IVector& sequence,
-                                   IVector& index) {
-  CHECK(dynamic_cast<GpuMatrix*>(&input));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK(dynamic_cast<GpuIVector*>(&index));
-
-  real* outData = getData();
-  real* inputData = input.getData();
-  const int* starts = sequence.getData();
-  int* maxIndex = index.getData();
-  size_t numSequences = getHeight();
-  size_t dim = getWidth();
-
-  CHECK_EQ(dim, input.getWidth());
-  CHECK_EQ(numSequences, sequence.getSize() - 1);
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  hl_max_sequence_forward(
-      inputData, starts, outData, maxIndex, numSequences, dim);
-}
-
-void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
-                                    const IVector& sequence,
-                                    IVector& index) {
-  CHECK(dynamic_cast<GpuMatrix*>(&outputGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK(dynamic_cast<GpuIVector*>(&index));
-
-  real* inputGrad = getData();
-  real* outGrad = outputGrad.getData();
-  int* maxIndex = index.getData();
-  size_t dim = getWidth();
-  size_t numSequences = sequence.getSize() - 1;
-
-  CHECK_EQ(dim, outputGrad.getWidth());
-  CHECK_EQ(numSequences, outputGrad.getHeight());
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
-}
-
-void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
-  CHECK(data.useGpu_ == true && W.useGpu_ == true)
-      << "Matrix type are not equal";
-  real* input = data.getData();
-  real* w = W.getData();
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  real* output = getData();
-  hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum);
-}
-
-void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-  CHECK(oGrad.useGpu_ == true && data.useGpu_ == true)
-      << "Matrix type are not equal";
-  real* ograd = oGrad.getData();
-  real* input = data.getData();
-  real* wgrad = data_;
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = this->getHeight() * this->getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  hl_param_relu_backward_w(
-      wgrad, ograd, input, numElements, numSamples, partial_sum);
-}
-
-void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-  real* diff = data_;
-  real* input = data.getData();
-  real* ograd = oGrad.getData();
-  real* w = W.getData();
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  hl_param_relu_backward_diff(
-      ograd, input, w, diff, numElements, numSamples, partial_sum);
-}
-
-void GpuMatrix::addColumnVector(const Matrix& b) {
-  BaseMatrix::addColVector(const_cast<Matrix&>(b));
-}
-
-void GpuMatrix::bilinearForward(const Matrix& in,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-  CHECK(dynamic_cast<const GpuMatrix*>(&in));
-
-  const size_t outputW = getWidth();
-  const size_t outputH = getHeight();
-  const size_t inputW = in.getWidth();
-  const size_t inputH = in.getHeight();
-
-  real* outData = getData();
-  const real* inData = in.getData();
-
-  if (inImgH == outImgW && inImgW == outImgW) {
-    this->copyFrom(in);
-  } else {
-    hl_bilinear_forward(inData,
-                        inImgH,
-                        inImgW,
-                        inputH,
-                        inputW,
-                        outData,
-                        outImgH,
-                        outImgW,
-                        outputH,
-                        outputW,
-                        numChannels,
-                        ratioH,
-                        ratioW);
-  }
-}
-
-void GpuMatrix::bilinearBackward(const Matrix& out,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW) {
-  CHECK(dynamic_cast<const GpuMatrix*>(&out));
-
-  const size_t inputW = getWidth();
-  const size_t inputH = getHeight();
-  const size_t outputW = out.getWidth();
-  const size_t outputH = out.getHeight();
-
-  real* inGrad = getData();
-  const real* outGrad = out.getData();
-
-  if (outImgH == inImgH && outImgW == inImgW) {
-    this->add(const_cast<Matrix&>(out));
-  } else {
-    hl_bilinear_backward(inGrad,
-                         inImgH,
-                         inImgW,
-                         inputH,
-                         inputW,
-                         outGrad,
-                         outImgH,
-                         outImgW,
-                         outputH,
-                         outputW,
-                         numChannels,
-                         ratioH,
-                         ratioW);
-  }
-}
-
-void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-#ifdef PADDLE_WITH_CUDA
-  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
-  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
-
-  CHECK(outputPtr && labelPtr) << "Invalid argument pointer";
-  CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported";
-  CHECK(height_ == outputPtr->height_ && width_ == 1 &&
-        outputPtr->width_ == labelPtr->getWidth() &&
-        outputPtr->height_ == labelPtr->getHeight())
-      << "Matrix dimensions are not equal";
-
-  real* output_d = outputPtr->data_;
-  real* entropy_d = data_;
-  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
-  hl_matrix_multi_binary_cross_entropy(
-      output_d, entropy_d, mat_d, height_, outputPtr->width_);
-#endif
-}
-
-void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-#ifdef PADDLE_WITH_CUDA
-  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
-  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
-
-  CHECK(outputPtr && labelPtr) << "Invalid argument pointer";
-  CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported";
-  CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ &&
-        outputPtr->width_ == labelPtr->getWidth() &&
-        outputPtr->height_ == labelPtr->getHeight())
-      << "Matrix dimensions are not equal";
-
-  real* output_d = outputPtr->data_;
-  real* grad_d = data_;
-  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
-  hl_matrix_multi_binary_cross_entropy_bp(
-      output_d, grad_d, mat_d, height_, width_);
-#endif
-}
-
-void GpuMatrix::vol2Col(real* dataSrc,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW) {
-  hl_matrix_vol2Col(dataSrc,
-                    channels,
-                    depth,
-                    height,
-                    width,
-                    filterD,
-                    filterH,
-                    filterW,
-                    strideD,
-                    strideH,
-                    strideW,
-                    paddingD,
-                    paddingH,
-                    paddingW,
-                    getData());
-}
-
-void GpuMatrix::col2Vol(real* dataDst,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW,
-                        real alpha,
-                        real beta) {
-  hl_matrix_col2Vol(dataDst,
-                    channels,
-                    depth,
-                    height,
-                    width,
-                    filterD,
-                    filterH,
-                    filterW,
-                    strideD,
-                    strideH,
-                    strideW,
-                    paddingD,
-                    paddingH,
-                    paddingW,
-                    getData(),
-                    alpha,
-                    beta);
-}
-
-/**
- * CpuMatrix
- */
-
-CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans)
-    : Matrix(std::make_shared<CpuMemoryHandle>(height * width * sizeof(real)),
-             height,
-             width,
-             trans,
-             false) {}
-
-CpuMatrix::~CpuMatrix() {}
-
-void CpuMatrix::zeroMem() {
-  CHECK(data_ != NULL);
-  if (isContiguous()) {
-    memset(data_, 0, height_ * width_ * sizeof(real));
-  } else {
-    BaseMatrix::zero();
-  }
-}
-void CpuMatrix::resetOne() {
-  CHECK(data_ != NULL);
-  BaseMatrix::one();
-}
-
-void CpuMatrix::copyFrom(const Matrix& src) {
-  CHECK(isContiguous());
-  if (typeid(src) == typeid(GpuMatrix)) {
-    CHECK(src.isContiguous());
-    CHECK(elementCnt_ == src.getElementCnt());
-    hl_memcpy_device2host(
-        data_, const_cast<real*>(src.getData()), sizeof(real) * elementCnt_);
-  } else if (typeid(src) == typeid(CpuMatrix) ||
-             typeid(src) == typeid(SharedCpuMatrix)) {
-    CHECK(src.isContiguous());
-    CHECK(elementCnt_ == src.getElementCnt());
-    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
-  } else if (typeid(src) == typeid(CpuSparseMatrix)) {
-    CHECK_GE(elementCnt_, src.getElementCnt());
-    copyFrom(dynamic_cast<CpuSparseMatrix&>(const_cast<Matrix&>(src)));
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void CpuMatrix::copyFrom(CpuSparseMatrix& src) {
-  CHECK(isContiguous());
-  CHECK(height_ == src.getHeight());
-  CHECK(width_ == src.getWidth());
-  memset(data_, 0, sizeof(real) * height_ * width_);
-  if (src.getValueType() == FLOAT_VALUE) {
-    if (src.getFormat() == SPARSE_CSC) {
-      int* rows = src.getRows();
-      real* vals = src.getValue();
-      for (size_t i = 0; i < width_; i++) {
-        for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1);
-             j++) {
-          data_[rows[j] * width_ + i] = vals[j];
-        }
-      }
-    } else {
-      int* cols = src.getCols();
-      real* vals = src.getValue();
-      for (size_t i = 0; i < height_; i++) {
-        for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1);
-             j++) {
-          data_[i * width_ + cols[j]] = vals[j];
-        }
-      }
-    }
-  } else {
-    if (src.getFormat() == SPARSE_CSC) {
-      int* rows = src.getRows();
-      for (size_t i = 0; i < width_; i++) {
-        for (size_t j = src.getColStartIdx(i); j < src.getColStartIdx(i + 1);
-             j++) {
-          data_[rows[j] * width_ + i] = 1.0;
-        }
-      }
-    } else {
-      int* cols = src.getCols();
-      for (size_t i = 0; i < height_; i++) {
-        for (size_t j = src.getRowStartIdx(i); j < src.getRowStartIdx(i + 1);
-             j++) {
-          data_[i * width_ + cols[j]] = 1.0;
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  CHECK(isContiguous());
-  CHECK(src.isContiguous());
-  CHECK(elementCnt_ == src.getElementCnt());
-  if (typeid(src) == typeid(GpuMatrix)) {
-    hl_memcpy_async(this->getData(),
-                    const_cast<real*>(src.getData()),
-                    sizeof(real) * elementCnt_,
-                    stream);
-    // There is a need to add synchronization to ensure that the data is copied.
-    hl_stream_synchronize(stream);
-  } else if (typeid(src) == typeid(CpuMatrix)) {
-    memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void CpuMatrix::copyFrom(const real* cpuSrc, size_t size) {
-  CHECK(isContiguous());
-  CHECK(size <= elementCnt_);
-  memcpy(data_, cpuSrc, sizeof(real) * size);
-}
-
-void CpuMatrix::copyFrom(const real* cpuSrc, const int64_t* seq) {
-  CHECK(isContiguous());
-  for (size_t i = 0; i < height_; i++) {
-    memcpy(data_ + i * width_, cpuSrc + seq[i] * width_, sizeof(real) * width_);
-  }
-}
-
-void CpuMatrix::copyFrom(const IVector& src) {
-  CHECK(isContiguous());
-  CHECK(elementCnt_ == src.getSize())
-      << "the src and dst should have same size.";
-  const int* cpuSrc = NULL;
-  IVectorPtr tmp;
-  if (src.useGpu()) {
-    CpuIVector tmp(src.getSize());
-    tmp.copyFrom(src);
-    cpuSrc = tmp.getData();
-  } else {
-    cpuSrc = src.getData();
-  }
-  for (size_t i = 0; i < elementCnt_; ++i) {
-    data_[i] = cpuSrc[i];
-  }
-}
-
-void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(b.getWidth(), width);
-  const int* index = rowIndex.getData();
-  for (size_t i = 0; i < height; i++) {
-    CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
-    real* src = b.getData() + index[i] * width;
-    real* dst = getData() + i * width;
-    memcpy(dst, src, sizeof(real) * width);
-  }
-}
-
-MatrixPtr CpuMatrix::clone(size_t height, size_t width, bool useGpu) {
-  CHECK(isContiguous());
-
-  if (height == 0 && width == 0) {
-    height = height_;
-    width = width_;
-  }
-
-  CHECK(width && height);
-
-  if (useGpu) {
-    return std::make_shared<GpuMatrix>(height, width);
-  } else {
-    return std::make_shared<CpuMatrix>(height, width);
-  }
-}
-
-void CpuMatrix::resize(size_t newHeight, size_t newWidth) {
-  size_t newSize = newHeight * newWidth;
-  if (NULL == memoryHandle_.get() ||
-      newSize * sizeof(real) > memoryHandle_->getAllocSize()) {
-    memoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newSize;
-  stride_ = width_;
-}
-
-real CpuMatrix::getElement(size_t x, size_t y) const {
-  return data_[x * stride_ + y];
-}
-
-real CpuMatrix::getSum() {
-  CHECK(isContiguous());
-  double sum = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      sum += data_[i * width_ + j];
-    }
-  }
-  return sum;
-}
-
-void CpuMatrix::accumulateColSum(Matrix& src) {
-  CHECK_EQ(getWidth(), src.getWidth());
-  CHECK_EQ(getHeight(), (size_t)1);
-
-  sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1);
-}
-
-real CpuMatrix::getAbsSum() {
-  CHECK(isContiguous());
-  double sum = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      sum += fabs(data_[i * width_ + j]);
-    }
-  }
-  return sum;
-}
-
-MatrixPtr CpuMatrix::getTranspose() {
-  if (memoryHandle_.get() != NULL) {
-    return std::make_shared<CpuMatrix>(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle_),
-        height_,
-        width_,
-        true);
-  } else {
-    MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true));
-    return copy_T;
-  }
-}
-
-void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  if (memAlloc) {
-    matTrans = std::make_shared<CpuMatrix>(width_, height_);
-  } else {
-    CHECK(matTrans != NULL);
-    CHECK_EQ(matTrans->getHeight(), width_);
-    CHECK_EQ(matTrans->getWidth(), height_);
-  }
-  real* dataTrans = matTrans->getData();
-  real* data = getData();
-  int lda = getStride();
-  int ldc = matTrans->getStride();
-
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      dataTrans[j * ldc + i] = data[i * lda + j];
-    }
-  }
-}
-
-void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
-  if (memAlloc) {
-    matRot = std::make_shared<CpuMatrix>(width_, height_);
-  } else {
-    CHECK(matRot != NULL);
-    CHECK_EQ(matRot->getHeight(), width_);
-    CHECK_EQ(matRot->getWidth(), height_);
-  }
-  real* dataRot = matRot->getData();
-  real* data = getData();
-
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      if (clockWise) {
-        dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j];
-      } else {
-        dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)];
-      }
-    }
-  }
-}
-
-MatrixPtr CpuMatrix::getInverse() {
-  MatrixPtr matInv;
-  inverse(matInv, true);
-  return matInv;
-}
-
-void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
-  CHECK_EQ(height_, width_);
-
-  if (memAlloc) {
-    matInv = std::make_shared<CpuMatrix>(height_, width_);
-  } else {
-    CHECK(matInv != NULL);
-  }
-
-  CHECK_EQ(height_, matInv->getHeight());
-  CHECK_EQ(width_, matInv->getWidth());
-  matInv->copyFrom(*this);
-
-  real* data = getData();
-  real* dataInv = matInv->getData();
-  int ldc = matInv->getStride();
-
-  if (height_ == 1) {
-    CHECK_NE(*data, 0);
-    *dataInv = 1.0 / (*data);
-    return;
-  }
-
-  /* Compute the LU decomposition of the matrix */
-  std::vector<int> ipiv(height_);
-  CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor);
-  int info = getrf<real>(order, height_, height_, dataInv, ldc, ipiv.data());
-  CHECK_EQ(info, 0);
-
-  /* Compute the inverse of the matrix given its LU decompsotion */
-  info = getri<real>(order, height_, dataInv, ldc, ipiv.data());
-  CHECK_EQ(info, 0);
-}
-
-void CpuMatrix::maxPoolForward(Matrix& inputMat,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               MatrixPtr maskMatP) {
-  real* inputData = inputMat.getData();
-  real* outData = data_;
-  real* maskData = NULL;
-  size_t num = inputMat.getHeight();
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  CHECK(inLength == inputMat.getWidth() / channels);
-  CHECK_EQ(num, this->getHeight());
-  CHECK_EQ(channels * outLength, this->getWidth());
-  size_t outStride = getStride();
-
-  if (maskMatP != NULL) {
-    maskData = maskMatP->getData();
-    CHECK_EQ(channels * outLength, maskMatP->getWidth());
-  }
-
-  /* pool max one by one */
-  for (size_t n = 0; n < num; ++n) {  // frame by frame
-    if (!isContiguous()) {
-      outData = data_ + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {  // channel by channel
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = hstart + sizeY;
-        hstart = hstart < 0 ? 0 : hstart;
-        hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = wstart + sizeX;
-          wstart = wstart < 0 ? 0 : wstart;
-          wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
-          if (maskData == NULL) {
-            real tmp = -(real)FLT_MAX;
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                tmp = tmp < inputData[h * imgSizeW + w]
-                          ? inputData[h * imgSizeW + w]
-                          : tmp;
-              }
-            }
-            outData[ph * outputW + pw] = tmp;
-          } else {
-            for (int h = hstart; h < hend; ++h) {
-              for (int w = wstart; w < wend; ++w) {
-                if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) {
-                  outData[ph * outputW + pw] = inputData[h * imgSizeW + w];
-                  maskData[ph * outputW + pw] = h * imgSizeW + w;
-                }
-              }
-            }
-          }
-        }
-      }
-      // compute offset
-      inputData += inLength;
-      outData += outLength;
-
-      if (maskData != NULL) maskData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPoolBackward(Matrix& image,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                Matrix& outGrad,
-                                Matrix& outV,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW) {
-  size_t num = image.getHeight();
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  size_t channels = size_t(width_ / inLength);
-  CHECK(image.getWidth() == inLength * channels);
-  CHECK(image.getHeight() == height_ && image.getWidth() == width_);
-  CHECK(outV.getHeight() == outGrad.getHeight() &&
-        outV.getWidth() == outGrad.getWidth());
-
-  real* tgtGrad = data_;
-  real* inData = image.getData();
-  real* otData = outV.getData();
-  real* otGrad = outGrad.getData();
-
-  size_t outStride = outV.getStride();
-  real* origOutData = otData;
-  real* origOutGrad = otGrad;
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!outV.isContiguous()) {
-      otData = origOutData + n * outStride;
-      otGrad = origOutGrad + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              tgtGrad[h * imgSizeW + w] =
-                  scaleTargets * tgtGrad[h * imgSizeW + w] +
-                  scaleOutput * otGrad[ph * outputW + pw] *
-                      (inData[h * imgSizeW + w] == otData[ph * outputW + pw]);
-            }
-          }
-        }
-      }
-      // offset
-      inData += inLength;
-      tgtGrad += inLength;
-      otData += outLength;
-      otGrad += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPoolForward(Matrix& input,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t channels,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               size_t paddingH,
-                               size_t paddingW,
-                               bool excludeMode) {
-  // The main loop
-  size_t num = input.getHeight();
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  CHECK(inLength * channels == input.getWidth());
-  CHECK(outLength * channels * num == height_ * width_);
-  real* tgtData = data_;
-  real* inData = input.getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!isContiguous()) {
-      tgtData = data_ + n * getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
-          tgtData[ph * outputW + pw] = 0;  // clear
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
-            }
-          }
-          int poolSize =
-              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-          CHECK(poolSize);
-          tgtData[ph * outputW + pw] /= poolSize;
-        }
-      }
-      // compute offset
-      inData += inLength;
-      tgtData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPoolBackward(Matrix& input,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t sizeX,
-                                size_t sizeY,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t outputH,
-                                size_t outputW,
-                                real scaleTargets,
-                                real scaleOutput,
-                                size_t paddingH,
-                                size_t paddingW,
-                                bool excludeMode) {
-  size_t num = input.getHeight();
-  size_t channels = input.getWidth() / outputH / outputW;
-  size_t inLength = imgSizeH * imgSizeW;
-  size_t outLength = outputH * outputW;
-  CHECK(inLength * channels == getWidth());
-  real* inData = input.getData();
-  real* outData = getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!input.isContiguous()) {
-      inData = input.getData() + n * input.getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t ph = 0; ph < outputH; ++ph) {
-        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
-        for (size_t pw = 0; pw < outputW; ++pw) {
-          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
-          int poolSize =
-              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
-          CHECK(poolSize);
-
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              outData[h * imgSizeW + w] += inData[ph * outputW + pw] / poolSize;
-            }
-          }
-        }
-      }
-      // offset
-      outData += inLength;
-      inData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPool3DForward(Matrix& inputMat,
-                                 Matrix& maxPoolIdx,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  real* inputData = inputMat.getData();
-  real* outData = getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t num = inputMat.getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  CHECK(inLength == inputMat.getWidth() / channels);
-  CHECK_EQ(num, this->getHeight());
-  CHECK_EQ(channels * outLength, this->getWidth());
-  size_t outStride = getStride();
-
-  /* initialize the data_ */
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      outData[(i)*outStride + j] = -(real)FLT_MAX;
-      maxPoolIdxData[(i)*outStride + j] = -1;
-    }
-  }
-
-  /* pool max one by one */
-  for (size_t n = 0; n < num; ++n) {  // frame by frame
-    if (!isContiguous()) {
-      outData = getData() + n * outStride;
-      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {  // channel by channel
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        int dstart = pd * strideD - paddingD;
-        int dend = std::min(dstart + sizeZ, imgSizeD);
-        dstart = std::max(dstart, 0);
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          int hstart = ph * strideH - paddingH;
-          int hend = std::min(hstart + sizeY, imgSizeH);
-          hstart = std::max(hstart, 0);
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            int wstart = pw * strideW - paddingW;
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            wstart = std::max(wstart, 0);
-            int maxIdx = -1;
-            real maxOutData = outData[(pd * outputH + ph) * outputW + pw];
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  if (maxOutData <
-                      inputData[(d * imgSizeH + h) * imgSizeW + w]) {
-                    maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w];
-                    maxIdx = (d * imgSizeH + h) * imgSizeW + w;
-                  }
-                }
-              }
-            }
-            outData[(pd * outputH + ph) * outputW + pw] = maxOutData;
-            maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx;
-          }
-        }
-      }
-      // compute offset
-      inputData += inLength;
-      outData += outLength;
-      maxPoolIdxData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::maxPool3DBackward(Matrix& outGrad,
-                                  Matrix& maxPoolIdx,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  size_t num = getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  size_t channels = size_t(width_ / inLength);
-  CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() &&
-        maxPoolIdx.getWidth() == outGrad.getWidth());
-
-  real* tgtGrad = getData();
-  real* otGrad = outGrad.getData();
-  real* maxPoolIdxData = maxPoolIdx.getData();
-  size_t outStride = outGrad.getStride();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!outGrad.isContiguous()) {
-      otGrad = outGrad.getData() + n * outStride;
-      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            const size_t index = (pd * outputH + ph) * outputW + pw;
-            const size_t tgtIdx = static_cast<size_t>(maxPoolIdxData[index]);
-            tgtGrad[tgtIdx] =
-                scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index];
-          }
-        }
-      }
-      // offset
-      tgtGrad += inLength;
-      otGrad += outLength;
-      maxPoolIdxData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPool3DForward(Matrix& input,
-                                 size_t channels,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW) {
-  // The main loop
-  size_t num = input.getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  CHECK(inLength * channels == input.getWidth());
-  CHECK(outLength * channels * num == height_ * width_);
-  real* tgtData = getData();
-  real* inData = input.getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!isContiguous()) {
-      tgtData = data_ + n * getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        int dstart = pd * strideD - paddingD;
-        int dend = std::min(dstart + sizeZ, imgSizeD);
-        dstart = std::max(dstart, 0);
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          int hstart = ph * strideH - paddingH;
-          int hend = std::min(hstart + sizeY, imgSizeH);
-          hstart = std::max(hstart, 0);
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            int wstart = pw * strideW - paddingW;
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            wstart = std::max(wstart, 0);
-
-            tgtData[(pd * outputH + ph) * outputW + pw] = 0;  // clear
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  tgtData[(pd * outputH + ph) * outputW + pw] +=
-                      inData[(d * imgSizeH + h) * imgSizeW + w];
-                }
-              }
-            }
-            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
-            CHECK(poolSize);
-            tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize;
-          }
-        }
-      }
-      // compute offset
-      inData += inLength;
-      tgtData += outLength;
-    }
-  }
-}
-
-void CpuMatrix::avgPool3DBackward(Matrix& input,
-                                  size_t imgSizeD,
-                                  size_t imgSizeH,
-                                  size_t imgSizeW,
-                                  size_t outputD,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  size_t sizeZ,
-                                  size_t sizeY,
-                                  size_t sizeX,
-                                  size_t strideD,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingD,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  real scaleTargets,
-                                  real scaleOutput) {
-  size_t num = input.getHeight();
-  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
-  size_t outLength = outputH * outputW * outputD;
-  size_t channels = input.getWidth() / outLength;
-  CHECK(inLength * channels == getWidth());
-  real* inData = input.getData();
-  real* outData = getData();
-
-  for (size_t n = 0; n < num; ++n) {
-    if (!input.isContiguous()) {
-      inData = input.getData() + n * input.getStride();
-    }
-    for (size_t c = 0; c < channels; ++c) {
-      for (size_t pd = 0; pd < outputD; ++pd) {
-        int dstart = pd * strideD - paddingD;
-        int dend = std::min(dstart + sizeZ, imgSizeD);
-        dstart = std::max(dstart, 0);
-        for (size_t ph = 0; ph < outputH; ++ph) {
-          int hstart = ph * strideH - paddingH;
-          int hend = std::min(hstart + sizeY, imgSizeH);
-          hstart = std::max(hstart, 0);
-          for (size_t pw = 0; pw < outputW; ++pw) {
-            int wstart = pw * strideW - paddingW;
-            int wend = std::min(wstart + sizeX, imgSizeW);
-            wstart = std::max(wstart, 0);
-            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
-            CHECK(poolSize);
-            for (int d = dstart; d < dend; ++d) {
-              for (int h = hstart; h < hend; ++h) {
-                for (int w = wstart; w < wend; ++w) {
-                  outData[(d * imgSizeH + h) * imgSizeW + w] +=
-                      inData[(pd * outputH + ph) * outputW + pw] / poolSize;
-                }
-              }
-            }
-          }
-        }
-      }
-      // offset
-      outData += inLength;
-      inData += outLength;
-    }
-  }
-}
-
-/**
- * Input: one or more sequences. Each sequence contains some instances.
- * Output: output size is the number of input sequences (NOT input instances).
- * output[i] is set to max_{for each instance in this sequence}{input[i]}
- */
-void CpuMatrix::maxSequenceForward(Matrix& input,
-                                   const IVector& sequence,
-                                   IVector& index) {
-  CHECK(dynamic_cast<CpuMatrix*>(&input));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-  CHECK(dynamic_cast<CpuIVector*>(&index));
-
-  real* outData = getData();
-  real* inputData = input.getData();
-  const int* starts = sequence.getData();
-  int* maxIndex = index.getData();
-  size_t numSequences = getHeight();
-  size_t dim = getWidth();
-
-  CHECK_EQ(dim, input.getWidth());
-  CHECK_EQ(numSequences, sequence.getSize() - 1);
-  CHECK_EQ(starts[numSequences], (int)input.getHeight());
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-    // current sequence, loop for each input instance
-    // (1) first instance: do not need compare, copy value to outV directly
-    for (size_t k = 0; k < dim; ++k) {
-      outData[sequenceId * dim + k] = inputData[starts[sequenceId] * dim + k];
-      maxIndex[sequenceId * dim + k] = starts[sequenceId];
-    }
-    // (2) other instance in same sequence
-    for (int insId = starts[sequenceId] + 1; insId < starts[sequenceId + 1];
-         ++insId) {
-      // insId is the index on all instances
-      for (size_t k = 0; k < dim; ++k) {
-        // for each dim
-        if (inputData[insId * dim + k] > outData[sequenceId * dim + k]) {
-          // update max value and record index
-          outData[sequenceId * dim + k] = inputData[insId * dim + k];
-          maxIndex[sequenceId * dim + k] = insId;
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
-                                    const IVector& sequence,
-                                    IVector& index) {
-  CHECK(dynamic_cast<CpuMatrix*>(&outputGrad));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-  CHECK(dynamic_cast<CpuIVector*>(&index));
-
-  real* inputGrad = getData();
-  real* outGrad = outputGrad.getData();
-  int* maxIndex = index.getData();
-  size_t dim = getWidth();
-  size_t numSequences = sequence.getSize() - 1;
-
-  CHECK_EQ(dim, outputGrad.getWidth());
-  CHECK_EQ(numSequences, outputGrad.getHeight());
-  CHECK_EQ(numSequences * dim, index.getSize());
-
-  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-    // current sequence
-    for (size_t j = 0; j < dim; ++j) {
-      // each dim
-      int insId = maxIndex[sequenceId * dim + j];
-      inputGrad[insId * dim + j] += outGrad[sequenceId * dim + j];
-    }
-  }
-}
-
-inline void vecAddTo(real* a, const real* b, size_t len) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i] += b[i];
-  }
-}
-
-inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i] += scaleB * b[i];
-  }
-}
-
-inline void colVecAddTo(
-    real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i * aWidth] += b[i * bWidth];
-  }
-}
-
-inline void colVecAddTo(
-    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
-  for (unsigned int i = 0; i < len; ++i) {
-    a[i * aWidth] += b[i * bWidth] * c;
-  }
-}
-
-void CpuMatrix::addBias(Matrix& b, real scale) {
-  CHECK(b.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(b.getHeight(), (size_t)1);
-  CHECK_EQ(width_, b.getWidth());
-  real* aData = getData();
-  real* bData = b.getData();
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-
-  if (scale == 1 && getStride() % 32 == 0) {  // use libaddto
-    // @TODO(yuyang18) Make input addr can be unaligned.
-    // So merge this if and else
-    CHECK_EQ((size_t)aData % 32, 0UL);
-    CHECK_EQ((size_t)bData % 32, 0UL);
-    for (size_t i = 0; i < numSamples; i++) {
-      simd::addTo(aData + i * getStride(), bData, dim);
-    }
-  } else {
-    for (size_t i = 0; i < numSamples; i++) {
-      for (size_t j = 0; j < dim; j++) {
-        aData[i * getStride() + j] += scale * bData[j];
-      }
-    }
-  }
-}
-
-void CpuMatrix::addSharedBias(Matrix& b, real scale) {
-  CHECK_EQ(b.getHeight(), (size_t)1);
-  real* aData = getData();
-  real* bData = b.getData();
-  size_t numSamples = getHeight();
-  size_t channel = b.getWidth();
-  CHECK_EQ(getWidth() % channel, 0UL);
-  size_t dim = getWidth() / channel;
-
-  for (size_t i = 0; i < numSamples; i++) {
-    for (size_t c = 0; c < channel; c++) {
-      for (size_t j = 0; j < dim; j++) {
-        aData[i * getStride() + c * dim + j] += scale * bData[c];
-      }
-    }
-  }
-}
-
-void CpuMatrix::collectBias(Matrix& a, real scale) {
-  CHECK_EQ(getHeight(), (size_t)1);
-  CHECK_EQ(width_, a.getWidth());
-  CpuSparseMatrix* aptr = dynamic_cast<CpuSparseMatrix*>(&a);
-  if (!aptr) {
-    sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1);
-  } else {
-    size_t nnz = aptr->getElementCnt();
-    int* cols = aptr->getCols();
-    real* A = aptr->getValue();
-    real* B = getData();
-    for (size_t i = 0; i < nnz; i++) {
-      B[cols[i]] += scale * A[i];
-    }
-  }
-}
-
-void CpuMatrix::collectSharedBias(Matrix& a, real scale) {
-  CHECK_EQ(getHeight(), (size_t)1);
-  real* B = getData();
-  real* A = a.getData();
-  size_t numSamples = a.getHeight();
-  size_t channel = getWidth();
-  CHECK_EQ(a.getWidth() % channel, 0UL);
-  size_t dim = a.getWidth() / channel;
-  for (size_t i = 0; i < numSamples; i++) {
-    for (size_t c = 0; c < channel; c++) {
-      for (size_t j = 0; j < dim; j++) {
-        B[c] += scale * A[i * channel * dim + c * dim + j];
-      }
-    }
-  }
-}
-
-void CpuMatrix::sequenceAvgForward(Matrix& a,
-                                   const IVector& startsPos,
-                                   int mode) {
-  size_t height = getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
-  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
-  for (size_t i = 0; i < height; i++) {
-    int sequenceLength = starts[i + 1] - starts[i];
-    if (0 == sequenceLength) {
-      // empty sequence
-      continue;
-    }
-    outMtx->setData(dst + i * width);
-    dataMtx->setData(src + starts[i] * width, sequenceLength, width);
-    if (mode == 0) {
-      // plain average
-      outMtx->sumCols(*dataMtx,
-                      (real)1 / (real)sequenceLength,
-                      /* scaleDest= */ 1);
-    } else if (mode == 1) {
-      // sum instead of average
-      outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1);
-    } else if (mode == 2) {
-      // divide by square root of sequenceLength
-      outMtx->sumCols(*dataMtx,
-                      (real)1 / std::sqrt(sequenceLength),
-                      /* scaleDest= */ 1);
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-void CpuMatrix::sequenceAvgBackward(Matrix& a,
-                                    const IVector& startsPos,
-                                    int mode) {
-  size_t height = a.getHeight();
-  size_t width = getWidth();
-  CHECK_EQ(height, startsPos.getSize() - 1);
-  CHECK_EQ(width, a.getWidth());
-  real* dst = getData();
-  real* src = a.getData();
-  const int* starts = startsPos.getData();
-  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
-  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
-  for (size_t i = 0; i < height; ++i) {
-    int sequenceLength = starts[i + 1] - starts[i];
-    if (0 == sequenceLength) {
-      // empty sequence
-      continue;
-    }
-    outMtx->setData(dst + starts[i] * width, sequenceLength, width);
-    dataMtx->setData(src + i * width);
-    if (mode == 0) {
-      // plain average
-      outMtx->addBias(*dataMtx, 1.0f / sequenceLength);
-    } else if (mode == 1) {
-      // sum instead of average
-      outMtx->addBias(*dataMtx, 1.0f);
-    } else if (mode == 2) {
-      // divide by square root of sequenceLength
-      outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength));
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-/* this = scaleAB*(a*b) + scaleT*this*/
-void CpuMatrix::mul(const Matrix& a,
-                    const Matrix& b,
-                    real scaleAB,
-                    real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-  const auto a_ptr = dynamic_cast<const CpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const CpuMatrix*>(&b);
-  const auto a_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&a);
-  const auto b_ptr_s = dynamic_cast<const CpuSparseMatrix*>(&b);
-
-  if (a_ptr && b_ptr) {
-    mul((CpuMatrix*)a_ptr, (CpuMatrix*)b_ptr, scaleAB, scaleT);
-  } else if (a_ptr_s && b_ptr) {
-    mul((CpuSparseMatrix*)a_ptr_s, (CpuMatrix*)b_ptr, scaleAB, scaleT);
-  } else if (a_ptr && b_ptr_s) {
-    mul((CpuMatrix*)a_ptr, (CpuSparseMatrix*)b_ptr_s, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "Not supported";
-  }
-}
-
-void CpuMatrix::mul(CpuSparseMatrix* a,
-                    CpuMatrix* b,
-                    real scaleAB,
-                    real scaleT) {
-  if (dynamic_cast<CacheRowCpuMatrix*>(b)) {
-    return mul(a, dynamic_cast<CacheRowCpuMatrix*>(b), this, scaleAB, scaleT);
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(b)) {
-    return mul(a, dynamic_cast<SparseRowCpuMatrix*>(b), this, scaleAB, scaleT);
-  } else {
-    return mul(a, b, this, scaleAB, scaleT);
-  }
-}
-
-void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-
-  size_t a_col, b_col, a_row, b_row;
-  bool a_trans, b_trans;
-  if (!a->isTransposed()) {
-    a_col = a->getWidth();
-    a_row = a->getHeight();
-    a_trans = false;
-  } else {
-    a_col = a->getHeight();
-    a_row = a->getWidth();
-    a_trans = true;
-  }
-  if (!b->isTransposed()) {
-    b_col = b->getWidth();
-    b_row = b->getHeight();
-    b_trans = false;
-  } else {
-    b_col = b->getHeight();
-    b_row = b->getWidth();
-    b_trans = true;
-  }
-
-  CHECK_EQ(a_col, b_row);
-  CHECK_EQ(a_row, getHeight());
-  CHECK_EQ(b_col, getWidth());
-
-  real* A = a->getData();
-  real* B = b->getData();
-  real* C = getData();
-
-  int M = getHeight();
-  int N = getWidth();
-  int K = a_col;
-  int lda = a->getStride();
-  int ldb = b->getStride();
-  int ldc = getStride();
-  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
-      a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
-}
-
-void CpuMatrix::mul(
-    CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) {
-  CHECK(!c->isTransposed()) << "Not supported";
-  CHECK_EQ(c->getValueType(), FLOAT_VALUE);
-
-  real* A = a->getData();
-  real* B = b->getData();
-  real* C = c->getValue();
-  int* rows = c->getRows();
-  int* cols = c->getCols();
-  size_t height = c->getHeight();
-  size_t width = c->getWidth();
-  if (scaleT == 0) {
-    c->zeroMem();
-  }
-
-  if (!a->isTransposed() && !b->isTransposed()) {
-    size_t m = a->getWidth();
-    CHECK_EQ(b->getHeight(), m);
-    CHECK_EQ(a->getHeight(), height);
-    CHECK_EQ(b->getWidth(), width);
-    if (c->getFormat() == SPARSE_CSC) {
-      for (size_t i = 0; i < width; i++) {
-        size_t start = c->getColStartIdx(i);
-        size_t end = c->getColStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t rowIdx = rows[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[rowIdx * m + k] * B[k * width + i];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    } else {
-      for (size_t i = 0; i < height; i++) {
-        size_t start = c->getRowStartIdx(i);
-        size_t end = c->getRowStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t colIdx = cols[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[i * m + k] * B[k * width + colIdx];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    }
-  } else if (a->isTransposed() && !b->isTransposed()) {
-    size_t m = a->getHeight();
-    CHECK_EQ(m, b->getHeight());
-    CHECK_EQ(b->getWidth(), width);
-    CHECK_EQ(a->getWidth(), height);
-
-    if (c->getFormat() == SPARSE_CSC) {
-      for (size_t i = 0; i < width; i++) {
-        size_t start = c->getColStartIdx(i);
-        size_t end = c->getColStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t rowIdx = rows[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[k * height + rowIdx] * B[k * width + i];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    } else {
-      for (size_t i = 0; i < height; i++) {
-        int start = c->getRowStartIdx(i);
-        int end = c->getRowStartIdx(i + 1);
-        for (int j = start; j < end; j++) {
-          real sum = 0;
-          size_t colIdx = cols[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[k * height + i] * B[k * width + colIdx];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    }
-  } else if (!a->isTransposed() && b->isTransposed()) {
-    size_t m = a->getWidth();
-    CHECK_EQ(b->getWidth(), m);
-    CHECK_EQ(a->getHeight(), height);
-    CHECK_EQ(b->getHeight(), width);
-    if (c->getFormat() == SPARSE_CSR) {
-      for (size_t i = 0; i < height; i++) {
-        size_t start = c->getRowStartIdx(i);
-        size_t end = c->getRowStartIdx(i + 1);
-        for (size_t j = start; j < end; j++) {
-          real sum = 0;
-          size_t colIdx = cols[j];
-          for (size_t k = 0; k < m; k++) {
-            sum += A[i * m + k] * B[colIdx * m + k];
-          }
-          C[j] = scaleAB * sum + scaleT * C[j];
-        }
-      }
-    } else {
-      LOG(FATAL) << "Not supported csc format "
-                    "when a is not trans and b is trans";
-    }
-  } else {
-    LOG(FATAL) << "Not supported";
-  }
-}
-
-void CpuMatrix::mul(CpuMatrix* a,
-                    CpuSparseMatrix* b,
-                    real scaleAB,
-                    real scaleT) {
-  CHECK(!trans_) << "Not supported";
-  CHECK(!a->isTransposed()) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1);
-
-  // TODO(yuyang18): Maybe bug implementation here
-  CHECK_EQ(scaleAB, static_cast<real>(1.0));
-
-  real* A = a->getData();
-  real* B = b->getValue();
-  real* C = getData();
-  int* rows = b->getRows();
-  int* cols = b->getCols();
-
-  if (scaleT == 0) {
-    zeroMem();
-  }
-  if (b->getFormat() == SPARSE_CSC) {
-    if (!b->isTransposed()) {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), m);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), width_);
-
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t j = 0; j < b->getWidth(); ++j) {
-          int start = b->getColStartIdx(j);
-          int end = b->getColStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(C + j, A + rows[i], height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t j = 0; j < b->getWidth(); ++j) {
-          int start = b->getColStartIdx(j);
-          int end = b->getColStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(
-                C + j, A + rows[i], B[i], height_, width_, a->getWidth());
-          }
-        }
-      }
-    } else /*if (b->isTransposed())*/ {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), width_);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), m);
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t i = 0; i < b->getWidth(); ++i) {
-          int start = b->getColStartIdx(i);
-          int end = b->getColStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(C + rows[j], A + i, height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t i = 0; i < b->getWidth(); ++i) {
-          int start = b->getColStartIdx(i);
-          int end = b->getColStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(
-                C + rows[j], A + i, B[j], height_, width_, a->getWidth());
-          }
-        }
-      }
-    }
-  } else {
-    if (!b->isTransposed()) {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), m);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), width_);
-
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t j = 0; j < b->getHeight(); ++j) {
-          int start = b->getRowStartIdx(j);
-          int end = b->getRowStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(C + cols[i], A + j, height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t j = 0; j < b->getHeight(); ++j) {
-          int start = b->getRowStartIdx(j);
-          int end = b->getRowStartIdx(j + 1);
-          for (int i = start; i < end; ++i) {
-            colVecAddTo(
-                C + cols[i], A + j, B[i], height_, width_, a->getWidth());
-          }
-        }
-      }
-    } else /*if (b->isTransposed())*/ {
-      size_t m = a->getWidth();
-      CHECK_EQ(b->getHeight(), width_);
-      CHECK_EQ(a->getHeight(), height_);
-      CHECK_EQ(b->getWidth(), m);
-      if (b->getValueType() == NO_VALUE) {
-        for (size_t i = 0; i < b->getHeight(); ++i) {
-          int start = b->getRowStartIdx(i);
-          int end = b->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(C + i, A + cols[j], height_, width_, a->getWidth());
-          }
-        }
-      } else if (b->getValueType() == FLOAT_VALUE) {
-        for (size_t i = 0; i < b->getHeight(); ++i) {
-          int start = b->getRowStartIdx(i);
-          int end = b->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            colVecAddTo(
-                C + i, A + cols[j], B[j], height_, width_, a->getWidth());
-          }
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::selectRows(Matrix& table, IVector& ids) {
-  if (dynamic_cast<CacheRowCpuMatrix*>(&table)) {
-    selectRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids);
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) {
-    selectRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids);
-  } else {
-    CHECK(table.isContiguous());
-    selectRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids);
-  }
-}
-
-void CpuMatrix::selectElements(Matrix& table, IVector& ids) {
-  CHECK_EQ(table.getHeight(), ids.getSize());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), 1U);
-  real* tableData = table.getData();
-  int* idsData = ids.getData();
-  for (size_t i = 0; i < table.getHeight(); i++) {
-    data_[i] += tableData[i * table.getWidth() + idsData[i]];
-  }
-}
-
-void CpuMatrix::addElements(Matrix& table, IVector& ids) {
-  CHECK_EQ(table.getHeight(), ids.getSize());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), 1U);
-  real* tableData = table.getData();
-  int* idsData = ids.getData();
-  for (size_t i = 0; i < table.getHeight(); i++) {
-    tableData[i * table.getWidth() + idsData[i]] += data_[i];
-  }
-}
-
-// this.row[i] += table.row[ids[i]]
-template <typename TableMatType>
-void CpuMatrix::selectRowsImp(TableMatType& table, IVector& ids) {
-  CHECK(!table.useGpu());
-  CHECK(!ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  for (size_t i = 0; i < numSamples; ++i) {
-    if (index[i] == -1) continue;
-    CHECK_LT(index[i], (int)tableSize);
-    CHECK_GE(index[i], 0);
-    vecAddTo(a + i * stride_, table.getRow(index[i]), dim);
-  }
-}
-
-void CpuMatrix::addToRows(Matrix& table, IVector& ids) {
-  if (dynamic_cast<CacheRowCpuMatrix*>(&table)) {
-    addToRowsImp(*dynamic_cast<CacheRowCpuMatrix*>(&table), ids);
-  } else if (dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table)) {
-    addToRowsImp(*dynamic_cast<SparseAutoGrowRowCpuMatrix*>(&table), ids);
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(&table)) {
-    addToRowsImp(*dynamic_cast<SparseRowCpuMatrix*>(&table), ids);
-  } else {
-    CHECK(table.isContiguous());
-    addToRowsImp(*dynamic_cast<CpuMatrix*>(&table), ids);
-  }
-}
-
-// table.row[ids[i]] += this.row[i]
-template <typename TableMatType>
-void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) {
-  CHECK(!table.useGpu());
-  CHECK(!ids.useGpu());
-  CHECK_EQ(getHeight(), ids.getSize());
-  CHECK_EQ(getWidth(), table.getWidth());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  real* a = getData();
-  size_t tableSize = table.getHeight();
-  int* index = ids.getData();
-
-  for (size_t i = 0; i < numSamples; ++i) {
-    if (index[i] == -1) continue;
-    CHECK_LT(index[i], (int)tableSize);
-    CHECK_GE(index[i], 0);
-    vecAddTo(table.getRow(index[i]), a + i * stride_, dim);
-  }
-}
-
-static ThreadLocal<std::vector<const real*>> threadLocalColArray;
-
-template <typename MatBType, typename MatCType>
-void CpuMatrix::mul(
-    CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) {
-  CHECK(!c->isTransposed()) << "Not supported";
-  CHECK(!b->isTransposed()) << "Not supported";
-  // TODO(yuyang18): Maybe bug implementation here.
-  CHECK(scaleAB == 1) << "Not supported";
-  CHECK(scaleT == 0 || scaleT == 1) << "Not supported";
-  CHECK_EQ(a->getFormat(), SPARSE_CSR) << "Not supported";
-
-  real* B = b->getData();
-  real* C = c->getData();
-  size_t height = c->getHeight();
-  size_t width = c->getWidth();
-  int* cols = a->getCols();
-  real* values = a->getValue();
-
-  if (scaleT == 0) {
-    c->zeroMem();
-  }
-
-  if (!a->isTransposed()) {
-    size_t m = a->getWidth();
-    CHECK_EQ(b->getHeight(), m);
-    CHECK_EQ(a->getHeight(), height);
-    CHECK_EQ(b->getWidth(), width);
-
-    if (a->getValueType() == NO_VALUE) {
-      if (width % 32 == 0) {  // use libaddto
-        // @TODO(yuyang18) Make input addr can be unaligned.
-        // So merge this if and else
-        CHECK_EQ((size_t)B % 32, 0UL);
-        CHECK_EQ((size_t)C % 32, 0UL);
-        auto& colArray = *threadLocalColArray;
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          size_t colNum = end - start;
-          colArray.resize(colNum);
-          for (int j = 0; j < end - start; ++j) {
-            colArray[j] = b->getRow(cols[j + start]);
-          }
-          simd::batchAddTo(c->getRow(i), &colArray[0], colNum, width);
-        }
-
-      } else {
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            vecAddTo(c->getRow(i), b->getRow(cols[j]), width);
-          }
-        }
-      }
-    } else if (a->getValueType() == FLOAT_VALUE) {
-      for (size_t i = 0; i < a->getHeight(); ++i) {
-        const int start = a->getRowStartIdx(i);
-        const int end = a->getRowStartIdx(i + 1);
-        for (int j = start; j < end; ++j) {
-          vecAddTo(c->getRow(i), b->getRow(cols[j]), values[j], width);
-        }
-      }
-    }
-  } else /*if (a->isTransposed())*/ {
-    size_t m = a->getHeight();
-    CHECK_EQ(b->getHeight(), m);
-    CHECK_EQ(a->getWidth(), height);
-    CHECK_EQ(b->getWidth(), width);
-    if (a->getValueType() == NO_VALUE) {
-      if (width % 32 == 0) {  // use libaddto
-        // @TODO(yuyang18) Make input addr can be unaligned.
-        // So merge this if and else
-        CHECK_EQ((size_t)B % 32, 0UL);
-        CHECK_EQ((size_t)C % 32, 0UL);
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            simd::addTo(c->getRow(cols[j]), b->getRow(i), width);
-          }
-        }
-
-      } else {
-        for (size_t i = 0; i < a->getHeight(); ++i) {
-          const int start = a->getRowStartIdx(i);
-          const int end = a->getRowStartIdx(i + 1);
-          for (int j = start; j < end; ++j) {
-            vecAddTo(c->getRow(cols[j]), b->getRow(i), width);
-          }
-        }
-      }
-    } else if (a->getValueType() == FLOAT_VALUE) {
-      for (size_t i = 0; i < a->getHeight(); ++i) {
-        const int start = a->getRowStartIdx(i);
-        const int end = a->getRowStartIdx(i + 1);
-        for (int j = start; j < end; ++j) {
-          vecAddTo(c->getRow(cols[j]), b->getRow(i), values[j], width);
-        }
-      }
-    }
-  }
-}
-
-// instantiation mul() called in SparseRowMatrix.cpp
-template void CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(
-    CpuSparseMatrix* a,
-    CpuMatrix* b,
-    SparseRowCpuMatrix* c,
-    real scaleAB,
-    real scaleT);
-template void CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(
-    CpuSparseMatrix* a,
-    CpuMatrix* b,
-    SparseAutoGrowRowCpuMatrix* c,
-    real scaleAB,
-    real scaleT);
-template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
-                                                           CpuMatrix* b,
-                                                           CacheRowCpuMatrix* c,
-                                                           real scaleAB,
-                                                           real scaleT);
-
-#ifndef PADDLE_MOBILE_INFERENCE
-void SharedCpuMatrix::mul(CpuSparseMatrix* a,
-                          CpuMatrix* b,
-                          real scaleAB,
-                          real scaleT) {
-  CHECK(!isTransposed()) << "Not supported";
-  CHECK(!b->isTransposed()) << "Not supported";
-  CHECK_EQ(scaleAB, 1) << "Not supported";
-  CHECK_EQ(scaleT, 1) << "Not supported";
-  CHECK_EQ(a->getFormat(), SPARSE_CSR) << "not supported";
-
-  real* B = b->getData();
-  real* C = getData();
-  size_t height = getHeight();
-  size_t width = getWidth();
-
-  // get real trans
-  MatrixPtr aTrans;
-  if (a->isTransposed()) {
-    aTrans = a->getTmpSparseMatrix(a->getWidth(), a->getHeight());
-    a->transpose(aTrans, false);
-  }
-  a = dynamic_cast<CpuSparseMatrix*>(aTrans.get());
-
-  size_t m = a->getWidth();
-  CHECK_EQ(b->getHeight(), m);
-  CHECK_EQ(a->getHeight(), height);
-  CHECK_EQ(b->getWidth(), width);
-
-  size_t blockSize = (height / blockNum_) + 1;
-  CpuMatrixPtr localBuf = *localBuf_;
-  if (!localBuf) {
-    localBuf = std::make_shared<CpuMatrix>(blockSize, width);
-  } else {
-    localBuf->resize(blockSize, width);
-  }
-  localBuf->zeroMem();
-  real* localC = localBuf->getData();
-  std::vector<int>& blockSeq = *blockSeq_;
-  if (blockSeq.size() == 0) {
-    for (int k = 0; k < blockNum_; ++k) {
-      blockSeq.push_back(k);
-    }
-    std::shuffle(
-        blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get());
-  }
-  std::vector<int>& localBufRows = *localBufRows_;
-  int* cols = a->getCols();
-  real* value = a->getValue();
-
-  for (int k = 0; k < blockNum_; ++k) {
-    int blockId = blockSeq[k];
-    size_t blockBegin = blockId * blockSize;
-    size_t blockEnd = (blockId + 1) * blockSize;
-    if (blockId == blockNum_ - 1) {
-      blockEnd = height;
-    }
-    if (a->getValueType() == NO_VALUE) {
-      for (size_t i = blockBegin; i < blockEnd; ++i) {
-        int start = a->getRowStartIdx(i);
-        int end = a->getRowStartIdx(i);
-        size_t colNum = a->getColNum(i);
-        if (colNum == 0) {
-          continue;
-        }  // skip empty row
-        localBufRows.push_back(i);
-        size_t bufPos = localBufRows.size() - 1;
-        for (int j = start; j < end; ++j) {
-          vecAddTo(localC + bufPos * width, B + cols[j] * width, width);
-        }
-      }
-    } else if (a->getValueType() == FLOAT_VALUE) {
-      for (size_t i = blockBegin; i < blockEnd; ++i) {
-        int start = a->getRowStartIdx(i);
-        int end = a->getRowStartIdx(i);
-        size_t colNum = a->getColNum(i);
-        if (colNum == 0) {
-          continue;
-        }  // skip empty row
-        localBufRows.push_back(i);
-        size_t bufPos = localBufRows.size() - 1;
-        for (int j = start; j < end; ++j) {
-          vecAddTo(
-              localC + bufPos * width, B + cols[j] * width, value[j], width);
-        }
-      }
-    }
-
-    {
-      std::lock_guard<std::mutex> guard(*blockLocks_[blockId]);
-      for (size_t i = 0; i < localBufRows.size(); ++i) {
-        vecAddTo(C + localBufRows[i] * width, localC + i * width, width);
-      }
-    }
-    memset(localC, 0, localBufRows.size() * width * sizeof(real));
-    localBufRows.clear();
-  }
-
-  VLOG(2) << " B[0]=" << B[0] << " B[1]=" << B[1] << " C[0]=" << C[0]
-          << " C[1]=" << C[1];
-}
-
-void SharedCpuMatrix::add(Matrix& b, real p1, real p2) {
-  CHECK_EQ(blockNum_, 1);
-  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
-  CpuMatrix::add(b, p1, p2);
-}
-
-void SharedCpuMatrix::add(real p1, real p2) {
-  CHECK_EQ(blockNum_, 1);
-  std::lock_guard<std::mutex> guard(*blockLocks_[0]);
-  CpuMatrix::add(p1, p2);
-}
-
-void SharedCpuMatrix::initShared(int blockNum) {
-  CHECK_GT(height_ * width_, 1UL * 1024 * 1024)
-      << "should not share small matrix";
-  initBlock(blockNum);
-}
-
-void SharedCpuMatrix::initBlock(int blockNum) {
-  CHECK_LE(blockNum, 200) << "should not use large block number";
-  blockNum_ = blockNum;
-  blockLocks_.resize(blockNum);
-  for (auto& locker : blockLocks_) {
-    locker.reset(new std::mutex);
-  }
-}
-
-#endif
-/* Add a (column) vector b to matrix a, column by column */
-void CpuMatrix::addColumnVector(const Matrix& b) {
-  BaseMatrix::addColVector(const_cast<Matrix&>(b));
-}
-
-/* this = a*b */
-void CpuMatrix::mul(const Matrix& a, const Matrix& b) {
-  return mul(a, b, 1.0, 0.0);
-}
-
-/* this = scaleAB*(this*b) +  scaleT*this */
-void CpuMatrix::rightMul(Matrix& b, real scaleAB, real scaleT) {
-  (void)b;
-  (void)scaleAB;
-  (void)scaleT;
-  LOG(FATAL) << "Not implemented";
-}
-
-/* this = this* b */
-void CpuMatrix::rightMul(Matrix& b) { return rightMul(b, 1.0, 0.0); }
-
-/* this = scaleAB*(a*this) +  scaleT*this */
-void CpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
-  (void)a;
-  (void)scaleAB;
-  (void)scaleT;
-  LOG(FATAL) << "Not implemented";
-}
-
-/* this = a*this) */
-void CpuMatrix::leftMul(Matrix& a) { return leftMul(a, 1.0, 0.0); }
-
-void CpuMatrix::colMerge(Matrix& src) { src.rowSum(*this); }
-
-void CpuMatrix::rowSum(Matrix& sum) {
-  CHECK_EQ(sum.getHeight(), getHeight());
-  CHECK_EQ(sum.getWidth(), (size_t)1);
-
-  sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0);
-}
-
-void CpuMatrix::rowMaxId(IVector& maxIds) {
-  CHECK(!maxIds.useGpu()) << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  CHECK_EQ(maxIds.getSize(), numSamples);
-
-  real* a = getData();
-  int* s = maxIds.getData();
-  size_t dim = getWidth();
-
-  for (size_t i = 0; i < numSamples; i++) {
-    real sm = a[i * dim];
-    int maxId = 0;
-    for (size_t j = 1; j < dim; j++) {
-      if (a[i * dim + j] > sm) {
-        maxId = j;
-        sm = a[i * dim + j];
-      }
-    }
-    s[i] = maxId;
-  }
-}
-
-void CpuMatrix::rowMax(Matrix& max) {
-  CHECK_EQ(max.getHeight(), getHeight());
-  CHECK_EQ(max.getWidth(), (size_t)1);
-  max.maxRows(*this);
-}
-
-/* Get the top k elements of each row of this matrix */
-void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-  CHECK(isContiguous());
-  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  CHECK_EQ(maxVal.getWidth(), beam);
-
-  real* a = getData();
-  int* s = maxIds.getData();
-  real* t = maxVal.getData();
-  size_t dim = getWidth();
-  for (size_t i = 0; i < numSamples; i++) {
-    std::vector<std::pair<real, size_t>> vec;
-    for (size_t j = 0; j < dim; j++) {
-      vec.push_back(std::pair<real, size_t>(a[i * dim + j], j));
-    }
-
-    std::partial_sort(
-        vec.begin(),
-        vec.begin() + beam,
-        vec.end(),
-        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
-          return l.first > r.first;
-        });
-    for (size_t j = 0; j < beam; j++) {
-      t[i * beam + j] = vec[j].first;
-      s[i * beam + j] = vec[j].second;
-    }
-  }
-}
-
-void CpuMatrix::colMax(Matrix& max) {
-  CHECK_EQ(max.getWidth(), getWidth());
-  CHECK_EQ(max.getHeight(), (size_t)1);
-  max.maxCols(*this);
-}
-
-void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
-  CHECK(isContiguous());
-  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getWidth();
-  size_t beam = maxVal.getHeight();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getWidth(), numSamples);
-
-  real* a = getData();
-  int* s = maxIds.getData();
-  real* t = maxVal.getData();
-  size_t dim = getHeight();
-  for (size_t i = 0; i < numSamples; i++) {
-    std::vector<std::pair<real, size_t>> vec;
-    for (size_t j = 0; j < dim; j++) {
-      vec.push_back(std::pair<real, size_t>(a[i + j * numSamples], j));
-    }
-
-    std::partial_sort(
-        vec.begin(),
-        vec.begin() + beam,
-        vec.end(),
-        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
-          return l.first > r.first;
-        });
-    for (size_t j = 0; j < beam; j++) {
-      t[i + j * numSamples] = vec[j].first;
-      s[i + j * numSamples] = vec[j].second;
-    }
-  }
-}
-
-void CpuMatrix::maxoutForward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-  CHECK(dynamic_cast<CpuMatrix*>(&a));
-  CHECK(dynamic_cast<CpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = getWidth();
-  size_t batchSize = getHeight();
-  size_t featLen = size / channels;
-  const real* input = a.getData();
-  int* idForCpu = id.getData();
-
-  MatrixPtr maxInMat, maxOutMat;
-  Matrix::resizeOrCreate(maxInMat, groups, size, false, false);
-  Matrix::resizeOrCreate(maxOutMat, 1, size, false, false);
-
-  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
-    size_t newIndex = batch_idx * size;
-    IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false);
-
-    for (size_t i = 0; i < channels; ++i) {
-      size_t newFeatLen = i * featLen;
-      for (size_t j = 0; j < groups; ++j) {
-        maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen)
-            ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen,
-                       featLen);
-      }
-    }
-    maxInMat->colMax(*tmpId, *maxOutMat);
-    this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat);
-  }
-}
-
-void CpuMatrix::maxoutBackward(Matrix& a,
-                               IVector& id,
-                               size_t channels,
-                               size_t groups) {
-  CHECK(dynamic_cast<CpuMatrix*>(&a));
-  CHECK(dynamic_cast<CpuIVector*>(&id));
-  CHECK_EQ(a.getHeight(), getHeight());
-
-  size_t size = a.getWidth();
-  size_t batchSize = getHeight();
-  size_t featLen = size / channels;
-  size_t newFeatLen = groups * featLen;
-  real* inputG = getData();
-  const real* outG = a.getData();
-  int* idForCpu = id.getData();
-
-  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
-    size_t newIndex = batch_idx * size;
-    int* idData = idForCpu + newIndex;
-
-    for (size_t i = 0; i < size; ++i) {
-      int gradIdx =
-          idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen;
-      (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i];
-    }
-  }
-}
-
-void CpuMatrix::rowNormalizeL1(Matrix& out) {
-  CHECK(!out.useGpu());
-
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(out.getHeight(), numSamples);
-  CHECK_EQ(out.getWidth(), dim);
-  real* a = getData();
-  real* b = out.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    real s = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      s += a[i * dim + j];
-    }
-    // Right now, we just bet that sum won't be zero. If this really happens,
-    // we will figure out what should be done then.
-    CHECK_GT(s, 0);
-    s = 1 / s;
-    for (size_t j = 0; j < dim; ++j) {
-      b[i * dim + j] = s * a[i * dim + j];
-    }
-  }
-}
-
-/* calulate classification error */
-void CpuMatrix::classificationError(Matrix& output,
-                                    IVector& label,
-                                    size_t topkSize) {
-  size_t numSamples = this->getHeight();
-  auto cpuOutput = dynamic_cast<CpuMatrix*>(&output);
-  auto cpuLabel = dynamic_cast<CpuIVector*>(&label);
-  IVectorPtr cpuTopIds = std::make_shared<CpuIVector>(numSamples * topkSize);
-  MatrixPtr cpuTopVal = std::make_shared<CpuMatrix>(numSamples, topkSize);
-
-  CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer";
-  CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed";
-  CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal";
-  CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1)
-      << "Matrix dimensions are not equal";
-
-  // top k matrix classification
-  cpuOutput->rowMax(*cpuTopIds, *cpuTopVal);
-
-  size_t dim = cpuOutput->getWidth();
-  real* result = this->getData();
-  int* ids = cpuTopIds->getData();
-  int* lbl = cpuLabel->getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    CHECK_GE(lbl[i], 0);
-    CHECK_LT((size_t)lbl[i], dim);
-
-    for (size_t j = 0; j < topkSize; ++j) {
-      if (ids[j + i * topkSize] == lbl[i]) {
-        result[i] = 0;
-        break;
-      }
-      result[i] = 1.0f;
-    }
-  }
-}
-
-/* copy -log(output[label]) to this->data[i] */
-void CpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(getWidth(), (size_t)1);
-
-  real* out = output.getData();
-  real* cost = getData();
-  int* lbl = label.getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    CHECK_GE(lbl[i], 0);
-    CHECK_LT((size_t)lbl[i], dim);
-    cost[i] = -std::log(out[lbl[i]]);
-  }
-}
-
-/* calculate the error of outputV according to label */
-void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getWidth(), dim);
-  real* out = output.getData();
-  real* grad = getData();
-  int* lbl = label.getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    grad[lbl[i]] -= 1 / out[lbl[i]];
-  }
-}
-
-/*
-    We implement the matrix functionality in CostLayer.cpp,
-    but we define the scalar function here for sanity check
-    deletion of the function does not affect anything neverthelss
-*/
-void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                               IVector& label,
-                                               real alpha) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(getWidth(), (size_t)1);
-
-  real* out = output.getData();
-  real* cost = getData();
-  int* lbl = label.getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    CHECK_GE(lbl[i], 0);
-    CHECK_LT((size_t)lbl[i], dim);
-    real sum = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      sum += out[j];
-    }
-    sum = _safelog(sum);
-    cost[i] = -_safelog(out[lbl[i]]) + sum + alpha * _square(sum);
-  }
-}
-
-/*
-    We implement the matrix functionality in CostLayer.cpp,
-    but we define the scalar function here for sanity check
-    deletion of the function does not affect anything neverthelss
-*/
-void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output,
-                                                 IVector& label,
-                                                 real alpha) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  CHECK(dynamic_cast<CpuIVector*>(&label));
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getWidth(), dim);
-  real* out = output.getData();
-  real* grad = getData();
-  int* lbl = label.getData();
-
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    grad[lbl[i]] -= 1 / out[lbl[i]];
-    real sum = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      sum += out[j];
-    }
-    for (size_t j = 0; j < dim; ++j) {
-      if (j == (size_t)lbl[i]) {
-        grad[j] += -1 / out[j];
-      }
-      grad[j] += 1 / sum + 2 * alpha * _safelog(sum) / sum;
-    }
-  }
-}
-
-#define FORWARD_LOOP()                      \
-  size_t numSamples = getHeight();          \
-  size_t dim = getWidth();                  \
-  CHECK_EQ(output.getHeight(), numSamples); \
-  CHECK_EQ(output.getWidth(), dim);         \
-  const real* in = getData();               \
-  real* out = output.getData();             \
-  for (size_t i = 0; i < numSamples; ++i, in += dim, out += dim)
-
-#define BACKWARD_LOOP()                     \
-  size_t numSamples = getHeight();          \
-  size_t dim = getWidth();                  \
-  CHECK_EQ(output.getHeight(), numSamples); \
-  CHECK_EQ(output.getWidth(), dim);         \
-  real* grad = getData();                   \
-  real* out = output.getData();             \
-  for (size_t i = 0; i < numSamples; ++i, grad += dim, out += dim)
-
-void CpuMatrix::softmax(Matrix& output) {
-  CHECK(!output.useGpu());
-
-  const float THRESHOLD = -64.0;
-
-  FORWARD_LOOP() {
-    real max = -1.0e20;
-    for (size_t j = 0; j < dim; ++j) {
-      if (in[j] > max) {
-        max = in[j];
-      }
-    }
-    for (size_t j = 0; j < dim; ++j) {
-      real a = in[j] - max;
-      if (a < THRESHOLD) {
-        a = THRESHOLD;
-      }
-      out[j] = a;
-    }
-    vExp(dim, out, out);
-
-    real sum = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      sum += out[j];
-    }
-    sum = 1 / sum;
-    for (size_t j = 0; j < dim; ++j) {
-      out[j] *= sum;
-    }
-  }
-}
-
-void CpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) {
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-  CHECK(isContiguous());
-
-  MatrixPtr inTmp = Matrix::create(nullptr,
-                                   /* height= */ 1,
-                                   1,
-                                   /* trans= */ false,
-                                   false);
-  MatrixPtr outTmp = Matrix::create(nullptr,
-                                    /* height= */ 1,
-                                    1,
-                                    /* trans= */ false,
-                                    false);
-  size_t numSequences = index.getSize() - 1;
-  auto starts = index.getData();
-  for (size_t i = 0; i < numSequences; ++i) {
-    size_t offset = starts[i];
-    size_t size = starts[i + 1] - starts[i];
-    inTmp->setData(getData() + offset, 1UL, size);
-    outTmp->setData(output.getData() + offset, 1UL, size);
-    inTmp->softmax(*outTmp);
-  }
-}
-
-void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-  CHECK(output.useGpu_ == false) << "Matrix type are not equal";
-  CHECK_EQ(getHeight(), sftmaxSum.getHeight());
-
-  real* sums = sftmaxSum.getData();
-
-  BACKWARD_LOOP() {
-    real sum = sums[i];
-    for (size_t j = 0; j < dim; ++j) {
-      grad[j] = out[j] * (grad[j] - sum);
-    }
-  }
-}
-
-void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), (size_t)1);
-  real* out = output.getData();
-  real* cost = getData();
-
-  auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label);
-  if (labelptr) {
-    // it is a CpuSparseMatrix
-    if (labelptr->getFormat() == SPARSE_CSR) {
-      // treat label as a SparseMatrix
-      for (size_t i = 0; i < numSamples; ++i) {
-        for (size_t j = 0; j < dim; ++j) {
-          cost[i] += _square(out[i * dim + j]);
-        }
-      }
-      if (labelptr->getValueType() == NO_VALUE) {
-        int* cols = labelptr->getCols();
-        for (size_t i = 0; i < numSamples; ++i) {
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]];
-            /*
-             * explanation of above line: original codes are follows:
-             * cost[i] -= _square(out[i * dim + feature.col]);
-             * cost[i] += _square(1.0 - out[i * dim + feature.col]);
-             */
-          }
-        }
-      } else if (labelptr->getValueType() == FLOAT_VALUE) {
-        int* cols = labelptr->getCols();
-        real* values = labelptr->getValue();
-        for (size_t i = 0; i < numSamples; ++i) {
-          real sum1 = 0;
-          real sum2 = 0;
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            sum1 += values[j] * values[j];
-            sum2 += values[j] * out[i * dim + cols[j]];
-            /*
-             * explanation of above line: original codes are follows:
-             * cost[i] -= _square(out[i * dim + feature.col]);
-             * cost[i] += _square(value.col - out[i * dim + feature.col]);
-             */
-          }
-          cost[i] += sum1 - 2.0 * sum2;
-        }
-      } else {
-        LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares";
-        return;
-      }
-      return;
-    } else {
-      LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares";
-      return;
-    }
-  }
-
-  BaseMatrix::sumOfSquaredDiffs(output,
-                                label,
-                                /* scaleSum= */ 1,
-                                /* scaleDest= */ 1);
-}
-
-/* calculate the error of outputV according to label */
-void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getWidth(), dim);
-  CHECK_EQ(label.getWidth(), dim);
-
-  real* out = output.getData();
-  real* grad = getData();
-
-  auto labelptr = dynamic_cast<CpuSparseMatrix*>(&label);
-  if (labelptr) {
-    // it is a CpuSparseMatrix
-    if (labelptr->getFormat() == SPARSE_CSR) {
-      // treat label as a SparseMatrix
-      for (size_t i = 0; i < numSamples; ++i) {
-        for (size_t j = 0; j < dim; ++j) {
-          grad[i * dim + j] += 2.0 * out[i * dim + j];
-        }
-      }
-      if (labelptr->getValueType() == NO_VALUE) {
-        int* cols = labelptr->getCols();
-        for (size_t i = 0; i < numSamples; ++i) {
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            grad[i * dim + cols[j]] -= 2.0;
-            /*
-             * explanation of above line: original codes are follows:
-             * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col];
-             * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col]
-             * - 1);
-             */
-          }
-        }
-      } else if (labelptr->getValueType() == FLOAT_VALUE) {
-        int* cols = labelptr->getCols();
-        real* values = labelptr->getValue();
-        for (size_t i = 0; i < numSamples; ++i) {
-          for (size_t j = labelptr->getRowStartIdx(i);
-               j < labelptr->getRowStartIdx(i + 1);
-               ++j) {
-            grad[i * dim + cols[j]] -= 2.0 * values[j];
-            /*
-             * explanation of above line: original codes are follows:
-             * grad[i * dim + feature.col] -= 2.0 * out[i * dim + feature.col];
-             * grad[i * dim + feature.col] += 2.0 * (out[i * dim + feature.col]
-             * - value.col);
-             */
-          }
-        }
-      } else {
-        LOG(FATAL) << "unsupported sparse matrix value type in sumOfSquares";
-        return;
-      }
-      return;
-    } else {
-      LOG(FATAL) << "unsupported sparse matrix format in sumOfSquares";
-      return;
-    }
-  }
-
-  real* lbl = label.getData();
-  size_t ld = getStride();
-  size_t outLd = output.getStride();
-  size_t lblLd = label.getStride();
-  CHECK(lbl);
-  for (size_t i = 0; i < numSamples;
-       ++i, out += outLd, lbl += lblLd, grad += ld) {
-    for (size_t j = 0; j < dim; ++j) {
-      grad[j] += 2.0 * (out[j] - lbl[j]);  // positive gradient;
-    }
-  }
-}
-
-void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), (size_t)1);
-
-  real* cost = getData();
-  real* out = output.getData();
-  real* lbl = label.getData();
-
-  for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      real absVal = std::fabs(out[j] - lbl[j]);
-      cost[i] *= destScale;
-      if (absVal < 1.0)
-        cost[i] += 0.5 * absVal * absVal;
-      else
-        cost[i] += absVal - 0.5;
-    }
-  }
-}
-
-void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) {
-  CHECK(output.useGpu_ == false && label.useGpu_ == false)
-      << "Matrix type are not equal";
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(label.getHeight(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(label.getWidth(), dim);
-  CHECK_EQ(getWidth(), dim);
-
-  real* out = output.getData();
-  real* lbl = label.getData();
-  real* grad = getData();
-
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      real val = out[j] - lbl[j];
-      grad[j] *= destScale;
-      if (std::fabs(val) < 1) {
-        grad[j] += val;
-      } else {
-        grad[j] += (real(0) < val) - (val < real(0));
-      }
-    }
-  }
-}
-
-void CpuMatrix::tanh(Matrix& output) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), dim);
-  vTanh(numSamples * dim, getData(), output.getData());
-}
-
-void CpuMatrix::tanhDerivative(Matrix& output) {
-  BaseMatrix::tanhDerivative(output);
-}
-
-void CpuMatrix::softrelu(Matrix& output) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  const real THRESHOLD = 40.0;
-  FORWARD_LOOP() {  // TODO(yuyang18): SIMD it?
-    for (size_t j = 0; j < dim; ++j) {
-      real x = in[j];
-      if (x > THRESHOLD) {
-        x = THRESHOLD;
-      } else if (x < -THRESHOLD) {
-        x = -THRESHOLD;
-      }
-      out[j] = x;
-    }
-  }
-  vExp(numSamples * dim, output.getData(), output.getData());
-  vLog1p(numSamples * dim, output.getData(), output.getData());
-}
-
-void CpuMatrix::softreluDerivative(Matrix& output) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  size_t size = numSamples * dim;
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), dim);
-  real* grad = getData();
-  MatrixPtr tmpMat = Matrix::create(numSamples, dim);
-  real* tmp = tmpMat->getData();
-
-  vExp(size, output.getData(), tmpMat->getData());
-
-  for (size_t i = 0; i < size; ++i) {
-    grad[i] *= (1.0 - 1.0 / tmp[i]);
-  }
-}
-
-void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
-  CHECK(isContiguous());
-  CHECK(output.isContiguous());
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(output.getWidth(), dim);
-
-  const real* in = getData();
-  real* out = output.getData();
-
-  // out = p2*in
-  for (size_t i = 0; i < numSamples * dim; ++i) {
-    out[i] = p2 * in[i];
-  }
-
-  vTanh(numSamples * dim, out, out);
-
-  // out = p1 * out
-  for (size_t i = 0; i < numSamples * dim; ++i) {
-    out[i] = p1 * out[i];
-  }
-}
-
-/* uniform randomization, minimize precision = 1e-5 */
-void CpuMatrix::randomizeUniform() {
-  CHECK(isContiguous());
-  real* data = getData();
-  unsigned int* randSeed = ThreadLocalRand::getSeed();
-  real recipRandMax = 1.0f / (real)RAND_MAX;
-  for (size_t i = 0; i < elementCnt_; ++i) {
-    *data++ = rand_r(randSeed) * recipRandMax;
-  }
-}
-
-void CpuMatrix::print(std::ostream& os) const {
-  CHECK(isContiguous());
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      os << data_[i * width_ + j] << " ";
-    }
-    os << std::endl;
-  }
-}
-
-void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
-  real* input = data.getData();
-  real* w = W.getData();
-  real* output = data_;
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-
-  size_t partial_sum = numElements / paraSize;
-  if (paraSize == numElements) {
-    for (size_t n = 0; n < numSamples * numElements; ++n) {
-      output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements];
-    }
-    return;
-  }
-
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
-  for (size_t n = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < paraSize; i++) {
-      neon::prelu(
-          input + i * partial_sum, w[i], output + i * partial_sum, partial_sum);
-    }
-    input = input + numElements;
-    output = output + numElements;
-  }
-#else
-  for (size_t n = 0, k = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < numElements; ++i, ++k) {
-      output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
-    }
-  }
-#endif
-}
-
-void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-  real* ograd = oGrad.getData();
-  real* input = data.getData();
-  real* wgrad = data_;
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = this->getHeight() * this->getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  for (size_t n = 0, k = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < numElements; ++i, ++k) {
-      wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]);
-    }
-  }
-}
-
-void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-  real* diff = data_;
-  real* input = data.getData();
-  real* ograd = oGrad.getData();
-  real* w = W.getData();
-  size_t numElements = data.getWidth();
-  size_t numSamples = data.getHeight();
-  size_t paraSize = W.getHeight() * W.getWidth();
-  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
-  size_t partial_sum = numElements / paraSize;
-  for (size_t n = 0, k = 0; n < numSamples; ++n) {
-    for (size_t i = 0; i < numElements; ++i, ++k) {
-      diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]);
-    }
-  }
-}
-
-void CpuMatrix::print(std::ostream& os, size_t height, size_t width) const {
-  CHECK(isContiguous());
-  size_t h = height_ < height ? height_ : height;
-  size_t w = width_ < width ? width_ : width;
-  os.setf(std::ostream::scientific);
-  os << "[";
-  for (size_t i = 0; i < h; ++i) {
-    for (size_t j = 0; j < w; ++j) {
-      os << data_[i * width_ + j] << " ";
-    }
-    if (i == h - 1) {
-      os << "]";
-    }
-    os << std::endl;
-  }
-}
-
-void CpuMatrix::printOneRow(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, height_);
-  size_t offset = idx * stride_;
-  os << data_[offset];
-  for (size_t i = 1; i < width_; ++i) {
-    os << " " << data_[offset + i];
-  }
-  os << ";";
-}
-
-void CpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
-  CHECK(isContiguous());
-  CHECK(height_ == refMat.getHeight());
-  CHECK(width_ == refMat.getWidth());
-  CpuMatrix cpuRef(height_, width_);
-  cpuRef.copyFrom(refMat);
-  size_t diffCnt = 0;
-  for (size_t i = 0; i < height_; ++i) {
-    for (size_t j = 0; j < width_; ++j) {
-      real a = getElement(i, j);
-      real b = cpuRef.getElement(i, j);
-      if (fabs(a - b) > 0.00001) {
-        ++diffCnt;
-        if (printDiff) {
-          os << "ref= " << a << "  check= " << b << std::endl;
-        }
-      }
-    }
-  }
-  LOG(INFO) << "the  diffCnt is " << diffCnt;
-}
-
-real CpuMatrix::getMin() {
-  size_t size = getHeight() * getWidth();
-  real* data = getData();
-  real res = data[0];
-  for (size_t i = 1; i < size; ++i) {
-    if (res > data[i]) {
-      res = data[i];
-    }
-  }
-  return res;
-}
-
-real CpuMatrix::getMax() {
-  size_t size = getHeight() * getWidth();
-  real* data = getData();
-  real res = data[0];
-  for (size_t i = 1; i < size; ++i) {
-    if (res < data[i]) {
-      res = data[i];
-    }
-  }
-  return res;
-}
-
-void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) {
-  size_t height = this->getHeight();
-  size_t width0 = this->getWidth();
-  size_t width1 = in1.getWidth();
-
-  CHECK_EQ(height, in0.getHeight());
-  CHECK_EQ(width0, in0.getWidth());
-  CHECK_EQ(height, in1.getHeight());
-
-  CHECK_EQ(width1 % 2, 1U);
-
-  real* outV = this->getData();
-  real* inV0 = in0.getData();
-  real* inV1 = in1.getData();
-
-  int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < height;
-       ++x, outV += width0, inV0 += width0, inV1 += width1) {
-    for (size_t i = 0; i < width0; ++i) {  // each dimension of output
-      for (size_t j = 0; j < width1; ++j) {
-        // iterate over all dimentions of inV1
-        int index = i + j - leftCtxLen;
-        index = (index + width0) % width0;
-        outV[i] += inV0[index] * inV1[j];
-      }
-    }
-  }
-}
-
-void CpuMatrix::circularConvDerivative(
-    Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) {
-  size_t height = in0.getHeight();
-  size_t width0 = in0.getWidth();
-  size_t width1 = in1.getWidth();
-
-  CHECK_EQ(height, in1.getHeight());
-  CHECK_EQ(height, inG0.getHeight());
-  CHECK_EQ(width0, inG0.getWidth());
-  CHECK_EQ(height, inG1.getHeight());
-  CHECK_EQ(width1, inG1.getWidth());
-  CHECK_EQ(height, outG.getHeight());
-  CHECK_EQ(width0, outG.getWidth());
-
-  real* outGV = outG.getData();
-  real* inV0 = in0.getData();
-  real* inV1 = in1.getData();
-  real* inGV0 = inG0.getData();
-  real* inGV1 = inG1.getData();
-
-  int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < height; ++x,
-              outGV += width0,
-              inV0 += width0,
-              inV1 += width1,
-              inGV0 += width0,
-              inGV1 += width1) {
-    for (size_t j = 0; j < width1; ++j) {  // iterate over width1
-      for (size_t i = 0; i < width0; ++i) {
-        // such over all dimensions of outG
-        int index = i + j - leftCtxLen;
-        index = (index + width0) % width0;
-        inGV0[index] += outGV[i] * inV1[j];
-        inGV1[j] += outGV[i] * inV0[index];
-      }
-    }
-  }
-}
-
-void CpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
-  CHECK(labelPtr);
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(numSamples, output.getHeight());
-  CHECK_EQ(numSamples, labelPtr->getHeight());
-  CHECK_EQ(dim, labelPtr->getWidth());
-
-  real* out = output.getData();
-  real* cost = getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      CHECK(out[j] > 0 && out[j] < 1.0);
-      cost[i] -= std::log(1 - out[j]);
-    }
-
-    const int* cols = labelPtr->getRowCols(i);
-    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      cost[i] -= std::log(out[cols[j]] / (1 - out[cols[j]]));
-    }
-  }
-}
-
-void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
-  CHECK(labelPtr);
-
-  size_t numSamples = getHeight();
-  size_t dim = getWidth();
-  CHECK_EQ(numSamples, output.getHeight());
-  CHECK_EQ(numSamples, labelPtr->getHeight());
-  CHECK_EQ(dim, output.getWidth());
-  CHECK_EQ(dim, labelPtr->getWidth());
-
-  real* out = output.getData();
-  real* grad = getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    for (size_t j = 0; j < dim; ++j) {
-      CHECK(out[j] > 0 && out[j] < 1.0);
-      grad[j] += 1.0 / (1 - out[j]);
-    }
-
-    const int* cols = labelPtr->getRowCols(i);
-    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      grad[cols[j]] -= 1.0 / (out[cols[j]] * (1 - out[cols[j]]));
-    }
-  }
-}
-
-/* calculate the classification error for multi binary label */
-void CpuMatrix::classificationErrorMulti(Matrix& output,
-                                         Matrix& label,
-                                         real threshold) {
-  CHECK(dynamic_cast<CpuMatrix*>(&output));
-  auto labelPtr = dynamic_cast<CpuSparseMatrix*>(&label);
-  CHECK(labelPtr);
-
-  size_t numSamples = getHeight();
-  size_t dim = output.getWidth();
-  CHECK_EQ(numSamples, output.getHeight());
-  CHECK_EQ(numSamples, labelPtr->getHeight());
-  CHECK_EQ(dim, labelPtr->getWidth());
-
-  real* out = output.getData();
-  real* result = getData();
-  for (size_t i = 0; i < numSamples; ++i, out += dim) {
-    real sum = 0.0;
-    for (size_t j = 0; j < dim; ++j) {
-      if (out[j] >= threshold) {
-        sum += 1.0;
-      }
-    }
-
-    const int* cols = labelPtr->getRowCols(i);
-    for (size_t j = 0; j < labelPtr->getColNum(i); ++j) {
-      CHECK_LT(size_t(cols[j]), dim);
-      if (out[cols[j]] < threshold) {
-        sum += 1.0;
-      } else {
-        sum -= 1.0;
-      }
-    }
-    result[i] = sum / dim;
-  }
-}
-
-void CpuMatrix::bilinearForward(const Matrix& in,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-  CHECK(dynamic_cast<const CpuMatrix*>(&in));
-
-  size_t outputW = getWidth();
-  size_t batchSize = getHeight();
-  size_t inputW = in.getWidth();
-  size_t inputH = in.getHeight();
-  size_t inPosOffset = inImgH * inImgW;
-  size_t outPosOffset = outImgH * outImgW;
-  (void)(inputH);
-
-  real* outData = getData();
-  const real* inData = in.getData();
-
-  if (inImgH == outImgH && inImgW == outImgW) {
-    this->copyFrom(in);
-  } else {
-    for (size_t k = 0; k < batchSize; ++k) {  // loop for batches
-      for (size_t i = 0; i < outImgH; ++i) {  // loop for images
-        size_t h = ratioH * i;
-        size_t hid = (h < inImgH - 1) ? 1 : 0;
-        real h1lambda = ratioH * i - h;
-        real h2lambda = 1 - h1lambda;
-
-        for (size_t j = 0; j < outImgW; ++j) {
-          size_t w = ratioW * j;
-          size_t wid = (w < inImgW - 1) ? 1 : 0;
-          real w1lambda = ratioW * j - w;
-          real w2lambda = 1 - w1lambda;
-          // calculate four position for bilinear interpolation
-          const real* inPos = &inData[k * inputW + h * inImgW + w];
-          real* outPos = &outData[k * outputW + i * outImgW + j];
-          for (size_t c = 0; c < numChannels; ++c) {  // loop for channels
-            // bilinear interpolation
-            outPos[0] =
-                h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) +
-                h1lambda * (w2lambda * inPos[hid * inImgW] +
-                            w1lambda * inPos[hid * inImgW + wid]);
-            inPos += inPosOffset;
-            outPos += outPosOffset;
-          }
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::bilinearBackward(const Matrix& out,
-                                 const size_t outImgH,
-                                 const size_t outImgW,
-                                 const size_t inImgH,
-                                 const size_t inImgW,
-                                 const size_t numChannels,
-                                 const real ratioH,
-                                 const real ratioW) {
-  CHECK(dynamic_cast<const CpuMatrix*>(&out));
-
-  size_t inputW = getWidth();
-  size_t inputH = getHeight();
-  size_t outputW = out.getWidth();
-  size_t batchSize = out.getHeight();
-  size_t inPosOffset = inImgH * inImgW;
-  size_t outPosOffset = outImgH * outImgW;
-  (void)(inputH);
-
-  real* inGrad = getData();
-  const real* outGrad = out.getData();
-
-  if (inImgH == outImgH && inImgW == outImgW) {
-    this->add(const_cast<Matrix&>(out));
-  } else {
-    for (size_t k = 0; k < batchSize; ++k) {  // loop for batches
-      for (size_t i = 0; i < outImgH; ++i) {  // loop for images
-        size_t h = ratioH * i;
-        size_t hid = (h < inImgH - 1) ? 1 : 0;
-        real h1lambda = ratioH * i - h;
-        real h2lambda = 1 - h1lambda;
-        for (size_t j = 0; j < outImgW; ++j) {
-          size_t w = ratioW * j;
-          size_t wid = (w < inImgW - 1) ? 1 : 0;
-          real w1lambda = ratioW * j - w;
-          real w2lambda = 1 - w1lambda;
-
-          real* inPos = &inGrad[k * inputW + h * inImgW + w];
-          const real* outPos = &outGrad[k * outputW + i * outImgW + j];
-          for (size_t c = 0; c < numChannels; ++c) {  // loop for channels
-            inPos[0] += h2lambda * w2lambda * outPos[0];
-            inPos[wid] += h2lambda * w1lambda * outPos[0];
-            inPos[hid * inImgW] += h1lambda * w2lambda * outPos[0];
-            inPos[hid * inImgW + wid] += h1lambda * w1lambda * outPos[0];
-            inPos += inPosOffset;
-            outPos += outPosOffset;
-          }
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::vol2Col(real* data,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW) {
-  real* outData = getData();
-  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
-  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
-  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
-
-  int channelsCol = channels * filterD * filterH * filterW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % filterW;
-    int hOffset = (c / filterW) % filterH;
-    int dOffset = (c / filterW / filterH) % filterD;
-    int cIn = c / filterW / filterH / filterD;
-    for (int d = 0; d < outDepth; ++d) {
-      for (int h = 0; h < outHeight; ++h) {
-        for (int w = 0; w < outWidth; ++w) {
-          int dPad = d * strideD - paddingD + dOffset;
-          int hPad = h * strideH - paddingH + hOffset;
-          int wPad = w * strideW - paddingW + wOffset;
-
-          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
-              dPad >= 0 && dPad < depth)
-            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] =
-                data[((cIn * depth + dPad) * height + hPad) * width + wPad];
-          else
-            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0;
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::col2Vol(real* trg,
-                        int channels,
-                        int depth,
-                        int height,
-                        int width,
-                        int filterD,
-                        int filterH,
-                        int filterW,
-                        int strideD,
-                        int strideH,
-                        int strideW,
-                        int paddingD,
-                        int paddingH,
-                        int paddingW,
-                        real alpha,
-                        real beta) {
-  real* src = getData();
-  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
-  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
-  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
-  int channelsCol = channels * filterD * filterH * filterW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % filterW;
-    int hOffset = (c / filterW) % filterH;
-    int dOffset = (c / filterW / filterH) % filterD;
-    int cIm = c / filterW / filterH / filterD;
-    for (int d = 0; d < outDepth; ++d) {
-      for (int h = 0; h < outHeight; ++h) {
-        for (int w = 0; w < outWidth; ++w) {
-          int dPad = d * strideD - paddingD + dOffset;
-          int hPad = h * strideH - paddingH + hOffset;
-          int wPad = w * strideW - paddingW + wOffset;
-          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
-              dPad >= 0 && dPad < depth)
-            trg[((cIm * depth + dPad) * height + hPad) * width + wPad] =
-                alpha *
-                    src[((c * outDepth + d) * outHeight + h) * outWidth + w] +
-                beta *
-                    trg[((cIm * depth + dPad) * height + hPad) * width + wPad];
-        }
-      }
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////
-//               functions executed via cpu                   //
-////////////////////////////////////////////////////////////////
-
-void GpuMatrix::selectElements(Matrix& table, IVector& ids) {
-  execViaCpu2(&CpuMatrix::selectElements, *this, table, ids);
-}
-}  // namespace paddle
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
deleted file mode 100644
index 631e69edc1b0f5c4ef4a115d4bd5ea29fc418018..0000000000000000000000000000000000000000
--- a/paddle/math/Matrix.h
+++ /dev/null
@@ -1,2137 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <memory>
-#include <thread>
-
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/ThreadLocal.h"
-
-#include <hl_gpu.h>
-
-#include "BaseMatrix.h"
-#include "MemoryHandle.h"
-#include "Vector.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-/// TODO(tianbing), move to paddle/function/TensorType.h
-enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
-
-/**
- * @brief  matrix sparse_format .
- *
- * nnz represents nonzero number in sparse matrix.
- *
- * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element
- * represents row start index in Matrix. length of col and value are nnz.
- *
- * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element
- * represents col start index in Matrix. length of col and value are nnz.
- *
- * @code
- * for example: [0, 1, 0, 2, 0;
- *               1, 0, 0, 0, 0;
- *               0, 0, 0, 2, 5];
- * SPARSE_CSR row   [0, 2, 3, 5];
- *            col   [1, 3, 0, 3, 4];
- *            value [1, 2, 1, 2, 5]
- * SPARSE_CSC col   [0, 1, 2, 2, 4, 5];
- *            row   [1, 0, 0, 2, 2];
- *            value [1, 1, 2, 2, 5]
- * @endcode
- */
-/// TODO(tianbing), move to paddle/function/TensorType.h
-enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
-
-class Matrix;
-class GpuMatrix;
-class CpuMatrix;
-class CpuSparseMatrix;
-class GpuSparseMatrix;
-typedef std::shared_ptr<Matrix> MatrixPtr;
-typedef std::shared_ptr<GpuMatrix> GpuMatrixPtr;
-typedef std::shared_ptr<CpuMatrix> CpuMatrixPtr;
-typedef std::shared_ptr<GpuSparseMatrix> GpuSparseMatrixPtr;
-typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr;
-
-/**
- * Copy or assignemnt constructor will share the data as opposed to making a
- * copy of the original data. To make a copy of the orinal data, use copyFrom()
- * instead.
- */
-class Matrix : public BaseMatrix {
-protected:
-  Matrix(MemoryHandlePtr memHandle,
-         size_t height,
-         size_t width,
-         bool trans,
-         bool use_gpu);
-
-  Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu);
-
-  Matrix(real* data,
-         size_t height,
-         size_t width,
-         size_t stride,
-         bool trans,
-         bool use_gpu);
-
-  static ThreadLocal<MatrixPtr> tmpMat_;
-
-public:
-  size_t elementCnt_;  // maximal number of elements which can be held in data_
-  MemoryHandlePtr memoryHandle_;
-
-public:
-  virtual ~Matrix() {}
-
-  static MatrixPtr create(MemoryHandlePtr memHandle,
-                          size_t height,
-                          size_t width,
-                          bool trans = false);
-  static MatrixPtr create(size_t height,
-                          size_t width,
-                          bool trans = false,
-                          bool useGpu = false);
-  static MatrixPtr create(real* data,
-                          size_t height,
-                          size_t width,
-                          bool trans = false,
-                          bool useGpu = false);
-  static MatrixPtr create(real* data,
-                          size_t height,
-                          size_t width,
-                          size_t stride,
-                          bool trans = false,
-                          bool useGpu = false);
-
-  static MatrixPtr createSparseMatrix(size_t height,
-                                      size_t width,
-                                      size_t nnz,
-                                      SparseValueType valueType = FLOAT_VALUE,
-                                      bool trans = false,
-                                      bool useGpu = false);
-  static MatrixPtr createSparseMatrix(size_t height,
-                                      size_t width,
-                                      size_t nnz,
-                                      SparseValueType valueType = FLOAT_VALUE,
-                                      SparseFormat foramt = SPARSE_CSR,
-                                      bool trans = false,
-                                      bool useGpu = false);
-
-  static MatrixPtr createSparseMatrix(real* data,
-                                      int* row,
-                                      int* col,
-                                      size_t height,
-                                      size_t width,
-                                      size_t nnz, /* used to allocate space */
-                                      SparseValueType valueType, /*value type*/
-                                      SparseFormat format,
-                                      bool trans,
-                                      bool useGpu);
-
-  static void resizeOrCreateSparseMatrix(
-      MatrixPtr& matrix,
-      size_t height,
-      size_t width,
-      size_t nnz,
-      SparseValueType valueType = FLOAT_VALUE,
-      SparseFormat foramt = SPARSE_CSR,
-      bool trans = false,
-      bool useGpu = false);
-
-  static void resizeOrCreate(MatrixPtr& a,
-                             size_t height,
-                             size_t width,
-                             bool trans = false,
-                             bool useGpu = false);
-
-  /**
-   * @brief  set the data buffer used to hold the matrix data.
-   *
-   * caller should make sure that the size of data is at least
-   * sizeof(real)*height*width.
-   */
-  void setData(real* data) {
-    BaseMatrix::setData(data);
-    memoryHandle_.reset();
-  }
-
-  /// the data should be contiguous
-  void setData(real* data, size_t newHeight, size_t newWidth) {
-    setData(data);
-    height_ = newHeight;
-    width_ = newWidth;
-    elementCnt_ = newHeight * newWidth;
-    stride_ = width_;
-  }
-
-  size_t getWidth() const { return width_; }
-  size_t getHeight() const { return height_; }
-  size_t getStride() const { return stride_; }
-  size_t getElementCnt() const { return elementCnt_; }
-  virtual real* getData() { return data_; }
-  virtual const real* getData() const { return data_; }
-  bool isTransposed() const { return trans_; }
-  bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-
-  // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix
-  // befor call the following functions.
-  // Declare these functions in the base class just easy to call them.
-  // And these declarations should be moved to base class of sparse matrix
-  // if refactor sparse matrix
-  virtual int* getRows() const {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;  //! suppress warning for no return value.
-  }
-
-  virtual int* getCols() const {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;  //! suppress warning for no return value.
-  }
-
-  virtual SparseFormat getFormat() const {
-    LOG(FATAL) << "Not implemented";
-    return SPARSE_CSR;  //! suppress warning for no return value.
-  }
-
-  virtual SparseValueType getValueType() const {
-    LOG(FATAL) << "Not implemented";
-    return NO_VALUE;  //! suppress warning for no return value.
-  }
-
-  /**
-   * @brief matrix elment-wise add
-   *
-   * Named add3 just because add/add2 has been used in BaseMatrix.cu
-   * and they are not virtual function.
-   */
-  virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; }
-
-  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
-
-  virtual void zeroMem() { LOG(FATAL) << "Not implemented"; }
-
-  virtual void resetOne() { LOG(FATAL) << "Not implemented"; }
-
-  void setDiag(real value);
-
-  virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void trimFrom(const CpuSparseMatrix& src) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  // For GpuMatrix this is an asynchronous copy interface
-  // For CpuMatrix this is an synchronous copy interface
-  virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  MatrixPtr subMatrix(size_t startRow,
-                      size_t endRow,
-                      size_t startCol,
-                      size_t endCol);
-
-  MatrixPtr subRowMatrix(size_t startRow, size_t endRow) {
-    return subMatrix(startRow, endRow, 0, getWidth());
-  }
-
-  MatrixPtr subColMatrix(size_t startCol, size_t endCol) {
-    return subMatrix(0, getHeight(), startCol, endCol);
-  }
-
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) {
-    CHECK_LE(startRow + numRows, getHeight());
-    return Matrix::create(getData() + startRow * getWidth(),
-                          numRows,
-                          getWidth(),
-                          trans_,
-                          useGpu_);
-  }
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) {
-    CHECK_LE(startRow + numRows, getHeight());
-    CHECK_EQ(useGpu_, dest->useGpu_);
-    dest->setData(this->rowBuf(startRow), numRows, getWidth());
-    return dest;
-  }
-
-  /**
-   * If this is GpuMatrix, src is assumed to be CPU memory
-   *
-   * If this is CpuMatrix, src is assumed to be CPU memory
-   */
-  virtual void copyFrom(const real* src, size_t size) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void copyFrom(const real* src, const int64_t* seq) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief convert a int vector to a real matrix.
-   *
-   * (1) source and dest are both in CPU.
-   *
-   * (2) sizes are exactly match.
-   */
-  virtual void copyFrom(const IVector& src) {
-    LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
-  }
-
-  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix,
-   *        NonValueSparseMatrix, etc.) as this.
-   *
-   * If height and width is zero, the new matrix will have the same size
-   * as this, otherwise the new matrix will have the specified size.
-   *
-   */
-  virtual MatrixPtr clone(size_t height = 0,
-                          size_t width = 0,
-                          bool useGpu = false) {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  virtual real* getRowBuf(size_t row) {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  virtual real getElement(size_t x, size_t y) const {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual real getSum() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual void accumulateColSum(Matrix& src) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual real getAbsSum() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  /**
-   * @note Original data may not be preserved after resize().
-   */
-  virtual void resize(size_t newHeight, size_t newWidth) = 0;
-
-  /**
-   * @note This should only be used for sparse matrix.
-   */
-  virtual void resize(size_t newHeight,
-                      size_t newWidth,
-                      size_t newNnz, /* total item used to allocate space */
-                      SparseValueType valueType,
-                      SparseFormat format) = 0;
-
-  /**
-   * @brief This should only be used for sparse matrix.
-   *
-   * Currently must be called for each row in order.
-   * The matrix is not valid until setRow is called for the last row.
-   */
-  virtual void setRow(size_t row,
-                      size_t colNum,
-                      const unsigned int* cols,
-                      const real* values) = 0;
-
-  virtual MatrixPtr getTranspose() = 0;
-
-  /**
-   * @brief  hard transpose.
-   *
-   * allocate matTrans' memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void transpose(MatrixPtr& matTrans, bool memAlloc) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  rotate 90 degrees in clock-wise if clockWise=true;
-   *         otherwise rotate in anti clock-wise
-   * clock-wise:
-   * \f[
-   *   y(j,i) = x(M-i-1,j)
-   * \f]
-   * anti clock-wise:
-   * \f[
-   *   y(j,i) = x(i, N-1-j)
-   * \f]
-   * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
-   *
-   * allocate matRot' memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual MatrixPtr getInverse() {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  /**
-   * @brief  inverse.
-   *
-   * if allocate matInv's memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void inverse(MatrixPtr& matInv, bool memAlloc) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-public:
-  /// Only set all variables to 0 or NULL but not free them.
-  virtual void clear() {
-    height_ = 0;
-    width_ = 0;
-    data_ = NULL;
-  }
-
-  void reshape(size_t height, size_t width);
-
-  /// add b to each sample of this.
-  virtual void addBias(Matrix& b, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void addSharedBias(Matrix& b, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  void addBias(Matrix& b, real scale, bool sharedBias) {
-    if (!sharedBias) {
-      addBias(b, scale);
-    } else {
-      addSharedBias(b, scale);
-    }
-  }
-
-  /// add each sample from a to this.
-  virtual void collectBias(Matrix& a, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void collectSharedBias(Matrix& a, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  void collectBias(Matrix& a, real scale, bool sharedBias) {
-    if (!sharedBias) {
-      collectBias(a, scale);
-    } else {
-      collectSharedBias(a, scale);
-    }
-  }
-
-  virtual void sequenceAvgForward(Matrix& a,
-                                  const IVector& startsPos,
-                                  int mode) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void sequenceAvgBackward(Matrix& a,
-                                   const IVector& startsPos,
-                                   int mode) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = scaleAB*(a*b) + scaleT*this
-   * @endcode
-   */
-  virtual void mul(const Matrix& a,
-                   const Matrix& b,
-                   real scaleAB,
-                   real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// Add a vector (column) b to matrix a, column by column.
-  virtual void addColumnVector(const Matrix& b) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   this(i, j) += vec(index(i, j), 0)
-   * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1
-   * @endcode
-   */
-  virtual void addByBitCode(size_t numClasses,
-                            const IVector& codes,
-                            const Matrix& vec) {
-    (void)numClasses;
-    (void)codes;
-    (void)vec;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   vec(index(i, j), 0) += this(i, j)
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void addByBitCodeBackward(size_t numClasses,
-                                    const IVector& codes,
-                                    Matrix& vec) {
-    (void)numClasses;
-    (void)codes;
-    (void)vec;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   this(i, j) += <mat.row(index(i, j)), input.row(i)>
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCode(size_t numClasses,
-                            const IVector& codes,
-                            const Matrix& mat,
-                            const Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   mat.row(index(i, j)) += this(i, j) * input.row(i)
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCodeBackwardWeight(size_t numClasses,
-                                          const IVector& codes,
-                                          Matrix& mat,
-                                          const Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   input.row(i) += this(i, j) * mat.row(index(i, j))
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCodeBackwardError(size_t numClasses,
-                                         const IVector& codes,
-                                         const Matrix& mat,
-                                         Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength
-   *   sum(i, 0) = scaleSum * \sum_j  bit(i, j) * this(i, j)
-   * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0
-   * @endcode
-   */
-  virtual void sumByBitCode(size_t numClasses,
-                            IVector& codes,
-                            Matrix& sum,
-                            real scaleSum) {
-    (void)numClasses;
-    (void)codes;
-    (void)sum;
-    (void)scaleSum;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength
-   *  this(i, j) -= bit(i, j)
-   * where bit(i, j) is same as that for sumByBitCode
-   * @endcode
-   */
-  virtual void subByBitCode(size_t numClasses_, IVector& codes) {
-    (void)numClasses_;
-    (void)codes;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * add the sum of each row of this to mat
-   */
-  virtual void rowSum(Matrix& sum) {
-    (void)sum;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * set the max of each row of this to mat
-   */
-  virtual void rowMax(Matrix& max) {
-    (void)max;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * set the max of each column of this to mat
-   */
-  virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
-
-  /**
-   * @brief Get the top k elements of each column of this matrix.
-   *
-   * The row ids and values of these elements are stored in
-   * maxIds and max respectively. where k is the size of maxIds.
-   * And note that the top k elements are not sorted.
-   */
-  virtual void colMax(IVector& maxIds, Matrix& maxVal) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void maxoutForward(Matrix& a,
-                             IVector& id,
-                             size_t channels,
-                             size_t groups) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void maxoutBackward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief Get the top k elements of each row of this matrix.
-   *
-   * The column ids and values of these elements are stored in
-   * maxIds and max respectively. where k is the size of maxIds.
-   * And note that the top k elements are not sorted.
-   */
-  virtual void rowMax(IVector& maxIds, Matrix& max) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// normalize each row so that the sum of each row is 1.
-  virtual void rowNormalizeL1(Matrix& out) {
-    (void)out;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   *  this = a*b
-   * @endcode
-   */
-  virtual void mul(const Matrix& a, const Matrix& b) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = scaleAB*(this*b) +  scaleT*this
-   * @endcode
-   */
-  virtual void rightMul(Matrix& b, real scaleAB, real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = this* b
-   * @endcode
-   */
-  virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @code
-   * this = scaleAB*(a*this) +  scaleT*this
-   * @endcode
-   */
-  virtual void leftMul(Matrix& a, real scaleAB, real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = a*this)
-   * @endcode
-   */
-  virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; }
-
-  /// merge the element for each col.
-  virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; }
-
-  /// copy -log(output[label]) to this->data[i].
-  virtual void oneHotCrossEntropy(Matrix& output, IVector& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the error of outputV according to label.
-  virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// copy -log(output[label]) to this->data[i].
-  virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                              IVector& label,
-                                              real alpha) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the error of outputV according to label.
-  virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                                IVector& label,
-                                                real alpha) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * \f[
-   *  a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j}
-   * \f]
-   *
-   * b contains M elements,
-   * c contains N elements (N is odd),
-   * b's index arithmetic is computed modulo M,
-   * c's index arithmetic is computed modulo N.
-   */
-  virtual void circularConv(Matrix& b, Matrix& c) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void circularConvDerivative(Matrix& output,
-                                      Matrix& prevOut1,
-                                      Matrix& prevOut2,
-                                      Matrix& prevGrad1,
-                                      Matrix& prevGrad2) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */
-  virtual void softmax(Matrix& output) {
-    (void)output;
-    LOG(FATAL) << "Not implemeted";
-  }
-  virtual void sequenceSoftmax(Matrix& output, const IVector& index) {
-    (void)output;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void softmaxBackward(Matrix& outputV) {
-    (void)outputV;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /*
-    sum_i = sum_j this_ij * output_ij
-    this_ij = output_ij* (this_ij - sum_i)
-  */
-  virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the sum of squares diff cost.
-  virtual void sumOfSquares(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// gradient of sumOfSquares.
-  virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void smoothL1(Matrix& output, Matrix& label, real destScale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void tanhDerivative(Matrix& output) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void softreluDerivative(Matrix& output) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void scaledTanh(Matrix& output, real p1, real p2) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// print out the values of elements to os
-  virtual void print(std::ostream& os) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * print a part of the matrix
-   * from the (top,left) value to the (height, width) value (not included)
-   */
-  virtual void print(std::ostream& os, size_t height, size_t width) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// print one row to os
-  virtual void printOneRow(std::ostream& os, size_t idx) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {}
-
-  virtual real getMin() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-  virtual real getMax() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief  calulate the error of classification
-   *
-   * output[i] = 1 if row i is an error.
-   *
-   * output[i] = 0 if row i is correct.
-   *
-   */
-  virtual void classificationError(Matrix& output,
-                                   IVector& label,
-                                   size_t topkSize = 1) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * Pooling forward operation, pick out the largest element
-   * in the sizeX of value, if the maskMatP is not NULL, it will
-   * also caculate the location indices.
-   */
-  virtual void maxPoolForward(Matrix& inputMat,
-                              size_t imgSizeH,
-                              size_t imgSizeW,
-                              size_t channels,
-                              size_t sizeX,
-                              size_t sizeY,
-                              size_t strideH,
-                              size_t strideW,
-                              size_t outputH,
-                              size_t outputW,
-                              size_t paddingH,
-                              size_t paddingW,
-                              MatrixPtr maskMatP = NULL) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /// Pooling backward operation.
-  virtual void maxPoolBackward(Matrix& image,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               Matrix& outGrad,
-                               Matrix& outV,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               real scaleTargets,
-                               real scaleOutput,
-                               size_t paddingH,
-                               size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /// Pooling forward operation, caculate the average of sizeX elements.
-  virtual void avgPoolForward(Matrix& input,
-                              size_t imgSizeH,
-                              size_t imgSizeW,
-                              size_t channels,
-                              size_t sizeX,
-                              size_t sizeY,
-                              size_t strideH,
-                              size_t strideW,
-                              size_t outputH,
-                              size_t outputW,
-                              size_t paddingH,
-                              size_t paddingW,
-                              bool excludeMode = true) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPoolBackward(Matrix& input,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               real scaleTargets,
-                               real scaleOutput,
-                               size_t paddingH,
-                               size_t paddingW,
-                               bool excludeMode = true) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * Pooling 3D forward operation, pick out the largest element
-   * in the sizeX of value
-   */
-  virtual void maxPool3DForward(Matrix& inputMat,
-                                Matrix& maxPoolIdx,
-                                size_t channels,
-                                size_t imgSizeD,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t outputD,
-                                size_t outputH,
-                                size_t outputW,
-                                size_t sizeZ,
-                                size_t sizeY,
-                                size_t sizeX,
-                                size_t strideD,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t paddingD,
-                                size_t paddingH,
-                                size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void maxPool3DBackward(Matrix& outGrad,
-                                 Matrix& maxPoolIdx,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW,
-                                 real scaleTargets,
-                                 real scaleOutput) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPool3DForward(Matrix& input,
-                                size_t channels,
-                                size_t imgSizeD,
-                                size_t imgSizeH,
-                                size_t imgSizeW,
-                                size_t outputD,
-                                size_t outputH,
-                                size_t outputW,
-                                size_t sizeZ,
-                                size_t sizeY,
-                                size_t sizeX,
-                                size_t strideD,
-                                size_t strideH,
-                                size_t strideW,
-                                size_t paddingD,
-                                size_t paddingH,
-                                size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPool3DBackward(Matrix& input,
-                                 size_t imgSizeD,
-                                 size_t imgSizeH,
-                                 size_t imgSizeW,
-                                 size_t outputD,
-                                 size_t outputH,
-                                 size_t outputW,
-                                 size_t sizeZ,
-                                 size_t sizeY,
-                                 size_t sizeX,
-                                 size_t strideD,
-                                 size_t strideH,
-                                 size_t strideW,
-                                 size_t paddingD,
-                                 size_t paddingH,
-                                 size_t paddingW,
-                                 real scaleTargets,
-                                 real scaleOutput) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
- * Input: one or more sequences. Each sequence contains some instances.
- *
- * Output: output size is the number of input sequences (NOT input
- * instances).
- *
- * output[i] is set to max_input[i].
- */
-  virtual void maxSequenceForward(Matrix& input,
-                                  const IVector& sequence,
-                                  IVector& index) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void maxSequenceBackward(Matrix& outputGrad,
-                                   const IVector& sequence,
-                                   IVector& index) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * if ids[i] == -1, it will be ignored
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids) {
-    (void)table;
-    (void)ids;
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * if ids[i] == -1, it will be ignored
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids) {
-    (void)table;
-    (void)ids;
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * table[i, id[i]] += this[i]
-   * @endcode
-   */
-  virtual void addElements(Matrix& table, IVector& ids) {
-    LOG(FATAL) << "Not implemented";
-  }
-  /**
-   * @brief  cross entropy for multi binary labels
-   *
-   * @code
-   * this[i] = -sum(label[i][j]*log(output[i][j])
-   *           + (1-label[i][j])*log(1-output[i][j]))
-   * @endcode
-   */
-  virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  The gradient of cross entropy for multi binary labels on output
-   *
-   * @code
-   * this[i][j] = -label[i][j]/output[i][j]
-   *              + (1-label[i][j])/(1-output[i][j])
-   * @endcode
-   */
-  virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  Calculate the classification error for multi binary labels
-   *
-   * @code
-   * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0)
-   *            || (output[i][j] < threshold && label[i][j] == 1))
-   *            / output->getWidth()
-   * @endcode
-   */
-  virtual void classificationErrorMulti(Matrix& output,
-                                        Matrix& label,
-                                        real threshold) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void paramReluForward(Matrix& data, Matrix& W) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void vol2Col(real* data,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void col2Vol(real* trg,
-                       int channels,
-                       int depth,
-                       int height,
-                       int width,
-                       int filterD,
-                       int filterH,
-                       int filterW,
-                       int strideD,
-                       int strideH,
-                       int strideW,
-                       int paddingD,
-                       int paddingH,
-                       int paddingW,
-                       real alpha,
-                       real beta) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void bilinearForward(const Matrix& in,
-                               const size_t inImgH,
-                               const size_t inImgW,
-                               const size_t outImgH,
-                               const size_t outImgW,
-                               const size_t numChannels,
-                               const real ratioH,
-                               const real ratioW) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void bilinearBackward(const Matrix& out,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (useGpu_) {
-      TensorGpuApply<real>(*this, expr);
-    } else {
-      TensorCpuApply<real>(*this, expr);
-    }
-  }
-
-  bool isEmpty() const { return data_ == nullptr; }
-
-  explicit operator bool() const { return !isEmpty(); }
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
-  mat.print(os);
-  return os;
-}
-
-class GpuMatrix : public Matrix {
-public:
-  GpuMatrix();
-
-  GpuMatrix(size_t height, size_t width, bool trans = false);
-  GpuMatrix(real* data, size_t height, size_t width, bool trans = false)
-      : Matrix(data, height, width, trans, true) {}
-  GpuMatrix(real* data,
-            size_t height,
-            size_t width,
-            size_t stride,
-            bool trans = false)
-      : Matrix(data, height, width, stride, trans, true) {}
-  GpuMatrix(GpuMemHandlePtr dataHandle,
-            size_t height,
-            size_t width,
-            bool trans = false)
-      : Matrix(dataHandle, height, width, trans, true) {}
-  ~GpuMatrix();
-
-  void zeroMem();
-  void resetOne();
-  void setDiag(real value);
-
-  void resize(size_t newHeight, size_t newWidth);
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-
-  /**
-   * Copy the data from cpu_memory buffer
-   */
-  void copyFrom(const real* hostSrc, size_t size);
-
-  void copyFrom(const real* hostSrc, const int64_t* seq);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-
-  void copyFrom(const Matrix& src);
-
-  void copyFrom(const IVector& src);
-
-  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
-
-  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
-
-  real getElement(size_t x, size_t y) const;
-
-  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  real getSum();
-  void accumulateColSum(Matrix& src);
-  real getAbsSum();
-
-  real getMin();
-  real getMax();
-
-  MatrixPtr getTranspose();
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
-
-  MatrixPtr getInverse();
-  void inverse(MatrixPtr& matInv, bool memAlloc);
-
-  /// add b to each sample of this.
-  void addBias(Matrix& b, real scale);
-  void addSharedBias(Matrix& b, real scale);
-
-  /**
-   * @code
-   * add each sample from a to this.
-   * @endcode
-   */
-  void collectBias(Matrix& a, real scale);
-  void collectSharedBias(Matrix& a, real scale);
-
-  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
-  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids);
-
-  void addColumnVector(const Matrix& b);
-
-  /**
-   * @code
-   * this = scaleAB*(a*b) + scaleT*this
-   * @endcode
-   */
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = a*b
-   * @endcode
-   */
-  void mul(const Matrix& a, const Matrix& b);
-
-  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
-
-  void mul(const GpuSparseMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT);
-
-  void mul(const GpuMatrix& a,
-           const GpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT);
-
-  /**
-   * @code
-   * this = scaleAB*(this*b) +  scaleT*this
-   * @endcode
-   */
-  void rightMul(Matrix& b, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = this* b
-   * @endcode
-   */
-  void rightMul(Matrix& b);
-
-  /**
-   * @code
-   * this = scaleAB*(a*this) +  scaleT*this
-   * @endcode
-   */
-  void leftMul(Matrix& a, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = a*this
-   * @endcode
-   */
-  void leftMul(Matrix& a);
-
-  void colMerge(Matrix& src);
-  void rowSum(Matrix& sum);
-  void rowMax(Matrix& max);
-  void rowMax(IVector& maxIds, Matrix& max);
-  void colMax(Matrix& max);
-  void colMax(IVector& maxIds, Matrix& max);
-  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
-
-  void oneHotCrossEntropy(Matrix& output, IVector& label);
-  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
-  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                      IVector& label,
-                                      real alpha);
-  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                        IVector& label,
-                                        real alpha);
-
-  void softmax(Matrix& output);
-  void sequenceSoftmax(Matrix& output, const IVector& index);
-  void softmaxBackward(Matrix& outputV);
-  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
-
-  /// calculate the sum of squares diff cost.
-  void sumOfSquares(Matrix& output, Matrix& label);
-
-  /// gradient of sumOfSquares.
-  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
-  void tanh(Matrix& output);
-  void tanhDerivative(Matrix& output);
-  void softrelu(Matrix& output);
-  void softreluDerivative(Matrix& output);
-  void scaledTanh(Matrix& output, real p1, real p2);
-
-  virtual void print(std::ostream& os) const;
-  virtual void print(std::ostream& os, size_t height, size_t width) const;
-
-  void paramReluForward(Matrix& data, Matrix& W);
-  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
-  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
-
-  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
-  void randomizeUniform();
-
-  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
-
-  void maxPoolForward(Matrix& inputMat,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      MatrixPtr maskMatP);
-
-  void maxPoolBackward(Matrix& image,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       Matrix& outGrad,
-                       Matrix& outV,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void avgPoolForward(Matrix& input,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      bool excludeMode = true);
-
-  void avgPoolBackward(Matrix& input,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW,
-                       bool excludeMode = true);
-
-  void maxPool3DForward(Matrix& inputMat,
-                        Matrix& maxPoolIdx,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void maxPool3DBackward(Matrix& outGrad,
-                         Matrix& maxPoolIdx,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void avgPool3DForward(Matrix& input,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void avgPool3DBackward(Matrix& input,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void maxSequenceForward(Matrix& input,
-                          const IVector& sequence,
-                          IVector& index);
-
-  void maxSequenceBackward(Matrix& outputGrad,
-                           const IVector& sequence,
-                           IVector& index);
-
-  void bilinearForward(const Matrix& in,
-                       const size_t inImgH,
-                       const size_t inImgW,
-                       const size_t outImgH,
-                       const size_t outImgW,
-                       const size_t numChannels,
-                       const real ratioH,
-                       const real ratioW);
-
-  void bilinearBackward(const Matrix& out,
-                        const size_t outImgH,
-                        const size_t outImgW,
-                        const size_t inImgH,
-                        const size_t inImgW,
-                        const size_t numChannels,
-                        const real ratioH,
-                        const real ratioW);
-
-  void vol2Col(real* data,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW);
-
-  void col2Vol(real* trg,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW,
-               real alpha,
-               real beta);
-
-  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
-
-  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorGpuApply<real>(*this, expr);
-  }
-};
-
-class CpuMatrix : public Matrix {
-private:
-  MatrixPtr sftmaxSum_;
-  MatrixPtr sftmaxDot_;
-
-public:
-  CpuMatrix(size_t height, size_t width, bool trans = false);
-  CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
-      : Matrix(data, height, width, trans, false) {}
-  CpuMatrix(real* data,
-            size_t height,
-            size_t width,
-            size_t stride,
-            bool trans = false)
-      : Matrix(data, height, width, stride, trans, false) {}
-
-  CpuMatrix(CpuMemHandlePtr dataHandle,
-            size_t height,
-            size_t width,
-            bool trans = false)
-      : Matrix(dataHandle, height, width, trans, false) {}
-
-  ~CpuMatrix();
-
-  void zeroMem();
-  void resetOne();
-  void setDiag(real value);
-
-  void resize(size_t newHeight, size_t newWidth);
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-
-  real getElement(size_t x, size_t y) const;
-  real getSum();
-  void accumulateColSum(Matrix& src);
-  real getAbsSum();
-
-  MatrixPtr getTranspose();
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
-
-  MatrixPtr getInverse();
-  void inverse(MatrixPtr& matInv, bool memAlloc);
-
-  void copyFrom(const Matrix& src);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-
-  void copyFrom(const real* cpuSrc, size_t size);
-
-  void copyFrom(const real* cpuSrc, const int64_t* seq);
-
-  void copyFrom(const IVector& src);
-
-  void copyFrom(CpuSparseMatrix& src);
-
-  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
-
-  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
-
-  void maxPoolForward(Matrix& inputMat,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      MatrixPtr maskMatP);
-
-  void maxPoolBackward(Matrix& image,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       Matrix& outGrad,
-                       Matrix& outV,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void avgPoolForward(Matrix& input,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW,
-                      bool excludeMode = true);
-
-  void avgPoolBackward(Matrix& input,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW,
-                       bool excludeMode = true);
-
-  void maxPool3DForward(Matrix& inputMat,
-                        Matrix& maxPoolIdx,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void maxPool3DBackward(Matrix& outGrad,
-                         Matrix& maxPoolIdx,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void avgPool3DForward(Matrix& input,
-                        size_t channels,
-                        size_t imgSizeD,
-                        size_t imgSizeH,
-                        size_t imgSizeW,
-                        size_t outputD,
-                        size_t outputH,
-                        size_t outputW,
-                        size_t sizeZ,
-                        size_t sizeY,
-                        size_t sizeX,
-                        size_t strideD,
-                        size_t strideH,
-                        size_t strideW,
-                        size_t paddingD,
-                        size_t paddingH,
-                        size_t paddingW);
-
-  void avgPool3DBackward(Matrix& input,
-                         size_t imgSizeD,
-                         size_t imgSizeH,
-                         size_t imgSizeW,
-                         size_t outputD,
-                         size_t outputH,
-                         size_t outputW,
-                         size_t sizeZ,
-                         size_t sizeY,
-                         size_t sizeX,
-                         size_t strideD,
-                         size_t strideH,
-                         size_t strideW,
-                         size_t paddingD,
-                         size_t paddingH,
-                         size_t paddingW,
-                         real scaleTargets,
-                         real scaleOutput);
-
-  void maxSequenceForward(Matrix& input,
-                          const IVector& sequence,
-                          IVector& index);
-
-  void maxSequenceBackward(Matrix& outputGrad,
-                           const IVector& sequence,
-                           IVector& index);
-
-  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-public:
-  /// add b to each sample of this.
-  void addBias(Matrix& b, real scale);
-  void addSharedBias(Matrix& b, real scale);
-
-  /// add each sample of a to this.
-  void collectBias(Matrix& a, real scale);
-  void collectSharedBias(Matrix& a, real scale);
-
-  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
-  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table[i, id[i]] += this[i]
-   * @endcode
-   */
-  virtual void addElements(Matrix& table, IVector& ids);
-
-  /**
-   * use abstract getRow() to get row from table.
-   *
-   * Define table as template instead of virtual class for performance sake.
-   * internal used by above two virtual funcs.
-   */
-  template <typename TableMatType>
-  void selectRowsImp(TableMatType& table, IVector& ids);
-  template <typename TableMatType>
-  void addToRowsImp(TableMatType& table, IVector& ids);
-
-  void addColumnVector(const Matrix& b);
-
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-  void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT);
-
-  static void mul(CpuMatrix* a,
-                  CpuMatrix* b,
-                  CpuSparseMatrix* c,
-                  real scaleAB,
-                  real scaleT);
-
-  /**
-   * c = a * b
-   *
-   * use abstract getRow() to get row from B,C.
-   * Define B,C as template instead of virtual class for performance sake.
-   */
-  template <typename MatBType, typename MatCType>
-  static void mul(
-      CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT);
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  void mul(const Matrix& a, const Matrix& b);
-
-  void rightMul(Matrix& b, real scaleAB, real scaleT);
-  void rightMul(Matrix& b);
-
-  void leftMul(Matrix& a, real scaleAB, real scaleT);
-  void leftMul(Matrix& a);
-  void colMerge(Matrix& src);
-  void rowSum(Matrix& sum);
-  void rowMaxId(IVector& maxIds);
-  void rowMax(Matrix& max);
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-  void colMax(Matrix& max);
-  void colMax(IVector& maxIds, Matrix& maxVal);
-  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void rowNormalizeL1(Matrix& out);
-
-  void oneHotCrossEntropy(Matrix& output, IVector& label);
-  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
-  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                      IVector& label,
-                                      real alpha);
-  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                        IVector& label,
-                                        real alpha);
-
-  void circularConv(Matrix& b, Matrix& c);
-  void circularConvDerivative(Matrix& output,
-                              Matrix& prevOut1,
-                              Matrix& prevOut2,
-                              Matrix& prevGrad1,
-                              Matrix& prevGrad2);
-
-  void softmax(Matrix& output);
-  void sequenceSoftmax(Matrix& output, const IVector& index);
-  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
-
-  /// calculate the sum of squares diff cost.
-  void sumOfSquares(Matrix& output, Matrix& label);
-
-  /// gradient of sumOfSquares.
-  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
-
-  void smoothL1(Matrix& output, Matrix& label, real destScale);
-  void smoothL1Bp(Matrix& output, Matrix& label, real destScale);
-
-  void tanh(Matrix& output);
-  void tanhDerivative(Matrix& output);
-
-  void softrelu(Matrix& output);
-  void softreluDerivative(Matrix& output);
-  void scaledTanh(Matrix& output, real p1, real p2);
-
-  void print(std::ostream& os) const;
-  void print(std::ostream& os, size_t height, size_t width) const;
-  void printOneRow(std::ostream& os, size_t idx) const;
-
-  void paramReluForward(Matrix& data, Matrix& W);
-  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
-  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
-
-  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
-
-  real getMin();
-  real getMax();
-
-  void randomizeUniform();
-
-  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
-
-  void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
-
-  void addByBitCodeBackward(size_t numClasses,
-                            const IVector& codes,
-                            Matrix& vec);
-
-  void mulByBitCode(size_t numClasses,
-                    const IVector& codes,
-                    const Matrix& mat,
-                    const Matrix& input);
-
-  void mulByBitCodeBackwardWeight(size_t numClasses,
-                                  const IVector& codes,
-                                  Matrix& mat,
-                                  const Matrix& input);
-
-  void mulByBitCodeBackwardError(size_t numClasses,
-                                 const IVector& codes,
-                                 const Matrix& mat,
-                                 Matrix& input);
-
-  void sumByBitCode(size_t numClasses,
-                    IVector& codes,
-                    Matrix& sum,
-                    real scaleSum);
-
-  void subByBitCode(size_t numClasses_, IVector& codes);
-
-  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
-  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
-  void classificationErrorMulti(Matrix& output, Matrix& label, real threshold);
-
-  void bilinearForward(const Matrix& in,
-                       const size_t inImgH,
-                       const size_t inImgW,
-                       const size_t outImgH,
-                       const size_t outImgW,
-                       const size_t numChannels,
-                       const real ratioH,
-                       const real ratioW);
-
-  void bilinearBackward(const Matrix& out,
-                        const size_t outImgH,
-                        const size_t outImgW,
-                        const size_t inImgH,
-                        const size_t inImgW,
-                        const size_t numChannels,
-                        const real ratioH,
-                        const real ratioW);
-
-  void vol2Col(real* data,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW);
-
-  void col2Vol(real* trg,
-               int channels,
-               int depth,
-               int height,
-               int width,
-               int filterD,
-               int filterH,
-               int filterW,
-               int strideD,
-               int strideH,
-               int strideW,
-               int paddingD,
-               int paddingH,
-               int paddingW,
-               real alpha,
-               real beta);
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorCpuApply<real>(*this, expr);
-  }
-};
-
-class SharedCpuMatrix : public CpuMatrix {
-public:
-#ifndef PADDLE_MOBILE_INFERENCE
-  /* blockNum is number of partitions of the matrix  */
-  SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
-      : CpuMatrix(height, width, trans) {
-    initShared(blockNum);
-  }
-  SharedCpuMatrix(
-      int blockNum, real* data, size_t height, size_t width, bool trans = false)
-      : CpuMatrix(data, height, width, trans) {
-    initShared(blockNum);
-  }
-
-  SharedCpuMatrix(int blockNum,
-                  CpuMemHandlePtr dataHandle,
-                  size_t height,
-                  size_t width,
-                  bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {
-    initShared(blockNum);
-  }
-
-  SharedCpuMatrix(CpuMemHandlePtr dataHandle,
-                  size_t height,
-                  size_t width,
-                  bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {
-    initBlock(1);
-  }
-
-  ~SharedCpuMatrix() {}
-
-public:
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-  virtual void add(Matrix& b, real p1, real p2);
-  virtual void add(real p1, real p2);
-
-private:
-  using Matrix::mul;
-  void initShared(int blockNum);
-  void initBlock(int blockNum);
-
-  int blockNum_;
-  std::vector<std::unique_ptr<std::mutex>> blockLocks_;
-  ThreadLocal<CpuMatrixPtr> localBuf_;
-  ThreadLocal<std::vector<int>> localBufRows_;
-  ThreadLocal<std::vector<int>> blockSeq_;
-#endif
-};
-
-typedef struct { unsigned int col; } sparse_non_value_t;
-
-typedef struct {
-  unsigned int col;
-  float value;
-} sparse_float_value_t;
-
-}  // namespace paddle
-#include "ExecViaCpu.h"
diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/math/MatrixBitCode.cpp
deleted file mode 100644
index 61a9923bc2e6f358738f80de4a30d83c0cc00656..0000000000000000000000000000000000000000
--- a/paddle/math/MatrixBitCode.cpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Matrix.h"
-#include "hl_gpu.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-namespace {
-
-struct SimpleCode {
-  SimpleCode(size_t code, size_t numClasses) : c_(code + numClasses) {}
-  inline size_t calcIndex(int bit) const { return (c_ >> (bit + 1)) - 1; }
-  inline bool calcBit(int bit) const { return c_ & (1 << bit); }
-  inline int getLength() const { return findLastSet(c_) - 1; }
-
-private:
-  size_t c_;
-};
-
-struct SimpleCodeTable {
-  explicit SimpleCodeTable(size_t numClasses) : numClasses_(numClasses) {}
-  SimpleCode operator()(size_t code) const {
-    return SimpleCode(code, numClasses_);
-  }
-  size_t size() const { return numClasses_; }
-  int getMaxCodeLength() const { return findLastSet(numClasses_ - 1); }
-
-private:
-  size_t numClasses_;
-  int maxCodeLength_;
-};
-
-}  // namespace
-
-/**
- * CodeTable class should support 3 functions:
- *
- * size_t size()
- *   return the number of codes
- *
- * int getMaxCodeLength()
- *   return the maximal code length
- *
- * Code operator()(size_t i)
- *   return the i-th code. Code class is descriebed below.
- *
- * Code class should support 3 functions:
- *
- * int getLength()
- *   return the length of the code
- *
- * bool calcIndex(int bit)
- *   bit ranges from 0 to getLength() - 1
- *   return the index for the (1+bit) level parent
- *
- * bool calcBit(int bit)
- *   return true if the bit level parent is the right child of (1+bit) level
- *   parent
- *
- */
-
-/*
-   for i:
-     for j < codeLength:
-       op(tmat(i, j), vec(0, index(i, j)))
-*/
-template <class CodeTable, class Op, class TMat, class Mat>
-static void addByBitCodeT(
-    Op op, CodeTable codeTable, const IVector& codes, TMat& tmat, Mat& vec) {
-  CHECK(!vec.useGpu());
-
-  size_t numClasses = codeTable.size();
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-  CHECK_EQ(vec.getHeight(), (size_t)1);
-  CHECK_EQ(vec.getWidth(), numClasses - 1);
-
-  auto data = tmat.getData();
-  auto v = vec.getData();
-  const int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      size_t index = code.calcIndex(j);
-      op(data[i * oWidth + j], v[index]);
-    }
-  }
-}
-
-/* For j < codeLength:
-   this(i, j) += vec(0, index(i, j))
-*/
-void CpuMatrix::addByBitCode(size_t numClasses,
-                             const IVector& codes,
-                             const Matrix& vec) {
-  auto op = [](real& t, real v) { t += v; };
-  addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec);
-}
-
-/* For j < codeLength:
-   vec(0, index(i, j)) += this(i, j)
-*/
-void CpuMatrix::addByBitCodeBackward(size_t numClasses,
-                                     const IVector& codes,
-                                     Matrix& vec) {
-  auto op = [](real t, real& v) { v += t; };
-  addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec);
-}
-
-/*
-  for i:
-    for j < codeLength:
-      op(tmat(i, j), mat.row(index(i, j)), input.row(i))
-*/
-template <class Op,
-          class CodeTable,
-          class IVec,
-          class TMat,
-          class WMat,
-          class InMat>
-void mulByBitCodeT(Op op,
-                   CodeTable codeTable,
-                   IVec& codes,
-                   TMat& tmat,
-                   WMat& weight,
-                   InMat& input) {
-  CHECK(!tmat.useGpu() && !weight.useGpu() && !input.useGpu());
-
-  size_t numClasses = codeTable.size();
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t inputDim = input.getWidth();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-  CHECK_EQ(input.getHeight(), numSamples);
-  CHECK_EQ(weight.getHeight(), numClasses - 1);
-  CHECK_EQ(weight.getWidth(), inputDim);
-
-  real* data = tmat.getData();
-  const int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      size_t index = code.calcIndex(j);
-      op(data[i * oWidth + j], weight.rowBuf(index), input.rowBuf(i), inputDim);
-    }
-  }
-}
-
-/* For j < codeLength:
-   this(i, j) += <weight.row(index(i, j)), input.row(i)>
-*/
-void CpuMatrix::mulByBitCode(size_t numClasses,
-                             const IVector& codes,
-                             const Matrix& weight,
-                             const Matrix& input) {
-  auto op = [](
-      real& t, const real* weightRow, const real* inputRow, size_t inputDim) {
-    real sum = 0;
-    for (size_t k = 0; k < inputDim; ++k) {
-      sum += weightRow[k] * inputRow[k];
-    }
-    t += sum;
-  };
-
-  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
-}
-
-/* For index(i, j) >= 0:
-   weight.row(index(i, j)) += this(i, j) * input.row(i)
-*/
-void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses,
-                                           const IVector& codes,
-                                           Matrix& weight,
-                                           const Matrix& input) {
-  auto op = [](
-      const real t, real* weightRow, const real* inputRow, size_t inputDim) {
-    for (size_t k = 0; k < inputDim; ++k) {
-      weightRow[k] += t * inputRow[k];
-    }
-  };
-
-  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
-}
-
-/* For j < codeLength:
-   input.row(i) += this(i, j) * weight.row(index(i, j))
-*/
-void CpuMatrix::mulByBitCodeBackwardError(size_t numClasses,
-                                          const IVector& codes,
-                                          const Matrix& weight,
-                                          Matrix& input) {
-  auto op = [](
-      const real t, const real* weightRow, real* inputRow, size_t inputDim) {
-    for (size_t k = 0; k < inputDim; ++k) {
-      inputRow[k] += t * weightRow[k];
-    }
-  };
-
-  mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input);
-}
-
-template <class CodeTable>
-void sumByBitCodeT(CodeTable codeTable,
-                   IVector& codes,
-                   const CpuMatrix& tmat,
-                   Matrix& sum,
-                   real scaleSum) {
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-  CHECK_EQ(sum.getHeight(), numSamples);
-  CHECK_EQ(sum.getWidth(), (size_t)1);
-
-  const real* data = tmat.getData();
-  real* s = sum.getData();
-  int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    real sm = 0;
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      if (code.calcBit(j)) {
-        sm += data[i * oWidth + j];
-      }
-    }
-    s[i] = scaleSum * sm;
-  }
-}
-
-/* For j < codeLength:
-   sum(i, 0) = \sum_j  bit(i, j) * this(i, j)
-*/
-void CpuMatrix::sumByBitCode(size_t numClasses,
-                             IVector& codes,
-                             Matrix& sum,
-                             real scaleSum) {
-  sumByBitCodeT(SimpleCodeTable(numClasses), codes, *this, sum, scaleSum);
-}
-
-template <class CodeTable>
-void subByBitCodeT(CodeTable codeTable, IVector& codes, CpuMatrix& tmat) {
-  size_t maxCodeLength = codeTable.getMaxCodeLength();
-  size_t numSamples = tmat.getHeight();
-  size_t oWidth = tmat.getWidth();
-  CHECK_EQ(tmat.getWidth(), maxCodeLength);
-  CHECK_EQ(codes.getSize(), numSamples);
-
-  real* data = tmat.getData();
-  int* c = codes.getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    auto code = codeTable(c[i]);
-    int codeLength = code.getLength();
-    for (int j = 0; j < codeLength; ++j) {
-      if (code.calcBit(j)) {
-        data[i * oWidth + j] -= 1;
-      }
-    }
-  }
-}
-
-/* For j < codeLength
-   this(i, j) -= bit(i, j)
-*/
-void CpuMatrix::subByBitCode(size_t numClasses, IVector& codes) {
-  subByBitCodeT(SimpleCodeTable(numClasses), codes, *this);
-}
-
-}  // namespace paddle
diff --git a/paddle/math/MemoryHandle.h b/paddle/math/MemoryHandle.h
deleted file mode 100644
index 03ee413c1218376635c4696ebb774c584aa67aa4..0000000000000000000000000000000000000000
--- a/paddle/math/MemoryHandle.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-#include "PoolAllocator.h"
-
-namespace paddle {
-
-class MemoryHandle {
-protected:
-  explicit MemoryHandle(size_t size);
-  virtual ~MemoryHandle() {}
-
-public:
-  void* getBuf() const { return buf_; }
-  size_t getSize() const { return size_; }
-  size_t getAllocSize() const { return allocSize_; }
-
-protected:
-  PoolAllocator* allocator_;
-  size_t size_;       // the requested size
-  size_t allocSize_;  // the allocated size
-  int deviceId_;      // the device id of memory if gpu memory
-  void* buf_;
-};
-
-/**
- * Wrapper class for raw gpu memory handle.
- *
- * The raw handle will be released at destructor
- */
-class GpuMemoryHandle : public MemoryHandle {
-public:
-  explicit GpuMemoryHandle(size_t size);
-  virtual ~GpuMemoryHandle();
-};
-
-/**
- * Wrapper class for raw cpu memory handle.
- *
- * The raw handle will be released at destructor
- */
-class CpuMemoryHandle : public MemoryHandle {
-public:
-  explicit CpuMemoryHandle(size_t size);
-  virtual ~CpuMemoryHandle();
-};
-
-typedef std::shared_ptr<MemoryHandle> MemoryHandlePtr;
-typedef std::shared_ptr<CpuMemoryHandle> CpuMemHandlePtr;
-typedef std::shared_ptr<GpuMemoryHandle> GpuMemHandlePtr;
-}  // namespace paddle
diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h
deleted file mode 100644
index 90141fef3fd43fe221874cc50e688f6db9e2dee6..0000000000000000000000000000000000000000
--- a/paddle/math/PoolAllocator.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <memory>
-#include <mutex>
-#include <unordered_map>
-#include <vector>
-#include "Allocator.h"
-
-namespace paddle {
-
-/**
- * @brief Memory pool allocator implementation.
- */
-class PoolAllocator {
-public:
-  /**
-   * @brief constructor.
-   * @param allocator a Allocator object.
-   * @param sizeLimit The maximum size memory can be managed,
-   * if sizeLimit == 0, the pool allocator is a simple wrapper of allocator.
-   */
-  PoolAllocator(Allocator* allocator,
-                size_t sizeLimit = 0,
-                const std::string& name = "pool");
-
-  /**
-   * @brief destructor.
-   */
-  ~PoolAllocator();
-
-  void* alloc(size_t size);
-  void free(void* ptr, size_t size);
-  std::string getName() { return name_; }
-
-private:
-  void freeAll();
-  void printAll();
-  std::unique_ptr<Allocator> allocator_;
-  std::mutex mutex_;
-  std::unordered_map<size_t, std::vector<void*>> pool_;
-  size_t sizeLimit_;
-  size_t poolMemorySize_;
-  std::string name_;
-};
-
-}  // namespace paddle
diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
deleted file mode 100644
index 2e4d11a86bf8bd1308b2972f549bc7c201044785..0000000000000000000000000000000000000000
--- a/paddle/math/RowBuffer.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "MemoryHandle.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-/**
- * @brief The RowBuffer class
- * Represent the SparseRow Matrix Data.
- *
- * If not set memory handler, then the data could be auto growth.
- */
-class RowBuffer {
-public:
-  /**
-   * @brief RowBuffer create a auto-growth row buffer. The row length is width.
-   * @param width the length of each row, a.k.a matrix width.
-   */
-  explicit RowBuffer(size_t width) : width_(width) {}
-
-  /**
-   * @brief RowBuffer create a row buffer, which cannot be auto-growth.
-   * @param mem the pre-allocated memory.
-   * @param width the length of each row, a.k.a matrix width.
-   */
-  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
-      : preallocatedBuf_(mem), width_(width) {}
-
-  /**
-   * @brief resize resize the buffer with rowCount
-   * @param rowCnt number of row. matrix height.
-   */
-  inline void resize(int rowCnt) {
-    if (preallocatedBuf_) {
-      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
-    } else {
-      rowStore_.resize(rowCnt * width_);
-    }
-  }
-
-  /**
-   * @brief get a row buffer with row index.
-   * @param row the index of row.
-   * @return row buffer.
-   */
-  inline real* get(int row) const {
-    if (preallocatedBuf_) {
-      CHECK_LE((row)*width_ * sizeof(real), preallocatedBuf_->getSize());
-      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
-    } else {
-      CHECK_LE((row + 1) * width_, rowStore_.size());
-      return const_cast<real*>(rowStore_.data() + row * width_);
-    }
-  }
-
-  /**
-   * @brief get a row buffer with row index. If row index is larger than local
-   *        buffer, the size of local buffer will grow.
-   * @param row the index of row.
-   * @return row buffer.
-   */
-  inline real* getWithAutoGrowth(int row) {
-    if (preallocatedBuf_) {
-      return get(row);
-    } else {
-      if ((rowStore_.size() <= row * width_)) {
-        rowStore_.resize((row + 1) * width_);
-      }
-      return rowStore_.data() + row * width_;
-    }
-  }
-
-  /**
-   * @return raw data buffer.
-   */
-  inline real* data() {
-    if (preallocatedBuf_) {
-      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
-    } else {
-      return rowStore_.data();
-    }
-  }
-
-  /**
-   * @brief clear local buffer. It only affect auto-growth buffer.
-   */
-  inline void clear() {
-    // swap an empty vector to it to free the memory.
-    std::vector<real, AlignedAllocator<real, 32>> empty;
-    rowStore_.swap(empty);
-  }
-
-  /**
-   * @brief get current number of rows.
-   * @return number of rows.
-   */
-  inline size_t getRowCount() const {
-    if (preallocatedBuf_) {
-      return preallocatedBuf_->getSize() / sizeof(real) / width_;
-    } else {
-      return rowStore_.size() / width_;
-    }
-  }
-
-  /**
-   * @brief get is this buffer can automatically grow or not.
-   * @return ture if can automacitally grow.
-   */
-  inline bool isAutoGrowth() const { return !preallocatedBuf_; }
-
-  /**
-   * @brief return the width of matrix. a.k.a length of row.
-   * @return width of matrix
-   */
-  inline size_t getWidth() const { return width_; }
-
-private:
-  //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
-  //! of std::vector here.
-  CpuMemHandlePtr preallocatedBuf_;
-  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
-  size_t width_;
-};
-}  // namespace paddle
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
deleted file mode 100644
index 1faa343dbcef3d20b29b272a8da37f8e2bba654b..0000000000000000000000000000000000000000
--- a/paddle/math/SparseMatrix.cpp
+++ /dev/null
@@ -1,864 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SparseMatrix.h"
-#include <algorithm>
-#include <iostream>
-#include <vector>
-#include "hl_gpu.h"
-#include "hl_top_k.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-GpuSparseMatrix::GpuSparseMatrix(size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, true) {
-  resize(height, width, nnz, valueType, format);
-}
-
-GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle,
-                                 hl_sparse_matrix_s_ptr sMatrix,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans,
-                                 MemoryHandlePtr sMemoryHandle)
-    : Matrix(dataHandle, height, width, trans, true) {
-  CHECK(dataHandle && sMatrix) << "Invalid argument pointer";
-
-  size_t size = 0;
-  if (format == SPARSE_CSR) {
-    size = (height + 1) * sizeof(int) + nnz * sizeof(int);
-  } else {
-    size = (width + 1) * sizeof(int) + nnz * sizeof(int);
-  }
-
-  if (NO_VALUE != valueType) {
-    size += nnz * sizeof(real);
-  }
-  CHECK_LE(size, dataHandle->getSize());
-
-  sMatrix_ = sMatrix;
-
-  if (sMemoryHandle == NULL) {
-    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(dataHandle->getSize());
-  } else {
-    CHECK_EQ(sMemoryHandle->getSize(), dataHandle->getSize());
-    sMemoryHandle_ = sMemoryHandle;
-  }
-
-  elementCnt_ = nnz;
-  valueType_ = valueType;
-  format_ = format;
-  if (format_ == SPARSE_CSR)
-    sparseResizeCSR();
-  else
-    sparseResizeCSC();
-}
-
-GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans,
-                                 MemoryHandlePtr sMemoryHandle)
-    : Matrix(NULL, height, width, trans, true) {
-  CHECK(sMatrix) << "Invalid argument pointer";
-  sMatrix_ = sMatrix;
-  sMemoryHandle_ = sMemoryHandle;
-  elementCnt_ = nnz;
-  format_ = format;
-  valueType_ = valueType;
-}
-
-GpuSparseMatrix::GpuSparseMatrix(real* value,
-                                 int* rows,
-                                 int* cols,
-                                 size_t height,
-                                 size_t width,
-                                 size_t nnz,
-                                 SparseValueType valueType,
-                                 SparseFormat format,
-                                 bool trans)
-    : Matrix(NULL, height, width, trans, true) {
-  size_t size = 0;
-  if (format == SPARSE_CSR) {
-    size = (height + 1) * sizeof(int) + nnz * sizeof(int);
-  } else {
-    size = (width + 1) * sizeof(int) + nnz * sizeof(int);
-  }
-
-  if (NO_VALUE != valueType) {
-    size += nnz * sizeof(real);
-  }
-  elementCnt_ = nnz;
-  valueType_ = valueType;
-  format_ = format;
-
-  sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(size);
-  if (format_ == SPARSE_CSR) {
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (height_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-          (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-
-    if (sMatrix_ == NULL) {
-      /* construct hl_sparse_matrix_s */
-      hl_sparse_matrix_s tmp;
-      hl_construct_sparse_matrix(
-          &tmp,
-          value,
-          rows,
-          cols,
-          HL_SPARSE_CSR,
-          valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-          height_,
-          width_,
-          elementCnt_);
-      hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-      sMatrix_ = tmp2;
-    }
-
-  } else {
-    cols_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-    rows_ = reinterpret_cast<int*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (width_ + 1) * sizeof(int));
-    if (NO_VALUE != valueType_) {
-      value_ = reinterpret_cast<real*>(
-          reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-          (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-    } else {
-      value_ = NULL;
-    }
-
-    if (sMatrix_ == NULL) {
-      /* construct hl_sparse_matrix_s */
-      hl_sparse_matrix_s tmp;
-      hl_construct_sparse_matrix(
-          &tmp,
-          value,
-          rows,
-          cols,
-          HL_SPARSE_CSC,
-          valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-          height_,
-          width_,
-          elementCnt_);
-      hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-      sMatrix_ = tmp2;
-    }
-  }
-}
-
-void GpuSparseMatrix::sparseResizeCSR() {
-  rows_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-  cols_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-                             (height_ + 1) * sizeof(int));
-  if (NO_VALUE != valueType_) {
-    value_ = reinterpret_cast<real*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (height_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-  } else {
-    value_ = NULL;
-  }
-
-  if (sMatrix_ == NULL) {
-    /* construct hl_sparse_matrix_s */
-    hl_sparse_matrix_s tmp;
-    hl_construct_sparse_matrix(
-        &tmp,
-        data_,
-        memoryHandle_->getSize(),
-        HL_SPARSE_CSR,
-        valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-        height_,
-        width_,
-        elementCnt_);
-    hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-    sMatrix_ = tmp2;
-  }
-}
-
-void GpuSparseMatrix::sparseResizeCSC() {
-  cols_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()));
-  rows_ =
-      reinterpret_cast<int*>(reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-                             (width_ + 1) * sizeof(int));
-  if (NO_VALUE != valueType_) {
-    value_ = reinterpret_cast<real*>(
-        reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-        (width_ + 1) * sizeof(int) + elementCnt_ * sizeof(int));
-  } else {
-    value_ = NULL;
-  }
-
-  if (sMatrix_ == NULL) {
-    /* construct hl_sparse_matrix_s */
-    hl_sparse_matrix_s tmp;
-    hl_construct_sparse_matrix(
-        &tmp,
-        memoryHandle_->getBuf(),
-        memoryHandle_->getSize(),
-        HL_SPARSE_CSC,
-        valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE,
-        height_,
-        width_,
-        elementCnt_);
-    hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
-    sMatrix_ = tmp2;
-  }
-}
-
-void GpuSparseMatrix::resize(size_t newHeight,
-                             size_t newWidth,
-                             size_t newNnz,
-                             SparseValueType valueType,
-                             SparseFormat format) {
-  if (format == SPARSE_CSR) {
-    resizeCSR(newHeight, newWidth, newNnz, valueType);
-  } else {
-    resizeCSC(newHeight, newWidth, newNnz, valueType);
-  }
-}
-
-void GpuSparseMatrix::resizeCSR(size_t newHeight,
-                                size_t newWidth,
-                                size_t newNnz,
-                                SparseValueType valueType) {
-  size_t newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int);
-  if (NO_VALUE != valueType) {
-    newSize += newNnz * sizeof(real);
-  }
-
-  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
-    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize);
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
-    end_ = reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-           sMemoryHandle_->getSize();
-    sMatrix_ = NULL;
-  } else if (valueType != valueType_) {
-    sMatrix_ = NULL;
-  } else {
-    /*
-     * newNnz > elementCnt_ is necessary for the following condition:
-     * Firstly, height_ is 9 elementCnt_ is 56
-     * Secondly, height_ is 11 elementCnt_ is 44
-     *   ==> height_ is bigger, sMatrix_ will resize, and total item is 44 now
-     * Then, height_ is 10 elementCnt_ is 52
-     *   ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail
-     */
-    if ((ssize_t)((newHeight + 1) * sizeof(int)) >
-            ((char*)cols_ - (char*)rows_) ||
-        newNnz > static_cast<size_t>(sMatrix_->nnz)) {
-      sMatrix_ = NULL;
-    } else if (NO_VALUE == valueType) {
-      if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)cols_)) {
-        sMatrix_ = NULL;
-      }
-    } else {
-      if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)cols_) ||
-          (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) {
-        sMatrix_ = NULL;
-      }
-    }
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newNnz;
-  valueType_ = valueType;
-  format_ = SPARSE_CSR;
-
-  if (sMatrix_ == NULL) {
-    sparseResizeCSR();
-  }
-}
-
-void GpuSparseMatrix::resizeCSC(size_t newHeight,
-                                size_t newWidth,
-                                size_t newNnz,
-                                SparseValueType valueType) {
-  size_t newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int);
-  if (NO_VALUE != valueType) {
-    newSize += newNnz * sizeof(real);
-  }
-
-  if (NULL == memoryHandle_.get() || newSize > memoryHandle_->getSize()) {
-    memoryHandle_ = std::make_shared<GpuMemoryHandle>(newSize);
-    data_ = reinterpret_cast<real*>(memoryHandle_->getBuf());
-    sMemoryHandle_ = std::make_shared<CpuMemoryHandle>(newSize);
-    end_ = reinterpret_cast<char*>(sMemoryHandle_->getBuf()) +
-           sMemoryHandle_->getSize();
-    sMatrix_ = NULL;
-  } else if (valueType != valueType_) {
-    sMatrix_ = NULL;
-  } else {
-    /*
-     * newNnz > elementCnt_ is necessary for the following condition:
-     * Firstly, height_ is 9 elementCnt_ is 56
-     * Secondly, height_ is 11 elementCnt_ is 44
-     *   ==> height_ is bigger, sMatrix_ will resize,
-     *       and total item is 44 now
-     * Then, height_ is 10 elementCnt_ is 52
-     *   ==> Without newNnz > elementCnt_ condition, sMatrix_ will fail
-     */
-    if ((ssize_t)((newWidth + 1) * sizeof(int)) >
-            ((char*)rows_ - (char*)cols_) ||
-        newNnz > static_cast<size_t>(sMatrix_->nnz)) {
-      sMatrix_ = NULL;
-    } else if (NO_VALUE == valueType) {
-      if ((ssize_t)(newNnz * sizeof(int)) > (end_ - (char*)rows_)) {
-        sMatrix_ = NULL;
-      }
-    } else {
-      if ((ssize_t)(newNnz * sizeof(int)) > ((char*)value_ - (char*)rows_) ||
-          (ssize_t)(newNnz * sizeof(real)) > (end_ - (char*)value_)) {
-        sMatrix_ = NULL;
-      }
-    }
-  }
-
-  height_ = newHeight;
-  width_ = newWidth;
-  elementCnt_ = newNnz;
-  valueType_ = valueType;
-  format_ = SPARSE_CSC;
-
-  if (sMatrix_ == NULL) {
-    sparseResizeCSC();
-  }
-}
-
-void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth) {
-  resize(newHeight, newWidth, elementCnt_, valueType_, format_);
-}
-
-MatrixPtr GpuSparseMatrix::getTranspose() {
-  CHECK(memoryHandle_.get() || sMatrix_) << "not supported";
-  if (memoryHandle_.get()) {
-    MatrixPtr copy_T(new GpuSparseMatrix(
-        std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle_),
-        sMatrix_,
-        height_,
-        width_,
-        elementCnt_,
-        valueType_,
-        format_,
-        true,
-        sMemoryHandle_));
-    return copy_T;
-  } else {
-    MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_,
-                                         height_,
-                                         width_,
-                                         elementCnt_,
-                                         valueType_,
-                                         format_,
-                                         true,
-                                         sMemoryHandle_));
-    return copy_T;
-  }
-}
-
-void GpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_non_value_t* row) {
-  memcpy(cols_ + offsets, row, sizeof(int) * colNum);
-}
-
-void GpuSparseMatrix::copyRow(int offsets,
-                              size_t colNum,
-                              const sparse_float_value_t* row) {
-  for (size_t j = 0; j < colNum; j++) {
-    cols_[offsets + j] = row[j].col;
-    value_[offsets + j] = row[j].value;
-  }
-}
-
-void GpuSparseMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
-  if (auto mat = dynamic_cast<const CpuSparseMatrix*>(&src)) {
-    copyFrom(*(const_cast<CpuSparseMatrix*>(mat)), stream);
-  } else if (auto mat = dynamic_cast<const GpuSparseMatrix*>(&src)) {
-    copyFrom(*(const_cast<GpuSparseMatrix*>(mat)), stream);
-  } else {
-    LOG(FATAL) << "Not implemented";
-  }
-}
-
-void GpuSparseMatrix::copyFrom(const Matrix& src) {
-  copyFrom(src, HPPL_STREAM_1);
-  hl_stream_synchronize(HPPL_STREAM_1);
-}
-
-template <class T>
-void GpuSparseMatrix::copyFrom(int64_t* ids,
-                               int64_t* indices,
-                               T* data,
-                               hl_stream_t stream) {
-  CHECK_EQ(format_, SPARSE_CSR);
-  size_t nnz = 0;
-  for (size_t i = 0; i < height_; i++) {
-    int64_t id = ids[i];
-    nnz += indices[id + 1] - indices[id];
-  }
-
-  resize(height_,
-         width_,
-         nnz,
-         sizeof(T) == sizeof(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE,
-         format_);
-
-  rows_[0] = 0;
-  for (size_t i = 0; i < height_; i++) {
-    int64_t id = ids[i];
-    size_t colNum = indices[id + 1] - indices[id];
-    rows_[i + 1] = rows_[i] + colNum;
-
-    T* row = data + indices[id];
-    copyRow(rows_[i], colNum, row);
-  }
-
-  sMatrix_->format = HL_SPARSE_CSR;
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-  hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, stream);
-}
-
-void GpuSparseMatrix::setRow(size_t row,
-                             size_t colNum,
-                             const unsigned int* cols,
-                             const real* values) {
-  CHECK_EQ(format_, SPARSE_CSR);
-  if (NO_VALUE == valueType_) {
-    CHECK_LT(row, height_);
-    CHECK(NULL != cols);
-    CHECK(NULL == values);
-  } else {
-    CHECK_LT(row, height_);
-    CHECK(NULL != cols);
-    CHECK(NULL != values);
-  }
-  if (0 == row) {
-    rows_[row] = 0;
-  }
-  rows_[row + 1] = rows_[row] + colNum;
-
-  memcpy(cols_ + rows_[row], cols, sizeof(*cols) * colNum);
-  if (FLOAT_VALUE == valueType_) {
-    memcpy(value_ + rows_[row], values, sizeof(*values) * colNum);
-  }
-
-  if (height_ - 1 == row) {
-    sMatrix_->format = HL_SPARSE_CSR;
-    sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-    sMatrix_->rows = height_;
-    sMatrix_->cols = width_;
-    sMatrix_->nnz = elementCnt_;
-    hl_memcpy_csr_matrix(
-        sMatrix_.get(), value_, rows_, cols_, HPPL_STREAM_DEFAULT);
-  }
-}
-
-SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; }
-
-void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
-  CHECK_EQ(format_, SPARSE_CSC);
-  int nnz = sMatrix_->nnz;
-  if (memAlloc) {
-    matTrans = std::make_shared<GpuSparseMatrix>(
-        width_, height_, nnz, valueType_, format_, false);
-  } else {
-    CHECK(matTrans != nullptr);
-  }
-
-  CpuIVector rows(nnz);
-  CpuIVector cols(width_ + 1);
-  CpuIVector cols_full(nnz);
-  CpuVector value(nnz);
-  hl_stream_t stream = HPPL_STREAM_1;
-  hl_memcpy_from_csc_matrix(value.getData(),
-                            nnz,
-                            rows.getData(),
-                            nnz,
-                            cols.getData(),
-                            width_ + 1,
-                            sMatrix_.get(),
-                            stream);
-
-  hl_stream_synchronize(stream);
-
-  /*for every non zero number, get its column index*/
-  std::vector<Element> dataVec;
-  for (size_t i = 0; i < width_; i++) {
-    for (int j = cols.getData()[i]; j < cols.getData()[i + 1]; j++) {
-      cols_full.getData()[j] = i;
-    }
-  }
-
-  /*sort row index and column index by the ascending order*/
-  for (int i = 0; i < nnz; i++) {
-    dataVec.emplace_back(
-        rows.getData()[i], cols_full.getData()[i], value.getData()[i]);
-  }
-  std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) {
-    return a.row < b.row || (a.row == b.row && a.col < b.col);
-  });
-
-  /*get sorted data, row index, and col index, put them in the right place*/
-  cols.resize(height_ + 1);
-  rows.resize(nnz);
-  value.resize(nnz);
-
-  cols.getData()[0] = 0;
-  rows.getData()[0] = dataVec[0].col;
-  value.getData()[0] = dataVec[0].val;
-  for (int i = 1; i < nnz; i++) {
-    if (dataVec[i].row != dataVec[i - 1].row) {
-      for (int j = dataVec[i - 1].row + 1; j <= dataVec[i].row; j++) {
-        cols.getData()[j] = i;
-      }
-    }
-    rows.getData()[i] = dataVec[i].col;
-    value.getData()[i] = dataVec[i].val;
-  }
-  cols.getData()[height_] = nnz;
-
-  /*copy back from cpu*/
-  GpuSparseMatrixPtr dest =
-      std::dynamic_pointer_cast<GpuSparseMatrix>(matTrans);
-  hl_memcpy_csc_matrix((dest->sMatrix_).get(),
-                       value.getData(),
-                       rows.getData(),
-                       cols.getData(),
-                       stream);
-  hl_stream_synchronize(stream);
-}
-
-void GpuSparseMatrix::mul(const GpuMatrix& a,
-                          const GpuMatrix& b,
-                          real scaleAB,
-                          real scaleT) {
-  CHECK(a.useGpu_ && b.useGpu_) << "type not match";
-  CHECK(!trans_) << "trans not supported";
-  real* A_d = (real*)a.getData();
-  real* B_d = (real*)b.getData();
-  hl_sparse_matrix_s C_d = sMatrix_.get();
-  hl_trans_op_t a_trans = a.trans_ ? HPPL_OP_T : HPPL_OP_N;
-  hl_trans_op_t b_trans = b.trans_ ? HPPL_OP_T : HPPL_OP_N;
-
-  if (!a.trans_ && !b.trans_) {
-    CHECK(height_ == a.getHeight());
-    CHECK(width_ == b.getWidth());
-    CHECK(a.getWidth() == b.getHeight());
-  } else if (a.trans_ && !b.trans_) {
-    CHECK(height_ == a.getWidth());
-    CHECK(width_ == b.getWidth());
-    CHECK(a.getHeight() == b.getHeight());
-  } else if (!a.trans_ && b.trans_) {
-    CHECK(height_ == a.getHeight());
-    CHECK(width_ == b.getHeight());
-    CHECK(a.getWidth() == b.getWidth());
-  } else {
-    LOG(INFO) << "Not support";
-  }
-  int dimM = height_;
-  int dimN = width_;
-  int dimK = !b.trans_ ? b.getHeight() : b.getWidth();
-  hl_sparse_matrix_mul(
-      A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT);
-}
-
-void GpuSparseMatrix::mul(const Matrix& a,
-                          const Matrix& b,
-                          real scaleAB,
-                          real scaleT) {
-  const auto a_ptr = dynamic_cast<const GpuMatrix*>(&a);
-  const auto b_ptr = dynamic_cast<const GpuMatrix*>(&b);
-  if (a_ptr && b_ptr) {
-    mul(*a_ptr, *b_ptr, scaleAB, scaleT);
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-template <class T>
-void printBuf(std::ostream& os, T* a, size_t len, const char* name) {
-  os << "\n: " << name << " [";
-  for (size_t i = 0; i < len; i++) {
-    os << a[i] << " ";
-  }
-  os << "]\n";
-}
-
-void GpuSparseMatrix::print(std::ostream& os) const {
-  if (format_ == SPARSE_CSC) {
-    int nnz = sMatrix_->nnz;
-    IVectorPtr rows = IVector::create(nnz, false);
-    IVectorPtr cols = IVector::create(width_ + 1, false);
-    VectorPtr value = Vector::create(nnz, false);
-    hl_stream_t stream = HPPL_STREAM_DEFAULT;
-    hl_memcpy_from_csc_matrix(value->getData(),
-                              value->getSize(),
-                              rows->getData(),
-                              rows->getSize(),
-                              cols->getData(),
-                              cols->getSize(),
-                              sMatrix_.get(),
-                              stream);
-    hl_stream_synchronize(stream);
-
-    printBuf(os, cols->getData(), width_ + 1, "col idx");
-    printBuf(os, rows->getData(), elementCnt_, "row idx");
-    printBuf(os, value->getData(), elementCnt_, "value");
-  }
-}
-
-void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) {
-  trans_ = src.trans_;
-  size_t nnz = src.getElementCnt();
-
-  resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat());
-  // if have different value type, only copy rows and cols
-  SparseValueType vType =
-      valueType_ != src.getValueType() ? NO_VALUE : valueType_;
-
-  sMatrix_->format = HL_SPARSE_CSR;
-  sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csr_matrix(sMatrix_.get(),
-                       vType == NO_VALUE ? NULL : src.getValue(),
-                       src.getRows(),
-                       src.getCols(),
-                       stream);
-
-  // restore type of sMatrix_
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-}
-
-void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) {
-  trans_ = src.trans_;
-  size_t nnz = src.getElementCnt();
-
-  resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat());
-
-  // if have different value type, only copy rows and cols
-  SparseValueType vType =
-      valueType_ != src.getValueType() ? NO_VALUE : valueType_;
-
-  sMatrix_->format = HL_SPARSE_CSC;
-  sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csc_matrix(sMatrix_.get(),
-                       vType == NO_VALUE ? NULL : src.getValue(),
-                       src.getRows(),
-                       src.getCols(),
-                       stream);
-
-  // restore type of sMatrix_
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-}
-
-void GpuSparseMatrix::copyFrom(GpuSparseMatrix& src, hl_stream_t stream) {
-  CHECK(trans_ == src.trans_);
-  CHECK(format_ == src.getFormat());
-  resize(src.getHeight(),
-         src.getWidth(),
-         elementCnt_,
-         valueType_,
-         src.getFormat());
-
-  size_t rowSize = format_ == SPARSE_CSC ? elementCnt_ : height_ + 1;
-  size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_;
-
-  if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) {
-    hl_memcpy_async(
-        getValue(), src.getValue(), sizeof(real) * elementCnt_, stream);
-  }
-  CHECK(getRows());
-  CHECK(src.getRows());
-
-  hl_memcpy_async(getRows(), src.getRows(), sizeof(int) * rowSize, stream);
-  hl_memcpy_async(getCols(), src.getCols(), sizeof(int) * colSize, stream);
-}
-
-void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) {
-  if (format_ == SPARSE_CSR) {
-    copyFromCSR(src, stream);
-  } else {
-    copyFromCSC(src, stream);
-  }
-}
-
-void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) {
-  trans_ = src.trans_;
-  int* srcCols = src.getCols();
-  size_t nnz = std::count_if(srcCols,
-                             srcCols + src.getElementCnt(),
-                             [this](size_t n) { return n < this->width_; });
-  resize(height_, width_, nnz, valueType_, format_);
-
-  rows_[0] = 0;
-  size_t index = 0;
-  for (size_t r = 0; r < height_; ++r) {
-    for (int i = src.getRows()[r]; i < src.getRows()[r + 1]; ++i) {
-      if (srcCols[i] < (int)width_) {
-        cols_[index] = srcCols[i];
-        if (valueType_ == FLOAT_VALUE) {
-          value_[index] = src.getValue()[i];
-        }
-        ++index;
-      }
-    }
-    rows_[r + 1] = index;
-  }
-  CHECK_EQ(index, nnz);
-
-  sMatrix_->format = HL_SPARSE_CSR;
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csr_matrix(sMatrix_.get(),
-                       valueType_ == NO_VALUE ? NULL : value_,
-                       rows_,
-                       cols_,
-                       /*default stream = */ HPPL_STREAM_DEFAULT);
-}
-
-void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) {
-  trans_ = src.trans_;
-  size_t nnz = src.getCols()[width_] - src.getCols()[0];
-  resize(height_, width_, nnz, valueType_, format_);
-
-  cols_[0] = 0;
-  for (size_t i = 0; i < width_; i++) {
-    cols_[i + 1] = cols_[i] + (int)(src.getRowNum(i));
-  }
-  memcpy(rows_, src.getRows() + src.getCols()[0], sizeof(int) * nnz);
-  if (valueType_ == FLOAT_VALUE) {
-    memcpy(value_, src.getValue() + src.getCols()[0], sizeof(real) * nnz);
-  }
-
-  sMatrix_->format = HL_SPARSE_CSC;
-  sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE;
-  sMatrix_->rows = height_;
-  sMatrix_->cols = width_;
-  sMatrix_->nnz = nnz;
-
-  hl_memcpy_csc_matrix(sMatrix_.get(),
-                       valueType_ == NO_VALUE ? NULL : value_,
-                       rows_,
-                       cols_,
-                       /*default stream = */ HPPL_STREAM_DEFAULT);
-}
-
-void GpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
-  if (format_ == SPARSE_CSR) {
-    trimFromCSR(src);
-  } else {
-    trimFromCSC(src);
-  }
-}
-
-void GpuSparseMatrix::addBias(Matrix& b, real scale) {
-  CHECK(b.getHeight() == 1) << "the Bias should be a vector";
-  hl_sparse_matrix_s A_d = sMatrix_.get();
-  hl_sparse_matrix_add_bias(A_d, b.getData(), scale);
-}
-
-void GpuSparseMatrix::add3(GpuMatrix* b) {
-  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK(height_ == b->getHeight());
-  CHECK(width_ == b->getWidth());
-  real* B_d = b->getData();
-  hl_sparse_matrix_s A_d = sMatrix_.get();
-  hl_sparse_matrix_add_dense(A_d, B_d, height_, width_, 1, 0);
-}
-
-void GpuSparseMatrix::add3(MatrixPtr b) {
-  if (dynamic_cast<GpuMatrix*>(b.get())) {
-    add3(dynamic_cast<GpuMatrix*>(b.get()));
-  } else {
-    LOG(FATAL) << "not supported";
-  }
-}
-
-void GpuSparseMatrix::zeroMem() {
-  CHECK(valueType_ == FLOAT_VALUE);
-  real* value = getValue();
-  if (value == NULL) {
-    LOG(FATAL) << "value is nullptr";
-  }
-  hl_matrix_zero_mem(value, elementCnt_);
-}
-
-void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifdef PADDLE_WITH_CUDA
-  CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t beam = maxVal.getWidth();
-  CHECK_EQ(maxIds.getSize(), numSamples * beam);
-  CHECK_EQ(maxVal.getHeight(), numSamples);
-  CHECK_EQ(format_, SPARSE_CSR) << "Only support SPARSE_CSR";
-
-  hl_sparse_matrix_top_k(maxVal.getData(),
-                         maxVal.getStride(),
-                         maxIds.getData(),
-                         sMatrix_.get(),
-                         beam,
-                         numSamples);
-#endif
-}
-
-template void GpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_non_value_t* data,
-                                        hl_stream_t stream);
-template void GpuSparseMatrix::copyFrom(int64_t* ids,
-                                        int64_t* indices,
-                                        sparse_float_value_t* data,
-                                        hl_stream_t stream);
-}  // namespace paddle
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
deleted file mode 100644
index 7c525f4edf3d53544c195f8e253c27a03854a793..0000000000000000000000000000000000000000
--- a/paddle/math/SparseMatrix.h
+++ /dev/null
@@ -1,286 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-#include <cstddef>
-#include "CpuSparseMatrix.h"
-#include "Matrix.h"
-
-namespace paddle {
-
-typedef std::shared_ptr<_hl_sparse_matrix_s> hl_sparse_matrix_s_ptr;
-
-class GpuSparseMatrix : public Matrix {
-public:
-  MemoryHandlePtr sMemoryHandle_;
-  int* rows_;
-  int* cols_;
-  real* value_;
-  const char* end_; /* point to the end of sMemoryHandle_ */
-
-  hl_sparse_matrix_s_ptr sMatrix_;
-  SparseValueType valueType_;
-  SparseFormat format_;
-
-public:
-  GpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format_ = SPARSE_CSR,
-                  bool trans = false);
-
-  GpuSparseMatrix(GpuMemHandlePtr dataHandle,
-                  hl_sparse_matrix_s_ptr sMatrix,
-                  size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format_ = SPARSE_CSR,
-                  bool trans = false,
-                  MemoryHandlePtr sMemoryHandle = NULL);
-
-  GpuSparseMatrix(real* value,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans);
-
-  GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans,
-                  MemoryHandlePtr sMemoryHandle);
-
-protected:
-  struct Element {
-    int row;
-    int col;
-    real val;
-    Element(int rowIn, int colIn, real valIn)
-        : row(rowIn), col(colIn), val(valIn) {}
-  };
-
-public:
-  ~GpuSparseMatrix() {}
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format);
-
-  void resize(size_t newHeight, size_t newWidth);
-
-  void sparseResizeCSR();
-
-  void sparseResizeCSC();
-
-  void resizeCSR(size_t newHeight,
-                 size_t newWidth,
-                 size_t newNnz,
-                 SparseValueType valueType);
-
-  void resizeCSC(size_t newHeight,
-                 size_t newWidth,
-                 size_t newNnz,
-                 SparseValueType valueType);
-
-  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
-  /// B = A , B.trans = !A.trans
-  MatrixPtr getTranspose();
-
-  /// B = A'
-  void transpose(MatrixPtr& matTrans, bool memAlloc);
-
-  void copyFrom(const Matrix& src);
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-  void copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream);
-  void copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream);
-
-  void copyFrom(const IVector& src) { LOG(FATAL) << "not implemented"; }
-  void copyFrom(const IVector& src, hl_stream_t stream) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  template <class T>
-  void copyFrom(int64_t* ids, int64_t* indices, T* data, hl_stream_t stream);
-
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values);
-  SparseValueType getValueType() const;
-  SparseFormat getFormat() const { return format_; }
-
-  const int* getRowCols(size_t x) const { return cols_ + rows_[x]; }
-  const real* getRowValues(size_t x) const { return value_ + rows_[x]; }
-  size_t getColNum(size_t x) const { return rows_[x + 1] - rows_[x]; }
-  void print(std::ostream& os) const;
-
-  /**
-   * @brief only set value_ of FLOAT_VALUE sparse matrix to zero
-   */
-  void zeroMem();
-
-  /**
-   * @brief sparseMatrix += denseMatrix
-   *
-   * Named add3 just because add/add2 has been used in BaseMatrix.cu
-   * and they are not virtual function.
-   *
-   * Only add value of same (row, col) index in dense matrix
-   * and do not use others values.
-   *
-   * @param[in]  b   dense matrix
-   */
-  void add3(GpuMatrix* b);
-  void add3(MatrixPtr b);
-
-  /**
-   * @brief sparseMatrix[i,j] += bias[j], (j is the col index of sparse matrix)
-   *
-   * @param[in]  b      bias, dense matrix and height = 1
-   * @param[in]  scale  scale of b
-   */
-  void addBias(Matrix& b, real scale);
-
-  /**
-   * @brief return rows, which is gpu address
-   */
-  int* getRows() const {
-    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
-    return hl_sparse_matrix_get_rows(sMatrix_.get());
-  }
-
-  /**
-   * @brief return cols, which is gpu address
-   */
-  int* getCols() const {
-    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
-    return hl_sparse_matrix_get_cols(sMatrix_.get());
-  }
-
-  /**
-   * @brief return value, which is gpu address
-   */
-  real* getValue() const {
-    CHECK(sMatrix_.get()) << "sMatrix_ is NULL";
-    return hl_sparse_matrix_get_value(sMatrix_.get());
-  }
-
-  /**
-   * @brief return value_ of sparse matrix
-   *
-   * Some times CpuSparseMatrix maybe Matrix,
-   * if getValue, must dynamic_cast to CpuSparseMatrix,
-   * getData is convenient to get value
-   */
-  real* getData() { return getValue(); }
-  const real* getData() const { return getValue(); }
-
-  /**
-   * @brief  Get top k value of each row in sparse matrix.
-   *
-   * Store the value in maxVal and theirs index in maxIds.
-   * k = maxVal.width
-   *
-   * @param[out]  maxIds    index of top k
-   * @param[out]  maxVal    value of top k
-   */
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-
-protected:
-  void sparseResize();
-
-  void copyRow(int offsets, size_t colNum, const sparse_non_value_t* row);
-  void copyRow(int offsets, size_t colNum, const sparse_float_value_t* row);
-
-public:
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-
-  void copyFrom(CpuSparseMatrix& src, hl_stream_t stream);
-  void copyFrom(GpuSparseMatrix& src, hl_stream_t stream);
-
-  void trimFrom(const CpuSparseMatrix& src);
-  void trimFromCSR(const CpuSparseMatrix& src);
-  void trimFromCSC(const CpuSparseMatrix& src);
-
-  // BaseMatrixT interface
-public:
-  bool isSparse() const { return true; }
-
-private:
-  using Matrix::mul;
-  using Matrix::copyFrom;
-  using Matrix::rowMax;
-  using Matrix::print;
-  using Matrix::subMatrix;
-};
-
-}  // namespace paddle
-
-#else
-
-#include "CpuSparseMatrix.h"
-
-namespace paddle {
-
-class GpuSparseMatrix : public Matrix {
-public:
-  GpuSparseMatrix(size_t height,
-                  size_t width,
-                  size_t nnz, /* used to allocate space */
-                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format_ = SPARSE_CSR,
-                  bool trans = false)
-      : Matrix(NULL, height, width, trans, false) {}
-
-  GpuSparseMatrix(real* value,
-                  int* rows,
-                  int* cols,
-                  size_t height,
-                  size_t width,
-                  size_t nnz,
-                  SparseValueType valueType,
-                  SparseFormat format,
-                  bool trans)
-      : Matrix(NULL, height, width, trans, true) {}
-
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {}
-  void resize(size_t newHeight, size_t newWidth) {}
-  MatrixPtr getTranspose() { return nullptr; }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {}
-};
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp
deleted file mode 100644
index 4254175aabc8c32edb243d4a82c2e34c81393f74..0000000000000000000000000000000000000000
--- a/paddle/math/SparseRowMatrix.cpp
+++ /dev/null
@@ -1,282 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SparseRowMatrix.h"
-#include "CpuSparseMatrix.h"
-
-#include <algorithm>
-
-#include "paddle/utils/Logging.h"
-
-#include "SIMDFunctions.h"
-
-#include "paddle/utils/Thread.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;
-
-void SparseRowCpuMatrix::init(size_t height, size_t width) {
-  height_ = height;
-  if (!indexDictHandle_) {
-    indexDictHandle_.reset(new IndexDict);
-    indexDictHandle_->globalIndices.assign(height, kUnusedId_);
-  }
-  localIndices_ = &indexDictHandle_->localIndices;
-  globalIndices_ = indexDictHandle_->globalIndices.data();
-}
-
-void SparseRowCpuMatrix::mul(CpuSparseMatrix* a,
-                             CpuMatrix* b,
-                             real scaleAB,
-                             real scaleT) {
-  CpuMatrix::mul<CpuMatrix, SparseRowCpuMatrix>(a, b, this, scaleAB, scaleT);
-}
-
-void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) {
-  LOG(FATAL) << "This should not be called";
-}
-
-void SparseRowCpuMatrix::zeroMem() {
-  apply([](real* buf, size_t len) { memset(buf, 0, sizeof(real) * len); });
-  clearRows();
-}
-
-void SparseRowCpuMatrix::applyL1(real learningRate, real decayRate) {
-  apply([=](real* buf, size_t len) {
-    CpuVector value(0, nullptr);
-    value.subVecFrom(buf, 0, len);
-    value.applyL1(learningRate, decayRate);
-  });
-}
-
-void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value,
-                                   IVector& t0,
-                                   real learningRate,
-                                   int currentTime,
-                                   real decayRate,
-                                   bool useL1,
-                                   bool fini) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-
-  // t0 and value are vectors
-  CHECK_EQ(t0.getSize(), this->height_);
-  CHECK_EQ(value.width_, this->height_ * this->width_);
-
-  if (decayRate == 0.0f) {
-    if (fini) {
-      return;
-    }
-
-    for (size_t i = 0; i < localIndices.size(); ++i) {
-      real* g = getLocalRow(i);
-      real* v = value.rowBuf(localIndices[i]);
-      for (size_t j = 0; j < this->width_; ++j) {
-        v[j] -= learningRate * g[j];
-      }
-    }
-    return;
-  }  // else
-
-  if (useL1) {  // L1 decay
-    if (fini) {
-      for (size_t i = 0; i < this->height_; ++i) {
-        real* v = value.rowBuf(i);
-        int* t = t0.getData() + i;
-        if (t[0] < currentTime) {
-          // W(t0) -> W(t+1)
-          int tDiff = currentTime - t[0];
-          real delta = tDiff * learningRate * decayRate;
-          simd::decayL1(v, v, delta, this->width_);
-        }
-      }
-      return;
-    }  // else
-
-    for (size_t i = 0; i < localIndices.size(); ++i) {
-      real* g = getLocalRow(i);
-      real* v = value.rowBuf(localIndices[i]);
-      int* t = t0.getData() + localIndices[i];
-      if (t[0] < currentTime) {
-        // W(t0) -> W(t)
-        int tDiff = currentTime - t[0];
-        real delta = tDiff * learningRate * decayRate;
-        simd::decayL1(v, v, delta, this->width_);
-      }
-
-      // W(t) -> W(t+1)
-      for (size_t j = 0; j < this->width_; ++j) {
-        v[j] -= learningRate * g[j];
-      }
-      simd::decayL1(v, v, learningRate * decayRate, this->width_);
-
-      // state update to t+1
-      t[0] = currentTime + 1;
-    }
-
-  } else {  // L2 decay
-    if (fini) {
-      for (size_t i = 0; i < this->height_; ++i) {
-        real* v = value.rowBuf(i);
-        int* t = t0.getData() + i;
-        if (t[0] < currentTime) {
-          // W(t0) -> W(t+1)
-          int tDiff = currentTime - t[0];
-          real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
-          for (size_t j = 0; j < this->width_; ++j) {
-            v[j] *= recip;
-          }
-        }
-      }
-      return;
-    }  // else
-
-    real recipDecay = 1.0f / (1.0f + learningRate * decayRate);
-
-    for (size_t i = 0; i < localIndices.size(); ++i) {
-      real* g = getLocalRow(i);
-      real* v = value.rowBuf(localIndices[i]);
-      int* t = t0.getData() + localIndices[i];
-      if (t[0] < currentTime) {
-        // W(t0) -> W(t)
-        int tDiff = currentTime - t[0];
-        real recip = 1.0f / (1.0f + tDiff * learningRate * decayRate);
-        for (size_t j = 0; j < this->width_; ++j) {
-          v[j] *= recip;
-        }
-      }
-
-      // W(t) -> W(t+1)
-      for (size_t j = 0; j < this->width_; ++j) {
-        v[j] = recipDecay * (v[j] - learningRate * g[j]);
-      }
-
-      // state update to t+1
-      t[0] = currentTime + 1;
-    }
-  }
-}
-
-void SparseRowCpuMatrix::addTo(BaseMatrix& dest,
-                               std::vector<uint32_t>& ids,
-                               size_t tid,
-                               size_t numThreads) {
-  CHECK(!dest.useGpu_);
-  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);
-
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    uint32_t id = localIndices[i];
-    if (id % numThreads == tid) {
-      simd::addTo(dest.rowBuf(id), getLocalRow(i), this->width_);
-      ids.push_back(id);
-    }
-  }
-}
-
-void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest,
-                               size_t tid,
-                               size_t numThreads) {
-  CHECK(!dest.useGpu_);
-  CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_);
-
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    uint32_t id = localIndices[i];
-    if (id % numThreads == tid) {
-      dest.checkIndex(id);
-      simd::addTo(dest.getRow(id), getLocalRow(i), this->width_);
-    }
-  }
-}
-
-void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    uint32_t id = localIndices[i];
-    if (id % numThreads == tid) {
-      memset(this->getLocalRow(i), 0, this->width_ * sizeof(real));
-    }
-  }
-}
-
-void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a,
-                                     CpuMatrix* b,
-                                     real scaleAB,
-                                     real scaleT) {
-  CpuMatrix::mul<CpuMatrix, SparseAutoGrowRowCpuMatrix>(
-      a, b, this, scaleAB, scaleT);
-}
-
-void CacheRowCpuMatrix::mul(CpuSparseMatrix* a,
-                            CpuMatrix* b,
-                            real scaleAB,
-                            real scaleT) {
-  CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(a, b, this, scaleAB, scaleT);
-}
-
-void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < len; i++) {
-    CHECK_LT(*(ids + i), this->getHeight())
-        << "id:" << *(ids + i) << "Height:" << this->getHeight()
-        << "sparse id value exceeds the max input dimension, "
-        << "it could be caused invalid input data samples";
-  }
-  localIndices.insert(localIndices.end(), ids, ids + len);
-}
-
-void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
-  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());
-  CHECK(mat) << "only support sparse matrix";
-  addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),
-          mat->getElementCnt());
-}
-
-void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  size_t numSamples = ids->getSize();
-  int* index = ids->getData();
-  for (size_t i = 0; i < numSamples; ++i) {
-    if (index[i] == -1) continue;
-
-    unsigned int id = (unsigned int)index[i];
-    CHECK_LT(id, this->getHeight())
-        << "id:" << id << "Height:" << this->getHeight()
-        << "sparse id value exceeds the max input dimension, "
-        << "it could be caused invalid input data samples";
-    localIndices.push_back(id);
-  }
-}
-
-void SparsePrefetchRowCpuMatrix::setupIndices() {
-  auto& localIndices = indexDictHandle_->localIndices;
-  uniqueIds(localIndices);
-  // for each sparse row
-  for (size_t id = 0; id < localIndices.size(); ++id) {
-    globalIndices_[localIndices[id]] = id;  // sparse row -> local id
-  }
-  checkStoreSize();
-}
-
-void SparseRowCpuMatrix::checkIndices() {
-  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
-  for (size_t i = 0; i < localIndices.size(); ++i) {
-    CHECK_EQ(globalIndices_[localIndices[i]], i);
-  }
-  checkStoreSize();
-}
-
-}  // namespace paddle
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
deleted file mode 100644
index 3920de32df7de925d6e22e17b93b15bff8785675..0000000000000000000000000000000000000000
--- a/paddle/math/SparseRowMatrix.h
+++ /dev/null
@@ -1,341 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_MOBILE_INFERENCE
-
-#include <gflags/gflags.h>
-#include <string.h>
-#include <algorithm>
-#include "Matrix.h"
-#include "RowBuffer.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-/**
- * Sparse Row
- */
-class SparseRowCpuMatrix : public CpuMatrix {
-public:
-  struct IndexDict {
-    // In the following, global id means the row id in the original matrix.
-    // Local id means the row id in the local storage which only contains
-    // the sparse rows.
-    std::vector<unsigned int> localIndices;   // local id -> global id
-    std::vector<unsigned int> globalIndices;  // global id -> local id
-  };
-  typedef std::shared_ptr<IndexDict> IndexDictPtr;
-
-  /// heightStore is max number of rows of the sparse matrix.
-  SparseRowCpuMatrix(CpuMemHandlePtr dataHandle,
-                     size_t height,
-                     size_t width,
-                     IndexDictPtr indexDictHandle = nullptr,
-                     bool trans = false)
-      : CpuMatrix(nullptr, height, width, trans),
-        indexDictHandle_(indexDictHandle) {
-    init(height, width);
-    buf_.reset(new RowBuffer(dataHandle, width));
-  }
-
-  virtual ~SparseRowCpuMatrix() {}
-
-public:
-  /**
-   *  Get the row buf
-   *
-   *  @param row row id in the original matrix
-   */
-  real* getRow(size_t row) {
-    CHECK_NE(globalIndices_[row], kUnusedId_);
-    return getLocalRow(globalIndices_[row]);
-  }
-
-  /**
-   *  Get the row buf
-   *
-   *  @param row row id in local storage
-   */
-  real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); }
-
-  /**
-   *  reserve the storage for rows according to current size of
-   * indexDictHandle.
-   *
-   *  This is only used when SparseRowCpuMatrix is constructed with
-   *  indexDictHandle.
-   */
-  void reserveStore() { buf_->resize(localIndices_->size()); }
-
-  // row is the row id in the original matrix
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  /**
-   * Fill data according to row indexs added, setup indices inside.
-   *
-   * *src* and *size* are data and size of normal dense CpuMatrix.
-   */
-  virtual void copyFrom(const real* src, size_t size);
-  virtual void zeroMem();
-
-  /**
-   * apply L1 to all sparse rows, should be apply after indices ready.
-   */
-  virtual void applyL1(real learningRate, real decayRate);
-
-  void clearIndices() { clearRows(); }
-  void zeroMemThread(size_t tid, size_t numThreads);
-
-  /**
-   *  value -= grad * learningRate,  this is gradient.
-   *
-   * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall.
-   *
-   * t0 is a int vector used by L1/L2 decay, size = height of parameter
-   * matrix,
-   * store the time that each weight row last updated.
-   *
-   * Time is batchId, currentTime is current batchId.
-   *
-   * While pass finished, caller should call this func one more time
-   *  with (fini=true) to let weight decay catch up current time.
-   */
-  void sgdUpdate(BaseMatrix& value,
-                 IVector& t0,
-                 real learningRate,
-                 int currentTime,
-                 real decayRate,
-                 bool useL1,
-                 bool fini = false);
-
-  /**
-   *  merge rows in *this* to *dest* for designated thread
-   *
-   *  values add to *dest* matrix
-   *
-   *  ids occured in *this* append to *ids*
-   *  filtered by  (id % numThreads == tid)
-   */
-  void addTo(BaseMatrix& dest,
-             std::vector<uint32_t>& ids,
-             size_t tid,
-             size_t numThreads);
-
-  /**
-   *  the second version addTo(), *dest* is a SparseRowCpuMatrix.
-   *
-   *  The dest's indices should be setup already, addTo() will
-   *  check src ids is exist in dest's indices.
-   */
-  void addTo(SparseRowCpuMatrix& dest, size_t tid, size_t numThreads);
-
-  const IndexDictPtr& getIndexDictHandle() const { return indexDictHandle_; }
-
-  /**
-   *  check all local and global indices consistency
-   */
-  void checkIndices();
-  /**
-   *  check whether row *i* exist in indices
-   */
-  void checkIndex(size_t i) {
-    size_t localId = globalIndices_[i];
-    CHECK_LT(localId, localIndices_->size());
-    CHECK_EQ((*localIndices_)[localId], i);
-  }
-
-  std::vector<unsigned int>& getLocalIndices() const {
-    return indexDictHandle_->localIndices;
-  }
-
-protected:
-  template <typename Func>
-  void apply(Func f) {
-    f(buf_->data(), localIndices_->size() * width_);
-  }
-
-  void init(size_t height, size_t width);
-
-  /// clear row indices.
-  void clearRows() {
-    for (auto id : *localIndices_) {
-      globalIndices_[id] = kUnusedId_;
-    }
-    localIndices_->clear();
-    buf_->clear();
-  }
-
-  inline void checkStoreSize() {
-    if (buf_->isAutoGrowth()) {
-      if (buf_->getRowCount() > 0.5 * height_) {
-        LOG(WARNING) << "There are more than 0.5*height ("
-                     << localIndices_->size() << ") rows are used for sparse "
-                     << "update, which is not efficient. Considering not use "
-                     << "sparse_update.";
-      }
-    } else {
-      CHECK_LE(localIndices_->size(), buf_->getRowCount());
-    }
-  }
-
-  std::unique_ptr<RowBuffer> buf_;
-  IndexDictPtr indexDictHandle_;
-  std::vector<unsigned int>* localIndices_;  // =&indexDictHandle_->localIndices
-  unsigned int* globalIndices_;  // =indexDictHandle_->globalIndices.data();
-  static const unsigned int kUnusedId_;
-};
-
-class SyncThreadPool;
-
-/// For prefetching parameters from remote Parameter server
-class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
-public:
-  SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle,
-                             size_t height,
-                             size_t width,
-                             IndexDictPtr indexDictHandle = nullptr,
-                             SyncThreadPool* pool = nullptr,
-                             bool trans = false)
-      : SparseRowCpuMatrix(dataHandle, height, width, indexDictHandle, trans),
-        pool_(pool) {}
-
-  /**
-   * Extract feature ids from *input*, to fill row indexs.
-   *
-   * *input* must be sparse matrix.
-   *
-   * Can call many times before setup.
-   */
-  void addRows(MatrixPtr input);
-  void addRows(IVectorPtr ids);
-
-  /**
-   * setup global indices of SparseRowMatrix after finish add rows.
-   */
-  void setupIndices();
-
-protected:
-  void addRows(const unsigned int* ids, size_t len);
-  SyncThreadPool* pool_;
-};
-
-class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {
-public:
-  SparseAutoGrowRowCpuMatrix(size_t height,
-                             size_t width,
-                             IndexDictPtr indexDictHandle = nullptr,
-                             bool trans = false)
-      : SparseRowCpuMatrix(nullptr, height, width, indexDictHandle, trans) {}
-
-  real* getRow(size_t row) {
-    auto id = globalIndices_[row];
-    if (id == kUnusedId_) {
-      id = globalIndices_[row] = localIndices_->size();
-      localIndices_->push_back(row);
-      checkStoreSize();
-    }
-    return getLocalRow(id);
-  }
-
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-};
-
-class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {
-public:
-  CacheRowCpuMatrix(size_t height,
-                    size_t width,
-                    IndexDictPtr indexDictHandle = nullptr,
-                    bool trans = false)
-      : SparseAutoGrowRowCpuMatrix(height, width, indexDictHandle, trans),
-        sourceData_(nullptr) {}
-
-  void setSourceData(CpuVectorPtr sourceVec) {
-    sourceDataVec_ = sourceVec;
-    sourceData_ = sourceVec->getData();
-  }
-
-  real* getRow(size_t row) {
-    auto id = globalIndices_[row];
-    if (id == kUnusedId_) {
-      id = globalIndices_[row] = localIndices_->size();
-      localIndices_->push_back(row);
-      checkStoreSize();
-      memcpy(
-          getLocalRow(id), sourceData_ + width_ * row, sizeof(float) * width_);
-    }
-    return getLocalRow(id);
-  }
-
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-public:
-  CpuVectorPtr sourceDataVec_;
-  real* sourceData_;
-};
-
-/**
- * Sparse Row Ids Matrix.
- *
- * mostly same as CpuMatrix, but maintain sparse row ids occured,
- * ids are hashed by worker thread id.
- */
-class SparseRowIdsCpuMatrix : public CpuMatrix {
-public:
-  SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle,
-                        size_t height,
-                        size_t width,
-                        bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {}
-
-  void setNumOfThreads(size_t numOfThreads) { idsArray_.resize(numOfThreads); }
-
-  std::vector<uint32_t>& getIds(size_t threadId) { return idsArray_[threadId]; }
-
-private:
-  std::vector<std::vector<uint32_t>> idsArray_;
-};
-
-}  // namespace paddle
-
-#else
-namespace paddle {
-
-class SparseRowCpuMatrix : public CpuMatrix {
-public:
-  void reserveStore() {}
-  void clearIndices() {}
-};
-
-class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
-public:
-  void setupIndices() {}
-  void addRows(MatrixPtr input) {}
-  void addRows(IVectorPtr ids) {}
-};
-
-class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
-class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
-class SparseRowIdsCpuMatrix : public CpuMatrix {};
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
deleted file mode 100644
index 5982bf2e5637ff4b4af6baae47e40b68e0c07c86..0000000000000000000000000000000000000000
--- a/paddle/math/Storage.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Storage.h"
-#include "Allocator.h"
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-#ifndef PADDLE_MOBILE_INFERENCE
-DEFINE_int32(pool_limit_size,
-             536870912,
-             "maximum memory size managed by a memory pool, default is 512M");
-#else
-DEFINE_int32(pool_limit_size, 0, "default is 0");
-#endif
-
-namespace paddle {
-
-// Initialization StorageEngine singleton.
-// Other modules may rely on storage management,
-// so StorageEngine need to be initialized before other modules.
-static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); },
-                                          std::numeric_limits<int>::max());
-
-StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {}
-
-StorageEngine::~StorageEngine() {
-  delete cpuAllocator_;
-  for (auto it : gpuAllocator_) {
-    delete it;
-  }
-}
-
-StorageEngine* StorageEngine::singleton() {
-  static StorageEngine storage;
-  return &storage;
-}
-
-PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) {
-  {
-    // if gpuAllocator_ has been constructed
-    ReadLockGuard guard(lock_);
-    if (deviceId < static_cast<int>(gpuAllocator_.size()) &&
-        (gpuAllocator_[deviceId] != nullptr)) {
-      return gpuAllocator_[deviceId];
-    }
-  }
-
-  {
-    // Construct gpuAllocator_
-    std::lock_guard<RWLock> guard(lock_);
-    if (deviceId >= static_cast<int>(gpuAllocator_.size())) {
-      gpuAllocator_.resize(deviceId + 1);
-    }
-    if (gpuAllocator_[deviceId] == nullptr) {
-      std::string name =
-          "gpu" + str::to_string(deviceId) + std::string("_pool");
-      gpuAllocator_[deviceId] =
-          new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name);
-    }
-    return gpuAllocator_[deviceId];
-  }
-}
-
-PoolAllocator* StorageEngine::getCpuAllocator() {
-  {
-    // if cpuAllocator_ has been constructed
-    ReadLockGuard guard(lock_);
-    if (cpuAllocator_ != nullptr) {
-      return cpuAllocator_;
-    }
-  }
-
-  {
-    // Construct cpuAllocator_
-    std::lock_guard<RWLock> guard(lock_);
-    if (cpuAllocator_ == nullptr) {
-      if (FLAGS_use_gpu) {
-        cpuAllocator_ = new PoolAllocator(
-            new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool");
-      } else {
-        cpuAllocator_ = new PoolAllocator(
-            new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool");
-      }
-    }
-    return cpuAllocator_;
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/math/Storage.h b/paddle/math/Storage.h
deleted file mode 100644
index ba8f4689a1e896304aa14821b40fc8ff0c304bb2..0000000000000000000000000000000000000000
--- a/paddle/math/Storage.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <mutex>
-#include <vector>
-#include "PoolAllocator.h"
-#include "paddle/utils/Locks.h"
-
-namespace paddle {
-
-/**
- * @brief Storage manager for multiple devices.
- */
-class StorageEngine {
-public:
-  /**
-   * @return Storage singleton
-   */
-  static StorageEngine* singleton();
-
-  /**
-   * @return return one gpu allocator by deviceId
-   */
-  PoolAllocator* getGpuAllocator(int deviceId);
-
-  /**
-   * @return return cpu allocator
-   */
-  PoolAllocator* getCpuAllocator();
-
-protected:
-  StorageEngine();
-  ~StorageEngine();
-  RWLock lock_;
-  std::vector<PoolAllocator*> gpuAllocator_;
-  PoolAllocator* cpuAllocator_;
-};
-
-}  // namespace paddle
diff --git a/paddle/math/TensorApply.h b/paddle/math/TensorApply.h
deleted file mode 100644
index 7d79cae5a11851b190afbb9ac94efdf2ba2510b7..0000000000000000000000000000000000000000
--- a/paddle/math/TensorApply.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace paddle {
-
-/**
- * \brief The tensor evaluator classes.
- */
-template <typename Derived, class T>
-class TensorApply {
-public:
-  explicit INLINE TensorApply(const Derived& p)
-      : data_(p.data_),
-        stride_(p.stride_),
-        height_(p.height_),
-        width_(p.width_),
-        useGpu_(p.useGpu_) {}
-
-  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
-  INLINE T apply(int index) const { return data_[index]; }
-  INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
-  INLINE T& applyRef(int index) { return data_[index]; }
-
-  INLINE size_t getWidth() const { return width_; }
-  INLINE size_t getHeight() const { return height_; }
-  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-  INLINE bool useGpu() const { return useGpu_; }
-
-  T* data_;
-  size_t stride_;
-  size_t height_;
-  size_t width_;
-  bool useGpu_;
-};
-
-/**
- * \brief The tensor evaluator classes.
- * evaluator for rvalues
- */
-template <typename Derived, class T>
-class TensorApply<const Derived, T> {
-public:
-  explicit INLINE TensorApply(const Derived& p)
-      : data_(p.data_),
-        stride_(p.stride_),
-        height_(p.height_),
-        width_(p.width_),
-        useGpu_(p.useGpu_) {}
-
-  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
-  INLINE T apply(int index) const { return data_[index]; }
-
-  INLINE size_t getWidth() const { return width_; }
-  INLINE size_t getHeight() const { return height_; }
-  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-  INLINE bool useGpu() const { return useGpu_; }
-
-  const T* data_;
-  size_t stride_;
-  size_t height_;
-  size_t width_;
-  bool useGpu_;
-};
-
-template <typename Derived, class T>
-class TensorApply<const TensorExpression<Derived, T>, T> {
-public:
-  explicit TensorApply(const TensorExpression<Derived, T>& expr)
-      : expr_(expr.derived()) {}
-
-  INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
-  INLINE T apply(int index) const { return expr_.apply(index); }
-
-  INLINE size_t getWidth() const { return expr_.getWidth(); }
-  INLINE size_t getHeight() const { return expr_.getHeight(); }
-  INLINE bool isContiguous() const { return expr_.isContiguous(); }
-  INLINE bool useGpu() const { return expr_.useGpu(); }
-
-  TensorApply<const Derived, T> expr_;
-};
-
-/**
- * \brief The unary expression evaluator classes.
- */
-template <class OP, typename ArgType, class T>
-class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
-public:
-  explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
-      : op_(expr.op_), expr_(expr.expr_) {}
-
-  INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
-  INLINE T apply(int index) const { return op_(expr_.apply(index)); }
-
-  INLINE size_t getWidth() const { return expr_.getWidth(); }
-  INLINE size_t getHeight() const { return expr_.getHeight(); }
-  INLINE bool isContiguous() const { return expr_.isContiguous(); }
-  INLINE bool useGpu() const { return expr_.useGpu(); }
-
-  const OP op_;
-  TensorApply<ArgType, T> expr_;
-};
-
-/**
- * \brief The binary expression evaluator classes.
- */
-template <class OP, typename LhsType, typename RhsType, class T>
-class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
-public:
-  explicit INLINE TensorApply(
-      const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
-      : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
-#ifndef __CUDA_ARCH__
-    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-#endif
-  }
-
-  INLINE T apply(int i, int j) const {
-    return op_(lhs_.apply(i, j), rhs_.apply(i, j));
-  }
-  INLINE T apply(int index) const {
-    return op_(lhs_.apply(index), rhs_.apply(index));
-  }
-
-  INLINE size_t getWidth() const { return lhs_.getWidth(); }
-  INLINE size_t getHeight() const { return rhs_.getHeight(); }
-  INLINE bool isContiguous() const {
-    return lhs_.isContiguous() && rhs_.isContiguous();
-  }
-  INLINE bool useGpu() const { return lhs_.useGpu(); }
-
-  const OP op_;
-  TensorApply<LhsType, T> lhs_;
-  TensorApply<RhsType, T> rhs_;
-};
-
-/**
- * \brief The ternary expression evaluator classes.
- */
-template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
-class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
-public:
-  explicit INLINE TensorApply(
-      const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
-      : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
-#ifndef __CUDA_ARCH__
-    CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
-    CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
-    CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
-    CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
-    CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
-    CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
-#endif
-  }
-
-  INLINE T apply(int i, int j) const {
-    return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
-  }
-  INLINE T apply(int index) const {
-    return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
-  }
-
-  INLINE size_t getWidth() const { return expr1_.getWidth(); }
-  INLINE size_t getHeight() const { return expr1_.getHeight(); }
-  INLINE bool isContiguous() const {
-    return expr1_.isContiguous() && expr2_.isContiguous() &&
-           expr3_.isContiguous();
-  }
-  INLINE bool useGpu() const { return expr1_.useGpu(); }
-
-  TensorApply<ArgType1, T> expr1_;
-  TensorApply<ArgType2, T> expr2_;
-  TensorApply<ArgType3, T> expr3_;
-};
-
-/**
- * \brief The const expression evaluator classes.
- */
-template <class OP, typename ArgType, class T>
-class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
-public:
-  explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
-      : op_(expr.op_), expr_(expr.expr_) {}
-
-  INLINE T apply(int i, int j) const { return op_(i, j); }
-  INLINE T apply(int index) const { return op_(index); }
-
-  INLINE size_t getWidth() const { return expr_.getWidth(); }
-  INLINE size_t getHeight() const { return expr_.getHeight(); }
-  INLINE bool isContiguous() const { return true; }
-  INLINE bool useGpu() const { return expr_.useGpu(); }
-
-  const OP op_;
-  TensorApply<ArgType, T> expr_;
-};
-
-}  // namespace paddle
diff --git a/paddle/math/TensorAssign.h b/paddle/math/TensorAssign.h
deleted file mode 100644
index 113d98c16b22b06971040b1a1ce52c696f6c3c14..0000000000000000000000000000000000000000
--- a/paddle/math/TensorAssign.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * \brief Tensor Assign Expression(return by lazyAssign,
- * and evaluated by AssignEvaluate)
- */
-template <typename LhsType, typename RhsType, class T>
-class TensorAssignOp {
-public:
-  explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
-      : lhs_(lhs), rhs_(rhs) {
-#ifndef __CUDA_ARCH__
-    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-#endif
-  }
-
-  INLINE void apply(const int i, const int j) {
-    lhs_.applyRef(i, j) = rhs_.apply(i, j);
-  }
-  INLINE void apply(const int index) {
-    lhs_.applyRef(index) = rhs_.apply(index);
-  }
-
-  INLINE size_t getWidth() const { return lhs_.getWidth(); }
-  INLINE size_t getHeight() const { return rhs_.getHeight(); }
-  INLINE bool isContiguous() const {
-    return lhs_.isContiguous() && rhs_.isContiguous();
-  }
-  INLINE bool useGpu() const { return lhs_.useGpu(); }
-
-private:
-  TensorApply<LhsType, T> lhs_;
-  TensorApply<const RhsType, T> rhs_;
-};
-
-template <typename Assign, typename... AssignOp>
-void AssignCpuEvaluate(int height,
-                       int width,
-                       bool isContiguous,
-                       Assign&& assign,
-                       AssignOp&&... args) {
-  if (isContiguous) {
-    int size = height * width;
-    for (int index = 0; index < size; index++) {
-      assign.apply(index);
-      __attribute__((unused)) int dummy[] = {(((args)).apply(index), 0)...};
-    }
-  } else {
-    for (int i = 0; i < height; i++) {
-      for (int j = 0; j < width; j++) {
-        assign.apply(i, j);
-        __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename Assign, typename... AssignOp>
-__global__ void AssignGpuEvaluate1(const int border,
-                                   Assign assign,
-                                   AssignOp... args) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    assign.apply(idx);
-    __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
-  }
-}
-
-template <typename Assign, typename... AssignOp>
-__global__ void AssignGpuEvaluate2(const int height,
-                                   const int width,
-                                   Assign assign,
-                                   AssignOp... args) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
-      assign.apply(i, j);
-      __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
-    }
-  }
-}
-#endif
-
-/**
- * \brief Evaluate one or more TensorAssignOp objects.
- *
- * \note At least one assignment expression is required
- */
-template <typename Assign, typename... AssignOp>
-void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
-  const bool useGpu_ = assign.useGpu();
-  bool isContiguous_ = assign.isContiguous();
-  const size_t height = assign.getHeight();
-  const size_t width = assign.getWidth();
-
-  const int packSize = sizeof...(args);
-  const bool packUseGpu[] = {((args)).useGpu()...};
-  const bool packIsContiguous[] = {((args)).isContiguous()...};
-  const size_t packHeight[] = {((args)).getHeight()...};
-  const size_t packWidth[] = {((args)).getWidth()...};
-
-  for (int i = 0; i < packSize; i++) {
-    CHECK_EQ(useGpu_, packUseGpu[i]);
-    CHECK_EQ(height, packHeight[i]);
-    CHECK_EQ(width, packWidth[i]);
-    isContiguous_ = isContiguous_ && packIsContiguous[i];
-  }
-
-  if (useGpu_) {
-#ifdef __NVCC__
-    if (isContiguous_) {
-      int size = height * width;
-      int blockSize = size <= 1024 ? size : 1024;
-      int gridSize = (size + 1024 - 1) / 1024;
-      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-          size, assign, args...);
-    } else {
-      int blockSizeY = std::min(32, (int)height);
-      int blockSizeX = (32 / blockSizeY) * 32;
-      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
-      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
-      dim3 threads(blockSizeX, blockSizeY);
-      dim3 grid(gridSizeX, gridSizeY);
-      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
-          height, width, assign, args...);
-    }
-
-    CHECK_SYNC("AssignEvaluate failed");
-#endif
-  } else {
-    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/math/TensorEvaluate.h b/paddle/math/TensorEvaluate.h
deleted file mode 100644
index 2a722016e777a131ef14636a6871d29d9b131044..0000000000000000000000000000000000000000
--- a/paddle/math/TensorEvaluate.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include "hl_base.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * \brief The tensor cpu evaluate api.
- */
-template <class T, typename LeftType, typename RightType>
-inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
-  TensorApply<LeftType, T> lhs_(lhs);
-  TensorApply<const RightType, T> rhs_(rhs);
-  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-
-  int height = lhs_.getHeight();
-  int width = lhs_.getWidth();
-  if (lhs_.isContiguous() && rhs_.isContiguous()) {
-    int size = height * width;
-    for (int index = 0; index < size; index++) {
-      lhs_.applyRef(index) = rhs_.apply(index);
-    }
-  } else {
-    for (int i = 0; i < height; i++) {
-      for (int j = 0; j < width; j++) {
-        lhs_.applyRef(i, j) = rhs_.apply(i, j);
-      }
-    }
-  }
-}
-
-#ifdef __NVCC__
-template <typename LeftType, typename RightType>
-__global__ void TensorElementWiseOp(LeftType lhs,
-                                    RightType rhs,
-                                    const int border) {
-  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (idx < border) {
-    lhs.applyRef(idx) = rhs.apply(idx);
-  }
-}
-
-template <typename LeftType, typename RightType>
-__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
-  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-  for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
-    for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
-      lhs.applyRef(i, j) = rhs.apply(i, j);
-    }
-  }
-}
-
-/**
- * \brief The tensor gpu evaluate api.
- */
-template <class T, typename LeftType, typename RightType>
-inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
-  TensorApply<LeftType, T> lhs_(lhs);
-  TensorApply<const RightType, T> rhs_(rhs);
-  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
-  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
-  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
-
-  int dimM = lhs_.getHeight();
-  int dimN = lhs_.getWidth();
-
-  if (lhs_.isContiguous() && rhs_.isContiguous()) {
-    int size = dimM * dimN;
-    int blockSize = size <= 1024 ? size : 1024;
-    int gridSize = (size + 1024 - 1) / 1024;
-    TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
-        lhs_, rhs_, size);
-  } else {
-    int blockSizeY = std::min(32, dimM);
-    int blockSizeX = (32 / blockSizeY) * 32;
-    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
-    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
-    dim3 threads(blockSizeX, blockSizeY);
-    dim3 grid(gridSizeX, gridSizeY);
-    TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
-  }
-
-  CHECK_SYNC("TensorGpuApply failed");
-}
-#else
-template <class T, typename LeftType, typename RightType>
-inline void TensorGpuApply(LeftType& lhs, RightType& rhs) {
-  LOG(FATAL) << "Since it is gcc compiled, "
-                "this calculation does not support GPU implementation.";
-}
-#endif
-
-}  // namespace paddle
diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h
deleted file mode 100644
index 83229ae65dd1f4ed6b885c3d6195b3758b8ba039..0000000000000000000000000000000000000000
--- a/paddle/math/TensorExpression.h
+++ /dev/null
@@ -1,446 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stdint.h>
-#include <cstddef>
-#include "hl_tensor_ops.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-template <class OP, typename ExprType, class T>
-class TensorConstant;
-template <class OP, typename ExprType, class T>
-class TensorUnaryOp;
-template <class OP, typename LhsType, typename RhsType, class T>
-class TensorBinaryOp;
-template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
-class TensorTernaryOp;
-
-template <typename LhsType, typename RhsType, class T>
-class TensorAssignOp;
-
-/**
- * \brief Tensor base class.
- *
- * This is the base class of all Tensor and Expression class.
- */
-template <typename Derived, class T>
-class TensorExpression {
-public:
-  /**
-   * Element wise unary expression.
-   */
-  template <typename UnaryOp>
-  const TensorUnaryOp<UnaryOp, const Derived, T> unaryExpression(
-      const UnaryOp& op) const {
-    return TensorUnaryOp<UnaryOp, const Derived, T>(op, derived());
-  }
-
-  const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
-      T p) const {
-    return unaryExpression(hppl::unary::add_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::sub_scale<T>, const Derived, T> operator-(
-      T p) const {
-    return unaryExpression(hppl::unary::sub_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
-      T p) const {
-    return unaryExpression(hppl::unary::mul_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::div_scale<T>, const Derived, T> operator/(
-      T p) const {
-    return unaryExpression(hppl::unary::div_scale<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::neg<T>, const Derived, T> operator-() const {
-    return unaryExpression(hppl::unary::neg<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::exp_op<T>, const Derived, T> exp() const {
-    return unaryExpression(hppl::unary::exp_op<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::log_op<T>, const Derived, T> log() const {
-    return unaryExpression(hppl::unary::log_op<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::sqrt_op<T>, const Derived, T> sqrt() const {
-    return unaryExpression(hppl::unary::sqrt_op<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::square<T>, const Derived, T> square() const {
-    return unaryExpression(hppl::unary::square<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::reciprocal<T>, const Derived, T> reciprocal()
-      const {
-    return unaryExpression(hppl::unary::reciprocal<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::abs<T>, const Derived, T> abs() const {
-    return unaryExpression(hppl::unary::abs<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::sign<T>, const Derived, T> sign() const {
-    return unaryExpression(hppl::unary::sign<T>());
-  }
-
-  const TensorUnaryOp<hppl::unary::pow_op<T>, const Derived, T> pow(T p) const {
-    return unaryExpression(hppl::unary::pow_op<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::min<T>, const Derived, T> min(T p) const {
-    return unaryExpression(hppl::unary::min<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::max<T>, const Derived, T> max(T p) const {
-    return unaryExpression(hppl::unary::max<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_eq<T>, const Derived, T> operator==(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_eq<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_ne<T>, const Derived, T> operator!=(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_ne<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_le<T>, const Derived, T> operator<=(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_le<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_lt<T>, const Derived, T> operator<(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_lt<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_ge<T>, const Derived, T> operator>=(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_ge<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::cmp_gt<T>, const Derived, T> operator>(
-      T p) const {
-    return unaryExpression(hppl::unary::cmp_gt<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::and_op<T>, const Derived, T> operator&&(
-      T p) const {
-    return unaryExpression(hppl::unary::and_op<T>(p));
-  }
-
-  const TensorUnaryOp<hppl::unary::or_op<T>, const Derived, T> operator||(
-      T p) const {
-    return unaryExpression(hppl::unary::or_op<T>(p));
-  }
-
-  /**
-   * Element wise binary expression.
-   */
-  template <typename BinaryOp, typename ExpressionType>
-  const TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>
-  binaryExpression(const BinaryOp& op, const ExpressionType& expr) const {
-    return TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>(
-        op, derived(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_eq<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator==(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_eq<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_ne<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator!=(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_ne<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_le<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator<=(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_le<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_lt<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator<(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_lt<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_ge<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator>=(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_ge<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::cmp_gt<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator>(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::cmp_gt<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::and_op<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator&&(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::and_op<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::or_op<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator||(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::or_op<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::add<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator+(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::add<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::sub<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator-(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::sub<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::mul<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator*(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::mul<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::div<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  operator/(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::div<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::min<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  min(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::min<T>(), expr);
-  }
-
-  template <typename ExpressionType>
-  const TensorBinaryOp<hppl::binary::max<T>,
-                       const Derived,
-                       const ExpressionType,
-                       T>
-  max(const ExpressionType& expr) const {
-    return binaryExpression(hppl::binary::max<T>(), expr);
-  }
-
-  /**
-   * Element wise ternary expression.
-   *
-   * ternary conditional operator(?: operator).
-   * The conditional expression returns one of two values depending on
-   * the result of derived expression.
-   * If derived expression evaluates to true, then expression1 is evaluated.
-   * If derived expression evaluates to false, then expression2 is evaluated.
-   */
-  template <typename ExprType1, typename ExprType2>
-  const TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>
-  condition(const ExprType1& expr1, const ExprType2& expr2) const {
-    return TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>(
-        derived(), expr1, expr2);
-  }
-
-  template <typename ExprType>
-  const TensorTernaryOp<
-      const Derived,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      const ExprType,
-      T>
-  condition(T p, const ExprType& expr) const {
-    return condition(constant(p), expr);
-  }
-
-  template <typename ExprType>
-  const TensorTernaryOp<
-      const Derived,
-      const ExprType,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      T>
-  condition(const ExprType& expr, T p) const {
-    return condition(expr, constant(p));
-  }
-
-  const TensorTernaryOp<
-      const Derived,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
-      T>
-  condition(T p1, T p2) const {
-    return condition(constant(p1), constant(p2));
-  }
-
-  /**
-   * return a TensorConstant. A TensorConstant object hold a constant value.
-   */
-  const TensorConstant<hppl::unary::constant<T>, const Derived, T> constant(
-      T p) const {
-    return TensorConstant<hppl::unary::constant<T>, const Derived, T>(
-        hppl::unary::constant<T>(p), derived());
-  }
-
-  /**
-   * return a TensorAssignOp, and use AssignEvaluate to evaluate one or more
-   * TensorAssignOp objects.
-   */
-  template <typename ExpressionType>
-  TensorAssignOp<Derived, ExpressionType, T> lazyAssign(
-      const ExpressionType& expr) const {
-    return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
-  }
-
-protected:
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
-};
-
-/**
- * \brief Unary Operator Expression
- */
-template <class OP, typename ExprType, class T>
-class TensorUnaryOp
-    : public TensorExpression<TensorUnaryOp<OP, ExprType, T>, T> {
-public:
-  explicit TensorUnaryOp(const OP op, const ExprType& expr)
-      : op_(op), expr_(expr) {}
-
-  const OP op_;
-  const ExprType expr_;
-};
-
-/**
- * \brief Binary Operator Expression
- */
-template <class OP, typename LhsType, typename RhsType, class T>
-class TensorBinaryOp
-    : public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
-public:
-  explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs)
-      : op_(op), lhs_(lhs), rhs_(rhs) {}
-
-  const OP op_;
-  const LhsType lhs_;
-  const RhsType rhs_;
-};
-
-/**
- * \brief Ternary Operator Expression
- */
-template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
-class TensorTernaryOp : public TensorExpression<
-                            TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
-                            T> {
-public:
-  explicit TensorTernaryOp(const ExprType1& expr1,
-                           const ExprType2& expr2,
-                           const ExprType3& expr3)
-      : expr1_(expr1), expr2_(expr2), expr3_(expr3) {}
-
-  const ExprType1 expr1_;
-  const ExprType2 expr2_;
-  const ExprType3 expr3_;
-};
-
-/**
- * \brief Constant Expression
- */
-template <class OP, typename ExprType, class T>
-class TensorConstant
-    : public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
-public:
-  explicit TensorConstant(const OP op, const ExprType& expr)
-      : op_(op), expr_(expr) {}
-
-  const OP op_;
-  const ExprType expr_;
-};
-
-/**
- * \brief operator+ overload
- * \return a unary operator expression
- */
-template <typename Derived, class T>
-const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
-    T p, const TensorExpression<Derived, T>& expr) {
-  return expr + p;
-}
-
-/**
- * \brief operator* overload
- * \return a unary operator expression
- */
-template <typename Derived, class T>
-const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
-    T p, const TensorExpression<Derived, T>& expr) {
-  return expr * p;
-}
-
-}  // namespace paddle
-
-#include "TensorApply.h"
-#include "TensorEvaluate.h"
diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/math/TrainingAlgorithmOp.cu
deleted file mode 100644
index b844768d3b9fd05b5a0eada5e315b9e91588a4ee..0000000000000000000000000000000000000000
--- a/paddle/math/TrainingAlgorithmOp.cu
+++ /dev/null
@@ -1,356 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BaseMatrix.h"
-#include "TrainingAlgorithmOp.h"
-#include "paddle/utils/Logging.h"
-
-#if __cplusplus > 199711L
-
-#include "TensorAssign.h"
-
-namespace paddle {
-
-void sparseMomentumApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& momU,
-                         BaseMatrix& momV,
-                         real alpha,
-                         real beta,
-                         real gamma,
-                         real tau,
-                         real learningRate) {
-  auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
-  auto expr2 =
-      momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad);
-  auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU +
-                                ((real)1 / beta) * momV);
-
-  AssignEvaluate(expr1, expr2, expr3);
-}
-
-void adadeltaApply(BaseMatrix& value,
-                   BaseMatrix& grad,
-                   BaseMatrix& mom,
-                   BaseMatrix& accum,
-                   BaseMatrix& accum_update,
-                   BaseMatrix& lr,
-                   real rou,
-                   real epsilon,
-                   real learningRate,
-                   real momentum,
-                   real decayRate) {
-  auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
-  auto expr2 =
-      lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt());
-  auto expr3 = accum_update.lazyAssign(rou * accum_update +
-                                       ((real)1 - rou) * (grad * lr).square());
-  auto expr4 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr5 = value.lazyAssign(value + mom);
-
-  AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
-}
-
-void adagradApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& accum_buffer,
-                  BaseMatrix& accum,
-                  BaseMatrix& lr,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate) {
-  auto expr1 = accum.lazyAssign(accum + grad.square());
-  auto expr2 =
-      lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal());
-  auto expr3 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr4 = value.lazyAssign(value + mom);
-
-  AssignEvaluate(expr1, expr2, expr3, expr4);
-}
-
-void rmspropApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& g,
-                  BaseMatrix& f,
-                  BaseMatrix& lr,
-                  real accumulatedRou,
-                  real rou,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate,
-                  bool firstTime) {
-  auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
-  auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
-  auto expr4 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr5 = value.lazyAssign(value + mom);
-
-  if (firstTime) {
-    auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
-  } else {
-    auto expr1 =
-        g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
-  }
-}
-
-void decayedAdagradApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& mom,
-                         BaseMatrix& accum,
-                         BaseMatrix& lr,
-                         real accumulatedRou,
-                         real rou,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate,
-                         bool firstTime) {
-  auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
-  auto expr3 = mom.lazyAssign(mom * momentum -
-                              learningRate * lr * (grad + value * decayRate));
-  auto expr4 = value.lazyAssign(value + mom);
-
-  if (firstTime) {
-    auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4);
-  } else {
-    auto expr1 = accum.lazyAssign(accumulatedRou * accum +
-                                  ((real)1 - rou) * grad.square());
-
-    AssignEvaluate(expr1, expr2, expr3, expr4);
-  }
-}
-
-void adamApply(BaseMatrix& value,
-               BaseMatrix& grad,
-               BaseMatrix& mom,  // firse moment
-               BaseMatrix& v,    // second moment
-               real beta1,
-               real beta2,
-               real beta1_power,
-               real beta2_power,
-               real epsilon,
-               real learningRate) {
-  real alpha =
-      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
-
-  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
-  auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
-  auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon));
-
-  AssignEvaluate(expr1, expr2, expr3);
-}
-
-void adamaxApply(BaseMatrix& value,
-                 BaseMatrix& grad,
-                 BaseMatrix& mom,  // firse moment
-                 BaseMatrix& u,    // weighted infinity norm
-                 real beta1,
-                 real beta2,
-                 int64_t step,
-                 real alpha) {
-  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
-  auto expr2 =
-      u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
-  auto expr3 = value.lazyAssign(
-      value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
-
-  AssignEvaluate(expr1, expr2, expr3);
-}
-
-}  // namespace paddle
-
-#else
-
-namespace paddle {
-
-void sparseMomentumApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& momU,
-                         BaseMatrix& momV,
-                         real alpha,
-                         real beta,
-                         real gamma,
-                         real tau,
-                         real learningRate) {
-  /**
-   * \alpha_t = \alpha_{t-1} / k
-   * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
-   * u_t = u_{t-1} - \alpha_t \gamma_t g_t
-   * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
-   * \tau_t = \tau_{t-1} + \beta_t / \alpha_t
-   */
-  momU -= (alpha * gamma * learningRate) * grad;
-  momV += (tau * alpha * gamma * learningRate) * grad;
-  value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV;
-}
-
-void adadeltaApply(BaseMatrix& value,
-                   BaseMatrix& grad,
-                   BaseMatrix& mom,
-                   BaseMatrix& accum,
-                   BaseMatrix& accum_update,
-                   BaseMatrix& lr,
-                   real rou,
-                   real epsilon,
-                   real learningRate,
-                   real momentum,
-                   real decayRate) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  accum = rou * accum + ((real)1 - rou) * grad.square();
-
-  // learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ))
-  lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt();
-
-  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
-  accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square();
-
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void adagradApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& accum_buffer,
-                  BaseMatrix& accum,
-                  BaseMatrix& lr,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate) {
-  accum += grad.square();
-  lr = (accum_buffer + accum + epsilon).sqrt().reciprocal();
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void rmspropApply(BaseMatrix& value,
-                  BaseMatrix& grad,
-                  BaseMatrix& mom,
-                  BaseMatrix& g,
-                  BaseMatrix& f,
-                  BaseMatrix& lr,
-                  real accumulatedRou,
-                  real rou,
-                  real epsilon,
-                  real learningRate,
-                  real momentum,
-                  real decayRate,
-                  bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  if (firstTime) {
-    g = accumulatedRou * g + grad.square();
-  } else {
-    g = accumulatedRou * g + ((real)1 - rou) * grad.square();
-  }
-
-  // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g
-  f = accumulatedRou * f + ((real)1 - rou) * grad;
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon )
-  // Basiclly if the sign of the gradient changes more often,
-  // the learning rate will be decreased.
-  lr = (g - f.square() + epsilon).sqrt().reciprocal();
-
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void decayedAdagradApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& mom,
-                         BaseMatrix& accum,
-                         BaseMatrix& lr,
-                         real accumulatedRou,
-                         real rou,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate,
-                         bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  if (firstTime) {
-    accum = accumulatedRou * accum + grad.square();
-  } else {
-    accum = accumulatedRou * accum + ((real)1 - rou) * grad.square();
-  }
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
-  // Basiclly if the bigger the magnitude gradient is,
-  // the smaller the learning rate will be.
-  lr = (accum + epsilon).sqrt().reciprocal();
-
-  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
-  value += mom;
-}
-
-void adamApply(BaseMatrix& value,
-               BaseMatrix& grad,
-               BaseMatrix& mom,  // firse moment
-               BaseMatrix& v,    // second moment
-               real beta1,
-               real beta2,
-               real beta1_power,
-               real beta2_power,
-               real epsilon,
-               real learningRate) {
-  real alpha =
-      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  mom = beta1 * mom + ((real)1 - beta1) * grad;
-
-  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
-  v = beta2 * v + ((real)1 - beta2) * grad.square();
-
-  value -= (mom * alpha) / (v.sqrt() + epsilon);
-}
-
-void adamaxApply(BaseMatrix& value,
-                 BaseMatrix& grad,
-                 BaseMatrix& mom,  // firse moment
-                 BaseMatrix& u,    // weighted infinity norm
-                 real beta1,
-                 real beta2,
-                 int64_t step,
-                 real alpha) {
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  mom = beta1 * mom + ((real)1 - beta1) * grad;
-
-  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
-  u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs());
-
-  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
-  value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u);
-}
-
-}  // namespace paddle
-
-#endif
diff --git a/paddle/math/TrainingAlgorithmOp.h b/paddle/math/TrainingAlgorithmOp.h
deleted file mode 100644
index fe40fc2d36e796bd4be7b7fc1e12a6eafa5d4700..0000000000000000000000000000000000000000
--- a/paddle/math/TrainingAlgorithmOp.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "BaseMatrix.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-/**
- * \brief Sparse Momentum optimizer.
- */
-extern void sparseMomentumApply(BaseMatrix& value,
-                                BaseMatrix& grad,
-                                BaseMatrix& momU,
-                                BaseMatrix& momV,
-                                real alpha,
-                                real beta,
-                                real gamma,
-                                real tau,
-                                real learningRate);
-
-/**
- * \brief AdaDelta optimizer.
- */
-extern void adadeltaApply(BaseMatrix& value,
-                          BaseMatrix& grad,
-                          BaseMatrix& sum,
-                          BaseMatrix& sum1,
-                          BaseMatrix& mom,
-                          BaseMatrix& lr,
-                          real rou,
-                          real epsilon,
-                          real learningRate,
-                          real momentum,
-                          real decayRate);
-
-/**
- * \brief AdaGrad optimizer.
- */
-extern void adagradApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& sum,
-                         BaseMatrix& sum1,
-                         BaseMatrix& mom,
-                         BaseMatrix& lr,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate);
-
-/**
- * \brief RMSProp optimizer.
- */
-extern void rmspropApply(BaseMatrix& value,
-                         BaseMatrix& grad,
-                         BaseMatrix& g,
-                         BaseMatrix& f,
-                         BaseMatrix& mom,
-                         BaseMatrix& lr,
-                         real accumulatedRou,
-                         real rou,
-                         real epsilon,
-                         real learningRate,
-                         real momentum,
-                         real decayRate,
-                         bool firstTime);
-
-/**
- * \brief Decayed AdaGrad optimizer.
- */
-extern void decayedAdagradApply(BaseMatrix& value,
-                                BaseMatrix& grad,
-                                BaseMatrix& mom,
-                                BaseMatrix& accum,
-                                BaseMatrix& lr,
-                                real accumulatedRou,
-                                real rou,
-                                real epsilon,
-                                real learningRate,
-                                real momentum,
-                                real decayRate,
-                                bool firstTime);
-
-/**
- * \brief Adam optimizer.
- */
-extern void adamApply(BaseMatrix& value,
-                      BaseMatrix& grad,
-                      BaseMatrix& mom,
-                      BaseMatrix& v,
-                      real beta1,
-                      real beta2,
-                      real beta1_power,
-                      real beta2_power,
-                      real epsilon,
-                      real learningRate);
-
-/**
- * \brief AdaMax optimizer.
- */
-extern void adamaxApply(BaseMatrix& value,
-                        BaseMatrix& grad,
-                        BaseMatrix& mom,  // firse moment
-                        BaseMatrix& u,    // weighted infinity norm
-                        real beta1,
-                        real beta2,
-                        int64_t step,
-                        real alpha);
-}  // namespace paddle
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
deleted file mode 100644
index 2a47ed7ef81a2e969757c244370cc346b13e1c03..0000000000000000000000000000000000000000
--- a/paddle/math/Vector.cpp
+++ /dev/null
@@ -1,1091 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Vector.h"
-#include "paddle/utils/Util.h"
-
-#include <memory>
-#include "Matrix.h"
-#include "hl_gpu.h"
-#include "hl_matrix.h"
-#include "hl_table_apply.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Thread.h"
-#include "paddle/utils/ThreadLocal.h"
-
-namespace paddle {
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size, bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuVectorT<T>>(size);
-  } else {
-    return std::make_shared<CpuVectorT<T>>(size);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::createParallelVector(
-    size_t size, bool useGpu, SyncThreadPool* pool) {
-  if (!useGpu && FLAGS_trainer_count > 1 && FLAGS_enable_parallel_vector &&
-      size >= (size_t)FLAGS_enable_parallel_vector) {
-    return std::make_shared<ParallelCpuVectorT<T>>(
-        size, pool ? pool : getGlobalSyncThreadPool());
-  } else {
-    return create(size, useGpu);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::create(T* data,
-                                               size_t size,
-                                               bool useGpu) {
-  if (useGpu) {
-    return std::make_shared<GpuVectorT<T>>(size, data);
-  } else {
-    return std::make_shared<CpuVectorT<T>>(size, data);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>> VectorT<T>::create(size_t size,
-                                               MemoryHandlePtr memoryHandle,
-                                               size_t offset) {
-  if (auto cpuMemHandle =
-          std::dynamic_pointer_cast<CpuMemoryHandle>(memoryHandle)) {
-    return std::make_shared<CpuVectorT<T>>(size, cpuMemHandle, offset);
-  } else if (auto gpuMemHandle =
-                 std::dynamic_pointer_cast<GpuMemoryHandle>(memoryHandle)) {
-    return std::make_shared<GpuVectorT<T>>(size, gpuMemHandle, offset);
-  } else {
-    LOG(FATAL) << "Wrong";
-    return NULL;
-  }
-}
-
-template <>
-MatrixPtr VectorT<real>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
-  LOG(FATAL) << "Wrong for real vector";
-  return nullptr;
-}
-
-template <>
-MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
-  size_t height = getSize();
-  size_t width = idRange;
-  MatrixPtr mat = Matrix::createSparseMatrix(
-      height, idRange, height, NO_VALUE, SPARSE_CSR, false, useGpu);
-
-  CpuIVector cpuIds(height);
-  cpuIds.copyFrom(*this);
-  int* idData = cpuIds.getData();
-
-  for (decltype(height) i = 0; i < height; i++) {
-    const unsigned int id = idData[i];
-    CHECK_LT(id, width);
-    mat->setRow(i, 1, &id, nullptr);
-  }
-  return mat;
-}
-
-template <>
-std::shared_ptr<VectorT<int>> VectorT<real>::castToInt() {
-  std::shared_ptr<VectorT<int>> ret = IVector::create(this->getSize(), useGpu_);
-  if (useGpu_) {
-    hl_vector_cast2int(ret->getData(), this->getData(), this->getSize());
-  } else {
-    for (size_t i = 0; i < getSize(); ++i) {
-      ret->getData()[i] = int(this->getData()[i]);
-    }
-  }
-  return ret;
-}
-
-template <class T>
-GpuVectorT<T>::GpuVectorT(size_t size)
-    : VectorT<T>(size,
-                 std::make_shared<GpuMemoryHandle>(sizeof(T) * size),
-                 0, /* offset = 0 */
-                 true /* useGpu = true */) {}
-
-template <class T>
-T GpuVectorT<T>::getElement(size_t i) const {
-  T elem = 0;
-  hl_memcpy_device2host(&elem, const_cast<T*>(&this->getData()[i]), sizeof(T));
-  return elem;
-}
-template <class T>
-void GpuVectorT<T>::setElement(size_t i, const T& value) {
-  hl_memcpy_host2device(&this->getData()[i], const_cast<T*>(&value), sizeof(T));
-}
-
-template <class T>
-T* GpuVectorT<T>::getPoint(const uint64_t beginPos) {
-  LOG(FATAL) << "Not implemented" << beginPos;
-  return NULL;
-}
-
-template <>
-int GpuVectorT<int>::getAbsSum() {
-  LOG(FATAL) << "Not implemented";
-  return 0;
-}
-
-template <>
-int GpuVectorT<int>::getSum() {
-  LOG(FATAL) << "Not implemented";
-  return 0;
-}
-
-template <>
-real GpuVectorT<real>::getAbsSum() {
-  real* A = this->getData();
-  real sum = 0;
-  hl_vector_abs_sum(A, &sum, this->getSize());
-  return sum;
-}
-
-template <>
-real GpuVectorT<real>::getSum() {
-  real* A = this->getData();
-  real sum = 0;
-  hl_vector_sum(A, &sum, this->getSize());
-  return sum;
-}
-
-template <>
-int GpuVectorT<int>::getMax() {
-  CpuIVector cpuIVec = CpuIVector(this->getSize());
-  copyTo(&cpuIVec);
-  return cpuIVec.getMax();
-}
-
-template <>
-int GpuVectorT<int>::getAbsMax() {
-  CpuIVector cpuIVec = CpuIVector(this->getSize());
-  copyTo(&cpuIVec);
-  return cpuIVec.getAbsMax();
-}
-
-template <class T>
-void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
-  BaseMatrixT<T>::isEqualTo((BaseMatrixT<T>&)b, value);
-}
-
-template <class T>
-void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-#ifdef PADDLE_WITH_CUDA
-  hl_vector_select_from<T>(this->getData(),
-                           this->getSize(),
-                           src.getData(),
-                           src.getSize(),
-                           ids.getData(),
-                           ids.getSize());
-#endif
-}
-
-template <class Func>
-real gpuRowFunc(Func f, GpuVector& v) {
-  static ThreadLocal<std::unique_ptr<CpuVectorT<real>>> local;
-  if (!*local) {
-    (*local).reset(new CpuVector(1));
-  }
-  real* A = v.getData();
-  f(A, (*local)->getData(), 1, v.getSize());
-  return (*local)->getData()[0];
-}
-
-template <>
-real GpuVectorT<real>::getMax() {
-  return gpuRowFunc(hl_matrix_row_max, *this);
-}
-
-template <>
-real GpuVectorT<real>::getAbsMax() {
-  return std::max(gpuRowFunc(hl_matrix_row_max, *this),
-                  -gpuRowFunc(hl_matrix_row_min, *this));
-}
-
-template <>
-int GpuVectorT<int>::getMin() {
-  LOG(FATAL) << "Not implemented";
-  return 0;
-}
-
-template <>
-real GpuVectorT<real>::getMin() {
-  return gpuRowFunc(hl_matrix_row_min, *this);
-}
-
-template <class T>
-T GpuVectorT<T>::get(size_t pos) {
-  T val = (T)0;
-  hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), sizeof(T));
-  return val;
-}
-
-template <class T>
-void GpuVectorT<T>::histogram(std::ostream& os, int type) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::zeroMem() {
-  BaseMatrixT<T>::zero();
-}
-
-template <class T>
-void GpuVectorT<T>::reset(const T& value) {
-  BaseMatrixT<T>::assign(value);
-}
-
-template <class T>
-void GpuVectorT<T>::fillSequence() {
-  LOG(FATAL) << "not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const VectorT<T>& src) {
-  src.copyTo(this);
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
-  CHECK_EQ(src.getSize(), this->getSize());
-  hl_memcpy_async((void*)this->getData(),
-                  (void*)src.getData(),
-                  sizeof(T) * this->getSize(),
-                  stream);
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size) {
-  CHECK(gpuSrc != NULL);
-  CHECK_LE(size, this->size_);
-
-  hl_memcpy((void*)this->getData(), (void*)gpuSrc, sizeof(T) * size);
-}
-
-template <class T>
-void GpuVectorT<T>::copyFrom(const T* gpuSrc, size_t size, hl_stream_t stream) {
-  CHECK(gpuSrc != NULL);
-  CHECK_LE(size, this->size_);
-
-  hl_memcpy_async(
-      (void*)this->getData(), (void*)gpuSrc, sizeof(T) * size, stream);
-}
-
-template <class T>
-void GpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-
-  hl_memcpy_device2host((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(T) * this->getSize());
-}
-
-template <class T>
-void GpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-
-  hl_memcpy_device2device((void*)dest->getData(),
-                          (void*)this->getData(),
-                          sizeof(T) * this->getSize());
-}
-
-template <>
-void GpuVectorT<int>::rand() {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void GpuVectorT<int>::print(std::ostream& os, size_t num) const {
-  IVectorPtr dest = IVector::create(this->size_, false);
-  hl_memcpy_device2host((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(int) * this->getSize());
-  dest->print(os, num);
-}
-
-template <>
-void GpuVectorT<real>::print(std::ostream& os, size_t num) const {
-  VectorPtr dest = Vector::create(this->size_, false);
-  hl_memcpy_device2host((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(int) * this->getSize());
-  dest->print(os, num);
-}
-
-template <>
-void GpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void GpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<int>::rand() {
-  LOG(FATAL) << "Not implemented";
-}
-template <>
-void GpuVectorT<real>::rand(size_t classNum) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<real>::rand(size_t classNum) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void GpuVectorT<real>::rand() {
-  VectorPtr cPtr = Vector::create(this->size_, false);
-  cPtr->rand();
-
-  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(real));
-}
-
-template <>
-void GpuVectorT<int>::rand(size_t classNum) {
-  IVectorPtr cPtr = IVector::create(this->size_, false);
-  cPtr->rand(classNum);
-
-  hl_memcpy_host2device(data_, cPtr->getData(), this->size_ * sizeof(int));
-}
-
-template <>
-void CpuVectorT<int>::rand(size_t classNum) {
-  size_t size = this->getSize();
-  int* data = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    data[i] =
-        std::min(classNum - 1,
-                 size_t(::rand() * (1. / ((double)RAND_MAX + 1)) * classNum));
-  }
-}
-
-template <>
-void CpuVectorT<real>::rand() {
-  size_t size = this->getSize();
-  real* data = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    data[i] = ::rand() * (1. / (double)RAND_MAX);
-    // data[ii] = ((temp > RAND_MAX/2)? 1 : -1) *
-    // sqrt( abs((temp-RAND_MAX/2))/(double(RAND_MAX))/2048 );
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::randnorm(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void CpuVectorT<T>::uniform(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::randnorm(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <class T>
-void GpuVectorT<T>::uniform(real, real) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<real>::randnorm(real mean, real std) {
-  size_t size = this->getSize();
-  real* data = this->getData();
-  unsigned int* seed = ThreadLocalRand::getSeed();
-  auto rand1 = [&]() { return (1. + ::rand_r(seed)) * (1. / (1. + RAND_MAX)); };
-  for (size_t i = 0; i < size - 1; i += 2) {
-    real r1 = rand1();
-    r1 = std::sqrt(-2 * std::log(r1));
-    real r2 = rand1();
-    data[i] = mean + std * r1 * cos(2 * M_PI * r2);
-    data[i + 1] = mean + std * r1 * sin(2 * M_PI * r2);
-  }
-  real r1 = rand1();
-  r1 = std::sqrt(-2 * std::log(r1));
-  real r2 = rand1();
-  data[size - 1] = mean + std * r1 * cos(2 * M_PI * r2);
-}
-
-template <>
-void CpuVectorT<real>::uniform(real left, real right) {
-  size_t size = this->getSize();
-  real* data = this->getData();
-  real range = right - left;
-  unsigned int* seed = ThreadLocalRand::getSeed();
-  auto rand1 = [&]() { return ::rand_r(seed) * (1. / (1. + RAND_MAX)); };
-  for (size_t i = 0; i < size; ++i) {
-    data[i] = rand1() * range + left;
-  }
-}
-
-template <>
-void GpuVectorT<real>::randnorm(real mean, real std) {
-  CpuVector cpuVec = CpuVector(this->getSize());
-  cpuVec.randnorm(mean, std);
-
-  hl_memcpy_host2device(
-      data_, cpuVec.getData(), this->getSize() * sizeof(real));
-}
-
-template <>
-void GpuVectorT<real>::uniform(real left, real right) {
-  CpuVector cpuVec = CpuVector(this->getSize());
-  cpuVec.uniform(left, right);
-
-  hl_memcpy_host2device(
-      data_, cpuVec.getData(), this->getSize() * sizeof(real));
-}
-
-template <class T>
-CpuVectorT<T>::CpuVectorT(size_t size)
-    : VectorT<T>(size,
-                 std::make_shared<CpuMemoryHandle>(sizeof(T) * size),
-                 0, /* offset = 0 */
-                 false /* useGpu = false */) {}
-
-template <class T>
-CpuVectorT<T>::CpuVectorT(const VectorT<T>& src)
-    : VectorT<T>(src.getSize(),
-                 src.getMemoryHandle(),
-                 0, /* offset = 0 */
-                 false /* useGpu = false */) {
-  if (typeid(*this->memoryHandle_.get()) != typeid(CpuMemoryHandle)) {
-    this->memoryHandle_ =
-        std::make_shared<CpuMemoryHandle>(sizeof(T) * this->getSize());
-    this->data_ = reinterpret_cast<T*>(this->memoryHandle_->getBuf());
-  }
-  src.copyTo(this);
-}
-
-template <class T>
-T CpuVectorT<T>::getAbsSum() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += (A[i] > 0) ? A[i] : -A[i];
-  }
-  return sum;
-}
-
-// cannot use above version, due to precision issue of float
-template <>
-real CpuVectorT<real>::getAbsSum() {
-  const real* A = this->getData();
-  size_t size = this->getSize();
-  double sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += (A[i] > 0) ? A[i] : -A[i];
-  }
-  return sum;
-}
-
-template <class T>
-T CpuVectorT<T>::getSum() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += A[i];
-  }
-  return sum;
-}
-
-template <>
-real CpuVectorT<real>::getSum() {
-  const real* A = this->getData();
-  size_t size = this->getSize();
-  double sum = 0;
-  for (size_t i = 0; i < size; i++) {
-    sum += A[i];
-  }
-  return sum;
-}
-
-template <class T>
-T CpuVectorT<T>::get(size_t pos) {
-  return this->getData()[pos];
-}
-
-template <class T>
-T CpuVectorT<T>::getMax() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T res = A[0];
-  for (size_t i = 1; i < size; i++) {
-    if (res < A[i]) res = A[i];
-  }
-  return res;
-}
-
-template <class T>
-T CpuVectorT<T>::getAbsMax() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T res = std::abs(A[0]);
-  for (size_t i = 1; i < size; i++) {
-    if (res < std::abs(A[i])) res = std::abs(A[i]);
-  }
-  return res;
-}
-
-template <class T>
-T CpuVectorT<T>::getMin() {
-  const T* A = this->getData();
-  size_t size = this->getSize();
-  T res = A[0];
-  for (size_t i = 1; i < size; i++) {
-    if (res > A[i]) res = A[i];
-  }
-  return res;
-}
-
-template <class T>
-void CpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
-  size_t size = this->getSize();
-  CHECK_EQ(b.getSize(), size);
-
-  const T* B = b.getData();
-  T* A = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    A[i] = (B[i] == value);
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-  size_t size = this->getSize();
-  CHECK_EQ(ids.getSize(), size);
-
-  const int* indices = ids.getData();
-  const T* B = src.getData();
-  T* A = this->getData();
-  for (size_t i = 0; i < size; i++) {
-    int index = indices[i];
-    CHECK_LT(index, (int)src.getSize());
-    A[i] = B[index];
-  }
-}
-
-static int getSignAndExponentOfFloat(float a) {
-  uint32_t* pa = reinterpret_cast<uint32_t*>(&a);
-  return *pa >> 23;
-}
-
-template <class T>
-void CpuVectorT<T>::histogram(std::ostream& os, int type) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void CpuVectorT<real>::histogram(std::ostream& os, int type) {
-  int counters[512];
-  memset(counters, 0, sizeof(counters));
-  int counterZero = 0;
-
-  const real* A = this->getData();
-  size_t size = this->getSize();
-  for (size_t i = 0; i < size; i++) {
-    if (A[i] == 0.0f) {
-      ++counterZero;
-    } else {
-      ++counters[getSignAndExponentOfFloat(A[i])];
-    }
-  }
-
-  int64_t sum = 0;
-  float sizeNonZero = size - counterZero;
-  os << "zero:" << counterZero;
-  for (int i = 0; i < 256; i++) {
-    int counter = counters[i];
-    if (counter) {
-      os << " 2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
-      sum += counter * (i - 127);
-    }
-  }
-  for (int i = 0; i < 256; i++) {
-    int counter = counters[i + 256];
-    if (counter) {
-      os << " -2^" << i - 127 << ":" << counter / sizeNonZero * 100 << "%";
-      sum += counter * (i - 127);
-    }
-  }
-  os << ", nonzero_exponent_avg=" << sum / sizeNonZero;
-}
-
-template <class T>
-void CpuVectorT<T>::zeroMem() {
-  memset(this->getData(), 0, sizeof(T) * this->getSize());
-}
-
-template <class T>
-void CpuVectorT<T>::reset(const T& value) {
-  T* A = this->getData();
-  size_t size = this->getSize();
-  for (size_t i = 0; i < size; i++) {
-    A[i] = value;
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::fillSequence() {
-  T* A = this->getData();
-  size_t size = this->getSize();
-  for (size_t i = 0; i < size; i++) {
-    A[i] = i;
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const VectorT<T>& src) {
-  src.copyTo(this);
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
-  if (typeid(src) == typeid(GpuVectorT<T>)) {
-    hl_memcpy_async((void*)this->getData(),
-                    (void*)src.getData(),
-                    sizeof(T) * this->getSize(),
-                    stream);
-    // There is a need to add synchronization to ensure that the data is copied.
-    hl_stream_synchronize(stream);
-  } else {
-    src.copyTo(this);
-  }
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const T* hostSrc, size_t size) {
-  CHECK(hostSrc != NULL);
-  CHECK_LE(size, this->size_);
-  memcpy(this->data_, hostSrc, sizeof(T) * size);
-}
-
-template <class T>
-void CpuVectorT<T>::copyFrom(const T* hostSrc,
-                             size_t size,
-                             hl_stream_t stream) {
-  (void)stream;
-
-  CHECK(hostSrc != NULL);
-  CHECK_LE(size, this->size_);
-  memcpy(this->data_, hostSrc, sizeof(T) * size);
-}
-
-template <class T>
-void CpuVectorT<T>::copyTo(CpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-  memcpy(dest->getData(), this->getData(), sizeof(T) * this->getSize());
-}
-
-template <class T>
-void CpuVectorT<T>::copyTo(GpuVectorT<T>* dest) const {
-  CHECK_EQ(this->getSize(), dest->getSize());
-  hl_memcpy_host2device((void*)dest->getData(),
-                        (void*)this->getData(),
-                        sizeof(T) * this->getSize());
-}
-
-template <>
-void CpuVectorT<real>::print(std::ostream& os, size_t num) const {
-  size_t w = size_ < num ? size_ : num;
-  os << "[";
-  for (size_t i = 0; i < w; ++i) {
-    os << data_[i] << " ";
-  }
-  os << "]" << std::endl;
-}
-
-template <>
-void CpuVectorT<int>::print(std::ostream& os, size_t num) const {
-  size_t w = size_ < num ? size_ : num;
-  os << "[";
-  for (size_t i = 0; i < w; ++i) {
-    os << (int)data_[i] << " ";
-  }
-  os << "]" << std::endl;
-}
-
-template <>
-void CpuVectorT<real>::printOneElement(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, size_);
-  os << data_[idx] << ";";
-}
-
-template <>
-void CpuVectorT<int>::printOneElement(std::ostream& os, size_t idx) const {
-  CHECK_LT(idx, size_);
-  os << (int)data_[idx] << ";";
-}
-
-template <class T>
-void ParallelCpuVectorT<T>::parallelExec(ExecFunc func) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void ParallelCpuVectorT<real>::parallelExec(ExecFunc func) {
-  pool_->exec([this, func](int tid, size_t numThreads) {
-    auto interval = calcSplitArrayInterval(
-        this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-    // setup sub bufs
-    CpuVector subVec(0, nullptr);
-    subVec.subVecFrom(*this, interval);
-    func(subVec);
-  });
-}
-
-template <class T>
-void ParallelCpuVectorT<T>::exec(SyncThreadPool::JobFunc func) {
-  LOG(FATAL) << "Not implemented";
-}
-
-template <>
-void ParallelCpuVectorT<real>::exec(SyncThreadPool::JobFunc func) {
-  pool_->exec(func);
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) {
-  if (!useGpu) {
-    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size);
-  } else {
-    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size);
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src)
-    : sync_(nullptr) {
-  bool useGpu = src->useGpu();
-  if (useGpu) {
-    gpuVectorT_ = src;
-  } else {
-    cpuVectorT_ = src;
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, T* data, bool useGpu)
-    : sync_(nullptr) {
-  if (!useGpu) {
-    cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size, data);
-    setSync(DATA_AT_CPU);
-  } else {
-    gpuVectorT_ = std::make_shared<GpuVectorT<T>>(size, data);
-    setSync(DATA_AT_GPU);
-  }
-}
-
-template <class T>
-std::shared_ptr<CpuGpuVectorT<T>> CpuGpuVectorT<T>::create(size_t size,
-                                                           bool useGpu) {
-  return std::make_shared<CpuGpuVectorT<T>>(size, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::resize(size_t size, bool useGpu) {
-  if (useGpu) {
-    CHECK(gpuVectorT_) << "gpuVectorT_ is null";
-    // If memoryHandle_ is nullptr,
-    // the data may be owned by the caller when it was constructed.
-    // It should not resize for this case.
-    if (gpuVectorT_->getMemoryHandle()) {
-      gpuVectorT_->resize(size);
-    } else {
-      CHECK_EQ(gpuVectorT_->getSize(), size);
-    }
-  } else {
-    CHECK(cpuVectorT_) << "cpuVectorT_ is null";
-    // If memoryHandle_ is nullptr,
-    // the data may be owned by the caller when it was constructed.
-    // It should not resize for this case.
-    if (cpuVectorT_->getMemoryHandle()) {
-      cpuVectorT_->resize(size);
-    } else {
-      CHECK_EQ(cpuVectorT_->getSize(), size);
-    }
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
-                                      size_t size,
-                                      bool useGpu) {
-  if (vec) {
-    vec->resize(size, useGpu);
-  } else {
-    vec = create(size, useGpu);
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::resizeOrCreate(size_t size, bool useGpu) {
-  if (useGpu && (!gpuVectorT_)) {
-    gpuVectorT_ = VectorT<T>::create(size, true);
-  } else if ((!useGpu) && (!cpuVectorT_)) {
-    cpuVectorT_ = VectorT<T>::create(size, false);
-  } else {
-    CHECK((useGpu && gpuVectorT_) || (!useGpu && cpuVectorT_));
-    this->resize(size, useGpu);
-  }
-}
-
-template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
-                                size_t offset,
-                                size_t size)
-    : sync_(nullptr) {
-  CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
-#ifdef PADDLE_WITH_CUDA
-  SyncedFlag* flag = src.getSync();
-  if (*flag == DATA_AT_CPU) {
-    src.copyToGpu();  // will set synchronous data between CPU and GPU
-  } else if (*flag == DATA_AT_GPU) {
-    src.copyToCpu();  // will set synchronous data between CPU and GPU
-  }
-#endif
-  auto cMemHandle = (src.getVector(false))->getMemoryHandle();
-  cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
-      size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
-#ifdef PADDLE_WITH_CUDA
-  auto gMemHandle = (src.getVector(true))->getMemoryHandle();
-  gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
-      size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
-  src.setSync(SYNCED);
-#endif
-  setSync(src.getSync());
-}
-
-template <class T>
-std::shared_ptr<const VectorT<T>> CpuGpuVectorT<T>::getVector(
-    bool useGpu) const {
-  auto* self = const_cast<CpuGpuVectorT<T>*>(this);
-  if (useGpu) {
-    self->copyToGpu();
-    return std::const_pointer_cast<const VectorT<T>>(gpuVectorT_);
-  } else {
-    self->copyToCpu();
-    return std::const_pointer_cast<const VectorT<T>>(cpuVectorT_);
-  }
-}
-
-template <class T>
-std::shared_ptr<VectorT<T>>& CpuGpuVectorT<T>::getMutableVector(bool useGpu) {
-  setSync(useGpu);
-  if (useGpu) {
-    copyToGpu();
-    return gpuVectorT_;
-  } else {
-    copyToCpu();
-    return cpuVectorT_;
-  }
-}
-
-template <class T>
-const T* CpuGpuVectorT<T>::getData(bool useGpu) const {
-  auto self = const_cast<CpuGpuVectorT<T>*>(this);
-  if (useGpu) {
-    self->copyToGpu();
-    return gpuVectorT_->getData();
-  } else {
-    self->copyToCpu();
-    return cpuVectorT_->getData();
-  }
-}
-
-// Operation will change data and need to reset sync_ & syncFlag_.
-#define MUTABLE_VECTOR_OP(OP, useGpu, args...) \
-  do {                                         \
-    if (useGpu) {                              \
-      copyToGpu();                             \
-      setSync(useGpu);                         \
-      return gpuVectorT_->OP(args);            \
-    } else {                                   \
-      copyToCpu();                             \
-      setSync(useGpu);                         \
-      return cpuVectorT_->OP(args);            \
-    }                                          \
-  } while (0)
-
-template <class T>
-T* CpuGpuVectorT<T>::getMutableData(bool useGpu) {
-  MUTABLE_VECTOR_OP(getData, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::zeroMem(bool useGpu) {
-  MUTABLE_VECTOR_OP(zeroMem, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::fillSequence(bool useGpu) {
-  MUTABLE_VECTOR_OP(fillSequence, useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::setElement(size_t i, const T& value, bool useGpu) {
-  MUTABLE_VECTOR_OP(setElement, useGpu, i, value);
-}
-
-template <class T>
-T CpuGpuVectorT<T>::getElement(size_t i) const {
-  switch (*this->getSync()) {
-    case SYNCED:
-    case DATA_AT_CPU:
-      return cpuVectorT_->getElement(i);
-      break;
-    case DATA_AT_GPU:
-      return gpuVectorT_->getElement(i);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
-  auto cVec = dynamic_cast<const CpuVectorT<T>*>(&src);
-  auto gVec = dynamic_cast<const GpuVectorT<T>*>(&src);
-  if (cVec) {
-    copyToCpu(cVec->getData(), cVec->getSize(), stream);
-  } else if (gVec) {
-    copyToGpu(gVec->getData(), gVec->getSize(), stream);
-  } else {
-    LOG(FATAL) << "Invalid type of src";
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(const T* data, size_t size, bool useGpu) {
-  if (useGpu) {
-    copyToGpu(data, size);
-  } else {
-    copyToCpu(data, size);
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(const T* data,
-                                size_t size,
-                                hl_stream_t stream,
-                                bool useGpu) {
-  if (useGpu) {
-    copyToGpu(data, size, stream);
-  } else {
-    copyToCpu(data, size, stream);
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src,
-                                size_t offset,
-                                size_t size,
-                                bool useGpu,
-                                hl_stream_t stream) {
-  if (useGpu) {
-    VectorT<T>::resizeOrCreate(gpuVectorT_, size, true);
-    gpuVectorT_->copyFrom(src.getData(true) + offset, size, stream);
-  } else {
-    VectorT<T>::resizeOrCreate(cpuVectorT_, size, false);
-    cpuVectorT_->copyFrom(src.getData(false) + offset, size, stream);
-  }
-  setSync(useGpu);
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream) {
-  switch (*src.getSync()) {
-    case DATA_AT_CPU:
-      copyFrom(*(src.getVector(false)), stream);
-      break;
-    case DATA_AT_GPU:
-      copyFrom(*(src.getVector(true)), stream);
-      break;
-    case SYNCED:
-      copyFrom(*(src.getVector(false)), stream);
-      copyFrom(*(src.getVector(true)), stream);
-      setSync(SYNCED);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyToCpu() {
-  switch (*this->getSync()) {
-    case DATA_AT_GPU:
-      CHECK(gpuVectorT_);
-      this->resizeOrCreate(gpuVectorT_->getSize(), false);
-      cpuVectorT_->copyFrom(*gpuVectorT_);
-      setSync(SYNCED);
-      break;
-    case DATA_AT_CPU:
-    case SYNCED:
-      CHECK(cpuVectorT_);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template <class T>
-void CpuGpuVectorT<T>::copyToGpu() {
-  switch (*this->getSync()) {
-    case DATA_AT_CPU:
-      CHECK(cpuVectorT_);
-      this->resizeOrCreate(cpuVectorT_->getSize(), true);
-      gpuVectorT_->copyFrom(*cpuVectorT_);
-      setSync(SYNCED);
-      break;
-    case DATA_AT_GPU:
-    case SYNCED:
-      CHECK(gpuVectorT_);
-      break;
-    default:
-      LOG(FATAL) << "Not support";
-      break;
-  }
-}
-
-template class VectorT<real>;
-template class VectorT<int>;
-template class CpuVectorT<real>;
-template class CpuVectorT<int>;
-template class GpuVectorT<real>;
-template class GpuVectorT<int>;
-template class CpuGpuVectorT<real>;
-template class CpuGpuVectorT<int>;
-
-}  // namespace paddle
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
deleted file mode 100644
index 3efbc769dff5aa1dbc9d5015b0cbac313710d70d..0000000000000000000000000000000000000000
--- a/paddle/math/Vector.h
+++ /dev/null
@@ -1,726 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <cmath>
-#include <memory>
-
-#include <hl_gpu.h>
-
-#include "BaseMatrix.h"
-#include "MemoryHandle.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Thread.h"
-
-namespace paddle {
-
-template <class T>
-class GpuVectorT;
-template <class T>
-class CpuVectorT;
-
-template <class T>
-class BaseVector;
-
-class SyncThreadPool;
-
-class Matrix;
-
-template <class T>
-class BaseVector : public BaseMatrixT<T> {
-public:
-  BaseVector(size_t size, T* data, bool useGpu)
-      : BaseMatrixT<T>(1, size, data, false, useGpu), size_(this->width_) {}
-
-  ~BaseVector() {}
-
-protected:
-  size_t& size_;
-};
-
-/**
- * Copy or assignemnt constructor will share the data as opposed to making a
- * copy of the original data. To make a copy of the orinal data, use copyFrom()
- * instead.
- */
-template <class T>
-class VectorT : public BaseVector<T> {
-protected:
-  VectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset, bool useGpu)
-      : BaseVector<T>(size,
-                      reinterpret_cast<T*>(memoryHandle->getBuf()) + offset,
-                      useGpu) {
-    memoryHandle_ = memoryHandle;
-  }
-
-  // data is still owned by the caller.
-  // data should be valid during the life of this vector.
-  // Caller is responsible for release the memory.
-  VectorT(size_t size, T* data, bool useGpu)
-      : BaseVector<T>(size, data, useGpu) {}
-
-public:
-  virtual ~VectorT() {}
-
-  static std::shared_ptr<VectorT<T>> create(size_t size, bool useGpu);
-
-  static std::shared_ptr<VectorT<T>> create(T* data, size_t size, bool useGpu);
-
-  static std::shared_ptr<VectorT<T>> create(size_t size,
-                                            MemoryHandlePtr memoryHandle,
-                                            size_t offset = 0);
-
-  // owner can set SyncThreadPool,
-  // if not set, will use globalSyncThreadPool,
-  // which can be used in main thread only.
-  static std::shared_ptr<VectorT<T>> createParallelVector(
-      size_t size, bool useGpu, SyncThreadPool* pool = nullptr);
-
-  size_t getSize() const { return this->size_; }
-  const T* getData() const { return this->data_; }
-  T* getData() { return this->data_; }
-
-  virtual void zeroMem() = 0;
-  // set all elements to value
-  virtual void reset(const T& value) = 0;
-  // fill data by 0, 1, 2, ...
-  virtual void fillSequence() = 0;
-
-  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
-
-  /**
-   * resizing to a big vector will not preserve old values.
-   */
-  void resize(size_t newSize) {
-    if (!memoryHandle_ || newSize * sizeof(T) > memoryHandle_->getAllocSize()) {
-      memoryHandle_ = newMemory(newSize * sizeof(T));
-      this->data_ = reinterpret_cast<T*>(memoryHandle_->getBuf());
-    }
-    this->size_ = newSize;
-  }
-
-  static void resizeOrCreate(std::shared_ptr<VectorT<T>>& vec,
-                             size_t size,
-                             bool useGpu) {
-    if (vec) {
-      vec->resize(size);
-    } else {
-      vec = create(size, useGpu);
-    }
-  }
-
-  virtual MemoryHandlePtr newMemory(size_t size) = 0;
-
-  /**
-   * form sub vector from *src*, shallow copy
-   */
-  void subVecFrom(const VectorT<T>& src, size_t start, size_t size) {
-    CHECK_EQ(BaseVector<T>::useGpu_, src.useGpu_);
-    CHECK_LT(start, src.size_);
-    CHECK_LE(start + size, src.size_);
-
-    BaseVector<T>::size_ = size;
-    BaseVector<T>::data_ = const_cast<T*>(src.data_) + start;
-  }
-
-  std::shared_ptr<VectorT<T>> subVec(size_t start, size_t size) {
-    CHECK_LE(start + size, static_cast<size_t>(getSize()));
-    return VectorT<T>::create(getData() + start, size, BaseVector<T>::useGpu_);
-  }
-
-  /**
-   * form sub vector from *src*, shallow copy
-   */
-  void subVecFrom(const T* src, size_t start, size_t size) {
-    BaseVector<T>::size_ = size;
-    BaseVector<T>::data_ = const_cast<T*>(src) + start;
-  }
-
-  /**
-   * form sub vector from *src*, shallow copy
-   * in *interval* [interval.first, interval.second)
-   */
-  void subVecFrom(const VectorT<T>& src, std::pair<size_t, size_t> interval) {
-    subVecFrom(src, interval.first, interval.second - interval.first);
-  }
-
-  /**
-   * convert the vector to a sparse one_hot matrix of width idRange
-   * only applies to IVector
-   */
-  std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu);
-
-  /**
-   * @brief cast vector of "real" elements to "int" elements.
-   *
-   * @note: float -> int must be casted, or you'll get wrong data.
-   */
-  std::shared_ptr<VectorT<int>> castToInt();
-
-  /**
-   * This function will crash if the size of src and dest is different.
-   */
-  virtual void copyFrom(const VectorT<T>& src) = 0;
-
-  /**
-   * If GpuVector, this function is an asynchronous interface,
-   * will push the copy-task to the specifed-stream and return immediately.
-   *
-   * If CpuVector, this function is an synchronous interface,
-   * same as the copyFrom(const VectorT<T>& src).
-   */
-  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;
-
-  /**
-   * copy size elements from src
-   *
-   * If this is GpuVector, src can be cpu or gpu memory
-   *
-   * If this is CpuVector, src is assumed to be cpu memory
-   */
-  virtual void copyFrom(const T* src, size_t size) = 0;
-
-  /**
-   * copy size elements from src
-   *
-   * If this is GpuVector, src can be cpu or gpu memory
-   *
-   * If this is CpuVector, src is assumed to be cpu memory,
-   */
-  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream) = 0;
-
-  /**
-   * exec a func in single/multi thread
-   */
-  virtual void exec(SyncThreadPool::JobFunc func) { func(0, 1); }
-
-  /// Get the buffer point with beginPos
-  virtual T* getPoint(const uint64_t beginPos) = 0;
-
-  /// Get the value for the i'th element
-  virtual T getElement(size_t i) const = 0;
-  virtual void setElement(size_t i, const T& value) = 0;
-
-  //----------  math operations ----------------
-
-  // sum of the absolute value of each elements
-  virtual T getAbsSum() = 0;
-
-  virtual T getSum() = 0;
-  virtual T getMax() = 0;
-  virtual T getAbsMax() = 0;
-  virtual T getMin() = 0;
-
-  /// element-wise calc:  this = (b == value)
-  virtual void isEqualTo(const VectorT<T>& b, const T& value) = 0;
-
-  /// select elements indexed by *ids* from vector *src*
-  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids) = 0;
-
-  enum HistogramType {
-    HISTOGRAM_EXPONENT = 0,
-  };
-
-  /**
-   * @brief  print histogram of vector values
-   *
-   * @note   only exponent histogram supported currently
-   */
-  virtual void histogram(std::ostream& os, int type = HISTOGRAM_EXPONENT) = 0;
-
-  /// generate uniform random value for each element
-  virtual void rand() = 0;
-  /**
-   * generate uniform random value for each element,
-   * data range is from 0 to (classes - 1).
-   */
-  virtual void rand(size_t classes) = 0;
-
-  /**
-   * Debug use only. Very inefficient for GPU vector.
-   * get the value at pos.
-   */
-  virtual T get(size_t pos) = 0;
-
-  /**
-   * generate univariate Gaussian distributed random numbers
-   * with given mean and standardDeviation.
-   */
-  virtual void randnorm(real mean, real standardDeviation) = 0;
-
-  /**
-   * generate uniform distributed random numbers
-   * with given range.
-   */
-  virtual void uniform(real left, real right) = 0;
-
-  /// print the first "num" elements of the Vector
-  virtual void print(std::ostream& os, size_t num) const = 0;
-
-  /// print the "idx" element of the Vector
-  virtual void printOneElement(std::ostream& os, size_t idx) const = 0;
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (BaseVector<T>::useGpu_) {
-      TensorGpuApply<T>(*this, expr);
-    } else {
-      TensorCpuApply<T>(*this, expr);
-    }
-  }
-
-protected:
-  friend class GpuVectorT<T>;
-  friend class CpuVectorT<T>;
-  virtual void copyTo(CpuVectorT<T>* dest) const = 0;
-  virtual void copyTo(GpuVectorT<T>* dest) const = 0;
-  MemoryHandlePtr memoryHandle_;
-};
-
-template <class T>
-std::ostream& operator<<(std::ostream& os, const VectorT<T>& vec) {
-  vec.print(os, vec.getSize());
-  return os;
-}
-
-template <class T>
-class GpuVectorT : public VectorT<T> {
-public:
-  explicit GpuVectorT(size_t size);
-  GpuVectorT(size_t size, GpuMemHandlePtr memHandle, size_t offset)
-      : VectorT<T>(size, memHandle, offset, true) {}
-
-  // data is still owned by the caller.
-  // data should be valid during the life of this vector.
-  // Caller is responsible for release the memory.
-  GpuVectorT(size_t size, T* data) : VectorT<T>(size, data, true) {}
-
-  virtual MemoryHandlePtr newMemory(size_t size) {
-    return std::make_shared<GpuMemoryHandle>(size);
-  }
-  virtual void zeroMem();
-  virtual void reset(const T& value);
-  virtual void fillSequence();
-
-  virtual void copyFrom(const T* src, size_t size);
-  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream);
-  virtual void copyFrom(const VectorT<T>& src);
-  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream);
-  virtual T getElement(size_t i) const;
-  virtual void setElement(size_t i, const T& value);
-  virtual T* getPoint(const uint64_t beginPos);
-
-  virtual T getAbsSum();
-  virtual T getSum();
-  virtual T getMax();
-  virtual T getAbsMax();
-  virtual T getMin();
-  virtual void isEqualTo(const VectorT<T>& b, const T& value);
-  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids);
-  virtual void histogram(std::ostream& os, int type);
-  virtual void rand();
-  virtual void rand(size_t classes);
-  virtual void randnorm(real mean, real standardDeviation);
-  virtual void uniform(real left, real right);
-  virtual T get(size_t pos);
-  virtual void print(std::ostream& os, size_t num) const;
-  virtual void printOneElement(std::ostream& os, size_t idx) const;
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorGpuApply<T>(*this, expr);
-  }
-
-protected:
-  virtual void copyTo(CpuVectorT<T>* dest) const;
-  virtual void copyTo(GpuVectorT<T>* dest) const;
-};
-
-template <class T>
-class CpuVectorT : public VectorT<T> {
-public:
-  explicit CpuVectorT(size_t size);
-  CpuVectorT(size_t size, MemoryHandlePtr memoryHandle, size_t offset)
-      : VectorT<T>(size, memoryHandle, offset, false) {}
-
-  // data is still owned by the caller.
-  // data should be valid during the life of this vector.
-  // Caller is responsible for release the memory.
-  CpuVectorT(size_t size, T* data) : VectorT<T>(size, data, false) {}
-
-  /**
-   * If src is a CpuVector, the new CpuVector will share the data with src
-   *
-   * If src is a GpuVector, the new CpuVector will copy data from src
-   */
-  explicit CpuVectorT(const VectorT<T>& src);
-
-  virtual MemoryHandlePtr newMemory(size_t size) {
-    return std::make_shared<CpuMemoryHandle>(size);
-  }
-
-  virtual void zeroMem();
-  virtual void reset(const T& value);
-  virtual void fillSequence();
-  virtual void copyFrom(const T* src, size_t size);
-  virtual void copyFrom(const T* src, size_t size, hl_stream_t stream);
-  virtual void copyFrom(const VectorT<T>& src);
-  virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream);
-  virtual void copyTo(CpuVectorT<T>* dest) const;
-  virtual void copyTo(GpuVectorT<T>* dest) const;
-
-  /// Get the buffer point with beginPos
-  virtual T* getPoint(const uint64_t beginPos) {
-    return this->getData() + beginPos;
-  }
-
-  virtual T getElement(size_t i) const { return this->getData()[i]; }
-  virtual void setElement(size_t i, const T& value) {
-    this->getData()[i] = value;
-  }
-
-  virtual T getAbsSum();
-  virtual T getSum();
-  virtual T getMax();
-  virtual T getAbsMax();
-  virtual T getMin();
-  virtual void isEqualTo(const VectorT<T>& b, const T& value);
-  virtual void selectFrom(const VectorT<T>& src, const VectorT<int>& ids);
-  virtual void histogram(std::ostream& os, int type);
-  virtual void rand();
-  virtual void rand(size_t classes);
-  virtual void randnorm(real mean, real standardDeviation);
-  virtual void uniform(real left, real right);
-  virtual T get(size_t pos);
-  virtual void print(std::ostream& os, size_t num) const;
-  virtual void printOneElement(std::ostream& os, size_t idx) const;
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorCpuApply<T>(*this, expr);
-  }
-};
-
-template <class T>
-class ParallelCpuVectorT : public CpuVectorT<T> {
-public:
-  ParallelCpuVectorT(size_t size, SyncThreadPool* pool)
-      : CpuVectorT<T>(size), pool_(pool) {}
-
-  virtual void zeroMem() {
-    parallelExec([](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::zeroMem(); });
-  }
-  virtual void randnorm(real mean, real standardDeviation) {
-    parallelExec([=](CpuVectorT<T>& vec) {
-      vec.CpuVectorT<T>::randnorm(mean, standardDeviation);
-    });
-  }
-  virtual void uniform(real left, real right) {
-    parallelExec(
-        [=](CpuVectorT<T>& vec) { vec.CpuVectorT<T>::uniform(left, right); });
-  }
-
-  virtual void exec(SyncThreadPool::JobFunc jobFunc);
-
-private:
-  typedef std::function<void(CpuVectorT<T>& vec)> ExecFunc;
-  void parallelExec(ExecFunc func);
-  SyncThreadPool* pool_;
-};
-
-/**
- * A class to do conversion between CpuVector and GpuVector automatically.
- */
-template <class T>
-class CpuGpuVectorT {
-public:
-  /**
-   * @brief An enum type of SyncedFlag using to
-   *        mark data memory is in CPU or GPU.
-   *
-   * DATA_AT_CPU: data is located in CPU.
-   *
-   * DATA_AT_GPU: data is located in GPU.
-   *
-   * SYNCED: data is located in CPU and GPU simultaneously.
-   */
-  enum SyncedFlag { DATA_AT_CPU = 0, DATA_AT_GPU = 1, SYNCED = 2 };
-
-  /**
-   * @brief A constructor, create cpuVectorT_ or gpuVectorT_.
-   *
-   * @param[in] size    data size.
-   * @param[in] useGpu  use gpu or not.
-   */
-  explicit CpuGpuVectorT(size_t size, bool useGpu);
-
-  /**
-   * @brief A constructor, create CpuGpuVectorT by VectorT.
-   *
-   * If src is CpuVector, cpuVectorT_ is shared data with src.
-   *
-   * If src is GpuVector, gpuVectorT_ is shared data with src.
-   */
-  explicit CpuGpuVectorT(const std::shared_ptr<VectorT<T>>& src);
-
-  /**
-   * @brief A constructor.
-   *
-   * If useGpu is true, data should be located in device and
-   * create gpuVectorT_ with data.
-   *
-   * If useGpu is false, data should be located in host and
-   * create cpuVectorT_ with data.
-   *
-   * @note Data is owned by the caller and should be valid during
-   *       the life of this vector.
-   *       Caller is responsible for release the memory.
-   */
-  CpuGpuVectorT(size_t size, T* data, bool useGpu);
-
-  CpuGpuVectorT(CpuGpuVectorT<T>& src, size_t offset, size_t size);
-
-  virtual ~CpuGpuVectorT() {}
-
-  static std::shared_ptr<CpuGpuVectorT<T>> create(size_t size, bool useGpu);
-
-  /**
-   * @brief resize vector.
-   *
-   * If useGpu is true, resize gpuVectorT_ and set syncFlag_ to DATA_AT_GPU,
-   *
-   * otherwise resize cpuVectorT_ and set syncFlag_ to DATA_AT_CPU.
-   */
-  void resize(size_t size, bool useGpu);
-
-  /**
-   * @brief resize or create CpuGpuVectorT.
-   */
-  static void resizeOrCreate(std::shared_ptr<CpuGpuVectorT<T>>& vec,
-                             size_t size,
-                             bool useGpu);
-
-  /**
-   * @brief return a const cpuVectorT_ or gpuVectorT_.
-   *
-   * If useGpu is true, return gpuVectorT_.
-   *
-   * If useGpu is false, return cpuVectorT_.
-   *
-   * @note Caller should not change the data.
-   *       If caller changes const attribute,
-   *       should set syncFlag_.
-   */
-  std::shared_ptr<const VectorT<T>> getVector(bool useGpu) const;
-
-  /**
-   * @brief return a const cpuVectorT_ or gpuVectorT_.
-   *
-   * @note: This interface will change syncFlag_, so if you will
-   *        not change the data, you should call getVector.
-   */
-  std::shared_ptr<VectorT<T>>& getMutableVector(bool useGpu);
-
-  /**
-   * @brief return const T* data.
-   *
-   * If useGpu is true, return device data.
-   *
-   * If useGpu is false, return host data.
-   */
-  const T* getData(bool useGpu) const;
-
-  // TODO(yuyang18): Make getData more c++ style.
-  //  inline T* getData(bool useGpu) {
-  //    return getMutableData(useGpu);
-  //  }
-
-  T* getMutableData(bool useGpu);
-
-  /**
-   * If useGpu is true, gpuVectorT_->Op().
-   *
-   * If useGpu is false, cpuVectorT_->Op().
-   *
-   * Op is zeroMem, fillSequence, ...
-   */
-  void zeroMem(bool useGpu);
-  void fillSequence(bool useGpu);
-  void setElement(size_t i, const T& value, bool useGpu);
-
-  /**
-   * @brief return i-th element.
-   */
-  T getElement(size_t i) const;
-
-  /**
-   * @brief return vector size.
-   */
-  size_t getSize() const {
-    size_t size = 0;
-    switch (*sync_) {
-      case SYNCED:
-      case DATA_AT_CPU:
-        size = cpuVectorT_->getSize();
-        break;
-      case DATA_AT_GPU:
-        size = gpuVectorT_->getSize();
-        break;
-      default:
-        LOG(FATAL) << "Not support";
-        break;
-    }
-    return size;
-  }
-
-  /// copy data to cpuVectorT_.
-  inline void copyToCpu(const T* data, size_t size) {
-    this->resizeOrCreate(size, false);
-    cpuVectorT_->copyFrom(data, size);
-    setSync(DATA_AT_CPU);
-  }
-  /// copy data to cpuVectorT_ using specifed-stream.
-  inline void copyToCpu(const T* data, size_t size, hl_stream_t stream) {
-    this->resizeOrCreate(size, false);
-    cpuVectorT_->copyFrom(data, size, stream);
-    setSync(DATA_AT_CPU);
-  }
-
-  /// copy data to gpuVectorT_.
-  inline void copyToGpu(const T* data, size_t size) {
-    this->resizeOrCreate(size, true);
-    gpuVectorT_->copyFrom(data, size);
-    setSync(DATA_AT_GPU);
-  }
-  /// copy data to gpuVectorT_ using specifed-stream.
-  inline void copyToGpu(const T* data, size_t size, hl_stream_t stream) {
-    this->resizeOrCreate(size, true);
-    gpuVectorT_->copyFrom(data, size, stream);
-    setSync(DATA_AT_GPU);
-  }
-
-  /**
-   * @brief copy from src using specifed-stream.
-   *
-   * If src is CpuVectorT, copy to cpuVectorT_.
-   *
-   * If src is GpuVectorT, copy to gpuVectorT_.
-   */
-  void copyFrom(const VectorT<T>& src, hl_stream_t stream);
-
-  /**
-   * @brief copy data.
-   *
-   * If useGpu is false, copy host data to cpuVectorT_.
-   *
-   * If useGpu is true, copy device data to gpuVectorT_.
-   *
-   * @note  data address should consistent with useGpu.
-   */
-  void copyFrom(const T* data, size_t size, bool useGpu);
-  void copyFrom(const T* data, size_t size, hl_stream_t stream, bool useGpu);
-
-  /**
-   * @brief copy from (src + offset) using specifed-stream.
-   */
-  void copyFrom(CpuGpuVectorT<T>& src,
-                size_t offset,
-                size_t size,
-                bool useGpu,
-                hl_stream_t stream);
-
-  /**
-   * @brief copy from src using specifed-stream.
-   */
-  void copyFrom(CpuGpuVectorT<T>& src, hl_stream_t stream);
-
-  /**
-   * @brief return sync_.
-   */
-  inline SyncedFlag* getSync() const { return sync_; }
-
-  /**
-   * @brief set sync_.
-   */
-  inline void setSync(SyncedFlag* sync) { sync_ = sync; }
-
-  inline void setSync(SyncedFlag syncFlag) {
-    if (sync_) {
-      *sync_ = syncFlag;
-    } else {
-      syncFlag_ = syncFlag;
-      sync_ = &syncFlag_;
-    }
-  }
-
-  inline void setSync(bool useGpu) {
-    SyncedFlag flag = useGpu ? DATA_AT_GPU : DATA_AT_CPU;
-    setSync(flag);
-  }
-
-protected:
-  void resizeOrCreate(size_t size, bool useGpu);
-
-  /**
-   * @brief copy between cpuVectorT_ and gpuVectorT_.
-   *
-   * If syncFlag_ is DATA_AT_CPU and SYNCED, do nothing.
-   *
-   * If syncFlag_ is DATA_AT_GPU, copy gpuVectorT_ to cpuVectorT_
-   *   and set syncFlag_ to SYNCED.
-   */
-  void copyToCpu();
-
-  /**
-   * @brief copy between cpuVectorT_ and gpuVectorT_.
-   *
-   * If syncFlag_ is DATA_AT_GPU and SYNCED, do nothing.
-   *
-   * If syncFlag_ is DATA_AT_CPU, copy cpuVectorT_ to gpuVectorT_
-   *   and set syncFlag_ to SYNCED.
-   */
-  void copyToGpu();
-
-  /// host pointer.
-  std::shared_ptr<VectorT<T>> cpuVectorT_;
-  /// device pointer.
-  std::shared_ptr<VectorT<T>> gpuVectorT_;
-  /// specify current data address.
-  SyncedFlag syncFlag_;
-  SyncedFlag* sync_;
-};
-
-typedef VectorT<real> Vector;
-typedef CpuVectorT<real> CpuVector;
-typedef GpuVectorT<real> GpuVector;
-
-typedef VectorT<int> IVector;
-typedef CpuVectorT<int> CpuIVector;
-typedef GpuVectorT<int> GpuIVector;
-
-typedef std::shared_ptr<Vector> VectorPtr;
-typedef std::shared_ptr<CpuVector> CpuVectorPtr;
-typedef std::shared_ptr<GpuVector> GpuVectorPtr;
-
-typedef std::shared_ptr<IVector> IVectorPtr;
-typedef std::shared_ptr<CpuIVector> CpuIVectorPtr;
-typedef std::shared_ptr<GpuIVector> GpuIVectorPtr;
-
-typedef CpuGpuVectorT<real> CpuGpuVector;
-typedef CpuGpuVectorT<int> ICpuGpuVector;
-typedef std::shared_ptr<CpuGpuVector> CpuGpuVectorPtr;
-typedef std::shared_ptr<ICpuGpuVector> ICpuGpuVectorPtr;
-
-}  // namespace paddle
diff --git a/paddle/math/tests/OriginalOptimizerApi.h b/paddle/math/tests/OriginalOptimizerApi.h
deleted file mode 100644
index e30d784b232dd7d477877d3f7c90cd185357328c..0000000000000000000000000000000000000000
--- a/paddle/math/tests/OriginalOptimizerApi.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/math/Vector.h"
-#include "paddle/utils/GlobalConstants.h"
-
-using namespace paddle;  // NOLINT
-
-void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
-                                      real alpha,
-                                      real beta,
-                                      real gamma,
-                                      real tau,
-                                      real learningRate) {
-  vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
-                                   -alpha * gamma * learningRate);
-  vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
-                                   tau * alpha * gamma * learningRate);
-  vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
-                             tau / beta + 1.0 / alpha,
-                             *vecs[PARAMETER_MOMENTUM_VT],
-                             1.0 / beta);
-}
-
-void AdagradParameterOptimizer(const VectorPtr vecs[],
-                               real epsilon,
-                               real learningRate,
-                               real momentum,
-                               real decayRate) {
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
-                                                1.0f);
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
-                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
-  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
-                                real rou,
-                                real epsilon,
-                                real learningRate,
-                                real momentum,
-                                real decayRate) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
-
-  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
-  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
-                                        epsilon,
-                                        epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->sqrt2();
-
-  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
-      *vecs[PARAMETER_GRADIENT],
-      *vecs[PARAMETER_LEARNING_RATE],
-      rou,
-      1.0f - rou);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void RMSPropParameterOptimizer(const VectorPtr vecs[],
-                               real accumulatedRou,
-                               real rou,
-                               real epsilon,
-                               real learningRate,
-                               real momentum,
-                               real decayRate,
-                               bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
-
-  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
-  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
-  // Basiclly if the sign of the gradient changes more often,
-  // the learning rate will be decreased.
-  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
-  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
-                                           -1.0f);
-  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
-                                      real accumulatedRou,
-                                      real rou,
-                                      real epsilon,
-                                      real learningRate,
-                                      real momentum,
-                                      real decayRate,
-                                      bool firstTime) {
-  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
-  // For the first time update, make the sum be the current square
-  // so that the initial estimation of E(g_t^2) will not be too small.
-  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
-      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
-
-  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
-  // Basiclly if the bigger the magnitude gradient is,
-  // the smaller the learning rate will be.
-  vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
-  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
-  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
-
-  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                   *vecs[PARAMETER_MOMENTUM],
-                                   *vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate,
-                                   momentum,
-                                   decayRate);
-}
-
-void AdamParameterOptimizer(const VectorPtr vecs[],
-                            real beta1,
-                            real beta2,
-                            real beta1_power,
-                            real beta2_power,
-                            real epsilon,
-                            real learningRate) {
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
-  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
-  Vector* theta = vecs[PARAMETER_VALUE].get();
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  m->add(*g, beta1, 1 - beta1);
-
-  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
-  g->square2();
-  v->add(*g, beta2, 1 - beta2);
-
-  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
-  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
-  g->sqrt2(*v);
-  g->dotDiv(*m, *g, 0., epsilon);
-  real alpha =
-      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
-  theta->add(*theta, 1.0, *g, -alpha);
-}
-
-void AdamaxParameterOptimizer(
-    const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
-  Vector* m = vecs[PARAMETER_MOMENTUM].get();
-  Vector* g = vecs[PARAMETER_GRADIENT].get();
-  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
-  Vector* theta = vecs[PARAMETER_VALUE].get();
-
-  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
-  m->add(*g, beta1, 1 - beta1);
-
-  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
-  u->mulScalar(beta2);
-  g->abs2();
-  u->max2(*u, *g);
-
-  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
-  g->dotDiv(*m, *u);
-  real learningRate = alpha / (1 - std::pow(beta1, step));
-  theta->add(*theta, 1.0, *g, -learningRate);
-}
diff --git a/paddle/math/tests/PerfUtils.h b/paddle/math/tests/PerfUtils.h
deleted file mode 100644
index bee2351e2fb80f9ccef670535c92485389f0c51a..0000000000000000000000000000000000000000
--- a/paddle/math/tests/PerfUtils.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-// Performance Check
-#ifdef PADDLE_DISABLE_TIMER
-
-#define EXPRESSION_PERFORMANCE(expression) expression;
-
-#else
-
-#include "paddle/utils/Stat.h"
-using namespace paddle;  // NOLINT
-
-#define EXPRESSION_PERFORMANCE(expression)                             \
-  do {                                                                 \
-    char expr[30];                                                     \
-    strncpy(expr, #expression, 30);                                    \
-    if (expr[29] != '\0') {                                            \
-      expr[27] = '.';                                                  \
-      expr[28] = '.';                                                  \
-      expr[29] = '\0';                                                 \
-    }                                                                  \
-    expression;                                                        \
-    for (int i = 0; i < 20; i++) {                                     \
-      REGISTER_TIMER(expr);                                            \
-      expression;                                                      \
-    }                                                                  \
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
-              << *globalStat.getStat(expr);                            \
-    globalStat.reset();                                                \
-  } while (0)
-
-#endif
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
deleted file mode 100644
index f4332ede36356bc666612a240448c1be71e5170e..0000000000000000000000000000000000000000
--- a/paddle/math/tests/TensorCheck.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-/**
- * This file provides a TensorCheck template function, which can be used to
- * compare CpuMatrix and GpuMatrix, CpuVector and GpuVector, and so on.
- */
-
-#include <cmath>
-#include "paddle/math/Matrix.h"
-
-namespace autotest {
-
-using paddle::Matrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using paddle::VectorT;
-using paddle::CpuVectorT;
-using paddle::GpuVectorT;
-
-class AssertEqual {
-public:
-  AssertEqual(real err = 0) : err_(err) {}
-
-  inline bool operator()(real a, real b) {
-    if (err_ == 0) {
-      if (a != b) {
-        return false;
-      }
-    } else {
-      if (std::fabs(a - b) > err_) {
-        if ((std::fabs(a - b) / std::fabs(a)) > (err_ / 10.0f)) {
-          return false;
-        }
-      }
-    }
-
-    return true;
-  }
-
-private:
-  real err_;
-};
-
-template <typename Tensor>
-class CopyToCpu;
-
-template <>
-class CopyToCpu<CpuMatrix> {
-public:
-  explicit CopyToCpu(const CpuMatrix& arg) : arg_(arg) {}
-  const CpuMatrix& copiedArg() const { return arg_; }
-
-private:
-  const CpuMatrix& arg_;
-};
-
-template <>
-class CopyToCpu<GpuMatrix> {
-public:
-  explicit CopyToCpu(const GpuMatrix& arg)
-      : arg_(arg.getHeight(), arg.getWidth()) {
-    arg_.copyFrom(arg);
-  }
-  CpuMatrix& copiedArg() { return arg_; }
-
-private:
-  CpuMatrix arg_;
-};
-
-template <>
-class CopyToCpu<Matrix> {
-public:
-  explicit CopyToCpu(const Matrix& arg)
-      : arg_(arg.getHeight(), arg.getWidth()) {
-    arg_.copyFrom(arg);
-  }
-  CpuMatrix& copiedArg() { return arg_; }
-
-private:
-  CpuMatrix arg_;
-};
-
-template <typename T>
-class CopyToCpu<CpuVectorT<T>> {
-public:
-  explicit CopyToCpu(const CpuVectorT<T>& arg) : arg_(arg) {}
-  const CpuVectorT<T>& copiedArg() const { return arg_; }
-
-private:
-  const CpuVectorT<T>& arg_;
-};
-
-template <typename T>
-class CopyToCpu<GpuVectorT<T>> {
-public:
-  explicit CopyToCpu(const GpuVectorT<T>& arg) : arg_(arg.getSize()) {
-    arg_.copyFrom(arg);
-  }
-  CpuVectorT<T>& copiedArg() { return arg_; }
-
-private:
-  CpuVectorT<T> arg_;
-};
-
-template <typename T>
-class CopyToCpu<VectorT<T>> {
-public:
-  explicit CopyToCpu(const VectorT<T>& arg) : arg_(arg.getSize()) {
-    arg_.copyFrom(arg);
-  }
-  CpuVectorT<T>& copiedArg() { return arg_; }
-
-private:
-  CpuVectorT<T> arg_;
-};
-
-template <typename AssertEq>
-void TensorCheck(AssertEq compare,
-                 const CpuMatrix& matrix1,
-                 const CpuMatrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (!compare(a, b)) {
-        count++;
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-template <typename AssertEq, class T>
-void TensorCheck(AssertEq compare,
-                 const CpuVectorT<T>& vector1,
-                 const CpuVectorT<T>& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-
-  const T* data1 = vector1.getData();
-  const T* data2 = vector2.getData();
-  size_t size = vector1.getSize();
-  int count = 0;
-  for (size_t i = 0; i < size; i++) {
-    real a = data1[i];
-    real b = data2[i];
-    if (!compare(a, b)) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
-}
-
-template <typename AssertEq, typename Tensor1, typename Tensor2>
-void TensorCheck(AssertEq compare,
-                 const Tensor1& tensor1,
-                 const Tensor2& tensor2) {
-  TensorCheck(compare,
-              CopyToCpu<Tensor1>(tensor1).copiedArg(),
-              CopyToCpu<Tensor2>(tensor2).copiedArg());
-}
-
-template <typename AssertEq>
-void TensorCheck(AssertEq compare, real args1, real args2) {
-  EXPECT_EQ(compare(args1, args2), true) << "[Test error] args1 = " << args1
-                                         << ", args2 = " << args2;
-}
-
-template <typename AssertEq>
-void TensorCheck(AssertEq compare, size_t args1, size_t args2) {
-  EXPECT_EQ(args1, args2) << "[Test error] args1 = " << args1
-                          << ", args2 = " << args2;
-}
-
-template <typename Tensor1, typename Tensor2>
-void TensorCheckEqual(const Tensor1& tensor1, const Tensor2& tensor2) {
-  AssertEqual compare(0);
-  TensorCheck(compare,
-              CopyToCpu<Tensor1>(tensor1).copiedArg(),
-              CopyToCpu<Tensor2>(tensor2).copiedArg());
-}
-
-template <typename Tensor1, typename Tensor2>
-void TensorCheckErr(const Tensor1& tensor1, const Tensor2& tensor2) {
-#ifndef PADDLE_TYPE_DOUBLE
-  AssertEqual compare(1e-3);
-#else
-  AssertEqual compare(1e-10);
-#endif
-  TensorCheck(compare,
-              CopyToCpu<Tensor1>(tensor1).copiedArg(),
-              CopyToCpu<Tensor2>(tensor2).copiedArg());
-}
-
-}  // namespace autotest
diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h
deleted file mode 100644
index d2b9706432f84fa082e071eb09d2ffe7402a085f..0000000000000000000000000000000000000000
--- a/paddle/math/tests/TestUtils.h
+++ /dev/null
@@ -1,294 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-/**
- * This file provides a AutoCompare calss to simplify the comparison
- * of CPU and GPU member functions.
- *
- * This takes two steps
- * 1. Construct an AutoCompare object.
- *    When constructing an AutoCompare object, you can set the err argument
- * to specify the maximum error for CPU and GPU functions.
- *
- * 2. Use the template functions cmpWithArg or cmpWithoutArg.
- * A. [cmpWithArg] Requires the caller construct the cpu arguments.
- *
- *  AutoCompare test;
- *  Init Argument arg1,arg2...
- *  test.cmpWithArg(function, arg1, arg2....)
- *
- * B. [cmpWithoutArg] The caller do not need construct arguments.
- *    If matrix used in these functions arguments is the same size.
- *    Such as the element wise function and the aggregate function
- *    defined in the BaseMatrix.cpp.
- *
- *  AutoCompare test;
- *  test.cmpWithoutArg<I...>(function, height, width)
- */
-
-#include <gtest/gtest.h>
-#include "TensorCheck.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-
-namespace autotest {
-
-using paddle::BaseMatrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using paddle::CpuIVector;
-using paddle::GpuIVector;
-using paddle::CpuSparseMatrix;
-using paddle::GpuSparseMatrix;
-
-template <typename T1, typename T2>
-class ReplaceType {
-public:
-  typedef T1 type;
-};
-
-template <>
-class ReplaceType<BaseMatrix, CpuMatrix> {
-public:
-  typedef CpuMatrix type;
-};
-
-template <>
-class ReplaceType<BaseMatrix, GpuMatrix> {
-public:
-  typedef GpuMatrix type;
-};
-
-template <>
-class ReplaceType<Matrix, CpuMatrix> {
-public:
-  typedef CpuMatrix type;
-};
-
-template <>
-class ReplaceType<Matrix, GpuMatrix> {
-public:
-  typedef GpuMatrix type;
-};
-
-// construct a argument
-template <typename T>
-T construct(int height, int width);
-
-template <>
-float construct(int height, int width) {
-  return 0.5;
-}
-
-template <>
-double construct(int height, int width) {
-  return 0.5;
-}
-
-template <>
-size_t construct(int height, int width) {
-  size_t offset = std::rand() % (height < width ? height : width);
-  return offset;
-}
-
-template <>
-CpuMatrix construct(int height, int width) {
-  CpuMatrix a(height, width);
-  return a;
-}
-
-template <>
-GpuMatrix construct(int height, int width) {
-  GpuMatrix a(height, width);
-  return a;
-}
-
-// init a argument
-template <typename T>
-void init(T& v) {
-  return;
-}
-
-template <>
-void init(CpuMatrix& v) {
-  v.randomizeUniform();
-}
-
-template <>
-void init(GpuMatrix& v) {
-  v.randomizeUniform();
-}
-
-// init a tuple which contains a set of arguments.
-template <std::size_t I = 0, typename... Args>
-inline typename std::enable_if<I == sizeof...(Args), void>::type initTuple(
-    std::tuple<Args...>& t) {}
-
-template <std::size_t I = 0, typename... Args>
-    inline typename std::enable_if <
-    I<sizeof...(Args), void>::type initTuple(std::tuple<Args...>& t) {
-  init(std::get<I>(t));
-  initTuple<I + 1>(t);
-}
-
-// copy a argument, copy src to dest
-template <typename T1, typename T2>
-void copy(T1& dest, T2& src) {
-  dest = src;
-}
-
-template <>
-void copy(GpuMatrix& dest, CpuMatrix& src) {
-  dest.copyFrom(src);
-}
-
-// copy a tuple, copy src to dest
-template <std::size_t I = 0, typename... Args1, typename... Args2>
-inline typename std::enable_if<I == sizeof...(Args1), void>::type copyTuple(
-    std::tuple<Args1...>& dest, std::tuple<Args2...>& src) {}
-
-template <std::size_t I = 0, typename... Args1, typename... Args2>
-    inline typename std::enable_if <
-    I<sizeof...(Args1), void>::type copyTuple(std::tuple<Args1...>& dest,
-                                              std::tuple<Args2...>& src) {
-  copy(std::get<I>(dest), std::get<I>(src));
-  copyTuple<I + 1>(dest, src);
-}
-
-// call member function
-template <typename C,
-          typename FC,
-          typename R,
-          typename... FArgs,
-          typename... Args>
-R call(C& obj, R (FC::*f)(FArgs...), Args&&... args) {
-  return (obj.*f)(args...);
-}
-
-template <typename T>
-class ReturnType {
-public:
-  typedef T type;
-};
-
-template <>
-class ReturnType<CpuMatrix> {
-public:
-  typedef GpuMatrix type;
-};
-
-template <>
-class ReturnType<CpuIVector> {
-public:
-  typedef GpuIVector type;
-};
-
-template <>
-class ReturnType<CpuSparseMatrix> {
-public:
-  typedef GpuSparseMatrix type;
-};
-
-template <typename T>
-typename ReturnType<T>::type autoArgs(T& v) {
-  return v;
-}
-
-template <>
-GpuMatrix autoArgs(CpuMatrix& v) {
-  GpuMatrix a(v.getHeight(), v.getWidth());
-  a.copyFrom(v);
-  return a;
-}
-
-template <>
-GpuIVector autoArgs(CpuIVector& v) {
-  GpuIVector a(v.getSize());
-  a.copyFrom(v);
-  return a;
-}
-
-template <>
-GpuSparseMatrix autoArgs(CpuSparseMatrix& v) {
-  GpuSparseMatrix a(v.getHeight(),
-                    v.getWidth(),
-                    v.getElementCnt(),
-                    v.getValueType(),
-                    v.getFormat());
-  a.copyFrom(v, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  return a;
-}
-
-class AutoCompare {
-public:
-  /**
-   * err is the allowed calculation error.
-   * The smaller the value of err,
-   * the stricter the comparison is between CPU and GPU calculations.
-   */
-  AutoCompare(size_t height, size_t width, real err = 1e-3)
-      : cpu(height, width), gpu(height, width), compare(err) {
-    init(cpu);
-    copy(gpu, cpu);
-  }
-
-  template <typename C, typename R, typename... FArgs, typename... Args>
-  void cmpWithArg(R (C::*f)(FArgs...), Args&&... args) {
-    static_assert(sizeof...(FArgs) == sizeof...(Args),
-                  "size of parameter packs are not equal");
-    call(cpu, f, args...);
-    call(gpu, f, autoArgs(args)...);
-
-    TensorCheck(compare, cpu, gpu);
-  }
-
-  template <std::size_t... I, typename C, typename R, typename... Args>
-  void cmpWithoutArg(R (C::*f)(Args...), size_t height, size_t width) {
-    static_assert(sizeof...(I) == sizeof...(Args),
-                  "size of parameter packs are not equal");
-    (void)height;
-    (void)width;
-    auto tuple1 = std::make_tuple(
-        construct<typename ReplaceType<
-            typename std::decay<
-                typename std::tuple_element<I,
-                                            std::tuple<Args...>>::type>::type,
-            CpuMatrix>::type>(height, width)...);
-
-    auto tuple2 = std::make_tuple(
-        construct<typename ReplaceType<
-            typename std::decay<
-                typename std::tuple_element<I,
-                                            std::tuple<Args...>>::type>::type,
-            GpuMatrix>::type>(height, width)...);
-
-    initTuple(tuple1);
-    copyTuple(tuple2, tuple1);
-
-    call(cpu, f, std::get<I>(tuple1)...);
-    call(gpu, f, std::get<I>(tuple2)...);
-
-    TensorCheck(compare, cpu, gpu);
-  }
-
-protected:
-  CpuMatrix cpu;
-  GpuMatrix gpu;
-  AssertEqual compare;
-};
-
-}  // namespace autotest
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
deleted file mode 100644
index 84bc1c1d9e0a8368a69c1e53a63056eb45b9239f..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_Allocator.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-#define private public
-#include "paddle/math/Allocator.h"
-#include "paddle/math/MemoryHandle.h"
-#include "paddle/math/PoolAllocator.h"
-
-using namespace paddle;  // NOLINT
-
-template <typename Allocator>
-void testPoolAllocator() {
-  PoolAllocator* pool =
-      new PoolAllocator(new Allocator(), /* sizeLimit */ 1024);
-
-  /* alloc from system memory */
-  void* ptr1 = pool->alloc(10);
-  void* ptr2 = pool->alloc(200);
-  void* ptr3 = pool->alloc(200);
-  pool->free(ptr1, 10);
-  pool->free(ptr2, 200);
-  pool->free(ptr3, 200);
-  pool->printAll();
-  EXPECT_EQ((size_t)2, pool->pool_.size());
-  EXPECT_EQ((size_t)1, pool->pool_[10].size());
-  EXPECT_EQ((size_t)2, pool->pool_[200].size());
-  EXPECT_EQ(ptr1, pool->pool_[10][0]);
-  EXPECT_EQ(ptr2, pool->pool_[200][0]);
-  EXPECT_EQ(ptr3, pool->pool_[200][1]);
-
-  /* alloc from pool */
-  void* ptr4 = pool->alloc(10);
-  void* ptr5 = pool->alloc(200);
-  pool->printAll();
-  EXPECT_EQ((size_t)0, pool->pool_[10].size());
-  EXPECT_EQ((size_t)1, pool->pool_[200].size());
-  EXPECT_EQ(ptr1, ptr4);
-  EXPECT_EQ(ptr3, ptr5);
-  pool->free(ptr4, 10);
-  pool->free(ptr5, 200);
-
-  /* alloc size > sizeLimit */
-  void* ptr6 = pool->alloc(1024);
-  pool->free(ptr6, 1024);
-  EXPECT_LE((size_t)1024, pool->poolMemorySize_);
-
-  void* ptr7 = pool->alloc(1);
-  EXPECT_EQ((size_t)0, pool->poolMemorySize_);
-  EXPECT_EQ((size_t)0, pool->pool_.size());
-  pool->free(ptr7, 1);
-
-  delete pool;
-}
-
-TEST(Allocator, Pool) {
-  testPoolAllocator<CpuAllocator>();
-#ifdef PADDLE_WITH_CUDA
-  testPoolAllocator<GpuAllocator>();
-#endif
-}
-
-TEST(MemoryHandle, Cpu) {
-  for (auto size : {10, 30, 50, 100, 200, 512, 1000, 1023, 1024, 1025, 8193}) {
-    CpuMemoryHandle handle(size);
-    EXPECT_LE(handle.getSize(), handle.getAllocSize());
-  }
-
-  void* ptr1;
-  void* ptr2;
-  {
-    CpuMemoryHandle handle(256);
-    ptr1 = handle.getBuf();
-  }
-  {
-    CpuMemoryHandle handle(256);
-    ptr2 = handle.getBuf();
-  }
-  EXPECT_EQ(ptr1, ptr2);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(MemoryHandle, Gpu) {
-  int numGpu = hl_get_device_count();
-
-  /* alloc from system memory */
-  void* ptr3[numGpu];
-  void* ptr4[numGpu];
-  for (int i = 0; i < numGpu; i++) {
-    SetDevice device(i);
-    GpuMemoryHandle handle1(30);
-    GpuMemoryHandle handle2(30);
-    GpuMemoryHandle handle3(4000);
-    GpuMemoryHandle handle4(500);
-    ptr3[i] = handle3.getBuf();
-    ptr4[i] = handle4.getBuf();
-  }
-
-  /* alloc from pool */
-  for (int i = 0; i < numGpu; i++) {
-    SetDevice device(i);
-    GpuMemoryHandle handle1(30);
-    GpuMemoryHandle handle3(4000);
-    GpuMemoryHandle handle4(500);
-    EXPECT_EQ(ptr3[i], handle3.getBuf());
-    EXPECT_EQ(ptr4[i], handle4.getBuf());
-  }
-}
-#endif
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
deleted file mode 100644
index 6f7beb60c8f535d51b18c4984b89d1972f4c82bd..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/**
- * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
- * implementation of CPU and GPU member function in
- * BaseMatrix.cpp and Matrix.cpp.
- */
-
-#include <gtest/gtest.h>
-#include "TestUtils.h"
-#include "paddle/math/BaseMatrix.h"
-
-using paddle::BaseMatrix;
-using paddle::Matrix;
-using autotest::AutoCompare;
-
-// Test all void (BaseMatrix::*)() function
-TEST(BaseMatrix, void) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)()) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg(f, height, width);
-      };
-
-      compare(&BaseMatrix::neg);
-      compare(&BaseMatrix::exp2);
-      compare(&BaseMatrix::log2);
-      compare(&BaseMatrix::sqrt2);
-      compare(&BaseMatrix::square2);
-      compare(&BaseMatrix::reciprocal2);
-      compare(&BaseMatrix::abs2);
-      compare(&BaseMatrix::sign2);
-      compare(&BaseMatrix::zero);
-      compare(&BaseMatrix::one);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(real) function
-TEST(BaseMatrix, real) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(real)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0>(f, height, width);
-      };
-
-      compare(&BaseMatrix::pow2);
-      compare(&BaseMatrix::subScalar);
-      compare(&BaseMatrix::mulScalar);
-      compare(&BaseMatrix::divScalar);
-      compare(&BaseMatrix::assign);
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::biggerThanScalar);
-      compare(&BaseMatrix::downClip);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(BaseMatrix&) function
-TEST(BaseMatrix, BaseMatrix) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0>(f, height, width);
-      };
-
-      compare(&BaseMatrix::assign);
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::relu);
-      compare(&BaseMatrix::reluDerivative);
-      compare(&BaseMatrix::softrelu);
-      compare(&BaseMatrix::softreluDerivative);
-      compare(&BaseMatrix::brelu);
-      compare(&BaseMatrix::breluDerivative);
-      compare(&BaseMatrix::square2);
-      compare(&BaseMatrix::squareDerivative);
-      compare(&BaseMatrix::tanh);
-      compare(&BaseMatrix::tanhDerivative);
-      compare(&BaseMatrix::reciprocal2);
-      compare(&BaseMatrix::reciprocalDerivative);
-      compare(&BaseMatrix::abs2);
-      compare(&BaseMatrix::absDerivative);
-      compare(&BaseMatrix::sigmoid);
-      compare(&BaseMatrix::sigmoidDerivative);
-      compare(&BaseMatrix::expDerivative);
-      compare(&BaseMatrix::sign2);
-      compare(&BaseMatrix::exp2);
-      compare(&BaseMatrix::log2);
-      compare(&BaseMatrix::sqrt2);
-      compare(&BaseMatrix::dotMul);
-      compare(&BaseMatrix::dotMulSquare);
-      compare(&BaseMatrix::dotSquareMul);
-      compare(&BaseMatrix::addColVector);
-      compare(&BaseMatrix::addRowVector);
-      compare(&BaseMatrix::mulRowVector);
-      compare(&BaseMatrix::divRowVector);
-      compare(&BaseMatrix::mulColVector);
-      compare(&BaseMatrix::divColVector);
-      compare(&BaseMatrix::addP2P);
-      compare(&BaseMatrix::invSqrt);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(real, real) function
-TEST(BaseMatrix, real_real) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(real, real)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0, 1>(f, height, width);
-      };
-
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::clip);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(BaseMatrix&, real) function
-TEST(BaseMatrix, BaseMatrix_real) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height, width](void (BaseMatrix::*f)(BaseMatrix&, real)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0, 1>(f, height, width);
-      };
-
-      compare(&BaseMatrix::addBias);
-      compare(&BaseMatrix::add);
-      compare(&BaseMatrix::sub);
-      compare(&BaseMatrix::pow2);
-      compare(&BaseMatrix::addScalar);
-      compare(&BaseMatrix::subScalar);
-      compare(&BaseMatrix::mulScalar);
-      compare(&BaseMatrix::divScalar);
-      compare(&BaseMatrix::scalarDiv);
-      compare(&BaseMatrix::addSquare);
-      compare(&BaseMatrix::isEqualTo);
-    }
-  }
-}
-
-// Test all void (BaseMatrix::*)(BaseMatrix&, BaseMatrix&) function
-TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      auto compare = [height,
-                      width](void (BaseMatrix::*f)(BaseMatrix&, BaseMatrix&)) {
-        AutoCompare test(height, width, 1e-5);
-        test.cmpWithoutArg<0, 1>(f, height, width);
-      };
-
-      compare(&BaseMatrix::softCrossEntropy);
-      compare(&BaseMatrix::softCrossEntropyBp);
-      compare(&BaseMatrix::binaryLabelCrossEntropy);
-      compare(&BaseMatrix::binaryLabelCrossEntropyBp);
-      compare(&BaseMatrix::sub);
-      compare(&BaseMatrix::add2);
-      compare(&BaseMatrix::dotMul);
-      compare(&BaseMatrix::dotDiv);
-      compare(&BaseMatrix::logisticRegressionLoss);
-      compare(&BaseMatrix::logisticRegressionLossBp);
-      compare(&BaseMatrix::biggerThan);
-      compare(&BaseMatrix::max2);
-      compare(&BaseMatrix::dotMulSquare);
-      compare(&BaseMatrix::dotSquareSquare);
-    }
-  }
-}
-
-void TestEelementWise(size_t height, size_t width) {
-  AutoCompare rowScale(height, width);
-  rowScale.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowScale, height, width);
-
-  AutoCompare rowDotMul(height, width);
-  rowDotMul.cmpWithoutArg<0, 1, 2>(&BaseMatrix::rowDotMul, height, width);
-
-  AutoCompare binaryClassificationError(height, width);
-  binaryClassificationError.cmpWithoutArg<0, 1, 2, 3>(
-      &BaseMatrix::binaryClassificationError, height, width);
-
-  AutoCompare sumOfSquaresBp(height, width);
-  sumOfSquaresBp.cmpWithoutArg<0, 1>(&Matrix::sumOfSquaresBp, height, width);
-}
-
-void TestAggregateToRow(size_t height, size_t width) {
-  AutoCompare maxCols(1, width);
-  maxCols.cmpWithoutArg<0>(&BaseMatrix::maxCols, height, width);
-
-  AutoCompare minCols(1, width);
-  minCols.cmpWithoutArg<0>(&BaseMatrix::minCols, height, width);
-
-  AutoCompare addDotMulVMM(1, width);
-  addDotMulVMM.cmpWithoutArg<0, 1>(&BaseMatrix::addDotMulVMM, height, width);
-
-  AutoCompare sumCols(1, width);
-  sumCols.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumCols, height, width);
-
-  AutoCompare collectBias(1, width);
-  collectBias.cmpWithoutArg<0, 1>(
-      static_cast<void (Matrix::*)(Matrix&, real)>(&Matrix::collectBias),
-      height,
-      width);
-}
-
-void TestAggregateToCol(size_t height, size_t width) {
-  AutoCompare maxRows(height, 1);
-  maxRows.cmpWithoutArg<0>(&BaseMatrix::maxRows, height, width);
-
-  AutoCompare minRows(height, 1);
-  minRows.cmpWithoutArg<0>(&BaseMatrix::minRows, height, width);
-
-  AutoCompare sumRows(height, 1);
-  sumRows.cmpWithoutArg<0, 1, 2>(&BaseMatrix::sumRows, height, width);
-
-  AutoCompare sumOfSquares(height, 1);
-  sumOfSquares.cmpWithoutArg<0, 1>(&Matrix::sumOfSquares, height, width);
-}
-
-TEST(BaseMatrix, Other) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      TestEelementWise(height, width);
-      TestAggregateToRow(height, width);
-      TestAggregateToCol(height, width);
-    }
-  }
-}
-
-#endif
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
deleted file mode 100644
index 395541a76ae5e5497fdaa8b4870e421cbf62608a..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <gtest/gtest.h>
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Util.h"
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-
-TEST(CpuGpuVector, getData) {
-  size_t size = 500;
-  hl_stream_t stream(HPPL_STREAM_DEFAULT);
-  CpuVectorPtr cpuVec = std::make_shared<CpuVector>(size);
-  GpuVectorPtr gpuVec = std::make_shared<GpuVector>(size);
-  cpuVec->uniform(0.0, 10.0);
-  gpuVec->copyFrom(*cpuVec, stream);
-  hl_stream_synchronize(stream);
-
-  CpuGpuVectorPtr vec = std::make_shared<CpuGpuVector>(gpuVec);
-  auto a = vec->getData(false);
-  auto b = cpuVec->getData();
-  hl_stream_synchronize(stream);
-  checkDataEqual(a, b, size);
-}
-
-TEST(CpuGpuVector, subCreate) {
-  size_t size1 = 1024;
-  size_t offset = 100;
-  size_t size2 = 500;
-  hl_stream_t stream(HPPL_STREAM_DEFAULT);
-  CpuGpuVectorPtr v1 = std::make_shared<CpuGpuVector>(size1, /*useGpu*/ false);
-  auto vec = v1->getMutableVector(false);
-  vec->uniform(0.0, 10.0);
-  auto v2 = std::make_shared<CpuGpuVector>(*v1, offset, size2);
-  CHECK_EQ(*v1->getSync(), *v2->getSync());
-
-  // check subVec equal
-  checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2);
-
-  CpuVectorPtr v1Check = std::make_shared<CpuVector>(size1);
-  CpuVectorPtr v2Check = std::make_shared<CpuVector>(size2);
-  v1Check->copyFrom(*(v1->getVector(true)), stream);
-  v2Check->copyFrom(*(v2->getVector(true)), stream);
-  hl_stream_synchronize(stream);
-
-  checkDataEqual(v2->getData(false), v2Check->getData(), size2);
-  checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
-
-  CpuVectorPtr noise = std::make_shared<CpuVector>(size2);
-  noise->uniform(0.0, 1.0);
-  auto v = v2->getMutableVector(false);  // will change header
-  // add noise to subVec
-  v->add(*noise);
-
-  // check v1_cpu_data == v2_cpu_data
-  checkDataEqual(v1->getData(false) + offset, v2->getData(false), size2);
-
-  v1Check->copyFrom(*(v1->getVector(true)), stream);
-  v2Check->copyFrom(*(v2->getVector(true)), stream);
-  hl_stream_synchronize(stream);
-
-  // check v1_gpu_data == v2_gpu_data
-  checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
-}
-
-#endif
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
deleted file mode 100644
index 513c7b440e0aa6f20cc8209a3624f32f4892225b..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ /dev/null
@@ -1,116 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
-#include <paddle/utils/Util.h>
-#include <vector>
-#include "paddle/math/SparseMatrix.h"
-
-using namespace paddle;  // NOLINT
-
-const int height = 10;
-const int width = 16;
-
-real f(Matrix& mat1,
-       const Matrix& mat2,
-       IVector& vec1,
-       const IVector& vec2,
-       real scalar) {
-  CHECK(!mat1.useGpu());
-  CHECK(!mat2.useGpu());
-  CHECK(!vec1.useGpu());
-  CHECK(!vec2.useGpu());
-  mat1.copyFrom(mat2);
-  vec1.copyFrom(vec2);
-
-  return scalar;
-}
-
-class Functor {
-public:
-  real operator()(Matrix& mat1,
-                  const Matrix& mat2,
-                  IVector& vec1,
-                  const IVector& vec2,
-                  real scalar) {
-    a_ = f(mat1, mat2, vec1, vec2, scalar);
-    return a_;
-  }
-
-private:
-  real a_;
-};
-
-template <typename F>
-void testWrapper(F&& f) {
-  MatrixPtr cpumat1 = Matrix::create(height, width, false, /*useGpu=*/false);
-  MatrixPtr cpumat2 = Matrix::create(height, width, false, /*useGpu=*/false);
-
-  IVectorPtr cpuvec1 = IVector::create(height, /*useGpu=*/false);
-  IVectorPtr cpuvec2 = IVector::create(height, /*useGpu=*/false);
-
-  const real scalar = 1.23456;
-
-  MatrixPtr gpumat1 = Matrix::create(height, width, false, /*useGpu=*/true);
-  MatrixPtr gpumat2 = Matrix::create(height, width, false, /*useGpu=*/true);
-  IVectorPtr gpuvec1 = IVector::create(height, /*useGpu=*/true);
-  IVectorPtr gpuvec2 = IVector::create(height, /*useGpu=*/true);
-
-  cpumat2->randomizeUniform();
-  cpuvec2->rand(width);
-  gpumat2->copyFrom(*cpumat2);
-  gpuvec2->copyFrom(*cpuvec2);
-
-  real ret = execViaCpu(f, *gpumat1, *gpumat2, *gpuvec1, *gpuvec2, 1.23456);
-  EXPECT_EQ(ret, scalar);
-  cpumat1->copyFrom(*gpumat1);
-  cpuvec1->copyFrom(*gpuvec1);
-
-  for (int i = 0; i < height; ++i) {
-    EXPECT_EQ(cpuvec1->getElement(i), cpuvec2->getElement(i));
-    for (int j = 0; j < width; ++j) {
-      EXPECT_EQ(cpumat1->getElement(i, j), cpumat2->getElement(i, j));
-    }
-  }
-  gpumat1->resize(height, 1);
-  execViaCpu2(&CpuMatrix::selectElements, *gpumat1, *gpumat2, *gpuvec1);
-
-  cpumat1->resize(height, 1);
-  cpumat1->selectElements(*cpumat2, *cpuvec1);
-  for (int i = 0; i < height; ++i) {
-    EXPECT_EQ(cpumat1->getElement(i, 0), gpumat1->getElement(i, 0));
-  }
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(ExecViaCpu, test1) {
-  testWrapper(f);
-  testWrapper(&f);
-
-  auto lambda = [](Matrix& mat1,
-                   const Matrix& mat2,
-                   IVector& vec1,
-                   const IVector& vec2,
-                   real scalar) -> real {
-    return f(mat1, mat2, vec1, vec2, scalar);
-  };
-  LOG(INFO) << "lambda is_class=" << std::is_class<decltype(lambda)>::value
-            << " is_function=" << std::is_function<decltype(lambda)>::value;
-  testWrapper(lambda);
-
-  Functor functor;
-  testWrapper(functor);
-}
-#endif
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
deleted file mode 100644
index d87fdcda9edc8644301b7fe77f4c0c751d5a774a..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_FPException.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/**
- * This test is about floating point calculation exception.
- * Paddle catches FE_INVALID, FE DIVBYZERO and FE_OVERFLOW exceptions.
- *
- * Some exceptions occur in the middle of a set of formulas,
- * that can be circumvented by some tricks.
- * For example,
- * calculate tanh
- *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
- *
- * If the result of (-2 * a) is too large,
- * a FE_OVERFLOW exception occurs when calculating exp.
- * But the result of tanh is no overflow problem,
- * so we can add some tricks to prevent exp calculate an excessive value.
- *
- */
-
-#include <gtest/gtest.h>
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Common.h"
-
-using namespace paddle;  // NOLINT
-
-void SetTensorValue(Matrix& matrix, real value) {
-  int height = matrix.getHeight();
-  int width = matrix.getWidth();
-  int stride = matrix.getStride();
-  real* data = matrix.getData();
-  for (int i = 0; i < height; i++) {
-    int j = rand() % width;  // NOLINT
-    if (typeid(matrix) == typeid(CpuMatrix)) {
-      data[i * stride + j] = value;
-    } else if (typeid(matrix) == typeid(GpuMatrix)) {
-      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
-    } else {
-      LOG(FATAL) << "should not reach here";
-    }
-  }
-}
-
-template <typename Matrix>
-void testTanh(real illegal) {
-  MatrixPtr A = std::make_shared<Matrix>(10, 10);
-  MatrixPtr B = std::make_shared<Matrix>(10, 10);
-  A->randomizeUniform();
-  B->randomizeUniform();
-
-  SetTensorValue(*A, illegal);
-
-  A->tanh(*B);
-}
-
-template <typename Matrix>
-void testSigmoid(real illegal) {
-  MatrixPtr A = std::make_shared<Matrix>(10, 10);
-  MatrixPtr B = std::make_shared<Matrix>(10, 10);
-  A->randomizeUniform();
-  B->randomizeUniform();
-
-  SetTensorValue(*A, illegal);
-
-  A->sigmoid(*B);
-}
-
-TEST(fp, overflow) {
-  for (auto illegal : {-90.0, 90.0}) {
-    LOG(INFO) << " illegal=" << illegal;
-    testTanh<CpuMatrix>(illegal);
-    testSigmoid<CpuMatrix>(illegal);
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
deleted file mode 100644
index 828159660bae1ad1c0b56fd7202f0357549877ca..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <gtest/gtest.h>
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/testing/TestUtil.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (fabs(a - b) > err) {
-        if ((fabsf(a - b) / fabsf(a)) > (err / 10.0f)) {
-          count++;
-        }
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void testBilinearFwdBwd(int numSamples,
-                        int imgSizeH,
-                        int imgSizeW,
-                        int channels) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
-  real ratioH = 0.5;
-  real ratioW = 0.5;
-
-  // forward
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-
-  input->randomizeUniform();
-  inputGpu->copyFrom(*input);
-
-  {
-    // nvprof: GPU Proflier
-    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
-    target->bilinearForward(*input,
-                            imgSizeH,
-                            imgSizeW,
-                            2 * imgSizeH,
-                            2 * imgSizeW,
-                            channels,
-                            ratioH,
-                            ratioW);
-    targetGpu->bilinearForward(*inputGpu,
-                               imgSizeH,
-                               imgSizeW,
-                               2 * imgSizeH,
-                               2 * imgSizeW,
-                               channels,
-                               ratioH,
-                               ratioW);
-  }
-
-  // check
-  targetCheck->copyFrom(*targetGpu);
-  MatrixCheckErr(*target, *targetCheck);
-
-  // backward
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr targetCheckGrad =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->bilinearBackward(*targetGrad,
-                              2 * imgSizeH,
-                              2 * imgSizeW,
-                              imgSizeH,
-                              imgSizeW,
-                              channels,
-                              ratioH,
-                              ratioW);
-  inputGpuGrad->bilinearBackward(*targetGpuGrad,
-                                 2 * imgSizeH,
-                                 2 * imgSizeW,
-                                 imgSizeH,
-                                 imgSizeW,
-                                 channels,
-                                 ratioH,
-                                 ratioW);
-
-  // check
-  targetCheckGrad->copyFrom(*inputGpuGrad);
-  MatrixCheckErr(*inputGrad, *targetCheckGrad);
-}
-
-TEST(Profiler, testBilinearFwdBwd) {
-  auto numSamples = 10;
-  auto channels = 16;
-  auto imgSize = 64;
-  {
-    // nvprof: GPU Proflier
-    REGISTER_GPU_PROFILER("testBilinearFwdBwd");
-    // Paddle built-in timer
-    REGISTER_TIMER_INFO(
-        "testBilinearFwdBwd",
-        "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
-    testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
-  }
-  globalStat.printAllStatus();
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-
-  // nvprof: GPU Proflier
-  REGISTER_GPU_PROFILER(
-      "RecursiveProfilingTest",
-      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
-
-  return RUN_ALL_TESTS();
-}
-
-#endif
diff --git a/paddle/math/tests/test_RowBuffer.cpp b/paddle/math/tests/test_RowBuffer.cpp
deleted file mode 100644
index e38de853e03874be3fd3582f7b39b1d490886d78..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_RowBuffer.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "paddle/math/RowBuffer.h"
-
-TEST(RowBuffer, testAutoGrow) {
-  paddle::RowBuffer buf(128);
-  ASSERT_EQ(128UL, buf.getWidth());
-  ASSERT_TRUE(buf.isAutoGrowth());
-  buf.resize(2);
-  ASSERT_EQ(2UL, buf.getRowCount());
-  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
-    buf.data()[i] = i;
-  }
-  for (size_t i = 0; i < buf.getRowCount(); ++i) {
-    for (size_t j = 0; j < buf.getWidth(); ++j) {
-      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
-    }
-  }
-
-  auto data = buf.getWithAutoGrowth(2);
-  for (size_t i = 0; i < buf.getWidth(); ++i) {
-    data[i] = i;
-  }
-
-  ASSERT_EQ(3UL, buf.getRowCount());
-  for (size_t i = 0; i < buf.getRowCount() - 1; ++i) {
-    for (size_t j = 0; j < buf.getWidth(); ++j) {
-      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
-    }
-  }
-  for (size_t i = 0; i < buf.getWidth(); ++i) {
-    ASSERT_NEAR(i, buf.get(2)[i], 1e-5);
-  }
-}
-
-TEST(RowBuffer, testWithMemBuf) {
-  paddle::CpuMemHandlePtr mem =
-      std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real));
-  paddle::RowBuffer buf(mem, 128);
-  ASSERT_TRUE(!buf.isAutoGrowth());
-  ASSERT_EQ(2UL, buf.getRowCount());
-  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
-    buf.data()[i] = i;
-  }
-  for (size_t i = 0; i < buf.getRowCount(); ++i) {
-    for (size_t j = 0; j < buf.getWidth(); ++j) {
-      ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5);
-    }
-  }
-
-  ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*");
-}
diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp
deleted file mode 100644
index b692679436ee7bd3b8c4a675e969e15b065cc534..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/math/SIMDFunctions.h"
-#include "paddle/utils/Util.h"
-
-#include <gtest/gtest.h>
-
-#include <algorithm>
-#include <functional>
-#include <memory>
-#include <random>
-
-#include <stdlib.h>
-#include <time.h>
-
-static constexpr size_t VECTOR_LEN = 3072;
-static constexpr size_t BATCH_SIZE = 64;
-static constexpr size_t ALIGN = 32;
-static_assert(VECTOR_LEN % ALIGN == 0, "VECTOR_LEN % ALIGN == 0");
-static_assert(BATCH_SIZE % ALIGN == 0, "BATCH_SIZE % ALIGN == 0");
-static constexpr float EPSILON = 1e-5;
-static std::mt19937 RandomEngine(time(0));
-
-inline static std::unique_ptr<float[]> NewVector(size_t len = VECTOR_LEN,
-                                                 size_t align = ALIGN) {
-  float* ptr;
-  CHECK_EQ(posix_memalign((void**)&ptr, align, len * sizeof(float)), 0);
-  return std::unique_ptr<float[]>(ptr);
-}
-
-inline static std::unique_ptr<float[]> NewRandomVector(size_t len = VECTOR_LEN,
-                                                       size_t align = ALIGN) {
-  std::uniform_real_distribution<float> dist(-100.0f, 100.0f);
-  auto generator = std::bind(dist, RandomEngine);
-  auto retv = NewVector(len, align);
-  std::generate_n(retv.get(), len, generator);
-  return retv;
-}
-
-TEST(SIMDFunction, addTo) {
-  typedef std::function<void(float*, const float*, size_t)> AddToMethodType;
-
-  AddToMethodType naive = paddle::simd::naive::addTo<float>;
-  AddToMethodType simd = paddle::simd::addTo<float>;
-
-  auto A = NewRandomVector();
-  auto B = NewRandomVector();
-
-  auto ACopy = NewVector();
-  memcpy(ACopy.get(), A.get(), VECTOR_LEN * sizeof(float));
-
-  naive(A.get(), B.get(), VECTOR_LEN);
-  simd(ACopy.get(), B.get(), VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(A[i], ACopy[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, batchAddTo) {
-  auto A = NewRandomVector();
-  auto ACopy = NewVector();
-  memcpy(ACopy.get(), A.get(), sizeof(float) * VECTOR_LEN);
-
-  std::vector<std::unique_ptr<float[]>> B;
-  for (size_t i = 0; i < BATCH_SIZE; ++i) {
-    B.emplace_back(NewRandomVector());
-  }
-  std::unique_ptr<float* []> BRaw(new float*[BATCH_SIZE]);
-  for (size_t i = 0; i < BATCH_SIZE; ++i) {
-    BRaw[i] = B[i].get();
-  }
-
-  typedef std::function<void(float*, const float**, int, size_t)>
-      BatchAddToMethodType;
-
-  BatchAddToMethodType naive = paddle::simd::naive::batchAddTo<float>;
-  BatchAddToMethodType simd = paddle::simd::batchAddTo<float>;
-
-  naive(A.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN);
-  simd(ACopy.get(), (const float**)BRaw.get(), BATCH_SIZE, VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(A[i], ACopy[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, colMax) {
-  auto A = NewRandomVector(VECTOR_LEN * BATCH_SIZE);
-  auto naiveResult = NewVector(BATCH_SIZE);
-  auto simdResult = NewVector(BATCH_SIZE);
-
-  typedef std::function<void(float*, const float*, int, int)> ColMaxMethodType;
-  ColMaxMethodType naive = paddle::simd::naive::colMax<float>;
-  ColMaxMethodType simd = paddle::simd::colMax<float>;
-
-  naive(naiveResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN);
-  simd(simdResult.get(), A.get(), BATCH_SIZE, VECTOR_LEN);
-
-  for (size_t i = 0; i < BATCH_SIZE; ++i) {
-    ASSERT_NEAR(naiveResult[i], simdResult[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, decayL1_WithLR) {
-  auto dest = NewRandomVector();
-  auto src = NewRandomVector();
-  auto lr = NewRandomVector();
-  auto lambda = 0.23f;
-
-  auto simd_dest = NewVector();
-  memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN);
-
-  typedef std::function<void(float*, float*, float*, float, size_t)>
-      DecayL1MethodType;
-
-  DecayL1MethodType naive = [](
-      float* d, float* s, float* lr, float l, size_t len) {
-    paddle::simd::naive::decayL1<float>(d, s, lr, l, len);
-  };
-
-  DecayL1MethodType simd = [](
-      float* d, float* s, float* lr, float l, size_t len) {
-    paddle::simd::decayL1<float>(d, s, lr, l, len);
-  };
-
-  naive(dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN);
-  simd(simd_dest.get(), src.get(), lr.get(), lambda, VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
-  }
-}
-
-TEST(SIMDFunction, decayL1_WithoutLR) {
-  auto dest = NewRandomVector();
-  auto src = NewRandomVector();
-  auto lambda = 0.23;
-
-  auto simd_dest = NewVector();
-  memcpy(simd_dest.get(), dest.get(), sizeof(float) * VECTOR_LEN);
-
-  typedef std::function<void(float*, float*, float, size_t)> DecayL1MethodType;
-
-  DecayL1MethodType naive = [](float* d, float* s, float l, size_t len) {
-    paddle::simd::naive::decayL1<float>(d, s, l, len);
-  };
-
-  DecayL1MethodType simd = [](float* d, float* s, float l, size_t len) {
-    paddle::simd::decayL1<float>(d, s, l, len);
-  };
-
-  naive(dest.get(), src.get(), lambda, VECTOR_LEN);
-  simd(simd_dest.get(), src.get(), lambda, VECTOR_LEN);
-
-  for (size_t i = 0; i < VECTOR_LEN; ++i) {
-    ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
-  }
-}
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
deleted file mode 100644
index dbcbeb8d506cf22c026bb7299bf7f71de488cb4a..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ /dev/null
@@ -1,565 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/PythonUtil.h>
-#include <vector>
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-
-TEST(Matrix, CopyCpuMatrixToSparseMatrix) {
-  const size_t HEIGHT = 20;
-  const size_t WIDTH = 10;
-  const size_t WIDTH_TEST = 15;
-  MatrixPtr testMatrix(
-      new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 5, FLOAT_VALUE, SPARSE_CSR));
-  MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH));
-  testCpuMatrix->randomizeUniform();
-  testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT);
-  MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST));
-  mulCpuMatrix->randomizeUniform();
-  MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST)),
-      ret2(new CpuMatrix(HEIGHT, WIDTH_TEST));
-  ret1->zeroMem();
-  ret2->zeroMem();
-  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
-  ret2->mul(*testCpuMatrix, *mulCpuMatrix, 1.0, 1.0);
-  checkMatrixEqual(ret1, ret2);
-}
-
-struct MatrixPara {
-  size_t height;
-  size_t width;
-  bool trans;
-  bool sparse;
-  size_t nnz;
-  SparseFormat format;
-};
-
-#ifdef PADDLE_WITH_CUDA
-void test_sparse_matrix_mul(MatrixPara paraA,
-                            MatrixPara paraB,
-                            MatrixPara paraC) {
-  // for cpu sparse matrix mul
-  MatrixPtr cpuMatrixA, cpuMatrixB, cpuMatrixC, gpuMatrixC_d2h;
-  // for gpu sparse matrix mul
-  MatrixPtr gpuMatrixA, gpuMatrixB, gpuMatrixC;
-  // for cpu dense matrix mul
-  MatrixPtr cpuDenseA, cpuDenseB, cpuDenseC;
-
-  if (paraA.sparse) {
-    cpuMatrixA = Matrix::createSparseMatrix(paraA.height,
-                                            paraA.width,
-                                            paraA.nnz,
-                                            FLOAT_VALUE,
-                                            paraA.format,
-                                            paraA.trans,
-                                            false);
-    gpuMatrixA = Matrix::createSparseMatrix(paraA.height,
-                                            paraA.width,
-                                            paraA.nnz,
-                                            FLOAT_VALUE,
-                                            paraA.format,
-                                            paraA.trans,
-                                            true);
-  } else {
-    cpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
-    gpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, true);
-  }
-  cpuDenseA = Matrix::create(paraA.height, paraA.width, paraA.trans, false);
-
-  if (paraB.sparse) {
-    cpuMatrixB = Matrix::createSparseMatrix(paraB.height,
-                                            paraB.width,
-                                            paraB.nnz,
-                                            FLOAT_VALUE,
-                                            paraB.format,
-                                            paraB.trans,
-                                            false);
-    gpuMatrixB = Matrix::createSparseMatrix(paraB.height,
-                                            paraB.width,
-                                            paraB.nnz,
-                                            FLOAT_VALUE,
-                                            paraB.format,
-                                            paraB.trans,
-                                            true);
-  } else {
-    cpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, false);
-    gpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, true);
-  }
-  cpuDenseB = Matrix::create(paraB.height, paraB.width, paraB.trans, false);
-
-  if (paraC.sparse) {
-    cpuMatrixC = Matrix::createSparseMatrix(paraC.height,
-                                            paraC.width,
-                                            paraC.nnz,
-                                            FLOAT_VALUE,
-                                            paraC.format,
-                                            paraC.trans,
-                                            false);
-    gpuMatrixC = Matrix::createSparseMatrix(paraC.height,
-                                            paraC.width,
-                                            paraC.nnz,
-                                            FLOAT_VALUE,
-                                            paraC.format,
-                                            paraC.trans,
-                                            true);
-    gpuMatrixC_d2h = Matrix::createSparseMatrix(paraC.height,
-                                                paraC.width,
-                                                paraC.nnz,
-                                                FLOAT_VALUE,
-                                                paraC.format,
-                                                paraC.trans,
-                                                false);
-  } else {
-    cpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
-    gpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, true);
-    gpuMatrixC_d2h =
-        Matrix::create(paraC.height, paraC.width, paraC.trans, false);
-  }
-  cpuDenseC = Matrix::create(paraC.height, paraC.width, paraC.trans, false);
-
-  /*matrix init*/
-  hl_stream_t stream(HPPL_STREAM_1);
-  cpuMatrixA->randomizeUniform();
-  cpuMatrixB->randomizeUniform();
-  cpuMatrixC->randomizeUniform();
-
-  gpuMatrixA->copyFrom(*cpuMatrixA, stream);
-  gpuMatrixB->copyFrom(*cpuMatrixB, stream);
-  gpuMatrixC->copyFrom(*cpuMatrixC, stream);
-
-  cpuDenseA->copyFrom(*cpuMatrixA);
-  cpuDenseB->copyFrom(*cpuMatrixB);
-  cpuDenseC->copyFrom(*cpuMatrixC);
-
-  hl_stream_synchronize(stream);
-
-  /*matrix mul*/
-  cpuMatrixC->mul(*cpuMatrixA, *cpuMatrixB, 1.0, 1.0);
-  gpuMatrixC->mul(*gpuMatrixA, *gpuMatrixB, 1.0, 1.0);
-  cpuDenseC->mul(*cpuDenseA, *cpuDenseB, 1.0, 1.0);
-
-  gpuMatrixC_d2h->copyFrom(*gpuMatrixC, stream);
-  hl_stream_synchronize(stream);
-
-  /*check result*/
-  if (paraC.sparse) {
-    checkSMatrixEqual(
-        std::dynamic_pointer_cast<CpuSparseMatrix>(cpuMatrixC),
-        std::dynamic_pointer_cast<CpuSparseMatrix>(gpuMatrixC_d2h));
-    checkSMatrixEqual2Dense(
-        std::dynamic_pointer_cast<CpuSparseMatrix>(cpuMatrixC),
-        std::dynamic_pointer_cast<CpuMatrix>(cpuDenseC));
-  } else {
-    checkMatrixEqual(cpuMatrixC, gpuMatrixC_d2h);
-    checkMatrixEqual(cpuMatrixC, cpuDenseC);
-  }
-}
-
-TEST(Matrix, SparseMatrixMul) {
-  const size_t DIM_M = 4;
-  const size_t DIM_N = 4;
-  const size_t DIM_K = 8;
-  const size_t NNZ = 5;
-  for (auto format : {SPARSE_CSC, SPARSE_CSR}) {
-    std::string str_format = format == SPARSE_CSC ? "CSC" : "CSR";
-    LOG(INFO) << "test dense mul " << str_format;
-    test_sparse_matrix_mul(
-        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format});
-
-    LOG(INFO) << "test dense mul " << str_format << "  trans";
-    test_sparse_matrix_mul(
-        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_N, DIM_K, /*trans*/ true, /*sparse*/ true, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format});
-
-    LOG(INFO) << "test dense mul dense 2 " << str_format;
-    test_sparse_matrix_mul(
-        {DIM_M, DIM_K, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format});
-
-    LOG(INFO) << "test denseT mul dense 2 " << str_format;
-    test_sparse_matrix_mul(
-        {DIM_K, DIM_M, /*trans*/ true, /*sparse*/ false, NNZ, format},
-        {DIM_K, DIM_N, /*trans*/ false, /*sparse*/ false, NNZ, format},
-        {DIM_M, DIM_N, /*trans*/ false, /*sparse*/ true, NNZ, format});
-  }
-}
-
-TEST(Matrix, CopySparseMatrixToGpuSparseMatrix) {
-  const size_t HEIGHT = 20;
-  const size_t WIDTH = 10;
-  const size_t WIDTH_TEST = 15;
-  MatrixPtr testMatrix(
-      new CpuSparseMatrix(HEIGHT, WIDTH, HEIGHT * 2, FLOAT_VALUE, SPARSE_CSR));
-  MatrixPtr testCpuMatrix(new CpuMatrix(HEIGHT, WIDTH));
-  testCpuMatrix->randomizeUniform();
-  testMatrix->copyFrom(*testCpuMatrix, HPPL_STREAM_DEFAULT);
-
-  MatrixPtr testGpuMatrix = testMatrix->clone(HEIGHT, WIDTH, true);
-  hl_stream_t gpuStream(HPPL_STREAM_3);
-  testGpuMatrix->copyFrom(*testMatrix, gpuStream);
-  hl_stream_synchronize(gpuStream);
-
-  MatrixPtr mulCpuMatrix(new CpuMatrix(WIDTH, WIDTH_TEST));
-  mulCpuMatrix->randomizeUniform();
-  MatrixPtr mulGpuMatrix(new GpuMatrix(WIDTH, WIDTH_TEST));
-  mulGpuMatrix->copyFrom(*mulCpuMatrix);
-  MatrixPtr ret1(new CpuMatrix(HEIGHT, WIDTH_TEST));
-  MatrixPtr ret2(new GpuMatrix(HEIGHT, WIDTH_TEST));
-  ret1->zeroMem();
-  ret2->zeroMem();
-  ret1->mul(*testMatrix, *mulCpuMatrix, 1.0, 1.0);
-  ret2->mul(*testGpuMatrix, *mulGpuMatrix, 1.0, 1.0);
-  checkMatrixEqual(ret1, ret2);
-}
-
-#endif
-
-TEST(Matrix, SparseMatrixTranspose) {
-  for (auto height : {10, 50, 100}) {
-    for (auto width : {10, 50, 100}) {
-      auto nnz = height * width;
-      for (auto valueType : {FLOAT_VALUE, NO_VALUE}) {
-        for (auto format : {SPARSE_CSR, SPARSE_CSC}) {
-          for (auto sparseRate : {0.1, 0.2, 0.5}) {
-            MatrixPtr matA = Matrix::createSparseMatrix(
-                height, width, size_t(nnz * sparseRate), valueType, format);
-            MatrixPtr matB(new CpuSparseMatrix(
-                width, height, size_t(nnz * sparseRate), valueType, format));
-            matA->randomizeUniform();
-            matA->transpose(matB, false);
-
-            /*dense matrix transpose*/
-            CpuMatrixPtr matC(new CpuMatrix(height, width));
-            matC->copyFrom(*matA);
-            MatrixPtr matD(new CpuMatrix(width, height));
-            matC->transpose(matD, false);
-
-            /*check result*/
-            checkSMatrixEqual2Dense(
-                std::dynamic_pointer_cast<CpuSparseMatrix>(matB),
-                std::dynamic_pointer_cast<CpuMatrix>(matD));
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(Matrix, CpuSparseMatrixSubMatrix) {
-  const size_t HEIGHT = 10;
-  const size_t WIDTH = 10;
-  const size_t NNZ = HEIGHT * WIDTH;
-  for (auto valueType : {FLOAT_VALUE, NO_VALUE}) {
-    size_t startRow = 3;
-    size_t rowNum = 2;
-    real sparseRate = 0.1;
-    /*sparse matrix init and get subMatrix*/
-    CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-        HEIGHT, WIDTH, size_t(NNZ * sparseRate), valueType, SPARSE_CSR);
-    matA->randomizeUniform();
-    CpuSparseMatrixPtr matB = std::dynamic_pointer_cast<CpuSparseMatrix>(
-        matA->subMatrix(startRow, rowNum));
-
-    int start = matA->getRows()[startRow];
-    int end = matA->getRows()[startRow + rowNum];
-
-    /*compare two matrix*/
-    ASSERT_EQ(matB->getElementCnt(), size_t(end - start));
-    if (valueType == FLOAT_VALUE) {
-      for (size_t i = 0; i < matB->getElementCnt(); i++) {
-        ASSERT_FLOAT_EQ(matB->getValue()[start + i],
-                        matA->getValue()[start + i]);
-      }
-    }
-
-    for (size_t i = 0; i < matB->getElementCnt(); i++) {
-      ASSERT_EQ(matB->getCols()[start + i], matA->getCols()[start + i]);
-    }
-    for (size_t i = 0; i < rowNum; i++) {
-      ASSERT_EQ(matB->getRows()[i], matA->getRows()[startRow + i]);
-    }
-  }
-}
-
-void sparseValid(
-    int* major, int* minor, size_t nnz, size_t majorLen, size_t minorLen) {
-  CHECK_EQ(nnz, size_t(major[majorLen - 1]));
-  CHECK_EQ(nnz, minorLen);
-  for (size_t i = 0; i < majorLen - 1; i++) {
-    EXPECT_LE(major[i], major[i + 1]);
-    for (int j = major[i]; j < major[i + 1] - 1; j++) {
-      EXPECT_LE(minor[j], minor[j + 1]);
-    }
-  }
-}
-
-TEST(Matrix, CpuSparseMatrixRandUniform) {
-  const size_t HEIGHT = 5;
-  const size_t WIDTH = 10;
-  const size_t NNZ = HEIGHT * WIDTH;
-  int* major = nullptr;
-  int* minor = nullptr;
-  size_t majorLen = 0;
-  size_t minorLen = 0;
-  size_t nnz = 0;
-  for (auto valueType : {NO_VALUE, FLOAT_VALUE}) {
-    for (auto format : {SPARSE_CSR, SPARSE_CSC}) {
-      CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-          HEIGHT, WIDTH, size_t(NNZ * 0.1), valueType, format);
-      matA->randomizeUniform();
-      nnz = matA->getElementCnt();
-      if (format == SPARSE_CSR) {
-        majorLen = matA->getHeight() + 1;
-        minorLen = matA->getElementCnt();
-        major = matA->getRows();
-        minor = matA->getCols();
-      } else {
-        majorLen = matA->getWidth() + 1;
-        minorLen = matA->getElementCnt();
-        major = matA->getCols();
-        minor = matA->getRows();
-      }
-      sparseValid(major, minor, nnz, majorLen, minorLen);
-    }
-  }
-}
-
-TEST(Matrix, CpuSparseMatrixCopyFrom) {
-  size_t height = 10;
-  size_t width = 8;
-  int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 30, 32};
-  sparse_non_value_t data[32];
-  for (size_t i = 0; i < 32; i++) {
-    data[i].col = ::rand() % width;
-  }
-  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
-      height, width, 32, NO_VALUE, SPARSE_CSR, false);
-  mat->copyFrom(indices, data);
-
-  /*compare indices*/
-  size_t sum = 0;
-  CHECK_EQ(sum, size_t(mat->getRows()[0]));
-  for (size_t i = 1; i < height + 1; i++) {
-    sum += indices[i] - indices[i - 1];
-    CHECK_EQ(sum, size_t(mat->getRows()[i]));
-  }
-  CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0]));
-  for (size_t i = 0; i < mat->getElementCnt(); i++) {
-    CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col));
-  }
-}
-
-TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
-  size_t height = 10;
-  size_t width = 8;
-  int64_t indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32};
-  sparse_float_value_t data[32];
-  int value[32] = {
-      1,                       // row_0 : 1
-      5, 3, 1, 6,              // row_1 : 4
-      0, 1, 2, 3,              // row_3 : 4
-      4, 5, 6, 7,              // row_4 : 4
-      2, 3,                    // row_5 : 2
-      3, 5,                    // row_6 : 2
-      0, 1,                    // row_7 : 2
-      0, 1, 2, 3, 4, 5, 6, 7,  // row_8 : 8
-      2, 4, 7, 3, 1            // row_9 : 5
-  };
-  for (size_t i = 0; i < 32; i++) {
-    data[i].col = value[i];
-    data[i].value = float(value[i]);
-  }
-  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
-      height, width, 32, FLOAT_VALUE, SPARSE_CSR, false);
-  mat->copyFrom(indices, data);
-
-  /*compare indices*/
-  size_t sum = 0;
-  CHECK_EQ(sum, size_t(mat->getRows()[0]));
-  for (size_t i = 1; i < height + 1; i++) {
-    sum += indices[i] - indices[i - 1];
-    CHECK_EQ(sum, size_t(mat->getRows()[i]));
-  }
-  CHECK_EQ(mat->getElementCnt(), size_t(indices[height] - indices[0]));
-  for (size_t i = 0; i < mat->getElementCnt(); i++) {
-    CHECK_EQ(size_t(mat->getCols()[i]), size_t(data[i].col));
-  }
-
-  size_t trimedWidth = 4;
-  int64_t trimedIndices[11] = {0, 1, 3, 3, 7, 7, 9, 10, 12, 16, 19};
-  sparse_float_value_t trimedData[19];
-  int trimedValue[19] = {
-      1,  // row_0 : 1
-      3,
-      1,  // row_1 : 2
-      0,
-      1,
-      2,
-      3,  // row_3 : 4
-      2,
-      3,  // row_5 : 2
-      3,  // row_6 : 1
-      0,
-      1,  // row_7 : 2
-      0,
-      1,
-      2,
-      3,  // row_8 : 4
-      2,
-      3,
-      1  // row_9 : 3
-  };
-  for (size_t i = 0; i < 19; i++) {
-    trimedData[i].col = trimedValue[i];
-    trimedData[i].value = float(trimedValue[i]);
-  }
-  CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, 19, FLOAT_VALUE, SPARSE_CSR, false);
-  matA->copyFrom(trimedIndices, trimedData);
-
-  /*compare indices*/
-  sum = 0;
-  CHECK_EQ(sum, size_t(matA->getRows()[0]));
-  for (size_t i = 1; i < height + 1; i++) {
-    sum += trimedIndices[i] - trimedIndices[i - 1];
-    CHECK_EQ(sum, size_t(matA->getRows()[i]));
-  }
-  CHECK_EQ(matA->getElementCnt(),
-           size_t(trimedIndices[height] - trimedIndices[0]));
-  for (size_t i = 0; i < matA->getElementCnt(); i++) {
-    CHECK_EQ(size_t(matA->getCols()[i]), size_t(trimedData[i].col));
-  }
-
-  CpuSparseMatrixPtr matB = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, false);
-  matB->trimFrom(*mat);
-  checkSMatrixEqual2(matA, matB);
-
-#ifdef PADDLE_WITH_CUDA
-  GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
-  matC->trimFrom(*mat);
-
-  CpuSparseMatrixPtr matD =
-      std::make_shared<CpuSparseMatrix>(height,
-                                        trimedWidth,
-                                        matC->getElementCnt(),
-                                        FLOAT_VALUE,
-                                        SPARSE_CSR,
-                                        false);
-  matD->copyFrom(*matC, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  checkSMatrixEqual2(matA, matD);
-#endif
-}
-
-TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
-  size_t height = 8;
-  size_t width = 10;
-  int indices[11] = {0, 1, 5, 5, 9, 13, 15, 17, 19, 27, 32};
-  int value[32] = {
-      1,                       // col_0 : 1
-      5, 3, 1, 6,              // col_1 : 4
-      0, 1, 2, 3,              // col_3 : 4
-      4, 5, 6, 7,              // col_4 : 4
-      2, 3,                    // col_5 : 2
-      3, 5,                    // col_6 : 2
-      0, 1,                    // col_7 : 2
-      0, 1, 2, 3, 4, 5, 6, 7,  // col_8 : 8
-      2, 4, 7, 3, 1            // col_9 : 5
-  };
-  std::vector<int> rows(value, value + 32);
-  std::vector<int> cols(indices, indices + 11);
-  std::vector<real> values(value, value + 32);
-  CpuSparseMatrixPtr mat = std::make_shared<CpuSparseMatrix>(
-      height, width, 32, FLOAT_VALUE, SPARSE_CSC, false);
-  mat->copyFrom(rows, cols, values);
-
-  /*compare indices*/
-  size_t sum = 0;
-  CHECK_EQ(sum, size_t(mat->getCols()[0]));
-  for (size_t i = 1; i < width + 1; i++) {
-    sum += indices[i] - indices[i - 1];
-    CHECK_EQ(sum, size_t(mat->getCols()[i]));
-  }
-  CHECK_EQ(mat->getElementCnt(), size_t(indices[width] - indices[0]));
-  for (size_t i = 0; i < mat->getElementCnt(); i++) {
-    CHECK_EQ(size_t(mat->getRows()[i]), size_t(value[i]));
-  }
-
-  size_t trimedWidth = 5;
-  int trimedIndices[6] = {0, 1, 5, 5, 9, 13};
-  int trimedValue[13] = {
-      1,  // col_0 : 1
-      5,
-      3,
-      1,
-      6,  // col_1 : 4
-      0,
-      1,
-      2,
-      3,  // col_3 : 4
-      4,
-      5,
-      6,
-      7  // col_4 : 4
-  };
-  std::vector<int> rowsA(trimedValue, trimedValue + 13);
-  std::vector<int> colsA(trimedIndices, trimedIndices + 6);
-  std::vector<real> valuesA(trimedValue, trimedValue + 13);
-  CpuSparseMatrixPtr matA = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, 13, FLOAT_VALUE, SPARSE_CSC, false);
-  matA->copyFrom(rowsA, colsA, valuesA);
-
-  /*compare indices*/
-  sum = 0;
-  CHECK_EQ(sum, size_t(matA->getCols()[0]));
-  for (size_t i = 1; i < trimedWidth + 1; i++) {
-    sum += trimedIndices[i] - trimedIndices[i - 1];
-    CHECK_EQ(sum, size_t(matA->getCols()[i]));
-  }
-  CHECK_EQ(matA->getElementCnt(),
-           size_t(trimedIndices[trimedWidth] - trimedIndices[0]));
-  for (size_t i = 0; i < matA->getElementCnt(); i++) {
-    CHECK_EQ(size_t(matA->getRows()[i]), size_t(rowsA[i]));
-  }
-
-  CpuSparseMatrixPtr matB = std::make_shared<CpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, false);
-  matB->trimFrom(*mat);
-  checkSMatrixEqual2(matA, matB);
-
-#ifdef PADDLE_WITH_CUDA
-  GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
-      height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
-  matC->trimFrom(*mat);
-
-  CpuSparseMatrixPtr matD =
-      std::make_shared<CpuSparseMatrix>(height,
-                                        trimedWidth,
-                                        matC->getElementCnt(),
-                                        FLOAT_VALUE,
-                                        SPARSE_CSC,
-                                        false);
-  matD->copyFrom(*matC, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  checkSMatrixEqual2(matA, matD);
-#endif
-}
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
deleted file mode 100644
index acb2da86d0f41d12fced97d1ddaf5be00959fb82..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_Tensor.cu
+++ /dev/null
@@ -1,1162 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "TensorCheck.h"
-#include "paddle/math/Matrix.h"
-
-using paddle::Matrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using paddle::CpuVector;
-using paddle::GpuVector;
-using paddle::CpuIVector;
-using paddle::GpuIVector;
-using autotest::TensorCheckEqual;
-using autotest::TensorCheckErr;
-
-#define INIT_UNARY(A1, A2)  \
-  Tensor A1(height, width); \
-  Tensor A2(height, width); \
-  A1.randomizeUniform();    \
-  A2.copyFrom(A1)
-#define INIT_BINARY(A1, A2, B) \
-  INIT_UNARY(A1, A2);          \
-  Tensor B(height, width);     \
-  B.randomizeUniform()
-#define INIT_TERNARY(A1, A2, B, C) \
-  INIT_BINARY(A1, A2, B);          \
-  Tensor C(height, width);         \
-  C.randomizeUniform()
-#define INIT_QUATERNARY(A1, A2, B, C, D) \
-  INIT_TERNARY(A1, A2, B, C);            \
-  Tensor D(height, width);               \
-  D.randomizeUniform()
-
-template <typename Tensor>
-struct TestUnaryMatrix {
-  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
-
-  explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_UNARY(A1, A2);
-        testUnaryFunc(A1, A2);
-      }
-    }
-  }
-};
-
-template <typename Tensor>
-struct TestBinaryMatrix {
-  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
-
-  explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_BINARY(A1, A2, B);
-        testBinaryFunc(A1, A2, B);
-      }
-    }
-  }
-};
-
-template <typename Tensor>
-struct TestTernaryMatrix {
-  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
-      TernaryFunc;
-
-  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_TERNARY(A1, A2, B, C);
-        testTernaryFunc(A1, A2, B, C);
-      }
-    }
-  }
-};
-
-template <typename Tensor>
-struct TestQuaternaryMatrix {
-  typedef std::function<void(
-      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
-      QuaternaryFunc;
-
-  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
-    for (auto height : {1, 11, 73, 128, 200, 330}) {
-      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
-        LOG(INFO) << " height=" << height << " width=" << width;
-        INIT_QUATERNARY(A1, A2, B, C, D);
-        testQuaternaryFunc(A1, A2, B, C, D);
-      }
-    }
-  }
-};
-
-template <typename Tensor, class T>
-struct TestUnaryVectorT {
-  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
-
-  explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) {
-    for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) {
-      LOG(INFO) << " size=" << size;
-      Tensor A1(size);
-      Tensor A2(size);
-      if (typeid(T) == typeid(real)) {
-        A1.rand();
-      } else {
-        A1.rand(1000);
-      }
-      A2.copyFrom(A1);
-      testUnaryFunc(A1, A2);
-    }
-  }
-};
-
-void SetTensorValue(Matrix& matrix, real value) {
-  int height = matrix.getHeight();
-  int width = matrix.getWidth();
-  int stride = matrix.getStride();
-  real* data = matrix.getData();
-  for (int i = 0; i < height; i++) {
-    int j = rand() % width;  // NOLINT
-    if (typeid(matrix) == typeid(CpuMatrix)) {
-      data[i * stride + j] = value;
-    } else if (typeid(matrix) == typeid(GpuMatrix)) {
-      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
-    } else {
-    }
-  }
-}
-
-template <typename Tensor>
-void testTensorAddScalar(Tensor& A1, Tensor& A2) {
-  real p1 = 2.5;
-  real p2 = 3.0;
-  A1.add(p1);  // a += p
-  A2 += p1;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(p1, p2);  // a = a * p1 + p2
-  A2 = A2 * p1 + p2;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSubScalar(Tensor& A1, Tensor& A2) {
-  real p = 2.5;
-  A1.subScalar(p);  // a -= p
-  A2 -= p;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMulScalar(Tensor& A1, Tensor& A2) {
-  real p = 2.5;
-  A1.mulScalar(p);  // a *= p
-  A2 *= p;
-  TensorCheckEqual(A1, A2);
-
-  real learningRate = 0.7f;
-  real decayRate = 1.2f;
-  A1.applyL2(learningRate, decayRate);
-  A2 = A2 * (1.0f / (1.0f + learningRate * decayRate));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorDivScalar(Tensor& A1, Tensor& A2) {
-  real p = 2.5;
-  A1.divScalar(p);  // a /= p
-  A2 /= p;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorNeg(Tensor& A1, Tensor& A2) {
-  A1.neg();  // a = -a
-  A2 = -A2;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAbs(Tensor& A1, Tensor& A2) {
-  A1.abs2();  // a = a > 0 ? a : -a
-  A2 = A2.abs();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSquare(Tensor& A1, Tensor& A2) {
-  A1.square2();  // a = a * a
-  A2 = A2.square();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocal(Tensor& A1, Tensor& A2) {
-  A1.reciprocal2();  // a = 1.0f / a
-  A2 = A2.reciprocal();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSign(Tensor& A1, Tensor& A2) {
-  A1.sign2();  // a = (a > 0) - (a < 0)
-  A2 = A2.sign();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAssign(Tensor& A1, Tensor& A2) {
-  A1.assign(1.5);  // a = p
-  A2 = A2.constant(1.5);
-  TensorCheckEqual(A1, A2);
-
-  A1.one();  // a = 1
-  A2 = A2.constant(1.0);
-  TensorCheckEqual(A1, A2);
-
-  A1.zero();  // a = 0
-  A2 = A2.constant(0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
-  testTensorAddScalar(A1, A2);
-  testTensorSubScalar(A1, A2);
-  testTensorMulScalar(A1, A2);
-  testTensorDivScalar(A1, A2);
-  testTensorNeg(A1, A2);
-  testTensorAbs(A1, A2);
-  testTensorSquare(A1, A2);
-  testTensorReciprocal(A1, A2);
-  testTensorSign(A1, A2);
-  testTensorAssign(A1, A2);
-}
-
-template <typename Tensor>
-void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
-  A1.add(2);  // a += p
-  A2 += 2;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(3, 2);  // a = a * p1 + p2
-  A2 = A2 * 3 + 2;
-  TensorCheckEqual(A1, A2);
-
-  testTensorNeg(A1, A2);
-  testTensorAbs(A1, A2);
-}
-
-TEST(Unary, BaseOp) {
-  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
-  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
-  TestUnaryVectorT<CpuIVector, int> testCpuIVector(
-      testUnaryBaseOpInt<CpuIVector>);
-
-#ifdef PADDLE_WITH_GPU
-  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
-  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
-  TestUnaryVectorT<GpuIVector, int> testGpuIVector(
-      testUnaryBaseOpInt<GpuIVector>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorExp(Tensor& A1, Tensor& A2) {
-  A1.exp2();  // a = exp(a)
-  A2 = A2.exp();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLog(Tensor& A1, Tensor& A2) {
-  A1.log2();  // a = log(a)
-  A2 = A2.log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSqrt(Tensor& A1, Tensor& A2) {
-  A1.sqrt2();  // a = sqrt(a)
-  A2 = A2.sqrt();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorPow(Tensor& A1, Tensor& A2) {
-  A1.pow2(3.2);  // a = pow(a, p)
-  A2 = A2.pow(3.2);
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testUnayrMathOp(Tensor& A1, Tensor& A2) {
-  testTensorExp(A1, A2);
-  testTensorLog(A1, A2);
-  testTensorSqrt(A1, A2);
-  testTensorPow(A1, A2);
-}
-
-TEST(Unary, MathOp) {
-  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorClip(Tensor& A1, Tensor& A2) {
-  real p1 = 0.003f;
-  real p2 = 0.877f;
-  A1.clip(p1, p2);  // a = a < p1 ? p1 : (a > p2 ? p2 : a)
-  // A2 = A2.min(0.877f).max(0.003f);
-  A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
-  real p = 0.5f;
-  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
-  A2 = (A2 > p).condition((real)1.0, (real)0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorapplyL1(Tensor& A1, Tensor& A2) {
-  /**
-   * T lambda = p;
-   * a = (a > lambda) ? (a - lambda)
-   *                  : (a < -lambda) ? (a + lambda) : 0
-   *
-   * p = learningRate * decayRate;
-   */
-  real learningRate = 0.7f;
-  real decayRate = 0.6f;
-  A1.applyL1(learningRate, decayRate);
-  A2 = (A2 > (learningRate * decayRate))
-           .condition(
-               (A2 - (learningRate * decayRate)),
-               (A2 < -(learningRate * decayRate))
-                   .condition((A2 + (learningRate * decayRate)), (real)0.0));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
-  testTensorClip(A1, A2);
-  testTensorBiggerThanScalar(A1, A2);
-
-  A1.randomizeUniform();
-  A1.subScalar(0.5f);
-  A2.copyFrom(A1);
-  testTensorapplyL1(A1, A2);
-}
-
-TEST(Unary, CompareOp) {
-  TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p1 = 2.5;
-  real p2 = 3.2;
-  A1.add(B);  // a += b
-  A2 += B;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(B, p1);  // a += b * p
-  A2 += B * p1;
-  TensorCheckEqual(A1, A2);
-
-  A1.add(B, p1, p2);  // a = p1 * a + p2 * b
-  A2 = A2 * p1 + B * p2;
-  TensorCheckEqual(A1, A2);
-
-  A1.addScalar(B, p1);  // a = b + p
-  A2 = B + p1;
-  TensorCheckEqual(A1, A2);
-
-  A1.addSquare(B, p1);  // a += p * b * b
-  A2 += B.constant(p1) * B * B;
-  TensorCheckEqual(A1, A2);
-
-  A1.decayAddSquare(B, p1, p2);  // a = p1 * a + p2 * b * b
-  A2 = A2 * p1 + B.constant(p2) * B * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 2.5;
-  A1.sub(B);  // a -= b
-  A2 -= B;
-  TensorCheckEqual(A1, A2);
-
-  A1.sub(B, p);  // a -= b * p
-  A2 -= B * p;
-  TensorCheckEqual(A1, A2);
-
-  A1.subScalar(B, p);  // a = b - p
-  A2 = B - p;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 2.5;
-  A1.mulScalar(B, p);  // a = b * p
-  A2 = B * p;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotMulSquare(B);  // a *= b * b
-  A2 *= B * B;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotSquareMul(B);  // a = a * a * b
-  A2 = A2 * A2 * B;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotMul(B);  // a *= b
-  A2 *= B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 2.5;
-  A1.divScalar(B, p);  // a = b / p
-  A2 = B / p;
-  TensorCheckEqual(A1, A2);
-
-  A1.scalarDiv(B, p);  // a = p / b
-  A2 = B.constant(p) / B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.assign(B);  // a = b
-  A2 = B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.square2(A1);  // b = a * a
-  A2 = B.square();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.squareDerivative(B);  // a *= 2.0 * b
-  A2 = A2 * (real)2.0 * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.reciprocal2(A1);  // b = 1.0f / a
-  A2 = B.reciprocal();
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 0.58;
-  real p2 = 0.32;
-  A1.reciprocal2(B, p1, p2);  // a = 1 / (p1 * b + p2)
-  A2 = (B * p1 + p2).reciprocal();
-  TensorCheckEqual(A1, A2);
-
-  real learningRate = 0.7f;
-  real decayRate = 1.2f;
-  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
-  A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
-            .reciprocal();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.reciprocalDerivative(B);  // a *= -b * b
-  A2 *= (-B) * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
-  A2 = B.sign();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.abs2(A1);  // b = a > 0.0f ? a : -a
-  A2 = B.abs();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
-  testTensorAdd(A1, A2, B);
-  testTensorSub(A1, A2, B);
-  testTensorMul(A1, A2, B);
-  testTensorDiv(A1, A2, B);
-  testTensorSquare(A1, A2, B);
-  testTensorSquareDerivative(A1, A2, B);
-  testTensorReciprocal(A1, A2, B);
-  testTensorReciprocalDerivative(A1, A2, B);
-  testTensorAbs(A1, A2, B);
-  testTensorSign(A1, A2, B);
-  testTensorAssign(A1, A2, B);
-}
-
-TEST(Binary, BaseOp) {
-  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = exp(b)
-  A1.exp2(B);
-  A2 = B.exp();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.expDerivative(B);  // a *= b
-  A2 *= B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = log(b)
-  A1.log2(B);
-  A2 = B.log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = sqrt(b)
-  A1.sqrt2(B);
-  A2 = B.sqrt();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
-  // a = 1.0f / sqrt(b)
-  A1.invSqrt(B);
-  A2 = B.sqrt().reciprocal();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.pow2(B, 2.5f);  // a = pow(b, p)
-  A2 = B.pow(2.5f);
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-   * const T THRESHOLD = 40.0;
-   * b = log(1.0 +
-   *         exp((a > THRESHOLD) ? THRESHOLD
-   *             : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))
-   */
-  B.softrelu(A1);
-
-  real THRESHOLD = 40.0;
-  A2 = (B.constant(1.0f) +
-        (B > THRESHOLD)
-            .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
-            .exp())
-           .log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-   * const T THRESHOLD = 40.0;
-   * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
-   *                             ? THRESHOLD
-   *                             : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
-   */
-  A1.softreluDerivative(B);
-  real THRESHOLD = 40.0;
-  A2 = A2 *
-       (B.constant(1.0f) -
-        (B.constant(-1.0f) *
-         (B > THRESHOLD)
-             .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
-            .exp());
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-    const T THRESHOLD_MIN = -40.0;
-    const T THRESHOLD_MAX = 13.0;
-    T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN
-            : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
-    b = 1.0f / (1.0f + exp(-tmp)))
-   */
-  B.sigmoid(A1);
-
-  const real THRESHOLD_MIN = -40.0;
-  const real THRESHOLD_MAX = 13.0;
-  auto tmp = (B < THRESHOLD_MIN)
-                 .condition(THRESHOLD_MIN,
-                            (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
-  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
-  A2 *= B * (B.constant(1.0f) - B);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
-  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.tanhDerivative(B);  // a *= 1 - b * b
-  A2 *= B.constant(1.0f) - B * B;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p1 = 2.5;
-  real p2 = 3.1;
-  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
-  B.scaledTanh(A1, p1, p2);
-  A2 = B.constant(p1) *
-       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
-        (real)1.0);
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p1 = 2.5;
-  real p2 = 3.1;
-  // a *= (p2 / p1) * (p1 * p1 - b * b));
-  A1.scaledTanhDerivative(B, p1, p2);
-  A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
-  testTensorTanhDerivative(A1, A2, B);
-  testTensorScaledTanhDerivative(A1, A2, B);
-  testTensorSigmoidDerivative(A1, A2, B);
-  testTensorExpDerivative(A1, A2, B);
-  testTensorScaledTanh(A1, A2, B);
-  testTensorTanh(A1, A2, B);
-  testTensorExp(A1, A2, B);
-  testTensorLog(A1, A2, B);
-  testTensorSqrt(A1, A2, B);
-  testTensorInvSqrt(A1, A2, B);
-  testTensorPow(A1, A2, B);
-
-  testTensorSoftrelu(A1, A2, B);
-  testTensorSoftreluDerivative(A1, A2, B);
-  testTensorSigmoid(A1, A2, B);
-}
-
-TEST(Binary, MathOp) {
-  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
-  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
-  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
-  /*
-   * b = a > p1 ? a : p1
-   * b = b < p2 ? b : p2
-   * int p1 = 0, p2 = 24;
-   */
-  SetTensorValue(B, 32.0f);
-  B.brelu(A1);
-  auto tmp = (B > (real)0.0f).condition(B, (real)0.0f);
-  A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  SetTensorValue(B, 32.0f);
-  /*
-   * a *= (b > p1 && b < p2) ? 1.0 : 0.0
-   * int p1 = 0, p2 = 24;
-   */
-  A1.breluDerivative(B);
-  A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
-  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
-  A2 = (B > (real)0.0f)
-           .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
-  real p = 0.613;
-  SetTensorValue(B, p);
-  A1.isEqualTo(B, p);  // a = (b == p)
-  A2 = (B == p);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
-  /**
-   * T lambda = p * b;
-   * a = (a > lambda) ? (a - lambda)
-   *                  : (a < -lambda) ? (a + lambda) : 0
-   *
-   * p = learningRate * decayRate;
-   */
-  real learningRate = 0.7f;
-  real decayRate = 0.6f;
-  A1.applyL1(B, learningRate, decayRate);
-  auto lambda = B.constant(learningRate * decayRate) * B;
-  A2 = (A2 > lambda)
-           .condition((A2 - lambda),
-                      (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.subScalar(0.5f);
-  SetTensorValue(B, 0.0f);
-  testTensorReluDerivative(A1, A2, B);
-
-  A1.randomizeUniform();
-  A2.copyFrom(A1);
-  testTensorBreluDerivative(A1, A2, B);
-
-  testTensorAbsDerivative(A1, A2, B);
-  testTensorRelu(A1, A2, B);
-  testTensorBrelu(A1, A2, B);
-  testTensorIsEqualTo(A1, A2, B);
-}
-
-TEST(Binary, CompareOp) {
-  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.add(B, C);  // a = b + c
-  A2 = B + C;
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-  real p3 = 3.8;
-  A1.add(B, p1, C, p2);  // a = p1 * b + p2 * c
-  A2 = B * p1 + C * p2;
-  TensorCheckEqual(A1, A2);
-
-  A1.add2(B, C);  // a = a + b + c
-  A2 = A2 + B + C;
-  TensorCheckEqual(A1, A2);
-
-  A1.add2(B, C, p1, p2, p3);  // a = p1 * a + p2 * b + p3 * c
-  A2 = A2 * p1 + B * p2 + C * p3;
-  TensorCheckEqual(A1, A2);
-
-  A1.decayAddSquareMul(B, C, p1, p2);  // a = p1 * a + p2 * b * b * c * c
-  A2 = A2 * p1 + B.constant(p2) * B * B * C * C;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.sub(B, C);  // a = b - c
-  A2 = B - C;
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-  A1.sub(B, p1, C, p2);  // a = p1 * b - p2 * c
-  A2 = B * p1 - C * p2;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.dotMul(B, C);  // a = b * c
-  A2 = B * C;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotMulSquare(B, C);  // a = b * c * c
-  A2 = B * C * C;
-  TensorCheckEqual(A1, A2);
-
-  A1.dotSquareSquare(B, C);  // a = b * b * c * c
-  A2 = B * B * C * C;
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-
-  /*
-   * T tmp = p1 * b + p2 * c;
-   * a *= tmp * tmp
-   */
-  A1.dotMulSquareSum(B, C, p1, p2);
-  auto tmp = B * p1 + C * p2;
-  A2 *= tmp * tmp;
-  TensorCheckEqual(A1, A2);
-
-  /*
-   * T tmp = p1 * b + p2 * c;
-   * a = tmp * tmp
-   */
-  A1.dotSquareSum(B, C, p1, p2);
-  auto tmp2 = B * p1 + C * p2;
-  A2 = tmp2 * tmp2;
-  TensorCheckEqual(A1, A2);
-
-  // a *= p1 * b + p2 * c
-  A1.dotMulSum(B, C, p1, p2);
-  A2 *= B * p1 + C * p2;
-  TensorCheckEqual(A1, A2);
-
-  // a = p1 * a + p2 * b * c
-  A1.addDotMul(B, C, p1, p2);
-  A2 = A2 * p1 + B.constant(p2) * B * C;
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
-  A2 = (B == (real)0.0).condition((real)0.0, B / C);
-  TensorCheckEqual(A1, A2);
-
-  real p1 = 1.5;
-  real p2 = 2.5;
-  A1.dotDiv(B, C, p1, p2);  // a = (b + p1) / (c + p2)
-  A2 = (B + p1) / (C + p2);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  real p1 = 1.5;
-  real p2 = 2.5;
-  real p3 = 3.5;
-  A1.reciprocalSum(B, C, p1, p2, p3);  // a = 1 / (p1 * b + p2 * c + p3)
-  A2 = (B * p1 + C * p2 + p3).reciprocal();
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
-  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorSoftCrossEntropyBp(Tensor& A1,
-                                  Tensor& A2,
-                                  Tensor& B,
-                                  Tensor& C) {
-  A1.softCrossEntropyBp(B, C);  // a += (b - c) / (b * (1 - b))
-  A2 += (B - C) / (B * (B.constant(1.0f) - B));
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  testTensorAdd(A1, A2, B, C);
-  testTensorSub(A1, A2, B, C);
-  testTensorMul(A1, A2, B, C);
-  testTensorDiv(A1, A2, B, C);
-  testTensorReciprocal(A1, A2, B, C);
-  testTensorSoftCrossEntropyBp(A1, A2, B, C);
-
-  testTensorSoftCrossEntropy(A1, A2, B, C);
-}
-
-TEST(Ternary, BaseOp) {
-  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorBinaryLabelCrossEntropy(Tensor& A1,
-                                       Tensor& A2,
-                                       Tensor& B,
-                                       Tensor& C) {
-  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
-  A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log()));
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
-                                         Tensor& A2,
-                                         Tensor& B,
-                                         Tensor& C) {
-  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
-  A1.binaryLabelCrossEntropyBp(B, C);
-  A2 += (C > (real)0.5)
-            .condition((B.constant(-1.0f) / B),
-                       (B.constant(1.0f) - B).reciprocal());
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLogisticRegressionLoss(Tensor& A1,
-                                      Tensor& A2,
-                                      Tensor& B,
-                                      Tensor& C) {
-  SetTensorValue(B, 50.0f);
-  SetTensorValue(B, -50.0f);
-  /**
-   * const T THRESHOLD = 40.0;
-   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-   *                                        ? -THRESHOLD
-   *                                        : b;
-   * a = log(1 + exp(x)) - c * x
-   */
-  A1.logisticRegressionLoss(B, C);
-  real THRESHOLD = 40.0;
-  auto tmp =
-      (B > THRESHOLD)
-          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
-  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorLogisticRegressionLossBp(Tensor& A1,
-                                        Tensor& A2,
-                                        Tensor& B,
-                                        Tensor& C) {
-  SetTensorValue(B, 50.0f);
-  SetTensorValue(B, -50.0f);
-  /**
-   * const T THRESHOLD = 40.0;
-   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
-   *                                        ? -THRESHOLD
-   *                                        : b;
-   * x = exp(x); a = x / (1 + x) - c
-   */
-  A1.logisticRegressionLossBp(B, C);
-  real THRESHOLD = 40.0;
-  auto tmp =
-      (B > THRESHOLD)
-          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
-  auto tmp2 = tmp.exp();
-  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
-  A2 = (B > C).condition((real)1.0f, (real)0.0f);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  A1.max2(B, C);  // a = (b > c) ? b : c
-  A2 = (B > C).condition(B, C);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
-  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
-  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
-  testTensorBiggerThan(A1, A2, B, C);
-  testTensorMax(A1, A2, B, C);
-
-  testTensorLogisticRegressionLoss(A1, A2, B, C);
-  testTensorLogisticRegressionLossBp(A1, A2, B, C);
-}
-
-TEST(Ternary, CompareOp) {
-  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testQuaternaryAdd(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
-  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
-  // TensorCheckEqual(A1, A2);
-
-  /*
-   * T tmp = p1 * b + p2 * c + p3 * d;
-   * a += tmp * tmp
-   */
-  real p1 = 1.5f;
-  real p2 = 2.5f;
-  real p3 = 3.5f;
-  A1.addSquareSum(B, C, D, p1, p2, p3);
-  auto tmp = B * p1 + C * p2 + D * p3;
-  A2 += tmp * tmp;
-  TensorCheckEqual(A1, A2);
-}
-
-TEST(Quaternary, BaseOp) {
-  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
-#endif
-}
-
-template <typename Tensor>
-void testTensorBiggerThan(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
-  A1.biggerThan(B, C, D);
-  A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
-           .condition((real)1.0, (real)0.0);
-  TensorCheckEqual(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorRankLoss(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  /**
-   * const T THRESHOLD = 40.0; a = b - c;
-   * a = (a > THRESHOLD)
-   *         ? THRESHOLD
-   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-   * a = log(1 + exp(a)) - a * d
-   */
-  A1.rankLoss(B, C, D);
-
-  real THRESHOLD = 40.0;
-  auto tmp = B - C;
-  auto tmp2 =
-      (tmp > THRESHOLD)
-          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
-  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;
-
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testTensorRankLossBp(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  /**
-   * const T THRESHOLD = 40.0; a = b - c;
-   * a = (a > THRESHOLD)
-   *         ? THRESHOLD
-   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-   * a = exp(a); a = (a / (1 + a) - d)
-   */
-  A1.rankLossBp(B, C, D);
-  real THRESHOLD = 40.0;
-  auto tmp = B - C;
-  auto tmp2 =
-      (tmp > THRESHOLD)
-          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
-  auto tmp3 = tmp2.exp();
-  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
-
-  TensorCheckErr(A1, A2);
-}
-
-template <typename Tensor>
-void testQuaternaryCompareOp(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
-  testTensorBiggerThan(A1, A2, B, C, D);
-  testTensorRankLoss(A1, A2, B, C, D);
-  testTensorRankLossBp(A1, A2, B, C, D);
-}
-
-TEST(Quaternary, CompareOp) {
-  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
-
-#ifdef PADDLE_WITH_GPU
-  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
-#endif
-}
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
deleted file mode 100644
index fb146176ca8eb97a9cdbaf9ebd5c4997a8439718..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ /dev/null
@@ -1,461 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "OriginalOptimizerApi.h"
-#include "PerfUtils.h"
-#include "TensorCheck.h"
-#include "paddle/math/TrainingAlgorithmOp.h"
-#include "paddle/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-
-#ifndef PADDLE_TYPE_DOUBLE
-DEFINE_double(max_diff, 1e-5, "max diff allowed");
-#else
-DEFINE_double(max_diff, 1e-13, "max diff allowed");
-#endif
-
-class SetMaxDiff {
-public:
-  explicit SetMaxDiff(double max_diff) {
-    max_diff_ = FLAGS_max_diff;
-    FLAGS_max_diff = max_diff;
-  }
-  ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }
-
-private:
-  double max_diff_;
-};
-
-#define COPY_VECTOR_TO_CPU(cpuVec, vector)               \
-  do {                                                   \
-    if (vector->useGpu()) {                              \
-      cpuVec = Vector::create(vector->getSize(), false); \
-      cpuVec->copyFrom(*vector);                         \
-    } else {                                             \
-      cpuVec = vector;                                   \
-    }                                                    \
-  } while (0)
-
-int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
-  CHECK(vector1.getSize() == vector2.getSize());
-
-  const real* data1 = vector1.getData();
-  const real* data2 = vector2.getData();
-  size_t size = vector1.getSize();
-  int count = 0;
-  for (size_t i = 0; i < size; i++) {
-    real a = data1[i];
-    real b = data2[i];
-    if (fabs(a - b) > FLAGS_max_diff) {
-      if ((fabsf(a - b) / fabsf(a)) > (FLAGS_max_diff / 10.0f)) {
-        count++;
-      }
-    }
-  }
-
-  return count;
-}
-
-int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
-  VectorPtr tmp1;
-  VectorPtr tmp2;
-  COPY_VECTOR_TO_CPU(tmp1, vector1);
-  COPY_VECTOR_TO_CPU(tmp2, vector2);
-  return VectorCheckErr(*tmp1, *tmp2);
-}
-
-#ifdef PADDLE_DISABLE_TIMER
-
-#define CHECK_VECTORPTR(vector1, vector2) \
-  EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
-
-#else
-
-#define CHECK_VECTORPTR(vector1, vector2)
-
-#endif
-
-typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
-
-void testCase(testMatrixFunc matrixFunc) {
-#ifdef PADDLE_WITH_CUDA
-  for (auto useGpu : {false, true}) {
-#else
-  for (auto useGpu : {false}) {
-#endif
-    for (auto size : {1,
-                      32,
-                      64,
-                      128,
-                      512,
-                      1024,
-                      4096,
-                      32768,
-                      65536,
-                      131072,
-                      262144,
-                      524288,
-                      1048576,
-                      2097152}) {
-      LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
-      matrixFunc(size, useGpu);
-    }
-  }
-}
-
-#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
-  vec1[type] = Vector::create(size, useGpu);        \
-  vec2[type] = Vector::create(size, useGpu);        \
-  vec1[type]->rand();                               \
-  vec2[type]->copyFrom(*vec1[type]);
-
-void testAdagrad(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-
-  EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
-      bufs1, epsilon, learningRate, momentum, decayRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(adagradApply(value,
-                                      grad,
-                                      mom,
-                                      accum_buffer,
-                                      accum,
-                                      lr,
-                                      epsilon,
-                                      learningRate,
-                                      momentum,
-                                      decayRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, Adagrad) { testCase(testAdagrad); }
-
-void testAdaDelta(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-
-  EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
-      bufs1, rou, epsilon, learningRate, momentum, decayRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(adadeltaApply(value,
-                                       grad,
-                                       mom,
-                                       accum,
-                                       accum_update,
-                                       lr,
-                                       rou,
-                                       epsilon,
-                                       learningRate,
-                                       momentum,
-                                       decayRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, AdaDelta) { testCase(testAdaDelta); }
-
-template <bool isFirstTime>
-void testRMSProp(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  /* make sure 'g - f.square()' greater than 0 */
-  bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
-  bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
-      *bufs1[PARAMETER_GRADIENT_SQURESUM]);
-
-  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-  real accumulatedRou = rou;
-
-  EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1,
-                                                   accumulatedRou,
-                                                   rou,
-                                                   epsilon,
-                                                   learningRate,
-                                                   momentum,
-                                                   decayRate,
-                                                   isFirstTime));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(rmspropApply(value,
-                                      grad,
-                                      mom,
-                                      sum,
-                                      sum1,
-                                      lr,
-                                      accumulatedRou,
-                                      rou,
-                                      epsilon,
-                                      learningRate,
-                                      momentum,
-                                      decayRate,
-                                      isFirstTime));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, RMSProp) {
-  testCase(testRMSProp<true>);
-  testCase(testRMSProp<false>);
-}
-
-template <bool isFirstTime>
-void testDecayedAdagrad(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);
-
-  real rou = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real momentum = (real)rand() / (real)RAND_MAX;      // NOLINT
-  real decayRate = (real)rand() / (real)RAND_MAX;     // NOLINT
-  real accumulatedRou = rou;
-
-  if (isFirstTime) {
-    bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
-    bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
-  }
-
-  EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1,
-                                                          accumulatedRou,
-                                                          rou,
-                                                          epsilon,
-                                                          learningRate,
-                                                          momentum,
-                                                          decayRate,
-                                                          isFirstTime));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];
-
-  EXPRESSION_PERFORMANCE(decayedAdagradApply(value,
-                                             grad,
-                                             mom,
-                                             sum,
-                                             lr,
-                                             accumulatedRou,
-                                             rou,
-                                             epsilon,
-                                             learningRate,
-                                             momentum,
-                                             decayRate,
-                                             isFirstTime));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
-                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
-                  bufs2[PARAMETER_LEARNING_RATE]);
-}
-
-TEST(Training, DecayedAdagrad) {
-  testCase(testDecayedAdagrad<false>);
-  testCase(testDecayedAdagrad<true>);
-}
-
-void testAdam(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_SECOND_MOMENTUM, size, useGpu);
-
-  real beta1 = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real beta2 = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real beta1_power = (real)rand() / (real)RAND_MAX;   // NOLINT
-  real beta2_power = (real)rand() / (real)RAND_MAX;   // NOLINT
-  real epsilon = (real)rand() / (real)RAND_MAX;       // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-
-  EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
-      bufs1, beta1, beta2, beta1_power, beta2_power, epsilon, learningRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& v = *bufs2[PARAMETER_SECOND_MOMENTUM];
-
-  EXPRESSION_PERFORMANCE(adamApply(value,
-                                   grad,
-                                   mom,
-                                   v,
-                                   beta1,
-                                   beta2,
-                                   beta1_power,
-                                   beta2_power,
-                                   epsilon,
-                                   learningRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_SECOND_MOMENTUM],
-                  bufs2[PARAMETER_SECOND_MOMENTUM]);
-}
-
-TEST(Training, Adam) { testCase(testAdam); }
-
-void testAdamax(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);
-
-  real beta1 = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real beta2 = (real)rand() / (real)RAND_MAX;  // NOLINT
-  real alpha = (real)rand() / (real)RAND_MAX;  // NOLINT
-  int64_t step = 2;
-
-  EXPRESSION_PERFORMANCE(
-      AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
-  BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];
-
-  EXPRESSION_PERFORMANCE(
-      adamaxApply(value, grad, mom, u, beta1, beta2, step, alpha));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
-                  bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
-}
-
-TEST(Training, Adamax) {
-#ifndef PADDLE_TYPE_DOUBLE
-  SetMaxDiff diff(1e-4);
-#endif
-  testCase(testAdamax);
-}
-
-void testSparseMomentum(size_t size, bool useGpu) {
-  VectorPtr bufs1[NUM_PARAMETER_TYPES];
-  VectorPtr bufs2[NUM_PARAMETER_TYPES];
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
-  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);
-
-  real alpha = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real beta = (real)rand() / (real)RAND_MAX;          // NOLINT
-  real gamma = (real)rand() / (real)RAND_MAX;         // NOLINT
-  real tau = (real)rand() / (real)RAND_MAX;           // NOLINT
-  real learningRate = (real)rand() / (real)RAND_MAX;  // NOLINT
-
-  EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
-      bufs1, alpha, beta, gamma, tau, learningRate));
-
-  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
-  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
-  BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
-  BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];
-
-  EXPRESSION_PERFORMANCE(sparseMomentumApply(
-      value, grad, momU, momV, alpha, beta, gamma, tau, learningRate));
-
-  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]);
-  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]);
-}
-
-TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
deleted file mode 100644
index cbd74bbfe33270f351632b58d7e89f8e60d15b83..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_lazyAssign.cu
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include "PerfUtils.h"
-#include "TensorCheck.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/TensorAssign.h"
-
-using paddle::BaseMatrix;
-using paddle::CpuMatrix;
-using paddle::GpuMatrix;
-using autotest::TensorCheckEqual;
-using autotest::TensorCheckErr;
-
-typedef std::function<void(int height, int width)> testMatrixFunc;
-void testMatrixCase(testMatrixFunc matrixFunc) {
-  for (auto height : {1}) {
-    for (auto width : {1,
-                       32,
-                       64,
-                       128,
-                       512,
-                       1024,
-                       4096,
-                       32768,
-                       65536,
-                       131072,
-                       262144,
-                       524288,
-                       1048576,
-                       2097152,
-                       4194304,
-                       8388608}) {
-      matrixFunc(height, width);
-    }
-  }
-}
-
-template <typename Tensor>
-void testLazyAssign(int height, int width) {
-  Tensor A1(height, width);
-  Tensor A2(height, width);
-  Tensor B(height, width);
-  Tensor C(height, width);
-  Tensor D(height, width);
-  A1.randomizeUniform();
-  B.randomizeUniform();
-  C.randomizeUniform();
-  D.randomizeUniform();
-  A2.copyFrom(A1);
-
-  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
-
-  EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
-                         auto expr2 = A2.lazyAssign(A2 * D);
-                         AssignEvaluate(expr1, expr2););
-
-  TensorCheckErr(A1, A2);
-}
-
-TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
-
-#ifdef PADDLE_WITH_GPU
-TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
-#endif
-
-template <typename Tensor>
-void sgdUpdateTensor(
-    Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
-  C = C * p2 - D * (B + A * p3) * p1;
-  A += C;
-}
-
-void sgdUpdateLazyAssign(BaseMatrix& A,
-                         BaseMatrix& B,
-                         BaseMatrix& C,
-                         BaseMatrix& D,
-                         real p1,
-                         real p2,
-                         real p3) {
-  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
-  auto expr2 = A.lazyAssign(A + C);
-  AssignEvaluate(expr1, expr2);
-}
-
-template <typename Tensor>
-void testSgdUpdate(int height, int width) {
-  Tensor A1(height, width);
-  Tensor A2(height, width);
-  Tensor A3(height, width);
-  A1.randomizeUniform();
-  A2.copyFrom(A1);
-  A3.copyFrom(A1);
-
-  Tensor B(height, width);
-  B.randomizeUniform();
-
-  Tensor C1(height, width);
-  Tensor C2(height, width);
-  Tensor C3(height, width);
-  C1.randomizeUniform();
-  C2.copyFrom(C1);
-  C3.copyFrom(C1);
-
-  Tensor D(height, width);
-  D.randomizeUniform();
-
-  real p1 = 0.2;
-  real p2 = 0.3;
-  real p3 = 0.5;
-
-  /**
-   * c = p2 * c - p1 * (b + p3 * a);
-   * a = a + c;
-   */
-  // BaseMatrix API
-  EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););
-
-  // Tensor expression
-  EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
-
-  // lazyAssign
-  EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
-
-  TensorCheckErr(A1, A2);
-  TensorCheckErr(A1, A3);
-  TensorCheckErr(C1, C2);
-  TensorCheckErr(C1, C3);
-}
-
-TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
-
-#ifdef PADDLE_WITH_GPU
-TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
-#endif
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
deleted file mode 100644
index e45ddd433faf18dbcd647b305db3a36d38c90825..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ /dev/null
@@ -1,1698 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
-/// only cpu version.
-
-#include <gtest/gtest.h>
-#include "TensorCheck.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/testing/TestUtil.h"
-#include "paddle/utils/DynamicLoader.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-using autotest::TensorCheckEqual;
-using autotest::TensorCheckErr;
-
-void testMatrixMaxSequence(int batchSize, int inputDim) {
-  // forward
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  int newBatchSize = cpuSequence->getSize() - 1;
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
-  cpuOutput->zero();
-  gpuOutput->zero();
-
-  IVectorPtr cpuIndex = nullptr;
-  IVectorPtr gpuIndex = nullptr;
-  IVector::resizeOrCreate(cpuIndex, newBatchSize * inputDim, false);
-  IVector::resizeOrCreate(gpuIndex, newBatchSize * inputDim, true);
-  cpuIndex->zeroMem();
-  gpuIndex->zeroMem();
-
-  cpuOutput->maxSequenceForward(*cpuInput, *cpuSequence, *cpuIndex);
-  gpuOutput->maxSequenceForward(*gpuInput, *gpuSequence, *gpuIndex);
-
-  TensorCheckEqual(*cpuOutput, *gpuOutput);
-  TensorCheckEqual(*cpuIndex, *gpuIndex);
-
-  // backward
-  MatrixPtr cpuOutputGrad = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  MatrixPtr gpuOutputGrad = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
-  cpuOutputGrad->randomizeUniform();
-  gpuOutputGrad->copyFrom(*cpuOutputGrad);
-
-  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInputGrad->randomizeUniform();
-  gpuInputGrad->copyFrom(*cpuInputGrad);
-
-  cpuInputGrad->maxSequenceBackward(*cpuOutputGrad, *cpuSequence, *cpuIndex);
-  gpuInputGrad->maxSequenceBackward(*gpuOutputGrad, *gpuSequence, *gpuIndex);
-
-  TensorCheckEqual(*cpuInputGrad, *gpuInputGrad);
-}
-
-TEST(Matrix, maxSequence) {
-  for (auto batchSize : {1, 3, 997}) {   // prime numbers close to 1, 4, 1024
-    for (auto inputDim : {1, 7, 131}) {  // prime numbers close to 1, 8, 128
-      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
-      testMatrixMaxSequence(batchSize, inputDim);
-    }
-  }
-}
-
-void testMatrixGetSum(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  int x = log10(height * width);
-  real err = 1e-6 * pow(10, x);
-#else
-  real err = 1e-8;
-#endif
-
-  real cpuSum = cpuInput->getSum();
-  real gpuSum = gpuInput->getSum();
-
-  EXPECT_LE(fabs(cpuSum - gpuSum), err);
-}
-
-void testMatrixGetMinMax(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  real cpuMin = cpuInput->getMin();
-  real gpuMin = gpuInput->getMin();
-  real cpuMax = cpuInput->getMax();
-  real gpuMax = gpuInput->getMax();
-
-  EXPECT_EQ(cpuMin, gpuMin);
-  EXPECT_EQ(cpuMax, gpuMax);
-}
-
-void testMatrixZeroAtOffset(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr cpuTest = std::make_shared<CpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  cpuTest->copyFrom(*cpuA);
-
-  int columnOffset = rand() % width;  // NOLINT we just use rand() for test.
-  int numColumns = rand() % (width - columnOffset);  // NOLINT
-
-  if (numColumns == 0) return;
-
-  cpuA->zeroAtOffset(columnOffset, numColumns);
-  gpuA->zeroAtOffset(columnOffset, numColumns);
-
-  /* cpuTest */
-  real* a = cpuTest->getData() + columnOffset;
-  for (int64_t i = 0; i < height; ++i) {
-    for (int64_t j = 0; j < numColumns; ++j) {
-      a[i * width + j] = 0;
-    }
-  }
-
-  TensorCheckEqual(*cpuA, *gpuA);
-  TensorCheckEqual(*cpuA, *cpuTest);
-}
-
-void testMatrixDeepSwap(int height, int width) {
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuCopyA = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuCopyB = std::make_shared<CpuMatrix>(height, width);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuCopyA->copyFrom(*cpuA);
-  cpuCopyB->copyFrom(*cpuB);
-
-  // swap matrix cpuA and cpuB
-  cpuA->deepSwap(*cpuB);
-
-  TensorCheckEqual(*cpuA, *cpuCopyB);
-  TensorCheckEqual(*cpuB, *cpuCopyA);
-}
-
-void testMatrixTranspose(int height, int width) {
-  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr cpuT = std::make_shared<CpuMatrix>(width, height);
-  MatrixPtr gpuT = std::make_shared<GpuMatrix>(width, height);
-
-  cpu->randomizeUniform();
-  gpu->copyFrom(*cpu);
-  cpu->transpose(cpuT, false);
-  gpu->transpose(gpuT, true);
-
-  TensorCheckEqual(*cpuT, *gpuT);
-}
-
-void testMatrixRotate(int height, int width) {
-  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr cpuR = std::make_shared<CpuMatrix>(width, height);
-  MatrixPtr gpuR = std::make_shared<GpuMatrix>(width, height);
-
-  cpu->randomizeUniform();
-  gpu->copyFrom(*cpu);
-
-  cpu->rotate(cpuR, false, true);
-  gpu->rotate(gpuR, true, true);
-  TensorCheckEqual(*cpuR, *gpuR);
-
-  cpu->rotate(cpuR, true, false);
-  gpu->rotate(gpuR, false, false);
-  TensorCheckEqual(*cpuR, *gpuR);
-}
-
-void testMatrixInverse(int height) {
-  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, height);
-  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, height);
-  MatrixPtr cpuI = std::make_shared<CpuMatrix>(height, height);
-  MatrixPtr gpuI = std::make_shared<GpuMatrix>(height, height);
-
-  /* Make matrix well conditioned: cpu * cpuT + Identity */
-  cpu->randomizeUniform();
-  MatrixPtr cpuT = cpu->getTranspose();
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, height);
-  outputCheck->mul(*cpu, *cpuT);
-  cpu->setDiag(1.0);
-  cpu->add(*outputCheck);
-
-  gpu->copyFrom(*cpu);
-  cpu->inverse(cpuI, true);
-  gpu->inverse(gpuI, false);
-
-  TensorCheckErr(*cpuI, *gpuI);
-
-  outputCheck->mul(*cpu, *cpuI);
-  cpu->setDiag(1.0);
-  TensorCheckErr(*cpu, *outputCheck);
-}
-
-TEST(Matrix, unary) {
-  for (auto height : {1, 3, 11, 73, 128, 200, 330}) {
-    for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-
-      testMatrixDeepSwap(height, width);
-      testMatrixZeroAtOffset(height, width);
-      testMatrixGetSum(height, width);
-      testMatrixTranspose(height, width);
-      testMatrixRotate(height, width);
-    }
-#ifdef LAPACK_FOUND
-    // inverse matrix
-    testMatrixInverse(height);
-#else
-    LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK"
-                 << "support so we cannot test matrix inverse. To test "
-                 << "matrix inverse, please install LAPACKE "
-                 << "and MKL/Openblas, and re-build PaddlePaddle.";
-#endif
-  }
-}
-
-void testMatrixSoftmax(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->zero();
-  gpuOutput->zero();
-  cpuInput->softmax(*cpuOutput);
-  gpuInput->softmax(*gpuOutput);
-
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-}
-
-void testSequenceSoftmax(int batchSize) {
-  // forward
-  int inputDim = 1;
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  cpuInput->sequenceSoftmax(*cpuInput, *cpuSequence);
-  gpuInput->sequenceSoftmax(*gpuInput, *gpuSequence);
-
-  TensorCheckErr(*cpuInput, *gpuInput);
-}
-
-void testMatrixSoftmaxThreshold(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  cpuInput->getData()[0] = 100.0;
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->zero();
-  gpuOutput->zero();
-  cpuInput->softmax(*cpuOutput);
-  gpuInput->softmax(*gpuOutput);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(height, width);
-  outputCheck->copyFrom(*gpuOutput);
-  // check output zero
-  int cpuCount = 0;
-  int gpuCount = 0;
-  auto zeroNum = [](MatrixPtr out, int& count) {
-    for (size_t i = 0; i < out->getHeight(); i++) {
-      for (size_t j = 0; j < out->getWidth(); j++) {
-        if (out->getElement(i, j) == 0) count++;
-      }
-    }
-  };
-  zeroNum(cpuOutput, cpuCount);
-  zeroNum(outputCheck, gpuCount);
-  EXPECT_EQ(cpuCount, 0) << "Cpu softmax output value 0";
-  EXPECT_EQ(gpuCount, 0) << "Gpu softmax output value 0";
-}
-
-void testMatrixSoftmaxBp(int height, int width) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(height, width);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(height, width);
-
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-  gpuOutput->softmaxBackward(*gpuInput);
-
-  MatrixPtr sftMaxSum = std::make_shared<CpuMatrix>(height, 1);
-  MatrixPtr sftMaxDot = std::make_shared<CpuMatrix>(height, width);
-  sftMaxDot->dotMul(*cpuOutput, *cpuInput);
-  sftMaxSum->colMerge(*sftMaxDot);
-  cpuOutput->softmaxDerivative(*cpuInput, *sftMaxSum);
-
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-}
-
-TEST(Matrix, softmax) {
-  for (auto height : {1, 3, 131}) {    // prime numbers close to 1, 4, 127
-    for (auto width : {1, 17, 251}) {  // prime numbers close to 1, 16, 256
-      VLOG(3) << " height=" << height << " width=" << width;
-
-      testMatrixSoftmax(height, width);
-      testMatrixSoftmaxBp(height, width);
-      testMatrixSoftmaxThreshold(height, width);
-    }
-    testSequenceSoftmax(height);
-  }
-}
-
-void testMatrixAddToRows(int numSamples, int tableSize, int inputDim) {
-  MatrixPtr cpuTable = std::make_shared<CpuMatrix>(tableSize, inputDim);
-  MatrixPtr gpuTable = std::make_shared<GpuMatrix>(tableSize, inputDim);
-  cpuTable->randomizeUniform();
-  gpuTable->copyFrom(*cpuTable);
-
-  IVectorPtr cpuIds;
-  IVectorPtr gpuIds;
-  cpuIds = VectorT<int>::create(numSamples, false);
-  gpuIds = VectorT<int>::create(numSamples, true);
-  cpuIds->rand(tableSize);
-  gpuIds->copyFrom(*cpuIds);
-
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, inputDim);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  cpuOutput->addToRows(*cpuTable, *cpuIds);
-  gpuOutput->addToRows(*gpuTable, *gpuIds);
-
-  TensorCheckErr(*cpuTable, *gpuTable);
-}
-
-TEST(Matrix, tableProjection) {
-  for (auto numSamples : {10, 100, 1000, 10000, 80000}) {
-    for (auto tableSize : {10, 100}) {
-      for (auto inputDim : {20, 50}) {
-        VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize
-                << " inputDim=" << inputDim;
-        testMatrixAddToRows(numSamples, tableSize, inputDim);
-      }
-    }
-  }
-}
-
-void testMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
-  int heightA = transa == false ? dimM : dimK;
-  int widthA = transa == false ? dimK : dimM;
-  int heightB = transb == false ? dimK : dimN;
-  int widthB = transb == false ? dimN : dimK;
-  int heightC = dimM;
-  int widthC = dimN;
-
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
-
-  real alpha = 1.5;
-  real beta = 2.0;
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  cpuC->mul(*cpuA, *cpuB, alpha, beta);
-  gpuC->mul(*gpuA, *gpuB, alpha, beta);
-
-  TensorCheckErr(*cpuC, *gpuC);
-}
-
-void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) {
-  int heightA = transa == false ? dimM : dimK;
-  int widthA = transa == false ? dimK : dimM;
-  int heightB = transb == false ? dimK : dimN;
-  int widthB = transb == false ? dimN : dimK;
-  int heightC = dimM;
-  int widthC = dimN;
-
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(heightA, widthA, transa);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(heightB, widthB, transb);
-  MatrixPtr cpuC = std::make_shared<CpuMatrix>(heightC, widthC);
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(heightA, widthA, transa);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(heightB, widthB, transb);
-  MatrixPtr gpuC = std::make_shared<GpuMatrix>(heightC, widthC);
-
-  real alpha = 1.5;
-  real beta = 2.0;
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-  gpuA->copyFrom(*cpuA);
-  gpuB->copyFrom(*cpuB);
-  gpuC->copyFrom(*cpuC);
-
-  auto subSize = [](int& start, int& end, int dim) {
-    if (dim == 1) {
-      start = 0;
-      end = dim;
-    } else {
-      int subDim = rand() % (dim - 1) + 1;  // NOLINT
-      start = rand() % (dim - subDim);      // NOLINT
-      end = start + subDim;
-    }
-  };
-
-  auto subMatrix = [](MatrixPtr& sub,
-                      MatrixPtr matrix,
-                      size_t startRow,
-                      size_t endRow,
-                      size_t startCol,
-                      size_t endCol) {
-    if (!matrix->isTransposed()) {
-      sub = matrix->subMatrix(startRow, endRow, startCol, endCol);
-    } else {
-      sub = matrix->subMatrix(startCol, endCol, startRow, endRow);
-    }
-  };
-
-  int startM, endM;
-  int startN, endN;
-  int startK, endK;
-  subSize(startM, endM, dimM);
-  subSize(startN, endN, dimN);
-  subSize(startK, endK, dimK);
-
-  MatrixPtr subCpuA;
-  MatrixPtr subCpuB;
-  MatrixPtr subGpuA;
-  MatrixPtr subGpuB;
-  subMatrix(subCpuA, cpuA, startM, endM, startK, endK);
-  subMatrix(subGpuA, gpuA, startM, endM, startK, endK);
-  subMatrix(subCpuB, cpuB, startK, endK, startN, endN);
-  subMatrix(subGpuB, gpuB, startK, endK, startN, endN);
-  MatrixPtr subCpuC = cpuC->subMatrix(startM, endM, startN, endN);
-  MatrixPtr subGpuC = gpuC->subMatrix(startM, endM, startN, endN);
-
-  subCpuC->mul(*subCpuA, *subCpuB, alpha, beta);
-  subGpuC->mul(*subGpuA, *subGpuB, alpha, beta);
-
-  TensorCheckErr(*cpuC, *gpuC);
-}
-
-TEST(Matrix, mul) {
-  for (auto transa : {false, true}) {
-    for (auto transb : {false, true}) {
-      for (auto dimM : {1, 9, 53, 127, 345, 1023, 2135}) {
-        for (auto dimN : {1, 5, 37, 256, 1024}) {
-          for (auto dimK : {8, 45, 346, 784, 1025}) {
-            if (true == transa && true == transb) {
-              continue;
-            }
-            VLOG(3) << setiosflags(ios::left) << setfill(' ')
-                    << " transa=" << transa << " transb=" << transb
-                    << " dimM=" << setw(5) << dimM << " dimN=" << setw(5)
-                    << dimN << " dimK=" << setw(5) << dimK;
-
-            testMatrixMul(transa, transb, dimM, dimN, dimK);
-            testSubMatrixMul(transa, transb, dimM, dimN, dimK);
-          }
-        }
-      }
-    }
-  }
-}
-
-void testVectorRowFunc(int size) {
-  CpuVectorPtr cpu = std::make_shared<CpuVectorT<real>>(size);
-  GpuVectorPtr gpu = std::make_shared<GpuVectorT<real>>(size);
-
-  cpu->rand();
-  gpu->copyFrom(*cpu);
-
-  EXPECT_EQ(cpu->getMax(), gpu->getMax());
-  EXPECT_EQ(cpu->getMin(), gpu->getMin());
-  EXPECT_EQ(cpu->getAbsMax(), gpu->getAbsMax());
-}
-
-TEST(Vector, rowFunc) {
-  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
-    VLOG(3) << " size=" << size;
-    testVectorRowFunc(size);
-  }
-}
-
-template <class T>
-void testVectorReset(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
-
-  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100));
-  cpu->reset(value);
-  gpu->reset(value);
-
-  TensorCheckEqual(*cpu, *gpu);
-}
-
-template <class T>
-void testVecortSelectFrom(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpuDst = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpuDst = std::make_shared<GpuVectorT<T>>(size);
-  std::shared_ptr<CpuVectorT<T>> cpuSrc =
-      std::make_shared<CpuVectorT<T>>(size * 2);
-  std::shared_ptr<GpuVectorT<T>> gpuSrc =
-      std::make_shared<GpuVectorT<T>>(size * 2);
-  CpuIVectorPtr cpuIds = std::make_shared<CpuVectorT<int>>(size);
-  GpuIVectorPtr gpuIds = std::make_shared<GpuVectorT<int>>(size);
-
-  if (std::is_same<T, real>::value) {
-    cpuSrc->rand();
-  } else {
-    cpuSrc->rand(100000);
-  }
-  gpuSrc->copyFrom(*cpuSrc);
-  cpuIds->rand(size);
-  gpuIds->copyFrom(*cpuIds);
-
-  cpuDst->selectFrom(*cpuSrc, *cpuIds);
-  gpuDst->selectFrom(*gpuSrc, *gpuIds);
-
-  TensorCheckEqual(*cpuDst, *gpuDst);
-}
-
-template <class T>
-void testVecotrZeroMem(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpu = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpu = std::make_shared<GpuVectorT<T>>(size);
-
-  cpu->zeroMem();
-  gpu->zeroMem();
-
-  TensorCheckEqual(*cpu, *gpu);
-}
-
-template <class T>
-void testVectorIsEqual(int size) {
-  std::shared_ptr<CpuVectorT<T>> cpuA = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<CpuVectorT<T>> cpuB = std::make_shared<CpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpuA = std::make_shared<GpuVectorT<T>>(size);
-  std::shared_ptr<GpuVectorT<T>> gpuB = std::make_shared<GpuVectorT<T>>(size);
-
-  if (std::is_same<T, real>::value) {
-    cpuB->rand();
-  } else {
-    cpuB->rand(100000);
-  }
-  gpuB->copyFrom(*cpuB);
-
-  T value = (T)((int)rand() % 100 + 1.0f / ((int)rand() % 100));
-  cpuA->isEqualTo(*cpuB, value);
-  gpuA->isEqualTo(*gpuB, value);
-
-  TensorCheckEqual(*cpuA, *gpuA);
-}
-
-TEST(Vector, Equal) {
-  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
-    VLOG(3) << " size=" << size;
-    testVectorReset<int>(size);
-    testVectorReset<real>(size);
-    testVecortSelectFrom<int>(size);
-    testVecortSelectFrom<real>(size);
-    testVecotrZeroMem<int>(size);
-    testVecotrZeroMem<real>(size);
-    testVectorIsEqual<int>(size);
-    testVectorIsEqual<real>(size);
-  }
-}
-
-void testMatrixTopK(int samples, int dim, int beamSize) {
-  MatrixPtr cpuSrc = std::make_shared<CpuMatrix>(samples, dim);
-  MatrixPtr gpuSrc = std::make_shared<GpuMatrix>(samples, dim);
-  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
-  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
-  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
-
-  cpuSrc->randomizeUniform();
-  gpuSrc->copyFrom(*cpuSrc);
-
-  cpuSrc->rowMax(*cpuIds, *cpuVal);
-  gpuSrc->rowMax(*gpuIds, *gpuVal);
-
-  TensorCheckEqual(*cpuVal, *gpuVal);
-}
-
-TEST(Matrix, topK) {
-  for (auto samples : {1, 17, 131}) {  // prime numbers close to 1, 16, 127
-    for (auto dim : {1, 3, 997}) {     // prime numbers close to 1, 4, 1024
-      for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
-        if (beamSize > dim) continue;
-        VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
-                << " dim=" << dim;
-        testMatrixTopK(samples, dim, beamSize);
-      }
-    }
-  }
-}
-
-void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
-  int nnz = samples * dim * ratio;
-  if (nnz < 1) nnz = 1;  // Because sparseRand in MathUtil.cpp requires this.
-  MatrixPtr cpuSrc = std::make_shared<CpuSparseMatrix>(samples, dim, nnz);
-  MatrixPtr gpuSrc = std::make_shared<GpuSparseMatrix>(samples, dim, nnz);
-  MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
-  MatrixPtr gpuVal = std::make_shared<GpuMatrix>(samples, beamSize);
-  IVectorPtr cpuIds = std::make_shared<CpuIVector>(samples * beamSize);
-  IVectorPtr gpuIds = std::make_shared<GpuIVector>(samples * beamSize);
-
-  cpuSrc->randomizeUniform();
-  gpuSrc->copyFrom(*cpuSrc);
-  cpuVal->zero();
-  cpuIds->zero();
-  gpuVal->zero();
-  gpuIds->zero();
-
-  cpuSrc->rowMax(*cpuIds, *cpuVal);
-  gpuSrc->rowMax(*gpuIds, *gpuVal);
-
-  TensorCheckEqual(*cpuVal, *gpuVal);
-
-  IVectorPtr outCheckIds = std::make_shared<CpuIVector>(samples * beamSize);
-  outCheckIds->copyFrom(*gpuIds);
-
-  const int* data1 = cpuIds->getData();
-  const int* data2 = outCheckIds->getData();
-  size_t size = cpuIds->getSize();
-  for (size_t i = 0; i < size; i++) {
-    if (data1[i] == -1 && data1[i] != data2[i]) {
-      EXPECT_EQ(data1[i], data2[i]);
-    }
-  }
-}
-
-TEST(SMatrix, topK) {
-  for (auto samples : {1, 3, 61}) {
-    for (auto dim : {1, 3, 61}) {
-      for (auto beamSize : {1, 3, 61}) {
-        for (auto ratio : {0.01, 0.001}) {
-          if (beamSize > dim) continue;
-          VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
-                  << " dim=" << dim << " ratio=" << ratio;
-          testSMatrixTopK(samples, dim, beamSize, ratio);
-        }
-      }
-    }
-  }
-}
-
-void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  int newBatchSize = cpuSequence->getSize() - 1;
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(newBatchSize, inputDim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(newBatchSize, inputDim);
-  cpuOutput->zero();
-  gpuOutput->zero();
-
-  cpuOutput->sequenceAvgForward(*cpuInput, *cpuSequence, mode);
-  gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
-
-  TensorCheckErr(*cpuOutput, *gpuOutput);
-
-  MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInGrad->randomizeUniform();
-  gpuInGrad->copyFrom(*cpuInGrad);
-
-  cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
-  gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
-
-  TensorCheckErr(*cpuInGrad, *gpuInGrad);
-}
-
-TEST(Matrix, sequenceAvg) {
-  for (auto batchSize : {10, 128, 6000}) {
-    for (auto inputDim : {32, 100, 512}) {
-      for (auto mode : {0, 1, 2}) {
-        VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
-                << " mode=" << mode;
-        testMatrixSequenceAvg(batchSize, inputDim, mode);
-      }
-    }
-  }
-}
-
-void testParamReluBackwardDiff(int height,
-                               int width,
-                               int w_height,
-                               int w_width) {
-  MatrixPtr oGrad = CpuMatrix::create(height, width, false, false);
-  MatrixPtr input = CpuMatrix::create(height, width, false, false);
-  MatrixPtr diff = CpuMatrix::create(height, width, false, false);
-  MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false);
-
-  oGrad->randomizeUniform();
-  input->randomizeUniform();
-  w->randomizeUniform();
-  diff->randomizeUniform();
-  input->add(-0.5);
-
-  MatrixPtr oGradGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr inputGpu = GpuMatrix::create(height, width, false, true);
-  MatrixPtr diffGpu = CpuMatrix::create(height, width, false, true);
-  MatrixPtr wGpu = GpuMatrix::create(w_height, w_width, false, true);
-
-  oGradGpu->copyFrom(*oGrad);
-  inputGpu->copyFrom(*input);
-  wGpu->copyFrom(*w);
-  diffGpu->copyFrom(*diff);
-
-  diff->paramReluBackwardDiff(*oGrad, *input, *w);
-  diffGpu->paramReluBackwardDiff(*oGradGpu, *inputGpu, *wGpu);
-
-  TensorCheckErr(*diff, *diffGpu);
-}
-
-TEST(Matrix, paramReluBackwardDiff) {
-  for (auto height : {10, 40, 100}) {
-    for (auto width : {10, 40, 100}) {
-      for (auto w_height : {1, 2}) {
-        for (auto w_width : {1, 2}) {
-          if (width % (w_height * w_width)) continue;
-          testParamReluBackwardDiff(height, width, w_height, w_width);
-        }
-      }
-    }
-  }
-}
-
-void testClassificationError(int numSamples, int dim, int topkSize) {
-  MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
-  MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
-  MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
-  MatrixPtr gpuOutput = std::make_shared<GpuMatrix>(numSamples, dim);
-  IVectorPtr cpuLabel = std::make_shared<CpuIVector>(numSamples);
-  IVectorPtr gpuLabel = std::make_shared<GpuIVector>(numSamples);
-
-  cpuOutput->randomizeUniform();
-  cpuLabel->rand(dim);
-  gpuOutput->copyFrom(*cpuOutput);
-  gpuLabel->copyFrom(*cpuLabel);
-
-  cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize);
-  gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize);
-
-  TensorCheckEqual(*cpuError, *gpuError);
-}
-
-TEST(Matrix, classificationError) {
-  for (auto numSamples : {1, 3, 31}) {
-    for (auto dim : {1, 3, 31}) {
-      for (auto topkSize : {1, 3, (int)rand() % dim + 1}) {
-        if (topkSize > dim) continue;
-        VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
-                << " dim= " << dim;
-        testClassificationError(numSamples, dim, topkSize);
-      }
-    }
-  }
-}
-
-void testMaxPoolFwdBwd(int numSamples,
-                       int channels,
-                       int imgSizeH,
-                       int imgSizeW,
-                       int ksizeH,
-                       int ksizeW,
-                       int strideH,
-                       int strideW,
-                       int padH,
-                       int padW) {
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = imgSizeH * imgSizeW * channels;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->maxPoolForward(*input,
-                         imgSizeH,
-                         imgSizeW,
-                         channels,
-                         ksizeW,
-                         ksizeH,
-                         strideH,
-                         strideW,
-                         outH,
-                         outW,
-                         padH,
-                         padW);
-  targetGpu->maxPoolForward(*inputGpu,
-                            imgSizeH,
-                            imgSizeW,
-                            channels,
-                            ksizeW,
-                            ksizeH,
-                            strideH,
-                            strideW,
-                            outH,
-                            outW,
-                            padH,
-                            padW);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-  targetCheck->copyFrom(*targetGpu);
-  checkMatrixEqual(target, targetCheck);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->maxPoolBackward(*input,
-                             imgSizeH,
-                             imgSizeW,
-                             *targetGrad,
-                             *target,
-                             ksizeW,
-                             ksizeH,
-                             strideH,
-                             strideW,
-                             outH,
-                             outW,
-                             1.0,
-                             1.0,
-                             padH,
-                             padW);
-  inputGpuGrad->maxPoolBackward(*inputGpu,
-                                imgSizeH,
-                                imgSizeW,
-                                *targetGpuGrad,
-                                *targetGpu,
-                                ksizeW,
-                                ksizeH,
-                                strideH,
-                                strideW,
-                                outH,
-                                outW,
-                                1.0,
-                                1.0,
-                                padH,
-                                padW);
-  MatrixPtr targetBwdCheck =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-  targetBwdCheck->copyFrom(*inputGpuGrad);
-  checkMatrixEqual(inputGrad, targetBwdCheck);
-}
-
-void testAvgPoolFwdBwd(int numSamples,
-                       int channels,
-                       int imgSizeH,
-                       int imgSizeW,
-                       int ksizeH,
-                       int ksizeW,
-                       int strideH,
-                       int strideW,
-                       int padH,
-                       int padW) {
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = imgSizeH * imgSizeW * channels;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->avgPoolForward(*input,
-                         imgSizeH,
-                         imgSizeW,
-                         channels,
-                         ksizeW,
-                         ksizeH,
-                         strideH,
-                         strideW,
-                         outH,
-                         outW,
-                         padH,
-                         padW);
-  targetGpu->avgPoolForward(*inputGpu,
-                            imgSizeH,
-                            imgSizeW,
-                            channels,
-                            ksizeW,
-                            ksizeH,
-                            strideH,
-                            strideW,
-                            outH,
-                            outW,
-                            padH,
-                            padW);
-
-  TensorCheckErr(*target, *targetGpu);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->avgPoolBackward(*targetGrad,
-                             imgSizeH,
-                             imgSizeW,
-                             ksizeW,
-                             ksizeH,
-                             strideH,
-                             strideW,
-                             outH,
-                             outW,
-                             1.0,
-                             1.0,
-                             padH,
-                             padW);
-  inputGpuGrad->avgPoolBackward(*targetGpuGrad,
-                                imgSizeH,
-                                imgSizeW,
-                                ksizeW,
-                                ksizeH,
-                                strideH,
-                                strideW,
-                                outH,
-                                outW,
-                                1.0,
-                                1.0,
-                                padH,
-                                padW);
-
-  TensorCheckErr(*inputGrad, *inputGpuGrad);
-}
-
-// TODO(yi): I noticed many such blindly combinatorial tests in this
-// file.  They are no help to locate defects at all.
-TEST(Matrix, PoolFwdBwd) {
-  for (auto numSamples : {1, 3}) {
-    for (auto channels : {1, 3}) {
-      for (auto imgSizeH : {13, 17}) {
-        for (auto imgSizeW : {17, 19}) {
-          for (auto sizeX : {2, 3}) {
-            for (auto sizeY : {2, 3}) {
-              for (auto sH : {1, 2}) {
-                for (auto sW : {1, 2}) {
-                  for (auto pH : {0, (sizeY - 1) / 2}) {
-                    for (auto pW : {0, (sizeX - 1) / 2}) {
-                      VLOG(3) << " numSamples=" << numSamples
-                              << " channels=" << channels
-                              << " imgSizeH=" << imgSizeH
-                              << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX
-                              << " sizeY=" << sizeY << " strideH=" << sH
-                              << " strideW=" << sW << " padingH=" << pH
-                              << " padingW=" << pW;
-                      testMaxPoolFwdBwd(numSamples,
-                                        channels,
-                                        imgSizeH,
-                                        imgSizeW,
-                                        sizeX,
-                                        sizeY,
-                                        sH,
-                                        sW,
-                                        pH,
-                                        pW);
-                      testAvgPoolFwdBwd(numSamples,
-                                        channels,
-                                        imgSizeH,
-                                        imgSizeW,
-                                        sizeX,
-                                        sizeY,
-                                        sH,
-                                        sW,
-                                        pH,
-                                        pW);
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-void testMaxOutFwdBwd(
-    int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) {
-  int inWidth = imgSizeH * imgSizeW * channels;
-  int outChannels = channels / groups;
-  int outWidth = imgSizeH * imgSizeW * outChannels;
-
-  // forward
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
-  IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
-
-  input->randomizeUniform();
-  inputGpu->copyFrom(*input);
-
-  target->maxoutForward(*input, *id, outChannels, groups);
-  targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
-
-  TensorCheckErr(*target, *targetGpu);
-  TensorCheckEqual(*id, *idGpu);
-
-  // backward
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
-  inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
-
-  TensorCheckErr(*inputGrad, *inputGpuGrad);
-}
-
-TEST(Matrix, MaxOutFwdBwd) {
-  for (auto numSamples : {5, 10}) {
-    for (auto channels : {8, 16}) {
-      for (auto imgSizeH : {14, 28}) {
-        for (auto imgSizeW : {16, 30}) {
-          for (auto groups : {2, 4}) {
-            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
-                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
-                    << " groups=" << groups;
-            testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST(CpuMatrix, copyFrom) {
-  const size_t height = 31;
-  const size_t width = 53;
-  CpuMatrix cpu(height, width);
-  GpuMatrix gpu(height, width);
-  CpuMatrix copy(height, width);
-
-  cpu.randomizeUniform();
-  gpu.copyFrom(cpu);
-  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
-
-  TensorCheckEqual(cpu, copy);
-}
-
-void testBatch2seqPadding(int batchSize, int inputDim) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  for (int i = 0; i < int(cpuSequence->getSize()); ++i) {
-    (cpuSequence->getData())[i] += 1;  // so no way that maxSeqLen is 0;
-  }
-
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  size_t numSeq = cpuSequence->getSize() - 1;
-  size_t maxSeqLen = *std::max_element(cpuSequence->getData(),
-                                       cpuSequence->getData() + numSeq);
-
-  printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen);
-  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
-  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
-  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
-
-  // hl_sequence2batch_copy_padding(gBatch->getData(),
-  //                                gpuInput->getData(),
-  //                                cpuSequence->getData(),
-  //                                inputDim,
-  //                                maxSeqLen,
-  //                                numSeq,
-  //                                false,
-  //                                true);
-  // cCheck->copyFrom(*gBatch);
-
-  // int* seqStart = cpuSequence->getData();
-  // float* batchData = cBatch->getData();
-  // float* seqData = cpuInput->getData();
-  // for (size_t i = 0; i < maxSeqLen; i++) {
-  //   for (size_t j = 0; j < numSeq; j++) {
-  //     size_t sequenceStart = seqStart[j];
-  //     size_t sequenceLength = seqStart[j + 1] - seqStart[j];
-  //     if (i < sequenceLength) {
-  //       memcpy(batchData + (i * numSeq + j) * inputDim,
-  //              seqData + (sequenceStart + i) * inputDim,
-  //              inputDim * sizeof(real));
-  //     } else {
-  //       memset(batchData + (i * numSeq + j) * inputDim,
-  //              0,
-  //              inputDim * sizeof(real));
-  //     }
-  //   }
-  // }
-
-  // TensorCheckErr(*cBatch, *cCheck);
-}
-
-TEST(Matrix, warpCTC) {
-  for (auto batchSize : {1, 3, 17}) {
-    for (auto inputDim : {1, 3, 31}) {
-      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
-      testBatch2seqPadding(batchSize, inputDim);
-    }
-  }
-}
-
-void testMaxPool3DFwdBwd(int numSamples,
-                         int channels,
-                         int imgSizeD,
-                         int imgSizeH,
-                         int imgSizeW,
-                         int ksizeD,
-                         int ksizeH,
-                         int ksizeW,
-                         int strideD,
-                         int strideH,
-                         int strideW,
-                         int padD,
-                         int padH,
-                         int padW) {
-  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = channels * imgSizeD * imgSizeH * imgSizeW;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outD * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-  MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->maxPool3DForward(*input,
-                           *maxIdx,
-                           channels,
-                           imgSizeD,
-                           imgSizeH,
-                           imgSizeW,
-                           outD,
-                           outH,
-                           outW,
-                           ksizeD,
-                           ksizeH,
-                           ksizeW,
-                           strideD,
-                           strideH,
-                           strideW,
-                           padD,
-                           padH,
-                           padW);
-  targetGpu->maxPool3DForward(*inputGpu,
-                              *maxIdxGpu,
-                              channels,
-                              imgSizeD,
-                              imgSizeH,
-                              imgSizeW,
-                              outD,
-                              outH,
-                              outW,
-                              ksizeD,
-                              ksizeH,
-                              ksizeW,
-                              strideD,
-                              strideH,
-                              strideW,
-                              padD,
-                              padH,
-                              padW);
-  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
-  targetCheck->copyFrom(*targetGpu);
-  checkMatrixEqual(target, targetCheck);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->maxPool3DBackward(*targetGrad,
-                               *maxIdx,
-                               imgSizeD,
-                               imgSizeH,
-                               imgSizeW,
-                               outD,
-                               outH,
-                               outW,
-                               ksizeD,
-                               ksizeH,
-                               ksizeW,
-                               strideD,
-                               strideH,
-                               strideW,
-                               padD,
-                               padH,
-                               padW,
-                               1.0,
-                               1.0);
-  inputGpuGrad->maxPool3DBackward(*targetGpuGrad,
-                                  *maxIdxGpu,
-                                  imgSizeD,
-                                  imgSizeH,
-                                  imgSizeW,
-                                  outD,
-                                  outH,
-                                  outW,
-                                  ksizeD,
-                                  ksizeH,
-                                  ksizeW,
-                                  strideD,
-                                  strideH,
-                                  strideW,
-                                  padD,
-                                  padH,
-                                  padW,
-                                  1.0,
-                                  1.0);
-  MatrixPtr targetBwdCheck =
-      CpuMatrix::create(numSamples, inWidth, false, false);
-  targetBwdCheck->copyFrom(*inputGpuGrad);
-  checkMatrixEqual(inputGrad, targetBwdCheck);
-}
-
-void testAvgPool3DFwdBwd(int numSamples,
-                         int channels,
-                         int imgSizeD,
-                         int imgSizeH,
-                         int imgSizeW,
-                         int ksizeD,
-                         int ksizeH,
-                         int ksizeW,
-                         int strideD,
-                         int strideH,
-                         int strideW,
-                         int padD,
-                         int padH,
-                         int padW) {
-  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
-  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
-  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
-
-  int inWidth = imgSizeD * imgSizeH * imgSizeW * channels;
-  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
-
-  int outWidth = channels * outD * outH * outW;
-  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
-
-  input->randomizeUniform();
-  target->randomizeUniform();
-  inputGpu->copyFrom(*input);
-  targetGpu->copyFrom(*target);
-
-  target->avgPool3DForward(*input,
-                           channels,
-                           imgSizeD,
-                           imgSizeH,
-                           imgSizeW,
-                           outD,
-                           outH,
-                           outW,
-                           ksizeD,
-                           ksizeH,
-                           ksizeW,
-                           strideD,
-                           strideH,
-                           strideW,
-                           padD,
-                           padH,
-                           padW);
-
-  targetGpu->avgPool3DForward(*inputGpu,
-                              channels,
-                              imgSizeD,
-                              imgSizeH,
-                              imgSizeW,
-                              outD,
-                              outH,
-                              outW,
-                              ksizeD,
-                              ksizeH,
-                              ksizeW,
-                              strideD,
-                              strideH,
-                              strideW,
-                              padD,
-                              padH,
-                              padW);
-
-  TensorCheckErr(*target, *targetGpu);
-
-  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
-  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
-  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad =
-      GpuMatrix::create(numSamples, outWidth, false, true);
-
-  inputGrad->randomizeUniform();
-  targetGrad->randomizeUniform();
-  inputGpuGrad->copyFrom(*inputGrad);
-  targetGpuGrad->copyFrom(*targetGrad);
-
-  inputGrad->avgPool3DBackward(*targetGrad,
-                               imgSizeD,
-                               imgSizeH,
-                               imgSizeW,
-                               outD,
-                               outH,
-                               outW,
-                               ksizeD,
-                               ksizeH,
-                               ksizeW,
-                               strideD,
-                               strideH,
-                               strideW,
-                               padD,
-                               padH,
-                               padW,
-                               1.0,
-                               1.0);
-
-  inputGpuGrad->avgPool3DBackward(*targetGpuGrad,
-                                  imgSizeD,
-                                  imgSizeH,
-                                  imgSizeW,
-                                  outD,
-                                  outH,
-                                  outW,
-                                  ksizeD,
-                                  ksizeH,
-                                  ksizeW,
-                                  strideD,
-                                  strideH,
-                                  strideW,
-                                  padD,
-                                  padH,
-                                  padW,
-                                  1.0,
-                                  1.0);
-  TensorCheckErr(*inputGrad, *inputGpuGrad);
-}
-
-// TODO(yi): I noticed many such blindly combinatorial tests in this
-// file.  They are no help to locate defects at all.
-TEST(Matrix, Pool3DFwdBwd) {
-  for (auto numSamples : {1, 3}) {
-    for (auto channels : {3}) {
-      for (auto imgSizeD : {9, 16}) {
-        for (auto imgSizeH : {9, 32}) {
-          for (auto imgSizeW : {9, 32}) {
-            for (auto sizeX : {3}) {
-              for (auto sizeY : {3}) {
-                for (auto sizeZ : {3}) {
-                  for (auto sD : {2}) {
-                    for (auto sH : {2}) {
-                      for (auto sW : {2}) {
-                        for (auto pD : {0, (sizeZ - 1) / 2}) {
-                          for (auto pH : {0, (sizeY - 1) / 2}) {
-                            for (auto pW : {0, (sizeX - 1) / 2}) {
-                              VLOG(3) << " numSamples=" << numSamples
-                                      << " channels=" << channels
-                                      << " imgSizeD=" << imgSizeD
-                                      << " imgSizeH=" << imgSizeH
-                                      << " imgSizeW=" << imgSizeW
-                                      << " sizeX=" << sizeX
-                                      << " sizeY=" << sizeY
-                                      << " sizeZ=" << sizeZ << " strideD=" << sD
-                                      << " strideH=" << sH << " strideW=" << sW
-                                      << " padingD=" << pD << " padingH=" << pH
-                                      << " padingW=" << pW;
-
-                              testMaxPool3DFwdBwd(numSamples,
-                                                  channels,
-                                                  imgSizeD,
-                                                  imgSizeH,
-                                                  imgSizeW,
-                                                  sizeX,
-                                                  sizeY,
-                                                  sizeZ,
-                                                  sD,
-                                                  sH,
-                                                  sW,
-                                                  pD,
-                                                  pH,
-                                                  pW);
-                              testAvgPool3DFwdBwd(numSamples,
-                                                  channels,
-                                                  imgSizeD,
-                                                  imgSizeH,
-                                                  imgSizeW,
-                                                  sizeX,
-                                                  sizeY,
-                                                  sizeZ,
-                                                  sD,
-                                                  sH,
-                                                  sW,
-                                                  pD,
-                                                  pH,
-                                                  pW);
-                            }
-                          }
-                        }
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  //  for (auto numSamples : {1, 3}) {
-  //    for (auto channels : {1, 3}) {
-  //      for (auto imgSizeD : {9,16}) {
-  //      for (auto imgSizeH : {9, 32}) {
-  //        for (auto imgSizeW : {9, 32}) {
-  //          for (auto sizeX : {2, 3}) {
-  //            for (auto sizeY : {2, 3}) {
-  //            for (auto sizeZ : {2,3}){
-  //              for (auto sD : {1, 2}) {
-  //              for (auto sH : {1, 2}) {
-  //                for (auto sW : {1, 2}) {
-  //                  for (auto pD : {0, (sizeZ - 1) / 2}){
-  //                  for (auto pH : {0, (sizeY - 1) / 2}) {
-  //                    for (auto pW : {0, (sizeX - 1) / 2}) {
-  //                      VLOG(3) << " numSamples=" << numSamples
-  //                              << " channels=" << channels
-  //                              << " imgSizeD=" << imgSizeD
-  //                              << " imgSizeH=" << imgSizeH
-  //                              << " imgSizeW=" << imgSizeW
-  //                              << " sizeX=" << sizeX
-  //                              << " sizeY=" << sizeY
-  //                              << " sizeZ=" << sizeZ
-  //                              << " strideD=" << sD
-  //                              << " strideH=" << sH
-  //                              << " strideW=" << sW
-  //                              << " padingD=" << pD
-  //                              << " padingH=" << pH
-  //                              << " padingW=" << pW;
-  //
-  //                      testMaxPool3DFwdBwd(numSamples,
-  //                                        channels,
-  //                                        imgSizeD,
-  //                                        imgSizeH,
-  //                                        imgSizeW,
-  //                                        sizeX,
-  //                                        sizeY,
-  //                                        sizeZ,
-  //                                        sD,
-  //                                        sH,
-  //                                        sW,
-  //                                        pD,
-  //                                        pH,
-  //                                        pW);
-  //                      testAvgPool3DFwdBwd(numSamples,
-  //                                        channels,
-  //                                        imgSizeD,
-  //                                        imgSizeH,
-  //                                        imgSizeW,
-  //                                        sizeX,
-  //                                        sizeY,
-  //                                        sizeZ,
-  //                                        sD,
-  //                                        sH,
-  //                                        sW,
-  //                                        pD,
-  //                                        pH,
-  //                                        pW);
-  //                    }
-  //                  }
-  //                }
-  //              }
-  //            }
-  //            }
-  //          }
-  //        }
-  //      }
-  //      }
-  //    }
-  //    }
-  //  }
-  //  }
-}
-
-void testMatrixCol2Vol(int depth, int height, int width) {
-  int channel = 3;
-  int filterX = 3, filterY = 4, filterZ = 5;
-  int strideX = 2, strideY = 2, strideZ = 2;
-  int padX = 1, padY = 1, padZ = 1;
-
-  MatrixPtr cpuImage =
-      std::make_shared<CpuMatrix>(channel, depth * height * width);
-  MatrixPtr gpuImage =
-      std::make_shared<GpuMatrix>(channel, depth * height * width);
-  cpuImage->randomizeUniform();
-  gpuImage->copyFrom(*cpuImage);
-
-  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
-  int outH = outputSize(height, filterY, padY, strideY, true);
-  int outW = outputSize(width, filterX, padX, strideX, true);
-
-  int colBufHeight = channel * filterZ * filterY * filterX;
-  int colBufWidth = outD * outH * outW;
-  MatrixPtr cpuColBuf = std::make_shared<CpuMatrix>(colBufHeight, colBufWidth);
-  MatrixPtr gpuColBuf = std::make_shared<GpuMatrix>(colBufHeight, colBufWidth);
-  cpuColBuf->vol2Col(cpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX);
-  gpuColBuf->vol2Col(gpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX);
-  TensorCheckEqual(*cpuColBuf, *gpuColBuf);
-
-  cpuColBuf->randomizeUniform();
-  gpuColBuf->copyFrom(*cpuColBuf);
-  cpuColBuf->col2Vol(cpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX,
-                     1.0,
-                     1.0);
-  gpuColBuf->col2Vol(gpuImage->getData(),
-                     channel,
-                     depth,
-                     height,
-                     width,
-                     filterZ,
-                     filterY,
-                     filterX,
-                     strideZ,
-                     strideY,
-                     strideX,
-                     padZ,
-                     padY,
-                     padX,
-                     1.0,
-                     1.0);
-  TensorCheckErr(*cpuImage, *gpuImage);
-}
-
-TEST(Matrix, col2Vol) {
-  for (auto depth : {9, 16, 64}) {
-    for (auto height : {9, 11, 128}) {
-      for (auto width : {9, 32, 128}) {
-        VLOG(3) << "depth=" << depth << " height=" << height
-                << " width=" << width;
-        testMatrixCol2Vol(depth, height, width);
-      }
-    }
-  }
-}
-
-#endif
diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h
deleted file mode 100644
index 86297547dcd83ca87d1c87a8489f7af2f3e9f492..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_matrixUtil.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <gtest/gtest.h>
-#include <paddle/utils/Util.h>
-#include "paddle/math/SparseMatrix.h"
-
-namespace paddle {
-
-void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  for (size_t r = 0; r < a->getHeight(); ++r) {
-    for (size_t c = 0; c < a->getWidth(); ++c) {
-      ASSERT_FLOAT_EQ(a->getElement(r, c), b->getElement(r, c));
-    }
-  }
-}
-
-void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) {
-  ASSERT_EQ(a.getWidth(), b.getWidth());
-  ASSERT_EQ(a.getHeight(), b.getHeight());
-  ASSERT_EQ(a.isTransposed(), b.isTransposed());
-  ASSERT_EQ(a.getFormat(), b.getFormat());
-  ASSERT_EQ(a.getElementCnt(), b.getElementCnt());
-  for (size_t r = 0; r < a.getElementCnt(); ++r) {
-    ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]);
-  }
-}
-
-void checkSMatrixEqual(const CpuSparseMatrixPtr& a,
-                       const CpuSparseMatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  ASSERT_EQ(a->getFormat(), b->getFormat());
-  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
-  for (size_t r = 0; r < a->getElementCnt(); ++r) {
-    ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
-  }
-}
-
-void checkSMatrixEqual2(const CpuSparseMatrixPtr& a,
-                        const CpuSparseMatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  ASSERT_EQ(a->getFormat(), b->getFormat());
-  ASSERT_EQ(a->getValueType(), b->getValueType());
-  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
-  if (a->getFormat() == SPARSE_CSR) {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
-      }
-    }
-    for (size_t r = 0; r <= a->getHeight(); r++) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-    }
-  } else {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        ASSERT_FLOAT_EQ(a->getValue()[r], b->getValue()[r]);
-      }
-    }
-    for (size_t r = 0; r <= a->getWidth(); r++) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-    }
-  }
-}
-
-void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) {
-  ASSERT_EQ(a.getWidth(), b.getWidth());
-  ASSERT_EQ(a.getHeight(), b.getHeight());
-  ASSERT_EQ(a.isTransposed(), b.isTransposed());
-
-  if (a.getFormat() == SPARSE_CSC) {
-    int* rows = a.getRows();
-    for (size_t i = 0; i < a.getWidth(); i++) {
-      for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) {
-        if (a.getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i));
-        }
-      }
-    }
-  } else {
-    int* cols = a.getCols();
-    for (size_t i = 0; i < a.getHeight(); i++) {
-      for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) {
-        if (a.getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j]));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j]));
-        }
-      }
-    }
-  }
-}
-
-void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a,
-                             const CpuMatrixPtr& b) {
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-
-  if (a->getFormat() == SPARSE_CSC) {
-    int* rows = a->getRows();
-    for (size_t i = 0; i < a->getWidth(); i++) {
-      for (size_t j = a->getColStartIdx(i); j < a->getColStartIdx(i + 1); j++) {
-        if (a->getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(rows[j], i));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b->getElement(rows[j], i));
-        }
-      }
-    }
-  } else {
-    int* cols = a->getCols();
-    for (size_t i = 0; i < a->getHeight(); i++) {
-      for (size_t j = a->getRowStartIdx(i); j < a->getRowStartIdx(i + 1); j++) {
-        if (a->getValueType() == FLOAT_VALUE) {
-          ASSERT_FLOAT_EQ(a->getValue()[j], b->getElement(i, cols[j]));
-        } else {
-          ASSERT_FLOAT_EQ(1.0, b->getElement(i, cols[j]));
-        }
-      }
-    }
-  }
-}
-
-void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) {
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-  ASSERT_EQ(a->getWidth(), b->getWidth());
-  ASSERT_EQ(a->getHeight(), b->getHeight());
-  ASSERT_EQ(a->isTransposed(), b->isTransposed());
-  ASSERT_EQ(a->getFormat(), b->getFormat());
-  ASSERT_EQ(a->getValueType(), b->getValueType());
-  ASSERT_EQ(a->getElementCnt(), b->getElementCnt());
-  int count = 0;
-  if (a->getFormat() == SPARSE_CSR) {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        real aVal = a->getValue()[r];
-        real bVal = b->getValue()[r];
-        if (std::abs(aVal - bVal) > err) {
-          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
-            LOG(INFO) << "a=" << aVal << "\t"
-                      << "b=" << bVal;
-            count++;
-          }
-        }
-      }
-    }
-    for (size_t r = 0; r <= a->getHeight(); r++) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-    }
-  } else {
-    for (size_t r = 0; r < a->getElementCnt(); ++r) {
-      ASSERT_EQ(a->getRows()[r], b->getRows()[r]);
-      if (a->getValueType() == FLOAT_VALUE) {
-        real aVal = a->getValue()[r];
-        real bVal = b->getValue()[r];
-        if (std::abs(aVal - bVal) > err) {
-          if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) {
-            count++;
-          }
-        }
-      }
-    }
-    for (size_t r = 0; r <= a->getWidth(); r++) {
-      ASSERT_EQ(a->getCols()[r], b->getCols()[r]);
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void checkMatrixErr(const Matrix& matrix1, const Matrix& matrix2) {
-  CHECK(matrix1.getHeight() == matrix2.getHeight());
-  CHECK(matrix1.getWidth() == matrix2.getWidth());
-#ifndef PADDLE_TYPE_DOUBLE
-  real err = 1e-3;
-#else
-  real err = 1e-10;
-#endif
-
-  int height = matrix1.getHeight();
-  int width = matrix1.getWidth();
-  const real* data1 = matrix1.getData();
-  const real* data2 = matrix2.getData();
-  int count = 0;
-  for (int i = 0; i < height; i++) {
-    for (int j = 0; j < width; j++) {
-      real a = data1[i * width + j];
-      real b = data2[i * width + j];
-      if (std::abs(a - b) > err) {
-        if ((std::abs(a - b) / std::abs(a)) > (err / 10.0f)) {
-          count++;
-        }
-      }
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
-}
-
-void checkDataEqual(const real* a, const real* b, size_t size) {
-  for (size_t i = 0; i < size; ++i) {
-    ASSERT_FLOAT_EQ(a[i], b[i]);
-  }
-}
-
-}  //  namespace paddle
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
deleted file mode 100644
index ef99dab60a874846d04c5ce07d38b2857640ad7b..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_perturbation.cpp
+++ /dev/null
@@ -1,318 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-
-#include <cuda_runtime.h>
-#include <gtest/gtest.h>
-#include <cmath>
-#include <vector>
-#include "hl_cuda.h"
-#include "hl_perturbation_util.cuh"
-
-using namespace std;  // NOLINT
-
-#define _USE_MATH_DEFINES
-
-const int NUM_IMAGES = 2;
-const int SAMPLING_RATE = 2;
-const int IMG_SIZE = 41;
-const int TGT_SIZE = 21;
-const int CHANNELS = 3;
-
-class PerturbationTest : public testing::Test {
-protected:
-  virtual void SetUp() { generateTestImages(gpuImages_); }
-
-  virtual void TearDown() {}
-
-  void allocateMem(real*& gpuAngle,
-                   real*& gpuScale,
-                   int*& gpuCenterR,
-                   int*& gpuCenterC) {
-    gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    gpuCenterR =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-    gpuCenterC =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-  }
-
-  // Generate translation parameters for testing.
-  void generateTranslationParams(int*& gpuCenterR,
-                                 int*& gpuCenterC,
-                                 int imgSize) {
-    int cpuCenterR[NUM_IMAGES * SAMPLING_RATE];
-    int cpuCenterC[NUM_IMAGES * SAMPLING_RATE];
-    for (int i = 0; i < NUM_IMAGES * SAMPLING_RATE; ++i) {
-      cpuCenterR[i] = (imgSize - 1) / 2;
-      cpuCenterC[i] = (imgSize - 1) / 2 - 1;
-    }
-
-    gpuCenterR =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-    hl_memcpy_host2device(
-        gpuCenterR, cpuCenterR, sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-
-    gpuCenterC =
-        (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-    hl_memcpy_host2device(
-        gpuCenterC, cpuCenterC, sizeof(int) * NUM_IMAGES * SAMPLING_RATE);
-  }
-
-  // Generate rotation parameters for testing.
-  void generateRotationParams(real*& gpuAngle) {
-    real cpuAngle[NUM_IMAGES];
-    for (int i = 0; i < NUM_IMAGES; ++i) {
-      cpuAngle[i] = 90.0 * M_PI / 180.0;
-    }
-    gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    hl_memcpy_host2device(gpuAngle, cpuAngle, sizeof(real) * NUM_IMAGES);
-  }
-
-  void generateScaleParams(real*& gpuScale) {
-    real cpuScale[NUM_IMAGES];
-    for (int i = 0; i < NUM_IMAGES; ++i) {
-      cpuScale[i] = static_cast<real>(TGT_SIZE - 2) / TGT_SIZE;
-    }
-    gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES);
-    hl_memcpy_host2device(gpuScale, cpuScale, sizeof(real) * NUM_IMAGES);
-  }
-
-  // Generate the test images, only the center regions are set to 1.
-  // The other parts are set to 0.
-  void generateTestImages(real*& gpuImages) {
-    const int IMAGE_MEM_SIZE = NUM_IMAGES * IMG_SIZE * IMG_SIZE * CHANNELS;
-    real cpuImages[IMAGE_MEM_SIZE];
-    // Set the middle of each image to 1.
-    real* ptr = cpuImages;
-    for (int i = 0; i < NUM_IMAGES; ++i) {
-      for (int r = 0; r < IMG_SIZE; ++r) {
-        for (int c = 0; c < IMG_SIZE; ++c) {
-          for (int ch = 0; ch < CHANNELS; ++ch) {
-            if (r >= IMG_SIZE / 4 && r < IMG_SIZE - IMG_SIZE / 4 &&
-                c >= IMG_SIZE / 4 && c < IMG_SIZE - IMG_SIZE / 4) {
-              *ptr = 1.0;
-            } else {
-              *ptr = 0.0;
-            }
-            ++ptr;
-          }
-        }
-      }
-    }
-    gpuImages = (real*)hl_malloc_device(sizeof(real) * IMAGE_MEM_SIZE);
-    hl_memcpy_host2device(gpuImages, cpuImages, sizeof(real) * IMAGE_MEM_SIZE);
-  }
-
-  real* gpuImages_;
-};
-
-// Random perturbation. Only to make sure the code does not break.
-TEST_F(PerturbationTest, random_perturb) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb(gpuImages_,
-                         IMG_SIZE,
-                         TGT_SIZE,
-                         CHANNELS,
-                         NUM_IMAGES,
-                         1.0,
-                         1.0,
-                         SAMPLING_RATE,
-                         gpuAngle,
-                         gpuScaleRatio,
-                         gpuCenterR,
-                         gpuCenterC,
-                         2,
-                         true,
-                         targets);
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-}
-
-TEST_F(PerturbationTest, identity_perturb) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb(gpuImages_,
-                         IMG_SIZE,
-                         TGT_SIZE,
-                         CHANNELS,
-                         NUM_IMAGES,
-                         1.0,
-                         1.0,
-                         SAMPLING_RATE,
-                         gpuAngle,
-                         gpuScaleRatio,
-                         gpuCenterR,
-                         gpuCenterC,
-                         2,
-                         false,
-                         targets);
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < TARGET_MEM_SIZE; ++i) {
-    EXPECT_FLOAT_EQ(1.0, cpuTargets[i]);
-  }
-}
-
-TEST_F(PerturbationTest, translation_test) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-  hl_generate_disturb_params(gpuAngle,
-                             gpuScaleRatio,
-                             gpuCenterR,
-                             gpuCenterC,
-                             NUM_IMAGES,
-                             IMG_SIZE,
-                             0.0,
-                             0.0,
-                             SAMPLING_RATE,
-                             false);
-  generateTranslationParams(gpuCenterR, gpuCenterC, IMG_SIZE);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb_with_params(gpuImages_,
-                                     IMG_SIZE,
-                                     TGT_SIZE,
-                                     CHANNELS,
-                                     NUM_IMAGES,
-                                     SAMPLING_RATE,
-                                     gpuAngle,
-                                     gpuScaleRatio,
-                                     gpuCenterR,
-                                     gpuCenterC,
-                                     2,
-                                     targets);
-
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) {
-    for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) {
-      const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p;
-      if (p < TGT_SIZE * CHANNELS) {
-        EXPECT_FLOAT_EQ(0.0, cpuTargets[offset]);
-      } else {
-        EXPECT_FLOAT_EQ(1.0, cpuTargets[offset]);
-      }
-    }
-  }
-}
-
-TEST_F(PerturbationTest, rotation_test) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-  hl_generate_disturb_params(gpuAngle,
-                             gpuScaleRatio,
-                             gpuCenterR,
-                             gpuCenterC,
-                             NUM_IMAGES,
-                             IMG_SIZE,
-                             0.0,
-                             0.0,
-                             SAMPLING_RATE,
-                             false);
-  generateRotationParams(gpuAngle);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb_with_params(gpuImages_,
-                                     IMG_SIZE,
-                                     TGT_SIZE,
-                                     CHANNELS,
-                                     NUM_IMAGES,
-                                     SAMPLING_RATE,
-                                     gpuAngle,
-                                     gpuScaleRatio,
-                                     gpuCenterR,
-                                     gpuCenterC,
-                                     2,
-                                     targets);
-
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < TARGET_MEM_SIZE; ++i) {
-    EXPECT_FLOAT_EQ(1.0, cpuTargets[i]);
-  }
-}
-
-TEST_F(PerturbationTest, scale_test) {
-  real *gpuAngle, *gpuScaleRatio;
-  int *gpuCenterR, *gpuCenterC;
-  allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC);
-  hl_generate_disturb_params(gpuAngle,
-                             gpuScaleRatio,
-                             gpuCenterR,
-                             gpuCenterC,
-                             NUM_IMAGES,
-                             IMG_SIZE,
-                             0.0,
-                             0.0,
-                             SAMPLING_RATE,
-                             false);
-  generateScaleParams(gpuScaleRatio);
-
-  real* targets = NULL;
-  const int TARGET_MEM_SIZE =
-      NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS;
-  targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE);
-  hl_conv_random_disturb_with_params(gpuImages_,
-                                     IMG_SIZE,
-                                     TGT_SIZE,
-                                     CHANNELS,
-                                     NUM_IMAGES,
-                                     SAMPLING_RATE,
-                                     gpuAngle,
-                                     gpuScaleRatio,
-                                     gpuCenterR,
-                                     gpuCenterC,
-                                     2,
-                                     targets);
-
-  real cpuTargets[TARGET_MEM_SIZE];
-  hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE);
-  for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) {
-    for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) {
-      const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p;
-      int c = (p / CHANNELS) % TGT_SIZE;
-      int r = (p / CHANNELS) / TGT_SIZE;
-      if (r == 0 || r == TGT_SIZE - 1 || c == 0 || c == TGT_SIZE - 1) {
-        EXPECT_FLOAT_EQ(0.0, cpuTargets[offset]);
-      } else {
-        EXPECT_FLOAT_EQ(1.0, cpuTargets[offset]);
-      }
-    }
-  }
-}
-
-#endif
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
deleted file mode 100644
index 12647d21a29936e169b893ec8119b64fec9af580..0000000000000000000000000000000000000000
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_CUDA
-/// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result,
-//  so disable when
-/// only cpu version.
-
-#include <gtest/gtest.h>
-#include "paddle/math/Matrix.h"
-#include "paddle/utils/Util.h"
-#include "test_matrixUtil.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static inline int uniformRandom(int n) { return n == 0 ? 0 : rand() % n; }
-
-void testSpMatrixAddBias(int M, int N, real rate, real scale) {
-  int nnz = M * N * rate;
-
-  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
-
-  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_1);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  hl_stream_synchronize(stream);
-
-  cpuA->addBias(*cpuB, scale);
-  gpuA->addBias(*gpuB, scale);
-
-  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
-  outputCheck->copyFrom(*gpuA, stream);
-  hl_stream_synchronize(stream);
-  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
-                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
-}
-
-void testSpMatrixAddDense(int M, int N, real rate) {  // add3
-  int nnz = M * N * rate;
-
-  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(M, N);
-
-  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(M, N);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  hl_stream_synchronize(stream);
-
-  cpuA->add3(cpuB);
-  gpuA->add3(gpuB);
-
-  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
-  outputCheck->copyFrom(*gpuA, stream);
-  hl_stream_synchronize(stream);
-  checkSMatrixEqual2(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuA),
-                     std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
-}
-
-void testSpMatrixMul(int M, int N, int K, real rate) {
-  int nnz = M * N * rate;
-
-  MatrixPtr cpuA = std::make_shared<CpuMatrix>(M, K);
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(N, K);
-  MatrixPtr cpuC(new CpuSparseMatrix(M, N, nnz));
-
-  MatrixPtr gpuA = std::make_shared<GpuMatrix>(M, K);
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(N, K);
-  MatrixPtr gpuC(new GpuSparseMatrix(M, N, nnz));
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-  cpuC->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  gpuC->copyFrom(*cpuC, stream);
-  hl_stream_synchronize(stream);
-
-  cpuC->mul(*cpuA, *cpuB->getTranspose(), 1, 1);
-  gpuC->mul(*gpuA, *gpuB->getTranspose(), 1, 1);
-
-  MatrixPtr outputCheck(new CpuSparseMatrix(M, N, nnz));
-  outputCheck->copyFrom(*gpuC, stream);
-  hl_stream_synchronize(stream);
-  checkSMatrixErr(std::dynamic_pointer_cast<CpuSparseMatrix>(cpuC),
-                  std::dynamic_pointer_cast<CpuSparseMatrix>(outputCheck));
-}
-
-void testSpMatrixCollectBias(int M, int N, real rate) {
-  int nnz = M * N * rate;
-  LOG(INFO) << "nnz=" << nnz;
-
-  MatrixPtr cpuA(new CpuSparseMatrix(M, N, nnz));
-  MatrixPtr cpuB = std::make_shared<CpuMatrix>(1, N);
-
-  MatrixPtr gpuA(new GpuSparseMatrix(M, N, nnz));
-  MatrixPtr gpuB = std::make_shared<GpuMatrix>(1, N);
-
-  cpuA->randomizeUniform();
-  cpuB->randomizeUniform();
-
-  hl_stream_t stream(HPPL_STREAM_3);
-  gpuA->copyFrom(*cpuA, stream);
-  gpuB->copyFrom(*cpuB, stream);
-  hl_stream_synchronize(stream);
-
-  cpuB->collectBias(*cpuA, 1);
-  gpuB->collectBias(*gpuA, 1);
-
-  MatrixPtr outputCheck = std::make_shared<CpuMatrix>(1, N);
-  outputCheck->copyFrom(*gpuB, stream);
-  hl_stream_synchronize(stream);
-  checkMatrixErr(*cpuB, *outputCheck);
-}
-
-TEST(SMatrix, sMatrixOp) {
-  for (auto height : {1, 11, 200}) {
-    for (auto width : {200, 2048, 20480}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-      for (auto rate : {0.02, 0.1}) {
-        testSpMatrixAddDense(height, width, rate);
-        testSpMatrixAddBias(height, width, rate, 1.0);
-      }
-    }
-  }
-}
-
-TEST(SMatrix, sMatrixMul) {
-  for (auto M : {1, 40, 128, 200}) {
-    for (auto N : {100, 2000, 20480}) {
-      for (auto K : {100, 512, 1024}) {
-        VLOG(3) << " M=" << M << " N=" << N << " K=" << K;
-        testSpMatrixMul(M, N, K, 0.05);
-      }
-    }
-  }
-}
-
-TEST(SMatrix, sMatrixCollectBias) {
-  for (auto height : {1, 128, 200}) {
-    for (auto width : {100, 2048, 20480}) {
-      VLOG(3) << " height=" << height << " width=" << width;
-      testSpMatrixCollectBias(height, width, 0.1);
-    }
-  }
-}
-
-#endif
diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
deleted file mode 100644
index 25fc35311fc63988c64a445d72fc6255e49e8d4b..0000000000000000000000000000000000000000
--- a/paddle/optimizer/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-set(OPITMIZER_SRCS
-    adadelta_optimizer.cc
-    adagrad_optimizer.cc
-    adam_optimizer.cc
-    optimizer.cc
-    parameter_optimizer.cc
-    sgd_optimizer.cc
-  )
-
-cc_library(paddle_optimizer STATIC SRCS ${OPITMIZER_SRCS} DEPS paddle_proto glog)
-cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
-cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
deleted file mode 100644
index 74df9d54be734fedec8aeddff5f50b1d1aefb1d3..0000000000000000000000000000000000000000
--- a/paddle/optimizer/adadelta_optimizer.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class AdadeltaOptimizer : public ParameterOptimizer {
-public:
-  AdadeltaOptimizer(
-      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
-      : ParameterOptimizer(parameter, lr),
-        accum_gradient_(new Tensor(parameter->size())),
-        accum_delta_(new Tensor(parameter->size())),
-        update_delta_(new Tensor(parameter->size())),
-        rho_(rho),
-        epsilon_(epsilon),
-        decay_(decay) {}
-
-  ~AdadeltaOptimizer() {
-    if (accum_gradient_) delete accum_gradient_;
-    if (accum_delta_) delete accum_delta_;
-    if (update_delta_) delete update_delta_;
-  }
-  void Update(const Tensor *gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string &state);
-
-private:
-  Tensor *accum_gradient_;
-  Tensor *accum_delta_;
-  Tensor *update_delta_;
-  double rho_;
-  double epsilon_;
-  double decay_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
deleted file mode 100644
index 1d58402d78ff9ada8b084a472d46c96580d01e5b..0000000000000000000000000000000000000000
--- a/paddle/optimizer/adagrad_optimizer.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class AdagradOptimizer : public ParameterOptimizer {
-public:
-  AdagradOptimizer(Tensor *parameter,
-                   LrPolicy *lr,
-                   double epsilon,
-                   double decay)
-      : ParameterOptimizer(parameter, lr),
-        accum_gradient_(new Tensor(parameter->size())),
-        epsilon_(epsilon),
-        decay_(decay) {}
-  ~AdagradOptimizer() {
-    if (accum_gradient_) delete accum_gradient_;
-  }
-  void Update(const Tensor *gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string &state);
-
-private:
-  Tensor *accum_gradient_;
-  double epsilon_;
-  double decay_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
deleted file mode 100644
index 7977226c8602745d5733021a51fc03d932b0921a..0000000000000000000000000000000000000000
--- a/paddle/optimizer/adam_optimizer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class AdamOptimizer : public ParameterOptimizer {
-public:
-  AdamOptimizer(Tensor *parameter,
-                LrPolicy *lr,
-                double beta_1,
-                double beta_2,
-                double epsilon,
-                double decay)
-      : ParameterOptimizer(parameter, lr),
-        momentums_(new Tensor(parameter->size())),
-        velocitys_(new Tensor(parameter->size())),
-        beta_1_(beta_1),
-        beta_2_(beta_2),
-        epsilon_(epsilon),
-        decay_(decay) {}
-  ~AdamOptimizer() {
-    if (momentums_) delete momentums_;
-    if (velocitys_) delete velocitys_;
-  }
-  void Update(const Tensor *gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string &state);
-
-private:
-  Tensor *momentums_;
-  Tensor *velocitys_;
-  double beta_1_;
-  double beta_2_;
-  double epsilon_;
-  double decay_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
deleted file mode 100644
index 14422d1f42fc45d5e9a560c45259d4003a0b3d11..0000000000000000000000000000000000000000
--- a/paddle/optimizer/lr_policy.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-
-#include <algorithm>
-#include "OptimizerConfig.pb.h"
-
-namespace paddle {
-namespace optimizer {
-
-class LrPolicy {
-public:
-  virtual ~LrPolicy() {}
-  virtual double LearningRate(const uint64_t num_sample_passed) = 0;
-  virtual std::string SerializeState() = 0;
-  virtual void DeserializeState(const std::string &state) = 0;
-};
-
-// constant learning rate policy
-class ConstLr final : public LrPolicy {
-public:
-  ConstLr(double lr) : learning_rate_(lr){};
-  double LearningRate(const uint64_t num_sample_passed) {
-    return learning_rate_;
-  }
-  std::string SerializeState() {
-    LrPolicyState state;
-    state.set_learning_rate(learning_rate_);
-    return state.SerializeAsString();
-  }
-  void DeserializeState(const std::string &str) {
-    LrPolicyState state;
-    state.ParseFromString(str);
-    learning_rate_ = state.learning_rate();
-  }
-
-private:
-  double learning_rate_;
-};
-
-class LinearLr final : public LrPolicy {
-public:
-  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
-      : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
-  double LearningRate(const uint64_t num_sample_passed) {
-    return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed,
-                    lr_decay_b_);
-  }
-  std::string SerializeState() {
-    LrPolicyState state;
-    state.set_learning_rate(learning_rate_);
-    state.set_lr_decay_a(lr_decay_a_);
-    state.set_lr_decay_b(lr_decay_b_);
-    return state.SerializeAsString();
-  }
-  void DeserializeState(const std::string &str) {
-    LrPolicyState state;
-    state.ParseFromString(str);
-    learning_rate_ = state.learning_rate();
-    lr_decay_a_ = state.lr_decay_a();
-    lr_decay_b_ = state.lr_decay_b();
-  }
-
-private:
-  double learning_rate_;
-  double lr_decay_a_;
-  double lr_decay_b_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
deleted file mode 100644
index c7cf8db3ee05c75c171b68bcbcb06a5ae8fa5b48..0000000000000000000000000000000000000000
--- a/paddle/optimizer/parameter_optimizer.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <functional>
-#include <string>
-#include "OptimizerConfig.pb.h"
-#include "lr_policy.h"
-#include "serialization.h"
-#include "tensor.h"
-
-namespace paddle {
-namespace optimizer {
-
-class ParameterOptimizer {
-public:
-  /**
-   * @brief  update hook for algorithm need to traverse parameter more than
-   * once.
-   */
-  ParameterOptimizer(Tensor *parameter, LrPolicy *lr)
-      : parameter_(parameter), lr_policy_(lr), num_sample_passed_(0) {}
-  virtual ~ParameterOptimizer() {
-    delete parameter_;
-    delete lr_policy_;
-  }
-
-  static ParameterOptimizer *Create(const std::string &config_proto,
-                                    Tensor *parameter);
-  virtual void Update(const Tensor *gradient) = 0;
-  virtual float *get_weight(int *param_size) const;
-  virtual std::string SerializeState() = 0;
-  virtual void DeserializeState(const std::string &state) = 0;
-
-protected:
-  Tensor *parameter_;
-  // learning rate policy
-  LrPolicy *lr_policy_;
-  uint64_t num_sample_passed_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/optimizer/parameter_optimizer_test.cc
deleted file mode 100644
index d663e2fd007febd3b9f0f43d213d63d2b20656b8..0000000000000000000000000000000000000000
--- a/paddle/optimizer/parameter_optimizer_test.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "parameter_optimizer.h"
-#include <cmath>
-#include <map>
-#include <vector>
-#include "gtest/gtest.h"
-#include "lr_policy.h"
-
-paddle::optimizer::Tensor* FillTensor(size_t size) {
-  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
-  paddle::optimizer::Tensor& p = *param;
-  for (size_t i = 0; i < p.size(); ++i) {
-    p[i] = (float)rand() / (float)RAND_MAX;
-  }
-  return param;
-}
-
-paddle::optimizer::Tensor* FixedTensor(size_t size) {
-  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
-  paddle::optimizer::Tensor& p = *param;
-  for (size_t i = 0; i < p.size(); ++i) {
-    p[i] = i;
-  }
-  return param;
-}
-
-class OptimizerTest : public testing::Test {
-public:
-  virtual ~OptimizerTest() {}
-  // init paddle::optimizer::Tensor shape
-  const size_t kSize = 5;
-
-  virtual void SetUp() {
-    CreateSGD();
-    CreateAdam();
-  }
-  virtual void TearDown() {}
-
-  void CreateSGD() {
-    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(paddle::OptimizerConfig::SGD);
-    config_.mutable_sgd()->set_momentum(0.0);
-    config_.mutable_sgd()->set_decay(0.0);
-    config_.mutable_sgd()->set_nesterov(false);
-    config_.set_lr_policy(paddle::OptimizerConfig::Const);
-    config_.mutable_const_lr()->set_learning_rate(0.1);
-    std::string str = config_.SerializeAsString();
-    paddle::optimizer::ParameterOptimizer* opt =
-        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
-    opts_.push_back(opt);
-  }
-
-  void CreateAdam() {
-    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(paddle::OptimizerConfig::Adam);
-    config_.mutable_adam()->set_beta_1(0.9);
-    config_.mutable_adam()->set_beta_2(0.1);
-    config_.mutable_adam()->set_epsilon(1e-3);
-    config_.mutable_adam()->set_decay(0.0);
-    config_.set_lr_policy(paddle::OptimizerConfig::Const);
-    config_.mutable_const_lr()->set_learning_rate(0.1);
-    std::string str = config_.SerializeAsString();
-    paddle::optimizer::ParameterOptimizer* opt =
-        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
-    opts_.push_back(opt);
-  }
-
-  void TestGetWeight() {
-    paddle::optimizer::Tensor* p = FixedTensor(kSize);
-    for (size_t i = 0; i < opts_.size(); ++i) {
-      int s = 0;
-      float* newp = (float*)opts_[i]->get_weight(&s);
-      EXPECT_EQ(static_cast<size_t>(s), kSize);
-      for (size_t j = 0; j < kSize; ++j) {
-        EXPECT_EQ(newp[j], (*p)[j]);
-      }
-    }
-  }
-
-  void TestUpdate() {
-    paddle::optimizer::Tensor* g = FixedTensor(kSize);
-    for (size_t i = 0; i < opts_.size(); ++i) {
-      opts_[i]->Update(g);
-    }
-  }
-
-  void TestCheckPoint() {
-    paddle::optimizer::Tensor* p = FixedTensor(kSize);
-    for (size_t i = 0; i < opts_.size(); ++i) {
-      auto state = opts_[i]->SerializeState();
-      opts_[i]->DeserializeState(state);
-      auto state1 = opts_[i]->SerializeState();
-      opts_[i]->DeserializeState(state);
-      EXPECT_EQ(state, state1);
-
-      int s = 0;
-      float* newp = (float*)opts_[i]->get_weight(&s);
-      EXPECT_EQ(static_cast<size_t>(s), kSize);
-      for (size_t j = 0; j < kSize; ++j) {
-        EXPECT_EQ(newp[j], (*p)[j]);
-      }
-    }
-  }
-
-private:
-  std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
-  paddle::OptimizerConfig config_;
-};
-
-TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
-
-TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
-
-TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
diff --git a/paddle/optimizer/serialization.h b/paddle/optimizer/serialization.h
deleted file mode 100644
index bf12eed15f0190b8e856163c68690f3f6eef9a12..0000000000000000000000000000000000000000
--- a/paddle/optimizer/serialization.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <type_traits>
-#include "OptimizerConfig.pb.h"
-#include "paddle/utils/Logging.h"
-#include "tensor.h"
-
-namespace paddle {
-namespace optimizer {
-
-static void TensorToProto(const Tensor& tensor, TensorProto* proto) {
-  proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32);
-  std::stringstream os;
-  for (size_t i = 0; i < tensor.size(); ++i) {
-    os << tensor[i];
-    proto->add_content(os.str());
-    os.str(std::string());
-  }
-}
-
-static void ProtoToTensor(const TensorProto& proto, Tensor* tensor) {
-  std::stringstream sin;
-  for (auto i = 0; i < proto.content_size(); ++i) {
-    sin << proto.content(i);
-    sin >> (*tensor)[i];
-    sin.str(std::string());
-    sin.clear();
-  }
-}
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
deleted file mode 100644
index f504d98adb8a01fd69ff313075b4c417222c765e..0000000000000000000000000000000000000000
--- a/paddle/optimizer/sgd_optimizer.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "parameter_optimizer.h"
-
-namespace paddle {
-namespace optimizer {
-
-class SGDOptimizer : public ParameterOptimizer {
-public:
-  SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
-      : ParameterOptimizer(parameter, lr),
-        momentums_(nullptr),
-        momentum_(m),
-        decay_(d),
-        nesterov_(n) {
-    if (momentum_ != 0.0) {
-      size_t size = parameter->size();
-      momentums_ = new Tensor(size);
-    }
-  }
-  virtual ~SGDOptimizer() {
-    if (momentums_) delete momentums_;
-  }
-  void Update(const Tensor* gradient);
-  std::string SerializeState();
-  void DeserializeState(const std::string& state);
-
-private:
-  Tensor* momentums_;
-  double momentum_;
-  double decay_;
-  bool nesterov_;
-};
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/optimizer/tensor.h b/paddle/optimizer/tensor.h
deleted file mode 100644
index fd32398a237e7e08a198707347cd3c0a4ed77bb3..0000000000000000000000000000000000000000
--- a/paddle/optimizer/tensor.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-/**
- * @brief tensor used by optimizer
- */
-
-#include <string.h>
-#include <memory>
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-namespace optimizer {
-
-template <class T>
-class TensorT {
-public:
-  TensorT(size_t size) : height_(1), width_(size) {
-    // new T[size]() initializes all element to zero value.
-    data_ptr_ = std::shared_ptr<T>(new T[size](), std::default_delete<T[]>());
-    data_ = data_ptr_.get();
-  }
-
-  TensorT(T* data, size_t size)
-      : height_(1), width_(size), data_ptr_(nullptr), data_(data) {}
-
-  TensorT(T* data, size_t h, size_t w)
-      : height_(h), width_(w), data_ptr_(nullptr), data_(data) {}
-
-  virtual ~TensorT() {}
-
-  T* get_buffer() { return this->data_; }
-
-  T& operator[](const size_t idx) {
-    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
-    return data_[idx];
-  }
-  T& operator[](const size_t idx) const {
-    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
-    return data_[idx];
-  }
-  // TODO: replace with tensorshape
-  size_t size() const { return this->width_ * this->height_; }
-
-protected:
-  size_t height_;
-  size_t width_;
-  std::shared_ptr<T> data_ptr_;
-  T* data_;
-};
-
-// TODO(zhihong): design problem of dynamic datatype, need to fix it
-typedef TensorT<float> Tensor;
-
-}  // namespace optimizer
-}  // namespace paddle
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
deleted file mode 100644
index cfdaf8998b04e0307bc442dec0df734452634c67..0000000000000000000000000000000000000000
--- a/paddle/parameter/Argument.cpp
+++ /dev/null
@@ -1,707 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Argument.h"
-#include "paddle/math/SparseMatrix.h"
-
-#include <algorithm>
-
-namespace paddle {
-static void resizeAndCopy(MatrixPtr& dest,
-                          const MatrixPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    if (!dest) {
-      dest = src->clone(0, 0, useGpu);
-    } else {
-      CHECK_EQ(dest->useGpu(), useGpu);
-      dest->resize(src->getHeight(), src->getWidth());
-    }
-    dest->copyFrom(*src, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(IVectorPtr& dest,
-                          const IVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    IVector::resizeOrCreate(dest, src->getSize(), useGpu);
-    dest->copyFrom(*src, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(ICpuGpuVectorPtr& dest,
-                          const ICpuGpuVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    ICpuGpuVector::resizeOrCreate(dest, src->getSize(), useGpu);
-    dest->copyFrom(*src, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(MatrixPtr& dest,
-                          const MatrixPtr& src,
-                          int32_t startRow,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startRow + copySize, src->getHeight());
-    int height = copySize;
-    int width = src->getWidth();
-    if (!dest) {
-      dest = src->clone(height, width, useGpu);
-    } else {
-      CHECK_EQ(dest->useGpu(), useGpu);
-      dest->resize(height, width);
-    }
-    MatrixPtr submat = src->subMatrix(startRow, copySize);
-    if (dynamic_cast<GpuSparseMatrix*>(dest.get())) {
-      // copy a subMatrix of CpuSparseMatrix to GpuSparseMatrix.
-      // First copy it to CPU, and then copy it to the GPU.
-      MatrixPtr tmp = src->clone(height, width, false);
-      tmp->copyFrom(*submat, stream);
-      dest->copyFrom(*tmp, stream);
-    } else {
-      dest->copyFrom(*submat, stream);
-    }
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(IVectorPtr& dest,
-                          const IVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startPos + copySize, src->getSize());
-
-    int height = copySize;
-    IVector::resizeOrCreate(dest, height, useGpu);
-    dest->copyFrom(src->getData() + startPos, height, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(ICpuGpuVectorPtr& dest,
-                          const ICpuGpuVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startPos + copySize, src->getSize());
-
-    ICpuGpuVector::resizeOrCreate(dest, copySize, useGpu);
-    dest->copyFrom(*src, startPos, copySize, useGpu, stream);
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(SVectorPtr& dest,
-                          const SVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    size_t height = src->size();
-    if (!dest) {
-      dest = std::make_shared<std::vector<std::string>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin(), height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(SVectorPtr& dest,
-                          const SVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK_LE((size_t)startPos + copySize, src->size());
-    size_t height = copySize;
-    if (!dest) {
-      dest = std::make_shared<std::vector<std::string>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin() + startPos, height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
-
-void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) {
-  resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-}
-
-void Argument::resizeAndCopyFrom(const Argument& src,
-                                 bool useGpu,
-                                 hl_stream_t stream) {
-  dataId = src.dataId;
-  resizeAndCopy(value, src.value, useGpu, stream);
-  resizeAndCopy(grad, src.grad, useGpu, stream);
-  resizeAndCopy(in, src.in, useGpu, stream);
-  resizeAndCopy(ids, src.ids, useGpu, stream);
-  resizeAndCopy(sequenceStartPositions,
-                src.sequenceStartPositions,
-                false /* useGpu */,
-                stream);
-  if (src.hasSubseq()) {
-    resizeAndCopy(subSequenceStartPositions,
-                  src.subSequenceStartPositions,
-                  false /* useGpu */,
-                  stream);
-  }
-  resizeAndCopy(strs, src.strs, useGpu, stream);
-  frameWidth = src.frameWidth;
-  frameHeight = src.frameHeight;
-  frameDepth = src.frameDepth;
-}
-
-int32_t Argument::resizeAndCopyFrom(const Argument& src,
-                                    int32_t startSeq,
-                                    int32_t copySize,
-                                    bool useGpu) {
-  int32_t size =
-      resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT);
-  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-  return size;
-}
-
-int32_t Argument::resizeAndCopyFrom(const Argument& src,
-                                    int32_t startSeq,
-                                    int32_t copySize,
-                                    bool useGpu,
-                                    hl_stream_t stream) {
-  dataId = src.dataId;
-  frameWidth = src.frameWidth;
-  frameHeight = src.frameHeight;
-  frameDepth = src.frameDepth;
-
-  if (!src.sequenceStartPositions) {
-    // non-sequence input, copy samples directly
-    int32_t startRow = startSeq;
-    resizeAndCopy(in, src.in, startRow, copySize, useGpu, stream);
-    resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream);
-    resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream);
-    resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream);
-    resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
-    return copySize;
-  } else {
-    // sequence input
-    const int* sequence = src.sequenceStartPositions->getData(false);
-    int32_t startRow = sequence[startSeq];           // sample start from here
-    int32_t endRow = sequence[startSeq + copySize];  // sample end
-    int32_t copyFeatureSize = endRow - startRow;     // num of samples
-    resizeAndCopy(in, src.in, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(sequenceStartPositions,
-                  src.sequenceStartPositions,
-                  startSeq,
-                  copySize + 1,
-                  false,
-                  stream);
-    // modify new sequenceStartPositions
-    int* destSequences = sequenceStartPositions->getMutableData(false);
-    for (int i = 0; i < copySize + 1; i++) {
-      destSequences[i] -= startRow;
-    }
-    CHECK_EQ(destSequences[0], 0);
-    CHECK_EQ(destSequences[copySize], copyFeatureSize);
-    if (src.hasSubseq()) {
-      // sequence has sub-sequence
-      int* subSequence = src.subSequenceStartPositions->getMutableData(false);
-      int32_t subStartSeq = 0;
-      int32_t subEndSeq = 0;
-      int numSubSequences = src.getNumSubSequences();
-      for (int i = 0; i < numSubSequences + 1; i++) {
-        if (subSequence[i] == startRow) {
-          subStartSeq = i;
-        } else if (subSequence[i] == endRow) {
-          subEndSeq = i;
-          break;
-        }
-      }
-      int32_t copySubSize = subEndSeq - subStartSeq;
-      resizeAndCopy(subSequenceStartPositions,
-                    src.subSequenceStartPositions,
-                    subStartSeq,
-                    copySubSize + 1,
-                    false,
-                    stream);
-      // modify new subSequenceStartPositions
-      int* destSubSequences = subSequenceStartPositions->getMutableData(false);
-      for (int i = 0; i < copySubSize + 1; i++) {
-        destSubSequences[i] -= startRow;
-      }
-      CHECK_EQ(destSubSequences[0], 0);
-      CHECK_EQ(destSubSequences[copySubSize], copyFeatureSize);
-    }
-    resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
-    return copyFeatureSize;
-  }
-}
-
-void Argument::concat(const std::vector<Argument>& args,
-                      const std::vector<int>& selectRows,
-                      const std::vector<int>& seqStartPos,
-                      const std::vector<int>& copySize,
-                      bool useGpu,
-                      hl_stream_t stream,
-                      PassType passType) {
-  CHECK(!subSequenceStartPositions)
-      << "undefined behavior for subsequence positions";
-
-  size_t batchSize = 0;
-  for (size_t i = 0; i < copySize.size(); ++i)
-    batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
-
-  auto copyArg = [batchSize, stream](MatrixPtr& dst,
-                                     MatrixPtr src,
-                                     int desStartRow,
-                                     int srcStartRow,
-                                     int size,
-                                     bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    size_t width = src->getWidth();
-    if (!dst) {
-      dst = src->clone(batchSize, width, useGpu);
-    } else {
-      dst->resize(batchSize, width);
-    }
-
-    MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size);
-    tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream);
-  };
-
-  auto copyIds = [batchSize, stream](IVectorPtr& dst,
-                                     const IVectorPtr& src,
-                                     int desStartRow,
-                                     int srcStartRow,
-                                     int size,
-                                     bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(desStartRow, size)
-        ->copyFrom(*src->subVec(srcStartRow, size), stream);
-  };
-
-  auto copyStrs = [batchSize, stream](SVectorPtr& dst,
-                                      const SVectorPtr& src,
-                                      int desStartRow,
-                                      int srcStartRow,
-                                      int size,
-                                      bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    if (!dst) {
-      dst = std::make_shared<std::vector<std::string>>(batchSize);
-    } else {
-      dst->resize(batchSize);
-    }
-    std::copy(src->begin() + srcStartRow,
-              src->begin() + srcStartRow + size,
-              dst->begin() + desStartRow);
-  };
-
-  dataId = args[0].dataId;
-  CHECK_NE(seqStartPos.size(), 0UL);
-  int desStartRow = 0;
-  for (size_t i = 0; i < copySize.size(); ++i) {
-    int startPos = seqStartPos[i];
-    int endPos = seqStartPos[i + 1];
-    CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos));
-    for (int j = startPos; j < endPos; ++j) {
-      const Argument& arg = args[j - startPos];
-      CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have "
-                                   << "the same dataId.";
-      const int srcStartRow = selectRows[j];
-      copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu);
-      copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu);
-      if (passType != PASS_TEST) {
-        copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu);
-      }
-      copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu);
-      copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu);
-      desStartRow += copySize[i];
-    }
-  }
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, seqStartPos.size(), useGpu);
-  sequenceStartPositions->copyFrom(
-      seqStartPos.data(), seqStartPos.size(), useGpu);
-}
-
-void Argument::concat(const std::vector<Argument>& args,
-                      bool useGpu,
-                      hl_stream_t stream,
-                      PassType passType) {
-  int32_t batchSize = 0;
-  int64_t numSequences = 0;
-  int64_t numSubSequences = 0;
-  for (auto& arg : args) {
-    batchSize += arg.getBatchSize();
-    numSequences += arg.getNumSequences();
-    numSubSequences += arg.getNumSubSequences();
-  }
-
-  auto copyArg = [batchSize, stream](
-      MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    size_t width = src->getWidth();
-    if (!dst) {
-      dst = src->clone(batchSize, width, useGpu);
-    } else {
-      dst->resize(batchSize, width);
-    }
-
-    MatrixPtr tmpMatrix = dst->subMatrix(startRow, src->getHeight());
-    tmpMatrix->copyFrom(*src, stream);
-  };
-
-  auto copyIds = [batchSize, stream](
-      IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(startRow, src->getSize())->copyFrom(*src, stream);
-  };
-
-  auto copyStrs = [batchSize, stream](
-      SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) {
-    if (!src) {
-      dst.reset();
-      return;
-    }
-    if (!dst) {
-      dst = std::make_shared<std::vector<std::string>>(batchSize);
-    } else {
-      dst->resize(batchSize);
-    }
-    std::copy(src->begin(), src->end(), dst->begin() + startRow);
-  };
-
-  auto copySequencePos = [](ICpuGpuVectorPtr& dstSeq,
-                            const ICpuGpuVectorPtr& srcSeq,
-                            int dstNumSequences,
-                            int srcNumSequences,
-                            int& startSequences,
-                            int startRow) {
-    if (srcSeq) {
-      ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false);
-      const int* src = srcSeq->getData(false);
-      int* dest = dstSeq->getMutableData(false);
-      for (int i = 0; i < srcNumSequences + 1; ++i) {
-        dest[i + startSequences] = src[i] + startRow;
-      }
-      startSequences += srcNumSequences;
-    } else {
-      dstSeq.reset();
-    }
-  };
-
-  int startRow = 0;
-  int startSequences = 0;
-  int startSubSequences = 0;
-  dataId = args[0].dataId;
-  for (auto& arg : args) {
-    CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have"
-                                 << " same dataId";
-    copyArg(in, arg.in, startRow, useGpu);
-    copyArg(value, arg.value, startRow, useGpu);
-    if (passType != PASS_TEST) copyArg(grad, arg.grad, startRow, useGpu);
-    copyIds(ids, arg.ids, startRow, useGpu);
-    copySequencePos(sequenceStartPositions,
-                    arg.sequenceStartPositions,
-                    numSequences,
-                    arg.getNumSequences(),
-                    startSequences,
-                    startRow);
-    copySequencePos(subSequenceStartPositions,
-                    arg.subSequenceStartPositions,
-                    numSubSequences,
-                    arg.getNumSubSequences(),
-                    startSubSequences,
-                    startRow);
-    copyStrs(strs, arg.strs, startRow, useGpu);
-    startRow += arg.getBatchSize();
-  }
-}
-
-void Argument::splitByDataId(const std::vector<Argument>& argus,
-                             std::vector<std::vector<Argument>>* arguGroups) {
-  arguGroups->clear();
-  int lastDataId = -1;
-  for (const auto& argu : argus) {
-    if (argu.dataId == -1) {
-      // is -1, then create a new group
-      arguGroups->emplace_back();
-      lastDataId = -1;
-    } else if (argu.dataId != lastDataId) {
-      // not -1, also not equal to last Argument, then create a new group
-      arguGroups->emplace_back();
-      lastDataId = argu.dataId;
-    } else {
-      // not -1, and equal to last Argument, do nothing
-    }
-    arguGroups->back().push_back(argu);
-  }
-}
-
-void Argument::getSeqInfo(std::vector<SeqInfo>* seqInfo) const {
-  const int* starts = sequenceStartPositions->getData(false);
-  const int* subStarts =
-      hasSubseq() ? subSequenceStartPositions->getData(false) : nullptr;
-  size_t numSequences = getNumSequences();
-  seqInfo->reserve(numSequences);
-  int subSeqEnd = 0;
-  for (size_t i = 0; i < numSequences; ++i) {
-    SeqInfo info;
-    info.seqStart = starts[i];
-    info.subLevelLength = starts[i + 1] - starts[i];
-    info.seqId = i;
-    if (hasSubseq()) {
-      info.subSeqStart = subSeqEnd;
-      while (subStarts[subSeqEnd] < starts[i + 1]) {
-        ++subSeqEnd;
-      }
-      info.topLevelLength = subSeqEnd - info.subSeqStart;
-    } else {
-      info.topLevelLength = info.subLevelLength;
-      info.subSeqStart = 0;  // not used
-    }
-    seqInfo->push_back(info);
-  }
-  std::sort(
-      seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) {
-        return a.topLevelLength > b.topLevelLength;
-      });
-}
-
-void Argument::checkSubset() const {
-  if (getNumSequences() > getNumSubSequences()) {
-    LOG(FATAL) << "numSubSequences is less than numSequences ("
-               << getNumSubSequences() << " vs. " << getNumSequences() << ")";
-  }
-  const int* start = sequenceStartPositions->getData(false);
-  const int* subStart = subSequenceStartPositions->getData(false);
-  int seqId = 0;
-  int subSeqId = 0;
-  while (seqId < getNumSequences() && subSeqId < getNumSubSequences()) {
-    if (start[seqId] > subStart[subSeqId]) {
-      ++subSeqId;
-    } else if (start[seqId] == subStart[subSeqId]) {
-      ++subSeqId;
-      ++seqId;
-    } else {
-      LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions";
-    }
-  }
-  if (seqId < getNumSequences()) {
-    LOG(FATAL) << "seqStartPositions is not subset of subSeqStartPositions";
-  }
-}
-
-void Argument::degradeSequence(const Argument& input) {
-  CHECK_EQ(input.hasSubseq(), 1UL);
-  size_t numSequences = input.getNumSequences();
-  size_t numSubSequences = input.getNumSubSequences();
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  int* tgtBuf = sequenceStartPositions->getMutableData(false);
-  const int* starts = input.sequenceStartPositions->getData(false);
-  const int* subStarts = input.subSequenceStartPositions->getData(false);
-  int seqId = 0;
-  for (size_t subSeqId = 0; subSeqId < numSubSequences; ++subSeqId) {
-    if (subStarts[subSeqId] == starts[seqId]) {
-      tgtBuf[seqId] = subSeqId;
-      seqId++;
-    }
-  }
-  tgtBuf[numSequences] = numSubSequences;
-}
-
-void Argument::poolSequenceWithStride(const Argument& input,
-                                      size_t stride,
-                                      ICpuGpuVectorPtr* stridePostions,
-                                      bool reversed) {
-  // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
-  // then sequenceStartPositions = [0, 2, 3, 4, 7].
-  // If reversed = false, stridePostions = [0, 5, 9, 14, 17, 22, 27, 30];
-  // else reversed = true, stridePostions = [0, 4, 9, 14, 17, 20, 25, 30]
-
-  CHECK(input.sequenceStartPositions);
-  CHECK_EQ(input.hasSubseq(), 0UL);
-  CHECK_GT(stride, 0UL) << "stride must larger than 0";
-  size_t numSequences = input.getNumSequences();
-  ICpuGpuVector::resizeOrCreate(
-      sequenceStartPositions, numSequences + 1, false);
-  const int* starts = input.sequenceStartPositions->getData(false);
-  int* tgtBuf = sequenceStartPositions->getMutableData(false);
-  // first index of target sequence and stride positions are both 0
-  tgtBuf[0] = 0;
-  std::vector<int> stridePos;
-  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-    size_t seqLength = starts[seqId + 1] - starts[seqId];
-    stridePos.emplace_back(starts[seqId]);
-    if (seqLength == 0) {
-      // empty sequence
-      tgtBuf[seqId + 1] = tgtBuf[seqId];
-    } else {
-      int size = ceil((float)seqLength / stride);
-      tgtBuf[seqId + 1] = tgtBuf[seqId] + size;
-      for (int i = 0; i < size - 1; ++i) {
-        int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride
-                           : stridePos.back() + stride;
-        stridePos.emplace_back(cur);
-      }
-    }
-  }
-  stridePos.emplace_back(starts[numSequences]);
-  int size = stridePos.size();
-  CHECK_EQ(size - 1, tgtBuf[numSequences]);
-  ICpuGpuVector::resizeOrCreate(*stridePostions, size, false);
-  (*stridePostions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
-}
-
-void Argument::getValueString(
-    std::unordered_map<std::string, std::string>* out) const {
-  if (value) {
-    std::ostringstream os;
-    value->print(os);
-    out->insert({"value", os.str()});
-  }
-  if (ids) {
-    std::ostringstream os;
-    ids->print(os, ids->getSize());
-    out->insert({"ids", os.str()});
-  }
-  if (sequenceStartPositions) {
-    std::ostringstream os;
-    sequenceStartPositions->getVector(false)->print(
-        os, sequenceStartPositions->getSize());
-    out->insert({"sequence pos", os.str()});
-  }
-  if (subSequenceStartPositions) {
-    std::ostringstream os;
-    subSequenceStartPositions->getVector(false)->print(
-        os, subSequenceStartPositions->getSize());
-    out->insert({"sub-sequence pos", os.str()});
-  }
-}
-
-void Argument::printValueString(std::ostream& stream,
-                                const std::string& prefix) const {
-  std::unordered_map<std::string, std::string> out;
-  getValueString(&out);
-  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
-    auto it = out.find(field);
-    if (it != out.end()) {
-      stream << prefix << field << ":\n" << it->second;
-    }
-  }
-}
-
-void Argument::subArgFrom(const Argument& input,
-                          size_t offset,
-                          size_t height,
-                          size_t width,
-                          bool useGpu,
-                          bool trans,
-                          bool seqFlag,
-                          size_t seqStart,
-                          size_t seqSize) {
-  if (input.value) {
-    value = Matrix::create(
-        input.value->getData() + offset * width, height, width, trans, useGpu);
-  }
-  if (input.ids) {
-    ids = IVector::create(input.ids->getData() + offset, height, useGpu);
-  }
-  if (input.grad) {
-    grad = Matrix::create(
-        input.grad->getData() + offset * width, height, width, trans, useGpu);
-  }
-  if (seqFlag) {
-    sequenceStartPositions = std::make_shared<ICpuGpuVector>(
-        *(input.sequenceStartPositions), seqStart, seqSize);
-  }
-}
-
-void Argument::reorganizeSeqInfo(
-    const ICpuGpuVectorPtr seqStartPos,
-    const ICpuGpuVectorPtr subSeqStartPos,
-    std::vector<std::vector<int>>& reorganizedSeqInfo) {
-  CHECK(seqStartPos);
-  reorganizedSeqInfo.clear();
-
-  int seqNum = seqStartPos->getSize() - 1;
-  int* seqStarts = seqStartPos->getMutableData(false);
-
-  if (subSeqStartPos) {
-    int* subSeqStarts = subSeqStartPos->getMutableData(false);
-    reorganizedSeqInfo.resize(seqNum, std::vector<int>());
-    int seqIdx = 0;
-    for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
-      reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
-      if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
-        seqIdx++;
-        if (seqIdx == seqNum) return;
-        reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
-      }
-    }
-  } else {
-    reorganizedSeqInfo.resize(1, std::vector<int>(seqNum + 1, 0));
-    memcpy(reorganizedSeqInfo[0].data(),
-           seqStarts,
-           sizeof(int) * seqStartPos->getSize());
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
deleted file mode 100644
index e580d38216b699360fb30f135be8052ab56abf66..0000000000000000000000000000000000000000
--- a/paddle/parameter/Argument.h
+++ /dev/null
@@ -1,349 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "hl_gpu.h"
-
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-typedef std::shared_ptr<std::vector<std::string>> SVectorPtr;
-
-struct Argument {
-  Argument()
-      : in(nullptr),
-        value(nullptr),
-        ids(nullptr),
-        grad(nullptr),
-        strs(nullptr),
-        frameHeight(0),
-        frameWidth(0),
-        frameDepth(0),
-        sequenceStartPositions(nullptr),
-        subSequenceStartPositions(nullptr),
-        cpuSequenceDims(nullptr),
-        deviceId(-1),
-        allCount(0),
-        valueCount(0),
-        gradCount(0),
-        dataId(0) {}
-  Argument(const Argument& argument) {
-    *this = argument;
-    valueCount = 0;
-    gradCount = 0;
-    dataId = argument.dataId;
-  }
-  ~Argument() {}
-
-  void operator=(const Argument& argument) {
-    in = argument.in;
-    value = argument.value;
-    ids = argument.ids;
-    grad = argument.grad;
-    strs = argument.strs;
-    sequenceStartPositions = argument.sequenceStartPositions;
-    subSequenceStartPositions = argument.subSequenceStartPositions;
-    cpuSequenceDims = argument.cpuSequenceDims;
-    deviceId = argument.deviceId;
-    allCount = argument.allCount;
-    frameHeight = argument.frameHeight;
-    frameWidth = argument.frameWidth;
-    frameDepth = argument.frameDepth;
-    dataId = argument.dataId;
-  }
-
-  MatrixPtr in;  // used if needed
-  MatrixPtr value;
-  IVectorPtr ids;  // a sequence of ids. Can be use for class id for costLayer
-  MatrixPtr grad;  // If empty, gradient is not needed.
-  SVectorPtr strs;
-
-  // A dataBatch includes batchSize frames, one frame maybe not only vector
-  size_t frameHeight;
-  size_t frameWidth;
-  size_t frameDepth;
-
-  // If NULL, each position is treated independently.
-  // Otherwise, its size should be #NumberOfSequences + 1.
-  // The first position is always 0 and
-  // the last position should be equal to batchSize.
-  ICpuGpuVectorPtr sequenceStartPositions;
-
-  // If NULL, each sequence has no subsequence.
-  // Otherwise, its size should be #NumberOfSubSequences + 1.
-  // The first position is always 0 and
-  // the last position should be equal to batchSize.
-  ICpuGpuVectorPtr subSequenceStartPositions;
-
-  // dimension of sequence, stored only in CPU
-  IVectorPtr cpuSequenceDims;
-
-  int deviceId;            // the GPU device id which the argument in
-  int allCount;            // the number of output layers using this argument
-  mutable int valueCount;  // waiting this member when layer do forward
-  mutable int gradCount;   // waiting this member when layer do backward
-  mutable LockedCondition valueReadyCond;
-  mutable LockedCondition gradReadyCond;
-
-  int dataId;  // dataProvider id
-
-  /* Increase the reference count of the argument. */
-  void countIncrement() { allCount++; }
-
-  int getAllCount() const { return allCount; }
-
-  void waitValueReady() const {
-    valueReadyCond.wait([this] { return (valueCount != 0); });
-
-    std::lock_guard<std::mutex> guard(*valueReadyCond.mutex());
-    valueCount--;
-  }
-
-  void notifyValueReady() const {
-    valueReadyCond.notify_all([this] { valueCount = allCount; });
-  }
-
-  void waitGradReady() const {
-    gradReadyCond.wait([this] { return (gradCount == allCount); });
-    gradCount = 0;
-  }
-
-  void notifyGradReady() const {
-    gradReadyCond.notify_all([this] { gradCount++; });
-  }
-
-  int64_t getBatchSize() const {
-    if (value) return value->getHeight();
-    if (ids) return ids->getSize();
-    if (grad) return grad->getHeight();
-    if (in) return in->getHeight();
-    if (strs) return strs->size();
-    return 0;
-  }
-  size_t getFrameHeight() const { return frameHeight; }
-  size_t getFrameWidth() const { return frameWidth; }
-  size_t getFrameDepth() const { return frameDepth; }
-  void setFrameHeight(size_t h) { frameHeight = h; }
-  void setFrameWidth(size_t w) { frameWidth = w; }
-  void setFrameDepth(size_t d) { frameDepth = d; }
-
-  int64_t getNumSequences() const {
-    return sequenceStartPositions ? sequenceStartPositions->getSize() - 1
-                                  : getBatchSize();
-  }
-
-  int64_t getNumSubSequences() const {
-    return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1
-                                     : getBatchSize();
-  }
-
-  bool hasSeq() const { return sequenceStartPositions != nullptr; }
-  bool hasSubseq() const { return subSequenceStartPositions != nullptr; }
-
-  const int* getCpuStartPositions() const {
-    return hasSubseq() ? subSequenceStartPositions->getData(false)
-                       : sequenceStartPositions->getData(false);
-  }
-
-  static inline real sum(const std::vector<Argument>& arguments) {
-    real cost = 0;
-    for (auto& arg : arguments) {
-      if (arg.value) {
-        SetDevice device(arg.deviceId);
-        cost += arg.value->getSum();
-      }
-    }
-    return cost;
-  }
-
-  /**
-   * @brief (value, ids, grad, sequenceStartPositions) of output are subset of
-   *        input. Note that, output share the same memory of input.
-   *
-   * @param input[in]       input
-   * @param offset[in]      offset in terms of rows
-   * @param height[in]      height of output.value
-   * @param width[in]       width of output.value
-   * @param useGpu[in]
-   * @param trans[in]       whether input.value is transform
-   * @param seqFlag[in]     whether input has sequenceStartPositions
-   * @param seqStart[in]    offset of input.sequenceStartPositions
-   * @param seqSize[in]     lenght of output.sequenceStartPositions
-   */
-  void subArgFrom(const Argument& input,
-                  size_t offset,
-                  size_t height,
-                  size_t width,
-                  bool useGpu,
-                  bool trans = false,
-                  bool seqFlag = false,
-                  size_t seqStart = 0,
-                  size_t seqSize = 0);
-  /*
-   * for sequence input:
-   *   startSeq: the sequence id of start
-   *   copySize: how many sequences need to copy
-   *   return value: how many samples are copied
-   * for non-sequence input:
-   *   startSeq: the sample id of start
-   *   copySize: how many samples need to copy
-   *   return value: how many samples are copied
-   * Note that when specifying the stream explicitly in this case,
-   * synchronize should also be called somewhere after this function
-   */
-  int32_t resizeAndCopyFrom(const Argument& src,
-                            int32_t startSeq,
-                            int32_t copySize,
-                            bool useGpu,
-                            hl_stream_t stream);
-
-  /*
-   * same with the above function, except that the stream is
-   * HPPL_STREAM_DEFAULT and synchronize is automatically called
-   * inside it
-   */
-  int32_t resizeAndCopyFrom(const Argument& src,
-                            int32_t startSeq,
-                            int32_t copySize,
-                            bool useGpu = FLAGS_use_gpu);
-
-  void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream);
-
-  /*
-   * same with the above function, except that the stream is
-   * HPPL_STREAM_DEFAULT and synchronize is automatically called
-   * inside it
-   */
-  void resizeAndCopyFrom(const Argument& src, bool useGpu = FLAGS_use_gpu);
-
-  /*
-    @brief Concatenate several arguments into one and put the result into it.
-    @param args : a vector of argument, each element of which is a frame in a
-    batch of sequences.
-    @param selectRows : select several row of args to concatenate
-    @param seqStartPos : sequence start positions in the final Argument
-    @param hl_stream_t : cuda stream
-    @param passTyoe : type of task, training or testing
-   */
-  void concat(const std::vector<Argument>& args,
-              const std::vector<int>& selectRows,
-              const std::vector<int>& seqStartPos,
-              const std::vector<int>& copySize,
-              bool useGpu,
-              hl_stream_t stream,
-              PassType passType);
-
-  /*
-    Concatenate several args into one and put the result into this.
-   */
-  void concat(const std::vector<Argument>& src,
-              bool useGpu = FLAGS_use_gpu,
-              hl_stream_t stream = HPPL_STREAM_DEFAULT,
-              PassType passType = PASS_TEST);
-
-  /*
-   * split vector<Argument> to several vectors according to dataId
-   */
-  static void splitByDataId(const std::vector<Argument>& argus,
-                            std::vector<std::vector<Argument>>* arguGroups);
-
-  struct SeqInfo {
-    // Equal to sequence length for sequence data
-    // Equal to number of subsequences for subsequence data
-    int topLevelLength;
-
-    int seqStart;
-    int seqId;
-
-    // Equal to topLevelLength for sequence data
-    // Equal to sum of the length of subsequences for subsequence data
-    int subLevelLength;
-
-    // Only used for subsequence data, start position of this sequence
-    // is subSequenceStartPositions, i.e.
-    // subSequenceStartPositions[subSeqStart] == seqStart
-    int subSeqStart;
-  };
-  /*
-    Get SeqInfo for each sequence of this argument
-    Elements in *seqInfo are sorted by topLevelLength in descending order
-  */
-  void getSeqInfo(std::vector<SeqInfo>* segInfo) const;
-
-  /*
-   Check Whether sequenceStartPositions is subset of
-   subSequenceStartPositions.
-   */
-  void checkSubset() const;
-
-  /*
-   sequence has sub-sequence degrades to a sequence.
-   */
-  void degradeSequence(const Argument& input);
-
-  /*
-   After pooling with stride n (n is smaller than sequence length),
-   a long sequence will be shorten.
-   This function is invalid for sequence having sub-sequence.
-   */
-  void poolSequenceWithStride(const Argument& input,
-                              size_t stride,
-                              ICpuGpuVectorPtr* stridePositions,
-                              bool reversed = false);
-  /**
-   * @brief getValueString will return the argument's output in string. There
-   * are several kinds of output. The keys of output dictionary are 'value',
-   * 'id', 'sequence pos', 'sub-sequence pos'.
-   * @param out [out]: the return values.
-   */
-  void getValueString(std::unordered_map<std::string, std::string>* out) const;
-
-  /**
-   * @brief printValueString will print the argument's output in order of
-   * 'value', 'id', 'sequence pos', 'sub-sequence pos'.
-   * @param stream: Output stream
-   * @param prefix: line prefix for printing.
-   */
-  void printValueString(std::ostream& stream,
-                        const std::string& prefix = "") const;
-
-  /**
-   * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and
-   * subSequenceStartPositions into a 2 dimensional arrary: reorganizedSeqInfo.
-   *
-   * @param seqStartPos: sequenceStartPositions of an Argument.
-   * @param subSeqStartPos: subSequenceStartPositions of an Argument.
-   * @param the reorganized sequence start position information.
-   *
-   * Examples:
-   * seqStartPos: [0, 4, 15, 20, 28]
-   * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28]
-   * reorganizedSeqInfo:
-   *   [
-   *     [0,3,4],
-   *     [4,5,7,10,15],
-   *     [15,20],
-   *     [20,22,23,25,28]
-   *   ]
-   */
-  static void reorganizeSeqInfo(
-      const ICpuGpuVectorPtr seqStartPos,
-      const ICpuGpuVectorPtr subSeqStartPos,
-      std::vector<std::vector<int>>& reorganizedSeqInfo);
-};
-
-}  // namespace paddle
diff --git a/paddle/parameter/AverageOptimizer.cpp b/paddle/parameter/AverageOptimizer.cpp
deleted file mode 100644
index 75998d81dd9c8be35fe45e903dc1cd69068f83c6..0000000000000000000000000000000000000000
--- a/paddle/parameter/AverageOptimizer.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "AverageOptimizer.h"
-
-namespace paddle {
-
-// factory method to create an instance of AverageOptimizer
-ParameterOptimizer* AverageOptimizer::create(
-    const OptimizationConfig& optConfig,
-    ParameterOptimizer* optimizer,
-    bool isParameterSparse,
-    bool useParameterApply) {
-  if (optConfig.average_window() <= 0) {
-    return optimizer;
-  }
-  // disable average for embeded local updater
-  if (!useParameterApply && optConfig.num_batches_per_send_parameter() > 1) {
-    return optimizer;
-  }
-  if (isParameterSparse) {
-    return new AverageSparseOptimizer(optConfig, optimizer, useParameterApply);
-  }
-  return new AverageOptimizer(optConfig, optimizer, useParameterApply);
-}
-
-AverageOptimizer::AverageOptimizer(const OptimizationConfig& optConfig,
-                                   ParameterOptimizer* optimizer,
-                                   bool useParameterApply)
-    : ParameterOptimizer(optConfig),
-      optimizer_(optimizer),
-      useApply_(useParameterApply),
-      numUpdates_(0),
-      prevNumUpdates_(0),
-      numAccumulates_(0),
-      oldNumAccumulates_(0),
-      minAverageWindow_(
-          std::min<int64_t>(10000L, optConfig_.max_average_window())),
-      maxAverageWindow_(optConfig_.max_average_window()) {
-  parameterTypes_ = optimizer_->getParameterTypes();
-  addParameterType(PARAMETER_SUM1);
-  addParameterType(PARAMETER_SUM2);
-  addParameterType(PARAMETER_SUM3);
-  if (useParameterApply) {
-    addParameterType(PARAMETER_APPLY);
-  }
-}
-
-void AverageOptimizer::startBatch(int64_t numSamplesProcessed) {
-  optimizer_->startBatch(numSamplesProcessed);
-  learningRate_ = optimizer_->getLearningRate();
-
-  ++numUpdates_;
-  ++numAccumulates_;
-}
-
-/*
-  After traversal, the averaged parameter can be obtained by
-  ((PARAMETER_SUM1 + PARAMETER_SUM2 + PARAMETER_SUM3)
-  / (numAccumulates_ + oldNumAccumulates_))
-*/
-ParameterOptimizer::TraverseCallback AverageOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->needSpecialTraversal(config)) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (numUpdates_ % kMaxNumAccumulates == 0) {
-    // Move the sum to a different buffer to avoid loss of precision
-    // due to too many sums.
-    callbacks.emplace_back([this](const VectorPtr vecs[],
-                                  const ParameterConfig& config,
-                                  size_t sparseId) {
-      vecs[PARAMETER_SUM2]->add(*vecs[PARAMETER_SUM1]);
-      vecs[PARAMETER_SUM1]->zeroMem();
-    });
-  }
-
-  if (isAverageWindowTooLong()) {
-    // Now the average window is too long, discard the old sum.
-    if (auto callback = this->startCatchUpWith()) {
-      callbacks.emplace_back(callback);
-    }
-    callbacks.emplace_back([this](const VectorPtr vecs[],
-                                  const ParameterConfig& config,
-                                  size_t sparseId) {
-      vecs[PARAMETER_SUM3]->add(*vecs[PARAMETER_SUM1], *vecs[PARAMETER_SUM2]);
-      vecs[PARAMETER_SUM1]->zeroMem();
-      vecs[PARAMETER_SUM2]->zeroMem();
-    });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void AverageOptimizer::finishBatch() {
-  optimizer_->finishBatch();
-  if (isAverageWindowTooLong()) {
-    this->finishCatchUpWith();
-    oldNumAccumulates_ = numAccumulates_;
-    numAccumulates_ = 0;
-  }
-}
-
-ParameterOptimizer::TraverseCallback AverageOptimizer::apply() {
-  if (numAccumulates_ + oldNumAccumulates_ == 0) {
-    return nullptr;
-  }
-
-  real scale = 1. / (numAccumulates_ + oldNumAccumulates_);
-  if (useApply_) {
-    return [scale](const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) {
-      vecs[PARAMETER_APPLY]->add3(*vecs[PARAMETER_SUM1],
-                                  *vecs[PARAMETER_SUM2],
-                                  *vecs[PARAMETER_SUM3],
-                                  scale,
-                                  scale,
-                                  scale);
-    };
-  } else {
-    return [scale](const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) {
-      vecs[PARAMETER_GRADIENT]->copyFrom(*vecs[PARAMETER_VALUE]);
-      vecs[PARAMETER_VALUE]->add3(*vecs[PARAMETER_SUM1],
-                                  *vecs[PARAMETER_SUM2],
-                                  *vecs[PARAMETER_SUM3],
-                                  scale,
-                                  scale,
-                                  scale);
-    };
-  }
-}
-
-ParameterOptimizer::TraverseCallback AverageOptimizer::restore() {
-  if (numAccumulates_ + oldNumAccumulates_ == 0) {
-    return nullptr;
-  }
-  if (useApply_) {
-    return nullptr;
-  }
-
-  return [](
-      const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) {
-    vecs[PARAMETER_VALUE]->copyFrom(*vecs[PARAMETER_GRADIENT]);
-    vecs[PARAMETER_GRADIENT]->zeroMem();
-  };
-}
-
-void AverageSparseOptimizer::update(const VectorPtr vecs[],
-                                    const ParameterConfig& paraConfig,
-                                    size_t sparseId) const {
-  optimizer_->update(vecs, paraConfig, sparseId);
-
-  CHECK_LT(sparseId, t0Vec_.size());
-  int timediff = timer_ + 1 - t0Vec_[sparseId];
-  if (timediff > 0) {
-    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], timediff);
-    t0Vec_[sparseId] = timer_ + 1;
-  }
-}
-
-ParameterOptimizer::TraverseCallback AverageSparseOptimizer::startCatchUpWith()
-    const {
-  TraverseCallbackVec callbacks;
-
-  if (auto callback = optimizer_->startCatchUpWith()) {
-    callbacks.emplace_back(callback);
-  }
-
-  if (timer_ > 0) {
-    callbacks.emplace_back(
-        [this](const VectorPtr vecs[],
-               const ParameterConfig& config,
-               size_t sparseId) { this->catchUpWith(vecs, config, sparseId); });
-  }
-
-  return composeCallbacks(callbacks);
-}
-
-void AverageSparseOptimizer::catchUpWith(const VectorPtr vecs[],
-                                         const ParameterConfig& paraConfig,
-                                         size_t sparseId) const {
-  CHECK_LT(sparseId, t0Vec_.size());
-  int timediff = timer_ - t0Vec_[sparseId];
-  if (timediff > 0) {
-    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], timediff);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/AverageOptimizer.h b/paddle/parameter/AverageOptimizer.h
deleted file mode 100644
index 4ad3c18d56abf16d1274c5b3b8e0347b85e64dea..0000000000000000000000000000000000000000
--- a/paddle/parameter/AverageOptimizer.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "FirstOrderOptimizer.h"
-
-namespace paddle {
-
-// After Optimization, parameter values are further averaged within
-// time range.
-class AverageOptimizer : public ParameterOptimizer {
-public:
-  // if *useParameterApply* set, use PARAMETER_APPLY to store averaged parameter
-  // else use PARAMETER_VALUE, and value backup in PARAMETER_GRADIENT
-  AverageOptimizer(const OptimizationConfig& optConfig,
-                   ParameterOptimizer* optimizer,
-                   bool useParameterApply);
-
-  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
-                                    ParameterOptimizer* optimizer,
-                                    bool isParameterSparse = false,
-                                    bool useParameterApply = false);
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    optimizer_->init(numRows, config);
-  }
-
-  virtual void startPass() { optimizer_->startPass(); }
-  virtual void finishPass() {
-    optimizer_->finishPass();
-    updateAverageWindowLimit();
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed);
-  virtual void finishBatch();
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {
-    optimizer_->update(vecs, paraConfig, sparseId);
-    vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], 1.0f);
-  }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-
-  virtual TraverseCallback startCatchUpWith() const {
-    return optimizer_->startCatchUpWith();
-  }
-  virtual void finishCatchUpWith() { return optimizer_->finishCatchUpWith(); }
-
-  virtual TraverseCallback apply();
-  virtual TraverseCallback restore();
-
-  virtual void setNoDecay() { optimizer_->setNoDecay(); }
-
-protected:
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-  bool useApply_;
-
-  // should only be called from finishPass()
-  void updateAverageWindowLimit() {
-    if (!optConfig_.has_max_average_window()) {
-      // use the number of batches in the last pass as maxAverageWindow_
-      CHECK_GT(numUpdates_, prevNumUpdates_);
-      maxAverageWindow_ = numUpdates_ - prevNumUpdates_;
-      prevNumUpdates_ = numUpdates_;
-    }
-    minAverageWindow_ = std::min(minAverageWindow_, numUpdates_);
-  }
-
-  bool isAverageWindowTooLong() const {
-    return numAccumulates_ >= minAverageWindow_ &&
-           numAccumulates_ >=
-               std::min<int64_t>(maxAverageWindow_,
-                                 numUpdates_ * optConfig_.average_window());
-  }
-
-  static const int64_t kMaxNumAccumulates = 16384;
-  int64_t numUpdates_;
-  int64_t prevNumUpdates_;
-  int64_t numAccumulates_;
-  int64_t oldNumAccumulates_;
-  int64_t minAverageWindow_;
-  int64_t maxAverageWindow_;
-};
-
-// Average Optimizer with Sparse support.
-class AverageSparseOptimizer : public AverageOptimizer {
-public:
-  AverageSparseOptimizer(const OptimizationConfig& optConfig,
-                         ParameterOptimizer* optimizer,
-                         bool useParameterApply)
-      : AverageOptimizer(optConfig, optimizer, useParameterApply) {}
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    AverageOptimizer::init(numRows, config);
-
-    t0Vec_.resize(numRows);
-
-    timer_ = 0;
-    t0Vec_.assign(t0Vec_.size(), 0);
-  }
-  virtual void finishBatch() {
-    AverageOptimizer::finishBatch();
-    timer_++;
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const;
-  void catchUpWith(const VectorPtr vecs[],
-                   const ParameterConfig& paraConfig,
-                   size_t sparseId) const;
-  virtual TraverseCallback startCatchUpWith() const;
-  virtual void finishCatchUpWith() {
-    optimizer_->finishCatchUpWith();
-
-    timer_ = 0;
-    t0Vec_.assign(t0Vec_.size(), 0);
-  }
-
-protected:
-  /**
-   *  counting batches, clear after catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int timer_;
-  mutable std::vector<int32_t> t0Vec_;
-};
-
-}  // namespace paddle
diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt
deleted file mode 100644
index d2ae1c16c6b7316f1a6facdef4b933693d6ba818..0000000000000000000000000000000000000000
--- a/paddle/parameter/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-# The utilities for paddle
-
-file(GLOB PARAMETERS_HEADERS . *.h)
-file(GLOB PARAMETERS_SOURCES . *.cpp)
-
-add_library(paddle_parameter STATIC
-        ${PARAMETERS_SOURCES})
-add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES})
-add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS})
-add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp
deleted file mode 100644
index 5e280bcac3389179181d2eda58c08e579e867ecc..0000000000000000000000000000000000000000
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ /dev/null
@@ -1,330 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "FirstOrderOptimizer.h"
-#include "paddle/math/TrainingAlgorithmOp.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Util.h"
-
-#include <cmath>
-
-DEFINE_bool(log_clipping, false, "enable log clipping or not");
-
-namespace paddle {
-
-SparseMomentumParameterOptimizer::SparseMomentumParameterOptimizer(
-    const OptimizationConfig& optConfig)
-    : ParameterOptimizer(optConfig) {
-  addParameterType(PARAMETER_MOMENTUM);
-  addParameterType(PARAMETER_MOMENTUM_UT);
-  addParameterType(PARAMETER_MOMENTUM_VT);
-  alpha_ = 1;
-  beta_ = 1;
-  tau_ = -1;
-  threshold_ = 1e+06;
-}
-
-void SparseMomentumParameterOptimizer::init(size_t numRows,
-                                            const ParameterConfig* config) {
-  isParameterSparse_ = numRows != 0;
-  t0Vec_.resize(numRows);
-  t0Vec_.assign(t0Vec_.size(), 0);
-  timer_ = 0;
-  momentum_ = config->momentum();
-  decayRate_ = config->decay_rate();
-  gamma_ = config->learning_rate();
-}
-
-void SparseMomentumParameterOptimizer::startBatch(int64_t numSamplesProcessed) {
-  learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  if (isParameterSparse_) {
-    tau_ = tau_ + beta_ / alpha_;
-    alpha_ = alpha_ / momentum_;
-    beta_ = beta_ / (1 + decayRate_ * gamma_ * learningRate_);
-  }
-}
-
-void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[],
-                                              const ParameterConfig& paraConfig,
-                                              size_t sparseId) const {
-  if (sparseId != -1LU) {
-    CHECK_LT(sparseId, t0Vec_.size());
-    if (t0Vec_[sparseId] == 0) {
-      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
-      t0Vec_[sparseId] = 1;
-    }
-    vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
-                                     -alpha_ * gamma_ * learningRate_);
-    vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
-                                     tau_ * alpha_ * gamma_ * learningRate_);
-    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
-                               tau_ / beta_ + 1.0 / alpha_,
-                               *vecs[PARAMETER_MOMENTUM_VT],
-                               1.0 / beta_);
-
-  } else {
-    vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
-                                     *vecs[PARAMETER_MOMENTUM],
-                                     learningRate_ * paraConfig.learning_rate(),
-                                     paraConfig.momentum(),
-                                     applyDecay_ ? paraConfig.decay_rate() : 0);
-  }
-}
-
-ParameterOptimizer::TraverseCallback
-SparseMomentumParameterOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  if (alpha_ > threshold_ && isParameterSparse_) {
-    //  Restart to avoid large value multiplication
-    //  1. \alpha = 1, \beta = 1, \tau = 0
-    //  2. Note that \tau * u_t + v_t = \beta \theta_t, therefore:
-    //     u_t should be rescaled to u_t/alpha_
-    //     v_t should be reset to \theta_t
-    return [this](const VectorPtr vecs[],
-                  const ParameterConfig& config,
-                  size_t sparseId) {
-      vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_);
-      vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]);
-    };
-  } else {
-    return nullptr;
-  }
-}
-
-void SparseMomentumParameterOptimizer::finishBatch() {
-  timer_++;
-  if (!isParameterSparse_) return;
-  if (alpha_ > threshold_) {
-    alpha_ = 1;
-    beta_ = 1;
-    tau_ = -1;
-  }
-}
-
-void AdagradParameterOptimizer::update(const VectorPtr vecs[],
-                                       const ParameterConfig& config,
-                                       size_t sparseId) const {
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real epsilon = optConfig_.ada_epsilon();
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  adagradApply(value,
-               grad,
-               mom,
-               accum_buffer,
-               accum,
-               lr,
-               epsilon,
-               learningRate,
-               momentum,
-               decayRate);
-}
-
-ParameterOptimizer::TraverseCallback
-AdagradParameterOptimizer::needSpecialTraversal(
-    const ParameterConfig& config) const {
-  if (numUpdates_ % kMaxNumAccumulates == 0) {
-    // Move the sum to a different buffer to avoid loss of precision
-    // due to too many sums.
-    return [this](const VectorPtr vecs[],
-                  const ParameterConfig& config,
-                  size_t sparseId) {
-      vecs[PARAMETER_GRADIENT_SQURESUM]->add(
-          *vecs[PARAMETER_GRADIENT_SQURESUM1]);
-      vecs[PARAMETER_GRADIENT_SQURESUM1]->zeroMem();
-    };
-  } else {
-    return nullptr;
-  }
-}
-
-void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
-                                        const ParameterConfig& config,
-                                        size_t sparseId) const {
-  CHECK(sparseId == -1LU) << "Sparse update is not supported";
-
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  adadeltaApply(value,
-                grad,
-                mom,
-                accum,
-                accum_update,
-                lr,
-                rou_,
-                epsilon_,
-                learningRate,
-                momentum,
-                decayRate);
-}
-
-void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
-                                       const ParameterConfig& config,
-                                       size_t sparseId) const {
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real accumulatedRou = rou_;
-  bool firstTime = timer_ == 0;
-  if (sparseId != -1LU) {
-    CHECK_LT(sparseId, t0Vec_.size());
-    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
-    firstTime = t0Vec_[sparseId] == 0;
-    t0Vec_[sparseId] = timer_ + 1;
-  }
-
-  real epsilon = optConfig_.ada_epsilon();
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  rmspropApply(value,
-               grad,
-               mom,
-               sum,
-               sum1,
-               lr,
-               accumulatedRou,
-               rou_,
-               epsilon,
-               learningRate,
-               momentum,
-               decayRate,
-               firstTime);
-}
-
-void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
-                                              const ParameterConfig& config,
-                                              size_t sparseId) const {
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
-  BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
-
-  real accumulatedRou = rou_;
-  bool firstTime = timer_ == 0;
-  if (sparseId != -1LU) {
-    CHECK_LT(sparseId, t0Vec_.size());
-    accumulatedRou = std::pow(rou_, timer_ + 1 - t0Vec_[sparseId]);
-    firstTime = t0Vec_[sparseId] == 0;
-    t0Vec_[sparseId] = timer_ + 1;
-  }
-
-  real epsilon = optConfig_.ada_epsilon();
-  real learningRate = learningRate_ * config.learning_rate();
-  real momentum = config.momentum();
-  real decayRate = applyDecay_ ? config.decay_rate() : 0;
-
-  decayedAdagradApply(value,
-                      grad,
-                      mom,
-                      sum,
-                      lr,
-                      accumulatedRou,
-                      rou_,
-                      epsilon,
-                      learningRate,
-                      momentum,
-                      decayRate,
-                      firstTime);
-}
-
-void AdamParameterOptimizer::update(const VectorPtr vecs[],
-                                    const ParameterConfig& config,
-                                    size_t sparseId) const {
-  CHECK(sparseId == -1UL) << "Sparse update is not supported";
-
-  real beta1_power = std::pow(beta1_, step_);
-  real beta2_power = std::pow(beta2_, step_);
-  real learningRate = config.learning_rate() * learningRate_;
-
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM];
-
-  adamApply(value,
-            grad,
-            mom,
-            v,
-            beta1_,
-            beta2_,
-            beta1_power,
-            beta2_power,
-            epsilon_,
-            learningRate);
-}
-
-void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
-                                      const ParameterConfig& config,
-                                      size_t sparseId) const {
-  CHECK(sparseId == -1UL) << "Sparse update is not supported";
-  real learningRate = config.learning_rate() * learningRate_;
-
-  BaseMatrix& value = *vecs[PARAMETER_VALUE];
-  BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
-  BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
-  BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM];
-
-  adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate);
-}
-
-void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
-                                           const ParameterConfig& config,
-                                           size_t sparseId) const {
-  real globalThreshold = optConfig_.gradient_clipping_threshold();
-  real localThreshold = config.gradient_clipping_threshold();
-
-  // Use local gradient clipping threshold if it's enabled,
-  // otherwise using the global one.
-  real threshold = localThreshold > 0.0f ? localThreshold : globalThreshold;
-  std::string field = localThreshold > 0.0f ? "local" : "global";
-
-  real maxAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsMax();
-  if (maxAbsGrad > threshold) {
-    if (FLAGS_log_clipping) {
-      real avgAbsGrad = vecs[PARAMETER_GRADIENT]->getAbsSum() /
-                        vecs[PARAMETER_GRADIENT]->getSize();
-      LOG(INFO) << "parameter=" << config.name() << " need clipping by "
-                << field << " threshold=" << threshold
-                << ", max grad=" << maxAbsGrad << ", avg grad=" << avgAbsGrad;
-    }
-    vecs[PARAMETER_GRADIENT]->clip(-threshold, threshold);
-  }
-  optimizer_->update(vecs, config, sparseId);
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
deleted file mode 100644
index 047989fcad52afc1d4d4c347258d0fb2f069f3d4..0000000000000000000000000000000000000000
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ /dev/null
@@ -1,381 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterOptimizer.h"
-#include "ParameterUpdateFunctions.h"
-#include "Regularizer.h"
-
-namespace paddle {
-
-// Plain SGD optimization.
-class SgdOptimizer : public ParameterOptimizer {
-public:
-  explicit SgdOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {
-    (void)sparseId;
-    real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
-                                  ? 1.0 - paraConfig.momentum()
-                                  : 1.0;
-#ifdef PADDLE_WITH_MKLDNN
-    sgdUpdate(learningRate_ * paraConfig.learning_rate() *
-                  (firstTime_ ? 1.0 : torch_learningRate),
-              paraConfig.momentum(),
-              applyDecay_ ? paraConfig.decay_rate() : 0,
-              vecs[PARAMETER_VALUE].get(),
-              vecs[PARAMETER_GRADIENT].get(),
-              vecs[PARAMETER_MOMENTUM].get());
-#else
-    vecs[PARAMETER_VALUE]->sgdUpdate(
-        *vecs[PARAMETER_GRADIENT],
-        *vecs[PARAMETER_MOMENTUM],
-        learningRate_ * paraConfig.learning_rate() *
-            (firstTime_ ? 1.0 : torch_learningRate),
-        paraConfig.momentum(),
-        applyDecay_ ? paraConfig.decay_rate() : 0);
-#endif
-  }
-  virtual void finishBatch() { firstTime_ = false; }
-};
-
-// SGD optimization with sparse support.
-class SparseMomentumParameterOptimizer : public ParameterOptimizer {
-  /* sparse momentum optimizer
-
-    update scheme:
-
-    \alpha_t = \alpha_{t-1} / k
-    \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
-    u_t = u_{t-1} - \alpha_t \gamma_t g_t
-    v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
-    \tau_t = \tau_{t-1} + \beta_t / \alpha_t
-
-    where:
-    k: momentum
-    lambda: decay rate
-    \gamma_t: learning rate at the t'th step
-  */
-
-public:
-  explicit SparseMomentumParameterOptimizer(
-      const OptimizationConfig& optConfig);
-  virtual void init(size_t numRows, const ParameterConfig* config);
-  virtual void startBatch(int64_t numSamplesProcessed);
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const;
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-  virtual void finishBatch();
-
-private:
-  real alpha_;
-  real beta_;
-  real tau_;
-  real gamma_;
-  real threshold_;
-  real momentum_;
-  real decayRate_;
-
-protected:
-  int64_t timer_;
-  mutable std::vector<int64_t> t0Vec_;
-  bool isParameterSparse_;
-};
-
-/*
- * AdaGrad optimization.
- * http://www.magicbroom.info/Papers/DuchiHaSi10.pdf
- */
-class AdagradParameterOptimizer : public ParameterOptimizer {
-public:
-  explicit AdagradParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    numUpdates_ = 0;
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    (void)numSamplesProcessed;
-    ++numUpdates_;
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-
-protected:
-  int64_t numUpdates_;
-  static const int64_t kMaxNumAccumulates = 16384;
-};
-
-/*
- * AdaDelta Optimization.
- * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
- */
-class AdaDeltaParameterOptimizer : public ParameterOptimizer {
-public:
-  explicit AdaDeltaParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    rou_ = optConfig.ada_rou();
-    epsilon_ = optConfig.ada_epsilon();
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
-protected:
-  real rou_;
-  real epsilon_;
-};
-
-// RMSProp Parameter Optimization.
-class RMSPropParameterOptimizer : public ParameterOptimizer {
-public:
-  explicit RMSPropParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM1);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    rou_ = optConfig.ada_rou();
-    epsilon_ = optConfig.ada_epsilon();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    t0Vec_.resize(numRows);
-    t0Vec_.assign(t0Vec_.size(), 0);
-    timer_ = 0;
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void finishBatch() { timer_++; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
-protected:
-  real rou_;
-  real epsilon_;
-
-  /**
-   *  counting batches, donot need catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int64_t timer_;
-  mutable std::vector<int64_t> t0Vec_;
-};
-
-// Decayed AdaGrad Optimization.
-class DecayedAdagradParameterOptimizer : public ParameterOptimizer {
-public:
-  explicit DecayedAdagradParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_GRADIENT_SQURESUM);
-    addParameterType(PARAMETER_LEARNING_RATE);
-    rou_ = optConfig.ada_rou();
-    epsilon_ = optConfig.ada_epsilon();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    t0Vec_.resize(numRows);
-    t0Vec_.assign(t0Vec_.size(), 0);
-    timer_ = 0;
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void finishBatch() { timer_++; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
-protected:
-  real rou_;
-  real epsilon_;
-
-  /**
-   *  counting batches, donot need catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int64_t timer_;
-  mutable std::vector<int64_t> t0Vec_;
-};
-
-/**
- * Adam Optimizer.
- * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 1
- */
-class AdamParameterOptimizer : public ParameterOptimizer {
-public:
-  explicit AdamParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig),
-        beta1_(optConfig.adam_beta1()),
-        beta2_(optConfig.adam_beta2()),
-        epsilon_(optConfig.adam_epsilon()),
-        step_(1),
-        learningRate_(optConfig.learning_rate()) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_SECOND_MOMENTUM);
-  }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-
-  virtual void finishBatch() { ++step_; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
-protected:
-  real beta1_;
-  real beta2_;
-  real epsilon_;
-  int64_t step_;
-  real learningRate_;
-};
-
-/**
- * AdaMax Optimizer.
- * Reference Paper: http://arxiv.org/abs/1412.6980 Algorithm 2
- */
-class AdamaxParameterOptimizer : public ParameterOptimizer {
-public:
-  explicit AdamaxParameterOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig),
-        beta1_(optConfig.adam_beta1()),
-        beta2_(optConfig.adam_beta2()),
-        step_(1),
-        learningRate_(optConfig.learning_rate()) {
-    addParameterType(PARAMETER_MOMENTUM);
-    addParameterType(PARAMETER_WEIGHTED_INFINITY_NORM);
-  }
-
-  virtual void finishBatch() { ++step_; }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
-protected:
-  real beta1_;
-  real beta2_;
-  int64_t step_;
-  real learningRate_;
-};
-
-// Used in pserver,
-// when PARAMETER_DELTA stores in PARAMETER_GRADIENT.
-class AddOptimizer : public ParameterOptimizer {
-public:
-  explicit AddOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {}
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    // learningRate required by regularizer
-    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {
-    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_GRADIENT],
-                               optConfig_.delta_add_rate());
-  }
-};
-
-// A optimizer which does nothing.
-class DummyOptimizer : public ParameterOptimizer {
-public:
-  explicit DummyOptimizer(const OptimizationConfig& optConfig)
-      : ParameterOptimizer(optConfig) {}
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      size_t sparseId) const {}
-};
-
-// Do gradient clipping before sgd update
-class OptimizerWithGradientClipping : public ParameterOptimizer {
-public:
-  OptimizerWithGradientClipping(const OptimizationConfig& optConfig,
-                                ParameterOptimizer* optimizer)
-      : ParameterOptimizer(optConfig), optimizer_(optimizer) {
-    parameterTypes_ = optimizer_->getParameterTypes();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    optimizer_->init(numRows, config);
-  }
-
-  virtual void startPass() { optimizer_->startPass(); }
-  virtual void finishPass() { optimizer_->finishPass(); }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    optimizer_->startBatch(numSamplesProcessed);
-    learningRate_ = optimizer_->getLearningRate();
-  }
-  virtual void finishBatch() { optimizer_->finishBatch(); }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const {
-    return optimizer_->needSpecialTraversal(config);
-  }
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-
-  virtual void setNoDecay() { optimizer_->setNoDecay(); }
-
-protected:
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-};
-
-}  // namespace paddle
diff --git a/paddle/parameter/LearningRateScheduler.cpp b/paddle/parameter/LearningRateScheduler.cpp
deleted file mode 100644
index b6b58e3ddad6a0e8811bf56502c3f2f0c8728f5c..0000000000000000000000000000000000000000
--- a/paddle/parameter/LearningRateScheduler.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "LearningRateScheduler.h"
-#include "paddle/utils/StringUtil.h"
-
-namespace paddle {
-
-ClassRegistrar<LearningRateScheduler, OptimizationConfig>
-    LearningRateScheduler::registrar_;
-
-LearningRateScheduler* LearningRateScheduler::create(
-    const OptimizationConfig& config) {
-  return registrar_.createByType(config.learning_rate_schedule(), config);
-}
-
-// LRS stands for LearningRateScheduler
-
-class BaseLRS : public LearningRateScheduler {
-public:
-  explicit BaseLRS(const OptimizationConfig& config)
-      : learningRate_(config.learning_rate()),
-        a_(config.learning_rate_decay_a()),
-        b_(config.learning_rate_decay_b()) {}
-
-protected:
-  real learningRate_;
-  real a_;
-  real b_;
-};
-
-class ConstLRS : public BaseLRS {
-public:
-  explicit ConstLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return learningRate_;
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(constant, ConstLRS);
-
-class PolyLRS : public BaseLRS {
-public:
-  explicit PolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return learningRate_ * pow(1.0 + a_ * numSamplesProcessed, -b_);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(poly, PolyLRS);
-
-class CaffePolyLRS : public BaseLRS {
-public:
-  explicit CaffePolyLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    if (numSamplesProcessed > a_) {
-      LOG_FIRST_N(WARNING, 1)
-          << "Using caffe_poly learning rate schedule, "
-          << "learning rate hits ZERO when "
-          << "numSamplesProcessed > config.learning_rate_decay_b(), "
-          << "training is over and you can stop it. "
-          << "See common/LearningRateScheduler.cpp for more info.";
-      return 0;
-    } else {
-      return learningRate_ * pow(1.0 - numSamplesProcessed / a_, b_);
-    }
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(caffe_poly, CaffePolyLRS);
-
-class ExpLRS : public BaseLRS {
-public:
-  explicit ExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    double decayRatio = (double)numSamplesProcessed / b_;
-    return learningRate_ * pow(a_, decayRatio);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(exp, ExpLRS);
-
-class DiscreteExpLRS : public BaseLRS {
-public:
-  explicit DiscreteExpLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    int numDecays = floor(numSamplesProcessed / b_);
-    return learningRate_ * pow(a_, numDecays);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(discexp, DiscreteExpLRS);
-
-class LinearLRS : public BaseLRS {
-public:
-  explicit LinearLRS(const OptimizationConfig& config) : BaseLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return std::max(learningRate_ - a_ * numSamplesProcessed, b_);
-  }
-};
-REGISTER_LEARNING_RATE_SCHEDULER(linear, LinearLRS);
-
-/*
-  specify learning rate through
-  learning_rate_args = 'seg0:rate0,seg1:rate1,...,segK:rateK'
-  if seg_{i-1} <= numSamples <= seg_i,
-  then learning_rate = learning_rate_base * rate_i
-*/
-class ManualLRS : public BaseLRS {
-public:
-  explicit ManualLRS(const OptimizationConfig& config)
-      : BaseLRS(config), currentSegment_(0), lastNum_(0) {
-    std::vector<std::string> pieces;
-    str::split(config.learning_rate_args(), ',', &pieces);
-    rates_.reserve(pieces.size());
-    std::string s1, s2;
-
-    for (auto& piece : pieces) {
-      auto pos = piece.find(':');
-      CHECK(pos != std::string::npos) << "Wrong format for learning_rate_args: "
-                                      << config.learning_rate_args();
-      segments_.push_back(str::to<int64_t>(piece.substr(0, pos)));
-      rates_.push_back(str::to<real>(piece.substr(pos + 1)));
-    }
-  }
-
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return calc(numSamplesProcessed);
-  }
-
-  real calc(int64_t num) {
-    // We assume that num never decreases.
-    CHECK_LE(lastNum_, num);
-    lastNum_ = num;
-    while (currentSegment_ < rates_.size()) {
-      if (num <= segments_[currentSegment_]) {
-        return learningRate_ * rates_[currentSegment_];
-      }
-      ++currentSegment_;
-      if (currentSegment_ < rates_.size()) {
-        LOG(INFO) << " learning_rate changes to "
-                  << learningRate_ * rates_[currentSegment_];
-      }
-    }
-    return learningRate_ * rates_.back();
-  }
-
-protected:
-  std::vector<real> rates_;
-  std::vector<int64_t> segments_;
-  size_t currentSegment_;
-  int64_t lastNum_;
-};
-
-REGISTER_LEARNING_RATE_SCHEDULER(manual, ManualLRS);
-
-class PassManualLRS : public ManualLRS {
-public:
-  explicit PassManualLRS(const OptimizationConfig& config)
-      : ManualLRS(config) {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return calc(pass);
-  }
-};
-
-REGISTER_LEARNING_RATE_SCHEDULER(pass_manual, PassManualLRS);
-}  // namespace paddle
diff --git a/paddle/parameter/LearningRateScheduler.h b/paddle/parameter/LearningRateScheduler.h
deleted file mode 100644
index aea99a1c204b46e937135cbde22360a12d087ae2..0000000000000000000000000000000000000000
--- a/paddle/parameter/LearningRateScheduler.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "TrainerConfig.pb.h"
-#include "paddle/utils/ClassRegistrar.h"
-
-namespace paddle {
-// NOLINTNEXTLINES_4
-#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \
-  static InitFunction __reg_type_##__type_name([]() {               \
-    LearningRateScheduler::registrar_.registerClass<__class_name>(  \
-        #__type_name);                                              \
-  })
-
-class LearningRateScheduler {
-public:
-  static LearningRateScheduler* create(const OptimizationConfig& config);
-  virtual ~LearningRateScheduler() {}
-  virtual real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) = 0;
-
-  static ClassRegistrar<LearningRateScheduler, OptimizationConfig> registrar_;
-};
-
-}  // namespace paddle
diff --git a/paddle/parameter/OptimizerWithRegularizer.h b/paddle/parameter/OptimizerWithRegularizer.h
deleted file mode 100644
index 7219d96d924dfa26d3ab52b8c6a2ce1249e4f45c..0000000000000000000000000000000000000000
--- a/paddle/parameter/OptimizerWithRegularizer.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "FirstOrderOptimizer.h"
-
-namespace paddle {
-
-// add regularizer for objective function to do optimization
-class OptimizerWithRegularizer : public ParameterOptimizer {
-public:
-  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
-                                    const ParameterConfig& paraConfig,
-                                    bool isParameterSparse,
-                                    bool inPserver);
-
-  OptimizerWithRegularizer(const OptimizationConfig& optConfig,
-                           ParameterOptimizer* optimizer,
-                           Regularizer* regularizer)
-      : ParameterOptimizer(optConfig),
-        optimizer_(optimizer),
-        regularizer_(regularizer) {
-    parameterTypes_ = optimizer_->getParameterTypes();
-  }
-
-  virtual void init(size_t numRows, const ParameterConfig* config) {
-    optimizer_->init(numRows, config);
-  }
-
-  virtual void startPass() {
-    optimizer_->startPass();
-    timer_ = 0;
-  }
-
-  virtual void finishPass() { optimizer_->finishPass(); }
-
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    optimizer_->startBatch(numSamplesProcessed);
-  }
-
-  virtual void finishBatch() {
-    optimizer_->finishBatch();
-    ++timer_;
-  }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const {
-    return optimizer_->needSpecialTraversal(config);
-  }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const {
-    optimizer_->update(vecs, config, sparseId);
-    regularizer_->update(vecs, config, optimizer_->getLearningRate(), 0, 1);
-  }
-
-protected:
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-  Regularizer* regularizer_;
-
-  /**
-   *  counting batches, clear after catch up with
-   *  t(timer_) is current time,
-   *  t0(t0Vec_) are last occur time of i rows.
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  int timer_;
-};
-
-// Regularized Loss function for every num of batches
-class OptimizerWithRegularizerEveryNumBatches
-    : public OptimizerWithRegularizer {
-public:
-  OptimizerWithRegularizerEveryNumBatches(const OptimizationConfig& optConfig,
-                                          ParameterOptimizer* optimizer,
-                                          Regularizer* regularizer)
-      : OptimizerWithRegularizer(optConfig, optimizer, regularizer) {}
-
-  virtual void startPass() {
-    OptimizerWithRegularizer::startPass();
-    baseTimer_ = 0;
-  }
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const {
-    optimizer_->update(vecs, config, sparseId);
-  }
-
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const;
-  void doTraversal(const VectorPtr vecs[], const ParameterConfig& config) const;
-
-  void catchUpWith(const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) const;
-
-  virtual TraverseCallback startCatchUpWith() const;
-  virtual void finishCatchUpWith() { baseTimer_ = timer_; }
-
-protected:
-  bool isRegularizationBatch(const ParameterConfig& config) const {
-    return ((timer_ + 1) % config.num_batches_regularization() == 0);
-  }
-
-  /**
-   *  recored the timer_ value while catchUpWith called.
-   */
-  int baseTimer_;
-};
-
-// Regularized Loss function with Sparse support
-class OptimizerWithRegularizerSparse : public OptimizerWithRegularizer {
-public:
-  OptimizerWithRegularizerSparse(const OptimizationConfig& optConfig,
-                                 ParameterOptimizer* optimizer,
-                                 Regularizer* regularizer)
-      : OptimizerWithRegularizer(optConfig, optimizer, regularizer) {}
-
-  virtual void init(size_t numRows, const ParameterConfig* config);
-
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId) const;
-  void catchUpWith(const VectorPtr vecs[],
-                   const ParameterConfig& config,
-                   size_t sparseId) const;
-  virtual TraverseCallback startCatchUpWith() const;
-  virtual void finishCatchUpWith() {
-    timer_ = 0;
-    t0Vec_.assign(t0Vec_.size(), 0);
-  }
-
-protected:
-  /**
-   *  t0Vec_ are last occur time of i rows
-   *  if one block is update by multi threads,
-   *  caller should hash sparse ids to avoid write conflict in t0Vec_.
-   */
-  mutable std::vector<int32_t> t0Vec_;
-};
-
-}  // namespace paddle
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
deleted file mode 100644
index 0e6ea90f3d582e843c62bda000313eb71289d5b4..0000000000000000000000000000000000000000
--- a/paddle/parameter/Parameter.cpp
+++ /dev/null
@@ -1,425 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Parameter.h"
-#include <gflags/gflags.h>
-#include <fstream>
-#include "AverageOptimizer.h"
-#include "FirstOrderOptimizer.h"
-#include "OptimizerFunctions.h"
-#include "OptimizerWithRegularizer.h"
-#include "ParameterUpdateFunctions.h"
-#include "ThreadLocalBuffer.h"
-#include "hl_gpu.h"
-#include "paddle/math/CpuSparseMatrix.h"
-#include "paddle/math/MathUtils.h"
-#include "paddle/math/SparseRowMatrix.h"
-#include "paddle/utils/Logging.h"
-
-DEFINE_int32(enable_grad_share,
-             (100 * 1024 * 1024),
-             "threshold for enable gradient parameter share for batch "
-             "multi-cpu training");
-DEFINE_int32(
-    grad_share_block_num,
-    64,
-    "block number of gradient parameter share for batch multi-cpu training");
-
-namespace paddle {
-
-const std::string Parameter::kMissParameterFail = "fail";
-const std::string Parameter::kMissParameterRand = "rand";
-const std::string Parameter::kMissParameterZero = "zero";
-
-Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit)
-    : config_(config),
-      useGpu_(useGpu),
-      deviceId_(-1),
-      sharedCount_(0),
-      updateCounter_(0),
-      updated_(false),
-      headerFormat_(PARAM_FORMAT_ORIGINAL) {
-  setID(-1); /* capture uninitialized id */
-  if (useGpu_ && FLAGS_parallel_nn) {
-    /* gpu environment is specified by device property */
-    deviceId_ = config_.device();
-    if (deviceId_ < 0) {
-      useGpu_ = false;
-    }
-  }
-
-  if (doInit) {
-    initialize();
-  }
-
-  for (int i = 0; i < config.update_hooks_size(); ++i) {
-    this->updaterHooks_.push_back(IParameterUpdaterHook::create(config, i));
-  }
-}
-
-void Parameter::initialize() {
-  SetDevice device(deviceId_);
-
-  bufs_[PARAMETER_VALUE] =
-      Vector::createParallelVector(config_.size(), useGpu_);
-  bufs_[PARAMETER_VALUE]->zeroMem();
-
-  if (config_.is_sparse()) {
-    enableSparseParameter();
-  }
-
-  if (!isStatic()) {
-    bufs_[PARAMETER_GRADIENT] =
-        Vector::createParallelVector(config_.size(), useGpu_);
-    bufs_[PARAMETER_MOMENTUM] =
-        Vector::createParallelVector(config_.size(), useGpu_);
-
-    bufs_[PARAMETER_GRADIENT]->zeroMem();
-    bufs_[PARAMETER_MOMENTUM]->zeroMem();
-  }
-}
-
-void Parameter::randomize(const VectorPtr& value,
-                          const ParameterConfig& config) {
-  if (PARAMETER_INIT_UNIFORM == config.initial_strategy()) {
-    // initialize the parameter as uniform distribution
-    real initial_min = config.initial_mean() - config.initial_std();
-    real initial_max = config.initial_mean() + config.initial_std();
-    value->uniform(initial_min, initial_max);
-    VLOG(1) << config.name() << ": initial_min=" << initial_min
-            << ", initial_max=" << initial_max;
-  } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) {
-    /* Initialize the parameters randomly */
-    value->randnorm(config.initial_mean(), config.initial_std());
-    VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean()
-            << ", initial_std=" << config.initial_std();
-  } else {
-    LOG(FATAL) << "not supported initial_strategy: "
-               << config.initial_strategy();
-  }
-}
-
-void Parameter::randomize() {
-  if (!bufs_[PARAMETER_VALUE]) return;
-  SetDevice device(deviceId_);
-  Parameter::randomize(bufs_[PARAMETER_VALUE], config_);
-
-  if (config_.is_sparse()) {
-    if (format_ == SPARSE_CSC) {
-      sparseRand(intBufs_[PARAMETER_COLS]->getData(),
-                 intBufs_[PARAMETER_ROWS]->getData(),
-                 config_.size(),
-                 config_.dims(1) + 1,
-                 config_.dims(0),
-                 useGpu_);
-    } else {
-      sparseRand(intBufs_[PARAMETER_ROWS]->getData(),
-                 intBufs_[PARAMETER_COLS]->getData(),
-                 config_.size(),
-                 config_.dims(0) + 1,
-                 config_.dims(1),
-                 useGpu_);
-    }
-  }
-  setValueUpdated();
-}
-
-void Parameter::zeroMem() {
-  if (!bufs_[PARAMETER_VALUE]) return;
-  bufs_[PARAMETER_VALUE]->zeroMem();
-  setValueUpdated();
-  LOG(INFO) << getName() << " set to 0";
-}
-
-bool Parameter::isGradShared(size_t* blockNum) {
-  if (!useGpu_ && !isStatic() && FLAGS_enable_grad_share > 0 &&
-      !isGradSparseUpdate() &&
-      this->getSize() > (size_t)FLAGS_enable_grad_share) {
-    if (blockNum) {
-      *blockNum = (size_t)FLAGS_grad_share_block_num;
-    }
-    return true;
-  }
-  return false;
-}
-
-bool Parameter::isValueShared() {
-  return !useGpu_ && config_.is_shared() && FLAGS_trainer_count > 1;
-}
-
-bool Parameter::isGradSparseUpdate() const {
-  return !useGpu_ && !isStatic() &&
-         (config_.sparse_update() || config_.sparse_remote_update());
-}
-
-void Parameter::setMat(ParameterType pType, int matType) {
-  CHECK(!mats_[pType]);
-
-  if (config_.dims_size() == 0 && matType == MAT_NORMAL) {
-    return;
-  }
-
-  CHECK_EQ((size_t)config_.dims_size(), 2LU);
-  size_t height = config_.dims(0);
-  size_t width = config_.dims(1);
-  if (matType == MAT_NORMAL) {
-    if (!config_.is_sparse()) {
-      CHECK_EQ(height * width, bufs_[pType]->getSize());
-      mats_[pType] =
-          Matrix::create(bufs_[pType]->getMemoryHandle(), height, width);
-    } else {
-      size_t size = bufs_[pType]->getSize();
-      CHECK_GE(height * width, size);
-      if (format_ == SPARSE_CSR) {
-        CHECK_EQ(height + 1, intBufs_[PARAMETER_ROWS]->getSize());
-        CHECK_EQ(size, intBufs_[PARAMETER_COLS]->getSize());
-      } else {
-        CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize());
-        CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize());
-      }
-      mats_[pType] =
-          Matrix::createSparseMatrix(bufs_[pType]->getData(),
-                                     intBufs_[PARAMETER_ROWS]->getData(),
-                                     intBufs_[PARAMETER_COLS]->getData(),
-                                     height,
-                                     width,
-                                     bufs_[pType]->getSize(),
-                                     FLOAT_VALUE,
-                                     format_,
-                                     false,
-                                     useGpu_);
-    }
-  }
-#ifndef PADDLE_MOBILE_INFERENCE
-  // NOLINTNEXTLINE
-  else if (matType == MAT_NORMAL_SHARED) {
-    CHECK_EQ(height * width, bufs_[pType]->getSize());
-    size_t blockNum = 0;
-    CHECK(isGradShared(&blockNum));
-    mats_[pType] = std::make_shared<SharedCpuMatrix>(
-        blockNum,
-        std::dynamic_pointer_cast<CpuMemoryHandle>(
-            bufs_[pType]->getMemoryHandle()),
-        height,
-        width);
-  } else if (matType == MAT_VALUE_SHARED) {
-    CHECK_EQ(height * width, bufs_[pType]->getSize());
-    mats_[pType] = std::make_shared<SharedCpuMatrix>(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(
-            bufs_[pType]->getMemoryHandle()),
-        height,
-        width);
-  } else if (matType == MAT_SPARSE_ROW_IDS) {
-    CHECK_EQ(height * width, bufs_[pType]->getSize());
-    mats_[pType] = std::make_shared<SparseRowIdsCpuMatrix>(
-        std::dynamic_pointer_cast<CpuMemoryHandle>(
-            bufs_[pType]->getMemoryHandle()),
-        height,
-        width);
-  } else if (matType == MAT_SPARSE_ROW) {
-    auto valueMat =
-        std::dynamic_pointer_cast<SparseRowCpuMatrix>(mats_[PARAMETER_VALUE]);
-    SparseRowCpuMatrix::IndexDictPtr indexDict(nullptr);
-    if (pType != PARAMETER_VALUE) {
-      CHECK(valueMat) << "The matrix for PARAMETER_VALUE must be set "
-                      << " and its type must be MAT_SPARSE_ROW,"
-                      << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW";
-      indexDict = valueMat->getIndexDictHandle();
-    }
-    auto mat =
-        std::make_shared<SparseRowCpuMatrix>(nullptr,
-                                             height,
-                                             width,
-                                             // grad share index with value
-                                             indexDict);
-    mats_[pType] = mat;
-  } else if (matType == MAT_CACHE_ROW) {
-    CHECK(isGradSparseUpdate());
-    auto mat = std::make_shared<CacheRowCpuMatrix>(height, width);
-    mats_[pType] = mat;
-  } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE ||
-             matType == MAT_SPARSE_ROW_PREFETCH) {
-    auto mat = std::make_shared<SparsePrefetchRowCpuMatrix>(
-        bufs_[pType] ? std::dynamic_pointer_cast<CpuMemoryHandle>(
-                           bufs_[pType]->getMemoryHandle())
-                     : nullptr,
-        height,
-        width,
-        nullptr,  // indexDictHandle
-        getGlobalSyncThreadPool());
-    mats_[pType] = mat;
-  } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
-    CHECK(isGradSparseUpdate());
-    mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
-  }
-#endif
-  // NOLINTNEXTLINE
-  else {
-    LOG(FATAL) << "Unsupported mat type" << matType;
-  }
-}
-
-void Parameter::incUpdate(const UpdateCallback& callback) {
-  // Static parameter is fixed, and does not need to be updated
-  if (isStatic()) {
-    return;
-  }
-
-  ++updateCounter_;
-  if (isUpdatable()) {
-    if (callback) callback(this);
-    clearUpdate();
-  }
-}
-
-bool Parameter::save(const std::string& filename) const {
-  std::ofstream fs(filename, std::ios_base::binary);
-  CHECK(fs) << "Fail to open " << filename;
-  return save(fs);
-}
-
-bool Parameter::save(std::ostream& s) const {
-  CpuVector vec(*bufs_[PARAMETER_VALUE].get());
-  Header header;
-  header.format = headerFormat_;
-  header.valueSize = sizeof(real);
-  header.size = getSize();
-
-  CHECK_EQ(header.size, vec.getSize());
-
-  CHECK(s.write(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to write parameter " << getName();
-
-  CHECK(s.write(reinterpret_cast<char*>(vec.getData()),
-                header.size * sizeof(real)))
-      << "Fail to write parameter " << getName();
-  if (config_.is_sparse()) {
-    CpuIVector rows(*intBufs_[PARAMETER_ROWS].get());
-    CpuIVector cols(*intBufs_[PARAMETER_COLS].get());
-    CHECK(s.write(reinterpret_cast<char*>(rows.getData()),
-                  rows.getSize() * sizeof(int)))
-        << "Fail to write parameter " << getName();
-    CHECK(s.write(reinterpret_cast<char*>(cols.getData()),
-                  cols.getSize() * sizeof(int)))
-        << "Fail to write parameter " << getName();
-  }
-
-  return true;
-}
-
-/**
- * Load parameter value from a file
- */
-bool Parameter::load(const std::string& filename) {
-  std::ifstream fs(filename, std::ios_base::binary);
-  if (!fs) {
-    LOG(INFO) << "missing parameters [" << filename << "] while loading model.";
-    if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) {
-      LOG(FATAL) << getName() << " missing, not allowed.";
-      return false;
-    }
-    if (kMissParameterRand == FLAGS_load_missing_parameter_strategy) {
-      LOG(INFO) << getName() << " missing, set to random.";
-      randomize();
-      return true;
-    }
-    if (kMissParameterZero == FLAGS_load_missing_parameter_strategy) {
-      LOG(INFO) << getName() << " missing, set to zero.";
-      zeroMem();
-      return true;
-    }
-    LOG(FATAL) << "unsupported load_missing_parameter_strategy: "
-               << FLAGS_load_missing_parameter_strategy;
-    return false;
-  }
-  return load(fs);
-}
-
-bool Parameter::load(std::istream& s) {
-  CpuVector vec(*bufs_[PARAMETER_VALUE].get());
-  Header header;
-  CHECK(s.read(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to read parameter " << getName();
-  CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: "
-                                                << header.format;
-  headerFormat_ = header.format;
-  CHECK_EQ(header.size, getSize())
-      << "The size (" << header.size << ") in the file does not match the size "
-      << "(" << getSize() << ") of the parameter: " << getName();
-  CHECK_EQ(header.valueSize, sizeof(real))
-      << "Unsupported valueSize " << header.valueSize << " at: " << getName();
-  CHECK(s.read(reinterpret_cast<char*>(vec.getData()),
-               header.size * sizeof(real)));
-
-  auto& tmp = *bufs_[PARAMETER_VALUE].get();
-  if (typeid(tmp) == typeid(GpuVector)) {
-    bufs_[PARAMETER_VALUE]->copyFrom(vec);
-  }
-
-  if (config_.is_sparse() && config_.need_compact()) {
-    // load from dense parameter with many zero
-    CHECK_EQ(config_.dims_size(), 2);
-    auto height = config_.dims(0);
-    auto width = config_.dims(1);
-    auto mat = Matrix::create(vec.getData(), height, width);
-    CpuSparseMatrix sparseMat(height,
-                              width,
-                              0,
-                              FLOAT_VALUE,
-                              format_,
-                              /*trans*/ false);
-    sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT);
-    auto nnz = sparseMat.getElementCnt();
-    size_t rowSize = (format_ == SPARSE_CSR) ? height + 1 : nnz;
-    size_t colSize = (format_ == SPARSE_CSR) ? nnz : width + 1;
-
-    intBufs_[PARAMETER_ROWS]->copyFrom(sparseMat.getRows(), rowSize);
-    intBufs_[PARAMETER_COLS]->copyFrom(sparseMat.getCols(), colSize);
-    bufs_[PARAMETER_VALUE]->resize(nnz);  // for setMat check
-    bufs_[PARAMETER_VALUE]->copyFrom(sparseMat.getValue(), nnz);
-    config_.set_size(nnz);
-    LOG(INFO) << "compact nnz=" << (1. * nnz / (height * width))
-              << " name=" << config_.name();
-  } else if (config_.is_sparse()) {
-    CpuIVector rows(*intBufs_[PARAMETER_ROWS].get());
-    CpuIVector cols(*intBufs_[PARAMETER_COLS].get());
-    size_t rowSize, colSize;
-    CHECK_EQ(config_.dims_size(), 2);
-    if (format_ == SPARSE_CSR) {
-      rowSize = config_.dims(0) + 1;
-      colSize = config_.size();
-    } else {
-      rowSize = config_.size();
-      colSize = config_.dims(1) + 1;
-    }
-    CHECK(
-        s.read(reinterpret_cast<char*>(rows.getData()), rowSize * sizeof(int)));
-    CHECK(
-        s.read(reinterpret_cast<char*>(cols.getData()), colSize * sizeof(int)));
-    auto& paramRows = *intBufs_[PARAMETER_ROWS].get();
-    if (typeid(paramRows) == typeid(GpuIVector)) {
-      intBufs_[PARAMETER_ROWS]->copyFrom(rows);
-    }
-    auto& paramCols = *intBufs_[PARAMETER_COLS].get();
-    if (typeid(paramCols) == typeid(GpuIVector)) {
-      intBufs_[PARAMETER_COLS]->copyFrom(cols);
-    }
-  }
-
-  setValueUpdated();
-
-  return true;
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
deleted file mode 100644
index 24ac10f3fe5977553332a9a8402d6795577b5ad8..0000000000000000000000000000000000000000
--- a/paddle/parameter/Parameter.h
+++ /dev/null
@@ -1,380 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "ParameterConfig.pb.h"
-#include "TrainerConfig.pb.h"
-
-#include "ParameterUpdaterHook.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-typedef enum {
-  /// The paddle original basic format
-  PARAM_FORMAT_ORIGINAL = 0,
-
-  /// See mkldnn_memory_format_t in
-  /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h
-  /// for a detailed description.
-  /// 2D weights tensor in the format (output channels, input channels).
-  PARAM_FORMAT_MKLDNN_OI,
-
-  /// The total format items numbers
-  PARAM_FORMAT_ITEMS,
-} PARAM_FORMAT;
-
-class SparsePrefetchRowCpuMatrix;
-
-class Parameter;
-typedef std::function<void(Parameter* param)> UpdateCallback;
-typedef std::function<void(int paramId, Parameter* param)> ParamInitCallback;
-
-class Parameter;
-typedef std::shared_ptr<Parameter> ParameterPtr;
-
-class Parameter {
-public:
-  Parameter(const ParameterConfig& config, bool useGpu, bool doInit = true);
-  const std::string& getName() const { return config_.name(); }
-
-  size_t getSize() const { return config_.size(); }
-
-  bool isFullSize() const {
-    if (bufs_[PARAMETER_VALUE]) {
-      return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
-    }
-    return false;
-  }
-
-  inline bool useGpu() const { return useGpu_; }
-
-  int getDeviceId() const { return deviceId_; }
-
-  void setDevice(int deviceId) { deviceId_ = deviceId; }
-
-  /// The id ranges from 0 to the_total_number_of_parameters - 1
-  size_t getID() const { return config_.para_id(); }
-
-  /// ID is a implict value created until neural network is built.
-  void setID(size_t id) { config_.set_para_id(id); }
-
-  bool isStatic() const { return config_.is_static(); }
-
-  enum MatType {
-    MAT_NORMAL,
-    /// both value and grad are shared
-    MAT_NORMAL_SHARED,
-
-    /// Now used in BatchNorm in CPU mode
-    MAT_VALUE_SHARED,
-
-    /// sparse matrix, which has full size parameter
-    MAT_SPARSE_ROW_IDS,
-    /// sparse matrix, parameter size scale by sparse rates.
-    MAT_SPARSE_ROW_AUTO_GROW,
-    MAT_CACHE_ROW,
-    MAT_SPARSE_ROW,
-
-    /// sparse matrix for prefetching parameter from pserver
-    MAT_SPARSE_ROW_PREFETCH,
-    /// same as above, but parameter has full size for saving parameter in local
-    MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
-  };
-
-  void enableSparseParameter() {
-    if (config_.is_sparse()) {
-      if (config_.format() == "csr") {
-        size_t height = config_.dims(0);
-        size_t nnz = config_.size();
-        enableIntType(PARAMETER_ROWS, height + 1);
-        enableIntType(PARAMETER_COLS, nnz);
-        format_ = SPARSE_CSR;
-      } else {
-        size_t width = config_.dims(1);
-        size_t nnz = config_.size();
-        enableIntType(PARAMETER_COLS, width + 1);
-        enableIntType(PARAMETER_ROWS, nnz);
-        format_ = SPARSE_CSC;
-      }
-    }
-  }
-
-  /// allocate buffer for the give type
-  void enableType(ParameterType type, MatType matType = MAT_NORMAL) {
-    if (bufs_[type] || mats_[type]) {
-      return;
-    }
-    SetDevice device(deviceId_);
-    if (config_.dims_size() == 2) {
-      if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED ||
-          matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE ||
-          matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) {
-        bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
-        bufs_[type]->zeroMem();
-      } else {
-        CHECK(isGradSparseUpdate());
-      }
-      if (config_.is_sparse() && type == PARAMETER_VALUE) {
-        enableSparseParameter();
-      }
-      setMat(type, matType);
-    } else {
-      bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
-      bufs_[type]->zeroMem();
-    }
-  }
-
-  void enableBufType(ParameterType type) {
-    if (bufs_[type]) return;
-    bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
-    bufs_[type]->zeroMem();
-  }
-
-  void enableIntType(ParameterType type, size_t intStoreSize = 0) {
-    if (!intBufs_[type]) {
-      SetDevice device(deviceId_);
-      size_t size = intStoreSize ? intStoreSize : config_.size();
-      intBufs_[type] = IVector::create(size, useGpu_);
-      intBufs_[type]->zeroMem();
-    }
-  }
-
-  void enableSharedType(ParameterType type,
-                        VectorPtr vec,
-                        MatrixPtr mat = nullptr) {
-    if (!bufs_[type] && !mats_[type]) {
-      bufs_[type] = vec;
-      mats_[type] = mat;
-    }
-  }
-
-  /// for batchGradientMachine: blockNum is number of partitions of the matrix.
-  bool isGradShared(size_t* blockNum = NULL);
-
-  bool isValueShared();
-
-  // for AsgdSparseGradientMachine & SgdSparseGradientMachine:
-  // and MultiGradientMachine
-  bool isGradSparseUpdate() const;
-
-  bool isSparseRemoteUpdate() const {
-    return config_.sparse_remote_update() && !useGpu();
-  }
-
-  const ParameterConfig& getConfig() const { return config_; }
-
-  ParameterConfig& getConfig() { return config_; }
-
-  bool hasType(ParameterType pType) const {
-    return bufs_[pType] || mats_[pType];
-  }
-
-  const VectorPtr& getBuf(ParameterType pType) const {
-    return this->bufs_[pType];
-  }
-
-  const VectorPtr* getBufs() const { return bufs_; }
-
-  const MatrixPtr& getMat(ParameterType pType) const { return mats_[pType]; }
-
-  void setValueUpdated() { updated_ = true; }
-
-  void clearValueUpdated() { updated_ = false; }
-
-  bool isValueUpdated() const { return updated_; }
-
-  /**
-   * Save parameter value to a file
-   */
-  bool save(const std::string& filename) const;
-
-  /**
-   * Save parameter to ostream
-   */
-  bool save(std::ostream& s) const;
-
-  /**
-   * Load parameter value from a file
-   */
-  bool load(const std::string& filename);
-
-  /**
-   * Load parameter from istream
-   */
-  bool load(std::istream& is);
-
-  void incShared() { sharedCount_++; }
-
-  /**
-   * After one of the parameter's gradient is merged
-   * You should call this function to do some additional processing,
-   */
-  void incUpdate(const UpdateCallback& callbacks = NULL);
-
-  void clearGradient() {
-    auto& mat = getMat(PARAMETER_GRADIENT);
-    if (mat) {
-      // zeroMem will also clear rows for SparseRowCpuMatrix
-      mat->zeroMem();
-    } else {
-      auto& gradBuf = getBuf(PARAMETER_GRADIENT);
-      if (gradBuf) gradBuf->zeroMem();
-    }
-  }
-
-  void initialize();
-
-  /**
-   * Initialize the value according to config_: initial_mean,
-   * initial_std and initial_strategy.
-   */
-  void randomize();
-  static void randomize(const VectorPtr& value, const ParameterConfig& config);
-
-  /// Initialize the value to 0
-  void zeroMem();
-
-  /// file header structure
-  struct Header {
-    int32_t format;      // = PARAM_FORMAT
-    uint32_t valueSize;  // = sizeof(real)
-    uint64_t size;       // = getSize()
-  };
-
-  /**
-   * @brief Is the header format supported.
-   */
-  static bool isHeaderFormatSupported(int32_t fmt) {
-    return fmt < PARAM_FORMAT_ITEMS;
-  }
-
-  /**
-   * @brief Get the format in header.
-   */
-  int getHeaderFormat() { return headerFormat_; }
-
-  /**
-   * @brief Set the format in header.
-   */
-  void setHeaderFormat(int32_t fmt) {
-    CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
-                                        << fmt;
-    headerFormat_ = fmt;
-  }
-
-  /**
-   * @brief  Parameter Update Hook.
-   *
-   * The parameter's update hook before ParameterUpdater::updateImpl
-   * It could modify gradient/momentum/etc here. Such as drop some gradient,
-   * etc.
-   */
-  void updateHook() {
-    for (auto& hook : updaterHooks_) {
-      hook->update(this);
-    }
-  }
-
-  /**
-   * @brief  Initialize all updater hook.
-   *
-   * This method should be invoked in ParameterUpdater::init() only.
-   */
-  void initHook() {
-    for (auto& hook : updaterHooks_) {
-      hook->init(this);
-    }
-  }
-
-protected:
-  /**
-   * @brief create matrix to matType.
-   *
-   * used by gradient machine which needs specify matrix type,
-   * instead of creating in weights.cpp.
-   *
-   * @note  pType should be enabled already.
-   */
-  void setMat(ParameterType pType, int matType);
-
-  bool isUpdatable() { return (updateCounter_ == sharedCount_); }
-
-  void clearUpdate() { updateCounter_ = 0; }
-
-protected:
-  ParameterConfig config_;
-
-  bool useGpu_;
-
-  int deviceId_;
-
-  /**
-   * @brief bufs_ stores parameter value and gradient.
-   *
-   * Layer should use bufs_[PARAMETER_VALUE] to form weight matrix for
-   * calculation and stores gradient to bufs_[PARAMETER_GRADIENT].
-   */
-  VectorPtr bufs_[NUM_PARAMETER_TYPES];
-
-  /**
-   * @brief Weight matrix for bufs_.
-   *
-   * It's helpfull when parameter shared by multi-layers.
-   * Caller should check, if mats exist, do not create it again.
-   */
-  MatrixPtr mats_[NUM_PARAMETER_TYPES];
-
-  /// Int vectors, used in some User defined parameter types
-  IVectorPtr intBufs_[NUM_PARAMETER_TYPES];
-
-  int sharedCount_;
-  int updateCounter_;
-
-  bool updated_;
-  SparseFormat format_;
-
-  /// The header format for saving or loading param
-  int32_t headerFormat_;
-
-  std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
-
-public:
-  void setSharedCount(int cnt) { sharedCount_ = cnt; }
-  int getSharedCount() { return sharedCount_; }
-
-  bool isSparse() { return config_.is_sparse(); }
-  SparseFormat getFormat() { return format_; }
-
-  static const std::string kMissParameterFail;
-  static const std::string kMissParameterRand;
-  static const std::string kMissParameterZero;
-};
-
-typedef std::map<std::string, ParameterPtr> ParameterMap;
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParameterOptimizer.cpp b/paddle/parameter/ParameterOptimizer.cpp
deleted file mode 100644
index 638daa58f1e5f3f416d7f90ad2662a523eaf6741..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParameterOptimizer.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Logging.h"
-
-#include <fstream>
-
-#include "AverageOptimizer.h"
-#include "FirstOrderOptimizer.h"
-#include "OptimizerFunctions.h"
-#include "OptimizerWithRegularizer.h"
-#include "ParameterOptimizer.h"
-#include "hl_gpu.h"
-
-namespace paddle {
-
-ParameterOptimizer* ParameterOptimizer::create(
-    const OptimizationConfig& optConfig, bool inPserver) {
-  if (inPserver && optConfig.num_batches_per_send_parameter() > 1) {
-    return new AddOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "momentum") {
-    return new SgdOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "torch_momentum") {
-    return new SgdOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adagrad") {
-    return new AdagradParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adadelta") {
-    return new AdaDeltaParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "rmsprop") {
-    return new RMSPropParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "decayed_adagrad") {
-    return new DecayedAdagradParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adam") {
-    return new AdamParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "adamax") {
-    return new AdamaxParameterOptimizer(optConfig);
-  }
-  if (optConfig.learning_method() == "sparse_momentum") {
-    return new SparseMomentumParameterOptimizer(optConfig);
-  }
-  return nullptr;
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/parameter/ParameterOptimizer.h
deleted file mode 100644
index a8d0ca72f21d04e0e65a9dd6a07e8f53b23e4223..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParameterOptimizer.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "LearningRateScheduler.h"
-#include "Parameter.h"
-
-namespace paddle {
-
-/**
- * Some member functions are set to const for two reasons:
- *
- * 1. For sparse update thread safe: update(), traverse callback(const this)
- *    may be called many times, each time one row, and these function
- *    can be called parallelly by multi worker, to speed up large block.
- *
- * 2. For predicate functions, needSpecialTraversal(), startCatchUpWith()
- *    may be called many times, should be no state change between calls.
- */
-class ParameterOptimizer {
-public:
-  typedef std::function<void(
-      const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId)>
-      TraverseCallback;
-
-public:
-  explicit ParameterOptimizer(const OptimizationConfig& optConfig)
-      : applyDecay_(true),
-        optConfig_(optConfig),
-        parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT},
-        learningRate_(optConfig.learning_rate()),
-        learningRateScheduler_(LearningRateScheduler::create(optConfig)),
-        pass_(0),
-        firstTime_(true) {}
-
-  real calcLearningRate(int64_t numSamplesProcessed, int64_t pass) {
-    return learningRateScheduler_->calcLearningRate(numSamplesProcessed, pass);
-  }
-
-  virtual ~ParameterOptimizer() {}
-
-  /**
-   * For sparse update, optimizer can maintain numRows of timer(t0).
-   * Some sparse optimizer depends on parameter config in functions
-   * such as startBatch(). Optimizer can get it here. But notice that,
-   * not all callers can pass config here, so the optimizer should check
-   * config passed in is not null ptr.
-   */
-  virtual void init(size_t numRows, const ParameterConfig* config) {}
-
-  virtual void startPass() {}
-  virtual void finishPass() { ++pass_; }
-
-  /// called by Trainer before forward() of a batch.
-  virtual void startBatch(int64_t numSamplesProcessed) {
-    (void)numSamplesProcessed;
-  }
-
-  /**
-   * following hooks useful for sparse update,
-   * because the traversal in block costs.
-   * called by Trainer after update and before finishBatch
-   * e.g. Trainer call like this:
-   *
-   * @code
-   * startBatch();
-   * if (dense) {
-   *   update(blockVec);
-   * } else {//sparse
-   *   for (row : rows_in_block) {update(rowVec)}
-   * }
-   * auto callback = needSpecialTraversal();
-   * if (callback) {
-   *   // do traverse, maybe multi-thread
-   *   if (dense) {
-   *     callback();
-   *   } else {//sparse
-   *     for (row : all_rows_in_block) {callback();}
-   *   }
-   * }
-   * finishBatch();
-   * @endcode
-   *
-   * @return callback if need traverse,
-   *         else return nullptr.
-   *         It should be no state change.
-   */
-  virtual TraverseCallback needSpecialTraversal(
-      const ParameterConfig& config) const {
-    return nullptr;
-  }
-
-  /// called by Trainer after backward() of a batch
-  virtual void finishBatch() {}
-
-  /**
-   * between startBatch() and finishBatch(), update() will be called
-   * by the trainer multiple times, each time for updating one Parameter
-   * with its gradient in PARAMETER_GRADIENT. sparseId is row id,
-   * when sparseId set, update is sparse, each time one row.
-   */
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& config,
-                      size_t sparseId = -1LU) const = 0;
-
-  /**
-   * following hooks catch up with current time for sparse update,
-   * In the beginning, call startCatchUpWith() and check return.
-   * In the end, call finishCatchUpWith() to finish state.
-   * callback do the actual works, can call many times for sparse data.
-   * e.g. Trainer call like this:
-   *
-   * @code
-   * auto callback = startCatchUpWith();
-   * if (callback) {
-   *   // do catch up with, maybe multi-thread
-   *   if (dense) {
-   *     callback();
-   *   } else {//sparse
-   *     for (row : rows_in_block) {callback();}
-   *   }
-   *   // finish catch up with, main thread
-   *   finishCatchUpWith();
-   * }
-   * @endcode
-   *
-   * @return callback if need catch up with,
-   *         else return nullptr.
-   *         It should be no state change.
-   */
-  virtual TraverseCallback startCatchUpWith() const { return nullptr; }
-  virtual void finishCatchUpWith() {}
-
-  /**
-   * following two hooks used by averager,
-   * apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY).
-   *
-   * restore() will restore orginal value if it apply to PARAMETER_VALUE.
-   * Caller must ensure it's catched up with current time before apply.
-   *
-   * Use returned callback same way as callback returned by
-   * ParameterOptimizer::needSpecialTraversal()
-   */
-  virtual TraverseCallback apply() { return nullptr; }
-  virtual TraverseCallback restore() { return nullptr; }
-
-  /// return the parameter types used by this updater
-  const std::vector<ParameterType>& getParameterTypes() const {
-    return parameterTypes_;
-  }
-
-  void addParameterType(ParameterType type) {
-    for (auto t : parameterTypes_) {
-      if (t == type) return;
-    }
-    parameterTypes_.push_back(type);
-  }
-
-  real getLearningRate() const { return learningRate_; }
-
-  virtual void setNoDecay() { applyDecay_ = false; }
-
-  static ParameterOptimizer* create(const OptimizationConfig& optConfig,
-                                    bool inPserver = false);
-
-protected:
-  typedef std::vector<ParameterOptimizer::TraverseCallback> TraverseCallbackVec;
-
-  static TraverseCallback composeCallbacks(
-      const TraverseCallbackVec& callbacks) {
-    if (callbacks.size() > 1LU) {
-      return [callbacks](const VectorPtr vecs[],
-                         const ParameterConfig& config,
-                         size_t sparseId) {
-        for (auto callback : callbacks) {
-          callback(vecs, config, sparseId);
-        }
-      };
-    }
-    return (callbacks.size() == 1LU) ? callbacks[0] : nullptr;
-  }
-
-  bool applyDecay_;
-  const OptimizationConfig& optConfig_;
-  std::vector<ParameterType> parameterTypes_;
-
-  /**
-   * global learning rate, init value is opt_config.learning_rate,
-   * sparse regularizer get this value per batch, after StartBatch() called
-   * so, if lr change in StartBatch, please assign to learningRate_
-   */
-  real learningRate_;
-
-  std::unique_ptr<LearningRateScheduler> learningRateScheduler_;
-  int64_t pass_;  // current training pass (starting from 0)
-  bool firstTime_;
-};
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
deleted file mode 100644
index db1153c2d6430e453d776b92b63152c311771668..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ /dev/null
@@ -1,300 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Logging.h"
-#ifdef __AVX__
-#include <x86intrin.h>
-#include <xmmintrin.h>
-#endif
-
-#include "ParameterUpdateFunctions.h"
-
-namespace paddle {
-
-void sgdUpdateCpu(real learningRate,
-                  real momentum,
-                  real decayRate,
-                  size_t size,
-                  real* value,
-                  const real* grad,
-                  real* momentumVec) {
-  decayRate *= learningRate;
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (size_t i = 0; i < size; ++i) {
-    momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
-                     decayRate * value[i];
-    value[i] += momentumVec[i];
-  }
-}
-
-void sgdUpdate(real learningRate,
-               real momentum,
-               real decayRate,
-               Vector* value,
-               Vector* grad,
-               Vector* momentumVec) {
-  size_t size = value->getSize();
-  real* val = value->getData();
-  real* grd = grad->getData();
-  real* mom = momentumVec->getData();
-  if (typeid(*value) == typeid(CpuVector)) {
-    sgdUpdateCpu(learningRate, momentum, decayRate, size, val, grd, mom);
-  } else if (typeid(*value) == typeid(GpuVector)) {
-    value->sgdUpdate(*grad, *momentumVec, learningRate, momentum, decayRate);
-  } else {
-    LOG(FATAL) << "Wrong";
-  }
-}
-
-void sgdUpdateAvx(float learningRate,
-                  float momentum,
-                  float decayRate,
-                  size_t size,
-                  float* value,
-                  const float* _grad,
-                  float* momentumVec) {
-#ifdef __AVX__
-  float* grad = const_cast<float*>(_grad);  // the gradient is not modified
-                                            // but when invoke simd functions
-                                            // need non-const pointer.
-  size_t gradientAlign = 0;
-  size_t gradientAlignHeader = (size_t)grad % sizeof(__m256);
-  CHECK_EQ(gradientAlignHeader, (size_t)momentumVec % sizeof(__m256))
-      << "Gradent buffer didn't align with momentum buffer";
-  CHECK_EQ(gradientAlignHeader, (size_t)value % sizeof(__m256))
-      << "Gradent buffer didn't align with value buffer";
-  if (0 != gradientAlignHeader) {
-    gradientAlignHeader = sizeof(__m256) - gradientAlignHeader;
-    gradientAlign = gradientAlignHeader / sizeof(real);
-
-    // handle the unalign buffer
-    for (size_t i = 0; i < gradientAlign; i++) {
-      momentumVec[i] = momentum * momentumVec[i] - (learningRate * grad[i]) -
-                       (decayRate * learningRate * value[i]);
-      value[i] += momentumVec[i];
-    }
-    grad += gradientAlign;
-    momentumVec += gradientAlign;
-    value += gradientAlign;
-  }
-
-  constexpr size_t kParallelNum = 8;
-  constexpr size_t nStepSize = (sizeof(__m256) / sizeof(real)) * kParallelNum;
-  size_t cntLoop = (size - gradientAlign) / nStepSize;
-  size_t cntRem = (size - gradientAlign) % nStepSize;
-  __m256 gradientTmp[kParallelNum];
-  __m256 valueTmp[kParallelNum];
-  __m256 lr, mom, dr;
-  std::function<void(void)> loopFun;
-
-  learningRate *= -1;
-  lr = _mm256_set_ps(learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate,
-                     learningRate);
-
-  if (0 != momentum) {
-    mom = _mm256_set_ps(momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum,
-                        momentum);
-  }
-
-  decayRate *= learningRate;
-  if (0 != decayRate) {
-    dr = _mm256_set_ps(decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate,
-                       decayRate);
-  }
-
-  auto gradMulFun = [&](void) {
-    gradientTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad), lr);
-    gradientTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 8), lr);
-    gradientTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 16), lr);
-    gradientTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 24), lr);
-    gradientTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 32), lr);
-    gradientTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 40), lr);
-    gradientTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 48), lr);
-    gradientTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(grad + 56), lr);
-  };
-
-  auto valueMulFun = [&](void) {
-    valueTmp[0] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value), dr);
-    valueTmp[1] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 8), dr);
-    valueTmp[2] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 16), dr);
-    valueTmp[3] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 24), dr);
-    valueTmp[4] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 32), dr);
-    valueTmp[5] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 40), dr);
-    valueTmp[6] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 48), dr);
-    valueTmp[7] = _mm256_mul_ps(*reinterpret_cast<__m256*>(value + 56), dr);
-  };
-
-  auto momentumMulFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 8) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 8), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 16) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 16), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 24) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 24), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 32) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 32), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 40) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 40), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 48) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 48), mom);
-    *reinterpret_cast<__m256*>(momentumVec + 56) =
-        _mm256_mul_ps(*reinterpret_cast<__m256*>(momentumVec + 56), mom);
-  };
-
-  auto momentumAddGradFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), gradientTmp[0]);
-    *reinterpret_cast<__m256*>(momentumVec + 8) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 8), gradientTmp[1]);
-    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 16), gradientTmp[2]);
-    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 24), gradientTmp[3]);
-    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 32), gradientTmp[4]);
-    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 40), gradientTmp[5]);
-    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 48), gradientTmp[6]);
-    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 56), gradientTmp[7]);
-  };
-
-  auto momentumZeroFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) = gradientTmp[0];
-    *reinterpret_cast<__m256*>(momentumVec + 8) = gradientTmp[1];
-    *reinterpret_cast<__m256*>(momentumVec + 16) = gradientTmp[2];
-    *reinterpret_cast<__m256*>(momentumVec + 24) = gradientTmp[3];
-    *reinterpret_cast<__m256*>(momentumVec + 32) = gradientTmp[4];
-    *reinterpret_cast<__m256*>(momentumVec + 40) = gradientTmp[5];
-    *reinterpret_cast<__m256*>(momentumVec + 48) = gradientTmp[6];
-    *reinterpret_cast<__m256*>(momentumVec + 56) = gradientTmp[7];
-  };
-
-  auto momentumAddValueFun = [&](void) {
-    *reinterpret_cast<__m256*>(momentumVec) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec), valueTmp[0]);
-    *reinterpret_cast<__m256*>(momentumVec + 8) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(momentumVec + 8), valueTmp[1]);
-    *reinterpret_cast<__m256*>(momentumVec + 16) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 16), valueTmp[2]);
-    *reinterpret_cast<__m256*>(momentumVec + 24) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 24), valueTmp[3]);
-    *reinterpret_cast<__m256*>(momentumVec + 32) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 32), valueTmp[4]);
-    *reinterpret_cast<__m256*>(momentumVec + 40) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 40), valueTmp[5]);
-    *reinterpret_cast<__m256*>(momentumVec + 48) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 48), valueTmp[6]);
-    *reinterpret_cast<__m256*>(momentumVec + 56) = _mm256_add_ps(
-        *reinterpret_cast<__m256*>(momentumVec + 56), valueTmp[7]);
-  };
-
-  auto valueAddMomentumFun = [&](void) {
-    *reinterpret_cast<__m256*>(value) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value),
-                      *reinterpret_cast<__m256*>(momentumVec));
-    *reinterpret_cast<__m256*>(value + 8) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 8),
-                      *reinterpret_cast<__m256*>(momentumVec + 8));
-    *reinterpret_cast<__m256*>(value + 16) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 16),
-                      *reinterpret_cast<__m256*>(momentumVec + 16));
-    *reinterpret_cast<__m256*>(value + 24) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 24),
-                      *reinterpret_cast<__m256*>(momentumVec + 24));
-    *reinterpret_cast<__m256*>(value + 32) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 32),
-                      *reinterpret_cast<__m256*>(momentumVec + 32));
-    *reinterpret_cast<__m256*>(value + 40) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 40),
-                      *reinterpret_cast<__m256*>(momentumVec + 40));
-    *reinterpret_cast<__m256*>(value + 48) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 48),
-                      *reinterpret_cast<__m256*>(momentumVec + 48));
-    *reinterpret_cast<__m256*>(value + 56) =
-        _mm256_add_ps(*reinterpret_cast<__m256*>(value + 56),
-                      *reinterpret_cast<__m256*>(momentumVec + 56));
-  };
-
-  if (0 == decayRate && 0 == momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      momentumZeroFun();
-      valueAddMomentumFun();
-    };
-  } else if (0 == decayRate && 0 != momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      momentumMulFun();
-      momentumAddGradFun();
-      valueAddMomentumFun();
-    };
-  } else if (0 != decayRate && 0 == momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      valueMulFun();
-      momentumZeroFun();
-      momentumAddValueFun();
-      valueAddMomentumFun();
-    };
-  } else if (0 != decayRate && 0 != momentum) {
-    loopFun = [&](void) {
-      gradMulFun();
-      valueMulFun();
-      momentumMulFun();
-      momentumAddGradFun();
-      momentumAddValueFun();
-      valueAddMomentumFun();
-    };
-  }
-
-  for (size_t i = 0; i < cntLoop; i++) {
-    loopFun();
-    grad += nStepSize;
-    momentumVec += nStepSize;
-    value += nStepSize;
-  }
-
-  for (size_t i = 0; i < cntRem; i++) {
-    momentumVec[i] = momentum * momentumVec[i] + (learningRate * grad[i]) +
-                     (decayRate * value[i]);
-    value[i] += momentumVec[i];
-  }
-#endif
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h
deleted file mode 100644
index 7434baa2d3d6297cc6d8d99b46cff516d6ac49f9..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParameterUpdateFunctions.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Common.h"
-
-namespace paddle {
-
-/**
- * Performs the following operations.
- *
- * momentumVec = momentum * momentumVec
- *               - learningRate * grad
- *               - learningRate * decayRate * value
- *
- * value = value + momentumVec
- * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary
- * computation.
- */
-void sgdUpdate(real learningRate,
-               real momentum,
-               real decayRate,
-               Vector* value,
-               Vector* grad,
-               Vector* momentumVec);
-
-void sgdUpdateCpu(real learningRate,
-                  real momentum,
-                  real decayRate,
-                  size_t size,
-                  real* value,
-                  const real* grad,
-                  real* momentumVec);
-
-void sgdUpdateAvx(float learningRate,
-                  float momentum,
-                  float decayRate,
-                  size_t size,
-                  float* value,
-                  const float* grad,
-                  float* momentumVec);
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParameterUpdaterBase.cpp b/paddle/parameter/ParameterUpdaterBase.cpp
deleted file mode 100644
index 7815856b45d93406597b332469de1c57a7781da5..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParameterUpdaterBase.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterUpdaterBase.h"
-#include <fstream>
-#include "hl_gpu.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-void ParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
-  parameters_ = parameters;
-  for (ParameterType type : getParameterTypes()) {
-    for (auto& para : parameters) {
-      para->enableType(type);
-    }
-  }
-  for (size_t pid = 0; pid < parameters_.size(); ++pid) {
-    nonStaticParaIDMap_.insert(
-        std::pair<size_t, size_t>(parameters_[pid]->getID(), pid));
-  }
-
-  for (auto& para : parameters) {
-    if (!para->isStatic()) {
-      para->initHook();
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h
deleted file mode 100644
index 717e1c6721b6e4d3ff81172eb06213677c3bff98..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParameterUpdaterBase.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Parameter.h"
-
-namespace paddle {
-
-class ParameterOptimizer;
-
-class ParameterUpdater {
-public:
-  ParameterUpdater() : parameterTypes_{PARAMETER_VALUE, PARAMETER_GRADIENT} {}
-  virtual ~ParameterUpdater() {}
-
-  void addParameterType(ParameterType type) {
-    for (auto t : parameterTypes_) {
-      if (t == type) return;
-    }
-    parameterTypes_.push_back(type);
-  }
-
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-
-  // called by Trainer when starting a new pass
-  virtual void startPass() {}
-
-  // called by Trainer then finishing a pass, ruturn true if pass accepted
-  virtual bool finishPass() { return true; }
-
-  // called by Trainer before backward() of a batch
-  // Return the type of pass it needs. This pass type will be passed
-  // to GradientMachine::forward() by the caller.
-  virtual PassType startBatch(int64_t batchSize) {
-    (void)batchSize;
-    return PASS_TRAIN;
-  }
-
-  // called by Trainer after backward() of a batch
-  // cost: the cost for this batch
-  virtual void finishBatch(real cost) { (void)cost; }
-
-  // between startBatch() and finishBatch(), update() will be called
-  // by the trainer multiple times, each time for updating one Parameter
-  // with its gradient in PARAMETER_GRADIENT
-  void update(Parameter* para) {
-    SetDevice setDevice(para->getDeviceId());
-    para->updateHook();
-    this->updateImpl(para);
-  }
-
-  // only get required sparse rows by default,
-  // get full matrix parameter if *fullSize* set
-  // get PARAMETER_APPLY on pserver if *apply* set
-  virtual void getParametersRemote(bool fullSize = false, bool apply = false) {}
-
-  virtual void loadParametersRemote(const std::string& dirName) {}
-  virtual void saveParametersRemote(const std::string& dirName) {}
-  virtual void randParametersRemote() {}
-
-  // something like regularization may be delayed apply
-  // trainer should catch up with before parameter is saved or sended.
-  virtual void catchUpWith() {}
-
-  // following two hooks used by averager
-  // apply to final parameter value (PARAMETER_VALUE or PARAMETER_APPLY).
-  // restore() will restore orginal value if it apply to PARAMETER_VALUE.
-  virtual void apply() {}
-  virtual void restore() {}
-
-  // return the parameter types used by this updater
-  const std::vector<ParameterType>& getParameterTypes() const {
-    return parameterTypes_;
-  }
-
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {}
-#endif
-
-protected:
-  virtual void updateImpl(Parameter* para) = 0;
-
-  std::vector<ParameterType> parameterTypes_;
-  std::vector<ParameterPtr> parameters_;
-  std::map<size_t, size_t> nonStaticParaIDMap_;
-};
-
-// Composite of ParameterUpdaters, each ParameterUpdater handle
-// part of all Parameters. It's useful when we need different
-// update strategy for different Parameter.
-class ParameterUpdaterComposite : public ParameterUpdater {
-public:
-  ParameterUpdaterComposite() {}
-  virtual ~ParameterUpdaterComposite() {}
-
-  virtual void init(const std::vector<ParameterPtr>& parameters) = 0;
-
-  virtual void startPass() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->startPass(); });
-  }
-
-  virtual bool finishPass() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->finishPass(); });
-    return true;
-  }
-
-  virtual PassType startBatch(int64_t batchSize) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->startBatch(batchSize);
-    });
-    return PASS_TRAIN;
-  }
-
-  virtual void finishBatch(real cost) {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->finishBatch(cost); });
-  }
-
-  virtual void getParametersRemote(bool fullSize, bool apply) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->getParametersRemote(fullSize, apply);
-    });
-  }
-  virtual void loadParametersRemote(const std::string& dirName) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->loadParametersRemote(dirName);
-    });
-  }
-  virtual void saveParametersRemote(const std::string& dirName) {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->saveParametersRemote(dirName);
-    });
-  }
-  virtual void randParametersRemote() {
-    syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-      updaters_[tid]->randParametersRemote();
-    });
-  }
-
-  virtual void catchUpWith() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->catchUpWith(); });
-  }
-
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {
-    for (auto& updater : updaters_) {
-      updater->setForwardbackwardTime(delta);
-    }
-  }
-#endif
-
-  virtual void apply() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->apply(); });
-  }
-  virtual void restore() {
-    syncThreadPool_->execPlusOwner(
-        [&](int tid, size_t numThreads) { updaters_[tid]->restore(); });
-  }
-
-protected:
-  virtual void updateImpl(Parameter* para) {}
-  std::vector<std::unique_ptr<ParameterUpdater>> updaters_;
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-};
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
deleted file mode 100644
index e6aec3c34820764b3515f47f13a432961de1a673..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterUpdaterHook.h"
-
-#include <algorithm>
-#include <atomic>
-#include <fstream>
-#include <mutex>
-#include <thread>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-/**
- * The static pruning hook
- * Static means user specify a sparsity_ratio before training started, and the
- * network will prune the parameters based on the sparsity_ratio. More details
- * can be found https://arxiv.org/pdf/1506.02626.pdf.
- */
-
-class StaticPruningHook : public IParameterUpdaterHook {
-public:
-  explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig)
-      : initCount_(0) {
-    sparsityRatio_ = hookConfig.sparsity_ratio();
-  }
-
-  static bool sortPairAscend(const std::pair<real, size_t> &pair1,
-                             const std::pair<real, size_t> &pair2) {
-    return pair1.first > pair2.first;
-  }
-
-  void update(Parameter *para) {
-    updateThreadChecker_.check();
-    auto &vec = para->getBuf(PARAMETER_GRADIENT);
-    if (vec) {
-      vec->dotMul(*maskVec_);
-    }
-  }
-
-  void generateMask(Parameter *para) {
-    VectorPtr maskTemp = Vector::create(para->getSize(), false);
-    maskTemp->zeroMem();
-    real *maskTempData = maskTemp->getData();
-    size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_);
-
-    VectorPtr paraVec = para->getBuf(PARAMETER_VALUE);
-    VectorPtr paraCpuCopy = Vector::create(para->getSize(), false);
-
-    paraCpuCopy->copyFrom(*paraVec);
-    std::vector<std::pair<real, size_t>> param;
-
-    for (size_t i = 0; i < para->getSize(); i++)
-      param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i));
-
-    std::partial_sort(
-        param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend);
-    for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0;
-
-    // Currently just use a mask vector for hack.
-    if (para->useGpu()) {
-      maskVec_ = Vector::create(para->getSize(), para->useGpu());
-      maskVec_->copyFrom(*maskTemp);
-    } else {
-      maskVec_ = maskTemp;
-    }
-  }
-
-  void init(Parameter *para) {
-    generateMask(para);
-    size_t initCount = this->initCount_.fetch_add(1);
-    CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke "
-                                "in same ParamterUpdater";
-    VLOG(3) << "Initialize Parameter " << para;
-    SetDevice device(para->getDeviceId());
-
-    auto &paraVec = para->getBuf(PARAMETER_VALUE);
-    paraVec->dotMul(*maskVec_);
-  }
-
-private:
-  SameThreadChecker updateThreadChecker_;
-  std::atomic<size_t> initCount_;
-  VectorPtr maskVec_;
-  real sparsityRatio_;
-};
-
-IParameterUpdaterHook::IParameterUpdaterHook() {}
-
-IParameterUpdaterHook::~IParameterUpdaterHook() {}
-
-/**
- * A Hasher used by g_hooks.
- *
- * Use the independent hasher intendedly. There is a hasher in PServer for hash
- * ParameterBlock. But not to use same hasher to reduce dependency.
- *
- * May be extracted to Util.h to unify the hasher.
- */
-class StringIntPairHasher {
-public:
-  size_t operator()(const std::pair<std::string, int> &k) const {
-    return intHasher_(strHasher_(k.first) + k.second);
-  }
-
-private:
-  std::hash<std::string> strHasher_;
-  std::hash<int> intHasher_;
-};
-
-static WeakKVCache<std::pair<std::string, int>,
-                   IParameterUpdaterHook,
-                   StringIntPairHasher>
-    g_hookCache_;
-
-/**
- * ParameterUpdaterHook actually factory method.
- */
-static IParameterUpdaterHook *createImpl(
-    const ParameterUpdaterHookConfig &config) {
-  auto &type = config.type();
-  if (type == "pruning") {
-    return new StaticPruningHook(config);
-  }
-
-  LOG(FATAL) << "Unknown Hook type:  " << type;
-  return nullptr;
-}
-
-std::shared_ptr<IParameterUpdaterHook> IParameterUpdaterHook::create(
-    const ParameterConfig &paramConfig, int idx) {
-  std::pair<std::string, int> key = {paramConfig.name(), idx};
-  return g_hookCache_.get(
-      key, [&] { return createImpl(paramConfig.update_hooks(idx)); });
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParameterUpdaterHook.h b/paddle/parameter/ParameterUpdaterHook.h
deleted file mode 100644
index d30530ec393c097bf77e5e376e3c4dc84b321ed8..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParameterUpdaterHook.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-
-#include "ParameterConfig.pb.h"
-
-namespace paddle {
-
-class Parameter;
-
-/**
- * The parameter updater hook interface.
- *
- * The Parameter Updater hooks is a group of methods invoke before
- * ParameterUpdater::updateImpl. It can modify gradient/momentum/etc before
- * parameter optimization.
- */
-class IParameterUpdaterHook {
-public:
-  virtual ~IParameterUpdaterHook();
-
-  /**
-   * Create A ParameterUpdaterHook.
-   *
-   * The same parameter shared the same hooks. So it returns shared_ptr.
-   *
-   * @param param_config The parameter config.
-   * @param idx  The element index of param_config.updater_hooks() array.
-   */
-  static std::shared_ptr<IParameterUpdaterHook> create(
-      const ParameterConfig& paramConfig, int idx);
-
-  /**
-   * The update hook method. Invoke before ParameterUpdater::updateImpl
-   */
-  virtual void update(Parameter* para) = 0;
-
-  /**
-   * The init hook method. Invoke in ParameterUpdater::init
-   */
-  virtual void init(Parameter* para) = 0;
-
-protected:
-  /**
-   * Ctor.
-   */
-  IParameterUpdaterHook();
-};
-
-}  // namespace paddle
diff --git a/paddle/parameter/Regularizer.cpp b/paddle/parameter/Regularizer.cpp
deleted file mode 100644
index d223fd2df679af1e983e84f48a4d3b0715ce1569..0000000000000000000000000000000000000000
--- a/paddle/parameter/Regularizer.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Regularizer.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-Regularizer* Regularizer::get(const std::vector<ParameterType>& types,
-                              const ParameterConfig& paraConfig) {
-  bool useLearningRateVec =
-      std::find(types.begin(), types.end(), PARAMETER_LEARNING_RATE) !=
-      types.end();
-  if (paraConfig.decay_rate_l1() > 0.0f &&
-      paraConfig.decay_rate() > 0.0f) {  // use L1 and L2
-    if (useLearningRateVec) {
-      static L1L2LrRegularizer regularizer_;
-      return &regularizer_;
-    }
-    static L1L2Regularizer regularizer_;
-    return &regularizer_;
-  }
-  if (paraConfig.decay_rate_l1() > 0.0f) {  // use L1 only
-    if (useLearningRateVec) {
-      static L1LrRegularizer regularizer_;
-      return &regularizer_;
-    }
-    static L1Regularizer regularizer_;
-    return &regularizer_;
-  }
-  if (paraConfig.decay_rate() > 0.0f) {  // use L2 only
-    if (useLearningRateVec) {
-      static L2LrRegularizer regularizer_;
-      return &regularizer_;
-    }
-    static L2Regularizer regularizer_;
-    return &regularizer_;
-  }
-  return nullptr;
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/Regularizer.h b/paddle/parameter/Regularizer.h
deleted file mode 100644
index 6bed7b0ddfe7b72c697af60f5243f9037999d54a..0000000000000000000000000000000000000000
--- a/paddle/parameter/Regularizer.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterUpdaterBase.h"
-
-namespace paddle {
-
-// Regularizer function for parameter, e.g. L1/L2
-class Regularizer {
-public:
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,  // learningrate from optimizer
-                      int t0,             // last occurence time
-                      int t) const = 0;   // current time
-  virtual ~Regularizer() {}
-
-  static Regularizer* get(const std::vector<ParameterType>& types,
-                          const ParameterConfig& paraConfig);
-};
-
-// L1 Regularizer, |w|_1
-class L1Regularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-  }
-};
-
-// L1 Lr Regularizer
-class L1LrRegularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-  }
-};
-
-// L2 Regularizer, |w|_2^2
-class L2Regularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-// L2 Lr Regularizer
-class L2LrRegularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-// L1 + L2 Regularizer, |w|_1 + |w|_2^2
-class L1L2Regularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-    vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-// L1 + L2 Lr Regularizer
-class L1L2LrRegularizer : public Regularizer {
-  virtual void update(const VectorPtr vecs[],
-                      const ParameterConfig& paraConfig,
-                      real learningRate,
-                      int t0,
-                      int t) const {
-    vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate_l1() * (t - t0));
-    vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE],
-                                   learningRate * paraConfig.learning_rate(),
-                                   paraConfig.decay_rate() * (t - t0));
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/parameter/ThreadLocalBuffer.h b/paddle/parameter/ThreadLocalBuffer.h
deleted file mode 100644
index 07c96e59d0bc0a58ce9956a54e7de359896e5618..0000000000000000000000000000000000000000
--- a/paddle/parameter/ThreadLocalBuffer.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-namespace parameter {
-extern VectorPtr* getThreadLocalBuffer();
-}  // namespace parameter
-}  // namespace paddle
diff --git a/paddle/parameter/Weight.cpp b/paddle/parameter/Weight.cpp
deleted file mode 100644
index ba4ddce69fb9c2ad0fa937efca5ba470247978e4..0000000000000000000000000000000000000000
--- a/paddle/parameter/Weight.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Weight.h"
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-Weight::Weight(size_t height, size_t width, ParameterPtr param) {
-  VectorPtr vPtr = param->getBuf(PARAMETER_VALUE);
-  VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT);
-
-  // create a new weight
-  if (param->isSparse()) {
-    CHECK_LE(param->getSize(), width * height);
-  } else {
-    CHECK_EQ(param->getSize(), width * height);
-  }
-
-  // weight_
-  weight_ = param->getMat(PARAMETER_VALUE);
-  if (!weight_ && vPtr) {
-    weight_ = Matrix::create(vPtr->getMemoryHandle(), height, width);
-  }
-  if (weight_) {
-    CHECK_EQ(height, weight_->getHeight());
-    CHECK_EQ(width, weight_->getWidth());
-  }
-
-  // weightGrad
-  weightGrad_ = param->getMat(PARAMETER_GRADIENT);
-  if (!weightGrad_ && gPtr) {
-    weightGrad_ = Matrix::create(gPtr->getMemoryHandle(), height, width);
-  }
-  if (weightGrad_) {
-    CHECK_EQ(height, weightGrad_->getHeight());
-    CHECK_EQ(width, weightGrad_->getWidth());
-  }
-
-  parameter_ = param;
-}
-
-Weight::Weight(size_t height, size_t width, ParameterPtr param, size_t offset) {
-  VectorPtr vPtr = param->getBuf(PARAMETER_VALUE);
-  VectorPtr gPtr = param->getBuf(PARAMETER_GRADIENT);
-
-  // create a new weight
-  CHECK_LE(offset + width * height, param->getSize());
-
-  // weight_
-  if (vPtr) {
-    weight_ = Matrix::create(vPtr->getData() + offset,
-                             height,
-                             width,
-                             /* trans */ false,
-                             param->useGpu());
-  }
-
-  // weightGrad
-  if (gPtr) {
-    weightGrad_ = Matrix::create(gPtr->getData() + offset,
-                                 height,
-                                 width,
-                                 /* trans */ false,
-                                 param->useGpu());
-  }
-
-  parameter_ = param;
-}
-
-const ParameterPtr& Weight::getParameterPtr() { return parameter_; }
-void Weight::setParameterPtr(ParameterPtr param) { parameter_ = param; }
-}  // namespace paddle
diff --git a/paddle/parameter/Weight.h b/paddle/parameter/Weight.h
deleted file mode 100644
index 7314c29d0db92db06d5b921c09de39d3b0029ef3..0000000000000000000000000000000000000000
--- a/paddle/parameter/Weight.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <memory>
-#include <vector>
-
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseRowMatrix.h"
-#include "paddle/parameter/Parameter.h"
-
-namespace paddle {
-
-class Weight {
-private:
-  MatrixPtr weight_;
-  MatrixPtr weightGrad_;
-  ParameterPtr parameter_;
-
-public:
-  Weight(size_t height, size_t width, ParameterPtr parameter);
-  Weight(size_t height, size_t width, ParameterPtr parameter, size_t offset);
-
-  const MatrixPtr& getW() { return weight_; }
-  const MatrixPtr& getWGrad() { return weightGrad_; }
-  const ParameterPtr& getParameterPtr();
-
-  void incUpdate(const UpdateCallback& callback) {
-    getParameterPtr()->incUpdate(callback);
-  }
-
-  void setParameterPtr(ParameterPtr param);
-};
-
-typedef std::vector<std::unique_ptr<Weight>> WeightList;
-
-}  // namespace paddle
diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp
deleted file mode 100644
index 54ceb3e08714e37abb5d491c8973bee631b993be..0000000000000000000000000000000000000000
--- a/paddle/parameter/tests/test_argument.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/parameter/Argument.h>
-
-using namespace paddle;  // NOLINT
-
-TEST(Argument, poolSequenceWithStride) {
-  Argument input, output;
-  ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false);
-  int* inStart = input.sequenceStartPositions->getMutableData(false);
-  inStart[0] = 0;
-  inStart[1] = 9;
-  inStart[2] = 14;
-  inStart[3] = 17;
-  inStart[4] = 30;
-
-  int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30};
-  int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
-
-  for (auto reversed : {false, true}) {
-    ICpuGpuVectorPtr stridePositions;
-    output.poolSequenceWithStride(
-        input, 5 /* stride */, &stridePositions, reversed);
-
-    const int* outStart = output.sequenceStartPositions->getData(false);
-    CHECK_EQ(outStart[0], 0);
-    CHECK_EQ(outStart[1], 2);
-    CHECK_EQ(outStart[2], 3);
-    CHECK_EQ(outStart[3], 4);
-    CHECK_EQ(outStart[4], 7);
-
-    CHECK_EQ(stridePositions->getSize(), 8UL);
-    auto result = reversed ? strideResultReversed : strideResult;
-    for (int i = 0; i < 8; i++) {
-      CHECK_EQ(stridePositions->getData(false)[i], result[i]);
-    }
-  }
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
deleted file mode 100644
index 6e10becabbbbb8861095fed5aab9ac1e05bcac91..0000000000000000000000000000000000000000
--- a/paddle/parameter/tests/test_common.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/Util.h>
-#include <stdlib.h>
-
-#include <gtest/gtest.h>
-#include <paddle/parameter/ParameterUpdateFunctions.h>
-#include <paddle/utils/Flags.h>
-#include <paddle/utils/Stat.h>
-#include <paddle/utils/Thread.h>
-
-using namespace paddle;  // NOLINT
-
-class CommonTest : public ::testing::Test {
-protected:
-  CommonTest() : testStat_("test") {}
-  virtual ~CommonTest() {}
-  virtual void SetUp() {
-    const size_t buffSize[] = {
-        100, 128, 500, 1024, 4096, 10240, 102400, 1000000};
-    sizeVec_.resize(8);
-    memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t));
-    valueUint_.resize(4);
-    valueUint_[0].first = 0.0;
-    valueUint_[0].second = 0.0;
-    valueUint_[1].first = 0.0;
-    valueUint_[1].second = 1.0;
-    valueUint_[2].first = 1.0;
-    valueUint_[2].second = 0.0;
-    valueUint_[3].first = 1.0;
-    valueUint_[3].second = 1.0;
-    learningRate_ = 1.0;
-  }
-
-  void test_sgdUpadate(real* gradientBuffer,
-                       real* valueBuffer,
-                       real* momentumBuffer,
-                       size_t size);
-
-  virtual void TreaDown() { LOG(INFO) << "All Test Finished."; }
-
-protected:
-  std::vector<std::pair<real, real>> valueUint_;
-  std::vector<size_t> sizeVec_;
-  real learningRate_;
-  StatSet testStat_;
-};
-
-void CommonTest::test_sgdUpadate(real* gradientBuffer,
-                                 real* valueBuffer,
-                                 real* momentumBuffer,
-                                 size_t size) {
-// sgdUpdateAvx has no double version yet
-#if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE)
-  real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0;
-  real* gradTmp = new real[size];
-  real* valueTmp = new real[size];
-  real* momentumTmp = new real[size];
-  memcpy(gradTmp, gradientBuffer, size * sizeof(real));
-  memcpy(valueTmp, valueBuffer, size * sizeof(real));
-  memcpy(momentumTmp, momentumBuffer, size * sizeof(real));
-  for (auto& arg : valueUint_) {
-    {
-      {
-        struct timeval t;
-        REGISTER_TIMER("gettimeofday", 0, testStat_);
-        gettimeofday(&t, NULL);
-      }
-      REGISTER_TIMER("avxTimer", 0);
-      sgdUpdateAvx(learningRate_,
-                   arg.first,
-                   arg.second,
-                   size,
-                   valueBuffer,
-                   gradientBuffer,
-                   momentumBuffer);
-    }
-    for (size_t i = 0; i < size; i++) {
-      valueSum1 += valueBuffer[i];
-      momSum1 += momentumBuffer[i];
-      // std::cout << "["
-      //          << valueBuffer[i]
-      //          << "," << momentumBuffer[i]
-      //          << "," << gradientBuffer[i] << "],";
-    }
-    {
-      REGISTER_TIMER("cpuTimer", 0);
-      sgdUpdateCpu(learningRate_,
-                   arg.first,
-                   arg.second,
-                   size,
-                   valueTmp,
-                   gradTmp,
-                   momentumTmp);
-    }
-    for (size_t i = 0; i < size; i++) {
-      valueSum2 += valueTmp[i];
-      momSum2 += momentumTmp[i];
-      // std::cout << "["
-      //          << valueTmp[i]
-      //          << "," << momentumTmp[i]
-      //          << "," << gradTmp[i] << "],";
-    }
-
-    VLOG(3) << "valueSum1 = " << valueSum1 << " ; valueSum2 = " << valueSum2;
-    VLOG(3) << "momSum1 = " << momSum1 << " ; momSum2 = " << momSum2;
-    ASSERT_EQ(valueSum1, valueSum2);
-    ASSERT_EQ(momSum1, momSum2);
-  }
-  delete[] gradTmp;
-  delete[] valueTmp;
-  delete[] momentumTmp;
-#endif
-}
-
-TEST_F(CommonTest, sgdUpdate) {
-  const size_t alignHeader[] = {0, 2, 3, 5, 7, 8};
-  for (auto& size : sizeVec_) {
-    real *gradientBuffer, *valueBuffer, *momentumBuffer;
-    CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size),
-             0);
-    CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0);
-    CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size),
-             0);
-
-    for (size_t i = 0; i < size; i++) {
-      gradientBuffer[i] = 1.0;
-      valueBuffer[i] = 2.0;
-      momentumBuffer[i] = 3.0;
-    }
-    for (int i = 0; i < 6; i++) {
-      LOG(INFO) << "----------------------" << size << ":" << alignHeader[i]
-                << "-------------------------";
-      test_sgdUpadate(&gradientBuffer[alignHeader[i]],
-                      &valueBuffer[alignHeader[i]],
-                      &momentumBuffer[alignHeader[i]],
-                      size - alignHeader[i]);
-    }
-    free(gradientBuffer);
-    free(valueBuffer);
-    free(momentumBuffer);
-  }
-  globalStat.printAllStatus();
-  testStat_.printAllStatus();
-}
-
-TEST_F(CommonTest, syncThreadPool) {
-  SyncThreadPool pool(10);
-
-  std::vector<int> nums;
-  nums.resize(10);
-
-  pool.exec([&](int tid, size_t numThreads) { nums[tid] = tid; });
-  for (size_t i = 0; i < nums.size(); ++i) {
-    EXPECT_EQ((int)i, nums[i]);
-  }
-
-  pool.exec([&](int tid, size_t numThreads) { nums[tid] -= tid; });
-  for (size_t i = 0; i < nums.size(); ++i) {
-    EXPECT_EQ((int)0, nums[i]);
-  }
-}
diff --git a/paddle/pserver/BaseClient.cpp b/paddle/pserver/BaseClient.cpp
deleted file mode 100644
index a6204ef47ea553246ddadbb2eae6cc714cafe594..0000000000000000000000000000000000000000
--- a/paddle/pserver/BaseClient.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "BaseClient.h"
-#include <gflags/gflags.h>
-#include <string.h>
-#include <vector>
-#include "paddle/utils/Stat.h"
-
-DECLARE_string(pservers);
-
-namespace paddle {
-
-BaseClient::BaseClient(bool separate, int numPorts)
-    : stopping_(false), numPorts_(numPorts), separateSendAndRecv_(separate) {
-  CHECK_GT(numPorts, 0);
-}
-
-BaseClient::~BaseClient() {}
-
-void BaseClient::recvData() { recvSyncBarrier_->wait(); }
-
-void BaseClient::synchronize(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void BaseClient::startThreads() {
-  if (!separateSendAndRecv_) {
-    return;
-  }
-  recvSyncBarrier_.reset(new ThreadBarrier(threadNum_ + 1));
-
-  sendThreads_.resize(threadNum_);
-  recvThreads_.resize(threadNum_);
-  sendJobQueue_.resize(threadNum_);
-  recvJobQueue_.resize(threadNum_);
-
-  for (int i = 0; i < threadNum_; ++i) {
-    sendJobQueue_[i].reset(new SendQueue());
-    recvJobQueue_[i].reset(new SendQueue());
-
-    sendThreads_[i].reset(
-        new std::thread([this](int id) { this->send(id); }, i));
-
-    recvThreads_[i].reset(
-        new std::thread([this](int id) { this->recv(id); }, i));
-  }
-}
-
-void BaseClient::finishThreads() {
-  if (!separateSendAndRecv_) {
-    return;
-  }
-  stopping_ = true;
-  for (int i = 0; i < threadNum_; i++) {
-    sendJobQueue_[i]->enqueue(nullptr);
-  }
-  for (auto& thread : sendThreads_) {
-    thread->join();
-  }
-  for (auto& thread : recvThreads_) {
-    thread->join();
-  }
-  stopping_ = false;
-}
-}  // namespace paddle
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
deleted file mode 100644
index a932d34712f56de1cbbf84a9db4476f862febca0..0000000000000000000000000000000000000000
--- a/paddle/pserver/BaseClient.h
+++ /dev/null
@@ -1,311 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterService.pb.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/pserver/ProtoServer.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Queue.h"
-
-namespace paddle {
-
-/**
- * it manages all connections to pservers.
- * it exists two modes to manage connections to all pservers. Firstly, one
- * connection owns two threads that separately manage to send and receive
- * data. Secondly, each thread uses one connection for all activation in it.
- * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/
- * recvJobQueue_. the second solution use some shared thread pool to manage
- * connections.
- */
-class BaseClient {
-protected:
-  typedef std::unique_ptr<std::thread> ThreadPtr;
-  typedef std::vector<std::vector<iovec>> InputIovs;
-  typedef std::vector<SendParameterRequest> SendRequest;
-  typedef std::vector<SendDataRequest> SendDataRequestVec;
-
-  // TODO(yanfei):
-  // refine data structure to unify parameter and features communication
-  struct SendJob {
-    /// store parameters related blocks data
-    InputIovs parallelInputIovs;
-    /// store protobuf request
-    SendRequest parallelRequests;
-    /// store data, such as features for metric learning
-    SendDataRequestVec parallelDataRequests;
-  };
-
-public:
-  explicit BaseClient(bool separate = false, int numPorts = FLAGS_ports_num);
-
-  virtual ~BaseClient();
-
-  typedef std::shared_ptr<SendJob> SendJobPtr;
-  typedef Queue<SendJobPtr> SendQueue;
-
-  /// send data to server, support only synchronize
-  template <class DataType>
-  void putData(int clientId,
-               SendDataType type,
-               DataType* datas,
-               size_t size,
-               DataUpdateMode mode) {
-    synchronize(SYNC_DATA);
-    sendData(clientId, type, mode, datas, size);
-    recvData();
-    synchronize(SYNC_DATA);
-  }
-
-  template <class DataType>
-  void putOwnData(int clientId,
-                  SendDataType type,
-                  DataType* datas,
-                  size_t size) {
-    putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN);
-  }
-
-  template <class DataType>
-  void getAllData(int clientId,
-                  SendDataType type,
-                  DataType* datas,
-                  size_t size) {
-    sendData(clientId,
-             type,
-             DATA_UPDATE_MODE_GET_ALL,
-             reinterpret_cast<DataType*>(NULL),
-             0);
-    recvData();
-    size_t dataOffset = 0;
-    for (auto& recvMem : recvDataMems_) {
-      CHECK_LE(dataOffset, size);
-      size_t memSize = std::min(recvMem.get()->getSize(),
-                                sizeof(DataType) * (size - dataOffset));
-      CHECK_EQ(memSize % sizeof(DataType), size_t(0));
-      memcpy(datas + dataOffset, recvMem.get()->getBuf(), memSize);
-      dataOffset += memSize / sizeof(DataType);
-    }
-    CHECK_EQ(dataOffset, size);
-  }
-
-  /**
-   * Reduces values on all clients.
-   * This reduce just support SUM.
-   * The results are saved in recvBuf of rootId client
-   */
-  template <class DataType>
-  void reduce(DataType* sendBuf,
-              DataType* recvBuf,
-              size_t size,
-              int clientId,
-              int rootId) {
-    putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size);
-    if (rootId == clientId) {
-      getAllData(clientId, DATA_REDUCE_SUM, recvBuf, size);
-    }
-  }
-
-  /**
-   * return trans data type according to the input type
-   */
-  virtual TransDataType getTransDtype(const std::type_info& info) {
-    TransDataType dataType;
-    if (typeid(int*) == info) {  // NOLINT
-      dataType = TRANS_INT32;
-    } else if (typeid(uint32_t*) == info) {  // NOLINT
-      dataType = TRANS_UINT32_T;
-    } else if (typeid(int64_t*) == info) {  // NOLINT
-      dataType = TRANS_INT64_T;
-    } else if (typeid(uint64_t*) == info) {  // NOLINT
-      dataType = TRANS_UINT64_T;
-    } else if (typeid(float*) == info) {  // NOLINT
-      dataType = TRANS_FLOAT;
-    } else if (typeid(double*) == info) {  // NOLINT
-      dataType = TRANS_DOUBLE;
-    } else {
-      LOG(FATAL) << "not supported";
-    }
-    return dataType;
-  }
-
-protected:
-  /// for a > 0, b > 0:
-  /// return the smallest x s.t. b*x >= a
-  static int divup(int a, int b) { return (a + b - 1) / b; }
-
-  int calcClientId(int i, int serviceNum) {
-    return (i + FLAGS_trainer_id * numPorts_) % serviceNum;
-  }
-
-  /// start threads in sendThreads_ and recvThreads_
-  void startThreads();
-
-  /// finish threads in sendThreads_ and recvThreads_
-  void finishThreads();
-
-  template <class DataType>
-  void prepareData(int clientId,
-                   SendDataType type,
-                   DataUpdateMode updateMode,
-                   DataType* datas,
-                   size_t size,
-                   SendJob* sendJob) {
-    sendJob->parallelDataRequests.resize(serviceNum_);
-    sendJob->parallelInputIovs.resize(serviceNum_);
-    for (int i = 0; i < serviceNum_; ++i) {
-      auto& request = sendJob->parallelDataRequests[i];
-      request.set_update_mode(updateMode);
-      request.set_type(type);
-      request.set_client_id(clientId);
-      request.set_server_id(i);
-    }
-
-    /// split datas which need send to Server into serviceNum_ pieces
-    if (!datas) {
-      CHECK(!size) << "ownSize should be zero since datas is nullptr";
-    }
-    size_t baseSize = size / serviceNum_;
-    size_t dataOffset = 0;
-    for (int i = 0; i < serviceNum_; ++i) {
-      auto& request = sendJob->parallelDataRequests[i];
-      DataBlock* block = request.add_blocks();
-      size_t ownSize = size_t(i) < size % serviceNum_ ? baseSize + 1 : baseSize;
-      size_t realSize = datas ? std::max(ownSize, size_t(1)) : 0;
-      block->set_total_size(realSize * sizeof(DataType));
-      block->set_data_size(sizeof(DataType));
-      // TODO(yuyang18): The getTransDtype can be rewritten as template method
-      //                 to reduce runtime overhead.
-      block->set_data_type(getTransDtype(typeid(DataType*)));  // NOLINT
-      if (datas) {
-        sendJob->parallelInputIovs[i].push_back(
-            {datas + dataOffset, realSize * sizeof(DataType)});
-      }
-      dataOffset += ownSize;
-    }
-    CHECK_EQ(dataOffset, size);
-  }
-
-  /**
-   * @brief send data to all data servers
-   *
-   * @note  each trainer sends all its data to all data servers
-   *        it's for broadcast data synchronization, such as features
-   *        synchronization in metric learning.
-   */
-  template <class DataType>
-  void sendData(int clientId,
-                SendDataType type,
-                DataUpdateMode updateMode,
-                DataType* datas,
-                size_t size) {
-    SendJobPtr sendJob = std::make_shared<SendJob>();
-    prepareData(clientId, type, updateMode, datas, size, sendJob.get());
-    for (int i = 0; i < threadNum_; ++i) {
-      sendJobQueue_[i]->enqueue(sendJob);
-    }
-  }
-
-  /**
-   * @brief recv data from all data servers
-   *
-   * @note  synchronize all recv threads
-   */
-  void recvData();
-
-  /// send request, and recv responses
-  template <typename ProtoIn, typename ProtoOut>
-  void multiCall(const char* funcName,
-                 const ProtoIn& request,
-                 std::vector<ProtoOut>* responses) {
-    responses->resize(clients_.size());
-    size_t numClients = clients_.size();
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].send(funcName, request);
-    }
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].recv(&(*responses)[i]);
-    }
-  }
-
-  /**
-   * @brief synchronize all trainers and pservers
-   *
-   * @note  used to ensure that data of all trainers have been received
-   */
-  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  /**
-   * @brief use multithread to separately send data
-   *
-   * @note  each thread should read its own JobQueue to handle requests
-   *        each thread should calcClientId() to retrieve connections
-   *        managed by himself.
-   *        send and recv are implemented in child class.
-   */
-  virtual void send(int threadId) = 0;
-
-  /**
-   * @brief use multithread to separately receive data
-   *
-   * @note  almost same as send()
-   */
-  virtual void recv(int threadId) = 0;
-
-protected:
-  bool stopping_;
-  /// nodes * ports that means the number of real pservers
-  int serviceNum_;
-  /**
-   * threads num for managing all services. Normally the
-   * number of pservers are relatively less than several
-   * hundreds so that using thread-based parallelization
-   * can benifit traffic performance and pserver's sgd
-   * optimization performance.
-   */
-  int threadNum_;
-  /// the connection manager at client end
-  std::vector<ProtoClient> clients_;
-  /// send threads for parallelization
-  std::vector<ThreadPtr> sendThreads_;
-  /// recv threads for parallelization
-  std::vector<ThreadPtr> recvThreads_;
-  std::unique_ptr<ThreadBarrier> recvSyncBarrier_;
-
-  // TODO(yanfei):
-  // current pserver's will return value until all parameters'
-  // optimization are finished so that recv are not overlapped
-  // in reality. More robust implimentation should be to pipeline
-  // all send/recv action based on parameter unit level, and
-  // it will benifits deep and larger model training in future,
-  // especially local node compution power surpasses inter-connection
-  // such as GPU cluster, even with BOX GPU cluster.
-  // queue for buffering send request
-  /**
-   * send/recv queue cooperates with each other to accomplish
-   * overlapping communication with forwardBackward action.
-   */
-  std::vector<std::unique_ptr<SendQueue>> sendJobQueue_;
-  /// queue for buffering recv request
-  std::vector<std::unique_ptr<SendQueue>> recvJobQueue_;
-  /// specific for dserver
-  SendJob sendJob_;
-  /// port num for each node
-  int numPorts_;
-  /// if set, overlapped optimization is disabled
-  bool separateSendAndRecv_;
-  std::vector<CpuMemHandlePtr> recvDataMems_;
-};
-}  // namespace paddle
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
deleted file mode 100644
index f75475a88f7224ee3889827795088c8aa920b63b..0000000000000000000000000000000000000000
--- a/paddle/pserver/CMakeLists.txt
+++ /dev/null
@@ -1,62 +0,0 @@
-# parameter server package
-
-######################### paddle_network ####################
-set(NETWORK_SOURCES
-    LightNetwork.cpp
-    SocketChannel.cpp
-    ProtoServer.cpp)
-
-set(NETWORK_HEADERS
-    LightNetwork.h
-    SocketChannel.h
-    ProtoServer.h)
-
-add_library(paddle_network STATIC
-    ${NETWORK_SOURCES})
-
-add_style_check_target(paddle_network ${NETWORK_SOURCES})
-add_style_check_target(paddle_network ${NETWORK_HEADERS})
-
-add_dependencies(paddle_network paddle_proto ${external_project_dependencies})
-
-################### paddle_pserver ######################
-set(PSERVER_SOURCES
-    BaseClient.cpp
-    ParameterClient2.cpp
-    ParameterServer2.cpp
-    SparseParameterDistribution.cpp
-    ParameterServerController.cpp)
-
-set(PSERVER_HEADERS
-    BaseClient.h
-    ParameterClient2.h
-    ParameterServer2.h
-    SparseParameterDistribution.h
-    ParameterServerController.h)
-
-add_library(paddle_pserver STATIC
-    ${PSERVER_SOURCES})
-
-add_style_check_target(paddle_pserver ${PSERVER_SOURCES})
-add_style_check_target(paddle_pserver ${PSERVER_HEADERS})
-
-add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
-
-set(PSERVER_MAIN_SOURCES
-    ParameterServer2Main.cpp)
-
-if(WITH_TESTING)
-  add_subdirectory(test)
-endif()
-
-if(NOT MOBILE_INFERENCE)
-  add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
-  link_paddle_exe(paddle_pserver_main)
-
-  install(TARGETS paddle_pserver_main
-          RUNTIME DESTINATION opt/paddle/bin
-          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
-
-  set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-endif()
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
deleted file mode 100644
index 4c0da2217e880b7509ea5f42da5ac7ffe93a53ec..0000000000000000000000000000000000000000
--- a/paddle/pserver/LightNetwork.cpp
+++ /dev/null
@@ -1,459 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fcntl.h>
-#include <netdb.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <chrono>
-
-#include <arpa/inet.h>
-#include <net/if.h>
-#include <sys/ioctl.h>
-#include <sstream>
-
-#include "LightNetwork.h"
-#include "RDMANetwork.h"
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-/// quick ack can reduce the latency of small message
-DEFINE_bool(small_messages,
-            false,
-            "if message size is small, recommend set it True to enable quick "
-            "ack and no delay");
-
-/// reasonable sock_send_buf_size can control the traffic injected into switch
-/// network. Injecting too many data into traffic could cause packets loss which
-/// cause long latency and degrade the efficiency of communication.
-DEFINE_int32(sock_send_buf_size,
-             1024 * 1024 * 40,
-             "restrict sock send buff size, can reduce network congestion if "
-             "set carefully");
-
-/// reasonable size can hold bursted packets and reduce packets loss
-DEFINE_int32(sock_recv_buf_size,
-             1024 * 1024 * 40,
-             "restrict sock recv buff size");
-
-/// reasonable sock_listen_queue_size can control maximum pending connections.
-DEFINE_int32(sock_listen_queue_size,
-             1024,
-             "listen queue size when pserver listen a TCP port");
-
-namespace paddle {
-
-/**
- * @brief get ip address from interface name
- *
- * @param[in] device device interface name
- */
-std::string getIpAddr(std::string &device) {
-  int sock;
-  struct sockaddr_in sin;
-  struct ifreq ifr;
-
-  sock = socket(AF_INET, SOCK_DGRAM, 0);
-  CHECK(sock >= 0) << "Create socket error.";
-
-  strncpy(ifr.ifr_name, device.c_str(), IFNAMSIZ);
-  ifr.ifr_name[IFNAMSIZ - 1] = 0;
-
-  CHECK_GE(ioctl(sock, SIOCGIFADDR, &ifr), 0);
-  memcpy(&sin, &ifr.ifr_addr, sizeof(sin));
-  close(sock);
-  return std::string(inet_ntoa(sin.sin_addr));
-}
-
-/**
- * @brief set sock option
- *
- * @param[in] sockfd sock file descriptor
- *
- * @note adjust some default sock option for better performance
- */
-void setOption(int sockfd) {
-#if !defined(__APPLE__) && !defined(__OSX__)
-  int sendSize = FLAGS_sock_send_buf_size;
-  int recvSize = FLAGS_sock_recv_buf_size;
-  CHECK_GE(
-      setsockopt(sockfd, SOL_SOCKET, SO_RCVBUF, &recvSize, sizeof(recvSize)),
-      0);
-  CHECK_GE(
-      setsockopt(sockfd, SOL_SOCKET, SO_SNDBUF, &sendSize, sizeof(sendSize)),
-      0);
-#endif
-
-  if (FLAGS_small_messages) {
-    int optval = 1;
-    CHECK_GE(
-        setsockopt(sockfd, IPPROTO_TCP, TCP_NODELAY, &optval, sizeof(optval)),
-        0);
-#ifdef TCP_QUICKACK
-    optval = 1;
-    CHECK_GE(
-        setsockopt(sockfd, IPPROTO_TCP, TCP_QUICKACK, &optval, sizeof(optval)),
-        0);
-#endif
-  }
-  int reuse = 1;
-  CHECK_GE(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)),
-           0);
-}
-
-/**
- * @brief class constructor for SocketServer
- * @param[in] addr sock bind address
- * @param[in] port sock bind port
- * @param[in] rdmaCpu rdma sock bind cpu core
- *
- * @note start one socket server which hosts parameter server process.
- *       rdmaCpu is passed to rdma deamon for better performance, and
- *       start tcp socket instead of rdma socket if rdmaCpu is equal
- *       to -1. Each trainer process starts one connection to one socket
- *       server, and use --ports_num to build more connections to harness
- *       fat communication channel if necessary.
- *       each connection is controlled by single thread with blocking
- *       read and write.
- */
-SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
-    : port_(port), addr_(addr), stopping_(false) {
-  if (rdmaCpu == -1) {
-    tcpRdma_ = F_TCP;
-    socket_ = 0;
-    maxPendingConnections_ = FLAGS_sock_listen_queue_size;
-  } else {
-    tcpRdma_ = F_RDMA;
-    rdmaCpu_ = rdmaCpu;
-    rdmaSocket_ = 0;
-
-    std::stringstream ss;
-    ss << port;
-    rdmaUri_ = "rdma://" + addr + ":" + ss.str();
-  }
-
-  /// trigger to initialize RDMA lib
-  CHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
-}
-
-SocketServer::~SocketServer() {
-  stopping_ = true;
-  /// trigger accept thread to stop
-  {
-    SocketClient trigger(addr_.empty() ? "127.0.0.1" : addr_, port_, tcpRdma_);
-  }
-  this->join();
-}
-
-/**
- * @brief start one tcp server which hosts parameter server
- *
- * @note do tcp socket bind and listen. it will spawn one thread
- *       for each connection
- */
-void SocketServer::tcpServer() {
-  int newsockfd;
-  socklen_t clilen;
-  struct sockaddr_in serv_addr, cli_addr;
-  struct hostent *server;
-
-  /// First call to socket() function
-  socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(socket_ >= 0) << "ERROR opening socket";
-
-  /// Initialize socket structure
-  bzero((char *)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_port = htons(port_);
-  if (!addr_.empty()) {
-    server = gethostbyname(addr_.c_str());
-    CHECK(server) << "ERROR, no such host: " << addr_;
-    bcopy((char *)server->h_addr,
-          (char *)&serv_addr.sin_addr.s_addr,
-          server->h_length);
-  } else {
-    serv_addr.sin_addr.s_addr = INADDR_ANY;
-  }
-
-  setOption(socket_);
-
-  /// Now bind the host address using bind() call.
-  CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR on binding " << addr_;
-
-  /// Now start listening for the clients, here process will
-  /// go in sleep mode and will wait for the incoming connection
-  listen(socket_, maxPendingConnections_);
-  clilen = sizeof(cli_addr);
-
-  while (true) {
-    /// Accept actual connection from the client
-    newsockfd = accept(socket_, (struct sockaddr *)&cli_addr, &clilen);
-    if (stopping_) {
-      break;
-    }
-    CHECK(newsockfd >= 0) << "ERROR on accept";
-    constexpr int kPeerNameLen = 128;
-    char peerName[kPeerNameLen];
-    CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen));
-
-    SocketWorker *worker =
-        new SocketWorker(createChannel(newsockfd, std::string(peerName)), this);
-    worker->start();
-    worker->detach();
-  }
-  close(socket_);
-  LOG(INFO) << "pserver accept thread finish, addr=" << addr_
-            << " port=" << port_;
-}
-
-/**
- * @brief start one rdma server which hosts parameter server
- *
- * @note do rdma bind and listen, which calling self-defined socket
- *       like rdma library. it will spawn one thread for each connection
- */
-void SocketServer::rdmaServer() {
-  struct sxi_sock *newsock;
-
-  /// First call to socket() function
-  rdmaSocket_ = rdma::ssocket(rdmaCpu_);
-  CHECK(rdmaSocket_) << "ERROR opening RDMA socket";
-
-  CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
-      << "ERROR bind RDMA socket";
-
-  /// Now start listening for the clients, here process will
-  /// go in sleep mode and will wait for the incoming connection
-  CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
-
-  while (true) {
-    /// Accept actual connection from the client
-    newsock = rdma::accept(rdmaSocket_);
-    if (stopping_) {
-      break;
-    }
-    CHECK(newsock) << "ERROR on accept";
-
-    constexpr int kPeerNameLen = 128;
-    char peerName[kPeerNameLen];
-
-    struct sockaddr_in *saddr = rdma::getSourceAddress(newsock);
-    CHECK(inet_ntop(AF_INET, &saddr->sin_addr, peerName, kPeerNameLen));
-
-    SocketWorker *worker =
-        new SocketWorker(createChannel(newsock, std::string(peerName)), this);
-    worker->start();
-    worker->detach();
-  }
-  rdma::close(rdmaSocket_);
-  LOG(INFO) << "pserver accept thread finish, rdma uri=" << rdmaUri_;
-}
-
-/**
- * @brief start a socket server
- *
- * @note framework for starting socket server
- */
-void SocketServer::run() {
-  if (tcpRdma_ == F_TCP) {
-    LOG(INFO) << "tcp server start ";
-    tcpServer();
-  } else if (tcpRdma_ == F_RDMA) {
-    LOG(INFO) << "rdma server start ";
-    rdmaServer();
-  }
-}
-
-/**
- * @brief class constructor for rdma client deamons
- *
- * @note  automatically start several client deamons for better performance
- */
-std::unique_ptr<RdmaClientDaemons> RdmaClientDaemons::daemons_ = nullptr;
-std::once_flag RdmaClientDaemons::initDataFlag_;
-
-RdmaClientDaemons::RdmaClientDaemons() {
-  if (FLAGS_rdma_tcp == "rdma") {
-    rdma::init();
-
-    struct sxi_socket *socket;
-    onlineCpus_ = rdma::numCpus();
-    for (auto i = 0; i < onlineCpus_; i++) {
-      socket = rdma::csocket(i);
-      CHECK(socket) << "ERROR open client socket daemon";
-
-      rdmaClientSocket_.push_back(socket);
-    }
-    LOG(INFO) << "RDMA client daemons started, onlineCpus_:" << onlineCpus_;
-    /// round robin scheduler for new connection
-    curCpu_ = 0;
-    /// wait daemons to start completely.
-    sleep(2);
-  }
-}
-
-RdmaClientDaemons::~RdmaClientDaemons() {
-  if (FLAGS_rdma_tcp == "rdma") {
-    for (auto i = 0; i < onlineCpus_; i++) {
-      rdma::close(rdmaClientSocket_[i]);
-    }
-    LOG(INFO) << "RDMA client daemons is destoryed, onlineCpus_ "
-              << onlineCpus_;
-  }
-}
-
-/**
- * @brief worker thread main context
- *
- * @note  each connection from client(trainer) is controlled by single worker
- *        thread, which is for handling all parameter server requests
- */
-void SocketWorker::run() {
-  LOG(INFO) << "worker started, peer = " << channel_->getPeerName();
-
-  std::vector<iovec> inputIovs;
-
-  while (true) {
-    std::unique_ptr<MsgReader> msgReader = channel_->readMessage();
-    if (!msgReader) {
-      break;
-    }
-
-    auto callback = [this](const std::vector<iovec> &outputIovs) {
-      channel_->writeMessage(outputIovs);
-    };
-
-    server_->handleRequest(std::move(msgReader), callback);
-  }
-
-  LOG(INFO) << "worker begin to finish, peer = " << channel_->getPeerName();
-  delete this;
-}
-
-/**
- * @brief start one tcp connection to tcp server
- * @param[in] serverAddr  tcp server ip
- * @param[in] serverPort  tcp server port
- *
- * @note each object contains one channel which accept byte stream
- */
-void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
-  struct sockaddr_in serv_addr;
-  struct hostent *server;
-
-  int errRet;  // temp for gethostbyname_r
-
-  /// Create a socket point
-  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(sockfd >= 0) << "ERROR opening socket";
-
-#if defined(__OSX__) || defined(__APPLE__)
-  server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
-  CHECK_NE(HOST_NOT_FOUND, errRet) << "ERROR, no such host: " << serverAddr
-                                   << " ret = " << errRet;
-  CHECK(server) << "getipnodebyname error!";
-#else
-  struct hostent hostinfo;
-  char buf[1024];  // temp for gethostbyname_r
-  CHECK_EQ(
-      0,
-      gethostbyname_r(
-          serverAddr.c_str(), &hostinfo, buf, sizeof(buf), &server, &errRet))
-      << "ERROR, no such host: " << serverAddr << " ret = " << errRet;
-  CHECK(server) << "gethostbyname_r error!";
-#endif
-
-  bzero((char *)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  bcopy((char *)server->h_addr,
-        (char *)&serv_addr.sin_addr.s_addr,
-        server->h_length);
-  serv_addr.sin_port = htons(serverPort);
-
-  setOption(sockfd);
-
-  /// Now connect to the server
-  int retry_count = 0;
-  do {
-    if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) {
-      break;
-    }
-
-    if (errno == ECONNREFUSED) {
-      LOG(WARNING) << "connection refused by pserver, try again!";
-      if (retry_count++ >= 7) {
-        LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
-      }
-      std::this_thread::sleep_for(std::chrono::seconds(1));
-    } else {
-      CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
-                        << serverPort << "errorno: " << errno;
-    }
-  } while (errno == ECONNREFUSED);
-
-  channel_.reset(new SocketChannel(sockfd, serverAddr));
-  tcpRdma_ = F_TCP;
-}
-
-/**
- * @brief start one RDMA connection to rdma server
- * @param[in] serverAddr  rdma server ip
- * @param[in] serverPort  rdma server port
- *
- * @note  each object contains one channel which accept byte stream
- *        for rdma, low level sock also provide byte stream api.
- */
-void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) {
-  struct sxi_sock *sock;
-
-  std::stringstream ss;
-  ss << serverPort;
-
-  std::string rdmaUri = "rdma://" + serverAddr + ":" + ss.str();
-
-  RdmaClientDaemons *daemons = RdmaClientDaemons::daemons_->get();
-  socketDaemon_ = daemons->selectDaemon();
-
-  /// connect to server with socket daemon
-  sock = rdma::connect(socketDaemon_, rdmaUri.c_str());
-  CHECK(sock) << "ERROR connect to server" << rdmaUri;
-
-  std::vector<std::string> seg;
-  str::split(rdmaUri, '/', &seg);
-  std::string server = seg.at(seg.size() - 1);
-  channel_.reset(new SocketChannel(sock, server));
-  tcpRdma_ = F_RDMA;
-}
-
-/**
- * @brief class constructor
- * @param[in] serverAddr pserver ip address
- * @param[in] serverPort pserver port
- * @param[in] ChannelType F_TCP or F_RDMA
- *
- * @note  responsible for building one connection to specified pserver port
- */
-SocketClient::SocketClient(const std::string &serverAddr,
-                           int serverPort,
-                           enum ChannelType channelType) {
-  if (channelType == F_RDMA)
-    RdmaClient(serverAddr, serverPort);
-  else
-    TcpClient(serverAddr, serverPort);
-}
-
-}  // namespace paddle
diff --git a/paddle/pserver/LightNetwork.h b/paddle/pserver/LightNetwork.h
deleted file mode 100644
index 2aaa26a5c708f9c01f006136619f599bcfe0db71..0000000000000000000000000000000000000000
--- a/paddle/pserver/LightNetwork.h
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "SocketChannel.h"
-
-#include <atomic>
-#include <memory>
-#include <thread>
-#include <vector>
-
-#include "paddle/utils/Thread.h"
-
-struct sxi_socket;
-
-namespace paddle {
-
-class SocketWorker;
-
-/**
- * @brief class for holding all parameters processing for current port
- *
- * @note  each parameter server inherits from one socket server, each
- *        server contains serveral woker threads which are to parallelize
- *        the processing of computation, but share some common datas stored
- *        in child class of socketserver.
- */
-class SocketServer : public Thread {
-  // rdmaCpu controls the cpu affinity of RDMA server daemon,
-  // which could benifit performance. rdmaCpu = -1 means TCP
-  // is used instead of RDMA transport.
-public:
-  SocketServer(const std::string& addr, int port, int rdmaCpu);
-  ~SocketServer();
-
-  virtual void run();
-
-  typedef std::function<void(const std::vector<iovec>& outputIovs)>
-      ResponseCallback;
-
-protected:
-  //
-  // The derived class needs to implement this function
-  // to handle the request received by SocketWorker
-  // The request is encapsulated by MsgReader, which contains
-  // a set of blocks.
-  virtual void handleRequest(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback) = 0;
-
-  std::unique_ptr<SocketChannel> createChannel(int sock,
-                                               const std::string& peerName) {
-    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
-  }
-  std::unique_ptr<SocketChannel> createChannel(struct sxi_sock* sock,
-                                               const std::string& peerName) {
-    return std::unique_ptr<SocketChannel>(new SocketChannel(sock, peerName));
-  }
-
-  friend class SocketWorker;
-
-private:
-  void rdmaServer();
-  void tcpServer();
-
-  void detach() {}  // detach accept thread is forbidden
-
-protected:
-  enum ChannelType tcpRdma_;
-  // for rdma
-  int rdmaCpu_;
-  std::string rdmaUri_;
-  sxi_socket* rdmaSocket_;
-  // for tcp
-  int port_;
-  std::string addr_;
-  int socket_;
-  int maxPendingConnections_;
-  bool stopping_;
-};
-
-/**
- * @brief class for holding one connection from one trainer
- *
- * @note  all parameter processing will run in the context of this worker
- */
-class SocketWorker : public Thread {
-public:
-  SocketWorker(std::unique_ptr<SocketChannel>&& channel, SocketServer* server)
-      : channel_(std::move(channel)), server_(server) {}
-
-  virtual ~SocketWorker() {}
-
-  virtual void run();
-
-protected:
-  std::unique_ptr<SocketChannel> channel_;
-  SocketServer* server_;
-  enum ChannelType tcpRdma_;
-};
-
-/**
- * @brief class for providing rdma client deamon thread
- *
- * @note  the deamons are required by sock like rdam library. Here
- *        use singleton model for daemons. Each deamon hosts in
- *        single cpu core for better load balance performance
- */
-class RdmaClientDaemons {
-private:
-  RdmaClientDaemons();
-
-  static std::unique_ptr<RdmaClientDaemons> daemons_;
-
-public:
-  static RdmaClientDaemons* get() {
-    std::call_once(RdmaClientDaemons::initDataFlag_,
-                   &RdmaClientDaemons::getInstance);
-
-    return daemons_.get();
-  }
-
-  struct sxi_socket* selectDaemon() {
-    int cpu = curCpu_;
-    curCpu_ = (curCpu_ + 1) % onlineCpus_;
-
-    LOG(INFO) << "select daemon " << cpu << "onlineCpus_ " << onlineCpus_;
-    return rdmaClientSocket_[cpu];
-  }
-
-  ~RdmaClientDaemons();
-
-public:
-  friend class SocketClient;
-
-private:
-  static std::once_flag initDataFlag_;
-  static void getInstance() {
-    if (!daemons_.get()) daemons_.reset(new RdmaClientDaemons());
-  }
-
-  std::vector<struct sxi_socket*> rdmaClientSocket_;
-  std::atomic<int> curCpu_;
-  int onlineCpus_;
-};
-
-/**
- * @brief management for client connection which are from trainers
- *
- * @note  it contains one channel descriptor which used to write and
- *        read data
- */
-class SocketClient {
-public:
-  SocketClient(const std::string& serverAddr,
-               int serverPort,
-               enum ChannelType channelType);
-
-  SocketChannel* getChannel() { return channel_.get(); }
-
-protected:
-  std::unique_ptr<SocketChannel> channel_;
-  struct sxi_socket* socketDaemon_;
-  enum ChannelType tcpRdma_;
-
-private:
-  void RdmaClient(const std::string& serverAddr, int serverPort);
-  void TcpClient(const std::string& serverAddr, int serverPort);
-};
-
-std::string getIpAddr(std::string& device);
-void setOption(int sockfd);
-
-}  // namespace paddle
diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp
deleted file mode 100644
index 43e4902b0f0f73840624041f19ba7f4eb9a45844..0000000000000000000000000000000000000000
--- a/paddle/pserver/ParameterClient2.cpp
+++ /dev/null
@@ -1,781 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-
-#include "ParameterClient2.h"
-#include "paddle/math/SparseRowMatrix.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/StringUtil.h"
-
-DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
-DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
-
-namespace paddle {
-
-template <typename T1, typename T2>
-void copyToRepeatedField(google::protobuf::RepeatedField<T1>* dest,
-                         const T2* src,
-                         size_t size) {
-  dest->Clear();
-  dest->Reserve(size);
-  for (size_t i = 0; i < size; ++i) {
-    dest->AddAlreadyReserved(src[i]);
-  }
-}
-
-ParameterClient2::ParameterClient2(bool separate, int port, int numPorts)
-    : BaseClient(separate, numPorts), port_(port) {
-#ifndef PADDLE_DISABLE_TIMER
-  forwardbackwordTime_ = 0;
-#endif
-}
-
-int ParameterClient2::calcParameterBlockSize(
-    const std::vector<ParameterPtr>& parameters, size_t serviceNum) {
-  size_t totalSize = 0;
-  for (auto& para : parameters) {
-    totalSize += para->getSize();
-  }
-  size_t perServerSize = totalSize / serviceNum;
-
-  int sizeBits = 64 - __builtin_clzl(perServerSize);
-
-  /// 2^10 is min block size
-  /// 2^7 will be max number of blocks in one pserver
-  int blockSizeBits = std::max((sizeBits - 7), 10);
-  return 1 << blockSizeBits;
-}
-
-void ParameterClient2::initThreads() {
-  threadNum_ = serviceNum_;
-  if (FLAGS_parallel_thread_num > 1) {
-    LOG(INFO) << "parallel_thread_num dosent need to set";
-  }
-  syncThreadPool_.reset(new SyncThreadPool(threadNum_));
-  startThreads();
-}
-
-bool ParameterClient2::init(const std::vector<ParameterPtr>& parameters) {
-  destroy();
-
-  std::vector<std::string> hosts;
-  str::split(FLAGS_pservers, ',', &hosts);
-  serviceNum_ = hosts.size() * numPorts_;
-  uint64_t denseBlockSize = calcParameterBlockSize(parameters, serviceNum_);
-
-  /// setup prefetch matrix if exists
-  for (auto& para : parameters) {
-    /// set block size for each parameter
-    para->getConfig().set_parameter_block_size(
-        para->getConfig().sparse_remote_update() ? para->getConfig().dims(1)
-                                                 : denseBlockSize);
-  }
-
-  for (auto& para : parameters) {
-    CHECK_NE(-1UL, para->getID()) << "id in parameter is not initialized";
-    parameterMap_[para->getID()] = para;
-  }
-
-  allSegments_.reserve(parameters.size());
-
-  for (auto& para : parameters) {
-    ParameterSegments segments;
-    segments.name = para->getName();
-    segments.id = para->getID();
-    allSegments_.push_back(segments);
-    if (para->getConfig().sparse_remote_update()) {
-      CHECK_EQ(para->getConfig().parameter_block_size(),
-               para->getConfig().dims(1))
-          << "For sparse remote update parameter,"
-          << " block size is the width of each row.";
-    }
-  }
-
-  /// init clients
-  clients_.reserve(serviceNum_);
-  recvDataMems_.resize(serviceNum_);
-
-  for (size_t i = 0; i < hosts.size(); ++i) {
-    for (int j = 0; j < numPorts_; ++j) {
-      LOG(INFO) << "pserver " << i * numPorts_ + j << " " << hosts[i] << ":"
-                << port_ + j;
-      if (FLAGS_rdma_tcp == "rdma") {
-        clients_.emplace_back(hosts[i], port_ + j, F_RDMA);
-      } else {
-        clients_.emplace_back(hosts[i], port_ + j, F_TCP);
-      }
-    }
-  }
-
-  sparseDistribution_.reset(new SparseParameterDistribution(serviceNum_));
-
-  sleep(2);
-
-  initThreads();
-
-  return true;
-}
-
-ParameterClient2::~ParameterClient2() { destroy(); }
-
-void ParameterClient2::destroy() {
-  if (clients_.empty()) {
-    /// this means not initialized.
-    return;
-  }
-  finishThreads();
-
-  parameterMap_.clear();
-  allSegments_.clear();
-  clients_.clear();
-}
-
-void ParameterClient2::sendParallel(int tid,
-                                    size_t numThreads,
-                                    ParameterType recvParameterType) {
-  int numMyClients = divup(serviceNum_ - tid, numThreads);
-
-  for (int j = 0; j < numMyClients; ++j) {
-    REGISTER_TIMER("client_sendAndRecv_send");
-    int i = numThreads * j + tid;
-    /// Try to make different clients to send data to different pservers
-    /// at the same time so that they will not flood data to the same
-    /// pserver.
-    i = calcClientId(i, serviceNum_);
-    clients_[i].send("sendParameter",
-                     sendJob_.parallelRequests[i],
-                     sendJob_.parallelInputIovs[i]);
-
-    /// clear large structure
-    sendJob_.parallelRequests[i].Clear();
-    sendJob_.parallelInputIovs[i].clear();
-  }
-
-  std::vector<void*> bufs;
-  SendParameterResponse response;
-  for (int j = 0; j < numMyClients; ++j) {
-    REGISTER_TIMER("client_sendAndRecv_recv");
-    int i = numThreads * j + tid;
-    i = calcClientId(i, serviceNum_);
-    auto msgReader = clients_[i].recv(&response);
-    CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
-    bufs.clear();
-    bufs.reserve(response.blocks_size());
-    for (auto& block : response.blocks()) {
-      auto it = parameterMap_.find(block.para_id());
-      CHECK(it != parameterMap_.end());
-      Parameter* parameter = it->second.get();
-      real* buf = nullptr;
-      if (parameter->getBuf(recvParameterType)) {
-        buf = parameter->getBuf(recvParameterType)->getPoint(block.begin_pos());
-      } else {
-        auto recvMat = dynamic_cast<SparseRowCpuMatrix*>(
-            parameter->getMat(recvParameterType).get());
-        CHECK(recvMat);
-        size_t width = parameter->getConfig().dims(1);
-        // TODO(wuyi): need add lock here? may also cause resize.
-        buf = recvMat->getLocalRow(block.begin_pos() / width);
-      }
-      /// sparse_id is not useful while receiving data since sparse data
-      /// storage is continuous, do commit recieved data as that of dense.
-      bufs.push_back(buf);
-    }
-    msgReader->readBlocks(bufs);
-  }
-}
-
-void ParameterClient2::prepareSendData(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    ParameterType sendBackParameterType,
-    BatchStatus batchStatus,
-    SendJob* sendJob) {
-  sendJob->parallelRequests.resize(serviceNum_);
-  sendJob->parallelInputIovs.resize(serviceNum_);
-
-  for (auto& request : sendJob->parallelRequests) {
-#ifndef PADDLE_DISABLE_TIMER
-    if (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT) {
-      request.set_forwardbackward_time(forwardbackwordTime_);
-    }
-#endif
-    request.set_trainer_id(trainerId_);
-    request.set_update_mode(updateMode);
-    request.set_send_back_parameter(sendBackParameter);
-    request.set_send_back_parameter_type(sendBackParameterType);
-    request.set_num_samples(numSamples);
-    request.set_cost(cost);
-    request.set_batch_status(batchStatus);
-    CHECK_EQ(request.blocks_size(), 0);
-    VLOG(10) << "request: trainer_id: " << request.trainer_id()
-             << " update_mode" << request.update_mode()
-             << " send_back_parameter: " << request.send_back_parameter()
-             << " send_back_parameter_type: "
-             << request.send_back_parameter_type()
-             << " num_samples: " << request.num_samples()
-             << " cost: " << request.cost()
-             << " batch_status: " << request.batch_status();
-  }
-  for (const auto& segments : parameterSegments) {
-    const auto it = parameterMap_.find(segments.id);
-    CHECK(it != parameterMap_.end());
-    Parameter* parameter = it->second.get();
-    CHECK(parameter != nullptr) << "parameter is nullptr";
-    int64_t nameHash = std::hash<std::string>()(segments.name);
-    bool sendingPara = !(updateMode == PSERVER_UPDATE_MODE_GET_PARAM ||
-                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE ||
-                         updateMode == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
-    bool sparseUpdate = parameter->getConfig().sparse_remote_update() &&
-                        (updateMode == PSERVER_UPDATE_MODE_ADD_GRADIENT ||
-                         updateMode == PSERVER_UPDATE_MODE_ASYNC_SGD ||
-                         updateMode == PSERVER_UPDATE_MODE_GET_PARAM_SPARSE);
-
-    const auto blockSize = parameter->getConfig().parameter_block_size();
-    CHECK_GE(blockSize, 1LU) << "blockSize should > 0 " << blockSize;
-    const auto paraSize = parameter->getSize();
-    if (sparseUpdate) {
-      auto prefetchMat = std::dynamic_pointer_cast<SparsePrefetchRowCpuMatrix>(
-          parameter->getMat(PARAMETER_VALUE));
-      CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr";
-      auto sendMat = dynamic_cast<SparseRowCpuMatrix*>(
-          parameter->getMat(parameterType).get());
-      CHECK(sendMat != nullptr) << "sendMat is nullptr";
-
-      syncThreadPool_->exec([&](int tid, size_t numThreads) {
-        std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
-        const auto& localIndices = prefetchMat->getLocalIndices();
-        /// num of sparse rows
-        size_t nLocalBlocks = localIndices.size();
-        uint64_t beginDim = 0;
-        uint64_t endDim = 0;
-
-        // HACK(typhoonzero): let it resize first
-        prefetchMat->getLocalRow(nLocalBlocks);
-        sendMat->getLocalRow(nLocalBlocks);
-
-        for (size_t row = 0; row < nLocalBlocks; ++row) {
-          int64_t blockId = localIndices[row];  // local row -> sparse row
-          int serverId = std::abs((blockId + nameHash) % serviceNum_);
-          if (serverId % numThreads != (size_t)tid) {
-            continue;
-          }
-
-          beginDim = blockId * blockSize;
-          endDim = std::min<int64_t>(beginDim + blockSize, paraSize);
-
-          auto& request = sendJob->parallelRequests[serverId];
-          ParameterBlock* block = request.add_blocks();
-          block->set_para_id(segments.id);
-          /// global sparse row id
-          block->set_block_id(blockId);
-          /// local row offset
-          block->set_begin_pos(row * blockSize);
-          /// block len
-          block->set_block_size(endDim - beginDim);
-          if (sendingPara) {
-            sendJob->parallelInputIovs[serverId].push_back(
-                {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});
-            /// detect sparse parameter distribution
-            sparseDistribution_->probeDistribution(serverId,
-                                                   sizeof(real) * blockSize);
-          }
-        }
-      });
-
-    } else {  /// parameter set for dense and sparse
-      real* buf =
-          sendingPara ? parameter->getBuf(parameterType)->getPoint(0) : nullptr;
-      uint64_t endDim = 0;
-      for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) {
-        endDim = std::min<int64_t>(beginDim + blockSize, paraSize);
-        int64_t blockId = beginDim / blockSize;
-        int serverId = std::abs((blockId + nameHash) % serviceNum_);
-
-        auto& request = sendJob->parallelRequests[serverId];
-        ParameterBlock* block = request.add_blocks();
-        block->set_para_id(segments.id);
-        block->set_block_id(blockId);
-        block->set_begin_pos(beginDim);
-        block->set_block_size(endDim - beginDim);
-        if (buf) {
-          sendJob->parallelInputIovs[serverId].push_back(
-              {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))});
-        }
-      }
-    }
-  }  // parameterSegments
-
-  sparseDistribution_->checkAndResetDistribution();
-}
-
-void ParameterClient2::sendAndReceiveParameter(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    ParameterType sendBackParameterType,
-    ParameterType recvParameterType) {
-  prepareSendData(updateMode,
-                  parameterType,
-                  parameterSegments,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  sendBackParameterType,
-                  /*batchStatus = */ BATCH_START_AND_FINISH,
-                  &sendJob_);
-
-  syncThreadPool_->exec([&](int tid, size_t numThreads) {
-    this->sendParallel(tid, numThreads, recvParameterType);
-  });
-}
-
-void ParameterClient2::sendParameter(
-    ParameterUpdateMode updateMode,
-    ParameterType parameterType,
-    const std::vector<ParameterSegments>& parameterSegments,
-    int64_t numSamples,
-    real cost,
-    bool sendBackParameter,
-    BatchStatus batchStatus) {
-  SendJobPtr sendJob = std::make_shared<SendJob>();
-  prepareSendData(updateMode,
-                  parameterType,
-                  parameterSegments,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  PARAMETER_VALUE,
-                  batchStatus,
-                  sendJob.get());
-
-  for (int i = 0; i < threadNum_; i++) {
-    sendJobQueue_[i]->enqueue(sendJob);
-  }
-}
-
-void ParameterClient2::recvParameter() { recvSyncBarrier_->wait(); }
-
-void ParameterClient2::send(int threadId) {
-  int index = threadId;
-  LOG(INFO) << "send thread " << threadId << " started";
-  int numMyClients = divup(serviceNum_ - index, threadNum_);
-  while (true) {
-    SendJobPtr recvJob = sendJobQueue_[index]->dequeue();
-    if (stopping_) {
-      recvJobQueue_[index]->enqueue(recvJob);
-      break;
-    }
-    for (int j = 0; j < numMyClients; ++j) {
-      REGISTER_TIMER("client_send");
-      int i = threadNum_ * j + index;
-      /// Try to make different clients to send data to different pservers
-      /// at the same time so that they will not flood data to the same
-      /// pserver.
-      i = calcClientId(i, serviceNum_);
-      if (recvJob->parallelRequests.size()) {
-        clients_[i].send("sendParameter",
-                         recvJob->parallelRequests[i],
-                         recvJob->parallelInputIovs[i]);
-      } else {
-        clients_[i].send("sendData",
-                         recvJob->parallelDataRequests[i],
-                         recvJob->parallelInputIovs[i]);
-      }
-    }
-    recvJobQueue_[index]->enqueue(recvJob);
-  }
-}
-
-void ParameterClient2::recv(int threadId) {
-  LOG(INFO) << "recv thread " << threadId << " started";
-  int index = threadId;
-  int numMyClients = divup(serviceNum_ - index, threadNum_);
-  while (true) {
-    std::vector<void*> bufs;
-    SendParameterResponse response;
-    SendDataResponse dataResponse;
-    SendJobPtr recvJob = recvJobQueue_[index]->dequeue();
-    if (stopping_) break;
-    for (int j = 0; j < numMyClients; ++j) {
-      REGISTER_TIMER("client_recv");
-      int i = threadNum_ * j + index;
-      i = calcClientId(i, serviceNum_);
-      if (recvJob->parallelRequests.size()) {
-        auto msgReader = clients_[i].recv(&response);
-        CHECK_EQ(msgReader->getNumBlocks(), (size_t)response.blocks_size());
-        bufs.clear();
-        bufs.reserve(response.blocks_size());
-        for (auto& block : response.blocks()) {
-          auto it = parameterMap_.find(block.para_id());
-          CHECK(it != parameterMap_.end());
-          Parameter* parameter = it->second.get();
-          real* buf =
-              parameter->getBuf(PARAMETER_VALUE)->getPoint(block.begin_pos());
-          CHECK_EQ(msgReader->getBlockLength(bufs.size()),
-                   sizeof(real) * (block.block_size()));
-          bufs.push_back(buf);
-        }
-        msgReader->readBlocks(bufs);
-      } else {
-        auto msgReader = clients_[i].recv(&dataResponse);
-        CHECK_EQ(msgReader->getNumBlocks(), (size_t)dataResponse.blocks_size());
-        size_t totalLen = msgReader->getTotalLength();
-        if (0 == totalLen) {
-          continue;
-        }
-        auto& recvMem = recvDataMems_[dataResponse.server_id()];
-        CHECK_EQ(dataResponse.blocks_size(), 1)
-            << "Only one block currently support now!";
-        auto& block = dataResponse.blocks(0);
-        CHECK_EQ(totalLen % sizeof(block.data_size()), 0U);
-        recvMem = std::make_shared<CpuMemoryHandle>(totalLen);
-        msgReader->readNextBlock(recvMem.get()->getBuf());
-      }
-    }
-    recvSyncBarrier_->wait();
-  }
-}
-
-void ParameterClient2::waitPassStart() {
-  WaitPassStartRequest request;
-  std::vector<WaitPassStartResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::waitPassFinish() {
-  WaitPassFinishRequest request;
-  std::vector<WaitPassFinishResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::synchronize(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::asyncFinishPass(SyncObject syncObjectId) {
-  SynchronizeRequest request;
-  request.set_sync_object_id(syncObjectId);
-  request.set_trainer_id(trainerId_);
-  std::vector<SynchronizeResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::setConfig(const OptimizationConfig& optConfig,
-                                 const std::string& saveDir,
-                                 bool isSparseServer) {
-  SetConfigRequest request;
-  std::vector<SetConfigResponse> responses;
-
-  for (auto& nameAndPara : parameterMap_) {
-    *request.add_param_configs() = nameAndPara.second->getConfig();
-  }
-
-  *request.mutable_opt_config() = optConfig;
-  request.set_save_dir(saveDir);
-  request.set_is_sparse_server(isSparseServer);
-
-  std::vector<SetConfigRequest> requests;
-  requests.resize(clients_.size());
-  for (size_t i = 0; i < requests.size(); ++i) {
-    requests[i].CopyFrom(request);
-    requests[i].set_server_id(i);
-  }
-
-  responses.resize(clients_.size());
-  size_t numClients = clients_.size();
-  for (size_t i = 0; i < numClients; ++i) {
-    clients_[i].send(__func__, requests[i]);
-  }
-  for (size_t i = 0; i < numClients; ++i) {
-    clients_[i].recv(&responses[i]);
-  }
-}
-
-bool ParameterClient2::inStatus(PServerStatus status) {
-  GetStatusRequest request;
-  std::vector<GetStatusResponse> responses;
-
-  bool ok = true;
-  multiCall("getStatus", request, &responses);
-  for (auto& response : responses) {
-    if (response.status() != status) {
-      ok = false;
-    }
-  }
-
-  return ok;
-}
-
-void ParameterClient2::setStatus(PServerStatus status) {
-  SetStatusRequest request;
-  request.set_status(status);
-  std::vector<SetStatusResponse> responses;
-  multiCall(__func__, request, &responses);
-}
-
-void ParameterClient2::waitForStatus(PServerStatus status) {
-  while (!inStatus(status)) {
-    sleep(1);
-  }
-}
-
-template <typename Proto>
-static void validateResponses(const std::vector<Proto>& responses) {
-  for (auto& response : responses) {
-    CHECK(response.return_message().empty())
-        << "client" << &response - &responses[0]
-        << " error:" << response.return_message();
-  }
-}
-
-PServerVector ParameterClient2::createVector() {
-  CreateVectorRequest request;
-  std::vector<CreateVectorResponse> responses;
-  int64_t handle = -1;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-
-  for (auto& response : responses) {
-    if (handle == -1) {
-      handle = response.handle();
-    } else {
-      CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client"
-                                          << &response - &responses[0] << " "
-                                          << handle << " " << response.handle();
-    }
-  }
-  return PServerVector{handle};
-}
-
-void ParameterClient2::releaseVector(PServerVector handle) {
-  ReleaseVectorRequest request;
-  std::vector<ReleaseVectorResponse> responses;
-
-  request.set_handle(handle.handle);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-PServerMatrix ParameterClient2::createMatrix(int32_t numCols) {
-  CreateMatrixRequest request;
-  std::vector<CreateMatrixResponse> responses;
-  int64_t handle = -1;
-
-  request.set_num_cols(numCols);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-
-  for (auto& response : responses) {
-    if (handle == -1) {
-      handle = response.handle();
-    } else {
-      CHECK_EQ(handle, response.handle()) << "Inconsistent handle from client"
-                                          << &response - &responses[0] << " "
-                                          << handle << " " << response.handle();
-    }
-  }
-  return PServerMatrix{handle};
-}
-
-void ParameterClient2::releaseMatrix(PServerMatrix handle) {
-  ReleaseMatrixRequest request;
-  std::vector<ReleaseMatrixResponse> responses;
-
-  request.set_handle(handle.handle);
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-void PreparedOperations::addOperationHelper(Operation* op, CpuVectorPtr vec) {
-  ProtoVector& pvec = *op->add_vectors();
-  size_t dim = vec->getSize();
-  pvec.set_dim(dim);
-  copyToRepeatedField(pvec.mutable_values(), vec->getData(), vec->getSize());
-}
-
-void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) {
-  ProtoMatrix& pmat = *op->add_matrices();
-  pmat.set_num_cols(mat->getWidth());
-  pmat.set_num_rows(mat->getHeight());
-  copyToRepeatedField(
-      pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows());
-}
-
-static inline real addTwo(real a, double b) { return a + b; }
-
-void ParameterClient2::doOperation(PreparedOperations& ops,
-                                   bool waitForGradient,
-                                   bool sendBackGradient,
-                                   bool releasePass) {
-  std::vector<DoOperationResponse> responses;
-  ops.request_.set_wait_for_gradient(waitForGradient);
-  ops.request_.set_send_back_parameter(sendBackGradient);
-  ops.request_.set_release_pass(releasePass);
-  multiCall(__func__, ops.request_, &responses);
-  validateResponses(responses);
-  size_t numPassFinishServers = 0;
-
-  size_t numOps = ops.request_.operations_size();
-  for (auto& response : responses) {
-    numPassFinishServers += response.pass_finish();
-    CHECK_EQ(numOps, (size_t)response.results_size());
-    for (size_t opId = 0; opId < numOps; ++opId) {
-      const OperationResult& result = response.results(opId);
-      std::vector<real*>& resultScalars = ops.localResults_[opId].resultScalars;
-      std::vector<CpuVectorPtr>& resultVectors =
-          ops.localResults_[opId].resultVectors;
-      std::vector<CpuMatrixPtr>& resultMatrices =
-          ops.localResults_[opId].resultMatrices;
-
-      if (&response == &responses[0]) {
-        /// Initialize results to zero
-
-        resultScalars.resize(result.scalars_size());
-        for (auto p : resultScalars) {
-          if (!p) continue;
-          *p = 0;
-        }
-        size_t numVectors = result.vectors_size();
-        resultVectors.resize(numVectors);
-        for (size_t i = 0; i < numVectors; ++i) {
-          if (!resultVectors[i]) continue;
-          resultVectors[i]->resize(result.vectors(i).dim());
-          resultVectors[i]->zeroMem();
-        }
-        size_t numMatrices = result.matrices_size();
-        resultMatrices.resize(numMatrices);
-        for (size_t i = 0; i < numMatrices; ++i) {
-          if (!resultMatrices[i]) continue;
-          resultMatrices[i]->resize(result.matrices(i).num_rows(),
-                                    result.matrices(i).num_cols());
-          resultMatrices[i]->zeroMem();
-        }
-      }
-
-      // aggregate results from each pserver to results
-
-      CHECK_EQ(resultScalars.size(), (size_t)result.scalars_size());
-      for (ssize_t i = 0; i < result.scalars_size(); ++i) {
-        real* rscalar = resultScalars[i];
-        if (!rscalar) continue;
-        *rscalar += result.scalars(i);
-      }
-
-      CHECK_EQ(resultVectors.size(), (size_t)result.vectors_size());
-      for (auto& vec : result.vectors()) {
-        int i = &vec - &result.vectors(0);
-        CpuVectorPtr rvec = resultVectors[i];
-        if (!rvec) continue;
-        CHECK_EQ(rvec->getSize(), (size_t)vec.dim());
-        std::transform(rvec->getData(),
-                       rvec->getData() + rvec->getSize(),
-                       vec.values().data(),
-                       rvec->getData(),
-                       addTwo);
-      }
-
-      CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size());
-      for (auto& mat : result.matrices()) {
-        int i = &mat - &result.matrices(0);
-        CpuMatrixPtr rmat = resultMatrices[i];
-        if (!rmat) continue;
-        CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows());
-        CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols());
-
-        std::transform(rmat->getData(),
-                       rmat->getData() + rmat->getElementCnt(),
-                       mat.values().data(),
-                       rmat->getData(),
-                       addTwo);
-      }
-    }
-  }
-  passFinish_ = numPassFinishServers == clients_.size();
-}
-
-real ParameterClient2::vectorDotProduct(PServerVector u, PServerVector v) {
-  real result = 0.0;
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_utv, u, v)(&result);
-  doOperation(ops, false, false);
-  return result;
-}
-
-void ParameterClient2::vectorScale(PServerVector u, real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au, u, a);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorCopy(PServerVector src, PServerVector dst) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_COPY, src, dst);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)1);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorAddMultInto(PServerVector u,
-                                         PServerVector v,
-                                         PServerVector w,
-                                         real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::vectorScaleInto(PServerVector u,
-                                       PServerVector v,
-                                       real a) {
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0);
-  doOperation(ops, false, false);
-}
-
-void ParameterClient2::loadValueVector(const std::string& dirName) {
-  LoadValueRequest request;
-  request.set_dir_name(dirName);
-  std::vector<LoadValueResponse> responses;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-void ParameterClient2::saveValueVector(const std::string& dirName) {
-  SaveValueRequest request;
-  request.set_dir_name(dirName);
-  std::vector<SaveValueResponse> responses;
-
-  multiCall(__func__, request, &responses);
-  validateResponses(responses);
-}
-
-}  // namespace paddle
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
deleted file mode 100644
index d63273ccbc8ed30d9df50d9f8b1a4d1e4fba6720..0000000000000000000000000000000000000000
--- a/paddle/pserver/ParameterClient2.h
+++ /dev/null
@@ -1,602 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-#include <mutex>
-#include <unordered_map>
-#include <vector>
-
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/pserver/BaseClient.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/Queue.h"
-#include "paddle/utils/Util.h"
-
-#include "ParameterService.pb.h"
-
-#include "ProtoServer.h"
-#include "SparseParameterDistribution.h"
-
-DECLARE_int32(parallel_thread_num);
-
-namespace paddle {
-
-struct PServerMatrix {
-  int64_t handle;
-};
-
-struct PServerVector {
-  int64_t handle;
-};
-
-/**
- * @brief A class to help to prepare server-side operations.
- */
-class PreparedOperations {
-protected:
-  class ResultsAdder;
-  struct LocalOperationResult;
-
-public:
-  /**
-   * Offers an easy way to prepare operations that will be performed on
-   * server-side.
-   *
-   * Usage:
-   * @code
-   *   addOperation(optype, arguments...)(results...)
-   * @endcode
-   *
-   * Examples:
-   * 1. set pserver vector to 1:
-   * @code
-   *   PServerVector u = parameterClient.createVector();
-   *   addOperation(PSERVER_OP_RESET, u, (real)1);
-   * @endcode
-   *
-   * 2. Compute inner product of to pserver vectors.
-   * @code
-   *   PServerVector u = parameterClient.createVector();
-   *   PServerVector v = parameterClient.createVector();
-   *   real result;
-   *   addOperation(PSERVER_OP_utv, u, v)(&result)
-   * @endcode
-   *
-   * @param[in] operation The operation that pserver will perform.
-   * @param[in] args Argument list of the operation
-   * @return A ResultsAdder object initialized with the last element of
-   *         localResults_.
-   */
-  template <typename... Args>
-  ResultsAdder addOperation(MatrixVectorOperation operation, Args... args) {
-    Operation* op = request_.add_operations();
-    op->set_operation(operation);
-    localResults_.emplace_back();
-    addOperationHelper(op, args...);
-    return ResultsAdder(&localResults_.back());
-  }
-
-protected:
-  void addOperationHelper(Operation* op) {}
-
-  /**
-   * @brief Helper function to add an new operation that takes a PServerVector
-   *        as an operand.
-   */
-  void addOperationHelper(Operation* op, PServerVector arg) {
-    op->add_pvectors(arg.handle);
-  }
-
-  /**
-   * @brief Helper function to add an new operation that takes a PServerMatrix
-   *        as an operand.
-   */
-  void addOperationHelper(Operation* op, PServerMatrix arg) {
-    op->add_pmatrices(arg.handle);
-  }
-
-  /**
-   * @brief Helper function to add an new operation that takes a real valued
-   *        scalar as an operand.
-   */
-  void addOperationHelper(Operation* op, real arg) { op->add_scalars(arg); }
-
-  /**
-   * @brief Helper function to add an new operation that takes a CpuVectorPtr
-   *        as an operand.
-   * @note The array of CpuVectors that arg points to will be copied to
-   *       op's vectors field.
-   */
-  void addOperationHelper(Operation* op, CpuVectorPtr arg);
-
-  /**
-   * @brief Helper function to add an new operation that takes a CpuMatrixPtr
-   *        as an operand.
-   * @note The array of CpuMatrixs that arg points to will be copied to
-   *       op's matrices field.
-   */
-  void addOperationHelper(Operation* op, CpuMatrixPtr arg);
-
-  /**
-   * @brief Helper function to add an new operation and prepare the operands.
-   *
-   * @tparam Arg An operand of the operation.
-   * @tparam Args A list of rest operands of the operation.
-   * @param op Pointer to an Operation object.
-   */
-  template <typename Arg, typename... Args>
-  void addOperationHelper(Operation* op, Arg arg, Args... args) {
-    addOperationHelper(op, arg);
-    addOperationHelper(op, args...);
-  }
-
-  /**
-   * @brief ResultsAdder offers easy ways to quickly store operation results.
-   */
-  class ResultsAdder {
-  public:
-    explicit ResultsAdder(LocalOperationResult* localResult)
-        : localResult_(localResult) {}
-    template <typename... Args>
-    void operator()(Args... args) {
-      addResult(args...);
-    }
-    void addResult() {}
-    void addResult(real* arg) { localResult_->resultScalars.push_back(arg); }
-    void AddResult(CpuVectorPtr arg) {
-      localResult_->resultVectors.push_back(arg);
-    }
-    void AddResult(CpuMatrixPtr arg) {
-      localResult_->resultMatrices.push_back(arg);
-    }
-    template <typename Arg, typename... Args>
-    void addResult(Arg arg, Args... args) {
-      addResult(arg);
-      addResult(args...);
-    }
-
-  protected:
-    LocalOperationResult* localResult_;
-  };
-
-protected:
-  DoOperationRequest request_;
-  std::vector<iovec> inputIovs_;
-  struct LocalOperationResult {
-    std::vector<real*> resultScalars;
-    std::vector<CpuVectorPtr> resultVectors;
-    std::vector<CpuMatrixPtr> resultMatrices;
-  };
-  std::vector<LocalOperationResult> localResults_;
-  friend class ParameterClient2;
-};
-
-struct ParameterSegments {
-  std::string name;  // name of the parameter
-  size_t id;         // id of the parameter
-};
-
-/**
- * The client interface for parameter server. ParameterClient2 supports 2 modes
- * for managing connections to parameter servers, in the 1st mode one connection
- * is shared by 2 threads that are separately responsible for sending and
- * recieving activities, in the 2nd mode one connection is owned by only one
- * thread, and all the sending and recieving activities run in that single
- * thread.
- * TODO(yanfei):
- * Additional core idea to further optimizate pserver performance is
- * to do sync-sgd based parameter level instead of pserver level.
- * full-parallelization based parameter level for sync-sgd also can
- * sense forwardbackward computation layer-by-layer for more deeper layer
- * model.
- * Firstly, pserver can do full-parallelization on all computation based
- * parameter level instead of waiting for all gradients are finished and
- * start to send back parameters value immediately if parameter is ready
- * instead of waiting for all parameters value are ready
- * Secondly, parameter client can write back parameters to GPU instead of
- * waiting until all parameters are received to CPU host end.
- */
-class ParameterClient2 : public BaseClient {
-public:
-  /** Constructor.
-   * @param separate True if sending and recieving activities are separated
-   *                 into 2 threads, otherwise false.
-   * @param port Port number that parameter client runs on.
-   * @param numPorts Number of ports parameter clients occupies,
-   *                 numPorts * pserver number is the total number of
-   *                 connections the parameter client maintains.
-   */
-  ParameterClient2(bool separate = false,
-                   int port = FLAGS_port,
-                   int numPorts = FLAGS_ports_num);
-
-  ~ParameterClient2();
-
-  static int calcParameterBlockSize(const std::vector<ParameterPtr>& parameters,
-                                    size_t serviceNum);
-
-public:
-  bool init(const std::vector<ParameterPtr>& parameters);
-
-  /// service functions
-
-  /**
-   * @brief Sends the segments in parameter to parameter servers, then receives
-   *        the response from the servers.
-   * @param[in] updateMode Indicates how parameters should be updated on the
-   *            server side.
-   * @param[in] parameterType Type of parameter that will be sent.
-   * @param[in] segments Segments in the parameter that will be sent.
-   * @param[in] numSamples Number of samples this update is based on.
-   * @param[in] cost Cost of the batch, will be used to calculate global object
-   *            value.
-   * @param[in] sendBackParameter True if the updated parameters should be sent
-   *            back, otherwise false.
-   * @param[in] sendBackParameterType Send back parameter type on pserver,
-   *            PARAMETER_VALUE by default
-   * @param[in] recvParameterType pserver[sendBackParameterType] will be copy to
-   *            client[recvParameterType]
-   * @note Only parameterType will be sent.
-   */
-  void sendAndReceiveParameter(ParameterUpdateMode updateMode,
-                               ParameterType parameterType,
-                               const std::vector<ParameterSegments>& segments,
-                               int64_t numSamples,
-                               real cost,
-                               bool sendBackParameter,
-                               ParameterType sendBackParameterType,
-                               ParameterType recvParameterType);
-
-  /**
-   * @brief Sends all parameters to parameter servers, and receives the response
-   *        from the servers.
-   */
-  void sendAndReceiveParameter(
-      ParameterUpdateMode updateMode,
-      ParameterType parameterType,
-      int64_t numSamples,
-      real cost,
-      bool sendBackParameter,
-      ParameterType sendBackParameterType = PARAMETER_VALUE,
-      ParameterType recvParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(updateMode,
-                            parameterType,
-                            allSegments_,
-                            numSamples,
-                            cost,
-                            sendBackParameter,
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /**
-   * @brief Sends the segments in parameter to parameter servers. Each
-   *        sendParameter() must be paired with a recvParameter() in the future.
-   *        Only parameterType will be sent.
-   *
-   * @param[in] updateMode Indicates how parameters should be updated on the
-   *            server side.
-   * @param[in] parameterType Type of parameter that will be sent.
-   * @param[in] segments Segments in the parameter that will be sent.
-   * @param[in] numSamples Number of samples this update is based on.
-   * @param[in] cost Cost of the batch, will be used to calculate global object
-   *            value.
-   * @param[in] sendBackParameter True if the updated parameters should be sent
-   *            back, otherwise false.
-   * @param[in] batchStatus Status of the batch.
-   * @note This function is non-blocking. This means that parameter should
-   *       not change between this call and recvParameter()
-   */
-  void sendParameter(ParameterUpdateMode updateMode,
-                     ParameterType parameterType,
-                     const std::vector<ParameterSegments>& segments,
-                     int64_t numSamples,
-                     real cost,
-                     bool sendBackParameter,
-                     BatchStatus batchStatus);
-
-  void recvParameter();
-
-  /**
-   * Sends all parameters to parameter servers, recvParameter() have to be
-   * invoked
-   * afterwards.
-   *
-   * @note This function is non-blocking. This means that if parameter should
-   *       not changes between this call and recvParameter()
-   */
-  void sendParameter(ParameterUpdateMode updateMode,
-                     ParameterType parameterType,
-                     int64_t numSamples,
-                     real cost,
-                     bool sendBackParameter,
-                     BatchStatus batchStatus) {
-    sendParameter(updateMode,
-                  parameterType,
-                  allSegments_,
-                  numSamples,
-                  cost,
-                  sendBackParameter,
-                  batchStatus);
-  }
-
-  /// Get all parameters from parameter servers
-  void getParameter(ParameterType recvParameterType = PARAMETER_VALUE,
-                    ParameterType sendBackParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
-                            PARAMETER_VALUE,
-                            0,     // numSamples = 0
-                            0,     // cost = 0
-                            true,  // sendBackParameter = true
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /// Get parameters by sparse row ids from parameter servers
-  void getParameterSparse(
-      ParameterType recvParameterType = PARAMETER_VALUE,
-      ParameterType sendBackParameterType = PARAMETER_VALUE) {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM_SPARSE,
-                            PARAMETER_VALUE,
-                            0,     // numSamples = 0
-                            0,     // cost = 0
-                            true,  // sendBackParameter = true
-                            sendBackParameterType,
-                            recvParameterType);
-  }
-
-  /// Set all parameters on parameter servers using the local parameters
-  void setParameter() {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
-                            PARAMETER_VALUE,
-                            0,       // numSamples = 0
-                            0,       // cost = 0
-                            false);  // sendBackParameter = false
-  }
-  /**
-   * Set all parameters on parameter servers, values will be zero
-   * means do not sending local parameters
-   */
-  void setParameterZero() {
-    sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO,
-                            PARAMETER_VALUE,
-                            0,       // numSamples = 0
-                            0,       // cost = 0
-                            false);  // sendBackParameter = false
-  }
-
-  /**
-   * @brief Wait until all gradient servers start one pass.
-   *
-   * @note This is now only used by the gradient servers for "sgd"
-   *       algorithm. Calling this function means that the calling gradient
-   *       server is ready to start a new pass.
-   */
-  void waitPassStart();
-
-  /**
-   * @brief Wait until all gradient servers finish one pass.
-   *
-   * @note This is now only used by the gradient servers for "sgd" algorithm.
-   *       Calling this function means that the calling gradient server
-   *       finishes one pass.
-   */
-  void waitPassFinish();
-
-  /// Wait until all gradient servers call this function.
-  void synchronize(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  /// Called when async-sgd finish pass.
-  void asyncFinishPass(SyncObject syncObjectId = SYNC_DEFAULT);
-
-  void asyncStartPass(SyncObject syncObjectId = SYNC_DEFAULT) {
-    return synchronize(syncObjectId);
-  }
-
-  /**
-   * @brief Execute the prepared operations on pservers, fetch the results and
-   *        aggregate results from different pservers.
-   * @param[in] ops Prepared operations that will be executed on pservers.
-   * @param[in] waitForGradient If true, wait for gradient to be ready before
-   *            starting the operations.
-   * @param[in] sendBackParameter If true, send back the parameter to clients
-   *            after the operations are finished.
-   * @param[in] If true, and if all clients call waitPassFinish, signal all
-   *            clients finish the pass.
-   */
-  void doOperation(PreparedOperations& ops,
-                   bool waitForGradient,
-                   bool sendBackParameter,
-                   bool releasePass = true);
-
-  /**
-   * Set the configuration of pserver, including parameter config and
-   * optimization config
-   */
-  void setConfig(const OptimizationConfig& optConfig,
-                 const std::string& saveDir = "",
-                 bool isSparseServer = false);
-
-  /// Return true if all pservers are in the given status
-  bool inStatus(PServerStatus status);
-  bool isPassFinish() { return passFinish_; }
-
-  /// Set pserver status
-  void setStatus(PServerStatus status);
-
-  /**
-   * @brief Wait until all pservers are at status
-   * @note This function is not suitable for frequent use,
-   *       because it sleeps 1 second each time when condition is satisfied.
-   */
-  void waitForStatus(PServerStatus status);
-
-  /// Create a column vector. The size is the dimension of parameter.
-  PServerVector createVector();
-
-  /// Release the PServerVector given handle.
-  void releaseVector(PServerVector handle);
-
-  /**
-   * Create a column major matrix. The number of rows is the dimension of
-   * parameter. The number of columns is specifed by numCols.
-   */
-  PServerMatrix createMatrix(int32_t numCols);
-
-  /// Release the PServerMatrix given handle.
-  void releaseMatrix(PServerMatrix handle);
-
-  // Some basic algebra functions
-  /// Calculate the dot product of u and v
-  real vectorDotProduct(PServerVector u, PServerVector v);
-
-  /// Scale u by a
-  void vectorScale(PServerVector u, real a);
-
-  /// Copy from src to dest
-  void vectorCopy(PServerVector src, PServerVector dst);
-
-  /// u += v * a
-  void vectorAddMult(PServerVector u, PServerVector v, real a);
-
-  /// u = v + w * a
-  void vectorAddMultInto(PServerVector u,
-                         PServerVector v,
-                         PServerVector w,
-                         real a);
-  /// u = v * a
-  void vectorScaleInto(PServerVector u, PServerVector v, real a);
-
-  /// Return pserver parameter value.
-  PServerVector getPServerParameterValue() {
-    PServerVector vec;
-    vec.handle = PARAMETER_VALUE;
-    return vec;
-  }
-
-  /// Return pserver parameter gradient.
-  PServerVector getPServerParameterGradient() {
-    PServerVector vec;
-    vec.handle = PARAMETER_GRADIENT;
-    return vec;
-  }
-
-  /**
-   * Tell pservers to load value vector from file.
-   *
-   * @param[in] dirName The directory that contains the value vector file.
-   */
-  void loadValueVector(const std::string& dirName);
-
-  /// Tell pservers to save value vector to file.
-  void saveValueVector(const std::string& dirName);
-
-  void setTrainerId(int trainerId) { trainerId_ = trainerId; }
-
-#ifndef PADDLE_DISABLE_TIMER
-  void setForwardbackwardTime(uint64_t delta) { forwardbackwordTime_ = delta; }
-#endif
-
-protected:
-  template <typename ProtoIn, typename ProtoOut>
-  void multiCall(const char* funcName,
-                 const ProtoIn& request,
-                 std::vector<ProtoOut>* responses) {
-    responses->resize(clients_.size());
-    size_t numClients = clients_.size();
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].send(funcName, request);
-    }
-    for (size_t i = 0; i < numClients; ++i) {
-      clients_[i].recv(&(*responses)[i]);
-    }
-  }
-
-private:
-  void destroy();
-
-  /**
-   * @brief management function for parallelizing send/recv all connections
-   *        to all pservers. it is called under one SyncThreadPool. it
-   *        supports to use N thread to control M connections. the receiving
-   *        actions can be started until all sending action to all connections
-   *        owned by current thread are finished. Different connections
-   * controlled
-   *        by different threads can transfer data asynchronously.
-   */
-  void sendParallel(int tid,
-                    size_t numThreads,
-                    ParameterType recvParameterType);
-  /// sending thread routine for asynchronously send data
-  void send(int threadId);
-  /// receiving thread routing for asynchronously receive data
-  void recv(int threadId);
-
-  /**
-   * @brief main routine to build data for pserver
-   *
-   * @note  it can prepare different kinds of parameter type data. it can
-   *        be regarded as layer for bridging real parameters data and
-   *        protobuf data for communication.
-   *        TODO(yanfei):
-   *        can abstract additional layer to encode and decode data to/from
-   *        protobuf data.
-   */
-  void prepareSendData(
-      ParameterUpdateMode updateMode,
-      ParameterType parameterType,  // client send type
-      const std::vector<ParameterSegments>& parameterSegments,
-      int64_t numSamples,
-      real cost,
-      bool sendBackParameter,
-      ParameterType sendBackParameterType,  // send back type in pserver
-      BatchStatus batchStatus,
-      SendJob* sendJob);
-
-  /// start necessary threads for threadPool
-  void initThreads();
-
-protected:
-  /// start port number of pserver
-  /// it deduce all ports for dense and sparse with some rules
-  int port_;
-  /// identify the trainer id using this client
-  int trainerId_;
-
-#ifndef PADDLE_DISABLE_TIMER
-  uint64_t forwardbackwordTime_;
-#endif
-  std::mutex sparseAutoGrowthMutex_;
-
-  /// map id to parameter used for decoding protobuf data
-  std::unordered_map<size_t, ParameterPtr> parameterMap_;
-  /// segments for all parameters that needed to sync
-  std::vector<ParameterSegments> allSegments_;
-
-  /// module for sensing sparse parameters distribution on all pservers
-  std::unique_ptr<SparseParameterDistribution> sparseDistribution_;
-
-  /// thread pool for parallelizing all connections to pservers
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-
-  bool passFinish_;
-};
-
-}  // namespace paddle
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
deleted file mode 100644
index f8814714c29a9776adde6a979a84241f733f65bd..0000000000000000000000000000000000000000
--- a/paddle/pserver/ParameterServer2.cpp
+++ /dev/null
@@ -1,1401 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterServer2.h"
-
-#include <algorithm>
-#include <fstream>
-
-#include "paddle/math/SIMDFunctions.h"
-#include "paddle/parameter/AverageOptimizer.h"
-#include "paddle/parameter/FirstOrderOptimizer.h"
-#include "paddle/parameter/OptimizerFunctions.h"
-#include "paddle/parameter/OptimizerWithRegularizer.h"
-#include "paddle/parameter/ParameterOptimizer.h"
-#include "paddle/parameter/ParameterUpdateFunctions.h"
-#include "paddle/parameter/Regularizer.h"
-#include "paddle/parameter/ThreadLocalBuffer.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/StringUtil.h"
-
-DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
-DEFINE_double(async_lagged_ratio_min,
-              1.0,
-              "control config_.async_lagged_grad_discard_ratio() min value");
-DEFINE_double(
-    async_lagged_ratio_default,
-    1.5,
-    "if async_lagged_grad_discard_ratio is not set in trainer_config.conf"
-    "use it as defalut value");
-
-namespace paddle {
-
-const std::string ParameterServer2::kRetMsgInvalidMatrixHandle =
-    "Invalid matrix handle";
-const std::string ParameterServer2::kRetMsgInvalidVectorHandle =
-    "Invalid vector handle";
-const std::string ParameterServer2::kRetMsgUnknownOperation =
-    "Unknown operation";
-
-ParameterServer2::ParameterServer2(const std::string& addr,
-                                   int port,
-                                   int rdmaCpu)
-    : ProtoServer(addr, port, rdmaCpu),
-      dataSize_(0),
-      size_(0),
-      gradientReadyBarrier_(FLAGS_num_gradient_servers + 1),
-      parameterReadyBarrier_(FLAGS_num_gradient_servers + 1),
-      passBarrier_(FLAGS_num_gradient_servers + 1),
-      numPassFinishClients_(0),
-      allClientPassFinish_(false),
-      serverId_(-1),
-      batchId_(-1) {
-  /**
-   * register function for remote client calling, these functions
-   * will be mapped to a data structure for quick looking up. each
-   * request from trainer can contains one function name to indicate
-   * remote action. this architecture looks like rpc style for pserver.
-   */
-  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter);
-  REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, setStatus);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, getStatus);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, doOperation);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, createVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, createMatrix);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, releaseMatrix);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassStart);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, waitPassFinish);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, synchronize);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, asyncFinishPass);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, loadValueVector);
-  REGISTER_SERVICE_FUNCTION(ParameterServer2, saveValueVector);
-
-  /// thread pool for parallelizing some computations
-  if (FLAGS_pserver_num_threads > 1) {
-    syncThreadPool_.reset(new SyncThreadPool(FLAGS_pserver_num_threads, false));
-  }
-}
-
-bool ParameterServer2::init() {
-  vectors_.resize(NUM_PARAMETER_TYPES);
-  configMap_.clear();
-
-  numSamplesProcessed_ = 0;
-  cost_ = 0;
-  char* mpienv = getenv("OMPI_COMM_WORLD_SIZE");
-  if (mpienv != NULL) {
-    mpiSize_ = atoi(mpienv);
-  } else {
-    mpiSize_ = 1;
-  }
-  status_ = PSERVER_STATUS_NOT_SET;
-  dataMems_.resize(FLAGS_num_gradient_servers);
-  synchronizeBarriers_.resize(SyncObject_ARRAYSIZE);
-  for (auto& barrier : synchronizeBarriers_) {
-    barrier.reset(new ThreadBarrier(FLAGS_num_gradient_servers));
-  }
-
-  // initialization for dicarding lagging gradient
-  asyncUpdateSteps_ = 0;
-  asyncTrainerSteps_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
-  asyncLaggedGradientsNum_ = 0;
-  asyncUpdateStat_.resize(static_cast<int>(FLAGS_num_gradient_servers *
-                                           FLAGS_async_lagged_ratio_default));
-  asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
-  asyncTrainerDiscardStat_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
-  asyncTrainerCommitStat_.resize(FLAGS_num_gradient_servers);
-  asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
-
-  return true;
-}
-
-void ParameterServer2::getStatus(const GetStatusRequest& request,
-                                 ProtoResponseCallback callback) {
-  (void)request;
-  GetStatusResponse response;
-  response.set_status(status_);
-  callback(response);
-}
-
-void ParameterServer2::setStatus(const SetStatusRequest& request,
-                                 ProtoResponseCallback callback) {
-  status_ = request.status();
-  SetStatusResponse response;
-  callback(response);
-}
-
-void ParameterServer2::setConfig(const SetConfigRequest& request,
-                                 ProtoResponseCallback callback) {
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-
-    serverId_ = request.server_id();
-    isSparseServer_ = request.is_sparse_server();
-
-    if (!request.save_dir().empty()) {
-      mkDir(request.save_dir().c_str());
-    }
-
-    for (const auto& config : request.param_configs()) {
-      CHECK(!configMap_.count(config.para_id()))
-          << "Duplicated parameter name: " << config.name();
-      configMap_[config.para_id()] = config;
-      CHECK_EQ(config.sparse_remote_update(), isSparseServer_);
-    }
-
-    config_ = request.opt_config();
-    if (config_.algorithm() == TrainAlgorithm::AsyncSGD) {
-      auto asyncLaggedRatio = config_.async_lagged_grad_discard_ratio();
-      if (asyncLaggedRatio <= FLAGS_async_lagged_ratio_min) {
-        LOG(INFO) << "WARNING: async_lagged_grad_discard_ratio is too small"
-                  << "reset to default, async_lagged_grad_discard_ratio = "
-                  << FLAGS_async_lagged_ratio_default;
-        asyncLaggedRatio = FLAGS_async_lagged_ratio_default;
-      }
-      asyncLaggedThreshold_ =
-          static_cast<int64_t>(FLAGS_num_gradient_servers * asyncLaggedRatio);
-      LOG(INFO) << "discard lagged async gradient ratio: " << asyncLaggedRatio
-                << " asyncLaggedhreshold: " << asyncLaggedThreshold_;
-    }
-    if (isSparseServer_ && config_.num_batches_per_send_parameter() > 1) {
-      /// sparse server must NOT use local update mode
-      config_.set_num_batches_per_send_parameter(1);
-    }
-
-    if (config_.num_batches_per_send_parameter() > 1 &&
-        config_.center_parameter_update_method() == "average") {
-      /// scaling L1/L2 decay rate as large as L1/L2 apply in trainer
-      /// if parameter regularization in pserver
-      for (auto& pair : configMap_) {
-        ParameterConfig& config = pair.second;
-        if (config_.num_batches_per_send_parameter() ==
-            config.num_batches_regularization()) {
-          real scale =
-              config_.delta_add_rate() * config.num_batches_regularization();
-          if (config_.algorithm() == "sgd") {
-            scale *= FLAGS_num_gradient_servers;
-          }
-          config.set_decay_rate(config.decay_rate() * scale);
-          if (config.decay_rate() > 0.1f) {
-            LOG(FATAL) << "L2 decay=" << config.decay_rate()
-                       << " for parameter:" << config.name()
-                       << " is too large after scale in pserver!";
-          }
-          config.set_decay_rate_l1(config.decay_rate_l1() * scale);
-          if (config.decay_rate_l1() > 0.1f) {
-            LOG(FATAL) << "L1 decay=" << config.decay_rate_l1()
-                       << " for parameter:" << config.name()
-                       << " is too large after scale in pserver!";
-          }
-
-          LOG(INFO) << "parameter:" << config.name()
-                    << " decay apply in pserver,"
-                    << " L1 decay=" << config.decay_rate_l1()
-                    << " L2 decay=" << config.decay_rate();
-        }
-      }
-    }
-  }
-
-  SetConfigResponse response;
-  callback(response);
-}
-
-real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
-  real sum = 0;
-  for (const auto buffer : buffers) {
-    for (size_t i = 0; i < buffer.size; ++i) {
-      sum += buffer.base[i];
-    }
-  }
-  return sum;
-}
-
-void ParameterServer2::mergeSegments(BlockSegments* segments) {
-  if (segments->empty()) {
-    return;
-  }
-  std::sort(segments->begin(), segments->end());
-  auto curr = segments->begin();
-  for (auto it = segments->begin(); it != segments->end(); ++it) {
-    if (it->first <= curr->second) {
-      curr->second = std::max(curr->second, it->second);
-    } else {
-      ++curr;
-      *curr = *it;
-    }
-  }
-  ++curr;
-  segments->erase(curr, segments->end());
-}
-
-void ParameterServer2::setParameter(const SendParameterRequest& request,
-                                    std::vector<Buffer>& inputBuffers,
-                                    SendParameterResponse* response,
-                                    std::vector<Buffer>* outputBuffers) {
-  (void)response;
-  (void)outputBuffers;
-  LOG(INFO) << "pserver: setParameter";
-  std::lock_guard<RWLock> guard(parameterMutex_);
-
-  int64_t numBlocks = blockIdMap_.size();
-  CHECK_EQ(blockIdMap_.size(), blockOffsetMap_.size());
-  /// total bytes for all the added blocks
-  int64_t totalSize = size_;
-  std::vector<int64_t> offsets;
-  offsets.reserve(request.blocks_size());
-  std::vector<int64_t> blockIds;
-  blockIds.reserve(request.blocks_size());
-  int bufferIndex = 0;
-
-  if (!request.blocks().size()) {
-    LOG(WARNING)
-        << "--ports_num or --ports_num_for_sparse might be too large, "
-        << "or total dense parameter size or sparse parameters size "
-        << "might be too small, this psever doesn't store any parameter.";
-    return;
-  }
-
-  for (const auto& block : request.blocks()) {
-    /// block size for parameter(e.g. 128 for sparse row, 1K for dense)
-    uint64_t blockSize = getParameterConfig(block).parameter_block_size();
-    BlockKey key(block.para_id(), block.block_id());
-    if (inputBuffers.size()) {  // if !=PSERVER_UPDATE_MODE_SET_PARAM_ZERO
-      Buffer buffer = inputBuffers[bufferIndex];
-      ++bufferIndex;
-      CHECK_EQ(buffer.size, block.block_size())
-          << "data size is too big:"
-          << " block_size=" << block.block_size()
-          << " data_size=" << buffer.size;
-    }
-
-    /// add a new block
-    if (blockIdMap_.count(key) == 0) {
-      blockOffsetMap_[key] = totalSize;
-      blockIdMap_[key] = numBlocks;
-      ++numBlocks;
-      totalSize += blockSize;
-    }
-    offsets.push_back(blockOffsetMap_[key]);
-    blockIds.push_back(blockIdMap_[key]);
-  }
-
-  size_ = totalSize;
-  LOG(INFO) << "pserver: new cpuvector: size=" << size_;
-  if (!vectors_[PARAMETER_VALUE]) {
-    /// vectors_
-    const auto types = sgdOptimizerGetTypes(config_, true /*inPserver*/);
-    for (const auto type : types) {
-      vectors_[type].reset(new CpuVector(size_));
-      vectors_[type]->zeroMem();
-    }
-
-    blockInfos_.resize(numBlocks);
-    for (auto& info : blockInfos_) {
-      info.lock.reset(new std::mutex());
-    }
-  } else {
-    CHECK_EQ((size_t)size_, vectors_[PARAMETER_VALUE]->getSize())
-        << "Currently adding new blocks is not supported. "
-        << "All blocks must be added in one setParameter call";
-  }
-
-  VectorPtr buf = vectors_[PARAMETER_VALUE];
-  usedSegments_.reserve(offsets.size());
-  /// if offsets is empty, means parameter_block_size is too big or too many
-  /// nodes.
-  if (offsets.empty()) {
-    LOG(WARNING) << "in setParameter: offsets is empty";
-  }
-  for (size_t i = 0; i < offsets.size(); ++i) {
-    size_t blockId = blockIds[i];
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(request.blocks(i));
-    info.config = &config;
-    info.offset = offsets[i];
-    info.optimizer.reset(sgdOptimizerCreate(
-        config_, config, config.sparse_remote_update(), true /*inPserver*/));
-    if (config.sparse_remote_update()) {
-      size_t width = config.dims(1);
-      CHECK_EQ(config.parameter_block_size(), width)
-          << "block size: " << config.parameter_block_size()
-          << "width : " << width;
-    }
-    info.optimizer->init(1, info.config);
-    usedSegments_.push_back(std::make_pair(
-        offsets[i], offsets[i] + request.blocks(i).block_size()));
-  }
-  mergeSegments(&usedSegments_);
-
-  if (request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM) {
-    /// copy param from trainer
-    for (size_t i = 0; i < offsets.size(); ++i) {
-      Buffer buffer = inputBuffers[i];
-      real* start = buf->getPoint(offsets[i]);
-      CHECK_LE(offsets[i] + buffer.size, buf->getSize());
-      memcpy(start, buffer.base, sizeof(real) * buffer.size);
-    }
-  } else {
-    CHECK(request.update_mode() == PSERVER_UPDATE_MODE_SET_PARAM_ZERO);
-    /// nothing to do, value vector zero mem already
-  }
-}
-
-void ParameterServer2::addGradient(const SendParameterRequest& request,
-                                   std::vector<Buffer>& inputBuffers,
-                                   SendParameterResponse* response,
-                                   std::vector<Buffer>* outputBuffers) {
-  VLOG(1) << "pserver: addGradient";
-
-  {
-    ReadLockGuard guard(parameterMutex_);
-    int bufferIndex = 0;
-    for (const auto& block : request.blocks()) {
-      int64_t offset = getBlockOffset(block);
-      CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                          << " id=" << block.para_id()
-                          << " block id=" << block.block_id();
-
-      int64_t blockId = getBlockId(block);
-      CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: "
-                           << " id=" << block.para_id()
-                           << " block id=" << block.block_id();
-
-      Buffer buffer = inputBuffers[bufferIndex];
-      ++bufferIndex;
-
-      const real* gradientBuffer = buffer.base;
-      real* gradientSumBuffer = vectors_[PARAMETER_GRADIENT]->getPoint(offset);
-
-      size_t size = buffer.size;
-
-      BlockInfo& info = blockInfos_[blockId];
-      const ParameterConfig& config = getParameterConfig(blockId);
-      if (config.sparse_remote_update()) {
-        CHECK_EQ(size, config.parameter_block_size());
-      } else {  // dense
-        CHECK_LE(size, config.parameter_block_size());
-      }
-      std::lock_guard<std::mutex> guard(*info.lock);
-      simd::addTo(gradientSumBuffer, gradientBuffer, size);
-    }
-  }
-  if (request.batch_status() == BATCH_FINISH ||
-      request.batch_status() == BATCH_START_AND_FINISH) {
-    numSamplesProcessed_ += request.num_samples();
-    cost_ += request.cost();
-    VLOG(1) << "num samples: " << numSamplesProcessed_
-            << ", new cost:" << cost_;
-
-    /// notify doOperation gradient ready
-    gradientReadyBarrier_.wait();
-
-    /// wait doOperation finish
-    parameterReadyBarrier_.wait();
-    VLOG(1) << "start send back";
-  }
-}
-
-bool ParameterServer2::asyncGrdientCommitCheckAndStat(
-    const SendParameterRequest& request) {
-  const auto trainerId = request.trainer_id();
-  int64_t trainerSteps = asyncTrainerSteps_[trainerId];
-  CHECK_GE(asyncUpdateSteps_, trainerSteps)
-      << " async update steps overflows "
-      << " trainer id: " << trainerId
-      << " async update steps in pserver: " << asyncUpdateSteps_
-      << " async update steps in request: " << trainerSteps;
-
-  asyncUpdateSteps_++;
-  bool commitGradient = true;
-
-  int64_t delta = asyncUpdateSteps_ - trainerSteps;
-  if (delta >= asyncLaggedThreshold_) {
-    VLOG(1) << "discard Async Update: "
-            << " trainer id: " << trainerId
-            << " pserver steps: " << asyncUpdateSteps_
-            << " request steps: " << trainerSteps;
-    asyncLaggedGradientsNum_++;
-    commitGradient = false;
-  }
-  /// stat on lagged steps, to get total discard distribution
-  if (static_cast<size_t>(delta) < asyncUpdateStat_.size()) {
-    asyncUpdateStat_[delta]++;
-  } else {
-    asyncUpdateStat_[asyncUpdateStat_.size() - 1]++;
-  }
-  /// stat on trainerId and discard, to get trainer condition
-  if (commitGradient) {
-    asyncTrainerCommitStat_[trainerId]++;
-  } else {
-    asyncTrainerDiscardStat_[trainerId]++;
-  }
-
-  return commitGradient;
-}
-
-static ThreadLocal<std::vector<bool>> localBlockBitset_;
-
-void ParameterServer2::asyncSGD(const SendParameterRequest& request,
-                                std::vector<Buffer>& inputBuffers,
-                                SendParameterResponse* response,
-                                std::vector<Buffer>* outputBuffers) {
-  int64_t numBlocks = blockIdMap_.size();
-  auto& localBlockBitset = *localBlockBitset_;
-
-  if (isSparseServer_) {
-    if (localBlockBitset.empty()) {
-      localBlockBitset.resize(numBlocks);
-    }
-    localBlockBitset.assign(numBlocks, false);
-  }
-
-  ReadLockGuard guard(parameterMutex_);
-
-  if (request.send_back_parameter()) {
-    outputBuffers->reserve(request.blocks_size());
-  }
-
-  bool commitGradient = asyncGrdientCommitCheckAndStat(request);
-
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-  size_t bufferIndex = 0;
-  for (const auto& block : request.blocks()) {
-    int64_t offset = getBlockOffset(block);
-    CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                        << " id=" << block.para_id()
-                        << " block id=" << block.block_id();
-    int64_t blockId = getBlockId(block);
-    CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: "
-                         << " id=" << block.para_id()
-                         << " block id=" << block.block_id();
-    Buffer buffer = inputBuffers[bufferIndex];
-    ++bufferIndex;
-
-    size_t size = buffer.size;
-
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-
-    std::lock_guard<std::mutex> guard(*info.lock);
-    /// gradients are too obsolete, will be discarded
-    if (commitGradient) {
-      info.optimizer->startBatch(numSamplesProcessed_);
-
-      for (const auto type : info.optimizer->getParameterTypes()) {
-        vecs[type]->subVecFrom(*vectors_[type], offset, size);
-      }
-      vecs[PARAMETER_GRADIENT]->subVecFrom(buffer.base, 0, size);
-      info.optimizer->update(vecs, config, isSparseServer_ ? 0 : -1);
-
-      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
-        blockTraverse(info, config, offset, size, vecs, callback);
-      }
-      info.optimizer->finishBatch();
-    }
-
-    if (commitGradient && isSparseServer_) {
-      localBlockBitset[blockId] = true;
-    }
-
-    if (!isSparseServer_ && request.send_back_parameter()) {  // dense
-      int type = request.send_back_parameter_type();
-      sendBackParameter(block, type, response, &buffer, outputBuffers);
-    }
-  }  /// foreach block
-
-  asyncTrainerSteps_[request.trainer_id()] = asyncUpdateSteps_;
-
-  if (commitGradient && isSparseServer_) {
-    /// find blocks that trainer do not request update
-    for (int64_t blockId = 0; blockId < numBlocks; ++blockId) {
-      if (localBlockBitset[blockId]) {
-        continue;
-      }
-
-      BlockInfo& info = blockInfos_[blockId];
-      const ParameterConfig& config = *info.config;
-      size_t size = config.parameter_block_size();
-
-      std::lock_guard<std::mutex> guard(*info.lock);
-      info.optimizer->startBatch(numSamplesProcessed_);
-      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
-        blockTraverse(info, config, info.offset, size, vecs, callback);
-      }
-      info.optimizer->finishBatch();
-    }
-  }
-
-  if (commitGradient && (request.batch_status() == BATCH_FINISH ||
-                         request.batch_status() == BATCH_START_AND_FINISH)) {
-    numSamplesProcessed_ += request.num_samples();
-  }
-
-  /// show some performance log if needed
-  if (request.trainer_id() == 0) {
-    /// batchId_ is approximately equal to "real batchId_"
-    batchId_++;
-  }
-}
-
-void ParameterServer2::getParameter(const SendParameterRequest& request,
-                                    std::vector<Buffer>& inputBuffers,
-                                    SendParameterResponse* response,
-                                    std::vector<Buffer>* outputBuffers) {
-  (void)inputBuffers;
-  LOG(INFO) << "pserver: getParameter";
-  ReadLockGuard guard(parameterMutex_);
-  for (const auto& block : request.blocks()) {
-    int type = request.send_back_parameter_type();
-    sendBackParameter(block, type, response, outputBuffers);
-  }
-}
-
-void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
-                                          std::vector<Buffer>& inputBuffers,
-                                          SendParameterResponse* response,
-                                          std::vector<Buffer>* outputBuffers) {
-  (void)inputBuffers;
-  auto& buffer = *readWriteBuffer_;
-  size_t numReals = 0;
-  for (const auto& block : request.blocks()) {
-    numReals += getParameterConfig(block).dims(1);
-  }
-  buffer.resize(numReals);
-
-  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
-
-  ReadLockGuard guard(parameterMutex_);
-  size_t offset = 0;
-  for (const auto& block : request.blocks()) {
-    size_t width = getParameterConfig(block).dims(1);
-    Buffer buf = {buffer.data() + offset, width};
-    int type = request.send_back_parameter_type();
-    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
-    offset += width;
-  }
-}
-
-void ParameterServer2::sendBackParameter(const ParameterBlock& block,
-                                         int parameterType,
-                                         SendParameterResponse* response,
-                                         std::vector<Buffer>* outputBuffers) {
-  ParameterBlock* returnBlock = response->add_blocks();
-  returnBlock->set_para_id(block.para_id());
-  returnBlock->set_block_id(block.block_id());
-  returnBlock->set_begin_pos(block.begin_pos());
-  returnBlock->set_block_size(block.block_size());
-
-  int64_t offset = getBlockOffset(block);
-  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                      << " id=" << block.para_id()
-                      << " block id=" << block.block_id();
-
-  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  outputBuffers->push_back({valueBuffer, (size_t)block.block_size()});
-}
-
-void ParameterServer2::sendBackParameter(const ParameterBlock& block,
-                                         int parameterType,
-                                         SendParameterResponse* response,
-                                         Buffer* buffer,
-                                         std::vector<Buffer>* outputBuffers) {
-  ParameterBlock* returnBlock = response->add_blocks();
-  returnBlock->set_para_id(block.para_id());
-  returnBlock->set_block_id(block.block_id());
-  returnBlock->set_begin_pos(block.begin_pos());
-  returnBlock->set_block_size(block.block_size());
-
-  int64_t offset = getBlockOffset(block);
-  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                      << " id=" << block.para_id()
-                      << " block id=" << block.block_id();
-
-  size_t size = buffer->size;
-  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  /// copy to second buffer to avoid to be polluted by other request
-  memcpy(buffer->base, valueBuffer, sizeof(real) * size);
-  outputBuffers->push_back({buffer->base, size});
-}
-
-void ParameterServer2::sendBackParameterSparse(
-    const ParameterBlock& block,
-    int parameterType,
-    SendParameterResponse* response,
-    Buffer* buffer,
-    size_t width,
-    std::vector<Buffer>* outputBuffers) {
-  ParameterBlock* returnBlock = response->add_blocks();
-  returnBlock->set_para_id(block.para_id());
-  returnBlock->set_block_id(block.block_id());
-  returnBlock->set_begin_pos(block.begin_pos());
-  returnBlock->set_block_size(block.block_size());
-  int64_t offset = getBlockOffset(block);
-  CHECK_GE(offset, 0) << "Only existing parameter block is allowed: "
-                      << " id=" << block.para_id()
-                      << " block id=" << block.block_id();
-
-  real* valueBuffer = vectors_[parameterType]->getPoint(offset);
-  CHECK_EQ(buffer->size, width);
-  memcpy(buffer->base, valueBuffer, width * sizeof(real));
-  outputBuffers->push_back(*buffer);
-}
-
-void ParameterServer2::readAllBlocks(
-    MsgReader* msgReader, std::vector<ParameterServer2::Buffer>* buffers) {
-  auto& buffer = *readWriteBuffer_;
-  size_t numBlocks = msgReader->getNumBlocks();
-  buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real),
-                              numBlocks);
-  std::vector<void*> bufs(numBlocks);
-  buffers->clear();
-  buffers->reserve(numBlocks);
-  buffer.resetAlignAlloc();
-  for (size_t i = 0; i < numBlocks; ++i) {
-    size_t len = msgReader->getBlockLength(i);
-    CHECK_EQ(len % sizeof(real), (size_t)0);
-    size_t size = len / sizeof(real);
-    bufs[i] = buffer.nextBlock(size);
-    buffers->push_back({(real*)bufs[i], size});
-  }
-  msgReader->readBlocks(bufs);
-}
-
-void ParameterServer2::sendParameter(const SendParameterRequest& request,
-                                     std::unique_ptr<MsgReader> msgReader,
-                                     ProtoResponseCallbackEx callback) {
-  SendParameterResponse response;
-  std::vector<Buffer> inputBuffers;
-  std::vector<Buffer> outputBuffers;
-  readAllBlocks(msgReader.get(), &inputBuffers);
-  msgReader.reset();
-
-  switch (request.update_mode()) {
-    case PSERVER_UPDATE_MODE_SET_PARAM:
-    case PSERVER_UPDATE_MODE_SET_PARAM_ZERO:
-      setParameter(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_GET_PARAM:
-      getParameter(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE:
-      getParameterSparse(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_ASYNC_SGD:
-      asyncSGD(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_ADD_GRADIENT:
-      addGradient(request, inputBuffers, &response, &outputBuffers);
-      break;
-    case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER:
-      break;
-  }
-  switch (request.update_mode()) {
-    case PSERVER_UPDATE_MODE_ADD_GRADIENT:
-      (*requestVec_).push_back(request);
-      (*callbackVec_).push_back(callback);
-      if (request.batch_status() == BATCH_FINISH ||
-          request.batch_status() == BATCH_START_AND_FINISH) {
-        for (size_t i = 0; i < (*requestVec_).size(); i++) {
-          ReadLockGuard guard(parameterMutex_);
-          SendParameterRequest& request = (*requestVec_)[i];
-          SendParameterResponse responseTemp;
-
-          std::vector<iovec> outputIovs;
-          if (request.send_back_parameter()) {
-            CHECK(!isSparseServer_);
-            std::vector<Buffer> outputBuffersTemp;
-            for (const auto& block : request.blocks()) {
-              int type = request.send_back_parameter_type();
-              sendBackParameter(block, type, &responseTemp, &outputBuffersTemp);
-            }
-            outputIovs.reserve(outputBuffersTemp.size());
-            for (auto buffer : outputBuffersTemp) {
-              outputIovs.push_back({buffer.base, buffer.size * sizeof(real)});
-            }
-          }
-
-          ProtoResponseCallbackEx& callbackTemp = (*callbackVec_)[i];
-          callbackTemp(responseTemp, outputIovs);
-        }
-        (*requestVec_).clear();
-        (*callbackVec_).clear();
-      }
-      break;
-    case PSERVER_UPDATE_MODE_SET_PARAM:
-    case PSERVER_UPDATE_MODE_SET_PARAM_ZERO:
-    case PSERVER_UPDATE_MODE_GET_PARAM:
-    case PSERVER_UPDATE_MODE_GET_PARAM_SPARSE:
-    case PSERVER_UPDATE_MODE_ASYNC_SGD:
-    case PSERVER_UPDATE_MODE_AVERAGE_PARAMETER:
-      std::vector<iovec> outputIovs;
-      outputIovs.reserve(outputBuffers.size());
-      for (auto buffer : outputBuffers) {
-        outputIovs.push_back({buffer.base, buffer.size * sizeof(real)});
-      }
-      callback(response, outputIovs);
-      break;
-  }
-}
-
-template <typename Dtype>
-void ParameterServer2::reduceAndSendData(const SendDataRequest& request,
-                                         std::unique_ptr<MsgReader>& msgReader,
-                                         ProtoResponseCallbackEx& callback) {
-  SendDataResponse response;
-  response.set_type(request.type());
-  response.set_server_id(serverId_);
-
-  auto sendData = reinterpret_cast<Dtype*>(dataMems_[0].get()->getBuf());
-  size_t rawMemSize = dataMems_[0].get()->getSize();
-  CHECK_EQ(rawMemSize % sizeof(Dtype), 0U);
-  size_t dataMemSize = rawMemSize / sizeof(Dtype);
-  for (size_t i = 1; i < dataMems_.size(); ++i) {
-    CHECK_EQ(dataMems_[i].get()->getSize(), rawMemSize);
-    auto data = reinterpret_cast<Dtype*>(dataMems_[i].get()->getBuf());
-    for (size_t j = 0; j < dataMemSize; ++j) {
-      sendData[j] += data[j];
-    }
-  }
-  std::vector<iovec> outputIovs;
-  auto block = response.add_blocks();
-  outputIovs.push_back({sendData, rawMemSize});
-  block->set_total_size(rawMemSize);
-  block->set_data_size(sizeof(Dtype));
-  callback(response, outputIovs);
-}
-
-void ParameterServer2::templateReduceSum(const SendDataRequest& request,
-                                         std::unique_ptr<MsgReader>& msgReader,
-                                         ProtoResponseCallbackEx& callback) {
-  const auto& block = request.blocks(0);
-  switch (block.data_type()) {
-    case TRANS_FLOAT:
-      reduceAndSendData<float>(request, msgReader, callback);
-      break;
-    case TRANS_DOUBLE:
-      reduceAndSendData<double>(request, msgReader, callback);
-      break;
-    case TRANS_INT32:
-      reduceAndSendData<int>(request, msgReader, callback);
-      break;
-    case TRANS_UINT32_T:
-      reduceAndSendData<uint32_t>(request, msgReader, callback);
-      break;
-    case TRANS_INT64_T:
-      reduceAndSendData<int64_t>(request, msgReader, callback);
-      break;
-    case TRANS_UINT64_T:
-      reduceAndSendData<uint64_t>(request, msgReader, callback);
-      break;
-    default:
-      LOG(FATAL) << "not supported";
-      break;
-  }
-}
-
-void ParameterServer2::sendData(const SendDataRequest& request,
-                                std::unique_ptr<MsgReader> msgReader,
-                                ProtoResponseCallbackEx callback) {
-  SendDataResponse response;
-  response.set_type(request.type());
-  response.set_server_id(serverId_);
-
-  switch (request.update_mode()) {
-    case DATA_UPDATE_MODE_SET_OWN: {
-      CHECK_EQ(msgReader->getNumBlocks(), (size_t)(request.blocks_size()));
-      size_t totalLen = msgReader->getTotalLength();
-      if (totalLen > 0) {
-        CHECK_EQ(msgReader->getNumBlocks(), 1U)
-            << "Only one block currently support now!";
-        const auto& block = request.blocks(0);
-        if (0 == dataSize_) {
-          dataSize_ = block.data_size();
-        } else {
-          CHECK_EQ(dataSize_, block.data_size());
-        }
-        int64_t serverId = request.server_id();
-        if (serverId_ < 0) {
-          serverId_ = serverId;
-        } else {
-          CHECK_EQ(serverId_, serverId);
-        }
-        int64_t clientId = request.client_id();
-        dataMems_[clientId] = std::make_shared<CpuMemoryHandle>(totalLen);
-        CHECK_EQ(totalLen % sizeof(block.data_size()), 0U);
-        msgReader->readNextBlock(dataMems_[clientId].get()->getBuf());
-      }
-      msgReader.reset();
-      std::vector<iovec> outputIovs;
-      callback(response, outputIovs);
-      break;
-    }
-    case DATA_UPDATE_MODE_GET_ALL: {
-      /// Currently only support DATA_REDUCE_SUM
-      /// And their Operations are just add
-      CHECK(DATA_REDUCE_SUM == request.type());
-      templateReduceSum(request, msgReader, callback);
-      break;
-    }
-    default: { LOG(FATAL) << "not supported"; }
-  }
-}
-
-void ParameterServer2::clearUnusedSegments(CpuVector* vec) {
-  real* data = vec->getData();
-  if (usedSegments_.empty()) {
-    return;
-  }
-  memset(data, 0, sizeof(real) * usedSegments_[0].first);
-  memset(data + usedSegments_.back().second,
-         0,
-         sizeof(real) * (size_ - usedSegments_.back().second));
-  size_t n = size_ - usedSegments_.back().second;
-
-  for (size_t i = 1; i < usedSegments_.size(); ++i) {
-    memset(
-        data + usedSegments_[i - 1].second,
-        0,
-        sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second));
-    n += usedSegments_[i].first - usedSegments_[i - 1].second;
-  }
-}
-
-void ParameterServer2::parallelExecForEachBlock(ExecFunc func) {
-  SyncThreadPool::execHelper(
-      syncThreadPool_.get(), [&](int tid, size_t numThreads) {
-        int64_t numBlocks = blockIdMap_.size();
-        VectorPtr* vecs = parameter::getThreadLocalBuffer();
-        for (int64_t blockId = tid; blockId < numBlocks;
-             blockId += numThreads) {
-          func(blockId, vecs);
-        }
-      });
-}
-
-void ParameterServer2::blockTraverse(
-    BlockInfo& info,
-    const ParameterConfig& config,
-    int64_t offset,
-    size_t size,
-    const VectorPtr vecs[],
-    const ParameterOptimizer::TraverseCallback& callback) {
-  /// setup sub bufs
-  for (const auto type : info.optimizer->getParameterTypes()) {
-    vecs[type]->subVecFrom(*vectors_[type], offset, size);
-  }
-  callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU);
-}
-
-void ParameterServer2::op_SGD(const Operation& operation,
-                              OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  if (allClientPassFinish_) {
-    /// when all clients signal pass finished, the update
-    /// is empty.
-    return;
-  }
-
-  {
-    parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-      BlockInfo& info = blockInfos_[blockId];
-      const ParameterConfig& config = getParameterConfig(blockId);
-      int64_t offset = info.offset;
-      size_t size = config.parameter_block_size();
-
-      info.optimizer->startBatch(numSamplesProcessed_);
-
-      for (const auto type : info.optimizer->getParameterTypes()) {
-        vecs[type]->subVecFrom(*vectors_[type], offset, size);
-      }
-      info.optimizer->update(
-          vecs, config, config.sparse_remote_update() ? 0 : -1LU);
-      vecs[PARAMETER_GRADIENT]->zeroMem();
-
-      if (auto callback = info.optimizer->needSpecialTraversal(config)) {
-        blockTraverse(info, config, offset, size, vecs, callback);
-      }
-      info.optimizer->finishBatch();
-    });
-  }
-
-  batchId_++;
-}
-
-void ParameterServer2::op_start_pass(const Operation& operation,
-                                     OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    info.optimizer->startPass();
-  });
-}
-
-void ParameterServer2::op_finish_pass(const Operation& operation,
-                                      OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-    size_t size = config.parameter_block_size();
-
-    /// catch up with
-    if (auto callback = info.optimizer->startCatchUpWith()) {
-      blockTraverse(info, config, info.offset, size, vecs, callback);
-      info.optimizer->finishCatchUpWith();
-    }
-
-    /// finish pass
-    info.optimizer->finishPass();
-  });
-  batchId_ = 0;
-}
-
-void ParameterServer2::op_apply(const Operation& operation,
-                                OperationResult* result) {
-  (void)operation;
-  (void)result;
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-    int64_t offset = info.offset;
-    size_t size = config.parameter_block_size();
-
-    // catch up with
-    if (auto callback = info.optimizer->startCatchUpWith()) {
-      blockTraverse(info, config, offset, size, vecs, callback);
-      info.optimizer->finishCatchUpWith();
-    }
-
-    // apply to PARAMETER_APPLY
-    if (auto callback = info.optimizer->apply()) {
-      blockTraverse(info, config, offset, size, vecs, callback);
-    }
-  });
-}
-
-void ParameterServer2::op_randomize(const Operation& operation,
-                                    OperationResult* result) {
-  LOG(INFO) << "ParameterServer2::op_randomize: serverId=" << serverId_;
-
-  CpuVector& valueVec = *vectors_[PARAMETER_VALUE];
-
-  parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
-    BlockInfo& info = blockInfos_[blockId];
-    const ParameterConfig& config = getParameterConfig(blockId);
-    size_t size = config.parameter_block_size();
-
-    vecs[PARAMETER_VALUE]->subVecFrom(valueVec, info.offset, size);
-    Parameter::randomize(vecs[PARAMETER_VALUE], config);
-  });
-}
-
-void ParameterServer2::loadValueVector(const LoadValueRequest& request,
-                                       ProtoResponseCallback callback) {
-  LoadValueResponse response;
-  LOG(INFO) << "ParameterServer2::loadValueVector: serverId=" << serverId_;
-
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "/pserver.%04d", static_cast<int>(serverId_));
-  std::string filename = request.dir_name() + buf;
-
-  std::ifstream fs(filename, std::ios_base::binary);
-  CHECK(fs) << "Fail to open " << filename;
-
-  CpuVector& vec = *vectors_[PARAMETER_VALUE];
-  Parameter::Header header;
-  CHECK(fs.read(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to read parameters in pserver";
-  CHECK(Parameter::isHeaderFormatSupported(header.format))
-      << "Incorrect format version: " << header.format;
-  CHECK_EQ(header.size, (size_t)size_)
-      << "The size (" << header.size << ") in the file does not match the size "
-      << "(" << size_ << ") of the pserver: " << serverId_;
-  CHECK_EQ(header.valueSize, sizeof(real)) << "Unsupported valueSize "
-                                           << header.valueSize;
-  CHECK(fs.read(reinterpret_cast<char*>(vec.getData()),
-                header.size * sizeof(real)));
-
-  callback(response);
-}
-
-void ParameterServer2::saveValueVector(const SaveValueRequest& request,
-                                       ProtoResponseCallback callback) {
-  SaveValueResponse response;
-  LOG(INFO) << "ParameterServer2::SaveValueVector: serverId=" << serverId_;
-
-  mkDir(request.dir_name().c_str());
-
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "/pserver.%04d", static_cast<int>(serverId_));
-  std::string filename = request.dir_name() + buf;
-
-  std::ofstream fs(filename, std::ios_base::binary);
-  CHECK(fs) << "Fail to open " << filename;
-
-  CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY]
-                                             : *vectors_[PARAMETER_VALUE];
-  Parameter::Header header;
-  // TODO(TJ): save param headerFormat_
-  header.format = PARAM_FORMAT_ORIGINAL;
-  header.valueSize = sizeof(real);
-  header.size = size_;
-
-  CHECK_EQ(header.size, vec.getSize());
-
-  CHECK(fs.write(reinterpret_cast<char*>(&header), sizeof(header)))
-      << "Fail to write parameter in pserver: " << serverId_;
-
-  CHECK(fs.write(reinterpret_cast<char*>(vec.getData()),
-                 header.size * sizeof(real)))
-      << "Fail to write parameter in pserver: " << serverId_;
-
-  callback(response);
-}
-
-void ParameterServer2::op_RESET(const Operation& operation,
-                                OperationResult* result) {
-  (void)result;
-  CpuVector* u = vectors_[operation.pvectors(0)].get();
-  u->reset(operation.scalars(0));
-  clearUnusedSegments(u);
-}
-
-void ParameterServer2::op_utv(const Operation& operation,
-                              OperationResult* result) {
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  double sum = 0;
-  for (int64_t i = 0; i < size; ++i) {
-    sum += (double)u[i] * (double)v[i];
-  }
-  result->add_scalars(sum);
-}
-
-void ParameterServer2::op_au_bv(const Operation& operation,
-                                OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  real a = operation.scalars(0);
-  real b = operation.scalars(1);
-  for (int64_t i = 0; i < size; ++i) {
-    v[i] = a * u[i] + b * v[i];
-  }
-}
-
-void ParameterServer2::op_COPY(const Operation& operation,
-                               OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  for (int64_t i = 0; i < size; ++i) {
-    v[i] = u[i];
-  }
-}
-
-void ParameterServer2::op_au(const Operation& operation,
-                             OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  int64_t size = size_;
-  real a = operation.scalars(0);
-  for (int64_t i = 0; i < size; ++i) {
-    u[i] *= a;
-  }
-}
-
-void ParameterServer2::op_au_bv_cw(const Operation& operation,
-                                   OperationResult* result) {
-  (void)result;
-  real* u = vectors_[operation.pvectors(0)]->getData();
-  real* v = vectors_[operation.pvectors(1)]->getData();
-  real* w = vectors_[operation.pvectors(2)]->getData();
-  int64_t size = size_;
-  real a = operation.scalars(0);
-  real b = operation.scalars(1);
-  real c = operation.scalars(2);
-  for (int64_t i = 0; i < size; ++i) {
-    w[i] = a * u[i] + b * v[i] + c * w[i];
-  }
-}
-
-void ParameterServer2::op_make_steepest_desc_dir(const Operation& operation,
-                                                 OperationResult* result) {
-  (void)result;
-  real* dir = vectors_[operation.pvectors(0)]->getData();
-  real* grad = vectors_[operation.pvectors(1)]->getData();
-  real* x = vectors_[operation.pvectors(2)]->getData();
-  int64_t size = size_;
-  real l1weight = operation.scalars(0);
-  for (int64_t i = 0; i < size; ++i) {
-    if (x[i] < 0) {
-      dir[i] = -grad[i] + l1weight;
-    } else if (x[i] > 0) {
-      dir[i] = -grad[i] - l1weight;
-    } else {
-      if (grad[i] < -l1weight) {
-        dir[i] = -grad[i] - l1weight;
-      } else if (grad[i] > l1weight) {
-        dir[i] = -grad[i] + l1weight;
-      } else {
-        dir[i] = 0;
-      }
-    }
-  }
-}
-
-void ParameterServer2::op_fix_dir_signs(const Operation& operation,
-                                        OperationResult* result) {
-  (void)result;
-  real* dir = vectors_[operation.pvectors(0)]->getData();
-  real* steepestDescDir = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  for (int64_t i = 0; i < size; ++i) {
-    if (dir[i] * steepestDescDir[i] <= 0) {
-      dir[i] = 0;
-    }
-  }
-}
-
-void ParameterServer2::op_fix_omega_signs(const Operation& operation,
-                                          OperationResult* result) {
-  (void)result;
-  real* x = vectors_[operation.pvectors(0)]->getData();
-  real* newx = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  for (int64_t i = 0; i < size; ++i) {
-    if (x[i] * newx[i] < 0) {
-      newx[i] = 0;
-    }
-  }
-}
-
-void ParameterServer2::op_dir_deriv(const Operation& operation,
-                                    OperationResult* result) {
-  real* dir = vectors_[operation.pvectors(0)]->getData();
-  real* grad = vectors_[operation.pvectors(1)]->getData();
-  real* x = vectors_[operation.pvectors(2)]->getData();
-  int64_t size = size_;
-  real l1weight = operation.scalars(0);
-  double sum = 0;
-  for (int64_t i = 0; i < size; ++i) {
-    if (dir[i] != 0) {
-      if (x[i] < 0) {
-        sum += dir[i] * (grad[i] - l1weight);
-      } else if (x[i] > 0) {
-        sum += dir[i] * (grad[i] + l1weight);
-      } else if (dir[i] < 0) {
-        sum += dir[i] * (grad[i] - l1weight);
-      } else if (dir[i] > 0) {
-        sum += dir[i] * (grad[i] + l1weight);
-      }
-    }
-  }
-  result->add_scalars(sum);
-}
-
-void ParameterServer2::op_cost(const Operation& operation,
-                               OperationResult* result) {
-  real* x = vectors_[operation.pvectors(0)]->getData();
-  real* newgrad = vectors_[operation.pvectors(1)]->getData();
-  int64_t size = size_;
-  real l1weight = operation.scalars(0);
-  real l2weight = operation.scalars(1);
-  double cost_real = cost_ / mpiSize_;
-  double sum_weight_l1 = 0;
-  double sum_weight_l2 = 0;
-  for (int64_t i = 0; i < size; ++i) {
-    sum_weight_l1 += std::abs(x[i]);
-    sum_weight_l2 += x[i] * x[i];
-    newgrad[i] += 2.0 * l2weight * x[i];
-  }
-  cost_real += l1weight * sum_weight_l1 + l2weight * sum_weight_l2;
-  result->add_scalars(cost_real);
-}
-
-ParameterServer2::OperatorFunction ParameterServer2::opFuncs[] = {
-    nullptr,                         // PSERVER_OP_utu = 0;
-    &ParameterServer2::op_utv,       // PSERVER_OP_utv = 1;
-    &ParameterServer2::op_au,        // PSERVER_OP_au = 2;
-    &ParameterServer2::op_au_bv,     // PSERVER_OP_au_bv = 3;
-    nullptr,                         // PSERVER_OP_aAx_bu = 4;
-    &ParameterServer2::op_SGD,       // PSERVER_OP_SGD = 5;
-    &ParameterServer2::op_RESET,     // PSERVER_OP_RESET = 6;
-    &ParameterServer2::op_COPY,      // PSERVER_OP_COPY = 7;
-    &ParameterServer2::op_au_bv_cw,  // PSERVER_OP_au_bv_cw = 8;
-    &ParameterServer2::op_make_steepest_desc_dir,
-    /// PSERVER_OP_MAKE_STEEPEST_DESC_DIR = 9;
-    &ParameterServer2::op_fix_dir_signs,    // PSERVER_OP_FIX_SIGNS = 10;
-    &ParameterServer2::op_dir_deriv,        // PSERVER_OP_DIR_DERIV = 11;
-    &ParameterServer2::op_fix_omega_signs,  // PSERVER_OP_FIX_OMEGA_SIGNS = 12;
-    &ParameterServer2::op_cost,             // PSERVER_OP_COST = 13
-    &ParameterServer2::op_start_pass,       // PSERVER_OP_START_PASS = 14
-    &ParameterServer2::op_finish_pass,      // PSERVER_OP_FINISH_PASS = 15
-    &ParameterServer2::op_randomize,        // PSERVER_OP_RANDOMIZE = 16
-    &ParameterServer2::op_apply,            // PSERVER_OP_APPLY = 17
-};
-
-void ParameterServer2::doOperation(const DoOperationRequest& request,
-                                   ProtoResponseCallback callback) {
-  if (request.wait_for_gradient()) {
-    /// wait gradient update
-    gradientReadyBarrier_.wait();
-    allClientPassFinish_ = numPassFinishClients_ == FLAGS_num_gradient_servers;
-  }
-
-  DoOperationResponse response;
-  response.set_pass_finish(allClientPassFinish_);
-
-  for (const auto& op : request.operations()) {
-    OperationResult* opResult = response.add_results();
-    if (op.operation() >= ARRAYSIZE(opFuncs)) {
-      LOG(ERROR) << "Unknown operation " << op.operation();
-      response.set_return_message(kRetMsgUnknownOperation);
-    }
-    OperatorFunction opFunc = opFuncs[op.operation()];
-    if (!opFunc) {
-      LOG(ERROR) << "Operation not implemented: " << op.operation();
-      response.set_return_message(kRetMsgUnknownOperation);
-    }
-    (this->*opFunc)(op, opResult);
-  }
-
-  if (request.send_back_parameter()) {
-    /// clean current cost
-    cost_ = 0;
-
-    if (allClientPassFinish_ && request.release_pass()) {
-      /// This signals that all clients finish one pass, so waitPassFinish()
-      /// will stop waiting.
-      numPassFinishClients_ = 0;
-    }
-
-    /// notify addGradient() to send back parameter
-    parameterReadyBarrier_.wait();
-  }
-  callback(response);
-}
-
-void ParameterServer2::waitPassStart(const WaitPassStartRequest& request,
-                                     ProtoResponseCallback callback) {
-  passBarrier_.wait();
-  callback(WaitPassStartResponse());
-}
-
-void ParameterServer2::waitPassFinish(const WaitPassFinishRequest& request,
-                                      ProtoResponseCallback callback) {
-  numPassFinishClients_ += 1;
-
-  while (numPassFinishClients_ != 0) {
-    /// notify doOperation gradient ready
-    gradientReadyBarrier_.wait();
-    /// wait doOperation finish
-    parameterReadyBarrier_.wait();
-  }
-
-  callback(WaitPassFinishResponse());
-}
-
-void ParameterServer2::synchronize(const SynchronizeRequest& request,
-                                   ProtoResponseCallback callback) {
-  synchronizeBarriers_[request.sync_object_id()]->wait();
-  dataSize_ = 0;
-  callback(SynchronizeResponse());
-}
-
-void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
-                                       ProtoResponseCallback callback) {
-  synchronizeBarriers_[request.sync_object_id()]->wait();
-  callback(SynchronizeResponse());
-
-  if (request.trainer_id() == 0) {
-    batchId_ = 0;
-  }
-}
-
-void ParameterServer2::createVector(const CreateVectorRequest& request,
-                                    ProtoResponseCallback callback) {
-  (void)request;
-  CreateVectorResponse response;
-  LOG(INFO) << "ParameterServer2::createVector: size=" << size_;
-  CpuVectorPtr vec = std::make_shared<CpuVector>(size_);
-  int64_t handle = -1;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    handle = vectors_.size();
-    vectors_.push_back(vec);
-  }
-  response.set_handle(handle);
-  callback(response);
-}
-
-void ParameterServer2::releaseVector(const ReleaseVectorRequest& request,
-                                     ProtoResponseCallback callback) {
-  ReleaseVectorResponse response;
-  CpuVectorPtr vec;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    vec.swap(vectors_[request.handle()]);
-  }
-  callback(response);
-}
-
-void ParameterServer2::createMatrix(const CreateMatrixRequest& request,
-                                    ProtoResponseCallback callback) {
-  CreateMatrixResponse response;
-  /// We need to create column major matrix of size_ * num_cols
-  /// Matrix is row majoar. Need to tranpose when use it.
-  CpuMatrixPtr mat = std::make_shared<CpuMatrix>(request.num_cols(), size_);
-  int64_t handle = -1;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    handle = matrices_.size();
-    matrices_.push_back(mat);
-  }
-  response.set_handle(handle);
-  callback(response);
-}
-
-void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
-                                     ProtoResponseCallback callback) {
-  ReleaseMatrixResponse response;
-  CpuMatrixPtr mat;
-  {
-    std::lock_guard<RWLock> guard(parameterMutex_);
-    mat.swap(matrices_[request.handle()]);
-  }
-  callback(response);
-}
-
-}  // namespace paddle
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
deleted file mode 100644
index 3ed06b6b045802bcfd48bcff6bd0c1b34e9bbb86..0000000000000000000000000000000000000000
--- a/paddle/pserver/ParameterServer2.h
+++ /dev/null
@@ -1,696 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <atomic>
-#include <limits>
-#include <mutex>
-#include <string>
-#include <type_traits>
-#include <unordered_map>
-#include <vector>
-
-#include <stddef.h>
-#include <stdlib.h>
-
-#include "paddle/math/Matrix.h"
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/ParameterOptimizer.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/ThreadLocal.h"
-
-#include "ParameterService.pb.h"
-
-#include "ProtoServer.h"
-
-DECLARE_int32(port);
-
-namespace paddle {
-
-// @TODO(yanfei):
-// if armed with high density computation resource per node, pserver could also
-// utilize GPU to reduce overhead. if this mechanism is used, it could pipeline
-// network receiving and GPU computation to reduce the network overhead even
-// further. the pipeline could help to accelerate BIG model training.
-// @TODO:(yanfei)
-// for cpu and less/low gpu machine, the time exhausted by forward and backward
-// could be larger than optimization at pserver. However, if armed with lots of
-// gpus per node and if the model size is so large enough that limited cpu
-// computation causes big optmization latency, the GPU may be required by
-// pserver.
-
-/**
- * Client interface for the parameter server
- *
- * it implements several rpc API for remote parameter client usage.
- * for sync-sgd, client needs one controller thread to build connections
- * to all pservers, these controller connections do barriers
- * synchronization with these connections used for transfering data.
- * each data connection uses block based fine grained synchronization
- * to gain better scalability. Merging gradients from different trainers
- * are concurrently executed with block units, so that some network
- * overhead will be hidden in merging gradient.
- * for async-sgd, the difference is that pserver will do optimization
- * immediately if the gradients are ready, so that pserver needs to
- * prepare separate buffer to store value for sending back to trainer
- * to prevent from being polluted.
- */
-class ParameterServer2 : public ProtoServer {
-protected:
-  /// parameter_ mutex.
-  RWLock parameterMutex_;
-
-  typedef std::pair<size_t, int64_t> BlockKey;
-  struct BlockKeyHash {
-    size_t operator()(const BlockKey& key) const {
-      return std::hash<size_t>()(key.first) + key.second;
-    }
-  };
-
-  // TODO(yanfei):
-  // if index data structure is based on parameters instead of blocks, the
-  // lookup performance could be better. In addition, the block memory
-  // access almost exhibits good locality, so index data structure and
-  // block data structure can be refined further, especially if gpu is used
-  // for pserver.
-  /**
-   * all parameters are stored in CpuVector with a blockMap_ data structure
-   * to index block data required by requests.
-   */
-  typedef std::unordered_map<BlockKey, int64_t, BlockKeyHash> BlockMap;
-  /// <(para, block), global offset(byte) in all parameters>
-  BlockMap blockOffsetMap_;
-  /// <(para, block), global idx [0, nBlocksInAllParameters]>
-  BlockMap blockIdMap_;
-
-  std::vector<CpuVectorPtr> vectors_;
-  std::vector<CpuMatrixPtr> matrices_;
-  std::vector<CpuMemHandlePtr> dataMems_;
-
-  // TODO(yanfei):
-  // if storing sparse_remote_update() flag in request instead of
-  // reading configMap_, and storing config within new block wise
-  // overview data structure, the config mapping, block mapping
-  // can be unified in single clean data structure. Use para_id
-  // to index parameters, use offset to index block within parameter
-  // and keep two index into single one.
-  /**
-   * mapping between parameter and config
-   * different parameter allows different config, such as decay_rate.
-   * for each request, it need to read config for adding gradient
-   * and optmization.
-   */
-  std::unordered_map<size_t, ParameterConfig> configMap_;
-
-  /**
-   * to parallelize the multi-thread and multi-connnection
-   * computation at pserver, it use block unit to reduce
-   * the contention for computation, even further use block
-   * level optimizater control for each block for some special
-   * reason annotated below.
-   */
-  struct BlockInfo {
-    const ParameterConfig* config;
-    std::unique_ptr<std::mutex> lock;
-    /// global offset for all parameters
-    uint64_t offset;
-    /**
-     *
-     * Async sgd in pserver is very different from sync sgd.
-     * Each trainer follows startBatch, update*, finishBatch as in
-     * sync sgd, but all these actions are almost executed by
-     * multi-core and multi-thread simutaneously, so that async
-     * sgd optimization is based on block level in reality, then
-     * per block optimization is necessary indeed. In addition,
-     * per block optimization is also perfered for performance
-     * with multithreads.
-     */
-    std::unique_ptr<ParameterOptimizer> optimizer;
-  };
-  std::vector<BlockInfo> blockInfos_;
-
-  typedef std::vector<std::pair<int64_t, int64_t>> BlockSegments;
-  /// Because some blocks might not be fully used. We keep a
-  /// record of which segments are used.
-  BlockSegments usedSegments_;
-
-  /// record pserver status, all status defined in ParameterService.pb
-  PServerStatus status_;
-  /// record all samples processed which could be used by optimizater
-  std::atomic<int64_t> numSamplesProcessed_;
-  double cost_;
-  int mpiSize_;
-  int dataSize_;
-  /// configuration for current parameter optimizer
-  OptimizationConfig config_;
-
-  /**
-   * The ReadWriteBuffer is based on std::vector, but aligned for avx/sse
-   * compute. And add some helper method to allocate memory aligned blocks.
-   *
-   * @param T          type of element.
-   * @param AlignBytes the memory aligned bytes for allocated blocks.
-   */
-  template <typename T, size_t AlignBytes>
-  class ReadWriteBuffer
-      : public std::vector<T, AlignedAllocator<T, AlignBytes>> {
-  public:
-    static_assert(sizeof(T) % AlignBytes == 0 || AlignBytes % sizeof(T) == 0,
-                  "Type T must be able to aligned.");
-
-    /**
-     * @brief IsTLargerThanAlign compiled time calculated constant for is type
-     * T larger than alignments.
-     */
-    constexpr static bool IsTLargerThanAlign = sizeof(T) >= AlignBytes;
-
-    static_assert(std::is_pod<T>::value, "T must be POD type.");
-
-    /**
-     * @brief if AlignBytes > sizeof(T), then will calcuate how many elements
-     * can be stored in AlignBytes.
-     */
-    constexpr static size_t AlignElementCount = AlignBytes / sizeof(T);
-
-    static_assert(AlignElementCount ==
-                          (AlignElementCount & -AlignElementCount) ||
-                      AlignBytes > sizeof(T),
-                  "AlignElementCount should be exp of 2");
-
-    /**
-     * @brief Resize Buffer, with block count that will be allocated. Each block
-     * will be memory aligned in AlignBytes.
-     * @param size The element count in all blocks.
-     * @param alignBlockCount The block count that will be allocated.
-     */
-    void resizeWithAlignHints(size_t size, size_t alignBlockCount = 1) {
-      if (IsTLargerThanAlign) {  //! So, each elements is memory aligned.
-        this->resize(size);
-      } else {
-        //! at most, we need such elements in buffer to make sure each block is
-        //! aligned.
-        this->resize(size + alignBlockCount * (AlignElementCount - 1));
-      }
-    }
-
-    /**
-     * @brief reset aligned allocate blocks.
-     */
-    void resetAlignAlloc() { this->curOffset_ = 0; }
-
-    /**
-     * @brief get next aligned block address.
-     * @param blockSize is the element count in each block.
-     * @return Aligned block address.
-     */
-    T* nextBlock(size_t blockSize) {
-      T* r = &this->operator[](curOffset_);
-      curOffset_ += blockSize;
-
-      if (!IsTLargerThanAlign) {
-        curOffset_ =
-            (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1);
-      }
-      return r;
-    }
-
-  private:
-    size_t curOffset_;
-  };
-
-  /// to buffer the data from network for further processing to
-  /// reduce redundant memory allocation.
-  ThreadLocal<ReadWriteBuffer<real, ALIGN_HINT>> readWriteBuffer_;
-
-  /// size of the parameter
-  int64_t size_;
-
-  /// for synchronized training, check details in addGradient()
-  /// and doOperation()
-  ThreadBarrier gradientReadyBarrier_;
-  ThreadBarrier parameterReadyBarrier_;
-  ThreadBarrier passBarrier_;
-  ThreadLocal<std::vector<SendParameterRequest>> requestVec_;
-  ThreadLocal<std::vector<ProtoResponseCallbackEx>> callbackVec_;
-
-  std::atomic<int> numPassFinishClients_;
-  bool allClientPassFinish_;
-
-  std::vector<std::unique_ptr<ThreadBarrier>> synchronizeBarriers_;
-  std::atomic<int> serverId_;
-
-  /**
-   *
-   * for lagged async gradient gradient commit control in Async Sgd.
-   * discard lagged gradients from too slow nodes, whose gradients
-   * exhibits bad quality.
-   * Algorithm:
-   * pserver:
-   * 1. initial asyncUpdaterSteps = 0, asyncTrainerSteps_[N] = 0.
-   * syncUpdaterSteps means
-   *    the version of parameter value.
-   * 2. when pull arrives, record asyncUpdateSteps_ into
-   * syncTrainerSteps_[trainer_id]
-   * 3. when push arrives, compare asyncUpdateSteps_ with
-   * syncTrainerSteps_[trainer_id]
-   *    if delta > threshold, discard current gradient, else commit
-   *    gradient.
-   * 4. reset asyncUpdaterSteps_ and asyncTrainerSteps_[N] when pass
-   * finished
-   * Note:
-   * it can not discard all lag-gradient strictly in some special
-   * condition. part of gradients could be discarded if
-   * ConcurrentRemoteParameterUpdater is sed.
-   * this algorithm is implemented in asynSGD()
-   */
-  int64_t asyncLaggedThreshold_;
-  std::atomic<int64_t> asyncUpdateSteps_;
-  std::vector<int64_t> asyncTrainerSteps_;
-  size_t asyncLaggedGradientsNum_;
-  /// stat all async update
-  std::vector<size_t> asyncUpdateStat_;
-  /// stat per trainer_id
-  std::vector<size_t> asyncTrainerDiscardStat_;
-  /// stat per trainer_id
-  std::vector<size_t> asyncTrainerCommitStat_;
-
-  /// only used by controller and other control cmd from trainer number 0
-  std::unique_ptr<SyncThreadPool> syncThreadPool_;
-
-  /// pserver for sparse remote update parameters
-  bool isSparseServer_;
-
-  /// barrier performance tuning sync-sgd required
-  std::atomic<int64_t> batchId_;
-
-public:
-  struct Buffer {
-    real* base;
-    size_t size;
-  };
-
-protected:
-  /// async gradient commit control
-  bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
-
-public:
-  /// disable default parameter for overloading
-  /// @rdmaCpu:the id of cpu core hosting RDMA server(0-N)
-  /// -1 means using TCP transport instead of RDMA
-  ParameterServer2(const std::string& addr, int port, int rdmaCpu = -1);
-
-  ~ParameterServer2() {}
-
-  static const std::string kRetMsgInvalidMatrixHandle;
-  static const std::string kRetMsgInvalidVectorHandle;
-  static const std::string kRetMsgUnknownOperation;
-
-  /// service functions
-  template <typename Dtype>
-  void reduceAndSendData(const SendDataRequest& request,
-                         std::unique_ptr<MsgReader>& msgReader,
-                         ProtoResponseCallbackEx& callback);
-
-  void templateReduceSum(const SendDataRequest& request,
-                         std::unique_ptr<MsgReader>& msgReader,
-                         ProtoResponseCallbackEx& callback);
-
-  /**
-   * @brief framework for sending parameters
-   *
-   * @note  different parameter data type can be sent to pserver.
-   *        in most case, the api is used to send gradients from
-   *        trainer to pserver.
-   *        it also can be used to retrieve parameters from pserver
-   */
-  void sendParameter(const SendParameterRequest& request,
-                     std::unique_ptr<MsgReader> msgReader,
-                     ProtoResponseCallbackEx callback);
-
-  void sendData(const SendDataRequest& request,
-                std::unique_ptr<MsgReader> msgReader,
-                ProtoResponseCallbackEx callback);
-
-  /**
-   * @brief send config to pserver
-   *
-   * @note  it can help pserver to understand the configuration for
-   * optimization,
-   *        logging control, duplicated initialization, etc.
-   */
-  void setConfig(const SetConfigRequest& request,
-                 ProtoResponseCallback callback);
-
-  /**
-   * @brief get status for pserver
-   *
-   * @note  used to check if parameters are ready at pserver
-   */
-  void getStatus(const GetStatusRequest& request,
-                 ProtoResponseCallback callback);
-
-  /**
-   * @brief set status for pserver
-   *
-   * @note  used to check if parameters are ready at pserver, since parameters
-   *        at pserver are initialized by trainer
-   */
-  void setStatus(const SetStatusRequest& request,
-                 ProtoResponseCallback callback);
-
-  /**
-   * @brief framework for doing some operation at pserver end
-   *
-   * @note  if sync-sgd is used, controller will calling op_SGD action
-   *        for gradient optimization.
-   *        check avaiable operations in opFuncs[]
-   */
-  void doOperation(const DoOperationRequest& request,
-                   ProtoResponseCallback callback);
-
-  /// Create a column vector. The size is the dimension of parameter
-  void createVector(const CreateVectorRequest& request,
-                    ProtoResponseCallback callback);
-
-  void releaseVector(const ReleaseVectorRequest& request,
-                     ProtoResponseCallback callback);
-
-  /// Create a column major matrix. The number of rows is the dimension of
-  /// parameter. The number of columns is specifed by num_cols.
-  void createMatrix(const CreateMatrixRequest& request,
-                    ProtoResponseCallback callback);
-
-  void releaseMatrix(const ReleaseMatrixRequest& request,
-                     ProtoResponseCallback callback);
-  /**
-   * @brief stateful control for indicationg sync pass start
-   *
-   * @note  it is valuable for logging and state control,
-   *        especially for sync-sgd control
-   */
-  void waitPassStart(const WaitPassStartRequest& request,
-                     ProtoResponseCallback callback);
-
-  /**
-   * @brief stateful control for indicationg sync pass end
-   *
-   * @note  it is valuable for logging and state control,
-   *        especially for sync-sgd control
-   */
-  void waitPassFinish(const WaitPassFinishRequest& request,
-                      ProtoResponseCallback callback);
-
-  /**
-   * @brief synchronize all distributed trainers
-   *
-   * @note  it's general api for synchronizing trainer and pserver
-   */
-  void synchronize(const SynchronizeRequest& request,
-                   ProtoResponseCallback callback);
-
-  /**
-   * @brief stateful control for indicating async pass is finished
-   *
-   * @note  it is valuable for logging control, state reset, etc.
-   */
-  void asyncFinishPass(const SynchronizeRequest& request,
-                       ProtoResponseCallback callback);
-
-  void loadValueVector(const LoadValueRequest& request,
-                       ProtoResponseCallback callback);
-
-  void saveValueVector(const SaveValueRequest& request,
-                       ProtoResponseCallback callback);
-
-public:
-  /**
-   * @brief initialize parameter server
-   */
-  bool init();
-
-  /**
-   * @brief set parameters at pserver
-   *
-   * @note  do parameter initialization if neccessy.
-   */
-  void setParameter(const SendParameterRequest& request,
-                    std::vector<Buffer>& inputBuffers,
-                    SendParameterResponse* response,
-                    std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief receive gradients and do optimization for async-sgd
-   *
-   * @note  this api asynchronizately receives all data from all
-   *        trainers, and immediately do optimization and return
-   *        optimizated value for trainer.
-   *        this above routine are block based atomic updating,
-   *        which means different block could based different stale
-   *        gradient.
-   *        it will discard some lagged gradients by default for
-   *        better convergence.
-   */
-  void asyncSGD(const SendParameterRequest& request,
-                std::vector<Buffer>& inputBuffers,
-                SendParameterResponse* response,
-                std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief merge gradients from all trainer
-   *
-   * @note  this api use block based parallelization as fine grained
-   *        parallelization which benifits lock contention and latency
-   *        hidden for communication, also can harness multi-core
-   *        efficiently.
-   *        it also implements the synchronization for sync-sgd
-   */
-  void addGradient(const SendParameterRequest& request,
-                   std::vector<Buffer>& inputBuffers,
-                   SendParameterResponse* response,
-                   std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief get dense parameters from pserver
-   *
-   * @note  for some specified condition, trainer will get parameters from
-   *        pservers.
-   *        e.g.
-   *        if all parameters are stored at perver end for big model training
-   *        trainer can use it to retrieve all parameters if necessary.
-   */
-  void getParameter(const SendParameterRequest& request,
-                    std::vector<Buffer>& inputBuffers,
-                    SendParameterResponse* response,
-                    std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief get sparse value from parameter server
-   *
-   * @note  with sparse enabled, pservers own all latest value
-   *        while trainer only retrieve value that only are needed.
-   *        e.g.
-   *        trainer will do prefetch action to retrieve necessary latest
-   *        value from pserver for sparse calculation.
-   */
-  void getParameterSparse(const SendParameterRequest& request,
-                          std::vector<Buffer>& inputBuffers,
-                          SendParameterResponse* response,
-                          std::vector<Buffer>* outputBuffers);
-
-protected:
-  void mergeSegments(BlockSegments* segments);
-
-  /// set the unused segments to zero
-  void clearUnusedSegments(CpuVector* vec);
-
-  // TODO(yanfei):
-  // if read data and do optimization interleavely block by block,
-  // the performance could be better for gaining less network congestion.
-  /// read all data from connection and store it in static pre-allocated buffer
-  void readAllBlocks(MsgReader* msgReader,
-                     std::vector<ParameterServer2::Buffer>* buffers);
-
-  const ParameterConfig& getParameterConfig(const ParameterBlock& block) {
-    CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:"
-                                    << block.para_id();
-    const auto it = configMap_.find(block.para_id());
-    CHECK(it != configMap_.end()) << "can not find parameter id: "
-                                  << block.para_id();
-    return it->second;
-  }
-
-  /// it implictly check blockOffsetMap_ while retrieving blockId
-  const ParameterConfig& getParameterConfig(int64_t blockId) const {
-    CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size())
-        << "block idx out of range, id: " << blockId
-        << " info size: " << blockInfos_.size();
-    return *(blockInfos_[blockId].config);
-  }
-
-  template <class Response>
-  bool isValidVectorHandle(int64_t handle, Response* response) {
-    if (handle < 0 || (size_t)handle >= vectors_.size()) {
-      LOG(ERROR) << "Invalid vector handle " << handle;
-      response->set_return_message(kRetMsgInvalidVectorHandle);
-      return false;
-    }
-    return true;
-  }
-
-  template <class Response>
-  bool isValidMatrixHandle(int64_t handle, Response* response) {
-    if (handle < 0 || (size_t)handle >= matrices_.size()) {
-      LOG(ERROR) << "Invalid matrix handle " << handle;
-      response->set_return_message(kRetMsgInvalidMatrixHandle);
-      return false;
-    }
-    return true;
-  }
-
-  /**
-   * @brief get block offset
-   *
-   * @note  block.begin_dim is added to the block offset.
-   *        return -1 if block cannot be found
-   */
-  int64_t getBlockOffset(const ParameterBlock& block) const {
-    BlockKey key(block.para_id(), block.block_id());
-    auto it = blockOffsetMap_.find(key);
-    if (it == blockOffsetMap_.end()) {
-      return -1;
-    }
-    return it->second;
-  }
-
-  /// return -1 if block cannot be found
-  int64_t getBlockId(const ParameterBlock& block) const {
-    BlockKey key(block.para_id(), block.block_id());
-    auto it = blockIdMap_.find(key);
-    if (it == blockIdMap_.end()) {
-      return -1;
-    }
-    return it->second;
-  }
-
-  /**
-   * @brief prepare data for sending back
-   *
-   * @note  modify reponse and outputBuffers for sending parameter
-   *        back to client. The buffer for socket sending uses
-   *        vectors_[parameterType] directly
-   *        for dense with sync-sgd
-   */
-  void sendBackParameter(const ParameterBlock& block,
-                         int parameterType,
-                         SendParameterResponse* response,
-                         std::vector<Buffer>* outputBuffers);
-
-  /**
-   * @brief prepare data for sending back
-   *
-   * @note  modify response and outputBuffers for sending parameter
-   *        back to client. The buffer for socket sending uses buffer->base
-   *        The parameter values are copied from vectors_[parameterType]
-   *        to buffer->base.
-   *        for dense with async-sgd
-   */
-  void sendBackParameter(const ParameterBlock& block,
-                         int parameterType,
-                         SendParameterResponse* response,
-                         Buffer* buffer,
-                         std::vector<Buffer>* outputBuffers);
-  /**
-   * @brief prepare data for sending back
-   *
-   * @note  specified for sparse
-   */
-  void sendBackParameterSparse(const ParameterBlock& block,
-                               int parameterType,
-                               SendParameterResponse* response,
-                               Buffer* buffer,
-                               size_t width,
-                               std::vector<Buffer>* outputBuffers);
-
-  /**
-   * framework routine for block parallelization
-   * e.g.
-   * for optimization on all blocks at pserver end, this routine can facilitize
-   * the parallelize of do optimization on all blocks with multithreads.
-   */
-  typedef std::function<void(int64_t blockId, const VectorPtr vecs[])> ExecFunc;
-  void parallelExecForEachBlock(ExecFunc func);
-  void blockTraverse(BlockInfo& info,
-                     const ParameterConfig& config,
-                     int64_t offset,
-                     size_t size,
-                     const VectorPtr vecs[],
-                     const ParameterOptimizer::TraverseCallback& callback);
-
-public:
-  typedef void (ParameterServer2::*OperatorFunction)(const Operation& operation,
-                                                     OperationResult* result);
-
-  /**
-   * doOperation will call following operations indirectly
-   * e.g.
-   * for sync-sgd control, the controller in remote updater will send op_SGD
-   * command to pserver, then send sendParameter request to pserver immediately.
-   * the two function at pserver end will do cooperation to achieve the sync-sgd
-   * gradient merge and optimization.
-   * the most following operations are specified for owlqn, all operations are
-   * under the context of doOperation function
-   */
-  static OperatorFunction opFuncs[];
-
-  void op_SGD(const Operation& operation, OperationResult* result);
-
-  void op_RESET(const Operation& operation, OperationResult* result);
-
-  void op_utv(const Operation& operation, OperationResult* result);
-
-  void op_au_bv(const Operation& operation, OperationResult* result);
-
-  void op_COPY(const Operation& operation, OperationResult* result);
-
-  void op_au(const Operation& operation, OperationResult* result);
-
-  void op_au_bv_cw(const Operation& operation, OperationResult* result);
-
-  void op_make_steepest_desc_dir(const Operation& operation,
-                                 OperationResult* result);
-
-  void op_fix_dir_signs(const Operation& operation, OperationResult* result);
-
-  void op_dir_deriv(const Operation& operation, OperationResult* result);
-
-  void op_fix_omega_signs(const Operation& operation, OperationResult* result);
-
-  void op_cost(const Operation& operation, OperationResult* result);
-
-  void op_start_pass(const Operation& operation, OperationResult* result);
-  void op_finish_pass(const Operation& operation, OperationResult* result);
-
-  void op_apply(const Operation& operation, OperationResult* result);
-
-  void op_randomize(const Operation& operation, OperationResult* result);
-
-  void op_load(const Operation& operation, OperationResult* result);
-  void op_save(const Operation& operation, OperationResult* result);
-};
-
-}  // namespace paddle
diff --git a/paddle/pserver/ParameterServerController.h b/paddle/pserver/ParameterServerController.h
deleted file mode 100644
index 3a9bc74edf240a12fe1f7bd266f0311555349311..0000000000000000000000000000000000000000
--- a/paddle/pserver/ParameterServerController.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ParameterServer2.h"
-#include "ParameterServerConfig.pb.h"
-#include "RDMANetwork.h"
-#include "paddle/utils/StringUtil.h"
-
-namespace paddle {
-
-/**
- * @brief ParameterServerController is used for create, init and manage multi
- * parameter server instances. The num of the instances is decided by port
- * num(the ports number for parameter send) and network devices configured
- * by gflags or proto.
- */
-class ParameterServerController final {
-public:
-  DISABLE_COPY(ParameterServerController);
-
-  /**
-   * @brief Ctor, Create a ParameterServerController from ParameterServerConfig.
-   */
-  explicit ParameterServerController(const ParameterServerConfig& config);
-
-  /**
-   * @brief Dtor.
-   */
-  ~ParameterServerController();
-
-  /**
-   * @brief create ParameterServerController from gflags, this is used for
-   * compatibility with the old usage of configuration by gflags.
-   */
-  static ParameterServerController* createFromGflags();
-
-  /**
-   * @brief create ParameterServerController with ParameterServerConfig, remove
-   * gflags from ParameterServer. Init all ParameterServer2 instances according
-   * to
-   * the config.
-   */
-  static ParameterServerController* create(const ParameterServerConfig& config);
-
-  /**
-   * @brief start all ParameterServer2 instances in this
-   * ParameterServerController.
-   */
-  void start();
-
-  /**
-   * @brief join and wait for all ParameterServer2 instances thread in this
-   * ParameterServerController.
-   */
-  void wait();
-
-private:
-  std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
-};
-
-}  // namespace paddle
diff --git a/paddle/pserver/ProtoServer.h b/paddle/pserver/ProtoServer.h
deleted file mode 100644
index 3f78799dbfe1d4b80249e8cb27f269e6358903dd..0000000000000000000000000000000000000000
--- a/paddle/pserver/ProtoServer.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "LightNetwork.h"
-
-#include <map>
-
-#include <google/protobuf/message_lite.h>
-
-namespace paddle {
-
-/**
- *
- * It implements the rpc framework, which launchs one thread for each
- * connection. Here define one parameter server as single TCP server
- * binding on single port. All connections share single tcp ProtoServer
- * object, each connection handles all requests from specified trainer
- * within single worker thread.
- * to accelerate bandwidth efficiency and harness multicore for pserver
- * optimization to reduce pserver latency, you could launch more port
- * for single NIC hardward with --port=N(N>1) for small cluster job.
- */
-class ProtoServer : public SocketServer {
-public:
-  /// rdmaCpu controls the cpu affinity of RDMA server daemon,
-  /// which could benifit performance. rdmaCpu = -1 means TCP
-  /// is used instead of RDMA transport.
-  ProtoServer(const std::string& addr, int port, int rdmaCpu = -1)
-      : SocketServer(addr, port, rdmaCpu) {}
-
-  typedef std::function<void(const google::protobuf::MessageLite& protoOut,
-                             const std::vector<iovec>& outputIovs)>
-      ProtoResponseCallbackEx;
-
-  typedef std::function<void(const google::protobuf::MessageLite& protoOut)>
-      ProtoResponseCallback;
-
-  /**
-   * Register a service function for this server
-   * void(const ProtoIn& request,
-   *      ProtoResponseCallback callback)
-   * The service function process the request and call the callback
-   * after it finishes the request.
-
-   * Use macro REGISTER_SERVICE_FUNCTION as a helper
-   * to simplify the use.
-   */
-  template <class ProtoIn>
-  void registerServiceFunction(
-      const std::string& funcName,
-      std::function<void(const ProtoIn& request,
-                         ProtoResponseCallback callback)> func);
-
-  /**
-   * Register a service function for this server
-   * The signature of the service function is
-   * void(const ProtoIn&,
-   *      std::unique_ptr<MsgReader> msgReader,
-   *      ProtoResponseCallbackEx callback)
-   * The service function process the request and call the callback
-   * after it finishes the request.
-   * The extended service function can take extra input blocks from
-   * the communication channel by reading msgReader. It can also
-   * send extra blocks to the communication channel by providing
-   * outputIovs as the argument for the callback function.
-
-   * Use macro REGISTER_SERVICE_FUNCTION_EX as a helper
-   * to simplify the use.
-   */
-  template <class ProtoIn>
-  void registerServiceFunctionEx(
-      const std::string& funcName,
-      std::function<void(const ProtoIn&,
-                         std::unique_ptr<MsgReader> msgReader,
-                         ProtoResponseCallbackEx callback)> func);
-
-protected:
-  /**
-   * @brief handle rpc request
-   * @param[in] msgReader  Message reader for reading data from connection
-   * @param[in] callback   equal to channel->writeMessage
-   *
-   * @note  it lookups rpc function mapping table to find function pointer,
-   *        then call this function with further reading data from connection
-   */
-  virtual void handleRequest(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback);
-
-  typedef std::function<void(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback)>
-      ServiceFunction;
-
-  /**
-   * @brief register one RPC function in function mapping
-   * @param[in] funcName  function name string
-   * @param[in] func      rpc function wrapped with reading and writing data
-   */
-  void registerServiceFunctionImp(const std::string& funcName,
-                                  ServiceFunction func);
-
-protected:
-  /// Tuning bare network overhead: the beginning of receiving request
-  ThreadLocal<struct timeval> handleRequestBegin_;
-
-  /// mapping to find rpc function while handling request
-  std::map<std::string, ServiceFunction> nameToFuncMap_;
-};
-
-class ProtoClient : public SocketClient {
-public:
-  ProtoClient(const std::string& serverAddr,
-              int serverPort,
-              enum ChannelType channelType = F_TCP)
-      : SocketClient(serverAddr, serverPort, channelType) {}
-
-  /**
-   * @brief Make a request to the server.
-   * @param[in] funcName  request rpc function name string
-   * @param[in] proto     protobuf data for sending to pserver
-   * @param[in] iov       additional iov data for sending to pserver
-   *
-   * @note  iov provides additional blocks which need to be written to the
-   *        communication channel
-   */
-  void send(const char* funcName,
-            const google::protobuf::MessageLite& proto,
-            const std::vector<iovec>& iov = std::vector<iovec>());
-
-  /**
-   * @brief receive the response from the server.
-   * @param[in] proto     proto binary buffer
-   *
-   * @note  this must be paired with a corresponding send() call. The
-   *        returned MsgReader allows the caller to receive additional
-   *        blocks from the communication channel.
-   */
-  std::unique_ptr<MsgReader> recv(google::protobuf::MessageLite* proto);
-
-  /// combines send() and recv()
-  std::unique_ptr<MsgReader> sendAndRecv(
-      const char* funcName,
-      const google::protobuf::MessageLite& protoIn,
-      google::protobuf::MessageLite* protoOut) {
-    send(funcName, protoIn);
-    return recv(protoOut);
-  }
-
-  /// combines send() and recv()
-  std::unique_ptr<MsgReader> sendAndRecv(
-      const char* funcName,
-      const google::protobuf::MessageLite& protoIn,
-      const std::vector<iovec>& iov,
-      google::protobuf::MessageLite* protoOut) {
-    send(funcName, protoIn, iov);
-    return recv(protoOut);
-  }
-};
-
-template <class>
-struct service_arg_type;
-/// helper class for obtaining the argument type of a service function
-template <class R, class C, class Arg1, class Arg2>
-struct service_arg_type<R (C::*)(const Arg1&, Arg2)> {
-  typedef Arg1 _1;
-};
-
-template <class R, class C, class Arg1, class Arg2>
-struct service_arg_type<R (C::*)(  // NOLINT
-    const Arg1&,
-    std::unique_ptr<MsgReader>,
-    Arg2)> {
-  typedef Arg1 _1;
-};
-
-/// register a service function to the ProtoServer
-/// This should only be used within a member function of className
-#define REGISTER_SERVICE_FUNCTION(className, funcName)       \
-  registerServiceFunction<                                   \
-      service_arg_type<decltype(&className::funcName)>::_1>( \
-      #funcName,                                             \
-      std::bind(&className::funcName,                        \
-                this,                                        \
-                std::placeholders::_1,                       \
-                std::placeholders::_2))
-
-/// register a service function to the ProtoServer
-/// This should only be used within a member function of className
-#define REGISTER_SERVICE_FUNCTION_EX(className, funcName)    \
-  registerServiceFunctionEx<                                 \
-      service_arg_type<decltype(&className::funcName)>::_1>( \
-      #funcName,                                             \
-      std::bind(&className::funcName,                        \
-                this,                                        \
-                std::placeholders::_1,                       \
-                std::placeholders::_2,                       \
-                std::placeholders::_3))
-
-/// create wrapper function for parameter server high level function and
-/// register the wrapper function into function mapping.
-template <class ProtoIn>
-void ProtoServer::registerServiceFunctionEx(
-    const std::string& funcName,
-    std::function<void(const ProtoIn&,
-                       std::unique_ptr<MsgReader> msgReader,
-                       ProtoResponseCallbackEx callback)> func) {
-  auto f = [func](std::unique_ptr<MsgReader> msgReader,
-                  ResponseCallback callback) {
-    ProtoIn request;
-    std::string str(msgReader->getNextBlockLength(), 0);
-    msgReader->readNextBlock(&str[0]);
-    CHECK(request.ParseFromString(str));
-    auto pcob = [callback](const google::protobuf::MessageLite& response,
-                           const std::vector<iovec>& outputIovs) {
-      std::string out;
-      CHECK(response.SerializeToString(&out));
-      std::vector<iovec> iovs;
-      iovs.push_back({&out[0], out.size()});
-      iovs.insert(iovs.end(), outputIovs.begin(), outputIovs.end());
-      callback(iovs);
-    };
-
-    func(request, std::move(msgReader), pcob);
-  };
-
-  registerServiceFunctionImp(funcName, f);
-}
-
-template <class ProtoIn>
-void ProtoServer::registerServiceFunction(
-    const std::string& funcName,
-    std::function<void(const ProtoIn&, ProtoResponseCallback callback)> func) {
-  auto f = [func](std::unique_ptr<MsgReader> msgReader,
-                  ResponseCallback callback) {
-    ProtoIn request;
-    std::string str(msgReader->getNextBlockLength(), 0);
-    msgReader->readNextBlock(&str[0]);
-    CHECK(request.ParseFromString(str));
-    msgReader.reset();
-
-    auto pcob = [callback](const google::protobuf::MessageLite& response) {
-      std::string out;
-      CHECK(response.SerializeToString(&out));
-      std::vector<iovec> iovs;
-      iovs.push_back({&out[0], out.size()});
-      callback(iovs);
-    };
-
-    func(request, pcob);
-  };
-
-  registerServiceFunctionImp(funcName, f);
-}
-
-}  // namespace paddle
diff --git a/paddle/pserver/RDMANetwork.h b/paddle/pserver/RDMANetwork.h
deleted file mode 100644
index 83db6b9df71274c3a8eb3403457877b68f2b6dea..0000000000000000000000000000000000000000
--- a/paddle/pserver/RDMANetwork.h
+++ /dev/null
@@ -1,158 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifndef PADDLE_DISABLE_RDMA
-#include "sxi_sock.h"
-#else
-#define PROMPT_ERR() LOG(FATAL) << "Paddle is not compiled with rdma"
-#endif
-#include "paddle/utils/Logging.h"
-
-#include <netinet/in.h>
-struct sxi_sock;
-struct sxi_socket;
-
-#ifndef MAX_VEC_SIZE
-// define default MAX_VEC_SIZE
-#define MAX_VEC_SIZE (1UL << 16)
-#endif
-
-namespace paddle {
-/// Namespace rdma is adaptors for sxi_sock.h. Make paddle not depend on it
-/// when disable rdma support
-namespace rdma {
-inline int numCpus() {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_num_configured_cpus();
-#else
-  return 0;
-#endif
-}
-
-inline sxi_socket* ssocket(int cpuId) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_ssocket(cpuId);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int listen(sxi_socket* s) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_listen(s);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int bind(sxi_socket* s, const char* str) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_bind(s, str);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sxi_sock* accept(sxi_socket* s) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_accept(s);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sockaddr_in* getSourceAddress(sxi_sock* sock) {
-#ifndef PADDLE_DISABLE_RDMA
-  return reinterpret_cast<sockaddr_in*>(&sock->sa);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int close(sxi_socket* sock) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_socket_close(sock);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline int close(sxi_sock* sock) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_sock_close(sock);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline void init() {
-#ifndef PADDLE_DISABLE_RDMA
-  sxi_module_init();
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sxi_socket* csocket(int cpuId) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_csocket(cpuId);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t read(sxi_sock* channel, void* data, size_t len) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_read(channel, data, len);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t write(sxi_sock* channel, void* data, size_t len) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_write(channel, data, len);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t readv(sxi_sock* channel, iovec* iov, int count) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_readv(channel, iov, count);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline ssize_t writev(sxi_sock* channel, iovec* iov, int count) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_writev(channel, iov, count);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-inline sxi_sock* connect(sxi_socket* socket, const char* url) {
-#ifndef PADDLE_DISABLE_RDMA
-  return sxi_connect(socket, url);
-#else
-  PROMPT_ERR();
-#endif
-}
-
-}  //  namespace rdma
-}  //  namespace paddle
diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp
deleted file mode 100644
index 72e6943408a1856db214262ff0b0698a2eb89a91..0000000000000000000000000000000000000000
--- a/paddle/pserver/SocketChannel.cpp
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "SocketChannel.h"
-
-#include <netdb.h>
-#include <netinet/in.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include "RDMANetwork.h"
-
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-/**
- * UIO_MAXIOV is documented in writev(2), but <sys/uio.h> only
- * declares it on osx/ios if defined(KERNEL)
- */
-#ifndef UIO_MAXIOV
-#define UIO_MAXIOV 512
-#endif
-
-SocketChannel::~SocketChannel() {
-  if (tcpRdma_ == F_TCP)
-    close(tcpSocket_);
-  else
-    rdma::close(rdmaSocket_);
-  LOG(INFO) << "destory connection in socket channel, peer = " << peerName_;
-}
-
-size_t SocketChannel::read(void* buf, size_t size) {
-  size_t total = 0;
-  while (total < size) {
-    ssize_t len;
-    if (tcpRdma_ == F_TCP)
-      len = ::read(tcpSocket_, (char*)buf + total, size - total);
-    else
-      len = rdma::read(rdmaSocket_, (char*)buf + total, size - total);
-
-    CHECK(len >= 0) << " peer=" << peerName_;
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-size_t SocketChannel::write(const void* buf, size_t size) {
-  size_t total = 0;
-  while (total < size) {
-    ssize_t len;
-    if (tcpRdma_ == F_TCP)
-      len = ::write(tcpSocket_, (const char*)buf + total, size - total);
-    else
-      len = rdma::write(rdmaSocket_, (char*)buf + total, size - total);
-
-    CHECK(len >= 0) << " peer=" << peerName_;
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-template <class IOFunc, class SocketType>
-static size_t readwritev(IOFunc iofunc,
-                         SocketType socket,
-                         iovec* iovs,
-                         int iovcnt,
-                         int maxiovs,
-                         const std::string& peerName) {
-  int curIov = 0;
-  size_t total = 0;
-
-  for (int i = 0; i < iovcnt; ++i) {
-    total += iovs[i].iov_len;
-  }
-
-  size_t size = 0;
-  size_t curIovSizeDone = 0;
-
-  while (size < total) {
-    ssize_t len =
-        iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs));
-    CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
-                   << " iovCnt=" << iovcnt
-                   << " iovs[curIov].base=" << iovs[curIov].iov_base
-                   << " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
-    size += len;
-
-    /// restore iovs[curIov] to the original value
-    iovs[curIov].iov_base =
-        (void*)((char*)iovs[curIov].iov_base - curIovSizeDone);
-    iovs[curIov].iov_len += curIovSizeDone;
-
-    len += curIovSizeDone;
-
-    while (curIov < iovcnt) {
-      if ((size_t)len < iovs[curIov].iov_len) break;
-      len -= iovs[curIov].iov_len;
-      ++curIov;
-    }
-    if (curIov < iovcnt) {
-      curIovSizeDone = len;
-      iovs[curIov].iov_base = (void*)((char*)iovs[curIov].iov_base + len);
-      iovs[curIov].iov_len -= len;
-    }
-  }
-  return size;
-}
-
-/// rdma::readv and rdma::writev can take advantage of RDMA blocking offload
-/// transfering
-size_t SocketChannel::writev(const std::vector<struct iovec>& iovs) {
-  if (tcpRdma_ == F_TCP)
-    return readwritev(::writev,
-                      tcpSocket_,
-                      const_cast<iovec*>(&iovs[0]),
-                      iovs.size(),
-                      UIO_MAXIOV,
-                      peerName_);
-  else
-    return readwritev(rdma::writev,
-                      rdmaSocket_,
-                      const_cast<iovec*>(&iovs[0]),
-                      iovs.size(),
-                      MAX_VEC_SIZE,
-                      peerName_);
-}
-
-size_t SocketChannel::readv(std::vector<struct iovec>* iovs) {
-  if (tcpRdma_ == F_TCP)
-    return readwritev(::readv,
-                      tcpSocket_,
-                      const_cast<iovec*>(&(*iovs)[0]),
-                      iovs->size(),
-                      UIO_MAXIOV,
-                      peerName_);
-  else
-    return readwritev(rdma::readv,
-                      rdmaSocket_,
-                      const_cast<iovec*>(&(*iovs)[0]),
-                      iovs->size(),
-                      MAX_VEC_SIZE,
-                      peerName_);
-}
-
-void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
-  MessageHeader header;
-  header.numIovs = userIovs.size();
-
-  std::vector<size_t> iovLengths;
-  iovLengths.reserve(userIovs.size());
-  for (auto& iov : userIovs) {
-    iovLengths.push_back(iov.iov_len);
-  }
-
-  std::vector<iovec> iovs;
-  iovs.reserve(userIovs.size() + 2);
-  iovs.push_back({&header, sizeof(header)});
-  iovs.push_back({&iovLengths[0],
-                  static_cast<size_t>(sizeof(iovLengths[0]) * header.numIovs)});
-  iovs.insert(iovs.end(), userIovs.begin(), userIovs.end());
-
-  header.totalLength = 0;
-  for (auto& iov : iovs) {
-    header.totalLength += iov.iov_len;
-  }
-
-  CHECK(writev(iovs) == (size_t)header.totalLength);
-}
-
-std::unique_ptr<MsgReader> SocketChannel::readMessage() {
-  MessageHeader header;
-
-  size_t len = read(&header, sizeof(header));
-  if (len == 0) {
-    return nullptr;
-  }
-
-  CHECK(len == sizeof(header));
-
-  std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs));
-
-  CHECK_EQ(msgReader->getTotalLength() + sizeof(header) +
-               msgReader->getNumBlocks() * sizeof(size_t),
-           (size_t)header.totalLength)
-      << " totalLength=" << msgReader->getTotalLength()
-      << " numBlocks=" << msgReader->getNumBlocks();
-  return msgReader;
-}
-
-MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks)
-    : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) {
-  size_t size = numBlocks * sizeof(blockLengths_[0]);
-  CHECK(channel_->read(&blockLengths_[0], size) == size);
-}
-
-void MsgReader::readBlocks(const std::vector<void*>& bufs) {
-  CHECK_LE(currentBlockIndex_ + bufs.size(), blockLengths_.size());
-  std::vector<iovec> iovs;
-  iovs.reserve(bufs.size());
-  size_t totalLength = 0;
-  for (void* buf : bufs) {
-    iovs.push_back({buf, getNextBlockLength()});
-    totalLength += getNextBlockLength();
-    ++currentBlockIndex_;
-  }
-
-  CHECK(channel_->readv(&iovs) == totalLength);
-}
-
-void MsgReader::readNextBlock(void* buf) {
-  CHECK_LT(currentBlockIndex_, blockLengths_.size());
-  CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
-  ++currentBlockIndex_;
-}
-
-}  // namespace paddle
diff --git a/paddle/pserver/SocketChannel.h b/paddle/pserver/SocketChannel.h
deleted file mode 100644
index c0f30d0db760045a8c0cb001fcadaae8f0c03f9d..0000000000000000000000000000000000000000
--- a/paddle/pserver/SocketChannel.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/utils/Util.h"
-
-#include <sys/uio.h>
-
-#include <memory>
-#include <vector>
-
-struct sxi_sock;
-
-namespace paddle {
-
-class SocketChannel;
-enum ChannelType {
-  F_TCP = 1,
-  F_RDMA = 2,
-};
-
-/// reading a set of blocks of data from SocketChannel.
-class MsgReader {
-public:
-  MsgReader(SocketChannel* channel, size_t numIovs);
-  ~MsgReader() {
-    /// ensure all data blocks have been processed
-    CHECK_EQ(currentBlockIndex_, blockLengths_.size());
-  }
-  /**
-   * @brief number of remaining parts
-   */
-  size_t getNumBlocks() const {
-    return blockLengths_.size() - currentBlockIndex_;
-  }
-
-  /**
-   * @brief lenght of next block
-   */
-  size_t getNextBlockLength() const { return getBlockLength(0); }
-
-  /**
-   * @brief get the total length of all the remaining blocks
-   */
-  size_t getTotalLength() const {
-    size_t total = 0;
-    for (size_t i = currentBlockIndex_; i < blockLengths_.size(); ++i) {
-      total += blockLengths_[i];
-    }
-    return total;
-  }
-
-  /**
-   * @brief Get the length for block currentBlockIndex + i
-   */
-  size_t getBlockLength(size_t i) const {
-    return blockLengths_[currentBlockIndex_ + i];
-  }
-
-  /**
-   * @brief  read blocks data and store it to buf
-   */
-  void readBlocks(const std::vector<void*>& bufs);
-  void readNextBlock(void* buf);
-
-protected:
-  SocketChannel* channel_;
-  std::vector<size_t> blockLengths_;
-  size_t currentBlockIndex_;
-};
-
-/// APIs for reading and writing byte stream data or naive iov data
-/// from the APIs both RDMA and TCP exhibits byte stream style
-class SocketChannel {
-public:
-  SocketChannel(int socket, const std::string& peerName)
-      : tcpSocket_(socket), peerName_(peerName) {
-    tcpRdma_ = F_TCP;
-  }
-  SocketChannel(struct sxi_sock* socket, const std::string& peerName)
-      : rdmaSocket_(socket), peerName_(peerName) {
-    tcpRdma_ = F_RDMA;
-  }
-
-  ~SocketChannel();
-
-  const std::string& getPeerName() const { return peerName_; }
-
-  /**
-   * @brief read size bytes.
-   *
-   * @note  keep reading until getting size bytes or sock is closed
-   *        is closed
-   */
-  size_t read(void* buf, size_t size);
-
-  /**
-   * @brief write size bytes.
-   *
-   * @note  keep writing until writing size bytes or sock is closed
-   */
-  size_t write(const void* buf, size_t size);
-
-  /**
-   * @brief write a set of buffers.
-   *
-   * @note  keep writing until all buffers are written or sock is closed
-   */
-  size_t writev(const std::vector<struct iovec>& iov);
-
-  /**
-   * @brief read a set of buffers.
-   *
-   * @note  keep reading until all buffers are full or sock is closed.
-   */
-  size_t readv(std::vector<struct iovec>* iov);
-
-  /**
-   * @brief write a set of buffers.
-   *
-   * @note  keep writing until all buffers are passed or sock is closed
-   */
-  void writeMessage(const std::vector<struct iovec>& iov);
-
-  /// return null to indicate socket is closed
-  std::unique_ptr<MsgReader> readMessage();
-
-protected:
-  struct MessageHeader {
-    int64_t totalLength;  /// include the header
-    int64_t numIovs;
-    int64_t iovLengths[0];
-  };
-
-  int tcpSocket_;
-  struct sxi_sock* rdmaSocket_;
-  const std::string peerName_;
-  enum ChannelType tcpRdma_;
-};
-
-}  // namespace paddle
diff --git a/paddle/pserver/SparseParameterDistribution.cpp b/paddle/pserver/SparseParameterDistribution.cpp
deleted file mode 100644
index bb247f389cc26b32ff79d36bdf5c81ba8591dc58..0000000000000000000000000000000000000000
--- a/paddle/pserver/SparseParameterDistribution.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <unistd.h>
-
-#include "paddle/utils/Logging.h"
-
-#include "paddle/utils/Flags.h"
-
-#include "SparseParameterDistribution.h"
-
-DEFINE_bool(check_sparse_distribution_in_pserver,
-            false,
-            "check whether sparse parameter exhibts balanced distribution at "
-            "all pservers");
-DEFINE_bool(show_check_sparse_distribution_log,
-            false,
-            "show logs details for sparse parameter distribution in pserver");
-DEFINE_int32(check_sparse_distribution_batches,
-             100,
-             "run sparse parameter distribution check for N batches");
-DEFINE_double(
-    check_sparse_distribution_ratio,
-    0.6,
-    "if parameters dispatched to different pservers exhibit unbalanced "
-    " distribution for check_sparse_distribution_ratio * "
-    " check_sparse_distribution_batches times, crash program");
-DEFINE_double(check_sparse_distribution_unbalance_degree,
-              2.0,
-              "the ratio of maximum data size and minimun data size for "
-              "different pserver");
-
-namespace paddle {
-
-SparseParameterDistribution::SparseParameterDistribution(size_t serviceNum) {
-  totBytes_ = 0;
-  data_.resize(serviceNum);
-
-  batchPassed_ = 0;
-  unbalanceCnt_ = 0;
-}
-
-void SparseParameterDistribution::probeDistribution(int serverId,
-                                                    size_t dataSize) {
-  if (!FLAGS_check_sparse_distribution_in_pserver ||
-      batchPassed_ > FLAGS_check_sparse_distribution_batches) {
-    return;
-  }
-
-  CHECK_LT((size_t)serverId, data_.size())
-      << "invalid sparse parameter distribution probe";
-
-  data_[serverId] += dataSize;
-  totBytes_ += dataSize;
-}
-
-void SparseParameterDistribution::checkAndResetDistribution() {
-  if (!FLAGS_check_sparse_distribution_in_pserver ||
-      batchPassed_ >= FLAGS_check_sparse_distribution_batches) {
-    return;
-  }
-
-  /// at runtime, prepareSendData is called by many contexts,
-  /// so need to check if data is avaiable.
-  if (!totBytes_) {
-    return;
-  }
-
-  /// check if distribution is balanced
-  auto avgSize = totBytes_ / data_.size();
-  auto unbalanceDegree = FLAGS_check_sparse_distribution_unbalance_degree;
-  for (auto& dataSize : data_) {
-    if (dataSize > unbalanceDegree * avgSize ||
-        dataSize * unbalanceDegree < avgSize) {
-      unbalanceCnt_++;
-      break;
-    }
-  }
-
-  auto printData = [&]() {
-    std::stringstream ss;
-    for (auto& dataSize : data_) {
-      ss << dataSize * 0.001 << "KB ";
-    }
-    ss << std::endl;
-    LOG(INFO) << ss.str();
-  };
-
-  /// show all sparse data size for different pserver
-  if (FLAGS_show_check_sparse_distribution_log) {
-    LOG(INFO) << "sparse distribution:";
-    printData();
-  }
-
-  totBytes_ = 0;
-  batchPassed_++;
-
-  if (batchPassed_ == FLAGS_check_sparse_distribution_batches) {
-    LOG(INFO) << "show last parameter distribution sample:";
-    printData();
-    LOG(INFO) << "total unbalanced batches: " << unbalanceCnt_
-              << " in passed batches: " << batchPassed_;
-    CHECK_LE((float)unbalanceCnt_ / (float)batchPassed_,
-             FLAGS_check_sparse_distribution_ratio)
-        << "unbalanced sparse parameter distribution for different pserver. "
-        << "it could be caused by unbalanced sparse ids distribution, try "
-        << "to shuffle dimensions in input samples";
-  }
-
-  std::fill(data_.begin(), data_.end(), 0);
-}
-}  // namespace paddle
diff --git a/paddle/pserver/SparseParameterDistribution.h b/paddle/pserver/SparseParameterDistribution.h
deleted file mode 100644
index 13f199548d56262e77e91e45052f3e435dea407c..0000000000000000000000000000000000000000
--- a/paddle/pserver/SparseParameterDistribution.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <unistd.h>
-
-#include <atomic>
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-/*
- * if sparse_remote_updater is used, different ParameterServer could
- * be assigned with unbalanced gradients. the parameter value from
- * ParameterServer also be not balanced. the distribution of different
- * dimensions of sparse ids determines the unbalanced degree of data
- * distributed among all ParameterServers. Even distribution will
- * benifits cluster efficiency.
- * do check the unbalanced degree of gradients at runtime, crash program
- * if unbalanced distribution exhibts by default.
- */
-class SparseParameterDistribution {
-public:
-  /// serviceNum means the number of ParameterServers
-  explicit SparseParameterDistribution(size_t serviceNum);
-  ~SparseParameterDistribution() {}
-  /// collect data
-  void probeDistribution(int serverId, size_t data);
-  void checkAndResetDistribution();
-
-private:
-  std::vector<size_t> data_;
-  std::atomic<size_t> totBytes_;
-
-  /// after some batches, stop to check
-  int batchPassed_;
-
-  /// stat on unbalanced distribution found
-  int unbalanceCnt_;
-};
-}  // namespace paddle
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
deleted file mode 100644
index 6019dccaadf7fab5a1db7183c07cbbd9562dab2e..0000000000000000000000000000000000000000
--- a/paddle/pserver/test/SocketTest.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Util.h"
-
-#include <netdb.h>
-#include <netinet/in.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-
-#include <thread>
-
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Logging.h"
-
-struct MessageHeader {
-  int64_t dataLength;
-};
-
-class Thread {
-public:
-  void start();
-  virtual void run() = 0;
-  virtual ~Thread() {}
-
-protected:
-  std::unique_ptr<std::thread> thread_;
-};
-
-void Thread::start() {
-  thread_.reset(new std::thread([this]() { this->run(); }));
-}
-
-class SocketChannel {
-public:
-  explicit SocketChannel(int socket) : socket_(socket) {}
-  int getSocketFd() const { return socket_; }
-  uint64_t readAll(void* buf, size_t size);
-  uint64_t writeAll(const void* buf, size_t size);
-
-protected:
-  int socket_;
-};
-
-uint64_t SocketChannel::readAll(void* buf, size_t size) {
-  uint64_t total = 0;
-  while (total < size) {
-    int64_t len = read(socket_, (char*)buf + total, size - total);
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-uint64_t SocketChannel::writeAll(const void* buf, size_t size) {
-  uint64_t total = 0;
-  while (total < size) {
-    int64_t len = write(socket_, (const char*)buf + total, size - total);
-    if (len <= 0) {
-      return total;
-    }
-    total += len;
-  }
-  return total;
-}
-
-class SocketWorker : public Thread {
-public:
-  explicit SocketWorker(int socket) : channel_(socket) {}
-  virtual void run();
-
-  // read n bytes.
-  int64_t readAll(char* buf, size_t n);
-
-  // write n bytes
-
-protected:
-  SocketChannel channel_;
-  std::string buffer_;
-};
-
-class SocketServer : public Thread {
-public:
-  explicit SocketServer(int port)
-      : port_(port), socket_(0), maxPendingConnections_(100) {}
-
-  virtual void run();
-
-protected:
-  int port_;
-  int socket_;
-  int maxPendingConnections_;
-};
-
-void SocketServer::run() {
-  int newsockfd;
-  socklen_t clilen;
-  struct sockaddr_in serv_addr, cli_addr;
-
-  /* First call to socket() function */
-  socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(socket_ >= 0) << "ERROR opening socket";
-
-  /* Initialize socket structure */
-  bzero((char*)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  serv_addr.sin_addr.s_addr = INADDR_ANY;
-  serv_addr.sin_port = htons(port_);
-
-  /* Now bind the host address using bind() call.*/
-  CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR on binding";
-
-  /* Now start listening for the clients, here process will
-   * go in sleep mode and will wait for the incoming connection
-   */
-  listen(socket_, maxPendingConnections_);
-  clilen = sizeof(cli_addr);
-
-  while (true) {
-    /* Accept actual connection from the client */
-    newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen);
-    CHECK(newsockfd >= 0) << "ERROR on accept";
-
-    SocketWorker* worker = new SocketWorker(newsockfd);
-    worker->start();
-  }
-}
-
-void SocketWorker::run() {
-  MessageHeader header;
-
-  while (true) {
-    int64_t n = channel_.readAll(&header, sizeof(header));
-    CHECK(n == sizeof(header)) << "ERROR reading from socket";
-
-    buffer_.resize(header.dataLength);
-    n = channel_.readAll(&buffer_[0], header.dataLength);
-    CHECK(n == header.dataLength) << "ERROR reading from socket";
-
-    /* Write a response to the client */
-    n = channel_.writeAll(&header, sizeof(header));
-    CHECK(n == sizeof(header)) << "ERROR reading from socket";
-    n = channel_.writeAll(buffer_.data(), buffer_.size());
-    CHECK(n == header.dataLength) << "ERROR writing to socket";
-  }
-}
-
-class SocketClient {
-public:
-  SocketClient(const std::string& serverAddr, int serverPort);
-  SocketChannel* getChannel() const { return channel_.get(); }
-
-protected:
-  std::unique_ptr<SocketChannel> channel_;
-};
-
-SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
-  struct sockaddr_in serv_addr;
-  struct hostent* server;
-
-  // char buffer[256];
-
-  /* Create a socket point */
-  int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  CHECK(sockfd >= 0) << "ERROR opening socket";
-  server = gethostbyname(serverAddr.c_str());
-  CHECK(server) << "ERROR, no such host: " << serverAddr;
-
-  bzero((char*)&serv_addr, sizeof(serv_addr));
-  serv_addr.sin_family = AF_INET;
-  bcopy((char*)server->h_addr,
-        (char*)&serv_addr.sin_addr.s_addr,
-        server->h_length);
-  serv_addr.sin_port = htons(serverPort);
-
-  /* Now connect to the server */
-  CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
-      << "ERROR connecting";
-
-  channel_.reset(new SocketChannel(sockfd));
-}
-
-DEFINE_string(server_addr, "127.0.0.1", "Server address");
-DEFINE_int64(dim, 10000000, "Data size");
-DEFINE_int32(loop_time, 100000, "test loop time");
-
-using namespace paddle;  // NOLINT
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  SocketServer server(FLAGS_port);
-  server.start();
-  sleep(1);
-
-  SocketClient client(FLAGS_server_addr, FLAGS_port);
-
-  SocketChannel* channel = client.getChannel();
-
-  MessageHeader header;
-
-  uint64_t dataSize = FLAGS_dim * sizeof(real);
-
-#ifdef PADDLE_WITH_CUDA
-  GpuVector gpuParam(FLAGS_dim);
-  GpuVector gpuGrad(FLAGS_dim);
-#else
-  CpuVector gpuParam(FLAGS_dim);
-  CpuVector gpuGrad(FLAGS_dim);
-#endif
-  CpuVector cpuParam(FLAGS_dim);
-  CpuVector cpuGrad(FLAGS_dim);
-
-  gpuParam.rand();
-  gpuGrad.rand();
-  cpuParam.rand();
-  cpuGrad.rand();
-
-  for (int i = 0; i < FLAGS_loop_time; ++i) {
-    cpuGrad.copyFrom(gpuGrad);
-
-    header.dataLength = dataSize;
-    CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
-        << "Client write header error";
-
-    CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
-        << "Client write data error";
-
-    /* Now read server response */
-    CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
-        << "Client read header error";
-
-    CHECK_EQ((uint64_t)header.dataLength, dataSize);
-    CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
-        << "Client read data error";
-
-    gpuParam.copyFrom(cpuParam);
-
-    LOG_EVERY_N(INFO, 100) << "i=" << i;
-  }
-  exit(0);
-}
diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp
deleted file mode 100644
index e742cd0871da865e02a60a125a936eea8f15e575..0000000000000000000000000000000000000000
--- a/paddle/pserver/test/test_ParameterServer2.cpp
+++ /dev/null
@@ -1,624 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/pserver/ParameterClient2.h>
-#include <paddle/pserver/ParameterServer2.h>
-#include <paddle/utils/Flags.h>
-#include <paddle/utils/Util.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(num_gradient_servers);
-DEFINE_string(server_addr, "127.0.0.1", "assign server address");
-DEFINE_int32(server_cpu, 0, "assign server cpu");
-
-class ParameterServer2Tester : public ParameterServer2 {
-public:
-  ParameterServer2Tester(std::string serverAddr,
-                         int port,
-                         int rdmaCpu = -1,
-                         bool sepSendAndRecv = false)
-      : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {}
-  virtual ~ParameterServer2Tester() {}
-  void setup() {
-    CHECK(ParameterServer2::init());
-
-    parameters_.clear();
-    clientConfigs_.clear();
-
-    clientConfigs_.resize(2);
-    {
-      ParameterConfig& config = clientConfigs_[0];
-      config.set_name("para0");
-      config.set_para_id(0);
-      config.set_size(10000);
-      config.set_device(-1);
-      config.set_learning_rate(1.0);
-      config.set_momentum(0.9);
-    }
-
-    {
-      ParameterConfig& config = clientConfigs_[1];
-      config.set_name("para1");
-      config.set_para_id(1);
-      config.set_size(5000);
-      config.set_device(-1);
-      config.set_learning_rate(0.5);
-      config.set_momentum(0.4);
-    }
-
-    for (auto& config : clientConfigs_) {
-      parameters_.emplace_back(new Parameter(config, /* useGpu= */ false));
-    }
-
-    size_t id = 0;
-    for (auto& para : parameters_) {
-      para->setID(id++);
-    }
-
-    CHECK(client_.init(parameters_));
-    OptimizationConfig optConfig;
-    optConfig.set_algorithm("async_sgd");
-    optConfig.set_batch_size(100);
-    optConfig.set_learning_rate(0.1);
-    client_.setConfig(optConfig);
-    client_.setParameter();
-  }
-
-  void setConfigTest();
-  void setStatusTest();
-  void sendParameterTest();
-  void sendDataTest(SendDataType type, size_t size);
-  void operationTest();
-  void mergeBlockSegmentTest();
-  void checkSegments(const BlockSegments& expected, const BlockSegments& segs);
-  void waitPassFinishTest();
-  void synchronizeTest();
-
-protected:
-  ParameterClient2 client_;
-  vector<ParameterConfig> clientConfigs_;
-  vector<ParameterPtr> parameters_;
-};
-
-std::unique_ptr<ParameterServer2Tester> g_server;
-
-void ParameterServer2Tester::setConfigTest() {
-  setup();
-
-  for (auto& config : clientConfigs_) {
-    auto it = configMap_.find(config.para_id());
-    EXPECT_TRUE(it != configMap_.end());
-    auto& serverConfig = it->second;
-    EXPECT_EQ(config.name(), serverConfig.name());
-    EXPECT_EQ(config.size(), serverConfig.size());
-    EXPECT_EQ(config.learning_rate(), serverConfig.learning_rate());
-    EXPECT_EQ(config.momentum(), serverConfig.momentum());
-  }
-}
-
-void ParameterServer2Tester::setStatusTest() {
-  setup();
-  EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_NOT_SET));
-  client_.setStatus(PSERVER_STATUS_PARAMETER_READY);
-  EXPECT_EQ(PSERVER_STATUS_PARAMETER_READY, status_);
-  EXPECT_TRUE(client_.inStatus(PSERVER_STATUS_PARAMETER_READY));
-}
-
-real sumVector(const CpuVector& vec) {
-  const real* data = vec.getData();
-  size_t dim = vec.getSize();
-  real sum = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    sum += data[i];
-  }
-  return sum;
-}
-
-void ParameterServer2Tester::sendParameterTest() {
-  setup();
-
-  client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM,
-                                  PARAMETER_VALUE,
-                                  0,       // numSamples = 0
-                                  0,       // cost = 0
-                                  false);  // sendBackParameter = false
-
-  vector<ParameterPtr> parameterCopies;
-
-  for (auto& parameter : parameters_) {
-    parameterCopies.emplace_back(
-        new Parameter(parameter->getConfig(), /* useGpu= */ false));
-    parameterCopies.back()
-        ->getBuf(PARAMETER_VALUE)
-        ->copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-  }
-
-  client_.sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM,
-                                  PARAMETER_VALUE,
-                                  0,      // numSamples = 0
-                                  0,      // cost = 0
-                                  true);  // sendBackParameter = true
-
-  for (size_t i = 0; i != parameters_.size(); ++i) {
-    real* v1 = parameters_[i]->getBuf(PARAMETER_VALUE)->getData();
-    real* v2 = parameterCopies[i]->getBuf(PARAMETER_VALUE)->getData();
-    EXPECT_EQ(parameters_[i]->getSize(), parameterCopies[i]->getSize());
-    size_t size = parameters_[i]->getSize();
-    real sum1 = 0, sum2 = 0;
-    for (size_t j = 0; j < size; ++j) {
-      sum1 += v1[j];
-      sum2 += v2[j];
-    }
-    EXPECT_EQ(sum1, sum2);
-  }
-}
-
-void ParameterServer2Tester::sendDataTest(SendDataType type, size_t size) {
-  ParameterClient2 client1(true);
-  client1.init(parameters_);
-  ParameterClient2 client2(true);
-  client2.init(parameters_);
-  ParameterClient2 client3(true);
-  client3.init(parameters_);
-
-  ThreadWorker worker1;
-  ThreadWorker worker2;
-  ThreadWorker worker3;
-
-  double* testData1 = new double[size];
-  double* testData2 = new double[size];
-  double* testData3 = new double[size];
-  double* getDataExpect = new double[size];
-  double* getDataReal = new double[size];
-  for (size_t i = 0; i < size; ++i) {
-    testData1[i] = rand();  // NOLINT TODO(yuyang18): Use rand_r instead.
-    testData2[i] = rand();  // NOLINT
-    testData3[i] = rand();  // NOLINT
-    getDataExpect[i] = testData1[i] + testData2[i] + testData3[i];
-  }
-
-  auto put1 = [&]() {
-    LOG(INFO) << "putOwnData1 start";
-    client1.putOwnData(0, type, testData1, size);
-    LOG(INFO) << "putOwnData1 finish";
-  };
-
-  auto get1 = [&]() {
-    LOG(INFO) << "sendData1 get all start";
-    client1.getAllData(0, type, getDataReal, size);
-    for (size_t i = 0; i < size; ++i) {
-      CHECK_EQ(getDataReal[i], getDataExpect[i]);
-    }
-    LOG(INFO) << "sendData1 get all finish";
-  };
-
-  auto put2 = [&]() {
-    LOG(INFO) << "putOwnData2 start";
-    client2.putOwnData(1, type, testData2, size);
-    LOG(INFO) << "putOwnData2 finish";
-  };
-
-  auto put3 = [&]() {
-    LOG(INFO) << "putOwnData3 start";
-    client3.putOwnData(2, type, testData3, size);
-    LOG(INFO) << "putOwnData3 finish";
-  };
-
-  worker1.addJob(put1);
-  worker1.addJob(get1);
-  worker2.addJob(put2);
-  worker3.addJob(put3);
-
-  worker1.addJob(put1);
-  worker2.addJob(put2);
-  worker3.addJob(put3);
-  worker1.addJob(get1);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-  free(testData1);
-  free(testData2);
-  free(testData3);
-  free(getDataExpect);
-  free(getDataReal);
-}
-
-void ParameterServer2Tester::operationTest() {
-  PServerVector v1, v2;
-  v1 = client_.createVector();
-  EXPECT_EQ(NUM_PARAMETER_TYPES, v1.handle);
-
-  v2 = client_.createVector();
-  EXPECT_EQ(NUM_PARAMETER_TYPES + 1, v2.handle);
-
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_RESET, v1, (real)1);
-  ops.addOperation(PSERVER_OP_RESET, v2, (real)2);
-
-  real res1, res2, res3;
-  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res1);
-
-  ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1);
-  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res2);
-
-  ops.addOperation(PSERVER_OP_au_bv, v1, v2, (real)-1, (real)1);
-  ops.addOperation(PSERVER_OP_utv, v1, v2)(&res3);
-  client_.doOperation(ops, false, false);
-
-  EXPECT_EQ(30000, res1);
-  EXPECT_EQ(15000, res2);
-  EXPECT_EQ(0, res3);
-
-  PServerMatrix m1, m2;
-  m1 = client_.createMatrix(4);
-  EXPECT_EQ(0, m1.handle);
-  m2 = client_.createMatrix(8);
-  EXPECT_EQ(1, m2.handle);
-
-  // TODO(yuyang18): add tests for other operations OP_COPY, OP_au
-
-  client_.releaseVector(v1);
-  client_.releaseVector(v2);
-  client_.releaseMatrix(m1);
-  client_.releaseMatrix(m2);
-}
-
-void ParameterServer2Tester::checkSegments(const BlockSegments& expected,
-                                           const BlockSegments& segs) {
-  EXPECT_EQ(expected.size(), segs.size());
-  if (expected.size() != segs.size()) {
-    return;
-  }
-  for (size_t i = 0; i < expected.size(); ++i) {
-    EXPECT_EQ(expected[i], segs[i]);
-  }
-}
-
-void ParameterServer2Tester::mergeBlockSegmentTest() {
-  {
-    BlockSegments segs{{10, 20}, {30, 45}, {50, 70}};
-    mergeSegments(&segs);
-    checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 20}};
-    mergeSegments(&segs);
-    checkSegments({{10, 20}, {30, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 30}};
-    mergeSegments(&segs);
-    checkSegments({{10, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {10, 70}, {10, 30}};
-    mergeSegments(&segs);
-    checkSegments({{10, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 35}};
-    mergeSegments(&segs);
-    checkSegments({{10, 45}, {50, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {10, 60}};
-    mergeSegments(&segs);
-    checkSegments({{10, 70}}, segs);
-  }
-  {
-    BlockSegments segs{{30, 45}, {50, 70}, {30, 47}};
-    mergeSegments(&segs);
-    checkSegments({{30, 47}, {50, 70}}, segs);
-  }
-}
-
-void ParameterServer2Tester::waitPassFinishTest() {
-  ParameterClient2 client1;
-  ParameterClient2 client2;
-  ParameterClient2 client3;
-
-  ThreadWorker worker1;
-  ThreadWorker worker2;
-  ThreadWorker worker3;
-
-  auto init1 = [&]() {
-    LOG(INFO) << "init1 start";
-    client1.init(parameters_);
-    LOG(INFO) << "init1 finish";
-  };
-
-  auto init2 = [&]() {
-    LOG(INFO) << "init2 start";
-    client2.init(parameters_);
-    LOG(INFO) << "init2 finish";
-  };
-
-  auto init3 = [&]() {
-    LOG(INFO) << "init3 start";
-    client3.init(parameters_);
-    LOG(INFO) << "init3 finish";
-  };
-
-  auto update1 = [&]() {
-    LOG(INFO) << "update1 start";
-    client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update1 finish";
-  };
-
-  auto wait1 = [&]() {
-    LOG(INFO) << "wait1 start";
-    client1.waitPassFinish();
-    LOG(INFO) << "wait1 finish";
-  };
-
-  auto update2 = [&]() {
-    LOG(INFO) << "update2 start";
-    client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update2 finish";
-  };
-
-  auto wait2 = [&]() {
-    LOG(INFO) << "wait2 start";
-    client2.waitPassFinish();
-    LOG(INFO) << "wait2 finish";
-  };
-
-  auto op3 = [&]() {
-    LOG(INFO) << "op3 start";
-    PreparedOperations ops;
-    ops.addOperation(PSERVER_OP_SGD);
-    client3.doOperation(ops,
-                        /* waitForGradient= */ true,
-                        /* sendBackarameter= */ true);
-    LOG(INFO) << "op3 finish";
-  };
-
-  worker1.addJob(init1);
-  worker2.addJob(init2);
-  worker3.addJob(init3);
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-  worker3.addJob(op3);
-
-  worker3.addJob(op3);
-  worker3.addJob(op3);
-  worker2.addJob(update2);
-  worker2.addJob(update2);
-  worker1.addJob(wait1);
-
-  worker2.addJob(wait2);
-  worker3.addJob(op3);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-
-  LOG(INFO) << "Pass 1 finished";
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-  worker3.addJob(op3);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-
-  worker3.addJob(op3);
-  worker3.addJob(op3);
-  worker1.addJob(update1);
-  worker1.addJob(wait1);
-  worker2.addJob(wait2);
-
-  worker1.wait();
-  worker2.wait();
-  worker3.wait();
-
-  LOG(INFO) << "Pass 2 finished";
-}
-
-void ParameterServer2Tester::synchronizeTest() {
-  ParameterClient2 client1;
-  ParameterClient2 client2;
-
-  ThreadWorker worker1;
-  ThreadWorker worker2;
-
-  FLAGS_log_period_server = 2;
-
-  auto init1 = [&]() {
-    LOG(INFO) << "init1 start";
-    client1.init(parameters_);
-    client1.setTrainerId(0);
-    LOG(INFO) << "init1 finish";
-  };
-
-  auto init2 = [&]() {
-    LOG(INFO) << "init2 start";
-    client2.init(parameters_);
-    client2.setTrainerId(1);
-    LOG(INFO) << "init2 finish";
-  };
-
-  auto update1 = [&]() {
-    LOG(INFO) << "update1 start";
-    client1.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update1 finish";
-  };
-
-  auto wait1 = [&]() {
-    LOG(INFO) << "wait1 start";
-    client1.asyncFinishPass();
-    LOG(INFO) << "wait1 finish";
-  };
-
-  auto update2 = [&]() {
-    LOG(INFO) << "update2 start";
-    client2.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ASYNC_SGD,
-                                    PARAMETER_VALUE,
-                                    0,      // numSamples = 0
-                                    0,      // cost = 0
-                                    true);  // sendBackParameter = false
-    LOG(INFO) << "update2 finish";
-  };
-
-  auto wait2 = [&]() {
-    LOG(INFO) << "wait2 start";
-    client2.asyncFinishPass();
-    LOG(INFO) << "wait2 finish";
-  };
-
-  worker1.addJob(init1);
-  worker2.addJob(init2);
-  // call wait to reset some stats at pserver
-  worker1.addJob(wait1);
-  worker2.addJob(wait2);
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-
-  worker2.addJob(update2);
-  worker2.addJob(update2);
-  worker1.addJob(wait1);
-
-  worker2.addJob(wait2);
-
-  worker1.wait();
-  worker2.wait();
-  LOG(INFO) << "Pass 1 finished";
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-
-  worker1.wait();
-  worker2.wait();
-
-  worker1.addJob(update1);
-  worker2.addJob(update2);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(update1);
-  worker1.addJob(wait1);
-  worker2.addJob(wait2);
-
-  worker1.wait();
-  worker2.wait();
-  LOG(INFO) << "Pass 2 finished";
-}
-
-TEST(ParameterServer2, sendParameter) { g_server->sendParameterTest(); }
-
-TEST(ParameterServer2, setConfig) { g_server->setConfigTest(); }
-
-TEST(ParameterServer2, setStatus) { g_server->setStatusTest(); }
-
-TEST(ParameterServer2, operation) { g_server->operationTest(); }
-
-TEST(ParameterServer2, mergeBlockSegment) { g_server->mergeBlockSegmentTest(); }
-
-TEST(ParameterServer2, waitPassFinish) { g_server->waitPassFinishTest(); }
-
-TEST(ParameterServer2, synchronize) { g_server->synchronizeTest(); }
-
-TEST(ParameterServer2, sendData) {
-  // Set gserver and pserver all 3, so that the test is sufficient.
-  int oldFlagsPortsNUm = FLAGS_ports_num;
-  int oldFlagsNumGradientServers = FLAGS_num_gradient_servers;
-  int oldFlagsPort = FLAGS_port;
-  FLAGS_ports_num = 3;
-  FLAGS_num_gradient_servers = 3;
-  FLAGS_port = FLAGS_port + 1;
-  std::unique_ptr<ParameterServer2Tester> g_server1;
-  std::unique_ptr<ParameterServer2Tester> g_server2;
-  std::unique_ptr<ParameterServer2Tester> g_server3;
-  if (FLAGS_rdma_tcp == "rdma") {
-    g_server1.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
-    g_server1->start();
-    g_server2.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1));
-    g_server2->start();
-    g_server3.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port + 2, FLAGS_server_cpu + 2));
-    g_server3->start();
-  } else {  // tcp
-    g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port));
-    g_server1->start();
-    g_server2.reset(
-        new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 1));
-    g_server2->start();
-    g_server3.reset(
-        new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port + 2));
-    g_server3->start();
-  }
-
-  g_server2->init();
-  g_server3->init();
-  sleep(2);
-  g_server1->setup();
-  g_server1->sendDataTest(DATA_REDUCE_SUM, 1 << 24);
-  sleep(2);
-  g_server1->sendDataTest(DATA_REDUCE_SUM, 2);
-  sleep(2);
-  g_server1.reset();
-  g_server2.reset();
-  g_server3.reset();
-
-  FLAGS_ports_num = oldFlagsPortsNUm;
-  FLAGS_num_gradient_servers = oldFlagsNumGradientServers;
-  FLAGS_port = oldFlagsPort;
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-
-  FLAGS_num_gradient_servers = 2;
-
-  if (FLAGS_rdma_tcp == "rdma") {
-    g_server.reset(new ParameterServer2Tester(
-        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
-  } else {
-    g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port));
-  }
-
-  g_server->start();
-
-  sleep(2);
-
-  int ret = RUN_ALL_TESTS();
-
-  g_server.reset();
-
-  exit(ret);
-}
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
deleted file mode 100644
index d68a8d2180cc3081346106132799498f6dc3fa20..0000000000000000000000000000000000000000
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <memory>
-#include "ParameterService.pb.h"
-#include "paddle/math/Vector.h"
-#include "paddle/pserver/ProtoServer.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-DEFINE_string(server_addr, "127.0.0.1", "Server address");
-DEFINE_int64(dim, 50000000, "Data size");
-DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
-DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
-
-using namespace paddle;  // NOLINT
-
-class MyServer : public ProtoServer {
-public:
-  explicit MyServer(int port, int rdmaCpu = -1)
-      : ProtoServer(FLAGS_server_addr, port, rdmaCpu),
-        status_(PSERVER_STATUS_NOT_SET) {
-    REGISTER_SERVICE_FUNCTION(MyServer, getStatus);
-    REGISTER_SERVICE_FUNCTION(MyServer, setStatus);
-    REGISTER_SERVICE_FUNCTION_EX(MyServer, getStatusEx);
-  }
-  void getStatus(const GetStatusRequest& request,
-                 ProtoResponseCallback callback) {
-    (void)request;
-    GetStatusResponse response;
-    response.set_status(status_);
-    callback(response);
-  }
-
-  void getStatusEx(const GetStatusRequest& request,
-                   std::unique_ptr<MsgReader> msgReader,
-                   ProtoResponseCallbackEx callback) {
-    (void)request;
-    GetStatusResponse response;
-    response.set_status(status_);
-    buffer_.resize(msgReader->getNextBlockLength());
-    msgReader->readNextBlock(&buffer_[0]);
-    callback(response, {{&buffer_[0], buffer_.size()}});
-  }
-
-  void setStatus(const SetStatusRequest& request,
-                 ProtoResponseCallback callback) {
-    SetStatusResponse response;
-    status_ = request.status();
-    callback(response);
-  }
-
-protected:
-  PServerStatus status_;
-  std::string buffer_;
-};
-
-TEST(ProtoServer, regular) {
-  ProtoClient* client;
-  if (FLAGS_rdma_tcp == "rdma")
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
-  else
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP);
-  {
-    GetStatusRequest request;
-    GetStatusResponse response;
-    auto msgReader = client->sendAndRecv("getStatus", request, &response);
-    EXPECT_EQ(response.status(), PSERVER_STATUS_NOT_SET);
-    EXPECT_EQ(msgReader->getNumBlocks(), (size_t)0);
-  }
-
-  {
-    SetStatusRequest request;
-    SetStatusResponse response;
-    request.set_status(PSERVER_STATUS_PARAMETER_READY);
-    client->sendAndRecv("setStatus", request, &response);
-  }
-
-  {
-    GetStatusRequest request;
-    GetStatusResponse response;
-    client->sendAndRecv("getStatus", request, &response);
-    EXPECT_EQ(response.status(), PSERVER_STATUS_PARAMETER_READY);
-  }
-
-  delete client;
-}
-
-TEST(ProtoServer, extended) {
-#ifdef PADDLE_WITH_CUDA
-  ProtoClient* client;
-  if (FLAGS_rdma_tcp == "rdma")
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
-  else
-    client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_TCP);
-  int64_t dataSize = FLAGS_dim * sizeof(real);
-
-  GpuVector gpuParam(FLAGS_dim);
-  GpuVector gpuGrad(FLAGS_dim);
-  CpuVector cpuParam(FLAGS_dim);
-  CpuVector cpuGrad(FLAGS_dim);
-
-  gpuParam.rand();
-  gpuGrad.rand();
-  cpuParam.rand();
-  cpuGrad.rand();
-
-  for (int k = 0; k < 4; ++k) {
-    for (int i = 0; i < 10; ++i) {
-      cpuGrad.copyFrom(gpuGrad);
-      if (FLAGS_test_proto_server) {
-        GetStatusRequest request;
-        GetStatusResponse response;
-        {
-          REGISTER_TIMER("sendAndRecv");
-          auto msgReader =
-              client->sendAndRecv("getStatusEx",
-                                  request,
-                                  {{cpuGrad.getData(), (size_t)dataSize}},
-                                  &response);
-
-          EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1);
-          EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize);
-          msgReader->readNextBlock(cpuParam.getData());
-        }
-        if (!FLAGS_benchmark) {
-          real* v1 = cpuGrad.getData();
-          real* v2 = cpuParam.getData();
-          real sum1 = 0, sum2 = 0;
-          for (int j = 0; j < FLAGS_dim; ++j) {
-            sum1 += v1[j];
-            sum2 += v2[j];
-          }
-          EXPECT_EQ(sum1, sum2);
-        }
-      }
-      gpuParam.copyFrom(cpuParam);
-
-      LOG_EVERY_N(INFO, 10) << "i=" << i;
-    }
-    globalStat.printAllStatus();
-    globalStat.reset();
-  }
-
-  delete client;
-#endif
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1);
-  server.start();
-  usleep(10000);
-
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/pserver/test/test_ProtoServer.sh b/paddle/pserver/test/test_ProtoServer.sh
deleted file mode 100755
index 970c90b494c2a256cf22f3de7b7ea7964fed58ab..0000000000000000000000000000000000000000
--- a/paddle/pserver/test/test_ProtoServer.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -x
-for ((port=12340;port<=12360;port++))
-do
-    port_used_num=`netstat -a |grep $port|wc -l`
-    if [ $port_used_num -eq 0 ]
-    then
-        echo $port;
-        pserver/test/test_ProtoServer --port=$port 
-        if [ $? -eq 0 ]
-           then
-               exit 0
-           else
-               echo "test_ProtoServer run wrong"
-       	       exit 1
-        fi
-fi
-done
-echo "test_ProtoServer port not found"
-exit 1
diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9e8b135c1bc7fc05d88fe6f3bed17dd3b48e9615
--- /dev/null
+++ b/paddle/scripts/README.md
@@ -0,0 +1,188 @@
+# Building PaddlePaddle
+
+## Goals
+
+We want to make the building procedures:
+
+1. Static, can reproduce easily.
+1. Generate python `whl` packages that can be widely use cross many distributions.
+1. Build different binaries per release to satisfy different environments:
+    - Binaries for different CUDA and CUDNN versions, like CUDA 7.5, 8.0, 9.0
+    - Binaries containing only capi
+    - Binaries for python with wide unicode support or not.
+1. Build docker images with PaddlePaddle pre-installed, so that we can run
+PaddlePaddle applications directly in docker or on Kubernetes clusters.
+
+To achieve this, we maintain a dockerhub repo:https://hub.docker.com/r/paddlepaddle/paddle
+which provides pre-built environment images to build PaddlePaddle and generate corresponding `whl`
+binaries.(**We strongly recommend building paddlepaddle in our pre-specified Docker environment.**) 
+
+## Development Workflow
+
+Here we describe how the workflow goes on.  We start from considering our daily development environment.
+
+Developers work on a computer, which is usually a laptop or desktop:
+
+<img src="doc/paddle-development-environment.png" width=500 />
+
+or, they might rely on a more sophisticated box (like with GPUs):
+
+<img src="doc/paddle-development-environment-gpu.png" width=500 />
+
+A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
+
+## Build With Docker
+
+### Build Environments
+
+The lastest pre-built build environment images are:
+
+| Image | Tag |
+| ----- | --- |
+| paddlepaddle/paddle | latest-dev |
+| paddlepaddle/paddle | latest-dev-android |
+
+### Start Build
+
+```bash
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+./paddle/scripts/paddle_docker_build.sh build
+```
+
+After the build finishes, you can get output `whl` package under
+`build/python/dist`.
+
+This command will download the most recent dev image from docker hub, start a container in the backend and then run the build script `/paddle/paddle/scripts/paddle_build.sh build` in the container. 
+The container mounts the source directory on the host into `/paddle`. 
+When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
+
+### Build Options
+
+Users can specify the following Docker build arguments with either "ON" or "OFF" value:
+
+| Option | Default | Description |
+| ------ | -------- | ----------- |
+| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
+| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
+| `WITH_TESTING` | OFF | Build unit tests binaries. |
+| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
+| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
+| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
+| `WITH_C_API` | OFF | Build capi libraries for inference. |
+| `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
+| `WITH_STYLE_CHECK` | ON | Check the code style when building. |
+| `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
+| `RUN_TEST` | OFF | Run unit test immediently after the build. |
+| `WITH_DOC` | OFF | Build docs after build binaries. |
+| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
+
+## Docker Images
+
+You can get the latest PaddlePaddle docker images by
+`docker pull paddlepaddle/paddle:<version>` or build one by yourself.
+
+### Official Docker Releases
+
+Official docker images at
+[here](https://hub.docker.com/r/paddlepaddle/paddle/tags/),
+you can choose either latest or images with a release tag like `0.10.0`,
+Currently available tags are:
+
+|   Tag  | Description |
+| ------ | --------------------- |
+| latest | latest CPU only image |
+| latest-gpu | latest binary with GPU support |
+| 0.10.0 | release 0.10.0 CPU only binary image |
+| 0.10.0-gpu | release 0.10.0 with GPU support |
+
+### Build Your Own Image
+
+Build PaddlePaddle docker images are quite simple since PaddlePaddle can
+be installed by just running `pip install`. A sample `Dockerfile` is:
+
+```dockerfile
+FROM nvidia/cuda:7.5-cudnn5-runtime-centos6
+RUN yum install -y centos-release-SCL
+RUN yum install -y python27
+# This whl package is generated by previous build steps.
+ADD python/dist/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl /
+RUN pip install /paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl && rm -f /*.whl
+```
+
+Then build the image by running `docker build -t [REPO]/paddle:[TAG] .` under
+the directory containing your own `Dockerfile`.
+
+- NOTE: note that you can choose different base images for your environment, you can find all the versions [here](https://hub.docker.com/r/nvidia/cuda/).
+
+### Use Docker Images
+
+Suppose that you have written an application program `train.py` using
+PaddlePaddle, we can test and run it using docker:
+
+```bash
+docker run --rm -it -v $PWD:/work paddlepaddle/paddle /work/a.py
+```
+
+But this works only if all dependencies of `train.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs.
+
+### Run PaddlePaddle Book In Docker
+
+Our [book repo](https://github.com/paddlepaddle/book) also provide a docker
+image to start a jupiter notebook inside docker so that you can run this book
+using docker:
+
+```bash
+docker run -d -p 8888:8888 paddlepaddle/book
+```
+
+Please refer to https://github.com/paddlepaddle/book if you want to build this
+docker image by your self.
+
+### Run Distributed Applications
+
+In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API need to build a PaddlePaddle application into a Docker image as above and calls kubectl to run it on the cluster.  This API might need to generate a Dockerfile look like above and call `docker build`.
+
+Of course, we can manually build an application image and launch the job using the kubectl tool:
+
+```bash
+docker build -f some/Dockerfile -t myapp .
+docker tag myapp me/myapp
+docker push
+kubectl ...
+```
+
+### Reading source code with woboq codebrowser
+
+For developers who are interested in the C++ source code, you can build C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
+
+- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
+
+```bash
+./paddle/scripts/paddle_docker_build.sh html
+```
+
+- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
+
+```
+docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
+```
+
+## More Options
+
+### Build Without Docker
+
+Follow the *Dockerfile* in the paddlepaddle repo to set up your local dev environment and run:
+
+```bash
+./paddle/scripts/paddle_build.sh build
+```
+
+### Additional Tasks
+
+You can get the help menu for the build scripts by running with no options:
+
+```bash
+./paddle/scripts/paddle_build.sh
+or ./paddle/scripts/paddle_docker_build.sh
+```
diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle b/paddle/scripts/doc/paddle-development-environment-gpu.graffle
similarity index 100%
rename from paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle
rename to paddle/scripts/doc/paddle-development-environment-gpu.graffle
diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.png b/paddle/scripts/doc/paddle-development-environment-gpu.png
similarity index 100%
rename from paddle/scripts/docker/doc/paddle-development-environment-gpu.png
rename to paddle/scripts/doc/paddle-development-environment-gpu.png
diff --git a/paddle/scripts/docker/doc/paddle-development-environment.graffle b/paddle/scripts/doc/paddle-development-environment.graffle
similarity index 100%
rename from paddle/scripts/docker/doc/paddle-development-environment.graffle
rename to paddle/scripts/doc/paddle-development-environment.graffle
diff --git a/paddle/scripts/docker/doc/paddle-development-environment.png b/paddle/scripts/doc/paddle-development-environment.png
similarity index 100%
rename from paddle/scripts/docker/doc/paddle-development-environment.png
rename to paddle/scripts/doc/paddle-development-environment.png
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
deleted file mode 100644
index 78c0cc378231f763597556cc5450f6f03ab2b291..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/README.md
+++ /dev/null
@@ -1,202 +0,0 @@
-# Building PaddlePaddle
-
-## Goals
-
-We want to make the building procedures:
-
-1. Static, can reproduce easily.
-1. Generate python `whl` packages that can be widely use cross many distributions.
-1. Build different binaries per release to satisfy different environments:
-    - Binaries for different CUDA and CUDNN versions, like CUDA 7.5, 8.0, 9.0
-    - Binaries containing only capi
-    - Binaries for python with wide unicode support or not.
-1. Build docker images with PaddlePaddle pre-installed, so that we can run
-PaddlePaddle applications directly in docker or on Kubernetes clusters.
-
-To achieve this, we created a repo: https://github.com/PaddlePaddle/buildtools
-which gives several docker images that are `manylinux1` sufficient. Then we
-can build PaddlePaddle using these images to generate corresponding `whl`
-binaries.
-
-## Run The Build
-
-### Build Environments
-
-The pre-built build environment images are:
-
-| Image | Tag |
-| ----- | --- |
-| paddlepaddle/paddle_manylinux_devel | cuda7.5_cudnn5 |
-| paddlepaddle/paddle_manylinux_devel | cuda8.0_cudnn5 |
-| paddlepaddle/paddle_manylinux_devel | cuda7.5_cudnn7 |
-| paddlepaddle/paddle_manylinux_devel | cuda9.0_cudnn7 |
-
-### Start Build
-
-Choose one docker image that suit your environment and run the following
-command to start a build:
-
-```bash
-git clone https://github.com/PaddlePaddle/Paddle.git
-cd Paddle
-docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=OFF" -e "RUN_TEST=OFF" -e "PYTHON_ABI=cp27-cp27mu" paddlepaddle/paddle_manylinux_devel /paddle/paddle/scripts/docker/build.sh
-```
-
-After the build finishes, you can get output `whl` package under
-`build/python/dist`.
-
-This command mounts the source directory on the host into `/paddle` in the container, then run the build script `/paddle/paddle/scripts/docker/build.sh`
-in the container. When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
-
-### Build Options
-
-Users can specify the following Docker build arguments with either "ON" or "OFF" value:
-
-| Option | Default | Description |
-| ------ | -------- | ----------- |
-| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
-| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
-| `WITH_TESTING` | OFF | Build unit tests binaries. |
-| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
-| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
-| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
-| `WITH_C_API` | OFF | Build capi libraries for inference. |
-| `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
-| `WITH_STYLE_CHECK` | ON | Check the code style when building. |
-| `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
-| `RUN_TEST` | OFF | Run unit test immediently after the build. |
-| `WITH_DOC` | OFF | Build docs after build binaries. |
-| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
-
-
-## Docker Images
-
-You can get the latest PaddlePaddle docker images by
-`docker pull paddlepaddle/paddle:<version>` or build one by yourself.
-
-### Official Docker Releases
-
-Official docker images at
-[here](https://hub.docker.com/r/paddlepaddle/paddle/tags/),
-you can choose either latest or images with a release tag like `0.10.0`,
-Currently available tags are:
-
-|   Tag  | Description |
-| ------ | --------------------- |
-| latest | latest CPU only image |
-| latest-gpu | latest binary with GPU support |
-| 0.10.0 | release 0.10.0 CPU only binary image |
-| 0.10.0-gpu | release 0.10.0 with GPU support |
-
-### Build Your Own Image
-
-Build PaddlePaddle docker images are quite simple since PaddlePaddle can
-be installed by just running `pip install`. A sample `Dockerfile` is:
-
-```dockerfile
-FROM nvidia/cuda:7.5-cudnn5-runtime-centos6
-RUN yum install -y centos-release-SCL
-RUN yum install -y python27
-# This whl package is generated by previous build steps.
-ADD python/dist/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl /
-RUN pip install /paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl && rm -f /*.whl
-```
-
-Then build the image by running `docker build -t [REPO]/paddle:[TAG] .` under
-the directory containing your own `Dockerfile`.
-
-- NOTE: note that you can choose different base images for your environment, you can find all the versions [here](https://hub.docker.com/r/nvidia/cuda/).
-
-### Use Docker Images
-
-Suppose that you have written an application program `train.py` using
-PaddlePaddle, we can test and run it using docker:
-
-```bash
-docker run --rm -it -v $PWD:/work paddlepaddle/paddle /work/a.py
-```
-
-But this works only if all dependencies of `train.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs.
-
-### Run PaddlePaddle Book In Docker
-
-Our [book repo](https://github.com/paddlepaddle/book) also provide a docker
-image to start a jupiter notebook inside docker so that you can run this book
-using docker:
-
-```bash
-docker run -d -p 8888:8888 paddlepaddle/book
-```
-
-Please refer to https://github.com/paddlepaddle/book if you want to build this
-docker image by your self.
-
-### Run Distributed Applications
-
-In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API need to build a PaddlePaddle application into a Docker image as above and calls kubectl to run it on the cluster.  This API might need to generate a Dockerfile look like above and call `docker build`.
-
-Of course, we can manually build an application image and launch the job using the kubectl tool:
-
-```bash
-docker build -f some/Dockerfile -t myapp .
-docker tag myapp me/myapp
-docker push
-kubectl ...
-```
-
-## Docker Images for Developers
-
-We have a special docker image for developers:
-`paddlepaddle/paddle:<version>-dev`. This image is also generated from
-https://github.com/PaddlePaddle/buildtools
-
-This a development image contains only the
-development tools and standardizes the building procedure.  Users include:
-
-- developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer).
-- release engineers -- use this to build the official release from certain branch/tag on Github.com.
-- document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code.  We need tools to extract the information, typeset, and generate Web pages.
-
-Of course, developers can install building tools on their development computers.  But different versions of PaddlePaddle might require different set or version of building tools.  Also, it makes collaborative debugging easier if all developers use a unified development environment.
-
-The development image contains the following tools:
-
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-
-Many developers work on a remote computer with GPU; they could ssh into the computer and  `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly.
-
-
-### Development Workflow
-
-Here we describe how the workflow goes on.  We start from considering our daily development environment.
-
-Developers work on a computer, which is usually a laptop or desktop:
-
-<img src="doc/paddle-development-environment.png" width=500 />
-
-or, they might rely on a more sophisticated box (like with GPUs):
-
-<img src="doc/paddle-development-environment-gpu.png" width=500 />
-
-A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
-
-### Reading source code with woboq codebrowser
-
-For developers who are interested in the C++ source code, please use -e "WOBOQ=ON" to enable the building of C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
-
-- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
-
-```bash
-docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev
-```
-
-- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
-
-```
-docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
-```
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
deleted file mode 100644
index 6be2bd8fad9e33cf4e1dcafdd6b8f39111bdbe88..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/build.sh
+++ /dev/null
@@ -1,238 +0,0 @@
-#!/bin/bash
-
-function cmake_gen() {
-    mkdir -p /paddle/build
-    cd /paddle/build
-
-    # build script will not fail if *.deb does not exist
-    rm *.deb 2>/dev/null || true
-    # delete previous built whl packages
-    rm -rf /paddle/paddle/dist 2>/dev/null || true
-
-    # Support build for all python versions, currently
-    # including cp27-cp27m and cp27-cp27mu.
-    PYTHON_FLAGS=""
-    if [ "$1" != "" ]; then
-        echo "using python abi: $1"
-        if [ "$1" == "cp27-cp27m" ]; then
-            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
-            export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
-            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
-        elif [ "$1" == "cp27-cp27mu" ]; then
-            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
-            export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
-            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
-        fi
-    fi
-
-    cat <<EOF
-    ========================================
-    Configuring cmake in /paddle/build ...
-        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
-        ${PYTHON_FLAGS}
-        -DWITH_DSO=ON
-        -DWITH_DOC=OFF
-        -DWITH_GPU=${WITH_GPU:-OFF}
-        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-        -DWITH_MKL=${WITH_MKL:-ON}
-        -DWITH_AVX=${WITH_AVX:-OFF}
-        -DWITH_GOLANG=${WITH_GOLANG:-OFF}
-        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
-        -DWITH_SWIG_PY=ON
-        -DWITH_C_API=${WITH_C_API:-OFF}
-        -DWITH_PYTHON=${WITH_PYTHON:-ON}
-        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-        -DCUDNN_ROOT=/usr/
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-        -DWITH_TESTING=${WITH_TESTING:-ON}
-        -DWITH_FAST_BUNDLE_TEST=ON
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-    ========================================
-EOF
-    # Disable UNITTEST_USE_VIRTUALENV in docker because
-    # docker environment is fully controlled by this script.
-    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
-    cmake .. \
-        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
-        ${PYTHON_FLAGS} \
-        -DWITH_DSO=ON \
-        -DWITH_DOC=OFF \
-        -DWITH_GPU=${WITH_GPU:-OFF} \
-        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-        -DWITH_MKL=${WITH_MKL:-ON} \
-        -DWITH_AVX=${WITH_AVX:-OFF} \
-        -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
-        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
-        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
-        -DWITH_C_API=${WITH_C_API:-OFF} \
-        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
-        -DCUDNN_ROOT=/usr/ \
-        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-        -DWITH_TESTING=${WITH_TESTING:-ON} \
-        -DWITH_FAST_BUNDLE_TEST=ON \
-        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-}
-
-function run_build() {
-    cat <<EOF
-    ============================================
-    Building in /paddle/build ...
-    ============================================
-EOF
-    make clean
-    make -j `nproc`
-}
-
-function run_test() {
-    if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-    cat <<EOF
-    ========================================
-    Running unit tests ...
-    ========================================
-EOF
-        ctest --output-on-failure
-        # make install should also be test when unittest
-        make install -j `nproc`
-        pip install /usr/local/opt/paddle/share/wheels/*.whl
-        paddle version
-    fi
-}
-
-
-function gen_docs() {
-    if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
-        cat <<EOF
-    ========================================
-    Building documentation ...
-    In /paddle/build_doc
-    ========================================
-EOF
-        mkdir -p /paddle/build_doc
-        pushd /paddle/build_doc
-        cmake .. \
-            -DWITH_DOC=ON \
-            -DWITH_GPU=OFF \
-            -DWITH_AVX=${WITH_AVX:-ON} \
-            -DWITH_SWIG_PY=ON \
-            -DWITH_STYLE_CHECK=OFF
-        make -j `nproc` gen_proto_py framework_py_proto
-        make -j `nproc` copy_paddle_pybind
-        make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
-        popd
-    fi
-
-
-    if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
-        cat <<EOF
-    ========================================
-    Converting C++ source code into HTML ...
-    ========================================
-EOF
-        export WOBOQ_OUT=/paddle/build/woboq_out
-        mkdir -p $WOBOQ_OUT
-        cp -rv /woboq/data $WOBOQ_OUT/../data
-        /woboq/generator/codebrowser_generator \
-            -b /paddle/build \
-            -a \
-            -o $WOBOQ_OUT \
-            -p paddle:/paddle
-        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-    fi
-}
-
-
-function gen_dockerfile() {
-    # Set BASE_IMAGE according to env variables
-    if [[ ${WITH_GPU} == "ON" ]]; then
-    BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
-    else
-    BASE_IMAGE="ubuntu:16.04"
-    fi
-
-    DOCKERFILE_GPU_ENV=""
-    DOCKERFILE_CUDNN_DSO=""
-    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
-        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
-    fi
-
-    cat <<EOF
-    ========================================
-    Generate /paddle/build/Dockerfile ...
-    ========================================
-EOF
-
-    cat > /paddle/build/Dockerfile <<EOF
-    FROM ${BASE_IMAGE}
-    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-    ENV HOME /root
-EOF
-
-    if [[ ${WITH_GPU} == "ON"  ]]; then
-        NCCL_DEPS="apt-get install -y libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 &&"
-    else
-        NCCL_DEPS=""
-    fi
-
-    cat >> /paddle/build/Dockerfile <<EOF
-    ADD python/dist/*.whl /
-    # run paddle version to install python packages first
-    RUN apt-get update &&\
-        ${NCCL_DEPS}\
-        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \
-        pip install /*.whl; apt-get install -f -y && \
-        apt-get clean -y && \
-        rm -f /*.whl && \
-        paddle version && \
-        ldconfig
-    ${DOCKERFILE_CUDNN_DSO}
-    ${DOCKERFILE_GPU_ENV}
-    ENV NCCL_LAUNCH_MODE PARALLEL
-    ADD go/cmd/pserver/pserver /usr/bin/
-    ADD go/cmd/master/master /usr/bin/
-    # default command shows the paddle version and exit
-    CMD ["paddle", "version"]
-EOF
-}
-
-function gen_capi_package() {
-  if [[ ${WITH_C_API} == "ON" ]]; then
-    install_prefix="/paddle/build/capi_output"
-    rm -rf $install_prefix
-    make DESTDIR="$install_prefix" install
-    cd $install_prefix/usr/local
-    ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
-  fi
-}
-
-function gen_fluid_inference_lib() {
-    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
-    cat <<EOF
-    ========================================
-    Deploying fluid inference library ...
-    ========================================
-EOF
-        make inference_lib_dist
-    fi
-}
-
-set -xe
-
-cmake_gen ${PYTHON_ABI:-""}
-run_build
-run_test
-gen_docs
-gen_dockerfile
-gen_capi_package
-gen_fluid_inference_lib
-
-if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
-  printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" 
-else
-  printf "If you need to install PaddlePaddle in develop docker image,"
-  printf "please make install or pip install build/python/dist/*.whl.\n"
-fi
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
deleted file mode 100644
index 3d5e775fafb6b94a3429dbf3368a8949bca3d612..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/build_android.sh
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/bin/bash
-
-set -xe
-
-if [ $ANDROID_ABI == "arm64-v8a" ]; then
-  ANDROID_ARCH=arm64
-  if [ $ANDROID_API -lt 21 ]; then
-    echo "Warning: arm64-v8a requires ANDROID_API >= 21."
-    ANDROID_API=21
-  fi
-else # armeabi, armeabi-v7a
-  ANDROID_ARCH=arm
-fi
-
-ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
-
-cat <<EOF
-============================================
-Generating the standalone toolchain ...
-${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
-      --arch=$ANDROID_ARCH
-      --platform=android-$ANDROID_API
-      --install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
-============================================
-EOF
-${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
-      --arch=$ANDROID_ARCH \
-      --platform=android-$ANDROID_API \
-      --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
-
-BUILD_ROOT=/paddle/build_android
-DEST_ROOT=/paddle/install_android
-
-mkdir -p $BUILD_ROOT
-cd $BUILD_ROOT
-
-if [ $ANDROID_ABI == "armeabi-v7a" ]; then
-  cmake -DCMAKE_SYSTEM_NAME=Android \
-        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-        -DANDROID_ABI=$ANDROID_ABI \
-        -DANDROID_ARM_NEON=ON \
-        -DANDROID_ARM_MODE=ON \
-        -DHOST_C_COMPILER=/usr/bin/gcc \
-        -DHOST_CXX_COMPILER=/usr/bin/g++ \
-        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=MinSizeRel \
-        -DUSE_EIGEN_FOR_BLAS=ON \
-        -DWITH_C_API=ON \
-        -DWITH_SWIG_PY=OFF \
-        -DWITH_STYLE_CHECK=OFF \
-        ..
-elif [ $ANDROID_ABI == "arm64-v8a" ]; then
-  cmake -DCMAKE_SYSTEM_NAME=Android \
-        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-        -DANDROID_ABI=$ANDROID_ABI \
-        -DANDROID_ARM_MODE=ON \
-        -DHOST_C_COMPILER=/usr/bin/gcc \
-        -DHOST_CXX_COMPILER=/usr/bin/g++ \
-        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=MinSizeRel \
-        -DUSE_EIGEN_FOR_BLAS=OFF \
-        -DWITH_C_API=ON \
-        -DWITH_SWIG_PY=OFF \
-        -DWITH_STYLE_CHECK=OFF \
-        ..
-elif [ $ANDROID_ABI == "armeabi" ]; then
-  cmake -DCMAKE_SYSTEM_NAME=Android \
-        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-        -DANDROID_ABI=$ANDROID_ABI \
-        -DANDROID_ARM_MODE=ON \
-        -DHOST_C_COMPILER=/usr/bin/gcc \
-        -DHOST_CXX_COMPILER=/usr/bin/g++ \
-        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-        -DCMAKE_BUILD_TYPE=MinSizeRel \
-        -DWITH_C_API=ON \
-        -DWITH_SWIG_PY=OFF \
-        -DWITH_STYLE_CHECK=OFF \
-        ..
-else
-  echo "Invalid ANDROID_ABI: $ANDROID_ABI"
-fi
-
-cat <<EOF
-============================================
-Building in $BUILD_ROOT ...
-============================================
-EOF
-make -j `nproc`
-make install -j `nproc`
diff --git a/paddle/scripts/docker/entrypoint b/paddle/scripts/docker/entrypoint
deleted file mode 100755
index bc194bd909aa308fd5fe920c9319f62a0ec2dac7..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/entrypoint
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-/usr/sbin/sshd -D &
-jupyter notebook --ip=0.0.0.0 /paddle/book/
diff --git a/paddle/scripts/docker/test.sh b/paddle/scripts/docker/test.sh
deleted file mode 100755
index 8180737a8f431d6eb8bab4b2ef7bdcc50cce41f3..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/test.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-set -e
-
-# the number of process to run tests
-NUM_PROC=6
-
-# calculate and set the memory usage for each process
-MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
-export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
-
-# get the CUDA device count
-CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
-
-for (( i = 0; i < $NUM_PROC; i++ )); do
-    cuda_list=()
-    for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
-        s=$[i+j]
-        n=$[s%CUDA_DEVICE_COUNT]
-        if [ $j -eq 0 ]; then
-            cuda_list=("$n")
-        else
-            cuda_list="$cuda_list,$n"
-        fi
-    done
-    echo $cuda_list
-    # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
-    # ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
-    env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
-done
-wait
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d8f0b76b7ba0fedfe411aa86f6f8a0c77a02beca
--- /dev/null
+++ b/paddle/scripts/paddle_build.sh
@@ -0,0 +1,564 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#=================================================
+#                   Utils
+#=================================================
+
+function print_usage() {
+    echo -e "\n${RED}Usage${NONE}:
+    ${BOLD}${SCRIPT_NAME}${NONE} [OPTION]"
+
+    echo -e "\n${RED}Options${NONE}:
+    ${BLUE}build${NONE}: run build for x86 platform
+    ${BLUE}build_android${NONE}: run build for android platform
+    ${BLUE}build_ios${NONE}: run build for ios platform
+    ${BLUE}test${NONE}: run all unit tests
+    ${BLUE}single_test${NONE}: run a single unit test
+    ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
+    ${BLUE}doc${NONE}: generate paddle documents
+    ${BLUE}html${NONE}: convert C++ source code into HTML
+    ${BLUE}dockerfile${NONE}: generate paddle release dockerfile
+    ${BLUE}capi${NONE}: generate paddle CAPI package
+    ${BLUE}fluid_inference_lib${NONE}: deploy fluid inference library
+    ${BLUE}check_style${NONE}: run code style check
+    ${BLUE}cicheck${NONE}: run CI tasks
+    "
+}
+
+function init() {
+    RED='\033[0;31m'
+    BLUE='\033[0;34m'
+    BOLD='\033[1m'
+    NONE='\033[0m'
+
+    PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
+    if [ -z "${SCRIPT_NAME}" ]; then
+        SCRIPT_NAME=$0
+    fi
+}
+
+function cmake_gen() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+
+    # build script will not fail if *.deb does not exist
+    rm *.deb 2>/dev/null || true
+    # delete previous built whl packages
+    rm -rf python/dist 2>/dev/null || true
+
+    # Support build for all python versions, currently
+    # including cp27-cp27m and cp27-cp27mu.
+    PYTHON_FLAGS=""
+    if [ "$1" != "" ]; then
+        echo "using python abi: $1"
+        if [ "$1" == "cp27-cp27m" ]; then
+            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+            export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
+            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
+        elif [ "$1" == "cp27-cp27mu" ]; then
+            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+            export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
+            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
+        fi
+    fi
+
+    cat <<EOF
+    ========================================
+    Configuring cmake in /paddle/build ...
+        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
+        ${PYTHON_FLAGS}
+        -DWITH_DSO=ON
+        -DWITH_DOC=${WITH_DOC:-OFF}
+        -DWITH_GPU=${WITH_GPU:-OFF}
+        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
+        -DWITH_MKL=${WITH_MKL:-ON}
+        -DWITH_AVX=${WITH_AVX:-OFF}
+        -DWITH_GOLANG=${WITH_GOLANG:-OFF}
+        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All}
+        -DWITH_C_API=${WITH_C_API:-OFF}
+        -DWITH_PYTHON=${WITH_PYTHON:-ON}
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+        -DCUDNN_ROOT=/usr/
+        -DWITH_TESTING=${WITH_TESTING:-ON}
+        -DWITH_FAST_BUNDLE_TEST=ON
+        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
+        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
+        -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
+    ========================================
+EOF
+    # Disable UNITTEST_USE_VIRTUALENV in docker because
+    # docker environment is fully controlled by this script.
+    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
+        ${PYTHON_FLAGS} \
+        -DWITH_DSO=ON \
+        -DWITH_DOC=${WITH_DOC:-OFF} \
+        -DWITH_GPU=${WITH_GPU:-OFF} \
+        -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
+        -DWITH_MKL=${WITH_MKL:-ON} \
+        -DWITH_AVX=${WITH_AVX:-OFF} \
+        -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
+        -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
+        -DWITH_C_API=${WITH_C_API:-OFF} \
+        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
+        -DCUDNN_ROOT=/usr/ \
+        -DWITH_TESTING=${WITH_TESTING:-ON} \
+        -DWITH_FAST_BUNDLE_TEST=ON \
+        -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
+        -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
+        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
+        -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON}
+}
+
+function abort(){
+    echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
+    echo "Please use pre-commit to check what is wrong." 1>&2
+    exit 1
+}
+
+function check_style() {
+    trap 'abort' 0
+    set -e
+
+    if [ -x "$(command -v gimme)" ]; then
+    	eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+    fi
+
+    # set up go environment for running gometalinter
+    mkdir -p $GOPATH/src/github.com/PaddlePaddle/
+    ln -sf ${PADDLE_ROOT} $GOPATH/src/github.com/PaddlePaddle/Paddle
+    mkdir -p ./build/go
+    cp go/glide.* build/go
+    cd build/go; glide install; cd -
+
+    export PATH=/usr/bin:$PATH
+    pre-commit install
+    clang-format --version
+
+    if ! pre-commit run -a ; then
+        git diff
+        exit 1
+    fi
+
+    trap : 0
+}
+
+#=================================================
+#              Build
+#=================================================
+
+function build() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    cat <<EOF
+    ============================================
+    Building in /paddle/build ...
+    ============================================
+EOF
+    make clean
+    make -j `nproc`
+    make install -j `nproc`
+}
+
+function build_android() {
+    if [ $ANDROID_ABI == "arm64-v8a" ]; then
+      ANDROID_ARCH=arm64
+      if [ $ANDROID_API -lt 21 ]; then
+        echo "Warning: arm64-v8a requires ANDROID_API >= 21."
+        ANDROID_API=21
+      fi
+    else # armeabi, armeabi-v7a
+      ANDROID_ARCH=arm
+    fi
+
+    ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
+
+    cat <<EOF
+    ============================================
+    Generating the standalone toolchain ...
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
+          --arch=$ANDROID_ARCH
+          --platform=android-$ANDROID_API
+          --install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
+    ============================================
+EOF
+    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
+          --arch=$ANDROID_ARCH \
+          --platform=android-$ANDROID_API \
+          --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
+
+    BUILD_ROOT=${PADDLE_ROOT}/build_android
+    DEST_ROOT=${PADDLE_ROOT}/install_android
+
+    mkdir -p $BUILD_ROOT
+    cd $BUILD_ROOT
+
+    if [ $ANDROID_ABI == "armeabi-v7a" ]; then
+      cmake -DCMAKE_SYSTEM_NAME=Android \
+            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
+            -DANDROID_ABI=$ANDROID_ABI \
+            -DANDROID_ARM_NEON=ON \
+            -DANDROID_ARM_MODE=ON \
+            -DHOST_C_COMPILER=/usr/bin/gcc \
+            -DHOST_CXX_COMPILER=/usr/bin/g++ \
+            -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
+            -DCMAKE_BUILD_TYPE=MinSizeRel \
+            -DUSE_EIGEN_FOR_BLAS=ON \
+            -DWITH_C_API=ON \
+            -DWITH_SWIG_PY=OFF \
+            ..
+    elif [ $ANDROID_ABI == "arm64-v8a" ]; then
+      cmake -DCMAKE_SYSTEM_NAME=Android \
+            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
+            -DANDROID_ABI=$ANDROID_ABI \
+            -DANDROID_ARM_MODE=ON \
+            -DHOST_C_COMPILER=/usr/bin/gcc \
+            -DHOST_CXX_COMPILER=/usr/bin/g++ \
+            -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
+            -DCMAKE_BUILD_TYPE=MinSizeRel \
+            -DUSE_EIGEN_FOR_BLAS=OFF \
+            -DWITH_C_API=ON \
+            -DWITH_SWIG_PY=OFF \
+            ..
+    elif [ $ANDROID_ABI == "armeabi" ]; then
+      cmake -DCMAKE_SYSTEM_NAME=Android \
+            -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
+            -DANDROID_ABI=$ANDROID_ABI \
+            -DANDROID_ARM_MODE=ON \
+            -DHOST_C_COMPILER=/usr/bin/gcc \
+            -DHOST_CXX_COMPILER=/usr/bin/g++ \
+            -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
+            -DCMAKE_BUILD_TYPE=MinSizeRel \
+            -DWITH_C_API=ON \
+            -DWITH_SWIG_PY=OFF \
+            ..
+    else
+      echo "Invalid ANDROID_ABI: $ANDROID_ABI"
+    fi
+
+    cat <<EOF
+    ============================================
+    Building in $BUILD_ROOT ...
+    ============================================
+EOF
+    make -j `nproc`
+    make install -j `nproc`
+}
+
+function build_ios() {
+    # Create the build directory for CMake.
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+
+    # Compile paddle binaries
+    cmake .. \
+          -DCMAKE_SYSTEM_NAME=iOS \
+          -DIOS_PLATFORM=OS \
+          -DCMAKE_OSX_ARCHITECTURES="arm64" \
+          -DWITH_C_API=ON \
+          -DUSE_EIGEN_FOR_BLAS=ON \
+          -DWITH_TESTING=OFF \
+          -DWITH_SWIG_PY=OFF \
+          -DCMAKE_BUILD_TYPE=Release
+
+    make -j 2
+}
+
+function run_test() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running unit tests ...
+    ========================================
+EOF
+        ctest --output-on-failure
+        # make install should also be test when unittest
+        make install -j `nproc`
+        pip install /usr/local/opt/paddle/share/wheels/*.whl
+        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
+            paddle version
+        fi
+    fi
+}
+
+function single_test() {
+    TEST_NAME=$1
+    if [ -z "${TEST_NAME}" ]; then
+        echo -e "${RED}Usage:${NONE}"
+        echo -e "${BOLD}${SCRIPT_NAME}${NONE} ${BLUE}single_test${NONE} [test_name]"
+        exit 1
+    fi
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running ${TEST_NAME} ...
+    ========================================
+EOF
+        ctest --output-on-failure -R ${TEST_NAME}
+    fi
+}
+
+function bind_test() {
+    # the number of process to run tests
+    NUM_PROC=6
+
+    # calculate and set the memory usage for each process
+    MEM_USAGE=$(printf "%.2f" `echo "scale=5; 1.0 / $NUM_PROC" | bc`)
+    export FLAGS_fraction_of_gpu_memory_to_use=$MEM_USAGE
+
+    # get the CUDA device count
+    CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l)
+
+    for (( i = 0; i < $NUM_PROC; i++ )); do
+        cuda_list=()
+        for (( j = 0; j < $CUDA_DEVICE_COUNT; j++ )); do
+            s=$[i+j]
+            n=$[s%CUDA_DEVICE_COUNT]
+            if [ $j -eq 0 ]; then
+                cuda_list=("$n")
+            else
+                cuda_list="$cuda_list,$n"
+            fi
+        done
+        echo $cuda_list
+        # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
+        # ctest -I https://cmake.org/cmake/help/v3.0/manual/ctest.1.html?highlight=ctest
+        env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC --output-on-failure &
+    done
+    wait
+}
+
+
+function gen_docs() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    cat <<EOF
+    ========================================
+    Building documentation ...
+    In /paddle/build
+    ========================================
+EOF
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DWITH_DOC=ON \
+        -DWITH_GPU=OFF \
+        -DWITH_MKL=OFF
+
+    make -j `nproc` paddle_docs paddle_apis
+
+    # check websites for broken links
+    linkchecker doc/v2/en/html/index.html
+    linkchecker doc/v2/cn/html/index.html
+    linkchecker doc/v2/api/en/html/index.html
+}
+
+function gen_html() {
+    cat <<EOF
+    ========================================
+    Converting C++ source code into HTML ...
+    ========================================
+EOF
+    export WOBOQ_OUT=${PADDLE_ROOT}/build/woboq_out
+    mkdir -p $WOBOQ_OUT
+    cp -rv /woboq/data $WOBOQ_OUT/../data
+    /woboq/generator/codebrowser_generator \
+    	-b ${PADDLE_ROOT}/build \
+    	-a \
+    	-o $WOBOQ_OUT \
+    	-p paddle:${PADDLE_ROOT}
+    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+}
+
+function gen_dockerfile() {
+    # Set BASE_IMAGE according to env variables
+    CUDA_MAJOR="$(echo $CUDA_VERSION | cut -d '.' -f 1).$(echo $CUDA_VERSION | cut -d '.' -f 2)"
+    CUDNN_MAJOR=$(echo $CUDNN_VERSION | cut -d '.' -f 1)
+    if [[ ${WITH_GPU} == "ON" ]]; then
+        BASE_IMAGE="nvidia/cuda:${CUDA_MAJOR}-cudnn${CUDNN_MAJOR}-runtime-ubuntu16.04"
+    else
+        BASE_IMAGE="ubuntu:16.04"
+    fi
+
+    DOCKERFILE_GPU_ENV=""
+    DOCKERFILE_CUDNN_DSO=""
+    DOCKERFILE_CUBLAS_DSO=""
+    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
+        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
+        DOCKERFILE_CUDNN_DSO="RUN ln -sf /usr/lib/x86_64-linux-gnu/libcudnn.so.${CUDNN_MAJOR} /usr/lib/x86_64-linux-gnu/libcudnn.so"
+        DOCKERFILE_CUBLAS_DSO="RUN ln -sf /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.${CUDA_MAJOR} /usr/lib/x86_64-linux-gnu/libcublas.so"
+    fi
+
+    cat <<EOF
+    ========================================
+    Generate ${PADDLE_ROOT}/build/Dockerfile ...
+    ========================================
+EOF
+
+    cat > ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    FROM ${BASE_IMAGE}
+    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+    ENV HOME /root
+EOF
+
+    if [[ ${WITH_GPU} == "ON"  ]]; then
+        NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.1.2-1+cuda${CUDA_MAJOR} libnccl-dev=2.1.2-1+cuda${CUDA_MAJOR} &&"
+    else
+        NCCL_DEPS=""
+    fi
+
+    if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
+        PADDLE_VERSION="paddle version"
+        CMD='"paddle", "version"'
+    else
+        PADDLE_VERSION="true"
+        CMD='"true"'
+    fi
+
+    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update &&\
+        ${NCCL_DEPS}\
+        apt-get install -y wget python-pip python-opencv libgtk2.0-dev dmidecode python-tk && easy_install -U pip && \
+        pip install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        ${PADDLE_VERSION} && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_CUBLAS_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ENV NCCL_LAUNCH_MODE PARALLEL
+EOF
+    if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
+        cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+        ADD go/cmd/pserver/pserver /usr/bin/
+        ADD go/cmd/master/master /usr/bin/
+EOF
+    fi
+    cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
+    # default command shows the paddle version and exit
+    CMD [${CMD}]
+EOF
+}
+
+function gen_capi_package() {
+    if [[ ${WITH_C_API} == "ON" ]]; then
+        install_prefix="${PADDLE_ROOT}/build/capi_output"
+        rm -rf $install_prefix
+        make DESTDIR="$install_prefix" install
+        cd $install_prefix/usr/local
+        ls | egrep -v "^Found.*item$" | xargs tar -cf ${PADDLE_ROOT}/build/paddle.tgz
+    fi
+}
+
+function gen_fluid_inference_lib() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
+        cat <<EOF
+    ========================================
+    Deploying fluid inference library ...
+    ========================================
+EOF
+        make -j `nproc` inference_lib_dist
+        cd ${PADDLE_ROOT}/build
+        mv fluid_install_dir fluid
+        tar -cf fluid.tgz fluid
+      fi
+}
+
+function main() {
+    set -e
+    local CMD=$1
+    init
+    case $CMD in
+      build)
+        cmake_gen ${PYTHON_ABI:-""}
+        build
+        gen_dockerfile
+        ;;
+      build_android)
+        build_android
+        ;;
+      build_ios)
+        build_ios
+        ;;
+      test)
+        run_test
+        ;;
+      single_test)
+        single_test $2
+        ;;
+      bind_test)
+        bind_test
+        ;;
+      doc)
+        gen_docs
+        ;;
+      html)
+        gen_html
+        ;;
+      dockerfile)
+        gen_dockerfile
+        ;;
+      capi)
+        cmake_gen ${PYTHON_ABI:-""}
+        build
+        gen_capi_package
+        ;;
+      fluid_inference_lib)
+        cmake_gen ${PYTHON_ABI:-""}
+        gen_fluid_inference_lib
+        ;;
+      check_style)
+        check_style
+        ;;
+      cicheck)
+        cmake_gen ${PYTHON_ABI:-""}
+        build
+        run_test
+        gen_capi_package
+        gen_fluid_inference_lib
+        ;;
+      *)
+        print_usage
+        exit 0
+        ;;
+      esac
+}
+
+main $@
diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3462deb9c2f88b6da643d6aa833449ed5f4a9b34
--- /dev/null
+++ b/paddle/scripts/paddle_docker_build.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+function start_build_docker() {
+    docker pull $IMG
+
+    apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g'
+    DOCKER_ENV=$(cat <<EOL
+        -e FLAGS_fraction_of_gpu_memory_to_use=0.15 \
+        -e CTEST_OUTPUT_ON_FAILURE=1 \
+        -e CTEST_PARALLEL_LEVEL=1 \
+        -e APT_MIRROR=${apt_mirror} \
+        -e WITH_GPU=ON \
+        -e CUDA_ARCH_NAME=Auto \
+        -e WITH_AVX=ON \
+        -e WITH_GOLANG=OFF \
+        -e WITH_TESTING=ON \
+        -e WITH_C_API=OFF \
+        -e WITH_COVERAGE=ON \
+        -e COVERALLS_UPLOAD=ON \
+        -e WITH_DEB=OFF \
+        -e CMAKE_BUILD_TYPE=RelWithDebInfo \
+        -e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \
+        -e CUDA_VISIBLE_DEVICES=0,1 \
+        -e WITH_DISTRIBUTE=ON \
+        -e WITH_FLUID_ONLY=ON \
+        -e RUN_TEST=ON
+EOL
+    )
+
+    DOCKER_CMD="nvidia-docker"
+    if ! [ -x "$(command -v ${DOCKER_CMD})" ]; then
+        DOCKER_CMD="docker"
+    fi
+    if [ ! -d "${HOME}/.ccache" ]; then
+        mkdir ${HOME}/.ccache
+    fi
+    set -ex
+    ${DOCKER_CMD} run -it \
+        ${DOCKER_ENV} \
+        -e SCRIPT_NAME=$0 \
+        -v $PADDLE_ROOT:/paddle \
+        -v ${HOME}/.ccache:/root/.ccache \
+        -w /paddle \
+        $IMG \
+        paddle/scripts/paddle_build.sh $@
+    set +x
+}
+
+function main() {
+    DOCKER_REPO="paddlepaddle/paddle"
+    VERSION="latest-dev"
+    PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
+    if [ "$1" == "build_android" ]; then
+        VERSION="latest-dev-android"
+    fi
+    IMG=${DOCKER_REPO}:${VERSION}
+    start_build_docker $@
+}
+
+main $@
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 80fa0c72af65cbdc21ba955389318a233e02657c..1283de9d957a46b848c7bb6caf9c5f49398468e2 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -153,9 +153,15 @@ if [ $? -ne 0 ]; then
     exit 1
 fi
 
-INSTALLED_VERSION=`pip freeze 2>/dev/null | grep '^paddle' | sed 's/.*==//g'`
+if [ "@WITH_GPU@" == "ON" ]; then
+    PADDLE_NAME="paddlepaddle-gpu"
+else 
+    PADDLE_NAME="paddlepaddle"
+fi
+
+INSTALLED_VERSION=`pip freeze 2>/dev/null | grep "^${PADDLE_NAME}==" | sed 's/.*==//g'`
 
-if [ -z ${INSTALLED_VERSION} ]; then
+if [ -z "${INSTALLED_VERSION}" ]; then
    INSTALLED_VERSION="0.0.0"  # not installed
 fi
 cat <<EOF | python -
diff --git a/paddle/scripts/tools/build_docs/.gitignore b/paddle/scripts/tools/build_docs/.gitignore
deleted file mode 100644
index 6ec14c8f5bc3774a81dbe87c44f458594b38f12c..0000000000000000000000000000000000000000
--- a/paddle/scripts/tools/build_docs/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-doc
-doc_cn
diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh
deleted file mode 100755
index f9bc8bf63ae9afdfca1ff660bc83e62e71f03005..0000000000000000000000000000000000000000
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-docker run --rm \
-       -v $(git rev-parse --show-toplevel):/paddle \
-       -e "WITH_GPU=OFF" \
-       -e "WITH_AVX=ON" \
-       -e "WITH_DOC=ON" \
-       -e "WOBOQ=ON" \
-       ${1:-"paddlepaddle/paddle:latest-dev"}
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
deleted file mode 100755
index c3892491725dc960375f3f2d8fdda7f39dc84d04..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/build_doc.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-set -e
-
-# Create the build directory for CMake.
-mkdir -p $TRAVIS_BUILD_DIR/build
-cd $TRAVIS_BUILD_DIR/build
-
-# Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF
-make -j `nproc` gen_proto_py framework_py_proto
-make -j `nproc` copy_paddle_pybind
-make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
-
-# check websites for broken links
-linkchecker doc/v2/en/html/index.html
-linkchecker doc/v2/cn/html/index.html
-linkchecker doc/v2/api/en/html/index.html
diff --git a/paddle/scripts/travis/build_ios.sh b/paddle/scripts/travis/build_ios.sh
deleted file mode 100755
index dee7cf7cbbcccffd727002108ae7f6b6ee2fbba8..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/build_ios.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-set -e
-
-# Create the build directory for CMake.
-mkdir -p $TRAVIS_BUILD_DIR/build_ios
-cd $TRAVIS_BUILD_DIR/build_ios
-
-# Compile paddle binaries
-cmake -DCMAKE_SYSTEM_NAME=iOS \
-      -DIOS_PLATFORM=OS \
-      -DCMAKE_OSX_ARCHITECTURES="arm64" \
-      -DWITH_C_API=ON \
-      -DUSE_EIGEN_FOR_BLAS=ON \
-      -DWITH_TESTING=OFF \
-      -DWITH_SWIG_PY=OFF \
-      -DWITH_STYLE_CHECK=OFF \
-      -DCMAKE_BUILD_TYPE=Release \
-      ..
-
-make -j 2
diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh
deleted file mode 100755
index e71d243efa2041cc0624b8273e1bfabaa03ce106..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/check_style.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-function abort(){
-    echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
-    echo "Please use pre-commit to check what is wrong." 1>&2
-    exit 1
-}
-
-trap 'abort' 0
-set -e
-
-# install glide
-curl https://glide.sh/get | bash
-eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
-
-# set up go environment for running gometalinter
-mkdir -p $GOPATH/src/github.com/PaddlePaddle/
-ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
-cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
-
-go get github.com/alecthomas/gometalinter
-gometalinter --install
-
-cd $TRAVIS_BUILD_DIR
-export PATH=/usr/bin:$PATH
-pre-commit install
-clang-format --version
-
-
-
-if ! pre-commit run -a ; then
-    git diff
-    exit 1
-fi
-
-trap : 0
diff --git a/paddle/scripts/travis/deploy_key.enc b/paddle/scripts/travis/deploy_key.enc
deleted file mode 100644
index b0aa45c5ac626c735735fd8541a43bf8b099d0a0..0000000000000000000000000000000000000000
Binary files a/paddle/scripts/travis/deploy_key.enc and /dev/null differ
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 77f84cd43bdf35ae6f54b0db2b5f720d24872878..a1f446817e0cbc1b4391398a82b0846d01bbec2c 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -6,6 +6,6 @@ if(WITH_TESTING)
   add_library(paddle_test_util STATIC TestUtil.cpp)
   add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
   if(NOT MOBILE_INFERENCE)
-    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags)
+    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init memory gtest gflags)
   endif()
 endif()
diff --git a/paddle/testing/TestMain.cpp b/paddle/testing/TestMain.cpp
index 3e14532d1878fa374a5a2241c7b8319da2dc79d3..1811dbbd1a5f3f6078e7acd24b55d13a242c98bf 100644
--- a/paddle/testing/TestMain.cpp
+++ b/paddle/testing/TestMain.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/utils/Util.h"
+#include "paddle/legacy/utils/Util.h"
 
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
diff --git a/paddle/testing/TestUtil.cpp b/paddle/testing/TestUtil.cpp
index cfb8c713d96008a74287fb1248657c30f3b81164..fa8efc20f59addb4526d2cbeaf34f161307c588a 100644
--- a/paddle/testing/TestUtil.cpp
+++ b/paddle/testing/TestUtil.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "TestUtil.h"
 #include <gflags/gflags.h>
-#include "paddle/math/SparseMatrix.h"
+#include "paddle/legacy/math/SparseMatrix.h"
 
 DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
 
diff --git a/paddle/testing/TestUtil.h b/paddle/testing/TestUtil.h
index ec86469aebbafbf5406a21e6825eda6c105a6b9d..98b864e3c56f1938075bd039ba13a49ec457de50 100644
--- a/paddle/testing/TestUtil.h
+++ b/paddle/testing/TestUtil.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <gtest/gtest.h>
-#include "paddle/math/Matrix.h"
+#include "paddle/legacy/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index 0fea6a80794a64abc2dbf1428d534840febcd450..555be3d00e2dc467eec45210cc997779827ed69f 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -30,7 +30,9 @@ int main(int argc, char** argv) {
   new_argv.push_back(
       strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+  new_argv.push_back(strdup(
+      "--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_mb"));
+  new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb"));
 #endif
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
@@ -41,6 +43,6 @@ int main(int argc, char** argv) {
   paddle::memory::Used(paddle::platform::CUDAPlace(0));
 #endif
 
-  paddle::framework::InitDevices();
+  paddle::framework::InitDevices(true);
   return RUN_ALL_TESTS();
 }
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
deleted file mode 100644
index 72911695bd4959d73d783897b0c5e674454c30bc..0000000000000000000000000000000000000000
--- a/paddle/trainer/CMakeLists.txt
+++ /dev/null
@@ -1,78 +0,0 @@
-# paddle trainer package
-
-set(TRAINER_SOURCES
-        ParameterUpdater.cpp
-        ParamUtil.cpp
-        RemoteParameterUpdater.cpp
-        NewRemoteParameterUpdater.cpp
-        Tester.cpp
-        Trainer.cpp
-        TrainerInternal.cpp
-        TrainerBenchmark.cpp
-        ThreadParameterUpdater.cpp
-        TrainerInternalConfig.cpp
-        TrainerConfigHelper.cpp)
-
-set(TRAINER_HEADERS
-        ParameterUpdater.h
-        ParamUtil.h
-        RemoteParameterUpdater.h
-        NewRemoteParameterUpdater.h
-        Tester.h
-        TesterConfig.h
-        Trainer.h
-        TrainerInternal.h
-        TrainerInternalConfig.h
-        ThreadParameterUpdater.h
-        TrainerConfigHelper.h)
-
-if(NOT WITH_GOLANG)
-  list(REMOVE_ITEM TRAINER_SOURCES
-          NewRemoteParameterUpdater.cpp)
-  list(REMOVE_ITEM TRAINER_HEADERS
-          NewRemoteParameterUpdater.h)
-endif()
-
-add_library(paddle_trainer_lib STATIC
-    ${TRAINER_SOURCES})
-
-add_style_check_target(paddle_trainer_lib
-    ${TRAINER_SOURCES})
-add_style_check_target(paddle_trainer_lib
-    ${TRAINER_HEADERS})
-add_dependencies(paddle_trainer_lib
-    paddle_proto
-    ${external_project_dependencies})
-
-macro(add_paddle_exe TARGET_NAME)
-  add_executable(${TARGET_NAME} ${ARGN})
-  add_style_check_target(${TARGET_NAME} ${ARGN})
-  link_paddle_exe(${TARGET_NAME})
-endmacro()
-
-if(WITH_TESTING)
-  add_subdirectory(tests)
-endif()
-
-if(NOT MOBILE_INFERENCE)
-  add_paddle_exe(paddle_trainer TrainerMain.cpp)
-  add_paddle_exe(paddle_merge_model MergeModel.cpp)
-
-  install(TARGETS paddle_trainer paddle_merge_model
-          RUNTIME DESTINATION opt/paddle/bin
-          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
-
-  set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-  set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-endif()
-
-if(APPLE)
-  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
-endif()
-
-if(WITH_GOLANG)
-  add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
-  target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
-  target_link_libraries(paddle_trainer paddle_pserver_cclient)
-endif(WITH_GOLANG)
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
deleted file mode 100644
index 56c38015fb2398f8b39fac6b5a5d4af1c2fd56aa..0000000000000000000000000000000000000000
--- a/paddle/trainer/MergeModel.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-
-#include "ParamUtil.h"
-#include "Trainer.h"
-#include "paddle/pserver/ParameterServer2.h"
-#include "paddle/utils/PythonUtil.h"
-
-DEFINE_string(model_dir, "", "Directory for separated model files");
-DEFINE_string(config_file, "", "Config file for the model");
-DEFINE_string(model_file, "", "File for merged model file");
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() ||
-      FLAGS_model_file.empty()) {
-    LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 "
-                 "--config_file=config.py --model_file=out.paddle";
-    return 0;
-  }
-
-  string confFile = FLAGS_config_file;
-#ifndef PADDLE_WITH_CUDA
-  FLAGS_use_gpu = false;
-#endif
-  auto config = std::make_shared<TrainerConfigHelper>(confFile);
-  unique_ptr<GradientMachine> gradientMachine(GradientMachine::create(*config));
-  gradientMachine->loadParameters(FLAGS_model_dir);
-
-  ofstream os(FLAGS_model_file);
-
-  string buf;
-  config->getConfig().SerializeToString(&buf);
-  int64_t size = buf.size();
-  os.write((char*)&size, sizeof(size));
-  CHECK(os) << "Fail to write to " << FLAGS_model_file;
-  os.write(buf.data(), buf.size());
-  vector<ParameterPtr>& parameters = gradientMachine->getParameters();
-  for (auto& para : parameters) {
-    para->save(os);
-    CHECK(os) << "Fail to write to " << FLAGS_model_file;
-  }
-  os.close();
-
-  return 0;
-}
diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp
deleted file mode 100644
index 410ac6d95c4d65ce6fb25c05351bb8ddb24473f4..0000000000000000000000000000000000000000
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "NewRemoteParameterUpdater.h"
-#include "Trainer.h"
-#include "paddle/utils/Stat.h"
-
-DECLARE_int32(trainer_id);
-DECLARE_string(save_dir);
-
-namespace paddle {
-NewRemoteParameterUpdater::NewRemoteParameterUpdater(
-    const OptimizationConfig &config, const std::string pserverSpec)
-    : trainerConfig_(config),
-      parameterClient_(-1),
-      newParameters_(nullptr),
-      newGradients_(nullptr),
-      pserverSpec_(pserverSpec) {}
-
-NewRemoteParameterUpdater::NewRemoteParameterUpdater(
-    const OptimizationConfig &config,
-    const std::string pserverSpec,
-    const bool useEtcd)
-    : trainerConfig_(config),
-      parameterClient_(-1),
-      newParameters_(nullptr),
-      newGradients_(nullptr),
-      pserverSpec_(pserverSpec),
-      useEtcd_(useEtcd) {}
-
-void NewRemoteParameterUpdater::init(
-    const std::vector<ParameterPtr> &parameters) {
-  ParameterUpdater::init(parameters);
-
-  // create parameter server client.
-  if (useEtcd_) {
-    parameterClient_ =
-        paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str());
-  } else {
-    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
-                                                 FLAGS_trainer_id == 0);
-  }
-
-  // init new parameter and gradient.
-  newParameters_ = initNewParameter(PARAMETER_VALUE);
-  newGradients_ = initNewParameter(PARAMETER_GRADIENT);
-
-  // init parameter, one trainer will get the opportunity to int parameter and
-  // send them to parameter server. Others will get the initialized parameter
-  // from parameter server
-  if (paddle_begin_init_params(parameterClient_)) {
-    LOG(INFO) << "paddle_begin_init_params start";
-    // NOTE: convert V1 OptimizatioinConfig proto to V2 OptimizerConfig.
-    // This makes golang pserver compatible with handy V1 demos.
-    // TODO(wuyi): Refine or remove these ugly converting lines
-    OptimizerConfig optimizerConfigV2;
-    if (trainerConfig_.learning_method() == "momentum") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
-    } else if (trainerConfig_.learning_method() == "adagrad") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
-      optimizerConfigV2.mutable_adagrad()->set_epsilon(
-          trainerConfig_.ada_epsilon());
-    } else if (trainerConfig_.learning_method() == "adadelta") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
-      optimizerConfigV2.mutable_adadelta()->set_epsilon(
-          trainerConfig_.ada_epsilon());
-      optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou());
-    } else if (trainerConfig_.learning_method() == "adam") {
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam);
-      optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1());
-      optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2());
-      optimizerConfigV2.mutable_adam()->set_epsilon(
-          trainerConfig_.adam_epsilon());
-    } else {
-      LOG(ERROR) << "got unsupported v1 optimizer config: "
-                 << trainerConfig_.learning_method();
-      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
-    }
-
-    if (trainerConfig_.learning_rate_schedule() == "constant") {
-      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
-      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
-          trainerConfig_.learning_rate());
-    } else if (trainerConfig_.learning_rate_schedule() == "linear") {
-      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear);
-      optimizerConfigV2.mutable_linear_lr()->set_learning_rate(
-          trainerConfig_.learning_rate());
-      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a(
-          trainerConfig_.learning_rate_decay_a());
-      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b(
-          trainerConfig_.learning_rate_decay_b());
-    } else {
-      LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: "
-                 << trainerConfig_.learning_rate_schedule() << ", set to const";
-      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
-      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
-          trainerConfig_.learning_rate());
-    }
-
-    // overwrite optimizerConfigV2 for per-parameter(layer) configs
-    for (int i = 0; i < parameterSize(); ++i) {
-      // FIXME(typhoonzero): paramConfig always have default values,
-      // how to check if it's default?
-      // TODO(typhoonzero): log output: optimizerConfigV2.DebugString();
-      LOG(INFO) << "trainerConfig_: " << trainerConfig_.DebugString();
-      // send param and config to pserver
-      std::string bytes = optimizerConfigV2.SerializeAsString();
-      const char *array = bytes.data();
-      int size = (int)bytes.size();
-      paddle_init_param(
-          parameterClient_, *newParameters_[i], (void *)array, size);
-    }
-    paddle_finish_init_params(parameterClient_);
-    LOG(INFO) << "paddle_begin_init_params done";
-  } else {
-    paddle_get_params(parameterClient_, newParameters_, parameterSize());
-  }
-
-  LOG(INFO) << "NewRemoteParameterUpdater initialized";
-}
-
-void NewRemoteParameterUpdater::updateImpl(Parameter *para) {}
-
-void NewRemoteParameterUpdater::finishBatch(real cost) {
-  // send gradient to parameter server.
-  paddle_send_grads(parameterClient_, newGradients_, parameterSize());
-  // get the updated parameter from parameterClient.
-  paddle_get_params(parameterClient_, newParameters_, parameterSize());
-
-  // clear gradient after update parameter.
-  for (auto &para : parameters_) {
-    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-  }
-}
-
-void NewRemoteParameterUpdater::startPass() {}
-
-bool NewRemoteParameterUpdater::finishPass() { return true; }
-}  // namespace paddle
diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h
deleted file mode 100644
index 6223ba427c9b94494c2bee8f0847442f1b0574c9..0000000000000000000000000000000000000000
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <thread>
-#include "OptimizerConfig.pb.h"
-#include "ParameterUpdater.h"
-#include "libpaddle_pserver_cclient.h"
-#include "paddle/pserver/ParameterClient2.h"
-#include "paddle/utils/Queue.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-/**
- * New remote parameter updater for dense parameters that use cclient of go.
- */
-class NewRemoteParameterUpdater : public ParameterUpdater {
-public:
-  NewRemoteParameterUpdater(const OptimizationConfig& config,
-                            const std::string pserverSpec);
-  NewRemoteParameterUpdater(const OptimizationConfig& config,
-                            const std::string pserverSpec,
-                            const bool useEtcd);
-  ~NewRemoteParameterUpdater() {
-    releaseNewParameter(newParameters_);
-    releaseNewParameter(newGradients_);
-    if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_);
-  }
-
-  /**
-   * initialize the internal parameter client and itself.
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-  /**
-   * @brief start batch
-   *
-   * @note  one batch training exhibits stateful feature to help
-   *        to do performance tuning, sgd optimization if necessary.
-   */
-  virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; }
-
-  /**
-   * send parameters to pservers and get returned parameters
-   * from all pservers if necessary.
-   */
-  virtual void finishBatch(real cost);
-  virtual void startPass();
-  virtual bool finishPass();
-
-protected:
-  /**
-   * work need to do after finishBatch
-   */
-  virtual void updateImpl(Parameter* para);
-
-private:
-  int parameterSize() { return (int)parameters_.size(); }
-
-  /**
-   * init parameter of go paddle pserver cclient.
-   * @param new_params
-   * @param type
-   */
-  paddle_parameter** initNewParameter(ParameterType type) {
-    paddle_parameter** new_params =
-        (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize());
-    for (int i = 0; i < parameterSize(); ++i) {
-      new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
-      memset(new_params[i], 0, sizeof(paddle_parameter));
-    }
-
-    for (int i = 0; i < parameterSize(); ++i) {
-      ParameterPtr param = parameters_[i];
-      new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-      new_params[i]->name = (char*)param->getName().c_str();
-      new_params[i]->content =
-          (unsigned char*)(param->getBuf(type).get()->getData());
-      new_params[i]->content_len =
-          (int)param->getBuf(type).get()->getSize() * sizeof(real);
-    }
-    return new_params;
-  }
-
-  void releaseNewParameter(paddle_parameter** newParams) {
-    if (newParams != nullptr) {
-      for (int i = 0; i < parameterSize(); ++i) {
-        free(newParams[i]);
-      }
-      free(newParams);
-    }
-  }
-
-protected:
-  const OptimizationConfig& trainerConfig_;
-  /// internal parameter client object for exchanging data with pserver
-  paddle_pserver_client parameterClient_;
-  /// the parameters for new pserver client
-  paddle_parameter** newParameters_;
-  /// the gradinets for new pserver client
-  paddle_parameter** newGradients_;
-  /// the specification of parameter server "host1:port,host1:port"
-  std::string pserverSpec_;
-  /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr
-  bool useEtcd_;
-};
-
-}  // namespace paddle
diff --git a/paddle/trainer/ParamUtil.cpp b/paddle/trainer/ParamUtil.cpp
deleted file mode 100644
index ffbca42e106591ddeb2cefcfafbeb408c544371b..0000000000000000000000000000000000000000
--- a/paddle/trainer/ParamUtil.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParamUtil.h"
-
-#include <fenv.h>
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-#include <paddle/utils/Version.h>
-
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-#include "TesterConfig.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/gserver/layers/ValidationLayer.h"
-
-namespace paddle {
-
-ParameterUtil::ParameterUtil(
-    const std::shared_ptr<TrainerConfigHelper> &config,
-    std::unique_ptr<ParameterUtilConfig> &&intconfig,
-    const GradientMachinePtr &gradientMachine,
-    const std::shared_ptr<ParameterUpdater> &parameterUpdater) {
-  config_ = config;
-  intConfig_ = std::move(intconfig);
-  gserver_ = gradientMachine;
-  pUpdater_ = parameterUpdater;
-}
-
-bool ParameterUtil::loadParameters(int passId, bool local, bool remote) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "pass-%05d", passId);
-  std::string doneFile = path::join(config_->getSaveDir(), buf, "done");
-  if (!fileExist(doneFile.c_str())) return false;
-  loadParametersWithPath(path::join(config_->getSaveDir(), buf), local, remote);
-  return true;
-}
-
-void ParameterUtil::loadParametersWithPath(const std::string &dir,
-                                           bool local,
-                                           bool remote) {
-  if (local) {
-    gserver_->loadParameters(dir);
-  }
-  if (remote && pUpdater_) {
-    pUpdater_->loadParametersRemote(dir);
-  }
-}
-
-void ParameterUtil::saveParametersOnePass(int passId, int passInnerId) {
-  pUpdater_->apply();
-  saveParameters(passId, passInnerId);
-  if (intConfig_->save_only_one_ && passId >= intConfig_->saving_period_) {
-    deleteParameters(passId - intConfig_->saving_period_);
-  }
-  pUpdater_->restore();
-}
-
-void ParameterUtil::saveParameters(int passId, int passInnerId) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  if (passInnerId > 0) {
-    snprintf(buf, kBufLen, "pass-%05d-%03d", passId, passInnerId);
-  } else {
-    snprintf(buf, kBufLen, "pass-%05d", passId);
-  }
-
-  std::string basePath = config_->getSaveDir();
-  if (basePath.find('/') == std::string::npos) {
-    basePath = "./" + basePath;
-  }
-  mkDirRecursively(basePath.c_str());
-
-  std::string saveDir = path::join(basePath, buf);
-  mkDir(saveDir.c_str());
-  if (!intConfig_->load_save_param_pserver_) {
-    pUpdater_->getParametersRemote(true /*full parameter*/,
-                                   true /*after apply*/);
-  }
-
-  gserver_->saveParameters(saveDir);
-  if (intConfig_->load_save_param_pserver_) {
-    pUpdater_->saveParametersRemote(saveDir);
-  }
-  std::string doneFile = path::join(saveDir, "done");
-  touchFile(doneFile.c_str());
-  std::ofstream out(doneFile);
-  version::printVersion(out);
-  out.close();
-  VLOG(1) << "save dir " << saveDir;
-  saveConfigWithPath(saveDir);
-}
-
-void ParameterUtil::deleteParameters(int passId, int passInnerId) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  const std::string &saveDir = config_->getSaveDir();
-  if (passInnerId > 0) {
-    snprintf(buf,
-             kBufLen,
-             "%s/pass-%05d-%03d",
-             saveDir.c_str(),
-             passId,
-             passInnerId);
-  } else {
-    snprintf(buf, kBufLen, "%s/pass-%05d", saveDir.c_str(), passId);
-  }
-  mkDir(saveDir.c_str());
-  LOG(INFO) << "delete dir " << buf;
-  rmDir(buf);
-}
-
-void ParameterUtil::saveConfigWithPath(const std::string &path) {
-  std::string src;
-  // save config in some path
-  if (!intConfig_->config_.empty()) {
-    src = intConfig_->config_;
-  } else {
-    bool ok;
-    src = config_->getConfigName(&ok);
-    if (!ok) {
-      return;
-    }
-  }
-  copyFileToPath(src, path);
-
-  // save other import config file name to path.txt
-  std::string ss = path::join(path, "path.txt");
-  std::ofstream os(ss);
-  std::string fileName = path::basename(src);
-  CHECK(os.write(fileName.c_str(), fileName.length()))
-      << "Fail to write config file name " << ss;
-  VLOG(1) << "fileName " << fileName;
-  os.close();
-
-  // copy other import config files
-  for (int i = 0; i < config_->getConfig().config_files_size(); ++i) {
-    copyFileToPath(config_->getConfig().config_files(i), path);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/trainer/ParamUtil.h b/paddle/trainer/ParamUtil.h
deleted file mode 100644
index 2e05595848760c9abd7d916003656c8103151abf..0000000000000000000000000000000000000000
--- a/paddle/trainer/ParamUtil.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/gserver/dataproviders/DataProvider.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParameterUpdater.h"
-#include "TrainerConfig.pb.h"
-#include "TrainerConfigHelper.h"
-
-namespace paddle {
-
-/**
- * Configuration for parameter utils.
- */
-struct ParameterUtilConfig {
-  DISABLE_COPY(ParameterUtilConfig);
-
-  ParameterUtilConfig(bool save_only_one,
-                      int saving_period,
-                      bool load_save_parameters_in_pserver,
-                      std::string config)
-      : save_only_one_(save_only_one),
-        saving_period_(saving_period),
-        load_save_param_pserver_(load_save_parameters_in_pserver),
-        config_(config) {}
-
-  bool save_only_one_;
-  int saving_period_;
-  bool load_save_param_pserver_;
-  std::string config_;
-};
-
-/**
- * ParameterUtil
- * Utility class for loading and saving parameters
- */
-class ParameterUtil {
-public:
-  /**
-   * Ctor.
-   *
-   * @param config
-   * @param intconfig
-   * @param gradientMachine
-   * @param parameterUpdater
-   * @return
-   */
-  ParameterUtil(const std::shared_ptr<TrainerConfigHelper> &config,
-                std::unique_ptr<ParameterUtilConfig> &&intconfig,
-                const GradientMachinePtr &gradientMachine,
-                const std::shared_ptr<ParameterUpdater> &parameterUpdater);
-
-  /// Load parameter from the saved parameter file as pass passId
-  /// if loadsave_parameters_in_pserver is set, some parameters MUST
-  /// load in pserver, which is "remote".
-  /// loadParameters can choose to load local/remote parameter, or both.
-  bool loadParameters(int passId, bool local = true, bool remote = false);
-
-  /// load parameters given path info
-  void loadParametersWithPath(const std::string &dir,
-                              bool local = true,
-                              bool remote = false);
-
-  /// Save parameter to dist for pass passId
-  /// passInnerId means saving times in one pass, some users want to
-  /// save parameters when have processed some batches in one pass
-  /// passInnerId = 0 means do not need to save in one inner pass
-  void saveParameters(int passId, int passInnerId = 0);
-
-  /// save parameters for one pass, when passInnerId > 0 means saving
-  /// the passInnerId times in one pass
-  void saveParametersOnePass(int passId, int passInnerId = 0);
-
-  /// delete parameter from disk via passId
-  void deleteParameters(int passId, int passInnerId = 0);
-
-  /// save config given path info
-  void saveConfigWithPath(const std::string &path);
-
-  /**
-   * Try to load parameter from config.
-   * @return true if can load from trainer config.
-   */
-  inline bool tryLoadParametersFromConfig() {
-    auto &c = config_->getConfig();
-    if (!c.init_model_path().empty()) {
-      loadParametersWithPath(c.init_model_path());
-      return true;
-    } else if (c.start_pass() > 0) {
-      CHECK(loadParameters(c.start_pass() - 1));
-      return true;
-    } else {
-      return false;
-    }
-  }
-
-private:
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::unique_ptr<ParameterUtilConfig> intConfig_;
-  GradientMachinePtr gserver_;
-  std::shared_ptr<ParameterUpdater> pUpdater_;
-};
-
-}  //  namespace paddle
diff --git a/paddle/trainer/ParameterUpdater.cpp b/paddle/trainer/ParameterUpdater.cpp
deleted file mode 100644
index 4e9e890c85945aedd7e604f52a06902191c95d4c..0000000000000000000000000000000000000000
--- a/paddle/trainer/ParameterUpdater.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ParameterUpdater.h"
-
-#include "paddle/utils/Logging.h"
-
-#include "paddle/utils/Thread.h"
-
-namespace paddle {
-
-static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1;
-static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2;
-
-SgdUpdaterWithCpuAverager::SgdUpdaterWithCpuAverager(
-    const OptimizationConfig& optConfig)
-    : SgdLocalUpdater(optConfig, false /*with averager*/) {
-  CHECK(FLAGS_use_gpu && optConfig.do_average_in_cpu());
-  averager_.reset(AverageOptimizer::create(optConfig,
-                                           new DummyOptimizer(optConfig),
-                                           false /*sparse*/,
-                                           true /*apply*/));
-  updateWorker_.addJob([]() { hl_set_device(FLAGS_gpu_id); });
-}
-
-void SgdUpdaterWithCpuAverager::init(
-    const std::vector<ParameterPtr>& parameters) {
-  SgdLocalUpdater::init(parameters);
-  averager_->init(parameters_.size(), nullptr);
-  copyEvents_.resize(parameters_.size());
-  for (auto& parameter : parameters) {
-    SetDevice device(parameter->getDeviceId());
-    cpuParameters_.emplace_back(new Parameter(parameter->getConfig(),
-                                              /* useGpu= */ false,
-                                              /* doInit= */ false));
-    if (parameter->useGpu()) {
-      cpuParameters_.back()->enableType(PARAMETER_APPLY);
-    } else {
-      cpuParameters_.back()->enableSharedType(
-          PARAMETER_APPLY, parameter->getBuf(PARAMETER_VALUE));
-    }
-    for (ParameterType type : averager_->getParameterTypes()) {
-      cpuParameters_.back()->enableType(type);
-    }
-
-    hl_create_event(&copyEvents_[nonStaticParaIDMap_[parameter->getID()]]);
-  }
-}
-
-SgdUpdaterWithCpuAverager::~SgdUpdaterWithCpuAverager() {
-  for (auto& event : copyEvents_) {
-    hl_destroy_event(event);
-  }
-}
-
-void SgdUpdaterWithCpuAverager::updateImpl(Parameter* para) {
-  SgdLocalUpdater::updateImpl(para);
-
-  if (para->useGpu()) {
-    size_t pid = nonStaticParaIDMap_[para->getID()];
-    Parameter* cpuPara = cpuParameters_[pid].get();
-    cpuPara->getBuf(PARAMETER_VALUE)
-        ->copyFrom(*para->getBuf(PARAMETER_VALUE), kDeviceToHostStream);
-    hl_stream_record_event(kDeviceToHostStream, copyEvents_[pid]);
-  }
-
-  updateWorker_.addJob(
-      std::bind(&SgdUpdaterWithCpuAverager::updateFunc, this, para));
-}
-
-void SgdUpdaterWithCpuAverager::updateFunc(Parameter* para) {
-  SetDevice setDevice(para->getDeviceId());
-  size_t pid = nonStaticParaIDMap_[para->getID()];
-  Parameter* cpuPara = cpuParameters_[pid].get();
-  if (para->useGpu()) {
-    hl_event_synchronize(copyEvents_[pid]);
-  }
-  averager_->update(cpuPara->getBufs(), cpuPara->getConfig(), -1LU);
-}
-
-void SgdUpdaterWithCpuAverager::finishBatch(real cost) {
-  SgdLocalUpdater::finishBatch(cost);
-
-  updateWorker_.wait();
-  for (auto para : cpuParameters_) {
-    if (auto callback = averager_->needSpecialTraversal(para->getConfig())) {
-      callback(para->getBufs(), para->getConfig(), -1LU);
-    }
-  }
-  averager_->finishBatch();
-}
-
-void SgdUpdaterWithCpuAverager::apply() {
-  // backup gpu value
-  for (auto& para : parameters_) {
-    SetDevice setDevice(para->getDeviceId());
-    para->getBuf(PARAMETER_GRADIENT)
-        ->copyFrom(*para->getBuf(PARAMETER_VALUE), kHostToDeviceStream);
-  }
-
-  // apply on cpu parameter
-  if (auto callback = averager_->apply()) {
-    for (auto para : cpuParameters_) {
-      callback(para->getBufs(), para->getConfig(), -1LU);
-    }
-  }
-
-  // copy to gpu value
-  for (auto& para : parameters_) {
-    SetDevice setDevice(para->getDeviceId());
-    size_t pid = nonStaticParaIDMap_[para->getID()];
-    Parameter* cpuPara = cpuParameters_[pid].get();
-    if (parameters_[pid]->useGpu()) {
-      para->getBuf(PARAMETER_VALUE)
-          ->copyFrom(*cpuPara->getBuf(PARAMETER_APPLY), kHostToDeviceStream);
-    }
-  }
-  hl_stream_synchronize(kHostToDeviceStream);
-  for (auto& para : parameters_) {
-    para->setValueUpdated();
-  }
-}
-
-void SgdUpdaterWithCpuAverager::restore() {
-  // restore on cpu parameter
-  if (auto callback = averager_->restore()) {
-    for (auto para : cpuParameters_) {
-      callback(para->getBufs(), para->getConfig(), -1LU);
-    }
-  }
-
-  // restore gpu value
-  for (auto& para : parameters_) {
-    SetDevice device(para->getDeviceId());
-    para->getBuf(PARAMETER_VALUE)->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
-    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-    para->setValueUpdated();
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
deleted file mode 100644
index 9e9e948b8856d2712f8894b3d14db9c795d5f694..0000000000000000000000000000000000000000
--- a/paddle/trainer/ParameterUpdater.h
+++ /dev/null
@@ -1,265 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/utils/Thread.h"
-#include "paddle/utils/Util.h"
-
-#include "paddle/parameter/AverageOptimizer.h"
-#include "paddle/parameter/FirstOrderOptimizer.h"
-#include "paddle/parameter/OptimizerFunctions.h"
-#include "paddle/parameter/OptimizerWithRegularizer.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/ParameterUpdaterBase.h"
-
-#include "TrainerConfig.pb.h"
-#include "paddle/gserver/layers/Layer.h"
-
-#include <memory>
-#include <vector>
-
-namespace paddle {
-
-/**
- * @brief Parameter Updater for SGD, and local(not cluster) run.
- */
-class SgdLocalUpdater : public ParameterUpdater {
-public:
-  /**
-   * @brief Ctor. Initialize optimizer locally by optConfig.
-   * @param optConfig optimization config.
-   * @param withAverager with average optimizer or not, default is true.
-   */
-  explicit SgdLocalUpdater(const OptimizationConfig& optConfig,
-                           bool withAverager = true)
-      : numSamplesProcessed_(0) {
-    auto baseOptimizer = ParameterOptimizer::create(optConfig);
-    optimizer_.reset(withAverager
-                         ? AverageOptimizer::create(optConfig, baseOptimizer)
-                         : baseOptimizer);
-    CHECK(optimizer_) << "fail to create optimizer: "
-                      << optConfig.learning_method();
-    auto types = optimizer_->getParameterTypes();
-    for (auto type : types) {
-      addParameterType(type);
-    }
-  }
-
-  /**
-   * @brief Initialize parameters and optimizer_.
-   *        For example,
-   *           If optimizer need hassien vector, then parameter's hassien will
-   *           be initialized.
-   * @param parameters The parameter need to be initialized.
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters) {
-    ParameterUpdater::init(parameters);
-    optimizer_->init(parameters_.size(), nullptr);
-    // check no L1 decay in parameter configs
-    CHECK(std::find_if(parameters.begin(),
-                       parameters.end(),
-                       [](const ParameterPtr& para) {
-                         return para->getConfig().decay_rate_l1() > 0.0f;
-                       }) == parameters.end())
-        << "SgdLocalUpdater cannot support L1 decay in parameter";
-  }
-
-  /**
-   * @brief Start a batch with current mini-batch size
-   * @param current mini-batch size.
-   * @return Always PASS_TRAIN.
-   */
-  virtual PassType startBatch(int64_t batchSize) {
-    numSamplesProcessed_ += batchSize;
-    optimizer_->startBatch(numSamplesProcessed_);
-    return PASS_TRAIN;
-  }
-
-  /**
-   * @brief finish a mini-batch.
-   */
-  virtual void finishBatch(real cost) { optimizer_->finishBatch(); }
-
-  /**
-   * @brief start a pass.
-   */
-  virtual void startPass() { optimizer_->startPass(); }
-
-  /**
-   * @brief finish a pass.
-   * @param cost sum cost during one pass.
-   * @return true if accept (used for owlqn).
-   */
-  virtual bool finishPass() {
-    optimizer_->finishPass();
-    return ParameterUpdater::finishPass();
-  }
-
-  /**
-   * @brief apply model average.
-   */
-  virtual void apply() {
-    if (auto callback = optimizer_->apply()) {
-      for (auto para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        callback(para->getBufs(), para->getConfig(), -1UL);
-      }
-    }
-  }
-
-  /**
-   * @brief restore parameter value before model average
-   */
-  virtual void restore() {
-    if (auto callback = optimizer_->restore()) {
-      for (auto para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        callback(para->getBufs(), para->getConfig(), -1UL);
-      }
-    }
-  }
-
-protected:
-  /**
-   * @brief update method. Update value from gradient.
-   * @param para parameter that will be updated.
-   */
-  virtual void updateImpl(Parameter* para) {
-    optimizer_->update(para->getBufs(), para->getConfig());
-    if (auto callback = optimizer_->needSpecialTraversal(para->getConfig())) {
-      callback(para->getBufs(), para->getConfig(), -1UL);
-    }
-
-    para->setValueUpdated();
-    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-  }
-
-  std::unique_ptr<ParameterOptimizer> optimizer_;
-
-  /**
-   * @brief total number of samples processed.
-   */
-  int64_t numSamplesProcessed_;
-};
-
-/**
- * @brief SgdCpuUpdater is used only in recursive neural network
- * @deprecated
- */
-class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated {
-public:
-  explicit SgdCpuUpdater(const OptimizationConfig& optConfig)
-      : SgdLocalUpdater(optConfig),
-        Deprecated(
-            "SgdCpuUpdater is used only in recursive neural network, "
-            "and recursive neural network is deprecated in paddle. "
-            "Use it all by your own.") {}
-
-  /**
-   * @brief update all parameter on finish batch.
-   * @param cost
-   */
-  virtual void finishBatch(real cost) {
-    for (auto para : parameters_) {
-      SgdLocalUpdater::update(para.get());
-    }
-    optimizer_->finishBatch();
-  }
-
-protected:
-  /**
-   * @brief do nothing.
-   * @param para
-   */
-  virtual void updateImpl(Parameter* para) {}
-};
-
-/**
- * @brief Sgd Local Updater With average in cpu.
- *
- * It will do model average in cpu to reduce gpu memory comsuption.
- */
-class SgdUpdaterWithCpuAverager : public SgdLocalUpdater {
-public:
-  /**
-   * @brief Ctor.
-   *
-   * SgdUpdaterWithCpuAverager will do everything as a
-   * SgdLocalUpdater, then copy parameter from GPU to CPU, and do model
-   * average in cpu.
-   */
-  explicit SgdUpdaterWithCpuAverager(const OptimizationConfig& optConfig);
-  ~SgdUpdaterWithCpuAverager();
-
-  /**
-   * @brief init. Initialize cpu parameters, model average optimizer.
-   * @param parameters
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-
-  virtual PassType startBatch(int64_t batchSize) {
-    averager_->startBatch(-1UL);
-    return SgdLocalUpdater::startBatch(batchSize);
-  }
-  virtual void finishBatch(real cost);
-
-  virtual void startPass() {
-    averager_->startPass();
-    SgdLocalUpdater::startPass();
-  }
-  virtual bool finishPass() {
-    averager_->finishPass();
-    return SgdLocalUpdater::finishPass();
-  }
-
-  /// apply the averaged parameter to PARAMETER_VALUE
-  /// use PARAETER_GRADIENT for backing up PARAMETER_VALUE
-  virtual void apply();
-
-  /**
-   * @brief Restore parameter before apply().
-   */
-  virtual void restore();
-
-protected:
-  virtual void updateImpl(Parameter* para);
-
-  void updateFunc(Parameter* para);
-
-protected:
-  std::unique_ptr<ParameterOptimizer> averager_;
-
-  /**
-   * @brief The thread worker which do model average.
-   *
-   * For each parameter, GPU->CPU parameter is async, and do model average in
-   * another thread. Because the training process don't need model average while
-   * training, and model average only used in evaluation stage and saving stage.
-   * So the model average is totally async.
-   */
-  ThreadWorker updateWorker_;
-
-  /**
-   * @brief The parameter mirror in cpu.
-   */
-  std::vector<ParameterPtr> cpuParameters_;
-
-  /**
-   * @brief GPU -> CPU copy event. Model average will wait after copy done.
-   */
-  std::vector<hl_event_t> copyEvents_;
-};
-
-}  // namespace paddle
diff --git a/paddle/trainer/RemoteParameterUpdater.cpp b/paddle/trainer/RemoteParameterUpdater.cpp
deleted file mode 100644
index 7314266cb24da9b9e9f0f1cbe61ed363247f51fe..0000000000000000000000000000000000000000
--- a/paddle/trainer/RemoteParameterUpdater.cpp
+++ /dev/null
@@ -1,843 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "RemoteParameterUpdater.h"
-#include "Trainer.h"
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/Stat.h"
-
-DECLARE_int32(trainer_id);
-DECLARE_string(save_dir);
-
-namespace paddle {
-
-static const hl_stream_t kDeviceToHostStream = HPPL_STREAM_1;
-static const hl_stream_t kHostToDeviceStream = HPPL_STREAM_2;
-static const int kFinishBatchPid = -1;
-
-const std::string RemoteParameterUpdater::kAverage = "average";
-const std::string RemoteParameterUpdater::kElasticAverage = "elastic_average";
-
-RemoteParameterUpdater::RemoteParameterUpdater(
-    const OptimizationConfig& config,
-    int expectedPassCount,
-    std::unique_ptr<ParameterUpdater>&& localUpdater)
-    : config_(config),
-      localUpdater_(std::move(localUpdater)),
-      numBatches_(0),
-      passCount_(0),
-      expectedPassCount_(expectedPassCount),
-      separateSendAndRecv_(false),
-      isFirstPass_(true),
-      useApplyInPserver_(false) {
-  addParameterType(PARAMETER_MOMENTUM);
-}
-
-void RemoteParameterUpdater::init(const std::vector<ParameterPtr>& parameters) {
-  ParameterUpdater::init(parameters);
-
-  if (localUpdater_) {
-    localUpdater_->init(parameters);
-
-    for (auto& parameter : parameters) {
-      parameter->enableType(PARAMETER_DELTA);
-    }
-
-    CHECK(config_.center_parameter_update_method() == kAverage ||
-          config_.center_parameter_update_method() == kElasticAverage)
-        << "unknown center_parameter_update_method";
-
-    // modify delta_add_rate
-    CHECK_GT(FLAGS_num_gradient_servers, 1)
-        << "FLAGS_num_gradient_servers should be set in trainer args.";
-    real delta_add_rate = config_.delta_add_rate() / FLAGS_num_gradient_servers;
-    config_.set_delta_add_rate(delta_add_rate);
-    LOG(INFO) << "center parameter in pserver,"
-              << " modify delta_add_rate=" << delta_add_rate;
-  }
-
-  if (!FLAGS_use_gpu) {
-    cpuParameters_ = parameters;
-  } else {
-    for (auto& parameter : parameters) {
-      cpuParameters_.emplace_back(new Parameter(parameter->getConfig(),
-                                                /* useGpu= */ false));
-      cpuParameters_.back()->setID(parameter->getID());
-      if (localUpdater_) {
-        cpuParameters_.back()->enableType(PARAMETER_DELTA);
-      }
-    }
-  }
-
-  parameterClient_.reset(new ParameterClient2(separateSendAndRecv_));
-  parameterClient_->init(cpuParameters_);
-  parameterClient_->setTrainerId(FLAGS_trainer_id);
-
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->setConfig(config_);
-    copyParametersFromDevice(PARAMETER_VALUE);
-    parameterClient_->setParameter();
-    parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY);
-  } else {
-    parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY);
-    parameterClient_->getParameter();
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-  if (FLAGS_trainer_id == 0 &&
-      (config_.algorithm() != TrainAlgorithm::AsyncSGD)) {
-    startController();
-    useApplyInPserver_ = useApplyInPserver(config_);
-  }
-}
-
-void RemoteParameterUpdater::startController() {
-  controllerThread_.reset(new std::thread([this]() { this->controller(); }));
-}
-
-void RemoteParameterUpdater::controller() {
-  ParameterClient2 client(false);
-  client.init(cpuParameters_);
-  while (true) {
-    /*start pass*/ {
-      client.waitPassStart();
-
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_START_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ false,
-                         /* sendBackarameter= */ false,
-                         /* releasePass= */ false);
-    }
-
-    while (true) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_SGD);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ false);
-      if (client.isPassFinish()) {
-        break;
-      }
-    }
-
-    /*finish pass*/ {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_FINISH_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ true);
-    }
-
-    passCount_++;
-    if (passCount_ == expectedPassCount_) {
-      break;
-    }
-  }
-}
-
-void RemoteParameterUpdater::copyParametersToDevice(
-    ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int numParameters = cpuParameters_.size();
-  for (int i = 0; i < numParameters; ++i) {
-    parameters_[i]
-        ->getBuf(parameterType)
-        ->copyFrom(*cpuParameters_[i]->getBuf(parameterType));
-    if (parameterType == PARAMETER_VALUE) {
-      parameters_[i]->setValueUpdated();
-    }
-  }
-}
-
-void RemoteParameterUpdater::copyParametersFromDevice(
-    ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int numParameters = cpuParameters_.size();
-  for (int i = 0; i < numParameters; ++i) {
-    cpuParameters_[i]
-        ->getBuf(parameterType)
-        ->copyFrom(*parameters_[i]->getBuf(parameterType));
-  }
-}
-
-void RemoteParameterUpdater::updateImpl(Parameter* para) {
-  REGISTER_TIMER("update");
-  if (localUpdater_) {
-    localUpdater_->update(para);
-  }
-}
-
-void RemoteParameterUpdater::finishBatch(real cost) {
-  if (localUpdater_) {
-    localUpdater_->finishBatch(cost);
-  }
-
-  const std::string& algorithm = config_.algorithm();
-  ParameterUpdateMode mode;
-  if (algorithm == TrainAlgorithm::AsyncSGD) {
-    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
-  } else if (algorithm == TrainAlgorithm::SGD) {
-    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
-  } else {
-    LOG(FATAL) << "Unknown algorithm: " << algorithm;
-  }
-
-  ParameterType sendType;
-  bool sendBackParameter = true;
-  if (localUpdater_) {
-    ++numBatches_;
-    if (numBatches_ % config_.num_batches_per_send_parameter() != 0) {
-      return;
-    }
-
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      parameterClient_->getParameter(PARAMETER_DELTA);
-      copyParametersToDevice(PARAMETER_DELTA);
-      sendBackParameter = false;  // no need send back after send
-
-      // calc delta
-      for (auto& para : parameters_) {
-        // DELTA = LOCAL_VALUE - CENTER_VALUE/*store in DELTA*/
-        para->getBuf(PARAMETER_DELTA)
-            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
-
-        // when delta send to pserver, pserver will do:
-        // CENTER_VALUE += alpha * (LOCAL_VALUE - CENTER_VALUE)
-      }
-    } else {
-      // calc delta
-      for (auto& para : parameters_) {
-        // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/
-        para->getBuf(PARAMETER_DELTA)
-            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
-      }
-    }
-
-    sendType = PARAMETER_DELTA;
-
-  } else {
-    // In this case, we perform SGD on pserver.
-    sendType = PARAMETER_GRADIENT;
-  }
-
-  copyParametersFromDevice(sendType);
-
-  {
-    REGISTER_TIMER("sendAndRecv_dense");
-    parameterClient_->sendAndReceiveParameter(mode,
-                                              sendType,
-                                              batchSize_,
-                                              0,  // cost = 0
-                                              sendBackParameter);
-  }
-
-  if (sendBackParameter) {
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-
-  if (localUpdater_) {
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        // LOCAL_VALUE += -alpha * (LOCAL_VALUE - CENTER_VALUE)
-        para->getBuf(PARAMETER_VALUE)
-            ->add(*para->getBuf(PARAMETER_DELTA), -config_.delta_add_rate());
-      }
-
-    } else {  // average
-      // copy value to delta
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-      }
-    }
-  } else {
-    for (auto& para : parameters_) {
-      SetDevice device(para->getDeviceId());
-      para->getBuf(sendType)->zeroMem();
-    }
-  }
-}
-
-void RemoteParameterUpdater::startPass() {
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassStart();
-  } else {
-    // sync could benifits reducing lagged trainer for async-sgd
-    // even if sync could not remove all lagged trainer for the
-    // sake of file loading, buffer etc.
-    parameterClient_->asyncStartPass();
-  }
-
-  if (localUpdater_) {
-    localUpdater_->startPass();
-    numBatches_ = 0;
-
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      if (!isFirstPass_) {
-        // restore local value from delta
-        for (auto& para : parameters_) {
-          SetDevice device(para->getDeviceId());
-          para->getBuf(PARAMETER_VALUE)
-              ->copyFrom(*para->getBuf(PARAMETER_DELTA));
-        }
-      }
-    } else {  // average
-      // copy value to delta
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-      }
-    }
-  }
-}
-
-bool RemoteParameterUpdater::finishPass() {
-  if (localUpdater_) {
-    localUpdater_->finishPass();
-  }
-
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassFinish();
-  } else {
-    parameterClient_->asyncFinishPass();
-  }
-  if (localUpdater_) {
-    if (config_.center_parameter_update_method() == kElasticAverage) {
-      // backup local value to delta as we will get
-      // the remote parameter for saving/testing
-      for (auto& para : parameters_) {
-        SetDevice device(para->getDeviceId());
-        para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-      }
-    }
-  }
-  parameterClient_->getParameter();
-  copyParametersToDevice(PARAMETER_VALUE);
-
-  isFirstPass_ = false;
-  return true;
-}
-
-void RemoteParameterUpdater::apply() {
-  if (useApplyInPserver_) {
-    PreparedOperations ops;
-    ops.addOperation(PSERVER_OP_APPLY);
-    parameterClient_->doOperation(ops,
-                                  /* waitForGradient= */ false,
-                                  /* sendBackarameter= */ false);
-    parameterClient_->getParameter(
-        /* recvParameterType= */ PARAMETER_VALUE,
-        /* sendBackParameterType= */ PARAMETER_APPLY);
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-}
-
-void RemoteParameterUpdater::restore() {
-  if (useApplyInPserver_) {
-    parameterClient_->getParameter();
-    copyParametersToDevice(PARAMETER_VALUE);
-  }
-}
-
-ConcurrentRemoteParameterUpdater::ConcurrentRemoteParameterUpdater(
-    OptimizationConfig config,
-    int passCount,
-    std::unique_ptr<ParameterUpdater>&& localUpdater)
-    : RemoteParameterUpdater(config, passCount, std::move(localUpdater)) {
-  sendThread_.reset(new std::thread([this]() { this->send(); }));
-  recvThread_.reset(new std::thread([this]() { this->recv(); }));
-
-  stopping_ = false;
-  oneBatchFinished_ = false;
-  separateSendAndRecv_ = true;
-}
-
-ConcurrentRemoteParameterUpdater::~ConcurrentRemoteParameterUpdater() {
-  stopping_ = true;
-  sendQueue_.enqueue(0);
-  sendThread_->join();
-  recvQueue_.enqueue(0);
-  recvThread_->join();
-}
-
-void ConcurrentRemoteParameterUpdater::finishBatch(real cost) {
-  if (localUpdater_) {
-    localUpdater_->finishBatch(cost);
-
-    if (!needToUpdateRemotely()) {
-      ++numBatches_;
-      return;
-    }
-  }
-
-  sendQueue_.enqueue(kFinishBatchPid);
-
-  finishBatchCond_.wait([this]() { return oneBatchFinished_; });
-  oneBatchFinished_ = false;
-  {
-    REGISTER_TIMER("sync_hostToDeviceStream");
-    for (auto& para : parameters_) {
-      SetDevice device(para->getDeviceId());
-      hl_stream_synchronize(kHostToDeviceStream);
-    }
-  }
-
-  if (localUpdater_) {
-    ++numBatches_;
-  }
-}
-
-// Use para=NULL to signal the end of one batch
-void ConcurrentRemoteParameterUpdater::send(Parameter* para) {
-  const std::string& algorithm = config_.algorithm();
-  ParameterUpdateMode mode;
-  if (algorithm == TrainAlgorithm::AsyncSGD) {
-    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
-  } else if (algorithm == TrainAlgorithm::SGD) {
-    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
-  } else {
-    LOG(FATAL) << "Unknown algorithm: " << algorithm;
-  }
-  ParameterType sendType;
-  if (localUpdater_) {
-    sendType = PARAMETER_DELTA;
-  } else {
-    // In this case, we perform SGD on pserver.
-    sendType = PARAMETER_GRADIENT;
-  }
-  std::vector<ParameterSegments> paraSegment;
-  if (para == NULL) {
-    parameterClient_->sendParameter(
-        mode,
-        sendType,
-        paraSegment,
-        batchSize_,
-        0,              // cost=0
-        true,           // sendBackParameter = true
-        batchStatus_);  // batchStatus_ = BATCH_FINISH
-
-  } else {
-    ParameterSegments paraSegTemp;
-    paraSegment.reserve(1);
-    paraSegTemp.name = para->getName();
-    paraSegTemp.id = para->getID();
-    paraSegment.push_back(paraSegTemp);
-    {
-      SetDevice device(para->getDeviceId());
-      REGISTER_TIMER("copySingleParaFromDevice");
-      copySingleParaFromDevice(para, sendType);
-      hl_stream_synchronize(kDeviceToHostStream);
-    }
-    parameterClient_->sendParameter(mode,
-                                    sendType,
-                                    paraSegment,
-                                    batchSize_,
-                                    0,     // cost=0
-                                    true,  // sendBackParameter = true
-                                    batchStatus_);
-    if (batchStatus_ == BATCH_START) batchStatus_ = BATCH_ON;
-  }
-}
-void ConcurrentRemoteParameterUpdater::recv(Parameter* para) {
-  parameterClient_->recvParameter();
-  if (para != NULL) {
-    REGISTER_TIMER("copySingleParaToDevice");
-    SetDevice device(para->getDeviceId());
-    copySingleParaToDevice(para, PARAMETER_VALUE);
-
-    if (localUpdater_) {
-      para->getBuf(PARAMETER_DELTA)->copyFrom(*para->getBuf(PARAMETER_VALUE));
-    } else {
-      // if cpu, parameter should not changes until recvParameter().
-      // if gpu, zero mem when send finish
-      if (!FLAGS_use_gpu) {
-        para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-      }
-    }
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::recv() {
-  if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id);
-  StatPtr stat = getStat("recv");
-  FOR_TIMING(Timer timer);
-  while (true) {
-    int pid;
-    {
-      REGISTER_TIMER("recv_dequeue");
-      pid = recvQueue_.dequeue();
-    }
-    if (pid == kFinishBatchPid) {
-      Parameter* para = NULL;
-      FOR_TIMING(timer.start());
-      recv(para);
-      FOR_TIMING(timer.stop());
-      FOR_TIMING(stat->addSample(timer.get()));
-      FOR_TIMING(timer.reset());
-      finishBatchCond_.notify_all([this] { oneBatchFinished_ = true; });
-    } else {
-      if (stopping_) break;
-      Parameter* para = parameters_[pid].get();
-      FOR_TIMING(timer.start());
-      recv(para);
-      FOR_TIMING(timer.stop());
-      oneBatchFinished_ = false;
-    }
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::send() {
-  if (FLAGS_use_gpu) hl_set_device(FLAGS_gpu_id);
-  StatPtr stat = getStat("send");
-  FOR_TIMING(Timer timer);
-  while (true) {
-    int pid;
-    {
-      REGISTER_TIMER("send_dequeue");
-      pid = sendQueue_.dequeue();
-    }
-    if (pid == kFinishBatchPid) {
-      batchStatus_ = BATCH_FINISH;
-      if (!localUpdater_) {
-        // if cpu, parameter should not changes until recvParameter().
-        // if gpu, zeroMem() at the end of batch so that it won't
-        // interfere with computation.
-        if (FLAGS_use_gpu) {
-          REGISTER_TIMER("para_zeroMem");
-          for (auto& para : parameters_) {
-            SetDevice device(para->getDeviceId());
-            para->getBuf(PARAMETER_GRADIENT)->zeroMem();
-          }
-        }
-      }
-      Parameter* para = NULL;
-      FOR_TIMING(timer.start());
-      send(para);
-      FOR_TIMING(timer.stop());
-      FOR_TIMING(stat->addSample(timer.get()));
-      FOR_TIMING(timer.reset());
-      recvQueue_.enqueue(pid);
-    } else {
-      if (stopping_) break;
-      Parameter* para = parameters_[pid].get();
-      if (localUpdater_) {
-        // DELTA = NEW_VALUE - OLD_VALUE/*store in DELTA*/
-        para->getBuf(PARAMETER_DELTA)
-            ->add(*para->getBuf(PARAMETER_VALUE), -1.0f, 1.0f);
-      }
-      FOR_TIMING(timer.start());
-      send(para);
-      FOR_TIMING(timer.stop());
-      recvQueue_.enqueue(nonStaticParaIDMap_[para->getID()]);
-    }
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::updateImpl(Parameter* para) {
-  REGISTER_TIMER("update");
-  if (localUpdater_) {
-    localUpdater_->update(para);
-    if (!needToUpdateRemotely()) {
-      return;
-    }
-  }
-  sendQueue_.enqueue(nonStaticParaIDMap_[para->getID()]);
-}
-
-void ConcurrentRemoteParameterUpdater::copySingleParaToDevice(
-    Parameter* para, ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int i = nonStaticParaIDMap_[para->getID()];
-  para->getBuf(parameterType)
-      ->copyFrom(*cpuParameters_[i]->getBuf(parameterType),
-                 kHostToDeviceStream);
-  if (parameterType == PARAMETER_VALUE) {
-    para->setValueUpdated();
-  }
-}
-
-void ConcurrentRemoteParameterUpdater::copySingleParaFromDevice(
-    Parameter* para, ParameterType parameterType) {
-  if (!FLAGS_use_gpu) {
-    return;
-  }
-  int i = nonStaticParaIDMap_[para->getID()];
-  cpuParameters_[i]
-      ->getBuf(parameterType)
-      ->copyFrom(*para->getBuf(parameterType), kDeviceToHostStream);
-}
-
-SparseRemoteParameterUpdater::SparseRemoteParameterUpdater(
-    const OptimizationConfig& config, int expectedPassCount, bool testing)
-    : config_(config),
-      passCount_(0),
-      expectedPassCount_(expectedPassCount),
-      testing_(testing),
-      useApplyInPserver_(false) {}
-
-void SparseRemoteParameterUpdater::init(
-    const std::vector<ParameterPtr>& parameters) {
-  ParameterUpdater::init(parameters);
-
-  parameterClient_.reset(new ParameterClient2(
-      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse));
-  parameterClient_->init(parameters_);
-  parameterClient_->setTrainerId(FLAGS_trainer_id);
-
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->setConfig(
-        config_, FLAGS_save_dir, true /*is_sparse_server*/);
-    if (parameters[0]->isFullSize()) {
-      parameterClient_->setParameter();
-    } else {  // init in pserver
-      parameterClient_->setParameterZero();
-    }
-  }
-  if (FLAGS_trainer_id == 0 && !testing_ &&
-      config_.algorithm() == TrainAlgorithm::SGD) {
-    startController();
-    useApplyInPserver_ = useApplyInPserver(config_);
-  }
-}
-
-void SparseRemoteParameterUpdater::startController() {
-  controllerThread_.reset(new std::thread([this]() { this->controller(); }));
-}
-
-void SparseRemoteParameterUpdater::controller() {
-  ParameterClient2 client(
-      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse);
-  client.init(parameters_);
-
-  while (true) {
-    /*start pass*/ {
-      client.waitPassStart();
-
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_START_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ false,
-                         /* sendBackarameter= */ false,
-                         /* releasePass= */ false);
-    }
-
-    while (true) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_SGD);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ false);
-      if (client.isPassFinish()) {
-        break;
-      }
-    }
-
-    /*finish pass*/ {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_FINISH_PASS);
-      client.doOperation(ops,
-                         /* waitForGradient= */ true,
-                         /* sendBackarameter= */ true,
-                         /* releasePass= */ true);
-    }
-
-    passCount_++;
-    if (passCount_ == expectedPassCount_) {
-      break;
-    }
-  }
-}
-
-PassType SparseRemoteParameterUpdater::startBatch(int64_t batchSize) {
-  batchSize_ = batchSize;
-  return PASS_TRAIN;
-}
-
-void SparseRemoteParameterUpdater::finishBatch(real cost) {
-  const std::string& algorithm = config_.algorithm();
-  ParameterUpdateMode mode;
-  if (algorithm == TrainAlgorithm::AsyncSGD) {
-    mode = PSERVER_UPDATE_MODE_ASYNC_SGD;
-  } else if (algorithm == TrainAlgorithm::SGD) {
-    mode = PSERVER_UPDATE_MODE_ADD_GRADIENT;
-  } else {
-    LOG(FATAL) << "Unknown algorithm: " << algorithm;
-  }
-
-  ParameterType sendType = PARAMETER_GRADIENT;
-
-  REGISTER_TIMER("sendSparseParam");
-  parameterClient_->sendAndReceiveParameter(mode,
-                                            sendType,
-                                            batchSize_,
-                                            0,       // cost = 0
-                                            false);  // sendBackParameter
-
-  // grad zero move to sgd grad machine, before merge grad sparse remote
-}
-
-void SparseRemoteParameterUpdater::startPass() {
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassStart();
-  } else {
-    if (FLAGS_trainer_id == 0) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_START_PASS);
-      parameterClient_->doOperation(ops,
-                                    /* waitForGradient= */ false,
-                                    /* sendBackarameter= */ false);
-    }
-    parameterClient_->asyncStartPass();
-  }
-}
-
-bool SparseRemoteParameterUpdater::finishPass() {
-  if (config_.algorithm() == TrainAlgorithm::SGD) {
-    parameterClient_->waitPassFinish();
-  } else {
-    if (FLAGS_trainer_id == 0) {
-      PreparedOperations ops;
-      ops.addOperation(PSERVER_OP_FINISH_PASS);
-      parameterClient_->doOperation(ops,
-                                    /* waitForGradient= */ false,
-                                    /* sendBackarameter= */ false);
-    }
-    parameterClient_->asyncFinishPass();
-  }
-
-  return true;
-}
-
-// Trainer will call getParametersRemote at batch start or before save,
-// so we do not get values in apply() and restore().
-void SparseRemoteParameterUpdater::apply() {
-  if (useApplyInPserver_) {
-    PreparedOperations ops;
-    ops.addOperation(PSERVER_OP_APPLY);
-    parameterClient_->doOperation(ops,
-                                  /* waitForGradient= */ false,
-                                  /* sendBackarameter= */ false);
-  }
-}
-
-void SparseRemoteParameterUpdater::restore() {}
-
-void SparseRemoteParameterUpdater::getParametersRemote(bool fullSize,
-                                                       bool apply) {
-  ParameterType sendBackParameterType =
-      (useApplyInPserver_ && apply) ? PARAMETER_APPLY : PARAMETER_VALUE;
-  std::function<void()> getParams;
-  std::function<void(Parameter&, real)> applyL1;
-  if (fullSize) {
-    getParams = [&] {
-      parameterClient_->getParameter(
-          /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
-    };
-    applyL1 = [](Parameter& para, real decayRate) {
-      para.getBuf(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
-    };
-  } else {
-    getParams = [&] {
-      parameterClient_->getParameterSparse(
-          /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
-    };
-    applyL1 = [](Parameter& para, real decayRate) {
-      para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
-    };
-  }
-  {
-    REGISTER_TIMER("getParamDenseAndSparse");
-    getParams();
-    if (config_.shrink_parameter_value() > 0) {
-      for (auto& para : parameters_) {
-        if (para->getConfig().decay_rate_l1() > 0) {
-          applyL1(*para, config_.shrink_parameter_value());
-        }
-      }
-    }
-  }
-}
-
-void SparseRemoteParameterUpdater::randParametersRemote() {
-  CHECK_EQ(FLAGS_trainer_id, 0);
-
-  PreparedOperations ops;
-  ops.addOperation(PSERVER_OP_RANDOMIZE);
-  parameterClient_->doOperation(ops,
-                                /* waitForGradient= */ false,
-                                /* sendBackarameter= */ false);
-}
-
-void SparseRemoteParameterUpdater::loadParametersRemote(
-    const std::string& dirName) {
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->loadValueVector(dirName);
-  }
-
-  if (testing_) {
-    // we do not use synchronize() here,
-    // because test mode may run only one tester
-    if (FLAGS_trainer_id == 0) {
-      parameterClient_->setStatus(PSERVER_STATUS_PARAMETER_READY);
-    } else {
-      parameterClient_->waitForStatus(PSERVER_STATUS_PARAMETER_READY);
-    }
-  }
-}
-
-void SparseRemoteParameterUpdater::saveParametersRemote(
-    const std::string& dirName) {
-  if (FLAGS_trainer_id == 0) {
-    parameterClient_->saveValueVector(dirName);
-  }
-}
-
-void SparseRemoteParameterUpdaterComposite::init(
-    const std::vector<ParameterPtr>& parameters) {
-  parameters_ = parameters;
-
-  std::vector<ParameterPtr> parametersArray[NUMBER_UPDATERS];
-
-  for (auto& para : parameters_) {
-    if (para->isSparseRemoteUpdate()) {
-      parametersArray[UPDATER_SPARSE_REMOTE].push_back(para);
-    } else {
-      parametersArray[UPDATER_NORMAL].push_back(para);
-    }
-  }
-  CHECK(!parametersArray[UPDATER_SPARSE_REMOTE].empty());
-  CHECK(!parametersArray[UPDATER_NORMAL].empty());
-
-  syncThreadPool_->execPlusOwner([&](int tid, size_t numThreads) {
-    updaters_[tid]->init(parametersArray[tid]);
-  });
-
-  parameterTypes_ = updaters_[UPDATER_NORMAL]->getParameterTypes();
-}
-
-std::vector<std::function<ParameterUpdater*(
-    const std::string&, const OptimizationConfig&, bool, size_t)>>
-    ParameterUpdaterCreators::constructors_;
-
-}  // namespace paddle
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
deleted file mode 100644
index 5e82c944751629632ea8d16992bd8f4178a2fbd5..0000000000000000000000000000000000000000
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ /dev/null
@@ -1,416 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <thread>
-#include "ParameterUpdater.h"
-#include "paddle/pserver/ParameterClient2.h"
-#include "paddle/utils/Queue.h"
-#include "paddle/utils/Util.h"
-
-namespace paddle {
-
-// TODO(yanfei):
-// I think that the biggest feature of rdma is packet lossless control
-// feature instead of high bandwiths, zero copy and gpu-direct rdma in
-// theroy.
-// But zero-copy and gpu-direct rdma features can help to reduce latency
-// caused by os system.
-// So, for some specified cluster, such as high density gpu cluster,
-// gpu-direct and zero copy could help to improve cluster communication
-// performance.
-//
-
-/**
- * Normal remote parameter updater for dense parameters.
- *
- * It first packs all parameters for all pservers using ParameterClient
- * module, then wait for merged parameters data from all pservers.
- * The synchronization pattern specified by sync-sgd or async-sgd is
- * achieved by all pservers with the help of the controller within this
- * remote parameter updater.
- * This module indeedly bridges the gradient machines and parameter servers.
- * It helps to transfer the parameters from acceleration device to cpu end
- * for network. It contains additional parameters copy buffers for
- * acceleration devices at cpu end, such as gpu, otherwise it will
- * directly use original parameters data to update pservers.
- *
- * This remote parameter updater does not use pipeline mechanism to hide
- * copy latency from gpu to cpu buffer. In addition the overlapped between
- * backward and communication is not supported.
- */
-class RemoteParameterUpdater : public ParameterUpdater {
-public:
-  RemoteParameterUpdater(
-      const OptimizationConfig& config,
-      int expectedPassCount,
-      std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
-  ~RemoteParameterUpdater() {
-    if (controllerThread_) {
-      controllerThread_->join();
-    }
-  }
-
-  /**
-   * initialize the internal parameter client and itself.
-   */
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-  /**
-   * @brief start batch
-   *
-   * @note  one batch training exhibits stateful feature to help
-   *        to do performance tuning, sgd optimization if necessary.
-   */
-  virtual PassType startBatch(int64_t batchSize) {
-    if (localUpdater_) {
-      localUpdater_->startBatch(batchSize);
-    }
-    batchSize_ = batchSize;
-    batchStatus_ = BATCH_START;
-    return PASS_TRAIN;
-  }
-
-  /**
-   * send parameters to pservers and get returned parameters
-   * from all pservers if necessary. it will implictly
-   * cooperate with controller thread for sync-sgd.
-   */
-  virtual void finishBatch(real cost);
-  virtual void startPass();
-  virtual bool finishPass();
-
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {
-    parameterClient_->setForwardbackwardTime(delta);
-  }
-#endif
-
-  virtual void apply();
-  virtual void restore();
-
-protected:
-  /**
-   * control all pservers with all trainers for sync-sgd
-   */
-  virtual void controller();
-
-  /**
-   * work need to do after finishBatch
-   */
-  virtual void updateImpl(Parameter* para);
-
-  void startController();
-
-  /**
-   * @brief copy parameters from cpu host to device, such as gpu.
-   *
-   * @note  return if all data are transfered.
-   */
-  void copyParametersToDevice(ParameterType parameterType);
-
-  /**
-   * @brief copy parameters from device to cpu host
-   *
-   * @note  return if all data are transfered
-   */
-  void copyParametersFromDevice(ParameterType parameterType);
-
-protected:
-  /// Optimization config used to guide initialization and finishBatch
-  OptimizationConfig config_;
-  /// internal parameter client object for exchanging data with pserver
-  std::unique_ptr<ParameterClient2> parameterClient_;
-  /// internal shadow buffer at cpu host end, use original parameters_
-  /// if no acceleration devices are used.
-  std::vector<ParameterPtr> cpuParameters_;
-  /// local updater for aggregating multi-batches local delta
-  std::unique_ptr<ParameterUpdater> localUpdater_;
-  /// the size of mini-batch
-  int64_t batchSize_;
-  /// batches passed
-  int64_t numBatches_;
-  /// for stateful control
-  BatchStatus batchStatus_;
-  /// controller thread for sync-sgd
-  std::unique_ptr<std::thread> controllerThread_;
-  /// passed already finished
-  int64_t passCount_;
-  /// expected passes to finished
-  int64_t expectedPassCount_;
-  /// use normal synchronization communication if True
-  bool separateSendAndRecv_;
-  /// true if it's first pass
-  bool isFirstPass_;
-  bool useApplyInPserver_;
-
-  static const std::string kAverage;
-  static const std::string kElasticAverage;
-};
-
-// TODO(yanfei):
-// do parameters level synchronization Optimization at pserver end with
-// ConcurrentRemoteParameterUpdater to get more parallelization, at last
-// to really hide pserver latency in backward computation.
-//
-/**
- * This updater add additional optimization for overlapping synchronization
- * from pservers with backward computation.
- *
- * Parameter can be sent to pservers when related backward stage is finished.
- * This concurrent udpater does data copy from acceleration device to host
- * memory aynchronously. In addition internal parameter client reads data in
- * host memory and send them to all pservers in next stage. So this class
- * help to pipeline device-to-host copy and host-to-network to hide network
- * latency in backward stage.
- * It contains separate send and recv thread for pipeline usage.
- */
-class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater {
-public:
-  ConcurrentRemoteParameterUpdater(
-      OptimizationConfig config,
-      int expectedPassCount,
-      std::unique_ptr<ParameterUpdater>&& localUpdater);
-  ~ConcurrentRemoteParameterUpdater();
-
-  /**
-   * @brief send paraemeters to all pservers
-   *
-   * @note  it just signal the end signal to internal parameter client
-   *        to finished the aynchronous send action. In addition it also
-   *        do synchronization for all asynchronous host-to-device copy.
-   */
-  virtual void finishBatch(real cost);
-
-protected:
-  virtual void updateImpl(Parameter* para);
-  /// internal thread called in send thread
-  void send(Parameter* para);  // para == NULL indicate end of a minibatch
-  /// internal function called in recv thread
-  void recv(Parameter* para);
-  /**
-   * @brief send thread for relaying data from gradient to parameter client
-   *
-   * @note  just pipe data to internal parameter client for pipeline
-   */
-  void send();
-  /**
-   * @brief recv thread for relaying data from internal parameter client to
-   *        host memory
-   *
-   * @note  it contains the asynchronous data copy form host to device
-   */
-  void recv();
-  /// copy specified parameter from host to device
-  void copySingleParaToDevice(Parameter* para, ParameterType parameterType);
-  /// copy specified parameter from device to host
-  void copySingleParaFromDevice(Parameter* para, ParameterType parameterType);
-  bool needToUpdateRemotely() {
-    return (numBatches_ + 1) % config_.num_batches_per_send_parameter() == 0;
-  }
-
-private:
-  /// send thread used for overlapping
-  std::unique_ptr<std::thread> sendThread_;
-  /// recv thread used for overlapping
-  std::unique_ptr<std::thread> recvThread_;
-  /// buffer queue for overlapping
-  Queue<int> sendQueue_;
-  /// buffer queue for overlapping
-  Queue<int> recvQueue_;
-  /// flags indicating to stop
-  bool stopping_;
-  /// conditional variable for threads synchronization between the
-  /// thread calling finishBatch and internal recv thread
-  LockedCondition finishBatchCond_;
-  bool oneBatchFinished_;
-};
-
-// TODO(yanfei):
-// merge sparse updater with dense updater, and could help to reduce
-// the synchronization between sparse and dense udpater. it could also
-// reduce the threads for managing all connections.
-/**
- * This class is specified for updating sparse parameters.
- *
- * It allows part of parameter to be exchanged with all pservers.
- * If sparse input assigned, part gradients of first hidden layer
- * could remained zero which can not need to be exchanged within
- * all pservers. This is the key optimization point for this updater
- *
- * For updating sparse parameters, all latest parameters are stored
- * in pservers instead of keeping full copy at train end, so need to
- * prefetch parameters weight value which can be changed in next-batch
- * before doing next forwardbackward. Also, with above fact that the
- * parameters can be stored in pserver instead of trainer, we can
- * fetch specified parmeters if necessary, and can support huge
- * parameters which is larger enough than  the RAM size in single
- * node.
- *
- * Internally, this updater will direct internal parameter client
- * to encapsulate sparse specified message for all pservers.
- */
-class SparseRemoteParameterUpdater : public ParameterUpdater {
-public:
-  SparseRemoteParameterUpdater(const OptimizationConfig& config,
-                               int expectedPassCount,
-                               bool testing);
-  ~SparseRemoteParameterUpdater() {
-    if (controllerThread_) {
-      controllerThread_->join();
-    }
-  }
-
-  /// initialization
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-
-  /// stateful batch control
-  virtual PassType startBatch(int64_t batchSize);
-  /// send all sparse related parameters to all pservers
-  virtual void finishBatch(real cost);
-  virtual void startPass();
-  virtual bool finishPass();
-
-  virtual void apply();
-  virtual void restore();
-
-  /// load parameters from pservers
-  virtual void loadParametersRemote(const std::string& dirName);
-  /// save parameters to pservers
-  virtual void saveParametersRemote(const std::string& dirName);
-  /**
-   * @brief get latest sparse parameters value from all pservers
-   *
-   * @note  call it before next mini-batch
-   */
-  virtual void getParametersRemote(bool fullSize, bool apply);
-  virtual void randParametersRemote();
-#ifndef PADDLE_DISABLE_TIMER
-  virtual void setForwardbackwardTime(uint64_t delta) {
-    parameterClient_->setForwardbackwardTime(delta);
-  }
-#endif
-
-protected:
-  /// update implimentation, not implemented
-  virtual void updateImpl(Parameter* para) {}
-
-  /// internal controller routine for controller thread
-  virtual void controller();
-
-  /// start controller thread
-  void startController();
-
-protected:
-  /// optimization config
-  OptimizationConfig config_;
-  /// internal parameter client
-  std::unique_ptr<ParameterClient2> parameterClient_;
-  int64_t batchSize_;
-  std::unique_ptr<std::thread> controllerThread_;
-  int64_t passCount_;
-  int64_t expectedPassCount_;
-  bool testing_;
-  bool useApplyInPserver_;
-};
-
-/**
- * Class for supporting normal updater and sparse updater
- *
- * Not all parts of one model are sparse, so it exists dense updater
- * for normal layers while sparse updater is for sparse layers.
- *
- * it directly call internal dense and sparse udpater individually.
- */
-class SparseRemoteParameterUpdaterComposite : public ParameterUpdaterComposite {
-public:
-  enum {
-    UPDATER_SPARSE_REMOTE = 0,  // execute in sync thread pool(tid:0)
-    UPDATER_NORMAL = 1,         // execute in Owner thread(tid:1)
-    NUMBER_UPDATERS = 2,
-  };
-  /**
-   * @brief create one dense updater and one sparse updater
-   *
-   * @note  use syncThreadPool to synchronize these two updaters
-   */
-  SparseRemoteParameterUpdaterComposite(
-      const OptimizationConfig& config,
-      int expectedPassCount,
-      bool testing,
-      std::unique_ptr<ParameterUpdater>&& normalUpdater) {
-    updaters_.resize(NUMBER_UPDATERS);
-    updaters_[UPDATER_SPARSE_REMOTE].reset(
-        new SparseRemoteParameterUpdater(config, expectedPassCount, testing));
-    updaters_[UPDATER_NORMAL] = std::move(normalUpdater);
-
-    syncThreadPool_.reset(new SyncThreadPool(NUMBER_UPDATERS - 1));
-  }
-
-  /// initialization of dense and sparse updaters
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-};
-
-class ParameterUpdaterCreators {
-public:
-  /**
-   * @brief add a creator to create custom ParameterUpdater while training.
-   *        The creator is a function with type (alogrithm, optConfig, isLocal,
-   *        numPasses) -> ParameterUpdater*. Trainer will use this
-   *        ParameterUpdater if creator can create a no nullptr
-   *        ParameterUpdater. Return nullptr will use trainer's default
-   *        updaters.
-   *
-   * @param creator method which can create ParameterUpdater.
-   */
-  static void addCreator(
-      const std::function<ParameterUpdater*(
-          const std::string&,         // algo
-          const OptimizationConfig&,  // optConfig
-          bool,                       // isLocal
-          size_t                      // numPasses
-          )>& creator) {  // NOLINT  explicit move closing ) in this line
-                          // for readability
-    constructors_.push_back(creator);
-  }
-
-  /**
-   * @brief Try to create an updater by given algo, optConfig, isLocal,
-   *        numPasses. Return nullptr if cannot create anyone.
-   * @param algo algorithm string.
-   * @param optConfig optimization config.
-   * @param isLocal is in local mode or not.
-   * @param numPasses total passes that trainer will train.
-   * @return nullptr if fail, not nullptr if we can create an updater.
-   */
-  static ParameterUpdater* tryCreateUpdater(const std::string& algo,
-                                            const OptimizationConfig& optConfig,
-                                            bool isLocal,
-                                            size_t numPasses) {
-    for (auto& c : constructors_) {
-      if (auto updater = c(algo, optConfig, isLocal, numPasses)) {
-        return updater;
-      }
-    }
-    return nullptr;
-  }
-
-private:
-  static std::vector<std::function<ParameterUpdater*(
-      const std::string&, const OptimizationConfig&, bool, size_t)>>
-      constructors_;
-};
-
-}  // namespace paddle
diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp
deleted file mode 100644
index 16e676d60248dfe6d443c50fbf34970e63c1f412..0000000000000000000000000000000000000000
--- a/paddle/trainer/Tester.cpp
+++ /dev/null
@@ -1,380 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Tester.h"
-
-#include <fenv.h>
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-#include "TesterConfig.h"
-#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/gserver/layers/ValidationLayer.h"
-
-namespace paddle {
-
-Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
-               std::unique_ptr<TesterConfig>&& intconfig,
-               const GradientMachinePtr& gradientMachine,
-               const std::shared_ptr<ParameterUpdater>& parameterUpdater,
-               std::shared_ptr<DataProvider> testDataProvider)
-    : config_(config),
-      intconfig_(std::move(intconfig)),
-      gradientMachine_(gradientMachine),
-      parameterUpdater_(parameterUpdater),
-      testDataProvider_(testDataProvider) {
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    LOG(FATAL) << "It's prohibited to set sparse_remote_update "
-               << "when doing train and test jobs in the same "
-               << "process. You could run paddle --job=test in "
-               << "a separate process.";
-  }
-  testEvaluator_.reset(gradientMachine_->makeEvaluator());
-  if (intconfig_->distributeTest) {
-    testParameterClient_.reset(new ParameterClient2(true));
-  }
-
-  if (testParameterClient_) {
-    testParameterClient_->init(gradientMachine_->getParameters());
-  }
-
-  std::unique_ptr<ParameterUtilConfig> paramConfig(
-      new ParameterUtilConfig(intconfig_->saveOnlyOne,
-                              intconfig_->savingPeriod,
-                              intconfig_->loadsaveParametersInPserver,
-                              intconfig_->config));
-
-  paramUtil_.reset(new ParameterUtil(
-      config_, std::move(paramConfig), gradientMachine_, parameterUpdater_));
-}
-
-void Tester::startTestPeriod() {
-  if (testDataProvider_) {
-    testDataProvider_->reset();
-  }
-  testEvaluator_->start();
-  testContext_.cost = 0;
-  testContext_.numSamples = 0;
-
-  parameterUpdater_->apply();
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->getState(*intconfig_->trainState);
-    gradientMachine_->setState(*intconfig_->testState);
-  }
-}
-
-void Tester::testOneDataBatch(const DataBatch& dataBatch,
-                              std::vector<Argument>* outArgs) {
-  testContext_.cost +=
-      forwardOneBatch(dataBatch, testEvaluator_.get(), outArgs);
-  testContext_.numSamples += dataBatch.getSize();
-}
-
-void Tester::testOnePeriod() {
-  DataBatch dataBatch;
-  int64_t batchSize = config_->getOptConfig().batch_size();
-  std::vector<Argument> outArgs;
-  startTestPeriod();
-  while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) {
-    testOneDataBatch(dataBatch, &outArgs);
-  }
-  finishTestPeriod();
-}
-
-void Tester::finishTestPeriod() {
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->resetState();
-  }
-  testEvaluator_->finish();
-  CHECK_GT(testContext_.numSamples, 0)
-      << "There is no samples in your test batch. Possibly "
-         "wrong implementation of DataProvidor.reset()";
-  LOG(INFO) << " Test samples=" << testContext_.numSamples
-            << " cost=" << testContext_.cost / testContext_.numSamples
-            << " Eval: " << *testEvaluator_;
-  parameterUpdater_->restore();
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->getState(*intconfig_->testState);
-    gradientMachine_->setState(*intconfig_->trainState);
-  }
-}
-
-int64_t Tester::testOneBatchById(int64_t batchId) {
-  DataBatch dataBatch;
-  int32_t batchSize = config_->getOptConfig().batch_size();
-
-  testDataProvider_->getNextBatch(batchSize, &dataBatch);
-
-  int64_t actualBatchSize = dataBatch.getSize();
-  if (actualBatchSize == 0) {
-    return 0;
-  }
-
-  std::vector<Argument> outArgs;
-
-  stats_ += std::pair<int64_t, real>{
-      actualBatchSize,
-      forwardOneBatch(dataBatch, testEvaluator_.get(), &outArgs)};
-
-  if (((batchId + 1) % intconfig_->logPeriod) == 0) {
-    LOG(INFO) << " Batch=" << batchId + 1 << " " << stats_.getStats(false);
-  }
-
-  return actualBatchSize;
-}
-
-real Tester::forwardOneBatch(const DataBatch& dataBatch,
-                             Evaluator* evaluator,
-                             std::vector<Argument>* pOutArgs) {
-  auto& outArgs = *pOutArgs;
-  const std::vector<Argument>& inArgs = dataBatch.getStreams();
-  if (intconfig_->loadsaveParametersInPserver) {
-    REGISTER_TIMER("prefetch");
-    gradientMachine_->prefetch(inArgs);
-    parameterUpdater_->getParametersRemote(false /*full parameter*/,
-                                           true /*after apply*/);
-  }
-
-  gradientMachine_->forward(inArgs, &outArgs, PASS_TEST);
-
-  // write features if set this flag and outArgs is not empty
-  std::string featFile = intconfig_->featFile;
-  if (!featFile.empty() && outArgs.empty()) {
-    size_t numOutputs = outArgs.size();
-    std::vector<MatrixPtr> featMatrices;
-    featMatrices.resize(numOutputs);
-    for (size_t i = 0; i < numOutputs; ++i) {
-      featMatrices[i] = Matrix::create(outArgs[i].value->getHeight(),
-                                       outArgs[i].value->getWidth(),
-                                       false,
-                                       false);  // CPU data buffer
-      featMatrices[i]->copyFrom(*(outArgs[i].value), HPPL_STREAM_DEFAULT);
-    }
-    hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-    FILE* fp = fopen(featFile.c_str(), "ab+");
-    CHECK(!ferror(fp)) << "Fail to open " << featFile;
-
-    size_t sampleNum = featMatrices[0]->getHeight();
-    for (size_t i = 0; i < sampleNum; ++i) {
-      for (size_t j = 0; j < numOutputs; ++j) {
-        size_t dim = featMatrices[j]->getWidth();
-        fwrite(featMatrices[j]->getData() + i * dim, sizeof(real), dim, fp);
-      }
-    }
-    fclose(fp);
-  }
-  if (evaluator) {
-    gradientMachine_->eval(evaluator);
-  }
-
-  // Save the output layers if predict_output_dir is not empty
-  std::string predictOutputDir = intconfig_->predictOutputDir;
-  if (!predictOutputDir.empty() && !outArgs.empty()) {
-    CHECK(intconfig_->testing) << "Only valid in test mode";
-    if (!os_.is_open()) {
-      // TODO(yuyang18): Refactor these lines.
-      constexpr int kBufLen = 100;
-      char buf[kBufLen];
-      snprintf(buf, kBufLen, "rank-%05d", intconfig_->trainerId);
-      mkDir(predictOutputDir.c_str());
-      std::string filename = path::join(predictOutputDir, buf);
-      os_.open(filename, std::ofstream::trunc);
-      CHECK(os_.is_open()) << "Failed to open file " << filename;
-    }
-    printOutput(outArgs, os_);
-    return 0.0;  // In this case, there is no meaning to calculate cost
-  }
-
-  return Argument::sum(outArgs);
-}
-
-void Tester::testOnePassBatch(int passId) {
-  stats_.reset();
-  const std::vector<Argument> inArgs;
-  gradientMachine_->forward(inArgs, nullptr, PASS_TEST);
-  int64_t num;
-  real cost;
-  gradientMachine_->getStats(cost, num);
-  stats_ += std::pair<int64_t, real>{num, cost};
-  gradientMachine_->onPassEnd();
-
-  LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false);
-}
-
-void Tester::testOnePass(int passId) {
-  stats_.reset();
-  int64_t batchId = 0;
-  int num = 0;
-  if (intconfig_->prevBatchState) {
-    gradientMachine_->resetState();
-  }
-
-  testEvaluator_->start();
-
-  do {
-    num = testOneBatchById(batchId);
-    ++batchId;
-  } while (num > 0);
-
-  gradientMachine_->onPassEnd();
-  testEvaluator_->finish();
-
-  LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false)
-            << " Eval: " << *testEvaluator_;
-
-  if (intconfig_->distributeTest) {
-    testEvaluator_->distributeEval(testParameterClient_.get());
-    if (0 == intconfig_->trainerId) {
-      LOG(INFO) << "distribute eval: " << *testEvaluator_;
-    }
-  }
-}
-
-void Tester::test() {
-  CHECK(testDataProvider_) << "TestData is not specified";
-  testDataProvider_->setSkipShuffle();
-  testDataProvider_->reset();
-  gradientMachine_->start();
-
-  // For evaluation
-  std::vector<std::string> modelList;
-  std::string modelListFromConfig = intconfig_->modelList;
-  std::string initModelPath = intconfig_->initModelPath;
-  if (!modelListFromConfig.empty()) {
-    loadFileList(modelListFromConfig, modelList);
-    intconfig_->testPass = 0;
-    intconfig_->numPasses = modelList.size();
-    intconfig_->savingPeriod = 1;
-    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
-  } else if (!initModelPath.empty()) {
-    modelList.push_back(initModelPath);
-    intconfig_->testPass = 0;
-    intconfig_->numPasses = 1;
-    intconfig_->savingPeriod = 1;
-    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
-  }
-
-  for (int i = intconfig_->testPass; i < intconfig_->numPasses; ++i) {
-    int passId = i;
-    if (passId % intconfig_->savingPeriod == 0) {
-      if (intconfig_->testWait) {
-        while (paramUtil_->loadParameters(
-                   passId, true /*local*/, true /*remote*/) == false) {
-          LOG(INFO) << "Waiting for parameters of pass " << passId;
-          sleep(60);  // sleep 60s
-        }
-      } else {
-        if (modelList.size() == 0) {
-          CHECK_EQ(paramUtil_->loadParameters(
-                       passId, true /*local*/, true /*remote*/),
-                   true);
-        } else {
-          paramUtil_->loadParametersWithPath(
-              modelList[i], true /*local*/, true /*remote*/);
-        }
-      }
-      if (IGradientMachineMode::trainWholeDataInOneBatch(intconfig_->mode)) {
-        testOnePassBatch(passId);
-      } else {
-        testOnePass(passId);
-      }
-      if (passId + intconfig_->savingPeriod < intconfig_->numPasses) {
-        // if there is at least 1 more pass to test, then call reset,
-        // otherwise not.
-        testDataProvider_->reset();
-      }
-    }
-  }
-
-  gradientMachine_->finish();
-}
-
-void Tester::printOutput(const std::vector<Argument>& outArgs,
-                         std::ostream& os) {
-  size_t numOutputs = outArgs.size();
-  size_t numIns = outArgs[0].getBatchSize();
-  if (cpuMat_.size() != numOutputs || cpuVec_.size() != numOutputs) {
-    cpuMat_.resize(numOutputs, nullptr);
-    cpuVec_.resize(numOutputs, nullptr);
-  }
-
-  for (size_t i = 0; i < numOutputs; ++i) {
-    if (outArgs[i].value != nullptr) {
-      if (outArgs[i].value->useGpu()) {
-        if (dynamic_cast<GpuMatrix*>(outArgs[i].value.get())) {
-          size_t dim = outArgs[i].value->getWidth();
-          Matrix::resizeOrCreate(cpuMat_[i], numIns, dim, false, false);
-          cpuMat_[i]->copyFrom(*outArgs[i].value);
-        } else if (dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get())) {
-          auto sparseMat =
-              dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get());
-          cpuMat_[i] = Matrix::createSparseMatrix(sparseMat->getHeight(),
-                                                  sparseMat->getWidth(),
-                                                  sparseMat->getElementCnt(),
-                                                  sparseMat->getValueType(),
-                                                  sparseMat->format_,
-                                                  false,  /* trans */
-                                                  false); /* useGpu */
-          hl_stream_t stream = HPPL_STREAM_DEFAULT;
-          cpuMat_[i]->copyFrom(*sparseMat, stream);
-        } else {
-          LOG(WARNING) << "Not supported gpu matrix type";
-        }
-      }
-    } else if (outArgs[i].ids != nullptr) {
-      if (outArgs[i].ids->useGpu()) {
-        IVector::resizeOrCreate(cpuVec_[i], outArgs[i].ids->getSize(), false);
-        cpuVec_[i]->copyFrom(*outArgs[i].ids);
-      }
-    } else if (outArgs[i].strs != nullptr) {
-      continue;
-    } else {
-      LOG(WARNING) << "outArgs[" << i << "] has no data to print";
-    }
-  }
-
-  for (size_t i = 0; i < numIns; ++i) {
-    for (size_t j = 0; j < numOutputs; ++j) {
-      if (outArgs[j].value != nullptr) {
-        if (outArgs[j].value->useGpu()) {
-          cpuMat_[j]->printOneRow(os, i);
-        } else {
-          outArgs[j].value->printOneRow(os, i);
-        }
-      } else if (outArgs[j].ids != nullptr) {
-        if (outArgs[j].ids->useGpu()) {
-          cpuVec_[j]->printOneElement(os, i);
-        } else {
-          outArgs[j].ids->printOneElement(os, i);
-        }
-      } else if (outArgs[j].strs != nullptr) {
-        os << (*outArgs[j].strs)[i] << ";";
-      }
-    }
-    os << std::endl;
-  }
-}
-}  // namespace paddle
diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h
deleted file mode 100644
index e892744db278586f2fd5b3cb527aa7c17752c477..0000000000000000000000000000000000000000
--- a/paddle/trainer/Tester.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/gserver/dataproviders/DataProvider.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-
-#include "TrainerConfig.pb.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParamUtil.h"
-#include "ParameterUpdater.h"
-#include "TesterConfig.h"
-#include "TrainerInternalConfig.h"
-
-namespace paddle {
-
-/**
- * Neural Network test logics code.
- * It is a private class for Trainer.
- */
-class Tester {
-public:
-  /**
-   * Ctor
-   * @param config Trainer Config.
-   * @param intconfig Tester Config.
-   * @param gradientMachine Gradient machine(neuralnetwork) that will be tested.
-   * @param parameterUpdater Parameter Updater. Not for updating parameter, just
-   *                         for getting parameter from parameter-server.
-   * @param testDataProvider Test data provider.
-   */
-  Tester(const std::shared_ptr<TrainerConfigHelper>& config,
-         std::unique_ptr<TesterConfig>&& intconfig,
-         const GradientMachinePtr& gradientMachine,
-         const std::shared_ptr<ParameterUpdater>& parameterUpdater,
-         std::shared_ptr<DataProvider> testDataProvider);
-
-  /**
-   * test one period.
-   *
-   * One period means 2 things.
-   *   if test_period !=0 and not test_all_data_in_one_period, then
-   *      will test test_period * batch_size data.
-   *   else
-   *      will test whole test data.
-   *
-   * It is convenience to test small set of data when test data set is large and
-   * is training at same time.
-   */
-  void testOnePeriod();
-  void startTestPeriod();
-  void finishTestPeriod();
-  void testOneDataBatch(const DataBatch& dataBatch,
-                        std::vector<Argument>* outArgs);
-
-  /**
-   * Test for given data batch.
-   * @param dataBatch Data batch.
-   * @param evaluator Evaluator
-   * @return cost
-   */
-  real forwardOneBatch(const DataBatch& dataBatch,
-                       Evaluator* evaluator,
-                       std::vector<Argument>* outArgs);
-
-  /**
-   * performance the full pass of test given test data provider
-   */
-  void test();
-
-protected:
-  std::shared_ptr<ParameterClient2> testParameterClient_;
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::unique_ptr<TesterConfig> intconfig_;
-  GradientMachinePtr gradientMachine_;
-  std::shared_ptr<ParameterUpdater> parameterUpdater_;
-  std::unique_ptr<Evaluator> testEvaluator_;
-  std::unique_ptr<ParameterUtil> paramUtil_;
-  DataProviderPtr testDataProvider_;
-  TrainerStats stats_;
-
-  // Used for saving the values of output layers
-  std::ofstream os_;
-  std::vector<MatrixPtr> cpuMat_;
-  std::vector<IVectorPtr> cpuVec_;
-  struct {
-    int64_t numSamples;
-    real cost;
-  } testContext_;
-
-private:
-  /**
-   * Test one batch by batchId. It is only used for testOnePass.
-   *
-   * Durning testOnePass, each log_period will print cost statistics.
-   *
-   * @param batchId current batch id (from 0)
-   * @return num of tested samples. Zero if end of pass.
-   */
-  int64_t testOneBatchById(int64_t batchId);
-
-  /**
-   * Test whole pass in one batch.
-   *
-   *
-   * @param passId current pass id (from 0)
-   */
-  void testOnePassBatch(int passId);
-
-  /**
-   * test for one pass in several mini-batches.
-   *
-   * Used for sgd method.
-   *
-   * @param passId current pass id (from 0)
-   */
-  void testOnePass(int passId);
-
-  /**
-   * print the outArgs to a stream
-   *
-   * used for save feature file
-   *
-   * @param [in] outArgs output arguments for network.
-   * @param [in,out] os output stream.
-   */
-  void printOutput(const std::vector<Argument>& outArgs, std::ostream& os);
-};
-
-}  //  namespace paddle
diff --git a/paddle/trainer/TesterConfig.h b/paddle/trainer/TesterConfig.h
deleted file mode 100644
index 68d4c931ff2df8e24acaa9fe6b35bfd613197c72..0000000000000000000000000000000000000000
--- a/paddle/trainer/TesterConfig.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-
-#include "TrainerConfig.pb.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParameterUpdater.h"
-
-namespace paddle {
-
-/**
- * TesterConfig
- * general configs for training
- */
-struct TesterConfig {
-  /**
-   * indicate test period
-   */
-  int testPeriod;
-
-  /**
-   * indicate whether to save previous batch state
-   */
-  bool prevBatchState;
-
-  /**
-   * log period
-   */
-  int logPeriod;
-
-  /**
-   * loadsave parameters in pserver
-   */
-  bool loadsaveParametersInPserver;
-
-  /**
-   * feat file
-   */
-  std::string featFile;
-
-  /**
-   * predict output dir
-   */
-  std::string predictOutputDir;
-
-  /**
-   * trianer id
-   */
-  int trainerId;
-
-  /**
-   * distribute test
-   */
-  bool distributeTest;
-
-  /**
-   * training state
-   */
-  MachineState* trainState;
-
-  /**
-   * test state
-   */
-  MachineState* testState;
-
-  /**
-   * model list
-   */
-  std::string modelList;
-
-  /**
-   * test passes
-   */
-  int testPass;
-
-  /**
-   * num passes
-   */
-  int numPasses;
-
-  /**
-   * saving period
-   */
-  int savingPeriod;
-
-  /**
-   * test wait
-   */
-  int testWait;
-
-  /**
-   * init model path
-   */
-  std::string initModelPath;
-
-  /**
-   * save only one
-   */
-  bool saveOnlyOne;
-
-  /**
-   * testing mode
-   */
-  bool testing;
-
-  /**
-   * mode
-   */
-  int mode;
-
-  /**
-   * config loc
-   */
-  std::string config;
-};
-
-}  //  namespace paddle
diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp
deleted file mode 100644
index 3c85c3aaac68fc29da90c24d1208887a17009d5f..0000000000000000000000000000000000000000
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ /dev/null
@@ -1,309 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ThreadParameterUpdater.h"
-
-#include "paddle/utils/Logging.h"
-
-#include "paddle/math/SparseRowMatrix.h"
-#include "paddle/parameter/ThreadLocalBuffer.h"
-#include "paddle/utils/Thread.h"
-
-DECLARE_int32(trainer_count);
-
-namespace paddle {
-
-SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig)
-    : config_(optConfig), numSamplesProcessed_(0) {
-  // fill types
-  auto types = sgdOptimizerGetTypes(optConfig, false /*inPserver*/);
-  for (auto type : types) {
-    addParameterType(type);
-  }
-}
-
-void SgdThreadUpdater::init(const std::vector<ParameterPtr>& parameters) {
-  ParameterUpdater::init(parameters);
-
-  // calc max parameter id
-  size_t maxId = 0;
-  for (auto& para : parameters_) {
-    maxId = std::max(maxId, para->getID());
-  }
-
-  optimizers_.resize(maxId + 1);
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid].reset(sgdOptimizerCreate(config_,
-                                              para->getConfig(),
-                                              para->isGradSparseUpdate(),
-                                              false /*inPserver*/));
-    size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0;
-    optimizers_[pid]->init(numRows, &para->getConfig());
-    if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) {
-      // For trainer_count=1, the gradient machine is NeuralNetwork, which does
-      // not create parameter buf for PARAMETER_GRADIENT for sparse update in
-      // Parameter::enableType(). But gradient parameter buf is still used
-      // in SgdThreadUpdater. We need to explicitly create it.
-      //
-      // The AverageOptimizer::restore/apply method will use PARAMETER_GRADIENT
-      // as a temp buffer.
-      para->enableBufType(PARAMETER_GRADIENT);
-    }
-  }
-}
-
-void SgdThreadUpdater::startPass() {
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->startPass();
-  }
-}
-
-bool SgdThreadUpdater::finishPass() {
-  catchUpWith();
-
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->finishPass();
-  }
-  return true;
-}
-
-void SgdThreadUpdater::updateImpl(Parameter* para) {
-  if (!para->useGpu()) return;
-  SetDevice setDevice(para->getDeviceId());
-  ParameterOptimizer* optimizer = optimizers_[para->getID()].get();
-  optimizer->update(para->getBufs(), para->getConfig());
-  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
-    callback(para->getBufs(), para->getConfig(), -1LU);
-  }
-
-  para->setValueUpdated();
-  para->clearGradient();
-}
-
-void SgdThreadUpdater::threadTraverse(
-    const ParameterOptimizer::TraverseCallback& callback,
-    int tid,
-    size_t numThreads,
-    Parameter* para) {
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-  if (para->isGradSparseUpdate()) {
-    size_t height = para->getConfig().dims(0);
-    size_t width = para->getConfig().dims(1);
-    for (size_t i = tid; i < height; i += numThreads) {
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        vecs[type]->subVecFrom(*para->getBuf(type), i * width, width);
-      }
-      callback(vecs, para->getConfig(), i);
-    }
-  } else {  // dense
-    // setup sub bufs
-    auto interval = calcSplitArrayInterval(
-        para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-    for (auto type : parameterTypes_) {
-      vecs[type]->subVecFrom(*para->getBuf(type), interval);
-    }
-
-    callback(vecs, para->getConfig(), -1LU);
-  }
-}
-
-void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) {
-  bool hasCpuPara = false;
-  bool hasGpuPara = false;
-  for (auto& para : parameters_) {
-    if (para->useGpu()) {
-      hasGpuPara = true;
-    } else {
-      hasCpuPara = true;
-    }
-  }
-
-  auto cpuTraverse = [&](int tid, size_t numThreads) {
-    for (auto& para : parameters_) {
-      if (auto callback = getTraverseCallback(para.get())) {
-        threadTraverse(callback, tid, numThreads, para.get());
-      }
-    }
-  };
-  auto gpuTraverse = [&](int tid, size_t numThreads) {
-    for (auto& para : parameters_) {
-      if (para->useGpu()) {
-        if (auto callback = getTraverseCallback(para.get())) {
-          SetDevice setDevice(para->getDeviceId());
-          callback(para->getBufs(), para->getConfig(), -1LU);
-        }
-      }
-    }
-  };
-
-  if (hasCpuPara && hasGpuPara) {
-    getGlobalSyncThreadPool()->exec(cpuTraverse, gpuTraverse);
-  } else if (hasCpuPara) {
-    getGlobalSyncThreadPool()->exec(cpuTraverse);
-  } else if (hasGpuPara) {
-    gpuTraverse(0, 0);
-  }
-}
-
-void SgdThreadUpdater::catchUpWith() {
-  traverse([this](Parameter* para) {
-    return optimizers_[para->getID()]->startCatchUpWith();
-  });
-
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->finishCatchUpWith();
-  }
-}
-
-void SgdThreadUpdater::apply() {
-  catchUpWith();
-
-  traverse(
-      [this](Parameter* para) { return optimizers_[para->getID()]->apply(); });
-}
-
-void SgdThreadUpdater::restore() {
-  traverse([this](Parameter* para) {
-    return optimizers_[para->getID()]->restore();
-  });
-}
-
-PassType SgdThreadUpdater::startBatch(int64_t batchSize) {
-  numSamplesProcessed_ += batchSize;
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->startBatch(numSamplesProcessed_);
-  }
-  return PASS_TRAIN;
-}
-
-void SgdThreadUpdater::finishBatch(real cost) {
-  getGlobalSyncThreadPool()->exec([&](int tid, size_t numThreads) {
-    for (auto& para : parameters_) {
-      if (para->isGradSparseUpdate()) {
-        threadUpdateSparse(tid, numThreads, para.get());
-      } else if (!para->useGpu()) {
-        threadUpdateDense(tid, numThreads, para.get());
-      }
-    }
-  });
-
-  for (auto& para : parameters_) {
-    int pid = para->getID();
-    optimizers_[pid]->finishBatch();
-  }
-}
-
-void SgdThreadUpdater::threadUpdateSparse(int tid,
-                                          size_t numThreads,
-                                          Parameter* para) {
-  int pid = para->getID();
-  ParameterOptimizer* optimizer = optimizers_[pid].get();
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-
-  size_t height = para->getConfig().dims(0);
-  size_t width = para->getConfig().dims(1);
-
-  if (dynamic_cast<SparseRowIdsCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get())) {
-    // From MultiGradientMachine
-    SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-        para->getMat(PARAMETER_GRADIENT).get());
-    std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);
-
-    for (auto id : sparseIds) {
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        vecs[type]->subVecFrom(*para->getBuf(type), id * width, width);
-      }
-      optimizer->update(vecs, para->getConfig(), id);
-      vecs[PARAMETER_GRADIENT]->zeroMem();
-    }
-    sparseIds.clear();
-  } else if (dynamic_cast<SparseRowCpuMatrix*>(
-                 para->getMat(PARAMETER_GRADIENT).get())) {
-    // From NeuralNetwork
-    SparseRowCpuMatrix* mainMat = dynamic_cast<SparseRowCpuMatrix*>(
-        para->getMat(PARAMETER_GRADIENT).get());
-
-    std::vector<unsigned int>& localIndices =
-        mainMat->getIndexDictHandle()->localIndices;
-
-    auto interval =
-        calcSplitArrayInterval(localIndices.size(), tid, numThreads);
-    for (size_t i = interval.first; i < interval.second; ++i) {
-      auto id = localIndices[i];
-      real* row = mainMat->getLocalRow(i);
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        if (type == PARAMETER_GRADIENT) {
-          vecs[type]->subVecFrom(row, 0, width);
-        } else {
-          vecs[type]->subVecFrom(*para->getBuf(type), id * width, width);
-        }
-      }
-      optimizer->update(vecs, para->getConfig(), id);
-      vecs[PARAMETER_GRADIENT]->zeroMem();
-    }
-    // For numThreads > 1, MultiGradientMachine is used, which goes
-    // to the above branch.
-    CHECK_EQ(numThreads, 1UL);
-    mainMat->clearIndices();
-  } else {
-    auto& m = *para->getMat(PARAMETER_GRADIENT).get();
-    LOG(FATAL) << "Internal error: " << para->getName() << " "
-               << typeid(m).name();
-  }
-
-  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
-    for (size_t i = tid; i < height; i += numThreads) {
-      // setup sub bufs
-      for (auto type : parameterTypes_) {
-        vecs[type]->subVecFrom(*para->getBuf(type), i * width, width);
-      }
-      callback(vecs, para->getConfig(), i);
-    }
-  }
-}
-
-void SgdThreadUpdater::threadUpdateDense(int tid,
-                                         size_t numThreads,
-                                         Parameter* para) {
-  int pid = para->getID();
-  ParameterOptimizer* optimizer = optimizers_[pid].get();
-  VectorPtr* vecs = parameter::getThreadLocalBuffer();
-
-  auto interval = calcSplitArrayInterval(
-      para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
-
-  // setup sub bufs
-  for (auto type : parameterTypes_) {
-    vecs[type]->subVecFrom(*para->getBuf(type), interval);
-  }
-
-  // update
-  optimizer->update(vecs, para->getConfig());
-  vecs[PARAMETER_GRADIENT]->zeroMem();
-
-  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
-    callback(vecs, para->getConfig(), -1LU);
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h
deleted file mode 100644
index bc08a9e9f0eda1cab7776ba76c67e88add1028a9..0000000000000000000000000000000000000000
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/parameter/AverageOptimizer.h"
-#include "paddle/parameter/FirstOrderOptimizer.h"
-#include "paddle/parameter/OptimizerFunctions.h"
-#include "paddle/parameter/OptimizerWithRegularizer.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/Regularizer.h"
-#include "paddle/utils/Util.h"
-
-#include <memory>
-#include <vector>
-
-namespace paddle {
-
-/**
- * \brief A parameter updater that uses multiple threads to update parameters.
-   This parameter updater handles GPU and CPU updates differently,
-   because at the current moment, the merging on CPU is happening on the
-   main thread, and the its parameter size can be much larger than the one GPU.
-   Thus, for GPU, the parameter updates happens in updateImpl() function, which
-   is called by gradient machines as a callback function supplied to backward()
-   and forwardBackward().
-   For CPU, the parameter updates happens in separate threads maintained by this
-   class.
- */
-class SgdThreadUpdater : public ParameterUpdater {
-public:
-  explicit SgdThreadUpdater(const OptimizationConfig& optConfig);
-  virtual ~SgdThreadUpdater() {}
-
-  // Use the startPass() function of the base optimizer.
-  virtual void startPass();
-
-  // Use the finishPass() function of the base optimizer.
-  virtual bool finishPass();
-
-  virtual void init(const std::vector<ParameterPtr>& parameters);
-  virtual PassType startBatch(int64_t batchSize);
-  // Call finishBatch for each optimizer.
-  virtual void finishBatch(real cost);
-  virtual void catchUpWith();
-  virtual void apply();
-  virtual void restore();
-
-protected:
-  // This is the function that will be eventualy called by the GradientMachine.
-  // used only for GPU update.
-  virtual void updateImpl(Parameter* para);
-  OptimizationConfig config_;
-  int64_t numSamplesProcessed_;
-
-  // One optimizers for each parameter.
-  std::vector<std::unique_ptr<ParameterOptimizer>> optimizers_;
-
-  // The update function for CPU sparse parameters.
-  void threadUpdateSparse(int tid, size_t numThreads, Parameter* para);
-
-  // The update function for CPU dense parameters.
-  void threadUpdateDense(int tid, size_t numThreads, Parameter* para);
-  // The update function for after update operations, such as averager.
-  void threadTraverse(const ParameterOptimizer::TraverseCallback& callback,
-                      int tid,
-                      size_t numThreads,
-                      Parameter* para);
-  typedef std::function<const ParameterOptimizer::TraverseCallback(Parameter*)>
-      GetTraverseCallback;
-  void traverse(GetTraverseCallback getTraverseCallback);
-};
-
-}  // namespace paddle
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
deleted file mode 100644
index 3e4a2b5fa8a3981f6362edc1dc61ae1616e257ef..0000000000000000000000000000000000000000
--- a/paddle/trainer/Trainer.cpp
+++ /dev/null
@@ -1,653 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "Trainer.h"
-
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-
-#include "paddle/utils/Common.h"
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-#include "RemoteParameterUpdater.h"
-#include "TesterConfig.h"
-#include "ThreadParameterUpdater.h"
-#include "TrainerConfigHelper.h"
-#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/gserver/layers/ValidationLayer.h"
-
-DEFINE_string(config, "", "Trainer config file");
-
-DEFINE_int32(test_period,
-             0,
-             "if equal 0, do test on all test data at the end of "
-             "each pass. While if equal non-zero, do test on all test "
-             "data every test_period batches");
-DEFINE_bool(test_all_data_in_one_period,
-            false,
-            "This option was deprecated, since we will always do "
-            "test on all test set ");
-
-DEFINE_bool(local, true, "Train in local mode or not");
-
-DEFINE_int32(average_test_period,
-             0,
-             "Do test on average parameter every so"
-             " many batches. MUST be devided by FLAGS_log_period."
-             " Default 0 means do not test average parameter");
-
-DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
-DEFINE_int64(saving_period_by_batches,
-             0,
-             "Save parameters every so many batches in one pass");
-DEFINE_string(save_dir, "", "Directory for saving model parameter");
-DEFINE_int32(start_pass,
-             0,
-             "Start training from this pass. "
-             "Will load parameter from the previous pass");
-DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test");
-DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
-DEFINE_bool(with_cost, true, "enable cost layer or not");
-DEFINE_bool(distribute_test, false, "test in distribute mode");
-
-DEFINE_int32(num_passes, 100, "train for so many passes");
-
-DEFINE_string(config_args,
-              "",
-              "arguments passed to config file."
-              "Format: key1=value1,key2=value2");
-
-DEFINE_bool(save_only_one,
-            false,
-            "Save only parameters in last pass, remove previous.");
-
-DEFINE_string(feat_file, "", "File name of extracted feature.");
-DEFINE_string(predict_output_dir,
-              "",
-              "Directory that saves the predicted results of output layers");
-DEFINE_string(model_list, "", "File that saves the model list when evaluation");
-
-namespace paddle {
-
-void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
-                   bool testing,
-                   const std::shared_ptr<GradientMachine>& gradientMachine,
-                   const std::shared_ptr<DataProvider>& dataProvider,
-                   const std::shared_ptr<DataProvider>& testDataProvider) {
-  this->stats_ = std::make_shared<TrainerStats>();
-
-  config_ = config;
-
-  config_->updateConfigFromFlags();
-
-  testing_ = testing;
-
-  // in testing, mode_ may GradientMachine::kTesting or
-  // GradientMachine::kSgdSparseCpuTraining
-
-  if (FLAGS_local) {
-    CHECK(!FLAGS_loadsave_parameters_in_pserver)
-        << "local and loadsave_parameters_in_pserver can not both true";
-    if (config_->getOptConfig().use_sparse_remote_updater()) {
-      config_->disableRemoteSparseUpdaterForEachParams();
-      LOG(INFO) << "ignore sparse_remote_update=true due to  --local=true";
-    }
-  }
-  if (FLAGS_loadsave_parameters_in_pserver) {
-    CHECK(config_->getOptConfig().use_sparse_remote_updater())
-        << "no parameter to load from pserver, please check network config";
-  }
-  if (testing && !FLAGS_loadsave_parameters_in_pserver) {
-    if (config_->getOptConfig().use_sparse_remote_updater()) {
-      config_->disableRemoteSparseUpdater();
-      LOG(INFO) << "because parameter is loaded local,"
-                << "tester ignore sparse_remote_update flag";
-    }
-  }
-
-  CHECK(TrainAlgorithm::isValid(config_->getOptConfig().algorithm()))
-      << "invalid algorithm configuration: "
-      << config_->getOptConfig().algorithm();
-
-  bool useSparseUpdater = false;
-  for (auto& paraConfig : config_->getModelConfig().parameters()) {
-    if (paraConfig.sparse_update() || paraConfig.sparse_remote_update()) {
-      useSparseUpdater = true;
-    }
-  }
-
-  if (FLAGS_use_mkldnn) {
-    CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only need 1 trainer";
-  }
-
-  if (testing) {
-    LOG(INFO) << "trainer: in testing mode";
-    if (config_->getOptConfig().use_sparse_remote_updater() ||
-        FLAGS_trainer_count > 1) {
-      mode_ = GradientMachine::kSgdSparseCpuTraining;
-      LOG(INFO) << "trainer mode: SgdSparseCpuTraining";
-    } else {
-      mode_ = GradientMachine::kTesting;
-      LOG(INFO) << "trainer mode: Testing";
-    }
-  } else if (IGradientMachineMode::tryGetMode(
-                 (int*)&mode_,
-                 config_->getOptConfig().algorithm(),
-                 FLAGS_trainer_count,
-                 FLAGS_local,
-                 FLAGS_use_gpu)) {
-    LOG(INFO) << "Custom trainer mode.";
-  } else if ((config_->getOptConfig().algorithm() == TrainAlgorithm::SGD ||
-              config_->getOptConfig().algorithm() ==
-                  TrainAlgorithm::AsyncSGD) &&
-             useSparseUpdater) {
-    mode_ = GradientMachine::kSgdSparseCpuTraining;
-    LOG(INFO) << "trainer mode: SgdSparseCpuTraining";
-  } else {
-    mode_ = GradientMachine::kNormal;
-    LOG(INFO) << "trainer mode: Normal";
-  }
-
-  // initialize trainer internal
-  trainerInternal_.init(config_,
-                        gradientMachine,
-                        TrainerInternalConfig::createFromMode(mode_),
-                        stats_,
-                        testing);
-  std::unique_ptr<ParameterUtilConfig> paramConfig(
-      new ParameterUtilConfig(FLAGS_save_only_one,
-                              FLAGS_saving_period,
-                              FLAGS_loadsave_parameters_in_pserver,
-                              FLAGS_config));
-
-  paramUtil_.reset(
-      new paddle::ParameterUtil(config_,
-                                std::move(paramConfig),
-                                trainerInternal_.getGradientMachine(),
-                                trainerInternal_.getParameterUpdater()));
-
-  bool gpuData =
-      FLAGS_use_gpu && (!FLAGS_parallel_nn) &&
-      (!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count));
-
-  dataProvider_ = dataProvider;
-  if (!dataProvider_ && config_->hasDataConfig() && !testing_) {
-    dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData));
-  }
-  if (!testDataProvider_) {
-    // No evaluator_ if there is testDataProvider but no dataProvider.
-    evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator());
-    currentEvaluator_.reset(
-        trainerInternal_.getGradientMachine()->makeEvaluator());
-    if (FLAGS_average_test_period > 0 && FLAGS_trainer_id == 0 &&
-        config_->getOptConfig().average_window() > 0) {
-      CHECK_EQ(FLAGS_average_test_period % FLAGS_log_period, 0)
-          << "FLAGS_average_test_period must be divided by FALGS_log_period";
-      averageEvaluator_.reset(
-          trainerInternal_.getGradientMachine()->makeEvaluator());
-    }
-  }
-
-  testDataProvider_ = testDataProvider;
-  if (!testDataProvider_ && config_->hasTestDataConfig()) {
-    testDataProvider_.reset(
-        DataProvider::create(config_->getTestDataConfig(), *config_, gpuData));
-  }
-  if (testDataProvider_) {
-    createTester();
-  }
-
-  if (!testing &&
-      (trainerInternal_.getGradientMachine()->hasStaticParameters())) {
-    CHECK(!FLAGS_loadsave_parameters_in_pserver)
-        << "is_static and loadsave_parameters_in_pserver can not both true";
-  }
-  if (testing) {
-    // will load per pass for tester
-  } else if (paramUtil_->tryLoadParametersFromConfig()) {
-    // load from config already.
-  } else {
-    trainerInternal_.getGradientMachine()->randParameters();
-  }
-
-  // Only non static parameters need to be updated
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getNonStaticParameters();
-  if (trainerInternal_.getParameterUpdater()) {
-    trainerInternal_.getParameterUpdater()->init(parameters);
-
-    if (FLAGS_loadsave_parameters_in_pserver && FLAGS_trainer_id == 0) {
-      if (testing) {
-        // will load per pass for tester
-      } else if (!config_->getConfig().init_model_path().empty() &&
-                 (FLAGS_local || FLAGS_trainer_id == 0)) {
-        paramUtil_->loadParametersWithPath(
-            config_->getConfig().init_model_path(),
-            false /*local*/,
-            true /*remote*/);
-      } else if (config_->getConfig().start_pass() > 0 &&
-                 (FLAGS_local || FLAGS_trainer_id == 0)) {
-        CHECK(paramUtil_->loadParameters(config_->getConfig().start_pass() - 1,
-                                         false /*local*/,
-                                         true /*remote*/));
-      } else {
-        trainerInternal_.getParameterUpdater()->randParametersRemote();
-      }
-    }
-  }
-
-  // set current evaluator and evalutor
-  trainerInternal_.setCurrentEvaluator(currentEvaluator_.get());
-  trainerInternal_.setEvaluator(evaluator_.get());
-}
-
-void Trainer::train(size_t numPasses) {
-  startTrain();
-  for (size_t i = 0; i < numPasses; ++i) {
-    if (IGradientMachineMode::trainWholeDataInOneBatch(mode_)) {
-      trainOnePassBatch(config_->getConfig().start_pass() + i);
-    } else {
-      trainOnePass();
-    }
-    if (i < numPasses - 1) {
-      dataProvider_->reset();
-    }
-  }
-
-  finishTrain();
-}
-
-static double genPerturbation(real* d, real* grad, size_t dim) {
-  auto& reng = ThreadLocalRandomEngine::get();
-  std::uniform_real_distribution<double> dist(-1, 1);
-  double gradNorm = 0, dNorm = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    d[i] = dist(reng);
-    dNorm += d[i] * d[i];
-    gradNorm += grad[i] * grad[i];
-  }
-  if (gradNorm > 0) {
-    real s = 0.5 * sqrt(gradNorm / dNorm);
-    for (size_t i = 0; i < dim; ++i) {
-      d[i] = s * d[i] + grad[i];
-    }
-  }
-  double delta = 0;
-  for (size_t i = 0; i < dim; ++i) {
-    delta += grad[i] * d[i];
-  }
-  return delta;
-}
-
-real Trainer::checkGradient() {
-  trainerInternal_.getGradientMachine()->start();
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getNonStaticParameters();
-  DataBatch dataBatch;
-  int32_t batchSize = config_->getOptConfig().batch_size();
-
-  dataProvider_->getNextBatch(batchSize, &dataBatch);
-
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  std::vector<Argument>& inArgs = dataBatch.getStreams();
-  std::vector<Argument> outArgs;
-
-  trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-  real cost = Argument::sum(outArgs);
-  LOG(INFO) << "original cost=" << cost;
-  trainerInternal_.getGradientMachine()->backward();
-
-  real maxDiff = 0;
-  char fill = ' ';
-  for (auto& parameter : parameters) {
-    CpuVector oldPara(parameter->getSize());
-    CpuVector newPara(parameter->getSize());
-    oldPara.copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-    real* newp = newPara.getData();
-    real* oldp = oldPara.getData();
-    CpuVector cpuGrad(*parameter->getBuf(PARAMETER_GRADIENT));
-    real* grad = cpuGrad.getData();
-    size_t dim = parameter->getSize();
-    std::vector<real> d(dim);
-
-    double delta = genPerturbation(d.data(), grad, dim);
-
-    // use a step such that delta / cost is FLAGS_checkgrad_eps
-    real step =
-        (delta != 0) ? cost / delta * FLAGS_checkgrad_eps : FLAGS_checkgrad_eps;
-    delta *= step;
-    for (size_t i = 0; i < dim; ++i) {
-      newp[i] = oldp[i] + step * d[i];
-    }
-
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
-    parameter->setValueUpdated();
-    trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost1 = Argument::sum(outArgs);
-
-    for (size_t i = 0; i < dim; ++i) {
-      newp[i] = oldp[i] - step * d[i];
-    }
-
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
-    parameter->setValueUpdated();
-    trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost2 = Argument::sum(outArgs);
-
-    real trueDelta = 0.5 * (newCost1 - newCost2);
-    real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(fill)
-              << std::setw(20) << parameter->getName()
-              << "step=" << std::setw(15) << step << "cost1=" << std::setw(10)
-              << newCost1 << "cost2=" << std::setw(10) << newCost2
-              << "true_delta=" << std::setw(15) << trueDelta
-              << "analytic_delta=" << std::setw(15) << delta << "diff=" << diff
-              << (std::abs(diff) > 0.01 ? " ***" : "");
-
-    maxDiff = std::max(maxDiff, std::abs(diff));
-
-    // restore parameter
-    parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara);
-    parameter->setValueUpdated();
-
-    fill = (fill == ' ') ? '.' : ' ';
-  }
-  return maxDiff;
-}
-
-void Trainer::startTrain() {
-  trainPassContext_.passId = config_->getConfig().start_pass();
-  srand(config_->getConfig().start_pass() + 1);
-  if (dataProvider_) {
-    dataProvider_->reset();
-  }
-
-  trainerInternal_.getGradientMachine()->start();
-}
-
-void Trainer::finishTrain() { trainerInternal_.getGradientMachine()->finish(); }
-
-void Trainer::startTrainPass() {
-  stats_->reset();
-  trainPassContext_.batchId = 0;
-  trainPassContext_.avgTestCost = 0;
-  trainPassContext_.numAvgTests = 0;
-  trainPassContext_.passInnerId = 1;
-
-  trainerInternal_.getParameterUpdater()->startPass();
-  evaluator_->start();
-  if (FLAGS_prev_batch_state) {
-    trainerInternal_.getGradientMachine()->resetState();
-    trainerInternal_.getGradientMachine()->getState(testState_);
-  }
-}
-
-void Trainer::trainOneDataBatch(DataBatch& dataBatch) {
-  int num = dataBatch.getSize();
-  if (averageEvaluator_) {
-    int64_t mod = trainPassContext_.batchId % FLAGS_average_test_period;
-    if (mod >= FLAGS_average_test_period - FLAGS_log_period) {
-      if (mod == FLAGS_average_test_period - FLAGS_log_period) {
-        averageEvaluator_->start();
-      }
-      trainerInternal_.getParameterUpdater()->apply();
-      if (FLAGS_prev_batch_state) {
-        trainerInternal_.getGradientMachine()->getState(trainState_);
-      }
-      trainPassContext_.avgTestCost += tester_->forwardOneBatch(
-          dataBatch, averageEvaluator_.get(), &forwardOutput_);
-      if (FLAGS_prev_batch_state) {
-        trainerInternal_.getGradientMachine()->setState(trainState_);
-      }
-      trainPassContext_.numAvgTests += num;
-      trainerInternal_.getParameterUpdater()->restore();
-    }
-  }
-  {
-    REGISTER_TIMER("TrainBatch");
-    trainerInternal_.trainOneBatch(
-        trainPassContext_.batchId, dataBatch, &forwardOutput_);
-  }
-
-  if (averageEvaluator_ &&
-      trainPassContext_.batchId % FLAGS_average_test_period ==
-          FLAGS_average_test_period - 1) {
-    averageEvaluator_->finish();
-    LOG(INFO) << " Averaged parameter:"
-              << " cost="
-              << trainPassContext_.avgTestCost / trainPassContext_.numAvgTests
-              << " Eval: " << *averageEvaluator_;
-    trainPassContext_.numAvgTests = 0;
-    trainPassContext_.avgTestCost = 0;
-  }
-
-  ++trainPassContext_.batchId;
-
-  if (trainPassContext_.batchId % FLAGS_log_period == 0) {
-    FOR_TIMING(globalStat.setThreadInfo(true));
-    FOR_TIMING(globalStat.printAllStatus());
-    FOR_TIMING(globalStat.reset());
-  }
-
-  if (testDataProvider_ && FLAGS_test_period > 0 &&
-      trainPassContext_.batchId % FLAGS_test_period == 0) {
-    tester_->testOnePeriod();
-  }
-
-  if (FLAGS_saving_period_by_batches > 0 &&
-      trainPassContext_.batchId >
-          FLAGS_saving_period_by_batches * trainPassContext_.passInnerId &&
-      0 == FLAGS_trainer_id) {
-    trainerInternal_.getParameterUpdater()->catchUpWith();
-    if (testDataProvider_) {
-      tester_->testOnePeriod();
-    }
-    paramUtil_->saveParametersOnePass(trainPassContext_.passId,
-                                      trainPassContext_.passInnerId);
-    ++trainPassContext_.passInnerId;
-  }
-}
-
-void Trainer::finishTrainPass() {
-  if (trainPassContext_.batchId == 0) {
-    // This means no more data from DataProvider
-    return;
-  }
-
-  trainerInternal_.finishTrainPass(trainPassContext_.passId,
-                                   trainPassContext_.batchId);
-
-  FOR_TIMING(globalStat.setThreadInfo(true));
-  FOR_TIMING(globalStat.printAllStatus());
-  FOR_TIMING(globalStat.reset());
-
-  if (testDataProvider_) {
-    tester_->testOnePeriod();
-  }
-
-  if (trainPassContext_.passId % FLAGS_saving_period == 0 &&
-      FLAGS_trainer_id == 0) {
-    paramUtil_->saveParametersOnePass(trainPassContext_.passId);
-  }
-  ++trainPassContext_.passId;
-}
-
-void Trainer::trainOnePass() {
-  startTrainPass();
-  size_t batchSize = config_->getOptConfig().batch_size();
-  while (true) {
-    DataBatch dataBatch;
-
-    int num = 0;
-    {
-      REGISTER_TIMER("getTrainBatch");
-      num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-    }
-    if (num == 0) break;
-    CHECK_EQ(num, dataBatch.getSize());
-    trainOneDataBatch(dataBatch);
-  }
-
-  finishTrainPass();
-}
-
-void Trainer::trainOnePassBatch(int passId) {
-  this->stats_->reset();
-
-  trainerInternal_.getParameterUpdater()->startPass();
-  const std::vector<Argument> inArgs;
-  {
-    REGISTER_TIMER("onePass");
-    trainerInternal_.getGradientMachine()->forwardBackward(
-        inArgs, nullptr, PASS_TRAIN, nullptr);
-  }
-
-  real cost = .0;
-  int64_t num = 0;
-  trainerInternal_.getGradientMachine()->getStats(cost, num);
-  *stats_ += {num, cost};
-
-  trainerInternal_.getGradientMachine()->onPassEnd();
-
-  bool accepted = trainerInternal_.getParameterUpdater()->finishPass();
-
-  globalStat.setThreadInfo(true);
-  globalStat.printAllStatus();
-  globalStat.reset();
-
-  LOG(INFO) << " Pass=" << passId
-            << " AcceptedPass=" << (accepted ? acceptedPassId_ : -1)
-            << stats_->getStats(false /*withCurrentCost*/);
-
-  if (accepted) {
-    if (acceptedPassId_ % FLAGS_saving_period == 0 && FLAGS_trainer_id == 0) {
-      paramUtil_->saveParameters(acceptedPassId_);
-    }
-    acceptedPassId_++;
-    if (FLAGS_save_only_one && acceptedPassId_ >= FLAGS_saving_period) {
-      paramUtil_->deleteParameters(acceptedPassId_ - FLAGS_saving_period);
-    }
-  }
-}
-
-real Trainer::calcGradient(const DataBatch& dataBatch,
-                           const Vector& value,
-                           Vector& gradient) {
-  CHECK_EQ(value.getSize(), gradient.getSize());
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getParameters();
-
-  clearGradient();
-
-  size_t offset = 0;
-  size_t valueSize = value.getSize();
-
-  for (auto& para : parameters) {
-    CHECK_LE(offset + para->getSize(), valueSize);
-    VectorPtr val =
-        Vector::create(para->getSize(), value.getMemoryHandle(), offset);
-    para->getBuf(PARAMETER_VALUE)->copyFrom(*val);
-    para->setValueUpdated();
-    offset += para->getSize();
-  }
-
-  CHECK_EQ(offset, valueSize);
-
-  std::vector<Argument> inArgs = dataBatch.getStreams();
-  std::vector<Argument> outArgs;
-
-  trainerInternal_.getGradientMachine()->forwardBackward(
-      inArgs, &outArgs, PASS_TRAIN);
-  real cost = Argument::sum(outArgs);
-
-  offset = 0;
-  for (auto& para : parameters) {
-    VectorPtr grad =
-        Vector::create(para->getSize(), gradient.getMemoryHandle(), offset);
-    if (para->getBuf(PARAMETER_GRADIENT)) {
-      grad->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
-    }
-    offset += para->getSize();
-  }
-
-  return cost;
-}
-
-void Trainer::clearGradient() {
-  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getNonStaticParameters();
-  for (auto& parameter : parameters) {
-    parameter->clearGradient();
-  }
-}
-
-int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); }
-
-void Trainer::createTester() {
-  tester_.reset(new paddle::Tester(config_,
-                                   createTesterConfig(),
-                                   trainerInternal_.getGradientMachine(),
-                                   trainerInternal_.getParameterUpdater(),
-                                   testDataProvider_));
-}
-
-void Trainer::test() { tester_->test(); }
-
-std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
-  TesterConfig* conf = new TesterConfig;
-  if (FLAGS_test_period) {
-    LOG(WARNING) << "The meaning of --test_period is changed: "
-                 << "if equal 0, do test on all test data at the end of "
-                 << "each pass. While if equal non-zero, do test on all test "
-                 << "data every test_period batches ";
-  }
-  if (FLAGS_test_all_data_in_one_period) {
-    LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since "
-                 << "we will always do test on all test set ";
-  }
-  conf->testPeriod = FLAGS_test_period;
-  conf->prevBatchState = FLAGS_prev_batch_state;
-  conf->logPeriod = FLAGS_log_period;
-  conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver;
-  conf->featFile = FLAGS_feat_file;
-  conf->predictOutputDir = FLAGS_predict_output_dir;
-  conf->trainerId = FLAGS_trainer_id;
-  conf->distributeTest = FLAGS_distribute_test;
-  conf->config = FLAGS_config;
-  conf->modelList = FLAGS_model_list;
-  conf->testPass = FLAGS_test_pass;
-  conf->numPasses = FLAGS_num_passes;
-  conf->savingPeriod = FLAGS_saving_period;
-  conf->testWait = FLAGS_test_wait;
-  conf->initModelPath = FLAGS_init_model_path;
-  conf->saveOnlyOne = FLAGS_save_only_one;
-  conf->testing = testing_;
-  conf->mode = mode_;
-  conf->trainState = &trainState_;
-  conf->testState = &testState_;
-  return std::unique_ptr<TesterConfig>(conf);
-}
-
-ParameterUtil* Trainer::getParameterUtilPtr() { return paramUtil_.get(); }
-}  // namespace paddle
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
deleted file mode 100644
index fac589d1d711affcd008f90edf87d865c8362f69..0000000000000000000000000000000000000000
--- a/paddle/trainer/Trainer.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/gserver/dataproviders/DataProvider.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include "ParamUtil.h"
-#include "ParameterUpdater.h"
-#include "Tester.h"
-#include "TrainerConfigHelper.h"
-#include "TrainerInternal.h"
-
-DECLARE_int32(num_passes);
-
-namespace paddle {
-
-/**
- * Trainer Class
- *
- * Trainer combines GradientMachine, ParameterUpdater, DataProvider together to
- * train/test a NeuralNetwork.
- */
-class Trainer {
-public:
-  /**
-   * Ctor.
-   * @return
-   */
-  Trainer() : acceptedPassId_(0) {}
-
-  virtual ~Trainer() {}
-
-  /**
-   * initialize a new trainer using config
-   *
-   * @param config TrainerConfig.
-   * @param testing true if only for testing
-   * @param gradientMachine GradientMachine that will be trained.
-   *                        nullptr if create from config.
-   * @param dataProvider Train Data Provider. null if create from config.
-   * @param testDataProvider Test Data Provider. null if create from config.
-   */
-  virtual void init(
-      const std::shared_ptr<TrainerConfigHelper>& config,
-      bool testing = false,
-      const std::shared_ptr<GradientMachine>& gradientMachine = nullptr,
-      const std::shared_ptr<DataProvider>& dataProvider = nullptr,
-      const std::shared_ptr<DataProvider>& testDataProvider = nullptr);
-
-  /**
-   * Train until num_passes reached.
-   * One pass means neural network train through all training data.
-   *
-   * @param numPasses the number of traning pass.
-   * @note Durning neural network training, the num passes may set a very large
-   * value, and kill training process when result is good enough.
-   */
-  void train(size_t numPasses = (size_t)FLAGS_num_passes);
-
-  /**
-   * compare the gradient from bp with finite difference
-   * @return  the maximal difference
-   */
-  real checkGradient();
-
-  void startTrain();
-  void finishTrain();
-  void startTrainPass();
-  void finishTrainPass();
-  void trainOneDataBatch(DataBatch& dataBatch);
-  void time();
-
-  /**
-   * given a dataBatch and the current parameter value
-   * calculate its gradient and return the cost.
-   *
-   * TODO(yuyang18): I think this method is deprecated and buggy. Should it be
-   * removed?
-   */
-  real calcGradient(const DataBatch& dataBatch,
-                    const Vector& value,
-                    Vector& gradient);
-
-  /**
-   * Get Trainer Config.
-   */
-  const TrainerConfig& getConfig() const { return config_->getConfig(); }
-
-  /**
-   * Get Train Data Provider
-   */
-  const DataProviderPtr& getDataProvider() { return dataProvider_; }
-
-  /**
-   * Get Gradient Machine.
-   */
-  const GradientMachinePtr& getGradientMachine() {
-    return trainerInternal_.getGradientMachine();
-  }
-
-  /**
-   * Get batch size in optimization config.
-   * @note This method didn't return the actual batch size. Just batch size
-   * set in the optimization config. The actual batch size in one trainer may
-   * less than batch size in config due to there are not enough data.
-   */
-  int getBatchSize();
-
-  /**
-   * Do test job
-   */
-  void test();
-
-  /**
-   * Get parameter util ptr
-   *
-   * TODO(yuyang18): Make it return a smart pointer.
-   */
-  ParameterUtil* getParameterUtilPtr();
-
-protected:
-  /**
-   * Train one pass of data.
-   *
-   * SGD Method.
-   */
-  void trainOnePass();
-
-  /**
-   * Train one pass in one batch.
-   *
-   */
-  void trainOnePassBatch(int passId);
-
-  /**
-   * set parameter gradient to zero
-   */
-  void clearGradient();
-
-  void createTester();
-
-private:
-  std::unique_ptr<TesterConfig> createTesterConfig();
-
-protected:
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::shared_ptr<TrainerStats> stats_;
-
-  DataProviderPtr dataProvider_;
-  DataProviderPtr testDataProvider_;
-  MachineState trainState_;
-  MachineState testState_;
-
-  struct TrainPassContext {
-    int64_t batchId;
-    real avgTestCost;
-    int64_t numAvgTests;
-    int passId;
-    int passInnerId;
-  };
-  std::vector<paddle::Argument> forwardOutput_;
-
-  TrainPassContext trainPassContext_;
-
-  std::unique_ptr<Evaluator> evaluator_;
-  std::unique_ptr<Evaluator> currentEvaluator_;
-  std::unique_ptr<Evaluator> averageEvaluator_;
-  // training mode
-  // used to decide which GradientMachine and ParameterUpdater to create
-  GradientMachine::CreateMode mode_;
-  int testing_;
-  int acceptedPassId_;
-
-  // trainer tester
-  std::unique_ptr<Tester> tester_;
-
-  // parameter util
-  std::unique_ptr<ParameterUtil> paramUtil_;
-
-  // trainer Internal
-  TrainerInternal trainerInternal_;
-};
-
-}  // namespace paddle
diff --git a/paddle/trainer/TrainerBenchmark.cpp b/paddle/trainer/TrainerBenchmark.cpp
deleted file mode 100644
index 173653c81688fe4606731c68ea1854268b3f4590..0000000000000000000000000000000000000000
--- a/paddle/trainer/TrainerBenchmark.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#undef PADDLE_DISABLE_TIMER
-
-#include "Trainer.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-DECLARE_int32(test_period);
-
-DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
-
-namespace paddle {
-
-void Trainer::time() {
-  startTrain();
-
-  trainerInternal_.getParameterUpdater()->startPass();
-  evaluator_->start();
-
-  DataBatch dataBatch;
-  int32_t batchSize = config_->getOptConfig().batch_size();
-  int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-  CHECK_EQ(num, batchSize) << "The sample number is less than batch size "
-                           << num << " != " << batchSize;
-
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-
-  std::vector<paddle::Argument> outputs;
-  // burning time
-  LOG(INFO) << "Burning time...";
-  for (int n = 0; n < 10; ++n) {
-    trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
-  }
-  LOG(INFO) << "Burning time end.";
-
-  for (int n = 0; n < FLAGS_test_period; n++) {
-    if (FLAGS_feed_data) {
-      REGISTER_TIMER("GetData");
-      num = dataProvider_->getNextBatch(batchSize, &dataBatch);
-    }
-
-    if (num != batchSize) {
-      break;
-    }
-
-    {
-      REGISTER_TIMER("FwdBwd");
-      trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
-    }
-  }
-  globalStat.setThreadInfo(true);
-  globalStat.printSegTimerStatus();
-  globalStat.reset();
-
-  finishTrain();
-}
-
-}  // namespace paddle
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
deleted file mode 100644
index 2b68d89e48a3efd5de205ce33643b7e6320a4303..0000000000000000000000000000000000000000
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TrainerConfigHelper.h"
-#include "ParamUtil.h"
-#include "TrainerConfig.pb.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/PythonUtil.h"
-
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_int32(start_pass);
-DECLARE_string(save_dir);
-DECLARE_int32(trainer_id);
-DECLARE_bool(local);
-DECLARE_bool(with_cost);
-DECLARE_bool(with_gpu);
-DECLARE_bool(parallel_nn);
-DECLARE_string(config_args);
-DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkl_packed);
-
-const char *kConfigParserModuleName = "paddle.trainer.config_parser";
-const char *kConfigParserFuncName = "parse_config_and_serialize";
-
-namespace paddle {
-
-struct TrainerConfigHelperPrivate {
-  TrainerConfig conf;
-};
-
-TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
-    : m(new TrainerConfigHelperPrivate()) {
-  std::ostringstream configArgs;
-  configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local
-             << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
-             << ",parallel_nn=" << FLAGS_parallel_nn
-             << ",use_mkldnn=" << FLAGS_use_mkldnn
-             << ",use_mkl_packed=" << FLAGS_use_mkl_packed
-             << ",cudnn_version=" << hl_get_cudnn_lib_version();
-  if (!FLAGS_config_args.empty()) {
-    configArgs << "," << FLAGS_config_args;
-  }
-
-  VLOG(3) << "Parsing trainer config " << configFilePath;
-  std::string configProtoStr =
-      callPythonFunc(kConfigParserModuleName,
-                     kConfigParserFuncName,
-                     {configFilePath, configArgs.str()});
-  CHECK(m->conf.ParseFromString(configProtoStr));
-}
-
-TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config)
-    : m(new TrainerConfigHelperPrivate()) {
-  m->conf = config;
-}
-
-TrainerConfigHelper::~TrainerConfigHelper() { delete m; }
-
-const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; }
-
-TrainerConfig &TrainerConfigHelper::getMutableConfig() { return m->conf; }
-
-const OptimizationConfig &TrainerConfigHelper::getOptConfig() const {
-  return m->conf.opt_config();
-}
-
-const ModelConfig &TrainerConfigHelper::getModelConfig() const {
-  return m->conf.model_config();
-}
-
-const DataConfig *TrainerConfigHelper::getDataConfigPtr() const {
-  if (m->conf.has_data_config()) {
-    return &m->conf.data_config();
-  } else {
-    return nullptr;
-  }
-}
-
-const DataConfig &TrainerConfigHelper::getTestDataConfig() const {
-  CHECK(m->conf.has_test_data_config());
-  return m->conf.test_data_config();
-}
-
-bool TrainerConfigHelper::hasDataConfig() const {
-  return m->conf.has_data_config();
-}
-
-bool TrainerConfigHelper::hasTestDataConfig() const {
-  return m->conf.has_test_data_config();
-}
-
-void TrainerConfigHelper::updateConfigFromFlags() {
-  if (!FLAGS_save_dir.empty()) {
-    m->conf.set_save_dir(FLAGS_save_dir);
-  }
-  if (!FLAGS_init_model_path.empty()) {
-    m->conf.set_init_model_path(FLAGS_init_model_path);
-  }
-  if (FLAGS_start_pass != 0) {
-    m->conf.set_start_pass(FLAGS_start_pass);
-  }
-}
-
-void TrainerConfigHelper::disableRemoteSparseUpdater() {
-  m->conf.mutable_opt_config()->set_use_sparse_remote_updater(false);
-}
-
-void TrainerConfigHelper::disableRemoteSparseUpdaterForEachParams() {
-  this->disableRemoteSparseUpdater();
-  for (int i = 0; i < m->conf.model_config().parameters_size(); ++i) {
-    m->conf.mutable_model_config()
-        ->mutable_parameters(i)
-        ->set_sparse_remote_update(false);
-  }
-}
-
-OptimizationConfig &TrainerConfigHelper::getOptConfig() {
-  return *m->conf.mutable_opt_config();
-}
-
-void TrainerConfigHelper::setSaveDir(const std::string &saveDir) {
-  m->conf.set_save_dir(saveDir);
-}
-
-const std::string &TrainerConfigHelper::getSaveDir() const {
-  return m->conf.save_dir();
-}
-
-std::string TrainerConfigHelper::getConfigNameFromPath(
-    const std::string &modelPath) {
-  std::ifstream s(path::join(modelPath, "path.txt"));
-  CHECK(s.is_open()) << " fail to open path.txt";
-  std::string ss;
-  getline(s, ss);
-  VLOG(3) << "fileName " << path::join(modelPath, ss);
-  s.close();
-  return path::join(modelPath, ss);
-}
-
-std::string TrainerConfigHelper::getConfigNameFromPassId(
-    int passId, const std::string &modelPath) {
-  constexpr int kBufLen = 100;
-  char buf[kBufLen];
-  snprintf(buf, kBufLen, "pass-%05d", passId);
-  return TrainerConfigHelper::getConfigNameFromPath(path::join(modelPath, buf));
-}
-
-std::string TrainerConfigHelper::getConfigName(bool *ok) const {
-  std::string retv = "";
-
-  if (!m->conf.config_file().empty()) {
-    retv = m->conf.config_file();
-  } else if (!m->conf.init_model_path().empty()) {
-    retv = getConfigNameFromPath(m->conf.init_model_path());
-  } else if (m->conf.start_pass() >= 1) {
-    retv = getConfigNameFromPassId(m->conf.start_pass(), m->conf.save_dir());
-  }
-
-  if (ok) {
-    *ok = !retv.empty();
-  }
-
-  return retv;
-}
-
-std::shared_ptr<TrainerConfigHelper> TrainerConfigHelper::createFromFlags() {
-  std::string configPath;
-  if (!FLAGS_config.empty()) {
-    configPath = FLAGS_config;
-  } else if (!FLAGS_init_model_path.empty()) {
-    configPath = getConfigNameFromPath(FLAGS_init_model_path);
-  } else if (FLAGS_start_pass >= 1) {
-    configPath =
-        getConfigNameFromPassId(FLAGS_start_pass - 1, FLAGS_init_model_path);
-  } else {
-    return nullptr;
-  }
-  return std::make_shared<TrainerConfigHelper>(configPath);
-}
-
-std::shared_ptr<TrainerConfigHelper>
-TrainerConfigHelper::createFromFlagConfig() {
-  CHECK(!FLAGS_config.empty());
-  return std::make_shared<TrainerConfigHelper>(FLAGS_config);
-}
-
-}  // namespace paddle
diff --git a/paddle/trainer/TrainerConfigHelper.h b/paddle/trainer/TrainerConfigHelper.h
deleted file mode 100644
index f1366cc041b0d983e65a1bf5b02ec2128324c5a8..0000000000000000000000000000000000000000
--- a/paddle/trainer/TrainerConfigHelper.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <paddle/utils/Logging.h>
-#include <paddle/utils/Util.h>
-#include <memory>
-
-namespace paddle {
-
-class TrainerConfig;
-class OptimizationConfig;
-struct TrainerConfigHelperPrivate;
-class ModelConfig;
-class DataConfig;
-
-/**
- * @brief TrainerConfig Helper. A class wrap protobuf's TrainerConfig Object,
- * simplize the usage for TrainerConfig.
- *
- * The all operation to TrainerConfig object should use this object. It remove
- * many copy & paste code in trainer.
- *
- * @TODO(yuyang18): Make cmake check compiler support keyword 'final' or not.
- * Define a macro to unify 'final' keyword
- */
-class TrainerConfigHelper /*final*/ {
-public:
-  DISABLE_COPY(TrainerConfigHelper);
-
-  /**
-   * @brief Ctor, Create a TrainerConfig from config file
-   * @param configFilePath Config file path.
-   */
-  explicit TrainerConfigHelper(const std::string& configFilePath);
-  explicit TrainerConfigHelper(const TrainerConfig& config);
-
-  /**
-   * Dtor
-   * @warning this class is a final class. Should not be inherited.
-   */
-  ~TrainerConfigHelper();
-
-  /**
-   * @brief Get Trainer Config itself.
-   */
-  const TrainerConfig& getConfig() const;
-
-  TrainerConfig& getMutableConfig();
-
-  /**
-   * @brief Get Optimizer Config.
-   */
-  const OptimizationConfig& getOptConfig() const;
-
-  /**
-   * @brief Get Model Config.
-   */
-  const ModelConfig& getModelConfig() const;
-
-  /**
-   * @brief Get Train Data Config Pointer.
-   * @return nullptr if there is no train data. Else will return pointer
-   */
-  const DataConfig* getDataConfigPtr() const;
-
-  /**
-   * @brief Get Tain Data Config.
-   * @warning Core when there is no train data.
-   */
-  const DataConfig& getDataConfig() const {
-    CHECK(this->hasDataConfig());
-    auto conf = this->getDataConfigPtr();
-    return *conf;
-  }
-
-  /**
-   * @brief Get test data config
-   * @warning Core when there is no test data.
-   */
-  const DataConfig& getTestDataConfig() const;
-
-  /**
-   * @brief Has train data config or not.
-   * @return true if has train data.
-   */
-  bool hasDataConfig() const;
-
-  /**
-   * @brief Has test data config or not.
-   * @return true if has test data.
-   */
-  bool hasTestDataConfig() const;
-
-  /**
-   * @brief Update trainer config from command line flags.
-   *        Override config's (save_dir, init_model_path, start_pass) if command
-   *        flags is existed.
-   */
-  void updateConfigFromFlags();
-
-  /**
-   * @brief Disable optimization's sparse remote update.
-   */
-  void disableRemoteSparseUpdater();
-
-  /**
-   * @brief Disable optimization and each parameter's sparse remote update.
-   */
-  void disableRemoteSparseUpdaterForEachParams();
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const TrainerConfig&() const { return this->getConfig(); }
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const OptimizationConfig&() const {
-    return this->getOptConfig();
-  }
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const DataConfig&() const { return this->getDataConfig(); }
-
-  /**
-   * @brief implicit conversion.
-   */
-  inline operator const ModelConfig&() const { return this->getModelConfig(); }
-
-  /**
-   * @brief Get mutable optimization config.
-   */
-  OptimizationConfig& getOptConfig();
-
-  /**
-   * @brief set model save directory.
-   * @param saveDir Directory path.
-   */
-  void setSaveDir(const std::string& saveDir);
-
-  /**
-   * @brief get model save directory.
-   * @return save directory path.
-   */
-  const std::string& getSaveDir() const;
-
-  /**
-   * @brief Get config file name from model path.
-   *
-   * Paddle save model to a directory, and write a file 'path.txt' which save
-   * config filename.
-   *
-   * @param modelPath model saved directory.
-   * @return config file name.
-   */
-  static std::string getConfigNameFromPath(const std::string& modelPath);
-
-  /**
-   * @brief Get config file name from this config instance.
-   * @param[out] ok true if no error.
-   * @return config file name.
-   */
-  std::string getConfigName(bool* ok = nullptr) const;
-
-  /**
-   * @brief Try to create TrainerConfigHelper from all command line flags.
-   *        Try to load from --config, --init_model_path, --start_pass one by
-   *        one. Return nullptr if cannot load TrainerConfigHelper from all
-   *        these place.
-   * @return nullptr if cannot load, otherwise return a TrainerConfigHelper.
-   */
-  static std::shared_ptr<TrainerConfigHelper> createFromFlags();
-
-  /**
-   * @brief Try to create TrainerConfigHelper only from '--config' flag.
-   * @return nullptr if cannot load, otherwise return a TrainerConfigHelper.
-   */
-  static std::shared_ptr<TrainerConfigHelper> createFromFlagConfig();
-
-private:
-  static std::string getConfigNameFromPassId(int passId,
-                                             const std::string& modelPath);
-
-  TrainerConfigHelperPrivate* m;
-};
-
-typedef std::shared_ptr<TrainerConfigHelper> TrainerConfigHelperPtr;
-
-}  // namespace paddle
diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp
deleted file mode 100644
index 4c5d4a0913aaf3a9932b3d67806378ece4245304..0000000000000000000000000000000000000000
--- a/paddle/trainer/TrainerInternal.cpp
+++ /dev/null
@@ -1,303 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "TrainerInternal.h"
-
-#include <fenv.h>
-#include <stdio.h>
-
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#include <google/protobuf/text_format.h>
-
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/gserver/layers/ValidationLayer.h"
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-
-#include "RemoteParameterUpdater.h"
-#include "ThreadParameterUpdater.h"
-
-namespace paddle {
-
-void TrainerInternal::init(const std::shared_ptr<TrainerConfigHelper>& config,
-                           const GradientMachinePtr& gradientMachine,
-                           std::unique_ptr<TrainerInternalConfig>&& intconfig,
-                           const std::shared_ptr<TrainerStats>& stats,
-                           bool testing) {
-  config_ = config;
-  intconfig_ = std::move(intconfig);
-  stats_ = stats;
-
-  //! in training will use parameter updater definitly.
-  //! But only use parameter in testing mode when some parameter in pserver.
-  if (!testing || (config_->getOptConfig().use_sparse_remote_updater() &&
-                   intconfig_->loadsave_parameters_in_pserver)) {
-    createParameterUpdater(testing);
-  }
-
-  gradientMachine_ = gradientMachine;
-  if (!gradientMachine) {
-    CHECK(config_->getConfig().has_model_config())
-        << "Missing model_config in trainer_config";
-    gradientMachine_.reset(
-        GradientMachine::create(config_->getConfig().model_config(),
-                                intconfig_->mode,
-                                parameterUpdater_->getParameterTypes()));
-  }
-}
-
-void TrainerInternal::trainOneBatch(int64_t batchId,
-                                    const DataBatch& dataBatch,
-                                    std::vector<Argument>* outArgs) {
-  // true means updating parameter whenever gradient is ready during backward()
-  bool doPipelineUpdate =
-      (intconfig_->mode != GradientMachine::kSgdSparseCpuTraining) &&
-      (intconfig_->local || intconfig_->use_gpu ||
-       intconfig_->trainer_count <= 1);
-
-  int64_t actualBatchSize = dataBatch.getSize();
-  if (actualBatchSize == 0) {
-    return;
-  }
-
-  bool showStats = intconfig_->show_param_stats_period > 0 &&
-                   (batchId + 1) % intconfig_->show_param_stats_period == 0 &&
-                   intconfig_->trainer_id == 0;
-
-  std::vector<ParaStat> paraStats;
-  if (showStats) {
-    paraStats.resize(gradientMachine_->getParameters().size());
-  }
-
-  const std::vector<Argument>& inArgs = dataBatch.getStreams();
-
-  PassType passType = parameterUpdater_->startBatch(actualBatchSize);
-
-  if (config_->getOptConfig().use_sparse_remote_updater()) {
-    REGISTER_TIMER("prefetch");
-    gradientMachine_->prefetch(inArgs);
-    parameterUpdater_->getParametersRemote();
-  }
-
-  UpdateCallback updateCallback = [this, showStats, &paraStats](
-      Parameter* para) {
-    if (showStats) {
-      //! @TODO(yuyang18) Show stats is actually a ParameterHook, refactor
-      // it
-      //! to ParameterHook.
-      auto& grad = para->getBuf(PARAMETER_GRADIENT);
-      SetDevice device(para->getDeviceId());
-      paraStats[para->getID()].avgAbsGrad = grad->getAbsSum() / para->getSize();
-      paraStats[para->getID()].maxAbsGrad = grad->getAbsMax();
-    }
-    parameterUpdater_->update(para);
-  };
-
-  {
-#ifndef PADDLE_DISABLE_TIMER
-    Timer timer;
-    timer.start();
-#endif
-    REGISTER_TIMER("forwardBackward");
-    forwardBackwardBatch(
-        inArgs, *outArgs, passType, updateCallback, doPipelineUpdate);
-#ifndef PADDLE_DISABLE_TIMER
-    timer.stop();
-    parameterUpdater_->setForwardbackwardTime(timer.get());
-#endif
-  }
-
-  if (!doPipelineUpdate) {
-    auto& parameters = gradientMachine_->getNonStaticParameters();
-    for (auto& para : parameters) {
-      updateCallback(para.get());
-    }
-  }
-
-  real cost = 0;
-  {
-    REGISTER_TIMER("sumCost");
-    cost = Argument::sum(*outArgs);
-  }
-
-  if (batchId % intconfig_->log_period == 0) {
-    currentEvaluator_->start();
-    stats_->resetCurrentStat();
-  }
-  {
-    REGISTER_TIMER("eval");
-    gradientMachine_->eval(currentEvaluator_);
-    gradientMachine_->eval(evaluator_);
-  }
-
-  *stats_ += {actualBatchSize, cost};
-  {
-    REGISTER_TIMER("finishBatch");
-    parameterUpdater_->finishBatch(cost);
-  }
-
-  if (showStats) {
-    showParameterStats(paraStats);
-  }
-  if ((batchId + 1) % intconfig_->log_period == 0) {
-    currentEvaluator_->finish();
-
-    if (intconfig_->dot_period > 0) {
-      std::cerr << std::endl;
-    }
-    LOG(INFO) << " Batch=" << batchId + 1 << " " << *stats_
-              << " Eval: " << *evaluator_
-              << " CurrentEval: " << *currentEvaluator_;
-  } else if (intconfig_->dot_period > 0 &&
-             (batchId + 1) % intconfig_->dot_period == 0) {
-    std::cerr << ".";
-  }
-}
-
-/**
- * finish train pass
- */
-void TrainerInternal::finishTrainPass(int passId, int batchId) {
-  gradientMachine_->onPassEnd();
-  parameterUpdater_->finishPass();
-  evaluator_->finish();
-  LOG(INFO) << " Pass=" << passId << " Batch=" << batchId << " "
-            << stats_->getStats(false /*without current cost*/)
-            << " Eval: " << *evaluator_;
-}
-
-void TrainerInternal::showParameterStats(
-    const std::vector<ParaStat>& paraStats) {
-  std::vector<ParameterPtr>& parameters = gradientMachine_->getParameters();
-  for (auto& parameter : parameters) {
-    SetDevice device(parameter->getDeviceId());
-    real sum = parameter->getBuf(PARAMETER_VALUE)->getAbsSum();
-    const auto& lr = parameter->getBuf(PARAMETER_LEARNING_RATE);
-    std::ostringstream osLrHistogram;
-    if (lr) {
-      if (VLOG_IS_ON(2)) {
-        osLrHistogram << " lr_histogram: ";
-        lr->histogram(osLrHistogram);
-      } else {
-        osLrHistogram << " max_lr=" << std::setw(11) << lr->getMax()
-                      << " min_lr=" << std::setw(11) << lr->getMin()
-                      << " avg_lr=" << std::setw(11)
-                      << lr->getSum() / parameter->getSize();
-      }
-    }
-    int pid = parameter->getID();
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
-              << std::setw(20) << parameter->getName()
-              << " avg_abs_val=" << std::setw(11) << sum / parameter->getSize()
-              << " max_val=" << std::setw(11)
-              << parameter->getBuf(PARAMETER_VALUE)->getAbsMax()
-              << " avg_abs_grad=" << std::setw(11) << paraStats[pid].avgAbsGrad
-              << " max_grad=" << std::setw(11) << paraStats[pid].maxAbsGrad
-              << osLrHistogram.str();
-  }
-}
-
-void TrainerInternal::createParameterUpdater(bool testing) {
-  const std::string& alg = config_->getOptConfig().algorithm();
-  parameterUpdater_.reset(ParameterUpdaterCreators::tryCreateUpdater(
-      alg, config_->getOptConfig(), intconfig_->local, intconfig_->num_passes));
-  if (parameterUpdater_) {
-    return;
-  }
-
-  if (!intconfig_->local) {
-    if (testing && config_->getOptConfig().use_sparse_remote_updater()) {
-      std::unique_ptr<ParameterUpdater> localUpdater;
-      localUpdater.reset(
-          new SgdLocalUpdater(config_->getOptConfig()));  // do nothing
-      parameterUpdater_.reset(
-          new SparseRemoteParameterUpdaterComposite(config_->getOptConfig(),
-                                                    intconfig_->num_passes,
-                                                    testing,
-                                                    std::move(localUpdater)));
-    } else {
-      if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode &&
-          !intconfig_->use_old_updater) {
-        intconfig_->use_old_updater = true;
-        LOG(INFO) << "Sgd sparse training can not work with"
-                  << " ConcurrentRemoteParameterUpdater,"
-                  << " automatically reset --use_old_updater=true";
-      }
-
-      std::unique_ptr<ParameterUpdater> localUpdater;
-      if (config_->getOptConfig().num_batches_per_send_parameter() > 1) {
-        CHECK(alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD)
-            << "Unsupported algorithm in remote-local mode: " << alg;
-        if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) {
-          localUpdater.reset(new SgdThreadUpdater(*config_));
-        } else {
-          localUpdater.reset(new SgdLocalUpdater(*config_));
-        }
-      }
-
-      localUpdater.reset(
-          intconfig_->use_old_updater
-              ? new RemoteParameterUpdater(
-                    *config_, intconfig_->num_passes, std::move(localUpdater))
-              : new ConcurrentRemoteParameterUpdater(
-                    *config_, intconfig_->num_passes, std::move(localUpdater)));
-
-      if (config_->getOptConfig().use_sparse_remote_updater()) {
-        localUpdater.reset(
-            new SparseRemoteParameterUpdaterComposite(*config_,
-                                                      intconfig_->num_passes,
-                                                      testing,
-                                                      std::move(localUpdater)));
-      }
-
-      this->parameterUpdater_ = std::move(localUpdater);
-    }
-  } else {
-    CHECK_EQ(config_->getOptConfig().num_batches_per_send_parameter(), 1)
-        << "num_batches_per_send_parameter should be one in local mode!";
-
-    if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode) {
-      parameterUpdater_.reset(new SgdThreadUpdater(*config_));
-    } else if (alg == TrainAlgorithm::SGD || alg == TrainAlgorithm::AsyncSGD) {
-      if (config_->getModelConfig().type() == "recursive_nn") {
-        parameterUpdater_.reset(new SgdCpuUpdater(*config_));
-      } else if (intconfig_->use_gpu &&
-                 config_->getOptConfig().do_average_in_cpu() &&
-                 config_->getOptConfig().average_window() > 0) {
-        parameterUpdater_.reset(new SgdUpdaterWithCpuAverager(*config_));
-      } else {
-        parameterUpdater_.reset(new SgdLocalUpdater(*config_));
-      }
-    } else {
-      LOG(FATAL) << "Unsupported algorithm in local mode: " << alg;
-    }
-  }
-}
-
-void TrainerInternal::forwardBackwardBatch(const std::vector<Argument>& inArgs,
-                                           std::vector<Argument>& outArgs,
-                                           PassType& passType,
-                                           UpdateCallback updateCallback,
-                                           bool doPipelineUpdate) {
-  gradientMachine_->forwardBackward(
-      inArgs, &outArgs, passType, doPipelineUpdate ? updateCallback : nullptr);
-}
-
-}  // namespace paddle
diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h
deleted file mode 100644
index 7018faab24744f7a087a53130acc56ec6314101e..0000000000000000000000000000000000000000
--- a/paddle/trainer/TrainerInternal.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/utils/Util.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <fstream>
-
-#include "ParameterUpdater.h"
-#include "TrainerConfig.pb.h"
-#include "TrainerConfigHelper.h"
-#include "TrainerInternalConfig.h"
-#include "hl_gpu.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-
-namespace paddle {
-
-/**
- * TrainerInteral
- * the core training class for driving training logic
- */
-class TrainerInternal {
-public:
-  struct ParaStat {
-    real maxAbsGrad;
-    real avgAbsGrad;
-    ParaStat() : maxAbsGrad(.0), avgAbsGrad(.0) {}
-  };
-
-  TrainerInternal() {}
-
-  /**
-   * Intializes trainer internal class
-   * @param config network config
-   * @param machine gradient machine
-   * @param intconfig training config
-   * @param stats training stats
-   * @param testing if it is in testing phase
-   */
-  void init(const std::shared_ptr<TrainerConfigHelper>& config,
-            const GradientMachinePtr& machine,
-            std::unique_ptr<TrainerInternalConfig>&& intconfig,
-            const std::shared_ptr<TrainerStats>& stats,
-            bool testing);
-
-  virtual ~TrainerInternal() {}
-
-  /**
-   * CreateParameterUpdater
-   * @param testing if it is in testing phase
-   */
-  void createParameterUpdater(bool testing);
-
-  /**
-   * FinishTrainPass
-   * @param passId current pass id
-   * @param batchId current batch id, starts from 0
-   */
-  void finishTrainPass(int passId, int batchId);
-
-  /**
-   * trainOneBatch
-   * @param batchId current batch id
-   * @param dataBatch data for the batch
-   */
-  void trainOneBatch(int64_t batchId,
-                     const DataBatch& dataBatch,
-                     std::vector<Argument>* outArgs);
-
-  /**
-   * showParameterStats
-   * @param paraStats training stats
-   */
-  void showParameterStats(const std::vector<ParaStat>& paraStats);
-
-  /**
-   * getGradientMachine
-   */
-  inline const GradientMachinePtr& getGradientMachine() const {
-    return gradientMachine_;
-  }
-
-  /**
-   * getParameterUpdater
-   */
-  inline const std::shared_ptr<ParameterUpdater>& getParameterUpdater() {
-    return parameterUpdater_;
-  }
-
-  /**
-   * setCurrentEvaluator
-   * @param eval evaluator to set
-   */
-  inline void setCurrentEvaluator(Evaluator* eval) { currentEvaluator_ = eval; }
-
-  /**
-   * setEvaluator
-   * @param eval evaluator to set
-   */
-  inline void setEvaluator(Evaluator* eval) { evaluator_ = eval; }
-
-  /**
-   * forwardBackwardBatch
-   * @param inArgs input argument for data batch
-   * @param outArgs output argument from neural network
-   * @param updateCallback layerwise parameter gradient statistics
-   * @param doPipelineUpdate whether to do pipeline update
-   */
-  virtual void forwardBackwardBatch(const std::vector<Argument>& inArgs,
-                                    std::vector<Argument>& outArgs,
-                                    PassType& passType,
-                                    UpdateCallback updateCallback,
-                                    bool doPipelineUpdate);
-
-protected:
-  std::shared_ptr<ParameterUpdater> parameterUpdater_;
-  GradientMachinePtr gradientMachine_;
-  std::shared_ptr<TrainerConfigHelper> config_;
-  std::unique_ptr<TrainerInternalConfig> intconfig_;
-  std::shared_ptr<TrainerStats> stats_;
-  Evaluator* currentEvaluator_;
-  Evaluator* evaluator_;
-};
-
-}  // namespace paddle
diff --git a/paddle/trainer/TrainerInternalConfig.h b/paddle/trainer/TrainerInternalConfig.h
deleted file mode 100644
index b47692720efc2ed4f2db84f61ca81fcb52d234c0..0000000000000000000000000000000000000000
--- a/paddle/trainer/TrainerInternalConfig.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/utils/Util.h"
-
-#include <stdio.h>
-
-#include "hl_gpu.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-
-#include "TrainerConfig.pb.h"
-
-#include <stdlib.h>
-#include <fstream>
-#include <sstream>
-#include "ParameterUpdater.h"
-
-namespace paddle {
-/**
- * @brief TrainerStats object will statistics sample processed and total cost.
- *
- * There are two stats in it, the 'AvgCost' and 'CurrentAvgCost'. 'AvgCost'
- * means cost through one pass(all mini-batches). 'CurrentAvgCost' means cost
- * through one mini-batch.
- */
-class TrainerStats {
-public:
-  /**
-   * @brief reset all stats.
-   *
-   * often used before pass start.
-   */
-  inline void reset() {
-    numProcessed_ = 0;
-    totalCost_ = .0;
-    this->resetCurrentStat();
-  }
-
-  /**
-   * @brief reset current stat.
-   *
-   * 'current' means the most recent --log_period mini-batches
-   */
-  inline void resetCurrentStat() {
-    currentCost_ = .0;
-    currentSamples_ = 0;
-  }
-
-  /**
-   * @brief add cost to stat.
-   * @param numProcessed current mini-batch size
-   * @param cost current mini-batch cost
-   */
-  inline void addCost(int64_t numProcessed, real cost) {
-    this->numProcessed_ += numProcessed;
-    this->totalCost_ += cost;
-    this->currentSamples_ += numProcessed;
-    this->currentCost_ += cost;
-  }
-
-  /**
-   * @brief get average cost through on pass(all processed mini-batches)
-   * @return pass average cost
-   */
-  inline real getAvgCost() const {
-    CHECK_NE(this->numProcessed_, 0);
-    return this->totalCost_ / this->numProcessed_;
-  }
-
-  /**
-   * @brief get current mini-batch's average cost.
-   * @return mini-batch average cost
-   */
-  inline real getCurrentAvgCost() const {
-    CHECK_NE(this->currentSamples_, 0);
-    return this->currentCost_ / this->currentSamples_;
-  }
-
-  /**
-   * @brief get all processed samples' number
-   * @return all processed samples' number
-   */
-  inline int64_t getNumProcessed() const { return this->numProcessed_; }
-
-  /**
-   * @brief same function as addCost. But it is simple to invoke.
-   * For example:
-   *
-   * @code{.cpp}
-   * TrainerStats stat;
-   * cost = neuralNetwork.forward(batchSize);
-   * stat += {batchSize, cost};
-   * @endcode
-   *
-   * @param p a pair of parameter, first is numProcessed, second is cost.
-   * @return *this
-   */
-  inline TrainerStats& operator+=(const std::pair<int64_t, real>& p) {
-    this->addCost(p.first, p.second);
-    return *this;
-  }
-
-  /**
-   * @brief TrainerStats Constructor.
-   *
-   * reset stat when constructed.
-   */
-  inline TrainerStats() { this->reset(); }
-
-  /**
-   * @brief show stats to ostream.
-   *
-   * If there is no need to print current cost, set withCurrentCost to False.
-   *
-   * @param os output stream.
-   * @param withCurrentCost print current cost or not.
-   */
-  void showStats(std::ostream& os, bool withCurrentCost = true) const {
-    os << "samples=" << this->getNumProcessed()
-       << " AvgCost=" << this->getAvgCost();
-    if (withCurrentCost) {
-      os << " CurrentCost=" << this->getCurrentAvgCost();
-    }
-  }
-
-  /**
-   * @brief get stats to std::string
-   * @param withCurrentCost return current cost or not
-   * @return stats string
-   */
-  std::string getStats(bool withCurrentCost = true) const {
-    std::ostringstream os;
-    this->showStats(os, withCurrentCost);
-    return os.str();
-  }
-
-private:
-  int64_t numProcessed_;
-  real totalCost_;
-  real currentCost_;
-  int64_t currentSamples_;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const TrainerStats& stats) {
-  stats.showStats(os);
-  return os;
-}
-
-/**
- * TrainerInternalConfig
- * general configs for training
- */
-struct TrainerInternalConfig {
-  /**
-   * @brief Create TrainerInternalConfig from GradientMachine::CreateMode and
-   * command line arguments.
-   * @param mode
-   * @return
-   */
-  static std::unique_ptr<TrainerInternalConfig> createFromMode(
-      GradientMachine::CreateMode mode);
-
-  /**
-   * indicate whether the training is local
-   * if local, no parameter server is used
-   */
-  bool local;
-
-  /**
-   * indicate whether training uses GPU
-   */
-  bool use_gpu;
-
-  /**
-   * indicate number of trainer
-   */
-  int trainer_count;
-
-  /**
-   * how frequently to show param stats
-   */
-  int show_param_stats_period;
-
-  /**
-   * current trainer id
-   */
-  int trainer_id;
-
-  /**
-   * frequency to dump log
-   */
-  int log_period;
-
-  /**
-   * dot period
-   */
-  int dot_period;
-
-  /**
-   * num passes for training
-   */
-  int num_passes;
-
-  /**
-   * use old updater
-   */
-  bool use_old_updater;
-
-  /**
-   * whether to load and save parameter in pserver
-   */
-  bool loadsave_parameters_in_pserver;
-
-  /**
-   * training mode
-   */
-  GradientMachine::CreateMode mode;
-};
-
-}  //  namespace paddle
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
deleted file mode 100644
index c5c1d484e5f85c774fd4b8f1d4a8d46abfa2f547..0000000000000000000000000000000000000000
--- a/paddle/trainer/TrainerMain.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fenv.h>
-#include "paddle/pserver/ParameterServerController.h"
-#include "paddle/utils/PythonUtil.h"
-
-#include "ParamUtil.h"
-#include "Trainer.h"
-
-DEFINE_bool(start_pserver, false, "Whether to start pserver");
-DECLARE_int32(gpu_id);
-DEFINE_string(job, "train", "one of (train, test, checkgrad)");
-DECLARE_int32(start_pass);
-DECLARE_string(config);
-DECLARE_string(init_model_path);
-DECLARE_string(rdma_tcp);
-
-using namespace paddle;  // NOLINT
-
-int main(int argc, char** argv) {
-  // write logs instantly (never buffer log messages)
-  FLAGS_logbuflevel = -1;
-
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
-  if (FLAGS_start_pserver) {
-    parameterServerPtr.reset(
-        paddle::ParameterServerController::createFromGflags());
-    parameterServerPtr->start();
-  }
-  Trainer trainer;
-  auto config = TrainerConfigHelper::createFromFlags();
-  CHECK(config != nullptr) << "no valid config";
-
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-  trainer.init(config, FLAGS_job == "test");
-
-  if (FLAGS_job == "train") {
-    trainer.train();
-  } else if (FLAGS_job == "checkgrad") {
-    trainer.checkGradient();
-  } else if (FLAGS_job == "test") {
-    trainer.test();
-  } else if (FLAGS_job == "time") {
-    trainer.time();
-  } else {
-    LOG(FATAL) << "Unknown job type: " << FLAGS_job;
-  }
-
-  return 0;
-}
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
deleted file mode 100644
index bd518d8598f5aa7c32298ed2110a96a2743536b3..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/CMakeLists.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-set(PYTHON_PATH 
-   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
-   ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests)
-function(trainer_test TARGET)
-  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
-  add_test(NAME ${TARGET}
-    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-endfunction()
-
-trainer_test(test_Compare)
-trainer_test(test_PyDataProviderWrapper)
-trainer_test(test_recurrent_machine_generation)
-trainer_test(test_Trainer)
-
-############### test_TrainerOnePass ##########################
-if(WITH_PYTHON)
-  # only run test_TrainerOnePass when PYTHON is enabled, because train one pass
-  # is using PyDataProvider2.
-  add_unittest_without_exec(test_TrainerOnePass
-      test_TrainerOnePass.cpp)
-  add_test(NAME test_TrainerOnePass
-    COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port 
-          ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-endif()
-
-#################### test_config_parser #########################
-add_test(NAME test_config_parser
-  COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} 
-        ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
diff --git a/paddle/trainer/tests/config_parser_test.py b/paddle/trainer/tests/config_parser_test.py
deleted file mode 100644
index db66ebb5b7c13fe53df14a07918aad62ba895ffa..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/config_parser_test.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.config_parser import parse_config_and_serialize
-
-if __name__ == '__main__':
-    parse_config_and_serialize('trainer/tests/test_config.conf', '')
-    parse_config_and_serialize(
-        'trainer/tests/sample_trainer_config.conf',
-        'extension_module_name=paddle.trainer.config_parser_extension')
-    parse_config_and_serialize('gserver/tests/pyDataProvider/trainer.conf', '')
diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h
deleted file mode 100644
index eaa8b9baf6e4e753a441ab77811f494cbdab80cf..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/picojson.h
+++ /dev/null
@@ -1,1103 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-/*
- * Copyright 2009-2010 Cybozu Labs, Inc.
- * Copyright 2011-2014 Kazuho Oku
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright notice,
- *    this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright notice,
- *    this list of conditions and the following disclaimer in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef picojson_h
-#define picojson_h
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <iostream>
-#include <iterator>
-#include <limits>
-#include <map>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-// for isnan/isinf
-#if __cplusplus >= 201103L
-#include <cmath>
-#else
-extern "C" {
-#ifdef _MSC_VER
-#include <float.h>
-#elif defined(__INTEL_COMPILER)
-#include <mathimf.h>
-#else
-#include <math.h>
-#endif
-}
-#endif
-
-// experimental support for int64_t (see README.mkdn for detail)
-#ifdef PICOJSON_USE_INT64
-#define __STDC_FORMAT_MACROS
-#include <errno.h>
-#include <inttypes.h>
-#endif
-
-// to disable the use of localeconv(3), set PICOJSON_USE_LOCALE to 0
-#ifndef PICOJSON_USE_LOCALE
-#define PICOJSON_USE_LOCALE 1
-#endif
-#if PICOJSON_USE_LOCALE
-extern "C" {
-#include <locale.h>
-}
-#endif
-
-#ifndef PICOJSON_ASSERT
-#define PICOJSON_ASSERT(e)                  \
-  do {                                      \
-    if (!(e)) throw std::runtime_error(#e); \
-  } while (0)
-#endif
-
-#ifdef _MSC_VER
-#define SNPRINTF _snprintf_s
-#pragma warning(push)
-#pragma warning(disable : 4244)  // conversion from int to char
-#pragma warning(disable : 4127)  // conditional expression is constant
-#pragma warning(disable : 4702)  // unreachable code
-#else
-#define SNPRINTF snprintf
-#endif
-
-namespace picojson {
-
-enum {
-  null_type,
-  boolean_type,
-  number_type,
-  string_type,
-  array_type,
-  object_type
-#ifdef PICOJSON_USE_INT64
-  ,
-  int64_type
-#endif
-};
-
-enum { INDENT_WIDTH = 2 };
-
-struct null {};
-
-class value {
-public:
-  typedef std::vector<value> array;
-  typedef std::map<std::string, value> object;
-  union _storage {
-    bool boolean_;
-    double number_;
-#ifdef PICOJSON_USE_INT64
-    int64_t int64_;
-#endif
-    std::string* string_;
-    array* array_;
-    object* object_;
-  };
-
-protected:
-  int type_;
-  _storage u_;
-
-public:
-  value();
-  value(int type, bool);
-  explicit value(bool b);
-#ifdef PICOJSON_USE_INT64
-  explicit value(int64_t i);
-#endif
-  explicit value(double n);
-  explicit value(const std::string& s);
-  explicit value(const array& a);
-  explicit value(const object& o);
-  explicit value(const char* s);
-  value(const char* s, size_t len);
-  ~value();
-  value(const value& x);
-  value& operator=(const value& x);
-  void swap(value& x);
-  template <typename T>
-  bool is() const;
-  template <typename T>
-  const T& get() const;
-  template <typename T>
-  T& get();
-  bool evaluate_as_boolean() const;
-  const value& get(size_t idx) const;
-  const value& get(const std::string& key) const;
-  value& get(size_t idx);
-  value& get(const std::string& key);
-
-  bool contains(size_t idx) const;
-  bool contains(const std::string& key) const;
-  std::string to_str() const;
-  template <typename Iter>
-  void serialize(Iter os, bool prettify = false) const;
-  std::string serialize(bool prettify = false) const;
-
-private:
-  template <typename T>
-  value(const T*);  // intentionally defined to block implicit conversion of
-                    // pointer to bool
-  template <typename Iter>
-  static void _indent(Iter os, int indent);
-  template <typename Iter>
-  void _serialize(Iter os, int indent) const;
-  std::string _serialize(int indent) const;
-};
-
-typedef value::array array;
-typedef value::object object;
-
-inline value::value() : type_(null_type) {}
-
-inline value::value(int type, bool) : type_(type) {
-  switch (type) {
-#define INIT(p, v) \
-  case p##type:    \
-    u_.p = v;      \
-    break
-    INIT(boolean_, false);
-    INIT(number_, 0.0);
-#ifdef PICOJSON_USE_INT64
-    INIT(int64_, 0);
-#endif
-    INIT(string_, new std::string());
-    INIT(array_, new array());
-    INIT(object_, new object());
-#undef INIT
-    default:
-      break;
-  }
-}
-
-inline value::value(bool b) : type_(boolean_type) { u_.boolean_ = b; }
-
-#ifdef PICOJSON_USE_INT64
-inline value::value(int64_t i) : type_(int64_type) { u_.int64_ = i; }
-#endif
-
-inline value::value(double n) : type_(number_type) {
-  if (
-#ifdef _MSC_VER
-      !_finite(n)
-#elif __cplusplus >= 201103L || !(defined(isnan) && defined(isinf))
-      std::isnan(n) || std::isinf(n)
-#else
-      isnan(n) || isinf(n)
-#endif
-          ) {
-    throw std::overflow_error("");
-  }
-  u_.number_ = n;
-}
-
-inline value::value(const std::string& s) : type_(string_type) {
-  u_.string_ = new std::string(s);
-}
-
-inline value::value(const array& a) : type_(array_type) {
-  u_.array_ = new array(a);
-}
-
-inline value::value(const object& o) : type_(object_type) {
-  u_.object_ = new object(o);
-}
-
-inline value::value(const char* s) : type_(string_type) {
-  u_.string_ = new std::string(s);
-}
-
-inline value::value(const char* s, size_t len) : type_(string_type) {
-  u_.string_ = new std::string(s, len);
-}
-
-inline value::~value() {
-  switch (type_) {
-#define DEINIT(p) \
-  case p##type:   \
-    delete u_.p;  \
-    break
-    DEINIT(string_);
-    DEINIT(array_);
-    DEINIT(object_);
-#undef DEINIT
-    default:
-      break;
-  }
-}
-
-inline value::value(const value& x) : type_(x.type_) {
-  switch (type_) {
-#define INIT(p, v) \
-  case p##type:    \
-    u_.p = v;      \
-    break
-    INIT(string_, new std::string(*x.u_.string_));
-    INIT(array_, new array(*x.u_.array_));
-    INIT(object_, new object(*x.u_.object_));
-#undef INIT
-    default:
-      u_ = x.u_;
-      break;
-  }
-}
-
-inline value& value::operator=(const value& x) {
-  if (this != &x) {
-    value t(x);
-    swap(t);
-  }
-  return *this;
-}
-
-inline void value::swap(value& x) {
-  std::swap(type_, x.type_);
-  std::swap(u_, x.u_);
-}
-
-#define IS(ctype, jtype)                 \
-  template <>                            \
-  inline bool value::is<ctype>() const { \
-    return type_ == jtype##_type;        \
-  }
-IS(null, null)
-IS(bool, boolean)
-#ifdef PICOJSON_USE_INT64
-IS(int64_t, int64)
-#endif
-IS(std::string, string)
-IS(array, array)
-IS(object, object)
-#undef IS
-template <>
-inline bool value::is<double>() const {
-  return type_ == number_type
-#ifdef PICOJSON_USE_INT64
-         || type_ == int64_type
-#endif
-      ;
-}
-
-#define GET(ctype, var)                                                    \
-  template <>                                                              \
-  inline const ctype& value::get<ctype>() const {                          \
-    PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && \
-                    is<ctype>());                                          \
-    return var;                                                            \
-  }                                                                        \
-  template <>                                                              \
-  inline ctype& value::get<ctype>() {                                      \
-    PICOJSON_ASSERT("type mismatch! call is<type>() before get<type>()" && \
-                    is<ctype>());                                          \
-    return var;                                                            \
-  }
-GET(bool, u_.boolean_)
-GET(std::string, *u_.string_)
-GET(array, *u_.array_)
-GET(object, *u_.object_)
-#ifdef PICOJSON_USE_INT64
-GET(double,
-    (type_ == int64_type && (const_cast<value*>(this)->type_ = number_type,
-                             const_cast<value*>(this)->u_.number_ = u_.int64_),
-     u_.number_))
-GET(int64_t, u_.int64_)
-#else
-GET(double, u_.number_)
-#endif
-#undef GET
-
-inline bool value::evaluate_as_boolean() const {
-  switch (type_) {
-    case null_type:
-      return false;
-    case boolean_type:
-      return u_.boolean_;
-    case number_type:
-      return u_.number_ != 0;
-#ifdef PICOJSON_USE_INT64
-    case int64_type:
-      return u_.int64_ != 0;
-#endif
-    case string_type:
-      return !u_.string_->empty();
-    default:
-      return true;
-  }
-}
-
-inline const value& value::get(size_t idx) const {
-  static value s_null;
-  PICOJSON_ASSERT(is<array>());
-  return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
-}
-
-inline value& value::get(size_t idx) {
-  static value s_null;
-  PICOJSON_ASSERT(is<array>());
-  return idx < u_.array_->size() ? (*u_.array_)[idx] : s_null;
-}
-
-inline const value& value::get(const std::string& key) const {
-  static value s_null;
-  PICOJSON_ASSERT(is<object>());
-  object::const_iterator i = u_.object_->find(key);
-  return i != u_.object_->end() ? i->second : s_null;
-}
-
-inline value& value::get(const std::string& key) {
-  static value s_null;
-  PICOJSON_ASSERT(is<object>());
-  object::iterator i = u_.object_->find(key);
-  return i != u_.object_->end() ? i->second : s_null;
-}
-
-inline bool value::contains(size_t idx) const {
-  PICOJSON_ASSERT(is<array>());
-  return idx < u_.array_->size();
-}
-
-inline bool value::contains(const std::string& key) const {
-  PICOJSON_ASSERT(is<object>());
-  object::const_iterator i = u_.object_->find(key);
-  return i != u_.object_->end();
-}
-
-inline std::string value::to_str() const {
-  switch (type_) {
-    case null_type:
-      return "null";
-    case boolean_type:
-      return u_.boolean_ ? "true" : "false";
-#ifdef PICOJSON_USE_INT64
-    case int64_type: {
-      char buf[sizeof("-9223372036854775808")];
-      SNPRINTF(buf, sizeof(buf), "%" PRId64, u_.int64_);
-      return buf;
-    }
-#endif
-    case number_type: {
-      char buf[256];
-      double tmp;
-      SNPRINTF(buf,
-               sizeof(buf),
-               fabs(u_.number_) < (1ULL << 53) && modf(u_.number_, &tmp) == 0
-                   ? "%.f"
-                   : "%.17g",
-               u_.number_);
-#if PICOJSON_USE_LOCALE
-      char* decimal_point = localeconv()->decimal_point;
-      if (strcmp(decimal_point, ".") != 0) {
-        size_t decimal_point_len = strlen(decimal_point);
-        for (char* p = buf; *p != '\0'; ++p) {
-          if (strncmp(p, decimal_point, decimal_point_len) == 0) {
-            return std::string(buf, p) + "." + (p + decimal_point_len);
-          }
-        }
-      }
-#endif
-      return buf;
-    }
-    case string_type:
-      return *u_.string_;
-    case array_type:
-      return "array";
-    case object_type:
-      return "object";
-    default:
-      PICOJSON_ASSERT(0);
-#ifdef _MSC_VER
-      __assume(0);
-#endif
-  }
-  return std::string();
-}
-
-template <typename Iter>
-void copy(const std::string& s, Iter oi) {
-  std::copy(s.begin(), s.end(), oi);
-}
-
-template <typename Iter>
-void serialize_str(const std::string& s, Iter oi) {
-  *oi++ = '"';
-  for (std::string::const_iterator i = s.begin(); i != s.end(); ++i) {
-    switch (*i) {
-#define MAP(val, sym) \
-  case val:           \
-    copy(sym, oi);    \
-    break
-      MAP('"', "\\\"");
-      MAP('\\', "\\\\");
-      MAP('/', "\\/");
-      MAP('\b', "\\b");
-      MAP('\f', "\\f");
-      MAP('\n', "\\n");
-      MAP('\r', "\\r");
-      MAP('\t', "\\t");
-#undef MAP
-      default:
-        if (static_cast<unsigned char>(*i) < 0x20 || *i == 0x7f) {
-          char buf[7];
-          SNPRINTF(buf, sizeof(buf), "\\u%04x", *i & 0xff);
-          copy(buf, buf + 6, oi);
-        } else {
-          *oi++ = *i;
-        }
-        break;
-    }
-  }
-  *oi++ = '"';
-}
-
-template <typename Iter>
-void value::serialize(Iter oi, bool prettify) const {
-  return _serialize(oi, prettify ? 0 : -1);
-}
-
-inline std::string value::serialize(bool prettify) const {
-  return _serialize(prettify ? 0 : -1);
-}
-
-template <typename Iter>
-void value::_indent(Iter oi, int indent) {
-  *oi++ = '\n';
-  for (int i = 0; i < indent * INDENT_WIDTH; ++i) {
-    *oi++ = ' ';
-  }
-}
-
-template <typename Iter>
-void value::_serialize(Iter oi, int indent) const {
-  switch (type_) {
-    case string_type:
-      serialize_str(*u_.string_, oi);
-      break;
-    case array_type: {
-      *oi++ = '[';
-      if (indent != -1) {
-        ++indent;
-      }
-      for (array::const_iterator i = u_.array_->begin(); i != u_.array_->end();
-           ++i) {
-        if (i != u_.array_->begin()) {
-          *oi++ = ',';
-        }
-        if (indent != -1) {
-          _indent(oi, indent);
-        }
-        i->_serialize(oi, indent);
-      }
-      if (indent != -1) {
-        --indent;
-        if (!u_.array_->empty()) {
-          _indent(oi, indent);
-        }
-      }
-      *oi++ = ']';
-      break;
-    }
-    case object_type: {
-      *oi++ = '{';
-      if (indent != -1) {
-        ++indent;
-      }
-      for (object::const_iterator i = u_.object_->begin();
-           i != u_.object_->end();
-           ++i) {
-        if (i != u_.object_->begin()) {
-          *oi++ = ',';
-        }
-        if (indent != -1) {
-          _indent(oi, indent);
-        }
-        serialize_str(i->first, oi);
-        *oi++ = ':';
-        if (indent != -1) {
-          *oi++ = ' ';
-        }
-        i->second._serialize(oi, indent);
-      }
-      if (indent != -1) {
-        --indent;
-        if (!u_.object_->empty()) {
-          _indent(oi, indent);
-        }
-      }
-      *oi++ = '}';
-      break;
-    }
-    default:
-      copy(to_str(), oi);
-      break;
-  }
-  if (indent == 0) {
-    *oi++ = '\n';
-  }
-}
-
-inline std::string value::_serialize(int indent) const {
-  std::string s;
-  _serialize(std::back_inserter(s), indent);
-  return s;
-}
-
-template <typename Iter>
-class input {
-protected:
-  Iter cur_, end_;
-  int last_ch_;
-  bool ungot_;
-  int line_;
-
-public:
-  input(const Iter& first, const Iter& last)
-      : cur_(first), end_(last), last_ch_(-1), ungot_(false), line_(1) {}
-  int getc() {
-    if (ungot_) {
-      ungot_ = false;
-      return last_ch_;
-    }
-    if (cur_ == end_) {
-      last_ch_ = -1;
-      return -1;
-    }
-    if (last_ch_ == '\n') {
-      line_++;
-    }
-    last_ch_ = *cur_ & 0xff;
-    ++cur_;
-    return last_ch_;
-  }
-  void ungetc() {
-    if (last_ch_ != -1) {
-      PICOJSON_ASSERT(!ungot_);
-      ungot_ = true;
-    }
-  }
-  Iter cur() const { return cur_; }
-  int line() const { return line_; }
-  void skip_ws() {
-    while (1) {
-      int ch = getc();
-      if (!(ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r')) {
-        ungetc();
-        break;
-      }
-    }
-  }
-  bool expect(int expect) {
-    skip_ws();
-    if (getc() != expect) {
-      ungetc();
-      return false;
-    }
-    return true;
-  }
-  bool match(const std::string& pattern) {
-    for (std::string::const_iterator pi(pattern.begin()); pi != pattern.end();
-         ++pi) {
-      if (getc() != *pi) {
-        ungetc();
-        return false;
-      }
-    }
-    return true;
-  }
-};
-
-template <typename Iter>
-inline int _parse_quadhex(input<Iter>& in) {
-  int uni_ch = 0, hex;
-  for (int i = 0; i < 4; i++) {
-    if ((hex = in.getc()) == -1) {
-      return -1;
-    }
-    if ('0' <= hex && hex <= '9') {
-      hex -= '0';
-    } else if ('A' <= hex && hex <= 'F') {
-      hex -= 'A' - 0xa;
-    } else if ('a' <= hex && hex <= 'f') {
-      hex -= 'a' - 0xa;
-    } else {
-      in.ungetc();
-      return -1;
-    }
-    uni_ch = uni_ch * 16 + hex;
-  }
-  return uni_ch;
-}
-
-template <typename String, typename Iter>
-inline bool _parse_codepoint(String& out, input<Iter>& in) {
-  int uni_ch;
-  if ((uni_ch = _parse_quadhex(in)) == -1) {
-    return false;
-  }
-  if (0xd800 <= uni_ch && uni_ch <= 0xdfff) {
-    if (0xdc00 <= uni_ch) {
-      // a second 16-bit of a surrogate pair appeared
-      return false;
-    }
-    // first 16-bit of surrogate pair, get the next one
-    if (in.getc() != '\\' || in.getc() != 'u') {
-      in.ungetc();
-      return false;
-    }
-    int second = _parse_quadhex(in);
-    if (!(0xdc00 <= second && second <= 0xdfff)) {
-      return false;
-    }
-    uni_ch = ((uni_ch - 0xd800) << 10) | ((second - 0xdc00) & 0x3ff);
-    uni_ch += 0x10000;
-  }
-  if (uni_ch < 0x80) {
-    out.push_back(uni_ch);
-  } else {
-    if (uni_ch < 0x800) {
-      out.push_back(0xc0 | (uni_ch >> 6));
-    } else {
-      if (uni_ch < 0x10000) {
-        out.push_back(0xe0 | (uni_ch >> 12));
-      } else {
-        out.push_back(0xf0 | (uni_ch >> 18));
-        out.push_back(0x80 | ((uni_ch >> 12) & 0x3f));
-      }
-      out.push_back(0x80 | ((uni_ch >> 6) & 0x3f));
-    }
-    out.push_back(0x80 | (uni_ch & 0x3f));
-  }
-  return true;
-}
-
-template <typename String, typename Iter>
-inline bool _parse_string(String& out, input<Iter>& in) {
-  while (1) {
-    int ch = in.getc();
-    if (ch < ' ') {
-      in.ungetc();
-      return false;
-    } else if (ch == '"') {
-      return true;
-    } else if (ch == '\\') {
-      if ((ch = in.getc()) == -1) {
-        return false;
-      }
-      switch (ch) {
-#define MAP(sym, val)   \
-  case sym:             \
-    out.push_back(val); \
-    break
-        MAP('"', '\"');
-        MAP('\\', '\\');
-        MAP('/', '/');
-        MAP('b', '\b');
-        MAP('f', '\f');
-        MAP('n', '\n');
-        MAP('r', '\r');
-        MAP('t', '\t');
-#undef MAP
-        case 'u':
-          if (!_parse_codepoint(out, in)) {
-            return false;
-          }
-          break;
-        default:
-          return false;
-      }
-    } else {
-      out.push_back(ch);
-    }
-  }
-  return false;
-}
-
-template <typename Context, typename Iter>
-inline bool _parse_array(Context& ctx, input<Iter>& in) {
-  if (!ctx.parse_array_start()) {
-    return false;
-  }
-  size_t idx = 0;
-  if (in.expect(']')) {
-    return ctx.parse_array_stop(idx);
-  }
-  do {
-    if (!ctx.parse_array_item(in, idx)) {
-      return false;
-    }
-    idx++;
-  } while (in.expect(','));
-  return in.expect(']') && ctx.parse_array_stop(idx);
-}
-
-template <typename Context, typename Iter>
-inline bool _parse_object(Context& ctx, input<Iter>& in) {
-  if (!ctx.parse_object_start()) {
-    return false;
-  }
-  if (in.expect('}')) {
-    return true;
-  }
-  do {
-    std::string key;
-    if (!in.expect('"') || !_parse_string(key, in) || !in.expect(':')) {
-      return false;
-    }
-    if (!ctx.parse_object_item(in, key)) {
-      return false;
-    }
-  } while (in.expect(','));
-  return in.expect('}');
-}
-
-template <typename Iter>
-inline std::string _parse_number(input<Iter>& in) {
-  std::string num_str;
-  while (1) {
-    int ch = in.getc();
-    if (('0' <= ch && ch <= '9') || ch == '+' || ch == '-' || ch == 'e' ||
-        ch == 'E') {
-      num_str.push_back(ch);
-    } else if (ch == '.') {
-#if PICOJSON_USE_LOCALE
-      num_str += localeconv()->decimal_point;
-#else
-      num_str.push_back('.');
-#endif
-    } else {
-      in.ungetc();
-      break;
-    }
-  }
-  return num_str;
-}
-
-template <typename Context, typename Iter>
-inline bool _parse(Context& ctx, input<Iter>& in) {
-  in.skip_ws();
-  int ch = in.getc();
-  switch (ch) {
-#define IS(ch, text, op)        \
-  case ch:                      \
-    if (in.match(text) && op) { \
-      return true;              \
-    } else {                    \
-      return false;             \
-    }
-    IS('n', "ull", ctx.set_null());
-    IS('f', "alse", ctx.set_bool(false));
-    IS('t', "rue", ctx.set_bool(true));
-#undef IS
-    case '"':
-      return ctx.parse_string(in);
-    case '[':
-      return _parse_array(ctx, in);
-    case '{':
-      return _parse_object(ctx, in);
-    default:
-      if (('0' <= ch && ch <= '9') || ch == '-') {
-        double f;
-        char* endp;
-        in.ungetc();
-        std::string num_str = _parse_number(in);
-        if (num_str.empty()) {
-          return false;
-        }
-#ifdef PICOJSON_USE_INT64
-        {
-          errno = 0;
-          intmax_t ival = strtoimax(num_str.c_str(), &endp, 10);
-          if (errno == 0 && std::numeric_limits<int64_t>::min() <= ival &&
-              ival <= std::numeric_limits<int64_t>::max() &&
-              endp == num_str.c_str() + num_str.size()) {
-            ctx.set_int64(ival);
-            return true;
-          }
-        }
-#endif
-        f = strtod(num_str.c_str(), &endp);
-        if (endp == num_str.c_str() + num_str.size()) {
-          ctx.set_number(f);
-          return true;
-        }
-        return false;
-      }
-      break;
-  }
-  in.ungetc();
-  return false;
-}
-
-class deny_parse_context {
-public:
-  bool set_null() { return false; }
-  bool set_bool(bool) { return false; }
-#ifdef PICOJSON_USE_INT64
-  bool set_int64(int64_t) { return false; }
-#endif
-  bool set_number(double) { return false; }
-  template <typename Iter>
-  bool parse_string(input<Iter>&) {
-    return false;
-  }
-  bool parse_array_start() { return false; }
-  template <typename Iter>
-  bool parse_array_item(input<Iter>&, size_t) {
-    return false;
-  }
-  bool parse_array_stop(size_t) { return false; }
-  bool parse_object_start() { return false; }
-  template <typename Iter>
-  bool parse_object_item(input<Iter>&, const std::string&) {
-    return false;
-  }
-};
-
-class default_parse_context {
-protected:
-  value* out_;
-
-public:
-  default_parse_context(value* out) : out_(out) {}
-  bool set_null() {
-    *out_ = value();
-    return true;
-  }
-  bool set_bool(bool b) {
-    *out_ = value(b);
-    return true;
-  }
-#ifdef PICOJSON_USE_INT64
-  bool set_int64(int64_t i) {
-    *out_ = value(i);
-    return true;
-  }
-#endif
-  bool set_number(double f) {
-    *out_ = value(f);
-    return true;
-  }
-  template <typename Iter>
-  bool parse_string(input<Iter>& in) {
-    *out_ = value(string_type, false);
-    return _parse_string(out_->get<std::string>(), in);
-  }
-  bool parse_array_start() {
-    *out_ = value(array_type, false);
-    return true;
-  }
-  template <typename Iter>
-  bool parse_array_item(input<Iter>& in, size_t) {
-    array& a = out_->get<array>();
-    a.push_back(value());
-    default_parse_context ctx(&a.back());
-    return _parse(ctx, in);
-  }
-  bool parse_array_stop(size_t) { return true; }
-  bool parse_object_start() {
-    *out_ = value(object_type, false);
-    return true;
-  }
-  template <typename Iter>
-  bool parse_object_item(input<Iter>& in, const std::string& key) {
-    object& o = out_->get<object>();
-    default_parse_context ctx(&o[key]);
-    return _parse(ctx, in);
-  }
-
-private:
-  default_parse_context(const default_parse_context&);
-  default_parse_context& operator=(const default_parse_context&);
-};
-
-class null_parse_context {
-public:
-  struct dummy_str {
-    void push_back(int) {}
-  };
-
-public:
-  null_parse_context() {}
-  bool set_null() { return true; }
-  bool set_bool(bool) { return true; }
-#ifdef PICOJSON_USE_INT64
-  bool set_int64(int64_t) { return true; }
-#endif
-  bool set_number(double) { return true; }
-  template <typename Iter>
-  bool parse_string(input<Iter>& in) {
-    dummy_str s;
-    return _parse_string(s, in);
-  }
-  bool parse_array_start() { return true; }
-  template <typename Iter>
-  bool parse_array_item(input<Iter>& in, size_t) {
-    return _parse(*this, in);
-  }
-  bool parse_array_stop(size_t) { return true; }
-  bool parse_object_start() { return true; }
-  template <typename Iter>
-  bool parse_object_item(input<Iter>& in, const std::string&) {
-    return _parse(*this, in);
-  }
-
-private:
-  null_parse_context(const null_parse_context&);
-  null_parse_context& operator=(const null_parse_context&);
-};
-
-// obsolete, use the version below
-template <typename Iter>
-inline std::string parse(value& out, Iter& pos, const Iter& last) {
-  std::string err;
-  pos = parse(out, pos, last, &err);
-  return err;
-}
-
-template <typename Context, typename Iter>
-inline Iter _parse(Context& ctx,
-                   const Iter& first,
-                   const Iter& last,
-                   std::string* err) {
-  input<Iter> in(first, last);
-  if (!_parse(ctx, in) && err != NULL) {
-    char buf[64];
-    SNPRINTF(buf, sizeof(buf), "syntax error at line %d near: ", in.line());
-    *err = buf;
-    while (1) {
-      int ch = in.getc();
-      if (ch == -1 || ch == '\n') {
-        break;
-      } else if (ch >= ' ') {
-        err->push_back(ch);
-      }
-    }
-  }
-  return in.cur();
-}
-
-template <typename Iter>
-inline Iter parse(value& out,
-                  const Iter& first,
-                  const Iter& last,
-                  std::string* err) {
-  default_parse_context ctx(&out);
-  return _parse(ctx, first, last, err);
-}
-
-inline std::string parse(value& out, const std::string& s) {
-  std::string err;
-  parse(out, s.begin(), s.end(), &err);
-  return err;
-}
-
-inline std::string parse(value& out, std::istream& is) {
-  std::string err;
-  parse(out,
-        std::istreambuf_iterator<char>(is.rdbuf()),
-        std::istreambuf_iterator<char>(),
-        &err);
-  return err;
-}
-
-template <typename T>
-struct last_error_t {
-  static std::string s;
-};
-template <typename T>
-std::string last_error_t<T>::s;
-
-inline void set_last_error(const std::string& s) { last_error_t<bool>::s = s; }
-
-inline const std::string& get_last_error() { return last_error_t<bool>::s; }
-
-inline bool operator==(const value& x, const value& y) {
-  if (x.is<null>()) return y.is<null>();
-#define PICOJSON_CMP(type) \
-  if (x.is<type>()) return y.is<type>() && x.get<type>() == y.get<type>()
-  PICOJSON_CMP(bool);
-  PICOJSON_CMP(double);
-  PICOJSON_CMP(std::string);
-  PICOJSON_CMP(array);
-  PICOJSON_CMP(object);
-#undef PICOJSON_CMP
-  PICOJSON_ASSERT(0);
-#ifdef _MSC_VER
-  __assume(0);
-#endif
-  return false;
-}
-
-inline bool operator!=(const value& x, const value& y) { return !(x == y); }
-}  // namespace picojson
-
-namespace std {
-template <>
-inline void swap(picojson::value& x, picojson::value& y) {
-  x.swap(y);
-}
-}  // namespace std
-
-inline std::istream& operator>>(std::istream& is, picojson::value& x) {
-  picojson::set_last_error(std::string());
-  std::string err = picojson::parse(x, is);
-  if (!err.empty()) {
-    picojson::set_last_error(err);
-    is.setstate(std::ios::failbit);
-  }
-  return is;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const picojson::value& x) {
-  x.serialize(std::ostream_iterator<char>(os));
-  return os;
-}
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
-#endif
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
deleted file mode 100644
index 0db50f34dd24b5e6fbc33a1e8dd3c16cb59eb56e..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.data
diff --git a/paddle/trainer/tests/sample_filelist.txt b/paddle/trainer/tests/sample_filelist.txt
deleted file mode 100644
index 7db4c735359a380dc150e24368653d2a6a55a453..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_filelist.txt
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/sample_data.txt
diff --git a/paddle/trainer/tests/sample_trainer_config.conf b/paddle/trainer/tests/sample_trainer_config.conf
deleted file mode 100644
index 2697832840f35a33c07f1664ef18a229d656d784..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config.conf
+++ /dev/null
@@ -1,87 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-            files = "trainer/tests/sample_filelist.txt",
-            feat_dim = 3,
-            context_len = 0,
-            buffer_capacity = 1000000))
-
-TestData(SimpleData(
-           files = "trainer/tests/sample_filelist.txt",
-           feat_dim = 3,
-           context_len = 0,
-           buffer_capacity = 1000000))
-
-settings(batch_size = 100)
-
-data = data_layer(name='input', size=3)
-
-fc1 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=data, size=9,
-               bias_attr=False,
-               act=LinearActivation())
-
-fc3 = fc_layer(input=data, size=3,
-               bias_attr=False,
-               act=TanhActivation())
-
-fc4 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=LinearActivation(),
-               param_attr=ParamAttr(name='sharew'))
-
-fc5 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=BReluActivation())
-
-fc6 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=SoftReluActivation())
-
-fc7 = fc_layer(input=data, size=3,
-               bias_attr=False,
-               act=SquareActivation())
-
-fc8 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               act=SquareActivation())
-
-with mixed_layer(size=3, act=SoftmaxActivation()) as layer9:
-    layer9 += full_matrix_projection(input=fc1)
-    layer9 += full_matrix_projection(input=fc2)
-    layer9 += full_matrix_projection(input=fc3)
-    layer9 += trans_full_matrix_projection(input=fc4,
-                                           param_attr=ParamAttr(name='sharew'))
-    layer9 += full_matrix_projection(input=fc5)
-    layer9 += full_matrix_projection(input=fc6)
-    layer9 += full_matrix_projection(input=fc7)
-    layer9 += full_matrix_projection(input=fc8)
-
-if get_config_arg('with_cost', bool, True):
-    # This is for training the neural network.
-    # We need to have another data layer for label
-    # and a layer for calculating cost
-    lbl = data_layer(name='label', size=1)
-    outputs(classification_cost(input=layer9, label=lbl))
-else:    
-    # This is for prediction where we don't have label
-    # and don't need to calculate cost
-    outputs(layer9)
diff --git a/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf b/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf
deleted file mode 100644
index e4abe31d480b69bc2ff4741649b336714818515b..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_hsigmoid.conf
+++ /dev/null
@@ -1,53 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-    files = "trainer/tests/sample_filelist.txt",
-    feat_dim = 3,
-    context_len = 0,
-    buffer_capacity = 1000000,
-))
-
-settings(batch_size = 100)
-
-data = data_layer(name='input', size=3)
-
-fc1 = fc_layer(input=data, size=12,
-               bias_attr=False,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=data, size=19,
-               bias_attr=False,
-               act=LinearActivation())
-
-fc3 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=TanhActivation())
-
-fc4 = fc_layer(input=data, size=5,
-               bias_attr=False,
-               act=LinearActivation())
-
-# This is for training the neural network.
-# We need to have another data layer for label
-# and a layer for calculating cost
-lbl = data_layer(name='label', size=1)
-
-outputs(hsigmoid(input=[fc1, fc2, fc3, fc4],
-                 label=lbl,
-                 num_classes=3))
diff --git a/paddle/trainer/tests/sample_trainer_config_parallel.conf b/paddle/trainer/tests/sample_trainer_config_parallel.conf
deleted file mode 100644
index e2b8b3ecdab83b4614dbe468c3a295c05867f7f9..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_config_parallel.conf
+++ /dev/null
@@ -1,86 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-            files = "trainer/tests/sample_filelist.txt",
-            feat_dim = 3,
-            context_len = 0,
-            buffer_capacity = 1000000))
-
-TestData(SimpleData(
-           files = "trainer/tests/sample_filelist.txt",
-           feat_dim = 3,
-           context_len = 0,
-           buffer_capacity = 1000000))
-
-settings(batch_size = 100)
-
-# Output layer, label layer, cost layer, preferably set to the same environment.
-output_device = 0
-
-# Input Layer does not need to specify the device number.
-data = data_layer(name='input', size=3)
-
-# Calculate in the CPU.
-fc1 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=-1),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 0.
-fc2 = fc_layer(input=fc1, size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=0),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 1.
-fc3 = fc_layer(input=fc1, size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=1),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 0.
-fc4 = fc_layer(input=[fc2,fc3], size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=0),
-               act=SigmoidActivation())
-
-# Calculate in the GPU 1.
-fc5 = fc_layer(input=[fc2,fc3], size=10,
-               bias_attr=True,
-               layer_attr=ExtraAttr(device=1),
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc4,fc5], size=10,
-                  bias_attr=True,
-                  layer_attr=ExtraAttr(device=output_device),
-                  act=SoftmaxActivation())
-
-if get_config_arg('with_cost', bool, True):
-    # This is for training the neural network.
-    # We need to have another data layer for label
-    # and a layer for calculating cost
-    lbl = data_layer(name='label', size=1,
-                    layer_attr=ExtraAttr(device=output_device))
-                    
-    outputs(classification_cost(input=output, 
-                                label=lbl,
-                                layer_attr=ExtraAttr(device=output_device)))
-else:
-    # This is for prediction where we don't have label
-    # and don't need to calculate cost
-    outputs(output)
diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
deleted file mode 100644
index 741a0aa71df7866c180ab2513f28638117d0f1ca..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
+++ /dev/null
@@ -1,73 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=15, learning_rate=0)
-
-num_words = 5
-beam_flag = get_config_arg('beam_search', bool, False)
-
-sent_id = data_layer(name="sent_id", size=1)
-
-# This layer has no actual use, but only to decide batch_size in generation.
-# When generating, at least one Memory in RecurrentLayer MUST have a boot layer.
-dummy_data = data_layer(name="dummy_data_input", size=2)
-
-def outer_step(dummy_data):
-
-    gen_inputs = [StaticInput(input=dummy_data, size=2, is_seq=True),
-                  GeneratedInput(size=num_words,
-                                 embedding_name="wordvec",
-                                 embedding_size=num_words)]
-
-    def inner_step(dummy_memory, predict_word):
-
-        # simplified RNN for testing
-        with mixed_layer(size=num_words) as layer:
-            layer += full_matrix_projection(input=predict_word,
-                                            param_attr=ParamAttr(name="transtable"))
-
-        with mixed_layer(size=num_words, act=ExpActivation()) as out:
-            out += trans_full_matrix_projection(input=layer,
-                                                param_attr=ParamAttr(name="wordvec"))
-
-        return out
-
-    beam_gen = beam_search(name="rnn_gen",
-                           step=inner_step,
-                           input=gen_inputs,
-                           bos_id=0,
-                           eos_id=num_words-1,
-                           beam_size=2 if beam_flag else 1,
-                           num_results_per_sample=1,
-                           max_length=10)
-    return beam_gen
-
-beam_gen_concat = recurrent_group(name="rnn_gen_concat",
-                                  step=outer_step,
-                                  input=[SubsequenceInput(dummy_data)])
-
-seqtext_printer_evaluator(input=beam_gen_concat,
-                          id_input=sent_id,
-                          dict_file="./trainer/tests/test_gen_dict.txt",
-                          result_file="./trainer/tests/dump_text.test")
-#outputs(beam_gen_concat)
-# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory
-# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs
-# as follows. Note that "__beam_search_predict__" is the default output name of beam_search.
-Inputs("sent_id","dummy_data_input")
-Outputs("__beam_search_predict__")
diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf
deleted file mode 100644
index 58d27f15ae1c0a38885ee105a7963b6e7bd55906..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf
+++ /dev/null
@@ -1,66 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=15, learning_rate=0)
-
-num_words = 5
-beam_flag = get_config_arg('beam_search', bool, False)
-
-sent_id = data_layer(name="sent_id", size=1)
-
-# This layer has no actual use, but only to decide batch_size in generation.
-# When generating, at least one Memory in RecurrentLayer MUST have a boot layer.
-dummy_data = data_layer(name="dummy_data_input", size=2)
-
-gen_inputs = [StaticInput(input=dummy_data, size=2),
-              GeneratedInput(size=num_words,
-                             embedding_name="wordvec",
-                             embedding_size=num_words)]
-
-def step(dummy_memory, predict_word):
-
-    # simplified RNN for testing
-    with mixed_layer(size=num_words) as layer:
-        layer += full_matrix_projection(input=predict_word,
-                                        param_attr=ParamAttr(name="transtable"))
-
-    with mixed_layer(size=num_words, act=ExpActivation()) as out:
-        out += trans_full_matrix_projection(input=layer,
-                                            param_attr=ParamAttr(name="wordvec"))
-
-    return out
-
-beam_gen = beam_search(name="rnn_gen",
-                       step=step,
-                       input=gen_inputs,
-                       bos_id=0,
-                       eos_id=num_words-1,
-                       beam_size=2 if beam_flag else 1,
-                       num_results_per_sample=2 if beam_flag else 1,
-                       max_length=10)
-
-seqtext_printer_evaluator(input=beam_gen,
-                          id_input=sent_id,
-                          dict_file="./trainer/tests/test_gen_dict.txt",
-                          result_file="./trainer/tests/dump_text.test")
-#outputs(beam_gen)
-# In this config, as dummy_data_input doesn't work on beam_gen (we can find dummy_memory
-# is read-only memory, and isn't used by other layers of step), we show the Inputs and Outputs
-# as follows. Note that "__beam_search_predict__" is the default output name of beam_search.
-Inputs("sent_id","dummy_data_input")
-Outputs("__beam_search_predict__")
diff --git a/paddle/trainer/tests/simple_sparse_neural_network.py b/paddle/trainer/tests/simple_sparse_neural_network.py
deleted file mode 100644
index 970fb466dc5061713fe7815d5247cbbde93be821..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/simple_sparse_neural_network.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4)
-
-file_list = 'trainer/tests/fake_file_list.list'
-
-define_py_data_sources2(
-    train_list=file_list,
-    test_list=file_list,
-    module="simple_sparse_neural_network_dp",
-    obj="process")
-
-embedding = embedding_layer(
-    input=data_layer(
-        name="word_ids", size=8191),
-    size=128,
-    param_attr=ParamAttr(sparse_update=True))
-prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation())
-
-outputs(
-    classification_cost(
-        input=prediction, label=data_layer(
-            name='label', size=10)))
diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp
deleted file mode 100644
index f3a964acb69be059a43470f7b68910a3b6cecaab..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_Compare.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/PythonUtil.h>
-
-#include "paddle/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-#include <cstdlib>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile = "trainer/tests/sample_trainer_config.conf";
-
-DECLARE_int32(gpu_id);
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_string(config_args);
-
-struct comData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(bool useGpu, comData& Data) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_config = configFile;
-
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig());
-
-  Data.parameters = trainer.getGradientMachine()->getParameters();
-  DataBatch dataBatch;
-  int32_t batchSize = trainer.getConfig().opt_config().batch_size();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  vector<Argument>& inArgs = dataBatch.getStreams();
-  trainer.getGradientMachine()->start();
-  for (int i = 0; i < 2; ++i) {
-    trainer.getGradientMachine()->forwardBackward(
-        inArgs, &Data.outArgs, PASS_TRAIN);
-  }
-  trainer.getGradientMachine()->finish();
-}
-
-void compareGradient(comData& comDataCpu, comData& comDataGpu);
-
-TEST(Trainer, create) {
-  int devCount = 0;
-  devCount = hl_get_device_count();
-  FLAGS_config_args = "drop_rate=0";
-
-  comData comDataCpu;
-  calcGradient(false, comDataCpu);
-  LOG(INFO) << "Cpu is completed";
-
-  {
-    LOG(INFO) << "Test GPU";
-    comData comData;
-    calcGradient(true, comData);
-    compareGradient(comDataCpu, comData);
-    LOG(INFO) << "Gpu is completed";
-  }
-
-  {
-    LOG(INFO) << "Test test multi gpu";
-    comData comData;
-    FLAGS_trainer_count = devCount;
-    calcGradient(true, comData);
-    compareGradient(comDataCpu, comData);
-    LOG(INFO) << "Gpu4 is completed";
-  }
-
-  {
-    LOG(INFO) << "Test use_sparse_update=true";
-    comData comData;
-    calcGradient(false, comData);
-    compareGradient(comDataCpu, comData);
-    LOG(INFO) << "Cpu4 is completed";
-  }
-}
-
-double checkBuffer(real* A, real* B, size_t len) {
-#ifdef PADDLE_TYPE_DOUBLE
-  double precision = 1e-7;
-#else
-  double precision = 2e-3;
-#endif
-  int nNum = 0;
-  double maxE = 0;
-  for (size_t i = 0; i < len; ++i) {
-    double e = fabs(A[i] - B[i]);
-    maxE = std::max(e, maxE);
-    nNum += e > precision * fabs(A[i]);
-  }
-  EXPECT_EQ(0, nNum);
-  return maxE;
-}
-
-void compareGradient(comData& comDataCpu, comData& comDataGpu) {
-  /*compare outArgs*/
-  vector<Argument> outArgs1 = comDataCpu.outArgs;
-  vector<Argument> outArgs2 = comDataGpu.outArgs;
-  CpuMatrix out1(outArgs1[0].value->getHeight(), outArgs1[0].value->getWidth());
-  CpuMatrix out2(outArgs2[0].value->getHeight(), outArgs2[0].value->getWidth());
-  out1.copyFrom(*outArgs1[0].value);
-  out2.copyFrom(*outArgs2[0].value);
-  checkBuffer(out1.getData(), out2.getData(), out1.getElementCnt());
-
-  /*compare parameters*/
-  vector<ParameterPtr>& parameters1 = comDataCpu.parameters;
-  vector<ParameterPtr>& parameters2 = comDataGpu.parameters;
-  for (size_t i = 0; i < parameters1.size(); ++i) {
-    ParameterPtr parameter1, parameter2;
-    parameter1 = parameters1[i];
-    parameter2 = parameters2[i];
-    /*compare parameters value*/
-    CpuVector para1(parameter1->getSize());
-    CpuVector para2(parameter2->getSize());
-    para1.copyFrom(*parameter1->getBuf(PARAMETER_VALUE));
-    para2.copyFrom(*parameter2->getBuf(PARAMETER_VALUE));
-    checkBuffer(para1.getData(), para2.getData(), para1.getSize());
-
-    /*compare parameters grad*/
-    CpuVector cpuGrad1(*parameter1->getBuf(PARAMETER_GRADIENT));
-    CpuVector cpuGrad2(*parameter2->getBuf(PARAMETER_GRADIENT));
-    double e =
-        checkBuffer(cpuGrad1.getData(), cpuGrad2.getData(), cpuGrad1.getSize());
-    LOG(INFO) << parameter1->getName() << " max error=" << e;
-  }
-}
-
-int main(int argc, char** argv) {
-#ifndef PADDLE_WITH_CUDA
-  exit(0);
-#endif
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  exit(ret);
-}
diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
deleted file mode 100644
index 92dc8aa9ec5ce281d1950d84260c1b9555e686a7..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_NO_PYTHON
-#include <DataConfig.pb.h>
-#include <gtest/gtest.h>
-#include <paddle/gserver/dataproviders/DataProvider.h>
-#include <paddle/math/Matrix.h>
-#include <paddle/parameter/Argument.h>
-#include <paddle/utils/PythonUtil.h>
-#include <fstream>
-#include <typeinfo>
-#include <unordered_map>
-#include <unordered_set>
-#include "picojson.h"
-
-void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
-const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/";
-
-TEST(PyDataProviderWrapper, SequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module("testPyDataWrapper");
-  conf.set_load_data_object("processSeqAndGenerateData");
-  conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json");
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(100, &batchFromPy);
-
-  picojson::value val;
-  std::fstream fin;
-  fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in);
-  EXPECT_TRUE(fin.is_open());
-  if (fin.is_open()) {
-    std::string err = picojson::parse(val, fin);
-    EXPECT_TRUE(err.empty());
-    EXPECT_TRUE(val.is<picojson::array>());
-    picojson::array& arr = val.get<picojson::array>();
-    std::vector<paddle::Argument>& arguments = batchFromPy.getStreams();
-    // CHECK Value
-    checkValue(arguments, arr);
-    // CHECK sequenceStartPositions
-    for (size_t i = 0; i < arr.size(); i++) {
-      int row_id = arr[i].get<picojson::array>().size();
-      EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]);
-      EXPECT_EQ((int)row_id,
-                arguments[i].sequenceStartPositions->getData(false)[1]);
-    }
-    fin.close();
-  }
-}
-
-TEST(PyDataProviderWrapper, HasSubSequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module("testPyDataWrapper");
-  conf.set_load_data_object("processSubSeqAndGenerateData");
-  conf.set_load_data_args(kDir + "test_pydata_provider_wrapper.json");
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(1, &batchFromPy);
-
-  picojson::value val;
-  std::fstream fin;
-  fin.open(kDir + "test_pydata_provider_wrapper.json", std::ios_base::in);
-  EXPECT_TRUE(fin.is_open());
-  if (fin.is_open()) {
-    std::string err = picojson::parse(val, fin);
-    EXPECT_TRUE(err.empty());
-    EXPECT_TRUE(val.is<picojson::array>());
-    picojson::array& arr = val.get<picojson::array>();
-    std::vector<paddle::Argument>& arguments = batchFromPy.getStreams();
-    // CHECK Value
-    checkValue(arguments, arr);
-    // CHECK sequenceStartPositions and subSequenceStartPositions
-    for (size_t i = 0; i < arr.size(); i++) {
-      int row_id = arr[i].get<picojson::array>().size();
-      EXPECT_EQ(0, arguments[i].sequenceStartPositions->getData(false)[0]);
-      EXPECT_EQ((int)row_id,
-                arguments[i].sequenceStartPositions->getData(false)[1]);
-      EXPECT_EQ(0, arguments[i].subSequenceStartPositions->getData(false)[0]);
-      EXPECT_EQ((int)row_id,
-                arguments[i].subSequenceStartPositions->getData(false)[1]);
-    }
-    fin.close();
-  }
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  paddle::initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-void checkValue(std::vector<paddle::Argument>& arguments,
-                picojson::array& arr) {
-  // CHECK SLOT 0, Sparse Value.
-  paddle::Argument& sparse_values_seq = arguments[0];
-  paddle::MatrixPtr& sparse_values_seq_rawmatrix = sparse_values_seq.value;
-  EXPECT_TRUE(sparse_values_seq_rawmatrix != nullptr);
-  paddle::CpuSparseMatrix* sparse_val_seq_sparse_mat =
-      dynamic_cast<paddle::CpuSparseMatrix*>(sparse_values_seq_rawmatrix.get());
-  EXPECT_TRUE(sparse_val_seq_sparse_mat != nullptr);
-  EXPECT_EQ(arr.size(), arguments.size());
-  EXPECT_TRUE(arr[0].is<picojson::array>());
-  size_t row_id = 0;
-  for (picojson::value& sparse_val_seq : arr[0].get<picojson::array>()) {
-    std::unordered_map<int, real> cols;
-    for (picojson::value& kv : sparse_val_seq.get<picojson::array>()) {
-      EXPECT_TRUE(kv.get(0).is<double>());
-      EXPECT_TRUE(kv.get(1).is<double>());
-      int col = (int)(kv.get(0).get<double>());
-      real val = (real)(kv.get(1).get<double>());
-      cols.insert({col, val});
-    }
-    size_t colNum = sparse_val_seq_sparse_mat->getColNum(row_id);
-    EXPECT_EQ(cols.size(), colNum);
-    int* rowIds = sparse_val_seq_sparse_mat->getRowCols(row_id);
-    real* rowBuf = sparse_val_seq_sparse_mat->getRowValues(row_id);
-    for (size_t i = 0; i < colNum; ++i) {
-      int id = rowIds[i];
-      auto it = cols.find(id);
-      EXPECT_NE(cols.end(), it);
-      real expect = it->second;
-      EXPECT_NEAR(expect, *rowBuf, 1e-5);
-      ++rowBuf;
-    }
-    ++row_id;
-  }
-
-  // CHECK SLOT 1, Dense Value.
-  paddle::Argument& dense_arg = arguments[1];
-  paddle::MatrixPtr& dense_mat = dense_arg.value;
-  EXPECT_NE(nullptr, dense_mat);
-  EXPECT_TRUE(arr[1].is<picojson::array>());
-  row_id = 0;
-  for (picojson::value& dense_seq : arr[1].get<picojson::array>()) {
-    EXPECT_TRUE(dense_seq.is<picojson::array>());
-    picojson::array& row = dense_seq.get<picojson::array>();
-    EXPECT_EQ(row.size(), dense_mat->getWidth());
-    real* rowBuf = dense_mat->getRowBuf(row_id++);
-
-    for (picojson::value& val : row) {
-      EXPECT_TRUE(val.is<double>());
-      real expect = val.get<double>();
-      EXPECT_NEAR(expect, *rowBuf++, 1e-5);
-    }
-  }
-
-  // CHECK SLOT 2, Sparse Non Value.
-  paddle::Argument& sparse_non_val_arg = arguments[2];
-  paddle::MatrixPtr& sparse_non_val_rawm = sparse_non_val_arg.value;
-  EXPECT_NE(nullptr, sparse_non_val_rawm);
-  paddle::CpuSparseMatrix* sparse_non_val_m =
-      dynamic_cast<paddle::CpuSparseMatrix*>(sparse_non_val_rawm.get());
-  EXPECT_NE(nullptr, sparse_non_val_m);
-  row_id = 0;
-  for (picojson::value& row : arr[2].get<picojson::array>()) {
-    EXPECT_TRUE(row.is<picojson::array>());
-    std::unordered_set<int> ids;
-    for (picojson::value& id : row.get<picojson::array>()) {
-      EXPECT_TRUE(id.is<double>());
-      ids.insert((int)(id.get<double>()));
-    }
-    size_t colNum = sparse_non_val_m->getColNum(row_id);
-    EXPECT_EQ(ids.size(), colNum);
-    for (size_t i = 0; i < colNum; ++i) {
-      int col = sparse_non_val_m->getRowCols(row_id)[i];
-      EXPECT_TRUE(ids.find(col) != ids.end());
-    }
-    ++row_id;
-  }
-
-  // CHECK SLOT 3, Index.
-  paddle::Argument& index_arg = arguments[3];
-  paddle::IVectorPtr indices = index_arg.ids;
-  EXPECT_NE(nullptr, indices);
-  int* idPtr = indices->getData();
-  for (picojson::value& id : arr[3].get<picojson::array>()) {
-    EXPECT_TRUE(id.is<double>());
-    int _id = (int)(id.get<double>());
-    EXPECT_EQ(_id, *idPtr++);
-  }
-
-  // CHECK SLOT 4, String.
-  paddle::Argument& strArg = arguments[4];
-  std::vector<std::string>* strPtr = strArg.strs.get();
-  EXPECT_NE(nullptr, strPtr);
-  size_t vecIndex = 0;
-  for (picojson::value& str : arr[4].get<picojson::array>()) {
-    EXPECT_TRUE(str.is<std::string>());
-    std::string _str = str.get<std::string>();
-    EXPECT_EQ(_str, (*strPtr)[vecIndex++]);
-  }
-}
-
-#else
-int main() { return 0; }
-
-#endif
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
deleted file mode 100644
index 394038cf730f13cb957fbbc5ae0e5719b8fe9db6..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ /dev/null
@@ -1,106 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/PythonUtil.h>
-#include <paddle/utils/Version.h>
-#include "paddle/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile1 = "trainer/tests/sample_trainer_config.conf";
-static const string& configFile2 =
-    "trainer/tests/sample_trainer_config_hsigmoid.conf";
-static const string& configFile4 =
-    "trainer/tests/sample_trainer_config_parallel.conf";
-
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_int32(gpu_id);
-DECLARE_bool(allow_only_one_model_on_one_gpu);
-
-void checkGradientTest(const string& configFile,
-                       bool useGpu,
-                       bool parallel,
-                       int trainerCount = 1) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_parallel_nn = parallel;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile;
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig());
-  EXPECT_LE(fabs(trainer.checkGradient()), 0.02);
-}
-
-TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
-
-TEST(checkGradient, multiGpu) {
-  int numGpu;
-  numGpu = hl_get_device_count();
-  for (auto count : {2, 4}) {
-    if (count <= numGpu) {
-      checkGradientTest(configFile1, true, false, count);
-    }
-  }
-}
-
-TEST(checkGradient, parallel) {
-  if (hl_get_device_count() >= 2) {
-    checkGradientTest(configFile4, true, true);
-  }
-}
-
-TEST(checkGradient, multiParallel) {
-  FLAGS_allow_only_one_model_on_one_gpu = false;
-  checkGradientTest(configFile4, true, true, 2);
-  FLAGS_allow_only_one_model_on_one_gpu = true;
-}
-
-#endif
-
-TEST(checkGradient, multi) {
-  int numGpu;
-  if (version::isWithGpu()) {
-    numGpu = hl_get_device_count();
-  } else {
-    numGpu = 0;
-  }
-  for (bool useGpu : {false, true}) {
-    for (auto count : {2, 4}) {
-      if (useGpu && count > numGpu) continue;
-      checkGradientTest(configFile1, useGpu, false, count);
-    }
-  }
-}
-
-TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
-
-TEST(checkGradient, non_parallel) {
-  checkGradientTest(configFile4, false, false);
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
deleted file mode 100644
index b2a93d4d5eea37ad716b59427f2aa4409d2f537d..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ /dev/null
@@ -1,317 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/GlobalConstants.h>
-#include <paddle/utils/PythonUtil.h>
-#include "paddle/trainer/Trainer.h"
-#include "paddle/trainer/TrainerInternal.h"
-
-#include <gtest/gtest.h>
-#include <paddle/pserver/ParameterServer2.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile1 = "trainer/tests/sample_trainer_config.conf";
-static const string& configFile2 =
-    "trainer/tests/sample_trainer_config_parallel.conf";
-
-static const string& configFileSimpleSparse =
-    "trainer/tests/simple_sparse_neural_network.py";
-
-DECLARE_bool(use_gpu);
-DECLARE_string(config);
-DECLARE_int32(gpu_id);
-DECLARE_int32(seed);
-DECLARE_int32(num_passes);
-DECLARE_int32(saving_period);
-
-class TrainerForTest : public paddle::Trainer {
-public:
-  inline const std::shared_ptr<ParameterUpdater>& getParameterUpdaterForTest() {
-    return this->trainerInternal_.getParameterUpdater();
-  }
-};
-
-int gNumDevices = 0;
-
-void trainerOnePassTest(const string& configFile,
-                        bool useGpu,
-                        bool parallel,
-                        int trainerCount = 1,
-                        double averageWindow = 0.0f,
-                        bool doAverageInCpu = false) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_parallel_nn = parallel;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile;
-  srand(FLAGS_seed);
-
-  if (useGpu) {
-    if (gNumDevices < trainerCount) {
-      return;
-    }
-  }
-
-  Trainer trainer;
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-  if (averageWindow > 0) {
-    config->getOptConfig().set_average_window(averageWindow);
-    config->getOptConfig().set_do_average_in_cpu(doAverageInCpu);
-  }
-  trainer.init(config);
-  trainer.train();
-}
-
-// 1. test trainer (cpu, gpu).
-TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
-
-#ifdef PADDLE_WITH_CUDA
-TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
-
-TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
-
-TEST(trainerOnePass, gpu4) { trainerOnePassTest(configFile1, true, false, 4); }
-
-TEST(trainerOnePass, parallel) {
-  if (hl_get_device_count() >= 2) {
-    trainerOnePassTest(configFile2, true, true);
-  }
-}
-#endif
-
-// 2. test average_window.
-#ifdef PADDLE_WITH_CUDA
-TEST(average_window, gpu) {
-  trainerOnePassTest(configFile1, true, false, 4, 0.01);
-}
-
-TEST(average_window, gpu2) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 2, 0.01);
-  FLAGS_num_passes = 1;
-}
-
-TEST(average_window, gpu4) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 4, 0.01);
-  FLAGS_num_passes = 1;
-}
-
-TEST(average_window_cpu, gpu2) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 2, 0.01, true);
-  FLAGS_num_passes = 1;
-}
-
-TEST(average_window_cpu, gpu4) {
-  FLAGS_num_passes = 20;
-  trainerOnePassTest(configFile1, true, false, 4, 0.01, true);
-  FLAGS_num_passes = 1;
-}
-#endif
-
-// 3. test trainer + pserver.
-DECLARE_int32(num_gradient_servers);
-DECLARE_int32(port);
-DECLARE_bool(local);
-DECLARE_bool(use_old_updater);
-
-double checkRemoteParameterUpdater(TrainerForTest& trainer) {
-  auto gradientMachine = trainer.getGradientMachine();
-  auto parameterUpdater = trainer.getParameterUpdaterForTest();
-  auto dataProvider = trainer.getDataProvider();
-  auto& parameters = gradientMachine->getParameters();
-  const TrainerConfig& config = trainer.getConfig();
-  const string& alg = config.opt_config().algorithm();
-
-  vector<ParameterPtr> parameterCheck;
-  for (auto& parameter : parameters) {
-    parameterCheck.emplace_back(
-        new Parameter(parameter->getConfig(), /* useGpu= */ false));
-    parameterCheck.back()
-        ->getBuf(PARAMETER_VALUE)
-        ->copyFrom(*parameter->getBuf(PARAMETER_VALUE));
-    parameterCheck.back()
-        ->getBuf(PARAMETER_GRADIENT)
-        ->copyFrom(*parameter->getBuf(PARAMETER_GRADIENT));
-  }
-
-  std::unique_ptr<ParameterUpdater> parameterUpdaterCheck;
-  if (alg == TrainAlgorithm::SGD) {
-    parameterUpdaterCheck.reset(new SgdLocalUpdater(config.opt_config()));
-  } else {
-    LOG(INFO) << "unsupported algorithm in remote parameter check: " << alg;
-    return -1.0;
-  }
-  parameterUpdaterCheck->init(parameterCheck);
-
-  // gradientMachine->start(config, *dataProvider);
-  DataBatch dataBatch;
-  int32_t batchSize = config.opt_config().batch_size();
-  dataProvider->getNextBatch(batchSize, &dataBatch);
-  CHECK(dataBatch.getSize()) << "No data from data provider";
-  int64_t actualBatchSize = dataBatch.getSize();
-  const vector<Argument>& inArgs = dataBatch.getStreams();
-  vector<Argument> outArgs;
-
-  UpdateCallback updateCallback = [parameterUpdater,
-                                   parameterCheck](Parameter* para) {
-    parameterCheck[para->getID()]
-        ->getBuf(PARAMETER_GRADIENT)
-        ->copyFrom(*para->getBuf(PARAMETER_GRADIENT));
-    parameterUpdater->update(para);
-  };
-
-  parameterUpdater->startPass();
-  parameterUpdaterCheck->startPass();
-
-  for (int i = 0; i < config.opt_config().num_batches_per_get_parameter() * 2;
-       ++i) {
-    PassType passType = parameterUpdater->startBatch(actualBatchSize);
-    gradientMachine->forwardBackward(
-        inArgs, &outArgs, passType, updateCallback);
-    parameterUpdater->finishBatch(0);
-
-    parameterUpdaterCheck->startBatch(actualBatchSize);
-    for (auto& para : parameterCheck) {
-      parameterUpdaterCheck->update(para.get());
-    }
-    parameterUpdaterCheck->finishBatch(0);
-  }
-
-  double sum = 0.0f;
-  for (size_t i = 0; i != parameters.size(); ++i) {
-    real *v1, *v2;
-    CpuVector trainerPara(parameters[i]->getSize());
-    trainerPara.copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
-    if (!FLAGS_use_gpu) {
-      v1 = parameters[i]->getBuf(PARAMETER_VALUE)->getData();
-    } else {
-      v1 = trainerPara.getData();
-    }
-    v2 = parameterCheck[i]->getBuf(PARAMETER_VALUE)->getData();
-
-    size_t size = parameters[i]->getSize();
-    double diff = 0;
-    for (size_t j = 0; j < size; ++j) {
-      diff += fabs(v1[j] - v2[j]);
-    }
-    sum += diff;
-    LOG(INFO) << setiosflags(ios::left) << setfill(' ') << setw(20)
-              << parameters[i]->getName() << "diff=" << setw(15) << diff;
-  }
-
-  parameterUpdater->finishPass();
-  parameterUpdaterCheck->finishPass();
-  gradientMachine->finish();
-  return sum;
-}
-
-void checkRemoteParameterUpdaterTest(const string& configFile,
-                                     bool useGpu,
-                                     bool parallel,
-                                     int trainerCount = 1,
-                                     bool useOldUpdater = false,
-                                     int num_batches_per_get_parameter = 1) {
-  FLAGS_use_gpu = useGpu;
-  FLAGS_parallel_nn = parallel;
-  FLAGS_config = configFile;
-  FLAGS_trainer_count = trainerCount;
-  FLAGS_use_old_updater = useOldUpdater;
-  LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount
-            << " configFile=" << configFile;
-  srand(FLAGS_seed);
-
-  if (useGpu) {
-    if (gNumDevices < trainerCount) {
-      return;
-    }
-  }
-
-  FLAGS_local = 0;
-  std::shared_ptr<ParameterServer2> pserver;
-  pserver.reset(new ParameterServer2(std::string(), FLAGS_port));
-  pserver->init();
-  pserver->start();
-
-  TrainerForTest trainer;
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-  config->getOptConfig().set_num_batches_per_get_parameter(
-      num_batches_per_get_parameter);
-  trainer.init(config);
-  EXPECT_EQ(checkRemoteParameterUpdater(trainer), 0);
-
-  FLAGS_local = 1;
-}
-
-TEST(checkRemoteUpdater, cpuTrainer) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false);
-}
-
-TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
-}
-
-#ifdef PADDLE_WITH_CUDA
-TEST(checkRemoteUpdater, gpuTrainer) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false);
-}
-
-TEST(checkRemoteUpdater, gpu2Trainer) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 2);
-}
-
-TEST(checkRemoteUpdater, gpu4Trainer) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 4);
-}
-
-TEST(checkRemoteUpdater, gpuTrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 1, true);
-}
-
-TEST(checkRemoteUpdater, gpu2TrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 2, true);
-}
-
-TEST(checkRemoteUpdater, gpu4TrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, true, false, 4, true);
-}
-
-#endif
-
-TEST(checkRemoteUpdater, cpuDeltaTrainer) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, false, 10);
-}
-
-TEST(checkRemoteUpdater, cpuDeltaTrainerOldUpdater) {
-  checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true, 10);
-}
-
-TEST(SgdThreadUpdater, simpleSparseNN) {
-  trainerOnePassTest(configFileSimpleSparse, false, false, 1, 0.5, true);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  initPython(argc, argv);
-  gNumDevices = hl_get_device_count();
-
-  FLAGS_num_passes = 1;          // train one pass
-  FLAGS_saving_period = 100000;  // do not save parameteres
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf
deleted file mode 100644
index 2f86aaa75316fa2a5a28edfef31c01e15a44b3d0..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_config.conf
+++ /dev/null
@@ -1,77 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-TrainData(SimpleData(
-    files = "trainer/tests/sample_filelist.txt",
-    feat_dim = 3,
-    context_len = 0,
-    buffer_capacity = 1000000,
-    async_load_data = False))
-
-settings(batch_size = 100)
-
-data = data_layer(name='input', size=3)
-
-wt = data_layer(name='weight', size=1)
-
-fc1 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=data, size=12,
-               bias_attr=True,
-               param_attr=ParamAttr(name='sharew'),
-               act=LinearActivation())
-
-fc3 = fc_layer(input=data, size=3,
-               bias_attr=True,
-               act=TanhActivation())
-
-fc4 = fc_layer(input=data, size=5,
-               bias_attr=True,
-               layer_attr=ExtraAttr(drop_rate=0.5),
-               act=SquareActivation())
-
-pool = img_pool_layer(input=fc2,
-                      pool_size=2,
-                      pool_size_y=3,
-                      num_channels=1,
-                      padding=1,
-                      padding_y=2,
-                      stride=2,
-                      stride_y=3,
-                      pool_type=CudnnAvgPooling())
-
-concat = concat_layer(input=[fc3, fc4])
-
-with mixed_layer(size=3, act=SoftmaxActivation()) as output:
-    output += full_matrix_projection(input=fc1)
-    output += trans_full_matrix_projection(input=fc2,
-                                           param_attr=ParamAttr(name='sharew'))
-    output += full_matrix_projection(input=concat)
-    output += identity_projection(input=fc3)
-
-lbl = data_layer(name='label', size=1)
-
-cost = classification_cost(input=output, label=lbl, weight=wt,
-                           layer_attr=ExtraAttr(device=-1))
-
-nce = nce_layer(input=fc2, label=lbl, weight=wt,
-                num_classes=3, 
-                neg_distribution=[0.1, 0.3, 0.6])
-                
-outputs(cost, nce)
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
deleted file mode 100644
index a8fbe31c2b1e228107dfc19483444409bfcbf788..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-
-#include <paddle/trainer/Trainer.h>
-#include <paddle/utils/PythonUtil.h>
-
-#include <gtest/gtest.h>
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& CONFIG_FILE = "trainer/tests/sample_trainer_rnn_gen.conf";
-static const string& NEST_CONFIG_FILE =
-    "trainer/tests/sample_trainer_nest_rnn_gen.conf";
-static const string& OUTPUT_DIR = "trainer/tests/dump_text.test";
-static string modelDir = "trainer/tests/rnn_gen_test_model_dir/t1";  // NOLINT
-static string expectFile =                                           // NOLINT
-    "trainer/tests/rnn_gen_test_model_dir/r1.test";                  // NOLINT
-
-DECLARE_string(config_args);
-
-vector<float> readRetFile(const string& fname) {
-  ifstream inFile(fname);
-  float ret;
-  vector<float> nums;
-  while (inFile >> ret) {
-    nums.push_back(ret);
-  }
-  return nums;
-}
-
-void checkOutput(const string& expRetFile) {
-  vector<float> rets = readRetFile(OUTPUT_DIR);
-  vector<float> expRets = readRetFile(expRetFile);
-  EXPECT_EQ(rets.size(), expRets.size());
-  for (size_t i = 0; i < rets.size(); i++) {
-    EXPECT_FLOAT_EQ(rets[i], expRets[i]);
-  }
-}
-
-void prepareInArgs(vector<Argument>& inArgs,
-                   const size_t batchSize,
-                   bool useGpu,
-                   bool hasSubseq) {
-  inArgs.clear();
-  // sentence id
-  Argument sentId;
-  sentId.value = nullptr;
-  if (hasSubseq) {
-    // as there is only one sequence, there is only one label.
-    IVector::resizeOrCreate(sentId.ids, 1, useGpu);
-    sentId.ids->setElement(0, 0);
-  } else {
-    // as there is batchSize word, there is batchSize label.
-    IVector::resizeOrCreate(sentId.ids, batchSize, useGpu);
-    for (size_t i = 0; i < batchSize; ++i) sentId.ids->setElement(i, i);
-  }
-  inArgs.emplace_back(sentId);
-
-  // a dummy layer to decide batch size
-  Argument dummyInput;
-  dummyInput.value = Matrix::create(batchSize, 2, false, useGpu);
-  dummyInput.value->randomizeUniform();
-  if (hasSubseq) {
-    // generate one sequence with batchSize subsequence,
-    // and each subsequence has only one word.
-    dummyInput.sequenceStartPositions = ICpuGpuVector::create(2, false);
-    int* buf = dummyInput.sequenceStartPositions->getMutableData(false);
-    dummyInput.subSequenceStartPositions =
-        ICpuGpuVector::create(batchSize + 1, false);
-    int* subBuf = dummyInput.subSequenceStartPositions->getMutableData(false);
-    buf[0] = 0;
-    buf[1] = batchSize;
-    for (size_t i = 0; i < batchSize + 1; i++) subBuf[i] = i;
-  }
-  inArgs.emplace_back(dummyInput);
-}
-
-void testGeneration(const string& configFile,
-                    bool useGpu,
-                    bool hasSubseq,
-                    const string& expRetFile) {
-  FLAGS_use_gpu = useGpu;
-  auto config = std::make_shared<TrainerConfigHelper>(configFile);
-  unique_ptr<GradientMachine> gradientMachine(GradientMachine::create(*config));
-  gradientMachine->loadParameters(modelDir);
-  vector<Argument> inArgs(2);
-
-  const size_t batchSize = 15;
-  prepareInArgs(inArgs, batchSize, useGpu, hasSubseq);
-  vector<Argument> outArgs;
-  unique_ptr<Evaluator> testEvaluator(gradientMachine->makeEvaluator());
-  testEvaluator->start();
-  gradientMachine->forward(inArgs, &outArgs, PASS_TEST);
-  gradientMachine->eval(testEvaluator.get());
-  testEvaluator->finish();
-  checkOutput(expRetFile);
-}
-
-#ifndef PADDLE_TYPE_DOUBLE
-
-TEST(RecurrentGradientMachine, test_generation) {
-#ifndef PADDLE_WITH_CUDA
-  const auto useGpuConfs = {false};
-#else
-  const auto useGpuConfs = {true, false};
-#endif
-  auto testGen = [&](const string& configFile,
-                     bool hasSubseq,
-                     const string& expRetFile,
-                     bool beam_search) {
-    FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0";
-    for (auto useGpu : useGpuConfs) {
-      LOG(INFO) << configFile << " useGpu=" << useGpu
-                << " beam_search=" << beam_search;
-      testGeneration(configFile, useGpu, hasSubseq, expRetFile);
-    }
-  };
-  testGen(CONFIG_FILE, false, expectFile + ".nobeam", false);  // no beam search
-  testGen(CONFIG_FILE, false, expectFile + ".beam", true);     // beam search
-  // In hierarchical RNN, beam search and one way search are only in inner-RNN,
-  // outer-RNN will concat the generated inner-results (first for beam search)
-  // from inner-RNN. Thus, they have the same outer-results.
-  testGen(NEST_CONFIG_FILE,
-          true,
-          expectFile + ".nest",
-          false);  // no beam search
-  testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true);  // beam search
-}
-#endif
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  CHECK(argc == 1 || argc == 3);
-  if (argc == 3) {
-    modelDir = argv[1];
-    expectFile = argv[2];
-  }
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
deleted file mode 100644
index 7a4977935ede4878c07f4fb6ba0dd76bf50acd42..0000000000000000000000000000000000000000
--- a/paddle/utils/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-# The utilities for paddle
-file(GLOB UTIL_HEADERS . *.h)
-file(GLOB UTIL_SOURCES . *.cpp)
-create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py
-  ${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c)
-set(UTIL_RES ${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c)
-
-if(APPLE)
-    file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
-else()
-    file(GLOB UTIL_ARCH_SOURCES . arch/linux/*.cpp)
-endif()
-add_library(paddle_utils STATIC
-        ${UTIL_SOURCES}
-        ${UTIL_ARCH_SOURCES}
-        ${UTIL_RES})
-add_style_check_target(paddle_utils ${UTIL_HEADERS})
-add_style_check_target(paddle_utils ${UTIL_SOURCES}
-    ${UTIL_ARCH_SOURCES})
-add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
-if(WITH_TESTING)
-    add_subdirectory(tests)
-endif()
diff --git a/paddle/utils/ClassRegistrar.h b/paddle/utils/ClassRegistrar.h
deleted file mode 100644
index 1ac27bafabd1945d1d01e3bead22b0dd200d8688..0000000000000000000000000000000000000000
--- a/paddle/utils/ClassRegistrar.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <map>
-#include <string>
-
-#include "Util.h"
-
-namespace paddle {
-
-/**
- * This class is used to keep a set of class types. It can register a
- * class by a type name and create an instance of a class by type.
- * Example:
- *   // Declare the registrar
- *   ClassRegistrar<Layer, LayerConfig> registar_;
- *
- *   // Register a class using its constructor
- *   registrar_.registerClass<ConvLayer>("conv");
- *
- *   // Register a class using a creation function
- *   registrar_.registerClass("pool", [](LayerConfig& config){
- *     return PoolLayer::create(config);
- *   });
- *
- *   // create a class instance by type name
- *   Layer* layer = registrar_.createByType("conv", config);
- */
-template <class BaseClass, typename... CreateArgs>
-class ClassRegistrar {
-public:
-  typedef std::function<BaseClass*(CreateArgs...)> ClassCreator;
-
-  // Register a class using a creation function.
-  // The creation function's arguments are CreateArgs
-  void registerClass(const std::string& type, ClassCreator creator) {
-    CHECK(creatorMap_.count(type) == 0) << "Duplicated class type: " << type;
-    creatorMap_[type] = creator;
-  }
-
-  // Register a class using its constructor
-  // The constructor's arguments are CreateArgs
-  template <class ClassType>
-  void registerClass(const std::string& type) {
-    registerClass(type,
-                  [](CreateArgs... args) { return new ClassType(args...); });
-  }
-
-  // Create a class instance of type @type using args
-  BaseClass* createByType(const std::string& type, CreateArgs... args) {
-    ClassCreator creator;
-    CHECK(mapGet(type, creatorMap_, &creator)) << "Unknown class type: "
-                                               << type;
-    return creator(args...);
-  }
-
-  template <typename T>
-  inline void forEachType(T callback) {
-    for (auto it = creatorMap_.begin(); it != creatorMap_.end(); ++it) {
-      callback(it->first);
-    }
-  }
-
-protected:
-  std::map<std::string, ClassCreator> creatorMap_;
-};
-
-}  // namespace paddle
diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp
deleted file mode 100644
index 7186feef041eb3b1be459a506294f83f9a00ad94..0000000000000000000000000000000000000000
--- a/paddle/utils/CpuId.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/CpuId.h"
-#include "paddle/utils/Util.h"
-
-#ifdef _WIN32
-
-#include <intrin.h>
-
-/// for MSVC
-#define CPUID(info, x) __cpuidex(info, x, 0)
-
-#else
-
-#if !defined(__arm__) && !defined(__aarch64__)
-#include <cpuid.h>
-/// for GCC/Clang
-#define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3])
-#endif
-
-#endif
-
-namespace paddle {
-
-SIMDFlags::SIMDFlags() {
-#if defined(__arm__) || defined(__aarch64__)
-  simd_flags_ = SIMD_NEON;
-#else
-  unsigned int cpuInfo[4];
-  // CPUID: https://en.wikipedia.org/wiki/CPUID
-  // clang-format off
-  CPUID(cpuInfo, 0x00000001);
-  simd_flags_ |= cpuInfo[3] & (1 << 25) ? SIMD_SSE   : SIMD_NONE;
-  simd_flags_ |= cpuInfo[3] & (1 << 26) ? SIMD_SSE2  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 <<  0) ? SIMD_SSE3  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 <<  9) ? SIMD_SSSE3 : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 19) ? SIMD_SSE41 : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 20) ? SIMD_SSE42 : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 12) ? SIMD_FMA3  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[2] & (1 << 28) ? SIMD_AVX   : SIMD_NONE;
-
-  CPUID(cpuInfo, 0x00000007);
-  simd_flags_ |= cpuInfo[1] & (1 <<  5) ? SIMD_AVX2  : SIMD_NONE;
-  simd_flags_ |= cpuInfo[1] & (1 << 16) ? SIMD_AVX512: SIMD_NONE;
-
-  CPUID(cpuInfo, 0x80000001);
-  simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4  : SIMD_NONE;
-  // clang-fotmat on
-#endif
-}
-
-SIMDFlags const* SIMDFlags::instance() {
-  static SIMDFlags instance;
-  return &instance;
-}
-
-}  // namespace paddle
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
deleted file mode 100644
index 869be5be541dafd699a87a8e8893aadadf59b711..0000000000000000000000000000000000000000
--- a/paddle/utils/CpuId.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "Common.h"
-#include "Error.h"
-
-namespace paddle {
-
-// clang-format off
-enum simd_t {
-  SIMD_NONE   = 0,          ///< None
-  SIMD_SSE    = 1 << 0,     ///< SSE
-  SIMD_SSE2   = 1 << 1,     ///< SSE 2
-  SIMD_SSE3   = 1 << 2,     ///< SSE 3
-  SIMD_SSSE3  = 1 << 3,     ///< SSSE 3
-  SIMD_SSE41  = 1 << 4,     ///< SSE 4.1
-  SIMD_SSE42  = 1 << 5,     ///< SSE 4.2
-  SIMD_FMA3   = 1 << 6,     ///< FMA 3
-  SIMD_FMA4   = 1 << 7,     ///< FMA 4
-  SIMD_AVX    = 1 << 8,     ///< AVX
-  SIMD_AVX2   = 1 << 9,     ///< AVX 2
-  SIMD_AVX512 = 1 << 10,    ///< AVX 512
-  SIMD_NEON   = 1 << 11,    ///  NEON
-};
-// clang-format on
-
-class SIMDFlags final {
-public:
-  DISABLE_COPY(SIMDFlags);
-
-  SIMDFlags();
-
-  static SIMDFlags const* instance();
-
-  inline bool check(int flags) const {
-    return !((simd_flags_ & flags) ^ flags);
-  }
-
-private:
-  int simd_flags_ = SIMD_NONE;
-};
-
-/**
- * @brief   Check SIMD flags at runtime.
- *
- * For example.
- * @code{.cpp}
- *
- * if (HAS_SIMD(SIMD_AVX2 | SIMD_FMA4)) {
- *      avx2_fm4_stub();
- * } else if (HAS_SIMD(SIMD_AVX)) {
- *      avx_stub();
- * }
- *
- * @endcode
- */
-#define HAS_SIMD(__flags) SIMDFlags::instance()->check(__flags)
-
-/**
- * @brief   Check SIMD flags at runtime.
- *
- * 1. Check all SIMD flags at runtime:
- *
- * @code{.cpp}
- * if (HAS_AVX && HAS_AVX2) {
- *      avx2_stub();
- * }
- * @endcod
- *
- * 2. Check one SIMD flag at runtime:
- *
- * @code{.cpp}
- * if (HAS_SSE41 || HAS_SSE42) {
- *      sse4_stub();
- * }
- * @endcode
- */
-// clang-format off
-#define HAS_SSE     HAS_SIMD(SIMD_SSE)
-#define HAS_SSE2    HAS_SIMD(SIMD_SSE2)
-#define HAS_SSE3    HAS_SIMD(SIMD_SSE3)
-#define HAS_SSSE3   HAS_SIMD(SIMD_SSSE3)
-#define HAS_SSE41   HAS_SIMD(SIMD_SSE41)
-#define HAS_SSE42   HAS_SIMD(SIMD_SSE42)
-#define HAS_FMA3    HAS_SIMD(SIMD_FMA3)
-#define HAS_FMA4    HAS_SIMD(SIMD_FMA4)
-#define HAS_AVX     HAS_SIMD(SIMD_AVX)
-#define HAS_AVX2    HAS_SIMD(SIMD_AVX2)
-#define HAS_AVX512  HAS_SIMD(SIMD_AVX512)
-#define HAS_NEON    HAS_SIMD(SIMD_NEON)
-// clang-format on
-
-/**
- * Invoke checkCPUFeature() before Paddle initialization to
- * check target machine whether support compiled instructions.
- * If not, simply throw out an error.
- */
-inline Error __must_check checkCPUFeature() {
-  Error err;
-#ifndef __AVX__
-  if (HAS_AVX) {
-    LOG(WARNING) << "PaddlePaddle wasn't compiled to use avx instructions, "
-                 << "but these are available on your machine and could "
-                 << "speed up CPU computations via CMAKE .. -DWITH_AVX=ON";
-  }
-#else
-  if (!HAS_AVX) {
-    err = Error(
-        "PaddlePaddle was compiled to use avx instructions, "
-        "but these aren't available on your machine, please "
-        "disable it via CMAKE .. -DWITH_AVX=OFF");
-  }
-#endif  // __AVX__
-#ifdef __SSE3__
-  if (!HAS_SSE3) {
-    err = Error(
-        "PaddlePaddle was compiled to use sse3 instructions, "
-        "which is the minimum requirement of PaddlePaddle. "
-        "But these aren't available on your current machine.");
-  }
-#endif  // __SSE3__
-
-  return err;
-}
-
-}  // namespace paddle
diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h
deleted file mode 100644
index 52a6df94979fd3d8d7d540ed0e3898bb3375d975..0000000000000000000000000000000000000000
--- a/paddle/utils/CustomStackTrace.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <functional>
-#include <stack>
-#include <thread>
-#include <unordered_map>
-
-#include "ThreadLocal.h"
-
-namespace paddle {
-
-/**
- * A ThreadLocal stack for tracing train/test process.
- * (More details of ThreadLocal can be find
- * in the comments of ThreadLocal class.)
- *
- * For example.
- * @code{.cpp}
- *
- * paddle::CustomStackTrace<std::string> stack;
- * for (auto& layer : layers){
- *   stack.push(layer->getName());
- *   layer->forward();
- * }
- *
- * stack.pop("");  // mark under pop stage.
- *
- * for (auto it = layers.rbegin(); it != layers.rend(); ++it){
- *   auto& layer = *it;
- *   layer->backward(passType);
- *   stack.pop(layer->getName());
- * }
- *
- * @endcode
- */
-template <typename T>
-class CustomStackTrace {
-public:
-  /**
-   * @brief Pop out an item from the top of the stack if item == top.
-   *        Else, just set status to popping.
-   */
-  void pop(const T& item) {
-    auto& s = this->stack();
-    if (item == s.top()) {
-      s.pop();
-    }
-  }
-
-  /**
-   * @brief Indicate whether we are at forward or backward stage of computation
-   */
-  void set_stage(bool isForward) { pushing() = isForward; }
-
-  /**
-   * @brief clear current thread stack.
-   */
-  void clear() {
-    auto& s = stack();
-    while (!s.empty()) {
-      s.pop();
-    }
-  }
-
-  /**
-   * @brief return true if all thread's stack is empty.
-   * @return true if empty
-   */
-  bool empty() const {
-    std::lock_guard<std::mutex> g(this->mtx_);
-    for (auto p : this->stackBuffers_) {
-      std::stack<T>& s = *p.second;
-      if (!s.empty()) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  /**
-   * @brief DumpCallback Type. It will be invoked many times by dump method.
-   *
-   * The first parameter is stack thread id.
-   * The second parameter is the last action of stack is push or not.
-   * The third parameter is the item in stack.
-   */
-  typedef std::function<void(const std::thread::id& /*threadId*/,
-                             bool* /*isPushing*/,
-                             const T& /*item*/)>
-      DumpCallback;
-
-  /**
-   * Dump all thread stack, and all stack will be cleared.
-   */
-  void dump(const DumpCallback& callback, bool onlyCurrentThread = false) {
-    std::lock_guard<std::mutex> g(this->mtx_);
-    for (auto p : this->stackBuffers_) {
-      std::thread::id tid = p.first;
-      if (onlyCurrentThread && tid != std::this_thread::get_id()) {
-        continue;
-      }
-      std::stack<T>& s = *p.second;
-      bool* isPush = nullptr;
-      auto it = this->pushingBuffers_.find(tid);
-      if (it != this->pushingBuffers_.end()) {
-        isPush = it->second;
-      }
-
-      while (!s.empty()) {
-        callback(tid, isPush, s.top());
-        s.pop();
-      }
-    }
-  }
-
-  /**
-   * @brief Push item to current thread stack.
-   */
-  void push(const T& item) {
-    pushing() = true;
-    auto& p = this->stack();
-    p.push(item);
-  }
-
-private:
-  /**
-   * Get thread local attribute, and save them into a map (threadId => TYPE*)
-   *
-   * @tparam TYPE thread local attribute type.
-   * @param threadLocal Thread Local object.
-   * @param buffers a map from threadId to TYPE*
-   */
-  template <typename TYPE>
-  inline TYPE& getThreadLocal(
-      ThreadLocal<TYPE>& threadLocal,
-      std::unordered_map<std::thread::id, TYPE*>& buffers) {
-    TYPE* retv = threadLocal.get(false);
-    if (retv) {
-      return *retv;
-    } else {
-      std::lock_guard<std::mutex> guard(this->mtx_);
-      retv = threadLocal.get();
-      auto id = std::this_thread::get_id();
-      buffers.insert({id, retv});
-      return *retv;
-    }
-  }
-
-  /**
-   * @brief Get thread local stack reference.
-   */
-  std::stack<T>& stack() {
-    return this->getThreadLocal(this->logStack_, this->stackBuffers_);
-  }
-
-  /**
-   * @brief Get thread local pushing flag.
-   */
-  bool& pushing() {
-    return this->getThreadLocal(this->isPushing_, this->pushingBuffers_);
-  }
-
-private:
-  mutable std::mutex mtx_;
-
-  std::unordered_map<std::thread::id, std::stack<T>*> stackBuffers_;
-  std::unordered_map<std::thread::id, bool*> pushingBuffers_;
-  ThreadLocal<bool> isPushing_;
-  ThreadLocal<std::stack<T>> logStack_;
-};
-
-extern CustomStackTrace<std::string> gLayerStackTrace;
-
-/**
- * @brief Install a failure handler to print layer stack when error.
- */
-extern void installLayerStackTracer();
-
-}  // namespace paddle
diff --git a/paddle/utils/DynamicLoader.cpp b/paddle/utils/DynamicLoader.cpp
deleted file mode 100644
index 5604a90038b06d2c1a4d9db70e4185cddfd25d3e..0000000000000000000000000000000000000000
--- a/paddle/utils/DynamicLoader.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "DynamicLoader.h"
-#include <gflags/gflags.h>
-#include "Logging.h"
-
-DEFINE_string(cudnn_dir,
-              "",
-              "Specify path for loading libcudnn.so. For instance, "
-              "/usr/local/cudnn/lib. If empty [default], dlopen "
-              "will search cudnn from LD_LIBRARY_PATH");
-
-DEFINE_string(cuda_dir,
-              "",
-              "Specify path for loading cuda library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
-              "dlopen will search cuda from LD_LIBRARY_PATH");
-
-DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
-
-DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
-
-static inline std::string join(const std::string& part1,
-                               const std::string& part2) {
-  // directory separator
-  const char sep = '/';
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
-                                               void** dso_handle,
-                                               int dynload_flags) {
-  VLOG(3) << "Try to find library: " << dso_path
-          << " from default system path.";
-  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
-  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-
-// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
-// bring System Integrity Projection (SIP), if dso_handle
-// is null, search from default package path in Mac OS.
-#if defined(__APPLE__) || defined(__OSX__)
-  if (nullptr == *dso_handle) {
-    dso_path = join("/usr/local/cuda/lib/", dso_path);
-    *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-    if (nullptr == *dso_handle) {
-      if (dso_path == "libcudnn.dylib") {
-        LOG(FATAL)
-            << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"  // NOLINT
-            << "For instance, sudo tar -xzf "
-               "cudnn-7.5-osx-x64-v5.0-ga.tgz -C "  // NOLINT
-            << "/usr/local \n sudo chmod a+r "
-               "/usr/local/cuda/include/cudnn.h "  // NOLINT
-            << "/usr/local/cuda/lib/libcudnn*";
-      }
-    }
-  }
-#endif
-}
-
-static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
-                                              const std::string& dso_name,
-                                              void** dso_handle) {
-  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
-  *dso_handle = nullptr;
-
-  std::string dlPath = dso_name;
-  if (search_root.empty()) {
-    GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
-  } else {
-    // search xxx.so from custom path
-    dlPath = join(search_root, dso_name);
-    *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
-    // if not found, search from default path
-    if (nullptr == *dso_handle) {
-      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
-                   << dlerror() << ")";
-      dlPath = dso_name;
-      GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
-    }
-  }
-
-  CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
-                                << " (" << dlerror() << ") \n"
-                                << "Please specify its path correctly using "
-                                   "following ways: \n"
-
-                                << "Method. set environment variable "
-                                   "LD_LIBRARY_PATH on Linux or "
-                                << "DYLD_LIBRARY_PATH on Mac OS. \n"
-                                << "For instance, issue command: export "
-                                   "LD_LIBRARY_PATH=... \n"
-
-                                << "Note: After Mac OS 10.11, using the "
-                                   "DYLD_LIBRARY_PATH is impossible "
-                                << "unless System Integrity Protection (SIP) "
-                                   "is disabled.";
-}
-
-void GetCublasDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
-#endif
-}
-
-void GetCudnnDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
-#endif
-}
-
-void GetCurandDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
-#endif
-}
-
-void GetWarpCTCDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
-#endif
-}
-
-void GetLapackDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
-#endif
-}
diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h
deleted file mode 100644
index 2e5ff76a06152b6a12818f06baaeaa6a69726ba8..0000000000000000000000000000000000000000
--- a/paddle/utils/DynamicLoader.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <dlfcn.h>
-#include <memory>
-#include <mutex>
-#include <string>
-
-/**
- * @brief    load the DSO of CUBLAS
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCublasDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CUDNN
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCudnnDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of CURAND
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCurandDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of warp-ctc
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetWarpCTCDsoHandle(void** dso_handle);
-
-/**
- * @brief    load the DSO of lapack
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetLapackDsoHandle(void** dso_handle);
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
deleted file mode 100644
index 7cde98306026ca1de76089749aaea265d151da33..0000000000000000000000000000000000000000
--- a/paddle/utils/Error.h
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <memory>
-#include <string>
-
-/**
- * __must_check macro. It make the function's return value must be used,
- * otherwise it will raise a compile warning. And also Paddle treat all compile
- * warnings as errors.
- */
-#ifdef __GNUC__
-#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400
-#define __must_check __attribute__((warn_unused_result))
-#else
-#define __must_check
-#endif
-#else
-#define __must_check
-#endif
-
-namespace paddle {
-
-/**
- * Error is Paddle error code. It only contain a std::string as error message.
- *
- *
- * There are two styles to return error in Paddle.
- *
- * 1. Return Error
- *    When method return a status, the return must use `__must_check` attribute.
- *    Example as below.
- * @code{cpp}
- * Error __must_check foo();
- *
- * Error __must_check bar() {
- *   // do something.
- *   Error err = foo();  // invoke other method return status.
- *   if (err) return err;
- *   // do something else.
- *   return Error();
- * }
- * @endcode{cpp}
- *
- * 2. Return by parameter.
- *    It is another way to return an error, by using a pointer parameter.
- *    Example as below.
- *
- * @code{cpp}
- * Error bar();
- *
- * int foo(Error* error) {
- *   // Do something.
- *   Error err = bar();
- *   if (err) {
- *     *error = s;
- *     return 0;
- *   }
- *   // Do something else.
- *   if (someInternalErrorHappend) {
- *     *error = Error("Some dimension is too large, %d", dimension);
- *     return 0;
- *   }
- *   // End of method.
- *   return someValue;
- * }
- *
- * Error foobar() {
- *   Error err;
- *   // do something.
- *   foo(&err);
- *   if (err) return err;
- * }
- * @endcode{cpp}
- *
- *
- * Currently there is a helper method 'check' in status, because Paddle always
- * use log(FATAL) or CHECK to make program exit before. When we clean all
- * log(FATAL) and CHECK in Paddle, 'check' method will be removed.
- */
-class Error {
-public:
-  /**
-   * Construct a no-error value.
-   */
-  Error() {}
-
-  /**
-   * @brief Create an Error use printf syntax.
-   */
-  explicit Error(const char* fmt, ...) {
-    va_list ap;
-    va_start(ap, fmt);
-    constexpr size_t kBufferSize = 1024;
-    char buffer[kBufferSize];
-    vsnprintf(buffer, kBufferSize, fmt, ap);
-    this->msg_.reset(new std::string(buffer));
-    va_end(ap);
-  }
-
-  /**
-   * @brief msg will return the error message. If no error, return nullptr.
-   */
-  const char* msg() const {
-    if (msg_) {
-      return msg_->c_str();
-    } else {
-      return nullptr;
-    }
-  }
-
-  /**
-   * @brief check this status by glog.
-   * @note It is a temp method used during cleaning Paddle code. It will be
-   *       removed later.
-   */
-  void check() const { CHECK(this->isOK()) << msg(); }
-
-  /**
-   * @brief isOK return True if there is no error.
-   * @return True if no error.
-   */
-  bool isOK() const { return msg_ == nullptr; }
-
-private:
-  std::shared_ptr<std::string> msg_;
-};
-
-}  // namespace paddle
diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h
deleted file mode 100644
index 0ec1c28dfbb2a7db9fa84c9eb2bc4dad806b78e9..0000000000000000000000000000000000000000
--- a/paddle/utils/GlobalConstants.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-
-namespace paddle {
-
-namespace enumeration_wrapper {
-enum PassType {
-  PASS_TRAIN,   // Train pass
-  PASS_TEST,    // Test pass
-  PASS_GC,      // Gradient Check pass
-  PASS_METRIC,  // pass for generate template output with no drop rate.
-};
-
-enum ParameterType {
-  PARAMETER_VALUE = 0,
-  PARAMETER_GRADIENT,
-  PARAMETER_MOMENTUM,
-
-  // Used by ParameterAverager
-  PARAMETER_SUM1,
-  PARAMETER_SUM2,
-  PARAMETER_SUM3,
-
-  //   also used by AdagradParameterUpdater/AdadeltaParameterUpdater
-  PARAMETER_LEARNING_RATE,
-
-  // Used by Sparse SGD update
-  PARAMETER_UPDATE_TIME,
-
-  // Used by async_sgd
-  // Change of the parameter since last remote update
-  PARAMETER_DELTA,
-
-  // Used by BatchRemoteParameterUpdater
-  PARAMETER_GRADIENT_SUM,
-
-  // Used by AdagradParameterUpdater/AdadeltaParameterUpdater
-  PARAMETER_GRADIENT_SQURESUM,
-  PARAMETER_GRADIENT_SQURESUM1,
-
-  // Used by SparseConnected layer
-  PARAMETER_ROWS,
-  PARAMETER_COLS,
-
-  // Used by Adam Optimizer.
-  PARAMETER_SECOND_MOMENTUM,
-
-  // Used By AdaMax Optimizer.
-  PARAMETER_WEIGHTED_INFINITY_NORM,
-
-  // Used by remote parameter average
-  PARAMETER_APPLY,
-
-  // Used by sparse momentum
-  PARAMETER_MOMENTUM_UT,
-  PARAMETER_MOMENTUM_VT,
-
-  NUM_PARAMETER_TYPES,
-};
-
-}  // namespace enumeration_wrapper
-
-//! explicit import enum into paddle namespace.
-using namespace enumeration_wrapper;  // NOLINT
-
-class TrainAlgorithm {
-public:
-  static const std::string SGD;
-  static const std::string AsyncSGD;
-  static const std::string OWLQN;
-
-  static inline bool isValid(const std::string& algo) {
-    return algo == SGD || algo == AsyncSGD || algo == OWLQN;
-  }
-};
-
-#ifdef __AVX__
-const int ALIGN_HINT = 32;
-#else
-const int ALIGN_HINT = 16;
-#endif
-
-}  // namespace paddle
diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h
deleted file mode 100644
index e87abb9139f1c3f250f8b8fe1afdd8883f682647..0000000000000000000000000000000000000000
--- a/paddle/utils/Locks.h
+++ /dev/null
@@ -1,242 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <pthread.h>
-#include <sys/time.h>
-#include <condition_variable>
-#include <mutex>
-
-#include "Common.h"
-
-namespace paddle {
-
-/**
- * A simple read-write lock.
- * The RWlock allows a number of readers or at most one writer
- * at any point in time.
- * The RWlock disable copy.
- *
- * Lock:
- *
- * Use lock() to lock on write mode, no other thread can get it
- * until unlock.
- *
- * Use lock_shared() to lock on read mode, other thread can get
- * it by using the same method lock_shared().
- *
- * Unlock:
- *
- * Use unlock() to unlock the lock.
- */
-class RWLock {
-public:
-  RWLock() { pthread_rwlock_init(&rwlock_, NULL); }
-  ~RWLock() { pthread_rwlock_destroy(&rwlock_); }
-  RWLock(const RWLock&) = delete;
-  RWLock& operator=(const RWLock&) = delete;
-
-  /**
-   * @brief lock on write mode.
-   * @note the method will block the thread, if failed to get the lock.
-   */
-  // std::mutex interface
-  void lock() { pthread_rwlock_wrlock(&rwlock_); }
-  /**
-   * @brief lock on read mode.
-   * @note if another thread is writing, it can't get the lock,
-   * and will block the thread.
-   */
-  void lock_shared() { pthread_rwlock_rdlock(&rwlock_); }
-  void unlock() { pthread_rwlock_unlock(&rwlock_); }
-
-protected:
-  pthread_rwlock_t rwlock_;
-};
-
-/**
- * The ReadLockGuard is a read mode RWLock
- * using RAII management mechanism.
- */
-class ReadLockGuard {
-public:
-  /**
-   * @brief Construct Function. Lock on rwlock in read mode.
-   */
-  explicit ReadLockGuard(RWLock& rwlock) : rwlock_(&rwlock) {
-    rwlock_->lock_shared();
-  }
-
-  /**
-   * @brief Destruct Function.
-   * @note This method just unlock the read mode rwlock,
-   * won't destroy the lock.
-   */
-  ~ReadLockGuard() { rwlock_->unlock(); }
-
-protected:
-  RWLock* rwlock_;
-};
-
-/**
- * A simple wrapper for spin lock.
- * The lock() method of SpinLock is busy-waiting
- * which means it will keep trying to lock until lock on successfully.
- * The SpinLock disable copy.
- */
-class SpinLockPrivate;
-class SpinLock {
-public:
-  DISABLE_COPY(SpinLock);
-  SpinLock();
-  ~SpinLock();
-
-  // std::mutext interface
-  void lock();
-  void unlock();
-
-private:
-  SpinLockPrivate* m;
-};
-
-/**
- * A simple wapper of semaphore which can only be shared in the same process.
- */
-class SemaphorePrivate;
-class Semaphore {
-public:
-  //! Disable copy & assign
-  Semaphore(const Semaphore& other) = delete;
-  Semaphore& operator=(const Semaphore&& other) = delete;
-
-  //! Enable move.
-  Semaphore(Semaphore&& other) : m(std::move(other.m)) {}
-
-public:
-  /**
-   * @brief Construct Function.
-   * @param[in] initValue the initial value of the
-   * semaphore, default 0.
-   */
-  explicit Semaphore(int initValue = 0);
-
-  ~Semaphore();
-
-  /**
-   * @brief The same as wait(), except if the decrement can not
-   * be performed until ts return false install of blocking.
-   * @param[in] ts an absolute timeout in seconds and nanoseconds
-   * since the Epoch 1970-01-01 00:00:00 +0000(UTC).
-   * @return ture if the decrement proceeds before ts,
-   * else return false.
-   */
-  bool timeWait(struct timespec* ts);
-
-  /**
-   * @brief decrement the semaphore. If the semaphore's value is 0, then call
-   * blocks.
-   */
-  void wait();
-
-  /**
-   * @brief increment the semaphore. If the semaphore's value
-   * greater than 0, wake up a thread blocked in wait().
-   */
-  void post();
-
-private:
-  SemaphorePrivate* m;
-};
-
-/**
- * A simple wrapper of thread barrier.
- * The ThreadBarrier disable copy.
- */
-class ThreadBarrierPrivate;
-class ThreadBarrier {
-public:
-  DISABLE_COPY(ThreadBarrier);
-
-  /**
-   * @brief Construct Function. Initialize the barrier should
-   * wait for count threads in wait().
-   */
-  explicit ThreadBarrier(int count);
-  ~ThreadBarrier();
-
-  /**
-   * @brief .
-   * If there were count - 1 threads waiting before,
-   * then wake up all the count - 1 threads and continue run together.
-   * Else block the thread until waked by other thread .
-   */
-  void wait();
-
-private:
-  ThreadBarrierPrivate* m;
-};
-
-/**
- * A wrapper for condition variable with mutex.
- */
-class LockedCondition : public std::condition_variable {
-public:
-  /**
-   * @brief execute op and notify one thread which was blocked.
-   * @param[in] op a thread can do something in op before notify.
-   */
-  template <class Op>
-  void notify_one(Op op) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    op();
-    std::condition_variable::notify_one();
-  }
-
-  /**
-   * @brief execute op and notify all the threads which were blocked.
-   * @param[in] op a thread can do something in op before notify.
-   */
-  template <class Op>
-  void notify_all(Op op) {
-    std::lock_guard<std::mutex> guard(mutex_);
-    op();
-    std::condition_variable::notify_all();
-  }
-
-  /**
-   * @brief wait until pred return ture.
-   * @tparam Predicate c++ concepts, describes a function object
-   * that takes a single iterator argument
-   * that is dereferenced and used to
-   * return a value testable as a bool.
-   * @note pred shall not apply any non-constant function
-   * through the dereferenced iterator.
-   */
-  template <class Predicate>
-  void wait(Predicate pred) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    std::condition_variable::wait(lock, pred);
-  }
-
-  /**
-   * @brief get mutex.
-   */
-  std::mutex* mutex() { return &mutex_; }
-
-protected:
-  std::mutex mutex_;
-};
-
-}  // namespace paddle
diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h
deleted file mode 100644
index daebaffc855518425ae43942c22ec150d2e327f0..0000000000000000000000000000000000000000
--- a/paddle/utils/PythonUtil.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-// clang-format off
-#include "paddle/utils/Util.h"
-
-#ifndef PADDLE_NO_PYTHON
-// must include the following two blocks, otherwise,
-// gcc compiler may produce warning
-#ifdef __APPLE__
-#define _POSIX_SOURCE
-#define _POSIX_C_SOURCE 200809L
-#define _XOPEN_SOURCE 700
-#endif
-
-#ifdef _POSIX_C_SOURCE
-#define __TEMP_POSIX_C_SOURCE _POSIX_C_SOURCE
-#undef _POSIX_C_SOURCE
-#endif
-#ifdef _XOPEN_SOURCE
-#define __TEMP_XOPEN_SOURCE _XOPEN_SOURCE
-#undef _XOPEN_SOURCE
-#endif
-#include <Python.h>
-#include <frameobject.h>
-#endif
-
-#include <stdarg.h>
-#include <map>
-#include <mutex>
-// clang-format on
-
-namespace paddle {
-
-std::string callPythonFunc(const std::string& moduleName,
-                           const std::string& funcName,
-                           const std::vector<std::string>& args);
-
-#ifndef PADDLE_NO_PYTHON
-
-/**
- * Global lock guard of python C-api invokes.
- * NOTE: the lock of this guard is reentrant or recursive.
- */
-class PyGuard {
-public:
-  PyGuard();
-  PyGuard(const PyGuard& other) = delete;
-  PyGuard& operator=(const PyGuard& other) = delete;
-
-private:
-  std::lock_guard<std::recursive_mutex> guard_;
-};
-
-struct PyObjectDeleter {
-  void operator()(PyObject* obj) {
-    if (obj) {
-      Py_DECREF(obj);
-    }
-  }
-};
-
-typedef std::unique_ptr<PyObject, PyObjectDeleter> PyObjectPtr;
-
-PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName,
-                                   const std::string& funcName,
-                                   const std::vector<std::string>& args);
-
-PyObjectPtr createPythonClass(const std::string& moduleName,
-                              const std::string& className,
-                              const std::vector<std::string>& args,
-                              const std::map<std::string, std::string>& kwargs);
-
-#define CHECK_PY(x) CHECK((x) != nullptr) << ::paddle::py::getPyCallStack()
-
-namespace py {
-PyObjectPtr import(const std::string& moduleName);
-
-/**
- * Cast a PyLong or PyInt to int type T.
- * @tparam T return type.
- * @param [in] obj PyLong or PyInt object.
- * @param [out] ok status for casting. False if error occured. nullptr if user
- *                 don't care is ok or not.
- * @return The value of python object, or 0 if not ok.
- */
-template <typename T>
-T castInt(PyObject* obj, bool* ok = nullptr) {
-  if (PyLong_Check(obj)) {
-    if (ok) *ok = true;
-    return (T)PyLong_AsUnsignedLong(obj);
-  } else if (PyInt_Check(obj)) {
-    if (ok) *ok = true;
-    return (T)PyInt_AsLong(obj);
-  } else {
-    if (ok) *ok = false;
-    return (T)0;
-  }
-}
-
-/**
- * Invoke repr of python object.
- *
- * Just like toString method in java.
- */
-char* repr(PyObject* obj);
-
-/**
- * Invoke repr of python object.
- */
-inline char* repr(const PyObjectPtr& obj) { return repr(obj.get()); }
-
-/**
- * Get Python Error Stack String.
- */
-std::string getPyCallStack();
-
-/**
- * Object Helper for PyObjectPtr.
- *
- * Implements getAttr method for object.
- */
-class ObjectHelper {
-public:
-  explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {}
-
-  /**
-   * get attribute
-   */
-  inline PyObject* getAttr(const std::string& field) const {
-    auto obj = PyObject_GetAttrString(obj_.get(), field.c_str());
-    CHECK_PY(obj) << "Cannot get attribute on python object " << obj_.get();
-    return obj;
-  }
-
-  /**
-   * Get Int attribute
-   * @param [in] field  attribute name.
-   * @param [out] ok true if this attribute is int.
-   * @tparam T int type.
-   * @return int value.
-   */
-  template <typename T>
-  T getIntAttr(const std::string& field, bool* ok = nullptr) const {
-    PyObjectPtr tmp(getAttr(field));
-    return castInt<T>(tmp.get(), ok);
-  }
-
-  /**
-   * Get int attribute. Log(Fatal) when not ok
-   * @param field attribute name.
-   * @return int value.
-   */
-  template <typename T>
-  T getIntAttrWithError(const std::string& field) const {
-    bool ok;
-    T tmp = getIntAttr<T>(field, &ok);
-    CHECK(ok) << "Cannot get integer attribute on object " << obj_.get();
-    return tmp;
-  }
-
-  /**
-   * Get bool attribute.
-   * @param field
-   * @param [out] isBoolType return true if attribute is bool type. If the
-   *                         attribute is not bool type, then an implicit
-   *                         conversion will happens, and will return the
-   *                         conversion result.
-   *
-   *                         Such as, if the attribute is 1, then the return
-   *                         value of function will be true, but the isBoolType
-   *                         will return false.
-   * @return
-   */
-  bool getBoolAttr(const std::string& field, bool* isBoolType = nullptr) const {
-    PyObjectPtr tmp(getAttr(field));
-    if (isBoolType) {
-      *isBoolType = PyBool_Check(tmp.get());
-    }
-    return PyObject_IsTrue(tmp.get());
-  }
-
-private:
-  const PyObjectPtr& obj_;
-};
-
-/**
- * Python Sequence Helper
- *
- * The python sequence means list or tuple.
- */
-class SequenceHelper {
-public:
-  explicit SequenceHelper(const PyObjectPtr& seq) : seq_(seq.get()) {
-    CHECK(PySequence_Check(seq_));
-  }
-
-  explicit SequenceHelper(PyObject* seq) : seq_(seq) {
-    CHECK(PySequence_Check(seq_));
-  }
-
-  inline size_t size() const { return (size_t)PySequence_Size(seq_); }
-
-  inline PyObject* operator[](size_t i) const {
-    return PySequence_Fast_GET_ITEM(seq_, i);
-  }
-
-  inline double getDouble(size_t i) const {
-    auto* ptr = (*this)[i];
-    return PyFloat_AsDouble(ptr);
-  }
-
-  /**
-   * Set a sequence item o[i] = obj;
-   * @param i index
-   * @param obj setted item.
-   * @param steal if steal = true, sequence will move object in iteself,
-   *              just like std::move. Otherwise, it will increase reference
-   *              count. Default is false.
-   */
-  inline void set(size_t i, const PyObjectPtr& obj, bool steal = false) {
-    this->set(i, obj.get(), steal);
-  }
-
-  /**
-   * Set a sequence item o[i] = obj;
-   */
-  inline void set(size_t i, PyObject* obj, bool steal = false) {
-    if (!steal) {
-      Py_XINCREF(obj);
-    }
-    if (PyTuple_Check(seq_)) {
-      CHECK_NE(PyTuple_SetItem(seq_, i, obj), -1) << getPyCallStack();
-    } else {
-      CHECK_NE(PySequence_SetItem(seq_, i, obj), -1) << getPyCallStack();
-    }
-  }
-
-private:
-  PyObject* seq_;
-};
-
-class DictHelper {
-public:
-  explicit DictHelper(PyObject* d) : dict_(d) {}
-
-  explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {}
-
-  void set(const std::string& key, PyObject* item) {
-    PyDict_SetItemString(dict_, key.c_str(), item);
-  }
-
-  void setBool(const std::string& key, bool b) {
-    this->set(key, PyBool_FromLong(b));
-  }
-
-  void setStringList(const std::string& key,
-                     const std::vector<std::string>& items) {
-    auto* list = PyList_New(items.size());
-    for (size_t i = 0; i < items.size(); ++i) {
-      PyList_SetItem(list, i, PyString_FromString(items[i].c_str()));
-    }
-    this->set(key, list);
-  }
-
-private:
-  inline void checkDict() { CHECK(PyDict_Check(this->dict_)); }
-
-  PyObject* dict_;
-};
-
-inline static bool isCallable(const PyObjectPtr& obj) {
-  return PyCallable_Check(obj.get());
-}
-
-/**
- * Wrap a callable object.
- */
-class CallableHelper {
-public:
-  explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) {
-    CHECK(py::isCallable(obj_));
-  }
-
-  ~CallableHelper() {}
-
-  /**
-   * reset args, and create new tuple.
-   * @param sz args size.
-   */
-  void setArgsSize(size_t sz) { args.reset(PyTuple_New(sz)); }
-
-  /**
-   * Get args sequence. User can set/get by SequenceHelper.
-   */
-  SequenceHelper getArgs() { return SequenceHelper(args); }
-
-  /**
-   * Call python method, return an object.
-   */
-  PyObject* operator()() {
-    PyGuard guard;
-    return PyObject_Call(obj_.get(), args.get(), kwargs.get());
-  }
-
-private:
-  const PyObjectPtr& obj_;
-  PyObjectPtr args;
-  PyObjectPtr kwargs;
-};
-
-inline static PyObject* iterNext(const PyObjectPtr& context, bool* atEnd) {
-  PyGuard g;
-  PyObject* data = PyIter_Next(context.get());
-  if (data == nullptr) {
-    if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
-      PyErr_Clear();
-      *atEnd = true;
-      return nullptr;
-    } else if (PyErr_Occurred()) {
-      CHECK_PY(data) << "Calling iterator next error";
-      return nullptr;
-    } else {
-      *atEnd = false;
-      return data;  // just return none in iterator.
-    }
-  } else {
-    *atEnd = false;
-    return data;
-  }
-}
-}  // namespace py
-
-#endif
-
-/**
- * Initialize python.
- */
-void initPython(int argc, char** argv);
-
-}  // namespace paddle
diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h
deleted file mode 100644
index f054738f87c02d2d749eec8d6c7bb55b506a6d91..0000000000000000000000000000000000000000
--- a/paddle/utils/Queue.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <condition_variable>
-#include <deque>
-#include <mutex>
-
-#include "Locks.h"
-
-namespace paddle {
-
-/**
- * A thread-safe queue that automatically grows but never shrinks.
- * Dequeue a empty queue will block current thread. Enqueue an element
- * will wake up another thread that blocked by dequeue method.
- *
- * For example.
- * @code{.cpp}
- *
- * paddle::Queue<int> q;
- * END_OF_JOB=-1
- * void thread1() {
- *   while (true) {
- *     auto job = q.dequeue();
- *     if (job == END_OF_JOB) {
- *       break;
- *     }
- *     processJob(job);
- *   }
- * }
- *
- * void thread2() {
- *   while (true) {
- *      auto job = getJob();
- *      q.enqueue(job);
- *      if (job == END_OF_JOB) {
- *        break;
- *      }
- *   }
- * }
- *
- * @endcode
- */
-template <class T>
-class Queue {
-public:
-  /**
-   * @brief Construct Function. Default capacity of Queue is zero.
-   */
-  Queue() : numElements_(0) {}
-
-  ~Queue() {}
-
-  /**
-   * @brief enqueue an element into Queue.
-   * @param[in] el The enqueue element.
-   * @note This method is thread-safe, and will wake up another blocked thread.
-   */
-  void enqueue(const T& el) {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    elements_.emplace_back(el);
-    numElements_++;
-
-    queueCV_.notify_all();
-  }
-
-  /**
-   * @brief enqueue an element into Queue.
-   * @param[in] el The enqueue element. rvalue reference .
-   * @note This method is thread-safe, and will wake up another blocked thread.
-   */
-  void enqueue(T&& el) {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    elements_.emplace_back(std::move(el));
-    numElements_++;
-
-    queueCV_.notify_all();
-  }
-
-  /**
-   * Dequeue from a queue and return a element.
-   * @note this method will be blocked until not empty.
-   */
-  T dequeue() {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    queueCV_.wait(lock, [this]() { return numElements_ != 0; });
-    T el;
-
-    using std::swap;
-    // Becuase of the previous statement, the right swap() can be found
-    // via argument-dependent lookup (ADL).
-    swap(elements_.front(), el);
-
-    elements_.pop_front();
-    numElements_--;
-    if (numElements_ == 0) {
-      queueCV_.notify_all();
-    }
-    return el;
-  }
-
-  /**
-   * Return size of queue.
-   *
-   * @note This method is not thread safe. Obviously this number
-   * can change by the time you actually look at it.
-   */
-  inline int size() const { return numElements_; }
-
-  /**
-   * @brief is empty or not.
-   * @return true if empty.
-   * @note This method is not thread safe.
-   */
-  inline bool empty() const { return numElements_ == 0; }
-
-  /**
-   * @brief wait util queue is empty
-   */
-  void waitEmpty() {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    queueCV_.wait(lock, [this]() { return numElements_ == 0; });
-  }
-
-  /**
-   * @brief wait queue is not empty at most for some seconds.
-   * @param seconds wait time limit.
-   * @return true if queue is not empty. false if timeout.
-   */
-  bool waitNotEmptyFor(int seconds) {
-    std::unique_lock<std::mutex> lock(queueLock_);
-    return queueCV_.wait_for(lock, std::chrono::seconds(seconds), [this] {
-      return numElements_ != 0;
-    });
-  }
-
-private:
-  std::deque<T> elements_;
-  int numElements_;
-  std::mutex queueLock_;
-  std::condition_variable queueCV_;
-};
-
-/*
- * A thread-safe circular queue that
- * automatically blocking calling thread if capacity reached.
- *
- * For example.
- * @code{.cpp}
- *
- * paddle::BlockingQueue<int> q(capacity);
- * END_OF_JOB=-1
- * void thread1() {
- *   while (true) {
- *     auto job = q.dequeue();
- *     if (job == END_OF_JOB) {
- *       break;
- *     }
- *     processJob(job);
- *   }
- * }
- *
- * void thread2() {
- *   while (true) {
- *      auto job = getJob();
- *      q.enqueue(job); //Block until q.size() < capacity .
- *      if (job == END_OF_JOB) {
- *        break;
- *      }
- *   }
- * }
- */
-template <typename T>
-class BlockingQueue {
-public:
-  /**
-   * @brief Construct Function.
-   * @param[in] capacity the max numer of elements the queue can have.
-   */
-  explicit BlockingQueue(size_t capacity) : capacity_(capacity) {}
-
-  /**
-   * @brief enqueue an element into Queue.
-   * @param[in] x The enqueue element, pass by reference .
-   * @note This method is thread-safe, and will wake up another thread
-   * who was blocked because of the queue is empty.
-   * @note If it's size() >= capacity before enqueue,
-   * this method will block and wait until size() < capacity.
-   */
-  void enqueue(const T& x) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    notFull_.wait(lock, [&] { return queue_.size() < capacity_; });
-    queue_.push_back(x);
-    notEmpty_.notify_one();
-  }
-
-  /**
-   * Dequeue from a queue and return a element.
-   * @note this method will be blocked until not empty.
-   * @note this method will wake up another thread who was blocked because
-   * of the queue is full.
-   */
-  T dequeue() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    notEmpty_.wait(lock, [&] { return !queue_.empty(); });
-
-    T front(queue_.front());
-    queue_.pop_front();
-    notFull_.notify_one();
-    return front;
-  }
-
-  /**
-   * Return size of queue.
-   *
-   * @note This method is thread safe.
-   * The size of the queue won't change until the method return.
-   */
-  size_t size() {
-    std::lock_guard<std::mutex> guard(mutex_);
-    return queue_.size();
-  }
-
-  /**
-   * @brief is empty or not.
-   * @return true if empty.
-   * @note This method is thread safe.
-   */
-  size_t empty() {
-    std::lock_guard<std::mutex> guard(mutex_);
-    return queue_.empty();
-  }
-
-private:
-  std::mutex mutex_;
-  std::condition_variable notEmpty_;
-  std::condition_variable notFull_;
-  std::deque<T> queue_;
-  size_t capacity_;
-};
-
-}  // namespace paddle
diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h
deleted file mode 100644
index 79fd3b8cf043e62922dfd046754ee8ac261990c5..0000000000000000000000000000000000000000
--- a/paddle/utils/Stat.h
+++ /dev/null
@@ -1,302 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <sys/time.h>
-#include <iostream>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-
-#include "Locks.h"
-#include "Logging.h"
-#include "ThreadLocal.h"
-#include "hl_gpu.h"
-
-namespace paddle {
-
-class Stat;
-
-class StatInfo {
-public:
-  explicit StatInfo(Stat* stat = nullptr) : stat_(stat) {
-    total_ = 0;
-    max_ = 0;
-    count_ = 0;
-    min_ = UINT64_MAX;
-  }
-
-  void reset() {
-    total_ = 0;
-    count_ = 0;
-    max_ = 0;
-    min_ = UINT64_MAX;
-  }
-
-  ~StatInfo();
-
-  Stat* stat_;
-  uint64_t total_;
-  uint64_t max_;
-  uint64_t count_;
-  uint64_t min_;
-};
-
-class Stat;
-typedef std::shared_ptr<Stat> StatPtr;
-
-class StatSet {
-public:
-  explicit StatSet(const std::string& name) : name_(name) {}
-  ~StatSet() {}
-
-  // print to LOG(INFO)
-  void printSegTimerStatus();
-  void printAllStatus();
-
-  StatPtr getStat(const std::string& name) {
-    {
-      ReadLockGuard guard(lock_);
-      auto it = statSet_.find(name);
-      if (it != statSet_.end()) {
-        return it->second;
-      }
-    }
-    StatPtr stat = std::make_shared<Stat>(name);
-    std::lock_guard<RWLock> guard(lock_);
-    auto ret = statSet_.insert(std::make_pair(name, stat));
-    return ret.first->second;
-  }
-
-  // true for showing stats for each thread
-  // false for showing stats aggragated over threads
-  void setThreadInfo(const std::string& name, bool flag);
-
-  // true for showing stats for each thread
-  // false for showing stats aggragated over threads
-  void setThreadInfo(bool flag) {
-    for (auto& iter : statSet_) {
-      setThreadInfo(iter.first, flag);
-    }
-  }
-
-  // reset the counters for all stats
-  // clearRawData means also clearing raw tuning data, because at pserver end,
-  // barrier rawData(timeVector_) is stateful, clearing it will cause rubbish
-  // data, while rawData should be cleared at the new pass (so complicated
-  // pserver code logic, -_- ).
-  void reset(bool clearRawData = true);
-
-private:
-  std::unordered_map<std::string, StatPtr> statSet_;
-  const std::string name_;
-  RWLock lock_;
-};
-
-extern StatSet globalStat;
-
-/*@brief : a simple stat*/
-class Stat {
-public:
-  explicit Stat(const std::string& statName)
-      : destructStat_(nullptr), name_(statName), openThreadInfo_(false) {}
-  ~Stat() {}
-
-  typedef std::list<std::pair<StatInfo*, pid_t>> ThreadLocalBuf;
-
-  const std::string& getName() const { return name_; }
-
-  void addSample(uint64_t value);
-
-  // clear all stats
-  void reset();
-
-  friend std::ostream& operator<<(std::ostream& outPut, const Stat& stat);
-
-  /*  Set operator << whether to print thread info.
-   *  If openThreadInfo_ == true, then print, else print merge thread info.
-   */
-  void setThreadInfo(bool flag) { openThreadInfo_ = flag; }
-
-  bool getThreadInfo() const { return openThreadInfo_; }
-
-  friend class StatInfo;
-
-private:
-  void mergeThreadStat(StatInfo& allThreadStat);
-
-  std::mutex lock_;
-  ThreadLocalBuf threadLocalBuf_;
-  StatInfo destructStat_;
-  ThreadLocal<StatInfo> statInfo_;
-  const std::string name_;
-  bool openThreadInfo_;
-};
-
-extern StatSet globalStat;
-
-inline StatPtr getStat(const std::string& name) {
-  return globalStat.getStat(name);
-}
-
-inline uint64_t nowInMicroSec() {
-  timeval tvTime;
-  (void)gettimeofday(&tvTime, NULL);
-  return tvTime.tv_sec * 1000000LU + tvTime.tv_usec;
-}
-
-/**
- * A simple help class to measure time interval
- */
-class Timer {
-public:
-  explicit Timer(bool autoStart = true) : total_(0), startStamp_(0) {
-    if (autoStart) {
-      start();
-    }
-  }
-  void start() { startStamp_ = nowInMicroSec(); }
-  void setStartStamp(uint64_t startStamp) { startStamp_ = startStamp; }
-  uint64_t stop() {
-    total_ += nowInMicroSec() - startStamp_;
-    return total_;
-  }
-
-  uint64_t get() const { return total_; }
-
-  void reset() { total_ = 0; }
-
-protected:
-  uint64_t total_;
-  uint64_t startStamp_;
-};
-
-class TimerOnce {
-public:
-  TimerOnce(Stat* stat,
-            const char* info = "",
-            uint64_t threshold = -1,
-            bool autoStart = true,
-            uint64_t startStamp = 0)
-      : stat_(stat), info_(info), timer_(autoStart), threshold_(threshold) {
-    if (!autoStart) {
-      timer_.setStartStamp(startStamp);
-    }
-  }
-  ~TimerOnce() {
-    uint64_t span = timer_.stop();
-    if (span >= threshold_) {
-      LOG(INFO) << "Stat: [" << stat_->getName() << "] " << info_
-                << " [Span:" << span / 1000 << "ms" << span % 1000 << "us"
-                << "] ";
-    }
-    stat_->addSample(span);
-  }
-
-private:
-  Stat* stat_;
-  const char* info_;
-  Timer timer_;
-  uint64_t threshold_;
-};
-
-inline uint64_t registerTimerArg1(uint64_t threshold = -1,
-                                  StatSet& statSet = globalStat) {
-  return threshold;
-}
-
-inline StatSet& registerTimerArg2(uint64_t threshold = -1,
-                                  StatSet& statSet = globalStat) {
-  return statSet;
-}
-
-#ifdef PADDLE_DISABLE_TIMER
-
-#define REGISTER_TIMER(statName, ...)
-#define REGISTER_TIMER_SET(statName, start, ...)
-#define REGISTER_TIMER_DYNAMIC(statName, ...)
-#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)
-#define REGISTER_TIMER_INFO(statName, info)
-#define FOR_TIMING(statement)
-
-#else
-
-#define FOR_TIMING(statement) statement
-
-// The default arguments are shown in the following line:
-// REGISTER_TIMER(statName, threshold = -1, statSet = globalStat)
-// TODO(yuyang18,wangyanfei01): if UNIQUE_NAME is needed
-#define REGISTER_TIMER(statName, ...)                             \
-  static ::paddle::StatPtr __stat =                               \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  ::paddle::TimerOnce __timerOnce(                                \
-      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
-
-#define REGISTER_TIMER_SET(statName, start, ...)                            \
-  static ::paddle::StatPtr __stat =                                         \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
-  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
-                                  "",                                       \
-                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
-                                  false,                                    \
-                                  start);
-
-// dynmaic timer, support to discriminate runtime entity, used in pserver
-#define REGISTER_TIMER_DYNAMIC(statName, ...)                     \
-  ::paddle::StatPtr __stat =                                      \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName); \
-  ::paddle::TimerOnce __timerOnce(                                \
-      __stat.get(), "", ::paddle::registerTimerArg1(__VA_ARGS__));
-
-#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...)                    \
-  ::paddle::StatPtr __stat =                                                \
-      ::paddle::registerTimerArg2(__VA_ARGS__).getStat(statName);           \
-  ::paddle::TimerOnce __timerOnce(__stat.get(),                             \
-                                  "",                                       \
-                                  ::paddle::registerTimerArg1(__VA_ARGS__), \
-                                  false,                                    \
-                                  start);
-
-#define REGISTER_TIMER_INFO(statName, info)                                 \
-  static ::paddle::StatPtr __stat = ::paddle::globalStat.getStat(statName); \
-  ::paddle::TimerOnce __timerOnce(                                          \
-      __stat.get(), info, 10 * 1000000LU /*threshold*/);
-
-#endif  // DISABLE_TIMER
-
-class GpuProfiler final {
-public:
-  GpuProfiler(std::string statName, std::string info);
-  ~GpuProfiler();
-
-private:
-  std::lock_guard<std::recursive_mutex> guard_;
-};
-
-#ifdef PADDLE_DISABLE_PROFILER
-
-#define REGISTER_GPU_PROFILER(statName, ...)
-
-#else
-
-#define REGISTER_GPU_PROFILER(statName, ...) \
-  GpuProfiler __gpuProfiler(statName, #__VA_ARGS__);
-
-#endif  // DISABLE_PROFILER
-
-}  // namespace paddle
diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h
deleted file mode 100644
index ef36a8c5b2b0e95d759da8a781d781b71d067b7a..0000000000000000000000000000000000000000
--- a/paddle/utils/Thread.h
+++ /dev/null
@@ -1,615 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <thread>
-#include "Logging.h"
-#include "Util.h"
-
-#include "Queue.h"
-#include "ThreadLocal.h"
-
-#include <future>
-
-namespace paddle {
-
-/**
- * A simple wrapper for std::thread
- */
-
-class Thread {
-public:
-  /**
-   * @brief Construct Function. Default thread pointer is null.
-   */
-  Thread() { thread_ = nullptr; }
-
-  virtual ~Thread() {}
-
-  /**
-   * @brief Creat a new thread and call *run()* function.
-   */
-  void start() {
-    thread_.reset(new std::thread([this]() { this->run(); }));
-  }
-
-  /**
-   * @brief Detach the thread.
-   * It don't need to be waited until it finish.
-   */
-  void detach() { thread_->detach(); }
-
-  /**
-   * @brief Join the thread.
-   * It should be waited until it finish.
-   */
-  void join() { thread_->join(); }
-
-  /**
-   * @brief Define what to be done on this thread through override this
-   * function.
-   */
-  virtual void run() = 0;
-
-protected:
-  std::unique_ptr<std::thread> thread_;
-};
-
-/**
- * ThreadWorker maintains a job queue. It executes the jobs in the job queue
- * sequentianlly in a separate thread.
- *
- * Use addJob() to add a new job to the job queue.
- */
-class ThreadWorker : protected Thread {
-public:
-  typedef std::function<void()> JobFunc;
-
-  /**
-   * @brief Construct Function. Default size of job queue is 0 and not stopping.
-   */
-  ThreadWorker() : stopping_(false), empty_(true) { start(); }
-
-  /**
-   * @brief Destruct Function.
-   * If it's running, wait until all job finish and then stop it.
-   */
-  ~ThreadWorker() {
-    if (!stopping_) {
-      wait();
-      stop();
-    }
-  }
-
-  /**
-   * @brief Finish current running job and quit the thread.
-   */
-  void stop() {
-    stopping_ = true;
-    jobs_.enqueue([]() {});
-    join();
-  }
-
-  /**
-   * @brief Add a new job to the job queue.
-   */
-  void addJob(JobFunc func) {
-    empty_ = false;
-    jobs_.enqueue(func);
-  }
-
-  /**
-   * @brief Wait until all jobs was done (the job queue was empty).
-   */
-  void wait() {
-    finishCV_.wait([this] { return empty_; });
-  }
-
-protected:
-  /**
-   * @brief Execute jobs in the job queue sequentianlly,
-   * @note If finish all the jobs in the job queue,
-   * notifies all the waiting threads the job queue was empty.
-   */
-  virtual void run() {
-    while (true) {
-      JobFunc func = jobs_.dequeue();
-      if (stopping_) break;
-      func();
-      if (jobs_.empty()) {
-        finishCV_.notify_all([this] { empty_ = true; });
-      }
-    }
-  }
-
-  Queue<JobFunc> jobs_;
-  bool stopping_;
-  LockedCondition finishCV_;
-  bool empty_;
-};
-
-/**
- * SyncThreadPool maintains a pool of threads.
- * It executes the job use all workers in the pool.
- *
- * Use exec() to run a new job, job complete when exec returned.
- * Only one job can exec simultaneously.
- *
- * Each worker has an tid whose range is [0, getNumThreads()).
- * JobFunc can use tid to divide input data.
- */
-class SyncThreadPool {
-public:
-  typedef std::function<void(int tid, size_t numThreads)> JobFunc;
-
-  /**
-   * @brief Construct Function. No thread will be created.
-   */
-  SyncThreadPool() : jobStartBarrier_(0), jobFinishBarrier_(0) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief Construct Fucntion. Create numWorkers of threads in the pool.
-   * @param[in] numWorkers Number of the workers in the pool.
-   * @param[in] checkOwner Default true. If checkOwner is true,
-   * this sync thread pool should be used by it's owner thread.
-   */
-  explicit SyncThreadPool(size_t numWorkers, bool checkOwner = true)
-      : stopping_(false),
-        jobStartBarrier_(numWorkers + 1),
-        jobFinishBarrier_(numWorkers + 1),
-        jobFunc_(nullptr),
-        checkOwner_(checkOwner) {
-    ownerThreadId_ = getTID();
-    workers_.resize(numWorkers);
-    start();
-  }
-
-  ~SyncThreadPool() {
-    if (!stopping_) {
-      stop();
-    }
-  }
-
-  /**
-   * @brief Return num of threads in the pool.
-   */
-  size_t getNumThreads() { return workers_.size(); }
-
-  /**
-   * @brief Execute a job using all the theads in the pool.
-   * @param[in] jobFunc The function to be executed.
-   * @param[in] ownerFunc Owner thread can do something in owerFunc when job
-   * executing.
-   * @note For the ownerFunc, tid=getNumThreads().
-   */
-  void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) {
-    if (checkOwner_) {
-      CHECK_EQ(ownerThreadId_, getTID())
-          << "this sync thread pool should be used in one thread";
-    }
-
-    CHECK(jobFunc_ == nullptr);
-    jobFunc_ = jobFunc;
-    jobStartBarrier_.wait();  // notify worker thread start job
-
-    if (ownerFunc) {
-      ownerFunc(workers_.size(), workers_.size());
-    }
-
-    jobFinishBarrier_.wait();  // wait all worker thread complete
-    jobFunc_ = nullptr;
-  }
-
-  /**
-   * @brief Execute a job using all the threads in the pool.
-   * And the owner thread will do the same job.
-   * @param jobFunc The job to be executed.
-   * @note  Assume that JobFunc will execute numThread + 1 times,
-   * with tid ranging [0,numThread]. The thread whose tid is numThread
-   * is the owner thread.
-   */
-  void execPlusOwner(JobFunc jobFunc) { exec(jobFunc, jobFunc); }
-
-  /**
-   * @brief Execute a job if has pool, else use caller thread as a worker.
-   * @param[in] pool The pool to execute the job.
-   * @param[in] jobFunc The job to be excuted.
-   */
-  static void execHelper(SyncThreadPool* pool, JobFunc jobFunc) {
-    if (pool) {
-      pool->exec(jobFunc);
-    } else {
-      jobFunc(0, 1);
-    }
-  }
-
-protected:
-  /**
-   * @brief Start all the workers in the pool, call their run() function.
-   */
-  void start() {
-    for (size_t i = 0; i < workers_.size(); ++i) {
-      workers_[i].reset(
-          new std::thread([this](int tid) { this->run(tid); }, i));
-    }
-  }
-
-  /**
-   * @brief Stop all the workers in the pool.
-   */
-  void stop() {
-    stopping_ = true;
-    // notify worker thread to stop
-    jobStartBarrier_.wait();
-
-    // stop workers
-    for (auto& thread : workers_) {
-      if (thread) {
-        thread->join();
-        thread.reset(nullptr);
-      }
-    }
-  }
-
-  /**
-   * @brief Execute the jobFunc_ using the worker thread tid, if not stopping.
-   */
-  void run(int tid) {
-    VLOG(1) << "SyncThreadPool worker thread " << tid;
-    // init seed deterministic, but differs from global srand()
-    ThreadLocalRand::initThreadSeed(tid + workers_.size());
-
-    while (true) {
-      jobStartBarrier_.wait();  // wait job
-
-      if (stopping_) {
-        break;
-      }
-
-      jobFunc_(tid, workers_.size());
-
-      jobFinishBarrier_.wait();  // notify job complete
-    }
-  }
-
-protected:
-  pid_t ownerThreadId_;
-  bool stopping_;
-  ThreadBarrier jobStartBarrier_;
-  ThreadBarrier jobFinishBarrier_;
-
-  JobFunc jobFunc_;
-  bool checkOwner_;
-  std::vector<std::unique_ptr<std::thread>> workers_;
-};
-
-/**
- * MultiThreadWorker maintains a job queue and a result queue.
- * It executes the jobs in the job queue and puts the results into the
- * result queue sequentially in multi separate threads.
- *
- * Add jobs:
- *
- *    Use addJob() to add a new job to the job queue
- *        (the user added jobs should not return nullptr).
- *
- *    Use stopAddJob() to stop adding new jobs to the job queue
- *        (addJob() can not be called after stopAddJob()).
- *
- * Normal stop:
- *
- *    Use waitResult() to get the results until nullptr is returned.
- *    Use stop() to exit normally
- *        (stopAddJob() should be called first).
- *
- * Force stop:
- *
- *    Use forceStop() to exit forcibly even though there are remaining jobs in
- * the
- * job queue.
- */
-template <class T>
-class MultiThreadWorker {
-public:
-  typedef T ResultType;
-  typedef std::shared_ptr<ResultType> ResultPtrType;
-  typedef std::function<ResultPtrType()> JobFunc;
-  /**
-   * @brief Construct Function. Initialize the multithread worker.
-   * @param[in] workerNum Number of the workers.
-   * @param[in] queueCapacity Capapcity of the result queue.
-   */
-  MultiThreadWorker(size_t workerNum, size_t queueCapacity)
-      : stopping_(false),
-        jobAdding_(true),
-        nullResultNum_(0),
-        results_(queueCapacity) {
-    workers_.resize(workerNum);
-    for (auto& worker : workers_) {
-      worker.reset(new std::thread([this]() { this->run(); }));
-    }
-  }
-
-  /**
-   * @brief Destruct Function. Force stop the workers
-   * even though there are remaining jobs in the job queue.
-   */
-  virtual ~MultiThreadWorker() { forceStop(); }
-
-  /**
-   * @brief Stop all the workers normally.
-   * @note stopAddJob() should be called before it.
-   */
-  void stop() {
-    CHECK(!jobAdding_) << "stopAddJob() should be called before stop()";
-    for (auto& worker : workers_) {
-      if (worker) {
-        worker->join();
-        worker = nullptr;
-      }
-    }
-    stopping_ = true;
-  }
-
-  /**
-   * @brief Stop all the workers forcibly.
-   * @note This function will call stopAddJob() first
-   * and empty the result queue.
-   */
-  void forceStop() {
-    if (!stopping_) {
-      stopping_ = true;
-      stopAddJob();
-      while (nullptr != waitResult()) {
-      }
-      stop();
-    }
-  }
-
-  /**
-   * @brief Add a job to the job queue.
-   * @note Job can not be added after calling stopAddJob().
-   */
-  void addJob(JobFunc func) {
-    CHECK(jobAdding_) << "addJob() can not be called after stopAddJob()";
-    jobs_.enqueue(func);
-  }
-
-  /**
-   * @brief Stop adding new jobs to the job queue.
-   * @note This fuction enqueue a return nullptr function to the job queue.
-   */
-  void stopAddJob() {
-    for (size_t i = 0; i < workers_.size(); ++i) {
-      jobs_.enqueue([]() { return nullptr; });
-    }
-    jobAdding_ = false;
-  }
-
-  /**
-   * @brief Dequeue the first result in the result queue and return it.
-   * @note If the result queue is empty, wait until it's not empty
-   * or return nullptr if all the results have been returned.
-   */
-  ResultPtrType waitResult() {
-    while (true) {
-      ResultPtrType result = results_.dequeue();
-      if (result) {
-        return result;
-      }
-
-      ++nullResultNum_;
-      if (nullResultNum_ == workers_.size()) {
-        return nullptr;
-      }
-    }
-  }
-
-  /**
-   * @brief The result queue is empty or not.
-   * @return true if empty.
-   */
-  bool testResult() { return results_.empty(); }
-
-protected:
-  /**
-   * @brief Do the jobs in the job queue sequentianlly
-   * and enqueue the result into the result queue.
-   * @note A nullptr will be enqueued into the resulte queue, when a worker
-   * finished.
-   */
-  virtual void run() {
-    while (true) {
-      JobFunc func = jobs_.dequeue();
-      ResultPtrType result = func();
-      if (result == nullptr || stopping_) {
-        // When a worker finished, a nullptr would be enqueued into results_
-        results_.enqueue(nullptr);
-        break;
-      }
-      results_.enqueue(result);
-    }
-  }
-
-  bool stopping_;
-  bool jobAdding_;
-  size_t nullResultNum_;
-  Queue<JobFunc> jobs_;
-  BlockingQueue<ResultPtrType> results_;
-  std::vector<std::unique_ptr<std::thread>> workers_;
-};
-
-/**
- * AsyncThreadPool maintains a job queue and threads pool.
- * It executes the jobs from queue asynchronously.
- *
- * Add jobs:
- *
- *    Use addJob() to add a new job to the job queue and get a std::future
- *    result. The caller's thread continues running. Call std::future::get()
- *    when the result's value is needed, and the caller's thread may be
- *    blocked until thread-pool finished the job.
- *
- *    Use addBatchJobs() to add a batch of jobs.
- *    Unlike addJob()'s asynchronization, addBatchJobs will block caller's
- *    thread until all jobs in the batch are finished.
- *
- * Stop:
- *    Use stop() to stop the thread pool. Job can be added once stopped.
- *
- * Process-wide Singleton:
- *    Use AsyncThreadPool::ProcessChannel(N) first to create N threads.
- *    Then call AsyncThreadPool::ProcessChannel() to get the process-wide global
- *    thread pool.
- */
-class AsyncThreadPool {
-public:
-  typedef std::function<void()> JobFunc;
-
-  AsyncThreadPool() { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief Construct Function. Install all the workers.
-   * @param[in] threadNum Number of the threads, must greater than 1.
-   */
-  explicit AsyncThreadPool(size_t threadNum) {
-    CHECK_GT(threadNum, 1U);
-    stopping_ = false;
-    workers_.resize(threadNum);
-    for (auto& worker : workers_) {
-      worker.reset(new std::thread([this]() { this->run(); }));
-    }
-  }
-
-  ~AsyncThreadPool() {
-    if (!stopping_) {
-      stop();
-    }
-  }
-
-  /**
-   * @brief Stop all the workers normally.
-   */
-  void stop() {
-    stopping_ = true;
-    for (size_t i = 0; i < workers_.size(); i++) {
-      jobs_.enqueue([] {});
-    }
-    for (auto& worker : workers_) {
-      worker->join();
-    }
-  }
-
-  /**
-   * @brief A process-wide singleton. Used as a global thread pool
-   *    It should be initialized by calling
-   *    AsyncThreadPool::ProcessChannel(N) first to create N threads,
-   *    then call AsyncThreadPool::ProcessChannel() will get the thread pool.
-   */
-  static AsyncThreadPool& ProcessChannel(size_t initThreadNum = 0) {
-    static std::shared_ptr<AsyncThreadPool> channel(
-        new AsyncThreadPool(initThreadNum));
-    return *channel;
-  }
-
-  /**
-   * @brief Add a job to queue and return a std::future.
-   * @note The job will be executed
-   * asynchronously.
-   * Call std::future::get() when the execturation result is needed;
-   */
-  template <class F, class... Args>
-  auto addJob(F&& f, Args&&... args)
-      -> std::future<typename std::result_of<F(Args...)>::type> {
-    CHECK(!stopping_) << "AsyncThreadPool is closed";
-    typedef typename std::result_of<F(Args...)>::type T;
-
-    auto task = std::make_shared<std::packaged_task<T()>>(
-        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
-    auto res = task->get_future();
-    jobs_.enqueue([task] { (*task)(); });
-    return res;
-  }
-
-  /**
-   * @brief Add a batch of jobs to the queue. The main thread will be blocked
-   * until these jobs are finished.
-   * The results will be stored in  `results` according to `jobs` order.
-   *
-   * @tparam F should have a return value.
-   *
-   * @param[in] jobs a vector of executable objection.
-   * @param[in] results a vector to store the results.
-   *
-   * @note *results* may need to be carefully cleared before *addBatchJobs()*.
-   */
-  template <class F>
-  void addBatchJobs(const std::vector<F>& jobs,
-                    std::vector<typename std::result_of<F()>::type>& results) {
-    typedef typename std::result_of<F()>::type T;
-    static_assert(!std::is_same<T, void>::value,
-                  "should pass a non-void function as job");
-
-    std::vector<std::future<T>> resFuts;
-    for (const auto& job : jobs) {
-      resFuts.emplace_back(addJob(job));
-    }
-    for (auto& fut : resFuts) {
-      results.emplace_back(fut.get());
-    }
-  }
-
-  /**
-   * @brief Add a batch of jobs reguardless of its result.
-   * @tparam F don't need to have a return value.
-   * @param[in] jobs a vector of executable objection.
-   */
-  template <class F>
-  void addBatchJobs(const std::vector<F>& jobs) {
-    CHECK(!stopping_) << "AsyncThreadPool is closed";
-    std::vector<std::future<bool>> tmpRes;
-
-    for (const auto& job : jobs) {
-      tmpRes.emplace_back(addJob([&job] {
-        job();
-        return true;
-      }));
-    }
-
-    for (auto& res : tmpRes) {
-      res.get();
-    }
-  }
-
-protected:
-  /**
-   * @brief Execute the jobs in the job queue.
-   */
-  void run() {
-    while (true) {
-      JobFunc func = jobs_.dequeue();
-      func();
-      if (stopping_) break;
-    }
-  }
-
-private:
-  std::vector<std::unique_ptr<std::thread>> workers_;
-  Queue<JobFunc> jobs_;
-  bool stopping_;
-};  // class AsyncThreadPool
-
-}  // namespace paddle
diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h
deleted file mode 100644
index 0a27b8b97b83a9066af23039a317c437ea56777a..0000000000000000000000000000000000000000
--- a/paddle/utils/ThreadLocal.h
+++ /dev/null
@@ -1,229 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <pthread.h>
-#include <sys/syscall.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <map>
-#include <mutex>
-#include <random>
-#include "Logging.h"
-#include "Util.h"
-
-namespace paddle {
-
-/**
- * Thread local storage for object.
- * Example:
- *
- * Declarartion:
- * ThreadLocal<vector<int>> vec_;
- *
- * Use in thread:
- * vector<int>& vec = *vec; // obtain the thread specific object
- * vec.resize(100);
- *
- * Note that this ThreadLocal will desconstruct all internal data when thread
- * exits
- * This class is suitable for cases when frequently creating and deleting
- * threads.
- *
- * Consider implementing a new ThreadLocal if one needs to frequently create
- * both instances and threads.
- *
- * see also ThreadLocalD
- */
-template <class T>
-class ThreadLocal {
-public:
-  ThreadLocal() {
-    CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
-  }
-  ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
-
-  /**
-   * @brief get thread local object.
-   * @param if createLocal is true and thread local object is never created,
-   * return a new object. Otherwise, return nullptr.
-   */
-  T* get(bool createLocal = true) {
-    T* p = (T*)pthread_getspecific(threadSpecificKey_);
-    if (!p && createLocal) {
-      p = new T();
-      int ret = pthread_setspecific(threadSpecificKey_, p);
-      CHECK_EQ(ret, 0);
-    }
-    return p;
-  }
-
-  /**
-   * @brief set (overwrite) thread local object. If there is a thread local
-   * object before, the previous object will be destructed before.
-   *
-   */
-  void set(T* p) {
-    if (T* q = get(false)) {
-      dataDestructor(q);
-    }
-    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
-  }
-
-  /**
-   * return reference.
-   */
-  T& operator*() { return *get(); }
-
-  /**
-   * Implicit conversion to T*
-   */
-  operator T*() { return get(); }
-
-private:
-  static void dataDestructor(void* p) { delete (T*)p; }
-
-  pthread_key_t threadSpecificKey_;
-};
-
-/**
- * Almost the same as ThreadLocal, but note that this ThreadLocalD will
- * destruct all internal data when ThreadLocalD instance destructs.
- *
- * This class is suitable for cases when frequently creating and deleting
- * objects.
- *
- * see also ThreadLocal
- *
- * @note The type T must implemented default constructor.
- */
-template <class T>
-class ThreadLocalD {
-public:
-  ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
-  ~ThreadLocalD() {
-    pthread_key_delete(threadSpecificKey_);
-    for (auto t : threadMap_) {
-      dataDestructor(t.second);
-    }
-  }
-
-  /**
-   * @brief Get thread local object. If not exists, create new one.
-   */
-  T* get() {
-    T* p = (T*)pthread_getspecific(threadSpecificKey_);
-    if (!p) {
-      p = new T();
-      CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
-      updateMap(p);
-    }
-    return p;
-  }
-
-  /**
-   * @brief Set thread local object. If there is an object create before, the
-   * old object will be destructed.
-   */
-  void set(T* p) {
-    if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
-      dataDestructor(q);
-    }
-    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
-    updateMap(p);
-  }
-
-  /**
-   * @brief Get reference of the thread local object.
-   */
-  T& operator*() { return *get(); }
-
-private:
-  static void dataDestructor(void* p) { delete (T*)p; }
-
-  void updateMap(T* p) {
-    pid_t tid = getTID();
-    CHECK_NE(tid, -1);
-    std::lock_guard<std::mutex> guard(mutex_);
-    auto ret = threadMap_.insert(std::make_pair(tid, p));
-    if (!ret.second) {
-      ret.first->second = p;
-    }
-  }
-
-  pthread_key_t threadSpecificKey_;
-  std::mutex mutex_;
-  std::map<pid_t, T*> threadMap_;
-};
-
-/**
- * @brief Thread-safe C-style random API.
- */
-class ThreadLocalRand {
-public:
-  /**
-   * initSeed just like srand,
-   * called by main thread,
-   * init defaultSeed for all thread
-   */
-  static void initSeed(unsigned int seed) { defaultSeed_ = seed; }
-
-  /**
-   * initThreadSeed called by each thread,
-   * init seed to defaultSeed + *tid*
-   * It should be called after main initSeed and before using rand()
-   * It's optional, getSeed will init seed if it's not initialized.
-   */
-  static void initThreadSeed(int tid) {
-    seed_.set(new unsigned int(defaultSeed_ + tid));
-  }
-
-  /// thread get seed, then can call rand_r many times.
-  /// Caller thread can modify the seed value if it's necessary.
-  ///
-  /// if flag thread_local_rand_use_global_seed set,
-  /// the seed will be set to defaultSeed in thread's first call.
-  static unsigned int* getSeed();
-
-  /// like ::rand
-  static int rand() { return rand_r(getSeed()); }
-
-  /**
-   * Get defaultSeed for all thread.
-   */
-  static int getDefaultSeed() { return defaultSeed_; }
-
-protected:
-  static unsigned int defaultSeed_;
-  static ThreadLocal<unsigned int> seed_;
-};
-
-/**
- * @brief Thread-safe C++ style random engine.
- */
-class ThreadLocalRandomEngine {
-public:
-  /**
-   * get random_engine for each thread.
-   *
-   * Engine's seed will be initialized by ThreadLocalRand.
-   */
-  static std::default_random_engine& get();
-
-protected:
-  static ThreadLocal<std::default_random_engine> engine_;
-};
-
-}  // namespace paddle
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
deleted file mode 100644
index 9579881ea3b92abab0189631184bab515afb67a3..0000000000000000000000000000000000000000
--- a/paddle/utils/Util.h
+++ /dev/null
@@ -1,570 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <sys/syscall.h>  // for syscall()
-#include <sys/types.h>
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <thread>
-#include <unordered_map>
-#include <vector>
-
-#include "Common.h"
-#include "Logging.h"
-#include "TrainerConfig.pb.h"
-
-#include "Flags.h"
-#include "hl_gpu.h"
-
-#if defined(__ANDROID__) && (__ANDROID_API__ < 21)
-inline int rand_r(unsigned int* seedp) {
-  (void)seedp;
-  return rand();
-}
-#endif
-
-/**
- * Loop over the elements in a container
- * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,
- *                 or make it a inline method?
- * Example:
- * FOR_EACH(it, array) {
- *  sum += *it;
- * }
- */
-#define FOR_EACH(iterator_name, container)                              \
-  for (auto iterator_name = (container).begin(), e = (container).end(); \
-       iterator_name != e;                                              \
-       ++iterator_name)
-
-/**
- * Loop over the elements in a container in reverse order
- * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach,
- *                 or make it a inline method?
- * Example:
- * FOR_EACH_R(it, array) {
- *  sum += *it;
- * }
- */
-#define FOR_EACH_R(iterator_name, container)                              \
-  for (auto iterator_name = (container).rbegin(), e = (container).rend(); \
-       iterator_name != e;                                                \
-       ++iterator_name)
-
-namespace paddle {
-
-// return the thread id used by glog
-pid_t getTID();
-
-/**
- * return the 1-based index of the highest bit set
- *
- * for x > 0:
- * \f[
- *    findLastSet(x) = 1 + \floor*{\log_{2}x}
- * \f]
- */
-inline constexpr size_t findLastSet(size_t x) {
-  return std::is_same<size_t, unsigned int>::value
-             ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0)
-             : (std::is_same<size_t, unsigned long>::value  // NOLINT
-                    ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
-                    : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
-}
-
-/**
- * calculate the non-negative remainder of a/b
- * @param[in] a
- * @param[in] b, should be positive
- * @return the non-negative remainder of a / b
- */
-inline int mod(int a, int b) {
-  int r = a % b;
-  return r >= 0 ? r : r + b;
-}
-
-/**
- * find the value given a key k from container c.
- * If the key can be found, the value is stored in *value
- * return true if the key can be found. false otherwise.
- */
-template <class K, class V, class C>
-bool mapGet(const K& k, const C& c, V* value) {
-  auto it = c.find(k);
-  if (it != c.end()) {
-    *value = it->second;
-    return true;
-  } else {
-    return false;
-  }
-}
-
-template <class Container, class T>
-static bool contains(const Container& container, const T& val) {
-  return std::find(container.begin(), container.end(), val) != container.end();
-}
-
-/**
- * pop and get the front element of a container
- */
-template <typename Container>
-typename Container::value_type pop_get_front(Container& c) {
-  typename Container::value_type v;
-  swap(v, c.front());
-  c.pop_front();
-  return v;
-}
-
-#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a)))
-
-/**
- * Initialize some creators or initFunctions for layers and data
- * providers.
- * Client codes should call this function before they refer any other
- * codes that use the layer class and data provider class.
- *
- * Codes inside 'core' directory can call initMain which calls
- * runInitFunctions directly, while codes outside core can simply
- * call runInitFunctions if they don't need the commandline flags
- * designed for PADDLE main procedure.
- */
-void runInitFunctions();
-
-/**
- * Initialize logging and parse commandline
- */
-void initMain(int argc, char** argv);
-
-// read the whole file into a string
-std::string readFile(const std::string& fileName);
-
-// copy file to path
-void copyFileToPath(const std::string& file, const std::string& path);
-
-// test file exist or not
-bool fileExist(const char* filename);
-// touch file if not exist
-void touchFile(const char* filename);
-// make dir if not exist
-void mkDir(const char* filename);
-void mkDirRecursively(const char* filename);
-
-void rmDir(const char* folderName);
-
-// load a file list file into a vector(fileList)
-void loadFileList(const std::string& fileListFileName,
-                  std::vector<std::string>& fileList);
-
-/**
- * Register a function, the function will be called in initMain(). Functions
- * with higher priority will be called first. The execution order of functions
- * with same priority is not defined.
- */
-void registerInitFunction(std::function<void()> func, int priority = 0);
-class InitFunction {
-public:
-  explicit InitFunction(std::function<void()> func, int priority = 0) {
-    registerInitFunction(func, priority);
-  }
-};
-
-/**
- * Class SetDevice provides a mechanism for set device enviroment.
- * When a SetDevice object is created, it attempts to change device enviroment.
- * When the SetDevice object is destructed, it will restore device environment.
- */
-class SetDevice {
-public:
-  explicit SetDevice(int deviceId) {
-    isSet_ = deviceId >= 0;
-    devId_ = 0;
-    if (isSet_) {
-      devId_ = hl_get_device();
-      hl_set_device(deviceId);
-    }
-  }
-  ~SetDevice() {
-    if (isSet_) {
-      hl_set_device(devId_);
-    }
-  }
-
-protected:
-  bool isSet_;
-  int devId_;
-};
-
-/**
- * Enables direct access to memory allocations on a peer device(d2).
- * input:
- * *d1* is device can direct access device d2.
- * *d2* is peer device to enable direct access to by the d1 device.
- */
-inline void enablePeerAccess(int d1, int d2) {
-#ifdef PADDLE_WITH_CUDA
-  if (hl_device_can_access_peer(d1, d2)) {
-    SetDevice dev(d1);
-    hl_device_enable_peer_access(d2);
-  }
-#else
-  LOG(FATAL) << "Paddle should be compiled in GPU mode to use this method.";
-#endif
-}
-
-/**
- * Change the gpu computation mode to asynchronized mode for the rest of the
- * compilation block. This is useful if the computation consists of multiple
- * small steps. Async mode can overlap the cuda-kernel launch overhead with the
- * actual computation.
- * Example:
- * {
- *    AsycnGpuBlock asyncBlock;
- *    do_some_gpu_computation
- * }
- */
-class AsyncGpuBlock {
-public:
-  AsyncGpuBlock() : syncFlag_(hl_get_sync_flag()) { hl_set_sync_flag(false); }
-  ~AsyncGpuBlock() {
-    if (syncFlag_) {
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      hl_set_sync_flag(syncFlag_);
-    }
-  }
-
-private:
-  bool syncFlag_;
-};
-
-inline bool useGpu(int deviceId) {
-  return FLAGS_parallel_nn ? (deviceId >= 0 ? true : false) : FLAGS_use_gpu;
-}
-
-/*
- * hppl activation mode
- */
-hl_activation_mode_t hlActiveType(const std::string& type);
-
-/**
- * Return value: memory usage ratio (from 0-1)
- */
-double getMemoryUsage();
-
-/**
- * split array by index.
- * used by sync multi thread task,
- * each thread call calcSplitArrayInterval with thread id,
- * get a interval as return.
- * input:
- * *totalSize* is array size,
- * *tId* is thread id, *tSize* is total worker thread num
- * output:
- * start and end index as a std::pair
- */
-inline std::pair<size_t, size_t> calcSplitArrayInterval(size_t totalSize,
-                                                        size_t tId,
-                                                        size_t tSize) {
-  size_t start = totalSize * tId / tSize;
-  size_t end = totalSize * (tId + 1) / tSize;
-  return std::make_pair(start, end);
-}
-
-/**
- * same as above, but split at boundary of block.
- */
-inline std::pair<size_t, size_t> calcSplitArrayInterval(size_t totalSize,
-                                                        size_t tId,
-                                                        size_t tSize,
-                                                        size_t blockSize) {
-  size_t numBlocks = totalSize / blockSize;
-  if (numBlocks * blockSize < totalSize) {
-    numBlocks++;
-  }
-
-  auto interval = calcSplitArrayInterval(numBlocks, tId, tSize);
-  size_t start = std::min(interval.first * blockSize, totalSize);
-  size_t end = std::min(interval.second * blockSize, totalSize);
-
-  return std::make_pair(start, end);
-}
-
-// Calculate the number of pservers/dservers based
-// on the host list and port_num.
-size_t calculateServiceNum(const std::string& pservers, int ports_num);
-
-/**
- * sort and unique ids vector.
- */
-inline void uniqueIds(std::vector<uint32_t>& ids) {
-  std::sort(ids.begin(), ids.end());
-  auto endpos = std::unique(ids.begin(), ids.end());
-  ids.erase(endpos, ids.end());
-}
-
-/**
- * Read Type value
- */
-template <typename T>
-T readT(char*& p, const char* pEnd) {
-  int minus = pEnd - p - sizeof(T);
-  CHECK_LE(0, minus) << "readT: Out of range.";
-  T v = *reinterpret_cast<T*>(p);
-  p += sizeof(T);
-  return v;
-}
-
-void memcpyWithCheck(void* dest,
-                     const void* src,
-                     size_t num,
-                     const void* srcEnd);
-
-/**
- * A global sync thread pool, has #FLAGS_trainer_count of threads.
- * can be used in main thread.
- */
-class SyncThreadPool;
-SyncThreadPool* getGlobalSyncThreadPool();
-
-namespace path {
-
-// directory separator
-const char sep = '/';
-
-// Return the base name of pathname path.
-std::string basename(const std::string& path);
-
-// Return the directory name of path. If the path does not contains any
-// directory, it returns an empty string.
-std::string dirname(const std::string& path);
-
-/*
-  Join two path components intelligently.
-  The return value is the concatenation of part1 and part2 with exactly one
-  directory separator (path.sep) following each non-empty part except the last,
-  meaning that the result will only end in a separator if the last part is
-  empty.
-  If a component is an absolute path, all previous components are thrown away
-  and joining continues from the absolute path component.
-*/
-std::string join(const std::string& part1, const std::string& part2);
-
-template <typename... Args>
-std::string join(const std::string& part1,
-                 const std::string& part2,
-                 Args... args) {
-  return join(join(part1, part2), args...);
-}
-
-}  // namespace path
-
-/**
- * A Checker for each invoke of method in same thread.
- */
-class SameThreadChecker {
-public:
-  SameThreadChecker() {}
-
-  /**
-   * Disable copy
-   */
-  SameThreadChecker(const SameThreadChecker& other) = delete;
-  SameThreadChecker& operator=(const SameThreadChecker& other) = delete;
-
-  /**
-   * Each invoke of check method should be in same thread, otherwise, it will
-   * failed and core dump.
-   */
-  void check() {
-    std::thread::id curThreadId = std::this_thread::get_id();
-    std::call_once(onceFlag_, [&] { invokeThreadId_ = curThreadId; });
-    CHECK_EQ(invokeThreadId_, curThreadId)
-        << "This method should invoke in "
-           "same thread, but first invoked in "
-        << invokeThreadId_ << " current invoked in " << curThreadId;
-  }
-
-private:
-  std::once_flag onceFlag_;
-  std::thread::id invokeThreadId_;
-};
-
-/**
- * Key-Value Cache Helper.
- *
- * It store a object instance global. User can invoke get method by key and a
- * object creator callback. If there is a instance stored in cache, then it will
- * return a shared_ptr of it, otherwise, it will invoke creator callback, create
- * a new instance store global, and return it.
- *
- * The cache instance will release when nobody hold a reference to it.
- *
- * The KType is the key type.
- * The VType is the value type.
- * The Hash is the key hasher object.
- */
-template <typename KType, typename VType, typename Hash>
-class WeakKVCache {
-public:
-  WeakKVCache() {}
-
-  std::shared_ptr<VType> get(const KType& key,
-                             const std::function<VType*()>& creator) {
-    std::lock_guard<std::mutex> guard(this->lock_);
-    auto it = this->storage_.find(key);
-    if (it != this->storage_.end()) {
-      auto& val = it->second;
-      auto retVal = val.lock();
-      if (retVal != nullptr) {
-        return retVal;
-      }  // else fall trough. Because it is WeakPtr Cache.
-    }
-    auto rawPtr = creator();
-    CHECK(rawPtr != nullptr);
-    std::shared_ptr<VType> retVal(rawPtr);
-    this->storage_[key] = retVal;
-    return retVal;
-  }
-
-private:
-  std::mutex lock_;
-  std::unordered_map<KType, std::weak_ptr<VType>, Hash> storage_;
-};
-
-/**
- * @brief The ScopedCallbacks class is a callback invoker when object is
- *        created and destroyed.
- */
-template <typename CallbackType, typename... Args>
-class ScopedCallbacks {
-public:
-  ScopedCallbacks(CallbackType enter, CallbackType exit, Args&... args)
-      : exit_(std::bind(exit, args...)) {
-    enter(args...);
-  }
-
-  ScopedCallbacks(const ScopedCallbacks& other) = delete;
-  ScopedCallbacks& operator=(const ScopedCallbacks& other) = delete;
-
-  ~ScopedCallbacks() { exit_(); }
-
-private:
-  std::function<void()> exit_;
-};
-
-/**
- * std compatible allocator with memory alignment.
- * @tparam T type of allocator elements.
- * @tparam Alignment the alignment in bytes.
- */
-template <typename T, size_t Alignment>
-class AlignedAllocator {
-public:
-  /// std campatible typedefs.
-  typedef T* pointer;
-  typedef const T* const_pointer;
-  typedef T& reference;
-  typedef const T& const_reference;
-  typedef T value_type;
-  typedef size_t size_type;
-  typedef ptrdiff_t difference_type;
-
-  T* address(T& r) const { return &r; }
-
-  const T* address(const T& r) const { return &r; }
-
-  size_t max_size() const {
-    return std::numeric_limits<size_t>::max() / sizeof(T);
-  }
-
-  template <typename U>
-  struct rebind {
-    typedef AlignedAllocator<U, Alignment> other;
-  };
-
-  bool operator==(const AlignedAllocator& other) const { return true; }
-
-  bool operator!=(const AlignedAllocator& other) const {
-    return !(*this == &other);
-  }
-
-  void construct(const T* p, const T& t) const {
-    void* pv = const_cast<T*>(p);
-    new (pv) T(t);
-  }
-
-  void deallocate(const T* p, const size_type n) const {
-    (void)(n);  // UNUSED n
-    free(const_cast<T*>(p));
-  }
-
-  void destroy(const T* p) const { p->~T(); }
-
-  AlignedAllocator() {}
-  ~AlignedAllocator() {}
-
-  AlignedAllocator(const AlignedAllocator&) {}
-  template <typename U>
-  AlignedAllocator(const AlignedAllocator<U, Alignment>&) {}
-
-  /**
-   * @brief allocate n elements of type T, the first address is aligned by
-   *        Alignment bytes.
-   * @param n element count.
-   * @return begin address of allocated buffer
-   * @throw std::length_error for n * sizeof(T) is overflowed.
-   * @throw std::bad_alloc
-   */
-  T* allocate(const size_type n) const {
-    if (n == 0) {
-      return nullptr;
-    }
-    if (n > max_size()) {
-      throw std::length_error("AlignAllocator<T>::allocate() - Int Overflow.");
-    }
-    void* r = nullptr;
-    CHECK_EQ(posix_memalign(&r, Alignment * 8, sizeof(T) * n), 0);
-    if (r == nullptr) {
-      throw std::bad_alloc();
-    } else {
-      return static_cast<T*>(r);
-    }
-  }
-
-  template <typename U>
-  T* allocate(const std::size_t n, const U* /* const hint */) const {
-    return this->allocate(n);
-  }
-
-private:
-  AlignedAllocator& operator=(const AlignedAllocator&);  // disable
-};
-
-class Deprecated {
-public:
-  explicit Deprecated(const std::string& msg = "") {
-    if (msg.empty()) {
-      LOG(WARNING) << "This class is deprecated, please do not use this class.";
-    } else {
-      LOG(WARNING) << msg;
-    }
-  }
-};
-
-}  // namespace paddle
diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp
deleted file mode 100644
index a4e6c8f7b8397adc262588612c250bac5ef5eaa6..0000000000000000000000000000000000000000
--- a/paddle/utils/arch/linux/Locks.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Locks.h"
-#include <semaphore.h>
-#include <unistd.h>
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-class SemaphorePrivate {
-public:
-  sem_t sem;
-};
-
-Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
-  sem_init(&m->sem, 0, initValue);
-}
-
-Semaphore::~Semaphore() {
-  sem_destroy(&m->sem);
-  delete m;
-}
-
-bool Semaphore::timeWait(struct timespec* ts) {
-  return (0 == sem_timedwait(&m->sem, ts));
-}
-
-void Semaphore::wait() { sem_wait(&m->sem); }
-
-void Semaphore::post() { sem_post(&m->sem); }
-
-/// SpinLockPrivate
-
-#ifdef PADDLE_USE_PTHREAD_SPINLOCK
-
-class SpinLockPrivate {
-public:
-  inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
-  inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
-
-  inline void lock() { pthread_spin_lock(&lock_); }
-  inline void unlock() { pthread_spin_unlock(&lock_); }
-
-  pthread_spinlock_t lock_;
-  char padding_[64 - sizeof(pthread_spinlock_t)];
-};
-
-#else
-// clang-format off
-#include <cstddef>
-#include <atomic>
-// clang-format on
-
-class SpinLockPrivate {
-public:
-  inline void lock() {
-    while (lock_.test_and_set(std::memory_order_acquire)) {
-    }
-  }
-  inline void unlock() { lock_.clear(std::memory_order_release); }
-
-  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
-  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
-};
-
-#endif
-
-SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
-SpinLock::~SpinLock() { delete m; }
-void SpinLock::lock() { m->lock(); }
-void SpinLock::unlock() { m->unlock(); }
-
-/// ThreadBarrierPrivate
-
-#ifdef PADDLE_USE_PTHREAD_BARRIER
-
-class ThreadBarrierPrivate {
-public:
-  pthread_barrier_t barrier_;
-
-  inline explicit ThreadBarrierPrivate(int count) {
-    pthread_barrier_init(&barrier_, nullptr, count);
-  }
-
-  inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); }
-
-  inline void wait() { pthread_barrier_wait(&barrier_); }
-};
-
-#else
-
-class ThreadBarrierPrivate {
-public:
-  pthread_mutex_t mutex_;
-  pthread_cond_t cond_;
-  int count_;
-  int tripCount_;
-
-  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
-    CHECK_NE(cnt, 0);
-    CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
-    CHECK_GE(pthread_cond_init(&cond_, 0), 0);
-  }
-
-  inline ~ThreadBarrierPrivate() {
-    pthread_cond_destroy(&cond_);
-    pthread_mutex_destroy(&mutex_);
-  }
-
-  /**
-   * @brief wait
-   * @return true if the last wait
-   */
-  inline bool wait() {
-    pthread_mutex_lock(&mutex_);
-    ++count_;
-    if (count_ >= tripCount_) {
-      count_ = 0;
-      pthread_cond_broadcast(&cond_);
-      pthread_mutex_unlock(&mutex_);
-      return true;
-    } else {
-      pthread_cond_wait(&cond_, &mutex_);
-      pthread_mutex_unlock(&mutex_);
-      return false;
-    }
-  }
-};
-
-#endif
-
-/// ThreadBarrier
-
-ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
-ThreadBarrier::~ThreadBarrier() { delete m; }
-void ThreadBarrier::wait() { m->wait(); }
-
-}  // namespace paddle
diff --git a/paddle/utils/arch/osx/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp
deleted file mode 100644
index ac444615786fa9f89f96504a31b2289eae7bb643..0000000000000000000000000000000000000000
--- a/paddle/utils/arch/osx/Excepts.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Excepts.h"
-
-#if defined(__APPLE__) || defined(__OSX__)
-#if defined(__arm__) || defined(__arm64__)
-// TODO(liuyiqun): implement the arm version
-int fegetexcept(void) { return -1; }
-int feenableexcept(unsigned int excepts) { return -1; }
-int fedisableexcept(unsigned int excepts) { return -1; }
-#else
-int fegetexcept(void) {
-  static fenv_t fenv;
-  return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
-}
-
-int feenableexcept(unsigned int excepts) {
-  static fenv_t fenv;
-  unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
-
-  if (fegetenv(&fenv)) return -1;
-  old_excepts = fenv.__control & FE_ALL_EXCEPT;
-
-  // unmask
-  fenv.__control &= ~new_excepts;
-  fenv.__mxcsr &= ~(new_excepts << 7);
-
-  return (fesetenv(&fenv) ? -1 : old_excepts);
-}
-
-int fedisableexcept(unsigned int excepts) {
-  static fenv_t fenv;
-  unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts;
-
-  if (fegetenv(&fenv)) return -1;
-  old_excepts = fenv.__control & FE_ALL_EXCEPT;
-
-  // mask
-  fenv.__control |= new_excepts;
-  fenv.__mxcsr |= new_excepts << 7;
-
-  return (fesetenv(&fenv) ? -1 : old_excepts);
-}
-#endif
-#endif
diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp
deleted file mode 100644
index e03992363fd6051a1970664d63406b2e7a47fce3..0000000000000000000000000000000000000000
--- a/paddle/utils/arch/osx/Locks.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Locks.h"
-#include <dispatch/dispatch.h>
-#include <libkern/OSAtomic.h>
-#include <atomic>
-#include "paddle/utils/Logging.h"
-
-namespace paddle {
-
-class SemaphorePrivate {
-public:
-  ~SemaphorePrivate() { dispatch_release(sem); }
-
-  dispatch_semaphore_t sem;
-};
-
-Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
-  m->sem = dispatch_semaphore_create(initValue);
-}
-
-Semaphore::~Semaphore() { delete m; }
-
-bool Semaphore::timeWait(timespec *ts) {
-  dispatch_time_t tm = dispatch_walltime(ts, 0);
-  return (0 == dispatch_semaphore_wait(m->sem, tm));
-}
-
-void Semaphore::wait() {
-  dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER);
-}
-
-void Semaphore::post() { dispatch_semaphore_signal(m->sem); }
-
-class SpinLockPrivate {
-public:
-  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
-  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
-};
-
-SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
-SpinLock::~SpinLock() { delete m; }
-
-void SpinLock::lock() {
-  while (m->lock_.test_and_set(std::memory_order_acquire)) {
-  }
-}
-
-void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); }
-
-class ThreadBarrierPrivate {
-public:
-  pthread_mutex_t mutex_;
-  pthread_cond_t cond_;
-  int count_;
-  int tripCount_;
-
-  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
-    CHECK_NE(cnt, 0);
-    CHECK_GE(pthread_mutex_init(&mutex_, 0), 0);
-    CHECK_GE(pthread_cond_init(&cond_, 0), 0);
-  }
-
-  inline ~ThreadBarrierPrivate() {
-    pthread_cond_destroy(&cond_);
-    pthread_mutex_destroy(&mutex_);
-  }
-
-  /**
-   * @brief wait
-   * @return true if the last wait
-   */
-  inline bool wait() {
-    pthread_mutex_lock(&mutex_);
-    ++count_;
-    if (count_ >= tripCount_) {
-      count_ = 0;
-      pthread_cond_broadcast(&cond_);
-      pthread_mutex_unlock(&mutex_);
-      return true;
-    } else {
-      pthread_cond_wait(&cond_, &mutex_);
-      pthread_mutex_unlock(&mutex_);
-      return false;
-    }
-  }
-};
-
-ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
-ThreadBarrier::~ThreadBarrier() { delete m; }
-void ThreadBarrier::wait() { m->wait(); }
-
-}  // namespace paddle
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
deleted file mode 100644
index c770ce169878d9998e559b1d417fc1acc88cde97..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/CMakeLists.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-add_simple_unittest(test_Thread)
-add_simple_unittest(test_StringUtils)
-add_simple_unittest(test_CustomStackTrace)
-add_simple_unittest(test_ThreadBarrier)
-add_simple_unittest(test_SpinLock)
-add_simple_unittest(test_SIMDFlags)
-add_simple_unittest(test_Error)
-
-add_executable(
-    test_CustomStackTracePrint
-    test_CustomStackTracePrint.cpp
-)
-link_paddle_exe(test_CustomStackTracePrint)
-if(NOT APPLE)
-    add_test(NAME test_CustomStackTracePrint
-        COMMAND ${PADDLE_SOURCE_DIR}/paddle/utils/tests/test_CustomStackTracePrint.sh
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-endif()
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
deleted file mode 100644
index c320074fbadab3e211ed72ce715d595c90673d6d..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <chrono>
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-#include "paddle/utils/CustomStackTrace.h"
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-DEFINE_int32(test_thread_num, 10, "testing thread number");
-
-void testNormalImpl(
-    const std::function<void(paddle::CustomStackTrace<std::string>&,
-                             size_t,
-                             size_t,
-                             paddle::ThreadBarrier&,
-                             paddle::ThreadBarrier&)>& callback) {
-  paddle::CustomStackTrace<std::string> tracer;
-  paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1);
-  paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1);
-  constexpr size_t countDown = 10;
-  constexpr size_t layerSize = 1000;
-  std::vector<std::unique_ptr<std::thread>> threads;
-  threads.reserve(FLAGS_test_thread_num);
-
-  for (int32_t i = 0; i < FLAGS_test_thread_num; ++i) {
-    threads.emplace_back(new std::thread([&tracer,
-                                          &countDown,
-                                          &layerSize,
-                                          &startBarrier,
-                                          &doneBarrier,
-                                          &callback] {
-      callback(tracer, countDown, layerSize, startBarrier, doneBarrier);
-    }));
-  }
-  size_t cntDown = countDown;
-  while (cntDown-- > 0) {
-    startBarrier.wait();
-    sleep(1);
-    doneBarrier.wait();
-    ASSERT_TRUE(tracer.empty());
-  }
-
-  for (auto& thread : threads) {
-    thread->join();
-  }
-}
-
-TEST(CustomStackTrace, normalTrain) {
-  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
-                    size_t countDown,
-                    size_t layerSize,
-                    paddle::ThreadBarrier& start,
-                    paddle::ThreadBarrier& finish) {
-    while (countDown-- > 0) {
-      start.wait();
-      for (size_t i = 0; i < layerSize; ++i) {
-        tracer.push("layer_" + paddle::str::to_string(i));
-      }
-      for (size_t i = 0; i < layerSize; ++i) {
-        tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
-      }
-      finish.wait();
-    }
-  });
-}
-
-TEST(CustomStackTrace, normalTest) {
-  testNormalImpl([](paddle::CustomStackTrace<std::string>& tracer,
-                    size_t countDown,
-                    size_t layerSize,
-                    paddle::ThreadBarrier& start,
-                    paddle::ThreadBarrier& finish) {
-    while (countDown-- > 0) {
-      start.wait();
-      for (size_t i = 0; i < layerSize; ++i) {
-        tracer.push("layer_" + paddle::str::to_string(i));
-      }
-      tracer.clear();  // in forward test, tracer will clear after forward.
-      finish.wait();
-    }
-  });
-}
diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp
deleted file mode 100644
index 360c61c88a757da708b01d2bb54068b948b235cc..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_CustomStackTracePrint.cpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/CustomStackTrace.h"
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-
-  for (size_t i = 0; i < 1000; ++i) {
-    paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i));
-    if (i == 998) {
-      throw "Unhandle exception";
-    }
-  }
-
-  return 0;
-}
diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp
deleted file mode 100644
index 6f311fa6b80191de1e11ce1f63c31b64fe2eeb80..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_Error.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/Error.h"
-
-#include <gtest/gtest.h>
-
-TEST(Error, testAll) {
-  paddle::Error error;
-  ASSERT_TRUE(error.isOK());
-  error = paddle::Error("I'm the error");
-  ASSERT_FALSE(error.isOK());
-  ASSERT_STREQ("I'm the error", error.msg());
-
-  error = paddle::Error("error2");
-  ASSERT_FALSE(error.isOK());
-  ASSERT_STREQ("error2", error.msg());
-
-  int i = 3;
-  auto error3 = paddle::Error("error%d", i);
-  ASSERT_FALSE(error3.isOK());
-  ASSERT_STREQ("error3", error3.msg());
-}
diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp
deleted file mode 100644
index a808d456a69866f72502bcf1ae244cec14738e22..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-
-#include "paddle/utils/CpuId.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-
-using namespace paddle;  // NOLINT
-
-TEST(SIMDFlags, gccTest) {
-#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
-    !defined(__arm__) && !defined(__aarch64__)
-  // clang-format off
-  CHECK(!__builtin_cpu_supports("sse")    != HAS_SSE);
-  CHECK(!__builtin_cpu_supports("sse2")   != HAS_SSE2);
-  CHECK(!__builtin_cpu_supports("sse3")   != HAS_SSE3);
-  CHECK(!__builtin_cpu_supports("ssse3")  != HAS_SSSE3);
-  CHECK(!__builtin_cpu_supports("sse4.1") != HAS_SSE41);
-  CHECK(!__builtin_cpu_supports("sse4.2") != HAS_SSE42);
-  CHECK(!__builtin_cpu_supports("avx")    != HAS_AVX);
-  CHECK(!__builtin_cpu_supports("avx2")   != HAS_AVX2);
-// clang-format on
-#endif
-}
-
-TEST(SIMDFlags, normalPrint) {
-  LOG(INFO) << "Has SSE:     " << std::boolalpha << HAS_SSE;
-  LOG(INFO) << "Has SSE2:    " << std::boolalpha << HAS_SSE2;
-  LOG(INFO) << "Has SSE3:    " << std::boolalpha << HAS_SSE3;
-  LOG(INFO) << "Has SSSE3:   " << std::boolalpha << HAS_SSSE3;
-  LOG(INFO) << "Has SSE4:    " << std::boolalpha << HAS_SSE41 || HAS_SSE42;
-  LOG(INFO) << "Has FMA3:    " << std::boolalpha << HAS_FMA3;
-  LOG(INFO) << "Has FMA4:    " << std::boolalpha << HAS_FMA4;
-  LOG(INFO) << "Has AVX:     " << std::boolalpha << HAS_AVX;
-  LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
-  LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
-  LOG(INFO) << "Has NEON:    " << std::boolalpha << HAS_NEON;
-}
diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp
deleted file mode 100644
index cc34eb1f868003d3db9221578c0c20c44be285eb..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <vector>
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-
-DEFINE_int32(test_thread_num, 100, "testing thread number");
-
-void testNormalImpl(
-    size_t thread_num,
-    const std::function<void(size_t, size_t&, paddle::SpinLock&)>& callback) {
-  paddle::SpinLock mutex;
-  std::vector<std::thread> threads;
-  threads.reserve(thread_num);
-
-  size_t count = 0;
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads.emplace_back([&thread_num, &count, &mutex, &callback] {
-      callback(thread_num, count, mutex);
-    });
-  }
-  for (auto& thread : threads) {
-    thread.join();
-  }
-  // Check whether all threads reach this point or not
-  CHECK_EQ(count, thread_num);
-}
-
-TEST(ThreadSpinLock, normalTest) {
-  for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) {
-    testNormalImpl(
-        thread_num,
-        [](size_t thread_num, size_t& count, paddle::SpinLock& mutex) {
-          std::lock_guard<paddle::SpinLock> lock(mutex);
-          ++count;
-        });
-  }
-}
diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp
deleted file mode 100644
index 248f58a7f26e26e82b55110930964cee04fb558b..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_StringUtils.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/StringUtil.h"
-
-#include <gtest/gtest.h>
-
-TEST(StringUtil, to) {
-  ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
-  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
-  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
-}
diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp
deleted file mode 100644
index 6e2580c4913f0adc7ba1e63c9cebce308775aac6..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_Thread.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/Thread.h>
-#include <atomic>
-
-using paddle::AsyncThreadPool;  // NOLINT
-
-TEST(AsyncThreadPool, addJob) {
-  AsyncThreadPool pool(8);
-  auto a = pool.addJob([] { return 1; });
-  auto b = pool.addJob([] { return true; });
-  auto c = pool.addJob([] { return false; });
-
-  ASSERT_EQ(a.get(), 1);
-  ASSERT_TRUE(b.get());
-  ASSERT_FALSE(c.get());
-}
-
-TEST(AsyncThreadPool, addBatchJob) {
-  AsyncThreadPool pool(8);
-  std::atomic<int> counter{0};
-
-  std::vector<AsyncThreadPool::JobFunc> jobs;
-
-  for (int i = 0; i < 10000; i++) {
-    jobs.emplace_back([&] { counter++; });
-  }
-
-  pool.addBatchJobs(jobs);
-
-  ASSERT_EQ(counter, 10000);
-}
-
-TEST(AsyncThreadPool, multiThreadAddBatchJob) {
-  AsyncThreadPool levelOnePool(200);
-  AsyncThreadPool levelTwoPool(200);
-
-  std::shared_ptr<std::mutex> mut = std::make_shared<std::mutex>();
-  int counter = 0;
-  const int numMonitors = 300;
-  const int numSlaves = 300;
-  std::vector<AsyncThreadPool::JobFunc> moniterJobs(numMonitors, [&] {
-    std::vector<AsyncThreadPool::JobFunc> slaveJobs(numSlaves, [mut, &counter] {
-      std::lock_guard<std::mutex> lk(*mut);
-      counter++;
-    });
-    levelTwoPool.addBatchJobs(slaveJobs);
-  });
-  levelOnePool.addBatchJobs(moniterJobs);
-  ASSERT_EQ(counter, numMonitors * numSlaves);
-}
-
-TEST(AsyncThreadPool, addBatchJobWithResults) {
-  AsyncThreadPool pool(100);
-
-  std::vector<std::function<int()>> jobs;
-  const int numJobs = 100;
-  for (int i = 0; i < numJobs; i++) {
-    jobs.emplace_back([i] { return i; });
-  }
-
-  std::vector<int> res;
-  pool.addBatchJobs(jobs, res);
-
-  for (int i = 0; i < numJobs; i++) {
-    ASSERT_EQ(res[i], i);
-  }
-}
diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp
deleted file mode 100644
index 554b1c1d4adce7a0196b304281dcf878a0b6426e..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <set>
-#include <vector>
-
-#include <gflags/gflags.h>
-#include <gtest/gtest.h>
-
-#include "paddle/utils/Locks.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-
-DEFINE_int32(test_thread_num, 100, "testing thread number");
-
-void testNormalImpl(
-    size_t thread_num,
-    const std::function<void(size_t,
-                             std::mutex&,
-                             std::set<std::thread::id>&,
-                             paddle::ThreadBarrier&)>& callback) {
-  std::mutex mutex;
-  std::set<std::thread::id> tids;
-  paddle::ThreadBarrier barrier(thread_num);
-
-  std::vector<std::thread> threads;
-  threads.reserve(thread_num);
-  for (size_t i = 0; i < thread_num; ++i) {
-    threads.emplace_back([&thread_num, &mutex, &tids, &barrier, &callback] {
-      callback(thread_num, mutex, tids, barrier);
-    });
-  }
-
-  for (auto& thread : threads) {
-    thread.join();
-  }
-}
-
-TEST(ThreadBarrier, normalTest) {
-  for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) {
-    testNormalImpl(thread_num,
-                   [](size_t thread_num,
-                      std::mutex& mutex,
-                      std::set<std::thread::id>& tids,
-                      paddle::ThreadBarrier& barrier) {
-                     {
-                       std::lock_guard<std::mutex> guard(mutex);
-                       tids.insert(std::this_thread::get_id());
-                     }
-                     barrier.wait();
-                     // Check whether all threads reach this point or not
-                     CHECK_EQ(tids.size(), thread_num);
-                   });
-  }
-}
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 556bcd1d7e60c27fece43de666e9531ab4203414..a075eeb83bda64133920f9ab0275eb6c0e0fb8c4 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -15,13 +15,14 @@ foreach(filename ${proto_filenames})
     get_filename_component(ABS_FIL ${filename} ABSOLUTE)
     get_filename_component(FIL_WE ${filename} NAME_WE)
     set(CUR_PROTO_GEN_PY
-            ${PADDLE_SOURCE_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
+            ${PADDLE_BINARY_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
     set(PROTO_GEN_PY
             ${CUR_PROTO_GEN_PY}
             ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
+            COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/proto
             COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-            ARGS "--python_out=${PADDLE_SOURCE_DIR}/python/paddle/proto"
+            ARGS "--python_out=${PADDLE_BINARY_DIR}/python/paddle/proto"
             "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
             DEPENDS ${ABS_FIL} protoc)
 endforeach()
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index d699984ff2d3345fc91b1c4080d7f16af1366a39..d78ee9c9f39ed09825dffdfa0a442c0ffac5958f 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -323,6 +323,16 @@ message ClipConfig {
   required double max = 2;
 }
 
+message UpsampleConfig {
+  required ImageConfig image_conf = 1;
+  optional uint32 scale = 2 [ default = 2 ];
+  optional uint32 scale_y = 3 [ default = 2 ];
+  optional bool pad_out_x = 4 [ default = false ];
+  optional bool pad_out_y = 5 [ default = false ];
+  optional uint32 upsample_size = 6;
+  optional uint32 upsample_size_y = 7;
+}
+
 message ROIPoolConfig {
   required uint32 pooled_width = 1;
   required uint32 pooled_height = 2;
@@ -359,6 +369,7 @@ message LayerInputConfig {
   optional ClipConfig clip_conf = 18;
   optional ScaleSubRegionConfig scale_sub_region_conf = 19;
   optional ROIPoolConfig roi_pool_conf = 20;
+  optional UpsampleConfig upsample_conf = 21;
 }
 
 message LayerConfig {
diff --git a/proto/README.md b/proto/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dda7ed7b3c8ea4b541eaafbd0fd239eea789b40e
--- /dev/null
+++ b/proto/README.md
@@ -0,0 +1,3 @@
+## protos in this folder are legacy v2 protos.
+
+## Please refer to paddle/fluid for latest version.
diff --git a/python/.gitignore b/python/.gitignore
index 1ba1d4c9b0301ed920f5303089e75dd3a8e4e3fa..53a2b7a76b0dd2d9095f9582540e455e2c1174e2 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -1,6 +1,7 @@
 *pyc
 build
 dist
+paddlepaddle.egg-info
 paddle.egg-info
 paddlepaddle_gpu.egg-info
 .idea
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 6e24cbdd3f6a4f05c1691dc643d880f6f454429d..797c0fbcc4a2d61f5cbbf691db19b4cba5d38630 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,27 +1,29 @@
-
-file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
-file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
-file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
-file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
-file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/ *.py)
-
+file(GLOB UTILS_PY_FILES . ./paddle/legacy/utils/*.py)
+file(GLOB_RECURSE FLUID_PY_FILES ./paddle/fluid/*.py)
 set(PY_FILES paddle/__init__.py
-  ${TRAINER_PY_FILES}
-  ${HELPERS_PY_FILES}
   ${UTILS_PY_FILES}
-  ${V2_PY_FILES}
   ${FLUID_PY_FILES})
 
-add_custom_target(copy_paddle_master)
+if(NOT WITH_FLUID_ONLY)
+  file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
+  file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
+  file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/*.py)
+  set(PY_FILES ${PY_FILES}
+    ${TRAINER_PY_FILES}
+    ${HELPERS_PY_FILES}
+    ${V2_PY_FILES})
 
-SET(COPY_PADDLE_MASTER "")
-if(WITH_GOLANG)
-  SET(COPY_PADDLE_MASTER "copy_paddle_master")
-  add_custom_command(TARGET ${COPY_PADDLE_MASTER}
-    COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
-    )
-  add_dependencies(copy_paddle_master paddle_master)
-endif(WITH_GOLANG)
+  add_custom_target(copy_paddle_master)
+
+  SET(COPY_PADDLE_MASTER "")
+  if(WITH_GOLANG)
+    SET(COPY_PADDLE_MASTER "copy_paddle_master")
+    add_custom_command(TARGET ${COPY_PADDLE_MASTER}
+      COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
+      )
+    add_dependencies(copy_paddle_master paddle_master)
+  endif(WITH_GOLANG)
+endif()
 
 set(MKL_SHARED_LIBS "")
 set(MKL_DEPENDS "")
@@ -44,38 +46,47 @@ endif()
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
-
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so
+set(FLUID_CORE ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so)
+add_custom_command(OUTPUT ${FLUID_CORE}
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${FLUID_CORE}
         DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/fluid/core.so)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE})
 
 
 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND touch stub.cc
+    COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python
+    COMMAND cp -r ${PADDLE_SOURCE_DIR}/paddle/py_paddle ${PADDLE_BINARY_DIR}/python/
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+    DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
-set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
-if(WITH_SWIG_PY)
-    list(APPEND paddle_python_deps python_api_wheel)
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
+if(NOT WITH_FLUID_ONLY)
+    set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model)
+    if(WITH_SWIG_PY)
+        list(APPEND paddle_python_deps python_api_wheel)
+    endif()
 endif()
 add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
 if (WITH_TESTING)
-  add_subdirectory(paddle/trainer_config_helpers/tests)
-  if (WITH_SWIG_PY)
-    # enable v2 API unittest only when paddle swig api is compiled
-    add_subdirectory(paddle/v2/tests)
-    add_subdirectory(paddle/v2/reader/tests)
-    add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/fluid/tests)
+  add_subdirectory(paddle/reader/tests)
+  add_subdirectory(paddle/dataset/tests)
+  if(NOT WITH_FLUID_ONLY)
+    add_subdirectory(paddle/trainer_config_helpers/tests)
+    if (WITH_SWIG_PY)
+      # enable v2 API unittest only when paddle swig api is compiled
+      add_subdirectory(paddle/v2/tests)
+      add_subdirectory(paddle/v2/plot/tests)
+      add_subdirectory(paddle/v2/reader/tests)
+    endif()
   endif()
+  add_subdirectory(paddle/fluid/tests)
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/.gitignore b/python/paddle/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..98527864664d32f798edc06a53131e8d5a068295
--- /dev/null
+++ b/python/paddle/.gitignore
@@ -0,0 +1 @@
+version.py
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 1030c94e16376c326cb8b32926b8c47625cd38f0..d1cf04161ae4444ebc7da7fbc20e37dafe6c0fb1 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -14,8 +14,14 @@
 try:
     from version import full_version as __version__
     from version import commit as __git_commit__
+
 except ImportError:
     import sys
     sys.stderr.write('''Warning with import paddle: you should not 
      import paddle from the source directory; please install paddlepaddle*.whl firstly.'''
                      )
+
+import reader
+import dataset
+import batch
+batch = batch.batch
diff --git a/python/paddle/batch.py b/python/paddle/batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c6a53db3c2287e8ef5931a06ca5dad455665ee0
--- /dev/null
+++ b/python/paddle/batch.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['batch']
+
+
+def batch(reader, batch_size, drop_last=True):
+    """
+    Create a batched reader.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param batch_size: size of each mini-batch
+    :type batch_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
+    :return: the batched reader.
+    :rtype: callable
+    """
+
+    def batch_reader():
+        r = reader()
+        b = []
+        for instance in r:
+            b.append(instance)
+            if len(b) == batch_size:
+                yield b
+                b = []
+        if drop_last == False and len(b) != 0:
+            yield b
+
+    return batch_reader
diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3315e826e82a33dfeb9c5223ce196cffb1ae7234
--- /dev/null
+++ b/python/paddle/dataset/__init__.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Dataset package.
+"""
+
+import mnist
+import imikolov
+import imdb
+import cifar
+import movielens
+import conll05
+import uci_housing
+import sentiment
+import wmt14
+import wmt16
+import mq2007
+import flowers
+import voc2012
+import image
+
+__all__ = [
+    'mnist',
+    'imikolov',
+    'imdb',
+    'cifar',
+    'movielens',
+    'conll05',
+    'sentiment',
+    'uci_housing',
+    'wmt14',
+    'wmt16',
+    'mq2007',
+    'flowers',
+    'voc2012',
+    'image',
+]
diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..07f4dcbdab2fecf84a0a7042a48a8c8a9e5f880d
--- /dev/null
+++ b/python/paddle/dataset/cifar.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset.
+
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
+
+"""
+
+import cPickle
+import itertools
+import numpy
+import paddle.dataset.common
+import tarfile
+
+__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+
+def reader_creator(filename, sub_name):
+    def read_batch(batch):
+        data = batch['data']
+        labels = batch.get('labels', batch.get('fine_labels', None))
+        assert labels is not None
+        for sample, label in itertools.izip(data, labels):
+            yield (sample / 255.0).astype(numpy.float32), int(label)
+
+    def reader():
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
+
+            for name in names:
+                batch = cPickle.load(f.extractfile(name))
+                for item in read_batch(batch):
+                    yield item
+
+    return reader
+
+
+def train100():
+    """
+    CIFAR-100 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 99].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'train')
+
+
+def test100():
+    """
+    CIFAR-100 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'test')
+
+
+def train10():
+    """
+    CIFAR-10 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch')
+
+
+def test10():
+    """
+    CIFAR-10 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'test_batch')
+
+
+def fetch():
+    paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train100(), 1000, "cifar_train100")
+    paddle.dataset.common.convert(path, test100(), 1000, "cifar_test100")
+    paddle.dataset.common.convert(path, train10(), 1000, "cifar_train10")
+    paddle.dataset.common.convert(path, test10(), 1000, "cifar_test10")
diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..68660601c161d2332b17b448fae089506238ba78
--- /dev/null
+++ b/python/paddle/dataset/common.py
@@ -0,0 +1,236 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import hashlib
+import os
+import errno
+import shutil
+import sys
+import importlib
+import paddle.dataset
+import cPickle
+import glob
+import cPickle as pickle
+
+__all__ = [
+    'DATA_HOME',
+    'download',
+    'md5file',
+    'split',
+    'cluster_files_reader',
+    'convert',
+]
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
+
+
+# When running unit tests, there could be multiple processes that
+# trying to create DATA_HOME directory simultaneously, so we cannot
+# use a if condition to check for the existence of the directory;
+# instead, we use the filesystem as the synchronization mechanism by
+# catching returned errors.
+def must_mkdirs(path):
+    try:
+        os.makedirs(DATA_HOME)
+    except OSError as exc:
+        if exc.errno != errno.EEXIST:
+            raise
+        pass
+
+
+must_mkdirs(DATA_HOME)
+
+
+def md5file(fname):
+    hash_md5 = hashlib.md5()
+    f = open(fname, "rb")
+    for chunk in iter(lambda: f.read(4096), b""):
+        hash_md5.update(chunk)
+    f.close()
+    return hash_md5.hexdigest()
+
+
+def download(url, module_name, md5sum, save_name=None):
+    dirname = os.path.join(DATA_HOME, module_name)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+    filename = os.path.join(dirname,
+                            url.split('/')[-1]
+                            if save_name is None else save_name)
+
+    retry = 0
+    retry_limit = 3
+    while not (os.path.exists(filename) and md5file(filename) == md5sum):
+        if os.path.exists(filename):
+            print "file md5", md5file(filename), md5sum
+        if retry < retry_limit:
+            retry += 1
+        else:
+            raise RuntimeError("Cannot download {0} within retry limit {1}".
+                               format(url, retry_limit))
+        print "Cache file %s not found, downloading %s" % (filename, url)
+        r = requests.get(url, stream=True)
+        total_length = r.headers.get('content-length')
+
+        if total_length is None:
+            with open(filename, 'w') as f:
+                shutil.copyfileobj(r.raw, f)
+        else:
+            with open(filename, 'w') as f:
+                dl = 0
+                total_length = int(total_length)
+                for data in r.iter_content(chunk_size=4096):
+                    dl += len(data)
+                    f.write(data)
+                    done = int(50 * dl / total_length)
+                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                                                   ' ' * (50 - done)))
+                    sys.stdout.flush()
+
+    return filename
+
+
+def fetch_all():
+    for module_name in filter(lambda x: not x.startswith("__"),
+                              dir(paddle.dataset)):
+        if "fetch" in dir(
+                importlib.import_module("paddle.dataset.%s" % module_name)):
+            getattr(
+                importlib.import_module("paddle.dataset.%s" % module_name),
+                "fetch")()
+
+
+def fetch_all_recordio(path):
+    for module_name in filter(lambda x: not x.startswith("__"),
+                              dir(paddle.dataset)):
+        if "convert" in dir(
+                importlib.import_module("paddle.dataset.%s" % module_name)) and \
+                not module_name == "common":
+            ds_path = os.path.join(path, module_name)
+            must_mkdirs(ds_path)
+            getattr(
+                importlib.import_module("paddle.dataset.%s" % module_name),
+                "convert")(ds_path)
+
+
+def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
+    """
+    you can call the function as:
+
+    split(paddle.dataset.cifar.train10(), line_count=1000,
+        suffix="imikolov-train-%05d.pickle")
+
+    the output files as:
+
+    |-imikolov-train-00000.pickle
+    |-imikolov-train-00001.pickle
+    |- ...
+    |-imikolov-train-00480.pickle
+
+    :param reader: is a reader creator
+    :param line_count: line count for each file
+    :param suffix: the suffix for the output files, should contain "%d"
+                means the id for each file. Default is "%05d.pickle"
+    :param dumper: is a callable function that dump object to file, this
+                function will be called as dumper(obj, f) and obj is the object
+                will be dumped, f is a file object. Default is cPickle.dump.
+    """
+    if not callable(dumper):
+        raise TypeError("dumper should be callable.")
+    lines = []
+    indx_f = 0
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if i >= line_count and i % line_count == 0:
+            with open(suffix % indx_f, "w") as f:
+                dumper(lines, f)
+                lines = []
+                indx_f += 1
+    if lines:
+        with open(suffix % indx_f, "w") as f:
+            dumper(lines, f)
+
+
+def cluster_files_reader(files_pattern,
+                         trainer_count,
+                         trainer_id,
+                         loader=cPickle.load):
+    """
+    Create a reader that yield element from the given files, select
+    a file set according trainer count and trainer_id
+
+    :param files_pattern: the files which generating by split(...)
+    :param trainer_count: total trainer count
+    :param trainer_id: the trainer rank id
+    :param loader: is a callable function that load object from file, this
+                function will be called as loader(f) and f is a file object.
+                Default is cPickle.load
+    """
+
+    def reader():
+        if not callable(loader):
+            raise TypeError("loader should be callable.")
+        file_list = glob.glob(files_pattern)
+        file_list.sort()
+        my_file_list = []
+        for idx, fn in enumerate(file_list):
+            if idx % trainer_count == trainer_id:
+                print "append file: %s" % fn
+                my_file_list.append(fn)
+        for fn in my_file_list:
+            with open(fn, "r") as f:
+                lines = loader(f)
+                for line in lines:
+                    yield line
+
+    return reader
+
+
+def convert(output_path, reader, line_count, name_prefix):
+    import recordio
+    """
+    Convert data from reader to recordio format files.
+
+    :param output_path: directory in which output files will be saved.
+    :param reader: a data reader, from which the convert program will read
+                   data instances.
+    :param name_prefix: the name prefix of generated files.
+    :param max_lines_to_shuffle: the max lines numbers to shuffle before
+                                 writing.
+    """
+
+    assert line_count >= 1
+    indx_f = 0
+
+    def write_data(indx_f, lines):
+        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
+        writer = recordio.writer(filename)
+        for l in lines:
+            # FIXME(Yancey1989):
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            writer.write(cPickle.dumps(l))
+        writer.close()
+
+    lines = []
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if i % line_count == 0 and i >= line_count:
+            write_data(indx_f, lines)
+            lines = []
+            indx_f += 1
+            continue
+
+    write_data(indx_f, lines)
diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e94ce89892f8e6822c15fdc510805e75dfca988
--- /dev/null
+++ b/python/paddle/dataset/conll05.py
@@ -0,0 +1,254 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Conll05 dataset.
+Paddle semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not free in public, the default downloaded URL is test set
+of Conll05 (which is public). Users can change URL and MD5 to their Conll
+dataset. And a pre-trained word vector model based on Wikipedia corpus is used
+to initialize SRL model.
+"""
+
+import tarfile
+import gzip
+import itertools
+import paddle.dataset.common
+
+__all__ = ['test, get_dict', 'get_embedding', 'convert']
+
+DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
+DATA_MD5 = '387719152ae52d60422c016e92a742fc'
+WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
+WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
+VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
+VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
+TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
+TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
+EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
+EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
+
+UNK_IDX = 0
+
+
+def load_label_dict(filename):
+    d = dict()
+    tag_dict = set()
+    with open(filename, 'r') as f:
+        for i, line in enumerate(f):
+            line = line.strip()
+            if line.startswith("B-"):
+                tag_dict.add(line[2:])
+            elif line.startswith("I-"):
+                tag_dict.add(line[2:])
+        index = 0
+        for tag in tag_dict:
+            d["B-" + tag] = index
+            index += 1
+            d["I-" + tag] = index
+            index += 1
+        d["O"] = index
+    return d
+
+
+def load_dict(filename):
+    d = dict()
+    with open(filename, 'r') as f:
+        for i, line in enumerate(f):
+            d[line.strip()] = i
+    return d
+
+
+def corpus_reader(data_path, words_name, props_name):
+    """
+    Read one corpus. It returns an iterator. Each element of
+    this iterator is a tuple including sentence and labels. The sentence is
+    consist of a list of word IDs. The labels include a list of label IDs.
+    :return: a iterator of data.
+    :rtype: iterator
+    """
+
+    def reader():
+        tf = tarfile.open(data_path)
+        wf = tf.extractfile(words_name)
+        pf = tf.extractfile(props_name)
+        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
+                fileobj=pf) as props_file:
+            sentences = []
+            labels = []
+            one_seg = []
+            for word, label in itertools.izip(words_file, props_file):
+                word = word.strip()
+                label = label.strip().split()
+
+                if len(label) == 0:  # end of sentence
+                    for i in xrange(len(one_seg[0])):
+                        a_kind_lable = [x[i] for x in one_seg]
+                        labels.append(a_kind_lable)
+
+                    if len(labels) >= 1:
+                        verb_list = []
+                        for x in labels[0]:
+                            if x != '-':
+                                verb_list.append(x)
+
+                        for i, lbl in enumerate(labels[1:]):
+                            cur_tag = 'O'
+                            is_in_bracket = False
+                            lbl_seq = []
+                            verb_word = ''
+                            for l in lbl:
+                                if l == '*' and is_in_bracket == False:
+                                    lbl_seq.append('O')
+                                elif l == '*' and is_in_bracket == True:
+                                    lbl_seq.append('I-' + cur_tag)
+                                elif l == '*)':
+                                    lbl_seq.append('I-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') != -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') == -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = True
+                                else:
+                                    raise RuntimeError('Unexpected label: %s' %
+                                                       l)
+
+                            yield sentences, verb_list[i], lbl_seq
+
+                    sentences = []
+                    labels = []
+                    one_seg = []
+                else:
+                    sentences.append(word)
+                    one_seg.append(label)
+
+        pf.close()
+        wf.close()
+        tf.close()
+
+    return reader
+
+
+def reader_creator(corpus_reader,
+                   word_dict=None,
+                   predicate_dict=None,
+                   label_dict=None):
+    def reader():
+        for sentence, predicate, labels in corpus_reader():
+
+            sen_len = len(sentence)
+
+            verb_index = labels.index('B-V')
+            mark = [0] * len(labels)
+            if verb_index > 0:
+                mark[verb_index - 1] = 1
+                ctx_n1 = sentence[verb_index - 1]
+            else:
+                ctx_n1 = 'bos'
+
+            if verb_index > 1:
+                mark[verb_index - 2] = 1
+                ctx_n2 = sentence[verb_index - 2]
+            else:
+                ctx_n2 = 'bos'
+
+            mark[verb_index] = 1
+            ctx_0 = sentence[verb_index]
+
+            if verb_index < len(labels) - 1:
+                mark[verb_index + 1] = 1
+                ctx_p1 = sentence[verb_index + 1]
+            else:
+                ctx_p1 = 'eos'
+
+            if verb_index < len(labels) - 2:
+                mark[verb_index + 2] = 1
+                ctx_p2 = sentence[verb_index + 2]
+            else:
+                ctx_p2 = 'eos'
+
+            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
+
+            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+            pred_idx = [predicate_dict.get(predicate)] * sen_len
+            label_idx = [label_dict.get(w) for w in labels]
+
+            yield word_idx, ctx_n2_idx, ctx_n1_idx, \
+              ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
+
+    return reader
+
+
+def get_dict():
+    """
+    Get the word, verb and label dictionary of Wikipedia corpus.
+    """
+    word_dict = load_dict(
+        paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
+    verb_dict = load_dict(
+        paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
+    label_dict = load_label_dict(
+        paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
+    return word_dict, verb_dict, label_dict
+
+
+def get_embedding():
+    """
+    Get the trained word vector based on Wikipedia corpus.
+    """
+    return paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+
+
+def test():
+    """
+    Conll05 test set creator.
+
+    Because the training dataset is not free, the test dataset is used for
+    training. It returns a reader creator, each sample in the reader is nine
+    features, including sentence sequence, predicate, predicate context,
+    predicate context flag and tagged sequence.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    word_dict, verb_dict, label_dict = get_dict()
+    reader = corpus_reader(
+        paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
+        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
+        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
+    return reader_creator(reader, word_dict, verb_dict, label_dict)
+
+
+def fetch():
+    paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+    paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, test(), 1000, "conl105_train")
+    paddle.dataset.common.convert(path, test(), 1000, "conl105_test")
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
new file mode 100644
index 0000000000000000000000000000000000000000..527044b415533cc640e3cfc5837c08ab0f8b74b1
--- /dev/null
+++ b/python/paddle/dataset/flowers.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module will download dataset from
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
+and parse train/test set intopaddle reader creators.
+
+This set contains images of flowers belonging to 102 different categories.
+The images were acquired by searching the web and taking pictures. There are a
+minimum of 40 images for each category.
+
+The database was used in:
+
+Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
+ number of classes.Proceedings of the Indian Conference on Computer Vision,
+Graphics and Image Processing (2008)
+http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
+
+"""
+import cPickle
+import itertools
+import functools
+from common import download
+import tarfile
+import scipy.io as scio
+from paddle.dataset.image import *
+from paddle.reader import *
+import os
+import numpy as np
+from multiprocessing import cpu_count
+__all__ = ['train', 'test', 'valid']
+
+DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
+LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
+SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
+DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
+LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
+SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+# In official 'readme', tstid is the flag of test data
+# and trnid is the flag of train data. But test data is more than train data.
+# So we exchange the train data and test data.
+TRAIN_FLAG = 'tstid'
+TEST_FLAG = 'trnid'
+VALID_FLAG = 'valid'
+
+
+def default_mapper(is_train, sample):
+    '''
+    map image bytes data to type needed by model input layer
+    '''
+    img, label = sample
+    img = load_image_bytes(img)
+    img = simple_transform(
+        img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
+    return img.flatten().astype('float32'), label
+
+
+train_mapper = functools.partial(default_mapper, True)
+test_mapper = functools.partial(default_mapper, False)
+
+
+def reader_creator(data_file,
+                   label_file,
+                   setid_file,
+                   dataset_name,
+                   mapper,
+                   buffered_size=1024,
+                   use_xmap=True):
+    '''
+    1. read images from tar file and
+        merge images into batch files in 102flowers.tgz_batch/
+    2. get a reader to read sample from batch file
+
+    :param data_file: downloaded data file
+    :type data_file: string
+    :param label_file: downloaded label file
+    :type label_file: string
+    :param setid_file: downloaded setid file containing information
+                        about how to split dataset
+    :type setid_file: string
+    :param dataset_name: data set name (tstid|trnid|valid)
+    :type dataset_name: string
+    :param mapper: a function to map image bytes data to type
+                    needed by model input layer
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: data reader
+    :rtype: callable
+    '''
+    labels = scio.loadmat(label_file)['labels'][0]
+    indexes = scio.loadmat(setid_file)[dataset_name][0]
+    img2label = {}
+    for i in indexes:
+        img = "jpg/image_%05d.jpg" % i
+        img2label[img] = labels[i - 1]
+    file_list = batch_images_from_tar(data_file, dataset_name, img2label)
+
+    def reader():
+        for file in open(file_list):
+            file = file.strip()
+            batch = None
+            with open(file, 'r') as f:
+                batch = cPickle.load(f)
+            data = batch['data']
+            labels = batch['label']
+            for sample, label in itertools.izip(data, batch['label']):
+                yield sample, int(label) - 1
+
+    if use_xmap:
+        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
+        return xmap_readers(mapper, reader, cpu_num, buffered_size)
+    else:
+        return map_readers(mapper, reader)
+
+
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
+    '''
+    Create flowers training set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [1, 102]
+    translated from original color image by steps:
+    1. resize to 256*256
+    2. random crop to 224*224
+    3. flatten
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: train data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
+        buffered_size, use_xmap)
+
+
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+    '''
+    Create flowers test set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [1, 102]
+    translated from original color image by steps:
+    1. resize to 256*256
+    2. random crop to 224*224
+    3. flatten
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: test data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
+        buffered_size, use_xmap)
+
+
+def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+    '''
+    Create flowers validation set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [1, 102]
+    translated from original color image by steps:
+    1. resize to 256*256
+    2. random crop to 224*224
+    3. flatten
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param buffered_size: the size of buffer used to process images
+    :type buffered_size: int
+    :return: test data reader
+    :rtype: callable
+    '''
+    return reader_creator(
+        download(DATA_URL, 'flowers', DATA_MD5),
+        download(LABEL_URL, 'flowers', LABEL_MD5),
+        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
+        buffered_size, use_xmap)
+
+
+def fetch():
+    download(DATA_URL, 'flowers', DATA_MD5)
+    download(LABEL_URL, 'flowers', LABEL_MD5)
+    download(SETID_URL, 'flowers', SETID_MD5)
diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..9235c41e9eb95b25a0dc53a494a203e7a4525981
--- /dev/null
+++ b/python/paddle/dataset/image.py
@@ -0,0 +1,381 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This file contains some common interfaces for image preprocess.
+Many users are confused about the image layout. We introduce
+the image layout as follows.
+
+- CHW Layout
+
+  - The abbreviations: C=channel, H=Height, W=Width
+  - The default layout of image opened by cv2 or PIL is HWC.
+    PaddlePaddle only supports the CHW layout. And CHW is simply
+    a transpose of HWC. It must transpose the input image.
+
+- Color format: RGB or BGR
+
+  OpenCV use BGR color format. PIL use RGB color format. Both
+  formats can be used for training. Noted that, the format should
+  be keep consistent between the training and inference peroid.
+"""
+import numpy as np
+try:
+    import cv2
+except ImportError:
+    cv2 = None
+import os
+import tarfile
+import cPickle
+
+__all__ = [
+    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
+    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+    "batch_images_from_tar"
+]
+
+
+def batch_images_from_tar(data_file,
+                          dataset_name,
+                          img2label,
+                          num_per_batch=1024):
+    """
+    Read images from tar file and batch them into batch file.
+
+    :param data_file: path of image tar file
+    :type data_file: string
+    :param dataset_name: 'train','test' or 'valid'
+    :type dataset_name: string
+    :param img2label: a dic with image file name as key 
+                    and image's label as value
+    :type img2label: dic
+    :param num_per_batch: image number per batch file
+    :type num_per_batch: int
+    :return: path of list file containing paths of batch file
+    :rtype: string
+    """
+    batch_dir = data_file + "_batch"
+    out_path = "%s/%s" % (batch_dir, dataset_name)
+    meta_file = "%s/%s.txt" % (batch_dir, dataset_name)
+
+    if os.path.exists(out_path):
+        return meta_file
+    else:
+        os.makedirs(out_path)
+
+    tf = tarfile.open(data_file)
+    mems = tf.getmembers()
+    data = []
+    labels = []
+    file_id = 0
+    for mem in mems:
+        if mem.name in img2label:
+            data.append(tf.extractfile(mem).read())
+            labels.append(img2label[mem.name])
+            if len(data) == num_per_batch:
+                output = {}
+                output['label'] = labels
+                output['data'] = data
+                cPickle.dump(
+                    output,
+                    open('%s/batch_%d' % (out_path, file_id), 'w'),
+                    protocol=cPickle.HIGHEST_PROTOCOL)
+                file_id += 1
+                data = []
+                labels = []
+    if len(data) > 0:
+        output = {}
+        output['label'] = labels
+        output['data'] = data
+        cPickle.dump(
+            output,
+            open('%s/batch_%d' % (out_path, file_id), 'w'),
+            protocol=cPickle.HIGHEST_PROTOCOL)
+
+    with open(meta_file, 'a') as meta:
+        for file in os.listdir(out_path):
+            meta.write(os.path.abspath("%s/%s" % (out_path, file)) + "\n")
+    return meta_file
+
+
+def load_image_bytes(bytes, is_color=True):
+    """
+    Load an color or gray image from bytes array.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        with open('cat.jpg') as f:
+            im = load_image_bytes(f.read())
+
+    :param bytes: the input image bytes array.
+    :type bytes: str
+    :param is_color: If set is_color True, it will load and
+                     return a color image. Otherwise, it will
+                     load and return a gray image.
+    :type is_color: bool
+    """
+    flag = 1 if is_color else 0
+    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
+    img = cv2.imdecode(file_bytes, flag)
+    return img
+
+
+def load_image(file, is_color=True):
+    """
+    Load an color or gray image from the file path.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+
+    :param file: the input image path.
+    :type file: string
+    :param is_color: If set is_color True, it will load and
+                     return a color image. Otherwise, it will
+                     load and return a gray image.
+    :type is_color: bool
+    """
+    # cv2.IMAGE_COLOR for OpenCV3
+    # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
+    # cv2.IMAGE_GRAYSCALE for OpenCV3
+    # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version
+    # Here, use constant 1 and 0
+    # 1: COLOR, 0: GRAYSCALE
+    flag = 1 if is_color else 0
+    im = cv2.imread(file, flag)
+    return im
+
+
+def resize_short(im, size):
+    """ 
+    Resize an image so that the length of shorter edge is size.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+        im = resize_short(im, 256)
+    
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the shorter edge size of image after resizing.
+    :type size: int
+    """
+    h, w = im.shape[:2]
+    h_new, w_new = size, size
+    if h > w:
+        h_new = size * h / w
+    else:
+        w_new = size * w / h
+    im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
+    return im
+
+
+def to_chw(im, order=(2, 0, 1)):
+    """
+    Transpose the input image order. The image layout is HWC format
+    opened by cv2 or PIL. Transpose the input image to CHW layout
+    according the order (2,0,1).
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+        im = resize_short(im, 256)
+        im = to_chw(im)
+    
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param order: the transposed order.
+    :type order: tuple|list 
+    """
+    assert len(im.shape) == len(order)
+    im = im.transpose(order)
+    return im
+
+
+def center_crop(im, size, is_color=True):
+    """
+    Crop the center of image with size.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = center_crop(im, 224)
+    
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the cropping size.
+    :type size: int
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    """
+    h, w = im.shape[:2]
+    h_start = (h - size) / 2
+    w_start = (w - size) / 2
+    h_end, w_end = h_start + size, w_start + size
+    if is_color:
+        im = im[h_start:h_end, w_start:w_end, :]
+    else:
+        im = im[h_start:h_end, w_start:w_end]
+    return im
+
+
+def random_crop(im, size, is_color=True):
+    """
+    Randomly crop input image with size.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = random_crop(im, 224)
+    
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the cropping size.
+    :type size: int
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    """
+    h, w = im.shape[:2]
+    h_start = np.random.randint(0, h - size + 1)
+    w_start = np.random.randint(0, w - size + 1)
+    h_end, w_end = h_start + size, w_start + size
+    if is_color:
+        im = im[h_start:h_end, w_start:w_end, :]
+    else:
+        im = im[h_start:h_end, w_start:w_end]
+    return im
+
+
+def left_right_flip(im, is_color=True):
+    """
+    Flip an image along the horizontal direction.
+    Return the flipped image.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = left_right_flip(im)
+    
+    :param im: input image with HWC layout or HW layout for gray image
+    :type im: ndarray
+    :param is_color: whether input image is color or not
+    :type is_color: bool
+    """
+    if len(im.shape) == 3 and is_color:
+        return im[:, ::-1, :]
+    else:
+        return im[:, ::-1]
+
+
+def simple_transform(im,
+                     resize_size,
+                     crop_size,
+                     is_train,
+                     is_color=True,
+                     mean=None):
+    """
+    Simply data argumentation for training. These operations include
+    resizing, croping and flipping.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = simple_transform(im, 256, 224, True)
+
+    :param im: The input image with HWC layout.
+    :type im: ndarray
+    :param resize_size: The shorter edge length of the resized image.
+    :type resize_size: int
+    :param crop_size: The cropping size.
+    :type crop_size: int
+    :param is_train: Whether it is training or not.
+    :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
+    """
+    im = resize_short(im, resize_size)
+    if is_train:
+        im = random_crop(im, crop_size, is_color=is_color)
+        if np.random.randint(2) == 0:
+            im = left_right_flip(im, is_color)
+    else:
+        im = center_crop(im, crop_size, is_color)
+        im = center_crop(im, crop_size, is_color=is_color)
+    if len(im.shape) == 3:
+        im = to_chw(im)
+
+    im = im.astype('float32')
+    if mean is not None:
+        mean = np.array(mean, dtype=np.float32)
+        # mean value, may be one value per channel 
+        if mean.ndim == 1 and is_color:
+            mean = mean[:, np.newaxis, np.newaxis]
+        elif mean.ndim == 1:
+            mean = mean
+        else:
+            # elementwise mean
+            assert len(mean.shape) == len(im)
+        im -= mean
+
+    return im
+
+
+def load_and_transform(filename,
+                       resize_size,
+                       crop_size,
+                       is_train,
+                       is_color=True,
+                       mean=None):
+    """
+    Load image from the input file `filename` and transform image for
+    data argumentation. Please refer to the `simple_transform` interface
+    for the transform operations.
+
+    Example usage:
+    
+    .. code-block:: python
+
+        im = load_and_transform('cat.jpg', 256, 224, True)
+
+    :param filename: The file name of input image.
+    :type filename: string
+    :param resize_size: The shorter edge length of the resized image.
+    :type resize_size: int
+    :param crop_size: The cropping size.
+    :type crop_size: int
+    :param is_train: Whether it is training or not.
+    :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
+    """
+    im = load_image(filename, is_color)
+    im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
+    return im
diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ff05b1e9b7f4c42909370a21beb140ecdcd6868
--- /dev/null
+++ b/python/paddle/dataset/imdb.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+IMDB dataset.
+
+This module downloads IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
+of 25,000 highly polar movie reviews for training, and 25,000 for testing.
+Besides, this module also provides API for building dictionary.
+"""
+
+import paddle.dataset.common
+import collections
+import tarfile
+import re
+import string
+
+__all__ = ['build_dict', 'train', 'test', 'convert']
+
+URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+def tokenize(pattern):
+    """
+    Read files that match the given pattern.  Tokenize and yield each file.
+    """
+
+    with tarfile.open(paddle.dataset.common.download(URL, 'imdb', MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, other than
+        # tarfile.extractfile, which does random access and might
+        # destroy hard disks.
+        tf = tarf.next()
+        while tf != None:
+            if bool(pattern.match(tf.name)):
+                # newline and punctuations removal and ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
+                    None, string.punctuation).lower().split()
+            tf = tarf.next()
+
+
+def build_dict(pattern, cutoff):
+    """
+    Build a word dictionary from the corpus. Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
+    word_freq = collections.defaultdict(int)
+    for doc in tokenize(pattern):
+        for word in doc:
+            word_freq[word] += 1
+
+    # Not sure if we should prune less-frequent words here.
+    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+
+    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+    words, _ = list(zip(*dictionary))
+    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx['<unk>'] = len(words)
+    return word_idx
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx):
+    UNK = word_idx['<unk>']
+    INS = []
+
+    def load(pattern, out, label):
+        for doc in tokenize(pattern):
+            out.append(([word_idx.get(w, UNK) for w in doc], label))
+
+    load(pos_pattern, INS, 0)
+    load(neg_pattern, INS, 1)
+
+    def reader():
+        for doc, label in INS:
+            yield doc, label
+
+    return reader
+
+
+def train(word_idx):
+    """
+    IMDB training set creator.
+
+    It returns a reader creator, each sample in the reader is an zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("aclImdb/train/pos/.*\.txt$"),
+        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
+
+
+def test(word_idx):
+    """
+    IMDB test set creator.
+
+    It returns a reader creator, each sample in the reader is an zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("aclImdb/test/pos/.*\.txt$"),
+        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
+
+
+def word_dict():
+    """
+    Build a word dictionary from the corpus.
+
+    :return: Word dictionary
+    :rtype: dict
+    """
+    return build_dict(
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+
+
+def fetch():
+    paddle.dataset.common.download(URL, 'imdb', MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    w = word_dict()
+    paddle.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
+    paddle.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6c0a0f54373dd068b2c493f6fbc9c8578593aef
--- /dev/null
+++ b/python/paddle/dataset/imikolov.py
@@ -0,0 +1,160 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+imikolov's simple dataset.
+
+This module will download dataset from 
+http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
+into paddle reader creators.
+"""
+import paddle.dataset.common
+import collections
+import tarfile
+
+__all__ = ['train', 'test', 'build_dict', 'convert']
+
+URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+class DataType(object):
+    NGRAM = 1
+    SEQ = 2
+
+
+def word_count(f, word_freq=None):
+    if word_freq is None:
+        word_freq = collections.defaultdict(int)
+
+    for l in f:
+        for w in l.strip().split():
+            word_freq[w] += 1
+        word_freq['<s>'] += 1
+        word_freq['<e>'] += 1
+
+    return word_freq
+
+
+def build_dict(min_word_freq=50):
+    """
+    Build a word dictionary from the corpus,  Keys of the dictionary are words,
+    and values are zero-based IDs of these words.
+    """
+    train_filename = './simple-examples/data/ptb.train.txt'
+    test_filename = './simple-examples/data/ptb.valid.txt'
+    with tarfile.open(
+            paddle.dataset.common.download(paddle.dataset.imikolov.URL,
+                                           'imikolov',
+                                           paddle.dataset.imikolov.MD5)) as tf:
+        trainf = tf.extractfile(train_filename)
+        testf = tf.extractfile(test_filename)
+        word_freq = word_count(testf, word_count(trainf))
+        if '<unk>' in word_freq:
+            # remove <unk> for now, since we will set it as last index
+            del word_freq['<unk>']
+
+        word_freq = filter(lambda x: x[1] > min_word_freq, word_freq.items())
+
+        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+        words, _ = list(zip(*word_freq_sorted))
+        word_idx = dict(zip(words, xrange(len(words))))
+        word_idx['<unk>'] = len(words)
+
+    return word_idx
+
+
+def reader_creator(filename, word_idx, n, data_type):
+    def reader():
+        with tarfile.open(
+                paddle.dataset.common.download(
+                    paddle.dataset.imikolov.URL, 'imikolov',
+                    paddle.dataset.imikolov.MD5)) as tf:
+            f = tf.extractfile(filename)
+
+            UNK = word_idx['<unk>']
+            for l in f:
+                if DataType.NGRAM == data_type:
+                    assert n > -1, 'Invalid gram length'
+                    l = ['<s>'] + l.strip().split() + ['<e>']
+                    if len(l) >= n:
+                        l = [word_idx.get(w, UNK) for w in l]
+                        for i in range(n, len(l) + 1):
+                            yield tuple(l[i - n:i])
+                elif DataType.SEQ == data_type:
+                    l = l.strip().split()
+                    l = [word_idx.get(w, UNK) for w in l]
+                    src_seq = [word_idx['<s>']] + l
+                    trg_seq = l + [word_idx['<e>']]
+                    if n > 0 and len(src_seq) > n: continue
+                    yield src_seq, trg_seq
+                else:
+                    assert False, 'Unknow data type'
+
+    return reader
+
+
+def train(word_idx, n, data_type=DataType.NGRAM):
+    """
+    imikolov training set creator.
+
+    It returns a reader creator, each sample in the reader is a word ID
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size if type is ngram, otherwise max length of sequence
+    :type n: int
+    :param data_type: data type (ngram or sequence)
+    :type data_type: member variable of DataType (NGRAM or SEQ)
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n,
+                          data_type)
+
+
+def test(word_idx, n, data_type=DataType.NGRAM):
+    """
+    imikolov test set creator.
+
+    It returns a reader creator, each sample in the reader is a word ID
+    tuple.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size if type is ngram, otherwise max length of sequence
+    :type n: int
+    :param data_type: data type (ngram or sequence)
+    :type data_type: member variable of DataType (NGRAM or SEQ)
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n,
+                          data_type)
+
+
+def fetch():
+    paddle.dataset.common.download(URL, "imikolov", MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    N = 5
+    word_dict = build_dict()
+    paddle.dataset.common.convert(path,
+                                  train(word_dict, N), 1000, "imikolov_train")
+    paddle.dataset.common.convert(path,
+                                  test(word_dict, N), 1000, "imikolov_test")
diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d05aeeb95c4f936cb773ece20407ecb32cbbf21
--- /dev/null
+++ b/python/paddle/dataset/mnist.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MNIST dataset.
+
+This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
+parse training set and test set into paddle reader creators.
+"""
+import paddle.dataset.common
+import subprocess
+import numpy
+import platform
+__all__ = ['train', 'test', 'convert']
+
+URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
+TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
+TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
+TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
+TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
+TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
+TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
+TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
+TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
+
+
+def reader_creator(image_filename, label_filename, buffer_size):
+    def reader():
+        if platform.system() == 'Darwin':
+            zcat_cmd = 'gzcat'
+        elif platform.system() == 'Linux':
+            zcat_cmd = 'zcat'
+        else:
+            raise NotImplementedError()
+
+        # According to http://stackoverflow.com/a/38061619/724872, we
+        # cannot use standard package gzip here.
+        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
+        m.stdout.read(16)  # skip some magic bytes
+
+        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
+        l.stdout.read(8)  # skip some magic bytes
+
+        try:  # reader could be break.
+            while True:
+                labels = numpy.fromfile(
+                    l.stdout, 'ubyte', count=buffer_size).astype("int")
+
+                if labels.size != buffer_size:
+                    break  # numpy.fromfile returns empty slice after EOF.
+
+                images = numpy.fromfile(
+                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
+                        (buffer_size, 28 * 28)).astype('float32')
+
+                images = images / 255.0 * 2.0 - 1.0
+
+                for i in xrange(buffer_size):
+                    yield images[i, :], int(labels[i])
+        finally:
+            m.terminate()
+            l.terminate()
+
+    return reader
+
+
+def train():
+    """
+    MNIST training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
+                                       TRAIN_IMAGE_MD5),
+        paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
+                                       TRAIN_LABEL_MD5), 100)
+
+
+def test():
+    """
+    MNIST test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5),
+        paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5),
+        100)
+
+
+def fetch():
+    paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train(), 1000, "minist_train")
+    paddle.dataset.common.convert(path, test(), 1000, "minist_test")
diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab11716202a8298c182e23b661eb1d2ac74bf4da
--- /dev/null
+++ b/python/paddle/dataset/movielens.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Movielens 1-M dataset.
+
+Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
+movies, which was collected by GroupLens Research. This module will download
+Movielens 1-M dataset from 
+http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
+set and test set into paddle reader creators.
+
+"""
+
+import zipfile
+import paddle.dataset.common
+import re
+import random
+import functools
+
+__all__ = [
+    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
+    'convert'
+]
+
+age_table = [1, 18, 25, 35, 45, 50, 56]
+
+URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
+MD5 = 'c4d9eecfca2ab87c1945afe126590906'
+
+
+class MovieInfo(object):
+    """
+    Movie id, title and categories information are stored in MovieInfo.
+    """
+
+    def __init__(self, index, categories, title):
+        self.index = int(index)
+        self.categories = categories
+        self.title = title
+
+    def value(self):
+        """
+        Get information from a movie.
+        """
+        return [
+            self.index, [CATEGORIES_DICT[c] for c in self.categories],
+            [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
+        ]
+
+    def __str__(self):
+        return "<MovieInfo id(%d), title(%s), categories(%s)>" % (
+            self.index, self.title, self.categories)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class UserInfo(object):
+    """
+    User id, gender, age, and job information are stored in UserInfo.
+    """
+
+    def __init__(self, index, gender, age, job_id):
+        self.index = int(index)
+        self.is_male = gender == 'M'
+        self.age = age_table.index(int(age))
+        self.job_id = int(job_id)
+
+    def value(self):
+        """
+        Get information from a user.
+        """
+        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
+
+    def __str__(self):
+        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % (
+            self.index, "M"
+            if self.is_male else "F", age_table[self.age], self.job_id)
+
+    def __repr__(self):
+        return str(self)
+
+
+MOVIE_INFO = None
+MOVIE_TITLE_DICT = None
+CATEGORIES_DICT = None
+USER_INFO = None
+
+
+def __initialize_meta_info__():
+    fn = paddle.dataset.common.download(URL, "movielens", MD5)
+    global MOVIE_INFO
+    if MOVIE_INFO is None:
+        pattern = re.compile(r'^(.*)\((\d+)\)$')
+        with zipfile.ZipFile(file=fn) as package:
+            for info in package.infolist():
+                assert isinstance(info, zipfile.ZipInfo)
+                MOVIE_INFO = dict()
+                title_word_set = set()
+                categories_set = set()
+                with package.open('ml-1m/movies.dat') as movie_file:
+                    for i, line in enumerate(movie_file):
+                        movie_id, title, categories = line.strip().split('::')
+                        categories = categories.split('|')
+                        for c in categories:
+                            categories_set.add(c)
+                        title = pattern.match(title).group(1)
+                        MOVIE_INFO[int(movie_id)] = MovieInfo(
+                            index=movie_id, categories=categories, title=title)
+                        for w in title.split():
+                            title_word_set.add(w.lower())
+
+                global MOVIE_TITLE_DICT
+                MOVIE_TITLE_DICT = dict()
+                for i, w in enumerate(title_word_set):
+                    MOVIE_TITLE_DICT[w] = i
+
+                global CATEGORIES_DICT
+                CATEGORIES_DICT = dict()
+                for i, c in enumerate(categories_set):
+                    CATEGORIES_DICT[c] = i
+
+                global USER_INFO
+                USER_INFO = dict()
+                with package.open('ml-1m/users.dat') as user_file:
+                    for line in user_file:
+                        uid, gender, age, job, _ = line.strip().split("::")
+                        USER_INFO[int(uid)] = UserInfo(
+                            index=uid, gender=gender, age=age, job_id=job)
+    return fn
+
+
+def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
+    fn = __initialize_meta_info__()
+    rand = random.Random(x=rand_seed)
+    with zipfile.ZipFile(file=fn) as package:
+        with package.open('ml-1m/ratings.dat') as rating:
+            for line in rating:
+                if (rand.random() < test_ratio) == is_test:
+                    uid, mov_id, rating, _ = line.strip().split("::")
+                    uid = int(uid)
+                    mov_id = int(mov_id)
+                    rating = float(rating) * 2 - 5.0
+
+                    mov = MOVIE_INFO[mov_id]
+                    usr = USER_INFO[uid]
+                    yield usr.value() + mov.value() + [[rating]]
+
+
+def __reader_creator__(**kwargs):
+    return lambda: __reader__(**kwargs)
+
+
+train = functools.partial(__reader_creator__, is_test=False)
+test = functools.partial(__reader_creator__, is_test=True)
+
+
+def get_movie_title_dict():
+    """
+    Get movie title dictionary.
+    """
+    __initialize_meta_info__()
+    return MOVIE_TITLE_DICT
+
+
+def __max_index_info__(a, b):
+    if a.index > b.index:
+        return a
+    else:
+        return b
+
+
+def max_movie_id():
+    """
+    Get the maximum value of movie id.
+    """
+    __initialize_meta_info__()
+    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
+
+
+def max_user_id():
+    """
+    Get the maximum value of user id.
+    """
+    __initialize_meta_info__()
+    return reduce(__max_index_info__, USER_INFO.viewvalues()).index
+
+
+def __max_job_id_impl__(a, b):
+    if a.job_id > b.job_id:
+        return a
+    else:
+        return b
+
+
+def max_job_id():
+    """
+    Get the maximum value of job id.
+    """
+    __initialize_meta_info__()
+    return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
+
+
+def movie_categories():
+    """
+    Get movie categoriges dictionary.
+    """
+    __initialize_meta_info__()
+    return CATEGORIES_DICT
+
+
+def user_info():
+    """
+    Get user info dictionary.
+    """
+    __initialize_meta_info__()
+    return USER_INFO
+
+
+def movie_info():
+    """
+    Get movie info dictionary.
+    """
+    __initialize_meta_info__()
+    return MOVIE_INFO
+
+
+def unittest():
+    for train_count, _ in enumerate(train()()):
+        pass
+    for test_count, _ in enumerate(test()()):
+        pass
+
+    print train_count, test_count
+
+
+def fetch():
+    paddle.dataset.common.download(URL, "movielens", MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train(), 1000, "movielens_train")
+    paddle.dataset.common.convert(path, test(), 1000, "movielens_test")
+
+
+if __name__ == '__main__':
+    unittest()
diff --git a/python/paddle/dataset/mq2007.py b/python/paddle/dataset/mq2007.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3b3dd524c34be660c5f2d4fc5ce2fa0420efbc1
--- /dev/null
+++ b/python/paddle/dataset/mq2007.py
@@ -0,0 +1,333 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MQ2007 dataset
+
+MQ2007 is a query set from Million Query track of TREC 2007. There are about 1700 queries in it with labeled documents. In MQ2007, the 5-fold cross
+validation strategy is adopted and the 5-fold partitions are included in the package. In each fold, there are three subsets for learning: training set,
+validation set and testing set.
+
+MQ2007 dataset from website
+http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar and parse training set and test set into paddle reader creators
+
+"""
+
+import os
+import functools
+import rarfile
+from common import download
+import numpy as np
+
+# URL = "http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ2007.rar"
+URL = "http://www.bigdatalab.ac.cn/benchmark/upload/download_source/7b6dbbe2-842c-11e4-a536-bcaec51b9163_MQ2007.rar"
+MD5 = "7be1640ae95c6408dab0ae7207bdc706"
+
+
+def __initialize_meta_info__():
+    """
+  download and extract the MQ2007 dataset
+  """
+    fn = fetch()
+    rar = rarfile.RarFile(fn)
+    dirpath = os.path.dirname(fn)
+    rar.extractall(path=dirpath)
+    return dirpath
+
+
+class Query(object):
+    """
+  queries used for learning to rank algorithms. It is created from relevance scores,  query-document feature vectors
+
+  Parameters:
+  ----------
+  query_id : int
+    query_id in dataset, mapping from query to relevance documents
+  relevance_score : int 
+    relevance score of query and document pair
+  feature_vector : array, dense feature
+    feature in vector format
+  description : string
+    comment section in query doc pair data
+  """
+
+    def __init__(self,
+                 query_id=-1,
+                 relevance_score=-1,
+                 feature_vector=None,
+                 description=""):
+        self.query_id = query_id
+        self.relevance_score = relevance_score
+        if feature_vector is None:
+            self.feature_vector = []
+        else:
+            self.feature_vector = feature_vector
+        self.description = description
+
+    def __str__(self):
+        string = "%s %s %s" % (str(self.relevance_score), str(self.query_id),
+                               " ".join(str(f) for f in self.feature_vector))
+        return string
+
+    # @classmethod
+    def _parse_(self, text):
+        """
+    parse line into Query
+    """
+        comment_position = text.find('#')
+        line = text[:comment_position].strip()
+        self.description = text[comment_position + 1:].strip()
+        parts = line.split()
+        if len(parts) != 48:
+            sys.stdout.write("expect 48 space split parts, get %d" %
+                             (len(parts)))
+            return None
+        # format : 0 qid:10 1:0.000272 2:0.000000 .... 
+        self.relevance_score = int(parts[0])
+        self.query_id = int(parts[1].split(':')[1])
+        for p in parts[2:]:
+            pair = p.split(':')
+            self.feature_vector.append(float(pair[1]))
+        return self
+
+
+class QueryList(object):
+    """
+  group query into list, every item in list is a Query
+  """
+
+    def __init__(self, querylist=None):
+        self.query_id = -1
+        if querylist is None:
+            self.querylist = []
+        else:
+            self.querylist = querylist
+            for query in self.querylist:
+                if self.query_id == -1:
+                    self.query_id = query.query_id
+                else:
+                    if self.query_id != query.query_id:
+                        raise ValueError("query in list must be same query_id")
+
+    def __iter__(self):
+        for query in self.querylist:
+            yield query
+
+    def __len__(self):
+        return len(self.querylist)
+
+    def __getitem__(self, i):
+        return self.querylist[i]
+
+    def _correct_ranking_(self):
+        if self.querylist is None:
+            return
+        self.querylist.sort(key=lambda x: x.relevance_score, reverse=True)
+
+    def _add_query(self, query):
+        if self.query_id == -1:
+            self.query_id = query.query_id
+        else:
+            if self.query_id != query.query_id:
+                raise ValueError("query in list must be same query_id")
+        self.querylist.append(query)
+
+
+def gen_plain_txt(querylist):
+    """
+  gen plain text in list for other usage
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+
+  return :
+  ------
+  query_id : np.array, shape=(samples_num, )
+  label : np.array, shape=(samples_num, )
+  querylist : np.array, shape=(samples_num, feature_dimension)
+    """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield querylist.query_id, query.relevance_score, np.array(
+            query.feature_vector)
+
+
+def gen_point(querylist):
+    """
+  gen item in list for point-wise learning to rank algorithm
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+
+  return :
+  ------
+  label : np.array, shape=(samples_num, )
+  querylist : np.array, shape=(samples_num, feature_dimension)
+  """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    for query in querylist:
+        yield query.relevance_score, np.array(query.feature_vector)
+
+
+def gen_pair(querylist, partial_order="full"):
+    """
+  gen pair for pair-wise learning to rank algorithm
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+  pairtial_order : "full" or "neighbour"
+    there is redudant in all possiable pair combinations, which can be simplifed
+  gen pairs for neighbour items or the full partial order pairs
+
+  return :
+  ------
+  label : np.array, shape=(1)
+  query_left : np.array, shape=(1, feature_dimension)
+  query_right : same as left
+  """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    labels = []
+    docpairs = []
+
+    # C(n,2)
+    for i in range(len(querylist)):
+        query_left = querylist[i]
+        for j in range(i + 1, len(querylist)):
+            query_right = querylist[j]
+            if query_left.relevance_score > query_right.relevance_score:
+                labels.append([1])
+                docpairs.append([
+                    np.array(query_left.feature_vector),
+                    np.array(query_right.feature_vector)
+                ])
+            elif query_left.relevance_score < query_right.relevance_score:
+                labels.append([1])
+                docpairs.append([
+                    np.array(query_right.feature_vector),
+                    np.array(query_left.feature_vector)
+                ])
+    for label, pair in zip(labels, docpairs):
+        yield np.array(label), pair[0], pair[1]
+
+
+def gen_list(querylist):
+    """
+  gen item in list for list-wise learning to rank algorithm
+  Paramters:
+  --------
+  querylist : querylist, one query match many docment pairs in list, see QueryList
+
+  return :
+  ------
+  label : np.array, shape=(samples_num, )
+  querylist : np.array, shape=(samples_num, feature_dimension)
+  """
+    if not isinstance(querylist, QueryList):
+        querylist = QueryList(querylist)
+    querylist._correct_ranking_()
+    relevance_score_list = [[query.relevance_score] for query in querylist]
+    feature_vector_list = [query.feature_vector for query in querylist]
+    yield np.array(relevance_score_list), np.array(feature_vector_list)
+
+
+def query_filter(querylists):
+    """
+    filter query get only document with label 0.
+    label 0, 1, 2 means the relevance score document with query
+    parameters :
+      querylist : QueyList list
+
+    return :
+      querylist : QueyList list
+    """
+    filter_query = []
+    for querylist in querylists:
+        relevance_score_list = [query.relevance_score for query in querylist]
+        if sum(relevance_score_list) != .0:
+            filter_query.append(querylist)
+    return filter_query
+
+
+def load_from_text(filepath, shuffle=False, fill_missing=-1):
+    """
+  parse data file into querys
+  """
+    prev_query_id = -1
+    querylists = []
+    querylist = None
+    fn = __initialize_meta_info__()
+    with open(os.path.join(fn, filepath)) as f:
+        for line in f:
+            query = Query()
+            query = query._parse_(line)
+            if query == None:
+                continue
+            if query.query_id != prev_query_id:
+                if querylist is not None:
+                    querylists.append(querylist)
+                querylist = QueryList()
+                prev_query_id = query.query_id
+            querylist._add_query(query)
+    if querylist is not None:
+        querylists.append(querylist)
+    return querylists
+
+
+def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
+    """
+  Parameters
+  --------
+  filename : string
+  fill_missing : fill the missing value. default in MQ2007 is -1
+  
+  Returns
+  ------
+  yield
+    label query_left, query_right  # format = "pairwise"
+    label querylist # format = "listwise"
+  """
+    querylists = query_filter(
+        load_from_text(
+            filepath, shuffle=shuffle, fill_missing=fill_missing))
+    for querylist in querylists:
+        if format == "plain_txt":
+            yield next(gen_plain_txt(querylist))
+        elif format == "pointwise":
+            yield next(gen_point(querylist))
+        elif format == "pairwise":
+            for pair in gen_pair(querylist):
+                yield pair
+        elif format == "listwise":
+            yield next(gen_list(querylist))
+
+
+train = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/train.txt")
+test = functools.partial(__reader__, filepath="MQ2007/MQ2007/Fold1/test.txt")
+
+
+def fetch():
+    return download(URL, "MQ2007", MD5)
+
+
+if __name__ == "__main__":
+    fetch()
+    mytest = functools.partial(
+        __reader__, filepath="MQ2007/MQ2007/Fold1/sample", format="listwise")
+    for label, query in mytest():
+        print label, query
diff --git a/python/paddle/dataset/sentiment.py b/python/paddle/dataset/sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5461164fe6b816356e42fc7b7dcf388eccfadfb
--- /dev/null
+++ b/python/paddle/dataset/sentiment.py
@@ -0,0 +1,140 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The script fetch and preprocess movie_reviews data set that provided by NLTK
+
+TODO(yuyang18): Complete dataset.
+"""
+
+import collections
+from itertools import chain
+
+import nltk
+from nltk.corpus import movie_reviews
+
+import paddle.dataset.common
+
+__all__ = ['train', 'test', 'get_word_dict', 'convert']
+NUM_TRAINING_INSTANCES = 1600
+NUM_TOTAL_INSTANCES = 2000
+
+
+def download_data_if_not_yet():
+    """
+    Download the data set, if the data set is not download.
+    """
+    try:
+        # make sure that nltk can find the data
+        if paddle.dataset.common.DATA_HOME not in nltk.data.path:
+            nltk.data.path.append(paddle.dataset.common.DATA_HOME)
+        movie_reviews.categories()
+    except LookupError:
+        print "Downloading movie_reviews data set, please wait....."
+        nltk.download(
+            'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
+        print "Download data set success....."
+        print "Path is " + nltk.data.find('corpora/movie_reviews').path
+
+
+def get_word_dict():
+    """
+    Sorted the words by the frequency of words which occur in sample
+    :return:
+        words_freq_sorted
+    """
+    words_freq_sorted = list()
+    word_freq_dict = collections.defaultdict(int)
+    download_data_if_not_yet()
+
+    for category in movie_reviews.categories():
+        for field in movie_reviews.fileids(category):
+            for words in movie_reviews.words(field):
+                word_freq_dict[words] += 1
+    words_sort_list = word_freq_dict.items()
+    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
+    for index, word in enumerate(words_sort_list):
+        words_freq_sorted.append((word[0], index))
+    return words_freq_sorted
+
+
+def sort_files():
+    """
+    Sorted the sample for cross reading the sample
+    :return:
+        files_list
+    """
+    files_list = list()
+    neg_file_list = movie_reviews.fileids('neg')
+    pos_file_list = movie_reviews.fileids('pos')
+    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
+    return files_list
+
+
+def load_sentiment_data():
+    """
+    Load the data set
+    :return:
+        data_set
+    """
+    data_set = list()
+    download_data_if_not_yet()
+    words_ids = dict(get_word_dict())
+    for sample_file in sort_files():
+        words_list = list()
+        category = 0 if 'neg' in sample_file else 1
+        for word in movie_reviews.words(sample_file):
+            words_list.append(words_ids[word.lower()])
+        data_set.append((words_list, category))
+    return data_set
+
+
+def reader_creator(data):
+    """
+    Reader creator, generate an iterator for data set
+    :param data:
+        train data set or test data set
+    """
+    for each in data:
+        yield each[0], each[1]
+
+
+def train():
+    """
+    Default training set reader creator
+    """
+    data_set = load_sentiment_data()
+    return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
+
+
+def test():
+    """
+    Default test set reader creator
+    """
+    data_set = load_sentiment_data()
+    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
+
+
+def fetch():
+    nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train, 1000, "sentiment_train")
+    paddle.dataset.common.convert(path, test, 1000, "sentiment_test")
diff --git a/python/paddle/dataset/tests/CMakeLists.txt b/python/paddle/dataset/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..485c38a13b573664d8033c237272a10ebb7c9701
--- /dev/null
+++ b/python/paddle/dataset/tests/CMakeLists.txt
@@ -0,0 +1 @@
+py_test(test_image SRCS test_image.py)
diff --git a/python/paddle/dataset/tests/cat.jpg b/python/paddle/dataset/tests/cat.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bc1fbbd371216b9904b522ed302700c79d2e4876
Binary files /dev/null and b/python/paddle/dataset/tests/cat.jpg differ
diff --git a/python/paddle/dataset/tests/cifar_test.py b/python/paddle/dataset/tests/cifar_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..839125b09dd5c6432e3572374a7345a77a43f7cf
--- /dev/null
+++ b/python/paddle/dataset/tests/cifar_test.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.cifar
+import unittest
+
+
+class TestCIFAR(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3072)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_test10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.dataset.cifar.test10())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_train10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.dataset.cifar.train10())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.dataset.cifar.test100())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 99)
+
+    def test_train100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.dataset.cifar.train100())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 99)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/tests/common_test.py b/python/paddle/dataset/tests/common_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7cc02aa83061599ffefa18de6cb02ac0fc9e9b7
--- /dev/null
+++ b/python/paddle/dataset/tests/common_test.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.common
+import unittest
+import tempfile
+import glob
+
+
+class TestCommon(unittest.TestCase):
+    def test_md5file(self):
+        _, temp_path = tempfile.mkstemp()
+        with open(temp_path, 'w') as f:
+            f.write("Hello\n")
+        self.assertEqual('09f7e02f1290be211da707a266f153b3',
+                         paddle.dataset.common.md5file(temp_path))
+
+    def test_download(self):
+        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
+        self.assertEqual(
+            paddle.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.dataset.common.download(yi_avatar, 'test',
+                                           'f75287202d6622414c706c36c16f8e0d'))
+
+    def test_split(self):
+        def test_reader():
+            def reader():
+                for x in xrange(10):
+                    yield x
+
+            return reader
+
+        _, temp_path = tempfile.mkstemp()
+        paddle.dataset.common.split(
+            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
+        files = glob.glob(temp_path + '/test-%05d.pickle')
+        self.assertEqual(len(files), 3)
+
+    def test_cluster_file_reader(self):
+        _, temp_path = tempfile.mkstemp()
+        for x in xrange(5):
+            with open(temp_path + '/%05d.test' % x) as f:
+                f.write('%d\n' % x)
+        reader = paddle.dataset.common.cluster_files_reader(
+            temp_path + '/*.test', 5, 0)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str("0"))
+
+    def test_convert(self):
+        record_num = 10
+        num_shards = 4
+
+        def test_reader():
+            def reader():
+                for x in xrange(record_num):
+                    yield x
+
+            return reader
+
+        path = tempfile.mkdtemp()
+        paddle.dataset.common.convert(path,
+                                      test_reader(), num_shards,
+                                      'random_images')
+
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+
+        recs.sort()
+        self.assertEqual(total, record_num)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/tests/flowers_test.py b/python/paddle/dataset/tests/flowers_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..06260fd796ce0271b7cec2f42a8a5a255a02dc24
--- /dev/null
+++ b/python/paddle/dataset/tests/flowers_test.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.flowers
+import unittest
+
+
+class TestFlowers(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        size = 224 * 224 * 3
+        for l in reader():
+            self.assertEqual(l[0].size, size)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.dataset.flowers.train())
+        self.assertEqual(instances, 6149)
+        self.assertEqual(max_label_value, 102)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.dataset.flowers.test())
+        self.assertEqual(instances, 1020)
+        self.assertEqual(max_label_value, 102)
+
+    def test_valid(self):
+        instances, max_label_value = self.check_reader(
+            paddle.dataset.flowers.valid())
+        self.assertEqual(instances, 1020)
+        self.assertEqual(max_label_value, 102)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/tests/imdb_test.py b/python/paddle/dataset/tests/imdb_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..539da049449cd273db0a9e260851ed40e1be0f04
--- /dev/null
+++ b/python/paddle/dataset/tests/imdb_test.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.imdb
+import unittest
+import re
+
+TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
+TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
+TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
+
+TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
+TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
+TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
+
+
+class TestIMDB(unittest.TestCase):
+    word_idx = None
+
+    def test_build_dict(self):
+        if self.word_idx == None:
+            self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150)
+
+        self.assertEqual(len(self.word_idx), 7036)
+
+    def check_dataset(self, dataset, expected_size):
+        if self.word_idx == None:
+            self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150)
+
+        sum = 0
+        for l in dataset(self.word_idx):
+            self.assertEqual(l[1], sum % 2)
+            sum += 1
+        self.assertEqual(sum, expected_size)
+
+    def test_train(self):
+        self.check_dataset(paddle.dataset.imdb.train, 25000)
+
+    def test_test(self):
+        self.check_dataset(paddle.dataset.imdb.test, 25000)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/tests/imikolov_test.py b/python/paddle/dataset/tests/imikolov_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..233fd9fc8cea4cd0b5cd052580030fc8c993693c
--- /dev/null
+++ b/python/paddle/dataset/tests/imikolov_test.py
@@ -0,0 +1,67 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.imikolov
+import unittest
+
+WORD_DICT = paddle.dataset.imikolov.build_dict()
+
+
+class TestMikolov(unittest.TestCase):
+    def check_reader(self, reader, n):
+        for l in reader():
+            self.assertEqual(len(l), n)
+
+    def test_train(self):
+        n = 5
+        self.check_reader(paddle.dataset.imikolov.train(WORD_DICT, n), n)
+
+        first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\
+            'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\
+            'rake regatta rubens sim snack-food ssangyong swapo wachter'
+        first_line = [
+            WORD_DICT.get(ch, WORD_DICT['<unk>'])
+            for ch in first_line.split(' ')
+        ]
+        for l in paddle.dataset.imikolov.train(
+                WORD_DICT, n=-1,
+                data_type=paddle.dataset.imikolov.DataType.SEQ)():
+            read_line = l[0][1:]
+            break
+        self.assertEqual(first_line, read_line)
+
+    def test_test(self):
+        n = 5
+        self.check_reader(paddle.dataset.imikolov.test(WORD_DICT, n), n)
+
+        first_line = 'consumers may want to move their telephones a little '\
+                'closer to the tv set'
+        first_line = [
+            WORD_DICT.get(ch, WORD_DICT['<unk>'])
+            for ch in first_line.split(' ')
+        ]
+        for l in paddle.dataset.imikolov.test(
+                WORD_DICT, n=-1,
+                data_type=paddle.dataset.imikolov.DataType.SEQ)():
+            read_line = l[0][1:]
+            break
+        self.assertEqual(first_line, read_line)
+
+    def test_total(self):
+        _, idx = zip(*WORD_DICT.items())
+        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/tests/mnist_test.py b/python/paddle/dataset/tests/mnist_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ada19d3f2ee13e194d08e19a4b86b558c69a0a7
--- /dev/null
+++ b/python/paddle/dataset/tests/mnist_test.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 784)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.dataset.mnist.train())
+        self.assertEqual(instances, 60000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.dataset.mnist.test())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/tests/mq2007_test.py b/python/paddle/dataset/tests/mq2007_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba388724a8e84591df7150b41f8ea39a850fc31
--- /dev/null
+++ b/python/paddle/dataset/tests/mq2007_test.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.mq2007
+import unittest
+
+
+class TestMQ2007(unittest.TestCase):
+    def test_pairwise(self):
+        for label, query_left, query_right in paddle.dataset.mq2007.test(
+                format="pairwise"):
+            self.assertEqual(query_left.shape(), (46, ))
+            self.assertEqual(query_right.shape(), (46, ))
+
+    def test_listwise(self):
+        for label_array, query_array in paddle.dataset.mq2007.test(
+                format="listwise"):
+            self.assertEqual(len(label_array), len(query_array))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/dataset/tests/test_image.py b/python/paddle/dataset/tests/test_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bd56607ae1998935a3b3aaa0e3279515c2a540c
--- /dev/null
+++ b/python/paddle/dataset/tests/test_image.py
@@ -0,0 +1,43 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.dataset.image as image
+
+
+class Image(unittest.TestCase):
+    def test_resize_flip_chw(self):
+        # resize
+        im = image.load_image('cat.jpg')
+        im = image.resize_short(im, 256)
+        self.assertEqual(256, min(im.shape[:2]))
+        self.assertEqual(3, im.shape[2])
+
+        # flip
+        im = image.left_right_flip(im)
+        im2 = np.flip(im, 1)
+        self.assertEqual(im.all(), im2.all())
+
+        # to_chw
+        h, w, c = im.shape
+        im = image.to_chw(im)
+        self.assertEqual(c, im.shape[0])
+        self.assertEqual(h, im.shape[1])
+        self.assertEqual(w, im.shape[2])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..543f4b7378b583ea3857bf785cf330c43e535c2a
--- /dev/null
+++ b/python/paddle/dataset/tests/test_sentiment.py
@@ -0,0 +1,55 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import nltk
+import paddle.dataset.sentiment as st
+from nltk.corpus import movie_reviews
+
+
+class TestSentimentMethods(unittest.TestCase):
+    def test_get_word_dict(self):
+        word_dict = st.get_word_dict()[0:10]
+        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          (u'is', 8), (u'in', 9)]
+        for idx, each in enumerate(word_dict):
+            self.assertEqual(each, test_word_list[idx])
+        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
+
+    def test_sort_files(self):
+        last_label = ''
+        for sample_file in st.sort_files():
+            current_label = sample_file.split("/")[0]
+            self.assertNotEqual(current_label, last_label)
+            last_label = current_label
+
+    def test_data_set(self):
+        data_set = st.load_sentiment_data()
+        last_label = -1
+        for each in st.test():
+            self.assertNotEqual(each[1], last_label)
+            last_label = each[1]
+        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
+        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
+        self.assertEqual(
+            len(list(st.test())),
+            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/tests/voc2012_test.py b/python/paddle/dataset/tests/voc2012_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d285461a8ae8a9cc69fbec0dcf5efc106b594f0
--- /dev/null
+++ b/python/paddle/dataset/tests/voc2012_test.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.voc2012
+import unittest
+
+
+class TestVOC(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3 * l[1].size)
+            sum += 1
+        return sum
+
+    def test_train(self):
+        count = self.check_reader(paddle.dataset.voc_seg.train())
+        self.assertEqual(count, 2913)
+
+    def test_test(self):
+        count = self.check_reader(paddle.dataset.voc_seg.test())
+        self.assertEqual(count, 1464)
+
+    def test_val(self):
+        count = self.check_reader(paddle.dataset.voc_seg.val())
+        self.assertEqual(count, 1449)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/dataset/tests/wmt16_test.py b/python/paddle/dataset/tests/wmt16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b949d8bf5212d51016a33da322095bde2038200
--- /dev/null
+++ b/python/paddle/dataset/tests/wmt16_test.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.wmt16
+import unittest
+
+
+class TestWMT16(unittest.TestCase):
+    def checkout_one_sample(self, sample):
+        # train data has 3 field: source language word indices,
+        # target language word indices, and target next word indices.
+        self.assertEqual(len(sample), 3)
+
+        # test start mark and end mark in source word indices.
+        self.assertEqual(sample[0][0], 0)
+        self.assertEqual(sample[0][-1], 1)
+
+        # test start mask in target word indices
+        self.assertEqual(sample[1][0], 0)
+
+        # test en mask in target next word indices
+        self.assertEqual(sample[2][-1], 1)
+
+    def test_train(self):
+        for idx, sample in enumerate(
+                paddle.dataset.wmt16.train(
+                    src_dict_size=100000, trg_dict_size=100000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_test(self):
+        for idx, sample in enumerate(
+                paddle.dataset.wmt16.test(
+                    src_dict_size=1000, trg_dict_size=1000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_val(self):
+        for idx, sample in enumerate(
+                paddle.dataset.wmt16.validation(
+                    src_dict_size=1000, trg_dict_size=1000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_get_dict(self):
+        dict_size = 1000
+        word_dict = paddle.dataset.wmt16.get_dict("en", dict_size, True)
+        self.assertEqual(len(word_dict), dict_size)
+        self.assertEqual(word_dict[0], "<s>")
+        self.assertEqual(word_dict[1], "<e>")
+        self.assertEqual(word_dict[2], "<unk>")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbfa477d055eb5f484989eacce38cee8d617d729
--- /dev/null
+++ b/python/paddle/dataset/uci_housing.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+UCI Housing dataset.
+
+This module will download dataset from
+https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
+parse training set and test set into paddle reader creators.
+"""
+
+import os
+
+import numpy as np
+import tempfile
+import tarfile
+import os
+import paddle.dataset.common
+
+__all__ = ['train', 'test']
+
+URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
+MD5 = 'd4accdce7a25600298819f8e28e8d593'
+feature_names = [
+    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
+    'PTRATIO', 'B', 'LSTAT', 'convert'
+]
+
+UCI_TRAIN_DATA = None
+UCI_TEST_DATA = None
+
+FLUID_URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fluid/fit_a_line.fluid.tar'
+FLUID_MD5_MODEL = '6e6dd637ccd5993961f68bfbde46090b'
+
+
+def feature_range(maximums, minimums):
+    import matplotlib
+    matplotlib.use('Agg')
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots()
+    feature_num = len(maximums)
+    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
+    ax.set_title('feature scale')
+    plt.xticks(range(feature_num), feature_names)
+    plt.xlim([-1, feature_num])
+    fig.set_figheight(6)
+    fig.set_figwidth(10)
+    if not os.path.exists('./image'):
+        os.makedirs('./image')
+    fig.savefig('image/ranges.png', dpi=48)
+    plt.close(fig)
+
+
+def load_data(filename, feature_num=14, ratio=0.8):
+    global UCI_TRAIN_DATA, UCI_TEST_DATA
+    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
+        return
+
+    data = np.fromfile(filename, sep=' ')
+    data = data.reshape(data.shape[0] / feature_num, feature_num)
+    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
+        axis=0) / data.shape[0]
+    feature_range(maximums[:-1], minimums[:-1])
+    for i in xrange(feature_num - 1):
+        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
+    offset = int(data.shape[0] * ratio)
+    UCI_TRAIN_DATA = data[:offset]
+    UCI_TEST_DATA = data[offset:]
+
+
+def train():
+    """
+    UCI_HOUSING training set creator.
+
+    It returns a reader creator, each sample in the reader is features after
+    normalization and price number.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    global UCI_TRAIN_DATA
+    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
+
+    def reader():
+        for d in UCI_TRAIN_DATA:
+            yield d[:-1], d[-1:]
+
+    return reader
+
+
+def test():
+    """
+    UCI_HOUSING test set creator.
+
+    It returns a reader creator, each sample in the reader is features after
+    normalization and price number.
+
+    :return: Test reader creator
+    :rtype: callable
+    """
+    global UCI_TEST_DATA
+    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
+
+    def reader():
+        for d in UCI_TEST_DATA:
+            yield d[:-1], d[-1:]
+
+    return reader
+
+
+def fluid_model():
+    parameter_tar = paddle.dataset.common.download(
+        FLUID_URL_MODEL, 'uci_housing', FLUID_MD5_MODEL, 'fit_a_line.fluid.tar')
+
+    tar = tarfile.TarFile(parameter_tar, mode='r')
+    dirpath = tempfile.mkdtemp()
+    tar.extractall(path=dirpath)
+
+    return dirpath
+
+
+def predict_reader():
+    """
+    It returns just one tuple data to do inference.
+
+    :return: one tuple data
+    :rtype: tuple 
+    """
+    global UCI_TEST_DATA
+    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
+    return (UCI_TEST_DATA[0][:-1], )
+
+
+def fetch():
+    paddle.dataset.common.download(URL, 'uci_housing', MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.dataset.common.convert(path, train(), 1000, "uci_housing_train")
+    paddle.dataset.common.convert(path, test(), 1000, "uci_houseing_test")
diff --git a/python/paddle/dataset/voc2012.py b/python/paddle/dataset/voc2012.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c945574dbcc15f5cee370206ed7e70ba8ab5014
--- /dev/null
+++ b/python/paddle/dataset/voc2012.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image dataset for segmentation.
+The 2012 dataset contains images from 2008-2011 for which additional
+segmentations have been prepared. As in previous years the assignment
+to training/test sets has been maintained. The total number of images
+with segmentation has been increased from 7,062 to 9,993.
+"""
+
+import tarfile
+import io
+import numpy as np
+from paddle.dataset.common import download
+from paddle.dataset.image import *
+from PIL import Image
+
+__all__ = ['train', 'test', 'val']
+
+VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\
+VOCtrainval_11-May-2012.tar'
+
+VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
+SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
+DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
+LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
+
+CACHE_DIR = 'voc2012'
+
+
+def reader_creator(filename, sub_name):
+
+    tarobject = tarfile.open(filename)
+    name2mem = {}
+    for ele in tarobject.getmembers():
+        name2mem[ele.name] = ele
+
+    def reader():
+        set_file = SET_FILE.format(sub_name)
+        sets = tarobject.extractfile(name2mem[set_file])
+        for line in sets:
+            line = line.strip()
+            data_file = DATA_FILE.format(line)
+            label_file = LABEL_FILE.format(line)
+            data = tarobject.extractfile(name2mem[data_file]).read()
+            label = tarobject.extractfile(name2mem[label_file]).read()
+            data = Image.open(io.BytesIO(data))
+            label = Image.open(io.BytesIO(label))
+            data = np.array(data)
+            label = np.array(label)
+            yield data, label
+
+    return reader
+
+
+def train():
+    """
+    Create a train dataset reader containing 2913 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval')
+
+
+def test():
+    """
+    Create a test dataset reader containing 1464 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train')
+
+
+def val():
+    """
+    Create a val dataset reader containing 1449 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val')
diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0908c737874fa7335cca5b5f0cba83190c9f90f
--- /dev/null
+++ b/python/paddle/dataset/wmt14.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+WMT14 dataset.
+The original WMT14 dataset is too large and a small set of data for set is
+provided. This module will download dataset from
+http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
+parse training set and test set into paddle reader creators.
+
+"""
+import tarfile
+import gzip
+
+import paddle.dataset.common
+
+__all__ = [
+    'train',
+    'test',
+    'get_dict',
+    'convert',
+]
+
+URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
+                'cslm_joint_paper/data/dev+test.tgz')
+MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
+# this is a small set of data for test. The original data is too large and
+# will be add later.
+URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
+             'wmt_shrinked_data/wmt14.tgz')
+MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
+# BLEU of this trained model is 26.92
+URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
+MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
+
+START = "<s>"
+END = "<e>"
+UNK = "<unk>"
+UNK_IDX = 2
+
+
+def __read_to_dict(tar_file, dict_size):
+    def __to_dict(fd, size):
+        out_dict = dict()
+        for line_count, line in enumerate(fd):
+            if line_count < size:
+                out_dict[line.strip()] = line_count
+            else:
+                break
+        return out_dict
+
+    with tarfile.open(tar_file, mode='r') as f:
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("src.dict")
+        ]
+        assert len(names) == 1
+        src_dict = __to_dict(f.extractfile(names[0]), dict_size)
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("trg.dict")
+        ]
+        assert len(names) == 1
+        trg_dict = __to_dict(f.extractfile(names[0]), dict_size)
+        return src_dict, trg_dict
+
+
+def reader_creator(tar_file, file_name, dict_size):
+    def reader():
+        src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
+        with tarfile.open(tar_file, mode='r') as f:
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith(file_name)
+            ]
+            for name in names:
+                for line in f.extractfile(name):
+                    line_split = line.strip().split('\t')
+                    if len(line_split) != 2:
+                        continue
+                    src_seq = line_split[0]  # one source sequence
+                    src_words = src_seq.split()
+                    src_ids = [
+                        src_dict.get(w, UNK_IDX)
+                        for w in [START] + src_words + [END]
+                    ]
+
+                    trg_seq = line_split[1]  # one target sequence
+                    trg_words = trg_seq.split()
+                    trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
+
+                    # remove sequence whose length > 80 in training mode
+                    if len(src_ids) > 80 or len(trg_ids) > 80:
+                        continue
+                    trg_ids_next = trg_ids + [trg_dict[END]]
+                    trg_ids = [trg_dict[START]] + trg_ids
+
+                    yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def train(dict_size):
+    """
+    WMT14 training set creator.
+
+    It returns a reader creator, each sample in the reader is source language
+    word ID sequence, target language word ID sequence and next word ID
+    sequence.
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'train/train', dict_size)
+
+
+def test(dict_size):
+    """
+    WMT14 test set creator.
+
+    It returns a reader creator, each sample in the reader is source language
+    word ID sequence, target language word ID sequence and next word ID
+    sequence.
+
+    :return: Test reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'test/test', dict_size)
+
+
+def gen(dict_size):
+    return reader_creator(
+        paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'gen/gen', dict_size)
+
+
+def get_dict(dict_size, reverse=True):
+    # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
+    # else reverse = true, return dict = {'001':'a', '002':'b', ...}
+    tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
+    if reverse:
+        src_dict = {v: k for k, v in src_dict.items()}
+        trg_dict = {v: k for k, v in trg_dict.items()}
+    return src_dict, trg_dict
+
+
+def fetch():
+    paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    dict_size = 30000
+    paddle.dataset.common.convert(path, train(dict_size), 1000, "wmt14_train")
+    paddle.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py
new file mode 100644
index 0000000000000000000000000000000000000000..540d43b692e0f65460f558dd74a52715ff4db68d
--- /dev/null
+++ b/python/paddle/dataset/wmt16.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ACL2016 Multimodal Machine Translation. Please see this website for more
+details: http://www.statmt.org/wmt16/multimodal-task.html#task1
+
+If you use the dataset created for your task, please cite the following paper:
+Multi30K: Multilingual English-German Image Descriptions.
+
+@article{elliott-EtAl:2016:VL16,
+ author    = {{Elliott}, D. and {Frank}, S. and {Sima"an}, K. and {Specia}, L.},
+ title     = {Multi30K: Multilingual English-German Image Descriptions},
+ booktitle = {Proceedings of the 6th Workshop on Vision and Language},
+ year      = {2016},
+ pages     = {70--74},
+ year      = 2016
+}
+"""
+
+import os
+import tarfile
+import gzip
+from collections import defaultdict
+
+import paddle.dataset.common
+
+__all__ = [
+    "train",
+    "test",
+    "validation",
+    "convert",
+    "fetch",
+    "get_dict",
+]
+
+DATA_URL = ("http://cloud.dlnel.org/filepub/"
+            "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed")
+DATA_MD5 = "0c38be43600334966403524a40dcd81e"
+
+TOTAL_EN_WORDS = 11250
+TOTAL_DE_WORDS = 19220
+
+START_MARK = "<s>"
+END_MARK = "<e>"
+UNK_MARK = "<unk>"
+
+
+def __build_dict(tar_file, dict_size, save_path, lang):
+    word_dict = defaultdict(int)
+    with tarfile.open(tar_file, mode="r") as f:
+        for line in f.extractfile("wmt16/train"):
+            line_split = line.strip().split("\t")
+            if len(line_split) != 2: continue
+            sen = line_split[0] if lang == "en" else line_split[1]
+            for w in sen.split():
+                word_dict[w] += 1
+
+    with open(save_path, "w") as fout:
+        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
+        for idx, word in enumerate(
+                sorted(
+                    word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
+            if idx + 3 == dict_size: break
+            fout.write("%s\n" % (word[0]))
+
+
+def __load_dict(tar_file, dict_size, lang, reverse=False):
+    dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    if not os.path.exists(dict_path) or (
+            len(open(dict_path, "r").readlines()) != dict_size):
+        __build_dict(tar_file, dict_size, dict_path, lang)
+
+    word_dict = {}
+    with open(dict_path, "r") as fdict:
+        for idx, line in enumerate(fdict):
+            if reverse:
+                word_dict[idx] = line.strip()
+            else:
+                word_dict[line.strip()] = idx
+    return word_dict
+
+
+def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
+    src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
+                                        TOTAL_DE_WORDS))
+    trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
+                                        TOTAL_EN_WORDS))
+    return src_dict_size, trg_dict_size
+
+
+def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
+    def reader():
+        src_dict = __load_dict(tar_file, src_dict_size, src_lang)
+        trg_dict = __load_dict(tar_file, trg_dict_size,
+                               ("de" if src_lang == "en" else "en"))
+
+        # the indice for start mark, end mark, and unk are the same in source
+        # language and target language. Here uses the source language
+        # dictionary to determine their indices.
+        start_id = src_dict[START_MARK]
+        end_id = src_dict[END_MARK]
+        unk_id = src_dict[UNK_MARK]
+
+        src_col = 0 if src_lang == "en" else 1
+        trg_col = 1 - src_col
+
+        with tarfile.open(tar_file, mode="r") as f:
+            for line in f.extractfile(file_name):
+                line_split = line.strip().split("\t")
+                if len(line_split) != 2:
+                    continue
+                src_words = line_split[src_col].split()
+                src_ids = [start_id] + [
+                    src_dict.get(w, unk_id) for w in src_words
+                ] + [end_id]
+
+                trg_words = line_split[trg_col].split()
+                trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
+
+                trg_ids_next = trg_ids + [end_id]
+                trg_ids = [start_id] + trg_ids
+
+                yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def train(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 train set reader.
+
+    This function returns the reader for train data. Each sample the reader
+    returns is made up of three fields: the source language word index sequence,
+    target language word index sequence and next word index sequence.
+
+
+    NOTE:
+    The original like for training data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
+    using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for Germany.
+
+    Returns:
+        callable: The train reader.
+    """
+
+    if src_lang not in ["en", "de"]:
+        raise ValueError("An error language type.  Only support: "
+                         "en (for English); de(for Germany).")
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                "wmt16.tar.gz"),
+        file_name="wmt16/train",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def test(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 test set reader.
+
+    This function returns the reader for test data. Each sample the reader
+    returns is made up of three fields: the source language word index sequence,
+    target language word index sequence and next word index sequence.
+
+    NOTE:
+    The original like for test data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
+    using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for Germany.
+
+    Returns:
+        callable: The test reader.
+    """
+
+    if src_lang not in ["en", "de"]:
+        raise ValueError("An error language type. "
+                         "Only support: en (for English); de(for Germany).")
+
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                "wmt16.tar.gz"),
+        file_name="wmt16/test",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def validation(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 validation set reader.
+
+    This function returns the reader for validation data. Each sample the reader
+    returns is made up of three fields: the source language word index sequence,
+    target language word index sequence and next word index sequence.
+
+    NOTE:
+    The original like for validation data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
+    using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for Germany.
+
+    Returns:
+        callable: The validation reader.
+    """
+    if src_lang not in ["en", "de"]:
+        raise ValueError("An error language type. "
+                         "Only support: en (for English); de(for Germany).")
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                "wmt16.tar.gz"),
+        file_name="wmt16/val",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def get_dict(lang, dict_size, reverse=False):
+    """
+    return the word dictionary for the specified language.
+
+    Args:
+        lang(string): A string indicating which language is the source
+                      language. Available options are: "en" for English
+                      and "de" for Germany.
+        dict_size(int): Size of the specified language dictionary.
+        reverse(bool): If reverse is set to False, the returned python
+                       dictionary will use word as key and use index as value.
+                       If reverse is set to True, the returned python
+                       dictionary will use index as key and word as value.
+
+    Returns:
+        dict: The word dictionary for the specific language.
+    """
+
+    if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS)
+    else: dict_size = min(dict_size, TOTAL_DE_WORDS)
+
+    dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    assert os.path.exists(dict_path), "Word dictionary does not exist. "
+    "Please invoke paddle.dataset.wmt16.train/test/validation first "
+    "to build the dictionary."
+    tar_file = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16.tar.gz")
+    return __load_dict(tar_file, dict_size, lang, reverse)
+
+
+def fetch():
+    """download the entire dataset.
+    """
+    paddle.v4.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                      "wmt16.tar.gz")
+
+
+def convert(path, src_dict_size, trg_dict_size, src_lang):
+    """Converts dataset to recordio format.
+    """
+
+    paddle.dataset.common.convert(
+        path,
+        train(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_train")
+    paddle.dataset.common.convert(
+        path,
+        test(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_test")
+    paddle.dataset.common.convert(
+        path,
+        validation(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_validation")
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index dcde08632a6bb4c5936c32048c2cc1dca7608b06..3034c1a0875a71421bcba172c16ee32d809df152 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -20,6 +20,17 @@ from framework import *
 import executor
 from executor import *
 
+import trainer
+from trainer import Trainer
+from trainer import BeginEpochEvent
+from trainer import EndEpochEvent
+from trainer import BeginStepEvent
+from trainer import EndStepEvent
+from trainer import CheckpointConfig
+
+import inferencer
+from inferencer import Inferencer
+
 import io
 import evaluator
 import initializer
@@ -29,46 +40,51 @@ import optimizer
 import backward
 import regularizer
 import average
+import metrics
+import transpiler
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
-from core import LoDTensor, CPUPlace, CUDAPlace
-from distribute_transpiler import DistributeTranspiler
-from distribute_transpiler_simple import SimpleDistributeTranspiler
+from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
+from transpiler import DistributeTranspiler, InferenceTranspiler, \
+    memory_optimize, release_memory
 from concurrency import (Go, make_channel, channel_send, channel_recv,
-                         channel_close)
+                         channel_close, Select)
+from lod_tensor import create_lod_tensor, create_random_int_lodtensor
 import clip
-from memory_optimization_transpiler import memory_optimize, release_memory
 import profiler
 import unique_name
 import recordio_writer
+import parallel_executor
+from parallel_executor import *
 
 Tensor = LoDTensor
 
-__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [
-    'io',
-    'initializer',
-    'layers',
-    'nets',
-    'optimizer',
-    'learning_rate_decay',
-    'backward',
-    'regularizer',
-    'LoDTensor',
-    'CPUPlace',
-    'CUDAPlace',
-    'Tensor',
-    'ParamAttr',
-    'WeightNormParamAttr',
-    'DataFeeder',
-    'clip',
-    'SimpleDistributeTranspiler',
-    'DistributeTranspiler',
-    'memory_optimize',
-    'release_memory',
-    'profiler',
-    'unique_name',
-    'recordio_writer',
-]
+__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
+          trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
+          parallel_executor.__all__ + lod_tensor.__all__ + [
+              'io',
+              'initializer',
+              'layers',
+              'transpiler'
+              'nets',
+              'optimizer',
+              'learning_rate_decay',
+              'backward',
+              'regularizer',
+              'LoDTensor',
+              'CPUPlace',
+              'CUDAPlace',
+              'CUDAPinnedPlace',
+              'Tensor',
+              'ParamAttr',
+              'WeightNormParamAttr',
+              'DataFeeder',
+              'clip',
+              'profiler',
+              'unique_name',
+              'recordio_writer',
+              'Scope',
+          ]
 
 
 def __bootstrap__():
@@ -82,6 +98,8 @@ def __bootstrap__():
     import core
     import os
 
+    in_test = 'unittest' in sys.modules
+
     try:
         num_threads = int(os.getenv('OMP_NUM_THREADS', '1'))
     except ValueError:
@@ -99,15 +117,22 @@ def __bootstrap__():
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
 
     read_env_flags = [
-        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir'
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem'
     ]
     if core.is_compiled_with_cuda():
-        read_env_flags += ['fraction_of_gpu_memory_to_use']
+        read_env_flags += [
+            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic'
+        ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
-    core.init_devices()
+    # don't init_p2p when in unittest to save time.
+    core.init_devices(not in_test)
 
 
+# TODO(panyx0718): Avoid doing complex initialization logic in __init__.py.
+# Consider paddle.init(args) or paddle.main(args)
 layers.monkey_patch_variable()
 __bootstrap__()
diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py
index ded6eb085968343fcdc9f6e4b8353c08408df426..358e24df31bb517604481bb48b9180e579f8460d 100644
--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import numpy as np
+import warnings
 """
     Class of all kinds of Average.
 
@@ -22,6 +23,8 @@ import numpy as np
     wrappers of Python functions.
 """
 
+__all__ = ["WeightedAverage"]
+
 
 def _is_number_(var):
     return isinstance(var, int) or isinstance(var, float) or (isinstance(
@@ -33,7 +36,29 @@ def _is_number_or_matrix_(var):
 
 
 class WeightedAverage(object):
+    """
+    Calculate weighted average.
+
+    The average calculating is accomplished via Python totally. 
+    They do not change Paddle's Program, nor do anything to
+    modify NN model's configuration. They are completely 
+    wrappers of Python functions.
+
+    Examples:
+        .. code-block:: python
+            avg = fluid.average.WeightedAverage()
+            avg.add(value=2.0, weight=1)
+            avg.add(value=4.0, weight=2)
+            avg.eval()
+
+            # The result is 3.333333333.
+            # For (2.0 * 1 + 4.0 * 2) / (1 + 2) = 3.333333333
+    """
+
     def __init__(self):
+        warnings.warn(
+            "The %s is deprecated, please use fluid.metrics.Accuracy instead." %
+            (self.__class__.__name__), Warning)
         self.reset()
 
     def reset(self):
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index b6f20daee3a585777a23255355f0a0e31328d23f..4faa06303170488d0de2fda4c1461cfe2d623d35 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -51,6 +51,12 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
         op_desc.set_input(para, args)
     for para, args in outputs.iteritems():
         op_desc.set_output(para, args)
+
+    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
+
+    if op_role_attr_name not in attrs:
+        attrs[
+            op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
     for name, val in attrs.iteritems():
         if isinstance(val, framework.Block):
             op_desc.set_block_attr(name, val.desc)
@@ -126,9 +132,9 @@ def _addup_repetitive_outputs_(op_descs):
     for idx, op_desc in enumerate(op_descs):
         for var_name in op_desc.input_arg_names():
             if len(renamed_vars[var_name]) > 1:
-                pending_sum_ops.append(
-                    (_create_op_desc_("sum", {"X": renamed_vars[var_name]},
-                                      {"Out": [var_name]}, {}), idx))
+                pending_sum_ops.append((_create_op_desc_(
+                    "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
+                    {"use_mkldnn": False}), idx))
                 renamed_vars[var_name] = [var_name]
         for var_name in op_desc.output_arg_names():
             if var_name == core.empty_var_name(
@@ -155,8 +161,9 @@ def _addup_repetitive_outputs_(op_descs):
                 renamed_vars[var_name].append(new_name)
     for var_name, inputs in renamed_vars.iteritems():
         if len(inputs) > 1:
-            pending_sum_ops.append((_create_op_desc_(
-                "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
+            pending_sum_ops.append(
+                (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
+                                  {"use_mkldnn": False}), len(op_descs)))
     # sum_op descs are sorted according to their insert position
     for p in reversed(pending_sum_ops):
         op_descs.insert(p[1], p[0])
@@ -248,12 +255,15 @@ def _callback_lookup_(op):
                         if o_argu in self.param_grad_names:
                             allreduce_out_name = o_argu + "__nccl_all_reduce__"
                             op_desc = _create_op_desc_(
-                                "ncclAllReduce", {
+                                "ncclReduce",
+                                {
                                     "X": [o_argu],
                                     "Communicator":
                                     ['nccl_com__do_not_change_']
-                                }, {"Out": [allreduce_out_name]},
-                                {"reduction": "ncclSum"})
+                                },
+                                {"Out": [allreduce_out_name]},
+                                {"reduction": "ncclSum",
+                                 "root": 0}, )
                             block.desc.append_op().copy_from(op_desc)
 
                             op_desc = _create_op_desc_(
@@ -332,9 +342,12 @@ def _append_backward_ops_(block,
                                             no_grad_dict[block.idx])
 
     # append op_desc in grad_op_descs to target_block
+    op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
+    backward = core.op_proto_and_checker_maker.OpRole.Backward
     for op_desc in grad_op_descs:
         new_op_desc = target_block.desc.append_op()
         new_op_desc.copy_from(op_desc)
+        new_op_desc.set_attr(op_role_attr_name, backward)
         grad_to_var["__current_op_desc__"] = new_op_desc
         if callbacks is not None:
             assert (isinstance(callbacks, list))
@@ -422,20 +435,83 @@ def _get_stop_gradients_(program):
 def append_backward(loss, parameter_list=None, no_grad_set=None,
                     callbacks=None):
     """
-    Append backward part to main_program
+    Append backward part to main_program.
 
-    Args:
-        loss(Variable): The variable generated by cost function.
-        parameter_list(list[string]): Parameters that need to be updated by
-            optimizer. If None, it means all parameters need to be updated.
-        no_grad_set(set): Variables that have no gradients in Block 0.
-            All variables with `step_gradient=True` from all blocks will be
-            automatically added.
+    A complete neural network training is made up of forward and backward 
+    propagation. However, when we configure a network, we only need to 
+    specify its forwrd part. The backward part is generated automatically 
+    according to the forward part by this function.
 
-    Return:
-        (list[(Variable,Variable)]): list of (parameter, gradient) pair.
+    In most cases, users do not need to invoke this function manually. It 
+    will be automatically invoked by the optimizer's `minimize` function.
+
+    Args:
+        loss(Variable): The loss variable of the network.
+        parameter_list(list[string]|None): Names of parameters that need 
+                                           to be updated by optimizers. 
+                                           If it is None, all parameters 
+                                           will be updated.
+                                           Default: None
+        no_grad_set(set|None): Variables in the Block 0 whose gradients 
+                               should be ignored. All variables with 
+                               `step_gradient=True` from all blocks will 
+                               be automatically added into this set.
+                               Default: None
+        callbacks(list[callable object]|None): The callbacks are used for 
+                                               doing some custom jobs during 
+                                               backward part building. All 
+                                               callable objects in it will 
+                                               be invoked once each time a 
+                                               new gradient operator is added 
+                                               into the program. The callable 
+                                               object must has two input 
+                                               parameters: 'block' and 'context'. 
+                                               The 'block' is the block which 
+                                               the new gradient operator will 
+                                               be added to. The 'context' is a 
+                                               map, whose keys are gradient 
+                                               variable names and values are 
+                                               corresponding original variables.
+                                               In addition to this, the 'context' 
+                                               has another special key-value pair: 
+                                               the key is string '__current_op_desc__' 
+                                               and the value is the op_desc of the 
+                                               gradient operator who has just 
+                                               triggered the callable object. 
+
+    Returns:
+        list[(Variable,Variable)]: Pairs of parameter and its 
+        corresponding gradients. The key is the parameter and the 
+        value is gradient variable.
+
+    Raises:
+        AssertionError: If `loss` is not an instance of Variable.
+
+    Examples:
+        .. code-block:: python
+
+            # network configuration code
+            # ...
+            avg_loss = fluid.layers.mean(loss)
+            param_grad_list = fluid.backward.append_backward(loss=avg_loss)
     """
     assert isinstance(loss, framework.Variable)
+
+    if loss.op is None:
+        # the loss is from a cloned program. Find loss op manually.
+        for op in reversed(loss.block.ops):
+            assert isinstance(op, framework.Operator)
+            if len(op.output_arg_names) == 1 and op.output_arg_names[
+                    0] == loss.name:
+                loss.op = op
+                break
+        if loss.op is None:
+            raise ValueError("loss.op is None. Should not happend")
+
+    loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(),
+                     int(core.op_proto_and_checker_maker.OpRole.Forward) |
+                     int(core.op_proto_and_checker_maker.OpRole.Loss))
+
     if callbacks is not None:
         isinstance(callbacks, list)
 
@@ -453,12 +529,16 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
     current_block_idx = program.current_block_idx
     grad_to_var = dict()
 
-    op_desc = _create_op_desc_("fill_constant", {}, {
-        "Out": [_append_grad_suffix_(loss.name)]
-    }, {"shape": [1],
-        "value": 1.0,
-        "dtype": loss.dtype,
-        "force_cpu": False})
+    op_desc = _create_op_desc_(
+        "fill_constant", {}, {"Out": [_append_grad_suffix_(loss.name)]}, {
+            "shape": [1],
+            "value": 1.0,
+            "dtype": loss.dtype,
+            "force_cpu": False,
+            core.op_proto_and_checker_maker.kOpRoleAttrName():
+            int(core.op_proto_and_checker_maker.OpRole.Backward) |
+            int(core.op_proto_and_checker_maker.OpRole.Loss),
+        })
     root_block.desc.append_op().copy_from(op_desc)
 
     block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
@@ -477,6 +557,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
 
     program.current_block_idx = current_block_idx
     program.sync_with_cpp()
+    # FIXME(zcd): prevent loss.grad optimized by mem_opt.
+    loss.block.var(_append_grad_suffix_(loss.name)).persistable = True
 
     if parameter_list is not None:
         parameters = parameter_list
@@ -500,6 +582,24 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
             params_and_grads.append((param_var, grad_var))
         else:
             params_and_grads.append((param_var, None))
+
+    op_role_var_attr_name = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+    for p, g in params_and_grads:
+        if g is None:
+            continue
+        for op in reversed(program.global_block().ops):
+            assert isinstance(op, framework.Operator)
+            if g.name in op.output_arg_names:
+                g.op = op
+                break
+
+        if g.op is None:
+            raise ValueError("Unexpected branch")
+        attr_val = [p.name, g.name]
+        if g.op.has_attr(op_role_var_attr_name):
+            attr_val.extend(g.op.attr(op_role_var_attr_name))
+        g.op.set_attr(op_role_var_attr_name, attr_val)
+
     return params_and_grads
 
 
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index 12add9e686910c3936cf17fe87a5d0b78443b270..18e2f3045e272fb4712391f87bffd3f367c1c744 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -24,8 +24,6 @@ __all__ = [
     'GradientClipByValue',
     'GradientClipByNorm',
     'GradientClipByGlobalNorm',
-    'append_gradient_clip_ops',
-    'error_clip_callback',
 ]
 
 
@@ -38,6 +36,25 @@ class BaseErrorClipAttr(object):
 
 
 class ErrorClipByValue(BaseErrorClipAttr):
+    """
+    Clips tensor values to the range [min, max].
+
+    Given a tensor t, this operation clips its value to min and max inplace.
+
+    - Any values less than min are set to min.
+    - Any values greater than max are set to max.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. if not set by user, \
+        will be set to -max by framework.
+
+    Examples:
+        .. code-block:: python
+
+            var = fluid.framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+    """
+
     def __init__(self, max, min=None):
         max = float(max)
         if min is None:
@@ -99,6 +116,31 @@ class NullGradientClipAttr(BaseGradientClipAttr):
 
 
 class GradientClipByValue(BaseGradientClipAttr):
+    """
+    Clips gradient values to the range [min, max].
+
+    Given a tensor t, this operation clips its value to min and max inplace.
+
+    - Any values less than min are set to min.
+    - Any values greater than max are set to max.
+
+    Args:
+        max (float): The maximum value to clip by.
+        min (float, optional): The minimum value to clip by. if not set by user, \
+        will be set to -max by framework.
+
+    Examples:
+        .. code-block:: python
+
+            w_param_attrs = ParamAttr(name=None,
+              initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+              learning_rate=1.0,
+              regularizer=L1Decay(1.0),
+              trainable=True,
+              clip=GradientClipByValue(-1.0, 1.0))
+            y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+    """
+
     def __init__(self, max, min=None):
         max = float(max)
         if min is None:
@@ -120,6 +162,37 @@ class GradientClipByValue(BaseGradientClipAttr):
 
 
 class GradientClipByNorm(BaseGradientClipAttr):
+    """
+    Clips tensor values to a maximum L2-norm.
+
+    This operator limits the L2 norm of the input :math:`X` within :math:`max\_norm`.
+    If the L2 norm of :math:`X` is less than or equal to :math:`max\_norm`, :math:`Out`
+    will be the same as :math:`X`. If the L2 norm of :math:`X` is greater than
+    :math:`max\_norm`, :math:`X` will be linearly scaled to make the L2 norm of
+    :math:`Out` equal to :math:`max\_norm`, as shown in the following formula:
+
+    .. math::
+
+        Out = \\frac{max\_norm * X}{norm(X)},
+
+    where :math:`norm(X)` represents the L2 norm of :math:`X`.
+
+    Args:
+        clip_norm (float): The maximum norm value
+
+    Examples:
+        .. code-block:: python
+
+            w_param_attrs = ParamAttr(name=None,
+              initializer=UniformInitializer(low=-1.0, high=1.0, seed=0),
+              learning_rate=1.0,
+              regularizer=L1Decay(1.0),
+              trainable=True,
+              clip=GradientClipByNorm(clip_norm=2.0))
+            y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
+
+    """
+
     def __init__(self, clip_norm):
         self.clip_norm = clip_norm
 
@@ -135,6 +208,44 @@ class GradientClipByNorm(BaseGradientClipAttr):
 
 
 class GradientClipByGlobalNorm(BaseGradientClipAttr):
+    """
+    Clips values of multiple tensors by the ratio of the sum of their norms.
+
+    Given a list of tensors t_list, and a clipping ratio clip_norm, this
+    operation returns a list of clipped tensors list_clipped and the global
+    norm (global_norm) of all tensors in t_list.
+
+    To perform the clipping, the values :math:`t\_list[i]` are set to:
+
+    .. math::
+
+        t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)}
+
+    where:
+
+    .. math::
+
+        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}
+
+    If :math:`clip\_norm > global\_norm` then the entries in t_list remain as they are,
+    otherwise they're all shrunk by the global ratio.
+
+    Args:
+        clip_norm (float): The maximum norm value
+        group_name (str, optional): The group name for this clip.
+
+    Examples:
+        .. code-block:: python
+
+            p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
+
+            with fluid.program_guard(main_program=prog_clip):
+                fluid.clip.set_gradient_clip(
+                    fluid.clip.GradientClipByGlobalNorm(clip_norm=2.0))
+                p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
+
+    """
+
     def __init__(self, clip_norm, group_name="default_group"):
         if not isinstance(group_name, basestring):
             raise TypeError("'group_name' must be a basestring.")
@@ -183,15 +294,16 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
 
 def set_gradient_clip(clip, param_list=None, program=None):
     """
-        To specify parameters that require gradient clip.
-        Args:
-            clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr, 
-                    which describes the type and detailed attributes of required gradient clip.
-            param_list(list, None by default): Parameters that require gradient clip. 
-                    It can be a list of parameter or a list of parameter's name. 
-                    When it's None, all parameters in the program will be included. 
-            program(Program, None by default): The program where parameters are. 
-                    Will be the default main program when assigned with None.
+    To specify parameters that require gradient clip.
+
+    Args:
+        clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
+                which describes the type and detailed attributes of required gradient clip.
+        param_list(list(Variable)): Parameters that require gradient clip.
+                It can be a list of parameter or a list of parameter's name.
+                When it's None, all parameters in the program will be included.
+        program(Program): The program where parameters are.
+                Will be the default main program when assigned with None.
     """
     if not isinstance(clip, BaseGradientClipAttr):
         raise TypeError(
@@ -214,21 +326,24 @@ def set_gradient_clip(clip, param_list=None, program=None):
 
 def append_gradient_clip_ops(param_grad):
     context = dict()
-    create_op_callbacks = []
     for p, g in param_grad:
-        clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
-        if clip_attr is None:
-            clip_attr = NullGradientClipAttr()
-        if not isinstance(clip_attr, BaseGradientClipAttr):
-            raise TypeError(
-                "clip attribute should be an instance of BaseGradientClipAttr")
+        with p.block.program.optimized_guard(p):
+            clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
+            if clip_attr is None:
+                clip_attr = NullGradientClipAttr()
+            if not isinstance(clip_attr, BaseGradientClipAttr):
+                raise TypeError(
+                    "clip attribute should be an instance of BaseGradientClipAttr"
+                )
 
-        clip_attr.process_context(context=context, param=p, grad=g)
-        create_op_callbacks.append(
-            functools.partial(
-                clip_attr.create_operators, param=p, grad=g))
+            clip_attr.process_context(context=context, param=p, grad=g)
+
+    res = []
+    for p, g in param_grad:
+        with p.block.program.optimized_guard(p):
+            res.append(clip_attr.create_operators(param=p, grad=g))
 
-    return [each_callback() for each_callback in create_op_callbacks]
+    return res
 
 
 ClipByValue = GradientClipByValue
diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py
index dec224fc886cd0739add0ebb6488625ef5063b8d..470dd0df524936a773f6e740c8079f0efa8ef7b4 100644
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -12,17 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from layers.control_flow import BlockGuard
+from layers.control_flow import BlockGuard, equal
+from .framework import Operator
 from layer_helper import LayerHelper, unique_name
 from layers import fill_constant
 import core
 
 __all__ = [
-    'Go',
-    'make_channel',
-    'channel_send',
-    'channel_recv',
-    'channel_close',
+    'Go', 'make_channel', 'channel_send', 'channel_recv', 'channel_close',
+    'Select'
 ]
 
 
@@ -78,6 +76,206 @@ class Go(BlockGuard):
             attrs={'sub_block': go_block})
 
 
+class SelectCase(object):
+    DEFAULT = 0
+    SEND = 1
+    RECEIVE = 2
+
+    def __init__(self,
+                 select,
+                 case_idx,
+                 case_to_execute,
+                 channel_action_fn=None,
+                 channel=None,
+                 value=None,
+                 is_copy=False):
+        self.select = select
+        self.helper = LayerHelper('conditional_block')
+        self.main_program = self.helper.main_program
+        self.is_scalar_condition = True
+
+        self.case_to_execute = case_to_execute
+        self.idx = case_idx
+
+        # Since we aren't going to use the `channel_send` or `channel_recv`
+        # functions directly, we just need to capture the name.
+        self.action = (self.SEND
+                       if channel_action_fn.__name__ == ('channel_send') else
+                       self.RECEIVE) if channel_action_fn else self.DEFAULT
+
+        X = value
+        if self.action == self.SEND and is_copy:
+            # We create of copy of the data we want to send
+            copied_X = self.select.parent_block.create_var(
+                name=unique_name.generate(value.name + '_copy'),
+                type=value.type,
+                dtype=value.dtype,
+                shape=value.shape,
+                lod_level=value.lod_level,
+                capacity=value.capacity
+                if hasattr(value, 'capacity') else None, )
+
+            self.select.parent_block.append_op(
+                type="assign", inputs={"X": value}, outputs={"Out": copied_X})
+            X = copied_X
+
+        self.value = X
+        self.channel = channel
+
+    def __enter__(self):
+        self.block = self.main_program.create_block()
+
+    def construct_op(self):
+        main_program = self.helper.main_program
+        cases_block = main_program.current_block()
+
+        inner_outputs = set()
+        input_set = set()
+        params = set()
+
+        for op in self.block.ops:
+            # Iterate over all operators, get all the inputs
+            # and add as input to the SelectCase operator.
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in inner_outputs:
+                        input_set.add(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    inner_outputs.add(out_var_name)
+
+        param_list = [
+            cases_block.var(each_name) for each_name in params
+            if each_name not in input_set
+        ]
+
+        # Iterate over all operators, get all the outputs
+        # add to the output list of SelectCase operator only if
+        # they exist in the parent block.
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in cases_block.vars:
+                out_vars.append(cases_block.var(inner_out_name))
+
+        # First, create an op that will determine whether or not this is the
+        # conditional variable to execute.
+        should_execute_block = equal(
+            fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
+            self.case_to_execute)
+
+        step_scope = cases_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        cases_block.append_op(
+            type='conditional_block',
+            inputs={'X': [should_execute_block],
+                    'Params': param_list},
+            outputs={'Out': out_vars,
+                     'Scope': [step_scope]},
+            attrs={
+                'sub_block': self.block,
+                'is_scalar_condition': self.is_scalar_condition
+            })
+
+        return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
+                                if self.channel else '', self.value.name
+                                if self.value else '')
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.main_program.rollback()
+        if exc_type is not None:
+            return False  # re-raise exception
+        return True
+
+
+class Select(BlockGuard):
+    def __init__(self, name=None):
+        self.helper = LayerHelper('select', name=name)
+        self.parent_block = self.helper.main_program.current_block()
+        self.cases = []
+
+        super(Select, self).__init__(self.helper.main_program)
+        self.case_to_execute = fill_constant(
+            shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
+
+    def __enter__(self):
+        super(Select, self).__enter__()
+        return self
+
+    def case(self, channel_action_fn, channel, value, is_copy=False):
+        """Create a new block for this condition.
+        """
+        select_case = SelectCase(self,
+                                 len(self.cases), self.case_to_execute,
+                                 channel_action_fn, channel, value, is_copy)
+
+        self.cases.append(select_case)
+
+        return select_case
+
+    def default(self):
+        """Create a default case block for this condition.
+        """
+        default_case = SelectCase(self, len(self.cases), self.case_to_execute)
+
+        self.cases.append(default_case)
+
+        return default_case
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+
+        # Create a select op and another block to wrap its
+        # case blocks.
+        select_block = self.helper.main_program.current_block()
+        parent_block = self.helper.main_program.block(select_block.parent_idx)
+
+        # Construct each case op, inside the newly created select block.
+        serialized_cases = []
+        for case in self.cases:
+            serialized_cases.append(case.construct_op())
+
+        intermediate = set()
+        params = set()
+
+        for case_block in select_block.ops:
+            if case_block.attrs and 'sub_block' in case_block.attrs:
+                for each_op in case_block.attrs['sub_block'].ops:
+                    assert isinstance(each_op, Operator)
+                    for iname in each_op.input_names:
+                        for in_var_name in each_op.input(iname):
+                            if in_var_name not in intermediate:
+                                params.add(in_var_name)
+
+                    for oname in each_op.output_names:
+                        for out_var_name in each_op.output(oname):
+                            intermediate.add(out_var_name)
+
+        out_list = [
+            parent_block.var(var_name) for var_name in parent_block.vars
+            if var_name in intermediate
+        ]
+
+        X = [select_block.var_recursive(x_name) for x_name in params]
+
+        # Needs to be used by `equal` inside the cases block.
+        X.append(self.case_to_execute)
+
+        # Construct the select op.
+        parent_block.append_op(
+            type='select',
+            inputs={'X': X,
+                    'case_to_execute': self.case_to_execute},
+            attrs={'sub_block': select_block,
+                   'cases': serialized_cases},
+            outputs={'Out': out_list})
+
+        return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
+
+
 def make_channel(dtype, capacity=0):
     """
     Helps implementation of a concurrent program by creating a "channel" of
@@ -134,7 +332,7 @@ def make_channel(dtype, capacity=0):
     return channel
 
 
-def channel_send(channel, value):
+def channel_send(channel, value, is_copy=False):
     """
     Sends a value through a channel variable. Used by an unbuffered or buffered
     channel to pass data from within or to a concurrent Go block, where
@@ -144,6 +342,8 @@ def channel_send(channel, value):
         channel (Variable|Channel): Channel variable created using
         `make_channel`.
         value (Variable): Value to send to channel
+        is_copy (bool): Copy data while channel send. If False, then data
+        is moved. The input cannot be used after move. (default False)
     Returns:
         Variable: The boolean status on whether or not the channel
                   successfully sent the passed value.
@@ -160,20 +360,26 @@ def channel_send(channel, value):
     main_program = helper.main_program
     channel_send_block = main_program.current_block()
 
-    status = helper.create_variable(
-        name=unique_name.generate('status'),
-        type=core.VarDesc.VarType.LOD_TENSOR,
-        dtype=core.VarDesc.VarType.BOOL)
+    X = value
 
-    channel_send_op = channel_send_block.append_op(
-        type="channel_send",
-        inputs={
-            "Channel": channel,
-            "X": value,
-        },
-        outputs={"Status": status})
+    if is_copy:
+        copied_X = helper.create_variable(
+            name=unique_name.generate(value.name + '_copy'),
+            type=value.type,
+            dtype=value.dtype,
+            shape=value.shape,
+            lod_level=value.lod_level,
+            capacity=value.capacity if hasattr(value, 'capacity') else None)
+
+        assign_op = channel_send_block.append_op(
+            type="assign", inputs={"X": value}, outputs={"Out": copied_X})
+        X = copied_X
 
-    return status
+    channel_send_block.append_op(
+        type="channel_send", inputs={
+            "Channel": channel,
+            "X": X,
+        })
 
 
 def channel_recv(channel, return_value):
@@ -198,7 +404,7 @@ def channel_recv(channel, return_value):
 
           ch = fluid.make_channel(dtype='int32', capacity=10)
           with fluid.Go():
-            returned_value = fluid.channel_recv(ch, 'int32')
+            returned_value, return_status = fluid.channel_recv(ch, 'int32')
 
           # Code to send data through the channel.
     """
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index ac02401c79b787716b2e5f43e0d1c5686cf2bd13..c859778b3757f638ac531620f241e684522add57 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -15,7 +15,9 @@
 from __future__ import print_function
 import core
 import numpy
+import os
 import six.moves as six
+import multiprocessing
 
 from framework import Variable, default_main_program
 
@@ -27,6 +29,13 @@ class DataToLoDTensorConverter(object):
         self.place = place
         self.lod_level = lod_level
         self.shape = shape
+        negtive_count = 0
+        for s in self.shape:
+            if s < 0:
+                negtive_count += 1
+            if negtive_count > 1:
+                self.shape = None
+                break
         if dtype == core.VarDesc.VarType.FP32:
             self.dtype = 'float32'
         elif dtype == core.VarDesc.VarType.INT64:
@@ -35,15 +44,17 @@ class DataToLoDTensorConverter(object):
             self.dtype = 'float64'
         elif dtype == core.VarDesc.VarType.INT32:
             self.dtype = 'int32'
+        elif dtype == core.VarDesc.VarType.UINT8:
+            self.dtype = 'uint8'
         else:
             raise ValueError("dtype must be any of [int32, float32, int64, "
-                             "float64]")
+                             "float64, uint8]")
 
         self.data = []
         self.lod = []
 
         for i in six.range(lod_level):
-            self.lod.append([0])
+            self.lod.append([])
 
     def feed(self, data):
         self._feed_impl_(data, self.lod, self.lod_level)
@@ -52,21 +63,77 @@ class DataToLoDTensorConverter(object):
         if lod_level == 0:
             self.data.append(data)
         else:
-            cur_lod_len = len(data)
-            lod[-1].append(lod[-1][-1] + cur_lod_len)
+            lod[0].append(len(data))
             for each_data in data:
-                self._feed_impl_(each_data, lod[:-1], lod_level - 1)
+                self._feed_impl_(each_data, lod[1:], lod_level - 1)
 
     def done(self):
-        arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
+        arr = numpy.array(self.data, dtype=self.dtype)
+        if self.shape:
+            arr = arr.reshape(self.shape)
         t = core.LoDTensor()
         t.set(arr, self.place)
         if self.lod_level > 0:
-            t.set_lod(self.lod)
+            t.set_recursive_sequence_lengths(self.lod)
         return t
 
 
 class DataFeeder(object):
+    """
+    DataFeeder converts the data that returned by a reader into a data
+    structure that can feed into Executor and ParallelExecutor. The reader
+    usually returns a list of mini-batch data entries. Each data entry in
+    the list is one sample. Each sample is a list or a tuple with one
+    feature or multiple features.
+
+    The simple usage shows below:
+
+    ..  code-block:: python
+
+        place = fluid.CPUPlace()
+        img = fluid.layers.data(name='image', shape=[1, 28, 28])
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
+        result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
+
+
+    If you want to feed data into GPU side separately in advance when you
+    use multi-GPU to train a model, you can use `decorate_reader` function.
+
+    ..  code-block:: python
+
+        place=fluid.CUDAPlace(0)
+        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
+        reader = feeder.decorate_reader(
+            paddle.batch(flowers.train(), batch_size=16))
+
+    Args:
+        feed_list(list): The Variables or Variables'name that will
+            feed into model.
+        place(Place): place indicates feed data into CPU or GPU, if you want to
+            feed data into GPU, please using `fluid.CUDAPlace(i)` (`i` represents
+            the GPU id), or if you want to feed data into CPU, please using
+            `fluid.CPUPlace()`.
+        program(Program): The Program that will feed data into, if program
+            is None, it will use default_main_program(). Default None.
+
+    Raises:
+        ValueError: If some Variable is not in this Program.
+
+    Examples:
+        .. code-block:: python
+
+            # ...
+            place = fluid.CPUPlace()
+            feed_list = [
+                main_program.global_block().var(var_name) for var_name in feed_vars_name
+            ] # feed_vars_name is a list of variables' name.
+            feeder = fluid.DataFeeder(feed_list, place)
+            for data in reader():
+                outs = exe.run(program=main_program,
+                               feed=feeder.feed(data))
+    """
+
     def __init__(self, feed_list, place, program=None):
         self.feed_dtypes = []
         self.feed_names = []
@@ -96,6 +163,16 @@ class DataFeeder(object):
         self.place = place
 
     def feed(self, iterable):
+        """
+        According to feed_list and iterable, converters the input into
+        a data structure that can feed into Executor and ParallelExecutor.
+
+        Args:
+            iterable(list|tuple): the input data.
+
+        Returns:
+            dict: the result of conversion.
+        """
         converter = []
         for lod_level, shape, dtype in six.zip(
                 self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
@@ -116,3 +193,94 @@ class DataFeeder(object):
         for each_name, each_converter in six.zip(self.feed_names, converter):
             ret_dict[each_name] = each_converter.done()
         return ret_dict
+
+    def feed_parallel(self, iterable, num_places=None):
+        """
+        Takes multiple mini-batches. Each mini-batch will be feed on each
+        device in advance.
+
+        Args:
+            iterable(list|tuple): the input data.
+            num_places(int): the number of devices. Default None.
+
+        Returns:
+            dict: the result of conversion.
+
+        Notes:
+            The number of devices and number of mini-batches must be same.
+        """
+        if isinstance(self.place, core.CUDAPlace):
+            places = [
+                core.CUDAPlace(i)
+                for i in six.xrange(self._get_number_of_places_(num_places))
+            ]
+        else:
+            places = [
+                core.CPUPlace()
+                for _ in six.xrange(self._get_number_of_places_(num_places))
+            ]
+
+        if len(iterable) != len(places):
+            raise ValueError("feed_parallel takes multiple mini-batches. Each "
+                             "mini-batch will be feed on each device. The "
+                             "number of devices and number of mini-batches "
+                             "must be same.")
+
+        place = self.place
+        for p, batch in six.zip(places, iterable):
+            self.place = p
+            yield self.feed(batch)
+        self.place = place
+
+    def _get_number_of_places_(self, num_places):
+        if num_places is not None:
+            return int(num_places)
+        elif isinstance(self.place, core.CUDAPlace):
+            return core.get_cuda_device_count()
+        else:
+            cpu_num = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            return cpu_num
+
+    def decorate_reader(self,
+                        reader,
+                        multi_devices,
+                        num_places=None,
+                        drop_last=True):
+        """
+        Converter the input data into a data that returned by reader into
+        multiple mini-batches. Each mini-batch will be feed on each device.
+
+        Args:
+            reader(fun): the input data.
+            multi_devices(bool): the number of places. Default None.
+            num_places(int): the number of places. Default None.
+            drop_last(bool): the number of places. Default None.
+
+        Returns:
+            dict: the result of conversion.
+
+        Raises:
+            ValueError: If drop_last is False and the data batch which cannot
+            fit for devices.
+        """
+
+        def __reader_creator__():
+            if not multi_devices:
+                for item in reader():
+                    yield self.feed(item)
+            else:
+                num = self._get_number_of_places_(num_places)
+                item = []
+                for batch in reader():
+                    item.append(batch)
+                    if len(item) == num:
+                        yield list(self.feed_parallel(item, num))
+                        item = []
+                if not drop_last and len(item) != 0:
+                    raise ValueError(
+                        "The data batch which cannot fit for devices will be "
+                        "dropped is not implementation. Other strategies are "
+                        "not implemented")
+
+        return __reader_creator__
diff --git a/python/paddle/fluid/debuger.py b/python/paddle/fluid/debuger.py
deleted file mode 100644
index 97fa182c4007cc730c06e9f95259a2509e01ecdf..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/debuger.py
+++ /dev/null
@@ -1,273 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import re
-from graphviz import GraphPreviewGenerator
-import proto.framework_pb2 as framework_pb2
-import paddle.fluid.core as core
-
-_vartype2str_ = [
-    "UNK",
-    "LoDTensor",
-    "SelectedRows",
-    "FeedMinibatch",
-    "FetchList",
-    "StepScopes",
-    "LodRankTable",
-    "LoDTensorArray",
-    "PlaceList",
-]
-_dtype2str_ = [
-    "bool",
-    "int16",
-    "int32",
-    "int64",
-    "float16",
-    "float32",
-    "float64",
-]
-
-
-def repr_data_type(type):
-    return _dtype2str_[type]
-
-
-def repr_tensor(proto):
-    return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)],
-                                              str(proto.dims))
-
-
-reprtpl = "{ttype} {name} ({reprs})"
-
-
-def repr_lodtensor(proto):
-    if proto.type.type != framework_pb2.VarType.LOD_TENSOR:
-        return
-
-    level = proto.type.lod_tensor.lod_level
-    reprs = repr_tensor(proto.type.lod_tensor.tensor)
-    return reprtpl.format(
-        ttype="LoDTensor" if level > 0 else "Tensor",
-        name=proto.name,
-        reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)
-
-
-def repr_selected_rows(proto):
-    if proto.type.type != framework_pb2.VarType.SELECTED_ROWS:
-        return
-
-    return reprtpl.format(
-        ttype="SelectedRows",
-        name=proto.name,
-        reprs=repr_tensor(proto.type.selected_rows))
-
-
-def repr_tensor_array(proto):
-    if proto.type.type != framework_pb2.VarType.LOD_TENSOR_ARRAY:
-        return
-
-    return reprtpl.format(
-        ttype="TensorArray",
-        name=proto.name,
-        reprs="level=%d, %s" % (proto.type.tensor_array.lod_level,
-                                repr_tensor(proto.type.lod_tensor.tensor)))
-
-
-type_handlers = [
-    repr_lodtensor,
-    repr_selected_rows,
-    repr_tensor_array,
-]
-
-
-def repr_var(vardesc):
-    for handler in type_handlers:
-        res = handler(vardesc)
-        if res:
-            return res
-
-
-def pprint_program_codes(program_desc):
-    reprs = []
-    for block_idx in range(program_desc.num_blocks()):
-        block_desc = program_desc.block(block_idx)
-        block_repr = pprint_block_codes(block_desc)
-        reprs.append(block_repr)
-    return '\n'.join(reprs)
-
-
-def pprint_block_codes(block_desc, show_backward=False):
-    def is_op_backward(op_desc):
-        if op_desc.type.endswith('_grad'): return True
-
-        def is_var_backward(var):
-            if "@GRAD" in var.parameter: return True
-            for arg in var.arguments:
-                if "@GRAD" in arg: return True
-
-        for var in op_desc.inputs:
-            if is_var_backward(var): return True
-        for var in op_desc.outputs:
-            if is_var_backward(var): return True
-        return False
-
-    def is_var_backward(var_desc):
-        return "@GRAD" in var_desc.name
-
-    #print(type(block_desc))
-    if type(block_desc) is not framework_pb2.BlockDesc:
-        block_desc = framework_pb2.BlockDesc.FromString(
-            block_desc.serialize_to_string())
-    var_reprs = []
-    op_reprs = []
-    for var in block_desc.vars:
-        if not show_backward and is_var_backward(var):
-            continue
-        var_reprs.append(repr_var(var))
-
-    for op in block_desc.ops:
-        if not show_backward and is_op_backward(op): continue
-        op_reprs.append(repr_op(op))
-
-    tpl = "// block-{idx}  parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
-    return tpl.format(
-        idx=block_desc.idx,
-        pidx=block_desc.parent_idx,
-        vars='\n'.join(var_reprs),
-        ops='\n'.join(op_reprs), )
-
-
-def repr_attr(desc):
-    tpl = "{key}={value}"
-    valgetter = [
-        lambda attr: attr.i,
-        lambda attr: attr.f,
-        lambda attr: attr.s,
-        lambda attr: attr.ints,
-        lambda attr: attr.floats,
-        lambda attr: attr.strings,
-        lambda attr: attr.b,
-        lambda attr: attr.bools,
-        lambda attr: attr.block_idx,
-        lambda attr: attr.l,
-    ]
-    key = desc.name
-    value = valgetter[desc.type](desc)
-    if key == "dtype":
-        value = repr_data_type(value)
-    return tpl.format(key=key, value=str(value)), (key, value)
-
-
-def _repr_op_fill_constant(optype, inputs, outputs, attrs):
-    if optype == "fill_constant":
-        return "{output} = {data} [shape={shape}]".format(
-            output=','.join(outputs),
-            data=attrs['value'],
-            shape=str(attrs['shape']))
-
-
-op_repr_handlers = [_repr_op_fill_constant, ]
-
-
-def repr_op(opdesc):
-    optype = None
-    attrs = []
-    attr_dict = {}
-    is_target = None
-    inputs = []
-    outputs = []
-
-    tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]"
-    args2value = lambda args: args[0] if len(args) == 1 else str(list(args))
-    for var in opdesc.inputs:
-        key = var.parameter
-        value = args2value(var.arguments)
-        inputs.append("%s=%s" % (key, value))
-    for var in opdesc.outputs:
-        value = args2value(var.arguments)
-        outputs.append(value)
-    for attr in opdesc.attrs:
-        attr_repr, attr_pair = repr_attr(attr)
-        attrs.append(attr_repr)
-        attr_dict[attr_pair[0]] = attr_pair[1]
-
-    is_target = opdesc.is_target
-
-    for handler in op_repr_handlers:
-        res = handler(opdesc.type, inputs, outputs, attr_dict)
-        if res: return res
-
-    return tpl.format(
-        outputs=', '.join(outputs),
-        optype=opdesc.type,
-        inputs=', '.join(inputs),
-        attrs="{%s}" % ','.join(attrs),
-        is_target=", is_target" if is_target else "")
-
-
-def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
-    '''
-    Generate a debug graph for block.
-    Args:
-        block(Block): a block.
-    '''
-    graph = GraphPreviewGenerator("some graph")
-    # collect parameters and args
-    protostr = block.desc.serialize_to_string()
-    desc = framework_pb2.BlockDesc.FromString(str(protostr))
-
-    def need_highlight(name):
-        if highlights is None: return False
-        for pattern in highlights:
-            assert type(pattern) is str
-            if re.match(pattern, name):
-                return True
-        return False
-
-    # draw parameters and args
-    vars = {}
-    for var in desc.vars:
-        shape = [str(i) for i in var.lod_tensor.tensor.dims]
-        if not shape:
-            shape = ['null']
-        # create var
-        if var.persistable:
-            varn = graph.add_param(
-                var.name, var.type, shape, highlight=need_highlight(var.name))
-        else:
-            varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
-        vars[var.name] = varn
-
-    def add_op_link_var(op, var, op2var=False):
-        for arg in var.arguments:
-            if arg not in vars:
-                # add missing variables as argument
-                vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg))
-            varn = vars[arg]
-            highlight = need_highlight(op.description) or need_highlight(
-                varn.description)
-            if op2var:
-                graph.add_edge(op, varn, highlight=highlight)
-            else:
-                graph.add_edge(varn, op, highlight=highlight)
-
-    for op in desc.ops:
-        opn = graph.add_op(op.type, highlight=need_highlight(op.type))
-        for var in op.inputs:
-            add_op_link_var(opn, var, False)
-        for var in op.outputs:
-            add_op_link_var(opn, var, True)
-
-    graph(path, show=True)
diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c56064a1e8bdc5d975837cb5a75a40d557765ad
--- /dev/null
+++ b/python/paddle/fluid/debugger.py
@@ -0,0 +1,272 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import re
+from graphviz import GraphPreviewGenerator
+import proto.framework_pb2 as framework_pb2
+from google.protobuf import text_format
+
+_vartype2str_ = [
+    "UNK",
+    "LoDTensor",
+    "SelectedRows",
+    "FeedMinibatch",
+    "FetchList",
+    "StepScopes",
+    "LodRankTable",
+    "LoDTensorArray",
+    "PlaceList",
+]
+_dtype2str_ = [
+    "bool",
+    "int16",
+    "int32",
+    "int64",
+    "float16",
+    "float32",
+    "float64",
+]
+
+
+def repr_data_type(type):
+    return _dtype2str_[type]
+
+
+def repr_tensor(proto):
+    return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)],
+                                              str(proto.dims))
+
+
+reprtpl = "{ttype} {name} ({reprs})"
+
+
+def repr_lodtensor(proto):
+    if proto.type.type != framework_pb2.VarType.LOD_TENSOR:
+        return
+
+    level = proto.type.lod_tensor.lod_level
+    reprs = repr_tensor(proto.type.lod_tensor.tensor)
+    return reprtpl.format(
+        ttype="LoDTensor" if level > 0 else "Tensor",
+        name=proto.name,
+        reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)
+
+
+def repr_selected_rows(proto):
+    if proto.type.type != framework_pb2.VarType.SELECTED_ROWS:
+        return
+
+    return reprtpl.format(
+        ttype="SelectedRows",
+        name=proto.name,
+        reprs=repr_tensor(proto.type.selected_rows))
+
+
+def repr_tensor_array(proto):
+    if proto.type.type != framework_pb2.VarType.LOD_TENSOR_ARRAY:
+        return
+
+    return reprtpl.format(
+        ttype="TensorArray",
+        name=proto.name,
+        reprs="level=%d, %s" % (proto.type.tensor_array.lod_level,
+                                repr_tensor(proto.type.lod_tensor.tensor)))
+
+
+type_handlers = [
+    repr_lodtensor,
+    repr_selected_rows,
+    repr_tensor_array,
+]
+
+
+def repr_var(vardesc):
+    for handler in type_handlers:
+        res = handler(vardesc)
+        if res:
+            return res
+
+
+def pprint_program_codes(program_desc):
+    reprs = []
+    for block_idx in range(program_desc.desc.num_blocks()):
+        block_desc = program_desc.block(block_idx)
+        block_repr = pprint_block_codes(block_desc)
+        reprs.append(block_repr)
+    return '\n'.join(reprs)
+
+
+def pprint_block_codes(block_desc, show_backward=False):
+    def is_op_backward(op_desc):
+        if op_desc.type.endswith('_grad'): return True
+
+        def is_var_backward(var):
+            if "@GRAD" in var.parameter: return True
+            for arg in var.arguments:
+                if "@GRAD" in arg: return True
+
+        for var in op_desc.inputs:
+            if is_var_backward(var): return True
+        for var in op_desc.outputs:
+            if is_var_backward(var): return True
+        return False
+
+    def is_var_backward(var_desc):
+        return "@GRAD" in var_desc.name
+
+    if type(block_desc) is not framework_pb2.BlockDesc:
+        block_desc = framework_pb2.BlockDesc.FromString(
+            block_desc.desc.serialize_to_string())
+    var_reprs = []
+    op_reprs = []
+    for var in block_desc.vars:
+        if not show_backward and is_var_backward(var):
+            continue
+        var_reprs.append(repr_var(var))
+
+    for op in block_desc.ops:
+        if not show_backward and is_op_backward(op): continue
+        op_reprs.append(repr_op(op))
+
+    tpl = "// block-{idx}  parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
+    return tpl.format(
+        idx=block_desc.idx,
+        pidx=block_desc.parent_idx,
+        vars='\n'.join(var_reprs),
+        ops='\n'.join(op_reprs), )
+
+
+def repr_attr(desc):
+    tpl = "{key}={value}"
+    valgetter = [
+        lambda attr: attr.i,
+        lambda attr: attr.f,
+        lambda attr: attr.s,
+        lambda attr: attr.ints,
+        lambda attr: attr.floats,
+        lambda attr: attr.strings,
+        lambda attr: attr.b,
+        lambda attr: attr.bools,
+        lambda attr: attr.block_idx,
+        lambda attr: attr.l,
+    ]
+    key = desc.name
+    value = valgetter[desc.type](desc)
+    if key == "dtype":
+        value = repr_data_type(value)
+    return tpl.format(key=key, value=str(value)), (key, value)
+
+
+def _repr_op_fill_constant(optype, inputs, outputs, attrs):
+    if optype == "fill_constant":
+        return "{output} = {data} [shape={shape}]".format(
+            output=','.join(outputs),
+            data=attrs['value'],
+            shape=str(attrs['shape']))
+
+
+op_repr_handlers = [_repr_op_fill_constant, ]
+
+
+def repr_op(opdesc):
+    optype = None
+    attrs = []
+    attr_dict = {}
+    is_target = None
+    inputs = []
+    outputs = []
+
+    tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]"
+    args2value = lambda args: args[0] if len(args) == 1 else str(list(args))
+    for var in opdesc.inputs:
+        key = var.parameter
+        value = args2value(var.arguments)
+        inputs.append("%s=%s" % (key, value))
+    for var in opdesc.outputs:
+        value = args2value(var.arguments)
+        outputs.append(value)
+    for attr in opdesc.attrs:
+        attr_repr, attr_pair = repr_attr(attr)
+        attrs.append(attr_repr)
+        attr_dict[attr_pair[0]] = attr_pair[1]
+
+    is_target = opdesc.is_target
+
+    for handler in op_repr_handlers:
+        res = handler(opdesc.type, inputs, outputs, attr_dict)
+        if res: return res
+
+    return tpl.format(
+        outputs=', '.join(outputs),
+        optype=opdesc.type,
+        inputs=', '.join(inputs),
+        attrs="{%s}" % ','.join(attrs),
+        is_target=", is_target" if is_target else "")
+
+
+def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
+    '''
+    Generate a debug graph for block.
+    Args:
+        block(Block): a block.
+    '''
+    graph = GraphPreviewGenerator("some graph")
+    # collect parameters and args
+    protostr = block.desc.serialize_to_string()
+    desc = framework_pb2.BlockDesc.FromString(str(protostr))
+
+    def need_highlight(name):
+        if highlights is None: return False
+        for pattern in highlights:
+            assert type(pattern) is str
+            if re.match(pattern, name):
+                return True
+        return False
+
+    # draw parameters and args
+    vars = {}
+    for var in desc.vars:
+        # TODO(gongwb): format the var.type
+        # create var
+        if var.persistable:
+            varn = graph.add_param(
+                var.name,
+                str(var.type).replace("\n", "<br />", 1),
+                highlight=need_highlight(var.name))
+        else:
+            varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
+        vars[var.name] = varn
+
+    def add_op_link_var(op, var, op2var=False):
+        for arg in var.arguments:
+            if arg not in vars:
+                # add missing variables as argument
+                vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg))
+            varn = vars[arg]
+            highlight = need_highlight(op.description) or need_highlight(
+                varn.description)
+            if op2var:
+                graph.add_edge(op, varn, highlight=highlight)
+            else:
+                graph.add_edge(varn, op, highlight=highlight)
+
+    for op in desc.ops:
+        opn = graph.add_op(op.type, highlight=need_highlight(op.type))
+        for var in op.inputs:
+            add_op_link_var(opn, var, False)
+        for var in op.outputs:
+            add_op_link_var(opn, var, True)
+
+    graph(path, show=False)
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
deleted file mode 100644
index 3d3a6c116eeb39fb7236d0e9707415cdd6b828bd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distribute_transpiler.py
+++ /dev/null
@@ -1,729 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-import framework
-from framework import Program, default_main_program, default_startup_program, Parameter, Variable
-import optimizer
-from layer_helper import LayerHelper
-from distributed_spliter import *
-import math
-from . import core
-
-
-class VarBlock:
-    def __init__(self, varname, offset, size):
-        self.varname = varname
-        # NOTE: real offset is offset * size
-        self.offset = offset
-        self.size = size
-
-    def __str__(self):
-        return "%s:%d:%d" % (self.varname, self.offset, self.size)
-
-
-class UnionFind(object):
-    """ Union-find data struct.
-    
-    Union-find is a data struct that keeps track of a set of elements partitioned
-    into a number of disjoint (non-overlapping) subsets.
-
-    Reference:
-    https://en.wikipedia.org/wiki/Disjoint-set_data_structure
-
-    Args:
-      elements(list): The initialize element list.
-    """
-
-    def __init__(self, elementes=None):
-        self._parents = []  # index -> parent index
-        self._index = {}  # element -> index
-        self._curr_idx = 0
-        if not elementes:
-            elementes = []
-        for ele in elementes:
-            self._parents.append(self._curr_idx)
-            self._index.update({ele: self._curr_idx})
-            self._curr_idx += 1
-
-    def find(self, x):
-        # Find the root index of given element x,
-        # execute the path compress while findind the root index
-        if not x in self._index:
-            return -1
-        idx = self._index[x]
-        while idx != self._parents[idx]:
-            t = self._parents[idx]
-            self._parents[idx] = self._parents[t]
-            idx = t
-        return idx
-
-    def union(self, x, y):
-        # Union two given element
-        x_root = self.find(x)
-        y_root = self.find(y)
-
-        if x_root == y_root:
-            return
-        self._parents[x_root] = y_root
-
-    def is_connected(self, x, y):
-        # If two given elements have the same root index,
-        # then they are connected.
-        return self.find(x) == self.find(y)
-
-
-def same_or_split_var(p_name, var_name):
-    return p_name == var_name or p_name.startswith(var_name + ".block")
-
-
-def split_dense_variable(var_list,
-                         pserver_count,
-                         min_block_size=1024,
-                         max_block_size=1048576):
-    """
-        We may need to split dense tensor to one or more blocks and put
-        them equally onto parameter server. One block is a sub-tensor
-        aligned by dim[0] of the tensor.
-
-        We need to have a minimal block size so that the calculations in
-        the parameter server side can gain better performance. By default
-        minimum block size is 1024. The max block size is used to prevent
-        very large blocks that may cause send error.
-    """
-    blocks = []
-    for var in var_list:
-        split_count = pserver_count
-        var_numel = reduce(lambda x, y: x * y, var.shape)
-        max_pserver_count = int(math.floor(var_numel / float(min_block_size)))
-        if max_pserver_count == 0:
-            max_pserver_count = 1
-        if max_pserver_count < pserver_count:
-            split_count = max_pserver_count
-        block_size = int(math.ceil(var_numel / float(split_count)))
-
-        if len(var.shape) >= 2:
-            # align by dim1(width)
-            dim1 = reduce(lambda x, y: x * y, var.shape[1:])
-            remains = block_size % dim1
-            if remains != 0:
-                block_size += dim1 - remains
-        # update split_count after aligning
-        split_count = int(math.ceil(var_numel / float(block_size)))
-        for block_id in xrange(split_count):
-            curr_block_size = min(block_size, var_numel - (
-                (block_id) * block_size))
-            block = VarBlock(var.name, block_id, curr_block_size)
-            blocks.append(str(block))
-    return blocks
-
-
-class DistributeTranspiler:
-    def transpile(self,
-                  optimize_ops,
-                  params_grads,
-                  trainer_id,
-                  program=None,
-                  pservers="127.0.0.1:6174",
-                  trainers=1,
-                  split_method=round_robin):
-        """
-            Transpile the program to distributed data-parallelism programs.
-            The main_program will be transformed to use a remote parameter server
-            to do parameter optimization. And the optimization graph will be put
-            into a parameter server program.
-
-            Use different methods to split trainable variables to different
-            parameter servers.
-
-            Steps to transpile trainer:
-            1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
-            2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
-            3. modify trainer program add split_op to each grad variable.
-            4. append send_op to send splited variables to server and fetch
-               params(splited blocks or origin param) from server.
-            5. append concat_op to merge splited blocks to update local weights.
-
-            Steps to transpile pserver:
-            1. create new program for parameter server.
-            2. create params and grad variables that assigned to current server instance.
-            3. create a sub-block in the server side program
-            4. append ops that should run on current server instance.
-            5. add listen_and_serv op
-
-            :param optimize_ops: op list of optimization, should be the
-                                    return value of Optimizer.minimize
-            :type optimize_ops: list
-            :param params_grads: list of tuple(weight, gradient)
-            :type params_grads: list
-            :param trainer_id: one unique id for each trainer in a job.
-            :type trainer_id: int
-            :param program: program to transpile, default is default_main_program
-            :type program: Program
-            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
-            :type pservers: string
-            :param trainers: total number of workers/trainers in the job
-            :type trainers: int
-            :param split_method: A function to determin how to split variables
-                to different servers equally.
-            :type split_method: function
-        """
-        assert (callable(split_method))
-        if program is None:
-            program = default_main_program()
-        self.program = program
-        self.trainers = trainers
-        self.optimize_ops = optimize_ops
-        # TODO(typhoonzero): currently trainer_id is fetched from cluster system
-        # like Kubernetes, we should port this to use etcd later when developing
-        # fluid distributed training with fault-tolerance.
-        self.trainer_id = trainer_id
-        pserver_endpoints = pservers.split(",")
-
-        # step1
-        param_list = [pg[0] for pg in params_grads]
-        grad_list = [pg[1] for pg in params_grads]
-        grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
-        param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
-        # step2
-        grad_var_mapping = self._append_split_op(program, grad_blocks)
-        # step3
-        send_inputs = []
-        send_outputs = []
-        for b in grad_blocks:  # append by order
-            varname, block_id, _ = b.split(":")
-            send_inputs.append(grad_var_mapping[varname][int(block_id)])
-
-        param_var_mapping = self._create_vars_from_blocklist(program,
-                                                             param_blocks)
-        for b in param_blocks:
-            varname, block_id, _ = b.split(":")
-            send_outputs.append(param_var_mapping[varname][int(block_id)])
-        # let send_op know which endpoint to send which var to, eplist has the same
-        # order as send_inputs.
-        eplist = split_method(send_inputs, pserver_endpoints)
-        # create mapping of endpoint -> split var to create pserver side program
-        self.param_grad_ep_mapping = dict()
-        for i, ep in enumerate(eplist):
-            param = send_outputs[i]
-            grad = send_inputs[i]
-            if not self.param_grad_ep_mapping.has_key(ep):
-                self.param_grad_ep_mapping[ep] = {"params": [], "grads": []}
-            self.param_grad_ep_mapping[ep]["params"].append(param)
-            self.param_grad_ep_mapping[ep]["grads"].append(grad)
-
-        rpc_client_var = program.global_block().create_var(
-            name="RPC_CLIENT_VAR",
-            persistable=True,
-            type=core.VarDesc.VarType.RAW)
-
-        # create send_op
-        program.global_block().append_op(
-            type="send",
-            inputs={"X": send_inputs},
-            outputs={"Out": send_outputs,
-                     "RPCClient": rpc_client_var},
-            attrs={"endpoints": pserver_endpoints,
-                   "epmap": eplist})
-        # step4
-        for varname, splited_var in param_var_mapping.iteritems():
-            if len(splited_var) <= 1:
-                continue
-            orig_param = program.global_block().vars[varname]
-            program.global_block().append_op(
-                type="concat",
-                inputs={"X": splited_var},
-                outputs={"Out": [orig_param]},
-                attrs={"axis": 0})
-
-    def get_trainer_program(self):
-        # remove optimize ops and add a send op to main_program
-        self.program.global_block().delete_ops(self.optimize_ops)
-        # FIXME(typhoonzero): serialize once will fix error occurs when clone.
-        self.program.__str__()
-        return self.program
-
-    def get_pserver_program(self, endpoint):
-        """
-        Get pserver side program using the endpoint.
-        NOTE: assume blocks of the same variable is not distributed
-        on the same pserver, only change param/grad varnames for
-        trainers to fetch.
-        """
-        # step1
-        pserver_program = Program()
-        # step2
-        recv_inputs = []
-        for v in self.param_grad_ep_mapping[endpoint]["params"]:
-            self._clone_var(pserver_program.global_block(), v)
-        for v in self.param_grad_ep_mapping[endpoint]["grads"]:
-            # create vars for each trainer in global scope, so
-            # we don't need to create them when grad arrives.
-            # change client side var name to origin name by
-            # removing ".trainer_%d" suffix
-            suff_idx = v.name.find(".trainer_")
-            if suff_idx >= 0:
-                orig_var_name = v.name[:suff_idx]
-            pserver_program.global_block().create_var(
-                name=orig_var_name,
-                persistable=True,
-                type=v.type,
-                dtype=v.dtype,
-                shape=v.shape)
-            for trainer_id in xrange(self.trainers):
-                var = pserver_program.global_block().create_var(
-                    name="%s.trainer_%d" % (orig_var_name, trainer_id),
-                    persistable=False,
-                    type=v.type,
-                    dtype=v.dtype,
-                    shape=v.shape)
-                recv_inputs.append(var)
-        # step3
-        optimize_block = pserver_program.create_block(0)
-        # step 4
-        # Create a union-find data struct from optimize ops,
-        # If two ops are connected, we could add these two ops
-        # into one set.
-        ufind = self._create_ufind(self.optimize_ops)
-        # step 4.2 
-        # Iterate through the ops and append optimize op which
-        # located on current pserver
-        opt_op_on_pserver = []
-        for _, op in enumerate(self.optimize_ops):
-            if self._is_opt_op(op) and self._is_opt_op_on_pserver(endpoint, op):
-                opt_op_on_pserver.append(op)
-        # step 4.3
-        # Iterate through the ops, and if an op and the optimize ops
-        # which located on current pserver are in one set, then 
-        # append it into the sub program.
-        for _, op in enumerate(self.optimize_ops):
-            for _, opt_op in enumerate(opt_op_on_pserver):
-                if ufind.is_connected(op, opt_op):
-                    if self._is_opt_op(op):
-                        self._append_pserver_ops(optimize_block, op, endpoint,
-                                                 default_main_program())
-                    else:
-                        self._append_pserver_non_opt_ops(optimize_block, op)
-                    break
-        # step5 append the listen_and_serv op
-        pserver_program.global_block().append_op(
-            type="listen_and_serv",
-            inputs={'X': recv_inputs},
-            outputs={},
-            attrs={
-                "OptimizeBlock": optimize_block,
-                "endpoint": endpoint,
-                "Fanin": self.trainers
-            })
-        pserver_program.sync_with_cpp()
-        return pserver_program
-
-    def get_startup_program(self, endpoint, pserver_program):
-        """
-        Get startup program for current parameter server.
-        Modify operator input variables if there are variables that
-        were split to several blocks.
-        """
-        s_prog = Program()
-        orig_s_prog = framework.default_startup_program()
-        params = self.param_grad_ep_mapping[endpoint]["params"]
-
-        def _get_splited_name_and_shape(varname):
-            for idx, splited_param in enumerate(params):
-                pname = splited_param.name
-                if same_or_split_var(pname, varname) and varname != pname:
-                    return pname, splited_param.shape
-            return "", []
-
-        # 1. create vars in pserver program to startup program
-        pserver_vars = pserver_program.global_block().vars
-        created_var_map = dict()
-        for _, var in pserver_vars.iteritems():
-            tmpvar = s_prog.global_block().create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
-            created_var_map[var.name] = tmpvar
-
-        # 2. rename op outputs
-        for op in orig_s_prog.global_block().ops:
-            new_inputs = dict()
-            new_outputs = dict()
-            # do not append startup op if var is not on this pserver
-            op_on_pserver = False
-            for key in op.output_names:
-                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
-                if newname:
-                    op_on_pserver = True
-                    new_outputs[key] = created_var_map[newname]
-                elif op.output(key)[0] in pserver_vars:
-                    op_on_pserver = True
-                    new_outputs[key] = pserver_vars[op.output(key)[0]]
-
-            # most startup program ops have no inputs
-            new_inputs = self._get_input_map_from_op(pserver_vars, op)
-
-            if op_on_pserver:
-                if op.type in [
-                        "gaussian_random", "fill_constant", "uniform_random"
-                ]:
-                    op.attrs["shape"] = new_outputs["Out"].shape
-                s_prog.global_block().append_op(
-                    type=op.type,
-                    inputs=new_inputs,
-                    outputs=new_outputs,
-                    attrs=op.attrs)
-        return s_prog
-
-    # ====================== private transpiler functions =====================
-    def _create_vars_from_blocklist(self,
-                                    program,
-                                    block_list,
-                                    add_trainer_suffix=False):
-        """
-        NOTE: only grads need to be named for different trainers, use
-              add_trainer_suffix to rename the grad vars.
-        """
-        block_map = dict()
-        var_mapping = dict()
-        for block_str in block_list:
-            varname, offset, size = block_str.split(":")
-            if not block_map.has_key(varname):
-                block_map[varname] = []
-            block_map[varname].append((long(offset), long(size)))
-        for varname, splited in block_map.iteritems():
-            orig_var = program.global_block().var(varname)
-            if len(splited) == 1:
-                if add_trainer_suffix:
-                    new_var_name = "%s.trainer_%d" % \
-                        (orig_var.name, self.trainer_id)
-                    program.global_block().rename_var(varname, new_var_name)
-                    var_mapping[varname] = \
-                        [program.global_block().var(new_var_name)]
-                else:
-                    var_mapping[varname] = \
-                        [program.global_block().var(orig_var.name)]
-                continue
-
-            var_mapping[varname] = []
-            orig_shape = orig_var.shape
-            orig_dim1_flatten = 1
-            if len(orig_shape) >= 2:
-                orig_dim1_flatten = reduce(lambda x, y: x * y, orig_shape[1:])
-
-            for i, block in enumerate(splited):
-                size = block[1]
-                rows = size / orig_dim1_flatten
-                splited_shape = [rows]
-                if len(orig_shape) >= 2:
-                    splited_shape.extend(orig_shape[1:])
-                new_var_name = ""
-                if add_trainer_suffix:
-                    new_var_name = "%s.block%d.trainer_%d" % \
-                        (varname, i, self.trainer_id)
-                else:
-                    new_var_name = "%s.block%d" % \
-                        (varname, i)
-                var = program.global_block().create_var(
-                    name=new_var_name,
-                    persistable=False,
-                    dtype=orig_var.dtype,
-                    type=orig_var.type,
-                    shape=splited_shape)  # flattend splited var
-                var_mapping[varname].append(var)
-            program.global_block().sync_with_cpp()
-        return var_mapping
-
-    def _clone_var(self, block, var):
-        assert isinstance(var, Variable)
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            lod_level=var.lod_level,
-            persistable=True)
-
-    def _append_split_op(self, program, gradblocks):
-        # Split variables that need to be split and append respective ops
-        var_mapping = self._create_vars_from_blocklist(
-            program, gradblocks, add_trainer_suffix=True)
-        for varname, splited_vars in var_mapping.iteritems():
-            # variable that don't need to split have empty splited_vars
-            if len(splited_vars) <= 1:
-                continue
-            orig_var = program.global_block().vars[varname]
-            if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
-                height_sections = []
-                for v in splited_vars:
-                    height_sections.append(v.shape[0])
-                program.global_block().append_op(
-                    type="split_selected_rows",
-                    inputs={"X": orig_var},
-                    outputs={"Out": splited_vars},
-                    attrs={"height_sections": height_sections})
-            elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
-                sections = []
-                for v in splited_vars:
-                    sections.append(v.shape[0])
-                program.global_block().append_op(
-                    type="split",
-                    inputs={"X": orig_var},
-                    outputs={"Out": splited_vars},
-                    attrs={"sections": sections}  # assume split evenly
-                )
-            else:
-                AssertionError("Variable type should be in set "
-                               "[LOD_TENSOR, SELECTED_ROWS]")
-        return var_mapping
-
-    def _get_optimizer_input_shape(self, op_type, varkey, orig_shape,
-                                   param_shape):
-        """
-        Returns the shape for optimizer inputs that need to be reshaped when
-        Param and Grad is split to multiple servers.
-        """
-        # HACK(typhoonzero): Should use functions of corresponding optimizer in
-        # optimizer.py to get the shape, do not  bind this in the transpiler.
-        if op_type == "adam":
-            if varkey in ["Moment1", "Moment2"]:
-                return param_shape
-        elif op_type == "adagrad":
-            if varkey == "Moment":
-                return param_shape
-        elif op_type == "adamax":
-            if varkey in ["Moment", "InfNorm"]:
-                return param_shape
-        elif op_type == "momentum":
-            if varkey == "Velocity":
-                return param_shape
-        elif op_type == "":
-            if varkey == "Moment":
-                return param_shape
-        elif op_type == "sgd":
-            pass
-        return orig_shape
-
-    def _orig_varname(self, varname):
-        suff_idx = varname.find(".trainer_")
-        orig_var_name = ""
-        if suff_idx >= 0:
-            orig_var_name = varname[:suff_idx]
-        return orig_var_name
-
-    def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
-                            origin_program):
-        program = optimize_block.program
-        pserver_block = program.global_block()
-        new_inputs = dict()
-        # update param/grad shape first, then other inputs like
-        # moment can use the updated shape
-        for key in opt_op.input_names:
-            if key == "Grad":
-                grad_block = None
-                for g in self.param_grad_ep_mapping[endpoint]["grads"]:
-                    if same_or_split_var(
-                            self._orig_varname(g.name), opt_op.input(key)[0]):
-                        grad_block = g
-                        break
-                if not grad_block:
-                    # do not append this op if current endpoint
-                    # is not dealing with this grad block
-                    return
-                merged_var = \
-                    pserver_block.vars[self._orig_varname(grad_block.name)]
-                if self.trainers > 1:
-                    vars2merge = []
-                    for i in xrange(self.trainers):
-                        per_trainer_name = "%s.trainer_%d" % \
-                        (self._orig_varname(grad_block.name), i)
-                        vars2merge.append(pserver_block.vars[per_trainer_name])
-
-                    optimize_block.append_op(
-                        type="sum",
-                        inputs={"X": vars2merge},
-                        outputs={"Out": merged_var})
-                    if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
-                        optimize_block.append_op(
-                            type="scale",
-                            inputs={"X": merged_var},
-                            outputs={"Out": merged_var},
-                            attrs={"scale": 1.0 / float(self.trainers)})
-                new_inputs[key] = merged_var
-            elif key == "Param":
-                # param is already created on global program
-                param_block = None
-                for p in self.param_grad_ep_mapping[endpoint]["params"]:
-                    if same_or_split_var(p.name, opt_op.input(key)[0]):
-                        param_block = p
-                        break
-                if not param_block:
-                    return
-                tmpvar = pserver_block.create_var(
-                    name=param_block.name,
-                    persistable=True,
-                    dtype=param_block.dtype,
-                    shape=param_block.shape)
-                new_inputs[key] = tmpvar
-            elif key == "LearningRate":
-                # leraning rate variable has already be created by non-optimize op,
-                # don't create it once again.
-                lr_varname = opt_op.input(key)[0]
-                if pserver_block.vars.has_key(lr_varname):
-                    new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]]
-                else:
-                    origin_var = origin_program.global_block().vars[lr_varname]
-                    tmpvar = pserver_block.create_var(
-                        name=origin_var.name,
-                        persistable=origin_var.persistable,
-                        dtype=origin_var.dtype,
-                        shape=origin_var.shape)
-                    new_inputs[key] = tmpvar
-
-        for key in opt_op.input_names:
-            new_shape = None
-            if key in ["Param", "Grad", "LearningRate"]:
-                continue
-            var = self.program.global_block().vars[opt_op.input(key)[0]]
-            # update accumulator variable shape
-            param_shape = new_inputs["Param"].shape
-            new_shape = self._get_optimizer_input_shape(opt_op.type, key,
-                                                        var.shape, param_shape)
-            tmpvar = pserver_block.create_var(
-                name=var.name,
-                persistable=var.persistable,
-                dtype=var.dtype,
-                shape=new_shape)
-            new_inputs[key] = tmpvar
-
-        # change output's ParamOut variable
-        outputs = self._get_output_map_from_op(self.program.global_block().vars,
-                                               opt_op)
-        outputs["ParamOut"] = new_inputs["Param"]
-
-        optimize_block.append_op(
-            type=opt_op.type,
-            inputs=new_inputs,
-            outputs=outputs,
-            attrs=opt_op.attrs)
-
-    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
-        program = optimize_block.program
-        # Append the ops for parameters that do not need to be optimized/updated
-        inputs = self._get_input_map_from_op(self.program.global_block().vars,
-                                             opt_op)
-        for varlist in inputs.itervalues():
-            if not isinstance(varlist, list):
-                varlist = [varlist]
-
-            for var in varlist:
-                if not program.global_block().vars.has_key(var.name):
-                    program.global_block().create_var(
-                        name=var.name,
-                        persistable=var.persistable,
-                        dtype=var.dtype,
-                        shape=var.shape)
-
-        outputs = self._get_output_map_from_op(self.program.global_block().vars,
-                                               opt_op)
-
-        for varlist in outputs.itervalues():
-            if not isinstance(varlist, list):
-                varlist = [varlist]
-
-            for var in varlist:
-                program.global_block().create_var(
-                    name=var.name,
-                    persistable=var.persistable,
-                    dtype=var.dtype,
-                    shape=var.shape)
-
-        optimize_block.append_op(
-            type=opt_op.type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=opt_op.attrs)
-
-    def _is_op_connected(self, op1, op2):
-        # If one op's input is another op's output or
-        # one op's output is another op's input, we say
-        # the two operator is connected.
-        op1_input_names = op1.desc.input_arg_names()
-        op1_output_names = op1.desc.output_arg_names()
-
-        op2_input_names = op2.desc.input_arg_names()
-        op2_output_names = op2.desc.output_arg_names()
-
-        if set(op1_output_names) & set(op2_input_names) or \
-           set(op1_input_names) & set(op2_output_names):
-            return True
-        return False
-
-    def _create_ufind(self, optimize_ops):
-        # Create a unit find data struct by optimize ops
-        ufind = UnionFind(optimize_ops)
-        for i in xrange(len(optimize_ops)):
-            for j in xrange(i, len(optimize_ops)):
-                op1 = optimize_ops[i]
-                op2 = optimize_ops[j]
-                if self._is_op_connected(op1, op2):
-                    ufind.union(op1, op2)
-        return ufind
-
-    def _is_opt_op(self, op):
-        # NOTE: It's a HACK implement.
-        # optimize op: SGDOptimize, MomentumOptimizer, AdamOptimizer and etc... 
-        if "Param" in op.input_names and \
-            "LearningRate" in op.input_names:
-            return True
-        return False
-
-    def _is_opt_op_on_pserver(self, endpoint, op):
-        param_names = [
-            p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
-        ]
-        if op.input("Param") in param_names:
-            return True
-        else:
-            for n in param_names:
-                param = op.input("Param")[0]
-                if same_or_split_var(n, param) and n != param:
-                    return True
-            return False
-        return False
-
-    def _get_input_map_from_op(self, varmap, op):
-        iomap = dict()
-        for key in op.input_names:
-            vars = []
-            for varname in op.input(key):
-                vars.append(varmap[varname])
-            if len(vars) == 1:
-                iomap[key] = vars[0]
-            else:
-                iomap[key] = vars
-        return iomap
-
-    def _get_output_map_from_op(self, varmap, op):
-        iomap = dict()
-        for key in op.output_names:
-            vars = []
-            for varname in op.output(key):
-                vars.append(varmap[varname])
-            if len(vars) == 1:
-                iomap[key] = vars[0]
-            else:
-                iomap[key] = vars
-        return iomap
diff --git a/python/paddle/fluid/distribute_transpiler_simple.py b/python/paddle/fluid/distribute_transpiler_simple.py
deleted file mode 100644
index e94bbb6c39f7a017e2d0b79d050e6ff8e4371a14..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distribute_transpiler_simple.py
+++ /dev/null
@@ -1,256 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import framework
-from framework import Program, default_main_program, Parameter, Variable
-import optimizer
-from layer_helper import LayerHelper
-
-
-def hash_name_to_server(params_grads, pserver_endpoints):
-    """
-    :param param_grads:
-    :return: a map of pserver endpoint -> 
-                    params -> [param list]
-                    grads  -> [grad list]
-    """
-
-    def _hash_param(param_name, total):
-        return hash(param_name) % total
-
-    param_grad_map = dict()
-    for param, grad in params_grads:
-        if param.trainable is True and grad is not None:
-            server_id = _hash_param(param.name, len(pserver_endpoints))
-            server_for_param = pserver_endpoints[server_id]
-            if not param_grad_map.has_key(server_for_param):
-                param_grad_map[server_for_param] = {"params": [], "grads": []}
-            param_grad_map[server_for_param]["params"].append(param)
-            param_grad_map[server_for_param]["grads"].append(grad)
-
-    return param_grad_map
-
-
-def round_robin(params_grads, pserver_endpoints):
-    assert (len(params_grads) > len(pserver_endpoints))
-
-    param_grad_map = dict()
-    pserver_idx = 0
-    for param, grad in params_grads:
-        if param.trainable is True:
-            server_for_param = pserver_endpoints[pserver_idx]
-            if not param_grad_map.has_key(server_for_param):
-                param_grad_map[server_for_param] = {"params": [], "grads": []}
-
-            param_grad_map[server_for_param]["params"].append(param)
-            param_grad_map[server_for_param]["grads"].append(grad)
-
-            pserver_idx += 1
-            if pserver_idx >= len(pserver_endpoints):
-                pserver_idx = 0
-    return param_grad_map
-
-
-class SimpleDistributeTranspiler:
-    def transpile(self,
-                  optimize_ops,
-                  params_grads,
-                  program=None,
-                  pservers="127.0.0.1:6174",
-                  trainers=1,
-                  split_method=round_robin):
-        """
-            Transpile the program to a distributed data-parallelism programs.
-
-            The main_program will be transform to use a remote parameter server
-            to do parameter optimization. And the optimization graph will be put
-            in to a parameter server program.
-
-            Use different methods to split trainable varialbles to different
-            parameter servers.
-
-            Example to run:
-
-            exe = fluid.Executor(place)
-            t = fluid.DistributeTranspiler()
-            t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
-
-            pserver_endpoint = os.getenv("PSERVER")
-            if pserver_endpoint:
-                pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
-                exe.run(fluid.default_startup_program())
-                exe.run(pserver_prog)
-            else:
-                feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
-                exe.run(fluid.default_startup_program())
-
-                for pass_id in range(PASS_NUM):
-                    ...
-
-            :param optimize_ops: op list of optimization, should be the
-                                 return value of Optimizer.minimize
-            :type optimize_ops: list
-            :param program: program to optimize, default default_main_program
-            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
-            :type pservers: string
-
-            :return: return a list of programs
-        """
-        if program is None:
-            program = default_main_program()
-        self.program = program
-        self.trainers = trainers
-        self.optimize_ops = optimize_ops
-        self._optimize_distributed(
-            optimize_ops,
-            program,
-            params_grads,
-            pservers=pservers,
-            trainers=trainers,
-            split_method=split_method)
-
-    def _clone_param(self, block, v):
-        assert isinstance(v, Parameter)
-        new_p = Parameter(
-            block=block,
-            shape=v.shape,
-            dtype=v.dtype,
-            type=v.type,
-            lod_level=v.lod_level,
-            stop_gradient=v.stop_gradient,
-            trainable=v.trainable,
-            optimize_attr=v.optimize_attr,
-            regularizer=v.regularizer,
-            name=v.name)
-        block.vars[new_p.name] = new_p
-
-    def _clone_var(self, block, var):
-        assert isinstance(var, Variable)
-        return block.create_var(
-            name=var.name,
-            shape=var.shape,
-            dtype=var.dtype,
-            type=var.type,
-            lod_level=var.lod_level,
-            persistable=var.persistable)
-
-    def _optimize_distributed(self, optimize_ops, program, params_and_grads,
-                              **kwargs):
-        if kwargs.has_key("split_method"):
-            split_method = kwargs["split_method"]
-        else:
-            split_method = round_robin
-
-        assert (callable(split_method))
-        pserver_endpoints = kwargs["pservers"].split(",")
-        self.param_grad_map = split_method(params_and_grads, pserver_endpoints)
-
-        send_op_ordered_inputs = []
-        send_op_ordered_outputs = []
-        epmap = []
-        for ep, v in self.param_grad_map.iteritems():
-            send_op_ordered_inputs.extend(v["grads"])
-            send_op_ordered_outputs.extend(v["params"])
-            for i in v["grads"]:
-                epmap.append(ep)
-        send_op = program.global_block().append_op(
-            type="send",
-            inputs={"X": send_op_ordered_inputs
-                    },  # inputs is a list of tensors to be send
-            outputs={"Out": send_op_ordered_outputs},
-            attrs={"endpoints": pserver_endpoints,
-                   "epmap": epmap})
-
-    def get_trainer_program(self):
-        # remove optimize ops and add a send op to main_program
-        self.program.global_block().delete_ops(self.optimize_ops)
-        return self.program
-
-    def _create_var_for_trainers(self, block, var, trainers):
-        var_list = []
-        for i in xrange(trainers):
-            var_each = block.create_var(
-                name="%s.trainer_%d" % (var.name, i),
-                psersistable=var.persistable,
-                dtype=var.dtype,
-                shape=var.shape)
-            var_list.append(var_each)
-        return var_list
-
-    def get_pserver_program(self, endpoint, optimize_ops):
-        pserver_program = Program()
-        for v in self.param_grad_map[endpoint]["params"]:
-            self._clone_param(pserver_program.global_block(), v)
-
-        optimize_sub_program = Program()
-        grad_var_names = [
-            var.name for var in self.param_grad_map[endpoint]["grads"]
-        ]
-        for opt_op in optimize_ops:
-            for _, var in opt_op.inputs.iteritems():
-                # NOTE: append operators to merge gradients from multiple
-                # trainers. If trainers == 1, this is not needed.
-                if self.trainers > 1 and var.name in grad_var_names:
-                    vars2merge = self._create_var_for_trainers(
-                        optimize_sub_program.global_block(), var, self.trainers)
-                    merged_var = optimize_sub_program.global_block().create_var(
-                        name=var.name,
-                        persistable=var.persistable,
-                        dtype=var.dtype,
-                        shape=var.shape)
-                    optimize_sub_program.global_block().append_op(
-                        type="sum",
-                        inputs={"X": vars2merge},
-                        outputs={"Out": merged_var})
-                    optimize_sub_program.global_block().append_op(
-                        type="scale",
-                        inputs={"X": merged_var},
-                        outputs={"Out": merged_var},
-                        attrs={"scale": 1.0 / float(self.trainers)})
-                else:
-                    optimize_sub_program.global_block().create_var(
-                        name=var.name,
-                        persistable=var.persistable,
-                        dtype=var.dtype,
-                        shape=var.shape)
-
-            if opt_op.inputs.has_key("Grad"):
-                if opt_op.inputs["Grad"].name in grad_var_names:
-                    optimize_sub_program.global_block().append_op(
-                        type=opt_op.type,
-                        inputs=opt_op.inputs,
-                        outputs=opt_op.outputs,
-                        attrs=opt_op.attrs)
-            else:
-                optimize_sub_program.global_block().append_op(
-                    type=opt_op.type,
-                    inputs=opt_op.inputs,
-                    outputs=opt_op.outputs,
-                    attrs=opt_op.attrs)
-        pserver_program.global_block().append_op(
-            type="recv",
-            inputs={"RX":
-                    self.param_grad_map[endpoint]["grads"]},  # grads to recv
-            outputs={},
-            attrs={
-                "OptimizeBlock": optimize_sub_program.global_block(),
-                "endpoint": endpoint,
-                "ParamList":
-                [p.name for p in self.param_grad_map[endpoint]["params"]],
-                "GradList":
-                [p.name for p in self.param_grad_map[endpoint]["grads"]],
-                "Trainers": self.trainers
-            })
-        pserver_program.sync_with_cpp()
-        return pserver_program
diff --git a/python/paddle/fluid/distributed_spliter.py b/python/paddle/fluid/distributed_spliter.py
deleted file mode 100644
index d288b27ba00970897d8121b82a9d51d5cf4ece09..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/distributed_spliter.py
+++ /dev/null
@@ -1,50 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-def hash_name(varlist, pserver_endpoints):
-    """
-    hash variable names to several endpoints.
-
-    :param varlist: a list of Variables
-    :return: a map of pserver endpoint -> varname
-    """
-
-    def _hash_block(block_str, total):
-        return hash(block_str) % total
-
-    eplist = []
-    for var in varlist:
-        server_id = _hash_block(var.name(), len(pserver_endpoints))
-        server_for_param = pserver_endpoints[server_id]
-        eplist.append(server_for_param)
-    return eplist
-
-
-def round_robin(varlist, pserver_endpoints):
-    """
-    distribute variables to several endpoints.
-    """
-    assert (len(varlist) > len(pserver_endpoints))
-
-    eplist = []
-    pserver_idx = 0
-    for var in varlist:
-        server_for_param = pserver_endpoints[pserver_idx]
-        eplist.append(server_for_param)
-
-        pserver_idx += 1
-        if pserver_idx >= len(pserver_endpoints):
-            pserver_idx = 0
-    return eplist
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 19e5b61b0b32aba3fe1e7805704a3740e3854fc8..00ba1a0457583d1cc1fa7136ebd51e9ced167832 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 import numpy as np
 
 import layers
@@ -21,7 +22,6 @@ from layer_helper import LayerHelper
 from initializer import Constant
 
 __all__ = [
-    'Accuracy',
     'ChunkEvaluator',
     'EditDistance',
     'DetectionMAP',
@@ -41,7 +41,12 @@ def _clone_var_(block, var):
 
 class Evaluator(object):
     """
-    Base Class for all evaluators
+    Warning: better to use the fluid.metrics.* things, more
+    flexible support via pure Python and Operator, and decoupled
+    with executor. Short doc are intended to urge new user
+    start from Metrics.
+
+    Base Class for all evaluators.
 
     Args:
         name(str): The name of evaluator. such as, "accuracy". Used for generate
@@ -59,6 +64,9 @@ class Evaluator(object):
     """
 
     def __init__(self, name, **kwargs):
+        warnings.warn(
+            "The %s is deprecated, because maintain a modified program inside evaluator cause bug easily, please use fluid.metrics.%s instead."
+            % (self.__class__.__name__, self.__class__.__name__), Warning)
         self.states = []
         self.metrics = []
         self.helper = LayerHelper(name, **kwargs)
@@ -66,6 +74,10 @@ class Evaluator(object):
     def reset(self, executor, reset_program=None):
         """
         reset metric states at the begin of each pass/user specified batch
+
+        Args:
+            executor(Executor|ParallelExecutor): a executor for executing the reset_program
+            reset_program(Program): a single Program for reset process
         """
         if reset_program is None:
             reset_program = Program()
@@ -82,15 +94,16 @@ class Evaluator(object):
     def eval(self, executor, eval_program=None):
         """
         Evaluate the statistics merged by multiple mini-batches.
+        Args:
+            executor(Executor|ParallelExecutor): a executor for executing the eval_program
+            eval_program(Program): a single Program for eval process
         """
         raise NotImplementedError()
 
-    def create_state(self, suffix, dtype, shape):
+    def _create_state(self, suffix, dtype, shape):
         """
         Create state variable.
 
-        NOTE: It is not a public API.
-
         Args:
             suffix(str): the state suffix.
             dtype(str|core.VarDesc.VarType): the state data type
@@ -110,9 +123,35 @@ class Evaluator(object):
 
 class ChunkEvaluator(Evaluator):
     """
+    Warning: This would be deprecated in the future. Please use fluid.metrics.ChunkEvaluator 
+    instead.
+
     Accumulate counter numbers output by chunk_eval from mini-batches and
     compute the precision recall and F1-score using the accumulated counter
     numbers.
+    For some basics of chunking, please refer to
+    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+
+    Args:
+        input (Variable): prediction output of the network.
+        label (Variable): label of the test data set.
+        chunk_scheme (str): can be IOB/IOE/IOBES and IO. See the chunk_eval op for details.
+        num_chunk_types (int): the number of chunk type.
+        excluded_chunk_types (list): A list including chunk type ids, indicating chunk types that are not counted.
+
+    Returns:
+        tuple: tuple containing: precision, recall, f1_score
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.executor(place)
+            evaluator = fluid.Evaluator.ChunkEvaluator(input, label)
+            for epoch in PASS_NUM:
+                evaluator.reset(exe)
+                for data in batches:
+                    loss = exe.run(fetch_list=[cost])
+                distance, instance_error = distance_evaluator.eval(exe)
     """
 
     def __init__(
@@ -127,11 +166,11 @@ class ChunkEvaluator(Evaluator):
         if main_program.current_block().idx != 0:
             raise ValueError("You can only invoke Evaluator in root block")
 
-        self.num_infer_chunks = self.create_state(
+        self.num_infer_chunks = self._create_state(
             dtype='int64', shape=[1], suffix='num_infer_chunks')
-        self.num_label_chunks = self.create_state(
+        self.num_label_chunks = self._create_state(
             dtype='int64', shape=[1], suffix='num_label_chunks')
-        self.num_correct_chunks = self.create_state(
+        self.num_correct_chunks = self._create_state(
             dtype='int64', shape=[1], suffix='num_correct_chunks')
         precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
             input=input,
@@ -175,6 +214,8 @@ class ChunkEvaluator(Evaluator):
 
 class EditDistance(Evaluator):
     """
+    Warning: This would be deprecated in the future. Please use fluid.metrics.EditDistance
+    instead.
     Accumulate edit distance sum and sequence number from mini-batches and
     compute the average edit_distance and instance error of all batches.
 
@@ -185,15 +226,16 @@ class EditDistance(Evaluator):
         ignored_tokens(list of int): Tokens that should be removed before
         calculating edit distance.
 
-    Example:
+    Examples:
+        .. code-block:: python
 
-        exe = fluid.executor(place)
-        distance_evaluator = fluid.Evaluator.EditDistance(input, label)
-        for epoch in PASS_NUM:
-            distance_evaluator.reset(exe)
-            for data in batches:
-                loss = exe.run(fetch_list=[cost])
-            distance, instance_error = distance_evaluator.eval(exe)
+            exe = fluid.executor(place)
+            distance_evaluator = fluid.Evaluator.EditDistance(input, label)
+            for epoch in PASS_NUM:
+                distance_evaluator.reset(exe)
+                for data in batches:
+                    loss = exe.run(fetch_list=[cost])
+                distance, instance_error = distance_evaluator.eval(exe)
 
         In the above example:
         'distance' is the average of the edit distance in a pass.
@@ -207,11 +249,11 @@ class EditDistance(Evaluator):
         if main_program.current_block().idx != 0:
             raise ValueError("You can only invoke Evaluator in root block")
 
-        self.total_distance = self.create_state(
+        self.total_distance = self._create_state(
             dtype='float32', shape=[1], suffix='total_distance')
-        self.seq_num = self.create_state(
+        self.seq_num = self._create_state(
             dtype='int64', shape=[1], suffix='seq_num')
-        self.instance_error = self.create_state(
+        self.instance_error = self._create_state(
             dtype='int64', shape=[1], suffix='instance_error')
         distances, seq_num = layers.edit_distance(
             input=input, label=label, ignored_tokens=ignored_tokens)
@@ -253,9 +295,10 @@ class EditDistance(Evaluator):
 
 class DetectionMAP(Evaluator):
     """
+    Warning: This would be deprecated in the future. Please use fluid.metrics.DetectionMAP
+    instead.
     Calculate the detection mean average precision (mAP).
 
-    TODO (Dang Qingqing): update the following doc.
     The general steps are as follows:
     1. calculate the true positive and false positive according to the input
         of detection and labels.
@@ -269,11 +312,12 @@ class DetectionMAP(Evaluator):
         input (Variable): The detection results, which is a LoDTensor with shape
             [M, 6]. The layout is [label, confidence, xmin, ymin, xmax, ymax].
         gt_label (Variable): The ground truth label index, which is a LoDTensor
-            with shape [N, 1]. 
-        gt_difficult (Variable): Whether this ground truth is a difficult
-            bounding box (bbox), which is a LoDTensor [N, 1].
+            with shape [N, 1].
         gt_box (Variable): The ground truth bounding box (bbox), which is a
             LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax].
+        gt_difficult (Variable|None): Whether this ground truth is a difficult
+            bounding bbox, which can be a LoDTensor [N, 1] or not set. If None,
+            it means all the ground truth labels are not difficult bbox.
         class_num (int): The class number.
         background_label (int): The index of background label, the background
             label will be ignored. If set to -1, then all categories will be
@@ -281,24 +325,26 @@ class DetectionMAP(Evaluator):
         overlap_threshold (float): The threshold for deciding true/false
             positive, 0.5 by defalut.
         evaluate_difficult (bool): Whether to consider difficult ground truth
-            for evaluation, True by defalut.
+            for evaluation, True by defalut. This argument does not work when
+            gt_difficult is None.
         ap_version (string): The average precision calculation ways, it must be
             'integral' or '11point'. Please check
             https://sanchom.wordpress.com/tag/average-precision/ for details.
             - 11point: the 11-point interpolated average precision.
             - integral: the natural integral of the precision-recall curve.
 
-    Example:
+    Examples:
+        .. code-block:: python
 
-        exe = fluid.executor(place)
-        map_evaluator = fluid.Evaluator.DetectionMAP(input,
-            gt_label, gt_difficult, gt_box)
-        cur_map, accum_map = map_evaluator.get_map_var()
-        fetch = [cost, cur_map, accum_map]
-        for epoch in PASS_NUM:
-            map_evaluator.reset(exe)
-            for data in batches:
-                loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
+            exe = fluid.executor(place)
+            map_evaluator = fluid.Evaluator.DetectionMAP(input,
+                gt_label, gt_box, gt_difficult)
+            cur_map, accum_map = map_evaluator.get_map_var()
+            fetch = [cost, cur_map, accum_map]
+            for epoch in PASS_NUM:
+                map_evaluator.reset(exe)
+                for data in batches:
+                    loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch)
 
         In the above example:
 
@@ -310,8 +356,8 @@ class DetectionMAP(Evaluator):
                  input,
                  gt_label,
                  gt_box,
-                 gt_difficult,
-                 class_num,
+                 gt_difficult=None,
+                 class_num=None,
                  background_label=0,
                  overlap_threshold=0.5,
                  evaluate_difficult=True,
@@ -319,8 +365,11 @@ class DetectionMAP(Evaluator):
         super(DetectionMAP, self).__init__("map_eval")
 
         gt_label = layers.cast(x=gt_label, dtype=gt_box.dtype)
-        gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
-        label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        if gt_difficult:
+            gt_difficult = layers.cast(x=gt_difficult, dtype=gt_box.dtype)
+            label = layers.concat([gt_label, gt_difficult, gt_box], axis=1)
+        else:
+            label = layers.concat([gt_label, gt_box], axis=1)
 
         # calculate mean average precision (mAP) of current mini-batch
         map = layers.detection_map(
@@ -332,9 +381,10 @@ class DetectionMAP(Evaluator):
             evaluate_difficult=evaluate_difficult,
             ap_version=ap_version)
 
-        self.create_state(dtype='int32', shape=None, suffix='accum_pos_count')
-        self.create_state(dtype='float32', shape=None, suffix='accum_true_pos')
-        self.create_state(dtype='float32', shape=None, suffix='accum_false_pos')
+        self._create_state(dtype='int32', shape=None, suffix='accum_pos_count')
+        self._create_state(dtype='float32', shape=None, suffix='accum_true_pos')
+        self._create_state(
+            dtype='float32', shape=None, suffix='accum_false_pos')
 
         self.has_state = None
         var = self.helper.create_variable(
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 4490f2bf153f672464ec8bca2a44109c9fe0dd04..b436dfe70afdb52299222f8ba3f5bdff2842d103 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -18,17 +18,24 @@ from framework import Program, default_main_program, Variable
 from . import core
 
 __all__ = [
-    'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var'
+    'Executor', 'global_scope', 'scope_guard', '_switch_scope', 'fetch_var'
 ]
 
 g_scope = core.Scope()
 
 
 def global_scope():
+    """
+    Get the global/default scope instance. There are a lot of APIs use
+    :code:`global_scope` as its default value, e.g., :code:`Executor.run`
+
+    Returns:
+        Scope: The global/default scope instance.
+    """
     return g_scope
 
 
-def switch_scope(scope):
+def _switch_scope(scope):
     global g_scope
     ex = g_scope
     g_scope = scope
@@ -37,19 +44,48 @@ def switch_scope(scope):
 
 @contextlib.contextmanager
 def scope_guard(scope):
-    ex = switch_scope(scope)
+    """
+    Change the global/default scope instance by Python `with` statement. All
+    variable in runtime will assigned to the new scope.
+
+    Examples:
+        >>> import paddle.fluid as fluid
+        >>> new_scope = fluid.Scope()
+        >>> with fluid.scope_guard(new_scope):
+        >>>     ...
+
+    Args:
+        scope: The new global/default scope.
+    """
+    ex = _switch_scope(scope)
     yield
-    switch_scope(ex)
+    _switch_scope(ex)
 
 
 def as_numpy(tensor):
+    """
+    Convert a Tensor to a numpy.ndarray, its only support Tensor without LoD information.
+    For higher dimensional sequence data, please use LoDTensor directly.
+    Examples:
+        >>> import paddle.fluid as fluid
+        >>> outs = executor.run(...)
+        >>> np_outs = map(lambda x: as_numpy(x), outs)
+        >>>     ...
+
+    Args:
+       tensor(Variable): a instance of Tensor
+
+    Returns:
+        numpy.ndarray
+    """
+    if isinstance(tensor, core.LoDTensorArray):
+        return [as_numpy(t) for t in tensor]
     if isinstance(tensor, list):
         return [as_numpy(t) for t in tensor]
     assert isinstance(tensor, core.LoDTensor)
     lod = tensor.lod()
     if len(lod) > 0:
-        raise RuntimeError(
-            "Some of your featched tensors hold LoD information. \
+        raise RuntimeError("Some of your fetched tensors hold LoD information. \
             They can not be completely cast to Python ndarray. \
             Please set the parameter 'return_numpy' as 'False' to \
             return LoDTensor itself directly.")
@@ -136,14 +172,18 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
 
 def fetch_var(name, scope=None, return_numpy=True):
     """
-    Fetch the value of the variable with the given name from the given scope
+    Fetch the value of the variable with the given name from the
+    given scope.
+
     Args:
         name(str): name of the variable. Typically, only persistable variables
             can be found in the scope used for running the program.
         scope(core.Scope|None): scope object. It should be the scope where
             you pass to Executor.run() when running your program.
-            If None, global_scope() will be used.
-        return_numpy(bool): whether convert the tensor to numpy.ndarray
+            If None, global_scope() will be used. Default None.
+        return_numpy(bool): whether convert the tensor to numpy.ndarray.
+            Default True.
+
     Returns:
        LodTensor|numpy.ndarray
     """
@@ -152,7 +192,7 @@ def fetch_var(name, scope=None, return_numpy=True):
         scope = global_scope()
     assert isinstance(scope, core.Scope)
 
-    var = global_scope().find_var(name)
+    var = scope.find_var(name)
     assert var is not None, (
         "Cannot find " + name + " in scope. Perhaps you need to make the"
         " variable persistable by using var.persistable = True in your"
@@ -163,7 +203,7 @@ def fetch_var(name, scope=None, return_numpy=True):
     return tensor
 
 
-def get_program_cache_key(feed, fetch_list):
+def _get_program_cache_key(feed, fetch_list):
     feed_var_names = feed.keys()
 
     def to_name_str(var):
@@ -171,6 +211,8 @@ def get_program_cache_key(feed, fetch_list):
             return var.desc.name()
         elif isinstance(var, str):
             return var
+        elif isinstance(var, basestring):
+            return str(var)
         else:
             raise TypeError(str(var) + " should be Variable or str")
 
@@ -180,60 +222,137 @@ def get_program_cache_key(feed, fetch_list):
 
 
 class Executor(object):
-    def __init__(self, places):
-        if not isinstance(places, list) and not isinstance(places, tuple):
-            places = [places]
-
-        act_places = []
-        for each in places:
-            p = core.Place()
-            p.set_place(each)
-            act_places.append(p)
-
-        # TODO(dzhwinter) : only use the first place
-        self.executor = core.Executor(act_places[0])
-        self.places = places
+    """
+    An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
+    ParallelExecutor.
+    Python executor takes a program, add feed operators and fetch operators to this program according
+    to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
+    the variables(or names) that user want to get after program run. Note: the executor will run all
+    operators in the program but not only the operators dependent by the fetch_list.
+    It store the global variables into the global scope, and create a local scope for the temporary 
+    variables. The local scope contents will be discarded after every minibatch forward/backward finished. 
+    But the global scope variables will be persistent through different runs.
+    All of ops in program will be running in sequence.
+
+    Args:
+        place(core.CPUPlace|core.CUDAPlace(n)): indicate the executor run on which device
+
+    Note: For debugging complicated network in parallel-GPUs, you can test it on the executor.
+    They has the exactly same arguments, and expected the same results.
+    """
+
+    def __init__(self, place):
+        self.place = place
+        p = core.Place()
+        p.set_place(place)
+        self.executor = core.Executor(p)
         self.program_caches = dict()
 
-    def aslodtensor(self, data):
-        def accumulate(data):
-            if not isinstance(data, list):
-                return 1
-            return sum([accumulate(sub) for sub in data])
-
-        def parselod(data):
-            seq_lens = [accumulate(seq) for seq in data]
-            cur_len = 0
-            lod = [cur_len]
-            for l in seq_lens:
-                cur_len += l
-                lod.append(cur_len)
-            return lod
-
-        assert len(self.places) != 0
-        if not isinstance(data, list):
-            # pure tensor case
-            tensor = core.LoDTensor()
-            tensor.set(data, self.places[0])
-            return tensor
+    def as_lodtensor(self, data):
+        """
+        Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
+        For higher dimensional sequence data, please use LoDTensor directly.
+
+        Examples:
+            >>> import paddle.fluid as fluid
+            >>> exe = fluid.executor(fluid.CPUPlace())
+            >>> data = np.array(size=(100, 200, 300))
+            >>> np_outs = map(lambda x: exe.as_lodtensor(x), data)
+            >>>     ...
+
+        Args:
+            data(numpy.ndarray): a instance of array
+
+        Returns:
+            LoDTensor
+        """
+        if isinstance(data, list):
+            raise RuntimeError("Some of your feed data hold LoD information. \
+                They can not be completely cast from a list of Python \
+                ndarray to LoDTensor. Please convert data to LoDTensor \
+                directly before feeding the data.\
+                ")
+        # single tensor case
+        tensor = core.LoDTensor()
+        tensor.set(data, self.place)
+        return tensor
+
+    def _get_program_cache(self, program_cache_key):
+        return self.program_caches.get(program_cache_key, None)
+
+    def _add_program_cache(self, program_cache_key, program):
+        self.program_caches[program_cache_key] = program
+
+    def _add_feed_fetch_ops(self, program, feed, fetch_list, feed_var_name,
+                            fetch_var_name):
+        tmp_program = program.clone()
+
+        global_block = tmp_program.global_block()
+
+        if feed_var_name in global_block.vars:
+            feed_var = global_block.var(feed_var_name)
+        else:
+            feed_var = global_block.create_var(
+                name=feed_var_name,
+                type=core.VarDesc.VarType.FEED_MINIBATCH,
+                persistable=True)
+
+        if fetch_var_name in global_block.vars:
+            fetch_var = global_block.var(fetch_var_name)
         else:
-            raise RuntimeError("Current implementation lacks unittests")
-            # lodtensor case
-            lod = []
-            if not isinstance(data[0], list):
-                lod.append(parselod(data))
-                flattened_data = np.concatenate(data, axis=0).astype("int64")
+            fetch_var = global_block.create_var(
+                name=fetch_var_name,
+                type=core.VarDesc.VarType.FETCH_LIST,
+                persistable=True)
+
+        # prepend feed operators
+        if not has_feed_operators(global_block, feed, feed_var_name):
+            for i, name in enumerate(feed):
+                out = global_block.var(name)
+                global_block.prepend_op(
+                    type='feed',
+                    inputs={'X': [feed_var]},
+                    outputs={'Out': [out]},
+                    attrs={'col': i})
+
+        # append fetch_operators
+        if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
+            for i, var in enumerate(fetch_list):
+                assert isinstance(var, Variable) or isinstance(var, str), (
+                    "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
+                global_block.append_op(
+                    type='fetch',
+                    inputs={'X': [var]},
+                    outputs={'Out': [fetch_var]},
+                    attrs={'col': i})
+
+        return tmp_program
+
+    def _feed_data(self, program, feed, feed_var_name, scope):
+        # feed var to framework
+        for op in program.global_block().ops:
+            if op.desc.type() == 'feed':
+                feed_target_name = op.desc.output('Out')[0]
+                cur_feed = feed[feed_target_name]
+                if not isinstance(cur_feed, core.LoDTensor):
+                    cur_feed = self.as_lodtensor(cur_feed)
+                idx = op.desc.attr('col')
+                core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
             else:
-                while isinstance(data[0], list):
-                    lod.append(parselod(seq))
-                    flattened_data = [item for seq in data for item in seq]
-                    data = flattened_data
-                flattened_data = np.concatenate(data, axis=0).astype("int64")
-            flattened_data = flattened_data.reshape([len(flattened_data), 1])
-            tensor = core.LoDTensor()
-            tensor.set(flattened_data, self.places[0])
-            tensor.set_lod(lod)
-            return tensor
+                break
+
+    def _fetch_data(self, fetch_list, fetch_var_name, scope):
+        outs = [
+            core.get_fetch_variable(scope, fetch_var_name, i)
+            for i in xrange(len(fetch_list))
+        ]
+        return outs
+
+    def begin_pass(self):
+        self.executor.begin_pass()
+
+    def end_pass(self):
+        self.executor.end_pass()
 
     def run(self,
             program=None,
@@ -244,113 +363,91 @@ class Executor(object):
             scope=None,
             return_numpy=True,
             use_program_cache=False):
-        """ Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
-
+        """
+        Run program by this Executor. Feed data by feed map, fetch result by fetch_list.
         Python executor takes a program, add feed operators and fetch operators to this program according
         to feed map and fetch_list. Feed map provides input data for the program. fetch_list provides
-        the variables(or names) that user want to get after program run. Note: the executor will run all
+        the variables(or names) that user want to get after program run.
+
+        Note: the executor will run all
         operators in the program but not only the operators dependent by the fetch_list
 
-        :param program: the program that need to run, if not provied, then default_main_program will be used.
-        :param feed: feed variable map, e.g. {"image": ImageData, "label": LableData}
-        :param fetch_list: a list of variable or variable names that user want to get, run will return them according
-        to this list.
-        :param feed_var_name: the name for the input variable of feed Operator.
-        :param fetch_var_name: the name for the output variable of feed Operator.
-        :param scope: the scope used to run this program, you can switch it to different scope. default is global_scope
-        :param return_numpy: if convert the fetched tensor to numpy
-        :param use_program_cache: set use_program_cache to true if program not changed compare to the last step.
-        :return: result according to fetch_list.
+        Args:
+            program(Program): the program that need to run, if not provied, then default_main_program will be used.
+            feed(dict): feed variable map, e.g. {"image": ImageData, "label": LableData}
+            fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list.
+            feed_var_name(str): the name for the input variable of feed Operator.
+            fetch_var_name(str): the name for the output variable of fetch Operator.
+            scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope
+            return_numpy(bool): if convert the fetched tensor to numpy
+            use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step.
+
+        Returns:
+
+            list(numpy.array): fetch result according to fetch_list.
+
+
+        Examples:
+
+            >>> data = layers.data(name='X', shape=[1], dtype='float32')
+            >>> hidden = layers.fc(input=data, size=10)
+            >>> layers.assign(hidden, out)
+            >>> loss = layers.mean(out)
+            >>> adam = fluid.optimizer.Adam()
+            >>> adam.minimize(loss)
+
+            >>> cpu = core.CPUPlace()
+            >>> exe = Executor(cpu)
+            >>> exe.run(default_startup_program())
+
+            >>> x = numpy.random.random(size=(10, 1)).astype('float32')
+            >>> outs = exe.run(
+            >>>     feed={'X': x},
+            >>>     fetch_list=[loss.name])
         """
         if feed is None:
             feed = {}
         if not isinstance(feed, dict):
-            raise TypeError("feed should be a map")
+            raise TypeError(
+                "feed requires dict as its Parameter. But you passed in %s" %
+                (type(feed)))
         if fetch_list is None:
             fetch_list = []
-
         if program is None:
             program = default_main_program()
 
         if not isinstance(program, Program):
-            raise TypeError()
+            raise TypeError(
+                "Executor requires Program as its Parameter. But you passed in %s"
+                % (type(program)))
 
         if scope is None:
             scope = global_scope()
 
-        program_cache = None
-        program_cache_key = get_program_cache_key(feed, fetch_list)
-
+        cache_key = _get_program_cache_key(feed, fetch_list)
         if use_program_cache:
-            # find program cache by cache_key
-            program_cache = self.program_caches.get(program_cache_key, None)
-            # TODO(qiao): Should check program_cache and program are exactly the same.
+            cached_program = self._get_program_cache(cache_key)
+            if cached_program is None:
+                cached_program = self._add_feed_fetch_ops(
+                    program=program,
+                    feed=feed,
+                    fetch_list=fetch_list,
+                    feed_var_name=feed_var_name,
+                    fetch_var_name=fetch_var_name)
+                self._add_program_cache(cache_key, cached_program)
+            program = cached_program
         else:
-            self.program_caches.pop(program_cache_key, None)
-
-        if program_cache is None:
-            program_cache = program.clone()
-
-            if use_program_cache:
-                self.program_caches[program_cache_key] = program_cache
-
-            global_block = program_cache.global_block()
-
-            if feed_var_name in global_block.vars:
-                feed_var = global_block.var(feed_var_name)
-            else:
-                feed_var = global_block.create_var(
-                    name=feed_var_name,
-                    type=core.VarDesc.VarType.FEED_MINIBATCH,
-                    persistable=True)
-
-            if fetch_var_name in global_block.vars:
-                fetch_var = global_block.var(fetch_var_name)
-            else:
-                fetch_var = global_block.create_var(
-                    name=fetch_var_name,
-                    type=core.VarDesc.VarType.FETCH_LIST,
-                    persistable=True)
-
-            # prepend feed operators
-            if not has_feed_operators(global_block, feed, feed_var_name):
-                for i, name in enumerate(feed):
-                    out = global_block.var(name)
-                    global_block.prepend_op(
-                        type='feed',
-                        inputs={'X': [feed_var]},
-                        outputs={'Out': [out]},
-                        attrs={'col': i})
-
-            # append fetch_operators
-            if not has_fetch_operators(global_block, fetch_list,
-                                       fetch_var_name):
-                for i, var in enumerate(fetch_list):
-                    assert isinstance(var, Variable) or isinstance(var, str), (
-                        "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
-                    global_block.append_op(
-                        type='fetch',
-                        inputs={'X': [var]},
-                        outputs={'Out': [fetch_var]},
-                        attrs={'col': i})
-
-        # feed var to framework
-        for op in program_cache.global_block().ops:
-            if op.desc.type() == 'feed':
-                feed_target_name = op.desc.output('Out')[0]
-                cur_feed = feed[feed_target_name]
-                if not isinstance(cur_feed, core.LoDTensor):
-                    cur_feed = self.aslodtensor(cur_feed)
-                idx = op.desc.attr('col')
-                core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
-            else:
-                break
-
-        self.executor.run(program_cache.desc, scope, 0, True, True)
-        outs = [
-            core.get_fetch_variable(scope, fetch_var_name, i)
-            for i in xrange(len(fetch_list))
-        ]
+            self.program_caches.pop(cache_key, None)
+            program = self._add_feed_fetch_ops(
+                program=program,
+                feed=feed,
+                fetch_list=fetch_list,
+                feed_var_name=feed_var_name,
+                fetch_var_name=fetch_var_name)
+
+        self._feed_data(program, feed, feed_var_name, scope)
+        self.executor.run(program.desc, scope, 0, True, True)
+        outs = self._fetch_data(fetch_list, fetch_var_name, scope)
         if return_numpy:
             outs = as_numpy(outs)
         return outs
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index d14d6349b1bcf598e25bbeb9913d2d0da71a5054..93cd6b621ac860bad37ad5ccfb733c53d07f8d2b 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -19,7 +19,16 @@ import re
 import numpy as np
 
 import proto.framework_pb2 as framework_pb2
-from . import core
+try:
+    from . import core
+except ImportError, e:
+    raise ImportError(
+        """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
+    if you encounters \"libmkldnn.so not found\" errors. If you have python
+    installed in other directory, replace \"/usr/local/lib\" with your own
+    directory. The original error is: """ % str(e))
+except Exception, e:
+    raise e
 import unique_name
 
 __all__ = [
@@ -27,11 +36,10 @@ __all__ = [
     'Variable',
     'Program',
     'Operator',
+    'Parameter',
     'default_startup_program',
     'default_main_program',
     'program_guard',
-    'switch_startup_program',
-    'switch_main_program',
     'get_var',
 ]
 
@@ -43,7 +51,8 @@ ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
 
 def grad_var_name(var_name):
     """
-    return gradient name for a certain var name
+    Returns:
+        str: gradient name for a certain var name
     """
     return var_name + GRAD_VAR_SUFFIX
 
@@ -51,10 +60,12 @@ def grad_var_name(var_name):
 def convert_np_dtype_to_dtype_(np_dtype):
     """
     Convert the data type in numpy to the data type in Paddle
+
     Args:
-        np_dtype(np.dtype): the data type in numpy
+        np_dtype(np.dtype): the data type in numpy.
 
-    Returns(core.VarDesc.VarType): the data type in Paddle
+    Returns:
+        core.VarDesc.VarType: the data type in Paddle.
 
     """
     dtype = np.dtype(np_dtype)
@@ -72,6 +83,10 @@ def convert_np_dtype_to_dtype_(np_dtype):
         return core.VarDesc.VarType.INT64
     elif dtype == np.bool:
         return core.VarDesc.VarType.BOOL
+    elif dtype == np.uint16:
+        return core.VarDesc.VarType.INT16
+    elif dtype == np.uint8:
+        return core.VarDesc.VarType.UINT8
     else:
         raise ValueError("Not supported numpy dtype " + str(dtype))
 
@@ -116,37 +131,53 @@ def _debug_string_(proto, throw_on_error=True):
 
 class Variable(object):
     """
-    Python variable. Every input and output of an operator is a variable. Every
-    variable belongs to a block. The variable has a name and two variables in
-    different blocks could have the same name.
-
-    There are many kinds of variables. Please reference the framework.proto for
-    details.
+    In Fluid, every input and output of an operator is a variable. In most 
+    cases, variables are used for holding different kinds of data or training 
+    labels. A variable belongs to a block. All variable has its own name and 
+    two variables in different blocks could have the same name.
 
-    Notes: The constructor of Variable should not be invoked directly. Please
-    use `Block.create_var` to create a variable.
+    There are many kinds of variables. Each kind of them has its own attributes 
+    and usages. Please reference the framework.proto for details. 
 
-    >>> cur_program = Program()
-    >>> cur_block = cur_program.current_block()
-    >>> new_variable = cur_block.create_var(
-    >>>                    name="X", shape=[-1, 23, 48], dtype='float32')
+    Most of a Variable's member variables can be setted to be None. It mean 
+    it is not available or will be specified later.
 
     Args:
-        block(Block): The associated block. It will be passed by
-            `Block.create_var` automatically.
+        block(Block): The block that the variable belongs to.
         type(core.VarDesc.VarType): Variable type. Please reference the
             framework.proto for details.
-        shape(tuple|list|None): The shape of variable. -1 means the batch size.
+        name(str|None): The name of the variable. If setted None, it will be
+            generated automatically. Default: None
+        shape(tuple|list|None): The shape of the variable. -1 means the batch size.
             Some kinds of variable do not contain shape, just set it to None.
-        dtype(np.dtype|core.VarDesc.VarType|str): The data type of variable.
-        lod_level(int): The level of lod tensor. 0 means it is not a time
+            Default: None
+        dtype(np.dtype|core.VarDesc.VarType|str|None): The data type of variable.
+            Default: None
+        lod_level (int|None): The level of lod tensor. 0 means it is not a time
             series data.
-        capacity(int): The capacity of Channel variable. Ignored
-            for other types.
-        persistable(bool): True if the variable should be saved as check point.
-            Defaults to False.
-        stop_gradient(bool): True if the variable will stop to calculate
-            gradients when backward. Defaults to False.
+            Default: None
+        capacity (int|None): The capacity of Channel variable. Ignored for other
+            types. Default: None
+        persistable (bool|None): True if the variable is persistable. A persistable
+            variable will not be deleted after an iteration ending. Defaults: None.
+        error_clip (BaseErrorClipAttr|None): The error clip attributes of the
+            corresponding gradient variable. Default: None
+        stop_gradient (bool): True if the variable will stop to calculate its
+            gradients when backward. Default: False.
+        is_data (bool): True if the variable is an input data. Default: False
+
+    Notes:
+        The constructor of Variable should not be invoked directly. Please
+        use `Block.create_var` to create a variable.
+
+    Examples:
+        .. code-block:: python
+
+            cur_program = Program()
+            cur_block = cur_program.current_block()
+            new_variable = cur_block.create_var(name="X",
+                                                shape=[-1, 23, 48],
+                                                dtype='float32')
     """
 
     def __init__(self,
@@ -160,6 +191,7 @@ class Variable(object):
                  persistable=None,
                  error_clip=None,
                  stop_gradient=False,
+                 is_data=False,
                  **kwargs):
         self.block = block
         self.error_clip = error_clip
@@ -238,6 +270,7 @@ class Variable(object):
         self.block.vars[name] = self
         self.op = None
         self.stop_gradient = stop_gradient
+        self.is_data = is_data
 
     def __str__(self):
         return self.to_string(True)
@@ -247,13 +280,14 @@ class Variable(object):
         Get debug string.
 
         Args:
-            throw_on_error(bool): True if raise an exception when self is not
-                intialized.
+            throw_on_error(bool): True if raise an exception when self is
+                not initialized.
             with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
-
-        Returns(str): The debug string.
+                (e.g. trainable, optimize_attr, ...) will be printed when
+                with_details is True. Default False;
 
+        Returns:
+            str: The debug string.
         """
         assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                                bool)
@@ -270,6 +304,15 @@ class Variable(object):
     __repr__ = __str__
 
     def set_desc(self, input):
+        """
+        Set the variable description.
+
+        Args:
+            input(core.VarDesc): The new VarDesc.
+
+        Returns:
+            None
+        """
         self.desc = input
 
     @property
@@ -306,6 +349,15 @@ class Variable(object):
         return self.desc.type()
 
     def set_error_clip(self, error_clip):
+        """
+        Set the error_clip.
+
+        Args:
+            error_clip(BaseErrorClipAttr) : The new error_clip.
+
+        Returns:
+            None
+        """
         self.error_clip = error_clip
 
 
@@ -313,8 +365,8 @@ def get_all_op_protos():
     """
     Get all registered op proto from PaddlePaddle C++ end.
 
-    Returns(list): list of OpProto
-
+    Returns:
+       list: list of OpProto.
     """
     protostrs = core.get_all_op_protos()
     ret_values = []
@@ -357,13 +409,63 @@ class OpProtoHolder(object):
             raise ValueError("Operator \"%s\" has not been registered." % type)
         return self.op_proto_map[type]
 
+    @staticmethod
+    def generated_op_attr_names():
+        return {
+            core.op_proto_and_checker_maker.kOpRoleAttrName(),
+            core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+        }
+
 
 class Operator(object):
     """
-    Python Operator class. The operator represents the build in instructions in a
-    Block. Users can use the build in instructions to describe their neural
-    network.
+    In Fluid, all the operation are represented by Operator, and Operator
+    is regarded as a build in an instruction of a Block. Users can use the
+    build in instructions to describe their neural network.
+
+    Args:
+        block(Block): The block has the current operator.
+        desc(core.OpDesc): The protobuf description of Operator.
+        type(str): The type of operator. Default None.
+        inputs(dict): The input of this Operator. it is a dictionary, for every
+            element, key is the input parameter name, and value is a list of
+            variables. Default None.
+        outputs(dict): The output of this Operator. it is a dictionary, for
+            every element, key is the input parameter name, and value is a list
+            of variables. Default None.
+        attrs(dict): The attributes of this Operator. it is a dictionary, for
+            every element, key is attribute name, and value is the attribute value.
+            The attribute type should be as same as the type registered in C++ side.
+            Default None.
+
+    Returns:
+        Operator: The initialized Operator.
+
+    Raises:
+        ValueError: If the passed input, output and attrs doesn't match the
+            initializing Operator's that registered in C++ side.
+
+    Notes:
+        The constructor of operator should not be invoked directly. Use
+        Block.append_op or Block.prepend_op instead.
+
+    Examples:
+        .. code-block:: python
+
+            cur_program = Program()
+            cur_block = cur_program.current_block()
+            # var1 += var2 + var3
+            cur_block.append_op(type="sum",
+                                inputs={"X": [var1, var2, var3]},
+                                outputs={"Out": [var1]})
     """
+    OP_WITHOUT_KERNEL_SET = {
+        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
+        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
+        'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
+        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
+        'channel_recv', 'select', 'checkpoint_notify', 'gen_nccl_id'
+    }
 
     def __init__(self,
                  block,
@@ -372,34 +474,27 @@ class Operator(object):
                  inputs=None,
                  outputs=None,
                  attrs=None):
-        """
-        Constructor.
 
-        Notes: The constructor of operator should not be invoked directly. Use
-        Block.append_op or Block.prepend_op instead.
-
-        >>> cur_program = Program()
-        >>> cur_block = cur_program.current_block()
-        >>> # var1 += var2 + var3
-        >>> cur_block.append_op(type="sum",
-        >>>                     inputs={"X": [var1, var2, var3]},
-        >>>                     outputs={"Out": [var1]})
-
-        Args:
-            block(Block): The block has the current operator.
-            desc(core.OpDesc): The protobuf description.
-            type(str): The type of operator.
-            inputs(dict): The input dictionary. Key is the input parameter name.
-                Value is a list of variables.
-            outputs(dict): The output dictionary which has the same format with
-                           inputs.
-            attrs(dict): The attributes dictionary. Key is attribute name. Value
-                is the attribute value. The attribute type should be as same as
-                the type registered in C++
-        """
         self.block = block
         self.desc = desc
         self.attrs = attrs
+        if self.attrs is None:
+            self.attrs = dict()
+        del attrs
+
+        op_maker = core.op_proto_and_checker_maker
+
+        if op_maker.kOpRoleAttrName() not in self.attrs:
+            self.attrs[op_maker.kOpRoleAttrName()] = self.block.program.op_role
+
+        role_var_name = op_maker.kOpRoleVarAttrName()
+        if len(self.block.program.
+               op_role_var) != 0 and role_var_name not in self.attrs:
+            self.attrs[role_var_name] = self.block.program.op_role_var
+
+        if role_var_name in self.attrs and len(self.attrs[role_var_name]) == 0:
+            del self.attrs[role_var_name]
+
         if len(self.desc.type()) != 0:
             return
         if type is None:
@@ -465,42 +560,35 @@ class Operator(object):
                     arg.op = self
                 self.desc.set_output(out_proto.name, out_arg_names)
 
-        if attrs is not None:
-            if not isinstance(attrs, dict):
+        if self.attrs is not None:
+            if not isinstance(self.attrs, dict):
                 raise TypeError("'attrs' should be a dict.")
             for attr in proto.attrs:
                 attr_name = attr.name
-                if (attr_name not in attrs) or (attrs[attr_name] is None):
+                if (attr_name not in self.attrs) or (
+                        self.attrs[attr_name] is None):
                     continue
-                if isinstance(attrs[attr_name], Block):
-                    self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
-                elif isinstance(attrs[attr_name], core.BlockDesc) or \
-                   isinstance(attrs[attr_name], core.ProgramDesc):
-                    self.desc.set_serialized_attr(
-                        attr_name, attrs[attr_name].serialize_to_string())
-                else:
-                    self.desc.set_attr(attr_name, attrs[attr_name])
+                attr_val = self.attrs[attr_name]
+                self._update_desc_attr(attr_name, attr_val)
 
         self.desc.check_attrs()
-        no_kernel_op_set = {
-            'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
-            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
-            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
-            'load_combine', 'ncclInit', 'channel_create', 'channel_close',
-            'channel_send', 'channel_recv'
-        }
-        if type not in no_kernel_op_set:
+        if self.has_kernel(type):
             self.desc.infer_var_type(self.block.desc)
             self.desc.infer_shape(self.block.desc)
 
+    def has_kernel(self, op_type):
+        return op_type not in self.OP_WITHOUT_KERNEL_SET
+
     def to_string(self, throw_on_error):
         """
-        To debug string.
+        Get debug string.
+
         Args:
-            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
+            throw_on_error(bool): Whether to raise exception if self is not
+                initialized.
 
-        Returns(str): The debug string.
+        Returns:
+            str: The debug string.
 
         """
         protostr = self.desc.serialize_to_string()
@@ -518,29 +606,45 @@ class Operator(object):
 
     def input(self, name):
         """
-        Get input arguments by the input parameter name
-        Args:
-            name(str): The input parameter name
+        Get the input arguments according to the input parameter name.
 
-        Returns(list): return the list of argument names associated with the
-            specific parameter name.
+        Args:
+            name(str): The input parameter name.
 
+        Returns:
+            list: return the list of argument names that associated with \
+                the specific parameter name.
         """
         return self.desc.input(name)
 
     def rename_input(self, old_name, new_name):
+        """
+        Rename the `old_name` to `new_name`.
+
+        Args:
+            old_name(str): The old name of the Operator's input.
+            new_name(str): The new name of the Operator's input.
+
+        Returns:
+            None
+        """
         self.desc.rename_input(old_name, new_name)
 
     def rename_output(self, old_name, new_name):
+        """
+        Rename the `old_name` to `new_name`.
+
+        Args:
+            old_name(str): The old name of the Operator's output.
+            new_name(str): The new name of the Operator's output.
+
+        Returns:
+            None
+        """
         self.desc.rename_output(old_name, new_name)
 
     @property
     def input_names(self):
-        """
-        Get all input parameter names
-        Returns(list): return a list of input parameter names
-
-        """
         return self.desc.input_names()
 
     @property
@@ -553,33 +657,23 @@ class Operator(object):
 
     def output(self, name):
         """
-        Get output arguments by the output parameter name
-        Args:
-            name(str): The output parameter name
+        Get output arguments by the output parameter name.
 
-        Returns(list): return the list of argument names associated with the
-            specific parameter name.
+        Args:
+            name(str): The output parameter name.
 
+        Returns:
+            list: return the list of argument names associated with \
+                the specific parameter name.
         """
         return self.desc.output(name)
 
     @property
     def output_names(self):
-        """
-        Get all output parameter names
-        Returns(list): return a list of output parameter names
-
-        """
         return self.desc.output_names()
 
     @property
     def idx(self):
-        """
-        Return the array index of current operator.
-        Returns(int): The array index in block.ops array
-        Raises:
-            ValueError: when the operator is not found.
-        """
         for i, op in enumerate(self.block.ops):
             if op == self:
                 return i
@@ -588,81 +682,164 @@ class Operator(object):
 
     def has_attr(self, name):
         """
-        operator has the attribute with name or not.
+        Whether this Operator has the attribute with name or not.
+
         Args:
-            name(str): the attribute name
+            name(str): the attribute name.
 
-        Returns(bool): True if has this attribute.
+        Returns:
+            bool: True if has this attribute.
 
         """
         return self.desc.has_attr(name)
 
     def attr_type(self, name):
         """
-        Get the type of attribute by attribute name
-        Args:
-            name(str): the attribute name
+        Get the type of attribute by attribute's name.
 
-        Returns(core.AttrType): the attribute type
+        Args:
+            name(str): the attribute name.
 
+        Returns:
+            core.AttrType: the attribute type.
         """
         return self.desc.attr_type(name)
 
-    @property
-    def attr_names(self):
+    def set_attr(self, name, val):
+        """
+        Set the value of attribute by attribute's name.
+
+        Args:
+            name(str): the attribute name.
+            val(bool|int|str|float|list): the value of the attribute.
+
+        Raises:
+            ValueError: If the type of value doesn't match with desc.attr_type(name).
         """
-        Get all attribute names
-        Returns(list): The list of attribute name
+        self.attrs[name] = val
+        self._update_desc_attr(name, val)
+
+    def _update_desc_attr(self, name, val):
+        """
+        Update the value of desc's attribute by attribute's name.
+
+        Args:
+            name(str): the attribute name.
+            val(bool|int|str|float|list): the value of the attribute.
 
+        Raises:
+            ValueError: If the type of value doesn't match with desc.attr_type(name).
         """
+        if isinstance(val, Block):
+            self.desc.set_block_attr(name, val.desc)
+        elif isinstance(val, list) and val and all(
+                isinstance(v, Block) for v in val):
+            self.desc.set_blocks_attr(name, [v.desc for v in val])
+        elif isinstance(val, core.BlockDesc) or \
+                isinstance(val, core.ProgramDesc):
+            self.desc.set_serialized_attr(name, val.serialize_to_string())
+        else:
+            self.desc.set_attr(name, val)
+
+    @property
+    def attr_names(self):
         return self.desc.attr_names()
 
     def attr(self, name):
         """
-        Get attribute by name
+        Get the attribute by name.
+
         Args:
-            name(str): the attribute name
+            name(str): the attribute name.
 
-        Returns(bool|int|str|float|list): The attribute value. The return value
+        Returns:
+            bool|int|str|float|list: The attribute value. The return value
             can be any valid attribute type.
-
         """
         return self.desc.attr(name)
 
     def block_attr(self, name):
         """
-        Get the block attribute by name
-        Args:
-            name(str): the attribute name
+        Get the block attribute by name.
 
-        Returns(int): the block index
+        Args:
+            name(str): the attribute name.
 
+        Returns:
+            int: the block index.
         """
         return self.desc.block_attr(name)
 
+    def all_attrs(self):
+        """
+        Get the attribute dict.
+
+        Returns:
+            dict: The Operator's attribute dict.
+        """
+        attr_names = self.attr_names
+        attr_map = {}
+        for n in attr_names:
+            if n == 'sub_block':
+                attr_map[n] = self.block_attr(n)
+            else:
+                attr_map[n] = self.attr(n)
+        return attr_map
+
 
 class Block(object):
+    """
+    In Fluid, a Program is consistence of multi-Block, and Block stores
+    VarDesc and OpDesc. In a specific Block, a VarDesc have a unique name.
+    One block could have some child blocks, and child block's name scopes
+    should inherit the parent's so that OpDesc in child block can reference
+    a VarDesc that is stored in the parent block.
+    Please reference the framework.proto for details.
+
+    Args:
+        program(Program): The Program that the Block belongs to.
+        idx(int): The block's id in the Program.
+
+    Notes:
+        The constructor of Block should not be invoked directly. Please
+        use `Program.create_block()` to create a block.
+
+    Examples:
+        .. code-block:: python
+
+            cur_program = Program()
+            cur_block = cur_program.current_block()
+            var = cur_block.create_var(name="X",
+                                       shape=[-1, 23, 48],
+                                       dtype='float32')
+            cur_block.append_op(type="abs",
+                                inputs={"X": [var]},
+                                outputs={"Out": [var]})
+    """
+
     def __init__(self, program, idx):
         self.desc = program.desc.block(idx)
-        self.vars = dict()  # var_name --> var
-        self.ops = collections.deque()  # operator list
+        self.vars = collections.OrderedDict()  # var_name --> var
+        self.ops = list()  # operator list
         self.program = program
-        self.removed_vars = dict()
+        self.removed_vars = collections.OrderedDict()
 
     def __str__(self):
         return self.to_string(True)
 
     def to_string(self, throw_on_error, with_details=False):
         """
-        To debug string.
+        Get debug string.
+
         Args:
             throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
+                when throw_on_error is True.
             with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
-
-        Returns(str): The debug string.
+                (e.g. trainable, optimize_attr, ...) will be printed when
+                with_details is True. Default False.
 
+        Returns:
+            str: The debug string.
         """
         assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                                bool)
@@ -694,6 +871,15 @@ class Block(object):
         return self.desc.get_forward_block_idx()
 
     def set_forward_block_idx(self, idx):
+        """
+        Set the forward block Idx.
+
+        Args:
+            idx(int): the block index.
+
+        Returns:
+            None
+        """
         self.desc.set_forward_block_idx(idx)
 
     @property
@@ -701,14 +887,42 @@ class Block(object):
         return self.desc.id
 
     def var(self, name):
+        """
+        Get a Variable by name from this block.
+
+        Args:
+            name(str): the Variable's name.
+
+        Raises:
+            ValueError: The If input's type is not str, or this block
+                doesn't have a Variable with the giving name.
+
+        Returns:
+            Variable: the Variable with the giving name.
+        """
         if not isinstance(name, basestring):
-            raise TypeError()
+            raise TypeError(
+                "var require string as parameter, but get %s instead." %
+                (type(name)))
         v = self.vars.get(name, None)
         if v is None:
             raise ValueError("var %s not in this block" % name)
         return v
 
     def var_recursive(self, name):
+        """
+        Get a Variable by name from this block recursively.
+
+        Args:
+            name(str): the Variable's name.
+
+        Raises:
+            ValueError: this block and this parent block doesn't
+                have a Variable with the giving name.
+
+        Returns:
+            Variable: the Variable with the giving name.
+        """
         frontier = list()
         visited = set()
 
@@ -755,9 +969,21 @@ class Block(object):
     def rename_var(self, name, new_name):
         """
         Rename variable in vars and ops' inputs and outputs
+
+        Args:
+            name(str): the name that need to be renamed.
+            new_name(str): the name that need to rename to.
+
+        Raises:
+            ValueError: If this block doesn't have this the giving name,
+                or the type of the var with the giving name is not Parameter
+                or Variable.
+
+        Returns:
+            Variable: the Variable with the giving name.
         """
         if not self.has_var(name):
-            raise ValueError("var %s is not in current" % name)
+            raise ValueError("var %s is not in current block" % name)
         v = self.var(name)
         if type(v) == Parameter:
             var_type = "Parameter"
@@ -803,6 +1029,12 @@ class Block(object):
         self.vars[new_name] = var
         del self.vars[name]
         self.sync_with_cpp()
+        return var
+
+    def remove_var(self, name):
+        self.sync_with_cpp()
+        self.desc.remove_var(name)
+        del self.vars[name]
 
     def create_parameter(self, *args, **kwargs):
         global_block = self.program.global_block()
@@ -812,41 +1044,81 @@ class Block(object):
         return param
 
     def append_op(self, *args, **kwargs):
+        """
+        Appends a new Operator according to the giving arguments.
+
+        Returns:
+            Operator: the append Operator.
+        """
         op_desc = self.desc.append_op()
         op = Operator(block=self, desc=op_desc, *args, **kwargs)
         self.ops.append(op)
         return op
 
-    def delete_ops(self, ops):
-        # remove from cpp
-        # FIXME(typhoonzero): remove only the first occurrence.
-        try:
-            start = list(self.ops).index(ops[0])
-            end = list(self.ops).index(ops[-1])
-        except Exception, e:
-            raise e
-        self.desc.remove_op(start, end + 1)
+    def insert_op(self, index, *args, **kwargs):
+        """
+        Insert a Operator according to the giving arguments.
+
+        Args:
+            index(int): the place that the operator to insert.
+
+        Returns:
+            Operator: the insert Operator.
+        """
+        self.sync_with_cpp()
+        op_desc = self.desc.insert_op(index)
+        op = Operator(block=self, desc=op_desc, *args, **kwargs)
+        self.ops.insert(index, op)
+        return op
+
+    def remove_op(self, index):
+        """
+        Remove the specific position operator.
+
+        Args:
+            index(int): the position that the operator to insert.
+
+        Returns:
+            None
+        """
+        self.sync_with_cpp()
+        self.desc.remove_op(index, index + 1)
+        del self.ops[index]
 
     def slice_ops(self, start, end):
-        return list(self.ops)[start:end]
+        """
+        Return the Operator between start and end.
+
+        Args:
+            start(int): the start position.
+            end(int): the end position.
+
+        Returns:
+            list: the Operators between start and end.
+        """
+        return self.ops[start:end]
 
     def prepend_op(self, *args, **kwargs):
         op_desc = self.desc.prepend_op()
         op = Operator(self, op_desc, *args, **kwargs)
-        self.ops.appendleft(op)
+        self.ops.insert(0, op)
         return op
 
     def sync_with_cpp(self):
         """
-        Sync with the desc on the c++ end.
-
-        This method is used to synchronize the c++ desc instance generated by backward.
+        Sync from the desc on the c++ end. This method is used to synchronize
+        the c++ desc instance generated by backward.
         """
         # sync variables from cpp
         for var in self.desc.all_vars():
             if not self.has_var(var.name()):
                 self.create_var(name=var.name(), desc=var, type=var.type())
 
+        # sync variables removed from c++ end
+        for var in self.vars.keys():
+            if not self.desc.find_var(var):
+                self.vars.pop(var)
+
         # sync operators from cpp
         ops_in_cpp = []
         for op_idx in range(0, self.desc.op_size()):
@@ -873,7 +1145,7 @@ class Block(object):
         for index in range((start_index - 1 - 1), -1, -1):
             op_desc = ops_in_cpp[index]
             op = Operator(self, op_desc)
-            self.ops.appendleft(op)
+            self.ops.insert(0, op)
 
         # sync ops append to the end of cpp_ops
         for index in range((end_index + 1), len(ops_in_cpp)):
@@ -881,15 +1153,33 @@ class Block(object):
             op = Operator(self, op_desc)
             self.ops.append(op)
 
+        # sync ops removed from c++ end
+        if end_index != -1 and end_index < len(self.ops):
+            ops_in_cpp_index = 0
+            ops_in_python_index = 0
+            while ops_in_python_index < len(
+                    self.ops) and ops_in_cpp_index < len(ops_in_cpp):
+                if self.ops[ops_in_python_index].desc != ops_in_cpp[
+                        ops_in_cpp_index]:
+                    del self.ops[ops_in_python_index]
+                else:
+                    ops_in_cpp_index += 1
+                    ops_in_python_index += 1
+
         assert len(self.ops) == len(ops_in_cpp)
         for index in range(len(self.ops)):
             assert self.ops[index].desc == ops_in_cpp[index]
 
     def copy_param_info_from(self, other):
         """
-        Copy the information of parameters from the other block
+        Copy the information of parameters from the other block.
+
         Args:
-            other(Block): the other block
+            other(Block): the other block.
+
+        Raises:
+            ValueError: If type of input is not Block, or the `other` and this
+                block is not in the same topology.
 
         Returns:
             None
@@ -918,27 +1208,170 @@ class Block(object):
                 name=v.name)
             self.vars[new_p.name] = new_p
 
+    def clone_variable(self, var):
+        """
+        Clone a variable into current block.
+
+        Args:
+            var: the variable to be cloned.
+
+        Returns:
+            Variable: the new  variable cloned from 'var' in current block.
+        """
+        assert isinstance(var, Variable)
+        ret_var = None
+        # make STEP_SCOPES var can be safely cloned.
+        if var.type == core.VarDesc.VarType.STEP_SCOPES:
+            ret_var = self.create_var(
+                name=var.name, persistable=var.persistable, type=var.type)
+        elif var.type == core.VarDesc.VarType.RAW:
+            ret_var = self.create_var(
+                name=var.name, persistable=var.persistable, type=var.type)
+        elif var.type == core.VarDesc.VarType.SELECTED_ROWS:
+            ret_var = self.create_var(
+                name=var.name,
+                shape=var.shape,
+                dtype=var.dtype,
+                type=var.type,
+                persistable=True,
+                is_data=var.is_data)
+        else:
+            ret_var = self.create_var(
+                name=var.name,
+                shape=var.shape,
+                dtype=var.dtype,
+                type=var.type,
+                lod_level=var.lod_level,
+                persistable=True,
+                is_data=var.is_data)
+        return ret_var
+
 
 class Program(object):
+    """
+    Python Program. Beneath it is a ProgramDesc, which is used for
+    create c++ Program. A program is a self-contained programing
+    language like container. It has at least one Block, when the
+    control flow op like conditional_block, while_op is included,
+    it will contains nested block.
+    Please reference the framework.proto for details.
+
+    Notes: we have default_startup_program and default_main_program
+    by default, a pair of them will shared the parameters.
+    The default_startup_program only run once to initialize parameters,
+    default_main_program run in every mini batch and adjust the weights.
+
+    Returns:
+        A empty program.
+
+    Examples:
+        >>> main_program = fluid.Program()
+        >>> startup_program = fluid.Program()
+        >>> with fluid.program_guard(main_program=main_program, startup_program=startup_program):
+        >>>     fluid.layers.data(name="x", shape=[-1, 784], dtype='float32')
+        >>>     fluid.layers.data(name="y", shape=[-1, 1], dtype='int32')
+        >>>     fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu")
+
+    """
+
     def __init__(self):
         self.desc = core.ProgramDesc()
         self.blocks = [Block(self, 0)]
         self.current_block_idx = 0
         self._seed = 0
+        self._current_role = core.op_proto_and_checker_maker.OpRole.Forward
+        self._op_role_var = []
+
+    @property
+    def op_role(self):
+        """
+        The operator role. In a enum {Forward, Backward, Optimize}.
+
+        Notes: this is a low level API. It is used only for ParallelExecutor to
+        duplicate or schedule operator to devices.
+
+        For example, the forward operator should be executed on every device.
+        The backward operator should be executed on every device and the
+        parameter gradient of backward (use :code:`op_role_var` to get this
+        variable) operator should be merged to one device. The optimization
+        operators should be executed on only one device and broadcast the
+        optimization result, i.e., the new parameter, to every other device.
+        """
+        return self._current_role
+
+    @op_role.setter
+    def set_op_role(self, role):
+        self._current_role = role
+
+    @property
+    def op_role_var(self):
+        """
+        The auxiliary variables for :code:`op_role` property.
+
+        See Also: :code:`Program.op_role`'s documentation for details.
+
+        Notes: This is a very low-level API. Users should not use it directly.
+        """
+        return self._op_role_var
+
+    @op_role_var.setter
+    def set_op_role_var(self, var_name):
+        self._op_role_var = [var_name]
+
+    @contextlib.contextmanager
+    def optimized_guard(self, var):
+        """
+        A with guard to set :code:`Optimization` :code:`OpRole` and
+        :code:`OpRoleVar` automatically.
+
+        Notes: This is a very low level API. Users should not use it directly.
+
+        Args:
+            var(Variable|str): The variable (name) to be optimized.
+
+        Examples:
+
+            >>> p, g = backward(...)
+            >>> with program.optimized_guard(p):
+            >>>     p = p - 0.001 * g
+        """
+        OpRole = core.op_proto_and_checker_maker.OpRole
+        self._current_role = OpRole.Optimize
+        self._op_role_var = [var.name if isinstance(var, Variable) else var]
+        yield
+        self._op_role_var = []
+        self._current_role = OpRole.Forward
 
     def __str__(self):
+        """
+        Get the protobuf debug string of this Program.
+
+        Returns:
+            (str): The protobuf debug string.
+
+        Raises:
+            ValueError: If any of required fields is not set.
+        """
         return self.to_string(True)
 
     def to_string(self, throw_on_error, with_details=False):
         """
         To debug string.
+
         Args:
-            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
-            with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+            throw_on_error(bool): raise Value error when any of required fields
+                is not set.
 
-        Returns(str): The debug string.
+            with_details(bool): True if more details about variables and
+                parameters, e.g., :code:`trainable`, :code:`optimize_attr`, need
+                to print.
+
+        Returns
+            (str): The debug string.
+
+        Raises:
+            ValueError: If any of required fields is not set and throw_on_error is
+                True.
 
         """
         assert isinstance(throw_on_error, bool) and isinstance(with_details,
@@ -954,44 +1387,147 @@ class Program(object):
         return res_str
 
     def get_desc(self):
+        """
+        Get the C++ side of `ProgramDesc` object pointer. The C++ object is
+        exposed by :code:`pybind`.
+
+        Notes: This is a very low level API. Users should not use this API
+        directly.
+        """
         return self.desc
 
     def clone(self, for_test=False):
-        """Clone the Program object
+        """
+        Create a new, duplicated program.
+
+
+        Some operators, e.g., :code:`batch_norm`, behave differently between
+        training and testing. They have an attribute, :code:`is_test`, to
+        control this behaviour. This method will change the :code:`is_test`
+        attribute of them to :code:`True` when :code:`for_test=True`.
+
+        * Set for_test to False when we want to clone the program for training.
+        * Set for_test to True when we want to clone the program for testing.
+
+        Notes: This API DOES NOT prune any operator. Use
+        :code:`clone(for_test=True)` before backward and optimization please. e.g.
 
-        Set for_test to False when we want to clone the program for training.
-        Set for_test to True when we want to clone the program for testing.         
+            >>> test_program = fluid.default_main_program().clone(for_test=True)
+            >>> optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
+            >>> optimizer.minimize()
 
         Args:
-            for_test(bool): Some operators, such as batch_norm and drop_out ops,
-                behave differently in training and testing. If for_test is True,
-                the is_test attributes in these operators will be set to True for
-                testing purposes, otherwise, they remain unchanged.  
-                
-        Returns(Program):
-            The cloned Program object.
+            for_test(bool): True if change the :code:`is_test` attribute of
+                operators to :code:`True`.
+
+        Returns:
+            Program: The new, duplicated Program object.
+
+        Examples:
+
+            1. To clone a test program, the sample code is:
+
+            >>> import paddle.fluid as fluid
+            >>> train_program = fluid.Program()
+            >>> startup_program = fluid.Program()
+            >>> with fluid.program_guard(train_program, startup_program):
+            >>>     img = fluid.layers.data(name='image', shape=[784])
+            >>>     hidden = fluid.layers.fc(input=img, size=200, act='relu')
+            >>>     hidden = fluid.layers.dropout(hidden, dropout_prob=0.5)
+            >>>     loss = fluid.layers.cross_entropy(
+            >>>                 input=fluid.layers.fc(hidden, size=10, act='softmax'),
+            >>>                 label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
+            >>>
+            >>> test_program = train_program.clone(for_test=True)
+            >>>
+            >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+            >>> with fluid.program_guard(train_program, startup_program):
+            >>>     sgd.minimize(loss)
+
+            2. The :code:`clone` method can be avoid if you create program for
+            training and program for testing individually.
+
+            >>> import paddle.fluid as fluid
+            >>>
+            >>> def network(is_test):
+            >>>     img = fluid.layers.data(name='image', shape=[784])
+            >>>     hidden = fluid.layers.fc(input=img, size=200, act='relu')
+            >>>     hidden = fluid.layers.dropout(hidden, dropout_prob=0.5, is_test=is_test)
+            >>>     loss = fluid.layers.cross_entropy(
+            >>>                 input=fluid.layers.fc(hidden, size=10, act='softmax'),
+            >>>                 label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
+            >>>     return loss
+            >>>
+            >>> train_program = fluid.Program()
+            >>> startup_program = fluid.Program()
+            >>> test_program = fluid.Program()
+            >>>
+            >>> with fluid.program_guard(train_program, startup_program):
+            >>>     with fluid.unique_name.guard():
+            >>>         loss = network(is_test=False)
+            >>>         sgd = fluid.optimizer.SGD(learning_rate=1e-3)
+            >>>         sgd.minimize(loss)
+            >>>
+            >>> # the test startup program is not used.
+            >>> with fluid.program_guard(test_program, fluid.Program()):
+            >>>     with fluid.unique_name.guard():
+            >>>         loss = network(is_test=True)
+
+            The two code snippets above will generate same programs.
         """
-        p = Program()
         if for_test:
-            p.desc = core.inference_optimize(self.desc)
+            p = self.inference_optimize()
         else:
+            p = Program()
             p.desc = core.ProgramDesc(self.desc)
-        p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
-        p.sync_with_cpp()
+            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.sync_with_cpp()
+
         p.copy_param_info_from(self)
+        p.copy_data_info_from(self)
         return p
 
     def prune(self, targets):
+        """
+        Prune operators and variables which are not needed to generate
+        :code:`targets`.
+
+        Notes: This is a very low level API. Users should not use this API
+        directly. This API is in flux and not stable.
+
+        Args:
+            targets(list|Variable|Operator): A list of variables or operators
+                need to be pruned
+
+        Returns:
+            Program:  A new, pruned program.
+
+        """
         if not isinstance(targets, list):
             targets = [targets]
         targets_idx = []
         for t in targets:
             if not isinstance(t, Operator):
                 if isinstance(t, Variable):
+                    # After transpiler processing, the op that output this
+                    # variable maybe has been changed, so t.op is not reliable
+                    # and we need to find the current op that generate this
+                    # variable here.
+                    t.op = None
+                    global_block = self.global_block()
+                    for idx, op in enumerate(global_block.ops):
+                        if t.name in op.output_arg_names:
+                            t.op = op
+                            break
+
                     t = t.op
+                    if t is None:
+                        raise ValueError(
+                            "The target variable must have an "
+                            "associated operator that generates it.")
                 else:
-                    raise ValueError(("All targets of prune() can only be "
-                                      "Variable or Operator."))
+                    raise ValueError("All targets of prune() can only be "
+                                     "Variable or Operator.")
 
             targets_idx.append([t.block.idx, t.idx])
         res = Program()
@@ -1001,14 +1537,45 @@ class Program(object):
         return res
 
     def inference_optimize(self):
+        """
+        This method will create a new program and change the :code:`is_test`
+        attribute of operators to :code:`True`. All the :code:`Parameter`
+        information will be lost.
+
+        Notes: This API is a very low level API. Use
+        :code:`Program.clone(for_test=True)` instead.
+
+        Returns:
+            Program: The new program.
+        """
+        # this is an alternative implement before
+        # core.inference_optimize being fixed.
         res = Program()
-        res.desc = core.inference_optimize(self.desc)
+        res.desc = core.ProgramDesc(self.desc)
+        for i in xrange(res.desc.num_blocks()):
+            block = res.desc.block(i)
+            for j in xrange(block.op_size()):
+                op = block.op(j)
+                if op.has_attr('is_test'):
+                    op.set_attr('is_test', True)
         res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
         res.sync_with_cpp()
         return res
 
     @staticmethod
     def parse_from_string(binary_str):
+        """
+        Deserialize a program desc from protobuf binary string.
+
+        Notes: All information about parameters will be lost after serialization
+        and deserialization.
+
+        Args:
+            binary_str(str): The binary prootbuf string.
+
+        Returns:
+            Program: A deserialized program desc.
+        """
         p = Program()
         p.desc = core.ProgramDesc(binary_str)
         p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
@@ -1017,8 +1584,21 @@ class Program(object):
 
     @property
     def random_seed(self):
+        """
+        The default random seed for random operators in Program. Zero means get
+        the random seed from random device.
+
+        Notes: It must be set before the operators have been added.
+        """
         return self._seed
 
+    @property
+    def num_blocks(self):
+        """
+        The number of blocks in this program.
+        """
+        return self.desc.num_blocks()
+
     @random_seed.setter
     def random_seed(self, seed):
         if not isinstance(seed, int):
@@ -1029,33 +1609,40 @@ class Program(object):
         return str(self)
 
     def global_block(self):
+        """
+        Get the first block of this program.
+        """
         return self.blocks[0]
 
     def block(self, index):
+        """
+        Get the :code:`index` block of this program
+        Args:
+            index(int): The index of block to get
+
+        Returns:
+            Block: The :code:`index` block
+        """
         return self.blocks[index]
 
     def current_block(self):
+        """
+        Get the current block. The :code:`current` block is the block to append
+        operators.
+        """
         return self.blocks[self.current_block_idx]
 
-    def append_backward(self, target, no_grad_set=None):
-        """
-        return map(param_name -> (grad_name, block_index, op_index))
+    def create_block(self, parent_idx=None):
         """
-        assert isinstance(target, Variable)
-        if no_grad_set is None:
-            no_grad_set = set()
-        try:
-            param_to_grad_info = self.desc.append_backward(target.desc,
-                                                           no_grad_set)
-        except Exception as e:
-            raise core.EnforceNotMet(
-                str(e) + "\nCurrent protobuf is\n{0}".format(
-                    self.to_string(False)))
+        Create a new block with the :code:`parent_idx` and change the current block
+        to new block.
 
-        self.sync_with_cpp()
-        return param_to_grad_info
+        Args:
+            parent_idx(int): The parent block index.
 
-    def create_block(self, parent_idx=None):
+        Returns:
+            Block: The new block.
+        """
         new_block_idx = len(self.blocks)
         parent = self.current_block() if parent_idx is None else self.block(
             parent_idx)
@@ -1065,9 +1652,24 @@ class Program(object):
         return self.current_block()
 
     def rollback(self):
+        """
+        Exit a code block, i.e., roll back to the parent block.
+        Returns:
+            None
+        """
         self.current_block_idx = self.current_block().parent_idx
 
     def sync_with_cpp(self):
+        """
+        Synchronize Python instance to its binding C++ object instance.
+        If the program is modified in C++ space, this method should be invoked.
+
+        Notes: This is a very low level API. Users should not invoke it
+        directly.
+
+        Returns:
+            None
+        """
         for block_idx in range(len(self.blocks), self.desc.num_blocks()):
             self.blocks.append(Block(self, block_idx))
         for block in self.blocks:
@@ -1076,6 +1678,10 @@ class Program(object):
     def copy_param_info_from(self, other):
         """
         Copy the information of parameters from other program.
+
+        Notes: This is a very low level API. Users should not invoke it
+        directly.
+
         Args:
             other(Program): Other program
 
@@ -1091,13 +1697,66 @@ class Program(object):
                              "program, with represent the same topology")
         self.global_block().copy_param_info_from(other.global_block())
 
+    def copy_data_info_from(self, other):
+        """
+        Copy the information of data variables from other program.
+
+        Notes: This is a very low level API. Users should not invoke it
+        directly.
+
+        Args:
+            other(Program): Other program
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Program):
+            raise TypeError("copy_param_info_from should be invoked with "
+                            "Program")
+
+        if len(self.blocks) != len(other.blocks):
+            raise ValueError("copy_param_info_from should be invoked with two "
+                             "program, with represent the same topology")
+        for var in other.global_block().vars.itervalues():
+            if var.is_data:
+                self.global_block().var(var.name).is_data = True
+
     def list_vars(self):
+        """
+        Get all variables from this Program. A iterable object is returned.
+
+        Returns:
+            iterable: The generator will yield every variable in this program.
+        """
         for each_block in self.blocks:
             for each_var in each_block.vars.itervalues():
                 yield each_var
 
 
 class Parameter(Variable):
+    """
+    Parameter is derived from Variable. A parameter is a persistable 
+    Variable, and will be updated by optimizers after each iteration.
+    The training of a neural network is essentially the updating of 
+    its parameters.
+
+    Relative to a general Variable, a Parameter has several its own
+    member variables:
+
+    Args:
+        trainable(bool): True if the parameter need to be updated after
+            iterations.
+        optimize_attr(map): Parameter attributes related with optimizing.
+            Currently, it only contains 'learning_rate'.
+            Default: {'learning_rate': 1.0}
+        regularizer(WeightDecayRegularizer): The Regularizer which will
+            be applied on the parameter. Default: None
+        gradient_clip_attr(BaseGradientClipAttr): The gradint clip strategy
+            which will be applied on the parameter. Default: None
+        do_model_average(bool): True if the model average strategy will
+            be applied on this parameter.
+    """
+
     def __init__(self, block, shape, dtype, **kwargs):
         if shape is None or dtype is None:
             raise ValueError("Parameter must set shape and dtype")
@@ -1119,12 +1778,15 @@ class Parameter(Variable):
 
         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 
+        self.do_model_average = kwargs.get('do_model_average', None)
+
     def __str__(self):
         return self.to_string(True)
 
     def to_string(self, throw_on_error, with_details=False):
         """
         To debug string.
+
         Args:
             throw_on_error(bool): raise exception when self is not initialized
                 when throw_on_error is True
@@ -1139,7 +1801,7 @@ class Parameter(Variable):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "do_model_average")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          str(getattr(self, attr_name)))
@@ -1157,8 +1819,15 @@ _startup_program_ = Program()
 
 def default_startup_program():
     """
-    Get default startup program. In startup program, Paddle will initialize
-    parameters, initialize nccl handle, etc.
+    Get default/global startup program.
+
+    The layer function in :code:`fluid.layers` will create parameters, readers,
+    NCCL handles as global variables. The :code:`startup_program` will
+    initialize them by the operators in startup program. The layer function will
+    append these initialization operators into startup program.
+
+    This method will return the :code:`default` or the :code:`current` startup
+    program. Users can use :code:`fluid.program_guard` to switch program.
 
     Returns:
         Program: startup program
@@ -1168,7 +1837,15 @@ def default_startup_program():
 
 def default_main_program():
     """
-    Get default main program. The main program is used for training or testing.
+    Get default/global main program. The main program is used for training or
+    testing.
+
+    All layer function in :code:`fluid.layers` will append operators and
+    variables to the :code:`default_main_program`.
+
+    The :code:`default_main_program` is the default program in a lot of APIs.
+    For example, the :code:`Executor.run()` will execute the
+    :code:`default_main_program` when the program is not specified.
 
     Returns:
         Program: main program
@@ -1210,20 +1887,34 @@ def switch_startup_program(program):
 @contextlib.contextmanager
 def program_guard(main_program, startup_program=None):
     """
-    Switch program with `with` statement
+    Change the global main program and startup program with `with` statement.
+    Layer functions in the Python `with` block will append operators and
+    variables to the new main programs.
 
     Examples:
-        >>> with program_guard(Program()):
-        >>>   data = fluid.layers.data(...)
-        >>>   hidden = fluid.layers.fc(...)
+
+        >>> import paddle.fluid as fluid
+        >>> main_program = fluid.Program()
+        >>> startup_program = fluid.Program()
+        >>> with fluid.program_guard(main_program, startup_program):
+        >>>     data = fluid.layers.data(...)
+        >>>     hidden = fluid.layers.fc(...)
+
+    Notes: The temporary :code:`Program` can be used if the user does not need
+    to construct either of startup program or main program.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> main_program = fluid.Program()
+        >>> # does not care about startup program. Just pass a temporary value.
+        >>> with fluid.program_guard(main_program, fluid.Program()):
+        >>>     data = ...
 
     Args:
-        main_program(Program): New main program inside `with` statement
+        main_program(Program): New main program inside `with` statement.
         startup_program(Program): New startup program inside `with` statement.
             None means do not change startup program.
-
-    Returns:
-        None
     """
     if not isinstance(main_program, Program):
         raise TypeError("main_program should be Program")
@@ -1240,11 +1931,12 @@ def program_guard(main_program, startup_program=None):
 
 def get_var(name, program=None):
     """
-    Get a variable by name from the global block of a program
+    Get a variable by name from the global block of a program.
+
     Args:
         name(str): name of the variable
         program(Program|None): program object.
-             If None, default_global_program() will be used.
+        If None, default_global_program() will be used.
 
     Returns:
         Variable
diff --git a/python/paddle/fluid/graphviz.py b/python/paddle/fluid/graphviz.py
index b8d21344fc8f65f4025f28a195dab2d371b30292..125b4efa9d476e561bd78d0365cd92bbf7e66605 100644
--- a/python/paddle/fluid/graphviz.py
+++ b/python/paddle/fluid/graphviz.py
@@ -83,7 +83,7 @@ class Graph(object):
         file = open(dot_path, 'w')
         file.write(self.__str__())
         image_path = os.path.join(
-            os.path.dirname(__file__), dot_path[:-3] + "pdf")
+            os.path.dirname(dot_path), dot_path[:-3] + "pdf")
         cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
         subprocess.Popen(
             cmd,
@@ -199,7 +199,7 @@ class GraphPreviewGenerator(object):
         else:
             self.graph.show(path)
 
-    def add_param(self, name, data_type, shape, highlight=False):
+    def add_param(self, name, data_type, highlight=False):
         label = '\n'.join([
             '<<table cellpadding="5">',
             '  <tr>',
@@ -214,11 +214,6 @@ class GraphPreviewGenerator(object):
             str(data_type),
             '    </td>'
             '  </tr>',
-            '  <tr>',
-            '    <td>',
-            '[%s]' % 'x'.join(shape),
-            '    </td>'
-            '  </tr>',
             '</table>>',
         ])
         return self.graph.node(
diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a81e39695b78f235d6ae896d90117dd392692634
--- /dev/null
+++ b/python/paddle/fluid/inferencer.py
@@ -0,0 +1,111 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+
+import core
+
+import executor
+import framework
+import io
+import parallel_executor
+import unique_name
+from trainer import check_and_get_place
+
+__all__ = ['Inferencer', ]
+
+
+class Inferencer(object):
+    """
+    Inferencer High Level API.
+
+    Args:
+        infer_func (Python func): Infer function that will return predict Variable
+        param_path (str): The path where the inference model is saved by fluid.io.save_params
+        place (Place): place to do the inference
+        parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU.
+
+    Examples:
+        .. code-block:: python
+
+            def inference_program():
+                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+                y_predict = fluid.layers.fc(input=x, size=1, act=None)
+                return y_predict
+
+            place = fluid.CPUPlace()
+            inferencer = fluid.Inferencer(
+                infer_func=inference_program, param_path="/tmp/model", place=place)
+
+    """
+
+    def __init__(self, infer_func, param_path, place=None, parallel=False):
+        self.param_path = param_path
+        self.scope = core.Scope()
+        self.parallel = parallel
+        self.place = check_and_get_place(place)
+
+        self.inference_program = framework.Program()
+        with framework.program_guard(self.inference_program):
+            with unique_name.guard():
+                self.predict_var = infer_func()
+
+        with self._prog_and_scope_guard():
+            # load params from param_path into scope
+            io.load_params(executor.Executor(self.place), param_path)
+
+        if parallel:
+            with self._prog_and_scope_guard():
+                self.exe = parallel_executor.ParallelExecutor(
+                    use_cuda=isinstance(self.place, core.CUDAPlace),
+                    loss_name=self.predict_var.name)
+        else:
+            self.exe = executor.Executor(self.place)
+
+        self.inference_program = self.inference_program.clone(for_test=True)
+
+    def infer(self, inputs, return_numpy=True):
+        """
+        Do Inference for Inputs
+
+        Args:
+            inputs (map): a map of {"input_name": input_var} that will be feed into the inference program
+            return_numpy (bool): transform return value into numpy or not
+
+        Returns:
+            Tensor or Numpy: the predict value of the inference model for the inputs
+
+        Examples:
+            .. code-block:: python
+
+                tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
+                results = inferencer.infer({'x': tensor_x})
+        """
+        if not isinstance(inputs, dict):
+            raise ValueError(
+                "inputs should be a map of {'input_name': input_var}")
+
+        with executor.scope_guard(self.scope):
+            results = self.exe.run(self.inference_program,
+                                   feed=inputs,
+                                   fetch_list=[self.predict_var],
+                                   return_numpy=return_numpy)
+
+        return results
+
+    @contextlib.contextmanager
+    def _prog_and_scope_guard(self):
+        with framework.program_guard(main_program=self.inference_program):
+            with executor.scope_guard(self.scope):
+                yield
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 927f1e625a579737b98e60683d8d9ed90d5e7e03..373e9c060de1ee27c165ccd2380cd8c38612c4d9 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -15,27 +15,43 @@
 import framework
 import numpy as np
 import contextlib
+from framework import convert_np_dtype_to_dtype_
+from core import VarDesc
 
 __all__ = [
-    'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
-    'init_on_cpu'
+    'Constant', 'Uniform', 'Normal', 'Xavier', 'Bilinear', 'MSRA',
+    'force_init_on_cpu', 'init_on_cpu', 'ConstantInitializer',
+    'UniformInitializer', 'NormalInitializer', 'XavierInitializer',
+    'BilinearInitializer', 'MSRAInitializer'
 ]
 
 _force_init_on_cpu_ = False
 
 
 def force_init_on_cpu():
+    """
+    The flag of whether force to init variables on CPU.
+
+    Examples:
+        .. code-block:: python
+
+            if force_init_on_cpu():
+                pass
+
+    """
     return _force_init_on_cpu_
 
 
 @contextlib.contextmanager
 def init_on_cpu():
     """
-    Switch program with `with` statement
+    Force the variable to be inited on CPU.
 
     Examples:
-        >>> with init_on_cpu():
-        >>>   step = layers.create_global_var()
+        .. code-block:: python
+
+            with init_on_cpu():
+                step = layers.create_global_var()
 
     """
     global _force_init_on_cpu_
@@ -101,14 +117,18 @@ class Initializer(object):
 
 class ConstantInitializer(Initializer):
     """Implements the constant initializer
+
+    Args:
+        value (float): constant value to initialize the variable
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(input=x, size=10,
+                param_attr=fluid.initializer.Constant(value=2.0))
     """
 
     def __init__(self, value=0.0, force_cpu=False):
-        """Constructor for ConstantInitializer
-
-        Args:
-            value: constant value to initialize the variable
-        """
         assert value is not None
         super(ConstantInitializer, self).__init__()
         self._value = value
@@ -143,16 +163,20 @@ class ConstantInitializer(Initializer):
 
 class UniformInitializer(Initializer):
     """Implements the random uniform distribution initializer
+
+    Args:
+        low (float): lower boundary of the uniform distribution
+        high (float): upper boundary of the uniform distribution
+        seed (int): random seed
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(input=x, size=10,
+                param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5))
     """
 
     def __init__(self, low=-1.0, high=1.0, seed=0):
-        """Constructor for UniformInitializer
-
-        Args:
-            low: lower boundary of the uniform distribution
-            high: upper boundary of the uniform distribution
-            seed: random seed
-        """
         assert low is not None
         assert high is not None
         assert high >= low
@@ -193,17 +217,21 @@ class UniformInitializer(Initializer):
 
 
 class NormalInitializer(Initializer):
-    """Implements the  random Normal(Gaussian) distribution initializer
+    """Implements the Random Normal(Gaussian) distribution initializer
+
+    Args:
+        loc (float): mean of the normal distribution
+        scale (float): standard deviation of the normal distribution
+        seed (int): random seed
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(input=x, size=10,
+                param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0))
     """
 
     def __init__(self, loc=0.0, scale=1.0, seed=0):
-        """Constructor for NormalInitializer
-
-        Args:
-            loc: mean of the normal distribution
-            scale: standard deviation of the normal distribution
-            seed: random seed
-        """
         assert loc is not None
         assert scale is not None
         assert seed is not None
@@ -243,39 +271,49 @@ class NormalInitializer(Initializer):
 
 
 class XavierInitializer(Initializer):
-    """Implements the Xavier initializer
-
+    """
     This class implements the Xavier weight initializer from the paper
-    Understanding the difficulty of training deep feedforward neural
-    networks[1] by Xavier Glorot and Yoshua Bengio.
+    `Understanding the difficulty of training deep feedforward neural
+    networks <http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>`_
+    by Xavier Glorot and Yoshua Bengio.
 
     This initializer is designed to keep the scale of the gradients
     approximately same in all the layers. In case of Uniform distribution,
-    the range is [-x, x], where x = sqrt(6 / (fan_in + fan_out)).
+    the range is [-x, x], where
+
+    .. math::
+
+        x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}}
+
     In case of Normal distribution, the mean is 0 and the standard deviation
-    is sqrt(2/ (fan_in + fan_out)).
+    is
+
+    .. math::
+
+        \sqrt{\\frac{2.0}{fan\_in + fan\_out}}
+
+
+    Args:
+        uniform (bool): whether to use uniform or normal distribution
+        fan_in (float): fan_in for Xavier initialization. If None, it is
+                inferred from the variable.
+        fan_out (float): fan_out for Xavier initialization. If None, it is
+                 inferred from the variable.
+        seed (int): random seed
+
+    Note:
+        It is recommended to set fan_in and fan_out to None for most cases.
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(
+                input=queries, size=10,
+                param_attr=fluid.initializer.Xavier(uniform=False))
 
-    References:
-        [1] Understanding the difficulty of training deep feedforward neural
-            networks. International conference on artificial intelligence and
-            statistics.
-            (http://proceedings.mlr.press/v9/glorot10a.html)
     """
 
     def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0):
-        """Constructor for XavierInitializer
-
-        Args:
-            uniform: whether to use uniform or normal distribution
-            fan_in: fan_in for Xavier initialization. If None, it is
-                    inferred from the variable.
-            fan_out: fan_out for Xavier initialization. If None, it is
-                     inferred from the variable.
-            seed: random seed
-
-        Note: It is recommended to set fan_in and fan_out to None for
-              most cases.
-        """
         assert uniform is not None
         assert seed is not None
         super(XavierInitializer, self).__init__()
@@ -339,30 +377,42 @@ class MSRAInitializer(Initializer):
     """Implements the MSRA initializer a.k.a. Kaiming Initializer
 
     This class implements the weight initialization from the paper
-    Delving Deep into Rectifiers: Surpassing Human-Level Performance on
-    ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren
-    and Jian Sun. This is a robust initialization method that particularly
-    considers the rectifier nonlinearities. In case of Uniform distribution,
-    the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal
-    distribution, the mean is 0 and the standard deviation
-    is sqrt(2/ fan_in).
-
-    References:
-        [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance
-            on ImageNet Classification
-            (https://arxiv.org/abs/1502.01852)
+    `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
+    by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
+    robust initialization method that particularly considers the rectifier
+    nonlinearities. In case of Uniform distribution, the range is [-x, x], where
+
+    .. math::
+
+        x = \sqrt{\\frac{6.0}{fan\_in}}
+
+    In case of Normal distribution, the mean is 0 and the standard deviation
+    is
+
+    .. math::
+
+        \sqrt{\\frac{2.0}{fan\_in}}
+
+    Args:
+        uniform (bool): whether to use uniform or normal distribution
+        fan_in (float): fan_in for MSRAInitializer. If None, it is\
+        inferred from the variable.
+        seed (int): random seed
+
+    Note:
+        It is recommended to set fan_in to None for most cases.
+
+    Examples:
+        .. code-block:: python
+
+            fc = fluid.layers.fc(
+                input=queries, size=10,
+                param_attr=fluid.initializer.MSRA(uniform=False))
     """
 
     def __init__(self, uniform=True, fan_in=None, seed=0):
         """Constructor for MSRAInitializer
-
-        Args:
-            uniform: whether to use uniform or normal distribution
-            fan_in: fan_in for MSRAInitializer. If None, it is
-                    inferred from the variable.
-            seed: random seed
-
-        Note: It is recommended to set fan_in to None for most cases.
         """
         assert uniform is not None
         assert seed is not None
@@ -421,6 +471,104 @@ class MSRAInitializer(Initializer):
         return op
 
 
+class BilinearInitializer(Initializer):
+    """
+    This initializer can be used in transposed convolution operator to
+    act as upsampling. Users can upsample a feature map with shape of
+    (B, C, H, W) by any integer factor. The usage is:
+
+    Examples:
+
+        .. code-block:: python
+
+            factor = 2
+            w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.),
+                               initializer=Bilinear())
+            conv_up = fluid.layers.conv2d_transpose(
+                input,
+                num_filters=C,
+                output_size=None,
+                filter_size=2 * factor - factor % 2,
+                padding=ceil((factor - 1) / 2.),
+                stride=factor,
+                groups=C,
+                param_attr=w_attr,
+                bias_attr=False)
+
+    Where, `num_filters=C` and `groups=C` means this is channel-wise transposed
+    convolution. The filter shape will be (C, 1, K, K) where K is `filer_size`,
+    This initializer will set a (K, K) interpolation kernel for every channel
+    of the filter identically. The resulting shape of the output feature map
+    will be (B, C, factor * H, factor * W). Note that the learning rate and the
+    weight decay are set to 0 in order to keep coefficient values of bilinear
+    interpolation unchanged during training.
+
+    """
+
+    def __init__(self):
+        """Constructor for BilinearInitializer.
+        """
+        super(BilinearInitializer, self).__init__()
+
+    def __call__(self, var, block):
+        """Add biliear initialization ops for a variable
+
+        Args:
+            var (Variable): Variable that needs to be initialized.
+            block (Block): The block in which initialization ops should
+                           be added.
+
+        Returns:
+            Operator: the initialization op
+
+        Raises:
+            ValueError: If type of `var` and `block` is not right.
+                        If the shape of `var` size is not 4 and
+                        var.shape[2] != var.shape[3].
+        """
+        if not isinstance(var, framework.Variable):
+            raise ValueError("var must be framework.Variable.")
+
+        if not isinstance(block, framework.Block):
+            raise ValueError("block must be framework.Block.")
+
+        shape = var.shape
+        if len(shape) != 4:
+            raise ValueError("the length of shape must be 4.")
+        if shape[2] != shape[3]:
+            raise ValueError("shape[2] must be equal to shape[3].")
+
+        weight = np.zeros(np.prod(var.shape), dtype='float32')
+        size = shape[3]
+        # factor
+        f = np.ceil(size / 2.)
+        # center
+        c = (2 * f - 1 - f % 2) / (2. * f)
+        for i in range(np.prod(shape)):
+            x = i % size
+            y = (i / size) % size
+            weight[i] = (1 - abs(x / f - c)) * (1 - abs(y / f - c))
+        weight = np.reshape(weight, shape)
+
+        if var.dtype == VarDesc.VarType.FP32:
+            value_name = "fp32_values"
+            values = [float(v) for v in weight.flat]
+        else:
+            raise ValueError("Unsupported dtype %s", input.dtype)
+        if np.prod(shape) > 1024 * 1024:
+            raise ValueError("The size of input is too big. ")
+        op = block.append_op(
+            type='assign_value',
+            outputs={'Out': [var]},
+            attrs={
+                'dtype': var.dtype,
+                'shape': list(shape),
+                value_name: values
+            })
+        var.op = op
+        return op
+
+
 # We short the class name, since users will use the initializer with the package
 # name. The sample code:
 #
@@ -435,3 +583,4 @@ Uniform = UniformInitializer
 Normal = NormalInitializer
 Xavier = XavierInitializer
 MSRA = MSRAInitializer
+Bilinear = BilinearInitializer
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 1c0f1f6eb415b1c05c1052c1f52743a19c49f017..5c8f4f6507c7dd9b3d005639d962ce1e55b2c704 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -13,39 +13,61 @@
 # limitations under the License.
 
 import os
+import errno
+import time
+import shutil
 
 from paddle.fluid.evaluator import Evaluator
-from paddle.fluid.framework import Program, Parameter, default_main_program, Variable
+from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable
 from . import core
 
 __all__ = [
-    'save_vars',
-    'save_params',
-    'save_persistables',
-    'load_vars',
-    'load_params',
-    'load_persistables',
-    'save_inference_model',
-    'load_inference_model',
-    'get_inference_program',
+    'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
+    'load_persistables', 'save_inference_model', 'load_inference_model',
+    'get_inference_program', 'save_checkpoint', 'load_checkpoint',
+    'clean_checkpoint', 'load_persist_vars_without_grad',
+    'load_lookup_table_vars', 'save_persist_vars_without_grad',
+    'get_latest_checkpoint_serial'
 ]
 
 
 def is_parameter(var):
-    """Check whether the variable is a Parameter.
-
-    This function checks whether the input variable is a Parameter.
+    """
+    Check whether the given variable is an instance of Parameter.
 
     Args:
-        var : The input variable.
+        var(Variable): The variable to be checked.
 
     Returns:
-        boolean result whether the variable is a Parameter.
+        bool: True if the given `var` is an instance of Parameter,
+        False if not.
+
+    Examples:
+        .. code-block:: python
+
+            param = fluid.default_main_program().global_block().var('fc.w')
+            res = fluid.io.is_parameter(param)
     """
     return isinstance(var, Parameter)
 
 
 def is_persistable(var):
+    """
+    Check whether the given variable is persistable.
+
+    Args:
+        var(Variable): The variable to be checked.
+
+    Returns:
+        bool: True if the given `var` is persistable
+        False if not.
+
+    Examples:
+        .. code-block:: python
+
+            param = fluid.default_main_program().global_block().var('fc.w')
+            res = fluid.io.is_persistable(param)
+    """
     if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
             var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
         return False
@@ -70,20 +92,69 @@ def save_vars(executor,
               predicate=None,
               filename=None):
     """
-    Save variables to directory by executor.
+    Save variables to the given directory by executor.
+
+    There are two ways to specify variables to be saved: The first way, list 
+    variables in a list and assign it to the `vars`. The second way, assign the 
+    `main_program` with an existing program, then all variables in the program 
+    will be saved. The first way has a higher priority. In other words, if `vars` 
+    are assigned, the `main_program` and the `predicate` will be ignored.
+
+    The `dirname` are used to specify the folder where to save variables. 
+    If you prefer to save variables in separate files in the folder `dirname`, 
+    set `filename` None; if you prefer to save all variables in a single file, 
+    use `filename` to specify it.
+
+    Args:
+        executor(Executor): The executor to run for saving variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose variables will be saved. 
+                                    If it is None, the default main program will 
+                                    be used automatically.
+                                    Default: None
+        vars(list[Variable]|None): The list that contains all variables to save. 
+                                   It has a higher priority than the `main_program`.
+                                   Default: None
+        predicate(function|None): If it is not None, only variables in the 
+                                  `main_program` that makes predicate(variable)==True 
+                                  will be saved. It only works when we are using the 
+                                  `main_program` to specify variables (In other words 
+                                  `vars` is None).
+                                  Default: None
+        filename(str|None): The file which to save all variables. If you prefer to save 
+                            variables separately, set it to None.
+                            Default: None
 
-    :param executor: executor that save variable
-    :param dirname: directory path
-    :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default default_main_program.
-    :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the corresponding input variable will be saved.
-    :param vars: variables need to be saved. If vars is specified, program & predicate
-    will be ignored
-    :param filename: The name of a single file that all vars are saved to.
-        If it is None, save variables to separate files.
+    Returns:
+        None
+
+    Raises:
+        TypeError: If `main_program` is not an instance of Program nor None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
 
-    :return: None
+            # The first usage: using `main_program` to specify variables
+            def name_has_fc(var):
+                res = "fc" in var.name
+                return res
+
+            prog = fluid.default_main_program()
+            fluid.io.save_vars(executor=exe, dirname=path, main_program=prog,
+                               vars=None)
+            # All variables in `main_program` whose name includes "fc" will be saved.
+            # And variables are going to be saved separately.
+
+
+            # The second usage: using `vars` to specify variables
+            var_list = [var_a, var_b, var_c]
+            fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, 
+                               filename="vars_file")
+            # var_a, var_b and var_c will be saved. And they are going to be
+            # saved in the same file named 'var_file' in the path "./my_paddle_model".
     """
     if vars is None:
         if main_program is None:
@@ -131,7 +202,42 @@ def save_vars(executor,
 
 def save_params(executor, dirname, main_program=None, filename=None):
     """
-    Save all parameters to directory with executor.
+    This function filters out all parameters from the give `main_program`
+    and then save them to the folder `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the saving folder. If you would like to 
+    save parameters in separate files, set `filename` None; if you would 
+    like to save all parameters in a single file, use `filename` to specify 
+    the file name.
+
+    NOTICE: Some variables are not Parameter while they are necessary for 
+    training. So you can NOT save and continue your training just by 
+    `save_params()` and `load_params()`. Please use `save_persistables()` 
+    and `load_persistables()` instead.
+
+    Args:
+        executor(Executor): The executor to run for saving parameters.
+        dirname(str): The saving directory path.
+        main_program(Program|None): The program whose parameters will be
+                                    saved. If it is None, the default
+                                    main program will be used automatically.
+                                    Default: None
+        filename(str|None): The file to save all parameters. If you prefer 
+                            to save parameters in differnet files, set it 
+                            to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.save_params(executor=exe, dirname=param_path, 
+                                 main_program=None)
     """
     save_vars(
         executor,
@@ -144,7 +250,37 @@ def save_params(executor, dirname, main_program=None, filename=None):
 
 def save_persistables(executor, dirname, main_program=None, filename=None):
     """
-    Save all persistables to directory with executor.
+    This function filters out all variables with `persistable==True` from the 
+    give `main_program` and then saves these variables to the folder `dirname` 
+    or file `filename`.
+
+    The `dirname` is used to specify the folder where persistable variables 
+    are going to be saved. If you would like to save variables in separate 
+    files, set `filename` None; if you would like to save all variables in a 
+    single file, use `filename` to specify the file name.
+
+    Args:
+        executor(Executor): The executor to run for saving persistable variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose persistbale variables will 
+                                    be saved. If it is None, the default main 
+                                    program will be used automatically.
+                                    Default: None
+        filename(str|None): The file to saved all variables. If you prefer to 
+                            save variables in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.save_persistables(executor=exe, dirname=param_path, 
+                                       main_program=None)
     """
     save_vars(
         executor,
@@ -162,20 +298,69 @@ def load_vars(executor,
               predicate=None,
               filename=None):
     """
-    Load variables from directory by executor.
+    Load variables from the given directory by executor.
+
+    There are two ways to specify variables to be loaded: The first way, list 
+    variables in a list and assign it to the `vars`. The second way, assign the 
+    `main_program` with an existing program, then all variables in the program 
+    will be loaded. The first way has a higher priority. In other words if `vars` 
+    are assigned, the `main_program` and the `predicate` will be ignored.
+
+    The `dirname` are used to specify the folder where to load variables. 
+    If variables were saved in separate files in the folder `dirname`, 
+    set `filename` None; if all variables were saved in a single file, 
+    use `filename` to specify it.
+
+    Args:
+        executor(Executor): The executor to run for loading variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose variables will be loaded. 
+                                    If it is None, the default main program will 
+                                    be used automatically.
+                                    Default: None
+        vars(list[Variable]|None): The list that contains all variables to load. 
+                                   It has a higher priority than the `main_program`.
+                                   Default: None
+        predicate(function|None): If it is not None, only variables in the 
+                                  `main_program` that makes predicate(variable)==True 
+                                  will be loaded. It only works when we are using the 
+                                  `main_program` to specify variables (In other words 
+                                  `vars` is None).
+                                  Default: None
+        filename(str|None): The file which saved all required variables. If variables 
+                            were saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Raises:
+        TypeError: If `main_program` is not an instance of Program nor None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+
+            # The first usage: using `main_program` to specify variables
+            def name_has_fc(var):
+                res = "fc" in var.name
+                return res
 
-    :param executor: executor that load variable
-    :param dirname: directory path
-    :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default default_main_program().
-    :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the corresponding input variable will be loaded.
-    :param vars: variables need to be loaded. If vars is specified, program &
-    predicate will be ignored
-    :param filename: The name of the single file that all vars are loaded from.
-        If it is None, load variables from separate files.
+            prog = fluid.default_main_program()
+            fluid.io.load_vars(executor=exe, dirname=path, main_program=prog,
+                               vars=None)
+            # All variables in `main_program` whose name includes "fc" will be loaded.
+            # And all the variables are supposed to have been saved in differnet files.
 
-    :return: None
+
+            # The second usage: using `vars` to specify variables
+            var_list = [var_a, var_b, var_c]
+            fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, 
+                               filename="vars_file")
+            # var_a, var_b and var_c will be loaded. And they are supposed to haven 
+            # been saved in the same file named 'var_file' in the path "./my_paddle_model".
     """
     if vars is None:
         if main_program is None:
@@ -195,6 +380,8 @@ def load_vars(executor,
         load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
+            if each_var.type == core.VarDesc.VarType.RAW:
+                continue
             new_var = _clone_var_in_block_(load_block, each_var)
             if filename is None:
                 load_block.append_op(
@@ -221,7 +408,42 @@ def load_vars(executor,
 
 def load_params(executor, dirname, main_program=None, filename=None):
     """
-    load all parameters from directory by executor.
+    This function filters out all parameters from the give `main_program`
+    and then trys to load these parameters from the folder `dirname` or
+    the file `filename`.
+
+    Use the `dirname` to specify the folder where parameters were saved. If 
+    parameters were saved in separate files in the folder `dirname`, set 
+    `filename` None; if all parameters were saved in a single file, use 
+    `filename` to specify the file name.
+
+    NOTICE: Some variables are not Parameter while they are necessary for 
+    training. So you can NOT save and continue your training just by 
+    `save_params()` and `load_params()`. Please use `save_persistables()` 
+    and `load_persistables()` instead. 
+
+    Args:
+        executor(Executor): The executor to run for loading parameters.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose parameters will be
+                                    loaded. If it is None, the default
+                                    main program will be used automatically.
+                                    Default: None
+        filename(str|None): The file which saved all parameters. If parameters 
+                            were saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.load_params(executor=exe, dirname=param_path, 
+                                main_program=None)
     """
     load_vars(
         executor,
@@ -233,7 +455,37 @@ def load_params(executor, dirname, main_program=None, filename=None):
 
 def load_persistables(executor, dirname, main_program=None, filename=None):
     """
-    load all persistables from directory by executor.
+    This function filters out all variables with `persistable==True` from the 
+    give `main_program` and then trys to load these variables from the folder 
+    `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the folder where persistable variables were 
+    saved. If variables were saved in separate files, set `filename` None; 
+    if all variables were saved in a single file, use `filename` to specify 
+    the file name.
+
+    Args:
+        executor(Executor): The executor to run for loading persistable variables.
+        dirname(str): The directory path.
+        main_program(Program|None): The program whose persistbale variables will 
+                                    be loaded. If it is None, the default main 
+                                    program will be used automatically.
+                                    Default: None
+        filename(str|None): The file which saved all variables. If variables were 
+                            saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.load_persistables(executor=exe, dirname=param_path, 
+                                       main_program=None)
     """
     load_vars(
         executor,
@@ -263,6 +515,9 @@ def get_inference_program(target_vars, main_program=None):
 def prepend_feed_ops(inference_program,
                      feed_target_names,
                      feed_holder_name='feed'):
+    if len(feed_target_names) == 0:
+        return
+
     global_block = inference_program.global_block()
     feed_var = global_block.create_var(
         name=feed_holder_name,
@@ -303,29 +558,56 @@ def save_inference_model(dirname,
                          model_filename=None,
                          params_filename=None):
     """
-    Build a model especially for inference,
-    and save it to directory by the executor.
+    Prune the given `main_program` to build a new program especially for inference,
+    and then save it and all related parameters to given `dirname` by the `executor`.
+
+    Args:
+        dirname(str): The directory path to save the inference model.
+        feeded_var_names(list[str]): Names of variables that need to be feeded data 
+                                     during inference.
+        target_vars(list[Variable]): Variables from which we can get inference 
+                                     results.
+        executor(Executor): The executor that saves the inference model.
+        main_program(Program|None): The original program, which will be pruned to 
+                                    build the inference model. If is setted None, 
+                                    the default main program will be used.
+                                    Default: None.
+        model_filename(str|None): The name of file to save the inference program 
+                                  itself. If is setted None, a default filename 
+                                  `__model__` will be used.
+        params_filename(str|None): The name of file to save all related parameters. 
+                                   If it is setted None, parameters will be saved 
+                                   in separate files .
+
+    Returns:
+        None
 
-    :param dirname: directory path
-    :param feeded_var_names: Names of variables that need to be feeded data during inference
-    :param target_vars: Variables from which we can get inference results.
-    :param executor: executor that save inference model
-    :param main_program: original program, which will be pruned to build the inference model.
-            Default default_main_program().
-    :param model_filename: The name of file to save inference program.
-        If not specified, default filename `__model__` will be used.
-    :param params_filename: The name of file to save parameters.
-        It is used for the case that all parameters are saved in a single binary file.
-        If not specified, parameters are considered saved in separate files.
+    Raises:
+        ValueError: If `feed_var_names` is not a list of basestring.
+        ValueError: If `target_vars` is not a list of Variable.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./infer_model"
+            fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'],
+                         target_vars=[predict_var], executor=exe)
+
+            # In this exsample, the function will prune the default main program 
+            # to make it suitable for infering the `predict_var`. The pruned 
+            # inference program is going to be saved in the "./infer_model/__model__" 
+            # and parameters are going to be saved in separate files under folder
+            # "./infer_model". 
 
-    :return: None
     """
     if isinstance(feeded_var_names, basestring):
         feeded_var_names = [feeded_var_names]
     else:
-        if not (bool(feeded_var_names) and all(
-                isinstance(name, basestring) for name in feeded_var_names)):
-            raise ValueError("'feed_var_names' should be a list of str.")
+        if len(feeded_var_names) > 0:
+            if not (bool(feeded_var_names) and all(
+                    isinstance(name, basestring) for name in feeded_var_names)):
+                raise ValueError("'feed_var_names' should be a list of str.")
 
     if isinstance(target_vars, Variable):
         target_vars = [target_vars]
@@ -336,11 +618,20 @@ def save_inference_model(dirname,
 
     if main_program is None:
         main_program = default_main_program()
+    copy_program = main_program.clone()
 
     if not os.path.isdir(dirname):
         os.makedirs(dirname)
 
-    pruned_program = main_program.prune(targets=target_vars)
+    # Clear the is_target information and remove the existed feed and fetch op
+    global_block = copy_program.global_block()
+    for i, op in enumerate(global_block.ops):
+        op.desc.set_is_target(False)
+        if op.type == "feed" or op.type == "fetch":
+            global_block.remove_op(i)
+    copy_program.desc.flush()
+
+    pruned_program = copy_program.prune(targets=target_vars)
     inference_program = pruned_program.inference_optimize()
     fetch_var_names = [v.name for v in target_vars]
 
@@ -362,24 +653,6 @@ def save_inference_model(dirname,
     save_persistables(executor, dirname, inference_program, params_filename)
 
 
-def get_feed_targets_names(program):
-    feed_targets_names = []
-    global_block = program.global_block()
-    for op in global_block.ops:
-        if op.desc.type() == 'feed':
-            feed_targets_names.insert(0, op.desc.output('Out')[0])
-    return feed_targets_names
-
-
-def get_fetch_targets_names(program):
-    fetch_targets_names = []
-    global_block = program.global_block()
-    for op in global_block.ops:
-        if op.desc.type() == 'fetch':
-            fetch_targets_names.append(op.desc.input('X')[0])
-    return fetch_targets_names
-
-
 def load_inference_model(dirname,
                          executor,
                          model_filename=None,
@@ -387,18 +660,49 @@ def load_inference_model(dirname,
     """
     Load inference model from a directory
 
-    :param dirname: directory path
-    :param executor: executor that load inference model
-    :param model_filename: The name of file to load inference program.
-        If not specified, default filename `__model__` will be used.
-    :param params_filename: The name of file to load parameters.
-        It is used for the case that all parameters are saved in a single binary file.
-        If not specified, parameters are considered saved in separate files.
+    Args:
+        dirname(str): The directory path
+        executor(Executor): The executor to run for loading inference model.
+        model_filename(str|None): The name of file to load inference program.
+                                  If it is None, the default filename 
+                                  '__model__' will be used.
+                                  Default: None
+        params_filename(str|None): The name of file to load all parameters.
+                                   It is only used for the case that all 
+                                   parameters were saved in a single binary 
+                                   file. If parameters were saved in separate 
+                                   files, set it as 'None'.
+
+    Returns:
+        tuple: The return of this function is a tuple with three elements:
+        (program, feed_target_names, fetch_targets). The `program` is a 
+        Program, it's the program for inference. The `feed_target_names` is 
+        a list of str, it contains Names of variables that need to feed 
+        data in the inference program. The `fetch_targets` is a list of 
+        Variable. It contains variables from which we can get inference 
+        results.
+
+    Raises:
+        ValueError: If `dirname` is not a existing directory.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./infer_model"
+            [inference_program, feed_target_names, fetch_targets] = 
+                fluid.io.load_inference_model(dirname=path, executor=exe)
+            results = exe.run(inference_program,
+                          feed={feed_target_names[0]: tensor_img},
+                          fetch_list=fetch_targets)
+
+            # In this exsample, the inference program was saved in the 
+            # "./infer_model/__model__" and parameters were saved in 
+            # separate files in ""./infer_model". 
+            # After getting inference program, feed target names and 
+            # fetch targets, we can use an Executor to run the inference 
+            # program to get the inference result.
 
-    :return: [program, feed_target_names, fetch_targets]
-             program: program especially for inference.
-             feed_target_names: Names of variables that need to feed data
-             fetch_targets: Variables from which we can get inference results.
     """
     if not os.path.isdir(dirname):
         raise ValueError("There is no directory named '%s'", dirname)
@@ -418,8 +722,8 @@ def load_inference_model(dirname,
     program = Program.parse_from_string(program_desc_str)
     load_persistables(executor, dirname, program, params_filename)
 
-    feed_target_names = get_feed_targets_names(program)
-    fetch_target_names = get_fetch_targets_names(program)
+    feed_target_names = program.desc.get_feed_target_names()
+    fetch_target_names = program.desc.get_fetch_target_names()
     fetch_targets = [
         program.global_block().var(name) for name in fetch_target_names
     ]
@@ -429,12 +733,25 @@ def load_inference_model(dirname,
 
 def get_parameter_value(para, executor):
     """
-    Get the LoDTensor for the parameter
+    Get the LoDTensor value of the given parameter.
+
+    Args:
+        para(Parameter): The parameter to get value from.
+        executor(Executor): The executor to run for retrieving the value.
+
+    Returns:
+        numpy.array: The given parameter's values.
+
+    Raises:
+        AssertionError: If the `para` is not an instance of Parameter.
 
-    :param executor: executor for retrieving the value
-    :param para: the given parameter
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param = fluid.default_main_program().global_block().var('fc.w')
+            p = fluid.io.get_parameter_value(param, exe)
 
-    :return: the LoDTensor for the parameter
     """
     assert is_parameter(para)
 
@@ -446,16 +763,712 @@ def get_parameter_value(para, executor):
 
 def get_parameter_value_by_name(name, executor, program=None):
     """
-    Get the LoDTensor for paramter with the given name
+    Get the LoDTensor value of a certain parameter by its name.
+
+    Args:
+        name(str): The parameter's name.
+        executor(Executor): The executor to run for retrieving the value.
+        program(Program | None): The program where to find the parameter.
+                               If it's set to be None, the function will
+                               try to find the parameter in the default
+                               main program.
 
-    :param executor: executor for retrieving the value
-    :param name: the name of the parameter
-    :param program: the program where the variable is found
-            Default default_main_program().
+    Returns:
+        numpy.array: The parameter's values.
 
-    :return: the LoDTensor for the variable
+    Raises:
+        TypeError: If given `name` is not an instance of basestring.
+        TypeError: If the parameter with the given name doesn't exist.
+        AssertionError: If there is a varibale named `name` in the
+                        given program but it is not a Parameter.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            p = fluid.io.get_parameter_value('fc.w', exe)
     """
     if program is None:
         program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
+
+
+SUCCESS_MARK_FILENAME = "_SUCCESS"
+CHECKPOINT_PREFIX = "checkpoint"
+MODEL_DIR = "__model__"
+LOOKUP_TABLE_DIR = "__lookup_table__"
+TRAINER_PREFIX = "trainer"
+CHECKPOINT_SEPARATOR = "_"
+
+
+def save_checkpoint(executor,
+                    checkpoint_dir,
+                    trainer_id,
+                    trainer_args=None,
+                    main_program=None,
+                    max_num_checkpoints=3,
+                    lookup_table=None,
+                    ps_endpoint_list=None):
+    """
+    This function filters out all checkpoint variables from the give
+    main_program and then saves these variables to the `checkpoint_dir` 
+    directory.
+
+    In the training precess, we generally save a checkpoint in each
+    iteration. So there might be a lot of checkpoints in the 
+    `checkpoint_dir`. To avoid them taking too much disk space, the 
+    `max_num_checkpoints` are introduced to limit the total number of 
+    checkpoints. If the number of existing checkpints is greater than 
+    the `max_num_checkpoints`, oldest ones will be scroll deleted.
+
+    A variable is a checkpoint variable and will be saved if it meets
+    all following conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for save checkpoint.
+        checkpoint_dir(str): The folder where to save checkpoints.
+        trainer_id(int): currect trainer id, if id is equal to 0, the trainer 
+            is chief.
+        trainer_args(dict|None): Current training arguments. Such as 'epoch_id' 
+            and 'step_id'.
+            Defaut: None
+        main_program(Program|None): The program whose checkpoint variables will
+            be saved. If it is None, the default main program will be used.
+        max_num_checkpoints(int): The max number of total number of existing 
+            checkpoints.
+            Default: 3
+        lookup_table(string|None): the lookup table name, when use distribute
+            lookup table, we can get lookup table name by DistributeTranspiler.
+            table_name 
+        ps_endpoint_list(list|None): the parameter server ip:port list.  
+            when use distribute lookup table, we can get ps_endpoint_list by 
+            distribute arguments.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        AssertionError: If `trainer_args` is not a dict.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            trainer_args = {"epoch_id": 200,
+                            "step_id": 20} # just an example
+            table_name = "share_w"
+            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
+
+            fluid.io.save_checkpoint(executor=exe,
+                                     checkpoint_dir=path,
+                                     trainer_id=0,
+                                     trainer_args=trainer_args,
+                                     main_program=prog,
+                                     max_num_checkpoints=3,
+                                     lookup_table=table_name,
+                                     ps_endpoint_list = ps_endpoints)
+    """
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+    assert checkpoint_dir
+
+    if trainer_args:
+        assert isinstance(trainer_args, dict)
+
+    is_chief = trainer_id == 0
+
+    _make_chekcpoint_dirs(checkpoint_dir)
+    serial = get_latest_checkpoint_serial(checkpoint_dir) + 1
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+
+    save_trainer_args(cur_dir, trainer_id, trainer_args)
+
+    if is_chief:
+        save_persist_vars_without_grad(executor, cur_dir, main_program)
+
+    if is_chief and lookup_table and ps_endpoint_list:
+        save_pserver_vars_by_notify(executor, cur_dir, lookup_table,
+                                    ps_endpoint_list)
+
+    _scroll_delete(checkpoint_dir, max_num_checkpoints)
+
+
+def load_checkpoint(executor, checkpoint_dir, serial, main_program):
+    """
+    This function filters out all checkpoint variables from the give
+    main_program and then try to load these variables from the
+    `checkpoint_dir` directory.
+
+    In the training precess, we generally save a checkpoint in each
+    iteration. So there are more than one checkpoint in the 
+    `checkpoint_dir` (each checkpoint has its own sub folder), use 
+    `serial` to specify which serial of checkpoint you would like to
+    load.
+
+    A variable is a checkpoint variable and will be loaded if it meets
+    all following conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for loading checkpoint.
+        checkpoint_dir(str): The folder where all checkpoints are.
+        serial(int): The serial of checkpoint you would like to load.
+        main_program(Program): The program whose checkpoint variables will
+                               be loaded.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        ValueError: If `serial` is None or `serial` is less than 0.
+        ValueError: If `main_program` is None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            fluid.io.load_checkpoint(executor=exe, checkpoint_dir=path,
+                    serial=9, main_program=prog)
+
+            # In this example, `load_checkpoint` function
+            # will first filters out all checkpoint variables in the default
+            # main program, and then try to load these variables form the
+            # folder "./checkpoints/checkpoint_9/__model__".
+    """
+
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+
+    if serial is None or serial < 0:
+        raise ValueError("'serial' should not be None or <0 ")
+
+    if main_program is None:
+        raise ValueError('main_program should not be None.')
+
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+    load_persist_vars_without_grad(executor, cur_dir, main_program, True)
+
+
+def clean_checkpoint(checkpoint_dir, delete_dir=False):
+    """
+    clean the checkpoint dir, when the train exits normally, 
+    the trainer will call clean_checkpoint to delete checkpoint directory saved before.
+    delete_dir only works when the directory is empty, otherwise, OSError is raised.  
+
+    : param checkpoint_dir
+    : param delete_dir
+    """
+
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+    _scroll_delete(checkpoint_dir, max_num_checkpoints=0)
+
+    if delete_dir and not os.listdir(checkpoint_dir):
+        os.rmdir(checkpoint_dir)
+
+
+def load_persist_vars_without_grad(executor,
+                                   dirname,
+                                   program,
+                                   has_model_dir=False):
+    """
+    This function filters out all checkpoint variables from the give
+    program and then trys to load these variables from the given directory.
+
+    A variable is a checkpoint variable if it meets all following
+    conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for loading variables.
+        dirname(str): The directory path.
+        program(Program): The program whose checkpoint variables will
+                          be loaded.
+        has_model_dir(bool): if True, the function loads variables
+                             from a sub directory named '__model__'.
+                             Default: False
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.load_persist_vars_without_grad(executor=exe,
+                    dirname=param_path, program=prog, has_model_dir=True)
+
+            # In this example, `load_persist_vars_without_grad` function
+            # will first filters out all checkpoint variables in the default
+            # main program, and then trys to load these variables form the
+            # folder "./my_paddle_model/__model__".
+    """
+
+    if has_model_dir:
+        dirname = _get_model_dir(dirname)
+
+    load_vars(
+        executor,
+        dirname=dirname,
+        main_program=program,
+        predicate=_is_checkpoint_var,
+        filename=None)
+
+
+def load_lookup_table_vars(executor, dirname, program, pserver_id, table_name):
+    """
+    The parameter server will load lookup table's local file in 
+    selectedrows variable.
+
+    Args:
+        executor(Executor): The executor to run for loading persistable variables
+        dirname(str): The directory path
+        main_program(Program): Find the variable named table_name in main_program
+        pserver_id(int): the serial number in pserver_endpoints list
+        table_name(str): lookup table name
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            dirname = "./checkpoints/checkpoint_9/__model__"
+            prog = fluid.default_main_program()
+            pserver_id = 1
+            table_name = "share_w"
+            fluid.io.load_lookup_table_vars(executor=exe,
+                    dirname=dirname, program=prog, pserver_id=pserver_id,
+                    table_name=table_name)
+    """
+
+    for var in program.list_vars():
+        if var.name == table_name:
+            lookup_table_var = var
+            break
+
+    assert lookup_table_var is not None
+
+    lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
+    table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)
+
+    load_prog = Program()
+    load_block = load_prog.global_block()
+
+    load_block.append_op(
+        type='load',
+        inputs={},
+        outputs={'Out': [lookup_table_var]},
+        attrs={'file_path': os.path.join(lookup_table_dir, table_file)})
+
+    executor.run(load_prog)
+
+
+def save_persist_vars_without_grad(executor, dirname, program):
+    """
+    This function filters out all checkpoint variables from the give
+    program and then save these variables to a sub-folder '__model__' of 
+    the given directory.
+
+    A variable is a checkpoint variable if it meets all following
+    conditions:
+        1. It's persistable.
+        2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW.
+        3. It's name contains no "@GRAD" nor ".trainer_" nor ".block".
+
+    Args:
+        executor(Executor): The executor to run for saving variables.
+        dirname(str): The directory path.
+        program(Program): The program whose checkpoint variables will
+                          be saved.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            fluid.io.save_persist_vars_without_grad(executor=exe,
+                    dirname=param_path, program=prog)
+
+            # In this example, `save_persist_vars_without_grad` function
+            # will first filters out all checkpoint variables in the default
+            # main program, and then saves these variables to the folder 
+            # "./my_paddle_model/__model__".
+    """
+    cur_dir = _get_model_dir(dirname)
+    save_vars(
+        executor,
+        dirname=cur_dir,
+        main_program=program,
+        vars=None,
+        predicate=_is_checkpoint_var,
+        filename=None)
+    _write_success(cur_dir)
+
+
+def save_pserver_vars_by_notify(executor, dirname, lookup_table,
+                                ps_endpoint_list):
+    """
+    This function will send checkpoint notify message from Trainer 0
+    to all the pservers.
+    The checkpoint notify message contains lookup table name, 
+    the absolute path on pserver to save lookup_table.
+
+    Args:
+        executor(Executor): The executor to run for send checkpoint notify.
+        dirname(str): The folder where to save checkpoints.
+        lookup_table(string): the lookup table name, when use distribute
+            lookup table, we can get lookup table name by DistributeTranspiler.
+            table_name 
+        ps_endpoint_list(list): the parameter server ip:port list.  
+            when use distribute lookup table, we can get ps_endpoint_list by 
+            distribute arguments.
+    Return:
+        None
+    
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            table_name = "share_w"
+            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
+
+            fluid.io.save_pserver_vars_by_notify(executor=exe,
+                    dirname=param_path, lookup_table=table_name, 
+                    ps_endpoint_list=ps_endpoints)
+    """
+    cur_dir = _get_lookuptable_dir(dirname)
+
+    checkpoint_notify_program = Program()
+    checkpoint_notify_block = checkpoint_notify_program.global_block()
+
+    attrs = {}
+    attrs['epmap'] = ps_endpoint_list
+    attrs['dir'] = cur_dir
+    attrs['lookup_table'] = lookup_table
+
+    checkpoint_notify_block.append_op(
+        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+    executor.run(checkpoint_notify_program)
+
+
+def save_trainer_args(dirname, trainer_id, trainer_args):
+    assert isinstance(trainer_args, dict)
+
+    cur_dir = _get_trainer_dir(dirname, trainer_id)
+
+    for name, value in trainer_args.iteritems():
+        args_file = os.path.join(cur_dir, name)
+        with open(args_file, 'w') as f:
+            f.write(str(value))
+    _write_success(cur_dir)
+
+
+def load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
+    """
+    trainer will load some args from it's independent directory, 
+    such as epoch_id and step_id.
+
+    Args:
+        checkpoint_dir(str): The folder where all checkpoints are.
+        serial(int): The serial of checkpoint you would like to load.
+        trainer_id(int): current trainer id.
+        trainer_args(list): list about load trainer args
+    Return:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            param_path = "./checkpoint/"
+            serial = 7
+            trainer_id = 2
+            trainer_args = ["epoch_id", "step_id"]
+
+            fluid.io.load_trainer_args(checkpoint_dir=param_path, serial=serial,
+            trainer_id=trainer_id, trainer_args=trainer_args)
+    """
+    assert isinstance(trainer_args, list)
+
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+    cur_dir = _get_trainer_dir(cur_dir, trainer_id)
+
+    ret_values = []
+
+    for arg in trainer_args:
+        cur_file = os.path.join(cur_dir, arg)
+        with open(cur_file, 'r') as f:
+            contents = f.read()
+            ret_values.append(contents.strip())
+    return ret_values
+
+
+def _is_checkpoint_var(var):
+    """
+    the checkpoint will not save or load all the variables.
+    var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
+
+    : param var(Variable)
+    """
+    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+            var.desc.type() == core.VarDesc.VarType.RAW:
+        return False
+    # @GRAD are named for gradient variables, checkpoint will not save it.
+    if "@GRAD" in var.name:
+        return False
+    # .trainer_ are named for distribute train variables, checkpoint will not save it.
+    if ".trainer_" in var.name:
+        return False
+
+    # .block is named for distribute train variables, checkpoint will not save it.
+    if ".block" in var.name:
+        return False
+
+    return var.persistable
+
+
+def _make_chekcpoint_dirs(dirs):
+    """
+    _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it.
+    """
+    assert dirs is not None
+
+    if os.path.isfile(dirs):
+        raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs)
+
+    if not os.path.isdir(dirs):
+        try:
+            os.makedirs(dirs)
+        except OSError as err:
+            if err.errno != errno.EEXIST:
+                raise err
+
+
+def _get_dir_serial(dirname):
+    _, serial = dirname.split(CHECKPOINT_SEPARATOR)
+
+    try:
+        serial_num = int(serial)
+    except ValueError:
+        serial_num = -1
+    return serial_num
+
+
+def _get_serial_dir(dirname, serial):
+    serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial)
+    serial_dir = os.path.join(dirname, serial_folder)
+    _make_chekcpoint_dirs(serial_dir)
+
+    return serial_dir
+
+
+def _get_model_dir(dirname):
+    model_dir = os.path.join(dirname, MODEL_DIR)
+    _make_chekcpoint_dirs(model_dir)
+    return model_dir
+
+
+def _get_lookuptable_dir(dirname):
+    lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
+    _make_chekcpoint_dirs(lookuptable_dir)
+    return lookuptable_dir
+
+
+def _get_trainer_dir(dirname, trainer_id):
+    trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id)
+    trainer_dir = os.path.join(dirname, trainer_folder)
+    _make_chekcpoint_dirs(trainer_dir)
+    return trainer_dir
+
+
+def _scroll_delete(dirname, max_num_checkpoints=3):
+    dirs = os.listdir(dirname)
+    serial_map = {}
+    for serial in dirs:
+        serial_num = _get_dir_serial(serial)
+        serial_map[serial_num] = serial
+
+    if len(serial_map.keys()) <= max_num_checkpoints:
+        return
+
+    serials = serial_map.keys()
+    serials.sort(reverse=True)
+    serials = serials[max_num_checkpoints:]
+    for serial in serials:
+        cur_dir = _get_serial_dir(dirname, serial)
+        try:
+            shutil.rmtree(cur_dir)
+        except OSError as err:
+            if err.errno != errno.ENOENT:
+                raise err
+
+
+def _write_success(dirname):
+    """
+    write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct.
+
+    : param dirname
+    """
+    success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME)
+    with open(success_file, 'a') as f:
+        now = time.ctime()
+        f.write(now)
+
+
+def get_latest_checkpoint_serial(checkpoint_dir):
+    """
+    get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory
+
+    : param checkpoint_dir
+    """
+    if not checkpoint_dir:
+        return -1
+
+    def has_success(checkpoint_dir, cur_dir):
+        """
+        is _SUCCESS in this dir
+        """
+
+        serial = _get_dir_serial(cur_dir)
+        if serial == -1 or not os.path.isdir(
+                os.path.join(checkpoint_dir, cur_dir)):
+            return -1
+
+        success_path = os.path.join(
+            _get_serial_dir(checkpoint_dir, serial), MODEL_DIR,
+            SUCCESS_MARK_FILENAME)
+        if os.path.isfile(success_path):
+            return serial
+
+    if not os.path.isdir(checkpoint_dir):
+        return -1
+
+    current_dir = -1
+    dirs = os.listdir(checkpoint_dir)
+    for cur_dir in dirs:
+        success_num = has_success(checkpoint_dir, cur_dir)
+        if success_num > current_dir:
+            current_dir = success_num
+    return current_dir
+
+
+def get_test_program(filelist, program=None, startup_program=None):
+    """
+    Transpile current train program to a program to read test dataset
+    if the program is using reader ops like "open_files_op".
+    """
+
+    def _copy_reader_var_(block, var, new_name=None):
+        if new_name == None:
+            new_name = var.name
+        new_var = block.create_var(
+            name=str(new_name), type=core.VarDesc.VarType.READER)
+        new_var.desc.set_shapes(var.desc.shapes())
+        new_var.desc.set_dtypes(var.desc.dtypes())
+        new_var.persistable = True
+        return new_var
+
+    def _get_test_reader_name(train_reader_name):
+        return train_reader_name + "_test"
+
+    def _is_reader_op(op):
+        block = op.block
+        if "Out" in op.output_names:
+            reader_out = block.vars[op.output("Out")[0]]
+            if reader_out.type == core.VarDesc.VarType.READER:
+                return True
+        return False
+
+    if program == None:
+        program = default_main_program()
+    if startup_program == None:
+        startup_program = default_startup_program()
+    startup_block = startup_program.global_block()
+
+    # 1. find out the orignal reader var name
+    startup_reader_op_list = []
+
+    for op in startup_block.ops:
+        if _is_reader_op(op):
+            startup_reader_op_list.append(op)
+
+    if len(startup_reader_op_list) == 0:
+        return program
+
+    root_reader_op = startup_reader_op_list[0]
+    train_test_reader_map = {}
+    # 2. add operators to startup to read open and read test data files
+    for op in startup_reader_op_list:
+        assert (len(op.output("Out")) == 1)
+        train_reader_name = op.output("Out")[0]
+        train_reader = startup_block.vars[train_reader_name]
+        test_reader = _copy_reader_var_(
+            startup_block,
+            train_reader,
+            new_name=_get_test_reader_name(train_reader_name))
+        train_test_reader_map[train_reader.name] = test_reader
+
+        test_op_inputs = {}
+        for name in op.input_names:
+            train_arg_names = op.input(name)
+            test_arg_vars = []
+            for arg_name in train_arg_names:
+                arg_var = train_test_reader_map[
+                    arg_name] if name == "UnderlyingReader" else startup_block.vars[
+                        arg_name]
+                test_arg_vars.append(arg_var)
+            test_op_inputs[name] = test_arg_vars
+
+        test_op = startup_block.append_op(
+            type=op.type,
+            inputs=test_op_inputs,
+            outputs={'Out': [test_reader]},
+            attrs=op.attrs)
+        # root reader op's filelist attr for read test files
+        if op.type == root_reader_op.type:
+            test_op.set_attr("file_names", filelist)
+        if op.type == "create_multi_pass_reader":
+            test_op.set_attr("pass_num", 1)
+
+    # 3. rename reader vars in inference program to different name
+    #    to avoid read from train data.
+    main_block = program.global_block()
+    for var in main_block.vars.values():
+        if var.type == core.VarDesc.VarType.READER:
+            main_block.rename_var(
+                str(var.name), str(_get_test_reader_name(var.name)))
+
+    for op in main_block.ops:
+        if op.type == root_reader_op.type:
+            test_op.set_attr("file_names", filelist)
+        if op.type == "create_multi_pass_reader":
+            test_op.set_attr("pass_num", 1)
+
+    startup_program.sync_with_cpp()
+    program.sync_with_cpp()
+
+    return program
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index da7e74c901e1f5be709c5f9d73f048bfda0c5549..86efd1ff51cf29485ee28b4d60ffb1439af1aad9 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -19,6 +19,7 @@ from framework import Variable, Parameter, default_main_program, default_startup
 import unique_name
 from paddle.fluid.initializer import Constant, Xavier
 from param_attr import ParamAttr, WeightNormParamAttr
+import core
 
 
 class LayerHelper(object):
@@ -398,8 +399,16 @@ class LayerHelper(object):
             return input_var
         if isinstance(act, basestring):
             act = {'type': act}
-        tmp = self.create_tmp_variable(dtype=input_var.dtype)
+
+        if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'):
+            act['use_cudnn'] = self.kwargs.get('use_cudnn')
+        if 'use_mkldnn' in self.kwargs:
+            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
         act_type = act.pop('type')
+        tmp = input_var
+        # NOTE(dzhwinter): some activation support inplace compution.
+        if not core.IsInplace(act_type):
+            tmp = self.create_tmp_variable(dtype=input_var.dtype)
         self.append_op(
             type=act_type,
             inputs={"X": [input_var]},
diff --git a/python/paddle/fluid/layers/__init__.py b/python/paddle/fluid/layers/__init__.py
index a568f61dcb2da976baa7847ae26281a34d6f88dd..cd1492da24d5e9d09a9eaac0b1b9c7aaffac6250 100644
--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -28,8 +28,8 @@ import math_op_patch
 from math_op_patch import *
 import detection
 from detection import *
-import metric
-from metric import *
+import metric_op
+from metric_op import *
 from learning_rate_scheduler import *
 
 __all__ = []
@@ -41,5 +41,5 @@ __all__ += control_flow.__all__
 __all__ += ops.__all__
 __all__ += device.__all__
 __all__ += detection.__all__
-__all__ += metric.__all__
+__all__ += metric_op.__all__
 __all__ += learning_rate_scheduler.__all__
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 1bb1aa30ee1019c6f80eb64b6dc20459e7a3073b..849474dc58461ac3772f439da7bf5d57592daa8c 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -13,25 +13,25 @@
 # limitations under the License.
 import contextlib
 
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from tensor import assign, fill_constant
 from .. import core
 from ..framework import Program, Variable, Operator
 from ..layer_helper import LayerHelper, unique_name
+from ..initializer import force_init_on_cpu
 from ops import logical_and, logical_not, logical_or
+import numpy
 
 __all__ = [
     'split_lod_tensor',
     'merge_lod_tensor',
     'BlockGuard',
     'BlockGuardWithCompletion',
-    'StaticRNNMemoryLink',
     'WhileGuard',
     'While',
     'Switch',
     'lod_rank_table',
     'max_sequence_len',
-    'topk',
     'lod_tensor_to_array',
     'array_to_lod_tensor',
     'increment',
@@ -49,39 +49,42 @@ __all__ = [
     'reorder_lod_tensor_by_rank',
     'ParallelDo',
     'Print',
+    'is_empty',
 ]
 
 
 def split_lod_tensor(input, mask, level=0):
     """
-    **split_lod_tensor**
-
     This function takes in an input that contains the complete lod information,
     and takes in a mask which is used to mask certain parts of the input.
     The output is the true branch and the false branch with the mask applied to
-    the input at a certain level in the tensor.
+    the input at a certain level in the tensor. Mainly used in IfElse to split
+    data into two parts.
 
     Args:
         input(tuple|list|None): The input tensor that contains complete
                                 lod information needed to construct the output.
         mask(list): A bool column vector which masks the input.
-        level(int): The specific lod level to rank.
+        level(int): The specific lod level to split.
 
     Returns:
-        Variable: The true branch of tensor as per the mask applied to input.
-        Variable: The false branch of tensor as per the mask applied to input.
+        tuple(Variable, Variable):
+        The true branch of tensor as per the mask applied to input.
+
+        The false branch of tensor as per the mask applied to input.
 
     Examples:
         .. code-block:: python
 
-          x = layers.data(name='x', shape=[1])
+          x = fluid.layers.data(name='x', shape=[1])
           x.persistable = True
 
-          y = layers.data(name='y', shape=[1])
+          y = fluid.layers.data(name='y', shape=[1])
           y.persistable = True
 
-          out_true, out_false = layers.split_lod_tensor(
+          out_true, out_false = fluid.layers.split_lod_tensor(
                 input=x, mask=y, level=level)
+
     """
     helper = LayerHelper('split_lod_tensor', **locals())
     out_true = helper.create_tmp_variable(dtype=input.dtype)
@@ -104,8 +107,9 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
 
     This function takes in an input :math:`x`, the True branch, the False
     branch and a binary :math:`mask`. Using this information, this function
-    merges the True and False branches of the tensor into a single Output
-    at a certain lod level indiacted by :math:`level`.
+    merges the True and False branches of the tensor into a single tensor as
+    output at a certain lod level indicated by :math:`level`. Used in IfElse
+    to merge the output if True block and False Block.
 
     Args:
         in_true(tuple|list|None): The True branch to be merged.
@@ -113,7 +117,7 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
         x(tuple|list|None): The input tensor that contains complete
                             lod information needed to construct the output.
         mask(list): A bool column vector which masks the input.
-        level(int): The specific lod level to rank.
+        level(int): The specific lod level to merge.
 
     Returns:
         Variable: The merged output tensor.
@@ -181,12 +185,14 @@ def Print(input,
     Returns:
         Variable: Output tensor, same data with input tensor.
 
+
     Examples:
+
         .. code-block:: python
 
-        value = some_layer(...)
-        Print(value, summarize=10,
-              message="The content of some_layer: ")
+           value = some_layer(...)
+           Print(value, summarize=10,
+               message="The content of some_layer: ")
     '''
     helper = LayerHelper('print', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -232,9 +238,56 @@ class BlockGuard(object):
 
 class ParallelDo(object):
     """
-    ParallelDo class.
+    ParallelDo is used to represent multi-thread data parallel processing.
+
+    Its vanilla implementation can be shown as the following (:math:`|` means
+    single thread and :math:`||||` means multiple threads)
+
+    .. code-block:: text
+
+      In the forward pass
+        |      Split input onto different devices
+        |      Copy parameter onto different devices
+        ||||   Compute forward pass in parallel
+        |      Merge output from different devices
+
+      In the backward pass
+        |      Split output@grad onto different devices
+        ||||   Compute backward pass in parallel
+        |      accumulate param@grad from different devices to the first device
+        |      Merge input@grad from different devices
+        |      Copy param@grad to the place of parallel_do_op
+
+    Examples:
 
-    ParallelDo class is used to create a ParallelDo.
+    .. code-block:: python
+
+      images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+      label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+      # ParallelDo version & Single-thread version
+      if thread_num > 1:
+          places = fluid.layers.get_places(thread_num)
+          pd = fluid.layers.ParallelDo(places)
+          with pd.do():
+              images = pd.read_input(images)
+              label = pd.read_input(label)
+              predict = cnn_model(images)
+              cost = fluid.layers.cross_entropy(input=predict, label=label)
+
+              avg_cost = fluid.layers.mean(x=cost)
+              pd.write_output(avg_cost)
+
+          avg_cost = pd()
+          avg_cost = fluid.layers.mean(avg_cost)
+      else:
+          predict = cnn_model(images)
+          cost = fluid.layers.cross_entropy(input=predict, label=label)
+          avg_cost = fluid.layers.mean(x=cost)
+
+    .. warning::
+    
+       It will be soon deprecated, please use ParallelExecutor instead.
     """
 
     def __init__(self, places, use_nccl=False, name=None):
@@ -361,16 +414,17 @@ class StaticRNNMemoryLink(object):
     """
     StaticRNNMemoryLink class.
 
-    Args:
-        init: the initial variable for Memory
-        init: Variable
-        pre_mem: the memory variable in previous time step
-        pre_mem: Variable
-        mem: the memory variable in current time step
-        mem: Variable
-
     StaticRNNMemoryLink class is used to create a link between two
     memory cells of a StaticRNN.
+
+
+    NOTE: This is a internal data structure of a very low-level API.
+    Please use StaticRNN instead.
+
+    Args:
+        init(Variable): the initial variable for Memory.
+        pre_mem(Variable): the memory variable in previous time step.
+        mem(Variable): the memory variable in current time step.
     """
 
     def __init__(self, init, pre_mem, mem=None):
@@ -605,6 +659,29 @@ class WhileGuard(BlockGuard):
 
 
 class While(object):
+    """
+    while loop control flow.
+
+    Args:
+        cond (Variable): condition used to compare.
+        name (str): The name of this layer.
+
+    Examples:
+          .. code-block:: python
+
+            d0 = layers.data("d0", shape=[10], dtype='float32')
+            data_array = layers.array_write(x=d0, i=i)
+            array_len = layers.fill_constant(shape=[1],dtype='int64', value=3)
+
+            cond = layers.less_than(x=i, y=array_len)
+            while_op = layers.While(cond=cond)
+            with while_op.block():
+                d = layers.array_read(array=data_array, i=i)
+                i = layers.increment(x=i, in_place=True)
+                layers.array_write(result, i=i, array=d)
+                layers.less_than(x=i, y=array_len, cond=cond)
+    """
+
     BEFORE_WHILE_BLOCK = 0
     IN_WHILE_BLOCK = 1
     AFTER_WHILE_BLOCK = 2
@@ -674,8 +751,8 @@ def lod_rank_table(x, level=0):
         .. code-block:: text
 
             x is a LoDTensor:
-                x.lod = [[0,                2, 3],
-                         [0,             5, 6, 7]]
+                x.lod = [[2,                1],
+                         [5,             1, 1]]
                 x.data = [a, b, c, d, e, f, g]
 
             1. set level to 0:
@@ -705,7 +782,7 @@ def lod_rank_table(x, level=0):
         .. code-block:: python
 
             x = fluid.layers.data(name='x', shape=[10],
-                            dtype='float32', lod_level=1)
+                                  dtype='float32', lod_level=1)
             out = layers.lod_rank_table(x=x, level=0)
     """
     helper = LayerHelper("lod_rank_table", **locals())
@@ -720,26 +797,22 @@ def lod_rank_table(x, level=0):
     return table
 
 
+@templatedoc()
 def max_sequence_len(rank_table):
-    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
-    returns the max length of a batch of sequences. In fact, a LoDRankTable
-    object contains a list of tuples(<sequence index, sequence length>) and
-    the list is already sorted by sequence length in descending order, so the
-    operator just returns the sequence length of the first tuple element.
+    """
+    ${comment}
+
+    >>> import paddle.fluid as fluid
+    >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32',
+    >>>                       lod_level=1)
+    >>> rank_table = layers.lod_rank_table(x=x, level=0)
+    >>> max_seq_len = layers.max_sequence_len(rank_table)
 
     Args:
-        rank_table (Variable): Input variable which is a LoDRankTable object.
+        rank_table(${rank_table_type}): ${rank_table_comment}.
 
     Returns:
-        Variable: The max length of sequence.
-
-    Examples:
-        .. code-block:: python
-
-            x = fluid.layers.data(name='x', shape=[10],
-                            dtype='float32', lod_level=1)
-            rank_table = layers.lod_rank_table(x=x, level=0)
-            max_seq_len = layers.max_sequence_len(rank_table)
+        ${out_comment}.
     """
     helper = LayerHelper("max_seqence_len", **locals())
     res = helper.create_tmp_variable(dtype="int64")
@@ -750,55 +823,26 @@ def max_sequence_len(rank_table):
     return res
 
 
-def topk(input, k):
-    """
-    **topk**
-
-    This function performs the operation that selects the k entries in the input
-    vector and outputs their values and indices as vectors. Thus topk_out[j] is
-    the j-th largest entry in input, and its index is topk_indices[j]
-
-    Args:
-        input (Variable|list): The input tensor that has all the data.
-        k (int): The number of top elements that the function will pick.
-
-    Returns:
-        Variable: The variable of type array that contains the k largest entries
-                  from input.
-        Variable: The variable of type array that contains the indices of k
-                  largest entries from input.
-
-    Examples:
-        .. code-block:: python
-
-          x = fluid.layers.data(name='x', shape=[10])
-          k = 5
-          array = fluid.layers.topk(x, k)
-    """
-    helper = LayerHelper('topk', **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype='int64')
-    helper.append_op(
-        type='top_k',
-        inputs={'X': [input]},
-        outputs={'Out': [topk_out],
-                 'Indices': [topk_indices]},
-        attrs={'k': k})
-    return topk_out, topk_indices
-
-
 def lod_tensor_to_array(x, table):
-    """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
+    """ 
+    Convert a LoDTensor to a LoDTensorArray.
+
+    This function split a LoDTesnor to a LoDTensorArray according to its LoD 
+    information. LoDTensorArray is an alias of C++ std::vector<LoDTensor> in 
+    PaddlePaddle. The generated LoDTensorArray of this function can be further read 
+    or written by `read_from_array()` and `write_to_array()` operators. However, 
+    this function is generally an internal component of PaddlePaddle `DynamicRNN`. 
+    Users should not use it directly.
 
     Args:
-        x (Variable|list): The LOD tensor to be converted to a LOD tensor array.
+        x (Variable|list): The LoDTensor to be converted to a LoDTensorArray.
         table (ParamAttr|list): The variable that stores the level of lod
                                 which is ordered by sequence length in
-                                descending order.
+                                descending order. It is generally generated 
+                                by `layers.lod_rank_table()` API.
 
     Returns:
-        Variable: The variable of type array that has been converted from a
-                  tensor.
+        Variable: The LoDTensorArray that has been converted from the input tensor.
 
     Examples:
         .. code-block:: python
@@ -863,8 +907,7 @@ def increment(x, value=1.0, in_place=True):
         in_place (bool): If the increment should be performed in-place.
 
     Returns:
-        Variable: The tensor variable storing the transformation of
-                  element-wise increment of each value in the input.
+        Variable: The elementwise-incremented object.
 
     Examples:
         .. code-block:: python
@@ -906,7 +949,7 @@ def array_write(x, i, array=None):
         Variable: The output LOD_TENSOR_ARRAY where the input tensor is written.
 
     Examples:
-        .. code-block::python
+        .. code-block:: python
 
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
@@ -927,14 +970,17 @@ def array_write(x, i, array=None):
 
 
 def create_array(dtype):
-    """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the
-    LayerHelper.
+    """
+    **Create LoDTensorArray**
+
+    This function creates an array of LOD_TENSOR_ARRAY . It is mainly used to
+    implement RNN with array_write, array_read and While.
 
     Args:
-        dtype (int|float): The data type of the elements in the array.
+        dtype (int|float): The data type of the elements in the lod_tensor_array.
 
     Returns:
-        Variable: The tensor variable storing the elements of data type.
+        Variable: The lod_tensor_array variable storing the elements of data type.
 
     Examples:
         .. code-block:: python
@@ -949,33 +995,40 @@ def create_array(dtype):
         dtype=dtype)
 
 
-def less_than(x, y, cond=None, **ignored):
+@templatedoc()
+def less_than(x, y, force_cpu=None, cond=None, **ignored):
     """
-    **Less than**
+    ${comment}
 
-    This layer returns the truth value of :math:`x < y` elementwise.
+    >>> import paddle.fluid as fluid
+    >>> less = fluid.layers.less_than(x=label, y=limit)
 
     Args:
-        x(Variable): First operand of *less_than*
-        y(Variable): Second operand of *less_than*
+        x(${x_type}): ${x_comment}.
+        y(${y_type}): ${y_comment}.
+        force_cpu(${force_cpu_type}): ${force_cpu_comment}.
         cond(Variable|None): Optional output variable to store the result of *less_than*
 
     Returns:
-        Variable: The tensor variable storing the output of *less_than*.
-
-    Examples:
-        .. code-block:: python
-
-          less = fluid.layers.less_than(x=label, y=limit)
+        ${out_comment}.
     """
     helper = LayerHelper("less_than", **locals())
     if cond is None:
         cond = helper.create_tmp_variable(dtype='bool')
         cond.stop_gradient = True
 
+    attrs = dict()
+    if force_cpu is not None:
+        attrs['force_cpu'] = force_cpu
+    elif force_init_on_cpu():
+        attrs['force_cpu'] = force_init_on_cpu()
+
     helper.append_op(
-        type='less_than', inputs={'X': [x],
-                                  'Y': [y]}, outputs={'Out': [cond]})
+        type='less_than',
+        inputs={'X': [x],
+                'Y': [y]},
+        outputs={'Out': [cond]},
+        attrs=attrs)
     return cond
 
 
@@ -1010,16 +1063,34 @@ def equal(x, y, cond=None, **ignored):
 
 
 def array_read(array, i):
-    """This function performs the operation to read the data in as an
+    """
+    This function performs the operation to read the data in as an
     LOD_TENSOR_ARRAY.
+
+    .. code-block:: text
+
+        Given:
+
+        array = [0.6, 0.1, 0.3, 0.1]
+        
+        And:
+        
+        i = 2
+
+        Then:
+
+        output = 0.3
+
     Args:
-        array (Variable|list): The input tensor that will be written to an array.
-        i (Variable|list): The subscript index in tensor array, that points the
-                           place where data will be written to.
+        array (Variable|list): The input tensor that store data to be read.
+        i (Variable|list): The index of the data to be read from input array.
+
     Returns:
         Variable: The tensor type variable that has the data written to it.
+
     Examples:
-        .. code-block::python
+        .. code-block:: python
+
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
           arr = layers.array_read(tmp, i=i)
@@ -1040,8 +1111,28 @@ def array_read(array, i):
 
 def shrink_memory(x, i, table):
     """
-    This function creates an operator to shrink_rnn_memory using the RankTable
+    This function creates an operator to shrink rnn memory using the RankTable
     as mentioned in the input parameter.
+
+    NOTE: This API is very low-level API. It is used by DynamicRNN only.
+
+    Since the Dynamic RNN uses no-padding way to implement RNN. The sequence
+    will be sorted by order, and the length of valid memory will be shrink after
+    each time step.
+
+    Args:
+        x(Variable): The memory object in the previous time step.
+        i(Variable): The step count variable. A int scalar as LoDTensor.
+        table(Variable): The RNNRankTable object.
+
+    Returns:
+        the memory variable after shrink.
+
+    Examples:
+
+        Since this API is very low level API. The example is not provided.
+        Please reference the implementation of class DynamicRNN for detail
+        usage.
     """
     helper = LayerHelper('shrink_memory', **locals())
     out = helper.create_tmp_variable(dtype=x.dtype)
@@ -1056,9 +1147,14 @@ def shrink_memory(x, i, table):
 
 
 def array_length(array):
-    """This function performs the operation to find the length of the input
+    """
+    **Get the Length of Input LoDTensorArray**
+
+    This function performs the operation to find the length of the input
     LOD_TENSOR_ARRAY.
 
+    Related API: array_read, array_write, While.
+
     Args:
         array (LOD_TENSOR_ARRAY): The input array that will be used
                                   to compute the length.
@@ -1067,12 +1163,13 @@ def array_length(array):
         Variable: The length of the input LoDTensorArray.
 
     Examples:
-        .. code-block::python
+        .. code-block:: python
 
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
           arr = fluid.layers.array_write(tmp, i=i)
           arr_len = fluid.layers.array_length(arr)
+
     """
     helper = LayerHelper('array_length', **locals())
     tmp = helper.create_tmp_variable(dtype='int64')
@@ -1083,6 +1180,13 @@ def array_length(array):
 
 
 class ConditionalBlockGuard(BlockGuard):
+    """
+    ConditionalBlockGuard is derived from BlockGuard. It is dedicated for 
+    holding a ConditionalBlock, and helping users entering and exiting the 
+    ConditionalBlock via Python's 'with' keyword. However, ConditionalBlockGuard 
+    is generally an internal component of IfElse, users should not use it directly.
+    """
+
     def __init__(self, block):
         if not isinstance(block, ConditionalBlock):
             raise TypeError("block should be conditional block")
@@ -1099,6 +1203,31 @@ class ConditionalBlockGuard(BlockGuard):
 
 
 class ConditionalBlock(object):
+    '''
+    **ConditionalBlock**
+
+    ConditionalBlock is an operator that bind a block to a specific condition,
+    if the condition matches, the corresponding block will be executed.
+
+    Args:
+        inputs (Variable): bool conditions.
+        is_scalar_condition (bool): whether the branch is controled by a scalar.
+        name(str): name of this ConditionalBlock.
+
+    Examples:
+        .. code-block:: python
+
+             cond = layers.less_than(x=label, y=limit)
+             true_image, false_image = layers.split_lod_tensor(
+                 input=image, mask=cond)
+             true_cond = layers.ConditionalBlock([true_image])
+
+             with true_cond.block():
+                 ...
+             with false_cond.block():
+                 ...
+    '''
+
     def __init__(self, inputs, is_scalar_condition=False, name=None):
         for each_input in inputs:
             if not isinstance(each_input, Variable):
@@ -1130,7 +1259,7 @@ class ConditionalBlock(object):
         input_set = set([ipt.name for ipt in self.inputs])
 
         param_list = [
-            parent_block.var(each_name) for each_name in params
+            parent_block.var_recursive(each_name) for each_name in params
             if each_name not in input_set
         ]
 
@@ -1156,6 +1285,42 @@ class ConditionalBlock(object):
 
 
 class Switch(object):
+    """
+    Switch class works just like a `if-elif-else`. Can be used in learning rate scheduler
+    to modify learning rate
+
+    The Semantics:
+
+    1. A `switch` control-flow checks cases one-by-one.
+
+    2. The condition of each case is a boolean value, which is a scalar Variable.
+
+    3. It runs the first matched case, or the default case if there is one.
+
+    4. Once it matches a case, it runs the corresponding branch and only that branch.
+
+    Examples:
+        .. code-block:: python
+
+            lr = fluid.layers.tensor.create_global_var(
+                shape=[1],
+                value=0.0,
+                dtype='float32',
+                persistable=True,
+                name="learning_rate")
+            one_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=1.0)
+            two_var = tensor.fill_constant(
+                shape=[1], dtype='float32', value=2.0)
+
+            with fluid.layers.control_flow.Switch() as switch:
+                with switch.case(global_step == zero_var):
+                    fluid.layers.tensor.assign(input=one_var, output=lr)
+                with switch.default():
+                    fluid.layers.tensor.assign(input=two_var, output=lr)
+
+    """
+
     def __init__(self, name=None):
         self.helper = LayerHelper('switch', name=name)
         self.inside_scope = False
@@ -1185,7 +1350,8 @@ class Switch(object):
         return ConditionalBlockGuard(cond_block)
 
     def default(self):
-        """create a default case for this switch
+        """
+        create a default case for this switch
         """
         pre_cond_num = len(self.pre_not_conditions)
         if pre_cond_num == 0:
@@ -1245,6 +1411,34 @@ class IfElseBlockGuard(object):
 
 
 class IfElse(object):
+    """
+    if-else control flow.
+
+    Args:
+        cond (Variable): condition used to compare.
+        name (str, default None): The name of this layer.
+
+    Examples:
+          .. code-block:: python
+
+            limit = fluid.layers.fill_constant_batch_size_like(
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = fluid.layers.less_than(x=label, y=limit)
+            ie = fluid.layers.IfElse(cond)
+            with ie.true_block():
+                true_image = ie.input(image)
+                hidden = fluid.layers.fc(input=true_image, size=100, act='tanh')
+                prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+
+            with ie.false_block():
+                false_image = ie.input(image)
+                hidden = fluid.layers.fc(
+                    input=false_image, size=200, act='tanh')
+                prob = fluid.layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+            prob = ie()
+    """
     OUT_IF_ELSE_BLOCKS = 0
     IN_IF_ELSE_TRUE_BLOCKS = 1
     IN_IF_ELSE_FALSE_BLOCKS = 2
@@ -1347,6 +1541,38 @@ class IfElse(object):
 
 
 class DynamicRNN(object):
+    """
+    The dynamic RNN can process a batch of sequence data. The length of each
+    sample sequence can be different. This API automatically process them in
+    batch.
+
+    The input lod must be set. Please reference `lod_tensor`
+
+    >>> import paddle.fluid as fluid
+    >>> data = fluid.layers.data(name='sentence', dtype='int64', lod_level=1)
+    >>> embedding = fluid.layers.embedding(input=data, size=[65535, 32],
+    >>>                                    is_sparse=True)
+    >>>
+    >>> drnn = fluid.layers.DynamicRNN()
+    >>> with drnn.block():
+    >>>     word = drnn.step_input(embedding)
+    >>>     prev = drnn.memory(shape=[200])
+    >>>     hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu')
+    >>>     drnn.update_memory(prev, hidden)  # set prev to hidden
+    >>>     drnn.output(hidden)
+    >>>
+    >>> # last is the last time step of rnn. It is the encoding result.
+    >>> last = fluid.layers.sequence_last_step(drnn())
+
+    The dynamic RNN will unfold sequence into timesteps. Users need to define
+    how to process each time step during the :code:`with` block.
+
+    The `memory` is used staging data cross time step. The initial value of
+    memory can be zero or another variable.
+
+    The dynamic RNN can mark multiple variables as its output. Use `drnn()` to
+    get the output sequence.
+    """
     BEFORE_RNN = 0
     IN_RNN = 1
     AFTER_RNN = 2
@@ -1369,6 +1595,15 @@ class DynamicRNN(object):
         self.mem_link = []
 
     def step_input(self, x):
+        """
+        Mark a sequence as a dynamic RNN input.
+        Args:
+            x(Variable): The input sequence.
+
+        Returns:
+            The current timestep in the input sequence.
+
+        """
         self._assert_in_rnn_block_("step_input")
         if not isinstance(x, Variable):
             raise TypeError(
@@ -1396,7 +1631,8 @@ class DynamicRNN(object):
                 type='less_than',
                 inputs={'X': self.step_idx,
                         'Y': self.max_seq_len},
-                outputs={'Out': self.cond})
+                outputs={'Out': self.cond},
+                attrs={'force_cpu': True})
 
         input_array = parent_block.create_var(
             name=unique_name.generate('dynamic_rnn_input_array'),
@@ -1411,6 +1647,15 @@ class DynamicRNN(object):
         return array_read(array=input_array, i=self.step_idx)
 
     def static_input(self, x):
+        """
+        Mark a variable as a RNN input. The input will not be scattered into
+        time steps.
+        Args:
+            x(Variable): The input variable.
+
+        Returns:
+            The input variable that can access in RNN.
+        """
         self._assert_in_rnn_block_("static_input")
         if not isinstance(x, Variable):
             raise TypeError(
@@ -1432,6 +1677,10 @@ class DynamicRNN(object):
 
     @contextlib.contextmanager
     def block(self):
+        """
+        The block for user to define operators in RNN. See the class docstring
+        for more details.
+        """
         if self.status != DynamicRNN.BEFORE_RNN:
             raise ValueError("rnn.block() can only be invoke once")
         self.step_idx = fill_constant(
@@ -1445,7 +1694,11 @@ class DynamicRNN(object):
             for new_mem, mem_array in self.mem_link:
                 array_write(x=new_mem, i=self.step_idx, array=mem_array)
 
-            less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond)
+            less_than(
+                x=self.step_idx,
+                y=self.max_seq_len,
+                force_cpu=True,
+                cond=self.cond)
 
         self.status = DynamicRNN.AFTER_RNN
         for each_array in self.output_array:
@@ -1454,6 +1707,9 @@ class DynamicRNN(object):
                     x=each_array, table=self.lod_rank_table))
 
     def __call__(self, *args, **kwargs):
+        """
+        Get the output of RNN. This API should only be invoked after RNN.block()
+        """
         if self.status != DynamicRNN.AFTER_RNN:
             raise ValueError(("Output of the dynamic RNN can only be visited "
                               "outside the rnn block."))
@@ -1468,6 +1724,70 @@ class DynamicRNN(object):
                value=0.0,
                need_reorder=False,
                dtype='float32'):
+        """
+        Create a memory variable for dynamic rnn.
+
+        If the :code:`init` is not None, :code:`memory` will be initialized by
+        this variable. The :code:`need_reorder` is used to reorder the memory as
+        the input variable. It should be set to true when the initialized memory
+        depends on the input sample.
+
+        For example,
+
+        >>> import paddle.fluid as fluid
+        >>> sentence = fluid.layers.data(
+        >>>                 name='sentence', dtype='float32', shape=[32])
+        >>> boot_memory = fluid.layers.data(
+        >>>                 name='boot', dtype='float32', shape=[10])
+        >>>
+        >>> drnn = fluid.layers.DynamicRNN()
+        >>> with drnn.block():
+        >>>     word = drnn.step_input(sentence)
+        >>>     memory = drnn.memory(init=boot_memory, need_reorder=True)
+        >>>     hidden = fluid.layers.fc(
+        >>>                 input=[word, memory], size=10, act='tanh')
+        >>>     drnn.update_memory(ex_mem=memory, new_mem=hidden)
+        >>>     drnn.output(hidden)
+        >>> rnn_output = drnn()
+
+
+        Otherwise, if :code:`shape`, :code:`value`, :code:`dtype` are set, the
+        :code:`memory` will be initialized by this :code:`value`.
+
+        For example,
+
+        >>> import paddle.fluid as fluid
+        >>> sentence = fluid.layers.data(
+        >>>                 name='sentence', dtype='float32', shape=[32])
+        >>>
+        >>> drnn = fluid.layers.DynamicRNN()
+        >>> with drnn.block():
+        >>>     word = drnn.step_input(sentence)
+        >>>     memory = drnn.memory(shape=[10], dtype='float32', value=0)
+        >>>     hidden = fluid.layers.fc(
+        >>>             input=[word, memory], size=10, act='tanh')
+        >>>     drnn.update_memory(ex_mem=memory, new_mem=hidden)
+        >>>     drnn.output(hidden)
+        >>> rnn_output = drnn()
+
+
+        Args:
+            init(Variable|None): The initialized variable.
+
+            shape(list|tuple): The memory shape. NOTE the shape does not contain
+            batch_size.
+
+            value(float): the initalized value.
+
+            need_reorder(bool): True if the initialized memory depends on the
+            input sample.
+
+            dtype(str|numpy.dtype): The data type of the initialized memory.
+
+        Returns:
+            the memory variable.
+
+        """
         self._assert_in_rnn_block_('memory')
         if init is not None:
             if not isinstance(init, Variable):
@@ -1535,6 +1855,16 @@ class DynamicRNN(object):
             return self.memory(init=init)
 
     def update_memory(self, ex_mem, new_mem):
+        """
+        Update the memory from ex_mem to new_mem. NOTE that the shape and data
+        type of :code:`ex_mem` and :code:`new_mem` must be same.
+        Args:
+            ex_mem(Variable): the memory variable.
+            new_mem(Variable): the plain variable generated in RNN block.
+
+        Returns:
+            None
+        """
         self._assert_in_rnn_block_('update_memory')
         if not isinstance(ex_mem, Variable):
             raise TypeError("The input arg `ex_mem` of update_memory() must "
@@ -1552,6 +1882,15 @@ class DynamicRNN(object):
         self.mem_link.append((new_mem, mem_array))
 
     def output(self, *outputs):
+        """
+        mark the RNN output variables.
+
+        Args:
+            outputs: The output variables.
+
+        Returns:
+            None
+        """
         self._assert_in_rnn_block_('output')
         parent_block = self._parent_block_()
         for each in outputs:
@@ -1590,3 +1929,40 @@ def reorder_lod_tensor_by_rank(x, rank_table):
                 'RankTable': [rank_table]},
         outputs={'Out': [out]})
     return out
+
+
+def is_empty(x, cond=None, **ignored):
+    """
+    Test whether a Variable is empty.
+
+    Args:
+        x (Variable): The Variable to be tested.
+        cond (Variable|None): Output parameter. Returns the test result 
+                              of given 'x'. Default: None
+
+    Returns:
+        Variable: A bool scalar. True if 'x' is an empty Variable.
+
+    Raises:
+        TypeError: If input cond is not a variable, or cond's dtype is
+                   not bool.
+
+    Examples:
+        .. code-block:: python
+
+          res = fluid.layers.is_empty(x=input)
+          # or:
+          fluid.layers.is_empty(x=input, cond=res)
+    """
+    helper = LayerHelper("is_empty", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+    elif not isinstance(cond, Variable):
+        raise TypeError("cond takes a variable")
+    elif cond.dtype != 'bool':
+        raise TypeError("The data type of cond must be bool")
+
+    helper.append_op(
+        type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]})
+    return cond
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index ea189749bc6cc1e37c1efc6fea424143b887cecd..6af01297df54ffd4201776d20d51a88f5808ccb0 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -16,20 +16,21 @@ All layers just related to the detection neural network.
 """
 
 from layer_function_generator import generate_layer_fn
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
 import tensor
-import ops
 import nn
 import math
 
 __all__ = [
+    'prior_box',
     'multi_box_head',
     'bipartite_match',
     'target_assign',
     'detection_output',
     'ssd_loss',
     'detection_map',
+    'anchor_generator',
 ]
 
 __auto__ = [
@@ -58,7 +59,7 @@ def detection_output(loc,
 
     This operation is to get the detection results by performing following
     two steps:
-    
+
     1. Decode input bounding box predictions according to the prior boxes.
     2. Get the final detection results by applying multi-class non maximum
        suppression (NMS).
@@ -97,7 +98,9 @@ def detection_output(loc,
         nms_eta(float): The parameter for adaptive NMS.
 
     Returns:
-        Variable: The detection outputs is a LoDTensor with shape [No, 6].
+        Variable: 
+        
+            The detection outputs is a LoDTensor with shape [No, 6].
             Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
             `No` is the total number of detections in this mini-batch. For each
             instance, the offsets in first dimension are called LoD, the offset
@@ -110,15 +113,15 @@ def detection_output(loc,
     Examples:
         .. code-block:: python
 
-        pb = layers.data(name='prior_box', shape=[10, 4],
+            pb = layers.data(name='prior_box', shape=[10, 4],
                          append_batch_size=False, dtype='float32')
-        pbv = layers.data(name='prior_box_var', shape=[10, 4],
+            pbv = layers.data(name='prior_box_var', shape=[10, 4],
                           append_batch_size=False, dtype='float32')
-        loc = layers.data(name='target_box', shape=[2, 21, 4],
+            loc = layers.data(name='target_box', shape=[2, 21, 4],
                           append_batch_size=False, dtype='float32')
-        scores = layers.data(name='scores', shape=[2, 21, 10],
+            scores = layers.data(name='scores', shape=[2, 21, 10],
                           append_batch_size=False, dtype='float32')
-        nmsed_outs = fluid.layers.detection_output(scores=scores,
+            nmsed_outs = fluid.layers.detection_output(scores=scores,
                                        loc=loc,
                                        prior_box=pb,
                                        prior_box_var=pbv)
@@ -129,13 +132,12 @@ def detection_output(loc,
         prior_box_var=prior_box_var,
         target_box=loc,
         code_type='decode_center_size')
-
     old_shape = scores.shape
-    scores = ops.reshape(x=scores, shape=(-1, old_shape[-1]))
-    scores = ops.softmax(x=scores)
-    scores = ops.reshape(x=scores, shape=old_shape)
+    scores = nn.reshape(x=scores, shape=(-1, old_shape[-1]))
+    scores = nn.softmax(input=scores)
+    scores = nn.reshape(x=scores, shape=old_shape)
     scores = nn.transpose(scores, perm=[0, 2, 1])
-
+    scores.stop_gradient = True
     nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
     helper.append_op(
         type="multiclass_nms",
@@ -150,10 +152,11 @@ def detection_output(loc,
             'score_threshold': score_threshold,
             'nms_eta': 1.0
         })
+    nmsed_outs.stop_gradient = True
     return nmsed_outs
 
 
-@autodoc()
+@templatedoc()
 def detection_map(detect_res,
                   label,
                   class_num,
@@ -164,6 +167,47 @@ def detection_map(detect_res,
                   input_states=None,
                   out_states=None,
                   ap_version='integral'):
+    """
+    ${comment}
+
+    Args:
+        detect_res: ${detect_res_comment}
+        label:  ${label_comment}
+        class_num: ${class_num_comment}
+        background_label: ${background_label_comment}
+        overlap_threshold: ${overlap_threshold_comment}
+        evaluate_difficult: ${evaluate_difficult_comment}
+        has_state: ${has_state_comment}
+        input_states: If not None, It contains 3 elements:
+            1. pos_count ${pos_count_comment}.
+            2. true_pos ${true_pos_comment}.
+            3. false_pos ${false_pos_comment}.
+        out_states: If not None, it contains 3 elements.
+            1. accum_pos_count ${accum_pos_count_comment}.
+            2. accum_true_pos ${accum_true_pos_comment}.
+            3. accum_false_pos ${accum_false_pos_comment}.
+        ap_version: ${ap_type_comment}
+
+    Returns:
+        ${map_comment}
+
+
+    Examples:
+          .. code-block:: python
+
+            detect_res = fluid.layers.data(
+                name='detect_res',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+            label = fluid.layers.data(
+                name='label',
+                shape=[10, 6],
+                append_batch_size=False,
+                dtype='float32')
+
+            map_out = fluid.layers.detection_map(detect_res, label, 21)
+    """
     helper = LayerHelper("detection_map", **locals())
 
     def __create_var(type):
@@ -210,53 +254,68 @@ def bipartite_match(dist_matrix,
                     dist_threshold=None,
                     name=None):
     """
-    **Bipartite matchint operator**
-
-    This operator is a greedy bipartite matching algorithm, which is used to
-    obtain the matching with the maximum distance based on the input
+    This operator implements a greedy bipartite matching algorithm, which is
+    used to obtain the matching with the maximum distance based on the input
     distance matrix. For input 2D matrix, the bipartite matching algorithm can
-    find the matched column for each row, also can find the matched row for
-    each column. And this operator only calculate matched indices from column
-    to row. For each instance, the number of matched indices is the number of
-    of columns of the input ditance matrix.
-
-    There are two outputs to save matched indices and distance.
-    A simple description, this algothrim matched the best (maximum distance)
+    find the matched column for each row (matched means the largest distance),
+    also can find the matched row for each column. And this operator only
+    calculate matched indices from column to row. For each instance,
+    the number of matched indices is the column number of the input distance
+    matrix.
+
+    There are two outputs, matched indices and distance.
+    A simple description, this algorithm matched the best (maximum distance)
     row entity to the column entity and the matched indices are not duplicated
     in each row of ColToRowMatchIndices. If the column entity is not matched
     any row entity, set -1 in ColToRowMatchIndices.
 
-    Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
+    NOTE: the input DistMat can be LoDTensor (with LoD) or Tensor.
     If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
     If Tensor, the height of ColToRowMatchIndices is 1.
 
+    NOTE: This API is a very low level API. It is used by :code:`ssd_loss`
+    layer. Please consider to use :code:`ssd_loss` instead.
+
     Args:
         dist_matrix(Variable): This input is a 2-D LoDTensor with shape
             [K, M]. It is pair-wise distance matrix between the entities
             represented by each row and each column. For example, assumed one
             entity is A with shape [K], another entity is B with shape [M]. The
-            dist_matirx[i][j] is the distance between A[i] and B[j]. The bigger
-            the distance is, the better macthing the pairs are. Please note,
-            This tensor can contain LoD information to represent a batch of
-            inputs. One instance of this batch can contain different numbers of
-            entities.
+            dist_matrix[i][j] is the distance between A[i] and B[j]. The bigger
+            the distance is, the better matching the pairs are.
+
+            NOTE: This tensor can contain LoD information to represent a batch
+            of inputs. One instance of this batch can contain different numbers
+            of entities.
         match_type(string|None): The type of matching method, should be
-           'bipartite' or 'per_prediction', 'bipartite' by defalut.
+           'bipartite' or 'per_prediction'. [default 'bipartite'].
         dist_threshold(float|None): If `match_type` is 'per_prediction',
             this threshold is to determine the extra matching bboxes based
-            on the maximum distance, 0.5 by defalut.
+            on the maximum distance, 0.5 by default.
     Returns:
-        match_indices(Variable): A 2-D Tensor with shape [N, M] in int type.
-            N is the batch size. If match_indices[i][j] is -1, it
-            means B[j] does not match any entity in i-th instance.
-            Otherwise, it means B[j] is matched to row
-            match_indices[i][j] in i-th instance. The row number of
-            i-th instance is saved in match_indices[i][j].
-        match_distance(Variable): A 2-D Tensor with shape [N, M] in float type.
-            N is batch size. If match_indices[i][j] is -1,
-            match_distance[i][j] is also -1.0. Otherwise, assumed
-            match_distance[i][j] = d, and the row offsets of each instance
-            are called LoD. Then match_distance[i][j] = dist_matrix[d+LoD[i]][j].
+        tuple: a tuple with two elements is returned. The first is
+        matched_indices, the second is matched_distance.
+
+        The matched_indices is a 2-D Tensor with shape [N, M] in int type.
+        N is the batch size. If match_indices[i][j] is -1, it
+        means B[j] does not match any entity in i-th instance.
+        Otherwise, it means B[j] is matched to row
+        match_indices[i][j] in i-th instance. The row number of
+        i-th instance is saved in match_indices[i][j].
+
+        The matched_distance is a 2-D Tensor with shape [N, M] in float type
+        . N is batch size. If match_indices[i][j] is -1,
+        match_distance[i][j] is also -1.0. Otherwise, assumed
+        match_distance[i][j] = d, and the row offsets of each instance
+        are called LoD. Then match_distance[i][j] =
+        dist_matrix[d+LoD[i]][j].
+
+    Examples:
+
+        >>> x = fluid.layers.data(name='x', shape=[4], dtype='float32')
+        >>> y = fluid.layers.data(name='y', shape=[4], dtype='float32')
+        >>> iou = fluid.layers.iou_similarity(x=x, y=y)
+        >>> matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
     """
     helper = LayerHelper('bipartite_match', **locals())
     match_indices = helper.create_tmp_variable(dtype='int32')
@@ -281,8 +340,6 @@ def target_assign(input,
                   mismatch_value=None,
                   name=None):
     """
-    **Target assigner operator**
-
     This operator can be, for given the target bounding boxes or labels,
     to assign classification and regression targets to each prediction as well as
     weights to prediction. The weights is used to specify which prediction would
@@ -296,20 +353,24 @@ def target_assign(input,
 
     1. Assigning all outpts based on `match_indices`:
 
-    If id = match_indices[i][j] > 0,
+    .. code-block:: text
+
+        If id = match_indices[i][j] > 0,
 
-        out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
-        out_weight[i][j] = 1.
+            out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K]
+            out_weight[i][j] = 1.
 
-    Otherwise,
+        Otherwise,
 
-        out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
-        out_weight[i][j] = 0.
+            out[j][j][0 : K] = {mismatch_value, mismatch_value, ...}
+            out_weight[i][j] = 0.
 
     2. Assigning out_weight based on `neg_indices` if `neg_indices` is provided:
 
     Assumed that the row offset for each instance in `neg_indices` is called neg_lod,
     for i-th instance and each `id` of neg_indices in this instance:
+    
+    .. code-block:: text
 
         out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
         out_weight[i][id] = 1.0
@@ -326,10 +387,23 @@ def target_assign(input,
        mismatch_value (float32): Fill this value to the mismatched location.
 
     Returns:
-       out (Variable): The output is a 3D Tensor with shape [N, P, K],
-           N and P is the same as they are in `neg_indices`, K is the
-           same as it in input of X. If `match_indices[i][j]`.
-       out_weight (Variable): The weight for output with the shape of [N, P, 1].
+        tuple: 
+        
+               A tuple(out, out_weight) is returned. out is a 3D Tensor with 
+               shape [N, P, K], N and P is the same as they are in 
+               `neg_indices`, K is the same as it in input of X. If 
+               `match_indices[i][j]`. out_weight is the weight for output with 
+               the shape of [N, P, 1].
+
+    Examples:
+
+        .. code-block:: python
+
+            matched_indices, matched_dist = fluid.layers.bipartite_match(iou)
+            gt = layers.data(
+                        name='gt', shape=[1, 1], dtype='int32', lod_level=1)
+            trg, trg_weight = layers.target_assign(
+                            gt, matched_indices, mismatch_value=0)
     """
     helper = LayerHelper('target_assign', **locals())
     out = helper.create_tmp_variable(dtype=input.dtype)
@@ -364,7 +438,7 @@ def ssd_loss(location,
              normalize=True,
              sample_size=None):
     """
-    **Multi-box loss layer for object dection algorithm of SSD**
+    **Multi-box loss layer for object detection algorithm of SSD**
 
     This layer is to compute dection loss for SSD given the location offset
     predictions, confidence predictions, prior boxes and ground-truth boudding
@@ -372,21 +446,35 @@ def ssd_loss(location,
     is a weighted sum of the localization loss (or regression loss) and
     confidence loss (or classification loss) by performing the following steps:
 
-    1. Find matched boundding box by bipartite matching algorithm.
+    1. Find matched bounding box by bipartite matching algorithm.
+
       1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
+
       1.2 Compute matched boundding box by bipartite matching algorithm.
+
     2. Compute confidence for mining hard examples
+
       2.1. Get the target label based on matched indices.
+
       2.2. Compute confidence loss.
+
     3. Apply hard example mining to get the negative example indices and update
        the matched indices.
+
     4. Assign classification and regression targets
+
       4.1. Encoded bbox according to the prior boxes.
+
       4.2. Assign regression targets.
+
       4.3. Assign classification targets.
+
     5. Compute the overall objective loss.
+
       5.1 Compute confidence loss.
+
       5.1 Compute localization loss.
+
       5.3 Compute the overall weighted loss.
 
     Args:
@@ -421,39 +509,36 @@ def ssd_loss(location,
         mining_type (str): The hard example mining type, should be 'hard_example'
             or 'max_negative', now only support `max_negative`.
         normalize (bool): Whether to normalize the SSD loss by the total number
-            of output locations, True by defalut.
+            of output locations, True by default.
         sample_size (int): The max sample size of negative box, used only when
             mining_type is 'hard_example'.
 
     Returns:
-        Variable: The weighted sum of the localization loss and confidence loss,
-            with shape [N * Np, 1], N and Np are the same as they are
-            in `location`.
+        The weighted sum of the localization loss and confidence loss, with \
+        shape [N * Np, 1], N and Np are the same as they are in `location`.
 
     Raises:
-        ValueError: If mining_type is 'hard_example', now only support
-            mining type of `max_negative`.
+        ValueError: If mining_type is 'hard_example', now only support mining \
+        type of `max_negative`.
 
     Examples:
-        .. code-block:: python
-
-            pb = layers.data(
-                name='prior_box',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            pbv = layers.data(
-                name='prior_box_var',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            loc = layers.data(name='target_box', shape=[10, 4], dtype='float32')
-            scores = layers.data(name='scores', shape=[10, 21], dtype='float32')
-            gt_box = layers.data(
-                name='gt_box', shape=[4], lod_level=1, dtype='float32')
-            gt_label = layers.data(
-                name='gt_label', shape=[1], lod_level=1, dtype='float32')
-            loss = layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
+        >>> pb = fluid.layers.data(
+        >>>                   name='prior_box',
+        >>>                   shape=[10, 4],
+        >>>                   append_batch_size=False,
+        >>>                   dtype='float32')
+        >>> pbv = fluid.layers.data(
+        >>>                   name='prior_box_var',
+        >>>                   shape=[10, 4],
+        >>>                   append_batch_size=False,
+        >>>                   dtype='float32')
+        >>> loc = fluid.layers.data(name='target_box', shape=[10, 4], dtype='float32')
+        >>> scores = fluid.layers.data(name='scores', shape=[10, 21], dtype='float32')
+        >>> gt_box = fluid.layers.data(
+        >>>         name='gt_box', shape=[4], lod_level=1, dtype='float32')
+        >>> gt_label = fluid.layers.data(
+        >>>         name='gt_label', shape=[1], lod_level=1, dtype='float32')
+        >>> loss = fluid.layers.ssd_loss(loc, scores, gt_box, gt_label, pb, pbv)
     """
 
     helper = LayerHelper('ssd_loss', **locals())
@@ -463,7 +548,7 @@ def ssd_loss(location,
     num, num_prior, num_class = confidence.shape
 
     def __reshape_to_2d(var):
-        return ops.reshape(x=var, shape=[-1, var.shape[-1]])
+        return nn.reshape(x=var, shape=[-1, var.shape[-1]])
 
     # 1. Find matched boundding box by prior box.
     #   1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
@@ -474,7 +559,8 @@ def ssd_loss(location,
 
     # 2. Compute confidence for mining hard examples
     # 2.1. Get the target label based on matched indices
-    gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+    gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+    gt_label.stop_gradient = True
     target_label, _ = target_assign(
         gt_label, matched_indices, mismatch_value=background_label)
     # 2.2. Compute confidence loss.
@@ -482,10 +568,12 @@ def ssd_loss(location,
     confidence = __reshape_to_2d(confidence)
     target_label = tensor.cast(x=target_label, dtype='int64')
     target_label = __reshape_to_2d(target_label)
+    target_label.stop_gradient = True
     conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
 
     # 3. Mining hard examples
-    conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior))
+    conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior))
+    conf_loss.stop_gradient = True
     neg_indices = helper.create_tmp_variable(dtype='int32')
     dtype = matched_indices.dtype
     updated_matched_indices = helper.create_tmp_variable(dtype=dtype)
@@ -553,7 +641,7 @@ def ssd_loss(location,
     # 5.3 Compute overall weighted loss.
     loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
     # reshape to [N, Np], N is the batch size and Np is the prior box number.
-    loss = ops.reshape(x=loss, shape=[-1, num_prior])
+    loss = nn.reshape(x=loss, shape=[-1, num_prior])
     loss = nn.reduce_sum(loss, dim=1, keep_dim=True)
     if normalize:
         normalizer = nn.reduce_sum(target_loc_weight)
@@ -562,6 +650,119 @@ def ssd_loss(location,
     return loss
 
 
+def prior_box(input,
+              image,
+              min_sizes,
+              max_sizes=None,
+              aspect_ratios=[1.],
+              variance=[0.1, 0.1, 0.2, 0.2],
+              flip=False,
+              clip=False,
+              steps=[0.0, 0.0],
+              offset=0.5,
+              name=None):
+    """
+    **Prior Box Operator**
+
+    Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
+    Each position of the input produce N prior boxes, N is determined by
+    the count of min_sizes, max_sizes and aspect_ratios, The size of the
+    box is in range(min_size, max_size) interval, which is generated in
+    sequence according to the aspect_ratios.
+
+    Args:
+       input(Variable): The Input Variables, the format is NCHW.
+       image(Variable): The input image data of PriorBoxOp,
+            the layout is NCHW.
+       min_sizes(list|tuple|float value): min sizes of generated prior boxes.
+       max_sizes(list|tuple|None): max sizes of generated prior boxes.
+            Default: None.
+       aspect_ratios(list|tuple|float value): the aspect ratios of generated
+            prior boxes. Default: [1.].
+       variance(list|tuple): the variances to be encoded in prior boxes.
+            Default:[0.1, 0.1, 0.2, 0.2].
+       flip(bool): Whether to flip aspect ratios. Default:False.
+       clip(bool): Whether to clip out-of-boundary boxes. Default: False.
+       step(list|turple): Prior boxes step across width and height, If
+            step[0] == 0.0/step[1] == 0.0, the prior boxes step across
+            height/weight of the input will be automatically calculated.
+            Default: [0., 0.]
+       offset(float): Prior boxes center offset. Default: 0.5
+       name(str): Name of the prior box op. Default: None.
+
+    Returns:
+        tuple: A tuple with two Variable (boxes, variances)
+
+        boxes: the output prior boxes of PriorBox.
+        The layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input,
+        num_priors is the total
+        box count of each position of input.
+
+        variances: the expanded variances of PriorBox.
+        The layout is [H, W, num_priors, 4].
+        H is the height of input, W is the width of input
+        num_priors is the total
+        box count of each position of input
+
+
+    Examples:
+        .. code-block:: python
+
+            box, var = fluid.layers.prior_box(
+                input=conv1,
+                image=images,
+                min_sizes=[100.],
+                flip=True,
+                clip=True)
+    """
+    helper = LayerHelper("prior_box", **locals())
+    dtype = helper.input_dtype()
+
+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(min_sizes):
+        min_sizes = [min_sizes]
+    if not _is_list_or_tuple_(aspect_ratios):
+        aspect_ratios = [aspect_ratios]
+    if not (_is_list_or_tuple_(steps) and len(steps) == 2):
+        raise ValueError('steps should be a list or tuple ',
+                         'with length 2, (step_width, step_height).')
+
+    min_sizes = list(map(float, min_sizes))
+    aspect_ratios = list(map(float, aspect_ratios))
+    steps = list(map(float, steps))
+
+    attrs = {
+        'min_sizes': min_sizes,
+        'aspect_ratios': aspect_ratios,
+        'variances': variance,
+        'flip': flip,
+        'clip': clip,
+        'step_w': steps[0],
+        'step_h': steps[1],
+        'offset': offset
+    }
+    if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0:
+        if not _is_list_or_tuple_(max_sizes):
+            max_sizes = [max_sizes]
+        attrs['max_sizes'] = max_sizes
+
+    box = helper.create_tmp_variable(dtype)
+    var = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="prior_box",
+        inputs={"Input": input,
+                "Image": image},
+        outputs={"Boxes": box,
+                 "Variances": var},
+        attrs=attrs, )
+    box.stop_gradient = True
+    var.stop_gradient = True
+    return box, var
+
+
 def multi_box_head(inputs,
                    image,
                    base_size,
@@ -583,11 +784,9 @@ def multi_box_head(inputs,
                    stride=1,
                    name=None):
     """
-    **Prior_boxes**
-
     Generate prior boxes for SSD(Single Shot MultiBox Detector)
     algorithm. The details of this algorithm, please refer the
-    section 2.2 of SSD paper (SSD: Single Shot MultiBox Detector)
+    section 2.2 of SSD paper `SSD: Single Shot MultiBox Detector
     <https://arxiv.org/abs/1512.02325>`_ .
 
     Args:
@@ -628,24 +827,27 @@ def multi_box_head(inputs,
        name(str): Name of the prior box layer. Default: None.
 
     Returns:
-        mbox_loc(Variable): The predicted boxes' location of the inputs.
-             The layout is [N, H*W*Priors, 4]. where Priors
-             is the number of predicted boxes each position of each input.
-        mbox_conf(Variable): The predicted boxes' confidence of the inputs.
-             The layout is [N, H*W*Priors, C]. where Priors
-             is the number of predicted boxes each position of each input
-             and C is the number of Classes.
-        boxes(Variable): the output prior boxes of PriorBox.
-             The layout is [num_priors, 4]. num_priors is the total
-             box count of each position of inputs.
-        Variances(Variable): the expanded variances of PriorBox.
-             The layout is [num_priors, 4]. num_priors is the total
-             box count of each position of inputs
+        tuple: A tuple with four Variables. (mbox_loc, mbox_conf, boxes, variances)
+
+        mbox_loc: The predicted boxes' location of the inputs. The layout
+        is [N, H*W*Priors, 4]. where Priors is the number of predicted
+        boxes each position of each input.
+
+        mbox_conf: The predicted boxes' confidence of the inputs. The layout
+        is [N, H*W*Priors, C]. where Priors is the number of predicted boxes
+        each position of each input and C is the number of Classes.
+
+        boxes: the output prior boxes of PriorBox. The layout is [num_priors, 4].
+        num_priors is the total box count of each position of inputs.
+
+        variances: the expanded variances of PriorBox. The layout is
+        [num_priors, 4]. num_priors is the total box count of each position of inputs
 
 
     Examples:
         .. code-block:: python
-          mbox_locs, mbox_confs, box, var = layers.multi_box_head(
+
+          mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head(
             inputs=[conv1, conv2, conv3, conv4, conv5, conv5],
             image=images,
             num_classes=21,
@@ -658,45 +860,6 @@ def multi_box_head(inputs,
             clip=True)
     """
 
-    def _prior_box_(input,
-                    image,
-                    min_sizes,
-                    max_sizes,
-                    aspect_ratios,
-                    variance,
-                    flip=False,
-                    clip=False,
-                    step_w=0.0,
-                    step_h=0.0,
-                    offset=0.5,
-                    name=None):
-        helper = LayerHelper("prior_box", **locals())
-        dtype = helper.input_dtype()
-
-        attrs = {
-            'min_sizes': min_sizes,
-            'aspect_ratios': aspect_ratios,
-            'variances': variance,
-            'flip': flip,
-            'clip': clip,
-            'step_w': step_w,
-            'step_h': step_h,
-            'offset': offset
-        }
-        if len(max_sizes) > 0 and max_sizes[0] > 0:
-            attrs['max_sizes'] = max_sizes
-
-        box = helper.create_tmp_variable(dtype)
-        var = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="prior_box",
-            inputs={"Input": input,
-                    "Image": image},
-            outputs={"Boxes": box,
-                     "Variances": var},
-            attrs=attrs, )
-        return box, var
-
     def _reshape_with_axis_(input, axis=1):
         if not (axis > 0 and axis < len(input.shape)):
             raise ValueError("The axis should be smaller than "
@@ -704,7 +867,7 @@ def multi_box_head(inputs,
         new_shape = [
             -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)])
         ]
-        out = ops.reshape(x=input, shape=new_shape)
+        out = nn.reshape(x=input, shape=new_shape)
         return out
 
     def _is_list_or_tuple_(data):
@@ -773,11 +936,10 @@ def multi_box_head(inputs,
             aspect_ratio = aspect_ratios[i]
             if not _is_list_or_tuple_(aspect_ratio):
                 aspect_ratio = [aspect_ratio]
+        step = [step_w[i] if step_w else 0.0, step_h[i] if step_w else 0.0]
 
-        box, var = _prior_box_(input, image, min_size, max_size, aspect_ratio,
-                               variance, flip, clip, step_w[i]
-                               if step_w else 0.0, step_h[i]
-                               if step_w else 0.0, offset)
+        box, var = prior_box(input, image, min_size, max_size, aspect_ratio,
+                             variance, flip, clip, step, offset)
 
         box_results.append(box)
         var_results.append(var)
@@ -798,7 +960,7 @@ def multi_box_head(inputs,
             mbox_loc.shape[0],
             mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
         ]
-        mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape)
+        mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape)
         mbox_locs.append(mbox_loc_flatten)
 
         # get conf
@@ -814,7 +976,7 @@ def multi_box_head(inputs,
             conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
             conf_loc.shape[3] / num_classes, num_classes
         ]
-        conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape)
+        conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape)
         mbox_confs.append(conf_loc_flatten)
 
     if len(box_results) == 1:
@@ -834,4 +996,98 @@ def multi_box_head(inputs,
         mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
         mbox_confs_concat = tensor.concat(mbox_confs, axis=1)
 
+    box.stop_gradient = True
+    var.stop_gradient = True
     return mbox_locs_concat, mbox_confs_concat, box, var
+
+
+def anchor_generator(input,
+                     anchor_sizes=None,
+                     aspect_ratios=None,
+                     variance=[0.1, 0.1, 0.2, 0.2],
+                     stride=None,
+                     offset=0.5,
+                     name=None):
+    """
+    **Anchor generator operator**
+
+    Generate anchors for Faster RCNN algorithm.
+    Each position of the input produce N anchors, N =
+    size(anchor_sizes) * size(aspect_ratios). The order of generated anchors
+    is firstly aspect_ratios loop then anchor_sizes loop.
+
+    Args:
+       input(Variable): The input feature map, the format is NCHW.
+       anchor_sizes(list|tuple|float): The anchor sizes of generated anchors,
+       given in absolute pixels e.g. [64., 128., 256., 512.].
+       For instance, the anchor size of 64 means the area of this anchor equals to 64**2.
+       aspect_ratios(list|tuple|float): The height / width ratios of generated
+            anchors, e.g. [0.5, 1.0, 2.0].
+       variance(list|tuple): The variances to be used in box regression deltas.
+            Default:[0.1, 0.1, 0.2, 0.2].
+       stride(list|turple): The anchors stride across width and height,
+            e.g. [16.0, 16.0]
+       offset(float): Prior boxes center offset. Default: 0.5
+       name(str): Name of the prior box op. Default: None.
+
+    Returns:
+        Anchors(Variable):  The output anchors with a layout of [H, W, num_anchors, 4].
+              H is the height of input, W is the width of input,
+              num_anchors is the box count of each position.
+              Each anchor is in (xmin, ymin, xmax, ymax) format an unnormalized.
+        Variances(Variable): The expanded variances of anchors
+              with a layout of [H, W, num_priors, 4].
+              H is the height of input, W is the width of input
+              num_anchors is the box count of each position.
+              Each variance is in (xcenter, ycenter, w, h) format.
+
+
+    Examples:
+
+        .. code-block:: python
+
+            anchor, var = anchor_generator(
+                input=conv1,
+                anchor_sizes=[64, 128, 256, 512],
+                aspect_ratios=[0.5, 1.0, 2.0],
+                variance=[0.1, 0.1, 0.2, 0.2],
+                stride=[16.0, 16.0],
+                offset=0.5)
+    """
+    helper = LayerHelper("anchor_generator", **locals())
+    dtype = helper.input_dtype()
+
+    def _is_list_or_tuple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    if not _is_list_or_tuple_(anchor_sizes):
+        anchor_sizes = [anchor_sizes]
+    if not _is_list_or_tuple_(aspect_ratios):
+        aspect_ratios = [aspect_ratios]
+    if not (_is_list_or_tuple_(stride) and len(stride) == 2):
+        raise ValueError('stride should be a list or tuple ',
+                         'with length 2, (stride_width, stride_height).')
+
+    anchor_sizes = list(map(float, anchor_sizes))
+    aspect_ratios = list(map(float, aspect_ratios))
+    stride = list(map(float, stride))
+
+    attrs = {
+        'anchor_sizes': anchor_sizes,
+        'aspect_ratios': aspect_ratios,
+        'variances': variance,
+        'stride': stride,
+        'offset': offset
+    }
+
+    anchor = helper.create_tmp_variable(dtype)
+    var = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="anchor_generator",
+        inputs={"Input": input},
+        outputs={"Anchors": anchor,
+                 "Variances": var},
+        attrs=attrs, )
+    anchor.stop_gradient = True
+    var.stop_gradient = True
+    return anchor, var
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index f1b2af70205ab40f08c11061a683b567f5bcbb7b..f33ae76aea95ceeca73c5bae6e4e490cdff29bf3 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -11,17 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib
 
 from .. import core
-from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program
+from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
 from ..unique_name import generate as unique_name
 from control_flow import BlockGuard
 from ..layer_helper import LayerHelper
 from ..executor import global_scope
+from layer_function_generator import generate_layer_fn, templatedoc
 
 __all__ = [
-    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
-    'read_file'
+    'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv',
+    'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
+    'double_buffer', 'random_data_generator', 'Preprocessor', 'load'
 ]
 
 
@@ -50,8 +53,6 @@ def data(name,
        dtype(int|float): The type of data : float32, float_16, int etc
        type(VarType): The output type. By default it is LOD_TENSOR.
        lod_level(int): The LoD Level. 0 means the input data is not a sequence.
-       main_program(Program): Name of the main program that calls this
-       startup_program(Program): Name of the startup program
        stop_gradient(bool): A boolean that mentions whether gradient should flow.
 
     Returns:
@@ -74,13 +75,15 @@ def data(name,
     if append_batch_size:
         shape = [-1] + shape  # append batch size as -1
 
-    return helper.create_global_variable(
+    data_var = helper.create_global_variable(
         name=name,
         shape=shape,
         dtype=dtype,
         type=type,
         stop_gradient=stop_gradient,
-        lod_level=lod_level)
+        lod_level=lod_level,
+        is_data=True)
+    return data_var
 
 
 class BlockGuardServ(BlockGuard):
@@ -106,15 +109,40 @@ class BlockGuardServ(BlockGuard):
 
 class ListenAndServ(object):
     """
-    ListenAndServ class.
+    **ListenAndServ Layer**
 
-    ListenAndServ class is used to wrap listen_and_serv op to create a server
-    which can receive variables from clients and run a block.
+    ListenAndServ is used to create a rpc server bind and listen
+    on specific TCP port, this server will run the sub-block when
+    received variables from clients.
+
+    Args:
+        endpoint(string): IP:port string which the server will listen on.
+        inputs(list): a list of variables that the server will get from clients.
+        fan_in(int): how many client are expected to report to this server, default: 1.
+        optimizer_mode(bool): whether to run the server as a parameter server, default: True.
+
+    Examples:
+        .. code-block:: python
+
+            with fluid.program_guard(main):
+                serv = layers.ListenAndServ(
+                    "127.0.0.1:6170", ["X"], optimizer_mode=False)
+                with serv.do():
+                    x = layers.data(
+                        shape=[32, 32],
+                        dtype='float32',
+                        name="X",
+                        append_batch_size=False)
+                    fluid.initializer.Constant(value=1.0)(x, main.global_block())
+                    layers.scale(x=x, scale=10.0, out=out_var)
+
+            exe = fluid.Executor(place)
+            exe.run(main)
     """
 
-    def __init__(self, endpoint, fan_in=1, optimizer_mode=True):
+    def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True):
         self.helper = LayerHelper("listen_and_serv")
-        self.inputs = []
+        self.inputs = inputs
         self.outputs = []
         self.endpoint = endpoint
         self.fan_in = fan_in
@@ -159,64 +187,66 @@ class ListenAndServ(object):
         current_block = main_program.current_block()
         parent_block = self.parent_block()
 
-        params, grads = self.get_params_and_grads()
-        param_names = [p.name for p in params]
-        grad_names = [g.name for g in grads]
         parent_block.append_op(
             type='listen_and_serv',
-            inputs={},
+            inputs={"X": self.inputs},
             outputs={},
             attrs={
                 'endpoint': self.endpoint,
                 'Fanin': self.fan_in,
-                'ParamList': param_names,
-                'GradList': grad_names,
-                'OptimizeBlock': current_block
+                'optimize_blocks': [
+                    current_block
+                ],  # did not support multiple optimize blocks in layers
+                'sync_mode': True,  # did not support async now in layers
+                'grad_to_block_id': [""]
             })
 
 
-def Send(endpoints, send_vars, get_vars):
+def Send(endpoints, send_vars, sync=True):
     """
-    Send layer
+    Send variables to the server side, and get vars from server
+    side when server have finished running server side program.
 
     Args:
-        endpoints: comma seperated IP:PORT pairs in the order
+        endpoints (str): comma seperated IP:PORT pairs in the order
                    of send_vars to send
-        send_vars: vars to send
-        get_vars: vars to get from server after send completes.
+        send_vars (list): variables to send to server
+        sync (bool): whether to wait the request finish
 
-    Send variables to the server side, and get vars from server
-    side when server have finished running server side program.
     """
     assert (type(send_vars) == list)
-    assert (type(get_vars) == list)
 
     epmap = endpoints.split(",")
     endpoints = list(set(epmap))
 
     helper = LayerHelper("Send", **locals())
+    rpc_op_role_name = core.op_proto_and_checker_maker.kOpRoleAttrName()
+
     helper.append_op(
         type="send",
         inputs={"X": send_vars},
-        outputs={"Out": get_vars},
-        attrs={"endpoints": endpoints,
-               "epmap": epmap})
+        attrs={
+            "endpoints": endpoints,
+            "epmap": epmap,
+            rpc_op_role_name: core.op_proto_and_checker_maker.OpRole.RPC
+        })
+    if sync:
+        helper.append_op(type="send_barrier", attrs={"endpoints": endpoints})
 
 
-def Recv(endpoints, get_vars):
+def Recv(endpoints, get_vars, sync=True):
     """
-    Recv layer
+    Receive variables from server side
 
     Args:
-        endpoints: comma seperated IP:PORT pairs in the order
+        endpoints (str): comma seperated IP:PORT pairs in the order
                    of send_vars to send
-        send_vars: vars to send
-        get_vars: vars to get from server after send completes.
+        get_vars (list): vars to get from server after send completes.
+        sync (bool): whether to wait the request finish
 
-    Send variables to the server side, and get vars from server
-    side when server have finished running server side program.
+    Returns:
+        list: list of received variables
     """
-    assert (type(send_vars) == list)
     assert (type(get_vars) == list)
 
     epmap = endpoints.split(",")
@@ -229,6 +259,9 @@ def Recv(endpoints, get_vars):
         outputs={"Out": get_vars},
         attrs={"endpoints": endpoints,
                "epmap": epmap})
+    if sync:
+        helper.append_op(type="fetch_barrier", attrs={"endpoints": endpoints})
+    return get_vars
 
 
 def monkey_patch_reader_methods(reader):
@@ -237,14 +270,12 @@ def monkey_patch_reader_methods(reader):
         var = scope.find_var(reader.name)
         return var.get_reader()
 
-    def eof():
-        return not __get_reader__().has_next()
-
     def reset():
         return __get_reader__().reset()
 
-    reader.eof = eof
     reader.reset = reset
+    reader.stop_gradient = True
+    reader.persistable = True
     return reader
 
 
@@ -253,10 +284,67 @@ def _copy_reader_var_(block, var):
     new_var.desc.set_shapes(var.desc.shapes())
     new_var.desc.set_dtypes(var.desc.dtypes())
     new_var.persistable = True
-    return monkey_patch_reader_methods(new_var)
+    return new_var
+
+
+def _copy_reader_create_op_(block, op):
+    input_param_names = op.input_names
+    new_input_map = {}
+    for param_name in input_param_names:
+        new_input_map[param_name] = []
+        arg_names = op.input(param_name)
+        for arg_name in arg_names:
+            new_input_map[param_name].append(block.var(arg_name))
+
+    output_param_names = op.output_names
+    new_output_map = {}
+    for param_name in output_param_names:
+        new_output_map[param_name] = []
+        arg_names = op.output(param_name)
+        for arg_name in arg_names:
+            new_output_map[param_name].append(block.var(arg_name))
+
+    new_op = block.append_op(
+        type=op.type,
+        inputs=new_input_map,
+        outputs=new_output_map,
+        attrs=op.all_attrs())
+    return new_op
+
+
+@templatedoc(op_type='create_recordio_file_reader')
+def open_recordio_file(filename,
+                       shapes,
+                       lod_levels,
+                       dtypes,
+                       pass_num=1,
+                       for_parallel=True):
+    """
+    ${comment}
+
+    Args:
+       filename(${filename_type}): ${filename_comment}.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(${lod_levels_type}): ${lod_levels_comment}.
+       dtypes(list): List of strs which declaring data type.
+       pass_num(int): Number of passes to run.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
 
+    Returns:
+       ${out_comment}.
+
+    Examples:
 
-def open_recordio_file(filename, shapes, lod_levels, dtypes):
+        >>> import paddle.fluid as fluid
+        >>> reader = fluid.layers.io.open_recordio_file(
+        >>>                               filename='./data.recordio',
+        >>>                               shapes=[(3,224,224), (1)],
+        >>>                               lod_levels=[0, 0],
+        >>>                               dtypes=['float32', 'int64'])
+        >>> # Via the reader, we can use 'read_file' layer to get data:
+        >>> image, label = fluid.layers.io.read_file(reader)
+    """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
     ranks = []
@@ -281,20 +369,450 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
 
     startup_var.desc.set_dtypes(dtypes)
     startup_var.persistable = True
-    return _copy_reader_var_(default_main_program().current_block(),
-                             startup_var)
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+
+    if pass_num > 1:
+        main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
+
+    if for_parallel:
+        main_prog_var = parallel(reader=main_prog_var)
+
+    return monkey_patch_reader_methods(main_prog_var)
+
+
+def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
+    """
+    Create a uniform random data generator
+
+    This layer returns a Reader Variable.
+    Instead of opening a file and reading data from it, this 
+    Reader Variable generates float uniform random data by itself. 
+    It can be used as a dummy reader to test a network without 
+    opening a real file.
+
+    Args:
+       low(float): The lower bound of data's uniform distribution.
+       high(float): The upper bound of data's uniform distribution.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
+
+    Returns:
+       Variable: A Reader Variable from which we can get random data.
+
+    Examples:
+
+        .. code-block:: python
+
+            reader = fluid.layers.random_data_generator(
+                                             low=0.0,
+                                             high=1.0,
+                                             shapes=[[3,224,224], [1]],
+                                             lod_levels=[0, 0])
+            # Via the reader, we can use 'read_file' layer to get data:
+            image, label = fluid.layers.read_file(reader)
+    """
+    dtypes = [core.VarDesc.VarType.FP32] * len(shapes)
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    var_name = unique_name('random_data_generator')
+
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=var_name)
+    startup_blk.append_op(
+        type='create_random_data_generator',
+        outputs={'Out': [startup_var]},
+        attrs={
+            'low': low,
+            'high': high,
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'ranks': ranks
+        })
+
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+
+    if for_parallel:
+        main_prog_var = parallel(reader=main_prog_var)
+
+    return monkey_patch_reader_methods(main_prog_var)
+
+
+def open_files(filenames,
+               shapes,
+               lod_levels,
+               dtypes,
+               thread_num=1,
+               buffer_size=None,
+               pass_num=1,
+               for_parallel=True):
+    """
+    Open files
+
+    This layer takes a list of files to read from and returns a Reader Variable. 
+    Via the Reader Variable, we can get data from given files. All files must 
+    have name suffixs to indicate their formats, e.g., '*.recordio'. 
+
+    Args:
+       filenames(list): The list of file names.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       dtypes(list): List of strs which declaring data type.
+       thread_num(int): The maximal concurrent prefetch thread number.
+       buffer_size(int|None): The size of prefetch buffer. If it is setted None, 
+            buffer size will be thread_num * 3.
+            Default: None
+       pass_num(int): Number of passes to run.
+       for_parallel(Bool): Set it as True if you are going to run 
+            subsequent operators in parallel.
+            Default: True
+
+    Returns:
+       Variable: A Reader Variable via which we can get file data.
+
+    Examples:
+       .. code-block:: python
+
+         reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                     './data2.recordio'],
+                                             shapes=[(3,224,224), (1)],
+                                             lod_levels=[0, 0],
+                                             dtypes=['float32', 'int64'],
+                                             thread_num=2,
+                                             buffer_size=2)
+
+         # Via the reader, we can use 'read_file' layer to get data:
+         image, label = fluid.layers.io.read_file(reader)
+    """
+    if buffer_size is None:
+        buffer_size = thread_num * 3
+    if isinstance(filenames, basestring):
+        filenames = [filenames]
+    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
+    shape_concat = []
+    ranks = []
+
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    multi_file_reader_name = unique_name('multi_file_reader')
+    startup_blk = default_startup_program().current_block()
+    startup_reader = startup_blk.create_var(name=multi_file_reader_name)
+    startup_blk.append_op(
+        type='open_files',
+        outputs={'Out': [startup_reader]},
+        attrs={
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'ranks': ranks,
+            'file_names': filenames,
+            'thread_num': thread_num,
+            'buffer_size': buffer_size
+        })
+
+    startup_reader.desc.set_dtypes(dtypes)
+    startup_reader.persistable = True
+    main_prog_reader = _copy_reader_var_(default_main_program().current_block(),
+                                         startup_reader)
+    if pass_num > 1:
+        main_prog_reader = multi_pass(
+            reader=main_prog_reader, pass_num=pass_num)
+
+    if for_parallel:
+        main_prog_reader = parallel(reader=main_prog_reader)
+
+    return monkey_patch_reader_methods(main_prog_reader)
+
+
+def __create_shared_decorated_reader__(op_type, reader, attrs):
+    var_name = unique_name(op_type)
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=var_name)
+    startop_op = startup_blk.append_op(
+        type=op_type,
+        inputs={'UnderlyingReader': reader},
+        outputs={'Out': [startup_var]},
+        attrs=attrs)
+    startup_var.persistable = True
+    main_prog_block = default_main_program().current_block()
+    main_prog_var = _copy_reader_var_(main_prog_block, startup_var)
+    _copy_reader_create_op_(main_prog_block, startop_op)
+    return monkey_patch_reader_methods(main_prog_var)
+
+
+def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
+    new_reader_name = name if name is not None else unique_name(op_type)
+    main_blk = default_main_program().current_block()
+    new_reader = main_blk.create_var(name=new_reader_name)
+    main_blk.append_op(
+        type=op_type,
+        inputs={'UnderlyingReader': reader},
+        outputs={'Out': [new_reader]},
+        attrs=attrs)
+    return monkey_patch_reader_methods(new_reader)
+
+
+def shuffle(reader, buffer_size):
+    """
+    Shuffle the reader.
+    """
+    return __create_unshared_decorated_reader__(
+        'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
+
+
+def batch(reader, batch_size):
+    """
+    This layer is a reader decorator. It takes a reader and adds 
+    'batching' decoration on it. When reading with the result 
+    decorated reader, output data will be automatically organized 
+    to the form of batches.
+
+    Args:
+        reader(Variable): The reader to be decorated with 'batching'.
+        batch_size(int): The batch size.
+
+    Returns:
+        Variable: The reader which has been decorated with 'batching'.
+
+    Examples:
+        .. code-block:: python
+
+            raw_reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                           './data2.recordio'],
+                                                    shapes=[(3,224,224), (1)],
+                                                    lod_levels=[0, 0],
+                                                    dtypes=['float32', 'int64'],
+                                                    thread_num=2,
+                                                    buffer_size=2)
+            batch_reader = fluid.layers.batch(reader=raw_reader, batch_size=5)
+
+            # If we read data with the raw_reader:
+            #     data = fluid.layers.read_file(raw_reader)
+            # We can only get data instance by instance.
+            # 
+            # However, if we read data with the batch_reader:
+            #     data = fluid.layers.read_file(batch_reader)
+            # Each 5 adjacent instances will be automatically combined together 
+            # to become a batch. So what we get('data') is a batch data instead 
+            # of an instance.
+    """
+    return __create_unshared_decorated_reader__(
+        'create_batch_reader', reader, {'batch_size': int(batch_size)})
+
+
+def double_buffer(reader, place=None, name=None):
+    """
+    Wrap a double buffer reader. The data will copy to target place with a
+    double buffer queue. If the target place is None, the place that executor
+    perform on will be used.
+
+    Args:
+        reader(Variable): the reader variable need to be wrapped.
+        place(Place): the place of target data. Default is the sample place of
+            executor perform.
+
+        name(str): Variable name. None if the user does not care.
+
+    Returns:
+        wrapped reader with double buffer.
+
+    Examples:
+
+        >>> reader = fluid.layers.open_files(filenames=['somefile'],
+        >>>                                  shapes=[[-1, 784], [-1, 1]],
+        >>>                                  dtypes=['float32', 'int64'])
+        >>> reader = fluid.layers.double_buffer(reader)
+        >>> img, label = fluid.layers.read_file(reader)
+    """
+    attrs = dict()
+    if place is not None:
+        attrs['place'] = str(place).upper()
+    return __create_unshared_decorated_reader__(
+        'create_double_buffer_reader', reader, attrs, name=name)
+
+
+def multi_pass(reader, pass_num):
+    return __create_shared_decorated_reader__(
+        'create_multi_pass_reader', reader, {'pass_num': int(pass_num)})
+
+
+def parallel(reader):
+    return __create_shared_decorated_reader__('create_threaded_reader', reader,
+                                              {})
+
+
+def read_file(reader):
+    """
+    Execute the given reader and get data via it.
+
+    A reader is also a Variable. It can be a raw reader generated by 
+    `fluid.layers.open_files()` or a decorated one generated by 
+    `fluid.layers.double_buffer()` and so on.
 
+    Args:
+
+        reader(Variable): The reader to execute.
+
+    Returns:
+        Tuple[Variable]: Data read via the given reader.
 
-def read_file(file_obj):
+    Examples:
+        .. code-block:: python
+
+           data_file = fluid.layers.open_files(
+                filenames=['mnist.recordio'],
+                shapes=[(-1, 748), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=["float32", "int64"])
+            data_file = fluid.layers.double_buffer(
+                fluid.layers.batch(data_file, batch_size=64))
+            input, label = fluid.layers.read_file(data_file)
+    """
     helper = LayerHelper('read_file')
     out = [
         helper.create_tmp_variable(
             stop_gradient=True, dtype='float32')
-        for _ in range(len(file_obj.desc.shapes()))
+        for _ in range(len(reader.desc.shapes()))
     ]
     helper.append_op(
-        type='read', inputs={'Reader': [file_obj]}, outputs={'Out': out})
+        type='read', inputs={'Reader': [reader]}, outputs={'Out': out})
     if len(out) == 1:
         return out[0]
     else:
         return out
+
+
+class Preprocessor(object):
+    """
+    A block for data pre-processing in reader.
+
+    Args:
+        reader (Variable): A reader variable.
+        name (str, default None): The name of the reader.
+
+    Examples:
+          .. code-block:: python
+
+            preprocessor = fluid.layers.io.Preprocessor(reader=reader)
+            with preprocessor.block():
+                img, lbl = preprocessor.inputs()
+                img_out = img / 2
+                lbl_out = lbl + 1
+                preprocessor.outputs(img_out, lbl_out)
+
+            data_file = fluid.layers.io.double_buffer(preprocessor())
+
+    """
+    BEFORE_SUB_BLOCK = 0
+    IN_SUB_BLOCK = 1
+    AFTER_SUB_BLOCK = 2
+
+    def __init__(self, reader, name=None):
+        self.underlying_reader = reader
+        new_reader_name = name if name is not None else unique_name(
+            "create_custom_reader")
+        self.main_prog = default_main_program()
+        self.reader = self.main_prog.current_block().create_var(
+            name=new_reader_name)
+        self.sub_block = None
+        self.source_var_names = None
+        self.sink_var_names = None
+        self.status = Preprocessor.BEFORE_SUB_BLOCK
+
+    def is_completed(self):
+        return self.sub_block and self.source_var_names and self.sink_var_names
+
+    @contextlib.contextmanager
+    def block(self):
+        self.status = Preprocessor.IN_SUB_BLOCK
+        self.sub_block = self.main_prog.create_block()
+        yield
+        self.main_prog.rollback()
+        self.status = Preprocessor.AFTER_SUB_BLOCK
+        if not self.is_completed():
+            raise RuntimeError(
+                "The definition of preprocessor is incompleted! "
+                "Please make sure that you have set input and output "
+                "variables by invoking 'inputs' and 'outputs' in "
+                "Preprocessor's sub-block.")
+
+    def inputs(self):
+        if self.status != Preprocessor.IN_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor.inputs() can only be invoked inside the sub-block."
+            )
+
+        source_shapes = self.underlying_reader.desc.shapes()
+        source_dtypes = self.underlying_reader.desc.dtypes()
+        source_lod_levels = self.underlying_reader.desc.lod_levels()
+        self.source_var_names = [
+            unique_name("preprocessor_source")
+            for _ in xrange(len(source_shapes))
+        ]
+        source_vars = []
+        for var_name, shape, dtype, lod_level in zip(
+                self.source_var_names, source_shapes, source_dtypes,
+                source_lod_levels):
+            source_vars.append(self.main_prog.current_block().create_var(
+                name=var_name, shape=shape, dtype=dtype, lod_level=lod_level))
+        return source_vars
+
+    def outputs(self, *outs):
+        if self.status != Preprocessor.IN_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor.outputs() can only be invoked inside the sub-block."
+            )
+        self.sink_var_names = [var.name for var in outs]
+
+    def __call__(self, *args, **kwargs):
+        if self.status != Preprocessor.AFTER_SUB_BLOCK:
+            raise RuntimeError(
+                "Preprocessor output can only be retrieved after rnn block.")
+
+        self.main_prog.current_block().append_op(
+            type="create_custom_reader",
+            inputs={'UnderlyingReader': self.underlying_reader},
+            outputs={'Out': [self.reader]},
+            attrs={
+                "sub_block": self.sub_block,
+                "source_var_names": self.source_var_names,
+                "sink_var_names": self.sink_var_names
+            })
+        return monkey_patch_reader_methods(self.reader)
+
+
+@templatedoc()
+def load(out, file_path, load_as_fp16=None):
+    """
+    ${comment}
+
+    >>> import paddle.fluid as fluid
+    >>> tmp_tensor = fluid.layers.create_tensor(dtype='float32')
+    >>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin")
+
+    Args:
+        out(${out_type}): ${out_comment}.
+
+        file_path(${file_path_type}): ${file_path_comment}.
+
+        load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}.
+
+    Returns:
+        None
+    """
+    helper = LayerHelper("load", **locals())
+    attrs = {"file_path": file_path}
+    if load_as_fp16 is not None:
+        attrs['load_as_fp16'] = load_as_fp16
+    helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs)
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index bd79022a0c39cf18bd05d49ac62986d342a4ae06..3096389101a5e5b302c78145b8bc9f1d71f6b8cb 100644
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -15,19 +15,13 @@ import re
 import cStringIO
 import functools
 import warnings
+import string
 
-from .. import proto
-
-framework_pb2 = proto.framework_pb2
-
+from ..proto import framework_pb2
 from ..framework import OpProtoHolder, Variable
 from ..layer_helper import LayerHelper
 
-__all__ = [
-    'deprecated',
-    'generate_layer_fn',
-    'autodoc',
-]
+__all__ = ['deprecated', 'generate_layer_fn', 'autodoc', 'templatedoc']
 
 
 def _convert_(name):
@@ -46,6 +40,22 @@ def _convert_(name):
     return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
 
 
+def _type_to_str_(tp):
+    return framework_pb2.AttrType.Name(tp)
+
+
+_two_dollar_pattern_ = re.compile(r"\$\$([^\$]+)\$\$")
+_single_dollar_pattern_ = re.compile(r"\$([^\$]+)\$")
+_two_bang_pattern_ = re.compile(r"!!([^!]+)!!")
+
+
+def escape_math(text):
+    return _two_bang_pattern_.sub(
+        r'$$\1$$',
+        _single_dollar_pattern_.sub(r':math:`\1`',
+                                    _two_dollar_pattern_.sub(r"!!\1!!", text)))
+
+
 def _generate_doc_string_(op_proto):
     """
     Generate docstring by OpProto
@@ -57,34 +67,33 @@ def _generate_doc_string_(op_proto):
         str: the document string
     """
 
-    def _type_to_str_(tp):
-        return framework_pb2.AttrType.Name(tp)
-
     if not isinstance(op_proto, framework_pb2.OpProto):
         raise TypeError("OpProto should be `framework_pb2.OpProto`")
 
     buf = cStringIO.StringIO()
-    buf.write(op_proto.comment)
+    buf.write(escape_math(op_proto.comment))
     buf.write('\nArgs:\n')
     for each_input in op_proto.inputs:
         line_begin = '    {0}: '.format(_convert_(each_input.name))
         buf.write(line_begin)
-        buf.write(each_input.comment)
-        buf.write('\n')
-        buf.write(' ' * len(line_begin))
-        buf.write('Duplicable: ')
-        buf.write(str(each_input.duplicable))
-        buf.write('  Optional: ')
-        buf.write(str(each_input.dispensable))
+        buf.write(escape_math(each_input.comment))
+        if each_input.duplicable:
+            buf.write("  Duplicatable.")
+        if each_input.dispensable:
+            buf.write("  Optional.")
         buf.write('\n')
 
+    skip_attrs = OpProtoHolder.generated_op_attr_names()
+
     for each_attr in op_proto.attrs:
+        if each_attr.name in skip_attrs:
+            continue
         buf.write('    ')
         buf.write(each_attr.name)
         buf.write(' (')
         buf.write(_type_to_str_(each_attr.type))
         buf.write('): ')
-        buf.write(each_attr.comment)
+        buf.write(escape_math(each_attr.comment))
         buf.write('\n')
 
     if len(op_proto.outputs) != 0:
@@ -93,7 +102,7 @@ def _generate_doc_string_(op_proto):
         for each_opt in op_proto.outputs:
             if not each_opt.intermediate:
                 break
-        buf.write(each_opt.comment)
+        buf.write(escape_math(each_opt.comment))
 
     return buf.getvalue()
 
@@ -116,7 +125,7 @@ def generate_layer_fn(op_type):
 
     if len(not_intermediate_outputs) != 1:
         raise ValueError("Only one non intermediate output operator can be",
-                         "automatically generated.")
+                         "automatically generated. {0}".format(op_type))
 
     if not_intermediate_outputs[0].duplicable:
         raise ValueError(
@@ -223,3 +232,61 @@ def autodoc(comment=""):
         return func
 
     return __impl__
+
+
+def templatedoc(op_type=None):
+    """
+    Decorator of layer function. It will use the docstring from the layer
+    function as the template. The template arguments are:
+
+    * ${comment}: The operator comment written in CPP.
+    * ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput,
+        and AddInput. The ${name} is Python snake style. i.e., xxx_xxx.
+    * ${{name}_type}: The type of ${name}.
+
+    Returns:
+        Decorated function.
+    """
+
+    def trim_ending_dot(msg):
+        return msg.rstrip('.')
+
+    def __impl__(func):
+        if op_type is None:
+            op_type_name = func.__name__
+        else:
+            op_type_name = op_type
+        op_proto = OpProtoHolder.instance().get_op_proto(op_type_name)
+        tmpl = string.Template(func.__doc__)
+
+        comment_lines = op_proto.comment.split("\n")
+        comment = ""
+        for line in comment_lines:
+            line = line.strip()
+            if len(line) != 0:
+                comment += escape_math(line)
+                comment += " "
+            elif len(comment) != 0:
+                comment += "\n    \n    "
+
+        args = {"comment": trim_ending_dot(comment)}
+        for each_input in op_proto.inputs:
+            input_name = _convert_(each_input.name)
+            args["{0}_comment".format(input_name)] = trim_ending_dot(
+                each_input.comment)
+            args["{0}_type".format(input_name)] = "Variable"
+        for each_attr in op_proto.attrs:
+            input_name = _convert_(each_attr.name)
+            args["{0}_comment".format(input_name)] = trim_ending_dot(
+                each_attr.comment)
+            args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type)
+
+        for each_opt in op_proto.outputs:
+            output_name = _convert_(each_opt.name)
+            args["{0}_comment".format(output_name)] = trim_ending_dot(
+                each_opt.comment)
+            args["{0}_type".format(output_name)] = "Variable"
+        func.__doc__ = tmpl.substitute(args)
+        return func
+
+    return __impl__
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 65b95a58d6546ed6d6b264443a7c802e16eef23f..6071e3e74218e4db4cddc223818d3a9b7086fd86 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -11,51 +11,100 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+When training a model, it's often useful to decay the
+learning rate during training process, this is called
+learning_rate_decay. There are many strategies to do
+this, this module will provide some classical method.
+User can also implement their own learning_rate_decay
+strategy according to this module.
+"""
 
 import control_flow
 import nn
 import ops
 import tensor
 from ..initializer import init_on_cpu
+from ..framework import default_main_program, Parameter
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS'
 ]
-"""
-When training a model, it's often useful to decay the
-learning rate during training process, this is called
-learning_rate_decay. There are many strategies to do
-this, this module will provide some classical method.
-User can also implement their own learning_rate_decay
-strategy according to this module.
-"""
 
 
-def _decay_step_counter():
+def _decay_step_counter(begin=0):
     # the first global step is zero in learning rate decay
     global_step = nn.autoincreased_step_counter(
-        counter_name='@LR_DECAY_COUNTER@', begin=0, step=1)
+        counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
     global_step = tensor.cast(global_step, 'float32')
     return global_step
 
 
+def noam_decay(d_model, warmup_steps):
+    """
+    Noam decay method. The numpy implementation of noam decay as follows.
+
+    >>> import numpy as np
+    >>> lr_value = np.power(d_model, -0.5) * np.min([
+    >>>                         np.power(current_steps, -0.5),
+    >>>                         np.power(warmup_steps, -1.5) * current_steps])
+
+    Please reference `attention is all you need
+    <https://arxiv.org/pdf/1706.03762.pdf>`_.
+
+    Args:
+        d_model(Variable): The dimensionality of input and output of model.
+
+        warmup_steps(Variable): A super parameter.
+
+    Returns:
+        The decayed learning rate.
+    """
+    global_step = _decay_step_counter(1)
+    with init_on_cpu():
+        a = global_step**-0.5
+        b = (warmup_steps**-1.5) * global_step
+        lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
+
+    return lr_value
+
+
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """Applies exponential decay to the learning rate.
+    """
+    Applies exponential decay to the learning rate. 
+
+    When training a model, it is often recommended to lower the learning rate as the 
+    training progresses. By using this function, the learning rate will be decayed by 
+    'decay_rate' every 'decay_steps' steps.
+
+    >>> if staircase == True:
+    >>>     decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
+    >>> else:
+    >>>     decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
 
-    ```python
-    decayed_learning_rate = learning_rate *
-            decay_rate ^ (global_step / decay_steps)
-    ```
     Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        decay_steps: A Python `int32` number.
-        decay_rate: A Python `float` number.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+                            Default: False
 
     Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
+
+    Examples:
+        .. code-block:: python
+
+          base_lr = 0.1
+          sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.exponential_decay(
+                    learning_rate=base_lr,
+                    decay_steps=10000,
+                    decay_rate=0.5,
+                    staircase=True))
+          sgd_optimizer.minimize(avg_cost)
+
     """
     global_step = _decay_step_counter()
 
@@ -99,22 +148,39 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
 
 
 def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
-    """Applies inverse time decay to the initial learning rate.
+    """
+    Applies inverse time decay to the initial learning rate.
 
-    >>> if staircase:
+    When training a model, it is often recommended to lower the learning rate as the 
+    training progresses. By using this function, an inverse decay function will be 
+    applied to the initial learning rate.
+
+    >>> if staircase == True:
     >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
     >>> else:
     >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
 
     Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training.
-        decay_steps: A Python `int32` number.
-        decay_rate: A Python `float` number.
-        staircase: Boolean. If set true, decay the learning rate every decay_steps.
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+                            Default: False
 
     Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
+
+    Examples:
+        .. code-block:: python
+
+          base_lr = 0.1
+          sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.inverse_time_decay(
+                    learning_rate=base_lr,
+                    decay_steps=10000,
+                    decay_rate=0.5,
+                    staircase=True))
+          sgd_optimizer.minimize(avg_cost)
     """
     global_step = _decay_step_counter()
 
@@ -133,25 +199,28 @@ def polynomial_decay(learning_rate,
                      end_learning_rate=0.0001,
                      power=1.0,
                      cycle=False):
-    """Applies polynomial decay to the initial learning rate.
+    """
+    Applies polynomial decay to the initial learning rate.
+
+    .. code-block:: python
+
+     if cycle:
+       decay_steps = decay_steps * ceil(global_step / decay_steps)
+     else:
+       global_step = min(global_step, decay_steps)
+       decayed_learning_rate = (learning_rate - end_learning_rate) *
+            (1 - global_step / decay_steps) ^ power + end_learning_rate
 
-    >>> if cycle:
-    >>>     decay_steps = decay_steps * ceil(global_step / decay_steps)
-    >>> else:
-    >>>     global_step = min(global_step, decay_steps)
-    >>> decayed_learning_rate = (learning_rate - end_learning_rate) *
-    >>>                   (1 - global_step / decay_steps) ^ power +
-    >>>                   end_learning_rate
     Args:
-        learning_rate: A scalar float32 value or a Variable. This
-          will be the initial learning rate during training
-        decay_steps: A Python `int32` number.
-        end_learning_rate: A Python `float` number.
-        power: A Python `float` number
-        cycle: Boolean. If set true, decay the learning rate every decay_steps.
+        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
+          will be the initial learning rate during training.
+        decay_steps(int32): A Python `int32` number.
+        end_learning_rate(float): A Python `float` number.
+        power(float): A Python `float` number.
+        cycle(bool): If set true, decay the learning rate every decay_steps.
 
     Returns:
-        The decayed learning rate
+        Variable: The decayed learning rate
     """
     global_step = _decay_step_counter()
 
@@ -180,15 +249,27 @@ def polynomial_decay(learning_rate,
 def piecewise_decay(boundaries, values):
     """Applies piecewise decay to the initial learning rate.
 
-    >>> boundaries = [10000, 20000]
-    >>> values = [1.0, 0.5, 0.1]
-    >>>
-    >>> if step < 10000:
-    >>>     learning_rate = 1.0
-    >>> elif 10000 <= step < 20000:
-    >>>     learning_rate = 0.5
-    >>> else:
-    >>>     learning_rate = 0.1
+      The algorithm can be described as the code below.
+
+      .. code-block:: python
+
+        boundaries = [10000, 20000]
+        values = [1.0, 0.5, 0.1]
+        if step < 10000:
+            learning_rate = 1.0
+        elif 10000 <= step < 20000:
+            learning_rate = 0.5
+        else:
+            learning_rate = 0.1
+    Args:
+        boundaries: A list of steps numbers.
+        values: A list of learning rate values that will be picked during
+            different step boundaries.
+
+    Returns:
+        The decayed learning rate.
+
+
     """
 
     if len(values) - len(boundaries) != 1:
@@ -220,3 +301,41 @@ def piecewise_decay(boundaries, values):
                 tensor.assign(last_value_var, lr)
 
     return lr
+
+
+def append_LARS(params_grads, learning_rate, weight_decay):
+    """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
+       each layer.
+
+    ```python
+        learning_rate *= local_gw_ratio * sqrt(sumsq(param))
+                        / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
+    ```
+
+    Args:
+        learning_rate: A learning rate Variable. This
+          is the global learning rate for LARS.
+        weight_decay: A Python `float` number.
+
+    Returns:
+        The decayed learning rate
+    """
+
+    def _balanced_weight(param_norm, grad_norm):
+        if weight_decay == 1.0:
+            return grad_norm + param_norm
+        else:
+            return grad_norm + weight_decay * param_norm
+
+    for param, grad in params_grads:
+        param_lr = param.optimize_attr['learning_rate']
+        param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
+        grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
+        if type(param_lr) == float and param_lr == 1.0:
+            decayed_lr = learning_rate * param_norm \
+                         / _balanced_weight(param_norm, grad_norm)
+        else:
+            decayed_lr = learning_rate * param_lr * param_norm \
+                         / _balanced_weight(param_norm, grad_norm)
+        # set back param local learning rate
+        param.optimize_attr['learning_rate'] = decayed_lr
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index 08a0184c2c2ad5f3c3792fd0a12f0ab0c746849b..1754061c4ba6f5b97bced3548bc412dfb1b7932c 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -169,7 +169,9 @@ def monkey_patch_variable():
             # a*b == b*a. Do not need to reverse explicitly
         ("__rmul__", "elementwise_mul", False),
         ("__div__", "elementwise_div", False),
+        ("__truediv__", "elementwise_div", False),
         ("__rdiv__", "elementwise_div", True),
+        ("__rtruediv__", "elementwise_div", True),
         ("__pow__", "elementwise_pow", False),
         ("__rpow__", "elementwise_pow", True),
             # for logical compare
diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
deleted file mode 100644
index 3d9157ad4ef9381b70b4007c5bdca91f1482b427..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/layers/metric.py
+++ /dev/null
@@ -1,57 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-All layers just related to metric.
-"""
-
-from ..layer_helper import LayerHelper
-from ..initializer import Normal, Constant
-from ..framework import Variable
-from ..param_attr import ParamAttr
-
-__all__ = ['accuracy']
-
-
-def accuracy(input, label, k=1, correct=None, total=None):
-    """
-    This function computes the accuracy using the input and label.
-    The output is the top_k inputs and their indices.
-    """
-    helper = LayerHelper("accuracy", **locals())
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": k})
-    acc_out = helper.create_tmp_variable(dtype="float32")
-    if correct is None:
-        correct = helper.create_tmp_variable(dtype="int64")
-    if total is None:
-        total = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="accuracy",
-        inputs={
-            "Out": [topk_out],
-            "Indices": [topk_indices],
-            "Label": [label]
-        },
-        outputs={
-            "Accuracy": [acc_out],
-            "Correct": [correct],
-            "Total": [total],
-        })
-    return acc_out
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..99e82fdd04282177fae63f1fb94b5e32d41c612e
--- /dev/null
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -0,0 +1,138 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+All layers just related to metric.
+"""
+
+import warnings
+from ..layer_helper import LayerHelper
+from ..initializer import Normal, Constant
+from ..framework import Variable
+from ..param_attr import ParamAttr
+import nn
+
+__all__ = ['accuracy', 'auc']
+
+
+def accuracy(input, label, k=1, correct=None, total=None):
+    """
+    accuracy layer.
+    Refer to the https://en.wikipedia.org/wiki/Precision_and_recall
+
+    This function computes the accuracy using the input and label.
+    If the correct label occurs in top k predictions, then correct will increment by one.
+    Note: the dtype of accuracy is determined by input. the input and label dtype can be different.
+
+    Args:
+        input(Variable): The input of accuracy layer, which is the predictions of network.
+          Carry LoD information is supported.
+        label(Variable): The label of dataset.
+        k(int): The top k predictions for each class will be checked.
+        correct(Variable): The correct predictions count.
+        total(Variable): The total entries count.
+
+    Returns:
+        Variable: The correct rate.
+
+    Examples:
+        .. code-block:: python
+
+           data = fluid.layers.data(name="data", shape=[-1, 32, 32], dtype="float32")
+           label = fluid.layers.data(name="data", shape=[-1,1], dtype="int32")
+           predict = fluid.layers.fc(input=data, size=10)
+           acc = fluid.layers.accuracy(input=predict, label=label, k=5)
+
+    """
+    helper = LayerHelper("accuracy", **locals())
+    topk_out, topk_indices = nn.topk(input, k=k)
+    acc_out = helper.create_tmp_variable(dtype="float32")
+    if correct is None:
+        correct = helper.create_tmp_variable(dtype="int64")
+    if total is None:
+        total = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="accuracy",
+        inputs={
+            "Out": [topk_out],
+            "Indices": [topk_indices],
+            "Label": [label]
+        },
+        outputs={
+            "Accuracy": [acc_out],
+            "Correct": [correct],
+            "Total": [total],
+        })
+    return acc_out
+
+
+def auc(input, label, curve='ROC', num_thresholds=200):
+    """
+    **Area Under the Curve (AUC) Layer**
+
+    This implementation computes the AUC according to forward output and label.
+    It is used very widely in binary classification evaluation. 
+
+    Note: If input label contains values other than 0 and 1, it will be cast 
+    to `bool`. Find the relevant definitions `here <https://en.wikipedia.org\
+    /wiki/Receiver_operating_characteristic#Area_under_the_curve>`_.
+
+    There are two types of possible curves:
+
+        1. ROC: Receiver operating characteristic;
+        2. PR: Precision Recall
+
+    Args:
+        input(Variable): A floating-point 2D Variable, values are in the range 
+                         [0, 1]. Each row is sorted in descending order. This 
+                         input should be the output of topk. Typically, this 
+                         Variable indicates the probability of each label.
+        label(Variable): A 2D int Variable indicating the label of the training 
+                         data. The height is batch size and width is always 1.
+        curve(str): Curve type, can be 'ROC' or 'PR'. Default 'ROC'.
+        num_thresholds(int): The number of thresholds to use when discretizing 
+                             the roc curve. Default 200.
+
+    Returns:
+        Variable: A scalar representing the current AUC.
+
+    Examples:
+        .. code-block:: python
+        
+            # network is a binary classification model and label the ground truth
+            prediction = network(image, is_infer=True)
+            auc_out=fluid.layers.auc(input=prediction, label=label)
+    """
+
+    warnings.warn(
+        "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
+        but can not aggregate them and get the pass AUC, because pass \
+        auc can not be averaged with weighted from the minibatch auc value. \
+        Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
+        which can get every minibatch and every pass auc value.", Warning)
+    helper = LayerHelper("auc", **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype="int64")
+    topk_out, topk_indices = nn.topk(input, k=k)
+    auc_out = helper.create_tmp_variable(dtype="float32")
+    helper.append_op(
+        type="auc",
+        inputs={
+            "Out": [topk_out],
+            "Indices": [topk_indices],
+            "Label": [label]
+        },
+        attrs={"curve": curve,
+               "num_thresholds": num_thresholds},
+        outputs={"AUC": [auc_out], })
+    return auc_out
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index bc2be4cdfebe48457c679cec5ada1b29aac9d821..bcf520d5a4e3bbe1d949d08f42199dd8c5cdc947 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -19,9 +19,11 @@ from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
-from layer_function_generator import autodoc
+from layer_function_generator import autodoc, templatedoc
 from tensor import concat
 import utils
+import random
+from .. import unique_name
 
 __all__ = [
     'fc',
@@ -38,17 +40,23 @@ __all__ = [
     'chunk_eval',
     'sequence_conv',
     'conv2d',
+    'conv3d',
     'sequence_pool',
+    'sequence_softmax',
+    'softmax',
     'pool2d',
+    'pool3d',
     'batch_norm',
     'beam_search_decode',
     'conv2d_transpose',
+    'conv3d_transpose',
     'sequence_expand',
     'lstm_unit',
     'reduce_sum',
     'reduce_mean',
     'reduce_max',
     'reduce_min',
+    'reduce_prod',
     'sequence_first_step',
     'sequence_last_step',
     'dropout',
@@ -57,6 +65,7 @@ __all__ = [
     'edit_distance',
     'l2_normalize',
     'matmul',
+    'topk',
     'warpctc',
     'sequence_reshape',
     'transpose',
@@ -70,6 +79,22 @@ __all__ = [
     'smooth_l1',
     'one_hot',
     'autoincreased_step_counter',
+    'reshape',
+    'lod_reset',
+    'lrn',
+    'pad',
+    'label_smooth',
+    'roi_pool',
+    'dice_loss',
+    'image_resize',
+    'image_resize_short',
+    'resize_bilinear',
+    'gather',
+    'random_crop',
+    'mean_iou',
+    'relu',
+    'log',
+    'crop',
 ]
 
 
@@ -78,19 +103,22 @@ def fc(input,
        num_flatten_dims=1,
        param_attr=None,
        bias_attr=None,
+       use_mkldnn=False,
        act=None,
+       is_test=False,
        name=None):
     """
     **Fully Connected Layer**
 
-    The fully connected layer can take multiple tensors as its inputs. It
-    creates a variable called weights for each input tensor, which represents
-    a fully connected weight matrix from each input unit to each output unit.
-    The fully connected layer multiplies each input tensor with its coresponding
-    weight to produce an output Tensor. If multiple input tensors are given,
-    the results of multiple multiplications will be sumed up. If bias_attr is
-    not None, a bias variable will be created and added to the output. Finally,
-    if activation is not None, it will be applied to the output as well.
+    This function creates a fully connected layer in the network. It can take
+    multiple tensors as its inputs. It creates a variable called weights for
+    each input tensor, which represents a fully connected weight matrix from
+    each input unit to each output unit. The fully connected layer multiplies
+    each input tensor with its coresponding weight to produce an output Tensor.
+    If multiple input tensors are given, the results of multiple multiplications
+    will be sumed up. If bias_attr is not None, a bias variable will be created
+    and added to the output. Finally, if activation is not None, it will be applied
+    to the output as well.
 
     This process can be formulated as follows:
 
@@ -125,10 +153,13 @@ def fc(input,
         bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
             of this layer. If it is set to None, no bias will be added to the output units.
         act (str, default None): Activation to be applied to the output of this layer.
+        is_test(bool): A flag indicating whether execution is in test phase.
+        use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
+            library is installed. Default: False
         name (str, default None): The name of this layer.
 
     Returns:
-        A tensor variable storing the transformation result.
+        Variable: The transformation result.
 
     Raises:
         ValueError: If rank of the input tensor is less than 2.
@@ -163,13 +194,15 @@ def fc(input,
                    "y_num_col_dims": 1})
         mul_results.append(tmp)
 
-    # sum
     if len(mul_results) == 1:
         pre_bias = mul_results[0]
     else:
         pre_bias = helper.create_tmp_variable(dtype)
         helper.append_op(
-            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
+            type="sum",
+            inputs={"X": mul_results},
+            outputs={"Out": pre_bias},
+            attrs={"use_mkldnn": use_mkldnn})
     # add bias
     pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)
     # add activation
@@ -179,6 +212,7 @@ def fc(input,
 def embedding(input,
               size,
               is_sparse=False,
+              is_distributed=False,
               padding_idx=None,
               param_attr=None,
               dtype='float32'):
@@ -198,10 +232,11 @@ def embedding(input,
             have two elements which indicate the size of the dictionary of
             embeddings and the size of each embedding vector respectively.
         is_sparse(bool): The flag indicating whether to use sparse update.
+        is_distributed(bool): Whether to run lookup table from remote parameter server.
         padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
             Otherwise the given :attr:`padding_idx` indicates padding the output
             with zeros whenever lookup encounters it in :attr:`input`. If
-            :math:`padding_idx < 0`, the padding_idx to use in lookup is
+            :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is
             :math:`size[0] + dim`.
         param_attr(ParamAttr): Parameters for this layer
         dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc
@@ -229,14 +264,19 @@ def embedding(input,
         inputs={'Ids': input,
                 'W': w},
         outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse,
-               'padding_idx': padding_idx})
+        attrs={
+            'is_sparse': is_sparse,
+            'is_distributed': is_distributed,
+            'padding_idx': padding_idx
+        })
     return tmp
 
 
-# TODO(qijun): expose H0 and C0
+@templatedoc(op_type="lstm")
 def dynamic_lstm(input,
                  size,
+                 h_0=None,
+                 c_0=None,
                  param_attr=None,
                  bias_attr=None,
                  use_peepholes=True,
@@ -247,56 +287,18 @@ def dynamic_lstm(input,
                  dtype='float32',
                  name=None):
     """
-    **Dynamic LSTM Layer**
-
-    The defalut implementation is diagonal/peephole connection
-    (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
-
-    .. math::
-
-        i_t & = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i)
-
-        f_t & = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f)
-
-        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
-
-        o_t & = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o)
-
-        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-
-        h_t & = o_t \odot act_h(c_t)
-
-    where the :math:`W` terms denote weight matrices (e.g. :math:`W_{xi}` is
-    the matrix of weights from the input gate to the input), :math:`W_{ic}, \
-    W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
-    our implementation, we use vectors to reprenset these diagonal weight
-    matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
-    gate bias vector), :math:`\sigma` is the non-linear activations, such as
-    logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
-    gate, forget gate, output gate, and cell activation vectors, respectively,
-    all of which have the same size as the cell output activation vector :math:`h`.
-
-    The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
-    and :math:`act_h` are the cell input and cell output activation functions
-    and `tanh` is usually used for them. :math:`\\tilde{c_t}` is also called
-    candidate hidden state, which is computed based on the current input and
-    the previous hidden state.
-
-    Set `use_peepholes` to `False` to disable peephole connection. The formula
-    is omitted here, please refer to the paper
-    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-
-    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
-    operations on the input :math:`x_{t}` are NOT included in this operator.
-    Users can choose to use fully-connect layer before LSTM layer.
+    ${comment}
 
     Args:
-        input(Variable): The input of dynamic_lstm layer, which supports
-                         variable-time length input sequence. The underlying
-                         tensor in this Variable is a matrix with shape
-                         (T X 4D), where T is the total time steps in this
-                         mini-batch, D is the hidden size.
-        size(int): 4 * hidden size.
+        input (Variable): ${input_comment}
+        size (int): 4 * hidden size.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the hidden size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
+
         param_attr(ParamAttr|None): The parameter attribute for the learnable
                                hidden-hidden weights.
 
@@ -304,32 +306,26 @@ def dynamic_lstm(input,
                                                 W_{fh}, W_{oh}`}
                                - The shape is (D x 4D), where D is the hidden
                                  size.
-        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
+        bias_attr (ParamAttr|None): The bias attribute for the learnable bias
                               weights, which contains two parts, input-hidden
                               bias weights and peephole connections weights if
                               setting `use_peepholes` to `True`.
 
                               1. `use_peepholes = False`
-                                - Biases = {:math:`b_c, b_i, b_f, b_o`}.
-                                - The shape is (1 x 4D).
+                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                 - The shape is (1 x 4D).
                               2. `use_peepholes = True`
-                                - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                  W_{fc}, W_{oc}`}.
-                                - The shape is (1 x 7D).
-        use_peepholes(bool): Whether to enable diagonal/peephole connections,
-                             default `True`.
-        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
-        gate_activation(str): The activation for input gate, forget gate and
-                              output gate. Choices = ["sigmoid", "tanh", "relu",
-                              "identity"], default "sigmoid".
-        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
-                              "tanh", "relu", "identity"], default "tanh".
-        candidate_activation(str): The activation for candidate hidden state.
-                              Choices = ["sigmoid", "tanh", "relu", "identity"],
-                              default "tanh".
-        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
-        name(str|None): A name for this layer(optional). If set None, the layer
-                        will be named automatically.
+                                 - The shape is (1 x 7D).
+        use_peepholes (bool): ${use_peepholes_comment}
+        is_reverse (bool): ${is_reverse_comment}
+        gate_activation (str): ${gate_activation_comment}
+        cell_activation (str): ${cell_activation_comment}
+        candidate_activation (str): ${candidate_activation_comment}
+        dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
+        name (str|None): A name for this layer(optional). If set None, the layer
+                         will be named automatically.
 
     Returns:
         tuple: The hidden state, and cell state of LSTM. The shape of both \
@@ -359,12 +355,20 @@ def dynamic_lstm(input,
     cell = helper.create_tmp_variable(dtype)
     batch_gate = helper.create_tmp_variable(dtype)
     batch_cell_pre_act = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, size), \
+            'The shape of h0 should be (batch_size, %d)' % size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
 
     helper.append_op(
         type='lstm',
-        inputs={'Input': input,
-                'Weight': weight,
-                'Bias': bias},
+        inputs=inputs,
         outputs={
             'Hidden': hidden,
             'Cell': cell,
@@ -502,15 +506,21 @@ def dynamic_lstmp(input,
                         will be named automatically.
 
     Returns:
-        tuple: The projection of hidden state, and cell state of LSTMP. The \
-               shape of projection is (T x P), for the cell state which is \
-               (T x D), and both LoD is the same with the `input`.
+        tuple: A tuple of two output variable: the projection of hidden state, \
+               and cell state of LSTMP. The shape of projection is (T x P), \
+               for the cell state which is (T x D), and both LoD is the same \
+               with the `input`.
 
     Examples:
+
         .. code-block:: python
 
+            dict_dim, emb_dim = 128, 64
+            data = fluid.layers.data(name='sequence', shape=[1],
+                                     dtype='int32', lod_level=1)
+            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim, proj_dim = 512, 256
-            fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+            fc_out = fluid.layers.fc(input=emb, size=hidden_dim * 4,
                                      act=None, bias_attr=None)
             proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
                                                      size=hidden_dim * 4,
@@ -576,10 +586,10 @@ def dynamic_gru(input,
                 candidate_activation='tanh',
                 h_0=None):
     """
-    **Dynamic GRU Layer**
+    **Gated Recurrent Unit (GRU) Layer**
 
     Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
-    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_
+    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_ .
 
     The formula is as follows:
 
@@ -624,18 +634,27 @@ def dynamic_gru(input,
             :attr:`False`.
         gate_activation(str): The activation for update gate and reset gate.
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
-        activation(str): The activation for candidate hidden state.
+        candidate_activation(str): The activation for candidate hidden state.
             Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
+        h_0 (Variable): This is initial hidden state. If not set, default is
+            zero. This is a tensor with shape (N x D), where N is the number of
+            total time steps of input mini-batch feature and D is the hidden
+            size.
 
     Returns:
         Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
-            and lod is the same with the input.
+            and sequence length is the same with the input.
 
     Examples:
+
         .. code-block:: python
 
+            dict_dim, emb_dim = 128, 64
+            data = fluid.layers.data(name='sequence', shape=[1],
+                                     dtype='int32', lod_level=1)
+            emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
             hidden_dim = 512
-            x = fluid.layers.fc(input=data, size=hidden_dim * 3)
+            x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
             hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim)
     """
 
@@ -646,11 +665,13 @@ def dynamic_gru(input,
         attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
     bias = helper.create_parameter(
         attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
+    batch_size = input.shape[0]
     inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
     if h_0 != None:
         assert h_0.shape == (
-            size, size), 'The shape of h0 should be(%d, %d)' % (size, size)
-        inputs['h0'] = h_0
+            batch_size, size
+        ), 'The shape of h0 should be(batch_size, %d)' % size
+        inputs['H0'] = h_0
 
     hidden = helper.create_tmp_variable(dtype)
     batch_gate = helper.create_tmp_variable(dtype)
@@ -677,8 +698,8 @@ def dynamic_gru(input,
 def gru_unit(input,
              hidden,
              size,
-             weight=None,
-             bias=None,
+             param_attr=None,
+             bias_attr=None,
              activation='tanh',
              gate_activation='sigmoid'):
     """
@@ -709,8 +730,8 @@ def gru_unit(input,
         input (Variable): The fc transformed input value of current step.
         hidden (Variable): The hidden value of lstm unit from previous step.
         size (integer): The input dimension value.
-        weight (ParamAttr): The weight parameters for gru unit. Default: None
-        bias (ParamAttr): The bias parameters for gru unit. Default: None
+        param_attr (ParamAttr): The weight parameters for gru unit. Default: None
+        bias_attr (ParamAttr): The bias parameters for gru unit. Default: None
         activation (string): The activation type for cell (actNode).
                              Default: 'tanh'
         gate_activation (string): The activation type for gates (actGate).
@@ -742,40 +763,55 @@ def gru_unit(input,
     size = size / 3
 
     # create weight
-    if weight is None:
-        weight = helper.create_parameter(
-            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
 
+    gate = helper.create_tmp_variable(dtype)
+    reset_hidden_pre = helper.create_tmp_variable(dtype)
+    updated_hidden = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
     # create bias
-
-    if bias is None:
+    if helper.bias_attr:
         bias_size = [1, 3 * size]
         bias = helper.create_parameter(
             attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
-
-    gate = helper.create_tmp_variable(dtype)
-    reset_hidden_pre = helper.create_tmp_variable(dtype)
-    updated_hidden = helper.create_tmp_variable(dtype)
+        inputs['Bias'] = bias
 
     helper.append_op(
         type='gru_unit',
-        inputs={'Input': input,
-                'HiddenPrev': hidden,
-                'Weight': weight},
+        inputs=inputs,
         outputs={
             'Gate': gate,
             'ResetHiddenPrev': reset_hidden_pre,
             'Hidden': updated_hidden,
         },
         attrs={
-            'activation': 0,
-            'gate_activation': 1,
+            'activation': 2,  # tanh
+            'gate_activation': 1,  # sigmoid
         })
 
     return updated_hidden, reset_hidden_pre, gate
 
 
+@templatedoc()
 def linear_chain_crf(input, label, param_attr=None):
+    """
+    Linear Chain CRF.
+
+    ${comment}
+
+    Args:
+        input(${emission_type}): ${emission_comment}
+        input(${transition_type}): ${transition_comment}
+        label(${label_type}): ${label_comment}
+        param_attr(ParamAttr): The attribute of the learnable parameter.
+
+    Returns:
+        output(${emission_exps_type}): ${emission_exps_comment} \n
+        output(${transition_exps_type}): ${transition_exps_comment} \n
+        output(${log_likelihood_type}): ${log_likelihood_comment}
+
+    """
     helper = LayerHelper('linear_chain_crf', **locals())
     size = input.shape[1]
     transition = helper.create_parameter(
@@ -801,7 +837,27 @@ def linear_chain_crf(input, label, param_attr=None):
     return log_likelihood
 
 
+@templatedoc()
 def crf_decoding(input, param_attr, label=None):
+    """
+    ${comment}
+
+    Args:
+        input(${emission_type}): ${emission_comment}
+
+        param_attr(ParamAttr): The parameter attribute for training.
+
+        label(${label_type}): ${label_comment}
+
+    Returns:
+        Variable: ${viterbi_path_comment}
+
+    Examples:
+        .. code-block:: python
+
+           crf_decode = layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
+    """
     helper = LayerHelper('crf_decoding', **locals())
     transition = helper.get_parameter(param_attr.name)
     viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -815,10 +871,17 @@ def crf_decoding(input, param_attr, label=None):
     return viterbi_path
 
 
+@templatedoc()
 def cos_sim(X, Y):
     """
-    This function performs the cosine similarity between two tensors
-    X and Y and returns that as the output.
+    ${comment}
+
+    Args:
+        X (Variable): ${x_comment}.
+        Y (Variable): ${y_comment}.
+
+    Returns:
+        Variable: the output of cosine(X, Y).
     """
     helper = LayerHelper('cos_sim', **locals())
     out = helper.create_tmp_variable(dtype=X.dtype)
@@ -834,33 +897,36 @@ def cos_sim(X, Y):
     return out
 
 
-def dropout(x, dropout_prob, is_test=False, seed=None):
+def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     """
     Computes dropout.
 
     Drop or keep each element of `x` independently. Dropout is a regularization
     technique for reducing overfitting by preventing neuron co-adaption during
-    training. The dropout operator randomly set (according to the given dropout
+    training. The dropout operator randomly sets (according to the given dropout
     probability) the outputs of some units to zero, while others are remain
     unchanged.
 
     Args:
-       x(variable): The input tensor.
-       dropout_prob(float): Probability of setting units to zero.
-       is_test(bool): A flag indicating whether it is in test phrase or not.
-       seed(int): A Python integer used to create random seeds. If this
-                  parameter is set to None, a random seed is used.
-                  NOTE: If an integer seed is given, always the same output
-                  units will be dropped. DO NOT use a fixed seed in training.
+        x (Variable): The input tensor variable.
+        dropout_prob (float): Probability of setting units to zero.
+        is_test (bool): A flag indicating whether it is in test phrase or not.
+        seed (int): A Python integer used to create random seeds. If this
+                    parameter is set to None, a random seed is used.
+                    NOTE: If an integer seed is given, always the same output
+                    units will be dropped. DO NOT use a fixed seed in training.
+        name (str|None): A name for this layer(optional). If set None, the layer
+                         will be named automatically.
 
     Returns:
-        Variable: A tensor variable.
+        Variable: A tensor variable is the shape with `x`.
 
     Examples:
+
         .. code-block:: python
 
-          x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-          droped = fluid.layers.dropout(input=x, dropout_rate=0.5)
+            x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            droped = fluid.layers.dropout(x, dropout_prob=0.5)
     """
 
     helper = LayerHelper('dropout', **locals())
@@ -973,8 +1039,8 @@ def square_error_cost(input, label):
         * :math:`Out`: Output value, same shape with :math:`X`.
 
     Args:
-       input(Variable): Input tensor, has predictions.
-       label(Variable): Label tensor, has target labels.
+        input (Variable): Input tensor, has predictions.
+        label (Variable): Label tensor, has target labels.
 
     Returns:
         Variable: The tensor variable storing the element-wise squared error \
@@ -1003,14 +1069,101 @@ def square_error_cost(input, label):
     return square_out
 
 
+@templatedoc()
 def chunk_eval(input,
                label,
                chunk_scheme,
                num_chunk_types,
                excluded_chunk_types=None):
     """
+    **Chunk Evaluator**
+
     This function computes and outputs the precision, recall and
     F1-score of chunk detection.
+
+    For some basics of chunking, please refer to
+    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+
+    ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
+    and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
+    Here is a NER example of labeling for these tagging schemes:
+
+    .. code-block:: python
+
+       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
+              Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
+       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
+       IO     I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
+       IOB    B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
+       IOE    I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
+       IOBES  B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
+       ====== ====== ======  =====  ==  ============   =====  ===== =====  ==  =========
+
+    There are three chunk types(named entity types) including PER(person), ORG(organization)
+    and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
+
+    Since the calculations actually use label ids rather than labels, extra attention
+    should be paid when mapping labels to ids to make CheckEvalOp work. The key point
+    is that the listed equations are satisfied by ids.
+
+    .. code-block:: python
+
+       tag_type = label % num_tag_type
+       chunk_type = label / num_tag_type
+
+    where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
+    is the num of chunk types, and `tag_type` get its value from the following table.
+
+    .. code-block:: python
+
+       Scheme Begin Inside End   Single
+        plain   0     -      -     -
+        IOB     0     1      -     -
+        IOE     -     0      1     -
+        IOBES   0     1      2     3
+
+    Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
+    PER and LOC. To satisfy the above equations, the label map can be like this:
+
+    .. code-block:: python
+
+       B-ORG  0
+       I-ORG  1
+       B-PER  2
+       I-PER  3
+       B-LOC  4
+       I-LOC  5
+       O      6
+
+    It's not hard to verify the equations noting that the num of chunk types
+    is 3 and the num of tag types in IOB scheme is 2. For example, the label
+    id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
+    I-LOC is 2, which consistent with the results from the equations.
+
+    Args:
+        input (Variable): prediction output of the network.
+        label (Variable): label of the test data set.
+        chunk_scheme (str): ${chunk_scheme_comment}
+        num_chunk_types (int): ${num_chunk_types_comment}
+        excluded_chunk_types (list): ${excluded_chunk_types_comment}
+
+    Returns:
+        tuple: tuple containing: precision, recall, f1_score,
+        num_infer_chunks, num_label_chunks,
+        num_correct_chunks
+
+    Examples:
+        .. code-block:: python
+
+            crf = fluid.layers.linear_chain_crf(
+                input=hidden, label=label, param_attr=ParamAttr(name="crfw"))
+            crf_decode = fluid.layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
+            fluid.layers.chunk_eval(
+                input=crf_decode,
+                label=label,
+                chunk_scheme="IOB",
+                num_chunk_types=(label_dict_len - 1) / 2)
     """
     helper = LayerHelper("chunk_eval", **locals())
 
@@ -1043,6 +1196,7 @@ def chunk_eval(input,
             num_correct_chunks)
 
 
+@templatedoc()
 def sequence_conv(input,
                   num_filters,
                   filter_size=3,
@@ -1055,11 +1209,20 @@ def sequence_conv(input,
     This function creates the op for sequence_conv, using the inputs and
     other convolutional configurations for the filters and stride as given
     in the input parameters to the function.
-    """
 
-    # FIXME(dzh) : want to unify the argument of python layer
-    # function. So we ignore some unecessary attributes.
-    # such as, padding_trainable, context_start.
+    Args:
+        input (Variable): ${x_comment}
+        num_filters (int): number of filters.
+        filter_size (int): the filter size (H and W).
+        filter_stride (int): stride of the filter.
+        padding (bool): if True, add paddings.
+        bias_attr (ParamAttr|None): attributes for bias
+        param_attr (ParamAttr|None): attributes for parameter
+        act (str): the activation type
+
+    Returns:
+        Variable: output of sequence_conv
+    """
 
     helper = LayerHelper('sequence_conv', **locals())
     dtype = helper.input_dtype()
@@ -1084,26 +1247,129 @@ def sequence_conv(input,
     return helper.append_activation(pre_act)
 
 
+def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
+    """
+    This function computes the softmax activation among all time-steps for each
+    sequence. The dimension of each time-step should be 1. Thus, the shape of
+    input Tensor can be either :math:`[N, 1]` or :math:`[N]`, where :math:`N`
+    is the sum of the length of all sequences.
+
+    For i-th sequence in a mini-batch:
+
+    .. math::
+
+        Out(X[lod[i]:lod[i+1]], :) = \\frac{\exp(X[lod[i]:lod[i+1], :])}{\sum(\exp(X[lod[i]:lod[i+1], :]))}
+
+    For example, for a mini-batch of 3 sequences with variable-length,
+    each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
+    then softmax will be computed among :math:`X[0:2, :]`, :math:`X[2:5, :]`,
+    :math:`X[5:7, :]`, and :math:`N` turns out to be 7.
+
+    Args:
+        input (Variable): The input variable which is a LoDTensor.
+        bias_attr (ParamAttr|None): attributes for bias
+        param_attr (ParamAttr|None): attributes for parameter
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
+        library is installed. Default: True
+
+    Returns:
+        Variable: output of sequence_softmax
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1],
+                              dtype='float32', lod_level=1)
+             x_sequence_softmax = fluid.layers.sequence_softmax(input=x)
+    """
+    helper = LayerHelper('sequence_softmax', **locals())
+    dtype = helper.input_dtype()
+    softmax_out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="sequence_softmax",
+        inputs={"X": input},
+        outputs={"Out": softmax_out},
+        attrs={"use_cudnn": use_cudnn})
+    return softmax_out
+
+
+def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
+    """
+    The input of the softmax layer is a 2-D tensor with shape N x K (N is the
+    batch_size, K is the dimension of input feature). The output tensor has the
+    same shape as the input tensor.
+
+    For each row of the input tensor, the softmax operator squashes the
+    K-dimensional vector of arbitrary real values to a K-dimensional vector of real
+    values in the range [0, 1] that add up to 1.
+
+    It computes the exponential of the given dimension and the sum of exponential
+    values of all the other dimensions in the K-dimensional vector input.
+    Then the ratio of the exponential of the given dimension and the sum of
+    exponential values of all the other dimensions is the output of the softmax
+    operator.
+
+    For each row :math:`i` and each column :math:`j` in Input(X), we have:
+
+    .. math::
+
+        Out[i, j] = \\frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}
+
+    Args:
+        input (Variable): The input variable.
+        bias_attr (ParamAttr): attributes for bias
+        param_attr (ParamAttr): attributes for parameter
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn \
+        library is installed.
+
+    Returns:
+        Variable: output of softmax
+
+    Examples:
+
+        .. code-block:: python
+
+             fc = fluid.layers.fc(input=x, size=10)
+             softmax = fluid.layers.softmax(input=fc)
+
+    """
+    helper = LayerHelper('softmax', **locals())
+    dtype = helper.input_dtype()
+    softmax_out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="softmax",
+        inputs={"X": input},
+        outputs={"Out": softmax_out},
+        attrs={"use_cudnn": use_cudnn})
+    return softmax_out
+
+
 def conv2d(input,
            num_filters,
            filter_size,
            stride=1,
            padding=0,
+           dilation=1,
            groups=None,
            param_attr=None,
            bias_attr=None,
            use_cudnn=True,
            use_mkldnn=False,
-           act=None):
+           act=None,
+           name=None):
     """
-    **Convlution2D Layer**
-
     The convolution2D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input(Input) and
-    Output(Output) are in NCHW format. Where N is batch size, C is the number of
+    and strides, paddings, dilations, groups parameters. Input and
+    Output are in NCHW format, where N is batch size, C is the number of
     channels, H is the height of the feature, and W is the width of the feature.
-    The details of convolution layer, please refer UFLDL's `convolution,
-    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
+    Filter is in MCHW format, where M is the number of output image channels,
+    C is the number of input image channels, H is the height of the filter,
+    and W is the width of the filter. If the groups is greater than 1,
+    C will equal the number of input image channels divided by the groups.
+    Please refer to UFLDL's `convolution
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
+    for more detials.
     If bias attribution and activation type are provided, bias is added to the
     output of the convolution, and the corresponding activation function is
     applied to the final result.
@@ -1114,57 +1380,64 @@ def conv2d(input,
 
         Out = \sigma (W \\ast X + b)
 
-    In the above equation:
+    Where:
 
     * :math:`X`: Input value, a tensor with NCHW format.
     * :math:`W`: Filter value, a tensor with MCHW format.
     * :math:`\\ast`: Convolution operation.
     * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
     * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
-                   different.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
 
     Example:
 
         - Input:
 
-          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
 
-          Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
 
         - Output:
-          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
 
         Where
 
         .. math::
 
-        H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-        W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
-
-    Args:
-       input(Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of filter. It is as same as the output
-           image channel.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
-       groups(int): The groups number of the Conv2d Layer. According to grouped
-           convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-           the first half of the filters is only connected to the first half
-           of the input channels, while the second half of the filters is only
-           connected to the second half of the input channels. Default: groups=1
-       param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
-       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
-       act(str): Activation type. Default: None
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        input (Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
+            with mkldnn library. Default: False
+        act (str): Activation type. Default: None
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
 
     Returns:
         Variable: The tensor variable storing the convolution and \
@@ -1177,13 +1450,9 @@ def conv2d(input,
     Examples:
         .. code-block:: python
 
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d = fluid.layers.conv2d(
-              input=data, num_filters=2, filter_size=3, act="relu")
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
     """
-    if stride is None:
-        stride = [1, 1]
 
     num_channels = input.shape[1]
 
@@ -1205,6 +1474,7 @@ def conv2d(input,
     filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
     stride = utils.convert_to_list(stride, 2, 'stride')
     padding = utils.convert_to_list(padding, 2, 'padding')
+    dilation = utils.convert_to_list(dilation, 2, 'dilation')
 
     if not isinstance(use_cudnn, bool):
         raise ValueError("use_cudnn should be True or False")
@@ -1234,6 +1504,7 @@ def conv2d(input,
         attrs={
             'strides': stride,
             'paddings': padding,
+            'dilations': dilation,
             'groups': groups,
             'use_cudnn': use_cudnn,
             'use_mkldnn': use_mkldnn
@@ -1244,60 +1515,226 @@ def conv2d(input,
     return helper.append_activation(pre_act)
 
 
-def sequence_pool(input, pool_type):
+def conv3d(input,
+           num_filters,
+           filter_size,
+           stride=1,
+           padding=0,
+           dilation=1,
+           groups=None,
+           param_attr=None,
+           bias_attr=None,
+           use_cudnn=True,
+           use_mkldnn=False,
+           act=None,
+           name=None):
     """
-    This function add the operator for sequence pooling.
-    It pools features of all time-steps of each instance, and is applied
-    on top of the input using pool_type mentioned in the parameters.
+    **Convlution3D Layer**
 
-    It supports four pool_type:
+    The convolution3D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input(Input) and
+    Output(Output) are in NCDHW format. Where N is batch size C is the number of
+    channels, D is the depth of the feature, H is the height of the feature,
+    and W is the width of the feature. Convlution3D is similar with Convlution2D
+    but adds one dimension(depth). If bias attribution and activation type are
+    provided, bias is added to the output of the convolution, and the
+    corresponding activation function is applied to the final result.
 
-    - average: :math:`Out[i] = \\frac{\sum_i X_i}{N}`
-    - sum:     :math:`Out[i] = \sum_jX_{ij}`
-    - sqrt:    :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}`
-    - max:     :math:`Out[i] = max(X_i)`
+    For each input :math:`X`, the equation is:
 
-    .. code-block:: text
+    .. math::
 
-       x is a 1-level LoDTensor:
-         x.lod = [[0, 2, 5, 7]]
-         x.data = [1, 3, 2, 4, 6, 5, 1]
-         x.dims = [7, 1]
+        Out = \sigma (W \\ast X + b)
 
-       then output is a Tensor:
-         out.dim = [3, 1]
-         with condition len(x.lod[-1]) - 1 == out.dims[0]
+    In the above equation:
 
-       for different pool_type:
-         average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
-         sum    : out.data = [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
-         sqrt   : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
-                    6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
-         max    : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+    * :math:`X`: Input value, a tensor with NCDHW format.
+    * :math:`W`: Filter value, a tensor with MCDHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{out}, C_{in}, D_f, H_f, W_f)`
+
+        - Output:
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+            D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1
 
     Args:
-        input(variable): The input variable which is a LoDTensor.
-        pool_type (string): The pooling type of sequence_pool.
-            It supports average, sum, sqrt and max.
+        input (Variable): The input image with [N, C, D, H, W] format.
+            num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
+            stride_D = stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
+            padding_D = padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
+            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv3d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv3d Layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv3d layer. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not.
+        act (str): Activation type. Default: None
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
 
     Returns:
-        The sequence pooling variable which is a Tensor.
+        Variable: The tensor variable storing the convolution and \
+                  non-linearity activation result.
 
-    Examples:
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
 
+    Examples:
         .. code-block:: python
 
-             x = fluid.layers.data(name='x', shape=[7, 1],
-                              dtype='float32', lod_level=1)
-             avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
-             sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
-             sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
-             max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
+          data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
+          conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu")
     """
-    helper = LayerHelper('sequence_pool', **locals())
-    dtype = helper.input_dtype()
-    pool_out = helper.create_tmp_variable(dtype)
-    max_index = helper.create_tmp_variable(dtype)
+
+    l_type = 'conv3d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+
+    num_channels = input.shape[1]
+
+    if groups is None:
+        num_filter_channels = num_channels
+    else:
+        if num_channels % groups != 0:
+            raise ValueError("num_channels must be divisible by groups.")
+        num_filter_channels = num_channels / groups
+
+    filter_size = utils.convert_to_list(filter_size, 3, 'filter_size')
+    stride = utils.convert_to_list(stride, 3, 'stride')
+    padding = utils.convert_to_list(padding, 3, 'padding')
+    dilation = utils.convert_to_list(dilation, 3, 'dilation')
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    input_shape = input.shape
+    filter_shape = [num_filters, num_filter_channels] + filter_size
+
+    def _get_default_param_initializer():
+        std = (2.0 / (filter_size[0]**3 * num_channels))**0.5
+        return Normal(0.0, std, 0)
+
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=filter_shape,
+        dtype=dtype,
+        default_initializer=_get_default_param_initializer())
+
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type=l_type,
+        inputs={
+            'Input': input,
+            'Filter': filter_param,
+        },
+        outputs={"Output": pre_bias},
+        attrs={
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn,
+            'use_mkldnn': use_mkldnn
+        })
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+
+    return helper.append_activation(pre_act)
+
+
+def sequence_pool(input, pool_type):
+    """
+    This function add the operator for sequence pooling.
+    It pools features of all time-steps of each instance, and is applied
+    on top of the input using pool_type mentioned in the parameters.
+
+    It supports four pool_type:
+
+    - average: :math:`Out[i] = \\frac{\sum_i X_i}{N}`
+    - sum:     :math:`Out[i] = \sum_jX_{ij}`
+    - sqrt:    :math:`Out[i] = \\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}`
+    - max:     :math:`Out[i] = max(X_i)`
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[2, 3, 2]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) == out.dims[0]
+
+       for different pool_type:
+         average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+         sum    : out.data = [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+         sqrt   : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
+                    6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
+         max    : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+         last   : out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+         first  : out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+        pool_type (string): The pooling type of sequence_pool.
+            It supports average, sum, sqrt and max.
+
+    Returns:
+        The sequence pooling variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1],
+                              dtype='float32', lod_level=1)
+             avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
+             sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
+             sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
+             max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
+             last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
+             first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
+    """
+    helper = LayerHelper('sequence_pool', **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+    max_index = helper.create_tmp_variable(dtype)
 
     helper.append_op(
         type="sequence_pool",
@@ -1316,18 +1753,18 @@ def sequence_pool(input, pool_type):
 
 def sequence_first_step(input):
     """
-    This funciton get the first step of sequence.
+    This function gets the first step of sequence.
 
     .. code-block:: text
 
        x is a 1-level LoDTensor:
-         x.lod = [[0, 2, 5, 7]]
+         x.lod = [[2, 3, 2]]
          x.data = [1, 3, 2, 4, 6, 5, 1]
          x.dims = [7, 1]
 
        then output is a Tensor:
          out.dim = [3, 1]
-         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         with condition len(x.lod[-1]) == out.dims[0]
          out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
 
     Args:
@@ -1349,18 +1786,18 @@ def sequence_first_step(input):
 
 def sequence_last_step(input):
     """
-    This funciton get the last step of sequence.
+    This function gets the last step of sequence.
 
     .. code-block:: text
 
        x is a 1-level LoDTensor:
-         x.lod = [[0, 2, 5, 7]]
+         x.lod = [[2, 3, 2]]
          x.data = [1, 3, 2, 4, 6, 5, 1]
          x.dims = [7, 1]
 
        then output is a Tensor:
          out.dim = [3, 1]
-         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         with condition len(x.lod[-1]) == out.dims[0]
          out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
 
     Args:
@@ -1380,6 +1817,7 @@ def sequence_last_step(input):
     return sequence_pool(input=input, pool_type="last")
 
 
+@templatedoc()
 def pool2d(input,
            pool_size=-1,
            pool_type="max",
@@ -1391,8 +1829,45 @@ def pool2d(input,
            use_mkldnn=False,
            name=None):
     """
-    This function adds the operator for pooling in 2 dimensions, using the
-    pooling configurations mentioned in input parameters.
+    ${comment}
+
+    Args:
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCHW, where N is batch size, C is
+                          the number of channels, H is the height of the
+                          feature, and W is the width of the feature.
+        pool_size (int): The side length of pooling windows. All pooling
+                         windows are squares with pool_size on a side.
+        pool_type: ${pooling_type_comment}
+        pool_stride (int): stride of the pooling layer.
+        pool_padding (int): padding size.
+        global_pooling: ${global_pooling_comment}
+        use_cudnn: ${use_cudnn_comment}
+        ceil_mode: ${ceil_mode_comment}
+        use_mkldnn: ${use_mkldnn_comment}
+        name (str|None): A name for this layer(optional). If set None, the
+                        layer will be named automatically.
+
+    Returns:
+        Variable: The pooling result.
+
+    Raises:
+        ValueError: If 'pool_type' is not "max" nor "avg"
+        ValueError: If 'global_pooling' is False and 'pool_size' is -1
+        ValueError: If 'use_cudnn' is not a bool value.
+
+    Examples:
+
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.pool2d(
+                            input=data,
+                            pool_size=2,
+                            pool_type='max',
+                            pool_stride=1,
+                            global_pooling=False)
     """
     if pool_type not in ["max", "avg"]:
         raise ValueError(
@@ -1411,12 +1886,84 @@ def pool2d(input,
     if not isinstance(use_cudnn, bool):
         raise ValueError("use_cudnn should be True or False")
 
-    helper = LayerHelper('pool2d', **locals())
+    l_type = 'pool2d'
+
+    helper = LayerHelper(l_type, **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type=l_type,
+        inputs={"X": input},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "global_pooling": global_pooling,
+            "strides": pool_stride,
+            "paddings": pool_padding,
+            "use_cudnn": use_cudnn,
+            "ceil_mode": ceil_mode,
+            "use_mkldnn": use_mkldnn
+        })
+
+    return pool_out
+
+
+def pool3d(input,
+           pool_size=-1,
+           pool_type="max",
+           pool_stride=1,
+           pool_padding=0,
+           global_pooling=False,
+           use_cudnn=True,
+           ceil_mode=False,
+           use_mkldnn=False,
+           name=None):
+    """
+    This function adds the operator for pooling in 3-dimensions, using the
+    pooling configurations mentioned in input parameters.
+
+    Args:
+        input (Variable): ${input_comment}
+        pool_size (int): ${ksize_comment}
+        pool_type (str): ${pooling_type_comment}
+        pool_stride (int): stride of the pooling layer.
+        pool_padding (int): padding size.
+        global_pooling (bool): ${global_pooling_comment}
+        use_cudnn (bool): ${use_cudnn_comment}
+        ceil_mode (bool): ${ceil_mode_comment}
+        use_mkldnn (bool): ${use_mkldnn_comment}
+        name (str): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: output of pool3d layer.
+    """
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
+            str(pool_type))
+
+    if global_pooling is False and pool_size == -1:
+        raise ValueError(
+            "When the global_pooling is False, pool_size must be passed "
+            "and be a valid value. Received pool_size: " + str(pool_size))
+
+    pool_size = utils.convert_to_list(pool_size, 3, 'pool_size')
+    pool_padding = utils.convert_to_list(pool_padding, 3, 'pool_padding')
+    pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride')
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    l_type = "pool3d"
+    helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_tmp_variable(dtype)
 
     helper.append_op(
-        type="pool2d",
+        type=l_type,
         inputs={"X": input},
         outputs={"Out": pool_out},
         attrs={
@@ -1441,12 +1988,66 @@ def batch_norm(input,
                param_attr=None,
                bias_attr=None,
                data_layout='NCHW',
+               in_place=False,
+               use_mkldnn=False,
                name=None,
                moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               do_model_average_for_mean_and_var=False,
+               fuse_with_relu=False):
     """
-    This function helps create an operator to implement
-    the BatchNorm layer using the configurations from the input parameters.
+    **Batch Normalization Layer**
+
+    Can be used as a normalizer function for conv2d and fully_connected operations.
+    The required data format for this layer is one of the following:
+
+    1. NHWC `[batch, in_height, in_width, in_channels]`
+
+    2. NCHW `[batch, in_channels, in_height, in_width]`
+
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
+    for more details.
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+        act(string, Default None): Activation type, linear|relu|prelu|...
+        is_test(bool, Default False): Used for training or training.
+        momentum(float, Default 0.9):
+        epsilon(float, Default 1e-05):
+        param_attr(ParamAttr): The parameter attribute for Parameter `scale`.
+        bias_attr(ParamAttr): The parameter attribute for Parameter `bias`.
+        data_layout(string, default NCHW): NCHW|NHWC
+        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
+        use_mkldnn(bool, Default false): ${use_mkldnn_comment}
+        name(string, Default None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+        moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
+        moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
+        do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
+        fuse_with_relu (bool): if True, this OP performs relu after batch norm.
+
+    Returns:
+        Variable: A tensor variable which is the result after applying batch normalization on the input.
+
+    Examples:
+
+        .. code-block:: python
+
+            hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
+            hidden2 = fluid.layers.batch_norm(input=hidden1)
     """
     helper = LayerHelper('batch_norm', **locals())
     dtype = helper.input_dtype()
@@ -1474,7 +2075,10 @@ def batch_norm(input,
 
     mean = helper.create_parameter(
         attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     mean.stop_gradient = True
@@ -1483,7 +2087,8 @@ def batch_norm(input,
         attr=ParamAttr(
             name=moving_variance_name,
             initializer=Constant(1.0),
-            trainable=False),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     variance.stop_gradient = True
@@ -1496,7 +2101,7 @@ def batch_norm(input,
     saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
     saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
 
-    batch_norm_out = helper.create_tmp_variable(dtype)
+    batch_norm_out = input if in_place else helper.create_tmp_variable(dtype)
 
     helper.append_op(
         type="batch_norm",
@@ -1514,13 +2119,18 @@ def batch_norm(input,
             "SavedMean": saved_mean,
             "SavedVariance": saved_variance
         },
-        attrs={"momentum": momentum,
-               "epsilon": epsilon,
-               "is_test": is_test})
+        attrs={
+            "momentum": momentum,
+            "epsilon": epsilon,
+            "is_test": is_test,
+            "use_mkldnn": use_mkldnn,
+            "fuse_with_relu": fuse_with_relu
+        })
 
     return helper.append_activation(batch_norm_out)
 
 
+@templatedoc()
 def layer_norm(input,
                scale=True,
                shift=True,
@@ -1531,20 +2141,11 @@ def layer_norm(input,
                act=None,
                name=None):
     """
-    **Layer Normalization**
-
-    Assume feature vectors exist on dimensions
-    :attr:`begin_norm_axis ... rank(input)` and calculate the moment statistics
-    along these dimensions for each feature vector :math:`a` with size
-    :math:`H`, then normalize each feature vector using the corresponding
-    statistics. After that, apply learnable gain and bias on the normalized
-    tensor to scale and shift if :attr:`scale` and :attr:`shift` are set.
-
-    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_
+    ${comment}
 
     The formula is as follows:
 
-    .. math::
+    ..  math::
 
         \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
 
@@ -1552,6 +2153,15 @@ def layer_norm(input,
 
         h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
 
+    * :math:`a`: the vector representation of the summed inputs to the neurons
+    in that layer.
+
+    * :math:`H`: the number of hidden units in a layers
+
+    * :math:`g`: the trainable scale parameter.
+
+    * :math:`b`: the trainable bias parameter.
+
     Args:
         input(Variable): The input tensor variable.
         scale(bool): Whether to learn the adaptive gain :math:`g` after
@@ -1567,16 +2177,16 @@ def layer_norm(input,
         bias_attr(ParamAttr|None): The parameter attribute for the learnable
             bias :math:`b`.
         act(str): Activation to be applied to the output of layer normalizaiton.
+        name (str): The name of this layer. It is optional.
 
     Returns:
-        Variable: A tensor variable with the same shape as the input.
+        ${y_comment}
 
     Examples:
-        .. code-block:: python
 
-            data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-            x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+        >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
+        >>>                          dtype='float32')
+        >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
     """
     helper = LayerHelper('layer_norm', **locals())
     dtype = helper.input_dtype()
@@ -1617,23 +2227,6 @@ def layer_norm(input,
     return helper.append_activation(layer_norm_out)
 
 
-def beam_search_decode(ids, scores, name=None):
-    helper = LayerHelper('beam_search_decode', **locals())
-    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
-    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
-
-    helper.append_op(
-        type="beam_search_decode",
-        inputs={"Ids": ids,
-                "Scores": scores},
-        outputs={
-            "SentenceIds": sentence_ids,
-            "SentenceScores": sentence_scores
-        })
-
-    return sentence_ids, sentence_scores
-
-
 def conv2d_transpose(input,
                      num_filters,
                      output_size=None,
@@ -1641,8 +2234,11 @@ def conv2d_transpose(input,
                      padding=0,
                      stride=1,
                      dilation=1,
+                     groups=None,
                      param_attr=None,
+                     bias_attr=None,
                      use_cudnn=True,
+                     act=None,
                      name=None):
     """
     **Convlution2D transpose layer**
@@ -1655,32 +2251,36 @@ def conv2d_transpose(input,
     represent height and width, respectively. The details of convolution transpose
     layer, please refer to the following explanation and references
     `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
 
     For each input :math:`X`, the equation is:
 
     .. math::
 
-        Out = W \\ast X
+        Out = \sigma (W \\ast X + b)
 
-    In the above equation:
+    Where:
 
     * :math:`X`: Input value, a tensor with NCHW format.
     * :math:`W`: Filter value, a tensor with MCHW format.
-    * :math:`\\ast` : Convolution transpose operation.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
-                   different.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
 
     Example:
 
         - Input:
 
-          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
 
-          Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
+          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
 
         - Output:
 
-          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
 
         Where
 
@@ -1690,51 +2290,64 @@ def conv2d_transpose(input,
            W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
 
     Args:
-       input(Variable): The input image with [N, C, H, W] format.
-       num_filters(int): The number of the filter. It is as same as the output
-           image channel.
-       output_size(int|tuple|None): The output image size. If output size is a
-           tuple, it must contain two integers, (image_H, image_W). This
-           parameter only works when filter_size is None.
-       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-           it must contain two integers, (filter_size_H, filter_size_W).
-           Otherwise, the filter will be a square. None if use output size to
-           calculate filter_size.
-       padding(int|tuple): The padding size. If padding is a tuple, it must
-           contain two integers, (padding_H, padding_W). Otherwise, the
-           padding_H = padding_W = padding. Default: padding = 0.
-       stride(int|tuple): The stride size. If stride is a tuple, it must
-           contain two integers, (stride_H, stride_W). Otherwise, the
-           stride_H = stride_W = stride. Default: stride = 1.
-       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-           contain two integers, (dilation_H, dilation_W). Otherwise, the
-           dilation_H = dilation_W = dilation. Default: dilation = 1.
-       param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
-                              Default: None
-       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-           library is installed. Default: True
-       name(str|None): A name for this layer(optional). If set None, the layer
-           will be named automatically.
-
-    Returns:
-       Variable: The tensor variable storing the convolution transpose result.
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of the filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. None if use output size to
+            calculate filter_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv2d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
+                               Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act(str): Activation type. Default: None
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The tensor variable storing the convolution transpose result.
 
     Raises:
-       ValueError: If the shapes of input, filter_size, stride, padding and
-                   groups mismatch.
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
 
     Examples:
        .. code-block:: python
 
-          data = fluid.layers.data(
-              name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d_transpose = fluid.layers.conv2d_transpose(
-              input=data, num_filters=2, filter_size=3)
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3)
     """
-    helper = LayerHelper("conv2d_transpose", **locals())
+
+    input_channel = input.shape[1]
+
+    op_type = 'conv2d_transpose'
+    if (input_channel == groups and num_filters == input_channel and
+            not use_cudnn):
+        op_type = 'depthwise_conv2d_transpose'
+
+    helper = LayerHelper(op_type, **locals())
     if not isinstance(input, Variable):
         raise TypeError("Input of conv2d_transpose must be Variable")
-    input_channel = input.shape[1]
 
     padding = utils.convert_to_list(padding, 2, 'padding')
     stride = utils.convert_to_list(stride, 2, 'stride')
@@ -1761,72 +2374,245 @@ def conv2d_transpose(input,
         filter_size = utils.convert_to_list(filter_size, 2,
                                             'conv2d_transpose.filter_size')
 
-    filter_shape = [input_channel, num_filters] + filter_size
+    groups = 1 if groups is None else groups
+    filter_shape = [input_channel, num_filters / groups] + filter_size
     img_filter = helper.create_parameter(
         dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
 
-    out = helper.create_tmp_variable(dtype=input.dtype)
+    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
     helper.append_op(
-        type='conv2d_transpose',
+        type=op_type,
         inputs={'Input': [input],
                 'Filter': [img_filter]},
-        outputs={'Output': out},
+        outputs={'Output': pre_bias},
         attrs={
             'strides': stride,
             'paddings': padding,
             'dilations': dilation,
+            'groups': groups,
             'use_cudnn': use_cudnn
         })
 
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+    out = helper.append_activation(pre_act)
     return out
 
 
-def sequence_expand(x, y, name=None):
-    """Sequence Expand Layer. This layer will expand the input variable **x**
-    according to LoD information of **y**. And the following examples will
-    explain how sequence_expand works:
-
-    .. code-block:: text
-
-        * Case 1
-            x is a LoDTensor:
-                x.lod = [[0,       2, 3],
-                         [0, 1,    3, 4]]
-                x.data = [a, b, c, d]
-                x.dims = [4, 1]
-
+def conv3d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=0,
+                     stride=1,
+                     dilation=1,
+                     groups=None,
+                     param_attr=None,
+                     bias_attr=None,
+                     use_cudnn=True,
+                     act=None,
+                     name=None):
+    """
+    **Convlution3D transpose layer**
+
+    The convolution3D transpose layer calculates the output based on the input,
+    filter, and dilations, strides, paddings. Input(Input) and output(Output)
+    are in NCDHW format. Where N is batch size, C is the number of channels,
+    D is the depth of the feature, H is the height of the feature, and W
+    is the width of the feature. Parameters(dilations, strides, paddings) are
+    two elements. These two elements represent height and width, respectively.
+    The details of convolution transpose layer, please refer to the following
+    explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    In the above equation:
+
+    * :math:`X`: Input value, a tensor with NCDHW format.
+    * :math:`W`: Filter value, a tensor with MCDHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+           D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
+           H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
+           W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
+
+    Args:
+        input(Variable): The input image with [N, C, D, H, W] format.
+        num_filters(int): The number of the filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain three integers, (image_D, image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. None if use output size to
+            calculate filter_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
+            padding_D = padding_H = padding_W = padding. Default: padding = 0.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
+            stride_D = stride_H = stride_W = stride. Default: stride = 1.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
+            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv3d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
+        param_attr(ParamAttr): The parameters to the Conv3d_transpose Layer.
+            Default: None
+        bias_attr(ParamAttr): Bias parameter for the Conv3d layer. Default: None
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act(str): Activation type. Default: None
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The tensor variable storing the convolution transpose result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+       .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32')
+          conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3)
+    """
+    l_type = "conv3d_transpose"
+    helper = LayerHelper(l_type, **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv3d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    padding = utils.convert_to_list(padding, 3, 'padding')
+    stride = utils.convert_to_list(stride, 3, 'stride')
+    dilation = utils.convert_to_list(dilation, 3, 'dilation')
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        d_in = input.shape[2]
+        h_in = input.shape[3]
+        w_in = input.shape[4]
+
+        filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + 2 *
+                         padding[0] - 1) / dilation[0] + 1
+        filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + 2 *
+                         padding[1] - 1) / dilation[1] + 1
+        filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + 2 *
+                         padding[2] - 1) / dilation[2] + 1
+        filter_size = [filter_size_d, filter_size_h, filter_size_w]
+    else:
+        filter_size = utils.convert_to_list(filter_size, 3,
+                                            'conv3d_transpose.filter_size')
+
+    groups = 1 if groups is None else groups
+    filter_shape = [input_channel, num_filters / groups] + filter_size
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+
+    pre_bias = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type=l_type,
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': pre_bias},
+        attrs={
+            'strides': stride,
+            'paddings': padding,
+            'dilations': dilation,
+            'groups': groups,
+            'use_cudnn': use_cudnn
+        })
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+    out = helper.append_activation(pre_act)
+    return out
+
+
+def sequence_expand(x, y, ref_level=-1, name=None):
+    """Sequence Expand Layer. This layer will expand the input variable **x**
+    according to specified level lod of **y**. Please note that lod level of
+    **x** is at most 1 and rank of **x** is at least 2. When rank of **x**
+    is greater than 2, then it would be viewed as a 2-D tensor.
+    Following examples will explain how sequence_expand works:
+
+    .. code-block:: text
+
+        * Case 1
+            x is a LoDTensor:
+                x.lod  = [[2,        2]]
+                x.data = [[a], [b], [c], [d]]
+                x.dims = [4, 1]
+
             y is a LoDTensor:
-                y.lod = [[0,    2,    4],
-                         [0, 3, 6, 7, 8]]
+                y.lod = [[2,    2],
+                         [3, 3, 1, 1]]
 
-            with condition len(y.lod[-1]) - 1 == x.dims[0]
+            ref_level: 0
 
-            then output is a 2-level LoDTensor:
-                out.lod = [[0,                2,    4],
-                           [0,       3,       6, 7, 8]]
-                out.data = [a, a, a, b, b, b, c, d]
+            then output is a 1-level LoDTensor:
+                out.lod =  [[2,        2,        2,        2]]
+                out.data = [[a], [b], [a], [b], [c], [d], [c], [d]]
                 out.dims = [8, 1]
 
         * Case 2
             x is a Tensor:
-                x.data = [a, b, c]
+                x.data = [[a], [b], [c]]
                 x.dims = [3, 1]
 
             y is a LoDTensor:
-                y.lod = [[0, 2, 3, 6]]
+                y.lod = [[2, 0, 3]]
 
-            with condition len(y.lod[-1]) - 1 == x.dims[0]
-
-            then output is a 1-level LoDTensor:
-                out.lod = [[0,    2, 3,      6]]
-                out.data = [a, a, b, c, c, c]
-                out.dims = [6, 1]
+            ref_level: -1
 
+            then output is a Tensor:
+                out.data = [[a], [a], [c], [c], [c]]
+                out.dims = [5, 1]
     Args:
         x (Variable): The input variable which is a Tensor or LoDTensor.
         y (Variable): The input variable which is a LoDTensor.
+        ref_level (int): Lod level of `y` to be referred by `x`. If set to -1,
+                         refer the last level of lod.
         name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
+                        will be named automatically.
 
     Returns:
         Variable: The expanded variable which is a LoDTensor.
@@ -1837,21 +2623,103 @@ def sequence_expand(x, y, name=None):
             x = fluid.layers.data(name='x', shape=[10], dtype='float32')
             y = fluid.layers.data(name='y', shape=[10, 20],
                              dtype='float32', lod_level=1)
-            out = layers.sequence_expand(x=x, y=y)
+            out = layers.sequence_expand(x=x, y=y, ref_level=0)
     """
     helper = LayerHelper('sequence_expand', input=x, **locals())
     dtype = helper.input_dtype()
     tmp = helper.create_tmp_variable(dtype)
     helper.append_op(
-        type='sequence_expand', inputs={'X': x,
-                                        'Y': y}, outputs={'Out': tmp})
+        type='sequence_expand',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': tmp},
+        attrs={'ref_level': ref_level})
     return tmp
 
 
-def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
-    '''
-    This function implements the beam search algorithm.
-    '''
+def beam_search(pre_ids,
+                pre_scores,
+                ids,
+                scores,
+                beam_size,
+                end_id,
+                level=0,
+                name=None):
+    """
+    Beam search is a classical algorithm for selecting candidate words in a
+    machine translation task.
+
+    Refer to `Beam search <https://en.wikipedia.org/wiki/Beam_search>`_
+    for more details.
+    
+    This layer does the search in beams for one time step. Specifically, it 
+    selects the top-K candidate word ids of current step from :attr:`ids`
+    according to their :attr:`scores` for all source sentences, where K is
+    :attr:`beam_size` and :attr:`ids, scores` are predicted results from the
+    computation cell. Additionally, :attr:`pre_ids` and :attr:`pre_scores` are
+    the output of beam_search at previous step, they are needed for special use
+    to handle ended candidate translations.
+ 
+    Note that the :attr:`scores` passed in should be accumulated scores, and
+    length penalty should be done with extra operators before calculating the
+    accumulated scores if needed, also suggest finding top-K before it and
+    using the top-K candidates following.
+
+    Please see the following demo for a fully beam search usage example:
+
+        fluid/tests/book/test_machine_translation.py
+
+    Args:
+        pre_ids(Variable): The LodTensor variable which is the output of
+            beam_search at previous step. It should be a LodTensor with shape
+            :math:`(batch_size, 1)` and lod
+            :math:`[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at the
+            first step.
+        pre_scores(Variable): The LodTensor variable which is the output of
+            beam_search at previous step.
+        ids(Variable): The LodTensor variable containing the candidates ids.
+            Its shape should be :math:`(batch_size \\times beam_size, K)`,
+            where :math:`K` supposed to be :attr:`beam_size`.
+        scores(Variable): The LodTensor variable containing the accumulated
+            scores corresponding to :attr:`ids` and its shape is the same as
+            the shape of :attr:`ids`.
+        beam_size(int): The beam width used in beam search.
+        end_id(int): The id of end token.
+        level(int, default 0): It can be ignored and mustn't change currently.
+            It means the source level of lod, which is explained as following.
+            The lod level of :attr:`ids` should be 2. The first level is source
+            level which describes how many prefixes (branchs) for each source
+            sentece (beam), and the second level is sentence level which
+            describes how these candidates belong to the prefix. The paths
+            linking prefixes and selected candidates are organized and reserved
+            in lod.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The LodTensor pair containing the selected ids and the \
+            corresponding scores.
+
+    Examples:
+        .. code-block:: python
+
+            # Suppose `probs` contains predicted results from the computation
+            # cell and `pre_ids` and `pre_scores` is the output of beam_search
+            # at previous step.
+            topk_scores, topk_indices = layers.topk(probs, k=beam_size)
+            accu_scores = layers.elementwise_add(
+                x=layers.log(x=topk_scores)),
+                y=layers.reshape(
+                    pre_scores, shape=[-1]),
+                axis=0)
+            selected_ids, selected_scores = layers.beam_search(
+                pre_ids=pre_ids,
+                pre_scores=pre_scores,
+                ids=topk_indices,
+                scores=accu_scores,
+                beam_size=beam_size,
+                end_id=end_id)
+    """
     helper = LayerHelper('beam_search', **locals())
     score_type = scores.dtype
     id_type = ids.dtype
@@ -1863,6 +2731,7 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
         type='beam_search',
         inputs={
             'pre_ids': pre_ids,
+            'pre_scores': pre_scores,
             'ids': ids,
             'scores': scores,
         },
@@ -1880,6 +2749,56 @@ def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
     return selected_ids, selected_scores
 
 
+def beam_search_decode(ids, scores, beam_size, end_id, name=None):
+    """
+    Beam Search Decode Layer. This layer constructs the full hypotheses for
+    each source sentence by walking back along the LoDTensorArray :attr:`ids`
+    whose lods can be used to restore the path in the beam search tree.
+    Please see the following demo for a fully beam search usage example:
+        fluid/tests/book/test_machine_translation.py
+
+    Args:
+        ids(Variable): The LodTensorArray variable containing the selected ids
+            of all steps.
+        scores(Variable): The LodTensorArray variable containing the selected
+            scores of all steps.
+        beam_size(int): The beam width used in beam search.
+        end_id(int): The id of end token.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The LodTensor pair containing the generated id sequences \
+            and the corresponding scores. The shapes and lods of the two \
+            LodTensor are same. The lod level is 2 and the two levels \
+            separately indicate how many hypotheses each source sentence has \
+            and how many ids each hypothesis has.
+
+    Examples:
+        .. code-block:: python
+            # Suppose `ids` and `scores` are LodTensorArray variables reserving
+            # the selected ids and scores of all steps
+            finished_ids, finished_scores = layers.beam_search_decode(
+                ids, scores, beam_size=5, end_id=0)
+    """
+    helper = LayerHelper('beam_search_decode', **locals())
+    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
+    sentence_scores = helper.create_tmp_variable(dtype=ids.dtype)
+
+    helper.append_op(
+        type="beam_search_decode",
+        inputs={"Ids": ids,
+                "Scores": scores},
+        outputs={
+            "SentenceIds": sentence_ids,
+            "SentenceScores": sentence_scores
+        },
+        attrs={"beam_size": beam_size,
+               "end_id": end_id})
+
+    return sentence_ids, sentence_scores
+
+
 def lstm_unit(x_t,
               hidden_t_prev,
               cell_t_prev,
@@ -2006,11 +2925,11 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the sum is performed. If
+        dim (list|int|None): The dimensions along which the sum is performed. If
             :attr:`None`, sum all elements of :attr:`input` and return a
             Tensor variable with a single element, otherwise must be in the
-            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
-            the dimension to reduce is :math:`rank + dim`.
+            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
+            the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool|False): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2031,15 +2950,25 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
             fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
             fluid.layers.reduce_sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1, 2], [3, 4]],
+            #      [[5, 6], [7, 8]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
+            fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
+
     """
     helper = LayerHelper('reduce_sum', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_sum',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2048,23 +2977,24 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
 
 def reduce_mean(input, dim=None, keep_dim=False, name=None):
     """
-    Computes the mean of tensor elements over the given dimension.
+    Computes the mean of the input tensor's elements along the given dimension.
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the mean is computed. If
-            :attr:`None`, compute the mean over all elements of :attr:`input`
-            and return a Tensor variable with a single element, otherwise
+        dim (list|int|None): The dimension along which the mean is computed. If
+            `None`, compute the mean over all elements of :attr:`input`
+            and return a variable with a single element, otherwise it
             must be in the range :math:`[-rank(input), rank(input))`. If
-            :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+            :math:`dim[i] < 0`, the dimension to reduce is
+            :math:`rank(input) + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
-        name(str|None): A name for this layer(optional). If set None, the layer
+        name(str|None): A name for this layer(optional). If set `None`, the layer
                        will be named automatically.
 
     Returns:
-        Variable: The reduced Tensor variable.
+        Variable: The reduced mean Variable.
 
     Examples:
         .. code-block:: python
@@ -2076,16 +3006,26 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_mean(x)  # [0.4375]
             fluid.layers.reduce_mean(x, dim=0)  # [0.15, 0.25, 0.55, 0.8]
             fluid.layers.reduce_mean(x, dim=-1)  # [0.475, 0.4]
-            fluid.layers.reduce_mean(x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
+            fluid.layers.reduce_mean(
+                x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_mean(x, dim=[1, 2]) # [2.5, 6.5]
+            fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0]
     """
     helper = LayerHelper('reduce_mean', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_mean',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2098,11 +3038,11 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the maximum is computed.
+        dim (list|int|None): The dimension along which the maximum is computed.
             If :attr:`None`, compute the maximum over all elements of
             :attr:`input` and return a Tensor variable with a single element,
             otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2123,15 +3063,24 @@ def reduce_max(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_max(x, dim=0)  # [0.2, 0.3, 0.6, 0.9]
             fluid.layers.reduce_max(x, dim=-1)  # [0.9, 0.7]
             fluid.layers.reduce_max(x, dim=1, keep_dim=True)  # [[0.9], [0.7]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_max(x, dim=[1, 2]) # [4.0, 8.0]
+            fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0]
     """
     helper = LayerHelper('reduce_max', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_max',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2144,11 +3093,11 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
 
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
-        dim (int|None): The dimension along which the minimum is computed.
+        dim (list|int|None): The dimensions along which the minimum is computed.
             If :attr:`None`, compute the minimum over all elements of
             :attr:`input` and return a Tensor variable with a single element,
             otherwise must be in the range :math:`[-rank(input), rank(input))`.
-            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
         keep_dim (bool): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have one fewer dimension
             than the :attr:`input` unless :attr:`keep_dim` is true.
@@ -2169,15 +3118,80 @@ def reduce_min(input, dim=None, keep_dim=False, name=None):
             fluid.layers.reduce_min(x, dim=0)  # [0.1, 0.2, 0.5, 0.7]
             fluid.layers.reduce_min(x, dim=-1)  # [0.2, 0.1]
             fluid.layers.reduce_min(x, dim=1, keep_dim=True)  # [[0.2], [0.1]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_min(x, dim=[1, 2]) # [1.0, 5.0]
+            fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0]
     """
     helper = LayerHelper('reduce_min', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
     helper.append_op(
         type='reduce_min',
         inputs={'X': input},
         outputs={'Out': out},
         attrs={
-            'dim': dim if dim != None else 0,
+            'dim': dim if dim != None else [0],
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
+
+
+def reduce_prod(input, dim=None, keep_dim=False, name=None):
+    """
+    Computes the product of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (list|int|None): The dimensions along which the product is performed. If
+            :attr:`None`, multipy all elements of :attr:`input` and return a
+            Tensor variable with a single element, otherwise must be in the
+            range :math:`[-rank(input), rank(input))`. If :math:`dim[i] < 0`,
+            the dimension to reduce is :math:`rank + dim[i]`.
+        keep_dim (bool|False): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the
+            layer will be named automatically.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_prod(x)  # [0.0002268]
+            fluid.layers.reduce_prod(x, dim=0)  # [0.02, 0.06, 0.3, 0.63]
+            fluid.layers.reduce_prod(x, dim=-1)  # [0.027, 0.0084]
+            fluid.layers.reduce_prod(x, dim=1,
+                                     keep_dim=True)  # [[0.027], [0.0084]]
+
+            # x is a Tensor variable with shape [2, 2, 2] and elements as below:
+            #      [[[1.0, 2.0], [3.0, 4.0]],
+            #      [[5.0, 6.0], [7.0, 8.0]]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_prod(x, dim=[1, 2]) # [24.0, 1680.0]
+            fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0]
+    """
+    helper = LayerHelper('reduce_prod', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
+    helper.append_op(
+        type='reduce_prod',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else [0],
             'keep_dim': keep_dim,
             'reduce_all': True if dim == None else False
         })
@@ -2202,7 +3216,7 @@ def split(input, num_or_sections, dim=-1, name=None):
                        will be named automatically.
 
     Returns:
-        List: The list of segmented tensor variables.
+        list(Variable): The list of segmented tensor variables.
 
     Examples:
         .. code-block:: python
@@ -2212,7 +3226,8 @@ def split(input, num_or_sections, dim=-1, name=None):
             x0.shape  # [3, 3, 5]
             x1.shape  # [3, 3, 5]
             x2.shape  # [3, 3, 5]
-            x0, x1, x2 = fluid.layers.split(x, num_or_sections=[2, 3, 4], dim=1)
+            x0, x1, x2 = fluid.layers.split(
+                x, num_or_sections=[2, 3, 4], dim=1)
             x0.shape  # [3, 2, 5]
             x1.shape  # [3, 3, 5]
             x2.shape  # [3, 4, 5]
@@ -2251,78 +3266,51 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
     The l2 normalize layer normalizes `x` along dimension `axis` using an L2
     norm. For a 1-D tensor (`dim` is fixed to 0), this layer computes
 
-    output = x / sqrt(max(sum(x**2), epsilon))
+    .. math::
+
+        y = \\frac{x}{ \sqrt{\sum {x^2} + epsion }}
 
     For `x` with more dimensions, this layer independently normalizes each 1-D
     slice along dimension `axis`.
 
     Args:
-       x(Variable|list): The input tensor to l2_normalize layer.
-       axis(int): Dimension along which to normalize the input.
-       epsilon(float): A lower bound value for `x`'s l2 norm. sqrt(epsilon) will
-                       be used as the divisor if the l2 norm of `x` is less than
-                       sqrt(epsilon).
-       name(str|None): A name for this layer(optional). If set None, the layer
-                       will be named automatically.
-
+        x(Variable|list): The input tensor to l2_normalize layer.
+        axis(int): The axis on which to apply normalization. If `axis < 0`, \
+            the dimension to normalization is rank(X) + axis. -1 is the
+            last dimension.
+        epsilon(float): The epsilon value is used to avoid division by zero, \
+            the defalut value is 1e-10.
+        name(str|None): A name for this layer(optional). If set None, the layer \
+            will be named automatically.
 
     Returns:
-        Variable: The output tensor variable.
+        Variable: The output tensor variable is the same shape with `x`.
 
     Examples:
+
         .. code-block:: python
 
-          data = fluid.layers.data(name="data",
-                                   shape=(3, 17, 13),
-                                   dtype="float32")
-          normed = fluid.layers.l2_normalize(x=data, axis=1)
+            data = fluid.layers.data(name="data",
+                                     shape=(3, 17, 13),
+                                     dtype="float32")
+            normed = fluid.layers.l2_normalize(x=data, axis=1)
     """
 
     if len(x.shape) == 1:
         axis = 0
-
     helper = LayerHelper("l2_normalize", **locals())
 
-    square = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(type="square", inputs={"X": x}, outputs={"Out": square})
-
-    reduced_sum = helper.create_tmp_variable(dtype=x.dtype)
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    norm = helper.create_tmp_variable(dtype=x.dtype)
     helper.append_op(
-        type="reduce_sum",
-        inputs={"X": square},
-        outputs={"Out": reduced_sum},
+        type="norm",
+        inputs={"X": x},
+        outputs={"Out": out,
+                 "Norm": norm},
         attrs={
-            "dim": 1 if axis is None else axis,
-            "keep_dim": True,
-            "reduce_all": False
+            "axis": 1 if axis is None else axis,
+            "epsilon": epsilon,
         })
-
-    # TODO(caoying) A lower bound value epsilon for the norm is needed to
-    # imporve the numeric stability of reciprocal. This requires a maximum_op.
-    rsquare = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type="reciprocal", inputs={"X": reduced_sum}, outputs={"Out": rsquare})
-
-    # TODO(caoying) the current elementwise_mul operator does not support a
-    # general broadcast rule which broadcasts input(Y) to have the same
-    # dimension with Input(X) starting from a specified dimension. So this
-    # exanpsion is requred. Once a general broadcast rule is spported, this
-    # expanding canbe removed.
-    rsquare_expanded = helper.create_tmp_variable(dtype=x.dtype)
-    expand_times = [1] * len(x.shape)
-    expand_times[axis] = int(x.shape[axis])
-    helper.append_op(
-        type="expand",
-        inputs={"X": rsquare},
-        outputs={"Out": rsquare_expanded},
-        attrs={"expand_times": expand_times})
-
-    out = helper.create_tmp_variable(dtype=x.dtype)
-    helper.append_op(
-        type="elementwise_mul",
-        inputs={"X": x,
-                "Y": rsquare_expanded},
-        outputs={"Out": out})
     return out
 
 
@@ -2432,8 +3420,82 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
     return out
 
 
-def edit_distance(input, label, normalized=True, ignored_tokens=None,
-                  name=None):
+def topk(input, k, name=None):
+    """
+    This operator is used to find values and indices of the k largest entries
+    for the last dimension.
+
+    If the input is a vector (1-D Tensor), finds the k largest entries in the vector
+    and outputs their values and indices as vectors. Thus values[j] is the j-th
+    largest entry in input, and its index is indices[j].
+
+    If the input is a Tensor with higher rank, this operator computes the top k
+    entries along the last dimension.
+
+    For example:
+
+    .. code-block:: text
+
+        If:
+            input = [[5, 4, 2, 3],
+                     [9, 7, 10, 25],
+                     [6, 2, 10, 1]]
+            k = 2
+
+        Then:
+            The first output:
+            values = [[5, 4],
+                      [10, 25],
+                      [6, 10]]
+
+            The second output:
+            indices = [[0, 1],
+                       [2, 3],
+                       [0, 2]]
+
+    Args:
+        input(Variable): The input variable which can be a vector or Tensor with
+            higher rank.
+        k(int):  The number of top elements to look for along the last dimension
+                 of input.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+                       Default: None
+
+    Returns:
+        Tuple[Variable]: A tuple with two elements. Each element is a Variable.
+        The first one is k largest elements along each last
+        dimensional slice. The second one is indices of values
+        within the last dimension of input.
+
+    Raises:
+        ValueError: If k < 1 or k is not less than the last dimension of input
+
+    Examples:
+        .. code-block:: python
+
+            top5_values, top5_indices = layers.topk(input, k=5)
+    """
+    shape = input.shape
+    if k < 1 or k >= shape[-1]:
+        raise ValueError("k must be greater than 0 and less than %d." %
+                         (shape[-1]))
+
+    helper = LayerHelper("top_k", **locals())
+    values = helper.create_tmp_variable(dtype=input.dtype)
+    indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [values],
+                 "Indices": [indices]},
+        attrs={"k": k})
+    values.stop_gradient = True
+    indices.stop_gradient = True
+    return values, indices
+
+
+def edit_distance(input, label, normalized=True, ignored_tokens=None):
     """
     EditDistance operator computes the edit distances between a batch of
     hypothesis strings and their references. Edit distance, also called
@@ -2447,26 +3509,23 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
 
     "kitten" -> "sitten" -> "sittin" -> "sitting"
 
-    Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with
+    The input is a LoDTensor consisting of all the hypothesis strings with
     the total number denoted by `batch_size`, and the separation is specified
     by the LoD information. And the `batch_size` reference strings are arranged
-    in order in the same way in the LoDTensor Input(Refs).
+    in order in the same way in the input LoDTensor.
 
-    Output(Out) contains the `batch_size` results and each stands for the edit
+    The output contains the `batch_size` results and each stands for the edit
     distance for a pair of strings respectively. If Attr(normalized) is true,
     the edit distance will be divided by the length of reference string.
 
     Args:
-
         input(Variable): The indices for hypothesis strings.
-
         label(Variable): The indices for reference strings.
-
-        normalized(bool): Indicated whether to normalize the edit distance by
+        normalized(bool, default True): Indicated whether to normalize the edit distance by
                           the length of reference string.
-
-        ignored_tokens(list of int): Tokens that should be removed before
+        ignored_tokens(list<int>, default None): Tokens that should be removed before
                                      calculating edit distance.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: sequence-to-sequence edit distance in shape [batch_size, 1].
@@ -2476,7 +3535,6 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
 
             x = fluid.layers.data(name='x', shape=[8], dtype='float32')
             y = fluid.layers.data(name='y', shape=[7], dtype='float32')
-
             cost = fluid.layers.edit_distance(input=x,label=y)
     """
     helper = LayerHelper("edit_distance", **locals())
@@ -2496,7 +3554,7 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
         helper.append_op(
             type="sequence_erase",
             inputs={"X": [label]},
-            outputs={"Out": [erase_label]},
+            outputs={"Out": [erased_label]},
             attrs={"tokens": ignored_tokens})
         label = erased_label
 
@@ -2517,6 +3575,7 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None,
 def ctc_greedy_decoder(input, blank, name=None):
     """
     This op is used to decode sequences by greedy policy by below steps:
+
     1. Get the indexes of max value for each row in input. a.k.a.
        numpy.argmax(input, axis=0).
     2. For each sequence in result of step1, merge repeated tokens between two
@@ -2538,7 +3597,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.2, 0.2, 0.1, 0.5],
                       [0.5, 0.1, 0.3, 0.1]]
 
-        input.lod = [[0, 4, 8]]
+        input.lod = [[4, 4]]
 
         Then:
 
@@ -2546,7 +3605,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                        [1],
                        [3]]
 
-        output.lod = [[0, 2, 3]]
+        output.lod = [[2, 1]]
 
     Args:
 
@@ -2556,14 +3615,14 @@ def ctc_greedy_decoder(input, blank, name=None):
                          where Lp is the sum of all input sequences' length and
                          num_classes is the true number of classes. (not
                          including the blank label).
-
         blank(int): the blank label index of Connectionist Temporal
                     Classification (CTC) loss, which is in thehalf-opened
                     interval [0, num_classes + 1).
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: CTC greedy decode result. If all the sequences in result were
-        empty, the result LoDTensor will be [-1] with LoD [[0]] and dims [1, 1].
+        empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1].
 
     Examples:
         .. code-block:: python
@@ -2573,15 +3632,7 @@ def ctc_greedy_decoder(input, blank, name=None):
             cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
     """
     helper = LayerHelper("ctc_greedy_decoder", **locals())
-    # top 1 op
-    topk_out = helper.create_tmp_variable(dtype=input.dtype)
-    topk_indices = helper.create_tmp_variable(dtype="int64")
-    helper.append_op(
-        type="top_k",
-        inputs={"X": [input]},
-        outputs={"Out": [topk_out],
-                 "Indices": [topk_indices]},
-        attrs={"k": 1})
+    _, topk_indices = topk(input, k=1)
 
     # ctc align op
     ctc_out = helper.create_tmp_variable(dtype="int64")
@@ -2604,35 +3655,33 @@ def warpctc(input, label, blank=0, norm_by_times=False):
     input tensor.
 
     Args:
-       input(Variable): (LodTensor, default: LoDTensor<float>),
-         the unscaled probabilities of variable-length sequences,
+       input (Variable): The unscaled probabilities of variable-length sequences,
          which is a 2-D Tensor with LoD information.
          It's shape is [Lp, num_classes + 1], where Lp is the sum of all input
          sequences' length and num_classes is the true number of classes.
          (not including the blank label).
-       label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
-         of variable-length sequence, which is a 2-D Tensor with LoD
-         information. It is of the shape [Lg, 1], where Lg is th sum of
-         all labels' length.
-       blank: (int, default: 0), the blank label index of Connectionist
+       label (Variable): The ground truth of variable-length sequence,
+         which is a 2-D Tensor with LoD information. It is of the shape [Lg, 1],
+         where Lg is th sum of all labels' length.
+       blank (int, default 0): The blank label index of Connectionist
          Temporal Classification (CTC) loss, which is in the
          half-opened interval [0, num_classes + 1).
-       norm_by_times: (bool, default: false), whether to normalize
-       the gradients by the number of time-step, which is also the
-       sequence's length. There is no need to normalize the gradients
-       if warpctc layer was follewed by a mean_op.
+       norm_by_times(bool, default false): Whether to normalize the gradients
+         by the number of time-step, which is also the sequence's length.
+         There is no need to normalize the gradients if warpctc layer was
+         follewed by a mean_op.
 
     Returns:
         Variable: The Connectionist Temporal Classification (CTC) loss,
         which is a 2-D Tensor of the shape [batch_size, 1].
 
     Examples:
+
         .. code-block:: python
-            y = layers.data(
-                name='y', shape=[11, 8], dtype='float32', lod_level=1)
-            y_predict = layers.data(
-                name='y_predict', shape=[11, 1], dtype='float32')
-            cost = layers.warpctc(input=y_predict, label=y)
+
+            label = fluid.layers.data(shape=[11, 8], dtype='float32', lod_level=1)
+            predict = fluid.layers.data(shape=[11, 1], dtype='float32')
+            cost = fluid.layers.warpctc(input=predict, label=label)
 
     """
     helper = LayerHelper('warpctc', **locals())
@@ -2662,16 +3711,20 @@ def sequence_reshape(input, new_dim):
 
         x is a LoDTensor:
             x.lod  = [[0, 2, 6]]
-            x.data = [[1, 2], [3, 4],
-                      [5, 6], [7, 8], [9, 10], [11, 12]]
+            x.data = [[1,  2], [3,  4],
+                      [5,  6], [7,  8],
+                      [9, 10], [11, 12]]
             x.dims = [6, 2]
 
         set new_dim = 4
 
         then out is a LoDTensor:
+
             out.lod  = [[0, 1, 3]]
-            out.data = [[1, 2, 3, 4],
-                        [5, 6, 7, 8], [9, 10, 11, 12]]
+
+            out.data = [[1,  2,  3,  4],
+                        [5,  6,  7,  8],
+                        [9, 10, 11, 12]]
             out.dims = [3, 4]
 
     Currently, only 1-level LoDTensor is supported and please make sure
@@ -2679,19 +3732,19 @@ def sequence_reshape(input, new_dim):
     no remainder for each sequence.
 
     Args:
-       input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
-                with shape being [N, M] where M for dimension.
-       new_dim (int): New dimension which the input LoDTensor is reshaped to.
+
+       input (Variable): A 2-D LoDTensor with shape being [N, M] where M for dimension.
+       new_dim (int): New dimension that the input LoDTensor is reshaped to.
 
     Returns:
+
         Variable: Reshaped LoDTensor according to new dimension.
 
     Examples:
         .. code-block:: python
 
-            x = fluid.layers.data(name='x', shape=[5, 20],
-                              dtype='float32', lod_level=1)
-            x_reshaped = layers.sequence_reshape(input=x, new_dim=10)
+            x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1)
+            x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
     """
     helper = LayerHelper('sequence_reshape', **locals())
     out = helper.create_tmp_variable(helper.input_dtype())
@@ -2703,7 +3756,10 @@ def sequence_reshape(input, new_dim):
     return out
 
 
-@autodoc()
+# FIXME(wuyi): let docstring_checker.py understand @autodoc.
+# For now, the comments in c++ use types like Tensor, but in python side
+# the type is often "Variable", and arguments may vary.
+@templatedoc(op_type="nce")
 def nce(input,
         label,
         num_total_classes,
@@ -2711,7 +3767,50 @@ def nce(input,
         param_attr=None,
         bias_attr=None,
         num_neg_samples=None):
-    helper = LayerHelper('nce', **locals())
+    """
+    ${comment}
+
+    Args:
+        input (Variable): input variable.
+        label (Variable): label.
+        num_total_classes (int):${num_total_classes_comment}
+        sample_weight (Variable|None): A Variable of shape [batch_size, 1]
+            storing a weight for each sample. The default weight for each
+            sample is 1.0.
+        param_attr (ParamAttr|None): attributes for parameter
+        bias_attr (ParamAttr|None): attributes for bias
+        num_neg_samples (int): ${num_neg_samples_comment}
+
+    Returns:
+        Variable: The output nce loss.
+
+    Examples:
+        .. code-block:: python
+
+            window_size = 5
+            words = []
+            for i in xrange(window_size):
+                words.append(layers.data(
+                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
+
+            dict_size = 10000
+            label_word = int(window_size / 2) + 1
+
+            embs = []
+            for i in xrange(window_size):
+                if i == label_word:
+                    continue
+
+                emb = layers.embedding(input=words[i], size=[dict_size, 32],
+                                       param_attr='emb.w', is_sparse=True)
+                embs.append(emb)
+
+            embs = layers.concat(input=embs, axis=1)
+            loss = layers.nce(input=embs, label=words[label_word],
+                          num_total_classes=dict_size, param_attr='nce.w',
+                          bias_attr='nce.b')
+    """
+    helper = LayerHelper('nce', **locals())
     assert isinstance(input, Variable)
     dim = input.shape[1]
     assert isinstance(label, Variable)
@@ -2760,16 +3859,15 @@ def nce(input,
 
 def transpose(x, perm, name=None):
     """
-    **transpose Layer**
-
     Permute the dimensions of `input` according to `perm`.
 
     The `i`-th dimension  of the returned tensor will correspond to the
     perm[i]-th dimension of `input`.
 
     Args:
-       input (Variable): (Tensor), A Tensor.
-       perm (list): A permutation of the dimensions of `input`.
+        x (Variable): The input Tensor.
+        perm (list): A permutation of the dimensions of `input`.
+        name (str): The name of this layer. It is optional.
 
     Returns:
         Variable: A transposed Tensor.
@@ -2850,8 +3948,6 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
 
     Examples:
 
-    As an example:
-
         .. code-block:: text
 
             Given:
@@ -2893,9 +3989,9 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
 
             output.dims = {8, 9}
 
-            output.lod = [[0, 4, 8]]
+            output.lod = [[4, 4]]
 
-        The simple usage is:
+     Examples:
 
         .. code-block:: python
 
@@ -2928,29 +4024,13 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
     return out
 
 
+@templatedoc()
 def row_conv(input, future_context_size, param_attr=None, act=None):
-    """Row Conv Operator. This layer will apply lookahead convolution to
-    **input**. The input variable should be a 2D LoDTensor with shape [T, D].
-    Parameters with shape [future_context_size + 1, D] will be created. The math
-    equation of row convolution is as follows:
-
-    .. math::
-        Out_{i} = \sum_{j = i} ^ {i + \\tau} X_{j} \odot W_{i - j}
-
-    In the above equation:
-
-    * :math:`Out_{i}`: The i-th row of output variable with shape [1, D].
-    * :math:`\\tau`: Future context size.
-    * :math:`X_{j}`: The j-th row of input variable with shape [1, D].
-    * :math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D].
-
-    More details about row_conv please refer to the paper \
-    (http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf) and
-    the design document \
-    (https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645).
+    """
+    ${comment}
 
     Args:
-        input (Variable): Input variable, a 2D LoDTensor with shape [T, D].
+        input (${x_type}): ${x_comment}.
         future_context_size (int): Future context size. Please note, the shape
             of convolution kernel is [future_context_size + 1, D].
         param_attr (ParamAttr): Attributes of parameters, including
@@ -2958,14 +4038,13 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
         act (str): Non-linear activation to be applied to output variable.
 
     Returns:
-        Variable: The output tensor with same shape as input tensor.
+        ${out_comment}.
 
     Examples:
-        .. code-block:: python
-
-            x = fluid.layers.data(name='x', shape=[16],
-                            dtype='float32', lod_level=1)
-            out = fluid.layers.row_conv(input=x, future_context_size=2)
+        >>> import paddle.fluid as fluid
+        >>> x = fluid.layers.data(name='x', shape=[16],
+        >>>                        dtype='float32', lod_level=1)
+        >>> out = fluid.layers.row_conv(input=x, future_context_size=2)
     """
     helper = LayerHelper('row_conv', **locals())
     dtype = helper.input_dtype()
@@ -2981,42 +4060,23 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
     return helper.append_activation(out)
 
 
+@templatedoc()
 def multiplex(inputs, index):
     """
-    **Multiplex Layer**
+    ${comment}
 
-    Referring to the given index variable, this layer selects rows from the
-    input variables to construct a multiplex variable. Assuming that there are
-    :math:`m` input variables and :math:`I_i` represents the i-th input
-    variable and :math:`i` is in [0, :math:`m`). All input variables are
-    tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
-    Please note that rank of the input tensor should be at least 2. Each input
-    variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
-    where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
-    * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
-    variable. The given index variable should be a 2-D tensor with shape
-    [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
-    Then the output variable will be a tensor with shape [:math:`d_0`,
-    :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
-    matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
-    row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
+    >>> import paddle.fluid as fluid
+    >>> x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
+    >>> x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
+    >>> index = fluid.layers.data(name='index', shape=[1], dtype='int32')
+    >>> out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
 
     Args:
-       inputs (list): A list of variables to gather from. All variables have the
-                same shape and the rank is at least 2.
-       index (Variable): Tensor<int32>, index variable which is a 2-D tensor
-                with shape [M, 1] where M is the batch size.
+       inputs (list): ${x_comment}.
+       index (${ids_type}): ${ids_comment}.
 
     Returns:
-        Variable: Multiplex variable gathered from input variables.
-
-    Examples:
-        .. code-block:: python
-
-            x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
-            x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[1], dtype='int32')
-            out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
+        ${out_comment}.
     """
     helper = LayerHelper('multiplex', **locals())
 
@@ -3084,7 +4144,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
             data = fluid.layers.data(name='data', shape=[128], dtype='float32')
             label = fluid.layers.data(name='label', shape=[1], dtype='int64')
             fc = fluid.layers.fc(input=data, size=100)
-            out = fluid.layers.softmax_with_cross_entropy(logits=fc, label=label)
+            out = fluid.layers.softmax_with_cross_entropy(
+                logits=fc, label=label)
     """
     helper = LayerHelper('softmax_with_cross_entropy', **locals())
     softmax = helper.create_tmp_variable(dtype=logits.dtype)
@@ -3101,40 +4162,41 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
 
 def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
     """
-    **Smooth L1 Loss Operator. **
-
-    This operator computes the smooth l1 loss for X and Y.
-    The operator takes the first dimension of X and Y as batch size.
-    For each instance, it computes the smooth l1 loss element by element first
-    and then sums all the losses. So the shape of Out is [batch_size, 1].
+    This layer computes the smooth L1 loss for Variable :attr:`x` and :attr:`y`.
+    It takes the first dimension of :attr:`x` and :attr:`y` as batch size.
+    For each instance, it computes the smooth L1 loss element by element first
+    and then sums all the losses. So the shape of ouput Variable is
+    [batch_size, 1].
 
     Args:
         x (Variable): A tensor with rank at least 2. The input value of smooth
-            l1 loss op with shape [batch_size, dim1, ..., dimN].
+            L1 loss op with shape [batch_size, dim1, ..., dimN].
         y (Variable): A tensor with rank at least 2. The target value of smooth
-            l1 loss op with same shape as x.
+            L1 loss op with same shape as :attr:`x`.
         inside_weight (Variable|None):  A tensor with rank at least 2. This
-            input is optional and should have same shape with x. If provided,
-            the result of (x - y) will be multiplied by this tensor element by
-            element.
+            input is optional and should have same shape with :attr:`x`. If
+            provided, the result of (:attr:`x` - :attr:`y`) will be multiplied
+            by this tensor element by element.
         outside_weight (Variable|None): A tensor with rank at least 2. This
-            input is optional and should have same shape with x. If provided,
-            the out smooth l1 loss will be multiplied by this tensor element
-            by element.
-        sigma (float|None): Hyper parameter of smooth l1 loss op. A float scalar
-            with default value 1.0.
+            input is optional and should have same shape with :attr:`x`. If
+            provided, the out smooth L1 loss will be multiplied by this tensor
+            element by element.
+        sigma (float|None): Hyper parameter of smooth L1 loss layer. A float
+           scalar with default value 1.0.
+
     Returns:
-        Variable: A tensor with rank be 2. The output smooth l1 loss with
-            shape [batch_size, 1].
+        Variable: The output smooth L1 loss with shape [batch_size, 1].
 
     Examples:
         .. code-block:: python
 
             data = fluid.layers.data(name='data', shape=[128], dtype='float32')
-            label = fluid.layers.data(name='label', shape=[100], dtype='int64')
+            label = fluid.layers.data(
+                name='label', shape=[100], dtype='float32')
             fc = fluid.layers.fc(input=data, size=100)
             out = fluid.layers.smooth_l1(x=fc, y=label)
     """
+
     helper = LayerHelper('smooth_l1_loss', **locals())
     diff = helper.create_tmp_variable(dtype=x.dtype)
     loss = helper.create_tmp_variable(dtype=x.dtype)
@@ -3154,30 +4216,20 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
 
 def one_hot(input, depth):
     """
-    One Hot Operator. This operator creates the one-hot representations for input
-    index values. The following example will help to explain the function of this
-    operator.
+    This layer creates the one-hot representations for input indices.
 
     Args:
-        input(variable):  A Tensor/LodTensor of indices, last dimension must be 1.
-        depth(scalar): an interger defining the depth of the one hot dimension.
+        input(Variable): Input indices, last dimension must be 1.
+        depth(scalar): An interger defining the depth of the one-hot dimension.
 
     Returns:
-         The one-hot tensor or LodTensor, same as input.
+        Variable: The one-hot representations of input.
 
     Examples:
-        X is a LoDTensor:
-          X.lod = [[0, 1, 4]]
-          X.shape = [4, 1]
-          X.data = [[1], [1], [3], [0]]
-        set depth = 4
-        Out is a LoDTensor:
-          Out.lod = [[0, 1, 4]]
-          Out.shape = [4, 4]
-          Out.data = [[0., 1., 0., 0.],
-                      [0., 1., 0., 0.],
-                      [0., 0., 0., 1.],
-                      [1., 0., 0., 0.]]
+        .. code-block:: python
+
+            label = layers.data(name="label", shape=[1], dtype="float32")
+            one_hot_label = layers.one_hot(input=label, depth=10)
     """
     helper = LayerHelper("one_hot", **locals())
     one_hot_out = helper.create_tmp_variable(dtype='float32')
@@ -3191,15 +4243,23 @@ def one_hot(input, depth):
 
 def autoincreased_step_counter(counter_name=None, begin=1, step=1):
     """
-    NOTE: The counter will be automatically increased by 1 every mini-batch
-    Return the run counter of the main program, which is started with 1.
+    Create an auto-increase variable
+    which will be automatically increased by 1 every mini-batch
+    Return the run counter of the main program, default is started from 1.
 
     Args:
         counter_name(str): The counter name, default is '@STEP_COUNTER@'.
         begin(int): The first value of this counter.
         step(int): The increment step between each execution.
 
-    Returns(Variable): The global run counter.
+    Returns:
+        Variable: The global run counter.
+
+    Examples:
+        .. code-block:: python
+
+           global_step = fluid.layers.autoincreased_step_counter(
+               counter_name='@LR_DECAY_COUNTER@', begin=begin, step=1)
     """
     helper = LayerHelper('global_step_counter')
     if counter_name is None:
@@ -3218,3 +4278,909 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1):
         counter.stop_gradient = True
 
     return counter
+
+
+def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
+    """
+    Gives a new shape to the input Tensor without changing its data.
+
+    The target shape can be given by :attr:`shape` or :attr:`actual_shape`.
+    :attr:`shape` is a list of integer while :attr:`actual_shape` is a tensor
+    variable. :attr:`actual_shape` has a higher priority than :attr:`shape`
+    if it is provided, while :attr:`shape` still should be set correctly to
+    gurantee shape inference in compile-time.
+
+    Some tricks exist when specifying the target shape.
+
+    1. -1 means the value of this dimension is inferred from the total element
+    number of x and remaining dimensions. Thus one and only one dimension can
+    be set -1.
+
+    2. 0 means the actual dimension value is going to be copied from the
+    corresponding dimension of x. The indice of 0s in shape can not exceed
+    Rank(X).
+
+    Here are some examples to explain it.
+
+    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with
+    shape [6, 8] and leaving x's data unchanged.
+
+    2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    specified is [2, 3, -1, 2], the reshape operator will transform x into a
+    4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
+    case, one dimension of the target shape is set to -1, the value of this
+    dimension is inferred from the total element number of x and remaining
+    dimensions.
+
+    3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
+    is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor
+    with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case,
+    besides -1, 0 means the actual dimension value is going to be copied from
+    the corresponding dimension of x.
+
+    Args:
+        x(variable): The input tensor.
+        shape(list): The new shape. At most one dimension of the new shape can
+                     be -1.
+        actual_shape(variable): An optional input. If provided, reshape
+                                according to this given shape rather than
+                                :attr:`shape` specifying shape. That is to
+                                say :attr:`actual_shape` has a higher priority
+                                than :attr:`shape`.
+        act (str): The non-linear activation to be applied to output variable.
+        inplace(bool): If this flag is set true, the output
+                       shares data with input without copying, otherwise
+                       a new output tensor is created
+                       whose data is copied from input x.
+        name (str): The name of this layer. It is optional.
+
+    Returns:
+        Variable: The output tensor.
+
+    Raises:
+        TypeError: if actual_shape is neither Variable nor None.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(
+                name='data', shape=[2, 4, 6], dtype='float32')
+            reshaped = fluid.layers.reshape(
+                x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True)
+    """
+
+    if not (isinstance(shape, list) or isinstance(shape, tuple)):
+        raise ValueError("Input shape must be a python lsit or tuple.")
+    inputs = {"X": x}
+    if isinstance(actual_shape, Variable):
+        inputs["Shape"] = actual_shape
+    elif actual_shape is not None:
+        raise TypeError("actual_shape should either be Variable or None")
+
+    # Validate the shape
+    unk_dim_idx = -1
+    for dim_idx, dim_size in enumerate(shape):
+        if dim_size == -1:
+            assert unk_dim_idx == -1, (
+                "Only one dimension in shape can be unknown.")
+            unk_dim_idx = dim_idx
+        elif dim_size == 0:
+            assert dim_idx < len(x.shape), (
+                "The indice of 0s in shape can not exceed Rank(X).")
+        else:
+            assert dim_size > 0, (
+                "Each dimension size given in shape must not be negtive "
+                "except one unknown dimension.")
+
+    helper = LayerHelper("reshape", **locals())
+    reshaped = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="reshape",
+        inputs=inputs,
+        attrs={"shape": shape,
+               "inplace": inplace},
+        outputs={"Out": reshaped})
+
+    return helper.append_activation(reshaped)
+
+
+def lod_reset(x, y=None, target_lod=None):
+    """
+    Set LoD of :attr:`x` to a new one specified by :attr:`y` or
+    :attr:`target_lod`. When :attr:`y` provided, :attr:`y.lod` would be
+    considered as target LoD first, otherwise :attr:`y.data` would be
+    considered as target LoD. If :attr:`y` is not provided, target LoD should
+    be specified by :attr:`target_lod`. If target LoD is specified by
+    :attr:`Y.data` or :attr:`target_lod`, only one level LoD is supported.
+
+    .. code-block:: text
+
+        * Example 1:
+
+            Given a 1-level LoDTensor x:
+                x.lod =  [[ 2,           3,                   1 ]]
+                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                x.dims = [6, 1]
+
+            target_lod: [4, 2]
+
+            then we get a 1-level LoDTensor:
+                out.lod =  [[4,                          2]]
+                out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                out.dims = [6, 1]
+
+        * Example 2:
+
+            Given a 1-level LoDTensor x:
+                x.lod =  [[2,            3,                   1]]
+                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                x.dims = [6, 1]
+
+            y is a Tensor:
+                y.data = [[2, 4]]
+                y.dims = [1, 3]
+
+            then we get a 1-level LoDTensor:
+                out.lod =  [[2,            4]]
+                out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                out.dims = [6, 1]
+
+        * Example 3:
+
+            Given a 1-level LoDTensor x:
+                x.lod =  [[2,            3,                   1]]
+                x.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                x.dims = [6, 1]
+
+            y is a 2-level LoDTensor:
+                y.lod =  [[2, 2], [2, 2, 1, 1]]
+                y.data = [[1.1], [2.1], [3.1], [4.1], [5.1], [6.1]]
+                y.dims = [6, 1]
+
+            then we get a 2-level LoDTensor:
+                out.lod =  [[2, 2], [2, 2, 1, 1]]
+                out.data = [[1.0], [2.0], [3.0], [4.0], [5.0], [6.0]]
+                out.dims = [6, 1]
+
+    Args:
+        x (Variable): Input variable which could be a Tensor or LodTensor.
+        y (Variable|None): If provided, output's LoD would be derived
+                           from :attr:`y`.
+        target_lod (list|tuple|None): One level LoD which should be considered
+                                      as target LoD when :attr:`y` not provided.
+
+    Returns:
+        Variable: Output variable with LoD specified by this layer.
+
+    Raises:
+        ValueError: If :attr:`y` and :attr:`target_lod` are both None.
+
+    Examples:
+        .. code-block:: python
+
+            x = layers.data(name='x', shape=[10])
+            y = layers.data(name='y', shape=[10, 20], lod_level=2)
+            out = layers.lod_reset(x=x, y=y)
+    """
+    helper = LayerHelper("lod_reset", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    if y is not None:
+        helper.append_op(
+            type="lod_reset", inputs={'X': x,
+                                      'Y': y}, outputs={'Out': out})
+    elif target_lod is not None:
+        helper.append_op(
+            type="lod_reset",
+            inputs={'X': x},
+            attrs={'target_lod': target_lod},
+            outputs={'Out': out})
+    else:
+        raise ValueError("y and target_lod should not be both None.")
+
+    return out
+
+
+def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
+    """
+    Local Response Normalization Layer. This layer performs a type of
+    "lateral inhibition" by normalizing over local input regions.
+
+    The formula is as follows:
+
+    .. math::
+
+      Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C, c + n/2)}_{j = \\max(0, c - n/2)}(Input(j, x, y))^2\\right)^{\\beta}
+
+    In the above equation:
+
+    * :math:`n`: The number of channels to sum over.
+    * :math:`k`: The offset (avoid being divided by 0).
+    * :math:`alpha`: The scaling parameter.
+    * :math:`beta`: The exponent parameter.
+
+    Refer to `ImageNet Classification with Deep Convolutional Neural Networks
+    <https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
+
+    Args:
+        input (Variable): The input tensor of this layer, and the dimension of input tensor must be 4.
+        n (int, default 5): The number of channels to sum over.
+        k (float, default 1.0): An offset (usually positive to avoid dividing by 0).
+        alpha (float, default 1e-4): The scaling parameter.
+        beta (float, default 0.75): The exponent.
+        name (str, default None): A name for this operation.
+
+    Raises:
+        ValueError: If rank of the input tensor is not 4.
+
+    Returns:
+        A tensor variable storing the transformation result.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name="data", shape=[3, 112, 112], dtype="float32")
+          lrn = fluid.layers.lrn(input=data)
+    """
+    helper = LayerHelper('lrn', **locals())
+    dtype = helper.input_dtype()
+    input_shape = input.shape
+    dims = len(input_shape)
+
+    if dims != 4:
+        raise ValueError(
+            "dims of input must be 4(not %d), and it's order must be NCHW" %
+            (dims))
+
+    mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    lrn_out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="lrn",
+        inputs={"X": input},
+        outputs={
+            "Out": lrn_out,
+            "MidOut": mid_out,
+        },
+        attrs={"n": n,
+               "k": k,
+               "alpha": alpha,
+               "beta": beta})
+
+    return lrn_out
+
+
+def pad(x, paddings, pad_value=0., name=None):
+    """
+    Pads a tensor with a constant value given by :attr:`pad_value`, and the
+    padded width is specified by :attr:`paddings`.
+
+    Specifically, the number of values padded before the contents of :attr:`x`
+    in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
+    of values padded after the contents of :attr:`x` in dimension :attr:`i` is
+    indicated by :attr:`paddings[i+1]`.
+
+    See below for an example.
+
+    .. code-block:: text
+
+        Given:
+            x = [[1, 2], [3, 4]]
+
+            paddings = [0, 1, 1, 2]
+
+            pad_value = 0
+
+        Return:
+
+            out = [[0, 1, 2, 0, 0]
+                   [0, 3, 4, 0, 0]
+                   [0, 0, 0, 0, 0]]
+
+    Args:
+        x (Variable): The input tensor variable.
+        paddings (list): A list of integers. Its elements specify the padded
+                         width before and after for each dimension in turn.
+                         The length of :attr:paddings must be
+                         :math:`rank(x) \\times 2`.
+        pad_value (float): The constant value used to pad.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The padded tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a rank 2 tensor variable.
+            out = fluid.layers.pad(
+                x=x, paddings=[0, 1, 1, 2], pad_value=0.)
+    """
+    helper = LayerHelper('pad', input=x, **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='pad',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'paddings': paddings,
+               'pad_value': float(pad_value)})
+    return out
+
+
+def label_smooth(label,
+                 prior_dist=None,
+                 epsilon=0.1,
+                 dtype="float32",
+                 name=None):
+    """
+    Label smoothing is a mechanism to regularize the classifier layer and is
+    called label-smoothing regularization (LSR).
+
+    Label smoothing is proposed to encourage the model to be less confident,
+    since optimizing the log-likelihood of the correct label directly may
+    cause overfitting and reduce the ability of the model to adapt. Label
+    smoothing replaces the ground-truth label :math:`y` with the weighted sum
+    of itself and some fixed distribution :math:`\mu`. For class :math:`k`,
+    i.e.
+
+    .. math::
+
+        \\tilde{y_k} = (1 - \epsilon) * y_k + \epsilon * \mu_k,
+
+    where :math:`1 - \epsilon` and :math:`\epsilon` are the weights
+    respectively, and :math:`\\tilde{y}_k` is the smoothed label. Usually
+    uniform distribution is used for :math:`\mu`.
+
+    See more details about label smoothing in https://arxiv.org/abs/1512.00567.
+
+    Args:
+        label(Variable): The input variable containing the label data. The
+                          label data should use one-hot representation.
+        prior_dist(Variable): The prior distribution to be used to smooth
+                              labels. If not provided, an uniform distribution
+                              is used. The shape of :attr:`prior_dist` should
+                              be :math:`(1, class\_num)`.
+        epsilon(float): The weight used to mix up the original ground-truth
+                        distribution and the fixed distribution.
+        dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32,
+                                                  float_64, int etc.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The tensor variable containing the smoothed labels.
+
+    Examples:
+        .. code-block:: python
+
+            label = layers.data(name="label", shape=[1], dtype="float32")
+            one_hot_label = layers.one_hot(input=label, depth=10)
+            smooth_label = layers.label_smooth(
+                label=one_hot_label, epsilon=0.1, dtype="float32")
+    """
+    if epsilon > 1. or epsilon < 0.:
+        raise ValueError("The value of epsilon must be between 0 and 1.")
+    helper = LayerHelper("label_smooth", **locals())
+    label.stop_gradient = True
+    smooth_label = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="label_smooth",
+        inputs={"X": label,
+                "PriorDist": prior_dist} if prior_dist else {"X": label},
+        outputs={"Out": smooth_label},
+        attrs={"epsilon": float(epsilon)})
+    return smooth_label
+
+
+@templatedoc()
+def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${x_comment}
+        rois (Variable): ROIs (Regions of Interest) to pool over.
+        pooled_height (integer): ${pooled_height_comment} Default: 1
+        pooled_width (integer): ${pooled_width_comment} Default: 1
+        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
+
+    Returns:
+        Variable: ${out_comment}.
+
+    Examples:
+        .. code-block:: python
+
+            pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
+    """
+    helper = LayerHelper('roi_pool', **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+    argmaxes = helper.create_tmp_variable(dtype='int32')
+    helper.append_op(
+        type="roi_pool",
+        inputs={"X": input,
+                "ROIs": rois},
+        outputs={"Out": pool_out,
+                 "Argmax": argmaxes},
+        attrs={
+            "pooled_height": pooled_height,
+            "pooled_width": pooled_width,
+            "spatial_scale": spatial_scale
+        })
+    return pool_out
+
+
+def dice_loss(input, label, epsilon=0.00001):
+    """
+    Dice loss for comparing the similarity of two batch of data,
+    usually is used for binary image segmentation i.e. labels are binary.
+    The dice loss can be defined as below equation:
+
+    .. math::
+
+        dice\_loss &= 1 - \\frac{2 * intersection\_area}{total\_area} \\\\
+                  &= \\frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\\\
+                  &= \\frac{(union\_area - intersection\_area)}{total\_area}
+
+
+    Args:
+        input (Variable): The predictions with rank>=2. The first dimension is batch size,
+                          and the last dimension is class number.
+        label (Variable): The groud truth with the same rank with input. The first dimension
+                          is batch size, and the last dimension is 1.
+        epsilon (float): The epsilon will be added to the numerator and denominator.
+                         If both input and label are empty, it makes sure dice is 1.
+                         Default: 0.00001
+
+    Returns:
+        dice_loss (Variable): The dice loss with shape [1].
+
+    Examples:
+        .. code-block:: python
+
+            predictions = fluid.layers.softmax(x)
+            loss = fluid.layers.dice_loss(input=predictions, label=label, 2)
+    """
+    label = one_hot(label, depth=input.shape[-1])
+    reduce_dim = range(1, len(input.shape))
+    inse = reduce_sum(input * label, dim=reduce_dim)
+    dice_denominator = reduce_sum(
+        input, dim=reduce_dim) + reduce_sum(
+            label, dim=reduce_dim)
+    dice_score = 1 - inse * 2 / (dice_denominator + epsilon)
+    return reduce_mean(dice_score)
+
+
+def image_resize(input,
+                 out_shape=None,
+                 scale=None,
+                 name=None,
+                 resample='BILINEAR'):
+    """
+    **Resize a Batch of Images**
+
+    The input must be a tensor of the shape (num_batches, channels, in_h, in_w),
+    and the resizing only applies on the last two dimensions(hight and width).
+
+    Supporting resample methods:
+
+        'BILINEAR' : Bilinear interpolation
+
+    Args:
+        input (Variable): The input tensor of image resize layer,
+                          This is a 4-D tensor of the shape
+                          (num_batches, channels, in_h, in_w).
+        out_shape(list|tuple|Variable|None): Output shape of image resize
+                                    layer, the shape is (out_h, out_w).
+                                    Default: None
+        scale(float|None): The multiplier for the input height or width.
+                         At least one of out_shape or scale must be set.
+                         And out_shape has a higher priority than scale.
+                         Default: None
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+        resample(str): The resample method. It can only be 'BILINEAR' currently.
+                       Default: 'BILINEAR'
+
+    Returns:
+        Variable: The output is a 4-D tensor of the shape
+        (num_batches, channls, out_h, out_w).
+
+    Examples:
+        .. code-block:: python
+
+            out = fluid.layers.image_resize(input, out_shape=[12, 12])
+    """
+    resample_methods = {'BILINEAR': 'bilinear_interp'}
+    if resample not in resample_methods:
+        raise ValueError(
+            "The 'resample' of image_resize can only be 'BILINEAR' currently.")
+    if out_shape is None and scale is None:
+        raise ValueError("One of out_shape and scale must not be None")
+    helper = LayerHelper('bilinear_interp', **locals())
+    dtype = helper.input_dtype()
+
+    def _is_list_or_turple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
+    out_h = 0
+    out_w = 0
+    inputs = {"X": input}
+    if out_shape is not None:
+        if not (_is_list_or_turple_(out_shape) and
+                len(out_shape) == 2) and not isinstance(out_shape, Variable):
+            raise ValueError('out_shape should be a list or tuple or variable')
+        if _is_list_or_turple_(out_shape):
+            out_shape = list(map(int, out_shape))
+            out_h = out_shape[0]
+            out_w = out_shape[1]
+        else:
+            inputs['OutSize'] = out_shape
+    else:
+        out_h = int(input.shape[2] * scale)
+        out_w = int(input.shape[3] * scale)
+
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type=resample_methods[resample],
+        inputs=inputs,
+        outputs={"Out": out},
+        attrs={"out_h": out_h,
+               "out_w": out_w})
+    return out
+
+
+@templatedoc(op_type="bilinear_interp")
+def resize_bilinear(input, out_shape=None, scale=None, name=None):
+    """
+    ${comment}
+
+    Args:
+        input(${x_type}): ${x_comment}.
+
+        out_shape(${out_size_type}): ${out_size_comment}.
+
+        scale(float|None): The multiplier for the input height or width. At
+             least one of out_shape or scale must be set. And out_shape has
+             a higher priority than scale. Default: None.
+
+        name(str|None): The output variable name.
+
+    Returns:
+        ${out_comment}.
+    """
+
+    return image_resize(input, out_shape, scale, name, 'BILINEAR')
+
+
+def image_resize_short(input, out_short_len, resample='BILINEAR'):
+    """
+    Resize a batch of images. The short edge of input images will be
+    resized to the given 'out_short_len'. The long edge of input images
+    will be resized proportionately to make images' length-width ratio
+    constant.
+
+    Args:
+        input (Variable): The input tensor of image resize layer,
+                          This is a 4-D tensor of the shape
+                          (num_batches, channels, in_h, in_w).
+        out_short_len(int): The length of output images' short edge.
+        resample (str): resample method, default: BILINEAR.
+
+    Returns:
+        Variable: The output is a 4-D tensor of the shape
+        (num_batches, channls, out_h, out_w).
+    """
+    in_shape = input.shape
+    if len(in_shape) != 4:
+        raise ValueError(
+            "The rank of input must be 4 (num_batches, channels, in_h, in_w).")
+    hw = in_shape[2:4]
+    short_idx = hw.index(min(hw))
+    long_idx = 1 - short_idx
+    out_shape = list(hw)
+    out_shape[short_idx] = out_short_len
+    out_shape[long_idx] = int(
+        float(out_shape[long_idx]) * (float(out_short_len) / float(hw[
+            short_idx])) + 0.5)
+    return image_resize(input=input, out_shape=out_shape, resample=resample)
+
+
+def gather(input, index):
+    """
+    **Gather Layer**
+
+    Output is obtained by gathering entries of the outer-most dimension
+    of X indexed by `index` and concatenate them together.
+
+    .. math::
+
+        Out = X[Index]
+
+
+    .. code-block:: text
+
+
+                Given:
+
+                X = [[1, 2],
+                     [3, 4],
+                     [5, 6]]
+
+                Index = [1, 2]
+
+                Then:
+
+                Out = [[3, 4],
+                       [5, 6]]
+
+    Args:
+        input (Variable): The source input with rank>=1.
+        index (Variable): The index input with rank=1.
+
+    Returns:
+        output (Variable): The output is a tensor with the same rank as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            output = fluid.layers.gather(x, index)
+    """
+    helper = LayerHelper('gather', **locals())
+    dtype = helper.input_dtype()
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type="gather",
+        inputs={"X": input,
+                "Index": index},
+        outputs={"Out": out})
+    return out
+
+
+@templatedoc()
+def random_crop(x, shape, seed=None):
+    """
+    ${comment}
+
+    Args:
+        x(${x_type}): ${x_comment}
+        shape(${shape_type}): ${shape_comment}
+        seed(int|${seed_type}|None): ${seed_comment} By default, the seed will
+            get from `random.randint(-65536, 65535)`.
+
+    Returns:
+        ${out_comment}
+
+    Examples:
+        >>> img = fluid.layers.data("img", [3, 256, 256])
+        >>> cropped_img = fluid.layers.random_crop(img, shape=[3, 224, 224])
+    """
+    helper = LayerHelper("random_crop", **locals())
+    dtype = x.dtype
+    out = helper.create_tmp_variable(dtype)
+    if seed is None:
+        seed = random.randint(-65536, 65535)
+    op_attrs = {"shape": shape}
+    if isinstance(seed, int):
+        op_attrs["startup_seed"] = seed
+        seed = helper.create_variable(
+            name=unique_name.generate("random_crop_seed"),
+            dtype="int64",
+            persistable=True)
+    elif not isinstance(seed, Variable):
+        raise ValueError("'seed' must be a Variable or an int.")
+    helper.append_op(
+        type="random_crop",
+        inputs={"X": x,
+                "Seed": seed},
+        outputs={"Out": out,
+                 "SeedOut": seed},
+        attrs=op_attrs)
+    return out
+
+
+def log(x):
+    """
+    Calculates the natural log of the given input tensor, element-wise.
+
+    .. math::
+
+        Out = \\ln(x)
+
+    Args:
+        x (Variable): Input tensor.
+
+    Returns:
+        Variable: The natural log of the input tensor computed element-wise.
+
+    Examples:
+
+        .. code-block:: python
+
+            output = fluid.layers.log(x)
+    """
+    helper = LayerHelper('log', **locals())
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(type="log", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+
+def relu(x):
+    """
+    Relu takes one input data (Tensor) and produces one output data (Tensor)
+    where the rectified linear function, y = max(0, x), is applied to
+    the tensor elementwise.
+
+    .. math::
+
+        Out = \\max(0, x)
+
+    Args:
+        x (Variable): The input tensor.
+
+    Returns:
+        Variable: The output tensor with the same shape as input.
+
+    Examples:
+
+        .. code-block:: python
+
+            output = fluid.layers.relu(x)
+    """
+    helper = LayerHelper('relu', **locals())
+    dtype = helper.input_dtype(input_param_name='x')
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+
+def mean_iou(input, label, num_classes):
+    """
+    Mean Intersection-Over-Union is a common evaluation metric for
+    semantic image segmentation, which first computes the IOU for each
+    semantic class and then computes the average over classes.
+    IOU is defined as follows:
+
+    .. math::
+
+        IOU = \\frac{true\_positiv}{(true\_positive + false\_positive + false\_negative)}.
+
+    The predictions are accumulated in a confusion matrix and mean-IOU
+    is then calculated from it.
+
+
+    Args:
+        input (Variable): A Tensor of prediction results for semantic labels with type int32 or int64.
+        label (Variable): A Tensor of ground truth labels with type int32 or int64.
+                           Its shape should be the same as input.
+        num_classes (int): The possible number of labels.
+
+    Returns:
+        mean_iou (Variable): A Tensor representing the mean intersection-over-union with shape [1].
+        out_wrong(Variable): A Tensor with shape [num_classes]. The wrong numbers of each class.
+        out_correct(Variable): A Tensor with shape [num_classes]. The correct numbers of each class.
+
+    Examples:
+
+        .. code-block:: python
+
+            iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes)
+    """
+    helper = LayerHelper('mean_iou', **locals())
+    dtype = helper.input_dtype()
+    out_mean_iou = helper.create_tmp_variable(dtype='float32')
+    out_wrong = helper.create_tmp_variable(dtype='int32')
+    out_correct = helper.create_tmp_variable(dtype='int32')
+    helper.append_op(
+        type="mean_iou",
+        inputs={"Predictions": input,
+                "Labels": label},
+        outputs={
+            "OutMeanIou": out_mean_iou,
+            "OutWrong": out_wrong,
+            "OutCorrect": out_correct
+        },
+        attrs={"num_classes": num_classes})
+    return out_mean_iou, out_wrong, out_correct
+
+
+def crop(x, shape=None, offsets=None, name=None):
+    """
+    Crop input into output, as specified by offsets and shape.
+
+    .. code-block:: text
+
+        * Case 1:
+            Given
+                X = [[0, 1, 2, 0, 0]
+                     [0, 3, 4, 0, 0]
+                     [0, 0, 0, 0, 0]],
+            and
+                shape = [2, 2],
+                offsets = [0, 1],
+            output is:
+                Out = [[1, 2],
+                       [3, 4]].
+        * Case 2:
+            Given
+                X = [[0, 1, 2, 5, 0]
+                     [0, 3, 4, 6, 0]
+                     [0, 0, 0, 0, 0]],
+            and shape is tensor
+                shape = [[0, 0, 0]
+                         [0, 0, 0]]
+            and
+                offsets = [0, 1],
+
+            output is:
+                Out = [[1, 2, 5],
+                       [3, 4, 6]].
+
+    Args:
+        x (Variable): The input tensor variable.
+        shape (Variable|list/tuple of integer): The output shape is specified
+            by `shape`, which can a Variable or a list/tupe of integer.
+            If a tensor Variable, it's rank must be the same as `x`. This way
+            is suitable for the case that the output shape may be changed each
+            iteration. If a list/tupe of integer, it's length must be the same
+            as the rank of `x`
+        offsets (Variable|list/tuple of integer|None): Specifies the copping
+            offsets at each dimension. It can be a Variable or or a list/tupe
+            of integer. If a tensor Variable, it's rank must be the same as `x`.
+            This way is suitable for the case that the offsets may be changed
+            each iteration. If a list/tupe of integer, it's length must be the
+            same as the rank of `x`. If None, the offsets are 0 at each
+            dimension.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        Variable: The cropped tensor variable.
+
+    Raises:
+        ValueError: If shape is not a list, tuple or Variable.
+
+    Examples:
+
+        .. code-block:: python
+
+            x = fluid.layers.data(name="x", shape=[3, 5], dtype="float32")
+            y = fluid.layers.data(name="y", shape=[2, 3], dtype="float32")
+            crop = fluid.layers.crop(x, shape=y)
+
+            # or
+            z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
+            crop = fluid.layers.crop(z, shape=[2, 3])
+
+    """
+    helper = LayerHelper('crop', **locals())
+
+    if not (isinstance(shape, list) or isinstance(shape, tuple) or \
+        isinstance(shape, Variable)):
+        raise ValueError("The shape should be a list, tuple or Variable.")
+
+    if offsets is None:
+        offsets = [0] * len(x.shape)
+
+    out = helper.create_tmp_variable(x.dtype)
+    ipts = {'X': x}
+    attrs = {}
+    if isinstance(shape, Variable):
+        ipts['Y'] = shape
+    else:
+        attrs['shape'] = shape
+    if isinstance(offsets, Variable):
+        ipts['Offsets'] = offsets
+    else:
+        attrs['offsets'] = offsets
+
+    helper.append_op(
+        type='crop',
+        inputs=ipts,
+        outputs={'Out': out},
+        attrs=None if len(attrs) == 0 else attrs)
+    return out
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 0b88b639629ac73b16ec36aa5930c3d6a9665943..9e97ec9a6f55680a2eb44ad712ac002df4fecda5 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -17,7 +17,6 @@ __activations__ = [
     'sigmoid',
     'logsigmoid',
     'exp',
-    'relu',
     'tanh',
     'tanh_shrink',
     'softshrink',
@@ -25,9 +24,10 @@ __activations__ = [
     'abs',
     'ceil',
     'floor',
+    'cos',
+    'sin',
     'round',
     'reciprocal',
-    'log',
     'square',
     'softplus',
     'softsign',
@@ -38,8 +38,6 @@ __activations__ = [
     'relu6',
     'pow',
     'stanh',
-    'hard_shrink',
-    'thresholded_relu',
     'hard_sigmoid',
     'swish',
 ]
@@ -47,7 +45,6 @@ __activations__ = [
 __all__ = [
     'mean',
     'mul',
-    'reshape',
     'scale',
     'sigmoid_cross_entropy_with_logits',
     'elementwise_add',
@@ -59,18 +56,106 @@ __all__ = [
     'elementwise_pow',
     'clip',
     'clip_by_norm',
-    'softmax',
-    'sequence_softmax',
     'logical_and',
     'logical_or',
     'logical_xor',
     'logical_not',
-    'uniform_random',
     'uniform_random_batch_size_like',
     'gaussian_random',
     'gaussian_random_batch_size_like',
-    'cumsum',
+    'scatter',
+    'sum',
+    'slice',
+    'polygon_box_transform',
+    'shape',
+    'iou_similarity',
+    'maxout',
 ] + __activations__
 
 for _OP in set(__all__):
     globals()[_OP] = generate_layer_fn(_OP)
+
+__all__ += ["uniform_random"]
+
+_uniform_random_ = generate_layer_fn('uniform_random')
+
+
+def uniform_random(shape, dtype=None, min=None, max=None, seed=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+    return _uniform_random_(**kwargs)
+
+
+uniform_random.__doc__ = _uniform_random_.__doc__ + """
+Examples:
+
+    >>> result = fluid.layers.uniform_random(shape=[32, 784])
+"""
+
+__all__ += ['hard_shrink']
+
+_hard_shrink_ = generate_layer_fn('hard_shrink')
+
+
+def hard_shrink(x, threshold=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+    return _hard_shrink_(**kwargs)
+
+
+hard_shrink.__doc__ = _hard_shrink_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[784])
+    >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
+"""
+
+__all__ += ['cumsum']
+
+_cum_sum_ = generate_layer_fn('cumsum')
+
+
+def cumsum(x, axis=None, exclusive=None, reverse=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+
+    return _cum_sum_(**kwargs)
+
+
+cumsum.__doc__ = _cum_sum_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[32, 784])
+    >>> result = fluid.layers.cumsum(data, axis=0)
+"""
+
+__all__ += ['thresholded_relu']
+
+_thresholded_relu_ = generate_layer_fn('thresholded_relu')
+
+
+def thresholded_relu(x, threshold=None):
+    kwargs = dict()
+    for name in locals():
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+
+    _thresholded_relu_(**kwargs)
+
+
+thresholded_relu.__doc__ = _thresholded_relu_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[1])
+    >>> result = fluid.layers.thresholded_relu(data, threshold=0.4)
+"""
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index da066c34bdeba1f1b76f8d1cafd9244b2f7708fa..b6614ecf3bc16e73683f4991779769049c6800ed 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -6,7 +6,7 @@
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
+# Unlessf required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
@@ -18,6 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_
 from ..framework import Variable
 from ..initializer import Constant, force_init_on_cpu
 from ..core import VarDesc
+from layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
@@ -30,12 +31,34 @@ __all__ = [
     'assign',
     'fill_constant_batch_size_like',
     'fill_constant',
+    'argmin',
+    'argmax',
+    'argsort',
     'ones',
     'zeros',
+    'reverse',
 ]
 
 
 def create_tensor(dtype, name=None, persistable=False):
+    """
+    Create an variable, which will hold a LoDTensor with data type dtype.
+
+    Args:
+        dtype(string): 'float32'|'int32'|..., the data type of the
+            created tensor.
+        name(string): The name of the created tensor, if not set,
+            the name will be a random unique one.
+        persistable(bool): Set the persistable flag of the create tensor.
+
+    Returns:
+        Variable: The tensor variable storing the created tensor.
+
+    Examples:
+        .. code-block:: python
+
+          tensor = fluid.layers.create_tensor(dtype='float32')
+    """
     helper = LayerHelper("create_tensor", **locals())
     return helper.create_variable(
         name=helper.name, dtype=dtype, persistable=persistable)
@@ -48,7 +71,12 @@ def create_parameter(shape,
                      is_bias=False,
                      default_initializer=None):
     """
-    Create a parameter
+    Create a parameter. The parameter is a learnable variable, which can have
+    gradient, and can be optimized.
+
+    NOTE: this is a very low-level API. This API is useful when you create
+    operator by your self. instead of using layers.
+
     Args:
         shape(list[int]): shape of the parameter
         dtype(string): element type of the parameter
@@ -60,7 +88,12 @@ def create_parameter(shape,
         default_initializer(Initializer): initializer for the parameter
 
     Returns:
-        Parameter: the created parameter
+        the created parameter.
+
+    Examples:
+        >>> W = fluid.layers.create_parameter(shape=[784, 200], dtype='float32')
+        >>> data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False)
+        >>> hidden = fluid.layers.matmul(x=data, y=W)
     """
     helper = LayerHelper("create_parameter", **locals())
     if attr is None:
@@ -76,16 +109,29 @@ def create_global_var(shape,
                       force_cpu=False,
                       name=None):
     """
-    Create a global variable. such as global_step
+    Create a new variable in the global block(block 0).
+
     Args:
         shape(list[int]): shape of the variable
-        value(float): the value of the variable
-        dtype(string): element type of the parameter
-        persistable(bool): if this variable is persistable
-        force_cpu(bool): force this variable to be on CPU
+        value(float): the value of the variable. The new created 
+                      variable will be filled with it.
+        dtype(string): data type of the variable
+        persistable(bool): if this variable is persistable. 
+                           Default: False
+        force_cpu(bool): force this variable to be on CPU. 
+                         Default: False
+        name(str|None): The name of the variable. If set to None the variable 
+                        name will be generated automatically. 
+                        Default: None
 
     Returns:
         Variable: the created Variable
+
+    Examples:
+        .. code-block:: python
+
+            var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', 
+                                 persistable=True, force_cpu=True, name='new_var')
     """
     helper = LayerHelper("global_var", **locals())
     var = helper.create_global_variable(
@@ -98,8 +144,21 @@ def create_global_var(shape,
 
 def cast(x, dtype):
     """
-    This function takes in the input with input_dtype
-    and casts it to the output_dtype as the output.
+    This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts 
+    it to the output with :attr:`dtype`.
+
+    Args:
+        x (Variable): The input Variable for casting.
+        dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output Variable.
+
+    Returns:
+        Variable: The output Variable after casting.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            result = fluid.layers.cast(x=data, dtype='float64')
     """
     helper = LayerHelper('cast', **locals())
     out = helper.create_tmp_variable(dtype=dtype)
@@ -112,7 +171,7 @@ def cast(x, dtype):
     return out
 
 
-def concat(input, axis=0):
+def concat(input, axis=0, name=None):
     """
     **Concat**
 
@@ -122,13 +181,16 @@ def concat(input, axis=0):
     Args:
         input(list): List of tensors to be concatenated
         axis(int): Integer axis along which the tensors will be concatenated
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
 
     Returns:
         Variable: Output variable of the concatenation
 
     Examples:
         .. code-block:: python
-          out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
+
+           out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
     """
     helper = LayerHelper('concat', **locals())
     out = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -141,19 +203,21 @@ def concat(input, axis=0):
 
 
 def sums(input, out=None):
-    """This function performs the sum operation on the input and returns the
+    """
+    This function performs the sum operation on the input and returns the
     result as the output.
 
     Args:
         input (Variable|list): The input tensor that has the elements
                                that need to be summed up.
+        out (Variable|None): Output parameter. The sum result.
+                             Default: None
 
     Returns:
-        Variable: The tensor type variable that has the sum of input
-                  written to it.
+        Variable: the sum of input. The same as the argument 'out'
 
     Examples:
-        .. code-block::python
+        .. code-block:: python
 
           tmp = fluid.layers.zeros(shape=[10], dtype='int32')
           i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
@@ -167,11 +231,15 @@ def sums(input, out=None):
     helper = LayerHelper('sum', **locals())
     if out is None:
         out = helper.create_tmp_variable(dtype=helper.input_dtype())
-    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
+    helper.append_op(
+        type='sum',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={'use_mkldnn': False})
     return out
 
 
-def assign(input, output):
+def assign(input, output=None):
     """
     **Assign**
 
@@ -179,24 +247,24 @@ def assign(input, output):
 
     Args:
         input(Variable|numpy.ndarray): The source variable
-        output(Variable): The destination variable
+        output(Variable|None): The destination variable
 
     Returns:
         Variable: The destination variable that was supplied as the *output*.
 
     Examples:
         .. code-block:: python
+
           out = fluid.layers.create_tensor(dtype='float32')
           hidden = fluid.layers.fc(input=data, size=10)
           fluid.layers.assign(hidden, out)
     """
     helper = LayerHelper('assign', **locals())
+    if output is None:
+        output = helper.create_tmp_variable(dtype=input.dtype)
     if isinstance(input, Variable):
         helper.append_op(
-            type='scale',
-            inputs={'X': [input]},
-            outputs={'Out': [output]},
-            attrs={'scale': 1.0})
+            type='assign', inputs={'X': [input]}, outputs={'Out': [output]})
     elif isinstance(input, numpy.ndarray):
         dtype = convert_np_dtype_to_dtype_(input.dtype)
         if dtype == VarDesc.VarType.FP32:
@@ -267,6 +335,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
     return out
 
 
+@templatedoc()
 def fill_constant_batch_size_like(input,
                                   shape,
                                   dtype,
@@ -274,30 +343,28 @@ def fill_constant_batch_size_like(input,
                                   input_dim_idx=0,
                                   output_dim_idx=0):
     """
-    **fill_constant_batch_size_like**
-
-    This function creates a tensor of specified *shape*, *dtype* and batch size,
-    and initializes this with a constant supplied in *value*. The batch size is
-    obtained from the `input` tensor.
+    ${comment}
 
     It also sets *stop_gradient* to True.
 
+    >>> data = fluid.layers.fill_constant_batch_size_like(
+    >>>             input=like, shape=[1], value=0, dtype='int64')
+
     Args:
-        input(Variable): Tensor whose dimensions will be used to get batch size
-        shape(tuple|list|None): Shape of output tensor
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
-        value(float): Constant value to initialize the output tensor
-        input_dim_idx(int): Index of input's batch size dimension
-        output_dim_idx(int): Index of output's batch size dimension
+        input(${input_type}): ${input_comment}.
 
-    Returns:
-        Variable: The tensor variable storing the output
+        shape(${shape_type}): ${shape_comment}.
 
-    Examples:
-        .. code-block:: python
+        dtype(${dtype_type}): ${dtype_comment}.
+
+        value(${value_type}): ${value_comment}.
+
+        input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}.
 
-          data = fluid.layers.fill_constant_batch_size_like(
-              input=like, shape=[1], value=0, dtype='int64')
+        output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}.
+
+    Returns:
+        ${out_comment}.
     """
     helper = LayerHelper("fill_constant_batch_size_like", **locals())
     out = helper.create_tmp_variable(dtype=dtype)
@@ -316,6 +383,120 @@ def fill_constant_batch_size_like(input,
     return out
 
 
+def argmin(x, axis=0):
+    """
+    **argmin**
+
+    This function computes the indices of the min elements
+    of the input tensor's element along the provided axis.
+
+    Args:
+        x(Variable): The input to compute the indices of
+                     the min elements.
+        axis(int): Axis to compute indices along.
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.argmin(x=in, axis=0)
+          out = fluid.layers.argmin(x=in, axis=-1)
+    """
+    helper = LayerHelper("arg_min", **locals())
+    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
+    helper.append_op(
+        type='arg_min',
+        inputs={'X': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
+def argmax(x, axis=0):
+    """
+    **argmax**
+
+    This function computes the indices of the max elements
+    of the input tensor's element along the provided axis.
+
+    Args:
+        x(Variable): The input to compute the indices of
+                     the max elements.
+        axis(int): Axis to compute indices along.
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.argmax(x=in, axis=0)
+          out = fluid.layers.argmax(x=in, axis=-1)
+    """
+    helper = LayerHelper("arg_max", **locals())
+    out = helper.create_tmp_variable(VarDesc.VarType.INT64)
+    helper.append_op(
+        type='arg_max',
+        inputs={'X': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
+def argsort(input, axis=-1, name=None):
+    """
+    Performs sorting on the input Variable along the given axis, and outputs 
+    sorted data Varibale and its corresponding index Variable with the same 
+    shape as :attr:`input`.
+
+    .. code-block:: text
+    
+        For example, the given axis is -1 and the input Variable
+
+            input = [[0.15849551, 0.45865775, 0.8563702 ],
+                     [0.12070083, 0.28766365, 0.18776911]],
+
+        after argsort, the sorted Vairable becomes
+
+            out = [[0.15849551, 0.45865775, 0.8563702 ],
+                   [0.12070083, 0.18776911, 0.28766365]],
+
+        and the sorted indices along the given axis turn outs to be
+
+            indices = [[0, 1, 2], 
+                       [0, 2, 1]]
+
+    Args:
+        input(Variable): The input Variable for sorting.
+        axis(int): The axis along which to sort the input Variable. When 
+                   :attr:`axis` < 0, the actual axis will be :attr:`axis` + 
+                   rank(:attr:`input`). Default -1, the last dimension.
+        name(str|None): (optional) A name for this layer. If set None, the 
+                   layer will be named automatically.
+
+    Returns:
+        tuple: A tuple of sorted data Variable and the sorted indices.
+
+    Examples:
+        .. code-block:: python
+
+            input = fluid.layers.data(data=[2, 3])
+            out, indices = fluid.layers.argsort(input, axis=0)
+    """
+    helper = LayerHelper("argsort", **locals())
+    out = helper.create_tmp_variable(dtype=input.dtype, stop_gradient=True)
+    ids = helper.create_tmp_variable(VarDesc.VarType.INT64, stop_gradient=True)
+    helper.append_op(
+        type='argsort',
+        inputs={'X': input},
+        outputs={'Out': out,
+                 'Indices': ids},
+        attrs={'axis': axis})
+    return out, ids
+
+
 def ones(shape, dtype, force_cpu=False):
     """
     **ones**
@@ -350,11 +531,12 @@ def zeros(shape, dtype, force_cpu=False):
     It also sets *stop_gradient* to True.
 
     Args:
-        shape(tuple|list|None): Shape of output tensor
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor
+        shape(tuple|list|None): Shape of output tensor.
+        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor.
+        force_cpu(bool, default False): Whether to make output stay on CPU.
 
     Returns:
-        Variable: The tensor variable storing the output
+        Variable: The tensor variable storing the output.
 
     Examples:
         .. code-block:: python
@@ -364,6 +546,40 @@ def zeros(shape, dtype, force_cpu=False):
     return fill_constant(value=0.0, **locals())
 
 
+def reverse(x, axis):
+    """
+    **reverse**
+
+    This function reverse the input 'x' along given axises.
+
+    Args:
+        x(Vairbale): the input to be reversed.
+        axis(int|tuple|list): Axis that along which order of elements
+                    is reversed. If it is a tuple or a list, reversing
+                    will be apply on each axis in the tuple or list.
+
+    Returns:
+        Variable: The reversed tensor.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.reverse(x=in, axis=0)
+          # or:
+          out = fluid.layers.reverse(x=in, axis=[0,1])
+    """
+    if isinstance(axis, int):
+        axis = [axis]
+    helper = LayerHelper("reverse", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='reverse',
+        inputs={'Input': x},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
 def save(x, file_path, overwrite=True):
     """
     Saves a variable as a file.
@@ -371,9 +587,9 @@ def save(x, file_path, overwrite=True):
     Args:
         x(variable): The Tensor/LoDTensor to be saved.
         file_path(str): The file path where the variable will be saved.
-        overwrite(bool): Whether or not cover the given file when it has already 
-            existed. If it's set 'False' and the file is existed, a runtime 
-            error will be thrown. 
+        overwrite(bool): Whether or not cover the given file when it has already
+            existed. If it's set 'False' and the file is existed, a runtime
+            error will be thrown.
     """
     helper = LayerHelper("save", **locals())
     helper.append_op(
@@ -389,11 +605,27 @@ def save_combine(x, file_path, overwrite=True):
     Saves a list of variables into a single file.
 
     Args:
-        x(list): A list of Tensor/LoDTensor to be saved together in a single file.
+        x(list): A list of Tensor/LoDTensor variables to be saved together in
+                 a single file.
         file_path(str): The file path where variables will be saved.
-        overwrite(bool): Whether or not cover the given file when it has already 
-            existed. If it's set 'False' and the file is existed, a runtime 
-            error will be thrown. 
+        overwrite(bool): Whether or not cover the given file when it has already
+            existed. If it's set 'False' and the file is existed, a runtime
+            error will be thrown.
+
+    Returns:
+        There is no return value.
+
+    Examples:
+
+        .. code-block:: python
+
+            v1 = fluid.layers.data(name="data",
+                                   shape=(4, 6),
+                                   dtype="float32")
+            v2 = fluid.layers.data(name="data",
+                                   shape=(6, 8, 4),
+                                   dtype="float32")
+            normed = fluid.layers.save_combine([v1, v2], file_path="output")
     """
     helper = LayerHelper("save_combine", **locals())
     helper.append_op(
@@ -404,22 +636,6 @@ def save_combine(x, file_path, overwrite=True):
               "overwrite": overwrite})
 
 
-def load(out, file_path):
-    """
-    Loads a variable from a given file.
-
-    Args:
-        out(variable): The variable to be read from the disk file.
-        file_path(str): The path of the disk file.
-    """
-    helper = LayerHelper("load", **locals())
-    helper.append_op(
-        type="load",
-        inputs={},
-        output={"Out": out},
-        args={"file_path": file_path})
-
-
 def load_combine(out, file_path):
     """
     Loads a list of vairables from a single file.
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2b3186c1e8dd84e1527ff18744bd611f1f74c5f
--- /dev/null
+++ b/python/paddle/fluid/lod_tensor.py
@@ -0,0 +1,134 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import core
+import numpy as np
+
+__all__ = ['create_lod_tensor', 'create_random_int_lodtensor']
+
+
+def create_lod_tensor(data, recursive_seq_lens, place):
+    """
+    Create a lod tensor from a numpy array, a list, or an existing lod tensor.
+
+    Create a lod tensor by doing the following:
+
+    1. Check that the length-based level of detail (LoD) also known as 
+       recursive_sequence_lengths of the input is valid.
+
+    2. Convert recursive_sequence_lengths to a offset-based LoD.
+
+    3. Copy the data from a numpy array, a list or a existing lod tensor to
+       CPU or GPU device (based on input place).
+
+    4. Set the level of detail (LoD) using the offset-based LoD.
+    
+    Examples:
+
+        Suppose we want LoDTensor to hold data for sequences of word, where each
+        word is represented by an integer. If we want to create a LoDTensor to
+        represent two sentences, one of 2 words, and one of 3 words.
+
+        Then :code:`data` can be a numpy array of integers with shape (5, 1).
+        :code:`recursive_seq_lens` will be [[2, 3]], indicating the length(# of words) in each
+        sentence. This length-based :code:`recursive_seq_lens` [[2, 3]] will be converted to
+        offset-based LoD [[0, 2, 5]] inside the function call.
+
+    Please reference :ref:`api_guide_low_level_lod_tensor` for more details
+    regarding LoD.
+
+    Args:
+        data(numpy.ndarray|list|LoDTensor): a numpy array or a LoDTensor or a
+            list holding the data to be copied.
+        recursive_seq_lens(list): a list of lists indicating the length-based level of detail 
+            info specified by the user.
+        place(Place): CPU or GPU place indicating where the data in the new
+            LoDTensor will be stored.
+
+    Returns:
+        A fluid LoDTensor object with tensor data and recursive_seq_lens info.
+    """
+    if isinstance(data, core.LoDTensor):
+        return create_lod_tensor(np.array(data), recursive_seq_lens, place)
+    elif isinstance(data, list):
+        # When input data is a list, it only deal with the case where the base element 
+        # is an index of shape [1] and dtype int64 (e.g., word id). Hence, the generated 
+        # LoDTensor will be of shape [n, 1] and dtype int64, where `n` is the total number 
+        # of words or other indexes in the sequence. 
+        new_recursive_seq_lens = []
+        for seq in data:
+            new_recursive_seq_lens.append(len(seq))
+        assert [
+            new_recursive_seq_lens
+        ] == recursive_seq_lens, "data and recursive_seq_lens do not match"
+        flattened_data = np.concatenate(data, axis=0).astype("int64")
+        flattened_data = flattened_data.reshape([len(flattened_data), 1])
+        return create_lod_tensor(flattened_data, recursive_seq_lens, place)
+    elif isinstance(data, np.ndarray):
+        tensor = core.LoDTensor()
+        tensor.set(data, place)
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
+        assert tensor.has_valid_recursive_sequence_lengths(
+        ), "the provided lod info is invalid"
+        return tensor
+    else:
+        raise TypeError(
+            "data should be either a LoDTensor, a Numpy array or a list")
+
+
+def create_random_int_lodtensor(recursive_seq_lens, base_shape, place, low,
+                                high):
+    """
+    Create a LoDTensor containing random integers.
+
+    This function is frequently used in the book examples. So we revised it
+    based on the new create_lod_tensor API and put it here in the lod_tensor
+    module to simplify the code.
+
+    The function does the following:
+
+    1. Calculate the overall shape of the LoDTensor based on the length-based
+       :code:`recursive_seq_lens` input and the shape of the basic element in
+       :code:`base_shape`.
+
+    2. Create a numpy array of this shape.
+
+    3. Create the LoDTensor using create_lod_tensor API.
+
+    Suppose we want LoDTensor to hold data for sequences of word, where each
+    word is represented by an integer. If we want to create a LoDTensor to
+    represent two sentences, one of 2 words, and one of 3 words. Then
+    'base_shape' is [1], input length-based 'recursive_seq_lens' is [[2, 3]]. 
+    Then the overall shape of the LoDTensor would be [5, 1], holding 5 words 
+    for two sentences.
+
+    Args:
+        recursive_seq_lens(list): a list of lists indicating the length-based 
+            level of detail info specified by the user.
+        base_shape(list): the shape of the basic element to be held by the
+            LoDTensor.
+        place(Place): CPU or GPU place indicating where the data in the new
+            LoDTensor will be stored.
+        low(int): the lower bound of the random integers.
+        high(int): the upper bound of the random integers.
+
+    Returns:
+        A fluid LoDTensor object with tensor data and recursive_seq_lens info. 
+    """
+    assert isinstance(base_shape, list), "base_shape should be a list"
+    # append the total number of basic elements to the front of its shape
+    overall_shape = [sum(recursive_seq_lens[-1])] + base_shape
+    # the range of integer data elements is [low, high]    
+    data = np.random.random_integers(low, high, overall_shape).astype("int64")
+    return create_lod_tensor(data, recursive_seq_lens, place)
diff --git a/python/paddle/fluid/memory_optimization_transpiler.py b/python/paddle/fluid/memory_optimization_transpiler.py
deleted file mode 100644
index 41d1eca82e8b680977f44f1756c25c37340668a4..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/memory_optimization_transpiler.py
+++ /dev/null
@@ -1,343 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from collections import defaultdict
-import framework
-from framework import Program, default_main_program, Parameter, Variable
-import backward
-from backward import _rename_arg_
-from . import core
-
-dtype_to_size = {
-    core.VarDesc.VarType.FP16: 2,
-    core.VarDesc.VarType.FP32: 4,
-    core.VarDesc.VarType.FP64: 8,
-    core.VarDesc.VarType.INT16: 2,
-    core.VarDesc.VarType.INT32: 4,
-    core.VarDesc.VarType.INT64: 8,
-    core.VarDesc.VarType.BOOL: 1
-}
-
-sub_block_ops = [
-    "while", "while_grad", "parallel_do", "parallel_do_grad",
-    "conditional_block", "conditional_block_grad"
-]
-
-PRINT_LOG = False
-
-
-class ControlFlowGraph(object):
-    def __init__(self, Program, ops, forward_num, skip_opt):
-        self._program = Program
-        self._ops = ops
-        self._forward_num = forward_num
-        self._successors = defaultdict(set)
-        self._presuccessors = defaultdict(set)
-        self._uses = defaultdict(set)
-        self._defs = defaultdict(set)
-        self._live_in = defaultdict(set)
-        self._live_out = defaultdict(set)
-        self._skip_opt = skip_opt
-
-    def _add_connections(self, connections):
-        for node1, node2 in connections:
-            self._add(node1, node2)
-
-    def _add(self, node1, node2):
-        self._successors[node1].add(node2)
-        self._presuccessors[node2].add(node1)
-
-    def _build_graph(self):
-        self.op_size = len(self._ops)
-        op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
-        self._add_connections(op_node_connections)
-        for i in range(self.op_size):
-            self._uses[i].update(self._ops[i].input_arg_names())
-            self._defs[i].update(self._ops[i].output_arg_names())
-
-    def _update_graph(self, old_name, new_name, begin_idx=0):
-        for i in range(begin_idx, self.op_size):
-            if old_name in self._uses[i]:
-                self._uses[i].remove(old_name)
-                self._uses[i].add(new_name)
-            if old_name in self._defs[i]:
-                self._defs[i].remove(old_name)
-                self._defs[i].add(new_name)
-            if old_name in self._live_in[i]:
-                self._live_in[i].remove(old_name)
-                self._live_out[i].add(new_name)
-            if old_name in self._live_out[i]:
-                self._live_out[i].remove(old_name)
-                self._live_out[i].add(new_name)
-
-    def _reach_fixed_point(self, live_in, live_out):
-        if len(live_in) != len(self._live_in):
-            return False
-        if len(live_out) != len(self._live_out):
-            return False
-        for i in range(self.op_size):
-            if live_in[i] != self._live_in[i]:
-                return False
-        for i in range(self.op_size):
-            if live_out[i] != self._live_out[i]:
-                return False
-        return True
-
-    def _dataflow_analyze(self):
-        self._build_graph()
-        live_in = defaultdict(set)
-        live_out = defaultdict(set)
-        while True:
-            for i in range(self.op_size, 0, -1):
-                live_in[i] = set(self._live_in[i])
-                live_out[i] = set(self._live_out[i])
-                for s in self._successors[i]:
-                    self._live_out[i] |= self._live_in[s]
-                self._live_in[i] = self._uses[i] | (
-                    self._live_out[i] - self._defs[i])
-            if self._reach_fixed_point(live_in, live_out):
-                break
-
-    def _get_diff(self, a, b):
-        u = a & b
-        return a - u, b - u
-
-    def _has_var(self, block_desc, var_name, is_forward):
-        if is_forward:
-            return block_desc.has_var(str(var_name))
-        else:
-            return block_desc.has_var_recursive(str(var_name))
-
-    def _find_var(self, block_desc, var_name, is_forward):
-        if is_forward:
-            return block_desc.find_var(str(var_name))
-        else:
-            return block_desc.find_var_recursive(str(var_name))
-
-    def _check_var_validity(self, block_desc, x, is_forward):
-        if str(x) == "@EMPTY@":
-            return False
-        if not self._has_var(block_desc, x, is_forward):
-            return False
-        if self._find_var(block_desc, x, is_forward).persistable():
-            return False
-        if self._find_var(block_desc, x,
-                          is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
-            return False
-        if x in self._skip_opt:
-            return False
-        if not self._find_var(block_desc, x, is_forward).shape():
-            return False
-        return True
-
-    def _update_skip_opt_set(self):
-        for i in range(self.op_size):
-            op = self._ops[i]
-            if op.type() == "fill_constant" and op.attr("force_cpu") == True:
-                self._skip_opt.update(op.output_arg_names())
-
-    def release_memory(self):
-        self._dataflow_analyze()
-        self._update_skip_opt_set()
-        fwd_id = 0
-        bwd_id = 0
-        for i in range(self.op_size):
-            op = self._ops[i]
-            if op.type() in sub_block_ops:
-                continue
-            block_desc = op.block()
-            is_forward = i < self._forward_num
-            in_diff, out_diff = self._get_diff(self._live_in[i],
-                                               self._live_out[i])
-            can_optimize = filter(
-                lambda x: self._check_var_validity(block_desc, x, is_forward),
-                in_diff)
-            if can_optimize:
-                index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1
-                delete_op = block_desc.insert_op(index)
-                delete_op.set_type("delete_var")
-                delete_op.set_input("X", can_optimize)
-                if is_forward:
-                    fwd_id += 1
-                else:
-                    bwd_id += 1
-
-    def memory_optimize(self, level=0):
-        def compare_shape(x_shape, cache_shape, opt_level):
-            if opt_level == 0:
-                return x_shape == cache_shape
-            if opt_level == 1:
-                if (x_shape[0] == -1) ^ (cache_shape[0] == -1):
-                    return False
-                x_size = abs(reduce(lambda x, y: x * y, x_shape))
-                cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
-                if x_size <= cache_size:
-                    return True
-            return False
-
-        self._dataflow_analyze()
-        self._update_skip_opt_set()
-        self.pool = []
-        for i in range(self.op_size):
-            op = self._ops[i]
-            if op.type() in sub_block_ops:
-                continue
-            block_desc = op.block()
-            self.current_block_desc = block_desc
-            is_forward = i < self._forward_num
-            if self.pool:
-                defs_can_optimize = filter(
-                    lambda x: self._check_var_validity(block_desc, x, is_forward),
-                    self._defs[i])
-                out_pair = [
-                    (x, self._find_var(block_desc, x, is_forward).shape())
-                    for x in defs_can_optimize
-                ]
-                for x, x_shape in out_pair:
-                    # If x is both in uses and defs, it can not be optimized!
-                    if x in self._uses[i]:
-                        continue
-                    for index, cache_pair in enumerate(self.pool):
-                        cache_var = cache_pair[0]
-                        cache_shape = cache_pair[1]
-                        if compare_shape(x_shape, cache_shape, level):
-                            if self._has_var(block_desc, cache_var, is_forward):
-                                x_dtype = self._find_var(block_desc, x,
-                                                         is_forward).dtype()
-                                cache_dtype = self._find_var(
-                                    block_desc, cache_var, is_forward).dtype()
-                                # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
-                                # and dtype_to_size[cache_dtype]
-                                if x_dtype == cache_dtype:
-                                    if PRINT_LOG:
-                                        print(
-                                            ("Hit Cache !!!! cache pool index "
-                                             "is %d, var name is %s, "
-                                             "cached var name is %s, "
-                                             "var shape is %s ") %
-                                            (index, x, cache_var,
-                                             str(cache_shape)))
-                                    self.pool.pop(index)
-                                    if x == cache_var:
-                                        break
-                                    _rename_arg_(
-                                        self._ops, x, cache_var, begin_idx=i)
-                                    self._program.block(block_desc.id).var(
-                                        str(x)).desc = self._find_var(
-                                            block_desc, cache_var, is_forward)
-                                    self._update_graph(
-                                        x, cache_var, begin_idx=i)
-                                    break
-
-            in_diff, out_diff = self._get_diff(self._live_in[i],
-                                               self._live_out[i])
-            can_optimize = filter(
-                lambda x: self._check_var_validity(block_desc, x, is_forward),
-                in_diff)
-            if can_optimize:
-                for var_name in can_optimize:
-                    self.pool.append((var_name, self._find_var(
-                        block_desc, var_name, is_forward).shape()))
-
-
-def _process_sub_block_pair(pdesc, sub_block_pair):
-    ops_list = []
-    block_desc = pdesc.block(0)
-    op_size = block_desc.op_size()
-    for fwd_op, bwd_op in sub_block_pair:
-        sub_block_ids = []
-        grad_sub_block_ids = []
-        sub_block_id_pair = []
-        sub_op_dict = {}
-        for i in range(op_size):
-            op = block_desc.op(i)
-            if op.type() == fwd_op:
-                sub_block_ids.append(op.attr("sub_block").id)
-                sub_op_dict[op.attr("sub_block").id] = op
-            elif op.type() == bwd_op:
-                grad_sub_block_ids.append(op.attr("sub_block").id)
-                sub_op_dict[op.attr("sub_block").id] = op
-
-        # Find fwd_op/bwd_op block pair
-        for grad_id in grad_sub_block_ids:
-            fwd_id = pdesc.block(grad_id).get_forward_block_idx()
-            if fwd_id in sub_block_ids:
-                sub_block_id_pair.append((fwd_id, grad_id))
-                sub_block_ids.remove(fwd_id)
-
-        # Get fwd_op/bwd_op block ops
-        for fwd_id, grad_id in sub_block_id_pair:
-            sub_block_ops = []
-            sub_block = pdesc.block(fwd_id)
-            block_op_size = sub_block.op_size()
-            for i in range(block_op_size):
-                sub_block_ops.append(sub_block.op(i))
-
-            grad_sub_block = pdesc.block(grad_id)
-            grad_sub_block_op_size = grad_sub_block.op_size()
-            for i in range(grad_sub_block_op_size):
-                sub_block_ops.append(grad_sub_block.op(i))
-
-            sub_op_output = set()
-            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
-            sub_op_output.update(sub_op_dict[grad_id].output_arg_names())
-            ops_list.append((sub_block_ops, block_op_size, sub_op_output))
-
-        # Process rest fwd_op block ops
-        for fwd_id in sub_block_ids:
-            sub_block_ops = []
-            sub_block = pdesc.block(fwd_id)
-            sub_block_op_size = sub_block.op_size()
-            for i in range(sub_block_op_size):
-                sub_block_ops.append(sub_block.op(i))
-            sub_op_output = set()
-            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
-            ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output))
-    return ops_list
-
-
-def _get_cfgs(input_program):
-    ops_list = []
-    pdesc = input_program.get_desc()
-    block_desc = pdesc.block(0)
-    op_size = block_desc.op_size()
-    # Get global block ops
-    ops_list.append(
-        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
-
-    sub_block_pair = [("while", "while_grad"), ("parallel_do",
-                                                "parallel_do_grad"),
-                      ("conditional_block", "conditional_block_grad")]
-
-    ops_list.extend(_process_sub_block_pair(pdesc, sub_block_pair))
-
-    cfgs = [
-        ControlFlowGraph(input_program, ops, forward_num, skip_opt)
-        for ops, forward_num, skip_opt in ops_list
-    ]
-    return cfgs
-
-
-def memory_optimize(input_program, print_log=False, level=0):
-    global PRINT_LOG
-    PRINT_LOG = print_log
-    cfgs = _get_cfgs(input_program)
-    for cfg in cfgs:
-        cfg.memory_optimize(level)
-
-
-def release_memory(input_program):
-    cfgs = _get_cfgs(input_program)
-    for cfg in cfgs:
-        cfg.release_memory()
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..17bb0826a6ea86c98a069263dfab84b99e1177ad
--- /dev/null
+++ b/python/paddle/fluid/metrics.py
@@ -0,0 +1,626 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fluid Metrics
+
+The metrics are accomplished via Python natively. 
+"""
+import numpy as np
+import copy
+import warnings
+
+__all__ = [
+    'MetricBase',
+    'CompositeMetric',
+    'Precision',
+    'Recall',
+    'Accuracy',
+    'ChunkEvaluator',
+    'EditDistance',
+    'DetectionMAP',
+    'Auc',
+]
+
+
+def _is_numpy_(var):
+    return isinstance(var, (np.ndarray, np.generic))
+
+
+def _is_number_(var):
+    return isinstance(var, int) or isinstance(var, float) or (isinstance(
+        var, np.ndarray) and var.shape == (1, ))
+
+
+def _is_number_or_matrix_(var):
+    return _is_number_(var) or isinstance(var, np.ndarray)
+
+
+class MetricBase(object):
+    """
+    Base Class for all Metrics.
+    MetricBase define a group of interfaces for the
+    model evaluation methods. Metrics accumulate metric states between
+    consecutive minibatches, at every minibatch, use update
+    interface to add current minibatch value to global states.
+    Use eval to compute accumative metric value from last reset()
+    or from scratch on.
+    If you need to custom a new metric, please inherit from MetricBase and
+    custom implementation.
+
+    Args:
+        name(str): The name of metric instance. such as, "accuracy".
+                  It needed if you want to distinct different metrics in a model.
+
+    """
+
+    def __init__(self, name):
+        self._name = str(name) if name != None else self.__class__.__name__
+
+    def __str__(self):
+        return self._name
+
+    def reset(self):
+        """
+        reset clear the states of metrics. By default, the states
+        are the members who do not has _ prefix, reset set them to inital states.
+        If you violate the implicit name rule, please also custom the reset
+        interface.
+        """
+        states = {
+            attr: value
+            for attr, value in self.__dict__.iteritems()
+            if not attr.startswith("_")
+        }
+        for attr, value in states.iteritems():
+            if isinstance(value, int):
+                setattr(self, attr, 0)
+            elif isinstance(value, float):
+                setattr(self, attr, .0)
+            elif isinstance(value, (np.ndarray, np.generic)):
+                setattr(self, attr, np.zeros_like(value))
+            else:
+                setattr(self, attr, None)
+
+    def get_config(self):
+        """
+        Get the metric and current states.
+        The states are the members who do not has "_" prefix.
+
+        Args:
+            None
+
+        Returns:
+            dict: a dict of metric and states
+        """
+        states = {
+            attr: value
+            for attr, value in self.__dict__.iteritems()
+            if not attr.startswith("_")
+        }
+        config = {}
+        config.update({"name": self._name, "states": copy.deepcopy(states)})
+        return config
+
+    def update(self, preds, labels):
+        """
+        Updates the metric states at every minibatch.
+        One user can compute the minibatch metric via pure Python, or
+        via a c++ operator.
+
+        Args:
+            preds(numpy.array): the predictions of current minibatch
+            labels(numpy.array): the labels of current minibatch, if the label is one-hot
+                               or soft-label, should custom the corresponding update rule.
+        """
+        raise NotImplementedError(
+            "Should not use it directly, please extend it.")
+
+    def eval(self):
+        """
+        Evalute the current metrics based the accumulated states.
+
+        Returns:
+            float|list(float)|numpy.array: the metrics via Python.
+        """
+        raise NotImplementedError(
+            "Should not use it directly, please extend it.")
+
+
+class CompositeMetric(MetricBase):
+    """
+    Composite multiple metrics in one instance.
+    for example, merge F1, accuracy, recall into one Metric.
+    
+    Examples:
+        .. code-block:: python
+    
+          labels = fluid.layers.data(name="data", shape=[1], dtype="int32")
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32")
+          pred = fluid.layers.fc(input=data, size=1000, act="tanh")
+          comp = fluid.metrics.CompositeMetric()
+          acc = fluid.metrics.Precision()
+          recall = fluid.metrics.Recall()
+          comp.add_metric(acc)
+          comp.add_metric(recall)
+          for pass in range(PASSES):
+            comp.reset()
+            for data in train_reader():
+                loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+            comp.update(preds=preds, labels=labels)
+            numpy_acc, numpy_recall = comp.eval()
+    """
+
+    def __init__(self, name=None):
+        super(CompositeMetric, self).__init__(name)
+        self._metrics = []
+
+    def add_metric(self, metric):
+        """
+        add one metric instance to CompositeMetric.
+
+        Args:
+            metric: a instance of MetricBase.
+        """
+        if not isinstance(metric, MetricBase):
+            raise ValueError("SubMetric should be inherit from MetricBase.")
+        self._metrics.append(metric)
+
+    def update(self, preds, labels):
+        """
+        Update every metrics in sequence.
+
+        Args:
+            preds(numpy.array): the predictions of current minibatch
+            labels(numpy.array): the labels of current minibatch, if the label is one-hot
+                               or soft-label, should custom the corresponding update rule.
+        """
+        for m in self._metrics:
+            ans.append(m.update(preds, labels))
+
+    def eval(self):
+        """
+        Evaluate every metrics in sequence.
+
+        Returns:
+            list(float|numpy.array): a list of metrics value in Python.
+        """
+        ans = []
+        for m in self._metrics:
+            ans.append(m.eval())
+        return ans
+
+
+class Precision(MetricBase):
+    """
+    Precision (also called positive predictive value) is the fraction of
+    relevant instances among the retrieved instances.
+    https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers
+
+    Note Precision is different with Accuracy in binary classifiers.
+    accuracy = true positive / total instances
+    precision = true positive / all positive instance
+
+    Examples:
+        .. code-block:: python
+
+        metric = fluid.metrics.Precision()
+        for pass in range(PASSES):
+            metric.reset()
+            for data in train_reader():
+                loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+            metric.update(preds=preds, labels=labels)
+            numpy_precision = metric.eval()
+    """
+
+    def __init__(self, name=None):
+        super(Precision, self).__init__(name)
+        self.tp = 0  # true positive
+        self.fp = 0  # false positive
+
+    def update(self, preds, labels):
+        if not _is_numpy_(preds):
+            raise ValueError("The 'preds' must be a numpy ndarray.")
+        if not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray.")
+        sample_num = labels[0]
+        for i in range(sample_num):
+            pred = preds[i].astype("int32")
+            label = labels[i]
+            if label == 1:
+                if pred == label:
+                    self.tp += 1
+                else:
+                    self.fp += 1
+
+    def eval(self):
+        ap = self.tp + self.fp
+        return float(self.tp) / ap if ap != 0 else .0
+
+
+class Recall(MetricBase):
+    """
+    Recall (also known as sensitivity) is the fraction of
+    relevant instances that have been retrieved over the
+    total amount of relevant instances
+
+    https://en.wikipedia.org/wiki/Precision_and_recall
+
+    Examples:
+        .. code-block:: python
+
+        metric = fluid.metrics.Recall()
+        for pass in range(PASSES):
+            metric.reset()
+            for data in train_reader():
+                loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+            metric.update(preds=preds, labels=labels)
+            numpy_recall = metric.eval()
+    """
+
+    def __init__(self, name=None):
+        super(Recall, self).__init__(name)
+        self.tp = 0  # true positive
+        self.fn = 0  # false negtive
+
+    def update(self, preds, labels):
+        if not _is_numpy_(preds):
+            raise ValueError("The 'preds' must be a numpy ndarray.")
+        if not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray.")
+        sample_num = labels[0]
+        for i in range(sample_num):
+            pred = preds[i].astype("int32")
+            label = labels[i]
+            if label == 1:
+                if pred == label:
+                    self.tp += 1
+            else:
+                if pred != label:
+                    self.fn += 1
+
+    def eval(self):
+        recall = self.tp + self.fn
+        return float(self.tp) / recall if recall != 0 else .0
+
+
+class Accuracy(MetricBase):
+    """
+    Accumulate the accuracy from minibatches and compute the average accuracy
+    for every pass.
+    https://en.wikipedia.org/wiki/Accuracy_and_precision
+
+    Args:
+       name: the metrics name
+
+    Examples:
+        .. code-block:: python
+
+            labels = fluid.layers.data(name="data", shape=[1], dtype="int32")
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32")
+            pred = fluid.layers.fc(input=data, size=1000, act="tanh")
+            minibatch_accuracy = fluid.layers.accuracy(pred, label)
+            accuracy_evaluator = fluid.metrics.Accuracy()
+            for pass in range(PASSES):
+                accuracy_evaluator.reset()
+                for data in train_reader():
+                    batch_size = data[0]
+                    loss = exe.run(fetch_list=[cost, minibatch_accuracy])
+                accuracy_evaluator.update(value=minibatch_accuracy, weight=batch_size)
+                numpy_acc = accuracy_evaluator.eval()
+    """
+
+    def __init__(self, name=None):
+        super(Accuracy, self).__init__(name)
+        self.value = .0
+        self.weight = .0
+
+    def update(self, value, weight):
+        """
+        Update minibatch states.
+
+        Args:
+            value(float|numpy.array): accuracy of one minibatch.
+            weight(int|float): batch size.
+        """
+        if not _is_number_or_matrix_(value):
+            raise ValueError(
+                "The 'value' must be a number(int, float) or a numpy ndarray.")
+        if not _is_number_(weight):
+            raise ValueError("The 'weight' must be a number(int, float).")
+        self.value += value * weight
+        self.weight += weight
+
+    def eval(self):
+        if self.weight == 0:
+            raise ValueError("There is no data in Accuracy Metrics. \
+                Please check layers.accuracy output has added to Accuracy.")
+        return self.value / self.weight
+
+
+class ChunkEvaluator(MetricBase):
+    """
+    Accumulate counter numbers output by chunk_eval from mini-batches and
+    compute the precision recall and F1-score using the accumulated counter
+    numbers.
+    For some basics of chunking, please refer to
+    'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+    ChunkEvalEvaluator computes the precision, recall, and F1-score of chunk detection,
+    and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
+
+    Examples:
+        .. code-block:: python
+
+            labels = fluid.layers.data(name="data", shape=[1], dtype="int32")
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32")
+            pred = fluid.layers.fc(input=data, size=1000, act="tanh")
+            precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
+                input=pred,
+                label=label)
+            metric = fluid.metrics.ChunkEvaluator()
+            for data in train_reader():
+                loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+                metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks)
+                numpy_precision, numpy_recall, numpy_f1 = metric.eval()
+    """
+
+    def __init__(self, name=None):
+        super(ChunkEvaluator, self).__init__(name)
+        self.num_infer_chunks = 0
+        self.num_label_chunks = 0
+        self.num_correct_chunks = 0
+
+    def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
+        """
+        Update the states based on the layers.chunk_eval() ouputs.
+        Args:
+            num_infer_chunks(int|numpy.array): The number of chunks in Inference on the given minibatch.
+            num_label_chunks(int|numpy.array): The number of chunks in Label on the given mini-batch.
+            num_correct_chunks(int|float|numpy.array): The number of chunks both in Inference and Label on the
+                                                  given mini-batch.
+        """
+        if not _is_number_or_matrix_(num_infer_chunks):
+            raise ValueError(
+                "The 'num_infer_chunks' must be a number(int) or a numpy ndarray."
+            )
+        if not _is_number_or_matrix_(num_label_chunks):
+            raise ValueError(
+                "The 'num_label_chunks' must be a number(int, float) or a numpy ndarray."
+            )
+        if not _is_number_or_matrix_(num_correct_chunks):
+            raise ValueError(
+                "The 'num_correct_chunks' must be a number(int, float) or a numpy ndarray."
+            )
+        self.num_infer_chunks += num_infer_chunks
+        self.num_label_chunks += num_label_chunks
+        self.num_correct_chunks += num_correct_chunks
+
+    def eval(self):
+        precision = float(
+            self.num_correct_chunks
+        ) / self.num_infer_chunks if self.num_infer_chunks else 0
+        recall = float(self.num_correct_chunks
+                       ) / self.num_label_chunks if self.num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if self.num_correct_chunks else 0
+        return precision, recall, f1_score
+
+
+class EditDistance(MetricBase):
+    """
+    Edit distance is a way of quantifying how dissimilar two strings
+    (e.g., words) are to one another by counting the minimum number
+    of operations required to transform one string into the other.
+    Refer to https://en.wikipedia.org/wiki/Edit_distance
+
+    Accumulate edit distance sum and sequence number from mini-batches and
+    compute the average edit_distance and instance error of all batches.
+
+    Args:
+        name: the metrics name
+
+    Examples:
+        .. code-block:: python
+
+            distances, seq_num = fluid.layers.edit_distance(input, label)
+            distance_evaluator = fluid.metrics.EditDistance()
+            for epoch in PASS_NUM:
+                distance_evaluator.reset()
+                for data in batches:
+                    loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics))
+                distance_evaluator.update(distances, seq_num)
+                distance, instance_error = distance_evaluator.eval()
+
+        In the above example:
+        'distance' is the average of the edit distance in a pass.
+        'instance_error' is the instance error rate in a pass.
+
+    """
+
+    def __init__(self, name):
+        super(EditDistance, self).__init__(name)
+        self.total_distance = .0
+        self.seq_num = 0
+        self.instance_error = 0
+
+    def update(self, distances, seq_num):
+        if not _is_numpy_(distances):
+            raise ValueError("The 'distances' must be a numpy ndarray.")
+        if not _is_number_(seq_num):
+            raise ValueError("The 'seq_num' must be a number(int, float).")
+        seq_right_count = np.sum(distances == 0)
+        total_distance = np.sum(distances)
+        self.seq_num += seq_num
+        self.instance_error += seq_num - seq_right_count
+        self.total_distance += total_distance
+
+    def eval(self):
+        if self.seq_num == 0:
+            raise ValueError(
+                "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
+            )
+        avg_distance = self.total_distance / self.seq_num
+        avg_instance_error = self.instance_error / self.seq_num
+        return avg_distance, avg_instance_error
+
+
+class DetectionMAP(MetricBase):
+    """
+    Calculate the detection mean average precision (mAP).
+    mAP is the metric to measure the accuracy of object detectors
+    like Faster R-CNN, SSD, etc.
+    It is the average of the maximum precisions at different recall values.
+    Please get more information from the following articles:
+      https://sanchom.wordpress.com/tag/average-precision/
+
+      https://arxiv.org/abs/1512.02325
+
+    The general steps are as follows:
+
+        1. calculate the true positive and false positive according to the input
+            of detection and labels.
+        2. calculate mAP value, support two versions: '11 point' and 'integral'.
+
+    Examples:
+        .. code-block:: python
+
+            pred = fluid.layers.fc(input=data, size=1000, act="tanh")
+            batch_map = layers.detection_map(
+                input,
+                label,
+                class_num,
+                background_label,
+                overlap_threshold=overlap_threshold,
+                evaluate_difficult=evaluate_difficult,
+                ap_version=ap_version)
+            metric = fluid.metrics.DetectionMAP()
+            for data in train_reader():
+                loss, preds, labels = exe.run(fetch_list=[cost, batch_map])
+                batch_size = data[0]
+                metric.update(value=batch_map, weight=batch_size)
+                numpy_map = metric.eval()
+    """
+
+    def __init__(self, name=None):
+        super(DetectionMAP, self).__init__(name)
+        # the current map value
+        self.value = .0
+        self.weight = .0
+
+    def update(self, value, weight):
+        if not _is_number_or_matrix_(value):
+            raise ValueError(
+                "The 'value' must be a number(int, float) or a numpy ndarray.")
+        if not _is_number_(weight):
+            raise ValueError("The 'weight' must be a number(int, float).")
+        self.value += value
+        self.weight += weight
+
+    def eval(self):
+        if self.weight == 0:
+            raise ValueError(
+                "There is no data in DetectionMAP Metrics. "
+                "Please check layers.detection_map output has added to DetectionMAP."
+            )
+        return self.value / self.weight
+
+
+class Auc(MetricBase):
+    """
+    Auc metric adapts to the binary classification.
+    Refer to https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
+    Need to note that auc metric compute the value via Python natively.
+    If you concern the speed, please use the fluid.layers.auc instead.
+
+    The `auc` function creates four local variables, `true_positives`,
+    `true_negatives`, `false_positives` and `false_negatives` that are used to
+    compute the AUC. To discretize the AUC curve, a linearly spaced set of
+    thresholds is used to compute pairs of recall and precision values. The area
+    under the ROC-curve is therefore computed using the height of the recall
+    values by the false positive rate, while the area under the PR-curve is the
+    computed using the height of the precision values by the recall.
+
+    Args:
+        name: metric name
+        curve: Specifies the name of the curve to be computed, 'ROC' [default] or
+          'PR' for the Precision-Recall-curve.
+        num_thresholds: The number of thresholds to use when discretizing the roc
+            curve.
+
+    "NOTE: only implement the ROC curve type via Python now."
+
+    Examples:
+        .. code-block:: python
+
+            pred = fluid.layers.fc(input=data, size=1000, act="tanh")
+            metric = fluid.metrics.Auc()
+            for data in train_reader():
+                loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+                metric.update(preds, labels)
+                numpy_auc = metric.eval()
+    """
+
+    def __init__(self, name, curve='ROC', num_thresholds=200):
+        super(Auc, self).__init__(name=name)
+        self._curve = curve
+        self._num_thresholds = num_thresholds
+        self._epsilon = 1e-6
+        self.tp_list = np.zeros((num_thresholds, ))
+        self.fn_list = np.zeros((num_thresholds, ))
+        self.tn_list = np.zeros((num_thresholds, ))
+        self.fp_list = np.zeros((num_thresholds, ))
+
+    def update(self, preds, labels):
+        if not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray.")
+        if not _is_numpy_(preds):
+            raise ValueError("The 'predictions' must be a numpy ndarray.")
+
+        kepsilon = 1e-7  # to account for floating point imprecisions
+        thresholds = [(i + 1) * 1.0 / (self._num_thresholds - 1)
+                      for i in range(self._num_thresholds - 2)]
+        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
+
+        # caculate TP, FN, TN, FP count
+        for idx_thresh, thresh in enumerate(thresholds):
+            tp, fn, tn, fp = 0, 0, 0, 0
+            for i, lbl in enumerate(labels):
+                if lbl:
+                    if preds[i, 1] >= thresh:
+                        tp += 1
+                    else:
+                        fn += 1
+                else:
+                    if preds[i, 1] >= thresh:
+                        fp += 1
+                    else:
+                        tn += 1
+            self.tp_list[idx_thresh] += tp
+            self.fn_list[idx_thresh] += fn
+            self.tn_list[idx_thresh] += tn
+            self.fp_list[idx_thresh] += fp
+
+    def eval(self):
+        epsilon = self._epsilon
+        num_thresholds = self._num_thresholds
+        tpr = (self.tp_list.astype("float32") + epsilon) / (
+            self.tp_list + self.fn_list + epsilon)
+        fpr = self.fp_list.astype("float32") / (
+            self.fp_list + self.tn_list + epsilon)
+        rec = (self.tp_list.astype("float32") + epsilon) / (
+            self.tp_list + self.fp_list + epsilon)
+
+        x = fpr[:num_thresholds - 1] - fpr[1:]
+        y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
+        auc_value = np.sum(x * y)
+        return auc_value
diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py
index 3b2e1a3073251a6d6460450dc957e1b5c7a873c5..9b3f2aebee73e56ee820dc8ff4c9cfabd1456aaa 100644
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
@@ -26,16 +26,87 @@ def simple_img_conv_pool(input,
                          filter_size,
                          pool_size,
                          pool_stride,
-                         act,
-                         param_attr=None,
+                         pool_padding=0,
                          pool_type='max',
+                         global_pooling=False,
+                         conv_stride=1,
+                         conv_padding=0,
+                         conv_dilation=1,
+                         conv_groups=1,
+                         param_attr=None,
+                         bias_attr=None,
+                         act=None,
                          use_cudnn=True,
                          use_mkldnn=False):
+    """
+    The simple_img_conv_pool is composed with one Convolution2d and one Pool2d.
+
+    Args:
+        input (Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            feature channel.
+        filter_size (int|list|tuple): The filter size. If filter_size is a list or
+            tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise,
+            the filter_size_H = filter_size_W = filter_size.
+        pool_size (int|list|tuple): The pooling size of Pool2d layer. If pool_size
+            is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
+            Otherwise, the pool_size_H = pool_size_W = pool_size.
+        pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
+            is a list or tuple, it must contain two integers, (pooling_stride_H, pooling_stride_W).
+            Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
+        pool_padding (int|list|tuple): The padding of Pool2d layer. If pool_padding is a list or
+            tuple, it must contain two integers, (pool_padding_H, pool_padding_W).
+            Otherwise, the pool_padding_H = pool_padding_W = pool_padding. Default 0.
+        pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
+            average-pooling. Default :math:`max`.
+        global_pooling (bool): Whether to use the global pooling. If global_pooling = true,
+            pool_size and pool_padding while be ignored. Default False
+        conv_stride (int|list|tuple): The stride size of the Conv2d Layer. If stride is a
+            list or tuple, it must contain two integers, (conv_stride_H, conv_stride_W). Otherwise,
+            the conv_stride_H = conv_stride_W = conv_stride. Default: conv_stride = 1.
+        conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
+            a list or  tuple, it must contain two integers, (conv_padding_H, conv_padding_W).
+            Otherwise, the conv_padding_H = conv_padding_W = conv_padding. Default: conv_padding = 0.
+        conv_dilation (int|list|tuple): The dilation size of the Conv2d Layer. If dilation is
+            a list or tuple, it must contain two integers, (conv_dilation_H, conv_dilation_W).
+            Otherwise, the conv_dilation_H = conv_dilation_W = conv_dilation. Default: conv_dilation = 1.
+        conv_groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
+        bias_attr (ParamAttr): Bias parameter for the Conv2d layer. Default: None
+        act (str): Activation type for Conv2d. Default: None
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
+            with mkldnn library. Default: False
+
+    Return:
+        Variable: The result of input after Convolution2d and Pool2d.
+
+    Examples:
+        .. code-block:: python
+
+            img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+            conv_pool = fluid.nets.simple_img_conv_pool(input=img,
+                                                        filter_size=5,
+                                                        num_filters=20,
+                                                        pool_size=2,
+                                                        pool_stride=2,
+                                                        act="relu")
+    """
     conv_out = layers.conv2d(
         input=input,
         num_filters=num_filters,
         filter_size=filter_size,
+        stride=conv_stride,
+        padding=conv_padding,
+        dilation=conv_dilation,
+        groups=conv_groups,
         param_attr=param_attr,
+        bias_attr=bias_attr,
         act=act,
         use_cudnn=use_cudnn,
         use_mkldnn=use_mkldnn)
@@ -45,6 +116,8 @@ def simple_img_conv_pool(input,
         pool_size=pool_size,
         pool_type=pool_type,
         pool_stride=pool_stride,
+        pool_padding=pool_padding,
+        global_pooling=global_pooling,
         use_cudnn=use_cudnn,
         use_mkldnn=use_mkldnn)
     return pool_out
@@ -60,11 +133,65 @@ def img_conv_group(input,
                    conv_with_batchnorm=False,
                    conv_batchnorm_drop_rate=0.0,
                    pool_stride=1,
-                   pool_type=None,
+                   pool_type="max",
                    use_cudnn=True,
                    use_mkldnn=False):
     """
-    Image Convolution Group, Used for vgg net.
+    The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut,
+    and Pool2d. According to the input arguments, img_conv_group will do serials of
+    computation for Input using Convolution2d, BatchNorm, DropOut, and pass the last
+    result to Pool2d.
+
+    Args:
+        input (Variable): The input image with [N, C, H, W] format.
+        conv_num_filter(list|tuple): Indicates the numbers of filter of this group.
+        pool_size (int|list|tuple): The pooling size of Pool2d Layer. If pool_size
+            is a list or tuple, it must contain two integers, (pool_size_H, pool_size_W).
+            Otherwise, the pool_size_H = pool_size_W = pool_size.
+        conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is
+            a list or tuple, its length must be equal to the length of conv_num_filter.
+            Otherwise the conv_padding of all Conv2d Layers are the same. Default 1.
+        conv_filter_size (int|list|tuple): The filter size. If filter_size is a list or
+            tuple, its length must be equal to the length of conv_num_filter.
+            Otherwise the conv_filter_size of all Conv2d Layers are the same. Default 3.
+        conv_act (str): Activation type for Conv2d Layer that is not followed by BatchNorm.
+            Default: None.
+        param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None
+        conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2d Layer.
+            If conv_with_batchnorm is a list, its length must be equal to the length of
+            conv_num_filter. Otherwise, conv_with_batchnorm indicates whether all the
+            Conv2d Layer follows a BatchNorm. Default False.
+        conv_batchnorm_drop_rate (float|list): Indicates the drop_rate of Dropout Layer
+            after BatchNorm. If conv_batchnorm_drop_rate is a list, its length must be
+            equal to the length of conv_num_filter. Otherwise, drop_rate of all Dropout
+            Layers is conv_batchnorm_drop_rate. Default 0.0.
+        pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride
+            is a list or tuple, it must contain two integers, (pooling_stride_H,
+            pooling_stride_W). Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride.
+            Default 1.
+        pool_type (str): Pooling type can be :math:`max` for max-pooling and :math:`avg` for
+            average-pooling. Default :math:`max`.
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        use_mkldnn (bool): Use mkldnn kernels or not, it is valid only when compiled
+            with mkldnn library. Default: False
+
+    Return:
+        Variable: The final result after serial computation using Convolution2d,
+            BatchNorm, DropOut, and Pool2d.
+
+    Examples:
+        .. code-block:: python
+
+            img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+            conv_pool = fluid.nets.img_conv_group(input=img,
+                                                  num_channels=3,
+                                                  conv_padding=1,
+                                                  conv_num_filter=[3, 3],
+                                                  conv_filter_size=3,
+                                                  conv_act="relu",
+                                                  pool_size=2,
+                                                  pool_stride=2)
     """
     tmp = input
     assert isinstance(conv_num_filter, list) or \
@@ -74,6 +201,7 @@ def img_conv_group(input,
         if not hasattr(obj, '__len__'):
             return [obj] * len(conv_num_filter)
         else:
+            assert len(obj) == len(conv_num_filter)
             return obj
 
     conv_padding = __extend_list__(conv_padding)
@@ -98,7 +226,7 @@ def img_conv_group(input,
             use_mkldnn=use_mkldnn)
 
         if conv_with_batchnorm[i]:
-            tmp = layers.batch_norm(input=tmp, act=conv_act)
+            tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True)
             drop_rate = conv_batchnorm_drop_rate[i]
             if abs(drop_rate) > 1e-5:
                 tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
@@ -119,6 +247,39 @@ def sequence_conv_pool(input,
                        param_attr=None,
                        act="sigmoid",
                        pool_type="max"):
+    """
+    The sequence_conv_pool is composed with Sequence Convolution and Pooling.
+
+    Args:
+        input (Variable): The input of sequence_conv, which supports variable-time
+            length input sequence. The underlying of input is a matrix with shape
+            (T, N), where T is the total time steps in this mini-batch and N is
+            the input_hidden_size
+        num_filters(int): The number of filter.
+        filter_size (int): The filter size.
+        param_attr (ParamAttr): The parameters to the Sequence_conv Layer. Default: None.
+        act (str): Activation type for Sequence_conv Layer. Default: "sigmoid".
+        pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for
+            average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling.
+            Default :math:`max`.
+
+    Return:
+        Variable: The final result after Sequence Convolution and Pooling.
+
+    Examples:
+        .. code-block:: python
+
+            input_dim = len(word_dict)
+            emb_dim = 128
+            hid_dim = 512
+            data = fluid.layers.data( ame="words", shape=[1], dtype="int64", lod_level=1)
+            emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim], is_sparse=True)
+            seq_conv = fluid.nets.sequence_conv_pool(input=emb,
+                                                     num_filters=hid_dim,
+                                                     filter_size=3,
+                                                     act="tanh",
+                                                     pool_type="sqrt")
+    """
     conv_out = layers.sequence_conv(
         input=input,
         num_filters=num_filters,
@@ -132,9 +293,9 @@ def sequence_conv_pool(input,
 
 def glu(input, dim=-1):
     """
-    The gated linear unit composed by split, sigmoid activation and elementwise
-    multiplication. Specifically, Split the input into two equal sized parts
-    :math:`a` and :math:`b` along the given dimension and then compute as
+    The Gated Linear Units(GLU) composed by split, sigmoid activation and element-wise
+    multiplication. Specifically, Split the input into two equal sized parts,
+    :math:`a` and :math:`b`, along the given dimension and then compute as
     following:
 
         .. math::
@@ -147,16 +308,16 @@ def glu(input, dim=-1):
     Args:
         input (Variable): The input variable which is a Tensor or LoDTensor.
         dim (int): The dimension along which to split. If :math:`dim < 0`, the
-            dimension to split along is :math:`rank(input) + dim`.
+            dimension to split along is :math:`rank(input) + dim`. Default -1.
 
     Returns:
-        Variable: The Tensor variable with half the size of input.
+        Variable: Variable with half the size of input.
 
     Examples:
         .. code-block:: python
 
-            # x is a Tensor variable with shape [3, 6, 9]
-            fluid.nets.glu(input=x, dim=1)  # shape of output: [3, 3, 9]
+            data = fluid.layers.data(name="words", shape=[3, 6, 9], dtype="float32")
+            output = fluid.nets.glu(input=data, dim=1)  # shape of output: [3, 3, 9]
     """
 
     a, b = layers.split(input, num_or_sections=2, dim=dim)
@@ -189,40 +350,48 @@ def scaled_dot_product_attention(queries,
     <https://arxiv.org/pdf/1706.03762.pdf>`_.
 
     Args:
-
         queries (Variable): The input variable which should be a 3-D Tensor.
         keys (Variable): The input variable which should be a 3-D Tensor.
         values (Variable): The input variable which should be a 3-D Tensor.
         num_heads (int): Head number to compute the scaled dot product
-                         attention. Default value is 1.
+            attention. Default: 1.
         dropout_rate (float): The dropout rate to drop the attention weight.
-                              Default value is 0.
+            Default: 0.0.
 
     Returns:
-
-        Variable: A 3-D Tensor computed by multi-head scaled dot product \
-                  attention.
+        Variable: A 3-D Tensor computed by multi-head scaled dot product\
+            attention.
 
     Raises:
-
         ValueError: If input queries, keys, values are not 3-D Tensors.
 
-    NOTE:
+    NOTES:
         1. When num_heads > 1, three linear projections are learned respectively
-        to map input queries, keys and values into queries', keys' and values'.
-        queries', keys' and values' have the same shapes with queries, keys
-        and values.
-
-        1. When num_heads == 1, scaled_dot_product_attention has no learnable
-        parameters.
+           to map input queries, keys and values into queries', keys' and values'.
+           queries', keys' and values' have the same shapes with queries, keys
+           and values.
+        2. When num_heads == 1, scaled_dot_product_attention has no learnable
+           parameters.
 
     Examples:
         .. code-block:: python
 
-            # Suppose q, k, v are Tensors with the following shape:
-            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
-
-            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
+            queries = fluid.layers.data(name="queries",
+                                        shape=[3, 5, 9],
+                                        dtype="float32",
+                                        append_batch_size=False)
+            queries.stop_gradient = False
+            keys = fluid.layers.data(name="keys",
+                                     shape=[3, 6, 9],
+                                     dtype="float32",
+                                     append_batch_size=False)
+            keys.stop_gradient = False
+            values = fluid.layers.data(name="values",
+                                       shape=[3, 6, 10],
+                                       dtype="float32",
+                                       append_batch_size=False)
+            values.stop_gradient = False
+            contexts = fluid.nets.scaled_dot_product_attention(queries, keys, values)
             contexts.shape  # [3, 5, 10]
     """
     if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 421963a2f9120dae3a72142681f0a30232c11166..75ee40fa9ca94cdd84ee7acbb62d6e652ac7fa33 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -11,9 +11,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import re
 from collections import defaultdict
-
+from paddle.fluid.framework import Program, Variable
 import framework
 import layers
 from backward import append_backward
@@ -23,8 +23,14 @@ from initializer import Constant
 from layer_helper import LayerHelper
 from regularizer import append_regularization_ops
 from clip import append_gradient_clip_ops, error_clip_callback
+from contextlib import contextmanager
 
-__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
+__all__ = [
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
+    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
+    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
+]
 
 
 class Optimizer(object):
@@ -35,12 +41,17 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """
 
-    def __init__(self, learning_rate, regularization=None):
+    def __init__(self,
+                 learning_rate,
+                 regularization=None,
+                 LARS_weight_decay=0.0):
         if not isinstance(learning_rate, float) and \
                 not isinstance(learning_rate, framework.Variable):
             raise TypeError("learning rate should be float or Variable")
         self.regularization = regularization
         self._learning_rate = learning_rate
+        # the learning rate type should be inferenced from loss
+        self._dtype = None
         # each program should have a independent learning rate
         # program -> Variable(learning_rate)
         self._learning_rate_map = dict()
@@ -53,6 +64,7 @@ class Optimizer(object):
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
+        self._LARS_weight_decay = LARS_weight_decay
 
     def _create_global_learning_rate(self):
         lr = self.global_learning_rate()
@@ -71,7 +83,7 @@ class Optimizer(object):
             name=unique_name.generate("learning_rate"),
             shape=[1],
             value=float(self._learning_rate),
-            dtype='float32',
+            dtype='float32' if self._dtype == None else self._dtype,
             persistable=True)
 
     def global_learning_rate(self, program=None):
@@ -92,10 +104,15 @@ class Optimizer(object):
         # create learning rate variable for every parameter
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
-        if param_lr == 1.0:
-            return self.global_learning_rate()
+        if type(param_lr) == Variable:
+            # param learning rate has been updated (LARS)
+            print("returns updated param lr ", param_lr)
+            return param_lr
         else:
-            return self.global_learning_rate() * param_lr
+            if param_lr == 1.0:
+                return self.global_learning_rate()
+            else:
+                return self.global_learning_rate() * param_lr
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -119,7 +136,12 @@ class Optimizer(object):
         """
         pass
 
-    def _add_accumulator(self, name, param, dtype=None, fill_value=0.0):
+    def _add_accumulator(self,
+                         name,
+                         param,
+                         dtype=None,
+                         fill_value=0.0,
+                         shape=None):
         """Utility function to add an accumulator for a parameter
 
         Args:
@@ -133,17 +155,19 @@ class Optimizer(object):
                 param.name in self._accumulators[name]):
             raise Exception("Accumulator {} already exists for parameter {}".
                             format(name, param.name))
-
+        if shape == None:
+            shape = param.shape
         assert isinstance(self.helper, LayerHelper)
         var = self.helper.create_global_variable(
             name=unique_name.generate(name),
             persistable=True,
             dtype=dtype or param.dtype,
             type=param.type,
-            shape=param.shape)
+            shape=shape)
         self.helper.set_variable_initializer(
             var, initializer=Constant(value=float(fill_value)))
         self._accumulators[name][param.name] = var
+        return var
 
     def _get_accumulator(self, name, param):
         """Utility function to fetch an accumulator for a parameter
@@ -168,15 +192,15 @@ class Optimizer(object):
         """Add optimization operators to update gradients to variables.
 
         Args:
-          loss: the target that this optimization is for.
-          parameters_and_grads: a list of (variable, gradient) pair to update.
+          loss(Variable): the target that this optimization is for.
+          parameters_and_grads(list(tuple(Variable, Variable))):
+          a list of (variable, gradient) pair to update.
 
         Returns:
           return_op_list: a list of operators that will complete one step of
           optimization. This will include parameter update ops, global step
           update ops and any other custom ops required by subclasses to manage
           their internal state.
-          :param startup_program:
         """
         # This is a default implementation of create_optimization_pass that
         # can be shared by most optimizers. This implementation assumes that
@@ -187,6 +211,7 @@ class Optimizer(object):
 
         # Create any accumulators
         program = loss.block.program
+        self._dtype = loss.dtype
         with program_guard(program, startup_program):
             global_block = framework.default_main_program().global_block()
             start = len(global_block.ops)
@@ -194,14 +219,20 @@ class Optimizer(object):
             self._create_accumulators(loss.block,
                                       [p[0] for p in parameters_and_grads])
             self._create_global_learning_rate()
+            if self._LARS_weight_decay > 0.0:
+                layers.append_LARS(parameters_and_grads,
+                                   self.global_learning_rate(),
+                                   self._LARS_weight_decay)
 
             optimize_ops = []
             for param_and_grad in parameters_and_grads:
-                if param_and_grad[0].trainable is True and param_and_grad[
-                        1] is not None:
-                    optimize_op = self._append_optimize_op(loss.block,
-                                                           param_and_grad)
-                    optimize_ops.append(optimize_op)
+                with param_and_grad[0].block.program.optimized_guard(
+                        param_and_grad[0]):
+                    if param_and_grad[0].trainable is True and param_and_grad[
+                            1] is not None:
+                        optimize_op = self._append_optimize_op(loss.block,
+                                                               param_and_grad)
+                        optimize_ops.append(optimize_op)
 
             # Get custom finish ops for subclasses
             # FIXME: Need to fix this once we figure out how to handle dependencies
@@ -223,6 +254,8 @@ class Optimizer(object):
         params_grads = append_backward(loss, parameter_list, no_grad_set,
                                        [error_clip_callback])
 
+        params_grads = sorted(params_grads, key=lambda x: x[0].name)
+
         params_grads = append_gradient_clip_ops(params_grads)
 
         # Add regularization if any
@@ -235,7 +268,22 @@ class Optimizer(object):
 
 
 class SGDOptimizer(Optimizer):
-    """ Simple SGD optimizer without any state.
+    """
+    Optimizer of the stochastic gradient descent algorithm.
+
+    .. math::
+
+        param\_out = param - learning\_rate * grad
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+        Can be a float value or a Variable with one float value as data element.
+
+    Examples:
+        .. code-block:: python
+
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.2)
+            sgd_optimizer.minimize(cost)
     """
 
     def __init__(self, learning_rate, **kwargs):
@@ -261,7 +309,37 @@ class SGDOptimizer(Optimizer):
 
 
 class MomentumOptimizer(Optimizer):
-    """Simple Momentum optimizer with velocity state
+    """
+
+    Simple Momentum optimizer with velocity state
+
+    This optimizer has a flag for Nestrov Momentum.
+
+    The update equations are as follows:
+
+    .. math::
+
+        & velocity = mu * velocity + gradient
+
+        & if (use\_nesterov):
+
+        &\quad   param = param - gradient * learning\_rate + mu * velocity * learning\_rate
+
+        & else:
+
+        &\quad   param = param - learning\_rate * velocity
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+        Can be a float value or a Variable with one float value as data element.
+        momentum (float): momentum factor
+        use_nesterov (bool): enables Nesterov momentum
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1)
+            optimizer.minimize(cost)
     """
     _velocity_acc_str = "velocity"
 
@@ -305,7 +383,32 @@ class MomentumOptimizer(Optimizer):
 
 
 class AdagradOptimizer(Optimizer):
-    """Simple Adagrad optimizer with moment state
+    """
+    **Adaptive Gradient Algorithm (Adagrad)**
+
+    The update is done as follows:
+
+    .. math::
+
+        moment\_out &= moment + grad * grad
+
+        param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
+
+    The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+    does not have the epsilon attribute. It is added here in our implementation
+    as also proposed here: http://cs231n.github.io/neural-networks-3/#ada
+    for numerical stability to avoid the division by zero error.
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+        Can be a float value or a Variable with one float value as data element.
+        epsilon (float): a small float value for numerical stability.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
+            optimizer.minimize(cost)
     """
     _moment_acc_str = "moment"
 
@@ -346,7 +449,40 @@ class AdagradOptimizer(Optimizer):
 
 
 class AdamOptimizer(Optimizer):
-    """Implements the Adam Optimizer
+    """
+    This implements the Adam optimizer from Section 2 of the Adam
+    paper : https://arxiv.org/abs/1412.6980.
+    Adam is a first-order gradient-based optimization method based on
+    adaptive estimates of lower-order moments.
+
+    Adam updates:
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad
+
+        moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad
+
+        learning\_rate & = learning\_rate * \\
+                          \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t}
+
+        param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+        Can be a float value or a Variable with one float value as data element.
+        beta1 (float): The exponential decay rate for the 1st moment estimates.
+        beta2 (float): The exponential decay rate for the 2nd moment estimates.
+        epsilon (float): a small float value for numerical stability.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adam(learning_rate=0.2)
+            optimizer.minimize(cost)
+
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
@@ -376,7 +512,7 @@ class AdamOptimizer(Optimizer):
         beta_shape = [1]
         self._beta1_pow_acc = self.helper.create_global_variable(
             name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32',
+            dtype='float32' if self._dtype == None else self._dtype,
             shape=beta_shape,
             lod_level=0,
             persistable=True)
@@ -385,7 +521,7 @@ class AdamOptimizer(Optimizer):
 
         self._beta2_pow_acc = self.helper.create_global_variable(
             name=unique_name.generate('beta2_pow_acc'),
-            dtype='float32',
+            dtype='float32' if self._dtype == None else self._dtype,
             shape=beta_shape,
             lod_level=0,
             persistable=True)
@@ -451,7 +587,42 @@ class AdamOptimizer(Optimizer):
 
 
 class AdamaxOptimizer(Optimizer):
-    """Implements the Adamax Optimizer
+    """
+    We implement the Adamax optimizer from Section 7 of the Adam
+    paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
+    Adam algorithm based on the infinity norm.
+
+    Adamax updates:
+
+    .. math::
+
+        t & = t + 1
+
+        moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad
+
+        inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|)
+
+        learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t}
+
+        param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out}
+
+
+    The original paper does not have an epsilon attribute.
+    However, it is added here for numerical stability to prevent the
+    division by 0 error.
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+        Can be a float value or a Variable with one float value as data element.
+        beta1 (float): The exponential decay rate for the 1st moment estimates.
+        beta2 (float): The exponential decay rate for the 2nd moment estimates.
+        epsilon (float): a small float value for numerical stability.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adamax(learning_rate=0.2)
+            optimizer.minimize(cost)
     """
     _moment_acc_str = "moment"
     _inf_norm_acc_str = "inf_norm"
@@ -478,7 +649,7 @@ class AdamaxOptimizer(Optimizer):
         beta_shape = [1]
         self._beta1_pow_acc = self.helper.create_global_variable(
             name=unique_name.generate('beta1_pow_acc'),
-            dtype='float32',
+            dtype='float32' if self._dtype == None else self._dtype,
             shape=beta_shape,
             lod_level=0,
             persistable=True)
@@ -535,7 +706,34 @@ class AdamaxOptimizer(Optimizer):
 
 
 class DecayedAdagradOptimizer(Optimizer):
-    """Simple Decayed Adagrad optimizer with moment state
+    """
+    **Decayed Adagrad Optimizer**
+
+    The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+
+    The update is done as follows:
+
+    .. math::
+
+        moment\_out & = decay * moment + (1 - decay) * grad * grad
+
+        param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
+
+    The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+    does not have an epsilon attribute. It is added here for numerical
+    stability to avoid the division by zero error.
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+        Can be a float value or a Variable with one float value as data element.
+        decay (float): decay rate.
+        epsilon (float): a small float value for numerical stability.
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2)
+            optimizer.minimize(cost)
     """
     _moment_acc_str = "moment"
 
@@ -578,6 +776,313 @@ class DecayedAdagradOptimizer(Optimizer):
         return decayed_adagrad_op
 
 
+class AdadeltaOptimizer(Optimizer):
+    """
+    **Adadelta Optimizer**
+
+    Simple Adadelta optimizer with average squared grad state and
+    average squared update state.
+    The details of adadelta please refer to this
+    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
+    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
+
+    ..  math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
+                          E(g_t^2) + \\epsilon ) ) \\\\
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+
+    Args:
+        learning_rate(float): global learning rate
+        rho(float): rho in equation
+        epsilon(float): epsilon in equation
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adadelta(
+                learning_rate=0.0003, epsilon=1.0e-6, rho=0.95)
+            _, params_grads = optimizer.minimize(cost)
+    """
+
+    _avg_squared_grad_acc_str = "_avg_squared_grad"
+    _avg_squared_update_acc_str = "_avg_squared_update"
+
+    def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        super(AdadeltaOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "adadelta"
+        self._epsilon = epsilon
+        self._rho = rho
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._avg_squared_grad_acc_str, p)
+            self._add_accumulator(self._avg_squared_update_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        avg_squared_grad_acc = self._get_accumulator(
+            self._avg_squared_grad_acc_str, param_and_grad[0])
+        avg_squared_update_acc = self._get_accumulator(
+            self._avg_squared_update_acc_str, param_and_grad[0])
+
+        # Create the adadelta optimizer op
+        adadelta_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "AvgSquaredGrad": avg_squared_grad_acc,
+                "AvgSquaredUpdate": avg_squared_update_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "AvgSquaredGradOut": avg_squared_grad_acc,
+                "AvgSquaredUpdateOut": avg_squared_update_acc
+            },
+            attrs={"epsilon": self._epsilon,
+                   "rho": self._rho})
+
+        return adadelta_op
+
+
+class RMSPropOptimizer(Optimizer):
+    """
+    Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning
+    rate method. The original slides proposed RMSProp: Slide 29 of
+    http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf .
+
+    The original equation is as follows:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    The first equation calculates moving average of the squared gradient for
+    each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`.
+
+    In some cases, adding a momentum term :math: `\\beta` is beneficial.
+    In our implementation, Nesterov momentum is used:
+
+    ..  math::
+
+        r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2
+
+        v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{v(w,t) +
+            \\epsilon}} \\nabla Q_{i}(w)
+
+        w & = w - v(w, t)
+
+    where, :math:`\\rho` is a hyperparameter and typical values are 0.9, 0.95
+    and so on. :math: `beta` is the momentum term. :math: `\\epsilon` is a
+    smoothing term to avoid division by zero, usually set somewhere in range
+    from 1e-4 to 1e-8.
+
+
+    Args:
+        learning_rate(float): global learning rate.
+        rho(float): rho is :math: `\\rho` in equation, set 0.95 by default.
+        epsilon(float): :math: `\\epsilon` in equation is smoothing term to
+            avoid division by zero, set 1e-6 by default.
+        momentum(float): :math:`\\beta` in equation is the momentum term,
+            set 0.0 by default.
+
+    Raises:
+        ValueError: If learning_rate, rho, epsilon, momentum are None.
+
+    Examples:
+          .. code-block:: python
+
+              optimizer = fluid.optimizer.RMSProp(0.0001)
+              _, params_grads = optimizer.minimize(cost)
+    """
+
+    _momentum_acc_str = "momentum"
+    _mean_square_acc_str = "mean_square"
+
+    def __init__(self,
+                 learning_rate,
+                 rho=0.95,
+                 epsilon=1.0e-6,
+                 momentum=0.0,
+                 **kwargs):
+        super(RMSPropOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if momentum is None:
+            raise ValueError("momentum is not set.")
+
+        self.type = "rmsprop"
+        self._rho = rho
+        self._epsilon = epsilon
+        self._momentum = momentum
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._momentum_acc_str, p)
+            self._add_accumulator(self._mean_square_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        momentum_acc = self._get_accumulator(self._momentum_acc_str,
+                                             param_and_grad[0])
+        mean_square_acc = self._get_accumulator(self._mean_square_acc_str,
+                                                param_and_grad[0])
+        rmsprop_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": momentum_acc,
+                "MeanSquare": mean_square_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": momentum_acc,
+                "MeanSquareOut": mean_square_acc
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "decay": self._rho,
+                "momentum": self._momentum
+            })
+
+        return rmsprop_op
+
+
+class FtrlOptimizer(Optimizer):
+    """
+    FTRL (Follow The Regularized Leader) Optimizer.
+
+    The paper that proposed Follow The Regularized Leader (FTRL):
+    (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+
+    ..  math::
+
+        &new\_accum = squared\_accum + grad^2
+
+        &if (lr\_power == -0.5):
+
+        &\quad  linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param}
+
+        &else:
+
+        &\quad   linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param}
+
+
+        &x = l1 * sign(linear\_accum) - linear\_accum
+
+        &if (lr\_power == -0.5):
+
+        &\quad   y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2)
+
+        &\quad   pre\_shrink = \\frac{x}{y}
+
+        &\quad   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
+
+        &else:
+
+        &\quad   y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2)
+
+        &\quad   pre\_shrink = \\frac{x}{y}
+
+        &\quad   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0)
+
+        &squared\_accum += grad^2
+
+    Args:
+        learning_rate (float|Variable): global learning rate.
+        l1 (float):
+        l2 (float):
+        lr_power (float):
+
+    Raises:
+        ValueError: If learning_rate, rho, epsilon, momentum are None.
+
+    Examples:
+          .. code-block:: python
+
+              optimizer = fluid.optimizer.Ftrl(0.0001)
+              _, params_grads = optimizer.minimize(cost)
+    """
+
+    _squared_acc_str = "squared"
+    _linear_acc_str = "linear"
+
+    def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
+        super(FtrlOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+
+        self.type = "ftrl"
+        self._l1 = l1
+        self._l2 = l2
+        self._lr_power = lr_power
+
+    def _create_accumulators(self, block, parameters):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        for p in parameters:
+            self._add_accumulator(self._squared_acc_str, p)
+            self._add_accumulator(self._linear_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        if not isinstance(block, framework.Block):
+            raise TypeError("block is not instance of framework.Block.")
+
+        squared_acc = self._get_accumulator(self._squared_acc_str,
+                                            param_and_grad[0])
+        linear_acc = self._get_accumulator(self._linear_acc_str,
+                                           param_and_grad[0])
+        ftrl_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "SquaredAccumulator": squared_acc,
+                "LinearAccumulator": linear_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "SquaredAccumOut": squared_acc,
+                "LinearAccumOut": linear_acc
+            },
+            attrs={"l1": self._l1,
+                   "l2": self._l1,
+                   "lr_power": self._lr_power})
+
+        return ftrl_op
+
+
 # We short the class name, since users will use the optimizer with the package
 # name. The sample code:
 #
@@ -592,3 +1097,156 @@ Adagrad = AdagradOptimizer
 Adam = AdamOptimizer
 Adamax = AdamaxOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
+Adadelta = AdadeltaOptimizer
+RMSProp = RMSPropOptimizer
+Ftrl = FtrlOptimizer
+
+
+class ModelAverage(Optimizer):
+    """Accumulate the average of parameters whtin sliding window. The average
+    result will be saved in temporary variables which can be applied to
+    parameter variables of current model by calling 'apply()' method. And the
+    'restore()' method is used to restored the parameter values of current model.
+
+    The size of average window is determined by average_window_rate,
+    min_average_window, max_average_window and current update times.
+
+    Args:
+        average_window_rate: The rate of average window.
+        min_average_window: The minimum size of average window.
+        max_average_window: The maximum size of average window.
+
+    Examples:
+
+      .. code-block:: python
+
+        optimizer = fluid.optimizer.Momentum()
+        optimizer.minimize(cost)
+        model_average = fluid.optimizer.ModelAverage(0.15,
+                                                min_average_window=10000,
+                                                max_average_window=20000)
+        for pass_id in range(args.pass_num):
+            for data in train_reader():
+                exe.run(fluid.default_main_program()...)
+
+            with model_average.apply(exe):
+                for data in test_reader():
+                    exe.run(inference_program...)
+    """
+
+    def __init__(self,
+                 average_window_rate,
+                 min_average_window=10000,
+                 max_average_window=10000,
+                 **kwargs):
+        super(ModelAverage, self).__init__(0.0, **kwargs)
+        self.average_window = average_window_rate
+        self.min_average_window = min_average_window
+        self.max_average_window = max_average_window
+
+        self.params_grads = []
+        for param in framework.default_main_program().global_block(
+        ).all_parameters():
+            if param.do_model_average != False:
+                grad = param.block.create_var(
+                    name=unique_name.generate(".".join([param.name, 'tmp'])),
+                    dtype=param.dtype,
+                    persistable=False,
+                    stop_gradient=True)
+                self.params_grads.append((param, grad))
+
+        for param, grad in self.params_grads:
+            self._append_average_accumulate_op(param)
+
+        self.apply_program = Program()
+        block = self.apply_program.global_block()
+        with program_guard(main_program=self.apply_program):
+            for param_grad in self.params_grads:
+                self._add_average_apply_op(block, param_grad)
+
+        self.restore_program = Program()
+        block = self.restore_program.global_block()
+        with program_guard(main_program=self.restore_program):
+            for param_grad in self.params_grads:
+                self._add_average_restore_op(block, param_grad)
+
+    def _add_average_apply_op(self, block, param_grad):
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
+        sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
+        sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
+        num_accumulates = block.clone_variable(
+            self._get_accumulator('num_accumulates', param))
+        old_num_accumulates = block.clone_variable(
+            self._get_accumulator('old_num_accumulates', param))
+        num_updates = block.clone_variable(
+            self._get_accumulator('num_updates', param))
+        # backup param value to grad
+        layers.assign(input=param, output=grad)
+        # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates)
+        tmp = layers.sum(x=[num_accumulates, old_num_accumulates])
+        sum = layers.sum(x=[sum_1, sum_2, sum_3])
+        tmp = layers.cast(
+            x=tmp, dtype='float32' if self._dtype == None else self._dtype)
+        sum = layers.cast(
+            x=sum, dtype='float32' if self._dtype == None else self._dtype)
+        layers.elementwise_div(x=sum, y=tmp, out=param)
+
+    def _add_average_restore_op(self, block, param_grad):
+        param = block.clone_variable(param_grad[0])
+        grad = block.clone_variable(param_grad[1])
+        layers.assign(input=grad, output=param)
+
+    def _append_average_accumulate_op(self, param):
+        self.helper = LayerHelper("average_accumulate")
+        sum_1 = self._add_accumulator('sum_1', param)
+        sum_2 = self._add_accumulator('sum_2', param)
+        sum_3 = self._add_accumulator('sum_3', param)
+        num_accumulates = self._add_accumulator(
+            'num_accumulates', param, dtype='int64', shape=[1])
+        old_num_accumulates = self._add_accumulator(
+            'old_num_accumulates', param, dtype='int64', shape=[1])
+        num_updates = self._add_accumulator(
+            'num_updates', param, dtype='int64', shape=[1])
+
+        self.helper.append_op(
+            type='average_accumulates',
+            inputs={
+                "param": param,
+                "in_sum_1": sum_1,
+                "in_sum_2": sum_2,
+                "in_sum_3": sum_3,
+                "in_num_accumulates": num_accumulates,
+                "in_old_num_accumulates": old_num_accumulates,
+                "in_num_updates": num_updates
+            },
+            outputs={
+                "out_sum_1": sum_1,
+                "out_sum_2": sum_2,
+                "out_sum_3": sum_3,
+                "out_num_accumulates": num_accumulates,
+                "out_old_num_accumulates": old_num_accumulates,
+                "out_num_updates": num_updates,
+            },
+            attrs={
+                "average_window": self.average_window,
+                "min_average_window": self.min_average_window,
+                "max_average_window": self.max_average_window,
+            })
+
+    @contextmanager
+    def apply(self, executor, need_restore=True):
+        """Apply average values to parameters of current model.
+        """
+        executor.run(self.apply_program)
+        try:
+            yield
+        finally:
+            if need_restore:
+                self.restore(executor)
+
+    def restore(self, executor):
+        """Restore parameter values of current model.
+        """
+        executor.run(self.restore_program)
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..6baf648198585022f992709c519038688af293e1
--- /dev/null
+++ b/python/paddle/fluid/parallel_executor.py
@@ -0,0 +1,289 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import core
+import multiprocessing
+import framework
+import executor
+import warnings
+import sys
+import os
+
+__all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
+
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+BuildStrategy = core.ParallelExecutor.BuildStrategy
+
+
+class ParallelExecutor(object):
+    """
+    ParallelExecutor can run program in parallel.
+
+    Args:
+        use_cuda (bool): Whether to use CUDA or not.
+        loss_name (str): The loss name must set in training. Default None.
+        main_program (Program): The program that need to run, if not provided,
+            then default_main_program will be used. Default None.
+        share_vars_from(ParallelExecutor): If provied, it will share variables
+            from the specified ParallelExecutor. Default None.
+        num_trainers(int): If greater than 1, NCCL will be initialized with
+            multiple rank of nodes, each node should have same number of GPUs.
+            Distributed training will be enabled then. Default 1.
+        trainer_id(int: Must use together with num_trainers. trainer_id is the
+            "rank" of current node starts from 0. Default 0.
+
+    Returns:
+        ParallelExecutor: The initialized ParallelExecutor object.
+
+    Raises:
+        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+
+    Examples:
+        .. code-block:: python
+
+          train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name)
+          test_exe = fluid.ParallelExecutor(use_cuda=True,
+                                            main_program=test_program,
+                                            share_vars_from=train_exe)
+
+          train_loss, = train_exe.run([loss.name], feed=feed_dict)
+          test_loss, = test_exe.run([loss.name], feed=feed_dict)
+    """
+
+    def __init__(self,
+                 use_cuda,
+                 loss_name=None,
+                 main_program=None,
+                 share_vars_from=None,
+                 exec_strategy=None,
+                 build_strategy=None,
+                 num_trainers=1,
+                 trainer_id=0,
+                 **kwargs):
+        if len(kwargs) != 0:
+            err_msg = ""
+            for key in kwargs:
+                if key in dir(ExecutionStrategy):
+                    err_msg += \
+                        "Setting {0} by constructor is deprecated. Use " \
+                        "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \
+                        "pe=ParallelExecutor(exec_strategy=strategy) " \
+                        "instead.\n ".format(key)
+                elif key in dir(BuildStrategy):
+                    err_msg += \
+                        "Setting {0} by constructor is deprecated. Use " \
+                        "strategy=BuildStrategy(); See help(" \
+                        "paddle.fluid.ParallelExecutor.BuildStrategy) \n".format(
+                            key)
+                else:
+                    err_msg += "Setting {0} by constructor is deprecated. Use strategy.\n".format(
+                        key)
+            raise ValueError(err_msg)
+
+        self._places = []
+        self._act_places = []
+        if use_cuda:
+            for i in xrange(core.get_cuda_device_count()):
+                p = core.Place()
+                self._act_places.append(core.CUDAPlace(i))
+                p.set_place(self._act_places[-1])
+                self._places.append(p)
+        else:
+            cpu_num = int(
+                os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            for i in xrange(cpu_num):
+                p = core.Place()
+                self._act_places.append(core.CPUPlace())
+                p.set_place(self._act_places[-1])
+                self._places.append(p)
+        assert self._places, "no place for execution"
+
+        if exec_strategy is None:
+            exec_strategy = ExecutionStrategy()
+        exec_strategy.use_cuda = use_cuda
+
+        if exec_strategy.num_threads == 0:
+            if use_cuda:
+                # Experiments on se-resnext shows that too many threads hurt
+                # performance. Worth tunning for other models in the future.
+                exec_strategy.num_threads = len(self._places) * 4
+            else:
+                cpu_num = int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+                exec_strategy.num_threads = cpu_num
+
+        if build_strategy is None:
+            build_strategy = BuildStrategy()
+
+        main = main_program
+        main = main if main else framework.default_main_program()
+        scope = executor.global_scope()
+        # FIXME(Yancey1989): it's a temporary approach to determinate the distribute
+        # train program, call self.bcast_param() at the end of each mini-batch.
+        self.is_dist = True if "recv" in [
+            op.type for op in main.global_block().ops
+        ] else False
+
+        if share_vars_from and not isinstance(share_vars_from,
+                                              ParallelExecutor):
+            raise TypeError("share_vars_from must be ParallelExecutor.")
+
+        local_scopes = share_vars_from.executor.local_scopes(
+        ) if share_vars_from else []
+
+        self.persistable_vars = [
+            v.name
+            for v in filter(
+                lambda var: var.persistable and var.type != core.VarDesc.VarType.RAW,
+                main.list_vars())
+        ]
+
+        self.executor = core.ParallelExecutor(
+            self._places,
+            set([
+                p.name for p in main.global_block().iter_parameters()
+                if not p.stop_gradient
+            ]),
+            set(self.persistable_vars), main.desc, loss_name
+            if loss_name else '', scope, local_scopes, exec_strategy,
+            build_strategy, num_trainers, trainer_id)
+        self.scope = scope
+
+    def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
+        """
+        Run a parallel executor with fetch_list.
+
+        The feed parameter can be a dict or a list. If feed is a dict, the
+        feed data will be split into multiple devices. If feed is a list, we
+        assume the data has been splitted into multiple devices, the each
+        element in the list will be copied to each device directly.
+
+        For example, if the feed is a dict:
+
+        >>> exe = ParallelExecutor()
+        >>> # the image will be splitted into devices. If there is two devices
+        >>> # each device will process an image with shape (24, 1, 28, 28)
+        >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))})
+
+        For example, if the feed is a list:
+
+        >>> exe = ParallelExecutor()
+        >>> # each device will process each element in the list.
+        >>> # the 1st device will process an image with shape (48, 1, 28, 28)
+        >>> # the 2nd device will process an image with shape (32, 1, 28, 28)
+        >>> #
+        >>> # you can use exe.device_count to get the device number.
+        >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))},
+        >>>               {"image": numpy.random.random(size=(32, 1, 28, 28))},
+        >>>              ])
+
+        Args:
+            fetch_list(list): The fetched variable names
+            feed(list|dict|None): The feed variables. If the feed is a dict,
+                tensors in that dict will be splitted into each devices. If
+                the feed is a list, each element of the list will be copied
+                to each device. Default None.
+            feed_dict: Alias for feed parameter, for backward compatibility.
+                This parameter has been deprecated. Default None.
+            return_numpy(bool): Whether converts the fetched tensor to numpy.
+                Default: True.
+
+        Returns:
+            List: The fetched result list.
+
+        Raises:
+            ValueError: If the feed is a list, but its length is not equal the
+                length of active places, or its element's is not dict.
+
+        NOTES:
+            1. If the feed's type is dict, the number of data that feeds to
+               ParallelExecutor must be bigger than active places. Otherwise,
+               it will throw exception from C++ side. Special attention should be
+               paid to check whether the last batch of the dataset is bigger
+               than active places.
+            2. If active places are more than one, the fetch results for each
+               variable is a list, and each element of this list is the variable of
+               respective active place.
+
+        Examples:
+            .. code-block:: python
+
+                pe = fluid.ParallelExecutor(use_cuda=use_cuda,
+                                            loss_name=avg_cost.name,
+                                            main_program=fluid.default_main_program())
+                loss = pe.run(feed=feeder.feed(cur_batch),
+                              fetch_list=[avg_cost.name]))
+        """
+        if feed is None and feed_dict is not None:
+            feed = feed_dict
+            print >> sys.stderr, "`feed_dict` is deprecated. Please use `feed=`"
+
+        if isinstance(feed, dict):
+            feed_tensor_dict = dict()
+            for feed_name in feed:
+                feed_tensor = feed[feed_name]
+                if not isinstance(feed_tensor, core.LoDTensor):
+                    feed_tensor = core.LoDTensor()
+                    # always set to CPU place, since the tensor need to be splitted
+                    # it is fast in CPU
+                    feed_tensor.set(feed[feed_name], core.CPUPlace())
+                feed_tensor_dict[feed_name] = feed_tensor
+
+            self.executor.feed_and_split_tensor_into_local_scopes(
+                feed_tensor_dict)
+        elif isinstance(feed, list) or isinstance(feed, tuple):
+            if len(feed) != len(self._act_places):
+                raise ValueError(
+                    "Feed a list of tensor, the list should be the same size as places"
+                )
+
+            res = list()
+
+            for i, each in enumerate(feed):
+                if not isinstance(each, dict):
+                    raise TypeError(
+                        "Each element of feed list should be a dict")
+                res_dict = dict()
+                for feed_name in each:
+                    tensor = each[feed_name]
+                    if not isinstance(tensor, core.LoDTensor):
+                        tmp = core.LoDTensor()
+                        tmp.set(tensor, self._act_places[i])
+                        tensor = tmp
+                    res_dict[feed_name] = tensor
+                res.append(res_dict)
+            self.executor.feed_tensors_into_local_scopes(res)
+
+        fetch_var_name = '@FETCHED_VAR_NAME@'
+        self.executor.run(fetch_list, fetch_var_name)
+        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
+
+        if self.is_dist:
+            self.bcast_params()
+
+        if return_numpy:
+            return executor.as_numpy(arr)
+
+        return [arr[i] for i in range(len(arr))]
+
+    def bcast_params(self):
+        """
+        Broadcast the parameters to other devices. It is used during
+        distributed training.
+        """
+        self.executor.bcast_params(set(self.persistable_vars))
+
+    @property
+    def device_count(self):
+        return len(self._act_places)
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 255cd2104325afa31449cbd3875499a7c5d7f572..0a42b9fca8dba7a11b414990be6c04c93158864f 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -22,21 +22,62 @@ __all__ = [
 
 
 class ParamAttr(object):
+    """
+    Parameter attributes object. To fine-tuning network training process, user
+    can set parameter's attributes to control training details. Such as learning rate,
+    regularization, trainable, do_model_average and the method to initialize param.
+
+
+    Args:
+        name(str): The parameter's name. Default None.
+        initializer(Initializer): The method to initial this parameter. Default None.
+        learning_rate(float): The parameter's learning rate. The learning rate when
+            optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`.
+            Default 1.0.
+        regularizer(WeightDecayRegularizer): Regularization factor. Default None.
+        trainable(bool): Whether this parameter is trainable. Default True.
+        gradient_clip(BaseGradientClipAttr): The method to clip this parameter's
+            gradient. Default None.
+        do_model_average(bool): Whether this parameter should do model average.
+            Default False.
+
+    Examples:
+        .. code-block:: python
+
+            w_param_attrs = fluid.ParamAttr(name="fc_weight",
+                                            learning_rate=0.5,
+                                            regularizer=fluid.L2Decay(1.0),
+                                            trainable=True)
+            y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs)
+    """
+
     def __init__(self,
                  name=None,
                  initializer=None,
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 gradient_clip=None):
+                 gradient_clip=None,
+                 do_model_average=False):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
         self.gradient_clip = gradient_clip
+        self.model_average = do_model_average
 
     def set_default_initializer(self, initializer):
+        """
+        Set the default initializer, the initializer should be Constant,
+        Uniform, Normal, Xavier, MSRA.
+
+        Args:
+            initializer(Initializer): the initializer to set.
+
+        Returns:
+            None
+        """
         if initializer is None:
             if self.initializer is None:
                 raise ValueError("ParamAttr.initializer is not set")
@@ -48,13 +89,45 @@ class ParamAttr(object):
         self.initializer = initializer
 
     def set_default_param_initializer(self):
+        """
+        Set the default initializer for the parameter with Xavier.
+
+        Args:
+            None.
+
+        Returns:
+            None.
+        """
         self.set_default_initializer(Xavier())
 
     def set_default_bias_initializer(self):
+        """
+        Set the default initializer for the bias with Constant(0.0).
+
+        Args:
+            None.
+
+        Returns:
+            None.
+        """
         self.set_default_initializer(Constant(0.0))
 
     @staticmethod
     def to_attr(arg):
+        """
+        Create ParamAttr[s].
+
+        Args:
+            arg: Arguments to initialize ParamAttr[s]. arg's type can be
+                str, Initializer, float, WeightDecayRegularizer, BaseGradientClipAttr,
+                bool, ParamAttr, or a list of above type.
+
+        Returns:
+            ParamAttr[s]: ParamAttr[s] initialized with arg.
+
+        Raises:
+            arg can not initialize a ParamAttr.
+        """
         if arg is None:
             return ParamAttr()
         elif isinstance(arg, list) or isinstance(arg, tuple):
@@ -73,6 +146,15 @@ class ParamAttr(object):
             raise TypeError("{0} cast to ParamAttr".format(type(arg)))
 
     def to_kwargs(self, with_initializer=False):
+        """
+        Returns the attributes of this parameter.
+
+        Args:
+            with_initializer(bool): Whether to add initializer attr.
+
+        Returns:
+            Parameter attributes(map): The attributes of this parameter.
+        """
         kwargs = {
             'name': self.name,
             'optimize_attr': {
@@ -80,7 +162,8 @@ class ParamAttr(object):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip
+            'gradient_clip_attr': self.gradient_clip,
+            'model_average': self.model_average
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
@@ -89,9 +172,27 @@ class ParamAttr(object):
 
 class WeightNormParamAttr(ParamAttr):
     """
-    Used for weight normalization. Any field in ParamAttr can also be set here.
-    Besides, an extra field dim can be set to indicate the dimension except 
-    which to normalize.
+    Used for weight Norm. Weight Norm is a reparameterization of the weight vectors
+    in a neural network that decouples the length of those weight vectors from
+    their direction. Weight Norm has been implemented as discussed in this
+    paper: `Weight Normalization: A Simple Reparameterization to Accelerate
+    Training of Deep Neural Networks
+    <https://arxiv.org/pdf/1602.07868.pdf>`_.
+
+    Args:
+        dim(list): The parameter's name. Default None.
+        kwargs: Any field in ParamAttr. Default None.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data,
+                                 size=1000,
+                                 param_attr=WeightNormParamAttr(
+                                      dim=None,
+                                      name='weight_norm_param'))
+
     """
     # List to record the parameters reparameterized by weight normalization.
     # If these parameters are treated as Variable rather than Parameter,
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 04fd05cc33cff3d720be75923d4af3767942669f..6a321ae024dcb50452bc4d96d7e7e70f590a42c6 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -16,7 +16,10 @@ import core
 from contextlib import contextmanager
 import os
 
-__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
+__all__ = [
+    'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
+    'stop_profiler'
+]
 
 NVPROF_CONFIG = [
     "gpustarttimestamp",
@@ -39,6 +42,9 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     counters/options for profiling by `config` argument. The default config
     is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
     'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
+    Then users can use NVIDIA Visual Profiler
+    (https://developer.nvidia.com/nvidia-visual-profiler) tools to load this
+    this output file to visualize results.
 
     Args:
         output_file (string) : The output file name, the result will be
@@ -47,6 +53,33 @@ def cuda_profiler(output_file, output_mode=None, config=None):
             Comma separated values format. It should be 'kvp' or 'csv'.
         config (list of string) : The profiler options and counters can refer
             to "Compute Command Line Profiler User Guide".
+
+    Raises:
+        ValueError: If `output_mode` is not in ['kvp', 'csv'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import paddle.fluid.profiler as profiler
+
+            epoc = 8
+            dshape = [4, 3, 28, 28]
+            data = fluid.layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+            conv = fluid.layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            output_file = 'cuda_profiler.txt'
+            with profiler.cuda_profiler(output_file, 'csv') as nvprof:
+                for i in range(epoc):
+                    input = np.random.random(dshape).astype('float32')
+                    exe.run(fluid.default_main_program(), feed={'data': input})
+            # then use  NVIDIA Visual Profiler (nvvp) to load this output file
+            # to visualize results.
     """
     if output_mode is None:
         output_mode = 'csv'
@@ -66,26 +99,73 @@ def cuda_profiler(output_file, output_mode=None, config=None):
 
 
 def reset_profiler():
-    """The profiler clear interface.
-    reset_profiler will clear the previous time record.
+    """
+    Clear the previous time record. This interface does not work for
+    `fluid.profiler.cuda_profiler`, it only works for
+    `fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`,
+    and `fluid.profiler.profiler`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+            with profiler.profiler(state, 'total', '/tmp/profile'):
+                for iter in range(10):
+                    if iter == 2:
+                        profiler.reset_profiler()
+                    # ...
     """
     core.reset_profiler()
 
 
-@contextmanager
-def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
-    """The profiler interface.
-    Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By defalut, it records the CPU and GPU operator kernels,
-    if you want to profile other program, you can refer the profiling tutorial
-    to add more records.
+def start_profiler(state):
+    """
+    Enable the profiler. Uers can use `fluid.profiler.start_profiler` and
+    `fluid.profiler.stop_profiler` to insert the code, except the usage of
+    `fluid.profiler.profiler` interface.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU', 'GPU'
+            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
+            GPU as well. 'All' also generates timeline.
+
+    Raises:
+        ValueError: If `state` is not in ['CPU', 'GPU', 'All'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+
+            profiler.start_profiler('GPU')
+            for iter in range(10):
+                if iter == 2:
+                    profiler.reset_profiler()
+                # except each iteration
+            profiler.stop_profiler('total', '/tmp/profile')
+    """
+    if core.is_profiler_enabled():
+        return
+    if state not in ['CPU', 'GPU', "All"]:
+        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
+    if state == "GPU":
+        prof_state = core.ProfilerState.kCUDA
+    elif state == "CPU":
+        prof_state = core.ProfilerState.kCPU
+    else:
+        prof_state = core.ProfilerState.kAll
+    core.enable_profiler(prof_state)
+
+
+def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
+    """
+    Stop the profiler. Uers can use `fluid.profiler.start_profiler` and
+    `fluid.profiler.stop_profiler` to insert the code, except the usage of
+    `fluid.profiler.profiler` interface.
 
     Args:
-        state (string) : The profiling state, which should be 'CPU' or 'GPU',
-            telling the profiler to use CPU timer or GPU timer for profiling.
-            Although users may have already specified the execution place
-            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
-            would not inherit this place.
         sorted_key (string) : If None, the profiling results will be printed
             in the order of first end time of events. Otherwise, the profiling
             results will be sorted by the this flag. This flag should be one
@@ -97,18 +177,26 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
             The `ave` means sorting by the average execution time.
         profile_path (string) : If state == 'All', it will write a profile
             proto output file.
-    """
-    if state not in ['CPU', 'GPU', "All"]:
-        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
-    if state == "GPU":
-        prof_state = core.ProfilerState.kCUDA
-    elif state == "CPU":
-        prof_state = core.ProfilerState.kCPU
-    else:
-        prof_state = core.ProfilerState.kAll
-    core.enable_profiler(prof_state)
-    yield
 
+    Raises:
+        ValueError: If `sorted_key` is not in
+            ['calls', 'total', 'max', 'min', 'ave'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+
+            profiler.start_profiler('GPU')
+            for iter in range(10):
+                if iter == 2:
+                    profiler.reset_profiler()
+                # except each iteration
+            profiler.stop_profiler('total', '/tmp/profile')
+    """
+    if not core.is_profiler_enabled():
+        return
     sorted_key = 'default' if sorted_key is None else sorted_key
     if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
         raise ValueError("The sorted_key must be None or in 'calls', 'total', "
@@ -124,3 +212,58 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     # TODO(qingqing) : redirect C++ ostream to Python stream.
     # with core.ostream_redirect(stdout=True, stderr=True):
     core.disable_profiler(key_map[sorted_key], profile_path)
+
+
+@contextmanager
+def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
+    """The profiler interface.
+    Different from cuda_profiler, this profiler can be used to profile both CPU
+    and GPU program. By defalut, it records the CPU and GPU operator kernels,
+    if you want to profile other program, you can refer the profiling tutorial
+    to add more records in C++ code.
+
+    If the state == 'All', a profile proto file will be written to
+    `profile_path`. This file records timeline information during the execution.
+    Then users can visualize this file to see the timeline, please refer 
+    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/howto/optimization/timeline.md
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use CPU timer or GPU timer for profiling.
+            Although users may have already specified the execution place
+            (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
+            would not inherit this place.
+        sorted_key (string) : If None, the profiling results will be printed
+            in the order of first end time of events. Otherwise, the profiling
+            results will be sorted by the this flag. This flag should be one
+            of 'calls', 'total', 'max', 'min' or 'ave'.
+            The `calls` means sorting by the number of calls.
+            The `total` means sorting by the total execution time.
+            The `max` means sorting by the maximum execution time.
+            The `min` means sorting by the minimum execution time.
+            The `ave` means sorting by the average execution time.
+        profile_path (string) : If state == 'All', it will write a profile
+            proto output file.
+
+    Raises:
+        ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is
+            not in ['calls', 'total', 'max', 'min', 'ave'].
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid.profiler as profiler
+
+            with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+                for pass_id in range(pass_num):
+                    for batch_id, data in enumerate(train_reader()):
+                        exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[],
+                                use_program_cache=True)
+                        # ...
+    """
+    start_profiler(state)
+    yield
+    stop_profiler(sorted_key, profile_path)
diff --git a/python/paddle/fluid/recordio_writer.py b/python/paddle/fluid/recordio_writer.py
index 9735df8c06113230af9695f76a7589ea9f50e527..bd57772713057f12b876942de58ee43527e94834 100644
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
@@ -12,10 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import core
 import contextlib
-
-__all__ = ['convert_reader_to_recordio_file']
+__all__ = [
+    'convert_reader_to_recordio_file', 'convert_reader_to_recordio_files'
+]
 
 
 @contextlib.contextmanager
@@ -34,8 +36,48 @@ def convert_reader_to_recordio_file(
         compressor=core.RecordIOWriter.Compressor.Snappy,
         max_num_records=1000,
         feed_order=None):
+    """
+    Convert a Python Reader to a recordio file.
+
+    Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for
+    details.
+
+    Examples:
+
+        >>> import paddle.fluid as fluid
+        >>> import paddle.dataset.mnist as mnist
+        >>> import paddle
+        >>>
+        >>> tmp_program = fluid.Program()
+        >>> with fluid.program_guard(tmp_program):
+        >>>     img = fluid.layers.data(name='img', shape=[784])
+        >>>     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        >>> feeder = fluid.DataFeeder(feed_list=[img, label], place=fluid.CPUPlace())
+        >>> # mnist.recordio will be generated in current directory
+        >>> fluid.recordio_writer.convert_reader_to_recordio_file(
+        >>>                     filename="mnist.recordio",
+        >>>                     reader_creator=paddle.batch(mnist.train(), batch_size=32),
+        >>>                     feeder=feeder)
+
+    Args:
+        filename(str): The recordio filename.
+        reader_creator(callable): The Python Reader Creator. See
+            :ref:`api_guide_python_reader`.
+        feeder(DataFeeder): The DataFeeder instance. Used to convert
+            :code:`reader_creator` to :code: `lod_tensor`
+        compressor: Must in fluid.core.RecordIOWriter.Compressor.Snappy or
+            fluid.core.RecordIOWriter.Compressor.NoCompress. Use :code:`Snappy`
+            by default.
+        max_num_records(int): Maximum number of records in one chuck. Each record
+            is each return value from reader function
+        feed_order(list): The order of variable names that the reader returns
+
+    Returns:
+        int: the number of record that saved.
+    """
     if feed_order is None:
         feed_order = feeder.feed_names
+    counter = 0
     with create_recordio_writer(filename, compressor,
                                 max_num_records) as writer:
         for batch in reader_creator():
@@ -43,3 +85,49 @@ def convert_reader_to_recordio_file(
             for each in feed_order:
                 writer.append_tensor(res[each])
             writer.complete_append_tensor()
+            counter += 1
+    return counter
+
+
+def convert_reader_to_recordio_files(
+        filename,
+        batch_per_file,
+        reader_creator,
+        feeder,
+        compressor=core.RecordIOWriter.Compressor.Snappy,
+        max_num_records=1000,
+        feed_order=None):
+    """
+    convert a python reader to many recordio files.
+
+    This API is basically same as :code:`convert_reader_to_recordio_file`,
+    instead of it will create many recordio files. Each file contains at
+    most :code:`batch_per_file` records.
+
+    Please reference
+    :ref:`api_fluid_recordio_writer_convert_reader_to_recordio_file` for more
+    details.
+    """
+    if feed_order is None:
+        feed_order = feeder.feed_names
+    f_name, f_ext = os.path.splitext(filename)
+    assert (f_ext == ".recordio")
+
+    lines = []
+    f_idx = 0
+    counter = 0
+    for idx, batch in enumerate(reader_creator()):
+        lines.append(batch)
+        if idx >= batch_per_file and idx % batch_per_file == 0:
+            filename = "%s-%05d%s" % (f_name, f_idx, f_ext)
+            with create_recordio_writer(filename, compressor,
+                                        max_num_records) as writer:
+                for l in lines:
+                    res = feeder.feed(l)
+                    for each in feed_order:
+                        writer.append_tensor(res[each])
+                    writer.complete_append_tensor()
+                    counter += 1
+                lines = []
+                f_idx += 1
+    return counter
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index 029db7d2dd4b7def8cea374e3f2ed31226f2bc18..dac474d5ee76590a75311d6bf2c4cb2fe85b6c40 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -16,9 +16,8 @@ import framework
 from . import core
 
 __all__ = [
-    'append_regularization_ops',
-    'L1Decay',
-    'L2Decay',
+    'append_regularization_ops', 'L1Decay', 'L2Decay', 'L1DecayRegularizer',
+    'L2DecayRegularizer'
 ]
 
 
@@ -37,34 +36,40 @@ def append_regularization_ops(parameters_and_grads, regularization=None):
                         set. It will be applied with regularizer.
 
     Returns:
-        list of (parameters, gradients) pair with the regularized gradient
+        list[(Variable, Variable)]: list of (parameters, gradients) \
+        pair with the regularized gradient
 
     Raises:
         Exception: Unknown regularization type
     """
     params_and_grads = []
     for param, grad in parameters_and_grads:
-        regularization_term = None
-        if param.regularizer is not None:
-            # Add variable for regularization term in grad block
-            regularization_term = param.regularizer(param, grad, grad.block)
-        elif regularization is not None:
-            regularization_term = regularization(param, grad, grad.block)
-
-        # If no gradient or no regularization specified,
-        # then we don't need to do anything
-        if grad is None or regularization_term is None:
+        with param.block.program.optimized_guard(param):
+            # If no gradient then we don't need to do anything
+            if grad is None:
+                params_and_grads.append((param, grad))
+                continue
+
+            regularization_term = None
+            if param.regularizer is not None:
+                # Add variable for regularization term in grad block
+                regularization_term = param.regularizer(param, grad, grad.block)
+            elif regularization is not None:
+                regularization_term = regularization(param, grad, grad.block)
+
+            # If no regularization specified, then we don't need to do anything
+            if regularization_term is None:
+                params_and_grads.append((param, grad))
+                continue
+
+            assert grad.shape == regularization_term.shape
+
+            grad.block.append_op(
+                type='elementwise_add',
+                inputs={"X": grad,
+                        "Y": regularization_term},
+                outputs={"Out": grad})
             params_and_grads.append((param, grad))
-            continue
-
-        assert grad.shape == regularization_term.shape
-
-        grad.block.append_op(
-            type='elementwise_add',
-            inputs={"X": grad,
-                    "Y": regularization_term},
-            outputs={"Out": grad})
-        params_and_grads.append((param, grad))
 
     return params_and_grads
 
@@ -96,6 +101,24 @@ class WeightDecayRegularizer(object):
 
 class L2DecayRegularizer(WeightDecayRegularizer):
     """Implements the L2 Weight Decay Regularization
+
+    Small values of L2 can help prevent over fitting the training data.
+
+    .. math::
+
+        L2WeightDecay = reg\_coeff * parameter
+
+    Args:
+        regularization_coeff(float): regularization coeff
+
+    Examples:
+        .. code-block:: python
+
+            optimizer = fluid.optimizer.Adagrad(
+                learning_rate=1e-4,
+                regularization=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=0.1))
+            optimizer.minimize(avg_cost)
     """
 
     def __init__(self, regularization_coeff=0.0):
@@ -150,6 +173,27 @@ class L2DecayRegularizer(WeightDecayRegularizer):
 
 class L1DecayRegularizer(WeightDecayRegularizer):
     """Implements the L1 Weight Decay Regularization
+
+    L1 regularization encourages sparsity.
+
+    .. math::
+
+        L1WeightDecay = reg\_coeff * sign(parameter)
+
+    Args:
+        regularization_coeff(float): regularization coeff
+
+    Examples:
+        .. code-block:: python
+
+            program = fluid.framework.Program()
+            block = program.global_block()
+            mul_x = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="mul.x",
+                regularizer=fluid.regularizer.L1DecayRegularizer(0.5))
     """
 
     def __init__(self, regularization_coeff=0.0):
diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt
index 673c965b662a022739f8d489c331f4de9455a926..ee734f3c782adb5196a03aca5718377009a5b4e7 100644
--- a/python/paddle/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/CMakeLists.txt
@@ -5,3 +5,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
+
+add_subdirectory(high-level-api)
diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..efa5ee2d06af3d31e7d84122dd7eea37d6dcf3a3
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
@@ -0,0 +1,16 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
+
+add_subdirectory(fit_a_line)
+add_subdirectory(recognize_digits)
+add_subdirectory(image_classification)
+add_subdirectory(understand_sentiment)
+add_subdirectory(label_semantic_roles)
+add_subdirectory(word2vec)
+add_subdirectory(recommender_system)
+add_subdirectory(machine_translation)
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad28c9eff560507e5b326451159be3949353f58f
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -0,0 +1,133 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import contextlib
+import numpy
+import unittest
+
+# train reader
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+test_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.test(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+
+def inference_program():
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    return y_predict
+
+
+def train_program():
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    y_predict = inference_program()
+
+    loss = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_loss = fluid.layers.mean(loss)
+
+    return avg_loss
+
+
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            if event.step == 10:
+                test_metrics = trainer.test(
+                    reader=test_reader, feed_order=['x', 'y'])
+                print test_metrics
+                '''
+                ...
+                ['25.768919467926025']
+                ['15.343549569447836']
+                ...
+                '''
+                if params_dirname is not None:
+                    trainer.save_params(params_dirname)
+                trainer.stop()
+
+    trainer.train(
+        reader=train_reader,
+        num_epochs=100,
+        event_handler=event_handler,
+        feed_order=['x', 'y'])
+
+
+# infer
+def infer(use_cuda, inference_program, params_dirname=None):
+    if params_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    batch_size = 10
+    tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
+
+    results = inferencer.infer({'x': tensor_x})
+    print("infer results: ", results[0])
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    params_dirname = "fit_a_line.inference.model"
+
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+class TestFitALine(unittest.TestCase):
+    def test_cpu(self):
+        with self.program_scope_guard():
+            with fluid.unique_name.guard():
+                main(use_cuda=False)
+
+    def test_cuda(self):
+        with self.program_scope_guard():
+            with fluid.unique_name.guard():
+                main(use_cuda=True)
+
+    @contextlib.contextmanager
+    def program_scope_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fed6d914f75b690e34411aa154359c93b6ca989
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset.
+
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
+
+"""
+
+import cPickle
+import itertools
+import numpy
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train10']
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+
+
+def reader_creator(filename, sub_name, batch_size=None):
+    def read_batch(batch):
+        data = batch['data']
+        labels = batch.get('labels', batch.get('fine_labels', None))
+        assert labels is not None
+        for sample, label in itertools.izip(data, labels):
+            yield (sample / 255.0).astype(numpy.float32), int(label)
+
+    def reader():
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
+
+            batch_count = 0
+            for name in names:
+                batch = cPickle.load(f.extractfile(name))
+                for item in read_batch(batch):
+                    if isinstance(batch_size, int) and batch_count > batch_size:
+                        break
+                    batch_count += 1
+                    yield item
+
+    return reader
+
+
+def train10(batch_size=None):
+    """
+    CIFAR-10 training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch',
+        batch_size=batch_size)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e222d26907e8fe697b596a67e62cc9df84afe0e
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
@@ -0,0 +1,160 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy
+import cifar10_small_test_set
+
+
+def resnet_cifar10(input, depth=32):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    def shortcut(input, ch_in, ch_out, stride):
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) / 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    predict = fluid.layers.fc(input=pool, size=10, act='softmax')
+    return predict
+
+
+def inference_network():
+    data_shape = [3, 32, 32]
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    predict = resnet_cifar10(images, 32)
+    return predict
+
+
+def train_network():
+    predict = inference_network()
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=predict, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
+    BATCH_SIZE = 128
+    EPOCH_NUM = 1
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
+        batch_size=BATCH_SIZE,
+        drop_last=False)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            avg_cost, accuracy = trainer.test(
+                reader=test_reader, feed_order=['pixel', 'label'])
+
+            print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
+
+            if accuracy > 0.01:  # Low threshold for speeding up CI
+                if params_dirname is not None:
+                    trainer.save_params(params_dirname)
+                return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    trainer = fluid.Trainer(
+        train_func=train_program, optimizer_func=optimizer_func, place=place)
+
+    trainer.train(
+        reader=train_reader,
+        num_epochs=EPOCH_NUM,
+        event_handler=event_handler,
+        feed_order=['pixel', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    # The input's dimension of conv should be 4-D or 5-D.
+    # Use normilized image pixels as input data, which should be in the range
+    # [0, 1.0].
+    tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
+    results = inferencer.infer({'pixel': tensor_img})
+
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    save_path = "image_classification_resnet.inference.model"
+
+    train(
+        use_cuda=use_cuda,
+        train_program=train_network,
+        params_dirname=save_path)
+
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_network,
+        params_dirname=save_path)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbc7bc06c93157f271c79e85b6925468e861e57f
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy
+import cifar10_small_test_set
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=4096, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=4096, act=None)
+    predict = fluid.layers.fc(input=fc2, size=10, act='softmax')
+    return predict
+
+
+def inference_network():
+    data_shape = [3, 32, 32]
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    predict = vgg16_bn_drop(images)
+    return predict
+
+
+def train_network():
+    predict = inference_network()
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=predict, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
+    BATCH_SIZE = 128
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
+        batch_size=BATCH_SIZE,
+        drop_last=False)
+
+    test_reader = paddle.batch(
+        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            avg_cost, accuracy = trainer.test(
+                reader=test_reader, feed_order=['pixel', 'label'])
+
+            print('Loss {0:2.2}, Acc {1:2.2}'.format(avg_cost, accuracy))
+
+            if accuracy > 0.01:  # Low threshold for speeding up CI
+                if params_dirname is not None:
+                    trainer.save_params(params_dirname)
+                return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
+
+    trainer.train(
+        reader=train_reader,
+        num_epochs=1,
+        event_handler=event_handler,
+        feed_order=['pixel', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    # The input's dimension of conv should be 4-D or 5-D.
+    # Use normilized image pixels as input data, which should be in the range
+    # [0, 1.0].
+    tensor_img = numpy.random.rand(1, 3, 32, 32).astype("float32")
+    results = inferencer.infer({'pixel': tensor_img})
+
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    save_path = "image_classification_vgg.inference.model"
+
+    train(
+        use_cuda=use_cuda,
+        train_program=train_network,
+        params_dirname=save_path)
+
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_network,
+        params_dirname=save_path)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
new file mode 100755
index 0000000000000000000000000000000000000000..67aa21e8c5699f1cb568dad23cd13f4cb51a6ec9
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
@@ -0,0 +1,265 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+
+WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict()
+WORD_DICT_LEN = len(WORD_DICT)
+LABEL_DICT_LEN = len(LABEL_DICT)
+PRED_DICT_LEN = len(VERB_DICT)
+MARK_DICT_LEN = 2
+IS_SPARSE = True
+BATCH_SIZE = 10
+EMBEDDING_NAME = 'emb'
+
+
+def lstm_net():
+    WORD_DIM = 32
+    MARK_DIM = 5
+    HIDDEN_DIM = 512
+    DEPTH = 8
+
+    # Data definitions
+    word = fluid.layers.data(
+        name='word_data', shape=[1], dtype='int64', lod_level=1)
+    predicate = fluid.layers.data(
+        name='verb_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n2 = fluid.layers.data(
+        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n1 = fluid.layers.data(
+        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_0 = fluid.layers.data(
+        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p1 = fluid.layers.data(
+        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p2 = fluid.layers.data(
+        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(
+        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+
+    # 8 features
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        size=[PRED_DICT_LEN, WORD_DIM],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        size=[MARK_DICT_LEN, MARK_DIM],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[WORD_DICT_LEN, WORD_DIM],
+            input=x,
+            param_attr=fluid.ParamAttr(name=EMBEDDING_NAME))
+        for x in word_input
+        #name=EMBEDDING_NAME, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=HIDDEN_DIM, act='tanh')
+        for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=HIDDEN_DIM,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, DEPTH):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=HIDDEN_DIM, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=HIDDEN_DIM, act='tanh')
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=HIDDEN_DIM,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=LABEL_DICT_LEN, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=LABEL_DICT_LEN, act='tanh')
+    ])
+
+    return feature_out
+
+
+def inference_program():
+    predict = lstm_net()
+
+    return predict
+
+
+def train_program():
+    MIX_HIDDEN_LR = 1e-3
+
+    predict = lstm_net()
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
+    crf_cost = fluid.layers.linear_chain_crf(
+        input=predict,
+        label=target,
+        param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=MIX_HIDDEN_LR))
+    avg_cost = fluid.layers.mean(crf_cost)
+
+    return [avg_cost]
+
+
+def optimize_func():
+    return fluid.optimizer.SGD(learning_rate=fluid.layers.exponential_decay(
+        learning_rate=0.01, decay_steps=100000, decay_rate=0.5, staircase=True))
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimize_func)
+
+    feed_order = [
+        'word_data', 'ctx_n2_data', 'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
+        'ctx_p2_data', 'verb_data', 'mark_data', 'target'
+    ]
+
+    #embedding_param = fluid.global_scope().find_var(
+    #        EMBEDDING_NAME).get_tensor()
+    #embedding_param.set(
+    #        load_parameter(conll05.get_embedding(), WORD_DICT_LEN, WORD_DIM),
+    #        place)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.conll05.test(), batch_size=BATCH_SIZE)
+            avg_cost_set = trainer.test(
+                reader=test_reader, feed_order=feed_order)
+
+            # get avg cost
+            avg_cost = np.array(avg_cost_set).mean()
+
+            print("avg_cost: %s" % avg_cost)
+
+            if float(avg_cost) < 100.0:  # Large value to increase CI speed
+                trainer.save_params(params_dirname)
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                              float(avg_cost)))
+                if math.isnan(float(avg_cost)):
+                    sys.exit("got NaN loss, training failed.")
+
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.conll05.test(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=feed_order)
+
+
+def infer(use_cuda, inference_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        inference_program, param_path=params_dirname, place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+    # which has only one level of detail. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    word = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_n2 = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_n1 = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_0 = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_p1 = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    ctx_p2 = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
+    pred = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
+    mark = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=MARK_DICT_LEN - 1)
+
+    results = inferencer.infer(
+        {
+            'word_data': word,
+            'ctx_n2_data': ctx_n2,
+            'ctx_n1_data': ctx_n1,
+            'ctx_0_data': ctx_0,
+            'ctx_p1_data': ctx_p1,
+            'ctx_p2_data': ctx_p2,
+            'verb_data': pred,
+            'mark_data': mark
+        },
+        return_numpy=False)
+
+    print("infer results: ", np.array(results[0]).shape)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "label_semantic_roles.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8becd2404b0201c44b587a28e88995958082cd28
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
@@ -0,0 +1,322 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+import paddle.fluid.layers as pd
+from paddle.fluid.executor import Executor
+from functools import partial
+import unittest
+import os
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+hidden_dim = 32
+word_dim = 16
+batch_size = 2
+max_length = 8
+topk_size = 50
+trg_dic_size = 10000
+beam_size = 2
+
+decoder_size = hidden_dim
+
+
+def encoder(is_sparse):
+    # encoder
+    src_word_id = pd.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = pd.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+
+
+def train_decoder(context, is_sparse):
+    # decoder
+    trg_language_word = pd.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = pd.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = pd.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        pre_state = rnn.memory(init=context)
+        current_state = pd.fc(input=[current_word, pre_state],
+                              size=decoder_size,
+                              act='tanh')
+
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        rnn.update_memory(pre_state, current_state)
+        rnn.output(current_score)
+
+    return rnn()
+
+
+def decode(context, is_sparse):
+    init_state = context
+    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
+    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
+
+    # fill the first element with init_state
+    state_array = pd.create_array('float32')
+    pd.array_write(init_state, array=state_array, i=counter)
+
+    # ids, scores as memory
+    ids_array = pd.create_array('int64')
+    scores_array = pd.create_array('float32')
+
+    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = pd.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    pd.array_write(init_ids, array=ids_array, i=counter)
+    pd.array_write(init_scores, array=scores_array, i=counter)
+
+    cond = pd.less_than(x=counter, y=array_len)
+
+    while_op = pd.While(cond=cond)
+    with while_op.block():
+        pre_ids = pd.array_read(array=ids_array, i=counter)
+        pre_state = pd.array_read(array=state_array, i=counter)
+        pre_score = pd.array_read(array=scores_array, i=counter)
+
+        # expand the lod of pre_state to be the same with pre_score
+        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
+
+        pre_ids_emb = pd.embedding(
+            input=pre_ids,
+            size=[dict_size, word_dim],
+            dtype='float32',
+            is_sparse=is_sparse)
+
+        # use rnn unit to update rnn
+        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
+                              size=decoder_size,
+                              act='tanh')
+        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
+        # use score to do beam search
+        current_score = pd.fc(input=current_state_with_lod,
+                              size=target_dict_dim,
+                              act='softmax')
+        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
+        # calculate accumulated scores after topk to reduce computation cost
+        accu_scores = pd.elementwise_add(
+            x=pd.log(topk_scores), y=pd.reshape(
+                pre_score, shape=[-1]), axis=0)
+        selected_ids, selected_scores = pd.beam_search(
+            pre_ids,
+            pre_score,
+            topk_indices,
+            accu_scores,
+            beam_size,
+            end_id=10,
+            level=0)
+
+        pd.increment(x=counter, value=1, in_place=True)
+
+        # update the memories
+        pd.array_write(current_state, array=state_array, i=counter)
+        pd.array_write(selected_ids, array=ids_array, i=counter)
+        pd.array_write(selected_scores, array=scores_array, i=counter)
+
+        # update the break condition: up to the max length or all candidates of
+        # source sentences have ended.
+        length_cond = pd.less_than(x=counter, y=array_len)
+        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
+        pd.logical_and(x=length_cond, y=finish_cond, out=cond)
+
+    translation_ids, translation_scores = pd.beam_search_decode(
+        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)
+
+    # return init_ids, init_scores
+
+    return translation_ids, translation_scores
+
+
+def train_program(is_sparse):
+    context = encoder(is_sparse)
+    rnn_out = train_decoder(context, is_sparse)
+    label = pd.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = pd.cross_entropy(input=rnn_out, label=label)
+    avg_cost = pd.mean(cost)
+    return avg_cost
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(
+        learning_rate=1e-4,
+        regularization=fluid.regularizer.L2DecayRegularizer(
+            regularization_coeff=0.1))
+
+
+def train(use_cuda, is_sparse, is_local=True):
+    EPOCH_NUM = 1
+
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step))
+            if event.step == 10:
+                trainer.stop()
+
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, is_sparse),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    trainer.train(
+        reader=train_reader,
+        num_epochs=EPOCH_NUM,
+        event_handler=event_handler,
+        feed_order=feed_order)
+
+
+def decode_main(use_cuda, is_sparse):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder(is_sparse)
+    translation_ids, translation_scores = decode(context, is_sparse)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
+    init_recursive_seq_lens = [1] * batch_size
+    init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
+
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
+                                       place)
+    init_scores = fluid.create_lod_tensor(init_scores_data,
+                                          init_recursive_seq_lens, place)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    feed_order = ['src_word_id']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    for data in train_data():
+        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict['init_ids'] = init_ids
+        feed_dict['init_scores'] = init_scores
+
+        result_ids, result_scores = exe.run(
+            framework.default_main_program(),
+            feed=feed_dict,
+            fetch_list=[translation_ids, translation_scores],
+            return_numpy=False)
+        print result_ids.recursive_sequence_lengths()
+        break
+
+
+class TestMachineTranslation(unittest.TestCase):
+    pass
+
+
+@contextlib.contextmanager
+def scope_prog_guard():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
+
+
+def inject_test_train(use_cuda, is_sparse):
+    f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
+                                         if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            train(use_cuda, is_sparse)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+def inject_test_decode(use_cuda, is_sparse, decorator=None):
+    f_name = 'test_{0}_{1}_decode'.format('cuda'
+                                          if use_cuda else 'cpu', 'sparse'
+                                          if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            decode_main(use_cuda, is_sparse)
+
+    if decorator is not None:
+        f = decorator(f)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+        inject_test_train(_use_cuda_, _is_sparse_)
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+
+        _decorator_ = None
+        if _use_cuda_:
+            _decorator_ = unittest.skip(
+                reason='Beam Search does not support CUDA!')
+
+        inject_test_decode(
+            is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a09db25dc0e2c71772aa06e6d0cf993321612e4
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import paddle.fluid as fluid
+import paddle
+import sys
+import numpy
+import unittest
+import math
+import sys
+import os
+
+BATCH_SIZE = 64
+
+
+def inference_program():
+    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    return prediction
+
+
+def train_program():
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    predict = inference_program()
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    acc = fluid.layers.accuracy(input=predict, label=label)
+    return [avg_cost, acc]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program,
+        place=place,
+        optimizer_func=optimizer_func,
+        parallel=True)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['img', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(numpy.array, event.metrics)))
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['img', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    batch_size = 1
+    tensor_img = numpy.random.uniform(-1.0, 1.0,
+                                      [batch_size, 1, 28, 28]).astype("float32")
+
+    results = inferencer.infer({'img': tensor_img})
+
+    print("infer results: ", results[0])
+
+
+def main(use_cuda):
+    params_dirname = "recognize_digits_conv.inference.model"
+
+    # call train() with is_local argument to run distributed train
+    train(
+        use_cuda=use_cuda,
+        train_program=train_program,
+        params_dirname=params_dirname)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_program,
+        params_dirname=params_dirname)
+
+
+if __name__ == '__main__':
+    # for use_cuda in (False, True):
+    main(use_cuda=True)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2b544e791d7ea35ff7d2c9a2dce7ce7f5680f38
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -0,0 +1,118 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import paddle.fluid as fluid
+import paddle
+import sys
+import numpy
+import unittest
+import math
+import sys
+import os
+
+BATCH_SIZE = 64
+
+
+def inference_program():
+    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+
+    hidden = fluid.layers.fc(input=img, size=200, act='tanh')
+    hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    return prediction
+
+
+def train_program():
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    predict = inference_program()
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    acc = fluid.layers.accuracy(input=predict, label=label)
+    return [avg_cost, acc]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adam(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['img', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['img', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    batch_size = 1
+    tensor_img = numpy.random.uniform(-1.0, 1.0,
+                                      [batch_size, 1, 28, 28]).astype("float32")
+
+    results = inferencer.infer({'img': tensor_img})
+
+    print("infer results: ", results[0])
+
+
+def main(use_cuda):
+    params_dirname = "recognize_digits_mlp.inference.model"
+
+    # call train() with is_local argument to run distributed train
+    train(
+        use_cuda=use_cuda,
+        train_program=train_program,
+        params_dirname=params_dirname)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_program,
+        params_dirname=params_dirname)
+
+
+if __name__ == '__main__':
+    # for use_cuda in (False, True):
+    main(use_cuda=False)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
new file mode 100644
index 0000000000000000000000000000000000000000..c860f1641708d947fd2a8008d3d3ccd0a231f6c2
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
@@ -0,0 +1,260 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import sys
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle.fluid.nets as nets
+
+IS_SPARSE = True
+USE_GPU = False
+BATCH_SIZE = 256
+
+
+def get_usr_combined_features():
+    # FIXME(dzh) : old API integer_value(10) may have range check.
+    # currently we don't have user configurated check.
+
+    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
+
+    uid = layers.data(name='user_id', shape=[1], dtype='int64')
+
+    usr_emb = layers.embedding(
+        input=uid,
+        dtype='float32',
+        size=[USR_DICT_SIZE, 32],
+        param_attr='user_table',
+        is_sparse=IS_SPARSE)
+
+    usr_fc = layers.fc(input=usr_emb, size=32)
+
+    USR_GENDER_DICT_SIZE = 2
+
+    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
+
+    usr_gender_emb = layers.embedding(
+        input=usr_gender_id,
+        size=[USR_GENDER_DICT_SIZE, 16],
+        param_attr='gender_table',
+        is_sparse=IS_SPARSE)
+
+    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
+
+    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
+    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
+
+    usr_age_emb = layers.embedding(
+        input=usr_age_id,
+        size=[USR_AGE_DICT_SIZE, 16],
+        is_sparse=IS_SPARSE,
+        param_attr='age_table')
+
+    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
+
+    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
+    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
+
+    usr_job_emb = layers.embedding(
+        input=usr_job_id,
+        size=[USR_JOB_DICT_SIZE, 16],
+        param_attr='job_table',
+        is_sparse=IS_SPARSE)
+
+    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
+
+    concat_embed = layers.concat(
+        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
+
+    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return usr_combined_features
+
+
+def get_mov_combined_features():
+
+    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
+
+    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
+
+    mov_emb = layers.embedding(
+        input=mov_id,
+        dtype='float32',
+        size=[MOV_DICT_SIZE, 32],
+        param_attr='movie_table',
+        is_sparse=IS_SPARSE)
+
+    mov_fc = layers.fc(input=mov_emb, size=32)
+
+    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
+
+    category_id = layers.data(
+        name='category_id', shape=[1], dtype='int64', lod_level=1)
+
+    mov_categories_emb = layers.embedding(
+        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_categories_hidden = layers.sequence_pool(
+        input=mov_categories_emb, pool_type="sum")
+
+    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
+
+    mov_title_id = layers.data(
+        name='movie_title', shape=[1], dtype='int64', lod_level=1)
+
+    mov_title_emb = layers.embedding(
+        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_title_conv = nets.sequence_conv_pool(
+        input=mov_title_emb,
+        num_filters=32,
+        filter_size=3,
+        act="tanh",
+        pool_type="sum")
+
+    concat_embed = layers.concat(
+        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
+
+    # FIXME(dzh) : need tanh operator
+    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return mov_combined_features
+
+
+def inference_program():
+    usr_combined_features = get_usr_combined_features()
+    mov_combined_features = get_mov_combined_features()
+
+    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+    scale_infer = layers.scale(x=inference, scale=5.0)
+
+    return scale_infer
+
+
+def train_program():
+
+    scale_infer = inference_program()
+
+    label = layers.data(name='score', shape=[1], dtype='float32')
+    square_cost = layers.square_error_cost(input=scale_infer, label=label)
+    avg_cost = layers.mean(square_cost)
+
+    return [avg_cost, scale_infer]
+
+
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.2)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    trainer = fluid.Trainer(
+        train_func=train_program, place=place, optimizer_func=optimizer_func)
+
+    feed_order = [
+        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
+        'movie_title', 'score'
+    ]
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
+            avg_cost_set = trainer.test(
+                reader=test_reader, feed_order=feed_order)
+
+            # get avg cost
+            avg_cost = np.array(avg_cost_set).mean()
+
+            print("avg_cost: %s" % avg_cost)
+
+            if float(avg_cost) < 4:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}'.format(event.epoch + 1,
+                                                              float(avg_cost)))
+                if math.isnan(float(avg_cost)):
+                    sys.exit("got NaN loss, training failed.")
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.movielens.train(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=feed_order)
+
+
+def infer(use_cuda, inference_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        inference_program, param_path=params_dirname, place=place)
+
+    # Use the first data from paddle.dataset.movielens.test() as input.
+    # Use create_lod_tensor(data, recursive_sequence_lengths, place) API 
+    # to generate LoD Tensor where `data` is a list of sequences of index 
+    # numbers, `recursive_sequence_lengths` is the length-based level of detail 
+    # (lod) info associated with `data`.
+    # For example, data = [[10, 2, 3], [2, 3]] means that it contains
+    # two sequences of indexes, of length 3 and 2, respectively.
+    # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one 
+    # level of detail info, indicating that `data` consists of two sequences 
+    # of length 3 and 2, respectively. 
+    user_id = fluid.create_lod_tensor([[1]], [[1]], place)
+    gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
+    age_id = fluid.create_lod_tensor([[0]], [[1]], place)
+    job_id = fluid.create_lod_tensor([[10]], [[1]], place)
+    movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
+    category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
+    movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]], [[5]],
+                                          place)
+
+    results = inferencer.infer(
+        {
+            'user_id': user_id,
+            'gender_id': gender_id,
+            'age_id': age_id,
+            'job_id': job_id,
+            'movie_id': movie_id,
+            'category_id': category_id,
+            'movie_title': movie_title
+        },
+        return_numpy=False)
+
+    print("infer results: ", np.array(results[0]))
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "recommender_system.inference.model"
+    train(
+        use_cuda=use_cuda,
+        train_program=train_program,
+        params_dirname=params_dirname)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_program,
+        params_dirname=params_dirname)
+
+
+if __name__ == '__main__':
+    main(USE_GPU)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d71147a85e77ea6dc5b6391aa169abd9b02a0aa1
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt
@@ -0,0 +1,12 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# This test is buggy
+# py_test(test_understand_sentiment_dynamic_rnn SRCS
+# 	test_understand_sentiment_dynamic_rnn.py SERIAL)
+LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn)
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..1668ae83d3581125b799508c8c3115a038e93d5a
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+HID_DIM = 512
+BATCH_SIZE = 128
+
+
+def convolution_net(data, input_dim, class_dim, emb_dim, hid_dim):
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    return prediction
+
+
+def inference_program(word_dict):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    dict_dim = len(word_dict)
+    net = convolution_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM)
+    return net
+
+
+def train_program(word_dict):
+    prediction = inference_program(word_dict)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, word_dict),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['words', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['words', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    word_dict = paddle.dataset.imdb.word_dict()
+
+    inferencer = fluid.Inferencer(
+        infer_func=partial(inference_program, word_dict),
+        param_path=params_dirname,
+        place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+    # which has only one level of detail. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    tensor_words = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
+    results = inferencer.infer({'words': tensor_words})
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_conv.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..8da89d82cb8e00853eebfd794602a0e1e1020e7c
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+BATCH_SIZE = 128
+LSTM_SIZE = 128
+
+
+def dynamic_rnn_lstm(data, input_dim, class_dim, emb_dim, lstm_size):
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+        def gate_common(ipt, hidden, size):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            return gate0 + gate1
+
+        forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                         lstm_size))
+        input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                        lstm_size))
+        output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                         lstm_size))
+        cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                       lstm_size))
+
+        cell = forget_gate * prev_cell + input_gate * cell_gate
+        hidden = output_gate * fluid.layers.tanh(x=cell)
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
+
+    last = fluid.layers.sequence_last_step(rnn())
+    prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax")
+    return prediction
+
+
+def inference_program(word_dict):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    dict_dim = len(word_dict)
+    pred = dynamic_rnn_lstm(data, dict_dim, CLASS_DIM, EMB_DIM, LSTM_SIZE)
+    return pred
+
+
+def train_program(word_dict):
+    prediction = inference_program(word_dict)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, word_dict),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['words', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=BATCH_SIZE)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['words', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    word_dict = paddle.dataset.imdb.word_dict()
+
+    inferencer = fluid.Inferencer(
+        infer_func=partial(inference_program, word_dict),
+        param_path=params_dirname,
+        place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+    # which has only one level of detail. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    tensor_words = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
+    results = inferencer.infer({'words': tensor_words})
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_conv.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..74faa2e8aa734cd644dfcc38127fd12df1fb1092
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+from functools import partial
+import numpy as np
+
+CLASS_DIM = 2
+EMB_DIM = 128
+HID_DIM = 512
+STACKED_NUM = 3
+BATCH_SIZE = 128
+
+
+def stacked_lstm_net(data, input_dim, class_dim, emb_dim, hid_dim, stacked_num):
+    assert stacked_num % 2 == 1
+
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+
+    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+    inputs = [fc1, lstm1]
+
+    for i in range(2, stacked_num + 1):
+        fc = fluid.layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = fluid.layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        inputs = [fc, lstm]
+
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+
+    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+                                 size=class_dim,
+                                 act='softmax')
+    return prediction
+
+
+def inference_program(word_dict):
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+
+    dict_dim = len(word_dict)
+    net = stacked_lstm_net(data, dict_dim, CLASS_DIM, EMB_DIM, HID_DIM,
+                           STACKED_NUM)
+    return net
+
+
+def train_program(word_dict):
+    prediction = inference_program(word_dict)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return [avg_cost, accuracy]
+
+
+def optimizer_func():
+    return fluid.optimizer.Adagrad(learning_rate=0.002)
+
+
+def train(use_cuda, train_program, params_dirname):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    trainer = fluid.Trainer(
+        train_func=partial(train_program, word_dict),
+        place=place,
+        optimizer_func=optimizer_func)
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndEpochEvent):
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict),
+                batch_size=BATCH_SIZE,
+                drop_last=False)
+            avg_cost, acc = trainer.test(
+                reader=test_reader, feed_order=['words', 'label'])
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if acc > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, avg_cost, acc))
+                if math.isnan(avg_cost):
+                    sys.exit("got NaN loss, training failed.")
+        elif isinstance(event, fluid.EndStepEvent):
+            print("Step {0}, Epoch {1} Metrics {2}".format(
+                event.step, event.epoch, map(np.array, event.metrics)))
+            if event.step == 1:  # Run 2 iterations to speed CI
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=25000),
+        batch_size=BATCH_SIZE,
+        drop_last=False)
+
+    trainer.train(
+        num_epochs=1,
+        event_handler=event_handler,
+        reader=train_reader,
+        feed_order=['words', 'label'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    word_dict = paddle.dataset.imdb.word_dict()
+
+    inferencer = fluid.Inferencer(
+        infer_func=partial(inference_program, word_dict),
+        param_path=params_dirname,
+        place=place)
+
+    # Setup input by creating LoDTensor to represent sequence of words.
+    # Here each word is the basic element of the LoDTensor and the shape of 
+    # each word (base_shape) should be [1] since it is simply an index to 
+    # look up for the corresponding word vector.
+    # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+    # which has only one level of detail. Then the created LoDTensor will have only 
+    # one higher level structure (sequence of words, or sentence) than the basic 
+    # element (word). Hence the LoDTensor will hold data for three sentences of 
+    # length 3, 4 and 2, respectively. 
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[3, 4, 2]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    tensor_words = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=len(word_dict) - 1)
+    results = inferencer.infer({'words': tensor_words})
+    print("infer results: ", results)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    params_dirname = "understand_sentiment_stacked_lstm.inference.model"
+    train(use_cuda, train_program, params_dirname)
+    infer(use_cuda, inference_program, params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        main(use_cuda=use_cuda)
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..02e65cf56c4d1bd262831320befd2edc735c0d1c
--- /dev/null
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
@@ -0,0 +1,175 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import math
+import sys
+from functools import partial
+
+PASS_NUM = 100
+EMBED_SIZE = 32
+HIDDEN_SIZE = 256
+N = 5
+BATCH_SIZE = 32
+
+word_dict = paddle.dataset.imikolov.build_dict()
+dict_size = len(word_dict)
+
+
+def inference_program(is_sparse):
+    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+    forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
+
+    embed_first = fluid.layers.embedding(
+        input=first_word,
+        size=[dict_size, EMBED_SIZE],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr='shared_w')
+    embed_second = fluid.layers.embedding(
+        input=second_word,
+        size=[dict_size, EMBED_SIZE],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr='shared_w')
+    embed_third = fluid.layers.embedding(
+        input=third_word,
+        size=[dict_size, EMBED_SIZE],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr='shared_w')
+    embed_forth = fluid.layers.embedding(
+        input=forth_word,
+        size=[dict_size, EMBED_SIZE],
+        dtype='float32',
+        is_sparse=is_sparse,
+        param_attr='shared_w')
+
+    concat_embed = fluid.layers.concat(
+        input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+    hidden1 = fluid.layers.fc(input=concat_embed,
+                              size=HIDDEN_SIZE,
+                              act='sigmoid')
+    predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
+    return predict_word
+
+
+def train_program(is_sparse):
+    # The declaration of 'next_word' must be after the invoking of inference_program,
+    # or the data input order of train program would be [next_word, firstw, secondw,
+    # thirdw, forthw], which is not correct.
+    predict_word = inference_program(is_sparse)
+    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+    cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
+    avg_cost = fluid.layers.mean(cost)
+    return avg_cost
+
+
+def optimizer_func():
+    return fluid.optimizer.SGD(learning_rate=0.001)
+
+
+def train(use_cuda, train_program, params_dirname):
+    train_reader = paddle.batch(
+        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    def event_handler(event):
+        if isinstance(event, fluid.EndStepEvent):
+            outs = trainer.test(
+                reader=test_reader,
+                feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
+            avg_cost = outs[0]
+            print("loss= ", avg_cost)
+
+            if avg_cost < 10.0:
+                trainer.save_params(params_dirname)
+                trainer.stop()
+
+            if math.isnan(avg_cost):
+                sys.exit("got NaN loss, training failed.")
+
+    trainer = fluid.Trainer(
+        train_func=train_program, optimizer_func=optimizer_func, place=place)
+
+    trainer.train(
+        reader=train_reader,
+        num_epochs=1,
+        event_handler=event_handler,
+        feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw'])
+
+
+def infer(use_cuda, inference_program, params_dirname=None):
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=params_dirname, place=place)
+
+    # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
+    # is simply an index to look up for the corresponding word vector and hence 
+    # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, 
+    # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] 
+    # meaning there is only one level of detail and there is only one sequence of 
+    # one word on this level.
+    # Note that recursive_sequence_lengths should be a list of lists.
+    recursive_seq_lens = [[1]]
+    base_shape = [1]
+    # The range of random integers is [low, high]
+    first_word = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
+    second_word = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
+    third_word = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
+    fourth_word = fluid.create_random_int_lodtensor(
+        recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
+
+    result = inferencer.infer(
+        {
+            'firstw': first_word,
+            'secondw': second_word,
+            'thirdw': third_word,
+            'forthw': fourth_word
+        },
+        return_numpy=False)
+    print(np.array(result[0]))
+
+
+def main(use_cuda, is_sparse):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    params_dirname = "word2vec.inference.model"
+
+    train(
+        use_cuda=use_cuda,
+        train_program=partial(train_program, is_sparse),
+        params_dirname=params_dirname)
+
+    infer(
+        use_cuda=use_cuda,
+        inference_program=partial(inference_program, is_sparse),
+        params_dirname=params_dirname)
+
+
+if __name__ == '__main__':
+    for use_cuda in (False, True):
+        for is_sparse in (False, True):
+            main(use_cuda=use_cuda, is_sparse=is_sparse)
diff --git a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py
deleted file mode 100644
index 983f8f4dbeac83566839de25ec9765eb248be768..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py
+++ /dev/null
@@ -1,292 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-import paddle.fluid.layers as layers
-import contextlib
-import math
-import sys
-import unittest
-from paddle.fluid.executor import Executor
-
-dict_size = 30000
-source_dict_dim = target_dict_dim = dict_size
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-hidden_dim = 32
-embedding_dim = 16
-batch_size = 10
-max_length = 50
-topk_size = 50
-encoder_size = decoder_size = hidden_dim
-IS_SPARSE = True
-USE_PEEPHOLES = False
-
-
-def bi_lstm_encoder(input_seq, hidden_size):
-    input_forward_proj = fluid.layers.fc(input=input_seq,
-                                         size=hidden_size * 4,
-                                         bias_attr=True)
-    forward, _ = fluid.layers.dynamic_lstm(
-        input=input_forward_proj,
-        size=hidden_size * 4,
-        use_peepholes=USE_PEEPHOLES)
-    input_backward_proj = fluid.layers.fc(input=input_seq,
-                                          size=hidden_size * 4,
-                                          bias_attr=True)
-    backward, _ = fluid.layers.dynamic_lstm(
-        input=input_backward_proj,
-        size=hidden_size * 4,
-        is_reverse=True,
-        use_peepholes=USE_PEEPHOLES)
-
-    forward_last = fluid.layers.sequence_last_step(input=forward)
-    backward_first = fluid.layers.sequence_first_step(input=backward)
-
-    return forward_last, backward_first
-
-
-# FIXME(peterzhang2029): Replace this function with the lstm_unit_op.
-def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
-    def linear(inputs):
-        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
-
-    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
-    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
-    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
-    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
-
-    cell_t = fluid.layers.sums(input=[
-        fluid.layers.elementwise_mul(
-            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
-                x=input_gate, y=cell_tilde)
-    ])
-
-    hidden_t = fluid.layers.elementwise_mul(
-        x=output_gate, y=fluid.layers.tanh(x=cell_t))
-
-    return hidden_t, cell_t
-
-
-def lstm_decoder_without_attention(target_embedding, decoder_boot, context,
-                                   decoder_size):
-    rnn = fluid.layers.DynamicRNN()
-
-    cell_init = fluid.layers.fill_constant_batch_size_like(
-        input=decoder_boot,
-        value=0.0,
-        shape=[-1, decoder_size],
-        dtype='float32')
-    cell_init.stop_gradient = False
-
-    with rnn.block():
-        current_word = rnn.step_input(target_embedding)
-        context = rnn.static_input(context)
-
-        hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
-        cell_mem = rnn.memory(init=cell_init)
-        decoder_inputs = fluid.layers.concat(
-            input=[context, current_word], axis=1)
-        h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
-        rnn.update_memory(hidden_mem, h)
-        rnn.update_memory(cell_mem, c)
-        out = fluid.layers.fc(input=h,
-                              size=target_dict_dim,
-                              bias_attr=True,
-                              act='softmax')
-        rnn.output(out)
-    return rnn()
-
-
-def seq_to_seq_net():
-    """Construct a seq2seq network."""
-
-    src_word_idx = fluid.layers.data(
-        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
-
-    src_embedding = fluid.layers.embedding(
-        input=src_word_idx,
-        size=[source_dict_dim, embedding_dim],
-        dtype='float32')
-
-    src_forward_last, src_backward_first = bi_lstm_encoder(
-        input_seq=src_embedding, hidden_size=encoder_size)
-
-    encoded_vector = fluid.layers.concat(
-        input=[src_forward_last, src_backward_first], axis=1)
-
-    decoder_boot = fluid.layers.fc(input=src_backward_first,
-                                   size=decoder_size,
-                                   bias_attr=False,
-                                   act='tanh')
-
-    trg_word_idx = fluid.layers.data(
-        name='target_sequence', shape=[1], dtype='int64', lod_level=1)
-
-    trg_embedding = fluid.layers.embedding(
-        input=trg_word_idx,
-        size=[target_dict_dim, embedding_dim],
-        dtype='float32')
-
-    prediction = lstm_decoder_without_attention(trg_embedding, decoder_boot,
-                                                encoded_vector, decoder_size)
-    label = fluid.layers.data(
-        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-
-    return avg_cost, prediction
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def create_random_lodtensor(lod, place, low, high):
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
-def train(use_cuda, save_dirname=None):
-    [avg_cost, prediction] = seq_to_seq_net()
-
-    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
-    optimizer.minimize(avg_cost)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
-        batch_size=batch_size)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = Executor(place)
-
-    exe.run(framework.default_startup_program())
-
-    batch_id = 0
-    for pass_id in xrange(2):
-        for data in train_data():
-            word_data = to_lodtensor(map(lambda x: x[0], data), place)
-            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
-
-            outs = exe.run(framework.default_main_program(),
-                           feed={
-                               'source_sequence': word_data,
-                               'target_sequence': trg_word,
-                               'label_sequence': trg_word_next
-                           },
-                           fetch_list=[avg_cost])
-
-            avg_cost_val = np.array(outs[0])
-            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
-                  " avg_cost=" + str(avg_cost_val))
-            if math.isnan(float(avg_cost_val[0])):
-                sys.exit("got NaN loss, training failed.")
-            if batch_id > 3:
-                if save_dirname is not None:
-                    fluid.io.save_inference_model(
-                        save_dirname, ['source_sequence',
-                                       'target_sequence'], [prediction], exe)
-                return
-
-            batch_id += 1
-
-
-def infer(use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        lod = [0, 4, 10]
-        word_data = create_random_lodtensor(lod, place, low=0, high=1)
-        trg_word = create_random_lodtensor(lod, place, low=0, high=1)
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        assert feed_target_names[0] == 'source_sequence'
-        assert feed_target_names[1] == 'target_sequence'
-        results = exe.run(inference_program,
-                          feed={
-                              feed_target_names[0]: word_data,
-                              feed_target_names[1]: trg_word,
-                          },
-                          fetch_list=fetch_targets,
-                          return_numpy=False)
-        print(results[0].lod())
-        np_data = np.array(results[0])
-        print("Inference shape: ", np_data.shape)
-        print("Inference results: ", np_data)
-
-
-def main(use_cuda):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    # Directory for saving the trained model
-    save_dirname = "rnn_encoder_decoder.inference.model"
-
-    train(use_cuda, save_dirname)
-    infer(use_cuda, save_dirname)
-
-
-class TestRnnEncoderDecoder(unittest.TestCase):
-    def test_cuda(self):
-        with self.scope_prog_guard():
-            main(use_cuda=True)
-
-    def test_cpu(self):
-        with self.scope_prog_guard():
-            main(use_cuda=False)
-
-    @contextlib.contextmanager
-    def scope_prog_guard(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..1df7b99aad6094a8b8ddfe783b9de35cef61c524
--- /dev/null
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -0,0 +1,382 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+import paddle
+import contextlib
+import math
+import numpy as np
+import sys
+import os
+
+
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+                    hid_dim=32):
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, prediction
+
+
+def dyn_rnn_lstm(data, label, input_dim, class_dim=2, emb_dim=32,
+                 lstm_size=128):
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh')
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        word = rnn.step_input(sentence)
+        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
+        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
+
+        def gate_common(ipt, hidden, size):
+            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
+            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
+            return gate0 + gate1
+
+        forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                         lstm_size))
+        input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                        lstm_size))
+        output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                         lstm_size))
+        cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
+                                                       lstm_size))
+
+        cell = forget_gate * prev_cell + input_gate * cell_gate
+        hidden = output_gate * fluid.layers.tanh(x=cell)
+        rnn.update_memory(prev_cell, cell)
+        rnn.update_memory(prev_hidden, hidden)
+        rnn.output(hidden)
+
+    last = fluid.layers.sequence_last_step(rnn())
+    prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, prediction
+
+
+def stacked_lstm_net(data,
+                     label,
+                     input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    assert stacked_num % 2 == 1
+
+    emb = fluid.layers.embedding(
+        input=data, size=[input_dim, emb_dim], is_sparse=True)
+    # add bias attr
+
+    # TODO(qijun) linear act
+    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+    inputs = [fc1, lstm1]
+
+    for i in range(2, stacked_num + 1):
+        fc = fluid.layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = fluid.layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        inputs = [fc, lstm]
+
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+
+    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+                                 size=class_dim,
+                                 act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+    accuracy = fluid.layers.accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, prediction
+
+
+def train(word_dict,
+          net_method,
+          use_cuda,
+          parallel=False,
+          save_dirname=None,
+          is_local=True):
+    BATCH_SIZE = 128
+    PASS_NUM = 5
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+
+    if not parallel:
+        cost, acc_out, prediction = net_method(
+            data, label, input_dim=dict_dim, class_dim=class_dim)
+    else:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
+        with pd.do():
+            cost, acc, _ = net_method(
+                pd.read_input(data),
+                pd.read_input(label),
+                input_dim=dict_dim,
+                class_dim=class_dim)
+            pd.write_output(cost)
+            pd.write_output(acc)
+
+        cost, acc = pd()
+        cost = fluid.layers.mean(cost)
+        acc_out = fluid.layers.mean(acc)
+        prediction = None
+        assert save_dirname is None
+
+    adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
+    adagrad.minimize(cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    def train_loop(main_program):
+        exe.run(fluid.default_startup_program())
+
+        for pass_id in xrange(PASS_NUM):
+            for data in train_data():
+                cost_val, acc_val = exe.run(main_program,
+                                            feed=feeder.feed(data),
+                                            fetch_list=[cost, acc_out])
+                print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+                if cost_val < 0.4 and acc_val > 0.8:
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(save_dirname, ["words"],
+                                                      prediction, exe)
+                    return
+                if math.isnan(float(cost_val)):
+                    sys.exit("got NaN loss, training failed.")
+        raise AssertionError("Cost is too large for {0}".format(
+            net_method.__name__))
+
+    if is_local:
+        train_loop(fluid.default_main_program())
+    else:
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
+        eplist = []
+        for ip in pserver_ips.split(","):
+            eplist.append(':'.join([ip, port]))
+        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
+        current_endpoint = os.getenv("POD_IP") + ":" + port
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
+        t = fluid.DistributeTranspiler()
+        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
+        if training_role == "PSERVER":
+            pserver_prog = t.get_pserver_program(current_endpoint)
+            pserver_startup = t.get_startup_program(current_endpoint,
+                                                    pserver_prog)
+            exe.run(pserver_startup)
+            exe.run(pserver_prog)
+        elif training_role == "TRAINER":
+            train_loop(t.get_trainer_program())
+
+
+def infer(word_dict, use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be feeded
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+        word_dict_len = len(word_dict)
+
+        # Setup input by creating LoDTensor to represent sequence of words.
+        # Here each word is the basic element of the LoDTensor and the shape of 
+        # each word (base_shape) should be [1] since it is simply an index to 
+        # look up for the corresponding word vector.
+        # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+        # which has only one level of detail. Then the created LoDTensor will have only 
+        # one higher level structure (sequence of words, or sentence) than the basic 
+        # element (word). Hence the LoDTensor will hold data for three sentences of 
+        # length 3, 4 and 2, respectively. 
+        # Note that recursive_sequence_lengths should be a list of lists.
+        recursive_seq_lens = [[3, 4, 2]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        tensor_words = fluid.create_random_int_lodtensor(
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
+
+        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+        # and results will contain a list of data corresponding to fetch_targets.
+        assert feed_target_names[0] == "words"
+        results = exe.run(inference_program,
+                          feed={feed_target_names[0]: tensor_words},
+                          fetch_list=fetch_targets,
+                          return_numpy=False)
+        print(results[0].recursive_sequence_lengths())
+        np_data = np.array(results[0])
+        print("Inference Shape: ", np_data.shape)
+        print("Inference results: ", np_data)
+
+
+def main(word_dict, net_method, use_cuda, parallel=False, save_dirname=None):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    train(
+        word_dict,
+        net_method,
+        use_cuda,
+        parallel=parallel,
+        save_dirname=save_dirname)
+    infer(word_dict, use_cuda, save_dirname)
+
+
+class TestUnderstandSentiment(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.word_dict = paddle.dataset.imdb.word_dict()
+
+    @contextlib.contextmanager
+    def new_program_scope(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+    def test_conv_cpu(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=convolution_net,
+                use_cuda=False,
+                save_dirname="understand_sentiment_conv.inference.model")
+
+    def test_conv_cpu_parallel(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=convolution_net,
+                use_cuda=False,
+                parallel=True)
+
+    @unittest.skip(reason="make CI faster")
+    def test_stacked_lstm_cpu(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=stacked_lstm_net,
+                use_cuda=False,
+                save_dirname="understand_sentiment_stacked_lstm.inference.model")
+
+    def test_stacked_lstm_cpu_parallel(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=stacked_lstm_net,
+                use_cuda=False,
+                parallel=True)
+
+    def test_conv_gpu(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=convolution_net,
+                use_cuda=True,
+                save_dirname="understand_sentiment_conv.inference.model")
+
+    def test_conv_gpu_parallel(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=convolution_net,
+                use_cuda=True,
+                parallel=True)
+
+    @unittest.skip(reason="make CI faster")
+    def test_stacked_lstm_gpu(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=stacked_lstm_net,
+                use_cuda=True,
+                save_dirname="understand_sentiment_stacked_lstm.inference.model")
+
+    def test_stacked_lstm_gpu_parallel(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=stacked_lstm_net,
+                use_cuda=True,
+                parallel=True)
+
+    @unittest.skip(reason='make CI faster')
+    def test_dynrnn_lstm_gpu(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=dyn_rnn_lstm,
+                use_cuda=True,
+                parallel=False)
+
+    def test_dynrnn_lstm_gpu_parallel(self):
+        with self.new_program_scope():
+            main(
+                self.word_dict,
+                net_method=dyn_rnn_lstm,
+                use_cuda=True,
+                parallel=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py
index 93ef66851b0efd65361122853dadeefe11992ed5..71bf5f8b3a9b17f24ce35220a9348bb871852623 100644
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import contextlib
 import numpy
@@ -33,7 +33,7 @@ def train(use_cuda, save_dirname, is_local):
     avg_cost = fluid.layers.mean(cost)
 
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+    sgd_optimizer.minimize(avg_cost)
 
     BATCH_SIZE = 20
 
@@ -69,23 +69,18 @@ def train(use_cuda, save_dirname, is_local):
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
-        t.transpile(
-            optimize_ops,
-            params_grads,
-            trainer_id,
-            pservers=pserver_endpoints,
-            trainers=trainers)
+        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":
             pserver_prog = t.get_pserver_program(current_endpoint)
             pserver_startup = t.get_startup_program(current_endpoint,
@@ -115,14 +110,23 @@ def infer(use_cuda, save_dirname=None):
         # The input's dimension should be 2-D and the second dim is 13
         # The input data should be >= 0
         batch_size = 10
-        tensor_x = numpy.random.uniform(0, 10,
-                                        [batch_size, 13]).astype("float32")
+
+        test_reader = paddle.batch(
+            paddle.dataset.uci_housing.test(), batch_size=batch_size)
+
+        test_data = test_reader().next()
+        test_feat = numpy.array(
+            [data[0] for data in test_data]).astype("float32")
+        test_label = numpy.array(
+            [data[1] for data in test_data]).astype("float32")
+
         assert feed_target_names[0] == 'x'
         results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_x},
+                          feed={feed_target_names[0]: numpy.array(test_feat)},
                           fetch_list=fetch_targets)
         print("infer shape: ", results[0].shape)
         print("infer results: ", results[0])
+        print("ground truth: ", test_label)
 
 
 def main(use_cuda, is_local=True):
diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py
index b01c1875d64d7fc14e0141672f7e8eab2b6a0394..a2fb186b86c9706ac1aff0de49defbfb06e2eb0f 100644
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import contextlib
 import math
@@ -22,10 +22,17 @@ import sys
 import numpy
 import unittest
 import os
+import numpy as np
 
 
 def resnet_cifar10(input, depth=32):
-    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
         tmp = fluid.layers.conv2d(
             input=input,
             filter_size=filter_size,
@@ -33,7 +40,7 @@ def resnet_cifar10(input, depth=32):
             stride=stride,
             padding=padding,
             act=None,
-            bias_attr=False)
+            bias_attr=bias_attr)
         return fluid.layers.batch_norm(input=tmp, act=act)
 
     def shortcut(input, ch_in, ch_out, stride):
@@ -44,7 +51,7 @@ def resnet_cifar10(input, depth=32):
 
     def basicblock(input, ch_in, ch_out, stride):
         tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
-        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None, bias_attr=True)
         short = shortcut(input, ch_in, ch_out, stride)
         return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
 
@@ -118,7 +125,7 @@ def train(net_type, use_cuda, save_dirname, is_local):
     test_program = fluid.default_main_program().clone(for_test=True)
 
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+    optimizer.minimize(avg_cost)
 
     BATCH_SIZE = 128
     PASS_NUM = 1
@@ -171,23 +178,18 @@ def train(net_type, use_cuda, save_dirname, is_local):
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
-        t.transpile(
-            optimize_ops,
-            params_grads,
-            trainer_id,
-            pservers=pserver_endpoints,
-            trainers=trainers)
+        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":
             pserver_prog = t.get_pserver_program(current_endpoint)
             pserver_startup = t.get_startup_program(current_endpoint,
@@ -219,13 +221,32 @@ def infer(use_cuda, save_dirname=None):
         batch_size = 1
         tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype("float32")
 
+        # Use inference_transpiler to speedup
+        inference_transpiler_program = inference_program.clone()
+        t = fluid.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
         results = exe.run(inference_program,
                           feed={feed_target_names[0]: tensor_img},
                           fetch_list=fetch_targets)
+
+        transpiler_results = exe.run(inference_transpiler_program,
+                                     feed={feed_target_names[0]: tensor_img},
+                                     fetch_list=fetch_targets)
+
+        assert len(results[0]) == len(transpiler_results[0])
+        for i in range(len(results[0])):
+            np.testing.assert_almost_equal(
+                results[0][i], transpiler_results[0][i], decimal=5)
+
         print("infer results: ", results[0])
 
+        fluid.io.save_inference_model(save_dirname, feed_target_names,
+                                      fetch_targets, exe,
+                                      inference_transpiler_program)
+
 
 def main(net_type, use_cuda, is_local=True):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index f488527e0bc69059bc44422aa28188441f3d5b54..d489feae9c568ec1d9e3a230766d10d1ced0200a 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -12,17 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import contextlib
 import math
-
 import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.dataset.conll05 as conll05
-import paddle.fluid as fluid
-from paddle.fluid.initializer import init_on_cpu
-import contextlib
+import os
 import time
 import unittest
-import os
+
+import paddle
+import paddle.dataset.conll05 as conll05
+import paddle.fluid as fluid
 
 word_dict, verb_dict, label_dict = conll05.get_dict()
 word_dict_len = len(word_dict)
@@ -109,36 +108,13 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
         input_tmp = [mix_hidden, lstm]
 
     feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
-        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
     ])
 
     return feature_out
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def create_random_lodtensor(lod, place, low, high):
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(use_cuda, save_dirname=None, is_local=True):
     # define network topology
     word = fluid.layers.data(
@@ -171,23 +147,17 @@ def train(use_cuda, save_dirname=None, is_local=True):
     # check other optimizers and check why out will be NAN
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.0001,
+            learning_rate=0.01,
             decay_steps=100000,
             decay_rate=0.5,
             staircase=True))
-    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+    sgd_optimizer.minimize(avg_cost)
 
     # TODO(qiao)
     # add dependency track and move this config before optimizer
     crf_decode = fluid.layers.crf_decoding(
         input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
 
-    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
-        input=crf_decode,
-        label=target,
-        chunk_scheme="IOB",
-        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
-
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.conll05.test(), buf_size=8192),
@@ -203,7 +173,6 @@ def train(use_cuda, save_dirname=None, is_local=True):
 
     def train_loop(main_program):
         exe.run(fluid.default_startup_program())
-
         embedding_param = fluid.global_scope().find_var(
             embedding_name).get_tensor()
         embedding_param.set(
@@ -213,27 +182,19 @@ def train(use_cuda, save_dirname=None, is_local=True):
         start_time = time.time()
         batch_id = 0
         for pass_id in xrange(PASS_NUM):
-            chunk_evaluator.reset(exe)
             for data in train_data():
-                cost, precision, recall, f1_score = exe.run(
-                    main_program,
-                    feed=feeder.feed(data),
-                    fetch_list=[avg_cost] + chunk_evaluator.metrics)
-                pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
-                    exe)
+                cost = exe.run(main_program,
+                               feed=feeder.feed(data),
+                               fetch_list=[avg_cost])
+                cost = cost[0]
 
                 if batch_id % 10 == 0:
-                    print("avg_cost:" + str(cost) + " precision:" + str(
-                        precision) + " recall:" + str(recall) + " f1_score:" +
-                          str(f1_score) + " pass_precision:" + str(
-                              pass_precision) + " pass_recall:" + str(
-                                  pass_recall) + " pass_f1_score:" + str(
-                                      pass_f1_score))
+                    print("avg_cost:" + str(cost))
                     if batch_id != 0:
                         print("second per batch: " + str((time.time(
                         ) - start_time) / batch_id))
                     # Set the threshold low to speed up the CI test
-                    if float(pass_precision) > 0.05:
+                    if float(cost) < 60.0:
                         if save_dirname is not None:
                             # TODO(liuyiqun): Change the target to crf_decode
                             fluid.io.save_inference_model(save_dirname, [
@@ -248,23 +209,18 @@ def train(use_cuda, save_dirname=None, is_local=True):
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
-        t.transpile(
-            optimize_ops,
-            params_grads,
-            trainer_id,
-            pservers=pserver_endpoints,
-            trainers=trainers)
+        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":
             pserver_prog = t.get_pserver_program(current_endpoint)
             pserver_startup = t.get_startup_program(current_endpoint,
@@ -291,23 +247,67 @@ def infer(use_cuda, save_dirname=None):
         [inference_program, feed_target_names,
          fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
-        lod = [0, 4, 10]
-        word = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        pred = create_random_lodtensor(
-            lod, place, low=0, high=pred_dict_len - 1)
-        ctx_n2 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_n1 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_0 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_p1 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        ctx_p2 = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-        mark = create_random_lodtensor(
-            lod, place, low=0, high=mark_dict_len - 1)
+        # Setup input by creating LoDTensor to represent sequence of words.
+        # Here each word is the basic element of the LoDTensor and the shape of 
+        # each word (base_shape) should be [1] since it is simply an index to 
+        # look up for the corresponding word vector.
+        # Suppose the recursive_sequence_lengths info is set to [[3, 4, 2]],
+        # which has only one level of detail. Then the created LoDTensor will have only 
+        # one higher level structure (sequence of words, or sentence) than the basic 
+        # element (word). Hence the LoDTensor will hold data for three sentences of 
+        # length 3, 4 and 2, respectively. 
+        # Note that recursive_sequence_lengths should be a list of lists.
+        recursive_seq_lens = [[3, 4, 2]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        word = fluid.create_random_int_lodtensor(
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
+        pred = fluid.create_random_int_lodtensor(
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=pred_dict_len - 1)
+        ctx_n2 = fluid.create_random_int_lodtensor(
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
+        ctx_n1 = fluid.create_random_int_lodtensor(
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
+        ctx_0 = fluid.create_random_int_lodtensor(
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
+        ctx_p1 = fluid.create_random_int_lodtensor(
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
+        ctx_p2 = fluid.create_random_int_lodtensor(
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=word_dict_len - 1)
+        mark = fluid.create_random_int_lodtensor(
+            recursive_seq_lens,
+            base_shape,
+            place,
+            low=0,
+            high=mark_dict_len - 1)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
@@ -333,7 +333,7 @@ def infer(use_cuda, save_dirname=None):
                           },
                           fetch_list=fetch_targets,
                           return_numpy=False)
-        print(results[0].lod())
+        print(results[0].recursive_sequence_lengths())
         np_data = np.array(results[0])
         print("Inference Shape: ", np_data.shape)
 
diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py
index fa38bd3762423497b82c3b421b3a1db4cd87525b..90c301a66105d8d872ee531556c5060b5d727515 100644
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
@@ -14,7 +14,7 @@
 import contextlib
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
 import paddle.fluid.layers as pd
@@ -108,7 +108,7 @@ def decoder_decode(context, is_sparse):
         pre_state = pd.array_read(array=state_array, i=counter)
         pre_score = pd.array_read(array=scores_array, i=counter)
 
-        # expand the lod of pre_state to be the same with pre_score
+        # expand the recursive_sequence_lengths of pre_state to be the same with pre_score
         pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
 
         pre_ids_emb = pd.embedding(
@@ -118,17 +118,27 @@ def decoder_decode(context, is_sparse):
             is_sparse=is_sparse)
 
         # use rnn unit to update rnn
-        current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
+        current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb],
                               size=decoder_size,
                               act='tanh')
-
+        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
         # use score to do beam search
-        current_score = pd.fc(input=current_state,
+        current_score = pd.fc(input=current_state_with_lod,
                               size=target_dict_dim,
                               act='softmax')
-        topk_scores, topk_indices = pd.topk(current_score, k=50)
+        topk_scores, topk_indices = pd.topk(current_score, k=beam_size)
+        # calculate accumulated scores after topk to reduce computation cost
+        accu_scores = pd.elementwise_add(
+            x=pd.log(topk_scores), y=pd.reshape(
+                pre_score, shape=[-1]), axis=0)
         selected_ids, selected_scores = pd.beam_search(
-            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
+            pre_ids,
+            pre_score,
+            topk_indices,
+            accu_scores,
+            beam_size,
+            end_id=10,
+            level=0)
 
         pd.increment(x=counter, value=1, in_place=True)
 
@@ -137,38 +147,20 @@ def decoder_decode(context, is_sparse):
         pd.array_write(selected_ids, array=ids_array, i=counter)
         pd.array_write(selected_scores, array=scores_array, i=counter)
 
-        pd.less_than(x=counter, y=array_len, cond=cond)
+        # update the break condition: up to the max length or all candidates of
+        # source sentences have ended.
+        length_cond = pd.less_than(x=counter, y=array_len)
+        finish_cond = pd.logical_not(pd.is_empty(x=selected_ids))
+        pd.logical_and(x=length_cond, y=finish_cond, out=cond)
 
     translation_ids, translation_scores = pd.beam_search_decode(
-        ids=ids_array, scores=scores_array)
+        ids=ids_array, scores=scores_array, beam_size=beam_size, end_id=10)
 
     # return init_ids, init_scores
 
     return translation_ids, translation_scores
 
 
-def set_init_lod(data, lod, place):
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod(lod)
-    return res
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train_main(use_cuda, is_sparse, is_local=True):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
@@ -185,30 +177,32 @@ def train_main(use_cuda, is_sparse, is_local=True):
         learning_rate=1e-4,
         regularization=fluid.regularizer.L2DecayRegularizer(
             regularization_coeff=0.1))
-    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+    optimizer.minimize(avg_cost)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
 
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
     exe = Executor(place)
 
     def train_loop(main_program):
         exe.run(framework.default_startup_program())
 
+        feed_list = [
+            main_program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder = fluid.DataFeeder(feed_list, place)
+
         batch_id = 0
         for pass_id in xrange(1):
             for data in train_data():
-                word_data = to_lodtensor(map(lambda x: x[0], data), place)
-                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
                 outs = exe.run(main_program,
-                               feed={
-                                   'src_word_id': word_data,
-                                   'target_language_word': trg_word,
-                                   'target_language_next_word': trg_word_next
-                               },
+                               feed=feeder.feed(data),
                                fetch_list=[avg_cost])
                 avg_cost_val = np.array(outs[0])
                 print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
@@ -220,23 +214,18 @@ def train_main(use_cuda, is_sparse, is_local=True):
     if is_local:
         train_loop(framework.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
-        t.transpile(
-            optimize_ops,
-            params_grads,
-            trainer_id,
-            pservers=pserver_endpoints,
-            trainers=trainers)
+        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":
             pserver_prog = t.get_pserver_program(current_endpoint)
             pserver_startup = t.get_startup_program(current_endpoint,
@@ -263,29 +252,37 @@ def decode_main(use_cuda, is_sparse):
         [1. for _ in range(batch_size)], dtype='float32')
     init_ids_data = init_ids_data.reshape((batch_size, 1))
     init_scores_data = init_scores_data.reshape((batch_size, 1))
-    init_lod = [i for i in range(batch_size)] + [batch_size]
-    init_lod = [init_lod, init_lod]
+    init_recursive_seq_lens = [1] * batch_size
+    init_recursive_seq_lens = [init_recursive_seq_lens, init_recursive_seq_lens]
+
+    init_ids = fluid.create_lod_tensor(init_ids_data, init_recursive_seq_lens,
+                                       place)
+    init_scores = fluid.create_lod_tensor(init_scores_data,
+                                          init_recursive_seq_lens, place)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
-    for _, data in enumerate(train_data()):
-        init_ids = set_init_lod(init_ids_data, init_lod, place)
-        init_scores = set_init_lod(init_scores_data, init_lod, place)
 
-        src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
+    feed_order = ['src_word_id']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    for data in train_data():
+        feed_dict = feeder.feed(map(lambda x: [x[0]], data))
+        feed_dict['init_ids'] = init_ids
+        feed_dict['init_scores'] = init_scores
 
         result_ids, result_scores = exe.run(
             framework.default_main_program(),
-            feed={
-                'src_word_id': src_word_data,
-                'init_ids': init_ids,
-                'init_scores': init_scores
-            },
+            feed=feed_dict,
             fetch_list=[translation_ids, translation_scores],
             return_numpy=False)
-        print result_ids.lod()
+        print result_ids.recursive_sequence_lengths()
         break
 
 
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index e85b97a7f430b6d752baa179f27a7d15bc4d9a81..5f5c8544bbdb87421f129b201a0ebaf4cb8602a1 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -14,7 +14,7 @@
 from __future__ import print_function
 import argparse
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import sys
 import numpy
 import unittest
@@ -94,8 +94,8 @@ def train(nn_type,
 
     test_program = fluid.default_main_program().clone(for_test=True)
 
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-    optimize_ops, params_grads = optimizer.minimize(avg_loss)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
+    optimizer.minimize(avg_loss)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
@@ -151,24 +151,18 @@ def train(nn_type,
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        pserver_endpoints = os.getenv("PSERVERS")
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
-        t.transpile(
-            optimize_ops,
-            params_grads,
-            trainer_id,
-            pservers=pserver_endpoints,
-            trainers=trainers)
+        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":
             pserver_prog = t.get_pserver_program(current_endpoint)
             pserver_startup = t.get_startup_program(current_endpoint,
diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py
index 2ce66d32c993672793b0db213267d1f80b5c49dd..6548766ef5d0162b50d4dd072e8e91dd95dc5d2b 100644
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
@@ -16,7 +16,7 @@ import math
 import sys
 import os
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.framework as framework
 import paddle.fluid.layers as layers
@@ -160,7 +160,7 @@ def train(use_cuda, save_dirname, is_local=True):
     test_program = fluid.default_main_program().clone(for_test=True)
 
     sgd_optimizer = SGDOptimizer(learning_rate=0.2)
-    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+    sgd_optimizer.minimize(avg_cost)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
@@ -173,63 +173,33 @@ def train(use_cuda, save_dirname, is_local=True):
     test_reader = paddle.batch(
         paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)
 
-    feeding = {
-        'user_id': 0,
-        'gender_id': 1,
-        'age_id': 2,
-        'job_id': 3,
-        'movie_id': 4,
-        'category_id': 5,
-        'movie_title': 6,
-        'score': 7
-    }
-
-    def func_feed(feeding, data):
-        feed_tensors = {}
-        for (key, idx) in feeding.iteritems():
-            tensor = fluid.LoDTensor()
-            if key != "category_id" and key != "movie_title":
-                if key == "score":
-                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
-                        "float32")
-                else:
-                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
-                        "int64")
-            else:
-                numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
-                                 data)
-                lod_info = [len(item) for item in numpy_data]
-                offset = 0
-                lod = [offset]
-                for item in lod_info:
-                    offset += item
-                    lod.append(offset)
-                numpy_data = np.concatenate(numpy_data, axis=0)
-                tensor.set_lod([lod])
-
-            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
-            tensor.set(numpy_data, place)
-            feed_tensors[key] = tensor
-        return feed_tensors
+    feed_order = [
+        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
+        'movie_title', 'score'
+    ]
 
     def train_loop(main_program):
         exe.run(framework.default_startup_program())
 
+        feed_list = [
+            main_program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder = fluid.DataFeeder(feed_list, place)
+
         PASS_NUM = 100
         for pass_id in range(PASS_NUM):
             for batch_id, data in enumerate(train_reader()):
                 # train a mini-batch
                 outs = exe.run(program=main_program,
-                               feed=func_feed(feeding, data),
+                               feed=feeder.feed(data),
                                fetch_list=[avg_cost])
                 out = np.array(outs[0])
                 if (batch_id + 1) % 10 == 0:
                     avg_cost_set = []
                     for test_data in test_reader():
-                        avg_cost_np = exe.run(
-                            program=test_program,
-                            feed=func_feed(feeding, test_data),
-                            fetch_list=[avg_cost])
+                        avg_cost_np = exe.run(program=test_program,
+                                              feed=feeder.feed(test_data),
+                                              fetch_list=[avg_cost])
                         avg_cost_set.append(avg_cost_np[0])
                         break  # test only 1 segment for speeding up CI
 
@@ -250,23 +220,18 @@ def train(use_cuda, save_dirname, is_local=True):
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
-        t.transpile(
-            optimize_ops,
-            params_grads,
-            trainer_id,
-            pservers=pserver_endpoints,
-            trainers=trainers)
+        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":
             pserver_prog = t.get_pserver_program(current_endpoint)
             pserver_startup = t.get_startup_program(current_endpoint,
@@ -284,23 +249,6 @@ def infer(use_cuda, save_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
 
-    def create_lod_tensor(data, lod=None):
-        tensor = fluid.LoDTensor()
-        if lod is None:
-            # Tensor, the shape is [batch_size, 1]
-            index = 0
-            lod_0 = [index]
-            for l in range(len(data)):
-                index += 1
-                lod_0.append(index)
-            lod = [lod_0]
-        tensor.set_lod(lod)
-
-        flattened_data = np.concatenate(data, axis=0).astype("int64")
-        flattened_data = flattened_data.reshape([len(flattened_data), 1])
-        tensor.set(flattened_data, place)
-        return tensor
-
     inference_scope = fluid.core.Scope()
     with fluid.scope_guard(inference_scope):
         # Use fluid.io.load_inference_model to obtain the inference program desc,
@@ -312,26 +260,35 @@ def infer(use_cuda, save_dirname=None):
 
         # Use the first data from paddle.dataset.movielens.test() as input
         assert feed_target_names[0] == "user_id"
-        user_id = create_lod_tensor([[1]])
+        # Use create_lod_tensor(data, recursive_sequence_lengths, place) API 
+        # to generate LoD Tensor where `data` is a list of sequences of index 
+        # numbers, `recursive_sequence_lengths` is the length-based level of detail 
+        # (lod) info associated with `data`.
+        # For example, data = [[10, 2, 3], [2, 3]] means that it contains
+        # two sequences of indexes, of length 3 and 2, respectively.
+        # Correspondingly, recursive_sequence_lengths = [[3, 2]] contains one 
+        # level of detail info, indicating that `data` consists of two sequences 
+        # of length 3 and 2, respectively. 
+        user_id = fluid.create_lod_tensor([[1]], [[1]], place)
 
         assert feed_target_names[1] == "gender_id"
-        gender_id = create_lod_tensor([[1]])
+        gender_id = fluid.create_lod_tensor([[1]], [[1]], place)
 
         assert feed_target_names[2] == "age_id"
-        age_id = create_lod_tensor([[0]])
+        age_id = fluid.create_lod_tensor([[0]], [[1]], place)
 
         assert feed_target_names[3] == "job_id"
-        job_id = create_lod_tensor([[10]])
+        job_id = fluid.create_lod_tensor([[10]], [[1]], place)
 
         assert feed_target_names[4] == "movie_id"
-        movie_id = create_lod_tensor([[783]])
+        movie_id = fluid.create_lod_tensor([[783]], [[1]], place)
 
         assert feed_target_names[5] == "category_id"
-        category_id = create_lod_tensor([[10], [8], [9]], [[0, 3]])
+        category_id = fluid.create_lod_tensor([[10, 8, 9]], [[3]], place)
 
         assert feed_target_names[6] == "movie_title"
-        movie_title = create_lod_tensor([[1069], [4140], [2923], [710], [988]],
-                                        [[0, 5]])
+        movie_title = fluid.create_lod_tensor([[1069, 4140, 2923, 710, 988]],
+                                              [[5]], place)
 
         # Construct feed as a dictionary of {feed_target_name: feed_target_data}
         # and results will contain a list of data corresponding to fetch_targets.
diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..467282624154086a874b0e73736ed5b1358915ff
--- /dev/null
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -0,0 +1,281 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import paddle.fluid.framework as framework
+import paddle.fluid.layers as layers
+import contextlib
+import math
+import sys
+import unittest
+from paddle.fluid.executor import Executor
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+embedding_dim = 16
+batch_size = 10
+max_length = 50
+topk_size = 50
+encoder_size = decoder_size = hidden_dim
+IS_SPARSE = True
+USE_PEEPHOLES = False
+
+
+def bi_lstm_encoder(input_seq, hidden_size):
+    input_forward_proj = fluid.layers.fc(input=input_seq,
+                                         size=hidden_size * 4,
+                                         bias_attr=True)
+    forward, _ = fluid.layers.dynamic_lstm(
+        input=input_forward_proj,
+        size=hidden_size * 4,
+        use_peepholes=USE_PEEPHOLES)
+    input_backward_proj = fluid.layers.fc(input=input_seq,
+                                          size=hidden_size * 4,
+                                          bias_attr=True)
+    backward, _ = fluid.layers.dynamic_lstm(
+        input=input_backward_proj,
+        size=hidden_size * 4,
+        is_reverse=True,
+        use_peepholes=USE_PEEPHOLES)
+
+    forward_last = fluid.layers.sequence_last_step(input=forward)
+    backward_first = fluid.layers.sequence_first_step(input=backward)
+
+    return forward_last, backward_first
+
+
+# FIXME(peterzhang2029): Replace this function with the lstm_unit_op.
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+    def linear(inputs):
+        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
+    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+    cell_t = fluid.layers.sums(input=[
+        fluid.layers.elementwise_mul(
+            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+                x=input_gate, y=cell_tilde)
+    ])
+
+    hidden_t = fluid.layers.elementwise_mul(
+        x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+    return hidden_t, cell_t
+
+
+def lstm_decoder_without_attention(target_embedding, decoder_boot, context,
+                                   decoder_size):
+    rnn = fluid.layers.DynamicRNN()
+
+    cell_init = fluid.layers.fill_constant_batch_size_like(
+        input=decoder_boot,
+        value=0.0,
+        shape=[-1, decoder_size],
+        dtype='float32')
+    cell_init.stop_gradient = False
+
+    with rnn.block():
+        current_word = rnn.step_input(target_embedding)
+        context = rnn.static_input(context)
+
+        hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+        cell_mem = rnn.memory(init=cell_init)
+        decoder_inputs = fluid.layers.concat(
+            input=[context, current_word], axis=1)
+        h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+        rnn.update_memory(hidden_mem, h)
+        rnn.update_memory(cell_mem, c)
+        out = fluid.layers.fc(input=h,
+                              size=target_dict_dim,
+                              bias_attr=True,
+                              act='softmax')
+        rnn.output(out)
+    return rnn()
+
+
+def seq_to_seq_net():
+    """Construct a seq2seq network."""
+
+    src_word_idx = fluid.layers.data(
+        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    src_embedding = fluid.layers.embedding(
+        input=src_word_idx,
+        size=[source_dict_dim, embedding_dim],
+        dtype='float32')
+
+    src_forward_last, src_backward_first = bi_lstm_encoder(
+        input_seq=src_embedding, hidden_size=encoder_size)
+
+    encoded_vector = fluid.layers.concat(
+        input=[src_forward_last, src_backward_first], axis=1)
+
+    decoder_boot = fluid.layers.fc(input=src_backward_first,
+                                   size=decoder_size,
+                                   bias_attr=False,
+                                   act='tanh')
+
+    trg_word_idx = fluid.layers.data(
+        name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    trg_embedding = fluid.layers.embedding(
+        input=trg_word_idx,
+        size=[target_dict_dim, embedding_dim],
+        dtype='float32')
+
+    prediction = lstm_decoder_without_attention(trg_embedding, decoder_boot,
+                                                encoded_vector, decoder_size)
+    label = fluid.layers.data(
+        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(cost)
+
+    return avg_cost, prediction
+
+
+def train(use_cuda, save_dirname=None):
+    [avg_cost, prediction] = seq_to_seq_net()
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    feed_order = ['source_sequence', 'target_sequence', 'label_sequence']
+    feed_list = [
+        framework.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
+    batch_id = 0
+    for pass_id in xrange(2):
+        for data in train_data():
+            outs = exe.run(framework.default_main_program(),
+                           feed=feeder.feed(data),
+                           fetch_list=[avg_cost])
+
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if math.isnan(float(avg_cost_val[0])):
+                sys.exit("got NaN loss, training failed.")
+            if batch_id > 3:
+                if save_dirname is not None:
+                    fluid.io.save_inference_model(
+                        save_dirname, ['source_sequence',
+                                       'target_sequence'], [prediction], exe)
+                return
+
+            batch_id += 1
+
+
+def infer(use_cuda, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be feeded
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+        # Setup input by creating LoDTensor to represent sequence of words.
+        # Here each word is the basic element of the LoDTensor and the shape of 
+        # each word (base_shape) should be [1] since it is simply an index to 
+        # look up for the corresponding word vector.
+        # Suppose the recursive_sequence_lengths info is set to [[4, 6]],
+        # which has only one level of detail. Then the created LoDTensor will have only 
+        # one higher level structure (sequence of words, or sentence) than the basic 
+        # element (word). Hence the LoDTensor will hold data for two sentences of 
+        # length 4 and 6, respectively. 
+        # Note that recursive_sequence_lengths should be a list of lists.
+        recursive_seq_lens = [[4, 6]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        word_data = fluid.create_random_int_lodtensor(
+            recursive_seq_lens, base_shape, place, low=0, high=1)
+        trg_word = fluid.create_random_int_lodtensor(
+            recursive_seq_lens, base_shape, place, low=0, high=1)
+
+        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+        # and results will contain a list of data corresponding to fetch_targets.
+        assert feed_target_names[0] == 'source_sequence'
+        assert feed_target_names[1] == 'target_sequence'
+        results = exe.run(inference_program,
+                          feed={
+                              feed_target_names[0]: word_data,
+                              feed_target_names[1]: trg_word,
+                          },
+                          fetch_list=fetch_targets,
+                          return_numpy=False)
+        print(results[0].recursive_sequence_lengths())
+        np_data = np.array(results[0])
+        print("Inference shape: ", np_data.shape)
+        print("Inference results: ", np_data)
+
+
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    # Directory for saving the trained model
+    save_dirname = "rnn_encoder_decoder.inference.model"
+
+    train(use_cuda, save_dirname)
+    infer(use_cuda, save_dirname)
+
+
+class TestRnnEncoderDecoder(unittest.TestCase):
+    def test_cuda(self):
+        with self.scope_prog_guard():
+            main(use_cuda=True)
+
+    def test_cpu(self):
+        with self.scope_prog_guard():
+            main(use_cuda=False)
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_understand_sentiment.py b/python/paddle/fluid/tests/book/test_understand_sentiment.py
deleted file mode 100644
index d2f3f7404697feb0768f873070b97aeb3ba0cd64..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/test_understand_sentiment.py
+++ /dev/null
@@ -1,379 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.v2 as paddle
-import contextlib
-import math
-import numpy as np
-import sys
-import os
-
-
-def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
-                    hid_dim=32):
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-    conv_3 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=3,
-        act="tanh",
-        pool_type="sqrt")
-    conv_4 = fluid.nets.sequence_conv_pool(
-        input=emb,
-        num_filters=hid_dim,
-        filter_size=4,
-        act="tanh",
-        pool_type="sqrt")
-    prediction = fluid.layers.fc(input=[conv_3, conv_4],
-                                 size=class_dim,
-                                 act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, prediction
-
-
-def dyn_rnn_lstm(data, label, input_dim, class_dim=2, emb_dim=32,
-                 lstm_size=128):
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-    sentence = fluid.layers.fc(input=emb, size=lstm_size, act='tanh')
-
-    rnn = fluid.layers.DynamicRNN()
-    with rnn.block():
-        word = rnn.step_input(sentence)
-        prev_hidden = rnn.memory(value=0.0, shape=[lstm_size])
-        prev_cell = rnn.memory(value=0.0, shape=[lstm_size])
-
-        def gate_common(ipt, hidden, size):
-            gate0 = fluid.layers.fc(input=ipt, size=size, bias_attr=True)
-            gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
-            return gate0 + gate1
-
-        forget_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                         lstm_size))
-        input_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                        lstm_size))
-        output_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                         lstm_size))
-        cell_gate = fluid.layers.sigmoid(x=gate_common(word, prev_hidden,
-                                                       lstm_size))
-
-        cell = forget_gate * prev_cell + input_gate * cell_gate
-        hidden = output_gate * fluid.layers.tanh(x=cell)
-        rnn.update_memory(prev_cell, cell)
-        rnn.update_memory(prev_hidden, hidden)
-        rnn.output(hidden)
-
-    last = fluid.layers.sequence_last_step(rnn())
-    prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax")
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, prediction
-
-
-def stacked_lstm_net(data,
-                     label,
-                     input_dim,
-                     class_dim=2,
-                     emb_dim=128,
-                     hid_dim=512,
-                     stacked_num=3):
-    assert stacked_num % 2 == 1
-
-    emb = fluid.layers.embedding(
-        input=data, size=[input_dim, emb_dim], is_sparse=True)
-    # add bias attr
-
-    # TODO(qijun) linear act
-    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
-    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
-
-    inputs = [fc1, lstm1]
-
-    for i in range(2, stacked_num + 1):
-        fc = fluid.layers.fc(input=inputs, size=hid_dim)
-        lstm, cell = fluid.layers.dynamic_lstm(
-            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
-        inputs = [fc, lstm]
-
-    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
-    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
-
-    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
-                                 size=class_dim,
-                                 act='softmax')
-    cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(cost)
-    accuracy = fluid.layers.accuracy(input=prediction, label=label)
-    return avg_cost, accuracy, prediction
-
-
-def create_random_lodtensor(lod, place, low, high):
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
-def train(word_dict,
-          net_method,
-          use_cuda,
-          parallel=False,
-          save_dirname=None,
-          is_local=True):
-    BATCH_SIZE = 128
-    PASS_NUM = 5
-    dict_dim = len(word_dict)
-    class_dim = 2
-
-    data = fluid.layers.data(
-        name="words", shape=[1], dtype="int64", lod_level=1)
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-
-    if not parallel:
-        cost, acc_out, prediction = net_method(
-            data, label, input_dim=dict_dim, class_dim=class_dim)
-    else:
-        places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            cost, acc, _ = net_method(
-                pd.read_input(data),
-                pd.read_input(label),
-                input_dim=dict_dim,
-                class_dim=class_dim)
-            pd.write_output(cost)
-            pd.write_output(acc)
-
-        cost, acc = pd()
-        cost = fluid.layers.mean(cost)
-        acc_out = fluid.layers.mean(acc)
-        prediction = None
-        assert save_dirname is None
-
-    adagrad = fluid.optimizer.Adagrad(learning_rate=0.002)
-    optimize_ops, params_grads = adagrad.minimize(cost)
-
-    train_data = paddle.batch(
-        paddle.reader.shuffle(
-            paddle.dataset.imdb.train(word_dict), buf_size=1000),
-        batch_size=BATCH_SIZE)
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
-
-    def train_loop(main_program):
-        exe.run(fluid.default_startup_program())
-
-        for pass_id in xrange(PASS_NUM):
-            for data in train_data():
-                cost_val, acc_val = exe.run(main_program,
-                                            feed=feeder.feed(data),
-                                            fetch_list=[cost, acc_out])
-                print("cost=" + str(cost_val) + " acc=" + str(acc_val))
-                if cost_val < 0.4 and acc_val > 0.8:
-                    if save_dirname is not None:
-                        fluid.io.save_inference_model(save_dirname, ["words"],
-                                                      prediction, exe)
-                    return
-                if math.isnan(float(cost_val)):
-                    sys.exit("got NaN loss, training failed.")
-        raise AssertionError("Cost is too large for {0}".format(
-            net_method.__name__))
-
-    if is_local:
-        train_loop(fluid.default_main_program())
-    else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
-        eplist = []
-        for ip in pserver_ips.split(","):
-            eplist.append(':'.join([ip, port]))
-        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
-        current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            optimize_ops,
-            params_grads,
-            trainer_id,
-            pservers=pserver_endpoints,
-            trainers=trainers)
-        if training_role == "PSERVER":
-            pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint,
-                                                    pserver_prog)
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif training_role == "TRAINER":
-            train_loop(t.get_trainer_program())
-
-
-def infer(word_dict, use_cuda, save_dirname=None):
-    if save_dirname is None:
-        return
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    inference_scope = fluid.core.Scope()
-    with fluid.scope_guard(inference_scope):
-        # Use fluid.io.load_inference_model to obtain the inference program desc,
-        # the feed_target_names (the names of variables that will be feeded
-        # data using feed operators), and the fetch_targets (variables that
-        # we want to obtain data from using fetch operators).
-        [inference_program, feed_target_names,
-         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
-
-        word_dict_len = len(word_dict)
-
-        lod = [0, 4, 10]
-        tensor_words = create_random_lodtensor(
-            lod, place, low=0, high=word_dict_len - 1)
-
-        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
-        # and results will contain a list of data corresponding to fetch_targets.
-        assert feed_target_names[0] == "words"
-        results = exe.run(inference_program,
-                          feed={feed_target_names[0]: tensor_words},
-                          fetch_list=fetch_targets,
-                          return_numpy=False)
-        print(results[0].lod())
-        np_data = np.array(results[0])
-        print("Inference Shape: ", np_data.shape)
-        print("Inference results: ", np_data)
-
-
-def main(word_dict, net_method, use_cuda, parallel=False, save_dirname=None):
-    if use_cuda and not fluid.core.is_compiled_with_cuda():
-        return
-
-    train(
-        word_dict,
-        net_method,
-        use_cuda,
-        parallel=parallel,
-        save_dirname=save_dirname)
-    infer(word_dict, use_cuda, save_dirname)
-
-
-class TestUnderstandSentiment(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.word_dict = paddle.dataset.imdb.word_dict()
-
-    @contextlib.contextmanager
-    def new_program_scope(self):
-        prog = fluid.Program()
-        startup_prog = fluid.Program()
-        scope = fluid.core.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(prog, startup_prog):
-                yield
-
-    def test_conv_cpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=False,
-                save_dirname="understand_sentiment_conv.inference.model")
-
-    def test_conv_cpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=False,
-                parallel=True)
-
-    @unittest.skip(reason="make CI faster")
-    def test_stacked_lstm_cpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=False,
-                save_dirname="understand_sentiment_stacked_lstm.inference.model")
-
-    def test_stacked_lstm_cpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=False,
-                parallel=True)
-
-    def test_conv_gpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=True,
-                save_dirname="understand_sentiment_conv.inference.model")
-
-    def test_conv_gpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=convolution_net,
-                use_cuda=True,
-                parallel=True)
-
-    @unittest.skip(reason="make CI faster")
-    def test_stacked_lstm_gpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=True,
-                save_dirname="understand_sentiment_stacked_lstm.inference.model")
-
-    def test_stacked_lstm_gpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=stacked_lstm_net,
-                use_cuda=True,
-                parallel=True)
-
-    @unittest.skip(reason='make CI faster')
-    def test_dynrnn_lstm_gpu(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=dyn_rnn_lstm,
-                use_cuda=True,
-                parallel=False)
-
-    def test_dynrnn_lstm_gpu_parallel(self):
-        with self.new_program_scope():
-            main(
-                self.word_dict,
-                net_method=dyn_rnn_lstm,
-                use_cuda=True,
-                parallel=True)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py
index 26b97c3e254f54b83515436660e44d4908c98fbe..49bd72c7a53c0ae740bdbabe15b1d37340699d41 100644
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import unittest
 import os
@@ -21,15 +21,6 @@ import math
 import sys
 
 
-def create_random_lodtensor(lod, place, low, high):
-    # The range of data elements is [low, high]
-    data = np.random.random_integers(low, high, [lod[-1], 1]).astype("int64")
-    res = fluid.LoDTensor()
-    res.set(data, place)
-    res.set_lod([lod])
-    return res
-
-
 def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
     PASS_NUM = 100
     EMBED_SIZE = 32
@@ -101,7 +92,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
         avg_cost = fluid.layers.mean(pd())
 
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+    sgd_optimizer.minimize(avg_cost)
 
     train_reader = paddle.batch(
         paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
@@ -134,23 +125,18 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
     if is_local:
         train_loop(fluid.default_main_program())
     else:
-        port = os.getenv("PADDLE_INIT_PORT", "6174")
-        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
         eplist = []
         for ip in pserver_ips.split(","):
             eplist.append(':'.join([ip, port]))
         pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
-        trainers = int(os.getenv("TRAINERS"))
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
         current_endpoint = os.getenv("POD_IP") + ":" + port
-        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
-        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
         t = fluid.DistributeTranspiler()
-        t.transpile(
-            optimize_ops,
-            params_grads,
-            trainer_id,
-            pservers=pserver_endpoints,
-            trainers=trainers)
+        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
         if training_role == "PSERVER":
             pserver_prog = t.get_pserver_program(current_endpoint)
             pserver_startup = t.get_startup_program(current_endpoint,
@@ -180,16 +166,24 @@ def infer(use_cuda, save_dirname=None):
         word_dict = paddle.dataset.imikolov.build_dict()
         dict_size = len(word_dict)
 
-        # Setup inputs, by creating 4 words, the lod of which should be [0, 1]
-        lod = [0, 1]
-        first_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
-        second_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
-        third_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
-        fourth_word = create_random_lodtensor(
-            lod, place, low=0, high=dict_size - 1)
+        # Setup inputs by creating 4 LoDTensors representing 4 words. Here each word 
+        # is simply an index to look up for the corresponding word vector and hence 
+        # the shape of word (base_shape) should be [1]. The recursive_sequence_lengths, 
+        # which is length-based level of detail (lod) of each LoDTensor, should be [[1]] 
+        # meaning there is only one level of detail and there is only one sequence of 
+        # one word on this level.
+        # Note that recursive_sequence_lengths should be a list of lists.
+        recursive_seq_lens = [[1]]
+        base_shape = [1]
+        # The range of random integers is [low, high]
+        first_word = fluid.create_random_int_lodtensor(
+            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
+        second_word = fluid.create_random_int_lodtensor(
+            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
+        third_word = fluid.create_random_int_lodtensor(
+            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
+        fourth_word = fluid.create_random_int_lodtensor(
+            recursive_seq_lens, base_shape, place, low=0, high=dict_size - 1)
 
         assert feed_target_names[0] == 'firstw'
         assert feed_target_names[1] == 'secondw'
@@ -207,7 +201,7 @@ def infer(use_cuda, save_dirname=None):
                           },
                           fetch_list=fetch_targets,
                           return_numpy=False)
-        print(results[0].lod())
+        print(results[0].recursive_sequence_lengths())
         np_data = np.array(results[0])
         print("Inference Shape: ", np_data.shape)
 
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
index ad79e96b958b36a06c8a3cc990dbe3608e32c9ac..be347cd5315668dde0454d7959dbf9bcfa465b5f 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import math
 import sys
@@ -56,7 +56,7 @@ BATCH_SIZE = 200
 
 # fix the order of training data
 train_reader = paddle.batch(
-    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False)
 
 # train_reader = paddle.batch(
 #     paddle.reader.shuffle(
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
index 204669d7e6176e9e8250e8aebc2d10441fa24b67..dfebb9a06ea4f290f128c486dcaccaeccdcef8c4 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import sys
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import math
 import sys
diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
index a24834a6f0b19d1265f6c8d7089d31583af82d1f..fa696acdfa9058af14f0bd34ce1a2980db5aeafc 100644
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.framework as framework
@@ -80,21 +80,6 @@ def encoder_decoder():
     return rnn()
 
 
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
 def main():
     rnn_out = encoder_decoder()
     label = layers.data(
@@ -122,18 +107,21 @@ def main():
 
     exe.run(framework.default_startup_program())
 
+    feed_order = [
+        'src_word_id', 'target_language_word', 'target_language_next_word'
+    ]
+
+    feed_list = [
+        fluid.default_main_program().global_block().var(var_name)
+        for var_name in feed_order
+    ]
+    feeder = fluid.DataFeeder(feed_list, place)
+
     batch_id = 0
     for pass_id in xrange(10):
         for data in train_data():
-            word_data = to_lodtensor(map(lambda x: x[0], data), place)
-            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
-            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
             outs = exe.run(fluid.default_main_program(),
-                           feed={
-                               'src_word_id': word_data,
-                               'target_language_word': trg_word,
-                               'target_language_next_word': trg_word_next
-                           },
+                           feed=feeder.feed(data),
                            fetch_list=[avg_cost])
             avg_cost_val = np.array(outs[0])
             print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py
index 7452ea2a34aa0c75d8e0990639b29705033af98b..8ea1b2b15cc0c0eb5bca67a9c5a6ac6c6774e7e2 100644
--- a/python/paddle/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/fluid/tests/demo/fc_gan.py
@@ -19,7 +19,7 @@ import os
 import matplotlib
 import numpy
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 
 matplotlib.use('Agg')
diff --git a/python/paddle/fluid/tests/demo/text_classification/.gitignore b/python/paddle/fluid/tests/demo/text_classification/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..780d05b94667d3ea726e37bf9cf1b5b2baeff354
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/text_classification/.gitignore
@@ -0,0 +1 @@
+*.recordio
diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
new file mode 100644
index 0000000000000000000000000000000000000000..9425d472a48056e71da5da364f659971ef6c2520
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
@@ -0,0 +1,59 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+
+
+def load_vocab(filename):
+    """
+    load vocabulary
+    """
+    vocab = {}
+    with open(filename) as f:
+        wid = 0
+        for line in f:
+            vocab[line.strip()] = wid
+            wid += 1
+    return vocab
+
+
+# load word dict with paddle inner function
+word_dict = load_vocab(sys.argv[1])
+word_dict["<unk>"] = len(word_dict)
+print "Dict dim = ", len(word_dict)
+
+# input text data
+data = fluid.layers.data(name="words", shape=[1], dtype="int64", lod_level=1)
+
+# label data
+label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+# like placeholder
+feeder = fluid.DataFeeder(feed_list=[data, label], place=fluid.CPUPlace())
+
+# train data set
+BATCH_SIZE = 128
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.imdb.train(word_dict), buf_size=10000),
+    batch_size=BATCH_SIZE)
+
+test_reader = paddle.batch(
+    paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)
+
+fluid.recordio_writer.convert_reader_to_recordio_file(
+    "train.recordio", feeder=feeder, reader_creator=train_reader)
+fluid.recordio_writer.convert_reader_to_recordio_file(
+    "test.recordio", feeder=feeder, reader_creator=test_reader)
diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..e408684c6e0941a1b317ffeac66f071c1382836d
--- /dev/null
+++ b/python/paddle/fluid/tests/demo/text_classification/train.py
@@ -0,0 +1,148 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import numpy
+import sys
+
+TRAIN_FILES = ['train.recordio']
+TEST_FILES = ['test.recordio']
+
+DICT_DIM = 89528
+
+# embedding dim
+emb_dim = 128
+
+# hidden dim
+hid_dim = 128
+
+# hidden dim2
+hid_dim2 = 96
+
+# class num
+class_dim = 2
+
+
+def network_cfg(is_train, pass_num=100):
+    with fluid.unique_name.guard():
+        train_file_obj = fluid.layers.open_files(
+            filenames=TRAIN_FILES,
+            pass_num=pass_num,
+            shapes=[[-1, 1], [-1, 1]],
+            lod_levels=[1, 0],
+            dtypes=['int64', 'int64'],
+            thread_num=1)
+
+        test_file_obj = fluid.layers.open_files(
+            filenames=TEST_FILES,
+            pass_num=1,
+            shapes=[[-1, 1], [-1, 1]],
+            lod_levels=[1, 0],
+            dtypes=['int64', 'int64'],
+            thread_num=1)
+
+        if is_train:
+            file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000)
+        else:
+            file_obj = test_file_obj
+
+        file_obj = fluid.layers.double_buffer(
+            file_obj,
+            name="train_double_buffer" if is_train else 'test_double_buffer')
+
+        data, label = fluid.layers.read_file(file_obj)
+
+        emb = fluid.layers.embedding(input=data, size=[DICT_DIM, emb_dim])
+
+        # sequence conv with window size = 3
+        win_size = 3
+        conv_3 = fluid.nets.sequence_conv_pool(
+            input=emb,
+            num_filters=hid_dim,
+            filter_size=win_size,
+            act="tanh",
+            pool_type="max")
+
+        # fc layer after conv
+        fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2)
+
+        # probability of each class
+        prediction = fluid.layers.fc(input=[fc_1],
+                                     size=class_dim,
+                                     act="softmax")
+        # cross entropy loss
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+
+        # mean loss
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+
+        if is_train:
+            # SGD optimizer
+            sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)
+            sgd_optimizer.minimize(avg_cost)
+
+        return {
+            'loss': avg_cost,
+            'log': [avg_cost, acc],
+            'file': train_file_obj if is_train else test_file_obj
+        }
+
+
+def main():
+    train = fluid.Program()
+    startup = fluid.Program()
+
+    with fluid.program_guard(train, startup):
+        train_args = network_cfg(is_train=True)
+
+    test = fluid.Program()
+
+    with fluid.program_guard(test, fluid.Program()):
+        test_args = network_cfg(is_train=False)
+
+    # startup
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place=place)
+    exe.run(startup)
+
+    train_exe = fluid.ParallelExecutor(
+        use_cuda=True, loss_name=train_args['loss'].name, main_program=train)
+
+    fetch_var_list = [var.name for var in train_args['log']]
+    for i in xrange(sys.maxint):
+        result = map(numpy.array,
+                     train_exe.run(fetch_list=fetch_var_list
+                                   if i % 1000 == 0 else []))
+        if len(result) != 0:
+            print 'Train: ', result
+
+        if i % 1000 == 0:
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=True, main_program=test, share_vars_from=train_exe)
+            loss = []
+            acc = []
+            try:
+                while True:
+                    loss_np, acc_np = map(
+                        numpy.array, test_exe.run(fetch_list=fetch_var_list))
+                    loss.append(loss_np[0])
+                    acc.append(acc_np[0])
+            except:
+                test_args['file'].reset()
+                print 'TEST: ', numpy.mean(loss), numpy.mean(acc)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/fluid/tests/no_test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8f6cfb4a907b2c01e9662e7e9bf2cb0fbd6cb1b
--- /dev/null
+++ b/python/paddle/fluid/tests/no_test_concurrency.py
@@ -0,0 +1,258 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import framework, unique_name, layer_helper
+from paddle.fluid.executor import Executor
+from paddle.fluid.layers import fill_constant, assign, While, elementwise_add, Print
+
+
+class TestRoutineOp(unittest.TestCase):
+    def test_simple_routine(self):
+        ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+        # Create LOD_TENSOR<INT64> and put it into the scope.  This placeholder
+        # variable will be filled in and returned by fluid.channel_recv
+        result = self._create_tensor('return_value',
+                                     core.VarDesc.VarType.LOD_TENSOR,
+                                     core.VarDesc.VarType.INT64)
+
+        with fluid.Go():
+            input_value = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.FP64, value=1234)
+            fluid.channel_send(ch, input_value)
+
+        result, status = fluid.channel_recv(ch, result)
+        fluid.channel_close(ch)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+
+        outs = exe.run(fetch_list=[result])
+        self.assertEqual(outs[0], 1234)
+
+    def test_daisy_chain(self):
+        '''
+        Mimics classic Daisy-chain test:  https://talks.golang.org/2012/concurrency.slide#39
+        '''
+        n = 100
+
+        leftmost = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+        left = leftmost
+
+        # TODO(thuan): Use fluid.While() after scope capture is implemented.
+        # https://github.com/PaddlePaddle/Paddle/issues/8502
+        for i in range(n):
+            right = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+            with fluid.Go():
+                one_tensor = self._create_one_dim_tensor(1)
+                result = self._create_tensor('return_value',
+                                             core.VarDesc.VarType.LOD_TENSOR,
+                                             core.VarDesc.VarType.INT64)
+
+                result, status = fluid.channel_recv(right, result)
+                one_added = fluid.layers.elementwise_add(x=one_tensor, y=result)
+                fluid.channel_send(left, one_added)
+            left = right
+
+        # Trigger the channel propagation by sending a "1" to rightmost channel
+        with fluid.Go():
+            one_tensor = self._create_one_dim_tensor(1)
+            fluid.channel_send(right, one_tensor)
+
+        leftmost_result = self._create_tensor('return_value',
+                                              core.VarDesc.VarType.LOD_TENSOR,
+                                              core.VarDesc.VarType.INT64)
+        leftmost_result, status = fluid.channel_recv(leftmost, leftmost_result)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        leftmost_data = exe.run(fetch_list=[leftmost_result])
+
+        # The leftmost_data should be equal to the number of channels + 1
+        self.assertEqual(leftmost_data[0][0], n + 1)
+
+    def _create_one_dim_tensor(self, value):
+        one_dim_tensor = fill_constant(shape=[1], dtype='int', value=value)
+        one_dim_tensor.stop_gradient = True
+        return one_dim_tensor
+
+    def _create_tensor(self, name, type, dtype):
+        return framework.default_main_program().current_block().create_var(
+            name=unique_name.generate(name), type=type, dtype=dtype)
+
+    def _create_persistable_tensor(self, name, type, dtype):
+        return framework.default_main_program().current_block().create_var(
+            name=unique_name.generate(name),
+            type=type,
+            dtype=dtype,
+            persistable=True)
+
+    def test_select(self):
+        with framework.program_guard(framework.Program()):
+            ch1 = fluid.make_channel(
+                dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
+
+            result1 = self._create_tensor('return_value',
+                                          core.VarDesc.VarType.LOD_TENSOR,
+                                          core.VarDesc.VarType.FP64)
+
+            input_value = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.FP64, value=10)
+
+            with fluid.Select() as select:
+                with select.case(fluid.channel_send, ch1, input_value):
+                    # Execute something.
+                    pass
+
+                with select.default():
+                    pass
+
+            # This should not block because we are using a buffered channel.
+            result1, status = fluid.channel_recv(ch1, result1)
+            fluid.channel_close(ch1)
+
+            cpu = core.CPUPlace()
+            exe = Executor(cpu)
+
+            result = exe.run(fetch_list=[result1])
+            self.assertEqual(result[0][0], 10)
+
+    def test_fibonacci(self):
+        """
+        Mimics Fibonacci Go example: https://tour.golang.org/concurrency/5
+        """
+        with framework.program_guard(framework.Program()):
+            quit_ch_input_var = self._create_persistable_tensor(
+                'quit_ch_input', core.VarDesc.VarType.LOD_TENSOR,
+                core.VarDesc.VarType.INT32)
+            quit_ch_input = fill_constant(
+                shape=[1],
+                dtype=core.VarDesc.VarType.INT32,
+                value=0,
+                out=quit_ch_input_var)
+
+            result = self._create_persistable_tensor(
+                'result', core.VarDesc.VarType.LOD_TENSOR,
+                core.VarDesc.VarType.INT32)
+            fill_constant(
+                shape=[1],
+                dtype=core.VarDesc.VarType.INT32,
+                value=0,
+                out=result)
+
+            x = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+            y = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
+
+            while_cond = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
+
+            while_false = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
+
+            x_tmp = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+
+            def fibonacci(channel, quit_channel):
+                while_op = While(cond=while_cond)
+                with while_op.block():
+                    result2 = fill_constant(
+                        shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
+
+                    with fluid.Select() as select:
+                        with select.case(
+                                fluid.channel_send, channel, x, is_copy=True):
+                            assign(input=x, output=x_tmp)
+                            assign(input=y, output=x)
+                            assign(elementwise_add(x=x_tmp, y=y), output=y)
+
+                        with select.case(fluid.channel_recv, quit_channel,
+                                         result2):
+                            # Quit
+                            helper = layer_helper.LayerHelper('assign')
+                            helper.append_op(
+                                type='assign',
+                                inputs={'X': [while_false]},
+                                outputs={'Out': [while_cond]})
+
+            ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+            quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
+
+            with fluid.Go():
+                for i in xrange(10):
+                    fluid.channel_recv(ch1, result)
+                    Print(result)
+
+                fluid.channel_send(quit_ch, quit_ch_input)
+
+            fibonacci(ch1, quit_ch)
+
+            fluid.channel_close(ch1)
+            fluid.channel_close(quit_ch)
+
+            cpu = core.CPUPlace()
+            exe = Executor(cpu)
+
+            exe_result = exe.run(fetch_list=[result])
+            self.assertEqual(exe_result[0][0], 34)
+
+    def test_ping_pong(self):
+        """
+        Mimics Ping Pong example: https://gobyexample.com/channel-directions
+        """
+        with framework.program_guard(framework.Program()):
+            result = self._create_tensor('return_value',
+                                         core.VarDesc.VarType.LOD_TENSOR,
+                                         core.VarDesc.VarType.FP64)
+
+            ping_result = self._create_tensor('ping_return_value',
+                                              core.VarDesc.VarType.LOD_TENSOR,
+                                              core.VarDesc.VarType.FP64)
+
+            def ping(ch, message):
+                fluid.channel_send(ch, message, is_copy=True)
+
+            def pong(ch1, ch2):
+                fluid.channel_recv(ch1, ping_result)
+                fluid.channel_send(ch2, ping_result, is_copy=True)
+
+            pings = fluid.make_channel(
+                dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
+            pongs = fluid.make_channel(
+                dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
+
+            msg = fill_constant(
+                shape=[1], dtype=core.VarDesc.VarType.FP64, value=9)
+
+            ping(pings, msg)
+            pong(pings, pongs)
+
+            fluid.channel_recv(pongs, result)
+
+            fluid.channel_close(pings)
+            fluid.channel_close(pongs)
+
+            cpu = core.CPUPlace()
+            exe = Executor(cpu)
+
+            exe_result = exe.run(fetch_list=[result])
+            self.assertEqual(exe_result[0][0], 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/test_concurrency.py b/python/paddle/fluid/tests/test_concurrency.py
deleted file mode 100644
index 9f7bf63c5e017251e87af94690ff32c47c538c6b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/test_concurrency.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid import framework, unique_name
-from paddle.fluid.executor import Executor
-from paddle.fluid.layers import fill_constant
-
-
-class TestRoutineOp(unittest.TestCase):
-    def test_simple_routine(self):
-        ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-
-        # Create LOD_TENSOR<INT64> and put it into the scope.  This placeholder
-        # variable will be filled in and returned by fluid.channel_recv
-        result = self._create_tensor('return_value',
-                                     core.VarDesc.VarType.LOD_TENSOR,
-                                     core.VarDesc.VarType.INT64)
-
-        with fluid.Go():
-            input_value = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.FP64, value=1234)
-            fluid.channel_send(ch, input_value)
-
-        result, status = fluid.channel_recv(ch, result)
-        fluid.channel_close(ch)
-
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-
-        outs = exe.run(fetch_list=[result])
-        self.assertEqual(outs[0], 1234)
-
-    def test_daisy_chain(self):
-        '''
-        Mimics classic Daisy-chain test:  https://talks.golang.org/2012/concurrency.slide#39
-        '''
-        n = 100
-
-        leftmost = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-        left = leftmost
-
-        # TODO(thuan): Use fluid.While() after scope capture is implemented.
-        # https://github.com/PaddlePaddle/Paddle/issues/8502
-        for i in range(n):
-            right = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-            with fluid.Go():
-                one_tensor = self._create_one_dim_tensor(1)
-                result = self._create_tensor('return_value',
-                                             core.VarDesc.VarType.LOD_TENSOR,
-                                             core.VarDesc.VarType.INT64)
-
-                result, status = fluid.channel_recv(right, result)
-                one_added = fluid.layers.elementwise_add(x=one_tensor, y=result)
-                fluid.channel_send(left, one_added)
-            left = right
-
-        # Trigger the channel propagation by sending a "1" to rightmost channel
-        with fluid.Go():
-            one_tensor = self._create_one_dim_tensor(1)
-            fluid.channel_send(right, one_tensor)
-
-        leftmost_result = self._create_tensor('return_value',
-                                              core.VarDesc.VarType.LOD_TENSOR,
-                                              core.VarDesc.VarType.INT64)
-        leftmost_result, status = fluid.channel_recv(leftmost, leftmost_result)
-
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-        leftmost_data = exe.run(fetch_list=[leftmost_result])
-
-        # The leftmost_data should be equal to the number of channels + 1
-        self.assertEqual(leftmost_data[0][0], n + 1)
-
-    def _create_one_dim_tensor(self, value):
-        one_dim_tensor = fill_constant(
-            shape=[1], dtype=core.VarDesc.VarType.INT64, value=value)
-        one_dim_tensor.stop_gradient = True
-        return one_dim_tensor
-
-    def _create_tensor(self, name, type, dtype):
-        return framework.default_main_program().current_block().create_var(
-            name=unique_name.generate(name), type=type, dtype=dtype)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py
index 4b0d039b7e05a55980946a8949e32802e9e57c20..6cc291dfcffdd7083f498389834e37bd06ca4572 100644
--- a/python/paddle/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/fluid/tests/test_cpp_reader.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import numpy as np
 import sys
@@ -44,8 +44,8 @@ create_random_data_generator_op = startup_block.append_op(
     attrs={
         "shape_concat": [1, 2, 1, 1],
         "ranks": [2, 2],
-        "min": 0.0,
-        "max": 1.0,
+        "low": 0.0,
+        "high": 1.0,
         'lod_levels': [0, 0]
     })
 
diff --git a/python/paddle/fluid/tests/test_data_feeder.py b/python/paddle/fluid/tests/test_data_feeder.py
index 861dd3174a21d59fe12e0b794ecb2a934946ac71..30b7a634a2b978df85d6432854ef12285460be44 100644
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
@@ -13,15 +13,61 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
+import unittest
 
 
-def test_converter():
-    img = fluid.layers.data(name='image', shape=[1, 28, 28])
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
-    result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]])
-    print(result)
+class TestDataFeeder(unittest.TestCase):
+    def test_lod_level_0_converter(self):
+        img = fluid.layers.data(name='image', shape=[1, 28, 28])
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
+        result = feeder.feed([([0] * 784, [9]), ([1] * 784, [1])])
+
+        self.assertEqual(result['image'].shape(), [2, 1, 28, 28])
+        self.assertEqual(result['label'].shape(), [2, 1])
+        self.assertEqual(result['image'].recursive_sequence_lengths(), [])
+        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
+
+    def test_lod_level_1_converter(self):
+        # lod_level = 1
+        # each sentence has a different number of words
+        sentences = fluid.layers.data(
+            name='sentences', shape=[1], dtype='int64', lod_level=1)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([sentences, label], fluid.CPUPlace())
+
+        # lod = [[0, 3, 5, 9]]
+        # data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
+        # label = [1] * len(data)
+        result = feeder.feed(
+            [([1, 2, 3], [1]), ([4, 5], [1]), ([6, 7, 8, 9], [1])])
+
+        self.assertEqual(result['sentences'].shape(), [9, 1])
+        self.assertEqual(result['label'].shape(), [3, 1])
+        self.assertEqual(result['sentences'].recursive_sequence_lengths(),
+                         [[3, 2, 4]])
+        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
+
+    def test_lod_level_2_converter(self):
+        # lod_level = 2
+        # paragraphs -> sentences -> words
+        paragraphs = fluid.layers.data(
+            name='paragraphs', shape=[1], dtype='int64', lod_level=2)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        feeder = fluid.DataFeeder([paragraphs, label], fluid.CPUPlace())
+
+        # lod = [[0, 2, 3], [0, 3, 5, 9]]
+        # data = [[[1, 2, 3], [4, 5]], [[6, 7, 8, 9]]]
+        # label = [1] * len(data)
+        result = feeder.feed(
+            [([[1, 2, 3], [4, 5]], [1]), ([[6, 7, 8, 9]], [1])])
+
+        self.assertEqual(result['paragraphs'].shape(), [9, 1])
+        self.assertEqual(result['label'].shape(), [2, 1])
+        self.assertEqual(result['paragraphs'].recursive_sequence_lengths(),
+                         [[2, 1], [3, 2, 4]])
+        self.assertEqual(result['label'].recursive_sequence_lengths(), [])
 
 
 if __name__ == '__main__':
-    test_converter()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 921260ef3f4b1f9e4c65b3ffb440dc34cb0a9376..2d70c986b1b6c42ff709e9cf3b4234cf4fc26836 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -109,6 +109,42 @@ class TestDetection(unittest.TestCase):
         print(str(program))
 
 
+class TestPriorBox(unittest.TestCase):
+    def test_prior_box(self):
+        data_shape = [3, 224, 224]
+        images = fluid.layers.data(
+            name='pixel', shape=data_shape, dtype='float32')
+        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+        box, var = layers.prior_box(
+            input=conv1,
+            image=images,
+            min_sizes=[100.0],
+            aspect_ratios=[1.],
+            flip=True,
+            clip=True)
+        assert len(box.shape) == 4
+        assert box.shape == var.shape
+        assert box.shape[3] == 4
+
+
+class TestAnchorGenerator(unittest.TestCase):
+    def test_anchor_generator(self):
+        data_shape = [3, 224, 224]
+        images = fluid.layers.data(
+            name='pixel', shape=data_shape, dtype='float32')
+        conv1 = fluid.layers.conv2d(images, 3, 3, 2)
+        anchor, var = fluid.layers.anchor_generator(
+            input=conv1,
+            anchor_sizes=[64, 128, 256, 512],
+            aspect_ratios=[0.5, 1.0, 2.0],
+            variance=[0.1, 0.1, 0.2, 0.2],
+            stride=[16.0, 16.0],
+            offset=0.5)
+        assert len(anchor.shape) == 4
+        assert anchor.shape == var.shape
+        assert anchor.shape[3] == 4
+
+
 class TestMultiBoxHead(unittest.TestCase):
     def test_multi_box_head(self):
         data_shape = [3, 224, 224]
diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py
index b2fd5ae29c724da52df0a5d3cb56d2ec9e5530f3..89f4c64975802dc1827ec17ed3626b91e36d6971 100644
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
@@ -14,7 +14,7 @@
 
 from __future__ import print_function
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 
 BATCH_SIZE = 128
diff --git a/python/paddle/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py
index 68b682f68b1fd147b821cfdb1e0866cf8aa04bff..d530601f13be6810a8a99b13c92faf584df568f9 100644
--- a/python/paddle/fluid/tests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/test_gradient_clip.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import numpy as np
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 
 BATCH_SIZE = 128
diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7a9dd4129027417a06a6c25ff9a801fff259c5e
--- /dev/null
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
@@ -0,0 +1,99 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor
+import numpy as np
+import unittest
+
+
+class TestLoDTensor(unittest.TestCase):
+    def test_pybind_recursive_seq_lens(self):
+        tensor = fluid.LoDTensor()
+        recursive_seq_lens = []
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
+        recursive_seq_lens = [[], [1], [3]]
+        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths,
+                          recursive_seq_lens)
+        recursive_seq_lens = [[0], [2], [3]]
+        self.assertRaises(Exception, tensor.set_recursive_sequence_lengths,
+                          recursive_seq_lens)
+
+        recursive_seq_lens = [[1, 2, 3]]
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
+        tensor.set(np.random.random([6, 1]), fluid.CPUPlace())
+        self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
+        tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
+        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
+
+        # Each level's sum should be equal to the number of items in the next level
+        # Moreover, last level's sum should be equal to the tensor height
+        recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 2]]
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
+        tensor.set(np.random.random([8, 1]), fluid.CPUPlace())
+        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
+        recursive_seq_lens = [[2, 3], [1, 3, 1, 2, 1]]
+        tensor.set_recursive_sequence_lengths(recursive_seq_lens)
+        self.assertTrue(tensor.has_valid_recursive_sequence_lengths())
+        tensor.set(np.random.random([9, 1]), fluid.CPUPlace())
+        self.assertFalse(tensor.has_valid_recursive_sequence_lengths())
+
+    def test_create_lod_tensor(self):
+        # Create LoDTensor from a list
+        data = [[1, 2, 3], [3, 4]]
+        wrong_recursive_seq_lens = [[2, 2]]
+        correct_recursive_seq_lens = [[3, 2]]
+        self.assertRaises(AssertionError, create_lod_tensor, data,
+                          wrong_recursive_seq_lens, fluid.CPUPlace())
+        tensor = create_lod_tensor(data, correct_recursive_seq_lens,
+                                   fluid.CPUPlace())
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         correct_recursive_seq_lens)
+
+        # Create LoDTensor from numpy array
+        data = np.random.random([10, 1])
+        recursive_seq_lens = [[2, 1], [3, 3, 4]]
+        tensor = create_lod_tensor(data, recursive_seq_lens, fluid.CPUPlace())
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
+
+        # Create LoDTensor from another LoDTensor, they are differnt instances
+        new_recursive_seq_lens = [[2, 2, 1], [1, 2, 2, 3, 2]]
+        new_tensor = create_lod_tensor(tensor, new_recursive_seq_lens,
+                                       fluid.CPUPlace())
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
+        self.assertEqual(new_tensor.recursive_sequence_lengths(),
+                         new_recursive_seq_lens)
+
+    def test_create_random_int_lodtensor(self):
+        # The shape of a word, commonly used in speech and NLP problem, is [1]
+        shape = [1]
+        recursive_seq_lens = [[2, 3, 5]]
+        dict_size = 10000
+        low = 0
+        high = dict_size - 1
+        tensor = create_random_int_lodtensor(recursive_seq_lens, shape,
+                                             fluid.CPUPlace(), low, high)
+        self.assertEqual(tensor.recursive_sequence_lengths(),
+                         recursive_seq_lens)
+        self.assertEqual(tensor.shape(), [10, 1])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/test_mnist_if_else_op.py b/python/paddle/fluid/tests/test_mnist_if_else_op.py
index 94395f6cfb4648967558ed265e798e3505c20fc1..d34f52db5ffc889f17513d034ad2c99f696b0cdf 100644
--- a/python/paddle/fluid/tests/test_mnist_if_else_op.py
+++ b/python/paddle/fluid/tests/test_mnist_if_else_op.py
@@ -12,12 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 import paddle.fluid.layers as layers
 from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program
 from paddle.fluid.executor import Executor
 from paddle.fluid.optimizer import MomentumOptimizer
 import paddle.fluid.core as core
-import paddle.v2 as paddle
 import unittest
 import numpy as np
 
diff --git a/python/paddle/fluid/tests/unittests/.gitignore b/python/paddle/fluid/tests/unittests/.gitignore
index 6b3fc2a83c649c28d21c9a8a0b35c2f2fa04f269..b1e8fda03aa42f5f7528eafb46c16d55b868bae5 100644
--- a/python/paddle/fluid/tests/unittests/.gitignore
+++ b/python/paddle/fluid/tests/unittests/.gitignore
@@ -1 +1,8 @@
 mnist.recordio
+mnist_0.recordio
+mnist_1.recordio
+mnist_2.recordio
+flowers.recordio
+wmt16.recordio
+data_balance_test.recordio
+data_balance_with_lod_test.recordio
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index f96c2ca4f0593b6c2624d449304f23425c69ab93..f6c8dcabcbc592024188f4742e6c532a704d2289 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1,91 +1,55 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
+# The MKLDNN tests are skiped when the MKLDNN flag is OFF
+if(NOT WITH_MKLDNN)
+    foreach(src ${TEST_OPS})
+        if(${src} MATCHES ".*_mkldnn_op$")
+            list(REMOVE_ITEM TEST_OPS ${src})
+        endif()
+    endforeach()
+endif(NOT WITH_MKLDNN)
+
 if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_recv_op)
 endif(NOT WITH_DISTRIBUTE)
 
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
-list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 
+list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
-list(REMOVE_ITEM TEST_OPS test_nce) # IXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
+list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
-list(REMOVE_ITEM TEST_OPS test_detection_output_op) # FIXME: detection_output_op will be rewritten. This unittest should be
 
 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
 list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test
 
 function(py_test_modules TARGET_NAME)
   if(WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
-    set(multiValueArgs MODULES DEPS ARGS ENVS)
+    set(multiValueArgs MODULES DEPS ENVS)
     cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_modules_ENVS}
-             ${PYTHON_EXECUTABLE} -u -m unittest --verbose ${py_test_modules_MODULES} ${py_test_modules_ARGS}
-             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
+             ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (py_test_modules_SERIAL)
+        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction()
-
-# test time consuming OPs in a separate process for expliot parallism
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
-list(REMOVE_ITEM TEST_OPS test_dyn_rnn)
-list(REMOVE_ITEM TEST_OPS test_mul_op)
-
-# tests that need to be run in separate process.
-list(REMOVE_ITEM TEST_OPS test_multihead_attention)
-list(REMOVE_ITEM TEST_OPS test_calc_gradient)
-list(REMOVE_ITEM TEST_OPS test_while_op)
-list(REMOVE_ITEM TEST_OPS test_lod_array_length_op)
-list(REMOVE_ITEM TEST_OPS test_reorder_lod_tensor)
-list(REMOVE_ITEM TEST_OPS test_profiler)
-list(REMOVE_ITEM TEST_OPS test_nvprof)
-list(REMOVE_ITEM TEST_OPS test_normalization_wrapper)
-list(REMOVE_ITEM TEST_OPS test_executor_and_mul)
-list(REMOVE_ITEM TEST_OPS test_assign_value_op)
-list(REMOVE_ITEM TEST_OPS test_array_read_write_op)
-list(REMOVE_ITEM TEST_OPS test_lod_rank_table)
-list(REMOVE_ITEM TEST_OPS test_weight_normalization)
-list(REMOVE_ITEM TEST_OPS test_conditional_block)
-list(REMOVE_ITEM TEST_OPS test_parameter)
-list(REMOVE_ITEM TEST_OPS test_registry)
-list(REMOVE_ITEM TEST_OPS test_fetch_var)
-list(REMOVE_ITEM TEST_OPS test_parallel_op)
-list(REMOVE_ITEM TEST_OPS test_dynrnn_static_input)
-
-# tests that can be bundled together in one python process for speed.
-if(WITH_FAST_BUNDLE_TEST)
-    py_test_modules("test_all_ops" MODULES ${TEST_OPS})
-else()
-    foreach(TEST_OP ${TEST_OPS})
-        py_test_modules(${TEST_OP} MODULES ${TEST_OP})
-    endforeach(TEST_OP)
-endif(WITH_FAST_BUNDLE_TEST)
-
-# tests with high overhead
-py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
-py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)
-py_test_modules(test_mul_op MODULES test_mul_op)
-
-# tests that need to be run in separate process.
-py_test_modules(test_multihead_attention MODULES test_multihead_attention)
-py_test_modules(test_calc_gradient MODULES test_calc_gradient)
-py_test_modules(test_while_op MODULES test_while_op)
-py_test_modules(test_lod_array_length_op MODULES test_lod_array_length_op)
-py_test_modules(test_reorder_lod_tensor MODULES test_reorder_lod_tensor)
-py_test_modules(test_profiler MODULES test_profiler)
-py_test_modules(test_nvprof MODULES test_nvprof)
-py_test_modules(test_normalization_wrapper MODULES test_normalization_wrapper)
-py_test_modules(test_executor_and_mul MODULES test_executor_and_mul)
-py_test_modules(test_assign_value_op MODULES test_assign_value_op)
-py_test_modules(test_array_read_write_op MODULES test_array_read_write_op)
-py_test_modules(test_lod_rank_table MODULES test_lod_rank_table)
-py_test_modules(test_weight_normalization MODULES test_weight_normalization)
-py_test_modules(test_conditional_block MODULES test_conditional_block)
-py_test_modules(test_parameter MODULES test_parameter)
-py_test_modules(test_registry MODULES test_registry)
-py_test_modules(test_fetch_var MODULES test_fetch_var)
-py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input)
-py_test_modules(test_parallel_op MODULES test_parallel_op)
+list(REMOVE_ITEM TEST_OPS test_dist_train)
+list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
+list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
+foreach(TEST_OP ${TEST_OPS})
+    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+endforeach(TEST_OP)
+py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
+py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
+py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
+py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
+set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
+set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
+set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
diff --git a/python/paddle/fluid/tests/unittests/benchmark.py b/python/paddle/fluid/tests/unittests/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e891ee932f1440001eb25b222f1f4613e97dfcb1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
@@ -0,0 +1,113 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+import time
+import itertools
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+from op_test import OpTest
+
+
+class BenchmarkSuite(OpTest):
+    def timeit_function(self, callback, iters, *args, **kwargs):
+        assert iters != 0, "Iters should >= 1"
+        start = time.time()
+        for i in range(iters):
+            callback(*args, **kwargs)
+        elapse = time.time() - start
+        return elapse / iters
+
+    def _assert_cpu_gpu_same(self, cpu_outs, gpu_outs, fetch_list, atol):
+        for item_cpu_out, item_gpu_out, variable in zip(cpu_outs, gpu_outs,
+                                                        fetch_list):
+            # the cpu version is baseline, expect gpu version keep same with cpu version.
+            expect = item_cpu_out
+            expect_t = np.array(item_cpu_out)
+            actual = item_gpu_out
+            actual_t = np.array(item_gpu_out)
+            var_name = variable if isinstance(variable,
+                                              basestring) else variable.name
+            self.assertTrue(
+                np.allclose(
+                    actual_t, expect_t, atol=atol),
+                "Output (" + var_name + ") has diff" + str(actual_t) + "\n" +
+                str(expect_t))
+            self.assertListEqual(actual.lod(),
+                                 expect.lod(),
+                                 "Output (" + var_name + ") has different lod")
+
+    def _get_input_names(self):
+        inputs = []
+        for name, value in self.inputs.iteritems():
+            if isinstance(value, list):
+                inputs.extend([sub_name for sub_name, _ in value])
+            inputs.append(name)
+        return inputs
+
+    def _get_output_names(self):
+        outputs = []
+        for var_name, var in self.outputs.iteritems():
+            if isinstance(var, list):
+                for sub_var_name, sub_var in var:
+                    outputs.append(sub_var_name)
+            else:
+                outputs.append(var_name)
+        if len(outputs) == 0:
+            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+                outputs.append(str(out_name))
+        return outputs
+
+    def check_output_stability(self, atol=1e-8):
+        places = self._get_places()
+        if len(places) < 2:
+            return
+        cpu_outs, fetch_list = self._calc_output(places[0])
+        gpu_outs, _ = self._calc_output(places[1])
+        self._assert_cpu_gpu_same(cpu_outs, gpu_outs, fetch_list, atol)
+
+    def timeit_output_with_place(self, place, iters):
+        return self.timeit_function(self.calc_output, iters, place)
+
+    def timeit_output(self, iters=100):
+        places = self._get_places()
+        elapses = []
+        for place in places:
+            elapses.append(self.timeit_output_with_place(place, iters))
+        for place, elapse in zip(places, elapses):
+            print("One pass of ({2}_op) at {0} cost {1}".format(
+                str(place), elapse, self.op_type))
+
+    def timeit_grad_with_place(self, place, iters=100):
+        inputs_to_check = self._get_input_names()
+        output_names = self._get_output_names()
+        return self.timeit_function(
+            self._get_gradient,
+            iters,
+            inputs_to_check,
+            place,
+            output_names,
+            no_grad_set=None)
+
+    def timeit_grad(self, iters=100):
+        places = self._get_places()
+        elapses = []
+        for place in places:
+            elapses.append(self.timeit_grad_with_place(place, iters))
+        for place, elapse in zip(places, elapses):
+            print("One pass of ({2}_grad_op) at {0} cost {1}".format(
+                str(place), elapse, self.op_type))
diff --git a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..91a5f1bca4441d80489a02eb9283928e38321826
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
@@ -0,0 +1,82 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+from benchmark import BenchmarkSuite
+from op_test import OpTest
+
+# This is a demo op test case for operator benchmarking and high resolution number stability alignment.
+
+
+class TestSumOp(BenchmarkSuite):
+    def setUp(self):
+        self.op_type = "sum"
+        self.customize_testcase()
+        self.customize_fetch_list()
+
+    def customize_fetch_list(self):
+        """
+        customize fetch list, configure the wanted variables.
+        >>> self.fetch_list = ["Out"]
+        """
+        self.fetch_list = ["Out"]
+        # pass
+
+    def customize_testcase(self):
+        # a test case
+        x0 = np.random.random((300, 400)).astype('float32')
+        x1 = np.random.random((300, 400)).astype('float32')
+        x2 = np.random.random((300, 400)).astype('float32')
+
+        # NOTE: if the output is empty, then it will autofilled by benchmarkSuite.
+        # only the output dtype is used, the shape, lod and data is computed from input.
+        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+        self.outputs = {"Out": x0 + x1 + x2}
+
+    def test_check_output(self):
+        """
+        compare the output with customized output. In this case,
+        you should set the correct output by hands.
+        >>> self.outputs = {"Out": x0 + x1 + x2}
+        """
+        self.check_output(atol=1e-8)
+
+    def test_output_stability(self):
+        # compare the cpu gpu output in high resolution.
+        self.check_output_stability()
+
+    def test_timeit_output(self):
+        """
+        perf the op, time cost will be averged in iters.
+        output example
+        >>> One pass of (sum_op) at CPUPlace cost 0.000461330413818
+        >>> One pass of (sum_op) at CUDAPlace(0) cost 0.000556070804596
+        """
+        self.timeit_output(iters=100)
+
+    def test_timeit_grad(self):
+        """
+        perf the op gradient, time cost will be averged in iters.
+        output example
+        >>> One pass of (sum_grad_op) at CPUPlace cost 0.00279935121536
+        >>> One pass of (sum_grad_op) at CUDAPlace(0) cost 0.00500632047653
+        """
+        self.timeit_grad(iters=100)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index f7e02595ec3b41ae7bb32353c258736968ca78d4..e056ef9952a519d6c4d580b27f1118a3a91f13af 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -15,13 +15,17 @@
 import unittest
 import numpy as np
 import random
+import time
 import itertools
-import paddle.fluid.core as core
 import collections
+
+import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.fluid.backward import append_backward
 from paddle.fluid.op import Operator
 from paddle.fluid.executor import Executor
-from paddle.fluid.framework import Program, OpProtoHolder
+from paddle.fluid.framework import Program, OpProtoHolder, Variable
+from testsuite import create_op, set_input, append_input_output, append_loss_ops
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
@@ -33,67 +37,6 @@ def randomize_probability(batch_size, class_num, dtype='float32'):
     return prob
 
 
-def create_op(scope, op_type, inputs, outputs, attrs):
-    kwargs = dict()
-
-    def __create_var__(name, var_name):
-        scope.var(var_name).get_tensor()
-        kwargs[name].append(var_name)
-
-    for in_name, in_dup in Operator.get_op_inputs(op_type):
-        if in_name in inputs:
-            kwargs[in_name] = []
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, _ = item[0], item[1]
-                    __create_var__(in_name, sub_in_name)
-            else:
-                __create_var__(in_name, in_name)
-
-    for out_name, out_dup in Operator.get_op_outputs(op_type):
-        if out_name in outputs:
-            kwargs[out_name] = []
-            if out_dup:
-                sub_out = outputs[out_name]
-                for item in sub_out:
-                    sub_out_name, _ = item[0], item[1]
-                    __create_var__(out_name, sub_out_name)
-            else:
-                __create_var__(out_name, out_name)
-
-    for attr_name in Operator.get_op_attr_names(op_type):
-        if attr_name in attrs:
-            kwargs[attr_name] = attrs[attr_name]
-
-    return Operator(op_type, **kwargs)
-
-
-def set_input(scope, op, inputs, place):
-    def __set_input__(var_name, var):
-        if isinstance(var, tuple) or isinstance(var, np.ndarray):
-            tensor = scope.find_var(var_name).get_tensor()
-            if isinstance(var, tuple):
-                tensor.set_lod(var[1])
-                var = var[0]
-            tensor.set_dims(var.shape)
-            tensor.set(var, place)
-        elif isinstance(var, float):
-            scope.find_var(var_name).set_float(var)
-        elif isinstance(var, int):
-            scope.find_var(var_name).set_int(var)
-
-    for in_name, in_dup in Operator.get_op_inputs(op.type()):
-        if in_name in inputs:
-            if in_dup:
-                sub_in = inputs[in_name]
-                for item in sub_in:
-                    sub_in_name, sub_in_val = item[0], item[1]
-                    __set_input__(sub_in_name, sub_in_val)
-            else:
-                __set_input__(in_name, inputs[in_name])
-
-
 def get_numeric_gradient(place,
                          scope,
                          op,
@@ -167,54 +110,15 @@ def get_numeric_gradient(place,
     return gradient_flat.reshape(tensor_to_check.get_dims())
 
 
-def append_input_output(block, op_proto, np_list, is_input):
-    '''Insert VarDesc and generate Python variable instance'''
-    proto_list = op_proto.inputs if is_input else op_proto.outputs
-
-    def create_var(block, name, np_list, var_proto):
-        if name not in np_list:
-            assert var_proto.intermediate, "{} not found".format(name)
-            shape = None
-            lod_level = None
-        else:
-            np_value = np_list[name]
-            if isinstance(np_value, tuple):
-                shape = list(np_value[0].shape)
-                lod_level = len(np_value[1])
-            else:
-                shape = list(np_value.shape)
-                lod_level = 0
-        return block.create_var(
-            dtype="float32", shape=shape, lod_level=lod_level, name=name)
-
-    var_dict = {}
-    for var_proto in proto_list:
-        var_name = str(var_proto.name)
-        if is_input:
-            if (var_name not in np_list) and var_proto.dispensable:
-                continue
-            assert (var_name in np_list) or (var_proto.dispensable), \
-                "Missing {} as input".format(var_name)
-        if var_proto.duplicable:
-            assert isinstance(np_list[var_name], list), \
-                "Duplicable {} should be set as list".format(var_name)
-            var_list = []
-            for (name, np_value) in np_list[var_name]:
-                var_list.append(
-                    create_var(block, name, {name: np_value}, var_proto))
-            var_dict[var_name] = var_list
-        else:
-            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
-
-    return var_dict
-
-
 class OpTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         '''Fix random seeds to remove randomness from tests'''
         cls._np_rand_state = np.random.get_state()
         cls._py_rand_state = random.getstate()
+        cls.call_once = False
+        cls.dtype = "float32"
+        cls.outputs = {}
 
         np.random.seed(123)
         random.seed(124)
@@ -225,6 +129,31 @@ class OpTest(unittest.TestCase):
         np.random.set_state(cls._np_rand_state)
         random.setstate(cls._py_rand_state)
 
+    def try_call_once(self, data_type):
+        if not self.call_once:
+            self.call_once = True
+            self.dtype = data_type
+
+    def infer_dtype_from_inputs_outputs(self, inputs, outputs):
+        def infer_dtype(numpy_dict):
+            assert isinstance(
+                numpy_dict,
+                dict), "self.inputs, self.outputs must be numpy_dict"
+            for var_name, var_value in numpy_dict.iteritems():
+                if isinstance(var_value, (np.ndarray, np.generic)):
+                    self.try_call_once(var_value.dtype)
+                elif isinstance(var_value, (list, tuple)):
+                    # the case of self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+                    if len(var_value) > 1 and isinstance(var_value[1], (
+                            np.ndarray, np.generic)):
+                        instance = var_value[1]
+                        self.try_call_once(instance[1].dtype)
+                else:
+                    self.try_call_once("float32")
+
+        infer_dtype(inputs)
+        infer_dtype(outputs)
+
     def feed_var(self, input_vars, place):
         feed_map = {}
         for var_name in input_vars:
@@ -233,7 +162,7 @@ class OpTest(unittest.TestCase):
                     tensor = core.LoDTensor()
                     if isinstance(np_value, tuple):
                         tensor.set(np_value[0], place)
-                        tensor.set_lod(np_value[1])
+                        tensor.set_recursive_sequence_lengths(np_value[1])
                     else:
                         tensor.set(np_value, place)
                     feed_map[name] = tensor
@@ -241,25 +170,22 @@ class OpTest(unittest.TestCase):
                 tensor = core.LoDTensor()
                 if isinstance(self.inputs[var_name], tuple):
                     tensor.set(self.inputs[var_name][0], place)
-                    tensor.set_lod(self.inputs[var_name][1])
+                    tensor.set_recursive_sequence_lengths(self.inputs[var_name][
+                        1])
                 else:
                     tensor.set(self.inputs[var_name], place)
                 feed_map[var_name] = tensor
 
         return feed_map
 
-    def calc_output(self, place):
-        outs, _ = self._calc_output(place)
-        return outs
-
-    def _calc_output(self, place):
+    def _append_ops(self, block):
         op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
-
-        program = Program()
-        block = program.global_block()
-
-        inputs = append_input_output(block, op_proto, self.inputs, True)
-        outputs = append_input_output(block, op_proto, self.outputs, False)
+        "infer datatype from inputs and outputs for this test case"
+        self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
+        inputs = append_input_output(block, op_proto, self.inputs, True,
+                                     self.dtype)
+        outputs = append_input_output(block, op_proto, self.outputs, False,
+                                      self.dtype)
         op = block.append_op(
             type=self.op_type,
             inputs=inputs,
@@ -269,22 +195,68 @@ class OpTest(unittest.TestCase):
         op.desc.infer_var_type(block.desc)
         op.desc.infer_shape(block.desc)
 
-        fetch_list = []
-        for var_name, var in outputs.iteritems():
-            if var_name in self.outputs:
+    def _get_io_vars(self, block, numpy_inputs):
+        inputs = {}
+        for name, value in numpy_inputs.iteritems():
+            if isinstance(value, list):
+                var_list = [
+                    block.var(sub_name) for sub_name, sub_value in value
+                ]
+                inputs[name] = var_list
+            else:
+                inputs[name] = block.var(name)
+        return inputs
+
+    def _get_inputs(self, block):
+        return self._get_io_vars(block, self.inputs)
+
+    def _get_outputs(self, block):
+        return self._get_io_vars(block, self.outputs)
+
+    def calc_output(self, place):
+        outs, _ = self._calc_output(place)
+        return outs
+
+    def _calc_output(self, place, parallel=False):
+
+        program = Program()
+        block = program.global_block()
+        self._append_ops(block)
+
+        inputs = self._get_inputs(block)
+        outputs = self._get_outputs(block)
+        feed_map = self.feed_var(inputs, place)
+
+        if parallel:
+            use_cuda = False
+            if isinstance(place, fluid.CUDAPlace(0)):
+                use_cuda = True
+            executor = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+        else:
+            executor = Executor(place)
+
+        fetch_list = getattr(self, "fetch_list", [])
+        # if the fetch_list is customized by user, we use it directly.
+        # if not, fill the fetch_list by the user configured outputs in test.
+        if len(fetch_list) == 0:
+            for var_name, var in outputs.iteritems():
                 if isinstance(var, list):
                     for v in var:
                         fetch_list.append(v)
                 else:
                     fetch_list.append(var)
-
-        feed_map = self.feed_var(inputs, place)
-
-        exe = Executor(place)
-        outs = exe.run(program,
-                       feed=feed_map,
-                       fetch_list=fetch_list,
-                       return_numpy=False)
+        # if the fetch_list still empty, fill the fetch_list by the operator output.
+        if len(fetch_list) == 0:
+            for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+                fetch_list.append(str(out_name))
+        # fetch_list = map(block.var, fetch_list)
+        if not isinstance(fetch_list[0], Variable):
+            fetch_list = map(block.var, fetch_list)
+        outs = executor.run(program,
+                            feed=feed_map,
+                            fetch_list=fetch_list,
+                            return_numpy=False)
         return outs, fetch_list
 
     def check_output_with_place(self, place, atol):
@@ -322,7 +294,8 @@ class OpTest(unittest.TestCase):
                         str(place))
                     if isinstance(expect, tuple):
                         self.assertListEqual(
-                            actual.lod(), expect[1], "Output (" + sub_out_name +
+                            actual.recursive_sequence_lengths(), expect[1],
+                            "Output (" + sub_out_name +
                             ") has different lod at " + str(place))
             else:
                 idx = find_actual(out_name, fetch_list)
@@ -334,23 +307,25 @@ class OpTest(unittest.TestCase):
                     np.allclose(
                         actual_t, expect_t, atol=atol),
                     "Output (" + out_name + ") has diff at " + str(place) +
-                    str(actual_t) + str(expect_t))
+                    str(actual_t) + "\n" + str(expect_t))
                 if isinstance(expect, tuple):
-                    self.assertListEqual(actual.lod(), expect[1],
-                                         "Output (" + out_name +
+                    self.assertListEqual(actual.recursive_sequence_lengths(),
+                                         expect[1], "Output (" + out_name +
                                          ") has different lod at " + str(place))
 
-    def check_output(self, atol=1e-5):
-        places = [core.CPUPlace()]
+    def _get_places(self):
+        places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
             places.append(core.CUDAPlace(0))
+        return places
+
+    def check_output(self, atol=1e-5):
+        places = self._get_places()
         for place in places:
             self.check_output_with_place(place, atol)
 
     def check_output_customized(self, checker):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
-            places.append(core.CUDAPlace(0))
+        places = self._get_places()
         for place in places:
             outs = self.calc_output(place)
             outs = [np.array(out) for out in outs]
@@ -383,9 +358,7 @@ class OpTest(unittest.TestCase):
                    in_place=False,
                    max_relative_error=0.005,
                    user_defined_grads=None):
-        places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
-            places.append(core.CUDAPlace(0))
+        places = self._get_places()
         for place in places:
             self.check_grad_with_place(place, inputs_to_check, output_names,
                                        no_grad_set, numeric_grad_delta,
@@ -432,120 +405,61 @@ class OpTest(unittest.TestCase):
                                max_relative_error,
                                "Gradient Check On %s" % str(place))
 
-    @staticmethod
-    def _create_var_descs_(block, var_dict):
-        # FIXME: Try unify with `append_input_output`
-        for param_name in var_dict:
-            var = var_dict[param_name]
-            if not isinstance(var, list) and not isinstance(var, tuple):
-                var = [(param_name, var, None)]
-            if not isinstance(var[0], list) and not isinstance(var[0], tuple):
-                var = [(param_name, var[0], var[1])]
-
-            for i, item in enumerate(var):
-                if not isinstance(item[0], basestring):
-                    item = [[param_name] + list(item)]
-                if len(item) == 2:
-                    if isinstance(item[1], tuple):
-                        var[i] = [item[0], item[1][0], item[1][1]]
-                    else:
-                        # only set var name and value, set lod to None
-                        var[i] = list(item) + [None]
-            var_descs = [(block.create_var(
-                name=name, shape=each.shape, dtype=each.dtype), each, lod)
-                         for name, each, lod in var]
-
-            yield param_name, var_descs
-
-    @staticmethod
-    def _merge_list(iterable):
-        return reduce(lambda a, b: list(a) + list(b), iterable, [])
-
     @staticmethod
     def _numpy_to_lod_tensor(np_value, lod, place):
         tensor = core.LoDTensor()
         tensor.set(np_value, place)
         if lod is not None:
-            tensor.set_lod(lod)
+            tensor.set_recursive_sequence_lengths(lod)
         return tensor
 
-    def _get_gradient(self, input_to_check, place, output_names, no_grad_set):
+    @staticmethod
+    def np_dtype_to_fluid_dtype(input):
+        """Change the dtype of float16 numpy array
+
+        numpy float16 is binded to paddle::platform::float16
+        in tensor_py.h via the help of uint16 data type since
+        the internal memory representation of float16 is
+        uint16_t in paddle and np.uint16 in numpy, which are
+        themselves binded together by pybind.
+
+        Args:
+            input: input numpy array
+
+        Returns:
+            input: The dtype of input will be changed to np.uint16 if
+                it is originally np.float16, such that the internal memory
+                of input will be reinterpreted as of dtype np.uint16.
+        """
+        if input.dtype == np.float16:
+            input.dtype = np.uint16
+        return input
+
+    def _get_gradient(self,
+                      input_to_check,
+                      place,
+                      output_names,
+                      no_grad_set,
+                      parallel=False):
         prog = Program()
         block = prog.global_block()
-        inputs_with_np = {
-            key: value
-            for (key, value) in OpTest._create_var_descs_(
-                block, getattr(self, 'inputs', {}))
-        }
-        outputs_with_np = {
-            key: val
-            for (key, val) in OpTest._create_var_descs_(
-                block, getattr(self, 'outputs', {}))
-        }
-        inputs = {
-            k: [item[0] for item in inputs_with_np[k]]
-            for k in inputs_with_np
-        }
-        outputs = {
-            k: [item[0] for item in outputs_with_np[k]]
-            for k in outputs_with_np
-        }
-
-        op = block.append_op(
-            type=self.op_type,
-            inputs=inputs,
-            outputs=outputs,
-            attrs=getattr(self, 'attrs', {}))
-
-        # infer variable type and infer shape in compile-time
-        op.desc.infer_var_type(block.desc)
-        op.desc.infer_shape(block.desc)
-
-        mean_inputs = map(block.var, output_names)
-
-        if len(mean_inputs) == 1:
-            loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
-            op = block.append_op(
-                inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
-            op.desc.infer_var_type(block.desc)
-            op.desc.infer_shape(block.desc)
-        else:
-            avg_sum = []
-            for cur_loss in mean_inputs:
-                cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
-                op = block.append_op(
-                    inputs={"X": [cur_loss]},
-                    outputs={"Out": [cur_avg_loss]},
-                    type="mean")
-                op.desc.infer_var_type(block.desc)
-                op.desc.infer_shape(block.desc)
-                avg_sum.append(cur_avg_loss)
-
-            loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
-            op_sum = block.append_op(
-                inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
-            op_sum.desc.infer_var_type(block.desc)
-            op_sum.desc.infer_shape(block.desc)
-
-            loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
-            op_loss = block.append_op(
-                inputs={"X": loss_sum},
-                outputs={"Out": loss},
-                type='scale',
-                attrs={'scale': 1.0 / float(len(avg_sum))})
-            op_loss.desc.infer_var_type(block.desc)
-            op_loss.desc.infer_shape(block.desc)
-
+        self._append_ops(block)
+        loss = append_loss_ops(block, output_names)
         param_grad_list = append_backward(
             loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
 
-        feed_dict = {
-            item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place)
-            for p_name in inputs_with_np for item in inputs_with_np[p_name]
-        }
+        inputs = self._get_inputs(block)
+        feed_dict = self.feed_var(inputs, place)
 
         fetch_list = [g for p, g in param_grad_list]
-        executor = Executor(place)
-        return map(
-            np.array,
-            executor.run(prog, feed_dict, fetch_list, return_numpy=False))
+        if parallel:
+            use_cuda = False
+            if isinstance(place, fluid.CUDAPlace(0)):
+                use_cuda = True
+            executor = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=program)
+        else:
+            executor = Executor(place)
+        return map(np.array,
+                   executor.run(prog, feed_dict, fetch_list,
+                                return_numpy=False))
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..cddf00765f4894126988c794763c34629449e8e6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import multiprocessing
+import os
+import unittest
+import paddle.fluid as fluid
+import time
+import numpy as np
+import math
+import sys
+
+__all__ = ['TestParallelExecutorBase']
+
+
+class TestParallelExecutorBase(unittest.TestCase):
+    def check_network_convergence(self,
+                                  method,
+                                  use_cuda=True,
+                                  memory_opt=True,
+                                  iter=50,
+                                  batch_size=None,
+                                  allow_op_delay=False,
+                                  feed_dict=None,
+                                  seed=None,
+                                  use_parallel_executor=True,
+                                  balance_parameter_opt_between_cards=False):
+        def run_executor(exe, feed, fetch_list, program=None):
+            if isinstance(exe, fluid.ParallelExecutor):
+                res = exe.run(fetch_list=fetch_list, feed=feed)
+            elif isinstance(exe, fluid.Executor):
+                if program is None:
+                    program = fluid.default_main_program()
+                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
+            else:
+                raise ValueError('Unkown type exe')
+            return res
+
+        main = fluid.Program()
+        startup = fluid.Program()
+        startup.random_seed = 1  # Fix random seed
+        with fluid.program_guard(main, startup):
+            if seed is not None:
+                startup.random_seed = seed
+            loss = method(use_feed=feed_dict is not None)
+            adam = fluid.optimizer.Adam()
+            adam.minimize(loss)
+            if memory_opt:
+                fluid.memory_optimize(main)
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            startup_exe = fluid.Executor(place)
+            startup_exe.run(startup)
+            exec_strategy = fluid.ExecutionStrategy()
+            exec_strategy.allow_op_delay = allow_op_delay
+
+            build_strategy = fluid.BuildStrategy()
+            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce
+
+            if use_parallel_executor:
+                exe = fluid.ParallelExecutor(
+                    use_cuda,
+                    loss_name=loss.name,
+                    exec_strategy=exec_strategy,
+                    build_strategy=build_strategy)
+            else:
+                exe = fluid.Executor(place=place)
+
+            if batch_size is not None:
+                batch_size *= fluid.core.get_cuda_device_count(
+                ) if use_cuda else int(
+                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+            begin = time.time()
+            first_loss, = run_executor(
+                exe=exe, feed=feed_dict, fetch_list=[loss.name])
+
+            for i in xrange(iter):
+                run_executor(exe=exe, feed=feed_dict, fetch_list=[])
+
+            last_loss, = run_executor(
+                exe=exe, feed=feed_dict, fetch_list=[loss.name])
+            end = time.time()
+
+            if batch_size is not None:
+                print "%.4f Instance per second" % (
+                    (batch_size * iter + 2) / (end - begin))
+
+            avg_last_loss_val = np.array(last_loss).mean()
+            avg_first_loss_val = np.array(first_loss).mean()
+            if math.isnan(float(avg_last_loss_val)) or math.isnan(
+                    float(avg_first_loss_val)):
+                sys.exit("got NaN loss, training failed.")
+
+            print first_loss, last_loss
+            # self.assertGreater(first_loss[0], last_loss[0])
+            return first_loss, last_loss
diff --git a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d554c2276c9acd710d14c8f8b32c802e3e17515
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
@@ -0,0 +1,99 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+from scipy.special import expit
+from test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
+
+
+class TestMKLDNNReluDim2(TestRelu):
+    def setUp(self):
+        super(TestMKLDNNReluDim2, self).setUp()
+
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNTanhDim2(TestTanh):
+    def setUp(self):
+        super(TestMKLDNNTanhDim2, self).setUp()
+
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNSqrtDim2(TestSqrt):
+    def setUp(self):
+        super(TestMKLDNNSqrtDim2, self).setUp()
+
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNAbsDim2(TestAbs):
+    def setUp(self):
+        super(TestMKLDNNAbsDim2, self).setUp()
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNReluDim4(TestRelu):
+    def setUp(self):
+        super(TestMKLDNNReluDim4, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        out = np.maximum(x, 0)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNTanhDim4(TestTanh):
+    def setUp(self):
+        super(TestMKLDNNTanhDim4, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.tanh(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNSqrtDim4(TestSqrt):
+    def setUp(self):
+        super(TestMKLDNNSqrtDim4, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+class TestMKLDNNAbsDim4(TestAbs):
+    def setUp(self):
+        super(TestMKLDNNAbsDim4, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.abs(self.inputs['X'])}
+        self.attrs = {"use_mkldnn": True}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index eab41ebe711bd21bdc3b34ca83ab57388cc35ba2..5ed387fb1247f1a91147cb6981f1adc7c2eeb8a2 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 from scipy.special import expit
 
@@ -21,429 +22,992 @@ from scipy.special import expit
 class TestExp(OpTest):
     def setUp(self):
         self.op_type = "exp"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.exp(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.exp(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Exp(TestExp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSigmoid(OpTest):
     def setUp(self):
         self.op_type = "sigmoid"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': 1 / (1 + np.exp(-self.inputs['X']))}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = 1 / (1 + np.exp(-x))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.008)
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.01)
+
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Sigmoid(TestSigmoid):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestLogSigmoid(OpTest):
     def setUp(self):
         self.op_type = "logsigmoid"
-        self.inputs = {
-            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.log(1 / (1 + np.exp(-self.inputs['X'])))}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = np.log(1 / (1 + np.exp(-x)))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16LogSigmoid(TestLogSigmoid):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestTanh(OpTest):
     def setUp(self):
         self.op_type = "tanh"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.tanh(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.tanh(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Tanh(TestTanh):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestTanhShrink(OpTest):
     def setUp(self):
         self.op_type = "tanh_shrink"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [10, 17]).astype("float32")
-        }
-        self.outputs = {'Out': self.inputs['X'] - np.tanh(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [10, 17]).astype(self.dtype)
+        out = x - np.tanh(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16TanhShrink(TestTanhShrink):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestHardShrink(OpTest):
     def setUp(self):
         self.op_type = "hard_shrink"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
         threshold = 0.5
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.copy(x)
+        out[(out >= -threshold) & (out <= threshold)] = 0
 
-        self.inputs = {'X': x}
         self.attrs = {'lambda': threshold}
-
-        t = np.copy(x)
-        t[(t >= -threshold) & (t <= threshold)] = 0
-        self.outputs = {'Out': t}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.005)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16HardShrink(TestHardShrink):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSoftShrink(OpTest):
     def setUp(self):
         self.op_type = "softshrink"
+        self.dtype = np.float32
+        self.init_dtype()
+
         lambda_val = 0.1
+        x = np.random.uniform(0.25, 10, [4, 4]).astype(self.dtype)
+        out = np.copy(x)
+        out = (out < -lambda_val) * (out + lambda_val) + (out > lambda_val) * (
+            out - lambda_val)
+
         self.attrs = {'lambda': lambda_val}
-        self.inputs = {
-            'X': np.random.uniform(0.25, 10, [4, 4]).astype("float32")
-        }
-        y = np.copy(self.inputs['X'])
-        y = (y < -lambda_val) * (y + lambda_val) + (y > lambda_val) * (
-            y - lambda_val)
-        self.outputs = {'Out': y}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16SoftShrink(TestSoftShrink):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSqrt(OpTest):
     def setUp(self):
         self.op_type = "sqrt"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.sqrt(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Sqrt(TestSqrt):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestAbs(OpTest):
     def setUp(self):
         self.op_type = "abs"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
         # Because we set delta = 0.005 in caculating numeric gradient,
         # if x is too small, such as 0.002, x_neg will be -0.003
         # x_pos will be 0.007, so the numeric gradient is unaccurate.
         # we should avoid this
         x[np.abs(x) < 0.005] = 0.02
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.abs(self.inputs['X'])}
+        out = np.abs(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Abs(TestAbs):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestCeil(OpTest):
     def setUp(self):
         self.op_type = "ceil"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.ceil(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.ceil(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+    # The same reason with TestFloor
+
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Ceil(TestCeil):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestFloor(OpTest):
     def setUp(self):
         self.op_type = "floor"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.floor(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.floor(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    # the gradient on floor, ceil, round is undefined.
+    # we return zero as gradient, but the numpy return nan 
+
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Floor(TestFloor):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
+
+class TestCos(OpTest):
+    def setUp(self):
+        self.op_type = "cos"
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.cos(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
 
-class TestRound(OpTest):
+
+class TestFP16Cos(TestCos):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
+
+class TestSin(OpTest):
     def setUp(self):
-        self.op_type = "round"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.round(self.inputs['X'])}
+        self.op_type = "sin"
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.sin(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Sin(TestSin):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
+
+class TestRound(OpTest):
+    def setUp(self):
+        self.op_type = "round"
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
+        out = np.round(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Round(TestRound):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestRelu(OpTest):
     def setUp(self):
         self.op_type = "relu"
-        x = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
         # The same reason with TestAbs
         x[np.abs(x) < 0.005] = 0.02
-        self.inputs = {'X': x}
-        self.outputs = {'Out': np.maximum(self.inputs['X'], 0)}
+        out = np.maximum(x, 0)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Relu(TestRelu):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestBRelu(OpTest):
     def setUp(self):
         self.op_type = "brelu"
-        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
         t_min = 1.0
         t_max = 4.0
         # The same with TestAbs
         x[np.abs(x - t_min) < 0.005] = t_min + 0.02
         x[np.abs(x - t_max) < 0.005] = t_max + 0.02
-
-        self.inputs = {'X': x}
-        self.attrs = {'t_min': t_min, 't_max': t_max}
         t = np.copy(x)
         t[t < t_min] = t_min
         t[t > t_max] = t_max
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {'t_min': t_min, 't_max': t_max}
         self.outputs = {'Out': t}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16BRelu(TestBRelu):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestRelu6(OpTest):
     def setUp(self):
         self.op_type = "relu6"
-        x = np.random.uniform(-1, 1, [4, 10]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [4, 10]).astype(self.dtype)
         threshold = 6.0
         # The same with TestAbs
         x[np.abs(x) < 0.005] = 0.02
         x[np.abs(x - threshold) < 0.005] = threshold + 0.02
+        out = np.minimum(np.maximum(x, 0), threshold)
 
-        self.inputs = {'X': x}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.attrs = {'threshold': threshold}
-        self.outputs = {
-            'Out': np.minimum(np.maximum(self.inputs['X'], 0), threshold)
-        }
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Relu6(TestRelu6):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSoftRelu(OpTest):
     def setUp(self):
         self.op_type = "soft_relu"
-        x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
         threshold = 2.0
         # The same reason with TestAbs
         x[np.abs(x - threshold) < 0.005] = threshold + 0.02
         x[np.abs(x + threshold) < 0.005] = -threshold + 0.02
-        self.inputs = {'X': x}
-        self.attrs = {'threshold': threshold}
         t = np.copy(x)
         t[t < -threshold] = -threshold
         t[t > threshold] = threshold
-        self.outputs = {'Out': np.log((np.exp(t) + 1))}
+        out = np.log((np.exp(t) + 1))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {'threshold': threshold}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16SoftRelu(TestSoftRelu):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestELU(OpTest):
     def setUp(self):
         self.op_type = "elu"
-        x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-3, 3, [4, 4]).astype(self.dtype)
         alpha = 1.
+        out = np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
         # Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1)
         # is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here
         self.inputs = {'X': x}
         self.attrs = {'alpha': alpha}
-        self.outputs = {
-            'Out': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
-        }
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16ELU(TestELU):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestReciprocal(OpTest):
     def setUp(self):
         self.op_type = "reciprocal"
-        self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
-        self.outputs = {'Out': np.reciprocal(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
+        out = np.reciprocal(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.01)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Reciprocal(TestReciprocal):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestLog(OpTest):
     def setUp(self):
         self.op_type = "log"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.log(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.log(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Log(TestLog):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSquare(OpTest):
     def setUp(self):
         self.op_type = "square"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {'Out': np.square(self.inputs['X'])}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        out = np.square(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Square(TestSquare):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestPow(OpTest):
     def setUp(self):
         self.op_type = "pow"
-        self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype)
+        out = np.power(x, 3)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.attrs = {'factor': 3.0}
-        self.outputs = {'Out': np.power(self.inputs['X'], 3)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.02)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Pow(TestPow):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=5e-2)
+
 
 class TestSTanh(OpTest):
     def setUp(self):
         self.op_type = "stanh"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        }
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
         scale_a = 2.0 / 3.0
         scale_b = 1.7159
+        out = scale_b * np.tanh(x * scale_a)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
         self.attrs = {'scale_a': scale_a, 'scale_b': scale_b}
-        self.outputs = {'Out': scale_b * np.tanh(self.inputs['X'] * scale_a)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16STanh(TestSTanh):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSoftplus(OpTest):
     def setUp(self):
         self.op_type = "softplus"
-        self.inputs = {
-            'X': np.random.uniform(-1, 1, [11, 17]).astype("float64")
-        }
-        self.outputs = {'Out': np.log(1 + np.exp(self.inputs['X']))}
+        self.dtype = np.float64
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = np.log(1 + np.exp(x))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Softplus(TestSoftplus):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSoftsign(OpTest):
     def setUp(self):
         self.op_type = "softsign"
-        self.inputs = {
-            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
-        }
-        self.outputs = {
-            'Out': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X']))
-        }
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
+        out = np.divide(x, 1 + np.abs(x))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Softsign(TestSoftsign):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestThresholdedRelu(OpTest):
     def setUp(self):
         self.op_type = "thresholded_relu"
+        self.dtype = np.float32
+        self.init_dtype()
+
         threshold = 0.25
         self.relative_error = 0.005
-        X = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        X = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype)
 
         # Same reason as TestAbs
         X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2
+        out = (X > threshold) * X
 
-        self.inputs = {'X': X}
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
         self.attrs = {'threshold': threshold}
-        self.outputs = {'Out': (X > threshold) * X}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=self.relative_error)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16ThresholdedRelu(TestThresholdedRelu):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestHardSigmoid(OpTest):
     def setUp(self):
         self.op_type = "hard_sigmoid"
+        self.dtype = np.float32
+        self.init_dtype()
+
         self.relative_error = 0.002
 
         X = np.random.uniform(-5, 5, [2, 2]).astype("float32")
@@ -452,7 +1016,6 @@ class TestHardSigmoid(OpTest):
         lower_threshold = -offset / slope
         upper_threshold = (1 - offset) / slope
 
-        self.inputs = {'X': X}
         # Same reason as TestAbs
         X[np.abs(X - lower_threshold) < self.relative_error] = \
             lower_threshold + 0.2
@@ -460,29 +1023,70 @@ class TestHardSigmoid(OpTest):
             upper_threshold - 0.2
 
         temp = X * slope + offset
-        self.outputs = {'Out': np.maximum(0.0, np.minimum(1.0, temp))}
+        out = np.maximum(0.0, np.minimum(1.0, temp))
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.002)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16HardSigmoid(TestHardSigmoid):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 class TestSwish(OpTest):
     def setUp(self):
         self.op_type = "swish"
-        X = np.random.uniform(0.1, 1, [11, 17]).astype("float32")
-        self.inputs = {'X': X}
-        self.attrs = {'beta': 2.3}
-        self.outputs = {'Out': X * expit(self.attrs['beta'] * X)}
+        self.dtype = np.float32
+        self.init_dtype()
+
+        X = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        beta = 2.3
+        out = X * expit(beta * X)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(X)}
+        self.attrs = {'beta': beta}
+        self.outputs = {'Out': out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X'], 'Out', max_relative_error=0.008)
 
+    def init_dtype(self):
+        pass
+
+
+class TestFP16Swish(TestSwish):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c7d5d41f0c512a9fb609dce304c1eed929d28b5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
@@ -0,0 +1,110 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://w_idxw.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+def anchor_generator_in_python(input_feat, anchor_sizes, aspect_ratios,
+                               variances, stride, offset):
+    num_anchors = len(aspect_ratios) * len(anchor_sizes)
+    layer_h = input_feat.shape[2]
+    layer_w = input_feat.shape[3]
+    out_dim = (layer_h, layer_w, num_anchors, 4)
+    out_anchors = np.zeros(out_dim).astype('float32')
+
+    for h_idx in range(layer_h):
+        for w_idx in range(layer_w):
+            x_ctr = (w_idx * stride[0]) + offset * (stride[0] - 1)
+            y_ctr = (h_idx * stride[1]) + offset * (stride[1] - 1)
+            idx = 0
+            for r in range(len(aspect_ratios)):
+                ar = aspect_ratios[r]
+                for s in range(len(anchor_sizes)):
+                    anchor_size = anchor_sizes[s]
+                    area = stride[0] * stride[1]
+                    area_ratios = area / ar
+                    base_w = np.round(np.sqrt(area_ratios))
+                    base_h = np.round(base_w * ar)
+                    scale_w = anchor_size / stride[0]
+                    scale_h = anchor_size / stride[1]
+                    w = scale_w * base_w
+                    h = scale_h * base_h
+                    out_anchors[h_idx, w_idx, idx, :] = [
+                        (x_ctr - 0.5 * (w - 1)), (y_ctr - 0.5 * (h - 1)),
+                        (x_ctr + 0.5 * (w - 1)), (y_ctr + 0.5 * (h - 1))
+                    ]
+                    idx += 1
+
+    # set the variance.
+    out_var = np.tile(variances, (layer_h, layer_w, num_anchors, 1))
+    out_anchors = out_anchors.astype('float32')
+    out_var = out_var.astype('float32')
+    return out_anchors, out_var
+
+
+class TestAnchorGeneratorOp(OpTest):
+    def set_data(self):
+        self.init_test_params()
+        self.init_test_input()
+        self.init_test_output()
+        self.inputs = {'Input': self.input}
+
+        self.attrs = {
+            'anchor_sizes': self.anchor_sizes,
+            'aspect_ratios': self.aspect_ratios,
+            'stride': self.stride,
+            'offset': self.offset,
+            'variances': self.variances,
+        }
+
+        self.outputs = {'Anchors': self.out_anchors, 'Variances': self.out_var}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "anchor_generator"
+        self.set_data()
+
+    def init_test_params(self):
+        self.batch_size = 1
+        self.input_channels = 2
+        self.layer_h = 2
+        self.layer_w = 2
+
+        self.anchor_sizes = [64., 128., 256., 512.]
+        self.aspect_ratios = [0.5, 1., 2.]
+        self.stride = [16., 16.]
+
+        self.offset = 0.5
+
+        self.variances = [0.1, 0.1, 0.2, 0.2]
+
+    def init_test_input(self):
+        self.input = np.random.random(
+            (self.batch_size, self.input_channels, self.layer_h,
+             self.layer_w)).astype('float32')
+
+    def init_test_output(self):
+        self.out_anchors, self.out_var = anchor_generator_in_python(
+            self.input, self.anchor_sizes, self.aspect_ratios, self.variances,
+            self.stride, self.offset)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e04412f809cdd75d07d28a60f0c2f19041a684f6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class BaseTestCase(OpTest):
+    def initTestCase(self):
+        self.op_type = 'arg_min'
+        self.dims = (3, 4, 5)
+        self.dtype = 'float32'
+        self.axis = 0
+
+    def setUp(self):
+        self.initTestCase()
+        self.x = (1000 * np.random.random(self.dims)).astype(self.dtype)
+        self.inputs = {'X': self.x}
+        self.attrs = {'axis': self.axis}
+        if self.op_type == "arg_min":
+            self.outputs = {'Out': np.argmin(self.x, axis=self.axis)}
+        else:
+            self.outputs = {'Out': np.argmax(self.x, axis=self.axis)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCase0(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_max'
+        self.dims = (3, 4, 5)
+        self.dtype = 'float32'
+        self.axis = 0
+
+
+class TestCase1(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_min'
+        self.dims = (3, 4)
+        self.dtype = 'float64'
+        self.axis = 1
+
+
+class TestCase2(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_max'
+        self.dims = (3, 4)
+        self.dtype = 'int64'
+        self.axis = 0
+
+
+class TestCase3(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_max'
+        self.dims = (3, )
+        self.dtype = 'int64'
+        self.axis = 0
+
+
+class TestCase4(BaseTestCase):
+    def initTestCase(self):
+        self.op_type = 'arg_min'
+        self.dims = (1, )
+        self.dtype = 'int32'
+        self.axis = 0
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b29a102a3880406156481fdac54ca7043d3415db
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py
@@ -0,0 +1,56 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestArgsortOp(OpTest):
+    def setUp(self):
+        self.init_axis()
+        x = np.random.random((2, 3, 4, 5, 10)).astype("float32")
+        if self.axis < 0:
+            self.axis = self.axis + len(x.shape)
+        self.indices = np.argsort(x, kind='quicksort', axis=self.axis)
+        self.out = np.sort(x, kind='quicksort', axis=self.axis)
+        self.op_type = "argsort"
+        self.inputs = {'X': x}
+        self.attrs = {'axis': self.axis}
+        self.outputs = {'Indices': self.indices, 'Out': self.out}
+
+    def init_axis(self):
+        self.axis = -1
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestArgsortOpAxis0(TestArgsortOp):
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestArgsortOpAxis1(TestArgsortOp):
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestArgsortOpAxisNeg2(TestArgsortOp):
+    def init_axis(self):
+        self.axis = -2
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..18fa5461590134d2032a29e40699109c12092c6d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest
+from paddle.fluid.framework import grad_var_name
+from test_batch_norm_op import TestBatchNormOpInference, TestBatchNormOpTraining, _reference_training, _reference_grad
+
+
+class TestMKLDNNBatchNormOpTraining(TestBatchNormOpTraining):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.data_formats = ["NCHW"]
+
+    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
+                             epsilon, momentum, shape, data_layout):
+        # run forward
+        y, saved_mean, saved_variance = _reference_training(
+            x, scale, bias, epsilon, data_layout)
+        mean_out = saved_mean * (1. - momentum) + momentum * mean
+        variance_out = saved_variance * (1. - momentum) + momentum * variance
+        # run backward
+        x_grad, scale_grad, bias_grad = _reference_grad(
+            x, y_grad, scale, saved_mean, saved_variance, epsilon, data_layout)
+
+        return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
+
+
+class TestMKLDNNBatchNormOpInference(TestBatchNormOpInference):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def test_check_output(self):
+        place = core.CPUPlace()
+        data_format = "NCHW"
+
+        self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
+
+
+class TestMKLDNNBatchNormOpWithReluInference(TestBatchNormOpInference):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+        self.fuse_with_relu = True
+
+    def test_check_output(self):
+        place = core.CPUPlace()
+        data_format = "NCHW"
+
+        self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 80e6fa6df3c21aa19feb571916f11c41ccd6bb10..a62ee9596d0f6c58135b4a13249b638e84e63c3c 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -14,24 +14,14 @@
 
 import unittest
 import numpy as np
-from op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest
 from paddle.fluid.framework import grad_var_name
 
 
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
-
-
-def _reference_training(x, scale, offset, epsilon, data_format):
+def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
     x_shape = x.shape
     if len(x_shape) == 2:
         if data_format == "NCHW":
@@ -39,6 +29,32 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         else:
             x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
 
+    if data_format == "NCHW":
+        n, c, h, w = x.shape
+        mean_tile = np.reshape(mean, (1, c, 1, 1))
+        mean_tile = np.tile(mean_tile, (n, 1, h, w))
+        var_tile = np.reshape(var, (1, c, 1, 1))
+        var_tile = np.tile(var_tile, (n, 1, h, w))
+        normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon)
+        scale_tile = np.reshape(scale, (1, c, 1, 1))
+        scale_tile = np.tile(scale_tile, (n, 1, h, w))
+        offset_tile = np.reshape(offset, (1, c, 1, 1))
+        offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
+        y = normalized * scale_tile + offset_tile
+    elif data_format == "NHWC":
+        normalized = (x - mean) / np.sqrt(var + epsilon)
+        y = normalized * scale + offset
+    else:
+        raise ValueError("Unknown data order.")
+
+    if len(x_shape) == 2:
+        y = np.reshape(y, x_shape)
+    return y
+
+
+def _reference_training(x, scale, offset, epsilon, data_format):
+    x_shape = x.shape
+
     if data_format == "NCHW":
         n, c, h, w = x.shape
         x_square = x * x
@@ -57,8 +73,6 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         offset_tile = np.reshape(offset, (1, c, 1, 1))
         offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
         y = normalized * scale_tile + offset_tile
-        if len(x_shape) == 2:
-            y = np.reshape(y, (y.shape[0], y.shape[1]))
         return y, mean, var
     elif data_format == "NHWC":
         x_square = x * x
@@ -69,66 +83,52 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         var = x_square_sum / element_count - mean * mean
         normalized = (x - mean) / np.sqrt(var + epsilon)
         y = normalized * scale + offset
-        if len(x_shape) == 2:
-            y = np.reshape(y, x_shape)
         return y, mean, var
     else:
         raise ValueError("Unknown data order.")
 
 
-def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
+def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
     # Use the following formulas to calculate gradients:
     # grad_scale =
     #   sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
     #
     # grad_offset = sum(output_y)
     #
-    # grad_x =
+    # x_grad =
     #   1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
     #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
 
     # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
-    x_shape = x.shape
-
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
+    if data_format != "NCHW" and data_format != "NHWC":
+        raise ValueError("Unknown data order.")
 
     if data_format == "NCHW":
         x = np.transpose(x, (0, 2, 3, 1))
-        grad_y = np.transpose(grad_y, (0, 2, 3, 1))
+        y_grad = np.transpose(y_grad, (0, 2, 3, 1))
 
-        # raise ValueError("data_format must be NHWC, got %s." % data_format)
-    grad_x = scale * (grad_y - np.mean(
-        grad_y, axis=(0, 1, 2)) - (x - mean) * np.mean(
-            grad_y * (x - mean), axis=(0, 1, 2)) /
+    x_grad = scale * (y_grad - np.mean(
+        y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean(
+            y_grad * (x - mean), axis=(0, 1, 2)) /
                       (var + epsilon)) / np.sqrt(var + epsilon)
-    grad_scale = np.sum(grad_y * (x - mean) / np.sqrt(var + epsilon),
+    grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
                         axis=(0, 1, 2))
-    grad_offset = np.sum(grad_y, axis=(0, 1, 2))
+    grad_offset = np.sum(y_grad, axis=(0, 1, 2))
 
     # transfer back to N, C, H, W
     if data_format == "NCHW":
-        grad_x = np.transpose(grad_x, (0, 3, 1, 2))
+        x_grad = np.transpose(x_grad, (0, 3, 1, 2))
         x = np.transpose(x, (0, 3, 1, 2))
-        grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+        y_grad = np.transpose(y_grad, (0, 3, 1, 2))
 
-    if len(x_shape) == 2:
-        grad_x = np.reshape(grad_x, x_shape)
-    return grad_x, grad_scale, grad_offset
+    return x_grad, grad_scale, grad_offset
 
 
 def create_or_get_tensor(scope, var_name, var, place):
     tensor = scope.var(var_name).get_tensor()
     if var is not None:
         assert isinstance(var, np.ndarray)
-        tensor.set_lod([[]])
+        tensor.set_recursive_sequence_lengths([])
         tensor.set_dims(var.shape)
         tensor.set(var, place)
     return tensor
@@ -155,210 +155,280 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
         __set_tensor__(output, data)
 
 
-class TestBatchNormOp(OpTest):
+class TestBatchNormOpInference(unittest.TestCase):
+    def setUp(self):
+        self.dtype = np.float32
+        self.use_mkldnn = False
+        self.fuse_with_relu = False
+        self.init_kernel_type()
+
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
-    def test_python(self):
-        data_format = "NHWC"
+    def check_with_place(self, place, data_layout, dtype, shape):
         epsilon = 0.00001
-        momentum = 0.9
-
-        # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 5
-        x_shape = [n, h, w, c]
+        if len(shape) == 2:
+            x_shape = shape
+            c = x_shape[1]
+        else:
+            n, h, w, c = shape[0], shape[1], shape[2], shape[3]
+            if data_layout == "NHWC":
+                x_shape = [n, h, w, c]
+            elif data_layout == "NCHW":
+                x_shape = [n, c, h, w]
+            else:
+                raise ValueError("Unknown data layout.")
         scale_shape = [c]
 
-        x_val = np.random.random_sample(x_shape).astype(np.float32)
+        x_val = np.random.random_sample(x_shape).astype(dtype)
+        # generate some negative values to test case with relu fused
+        x_val = x_val - 0.5
         scale_val = np.random.random_sample(scale_shape).astype(np.float32)
         bias_val = np.random.random_sample(scale_shape).astype(np.float32)
 
         mean = np.zeros(scale_shape).astype(np.float32)
         variance = np.ones(scale_shape).astype(np.float32)
 
-        # run forward
-        y_out, saved_mean, var_ref = _reference_training(
-            x_val, scale_val, bias_val, epsilon, "NHWC")
+        y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
+                                   epsilon, data_layout).astype(dtype)
+        if self.fuse_with_relu:
+            y_out = np.maximum(y_out, 0)
+
+        scope = core.Scope()
+
+        # create input
+        x_tensor = create_or_get_tensor(scope, "x_val",
+                                        OpTest.np_dtype_to_fluid_dtype(x_val),
+                                        place)
+        scale_tensor = create_or_get_tensor(
+            scope, "scale_val",
+            OpTest.np_dtype_to_fluid_dtype(scale_val), place)
+        bias_tensor = create_or_get_tensor(
+            scope, "bias_val", OpTest.np_dtype_to_fluid_dtype(bias_val), place)
+        mean_tensor = create_or_get_tensor(scope, "mean",
+                                           OpTest.np_dtype_to_fluid_dtype(mean),
+                                           place)
+        variance_tensor = create_or_get_tensor(
+            scope, "variance", OpTest.np_dtype_to_fluid_dtype(variance), place)
+
+        # create output
+        y_tensor = create_or_get_tensor(scope, "y_out", None, place)
+        saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
+                                                 place)
+        saved_variance_tensor = create_or_get_tensor(scope, "saved_variance",
+                                                     None, place)
+        mean_out_tensor = mean_tensor
+        variance_out_tensor = variance_tensor
+
+        batch_norm_op = Operator(
+            "batch_norm",
+            # inputs
+            X="x_val",
+            Scale="scale_val",
+            Bias="bias_val",
+            Mean="mean",
+            Variance="variance",
+            # outputs
+            Y="y_out",
+            MeanOut="mean",
+            VarianceOut="variance",
+            SavedMean="saved_mean",
+            SavedVariance="saved_variance",
+            # attrs
+            is_test=True,
+            data_layout=data_layout,
+            use_mkldnn=self.use_mkldnn,
+            fuse_with_relu=self.fuse_with_relu,
+            epsilon=epsilon)
+
+        batch_norm_op.run(scope, place)
+
+        # check inference result
+        self.__assert_close(
+            y_tensor,
+            y_out,
+            "inference output are different at " + str(place) + ", " +
+            data_layout + ", " + str(np.dtype(dtype)) +
+            str(np.array(y_tensor)) + str(y_out),
+            atol=1e-3)
+
+    def test_check_output(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            for data_format in ["NCHW", "NHWC"]:
+                self.check_with_place(place, data_format, self.dtype,
+                                      [2, 3, 4, 5])
+                self.check_with_place(place, data_format, self.dtype, [2, 3])
+
+    def init_kernel_type(self):
+        pass
+
+
+class TestFP16BatchNormOpInference(TestBatchNormOpInference):
+    def setUp(self):
+        self.dtype = np.float16
+        self.use_mkldnn = False
+        self.fuse_with_relu = False
+        self.init_kernel_type()
+
+    def test_check_output(self):
+        places = []
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                places.append(place)
+
+        for place in places:
+            for data_format in ["NCHW", "NHWC"]:
+                self.check_with_place(place, data_format, self.dtype,
+                                      [2, 3, 4, 5])
+                self.check_with_place(place, data_format, self.dtype, [2, 3])
+
+
+class TestBatchNormOpTraining(unittest.TestCase):
+    def setUp(self):
+        self.use_mkldnn = False
+        self.fuse_with_relu = False
+        self.data_formats = ["NCHW", "NHWC"]
+        self.init_kernel_type()
+
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        np.allclose(np.array(tensor), np_array, atol=atol)
 
-        #
+    def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
+                             epsilon, momentum, shape, data_layout):
+        # run forward
+        y, saved_mean, var_ref = _reference_training(x, scale, bias, epsilon,
+                                                     data_layout)
         mean_out = saved_mean * (1. - momentum) + momentum * mean
         variance_out = var_ref * (1. - momentum) + momentum * variance
         saved_variance = 1. / np.sqrt(var_ref + epsilon)
+        # run backward
+        x_grad, scale_grad, bias_grad = _reference_grad(
+            x, y_grad, scale, saved_mean, var_ref, epsilon, data_layout)
 
-        # running N, C, H, W case
-        # should produce the same results
-        x_shape2 = [n, c, h, w]
-        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
-        y_out2, saved_mean2, var_ref2 = _reference_training(
-            x_val2, scale_val, bias_val, epsilon, "NCHW")
-
-        self.__assert_close(saved_mean, saved_mean2, "batch mean")
-        self.__assert_close(var_ref, var_ref2, "batch variance")
-
-        # transfer (N, C, H, W) back to (N, H, W, C)
-        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "batch variance")
-        print 'python: NHWC, NCHW, forward checking passed'
-
-        # test backward now
-        # NHWC
-        self.y_grad = np.random.random_sample(x_shape).astype(np.float32)
-        y_grad = self.y_grad
-        # y_grad = np.ones(x_shape).astype(np.float32)
-        x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-            x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, "NHWC")
-
-        # NCHW
-        y_grad2 = np.transpose(y_grad, (0, 3, 1, 2))
-        # y_grad2 = np.ones(x_shape2).astype(np.float32)
-        x_grad_ref2, scale_grad_ref2, bias_grad_ref2 = _reference_grad(
-            x_val2, y_grad2, scale_val, saved_mean2, var_ref2, epsilon, "NCHW")
-
-        self.__assert_close(scale_grad_ref, scale_grad_ref2, "scale gradient")
-        self.__assert_close(bias_grad_ref, bias_grad_ref2, "bias gradient")
-
-        x_grad_transpose = np.transpose(x_grad_ref2, (0, 2, 3, 1))
-        self.__assert_close(x_grad_ref, x_grad_transpose, "x gradient")
-        print 'python: NHWC, NCHW, backward checking passed'
+        return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
 
     def test_forward_backward(self):
         def test_with_place(place, data_layout, shape):
             # attr
             epsilon = 0.00001
             momentum = 0.9
-
-            if len(shape) == 2:
-                x_shape = shape
-                c = shape[1]
+            if data_layout == "NCHW":
+                n, c, h, w = shape[0], shape[1], shape[2], shape[3]
             else:
-                # n, h, w, c = 2, 3, 4, 2
                 n, h, w, c = shape[0], shape[1], shape[2], shape[3]
-                if data_format == "NHWC":
-                    x_shape = [n, h, w, c]
-                elif data_format == "NCHW":
-                    x_shape = [n, c, h, w]
-                else:
-                    raise ValueError("Unknown data type.")
             scale_shape = [c]
 
-            x_val = np.random.random_sample(x_shape).astype(np.float32)
-            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-
+            np.random.seed(123)
+            x = np.random.random_sample(shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
             mean = np.zeros(scale_shape).astype(np.float32)
             variance = np.ones(scale_shape).astype(np.float32)
 
-            # run forward
-            y_out, saved_mean, var_ref = _reference_training(
-                x_val, scale_val, bias_val, epsilon, data_format)
-
-            # update moving mean and variance
-            mean_out = saved_mean * (1. - momentum) + momentum * mean
-            variance_out = var_ref * (1. - momentum) + momentum * variance
-            saved_variance = 1. / np.sqrt(var_ref + epsilon)
+            y_grad = np.random.random_sample(shape).astype(np.float32)
+
+            y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
+                x, y_grad, scale, bias, mean, variance, epsilon, momentum,
+                shape, data_layout)
+
+            var_dict = locals()
+            var_dict['y@GRAD'] = y_grad
+
+            var_names = [
+                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
+                'saved_variance'
+            ]
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = fluid.Program()
+            with fluid.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                bn_op = block.append_op(
+                    type="batch_norm",
+                    inputs={
+                        "X": block.var('x'),
+                        "Scale": block.var('scale'),
+                        "Bias": block.var('bias'),
+                        "Mean": block.var('mean'),
+                        "Variance": block.var('variance')
+                    },
+                    outputs={
+                        "Y": block.var('y'),
+                        "MeanOut": block.var('mean'),  # share the same memory
+                        "VarianceOut":
+                        block.var('variance'),  # share the same memory
+                        "SavedMean": block.var('saved_mean'),
+                        "SavedVariance": block.var('saved_variance')
+                    },
+                    attrs={
+                        "momentum": momentum,
+                        "epsilon": epsilon,
+                        "is_test": False,
+                        "data_layout": data_layout,
+                        "use_mkldnn": self.use_mkldnn,
+                        "fuse_with_relu": self.fuse_with_relu
+                    })
+                block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    bn_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                exe = fluid.Executor(place)
+                out = exe.run(
+                    program,
+                    feed={
+                        name: var_dict[name]
+                        for name in
+                        ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
+                    },
+                    fetch_list=[
+                        'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
+                        'x@GRAD', 'scale@GRAD', 'bias@GRAD'
+                    ])
+
+            self.__assert_close(y, out[0], "y")
+            self.__assert_close(mean_out, out[1], "mean")
+            self.__assert_close(variance_out, out[2], "variance", 1e-3)
+            self.__assert_close(saved_mean, out[3], "saved_mean")
+            self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
+            self.__assert_close(x_grad, out[5], "x_grad")
+            self.__assert_close(scale_grad, out[6], "scale_grad")
+            self.__assert_close(bias_grad, out[7], "bias_grad")
 
-            #  for gradient test
-            # y_grad = np.ones(x_shape).astype(np.float32)
-            y_grad = np.zeros(x_shape).astype(np.float32)
-            if len(y_grad.shape) == 2:
-                y_grad[0, 0] = 1.
-            else:
-                y_grad[0, 0, 0, 0] = 1.
-            # y_grad = np.random.random_sample(x_shape).astype(np.float32)
-            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-                x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
-                data_format)
-
-            scope = core.Scope()
-
-            # create input
-            x_tensor = create_or_get_tensor(scope, "x_val", x_val, place)
-            scale_tensor = create_or_get_tensor(scope, "scale_val", scale_val,
-                                                place)
-            bias_tensor = create_or_get_tensor(scope, "bias_val", bias_val,
-                                               place)
-            mean_tensor = create_or_get_tensor(scope, "mean", mean, place)
-            variance_tensor = create_or_get_tensor(scope, "variance", variance,
-                                                   place)
-
-            # create output
-            y_tensor = create_or_get_tensor(scope, "y_out", None, place)
-            saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
-                                                     place)
-            saved_variance_tensor = create_or_get_tensor(
-                scope, "saved_variance", None, place)
-            mean_out_tensor = mean_tensor
-            variance_out_tensor = variance_tensor
-
-            batch_norm_op = Operator(
-                "batch_norm",
-                # inputs
-                X="x_val",
-                Scale="scale_val",
-                Bias="bias_val",
-                Mean="mean",
-                Variance="variance",
-                # outputs
-                Y="y_out",
-                MeanOut="mean",
-                VarianceOut="variance",
-                SavedMean="saved_mean",
-                SavedVariance="saved_variance",
-                # attrs
-                is_test=False,
-                data_layout=data_layout,
-                momentum=momentum,
-                epsilon=epsilon)
-
-            batch_norm_op.run(scope, place)
-
-            # check forward result
-            self.__assert_close(y_tensor, y_out, "y_out")
-            self.__assert_close(saved_mean_tensor, saved_mean, "saved_mean")
-            self.__assert_close(saved_variance_tensor, saved_variance,
-                                "saved_variance")
-            self.__assert_close(mean_out_tensor, mean_out, "mean_out")
-            if isinstance(place, core.CUDAPlace):
-                atol = 5e-2
-            else:
-                atol = 1e-4
-            self.__assert_close(variance_out_tensor, variance_out,
-                                "variance_out", atol)
             print "op test forward passed: ", str(place), data_layout
 
-            # run backward
-            batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
-            set_output_grad(
-                scope,
-                ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
-                place,
-                feed_dict={"y_out": y_grad})
-            batch_norm_op_grad.run(scope, place)
-
-            x_grad_tensor = create_or_get_tensor(scope,
-                                                 grad_var_name("x_val"), None,
-                                                 place)
-            scale_grad_tensor = create_or_get_tensor(scope,
-                                                     grad_var_name("scale_val"),
-                                                     None, place)
-            bias_grad_tensor = create_or_get_tensor(scope,
-                                                    grad_var_name("bias_val"),
-                                                    None, place)
-
-            # check gradient output
-            self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
-            self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
-            self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
-            print "op test backward passed: ", str(place), data_layout
-
         places = [core.CPUPlace()]
+
         if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
             places.append(core.CUDAPlace(0))
 
         for place in places:
-            for data_format in ["NCHW", "NHWC"]:
+            for data_format in self.data_formats:
                 test_with_place(place, data_format, [2, 3, 4, 5])
-                test_with_place(place, data_format, [2, 3])
+
+    def init_kernel_type(self):
+        pass
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
index 4ee00605e22ba45d9e46a8bba27712c3fd97872a..db5771f7b0ad74c73b81d502209c17dce3ce8457 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
@@ -20,44 +20,58 @@ from paddle.fluid.op import Operator
 
 
 class TestBeamSearchDecodeOp(unittest.TestCase):
+    """unittest of beam_search_decode_op"""
+
     def setUp(self):
         self.scope = core.Scope()
-        self.cpu_place = core.CPUPlace()
+        self.place = core.CPUPlace()
 
     def append_lod_tensor(self, tensor_array, lod, data):
         lod_tensor = core.LoDTensor()
         lod_tensor.set_lod(lod)
-        lod_tensor.set(data, self.cpu_place)
+        lod_tensor.set(data, self.place)
         tensor_array.append(lod_tensor)
 
     def test_get_set(self):
         ids = self.scope.var("ids").get_lod_tensor_array()
-        self.append_lod_tensor(
-            ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
-            np.array(
-                [1, 2, 3, 4, 5, 6], dtype="int64"))
-        self.append_lod_tensor(
-            ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
-            np.array(
-                [0, 1, 2, 3, 4, 5], dtype="int64"))
-        self.append_lod_tensor(
-            ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
-            np.array(
-                [0, 1, 2, 3, 4], dtype="int64"))
-
         scores = self.scope.var("scores").get_lod_tensor_array()
-        self.append_lod_tensor(
-            scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
-            np.array(
-                [1, 2, 3, 4, 5, 6], dtype="float64"))
-        self.append_lod_tensor(
-            scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
-            np.array(
-                [0, 1, 2, 3, 4, 5], dtype="float64"))
-        self.append_lod_tensor(
-            scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
-            np.array(
-                [0, 1, 2, 3, 4], dtype="float64"))
+        # Construct sample data with 5 steps and 2 source sentences
+        # beam_size = 2, end_id = 1
+        # start with start_id
+        [
+            self.append_lod_tensor(
+                array, [[0, 1, 2], [0, 1, 2]], np.array(
+                    [0, 0], dtype=dtype))
+            for array, dtype in ((ids, "int64"), (scores, "float32"))
+        ]
+        [
+            self.append_lod_tensor(
+                array, [[0, 1, 2], [0, 2, 4]],
+                np.array(
+                    [2, 3, 4, 5], dtype=dtype))
+            for array, dtype in ((ids, "int64"), (scores, "float32"))
+        ]
+        [
+            self.append_lod_tensor(
+                array, [[0, 2, 4], [0, 2, 2, 4, 4]],
+                np.array(
+                    [3, 1, 5, 4], dtype=dtype))
+            for array, dtype in ((ids, "int64"), (scores, "float32"))
+        ]
+        [
+            self.append_lod_tensor(
+                array, [[0, 2, 4], [0, 1, 2, 3, 4]],
+                np.array(
+                    [1, 1, 3, 5], dtype=dtype))
+            for array, dtype in ((ids, "int64"), (scores, "float32"))
+        ]
+        [
+            self.append_lod_tensor(
+                array, [[0, 2, 4], [0, 0, 0, 2, 2]],
+                np.array(
+                    [5, 1], dtype=dtype))
+            for array, dtype in ((ids, "int64"), (scores, "float32"))
+        ]
 
         sentence_ids = self.scope.var("sentence_ids").get_tensor()
         sentence_scores = self.scope.var("sentence_scores").get_tensor()
@@ -69,20 +83,28 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
             Scores="scores",
             # outputs
             SentenceIds="sentence_ids",
-            SentenceScores="sentence_scores")
+            SentenceScores="sentence_scores",
+            beam_size=2,
+            end_id=1, )
 
-        beam_search_decode_op.run(self.scope, self.cpu_place)
+        beam_search_decode_op.run(self.scope, self.place)
 
-        expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
+        expected_lod = [[0, 2, 4], [0, 4, 7, 12, 17]]
         self.assertEqual(sentence_ids.lod(), expected_lod)
         self.assertEqual(sentence_scores.lod(), expected_lod)
 
         expected_data = np.array(
-            [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64")
+            [0, 2, 3, 1, 0, 2, 1, 0, 4, 5, 3, 5, 0, 4, 5, 3, 1], "int64")
         self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
         self.assertTrue(
             np.array_equal(np.array(sentence_scores), expected_data))
 
 
+class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp):
+    def setUp(self):
+        self.scope = core.Scope()
+        self.place = core.CUDAPlace(0)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
index bc708f3aff54f54d290684d68afa503a50a32dac..167451edd8c46c006c8019678a304a38f18cb946 100644
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
@@ -26,9 +26,12 @@ def create_tensor(scope, name, np_data):
 
 
 class BeamSearchOpTester(unittest.TestCase):
+    """unittest of beam_search_op"""
+
     def setUp(self):
         self.scope = core.Scope()
         self._create_ids()
+        self._create_pre_scores()
         self._create_scores()
         self._create_pre_ids()
         self.scope.var('selected_ids')
@@ -37,7 +40,8 @@ class BeamSearchOpTester(unittest.TestCase):
     def test_run(self):
         op = Operator(
             'beam_search',
-            pre_ids="pre_ids",
+            pre_ids='pre_ids',
+            pre_scores='pre_scores',
             ids='ids',
             scores='scores',
             selected_ids='selected_ids',
@@ -47,15 +51,27 @@ class BeamSearchOpTester(unittest.TestCase):
             end_id=0, )
         op.run(self.scope, core.CPUPlace())
         selected_ids = self.scope.find_var("selected_ids").get_tensor()
-        print 'selected_ids', np.array(selected_ids)
-        print 'lod', selected_ids.lod()
+        selected_scores = self.scope.find_var("selected_scores").get_tensor()
+        self.assertTrue(
+            np.allclose(
+                np.array(selected_ids), np.array([4, 2, 3, 8])[:, np.newaxis]))
+        self.assertTrue(
+            np.allclose(
+                np.array(selected_scores),
+                np.array([0.5, 0.6, 0.9, 0.7])[:, np.newaxis]))
+        self.assertEqual(selected_ids.lod(),
+                         [[0L, 2L, 4L], [0L, 1L, 2L, 3L, 4L]])
 
     def _create_pre_ids(self):
         np_data = np.array([[1, 2, 3, 4]], dtype='int64')
-        tensor = create_tensor(self.scope, "pre_ids", np_data)
+        tensor = create_tensor(self.scope, 'pre_ids', np_data)
+
+    def _create_pre_scores(self):
+        np_data = np.array([[0.1, 0.2, 0.3, 0.4]], dtype='float32')
+        tensor = create_tensor(self.scope, 'pre_scores', np_data)
 
     def _create_ids(self):
-        self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]]
+        self.lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
         np_data = np.array(
             [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64')
         tensor = create_tensor(self.scope, "ids", np_data)
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b04f25ef874cc6204211a4f5f5991a0ec8c473dd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -0,0 +1,166 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+
+
+def bilinear_interp_np(input, out_h, out_w, out_size):
+    if out_size is not None:
+        out_h = out_size[0]
+        out_w = out_size[1]
+    batch_size, channel, in_h, in_w = input.shape
+    if out_h > 1:
+        ratio_h = (in_h - 1.0) / (out_h - 1.0)
+    else:
+        ratio_h = 0.0
+    if out_w > 1:
+        ratio_w = (in_w - 1.0) / (out_w - 1.0)
+    else:
+        ratio_w = 0.0
+
+    out = np.zeros((batch_size, channel, out_h, out_w))
+    for i in range(out_h):
+        h = int(ratio_h * i)
+        hid = 1 if h < in_h - 1 else 0
+        h1lambda = ratio_h * i - h
+        h2lambda = 1.0 - h1lambda
+        for j in range(out_w):
+            w = int(ratio_w * j)
+            wid = 1 if w < in_w - 1 else 0
+            w1lambda = ratio_w * j - w
+            w2lambda = 1.0 - w1lambda
+
+            out[:, :, i, j] = h2lambda*(w2lambda*input[:, :, h, w] +
+                                        w1lambda*input[:, :, h, w+wid]) + \
+                h1lambda*(w2lambda*input[:, :, h+hid, w] +
+                          w1lambda*input[:, :, h+hid, w+wid])
+    return out.astype(input.dtype)
+
+
+class TestBilinearInterpOp(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.init_test_case()
+        self.op_type = "bilinear_interp"
+        input_np = np.random.random(self.input_shape).astype("float32")
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', in_place=True)
+
+    def init_test_case(self):
+        self.input_shape = [2, 3, 4, 4]
+        self.out_h = 2
+        self.out_w = 2
+        self.out_size = np.array([3, 3]).astype("int32")
+
+
+class TestCase1(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+
+
+class TestCase2(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+
+
+class TestCase3(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+
+
+class TestCase4(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.out_size = np.array([2, 2]).astype("int32")
+
+
+class TestCase5(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.out_size = np.array([11, 11]).astype("int32")
+
+
+class TestCase6(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.input_shape = [1, 1, 128, 64]
+        self.out_h = 64
+        self.out_w = 128
+        self.out_size = np.array([65, 129]).astype("int32")
+
+
+class TestBilinearInterpOpUint8(OpTest):
+    def setUp(self):
+        self.out_size = None
+        self.init_test_case()
+        self.op_type = "bilinear_interp"
+        input_np = np.random.randint(
+            low=0, high=256, size=self.input_shape).astype("uint8")
+        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
+                                       self.out_size)
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        self.attrs = {'out_h': self.out_h, 'out_w': self.out_w}
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(place=core.CPUPlace(), atol=1)
+
+    def init_test_case(self):
+        self.input_shape = [1, 3, 9, 6]
+        self.out_h = 10
+        self.out_w = 9
+
+
+class TestCase1Uint8(TestBilinearInterpOpUint8):
+    def init_test_case(self):
+        self.input_shape = [2, 3, 128, 64]
+        self.out_h = 120
+        self.out_w = 50
+
+
+class TestCase2Uint8(TestBilinearInterpOpUint8):
+    def init_test_case(self):
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 5
+        self.out_w = 13
+        self.out_size = np.array([6, 15]).astype("int32")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index f7461ee6dab699064153332116449c8e20a0bac0..d5bd726c4a82ee839703c69a933100bb056cb736 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -65,23 +65,25 @@ def batch_bipartite_match(distance, lod, match_type=None, dist_threshold=None):
         distance (numpy.array) : The distance of two entries with shape [M, N].
         lod (list of int): The offsets of each input in this batch.
     """
-    n = len(lod) - 1
+    n = len(lod)
     m = distance.shape[1]
     match_indices = -1 * np.ones((n, m), dtype=np.int)
     match_dist = np.zeros((n, m), dtype=np.float32)
-    for i in range(len(lod) - 1):
-        bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
-                        match_dist[i, :])
+    cur_offset = 0
+    for i in range(n):
+        bipartite_match(distance[cur_offset:(cur_offset + lod[i]), :],
+                        match_indices[i, :], match_dist[i, :])
         if match_type == 'per_prediction':
-            argmax_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
-                         match_dist[i, :], dist_threshold)
+            argmax_match(distance[cur_offset:(cur_offset + lod[i]), :],
+                         match_indices[i, :], match_dist[i, :], dist_threshold)
+        cur_offset += lod[i]
     return match_indices, match_dist
 
 
 class TestBipartiteMatchOpWithLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
-        lod = [[0, 5, 11, 23]]
+        lod = [[5, 6, 12]]
         dist = np.random.random((23, 217)).astype('float32')
         match_indices, match_dist = batch_bipartite_match(dist, lod[0])
 
@@ -98,7 +100,7 @@ class TestBipartiteMatchOpWithLoD(OpTest):
 class TestBipartiteMatchOpWithoutLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
-        lod = [[0, 8]]
+        lod = [[8]]
         dist = np.random.random((8, 17)).astype('float32')
         match_indices, match_dist = batch_bipartite_match(dist, lod[0])
 
@@ -112,10 +114,27 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
         self.check_output()
 
 
+class TestBipartiteMatchOpWithoutLoDLargeScaleInput(OpTest):
+    def setUp(self):
+        self.op_type = 'bipartite_match'
+        lod = [[300]]
+        dist = np.random.random((300, 17)).astype('float32')
+        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
+
+        self.inputs = {'DistMat': dist}
+        self.outputs = {
+            'ColToRowMatchIndices': match_indices,
+            'ColToRowMatchDist': match_dist,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestBipartiteMatchOpWithPerPredictionType(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
-        lod = [[0, 5, 11, 23]]
+        lod = [[5, 6, 12]]
         dist = np.random.random((23, 237)).astype('float32')
         match_indices, match_dist = batch_bipartite_match(dist, lod[0],
                                                           'per_prediction', 0.5)
diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
index 56f5af91d8e58086c12fde6948229675569aa271..4ce9a4783e2332b6882164a70e1462c6a6d31bef 100644
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
@@ -19,7 +19,8 @@ import math
 from op_test import OpTest
 
 
-def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
+def box_coder(target_box, prior_box, prior_box_var, output_box, code_type,
+              box_normalized):
     prior_box_x = (
         (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
     prior_box_y = (
@@ -30,6 +31,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
         (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
     prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
                                           prior_box_var.shape[1])
+    if not box_normalized:
+        prior_box_height = prior_box_height + 1
+        prior_box_width = prior_box_width + 1
 
     if (code_type == "EncodeCenterSize"):
         target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
@@ -40,6 +44,9 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
             target_box.shape[0], 1)
         target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
             target_box.shape[0], 1)
+        if not box_normalized:
+            target_box_height = target_box_height + 1
+            target_box_width = target_box_width + 1
 
         output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
                 prior_box_var[:,:,0]
@@ -64,21 +71,29 @@ def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
         output_box[:, :, 1] = target_box_y - target_box_height / 2
         output_box[:, :, 2] = target_box_x + target_box_width / 2
         output_box[:, :, 3] = target_box_y + target_box_height / 2
+        if not box_normalized:
+            output_box[:, :, 2] = output_box[:, :, 2] - 1
+            output_box[:, :, 3] = output_box[:, :, 3] - 1
 
 
-def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
+def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type,
+                    box_normalized):
     n = target_box.shape[0]
     m = prior_box.shape[0]
     output_box = np.zeros((n, m, 4), dtype=np.float32)
-    for i in range(len(lod) - 1):
+    cur_offset = 0
+    for i in range(len(lod)):
         if (code_type == "EncodeCenterSize"):
-            box_coder(target_box[lod[i]:lod[i + 1], :], prior_box,
-                      prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
-                      code_type)
+            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :],
+                      prior_box, prior_box_var,
+                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
+                      code_type, box_normalized)
         elif (code_type == "DecodeCenterSize"):
-            box_coder(target_box[lod[i]:lod[i + 1], :, :], prior_box,
-                      prior_box_var, output_box[lod[i]:lod[i + 1], :, :],
-                      code_type)
+            box_coder(target_box[cur_offset:(cur_offset + lod[i]), :, :],
+                      prior_box, prior_box_var,
+                      output_box[cur_offset:(cur_offset + lod[i]), :, :],
+                      code_type, box_normalized)
+        cur_offset += lod[i]
     return output_box
 
 
@@ -88,20 +103,50 @@ class TestBoxCoderOp(OpTest):
 
     def setUp(self):
         self.op_type = "box_coder"
-        lod = [[0, 1, 2, 3, 4, 5]]
+        lod = [[1, 1, 1, 1, 1]]
         prior_box = np.random.random((10, 4)).astype('float32')
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((5, 10, 4)).astype('float32')
         code_type = "DecodeCenterSize"
+        box_normalized = False
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type)
+                                     lod[0], code_type, box_normalized)
 
         self.inputs = {
             'PriorBox': prior_box,
             'PriorBoxVar': prior_box_var,
             'TargetBox': target_box,
         }
-        self.attrs = {'code_type': 'decode_center_size'}
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False
+        }
+        self.outputs = {'OutputBox': output_box}
+
+
+class TestBoxCoderOpWithoutBoxVar(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[0, 1, 2, 3, 4, 5]]
+        prior_box = np.random.random((10, 4)).astype('float32')
+        prior_box_var = np.ones((10, 4)).astype('float32')
+        target_box = np.random.random((5, 10, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        box_normalized = False
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type, box_normalized)
+
+        self.inputs = {
+            'PriorBox': prior_box,
+            'TargetBox': target_box,
+        }
+        self.attrs = {
+            'code_type': 'decode_center_size',
+            'box_normalized': False
+        }
         self.outputs = {'OutputBox': output_box}
 
 
@@ -111,20 +156,21 @@ class TestBoxCoderOpWithLoD(OpTest):
 
     def setUp(self):
         self.op_type = "box_coder"
-        lod = [[0, 4, 12, 20]]
+        lod = [[4, 8, 8]]
         prior_box = np.random.random((10, 4)).astype('float32')
         prior_box_var = np.random.random((10, 4)).astype('float32')
         target_box = np.random.random((20, 4)).astype('float32')
         code_type = "EncodeCenterSize"
+        box_normalized = True
         output_box = batch_box_coder(prior_box, prior_box_var, target_box,
-                                     lod[0], code_type)
+                                     lod[0], code_type, box_normalized)
 
         self.inputs = {
             'PriorBox': prior_box,
             'PriorBoxVar': prior_box_var,
             'TargetBox': (target_box, lod),
         }
-        self.attrs = {'code_type': 'encode_center_size'}
+        self.attrs = {'code_type': 'encode_center_size', 'box_normalized': True}
         self.outputs = {'OutputBox': output_box}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py
index 8fb8d03828393ccfe57c0848d79b960c641ad39a..b8d3ed3aa3eb0e47e79f46cdf681a3b9cca46036 100644
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cast_op.py
@@ -18,7 +18,7 @@ import numpy as np
 import paddle.fluid.core as core
 
 
-class TestCastOp(op_test.OpTest):
+class TestCastOp1(op_test.OpTest):
     def setUp(self):
         ipt = np.random.random(size=[10, 10])
         self.inputs = {'X': ipt.astype('float32')}
@@ -36,5 +36,36 @@ class TestCastOp(op_test.OpTest):
         self.check_grad(['X'], ['Out'])
 
 
+class TestCastOp2(op_test.OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10])
+        # numpy float16 is binded to fluid float16 via uint16
+        self.inputs = {'X': ipt.astype('float16').view(np.uint16)}
+        self.outputs = {'Out': ipt.astype('float32')}
+        self.attrs = {
+            'in_dtype': int(core.VarDesc.VarType.FP16),
+            'out_dtype': int(core.VarDesc.VarType.FP32)
+        }
+        self.op_type = 'cast'
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
+class TestCastOp3(op_test.OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10])
+        self.inputs = {'X': ipt.astype('float32')}
+        self.outputs = {'Out': ipt.astype('float16')}
+        self.attrs = {
+            'in_dtype': int(core.VarDesc.VarType.FP32),
+            'out_dtype': int(core.VarDesc.VarType.FP16)
+        }
+        self.op_type = 'cast'
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint.py b/python/paddle/fluid/tests/unittests/test_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..e22400a045ced16c46b0bf005155f621f249d263
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_checkpoint.py
@@ -0,0 +1,75 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import unittest
+import os
+import tempfile
+
+
+class TestCheckpoint(unittest.TestCase):
+    def setUp(self):
+        self.dirname = tempfile.mktemp()
+        self.max_num_checkpoints = 3
+        self.epoch_interval = 1
+        self.step_interval = 1
+        self.trainer_id = 0
+        self.chief = self.trainer_id == 0
+        self.place = fluid.CPUPlace()
+        self.epoch_id = 100
+        self.step_id = 20
+
+    def test_checkpoint(self):
+        self.save_checkpoint()
+        serial = fluid.io.get_latest_checkpoint_serial(self.dirname)
+        self.assertTrue(serial >= 0)
+        trainer_args = ["epoch_id", "step_id"]
+        epoch_id, step_id = fluid.io.load_trainer_args(
+            self.dirname, serial, self.trainer_id, trainer_args)
+        self.assertEqual(self.step_id, int(step_id))
+        self.assertEqual(self.epoch_id, int(epoch_id))
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            exe = fluid.Executor(self.place)
+            fluid.io.load_checkpoint(exe, self.dirname, serial, program)
+
+        fluid.io.clean_checkpoint(self.dirname, delete_dir=True)
+        self.assertFalse(os.path.isdir(self.dirname))
+
+    def save_checkpoint(self):
+        config = fluid.CheckpointConfig(self.dirname, self.max_num_checkpoints,
+                                        self.epoch_interval, self.step_interval)
+
+        trainer_args = {}
+        trainer_args["epoch_id"] = self.epoch_id
+        trainer_args["step_id"] = self.step_id
+
+        program = fluid.Program()
+        with fluid.program_guard(program):
+            program.global_block().create_var(
+                name="scale_0",
+                psersistable=True,
+                dtype="float32",
+                shape=[32, 32])
+
+            exe = fluid.Executor(self.place)
+            for i in xrange(10):
+                fluid.io.save_checkpoint(exe, config.checkpoint_dir,
+                                         self.trainer_id, trainer_args, program,
+                                         config.max_num_checkpoints)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
index 050df2801c98e8f4167cdd1b4dde858c9f9f07dd..23932194f0ca97954ec9ade3fdcaebd7a32749a0 100644
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
@@ -144,10 +144,10 @@ class TestChunkEvalOp(OpTest):
         starts = sorted(starts)
         self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
             infer, label, starts)
-        self.inputs = {
-            'Inference': (infer, [starts]),
-            'Label': (label, [starts])
-        }
+        lod = []
+        for i in range(len(starts) - 1):
+            lod.append(starts[i + 1] - starts[i])
+        self.inputs = {'Inference': (infer, [lod]), 'Label': (label, [lod])}
         precision = float(
             self.num_correct_chunks
         ) / self.num_infer_chunks if self.num_infer_chunks else 0
diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index 558f3a4dcbb8fe39c427d8b100f4488440e8b8cb..e9f3c45dc40b3333fe7304f8e4313d156bd5374c 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -20,19 +20,46 @@ from op_test import OpTest
 class TestConcatOp(OpTest):
     def setUp(self):
         self.op_type = "concat"
-        x0 = np.random.random((2, 1, 4, 5)).astype('float32')
-        x1 = np.random.random((2, 2, 4, 5)).astype('float32')
-        x2 = np.random.random((2, 3, 4, 5)).astype('float32')
-        axis = 1
-        self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]}
-        self.attrs = {'axis': axis}
-        self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)}
+        self.init_test_data()
+        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
+        self.attrs = {'axis': self.axis}
+        self.outputs = {
+            'Out': np.concatenate(
+                (self.x0, self.x1, self.x2), axis=self.axis)
+        }
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
         self.check_grad(['x0'], 'Out')
+        self.check_grad(['x1'], 'Out')
+        self.check_grad(['x2'], 'Out')
+
+    def init_test_data(self):
+        self.x0 = np.random.random((2, 1, 4, 5)).astype('float32')
+        self.x1 = np.random.random((2, 2, 4, 5)).astype('float32')
+        self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
+        self.axis = 1
+
+
+class TestConcatOp2(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
+        self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
+        self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
+        self.axis = 1
+
+
+class TestConcatOp3(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = np.random.random((1, 256, 170, 256)).astype('float32')
+        self.x1 = np.random.random((1, 128, 170, 256)).astype('float32')
+        self.x2 = np.random.random((1, 128, 170, 256)).astype('float32')
+        self.axis = 1
+
+    def test_check_grad(self):
+        pass
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_cond_op.py b/python/paddle/fluid/tests/unittests/test_cond_op.py
deleted file mode 100644
index 66fbae961a2701e79da5222ae2689108335c4065..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_cond_op.py
+++ /dev/null
@@ -1,128 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import paddle.fluid.core as core
-import unittest
-import numpy as np
-from paddle.fluid.op import Operator, CondOp
-
-
-class PySimpleCond(object):
-    '''
-    A simple implementation of dynamic if-else based on numpy
-    '''
-
-    def __init__(self):
-        array = [1] * 10
-        for i in range(1, 10, 2):
-            array[i] = 0
-        self.cond = np.array(array)
-        self.x = np.ones(shape=(10, 1)).astype("float32")
-
-    def forward(self):
-        self.index_t = np.where(self.cond == 1)
-        self.index_f = np.where(self.cond == 0)
-        y_t = self.x[self.index_t]
-        y_f = self.x[self.index_f]
-        y_t = y_t * 2.
-        y_f = y_f * (-2.)
-        output = np.zeros(shape=(10, 1))
-        output[self.index_t] = y_t
-        output[self.index_f] = y_f
-        return output
-
-
-class PySimpleCondTest(unittest.TestCase):
-    def setUp(self):
-        self.condnn = PySimpleCond()
-
-    def test_forward(self):
-        output = self.condnn.forward()
-
-
-def create_tensor(scope, name, shape, np_data):
-    tensor = scope.var(name).get_tensor()
-    tensor.set_dims(shape)
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-
-
-class TestCondOp(unittest.TestCase):
-    '''
-    Test CondOp
-
-    equation:
-        cond = [True, False, True, False, ...]
-        y[index_t] = x[index_t] * 2.
-        y[index_f] = x[index_f] * -2.
-    outputs:
-        y
-    '''
-
-    def setUp(self):
-        self.py_cond = PySimpleCond()
-
-    def forward(self):
-        self.scope = core.Scope()
-        self.create_global_variables()
-        self.create_cond_op()
-        self.create_sub_net()
-        self.condop.run(self.scope, core.CPUPlace())
-        return np.array(self.scope.find_var("Out").get_tensor())
-
-    def create_global_variables(self):
-        x_np_data = self.py_cond.x
-        create_tensor(self.scope, "X", [10, 1], x_np_data)
-        cond_np_data = self.py_cond.cond.astype("int32")
-        create_tensor(self.scope, "cond", [10, 1], cond_np_data)
-        self.scope.var("SubScopes")
-        self.scope.var("IndexTensors")
-        self.scope.var("Out")
-
-    def create_cond_op(self):
-        self.condop = CondOp(
-            Cond="cond",
-            Xs=["X"],
-            Outs=["Out"],
-            SubScopes="SubScopes",
-            IndexTensors="IndexTensors")
-
-    def create_sub_net(self):
-        truenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Out='Out', scale=2.)
-        truenet.append_op(scale_op_t)
-        truenet.complete_add_op(True)
-        self.condop.set_truenet(truenet)
-
-        falsenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Out='Out', scale=-2.)
-        falsenet.append_op(scale_op_t)
-        falsenet.complete_add_op(True)
-        self.condop.set_falsenet(falsenet)
-
-    def test_forward(self):
-        print 'test cond op forward'
-        pd_output = self.forward()
-        py_output = self.py_cond.forward()
-        print 'pd_output', pd_output
-        print
-        print 'py_output', py_output
-        self.assertEqual(pd_output.shape, py_output.shape)
-        print 'test passed'
-        return 0
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..db6be21baaa54d33af9f5c44d1815e4b389eb884
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride
+
+
+class TestMKLDNN(TestConv2dOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNWithPad(TestWithPad):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNWithStride(TestWithStride):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index a49fecf09509f7b1d9f758eebcf90bf9fbf7669f..a478649541ba9828e55c4239090d5aee554223ac 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -63,9 +63,11 @@ def conv2d_forward_naive(input, filter, group, conv_param):
 
 class TestConv2dOp(OpTest):
     def setUp(self):
+        self.op_type = "conv2d"
         self.use_cudnn = False
         self.use_mkldnn = False
-        self.init_op_type()
+        self.dtype = np.float32
+        self.init_kernel_type()
         self.init_group()
         self.init_dilation()
         self.init_test_case()
@@ -75,12 +77,16 @@ class TestConv2dOp(OpTest):
             'pad': self.pad,
             'dilation': self.dilations
         }
-        input = np.random.random(self.input_size).astype("float32")
-        filter = np.random.random(self.filter_size).astype("float32")
+
+        input = np.random.random(self.input_size).astype(self.dtype)
+        filter = np.random.random(self.filter_size).astype(self.dtype)
         output = conv2d_forward_naive(input, filter, self.groups,
-                                      conv2d_param).astype('float32')
+                                      conv2d_param).astype(self.dtype)
 
-        self.inputs = {'Input': input, 'Filter': filter}
+        self.inputs = {
+            'Input': OpTest.np_dtype_to_fluid_dtype(input),
+            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+        }
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
@@ -91,15 +97,20 @@ class TestConv2dOp(OpTest):
         }
         self.outputs = {'Output': output}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn:
+        if self.dtype == np.float16:
+            return
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place,
@@ -111,7 +122,9 @@ class TestConv2dOp(OpTest):
                 set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
 
     def test_check_grad_no_filter(self):
-        if self.use_cudnn:
+        if self.dtype == np.float16:
+            return
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Input'],
@@ -126,7 +139,9 @@ class TestConv2dOp(OpTest):
                 no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
-        if self.use_cudnn:
+        if self.dtype == np.float16:
+            return
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Filter'],
@@ -154,8 +169,8 @@ class TestConv2dOp(OpTest):
     def init_group(self):
         self.groups = 1
 
-    def init_op_type(self):
-        self.op_type = "conv2d"
+    def init_kernel_type(self):
+        pass
 
 
 class TestWithPad(TestConv2dOp):
@@ -227,39 +242,105 @@ class TestWithInput1x1Filter1x1(TestConv2dOp):
 
 #----------------Conv2dCUDNN----------------
 class TestCUDNN(TestConv2dOp):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
+
+
+class TestFP16CUDNN(TestConv2dOp):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestCUDNNWithPad(TestWithPad):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
+
+
+class TestFP16CUDNNWithPad(TestWithPad):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestCUDNNWithStride(TestWithStride):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
+
+
+class TestFP16CUDNNWithStride(TestWithStride):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestCUDNNWithGroup(TestWithGroup):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
+
+
+class TestFP16CUDNNWithGroup(TestWithGroup):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestCUDNNWith1x1(TestWith1x1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
+
+
+class TestFP16CUDNNWith1x1(TestWith1x1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestCUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv2d"
+
+
+class TestFP16CUDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestDepthwiseConv(TestConv2dOp):
@@ -292,25 +373,5 @@ class TestDepthwiseConv2(TestConv2dOp):
 #     def init_op_type(self):
 #         self.op_type = "conv_cudnn"
 
-
-#----------------Conv2dMKLDNN----------------
-class TestMKLDNN(TestConv2dOp):
-    def init_op_type(self):
-        self.use_mkldnn = True
-        self.op_type = "conv2d"
-
-
-class TestMKLDNNWithPad(TestWithPad):
-    def init_op_type(self):
-        self.use_mkldnn = True
-        self.op_type = "conv2d"
-
-
-class TestMKLDNNWithStride(TestWithStride):
-    def init_op_type(self):
-        self.use_mkldnn = True
-        self.op_type = "conv2d"
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index d864b9b348e961c585749d47d449d775b2dfebc9..07545e7feb46c85a4b80f9b846be27d36cbfb59a 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -21,8 +21,11 @@ from op_test import OpTest
 
 def conv2dtranspose_forward_naive(input_, filter_, attrs):
     in_n, in_c, in_h, in_w = input_.shape
-    f_c, out_c, f_h, f_w = filter_.shape
+    f_c, f_out_c, f_h, f_w = filter_.shape
+    groups = attrs['groups']
     assert in_c == f_c
+    out_c = f_out_c * groups
+    sub_in_c = in_c / groups
 
     stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
         'dilations']
@@ -36,15 +39,21 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
     for n in range(in_n):
         for i in range(in_h):
             for j in range(in_w):
-                input_masked = input_[n, :, i, j]  # (c)
-                input_masked = np.reshape(input_masked, (in_c, 1, 1))
-                input_masked = np.tile(input_masked, (1, f_h, f_w))
-
-                for k in range(out_c):
-                    tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0)
-                    i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
-                    j1, j2 = j * stride[0], j * stride[0] + d_bolck_h
-                    out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out
+                for g in range(groups):
+                    input_masked = input_[n, g * sub_in_c:(g + 1) * sub_in_c, i,
+                                          j]  # (c)
+                    input_masked = np.reshape(input_masked, (sub_in_c, 1, 1))
+                    input_masked = np.tile(input_masked, (1, f_h, f_w))
+
+                    for k in range(f_out_c):
+                        tmp_out = np.sum(
+                            input_masked *
+                            filter_[g * sub_in_c:(g + 1) * sub_in_c, k, :, :],
+                            axis=0)
+                        i1, i2 = i * stride[0], i * stride[0] + d_bolck_h
+                        j1, j2 = j * stride[0], j * stride[0] + d_bolck_h
+                        out[n, g * f_out_c + k, i1:i2:dilations[0], j1:j2:
+                            dilations[1]] += tmp_out
 
     out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
     return out
@@ -64,6 +73,7 @@ class TestConv2dTransposeOp(OpTest):
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
+            'groups': self.groups,
             'dilations': self.dilations,
             'use_cudnn': self.use_cudnn,
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
@@ -127,6 +137,7 @@ class TestConv2dTransposeOp(OpTest):
         self.pad = [0, 0]
         self.stride = [1, 1]
         self.dilations = [1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3]
@@ -140,16 +151,29 @@ class TestWithPad(TestConv2dTransposeOp):
         self.pad = [1, 1]
         self.stride = [1, 1]
         self.dilations = [1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3]
 
 
+class TestWithGroups(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3]
+
+
 class TestWithStride(TestConv2dTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
         self.dilations = [1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3]
@@ -159,6 +183,7 @@ class TestWithDilation(TestConv2dTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
+        self.groups = 1
         self.dilations = [2, 2]
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
@@ -176,6 +201,7 @@ class TestCUDNNWithPad(TestWithPad):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
+        self.groups = 1
         self.dilations = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
@@ -190,6 +216,7 @@ class TestCUDNNWithStride(TestWithStride):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
+        self.groups = 1
         self.dilations = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
         f_c = self.input_size[1]
@@ -200,6 +227,34 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv2d_transpose"
 
 
+class TestCUDNNWithGroups(TestWithGroups):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d_transpose"
+
+
+class TestDepthwiseConvTranspose(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.dilations = [1, 1]
+        self.input_size = [2, 8, 16, 16]  # NCHW
+        self.groups = 8
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [self.input_size[1], f_c, 4, 4]
+        self.op_type = "depthwise_conv2d_transpose"
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index d5dd63e8737cbdd9b91d083fbd0b38f8baf570b3..dd4ef7cc94ea1e8de5fe4775408389907d47d0d6 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -70,9 +70,11 @@ def conv3d_forward_naive(input, filter, group, conv_param):
 
 class TestConv3dOp(OpTest):
     def setUp(self):
+        self.op_type = "conv3d"
         self.use_cudnn = False
+        self.dtype = np.float32
+        self.init_kernel_type()
         self.init_group()
-        self.init_op_type()
         self.init_dilation()
         self.init_test_case()
 
@@ -80,32 +82,41 @@ class TestConv3dOp(OpTest):
             'stride': self.stride,
             'pad': self.pad,
             'dilations': self.dilations,
-            'use_cudnn': self.use_cudnn,
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
         }
-        input = np.random.random(self.input_size).astype("float32")
-        filter = np.random.random(self.filter_size).astype("float32")
+
+        input = np.random.random(self.input_size).astype(self.dtype)
+        filter = np.random.random(self.filter_size).astype(self.dtype)
         output = conv3d_forward_naive(input, filter, self.groups,
-                                      conv3d_param).astype("float32")
+                                      conv3d_param).astype(self.dtype)
 
-        self.inputs = {'Input': input, 'Filter': filter}
+        self.inputs = {
+            'Input': OpTest.np_dtype_to_fluid_dtype(input),
+            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+        }
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
             'groups': self.groups,
-            'dilations': self.dilations
+            'dilations': self.dilations,
+            'use_cudnn': self.use_cudnn
         }
         self.outputs = {'Output': output}
 
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
+
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn:
+        if self.dtype == np.float16:
+            return
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place,
@@ -117,7 +128,9 @@ class TestConv3dOp(OpTest):
                 set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
 
     def test_check_grad_no_filter(self):
-        if self.use_cudnn:
+        if self.dtype == np.float16:
+            return
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Input'],
@@ -132,7 +145,9 @@ class TestConv3dOp(OpTest):
                 no_grad_set=set(['Filter']))
 
     def test_check_grad_no_input(self):
-        if self.use_cudnn:
+        if self.dtype == np.float16:
+            return
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, ['Filter'],
@@ -160,8 +175,8 @@ class TestConv3dOp(OpTest):
     def init_group(self):
         self.groups = 1
 
-    def init_op_type(self):
-        self.op_type = "conv3d"
+    def init_kernel_type(self):
+        pass
 
 
 class TestCase1(TestConv3dOp):
@@ -232,34 +247,90 @@ class TestWithDilation(TestConv3dOp):
         self.groups = 3
 
 
+#----------------Conv3dCUDNN----------------
 class TestCUDNN(TestConv3dOp):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"
+
+
+class TestFP16CUDNN(TestConv3dOp):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestWithGroup1CUDNN(TestWithGroup1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"
+
+
+class TestFP16WithGroup1CUDNN(TestWithGroup1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestWithGroup2CUDNN(TestWithGroup2):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"
+
+
+class TestFP16WithGroup2CUDNN(TestWithGroup2):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestWith1x1CUDNN(TestWith1x1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"
+
+
+class TestFP16With1x1CUDNN(TestWith1x1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "conv3d"
+
+
+class TestFP16WithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-2)
 
 
 # FIXME(typhoonzero): find a way to determine if
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
index 55ba238710c56dd0daea388cd2dcdb79243bb71e..c9f26d10df8ff39d6bd77b1597336600f676d362 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
@@ -21,8 +21,11 @@ from op_test import OpTest
 
 def conv3dtranspose_forward_naive(input_, filter_, attrs):
     in_n, in_c, in_d, in_h, in_w = input_.shape
-    f_c, out_c, f_d, f_h, f_w = filter_.shape
+    f_c, f_out_c, f_d, f_h, f_w = filter_.shape
+    groups = attrs['groups']
     assert in_c == f_c
+    out_c = f_out_c * groups
+    sub_in_c = in_c / groups
 
     stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
         'dilations']
@@ -39,18 +42,23 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs):
         for d in range(in_d):
             for i in range(in_h):
                 for j in range(in_w):
-                    input_masked = input_[n, :, d, i, j]  # (c)
-                    input_masked = np.reshape(input_masked, (in_c, 1, 1, 1))
-                    input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
-
-                    for k in range(out_c):
-                        tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
-                                         axis=0)
-                        d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
-                        i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
-                        j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
-                        out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2:
-                            dilations[2]] += tmp_out
+                    for g in range(groups):
+                        input_masked = input_[n, g * sub_in_c:(g + 1
+                                                               ) * sub_in_c, d,
+                                              i, j]  # (c)
+                        input_masked = np.reshape(input_masked,
+                                                  (sub_in_c, 1, 1, 1))
+                        input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
+
+                        for k in range(f_out_c):
+                            tmp_out = np.sum(input_masked * filter_[
+                                g * sub_in_c:(g + 1) * sub_in_c, k, :, :, :],
+                                             axis=0)
+                            d1, d2 = d * stride[0], d * stride[0] + d_bolck_d
+                            i1, i2 = i * stride[1], i * stride[1] + d_bolck_h
+                            j1, j2 = j * stride[2], j * stride[2] + d_bolck_w
+                            out[n, g * f_out_c + k, d1:d2:dilations[0], i1:i2:
+                                dilations[1], j1:j2:dilations[2]] += tmp_out
 
     out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
               pad[2]]
@@ -72,6 +80,7 @@ class TestConv3dTransposeOp(OpTest):
             'strides': self.stride,
             'paddings': self.pad,
             'dilations': self.dilations,
+            'groups': self.groups,
             'use_cudnn': self.use_cudnn,
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
         }
@@ -134,6 +143,7 @@ class TestConv3dTransposeOp(OpTest):
         self.pad = [0, 0, 0]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -147,16 +157,29 @@ class TestWithPad(TestConv3dTransposeOp):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
 
 
+class TestWithGroups(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3, 3]
+
+
 class TestWithStride(TestConv3dTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1, 1]
         self.stride = [2, 2, 2]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -167,6 +190,7 @@ class TestWithDilation(TestConv3dTransposeOp):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
         self.dilations = [2, 2, 2]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -184,6 +208,7 @@ class TestCUDNNWithPad(TestWithPad):
         self.pad = [1, 1, 1]
         self.stride = [1, 1, 1]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -198,6 +223,7 @@ class TestCUDNNWithStride(TestWithStride):
         self.pad = [1, 1, 1]
         self.stride = [2, 2, 2]
         self.dilations = [1, 1, 1]
+        self.groups = 1
         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
         f_c = self.input_size[1]
         self.filter_size = [f_c, 6, 3, 3, 3]
@@ -207,6 +233,21 @@ class TestCUDNNWithStride(TestWithStride):
         self.op_type = "conv3d_transpose"
 
 
+class TestCUDNNWithGroups(TestWithGroups):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.groups = 2
+        self.input_size = [2, 4, 5, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 3, 3, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d_transpose"
+
+
 # Please Don't remove the following code.
 # Currently, CI use cudnn V5.0 which not support dilation conv.
 # class TestCUDNNWithDilation(TestWithDilation):
diff --git a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
index f397f542bb07519886d75618e2a915c2dbf61fce..122b076c2d3e3a69f52a2c335e2bc89707b4fa9b 100644
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
@@ -22,9 +22,9 @@ from op_test import OpTest
 class CRFDecoding(object):
     def __init__(self, emission_weights, transition_weights,
                  seq_start_positions):
-        assert (emission_weights.shape[0] == seq_start_positions[-1])
+        assert (emission_weights.shape[0] == sum(seq_start_positions))
         self.tag_num = emission_weights.shape[1]
-        self.seq_num = len(seq_start_positions) - 1
+        self.seq_num = len(seq_start_positions)
 
         self.seq_start_positions = seq_start_positions
         self.x = emission_weights
@@ -34,9 +34,9 @@ class CRFDecoding(object):
         self.w = transition_weights[2:, :]
 
         self.track = np.zeros(
-            (seq_start_positions[-1], self.tag_num), dtype="int64")
+            (sum(seq_start_positions), self.tag_num), dtype="int64")
         self.decoded_path = np.zeros(
-            (seq_start_positions[-1], 1), dtype="int64")
+            (sum(seq_start_positions), 1), dtype="int64")
 
     def _decode_one_sequence(self, decoded_path, x):
         seq_len, tag_num = x.shape
@@ -71,9 +71,11 @@ class CRFDecoding(object):
             decoded_path[i - 1] = max_idx = track[i, max_idx]
 
     def decode(self):
+        cur_pos = 0
         for i in range(self.seq_num):
-            start = self.seq_start_positions[i]
-            end = self.seq_start_positions[i + 1]
+            start = cur_pos
+            cur_pos += self.seq_start_positions[i]
+            end = cur_pos
             self._decode_one_sequence(self.decoded_path[start:end, :],
                                       self.x[start:end, :])
         return self.decoded_path
@@ -90,11 +92,13 @@ class TestCRFDecodingOp1(OpTest):
         TAG_NUM = 17
         MAX_SEQ_LEN = 10
 
-        lod = [[0]]
+        lod = [[]]
+        total_len = 0
         for i in range(SEQ_NUM):
-            lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN))
+            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
+            total_len += lod[-1][-1]
         emission = np.random.uniform(-1, 1,
-                                     [lod[-1][-1], TAG_NUM]).astype("float64")
+                                     [total_len, TAG_NUM]).astype("float64")
         transition = np.random.uniform(-0.5, 0.5,
                                        [TAG_NUM + 2, TAG_NUM]).astype("float64")
 
@@ -126,7 +130,8 @@ class TestCRFDecodingOp2(OpTest):
         self.op_type = "crf_decoding"
         TAG_NUM = 5
 
-        lod = [[0, 1, 3, 6, 10]]
+        lod = [[1, 2, 3, 4]]
+        total_len = sum(lod[-1])
         transition = np.repeat(
             np.arange(
                 TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
@@ -135,13 +140,13 @@ class TestCRFDecodingOp2(OpTest):
         emission = np.repeat(
             np.arange(
                 TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
-            lod[-1][-1],
+            total_len,
             axis=0)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
+            low=0, high=TAG_NUM, size=(total_len, 1), dtype="int64")
         predicted_labels = np.ones(
-            (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1)
+            (total_len, 1), dtype="int64") * (TAG_NUM - 1)
         expected_output = (labels == predicted_labels).astype("int64")
 
         self.inputs = {
diff --git a/python/paddle/fluid/tests/unittests/test_crop_op.py b/python/paddle/fluid/tests/unittests/test_crop_op.py
index 20cc3a643f1adfc04faad15e1b7baad3e22d9d29..4016089c01644f0389855ab114360f90c50a1bbe 100644
--- a/python/paddle/fluid/tests/unittests/test_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_op.py
@@ -42,9 +42,9 @@ class TestCropOp(OpTest):
     def setUp(self):
         self.op_type = "crop"
         self.crop_by_input = False
+        self.offset_by_input = False
         self.attrs = {}
         self.initTestCase()
-        self.attrs['offsets'] = self.offsets
         if self.crop_by_input:
             self.inputs = {
                 'X': np.random.random(self.x_shape).astype("float32"),
@@ -55,6 +55,10 @@ class TestCropOp(OpTest):
             self.inputs = {
                 'X': np.random.random(self.x_shape).astype("float32"),
             }
+        if self.offset_by_input:
+            self.inputs['Offsets'] = np.array(self.offsets).astype('int32')
+        else:
+            self.attrs['offsets'] = self.offsets
         self.outputs = {
             'Out': crop(self.inputs['X'], self.offsets, self.crop_shape)
         }
@@ -101,5 +105,22 @@ class TestCase4(TestCropOp):
         self.crop_by_input = True
 
 
+class TestCase5(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (3, 4, 5)
+        self.crop_shape = [2, 2, 3]
+        self.offsets = [1, 0, 2]
+        self.offset_by_input = True
+
+
+class TestCase6(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (10, 9, 14)
+        self.crop_shape = [3, 3, 5]
+        self.offsets = [3, 5, 4]
+        self.crop_by_input = True
+        self.offset_by_input = True
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ctc_align.py b/python/paddle/fluid/tests/unittests/test_ctc_align.py
index f166031a1cbbaa5e312f5c7919b39648d0dad013..131b4076f45ae25b45bb3f64da07a5c3aacc43d5 100644
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
@@ -22,14 +22,16 @@ from test_softmax_op import stable_softmax
 def CTCAlign(input, lod, blank, merge_repeated):
     lod0 = lod[0]
     result = []
-    for i in range(len(lod0) - 1):
+    cur_offset = 0
+    for i in range(len(lod0)):
         prev_token = -1
-        for j in range(lod0[i], lod0[i + 1]):
+        for j in range(cur_offset, cur_offset + lod0[i]):
             token = input[j][0]
             if (token != blank) and not (merge_repeated and
                                          token == prev_token):
                 result.append(token)
             prev_token = token
+        cur_offset += lod0[i]
     result = np.array(result).reshape([len(result), 1]).astype("int32")
     if len(result) == 0:
         result = np.array([-1])
@@ -39,7 +41,7 @@ def CTCAlign(input, lod, blank, merge_repeated):
 class TestCTCAlignOp(OpTest):
     def config(self):
         self.op_type = "ctc_align"
-        self.input_lod = [[0, 11, 18]]
+        self.input_lod = [[11, 7]]
         self.blank = 0
         self.merge_repeated = False
         self.input = np.array(
@@ -66,7 +68,7 @@ class TestCTCAlignOp(OpTest):
 class TestCTCAlignOpCase1(TestCTCAlignOp):
     def config(self):
         self.op_type = "ctc_align"
-        self.input_lod = [[0, 11, 19]]
+        self.input_lod = [[11, 8]]
         self.blank = 0
         self.merge_repeated = True
         self.input = np.array(
@@ -77,7 +79,7 @@ class TestCTCAlignOpCase1(TestCTCAlignOp):
 class TestCTCAlignOpCase2(TestCTCAlignOp):
     def config(self):
         self.op_type = "ctc_align"
-        self.input_lod = [[0, 4]]
+        self.input_lod = [[4]]
         self.blank = 0
         self.merge_repeated = True
         self.input = np.array([0, 0, 0, 0]).reshape([4, 1]).astype("int32")
diff --git a/python/paddle/fluid/tests/unittests/test_data_balance.py b/python/paddle/fluid/tests/unittests/test_data_balance.py
new file mode 100644
index 0000000000000000000000000000000000000000..cffa3329ac556dc77f3cb508b807cbd49bb974f7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import numpy as np
+
+
+class TestDataBalance(unittest.TestCase):
+    def prepare_data(self):
+        def fake_data_generator():
+            for n in xrange(self.total_ins_num):
+                yield np.ones((3, 4)) * n, n
+
+        # Prepare data
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(
+                fake_data_generator, batch_size=self.batch_size)
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    fluid.layers.data(
+                        name='image', shape=[3, 4], dtype='float32'),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+                self.data_file_name, reader, feeder)
+
+    def prepare_lod_data(self):
+        def fake_data_generator():
+            for n in xrange(1, self.total_ins_num + 1):
+                d1 = (np.ones((n, 3)) * n).astype('float32')
+                d2 = (np.array(n).reshape((1, 1))).astype('int32')
+                yield d1, d2
+
+        # Prepare lod data
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            with fluid.recordio_writer.create_recordio_writer(
+                    filename=self.lod_data_file_name) as writer:
+                eof = False
+                generator = fake_data_generator()
+                while (not eof):
+                    data_batch = [
+                        np.array([]).reshape((0, 3)), np.array([]).reshape(
+                            (0, 1))
+                    ]
+                    lod = [0]
+                    for _ in xrange(self.batch_size):
+                        try:
+                            ins = generator.next()
+                        except StopIteration:
+                            eof = True
+                            break
+                        for i, d in enumerate(ins):
+                            data_batch[i] = np.concatenate(
+                                (data_batch[i], d), axis=0)
+                        lod.append(lod[-1] + ins[0].shape[0])
+                    if data_batch[0].shape[0] > 0:
+                        for i, d in enumerate(data_batch):
+                            t = fluid.LoDTensor()
+                            t.set(data_batch[i], fluid.CPUPlace())
+                            if i == 0:
+                                t.set_lod([lod])
+                            writer.append_tensor(t)
+                        writer.complete_append_tensor()
+
+    def setUp(self):
+        self.use_cuda = fluid.core.is_compiled_with_cuda()
+        self.data_file_name = './data_balance_test.recordio'
+        self.lod_data_file_name = './data_balance_with_lod_test.recordio'
+        self.total_ins_num = 50
+        self.batch_size = 10
+        self.prepare_data()
+        self.prepare_lod_data()
+
+    def main(self):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        with fluid.program_guard(main_prog, startup_prog):
+            data_reader = fluid.layers.io.open_files(
+                filenames=[self.data_file_name],
+                shapes=[[-1, 3, 4], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            if self.use_cuda:
+                data_reader = fluid.layers.double_buffer(data_reader)
+            image, label = fluid.layers.read_file(data_reader)
+
+            place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup_prog)
+
+            parallel_exe = fluid.ParallelExecutor(
+                use_cuda=self.use_cuda, main_program=main_prog)
+
+            if (parallel_exe.device_count > self.batch_size):
+                print("WARNING: Unittest TestDataBalance skipped. \
+                    For the result is not correct when device count \
+                    is larger than batch size.")
+                exit(0)
+            fetch_list = [image.name, label.name]
+
+            data_appeared = [False] * self.total_ins_num
+            while (True):
+                try:
+                    image_val, label_val = parallel_exe.run(fetch_list,
+                                                            return_numpy=True)
+                except fluid.core.EOFException:
+                    break
+                ins_num = image_val.shape[0]
+                broadcasted_label = np.ones(
+                    (ins_num, 3, 4)) * label_val.reshape((ins_num, 1, 1))
+                self.assertEqual(image_val.all(), broadcasted_label.all())
+                for l in label_val:
+                    self.assertFalse(data_appeared[l[0]])
+                    data_appeared[l[0]] = True
+            for i in data_appeared:
+                self.assertTrue(i)
+
+    def main_lod(self):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        with fluid.program_guard(main_prog, startup_prog):
+            data_reader = fluid.layers.io.open_files(
+                filenames=[self.lod_data_file_name],
+                shapes=[[-1, 3], [-1, 1]],
+                lod_levels=[1, 0],
+                dtypes=['float32', 'int32'],
+                thread_num=1)
+            ins, label = fluid.layers.read_file(data_reader)
+
+            place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup_prog)
+
+            parallel_exe = fluid.ParallelExecutor(
+                use_cuda=self.use_cuda, main_program=main_prog)
+
+            if (parallel_exe.device_count > self.batch_size):
+                print("WARNING: Unittest TestDataBalance skipped. \
+                    For the result is not correct when device count \
+                    is larger than batch size.")
+                exit(0)
+            fetch_list = [ins.name, label.name]
+
+            data_appeared = [False] * self.total_ins_num
+            while (True):
+                try:
+                    ins_tensor, label_tensor = parallel_exe.run(
+                        fetch_list, return_numpy=False)
+                except fluid.core.EOFException:
+                    break
+
+                ins_val = np.array(ins_tensor)
+                label_val = np.array(label_tensor)
+                ins_lod = ins_tensor.lod()[0]
+                self.assertEqual(ins_val.shape[1], 3)
+                self.assertEqual(label_val.shape[1], 1)
+                self.assertEqual(len(ins_lod) - 1, label_val.shape[0])
+                for i in range(0, len(ins_lod) - 1):
+                    ins_elem = ins_val[ins_lod[i]:ins_lod[i + 1]][:]
+                    label_elem = label_val[i][0]
+                    self.assertEqual(ins_elem.all(), label_elem.all())
+                    self.assertFalse(data_appeared[int(label_elem - 1)])
+                    data_appeared[int(label_elem - 1)] = True
+
+            for i in data_appeared:
+                self.assertTrue(i)
+
+    def test_all(self):
+        self.main()
+        self.main_lod()
diff --git a/python/paddle/fluid/tests/unittests/test_debugger.py b/python/paddle/fluid/tests/unittests/test_debugger.py
index 2b7bbf9218f9b8fd8f5b29ac3cbc2f9680f471eb..870952f2f916dcdec5991ac5c10d2da3a7ab18a8 100644
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ b/python/paddle/fluid/tests/unittests/test_debugger.py
@@ -15,7 +15,7 @@
 import unittest
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid import debuger
+from paddle.fluid import debugger
 from paddle.fluid.framework import Program
 
 
@@ -51,7 +51,9 @@ class TestDebugger(unittest.TestCase):
             outputs={"Out": mul_out},
             attrs={"x_num_col_dims": 1})
 
-        print(debuger.pprint_program_codes(p.desc))
+        print(debugger.pprint_program_codes(p))
+
+        debugger.draw_block_graphviz(p.block(0), path="./test.dot")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
index a905a854ad157ffa3d7816dfbd445f3e344a1249..05d3367ad8ec2bc3df794015a7c25e943a26c68c 100644
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -74,13 +74,13 @@ class TestDetectionMAPOp(OpTest):
         self.evaluate_difficult = True
         self.ap_type = "integral"
 
-        self.label_lod = [[0, 2, 4]]
+        self.label_lod = [[2, 2]]
         # label difficult xmin ymin xmax ymax
         self.label = [[1, 0, 0.1, 0.1, 0.3, 0.3], [1, 1, 0.6, 0.6, 0.8, 0.8],
                       [2, 0, 0.3, 0.3, 0.6, 0.5], [1, 0, 0.7, 0.1, 0.9, 0.3]]
 
         # label score xmin ymin xmax ymax difficult
-        self.detect_lod = [[0, 3, 7]]
+        self.detect_lod = [[3, 4]]
         self.detect = [
             [1, 0.3, 0.1, 0.0, 0.4, 0.3], [1, 0.7, 0.0, 0.1, 0.2, 0.3],
             [1, 0.9, 0.7, 0.6, 0.8, 0.8], [2, 0.8, 0.2, 0.1, 0.4, 0.4],
@@ -89,7 +89,7 @@ class TestDetectionMAPOp(OpTest):
         ]
 
         # label score true_pos false_pos
-        self.tf_pos_lod = [[0, 3, 7]]
+        self.tf_pos_lod = [[3, 4]]
         self.tf_pos = [[1, 0.9, 1, 0], [1, 0.7, 1, 0], [1, 0.3, 0, 1],
                        [1, 0.2, 1, 0], [2, 0.8, 0, 1], [2, 0.1, 1, 0],
                        [3, 0.2, 0, 1]]
@@ -112,15 +112,19 @@ class TestDetectionMAPOp(OpTest):
             for i, count in enumerate(class_pos_count):
                 class_pos_count_dict[i] = count
 
-            for i in range(len(true_pos_lod[0]) - 1):
-                start = true_pos_lod[0][i]
-                end = true_pos_lod[0][i + 1]
+            cur_pos = 0
+            for i in range(len(true_pos_lod[0])):
+                start = cur_pos
+                cur_pos += true_pos_lod[0][i]
+                end = cur_pos
                 for j in range(start, end):
                     true_pos_dict[i].append(true_pos[j])
 
-            for i in range(len(false_pos_lod[0]) - 1):
-                start = false_pos_lod[0][i]
-                end = false_pos_lod[0][i + 1]
+            cur_pos = 0
+            for i in range(len(false_pos_lod[0])):
+                start = cur_pos
+                cur_pos += false_pos_lod[0][i]
+                end = cur_pos
                 for j in range(start, end):
                     false_pos_dict[i].append(false_pos[j])
 
@@ -130,19 +134,19 @@ class TestDetectionMAPOp(OpTest):
             label_number = self.class_num
 
             out_class_pos_count = []
-            out_true_pos_lod = [0]
+            out_true_pos_lod = []
             out_true_pos = []
-            out_false_pos_lod = [0]
+            out_false_pos_lod = []
             out_false_pos = []
 
             for i in range(label_number):
                 out_class_pos_count.append([label_count[i]])
                 true_pos_list = true_pos[i]
                 out_true_pos += true_pos_list
-                out_true_pos_lod.append(len(out_true_pos))
+                out_true_pos_lod.append(len(true_pos_list))
                 false_pos_list = false_pos[i]
                 out_false_pos += false_pos_list
-                out_false_pos_lod.append(len(out_false_pos))
+                out_false_pos_lod.append(len(false_pos_list))
 
             return out_class_pos_count, out_true_pos, [
                 out_true_pos_lod
@@ -160,7 +164,9 @@ class TestDetectionMAPOp(OpTest):
         label_count, true_pos, false_pos = get_input_pos(
             self.class_pos_count, self.true_pos, self.true_pos_lod,
             self.false_pos, self.false_pos_lod)
-        for (label, difficult, xmin, ymin, xmax, ymax) in self.label:
+        for v in self.label:
+            label = v[0]
+            difficult = False if len(v) == 5 else v[1]
             if self.evaluate_difficult:
                 label_count[label] += 1
             elif not difficult:
@@ -239,12 +245,21 @@ class TestDetectionMAPOpSkipDiff(TestDetectionMAPOp):
 
         self.evaluate_difficult = False
 
-        self.tf_pos_lod = [[0, 2, 6]]
+        self.tf_pos_lod = [[2, 4]]
         # label score true_pos false_pos
         self.tf_pos = [[1, 0.7, 1, 0], [1, 0.3, 0, 1], [1, 0.2, 1, 0],
                        [2, 0.8, 0, 1], [2, 0.1, 1, 0], [3, 0.2, 0, 1]]
 
 
+class TestDetectionMAPOpWithoutDiff(TestDetectionMAPOp):
+    def init_test_case(self):
+        super(TestDetectionMAPOpWithoutDiff, self).init_test_case()
+
+        # label xmin ymin xmax ymax
+        self.label = [[1, 0.1, 0.1, 0.3, 0.3], [1, 0.6, 0.6, 0.8, 0.8],
+                      [2, 0.3, 0.3, 0.6, 0.5], [1, 0.7, 0.1, 0.9, 0.3]]
+
+
 class TestDetectionMAPOp11Point(TestDetectionMAPOp):
     def init_test_case(self):
         super(TestDetectionMAPOp11Point, self).init_test_case()
@@ -256,9 +271,9 @@ class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
     def init_test_case(self):
         super(TestDetectionMAPOpMultiBatch, self).init_test_case()
         self.class_pos_count = [0, 2, 1]
-        self.true_pos_lod = [[0, 0, 3, 5]]
+        self.true_pos_lod = [[0, 3, 2]]
         self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
-        self.false_pos_lod = [[0, 0, 3, 5]]
+        self.false_pos_lod = [[0, 3, 2]]
         self.false_pos = [[0.7, 0.], [0.3, 1.], [0.2, 0.], [0.8, 1.], [0.1, 0.]]
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_detection_output_op.py b/python/paddle/fluid/tests/unittests/test_detection_output_op.py
deleted file mode 100644
index 94681319144ee3e0d51b57944f5692183578c01b..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_detection_output_op.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestUnpoolOp(OpTest):
-    def setUp(self):
-        self.op_type = "detection_output"
-        self.init_test_case()
-
-        #loc.shape ((1, 4, 4, 1, 1))
-        #conf.shape ((1, 4, 2, 1, 1))
-
-        loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
-                         [[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
-                         [[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
-                         [[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]])
-        conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]],
-                          [[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]])
-        priorbox = np.array([
-            0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1,
-            0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4,
-            0.8, 0.8, 0.1, 0.1, 0.2, 0.2
-        ])
-
-        output = np.array([
-            0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031
-        ])
-        self.inputs = {
-            'Loc': loc.astype('float32'),
-            'Conf': conf.astype('float32'),
-            'PriorBox': priorbox.astype('float32')
-        }
-        self.attrs = {
-            'num_classes': self.num_classes,
-            'top_k': self.top_k,
-            'nms_top_k': self.nms_top_k,
-            'background_label_id': self.background_label_id,
-            'nms_threshold': self.nms_threshold,
-            'confidence_threshold': self.confidence_threshold,
-        }
-        self.outputs = {'Out': output.astype('float32')}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def init_test_case(self):
-        self.num_classes = 2
-        self.top_k = 10
-        self.nms_top_k = 20
-        self.background_label_id = 0
-        self.nms_threshold = 0.01
-        self.confidence_threshold = 0.01
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad2d57f7c5f127be87e963508e1dd150fdd30225
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -0,0 +1,210 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+
+SEED = 1
+DTYPE = "float32"
+paddle.dataset.mnist.fetch()
+
+
+# random seed must set before configuring the network.
+# fluid.default_startup_program().random_seed = SEED
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+
+    # TODO(dzhwinter) : refine the initializer and random seed settting
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    predict = fluid.layers.fc(
+        input=conv_pool_2,
+        size=SIZE,
+        act="softmax",
+        param_attr=fluid.param_attr.ParamAttr(
+            initializer=fluid.initializer.NormalInitializer(
+                loc=0.0, scale=scale)))
+    return predict
+
+
+def get_model(batch_size):
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    predict = cnn_model(images)
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+    batch_acc = fluid.layers.accuracy(
+        input=predict, label=label, total=batch_size_tensor)
+
+    inference_program = fluid.default_main_program().clone()
+    # Optimization
+    opt = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.001, beta1=0.9, beta2=0.999)
+
+    # Reader
+    train_reader = paddle.batch(
+        paddle.dataset.mnist.train(), batch_size=batch_size)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=batch_size)
+    opt.minimize(avg_cost)
+    return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+def run_pserver(pserver_endpoints, trainers, current_endpoint):
+    get_model(batch_size=20)
+    t = get_transpiler(0,
+                       fluid.default_main_program(), pserver_endpoints,
+                       trainers)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(startup_prog)
+
+    exe.run(pserver_prog)
+
+
+class TestDistMnist(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 1
+        self._pservers = 1
+        self._ps_endpoints = "127.0.0.1:9123"
+
+    def start_pserver(self, endpoint):
+        p = Process(
+            target=run_pserver,
+            args=(self._ps_endpoints, self._trainers, endpoint))
+        p.start()
+        return p.pid
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 5
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(1)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def stop_pserver(self, pid):
+        os.kill(pid, signal.SIGTERM)
+
+    def test_with_place(self):
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+
+        pserver_pid = self.start_pserver(self._ps_endpoints)
+        self._wait_ps_ready(pserver_pid)
+
+        self.run_trainer(p, 0)
+
+        self.stop_pserver(pserver_pid)
+
+    def run_trainer(self, place, trainer_id):
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
+            batch_size=20)
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), self._ps_endpoints,
+                           self._trainers)
+
+        trainer_prog = t.get_trainer_program()
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        for pass_id in xrange(10):
+            for batch_id, data in enumerate(train_reader()):
+                exe.run(trainer_prog, feed=feeder.feed(data))
+
+                if (batch_id + 1) % 10 == 0:
+                    acc_set = []
+                    avg_loss_set = []
+                    for test_data in test_reader():
+                        acc_np, avg_loss_np = exe.run(
+                            program=test_program,
+                            feed=feeder.feed(test_data),
+                            fetch_list=[batch_acc, avg_cost])
+                        acc_set.append(float(acc_np))
+                        avg_loss_set.append(float(avg_loss_np))
+                    # get test acc and loss
+                    acc_val = np.array(acc_set).mean()
+                    avg_loss_val = np.array(avg_loss_set).mean()
+                    if float(acc_val
+                             ) > 0.8:  # Smaller value to increase CI speed
+                        return
+                    else:
+                        print(
+                            'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
+                            format(pass_id, batch_id + 1,
+                                   float(avg_loss_val), float(acc_val)))
+                        if math.isnan(float(avg_loss_val)):
+                            assert ("got Nan loss, training failed.")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_train.py b/python/paddle/fluid/tests/unittests/test_dist_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..562e66b0625083fe840d64967249f0215cfda1f9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
@@ -0,0 +1,123 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import unittest
+from multiprocessing import Process
+import signal
+
+import numpy
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+
+class TestSendOp(unittest.TestCase):
+    def test_send(self):
+        # Run init_serv in a thread
+        place = fluid.CPUPlace()
+        # NOTE: python thread will not work here due to GIL.
+        p = Process(target=self.init_serv, args=(place, ))
+        p.daemon = True
+        p.start()
+
+        self.ps_timeout = 5
+        self._wait_ps_ready(p.pid)
+
+        with open("/tmp/paddle.%d.port" % p.pid, "r") as fn:
+            selected_port = int(fn.readlines()[0])
+        self.init_client(place, selected_port)
+
+        self.run_local(place)
+        self.assertTrue(numpy.allclose(self.local_out, self.dist_out))
+
+        # FIXME(typhoonzero): find a way to gracefully shutdown the server.
+        os.kill(p.pid, signal.SIGKILL)
+        p.join()
+
+    def _wait_ps_ready(self, pid):
+        start_left_time = self.ps_timeout
+        sleep_time = 0.5
+        while True:
+            assert start_left_time >= 0, "wait ps ready failed"
+            time.sleep(sleep_time)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                start_left_time -= sleep_time
+
+    def init_serv(self, place):
+        main = fluid.Program()
+
+        with fluid.program_guard(main):
+            serv = layers.ListenAndServ(
+                "127.0.0.1:0", ["X"], optimizer_mode=False)
+            with serv.do():
+                out_var = main.global_block().create_var(
+                    name="scale_0.tmp_0",
+                    psersistable=True,
+                    dtype="float32",
+                    shape=[32, 32])
+                x = layers.data(
+                    shape=[32, 32],
+                    dtype='float32',
+                    name="X",
+                    append_batch_size=False)
+                fluid.initializer.Constant(value=1.0)(x, main.global_block())
+                layers.scale(x=x, scale=10.0, out=out_var)
+
+        self.server_exe = fluid.Executor(place)
+        self.server_exe.run(main)
+
+    def init_client(self, place, port):
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = layers.data(
+                shape=[32, 32],
+                dtype='float32',
+                name='X',
+                append_batch_size=False)
+            fluid.initializer.Constant(value=2.3)(x, main.global_block())
+            get_var = main.global_block().create_var(
+                name="scale_0.tmp_0",  # server side var
+                dtype="float32",
+                persistable=False,
+                shape=[32, 32])
+            fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
+            layers.Send("127.0.0.1:%d" % port, [x])
+            o = layers.Recv("127.0.0.1:%d" % port, [get_var])
+
+        exe = fluid.Executor(place)
+        self.dist_out = exe.run(main, fetch_list=o)  # o is a list
+
+    def run_local(self, place):
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = layers.data(
+                shape=[32, 32],
+                dtype='float32',
+                name='X',
+                append_batch_size=False)
+            fluid.initializer.Constant(value=2.3)(x, main.global_block())
+            o = layers.scale(x=x, scale=10.0)
+        exe = fluid.Executor(place)
+        self.local_out = exe.run(main, fetch_list=[o])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..75b4b4e50da04521021dcb1e97cfe495f2619433
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -0,0 +1,263 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+from paddle.fluid.transpiler.distribute_transpiler import delete_ops
+import traceback
+
+
+class TranspilerTest(unittest.TestCase):
+    def setUp(self):
+        self.trainer_id = 0
+        self.trainers = 2
+        self.pservers = 2
+        # NOTE: we do not actually bind this port
+        self.pserver_eps = "127.0.0.1:6174,127.0.0.1:6175"
+        self.pserver1_ep = "127.0.0.1:6174"
+        self.pserver2_ep = "127.0.0.1:6175"
+        self.slice_var_up = True
+        self.sync_mode = True
+        self.transpiler = None
+
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+        sgd_optimizer.minimize(avg_cost)
+        return
+
+    def get_main_program(self):
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            self.net_conf()
+        self.origin_prog = main.clone()
+        return main
+
+    def get_trainer(self):
+        t = self._transpiler_instance()
+        return t.get_trainer_program()
+
+    def get_pserver(self, ep):
+        t = self._transpiler_instance()
+        pserver = t.get_pserver_program(ep)
+        startup = t.get_startup_program(ep, pserver)
+        return pserver, startup
+
+    def _transpiler_instance(self):
+        if not self.transpiler:
+            main = self.get_main_program()
+            self.transpiler = fluid.DistributeTranspiler()
+            self.transpiler.transpile(
+                self.trainer_id,
+                program=main,
+                pservers=self.pserver_eps,
+                trainers=self.trainers,
+                slice_var_up=self.slice_var_up,
+                sync_mode=self.sync_mode)
+        return self.transpiler
+
+
+class TestBasicModel(TranspilerTest):
+    def test_transpiler(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        pserver2, startup2 = self.get_pserver(self.pserver2_ep)
+
+        trainer = self.get_trainer()
+
+        self.assertEqual([op.type for op in trainer.global_block().ops], [
+            'mul', 'elementwise_add', 'elementwise_sub', 'square', 'mean',
+            'fill_constant', 'mean_grad', 'square_grad', 'elementwise_sub_grad',
+            'elementwise_add_grad', 'send', 'mul_grad', 'split_byref', 'send',
+            'send_barrier', 'recv', 'recv', 'fetch_barrier', 'concat'
+        ])
+
+        self.assertEqual(len(pserver.blocks), 3)
+        # block0: listen_and_serv
+        self.assertEqual([op.type for op in pserver.blocks[0].ops],
+                         ["listen_and_serv"])
+        # block1~2: optimize pass
+        self.assertEqual([op.type for op in pserver.blocks[1].ops],
+                         ["sum", "scale", "sgd"])
+        # confirm startup program
+        self.assertEqual([op.type for op in startup.global_block().ops],
+                         ["fill_constant", "fill_constant", "uniform_random"])
+        # the variable #fc_w will be split into two blocks
+        fc_w_var = startup.global_block().var("fc_w.block1")
+        self.assertEqual(fc_w_var.shape, (500, 1000))
+        # all parameters should be optimized on pserver
+
+        pserver_params = []
+        for prog in [pserver, pserver2]:
+            for blk in prog.blocks:
+                for op in blk.ops:
+                    if "Param" in op.input_names:
+                        param_name = op.input("Param")[0]
+                        is_block_idx = param_name.find(".block")
+                        if is_block_idx != -1:
+                            origin_param_name = param_name[:is_block_idx]
+                        else:
+                            origin_param_name = param_name
+                        pserver_params.append(origin_param_name)
+        trainer_params = []
+        for op in self.origin_prog.global_block().ops:
+            if "Param" in op.input_names:
+                trainer_params.append(op.input("Param")[0])
+        self.assertEqual(set(pserver_params), set(trainer_params))
+
+
+class TestNoSliceVar(TranspilerTest):
+    def setUp(self):
+        super(TestNoSliceVar, self).setUp()
+        self.slice_var_up = False
+
+    def test_transpiler(self):
+        _, startup = self.get_pserver(self.pserver1_ep)
+        _, startup2 = self.get_pserver(self.pserver2_ep)
+
+        if startup.global_block().vars.has_key("fc_w"):
+            fc_w_var = startup.global_block().vars["fc_w"]
+        elif startup2.global_block().vars.has_key("fc_w"):
+            fc_w_var = startup2.global_block().vars["fc_w"]
+
+        self.assertEqual(fc_w_var.shape, (1000, 1000))
+
+
+class TestLRDecay(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        sgd_optimizer = fluid.optimizer.SGD(
+            learning_rate=fluid.layers.exponential_decay(
+                learning_rate=1.0,
+                decay_steps=2100,
+                decay_rate=0.1,
+                staircase=True))
+        sgd_optimizer.minimize(avg_cost)
+        return
+
+    def test_transpiler(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer = self.get_trainer()
+
+        self.assertEqual(len(pserver.blocks), 4)
+        lr_decay_ops = [op.type for op in pserver.blocks[1].ops]
+        self.assertEqual(lr_decay_ops, [
+            "increment", "cast", "fill_constant", "elementwise_div", "floor",
+            "fill_constant", "elementwise_pow", "fill_constant",
+            "elementwise_mul"
+        ])
+
+
+class TestLRDecayConditional(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(input=x,
+                                    size=1000,
+                                    act=None,
+                                    param_attr=fluid.ParamAttr(name='fc_w'),
+                                    bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        sgd_optimizer = fluid.optimizer.SGD(
+            learning_rate=fluid.layers.piecewise_decay([10000, 20000],
+                                                       [1.0, 0.5, 1.0]))
+        sgd_optimizer.minimize(avg_cost)
+        return
+
+    def test_transpiler(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer = self.get_trainer()
+
+        serv_op = pserver.blocks[0].ops[0]
+        sub_blocks = []
+        optimize_blocks = []
+        for b in serv_op.attrs["optimize_blocks"]:
+            optimize_blocks.append(b.idx)
+        for b in pserver.blocks:
+            if b.idx not in optimize_blocks:
+                sub_blocks.append(b.idx)
+
+        self.assertEqual(len(pserver.blocks), 7)
+        lr_decay_ops = [op.type for op in pserver.blocks[1].ops]
+        self.assertEqual(lr_decay_ops, [
+            "increment", "cast", "fill_constant", "fill_constant", "less_than",
+            "logical_not", "conditional_block", "fill_constant",
+            "fill_constant", "less_than", "logical_not", "logical_and",
+            "logical_and", "conditional_block", "fill_constant",
+            "conditional_block"
+        ])
+        # test the condition blocks
+        for b in sub_blocks:
+            if b == 0:
+                continue
+            block = pserver.blocks[b]
+            self.assertEqual([op.type for op in block.ops], ["assign"])
+
+
+class TestL2Decay(TranspilerTest):
+    def net_conf(self):
+        x = fluid.layers.data(name='x', shape=[1000], dtype='float32')
+        y_predict = fluid.layers.fc(
+            input=x,
+            size=1000,
+            act=None,
+            param_attr=fluid.ParamAttr(
+                name='fc_w',
+                regularizer=fluid.regularizer.L2Decay(),
+                gradient_clip=fluid.clip.GradientClipByValue(0.1)),
+            bias_attr=fluid.ParamAttr(name='fc_b'))
+        y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+        cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+        avg_cost = fluid.layers.mean(cost)
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+        sgd_optimizer.minimize(avg_cost)
+        return
+
+    def test_transpiler(self):
+        pserver, startup = self.get_pserver(self.pserver1_ep)
+        trainer = self.get_trainer()
+
+        self.assertEqual(len(pserver.blocks), 3)
+        self.assertEqual([op.type for op in pserver.blocks[1].ops],
+                         ["sum", "scale", "clip", "sgd"])
+        self.assertEqual(
+            [op.type for op in pserver.blocks[2].ops],
+            ["sum", "scale", "clip", "scale", "elementwise_add", "sgd"])
+        # TODO(typhoonzero): test clipping and L2Decay ops are removed from trainer
+
+
+    # FIXME(typhoonzero): need to add test for async case:
+    # see https://github.com/PaddlePaddle/Paddle/issues/11691
+class TestAsyncSGD(TranspilerTest):
+    pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
new file mode 100644
index 0000000000000000000000000000000000000000..712fd5849d80b1915ae3b2ae5108bedee8d88a2c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
@@ -0,0 +1,203 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import argparse
+import time
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+
+IS_SPARSE = True
+EMBED_SIZE = 32
+HIDDEN_SIZE = 256
+N = 5
+BATCH_SIZE = 32
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+
+
+def get_model():
+    def __network__(words):
+        embed_first = fluid.layers.embedding(
+            input=words[0],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+        embed_second = fluid.layers.embedding(
+            input=words[1],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+        embed_third = fluid.layers.embedding(
+            input=words[2],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+        embed_forth = fluid.layers.embedding(
+            input=words[3],
+            size=[dict_size, EMBED_SIZE],
+            dtype='float32',
+            is_sparse=IS_SPARSE,
+            param_attr='shared_w')
+
+        concat_embed = fluid.layers.concat(
+            input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+        hidden1 = fluid.layers.fc(input=concat_embed,
+                                  size=HIDDEN_SIZE,
+                                  act='sigmoid')
+        predict_word = fluid.layers.fc(input=hidden1,
+                                       size=dict_size,
+                                       act='softmax')
+        cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
+        avg_cost = fluid.layers.mean(cost)
+        return avg_cost, predict_word
+
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+
+    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+    forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
+    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+    avg_cost, predict_word = __network__(
+        [first_word, second_word, third_word, forth_word, next_word])
+
+    inference_program = paddle.fluid.default_main_program().clone()
+
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    train_reader = paddle.batch(
+        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
+
+    return inference_program, avg_cost, train_reader, test_reader, predict_word
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+def run_pserver(pserver_endpoints, trainers, current_endpoint):
+    get_model()
+    t = get_transpiler(0,
+                       fluid.default_main_program(), pserver_endpoints,
+                       trainers)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(startup_prog)
+
+    exe.run(pserver_prog)
+
+
+class TestDistMnist(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 1
+        self._pservers = 1
+        self._ps_endpoints = "127.0.0.1:9123"
+
+    def start_pserver(self, endpoint):
+        p = Process(
+            target=run_pserver,
+            args=(self._ps_endpoints, self._trainers, endpoint))
+        p.start()
+        return p.pid
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 5
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(1)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def stop_pserver(self, pid):
+        os.kill(pid, signal.SIGKILL)
+
+    def test_with_place(self):
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+
+        pserver_pid = self.start_pserver(self._ps_endpoints)
+        self._wait_ps_ready(pserver_pid)
+
+        self.run_trainer(p, 0)
+
+        self.stop_pserver(pserver_pid)
+
+    def run_trainer(self, place, trainer_id):
+        test_program, avg_cost, train_reader, test_reader, predict = get_model()
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), self._ps_endpoints,
+                           self._trainers)
+
+        trainer_prog = t.get_trainer_program()
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        use_gpu = True if core.is_compiled_with_cuda() else False
+
+        exec_strategy = ExecutionStrategy()
+        exec_strategy.use_cuda = use_gpu
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_gpu,
+            main_program=trainer_prog,
+            loss_name=avg_cost.name,
+            exec_strategy=exec_strategy)
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        for pass_id in xrange(10):
+            for batch_id, data in enumerate(train_reader()):
+                avg_loss_np = train_exe.run(feed=feeder.feed(data),
+                                            fetch_list=[avg_cost.name])
+                loss = np.array(avg_loss_np).mean()
+                if float(loss) < 5.0:
+                    return
+                if math.isnan(loss):
+                    assert ("Got Nan loss, training failed")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py
index 60930a612c128cbf18e89711b9246d148e41ec58..eaa3435a86462236a99489749abe877648677053 100644
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
@@ -82,5 +83,37 @@ class TestDropoutOp5(OpTest):
         self.check_output()
 
 
+class TestFP16DropoutOp(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.init_test_case()
+
+        x = np.random.random(self.input_size).astype("float16")
+        out = x * (1.0 - self.prob)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {
+            'dropout_prob': self.prob,
+            'fix_seed': self.fix_seed,
+            'is_test': True
+        }
+        self.outputs = {'Out': out}
+
+    def init_test_case(self):
+        self.input_size = [32, 64]
+        self.prob = 0.35
+        self.fix_seed = True
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):
+            self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
+
+
+class TestFP16DropoutOp2(TestFP16DropoutOp):
+    def init_test_case(self):
+        self.input_size = [32, 64, 3]
+        self.prob = 0.75
+        self.fix_seed = False
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
index df7ab0d29bdfc9410cd7dd4a8f2a7cd440ef4aba..0faed94deb4808783027d776e0f4c61da0db457a 100644
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import unittest
 import numpy
 
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
index 22329390754d8d010dced0d1aca35617140cd097..0f289af284773caf8515f9cbdd38e0d4481e4e44 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
@@ -30,9 +30,6 @@ class Memory(object):
         assert val.dtype == self.ex.dtype
         self.cur = val
 
-    def ex(self):
-        return self.ex
-
     def next(self):
         self.ex = self.cur
         self.cur = None
@@ -139,16 +136,16 @@ class BaseRNN(object):
         feed_dict = dict()
 
         for iname in self.inputs:
-            lod = [0]
+            lod = []
             np_flatten = []
             for seq_id in xrange(len(self.inputs[iname])):
                 seq_len = len(self.inputs[iname][seq_id])
-                lod.append(lod[-1] + seq_len)
+                lod.append(seq_len)
                 np_flatten.extend(self.inputs[iname][seq_id])
 
             t = fluid.Tensor()
             t.set(numpy.array(np_flatten), place)
-            t.set_lod([lod])
+            t.set_recursive_sequence_lengths([lod])
             feed_dict[iname] = t
 
         for pname in self.params:
diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
index b03a70f1b9e61162d37541ffeba8510fc11c605a..92e718662dfd7998be3ede2994f160059679fa8a 100644
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import unittest
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid.backward import append_backward
@@ -39,20 +39,20 @@ class TestDyRnnStaticInput(unittest.TestCase):
 
     def prepare_x_tensor(self):
         self.x_tensor_dim = 10
-        lod = [[0, 2, 3, 6]]
-        shape = [lod[0][-1], self.x_tensor_dim]
+        lod = [[2, 1, 3]]
+        shape = [sum(lod[0]), self.x_tensor_dim]
         self.x_tensor_data = np.random.random(shape).astype('float32')
         self.x_tensor = core.LoDTensor()
-        self.x_tensor.set_lod(lod)
+        self.x_tensor.set_recursive_sequence_lengths(lod)
         self.x_tensor.set(self.x_tensor_data, self.place)
 
     def prepare_static_input_tensor(self):
         self.static_input_tensor_dim = 4
-        lod = [[0, 1, 3, 6]]
-        shape = [lod[0][-1], self.static_input_tensor_dim]
+        lod = [[1, 2, 3]]
+        shape = [sum(lod[0]), self.static_input_tensor_dim]
         self.static_input_data = np.random.random(shape).astype('float32')
         self.static_input_tensor = core.LoDTensor()
-        self.static_input_tensor.set_lod(lod)
+        self.static_input_tensor.set_recursive_sequence_lengths(lod)
         self.static_input_tensor.set(self.static_input_data, self.place)
 
     def fetch_value(self, var):
@@ -69,7 +69,7 @@ class TestDyRnnStaticInput(unittest.TestCase):
         ndarray = np.zeros(shape=dims).astype('float32')
         for i in xrange(np.product(dims)):
             ndarray.ravel()[i] = lod_tensor.get_float_element(i)
-        return ndarray, lod_tensor.lod()
+        return ndarray, lod_tensor.recursive_sequence_lengths()
 
     def build_graph(self, only_forward=False):
         x_tensor = fluid.layers.data(
@@ -131,21 +131,20 @@ class TestDyRnnStaticInput(unittest.TestCase):
             framework.grad_var_name('static_input_tensor'))
         return static_input_grad, loss
 
-    def get_seq_len_from_lod(self, lod):
-        return [lod[0][i + 1] - lod[0][i] for i in xrange(len(lod[0]) - 1)]
-
     def get_expected_static_step_outs(self):
-        x_lod = self.x_tensor.lod()
-        x_seq_len = self.get_seq_len_from_lod(x_lod)
+        x_lod = self.x_tensor.recursive_sequence_lengths()
+        x_seq_len = x_lod[0]
         x_seq_len_sorted = sorted(x_seq_len)
         x_sorted_indices = np.argsort(x_seq_len)[::-1]
 
-        static_lod = self.static_input_tensor.lod()
-        static_sliced = [
-            self.static_input_data[static_lod[0][i]:static_lod[0][i + 1]]
-            for i in xrange(len(static_lod[0]) - 1)
-        ]
-        static_seq_len = self.get_seq_len_from_lod(static_lod)
+        static_lod = self.static_input_tensor.recursive_sequence_lengths()
+        static_sliced = []
+        cur_offset = 0
+        for i in xrange(len(static_lod[0])):
+            static_sliced.append(self.static_input_data[cur_offset:(
+                cur_offset + static_lod[0][i])])
+            cur_offset += static_lod[0][i]
+        static_seq_len = static_lod[0]
         static_reordered = []
         for i in xrange(len(x_sorted_indices)):
             static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist())
@@ -159,11 +158,13 @@ class TestDyRnnStaticInput(unittest.TestCase):
 
         for i in xrange(self._max_sequence_len):
             end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1)
-            lod = [0]
+            lod = []
+            total_len = 0
             for i in xrange(end):
-                lod.append(static_seq_len_reordered[i] + lod[-1])
+                lod.append(static_seq_len_reordered[i])
+                total_len += lod[-1]
             static_step_lods.append([lod])
-            end = lod[-1]
+            end = total_len
             static_step_outs.append(
                 np.array(static_reordered[:end]).astype('float32'))
 
@@ -199,7 +200,9 @@ class TestDyRnnStaticInput(unittest.TestCase):
             self.static_input_tensor.set_float_element(i, origin)
             numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2
         self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001))
-        self.assertTrue(np.allclose(actual_lod, self.static_input_tensor.lod()))
+        self.assertTrue(
+            np.allclose(actual_lod,
+                        self.static_input_tensor.recursive_sequence_lengths()))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
index 2957fb50586c8bce74bbf8066e0e9bf24d79cb7d..816562621b4fc749f3c6b0eca8ee3c5850ef1ba9 100644
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
@@ -52,23 +52,29 @@ class TestEditDistanceOp(OpTest):
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = False
-        x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int64")
-        x2 = np.array([[0, 12, 4, 7, 8]]).astype("int64")
+        x1 = np.array([[12, 3, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[12, 4, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
-        x1_lod = [0, 1, 5]
-        x2_lod = [0, 3, 4]
+        x1_lod = [1, 4]
+        x2_lod = [3, 1]
 
-        num_strs = len(x1_lod) - 1
+        num_strs = len(x1_lod)
         distance = np.zeros((num_strs, 1)).astype("float32")
         sequence_num = np.array(2).astype("int64")
+
+        x1_offset = 0
+        x2_offset = 0
         for i in range(0, num_strs):
             distance[i] = Levenshtein(
-                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
-                ref=x2[x2_lod[i]:x2_lod[i + 1]])
+                hyp=x1[x1_offset:(x1_offset + x1_lod[i])],
+                ref=x2[x2_offset:(x2_offset + x2_lod[i])])
+            x1_offset += x1_lod[i]
+            x2_offset += x2_lod[i]
             if normalized is True:
-                len_ref = x2_lod[i + 1] - x2_lod[i]
+                len_ref = x2_lod[i]
                 distance[i] = distance[i] / len_ref
+
         self.attrs = {'normalized': normalized}
         self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
         self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
@@ -81,23 +87,29 @@ class TestEditDistanceOpNormalized(OpTest):
     def setUp(self):
         self.op_type = "edit_distance"
         normalized = True
-        x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int64")
-        x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int64")
+        x1 = np.array([[10, 3, 6, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[10, 4, 6, 7, 8]]).astype("int64")
         x1 = np.transpose(x1)
         x2 = np.transpose(x2)
-        x1_lod = [0, 1, 3, 6]
-        x2_lod = [0, 2, 3, 5]
+        x1_lod = [1, 2, 3]
+        x2_lod = [2, 1, 2]
 
-        num_strs = len(x1_lod) - 1
+        num_strs = len(x1_lod)
         distance = np.zeros((num_strs, 1)).astype("float32")
         sequence_num = np.array(3).astype("int64")
+
+        x1_offset = 0
+        x2_offset = 0
         for i in range(0, num_strs):
             distance[i] = Levenshtein(
-                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
-                ref=x2[x2_lod[i]:x2_lod[i + 1]])
+                hyp=x1[x1_offset:(x1_offset + x1_lod[i])],
+                ref=x2[x2_offset:(x2_offset + x2_lod[i])])
+            x1_offset += x1_lod[i]
+            x2_offset += x2_lod[i]
             if normalized is True:
-                len_ref = x2_lod[i + 1] - x2_lod[i]
+                len_ref = x2_lod[i]
                 distance[i] = distance[i] / len_ref
+
         self.attrs = {'normalized': normalized}
         self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
         self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcdbfc8e527d0dc9a95eddaf040f8035207b6c20
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
@@ -0,0 +1,130 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+from test_elementwise_add_op import *
+'''
+Some tests differ from the tests defined in test_elementwise_add_op.py
+because MKLDNN does not support tensors of number of dimensions 3.
+Such dimensions cause exceptions in MKLDNN reorder primitive.
+'''
+
+
+class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_scalar(TestElementwiseAddOp_scalar):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_scalar2(TestElementwiseAddOp_scalar2):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_Vector(TestElementwiseAddOp_Vector):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TesMKLDNNtElementwiseAddOp_broadcast_0(TestElementwiseAddOp_broadcast_0):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_broadcast_1(TestElementwiseAddOp_broadcast_1):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 1, 1)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_broadcast_2(TestElementwiseAddOp_broadcast_2):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 1, 4)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_broadcast_3(TestElementwiseAddOp_broadcast_3):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_broadcast_4(TestElementwiseAddOp_broadcast_4):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_rowwise_add_0(
+        TestElementwiseAddOp_rowwise_add_0):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_rowwise_add_1(
+        TestElementwiseAddOp_rowwise_add_1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNElementwiseAddOp_channelwise_add(
+        TestElementwiseAddOp_channelwise_add):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 5, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
index 5b2384e94d788342c692fcb8e33f3a2ff663ab53..fb9a496126f0b6efcad73590c78efe5a47b88cd6 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
@@ -13,158 +13,267 @@
 # limitations under the License.
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
-class TestElementwiseOp(OpTest):
+class TestElementwiseAddOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
     def setUp(self):
         self.op_type = "elementwise_add"
+        self.dtype = np.float32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
         }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': self.out}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad_normal(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
 
     def test_check_grad_ingore_x(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(
             ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
+        if self.dtype == np.float16:
+            return
         self.check_grad(
             ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
 
+    def init_input_output(self):
+        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
 
-class TestElementwiseAddOp_scalar(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(1).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
+    def init_dtype(self):
+        pass
 
+    def init_axis(self):
+        pass
 
-class TestElementwiseAddOp_scalar2(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(1, 1).astype(np.float32)
-        }
-        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
 
+class TestFP16ElementwiseAddOp(TestElementwiseAddOp):
+    def init_dtype(self):
+        self.dtype = np.float16
 
-class TestElementwiseAddOp_Vector(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.random((32, )).astype("float32"),
-            'Y': np.random.random((32, )).astype("float32")
-        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
-class TestElementwiseAddOp_broadcast_0(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(2).astype(np.float32)
-        }
+class TestElementwiseAddOp_scalar(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
 
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1)
-        }
 
+class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y
 
-class TestElementwiseAddOp_broadcast_1(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(3).astype(np.float32)
-        }
 
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 1)
-        }
+class TestElementwiseAddOp_scalar2(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
 
 
-class TestElementwiseAddOp_broadcast_2(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(4).astype(np.float32)
-        }
+class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(1, 1).astype(self.dtype)
+        self.out = self.x + self.y
 
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1, 4)
-        }
 
+class TestElementwiseAddOp_Vector(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.random((32, )).astype(self.dtype)
+        self.y = np.random.random((32, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
 
-class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
-            'Y': np.random.rand(3, 4).astype(np.float32)
-        }
 
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4, 1)
-        }
+class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.random((32, )).astype(self.dtype)
+        self.y = np.random.random((32, )).astype(self.dtype)
+        self.out = np.add(self.x, self.y)
 
 
-class TestElementwiseAddOp_broadcast_4(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
-            'Y': np.random.rand(2, 1).astype(np.float32)
-        }
+class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1)
 
-        self.attrs = {'axis': 0}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1, 1)
-        }
+    def init_axis(self):
+        self.axis = 0
 
 
-class TestElementwiseAddOp_rowwise_add_0(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(3, 4).astype(np.float32)
-        }
+class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(2).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1)
 
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4)
-        }
+    def init_axis(self):
+        self.axis = 0
 
 
-class TestElementwiseAddOp_rowwise_add_1(TestElementwiseOp):
-    def setUp(self):
-        self.op_type = "elementwise_add"
-        self.inputs = {
-            'X': np.random.rand(2, 1).astype(np.float32),
-            'Y': np.random.rand(1).astype(np.float32)
-        }
+class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 1)
 
-        self.attrs = {'axis': 1}
-        self.outputs = {
-            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1)
-        }
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 4)
+
+
+class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1, 4)
+
+
+class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4, 5).astype(self.dtype)
+        self.y = np.random.rand(2, 1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(2, 1, 1, 1)
+
+    def init_axis(self):
+        self.axis = 0
+
+
+class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 3, 4).astype(self.dtype)
+        self.y = np.random.rand(3, 4).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 3, 4)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(2, 1).astype(self.dtype)
+        self.y = np.random.rand(1).astype(self.dtype)
+        self.out = self.x + self.y.reshape(1, 1)
+
+    def init_axis(self):
+        self.axis = 1
+
+
+class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 20, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
+
+
+class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp):
+    def init_input_output(self):
+        self.x = np.random.rand(3, 10, 20).astype(self.dtype)
+        self.y = np.random.rand(3, 1, 1).astype(self.dtype)
+        self.out = self.x + self.y
+
+    def init_axis(self):
+        self.axis = -1
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6f45381af8ac64d117eb27325f25763fbf6cae7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
@@ -0,0 +1,103 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+
+
+class TestElementWiseAddOp(unittest.TestCase):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def check_forward_backward(self):
+        def test_with_place(place):
+            out_grad = np.random.random_sample(self.x.shape).astype(np.float32)
+            x_grad = out_grad
+            sum_axis = range(0, len(self.x.shape))
+            del sum_axis[self.axis]
+            y_grad = np.sum(out_grad, axis=tuple(sum_axis))
+
+            var_dict = locals()
+            var_dict['y'] = self.y
+            var_dict['x'] = self.x
+            var_dict['out'] = self.out
+            var_dict['y@GRAD'] = y_grad
+            var_dict['x@GRAD'] = x_grad
+            var_dict['out@GRAD'] = out_grad
+
+            var_names = ['x', 'y', 'out', 'y@GRAD', 'x@GRAD', 'out@GRAD']
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = fluid.Program()
+            with fluid.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                elementwise_add_op = block.append_op(
+                    type="elementwise_add",
+                    inputs={
+                        "X": block.var('x'),
+                        "Y": block.var('y'),
+                    },
+                    outputs={"Out": block.var('out'), },
+                    attrs={"axis": self.axis, })
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    elementwise_add_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                exe = fluid.Executor(place)
+                out = exe.run(program,
+                              feed={
+                                  name: var_dict[name]
+                                  for name in ['x', 'y', 'out@GRAD']
+                              },
+                              fetch_list=['x@GRAD', 'y@GRAD'])
+                self.__assert_close(x_grad, out[0], "x@GRAD")
+                self.__assert_close(y_grad, out[1], "y@GRAD", atol=1.4)
+
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "elementwise_add"):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            test_with_place(place)
+
+    def test_check_forward_backward_with_scale_and_bias(self):
+        np.random.seed(123)
+        self.x = np.random.random((4, 32, 220, 220)).astype(np.float32)
+        self.y = np.random.random((32)).astype(np.float32)
+        self.out = self.x + self.y.reshape(1, 32, 1, 1)
+        self.axis = 1
+        self.check_forward_backward()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
index 4958bef3ef4d101f934a2776efc21efdd24a9a4d..e1272c1d6dd7131b55ecf33fa0de0fc78a3ac5a7 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
@@ -16,7 +16,6 @@ import unittest
 
 import numpy
 import paddle.fluid.core as core
-
 from paddle.fluid.executor import Executor
 from paddle.fluid.layers import mul, data
 
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..281068e945e76a42635868d19573498f79fde1f3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+
+
+def quantize_max_abs(x, num_bits):
+    range = math.pow(2, num_bits) - 1
+    scale = np.max(np.abs(x).flatten())
+    y = np.round(x / scale * range)
+    return y, scale
+
+
+def dequantize_max_abs(x, num_bits, scale):
+    range = math.pow(2, num_bits) - 1
+    y = (scale / range) * x
+    return y
+
+
+class TestFakeDequantizeMaxAbsOp(OpTest):
+    def set_args(self):
+        self.num_bits = 8
+
+    def setUp(self):
+        self.set_args()
+        self.op_type = "fake_dequantize_max_abs"
+        x = np.random.randn(31, 65).astype("float32")
+        yq, scale = quantize_max_abs(x, self.num_bits)
+        print 'scale ', scale
+        ydq = dequantize_max_abs(yq, self.num_bits, scale)
+
+        self.inputs = {'X': yq}
+        self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)}
+        self.outputs = {'Out': ydq}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFakeDequantizeMaxAbsOp5Bits(OpTest):
+    def set_args(self):
+        self.num_bits = 5
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f547f3c484bf034a87823a75d946ef130a5cb70
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def fully_connected_naive(input, weights, bias_data=None):
+    in_n, in_c, in_h, in_w = input.shape
+    w_h, w_c = weights.shape
+
+    x_data = np.reshape(input, [in_n, in_c * in_h * in_w])
+    w_data = np.transpose(np.reshape(weights, (w_c, in_c * in_h * in_w)))
+    result = None
+
+    if not bias_data:
+        result = np.dot(x_data, w_data)
+    else:
+        result = np.dot(x_data, w_data) + bias_data
+
+    return result
+
+
+class MatrixGenerate:
+    def __init__(self, mb, ic, oc, h, w):
+        self.input = np.random.random((mb, ic, h, w)).astype("float32")
+        self.weights = np.random.random((ic * h * w, oc)).astype("float32")
+
+
+class TestFCMKLDNNOp(OpTest):
+    def setUp(self):
+        self.op_type = "fc"
+        self.use_mkldnn = True
+        self.with_bias = True
+        self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
+
+        self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
+
+        self.attrs = {
+            'use_mkldnn': self.use_mkldnn,
+            'with_bias': self.with_bias
+        }
+
+        self.outputs = {
+            'Out': fully_connected_naive(self.matrix.input, self.matrix.weights)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(set(['Input', 'W']), 'Out', max_relative_error=0.9)
+
+    def test_check_grad_no_weight(self):
+        self.check_grad(
+            ['Input'], 'Out', max_relative_error=0.5, no_grad_set=set('W'))
+
+
+class TestFCMKLDNNOp1(TestFCMKLDNNOp):
+    def init_op_type(self):
+        self.matrix = MatrixGenerate(2, 15, 48, 2, 2)
+
+
+class TestFCMKLDNNOp2(TestFCMKLDNNOp):
+    def init_op_type(self):
+        self.matrix = MatrixGenerate(2, 32, 40, 1, 1)
+
+
+class TestFCMKLDNNOp3(TestFCMKLDNNOp):
+    def init_op_type(self):
+        self.matrix = MatrixGenerate(2, 2, 4, 1, 1)
+
+
+class TestFCMKLDNNOp4(TestFCMKLDNNOp):
+    def init_op_type(self):
+        self.with_bias = False
+        self.matrix = MatrixGenerate(2, 32, 48, 2, 2)
+
+
+class TestFCMKLDNNOp4(TestFCMKLDNNOp):
+    def init_op_type(self):
+        self.with_bias = False
+        self.matrix = MatrixGenerate(2, 32, 1000, 6, 6)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
index 9d724a6479f061996359b1efcc5f61f0564331c7..8b9da843115409c65055927d317867d1290c8f0e 100644
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
@@ -24,17 +24,16 @@ class TestFeedFetch(unittest.TestCase):
         input_array = np.ones((4, 4, 6)).astype("float32")
         input_array[0, 0, 0] = 3
         input_array[3, 3, 5] = 10
-        input_tensor = core.LoDTensor([[0, 2, 4]])
+        input_tensor = core.LoDTensor([[2, 2]])
         input_tensor.set(input_array, place)
 
         core.set_feed_variable(scope, input_tensor, "feed", 0)
 
         output_tensor = core.get_fetch_variable(scope, "feed", 0)
 
-        output_lod = output_tensor.lod()
-        self.assertEqual(0, output_lod[0][0])
+        output_lod = output_tensor.recursive_sequence_lengths()
+        self.assertEqual(2, output_lod[0][0])
         self.assertEqual(2, output_lod[0][1])
-        self.assertEqual(4, output_lod[0][2])
 
         output_array = np.array(output_tensor)
         self.assertEqual(3, output_array[0, 0, 0])
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
index 66e3e2d51d118756d4881955b4df8eb4d2bbc094..0c75cf33f5f208d11081a6802910c25553b8c4ec 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
@@ -50,5 +50,27 @@ class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
         self.check_output()
 
 
+class TestFillConstantBatchSizeLikeWithLoDTensor(OpTest):
+    def setUp(self):
+        self.op_type = "fill_constant_batch_size_like"
+        self.inputs = {
+            'Input': (np.random.random((31, 28)).astype("float32"),
+                      [[9, 14, 8]])
+        }
+        self.attrs = {
+            'value': 3.5,
+            'shape': [-1, 16],
+            'input_dim_idx': 0,
+            'output_dim_idx': 0
+        }
+
+        out = np.random.random((3, 16)).astype("float32")
+        out.fill(3.5)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index 6fd043c27e27db53c95be3630b6c08216e8e35f4..4ae90864806204197c52bbbdc5516f141afd4613 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -20,8 +20,9 @@ from op_test import OpTest
 class TestGatherOp(OpTest):
     def setUp(self):
         self.op_type = "gather"
-        xnp = np.random.random((10, 20)).astype("float32")
-        self.inputs = {'X': xnp, 'Index': np.array([1, 3, 5]).astype("int32")}
+        self.config()
+        xnp = np.random.random(self.x_shape).astype("float32")
+        self.inputs = {'X': xnp, 'Index': np.array(self.index).astype("int32")}
         self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
 
     def test_check_output(self):
@@ -30,6 +31,16 @@ class TestGatherOp(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
+    def config(self):
+        self.x_shape = (10, 20)
+        self.index = [1, 3, 5]
+
+
+class TestCase1(TestGatherOp):
+    def config(self):
+        self.x_shape = (10)
+        self.index = [1, 3, 5]
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ae877a60818744f852d3af9a02ffebf5e2affc8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from test_gaussian_random_op import TestGaussianRandomOp
+
+
+class TestMKLDNN(TestGaussianRandomOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
index 272caceaf38699438ccae41691bf26b2eb4d2a22..8481500fd78f0ccf34f09c66bec27e195b9aada3 100644
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
@@ -25,7 +25,15 @@ class TestGaussianRandomOp(unittest.TestCase):
     def setUp(self):
         self.op_type = "gaussian_random"
         self.inputs = {}
-        self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
+        self.use_mkldnn = False
+        self.init_kernel_type()
+        self.attrs = {
+            "shape": [1000, 784],
+            "mean": .0,
+            "std": 1.,
+            "seed": 10,
+            "use_mkldnn": self.use_mkldnn
+        }
 
         self.outputs = ["Out"]
 
@@ -58,6 +66,9 @@ class TestGaussianRandomOp(unittest.TestCase):
         self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
         self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
 
+    def init_kernel_type(self):
+        pass
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py
index 3a13eb872a8646cede126b667864dfc3784ebd0b..8fbf1560859aa295fc40b36129d0f0d07d55dd9f 100644
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
@@ -20,8 +20,8 @@ from test_lstm_op import identity, sigmoid, tanh, relu
 
 
 class TestGRUOp(OpTest):
-    lod = [[0, 2, 6, 9]]
-    batch_size = lod[0][-1]
+    lod = [[2, 4, 3]]
+    batch_size = sum(lod[0])
     frame_size = 5
     activate = {
         'identity': identity,
@@ -33,10 +33,10 @@ class TestGRUOp(OpTest):
     @staticmethod
     def seq_to_batch(lod, is_reverse):
         idx_in_seq_list = []
-        seq_starts = lod[0]
-        seq_lens = []
-        for i in range(len(seq_starts) - 1):
-            seq_lens.append(seq_starts[i + 1] - seq_starts[i])
+        seq_lens = lod[0]
+        seq_starts = [0]
+        for i in range(len(seq_lens)):
+            seq_starts.append(seq_starts[-1] + seq_lens[i])
         sorted_seqs = sorted(
             range(len(seq_lens)), lambda x, y: seq_lens[y] - seq_lens[x])
         num_batch = seq_lens[sorted_seqs[0]]
diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py
index 587e2025e1045f63a5825f884d4dcad8b4685e62..15a72cb605911dfe957fb927763174521a30a085 100644
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
@@ -364,5 +364,22 @@ class TestMSRAInitializer(unittest.TestCase):
         self.assertEqual(init_op.attr('seed'), 134)
 
 
+class TestMSRAInitializer(unittest.TestCase):
+    def test_bilinear_initializer(self):
+        """Test the bilinear initializer with supplied arguments
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[8, 1, 3, 3],
+            lod_level=0,
+            name="param",
+            initializer=initializer.BilinearInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'assign_value')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
index e33436b63c0e0b41f5e4a5bc6190517d7c648277..eff4212d91e609a7ef531280bbd3cf3671a59830 100644
--- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import numpy.random as random
 import sys
 import math
 from op_test import OpTest
@@ -25,14 +26,27 @@ class TestIOUSimilarityOp(OpTest):
 
     def setUp(self):
         self.op_type = "iou_similarity"
-        self.boxes1 = np.array(
-            [[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]).astype('float32')
-        self.boxes2 = np.array([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0],
-                                [0.0, 0.0, 20.0, 20.0]]).astype('float32')
-        self.output = np.array(
-            [[2.0 / 16.0, 0, 6.0 / 400.0],
-             [1.0 / 16.0, 0.0, 5.0 / 400.0]]).astype('float32')
-
+        self.boxes1 = random.rand(2, 4).astype('float32')
+        self.boxes2 = random.rand(3, 4).astype('float32')
+        self.output = random.rand(2, 3).astype('float32')
+        for row in range(self.boxes1.shape[0]):
+            for col in range(self.boxes2.shape[0]):
+                xmin1, ymin1, xmax1, ymax1 = self.boxes1[row]
+                xmin2, ymin2, xmax2, ymax2 = self.boxes2[col]
+                area1 = (ymax1 - ymin1) * (xmax1 - xmin1)
+                area2 = (ymax2 - ymin2) * (xmax2 - xmin2)
+                inter_xmax = min(xmax1, xmax2)
+                inter_ymax = min(ymax1, ymax2)
+                inter_xmin = max(xmin1, xmin2)
+                inter_ymin = max(ymin1, ymin2)
+                inter_height = inter_ymax - inter_ymin
+                inter_width = inter_xmax - inter_xmin
+                inter_height = max(inter_height, 0)
+                inter_width = max(inter_width, 0)
+                inter_area = inter_width * inter_height
+                union_area = area1 + area2 - inter_area
+                sim_score = inter_area / union_area
+                self.output[row, col] = sim_score
         self.inputs = {'X': self.boxes1, 'Y': self.boxes2}
 
         self.outputs = {'Out': self.output}
@@ -44,8 +58,8 @@ class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp):
 
     def setUp(self):
         super(TestIOUSimilarityOpWithLoD, self).setUp()
-        self.boxes1_lod = [[0, 1, 2]]
-        self.output_lod = [[0, 1, 2]]
+        self.boxes1_lod = [[1, 1]]
+        self.output_lod = [[1, 1]]
 
         self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
         self.outputs = {'Out': (self.output, self.output_lod)}
diff --git a/python/paddle/fluid/tests/unittests/test_is_empty_op.py b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
index 4d11cf226be2ba4ffbe015198fed3191f1e02f72..11121d9b65351eab639b7618fac0e54714cf4680 100644
--- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py
+++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
@@ -14,42 +14,24 @@
 
 import unittest
 import numpy as np
-from paddle.fluid.op import Operator
-import paddle.fluid.core as core
+from op_test import OpTest
 
 
-def create_tensor(scope, name, np_data):
-    tensor = scope.var(name).get_tensor()
-    tensor.set_dims(np_data.shape)
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-
-
-class TestIsEmptyOp(unittest.TestCase):
+class TestEmpty(OpTest):
     def setUp(self):
-        self.scope = core.Scope()
-        # create input variables
-        np_data0 = np.array([0, 1, 2])
-        create_tensor(self.scope, "X0", np_data0)
-
-        np_data1 = np.array([1])
-        t = create_tensor(self.scope, "X1", np_data1)
-        t.set_dims([0])
+        self.op_type = "is_empty"
+        self.inputs = {'X': np.array([1, 2, 3])}
+        self.outputs = {'Out': np.array([False])}
 
-        # create output variables
-        self.scope.var("out")
+    def test_check_output(self):
+        self.check_output()
 
-    def test_no_empty(self):
-        self.one_case("X0", False)
 
-    def test_empty(self):
-        self.one_case("X1", True)
-
-    def one_case(self, input, target):
-        op = Operator(type="is_empty", X=input, Out="out")
-        op.run(self.scope, core.CPUPlace())
-        out = self.scope.var("out").get_tensor()
-        self.assertEqual(np.array(out)[0], target)
+class TestNotEmpty(TestEmpty):
+    def setUp(self):
+        self.op_type = "is_empty"
+        self.inputs = {'X': np.array([])}
+        self.outputs = {'Out': np.array([True])}
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
index 8c67e45b7fc997012af5f678f21271ad8b220edc..69365db4d104a1b69916a605534eff83e242289f 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -15,10 +15,8 @@ import unittest
 import numpy as np
 
 from operator import mul
-from op_test import OpTest
 import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
 
 np.random.random(123)
 
@@ -70,161 +68,93 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     return grad_x, d_scale, d_bias
 
 
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
-
-
-def create_or_get_tensor(scope, var_name, var, place):
-    tensor = scope.var(var_name).get_tensor()
-    if var is not None:
-        assert isinstance(var, np.ndarray)
-        tensor.set_lod([[]])
-        tensor.set_dims(var.shape)
-        tensor.set(var, place)
-    return tensor
-
-
-def set_output_grad(scope, outputs, place, feed_dict=None):
-    def __set_tensor__(name, data=None):
-        out_tensor = scope.find_var(name).get_tensor()
-        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
-        out_dtype = out_tensor.dtype()
-        if data is None:
-            if out_dtype == core.VarDesc.VarType.FP64:
-                data = np.ones(out_tensor.shape(), dtype=np.float64)
-            elif out_dtype == core.VarDesc.VarType.FP32:
-                data = np.ones(out_tensor.shape(), dtype=np.float32)
-            else:
-                raise ValueError("Not supported data type " + str(out_dtype))
-        grad_tensor.set(data, place)
-
-    for output in outputs:
-        data = None
-        if output in feed_dict:
-            data = feed_dict[output]
-        __set_tensor__(output, data)
-
-
-class TestLayerNormdOp(OpTest):
+class TestLayerNormdOp(unittest.TestCase):
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
-    def __assert_grad_close(self,
-                            tensor,
-                            np_array,
-                            name,
-                            place,
-                            max_relative_error=0.02):
-        a = np.array(tensor)
-        b = np_array
-        abs_a = np.abs(a)
-        abs_a[abs_a < 1e-5] = 1
-
-        diff_mat = np.abs(a - b) / abs_a
-        max_diff = np.max(diff_mat)
-
-        def err_msg():
-            offset = np.argmax(diff_mat > max_relative_error)
-            return ("%s Variable %s max gradient diff %f over limit %f, "
-                    "the first error element is %d, %f, %f") % (
-                        "Gradient Check On %s" % str(place), name, max_diff,
-                        max_relative_error, offset, a.flatten()[offset],
-                        b.flatten()[offset])
-
-        self.assertLessEqual(max_diff, max_relative_error, err_msg())
-
     def check_forward_backward(self, shape, begin_norm_axis):
-        def test_with_place(place, shape, begin_norm_axis=1):
-            # setUp
-            assert begin_norm_axis > 0 and begin_norm_axis < len(
-                shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
+        def test_with_place(place, shape, begin_norm_axis):
             # attr
             epsilon = 0.00001
             x_shape = shape
             D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
             scale_shape = [D]
 
-            x_val = np.random.random_sample(x_shape).astype(np.float32)
-            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            np.random.seed(123)
+            x = np.random.random_sample(x_shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
             y_grad = np.random.random_sample(x_shape).astype(np.float32)
 
-            # run forward
-            y_out, saved_mean, var_ref = _reference_layer_norm_naive(
-                x_val, scale_val, bias_val, epsilon, begin_norm_axis)
-            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
-
-            # get gradient
-            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
-                x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
-            naive_grad = {
-                "X": x_grad_ref,
-                "Scale": scale_grad_ref,
-                "Bias": bias_grad_ref
-            }
-
-            scope = core.Scope()
-
-            # create input
-            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
-            for i_name in input_map:
-                create_or_get_tensor(scope, i_name, input_map[i_name], place)
-
-            # create output
-            output_map = {"Y": None, "Mean": None, "Variance": None}
-            output_tensor = {}
-            for o_name in output_map:
-                output_tensor[o_name] = create_or_get_tensor(
-                    scope, o_name, output_map[o_name], place)
-
-            layer_norm_op = Operator(
-                "layer_norm",
-                # inputs
-                X="X",
-                Scale="Scale",
-                Bias="Bias",
-                # outputs
-                Y="Y",
-                Mean="Mean",
-                Variance="Variance",
-                # attrs
-                epsilon=epsilon,
-                begin_norm_axis=begin_norm_axis)
-
-            layer_norm_op.run(scope, place)
-
-            # check forward result
-            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
-            for o_tensor in output_tensor:
-                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
-                                    o_tensor, atol)
-
-            # run backward
-            layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
-            set_output_grad(
-                scope, ["Y", "Mean", "Variance"],
-                place,
-                feed_dict={"Y": y_grad})
-            layer_norm_op_grad.run(scope, place)
-
-            # get output
-            grad_tensor = {}
-            for o_name in naive_grad:
-                grad_tensor[o_name] = x_ = create_or_get_tensor(
-                    scope, grad_var_name(o_name), None, place)
-
-            # check gradient output
-            for o_grad in naive_grad:
-                self.__assert_grad_close(grad_tensor[o_grad],
-                                         naive_grad[o_grad], o_grad + "@GRAD",
-                                         place)
+            # reference forward & backward
+            y, mean, variance = _reference_layer_norm_naive(
+                x, scale, bias, epsilon, begin_norm_axis)
+            x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
+                x, y_grad, scale, mean, variance, begin_norm_axis)
+
+            var_dict = locals()
+            var_dict['y@GRAD'] = y_grad
+            var_names = [
+                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'y@GRAD'
+            ]
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = fluid.Program()
+            with fluid.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                layer_norm_op = block.append_op(
+                    type="layer_norm",
+                    inputs={
+                        "X": block.var('x'),
+                        "Scale": block.var('scale'),
+                        "Bias": block.var('bias'),
+                    },
+                    outputs={
+                        "Y": block.var('y'),
+                        "Mean": block.var('mean'),  # share the same memory
+                        "Variance":
+                        block.var('variance'),  # share the same memory
+                    },
+                    attrs={
+                        "epsilon": epsilon,
+                        "begin_norm_axis": begin_norm_axis
+                    })
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    layer_norm_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                exe = fluid.Executor(place)
+                out = exe.run(program,
+                              feed={
+                                  name: var_dict[name]
+                                  for name in ['x', 'scale', 'bias', 'y@GRAD']
+                              },
+                              fetch_list=[
+                                  'y', 'mean', 'variance', 'x@GRAD',
+                                  'scale@GRAD', 'bias@GRAD'
+                              ])
+                self.__assert_close(y, out[0], "y")
+                self.__assert_close(mean, out[1], "mean")
+                self.__assert_close(variance, out[2], "variance", 1e-3)
+                self.__assert_close(x_grad, out[3], "x_grad")
+                self.__assert_close(scale_grad, out[4], "scale_grad", 1e-3)
+                self.__assert_close(bias_grad, out[5], "bias_grad")
 
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
@@ -237,15 +167,6 @@ class TestLayerNormdOp(OpTest):
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
-    def test_check_forward_backward_with_scale(self):
-        pass  # TODO(zcd)
-
-    def test_check_forward_backward_with_bias(self):
-        pass  # TODO(zcd)
-
-    def test_check_forward_backward(self):
-        pass  # TODO(zcd)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 6944cca394fbc1ddde09dfeb0bc82e357a3cd225..842d34c07e94a79e3351347e2528ecc478cc56dc 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -32,7 +32,6 @@ class TestBook(unittest.TestCase):
             cost = layers.square_error_cost(input=y_predict, label=y)
             avg_cost = layers.mean(cost)
             self.assertIsNotNone(avg_cost)
-            program.append_backward(avg_cost)
 
         print(str(program))
 
@@ -94,8 +93,6 @@ class TestBook(unittest.TestCase):
             cost = layers.cross_entropy(input=predict, label=label)
             avg_cost = layers.mean(cost)
 
-            program.append_backward(avg_cost)
-
         print(str(program))
 
     def test_word_embedding(self):
@@ -181,8 +178,8 @@ class TestBook(unittest.TestCase):
         with program_guard(program):
             x = layers.data(name='x', shape=[10], dtype='float32')
             y = layers.data(
-                name='y', shape=[10, 20], dtype='float32', lod_level=1)
-            self.assertIsNotNone(layers.sequence_expand(x=x, y=y))
+                name='y', shape=[10, 20], dtype='float32', lod_level=2)
+            self.assertIsNotNone(layers.sequence_expand(x=x, y=y, ref_level=1))
         print(str(program))
 
     def test_lstm_unit(self):
@@ -220,7 +217,7 @@ class TestBook(unittest.TestCase):
             seq_data = layers.data(
                 name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
             seq = layers.fc(input=seq_data, size=20)
-            self.assertIsNotNone(layers.sequence_softmax(x=seq))
+            self.assertIsNotNone(layers.sequence_softmax(seq))
         print(str(program))
 
     def test_softmax(self):
@@ -228,7 +225,14 @@ class TestBook(unittest.TestCase):
         with program_guard(program):
             data = layers.data(name='data', shape=[10], dtype='float32')
             hid = layers.fc(input=data, size=20)
-            self.assertIsNotNone(layers.softmax(x=hid))
+            self.assertIsNotNone(layers.softmax(hid))
+        print(str(program))
+
+    def test_lrn(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='data', shape=[6, 2, 2], dtype='float32')
+            self.assertIsNotNone(layers.lrn(data))
         print(str(program))
 
     def test_get_places(self):
@@ -327,6 +331,103 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(loss)
         print(str(program))
 
+    def test_lod_reset(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10], dtype='float32')
+            y = layers.data(
+                name='y', shape=[10, 20], dtype='float32', lod_level=2)
+            print(layers.lod_reset(x=x, y=y))
+        print(str(program))
+
+    def test_label_smooth(self):
+        program = Program()
+        with program_guard(program):
+            label = layers.data(name="label", shape=[1], dtype="float32")
+            one_hot_label = layers.one_hot(input=label, depth=10)
+            smooth_label = layers.label_smooth(
+                label=one_hot_label, epsilon=0.1, dtype="float32")
+            self.assertIsNotNone(smooth_label)
+        print(str(program))
+
+    def test_topk(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name="label", shape=[200], dtype="float32")
+            values, indices = layers.topk(data, k=5)
+            self.assertIsNotNone(values)
+            self.assertIsNotNone(indices)
+        print(str(program))
+
+    def test_roi_pool(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[4], dtype="float32", lod_level=1)
+            output = layers.roi_pool(x, rois, 7, 7, 0.6)
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    def test_resize_bilinear(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 9, 6], dtype="float32")
+            output = layers.resize_bilinear(x, out_shape=[12, 12])
+            self.assertIsNotNone(output)
+            output = layers.resize_bilinear(x, scale=3)
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    def test_polygon_box_transform(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[8, 4, 4], dtype="float32")
+            output = layers.polygon_box_transform(input=x)
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    def test_l2_normalize(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[8, 7, 10], dtype="float32")
+            output = layers.l2_normalize(x, axis=1)
+
+    def test_maxout(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='x', shape=[8, 6, 6], dtype="float32")
+            output = layers.maxout(x=data, groups=2)
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    def test_crop(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 5], dtype="float32")
+            y = layers.data(name='y', shape=[2, 3], dtype="float32")
+            output = layers.crop(x, shape=y)
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    def test_mean_iou(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[16], dtype='float32')
+            y = layers.data(name='label', shape=[1], dtype='int64')
+            iou = layers.mean_iou(x, y, 2)
+            self.assertIsNotNone(iou)
+        print(str(program))
+
+    def test_argsort(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='x', shape=[2, 3, 3], dtype="float32")
+            out, ids = layers.argsort(input=data, axis=1)
+            self.assertIsNotNone(out)
+            self.assertIsNotNone(ids)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
index f49f7635f76c9feb5b5593438cb445df9488c69b..696d0ab4fa81a409a2bf0d6f6f23779ec26eb6d2 100644
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
@@ -105,11 +105,13 @@ class TestLinearChainCrfOp(OpTest):
         MAX_SEQ_LEN = 5
 
         # the linear_chain_crf operator only supports sequence (LoD level = 1)
-        lod = [[0]]
+        lod = [[]]
+        seq_start_pos = [0]
         for i in range(SEQ_NUM):
-            lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN))
-        emission = np.random.uniform(-1, 1,
-                                     [lod[-1][-1], TAG_NUM]).astype("float64")
+            lod[-1].append(random.randint(1, MAX_SEQ_LEN))
+            seq_start_pos.append(seq_start_pos[-1] + lod[-1][-1])
+        emission = np.random.uniform(
+            -1, 1, [seq_start_pos[-1], TAG_NUM]).astype("float64")
         emission_row_max = np.amax(emission, axis=1, keepdims=True)
         emission_exps = np.exp(emission - emission_row_max)
 
@@ -118,14 +120,14 @@ class TestLinearChainCrfOp(OpTest):
         transition_exps = np.exp(transition)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
+            low=0, high=TAG_NUM, size=(seq_start_pos[-1], 1), dtype="int64")
 
         self.inputs = {
             "Emission": (emission, lod),
             "Transition": transition,
             "Label": (labels, lod)
         }
-        crf = LinearChainCrfForward(lod[0], emission, emission_row_max,
+        crf = LinearChainCrfForward(seq_start_pos, emission, emission_row_max,
                                     emission_exps, transition, transition_exps,
                                     labels)
         alpha, log_likelihood = crf.crf_forward_compute()
diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cdc69501043d120b9e3cc8ccda3a1212d205886
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
@@ -0,0 +1,110 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import os
+import signal
+import subprocess
+import time
+import unittest
+from multiprocessing import Process
+from op_test import OpTest
+
+
+def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
+    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+    # loss function
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(cost)
+
+    # optimizer
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    pserver_endpoints = ip + ":" + port
+    current_endpoint = ip + ":" + port
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id,
+        pservers=pserver_endpoints,
+        trainers=trainers,
+        sync_mode=sync_mode)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+
+
+class TestListenAndServOp(OpTest):
+    def setUp(self):
+        self.ps_timeout = 5
+        self.ip = "127.0.0.1"
+        self.port = "0"
+        self.trainers = 1
+        self.trainer_id = 0
+
+    def _start_pserver(self, use_cuda, sync_mode):
+        p = Process(
+            target=run_pserver,
+            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
+                  self.trainer_id))
+        p.daemon = True
+        p.start()
+        return p
+
+    def _wait_ps_ready(self, pid):
+        start_left_time = self.ps_timeout
+        sleep_time = 0.5
+        while True:
+            assert start_left_time >= 0, "wait ps ready failed"
+            time.sleep(sleep_time)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                start_left_time -= sleep_time
+
+    def test_rpc_interfaces(self):
+        # TODO(Yancey1989): need to make sure the rpc interface correctly.
+        pass
+
+    def test_handle_signal_in_serv_op(self):
+        # run pserver on CPU in sync mode
+        p1 = self._start_pserver(False, True)
+        self._wait_ps_ready(p1.pid)
+
+        # raise SIGTERM to pserver
+        os.kill(p1.pid, signal.SIGINT)
+        p1.join()
+
+        # run pserver on CPU in async mode
+        p2 = self._start_pserver(False, False)
+        self._wait_ps_ready(p2.pid)
+
+        # raise SIGTERM to pserver
+        os.kill(p2.pid, signal.SIGTERM)
+        p2.join()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
index 093eecb8370b8ae7e4c43ce7ca6f50f5d302bd60..bac5e502318397b43e9867d5fc9e4e8cd33394b8 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
@@ -30,7 +30,8 @@ class TestLoDRankTable(unittest.TestCase):
 
         tensor = core.LoDTensor()
         tensor.set(numpy.random.random(size=(17, 100)), cpu)
-        tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
+        tensor.set_recursive_sequence_lengths(
+            [[1, 2], [5, 1, 1], [3, 1, 5, 1, 3, 3, 1]])
         exe.run(scope=scope, feed={'x': tensor})
         var = scope.find_var(rank_table.name)
         table = var.get_lod_rank_table()
diff --git a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
index 3bf8230f8748dd87ec3c85b0cbd78df2e695a96b..77905c4b96499c855fd5c5e704b8051ccdb7a323 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
@@ -21,11 +21,15 @@ class TestLodResetOpByAttr(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
-        target_lod_0 = [0, 7, 10]
+        lod = [[3, 2, 5]]
+        # target_offset_lod and target_lod are the same lod info represented
+        # in offset-based format and length-based format, respectively.
+        target_offset_lod = [0, 7, 10]
+        target_lod = [7, 3]
         self.inputs = {'X': (x, lod)}
-        self.attrs = {'target_lod': target_lod_0}
-        self.outputs = {'Out': (x, [target_lod_0])}
+        # The `target_lod` attribute is still based on offset
+        self.attrs = {'target_lod': target_offset_lod}
+        self.outputs = {'Out': (x, [target_lod])}
 
     def test_check_output(self):
         self.check_output()
@@ -38,40 +42,61 @@ class TestLodResetOpByInput(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
-        target_lod_0 = [0, 4, 7, 10]
+        lod = [[3, 2, 5]]
+        # target_offset_lod and target_lod are the same lod info represented
+        # in offset-based format and length-based format, respectively.
+        target_offset_lod = [0, 4, 7, 10]
+        target_lod = [4, 3, 3]
         self.inputs = {
             'X': (x, lod),
-            'TargetLoD': np.array([target_lod_0]).astype('int32')
+            'Y': np.array([target_offset_lod]).astype('int32')
         }
-        self.outputs = {'Out': (x, [target_lod_0])}
+        self.outputs = {'Out': (x, [target_lod])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+        self.check_grad(["X"], "Out", no_grad_set=set("Y"))
 
 
 class TestLodResetOpBoth(OpTest):
     def setUp(self):
         self.op_type = "lod_reset"
         x = np.random.random((10, 20)).astype("float32")
-        lod = [[0, 3, 5, 10]]
-        target_lod_0_attr = [0, 7, 10]
-        target_lod_0_in = [0, 4, 7, 10]
+        lod = [[3, 2, 5]]
+        target_offset_lod_attr = [0, 7, 10]
+        target_offset_lod_in = [0, 4, 7, 10]
+        target_lod_in = [4, 3, 3]
         self.inputs = {
             'X': (x, lod),
-            'TargetLoD': np.array(target_lod_0_in).astype('int32')
+            'Y': np.array(target_offset_lod_in).astype('int32')
         }
-        self.attrs = {'target_lod': target_lod_0_attr}
-        self.outputs = {'Out': (x, [target_lod_0_in])}
+        self.attrs = {'target_lod': target_offset_lod_attr}
+        self.outputs = {'Out': (x, [target_lod_in])}
 
     def test_check_output(self):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+        self.check_grad(["X"], "Out", no_grad_set=set("Y"))
+
+
+class TestLodResetOpYIsLoDTensor(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[3, 2, 5]]
+        y = np.random.random((10, 10)).astype("float32")
+        target_lod = [[4, 3, 3]]
+        self.inputs = {'X': (x, lod), 'Y': (y, target_lod)}
+        self.outputs = {'Out': (x, target_lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set("Y"))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
index 63b17a5ccd62ed79b3d611e039c2b2705a133272..118c22fbb1ff6be5859ae9e4aed6218b0c77deec 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
@@ -27,7 +27,7 @@ class TestLoDTensorArray(unittest.TestCase):
         for i in xrange(10):
             t = core.LoDTensor()
             t.set(numpy.array([i], dtype='float32'), cpu)
-            t.set_lod([[0, 1]])
+            t.set_recursive_sequence_lengths([[1]])
             tensor_array.append(t)
 
         self.assertEqual(10, len(tensor_array))
@@ -35,17 +35,17 @@ class TestLoDTensorArray(unittest.TestCase):
         for i in xrange(10):
             t = tensor_array[i]
             self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
-            self.assertEqual([[0, 1]], t.lod())
+            self.assertEqual([[1]], t.recursive_sequence_lengths())
 
             t = core.LoDTensor()
             t.set(numpy.array([i + 10], dtype='float32'), cpu)
-            t.set_lod([[0, 2]])
+            t.set_recursive_sequence_lengths([[1]])
             tensor_array[i] = t
             t = tensor_array[i]
             self.assertEqual(
                 numpy.array(t), numpy.array(
                     [i + 10], dtype='float32'))
-            self.assertEqual([[0, 2]], t.lod())
+            self.assertEqual([[1]], t.recursive_sequence_lengths())
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
index 66a03640c148d769787593f41a44cd4d1aaa10b1..cebe6997bb4152519dabbabfc0404d6036bc4e65 100644
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
@@ -29,7 +29,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
         expect = map(lambda x: numpy.array(x).astype('int32'),
                      [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
         self.main(
@@ -42,7 +42,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 3, 9, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 0, 1]])
         expect = map(lambda x: numpy.array(x).astype('int32'),
                      [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
         self.main(
@@ -55,7 +55,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
+        tensor.set_recursive_sequence_lengths([[2, 3], [3, 6, 2, 6, 3]])
 
         expect = [
             numpy.array(
@@ -65,7 +65,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
                 [17, 18, 19], dtype='int32')
         ]
 
-        lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
+        lod = [[[2, 3]], [[6, 6]], [[3]]]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -77,8 +77,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor.set(
             numpy.arange(31).reshape(31, 1).astype('int32'), self.place())
 
-        tensor.set_lod([[0, 3, 5, 9, 11],
-                        [0, 3, 7, 11, 11, 12, 17, 19, 21, 23, 30, 31]])
+        tensor.set_recursive_sequence_lengths(
+            [[3, 2, 4, 2], [3, 4, 4, 0, 1, 5, 2, 2, 2, 7, 1]])
 
         expect = [
             numpy.array(
@@ -88,7 +88,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]]
         ]
 
-        lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]]
+        lod = [[[5, 3, 0, 7]], [[2, 4, 1, 1]], [[2, 4]], [[2]]]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -99,8 +99,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
-                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+        tensor.set_recursive_sequence_lengths(
+            [[2, 3, 1], [2, 3, 1, 4, 2, 1],
+             [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
 
         expect = [
             numpy.array(
@@ -108,8 +109,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range(
                 22, 39) + range(7, 21), range(39, 46)]
         ]
-        lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]],
-               [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]]
+        lod = [[[1, 2, 1], [1, 3, 4, 4]], [[4, 3], [1, 4, 4, 8, 4, 6, 4]],
+               [[2], [6, 1]]]
         self.main(
             tensor=tensor,
             expect_array=expect,
@@ -120,8 +121,9 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         tensor = core.LoDTensor()
         tensor.set(
             numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
-                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+        tensor.set_recursive_sequence_lengths(
+            [[2, 3, 1], [2, 3, 1, 4, 2, 1],
+             [3, 4, 4, 6, 4, 1, 1, 4, 4, 8, 6, 1, 4]])
         self.main(
             tensor=tensor,
             expect_array=None,
@@ -162,12 +164,13 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
             exp_tensor, exp_lod = exp
             exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
             self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
-            self.assertEqual(exp_lod, array[i].lod())
+            self.assertEqual(exp_lod, array[i].recursive_sequence_lengths())
 
     def check_tensor_same(self, actual, expect):
         self.assertTrue(
             numpy.allclose(numpy.array(actual), numpy.array(expect)))
-        self.assertEqual(actual.lod(), expect.lod())
+        self.assertEqual(actual.recursive_sequence_lengths(),
+                         expect.recursive_sequence_lengths())
 
 
 class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
@@ -188,7 +191,7 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
 
         tensor = core.LoDTensor()
         tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
 
         g_vars = program.global_block().var(x.name + "@GRAD")
 
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa9eae1e882f55ef51f38e158317a1a9aeed641c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
@@ -0,0 +1,86 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+def output_hist(out):
+    hist, _ = np.histogram(out, range=(-5, 10))
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.1 * np.ones((10))
+    return hist, prob
+
+
+class TestLookupSpraseTable(OpTest):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Id Variable
+        ids = scope.var("Ids").get_tensor()
+        ids_array = np.array([0, 2, 3, 5, 100]).astype("int64")
+        ids.set(ids_array, place)
+
+        # create and initialize W Variable
+        rows = [0, 1, 2, 3, 4, 5, 6]
+        row_numel = 10000
+
+        w_selected_rows = scope.var('W').get_selected_rows()
+        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_rows(rows)
+        w_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            w_array[i] *= i
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(w_array, place)
+
+        # create Out Variable
+        out_tensor = scope.var('Out').get_tensor()
+
+        # create and run lookup_table operator
+        lookup_table = Operator(
+            "lookup_sparse_table",
+            W='W',
+            Ids='Ids',
+            Out='Out',
+            min=-5.0,
+            max=10.0,
+            seed=10)
+        lookup_table.run(scope, place)
+
+        # get result from Out
+        result_array = np.array(out_tensor)
+        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
+        for idx, row in enumerate(ids_array[:-2]):
+            assert (row == result_array[idx]).all()
+
+        # check the random value
+        hist, prob = output_hist(result_array[-1])
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+    def test_w_is_selected_rows(self):
+        places = [core.CPUPlace()]
+        # currently only support CPU
+        for place in places:
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
index ed920ad388ff0e01887404e70fe82565b4cd28fa..f8d5785fbfe64843f4aa3b96b24809df60980c74 100644
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
@@ -96,5 +96,47 @@ class TestLookupTableIdsIsSelectedRows(OpTest):
             self.check_with_place(place)
 
 
+class TestLookupTableWIsSelectedRows(OpTest):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Id Variable
+        ids_tensor = scope.var('Ids').get_tensor()
+        ids_array = np.array([[0], [4], [3], [5]]).astype("int64")
+        ids_tensor.set(ids_array, place)
+
+        # create and initialize W Variable
+        rows = [0, 1, 2, 3, 4, 5, 6]
+        row_numel = 12
+
+        w_selected_rows = scope.var('W').get_selected_rows()
+        w_selected_rows.set_height(len(rows))
+        w_selected_rows.set_rows(rows)
+        w_array = np.ones((len(rows), row_numel)).astype("float32")
+        for i in range(len(rows)):
+            w_array[i] *= i
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(w_array, place)
+
+        # create Out Variable
+        out_tensor = scope.var('Out').get_tensor()
+
+        # create and run lookup_table operator
+        lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out')
+        lookup_table.run(scope, place)
+
+        # get result from Out
+        result_array = np.array(out_tensor)
+        # all(): return True if all elements of the iterable are true (or if the iterable is empty)
+        for idx, row in enumerate(ids_array):
+            assert (row[0] == result_array[idx]).all()
+
+    def test_w_is_selected_rows(self):
+        places = [core.CPUPlace()]
+        # currently only support CPU
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..966a16dc870c041b9deb140bed57d907cf305fd8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
@@ -0,0 +1,49 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_lrn_op import TestLRNOp
+
+
+class TestLRNMKLDNNOp(TestLRNOp):
+    def get_attrs(self):
+        attrs = TestLRNOp.get_attrs(self)
+        attrs['use_mkldnn'] = True
+        return attrs
+
+    def test_check_output(self):
+        self.check_output(atol=0.002)
+
+
+class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
+    def get_attrs(self):
+        attrs = TestLRNMKLDNNOp.get_attrs(self)
+        attrs['is_test'] = True
+        return attrs
+
+    def test_check_grad_normal(self):
+        def check_raise_is_test():
+            try:
+                self.check_grad(['X'], 'Out', max_relative_error=0.01)
+            except Exception as e:
+                t = \
+                "is_test attribute should be set to False in training phase."
+                if t in str(e):
+                    raise AttributeError
+
+        self.assertRaises(AttributeError, check_raise_is_test)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py
index 7f2352c5882ce36d8d681a737806f3ee0e3ace98..eaff45cbb2a58798e9d55149510bec72eea370cd 100644
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py
@@ -41,7 +41,7 @@ class TestLRNOp(OpTest):
         mid.fill(self.k)
         for m in range(0, self.N):
             for i in range(0, self.C):
-                for c in range(start, end + 1):
+                for c in range(start, end):
                     ch = i + c
                     if ch < 0 or ch >= self.C:
                         continue
diff --git a/python/paddle/fluid/tests/unittests/test_lstm_op.py b/python/paddle/fluid/tests/unittests/test_lstm_op.py
index f8ff5a3361af66612f08b2aa4eaffa363f04c594..705a24bd8f39a55e0a352944d961f8d33aaf96ff 100644
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
@@ -84,15 +84,17 @@ def lstm(
         h = g_o * act_cell(c)
         return h, c
 
-    def _reverse(x, lod):
+    def _reverse(x, offset):
         y = np.zeros_like(x)
-        for i in range(len(lod) - 1):
-            b, e = lod[i], lod[i + 1]
+        for i in range(len(offset) - 1):
+            b, e = offset[i], offset[i + 1]
             y[b:e, :] = np.flip(x[b:e, :], 0)
         return y
 
-    offset = lod[0]
-    batch_size = len(offset) - 1
+    offset = [0]
+    for l in lod[0]:
+        offset.append(offset[-1] + l)
+    batch_size = len(lod[0])
     hidden = []
     cell = []
     input = _reverse(input, offset) if is_reverse else input
@@ -100,7 +102,7 @@ def lstm(
         input = input + np.tile(w_b, (offset[-1], 1))
     for i in range(batch_size):
         # compute one sequence
-        seq_len = offset[i + 1] - offset[i]
+        seq_len = lod[0][i]
         x = input[offset[i]:offset[i + 1], :]
         h_pre = h0[i]  # 1 x D
         c_pre = c0[i]  # 1 x D
@@ -124,7 +126,7 @@ def lstm(
 
 class TestLstmOp(OpTest):
     def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
+        self.lod = [[2, 3, 2]]
         self.D = 16
 
         self.act_gate = 'sigmoid'
@@ -139,8 +141,8 @@ class TestLstmOp(OpTest):
         self.set_argument()
         self.op_type = 'lstm'
 
-        T = self.lod[0][-1]
-        N = len(self.lod[0]) - 1
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
 
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
         if self.has_initial_state:
@@ -186,7 +188,7 @@ class TestLstmOp(OpTest):
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchCellPreAct'] = np.zeros(
             (N, self.D)).astype('float64')
@@ -194,107 +196,104 @@ class TestLstmOp(OpTest):
             ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
 
 
-class TestLstmOpHasInitial(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = True
-        self.is_reverse = True
-        self.use_peepholes = True
-
-    def test_check_grad(self):
-        # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
-            max_relative_error=5e-4)
-
-    def test_check_grad_ingore_bias(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Bias'))
-
-    def test_check_grad_ingore_weight(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Weight'))
-
-    def test_check_grad_ingore_input(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Weight', 'Bias'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('Input'))
-
-    def test_check_grad_ingore_h0(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('H0'))
-
-    def test_check_grad_ingore_c0(self):
-        N = len(self.lod[0]) - 1
-        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
-        self.outputs['BatchCellPreAct'] = np.zeros(
-            (N, self.D)).astype('float64')
-        self.check_grad(
-            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
-            max_relative_error=5e-4,
-            no_grad_set=set('C0'))
-
-
-class TestLstmOpRerverse(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = False
-        self.is_reverse = True
-        self.use_peepholes = True
-
-
-class TestLstmOpNotUsePeepholes(TestLstmOp):
-    def set_argument(self):
-        self.lod = [[0, 2, 5, 7]]
-        self.D = 16
-
-        self.act_gate = 'sigmoid'
-        self.act_cell = 'tanh'
-        self.act_cand = 'tanh'
-
-        self.has_initial_state = False
-        self.is_reverse = True
-        self.use_peepholes = False
-
+# class TestLstmOpHasInitial(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[2, 3, 2]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = True
+#         self.is_reverse = True
+#         self.use_peepholes = True
+
+#     def test_check_grad(self):
+#         # TODO(qingqing) remove folowing lines after the check_grad is refined.
+#         N = len(self.lod[0])
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
+#             max_relative_error=5e-4)
+
+#     def test_check_grad_ingore_bias(self):
+#         N = len(self.lod[0])
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Bias'))
+
+#     def test_check_grad_ingore_weight(self):
+#         N = len(self.lod[0])
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Bias'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Weight'))
+
+#     def test_check_grad_ingore_input(self):
+#         N = len(self.lod[0])
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Weight', 'Bias'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('Input'))
+
+#     def test_check_grad_ingore_h0(self):
+#         N = len(self.lod[0])
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('H0'))
+
+#     def test_check_grad_ingore_c0(self):
+#         N = len(self.lod[0])
+#         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+#         self.outputs['BatchCellPreAct'] = np.zeros(
+#             (N, self.D)).astype('float64')
+#         self.check_grad(
+#             ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
+#             max_relative_error=5e-4,
+#             no_grad_set=set('C0'))
+
+# class TestLstmOpRerverse(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[2, 3, 2]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = False
+#         self.is_reverse = True
+#         self.use_peepholes = True
+
+# class TestLstmOpNotUsePeepholes(TestLstmOp):
+#     def set_argument(self):
+#         self.lod = [[2, 3, 2]]
+#         self.D = 16
+
+#         self.act_gate = 'sigmoid'
+#         self.act_cell = 'tanh'
+#         self.act_cand = 'tanh'
+
+#         self.has_initial_state = False
+#         self.is_reverse = True
+#         self.use_peepholes = False
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_lstmp_op.py b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
index afff133f6c6cfe45d1aca4014dc8b92e6562e6b8..ed2262da4bc727657c2e65d69cb1922891e17b09 100644
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
@@ -64,15 +64,17 @@ def lstmp(
         r = act_proj(r)
         return r, c
 
-    def _reverse(x, lod):
+    def _reverse(x, offset):
         y = np.zeros_like(x)
-        for i in range(len(lod) - 1):
-            b, e = lod[i], lod[i + 1]
+        for i in range(len(offset) - 1):
+            b, e = offset[i], offset[i + 1]
             y[b:e, :] = np.flip(x[b:e, :], 0)
         return y
 
-    offset = lod[0]
-    batch_size = len(offset) - 1
+    offset = [0]
+    for l in lod[0]:
+        offset.append(offset[-1] + l)
+    batch_size = len(lod[0])
     # recurrent projection state
     projection = []
     cell = []
@@ -81,7 +83,7 @@ def lstmp(
         input = input + np.tile(w_b, (offset[-1], 1))
     for i in range(batch_size):
         # compute one sequence
-        seq_len = offset[i + 1] - offset[i]
+        seq_len = lod[0][i]
         x = input[offset[i]:offset[i + 1], :]
         r_pre = np.dot(h0[i], w_rh)  # 1 x P
         r_pre = act_proj(r_pre)
@@ -117,8 +119,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
         self.reset_argument()
         self.op_type = 'lstmp'
 
-        T = self.lod[0][-1]
-        N = len(self.lod[0]) - 1
+        T = sum(self.lod[0])
+        N = len(self.lod[0])
 
         x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
         if self.has_initial_state:
@@ -166,7 +168,7 @@ class TestLstmpOp(LstmTest.TestLstmOp):
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -183,7 +185,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
 
     def test_check_grad(self):
         # TODO(qingqing) remove folowing lines after the check_grad is refined.
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -195,7 +197,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             max_relative_error=1e-2)
 
     def test_check_grad_ingore_bias(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -207,7 +209,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('Bias'))
 
     def test_check_grad_ingore_weight(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -219,7 +221,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('Weight'))
 
     def test_check_grad_ingore_proj_weight(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -231,7 +233,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('ProjWeight'))
 
     def test_check_grad_ingore_input(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -243,7 +245,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('Input'))
 
     def test_check_grad_ingore_h0(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
@@ -255,7 +257,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
             no_grad_set=set('H0'))
 
     def test_check_grad_ingore_c0(self):
-        N = len(self.lod[0]) - 1
+        N = len(self.lod[0])
         self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
         self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
         self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py
index 44ac4683891ffd3141a126740f4fddb47550e183..cae2c8fa87d9857de8f26cf4962d9370eca66243 100644
--- a/python/paddle/fluid/tests/unittests/test_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py
@@ -111,21 +111,24 @@ class Generator(object):
 
 
 # Generate test cases for all possibilities
-for dim_X in [1, 2, 3]:
-    for dim_Y in [1, 2, 3]:
-        for transpose_X in [False, True]:
-            for transpose_Y in [False, True]:
-                test_name = (
-                    'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
-                        dim_X, dim_Y, transpose_X, transpose_Y))
-                shape_X, shape_Y = generate_compatible_shapes(
-                    dim_X, dim_Y, transpose_X, transpose_Y)
-                globals()[test_name] = type(test_name, (Generator, OpTest), {
-                    'shape_X': shape_X,
-                    'shape_Y': shape_Y,
-                    'transpose_X': transpose_X,
-                    'transpose_Y': transpose_Y,
-                })
+def inject_test(dim_x, dim_y, trans_x, trans_y):
+    test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
+        dim_x, dim_y, trans_x, trans_y))
+    shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x,
+                                                  trans_y)
+    globals()[test_name] = type(test_name, (Generator, OpTest), {
+        'shape_X': shape_x,
+        'shape_Y': shape_y,
+        'transpose_X': trans_x,
+        'transpose_Y': trans_y,
+    })
+
+
+for dim_X in (1, 2, 3):
+    for dim_Y in (1, 2, 3):
+        for transose_x in (False, True):
+            for transose_y in (False, True):
+                inject_test(dim_X, dim_Y, transose_x, transose_y)
 
 
 # Test case n-dim
@@ -149,7 +152,7 @@ def generate_compatible_shapes(dim, transpose_X, transpose_Y):
     return shape_X, shape_Y
 
 
-# Test case n-dim
+# # Test case n-dim
 for dim in [4]:
     for transpose_X in [False, True]:
         for transpose_Y in [False, True]:
diff --git a/python/paddle/fluid/tests/unittests/test_mean_iou.py b/python/paddle/fluid/tests/unittests/test_mean_iou.py
new file mode 100644
index 0000000000000000000000000000000000000000..64d42b693bf11f3cb0153243909db4c0612bf4e7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
@@ -0,0 +1,114 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def compute_mean_iou(predictions, labels, num_classes, in_wrongs, in_corrects,
+                     in_mean_ious):
+    assert predictions.shape == labels.shape
+    predictions = predictions.flatten()
+    labels = labels.flatten()
+
+    out_wrong = np.zeros([num_classes]).astype("int32")
+    for _, wrong in in_wrongs:
+        out_wrong += wrong
+    out_correct = np.zeros([num_classes]).astype("int32")
+    for _, correct in in_corrects:
+        out_correct += correct
+
+    for pred, label in zip(predictions, labels):
+        if pred == label:
+            out_correct[pred] += 1
+        else:
+            out_wrong[pred] += 1
+            out_wrong[label] += 1
+
+    denominator = out_wrong + out_correct
+    valid_count = (denominator != 0).sum()
+    denominator = np.where(denominator > 0, denominator,
+                           np.ones(denominator.shape))
+    mean_iou = (out_correct / denominator).sum() / valid_count
+
+    for _, in_mean_iou in in_mean_ious:
+        mean_iou += in_mean_iou
+    return mean_iou, out_wrong, out_correct
+
+
+class TestMeanIOUOp(OpTest):
+    def setUp(self):
+        self.config()
+        self.op_type = "mean_iou"
+        predictions = np.random.randint(0, self.num_classes,
+                                        self.image_size).astype("int32")
+        labels = np.random.randint(0, self.num_classes,
+                                   self.image_size).astype("int32")
+
+        in_wrongs = []
+        for i in range(self.in_wrong_num):
+            in_wrongs.append(("in_wrong_%d" % i, np.random.randint(
+                0, 10, [self.num_classes]).astype("int32")))
+
+        in_corrects = []
+        for i in range(self.in_correct_num):
+            in_corrects.append(("in_correct_%d" % i, np.random.randint(
+                0, 10, [self.num_classes]).astype("int32")))
+
+        in_mean_ious = []
+        for i in range(self.in_mean_iou_num):
+            in_mean_ious.append(("in_mean_iou_%d" % i, np.random.uniform(
+                0, 1, [1]).astype("float32")))
+
+        self.inputs = {
+            'Predictions': predictions,
+            'Labels': labels,
+            'InWrongs': in_wrongs,
+            'InCorrects': in_corrects,
+            'InMeanIou': in_mean_ious
+        }
+        self.attrs = {'num_classes': long(self.num_classes)}
+        mean_iou, out_wrong, out_correct = compute_mean_iou(
+            predictions, labels, self.num_classes, in_wrongs, in_corrects,
+            in_mean_ious)
+        self.outputs = {
+            'OutMeanIou': mean_iou,
+            'OutWrong': out_wrong,
+            'OutCorrect': out_correct
+        }
+
+    def config(self):
+        self.num_classes = 10
+        self.image_size = [128, 128]
+        self.in_wrong_num = 0
+        self.in_correct_num = 0
+        self.in_mean_iou_num = 0
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCase1(TestMeanIOUOp):
+    def config(self):
+        self.num_classes = 5
+        self.image_size = [100, 128]
+        self.in_wrong_num = 2
+        self.in_correct_num = 2
+        self.in_mean_iou_num = 2
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
index f3dcca6b0107a9c4a6efcb0c0fd50324aaf92648..cfd6e63e12258a92447e68b4afbc7ead91b68cc1 100644
--- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py
@@ -18,7 +18,7 @@ import unittest
 import paddle.fluid.layers as layers
 import paddle.fluid.optimizer as optimizer
 from paddle.fluid.framework import Program, program_guard
-from paddle.fluid.memory_optimization_transpiler import memory_optimize
+from paddle.fluid.transpiler import memory_optimize
 
 
 class TestControlFlowGraph(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f209bdf30faffc0b2c7932b7b10f384d6d61a831
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
@@ -0,0 +1,38 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMergeIdsOp(OpTest):
+    def setUp(self):
+        self.op_type = "merge_ids"
+        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32')
+        x1 = np.array([]).astype('float32')
+        x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6],
+                       [0.5, 0.6]]).astype('float32')
+        out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3],
+                        [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32')
+        self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
old mode 100755
new mode 100644
index c27573c3d69037bc48e0b6a90636b3f027f15a41..54ee85c1a7a539fe9517f32adb35ab99b5ae2a07
--- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
@@ -70,7 +70,7 @@ class TestMineHardExamplesOp(OpTest):
 
         self.updated_match_indices = self.match_indices
 
-        self.neg_indices_lod = [[0, 1, 2]]
+        self.neg_indices_lod = [[1, 1]]
         self.neg_indices = np.array([[1], [0]]).astype('int32')
 
 
@@ -92,7 +92,7 @@ class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
         self.updated_match_indices = np.array([[0, -1, -1],
                                                [-1, -1, -1]]).astype('int32')
 
-        self.neg_indices_lod = [[0, 1, 3]]
+        self.neg_indices_lod = [[1, 2]]
         self.neg_indices = np.array([[2], [0], [2]]).astype('int32')
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 9d1da420c7f70bd2a89d183a5f0a2b145f0ff475..bbc782c1bce302df68ab30013f3a7667e51ed479 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -14,6 +14,7 @@
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
 from op_test import OpTest
 
 
@@ -21,8 +22,8 @@ class TestMulOp(OpTest):
     def setUp(self):
         self.op_type = "mul"
         self.inputs = {
-            'X': np.random.random((32, 84)).astype("float32"),
-            'Y': np.random.random((84, 100)).astype("float32")
+            'X': np.random.random((2, 5)).astype("float32"),
+            'Y': np.random.random((5, 3)).astype("float32")
         }
         self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
 
@@ -45,13 +46,16 @@ class TestMulOp2(OpTest):
     def setUp(self):
         self.op_type = "mul"
         self.inputs = {
-            'X': np.random.random((15, 4, 12, 10)).astype("float32"),
-            'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
+            'X': np.random.random((3, 4, 4, 3)).astype("float32"),
+            'Y': np.random.random((2, 6, 1, 2, 3)).astype("float32")
         }
-        self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
-        result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
-                        self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
-        result = result.reshape(15, 4, 8, 2, 9)
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+        }
+        result = np.dot(self.inputs['X'].reshape(3 * 4, 4 * 3),
+                        self.inputs['Y'].reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
         self.outputs = {'Out': result}
 
     def test_check_output(self):
@@ -69,5 +73,41 @@ class TestMulOp2(OpTest):
             ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
 
 
+class TestFP16MulOp1(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((3, 5)).astype("float16")
+        y = np.random.random((5, 4)).astype("float16")
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
+        self.outputs = {'Out': np.dot(x, y)}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-1)
+
+
+class TestFP16MulOp2(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        x = np.random.random((3, 4, 4, 3)).astype("float16")
+        y = np.random.random((2, 6, 1, 2, 3)).astype("float16")
+        self.inputs = {'X': x.view(np.float16), 'Y': y.view(np.float16)}
+        self.attrs = {
+            'x_num_col_dims': 2,
+            'y_num_col_dims': 2,
+        }
+        result = np.dot(x.reshape(3 * 4, 4 * 3), y.reshape(2 * 6, 1 * 2 * 3))
+        result = result.reshape(3, 4, 1, 2, 3)
+        self.outputs = {'Out': result}
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=2e-1)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbd510e64ffdd6f3b78b22bb0d37d9a7ba3fd9b5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -0,0 +1,76 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.fluid as fluid
+import paddle
+import paddle.dataset.mnist as mnist
+from shutil import copyfile
+
+
+class TestMultipleReader(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 64
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=self.batch_size)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batch = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist_0.recordio', reader, feeder)
+        copyfile('./mnist_0.recordio', './mnist_1.recordio')
+        copyfile('./mnist_0.recordio', './mnist_2.recordio')
+
+    def main(self, thread_num):
+        file_list = [
+            './mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio'
+        ]
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_files = fluid.layers.open_files(
+                filenames=file_list,
+                thread_num=thread_num,
+                shapes=[(-1, 784), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            img, label = fluid.layers.read_file(data_files)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            batch_count = 0
+            while True:
+                try:
+                    img_val, = exe.run(fetch_list=[img])
+                except fluid.core.EOFException:
+                    break
+                batch_count += 1
+                self.assertLessEqual(img_val.shape[0], self.batch_size)
+            self.assertEqual(batch_count, self.num_batch * 3)
+
+    def test_main(self):
+        self.main(thread_num=3)  # thread number equals to file number
+        self.main(thread_num=10)  # thread number is larger than file number
+        self.main(thread_num=2)  # thread number is less than file number
diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fc9f550440d3d0e1a8182a69f5692b3df0aa258
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
@@ -0,0 +1,66 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.fluid as fluid
+import paddle
+import paddle.dataset.mnist as mnist
+
+
+class TestMultipleReader(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 64
+        self.pass_num = 3
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = paddle.batch(mnist.train(), batch_size=self.batch_size)
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batch = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist.recordio', data_file, feeder)
+
+    def test_main(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = fluid.layers.open_recordio_file(
+                filename='./mnist.recordio',
+                shapes=[(-1, 784), (-1, 1)],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'],
+                pass_num=self.pass_num)
+            img, label = fluid.layers.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            batch_count = 0
+            while True:
+                try:
+                    img_val, = exe.run(fetch_list=[img])
+                except fluid.core.EOFException:
+                    break
+                batch_count += 1
+                self.assertLessEqual(img_val.shape[0], self.batch_size)
+            self.assertEqual(batch_count, self.num_batch * self.pass_num)
diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
index 6459913c0162374e17d0249627e7107a195babf8..aacd8ae45af10a2b19d2903ab121e9bb4f9de7ff 100644
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
@@ -135,12 +135,12 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
     batch_size = scores.shape[0]
 
     det_outs = []
-    lod = [0]
+    lod = []
     for n in range(batch_size):
         nmsed_outs, nmsed_num = multiclass_nms(boxes[n], scores[n], background,
                                                score_threshold, nms_threshold,
                                                nms_top_k, keep_top_k)
-        lod.append(lod[-1] + nmsed_num)
+        lod.append(nmsed_num)
         if nmsed_num == 0: continue
 
         for c, indices in nmsed_outs.iteritems():
diff --git a/python/paddle/fluid/tests/unittests/test_net.py b/python/paddle/fluid/tests/unittests/test_net.py
deleted file mode 100644
index ae1699d647d7c0adab36200fb07bde12085053c1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_net.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import unittest
-
-
-def fc(X, W, Y):
-    ret_v = core.Net.create()
-
-    ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
-    ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y))
-    ret_v.complete_add_op(True)
-    return ret_v
-
-
-class TestNet(unittest.TestCase):
-    def test_net_all(self):
-        net = core.Net.create()
-        op1 = Operator("sum", X=["X", "Y"], Out="Out")
-        net.append_op(op1)
-
-        net2 = core.Net.create()
-        net2.append_op(fc(X="X", W="w", Y="fc.out"))
-        net2.complete_add_op(True)
-        net.append_op(net2)
-        net.complete_add_op(True)
-
-        expected = '''
-Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-    Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
-    Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-        Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-            Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
-            Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}.
-'''
-        self.assertEqual(expected, "\n" + str(net))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4835dd18405fc7a0d508a780a734922e0abd12c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
@@ -0,0 +1,74 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.executor import Executor
+
+BATCH_SIZE = 20
+
+
+class TestNetWithDtype(unittest.TestCase):
+    def setUp(self):
+        self.dtype = "float64"
+        self.init_dtype()
+
+    def run_net_on_place(self, place):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            x = fluid.layers.data(name='x', shape=[13], dtype=self.dtype)
+            y = fluid.layers.data(name='y', shape=[1], dtype=self.dtype)
+            y_predict = fluid.layers.fc(input=x, size=1, act=None)
+            cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = fluid.layers.mean(cost)
+            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd_optimizer.minimize(avg_cost)
+
+        fetch_list = [avg_cost]
+        train_reader = paddle.batch(
+            paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+        feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        for data in train_reader():
+            exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list)
+            # the main program is runable, the datatype is fully supported
+            break
+
+    def init_dtype(self):
+        pass
+
+    def test_cpu(self):
+        place = fluid.CPUPlace()
+        self.run_net_on_place(place)
+
+    def test_gpu(self):
+        if not core.is_compiled_with_cuda():
+            return
+        place = fluid.CUDAPlace(0)
+        self.run_net_on_place(place)
+
+
+# TODO(dzhwinter): make sure the fp16 is runable
+# class TestFloat16(TestNetWithDtype):
+#     def init_dtype(self):
+#         self.dtype = "float16"
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py
index 6feda175fb537db894ac7f19e22297f6062a4d61..108a665f37f5cd652ec83f784a56ca52e6b49fe8 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_op.py
@@ -17,44 +17,23 @@ import numpy as np
 from op_test import OpTest
 
 
-def norm(input, scale, epsilon):
-    s0, s1, s2, s3 = input.shape
-    x_square = input * input
-    for i in xrange(s0):
-        input_batch = input[i:i + 1, :, :, :]
-        input_batch = input_batch.reshape(s1, s2 * s3)
-        x_square_batch = x_square[i:i + 1, :, :, :]
-        x_square_batch = x_square_batch.reshape(s1, s2 * s3)
-        square_colsum = x_square_batch.sum(axis=0) + epsilon
-        tmp = pow(square_colsum, 0.5)
-        tmp = np.reciprocal(tmp)
-        tmp_tile = np.tile(tmp, s1)
-        tmp_tile = tmp_tile.reshape(s1, s2 * s3)
-        scale_tile = np.tile(scale, (1, s2 * s3))
-        scale_tile = scale_tile.reshape(s1, s2 * s3)
-        out_batch = input_batch * tmp_tile * scale_tile
-        out_batch = out_batch.reshape(1, s1, s2, s3)
-        if i == 0:
-            out = out_batch
-        else:
-            out = np.concatenate((out, out_batch), 0)
-    out.reshape(s0, s1, s2, s3)
-    return out
+def l2_norm(x, axis, epsilon):
+    x2 = x**2
+    s = np.sum(x2, axis=axis, keepdims=True)
+    r = np.sqrt(s + epsilon)
+    y = x / np.broadcast_to(r, x.shape)
+    return y, r
 
 
 class TestNormOp(OpTest):
     def setUp(self):
         self.op_type = "norm"
         self.init_test_case()
-        input = np.random.random(self.shape).astype("float32")
-        scale = np.array([10, 10, 10])
-        self.inputs = {
-            'X': input.astype('float32'),
-            'Scale': scale.astype('float32')
-        }
-        self.attrs = {'epsilon': self.epsilon}
-        output = norm(input, scale, self.epsilon)
-        self.outputs = {'Out': output.astype('float32')}
+        x = np.random.random(self.shape).astype("float64")
+        y, norm = l2_norm(x, self.axis, self.epsilon)
+        self.inputs = {'X': x}
+        self.attrs = {'epsilon': self.epsilon, 'axis': self.axis}
+        self.outputs = {'Out': y, 'Norm': norm}
 
     def test_check_output(self):
         self.check_output()
@@ -63,8 +42,23 @@ class TestNormOp(OpTest):
         self.check_grad(['X'], 'Out')
 
     def init_test_case(self):
-        self.shape = [2, 3, 2, 2]
-        self.epsilon = 1e-6
+        self.shape = [2, 3, 4, 4]
+        self.axis = 1
+        self.epsilon = 1e-8
+
+
+class TestNormOp2(TestNormOp):
+    def init_test_case(self):
+        self.shape = [5, 3, 9, 7]
+        self.axis = 0
+        self.epsilon = 1e-8
+
+
+class TestNormOp3(TestNormOp):
+    def init_test_case(self):
+        self.shape = [5, 3, 2, 7]
+        self.axis = -1
+        self.epsilon = 1e-8
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
index ef34893943d8f6bf91b1eb14378e463c178de84d..198c68866d399023c51c2a43b588aa8ec49c3c9a 100644
--- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
+++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
@@ -70,8 +70,9 @@ class TestNormalization(unittest.TestCase):
     def l2_normalize(self, data, axis, epsilon):
         """ Compute the groundtruth.
         """
-        output = data * np.reciprocal(
-            np.sum(np.square(data), axis=axis, keepdims=True))
+        output = data / np.broadcast_to(
+            np.sqrt(np.sum(np.square(data), axis=axis, keepdims=True)),
+            data.shape)
         return output
 
     def test_l2_normalize(self):
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
index cd78cce8729ab2b5a0bb4817cf3022e53932283a..d13f2b3afde10f9b4e632094fa216d8729069afa 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
@@ -27,9 +27,9 @@ class TestOneHotOp(OpTest):
         self.op_type = 'one_hot'
         depth = 10
         dimension = 12
-        x_lod = [[0, 4, 5, 8, 11]]
-        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
-        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
+        x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
 
         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
@@ -50,9 +50,9 @@ class TestOneHotOp_default_dtype(OpTest):
         self.op_type = 'one_hot'
         depth = 10
         dimension = 12
-        x_lod = [[0, 4, 5, 8, 11]]
-        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
-        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+        x_lod = [[4, 1, 3, 3]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(sum(x_lod[0]))]
+        x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])
 
         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
@@ -75,11 +75,11 @@ class TestOneHotOp_exception(OpTest):
         self.place = core.CPUPlace()
         self.dimension = 12
         self.x = core.LoDTensor()
-        x_lod = [[0, 4, 5, 8, 11]]
-        data = [np.random.randint(11, 20) for i in xrange(x_lod[0][-1])]
-        data = np.array(data).astype('int').reshape([x_lod[0][-1], 1])
+        x_lod = [[4, 1, 3, 3]]
+        data = [np.random.randint(11, 20) for i in xrange(sum(x_lod[0]))]
+        data = np.array(data).astype('int').reshape([sum(x_lod[0]), 1])
         self.x.set(data, self.place)
-        self.x.set_lod(x_lod)
+        self.x.set_recursive_sequence_lengths(x_lod)
 
     def test_check_output(self):
         program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index 649fabe4a0cdef4c665f8a6d3ebee1bb8232185f..c098a5a0cb0364f9ec93c95c1ef50912e574b3d9 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -62,7 +62,8 @@ class TestOperator(unittest.TestCase):
         self.assertEqual(mul_op.output_names, ["Out"])
         self.assertEqual(mul_op.output("Out"), ["mul.out"])
         self.assertEqual(
-            set(mul_op.attr_names), set(["x_num_col_dims", "y_num_col_dims"]))
+            set(mul_op.attr_names),
+            set(["x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var"]))
         self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
         self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
         self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index e775db1d10f4561b6fb90631757a25c9f74cb777..7286c7c450108c4b5ad7136041bc4e989894a2ba 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -434,5 +434,71 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
         self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
 
 
+class TestFtrlOptimizer(unittest.TestCase):
+    class MockFtrl(optimizer.FtrlOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_squared_str(self):
+            return self._squared_acc_str
+
+        def get_linear_str(self):
+            return self._linear_acc_str
+
+    def test_ftrl_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="mul.x",
+            optimize_attr={'learning_rate': 1.1})
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        ftrl_optimizer = self.MockFtrl(
+            learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5)
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
+        opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                       init_program)
+        self.assertEqual(len(opts), 3)
+        self.assertEqual([op.type for op in opts],
+                         ["fill_constant", "elementwise_mul", "ftrl"])
+
+        # Check accumulators
+        accumulators = ftrl_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 2)
+        self.assertTrue(ftrl_optimizer.get_squared_str() in accumulators)
+        self.assertTrue(ftrl_optimizer.get_linear_str() in accumulators)
+        squared_acc = accumulators[ftrl_optimizer.get_squared_str()]
+        linear_acc = accumulators[ftrl_optimizer.get_linear_str()]
+        self.assertEqual(len(squared_acc), 1)
+        self.assertEqual(len(linear_acc), 1)
+        self.assertTrue(mul_x.name in squared_acc)
+        self.assertTrue(mul_x.name in linear_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 3)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
new file mode 100644
index 0000000000000000000000000000000000000000..63fb58c6927fa387b3b19147b9dc9d24bb8e5132
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
@@ -0,0 +1,213 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.conll05 as conll05
+import paddle.fluid as fluid
+import unittest
+import paddle
+import numpy as np
+import os
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_dict_len = len(verb_dict)
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+embedding_name = 'emb'
+
+
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            is_sparse, **ignored):
+    # 8 features
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        is_sparse=is_sparse,
+        size=[pred_dict_len, word_dim],
+        dtype='float32',
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        is_sparse=is_sparse,
+        size=[mark_dict_len, mark_dim],
+        dtype='float32')
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[word_dict_len, word_dim],
+            is_sparse=is_sparse,
+            input=x,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
+        for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
+    ])
+
+    return feature_out
+
+
+class TestCRFModel(unittest.TestCase):
+    def check_network_convergence(self,
+                                  is_sparse,
+                                  build_strategy=None,
+                                  use_cuda=True):
+        os.environ['CPU_NUM'] = str(4)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            word = fluid.layers.data(
+                name='word_data', shape=[1], dtype='int64', lod_level=1)
+            predicate = fluid.layers.data(
+                name='verb_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_n2 = fluid.layers.data(
+                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_n1 = fluid.layers.data(
+                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_0 = fluid.layers.data(
+                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_p1 = fluid.layers.data(
+                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+            ctx_p2 = fluid.layers.data(
+                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+            mark = fluid.layers.data(
+                name='mark_data', shape=[1], dtype='int64', lod_level=1)
+
+            feature_out = db_lstm(**locals())
+            target = fluid.layers.data(
+                name='target', shape=[1], dtype='int64', lod_level=1)
+            crf_cost = fluid.layers.linear_chain_crf(
+                input=feature_out,
+                label=target,
+                param_attr=fluid.ParamAttr(
+                    name='crfw', learning_rate=1e-1))
+            avg_cost = fluid.layers.mean(crf_cost)
+
+            sgd_optimizer = fluid.optimizer.SGD(
+                learning_rate=fluid.layers.exponential_decay(
+                    learning_rate=0.01,
+                    decay_steps=100000,
+                    decay_rate=0.5,
+                    staircase=True))
+            sgd_optimizer.minimize(avg_cost)
+
+            train_data = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.conll05.test(), buf_size=8192),
+                batch_size=16)
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup)
+
+            pe = fluid.ParallelExecutor(
+                use_cuda=use_cuda,
+                loss_name=avg_cost.name,
+                build_strategy=build_strategy)
+
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
+                    mark, target
+                ],
+                place=fluid.CPUPlace())
+
+            data = train_data()
+            for i in xrange(10):
+                cur_batch = next(data)
+                print pe.run(feed=feeder.feed(cur_batch),
+                             fetch_list=[avg_cost.name])[0]
+
+    @unittest.skip(reason="CI hangs")
+    def test_update_sparse_parameter_all_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy, use_cuda=True)
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy, use_cuda=False)
+
+    @unittest.skip(reason="CI hangs")
+    def test_update_dense_parameter_all_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy, use_cuda=True)
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy, use_cuda=False)
+
+    @unittest.skip(reason="CI hangs")
+    def test_update_sparse_parameter_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy, use_cuda=True)
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy, use_cuda=False)
+
+    @unittest.skip(reason="CI hangs")
+    def test_update_dense_parameter_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy, use_cuda=True)
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy, use_cuda=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f5d2f16773efb7537de85abec88344f8e0daa9f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.dataset.flowers as flowers
+import math
+import paddle.fluid as fluid
+import unittest
+import numpy as np
+import paddle
+import os
+
+
+def Lenet(data, class_dim):
+    conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None)
+    bn1 = fluid.layers.batch_norm(conv1, act='relu')
+    pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
+    conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None)
+    bn2 = fluid.layers.batch_norm(conv2, act='relu')
+    pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)
+
+    fc1 = fluid.layers.fc(pool2, size=500, act='relu')
+    fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')
+
+    return fc2
+
+
+class TestFetchOp(unittest.TestCase):
+    def parallel_exe(self, train_inputs, seed, use_cuda):
+        main = fluid.Program()
+        startup = fluid.Program()
+        startup.random_seed = seed
+        with fluid.program_guard(main, startup):
+            data = fluid.layers.data(
+                name='image', shape=[3, 224, 224], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            out = Lenet(data, class_dim=102)
+            loss = fluid.layers.cross_entropy(input=out, label=label)
+            loss = fluid.layers.mean(loss)
+
+            opt = fluid.optimizer.Momentum(
+                learning_rate=0.1,
+                momentum=0.9,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+
+            opt.minimize(loss)
+
+            # TODO(zcd): I found that onece the memory optimizer is open,
+            # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD,
+            # conv2d_1.b_0@GRAD. Those variables should not be pruned.
+            # fluid.memory_optimize(main)
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup)
+
+            feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
+            pe = fluid.ParallelExecutor(
+                use_cuda=use_cuda, loss_name=loss.name, main_program=main)
+
+            fetch_list = []
+            all_vars = main.global_block().vars
+            for k, v in all_vars.iteritems():
+                if 'tmp' not in k and k[0] is not '_' or v.persistable:
+                    fetch_list.append(k)
+
+            for data in train_inputs:
+                ret = pe.run(fetch_list,
+                             feed=feeder.feed(data),
+                             return_numpy=True)
+                for i in range(len(fetch_list)):
+                    assert not math.isnan(np.sum(ret[i])) and \
+                           not math.isinf(np.sum(ret[i]))
+
+    def test_fetch_op(self):
+        tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
+        tst_reader_iter = tst_reader()
+
+        iters = 3
+        train_inputs = []
+        for i in range(iters):
+            train_inputs.append(tst_reader_iter.next())
+
+        os.environ['CPU_NUM'] = str(4)
+        self.parallel_exe(train_inputs, seed=1, use_cuda=True)
+        self.parallel_exe(train_inputs, seed=1, use_cuda=False)
+
+
+class TestFeedParallel(unittest.TestCase):
+    def parallel_exe(self, use_cuda, seed):
+        main = fluid.Program()
+        startup = fluid.Program()
+        startup.random_seed = seed
+        with fluid.scope_guard(fluid.core.Scope()):
+            with fluid.program_guard(main, startup):
+                data = fluid.layers.data(
+                    name='image', shape=[3, 224, 224], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+                out = Lenet(data, class_dim=102)
+                loss = fluid.layers.cross_entropy(input=out, label=label)
+                loss = fluid.layers.mean(loss)
+                opt = fluid.optimizer.Momentum(
+                    learning_rate=0.1,
+                    momentum=0.9,
+                    regularization=fluid.regularizer.L2Decay(1e-4))
+
+                opt.minimize(loss)
+
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
+        reader = feeder.decorate_reader(
+            paddle.batch(
+                flowers.train(), batch_size=16), multi_devices=True)
+
+        exe = fluid.Executor(place)
+        exe.run(startup)
+
+        pe = fluid.ParallelExecutor(
+            use_cuda=use_cuda, loss_name=loss.name, main_program=main)
+
+        for batch_id, data in enumerate(reader()):
+            loss_np = pe.run(feed=data, fetch_list=[loss.name])[0]
+            print batch_id, loss_np
+            if batch_id == 2:
+                break
+
+    def test_feed_op(self):
+        os.environ['CPU_NUM'] = str(4)
+        self.parallel_exe(use_cuda=True, seed=1)
+        self.parallel_exe(use_cuda=False, seed=1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..a801d99aa1ced35eb7f081fde63ad541f0eb2589
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -0,0 +1,187 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from parallel_executor_test_base import TestParallelExecutorBase
+import paddle.fluid as fluid
+import numpy as np
+import paddle
+import paddle.dataset.mnist as mnist
+import unittest
+import os
+
+MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
+
+
+def simple_fc_net(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+    hidden = img
+    for _ in xrange(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def fc_with_batchnorm(use_feed):
+    if use_feed:
+        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    else:
+        reader = fluid.layers.open_files(
+            filenames=[MNIST_RECORDIO_FILE],
+            shapes=[[-1, 784], [-1, 1]],
+            lod_levels=[0, 0],
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
+        img, label = fluid.layers.read_file(reader)
+
+    hidden = img
+    for _ in xrange(1):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+
+        hidden = fluid.layers.batch_norm(input=hidden)
+
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestMNIST(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        # Convert mnist to recordio file
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                MNIST_RECORDIO_FILE, reader, feeder)
+
+    def check_simple_fc_convergence(self,
+                                    balance_parameter_opt_between_cards,
+                                    use_cuda=True):
+        self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
+        self.check_network_convergence(
+            simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
+
+        img = np.zeros(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        self.check_network_convergence(
+            simple_fc_net,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_simple_fc(self):
+        self.check_simple_fc_convergence(False, use_cuda=True)
+        self.check_simple_fc_convergence(False, use_cuda=False)
+
+    def test_simple_fc_with_new_strategy(self):
+        self.check_simple_fc_convergence(True, use_cuda=True)
+        self.check_simple_fc_convergence(True, use_cuda=False)
+
+    def check_simple_fc_parallel_accuracy(self,
+                                          balance_parameter_opt_between_cards,
+                                          use_cuda=True):
+        img = np.zeros(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        single_first_loss, single_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1000,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=False)
+        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
+            method=simple_fc_net,
+            seed=1000,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            use_parallel_executor=True,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+        for p_f in parallel_first_loss:
+            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
+        for p_l in parallel_last_loss:
+            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+
+    def test_simple_fc_parallel_accuracy(self):
+        self.check_simple_fc_parallel_accuracy(False, use_cuda=True)
+        self.check_simple_fc_parallel_accuracy(False, use_cuda=False)
+
+    def test_simple_fc_parallel_accuracy_with_new_strategy(self):
+        self.check_simple_fc_parallel_accuracy(True, use_cuda=True)
+        self.check_simple_fc_parallel_accuracy(True, use_cuda=False)
+
+    def check_batchnorm_fc_convergence(
+            self, balance_parameter_opt_between_cards, use_cuda):
+        self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
+        img = np.zeros(shape=[32, 784], dtype='float32')
+        label = np.ones(shape=[32, 1], dtype='int64')
+        self.check_network_convergence(
+            fc_with_batchnorm,
+            feed_dict={"image": img,
+                       "label": label},
+            use_cuda=use_cuda,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_batchnorm_fc(self):
+        self.check_batchnorm_fc_convergence(False, use_cuda=True)
+        self.check_batchnorm_fc_convergence(False, use_cuda=False)
+
+    def test_batchnorm_fc_with_new_strategy(self):
+        self.check_batchnorm_fc_convergence(True, use_cuda=True)
+        self.check_batchnorm_fc_convergence(True, use_cuda=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
new file mode 100644
index 0000000000000000000000000000000000000000..066299e6c6f7f6c159cb0886e86d3404b027b698
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+from parallel_executor_test_base import TestParallelExecutorBase
+import unittest
+import os
+
+
+def squeeze_excitation(input, num_channels, reduction_ratio):
+    # pool = fluid.layers.pool2d(
+    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
+    conv = input
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+
+    squeeze = fluid.layers.fc(input=pool,
+                              size=num_channels / reduction_ratio,
+                              act='relu')
+    excitation = fluid.layers.fc(input=squeeze,
+                                 size=num_channels,
+                                 act='sigmoid')
+    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+    return scale
+
+
+def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) / 2,
+        groups=groups,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]
+    if ch_in != ch_out:
+        if stride == 1:
+            filter_size = 1
+        else:
+            filter_size = 3
+        return conv_bn_layer(input, ch_out, filter_size, stride)
+    else:
+        return input
+
+
+def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
+    # The number of first 1x1 convolutional channels for each bottleneck build block
+    # was halved to reduce the compution cost.
+    conv0 = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    conv1 = conv_bn_layer(
+        input=conv0,
+        num_filters=num_filters * 2,
+        filter_size=3,
+        stride=stride,
+        groups=cardinality,
+        act='relu')
+    conv2 = conv_bn_layer(
+        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+    scale = squeeze_excitation(
+        input=conv2,
+        num_channels=num_filters * 2,
+        reduction_ratio=reduction_ratio)
+
+    short = shortcut(input, num_filters * 2, stride)
+
+    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+
+def SE_ResNeXt50Small(batch_size=2, use_feed=False):
+    assert not use_feed, "SE_ResNeXt doesn't support feed yet"
+
+    img = fluid.layers.fill_constant(
+        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
+    label = fluid.layers.fill_constant(
+        shape=[batch_size, 1], dtype='int64', value=0.0)
+
+    conv = conv_bn_layer(
+        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
+    conv = fluid.layers.pool2d(
+        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+
+    cardinality = 32
+    reduction_ratio = 16
+    depth = [3, 4, 6, 3]
+    num_filters = [128, 256, 512, 1024]
+
+    for block in range(len(depth)):
+        for i in range(depth[block]):
+            conv = bottleneck_block(
+                input=conv,
+                num_filters=num_filters[block],
+                stride=2 if i == 0 and block != 0 else 1,
+                cardinality=cardinality,
+                reduction_ratio=reduction_ratio)
+
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+    # Classifier layer:
+    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class TestResnet(TestParallelExecutorBase):
+    def check_resnet_convergence(self,
+                                 balance_parameter_opt_between_cards,
+                                 use_cuda=True,
+                                 iter=20):
+        os.environ['CPU_NUM'] = str(4)
+
+        import functools
+        batch_size = 2
+        self.check_network_convergence(
+            functools.partial(
+                SE_ResNeXt50Small, batch_size=batch_size),
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
+        )
+
+    def test_resnet(self):
+        self.check_resnet_convergence(False, use_cuda=True)
+        self.check_resnet_convergence(False, use_cuda=False, iter=5)
+
+    def test_resnet_with_new_strategy(self):
+        self.check_resnet_convergence(True, use_cuda=True)
+        self.check_resnet_convergence(True, use_cuda=False, iter=5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a2733927d38f1a2b1af92fcc12f036158b4d06f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import numpy as np
+import unittest
+import os
+import sys
+import math
+
+
+def simple_fc_net():
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = img
+    for _ in xrange(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='tanh',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+class ParallelExecutorTestingDuringTraining(unittest.TestCase):
+    def check_network_convergence(self, use_cuda, build_strategy=None):
+        os.environ['CPU_NUM'] = str(4)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net()
+            test_program = main.clone(for_test=True)
+
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
+            opt.minimize(loss)
+
+            batch_size = 32
+            image = np.random.normal(size=(batch_size, 784)).astype('float32')
+            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
+
+            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(startup)
+            feed_dict = {'image': image, 'label': label}
+
+            train_exe = fluid.ParallelExecutor(
+                use_cuda=use_cuda,
+                loss_name=loss.name,
+                main_program=main,
+                build_strategy=build_strategy)
+
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=use_cuda,
+                main_program=test_program,
+                share_vars_from=train_exe,
+                build_strategy=build_strategy)
+
+            for i in xrange(5):
+                test_loss, = test_exe.run([loss.name], feed=feed_dict)
+
+                train_loss, = train_exe.run([loss.name], feed=feed_dict)
+
+                avg_test_loss_val = np.array(test_loss).mean()
+                if math.isnan(float(avg_test_loss_val)):
+                    sys.exit("got NaN loss, testing failed.")
+
+                avg_train_loss_val = np.array(train_loss).mean()
+                if math.isnan(float(avg_train_loss_val)):
+                    sys.exit("got NaN loss, training failed.")
+
+                self.assertTrue(
+                    np.allclose(
+                        train_loss, test_loss, atol=1e-8),
+                    "Train loss: " + str(train_loss) + "\n Test loss:" +
+                    str(test_loss))
+
+    def test_parallel_testing(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            use_cuda=True, build_strategy=build_strategy)
+        self.check_network_convergence(
+            use_cuda=False, build_strategy=build_strategy)
+
+    def test_parallel_testing_with_new_strategy(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(
+            use_cuda=True, build_strategy=build_strategy)
+        self.check_network_convergence(
+            use_cuda=False, build_strategy=build_strategy)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6215fddb11bb6b3a76b5a6395e7254d21971c13
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import transformer_model
+import numpy as np
+from parallel_executor_test_base import TestParallelExecutorBase
+import unittest
+import paddle
+import paddle.dataset.wmt16 as wmt16
+import os
+
+WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"
+
+
+class ModelHyperParams(object):
+    # Dictionary size for source and target language. This model directly uses
+    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
+    # alreay been added, but the <pad> token is not added. Transformer requires
+    # sequences in a mini-batch are padded to have the same length. A <pad> token is
+    # added into the original dictionary in paddle.dateset.wmt16.
+
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # index for <pad> token in source language.
+    src_pad_idx = src_vocab_size
+
+    # size of target word dictionay
+    trg_vocab_size = 10000
+    # index for <pad> token in target language.
+    trg_pad_idx = trg_vocab_size
+
+    # position value corresponding to the <pad> token.
+    pos_pad_idx = 0
+
+    # max length of sequences. It should plus 1 to include position
+    # padding token for position encoding.
+    max_length = 50
+
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 1024
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rate used by all dropout layers.
+    dropout = 0.1
+
+
+def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
+    """
+    Pad the instances to the max sequence length in batch, and generate the
+    corresponding position data and attention bias. Then, convert the numpy
+    data to tensors and return a dict mapping names to tensors.
+    """
+
+    def __pad_batch_data(insts,
+                         pad_idx,
+                         is_target=False,
+                         return_pos=True,
+                         return_attn_bias=True,
+                         return_max_len=True):
+        """
+        Pad the instances to the max sequence length in batch, and generate the
+        corresponding position data and attention bias.
+        """
+        return_list = []
+        max_len = max(len(inst) for inst in insts)
+        inst_data = np.array(
+            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_data.astype("int64").reshape([-1, 1])]
+        if return_pos:
+            inst_pos = np.array([[
+                pos_i + 1 if w_i != pad_idx else 0
+                for pos_i, w_i in enumerate(inst)
+            ] for inst in inst_data])
+
+            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
+        if return_attn_bias:
+            if is_target:
+                # This is used to avoid attention on paddings and subsequent
+                # words.
+                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
+                                              max_len))
+                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
+                    [-1, 1, max_len, max_len])
+                slf_attn_bias_data = np.tile(slf_attn_bias_data,
+                                             [1, n_head, 1, 1]) * [-1e9]
+            else:
+                # This is used to avoid attention on paddings.
+                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
+                                               (max_len - len(inst))
+                                               for inst in insts])
+                slf_attn_bias_data = np.tile(
+                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
+                    [1, n_head, max_len, 1])
+            return_list += [slf_attn_bias_data.astype("float32")]
+        if return_max_len:
+            return_list += [max_len]
+        return return_list if len(return_list) > 1 else return_list[0]
+
+    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
+        [inst[0] for inst in insts], src_pad_idx, is_target=False)
+    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
+        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
+    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
+                                [1, 1, trg_max_len, 1]).astype("float32")
+    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
+                                False, False, False)
+    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    return [
+        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
+        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
+    ]
+
+
+def transformer(use_feed):
+    assert not use_feed, "transfomer doesn't support feed yet"
+    return transformer_model.transformer(
+        ModelHyperParams.src_vocab_size + 1,
+        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
+        ModelHyperParams.n_layer, ModelHyperParams.n_head,
+        ModelHyperParams.d_key, ModelHyperParams.d_value,
+        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
+        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+
+
+class TestTransformer(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        reader = paddle.batch(
+            wmt16.train(ModelHyperParams.src_vocab_size,
+                        ModelHyperParams.trg_vocab_size),
+            batch_size=transformer_model.batch_size)
+
+        with fluid.recordio_writer.create_recordio_writer(
+                WMT16_RECORDIO_FILE) as writer:
+            for batch in reader():
+                for tensor in prepare_batch_input(
+                        batch, ModelHyperParams.src_pad_idx,
+                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
+                    t = fluid.LoDTensor()
+                    t.set(tensor, fluid.CPUPlace())
+                    writer.append_tensor(t)
+                writer.complete_append_tensor()
+
+    @unittest.skip("transformer is buggy in multi gpu")
+    def test_main(self):
+        self.check_network_convergence(transformer, use_cuda=True)
+        self.check_network_convergence(transformer, use_cuda=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py
index 1a7551c57b26f576ab286e7b18177b9120261623..79bea148f9398152a02d70946cdc5fff1f47ba6b 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
@@ -15,6 +15,7 @@
 import unittest
 
 import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
 import numpy
 
 
@@ -60,20 +61,23 @@ class BaseParallelForTest(unittest.TestCase):
                 feed=feed,
                 fetch=fetch,
                 place=gpu,
-                use_parallel=False)
+                use_parallel=False,
+                use_gpu=True)
             result_gpu_parallel = self._run_test_impl_(
                 callback=callback,
                 feed=feed,
                 fetch=fetch,
                 place=gpu,
-                use_parallel=True)
+                use_parallel=True,
+                use_gpu=True)
             result_gpu_nccl = self._run_test_impl_(
                 callback=callback,
                 feed=feed,
                 fetch=fetch,
                 place=gpu,
                 use_parallel=True,
-                use_nccl=True)
+                use_nccl=True,
+                use_gpu=True)
             self._assert_same_(fetch, result_cpu, result_cpu_parallel,
                                result_gpu, result_gpu_parallel, result_gpu_nccl)
         else:
@@ -85,7 +89,8 @@ class BaseParallelForTest(unittest.TestCase):
                         fetch,
                         place,
                         use_parallel=False,
-                        use_nccl=False):
+                        use_nccl=False,
+                        use_gpu=False):
         """
         Run a single test, returns the fetch values
         Args:
@@ -132,7 +137,12 @@ class BaseParallelForTest(unittest.TestCase):
 
         exe = fluid.Executor(place)
         exe.run(startup)
-        return exe.run(main, feed=feed, fetch_list=fetch)
+        if use_gpu:
+            profile_type = 'GPU'
+        else:
+            profile_type = 'CPU'
+        with profiler.profiler(profile_type, 'total', '/tmp/profiler'):
+            return exe.run(main, feed=feed, fetch_list=fetch)
 
     def _assert_same_(self, fetch, *args):
         """
diff --git a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..2105d320665367e3ec1bfd7b3a353a144c91244f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def PolygonBoxRestore(input):
+    shape = input.shape
+    batch_size = shape[0]
+    geo_channels = shape[1]
+    h = shape[2]
+    w = shape[3]
+    h_indexes = np.array(range(h) * w).reshape(
+        [w, h]).transpose()[np.newaxis, :]  # [1, h, w]
+    w_indexes = np.array(range(w) * h).reshape(
+        [h, w])[np.newaxis, :]  # [1, h, w]
+    indexes = np.concatenate(
+        (w_indexes, h_indexes))[np.newaxis, :]  # [1, 2, h, w]
+    indexes = indexes.repeat(
+        [geo_channels / 2],
+        axis=0)[np.newaxis, :]  # [1, geo_channels/2, 2, h, w]
+    indexes = indexes.repeat(
+        [batch_size], axis=0)  # [batch_size, geo_channels/2, 2, h, w]
+    return indexes.reshape(
+        input.shape) - input  # [batch_size, geo_channels, h, w]
+
+
+class TestPolygonBoxRestoreOp(OpTest):
+    def config(self):
+        self.input_shape = (1, 8, 2, 2)
+
+    def setUp(self):
+        self.config()
+        self.op_type = "polygon_box_transform"
+        input = np.random.random(self.input_shape).astype("float32")
+        self.inputs = {'Input': input}
+        output = PolygonBoxRestore(input)
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCase1(TestPolygonBoxRestoreOp):
+    def config(self):
+        self.input_shape = (2, 10, 3, 2)
+
+
+class TestCase2(TestPolygonBoxRestoreOp):
+    def config(self):
+        self.input_shape = (3, 12, 4, 5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..003ebba18b26198427d9f313596ae85656ac24fa
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
@@ -0,0 +1,50 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from test_pool2d_op import TestPool2d_Op, TestCase1, TestCase2, TestCase3, TestCase4, TestCase5
+
+
+class TestMKLDNNCase1(TestPool2d_Op):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase2(TestCase1):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase3(TestCase2):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase4(TestCase3):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase5(TestCase4):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+class TestMKLDNNCase6(TestCase5):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
index 964d78f1966aa10e36eeaabe943d44e002d50293..f7e1e8573290766cde0c35816d687e7ba6fa4220 100644
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -78,20 +78,22 @@ def avg_pool2D_forward_naive(x,
 
 class TestPool2d_Op(OpTest):
     def setUp(self):
+        self.op_type = "pool2d"
         self.use_cudnn = False
         self.use_mkldnn = False
+        self.dtype = np.float32
         self.init_test_case()
         self.init_global_pool()
-        self.init_op_type()
+        self.init_kernel_type()
         self.init_pool_type()
         self.init_ceil_mode()
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
-        input = np.random.random(self.shape).astype("float32")
+        input = np.random.random(self.shape).astype(self.dtype)
         output = self.pool2D_forward_naive(input, self.ksize, self.strides,
                                            self.paddings, self.global_pool,
-                                           self.ceil_mode).astype("float32")
-        self.inputs = {'X': input}
+                                           self.ceil_mode).astype(self.dtype)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
             'strides': self.strides,
@@ -105,17 +107,22 @@ class TestPool2d_Op(OpTest):
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
         }
 
-        self.outputs = {'Out': output.astype('float32')}
+        self.outputs = {'Out': output}
+
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
 
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn and self.pool_type != "max":
+        if self.dtype == np.float16:
+            return
+        if self.testcudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, set(['X']), 'Out', max_relative_error=0.07)
@@ -128,8 +135,8 @@ class TestPool2d_Op(OpTest):
         self.strides = [1, 1]
         self.paddings = [0, 0]
 
-    def init_op_type(self):
-        self.op_type = "pool2d"
+    def init_kernel_type(self):
+        pass
 
     def init_pool_type(self):
         self.pool_type = "avg"
@@ -149,9 +156,6 @@ class TestCase1(TestPool2d_Op):
         self.strides = [1, 1]
         self.paddings = [0, 0]
 
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "avg"
         self.pool2D_forward_naive = avg_pool2D_forward_naive
@@ -167,9 +171,6 @@ class TestCase2(TestPool2d_Op):
         self.strides = [1, 1]
         self.paddings = [1, 1]
 
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "avg"
         self.pool2D_forward_naive = avg_pool2D_forward_naive
@@ -179,27 +180,18 @@ class TestCase2(TestPool2d_Op):
 
 
 class TestCase3(TestPool2d_Op):
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase4(TestCase1):
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
 
 
 class TestCase5(TestCase2):
-    def init_op_type(self):
-        self.op_type = "pool2d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
@@ -207,39 +199,105 @@ class TestCase5(TestCase2):
 
 #--------------------test pool2d--------------------
 class TestCUDNNCase1(TestPool2d_Op):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase1(TestPool2d_Op):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase2(TestCase1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase2(TestCase1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase3(TestCase2):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase3(TestCase2):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase4(TestCase3):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase4(TestCase3):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase5(TestCase4):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase5(TestCase4):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase6(TestCase5):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool2d"
+
+
+class TestFP16CUDNNCase6(TestCase5):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCeilModeCase1(TestCUDNNCase1):
@@ -262,42 +320,5 @@ class TestCeilModeCase4(TestCase2):
         self.ceil_mode = True
 
 
-#--------------------test pool2d MKLDNN--------------------
-class TestMKLDNNCase1(TestPool2d_Op):
-    def init_op_type(self):
-        self.use_mkldnn = True
-        self.op_type = "pool2d"
-
-
-class TestMKLDNNCase2(TestCase1):
-    def init_op_type(self):
-        self.use_mkldnn = True
-        self.op_type = "pool2d"
-
-
-class TestMKLDNNCase3(TestCase2):
-    def init_op_type(self):
-        self.use_mkldnn = True
-        self.op_type = "pool2d"
-
-
-class TestMKLDNNCase4(TestCase3):
-    def init_op_type(self):
-        self.use_mkldnn = True
-        self.op_type = "pool2d"
-
-
-class TestMKLDNNCase5(TestCase4):
-    def init_op_type(self):
-        self.use_mkldnn = True
-        self.op_type = "pool2d"
-
-
-class TestMKLDNNCase6(TestCase5):
-    def init_op_type(self):
-        self.use_mkldnn = True
-        self.op_type = "pool2d"
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
index 15a8ac5e2029eec204d061d1832df3df90339697..142165f29beeaedfaa660f04424147e06710d192 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -90,20 +90,22 @@ def avg_pool3D_forward_naive(x,
 
 class TestPool3d_Op(OpTest):
     def setUp(self):
+        self.op_type = "pool3d"
         self.use_cudnn = False
+        self.dtype = np.float32
         self.init_test_case()
         self.init_global_pool()
-        self.init_op_type()
+        self.init_kernel_type()
         self.init_pool_type()
         self.init_ceil_mode()
 
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
-        input = np.random.random(self.shape).astype("float32")
+        input = np.random.random(self.shape).astype(self.dtype)
         output = self.pool3D_forward_naive(input, self.ksize, self.strides,
                                            self.paddings, self.global_pool,
-                                           self.ceil_mode).astype("float32")
-        self.inputs = {'X': input}
+                                           self.ceil_mode).astype(self.dtype)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
             'strides': self.strides,
@@ -116,17 +118,22 @@ class TestPool3d_Op(OpTest):
             'data_format': 'AnyLayout'  # TODO(dzhwinter) : should be fix latter
         }
 
-        self.outputs = {'Out': output.astype('float32')}
+        self.outputs = {'Out': output}
+
+    def testcudnn(self):
+        return core.is_compiled_with_cuda() and self.use_cudnn
 
     def test_check_output(self):
-        if self.use_cudnn:
+        if self.testcudnn():
             place = core.CUDAPlace(0)
             self.check_output_with_place(place, atol=1e-5)
         else:
             self.check_output()
 
     def test_check_grad(self):
-        if self.use_cudnn and self.pool_type != "max":
+        if self.dtype == np.float16:
+            return
+        if self.testcudnn() and self.pool_type != "max":
             place = core.CUDAPlace(0)
             self.check_grad_with_place(
                 place, set(['X']), 'Out', max_relative_error=0.07)
@@ -139,8 +146,8 @@ class TestPool3d_Op(OpTest):
         self.strides = [1, 1, 1]
         self.paddings = [0, 0, 0]
 
-    def init_op_type(self):
-        self.op_type = "pool3d"
+    def init_kernel_type(self):
+        pass
 
     def init_pool_type(self):
         self.pool_type = "avg"
@@ -155,15 +162,11 @@ class TestPool3d_Op(OpTest):
 
 class TestCase1(TestPool3d_Op):
     def init_test_case(self):
-        self.op_type = "pool3d"
         self.shape = [2, 3, 7, 7, 7]
         self.ksize = [3, 3, 3]
         self.strides = [1, 1, 1]
         self.paddings = [0, 0, 0]
 
-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "avg"
         self.pool3D_forward_naive = avg_pool3D_forward_naive
@@ -179,9 +182,6 @@ class TestCase2(TestPool3d_Op):
         self.strides = [1, 1, 1]
         self.paddings = [1, 1, 1]
 
-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "avg"
         self.pool3D_forward_naive = avg_pool3D_forward_naive
@@ -191,27 +191,18 @@ class TestCase2(TestPool3d_Op):
 
 
 class TestCase3(TestPool3d_Op):
-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
 
 
 class TestCase4(TestCase1):
-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
 
 
 class TestCase5(TestCase2):
-    def init_op_type(self):
-        self.op_type = "pool3d"
-
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool3D_forward_naive = max_pool3D_forward_naive
@@ -219,39 +210,105 @@ class TestCase5(TestCase2):
 
 #--------------------test pool3d--------------------
 class TestCUDNNCase1(TestPool3d_Op):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"
+
+
+class TestFP16CUDNNCase1(TestPool3d_Op):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase2(TestCase1):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"
+
+
+class TestFP16CUDNNCase2(TestCase1):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase3(TestCase2):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"
+
+
+class TestFP16CUDNNCase3(TestCase2):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase4(TestCase3):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"
+
+
+class TestFP16CUDNNCase4(TestCase3):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase5(TestCase4):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"
+
+
+class TestFP16CUDNNCase5(TestCase4):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCUDNNCase6(TestCase5):
-    def init_op_type(self):
+    def init_kernel_type(self):
         self.use_cudnn = True
-        self.op_type = "pool3d"
+
+
+class TestFP16CUDNNCase6(TestCase5):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
 
 
 class TestCeilModeCase1(TestCUDNNCase1):
diff --git a/python/paddle/fluid/tests/unittests/test_preprocessor.py b/python/paddle/fluid/tests/unittests/test_preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbf1a7e0c50a87cd43507ffdb94109873cf4e5d9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_preprocessor.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.v2 as paddle
+import paddle.v2.dataset.mnist as mnist
+
+
+class TestPreprocessor(unittest.TestCase):
+    def setUp(self):
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(mnist.train(), batch_size=32)
+            feeder = fluid.DataFeeder(
+                feed_list=[  # order is image and label
+                    fluid.layers.data(
+                        name='image', shape=[784]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
+                './mnist_for_preprocessor_test.recordio', reader, feeder)
+
+    def test_main(self):
+        N = 10
+
+        img_expected_res = []
+        lbl_expected_res = []
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = fluid.layers.io.open_recordio_file(
+                './mnist_for_preprocessor_test.recordio',
+                shapes=[[-1, 784], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            img, lbl = fluid.layers.io.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for _ in range(N):
+                img_v, lbl_v = exe.run(fetch_list=[img, lbl])
+                img_expected_res.append(img_v / 2)
+                lbl_expected_res.append(lbl_v + 1)
+
+        img_actual_res = []
+        lbl_actual_res = []
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            data_file = fluid.layers.io.open_recordio_file(
+                './mnist_for_preprocessor_test.recordio',
+                shapes=[[-1, 784], [-1, 1]],
+                lod_levels=[0, 0],
+                dtypes=['float32', 'int64'])
+            preprocessor = fluid.layers.io.Preprocessor(reader=data_file)
+            with preprocessor.block():
+                img, lbl = preprocessor.inputs()
+                img_out = img / 2
+                lbl_out = lbl + 1
+                preprocessor.outputs(img_out, lbl_out)
+
+            data_file = fluid.layers.io.double_buffer(preprocessor())
+            img, lbl = fluid.layers.io.read_file(data_file)
+
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            for _ in range(N):
+                img_v, lbl_v = exe.run(fetch_list=[img, lbl])
+                img_actual_res.append(img_v)
+                lbl_actual_res.append(lbl_v)
+
+        for idx in range(N):
+            np.allclose(img_expected_res[idx], img_actual_res[idx])
+            np.allclose(lbl_expected_res[idx], lbl_actual_res[idx])
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index c75080fbb96d472810e5d6a1d02a77c456006f66..e01af42a58b86042fd0282928d1a78d9c3239fe3 100644
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -28,7 +28,7 @@ class TestPrintOpCPU(unittest.TestCase):
         self.x_tensor = core.LoDTensor()
         tensor_np = np.random.random(size=(2, 3)).astype('float32')
         self.x_tensor.set(tensor_np, self.place)
-        self.x_tensor.set_lod([[0, 1, 1]])
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
     def build_network(self, only_forward, **kargs):
         x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
@@ -62,7 +62,7 @@ class TestPrintOpGPU(TestPrintOpCPU):
         self.x_tensor = core.LoDTensor()
         tensor_np = np.random.random(size=(2, 3)).astype('float32')
         self.x_tensor.set(tensor_np, self.place)
-        self.x_tensor.set_lod([[0, 1, 1]])
+        self.x_tensor.set_recursive_sequence_lengths([[1, 1]])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
index c21138c13e6753f9dfcbd7d439269f7cf9a04f23..bcbc02a2baa46b9ab583ecf3006bd3262e6038fd 100644
--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
@@ -28,7 +28,6 @@ class TestPriorBoxOp(OpTest):
 
         self.attrs = {
             'min_sizes': self.min_sizes,
-            'max_sizes': self.max_sizes,
             'aspect_ratios': self.aspect_ratios,
             'variances': self.variances,
             'flip': self.flip,
@@ -37,25 +36,28 @@ class TestPriorBoxOp(OpTest):
             'step_h': self.step_h,
             'offset': self.offset
         }
+        if len(self.max_sizes) > 0:
+            self.attrs['max_sizes'] = self.max_sizes
 
         self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
 
     def test_check_output(self):
         self.check_output()
 
-    def test_check_grad(self):
-        return
-
     def setUp(self):
         self.op_type = "prior_box"
         self.set_data()
 
+    def set_max_sizes(self):
+        max_sizes = [5, 10]
+        self.max_sizes = np.array(max_sizes).astype('float32').tolist()
+
     def init_test_params(self):
-        self.layer_w = 4
-        self.layer_h = 4
+        self.layer_w = 32
+        self.layer_h = 32
 
-        self.image_w = 20
-        self.image_h = 20
+        self.image_w = 40
+        self.image_h = 40
 
         self.step_w = float(self.image_w) / float(self.layer_w)
         self.step_h = float(self.image_h) / float(self.layer_h)
@@ -66,8 +68,7 @@ class TestPriorBoxOp(OpTest):
 
         self.min_sizes = [2, 4]
         self.min_sizes = np.array(self.min_sizes).astype('float32').tolist()
-        self.max_sizes = [5, 10]
-        self.max_sizes = np.array(self.max_sizes).astype('float32').tolist()
+        self.set_max_sizes()
         self.aspect_ratios = [2.0, 3.0]
         self.flip = True
         self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
@@ -79,7 +80,7 @@ class TestPriorBoxOp(OpTest):
         self.clip = True
 
         self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
-        if len(self.max_sizes) > 1:
+        if len(self.max_sizes) > 0:
             self.num_priors += len(self.max_sizes)
         self.offset = 0.5
 
@@ -105,35 +106,27 @@ class TestPriorBoxOp(OpTest):
                 idx = 0
                 for s in range(len(self.min_sizes)):
                     min_size = self.min_sizes[s]
-                    c_w = c_h = min_size / 2.
-                    out_boxes[h, w, idx, :] = [
-                        (c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h,
-                        (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h
-                    ]
-                    idx += 1
-
-                    if len(self.max_sizes) > 0:
-                        max_size = self.max_sizes[s]
-                        # second prior: aspect_ratio = 1,
-                        c_w = c_h = math.sqrt(min_size * max_size) / 2
+                    # rest of priors
+                    for r in range(len(self.real_aspect_ratios)):
+                        ar = self.real_aspect_ratios[r]
+                        c_w = min_size * math.sqrt(ar) / 2
+                        c_h = (min_size / math.sqrt(ar)) / 2
                         out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
                                                    (c_y - c_h) / self.image_h,
                                                    (c_x + c_w) / self.image_w,
                                                    (c_y + c_h) / self.image_h]
                         idx += 1
 
-                    # rest of priors
-                    for r in range(len(self.real_aspect_ratios)):
-                        ar = self.real_aspect_ratios[r]
-                        if math.fabs(ar - 1.) < 1e-6:
-                            continue
-                        c_w = min_size * math.sqrt(ar) / 2
-                        c_h = (min_size / math.sqrt(ar)) / 2
+                    if len(self.max_sizes) > 0:
+                        max_size = self.max_sizes[s]
+                        # second prior: aspect_ratio = 1,
+                        c_w = c_h = math.sqrt(min_size * max_size) / 2
                         out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
                                                    (c_y - c_h) / self.image_h,
                                                    (c_x + c_w) / self.image_w,
                                                    (c_y + c_h) / self.image_h]
                         idx += 1
+
         # clip the prior's coordidate such that it is within[0, 1]
         if self.clip:
             out_boxes = np.clip(out_boxes, 0.0, 1.0)
@@ -144,5 +137,10 @@ class TestPriorBoxOp(OpTest):
         self.out_var = out_var.astype('float32')
 
 
+class TestPriorBoxOpWithMaxSize(TestPriorBoxOp):
+    def set_max_sizes(self):
+        self.max_sizes = []
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index 1da6b94eea30e65913ce713e0e5e355507534161..cf6fe14a86aa1ab6ea3f60ad15f33d708e9b803a 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -31,8 +31,22 @@ class TestProfiler(unittest.TestCase):
 
         with fluid.program_guard(main_program, startup_program):
             image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-            hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
-            hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+            hidden1 = fluid.layers.fc(input=image, size=64, act='relu')
+            i = layers.zeros(shape=[1], dtype='int64')
+            counter = fluid.layers.zeros(
+                shape=[1], dtype='int64', force_cpu=True)
+            until = layers.fill_constant([1], dtype='int64', value=10)
+            data_arr = layers.array_write(hidden1, i)
+            cond = fluid.layers.less_than(x=counter, y=until)
+            while_op = fluid.layers.While(cond=cond)
+            with while_op.block():
+                hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu')
+                layers.array_write(hidden_n, i, data_arr)
+                fluid.layers.increment(x=counter, value=1, in_place=True)
+                layers.less_than(x=counter, y=until, cond=cond)
+
+            hidden_n = layers.array_read(data_arr, i)
+            hidden2 = fluid.layers.fc(input=hidden_n, size=64, act='relu')
             predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
             label = fluid.layers.data(name='y', shape=[1], dtype='int64')
             cost = fluid.layers.cross_entropy(input=predict, label=label)
diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py
index 87a2195f0d5c7fd355ea01a3c8f60908b33d4b9d..c51a48239330621d8e008415f81361616467cabf 100644
--- a/python/paddle/fluid/tests/unittests/test_program.py
+++ b/python/paddle/fluid/tests/unittests/test_program.py
@@ -87,57 +87,6 @@ class TestProgram(unittest.TestCase):
         print(prog)
         print(prog_restored)
 
-    def test_append_backward(self):
-        prog = Program()
-        block = prog.global_block()
-
-        mul_x = block.create_var(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mul_op = block.append_op(
-            type="mul",
-            inputs={"X": [mul_x],
-                    "Y": mul_y},
-            outputs={"Out": [mul_out]},
-            attrs={"x_num_col_dims": 1})
-
-        add_y = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="add.y")
-        add_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="add.out")
-        add_op = block.append_op(
-            type="elementwise_add",
-            inputs={"X": mul_out,
-                    "Y": add_y},
-            outputs={"Out": add_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": add_out}, outputs={"Out": mean_out})
-
-        self.assertEqual(mul_op.idx, 0)
-        self.assertEqual(add_op.idx, 1)
-        param_to_grad = prog.append_backward(mean_out, set())
-
-        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
-                         "mean.out"):
-            self.assertEqual(param_to_grad[var_name][0],
-                             grad_var_name(var_name))
-            self.assertEqual(param_to_grad[var_name][1], 0)
-
-        expect_ops = [
-            "mul", "elementwise_add", "mean", "fill_constant", "mean_grad",
-            "elementwise_add_grad", "mul_grad"
-        ]
-        actual_ops = []
-        for op in block.ops:
-            actual_ops.append(op.type)
-        self.assertEqual(actual_ops, expect_ops)
-
     def test_program_clone_with_parameter(self):
         main_program = Program()
         startup_program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
index 309ea2b9b7ede442da3ac897ce8d1a4b9aa68233..3f9059fb5b31cd009c068ccddc9a8938adae5772 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -14,13 +14,14 @@
 
 import unittest
 import paddle.fluid.core as core
+from paddle.fluid.framework import Program
 
 
 class TestOpDesc(unittest.TestCase):
     def test_op_desc(self):
-        prog = core.ProgramDesc()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        block = program_desc.block(0)
         self.assertIsNotNone(block)
         op = block.append_op()
         self.assertIsNotNone(op)
@@ -66,7 +67,7 @@ class TestOpDesc(unittest.TestCase):
 
         self.assertEqual(8, len(op.attr_names()))
 
-        op.set_block_attr("block_attr", prog.block(0))
+        op.set_block_attr("block_attr", program_desc.block(0))
         self.assertEqual(0, op.block_attr("block_attr"))
 
         mul_op = block.append_op()
@@ -87,20 +88,20 @@ class TestProgramDesc(unittest.TestCase):
         del program_desc
 
     def test_append_block(self):
-        prog_desc = core.ProgramDesc()
-        self.assertIsNotNone(prog_desc)
-        block_root = prog_desc.block(0)
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        block_root = program_desc.block(0)
         self.assertIsNotNone(block_root)
         self.assertEqual(block_root.id, 0)
-        block1 = prog_desc.append_block(block_root)
-        block2 = prog_desc.append_block(block1)
+        block1 = program_desc.append_block(block_root)
+        block2 = program_desc.append_block(block1)
         self.assertIsNotNone(block1)
         self.assertEqual(block1.id, block2.parent)
         self.assertEqual(block_root.id, block1.parent)
-        block3 = prog_desc.append_block(block_root)
+        block3 = program_desc.append_block(block_root)
         self.assertEqual(block3.parent, block_root.id)
-        self.assertEqual(prog_desc.block(1).id, 1)
-        self.assertEqual(4, prog_desc.num_blocks())
+        self.assertEqual(program_desc.block(1).id, 1)
+        self.assertEqual(4, program_desc.num_blocks())
 
 
 class TestVarDesc(unittest.TestCase):
@@ -161,9 +162,9 @@ class TestVarDesc(unittest.TestCase):
 
 class TestBlockDesc(unittest.TestCase):
     def test_add_var(self):
-        prog = core.ProgramDesc()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        block = program_desc.block(0)
         self.assertIsNotNone(block)
         var1 = block.var("var1")
         var2 = block.var("var2")
@@ -174,9 +175,9 @@ class TestBlockDesc(unittest.TestCase):
         self.assertEqual(var2_re, var2)
 
     def test_add_op(self):
-        prog = core.ProgramDesc()
-        self.assertIsNotNone(prog)
-        block = prog.block(0)
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        block = program_desc.block(0)
         self.assertIsNotNone(block)
         op1 = block.append_op()
         op2 = block.append_op()
@@ -186,6 +187,28 @@ class TestBlockDesc(unittest.TestCase):
             all_ops.append(block.op(idx))
         self.assertEqual(all_ops, [op0, op1, op2])
 
+    def test_remove_op(self):
+        program = Program()
+        program_desc = program.desc
+        self.assertIsNotNone(program_desc)
+        block = program_desc.block(0)
+        self.assertIsNotNone(block)
+
+        op0 = block.append_op()
+        op1 = block.append_op()
+        op2 = block.append_op()
+        op0.set_type("test")
+        op1.set_type("test")
+        op2.set_type("test")
+
+        block.remove_op(1, 2)
+        program.sync_with_cpp()
+
+        all_ops = []
+        for idx in xrange(0, block.op_size()):
+            all_ops.append(block.op(idx))
+        self.assertEqual(all_ops, [op0, op2])
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_random_crop_op.py b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c708d0386da4028f1f3d177d0a3fd494c077c6e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+
+class TestRandomCropOp(OpTest):
+    def setUp(self):
+        to_crop = np.array([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] *
+                           5).astype("float32")
+        self.possible_res = [
+            np.array([[1, 2, 3], [5, 6, 7]]), np.array([[2, 3, 4], [6, 7, 8]]),
+            np.array([[5, 6, 7], [9, 10, 11]]),
+            np.array([[6, 7, 8], [10, 11, 12]])
+        ]
+        self.op_type = "random_crop"
+        self.inputs = {'X': to_crop, 'Seed': np.array([10])}
+        self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])}
+        self.attrs = {'shape': [2, 3]}
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        out = np.array(outs[1])
+        for ins in out[:]:
+            is_equal = [(ins == res).all() for res in self.possible_res]
+            self.assertIn(True, is_equal)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
index d249742bd30ec41749f16beaa7076f7c6e8f063c..69a522e273db017ac55b408276b4a28f5f907c42 100644
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 
 import unittest
+
 import paddle.fluid as fluid
-import paddle.v2.dataset.mnist as mnist
 import paddle.v2 as paddle
+import paddle.v2.dataset.mnist as mnist
 
 
 class TestRecordIO(unittest.TestCase):
@@ -31,10 +32,10 @@ class TestRecordIO(unittest.TestCase):
                         name='label', shape=[1], dtype='int64'),
                 ],
                 place=fluid.CPUPlace())
-            fluid.recordio_writer.convert_reader_to_recordio_file(
+            self.num_batches = fluid.recordio_writer.convert_reader_to_recordio_file(
                 './mnist.recordio', reader, feeder)
 
-    def test_main(self):
+    def test_main(self, decorator_callback=None):
         # use new program
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data_file = fluid.layers.open_recordio_file(
@@ -42,6 +43,8 @@ class TestRecordIO(unittest.TestCase):
                 shapes=[[-1, 784], [-1, 1]],
                 lod_levels=[0, 0],
                 dtypes=['float32', 'int64'])
+            if decorator_callback is not None:
+                data_file = decorator_callback(data_file)
             img, label = fluid.layers.read_file(data_file)
 
             hidden = fluid.layers.fc(input=img, size=100, act='tanh')
@@ -51,14 +54,32 @@ class TestRecordIO(unittest.TestCase):
 
             fluid.optimizer.Adam(learning_rate=1e-3).minimize(avg_loss)
 
-            exe = fluid.Executor(fluid.CPUPlace())
+            if fluid.core.is_compiled_with_cuda():
+                place = fluid.CUDAPlace(0)
+            else:
+                place = fluid.CPUPlace()
+
+            exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
             avg_loss_np = []
 
             # train a pass
-            while not data_file.eof():
-                tmp, = exe.run(fetch_list=[avg_loss])
-                avg_loss_np.append(tmp)
-            data_file.reset()
+            batch_id = 0
+            while True:
+                try:
+                    tmp, = exe.run(fetch_list=[avg_loss])
+                except fluid.core.EOFException:
+                    break
 
+                avg_loss_np.append(tmp)
+                batch_id += 1
+            self.assertEqual(batch_id, self.num_batches)
             self.assertLess(avg_loss_np[-1], avg_loss_np[0])
+
+    def test_shuffle_reader(self):
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(
+            reader, buffer_size=200))
+
+    def test_double_buffer_reader(self):
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader,
+                                                                                       place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py
deleted file mode 100644
index 985d892c568472614c5f3e6691f54807ddccc4bd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_recv_op.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
-import numpy
-from multiprocessing import Process
-import os, sys
-import time
-
-
-class TestRecvOp(unittest.TestCase):
-    def test_send(self):
-        # Run init_serv in a thread
-        place = fluid.CPUPlace()
-        p = Process(target=self.init_serv, args=(place, ))
-        p.daemon = True
-        p.start()
-        time.sleep(1)
-        self.init_client(place)
-        # FIXME(typhoonzero): find a way to gracefully shutdown the server.
-        os.system("kill -9 %d" % p.pid)
-        p.join()
-
-    def init_serv(self, place):
-        main = fluid.Program()
-        with fluid.program_guard(main):
-            x = layers.data(
-                shape=[32, 32],
-                dtype='float32',
-                name="X",
-                append_batch_size=False)
-            fluid.initializer.Constant(value=1.0)(x, main.global_block())
-            serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False)
-            with serv.do():
-                o = layers.scale(x=x, scale=10.0)
-            main.global_block().create_var(
-                name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape)
-        exe = fluid.Executor(place)
-        exe.run(main)
-
-    def init_client(self, place):
-        main = fluid.Program()
-        with fluid.program_guard(main):
-            x = layers.data(
-                shape=[32, 32],
-                dtype='float32',
-                name='X',
-                append_batch_size=False)
-            fluid.initializer.Constant(value=1.0)(x, main.global_block())
-            layers.Send("127.0.0.1:6174", [x], [x])
-        exe = fluid.Executor(place)
-        exe.run(main)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 5e656bddb7927b6e7935255c120e5e858505125a..865c2b7df085aa6a6cb0d6eb461c342ce08695cd 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -34,8 +34,10 @@ class TestMeanOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_mean"
         self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
-        self.attrs = {'dim': 1}
-        self.outputs = {'Out': self.inputs['X'].mean(axis=self.attrs['dim'])}
+        self.attrs = {'dim': [1]}
+        self.outputs = {
+            'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -50,8 +52,10 @@ class TestMaxOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_max"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': -1}
-        self.outputs = {'Out': self.inputs['X'].max(axis=self.attrs['dim'])}
+        self.attrs = {'dim': [-1]}
+        self.outputs = {
+            'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim']))
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -63,20 +67,36 @@ class TestMinOp(OpTest):
     def setUp(self):
         self.op_type = "reduce_min"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': 2}
-        self.outputs = {'Out': self.inputs['X'].min(axis=self.attrs['dim'])}
+        self.attrs = {'dim': [2]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestProdOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_prod"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].prod(axis=0)}
 
     def test_check_output(self):
         self.check_output()
 
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
 
 class TestKeepDimReduce(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': -2, 'keep_dim': True}
+        self.attrs = {'dim': [-2], 'keep_dim': True}
         self.outputs = {
-            'Out': self.inputs['X'].sum(axis=self.attrs['dim'], keepdims=True)
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
         }
 
     def test_check_output(self):
@@ -113,5 +133,67 @@ class TestReduceAll(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+## reduction in multi dims
+class TestReduceMeanOpMultiAxises(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_mean"
+        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
+        self.attrs = {'dim': [1, 2]}
+        self.outputs = {'Out': self.inputs['X'].mean(axis=(1, 2))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestReduceMaxOpMultiAxises(OpTest):
+    """Remove Max with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_max"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [-2, -1]}
+        self.outputs = {
+            'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestReduceMinOpMultiAxises(OpTest):
+    """Remove Min with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_min"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [1, 2]}
+        self.outputs = {
+            'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestKeepDimReduceSumMultiAxises(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [-2, -1], 'keep_dim': True}
+        self.outputs = {
+            'Out':
+            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
index 76d0d2f2fe80e409dc1b7fa858d43fbc6ad960ef..a70321bd800bf25eeb9e5d197ea7e08626b9aede 100644
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
@@ -70,11 +70,10 @@ class TestReorderLoDTensor(unittest.TestCase):
                 lod_level_i = numpy.random.randint(
                     low=1,
                     high=5,
-                    size=self.num_seq if i == 0 else lod_level_i[-1])
-                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                    size=self.num_seq if i == 0 else sum(lod_level_i)).tolist()
                 data_lod.append(lod_level_i)
             data_value = numpy.random.random(
-                size=[data_lod[-1][-1] if data_lod else self.num_seq
+                size=[sum(data_lod[-1]) if data_lod else self.num_seq
                       ] + data_shape).astype('float32')
             self.data[data_name] = (data_value, data_lod)
 
@@ -84,29 +83,36 @@ class TestReorderLoDTensor(unittest.TestCase):
             tensor = fluid.Tensor()
             tensor.set(self.data[desc[0]][0], place)
             if self.data[desc[0]][1]:
-                tensor.set_lod(self.data[desc[0]][1])
+                tensor.set_recursive_sequence_lengths(self.data[desc[0]][1])
             self.inputs[desc[0]] = tensor
 
     def reorder(self):
-        level = 0
+        def convert_to_offset(lod):
+            offset_lod = [[0] for i in lod]
+            for i, level in enumerate(lod):
+                for seq_len in level:
+                    offset_lod[i].append(offset_lod[i][-1] + seq_len)
+            return offset_lod
 
+        level = 0
         # compute the rank_table according to ref_lod
         ref_lod = self.data[self.data_desc[1][0]][1][level]
         rank_table = []  # list of (index, length)
-        for i in range(len(ref_lod) - 1):
-            rank_table.append((i, ref_lod[i + 1] - ref_lod[i]))
+        for i in range(len(ref_lod)):
+            rank_table.append((i, ref_lod[i]))
         rank_table = sorted(rank_table, lambda x, y: y[1] - x[1])
 
         # compute the input sequence info according to input_lod
         input_value, input_lod = self.data[self.data_desc[0][0]]
+        offset_lod = convert_to_offset(input_lod)
 
         input_table = []  # list of (offset, length, sub_lod)
-        if input_lod:
-            for i in range(len(input_lod[level]) - 1):
+        if offset_lod:
+            for i in range(len(offset_lod[level]) - 1):
                 start_idx = i
                 end_idx = i + 1
                 sub_lod = []
-                for lod_level_i in input_lod[level:]:
+                for lod_level_i in offset_lod[level:]:
                     sub_lod_i = []
                     for idx in range(start_idx, end_idx):
                         sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[
@@ -132,10 +138,9 @@ class TestReorderLoDTensor(unittest.TestCase):
 
             input_seq_sub_lod = input_table[index][2]
             if len(output_lod) == 0:
-                output_lod = [[0] for i in input_seq_sub_lod]
-            for i, sub_lod_i in enumerate(input_seq_sub_lod):
-                for idx_sub in sub_lod_i:
-                    output_lod[i].append(output_lod[i][-1] + idx_sub)
+                output_lod = [[] for i in input_seq_sub_lod]
+            for i, level in enumerate(input_seq_sub_lod):
+                output_lod[i].extend(level)
         return output_value, output_lod
 
     def test_reorder_lod_tensor(self):
@@ -148,7 +153,8 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_output), expect_output, atol=0.001))
-            self.assertEqual(expect_output_lod, actual_output.lod())
+            self.assertEqual(expect_output_lod,
+                             actual_output.recursive_sequence_lengths())
         # check gradient
         expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
         expect_grad_lod = self.data[self.data_desc[0][0]][1]
@@ -156,7 +162,8 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_grad), expect_grad, atol=0.001))
-            self.assertEqual(expect_grad_lod, actual_grad.lod())
+            self.assertEqual(expect_grad_lod,
+                             actual_grad.recursive_sequence_lengths())
 
     def test_reorder_tensor(self):
         self.data_desc[0][-1] = 0  # input is tensor
@@ -168,7 +175,8 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_output), expect_output, atol=0.001))
-            self.assertEqual(expect_output_lod, actual_output.lod())
+            self.assertEqual(expect_output_lod,
+                             actual_output.recursive_sequence_lengths())
         # check gradient
         expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
         expect_grad_lod = self.data[self.data_desc[0][0]][1]
@@ -176,14 +184,14 @@ class TestReorderLoDTensor(unittest.TestCase):
             self.assertTrue(
                 numpy.allclose(
                     numpy.array(actual_grad), expect_grad, atol=0.001))
-            self.assertEqual(expect_grad_lod, actual_grad.lod())
+            self.assertEqual(expect_grad_lod,
+                             actual_grad.recursive_sequence_lengths())
 
         # compare outputs between LodTensors with explicit and implicit lod
         # use the same data but set the input lod explicitly
-        input_lod = [[
-            i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1)
-        ]]
-        self.inputs[self.data_desc[0][0]].set_lod(input_lod)
+        input_lod = [[1] * len(self.data[self.data_desc[0][0]][0])]
+        self.inputs[self.data_desc[0][0]].set_recursive_sequence_lengths(
+            input_lod)
         # preserve the output of LodTensor with implicit lod to compare
         expect_output = [
             numpy.array(actual_output) for actual_output in self.actual_outputs
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 11f35c74d41146118525a5efa6c211d528e255fe..f51b5a7e9907294a5b91c920a363830d8b9a7137 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -14,15 +14,19 @@
 
 import unittest
 import numpy as np
+
 from op_test import OpTest
 
 
 class TestReshapeOp(OpTest):
     def setUp(self):
+        ori_shape = (2, 25)
+        new_shape = (5, 10)
+
         self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [10 * 20]}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -31,12 +35,33 @@ class TestReshapeOp(OpTest):
         self.check_grad(["X"], "Out")
 
 
-class TestReshapeOpDimInfer(OpTest):
+class TestReshapeOpDimInfer1(OpTest):
     def setUp(self):
+        ori_shape = (5, 10)
+        new_shape = (5, -1, 5)
+
         self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [4, -1, 5]}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInfer2(OpTest):
+    def setUp(self):
+        ori_shape = (2, 2, 6)
+        new_shape = (2, 0, 3, -1)
+        infered_shape = (2, 2, 3, -1)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape, "inplace": False}
+        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -47,10 +72,30 @@ class TestReshapeOpDimInfer(OpTest):
 
 class TestReshapeOpInplace(OpTest):
     def setUp(self):
+        ori_shape = (2, 25)
+        new_shape = (5, 10)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInferInplace1(OpTest):
+    def setUp(self):
+        ori_shape = (5, 10)
+        new_shape = (5, -1, 5)
+
         self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [10 * 20], 'inplace': True}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(new_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -59,12 +104,38 @@ class TestReshapeOpInplace(OpTest):
         self.check_grad(["X"], "Out")
 
 
-class TestReshapeOpDimInferInplace(OpTest):
+class TestReshapeOpDimInferInplace2(OpTest):
     def setUp(self):
+        ori_shape = (2, 2, 6)
+        new_shape = (2, 0, 3, -1)
+        infered_shape = (2, 2, 3, -1)
+
+        self.op_type = "reshape"
+        self.inputs = {"X": np.random.random(ori_shape).astype("float32")}
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpWithInputShape(OpTest):
+    def setUp(self):
+        ori_shape = (6, 5)
+        new_shape = (0, -1, 5)
+        actual_shape = (2, 3, 5)
+
         self.op_type = "reshape"
-        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
-        self.attrs = {'shape': [4, -1, 5], 'inplace': True}
-        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+        self.inputs = {
+            "X": np.random.random(ori_shape).astype("float32"),
+            "Shape": np.array(
+                actual_shape, dtype="int32")
+        }
+        self.attrs = {"shape": new_shape}
+        self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)}
 
     def test_check_output(self):
         self.check_output()
@@ -73,5 +144,5 @@ class TestReshapeOpDimInferInplace(OpTest):
         self.check_grad(["X"], "Out")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f845575a02869f08299d76b5600074598ca27f6c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestReverseOp(OpTest):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0]
+
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "reverse"
+        self.inputs = {"X": self.x}
+        self.attrs = {'axis': self.axis}
+        out = self.x
+        for a in self.axis:
+            out = np.flip(out, axis=a)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestCase0(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [1]
+
+
+class TestCase1(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4)).astype('float32')
+        self.axis = [0, 1]
+
+
+class TestCase2(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [0, 2]
+
+
+class TestCase3(TestReverseOp):
+    def initTestCase(self):
+        self.x = np.random.random((3, 4, 5)).astype('float32')
+        self.axis = [1, 2]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
index e556d51b021217063e23190e40bc0e8f9fdc816c..df5684ab173a4889dd7b693f9246bafd12e0345f 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -25,7 +25,7 @@ class TestROIPoolOp(OpTest):
         self.make_rois()
         self.calc_roi_pool()
 
-        self.inputs = {'X': self.x, 'ROIs': self.rois}
+        self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
 
         self.attrs = {
             'spatial_scale': self.spatial_scale,
@@ -36,7 +36,7 @@ class TestROIPoolOp(OpTest):
         self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes}
 
     def init_test_case(self):
-        self.batch_size = 5
+        self.batch_size = 3
         self.channels = 3
         self.height = 6
         self.width = 4
@@ -47,7 +47,6 @@ class TestROIPoolOp(OpTest):
         self.spatial_scale = 1.0 / 4.0
         self.pooled_height = 2
         self.pooled_width = 2
-        self.rois_num = 2
 
         self.x = np.random.random(self.x_dim).astype('float32')
 
@@ -106,20 +105,23 @@ class TestROIPoolOp(OpTest):
 
     def make_rois(self):
         rois = []
-        batch_ids = np.random.randint(0, self.batch_size, size=self.rois_num)
-        for i in range(self.rois_num):
-            x1 = np.random.random_integers(
-                0, self.width / self.spatial_scale - self.pooled_width)
-            y1 = np.random.random_integers(
-                0, self.height / self.spatial_scale - self.pooled_height)
-
-            x2 = np.random.random_integers(x1 + self.pooled_width,
-                                           self.width / self.spatial_scale)
-            y2 = np.random.random_integers(y1 + self.pooled_height,
-                                           self.height / self.spatial_scale)
-
-            roi = [batch_ids[i], x1, y1, x2, y2]
-            rois.append(roi)
+        self.rois_lod = [[]]
+        for bno in range(self.batch_size):
+            self.rois_lod[0].append(bno + 1)
+            for i in range(bno + 1):
+                x1 = np.random.random_integers(
+                    0, self.width / self.spatial_scale - self.pooled_width)
+                y1 = np.random.random_integers(
+                    0, self.height / self.spatial_scale - self.pooled_height)
+
+                x2 = np.random.random_integers(x1 + self.pooled_width,
+                                               self.width / self.spatial_scale)
+                y2 = np.random.random_integers(y1 + self.pooled_height,
+                                               self.height / self.spatial_scale)
+
+                roi = [bno, x1, y1, x2, y2]
+                rois.append(roi)
+        self.rois_num = len(rois)
         self.rois = np.array(rois).astype("int64")
 
     def setUp(self):
diff --git a/python/paddle/fluid/tests/unittests/test_row_conv_op.py b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
index 30f1efbcbcb11332c85c9d5489f22c17b06c2b36..07dcd108689ae6069e30fe22029258d192215549 100644
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
@@ -19,8 +19,10 @@ from op_test import OpTest
 
 def row_conv_forward(x, lod, wt):
     out = np.zeros_like(x)
-    seq_info = lod[0]
-    num_sequences = len(seq_info) - 1
+    num_sequences = len(lod[0])
+    seq_info = [0]
+    for seq_len in lod[0]:
+        seq_info.append(seq_info[-1] + seq_len)
     context_length = wt.shape[0]
 
     for i in range(num_sequences):  # loop over number of sequences
@@ -32,7 +34,6 @@ def row_conv_forward(x, lod, wt):
         cur_timesteps = end - start
         for j in range(cur_timesteps):  # loop over different timesteps
             for k in range(context_length):
-
                 if j + k >= cur_timesteps:
                     continue
                 curoutput[j, :] += curinput[j + k, :] * wt[k, :]
@@ -44,8 +45,8 @@ class TestRowConvOp1(OpTest):
     def setUp(self):
 
         self.op_type = "row_conv"
-        lod = [[0, 2, 5, 7]]
-        T = lod[0][-1]
+        lod = [[2, 3, 2]]
+        T = sum(lod[0])
         D = 16
         context_length = 2
 
@@ -75,8 +76,8 @@ class TestRowConvOp2(OpTest):
     def setUp(self):
 
         self.op_type = "row_conv"
-        lod = [[0, 20, 50, 100]]
-        T = lod[0][-1]
+        lod = [[20, 30, 50]]
+        T = sum(lod[0])
         D = 35
         context_length = 35
 
diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py
index bb02a40d449860cf6c0576662e79a5e36e6e0635..fb1728743630b3ea908ae835444eff7fd71b72c8 100644
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
@@ -25,7 +25,7 @@ class TestScatterOp(OpTest):
         updates_np = np.random.random((2, 3)).astype("float32")
         output_np = np.copy(ref_np)
         output_np[index_np] = updates_np
-        self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np}
         self.outputs = {'Out': output_np}
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
index 10592d127fafdf202c65fcfa91b5c464cc60e96c..11ffa761a690eb1f9f6dc50c45128a99301741db 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
@@ -18,14 +18,19 @@ import sys
 from op_test import OpTest
 
 
-def to_abs_lod(lod):
-    if len(lod) == 0 or len(lod) == 1:
-        return lod
+def to_abs_offset_lod(lod):
+    offset_lod = [[0] for i in lod]
+    for i, level in enumerate(lod):
+        for seq_len in level:
+            offset_lod[i].append(offset_lod[i][-1] + seq_len)
+
+    if len(offset_lod) == 0 or len(offset_lod) == 1:
+        return offset_lod
     import copy
-    new_lod = copy.deepcopy(lod)
-    for idx, val in enumerate(lod[0]):
-        new_lod[0][idx] = lod[1][val]
-    return new_lod
+    new_offset_lod = copy.deepcopy(offset_lod)
+    for idx, val in enumerate(offset_lod[0]):
+        new_offset_lod[0][idx] = offset_lod[1][val]
+    return new_offset_lod
 
 
 def seq_concat(inputs, level):
@@ -35,11 +40,11 @@ def seq_concat(inputs, level):
     x1 = inputs['X'][1][1][0]
     level_idx = len(lod0) - level - 1
     outs = []
-    for i in range(len(lod0[level_idx]) - 1):
-        sub_x0 = x0[to_abs_lod(lod0)[level_idx][i]:to_abs_lod(lod0)[level_idx][
-            i + 1], :]
-        sub_x1 = x1[to_abs_lod(lod1)[level_idx][i]:to_abs_lod(lod1)[level_idx][
-            i + 1], :]
+    for i in range(len(lod0[level_idx])):
+        sub_x0 = x0[to_abs_offset_lod(lod0)[level_idx][i]:to_abs_offset_lod(
+            lod0)[level_idx][i + 1], :]
+        sub_x1 = x1[to_abs_offset_lod(lod1)[level_idx][i]:to_abs_offset_lod(
+            lod1)[level_idx][i + 1], :]
         outs.append(np.concatenate((sub_x0, sub_x1), axis=0))
     return np.concatenate(outs, axis=0)
 
@@ -48,9 +53,9 @@ class TestSeqConcatOp(OpTest):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod0 = [[2, 2], [1, 1, 1, 1]]
         x1 = np.random.random((4, 8, 3)).astype('float32')
-        lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod1 = [[2, 2], [1, 1, 1, 1]]
         axis = 1
         level = 1
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
@@ -72,14 +77,14 @@ class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod0 = [[2, 2], [1, 1, 1, 1]]
         x1 = np.random.random((7, 6, 3)).astype('float32')
-        lod1 = [[0, 2, 4], [0, 1, 3, 5, 7]]
+        lod1 = [[2, 2], [1, 2, 2, 2]]
         axis = 0
         level = 0
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        out_lod = [[0, 2, 4], [0, 2, 5, 8, 11]]
+        out_lod = [[2, 2], [2, 3, 3, 3]]
         self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
@@ -87,14 +92,14 @@ class TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        lod0 = [[2, 2], [1, 1, 1, 1]]
         x1 = np.random.random((7, 6, 3)).astype('float32')
-        lod1 = [[0, 3, 4], [0, 1, 3, 5, 7]]
+        lod1 = [[3, 1], [1, 2, 2, 2]]
         axis = 0
         level = 1
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        out_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]]
+        out_lod = [[5, 3], [1, 1, 1, 2, 2, 1, 1, 2]]
         self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
@@ -102,14 +107,14 @@ class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 3, 4)).astype('float32')
-        lod0 = [[0, 1, 2, 3, 4]]
+        lod0 = [[1, 1, 1, 1]]
         x1 = np.random.random((7, 3, 4)).astype('float32')
-        lod1 = [[0, 1, 3, 5, 7]]
+        lod1 = [[1, 2, 2, 2]]
         axis = 0
         level = 0
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        out_lod = [[0, 2, 5, 8, 11]]
+        out_lod = [[2, 3, 3, 3]]
         self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_seq_conv.py b/python/paddle/fluid/tests/unittests/test_seq_conv.py
index 51dbf1f61834ff0093d76ed546be27a585697d40..9701d9adef1fd272f2520f66607acded6a8c25c6 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
@@ -75,35 +75,38 @@ class TestSeqProject(OpTest):
         pading_data = self.pad_data
         out = np.zeros((self.input_size[0], self.context_length *
                         self.input_size[1])).astype('float32')
-        lod = lod[0]
+        offset = [0]
+        for seq_len in lod[0]:
+            offset.append(offset[-1] + seq_len)
         begin_pad = np.max([0, -self.context_start])
 
-        for i in range(len(lod) - 1):
+        for i in range(len(offset) - 1):
             for j in range(self.context_length):
-                in_begin = lod[i] + self.context_start + j
-                in_end = lod[i + 1] + self.context_start + j
-                out_begin = lod[i]
-                out_end = lod[i + 1]
-                if in_begin < lod[i]:
-                    pad_size = np.min([lod[i] - in_begin, lod[i + 1] - lod[i]])
+                in_begin = offset[i] + self.context_start + j
+                in_end = offset[i + 1] + self.context_start + j
+                out_begin = offset[i]
+                out_end = offset[i + 1]
+                if in_begin < offset[i]:
+                    pad_size = np.min(
+                        [offset[i] - in_begin, offset[i + 1] - offset[i]])
                     if self.padding_trainable:
                         sub_w = pading_data[j:j + pad_size, :]
-                        out[lod[i]:lod[i] + pad_size, j * self.input_size[1]:(
-                            j + 1) * self.input_size[1]] = sub_w
-                    out_begin = lod[i] + pad_size
-                    in_begin = lod[i]
+                        out[offset[i]:offset[i] + pad_size, j * self.input_size[
+                            1]:(j + 1) * self.input_size[1]] = sub_w
+                    out_begin = offset[i] + pad_size
+                    in_begin = offset[i]
 
-                if in_end > lod[i + 1]:
+                if in_end > offset[i + 1]:
                     pad_size = np.min(
-                        [in_end - lod[i + 1], lod[i + 1] - lod[i]])
+                        [in_end - offset[i + 1], offset[i + 1] - offset[i]])
                     if self.padding_trainable:
                         sub_w = pading_data[begin_pad + self.context_start + j -
                                             pad_size:begin_pad +
                                             self.context_start + j, :]
-                        out[lod[i + 1] - pad_size:lod[i + 1], j * self.
+                        out[offset[i + 1] - pad_size:offset[i + 1], j * self.
                             input_size[1]:(j + 1) * self.input_size[1]] = sub_w
-                    in_end = lod[i + 1]
-                    out_end = lod[i + 1] - pad_size
+                    in_end = offset[i + 1]
+                    out_end = offset[i + 1] - pad_size
                 if in_end <= in_begin:
                     continue
 
@@ -175,7 +178,11 @@ class TestSeqProject(OpTest):
         self.context_stride = 1
 
         self.input_size = [self.input_row, 23]
-        self.lod = [[0, 4, 5, 8, self.input_row]]
+        offset_lod = [[0, 4, 5, 8, self.input_row]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
         self.output_represention = 8  # output feature size
 
 
@@ -188,7 +195,11 @@ class TestSeqProjectCase1(TestSeqProject):
         self.context_stride = 1
 
         self.input_size = [self.input_row, 23]
-        self.lod = [[0, 4, 5, 8, self.input_row]]
+        offset_lod = [[0, 4, 5, 8, self.input_row]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
         self.output_represention = 8  # output feature size
 
 
@@ -203,8 +214,12 @@ class TestSeqProjectCase2(TestSeqProject):
         self.input_size = [self.input_row, 23]
         idx = range(self.input_size[0])
         del idx[0]
-        self.lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
-                    [self.input_size[0]]]
+        offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
+                      [self.input_size[0]]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
         self.output_represention = 8  # output feature size
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py
index 04884757216bc29a96eb97a6db403c3925472294..0b3659d7a67956f7546d368346bd102eeedf1d97 100644
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
@@ -18,26 +18,34 @@ from op_test import OpTest
 
 
 class TestSeqAvgPool(OpTest):
+    def convert_to_offset(self, lod):
+        offset = [[0] for i in lod]
+        for i, level in enumerate(lod):
+            for seq_len in level:
+                offset[i].append(offset[i][-1] + seq_len)
+        return offset
+
     def set_data(self):
         self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
-        lod = [[0, 4, 5, 8, 11]]
+        lod = [[4, 1, 3, 3]]
         self.inputs = {'X': (x, lod)}
+        offset = self.convert_to_offset(lod)
 
         out = np.zeros((4, 23)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "AVERAGE"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
             out[i] = sub_x.mean(axis=0)
 
     def setUp(self):
-        x, lod, out = self.set_data()
-        self.compute(x, lod, out)
+        x, offset, out = self.set_data()
+        self.compute(x, offset, out)
 
     def test_check_output(self):
         self.check_output()
@@ -49,57 +57,100 @@ class TestSeqAvgPool(OpTest):
         self.check_grad(["X"], "Out")
 
 
+class TestSeqSumPool(TestSeqAvgPool):
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "SUM"}
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
+            out[i] = sub_x.sum(axis=0)
+
+
+class TestSeqMaxPool(TestSeqAvgPool):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
+        lod = [[4, 1, 3, 5]]
+        offset = self.convert_to_offset(lod)
+        for i in range(len(offset[0]) - 1):
+            l = offset[0][i + 1] - offset[0][i]
+            x[offset[0][i] + np.random.randint(l), :] += 2.0
+
+        self.inputs = {'X': (x, lod)}
+
+        out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, offset, out
+
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "MAX"}
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
+            out[i] = np.amax(sub_x, axis=0)
+
+
+class TestSeqSqrtPool(TestSeqAvgPool):
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "SQRT"}
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
+            seq_len = offset[0][i + 1] - offset[0][i]
+            out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
+
+
+class TestSeqLastPool(TestSeqAvgPool):
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "LAST"}
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
+            out[i] = sub_x[-1, :]
+
+
+class TestSeqFirstPool(TestSeqAvgPool):
+    def compute(self, x, offset, out):
+        self.attrs = {'pooltype': "FIRST"}
+        for i in range(len(offset[0]) - 1):
+            sub_x = x[offset[0][i]:offset[0][i + 1], :]
+            out[i] = sub_x[0, :]
+
+
 class TestSeqAvgPool2D(TestSeqAvgPool):
     def set_data(self):
         self.op_type = 'sequence_pool'
         # one level, batch size is 4
         x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
+        lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
+        offset = self.convert_to_offset(lod)
 
         out = np.zeros((4, 3, 17)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "AVERAGE"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x.mean(axis=0), (3, 17))
 
 
-class TestSeqSumPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "SUM"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            out[i] = sub_x.sum(axis=0)
-
-
 class TestSeqSumPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SUM"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
 
 
-class TestSeqSqrtPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "SQRT"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            len = lod[0][i + 1] - lod[0][i]
-            out[i] = sub_x.sum(axis=0) / np.sqrt(len)
-
-
 class TestSeqSqrtPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "SQRT"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
-            len = lod[0][i + 1] - lod[0][i]
-            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
+            seq_len = offset[0][i + 1] - offset[0][i]
+            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(seq_len), (3, 17))
 
     def test_check_grad(self):
         # Remove MaxIndex after check_grad is refined.
@@ -108,78 +159,44 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D):
         self.check_grad(["X"], "Out", max_relative_error=0.06)
 
 
-class TestSeqMaxPool(TestSeqAvgPool):
-    def set_data(self):
-        self.op_type = 'sequence_pool'
-        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
-        for i in range(4):
-            l = lod[0][i + 1] - lod[0][i]
-            x[lod[0][i] + np.random.randint(l), :] += 2.0
-
-        self.inputs = {'X': (x, lod)}
-
-        out = np.zeros((4, 23)).astype('float32')
-        self.outputs = {'Out': out}
-        return x, lod, out
-
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "MAX"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            out[i] = np.amax(sub_x, axis=0)
-
-
 class TestSeqMaxPool2D(TestSeqAvgPool2D):
     def set_data(self):
         self.op_type = 'sequence_pool'
         x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
-        lod = [[0, 4, 5, 8, 13]]
+        lod = [[4, 1, 3, 5]]
         self.inputs = {'X': (x, lod)}
-        for i in range(4):
-            l = lod[0][i + 1] - lod[0][i]
-            x[lod[0][i] + np.random.randint(l), :] += 1.0
+        offset = self.convert_to_offset(lod)
+        for i in range(len(offset[0]) - 1):
+            l = offset[0][i + 1] - offset[0][i]
+            x[offset[0][i] + np.random.randint(l), :] += 1.0
 
         out = np.zeros((4, 3, 11)).astype('float32')
         self.outputs = {'Out': out}
-        return x, lod, out
+        return x, offset, out
 
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "MAX"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 11))
             out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
 
 
-class TestSeqLastPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "LAST"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            out[i] = sub_x[-1, :]
-
-
 class TestSeqLastPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "LAST"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x[-1, :], (3, 17))
 
 
-class TestSeqFirstPool(TestSeqAvgPool):
-    def compute(self, x, lod, out):
-        self.attrs = {'pooltype': "FIRST"}
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            out[i] = sub_x[0, :]
-
-
 class TestSeqFirstPool2D(TestSeqAvgPool2D):
-    def compute(self, x, lod, out):
+    def compute(self, x, offset, out):
         self.attrs = {'pooltype': "FIRST"}
-        for i in range(4):
-            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+        for i in range(len(offset[0]) - 1):
+            sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :],
+                               (-1, 3 * 17))
             out[i] = np.reshape(sub_x[0, :], (3, 17))
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
index ebab77e8041d5ff1bd845fb121e5901116fd0254..8f0765277ae85af2b17ad96d4fd0c1148c393ff0 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
@@ -18,15 +18,17 @@ from op_test import OpTest
 
 
 def sequence_erase(in_seq, lod0, tokens):
-    new_lod0 = [0]
+    new_lod0 = []
     out_seq = []
-    for i in range(0, len(lod0) - 1):
+    offset = 0
+    for i in range(0, len(lod0)):
         num_out = 0
-        for dat in in_seq[lod0[i]:lod0[i + 1]]:
+        for dat in in_seq[offset:(offset + lod0[i])]:
             if dat not in tokens:
                 out_seq.append(dat)
                 num_out += 1
-        new_lod0.append(new_lod0[-1] + num_out)
+        offset += lod0[i]
+        new_lod0.append(num_out)
     return np.array(out_seq).astype("int32"), new_lod0
 
 
@@ -34,7 +36,7 @@ class TestSequenceEraseOpInt32(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        lod = [[0, 9, 13, 24, 30]]
+        lod = [[9, 4, 11, 6]]
         tokens = [2, 3, 5]
         out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
         self.attrs = {'tokens': tokens}
@@ -49,7 +51,7 @@ class TestSequenceEraseOpInt64(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
-        lod = [[0, 9, 13, 24, 30]]
+        lod = [[9, 4, 11, 6]]
         tokens = [2, 3, 5]
         out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
         self.attrs = {'tokens': tokens}
@@ -64,7 +66,7 @@ class TestSequenceEraseOpEmpty(OpTest):
     def setUp(self):
         self.op_type = "sequence_erase"
         in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
-        lod = [[0, 9, 13, 24, 30]]
+        lod = [[9, 4, 11, 6]]
         tokens = []
         out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
         self.attrs = {'tokens': tokens}
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_expand.py b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
index 957fa5d2c4a795cfd01047c1b7845674e4c1d549..0bbd31814efdff6050733f6876ef64e3fcaaaf76 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
@@ -21,18 +21,48 @@ class TestSequenceExpand(OpTest):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
         y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
-        y_lod = [[0, 1, 4, 8]]
+        y_lod = [[1, 3, 4]]
         self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
 
     def compute(self):
         x = self.inputs['X']
         x_data, x_lod = x if type(x) == tuple else (x, None)
-        n = 1 + x_data.shape[0] if not x_lod else len(x_lod[0])
         y_data, y_lod = self.inputs['Y']
-        repeats = [((y_lod[-1][i + 1] - y_lod[-1][i]))
-                   for i in range(len(y_lod[-1]) - 1)]
-        out = x_data.repeat(repeats, axis=0)
-        self.outputs = {'Out': out}
+
+        if hasattr(self, 'attrs'):
+            ref_level = self.attrs['ref_level']
+        else:
+            ref_level = len(y_lod) - 1
+
+        out = np.zeros(shape=((0, ) + x_data.shape[1:]), dtype=x_data.dtype)
+
+        if x_lod is None:
+            # x_idx = [i for i in xrange(x_data.shape[0] + 1)]
+            x_idx = [1] * x_data.shape[0]
+        else:
+            x_idx = x_lod[0]
+            out_lod = [[]]
+
+        offset = 0
+        for i in xrange(len(y_lod[ref_level])):
+            repeat_num = y_lod[ref_level][i]
+            x_len = x_idx[i]
+
+            if repeat_num > 0:
+                x_sub = x_data[offset:(offset + x_len), :]
+                stacked_x_sub = x_sub
+                for r in range(repeat_num - 1):
+                    stacked_x_sub = np.vstack((stacked_x_sub, x_sub))
+                out = np.vstack((out, stacked_x_sub))
+                if x_lod is not None:
+                    for j in xrange(repeat_num):
+                        out_lod[0].append(x_len)
+            offset += x_len
+
+        if x_lod is None:
+            self.outputs = {'Out': out}
+        else:
+            self.outputs = {'Out': (out, out_lod)}
 
     def setUp(self):
         self.op_type = 'sequence_expand'
@@ -49,42 +79,39 @@ class TestSequenceExpand(OpTest):
 class TestSequenceExpandCase1(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
-        x_lod = [[0, 2, 5]]
+        x_lod = [[2, 3]]
         y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
-        y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]]
-        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+        y_lod = [[2, 3], [2, 2, 3, 3, 3]]
+        self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
+        self.attrs = {'ref_level': 0}
 
 
 class TestSequenceExpandCase2(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
-        x_lod = [[0, 1]]
+        x_lod = [[1]]
         y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
-        y_lod = [[0, 2]]
+        y_lod = [[2], [1, 1]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+        self.attrs = {'ref_level': 0}
 
 
 class TestSequenceExpandCase3(TestSequenceExpand):
     def set_data(self):
         x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
-        x_lod = [[0, 1, 2, 3, 4]]
-        y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
-        y_lod = [[0, 2, 4, 4, 6]]
+        x_lod = [[1, 1, 1, 1]]
+        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
+        y_lod = [[2, 2, 2, 2]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
 class TestSequenceExpandCase4(TestSequenceExpand):
     def set_data(self):
-        x_data = np.array(
-            [0.1, 0.3, 0.2, 0.15, 0.25, 0.2, 0.15, 0.25, 0.1, 0.3]).reshape(
-                [2, 5]).astype('float32')
-        x_lod = [[
-            0,
-            1,
-            2,
-        ]]
-        y_data = np.random.uniform(0.1, 1, [2, 1]).astype('float32')
-        y_lod = [[0, 1, 2], [0, 1, 2]]
+        data = np.random.uniform(0.1, 1, [5 * 2, 1])
+        x_data = np.array(data).reshape([5, 2]).astype('float32')
+        x_lod = [[2, 3]]
+        y_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
+        y_lod = [[2], [2, 3]]
         self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
index efeab560392d8c03b1bb5db83f59c12d4fef64b0..68f2e5eba35ed318281d14e397dc6d363bcb4079 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
@@ -22,7 +22,7 @@ class TestSequenceReshape(OpTest):
     def setUp(self):
         self.op_type = 'sequence_reshape'
         dimension = 12
-        x_lod = [[0, 4, 5, 8, 11]]
+        x_lod = [[4, 1, 3, 3]]
         x = np.random.uniform(0.1, 1, [11, 24]).astype('float32')
 
         self.inputs = {'X': (x, x_lod)}
@@ -34,13 +34,13 @@ class TestSequenceReshape(OpTest):
 
     def compute_output(self, x, x_lod, dimension):
         x_width = x.shape[1]
-        out_lod = [[0]]
-        for i in xrange(len(x_lod[0]) - 1):
-            seq_len = x_lod[0][i + 1] - x_lod[0][i]
+        out_lod = [[]]
+        for i in xrange(len(x_lod[0])):
+            seq_len = x_lod[0][i]
             offset = (seq_len * x_width) / dimension
             assert int(offset) * dimension == seq_len * x_width
-            out_lod[0].append(out_lod[0][-1] + int(offset))
-        out = np.zeros(shape=(out_lod[0][-1], dimension)).astype('float32')
+            out_lod[0].append(int(offset))
+        out = np.zeros(shape=(sum(out_lod[0]), dimension)).astype('float32')
         out.ravel()[:] = x.ravel()[:]
         return out, out_lod
 
@@ -55,7 +55,7 @@ class TestSequenceReshape_reduce(TestSequenceReshape):
     def setUp(self):
         self.op_type = 'sequence_reshape'
         dimension = 24
-        x_lod = [[0, 4, 6, 8, 12]]
+        x_lod = [[4, 2, 2, 4]]
         x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
 
         self.inputs = {'X': (x, x_lod)}
@@ -70,7 +70,7 @@ class TestSequenceReshape_same(TestSequenceReshape):
     def setUp(self):
         self.op_type = 'sequence_reshape'
         dimension = 12
-        x_lod = [[0, 4, 6, 8, 12]]
+        x_lod = [[4, 2, 2, 4]]
         x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
 
         self.inputs = {'X': (x, x_lod)}
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
index 660b4a171d09ddfc0e78b650a467db6b576c7ee3..313e485d1e3080f2c59c68256cbc5c81aa6558cd 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
@@ -29,20 +29,20 @@ class TestSequenceSliceOp(OpTest):
 
         self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length}
         outs = []  #np.zeros((100, 3, 2)).astype('float32')
-        out_lod = [[0]]
-        out_lod_offset = 0
+        out_lod = [[]]
+        lod_offset = 0
         for i in range(len(offset)):
-            sub_x = x[lod[0][i] + offset[i, 0]:lod[0][i] + offset[i, 0] +
+            sub_x = x[lod_offset + offset[i, 0]:lod_offset + offset[i, 0] +
                       length[i, 0], :]
-            out_lod_offset = out_lod_offset + len(sub_x)
             outs.append(sub_x)
-            out_lod[0].append(out_lod_offset)
+            out_lod[0].append(len(sub_x))
+            lod_offset += lod[0][i]
         outs = np.concatenate(outs, axis=0)
         self.outputs = {'Out': (outs, out_lod)}
 
     def init_test_case(self):
         self.x_dim = (100, 3, 2)
-        self.x_lod = [[0, 20, 40, 60, 80, 100]]
+        self.x_lod = [[20, 20, 20, 20, 20]]
         self.offset = [[1], [2], [3], [4], [5]]
         self.length = [[10], [8], [6], [4], [2]]
 
diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
index 9e5c1e7a3d0bdf514de11e797d7139f577002c52..e91a69a0f8039651225039beb2a42e8dffeb62d3 100644
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
@@ -16,30 +16,54 @@ import unittest
 import numpy as np
 from op_test import OpTest
 from test_softmax_op import stable_softmax
+import paddle.fluid.core as core
 
 
 class TestSequenceSoftmaxOp(OpTest):
     def setUp(self):
         self.op_type = "sequence_softmax"
+        self.use_cudnn = False
+        self.init_op_type()
+
         x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
-        lod = [[0, 4, 5, 8, 11]]
+        lod = [[4, 1, 3, 3]]
 
         out = np.zeros((11, 1)).astype("float32")
-        for i in range(4):
-            sub_x = x[lod[0][i]:lod[0][i + 1], :]
-            sub_x = sub_x.reshape(1, lod[0][i + 1] - lod[0][i])
+        offset = 0
+        for i in range(len(lod[0])):
+            sub_x = x[offset:offset + lod[0][i], :]
+            sub_x = sub_x.reshape(1, lod[0][i])
             sub_out = stable_softmax(sub_x)
-            out[lod[0][i]:lod[0][i + 1], :] = sub_out.reshape(
-                lod[0][i + 1] - lod[0][i], 1)
+            out[offset:offset + lod[0][i], :] = sub_out.reshape(lod[0][i], 1)
+            offset += lod[0][i]
 
         self.inputs = {"X": (x, lod)}
         self.outputs = {"Out": out}
+        self.attrs = {'use_cudnn': self.use_cudnn, }
+
+    def init_op_type(self):
+        pass
 
     def test_check_output(self):
-        self.check_output()
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["X"], "Out", max_relative_error=0.01)
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ["X"], "Out", max_relative_error=0.01)
+        else:
+            self.check_grad(["X"], "Out", max_relative_error=0.01)
+
+
+# ----------------cudnn Sequencesoftmax----------------
+class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp):
+    def init_op_type(self):
+        self.use_cudnn = True
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index c498b23db12cd83304f4c3a3d1f15bd68ad4f0b6..3126293f9d8e52daa866be5fc1533648a33f3363 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -97,5 +97,72 @@ class TestSparseSGDOp(unittest.TestCase):
             self.check_with_place(place)
 
 
+class TestSGDOpOptimizeSelectedRows(unittest.TestCase):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        row_width = 12
+        # create and initialize Grad Variable
+        grad_height = 10
+        grad_rows = [0, 4, 7]
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(grad_height)
+        grad_selected_rows.set_rows(grad_rows)
+        grad_array = np.ones((len(grad_rows), row_width)).astype("float32")
+        grad_array[0, 0] = 2.0
+        grad_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(grad_array, place)
+
+        # create and initialize Param Variable
+        # create and initialize W Variable
+        param_rows = [0, 1, 2, 3, 4, 5, 6, 7]
+
+        # init Param
+        w_selected_rows = scope.var('Param').get_selected_rows()
+        w_selected_rows.set_height(len(param_rows))
+        w_selected_rows.set_rows(param_rows)
+        w_array = np.ones((len(param_rows), row_width)).astype("float32")
+        for i in range(len(param_rows)):
+            w_array[i] *= i
+        w_tensor = w_selected_rows.get_tensor()
+        w_tensor.set(w_array, place)
+
+        w_before_optimize = np.array(w_tensor)
+
+        # create and initialize LeraningRate Variable
+        lr_value = 0.1
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.full((1), lr_value).astype("float32")
+        lr.set(lr_array, place)
+
+        # optimize with Python
+        w_after_optimize = np.copy(w_before_optimize)
+        for index, id in enumerate(grad_rows):
+            w_after_optimize[id] = w_before_optimize[
+                id] - lr_value * grad_array[index]
+
+        # create and run sgd operator
+        sgd_op = Operator(
+            "sgd",
+            Param='Param',
+            Grad='Grad',
+            ParamOut='Param',
+            LearningRate='LearningRate')
+        sgd_op.run(scope, place)
+
+        # get and compare result
+        result_array = np.array(w_tensor)
+        assert (result_array == w_after_optimize).all()
+
+    def test_sparse_parameter_sgd(self):
+        places = [core.CPUPlace()]
+        # do not support GPU kernel currently
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a62ee050075cb8c9f8817c142825a89c24bdfedf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_shape_op.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestShapeOp(OpTest):
+    def setUp(self):
+        self.op_type = "shape"
+        self.config()
+        self.shape = [2, 3]
+        input = np.zeros(self.shape)
+        self.inputs = {'Input': input}
+        self.outputs = {'Out': np.array(self.shape)}
+
+    def config(self):
+        self.shape = [2, 3]
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class case1(TestShapeOp):
+    def config(self):
+        self.shape = [2]
+
+
+class case2(TestShapeOp):
+    def config(self):
+        self.shape = [1, 2, 3]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
index 1d93230e7b74c5b6c00bbe125e3ae2d3a649b4b9..b779f0fb014bbba62927754ea6f36828a32e6c0a 100644
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
@@ -54,12 +54,12 @@ class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase):
     def test_refer_lod(self):
         cpu = core.CPUPlace()
         x_tensor = core.LoDTensor()
-        x_tensor.set_lod([[0, 2, 5, 6]])
+        x_tensor.set_recursive_sequence_lengths([[2, 3, 1]])
         tensor_np = np.random.random(size=(6, 100)).astype('float32')
         x_tensor.set(tensor_np, cpu)
 
         rank_table_tensor = core.LoDTensor()
-        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
         rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
                               cpu)
 
@@ -83,7 +83,7 @@ class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase):
         x_tensor.set(tensor_np, cpu)
 
         rank_table_tensor = core.LoDTensor()
-        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set_recursive_sequence_lengths([[1, 2, 3]])
         rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
                               cpu)
 
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a48bce3bb7c74551a365fd471f6869b128babac
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -0,0 +1,62 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSliceOp(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.inputs = {'Input': self.input}
+        self.outputs = {'Out': self.out}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends
+        }
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [1, 0, 2]
+        self.ends = [3, 3, 4]
+        self.axes = [0, 1, 2]
+        self.out = self.input[1:3, 0:3, 2:4, :]
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCase1(TestSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 2]
+        self.out = self.input[-3:3, 0:100, 2:-1, :]
+
+
+class TestCase2(TestSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_slice_var.py b/python/paddle/fluid/tests/unittests/test_slice_var.py
new file mode 100644
index 0000000000000000000000000000000000000000..82305b23a1a1e2cee8cef6b291d848581fe5b509
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_slice_var.py
@@ -0,0 +1,64 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import unittest
+from paddle.fluid.transpiler.distribute_transpiler import slice_variable
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import random
+
+
+class TestSliceVar(unittest.TestCase):
+    def check_slice_output(self, shapes, expected_sizes, min_size):
+        var_list = []
+        program = fluid.Program()
+        for shape in shapes:
+            var = program.global_block().create_var(
+                name=str(random.randint(10000, 99999)),
+                persistable=True,
+                # dtype=core.VarDesc.VarType.LOD_TENSOR,
+                shape=shape)
+            var_list.append(var)
+        blocks = slice_variable(var_list, 10, min_size)
+        all_sizes = []
+        for s in expected_sizes:
+            for s2 in s:
+                all_sizes.append(s2)
+        for i, block_str in enumerate(blocks):
+            varname, block_id, size = block_str.split(":")
+            self.assertEqual(int(size), all_sizes[i])
+
+    def test_1k(self):
+        shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10]]
+        expected_sizes = [
+            [15], [1024],
+            [2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 784],
+            [2040, 2040, 2040, 2040],
+            [1150, 1150, 1150, 1150, 1150, 1150, 1100]
+        ]
+
+        self.check_slice_output(shapes, expected_sizes, 1024)
+
+    def test_check_output_8k(self):
+        shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10],
+                  [6, 33, 33, 33]]
+        expected_sizes = [[15], [1024], [10976, 10976], [8160], [8000],
+                          [35937, 35937, 35937, 35937, 35937, 35937]]
+
+        self.check_slice_output(shapes, expected_sizes, 8192)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 8f8312edca7e2d98eb4e881f671c6afdda01c57a..279f3073f73d1c36f54bb901d92441a7403ac23f 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -15,6 +15,7 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
 
 
 def stable_softmax(x):
@@ -27,18 +28,72 @@ def stable_softmax(x):
 class TestSoftmaxOp(OpTest):
     def setUp(self):
         self.op_type = "softmax"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
-        }
-        self.outputs = {
-            'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+        self.use_cudnn = False
+        self.use_mkldnn = False
+        self.dtype = np.float32
+        self.init_kernel_type()
+
+        x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype)
+        out = np.apply_along_axis(stable_softmax, 1, x)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+        self.attrs = {
+            'use_cudnn': self.use_cudnn,
+            'use_mkldnn': self.use_mkldnn
         }
 
+    def init_kernel_type(self):
+        pass
+
     def test_check_output(self):
-        self.check_output()
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        if self.dtype == np.float16:
+            return
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ["X"], "Out", max_relative_error=0.01)
+        else:
+            self.check_grad(["X"], "Out", max_relative_error=0.01)
+
+
+class TestSoftmaxCUDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+
+
+class TestSoftmaxFP16Op(TestSoftmaxOp):
+    def init_kernel_type(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
+
+class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
+        self.use_cudnn = True
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(place):
+                self.check_output_with_place(place, atol=1e-3)
+
+
+class TestSoftmaxMKLDNNOp(TestSoftmaxOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index 889fea2ce66e64d439b51498722e571f48cd1f96..c0d9fc8f22a7c4f791d80a9cad87d003b5d54299 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -26,7 +26,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
 
     def setUp(self):
         self.op_type = "softmax_with_cross_entropy"
-        batch_size = 2
+        batch_size = 41
         class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
@@ -59,7 +59,7 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
 
     def setUp(self):
         self.op_type = "softmax_with_cross_entropy"
-        batch_size = 2
+        batch_size = 41
         class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
index 02cc7da84918041c33bf5c8def46025bc87a2b9e..0916ed7c9f1e2d6d90c6908983fdc8b177aecbb9 100644
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
@@ -56,7 +56,7 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
     def test_split_and_merge_lod_tensor_level_0(self):
         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
 
         mask_np = np.array([0, 1, 0]).astype('bool')
         mask_np = np.expand_dims(mask_np, axis=1)
@@ -68,15 +68,15 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
         expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
         expect_true = core.LoDTensor()
         expect_true.set(expect_true_tensor, self.place())
-        expect_true.set_lod([[0, 6]])
+        expect_true.set_recursive_sequence_lengths([[6]])
 
         expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32')
         expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
-        expect_false_lod = [[0, 3, 4]]
+        expect_false_lod = [[3, 1]]
 
         expect_false = core.LoDTensor()
         expect_false.set(expect_false_tensor, self.place())
-        expect_false.set_lod(expect_false_lod)
+        expect_false.set_recursive_sequence_lengths(expect_false_lod)
 
         self.main(
             tensor=tensor,
@@ -126,7 +126,8 @@ class TestCPULoDTensorArrayOps(unittest.TestCase):
 
     def check_tensor_same(self, actual, expect):
         self.assertTrue(np.allclose(np.array(actual), np.array(expect)))
-        self.assertEqual(actual.lod(), expect.lod())
+        self.assertEqual(actual.recursive_sequence_lengths(),
+                         expect.recursive_sequence_lengths())
 
 
 class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
@@ -151,7 +152,7 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
 
         tensor = core.LoDTensor()
         tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
-        tensor.set_lod([[0, 3, 9, 10]])
+        tensor.set_recursive_sequence_lengths([[3, 6, 1]])
 
         mask_np = np.array([0, 1, 0]).astype('bool')
         mask_np = np.expand_dims(mask_np, axis=1)
diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9f0a06a56b42952800411d548bb3fc1732e031e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
@@ -0,0 +1,35 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSplitIdsOp(OpTest):
+    def setUp(self):
+        self.op_type = "split_ids"
+        ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
+        out0 = np.array([[0], [3], [6]]).astype('int64')
+        out1 = np.array([[]]).astype('int64')
+        out2 = np.array([[2], [2], [5], [5]]).astype('int64')
+        self.inputs = {'Ids': ids}
+        self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index 887bdfe8b3608878bace5b857a71ada123b74b2f..eb49a53e54f4bdb6bcd6cb1991423970f29997bb 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -19,7 +19,7 @@ from op_test import OpTest
 
 class TestSplitOp(OpTest):
     def setUp(self):
-        self.op_type = "split"
+        self._set_op_type()
         axis = 1
         x = np.random.random((4, 5, 6)).astype('float32')
         out = np.split(x, [2, 3], axis)
@@ -28,6 +28,9 @@ class TestSplitOp(OpTest):
         self.outputs = {'Out': [('out%d' % i, out[i]) \
             for i in xrange(len(out))]}
 
+    def _set_op_type(self):
+        self.op_type = "split"
+
     def test_check_output(self):
         self.check_output()
 
@@ -35,5 +38,10 @@ class TestSplitOp(OpTest):
         self.check_grad(['X'], ['out0', 'out1', 'out2'])
 
 
+class TestSplitByrefOp(OpTest):
+    def _set_op_type(self):
+        self.op_type = "split_byref"
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_split_var.py b/python/paddle/fluid/tests/unittests/test_split_var.py
deleted file mode 100644
index 104ceb4fe7beb70b9016f57cef0ef895a3eb8ba6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_split_var.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import unittest
-from paddle.fluid.distribute_transpiler import split_dense_variable
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import random
-
-
-class TestSplitVar(unittest.TestCase):
-    def test_check_output(self):
-        # split below shapes to 10 servers
-        shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10]]
-        expected_sizes = [
-            [15], [1024],
-            [2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 784],
-            [2040, 2040, 2040, 2040],
-            [1150, 1150, 1150, 1150, 1150, 1150, 1100]
-        ]
-        var_list = []
-        program = fluid.Program()
-        for shape in shapes:
-            var = program.global_block().create_var(
-                name=str(random.randint(10000, 99999)),
-                persistable=True,
-                # dtype=core.VarDesc.VarType.LOD_TENSOR,
-                shape=shape)
-            var_list.append(var)
-        blocks = split_dense_variable(var_list, 10)
-        all_sizes = []
-        for s in expected_sizes:
-            for s2 in s:
-                all_sizes.append(s2)
-        for i, block_str in enumerate(blocks):
-            varname, block_id, size = block_str.split(":")
-            self.assertEqual(int(size), all_sizes[i])
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7956897d68a3fb49d62ba696d0b6400b4f909989
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from test_sum_op import TestSumOp
+
+
+class TestMKLDNN(TestSumOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index 2faf5b10647a1fa1d44e4847f017db177ee8808a..1d90414e137a70e6265042e24e106fe565802778 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -20,12 +20,15 @@ from op_test import OpTest
 class TestSumOp(OpTest):
     def setUp(self):
         self.op_type = "sum"
+        self.use_mkldnn = False
+        self.init_kernel_type()
         x0 = np.random.random((3, 4)).astype('float32')
         x1 = np.random.random((3, 4)).astype('float32')
         x2 = np.random.random((3, 4)).astype('float32')
         self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
         y = x0 + x1 + x2
         self.outputs = {'Out': y}
+        self.attrs = {'use_mkldnn': self.use_mkldnn}
 
     def test_check_output(self):
         self.check_output()
@@ -33,6 +36,9 @@ class TestSumOp(OpTest):
     def test_check_grad(self):
         self.check_grad(['x0'], 'Out')
 
+    def init_kernel_type(self):
+        pass
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
old mode 100755
new mode 100644
index ccb41e56c5555b8c79674449c9139ada0bc47aac..bd208897520122b6a5dcf71da325b1b9dba632f6
--- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
@@ -22,22 +22,23 @@ def gen_match_and_neg_indices(num_prior, gt_lod, neg_lod):
     if len(gt_lod) != len(neg_lod):
         raise AssertionError("The input arguments are illegal.")
 
-    batch_size = len(gt_lod) - 1
+    batch_size = len(gt_lod)
 
     match_indices = -1 * np.ones((batch_size, num_prior)).astype('int32')
-    neg_indices = np.zeros((neg_lod[-1], 1)).astype('int32')
+    neg_indices = np.zeros((sum(neg_lod), 1)).astype('int32')
 
+    offset = 0
     for n in range(batch_size):
-        gt_num = gt_lod[n + 1] - gt_lod[n]
+        gt_num = gt_lod[n]
         ids = random.sample([i for i in range(num_prior)], gt_num)
         match_indices[n, ids] = [i for i in range(gt_num)]
 
         ret_ids = set([i for i in range(num_prior)]) - set(ids)
-        s = neg_lod[n]
-        e = neg_lod[n + 1]
-        l = e - s
+        l = neg_lod[n]
         neg_ids = random.sample(ret_ids, l)
-        neg_indices[s:e, :] = np.array(neg_ids).astype('int32').reshape(l, 1)
+        neg_indices[offset:offset + neg_lod[n], :] = np.array(neg_ids).astype(
+            'int32').reshape(l, 1)
+        offset += neg_lod[n]
 
     return match_indices, neg_indices
 
@@ -56,24 +57,28 @@ def target_assign(encoded_box, gt_label, match_indices, neg_indices, gt_lod,
     # init weight for target label
     trg_label_wt = np.zeros((batch_size, num_prior, 1)).astype('float32')
 
+    gt_offset = 0
+    neg_offset = 0
     for i in range(batch_size):
         cur_indices = match_indices[i]
         col_ids = np.where(cur_indices > -1)
         col_val = cur_indices[col_ids]
 
-        gt_start = gt_lod[i]
         # target bbox
-        for v, c in zip(col_val + gt_start, col_ids[0].tolist()):
+        for v, c in zip(col_val + gt_offset, col_ids[0].tolist()):
             trg_box[i][c][:] = encoded_box[v][c][:]
         # weight for target bbox
         trg_box_wt[i][col_ids] = 1.0
 
-        trg_label[i][col_ids] = gt_label[col_val + gt_start]
+        trg_label[i][col_ids] = gt_label[col_val + gt_offset]
         trg_label_wt[i][col_ids] = 1.0
         # set target label weight to 1.0 for the negative samples
         if neg_indices is not None:
-            neg_ids = neg_indices[neg_lod[i]:neg_lod[i + 1]]
+            neg_ids = neg_indices[neg_offset:neg_offset + neg_lod[i]]
             trg_label_wt[i][neg_ids] = 1.0
+        # update offset
+        gt_offset += gt_lod[i]
+        neg_offset += neg_lod[i]
 
     return trg_box, trg_box_wt, trg_label, trg_label_wt
 
@@ -83,11 +88,11 @@ class TestTargetAssginFloatType(OpTest):
         self.op_type = "target_assign"
         num_prior = 120
         num_class = 21
-        gt_lod = [0, 5, 11, 23]
-        neg_lod = [0, 4, 7, 13]
+        gt_lod = [5, 6, 12]
+        neg_lod = [4, 3, 6]
         mismatch_value = 0
-        batch_size = len(gt_lod) - 1
-        num_gt = gt_lod[-1]
+        batch_size = len(gt_lod)
+        num_gt = sum(gt_lod)
 
         encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
         gt_label = np.random.randint(
@@ -121,11 +126,11 @@ class TestTargetAssginIntType(OpTest):
         self.op_type = "target_assign"
         num_prior = 120
         num_class = 21
-        gt_lod = [0, 5, 11, 23]
-        neg_lod = [0, 4, 7, 13]
+        gt_lod = [5, 6, 12]
+        neg_lod = [4, 3, 6]
         mismatch_value = 0
-        batch_size = len(gt_lod) - 1
-        num_gt = gt_lod[-1]
+        batch_size = len(gt_lod)
+        num_gt = sum(gt_lod)
 
         encoded_box = np.random.random((num_gt, num_prior, 4)).astype('float32')
         gt_label = np.random.randint(
diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py
index a369783245ae2e35a9743ef1f4321ac919e58283..f17edd3025b17549892bbd47935a1d2452cefac3 100644
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
@@ -69,15 +69,14 @@ class TestTensor(unittest.TestCase):
         array[0, 0, 0] = 3
         array[3, 3, 5] = 10
         lod_tensor.set(array, place)
-        lod_tensor.set_lod([[0, 2, 4]])
+        lod_tensor.set_recursive_sequence_lengths([[2, 2]])
 
         lod_v = numpy.array(lod_tensor)
         self.assertTrue(numpy.alltrue(array == lod_v))
 
-        lod = lod_tensor.lod()
-        self.assertEqual(0, lod[0][0])
+        lod = lod_tensor.recursive_sequence_lengths()
+        self.assertEqual(2, lod[0][0])
         self.assertEqual(2, lod[0][1])
-        self.assertEqual(4, lod[0][2])
 
     def test_float_lod_tensor(self):
         place = core.CPUPlace()
@@ -97,21 +96,21 @@ class TestTensor(unittest.TestCase):
         lod_v = numpy.array(lod_tensor)
         self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertEqual(len(lod_tensor.lod()), 0)
+        self.assertEqual(len(lod_tensor.recursive_sequence_lengths()), 0)
 
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
-        lod_tensor.set_lod(lod_py)
-        lod = lod_tensor.lod()
+        lod_py = [[2, 1], [1, 2, 2]]
+        lod_tensor.set_recursive_sequence_lengths(lod_py)
+        lod = lod_tensor.recursive_sequence_lengths()
         self.assertListEqual(lod_py, lod)
 
     def test_lod_tensor_init(self):
         scope = core.Scope()
         place = core.CPUPlace()
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_py = [[2, 1], [1, 2, 2]]
         lod_tensor = core.LoDTensor()
 
         lod_tensor.set_dims([5, 2, 3, 4])
-        lod_tensor.set_lod(lod_py)
+        lod_tensor.set_recursive_sequence_lengths(lod_py)
         lod_tensor.alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
@@ -121,18 +120,17 @@ class TestTensor(unittest.TestCase):
         lod_v = numpy.array(lod_tensor)
         self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertListEqual(lod_py, lod_tensor.lod())
+        self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths())
 
     def test_lod_tensor_gpu_init(self):
         if not core.is_compiled_with_cuda():
             return
-        scope = core.Scope()
         place = core.CUDAPlace(0)
-        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_py = [[2, 1], [1, 2, 2]]
         lod_tensor = core.LoDTensor()
 
         lod_tensor.set_dims([5, 2, 3, 4])
-        lod_tensor.set_lod(lod_py)
+        lod_tensor.set_recursive_sequence_lengths(lod_py)
         lod_tensor.alloc_float(place)
         tensor_array = numpy.array(lod_tensor)
         tensor_array[0, 0, 0, 0] = 1.0
@@ -142,7 +140,26 @@ class TestTensor(unittest.TestCase):
         lod_v = numpy.array(lod_tensor)
         self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
         self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
-        self.assertListEqual(lod_py, lod_tensor.lod())
+        self.assertListEqual(lod_py, lod_tensor.recursive_sequence_lengths())
+
+    def test_empty_tensor(self):
+        place = core.CPUPlace()
+        scope = core.Scope()
+        var = scope.var("test_tensor")
+
+        tensor = var.get_tensor()
+
+        tensor.set_dims([0, 1])
+        tensor.alloc_float(place)
+
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((0, 1), tensor_array.shape)
+
+        if core.is_compiled_with_cuda():
+            gpu_place = core.CUDAPlace(0)
+            tensor.alloc_float(gpu_place)
+            tensor_array = numpy.array(tensor)
+            self.assertEqual((0, 1), tensor_array.shape)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
index 75ff85a55fc4fd504ecd032e17f7e189c17192fb..346a949b6e7c96b5535f5e65ddbada11e110a0a7 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
@@ -15,6 +15,16 @@
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+def output_hist(out):
+    hist, _ = np.histogram(out, range=(-5, 10))
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.1 * np.ones((10))
+    return hist, prob
 
 
 class TestUniformRandomOp(OpTest):
@@ -33,11 +43,37 @@ class TestUniformRandomOp(OpTest):
         self.check_output_customized(self.verify_output)
 
     def verify_output(self, outs):
-        tensor = outs[0]
-        hist, _ = np.histogram(outs[0], range=(-5, 10))
-        hist = hist.astype("float32")
-        hist /= float(outs[0].size)
-        prob = 0.1 * np.ones((10))
+        hist, prob = output_hist(np.array(outs[0]))
+        self.assertTrue(
+            np.allclose(
+                hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
+
+
+class TestUniformRandomOpSelectedRows(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        out = scope.var("X").get_selected_rows()
+
+        op = Operator(
+            "uniform_random",
+            Out="X",
+            shape=[4, 784],
+            min=-5.0,
+            max=10.0,
+            seed=10)
+        op.run(scope, place)
+        self.assertEqual(out.get_tensor().shape(), [4, 784])
+        hist, prob = output_hist(np.array(out.get_tensor()))
         self.assertTrue(
             np.allclose(
                 hist, prob, rtol=0, atol=0.01), "hist: " + str(hist))
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index ac638f7836f8205f80e31cfd5eb8892b2c7aee08..9f1aaee472f918da7deb8816a0a4654dafe74a30 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -34,8 +34,8 @@ class CTCForward(object):
 
         self.level = 0
         self.num_classes = softmax.shape[1]
-        self.batch_size = len(softmax_lod[self.level]) - 1
-        assert self.batch_size == len(labels_lod[self.level]) - 1
+        self.batch_size = len(softmax_lod[self.level])
+        assert self.batch_size == len(labels_lod[self.level])
 
         self.loss = np.zeros([self.batch_size, 1], dtype="float32")
         self.gradient = np.zeros(self.softmax.shape, dtype="float32")
@@ -156,16 +156,20 @@ class CTCForward(object):
         return -log_prob
 
     def forward(self):
+        softmax_offset = 0
+        labels_offset = 0
         for i in range(self.batch_size):
-            softmax_start_i = self.softmax_lod[self.level][i]
-            softmax_end_i = self.softmax_lod[self.level][i + 1]
-            labels_start_i = self.labels_lod[self.level][i]
-            labels_end_i = self.labels_lod[self.level][i + 1]
+            softmax_start_i = softmax_offset
+            softmax_end_i = softmax_offset + self.softmax_lod[self.level][i]
+            labels_start_i = labels_offset
+            labels_end_i = labels_offset + self.labels_lod[self.level][i]
 
             softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :]
             labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
             self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
                                                    labels_a_sequence)
+            softmax_offset += self.softmax_lod[self.level][i]
+            labels_offset += self.labels_lod[self.level][i]
         return self.loss
 
 
@@ -173,8 +177,8 @@ class TestWarpCTCOp(OpTest):
     def config(self):
         self.batch_size = 4
         self.num_classes = 8
-        self.logits_lod = [[0, 4, 5, 8, 11]]
-        self.labels_lod = [[0, 3, 4, 8, 12]]
+        self.logits_lod = [[4, 1, 3, 3]]
+        self.labels_lod = [[3, 1, 4, 4]]
         self.blank = self.num_classes - 1
         self.norm_by_times = False
 
@@ -184,11 +188,13 @@ class TestWarpCTCOp(OpTest):
 
         logits = np.random.uniform(
             0.1, 1.0,
-            [self.logits_lod[0][-1], self.num_classes]).astype("float32")
+            [sum(self.logits_lod[0]), self.num_classes]).astype("float32")
         softmax = np.apply_along_axis(stable_softmax, 1, logits)
         # labels should not be blank
         labels = np.random.randint(
-            0, self.num_classes - 1, [self.labels_lod[0][-1], 1], dtype="int32")
+            0,
+            self.num_classes - 1, [sum(self.labels_lod[0]), 1],
+            dtype="int32")
 
         ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
                          self.blank, self.norm_by_times)
@@ -196,9 +202,8 @@ class TestWarpCTCOp(OpTest):
 
         max_sequence_length = 0
         for i in range(self.batch_size):
-            max_sequence_length = max(
-                max_sequence_length,
-                self.logits_lod[0][i + 1] - self.logits_lod[0][i])
+            max_sequence_length = max(max_sequence_length,
+                                      self.logits_lod[0][i])
         self.gradient = np.zeros(
             [max_sequence_length, self.batch_size, self.num_classes],
             dtype="float32")
@@ -222,8 +227,8 @@ class TestWarpCTCOpCase1(TestWarpCTCOp):
     def config(self):
         self.batch_size = 4
         self.num_classes = CUDA_BLOCK_SIZE + 2
-        self.logits_lod = [[0, 4, 5, 8, 11]]
-        self.labels_lod = [[0, 3, 4, 8, 12]]
+        self.logits_lod = [[4, 1, 3, 3]]
+        self.labels_lod = [[3, 1, 4, 4]]
         self.blank = 0
         self.norm_by_times = False
 
diff --git a/python/paddle/fluid/tests/unittests/test_weight_normalization.py b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
index 2adf917bc5d3bb35842a817c57a983627b759f22..436f9b9f86fb86270e47c8e30c5c0701787ca0f1 100644
--- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
@@ -76,11 +76,11 @@ class TestWeightNormalization(unittest.TestCase):
                 lod_level_i = numpy.random.randint(
                     low=1,
                     high=5,
-                    size=self.batch_size if i == 0 else lod_level_i[-1])
-                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                    size=self.batch_size
+                    if i == 0 else sum(lod_level_i)).tolist()
                 data_lod.append(lod_level_i)
             data_value = numpy.random.random(
-                size=[data_lod[-1][-1] if data_lod else self.batch_size
+                size=[sum(data_lod[-1]) if data_lod else self.batch_size
                       ] + data_shape).astype('float32')
             self.data[data_name] = (data_value, data_lod)
 
@@ -90,7 +90,7 @@ class TestWeightNormalization(unittest.TestCase):
             tensor = fluid.Tensor()
             tensor.set(self.data[desc[0]][0], place)
             if self.data[desc[0]][1]:
-                tensor.set_lod(self.data[desc[0]][1])
+                tensor.set_recursive_sequence_lengths(self.data[desc[0]][1])
             self.inputs[desc[0]] = tensor
 
     def weight_normalize(self):
diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py
new file mode 100644
index 0000000000000000000000000000000000000000..a995ee10f29a714b674fae4b31070e6ba2ca9953
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
@@ -0,0 +1,182 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+
+
+def as_lodtensor(np_array, lod, place):
+    tensor = core.LoDTensor()
+    tensor.set(np_value, place)
+    if lod is not None:
+        tensor.set_recursive_sequence_lengths(lod)
+    return tensor
+
+
+def create_op(scope, op_type, inputs, outputs, attrs):
+    kwargs = dict()
+
+    op_maker = core.op_proto_and_checker_maker
+    op_role_attr_name = op_maker.kOpRoleAttrName()
+
+    if op_role_attr_name not in attrs:
+        attrs[op_role_attr_name] = int(op_maker.OpRole.Forward)
+
+    def __create_var__(name, var_name):
+        scope.var(var_name).get_tensor()
+        kwargs[name].append(var_name)
+
+    for in_name, in_dup in Operator.get_op_inputs(op_type):
+        if in_name in inputs:
+            kwargs[in_name] = []
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, _ = item[0], item[1]
+                    __create_var__(in_name, sub_in_name)
+            else:
+                __create_var__(in_name, in_name)
+
+    for out_name, out_dup in Operator.get_op_outputs(op_type):
+        if out_name in outputs:
+            kwargs[out_name] = []
+            if out_dup:
+                sub_out = outputs[out_name]
+                for item in sub_out:
+                    sub_out_name, _ = item[0], item[1]
+                    __create_var__(out_name, sub_out_name)
+            else:
+                __create_var__(out_name, out_name)
+
+    for attr_name in Operator.get_op_attr_names(op_type):
+        if attr_name in attrs:
+            kwargs[attr_name] = attrs[attr_name]
+
+    return Operator(op_type, **kwargs)
+
+
+def set_input(scope, op, inputs, place):
+    def __set_input__(var_name, var):
+        if isinstance(var, tuple) or isinstance(var, np.ndarray):
+            tensor = scope.find_var(var_name).get_tensor()
+            if isinstance(var, tuple):
+                tensor.set_recursive_sequence_lengths(var[1])
+                var = var[0]
+            tensor.set_dims(var.shape)
+            tensor.set(var, place)
+        elif isinstance(var, float):
+            scope.find_var(var_name).set_float(var)
+        elif isinstance(var, int):
+            scope.find_var(var_name).set_int(var)
+
+    for in_name, in_dup in Operator.get_op_inputs(op.type()):
+        if in_name in inputs:
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, sub_in_val = item[0], item[1]
+                    __set_input__(sub_in_name, sub_in_val)
+            else:
+                __set_input__(in_name, inputs[in_name])
+
+
+def append_input_output(block, op_proto, np_list, is_input, dtype):
+    '''Insert VarDesc and generate Python variable instance'''
+    proto_list = op_proto.inputs if is_input else op_proto.outputs
+
+    def create_var(block, name, np_list, var_proto):
+        dtype = None
+        shape = None
+        lod_level = None
+        if name not in np_list:
+            assert var_proto.intermediate, "{} not found".format(name)
+        else:
+            np_value = np_list[name]
+            if isinstance(np_value, tuple):
+                dtype = np_value[0].dtype
+                # output shape, lod should be infered from input.
+                if is_input:
+                    shape = list(np_value[0].shape)
+                    lod_level = len(np_value[1])
+            else:
+                dtype = np_value.dtype
+                if is_input:
+                    shape = list(np_value.shape)
+                    lod_level = 0
+        return block.create_var(
+            dtype=dtype, shape=shape, lod_level=lod_level, name=name)
+
+    var_dict = {}
+    for var_proto in proto_list:
+        var_name = str(var_proto.name)
+        if is_input:
+            if (var_name not in np_list) and var_proto.dispensable:
+                continue
+            assert (var_name in np_list) or (var_proto.dispensable), \
+                "Missing {} as input".format(var_name)
+        if var_proto.duplicable:
+            assert isinstance(np_list[var_name], list), \
+                "Duplicable {} should be set as list".format(var_name)
+            var_list = []
+            for (name, np_value) in np_list[var_name]:
+                var_list.append(
+                    create_var(block, name, {name: np_value}, var_proto))
+            var_dict[var_name] = var_list
+        else:
+            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
+
+    return var_dict
+
+
+def append_loss_ops(block, output_names):
+    mean_inputs = map(block.var, output_names)
+    # for item in mean_inputs:
+    #     print(item)
+    #     print("Item", item.dtype)
+
+    if len(mean_inputs) == 1:
+        loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
+        op = block.append_op(
+            inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+    else:
+        avg_sum = []
+        for cur_loss in mean_inputs:
+            cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
+            op = block.append_op(
+                inputs={"X": [cur_loss]},
+                outputs={"Out": [cur_avg_loss]},
+                type="mean")
+            op.desc.infer_var_type(block.desc)
+            op.desc.infer_shape(block.desc)
+            avg_sum.append(cur_avg_loss)
+
+        loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
+        op_sum = block.append_op(
+            inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
+        op_sum.desc.infer_var_type(block.desc)
+        op_sum.desc.infer_shape(block.desc)
+
+        loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
+        op_loss = block.append_op(
+            inputs={"X": loss_sum},
+            outputs={"Out": loss},
+            type='scale',
+            attrs={'scale': 1.0 / float(len(avg_sum))})
+        op_loss.desc.infer_var_type(block.desc)
+        op_loss.desc.infer_shape(block.desc)
+    return loss
diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c62792face3c353db1f2e3c77eaf4bd32fbded69
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
@@ -0,0 +1,487 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+import numpy as np
+
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+pos_enc_param_names = (
+    "src_pos_enc_table",
+    "trg_pos_enc_table", )
+
+batch_size = 64
+
+
+def position_encoding_init(n_position, d_pos_vec):
+    """
+    Generate the initial values for the sinusoid position encoding table.
+    """
+    position_enc = np.array([[
+        pos / np.power(10000, 2 * (j // 2) / d_pos_vec)
+        for j in range(d_pos_vec)
+    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
+    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
+    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
+    return position_enc.astype("float32")
+
+
+def multi_head_attention(queries,
+                         keys,
+                         values,
+                         attn_bias,
+                         d_key,
+                         d_value,
+                         d_model,
+                         n_head=1,
+                         dropout_rate=0.):
+    """
+    Multi-Head Attention. Note that attn_bias is added to the logit before
+    computing softmax activiation to mask certain selected positions so that
+    they will not considered in attention weights.
+    """
+    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
+        raise ValueError(
+            "Inputs: quries, keys and values should all be 3-D tensors.")
+
+    def __compute_qkv(queries, keys, values, n_head, d_key, d_value):
+        """
+        Add linear projection to queries, keys, and values.
+        """
+        q = layers.fc(input=queries,
+                      size=d_key * n_head,
+                      param_attr=fluid.initializer.Xavier(
+                          uniform=False,
+                          fan_in=d_model * d_key,
+                          fan_out=n_head * d_key),
+                      bias_attr=False,
+                      num_flatten_dims=2)
+        k = layers.fc(input=keys,
+                      size=d_key * n_head,
+                      param_attr=fluid.initializer.Xavier(
+                          uniform=False,
+                          fan_in=d_model * d_key,
+                          fan_out=n_head * d_key),
+                      bias_attr=False,
+                      num_flatten_dims=2)
+        v = layers.fc(input=values,
+                      size=d_value * n_head,
+                      param_attr=fluid.initializer.Xavier(
+                          uniform=False,
+                          fan_in=d_model * d_value,
+                          fan_out=n_head * d_value),
+                      bias_attr=False,
+                      num_flatten_dims=2)
+        return q, k, v
+
+    def __split_heads(x, n_head):
+        """
+        Reshape the last dimension of inpunt tensor x so that it becomes two
+        dimensions and then transpose. Specifically, input a tensor with shape
+        [bs, max_sequence_length, n_head * hidden_dim] then output a tensor
+        with shape [bs, n_head, max_sequence_length, hidden_dim].
+        """
+        if n_head == 1:
+            return x
+
+        hidden_size = x.shape[-1]
+        # FIXME(guosheng): Decouple the program desc with batch_size.
+        reshaped = layers.reshape(
+            x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])
+
+        # permuate the dimensions into:
+        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
+        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+
+    def __combine_heads(x):
+        """
+        Transpose and then reshape the last two dimensions of inpunt tensor x
+        so that it becomes one dimension, which is reverse to __split_heads.
+        """
+        if len(x.shape) == 3: return x
+        if len(x.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+
+        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+        # FIXME(guosheng): Decouple the program desc with batch_size.
+        return layers.reshape(
+            x=trans_x,
+            shape=map(int,
+                      [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]]))
+
+    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
+        """
+        Scaled Dot-Product Attention
+        """
+
+        # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op.
+
+        # The current implementation of softmax_op only supports 2D tensor,
+        # consequently it cannot be directly used here.
+        # If to use the reshape_op, Besides, the shape of product inferred in
+        # compile-time is not the actual shape in run-time. It cann't be used
+        # to set the attribute of reshape_op.
+        # So, here define the softmax for temporary solution.
+
+        def __softmax(x, eps=1e-9):
+            exp_out = layers.exp(x=x)
+            sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
+            return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
+
+        scaled_q = layers.scale(x=q, scale=d_model**-0.5)
+        product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+        weights = __softmax(layers.elementwise_add(x=product, y=attn_bias))
+        if dropout_rate:
+            weights = layers.dropout(
+                weights, dropout_prob=dropout_rate, is_test=False)
+        out = layers.matmul(weights, v)
+        return out
+
+    q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value)
+
+    q = __split_heads(q, n_head)
+    k = __split_heads(k, n_head)
+    v = __split_heads(v, n_head)
+
+    ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model,
+                                                  dropout_rate)
+
+    out = __combine_heads(ctx_multiheads)
+
+    # Project back to the model size.
+    proj_out = layers.fc(input=out,
+                         size=d_model,
+                         param_attr=fluid.initializer.Xavier(uniform=False),
+                         bias_attr=False,
+                         num_flatten_dims=2)
+    return proj_out
+
+
+def positionwise_feed_forward(x, d_inner_hid, d_hid):
+    """
+    Position-wise Feed-Forward Networks.
+    This module consists of two linear transformations with a ReLU activation
+    in between, which is applied to each position separately and identically.
+    """
+    hidden = layers.fc(input=x,
+                       size=d_inner_hid,
+                       num_flatten_dims=2,
+                       param_attr=fluid.initializer.Uniform(
+                           low=-(d_hid**-0.5), high=(d_hid**-0.5)),
+                       act="relu")
+    out = layers.fc(input=hidden,
+                    size=d_hid,
+                    num_flatten_dims=2,
+                    param_attr=fluid.initializer.Uniform(
+                        low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5)))
+    return out
+
+
+def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.):
+    """
+    Add residual connection, layer normalization and droput to the out tensor
+    optionally according to the value of process_cmd.
+
+    This will be used before or after multi-head attention and position-wise
+    feed-forward networks.
+    """
+    for cmd in process_cmd:
+        if cmd == "a":  # add residual connection
+            out = out + prev_out if prev_out else out
+        elif cmd == "n":  # add layer normalization
+            out = layers.layer_norm(
+                out,
+                begin_norm_axis=len(out.shape) - 1,
+                param_attr=fluid.initializer.Constant(1.),
+                bias_attr=fluid.initializer.Constant(0.))
+        elif cmd == "d":  # add dropout
+            if dropout:
+                out = layers.dropout(out, dropout_prob=dropout, is_test=False)
+    return out
+
+
+pre_process_layer = partial(pre_post_process_layer, None)
+post_process_layer = pre_post_process_layer
+
+
+def prepare_encoder(src_word,
+                    src_pos,
+                    src_vocab_size,
+                    src_emb_dim,
+                    src_pad_idx,
+                    src_max_len,
+                    dropout=0.,
+                    pos_pad_idx=0,
+                    pos_enc_param_name=None):
+    """Add word embeddings and position encodings.
+    The output tensor has a shape of:
+    [batch_size, max_src_length_in_batch, d_model].
+
+    This module is used at the bottom of the encoder stacks.
+    """
+    src_word_emb = layers.embedding(
+        src_word,
+        size=[src_vocab_size, src_emb_dim],
+        padding_idx=src_pad_idx,
+        param_attr=fluid.initializer.Normal(0., 1.))
+    src_pos_enc = layers.embedding(
+        src_pos,
+        size=[src_max_len, src_emb_dim],
+        padding_idx=pos_pad_idx,
+        param_attr=fluid.ParamAttr(
+            name=pos_enc_param_name, trainable=False))
+    enc_input = src_word_emb + src_pos_enc
+
+    # FIXME(guosheng): Decouple the program desc with batch_size.
+    enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim])
+    return layers.dropout(
+        enc_input, dropout_prob=dropout,
+        is_test=False) if dropout else enc_input
+
+
+prepare_encoder = partial(
+    prepare_encoder, pos_enc_param_name=pos_enc_param_names[0])
+prepare_decoder = partial(
+    prepare_encoder, pos_enc_param_name=pos_enc_param_names[1])
+
+
+def encoder_layer(enc_input,
+                  attn_bias,
+                  n_head,
+                  d_key,
+                  d_value,
+                  d_model,
+                  d_inner_hid,
+                  dropout_rate=0.):
+    """The encoder layers that can be stacked to form a deep encoder.
+
+    This module consits of a multi-head (self) attention followed by
+    position-wise feed-forward networks and both the two components companied
+    with the post_process_layer to add residual connection, layer normalization
+    and droput.
+    """
+    attn_output = multi_head_attention(enc_input, enc_input, enc_input,
+                                       attn_bias, d_key, d_value, d_model,
+                                       n_head, dropout_rate)
+    attn_output = post_process_layer(enc_input, attn_output, "dan",
+                                     dropout_rate)
+    ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model)
+    return post_process_layer(attn_output, ffd_output, "dan", dropout_rate)
+
+
+def encoder(enc_input,
+            attn_bias,
+            n_layer,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            dropout_rate=0.):
+    """
+    The encoder is composed of a stack of identical layers returned by calling
+    encoder_layer.
+    """
+    for i in range(n_layer):
+        enc_output = encoder_layer(enc_input, attn_bias, n_head, d_key, d_value,
+                                   d_model, d_inner_hid, dropout_rate)
+        enc_input = enc_output
+    return enc_output
+
+
+def decoder_layer(dec_input,
+                  enc_output,
+                  slf_attn_bias,
+                  dec_enc_attn_bias,
+                  n_head,
+                  d_key,
+                  d_value,
+                  d_model,
+                  d_inner_hid,
+                  dropout_rate=0.):
+    """ The layer to be stacked in decoder part.
+
+    The structure of this module is similar to that in the encoder part except
+    a multi-head attention is added to implement encoder-decoder attention.
+    """
+    slf_attn_output = multi_head_attention(
+        dec_input,
+        dec_input,
+        dec_input,
+        slf_attn_bias,
+        d_key,
+        d_value,
+        d_model,
+        n_head,
+        dropout_rate, )
+    slf_attn_output = post_process_layer(
+        dec_input,
+        slf_attn_output,
+        "dan",  # residual connection + dropout + layer normalization
+        dropout_rate, )
+    enc_attn_output = multi_head_attention(
+        slf_attn_output,
+        enc_output,
+        enc_output,
+        dec_enc_attn_bias,
+        d_key,
+        d_value,
+        d_model,
+        n_head,
+        dropout_rate, )
+    enc_attn_output = post_process_layer(
+        slf_attn_output,
+        enc_attn_output,
+        "dan",  # residual connection + dropout + layer normalization
+        dropout_rate, )
+    ffd_output = positionwise_feed_forward(
+        enc_attn_output,
+        d_inner_hid,
+        d_model, )
+    dec_output = post_process_layer(
+        enc_attn_output,
+        ffd_output,
+        "dan",  # residual connection + dropout + layer normalization
+        dropout_rate, )
+    return dec_output
+
+
+def decoder(dec_input,
+            enc_output,
+            dec_slf_attn_bias,
+            dec_enc_attn_bias,
+            n_layer,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            dropout_rate=0.):
+    """
+    The decoder is composed of a stack of identical decoder_layer layers.
+    """
+    for i in range(n_layer):
+        dec_output = decoder_layer(
+            dec_input,
+            enc_output,
+            dec_slf_attn_bias,
+            dec_enc_attn_bias,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            dropout_rate, )
+        dec_input = dec_output
+    return dec_output
+
+
+def transformer(
+        src_vocab_size,
+        trg_vocab_size,
+        max_length,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate,
+        src_pad_idx,
+        trg_pad_idx,
+        pos_pad_idx, ):
+    file_obj = fluid.layers.open_recordio_file(
+        filename='./wmt16.recordio',
+        shapes=[
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+            [batch_size, n_head, max_length, max_length],
+            [batch_size, n_head, max_length, max_length],
+            [batch_size, n_head, max_length, max_length],
+            [batch_size * max_length, 1],
+            [batch_size * max_length, 1],
+        ],
+        dtypes=[
+            'int64',
+            'int64',
+            'int64',
+            'int64',
+            'float32',
+            'float32',
+            'float32',
+            'int64',
+            'float32',
+        ],
+        lod_levels=[0] * 9)
+
+    src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file(
+        file_obj)
+
+    enc_input = prepare_encoder(
+        src_word,
+        src_pos,
+        src_vocab_size,
+        d_model,
+        src_pad_idx,
+        max_length,
+        dropout_rate, )
+    enc_output = encoder(
+        enc_input,
+        src_slf_attn_bias,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate, )
+
+    dec_input = prepare_decoder(
+        trg_word,
+        trg_pos,
+        trg_vocab_size,
+        d_model,
+        trg_pad_idx,
+        max_length,
+        dropout_rate, )
+    dec_output = decoder(
+        dec_input,
+        enc_output,
+        trg_slf_attn_bias,
+        trg_src_attn_bias,
+        n_layer,
+        n_head,
+        d_key,
+        d_value,
+        d_model,
+        d_inner_hid,
+        dropout_rate, )
+
+    # TODO(guosheng): Share the weight matrix between the embedding layers and
+    # the pre-softmax linear transformation.
+    predict = layers.reshape(
+        x=layers.fc(input=dec_output,
+                    size=trg_vocab_size,
+                    param_attr=fluid.initializer.Xavier(uniform=False),
+                    bias_attr=False,
+                    num_flatten_dims=2),
+        shape=[-1, trg_vocab_size],
+        act="softmax")
+
+    cost = layers.cross_entropy(input=predict, label=gold)
+    weighted_cost = cost * weights
+    return layers.reduce_sum(weighted_cost)
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6e0241265b18377874efb0d223441994b4650d0
--- /dev/null
+++ b/python/paddle/fluid/trainer.py
@@ -0,0 +1,604 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import os
+
+import core
+
+import data_feeder
+import executor
+import framework
+import io
+# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
+import optimizer as opt_module
+import parallel_executor
+from transpiler import distribute_transpiler
+
+__all__ = [
+    'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent',
+    'EndStepEvent', 'CheckpointConfig'
+]
+
+
+class BeginEpochEvent(object):
+    """
+    The begin of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+    """
+
+    def __init__(self, epoch_id):
+        self.epoch = epoch_id
+
+
+class EndEpochEvent(object):
+    """
+    The end of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+    """
+
+    def __init__(self, epoch_id):
+        self.epoch = epoch_id
+
+
+class BeginStepEvent(object):
+    """
+    The begin of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+        step_id(int): The current step ID.
+    """
+
+    def __init__(self, epoch_id, step_id):
+        self.epoch = epoch_id
+        self.step = step_id
+        self.fetch_metrics = True
+        """
+        If fetch_metrics is true, the metrics will be fetched at the 
+        EndStepEvent. Default is True.
+        """
+
+
+class EndStepEvent(object):
+    """
+    The end of a training step.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+        step_id(int): The current step ID.
+        metrics(list): A list of fetched tensor. The order of this list is same
+            as the :code:`train_func` returns.
+    """
+
+    def __init__(self, epoch_id, step_id, metrics):
+        self.epoch = epoch_id
+        self.step = step_id
+        self.metrics = metrics
+
+
+class CheckpointConfig(object):
+    """
+    Parameter object for :code:`fluid.io.save_checkpoint` and
+    :code:`fluid.Trainer`. Used to configuration how to save checkpoint.
+
+    Args:
+        checkpoint_dir(str): Directory path to save check point. Default is the
+            current directory.
+
+        max_num_checkpoints(int): The max number of local check points.
+        epoch_interval(int): Every number of epoch to save check point.
+        step_interval(int): Every number of step to save check point.
+
+    Examples:
+        >>> config = fluid.CheckpointConfig("./checkpoints")
+        >>> trainer = fluid.Trainer(train_func=train_program,
+        >>>                         place=place,
+        >>>                         optimizer_func=optimizer_func,
+        >>>                         checkpoint_config=config)
+        >>> trainer.train(...)
+    """
+
+    def __init__(self,
+                 checkpoint_dir=None,
+                 max_num_checkpoints=3,
+                 epoch_interval=1,
+                 step_interval=10):
+
+        assert epoch_interval >= 1
+        assert step_interval >= 1
+
+        self.checkpoint_dir = checkpoint_dir \
+            if checkpoint_dir is not None else os.getcwd()
+        self.max_num_checkpoints = max_num_checkpoints
+        self.epoch_interval = epoch_interval
+        self.step_interval = step_interval
+        self.epoch_id = 0
+        self.step_id = 0
+        self.load_serial = None
+        self.pserver_id = None
+        self.lookup_table_name = None
+
+
+def check_and_get_place(place):
+    """
+    Check the type of place or get the default place
+    Args:
+        place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on.
+
+    Raises:
+        TypeError if the type mismatched.
+
+    Returns:
+        the original place if it is not None.
+        if fluid is compiled with CUDA, returns CUDAPlace(0) by default.
+        Otherwise returns CPUPlace by default.
+    """
+    if place is None:
+        if core.is_compiled_with_cuda():
+            return core.CUDAPlace(0)
+        else:
+            return core.CPUPlace()
+    else:
+        if not isinstance(place, core.CUDAPlace) and not isinstance(
+                place, core.CPUPlace):
+            raise TypeError("Place should be either CUDAPlace or CPUPlace")
+        return place
+
+
+class Trainer(object):
+    """
+    A trainer wraps MultiGPU/MultiNode training loops and can be used to train a
+    simple neural network easily.
+
+    This API takes a :code:`train_func`. A :code:`train_func` is a function that
+    return loss as it first return value. The reset value can be fetched by
+    EndStepEvent.metrics
+
+    This API also takes a :code:`optimizer_func` that will return an optimizer
+    instance.
+
+    For example, to train a MLP for MNIST dataset, the sample program is
+
+    >>> import paddle.fluid as fluid
+    >>>
+    >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10):
+    >>>     hidden = image
+    >>>     for layer_size in layer_sizes:
+    >>>         hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation)
+    >>>     return fluid.layers.fc(input=hidden, size=num_classes, act="softmax")
+    >>>
+    >>> def train_mnist_mlp():
+    >>>     img = fluid.layers.data(name='image', shape=[784])
+    >>>     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    >>>     prediction = mlp(img)
+    >>>     return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label))
+    >>>
+    >>> def optimizer():
+    >>>     return fluid.optimizer.Adam()
+    >>>
+    >>> trainer = Trainer(train_func=train_mnist_mlp,
+    >>>                   optimizer_func=optimizer,
+    >>>                   place=fluid.CUDAPlace(0),
+    >>>                   parallel=True)
+    >>>
+    >>> def train_callback(event):
+    >>>     if isinstance(event, fluid.EndStepEvent):
+    >>>         print "Epoch ID", event.epoch, "Step ID",\
+    >>>             event.step, "AvgLoss", event.metrics[0]
+    >>>     elif isinstance(event, fluid.EndEpochEvent):
+    >>>         trainer.save_params("./model_{0}".format(event.epoch))
+    >>>
+    >>> trainer.train(num_epochs=100, event_handler=train_callback)
+
+    For more example, please see :ref:`api_guide_high_level_api`.
+
+
+    Args:
+        train_func(callable): A function which will return loss. The loss must be
+            a scalar tensor.
+        optimizer_func(callable): A function that returns an Optimizer object.
+        place(CUDAPlace|CPUPlace): The device place of this trainer. If
+            :code:`parallel=True,` all CUDA Places will be used if :code:`place`
+            is a :code:`CUDAPlace`.
+        parallel(bool): True if use multiple devices.
+        checkpoint_config(CheckpointConfig): Configuration about how to save
+            checkpoints.
+    """
+
+    def __init__(self,
+                 train_func,
+                 optimizer_func,
+                 param_path=None,
+                 place=None,
+                 parallel=False,
+                 checkpoint_config=None):
+        self.__stop = False
+        self.parallel = parallel
+
+        # config for checkpoint
+        # only chief worker will save variables
+        self.trainer_id = 0
+        self.checkpoint_cfg = checkpoint_config
+        if self.checkpoint_cfg:
+            assert isinstance(self.checkpoint_cfg, CheckpointConfig)
+            serial = io.get_latest_checkpoint_serial(
+                self.checkpoint_cfg.checkpoint_dir)
+            self.checkpoint_cfg.load_serial = serial if serial >= 0 else None
+
+        self.scope = core.Scope()
+
+        # 1. we need to generate a framework.Program by calling
+        # program_func. Reference: fluid.program_guard in
+        # test_word2vec.py
+
+        self.startup_program = framework.Program()
+        self.train_program = framework.Program()
+
+        with framework.program_guard(self.train_program, self.startup_program):
+            program_func_outs = train_func()
+            self.train_func_outputs = program_func_outs if isinstance(
+                program_func_outs, list) else [program_func_outs]
+            self.test_program = self.train_program.clone(for_test=True)
+
+            # The first element of program_func_outs is loss.
+            loss = self.train_func_outputs[0]
+
+            optimizer = optimizer_func()
+            if not isinstance(optimizer, opt_module.Optimizer):
+                raise TypeError(
+                    "The optimizer should be an instance of Optimizer")
+            optimize_ops, params_grads = optimizer.minimize(loss)
+
+        self.place = check_and_get_place(place)
+
+        self._dist_transpile_if_necessary(optimize_ops, params_grads)
+
+        # 2. move the default_main_program to self.program and run the
+        # default_startup program on an empty core.Scope()
+        # Run startup program
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(place)
+            exe.run(self.startup_program)
+
+        if self.checkpoint_cfg and self.checkpoint_cfg.load_serial:
+            with self._prog_and_scope_guard():
+                exe = executor.Executor(place)
+                io.load_checkpoint(exe, self.checkpoint_cfg.checkpoint_dir,
+                                   self.checkpoint_cfg.load_serial,
+                                   self.startup_program)
+
+                if not self.checkpoint_cfg.pserver_id:
+                    epoch_id, step_id = io.load_trainer_args(
+                        self.checkpoint_cfg.checkpoint_dir,
+                        self.checkpoint_cfg.load_serial, self.trainer_id,
+                        self._get_checkpoint_load_args())
+                    self.checkpoint_cfg.epoch_id = int(epoch_id)
+                    self.checkpoint_cfg.step_id = int(step_id)
+                else:
+                    if self.checkpoint_cfg.lookup_table_name:
+                        io.load_lookup_table_vars(
+                            exe, self.checkpoint_cfg.checkpoint_dir,
+                            self.startup_program,
+                            self.checkpoint_cfg.pserver_id,
+                            self.checkpoint_cfg.lookup_table_name)
+
+        if param_path and os.path.isdir(param_path):
+            # load params from param_path into scope
+            io.load_persist_vars_without_grad(
+                exe, dirname=param_path, program=self.startup_program)
+
+    def _transpile_nccl2_dist(self):
+        # PADDLE_TRAINER_IPS
+        if "PADDLE_TRAINER_IPS" not in os.environ:
+            self.nccl_id_var = None
+        else:
+            self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
+            port = os.getenv("PADDLE_PSERVER_PORT")
+            worker_ips = os.getenv("PADDLE_TRAINER_IPS")
+            worker_endpoints = []
+            for ip in worker_ips.split(","):
+                worker_endpoints.append(':'.join([ip, port]))
+            self.num_trainers = len(worker_endpoints)
+            current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port
+            worker_endpoints.remove(current_endpoint)
+            # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id
+            # in ParallelExecutor to start
+            # distributed training using NCCL2
+            self.nccl_id_var = self.startup_program.global_block().create_var(
+                name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
+            self.startup_program.global_block().append_op(
+                type="gen_nccl_id",
+                inputs={},
+                outputs={"NCCLID": self.nccl_id_var},
+                attrs={
+                    "endpoint": current_endpoint,
+                    "endpoint_list": worker_endpoints,
+                    "trainer_id": self.trainer_id
+                })
+
+    def _dist_transpile_if_necessary(self, optimize_ops, params_grads):
+        self._transpile_nccl2_dist()
+        if self.nccl_id_var != None:
+            return
+
+        if "PADDLE_TRAINING_ROLE" not in os.environ:
+            return
+
+        # the port of all pservers, needed by both trainer and pserver
+        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+        # comma separated ips of all pservers, needed by trainer and
+        # pserver
+        pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+        eplist = []
+        for ip in pserver_ips.split(","):
+            eplist.append(':'.join([ip, port]))
+        pserver_endpoints = ",".join(eplist)
+        # total number of workers/trainers in the job, needed by
+        # trainer and pserver
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
+        # the IP of the local machine, needed by pserver only
+        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+        # the unique trainer id, starting from 0, needed by trainer
+        # only
+        self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+
+        # the role, should be either PSERVER or TRAINER
+        training_role = os.getenv("PADDLE_TRAINING_ROLE")
+        with self._prog_and_scope_guard():
+            t = distribute_transpiler.DistributeTranspiler()
+            t.transpile(
+                self.trainer_id, pservers=pserver_endpoints, trainers=trainers)
+            if training_role == "PSERVER":
+                if self.checkpoint_cfg:
+                    pserver_id = eplist.index(current_endpoint)
+                    self.checkpoint_cfg.pserver_id = pserver_id
+                    if t.has_distributed_lookup_table:
+                        self.checkpoint_cfg.lookup_table_name = t.table_name
+
+                self.train_program = t.get_pserver_program(current_endpoint)
+                self.startup_program = t.get_startup_program(current_endpoint,
+                                                             self.train_program)
+            elif training_role == "TRAINER":
+                self.train_program = t.get_trainer_program()
+            else:
+                raise ValueError(
+                    'TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+                )
+
+    def stop(self):
+        """
+        stop training
+        """
+        self.__stop = True
+
+    def train(self, num_epochs, event_handler, reader=None, feed_order=None):
+        """
+        Start the train loop to train the model.
+
+        Args:
+            num_epochs(int): The number of epoch. An epoch will process all data in reader
+            event_handler(callable): The event handler. A function with type (ev:Event)->void
+            reader(callable): A reader creator object. See also
+                :ref:`api_guide_python_reader` .
+            feed_order(list): Feeding order of reader. None will following the defining
+                order in program
+
+        Returns:
+            None
+        """
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
+        if training_role == "PSERVER":
+            with self._prog_and_scope_guard():
+                exe = executor.Executor(self.place)
+                exe.run()
+                return
+        if self.parallel:
+            self._train_by_parallel_executor(num_epochs, event_handler, reader,
+                                             feed_order)
+        else:
+            self._train_by_executor(num_epochs, event_handler, reader,
+                                    feed_order)
+
+    def test(self, reader, feed_order):
+        """
+        Test the model on given test data
+
+        Args:
+            reader(callable): The reader that yields test data.
+            feed_order(list): Feeding order of reader. None will following the
+                defining order in program
+        """
+
+        return self._test_by_executor(reader, feed_order,
+                                      self.train_func_outputs)
+
+    def save_params(self, param_path):
+        """
+        Save all parameters into :code:`param_path`.
+
+        Args:
+            param_path(str): The path to save parameters.
+
+        Returns:
+            None
+        """
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            io.save_persistables(exe, dirname=param_path)
+
+    @contextlib.contextmanager
+    def _prog_and_scope_guard(self):
+        with framework.program_guard(
+                main_program=self.train_program,
+                startup_program=self.startup_program):
+            with executor.scope_guard(self.scope):
+                yield
+
+    def _train_by_executor(self, num_epochs, event_handler, reader, feed_order):
+        """
+        Train by Executor and single device.
+
+        Args:
+            num_epochs:
+            event_handler:
+            reader:
+            feed_order:
+
+        Returns:
+
+        """
+        with self._prog_and_scope_guard():
+            feed_var_list = build_feed_var_list(self.train_program, feed_order)
+            feeder = data_feeder.DataFeeder(
+                feed_list=feed_var_list, place=self.place)
+            exe = executor.Executor(self.place)
+            reader = feeder.decorate_reader(reader, multi_devices=False)
+            self._train_by_any_executor(event_handler, exe, num_epochs, reader)
+
+    def _train_by_any_executor(self, event_handler, exe, num_epochs, reader):
+        if self.checkpoint_cfg:
+            epochs = [
+                epoch_id for epoch_id in range(num_epochs)
+                if epoch_id >= self.checkpoint_cfg.epoch_id
+            ]
+        else:
+            epochs = [epoch_id for epoch_id in range(num_epochs)]
+
+        for epoch_id in epochs:
+            event_handler(BeginEpochEvent(epoch_id))
+            for step_id, data in enumerate(reader()):
+                if self.__stop:
+                    if self.checkpoint_cfg:
+                        self._clean_checkpoint()
+                    return
+
+                if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \
+                    and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id:
+                    continue
+
+                begin_event = BeginStepEvent(epoch_id, step_id)
+                event_handler(begin_event)
+                if begin_event.fetch_metrics:
+                    metrics = exe.run(feed=data,
+                                      fetch_list=[
+                                          var.name
+                                          for var in self.train_func_outputs
+                                      ])
+                else:
+                    metrics = exe.run(feed=data, fetch_list=[])
+
+                if self.checkpoint_cfg:
+                    self._save_checkpoint(epoch_id, step_id)
+                event_handler(EndStepEvent(epoch_id, step_id, metrics))
+            event_handler(EndEpochEvent(epoch_id))
+        if self.checkpoint_cfg:
+            self._clean_checkpoint()
+
+    def _test_by_executor(self, reader, feed_order, fetch_list):
+        with executor.scope_guard(self.scope):
+            feed_var_list = build_feed_var_list(self.test_program, feed_order)
+            feeder = data_feeder.DataFeeder(
+                feed_list=feed_var_list, place=self.place)
+            exe = executor.Executor(self.place)
+            accumulated = len(fetch_list) * [0]
+            count = 0
+            for data in reader():
+                outs = exe.run(program=self.test_program,
+                               feed=feeder.feed(data),
+                               fetch_list=fetch_list)
+                accumulated = [x[0] + x[1][0] for x in zip(accumulated, outs)]
+                count += 1
+
+            return [x / count for x in accumulated]
+
+    def _train_by_parallel_executor(self, num_epochs, event_handler, reader,
+                                    feed_order):
+        with self._prog_and_scope_guard():
+            pe = self._get_or_create_parallel_executor()
+            feed_var_list = build_feed_var_list(self.train_program, feed_order)
+            feeder = data_feeder.DataFeeder(
+                feed_list=feed_var_list, place=self.place)
+            reader = feeder.decorate_reader(reader, multi_devices=True)
+            self._train_by_any_executor(event_handler, pe, num_epochs, reader)
+
+    def _get_parallel_executor(self):
+        return getattr(self, 'parallel_executor', None)
+
+    def _get_or_create_parallel_executor(self):
+        if self._get_parallel_executor() is None:
+            self.parallel_executor = parallel_executor.ParallelExecutor(
+                use_cuda=isinstance(self.place, core.CUDAPlace),
+                loss_name=self.train_func_outputs[0].name)
+        return self._get_parallel_executor()
+
+    def _clean_checkpoint(self):
+        assert self.checkpoint_cfg
+        io.clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir)
+
+    def _get_checkpoint_load_args(self):
+        """
+        epoch_id and step_id are runtime arguments, they are not variables, will load them independently.
+        """
+        return ["epoch_id", "step_id"]
+
+    def _get_checkpoint_save_args(self, epoch_id, step_id):
+        """
+        epoch_id and step_id are runtime arguments, they are not variables, will save them independently.
+        """
+        trainer_args = {}
+        trainer_args["epoch_id"] = epoch_id
+        trainer_args["step_id"] = step_id
+        return trainer_args
+
+    def _save_checkpoint(self, epoch_id, step_id):
+        assert self.checkpoint_cfg
+
+        if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \
+            and step_id % self.checkpoint_cfg.step_interval == 0:
+            exe = executor.Executor(self.place)
+            io.save_checkpoint(
+                executor=exe,
+                checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                trainer_id=self.trainer_id,
+                trainer_args=self._get_checkpoint_save_args(epoch_id, step_id),
+                main_program=self.train_program,
+                max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints)
+
+
+def build_feed_var_list(program, feed_order):
+    if not isinstance(program, framework.Program):
+        raise TypeError("The 'program' should be an object of Program")
+
+    if isinstance(feed_order, list):
+        feed_var_list = [
+            program.global_block().var(var_name) for var_name in feed_order
+        ]
+    else:
+        if not isinstance(feed_order, dict):
+            raise TypeError(
+                "The 'feed_order' should be either None, list or dict.")
+        if not sorted(feed_order.values()) == range(len(feed_order)):
+            raise ValueError(
+                "The values of 'feed_order' should be a permutation of [0, len(feed_order))"
+            )
+        sorted_pair_list = sorted(feed_order.items(), key=lambda item: item[1])
+        feed_var_list = [
+            program.global_block().var(pair[0]) for pair in sorted_pair_list
+        ]
+    return feed_var_list
diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf18090f71f34be5105498f5846dbcdf15ab2e3f
--- /dev/null
+++ b/python/paddle/fluid/transpiler/__init__.py
@@ -0,0 +1,23 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from distribute_transpiler import DistributeTranspiler
+from inference_transpiler import InferenceTranspiler
+from memory_optimization_transpiler import memory_optimize, release_memory
+from ps_dispatcher import HashName, RoundRobin
+
+__all__ = [
+    "DistributeTranspiler", "InferenceTranspiler", "memory_optimize",
+    "release_memory", "HashName", "RoundRobin"
+]
diff --git a/python/paddle/fluid/transpiler/details/__init__.py b/python/paddle/fluid/transpiler/details/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc597c33849dc06cc975b245099672f64c3539d3
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from program_utils import *
+from ufind import *
diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..f10b496306a002ee131d01798a0698b807d379ca
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def delete_ops(block, ops):
+    try:
+        start = list(block.ops).index(ops[0])
+        end = list(block.ops).index(ops[-1])
+        [block.remove_op(start) for _ in xrange(end - start + 1)]
+    except Exception, e:
+        raise e
+    block.program.sync_with_cpp()
+
+
+def find_op_by_input_arg(block, arg_name):
+    for index, op in enumerate(block.ops):
+        if arg_name in op.input_arg_names:
+            return index
+    return -1
+
+
+def find_op_by_output_arg(block, arg_name):
+    for index, op in enumerate(block.ops):
+        if arg_name in op.output_arg_names:
+            return index
+    return -1
diff --git a/python/paddle/fluid/transpiler/details/ufind.py b/python/paddle/fluid/transpiler/details/ufind.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e30d0e3f9c5712c494daf17b2b4bcec86f69c23
--- /dev/null
+++ b/python/paddle/fluid/transpiler/details/ufind.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class UnionFind(object):
+    """ Union-find data structure.
+
+    Union-find is a data structure that keeps track of a set of elements partitioned
+    into a number of disjoint (non-overlapping) subsets.
+
+    Reference:
+    https://en.wikipedia.org/wiki/Disjoint-set_data_structure
+
+    Args:
+      elements(list): The initialize element list.
+    """
+
+    def __init__(self, elementes=None):
+        self._parents = []  # index -> parent index
+        self._index = {}  # element -> index
+        self._curr_idx = 0
+        if not elementes:
+            elementes = []
+        for ele in elementes:
+            self._parents.append(self._curr_idx)
+            self._index.update({ele: self._curr_idx})
+            self._curr_idx += 1
+
+    def find(self, x):
+        # Find the root index of given element x,
+        # execute the path compress while findind the root index
+        if not x in self._index:
+            return -1
+        idx = self._index[x]
+        while idx != self._parents[idx]:
+            t = self._parents[idx]
+            self._parents[idx] = self._parents[t]
+            idx = t
+        return idx
+
+    def union(self, x, y):
+        # Union two given element
+        x_root = self.find(x)
+        y_root = self.find(y)
+
+        if x_root == y_root:
+            return
+        self._parents[x_root] = y_root
+
+    def is_connected(self, x, y):
+        # If two given elements have the same root index,
+        # then they are connected.
+        return self.find(x) == self.find(y)
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..05fed72ee6471ba42007b5a9f09f89148ac27a30
--- /dev/null
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -0,0 +1,1455 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Steps to transpile trainer:
+1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
+2. rename splited grad variables to add trainer_id suffix ".trainer_%d".
+3. modify trainer program add split_op to each grad variable.
+4. append send_op to send splited variables to server and
+5. add recv_op to fetch params(splited blocks or origin param) from server.
+6. append concat_op to merge splited blocks to update local weights.
+
+Steps to transpile pserver:
+1. create new program for parameter server.
+2. create params and grad variables that assigned to current server instance.
+3. create a sub-block in the server side program
+4. append ops that should run on current server instance.
+5. add listen_and_serv op
+"""
+
+from __future__ import print_function
+
+import math
+import numpy as np
+
+from ps_dispatcher import RoundRobin, HashName, PSDispatcher
+from .. import core, framework
+from ..framework import Program, default_main_program, \
+                        default_startup_program, Block, \
+                        Variable, Parameter, grad_var_name
+from details import *
+
+LOOKUP_TABLE_TYPE = "lookup_table"
+LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
+OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
+)
+RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
+
+
+class VarBlock:
+    def __init__(self, varname, offset, size):
+        self.varname = varname
+        # NOTE: real offset is offset * size
+        self.offset = offset
+        self.size = size
+
+    def __str__(self):
+        return "%s:%d:%d" % (self.varname, self.offset, self.size)
+
+
+def same_or_split_var(p_name, var_name):
+    return p_name == var_name or p_name.startswith(var_name + ".block")
+
+
+def slice_variable(var_list, slice_count, min_block_size=8192):
+    """
+    We may need to split dense tensor to one or more blocks and put
+    them equally onto parameter server. One block is a sub-tensor
+    aligned by dim[0] of the tensor.
+
+    We need to have a minimal block size so that the calculations in
+    the parameter server side can gain better performance. By default
+    minimum block size 8K elements (maybe 16bit or 32bit or 64bit).
+
+    Args:
+        var_list (list): List of variables.
+        slice_count (int): Numel of count that variables will be sliced, which
+            could be the pserver services' count.
+        min_block_size (int): Minimum splitted block size.
+    Returns:
+        blocks (list[(varname, block_id, current_block_size)]): A list
+            of VarBlocks. Each VarBlock specifies a shard of the var.
+    """
+    blocks = []
+    for var in var_list:
+        split_count = slice_count
+        var_numel = reduce(lambda x, y: x * y, var.shape)
+        max_pserver_count = int(math.floor(var_numel / float(min_block_size)))
+        if max_pserver_count == 0:
+            max_pserver_count = 1
+        if max_pserver_count < slice_count:
+            split_count = max_pserver_count
+        block_size = int(math.ceil(var_numel / float(split_count)))
+
+        if len(var.shape) >= 2:
+            # align by dim1(width)
+            dim1 = reduce(lambda x, y: x * y, var.shape[1:])
+            remains = block_size % dim1
+            if remains != 0:
+                block_size += dim1 - remains
+        # update split_count after aligning
+        split_count = int(math.ceil(var_numel / float(block_size)))
+        for block_id in xrange(split_count):
+            curr_block_size = min(block_size, var_numel - (
+                (block_id) * block_size))
+            block = VarBlock(var.name, block_id, curr_block_size)
+            blocks.append(str(block))
+    return blocks
+
+
+class DistributeTranspiler(object):
+    """
+    **DistributeTranspiler**
+
+    Convert the fluid program to distributed data-parallelism programs.
+
+    The main_program will be transformed to use a remote parameter server
+    to do parameter optimization. And the optimization graph will be put
+    into a parameter server program.
+
+    Examples:
+        .. code-block:: python
+
+           # Define your model before these codes.
+           port = os.getenv("PADDLE_PSERVER_PORT", "6174")
+           pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+           eplist = []
+           for ip in pserver_ips.split(","):
+                eplist.append(':'.join([ip, port]))
+           pserver_endpoints = ",".join(eplist)
+           trainers = int(os.getenv("PADDLE_TRAINERS"))
+           current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+           trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+           role = os.getenv("PADDLE_TRAINING_ROLE")
+
+           t = distribute_transpiler.DistributeTranspiler()
+           t.transpile(
+                trainer_id, pservers=pserver_endpoints, trainers=trainers)
+           if role == "PSERVER":
+                pserver_program = t.get_pserver_program(current_endpoint)
+                pserver_startup_program = t.get_startup_program(current_endpoint,
+                                                                pserver_program)
+           elif role == "TRAINER":
+                trainer_program = t.get_trainer_program()
+    """
+
+    def transpile(self,
+                  trainer_id,
+                  program=None,
+                  pservers="127.0.0.1:6174",
+                  trainers=1,
+                  slice_var_up=True,
+                  split_method=RoundRobin,
+                  sync_mode=True):
+        """
+        Run the transpiler.
+
+        Args:
+            trainer_id (int): id for current trainer worker, if you have
+                n workers, the id may range from 0 ~ n-1
+            program (Program|None): program to transpile,
+                default is fluid.default_main_program().
+            pservers (str): comma separated ip:port string for the pserver
+                list.
+            trainers (int): number of trainers in the distributed job.
+            slice_var_up (bool): Do Tensor slice for pservers, default is True.
+            split_method (PSDispatcher): RoundRobin or HashName can be used
+                try to choose the best method to balance loads for pservers.
+            sync_mode (bool): Do sync training or not, default is True.
+        """
+        assert (split_method.__bases__[0] == PSDispatcher)
+        if program is None:
+            program = default_main_program()
+        self.origin_program = program
+        self.trainer_num = trainers
+        self.sync_mode = sync_mode
+        self.trainer_id = trainer_id
+        pserver_endpoints = pservers.split(",")
+        self.pserver_endpoints = pserver_endpoints
+        self.optimize_ops, self.params_grads = self._get_optimize_pass()
+
+        ps_dispatcher = split_method(self.pserver_endpoints)
+        self.has_distributed_lookup_table = self._has_distributed_lookup_table()
+
+        # split and create vars, then put splited vars in dicts for later use.
+        self._init_splited_vars(slice_var_up)
+
+        # step 3.1: insert send op to send gradient vars to parameter servers
+        ps_dispatcher.reset()
+        send_vars = []
+
+        # in general cases, the number of pservers is times of 2, and this
+        # will lead to uneven distribution among weights and bias:
+        #       fc_w@GRAD_trainer_0, fc_w@GRAD_trainer_1 --> pserver1
+        #       fc_b@GRAD_trainer_0, fc_b@GRAD_trainer_1 --> pserver2
+        # shuffle the map will avoid the uneven distribution above
+        grad_var_mapping_items = self.grad_var_mapping.items()
+        if not slice_var_up:
+            np.random.shuffle(grad_var_mapping_items)
+
+        for orig_varname, splited_vars in grad_var_mapping_items:
+            eplist = ps_dispatcher.dispatch(splited_vars)
+
+            if not slice_var_up:
+                assert (len(splited_vars) == 1)
+
+            if len(splited_vars) == 1:
+                orig_varname = splited_vars[0].name
+                index = find_op_by_output_arg(program.global_block(),
+                                              orig_varname)
+            elif len(splited_vars) > 1:
+                orig_var = program.global_block().vars[orig_varname]
+                index = find_op_by_output_arg(program.global_block(),
+                                              orig_varname)
+                self._insert_split_op(program, orig_var, index, splited_vars)
+                index += 1
+            else:
+                AssertionError("Can not insert the send op by original "
+                               "variable name :", orig_varname)
+
+            program.global_block().insert_op(
+                index=index + 1,
+                type="send",
+                inputs={"X": splited_vars},
+                outputs={},
+                attrs={
+                    "epmap": eplist,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+            for _, var in enumerate(splited_vars):
+                send_vars.append(var)
+
+        if self.sync_mode:
+            program.global_block().append_op(
+                type="send_barrier",
+                inputs={},
+                outputs={},
+                attrs={
+                    "endpoints": pserver_endpoints,
+                    "sync_mode": self.sync_mode,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
+        # step 3.2: insert recv op to receive parameters from parameter server
+        recv_vars = []
+        for _, var in enumerate(send_vars):
+            recv_vars.append(self.grad_param_mapping[var])
+        ps_dispatcher.reset()
+        eplist = ps_dispatcher.dispatch(recv_vars)
+
+        for i, ep in enumerate(eplist):
+            self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i])
+            self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i])
+
+        # step4: Concat the parameters splits together after recv.
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            eps = []
+            for var in splited_var:
+                index = [v.name for v in recv_vars].index(var.name)
+                eps.append(eplist[index])
+
+            program.global_block().append_op(
+                type="recv",
+                inputs={},
+                outputs={"Out": splited_var},
+                attrs={
+                    "epmap": eps,
+                    RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                })
+
+        program.global_block().append_op(
+            type="fetch_barrier",
+            inputs={},
+            outputs={},
+            attrs={
+                "endpoints": pserver_endpoints,
+                RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+            })
+
+        for varname, splited_var in self.param_var_mapping.iteritems():
+            if len(splited_var) <= 1:
+                continue
+            orig_param = program.global_block().vars[varname]
+            program.global_block().append_op(
+                type="concat",
+                inputs={"X": splited_var},
+                outputs={"Out": [orig_param]},
+                attrs={"axis": 0})
+
+        if self.has_distributed_lookup_table:
+            self._replace_lookup_table_op_with_prefetch(program,
+                                                        pserver_endpoints)
+            self._split_table_grad_and_add_send_vars(program, pserver_endpoints)
+
+    def get_trainer_program(self):
+        """
+        Get transpiled trainer side program.
+
+        Returns:
+            Program: trainer side program.
+        """
+        # remove optimize ops and add a send op to main_program
+        # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay?
+        delete_ops(self.origin_program.global_block(), self.optimize_ops)
+        self.origin_program.__str__()
+        return self.origin_program
+
+    def get_pserver_program(self, endpoint):
+        """
+        Get parameter server side program.
+        
+        Args:
+            endpoint (str): current parameter server endpoint.
+        
+        Returns:
+            Program: the program for current parameter server to run.
+        """
+        # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
+        # NOTE: assume blocks of the same variable is not distributed
+        # on the same pserver, only change param/grad varnames for
+        # trainers to fetch.
+
+        # step1
+        pserver_program = Program()
+        # step2: Create vars to receive vars at parameter servers.
+        recv_inputs = []
+        for v in self.param_grad_ep_mapping[endpoint]["params"]:
+            self._clone_var(pserver_program.global_block(), v)
+        for v in self.param_grad_ep_mapping[endpoint]["grads"]:
+            # create vars for each trainer in global scope, so
+            # we don't need to create them when grad arrives.
+            # change client side var name to origin name by
+            # removing ".trainer_%d" suffix
+            suff_idx = v.name.find(".trainer_")
+            if suff_idx >= 0:
+                orig_var_name = v.name[:suff_idx]
+            else:
+                orig_var_name = v.name
+            # NOTE: single_trainer_var must be created for multi-trainer
+            # case to merge grads from multiple trainers
+            single_trainer_var = \
+                pserver_program.global_block().create_var(
+                    name=orig_var_name,
+                    persistable=True,
+                    type=v.type,
+                    dtype=v.dtype,
+                    shape=v.shape)
+            if self.sync_mode and self.trainer_num > 1:
+                for trainer_id in xrange(self.trainer_num):
+                    var = pserver_program.global_block().create_var(
+                        name="%s.trainer_%d" % (orig_var_name, trainer_id),
+                        persistable=False,
+                        type=v.type,
+                        dtype=v.dtype,
+                        shape=v.shape)
+                    recv_inputs.append(var)
+            else:
+                recv_inputs.append(single_trainer_var)
+
+        # step 3
+        # Create a union-find data structure from optimize ops,
+        # If two ops are connected, we could add these two ops
+        # into one set.
+        ufind = self._create_ufind(self.optimize_ops)
+        # step 3.2
+        # Iterate through the ops and append optimize op which
+        # located on current pserver
+        opt_op_on_pserver = []
+        for _, op in enumerate(self.optimize_ops):
+            if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
+                    endpoint, op):
+                opt_op_on_pserver.append(op)
+        # step 3.3
+        # Iterate through the ops, and if an op and the optimize ops
+        # which located on current pserver are in one set, then
+        # append it into the sub program.
+
+        global_ops = []
+        # HACK: optimization global ops only used to scale beta1 and beta2
+        # replace it with dependency engine.
+        for op in self.optimize_ops:
+            if self._is_adam_connected_op(op):
+                global_ops.append(op)
+
+        def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
+                                   lr_ops):
+            if self._is_optimizer_op(op):
+                self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
+                                         self.origin_program, merged_var)
+            elif op not in lr_ops:
+                self._append_pserver_non_opt_ops(block, op)
+
+        def __op_have_grad_input__(op):
+            for varname in op.input_arg_names:
+                if varname.find("@GRAD") >= 0:
+                    return varname
+            return ""
+
+        def __clone_lr_op_sub_block__(op, program, lr_block):
+            if not op.has_attr('sub_block'):
+                return
+
+            origin_block_desc = op.attr('sub_block')
+            origin_block = self.origin_program.block(origin_block_desc.id)
+            assert isinstance(origin_block, Block)
+            # we put the new sub block to new block to follow the block
+            # hierarchy of the original blocks
+            new_sub_block = program.create_block(lr_block.idx)
+
+            # clone vars
+            for var in origin_block.vars:
+                new_sub_block.clone_variable(var)
+
+            # clone ops
+            for origin_op in origin_block.ops:
+                cloned_op = self._clone_lr_op(program, new_sub_block, origin_op)
+                # clone sub_block of op
+                __clone_lr_op_sub_block__(cloned_op, program, new_sub_block)
+
+            # reset the block of op
+            op.set_attr('sub_block', new_sub_block)
+
+        # append lr decay ops to the child block if exists
+        lr_ops = self._get_lr_ops()
+        # record optimize blocks and we can run them on pserver parallel
+        optimize_blocks = []
+        if len(lr_ops) > 0:
+            lr_decay_block = pserver_program.create_block(
+                pserver_program.num_blocks - 1)
+            optimize_blocks.append(lr_decay_block)
+            for _, op in enumerate(lr_ops):
+                cloned_op = self._append_pserver_non_opt_ops(lr_decay_block, op)
+                # append sub blocks to pserver_program in lr_decay_op
+                __clone_lr_op_sub_block__(cloned_op, pserver_program,
+                                          lr_decay_block)
+
+        # append op to the current block
+        grad_to_block_id = []
+        pre_block_idx = pserver_program.num_blocks - 1
+        for idx, opt_op in enumerate(opt_op_on_pserver):
+            per_opt_block = pserver_program.create_block(pre_block_idx)
+            optimize_blocks.append(per_opt_block)
+            # append grad merging ops before clip and weight decay
+            for _, op in enumerate(self.optimize_ops):
+                # find the origin @GRAD var before clipping
+                grad_varname_for_block = __op_have_grad_input__(op)
+                if ufind.is_connected(op, opt_op) and grad_varname_for_block:
+                    merged_var = self._append_pserver_grad_merge_ops(
+                        per_opt_block, grad_varname_for_block, endpoint,
+                        grad_to_block_id, self.origin_program)
+            for _, op in enumerate(self.optimize_ops):
+                # optimizer is connected to itself
+                if ufind.is_connected(op, opt_op) and op not in global_ops:
+                    __append_optimize_op__(op, per_opt_block, grad_to_block_id,
+                                           merged_var, lr_ops)
+
+        # dedup grad to ids list
+        grad_to_block_id = list(set(grad_to_block_id))
+        # append global ops
+        if global_ops:
+            opt_state_block = pserver_program.create_block(
+                pserver_program.num_blocks - 1)
+            optimize_blocks.append(opt_state_block)
+            for glb_op in global_ops:
+                __append_optimize_op__(glb_op, opt_state_block,
+                                       grad_to_block_id, None, lr_ops)
+
+        # process distributed lookup_table
+        prefetch_var_name_to_block_id = []
+        if self.has_distributed_lookup_table:
+            pserver_index = self.pserver_endpoints.index(endpoint)
+            table_opt_block = self._create_table_optimize_block(
+                pserver_index, pserver_program, pre_block_idx, grad_to_block_id)
+            prefetch_var_name_to_block_id = self._create_prefetch_block(
+                pserver_index, pserver_program, table_opt_block)
+            checkpoint_block_id = self._create_checkpoint_save_block(
+                pserver_program, table_opt_block.idx)
+
+        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
+        # not be executed, so it's safe to use optimize_block to hold the place
+        if self.has_distributed_lookup_table:
+            assert len(prefetch_var_name_to_block_id) > 0
+        else:
+            assert len(prefetch_var_name_to_block_id) == 0
+
+        attrs = {
+            "optimize_blocks": optimize_blocks,
+            "endpoint": endpoint,
+            "Fanin": self.trainer_num,
+            "sync_mode": self.sync_mode,
+            "grad_to_block_id": grad_to_block_id,
+        }
+        if len(prefetch_var_name_to_block_id) > 0:
+            attrs['prefetch_var_name_to_block_id'] \
+                = prefetch_var_name_to_block_id
+            attrs['checkpint_block_id'] = checkpoint_block_id
+
+        # step5 append the listen_and_serv op
+        pserver_program.global_block().append_op(
+            type="listen_and_serv",
+            inputs={'X': recv_inputs},
+            outputs={},
+            attrs=attrs)
+
+        pserver_program.sync_with_cpp()
+        return pserver_program
+
+    def get_startup_program(self, endpoint, pserver_program):
+        """
+        Get startup program for current parameter server.
+        Modify operator input variables if there are variables that
+        were split to several blocks.
+
+        Args:
+            endpoint (str): current pserver endpoint.
+            pserver_program (Program): call get_pserver_program first and
+                pass the result here.
+        
+        Returns:
+            Program: parameter server side startup program.
+        """
+        s_prog = Program()
+        orig_s_prog = default_startup_program()
+        params = self.param_grad_ep_mapping[endpoint]["params"]
+
+        def _get_splited_name_and_shape(varname):
+            for idx, splited_param in enumerate(params):
+                pname = splited_param.name
+                if same_or_split_var(pname, varname) and varname != pname:
+                    return pname, splited_param.shape
+            return "", []
+
+        # 1. create vars in pserver program to startup program
+        pserver_vars = pserver_program.global_block().vars
+        created_var_map = dict()
+        for _, var in pserver_vars.iteritems():
+            tmpvar = s_prog.global_block().clone_variable(var)
+            created_var_map[var.name] = tmpvar
+
+        # 2. rename op outputs
+        for op in orig_s_prog.global_block().ops:
+            new_outputs = dict()
+            # do not append startup op if var is not on this pserver
+            op_on_pserver = False
+            for key in op.output_names:
+                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
+                if newname:
+                    op_on_pserver = True
+                    new_outputs[key] = created_var_map[newname]
+                elif op.output(key)[0] in pserver_vars:
+                    op_on_pserver = True
+                    new_outputs[key] = pserver_vars[op.output(key)[0]]
+
+            # most startup program ops have no inputs
+            new_inputs = self._get_input_map_from_op(pserver_vars, op)
+
+            if op_on_pserver:
+                if op.type in [
+                        "gaussian_random", "fill_constant", "uniform_random"
+                ]:
+                    op.attrs["shape"] = new_outputs["Out"].shape
+                s_prog.global_block().append_op(
+                    type=op.type,
+                    inputs=new_inputs,
+                    outputs=new_outputs,
+                    attrs=op.attrs)
+        return s_prog
+
+    # ====================== private transpiler functions =====================
+
+    def _has_distributed_lookup_table(self):
+        # process lookup_table_op
+        # 1. check all lookup_table_op is distributed
+        # 2. check all lookup_table_op share the same table.
+        distributed_lookup_table_ops = []
+        # support only one distributed_lookup_table now
+        self.table_name = None
+        for op in self.origin_program.global_block().ops:
+            if op.type == LOOKUP_TABLE_TYPE:
+                if op.attrs['is_distributed'] is True:
+                    if self.table_name is None:
+                        self.table_name = op.input("W")[0]
+                    if self.table_name != op.input("W")[0]:
+                        raise RuntimeError("all distributed lookup_table_ops"
+                                           " should have only one table")
+                    distributed_lookup_table_ops.append(op)
+                else:
+                    if self.table_name is not None:
+                        assert op.input("W")[0] != self.table_name
+
+        return len(distributed_lookup_table_ops) > 0
+
+    def _update_dist_lookup_table_vars(self, param_list, grad_list,
+                                       params_grads):
+        # TODO(wuyi): put find a way to put dist lookup table stuff all together.
+        # update self.table_param_grad and self.trainer_side_table_grad_list
+        program = self.origin_program
+        if self.has_distributed_lookup_table:
+            param_list = [
+                param for param in param_list if param.name != self.table_name
+            ]
+            grad_list = [
+                grad for grad in grad_list
+                if grad.name != grad_var_name(self.table_name)
+            ]
+            self.table_param_grad = [
+                param_grad for param_grad in params_grads
+                if param_grad[0].name == self.table_name
+            ][0]
+            table_grad_var = self.table_param_grad[1]
+            if self.sync_mode:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.trainer_%d.pserver_%d" %
+                        (table_grad_var.name, self.trainer_id, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+            else:
+                self.trainer_side_table_grad_list = [
+                    program.global_block().create_var(
+                        name="%s.pserver_%d" % (table_grad_var.name, index),
+                        type=table_grad_var.type,
+                        shape=table_grad_var.shape,
+                        dtype=table_grad_var.dtype)
+                    for index in range(len(self.pserver_endpoints))
+                ]
+        return param_list, grad_list
+
+    def _init_splited_vars(self, slice_var_up):
+        # update these mappings for further transpile:
+        # 1. param_var_mapping: param var name -> [splited params vars]
+        # 2. grad_var_mapping: grad var name -> [splited grads vars]
+        # 3. grad_param_mapping: grad.blockx -> param.blockx
+        # 4. param_grad_ep_mapping: ep -> {"params": [], "grads": []}
+
+        param_list = []
+        grad_list = []
+        param_grad_set = set()
+        for p, g in self.params_grads:
+            # skip parameter marked not trainable
+            if type(p) == Parameter and p.trainable == False:
+                continue
+            if p.name not in param_grad_set:
+                param_list.append(p)
+                param_grad_set.add(p.name)
+            if g.name not in param_grad_set:
+                grad_list.append(g)
+                param_grad_set.add(g.name)
+
+        param_list, grad_list = self._update_dist_lookup_table_vars(
+            param_list, grad_list, self.params_grads)
+
+        if slice_var_up:
+            # when we slice var up into blocks, we will slice the var according to
+            # pserver services' count. A pserver may have two or more listening ports.
+            grad_blocks = slice_variable(grad_list, len(self.pserver_endpoints))
+            param_blocks = slice_variable(param_list,
+                                          len(self.pserver_endpoints))
+        else:
+            # when we do NOT slice var up into blocks, we will always slice params
+            # grads into one block.
+            grad_blocks = slice_variable(grad_list, 1)
+            param_blocks = slice_variable(param_list, 1)
+        assert (len(grad_blocks) == len(param_blocks))
+
+        # origin_varname -> [splited_var]
+        self.param_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program, param_blocks)
+        self.grad_var_mapping = self._create_vars_from_blocklist(
+            self.origin_program,
+            grad_blocks,
+            add_trainer_suffix=self.trainer_num > 1)
+        self.grad_param_mapping = dict()
+        for g, p in zip(grad_blocks, param_blocks):
+            g_name, g_bid, _ = g.split(":")
+            p_name, p_bid, _ = p.split(":")
+            self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] =  \
+                    self.param_var_mapping[p_name][int(p_bid)]
+
+        # create mapping of endpoint -> split var to create pserver side program
+        self.param_grad_ep_mapping = dict()
+        [
+            self.param_grad_ep_mapping.update({
+                ep: {
+                    "params": [],
+                    "grads": []
+                }
+            }) for ep in self.pserver_endpoints
+        ]
+
+    # transpiler function for dis lookup_table
+    def _replace_lookup_table_op_with_prefetch(self, program,
+                                               pserver_endpoints):
+        # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
+        # self.all_prefetch_input_vars =
+        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
+        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_prefetch_input_vars = []
+
+        # self.all_prefetch_input_vars =
+        #       [[var0_prefetch_in_pserver0, var0_prefetch_in_pserver1]
+        #        [var1_prefetch_in_pserver0, var1_prefetch_in_pserver1]]
+        self.all_prefetch_output_vars = []
+
+        continue_search_lookup_table_op = True
+        while continue_search_lookup_table_op:
+            continue_search_lookup_table_op = False
+            all_ops = program.global_block().ops
+            for op in all_ops:
+                if op.type == LOOKUP_TABLE_TYPE:
+                    continue_search_lookup_table_op = True
+
+                    lookup_table_op_index = list(all_ops).index(op)
+                    ids_name = op.input("Ids")
+                    out_name = op.output("Out")
+
+                    ids_var = program.global_block().vars[ids_name[0]]
+                    prefetch_input_vars = self.create_splited_vars(
+                        source_var=ids_var,
+                        block=program.global_block(),
+                        tag="_prefetch_in_")
+                    self.all_prefetch_input_vars.append(prefetch_input_vars)
+
+                    out_var = program.global_block().vars[out_name[0]]
+                    prefetch_output_vars = self.create_splited_vars(
+                        source_var=out_var,
+                        block=program.global_block(),
+                        tag="_prefetch_out_")
+                    self.all_prefetch_output_vars.append(prefetch_output_vars)
+
+                    # insert split_ids_op
+                    program.global_block().insert_op(
+                        index=lookup_table_op_index,
+                        type="split_ids",
+                        inputs={
+                            'Ids': [
+                                program.global_block().vars[varname]
+                                for varname in ids_name
+                            ]
+                        },
+                        outputs={"Out": prefetch_input_vars})
+
+                    # insert prefetch_op
+                    program.global_block().insert_op(
+                        index=lookup_table_op_index + 1,
+                        type="prefetch",
+                        inputs={'X': prefetch_input_vars},
+                        outputs={"Out": prefetch_output_vars},
+                        attrs={
+                            "epmap": pserver_endpoints,
+                            RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                        })
+
+                    # insert concat_op
+                    program.global_block().insert_op(
+                        index=lookup_table_op_index + 2,
+                        type="merge_ids",
+                        inputs={
+                            'Ids': [
+                                program.global_block().vars[varname]
+                                for varname in ids_name
+                            ],
+                            'X': prefetch_output_vars
+                        },
+                        outputs={
+                            "Out": [
+                                program.global_block().vars[varname]
+                                for varname in out_name
+                            ]
+                        })
+
+                    # delete lookup_table_op
+                    delete_ops(program.global_block(), [op])
+                    # break for loop
+                    break
+
+    def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints):
+        # 2. add split_ids_op and send_op to send gradient to pservers
+        # there should only be one table_name
+        all_ops = program.global_block().ops
+        table_grad_name = grad_var_name(self.table_name)
+        for op in all_ops:
+            if table_grad_name in op.output_arg_names:
+                op_index = list(all_ops).index(op)
+                # insert split_ids_op
+                program.global_block().insert_op(
+                    index=op_index + 1,
+                    type="split_ids",
+                    inputs={
+                        'Ids': [program.global_block().vars[table_grad_name]]
+                    },
+                    outputs={"Out": self.trainer_side_table_grad_list})
+                program.global_block().insert_op(
+                    index=op_index + 2,
+                    type="send",
+                    inputs={'X': self.trainer_side_table_grad_list},
+                    outputs={},
+                    attrs={
+                        "sync_mode": True,
+                        "epmap": pserver_endpoints,
+                        RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
+                    })
+                break
+
+    def _create_prefetch_block(self, pserver_index, pserver_program,
+                               optimize_block):
+        # STEP: create prefetch block
+        table_var = pserver_program.global_block().vars[self.table_name]
+        prefetch_var_name_to_block_id = []
+        for index in range(len(self.all_prefetch_input_vars)):
+            prefetch_block = pserver_program.create_block(optimize_block.idx)
+            trainer_ids = self.all_prefetch_input_vars[index][pserver_index]
+            pserver_ids = pserver_program.global_block().create_var(
+                name=trainer_ids.name,
+                type=trainer_ids.type,
+                shape=trainer_ids.shape,
+                dtype=trainer_ids.dtype)
+            trainer_out = self.all_prefetch_output_vars[index][pserver_index]
+            pserver_out = pserver_program.global_block().create_var(
+                name=trainer_out.name,
+                type=trainer_out.type,
+                shape=trainer_out.shape,
+                dtype=trainer_out.dtype)
+            prefetch_block.append_op(
+                type="lookup_sparse_table",
+                inputs={'Ids': pserver_ids,
+                        "W": table_var},
+                outputs={"Out": pserver_out},
+                attrs={
+                    "is_sparse": True,  # has no effect on lookup_table op
+                    "is_distributed": True,
+                    "padding_idx": -1
+                })
+            prefetch_var_name_to_block_id.append(trainer_ids.name + ":" + str(
+                prefetch_block.idx))
+        return prefetch_var_name_to_block_id
+
+    def _create_table_optimize_block(self, pserver_index, pserver_program,
+                                     pre_block_idx, grad_to_block_id):
+        # STEP: create table optimize block
+        # create table param and grad var in pserver program
+        origin_param_var = self.origin_program.global_block().vars[
+            self.table_name]
+        param_var = pserver_program.global_block().create_var(
+            name=origin_param_var.name,
+            shape=origin_param_var.shape,
+            dtype=origin_param_var.dtype,
+            type=core.VarDesc.VarType.SELECTED_ROWS,
+            persistable=True)
+        # parameter must be selected rows
+        param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS)
+        grad_var = pserver_program.global_block().clone_variable(
+            self.origin_program.global_block().vars[grad_var_name(
+                self.table_name)])
+
+        # create table optimize block in pserver program
+        table_opt_op = [
+            op for op in self.optimize_ops
+            if op.input("Param")[0] == self.table_name
+        ][0]
+        table_opt_block = pserver_program.create_block(pre_block_idx)
+        # only support sgd now
+        assert table_opt_op.type == "sgd"
+
+        if self.sync_mode:
+            # create grad vars in pserver program
+            table_grad_var = self.table_param_grad[1]
+            pserver_side_table_grad_list = [
+                pserver_program.global_block().create_var(
+                    name="%s.trainer_%d.pserver_%d" %
+                    (table_grad_var.name, index, pserver_index),
+                    type=table_grad_var.type,
+                    shape=table_grad_var.shape,
+                    dtype=table_grad_var.dtype)
+                for index in range(self.trainer_num)
+            ]
+
+            # append sum op for pserver_side_table_grad_list
+            table_opt_block.append_op(
+                type="sum",
+                inputs={"X": pserver_side_table_grad_list},
+                outputs={"Out": [grad_var]},
+                attrs={"use_mkldnn": False})
+        else:
+            # in async_mode, for table gradient, it also need to be splited to each parameter server
+            origin_grad_name = grad_var.name
+            splited_grad_name = self.trainer_side_table_grad_list[
+                pserver_index].name
+            if not splited_grad_name.startswith(origin_grad_name):
+                raise ValueError("origin_grad_var: " + splited_grad_name +
+                                 " grad_var:" + grad_var.name)
+            grad_var = pserver_program.global_block().rename_var(
+                origin_grad_name, splited_grad_name)
+
+        lr_var = pserver_program.global_block().vars[table_opt_op.input(
+            "LearningRate")[0]]
+        inputs = {
+            "Param": [param_var],
+            "Grad": [grad_var],
+            "LearningRate": [lr_var]
+        }
+        outputs = {"ParamOut": [param_var]}
+        table_opt_block.append_op(
+            type=table_opt_op.type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=table_opt_op.attrs)
+
+        # add table parameter gradient and it's block id to grad_to_block_id
+        grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx))
+
+        return table_opt_block
+
+    def _create_checkpoint_save_block(self, pserver_program, pre_block_idx):
+        """
+        create a new block to handle save checkpoint.
+        """
+        import os
+
+        pserver_program.global_block().create_var(
+            name="kLookupTablePath",
+            persistable=True,
+            type=core.VarDesc.VarType.RAW)
+
+        checkpoint_save_block = pserver_program.create_block(pre_block_idx)
+        # this 'file_path' do not be used in save lookup table variable
+        checkpoint_save_block.append_op(
+            type='save',
+            inputs={'X': [self.table_name]},
+            outputs={},
+            attrs={'file_path': "none"})
+
+        return checkpoint_save_block.idx
+
+    def _create_vars_from_blocklist(self,
+                                    program,
+                                    block_list,
+                                    add_trainer_suffix=False):
+        """
+        Create vars for each split.
+        NOTE: only grads need to be named for different trainers, use
+              add_trainer_suffix to rename the grad vars.
+        Args:
+            program (ProgramDesc): ProgramDesc which gradients blong.
+            block_list (list[(varname, block_id, block_size)]): List of gradient blocks.
+            add_trainer_suffix (Bool): Add trainer suffix to new variable's name if set True.
+        Returns:
+            var_mapping (dict(varname->[new_varname_variable])):A dict mapping
+                from original var name to each var split.
+        """
+
+        # varname->[(block_id, current_block_size)]
+        block_map = dict()
+
+        var_mapping = dict()
+        for block_str in block_list:
+            varname, offset, size = block_str.split(":")
+            if not block_map.has_key(varname):
+                block_map[varname] = []
+            block_map[varname].append((long(offset), long(size)))
+
+        for varname, splited in block_map.iteritems():
+            orig_var = program.global_block().var(varname)
+            if len(splited) == 1:
+                if self.sync_mode and add_trainer_suffix:
+                    new_var_name = "%s.trainer_%d" % \
+                        (orig_var.name, self.trainer_id)
+                    program.global_block().rename_var(varname, new_var_name)
+                    var_mapping[varname] = \
+                        [program.global_block().var(new_var_name)]
+                else:
+                    var_mapping[varname] = \
+                        [program.global_block().var(orig_var.name)]
+                continue
+
+            var_mapping[varname] = []
+            orig_shape = orig_var.shape
+            orig_dim1_flatten = 1
+            if len(orig_shape) >= 2:
+                orig_dim1_flatten = reduce(lambda x, y: x * y, orig_shape[1:])
+
+            for i, block in enumerate(splited):
+                size = block[1]
+                rows = size / orig_dim1_flatten
+                splited_shape = [rows]
+                if len(orig_shape) >= 2:
+                    splited_shape.extend(orig_shape[1:])
+                new_var_name = ""
+                if self.sync_mode and add_trainer_suffix:
+                    new_var_name = "%s.block%d.trainer_%d" % \
+                        (varname, i, self.trainer_id)
+                else:
+                    new_var_name = "%s.block%d" % \
+                        (varname, i)
+                var = program.global_block().create_var(
+                    name=new_var_name,
+                    persistable=False,
+                    dtype=orig_var.dtype,
+                    type=orig_var.type,
+                    shape=splited_shape)  # flattend splited var
+                var_mapping[varname].append(var)
+            program.global_block().sync_with_cpp()
+        return var_mapping
+
+    def create_splited_vars(self, source_var, block, tag):
+        return [
+            block.create_var(
+                name=str(source_var.name + tag + str(index)),
+                type=source_var.type,
+                shape=source_var.shape,
+                dtype=source_var.dtype)
+            for index in range(len(self.pserver_endpoints))
+        ]
+
+    def _clone_var(self, block, var, persistable=True):
+        assert isinstance(var, Variable)
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=persistable)
+
+    def _insert_split_op(self, program, orig_var, index, splited_vars):
+        if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
+            height_sections = []
+            for v in splited_vars:
+                height_sections.append(v.shape[0])
+            program.global_block().insert_op(
+                index=index + 1,
+                type="split_selected_rows",
+                inputs={"X": orig_var},
+                outputs={"Out": splited_vars},
+                attrs={"height_sections": height_sections})
+        elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
+            sections = []
+            for v in splited_vars:
+                sections.append(v.shape[0])
+            program.global_block().insert_op(
+                index=index + 1,
+                type="split_byref",
+                inputs={"X": orig_var},
+                outputs={"Out": splited_vars},
+                attrs={"sections": sections}  # assume split evenly
+            )
+        else:
+            AssertionError("Variable type should be in set "
+                           "[LOD_TENSOR, SELECTED_ROWS]")
+
+    def _get_optimizer_input_shape(self, op_type, varkey, orig_shape,
+                                   param_shape):
+        """
+        Returns the shape for optimizer inputs that need to be reshaped when
+        Param and Grad is split to multiple servers.
+        """
+        # HACK(typhoonzero): Should use functions of corresponding optimizer in
+        # optimizer.py to get the shape, do not  bind this in the transpiler.
+        if op_type == "adam":
+            if varkey in ["Moment1", "Moment2"]:
+                return param_shape
+        elif op_type == "adagrad":
+            if varkey == "Moment":
+                return param_shape
+        elif op_type == "adamax":
+            if varkey in ["Moment", "InfNorm"]:
+                return param_shape
+        elif op_type == "momentum":
+            if varkey == "Velocity":
+                return param_shape
+        elif op_type == "":
+            if varkey == "Moment":
+                return param_shape
+        elif op_type == "sgd":
+            pass
+        return orig_shape
+
+    def _get_varname_parts(self, varname):
+        # returns origin, blockid, trainerid
+        orig_var_name = ""
+        trainer_part = ""
+        block_part = ""
+        trainer_idx = varname.find(".trainer_")
+        if trainer_idx >= 0:
+            trainer_part = varname[trainer_idx + 1:]
+        else:
+            trainer_idx = len(varname)
+        block_index = varname.find(".block")
+        if block_index >= 0:
+            block_part = varname[block_index + 1:trainer_idx]
+        else:
+            block_index = len(varname)
+        orig_var_name = varname[0:min(block_index, trainer_idx)]
+        return orig_var_name, block_part, trainer_part
+
+    def _orig_varname(self, varname):
+        orig, _, _ = self._get_varname_parts(varname)
+        return orig
+
+    def _append_pserver_grad_merge_ops(self, optimize_block,
+                                       grad_varname_for_block, endpoint,
+                                       grad_to_block_id, origin_program):
+        program = optimize_block.program
+        pserver_block = program.global_block()
+        grad_block = None
+        for g in self.param_grad_ep_mapping[endpoint]["grads"]:
+            if self._orig_varname(g.name) == \
+                    self._orig_varname(grad_varname_for_block):
+                grad_block = g
+                break
+        if not grad_block:
+            # do not append this op if current endpoint
+            # is not dealing with this grad block
+            return
+        orig_varname, block_name, trainer_name = self._get_varname_parts(
+            grad_block.name)
+        if block_name:
+            merged_var_name = '.'.join([orig_varname, block_name])
+        else:
+            merged_var_name = orig_varname
+        merged_var = \
+            pserver_block.vars[merged_var_name]
+        grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
+        if self.sync_mode and self.trainer_num > 1:
+            vars2merge = []
+            for i in xrange(self.trainer_num):
+                per_trainer_name = "%s.trainer_%d" % \
+                (merged_var_name, i)
+                vars2merge.append(pserver_block.vars[per_trainer_name])
+
+            optimize_block.append_op(
+                type="sum",
+                inputs={"X": vars2merge},
+                outputs={"Out": merged_var},
+                attrs={"use_mkldnn": False})
+            # TODO(panyx0718): What if it's SELECTED_ROWS.
+            if not merged_var.type == core.VarDesc.VarType.SELECTED_ROWS:
+                optimize_block.append_op(
+                    type="scale",
+                    inputs={"X": merged_var},
+                    outputs={"Out": merged_var},
+                    attrs={"scale": 1.0 / float(self.trainer_num)})
+        return merged_var
+
+    def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
+                            grad_to_block_id, origin_program, merged_var):
+        program = optimize_block.program
+        pserver_block = program.global_block()
+        new_inputs = dict()
+        # update param/grad shape first, then other inputs like
+        # moment can use the updated shape
+        for key in opt_op.input_names:
+            if key == "Grad":
+                new_inputs[key] = merged_var
+            elif key == "Param":
+                # param is already created on global program
+                param_block = None
+                for p in self.param_grad_ep_mapping[endpoint]["params"]:
+                    if same_or_split_var(p.name, opt_op.input(key)[0]):
+                        param_block = p
+                        break
+                if not param_block:
+                    return
+                tmpvar = pserver_block.create_var(
+                    name=param_block.name,
+                    persistable=True,
+                    dtype=param_block.dtype,
+                    shape=param_block.shape)
+                new_inputs[key] = tmpvar
+            elif key == "LearningRate":
+                # learning rate variable has already be created by non-optimize op,
+                # don't create it once again.
+                lr_varname = opt_op.input(key)[0]
+                if pserver_block.vars.has_key(lr_varname):
+                    new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]]
+                else:
+                    origin_var = origin_program.global_block().vars[lr_varname]
+                    tmpvar = pserver_block.create_var(
+                        name=origin_var.name,
+                        persistable=origin_var.persistable,
+                        dtype=origin_var.dtype,
+                        shape=origin_var.shape)
+                    new_inputs[key] = tmpvar
+
+        for key in opt_op.input_names:
+            new_shape = None
+            if key in ["Param", "Grad", "LearningRate"]:
+                continue
+            var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
+            # update accumulator variable shape
+            param_shape = new_inputs["Param"].shape
+            new_shape = self._get_optimizer_input_shape(opt_op.type, key,
+                                                        var.shape, param_shape)
+            tmpvar = pserver_block.create_var(
+                name=var.name,
+                persistable=var.persistable,
+                dtype=var.dtype,
+                shape=new_shape)
+            new_inputs[key] = tmpvar
+
+        # change output's ParamOut variable
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, opt_op)
+        outputs["ParamOut"] = new_inputs["Param"]
+
+        optimize_block.append_op(
+            type=opt_op.type,
+            inputs=new_inputs,
+            outputs=outputs,
+            attrs=opt_op.attrs)
+
+    def _is_splited_grad_var(self, var, var_dict):
+        grad_block = None
+        for _, g in var_dict.iteritems():
+            if self._orig_varname(g.name) == self._orig_varname(var.name):
+                if g.name.find(".trainer_") == -1:
+                    grad_block = g
+                    break
+        return grad_block
+
+    def _clone_lr_op(self, program, block, op):
+        inputs = self._get_input_map_from_op(
+            self.origin_program.global_block().vars, op)
+        for key, varlist in inputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                if var not in program.global_block().vars:
+                    block.clone_variable(var)
+
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, op)
+        for key, varlist in outputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                if var not in program.global_block().vars:
+                    block.clone_variable(var)
+
+        return block.append_op(
+            type=op.type, inputs=inputs, outputs=outputs, attrs=op.attrs)
+
+    def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
+        program = optimize_block.program
+        # Append the ops for parameters that do not need to be optimized/updated
+        inputs = self._get_input_map_from_op(
+            self.origin_program.global_block().vars, opt_op)
+        for key, varlist in inputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                # for ops like clipping and weight decay, get the splited var
+                # for inputs/outputs
+                grad_block = self._is_splited_grad_var(
+                    var, program.global_block().vars)
+                if grad_block:
+                    inputs[key] = grad_block
+                elif not program.global_block().vars.has_key(var.name):
+                    program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, opt_op)
+        for key, varlist in outputs.iteritems():
+            if not isinstance(varlist, list):
+                varlist = [varlist]
+            for var in varlist:
+                grad_block = self._is_splited_grad_var(
+                    var, program.global_block().vars)
+                if grad_block:
+                    outputs[key] = grad_block
+                elif not program.global_block().vars.has_key(var.name):
+                    program.global_block().clone_variable(var)
+
+        return optimize_block.append_op(
+            type=opt_op.type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=opt_op.attrs)
+
+    def _is_op_connected(self, op1, op2):
+        # If one op's input is another op's output or
+        # one op's output is another op's input, we say
+        # the two operator is connected.
+        def _append_inname_remove_beta(varname_list):
+            op_input_names = []
+            for in_name in varname_list:
+                # HACK: remove beta1 and beta2 to avoid let all
+                # ops connected.
+                if in_name.startswith("beta2_pow_acc") or \
+                    in_name.startswith("beta1_pow_acc"):
+                    continue
+                else:
+                    op_input_names.append(in_name)
+            return op_input_names
+
+        op1_input_names = _append_inname_remove_beta(op1.desc.input_arg_names())
+        op1_output_names = op1.desc.output_arg_names()
+
+        op2_input_names = _append_inname_remove_beta(op2.desc.input_arg_names())
+        op2_output_names = op2.desc.output_arg_names()
+
+        if set(op1_output_names) & set(op2_input_names) or \
+           set(op1_input_names) & set(op2_output_names):
+            return True
+        return False
+
+    def _create_ufind(self, optimize_ops):
+        # Create a unit find data struct by optimize ops
+        ufind = UnionFind(optimize_ops)
+        for i in xrange(len(optimize_ops)):
+            for j in xrange(i, len(optimize_ops)):
+                op1 = optimize_ops[i]
+                op2 = optimize_ops[j]
+                if self._is_op_connected(op1, op2):
+                    ufind.union(op1, op2)
+        return ufind
+
+    def _is_optimizer_op(self, op):
+        if "Param" in op.input_names and \
+            "LearningRate" in op.input_names:
+            return True
+        return False
+
+    def _is_opt_op_on_pserver(self, endpoint, op):
+        param_names = [
+            p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
+        ]
+        if op.input("Param")[0] in param_names:
+            return True
+        else:
+            for n in param_names:
+                param = op.input("Param")[0]
+                if same_or_split_var(n, param) and n != param:
+                    return True
+            return False
+
+    def _get_input_map_from_op(self, varmap, op):
+        """Returns a dict from op input name to the vars in varmap."""
+        iomap = dict()
+        for key in op.input_names:
+            vars = []
+            for varname in op.input(key):
+                vars.append(varmap[varname])
+            if len(vars) == 1:
+                iomap[key] = vars[0]
+            else:
+                iomap[key] = vars
+        return iomap
+
+    def _get_output_map_from_op(self, varmap, op):
+        """Returns a dict from op output name to the vars in varmap."""
+        iomap = dict()
+        for key in op.output_names:
+            vars = []
+            for varname in op.output(key):
+                vars.append(varmap[varname])
+            if len(vars) == 1:
+                iomap[key] = vars[0]
+            else:
+                iomap[key] = vars
+        return iomap
+
+    def _get_lr_ops(self):
+        lr_ops = []
+        # find learning rate variables by optimize op
+        lr_vars = set()
+        for op in self.optimize_ops:
+            if self._is_optimizer_op(op):
+                lr_vars.add(op.input("LearningRate")[0])
+
+        find_ops = []
+        # find ops which output is lr var
+        block = self.origin_program.global_block()
+        for op in block.ops:
+            if set(op.output_arg_names) & lr_vars:
+                find_ops.append(op)
+        # make a union find struct by the ops in default_main_program
+        ufind = UnionFind(block.ops)
+
+        for op1 in block.ops:
+            for op2 in block.ops:
+                # NOTE: we need to skip all optimize ops, since it is connected
+                # with forward/backward ops and lr ops, we only need the lr ops.
+                if op1 != op2 and self._is_op_connected(op1, op2) and \
+                    not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
+                    ufind.union(op1, op2)
+        # find all ops which is related with lr var
+        for op1 in block.ops:
+            for op2 in find_ops:
+                if ufind.is_connected(op1, op2):
+                    lr_ops.append(op1)
+                    # we only need to append op for once
+                    break
+        return lr_ops
+
+    def _is_opt_role_op(self, op):
+        # NOTE: depend on oprole to find out whether this op is for
+        # optimize
+        op_maker = core.op_proto_and_checker_maker
+        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+        if op_maker.kOpRoleAttrName() in op.attrs and \
+            int(op.attrs[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+            return True
+        return False
+
+    def _get_optimize_pass(self):
+        """
+        Get optimizer operators, paramters and gradients from origin_program
+        Returns:
+            opt_ops (list): optimize operators.
+            params_grads (dict): paramter->gradient.
+        """
+        block = self.origin_program.global_block()
+        opt_ops = []
+        params_grads = []
+        origin_var_dict = self.origin_program.global_block().vars
+        for op in block.ops:
+            if self._is_opt_role_op(op):
+                opt_ops.append(op)
+                # HACK(wuyi): if we find grad vars from input of optimize
+                # ops, we may get the output of clip op. Use syntax "@GRAD"
+                # and op_role_var to get the pair.
+                for input_name in op.input_arg_names:
+                    if input_name.find("@GRAD") != -1 and \
+                        op.attrs[RPC_OP_ROLE_ATTR_NAME]:
+                        param_name = op.attrs[OP_ROLE_VAR_ATTR_NAME][0]
+                        params_grads.append([
+                            origin_var_dict[param_name],
+                            origin_var_dict[input_name]
+                        ])
+            elif self._is_adam_connected_op(op):
+                opt_ops.append(op)
+            else:
+                pass
+        return opt_ops, params_grads
+
+    def _is_adam_connected_op(self, op):
+        """
+        A hack function to determinate whether the input operator
+        is connected to optimize operator.
+        """
+        if op.type == "scale":
+            for in_name in op.input_arg_names:
+                if in_name.startswith("beta1_pow_acc") or \
+                        in_name.startswith("beta2_pow_acc"):
+                    return True
+        return False
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..d32c69d148dfa1633ce344611ca3fe7879a234e9
--- /dev/null
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -0,0 +1,310 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+from .. import core
+from ..framework import Program
+from ..executor import global_scope
+
+
+class InferenceTranspiler:
+    '''
+    Convert the fluid program to optimized inference program.
+
+    There are several optimizations:
+
+      - fuse convolution and batch normalization
+      - fuse batch normalization and relu (MKLDNN only)
+
+    Examples:
+
+    .. code-block:: python
+
+        # As InferenceTranspiler will modify the original program,
+        # please clone before use it.
+        inference_transpiler_program = program.clone()
+        t = fluid.InferenceTranspiler()
+        t.transpile(inference_transpiler_program, place)
+    '''
+
+    def transpile(self, program, place, scope=None):
+        '''
+        Run the transpiler.
+
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope|None): inference Scope
+        '''
+        if not isinstance(program, Program):
+            raise TypeError("program should be as Program type")
+        if not isinstance(place, core.CPUPlace) and not isinstance(
+                place, core.CUDAPlace):
+            raise TypeError("place should be as CPUPlace/CUDAPlace type")
+        if scope is None:
+            scope = global_scope()
+        if not isinstance(scope, core.Scope):
+            raise TypeError("scope should be as Scope type or None")
+        self.fuse_batch_norm(program, place, scope)
+        self.fuse_relu_mkldnn(program)
+
+    def fuse_relu_mkldnn(self, program):
+        '''
+        Transpile the program by fused relu activation for MKLDNN program.
+
+        Relu activation following batch norm OP can be fused by adding
+        :math:`fuse_with_relu` attribute to batch norm OP.
+
+        The result of fuse is:
+
+        - before:
+
+          - batch_norm->relu->any_other_op
+
+        - after:
+
+          - batch_norm->any_other_op
+
+        :param program: program to transpile
+        :type program: Program
+        '''
+        use_mkldnn = bool(os.getenv("FLAGS_use_mkldnn", False))
+        if not use_mkldnn:
+            return
+
+        self.block = program.block(0)
+
+        i = 0
+        while i < len(self.block.ops) - 1:
+            current_op = self.block.ops[i]
+            if current_op.type in ['batch_norm']:
+                next_op = self.block.ops[i + 1]
+                if next_op.type == 'relu':
+                    # modify bnorm OP to include relu
+                    current_op.set_attr("fuse_with_relu", True)
+                    # remove relu OP
+                    self.block.remove_op(i + 1)
+            i = i + 1
+
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    def fuse_batch_norm(self, program, place, scope):
+        '''
+        Transpile the program by fused batch normalization.
+
+        The batch normalization followed the convolution or fully connected layer
+        can be integrated with them. Doing so will give us a forward acceleration,
+        especially in environments like mobile or embedded.
+
+        For input :math:`X`:
+
+        - Conv process:        :math:`X = input * W + bias`
+        - Batch norm process:  :math:`X' = (X - mean) / std`
+        - Scale Process:       :math:`Y = a * X' + b`
+
+        After fuse into one operation:
+
+        .. math::
+
+            Y &= (input * W + bias - mean) / std * a + b \\\\
+              &= input * a * W / std + ((bias - mean) / std * a + b)
+
+        The operator transformation is:
+
+        - before:
+
+          - conv->batch_norm->any_other_op (bias == 0)
+          - conv->elementwise_add->batch_norm->any_other_op (bias != 0)
+
+        - after:
+
+          - conv->elementwise_add->any_other_op
+
+        The transpile stages are:
+
+        1. insert elementwise_add op when bias == 0.
+        2. fuse the batch_norm's parameters to conv and elementwise_add operators.
+        3. remove batch_norm ops which are not used in any other ops.
+        4. adjust the input of any_other_op to be the output of elementwise_add operator.
+        5. remove unused variables.
+
+        Args:
+            program (Program): program to transpile
+            place (Place): inference place
+            scope (Scope): inference Scope
+
+        '''
+        self.scope = scope
+        self.place = place
+        self.block = program.block(0)
+        self.input_map = {}  # store the input names should be adjusted
+
+        i = 0
+        while i < len(self.block.ops) - 2:
+            current_op = self.block.ops[i]
+            # TODO(luotao1): consider only conv2d now. fc would be delt later.
+            if current_op.type in ['conv2d']:
+                # TODO(luotao1): consider single chain network now.
+                # For branch network, we counldn't use block.ops[i + 1] as
+                # the judgment condition.
+                next_op = self.block.ops[i + 1]
+                # conv2d without bias
+                if (next_op.type == 'batch_norm'):
+                    # insert bias op
+                    bias_op = self._insert_bias_op(i + 1, current_op, next_op)
+                    # fuse batch_norm
+                    self._fuse_param(current_op, next_op, bias_op, 0)
+                    # remove batch_norm_op
+                    self.block.remove_op(i + 2)
+                    i = i + 1
+                # conv2d with bias, the next_op.type is elementwise_add
+                elif (next_op.type == 'elementwise_add'):
+                    next_next_op = self.block.ops[i + 2]
+                    if (next_next_op.type == 'batch_norm'):
+                        # fuse batch_norm
+                        self._fuse_param(current_op, next_next_op, next_op, 1)
+                        # remove batch_norm_op
+                        self.block.remove_op(i + 2)
+                        i = i + 1
+            i = i + 1
+
+        self._adjust_input()
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
+    # ====================== private transpiler functions =====================
+    def _insert_bias_op(self, index, current_op, bn_op):
+        '''
+        Construct elementwise_add operator for adding bias
+        and insert it into program.
+
+        :param index: insert location of bias_op
+        :type index: Int
+        :param current_op: current operator (conv or fc)
+        :type current_op: Operator
+        :param bn_op: batch norm operator
+        :type bn_op: Operator
+        :return: bias_op
+        :rtype: Operator
+        '''
+        # The input of bias_op is current_op's output and Bias of bn_op
+        # The output of bias_op is bn_op's output
+        x_var = self.block.var(current_op.output("Output")[0])
+        y_var = self.block.var(bn_op.input("Bias")[0])
+        out_var = self.block.var(bn_op.output("Y")[0])
+
+        bias_op = self.block.insert_op(
+            index,
+            type="elementwise_add",
+            inputs={"X": x_var,
+                    "Y": y_var},
+            outputs={"Out": out_var},
+            attrs={"axis": 1})  # dim_start=1
+        return bias_op
+
+    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
+        '''
+        fuse the batch_norm_op' parameters to current_op (conv or fc)
+
+        :param current_op: current operator (conv or fc)
+        :type current_op: Operator
+        :param bn_op: batch norm operator
+        :type bn_op: Operator
+        :param bias_op: elementwise_add operator for adding bias
+        :type bias_op: Operator
+        :param with_bias: If current operator has bias, with_bias = 1; otherwise 0.
+        :type with_bias: Int
+        '''
+
+        def _update_param(op, old_param_name, new_param):
+            # For the sake of remaining the original variables the same as before,
+            # create new variables in scope to store the new parameters.
+            old_param_name = old_param_name[0]
+            old_var = self.block.vars[old_param_name]
+            new_param_name = old_param_name + '_fuse_bn'
+            new_var = self.block.create_parameter(
+                name=new_param_name.encode('ascii'),
+                type=old_var.type,
+                dtype=old_var.dtype,
+                shape=old_var.shape)
+            op.rename_input(old_param_name, new_param_name)
+            self.scope.var(new_param_name)
+
+            tensor = self.scope.find_var(new_param_name).get_tensor()
+            tensor.set(np.array(new_param), self.place)
+
+        def _load_param(param_name):
+            return np.array(self.scope.find_var(param_name[0]).get_tensor())
+
+        bias_bn = _load_param(bn_op.input("Bias"))  #Bias
+        scale_bn = _load_param(bn_op.input("Scale"))  #Scale
+        mean_bn = _load_param(bn_op.input("Mean"))  #Mean
+        var_bn = _load_param(bn_op.input("Variance"))  #Variance
+
+        # TODO(luotao1): consider only conv2d now. fc would be delt later.
+        current_param = _load_param(current_op.input("Filter"))
+        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
+        tmp = np.float32(np.divide(scale_bn, std_bn))
+
+        # add bias of batch_norm_op to conv2d
+        if with_bias:
+            bias = _load_param(bias_op.input("Y"))
+        else:
+            bias = np.zeros(bias_bn.shape)
+        bias = np.float32(
+            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
+
+        # re-compute weight of conv2d
+        tmp = tmp.reshape(tmp.shape[0], -1)
+        dst_param = current_param.reshape((tmp.shape[0], -1))
+        dst_param = np.float32(np.multiply(dst_param, tmp))
+        dst_param = dst_param.reshape(current_param.shape)
+
+        # update parameters
+        _update_param(current_op, current_op.input("Filter"), dst_param)
+        _update_param(bias_op, bias_op.input("Y"), bias)
+
+        # collect the renamed input
+        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
+
+    def _adjust_input(self):
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            for input_arg in current_op.input_arg_names:
+                if input_arg in self.input_map:
+                    current_op.rename_input(input_arg,
+                                            self.input_map[input_arg])
+
+    def _remove_unused_var(self):
+        '''
+        remove unused varibles in program
+        '''
+        args = []
+        for i in range(len(self.block.ops)):
+            current_op = self.block.ops[i]
+            args += current_op.input_arg_names
+            args += current_op.output_arg_names
+        args = list(set(args))  # unique the input and output arguments
+
+        for var in self.block.vars.keys():
+            if var not in args:
+                self.block.remove_var(var)
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..999ef43ca0feacbddff5f9db59589ce7097fe77e
--- /dev/null
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -0,0 +1,398 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from .. import core
+from ..framework import Program, default_main_program, Parameter, Variable
+from ..backward import _rename_arg_
+
+dtype_to_size = {
+    core.VarDesc.VarType.FP16: 2,
+    core.VarDesc.VarType.FP32: 4,
+    core.VarDesc.VarType.FP64: 8,
+    core.VarDesc.VarType.INT16: 2,
+    core.VarDesc.VarType.INT32: 4,
+    core.VarDesc.VarType.INT64: 8,
+    core.VarDesc.VarType.BOOL: 1,
+    core.VarDesc.VarType.UINT8: 1,
+}
+
+SUB_BLOCK_OPS = [
+    "while", "while_grad", "parallel_do", "parallel_do_grad",
+    "conditional_block", "conditional_block_grad"
+]
+
+SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
+                  ("conditional_block", "conditional_block_grad")]
+
+PRINT_LOG = False
+
+
+class ControlFlowGraph(object):
+    def __init__(self, program, ops, forward_num, skip_opt):
+        self._program = program
+        self._ops = ops
+        self._forward_num = forward_num
+        self._successors = defaultdict(set)
+        self._presuccessors = defaultdict(set)
+        self._uses = defaultdict(set)
+        self._defs = defaultdict(set)
+        self._live_in = defaultdict(set)
+        self._live_out = defaultdict(set)
+        self._skip_opt = skip_opt
+
+    def _add_connections(self, connections):
+        """Populates _successors and _presuccessors for two neighbor nodes."""
+        for node1, node2 in connections:
+            self._add(node1, node2)
+
+    def _add(self, node1, node2):
+        self._successors[node1].add(node2)
+        self._presuccessors[node2].add(node1)
+
+    # TODO(panyx0718): We need to have a unified way of building intermediate
+    # representation.
+    def _build_graph(self):
+        """Build a graph based on op sequence.
+        """
+        self.op_size = len(self._ops)
+        op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
+        self._add_connections(op_node_connections)
+        for i in range(self.op_size):
+            self._uses[i].update(self._ops[i].input_arg_names())
+            self._defs[i].update(self._ops[i].output_arg_names())
+
+    def _update_graph(self, old_name, new_name, begin_idx=0):
+        for i in range(begin_idx, self.op_size):
+            if old_name in self._uses[i]:
+                self._uses[i].remove(old_name)
+                self._uses[i].add(new_name)
+            if old_name in self._defs[i]:
+                self._defs[i].remove(old_name)
+                self._defs[i].add(new_name)
+            if old_name in self._live_in[i]:
+                self._live_in[i].remove(old_name)
+                self._live_out[i].add(new_name)
+            if old_name in self._live_out[i]:
+                self._live_out[i].remove(old_name)
+                self._live_out[i].add(new_name)
+
+    def _reach_fixed_point(self, live_in, live_out):
+        """Check if the liveness set has stablized."""
+        if len(live_in) != len(self._live_in):
+            return False
+        if len(live_out) != len(self._live_out):
+            return False
+        for i in range(self.op_size):
+            if (live_in[i] != self._live_in[i] or
+                    live_out[i] != self._live_out[i]):
+                return False
+        return True
+
+    def _dataflow_analyze(self):
+        self._build_graph()
+        live_in = defaultdict(set)
+        live_out = defaultdict(set)
+        # Repeatedly apply liveness updates until the algorithm stablize
+        # on a complete set live input vars and live output vars.
+        while True:
+            for i in reversed(range(self.op_size)):
+                live_in[i] = set(self._live_in[i])
+                live_out[i] = set(self._live_out[i])
+                for s in self._successors[i]:
+                    self._live_out[i] |= self._live_in[s]
+                self._live_in[i] = self._uses[i] | (
+                    self._live_out[i] - self._defs[i])
+            if self._reach_fixed_point(live_in, live_out):
+                break
+
+    def _get_diff(self, a, b):
+        u = a & b
+        return a - u, b - u
+
+    def _has_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.has_var(str(var_name))
+        else:
+            return block_desc.has_var_recursive(str(var_name))
+
+    def _find_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.find_var(str(var_name))
+        else:
+            return block_desc.find_var_recursive(str(var_name))
+
+    def _check_var_validity(self, block_desc, x, is_forward):
+        if str(x) == "@EMPTY@":
+            return False
+        if not self._has_var(block_desc, x, is_forward):
+            return False
+        if self._find_var(block_desc, x, is_forward).persistable():
+            return False
+        if self._find_var(block_desc, x,
+                          is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
+            return False
+        if x in self._skip_opt:
+            return False
+        if not self._find_var(block_desc, x, is_forward).shape():
+            return False
+        return True
+
+    # TODO(panyx0718): This needs to be less hacky. It seems memory optimization
+    # doesn't consider vars copied between cpu and gpu.
+    def _update_skip_opt_set(self):
+        for i in range(self.op_size):
+            op = self._ops[i]
+            if op.type() == "fill_constant" and op.attr("force_cpu") == True:
+                self._skip_opt.update(op.output_arg_names())
+
+    def release_memory(self, skip_opt_set=None):
+        self._dataflow_analyze()
+        self._update_skip_opt_set()
+        if skip_opt_set:
+            self._skip_opt.update(skip_opt_set)
+        fwd_id = 0
+        bwd_id = 0
+        for i in range(self.op_size):
+            op = self._ops[i]
+            if op.type() in SUB_BLOCK_OPS:
+                continue
+            block_desc = op.block()
+            is_forward = i < self._forward_num
+            in_diff, out_diff = self._get_diff(self._live_in[i],
+                                               self._live_out[i])
+            can_optimize = filter(
+                lambda x: self._check_var_validity(block_desc, x, is_forward),
+                in_diff)
+            if can_optimize:
+                index = i + fwd_id + 1 if is_forward else i - self._forward_num + bwd_id + 1
+                delete_op = block_desc.insert_op(index)
+                delete_op.set_type("delete_var")
+                delete_op.set_input("X", can_optimize)
+                if is_forward:
+                    fwd_id += 1
+                else:
+                    bwd_id += 1
+
+    def memory_optimize(self, skip_opt_set=None, level=0):
+        def compare_shape(x_shape, cache_shape, opt_level):
+            if opt_level == 0:
+                return x_shape == cache_shape
+            elif opt_level == 1:
+                if (x_shape[0] == -1) ^ (cache_shape[0] == -1):
+                    return False
+                x_size = abs(reduce(lambda x, y: x * y, x_shape))
+                cache_size = abs(reduce(lambda x, y: x * y, cache_shape))
+                if x_size <= cache_size:
+                    return True
+            else:
+                raise ValueError("only support opt_level 0 or 1.")
+            return False
+
+        self._dataflow_analyze()
+        self._update_skip_opt_set()
+        # update skip set to meet users' demand
+        if skip_opt_set:
+            self._skip_opt.update(skip_opt_set)
+        self.pool = []
+        for i in range(self.op_size):
+            op = self._ops[i]
+            if op.type() in SUB_BLOCK_OPS:
+                continue
+            block_desc = op.block()
+            is_forward = i < self._forward_num
+            if self.pool:
+                defs_can_optimize = filter(
+                    lambda x: self._check_var_validity(block_desc, x, is_forward),
+                    self._defs[i])
+                out_pair = [
+                    (x, self._find_var(block_desc, x, is_forward).shape())
+                    for x in defs_can_optimize
+                ]
+                for x, x_shape in out_pair:
+                    # If x is both in uses and defs, it can not be optimized!
+                    if x in self._uses[i]:
+                        continue
+                    for index, cache_pair in enumerate(self.pool):
+                        cache_var = cache_pair[0]
+                        cache_shape = cache_pair[1]
+                        if not compare_shape(x_shape, cache_shape, level):
+                            continue
+
+                        if not self._has_var(block_desc, cache_var, is_forward):
+                            continue
+
+                        x_dtype = self._find_var(block_desc, x,
+                                                 is_forward).dtype()
+                        cache_dtype = self._find_var(block_desc, cache_var,
+                                                     is_forward).dtype()
+                        # TODO(qijun): actually, we should compare
+                        # dtype_to_size[x_dtype] and dtype_to_size[cache_dtype]
+                        if x_dtype != cache_dtype:
+                            continue
+
+                        if PRINT_LOG:
+                            print(("Hit Cache !!!! cache pool index "
+                                   "is %d, var name is %s, "
+                                   "cached var name is %s, "
+                                   "var shape is %s ") % (index, x, cache_var,
+                                                          str(cache_shape)))
+                        self.pool.pop(index)
+                        if x == cache_var:
+                            break
+                        # Rename the var to the cache var already with
+                        # memory allocated in order to reuse the memory.
+                        _rename_arg_(self._ops, x, cache_var, begin_idx=i)
+                        self._program.block(block_desc.id).var(str(
+                            x)).desc = self._find_var(block_desc, cache_var,
+                                                      is_forward)
+                        self._update_graph(x, cache_var, begin_idx=i)
+                        break
+
+            in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
+            can_optimize = filter(
+                lambda x: self._check_var_validity(block_desc, x, is_forward),
+                in_diff)
+            if can_optimize:
+                for var_name in can_optimize:
+                    self.pool.append((var_name, self._find_var(
+                        block_desc, var_name, is_forward).shape()))
+
+
+def _process_sub_block_pair(pdesc, sub_block_pair):
+    """Creates a list of tuple each of which tracks info of a subblock.
+
+      Note: this function doesn't handle nested subblocks yet.
+      TODO(panyx0718): assert if case nested subblocks happen.
+
+    :param pdesc: ProgramDesc.
+    :param sub_block_pair: A list op pairs. Each op pair is the forward
+        op and backward op. The ops in the list are special that they contain
+        a subblock of ops.
+    :return: A list of tuples, each tuple is (all ops in a subblock pair
+        including forward and backward, number of forward ops,
+        all output args names of the ops in the subblock pairs).
+    """
+    ops_list = []
+    block_desc = pdesc.block(0)
+    op_size = block_desc.op_size()
+    for fwd_op, bwd_op in sub_block_pair:
+        sub_block_ids = []
+        grad_sub_block_ids = []
+        sub_block_id_pair = []
+        sub_op_dict = {}
+        for i in range(op_size):
+            op = block_desc.op(i)
+            if op.type() == fwd_op:
+                sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op
+            elif op.type() == bwd_op:
+                grad_sub_block_ids.append(op.attr("sub_block").id)
+                sub_op_dict[op.attr("sub_block").id] = op
+
+        # Find fwd_op/bwd_op block pair
+        for grad_id in grad_sub_block_ids:
+            fwd_id = pdesc.block(grad_id).get_forward_block_idx()
+            if fwd_id in sub_block_ids:
+                sub_block_id_pair.append((fwd_id, grad_id))
+                sub_block_ids.remove(fwd_id)
+
+        # Get fwd_op/bwd_op block ops
+        for fwd_id, grad_id in sub_block_id_pair:
+            sub_block_ops = []
+            sub_block = pdesc.block(fwd_id)
+            block_op_size = sub_block.op_size()
+            for i in range(block_op_size):
+                sub_block_ops.append(sub_block.op(i))
+
+            grad_sub_block = pdesc.block(grad_id)
+            grad_sub_block_op_size = grad_sub_block.op_size()
+            for i in range(grad_sub_block_op_size):
+                sub_block_ops.append(grad_sub_block.op(i))
+
+            sub_op_output = set()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            sub_op_output.update(sub_op_dict[grad_id].output_arg_names())
+            ops_list.append((sub_block_ops, block_op_size, sub_op_output))
+
+        # Process rest fwd_op block ops
+        for fwd_id in sub_block_ids:
+            sub_block_ops = []
+            sub_block = pdesc.block(fwd_id)
+            sub_block_op_size = sub_block.op_size()
+            for i in range(sub_block_op_size):
+                sub_block_ops.append(sub_block.op(i))
+            sub_op_output = set()
+            sub_op_output.update(sub_op_dict[fwd_id].output_arg_names())
+            ops_list.append((sub_block_ops, sub_block_op_size, sub_op_output))
+    return ops_list
+
+
+def _get_cfgs(input_program):
+    """Process each block and create ControlFlowGraph for each of them.
+
+    :param input_program: Program object.
+    :return: A list of ControlFlowGraph, each corresponds to a block.
+    """
+    ops_list = []
+    pdesc = input_program.get_desc()
+    block_desc = pdesc.block(0)
+    op_size = block_desc.op_size()
+    # Get global block ops
+    ops_list.append(
+        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
+
+    # Only process one level of nested subblock.
+    ops_list.extend(_process_sub_block_pair(pdesc, SUB_BLOCK_PAIR))
+
+    cfgs = [
+        ControlFlowGraph(input_program, ops, forward_num, skip_opt)
+        for ops, forward_num, skip_opt in ops_list
+    ]
+    return cfgs
+
+
+def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
+    """Optimize memory by reusing var memory.
+
+      Note: it doesn't not support subblock nested in subblock.
+
+    :param input_program: Input Program
+    :param print_log: whether to print debug log.
+    :param level: If level=0, reuse if the shape is completely equal, o
+    :return:
+    """
+    if level != 0 and level != 1:
+        raise ValueError("only support opt_level 0 or 1.")
+    global PRINT_LOG
+    PRINT_LOG = print_log
+    cfgs = _get_cfgs(input_program)
+    for cfg in cfgs:
+        cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
+
+
+def release_memory(input_program, skip_opt_set=None):
+    """
+    Modify the input program and insert :code:`delete_op` to early drop not used
+    variables. The modification will be performed inplace.
+
+    Notes: This is an experimental API and could be removed in next few
+    releases. Users should not use this API.
+
+    Args:
+        input_program(Program): The program will be inserted :code:`delete_op`.
+    """
+    cfgs = _get_cfgs(input_program)
+    for cfg in cfgs:
+        cfg.release_memory(skip_opt_set=skip_opt_set)
diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcffadd531719431f27feb464ed58a65c04770ee
--- /dev/null
+++ b/python/paddle/fluid/transpiler/ps_dispatcher.py
@@ -0,0 +1,88 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class PSDispatcher(object):
+    """
+    PSDispatcher is the base class for dispatching vars
+    into different pserver instance.
+    You need to implement the `dispatch` inferface.
+    """
+
+    def __init__(self, pserver_endpoints):
+        self._eps = pserver_endpoints
+        self._step = 0
+
+    @property
+    def eps(self):
+        return self._eps
+
+    def reset(self):
+        self._step = 0
+
+    def dispatch(self, varlist):
+        """
+        Args:
+            varlist(list): a list of Variables
+        Returns:
+            a map of pserver endpoint -> varname
+        """
+        AssertionError("Interface has not been implemented.")
+
+
+class HashName(PSDispatcher):
+    """
+    Hash variable names to several endpoints using python
+    "hash()" function.
+
+    Args:
+        pserver_endpoints (list): list of endpoint(ip:port).
+    """
+
+    def __init__(self, pserver_endpoints):
+        super(self.__class__, self).__init__(pserver_endpoints)
+
+    def _hash_block(self, block_str, total):
+        return hash(block_str) % total
+
+    def dispatch(self, varlist):
+        eplist = []
+        for var in varlist:
+            server_id = self._hash_block(var.name(), len(self._eps))
+            server_for_param = self._eps[server_id]
+            eplist.append(server_for_param)
+        return eplist
+
+
+class RoundRobin(PSDispatcher):
+    """
+    Distribute variables to serveral endpoints using
+    RondRobin<https://en.wikipedia.org/wiki/Round-robin_scheduling> method.
+
+    Args:
+        pserver_endpoints (list): list of endpoint(ip:port).
+    """
+
+    def __init__(self, pserver_endpoints):
+        super(self.__class__, self).__init__(pserver_endpoints)
+
+    def dispatch(self, varlist):
+        eplist = []
+        for var in varlist:
+            server_for_param = self._eps[self._step]
+            eplist.append(server_for_param)
+            self._step += 1
+            if self._step >= len(self._eps):
+                self._step = 0
+        return eplist
diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py
index 33c53113ae7e8ed9aeada31f2aed6990b6fea110..776619cd36722e338a9fdd5e13bceeaf3724de2c 100644
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
@@ -16,7 +16,7 @@ import collections
 import contextlib
 import sys
 
-__all__ = ['generate', 'switch', 'guard', 'UniqueNameGenerator']
+__all__ = ['generate', 'switch', 'guard']
 
 
 class UniqueNameGenerator(object):
diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b059735a924d58714cd88a761eb83143f1192d6
--- /dev/null
+++ b/python/paddle/reader/__init__.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+At training and testing time, PaddlePaddle programs need to read data. To ease
+the users' work to write data reading code, we define that
+
+- A *reader* is a function that reads data (from file, network, random number
+  generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader decorator* is a function, which accepts one or more readers, and
+  returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network,
+  random number generator, etc) and yields a batch of data items.
+
+#####################
+Data Reader Interface
+#####################
+
+Indeed, *data reader* doesn't have to be a function that reads and yields data
+items. It can be any function with no parameter that creates a iterable
+(anything can be used in :code:`for x in iterable`)\:
+
+..  code-block:: python
+
+    iterable = data_reader()
+
+Element produced from the iterable should be a **single** entry of data,
+**not** a mini batch. That entry of data could be a single item, or a tuple of
+items.
+Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider
+/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d
+array of float32, int, list of int)
+
+An example implementation for single item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image(width, height):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+
+An example implementation for multiple item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image_and_label(width, height, label):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height), label
+    return reader
+
+
+TODO(yuyang18): Should we add whole design doc here?
+"""
+
+import decorator
+from decorator import *
+
+import creator
+
+__all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/reader/creator.py b/python/paddle/reader/creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c905d959fad4e8c1a8826ce8dc60c5fa834514d
--- /dev/null
+++ b/python/paddle/reader/creator.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Creator package contains some simple reader creator, which could
+be used in user program.
+"""
+
+__all__ = ['np_array', 'text_file', 'recordio']
+
+
+def np_array(x):
+    """
+    Creates a reader that yields elements of x, if it is a
+    numpy vector. Or rows of x, if it is a numpy matrix.
+    Or any sub-hyperplane indexed by the highest dimension.
+
+    :param x: the numpy array to create reader from.
+    :returns: data reader created from x.
+    """
+
+    def reader():
+        if x.ndim < 1:
+            yield x
+
+        for e in x:
+            yield e
+
+    return reader
+
+
+def text_file(path):
+    """
+    Creates a data reader that outputs text line by line from given text file.
+    Trailing new line ('\\\\n') of each line will be removed.
+
+    :path: path of the text file.
+    :returns: data reader of text file
+    """
+
+    def reader():
+        f = open(path, "r")
+        for l in f:
+            yield l.rstrip('\n')
+        f.close()
+
+    return reader
+
+
+def recordio(paths, buf_size=100):
+    """
+    Creates a data reader from given RecordIO file paths separated by ",",
+        glob pattern is supported.
+    :path: path of recordio files, can be a string or a string list.
+    :returns: data reader of recordio files.
+    """
+
+    import recordio as rec
+    import paddle.reader.decorator as dec
+    import cPickle as pickle
+
+    def reader():
+        if isinstance(paths, basestring):
+            path = paths
+        else:
+            path = ",".join(paths)
+        f = rec.reader(path)
+        while True:
+            r = f.read()
+            if r is None:
+                break
+            yield pickle.loads(r)
+        f.close()
+
+    return dec.buffered(reader, buf_size)
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f83cabb8481451736944823be45185deea4f43b
--- /dev/null
+++ b/python/paddle/reader/decorator.py
@@ -0,0 +1,405 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
+]
+
+from threading import Thread
+import subprocess
+
+from Queue import Queue
+import itertools
+import random
+import zlib
+
+
+def map_readers(func, *readers):
+    """
+    Creates a data reader that outputs return value of function using
+    output of each data readers as arguments.
+
+    :param func: function to use. The type of func should be (Sample) => Sample
+    :type: callable
+    :param readers: readers whose outputs will be used as arguments of func.
+    :return: the created data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        for e in itertools.imap(func, *rs):
+            yield e
+
+    return reader
+
+
+def shuffle(reader, buf_size):
+    """
+    Creates a data reader whose data output is shuffled.
+
+    Output from the iterator that created by original reader will be
+    buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
+    is determined by argument buf_size.
+
+    :param reader: the original reader whose output will be shuffled.
+    :type reader: callable
+    :param buf_size: shuffle buffer size.
+    :type buf_size: int
+
+    :return: the new reader whose output is shuffled.
+    :rtype: callable
+    """
+
+    def data_reader():
+        buf = []
+        for e in reader():
+            buf.append(e)
+            if len(buf) >= buf_size:
+                random.shuffle(buf)
+                for b in buf:
+                    yield b
+                buf = []
+
+        if len(buf) > 0:
+            random.shuffle(buf)
+            for b in buf:
+                yield b
+
+    return data_reader
+
+
+def chain(*readers):
+    """
+    Creates a data reader whose output is the outputs of input data
+    readers chained together.
+
+    If input readers output following data entries:
+    [0, 0, 0]
+    [1, 1, 1]
+    [2, 2, 2]
+    The chained reader will output:
+    [0, 0, 0, 1, 1, 1, 2, 2, 2]
+
+    :param readers: input readers.
+    :return: the new data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+
+        for e in itertools.chain(*rs):
+            yield e
+
+    return reader
+
+
+class ComposeNotAligned(ValueError):
+    pass
+
+
+def compose(*readers, **kwargs):
+    """
+    Creates a data reader whose output is the combination of input readers.
+
+    If input readers output following data entries:
+    (1, 2)    3    (4, 5)
+    The composed reader will output:
+    (1, 2, 3, 4, 5)
+
+    :param readers: readers that will be composed together.
+    :param check_alignment: if True, will check if input readers are aligned
+        correctly. If False, will not check alignment and trailing outputs
+        will be discarded. Defaults to True.
+    :type check_alignment: bool
+
+    :return: the new data reader.
+
+    :raises ComposeNotAligned: outputs of readers are not aligned.
+        Will not raise when check_alignment is set to False.
+    """
+    check_alignment = kwargs.pop('check_alignment', True)
+
+    def make_tuple(x):
+        if isinstance(x, tuple):
+            return x
+        else:
+            return (x, )
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        if not check_alignment:
+            for outputs in itertools.izip(*rs):
+                yield sum(map(make_tuple, outputs), ())
+        else:
+            for outputs in itertools.izip_longest(*rs):
+                for o in outputs:
+                    if o is None:
+                        # None will be not be present if compose is aligned
+                        raise ComposeNotAligned(
+                            "outputs of readers are not aligned.")
+                yield sum(map(make_tuple, outputs), ())
+
+    return reader
+
+
+def buffered(reader, size):
+    """
+    Creates a buffered data reader.
+
+    The buffered data reader will read and save data entries into a
+    buffer. Reading from the buffered data reader will proceed as long
+    as the buffer is not empty.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param size: max buffer size.
+    :type size: int
+
+    :returns: the buffered data reader.
+    """
+
+    class EndSignal():
+        pass
+
+    end = EndSignal()
+
+    def read_worker(r, q):
+        for d in r:
+            q.put(d)
+        q.put(end)
+
+    def data_reader():
+        r = reader()
+        q = Queue(maxsize=size)
+        t = Thread(
+            target=read_worker, args=(
+                r,
+                q, ))
+        t.daemon = True
+        t.start()
+        e = q.get()
+        while e != end:
+            yield e
+            e = q.get()
+
+    return data_reader
+
+
+def firstn(reader, n):
+    """
+    Limit the max number of samples that reader could return.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param n: the max number of samples that return.
+    :type n: int
+    :return: the decorated reader.
+    :rtype: callable
+    """
+
+    # TODO(yuyang18): Check if just drop the reader, could clean the opened
+    # resource or not?
+
+    def firstn_reader():
+        for i, item in enumerate(reader()):
+            if i == n:
+                break
+            yield item
+
+    return firstn_reader
+
+
+class XmapEndSignal():
+    pass
+
+
+def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
+    """
+    Use multiprocess to map samples from reader by a mapper defined by user.
+    And this function contains a buffered decorator.
+    :param mapper:  a function to map sample.
+    :type mapper: callable
+    :param reader: the data reader to read from
+    :type reader: callable
+    :param process_num: process number to handle original sample
+    :type process_num: int
+    :param buffer_size: max buffer size
+    :type buffer_size: int
+    :param order: keep the order of reader
+    :type order: bool
+    :return: the decarated reader
+    :rtype: callable
+    """
+    end = XmapEndSignal()
+
+    # define a worker to read samples from reader to in_queue
+    def read_worker(reader, in_queue):
+        for i in reader():
+            in_queue.put(i)
+        in_queue.put(end)
+
+    # define a worker to read samples from reader to in_queue with order flag
+    def order_read_worker(reader, in_queue):
+        in_order = 0
+        for i in reader():
+            in_queue.put((in_order, i))
+            in_order += 1
+        in_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue
+    def handle_worker(in_queue, out_queue, mapper):
+        sample = in_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            r = mapper(sample)
+            out_queue.put(r)
+            sample = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue by order
+    def order_handle_worker(in_queue, out_queue, mapper, out_order):
+        ins = in_queue.get()
+        while not isinstance(ins, XmapEndSignal):
+            order, sample = ins
+            r = mapper(sample)
+            while order != out_order[0]:
+                pass
+            out_queue.put(r)
+            out_order[0] += 1
+            ins = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
+
+    def xreader():
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+        # start several handle_workers
+        target = order_handle_worker if order else handle_worker
+        args = (in_queue, out_queue, mapper, out_order) if order else (
+            in_queue, out_queue, mapper)
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(target=target, args=args)
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
+        sample = out_queue.get()
+        while not isinstance(sample, XmapEndSignal):
+            yield sample
+            sample = out_queue.get()
+        finish = 1
+        while finish < process_num:
+            sample = out_queue.get()
+            if isinstance(sample, XmapEndSignal):
+                finish += 1
+            else:
+                yield sample
+
+    return xreader
+
+
+def _buf2lines(buf, line_break="\n"):
+    # FIXME: line_break should be automatically configured.
+    lines = buf.split(line_break)
+    return lines[:-1], lines[-1]
+
+
+class PipeReader:
+    """
+        PipeReader read data by stream from a command, take it's
+        stdout into a pipe buffer and redirect it to the parser to
+        parse, then yield data as your desired format.
+
+        You can using standard linux command or call another program
+        to read data, from HDFS, Ceph, URL, AWS S3 etc:
+
+        .. code-block:: python
+           cmd = "hadoop fs -cat /path/to/some/file"
+           cmd = "cat sample_file.tar.gz"
+           cmd = "curl http://someurl"
+           cmd = "python print_s3_bucket.py"
+
+        An example:
+
+        .. code-block:: python
+
+           def example_reader():
+               for f in myfiles:
+                   pr = PipeReader("cat %s"%f)
+                   for l in pr.get_line():
+                       sample = l.split(" ")
+                       yield sample
+    """
+
+    def __init__(self, command, bufsize=8192, file_type="plain"):
+        if not isinstance(command, str):
+            raise TypeError("left_cmd must be a string")
+        if file_type == "gzip":
+            self.dec = zlib.decompressobj(
+                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+        self.file_type = file_type
+        self.bufsize = bufsize
+        self.process = subprocess.Popen(
+            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+
+    def get_line(self, cut_lines=True, line_break="\n"):
+        """
+        :param cut_lines: cut buffer to lines
+        :type cut_lines: bool
+        :param line_break: line break of the file, like \n or \r
+        :type line_break: string
+
+        :return: one line or a buffer of bytes
+        :rtype: string
+        """
+        remained = ""
+        while True:
+            buff = self.process.stdout.read(self.bufsize)
+            if buff:
+                if self.file_type == "gzip":
+                    decomp_buff = self.dec.decompress(buff)
+                elif self.file_type == "plain":
+                    decomp_buff = buff
+                else:
+                    raise TypeError("file_type %s is not allowed" %
+                                    self.file_type)
+
+                if cut_lines:
+                    lines, remained = _buf2lines(''.join(
+                        [remained, decomp_buff]), line_break)
+                    for line in lines:
+                        yield line
+                else:
+                    yield decomp_buff
+            else:
+                break
diff --git a/python/paddle/reader/tests/CMakeLists.txt b/python/paddle/reader/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..107d5912e1567e0c8721987a281272c7feb51e63
--- /dev/null
+++ b/python/paddle/reader/tests/CMakeLists.txt
@@ -0,0 +1,2 @@
+py_test(creator_test SRCS creator_test.py)
+py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/reader/tests/__init__.py b/python/paddle/reader/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eca2dce114b069bf9b455d77ce670d73b5047fd2
--- /dev/null
+++ b/python/paddle/reader/tests/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/reader/tests/creator_test.py b/python/paddle/reader/tests/creator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4238c12a74759d52eb09f31ce1126cc93dd3489
--- /dev/null
+++ b/python/paddle/reader/tests/creator_test.py
@@ -0,0 +1,74 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Copyright PaddlePaddle contributors. All Rights Reservedd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+import numpy as np
+import paddle.reader.creator
+
+
+class TestNumpyArray(unittest.TestCase):
+    def test_numpy_array(self):
+        l = [[1, 2, 3], [4, 5, 6]]
+        x = np.array(l, np.int32)
+        reader = paddle.reader.creator.np_array(x)
+        for idx, e in enumerate(reader()):
+            self.assertItemsEqual(e, l[idx])
+
+
+class TestTextFile(unittest.TestCase):
+    def test_text_file(self):
+        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
+        reader = paddle.reader.creator.text_file(path)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
+
+
+class TestRecordIO(unittest.TestCase):
+    def do_test(self, path):
+        reader = paddle.reader.creator.recordio(path)
+        idx = 0
+        for e in reader():
+            if idx == 0:
+                self.assertEqual(e, (1, 2, 3))
+            elif idx == 1:
+                self.assertEqual(e, (4, 5, 6))
+            idx += 1
+        self.assertEqual(idx, 2)
+
+    def test_recordIO(self):
+        self.do_test(
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat"))
+        self.do_test([
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat")
+        ])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bee24d3b6579db5e99ec66931df201fdf9e1af07
--- /dev/null
+++ b/python/paddle/reader/tests/decorator_test.py
@@ -0,0 +1,178 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+import unittest
+
+import paddle.reader
+
+
+def reader_creator_10(dur):
+    def reader():
+        for i in range(10):
+            # this invocation helps testing paddle.reader.buffer
+            time.sleep(dur)
+            yield i
+
+    return reader
+
+
+class TestMap(unittest.TestCase):
+    def test_map(self):
+        d = {"h": 0, "i": 1}
+
+        def tokenize(x):
+            return d[x]
+
+        def read():
+            yield "h"
+            yield "i"
+
+        r = paddle.reader.map_readers(tokenize, read)
+        for i, e in enumerate(r()):
+            self.assertEqual(e, i)
+
+
+class TestBuffered(unittest.TestCase):
+    def test_read(self):
+        for size in range(20):
+            b = paddle.reader.buffered(reader_creator_10(0), size)
+            c = 0
+            for i in b():
+                self.assertEqual(i, c)
+                c += 1
+            self.assertEqual(c, 10)
+
+    def test_buffering(self):
+        # read have 30ms delay.
+        b = paddle.reader.buffered(reader_creator_10(0.03), 10)
+        last_time = time.time()
+        for idx, i in enumerate(b()):
+            elapsed_time = time.time() - last_time
+            if i == 0:
+                time.sleep(0.3)
+            else:
+                # read time should be short, meaning already buffered.
+                self.assertLess(elapsed_time, 0.05)
+            last_time = time.time()
+
+
+class TestCompose(unittest.TestCase):
+    def test_compse(self):
+        reader = paddle.reader.compose(
+            reader_creator_10(0), reader_creator_10(0))
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, (idx, idx))
+
+    def test_compose_not_aligned(self):
+        total = 0
+        reader = paddle.reader.compose(
+            paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0))
+        with self.assertRaises(paddle.reader.ComposeNotAligned):
+            for e in reader():
+                total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+    def test_compose_not_aligned_no_check(self):
+        total = 0
+        reader = paddle.reader.compose(
+            paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0),
+            check_alignment=False)
+        for e in reader():
+            total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+
+class TestChain(unittest.TestCase):
+    def test_chain(self):
+        c = paddle.reader.chain(reader_creator_10(0), reader_creator_10(0))
+        idx = 0
+        for e in c():
+            self.assertEqual(e, idx % 10)
+            idx += 1
+        self.assertEqual(idx, 20)
+
+
+class TestShuffle(unittest.TestCase):
+    def test_shuffle(self):
+        case = [(0, True), (1, True), (10, False), (100, False)]
+        a = reader_creator_10(0)
+        for size, checkEq in case:
+            s = paddle.reader.shuffle(a, size)
+            total = 0
+            for idx, e in enumerate(s()):
+                if checkEq:
+                    self.assertEqual(idx, e)
+                total += 1
+            self.assertEqual(total, 10)
+
+
+class TestXmap(unittest.TestCase):
+    def test_xmap(self):
+        def mapper(x):
+            return (x + 1)
+
+        orders = (True, False)
+        thread_nums = (1, 2, 4, 8, 16)
+        buffered_size = (1, 2, 4, 8, 16)
+        for order in orders:
+            for tNum in thread_nums:
+                for size in buffered_size:
+                    reader = paddle.reader.xmap_readers(mapper,
+                                                        reader_creator_10(0),
+                                                        tNum, size, order)
+                    for n in xrange(3):
+                        result = []
+                        for i in reader():
+                            result.append(i)
+                        if not order:
+                            result.sort()
+                        for idx, e in enumerate(result):
+                            self.assertEqual(e, mapper(idx))
+
+
+class TestPipeReader(unittest.TestCase):
+    def test_pipe_reader(self):
+        def example_reader(myfiles):
+            for f in myfiles:
+                pr = paddle.reader.PipeReader("cat %s" % f, bufsize=128)
+                for l in pr.get_line():
+                    yield l
+
+        import tempfile
+
+        records = [str(i) for i in xrange(5)]
+        temp = tempfile.NamedTemporaryFile()
+        try:
+            with open(temp.name, 'w') as f:
+                for r in records:
+                    f.write('%s\n' % r)
+
+            result = []
+            for r in example_reader([temp.name]):
+                result.append(r)
+
+            for idx, e in enumerate(records):
+                self.assertEqual(e, result[idx])
+        finally:
+            # delete the temporary file
+            temp.close()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/reader/tests/test_data_creator.txt b/python/paddle/reader/tests/test_data_creator.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a2a8d47d43868d369083808497697da79e620e31
--- /dev/null
+++ b/python/paddle/reader/tests/test_data_creator.txt
@@ -0,0 +1,3 @@
+0 1
+2 3
+4 5
diff --git a/python/paddle/reader/tests/test_reader_recordio.dat b/python/paddle/reader/tests/test_reader_recordio.dat
new file mode 100644
index 0000000000000000000000000000000000000000..a99a35bb829e066c4845d0b85b96cd1eb3a12491
Binary files /dev/null and b/python/paddle/reader/tests/test_reader_recordio.dat differ
diff --git a/python/paddle/reader/tests/test_recordio_creator.dat b/python/paddle/reader/tests/test_recordio_creator.dat
new file mode 100644
index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69
Binary files /dev/null and b/python/paddle/reader/tests/test_recordio_creator.dat differ
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 186b91c226accbe1c2d5465d6244b9438eec9979..5b90facd49d655f56c037e087d86e41372cbfdb9 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -67,7 +67,7 @@ extension_module_name=[MODULE_NAME], then config_parser will call
 MODULE_NAME.get_config_funcs(g_config)
 MODULE_NAME.get_config_funcs() should return a dictionary of name to functions,
 those functions will be available in the config file.
-See trainer/tests/config_parser_test.py for example
+See legacy/trainer/tests/config_parser_test.py for example
 
 To use this from paddle_trainer, paddle_trainer should be called with
 --config_args=extension_module_name=[MODULE_NAME]
@@ -471,6 +471,7 @@ class Input(Cfg):
             maxout=None,
             spp=None,
             pad=None,
+            upsample=None,
             format=None,
             nnz=None,
             is_static=None,
@@ -983,6 +984,13 @@ class Pad(Cfg):
         self.add_keys(locals())
 
 
+@config_class
+class Upsample(Cfg):
+    def __init__(self, scale, scale_y, pad_out_x, pad_out_y, upsample_size,
+                 upsample_size_y):
+        self.add_keys(locals())
+
+
 @config_class
 class Norm(Cfg):
     def __init__(self,
@@ -2380,6 +2388,46 @@ class SpatialPyramidPoolLayer(LayerBase):
             self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
 
 
+@config_layer('upsample')
+class UpsampleLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        super(UpsampleLayer, self).__init__(
+            name, 'upsample', 0, inputs=inputs, **xargs)
+
+        input_layer = self.get_input_layer(0)
+        image_conf = self.config.inputs[0].upsample_conf.image_conf
+        image_conf.img_size = input_layer.width
+        image_conf.img_size_y = input_layer.height
+        image_conf.channels = input_layer.size / (input_layer.width *
+                                                  input_layer.height)
+
+        upsample = self.inputs[0].upsample
+        output_x = 0
+        output_y = 0
+        output_size = 0
+
+        if upsample.scale:
+            self.config.inputs[0].upsample_conf.scale = upsample.scale
+            self.config.inputs[0].upsample_conf.scale_y = upsample.scale_y
+            output_x = input_layer.width * upsample.scale
+            output_y = input_layer.height * upsample.scale_y
+        self.config.inputs[0].upsample_conf.pad_out_x = upsample.pad_out_x
+        self.config.inputs[0].upsample_conf.pad_out_y = upsample.pad_out_y
+        if upsample.upsample_size:
+            self.config.inputs[
+                0].upsample_conf.upsample_size = upsample.upsample_size
+            self.config.inputs[
+                0].upsample_conf.upsample_size_y = upsample.upsample_size_y
+            output_x = upsample.upsample_size
+            output_y = upsample.upsample_size_y
+
+        output_size = image_conf.channels * output_x * output_y
+
+        self.set_layer_height_width(output_y, output_x)
+        self.set_layer_depth(input_layer.depth)
+        self.set_layer_size(output_size)
+
+
 @config_layer('pad')
 class PadLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
index 00efc01c0592107314f5b23c951706d039d49a88..3683968262266a2d654d2480b828173bc761152b 100644
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation):
 
     .. math::
 
-       P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_j} }
+       P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} }
     """
 
     def __init__(self):
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index e6f87ce61b1d16d4f98f111626776aa52c2ec35b..4e3beaf639bad9fed2862a5477095b66ef4b9aee 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -240,14 +240,15 @@ class ExtraLayerAttribute(object):
     :type error_clipping_threshold: float
     :param drop_rate: Dropout rate. Dropout will create a mask on layer output.
                       The dropout rate is the zero rate of this mask. The
-                      details of what dropout is please refer to `here
-                      <https://www.cs.toronto.edu/~hinton/absps/
-                      JMLRdropout.pdf>`_.
+                      details of what dropout is please refer to `JMLRdropout
+                      <https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf
+                      >`_.
     :type drop_rate: float
     :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
-                   The details allocation in parallel_nn please refer to `here
-                   <http://www.paddlepaddle.org/doc/ui/cmd_argument/
-                   use_case.html#case-2-specify-layers-in-different-devices>`_.
+                   The details allocation in parallel_nn please refer to `use_case
+                   <https://github.com/PaddlePaddle/Paddle/blob/develop/doc/v2
+                   /howto/cmd_parameter/use_case_en.md#case-2-specify-layers-in
+                   -different-devices>`_.
     :type device: int
     """
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index eac2cb316835fda0a52ac9895eaa80914d0f1e5b..d9787ef42a31b8dfd1836e7a01d5664049cc66b5 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -148,6 +148,7 @@ __all__ = [
     'resize_layer',
     'sub_seq_layer',
     'scale_sub_region_layer',
+    'upsample_layer',
     'factorization_machine',
 ]
 
@@ -166,6 +167,7 @@ class LayerType(object):
     SEQUENCE_RESHAPE = 'seqreshape'
     POOLING_MAX = 'max'
     POOLING_AVG = 'average'
+    UPSAMPLE_LAYER = 'upsample'
     FC_LAYER = 'fc'
     COST = 'cost'
     COSINE_SIM_VEC = 'cos_vm'
@@ -2554,7 +2556,7 @@ def img_conv_layer(input,
     the output will be obtained by concatenating the two results.
 
     The details of grouped convolution, please refer to:
-    `ImageNet Classification with Deep Convolutional Neural Networks
+    `ImageNet Classification With Deep Convolutional Neural Networks
     <http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
     
     The example usage is:
@@ -2747,17 +2749,17 @@ def img_pool_layer(input,
 
     ..  math::
 
-        w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride}
+        w & = 1 + ceil(\\frac{input\_width + 2 * padding - pool\_size}{stride})
 
-        h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
+        h & = 1 + ceil(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
 
     - ceil_mode=False:
 
     ..  math::
 
-        w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride}
+        w & = 1 + floor(\\frac{input\_width + 2 * padding - pool\_size}{stride})
 
-        h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y}
+        h & = 1 + floor(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y})
 
     The example usage is:
 
@@ -3014,6 +3016,83 @@ def img_pool3d_layer(input,
         size=l.config.size)
 
 
+@wrap_name_default("upsample")
+@layer_support()
+def upsample_layer(input,
+                   name=None,
+                   scale=None,
+                   scale_y=None,
+                   upsample_size=None,
+                   upsample_size_y=None,
+                   pad_out_x=False,
+                   pad_out_y=False,
+                   layer_attr=None):
+    """
+    The DePooling process.
+    Inputs should be a list of length 2. The first input is a layer,
+    and the second input should be the MaxWithMaskPoolingLayer
+
+    The example usage is:
+
+    ..  code-block:: python
+        pool1 = paddle.v2.layer.img_pool(input=input, pool_size=2, stride=2,
+                                        pool_type=paddle.pooling.MaxWithMask())
+        upsample = paddle.v2.layer.upsample(input=[layer1, pool1])
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: contains an input layer and a MaxWithMaskPoolingLayer
+    :type input: list | tuple | collections.Sequence
+    :param scale: outputSize =  scale * inputSize
+    :type scale: int | list | tuple | .
+    :param scale_y: scale_y will be equal to scale, if it's value is None, 
+    :type scale: int | None. 
+    :param upsample_size: specify the outputSize.
+    :type upsample_size: int | list | tuple.
+    :param upsample_size_y: specify the y dimension outputSize.
+    :type upsample_size_y: int.
+    :param pad_out_x: specify exact x dimension size. This parameter only works when scale is 2
+    :type pad_out_x: bool.
+    :param pad_out_y: specify exact y dimension size. This parameter only works when scale is 2
+    :type pad_out_y: bool.
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert (scale is not None) or (upsample_size is not None), \
+            'scale or upsample_size, there must be one to be designated'
+
+    assert len(input) == 2, 'layer input size must be 2'
+
+    assert input[1].layer_type == LayerType.POOL_LAYER, \
+            'the second input should be the MaxPoolWithMaskLayer'
+
+    scale_y = scale \
+            if scale is not None else scale_y
+    upsample_size_y = upsample_size  \
+            if upsample_size is not None else upsample_size_y
+
+    layer_type = LayerType.UPSAMPLE_LAYER
+
+    layer = Layer(
+        name=name,
+        type=layer_type,
+        inputs=[
+            Input(
+                input[0].name,
+                upsample=Upsample(scale, scale_y, pad_out_x, pad_out_y,
+                                  upsample_size, upsample_size_y)),
+            Input(input[1].name)
+        ],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+
+    sz = layer.config.size
+
+    return LayerOutput(name, layer_type=layer_type, parents=input, size=sz)
+
+
 @wrap_name_default("spp")
 @layer_support()
 def spp_layer(input,
@@ -4103,9 +4182,9 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
 
     You can see following configs for further usages:
 
-    - time steps: lstmemory_group, paddle/gserver/tests/sequence_layer_group.conf, \
+    - time steps: lstmemory_group, paddle/legacy/gserver/tests/sequence_layer_group.conf, \
                   demo/seqToseq/seqToseq_net.py
-    - sequence steps: paddle/gserver/tests/sequence_nest_layer_group.conf
+    - sequence steps: paddle/legacy/gserver/tests/sequence_nest_layer_group.conf
 
     :param step: A step function which takes the input of recurrent_group as its own
                  input and returns values as recurrent_group's output every time step.
@@ -5599,8 +5678,8 @@ def warp_ctc_layer(input,
     <https://github.com/baidu-research/warp-ctc>`_ library, which is used in
     `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
     <https://arxiv.org/pdf/1512.02595v1.pdf>`_, to compute Connectionist Temporal
-    Classification (CTC) loss. Besides, another `warp-ctc
-    <https://github.com/gangliao/warp-ctc>`_ repository, which is forked from
+    Classification (CTC) loss. Besides, another `warp-ctc repository
+    <https://github.com/gangliao/warp-ctc>`_ , which is forked from
     the official one, is maintained to enable more compiling options. During the
     building process, PaddlePaddle will clone the source codes, build and
     install it to :code:`third_party/install/warpctc` directory.
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 580aef935b5cec385a88fb0b4f5b9a5ddeddb40c..30e0b9906c406d846d4b086a1a1c89587394afea 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -1,17 +1,17 @@
 #################### test_config_parser #########################
 add_test(NAME layers_test
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
         ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
 
 add_test(NAME test_reset_hook
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
         ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
 
 add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
 add_test(NAME test_layerHelpers
-  COMMAND
-  ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_BINARY_DIR}/python/
+  ${PADDLE_BINARY_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
   ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
 )
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index 8a318879630cd491573afcaf798dda2ca75e335d..44a75a60cc78e85f85d111a911999b7812db0f49 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -2,7 +2,6 @@
 
 set -e
 cd `dirname $0`
-export PYTHONPATH=$PWD/../../../../
 
 protostr=$PWD/protostr
 . file_list.sh
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
index c1acbecd9c313b02d6d33d2d04fd33fc1a8b026e..38056fe0a9496bcb5de76634bbab267e324dc2a4 100644
--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
@@ -36,7 +36,7 @@ __all__ = [
     'cifar',
     'movielens',
     'conll05',
-    'sentiment'
+    'sentiment',
     'uci_housing',
     'wmt14',
     'wmt16',
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index 0a2a1ced11ee5cb2fb407b229ce810d553c2fa46..662655c836dbc54bd6187dcd3dac7354d6c8ecd1 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -43,7 +43,7 @@ CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
 CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
 
 
-def reader_creator(filename, sub_name):
+def reader_creator(filename, sub_name, cycle=False):
     def read_batch(batch):
         data = batch['data']
         labels = batch.get('labels', batch.get('fine_labels', None))
@@ -56,10 +56,13 @@ def reader_creator(filename, sub_name):
             names = (each_item.name for each_item in f
                      if sub_name in each_item.name)
 
-            for name in names:
-                batch = cPickle.load(f.extractfile(name))
-                for item in read_batch(batch):
-                    yield item
+            while True:
+                for name in names:
+                    batch = cPickle.load(f.extractfile(name))
+                    for item in read_batch(batch):
+                        yield item
+                if not cycle:
+                    break
 
     return reader
 
@@ -94,34 +97,40 @@ def test100():
         'test')
 
 
-def train10():
+def train10(cycle=False):
     """
     CIFAR-10 training set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
 
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: Training reader creator
     :rtype: callable
     """
     return reader_creator(
         paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'data_batch')
+        'data_batch',
+        cycle=cycle)
 
 
-def test10():
+def test10(cycle=False):
     """
     CIFAR-10 test set creator.
 
     It returns a reader creator, each sample in the reader is image pixels in
     [0, 1] and label in [0, 9].
 
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: Test reader creator.
     :rtype: callable
     """
     return reader_creator(
         paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
-        'test_batch')
+        'test_batch',
+        cycle=cycle)
 
 
 def fetch():
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index 7bdddeaabec733ef26b3f766c6437f5c53d65044..db12076d54064781bd1060947497622b14783768 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -76,7 +76,8 @@ def reader_creator(data_file,
                    dataset_name,
                    mapper,
                    buffered_size=1024,
-                   use_xmap=True):
+                   use_xmap=True,
+                   cycle=False):
     '''
     1. read images from tar file and
         merge images into batch files in 102flowers.tgz_batch/
@@ -96,6 +97,8 @@ def reader_creator(data_file,
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
     :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: data reader
     :rtype: callable
     '''
@@ -108,23 +111,27 @@ def reader_creator(data_file,
     file_list = batch_images_from_tar(data_file, dataset_name, img2label)
 
     def reader():
-        for file in open(file_list):
-            file = file.strip()
-            batch = None
-            with open(file, 'r') as f:
-                batch = cPickle.load(f)
-            data = batch['data']
-            labels = batch['label']
-            for sample, label in itertools.izip(data, batch['label']):
-                yield sample, int(label) - 1
+        while True:
+            for file in open(file_list):
+                file = file.strip()
+                batch = None
+                with open(file, 'r') as f:
+                    batch = cPickle.load(f)
+                data = batch['data']
+                labels = batch['label']
+                for sample, label in itertools.izip(data, batch['label']):
+                    yield sample, int(label) - 1
+            if not cycle:
+                break
 
     if use_xmap:
-        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+        cpu_num = int(os.environ.get('CPU_NUM', cpu_count()))
+        return xmap_readers(mapper, reader, cpu_num, buffered_size)
     else:
         return map_readers(mapper, reader)
 
 
-def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False):
     '''
     Create flowers training set reader.
     It returns a reader, each sample in the reader is
@@ -137,17 +144,23 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
     :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: train data reader
     :rtype: callable
     '''
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
-        buffered_size, use_xmap)
+        download(SETID_URL, 'flowers', SETID_MD5),
+        TRAIN_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)
 
 
-def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False):
     '''
     Create flowers test set reader.
     It returns a reader, each sample in the reader is
@@ -160,14 +173,20 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
     :type buffered_size: int
+    :param cycle: whether to cycle through the dataset
+    :type cycle: bool
     :return: test data reader
     :rtype: callable
     '''
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
-        buffered_size, use_xmap)
+        download(SETID_URL, 'flowers', SETID_MD5),
+        TEST_FLAG,
+        mapper,
+        buffered_size,
+        use_xmap,
+        cycle=cycle)
 
 
 def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 37c4296f9bcea7e16daa46f778934331513c30c4..00c2a3b9928d1ca5f3e8cd5e87ba7ad4108e9dad 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -124,7 +124,7 @@ def test(word_idx):
         re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
 
 
-def word_dict():
+def word_dict(cutoff=150):
     """
     Build a word dictionary from the corpus.
 
@@ -132,7 +132,7 @@ def word_dict():
     :rtype: dict
     """
     return build_dict(
-        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), cutoff)
 
 
 def fetch():
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 9f675bed895223e054cd3bb6e504fe1607f19858..2b959c48e4bc62e08f6f57981b61b7c5fe3a1d06 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -112,7 +112,7 @@ def fetch():
     paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
     paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
     paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
-    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
 
 
 def convert(path):
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 52f5b947fdec55eea45b9d34eddd576c981fa97c..28ee042282a08be32c13d91312fd97b211277522 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -15,7 +15,7 @@
 import numpy
 import collections
 import topology
-import minibatch
+import paddle
 import cPickle
 
 __all__ = ['infer', 'Inference']
@@ -63,7 +63,7 @@ class Inference(object):
             assert isinstance(val, api.Vector)
             val.copyFromNumpyArray(parameters.get(name).flatten())
             # the setValueUpdated function is called in randomize, zeroMem,
-            # load function in paddle/parameter/Parameter.cpp. But in the
+            # load function in paddle/legacy/parameter/Parameter.cpp. But in the
             # inference mode, the setValueUpdated is never called, it will
             # cause the parameter will not be dispatched
             # in MultiGradientMachine for multi-GPU. So setValueUpdated is
@@ -80,7 +80,7 @@ class Inference(object):
             for each_sample in input:
                 yield each_sample
 
-        reader = minibatch.batch(__reader_impl__, batch_size=batch_size)
+        reader = paddle.batch(__reader_impl__, batch_size=batch_size)
 
         self.__gradient_machine__.start()
         for data_batch in reader():
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 6a2bb8d337b7667aa2b1e3ef0815bb80f6e38d6a..a188a03eb3698c972de92c9807f1bdb71a249330 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -20,7 +20,7 @@ The primary usage shows below.
 
 ..  code-block:: python
 
-    import paddle.v2 as paddle
+    import paddle
 
     img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784))
     hidden = paddle.layer.fc(input=img, size=200)
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
index 317cf037c69f8639e3760fbfce20565127794fcb..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644
--- a/python/paddle/v2/minibatch.py
+++ b/python/paddle/v2/minibatch.py
@@ -15,7 +15,7 @@
 __all__ = ['batch']
 
 
-def batch(reader, batch_size):
+def batch(reader, batch_size, drop_last=True):
     """
     Create a batched reader.
 
@@ -23,6 +23,8 @@ def batch(reader, batch_size):
     :type reader: callable
     :param batch_size: size of each mini-batch
     :type batch_size: int
+    :param drop_last: drop the last batch, if the size of last batch is not equal to batch_size.
+    :type drop_last: bool
     :return: the batched reader.
     :rtype: callable
     """
@@ -35,7 +37,7 @@ def batch(reader, batch_size):
             if len(b) == batch_size:
                 yield b
                 b = []
-        if b:
+        if drop_last == False and len(b) != 0:
             yield b
 
     return batch_reader
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
index 3b059735a924d58714cd88a761eb83143f1192d6..12efdc4a0fec83fed57bdcbf687aaec69d13ba91 100644
--- a/python/paddle/v2/reader/__init__.py
+++ b/python/paddle/v2/reader/__init__.py
@@ -50,7 +50,7 @@ An example implementation for single item data reader creator:
         def reader():
             while True:
                 yield numpy.random.uniform(-1, 1, size=width*height)
-    return reader
+        return reader
 
 An example implementation for multiple item data reader creator:
 
@@ -60,7 +60,7 @@ An example implementation for multiple item data reader creator:
         def reader():
             while True:
                 yield numpy.random.uniform(-1, 1, size=width*height), label
-    return reader
+        return reader
 
 
 TODO(yuyang18): Should we add whole design doc here?
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 421f6c933d7032e4103f504fc509e2d5c89149b2..fda5246d74f598200b439774a25e80ec3e504077 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
 be used in user program.
 """
 
-__all__ = ['np_array', 'text_file', "cloud_reader"]
+__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
 
 
 def np_array(x):
diff --git a/python/requirements.txt b/python/requirements.txt
index daf3f368b92408408897e33223118fe3647aa6de..ea827e9d5a0dcf8eb2ede1f6eaa88c777a138816 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -8,3 +8,4 @@ scipy>=0.19.0
 Pillow
 nltk>=3.2.2
 graphviz
+LinkChecker
diff --git a/python/setup.py.in b/python/setup.py.in
index f830039a3af581d593d510326f15139377cb25f1..51380149d0b09224c02050902897f23f53600de2 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -5,7 +5,7 @@ class BinaryDistribution(Distribution):
         return True
 
 MAJOR   = 0
-MINOR   = 11
+MINOR   = 14
 PATCH   = 0
 RC      = 0
 ISTAGED = False
@@ -58,24 +58,30 @@ def mkl():
             'istaged': ISTAGED,
             'with_mkl': '@WITH_MKL@'})
 
-write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
+write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version.py')
 
 
 packages=['paddle',
-          'paddle.proto',
-          'paddle.trainer',
-          'paddle.trainer_config_helpers',
           'paddle.utils',
-          'paddle.v2',
-          'paddle.v2.dataset',
-          'paddle.v2.reader',
-          'paddle.v2.master',
-          'paddle.v2.plot',
+          'paddle.dataset',
+          'paddle.reader',
           'paddle.fluid',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.layers',
-          'py_paddle']
+          'paddle.fluid.transpiler',
+          'paddle.fluid.transpiler.details']
+
+if '${WITH_FLUID_ONLY}'== 'OFF':
+    packages+=['paddle.proto',
+               'paddle.trainer',
+               'paddle.trainer_config_helpers',
+               'paddle.v2',
+               'paddle.v2.master',
+               'paddle.v2.plot',
+               'paddle.v2.reader',
+               'paddle.v2.dataset',
+               'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
     setup_requires = f.read().splitlines()
@@ -84,11 +90,30 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=['opencv-python']
 
 # the prefix is sys.prefix which should always be usr
-paddle_bin_dir = 'opt/paddle/bin'
-paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
-               '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
-               '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
-               '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+paddle_bins = ''
+if '${WITH_FLUID_ONLY}'== 'OFF':
+    paddle_bin_dir = 'opt/paddle/bin'
+    paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/legacy/trainer/paddle_trainer',
+                   '${PADDLE_BINARY_DIR}/paddle/legacy/trainer/paddle_merge_model',
+                   '${PADDLE_BINARY_DIR}/paddle/legacy/pserver/paddle_pserver_main',
+                   '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+
+package_data={'paddle.fluid': ['core.so']}
+if '${WITH_FLUID_ONLY}'== 'OFF':
+    package_data['paddle.v2.master']=['libpaddle_master.so']
+    package_data['py_paddle']=['*.py','_swig_paddle.so']
+
+package_dir={
+    '': '${PADDLE_BINARY_DIR}/python',
+    # The paddle.fluid.proto will be generated while compiling.
+    # So that package points to other directory.
+    'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
+    'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
+    'paddle.fluid': '${PADDLE_BINARY_DIR}/python/paddle/fluid',
+}
+if '${WITH_FLUID_ONLY}'== 'OFF':
+    package_dir['py_paddle']='${PADDLE_BINARY_DIR}/python/py_paddle'
+    
 
 paddle_rt_lib_dir = 'lib'
 paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
@@ -101,19 +126,8 @@ setup(name='${PACKAGE_NAME}',
       install_requires=setup_requires,
       packages=packages,
       ext_modules=[Extension('_foo', ['stub.cc'])],
-      package_data={
-        'paddle.v2.master': ['libpaddle_master.so'],
-        'paddle.fluid': ['core.so'],
-        'py_paddle':['*.py','_swig_paddle.so']
-      },
-      package_dir={
-          '': '${CMAKE_CURRENT_SOURCE_DIR}',
-          # The paddle.fluid.proto will be generated while compiling.
-          # So that package points to other directory.
-          'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform',
-          'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework',
-          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
-      },
+      package_data=package_data,
+      package_dir=package_dir,
       scripts=paddle_bins,
       data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )
diff --git a/tools/aws_benchmarking/README.md b/tools/aws_benchmarking/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4fdd4b0de44e779378091566d9d6056a6f9ee4b6
--- /dev/null
+++ b/tools/aws_benchmarking/README.md
@@ -0,0 +1,184 @@
+# AWS benchmark testing tool
+This is an automation tool for deploying paddlepaddle benchmark testing to AWS.
+
+## Features
+
+ - subnet creation to fit just the amount of ec2 instances required.
+ - pserver and trainer ec2 instances allocation, and instance state verification
+ - nvidia-docker ready for GPU training
+ - Instances and network element garbage collection when a task is accomplished or an error occurred
+ - Test log is collected in realtime
+ - Web service for checking log or tearing down the testing setup
+ - No testing code change needed
+ - Lots of optional configuration options
+
+ ## Usages
+
+ ### Prerequisites
+
+ - You have a working AWS account
+ - You have [AWS Command Line Interface](https://aws.amazon.com/cli/) installed
+ - Your AWS cli is bind with a account which has `AmazonEC2FullAccess` permission, and it's set as default credential.
+ - You have key pair created and pem file downloaded.
+ - You have a default VPC in the region you want to run the test.
+ - You have a Security Group created for the VPC mentioned above, which allows port 22 and the port you want to expose your control web service (5436 by default)
+ - If your test is supposed to run in a GPU machine, especially a multi card GPU machine (p2, p3 series), you might need to contact amazon to raise the limit which allows no more than 1 GPU instance at a time.
+
+ ### Start a benchmark test
+
+#### Create training image
+
+*What to expect in this step:*
+
+*You will have your training logic packed with paddle runtime in a docker image, and be able to be picked up by AWS instance for training.*
+
+Training python script and PaddlePaddle runtime are supposed to be packed into one docker image. Use PaddlePaddle production images as base image and create the training images with the docker file as follows:
+
+```Dockerfile
+FROM paddlepaddle/paddle:latest-gpu
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "my_training.py"]
+```
+
+***Please Note***
+Training nodes will run your `ENTRYPOINT` script with the following environment variables:
+
+ - `TASK_NAME`: unique name to identify this training process.
+ - `TRAINING_ROLE`: current node's role in this training process, either "PSERVER" or "TRAINER"
+ - `PSERVER_HOSTS`: comma separated value of pserver end points, I.E. "192.168.1.2:5436,192.168.1.3:5436"
+ - `PSERVERS`: same as above
+ - `TRAINERS`: trainer count
+ - `SERVER_ENDPOINT`: current server end point if the node role is a pserver
+ - `TRAINER_INDEX`: an integer to identify the index of current trainer if the node role is a trainer.
+ - `PADDLE_INIT_TRAINER_ID`: same as above
+
+ Now we have a working distributed training script which takes advantage of node environment variables and docker file to generate the training image. Run the following command:
+
+ ```bash
+ docker build -t myreponname/paddle_benchmark .
+ ```
+
+ Now you have the image built and tagged with `myreponame/paddle_benchmark`, let's push it to dockerhub so that it can be picked up by out AWS instance.
+
+ ```bash
+ docker push myreponame/paddle_benchmark
+ ```
+
+#### Create instances and start training
+
+*What to expect in this step*
+
+*you will be asked to provide some basic settings to config your training, and this tool will have your training started and monitored*
+
+Now let's start the training process:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws -v <full path to your pem file>:/root/<key pair name>.pem \
+putcn/paddle_aws_client \
+--action create \
+--key_name <your key pair name> \
+--security_group_id <your security group id> \
+--docker_image myreponame/paddle_benchmark \
+--pserver_count 2 \
+--trainer_count 2 \
+--trainer_command batch_size:20,local:no,device:CPU
+```
+
+Now just wait until you see this:
+```
+master server finished init process, visit http://XXX:XXX/status to check master log
+```
+That means you can turn off your laptop and your cluster is creating instances, starting training process, collecting logs and eventually shut all pservers and trainers down when training is finished.
+
+#### Post creation operations
+
+To access the master log:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws \
+putcn/paddle_aws_client \
+--action status \
+--master_server_public_ip <master ip> \
+--master_server_port <master port>
+```
+
+To tear down the training setup:
+
+```bash
+docker run -i -v $HOME/.aws:/root/.aws \
+putcn/paddle_aws_client \
+--action cleanup \
+--master_server_public_ip <master ip> \
+--master_server_port <master port>
+```
+
+To retrieve training logs
+TBD
+
+### Tech details
+
+*What to expect in this step*
+
+*You will understand what is happening behind the scene, and how to check the training log, how to tear down the training on the fly, etc.*
+
+Let's understand what is happening under the hood when you run above command in your laptop
+
+![alt](diagram.png)
+
+There are 4 roles in the figure above:
+ - client: your laptop
+ - master: who tasks to aws api server to create/tear down instances, and monitor training process
+ - AWS api server: the one who actually creates and manages instances
+ - pservers and trainers: training instances
+
+When you run the `docker run` command above, what it actually does is to ask aws api service to create a subnet (step 1) and a master instance (step 2), and pass all the parameters the client collected or generated (step 3). The master is kept as minimum hardware config to keep the running cost low.
+
+Then when the master is up and running, it will ask the aws api server to create the heavy lifting training instances who are expensive to run (step 4). And the master will start training process as soon as they are done initializing (step 5).
+
+Meanwhile, the master will expose a web service for client to check training log or even tear the training setup down by a web service call.
+
+if you are creating the training with client docker container, and also monitoring your aws dashboard, you will initially see a instance tagged with `ROLE=MASTER` and `TASK_NAME=<yourtask name>_master` starts, then you will see several instances tagged with `ROLE=PSERVER` and `ROLE=TRAINER` starts.
+When the training is finished, pservers and trainers will be terminated. All their logs are kept in master node's docker env.
+
+Master exposes 4 major services:
+
+ - GET `/status`: return master log
+ - GET `/logs`: return list of log file names
+ - GET `/log/<logfile name>`: return a particular log by log file name
+ - POST `/cleanup`: teardown the whole setup
+
+
+### Parameters
+
+ - key_name: required, aws key pair name
+ - security_group_id: required, the security group id associated with your VPC
+ - vpc_id: The VPC in which you wish to run test, if not provided, this tool will use your default VPC.
+ - subnet_id: The Subnet_id in which you wish to run test, if not provided, this tool will create a new sub net to run test.
+ - pserver_instance_type: your pserver instance type, c5.2xlarge by default, which is a memory optimized machine.
+ - trainer_instance_type: your trainer instance type, p2.8xlarge by default, which is a GPU machine with 8 cards.
+ - task_name: the name you want to identify your job, if not provided, this tool will generate one for you.
+ - pserver_image_id: ami id for system image. Please note, although the default one has nvidia-docker installed, pserver is always launched with `docker` instead of `nvidia-docker`, please DO NOT init your training program with GPU place.
+ - pserver_command: pserver start command, format example: python,vgg.py,batch_size:128,is_local:no, which will be translated as `python vgg.py --batch_size 128 --is_local no` when trying to start the training in pserver. "--device CPU" is passed as default.
+ - trainer_image_id: ami id for system image, default one has nvidia-docker ready.
+ - trainer_command: trainer start command. Format is the same as pserver's, "--device GPU" is passed as default.
+ - availability_zone: aws zone id to place ec2 instances, us-east-2a by default.
+ - trainer_count: Trainer count, 1 by default.
+ - pserver_count: Pserver count, 1 by default.
+ - action: create|cleanup|status, "create" by default.
+ - pserver_port: the port for pserver to open service, 5436 by default.
+ - docker_image: the training docker image id.
+ - master_service_port: the port for master to open service, 5436 by default.
+ - master_server_public_ip: the master service ip, this is required when action is not "create"
+ - master_docker_image: master's docker image id, "putcn/paddle_aws_master:latest" by default
+ - no_clean_up: no instance termination when training is finished or failed when this value is set "yes". This is for debug purpose, so that you can inspect into the instances when the process is finished. 
+ 
+
+### Trouble shooting
+
+ 1. How to check logs
+
+    Master log is served at `http://<masterip>:<masterport>/status`, and you can list all the log files from `http://<masterip>:<masterport>/logs`, and access either one of them by `http://<masterip>:<masterport>/log/<logfilename>`
diff --git a/tools/aws_benchmarking/client/Dockerfile b/tools/aws_benchmarking/client/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..812c5d4bce0adff404577ce6b5fd3f0f4a91118c
--- /dev/null
+++ b/tools/aws_benchmarking/client/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:2.7.14-stretch
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "cluster_launcher.py"]
\ No newline at end of file
diff --git a/tools/aws_benchmarking/client/cluster_launcher.py b/tools/aws_benchmarking/client/cluster_launcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..12333202b9f003ae5109c7e9b825035ba8eb7d99
--- /dev/null
+++ b/tools/aws_benchmarking/client/cluster_launcher.py
@@ -0,0 +1,415 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+import math
+import logging
+import copy
+
+import netaddr
+import boto3
+import namesgenerator
+import paramiko
+from scp import SCPClient
+import requests
+
+
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--key_name', type=str, default="", help="required, key pair name")
+parser.add_argument(
+    '--security_group_id',
+    type=str,
+    default="",
+    help="required, the security group id associated with your VPC")
+
+parser.add_argument(
+    '--vpc_id',
+    type=str,
+    default="",
+    help="The VPC in which you wish to run test")
+parser.add_argument(
+    '--subnet_id',
+    type=str,
+    default="",
+    help="The Subnet_id in which you wish to run test")
+
+parser.add_argument(
+    '--pserver_instance_type',
+    type=str,
+    default="c5.2xlarge",
+    help="your pserver instance type, c5.2xlarge by default")
+parser.add_argument(
+    '--trainer_instance_type',
+    type=str,
+    default="p2.8xlarge",
+    help="your trainer instance type, p2.8xlarge by default")
+
+parser.add_argument(
+    '--task_name',
+    type=str,
+    default="",
+    help="the name you want to identify your job")
+parser.add_argument(
+    '--pserver_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, \
+    use ami-1ae93962 for us-east-2")
+
+parser.add_argument(
+    '--pserver_command',
+    type=str,
+    default="",
+    help="pserver start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
+
+parser.add_argument(
+    '--trainer_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, \
+    use ami-1ae93962 for us-west-2")
+
+parser.add_argument(
+    '--trainer_command',
+    type=str,
+    default="",
+    help="trainer start command, format example: python,vgg.py,batch_size:128,is_local:yes"
+)
+
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--trainer_count', type=int, default=1, help="Trainer count")
+
+parser.add_argument(
+    '--pserver_count', type=int, default=1, help="Pserver count")
+
+parser.add_argument(
+    '--action', type=str, default="create", help="create|cleanup|status")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_public_ip', type=str, help="master server public ip")
+
+parser.add_argument(
+    '--master_docker_image',
+    type=str,
+    default="putcn/paddle_aws_master:latest",
+    help="master docker image id")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+args = parser.parse_args()
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
+
+ec2client = boto3.client('ec2')
+
+
+def print_arguments():
+    print('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        print('%s: %s' % (arg, value))
+    print('------------------------------------------------')
+
+
+def create_subnet():
+    # if no vpc id provided, list vpcs
+    logging.info("start creating subnet")
+    if not args.vpc_id:
+        logging.info("no vpc provided, trying to find the default one")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "isDefault",
+                "Values": ["true", ]
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No default VPC')
+        args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+
+        logging.info("default vpc fount with id %s and CidrBlock %s" %
+                     (args.vpc_id, vpc_cidrBlock))
+
+    if not vpc_cidrBlock:
+        logging.info("trying to find cidrblock for vpc")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "vpc-id",
+                "Values": [args.vpc_id, ],
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No VPC found')
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+        logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
+
+    # list subnets in vpc in order to create a new one
+
+    logging.info("trying to find ip blocks for new subnet")
+    subnets_desc = ec2client.describe_subnets(
+        Filters=[{
+            "Name": "vpc-id",
+            "Values": [args.vpc_id, ],
+        }], )
+
+    ips_taken = []
+    for subnet_dec in subnets_desc["Subnets"]:
+        ips_taken.append(subnet_dec["CidrBlock"])
+
+    ip_blocks_avaliable = netaddr.IPSet(
+        [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
+    # adding 10 addresses as buffer
+    cidr_prefix = 32 - math.ceil(
+        math.log(args.pserver_count + args.trainer_count + 10, 2))
+    if cidr_prefix <= 16:
+        raise ValueError('Too many nodes to fit in current VPC')
+
+    for ipnetwork in ip_blocks_avaliable.iter_cidrs():
+        try:
+            subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
+            logging.info("subnet ip block found %s" % (subnet_cidr))
+            break
+        except Exception:
+            pass
+
+    if not subnet_cidr:
+        raise ValueError(
+            'No avaliable subnet to fit required nodes in current VPC')
+
+    logging.info("trying to create subnet")
+    subnet_desc = ec2client.create_subnet(
+        CidrBlock=str(subnet_cidr),
+        VpcId=args.vpc_id,
+        AvailabilityZone=args.availability_zone)
+
+    subnet_id = subnet_desc["Subnet"]["SubnetId"]
+
+    subnet_waiter = ec2client.get_waiter('subnet_available')
+    # sleep for 1s before checking its state
+    time.sleep(1)
+    subnet_waiter.wait(SubnetIds=[subnet_id, ])
+
+    logging.info("subnet created")
+
+    logging.info("adding tags to newly created subnet")
+    ec2client.create_tags(
+        Resources=[subnet_id, ],
+        Tags=[{
+            "Key": "Task_name",
+            'Value': args.task_name
+        }])
+    return subnet_id
+
+
+def run_instances(image_id, instance_type, count=1, role="MASTER", cmd=""):
+    response = ec2client.run_instances(
+        ImageId=image_id,
+        InstanceType=instance_type,
+        MaxCount=count,
+        MinCount=count,
+        UserData=cmd,
+        DryRun=False,
+        InstanceInitiatedShutdownBehavior="stop",
+        KeyName=args.key_name,
+        Placement={'AvailabilityZone': args.availability_zone},
+        NetworkInterfaces=[{
+            'DeviceIndex': 0,
+            'SubnetId': args.subnet_id,
+            "AssociatePublicIpAddress": True,
+            'Groups': args.security_group_ids
+        }],
+        TagSpecifications=[{
+            'ResourceType': "instance",
+            'Tags': [{
+                "Key": 'Task_name',
+                "Value": args.task_name + "_master"
+            }, {
+                "Key": 'Role',
+                "Value": role
+            }]
+        }])
+
+    instance_ids = []
+    for instance in response["Instances"]:
+        instance_ids.append(instance["InstanceId"])
+
+    if len(instance_ids) > 0:
+        logging.info(str(len(instance_ids)) + " instance(s) created")
+    else:
+        logging.info("no instance created")
+    #create waiter to make sure it's running
+
+    logging.info("waiting for instance to become accessible")
+    waiter = ec2client.get_waiter('instance_status_ok')
+    waiter.wait(
+        Filters=[{
+            "Name": "instance-status.status",
+            "Values": ["ok"]
+        }, {
+            "Name": "instance-status.reachability",
+            "Values": ["passed"]
+        }, {
+            "Name": "instance-state-name",
+            "Values": ["running"]
+        }],
+        InstanceIds=instance_ids)
+
+    instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
+
+    return instances_response["Reservations"][0]["Instances"]
+
+
+def generate_task_name():
+    return namesgenerator.get_random_name()
+
+
+def init_args():
+
+    if not args.task_name:
+        args.task_name = generate_task_name()
+        logging.info("task name generated %s" % (args.task_name))
+
+    if not args.pem_path:
+        args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
+    if args.security_group_id:
+        args.security_group_ids = (args.security_group_id, )
+
+
+def create():
+
+    init_args()
+
+    # create subnet
+    if not args.subnet_id:
+        args.subnet_id = create_subnet()
+
+    # create master node
+
+    master_instance_response = run_instances(
+        image_id="ami-7a05351f", instance_type="t2.nano")
+
+    logging.info("master server started")
+
+    args.master_server_public_ip = master_instance_response[0][
+        "PublicIpAddress"]
+    args.master_server_ip = master_instance_response[0]["PrivateIpAddress"]
+
+    logging.info("master server started, master_ip=%s, task_name=%s" %
+                 (args.master_server_public_ip, args.task_name))
+
+    # cp config file and pems to master node
+
+    ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+    ssh_client = paramiko.SSHClient()
+    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    ssh_client.connect(
+        hostname=args.master_server_public_ip, username="ubuntu", pkey=ssh_key)
+
+    with SCPClient(ssh_client.get_transport()) as scp:
+        scp.put(os.path.expanduser("~") + "/" + ".aws",
+                recursive=True,
+                remote_path='/home/ubuntu/')
+        scp.put(args.pem_path,
+                remote_path='/home/ubuntu/' + args.key_name + ".pem")
+
+    logging.info("credentials and pem copied to master")
+
+    # set arguments and start docker
+    kick_off_cmd = "docker run -d -v /home/ubuntu/.aws:/root/.aws/"
+    kick_off_cmd += " -v /home/ubuntu/" + args.key_name + ".pem:/root/" + args.key_name + ".pem"
+    kick_off_cmd += " -v /home/ubuntu/logs/:/root/logs/"
+    kick_off_cmd += " -p " + str(args.master_server_port) + ":" + str(
+        args.master_server_port)
+    kick_off_cmd += " " + args.master_docker_image
+
+    args_to_pass = copy.copy(args)
+    args_to_pass.action = "serve"
+    del args_to_pass.pem_path
+    del args_to_pass.security_group_ids
+    del args_to_pass.master_docker_image
+    del args_to_pass.master_server_public_ip
+    for arg, value in sorted(vars(args_to_pass).iteritems()):
+        if value:
+            kick_off_cmd += ' --%s %s' % (arg, value)
+
+    logging.info(kick_off_cmd)
+    stdin, stdout, stderr = ssh_client.exec_command(command=kick_off_cmd)
+    return_code = stdout.channel.recv_exit_status()
+    logging.info(return_code)
+    if return_code != 0:
+        raise Exception("Error while kicking off master")
+
+    logging.info(
+        "master server finished init process, visit %s to check master log" %
+        (get_master_web_url("/status")))
+
+
+def cleanup():
+    print requests.post(get_master_web_url("/cleanup")).text
+
+
+def status():
+    print requests.post(get_master_web_url("/status")).text
+
+
+def get_master_web_url(path):
+    return "http://" + args.master_server_public_ip + ":" + str(
+        args.master_server_port) + path
+
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        if not args.key_name or not args.security_group_id:
+            raise ValueError("key_name and security_group_id are required")
+        create()
+    elif args.action == "cleanup":
+        if not args.master_server_public_ip:
+            raise ValueError("master_server_public_ip is required")
+        cleanup()
+    elif args.action == "status":
+        if not args.master_server_public_ip:
+            raise ValueError("master_server_public_ip is required")
+        status()
diff --git a/tools/aws_benchmarking/client/requirements.txt b/tools/aws_benchmarking/client/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9454801f2025671cfd1a2c3b71cf4c2ac07cb8fb
--- /dev/null
+++ b/tools/aws_benchmarking/client/requirements.txt
@@ -0,0 +1,6 @@
+netaddr==0.7.19
+boto3==1.6.21
+namesgenerator==0.3
+paramiko==2.4.1
+scp
+requests
diff --git a/tools/aws_benchmarking/diagram.png b/tools/aws_benchmarking/diagram.png
new file mode 100644
index 0000000000000000000000000000000000000000..b97909c5fe78b59d0e636ff73c2ed3e63a0be722
Binary files /dev/null and b/tools/aws_benchmarking/diagram.png differ
diff --git a/tools/aws_benchmarking/server/Dockerfile b/tools/aws_benchmarking/server/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..333523abcdb6fbe7dc01bbaf7d32ce1d8e866028
--- /dev/null
+++ b/tools/aws_benchmarking/server/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:2.7.14-stretch
+
+ENV HOME /root
+COPY ./ /root/
+WORKDIR /root
+RUN pip install -r /root/requirements.txt
+ENTRYPOINT ["python", "cluster_master.py"]
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/cluster_master.py b/tools/aws_benchmarking/server/cluster_master.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b24846544d8aca5e4c7bd5709e70564c088431
--- /dev/null
+++ b/tools/aws_benchmarking/server/cluster_master.py
@@ -0,0 +1,735 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import json
+import math
+import time
+import threading
+import logging
+import copy
+import csv
+
+import netaddr
+import boto3
+import namesgenerator
+import paramiko
+
+from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
+
+
+# You must have aws_access_key_id, aws_secret_access_key, region set in
+# ~/.aws/credentials and ~/.aws/config
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--key_name', type=str, default="", help="required, key pair name")
+parser.add_argument(
+    '--security_group_id',
+    type=str,
+    default="",
+    help="required, the security group id associated with your VPC")
+
+parser.add_argument(
+    '--vpc_id',
+    type=str,
+    default="",
+    help="The VPC in which you wish to run test")
+parser.add_argument(
+    '--subnet_id',
+    type=str,
+    default="",
+    help="The Subnet_id in which you wish to run test")
+
+parser.add_argument(
+    '--pserver_instance_type',
+    type=str,
+    default="c5.2xlarge",
+    help="your pserver instance type, c5.2xlarge by default")
+parser.add_argument(
+    '--trainer_instance_type',
+    type=str,
+    default="p2.8xlarge",
+    help="your trainer instance type, p2.8xlarge by default")
+
+parser.add_argument(
+    '--task_name',
+    type=str,
+    default="",
+    help="the name you want to identify your job")
+parser.add_argument(
+    '--pserver_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-east-2"
+)
+parser.add_argument(
+    '--trainer_image_id',
+    type=str,
+    default="ami-da2c1cbf",
+    help="ami id for system image, default one has nvidia-docker ready, use ami-1ae93962 for us-west-2"
+)
+
+parser.add_argument(
+    '--availability_zone',
+    type=str,
+    default="us-east-2a",
+    help="aws zone id to place ec2 instances")
+
+parser.add_argument(
+    '--trainer_count', type=int, default=1, help="Trainer count")
+
+parser.add_argument(
+    '--pserver_count', type=int, default=1, help="Pserver count")
+
+parser.add_argument(
+    '--pserver_bash_file',
+    type=str,
+    default=os.path.join(os.path.dirname(__file__), "pserver.sh.template"),
+    help="pserver bash file path")
+
+parser.add_argument(
+    '--pserver_command', type=str, default="", help="pserver start command")
+
+parser.add_argument(
+    '--trainer_bash_file',
+    type=str,
+    default=os.path.join(os.path.dirname(__file__), "trainer.sh.template"),
+    help="trainer bash file path")
+
+parser.add_argument(
+    '--trainer_command', type=str, default="", help="trainer start command")
+
+parser.add_argument(
+    '--action', type=str, default="serve", help="create|cleanup|serve")
+
+parser.add_argument('--pem_path', type=str, help="private key file")
+
+parser.add_argument(
+    '--pserver_port', type=str, default="5436", help="pserver port")
+
+parser.add_argument(
+    '--docker_image', type=str, default="busybox", help="training docker image")
+
+parser.add_argument(
+    '--master_server_port', type=int, default=5436, help="master server port")
+
+parser.add_argument(
+    '--master_server_ip', type=str, default="", help="master server private ip")
+
+parser.add_argument(
+    '--metric_data_identifier',
+    type=str,
+    default="**metrics_data: ",
+    help="key string to identify metrics data")
+
+parser.add_argument(
+    '--no_clean_up',
+    type=str2bool,
+    default=False,
+    help="whether to clean up after training")
+
+args = parser.parse_args()
+
+ec2client = boto3.client('ec2')
+
+args.log_path = os.path.join(os.path.dirname(__file__), "logs/")
+
+logging.basicConfig(
+    filename=args.log_path + 'master.log',
+    level=logging.INFO,
+    format='%(asctime)s %(message)s')
+
+log_files = ["master.log"]
+
+metrics = {}
+
+metrics_csv_file_name = "metrics.csv"
+is_metrics_file_created = False
+
+
+def create_subnet():
+    # if no vpc id provided, list vpcs
+    logging.info("start creating subnet")
+    if not args.vpc_id:
+        logging.info("no vpc provided, trying to find the default one")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "isDefault",
+                "Values": ["true", ]
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No default VPC')
+        args.vpc_id = vpcs_desc["Vpcs"][0]["VpcId"]
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+
+        logging.info("default vpc fount with id %s and CidrBlock %s" %
+                     (args.vpc_id, vpc_cidrBlock))
+
+    if not vpc_cidrBlock:
+        logging.info("trying to find cidrblock for vpc")
+        vpcs_desc = ec2client.describe_vpcs(
+            Filters=[{
+                "Name": "vpc-id",
+                "Values": [args.vpc_id, ],
+            }], )
+        if len(vpcs_desc["Vpcs"]) == 0:
+            raise ValueError('No VPC found')
+        vpc_cidrBlock = vpcs_desc["Vpcs"][0]["CidrBlock"]
+        logging.info("cidrblock for vpc is %s" % vpc_cidrBlock)
+
+    # list subnets in vpc in order to create a new one
+
+    logging.info("trying to find ip blocks for new subnet")
+    subnets_desc = ec2client.describe_subnets(
+        Filters=[{
+            "Name": "vpc-id",
+            "Values": [args.vpc_id, ],
+        }], )
+
+    ips_taken = []
+    for subnet_dec in subnets_desc["Subnets"]:
+        ips_taken.append(subnet_dec["CidrBlock"])
+
+    ip_blocks_avaliable = netaddr.IPSet(
+        [vpc_cidrBlock]) ^ netaddr.IPSet(ips_taken)
+    # adding 10 addresses as buffer
+    cidr_prefix = 32 - math.ceil(
+        math.log(args.pserver_count + args.trainer_count + 10, 2))
+    if cidr_prefix <= 16:
+        raise ValueError('Too many nodes to fit in current VPC')
+
+    for ipnetwork in ip_blocks_avaliable.iter_cidrs():
+        try:
+            subnet_cidr = ipnetwork.subnet(int(cidr_prefix)).next()
+            logging.info("subnet ip block found %s" % (subnet_cidr))
+            break
+        except Exception:
+            pass
+
+    if not subnet_cidr:
+        raise ValueError(
+            'No avaliable subnet to fit required nodes in current VPC')
+
+    logging.info("trying to create subnet")
+    subnet_desc = ec2client.create_subnet(
+        CidrBlock=str(subnet_cidr),
+        VpcId=args.vpc_id,
+        AvailabilityZone=args.availability_zone)
+
+    subnet_id = subnet_desc["Subnet"]["SubnetId"]
+
+    subnet_waiter = ec2client.get_waiter('subnet_available')
+    # sleep for 1s before checking its state
+    time.sleep(1)
+    subnet_waiter.wait(SubnetIds=[subnet_id, ])
+
+    logging.info("subnet created")
+
+    logging.info("adding tags to newly created subnet")
+    ec2client.create_tags(
+        Resources=[subnet_id, ],
+        Tags=[{
+            "Key": "Task_name",
+            'Value': args.task_name
+        }])
+    return subnet_id
+
+
+def generate_task_name():
+    return namesgenerator.get_random_name()
+
+
+def script_to_str(file_path):
+    if not file_path:
+        return "echo $PSERVER_HOSTS"
+    file = open(file_path, 'r')
+    text = file.read().strip()
+    file.close()
+    return text
+
+
+def run_instances(image_id, instance_type, count, role, cmd=""):
+    if count == 0:
+        return []
+    response = ec2client.run_instances(
+        ImageId=image_id,
+        InstanceType=instance_type,
+        MaxCount=count,
+        MinCount=count,
+        UserData=cmd,
+        DryRun=False,
+        InstanceInitiatedShutdownBehavior="stop",
+        KeyName=args.key_name,
+        Placement={'AvailabilityZone': args.availability_zone},
+        NetworkInterfaces=[{
+            'DeviceIndex': 0,
+            'SubnetId': args.subnet_id,
+            "AssociatePublicIpAddress": True,
+            'Groups': args.security_group_ids
+        }],
+        TagSpecifications=[{
+            'ResourceType': "instance",
+            'Tags': [{
+                "Key": 'Task_name',
+                "Value": args.task_name
+            }, {
+                "Key": 'Role',
+                "Value": role
+            }]
+        }])
+
+    instance_ids = []
+    for instance in response["Instances"]:
+        instance_ids.append(instance["InstanceId"])
+
+    if len(instance_ids) > 0:
+        logging.info(str(len(instance_ids)) + " instance(s) created")
+    else:
+        logging.info("no instance created")
+    #create waiter to make sure it's running
+
+    logging.info("waiting for instance to become accessible")
+    waiter = ec2client.get_waiter('instance_status_ok')
+    waiter.wait(
+        Filters=[{
+            "Name": "instance-status.status",
+            "Values": ["ok"]
+        }, {
+            "Name": "instance-status.reachability",
+            "Values": ["passed"]
+        }, {
+            "Name": "instance-state-name",
+            "Values": ["running"]
+        }],
+        InstanceIds=instance_ids)
+
+    instances_response = ec2client.describe_instances(InstanceIds=instance_ids)
+
+    return instances_response["Reservations"][0]["Instances"]
+
+
+def create_pservers():
+    try:
+        return run_instances(
+            image_id=args.pserver_image_id,
+            instance_type=args.pserver_instance_type,
+            count=args.pserver_count,
+            role="PSERVER", )
+    except Exception:
+        logging.exception("error while trying to create pservers")
+        cleanup(args.task_name)
+
+
+def save_metrics_data(str_msg):
+    #parse msg
+    logging.info("found metrics data, saving it to csv file")
+    global is_metrics_file_created
+    metrics_raw = str_msg.split(",")
+    with open(args.log_path + metrics_csv_file_name, 'a') as csvfile:
+        csv_fieldnames = []
+        csv_write_data = {}
+        for metric in metrics_raw:
+            metric_data = metric.split("=")
+            metric_key = metric_data[0].strip()
+            metric_val = float(metric_data[1].strip())
+            if not metric_key in metrics:
+                metrics[metric_key] = []
+            metric_repo = metrics[metric_key]
+            metric_repo.append(metric_val)
+            csv_fieldnames.append(metric_key)
+            csv_write_data[metric_key] = metric_val
+        writer = csv.DictWriter(csvfile, fieldnames=csv_fieldnames)
+        if not is_metrics_file_created:
+            writer.writeheader()
+            is_metrics_file_created = True
+        writer.writerow(csv_write_data)
+        logging.info("csv file appended")
+
+
+def log_to_file(source, filename):
+    if not filename in log_files:
+        log_files.append(filename)
+    with open(args.log_path + filename, "a") as log_file:
+        for line in iter(source.readline, ""):
+            log_file.write(line)
+            if (line.startswith(args.metric_data_identifier)):
+                #found key data, trying to add to csv
+                line = line.replace(args.metric_data_identifier, "")
+                save_metrics_data(line)
+
+
+def parse_command(command_raw, defaults={}):
+    if not command_raw:
+        command_raw = ""
+    commands_processed = []
+    parameter_map = copy.copy(defaults)
+    for seg in command_raw.split(","):
+        if ":" in seg:
+            parameters = seg.split(":")
+            parameter_map[parameters[0]] = parameters[1]
+        else:
+            commands_processed.append(seg)
+    for key, val in parameter_map.iteritems():
+        commands_processed.append("--" + key + " " + str(val))
+    return " ".join(commands_processed)
+
+
+def create_trainers(kickoff_cmd, pserver_endpoints_str):
+    def create_and_start_trainer(trainer_index):
+        logging.info("trainer " + str(trainer_index) + " is starting")
+
+        instance_response = run_instances(
+            image_id=args.trainer_image_id,
+            instance_type=args.trainer_instance_type,
+            count=1,
+            role="TRAINER", )[0]
+        trainer_ip = instance_response["PrivateIpAddress"]
+
+        logging.info("trainer " + str(trainer_index) + " started")
+
+        ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+        ssh_client = paramiko.SSHClient()
+        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        ssh_client.connect(hostname=trainer_ip, username="ubuntu", pkey=ssh_key)
+
+        logging.info("trainer " + str(trainer_index) +
+                     " terminal connected via ssh")
+
+        cmd = kickoff_cmd.format(
+            PSERVER_HOSTS=pserver_endpoints_str,
+            DOCKER_IMAGE=args.docker_image,
+            TRAINER_INDEX=str(trainer_index),
+            TASK_NAME=args.task_name,
+            TRAINER_COUNT=args.trainer_count,
+            COMMAND=parse_command(args.trainer_command, {"device": "GPU"}),
+            MASTER_ENDPOINT=args.master_server_ip + ":" +
+            str(args.master_server_port))
+        logging.info(cmd)
+
+        stdin, stdout, stderr = ssh_client.exec_command(command=cmd)
+
+        # read and save output log
+
+        logging.info("trainer " + str(trainer_index) +
+                     " command executed, keep fetching log")
+
+        stdout_thread = threading.Thread(
+            target=log_to_file,
+            args=(
+                stdout,
+                "trainer_" + str(trainer_index) + ".log", ))
+        stderr_thread = threading.Thread(
+            target=log_to_file,
+            args=(
+                stderr,
+                "trainer_" + str(trainer_index) + "_err.log", ))
+        stdout_thread.start()
+        stderr_thread.start()
+
+        stdout_thread.join()
+        stderr_thread.join()
+
+        return_code = stdout.channel.recv_exit_status()
+        if return_code != 0:
+            trainer_create_results[trainer_index] = {'has_error': True}
+            raise ValueError("trainer didn't finish with exit code 0")
+
+        ssh_client.close()
+
+    # multi thread starting trainer instance and run kickoff command
+
+    trainer_threads = []
+    trainer_create_results = {}
+    try:
+        for i in xrange(args.trainer_count):
+            logging.info("starting tread for trainer " + str(i))
+            trainer_thread = threading.Thread(
+                target=create_and_start_trainer, args=(i, ))
+            trainer_thread.start()
+            trainer_threads.append(trainer_thread)
+
+        for trainer_thread in trainer_threads:
+            trainer_thread.join()
+
+        for result in trainer_create_results:
+            if result["has_error"]:
+                logging.error(
+                    "error during trainer starting or training, destorying the while cluster "
+                )
+                cleanup(args.task_name)
+                break
+
+        logging.info("all trainers stopped")
+    except Exception, e:
+        logging.info(
+            "Training exception, clean up resources, please check log for more info"
+        )
+    finally:
+        cleanup(args.task_name)
+
+
+def cleanup(task_name):
+    if args.no_clean_up:
+        logging.info("no clean up option set, going to leave the setup running")
+        return
+    #shutdown all ec2 instances
+    print("going to clean up " + task_name + " instances")
+    instances_response = ec2client.describe_instances(Filters=[{
+        "Name": "tag:Task_name",
+        "Values": [task_name]
+    }])
+
+    instance_ids = []
+    if len(instances_response["Reservations"]) > 0:
+        for reservation in instances_response["Reservations"]:
+            for instance in reservation["Instances"]:
+                instance_ids.append(instance["InstanceId"])
+
+        ec2client.terminate_instances(InstanceIds=instance_ids)
+
+        instance_termination_waiter = ec2client.get_waiter(
+            'instance_terminated')
+        instance_termination_waiter.wait(InstanceIds=instance_ids)
+
+    #delete the subnet created
+
+    subnet = ec2client.describe_subnets(Filters=[{
+        "Name": "tag:Task_name",
+        "Values": [task_name]
+    }])
+
+    if len(subnet["Subnets"]) > 0:
+        ec2client.delete_subnet(SubnetId=subnet["Subnets"][0]["SubnetId"])
+    # no subnet delete waiter, just leave it.
+    logging.info("Clearnup done")
+    return
+
+
+def kickoff_pserver(host, pserver_endpoints_str):
+    try:
+        ssh_key = paramiko.RSAKey.from_private_key_file(args.pem_path)
+        ssh_client = paramiko.SSHClient()
+        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        ssh_client.connect(hostname=host, username="ubuntu", pkey=ssh_key)
+        cmd = (script_to_str(args.pserver_bash_file)).format(
+            PSERVER_HOSTS=pserver_endpoints_str,
+            DOCKER_IMAGE=args.docker_image,
+            PSERVER_PORT=args.pserver_port,
+            TASK_NAME=args.task_name,
+            COMMAND=parse_command(args.pserver_command, {"device": "CPU"}),
+            TRAINER_COUNT=args.trainer_count,
+            TRAINER_INDEX=0,
+            # there is no way to use 0.0.0.0:port to start pserver
+            # has to docker --network="host" with host ip to make this work
+            SERVER_ENDPOINT=host + ":" + str(args.pserver_port),
+            MASTER_ENDPOINT=args.master_server_ip + ":" +
+            str(args.master_server_port))
+        logging.info(cmd)
+        stdin, stdout, stderr = ssh_client.exec_command(command=cmd)
+
+        stdout_thread = threading.Thread(
+            target=log_to_file, args=(
+                stdout,
+                "pserver_" + host + ".log", ))
+        stderr_thread = threading.Thread(
+            target=log_to_file, args=(
+                stderr,
+                "pserver_" + host + "_err.log", ))
+        stdout_thread.start()
+        stderr_thread.start()
+
+        stdout_thread.join()
+        stderr_thread.join()
+
+        return_code = stdout.channel.recv_exit_status()
+        logging.info(return_code)
+        if return_code != 0:
+            raise Exception("Error while kicking off pserver training process")
+    except Exception:
+        logging.exception("Error while kicking off pserver training process")
+        cleanup(args.task_name)
+    finally:
+        ssh_client.close()
+
+
+def init_args():
+
+    if not args.task_name:
+        args.task_name = generate_task_name()
+        logging.info("task name generated %s" % (args.task_name))
+
+    if not args.pem_path:
+        args.pem_path = os.path.expanduser("~") + "/" + args.key_name + ".pem"
+    if args.security_group_id:
+        args.security_group_ids = (args.security_group_id, )
+
+    args.trainers_job_done_count = 0
+
+
+def create_cluster():
+
+    if not args.subnet_id:
+        logging.info("creating subnet for this task")
+        args.subnet_id = create_subnet()
+        logging.info("subnet %s created" % (args.subnet_id))
+
+    logging.info("creating pservers")
+    pserver_create_response = create_pservers()
+    logging.info("pserver created, collecting pserver ips")
+
+    pserver_endpoints = []
+    for pserver in pserver_create_response:
+        pserver_endpoints.append(pserver["NetworkInterfaces"][0][
+            "PrivateIpAddress"] + ":" + args.pserver_port)
+
+    pserver_endpoints_str = ",".join(pserver_endpoints)
+
+    logging.info("kicking off pserver training process")
+    pserver_threads = []
+    for pserver in pserver_create_response:
+        pserver_thread = threading.Thread(
+            target=kickoff_pserver,
+            args=(pserver["PrivateIpAddress"], pserver_endpoints_str))
+        pserver_thread.start()
+        pserver_threads.append(pserver_thread)
+
+    logging.info("all pserver training process started")
+
+    logging.info("creating trainers and kicking off trainer training process")
+    create_trainers(
+        kickoff_cmd=script_to_str(args.trainer_bash_file),
+        pserver_endpoints_str=pserver_endpoints_str)
+
+    for pserver_thread in pserver_threads:
+        pserver_thread.join()
+
+    logging.info("all process ended")
+
+
+def start_server(args):
+    class S(BaseHTTPRequestHandler):
+        def _set_headers(self):
+            self.send_response(200)
+            self.send_header('Content-type', 'text/text')
+            self.end_headers()
+
+        def do_HEAD(self):
+            self._set_headers()
+
+        def do_404(self):
+            self.send_response(404)
+            self.send_header('Content-type', 'text/text')
+            self.end_headers()
+            logging.info("Received invalid GET request" + self.path)
+            self.wfile.write("NO ACTION FOUND")
+
+        def do_GET(self):
+
+            request_path = self.path
+            if request_path == "/status" or request_path == "/master_logs":
+                self._set_headers()
+                logging.info("Received request to return status")
+                with open(args.log_path + "master.log", "r") as logfile:
+                    self.wfile.write(logfile.read().strip())
+            elif request_path == "/list_logs" or request_path == "/logs":
+                self._set_headers()
+                self.wfile.write("\n".join(log_files))
+            elif "/log/" in request_path:
+                self._set_headers()
+                log_file_path = request_path.replace("/log/", "")
+                logging.info("requesting log file path is" + args.log_path +
+                             log_file_path)
+                with open(args.log_path + log_file_path, "r") as logfile:
+                    self.wfile.write(logfile.read().strip())
+            else:
+                self.do_404()
+
+        def do_POST(self):
+
+            request_path = self.path
+
+            if request_path == "/save_data":
+                self._set_headers()
+                logging.info("Received request to save data")
+                self.wfile.write("DATA SAVED!")
+                content_length = int(self.headers['Content-Length'])
+                post_data = self.rfile.read(content_length)
+                if args.task_name:
+                    with open(args.task_name + ".txt", "a") as text_file:
+                        text_file.write(post_data + "\n")
+
+            elif request_path == "/cleanup":
+                self._set_headers()
+                logging.info("Received request to cleanup cluster")
+                args.no_clean_up = False
+                cleanup(args.task_name)
+                self.wfile.write("cleanup in progress")
+
+            else:
+                self.do_404()
+
+    server_address = ('', args.master_server_port)
+    httpd = HTTPServer(server_address, S)
+    logging.info("HTTP server is starting")
+    httpd.serve_forever()
+
+
+def print_arguments():
+    logging.info('-----------  Configuration Arguments -----------')
+    for arg, value in sorted(vars(args).iteritems()):
+        logging.info('%s: %s' % (arg, value))
+    logging.info('------------------------------------------------')
+
+
+if __name__ == "__main__":
+    print_arguments()
+    if args.action == "create":
+        logging.info("going to create cluster")
+        if not args.key_name or not args.security_group_id:
+            raise ValueError("key_name and security_group_id are required")
+        init_args()
+        create_cluster()
+    elif args.action == "cleanup":
+        logging.info("going to cleanup cluster")
+        if not args.task_name:
+            raise ValueError("task_name is required")
+        cleanup(args.task_name)
+    elif args.action == "serve":
+        # serve mode
+        if not args.master_server_ip:
+            raise ValueError(
+                "No master server ip set, please run with --action create")
+
+        logging.info("going to start serve and create cluster")
+
+        init_args()
+
+        logging.info("starting server in another thread")
+        server_thread = threading.Thread(target=start_server, args=(args, ))
+        server_thread.start()
+
+        create_cluster()
+        server_thread.join()
+    elif args.action == "test":
+        start_server(args)
diff --git a/tools/aws_benchmarking/server/logs/master.log b/tools/aws_benchmarking/server/logs/master.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tools/aws_benchmarking/server/pserver.sh.template b/tools/aws_benchmarking/server/pserver.sh.template
new file mode 100644
index 0000000000000000000000000000000000000000..8d7f9e84c768b096537c92a448a117d91903f25b
--- /dev/null
+++ b/tools/aws_benchmarking/server/pserver.sh.template
@@ -0,0 +1,2 @@
+#!/bin/bash
+docker run --network="host" -i -e "SERVER_ENDPOINT={SERVER_ENDPOINT}" -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_INDEX={TRAINER_INDEX}" -e "TRAINING_ROLE=PSERVER" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "PSERVER_HOSTS={PSERVER_HOSTS}" -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
diff --git a/tools/aws_benchmarking/server/requirements.txt b/tools/aws_benchmarking/server/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5c523854f28b0a6f024fba2b2f344b53ba967a2f
--- /dev/null
+++ b/tools/aws_benchmarking/server/requirements.txt
@@ -0,0 +1,4 @@
+netaddr==0.7.19
+boto3==1.6.21
+namesgenerator==0.3
+paramiko==2.4.1
diff --git a/tools/aws_benchmarking/server/trainer.sh.template b/tools/aws_benchmarking/server/trainer.sh.template
new file mode 100644
index 0000000000000000000000000000000000000000..9b0aae9f7a7a879f164b380f719065302e0eb7e2
--- /dev/null
+++ b/tools/aws_benchmarking/server/trainer.sh.template
@@ -0,0 +1,2 @@
+#!/bin/bash 
+nvidia-docker run --network="host" -i  -e "MASTER_ENDPOINT={MASTER_ENDPOINT}" -e "TASK_NAME={TASK_NAME}" -e "TRAINER_COUNT={TRAINER_COUNT}" -e "TRAINERS={TRAINER_COUNT}" -e "TRAINER_INDEX={TRAINER_INDEX}"  -e "PADDLE_INIT_TRAINER_ID={TRAINER_INDEX}" -e "TRAINING_ROLE=TRAINER"  -e "PSERVER_HOSTS={PSERVER_HOSTS}"  -e "PSERVERS={PSERVER_HOSTS}" {DOCKER_IMAGE} {COMMAND}
\ No newline at end of file
diff --git a/tools/check_ctest_hung.py b/tools/check_ctest_hung.py
new file mode 100644
index 0000000000000000000000000000000000000000..7de76c381b29a1ff8dcf2167f0e861dc261aa47b
--- /dev/null
+++ b/tools/check_ctest_hung.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import re
+
+
+def escape(input):
+    o = input.replace("\n", "")
+    o = o.replace("\r", "")
+    return o
+
+
+def main():
+    usage = """Usage:
+1. Download the Paddle_PR_CI_*.log from TeamCity
+2. run: python check_ctest_hung.py Paddle_PR_CI_*.log
+3. If there is hung ctest, the result likes:
+Diff:  set(['test_parallel_executor_crf'])
+    """
+    if len(sys.argv) < 2:
+        print(usage)
+        exit(0)
+
+    logfile = sys.argv[1]
+    started = set()
+    passed = set()
+    with open(logfile, "r") as fn:
+        for l in fn.readlines():
+            if l.find("Test ") != -1 and \
+                l.find("Passed") != -1:
+                m = re.search("Test\s+#[0-9]*\:\s([a-z0-9_]+)", escape(l))
+                passed.add(m.group(1))
+            if l.find("Start ") != -1:
+                start_parts = escape(l).split(" ")
+                m = re.search("Start\s+[0-9]+\:\s([a-z0-9_]+)", escape(l))
+                started.add(m.group(1))
+    print "Diff: ", started - passed
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/codestyle/.gitignore b/tools/codestyle/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..0d20b6487c61e7d1bde93acf4a14b7a89083a16d
--- /dev/null
+++ b/tools/codestyle/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/.clang_format.hook b/tools/codestyle/clang_format.hook
similarity index 100%
rename from .clang_format.hook
rename to tools/codestyle/clang_format.hook
diff --git a/tools/codestyle/copyright.hook b/tools/codestyle/copyright.hook
new file mode 100644
index 0000000000000000000000000000000000000000..86b16ebdc46047c7cb3d7731a71cbf9647a1f2fe
--- /dev/null
+++ b/tools/codestyle/copyright.hook
@@ -0,0 +1,121 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import io, re
+import sys, os
+import subprocess
+import platform
+
+COPYRIGHT = '''
+Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+LANG_COMMENT_MARK = None
+
+NEW_LINE_MARK = None
+
+COPYRIGHT_HEADER = None
+
+if platform.system() == "Windows":
+    NEW_LINE_MARK = "\r\n"
+else:
+    NEW_LINE_MARK = '\n'
+    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
+    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
+    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
+    date, err = process.communicate()
+    date = date.decode("utf-8").rstrip("\n")
+    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
+
+
+def generate_copyright(template, lang='C'):
+    if lang == 'Python':
+        LANG_COMMENT_MARK = '#'
+    else:
+        LANG_COMMENT_MARK = "//"
+
+    lines = template.split(NEW_LINE_MARK)
+    BLANK = " "
+    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
+    for lino, line in enumerate(lines):
+        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
+        if len(line)  == 0:
+            BLANK = ""
+        else:
+            BLANK = " "
+        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
+
+    return ans + "\n"
+
+
+def lang_type(filename):
+    if filename.endswith(".py"):
+        return "Python"
+    elif filename.endswith(".h"):
+        return "C"
+    elif filename.endswith(".c"):
+        return "C"
+    elif filename.endswith(".hpp"):
+        return "C"
+    elif filename.endswith(".cc"):
+        return "C"
+    elif filename.endswith(".cpp"):
+        return "C"
+    elif filename.endswith(".cu"):
+        return "C"
+    elif filename.endswith(".cuh"):
+        return "C"
+    elif filename.endswith(".go"):
+        return "C"
+    elif filename.endswith(".proto"):
+        return "C"
+    else:
+        print("Unsupported filetype %s", filename)
+        exit(0)
+
+
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(
+        description='Checker for copyright declaration.')
+    parser.add_argument('filenames', nargs='*', help='Filenames to check')
+    args = parser.parse_args(argv)
+
+    retv = 0
+    for filename in args.filenames:
+        fd = io.open(filename, encoding="utf-8")
+        first_line = fd.readline()
+        second_line = fd.readline()
+        if "COPYRIGHT (C)" in first_line.upper(): continue
+        if first_line.startswith("#!") or PYTHON_ENCODE.match(
+                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
+            continue
+        original_contents = io.open(filename, encoding="utf-8").read()
+        new_contents = generate_copyright(
+            COPYRIGHT, lang_type(filename)) + original_contents
+        print('Auto Insert Copyright Header {}'.format(filename))
+        retv = 1
+        with io.open(filename, 'w') as output_file:
+            output_file.write(new_contents)
+
+    return retv
+
+
+if __name__ == '__main__':
+    exit(main())
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
new file mode 100755
index 0000000000000000000000000000000000000000..2c65222c8aa7a019f0f8fea68fe02612f70bd41f
--- /dev/null
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+TOTAL_ERRORS=0
+
+# The trick to remove deleted files: https://stackoverflow.com/a/2413151
+for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
+    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then
+        continue;
+    else
+        cpplint --filter=-readability/fn_size $file;
+        TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+    fi
+done
+
+exit $TOTAL_ERRORS
+
diff --git a/tools/codestyle/docstring_checker.py b/tools/codestyle/docstring_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d4b24a0cf6b743b72dca58fd885f927560964bf
--- /dev/null
+++ b/tools/codestyle/docstring_checker.py
@@ -0,0 +1,349 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DocstringChecker is used to check python doc string's style."""
+
+import six
+import astroid
+
+from pylint.checkers import BaseChecker, utils
+from pylint.interfaces import IAstroidChecker
+
+from collections import defaultdict
+import re
+
+
+def register(linter):
+    """Register checkers."""
+    linter.register_checker(DocstringChecker(linter))
+
+
+class Docstring(object):
+    """Docstring class holds the parsed doc string elements.
+    """
+
+    def __init__(self):
+        self.d = defaultdict(list)  #name->[]
+        self.clear()
+
+    def clear(self):
+        self.d['Args'] = []
+        self.d['Examples'] = []
+        self.d['Returns'] = []
+        self.d['Raises'] = []
+        self.args = {}  #arg_name->arg_type
+
+    def get_level(self, string, indent='    '):
+        level = 0
+        unit_size = len(indent)
+        while string[:unit_size] == indent:
+            string = string[unit_size:]
+            level += 1
+
+        return level
+
+    def parse(self, doc):
+        """parse gets sections from doc
+        Such as Args, Returns, Raises, Examples s
+        Args:
+            doc (string): is the astroid node doc string.
+        Returns:
+            True if doc is parsed successfully.
+        """
+        self.clear()
+
+        lines = doc.splitlines()
+        state = ("others", -1)
+        for l in lines:
+            c = l.strip()
+            if len(c) <= 0:
+                continue
+
+            level = self.get_level(l)
+            if c.startswith("Args:"):
+                state = ("Args", level)
+            elif c.startswith("Returns:"):
+                state = ("Returns", level)
+            elif c.startswith("Raises:"):
+                state = ("Raises", level)
+            elif c.startswith("Examples:"):
+                state = ("Examples", level)
+            else:
+                if level > state[1]:
+                    self.d[state[0]].append(c)
+                    continue
+
+                state = ("others", -1)
+                self.d[state[0]].append(c)
+
+        self._arg_with_type()
+        return True
+
+    def get_returns(self):
+        return self.d['Returns']
+
+    def get_raises(self):
+        return self.d['Raises']
+
+    def get_examples(self):
+        return self.d['Examples']
+
+    def _arg_with_type(self):
+
+        for t in self.d['Args']:
+            m = re.search('([A-Za-z0-9_-]+)\s{0,4}(\(.+\))\s{0,4}:', t)
+            if m:
+                self.args[m.group(1)] = m.group(2)
+
+        return self.args
+
+
+class DocstringChecker(BaseChecker):
+    """DosstringChecker is pylint checker to
+    check docstring style.
+    """
+    __implements__ = (IAstroidChecker, )
+
+    POSITIONAL_MESSAGE_ID = 'str-used-on-positional-format-argument'
+    KEYWORD_MESSAGE_ID = 'str-used-on-keyword-format-argument'
+
+    name = 'doc-string-checker'
+    symbol = "doc-string"
+    priority = -1
+    msgs = {
+        'W9001': ('One line doc string on > 1 lines', symbol + "-one-line",
+                  'Used when a short doc string is on multiple lines'),
+        'W9002':
+        ('Doc string does not end with "." period', symbol + "-end-with",
+         'Used when a doc string does not end with a period'),
+        'W9003':
+        ('All args with their types must be mentioned in doc string %s',
+         symbol + "-with-all-args",
+         'Used when not all arguments are in the doc string '),
+        'W9005': ('Missing docstring or docstring is too short',
+                  symbol + "-missing", 'Add docstring longer >=10'),
+        'W9006': ('Docstring indent error, use 4 space for indent',
+                  symbol + "-indent-error", 'Use 4 space for indent'),
+        'W9007': ('You should add `Returns` in comments',
+                  symbol + "-with-returns",
+                  'There should be a `Returns` section in comments'),
+        'W9008': ('You should add `Raises` section in comments',
+                  symbol + "-with-raises",
+                  'There should be a `Raises` section in comments'),
+    }
+    options = ()
+
+    def visit_functiondef(self, node):
+        """visit_functiondef checks Function node docstring style.
+        Args:
+            node (astroid.node): The visiting node.
+        Returns:
+            True if successful other wise False.
+        """
+
+        self.check_doc_string(node)
+
+        if node.tolineno - node.fromlineno <= 10:
+            return True
+
+        if not node.doc:
+            return True
+
+        doc = Docstring()
+        doc.parse(node.doc)
+
+        self.all_args_in_doc(node, doc)
+        self.with_returns(node, doc)
+        self.with_raises(node, doc)
+
+    def visit_module(self, node):
+        self.check_doc_string(node)
+
+    def visit_classdef(self, node):
+        self.check_doc_string(node)
+
+    def check_doc_string(self, node):
+        self.missing_doc_string(node)
+        self.one_line(node)
+        self.has_period(node)
+        self.indent_style(node)
+
+    def missing_doc_string(self, node):
+        if node.name.startswith("__") or node.name.startswith("_"):
+            return True
+        if node.tolineno - node.fromlineno <= 10:
+            return True
+
+        if node.doc is None or len(node.doc) < 10:
+            self.add_message('W9005', node=node, line=node.fromlineno)
+        return False
+
+    # FIXME(gongwb): give the docstring line-no
+    def indent_style(self, node, indent=4):
+        """indent_style checks docstring's indent style
+        Args:
+            node (astroid.node): The visiting node.
+            indent (int): The default indent of style
+        Returns:
+            True if successful other wise False.
+        """
+        if node.doc is None:
+            return True
+
+        doc = node.doc
+        lines = doc.splitlines()
+        line_num = 0
+
+        for l in lines:
+            if line_num == 0:
+                continue
+            cur_indent = len(l) - len(l.lstrip())
+            if cur_indent % indent != 0:
+                self.add_message('W9006', node=node, line=node.fromlineno)
+                return False
+            line_num += 1
+
+        return True
+
+    def one_line(self, node):
+        """one_line checks if docstring (len < 40) is on one line.
+        Args:
+            node (astroid.node): The node visiting.
+        Returns:
+            True if successful otherwise False.
+        """
+
+        doc = node.doc
+        if doc is None:
+            return True
+
+        if len(doc) > 40:
+            return True
+        elif sum(doc.find(nl) for nl in ('\n', '\r', '\n\r')) == -3:
+            return True
+        else:
+            self.add_message('W9001', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def has_period(self, node):
+        """has_period checks if one line doc end-with '.' .
+        Args:
+            node (astroid.node): the node is visiting.
+        Returns:
+            True if successful otherwise False.
+        """
+        if node.doc is None:
+            return True
+
+        if len(node.doc.splitlines()) > 1:
+            return True
+
+        if not node.doc.strip().endswith('.'):
+            self.add_message('W9002', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def with_raises(self, node, doc):
+        """with_raises checks if one line doc end-with '.' .
+        Args:
+            node (astroid.node): the node is visiting.
+            doc (Docstring): Docstring object.
+        Returns:
+            True if successful otherwise False.
+        """
+
+        find = False
+        for t in node.body:
+            if not isinstance(t, astroid.Raise):
+                continue
+
+            find = True
+            break
+
+        if not find:
+            return True
+
+        if len(doc.get_raises()) == 0:
+            self.add_message('W9008', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def with_returns(self, node, doc):
+        """with_returns checks if docstring comments what are returned .
+        Args:
+            node (astroid.node): the node is visiting.
+            doc (Docstring): Docstring object.
+        Returns:
+            True if successful otherwise False.
+        """
+
+        if node.name.startswith("__") or node.name.startswith("_"):
+            return True
+        find = False
+        for t in node.body:
+            if not isinstance(t, astroid.Return):
+                continue
+
+            find = True
+            break
+
+        if not find:
+            return True
+
+        if len(doc.get_returns()) == 0:
+            self.add_message('W9007', node=node, line=node.fromlineno)
+            return False
+
+        return True
+
+    def all_args_in_doc(self, node, doc):
+        """all_args_in_doc checks if arguments are mentioned in doc
+        Args:
+            node (astroid.node): the node is visiting.
+            doc (Docstring): Docstring object
+        Returns:
+            True if successful otherwise False.
+        """
+        if node.name.startswith("__") or node.name.startswith("_"):
+            return True
+        args = []
+        for arg in node.args.get_children():
+            if (not isinstance(arg, astroid.AssignName)) \
+                or arg.name == "self":
+                continue
+            args.append(arg.name)
+
+        if len(args) <= 0:
+            return True
+
+        parsed_args = doc.args
+        args_not_documented = set(args) - set(parsed_args)
+        if len(args) > 0 and len(parsed_args) <= 0:
+            self.add_message(
+                'W9003',
+                node=node,
+                line=node.fromlineno,
+                args=list(args_not_documented))
+            return False
+
+        for t in args:
+            if t not in parsed_args:
+                self.add_message(
+                    'W9003', node=node, line=node.fromlineno, args=[t, ])
+                return False
+
+        return True
diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook
new file mode 100755
index 0000000000000000000000000000000000000000..150a3f5666bd39d30b7e6518e58a14fb5fe2f14b
--- /dev/null
+++ b/tools/codestyle/pylint_pre_commit.hook
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+TOTAL_ERRORS=0
+
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+export PYTHONPATH=$DIR:$PYTHONPATH
+
+# The trick to remove deleted files: https://stackoverflow.com/a/2413151
+for file in $(git diff --name-status | awk '$1 != "D" {print $2}'); do
+    pylint --disable=all --load-plugins=docstring_checker \
+    --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises $file;
+    TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
+done
+
+exit $TOTAL_ERRORS
+#For now, just warning:
+#exit 0
+
diff --git a/tools/codestyle/test_docstring_checker.py b/tools/codestyle/test_docstring_checker.py
new file mode 100644
index 0000000000000000000000000000000000000000..0547f7d1610c64b0ca6efa9384e97d658c8276fe
--- /dev/null
+++ b/tools/codestyle/test_docstring_checker.py
@@ -0,0 +1,232 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import docstring_checker
+import pylint.testutils
+import astroid
+import pytest
+import sys
+
+
+class TestDocstring(pylint.testutils.CheckerTestCase):
+    CHECKER_CLASS = docstring_checker.DocstringChecker
+
+    def test_one_line(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get 
+            news.
+            """
+            if True:
+                return 5
+            return 5
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9001' == got[0][0]
+
+    def test_one_line(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get news"""
+            if True:
+                return 5
+            return 5
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9002' == got[0][0]
+
+    def test_args(self):
+        func_node = astroid.extract_node('''
+        def test(scale, mean): 
+            """get news.
+            Args:
+                scale (int): scale is the number.
+            """
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9003' == got[0][0]
+
+    def test_missing(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9005' == got[0][0]
+
+    def test_indent(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """ get get get get get get get get
+              get get get get get get get get.
+            """
+            pass 
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9006' == got[0][0]
+
+    def test_with_resturns(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get news.
+            Args:
+                scale (int): scale is the number.
+            """
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            return mean
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9007' == got[0][0]
+
+    def test_with_raises(self):
+        func_node = astroid.extract_node('''
+        def test(): 
+            """get news.
+            Args:
+                scale (int): scale is the number.
+            """
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            mean=scale
+            raise ValueError('A very specific bad thing happened.')
+        ''')
+
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 1
+        assert 'W9008' == got[0][0]
+
+    def test_no_message(self):
+        p = '''
+def fc(input,
+       size,
+       num_flatten_dims=1,
+       param_attr=None,
+       bias_attr=None,
+       act=None,
+       name=None):
+    """
+    **Fully Connected Layer**
+    The fully connected layer can take multiple tensors as its inputs. It
+    creates a variable called weights for each input tensor, which represents
+    a fully connected weight matrix from each input unit to each output unit.
+    The fully connected layer multiplies each input tensor with its coresponding
+    weight to produce an output Tensor. If multiple input tensors are given,
+    the results of multiple multiplications will be sumed up. If bias_attr is
+    not None, a bias variable will be created and added to the output. Finally,
+    if activation is not None, it will be applied to the output as well.
+    This process can be formulated as follows:
+
+    Args:
+        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+            the input tensor(s) is at least 2.
+        size(int): The number of output units in this layer.
+        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multidimensional tensor will first be flattened
+            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+            dimensions will be flatten to form the first dimension of the final matrix (height of
+            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+            form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 6-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
+        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+            parameters/weights of this layer.
+        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
+            of this layer. If it is set to None, no bias will be added to the output units.
+        act (str, default None): Activation to be applied to the output of this layer.
+        name (str, default None): The name of this layer.
+    Returns:
+        A tensor variable storing the transformation result.
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.
+    Examples:
+        .. code-block:: python
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+            fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+    """
+    raise ValueError('A very specific bad thing happened.')
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    size = 1
+    return size
+    '''
+
+        func_node = astroid.extract_node(p)
+        self.checker.visit_functiondef(func_node)
+        got = self.linter.release_messages()
+        assert len(got) == 0
diff --git a/tools/manylinux1/Dockerfile.android b/tools/manylinux1/Dockerfile.android
index b6cae228a0c45ab70ba8ecc80ae4df7e0fa5bdbc..7eb040902b0f8f3cc9f7a31ec9f96467de654c3e 100644
--- a/tools/manylinux1/Dockerfile.android
+++ b/tools/manylinux1/Dockerfile.android
@@ -37,7 +37,7 @@ RUN git config --global credential.helper store
 # Fix locales to en_US.UTF-8
 RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 
-RUN pip install --upgrade pip && \
+RUN pip install --upgrade pip==9.0.3 && \
     pip install -U 'protobuf==3.1.0' && \
     pip install -U wheel sphinx && \
     pip install pre-commit
diff --git a/tools/manylinux1/README.md b/tools/manylinux1/README.md
index 898e00bd37c7b7bcbcb4a56476ff10c87381e47a..0e5905040175047f5b79939d97a3efcf38992944 100644
--- a/tools/manylinux1/README.md
+++ b/tools/manylinux1/README.md
@@ -28,3 +28,38 @@ git clone https://github.com/paddlepaddle/paddle
 cd paddle/tools/manylinux1
 REPO=[yourrepo] ./build_all.sh
 ```
+
+## Build PaddlePaddle for the different Python ABIs
+
+Choose one of the following Python ABI and set the correct environment variables.
+
+- cp27-cp27m
+
+  ```bash
+  export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+  export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
+  export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
+  ```
+
+- cp27-cp27mu
+
+  ```bash
+  export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+  export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
+  export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
+  ```
+
+And then add the `PYTHON_FLAGS` as your cmake flags:
+
+```bash
+cmake ..
+  ${PYTHON_FLAGS} \
+  -DWITH_GPU=OFF \
+  ...
+```
+
+You can find more details about cmake flags at [here](http://www.paddlepaddle.org/docs/develop/documentation/fluid/en/build_and_install/build_from_source_en.html#appendix-build-options)
diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh
index 7efc1fe8651a02684c92fa4b924a4f4416503710..282c5c290da14bd3c04346ab01fdb48423c23f88 100644
--- a/tools/manylinux1/build_scripts/install_nccl2.sh
+++ b/tools/manylinux1/build_scripts/install_nccl2.sh
@@ -1,11 +1,18 @@
 #!/bin/bash
-DEB="nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb"
+VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //")
+if [ "$VERSION" == "9.0" ]; then
+  DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda9.0_1-1_amd64.deb"
+  URL="http://nccl2-deb.gz.bcebos.com/nccl-repo-ubuntu1604-2.1.15-ga-cuda9.0_1-1_amd64.deb"
+else
+  DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda8.0_1-1_amd64.deb"
+  URL="http://nccl2-deb.gz.bcebos.com/nccl-repo-ubuntu1604-2.1.15-ga-cuda8.0_1-1_amd64.deb"
+fi
+
 DIR="/nccl2"
 mkdir -p $DIR
 # we cached the nccl2 deb package in BOS, so we can download it with wget
 # install nccl2: http://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#down
-wget -O $DIR/$DEB \
-  "http://nccl2-deb.gz.bcebos.com/nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb?responseContentDisposition=attachment"
+wget -O $DIR/$DEB $URL
 
 cd $DIR && ar x $DEB && tar xf data.tar.xz
 DEBS=$(find ./var/ -name "*.deb")
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e7ffd44c7b0ba2270069bc4467dc377a58b2417
--- /dev/null
+++ b/tools/print_signatures.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Print all signature of a python module in alphabet order.
+
+Usage:
+    ./print_signature  "paddle.fluid" > signature.txt
+"""
+import importlib
+import inspect
+import collections
+import sys
+import pydoc
+
+member_dict = collections.OrderedDict()
+
+
+def visit_member(parent_name, member):
+    cur_name = ".".join([parent_name, member.__name__])
+    if inspect.isclass(member):
+        for name, value in inspect.getmembers(member):
+            if hasattr(value, '__name__') and (not name.startswith("_") or
+                                               name == "__init__"):
+                visit_member(cur_name, value)
+    elif callable(member):
+        try:
+            member_dict[cur_name] = inspect.getargspec(member)
+        except TypeError:  # special for PyBind method
+            member_dict[cur_name] = "  ".join([
+                line.strip() for line in pydoc.render_doc(member).split('\n')
+                if "->" in line
+            ])
+
+    else:
+        raise RuntimeError("Unsupported generate signature of member, type {0}".
+                           format(str(type(member))))
+
+
+def visit_all_module(mod):
+    for member_name in (
+            name
+            for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
+            if not name.startswith("_")):
+        instance = getattr(mod, member_name, None)
+        if instance is None:
+            continue
+        if inspect.ismodule(instance):
+            visit_all_module(instance)
+        else:
+            visit_member(mod.__name__, instance)
+
+
+visit_all_module(importlib.import_module(sys.argv[1]))
+
+for name in member_dict:
+    print name, member_dict[name]
diff --git a/tools/test_runner.py b/tools/test_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dc750b89058cd73355a2f7984d577252c03526d
--- /dev/null
+++ b/tools/test_runner.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import sys
+import paddle.fluid as fluid
+import importlib
+import cStringIO
+
+
+def main():
+    sys.path.append(os.getcwd())
+    some_test_failed = False
+    for module_name in sys.argv[1:]:
+        buffer = cStringIO.StringIO()
+        main = fluid.Program()
+        startup = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.program_guard(main, startup):
+            with fluid.scope_guard(scope):
+                with fluid.unique_name.guard():
+                    test_loader = unittest.TestLoader()
+                    module = importlib.import_module(module_name)
+                    tests = test_loader.loadTestsFromModule(module)
+                    res = unittest.TextTestRunner(stream=buffer).run(tests)
+                    if not res.wasSuccessful():
+                        some_test_failed = True
+                        print >> sys.stderr, module_name, 'failed\n', buffer.getvalue(
+                        )
+
+    if some_test_failed:
+        exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/timeline.py b/tools/timeline.py
index ee83a1baecdd4243bb6c546486a837393980fb65..b413bb6fe0505df8fb09fa0759fefb6509b95bc9 100644
--- a/tools/timeline.py
+++ b/tools/timeline.py
@@ -22,7 +22,11 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
 
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
-    '--profile_path', type=str, default='', help='Input profile file name.')
+    '--profile_path',
+    type=str,
+    default='',
+    help='Input profile file name. If there are multiple file, the format '
+    'should be trainer1=file1,trainer2=file2,ps=file3')
 parser.add_argument(
     '--timeline_path', type=str, default='', help='Output timeline file name.')
 args = parser.parse_args()
@@ -108,8 +112,8 @@ class _ChromeTraceFormatter(object):
 
 
 class Timeline(object):
-    def __init__(self, profile_pb):
-        self._profile_pb = profile_pb
+    def __init__(self, profile_dict):
+        self._profile_dict = profile_dict
         self._pid = 0
         self._devices = dict()
         self._chrome_trace = _ChromeTraceFormatter()
@@ -120,28 +124,37 @@ class Timeline(object):
         return cur_pid
 
     def _allocate_pids(self):
-        for event in self._profile_pb.events:
-            if event.device_id not in self._devices:
-                pid = self._allocate_pid()
-                self._devices[event.device_id] = pid
-                if event.device_id >= 0:
-                    self._chrome_trace.emit_pid("gpu:%s:stream:%d" %
-                                                (pid, event.stream_id), pid)
-                elif event.device_id == -1:
-                    self._chrome_trace.emit_pid("cpu:thread_hash:%d" %
-                                                event.stream_id, pid)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    if (k, event.device_id, "CPU") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "CPU")] = pid
+                        self._chrome_trace.emit_pid("%s:cpu:block:%d" %
+                                                    (k, event.device_id), pid)
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    if (k, event.device_id, "GPUKernel") not in self._devices:
+                        pid = self._allocate_pid()
+                        self._devices[(k, event.device_id, "GPUKernel")] = pid
+                        self._chrome_trace.emit_pid("%s:gpu:%d" %
+                                                    (k, event.device_id), pid)
 
     def _allocate_events(self):
-        for event in self._profile_pb.events:
-            pid = self._devices[event.device_id]
-            args = {'name': event.name}
-            if event.memcopy.bytes > 0:
-                args = {'mem_bytes': event.memcopy.bytes}
-            # TODO(panyx0718): Chrome tracing only handles ms. However, some
-            # ops takes micro-seconds. Hence, we keep the ns here.
-            self._chrome_trace.emit_region(event.start_ns,
-                                           (event.end_ns - event.start_ns) /
-                                           1.0, pid, 0, 'Op', event.name, args)
+        for k, profile_pb in self._profile_dict.iteritems():
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    type = "CPU"
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    type = "GPUKernel"
+                pid = self._devices[(k, event.device_id, type)]
+                args = {'name': event.name}
+                if event.memcopy.bytes > 0:
+                    args = {'mem_bytes': event.memcopy.bytes}
+                # TODO(panyx0718): Chrome tracing only handles ms. However, some
+                # ops takes micro-seconds. Hence, we keep the ns here.
+                self._chrome_trace.emit_region(
+                    event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
+                    event.sub_device_id, 'Op', event.name, args)
 
     def generate_chrome_trace(self):
         self._allocate_pids()
@@ -156,11 +169,23 @@ timeline_path = '/tmp/timeline'
 if args.timeline_path:
     timeline_path = args.timeline_path
 
-with open(profile_path, 'r') as f:
-    profile_s = f.read()
-    profile_pb = profiler_pb2.Profile()
-    profile_pb.ParseFromString(profile_s)
-
-tl = Timeline(profile_pb)
+profile_paths = profile_path.split(',')
+profile_dict = dict()
+if len(profile_paths) == 1:
+    with open(profile_path, 'r') as f:
+        profile_s = f.read()
+        profile_pb = profiler_pb2.Profile()
+        profile_pb.ParseFromString(profile_s)
+    profile_dict['trainer'] = profile_pb
+else:
+    for profile_path in profile_paths:
+        k, v = profile_path.split('=')
+        with open(v, 'r') as f:
+            profile_s = f.read()
+            profile_pb = profiler_pb2.Profile()
+            profile_pb.ParseFromString(profile_s)
+        profile_dict[k] = profile_pb
+
+tl = Timeline(profile_dict)
 with open(timeline_path, 'w') as f:
     f.write(tl.generate_chrome_trace())